diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,33715 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999939379610938, + "eval_steps": 300, + "global_step": 20620, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00024248155624662798, + "grad_norm": 0.53515625, + "learning_rate": 2.4248302618816684e-07, + "loss": 2.8874, + "num_input_tokens_seen": 2621440, + "step": 5 + }, + { + "epoch": 0.00048496311249325596, + "grad_norm": 0.5390625, + "learning_rate": 4.849660523763337e-07, + "loss": 2.8873, + "num_input_tokens_seen": 5242880, + "step": 10 + }, + { + "epoch": 0.0007274446687398839, + "grad_norm": 0.515625, + "learning_rate": 7.274490785645005e-07, + "loss": 2.8857, + "num_input_tokens_seen": 7864320, + "step": 15 + }, + { + "epoch": 0.0009699262249865119, + "grad_norm": 0.51953125, + "learning_rate": 9.699321047526673e-07, + "loss": 2.8831, + "num_input_tokens_seen": 10485760, + "step": 20 + }, + { + "epoch": 0.00121240778123314, + "grad_norm": 0.51171875, + "learning_rate": 1.212415130940834e-06, + "loss": 2.8732, + "num_input_tokens_seen": 13107200, + "step": 25 + }, + { + "epoch": 0.0014548893374797679, + "grad_norm": 0.51171875, + "learning_rate": 1.454898157129001e-06, + "loss": 2.8937, + "num_input_tokens_seen": 15728640, + "step": 30 + }, + { + "epoch": 0.001697370893726396, + "grad_norm": 0.51171875, + "learning_rate": 1.697381183317168e-06, + "loss": 2.879, + "num_input_tokens_seen": 18350080, + "step": 35 + }, + { + "epoch": 0.0019398524499730238, + "grad_norm": 0.4921875, + "learning_rate": 1.9398642095053347e-06, + "loss": 2.879, + "num_input_tokens_seen": 20971520, + "step": 40 + }, + { + "epoch": 0.0021823340062196517, + "grad_norm": 0.482421875, + "learning_rate": 2.1823472356935016e-06, + "loss": 2.8919, + "num_input_tokens_seen": 23592960, + "step": 45 + }, + { + "epoch": 0.00242481556246628, + "grad_norm": 0.474609375, + "learning_rate": 2.424830261881668e-06, + "loss": 2.8731, + "num_input_tokens_seen": 26214400, + "step": 50 + }, + { + "epoch": 0.002667297118712908, + "grad_norm": 0.46484375, + "learning_rate": 2.667313288069835e-06, + "loss": 2.8727, + "num_input_tokens_seen": 28835840, + "step": 55 + }, + { + "epoch": 0.0029097786749595358, + "grad_norm": 0.44921875, + "learning_rate": 2.909796314258002e-06, + "loss": 2.8773, + "num_input_tokens_seen": 31457280, + "step": 60 + }, + { + "epoch": 0.003152260231206164, + "grad_norm": 0.435546875, + "learning_rate": 3.152279340446169e-06, + "loss": 2.876, + "num_input_tokens_seen": 34078720, + "step": 65 + }, + { + "epoch": 0.003394741787452792, + "grad_norm": 0.4375, + "learning_rate": 3.394762366634336e-06, + "loss": 2.8747, + "num_input_tokens_seen": 36700160, + "step": 70 + }, + { + "epoch": 0.00363722334369942, + "grad_norm": 0.42578125, + "learning_rate": 3.6372453928225025e-06, + "loss": 2.8939, + "num_input_tokens_seen": 39321600, + "step": 75 + }, + { + "epoch": 0.0038797048999460477, + "grad_norm": 0.3984375, + "learning_rate": 3.879728419010669e-06, + "loss": 2.8853, + "num_input_tokens_seen": 41943040, + "step": 80 + }, + { + "epoch": 0.0041221864561926756, + "grad_norm": 0.408203125, + "learning_rate": 4.122211445198836e-06, + "loss": 2.8875, + "num_input_tokens_seen": 44564480, + "step": 85 + }, + { + "epoch": 0.004364668012439303, + "grad_norm": 0.396484375, + "learning_rate": 4.364694471387003e-06, + "loss": 2.8808, + "num_input_tokens_seen": 47185920, + "step": 90 + }, + { + "epoch": 0.004607149568685932, + "grad_norm": 0.380859375, + "learning_rate": 4.60717749757517e-06, + "loss": 2.871, + "num_input_tokens_seen": 49807360, + "step": 95 + }, + { + "epoch": 0.00484963112493256, + "grad_norm": 0.369140625, + "learning_rate": 4.849660523763336e-06, + "loss": 2.881, + "num_input_tokens_seen": 52428800, + "step": 100 + }, + { + "epoch": 0.005092112681179188, + "grad_norm": 0.353515625, + "learning_rate": 5.092143549951504e-06, + "loss": 2.8749, + "num_input_tokens_seen": 55050240, + "step": 105 + }, + { + "epoch": 0.005334594237425816, + "grad_norm": 0.33984375, + "learning_rate": 5.33462657613967e-06, + "loss": 2.8786, + "num_input_tokens_seen": 57671680, + "step": 110 + }, + { + "epoch": 0.005577075793672444, + "grad_norm": 0.333984375, + "learning_rate": 5.5771096023278376e-06, + "loss": 2.8811, + "num_input_tokens_seen": 60293120, + "step": 115 + }, + { + "epoch": 0.0058195573499190715, + "grad_norm": 0.3125, + "learning_rate": 5.819592628516004e-06, + "loss": 2.8714, + "num_input_tokens_seen": 62914560, + "step": 120 + }, + { + "epoch": 0.006062038906165699, + "grad_norm": 0.306640625, + "learning_rate": 6.0620756547041715e-06, + "loss": 2.8902, + "num_input_tokens_seen": 65536000, + "step": 125 + }, + { + "epoch": 0.006304520462412328, + "grad_norm": 0.2890625, + "learning_rate": 6.304558680892338e-06, + "loss": 2.8573, + "num_input_tokens_seen": 68157440, + "step": 130 + }, + { + "epoch": 0.006547002018658956, + "grad_norm": 0.29296875, + "learning_rate": 6.5470417070805045e-06, + "loss": 2.8806, + "num_input_tokens_seen": 70778880, + "step": 135 + }, + { + "epoch": 0.006789483574905584, + "grad_norm": 0.287109375, + "learning_rate": 6.789524733268672e-06, + "loss": 2.8807, + "num_input_tokens_seen": 73400320, + "step": 140 + }, + { + "epoch": 0.007031965131152212, + "grad_norm": 0.271484375, + "learning_rate": 7.0320077594568375e-06, + "loss": 2.8691, + "num_input_tokens_seen": 76021760, + "step": 145 + }, + { + "epoch": 0.00727444668739884, + "grad_norm": 0.267578125, + "learning_rate": 7.274490785645005e-06, + "loss": 2.8789, + "num_input_tokens_seen": 78643200, + "step": 150 + }, + { + "epoch": 0.0075169282436454675, + "grad_norm": 0.265625, + "learning_rate": 7.516973811833172e-06, + "loss": 2.8767, + "num_input_tokens_seen": 81264640, + "step": 155 + }, + { + "epoch": 0.007759409799892095, + "grad_norm": 0.2734375, + "learning_rate": 7.759456838021339e-06, + "loss": 2.8645, + "num_input_tokens_seen": 83886080, + "step": 160 + }, + { + "epoch": 0.008001891356138723, + "grad_norm": 0.263671875, + "learning_rate": 8.001939864209505e-06, + "loss": 2.8634, + "num_input_tokens_seen": 86507520, + "step": 165 + }, + { + "epoch": 0.008244372912385351, + "grad_norm": 0.255859375, + "learning_rate": 8.244422890397672e-06, + "loss": 2.8609, + "num_input_tokens_seen": 89128960, + "step": 170 + }, + { + "epoch": 0.008486854468631979, + "grad_norm": 0.259765625, + "learning_rate": 8.48690591658584e-06, + "loss": 2.8685, + "num_input_tokens_seen": 91750400, + "step": 175 + }, + { + "epoch": 0.008729336024878607, + "grad_norm": 0.25, + "learning_rate": 8.729388942774007e-06, + "loss": 2.8798, + "num_input_tokens_seen": 94371840, + "step": 180 + }, + { + "epoch": 0.008971817581125236, + "grad_norm": 0.255859375, + "learning_rate": 8.971871968962173e-06, + "loss": 2.8717, + "num_input_tokens_seen": 96993280, + "step": 185 + }, + { + "epoch": 0.009214299137371864, + "grad_norm": 0.25, + "learning_rate": 9.21435499515034e-06, + "loss": 2.8696, + "num_input_tokens_seen": 99614720, + "step": 190 + }, + { + "epoch": 0.009456780693618492, + "grad_norm": 0.2490234375, + "learning_rate": 9.456838021338508e-06, + "loss": 2.8578, + "num_input_tokens_seen": 102236160, + "step": 195 + }, + { + "epoch": 0.00969926224986512, + "grad_norm": 0.2490234375, + "learning_rate": 9.699321047526673e-06, + "loss": 2.8771, + "num_input_tokens_seen": 104857600, + "step": 200 + }, + { + "epoch": 0.009941743806111748, + "grad_norm": 0.2421875, + "learning_rate": 9.941804073714841e-06, + "loss": 2.8708, + "num_input_tokens_seen": 107479040, + "step": 205 + }, + { + "epoch": 0.010184225362358376, + "grad_norm": 0.248046875, + "learning_rate": 1.0184287099903007e-05, + "loss": 2.8761, + "num_input_tokens_seen": 110100480, + "step": 210 + }, + { + "epoch": 0.010426706918605004, + "grad_norm": 0.2392578125, + "learning_rate": 1.0426770126091174e-05, + "loss": 2.863, + "num_input_tokens_seen": 112721920, + "step": 215 + }, + { + "epoch": 0.010669188474851632, + "grad_norm": 0.2470703125, + "learning_rate": 1.066925315227934e-05, + "loss": 2.8727, + "num_input_tokens_seen": 115343360, + "step": 220 + }, + { + "epoch": 0.01091167003109826, + "grad_norm": 0.2412109375, + "learning_rate": 1.0911736178467507e-05, + "loss": 2.845, + "num_input_tokens_seen": 117964800, + "step": 225 + }, + { + "epoch": 0.011154151587344887, + "grad_norm": 0.248046875, + "learning_rate": 1.1154219204655675e-05, + "loss": 2.8734, + "num_input_tokens_seen": 120586240, + "step": 230 + }, + { + "epoch": 0.011396633143591515, + "grad_norm": 0.24609375, + "learning_rate": 1.1396702230843842e-05, + "loss": 2.8739, + "num_input_tokens_seen": 123207680, + "step": 235 + }, + { + "epoch": 0.011639114699838143, + "grad_norm": 0.2451171875, + "learning_rate": 1.1639185257032008e-05, + "loss": 2.8528, + "num_input_tokens_seen": 125829120, + "step": 240 + }, + { + "epoch": 0.011881596256084771, + "grad_norm": 0.2470703125, + "learning_rate": 1.1881668283220175e-05, + "loss": 2.8602, + "num_input_tokens_seen": 128450560, + "step": 245 + }, + { + "epoch": 0.012124077812331399, + "grad_norm": 0.2421875, + "learning_rate": 1.2124151309408343e-05, + "loss": 2.8657, + "num_input_tokens_seen": 131072000, + "step": 250 + }, + { + "epoch": 0.012366559368578028, + "grad_norm": 0.2431640625, + "learning_rate": 1.236663433559651e-05, + "loss": 2.8519, + "num_input_tokens_seen": 133693440, + "step": 255 + }, + { + "epoch": 0.012609040924824656, + "grad_norm": 0.2412109375, + "learning_rate": 1.2609117361784676e-05, + "loss": 2.8603, + "num_input_tokens_seen": 136314880, + "step": 260 + }, + { + "epoch": 0.012851522481071284, + "grad_norm": 0.2392578125, + "learning_rate": 1.2851600387972842e-05, + "loss": 2.8633, + "num_input_tokens_seen": 138936320, + "step": 265 + }, + { + "epoch": 0.013094004037317912, + "grad_norm": 0.2451171875, + "learning_rate": 1.3094083414161009e-05, + "loss": 2.8777, + "num_input_tokens_seen": 141557760, + "step": 270 + }, + { + "epoch": 0.01333648559356454, + "grad_norm": 0.2353515625, + "learning_rate": 1.3336566440349177e-05, + "loss": 2.8497, + "num_input_tokens_seen": 144179200, + "step": 275 + }, + { + "epoch": 0.013578967149811168, + "grad_norm": 0.2373046875, + "learning_rate": 1.3579049466537344e-05, + "loss": 2.8623, + "num_input_tokens_seen": 146800640, + "step": 280 + }, + { + "epoch": 0.013821448706057796, + "grad_norm": 0.2392578125, + "learning_rate": 1.3821532492725509e-05, + "loss": 2.8706, + "num_input_tokens_seen": 149422080, + "step": 285 + }, + { + "epoch": 0.014063930262304424, + "grad_norm": 0.2412109375, + "learning_rate": 1.4064015518913675e-05, + "loss": 2.8755, + "num_input_tokens_seen": 152043520, + "step": 290 + }, + { + "epoch": 0.014306411818551051, + "grad_norm": 0.236328125, + "learning_rate": 1.4306498545101843e-05, + "loss": 2.8671, + "num_input_tokens_seen": 154664960, + "step": 295 + }, + { + "epoch": 0.01454889337479768, + "grad_norm": 0.2431640625, + "learning_rate": 1.454898157129001e-05, + "loss": 2.8567, + "num_input_tokens_seen": 157286400, + "step": 300 + }, + { + "epoch": 0.01454889337479768, + "eval_accuracy": 0.4449780166096727, + "eval_loss": 2.829090118408203, + "eval_runtime": 5.8559, + "eval_samples_per_second": 51.231, + "eval_steps_per_second": 6.489, + "num_input_tokens_seen": 157286400, + "step": 300 + }, + { + "epoch": 0.014791374931044307, + "grad_norm": 0.2392578125, + "learning_rate": 1.4791464597478178e-05, + "loss": 2.8608, + "num_input_tokens_seen": 159907840, + "step": 305 + }, + { + "epoch": 0.015033856487290935, + "grad_norm": 0.240234375, + "learning_rate": 1.5033947623666345e-05, + "loss": 2.8402, + "num_input_tokens_seen": 162529280, + "step": 310 + }, + { + "epoch": 0.015276338043537563, + "grad_norm": 0.244140625, + "learning_rate": 1.5276430649854513e-05, + "loss": 2.8556, + "num_input_tokens_seen": 165150720, + "step": 315 + }, + { + "epoch": 0.01551881959978419, + "grad_norm": 0.2412109375, + "learning_rate": 1.5518913676042678e-05, + "loss": 2.8615, + "num_input_tokens_seen": 167772160, + "step": 320 + }, + { + "epoch": 0.01576130115603082, + "grad_norm": 0.2412109375, + "learning_rate": 1.5761396702230842e-05, + "loss": 2.8763, + "num_input_tokens_seen": 170393600, + "step": 325 + }, + { + "epoch": 0.016003782712277446, + "grad_norm": 0.2412109375, + "learning_rate": 1.600387972841901e-05, + "loss": 2.8641, + "num_input_tokens_seen": 173015040, + "step": 330 + }, + { + "epoch": 0.016246264268524074, + "grad_norm": 0.2392578125, + "learning_rate": 1.624636275460718e-05, + "loss": 2.8674, + "num_input_tokens_seen": 175636480, + "step": 335 + }, + { + "epoch": 0.016488745824770702, + "grad_norm": 0.2392578125, + "learning_rate": 1.6488845780795344e-05, + "loss": 2.8575, + "num_input_tokens_seen": 178257920, + "step": 340 + }, + { + "epoch": 0.01673122738101733, + "grad_norm": 0.240234375, + "learning_rate": 1.6731328806983512e-05, + "loss": 2.8661, + "num_input_tokens_seen": 180879360, + "step": 345 + }, + { + "epoch": 0.016973708937263958, + "grad_norm": 0.23828125, + "learning_rate": 1.697381183317168e-05, + "loss": 2.861, + "num_input_tokens_seen": 183500800, + "step": 350 + }, + { + "epoch": 0.017216190493510586, + "grad_norm": 0.23828125, + "learning_rate": 1.7216294859359848e-05, + "loss": 2.8666, + "num_input_tokens_seen": 186122240, + "step": 355 + }, + { + "epoch": 0.017458672049757214, + "grad_norm": 0.2353515625, + "learning_rate": 1.7458777885548013e-05, + "loss": 2.8575, + "num_input_tokens_seen": 188743680, + "step": 360 + }, + { + "epoch": 0.017701153606003845, + "grad_norm": 0.2490234375, + "learning_rate": 1.7701260911736178e-05, + "loss": 2.8656, + "num_input_tokens_seen": 191365120, + "step": 365 + }, + { + "epoch": 0.017943635162250473, + "grad_norm": 0.236328125, + "learning_rate": 1.7943743937924346e-05, + "loss": 2.8487, + "num_input_tokens_seen": 193986560, + "step": 370 + }, + { + "epoch": 0.0181861167184971, + "grad_norm": 0.244140625, + "learning_rate": 1.8186226964112514e-05, + "loss": 2.8808, + "num_input_tokens_seen": 196608000, + "step": 375 + }, + { + "epoch": 0.01842859827474373, + "grad_norm": 0.244140625, + "learning_rate": 1.842870999030068e-05, + "loss": 2.8551, + "num_input_tokens_seen": 199229440, + "step": 380 + }, + { + "epoch": 0.018671079830990357, + "grad_norm": 0.2353515625, + "learning_rate": 1.8671193016488847e-05, + "loss": 2.8548, + "num_input_tokens_seen": 201850880, + "step": 385 + }, + { + "epoch": 0.018913561387236984, + "grad_norm": 0.2431640625, + "learning_rate": 1.8913676042677016e-05, + "loss": 2.8544, + "num_input_tokens_seen": 204472320, + "step": 390 + }, + { + "epoch": 0.019156042943483612, + "grad_norm": 0.2392578125, + "learning_rate": 1.915615906886518e-05, + "loss": 2.8602, + "num_input_tokens_seen": 207093760, + "step": 395 + }, + { + "epoch": 0.01939852449973024, + "grad_norm": 0.2392578125, + "learning_rate": 1.9398642095053345e-05, + "loss": 2.8397, + "num_input_tokens_seen": 209715200, + "step": 400 + }, + { + "epoch": 0.019641006055976868, + "grad_norm": 0.24609375, + "learning_rate": 1.9641125121241513e-05, + "loss": 2.8642, + "num_input_tokens_seen": 212336640, + "step": 405 + }, + { + "epoch": 0.019883487612223496, + "grad_norm": 0.236328125, + "learning_rate": 1.9883608147429682e-05, + "loss": 2.8498, + "num_input_tokens_seen": 214958080, + "step": 410 + }, + { + "epoch": 0.020125969168470124, + "grad_norm": 0.25, + "learning_rate": 2.0126091173617847e-05, + "loss": 2.8676, + "num_input_tokens_seen": 217579520, + "step": 415 + }, + { + "epoch": 0.02036845072471675, + "grad_norm": 0.2314453125, + "learning_rate": 2.0368574199806015e-05, + "loss": 2.8698, + "num_input_tokens_seen": 220200960, + "step": 420 + }, + { + "epoch": 0.02061093228096338, + "grad_norm": 0.236328125, + "learning_rate": 2.0611057225994183e-05, + "loss": 2.8542, + "num_input_tokens_seen": 222822400, + "step": 425 + }, + { + "epoch": 0.020853413837210007, + "grad_norm": 0.236328125, + "learning_rate": 2.0853540252182348e-05, + "loss": 2.8501, + "num_input_tokens_seen": 225443840, + "step": 430 + }, + { + "epoch": 0.021095895393456635, + "grad_norm": 0.240234375, + "learning_rate": 2.1096023278370516e-05, + "loss": 2.8506, + "num_input_tokens_seen": 228065280, + "step": 435 + }, + { + "epoch": 0.021338376949703263, + "grad_norm": 0.2392578125, + "learning_rate": 2.133850630455868e-05, + "loss": 2.8552, + "num_input_tokens_seen": 230686720, + "step": 440 + }, + { + "epoch": 0.02158085850594989, + "grad_norm": 0.2431640625, + "learning_rate": 2.158098933074685e-05, + "loss": 2.8736, + "num_input_tokens_seen": 233308160, + "step": 445 + }, + { + "epoch": 0.02182334006219652, + "grad_norm": 0.248046875, + "learning_rate": 2.1823472356935014e-05, + "loss": 2.8596, + "num_input_tokens_seen": 235929600, + "step": 450 + }, + { + "epoch": 0.022065821618443147, + "grad_norm": 0.2392578125, + "learning_rate": 2.2065955383123182e-05, + "loss": 2.8558, + "num_input_tokens_seen": 238551040, + "step": 455 + }, + { + "epoch": 0.022308303174689775, + "grad_norm": 0.2392578125, + "learning_rate": 2.230843840931135e-05, + "loss": 2.8413, + "num_input_tokens_seen": 241172480, + "step": 460 + }, + { + "epoch": 0.022550784730936402, + "grad_norm": 0.2421875, + "learning_rate": 2.255092143549952e-05, + "loss": 2.8529, + "num_input_tokens_seen": 243793920, + "step": 465 + }, + { + "epoch": 0.02279326628718303, + "grad_norm": 0.2353515625, + "learning_rate": 2.2793404461687683e-05, + "loss": 2.8531, + "num_input_tokens_seen": 246415360, + "step": 470 + }, + { + "epoch": 0.023035747843429658, + "grad_norm": 0.248046875, + "learning_rate": 2.3035887487875848e-05, + "loss": 2.8582, + "num_input_tokens_seen": 249036800, + "step": 475 + }, + { + "epoch": 0.023278229399676286, + "grad_norm": 0.248046875, + "learning_rate": 2.3278370514064016e-05, + "loss": 2.8544, + "num_input_tokens_seen": 251658240, + "step": 480 + }, + { + "epoch": 0.023520710955922914, + "grad_norm": 0.2392578125, + "learning_rate": 2.3520853540252185e-05, + "loss": 2.8485, + "num_input_tokens_seen": 254279680, + "step": 485 + }, + { + "epoch": 0.023763192512169542, + "grad_norm": 0.2392578125, + "learning_rate": 2.376333656644035e-05, + "loss": 2.8555, + "num_input_tokens_seen": 256901120, + "step": 490 + }, + { + "epoch": 0.02400567406841617, + "grad_norm": 0.2470703125, + "learning_rate": 2.4005819592628518e-05, + "loss": 2.851, + "num_input_tokens_seen": 259522560, + "step": 495 + }, + { + "epoch": 0.024248155624662798, + "grad_norm": 0.2421875, + "learning_rate": 2.4248302618816686e-05, + "loss": 2.859, + "num_input_tokens_seen": 262144000, + "step": 500 + }, + { + "epoch": 0.024490637180909425, + "grad_norm": 0.2353515625, + "learning_rate": 2.449078564500485e-05, + "loss": 2.8565, + "num_input_tokens_seen": 264765440, + "step": 505 + }, + { + "epoch": 0.024733118737156057, + "grad_norm": 0.24609375, + "learning_rate": 2.473326867119302e-05, + "loss": 2.8593, + "num_input_tokens_seen": 267386880, + "step": 510 + }, + { + "epoch": 0.024975600293402685, + "grad_norm": 0.240234375, + "learning_rate": 2.4975751697381184e-05, + "loss": 2.8371, + "num_input_tokens_seen": 270008320, + "step": 515 + }, + { + "epoch": 0.025218081849649313, + "grad_norm": 0.236328125, + "learning_rate": 2.5218234723569352e-05, + "loss": 2.8615, + "num_input_tokens_seen": 272629760, + "step": 520 + }, + { + "epoch": 0.02546056340589594, + "grad_norm": 0.240234375, + "learning_rate": 2.5460717749757517e-05, + "loss": 2.8606, + "num_input_tokens_seen": 275251200, + "step": 525 + }, + { + "epoch": 0.02570304496214257, + "grad_norm": 0.2412109375, + "learning_rate": 2.5703200775945685e-05, + "loss": 2.8607, + "num_input_tokens_seen": 277872640, + "step": 530 + }, + { + "epoch": 0.025945526518389196, + "grad_norm": 0.2470703125, + "learning_rate": 2.5945683802133853e-05, + "loss": 2.8498, + "num_input_tokens_seen": 280494080, + "step": 535 + }, + { + "epoch": 0.026188008074635824, + "grad_norm": 0.232421875, + "learning_rate": 2.6188166828322018e-05, + "loss": 2.8578, + "num_input_tokens_seen": 283115520, + "step": 540 + }, + { + "epoch": 0.026430489630882452, + "grad_norm": 0.236328125, + "learning_rate": 2.6430649854510186e-05, + "loss": 2.858, + "num_input_tokens_seen": 285736960, + "step": 545 + }, + { + "epoch": 0.02667297118712908, + "grad_norm": 0.2421875, + "learning_rate": 2.6673132880698354e-05, + "loss": 2.8645, + "num_input_tokens_seen": 288358400, + "step": 550 + }, + { + "epoch": 0.026915452743375708, + "grad_norm": 0.2373046875, + "learning_rate": 2.6915615906886523e-05, + "loss": 2.8586, + "num_input_tokens_seen": 290979840, + "step": 555 + }, + { + "epoch": 0.027157934299622336, + "grad_norm": 0.2392578125, + "learning_rate": 2.7158098933074687e-05, + "loss": 2.8533, + "num_input_tokens_seen": 293601280, + "step": 560 + }, + { + "epoch": 0.027400415855868963, + "grad_norm": 0.2373046875, + "learning_rate": 2.7400581959262856e-05, + "loss": 2.8527, + "num_input_tokens_seen": 296222720, + "step": 565 + }, + { + "epoch": 0.02764289741211559, + "grad_norm": 0.2490234375, + "learning_rate": 2.7643064985451017e-05, + "loss": 2.8535, + "num_input_tokens_seen": 298844160, + "step": 570 + }, + { + "epoch": 0.02788537896836222, + "grad_norm": 0.2451171875, + "learning_rate": 2.7885548011639185e-05, + "loss": 2.8571, + "num_input_tokens_seen": 301465600, + "step": 575 + }, + { + "epoch": 0.028127860524608847, + "grad_norm": 0.236328125, + "learning_rate": 2.812803103782735e-05, + "loss": 2.8551, + "num_input_tokens_seen": 304087040, + "step": 580 + }, + { + "epoch": 0.028370342080855475, + "grad_norm": 0.236328125, + "learning_rate": 2.837051406401552e-05, + "loss": 2.8241, + "num_input_tokens_seen": 306708480, + "step": 585 + }, + { + "epoch": 0.028612823637102103, + "grad_norm": 0.2421875, + "learning_rate": 2.8612997090203687e-05, + "loss": 2.8533, + "num_input_tokens_seen": 309329920, + "step": 590 + }, + { + "epoch": 0.02885530519334873, + "grad_norm": 0.2421875, + "learning_rate": 2.8855480116391855e-05, + "loss": 2.8432, + "num_input_tokens_seen": 311951360, + "step": 595 + }, + { + "epoch": 0.02909778674959536, + "grad_norm": 0.2373046875, + "learning_rate": 2.909796314258002e-05, + "loss": 2.8517, + "num_input_tokens_seen": 314572800, + "step": 600 + }, + { + "epoch": 0.02909778674959536, + "eval_accuracy": 0.44654942191825436, + "eval_loss": 2.81530499458313, + "eval_runtime": 5.7832, + "eval_samples_per_second": 51.874, + "eval_steps_per_second": 6.571, + "num_input_tokens_seen": 314572800, + "step": 600 + }, + { + "epoch": 0.029340268305841986, + "grad_norm": 0.248046875, + "learning_rate": 2.9340446168768188e-05, + "loss": 2.8588, + "num_input_tokens_seen": 317194240, + "step": 605 + }, + { + "epoch": 0.029582749862088614, + "grad_norm": 0.2431640625, + "learning_rate": 2.9582929194956356e-05, + "loss": 2.8523, + "num_input_tokens_seen": 319815680, + "step": 610 + }, + { + "epoch": 0.029825231418335242, + "grad_norm": 0.23828125, + "learning_rate": 2.982541222114452e-05, + "loss": 2.8448, + "num_input_tokens_seen": 322437120, + "step": 615 + }, + { + "epoch": 0.03006771297458187, + "grad_norm": 0.23828125, + "learning_rate": 3.006789524733269e-05, + "loss": 2.843, + "num_input_tokens_seen": 325058560, + "step": 620 + }, + { + "epoch": 0.030310194530828498, + "grad_norm": 0.236328125, + "learning_rate": 3.0310378273520857e-05, + "loss": 2.8493, + "num_input_tokens_seen": 327680000, + "step": 625 + }, + { + "epoch": 0.030552676087075126, + "grad_norm": 0.2431640625, + "learning_rate": 3.0552861299709026e-05, + "loss": 2.8579, + "num_input_tokens_seen": 330301440, + "step": 630 + }, + { + "epoch": 0.030795157643321754, + "grad_norm": 0.2421875, + "learning_rate": 3.0795344325897194e-05, + "loss": 2.8366, + "num_input_tokens_seen": 332922880, + "step": 635 + }, + { + "epoch": 0.03103763919956838, + "grad_norm": 0.24609375, + "learning_rate": 3.1037827352085355e-05, + "loss": 2.8389, + "num_input_tokens_seen": 335544320, + "step": 640 + }, + { + "epoch": 0.03128012075581501, + "grad_norm": 0.2431640625, + "learning_rate": 3.1280310378273517e-05, + "loss": 2.8449, + "num_input_tokens_seen": 338165760, + "step": 645 + }, + { + "epoch": 0.03152260231206164, + "grad_norm": 0.2373046875, + "learning_rate": 3.1522793404461685e-05, + "loss": 2.8578, + "num_input_tokens_seen": 340787200, + "step": 650 + }, + { + "epoch": 0.031765083868308265, + "grad_norm": 0.2470703125, + "learning_rate": 3.176527643064985e-05, + "loss": 2.8494, + "num_input_tokens_seen": 343408640, + "step": 655 + }, + { + "epoch": 0.03200756542455489, + "grad_norm": 0.2431640625, + "learning_rate": 3.200775945683802e-05, + "loss": 2.8448, + "num_input_tokens_seen": 346030080, + "step": 660 + }, + { + "epoch": 0.03225004698080152, + "grad_norm": 0.248046875, + "learning_rate": 3.225024248302619e-05, + "loss": 2.8583, + "num_input_tokens_seen": 348651520, + "step": 665 + }, + { + "epoch": 0.03249252853704815, + "grad_norm": 0.234375, + "learning_rate": 3.249272550921436e-05, + "loss": 2.8622, + "num_input_tokens_seen": 351272960, + "step": 670 + }, + { + "epoch": 0.03273501009329478, + "grad_norm": 0.25390625, + "learning_rate": 3.2735208535402526e-05, + "loss": 2.8409, + "num_input_tokens_seen": 353894400, + "step": 675 + }, + { + "epoch": 0.032977491649541404, + "grad_norm": 0.251953125, + "learning_rate": 3.297769156159069e-05, + "loss": 2.8468, + "num_input_tokens_seen": 356515840, + "step": 680 + }, + { + "epoch": 0.03321997320578803, + "grad_norm": 0.248046875, + "learning_rate": 3.3220174587778856e-05, + "loss": 2.8385, + "num_input_tokens_seen": 359137280, + "step": 685 + }, + { + "epoch": 0.03346245476203466, + "grad_norm": 0.2578125, + "learning_rate": 3.3462657613967024e-05, + "loss": 2.8417, + "num_input_tokens_seen": 361758720, + "step": 690 + }, + { + "epoch": 0.03370493631828129, + "grad_norm": 0.251953125, + "learning_rate": 3.370514064015519e-05, + "loss": 2.8397, + "num_input_tokens_seen": 364380160, + "step": 695 + }, + { + "epoch": 0.033947417874527916, + "grad_norm": 0.234375, + "learning_rate": 3.394762366634336e-05, + "loss": 2.844, + "num_input_tokens_seen": 367001600, + "step": 700 + }, + { + "epoch": 0.034189899430774544, + "grad_norm": 0.2451171875, + "learning_rate": 3.419010669253153e-05, + "loss": 2.8459, + "num_input_tokens_seen": 369623040, + "step": 705 + }, + { + "epoch": 0.03443238098702117, + "grad_norm": 0.24609375, + "learning_rate": 3.4432589718719697e-05, + "loss": 2.8481, + "num_input_tokens_seen": 372244480, + "step": 710 + }, + { + "epoch": 0.0346748625432678, + "grad_norm": 0.2470703125, + "learning_rate": 3.467507274490786e-05, + "loss": 2.8362, + "num_input_tokens_seen": 374865920, + "step": 715 + }, + { + "epoch": 0.03491734409951443, + "grad_norm": 0.2412109375, + "learning_rate": 3.4917555771096026e-05, + "loss": 2.8333, + "num_input_tokens_seen": 377487360, + "step": 720 + }, + { + "epoch": 0.035159825655761055, + "grad_norm": 0.2412109375, + "learning_rate": 3.516003879728419e-05, + "loss": 2.844, + "num_input_tokens_seen": 380108800, + "step": 725 + }, + { + "epoch": 0.03540230721200769, + "grad_norm": 0.2392578125, + "learning_rate": 3.5402521823472356e-05, + "loss": 2.8341, + "num_input_tokens_seen": 382730240, + "step": 730 + }, + { + "epoch": 0.03564478876825432, + "grad_norm": 0.236328125, + "learning_rate": 3.5645004849660524e-05, + "loss": 2.8418, + "num_input_tokens_seen": 385351680, + "step": 735 + }, + { + "epoch": 0.035887270324500946, + "grad_norm": 0.2421875, + "learning_rate": 3.588748787584869e-05, + "loss": 2.8405, + "num_input_tokens_seen": 387973120, + "step": 740 + }, + { + "epoch": 0.036129751880747574, + "grad_norm": 0.2412109375, + "learning_rate": 3.612997090203686e-05, + "loss": 2.8324, + "num_input_tokens_seen": 390594560, + "step": 745 + }, + { + "epoch": 0.0363722334369942, + "grad_norm": 0.2412109375, + "learning_rate": 3.637245392822503e-05, + "loss": 2.8495, + "num_input_tokens_seen": 393216000, + "step": 750 + }, + { + "epoch": 0.03661471499324083, + "grad_norm": 0.251953125, + "learning_rate": 3.661493695441319e-05, + "loss": 2.8399, + "num_input_tokens_seen": 395837440, + "step": 755 + }, + { + "epoch": 0.03685719654948746, + "grad_norm": 0.2470703125, + "learning_rate": 3.685741998060136e-05, + "loss": 2.8427, + "num_input_tokens_seen": 398458880, + "step": 760 + }, + { + "epoch": 0.037099678105734085, + "grad_norm": 0.25390625, + "learning_rate": 3.709990300678953e-05, + "loss": 2.8604, + "num_input_tokens_seen": 401080320, + "step": 765 + }, + { + "epoch": 0.03734215966198071, + "grad_norm": 0.2451171875, + "learning_rate": 3.7342386032977695e-05, + "loss": 2.8341, + "num_input_tokens_seen": 403701760, + "step": 770 + }, + { + "epoch": 0.03758464121822734, + "grad_norm": 0.2333984375, + "learning_rate": 3.758486905916586e-05, + "loss": 2.8477, + "num_input_tokens_seen": 406323200, + "step": 775 + }, + { + "epoch": 0.03782712277447397, + "grad_norm": 0.2373046875, + "learning_rate": 3.782735208535403e-05, + "loss": 2.8507, + "num_input_tokens_seen": 408944640, + "step": 780 + }, + { + "epoch": 0.0380696043307206, + "grad_norm": 0.251953125, + "learning_rate": 3.806983511154219e-05, + "loss": 2.8484, + "num_input_tokens_seen": 411566080, + "step": 785 + }, + { + "epoch": 0.038312085886967225, + "grad_norm": 0.25, + "learning_rate": 3.831231813773036e-05, + "loss": 2.833, + "num_input_tokens_seen": 414187520, + "step": 790 + }, + { + "epoch": 0.03855456744321385, + "grad_norm": 0.2392578125, + "learning_rate": 3.855480116391853e-05, + "loss": 2.8401, + "num_input_tokens_seen": 416808960, + "step": 795 + }, + { + "epoch": 0.03879704899946048, + "grad_norm": 0.240234375, + "learning_rate": 3.879728419010669e-05, + "loss": 2.8459, + "num_input_tokens_seen": 419430400, + "step": 800 + }, + { + "epoch": 0.03903953055570711, + "grad_norm": 0.25390625, + "learning_rate": 3.903976721629486e-05, + "loss": 2.8508, + "num_input_tokens_seen": 422051840, + "step": 805 + }, + { + "epoch": 0.039282012111953736, + "grad_norm": 0.240234375, + "learning_rate": 3.928225024248303e-05, + "loss": 2.8379, + "num_input_tokens_seen": 424673280, + "step": 810 + }, + { + "epoch": 0.039524493668200364, + "grad_norm": 0.248046875, + "learning_rate": 3.9524733268671195e-05, + "loss": 2.826, + "num_input_tokens_seen": 427294720, + "step": 815 + }, + { + "epoch": 0.03976697522444699, + "grad_norm": 0.244140625, + "learning_rate": 3.9767216294859363e-05, + "loss": 2.837, + "num_input_tokens_seen": 429916160, + "step": 820 + }, + { + "epoch": 0.04000945678069362, + "grad_norm": 0.2470703125, + "learning_rate": 4.0009699321047525e-05, + "loss": 2.8427, + "num_input_tokens_seen": 432537600, + "step": 825 + }, + { + "epoch": 0.04025193833694025, + "grad_norm": 0.240234375, + "learning_rate": 4.025218234723569e-05, + "loss": 2.8387, + "num_input_tokens_seen": 435159040, + "step": 830 + }, + { + "epoch": 0.040494419893186875, + "grad_norm": 0.248046875, + "learning_rate": 4.049466537342386e-05, + "loss": 2.8321, + "num_input_tokens_seen": 437780480, + "step": 835 + }, + { + "epoch": 0.0407369014494335, + "grad_norm": 0.25, + "learning_rate": 4.073714839961203e-05, + "loss": 2.8436, + "num_input_tokens_seen": 440401920, + "step": 840 + }, + { + "epoch": 0.04097938300568013, + "grad_norm": 0.248046875, + "learning_rate": 4.09796314258002e-05, + "loss": 2.8422, + "num_input_tokens_seen": 443023360, + "step": 845 + }, + { + "epoch": 0.04122186456192676, + "grad_norm": 0.2490234375, + "learning_rate": 4.1222114451988366e-05, + "loss": 2.8361, + "num_input_tokens_seen": 445644800, + "step": 850 + }, + { + "epoch": 0.04146434611817339, + "grad_norm": 0.236328125, + "learning_rate": 4.1464597478176534e-05, + "loss": 2.8283, + "num_input_tokens_seen": 448266240, + "step": 855 + }, + { + "epoch": 0.041706827674420015, + "grad_norm": 0.2578125, + "learning_rate": 4.1707080504364696e-05, + "loss": 2.8344, + "num_input_tokens_seen": 450887680, + "step": 860 + }, + { + "epoch": 0.04194930923066664, + "grad_norm": 0.2421875, + "learning_rate": 4.1949563530552864e-05, + "loss": 2.8247, + "num_input_tokens_seen": 453509120, + "step": 865 + }, + { + "epoch": 0.04219179078691327, + "grad_norm": 0.244140625, + "learning_rate": 4.219204655674103e-05, + "loss": 2.8487, + "num_input_tokens_seen": 456130560, + "step": 870 + }, + { + "epoch": 0.0424342723431599, + "grad_norm": 0.23828125, + "learning_rate": 4.2434529582929193e-05, + "loss": 2.8383, + "num_input_tokens_seen": 458752000, + "step": 875 + }, + { + "epoch": 0.042676753899406526, + "grad_norm": 0.251953125, + "learning_rate": 4.267701260911736e-05, + "loss": 2.839, + "num_input_tokens_seen": 461373440, + "step": 880 + }, + { + "epoch": 0.042919235455653154, + "grad_norm": 0.24609375, + "learning_rate": 4.291949563530553e-05, + "loss": 2.8285, + "num_input_tokens_seen": 463994880, + "step": 885 + }, + { + "epoch": 0.04316171701189978, + "grad_norm": 0.244140625, + "learning_rate": 4.31619786614937e-05, + "loss": 2.8289, + "num_input_tokens_seen": 466616320, + "step": 890 + }, + { + "epoch": 0.04340419856814641, + "grad_norm": 0.2490234375, + "learning_rate": 4.3404461687681866e-05, + "loss": 2.847, + "num_input_tokens_seen": 469237760, + "step": 895 + }, + { + "epoch": 0.04364668012439304, + "grad_norm": 0.2451171875, + "learning_rate": 4.364694471387003e-05, + "loss": 2.8224, + "num_input_tokens_seen": 471859200, + "step": 900 + }, + { + "epoch": 0.04364668012439304, + "eval_accuracy": 0.44812734082397004, + "eval_loss": 2.802464246749878, + "eval_runtime": 5.8687, + "eval_samples_per_second": 51.118, + "eval_steps_per_second": 6.475, + "num_input_tokens_seen": 471859200, + "step": 900 + }, + { + "epoch": 0.043889161680639666, + "grad_norm": 0.248046875, + "learning_rate": 4.3889427740058196e-05, + "loss": 2.8469, + "num_input_tokens_seen": 474480640, + "step": 905 + }, + { + "epoch": 0.044131643236886293, + "grad_norm": 0.2421875, + "learning_rate": 4.4131910766246364e-05, + "loss": 2.8375, + "num_input_tokens_seen": 477102080, + "step": 910 + }, + { + "epoch": 0.04437412479313292, + "grad_norm": 0.25390625, + "learning_rate": 4.437439379243453e-05, + "loss": 2.8275, + "num_input_tokens_seen": 479723520, + "step": 915 + }, + { + "epoch": 0.04461660634937955, + "grad_norm": 0.244140625, + "learning_rate": 4.46168768186227e-05, + "loss": 2.835, + "num_input_tokens_seen": 482344960, + "step": 920 + }, + { + "epoch": 0.04485908790562618, + "grad_norm": 0.2578125, + "learning_rate": 4.485935984481087e-05, + "loss": 2.8451, + "num_input_tokens_seen": 484966400, + "step": 925 + }, + { + "epoch": 0.045101569461872805, + "grad_norm": 0.25390625, + "learning_rate": 4.510184287099904e-05, + "loss": 2.8424, + "num_input_tokens_seen": 487587840, + "step": 930 + }, + { + "epoch": 0.04534405101811943, + "grad_norm": 0.2421875, + "learning_rate": 4.53443258971872e-05, + "loss": 2.8322, + "num_input_tokens_seen": 490209280, + "step": 935 + }, + { + "epoch": 0.04558653257436606, + "grad_norm": 0.2470703125, + "learning_rate": 4.558680892337537e-05, + "loss": 2.8285, + "num_input_tokens_seen": 492830720, + "step": 940 + }, + { + "epoch": 0.04582901413061269, + "grad_norm": 0.244140625, + "learning_rate": 4.5829291949563535e-05, + "loss": 2.8391, + "num_input_tokens_seen": 495452160, + "step": 945 + }, + { + "epoch": 0.046071495686859316, + "grad_norm": 0.251953125, + "learning_rate": 4.6071774975751696e-05, + "loss": 2.8359, + "num_input_tokens_seen": 498073600, + "step": 950 + }, + { + "epoch": 0.046313977243105944, + "grad_norm": 0.25, + "learning_rate": 4.6314258001939865e-05, + "loss": 2.8355, + "num_input_tokens_seen": 500695040, + "step": 955 + }, + { + "epoch": 0.04655645879935257, + "grad_norm": 0.248046875, + "learning_rate": 4.655674102812803e-05, + "loss": 2.831, + "num_input_tokens_seen": 503316480, + "step": 960 + }, + { + "epoch": 0.0467989403555992, + "grad_norm": 0.248046875, + "learning_rate": 4.67992240543162e-05, + "loss": 2.835, + "num_input_tokens_seen": 505937920, + "step": 965 + }, + { + "epoch": 0.04704142191184583, + "grad_norm": 0.2421875, + "learning_rate": 4.704170708050437e-05, + "loss": 2.8501, + "num_input_tokens_seen": 508559360, + "step": 970 + }, + { + "epoch": 0.047283903468092456, + "grad_norm": 0.25390625, + "learning_rate": 4.728419010669253e-05, + "loss": 2.8216, + "num_input_tokens_seen": 511180800, + "step": 975 + }, + { + "epoch": 0.047526385024339084, + "grad_norm": 0.24609375, + "learning_rate": 4.75266731328807e-05, + "loss": 2.827, + "num_input_tokens_seen": 513802240, + "step": 980 + }, + { + "epoch": 0.04776886658058571, + "grad_norm": 0.248046875, + "learning_rate": 4.776915615906887e-05, + "loss": 2.8333, + "num_input_tokens_seen": 516423680, + "step": 985 + }, + { + "epoch": 0.04801134813683234, + "grad_norm": 0.244140625, + "learning_rate": 4.8011639185257035e-05, + "loss": 2.8285, + "num_input_tokens_seen": 519045120, + "step": 990 + }, + { + "epoch": 0.04825382969307897, + "grad_norm": 0.251953125, + "learning_rate": 4.8254122211445203e-05, + "loss": 2.8277, + "num_input_tokens_seen": 521666560, + "step": 995 + }, + { + "epoch": 0.048496311249325595, + "grad_norm": 0.25, + "learning_rate": 4.849660523763337e-05, + "loss": 2.8289, + "num_input_tokens_seen": 524288000, + "step": 1000 + }, + { + "epoch": 0.04873879280557222, + "grad_norm": 0.248046875, + "learning_rate": 4.873908826382153e-05, + "loss": 2.8286, + "num_input_tokens_seen": 526909440, + "step": 1005 + }, + { + "epoch": 0.04898127436181885, + "grad_norm": 0.251953125, + "learning_rate": 4.89815712900097e-05, + "loss": 2.832, + "num_input_tokens_seen": 529530880, + "step": 1010 + }, + { + "epoch": 0.04922375591806548, + "grad_norm": 0.2578125, + "learning_rate": 4.922405431619787e-05, + "loss": 2.827, + "num_input_tokens_seen": 532152320, + "step": 1015 + }, + { + "epoch": 0.049466237474312114, + "grad_norm": 0.248046875, + "learning_rate": 4.946653734238604e-05, + "loss": 2.8312, + "num_input_tokens_seen": 534773760, + "step": 1020 + }, + { + "epoch": 0.04970871903055874, + "grad_norm": 0.2451171875, + "learning_rate": 4.97090203685742e-05, + "loss": 2.841, + "num_input_tokens_seen": 537395200, + "step": 1025 + }, + { + "epoch": 0.04995120058680537, + "grad_norm": 0.2490234375, + "learning_rate": 4.995150339476237e-05, + "loss": 2.812, + "num_input_tokens_seen": 540016640, + "step": 1030 + }, + { + "epoch": 0.050193682143052, + "grad_norm": 0.255859375, + "learning_rate": 4.999999485594985e-05, + "loss": 2.8351, + "num_input_tokens_seen": 542638080, + "step": 1035 + }, + { + "epoch": 0.050436163699298625, + "grad_norm": 0.25390625, + "learning_rate": 4.999997395824976e-05, + "loss": 2.827, + "num_input_tokens_seen": 545259520, + "step": 1040 + }, + { + "epoch": 0.05067864525554525, + "grad_norm": 0.24609375, + "learning_rate": 4.999993698541001e-05, + "loss": 2.8299, + "num_input_tokens_seen": 547880960, + "step": 1045 + }, + { + "epoch": 0.05092112681179188, + "grad_norm": 0.26171875, + "learning_rate": 4.999988393745438e-05, + "loss": 2.825, + "num_input_tokens_seen": 550502400, + "step": 1050 + }, + { + "epoch": 0.05116360836803851, + "grad_norm": 0.255859375, + "learning_rate": 4.999981481441698e-05, + "loss": 2.8304, + "num_input_tokens_seen": 553123840, + "step": 1055 + }, + { + "epoch": 0.05140608992428514, + "grad_norm": 0.248046875, + "learning_rate": 4.999972961634226e-05, + "loss": 2.8367, + "num_input_tokens_seen": 555745280, + "step": 1060 + }, + { + "epoch": 0.051648571480531764, + "grad_norm": 0.2392578125, + "learning_rate": 4.999962834328499e-05, + "loss": 2.8163, + "num_input_tokens_seen": 558366720, + "step": 1065 + }, + { + "epoch": 0.05189105303677839, + "grad_norm": 0.25390625, + "learning_rate": 4.99995109953103e-05, + "loss": 2.8154, + "num_input_tokens_seen": 560988160, + "step": 1070 + }, + { + "epoch": 0.05213353459302502, + "grad_norm": 0.2490234375, + "learning_rate": 4.999937757249364e-05, + "loss": 2.8226, + "num_input_tokens_seen": 563609600, + "step": 1075 + }, + { + "epoch": 0.05237601614927165, + "grad_norm": 0.2578125, + "learning_rate": 4.9999228074920814e-05, + "loss": 2.8235, + "num_input_tokens_seen": 566231040, + "step": 1080 + }, + { + "epoch": 0.052618497705518276, + "grad_norm": 0.25390625, + "learning_rate": 4.9999062502687935e-05, + "loss": 2.8257, + "num_input_tokens_seen": 568852480, + "step": 1085 + }, + { + "epoch": 0.052860979261764904, + "grad_norm": 0.271484375, + "learning_rate": 4.9998880855901476e-05, + "loss": 2.825, + "num_input_tokens_seen": 571473920, + "step": 1090 + }, + { + "epoch": 0.05310346081801153, + "grad_norm": 0.263671875, + "learning_rate": 4.999868313467824e-05, + "loss": 2.8405, + "num_input_tokens_seen": 574095360, + "step": 1095 + }, + { + "epoch": 0.05334594237425816, + "grad_norm": 0.259765625, + "learning_rate": 4.9998469339145346e-05, + "loss": 2.8366, + "num_input_tokens_seen": 576716800, + "step": 1100 + }, + { + "epoch": 0.05358842393050479, + "grad_norm": 0.251953125, + "learning_rate": 4.999823946944028e-05, + "loss": 2.8472, + "num_input_tokens_seen": 579338240, + "step": 1105 + }, + { + "epoch": 0.053830905486751415, + "grad_norm": 0.259765625, + "learning_rate": 4.999799352571085e-05, + "loss": 2.82, + "num_input_tokens_seen": 581959680, + "step": 1110 + }, + { + "epoch": 0.05407338704299804, + "grad_norm": 0.2734375, + "learning_rate": 4.999773150811519e-05, + "loss": 2.8273, + "num_input_tokens_seen": 584581120, + "step": 1115 + }, + { + "epoch": 0.05431586859924467, + "grad_norm": 0.263671875, + "learning_rate": 4.999745341682179e-05, + "loss": 2.8349, + "num_input_tokens_seen": 587202560, + "step": 1120 + }, + { + "epoch": 0.0545583501554913, + "grad_norm": 0.25390625, + "learning_rate": 4.999715925200946e-05, + "loss": 2.8207, + "num_input_tokens_seen": 589824000, + "step": 1125 + }, + { + "epoch": 0.05480083171173793, + "grad_norm": 0.2490234375, + "learning_rate": 4.999684901386734e-05, + "loss": 2.8225, + "num_input_tokens_seen": 592445440, + "step": 1130 + }, + { + "epoch": 0.055043313267984555, + "grad_norm": 0.2451171875, + "learning_rate": 4.999652270259493e-05, + "loss": 2.8355, + "num_input_tokens_seen": 595066880, + "step": 1135 + }, + { + "epoch": 0.05528579482423118, + "grad_norm": 0.263671875, + "learning_rate": 4.9996180318402056e-05, + "loss": 2.8275, + "num_input_tokens_seen": 597688320, + "step": 1140 + }, + { + "epoch": 0.05552827638047781, + "grad_norm": 0.259765625, + "learning_rate": 4.999582186150884e-05, + "loss": 2.8332, + "num_input_tokens_seen": 600309760, + "step": 1145 + }, + { + "epoch": 0.05577075793672444, + "grad_norm": 0.2412109375, + "learning_rate": 4.9995447332145804e-05, + "loss": 2.828, + "num_input_tokens_seen": 602931200, + "step": 1150 + }, + { + "epoch": 0.056013239492971066, + "grad_norm": 0.251953125, + "learning_rate": 4.999505673055377e-05, + "loss": 2.836, + "num_input_tokens_seen": 605552640, + "step": 1155 + }, + { + "epoch": 0.056255721049217694, + "grad_norm": 0.255859375, + "learning_rate": 4.999465005698388e-05, + "loss": 2.8321, + "num_input_tokens_seen": 608174080, + "step": 1160 + }, + { + "epoch": 0.05649820260546432, + "grad_norm": 0.2451171875, + "learning_rate": 4.999422731169764e-05, + "loss": 2.8375, + "num_input_tokens_seen": 610795520, + "step": 1165 + }, + { + "epoch": 0.05674068416171095, + "grad_norm": 0.265625, + "learning_rate": 4.999378849496687e-05, + "loss": 2.8394, + "num_input_tokens_seen": 613416960, + "step": 1170 + }, + { + "epoch": 0.05698316571795758, + "grad_norm": 0.251953125, + "learning_rate": 4.999333360707374e-05, + "loss": 2.8123, + "num_input_tokens_seen": 616038400, + "step": 1175 + }, + { + "epoch": 0.057225647274204205, + "grad_norm": 0.267578125, + "learning_rate": 4.999286264831075e-05, + "loss": 2.8249, + "num_input_tokens_seen": 618659840, + "step": 1180 + }, + { + "epoch": 0.05746812883045083, + "grad_norm": 0.255859375, + "learning_rate": 4.9992375618980715e-05, + "loss": 2.8277, + "num_input_tokens_seen": 621281280, + "step": 1185 + }, + { + "epoch": 0.05771061038669746, + "grad_norm": 0.2578125, + "learning_rate": 4.99918725193968e-05, + "loss": 2.8284, + "num_input_tokens_seen": 623902720, + "step": 1190 + }, + { + "epoch": 0.05795309194294409, + "grad_norm": 0.2490234375, + "learning_rate": 4.999135334988251e-05, + "loss": 2.829, + "num_input_tokens_seen": 626524160, + "step": 1195 + }, + { + "epoch": 0.05819557349919072, + "grad_norm": 0.2451171875, + "learning_rate": 4.9990818110771674e-05, + "loss": 2.8178, + "num_input_tokens_seen": 629145600, + "step": 1200 + }, + { + "epoch": 0.05819557349919072, + "eval_accuracy": 0.4494723986321446, + "eval_loss": 2.7911999225616455, + "eval_runtime": 5.7873, + "eval_samples_per_second": 51.837, + "eval_steps_per_second": 6.566, + "num_input_tokens_seen": 629145600, + "step": 1200 + }, + { + "epoch": 0.058438055055437345, + "grad_norm": 0.2470703125, + "learning_rate": 4.999026680240845e-05, + "loss": 2.8176, + "num_input_tokens_seen": 631767040, + "step": 1205 + }, + { + "epoch": 0.05868053661168397, + "grad_norm": 0.251953125, + "learning_rate": 4.998969942514733e-05, + "loss": 2.8341, + "num_input_tokens_seen": 634388480, + "step": 1210 + }, + { + "epoch": 0.0589230181679306, + "grad_norm": 0.25390625, + "learning_rate": 4.9989115979353154e-05, + "loss": 2.8302, + "num_input_tokens_seen": 637009920, + "step": 1215 + }, + { + "epoch": 0.05916549972417723, + "grad_norm": 0.2412109375, + "learning_rate": 4.998851646540106e-05, + "loss": 2.8287, + "num_input_tokens_seen": 639631360, + "step": 1220 + }, + { + "epoch": 0.059407981280423856, + "grad_norm": 0.2490234375, + "learning_rate": 4.998790088367655e-05, + "loss": 2.8195, + "num_input_tokens_seen": 642252800, + "step": 1225 + }, + { + "epoch": 0.059650462836670484, + "grad_norm": 0.25, + "learning_rate": 4.998726923457546e-05, + "loss": 2.8209, + "num_input_tokens_seen": 644874240, + "step": 1230 + }, + { + "epoch": 0.05989294439291711, + "grad_norm": 0.26171875, + "learning_rate": 4.9986621518503925e-05, + "loss": 2.8037, + "num_input_tokens_seen": 647495680, + "step": 1235 + }, + { + "epoch": 0.06013542594916374, + "grad_norm": 0.2578125, + "learning_rate": 4.9985957735878434e-05, + "loss": 2.8244, + "num_input_tokens_seen": 650117120, + "step": 1240 + }, + { + "epoch": 0.06037790750541037, + "grad_norm": 0.25, + "learning_rate": 4.9985277887125816e-05, + "loss": 2.8115, + "num_input_tokens_seen": 652738560, + "step": 1245 + }, + { + "epoch": 0.060620389061656996, + "grad_norm": 0.248046875, + "learning_rate": 4.99845819726832e-05, + "loss": 2.8152, + "num_input_tokens_seen": 655360000, + "step": 1250 + }, + { + "epoch": 0.060862870617903624, + "grad_norm": 0.244140625, + "learning_rate": 4.998386999299808e-05, + "loss": 2.811, + "num_input_tokens_seen": 657981440, + "step": 1255 + }, + { + "epoch": 0.06110535217415025, + "grad_norm": 0.2412109375, + "learning_rate": 4.998314194852825e-05, + "loss": 2.8287, + "num_input_tokens_seen": 660602880, + "step": 1260 + }, + { + "epoch": 0.06134783373039688, + "grad_norm": 0.2578125, + "learning_rate": 4.998239783974185e-05, + "loss": 2.8147, + "num_input_tokens_seen": 663224320, + "step": 1265 + }, + { + "epoch": 0.06159031528664351, + "grad_norm": 0.25, + "learning_rate": 4.998163766711735e-05, + "loss": 2.8149, + "num_input_tokens_seen": 665845760, + "step": 1270 + }, + { + "epoch": 0.061832796842890135, + "grad_norm": 0.236328125, + "learning_rate": 4.998086143114355e-05, + "loss": 2.817, + "num_input_tokens_seen": 668467200, + "step": 1275 + }, + { + "epoch": 0.06207527839913676, + "grad_norm": 0.2578125, + "learning_rate": 4.998006913231957e-05, + "loss": 2.8326, + "num_input_tokens_seen": 671088640, + "step": 1280 + }, + { + "epoch": 0.06231775995538339, + "grad_norm": 0.2490234375, + "learning_rate": 4.997926077115487e-05, + "loss": 2.816, + "num_input_tokens_seen": 673710080, + "step": 1285 + }, + { + "epoch": 0.06256024151163002, + "grad_norm": 0.25390625, + "learning_rate": 4.997843634816921e-05, + "loss": 2.8116, + "num_input_tokens_seen": 676331520, + "step": 1290 + }, + { + "epoch": 0.06280272306787665, + "grad_norm": 0.26171875, + "learning_rate": 4.9977595863892725e-05, + "loss": 2.8138, + "num_input_tokens_seen": 678952960, + "step": 1295 + }, + { + "epoch": 0.06304520462412327, + "grad_norm": 0.25, + "learning_rate": 4.9976739318865836e-05, + "loss": 2.8286, + "num_input_tokens_seen": 681574400, + "step": 1300 + }, + { + "epoch": 0.06328768618036991, + "grad_norm": 0.25, + "learning_rate": 4.997586671363931e-05, + "loss": 2.8235, + "num_input_tokens_seen": 684195840, + "step": 1305 + }, + { + "epoch": 0.06353016773661653, + "grad_norm": 0.25, + "learning_rate": 4.997497804877423e-05, + "loss": 2.8223, + "num_input_tokens_seen": 686817280, + "step": 1310 + }, + { + "epoch": 0.06377264929286316, + "grad_norm": 0.248046875, + "learning_rate": 4.9974073324842034e-05, + "loss": 2.8338, + "num_input_tokens_seen": 689438720, + "step": 1315 + }, + { + "epoch": 0.06401513084910979, + "grad_norm": 0.263671875, + "learning_rate": 4.997315254242445e-05, + "loss": 2.8224, + "num_input_tokens_seen": 692060160, + "step": 1320 + }, + { + "epoch": 0.06425761240535642, + "grad_norm": 0.2421875, + "learning_rate": 4.997221570211355e-05, + "loss": 2.8266, + "num_input_tokens_seen": 694681600, + "step": 1325 + }, + { + "epoch": 0.06450009396160304, + "grad_norm": 0.25, + "learning_rate": 4.997126280451173e-05, + "loss": 2.8305, + "num_input_tokens_seen": 697303040, + "step": 1330 + }, + { + "epoch": 0.06474257551784968, + "grad_norm": 0.2451171875, + "learning_rate": 4.9970293850231695e-05, + "loss": 2.8248, + "num_input_tokens_seen": 699924480, + "step": 1335 + }, + { + "epoch": 0.0649850570740963, + "grad_norm": 0.2470703125, + "learning_rate": 4.996930883989651e-05, + "loss": 2.8145, + "num_input_tokens_seen": 702545920, + "step": 1340 + }, + { + "epoch": 0.06522753863034293, + "grad_norm": 0.2470703125, + "learning_rate": 4.9968307774139535e-05, + "loss": 2.8303, + "num_input_tokens_seen": 705167360, + "step": 1345 + }, + { + "epoch": 0.06547002018658955, + "grad_norm": 0.2470703125, + "learning_rate": 4.9967290653604454e-05, + "loss": 2.8118, + "num_input_tokens_seen": 707788800, + "step": 1350 + }, + { + "epoch": 0.06571250174283619, + "grad_norm": 0.26953125, + "learning_rate": 4.996625747894529e-05, + "loss": 2.8133, + "num_input_tokens_seen": 710410240, + "step": 1355 + }, + { + "epoch": 0.06595498329908281, + "grad_norm": 0.2578125, + "learning_rate": 4.996520825082638e-05, + "loss": 2.8189, + "num_input_tokens_seen": 713031680, + "step": 1360 + }, + { + "epoch": 0.06619746485532944, + "grad_norm": 0.2431640625, + "learning_rate": 4.996414296992238e-05, + "loss": 2.8219, + "num_input_tokens_seen": 715653120, + "step": 1365 + }, + { + "epoch": 0.06643994641157606, + "grad_norm": 0.2578125, + "learning_rate": 4.9963061636918276e-05, + "loss": 2.819, + "num_input_tokens_seen": 718274560, + "step": 1370 + }, + { + "epoch": 0.0666824279678227, + "grad_norm": 0.25, + "learning_rate": 4.9961964252509367e-05, + "loss": 2.8041, + "num_input_tokens_seen": 720896000, + "step": 1375 + }, + { + "epoch": 0.06692490952406932, + "grad_norm": 0.248046875, + "learning_rate": 4.996085081740128e-05, + "loss": 2.8279, + "num_input_tokens_seen": 723517440, + "step": 1380 + }, + { + "epoch": 0.06716739108031596, + "grad_norm": 0.248046875, + "learning_rate": 4.995972133230997e-05, + "loss": 2.8206, + "num_input_tokens_seen": 726138880, + "step": 1385 + }, + { + "epoch": 0.06740987263656258, + "grad_norm": 0.25, + "learning_rate": 4.99585757979617e-05, + "loss": 2.816, + "num_input_tokens_seen": 728760320, + "step": 1390 + }, + { + "epoch": 0.06765235419280921, + "grad_norm": 0.24609375, + "learning_rate": 4.995741421509305e-05, + "loss": 2.8114, + "num_input_tokens_seen": 731381760, + "step": 1395 + }, + { + "epoch": 0.06789483574905583, + "grad_norm": 0.2470703125, + "learning_rate": 4.995623658445092e-05, + "loss": 2.8284, + "num_input_tokens_seen": 734003200, + "step": 1400 + }, + { + "epoch": 0.06813731730530247, + "grad_norm": 0.248046875, + "learning_rate": 4.995504290679254e-05, + "loss": 2.8262, + "num_input_tokens_seen": 736624640, + "step": 1405 + }, + { + "epoch": 0.06837979886154909, + "grad_norm": 0.2490234375, + "learning_rate": 4.995383318288546e-05, + "loss": 2.8213, + "num_input_tokens_seen": 739246080, + "step": 1410 + }, + { + "epoch": 0.06862228041779572, + "grad_norm": 0.2431640625, + "learning_rate": 4.9952607413507525e-05, + "loss": 2.8199, + "num_input_tokens_seen": 741867520, + "step": 1415 + }, + { + "epoch": 0.06886476197404234, + "grad_norm": 0.24609375, + "learning_rate": 4.995136559944692e-05, + "loss": 2.8199, + "num_input_tokens_seen": 744488960, + "step": 1420 + }, + { + "epoch": 0.06910724353028898, + "grad_norm": 0.259765625, + "learning_rate": 4.9950107741502136e-05, + "loss": 2.8219, + "num_input_tokens_seen": 747110400, + "step": 1425 + }, + { + "epoch": 0.0693497250865356, + "grad_norm": 0.26171875, + "learning_rate": 4.9948833840482e-05, + "loss": 2.8168, + "num_input_tokens_seen": 749731840, + "step": 1430 + }, + { + "epoch": 0.06959220664278223, + "grad_norm": 0.263671875, + "learning_rate": 4.994754389720561e-05, + "loss": 2.83, + "num_input_tokens_seen": 752353280, + "step": 1435 + }, + { + "epoch": 0.06983468819902885, + "grad_norm": 0.25, + "learning_rate": 4.9946237912502435e-05, + "loss": 2.8221, + "num_input_tokens_seen": 754974720, + "step": 1440 + }, + { + "epoch": 0.07007716975527549, + "grad_norm": 0.2578125, + "learning_rate": 4.994491588721221e-05, + "loss": 2.8179, + "num_input_tokens_seen": 757596160, + "step": 1445 + }, + { + "epoch": 0.07031965131152211, + "grad_norm": 0.26171875, + "learning_rate": 4.994357782218502e-05, + "loss": 2.8064, + "num_input_tokens_seen": 760217600, + "step": 1450 + }, + { + "epoch": 0.07056213286776875, + "grad_norm": 0.251953125, + "learning_rate": 4.994222371828124e-05, + "loss": 2.8171, + "num_input_tokens_seen": 762839040, + "step": 1455 + }, + { + "epoch": 0.07080461442401538, + "grad_norm": 0.24609375, + "learning_rate": 4.994085357637157e-05, + "loss": 2.8152, + "num_input_tokens_seen": 765460480, + "step": 1460 + }, + { + "epoch": 0.071047095980262, + "grad_norm": 0.244140625, + "learning_rate": 4.9939467397337025e-05, + "loss": 2.8093, + "num_input_tokens_seen": 768081920, + "step": 1465 + }, + { + "epoch": 0.07128957753650864, + "grad_norm": 0.2451171875, + "learning_rate": 4.9938065182068925e-05, + "loss": 2.8236, + "num_input_tokens_seen": 770703360, + "step": 1470 + }, + { + "epoch": 0.07153205909275526, + "grad_norm": 0.2451171875, + "learning_rate": 4.9936646931468896e-05, + "loss": 2.8191, + "num_input_tokens_seen": 773324800, + "step": 1475 + }, + { + "epoch": 0.07177454064900189, + "grad_norm": 0.255859375, + "learning_rate": 4.9935212646448886e-05, + "loss": 2.8199, + "num_input_tokens_seen": 775946240, + "step": 1480 + }, + { + "epoch": 0.07201702220524851, + "grad_norm": 0.2470703125, + "learning_rate": 4.993376232793115e-05, + "loss": 2.8172, + "num_input_tokens_seen": 778567680, + "step": 1485 + }, + { + "epoch": 0.07225950376149515, + "grad_norm": 0.25390625, + "learning_rate": 4.993229597684825e-05, + "loss": 2.8199, + "num_input_tokens_seen": 781189120, + "step": 1490 + }, + { + "epoch": 0.07250198531774177, + "grad_norm": 0.2578125, + "learning_rate": 4.9930813594143064e-05, + "loss": 2.8152, + "num_input_tokens_seen": 783810560, + "step": 1495 + }, + { + "epoch": 0.0727444668739884, + "grad_norm": 0.251953125, + "learning_rate": 4.9929315180768775e-05, + "loss": 2.8001, + "num_input_tokens_seen": 786432000, + "step": 1500 + }, + { + "epoch": 0.0727444668739884, + "eval_accuracy": 0.45046246539651524, + "eval_loss": 2.7831716537475586, + "eval_runtime": 5.8093, + "eval_samples_per_second": 51.641, + "eval_steps_per_second": 6.541, + "num_input_tokens_seen": 786432000, + "step": 1500 + }, + { + "epoch": 0.07298694843023502, + "grad_norm": 0.26171875, + "learning_rate": 4.992780073768886e-05, + "loss": 2.814, + "num_input_tokens_seen": 789053440, + "step": 1505 + }, + { + "epoch": 0.07322942998648166, + "grad_norm": 0.255859375, + "learning_rate": 4.992627026587713e-05, + "loss": 2.806, + "num_input_tokens_seen": 791674880, + "step": 1510 + }, + { + "epoch": 0.07347191154272828, + "grad_norm": 0.267578125, + "learning_rate": 4.992472376631767e-05, + "loss": 2.8164, + "num_input_tokens_seen": 794296320, + "step": 1515 + }, + { + "epoch": 0.07371439309897491, + "grad_norm": 0.259765625, + "learning_rate": 4.992316124000491e-05, + "loss": 2.8305, + "num_input_tokens_seen": 796917760, + "step": 1520 + }, + { + "epoch": 0.07395687465522154, + "grad_norm": 0.25, + "learning_rate": 4.992158268794355e-05, + "loss": 2.8207, + "num_input_tokens_seen": 799539200, + "step": 1525 + }, + { + "epoch": 0.07419935621146817, + "grad_norm": 0.25, + "learning_rate": 4.991998811114861e-05, + "loss": 2.8167, + "num_input_tokens_seen": 802160640, + "step": 1530 + }, + { + "epoch": 0.07444183776771479, + "grad_norm": 0.2490234375, + "learning_rate": 4.991837751064542e-05, + "loss": 2.7985, + "num_input_tokens_seen": 804782080, + "step": 1535 + }, + { + "epoch": 0.07468431932396143, + "grad_norm": 0.25390625, + "learning_rate": 4.99167508874696e-05, + "loss": 2.8054, + "num_input_tokens_seen": 807403520, + "step": 1540 + }, + { + "epoch": 0.07492680088020805, + "grad_norm": 0.2578125, + "learning_rate": 4.991510824266707e-05, + "loss": 2.8267, + "num_input_tokens_seen": 810024960, + "step": 1545 + }, + { + "epoch": 0.07516928243645468, + "grad_norm": 0.251953125, + "learning_rate": 4.991344957729409e-05, + "loss": 2.824, + "num_input_tokens_seen": 812646400, + "step": 1550 + }, + { + "epoch": 0.0754117639927013, + "grad_norm": 0.26171875, + "learning_rate": 4.991177489241716e-05, + "loss": 2.8155, + "num_input_tokens_seen": 815267840, + "step": 1555 + }, + { + "epoch": 0.07565424554894794, + "grad_norm": 0.2470703125, + "learning_rate": 4.991008418911313e-05, + "loss": 2.8131, + "num_input_tokens_seen": 817889280, + "step": 1560 + }, + { + "epoch": 0.07589672710519456, + "grad_norm": 0.2470703125, + "learning_rate": 4.9908377468469124e-05, + "loss": 2.8172, + "num_input_tokens_seen": 820510720, + "step": 1565 + }, + { + "epoch": 0.0761392086614412, + "grad_norm": 0.2578125, + "learning_rate": 4.990665473158259e-05, + "loss": 2.829, + "num_input_tokens_seen": 823132160, + "step": 1570 + }, + { + "epoch": 0.07638169021768781, + "grad_norm": 0.2578125, + "learning_rate": 4.990491597956124e-05, + "loss": 2.7951, + "num_input_tokens_seen": 825753600, + "step": 1575 + }, + { + "epoch": 0.07662417177393445, + "grad_norm": 0.26171875, + "learning_rate": 4.990316121352312e-05, + "loss": 2.8291, + "num_input_tokens_seen": 828375040, + "step": 1580 + }, + { + "epoch": 0.07686665333018107, + "grad_norm": 0.2470703125, + "learning_rate": 4.990139043459654e-05, + "loss": 2.8227, + "num_input_tokens_seen": 830996480, + "step": 1585 + }, + { + "epoch": 0.0771091348864277, + "grad_norm": 0.2470703125, + "learning_rate": 4.9899603643920126e-05, + "loss": 2.8165, + "num_input_tokens_seen": 833617920, + "step": 1590 + }, + { + "epoch": 0.07735161644267433, + "grad_norm": 0.259765625, + "learning_rate": 4.98978008426428e-05, + "loss": 2.8119, + "num_input_tokens_seen": 836239360, + "step": 1595 + }, + { + "epoch": 0.07759409799892096, + "grad_norm": 0.255859375, + "learning_rate": 4.9895982031923766e-05, + "loss": 2.811, + "num_input_tokens_seen": 838860800, + "step": 1600 + }, + { + "epoch": 0.07783657955516758, + "grad_norm": 0.255859375, + "learning_rate": 4.989414721293254e-05, + "loss": 2.8195, + "num_input_tokens_seen": 841482240, + "step": 1605 + }, + { + "epoch": 0.07807906111141422, + "grad_norm": 0.2451171875, + "learning_rate": 4.989229638684892e-05, + "loss": 2.8394, + "num_input_tokens_seen": 844103680, + "step": 1610 + }, + { + "epoch": 0.07832154266766084, + "grad_norm": 0.25, + "learning_rate": 4.989042955486299e-05, + "loss": 2.8155, + "num_input_tokens_seen": 846725120, + "step": 1615 + }, + { + "epoch": 0.07856402422390747, + "grad_norm": 0.251953125, + "learning_rate": 4.9888546718175154e-05, + "loss": 2.8113, + "num_input_tokens_seen": 849346560, + "step": 1620 + }, + { + "epoch": 0.07880650578015409, + "grad_norm": 0.26171875, + "learning_rate": 4.9886647877996074e-05, + "loss": 2.8253, + "num_input_tokens_seen": 851968000, + "step": 1625 + }, + { + "epoch": 0.07904898733640073, + "grad_norm": 0.263671875, + "learning_rate": 4.988473303554672e-05, + "loss": 2.8264, + "num_input_tokens_seen": 854589440, + "step": 1630 + }, + { + "epoch": 0.07929146889264735, + "grad_norm": 0.25390625, + "learning_rate": 4.988280219205833e-05, + "loss": 2.8174, + "num_input_tokens_seen": 857210880, + "step": 1635 + }, + { + "epoch": 0.07953395044889398, + "grad_norm": 0.2578125, + "learning_rate": 4.988085534877248e-05, + "loss": 2.8017, + "num_input_tokens_seen": 859832320, + "step": 1640 + }, + { + "epoch": 0.0797764320051406, + "grad_norm": 0.2470703125, + "learning_rate": 4.987889250694098e-05, + "loss": 2.8149, + "num_input_tokens_seen": 862453760, + "step": 1645 + }, + { + "epoch": 0.08001891356138724, + "grad_norm": 0.2490234375, + "learning_rate": 4.9876913667825955e-05, + "loss": 2.8148, + "num_input_tokens_seen": 865075200, + "step": 1650 + }, + { + "epoch": 0.08026139511763386, + "grad_norm": 0.255859375, + "learning_rate": 4.987491883269981e-05, + "loss": 2.8222, + "num_input_tokens_seen": 867696640, + "step": 1655 + }, + { + "epoch": 0.0805038766738805, + "grad_norm": 0.251953125, + "learning_rate": 4.987290800284524e-05, + "loss": 2.8074, + "num_input_tokens_seen": 870318080, + "step": 1660 + }, + { + "epoch": 0.08074635823012712, + "grad_norm": 0.25390625, + "learning_rate": 4.987088117955523e-05, + "loss": 2.8234, + "num_input_tokens_seen": 872939520, + "step": 1665 + }, + { + "epoch": 0.08098883978637375, + "grad_norm": 0.25, + "learning_rate": 4.9868838364133016e-05, + "loss": 2.8321, + "num_input_tokens_seen": 875560960, + "step": 1670 + }, + { + "epoch": 0.08123132134262037, + "grad_norm": 0.2451171875, + "learning_rate": 4.986677955789216e-05, + "loss": 2.8036, + "num_input_tokens_seen": 878182400, + "step": 1675 + }, + { + "epoch": 0.081473802898867, + "grad_norm": 0.2451171875, + "learning_rate": 4.9864704762156487e-05, + "loss": 2.8171, + "num_input_tokens_seen": 880803840, + "step": 1680 + }, + { + "epoch": 0.08171628445511363, + "grad_norm": 0.244140625, + "learning_rate": 4.986261397826009e-05, + "loss": 2.8167, + "num_input_tokens_seen": 883425280, + "step": 1685 + }, + { + "epoch": 0.08195876601136026, + "grad_norm": 0.2490234375, + "learning_rate": 4.9860507207547366e-05, + "loss": 2.8094, + "num_input_tokens_seen": 886046720, + "step": 1690 + }, + { + "epoch": 0.08220124756760688, + "grad_norm": 0.25, + "learning_rate": 4.985838445137299e-05, + "loss": 2.7931, + "num_input_tokens_seen": 888668160, + "step": 1695 + }, + { + "epoch": 0.08244372912385352, + "grad_norm": 0.251953125, + "learning_rate": 4.985624571110189e-05, + "loss": 2.8207, + "num_input_tokens_seen": 891289600, + "step": 1700 + }, + { + "epoch": 0.08268621068010014, + "grad_norm": 0.25390625, + "learning_rate": 4.9854090988109294e-05, + "loss": 2.7987, + "num_input_tokens_seen": 893911040, + "step": 1705 + }, + { + "epoch": 0.08292869223634677, + "grad_norm": 0.25390625, + "learning_rate": 4.9851920283780714e-05, + "loss": 2.8237, + "num_input_tokens_seen": 896532480, + "step": 1710 + }, + { + "epoch": 0.0831711737925934, + "grad_norm": 0.2578125, + "learning_rate": 4.984973359951192e-05, + "loss": 2.8, + "num_input_tokens_seen": 899153920, + "step": 1715 + }, + { + "epoch": 0.08341365534884003, + "grad_norm": 0.2451171875, + "learning_rate": 4.984753093670895e-05, + "loss": 2.812, + "num_input_tokens_seen": 901775360, + "step": 1720 + }, + { + "epoch": 0.08365613690508665, + "grad_norm": 0.25390625, + "learning_rate": 4.984531229678815e-05, + "loss": 2.8196, + "num_input_tokens_seen": 904396800, + "step": 1725 + }, + { + "epoch": 0.08389861846133329, + "grad_norm": 0.25390625, + "learning_rate": 4.984307768117611e-05, + "loss": 2.827, + "num_input_tokens_seen": 907018240, + "step": 1730 + }, + { + "epoch": 0.0841411000175799, + "grad_norm": 0.2421875, + "learning_rate": 4.98408270913097e-05, + "loss": 2.8098, + "num_input_tokens_seen": 909639680, + "step": 1735 + }, + { + "epoch": 0.08438358157382654, + "grad_norm": 0.2373046875, + "learning_rate": 4.9838560528636066e-05, + "loss": 2.8144, + "num_input_tokens_seen": 912261120, + "step": 1740 + }, + { + "epoch": 0.08462606313007316, + "grad_norm": 0.251953125, + "learning_rate": 4.983627799461263e-05, + "loss": 2.8083, + "num_input_tokens_seen": 914882560, + "step": 1745 + }, + { + "epoch": 0.0848685446863198, + "grad_norm": 0.2578125, + "learning_rate": 4.9833979490707064e-05, + "loss": 2.8096, + "num_input_tokens_seen": 917504000, + "step": 1750 + }, + { + "epoch": 0.08511102624256643, + "grad_norm": 0.259765625, + "learning_rate": 4.983166501839732e-05, + "loss": 2.8097, + "num_input_tokens_seen": 920125440, + "step": 1755 + }, + { + "epoch": 0.08535350779881305, + "grad_norm": 0.25390625, + "learning_rate": 4.9829334579171626e-05, + "loss": 2.8097, + "num_input_tokens_seen": 922746880, + "step": 1760 + }, + { + "epoch": 0.08559598935505969, + "grad_norm": 0.24609375, + "learning_rate": 4.9826988174528465e-05, + "loss": 2.8175, + "num_input_tokens_seen": 925368320, + "step": 1765 + }, + { + "epoch": 0.08583847091130631, + "grad_norm": 0.2451171875, + "learning_rate": 4.98246258059766e-05, + "loss": 2.8214, + "num_input_tokens_seen": 927989760, + "step": 1770 + }, + { + "epoch": 0.08608095246755294, + "grad_norm": 0.2353515625, + "learning_rate": 4.982224747503503e-05, + "loss": 2.8127, + "num_input_tokens_seen": 930611200, + "step": 1775 + }, + { + "epoch": 0.08632343402379956, + "grad_norm": 0.2578125, + "learning_rate": 4.9819853183233046e-05, + "loss": 2.8065, + "num_input_tokens_seen": 933232640, + "step": 1780 + }, + { + "epoch": 0.0865659155800462, + "grad_norm": 0.248046875, + "learning_rate": 4.9817442932110193e-05, + "loss": 2.8097, + "num_input_tokens_seen": 935854080, + "step": 1785 + }, + { + "epoch": 0.08680839713629282, + "grad_norm": 0.25390625, + "learning_rate": 4.9815016723216273e-05, + "loss": 2.8254, + "num_input_tokens_seen": 938475520, + "step": 1790 + }, + { + "epoch": 0.08705087869253945, + "grad_norm": 0.251953125, + "learning_rate": 4.9812574558111365e-05, + "loss": 2.8086, + "num_input_tokens_seen": 941096960, + "step": 1795 + }, + { + "epoch": 0.08729336024878608, + "grad_norm": 0.2490234375, + "learning_rate": 4.9810116438365784e-05, + "loss": 2.8045, + "num_input_tokens_seen": 943718400, + "step": 1800 + }, + { + "epoch": 0.08729336024878608, + "eval_accuracy": 0.4512392118547468, + "eval_loss": 2.7771894931793213, + "eval_runtime": 5.8369, + "eval_samples_per_second": 51.397, + "eval_steps_per_second": 6.51, + "num_input_tokens_seen": 943718400, + "step": 1800 + }, + { + "epoch": 0.08753584180503271, + "grad_norm": 0.2412109375, + "learning_rate": 4.9807642365560123e-05, + "loss": 2.8076, + "num_input_tokens_seen": 946339840, + "step": 1805 + }, + { + "epoch": 0.08777832336127933, + "grad_norm": 0.244140625, + "learning_rate": 4.980515234128522e-05, + "loss": 2.8131, + "num_input_tokens_seen": 948961280, + "step": 1810 + }, + { + "epoch": 0.08802080491752597, + "grad_norm": 0.2578125, + "learning_rate": 4.980264636714219e-05, + "loss": 2.8187, + "num_input_tokens_seen": 951582720, + "step": 1815 + }, + { + "epoch": 0.08826328647377259, + "grad_norm": 0.2490234375, + "learning_rate": 4.980012444474238e-05, + "loss": 2.8076, + "num_input_tokens_seen": 954204160, + "step": 1820 + }, + { + "epoch": 0.08850576803001922, + "grad_norm": 0.251953125, + "learning_rate": 4.97975865757074e-05, + "loss": 2.8258, + "num_input_tokens_seen": 956825600, + "step": 1825 + }, + { + "epoch": 0.08874824958626584, + "grad_norm": 0.255859375, + "learning_rate": 4.979503276166912e-05, + "loss": 2.81, + "num_input_tokens_seen": 959447040, + "step": 1830 + }, + { + "epoch": 0.08899073114251248, + "grad_norm": 0.263671875, + "learning_rate": 4.979246300426965e-05, + "loss": 2.8154, + "num_input_tokens_seen": 962068480, + "step": 1835 + }, + { + "epoch": 0.0892332126987591, + "grad_norm": 0.267578125, + "learning_rate": 4.978987730516137e-05, + "loss": 2.8244, + "num_input_tokens_seen": 964689920, + "step": 1840 + }, + { + "epoch": 0.08947569425500573, + "grad_norm": 0.2470703125, + "learning_rate": 4.9787275666006904e-05, + "loss": 2.8089, + "num_input_tokens_seen": 967311360, + "step": 1845 + }, + { + "epoch": 0.08971817581125235, + "grad_norm": 0.24609375, + "learning_rate": 4.9784658088479106e-05, + "loss": 2.8155, + "num_input_tokens_seen": 969932800, + "step": 1850 + }, + { + "epoch": 0.08996065736749899, + "grad_norm": 0.251953125, + "learning_rate": 4.978202457426111e-05, + "loss": 2.8164, + "num_input_tokens_seen": 972554240, + "step": 1855 + }, + { + "epoch": 0.09020313892374561, + "grad_norm": 0.2490234375, + "learning_rate": 4.977937512504628e-05, + "loss": 2.7966, + "num_input_tokens_seen": 975175680, + "step": 1860 + }, + { + "epoch": 0.09044562047999224, + "grad_norm": 0.24609375, + "learning_rate": 4.977670974253822e-05, + "loss": 2.8174, + "num_input_tokens_seen": 977797120, + "step": 1865 + }, + { + "epoch": 0.09068810203623887, + "grad_norm": 0.263671875, + "learning_rate": 4.97740284284508e-05, + "loss": 2.8133, + "num_input_tokens_seen": 980418560, + "step": 1870 + }, + { + "epoch": 0.0909305835924855, + "grad_norm": 0.2470703125, + "learning_rate": 4.977133118450811e-05, + "loss": 2.8081, + "num_input_tokens_seen": 983040000, + "step": 1875 + }, + { + "epoch": 0.09117306514873212, + "grad_norm": 0.25390625, + "learning_rate": 4.976861801244449e-05, + "loss": 2.8187, + "num_input_tokens_seen": 985661440, + "step": 1880 + }, + { + "epoch": 0.09141554670497876, + "grad_norm": 0.255859375, + "learning_rate": 4.976588891400455e-05, + "loss": 2.8081, + "num_input_tokens_seen": 988282880, + "step": 1885 + }, + { + "epoch": 0.09165802826122538, + "grad_norm": 0.2490234375, + "learning_rate": 4.97631438909431e-05, + "loss": 2.7958, + "num_input_tokens_seen": 990904320, + "step": 1890 + }, + { + "epoch": 0.09190050981747201, + "grad_norm": 0.24609375, + "learning_rate": 4.97603829450252e-05, + "loss": 2.8202, + "num_input_tokens_seen": 993525760, + "step": 1895 + }, + { + "epoch": 0.09214299137371863, + "grad_norm": 0.279296875, + "learning_rate": 4.975760607802618e-05, + "loss": 2.7975, + "num_input_tokens_seen": 996147200, + "step": 1900 + }, + { + "epoch": 0.09238547292996527, + "grad_norm": 0.2431640625, + "learning_rate": 4.975481329173156e-05, + "loss": 2.8094, + "num_input_tokens_seen": 998768640, + "step": 1905 + }, + { + "epoch": 0.09262795448621189, + "grad_norm": 0.248046875, + "learning_rate": 4.975200458793713e-05, + "loss": 2.8066, + "num_input_tokens_seen": 1001390080, + "step": 1910 + }, + { + "epoch": 0.09287043604245852, + "grad_norm": 0.2451171875, + "learning_rate": 4.97491799684489e-05, + "loss": 2.8147, + "num_input_tokens_seen": 1004011520, + "step": 1915 + }, + { + "epoch": 0.09311291759870514, + "grad_norm": 0.248046875, + "learning_rate": 4.9746339435083124e-05, + "loss": 2.7911, + "num_input_tokens_seen": 1006632960, + "step": 1920 + }, + { + "epoch": 0.09335539915495178, + "grad_norm": 0.2421875, + "learning_rate": 4.9743482989666275e-05, + "loss": 2.822, + "num_input_tokens_seen": 1009254400, + "step": 1925 + }, + { + "epoch": 0.0935978807111984, + "grad_norm": 0.25390625, + "learning_rate": 4.9740610634035064e-05, + "loss": 2.8057, + "num_input_tokens_seen": 1011875840, + "step": 1930 + }, + { + "epoch": 0.09384036226744503, + "grad_norm": 0.248046875, + "learning_rate": 4.973772237003644e-05, + "loss": 2.8066, + "num_input_tokens_seen": 1014497280, + "step": 1935 + }, + { + "epoch": 0.09408284382369166, + "grad_norm": 0.25390625, + "learning_rate": 4.973481819952758e-05, + "loss": 2.8002, + "num_input_tokens_seen": 1017118720, + "step": 1940 + }, + { + "epoch": 0.09432532537993829, + "grad_norm": 0.25, + "learning_rate": 4.973189812437588e-05, + "loss": 2.8146, + "num_input_tokens_seen": 1019740160, + "step": 1945 + }, + { + "epoch": 0.09456780693618491, + "grad_norm": 0.2578125, + "learning_rate": 4.9728962146458956e-05, + "loss": 2.8133, + "num_input_tokens_seen": 1022361600, + "step": 1950 + }, + { + "epoch": 0.09481028849243155, + "grad_norm": 0.255859375, + "learning_rate": 4.9726010267664666e-05, + "loss": 2.7953, + "num_input_tokens_seen": 1024983040, + "step": 1955 + }, + { + "epoch": 0.09505277004867817, + "grad_norm": 0.2451171875, + "learning_rate": 4.972304248989109e-05, + "loss": 2.7998, + "num_input_tokens_seen": 1027604480, + "step": 1960 + }, + { + "epoch": 0.0952952516049248, + "grad_norm": 0.25, + "learning_rate": 4.9720058815046534e-05, + "loss": 2.817, + "num_input_tokens_seen": 1030225920, + "step": 1965 + }, + { + "epoch": 0.09553773316117142, + "grad_norm": 0.25390625, + "learning_rate": 4.9717059245049505e-05, + "loss": 2.8026, + "num_input_tokens_seen": 1032847360, + "step": 1970 + }, + { + "epoch": 0.09578021471741806, + "grad_norm": 0.25, + "learning_rate": 4.9714043781828754e-05, + "loss": 2.8179, + "num_input_tokens_seen": 1035468800, + "step": 1975 + }, + { + "epoch": 0.09602269627366468, + "grad_norm": 0.255859375, + "learning_rate": 4.9711012427323235e-05, + "loss": 2.8098, + "num_input_tokens_seen": 1038090240, + "step": 1980 + }, + { + "epoch": 0.09626517782991131, + "grad_norm": 0.25390625, + "learning_rate": 4.970796518348214e-05, + "loss": 2.8045, + "num_input_tokens_seen": 1040711680, + "step": 1985 + }, + { + "epoch": 0.09650765938615793, + "grad_norm": 0.263671875, + "learning_rate": 4.970490205226486e-05, + "loss": 2.8143, + "num_input_tokens_seen": 1043333120, + "step": 1990 + }, + { + "epoch": 0.09675014094240457, + "grad_norm": 0.2470703125, + "learning_rate": 4.9701823035640994e-05, + "loss": 2.8037, + "num_input_tokens_seen": 1045954560, + "step": 1995 + }, + { + "epoch": 0.09699262249865119, + "grad_norm": 0.23828125, + "learning_rate": 4.9698728135590394e-05, + "loss": 2.8136, + "num_input_tokens_seen": 1048576000, + "step": 2000 + }, + { + "epoch": 0.09723510405489783, + "grad_norm": 0.255859375, + "learning_rate": 4.9695617354103085e-05, + "loss": 2.7976, + "num_input_tokens_seen": 1051197440, + "step": 2005 + }, + { + "epoch": 0.09747758561114445, + "grad_norm": 0.24609375, + "learning_rate": 4.9692490693179324e-05, + "loss": 2.8091, + "num_input_tokens_seen": 1053818880, + "step": 2010 + }, + { + "epoch": 0.09772006716739108, + "grad_norm": 0.25390625, + "learning_rate": 4.968934815482956e-05, + "loss": 2.8106, + "num_input_tokens_seen": 1056440320, + "step": 2015 + }, + { + "epoch": 0.0979625487236377, + "grad_norm": 0.251953125, + "learning_rate": 4.9686189741074494e-05, + "loss": 2.8134, + "num_input_tokens_seen": 1059061760, + "step": 2020 + }, + { + "epoch": 0.09820503027988434, + "grad_norm": 0.248046875, + "learning_rate": 4.968301545394498e-05, + "loss": 2.8143, + "num_input_tokens_seen": 1061683200, + "step": 2025 + }, + { + "epoch": 0.09844751183613096, + "grad_norm": 0.251953125, + "learning_rate": 4.967982529548211e-05, + "loss": 2.806, + "num_input_tokens_seen": 1064304640, + "step": 2030 + }, + { + "epoch": 0.09868999339237759, + "grad_norm": 0.240234375, + "learning_rate": 4.967661926773718e-05, + "loss": 2.8234, + "num_input_tokens_seen": 1066926080, + "step": 2035 + }, + { + "epoch": 0.09893247494862423, + "grad_norm": 0.26171875, + "learning_rate": 4.967339737277169e-05, + "loss": 2.7927, + "num_input_tokens_seen": 1069547520, + "step": 2040 + }, + { + "epoch": 0.09917495650487085, + "grad_norm": 0.244140625, + "learning_rate": 4.967015961265732e-05, + "loss": 2.805, + "num_input_tokens_seen": 1072168960, + "step": 2045 + }, + { + "epoch": 0.09941743806111748, + "grad_norm": 0.248046875, + "learning_rate": 4.9666905989475995e-05, + "loss": 2.802, + "num_input_tokens_seen": 1074790400, + "step": 2050 + }, + { + "epoch": 0.0996599196173641, + "grad_norm": 0.2470703125, + "learning_rate": 4.96636365053198e-05, + "loss": 2.8156, + "num_input_tokens_seen": 1077411840, + "step": 2055 + }, + { + "epoch": 0.09990240117361074, + "grad_norm": 0.251953125, + "learning_rate": 4.966035116229103e-05, + "loss": 2.8105, + "num_input_tokens_seen": 1080033280, + "step": 2060 + }, + { + "epoch": 0.10014488272985736, + "grad_norm": 0.2490234375, + "learning_rate": 4.9657049962502196e-05, + "loss": 2.8111, + "num_input_tokens_seen": 1082654720, + "step": 2065 + }, + { + "epoch": 0.100387364286104, + "grad_norm": 0.255859375, + "learning_rate": 4.965373290807598e-05, + "loss": 2.8031, + "num_input_tokens_seen": 1085276160, + "step": 2070 + }, + { + "epoch": 0.10062984584235062, + "grad_norm": 0.251953125, + "learning_rate": 4.9650400001145265e-05, + "loss": 2.8049, + "num_input_tokens_seen": 1087897600, + "step": 2075 + }, + { + "epoch": 0.10087232739859725, + "grad_norm": 0.25, + "learning_rate": 4.9647051243853135e-05, + "loss": 2.8112, + "num_input_tokens_seen": 1090519040, + "step": 2080 + }, + { + "epoch": 0.10111480895484387, + "grad_norm": 0.2431640625, + "learning_rate": 4.964368663835288e-05, + "loss": 2.8095, + "num_input_tokens_seen": 1093140480, + "step": 2085 + }, + { + "epoch": 0.1013572905110905, + "grad_norm": 0.244140625, + "learning_rate": 4.964030618680793e-05, + "loss": 2.8092, + "num_input_tokens_seen": 1095761920, + "step": 2090 + }, + { + "epoch": 0.10159977206733713, + "grad_norm": 0.2392578125, + "learning_rate": 4.963690989139196e-05, + "loss": 2.8094, + "num_input_tokens_seen": 1098383360, + "step": 2095 + }, + { + "epoch": 0.10184225362358376, + "grad_norm": 0.255859375, + "learning_rate": 4.96334977542888e-05, + "loss": 2.8019, + "num_input_tokens_seen": 1101004800, + "step": 2100 + }, + { + "epoch": 0.10184225362358376, + "eval_accuracy": 0.4515551213157466, + "eval_loss": 2.772890329360962, + "eval_runtime": 5.8572, + "eval_samples_per_second": 51.219, + "eval_steps_per_second": 6.488, + "num_input_tokens_seen": 1101004800, + "step": 2100 + }, + { + "epoch": 0.10208473517983038, + "grad_norm": 0.2451171875, + "learning_rate": 4.963006977769248e-05, + "loss": 2.7996, + "num_input_tokens_seen": 1103626240, + "step": 2105 + }, + { + "epoch": 0.10232721673607702, + "grad_norm": 0.2451171875, + "learning_rate": 4.9626625963807205e-05, + "loss": 2.8103, + "num_input_tokens_seen": 1106247680, + "step": 2110 + }, + { + "epoch": 0.10256969829232364, + "grad_norm": 0.25390625, + "learning_rate": 4.962316631484737e-05, + "loss": 2.7967, + "num_input_tokens_seen": 1108869120, + "step": 2115 + }, + { + "epoch": 0.10281217984857027, + "grad_norm": 0.255859375, + "learning_rate": 4.9619690833037545e-05, + "loss": 2.8031, + "num_input_tokens_seen": 1111490560, + "step": 2120 + }, + { + "epoch": 0.1030546614048169, + "grad_norm": 0.251953125, + "learning_rate": 4.96161995206125e-05, + "loss": 2.8048, + "num_input_tokens_seen": 1114112000, + "step": 2125 + }, + { + "epoch": 0.10329714296106353, + "grad_norm": 0.25390625, + "learning_rate": 4.9612692379817175e-05, + "loss": 2.8085, + "num_input_tokens_seen": 1116733440, + "step": 2130 + }, + { + "epoch": 0.10353962451731015, + "grad_norm": 0.263671875, + "learning_rate": 4.960916941290666e-05, + "loss": 2.8057, + "num_input_tokens_seen": 1119354880, + "step": 2135 + }, + { + "epoch": 0.10378210607355678, + "grad_norm": 0.2578125, + "learning_rate": 4.960563062214627e-05, + "loss": 2.7938, + "num_input_tokens_seen": 1121976320, + "step": 2140 + }, + { + "epoch": 0.1040245876298034, + "grad_norm": 0.267578125, + "learning_rate": 4.960207600981145e-05, + "loss": 2.8095, + "num_input_tokens_seen": 1124597760, + "step": 2145 + }, + { + "epoch": 0.10426706918605004, + "grad_norm": 0.2490234375, + "learning_rate": 4.9598505578187844e-05, + "loss": 2.8159, + "num_input_tokens_seen": 1127219200, + "step": 2150 + }, + { + "epoch": 0.10450955074229666, + "grad_norm": 0.259765625, + "learning_rate": 4.9594919329571264e-05, + "loss": 2.7916, + "num_input_tokens_seen": 1129840640, + "step": 2155 + }, + { + "epoch": 0.1047520322985433, + "grad_norm": 0.25390625, + "learning_rate": 4.959131726626769e-05, + "loss": 2.804, + "num_input_tokens_seen": 1132462080, + "step": 2160 + }, + { + "epoch": 0.10499451385478992, + "grad_norm": 0.248046875, + "learning_rate": 4.9587699390593276e-05, + "loss": 2.805, + "num_input_tokens_seen": 1135083520, + "step": 2165 + }, + { + "epoch": 0.10523699541103655, + "grad_norm": 0.259765625, + "learning_rate": 4.9584065704874326e-05, + "loss": 2.8, + "num_input_tokens_seen": 1137704960, + "step": 2170 + }, + { + "epoch": 0.10547947696728317, + "grad_norm": 0.26953125, + "learning_rate": 4.9580416211447336e-05, + "loss": 2.7925, + "num_input_tokens_seen": 1140326400, + "step": 2175 + }, + { + "epoch": 0.10572195852352981, + "grad_norm": 0.2578125, + "learning_rate": 4.9576750912658945e-05, + "loss": 2.7941, + "num_input_tokens_seen": 1142947840, + "step": 2180 + }, + { + "epoch": 0.10596444007977643, + "grad_norm": 0.251953125, + "learning_rate": 4.957306981086596e-05, + "loss": 2.8094, + "num_input_tokens_seen": 1145569280, + "step": 2185 + }, + { + "epoch": 0.10620692163602306, + "grad_norm": 0.2451171875, + "learning_rate": 4.9569372908435365e-05, + "loss": 2.7984, + "num_input_tokens_seen": 1148190720, + "step": 2190 + }, + { + "epoch": 0.10644940319226968, + "grad_norm": 0.244140625, + "learning_rate": 4.956566020774428e-05, + "loss": 2.8152, + "num_input_tokens_seen": 1150812160, + "step": 2195 + }, + { + "epoch": 0.10669188474851632, + "grad_norm": 0.26171875, + "learning_rate": 4.956193171118e-05, + "loss": 2.8166, + "num_input_tokens_seen": 1153433600, + "step": 2200 + }, + { + "epoch": 0.10693436630476294, + "grad_norm": 0.2451171875, + "learning_rate": 4.955818742113997e-05, + "loss": 2.7986, + "num_input_tokens_seen": 1156055040, + "step": 2205 + }, + { + "epoch": 0.10717684786100957, + "grad_norm": 0.251953125, + "learning_rate": 4.95544273400318e-05, + "loss": 2.8067, + "num_input_tokens_seen": 1158676480, + "step": 2210 + }, + { + "epoch": 0.1074193294172562, + "grad_norm": 0.240234375, + "learning_rate": 4.955065147027323e-05, + "loss": 2.8053, + "num_input_tokens_seen": 1161297920, + "step": 2215 + }, + { + "epoch": 0.10766181097350283, + "grad_norm": 0.259765625, + "learning_rate": 4.954685981429218e-05, + "loss": 2.7979, + "num_input_tokens_seen": 1163919360, + "step": 2220 + }, + { + "epoch": 0.10790429252974945, + "grad_norm": 0.248046875, + "learning_rate": 4.95430523745267e-05, + "loss": 2.7966, + "num_input_tokens_seen": 1166540800, + "step": 2225 + }, + { + "epoch": 0.10814677408599609, + "grad_norm": 0.25390625, + "learning_rate": 4.9539229153425e-05, + "loss": 2.8017, + "num_input_tokens_seen": 1169162240, + "step": 2230 + }, + { + "epoch": 0.10838925564224271, + "grad_norm": 0.255859375, + "learning_rate": 4.953539015344545e-05, + "loss": 2.801, + "num_input_tokens_seen": 1171783680, + "step": 2235 + }, + { + "epoch": 0.10863173719848934, + "grad_norm": 0.2578125, + "learning_rate": 4.953153537705653e-05, + "loss": 2.802, + "num_input_tokens_seen": 1174405120, + "step": 2240 + }, + { + "epoch": 0.10887421875473596, + "grad_norm": 0.244140625, + "learning_rate": 4.952766482673689e-05, + "loss": 2.8066, + "num_input_tokens_seen": 1177026560, + "step": 2245 + }, + { + "epoch": 0.1091167003109826, + "grad_norm": 0.25, + "learning_rate": 4.952377850497533e-05, + "loss": 2.8077, + "num_input_tokens_seen": 1179648000, + "step": 2250 + }, + { + "epoch": 0.10935918186722922, + "grad_norm": 0.25390625, + "learning_rate": 4.951987641427076e-05, + "loss": 2.7906, + "num_input_tokens_seen": 1182269440, + "step": 2255 + }, + { + "epoch": 0.10960166342347585, + "grad_norm": 0.259765625, + "learning_rate": 4.951595855713227e-05, + "loss": 2.8113, + "num_input_tokens_seen": 1184890880, + "step": 2260 + }, + { + "epoch": 0.10984414497972247, + "grad_norm": 0.267578125, + "learning_rate": 4.951202493607905e-05, + "loss": 2.8124, + "num_input_tokens_seen": 1187512320, + "step": 2265 + }, + { + "epoch": 0.11008662653596911, + "grad_norm": 0.255859375, + "learning_rate": 4.950807555364045e-05, + "loss": 2.7866, + "num_input_tokens_seen": 1190133760, + "step": 2270 + }, + { + "epoch": 0.11032910809221573, + "grad_norm": 0.25, + "learning_rate": 4.9504110412355954e-05, + "loss": 2.8151, + "num_input_tokens_seen": 1192755200, + "step": 2275 + }, + { + "epoch": 0.11057158964846237, + "grad_norm": 0.263671875, + "learning_rate": 4.950012951477516e-05, + "loss": 2.7983, + "num_input_tokens_seen": 1195376640, + "step": 2280 + }, + { + "epoch": 0.11081407120470899, + "grad_norm": 0.263671875, + "learning_rate": 4.9496132863457813e-05, + "loss": 2.8271, + "num_input_tokens_seen": 1197998080, + "step": 2285 + }, + { + "epoch": 0.11105655276095562, + "grad_norm": 0.248046875, + "learning_rate": 4.949212046097379e-05, + "loss": 2.799, + "num_input_tokens_seen": 1200619520, + "step": 2290 + }, + { + "epoch": 0.11129903431720224, + "grad_norm": 0.25390625, + "learning_rate": 4.948809230990309e-05, + "loss": 2.8016, + "num_input_tokens_seen": 1203240960, + "step": 2295 + }, + { + "epoch": 0.11154151587344888, + "grad_norm": 0.2412109375, + "learning_rate": 4.9484048412835836e-05, + "loss": 2.8, + "num_input_tokens_seen": 1205862400, + "step": 2300 + }, + { + "epoch": 0.1117839974296955, + "grad_norm": 0.255859375, + "learning_rate": 4.947998877237228e-05, + "loss": 2.8049, + "num_input_tokens_seen": 1208483840, + "step": 2305 + }, + { + "epoch": 0.11202647898594213, + "grad_norm": 0.25, + "learning_rate": 4.94759133911228e-05, + "loss": 2.8028, + "num_input_tokens_seen": 1211105280, + "step": 2310 + }, + { + "epoch": 0.11226896054218875, + "grad_norm": 0.251953125, + "learning_rate": 4.947182227170788e-05, + "loss": 2.8036, + "num_input_tokens_seen": 1213726720, + "step": 2315 + }, + { + "epoch": 0.11251144209843539, + "grad_norm": 0.24609375, + "learning_rate": 4.9467715416758155e-05, + "loss": 2.8069, + "num_input_tokens_seen": 1216348160, + "step": 2320 + }, + { + "epoch": 0.11275392365468201, + "grad_norm": 0.259765625, + "learning_rate": 4.946359282891434e-05, + "loss": 2.7988, + "num_input_tokens_seen": 1218969600, + "step": 2325 + }, + { + "epoch": 0.11299640521092864, + "grad_norm": 0.25, + "learning_rate": 4.945945451082729e-05, + "loss": 2.8013, + "num_input_tokens_seen": 1221591040, + "step": 2330 + }, + { + "epoch": 0.11323888676717528, + "grad_norm": 0.248046875, + "learning_rate": 4.9455300465157976e-05, + "loss": 2.8114, + "num_input_tokens_seen": 1224212480, + "step": 2335 + }, + { + "epoch": 0.1134813683234219, + "grad_norm": 0.2451171875, + "learning_rate": 4.945113069457747e-05, + "loss": 2.8018, + "num_input_tokens_seen": 1226833920, + "step": 2340 + }, + { + "epoch": 0.11372384987966853, + "grad_norm": 0.248046875, + "learning_rate": 4.944694520176697e-05, + "loss": 2.8051, + "num_input_tokens_seen": 1229455360, + "step": 2345 + }, + { + "epoch": 0.11396633143591516, + "grad_norm": 0.26171875, + "learning_rate": 4.944274398941775e-05, + "loss": 2.7898, + "num_input_tokens_seen": 1232076800, + "step": 2350 + }, + { + "epoch": 0.11420881299216179, + "grad_norm": 0.251953125, + "learning_rate": 4.9438527060231244e-05, + "loss": 2.7976, + "num_input_tokens_seen": 1234698240, + "step": 2355 + }, + { + "epoch": 0.11445129454840841, + "grad_norm": 0.26171875, + "learning_rate": 4.943429441691894e-05, + "loss": 2.8107, + "num_input_tokens_seen": 1237319680, + "step": 2360 + }, + { + "epoch": 0.11469377610465505, + "grad_norm": 0.255859375, + "learning_rate": 4.943004606220247e-05, + "loss": 2.7924, + "num_input_tokens_seen": 1239941120, + "step": 2365 + }, + { + "epoch": 0.11493625766090167, + "grad_norm": 0.26171875, + "learning_rate": 4.942578199881355e-05, + "loss": 2.7947, + "num_input_tokens_seen": 1242562560, + "step": 2370 + }, + { + "epoch": 0.1151787392171483, + "grad_norm": 0.2490234375, + "learning_rate": 4.9421502229494e-05, + "loss": 2.7961, + "num_input_tokens_seen": 1245184000, + "step": 2375 + }, + { + "epoch": 0.11542122077339492, + "grad_norm": 0.2421875, + "learning_rate": 4.941720675699573e-05, + "loss": 2.7909, + "num_input_tokens_seen": 1247805440, + "step": 2380 + }, + { + "epoch": 0.11566370232964156, + "grad_norm": 0.251953125, + "learning_rate": 4.9412895584080766e-05, + "loss": 2.8032, + "num_input_tokens_seen": 1250426880, + "step": 2385 + }, + { + "epoch": 0.11590618388588818, + "grad_norm": 0.2490234375, + "learning_rate": 4.940856871352121e-05, + "loss": 2.8052, + "num_input_tokens_seen": 1253048320, + "step": 2390 + }, + { + "epoch": 0.11614866544213481, + "grad_norm": 0.2451171875, + "learning_rate": 4.9404226148099274e-05, + "loss": 2.7942, + "num_input_tokens_seen": 1255669760, + "step": 2395 + }, + { + "epoch": 0.11639114699838143, + "grad_norm": 0.25, + "learning_rate": 4.9399867890607254e-05, + "loss": 2.7995, + "num_input_tokens_seen": 1258291200, + "step": 2400 + }, + { + "epoch": 0.11639114699838143, + "eval_accuracy": 0.4522162514248494, + "eval_loss": 2.7690587043762207, + "eval_runtime": 6.2539, + "eval_samples_per_second": 47.97, + "eval_steps_per_second": 6.076, + "num_input_tokens_seen": 1258291200, + "step": 2400 + }, + { + "epoch": 0.11663362855462807, + "grad_norm": 0.25390625, + "learning_rate": 4.939549394384754e-05, + "loss": 2.7948, + "num_input_tokens_seen": 1260912640, + "step": 2405 + }, + { + "epoch": 0.11687611011087469, + "grad_norm": 0.255859375, + "learning_rate": 4.939110431063258e-05, + "loss": 2.8082, + "num_input_tokens_seen": 1263534080, + "step": 2410 + }, + { + "epoch": 0.11711859166712132, + "grad_norm": 0.25, + "learning_rate": 4.9386698993784984e-05, + "loss": 2.7991, + "num_input_tokens_seen": 1266155520, + "step": 2415 + }, + { + "epoch": 0.11736107322336795, + "grad_norm": 0.2470703125, + "learning_rate": 4.938227799613736e-05, + "loss": 2.8032, + "num_input_tokens_seen": 1268776960, + "step": 2420 + }, + { + "epoch": 0.11760355477961458, + "grad_norm": 0.259765625, + "learning_rate": 4.937784132053245e-05, + "loss": 2.8013, + "num_input_tokens_seen": 1271398400, + "step": 2425 + }, + { + "epoch": 0.1178460363358612, + "grad_norm": 0.2578125, + "learning_rate": 4.937338896982306e-05, + "loss": 2.8085, + "num_input_tokens_seen": 1274019840, + "step": 2430 + }, + { + "epoch": 0.11808851789210784, + "grad_norm": 0.248046875, + "learning_rate": 4.936892094687209e-05, + "loss": 2.8037, + "num_input_tokens_seen": 1276641280, + "step": 2435 + }, + { + "epoch": 0.11833099944835446, + "grad_norm": 0.251953125, + "learning_rate": 4.9364437254552495e-05, + "loss": 2.8014, + "num_input_tokens_seen": 1279262720, + "step": 2440 + }, + { + "epoch": 0.11857348100460109, + "grad_norm": 0.2451171875, + "learning_rate": 4.935993789574733e-05, + "loss": 2.7895, + "num_input_tokens_seen": 1281884160, + "step": 2445 + }, + { + "epoch": 0.11881596256084771, + "grad_norm": 0.251953125, + "learning_rate": 4.93554228733497e-05, + "loss": 2.7933, + "num_input_tokens_seen": 1284505600, + "step": 2450 + }, + { + "epoch": 0.11905844411709435, + "grad_norm": 0.2578125, + "learning_rate": 4.935089219026279e-05, + "loss": 2.8195, + "num_input_tokens_seen": 1287127040, + "step": 2455 + }, + { + "epoch": 0.11930092567334097, + "grad_norm": 0.2578125, + "learning_rate": 4.9346345849399864e-05, + "loss": 2.8013, + "num_input_tokens_seen": 1289748480, + "step": 2460 + }, + { + "epoch": 0.1195434072295876, + "grad_norm": 0.248046875, + "learning_rate": 4.9341783853684246e-05, + "loss": 2.8148, + "num_input_tokens_seen": 1292369920, + "step": 2465 + }, + { + "epoch": 0.11978588878583422, + "grad_norm": 0.255859375, + "learning_rate": 4.9337206206049325e-05, + "loss": 2.8012, + "num_input_tokens_seen": 1294991360, + "step": 2470 + }, + { + "epoch": 0.12002837034208086, + "grad_norm": 0.25390625, + "learning_rate": 4.933261290943856e-05, + "loss": 2.7881, + "num_input_tokens_seen": 1297612800, + "step": 2475 + }, + { + "epoch": 0.12027085189832748, + "grad_norm": 0.251953125, + "learning_rate": 4.932800396680548e-05, + "loss": 2.7861, + "num_input_tokens_seen": 1300234240, + "step": 2480 + }, + { + "epoch": 0.12051333345457411, + "grad_norm": 0.2451171875, + "learning_rate": 4.9323379381113644e-05, + "loss": 2.7839, + "num_input_tokens_seen": 1302855680, + "step": 2485 + }, + { + "epoch": 0.12075581501082074, + "grad_norm": 0.25390625, + "learning_rate": 4.93187391553367e-05, + "loss": 2.793, + "num_input_tokens_seen": 1305477120, + "step": 2490 + }, + { + "epoch": 0.12099829656706737, + "grad_norm": 0.255859375, + "learning_rate": 4.931408329245835e-05, + "loss": 2.8094, + "num_input_tokens_seen": 1308098560, + "step": 2495 + }, + { + "epoch": 0.12124077812331399, + "grad_norm": 0.251953125, + "learning_rate": 4.9309411795472327e-05, + "loss": 2.7796, + "num_input_tokens_seen": 1310720000, + "step": 2500 + }, + { + "epoch": 0.12148325967956063, + "grad_norm": 0.255859375, + "learning_rate": 4.930472466738244e-05, + "loss": 2.7933, + "num_input_tokens_seen": 1313341440, + "step": 2505 + }, + { + "epoch": 0.12172574123580725, + "grad_norm": 0.251953125, + "learning_rate": 4.930002191120254e-05, + "loss": 2.7996, + "num_input_tokens_seen": 1315962880, + "step": 2510 + }, + { + "epoch": 0.12196822279205388, + "grad_norm": 0.255859375, + "learning_rate": 4.9295303529956535e-05, + "loss": 2.7956, + "num_input_tokens_seen": 1318584320, + "step": 2515 + }, + { + "epoch": 0.1222107043483005, + "grad_norm": 0.255859375, + "learning_rate": 4.929056952667838e-05, + "loss": 2.801, + "num_input_tokens_seen": 1321205760, + "step": 2520 + }, + { + "epoch": 0.12245318590454714, + "grad_norm": 0.248046875, + "learning_rate": 4.928581990441204e-05, + "loss": 2.7956, + "num_input_tokens_seen": 1323827200, + "step": 2525 + }, + { + "epoch": 0.12269566746079376, + "grad_norm": 0.244140625, + "learning_rate": 4.928105466621157e-05, + "loss": 2.803, + "num_input_tokens_seen": 1326448640, + "step": 2530 + }, + { + "epoch": 0.1229381490170404, + "grad_norm": 0.25, + "learning_rate": 4.927627381514106e-05, + "loss": 2.8075, + "num_input_tokens_seen": 1329070080, + "step": 2535 + }, + { + "epoch": 0.12318063057328701, + "grad_norm": 0.2578125, + "learning_rate": 4.927147735427461e-05, + "loss": 2.8163, + "num_input_tokens_seen": 1331691520, + "step": 2540 + }, + { + "epoch": 0.12342311212953365, + "grad_norm": 0.25390625, + "learning_rate": 4.926666528669637e-05, + "loss": 2.8067, + "num_input_tokens_seen": 1334312960, + "step": 2545 + }, + { + "epoch": 0.12366559368578027, + "grad_norm": 0.255859375, + "learning_rate": 4.926183761550055e-05, + "loss": 2.7921, + "num_input_tokens_seen": 1336934400, + "step": 2550 + }, + { + "epoch": 0.1239080752420269, + "grad_norm": 0.259765625, + "learning_rate": 4.925699434379136e-05, + "loss": 2.8004, + "num_input_tokens_seen": 1339555840, + "step": 2555 + }, + { + "epoch": 0.12415055679827353, + "grad_norm": 0.263671875, + "learning_rate": 4.925213547468305e-05, + "loss": 2.8096, + "num_input_tokens_seen": 1342177280, + "step": 2560 + }, + { + "epoch": 0.12439303835452016, + "grad_norm": 0.2490234375, + "learning_rate": 4.924726101129991e-05, + "loss": 2.8016, + "num_input_tokens_seen": 1344798720, + "step": 2565 + }, + { + "epoch": 0.12463551991076678, + "grad_norm": 0.255859375, + "learning_rate": 4.924237095677625e-05, + "loss": 2.7967, + "num_input_tokens_seen": 1347420160, + "step": 2570 + }, + { + "epoch": 0.12487800146701342, + "grad_norm": 0.255859375, + "learning_rate": 4.923746531425641e-05, + "loss": 2.811, + "num_input_tokens_seen": 1350041600, + "step": 2575 + }, + { + "epoch": 0.12512048302326004, + "grad_norm": 0.2578125, + "learning_rate": 4.923254408689474e-05, + "loss": 2.8002, + "num_input_tokens_seen": 1352663040, + "step": 2580 + }, + { + "epoch": 0.12536296457950666, + "grad_norm": 0.25, + "learning_rate": 4.922760727785563e-05, + "loss": 2.8053, + "num_input_tokens_seen": 1355284480, + "step": 2585 + }, + { + "epoch": 0.1256054461357533, + "grad_norm": 0.2490234375, + "learning_rate": 4.922265489031346e-05, + "loss": 2.8038, + "num_input_tokens_seen": 1357905920, + "step": 2590 + }, + { + "epoch": 0.12584792769199993, + "grad_norm": 0.2470703125, + "learning_rate": 4.9217686927452664e-05, + "loss": 2.7968, + "num_input_tokens_seen": 1360527360, + "step": 2595 + }, + { + "epoch": 0.12609040924824655, + "grad_norm": 0.24609375, + "learning_rate": 4.9212703392467667e-05, + "loss": 2.7843, + "num_input_tokens_seen": 1363148800, + "step": 2600 + }, + { + "epoch": 0.12633289080449317, + "grad_norm": 0.25390625, + "learning_rate": 4.920770428856292e-05, + "loss": 2.8016, + "num_input_tokens_seen": 1365770240, + "step": 2605 + }, + { + "epoch": 0.12657537236073982, + "grad_norm": 0.26171875, + "learning_rate": 4.9202689618952866e-05, + "loss": 2.7902, + "num_input_tokens_seen": 1368391680, + "step": 2610 + }, + { + "epoch": 0.12681785391698644, + "grad_norm": 0.2578125, + "learning_rate": 4.9197659386861976e-05, + "loss": 2.8136, + "num_input_tokens_seen": 1371013120, + "step": 2615 + }, + { + "epoch": 0.12706033547323306, + "grad_norm": 0.2470703125, + "learning_rate": 4.9192613595524724e-05, + "loss": 2.8071, + "num_input_tokens_seen": 1373634560, + "step": 2620 + }, + { + "epoch": 0.1273028170294797, + "grad_norm": 0.2578125, + "learning_rate": 4.918755224818558e-05, + "loss": 2.8067, + "num_input_tokens_seen": 1376256000, + "step": 2625 + }, + { + "epoch": 0.12754529858572633, + "grad_norm": 0.244140625, + "learning_rate": 4.918247534809902e-05, + "loss": 2.7912, + "num_input_tokens_seen": 1378877440, + "step": 2630 + }, + { + "epoch": 0.12778778014197295, + "grad_norm": 0.2421875, + "learning_rate": 4.9177382898529534e-05, + "loss": 2.7971, + "num_input_tokens_seen": 1381498880, + "step": 2635 + }, + { + "epoch": 0.12803026169821957, + "grad_norm": 0.25390625, + "learning_rate": 4.917227490275158e-05, + "loss": 2.7934, + "num_input_tokens_seen": 1384120320, + "step": 2640 + }, + { + "epoch": 0.12827274325446622, + "grad_norm": 0.263671875, + "learning_rate": 4.916715136404964e-05, + "loss": 2.7972, + "num_input_tokens_seen": 1386741760, + "step": 2645 + }, + { + "epoch": 0.12851522481071284, + "grad_norm": 0.2578125, + "learning_rate": 4.91620122857182e-05, + "loss": 2.8053, + "num_input_tokens_seen": 1389363200, + "step": 2650 + }, + { + "epoch": 0.12875770636695946, + "grad_norm": 0.2578125, + "learning_rate": 4.9156857671061696e-05, + "loss": 2.7921, + "num_input_tokens_seen": 1391984640, + "step": 2655 + }, + { + "epoch": 0.12900018792320608, + "grad_norm": 0.255859375, + "learning_rate": 4.9151687523394584e-05, + "loss": 2.8009, + "num_input_tokens_seen": 1394606080, + "step": 2660 + }, + { + "epoch": 0.12924266947945273, + "grad_norm": 0.255859375, + "learning_rate": 4.91465018460413e-05, + "loss": 2.8027, + "num_input_tokens_seen": 1397227520, + "step": 2665 + }, + { + "epoch": 0.12948515103569935, + "grad_norm": 0.259765625, + "learning_rate": 4.914130064233627e-05, + "loss": 2.8085, + "num_input_tokens_seen": 1399848960, + "step": 2670 + }, + { + "epoch": 0.12972763259194597, + "grad_norm": 0.251953125, + "learning_rate": 4.91360839156239e-05, + "loss": 2.7943, + "num_input_tokens_seen": 1402470400, + "step": 2675 + }, + { + "epoch": 0.1299701141481926, + "grad_norm": 0.248046875, + "learning_rate": 4.9130851669258574e-05, + "loss": 2.8179, + "num_input_tokens_seen": 1405091840, + "step": 2680 + }, + { + "epoch": 0.13021259570443924, + "grad_norm": 0.25390625, + "learning_rate": 4.9125603906604664e-05, + "loss": 2.8101, + "num_input_tokens_seen": 1407713280, + "step": 2685 + }, + { + "epoch": 0.13045507726068586, + "grad_norm": 0.251953125, + "learning_rate": 4.912034063103651e-05, + "loss": 2.7838, + "num_input_tokens_seen": 1410334720, + "step": 2690 + }, + { + "epoch": 0.13069755881693249, + "grad_norm": 0.251953125, + "learning_rate": 4.911506184593844e-05, + "loss": 2.8124, + "num_input_tokens_seen": 1412956160, + "step": 2695 + }, + { + "epoch": 0.1309400403731791, + "grad_norm": 0.271484375, + "learning_rate": 4.910976755470473e-05, + "loss": 2.8006, + "num_input_tokens_seen": 1415577600, + "step": 2700 + }, + { + "epoch": 0.1309400403731791, + "eval_accuracy": 0.4526038104543234, + "eval_loss": 2.7656688690185547, + "eval_runtime": 5.786, + "eval_samples_per_second": 51.849, + "eval_steps_per_second": 6.568, + "num_input_tokens_seen": 1415577600, + "step": 2700 + }, + { + "epoch": 0.13118252192942575, + "grad_norm": 0.2578125, + "learning_rate": 4.910445776073966e-05, + "loss": 2.796, + "num_input_tokens_seen": 1418199040, + "step": 2705 + }, + { + "epoch": 0.13142500348567238, + "grad_norm": 0.26171875, + "learning_rate": 4.909913246745745e-05, + "loss": 2.805, + "num_input_tokens_seen": 1420820480, + "step": 2710 + }, + { + "epoch": 0.131667485041919, + "grad_norm": 0.251953125, + "learning_rate": 4.909379167828231e-05, + "loss": 2.7952, + "num_input_tokens_seen": 1423441920, + "step": 2715 + }, + { + "epoch": 0.13190996659816562, + "grad_norm": 0.25, + "learning_rate": 4.9088435396648383e-05, + "loss": 2.8165, + "num_input_tokens_seen": 1426063360, + "step": 2720 + }, + { + "epoch": 0.13215244815441227, + "grad_norm": 0.271484375, + "learning_rate": 4.908306362599979e-05, + "loss": 2.7988, + "num_input_tokens_seen": 1428684800, + "step": 2725 + }, + { + "epoch": 0.1323949297106589, + "grad_norm": 0.25, + "learning_rate": 4.907767636979063e-05, + "loss": 2.7978, + "num_input_tokens_seen": 1431306240, + "step": 2730 + }, + { + "epoch": 0.1326374112669055, + "grad_norm": 0.255859375, + "learning_rate": 4.907227363148493e-05, + "loss": 2.8041, + "num_input_tokens_seen": 1433927680, + "step": 2735 + }, + { + "epoch": 0.13287989282315213, + "grad_norm": 0.2578125, + "learning_rate": 4.90668554145567e-05, + "loss": 2.8011, + "num_input_tokens_seen": 1436549120, + "step": 2740 + }, + { + "epoch": 0.13312237437939878, + "grad_norm": 0.25390625, + "learning_rate": 4.9061421722489866e-05, + "loss": 2.8083, + "num_input_tokens_seen": 1439170560, + "step": 2745 + }, + { + "epoch": 0.1333648559356454, + "grad_norm": 0.2578125, + "learning_rate": 4.905597255877834e-05, + "loss": 2.7869, + "num_input_tokens_seen": 1441792000, + "step": 2750 + }, + { + "epoch": 0.13360733749189202, + "grad_norm": 0.2490234375, + "learning_rate": 4.905050792692596e-05, + "loss": 2.8159, + "num_input_tokens_seen": 1444413440, + "step": 2755 + }, + { + "epoch": 0.13384981904813864, + "grad_norm": 0.255859375, + "learning_rate": 4.9045027830446534e-05, + "loss": 2.7949, + "num_input_tokens_seen": 1447034880, + "step": 2760 + }, + { + "epoch": 0.1340923006043853, + "grad_norm": 0.26171875, + "learning_rate": 4.903953227286378e-05, + "loss": 2.8046, + "num_input_tokens_seen": 1449656320, + "step": 2765 + }, + { + "epoch": 0.1343347821606319, + "grad_norm": 0.25390625, + "learning_rate": 4.903402125771139e-05, + "loss": 2.8053, + "num_input_tokens_seen": 1452277760, + "step": 2770 + }, + { + "epoch": 0.13457726371687853, + "grad_norm": 0.255859375, + "learning_rate": 4.9028494788532966e-05, + "loss": 2.8006, + "num_input_tokens_seen": 1454899200, + "step": 2775 + }, + { + "epoch": 0.13481974527312515, + "grad_norm": 0.26171875, + "learning_rate": 4.902295286888208e-05, + "loss": 2.7883, + "num_input_tokens_seen": 1457520640, + "step": 2780 + }, + { + "epoch": 0.1350622268293718, + "grad_norm": 0.263671875, + "learning_rate": 4.9017395502322206e-05, + "loss": 2.7899, + "num_input_tokens_seen": 1460142080, + "step": 2785 + }, + { + "epoch": 0.13530470838561842, + "grad_norm": 0.2578125, + "learning_rate": 4.9011822692426765e-05, + "loss": 2.8109, + "num_input_tokens_seen": 1462763520, + "step": 2790 + }, + { + "epoch": 0.13554718994186504, + "grad_norm": 0.255859375, + "learning_rate": 4.900623444277913e-05, + "loss": 2.7942, + "num_input_tokens_seen": 1465384960, + "step": 2795 + }, + { + "epoch": 0.13578967149811166, + "grad_norm": 0.267578125, + "learning_rate": 4.900063075697256e-05, + "loss": 2.8041, + "num_input_tokens_seen": 1468006400, + "step": 2800 + }, + { + "epoch": 0.1360321530543583, + "grad_norm": 0.263671875, + "learning_rate": 4.899501163861026e-05, + "loss": 2.801, + "num_input_tokens_seen": 1470627840, + "step": 2805 + }, + { + "epoch": 0.13627463461060493, + "grad_norm": 0.255859375, + "learning_rate": 4.898937709130537e-05, + "loss": 2.7967, + "num_input_tokens_seen": 1473249280, + "step": 2810 + }, + { + "epoch": 0.13651711616685155, + "grad_norm": 0.25, + "learning_rate": 4.8983727118680934e-05, + "loss": 2.7884, + "num_input_tokens_seen": 1475870720, + "step": 2815 + }, + { + "epoch": 0.13675959772309818, + "grad_norm": 0.25, + "learning_rate": 4.897806172436991e-05, + "loss": 2.7912, + "num_input_tokens_seen": 1478492160, + "step": 2820 + }, + { + "epoch": 0.13700207927934482, + "grad_norm": 0.26171875, + "learning_rate": 4.89723809120152e-05, + "loss": 2.7917, + "num_input_tokens_seen": 1481113600, + "step": 2825 + }, + { + "epoch": 0.13724456083559144, + "grad_norm": 0.255859375, + "learning_rate": 4.8966684685269586e-05, + "loss": 2.7988, + "num_input_tokens_seen": 1483735040, + "step": 2830 + }, + { + "epoch": 0.13748704239183807, + "grad_norm": 0.271484375, + "learning_rate": 4.8960973047795786e-05, + "loss": 2.794, + "num_input_tokens_seen": 1486356480, + "step": 2835 + }, + { + "epoch": 0.1377295239480847, + "grad_norm": 0.26171875, + "learning_rate": 4.895524600326642e-05, + "loss": 2.7972, + "num_input_tokens_seen": 1488977920, + "step": 2840 + }, + { + "epoch": 0.13797200550433134, + "grad_norm": 0.255859375, + "learning_rate": 4.894950355536401e-05, + "loss": 2.7936, + "num_input_tokens_seen": 1491599360, + "step": 2845 + }, + { + "epoch": 0.13821448706057796, + "grad_norm": 0.25, + "learning_rate": 4.894374570778099e-05, + "loss": 2.7919, + "num_input_tokens_seen": 1494220800, + "step": 2850 + }, + { + "epoch": 0.13845696861682458, + "grad_norm": 0.255859375, + "learning_rate": 4.893797246421968e-05, + "loss": 2.7955, + "num_input_tokens_seen": 1496842240, + "step": 2855 + }, + { + "epoch": 0.1386994501730712, + "grad_norm": 0.251953125, + "learning_rate": 4.893218382839232e-05, + "loss": 2.8069, + "num_input_tokens_seen": 1499463680, + "step": 2860 + }, + { + "epoch": 0.13894193172931785, + "grad_norm": 0.2451171875, + "learning_rate": 4.8926379804021037e-05, + "loss": 2.7934, + "num_input_tokens_seen": 1502085120, + "step": 2865 + }, + { + "epoch": 0.13918441328556447, + "grad_norm": 0.255859375, + "learning_rate": 4.892056039483787e-05, + "loss": 2.7899, + "num_input_tokens_seen": 1504706560, + "step": 2870 + }, + { + "epoch": 0.1394268948418111, + "grad_norm": 0.267578125, + "learning_rate": 4.891472560458471e-05, + "loss": 2.7911, + "num_input_tokens_seen": 1507328000, + "step": 2875 + }, + { + "epoch": 0.1396693763980577, + "grad_norm": 0.2470703125, + "learning_rate": 4.890887543701338e-05, + "loss": 2.7826, + "num_input_tokens_seen": 1509949440, + "step": 2880 + }, + { + "epoch": 0.13991185795430436, + "grad_norm": 0.255859375, + "learning_rate": 4.890300989588557e-05, + "loss": 2.7796, + "num_input_tokens_seen": 1512570880, + "step": 2885 + }, + { + "epoch": 0.14015433951055098, + "grad_norm": 0.25, + "learning_rate": 4.889712898497286e-05, + "loss": 2.7972, + "num_input_tokens_seen": 1515192320, + "step": 2890 + }, + { + "epoch": 0.1403968210667976, + "grad_norm": 0.25, + "learning_rate": 4.889123270805671e-05, + "loss": 2.7947, + "num_input_tokens_seen": 1517813760, + "step": 2895 + }, + { + "epoch": 0.14063930262304422, + "grad_norm": 0.267578125, + "learning_rate": 4.888532106892847e-05, + "loss": 2.8126, + "num_input_tokens_seen": 1520435200, + "step": 2900 + }, + { + "epoch": 0.14088178417929087, + "grad_norm": 0.2578125, + "learning_rate": 4.8879394071389355e-05, + "loss": 2.8139, + "num_input_tokens_seen": 1523056640, + "step": 2905 + }, + { + "epoch": 0.1411242657355375, + "grad_norm": 0.26171875, + "learning_rate": 4.887345171925046e-05, + "loss": 2.8028, + "num_input_tokens_seen": 1525678080, + "step": 2910 + }, + { + "epoch": 0.1413667472917841, + "grad_norm": 0.2578125, + "learning_rate": 4.886749401633276e-05, + "loss": 2.7921, + "num_input_tokens_seen": 1528299520, + "step": 2915 + }, + { + "epoch": 0.14160922884803076, + "grad_norm": 0.259765625, + "learning_rate": 4.88615209664671e-05, + "loss": 2.7959, + "num_input_tokens_seen": 1530920960, + "step": 2920 + }, + { + "epoch": 0.14185171040427738, + "grad_norm": 0.26953125, + "learning_rate": 4.8855532573494175e-05, + "loss": 2.7985, + "num_input_tokens_seen": 1533542400, + "step": 2925 + }, + { + "epoch": 0.142094191960524, + "grad_norm": 0.2578125, + "learning_rate": 4.8849528841264555e-05, + "loss": 2.7886, + "num_input_tokens_seen": 1536163840, + "step": 2930 + }, + { + "epoch": 0.14233667351677062, + "grad_norm": 0.259765625, + "learning_rate": 4.884350977363871e-05, + "loss": 2.8094, + "num_input_tokens_seen": 1538785280, + "step": 2935 + }, + { + "epoch": 0.14257915507301727, + "grad_norm": 0.259765625, + "learning_rate": 4.88374753744869e-05, + "loss": 2.7781, + "num_input_tokens_seen": 1541406720, + "step": 2940 + }, + { + "epoch": 0.1428216366292639, + "grad_norm": 0.2470703125, + "learning_rate": 4.88314256476893e-05, + "loss": 2.7846, + "num_input_tokens_seen": 1544028160, + "step": 2945 + }, + { + "epoch": 0.1430641181855105, + "grad_norm": 0.251953125, + "learning_rate": 4.882536059713592e-05, + "loss": 2.7976, + "num_input_tokens_seen": 1546649600, + "step": 2950 + }, + { + "epoch": 0.14330659974175713, + "grad_norm": 0.248046875, + "learning_rate": 4.8819280226726624e-05, + "loss": 2.7954, + "num_input_tokens_seen": 1549271040, + "step": 2955 + }, + { + "epoch": 0.14354908129800378, + "grad_norm": 0.25, + "learning_rate": 4.8813184540371125e-05, + "loss": 2.7906, + "num_input_tokens_seen": 1551892480, + "step": 2960 + }, + { + "epoch": 0.1437915628542504, + "grad_norm": 0.25390625, + "learning_rate": 4.8807073541989e-05, + "loss": 2.7971, + "num_input_tokens_seen": 1554513920, + "step": 2965 + }, + { + "epoch": 0.14403404441049703, + "grad_norm": 0.26953125, + "learning_rate": 4.880094723550965e-05, + "loss": 2.8003, + "num_input_tokens_seen": 1557135360, + "step": 2970 + }, + { + "epoch": 0.14427652596674365, + "grad_norm": 0.28515625, + "learning_rate": 4.879480562487232e-05, + "loss": 2.7965, + "num_input_tokens_seen": 1559756800, + "step": 2975 + }, + { + "epoch": 0.1445190075229903, + "grad_norm": 0.26171875, + "learning_rate": 4.878864871402612e-05, + "loss": 2.8034, + "num_input_tokens_seen": 1562378240, + "step": 2980 + }, + { + "epoch": 0.14476148907923692, + "grad_norm": 0.25, + "learning_rate": 4.878247650692998e-05, + "loss": 2.7928, + "num_input_tokens_seen": 1564999680, + "step": 2985 + }, + { + "epoch": 0.14500397063548354, + "grad_norm": 0.255859375, + "learning_rate": 4.877628900755265e-05, + "loss": 2.7956, + "num_input_tokens_seen": 1567621120, + "step": 2990 + }, + { + "epoch": 0.14524645219173016, + "grad_norm": 0.2470703125, + "learning_rate": 4.8770086219872756e-05, + "loss": 2.7934, + "num_input_tokens_seen": 1570242560, + "step": 2995 + }, + { + "epoch": 0.1454889337479768, + "grad_norm": 0.2490234375, + "learning_rate": 4.876386814787871e-05, + "loss": 2.7886, + "num_input_tokens_seen": 1572864000, + "step": 3000 + }, + { + "epoch": 0.1454889337479768, + "eval_accuracy": 0.45276827878195736, + "eval_loss": 2.763141393661499, + "eval_runtime": 5.8531, + "eval_samples_per_second": 51.255, + "eval_steps_per_second": 6.492, + "num_input_tokens_seen": 1572864000, + "step": 3000 + }, + { + "epoch": 0.14573141530422343, + "grad_norm": 0.255859375, + "learning_rate": 4.875763479556879e-05, + "loss": 2.7761, + "num_input_tokens_seen": 1575485440, + "step": 3005 + }, + { + "epoch": 0.14597389686047005, + "grad_norm": 0.263671875, + "learning_rate": 4.8751386166951065e-05, + "loss": 2.7871, + "num_input_tokens_seen": 1578106880, + "step": 3010 + }, + { + "epoch": 0.14621637841671667, + "grad_norm": 0.251953125, + "learning_rate": 4.874512226604344e-05, + "loss": 2.7915, + "num_input_tokens_seen": 1580728320, + "step": 3015 + }, + { + "epoch": 0.14645885997296332, + "grad_norm": 0.259765625, + "learning_rate": 4.8738843096873646e-05, + "loss": 2.7887, + "num_input_tokens_seen": 1583349760, + "step": 3020 + }, + { + "epoch": 0.14670134152920994, + "grad_norm": 0.265625, + "learning_rate": 4.873254866347924e-05, + "loss": 2.7887, + "num_input_tokens_seen": 1585971200, + "step": 3025 + }, + { + "epoch": 0.14694382308545656, + "grad_norm": 0.25390625, + "learning_rate": 4.872623896990757e-05, + "loss": 2.8081, + "num_input_tokens_seen": 1588592640, + "step": 3030 + }, + { + "epoch": 0.14718630464170318, + "grad_norm": 0.2490234375, + "learning_rate": 4.871991402021581e-05, + "loss": 2.7981, + "num_input_tokens_seen": 1591214080, + "step": 3035 + }, + { + "epoch": 0.14742878619794983, + "grad_norm": 0.251953125, + "learning_rate": 4.871357381847094e-05, + "loss": 2.7952, + "num_input_tokens_seen": 1593835520, + "step": 3040 + }, + { + "epoch": 0.14767126775419645, + "grad_norm": 0.255859375, + "learning_rate": 4.870721836874976e-05, + "loss": 2.7912, + "num_input_tokens_seen": 1596456960, + "step": 3045 + }, + { + "epoch": 0.14791374931044307, + "grad_norm": 0.25, + "learning_rate": 4.870084767513885e-05, + "loss": 2.791, + "num_input_tokens_seen": 1599078400, + "step": 3050 + }, + { + "epoch": 0.1481562308666897, + "grad_norm": 0.2470703125, + "learning_rate": 4.869446174173462e-05, + "loss": 2.7779, + "num_input_tokens_seen": 1601699840, + "step": 3055 + }, + { + "epoch": 0.14839871242293634, + "grad_norm": 0.251953125, + "learning_rate": 4.8688060572643254e-05, + "loss": 2.7998, + "num_input_tokens_seen": 1604321280, + "step": 3060 + }, + { + "epoch": 0.14864119397918296, + "grad_norm": 0.2470703125, + "learning_rate": 4.868164417198074e-05, + "loss": 2.7951, + "num_input_tokens_seen": 1606942720, + "step": 3065 + }, + { + "epoch": 0.14888367553542958, + "grad_norm": 0.26171875, + "learning_rate": 4.867521254387289e-05, + "loss": 2.7777, + "num_input_tokens_seen": 1609564160, + "step": 3070 + }, + { + "epoch": 0.1491261570916762, + "grad_norm": 0.25, + "learning_rate": 4.866876569245524e-05, + "loss": 2.8008, + "num_input_tokens_seen": 1612185600, + "step": 3075 + }, + { + "epoch": 0.14936863864792285, + "grad_norm": 0.251953125, + "learning_rate": 4.86623036218732e-05, + "loss": 2.7931, + "num_input_tokens_seen": 1614807040, + "step": 3080 + }, + { + "epoch": 0.14961112020416947, + "grad_norm": 0.244140625, + "learning_rate": 4.8655826336281886e-05, + "loss": 2.7892, + "num_input_tokens_seen": 1617428480, + "step": 3085 + }, + { + "epoch": 0.1498536017604161, + "grad_norm": 0.2578125, + "learning_rate": 4.864933383984625e-05, + "loss": 2.8002, + "num_input_tokens_seen": 1620049920, + "step": 3090 + }, + { + "epoch": 0.15009608331666272, + "grad_norm": 0.25390625, + "learning_rate": 4.864282613674101e-05, + "loss": 2.7988, + "num_input_tokens_seen": 1622671360, + "step": 3095 + }, + { + "epoch": 0.15033856487290936, + "grad_norm": 0.2470703125, + "learning_rate": 4.863630323115065e-05, + "loss": 2.782, + "num_input_tokens_seen": 1625292800, + "step": 3100 + }, + { + "epoch": 0.15058104642915598, + "grad_norm": 0.255859375, + "learning_rate": 4.862976512726944e-05, + "loss": 2.7963, + "num_input_tokens_seen": 1627914240, + "step": 3105 + }, + { + "epoch": 0.1508235279854026, + "grad_norm": 0.271484375, + "learning_rate": 4.862321182930143e-05, + "loss": 2.7979, + "num_input_tokens_seen": 1630535680, + "step": 3110 + }, + { + "epoch": 0.15106600954164923, + "grad_norm": 0.25, + "learning_rate": 4.861664334146043e-05, + "loss": 2.8031, + "num_input_tokens_seen": 1633157120, + "step": 3115 + }, + { + "epoch": 0.15130849109789588, + "grad_norm": 0.259765625, + "learning_rate": 4.861005966797002e-05, + "loss": 2.7898, + "num_input_tokens_seen": 1635778560, + "step": 3120 + }, + { + "epoch": 0.1515509726541425, + "grad_norm": 0.25390625, + "learning_rate": 4.860346081306353e-05, + "loss": 2.786, + "num_input_tokens_seen": 1638400000, + "step": 3125 + }, + { + "epoch": 0.15179345421038912, + "grad_norm": 0.2470703125, + "learning_rate": 4.859684678098407e-05, + "loss": 2.7989, + "num_input_tokens_seen": 1641021440, + "step": 3130 + }, + { + "epoch": 0.15203593576663574, + "grad_norm": 0.259765625, + "learning_rate": 4.859021757598452e-05, + "loss": 2.8032, + "num_input_tokens_seen": 1643642880, + "step": 3135 + }, + { + "epoch": 0.1522784173228824, + "grad_norm": 0.25390625, + "learning_rate": 4.858357320232749e-05, + "loss": 2.7905, + "num_input_tokens_seen": 1646264320, + "step": 3140 + }, + { + "epoch": 0.152520898879129, + "grad_norm": 0.259765625, + "learning_rate": 4.8576913664285346e-05, + "loss": 2.788, + "num_input_tokens_seen": 1648885760, + "step": 3145 + }, + { + "epoch": 0.15276338043537563, + "grad_norm": 0.27734375, + "learning_rate": 4.8570238966140215e-05, + "loss": 2.8114, + "num_input_tokens_seen": 1651507200, + "step": 3150 + }, + { + "epoch": 0.15300586199162225, + "grad_norm": 0.26171875, + "learning_rate": 4.856354911218398e-05, + "loss": 2.8087, + "num_input_tokens_seen": 1654128640, + "step": 3155 + }, + { + "epoch": 0.1532483435478689, + "grad_norm": 0.267578125, + "learning_rate": 4.855684410671825e-05, + "loss": 2.7873, + "num_input_tokens_seen": 1656750080, + "step": 3160 + }, + { + "epoch": 0.15349082510411552, + "grad_norm": 0.24609375, + "learning_rate": 4.855012395405439e-05, + "loss": 2.7929, + "num_input_tokens_seen": 1659371520, + "step": 3165 + }, + { + "epoch": 0.15373330666036214, + "grad_norm": 0.26171875, + "learning_rate": 4.85433886585135e-05, + "loss": 2.8022, + "num_input_tokens_seen": 1661992960, + "step": 3170 + }, + { + "epoch": 0.15397578821660876, + "grad_norm": 0.2470703125, + "learning_rate": 4.853663822442641e-05, + "loss": 2.7892, + "num_input_tokens_seen": 1664614400, + "step": 3175 + }, + { + "epoch": 0.1542182697728554, + "grad_norm": 0.2490234375, + "learning_rate": 4.8529872656133704e-05, + "loss": 2.8025, + "num_input_tokens_seen": 1667235840, + "step": 3180 + }, + { + "epoch": 0.15446075132910203, + "grad_norm": 0.25390625, + "learning_rate": 4.852309195798567e-05, + "loss": 2.7959, + "num_input_tokens_seen": 1669857280, + "step": 3185 + }, + { + "epoch": 0.15470323288534865, + "grad_norm": 0.267578125, + "learning_rate": 4.851629613434236e-05, + "loss": 2.8077, + "num_input_tokens_seen": 1672478720, + "step": 3190 + }, + { + "epoch": 0.15494571444159527, + "grad_norm": 0.2734375, + "learning_rate": 4.85094851895735e-05, + "loss": 2.7869, + "num_input_tokens_seen": 1675100160, + "step": 3195 + }, + { + "epoch": 0.15518819599784192, + "grad_norm": 0.2490234375, + "learning_rate": 4.8502659128058595e-05, + "loss": 2.8001, + "num_input_tokens_seen": 1677721600, + "step": 3200 + }, + { + "epoch": 0.15543067755408854, + "grad_norm": 0.263671875, + "learning_rate": 4.849581795418684e-05, + "loss": 2.7906, + "num_input_tokens_seen": 1680343040, + "step": 3205 + }, + { + "epoch": 0.15567315911033516, + "grad_norm": 0.265625, + "learning_rate": 4.8488961672357145e-05, + "loss": 2.7865, + "num_input_tokens_seen": 1682964480, + "step": 3210 + }, + { + "epoch": 0.1559156406665818, + "grad_norm": 0.2578125, + "learning_rate": 4.848209028697816e-05, + "loss": 2.8061, + "num_input_tokens_seen": 1685585920, + "step": 3215 + }, + { + "epoch": 0.15615812222282843, + "grad_norm": 0.2470703125, + "learning_rate": 4.847520380246821e-05, + "loss": 2.7891, + "num_input_tokens_seen": 1688207360, + "step": 3220 + }, + { + "epoch": 0.15640060377907505, + "grad_norm": 0.248046875, + "learning_rate": 4.846830222325536e-05, + "loss": 2.8028, + "num_input_tokens_seen": 1690828800, + "step": 3225 + }, + { + "epoch": 0.15664308533532167, + "grad_norm": 0.259765625, + "learning_rate": 4.846138555377735e-05, + "loss": 2.7781, + "num_input_tokens_seen": 1693450240, + "step": 3230 + }, + { + "epoch": 0.15688556689156832, + "grad_norm": 0.2451171875, + "learning_rate": 4.845445379848166e-05, + "loss": 2.8013, + "num_input_tokens_seen": 1696071680, + "step": 3235 + }, + { + "epoch": 0.15712804844781494, + "grad_norm": 0.255859375, + "learning_rate": 4.844750696182545e-05, + "loss": 2.8074, + "num_input_tokens_seen": 1698693120, + "step": 3240 + }, + { + "epoch": 0.15737053000406157, + "grad_norm": 0.255859375, + "learning_rate": 4.8440545048275573e-05, + "loss": 2.8084, + "num_input_tokens_seen": 1701314560, + "step": 3245 + }, + { + "epoch": 0.15761301156030819, + "grad_norm": 0.2470703125, + "learning_rate": 4.84335680623086e-05, + "loss": 2.7899, + "num_input_tokens_seen": 1703936000, + "step": 3250 + }, + { + "epoch": 0.15785549311655483, + "grad_norm": 0.255859375, + "learning_rate": 4.842657600841075e-05, + "loss": 2.798, + "num_input_tokens_seen": 1706557440, + "step": 3255 + }, + { + "epoch": 0.15809797467280146, + "grad_norm": 0.251953125, + "learning_rate": 4.841956889107797e-05, + "loss": 2.7949, + "num_input_tokens_seen": 1709178880, + "step": 3260 + }, + { + "epoch": 0.15834045622904808, + "grad_norm": 0.2490234375, + "learning_rate": 4.8412546714815885e-05, + "loss": 2.7997, + "num_input_tokens_seen": 1711800320, + "step": 3265 + }, + { + "epoch": 0.1585829377852947, + "grad_norm": 0.2578125, + "learning_rate": 4.840550948413979e-05, + "loss": 2.7884, + "num_input_tokens_seen": 1714421760, + "step": 3270 + }, + { + "epoch": 0.15882541934154135, + "grad_norm": 0.259765625, + "learning_rate": 4.839845720357467e-05, + "loss": 2.8002, + "num_input_tokens_seen": 1717043200, + "step": 3275 + }, + { + "epoch": 0.15906790089778797, + "grad_norm": 0.2470703125, + "learning_rate": 4.839138987765519e-05, + "loss": 2.8018, + "num_input_tokens_seen": 1719664640, + "step": 3280 + }, + { + "epoch": 0.1593103824540346, + "grad_norm": 0.26171875, + "learning_rate": 4.838430751092569e-05, + "loss": 2.7909, + "num_input_tokens_seen": 1722286080, + "step": 3285 + }, + { + "epoch": 0.1595528640102812, + "grad_norm": 0.2490234375, + "learning_rate": 4.837721010794016e-05, + "loss": 2.7843, + "num_input_tokens_seen": 1724907520, + "step": 3290 + }, + { + "epoch": 0.15979534556652786, + "grad_norm": 0.265625, + "learning_rate": 4.837009767326228e-05, + "loss": 2.8041, + "num_input_tokens_seen": 1727528960, + "step": 3295 + }, + { + "epoch": 0.16003782712277448, + "grad_norm": 0.251953125, + "learning_rate": 4.83629702114654e-05, + "loss": 2.7907, + "num_input_tokens_seen": 1730150400, + "step": 3300 + }, + { + "epoch": 0.16003782712277448, + "eval_accuracy": 0.45322423058133854, + "eval_loss": 2.760647773742676, + "eval_runtime": 5.7789, + "eval_samples_per_second": 51.913, + "eval_steps_per_second": 6.576, + "num_input_tokens_seen": 1730150400, + "step": 3300 + }, + { + "epoch": 0.1602803086790211, + "grad_norm": 0.2578125, + "learning_rate": 4.8355827727132516e-05, + "loss": 2.7834, + "num_input_tokens_seen": 1732771840, + "step": 3305 + }, + { + "epoch": 0.16052279023526772, + "grad_norm": 0.25390625, + "learning_rate": 4.8348670224856285e-05, + "loss": 2.7893, + "num_input_tokens_seen": 1735393280, + "step": 3310 + }, + { + "epoch": 0.16076527179151437, + "grad_norm": 0.259765625, + "learning_rate": 4.834149770923903e-05, + "loss": 2.7888, + "num_input_tokens_seen": 1738014720, + "step": 3315 + }, + { + "epoch": 0.161007753347761, + "grad_norm": 0.248046875, + "learning_rate": 4.833431018489273e-05, + "loss": 2.7999, + "num_input_tokens_seen": 1740636160, + "step": 3320 + }, + { + "epoch": 0.1612502349040076, + "grad_norm": 0.251953125, + "learning_rate": 4.8327107656439e-05, + "loss": 2.7849, + "num_input_tokens_seen": 1743257600, + "step": 3325 + }, + { + "epoch": 0.16149271646025423, + "grad_norm": 0.244140625, + "learning_rate": 4.831989012850912e-05, + "loss": 2.7892, + "num_input_tokens_seen": 1745879040, + "step": 3330 + }, + { + "epoch": 0.16173519801650088, + "grad_norm": 0.26171875, + "learning_rate": 4.831265760574398e-05, + "loss": 2.7839, + "num_input_tokens_seen": 1748500480, + "step": 3335 + }, + { + "epoch": 0.1619776795727475, + "grad_norm": 0.255859375, + "learning_rate": 4.830541009279417e-05, + "loss": 2.8026, + "num_input_tokens_seen": 1751121920, + "step": 3340 + }, + { + "epoch": 0.16222016112899412, + "grad_norm": 0.259765625, + "learning_rate": 4.829814759431987e-05, + "loss": 2.7879, + "num_input_tokens_seen": 1753743360, + "step": 3345 + }, + { + "epoch": 0.16246264268524074, + "grad_norm": 0.26171875, + "learning_rate": 4.829087011499091e-05, + "loss": 2.7794, + "num_input_tokens_seen": 1756364800, + "step": 3350 + }, + { + "epoch": 0.1627051242414874, + "grad_norm": 0.25, + "learning_rate": 4.8283577659486764e-05, + "loss": 2.7939, + "num_input_tokens_seen": 1758986240, + "step": 3355 + }, + { + "epoch": 0.162947605797734, + "grad_norm": 0.255859375, + "learning_rate": 4.827627023249652e-05, + "loss": 2.7916, + "num_input_tokens_seen": 1761607680, + "step": 3360 + }, + { + "epoch": 0.16319008735398063, + "grad_norm": 0.251953125, + "learning_rate": 4.8268947838718895e-05, + "loss": 2.7954, + "num_input_tokens_seen": 1764229120, + "step": 3365 + }, + { + "epoch": 0.16343256891022725, + "grad_norm": 0.2734375, + "learning_rate": 4.826161048286224e-05, + "loss": 2.7845, + "num_input_tokens_seen": 1766850560, + "step": 3370 + }, + { + "epoch": 0.1636750504664739, + "grad_norm": 0.251953125, + "learning_rate": 4.8254258169644526e-05, + "loss": 2.8021, + "num_input_tokens_seen": 1769472000, + "step": 3375 + }, + { + "epoch": 0.16391753202272052, + "grad_norm": 0.267578125, + "learning_rate": 4.824689090379333e-05, + "loss": 2.7891, + "num_input_tokens_seen": 1772093440, + "step": 3380 + }, + { + "epoch": 0.16416001357896715, + "grad_norm": 0.255859375, + "learning_rate": 4.8239508690045846e-05, + "loss": 2.8082, + "num_input_tokens_seen": 1774714880, + "step": 3385 + }, + { + "epoch": 0.16440249513521377, + "grad_norm": 0.2578125, + "learning_rate": 4.8232111533148895e-05, + "loss": 2.7915, + "num_input_tokens_seen": 1777336320, + "step": 3390 + }, + { + "epoch": 0.16464497669146042, + "grad_norm": 0.255859375, + "learning_rate": 4.822469943785888e-05, + "loss": 2.804, + "num_input_tokens_seen": 1779957760, + "step": 3395 + }, + { + "epoch": 0.16488745824770704, + "grad_norm": 0.25, + "learning_rate": 4.8217272408941835e-05, + "loss": 2.7979, + "num_input_tokens_seen": 1782579200, + "step": 3400 + }, + { + "epoch": 0.16512993980395366, + "grad_norm": 0.25390625, + "learning_rate": 4.820983045117339e-05, + "loss": 2.7809, + "num_input_tokens_seen": 1785200640, + "step": 3405 + }, + { + "epoch": 0.16537242136020028, + "grad_norm": 0.2578125, + "learning_rate": 4.820237356933876e-05, + "loss": 2.7977, + "num_input_tokens_seen": 1787822080, + "step": 3410 + }, + { + "epoch": 0.16561490291644693, + "grad_norm": 0.26171875, + "learning_rate": 4.819490176823277e-05, + "loss": 2.7775, + "num_input_tokens_seen": 1790443520, + "step": 3415 + }, + { + "epoch": 0.16585738447269355, + "grad_norm": 0.259765625, + "learning_rate": 4.8187415052659835e-05, + "loss": 2.7965, + "num_input_tokens_seen": 1793064960, + "step": 3420 + }, + { + "epoch": 0.16609986602894017, + "grad_norm": 0.259765625, + "learning_rate": 4.8179913427433965e-05, + "loss": 2.7936, + "num_input_tokens_seen": 1795686400, + "step": 3425 + }, + { + "epoch": 0.1663423475851868, + "grad_norm": 0.26171875, + "learning_rate": 4.8172396897378745e-05, + "loss": 2.789, + "num_input_tokens_seen": 1798307840, + "step": 3430 + }, + { + "epoch": 0.16658482914143344, + "grad_norm": 0.25, + "learning_rate": 4.816486546732736e-05, + "loss": 2.7786, + "num_input_tokens_seen": 1800929280, + "step": 3435 + }, + { + "epoch": 0.16682731069768006, + "grad_norm": 0.251953125, + "learning_rate": 4.815731914212256e-05, + "loss": 2.8067, + "num_input_tokens_seen": 1803550720, + "step": 3440 + }, + { + "epoch": 0.16706979225392668, + "grad_norm": 0.25390625, + "learning_rate": 4.8149757926616666e-05, + "loss": 2.7945, + "num_input_tokens_seen": 1806172160, + "step": 3445 + }, + { + "epoch": 0.1673122738101733, + "grad_norm": 0.2421875, + "learning_rate": 4.814218182567162e-05, + "loss": 2.8073, + "num_input_tokens_seen": 1808793600, + "step": 3450 + }, + { + "epoch": 0.16755475536641995, + "grad_norm": 0.25, + "learning_rate": 4.813459084415887e-05, + "loss": 2.7857, + "num_input_tokens_seen": 1811415040, + "step": 3455 + }, + { + "epoch": 0.16779723692266657, + "grad_norm": 0.25390625, + "learning_rate": 4.812698498695948e-05, + "loss": 2.79, + "num_input_tokens_seen": 1814036480, + "step": 3460 + }, + { + "epoch": 0.1680397184789132, + "grad_norm": 0.25, + "learning_rate": 4.811936425896406e-05, + "loss": 2.8142, + "num_input_tokens_seen": 1816657920, + "step": 3465 + }, + { + "epoch": 0.1682822000351598, + "grad_norm": 0.25390625, + "learning_rate": 4.811172866507279e-05, + "loss": 2.7948, + "num_input_tokens_seen": 1819279360, + "step": 3470 + }, + { + "epoch": 0.16852468159140646, + "grad_norm": 0.2578125, + "learning_rate": 4.8104078210195406e-05, + "loss": 2.8081, + "num_input_tokens_seen": 1821900800, + "step": 3475 + }, + { + "epoch": 0.16876716314765308, + "grad_norm": 0.26953125, + "learning_rate": 4.809641289925119e-05, + "loss": 2.8015, + "num_input_tokens_seen": 1824522240, + "step": 3480 + }, + { + "epoch": 0.1690096447038997, + "grad_norm": 0.263671875, + "learning_rate": 4.8088732737168986e-05, + "loss": 2.783, + "num_input_tokens_seen": 1827143680, + "step": 3485 + }, + { + "epoch": 0.16925212626014632, + "grad_norm": 0.265625, + "learning_rate": 4.808103772888719e-05, + "loss": 2.7795, + "num_input_tokens_seen": 1829765120, + "step": 3490 + }, + { + "epoch": 0.16949460781639297, + "grad_norm": 0.25390625, + "learning_rate": 4.807332787935374e-05, + "loss": 2.7744, + "num_input_tokens_seen": 1832386560, + "step": 3495 + }, + { + "epoch": 0.1697370893726396, + "grad_norm": 0.267578125, + "learning_rate": 4.8065603193526114e-05, + "loss": 2.7926, + "num_input_tokens_seen": 1835008000, + "step": 3500 + }, + { + "epoch": 0.16997957092888621, + "grad_norm": 0.26171875, + "learning_rate": 4.805786367637134e-05, + "loss": 2.7882, + "num_input_tokens_seen": 1837629440, + "step": 3505 + }, + { + "epoch": 0.17022205248513286, + "grad_norm": 0.25390625, + "learning_rate": 4.805010933286598e-05, + "loss": 2.795, + "num_input_tokens_seen": 1840250880, + "step": 3510 + }, + { + "epoch": 0.17046453404137948, + "grad_norm": 0.2490234375, + "learning_rate": 4.8042340167996105e-05, + "loss": 2.7866, + "num_input_tokens_seen": 1842872320, + "step": 3515 + }, + { + "epoch": 0.1707070155976261, + "grad_norm": 0.251953125, + "learning_rate": 4.803455618675736e-05, + "loss": 2.7983, + "num_input_tokens_seen": 1845493760, + "step": 3520 + }, + { + "epoch": 0.17094949715387273, + "grad_norm": 0.24609375, + "learning_rate": 4.802675739415488e-05, + "loss": 2.7836, + "num_input_tokens_seen": 1848115200, + "step": 3525 + }, + { + "epoch": 0.17119197871011937, + "grad_norm": 0.24609375, + "learning_rate": 4.801894379520333e-05, + "loss": 2.7891, + "num_input_tokens_seen": 1850736640, + "step": 3530 + }, + { + "epoch": 0.171434460266366, + "grad_norm": 0.2490234375, + "learning_rate": 4.801111539492692e-05, + "loss": 2.7963, + "num_input_tokens_seen": 1853358080, + "step": 3535 + }, + { + "epoch": 0.17167694182261262, + "grad_norm": 0.259765625, + "learning_rate": 4.800327219835936e-05, + "loss": 2.7872, + "num_input_tokens_seen": 1855979520, + "step": 3540 + }, + { + "epoch": 0.17191942337885924, + "grad_norm": 0.275390625, + "learning_rate": 4.7995414210543866e-05, + "loss": 2.7856, + "num_input_tokens_seen": 1858600960, + "step": 3545 + }, + { + "epoch": 0.17216190493510589, + "grad_norm": 0.251953125, + "learning_rate": 4.798754143653317e-05, + "loss": 2.7839, + "num_input_tokens_seen": 1861222400, + "step": 3550 + }, + { + "epoch": 0.1724043864913525, + "grad_norm": 0.2490234375, + "learning_rate": 4.797965388138953e-05, + "loss": 2.7956, + "num_input_tokens_seen": 1863843840, + "step": 3555 + }, + { + "epoch": 0.17264686804759913, + "grad_norm": 0.25, + "learning_rate": 4.7971751550184674e-05, + "loss": 2.786, + "num_input_tokens_seen": 1866465280, + "step": 3560 + }, + { + "epoch": 0.17288934960384575, + "grad_norm": 0.251953125, + "learning_rate": 4.796383444799987e-05, + "loss": 2.7978, + "num_input_tokens_seen": 1869086720, + "step": 3565 + }, + { + "epoch": 0.1731318311600924, + "grad_norm": 0.259765625, + "learning_rate": 4.795590257992584e-05, + "loss": 2.8049, + "num_input_tokens_seen": 1871708160, + "step": 3570 + }, + { + "epoch": 0.17337431271633902, + "grad_norm": 0.2490234375, + "learning_rate": 4.794795595106285e-05, + "loss": 2.7942, + "num_input_tokens_seen": 1874329600, + "step": 3575 + }, + { + "epoch": 0.17361679427258564, + "grad_norm": 0.251953125, + "learning_rate": 4.793999456652062e-05, + "loss": 2.7885, + "num_input_tokens_seen": 1876951040, + "step": 3580 + }, + { + "epoch": 0.17385927582883226, + "grad_norm": 0.255859375, + "learning_rate": 4.7932018431418366e-05, + "loss": 2.8022, + "num_input_tokens_seen": 1879572480, + "step": 3585 + }, + { + "epoch": 0.1741017573850789, + "grad_norm": 0.26171875, + "learning_rate": 4.792402755088481e-05, + "loss": 2.7914, + "num_input_tokens_seen": 1882193920, + "step": 3590 + }, + { + "epoch": 0.17434423894132553, + "grad_norm": 0.275390625, + "learning_rate": 4.791602193005812e-05, + "loss": 2.7815, + "num_input_tokens_seen": 1884815360, + "step": 3595 + }, + { + "epoch": 0.17458672049757215, + "grad_norm": 0.26171875, + "learning_rate": 4.7908001574085964e-05, + "loss": 2.7907, + "num_input_tokens_seen": 1887436800, + "step": 3600 + }, + { + "epoch": 0.17458672049757215, + "eval_accuracy": 0.4535759648265755, + "eval_loss": 2.7588062286376953, + "eval_runtime": 5.9537, + "eval_samples_per_second": 50.389, + "eval_steps_per_second": 6.383, + "num_input_tokens_seen": 1887436800, + "step": 3600 + }, + { + "epoch": 0.17482920205381877, + "grad_norm": 0.255859375, + "learning_rate": 4.789996648812548e-05, + "loss": 2.792, + "num_input_tokens_seen": 1890058240, + "step": 3605 + }, + { + "epoch": 0.17507168361006542, + "grad_norm": 0.251953125, + "learning_rate": 4.78919166773433e-05, + "loss": 2.775, + "num_input_tokens_seen": 1892679680, + "step": 3610 + }, + { + "epoch": 0.17531416516631204, + "grad_norm": 0.25390625, + "learning_rate": 4.7883852146915465e-05, + "loss": 2.8045, + "num_input_tokens_seen": 1895301120, + "step": 3615 + }, + { + "epoch": 0.17555664672255866, + "grad_norm": 0.2578125, + "learning_rate": 4.787577290202755e-05, + "loss": 2.7993, + "num_input_tokens_seen": 1897922560, + "step": 3620 + }, + { + "epoch": 0.17579912827880528, + "grad_norm": 0.25390625, + "learning_rate": 4.7867678947874546e-05, + "loss": 2.7879, + "num_input_tokens_seen": 1900544000, + "step": 3625 + }, + { + "epoch": 0.17604160983505193, + "grad_norm": 0.251953125, + "learning_rate": 4.785957028966092e-05, + "loss": 2.7804, + "num_input_tokens_seen": 1903165440, + "step": 3630 + }, + { + "epoch": 0.17628409139129855, + "grad_norm": 0.263671875, + "learning_rate": 4.785144693260059e-05, + "loss": 2.8043, + "num_input_tokens_seen": 1905786880, + "step": 3635 + }, + { + "epoch": 0.17652657294754517, + "grad_norm": 0.2490234375, + "learning_rate": 4.784330888191691e-05, + "loss": 2.791, + "num_input_tokens_seen": 1908408320, + "step": 3640 + }, + { + "epoch": 0.1767690545037918, + "grad_norm": 0.25390625, + "learning_rate": 4.783515614284273e-05, + "loss": 2.7967, + "num_input_tokens_seen": 1911029760, + "step": 3645 + }, + { + "epoch": 0.17701153606003844, + "grad_norm": 0.25390625, + "learning_rate": 4.782698872062028e-05, + "loss": 2.7859, + "num_input_tokens_seen": 1913651200, + "step": 3650 + }, + { + "epoch": 0.17725401761628506, + "grad_norm": 0.259765625, + "learning_rate": 4.7818806620501284e-05, + "loss": 2.7977, + "num_input_tokens_seen": 1916272640, + "step": 3655 + }, + { + "epoch": 0.17749649917253169, + "grad_norm": 0.27734375, + "learning_rate": 4.781060984774687e-05, + "loss": 2.802, + "num_input_tokens_seen": 1918894080, + "step": 3660 + }, + { + "epoch": 0.1777389807287783, + "grad_norm": 0.259765625, + "learning_rate": 4.780239840762763e-05, + "loss": 2.7888, + "num_input_tokens_seen": 1921515520, + "step": 3665 + }, + { + "epoch": 0.17798146228502496, + "grad_norm": 0.2490234375, + "learning_rate": 4.7794172305423554e-05, + "loss": 2.7871, + "num_input_tokens_seen": 1924136960, + "step": 3670 + }, + { + "epoch": 0.17822394384127158, + "grad_norm": 0.2470703125, + "learning_rate": 4.77859315464241e-05, + "loss": 2.7989, + "num_input_tokens_seen": 1926758400, + "step": 3675 + }, + { + "epoch": 0.1784664253975182, + "grad_norm": 0.265625, + "learning_rate": 4.7777676135928096e-05, + "loss": 2.7838, + "num_input_tokens_seen": 1929379840, + "step": 3680 + }, + { + "epoch": 0.17870890695376482, + "grad_norm": 0.25390625, + "learning_rate": 4.776940607924385e-05, + "loss": 2.7912, + "num_input_tokens_seen": 1932001280, + "step": 3685 + }, + { + "epoch": 0.17895138851001147, + "grad_norm": 0.248046875, + "learning_rate": 4.776112138168904e-05, + "loss": 2.8105, + "num_input_tokens_seen": 1934622720, + "step": 3690 + }, + { + "epoch": 0.1791938700662581, + "grad_norm": 0.271484375, + "learning_rate": 4.7752822048590805e-05, + "loss": 2.7914, + "num_input_tokens_seen": 1937244160, + "step": 3695 + }, + { + "epoch": 0.1794363516225047, + "grad_norm": 0.255859375, + "learning_rate": 4.7744508085285645e-05, + "loss": 2.7828, + "num_input_tokens_seen": 1939865600, + "step": 3700 + }, + { + "epoch": 0.17967883317875133, + "grad_norm": 0.26171875, + "learning_rate": 4.773617949711949e-05, + "loss": 2.801, + "num_input_tokens_seen": 1942487040, + "step": 3705 + }, + { + "epoch": 0.17992131473499798, + "grad_norm": 0.25, + "learning_rate": 4.7727836289447685e-05, + "loss": 2.8001, + "num_input_tokens_seen": 1945108480, + "step": 3710 + }, + { + "epoch": 0.1801637962912446, + "grad_norm": 0.2490234375, + "learning_rate": 4.771947846763496e-05, + "loss": 2.7896, + "num_input_tokens_seen": 1947729920, + "step": 3715 + }, + { + "epoch": 0.18040627784749122, + "grad_norm": 0.255859375, + "learning_rate": 4.7711106037055456e-05, + "loss": 2.7967, + "num_input_tokens_seen": 1950351360, + "step": 3720 + }, + { + "epoch": 0.18064875940373784, + "grad_norm": 0.255859375, + "learning_rate": 4.770271900309268e-05, + "loss": 2.7843, + "num_input_tokens_seen": 1952972800, + "step": 3725 + }, + { + "epoch": 0.1808912409599845, + "grad_norm": 0.271484375, + "learning_rate": 4.7694317371139556e-05, + "loss": 2.7914, + "num_input_tokens_seen": 1955594240, + "step": 3730 + }, + { + "epoch": 0.1811337225162311, + "grad_norm": 0.25390625, + "learning_rate": 4.768590114659839e-05, + "loss": 2.7873, + "num_input_tokens_seen": 1958215680, + "step": 3735 + }, + { + "epoch": 0.18137620407247773, + "grad_norm": 0.251953125, + "learning_rate": 4.767747033488087e-05, + "loss": 2.8016, + "num_input_tokens_seen": 1960837120, + "step": 3740 + }, + { + "epoch": 0.18161868562872435, + "grad_norm": 0.255859375, + "learning_rate": 4.766902494140805e-05, + "loss": 2.7838, + "num_input_tokens_seen": 1963458560, + "step": 3745 + }, + { + "epoch": 0.181861167184971, + "grad_norm": 0.26171875, + "learning_rate": 4.766056497161037e-05, + "loss": 2.8001, + "num_input_tokens_seen": 1966080000, + "step": 3750 + }, + { + "epoch": 0.18210364874121762, + "grad_norm": 0.255859375, + "learning_rate": 4.7652090430927656e-05, + "loss": 2.7851, + "num_input_tokens_seen": 1968701440, + "step": 3755 + }, + { + "epoch": 0.18234613029746424, + "grad_norm": 0.259765625, + "learning_rate": 4.7643601324809077e-05, + "loss": 2.7882, + "num_input_tokens_seen": 1971322880, + "step": 3760 + }, + { + "epoch": 0.18258861185371086, + "grad_norm": 0.255859375, + "learning_rate": 4.7635097658713195e-05, + "loss": 2.8002, + "num_input_tokens_seen": 1973944320, + "step": 3765 + }, + { + "epoch": 0.1828310934099575, + "grad_norm": 0.248046875, + "learning_rate": 4.762657943810791e-05, + "loss": 2.7898, + "num_input_tokens_seen": 1976565760, + "step": 3770 + }, + { + "epoch": 0.18307357496620413, + "grad_norm": 0.2578125, + "learning_rate": 4.76180466684705e-05, + "loss": 2.8035, + "num_input_tokens_seen": 1979187200, + "step": 3775 + }, + { + "epoch": 0.18331605652245075, + "grad_norm": 0.25390625, + "learning_rate": 4.760949935528758e-05, + "loss": 2.785, + "num_input_tokens_seen": 1981808640, + "step": 3780 + }, + { + "epoch": 0.18355853807869738, + "grad_norm": 0.2470703125, + "learning_rate": 4.7600937504055126e-05, + "loss": 2.7879, + "num_input_tokens_seen": 1984430080, + "step": 3785 + }, + { + "epoch": 0.18380101963494402, + "grad_norm": 0.2451171875, + "learning_rate": 4.759236112027847e-05, + "loss": 2.7969, + "num_input_tokens_seen": 1987051520, + "step": 3790 + }, + { + "epoch": 0.18404350119119064, + "grad_norm": 0.2470703125, + "learning_rate": 4.758377020947228e-05, + "loss": 2.801, + "num_input_tokens_seen": 1989672960, + "step": 3795 + }, + { + "epoch": 0.18428598274743727, + "grad_norm": 0.265625, + "learning_rate": 4.7575164777160555e-05, + "loss": 2.7953, + "num_input_tokens_seen": 1992294400, + "step": 3800 + }, + { + "epoch": 0.18452846430368391, + "grad_norm": 0.251953125, + "learning_rate": 4.756654482887665e-05, + "loss": 2.7936, + "num_input_tokens_seen": 1994915840, + "step": 3805 + }, + { + "epoch": 0.18477094585993054, + "grad_norm": 0.267578125, + "learning_rate": 4.7557910370163245e-05, + "loss": 2.7776, + "num_input_tokens_seen": 1997537280, + "step": 3810 + }, + { + "epoch": 0.18501342741617716, + "grad_norm": 0.25390625, + "learning_rate": 4.754926140657235e-05, + "loss": 2.7977, + "num_input_tokens_seen": 2000158720, + "step": 3815 + }, + { + "epoch": 0.18525590897242378, + "grad_norm": 0.2890625, + "learning_rate": 4.75405979436653e-05, + "loss": 2.7957, + "num_input_tokens_seen": 2002780160, + "step": 3820 + }, + { + "epoch": 0.18549839052867043, + "grad_norm": 0.265625, + "learning_rate": 4.753191998701276e-05, + "loss": 2.7826, + "num_input_tokens_seen": 2005401600, + "step": 3825 + }, + { + "epoch": 0.18574087208491705, + "grad_norm": 0.265625, + "learning_rate": 4.7523227542194714e-05, + "loss": 2.8021, + "num_input_tokens_seen": 2008023040, + "step": 3830 + }, + { + "epoch": 0.18598335364116367, + "grad_norm": 0.259765625, + "learning_rate": 4.751452061480045e-05, + "loss": 2.7876, + "num_input_tokens_seen": 2010644480, + "step": 3835 + }, + { + "epoch": 0.1862258351974103, + "grad_norm": 0.251953125, + "learning_rate": 4.750579921042858e-05, + "loss": 2.7959, + "num_input_tokens_seen": 2013265920, + "step": 3840 + }, + { + "epoch": 0.18646831675365694, + "grad_norm": 0.263671875, + "learning_rate": 4.749706333468702e-05, + "loss": 2.7865, + "num_input_tokens_seen": 2015887360, + "step": 3845 + }, + { + "epoch": 0.18671079830990356, + "grad_norm": 0.255859375, + "learning_rate": 4.7488312993193e-05, + "loss": 2.7897, + "num_input_tokens_seen": 2018508800, + "step": 3850 + }, + { + "epoch": 0.18695327986615018, + "grad_norm": 0.259765625, + "learning_rate": 4.747954819157303e-05, + "loss": 2.7692, + "num_input_tokens_seen": 2021130240, + "step": 3855 + }, + { + "epoch": 0.1871957614223968, + "grad_norm": 0.267578125, + "learning_rate": 4.747076893546294e-05, + "loss": 2.7851, + "num_input_tokens_seen": 2023751680, + "step": 3860 + }, + { + "epoch": 0.18743824297864345, + "grad_norm": 0.251953125, + "learning_rate": 4.746197523050785e-05, + "loss": 2.7784, + "num_input_tokens_seen": 2026373120, + "step": 3865 + }, + { + "epoch": 0.18768072453489007, + "grad_norm": 0.2578125, + "learning_rate": 4.745316708236217e-05, + "loss": 2.78, + "num_input_tokens_seen": 2028994560, + "step": 3870 + }, + { + "epoch": 0.1879232060911367, + "grad_norm": 0.267578125, + "learning_rate": 4.744434449668959e-05, + "loss": 2.7942, + "num_input_tokens_seen": 2031616000, + "step": 3875 + }, + { + "epoch": 0.1881656876473833, + "grad_norm": 0.2490234375, + "learning_rate": 4.7435507479163085e-05, + "loss": 2.7872, + "num_input_tokens_seen": 2034237440, + "step": 3880 + }, + { + "epoch": 0.18840816920362996, + "grad_norm": 0.25, + "learning_rate": 4.7426656035464915e-05, + "loss": 2.7738, + "num_input_tokens_seen": 2036858880, + "step": 3885 + }, + { + "epoch": 0.18865065075987658, + "grad_norm": 0.24609375, + "learning_rate": 4.7417790171286614e-05, + "loss": 2.8005, + "num_input_tokens_seen": 2039480320, + "step": 3890 + }, + { + "epoch": 0.1888931323161232, + "grad_norm": 0.255859375, + "learning_rate": 4.740890989232899e-05, + "loss": 2.793, + "num_input_tokens_seen": 2042101760, + "step": 3895 + }, + { + "epoch": 0.18913561387236982, + "grad_norm": 0.2490234375, + "learning_rate": 4.7400015204302105e-05, + "loss": 2.7788, + "num_input_tokens_seen": 2044723200, + "step": 3900 + }, + { + "epoch": 0.18913561387236982, + "eval_accuracy": 0.4537322911577919, + "eval_loss": 2.7569406032562256, + "eval_runtime": 5.7814, + "eval_samples_per_second": 51.891, + "eval_steps_per_second": 6.573, + "num_input_tokens_seen": 2044723200, + "step": 3900 + }, + { + "epoch": 0.18937809542861647, + "grad_norm": 0.25390625, + "learning_rate": 4.739110611292532e-05, + "loss": 2.7841, + "num_input_tokens_seen": 2047344640, + "step": 3905 + }, + { + "epoch": 0.1896205769848631, + "grad_norm": 0.25390625, + "learning_rate": 4.738218262392722e-05, + "loss": 2.8006, + "num_input_tokens_seen": 2049966080, + "step": 3910 + }, + { + "epoch": 0.1898630585411097, + "grad_norm": 0.259765625, + "learning_rate": 4.7373244743045676e-05, + "loss": 2.7932, + "num_input_tokens_seen": 2052587520, + "step": 3915 + }, + { + "epoch": 0.19010554009735633, + "grad_norm": 0.26953125, + "learning_rate": 4.736429247602778e-05, + "loss": 2.7886, + "num_input_tokens_seen": 2055208960, + "step": 3920 + }, + { + "epoch": 0.19034802165360298, + "grad_norm": 0.25, + "learning_rate": 4.735532582862993e-05, + "loss": 2.7903, + "num_input_tokens_seen": 2057830400, + "step": 3925 + }, + { + "epoch": 0.1905905032098496, + "grad_norm": 0.2578125, + "learning_rate": 4.734634480661771e-05, + "loss": 2.7968, + "num_input_tokens_seen": 2060451840, + "step": 3930 + }, + { + "epoch": 0.19083298476609623, + "grad_norm": 0.25390625, + "learning_rate": 4.733734941576598e-05, + "loss": 2.7749, + "num_input_tokens_seen": 2063073280, + "step": 3935 + }, + { + "epoch": 0.19107546632234285, + "grad_norm": 0.251953125, + "learning_rate": 4.732833966185883e-05, + "loss": 2.7859, + "num_input_tokens_seen": 2065694720, + "step": 3940 + }, + { + "epoch": 0.1913179478785895, + "grad_norm": 0.251953125, + "learning_rate": 4.73193155506896e-05, + "loss": 2.7785, + "num_input_tokens_seen": 2068316160, + "step": 3945 + }, + { + "epoch": 0.19156042943483612, + "grad_norm": 0.265625, + "learning_rate": 4.731027708806084e-05, + "loss": 2.7971, + "num_input_tokens_seen": 2070937600, + "step": 3950 + }, + { + "epoch": 0.19180291099108274, + "grad_norm": 0.25, + "learning_rate": 4.730122427978434e-05, + "loss": 2.7928, + "num_input_tokens_seen": 2073559040, + "step": 3955 + }, + { + "epoch": 0.19204539254732936, + "grad_norm": 0.248046875, + "learning_rate": 4.72921571316811e-05, + "loss": 2.7878, + "num_input_tokens_seen": 2076180480, + "step": 3960 + }, + { + "epoch": 0.192287874103576, + "grad_norm": 0.25390625, + "learning_rate": 4.7283075649581374e-05, + "loss": 2.789, + "num_input_tokens_seen": 2078801920, + "step": 3965 + }, + { + "epoch": 0.19253035565982263, + "grad_norm": 0.25390625, + "learning_rate": 4.727397983932461e-05, + "loss": 2.7807, + "num_input_tokens_seen": 2081423360, + "step": 3970 + }, + { + "epoch": 0.19277283721606925, + "grad_norm": 0.25390625, + "learning_rate": 4.726486970675945e-05, + "loss": 2.7927, + "num_input_tokens_seen": 2084044800, + "step": 3975 + }, + { + "epoch": 0.19301531877231587, + "grad_norm": 0.25390625, + "learning_rate": 4.725574525774379e-05, + "loss": 2.7903, + "num_input_tokens_seen": 2086666240, + "step": 3980 + }, + { + "epoch": 0.19325780032856252, + "grad_norm": 0.2490234375, + "learning_rate": 4.72466064981447e-05, + "loss": 2.8101, + "num_input_tokens_seen": 2089287680, + "step": 3985 + }, + { + "epoch": 0.19350028188480914, + "grad_norm": 0.263671875, + "learning_rate": 4.7237453433838445e-05, + "loss": 2.7965, + "num_input_tokens_seen": 2091909120, + "step": 3990 + }, + { + "epoch": 0.19374276344105576, + "grad_norm": 0.26953125, + "learning_rate": 4.7228286070710525e-05, + "loss": 2.7921, + "num_input_tokens_seen": 2094530560, + "step": 3995 + }, + { + "epoch": 0.19398524499730238, + "grad_norm": 0.26171875, + "learning_rate": 4.7219104414655595e-05, + "loss": 2.7743, + "num_input_tokens_seen": 2097152000, + "step": 4000 + }, + { + "epoch": 0.19422772655354903, + "grad_norm": 0.2490234375, + "learning_rate": 4.720990847157752e-05, + "loss": 2.7748, + "num_input_tokens_seen": 2099773440, + "step": 4005 + }, + { + "epoch": 0.19447020810979565, + "grad_norm": 0.255859375, + "learning_rate": 4.720069824738936e-05, + "loss": 2.7936, + "num_input_tokens_seen": 2102394880, + "step": 4010 + }, + { + "epoch": 0.19471268966604227, + "grad_norm": 0.265625, + "learning_rate": 4.719147374801335e-05, + "loss": 2.7772, + "num_input_tokens_seen": 2105016320, + "step": 4015 + }, + { + "epoch": 0.1949551712222889, + "grad_norm": 0.255859375, + "learning_rate": 4.718223497938088e-05, + "loss": 2.7926, + "num_input_tokens_seen": 2107637760, + "step": 4020 + }, + { + "epoch": 0.19519765277853554, + "grad_norm": 0.267578125, + "learning_rate": 4.717298194743254e-05, + "loss": 2.7731, + "num_input_tokens_seen": 2110259200, + "step": 4025 + }, + { + "epoch": 0.19544013433478216, + "grad_norm": 0.255859375, + "learning_rate": 4.71637146581181e-05, + "loss": 2.7816, + "num_input_tokens_seen": 2112880640, + "step": 4030 + }, + { + "epoch": 0.19568261589102878, + "grad_norm": 0.2578125, + "learning_rate": 4.715443311739648e-05, + "loss": 2.7954, + "num_input_tokens_seen": 2115502080, + "step": 4035 + }, + { + "epoch": 0.1959250974472754, + "grad_norm": 0.255859375, + "learning_rate": 4.714513733123577e-05, + "loss": 2.7833, + "num_input_tokens_seen": 2118123520, + "step": 4040 + }, + { + "epoch": 0.19616757900352205, + "grad_norm": 0.25390625, + "learning_rate": 4.713582730561321e-05, + "loss": 2.7868, + "num_input_tokens_seen": 2120744960, + "step": 4045 + }, + { + "epoch": 0.19641006055976867, + "grad_norm": 0.263671875, + "learning_rate": 4.712650304651521e-05, + "loss": 2.7841, + "num_input_tokens_seen": 2123366400, + "step": 4050 + }, + { + "epoch": 0.1966525421160153, + "grad_norm": 0.275390625, + "learning_rate": 4.7117164559937335e-05, + "loss": 2.7979, + "num_input_tokens_seen": 2125987840, + "step": 4055 + }, + { + "epoch": 0.19689502367226192, + "grad_norm": 0.263671875, + "learning_rate": 4.7107811851884284e-05, + "loss": 2.777, + "num_input_tokens_seen": 2128609280, + "step": 4060 + }, + { + "epoch": 0.19713750522850856, + "grad_norm": 0.24609375, + "learning_rate": 4.70984449283699e-05, + "loss": 2.8029, + "num_input_tokens_seen": 2131230720, + "step": 4065 + }, + { + "epoch": 0.19737998678475518, + "grad_norm": 0.255859375, + "learning_rate": 4.708906379541719e-05, + "loss": 2.7869, + "num_input_tokens_seen": 2133852160, + "step": 4070 + }, + { + "epoch": 0.1976224683410018, + "grad_norm": 0.26171875, + "learning_rate": 4.7079668459058256e-05, + "loss": 2.7896, + "num_input_tokens_seen": 2136473600, + "step": 4075 + }, + { + "epoch": 0.19786494989724845, + "grad_norm": 0.259765625, + "learning_rate": 4.7070258925334374e-05, + "loss": 2.786, + "num_input_tokens_seen": 2139095040, + "step": 4080 + }, + { + "epoch": 0.19810743145349508, + "grad_norm": 0.267578125, + "learning_rate": 4.706083520029594e-05, + "loss": 2.7735, + "num_input_tokens_seen": 2141716480, + "step": 4085 + }, + { + "epoch": 0.1983499130097417, + "grad_norm": 0.263671875, + "learning_rate": 4.705139729000246e-05, + "loss": 2.7909, + "num_input_tokens_seen": 2144337920, + "step": 4090 + }, + { + "epoch": 0.19859239456598832, + "grad_norm": 0.2470703125, + "learning_rate": 4.7041945200522566e-05, + "loss": 2.7978, + "num_input_tokens_seen": 2146959360, + "step": 4095 + }, + { + "epoch": 0.19883487612223497, + "grad_norm": 0.251953125, + "learning_rate": 4.703247893793401e-05, + "loss": 2.7886, + "num_input_tokens_seen": 2149580800, + "step": 4100 + }, + { + "epoch": 0.1990773576784816, + "grad_norm": 0.255859375, + "learning_rate": 4.702299850832367e-05, + "loss": 2.7938, + "num_input_tokens_seen": 2152202240, + "step": 4105 + }, + { + "epoch": 0.1993198392347282, + "grad_norm": 0.25390625, + "learning_rate": 4.701350391778751e-05, + "loss": 2.799, + "num_input_tokens_seen": 2154823680, + "step": 4110 + }, + { + "epoch": 0.19956232079097483, + "grad_norm": 0.251953125, + "learning_rate": 4.700399517243062e-05, + "loss": 2.7861, + "num_input_tokens_seen": 2157445120, + "step": 4115 + }, + { + "epoch": 0.19980480234722148, + "grad_norm": 0.26953125, + "learning_rate": 4.699447227836716e-05, + "loss": 2.8033, + "num_input_tokens_seen": 2160066560, + "step": 4120 + }, + { + "epoch": 0.2000472839034681, + "grad_norm": 0.25390625, + "learning_rate": 4.698493524172045e-05, + "loss": 2.7837, + "num_input_tokens_seen": 2162688000, + "step": 4125 + }, + { + "epoch": 0.20028976545971472, + "grad_norm": 0.26171875, + "learning_rate": 4.697538406862283e-05, + "loss": 2.7977, + "num_input_tokens_seen": 2165309440, + "step": 4130 + }, + { + "epoch": 0.20053224701596134, + "grad_norm": 0.287109375, + "learning_rate": 4.696581876521578e-05, + "loss": 2.7901, + "num_input_tokens_seen": 2167930880, + "step": 4135 + }, + { + "epoch": 0.200774728572208, + "grad_norm": 0.271484375, + "learning_rate": 4.6956239337649846e-05, + "loss": 2.7915, + "num_input_tokens_seen": 2170552320, + "step": 4140 + }, + { + "epoch": 0.2010172101284546, + "grad_norm": 0.259765625, + "learning_rate": 4.694664579208465e-05, + "loss": 2.7782, + "num_input_tokens_seen": 2173173760, + "step": 4145 + }, + { + "epoch": 0.20125969168470123, + "grad_norm": 0.26171875, + "learning_rate": 4.6937038134688923e-05, + "loss": 2.7809, + "num_input_tokens_seen": 2175795200, + "step": 4150 + }, + { + "epoch": 0.20150217324094785, + "grad_norm": 0.251953125, + "learning_rate": 4.692741637164043e-05, + "loss": 2.8012, + "num_input_tokens_seen": 2178416640, + "step": 4155 + }, + { + "epoch": 0.2017446547971945, + "grad_norm": 0.251953125, + "learning_rate": 4.6917780509126045e-05, + "loss": 2.7837, + "num_input_tokens_seen": 2181038080, + "step": 4160 + }, + { + "epoch": 0.20198713635344112, + "grad_norm": 0.255859375, + "learning_rate": 4.690813055334167e-05, + "loss": 2.7862, + "num_input_tokens_seen": 2183659520, + "step": 4165 + }, + { + "epoch": 0.20222961790968774, + "grad_norm": 0.265625, + "learning_rate": 4.689846651049228e-05, + "loss": 2.7854, + "num_input_tokens_seen": 2186280960, + "step": 4170 + }, + { + "epoch": 0.20247209946593436, + "grad_norm": 0.2734375, + "learning_rate": 4.6888788386791935e-05, + "loss": 2.8001, + "num_input_tokens_seen": 2188902400, + "step": 4175 + }, + { + "epoch": 0.202714581022181, + "grad_norm": 0.255859375, + "learning_rate": 4.6879096188463725e-05, + "loss": 2.791, + "num_input_tokens_seen": 2191523840, + "step": 4180 + }, + { + "epoch": 0.20295706257842763, + "grad_norm": 0.265625, + "learning_rate": 4.6869389921739795e-05, + "loss": 2.7943, + "num_input_tokens_seen": 2194145280, + "step": 4185 + }, + { + "epoch": 0.20319954413467425, + "grad_norm": 0.2578125, + "learning_rate": 4.685966959286132e-05, + "loss": 2.7996, + "num_input_tokens_seen": 2196766720, + "step": 4190 + }, + { + "epoch": 0.20344202569092087, + "grad_norm": 0.255859375, + "learning_rate": 4.684993520807855e-05, + "loss": 2.7908, + "num_input_tokens_seen": 2199388160, + "step": 4195 + }, + { + "epoch": 0.20368450724716752, + "grad_norm": 0.259765625, + "learning_rate": 4.6840186773650743e-05, + "loss": 2.7942, + "num_input_tokens_seen": 2202009600, + "step": 4200 + }, + { + "epoch": 0.20368450724716752, + "eval_accuracy": 0.4539895782445856, + "eval_loss": 2.7551848888397217, + "eval_runtime": 6.2629, + "eval_samples_per_second": 47.901, + "eval_steps_per_second": 6.067, + "num_input_tokens_seen": 2202009600, + "step": 4200 + }, + { + "epoch": 0.20392698880341414, + "grad_norm": 0.25390625, + "learning_rate": 4.683042429584621e-05, + "loss": 2.8141, + "num_input_tokens_seen": 2204631040, + "step": 4205 + }, + { + "epoch": 0.20416947035966077, + "grad_norm": 0.2578125, + "learning_rate": 4.6820647780942286e-05, + "loss": 2.7836, + "num_input_tokens_seen": 2207252480, + "step": 4210 + }, + { + "epoch": 0.20441195191590739, + "grad_norm": 0.265625, + "learning_rate": 4.681085723522533e-05, + "loss": 2.813, + "num_input_tokens_seen": 2209873920, + "step": 4215 + }, + { + "epoch": 0.20465443347215403, + "grad_norm": 0.265625, + "learning_rate": 4.680105266499072e-05, + "loss": 2.7833, + "num_input_tokens_seen": 2212495360, + "step": 4220 + }, + { + "epoch": 0.20489691502840066, + "grad_norm": 0.265625, + "learning_rate": 4.6791234076542864e-05, + "loss": 2.7967, + "num_input_tokens_seen": 2215116800, + "step": 4225 + }, + { + "epoch": 0.20513939658464728, + "grad_norm": 0.251953125, + "learning_rate": 4.678140147619516e-05, + "loss": 2.7865, + "num_input_tokens_seen": 2217738240, + "step": 4230 + }, + { + "epoch": 0.2053818781408939, + "grad_norm": 0.24609375, + "learning_rate": 4.6771554870270055e-05, + "loss": 2.7994, + "num_input_tokens_seen": 2220359680, + "step": 4235 + }, + { + "epoch": 0.20562435969714055, + "grad_norm": 0.251953125, + "learning_rate": 4.6761694265098965e-05, + "loss": 2.7748, + "num_input_tokens_seen": 2222981120, + "step": 4240 + }, + { + "epoch": 0.20586684125338717, + "grad_norm": 0.24609375, + "learning_rate": 4.675181966702232e-05, + "loss": 2.785, + "num_input_tokens_seen": 2225602560, + "step": 4245 + }, + { + "epoch": 0.2061093228096338, + "grad_norm": 0.248046875, + "learning_rate": 4.6741931082389545e-05, + "loss": 2.7839, + "num_input_tokens_seen": 2228224000, + "step": 4250 + }, + { + "epoch": 0.2063518043658804, + "grad_norm": 0.255859375, + "learning_rate": 4.673202851755907e-05, + "loss": 2.7781, + "num_input_tokens_seen": 2230845440, + "step": 4255 + }, + { + "epoch": 0.20659428592212706, + "grad_norm": 0.259765625, + "learning_rate": 4.6722111978898306e-05, + "loss": 2.7961, + "num_input_tokens_seen": 2233466880, + "step": 4260 + }, + { + "epoch": 0.20683676747837368, + "grad_norm": 0.267578125, + "learning_rate": 4.671218147278364e-05, + "loss": 2.8002, + "num_input_tokens_seen": 2236088320, + "step": 4265 + }, + { + "epoch": 0.2070792490346203, + "grad_norm": 0.2578125, + "learning_rate": 4.6702237005600456e-05, + "loss": 2.7901, + "num_input_tokens_seen": 2238709760, + "step": 4270 + }, + { + "epoch": 0.20732173059086692, + "grad_norm": 0.259765625, + "learning_rate": 4.6692278583743116e-05, + "loss": 2.8018, + "num_input_tokens_seen": 2241331200, + "step": 4275 + }, + { + "epoch": 0.20756421214711357, + "grad_norm": 0.251953125, + "learning_rate": 4.6682306213614935e-05, + "loss": 2.8013, + "num_input_tokens_seen": 2243952640, + "step": 4280 + }, + { + "epoch": 0.2078066937033602, + "grad_norm": 0.265625, + "learning_rate": 4.6672319901628214e-05, + "loss": 2.778, + "num_input_tokens_seen": 2246574080, + "step": 4285 + }, + { + "epoch": 0.2080491752596068, + "grad_norm": 0.263671875, + "learning_rate": 4.666231965420421e-05, + "loss": 2.7975, + "num_input_tokens_seen": 2249195520, + "step": 4290 + }, + { + "epoch": 0.20829165681585343, + "grad_norm": 0.26171875, + "learning_rate": 4.665230547777316e-05, + "loss": 2.7774, + "num_input_tokens_seen": 2251816960, + "step": 4295 + }, + { + "epoch": 0.20853413837210008, + "grad_norm": 0.25, + "learning_rate": 4.6642277378774224e-05, + "loss": 2.7957, + "num_input_tokens_seen": 2254438400, + "step": 4300 + }, + { + "epoch": 0.2087766199283467, + "grad_norm": 0.25390625, + "learning_rate": 4.6632235363655544e-05, + "loss": 2.7776, + "num_input_tokens_seen": 2257059840, + "step": 4305 + }, + { + "epoch": 0.20901910148459332, + "grad_norm": 0.259765625, + "learning_rate": 4.662217943887419e-05, + "loss": 2.7951, + "num_input_tokens_seen": 2259681280, + "step": 4310 + }, + { + "epoch": 0.20926158304083994, + "grad_norm": 0.2470703125, + "learning_rate": 4.661210961089619e-05, + "loss": 2.7819, + "num_input_tokens_seen": 2262302720, + "step": 4315 + }, + { + "epoch": 0.2095040645970866, + "grad_norm": 0.251953125, + "learning_rate": 4.660202588619651e-05, + "loss": 2.792, + "num_input_tokens_seen": 2264924160, + "step": 4320 + }, + { + "epoch": 0.2097465461533332, + "grad_norm": 0.255859375, + "learning_rate": 4.659192827125904e-05, + "loss": 2.7868, + "num_input_tokens_seen": 2267545600, + "step": 4325 + }, + { + "epoch": 0.20998902770957983, + "grad_norm": 0.251953125, + "learning_rate": 4.6581816772576616e-05, + "loss": 2.7768, + "num_input_tokens_seen": 2270167040, + "step": 4330 + }, + { + "epoch": 0.21023150926582646, + "grad_norm": 0.255859375, + "learning_rate": 4.657169139665098e-05, + "loss": 2.7881, + "num_input_tokens_seen": 2272788480, + "step": 4335 + }, + { + "epoch": 0.2104739908220731, + "grad_norm": 0.25390625, + "learning_rate": 4.656155214999283e-05, + "loss": 2.7805, + "num_input_tokens_seen": 2275409920, + "step": 4340 + }, + { + "epoch": 0.21071647237831972, + "grad_norm": 0.259765625, + "learning_rate": 4.655139903912176e-05, + "loss": 2.7865, + "num_input_tokens_seen": 2278031360, + "step": 4345 + }, + { + "epoch": 0.21095895393456635, + "grad_norm": 0.251953125, + "learning_rate": 4.654123207056629e-05, + "loss": 2.7831, + "num_input_tokens_seen": 2280652800, + "step": 4350 + }, + { + "epoch": 0.21120143549081297, + "grad_norm": 0.25, + "learning_rate": 4.653105125086382e-05, + "loss": 2.7884, + "num_input_tokens_seen": 2283274240, + "step": 4355 + }, + { + "epoch": 0.21144391704705962, + "grad_norm": 0.25390625, + "learning_rate": 4.652085658656071e-05, + "loss": 2.7898, + "num_input_tokens_seen": 2285895680, + "step": 4360 + }, + { + "epoch": 0.21168639860330624, + "grad_norm": 0.2490234375, + "learning_rate": 4.6510648084212185e-05, + "loss": 2.7872, + "num_input_tokens_seen": 2288517120, + "step": 4365 + }, + { + "epoch": 0.21192888015955286, + "grad_norm": 0.25, + "learning_rate": 4.650042575038236e-05, + "loss": 2.7695, + "num_input_tokens_seen": 2291138560, + "step": 4370 + }, + { + "epoch": 0.2121713617157995, + "grad_norm": 0.26171875, + "learning_rate": 4.6490189591644274e-05, + "loss": 2.7835, + "num_input_tokens_seen": 2293760000, + "step": 4375 + }, + { + "epoch": 0.21241384327204613, + "grad_norm": 0.251953125, + "learning_rate": 4.647993961457984e-05, + "loss": 2.7704, + "num_input_tokens_seen": 2296381440, + "step": 4380 + }, + { + "epoch": 0.21265632482829275, + "grad_norm": 0.26171875, + "learning_rate": 4.646967582577986e-05, + "loss": 2.7883, + "num_input_tokens_seen": 2299002880, + "step": 4385 + }, + { + "epoch": 0.21289880638453937, + "grad_norm": 0.251953125, + "learning_rate": 4.6459398231843996e-05, + "loss": 2.782, + "num_input_tokens_seen": 2301624320, + "step": 4390 + }, + { + "epoch": 0.21314128794078602, + "grad_norm": 0.251953125, + "learning_rate": 4.644910683938084e-05, + "loss": 2.7908, + "num_input_tokens_seen": 2304245760, + "step": 4395 + }, + { + "epoch": 0.21338376949703264, + "grad_norm": 0.271484375, + "learning_rate": 4.643880165500778e-05, + "loss": 2.7901, + "num_input_tokens_seen": 2306867200, + "step": 4400 + }, + { + "epoch": 0.21362625105327926, + "grad_norm": 0.25390625, + "learning_rate": 4.642848268535115e-05, + "loss": 2.7943, + "num_input_tokens_seen": 2309488640, + "step": 4405 + }, + { + "epoch": 0.21386873260952588, + "grad_norm": 0.2490234375, + "learning_rate": 4.641814993704609e-05, + "loss": 2.7824, + "num_input_tokens_seen": 2312110080, + "step": 4410 + }, + { + "epoch": 0.21411121416577253, + "grad_norm": 0.2490234375, + "learning_rate": 4.640780341673663e-05, + "loss": 2.7923, + "num_input_tokens_seen": 2314731520, + "step": 4415 + }, + { + "epoch": 0.21435369572201915, + "grad_norm": 0.255859375, + "learning_rate": 4.6397443131075647e-05, + "loss": 2.79, + "num_input_tokens_seen": 2317352960, + "step": 4420 + }, + { + "epoch": 0.21459617727826577, + "grad_norm": 0.25390625, + "learning_rate": 4.638706908672487e-05, + "loss": 2.7971, + "num_input_tokens_seen": 2319974400, + "step": 4425 + }, + { + "epoch": 0.2148386588345124, + "grad_norm": 0.251953125, + "learning_rate": 4.637668129035487e-05, + "loss": 2.7933, + "num_input_tokens_seen": 2322595840, + "step": 4430 + }, + { + "epoch": 0.21508114039075904, + "grad_norm": 0.255859375, + "learning_rate": 4.636627974864507e-05, + "loss": 2.7892, + "num_input_tokens_seen": 2325217280, + "step": 4435 + }, + { + "epoch": 0.21532362194700566, + "grad_norm": 0.26953125, + "learning_rate": 4.6355864468283726e-05, + "loss": 2.792, + "num_input_tokens_seen": 2327838720, + "step": 4440 + }, + { + "epoch": 0.21556610350325228, + "grad_norm": 0.265625, + "learning_rate": 4.634543545596792e-05, + "loss": 2.7811, + "num_input_tokens_seen": 2330460160, + "step": 4445 + }, + { + "epoch": 0.2158085850594989, + "grad_norm": 0.248046875, + "learning_rate": 4.633499271840359e-05, + "loss": 2.7816, + "num_input_tokens_seen": 2333081600, + "step": 4450 + }, + { + "epoch": 0.21605106661574555, + "grad_norm": 0.251953125, + "learning_rate": 4.632453626230546e-05, + "loss": 2.8012, + "num_input_tokens_seen": 2335703040, + "step": 4455 + }, + { + "epoch": 0.21629354817199217, + "grad_norm": 0.259765625, + "learning_rate": 4.631406609439711e-05, + "loss": 2.7855, + "num_input_tokens_seen": 2338324480, + "step": 4460 + }, + { + "epoch": 0.2165360297282388, + "grad_norm": 0.25, + "learning_rate": 4.630358222141092e-05, + "loss": 2.766, + "num_input_tokens_seen": 2340945920, + "step": 4465 + }, + { + "epoch": 0.21677851128448541, + "grad_norm": 0.25390625, + "learning_rate": 4.629308465008809e-05, + "loss": 2.7734, + "num_input_tokens_seen": 2343567360, + "step": 4470 + }, + { + "epoch": 0.21702099284073206, + "grad_norm": 0.2578125, + "learning_rate": 4.628257338717862e-05, + "loss": 2.7903, + "num_input_tokens_seen": 2346188800, + "step": 4475 + }, + { + "epoch": 0.21726347439697868, + "grad_norm": 0.255859375, + "learning_rate": 4.6272048439441315e-05, + "loss": 2.7796, + "num_input_tokens_seen": 2348810240, + "step": 4480 + }, + { + "epoch": 0.2175059559532253, + "grad_norm": 0.251953125, + "learning_rate": 4.62615098136438e-05, + "loss": 2.7905, + "num_input_tokens_seen": 2351431680, + "step": 4485 + }, + { + "epoch": 0.21774843750947193, + "grad_norm": 0.251953125, + "learning_rate": 4.625095751656245e-05, + "loss": 2.7908, + "num_input_tokens_seen": 2354053120, + "step": 4490 + }, + { + "epoch": 0.21799091906571857, + "grad_norm": 0.255859375, + "learning_rate": 4.624039155498247e-05, + "loss": 2.7879, + "num_input_tokens_seen": 2356674560, + "step": 4495 + }, + { + "epoch": 0.2182334006219652, + "grad_norm": 0.251953125, + "learning_rate": 4.622981193569784e-05, + "loss": 2.793, + "num_input_tokens_seen": 2359296000, + "step": 4500 + }, + { + "epoch": 0.2182334006219652, + "eval_accuracy": 0.4542712913206318, + "eval_loss": 2.7538249492645264, + "eval_runtime": 5.8817, + "eval_samples_per_second": 51.006, + "eval_steps_per_second": 6.461, + "num_input_tokens_seen": 2359296000, + "step": 4500 + }, + { + "epoch": 0.21847588217821182, + "grad_norm": 0.255859375, + "learning_rate": 4.621921866551133e-05, + "loss": 2.7789, + "num_input_tokens_seen": 2361917440, + "step": 4505 + }, + { + "epoch": 0.21871836373445844, + "grad_norm": 0.255859375, + "learning_rate": 4.620861175123446e-05, + "loss": 2.7708, + "num_input_tokens_seen": 2364538880, + "step": 4510 + }, + { + "epoch": 0.2189608452907051, + "grad_norm": 0.2578125, + "learning_rate": 4.6197991199687566e-05, + "loss": 2.8001, + "num_input_tokens_seen": 2367160320, + "step": 4515 + }, + { + "epoch": 0.2192033268469517, + "grad_norm": 0.2451171875, + "learning_rate": 4.6187357017699716e-05, + "loss": 2.7862, + "num_input_tokens_seen": 2369781760, + "step": 4520 + }, + { + "epoch": 0.21944580840319833, + "grad_norm": 0.25390625, + "learning_rate": 4.617670921210875e-05, + "loss": 2.7765, + "num_input_tokens_seen": 2372403200, + "step": 4525 + }, + { + "epoch": 0.21968828995944495, + "grad_norm": 0.259765625, + "learning_rate": 4.616604778976128e-05, + "loss": 2.7928, + "num_input_tokens_seen": 2375024640, + "step": 4530 + }, + { + "epoch": 0.2199307715156916, + "grad_norm": 0.2734375, + "learning_rate": 4.615537275751266e-05, + "loss": 2.7938, + "num_input_tokens_seen": 2377646080, + "step": 4535 + }, + { + "epoch": 0.22017325307193822, + "grad_norm": 0.271484375, + "learning_rate": 4.614468412222702e-05, + "loss": 2.7835, + "num_input_tokens_seen": 2380267520, + "step": 4540 + }, + { + "epoch": 0.22041573462818484, + "grad_norm": 0.265625, + "learning_rate": 4.61339818907772e-05, + "loss": 2.7925, + "num_input_tokens_seen": 2382888960, + "step": 4545 + }, + { + "epoch": 0.22065821618443146, + "grad_norm": 0.27734375, + "learning_rate": 4.612326607004481e-05, + "loss": 2.797, + "num_input_tokens_seen": 2385510400, + "step": 4550 + }, + { + "epoch": 0.2209006977406781, + "grad_norm": 0.26953125, + "learning_rate": 4.61125366669202e-05, + "loss": 2.791, + "num_input_tokens_seen": 2388131840, + "step": 4555 + }, + { + "epoch": 0.22114317929692473, + "grad_norm": 0.283203125, + "learning_rate": 4.610179368830243e-05, + "loss": 2.7996, + "num_input_tokens_seen": 2390753280, + "step": 4560 + }, + { + "epoch": 0.22138566085317135, + "grad_norm": 0.259765625, + "learning_rate": 4.60910371410993e-05, + "loss": 2.7943, + "num_input_tokens_seen": 2393374720, + "step": 4565 + }, + { + "epoch": 0.22162814240941797, + "grad_norm": 0.259765625, + "learning_rate": 4.608026703222735e-05, + "loss": 2.79, + "num_input_tokens_seen": 2395996160, + "step": 4570 + }, + { + "epoch": 0.22187062396566462, + "grad_norm": 0.25, + "learning_rate": 4.6069483368611815e-05, + "loss": 2.7917, + "num_input_tokens_seen": 2398617600, + "step": 4575 + }, + { + "epoch": 0.22211310552191124, + "grad_norm": 0.2470703125, + "learning_rate": 4.605868615718667e-05, + "loss": 2.7813, + "num_input_tokens_seen": 2401239040, + "step": 4580 + }, + { + "epoch": 0.22235558707815786, + "grad_norm": 0.25390625, + "learning_rate": 4.604787540489458e-05, + "loss": 2.7822, + "num_input_tokens_seen": 2403860480, + "step": 4585 + }, + { + "epoch": 0.22259806863440448, + "grad_norm": 0.271484375, + "learning_rate": 4.603705111868693e-05, + "loss": 2.7883, + "num_input_tokens_seen": 2406481920, + "step": 4590 + }, + { + "epoch": 0.22284055019065113, + "grad_norm": 0.255859375, + "learning_rate": 4.6026213305523794e-05, + "loss": 2.7924, + "num_input_tokens_seen": 2409103360, + "step": 4595 + }, + { + "epoch": 0.22308303174689775, + "grad_norm": 0.2578125, + "learning_rate": 4.601536197237397e-05, + "loss": 2.7906, + "num_input_tokens_seen": 2411724800, + "step": 4600 + }, + { + "epoch": 0.22332551330314437, + "grad_norm": 0.265625, + "learning_rate": 4.600449712621493e-05, + "loss": 2.7683, + "num_input_tokens_seen": 2414346240, + "step": 4605 + }, + { + "epoch": 0.223567994859391, + "grad_norm": 0.263671875, + "learning_rate": 4.5993618774032824e-05, + "loss": 2.7971, + "num_input_tokens_seen": 2416967680, + "step": 4610 + }, + { + "epoch": 0.22381047641563764, + "grad_norm": 0.25, + "learning_rate": 4.5982726922822515e-05, + "loss": 2.8059, + "num_input_tokens_seen": 2419589120, + "step": 4615 + }, + { + "epoch": 0.22405295797188426, + "grad_norm": 0.251953125, + "learning_rate": 4.5971821579587536e-05, + "loss": 2.7974, + "num_input_tokens_seen": 2422210560, + "step": 4620 + }, + { + "epoch": 0.22429543952813089, + "grad_norm": 0.2578125, + "learning_rate": 4.596090275134007e-05, + "loss": 2.7842, + "num_input_tokens_seen": 2424832000, + "step": 4625 + }, + { + "epoch": 0.2245379210843775, + "grad_norm": 0.251953125, + "learning_rate": 4.594997044510101e-05, + "loss": 2.7906, + "num_input_tokens_seen": 2427453440, + "step": 4630 + }, + { + "epoch": 0.22478040264062416, + "grad_norm": 0.251953125, + "learning_rate": 4.5939024667899886e-05, + "loss": 2.7935, + "num_input_tokens_seen": 2430074880, + "step": 4635 + }, + { + "epoch": 0.22502288419687078, + "grad_norm": 0.244140625, + "learning_rate": 4.592806542677491e-05, + "loss": 2.7775, + "num_input_tokens_seen": 2432696320, + "step": 4640 + }, + { + "epoch": 0.2252653657531174, + "grad_norm": 0.26171875, + "learning_rate": 4.5917092728772944e-05, + "loss": 2.7844, + "num_input_tokens_seen": 2435317760, + "step": 4645 + }, + { + "epoch": 0.22550784730936402, + "grad_norm": 0.26953125, + "learning_rate": 4.590610658094949e-05, + "loss": 2.7916, + "num_input_tokens_seen": 2437939200, + "step": 4650 + }, + { + "epoch": 0.22575032886561067, + "grad_norm": 0.251953125, + "learning_rate": 4.589510699036872e-05, + "loss": 2.7813, + "num_input_tokens_seen": 2440560640, + "step": 4655 + }, + { + "epoch": 0.2259928104218573, + "grad_norm": 0.244140625, + "learning_rate": 4.588409396410343e-05, + "loss": 2.7931, + "num_input_tokens_seen": 2443182080, + "step": 4660 + }, + { + "epoch": 0.2262352919781039, + "grad_norm": 0.25390625, + "learning_rate": 4.5873067509235065e-05, + "loss": 2.7913, + "num_input_tokens_seen": 2445803520, + "step": 4665 + }, + { + "epoch": 0.22647777353435056, + "grad_norm": 0.251953125, + "learning_rate": 4.5862027632853724e-05, + "loss": 2.7831, + "num_input_tokens_seen": 2448424960, + "step": 4670 + }, + { + "epoch": 0.22672025509059718, + "grad_norm": 0.255859375, + "learning_rate": 4.5850974342058095e-05, + "loss": 2.7832, + "num_input_tokens_seen": 2451046400, + "step": 4675 + }, + { + "epoch": 0.2269627366468438, + "grad_norm": 0.251953125, + "learning_rate": 4.5839907643955525e-05, + "loss": 2.7763, + "num_input_tokens_seen": 2453667840, + "step": 4680 + }, + { + "epoch": 0.22720521820309042, + "grad_norm": 0.25390625, + "learning_rate": 4.582882754566196e-05, + "loss": 2.7882, + "num_input_tokens_seen": 2456289280, + "step": 4685 + }, + { + "epoch": 0.22744769975933707, + "grad_norm": 0.255859375, + "learning_rate": 4.581773405430199e-05, + "loss": 2.7681, + "num_input_tokens_seen": 2458910720, + "step": 4690 + }, + { + "epoch": 0.2276901813155837, + "grad_norm": 0.2578125, + "learning_rate": 4.5806627177008775e-05, + "loss": 2.7858, + "num_input_tokens_seen": 2461532160, + "step": 4695 + }, + { + "epoch": 0.2279326628718303, + "grad_norm": 0.2578125, + "learning_rate": 4.579550692092412e-05, + "loss": 2.7882, + "num_input_tokens_seen": 2464153600, + "step": 4700 + }, + { + "epoch": 0.22817514442807693, + "grad_norm": 0.2490234375, + "learning_rate": 4.578437329319842e-05, + "loss": 2.7782, + "num_input_tokens_seen": 2466775040, + "step": 4705 + }, + { + "epoch": 0.22841762598432358, + "grad_norm": 0.25390625, + "learning_rate": 4.5773226300990666e-05, + "loss": 2.7819, + "num_input_tokens_seen": 2469396480, + "step": 4710 + }, + { + "epoch": 0.2286601075405702, + "grad_norm": 0.2578125, + "learning_rate": 4.576206595146845e-05, + "loss": 2.7845, + "num_input_tokens_seen": 2472017920, + "step": 4715 + }, + { + "epoch": 0.22890258909681682, + "grad_norm": 0.251953125, + "learning_rate": 4.5750892251807934e-05, + "loss": 2.7834, + "num_input_tokens_seen": 2474639360, + "step": 4720 + }, + { + "epoch": 0.22914507065306344, + "grad_norm": 0.259765625, + "learning_rate": 4.573970520919388e-05, + "loss": 2.7876, + "num_input_tokens_seen": 2477260800, + "step": 4725 + }, + { + "epoch": 0.2293875522093101, + "grad_norm": 0.255859375, + "learning_rate": 4.572850483081964e-05, + "loss": 2.7961, + "num_input_tokens_seen": 2479882240, + "step": 4730 + }, + { + "epoch": 0.2296300337655567, + "grad_norm": 0.24609375, + "learning_rate": 4.5717291123887106e-05, + "loss": 2.7874, + "num_input_tokens_seen": 2482503680, + "step": 4735 + }, + { + "epoch": 0.22987251532180333, + "grad_norm": 0.255859375, + "learning_rate": 4.570606409560677e-05, + "loss": 2.7892, + "num_input_tokens_seen": 2485125120, + "step": 4740 + }, + { + "epoch": 0.23011499687804995, + "grad_norm": 0.259765625, + "learning_rate": 4.569482375319769e-05, + "loss": 2.7874, + "num_input_tokens_seen": 2487746560, + "step": 4745 + }, + { + "epoch": 0.2303574784342966, + "grad_norm": 0.2578125, + "learning_rate": 4.5683570103887475e-05, + "loss": 2.7831, + "num_input_tokens_seen": 2490368000, + "step": 4750 + }, + { + "epoch": 0.23059995999054322, + "grad_norm": 0.25390625, + "learning_rate": 4.567230315491228e-05, + "loss": 2.786, + "num_input_tokens_seen": 2492989440, + "step": 4755 + }, + { + "epoch": 0.23084244154678984, + "grad_norm": 0.25390625, + "learning_rate": 4.566102291351683e-05, + "loss": 2.7816, + "num_input_tokens_seen": 2495610880, + "step": 4760 + }, + { + "epoch": 0.23108492310303647, + "grad_norm": 0.2734375, + "learning_rate": 4.5649729386954395e-05, + "loss": 2.8002, + "num_input_tokens_seen": 2498232320, + "step": 4765 + }, + { + "epoch": 0.23132740465928311, + "grad_norm": 0.2470703125, + "learning_rate": 4.563842258248677e-05, + "loss": 2.7963, + "num_input_tokens_seen": 2500853760, + "step": 4770 + }, + { + "epoch": 0.23156988621552974, + "grad_norm": 0.25, + "learning_rate": 4.562710250738433e-05, + "loss": 2.786, + "num_input_tokens_seen": 2503475200, + "step": 4775 + }, + { + "epoch": 0.23181236777177636, + "grad_norm": 0.271484375, + "learning_rate": 4.561576916892592e-05, + "loss": 2.7741, + "num_input_tokens_seen": 2506096640, + "step": 4780 + }, + { + "epoch": 0.23205484932802298, + "grad_norm": 0.259765625, + "learning_rate": 4.560442257439896e-05, + "loss": 2.7907, + "num_input_tokens_seen": 2508718080, + "step": 4785 + }, + { + "epoch": 0.23229733088426963, + "grad_norm": 0.25390625, + "learning_rate": 4.55930627310994e-05, + "loss": 2.7808, + "num_input_tokens_seen": 2511339520, + "step": 4790 + }, + { + "epoch": 0.23253981244051625, + "grad_norm": 0.259765625, + "learning_rate": 4.558168964633166e-05, + "loss": 2.7782, + "num_input_tokens_seen": 2513960960, + "step": 4795 + }, + { + "epoch": 0.23278229399676287, + "grad_norm": 0.259765625, + "learning_rate": 4.557030332740873e-05, + "loss": 2.7958, + "num_input_tokens_seen": 2516582400, + "step": 4800 + }, + { + "epoch": 0.23278229399676287, + "eval_accuracy": 0.4543738804754926, + "eval_loss": 2.7526228427886963, + "eval_runtime": 5.8809, + "eval_samples_per_second": 51.013, + "eval_steps_per_second": 6.462, + "num_input_tokens_seen": 2516582400, + "step": 4800 + }, + { + "epoch": 0.2330247755530095, + "grad_norm": 0.2490234375, + "learning_rate": 4.555890378165206e-05, + "loss": 2.7984, + "num_input_tokens_seen": 2519203840, + "step": 4805 + }, + { + "epoch": 0.23326725710925614, + "grad_norm": 0.25390625, + "learning_rate": 4.5547491016391645e-05, + "loss": 2.7877, + "num_input_tokens_seen": 2521825280, + "step": 4810 + }, + { + "epoch": 0.23350973866550276, + "grad_norm": 0.2451171875, + "learning_rate": 4.553606503896597e-05, + "loss": 2.7959, + "num_input_tokens_seen": 2524446720, + "step": 4815 + }, + { + "epoch": 0.23375222022174938, + "grad_norm": 0.255859375, + "learning_rate": 4.552462585672199e-05, + "loss": 2.7823, + "num_input_tokens_seen": 2527068160, + "step": 4820 + }, + { + "epoch": 0.233994701777996, + "grad_norm": 0.259765625, + "learning_rate": 4.551317347701519e-05, + "loss": 2.7733, + "num_input_tokens_seen": 2529689600, + "step": 4825 + }, + { + "epoch": 0.23423718333424265, + "grad_norm": 0.255859375, + "learning_rate": 4.550170790720951e-05, + "loss": 2.7665, + "num_input_tokens_seen": 2532311040, + "step": 4830 + }, + { + "epoch": 0.23447966489048927, + "grad_norm": 0.25390625, + "learning_rate": 4.549022915467739e-05, + "loss": 2.7968, + "num_input_tokens_seen": 2534932480, + "step": 4835 + }, + { + "epoch": 0.2347221464467359, + "grad_norm": 0.25390625, + "learning_rate": 4.5478737226799736e-05, + "loss": 2.7886, + "num_input_tokens_seen": 2537553920, + "step": 4840 + }, + { + "epoch": 0.2349646280029825, + "grad_norm": 0.2578125, + "learning_rate": 4.5467232130965935e-05, + "loss": 2.7939, + "num_input_tokens_seen": 2540175360, + "step": 4845 + }, + { + "epoch": 0.23520710955922916, + "grad_norm": 0.259765625, + "learning_rate": 4.5455713874573825e-05, + "loss": 2.7814, + "num_input_tokens_seen": 2542796800, + "step": 4850 + }, + { + "epoch": 0.23544959111547578, + "grad_norm": 0.251953125, + "learning_rate": 4.5444182465029726e-05, + "loss": 2.7878, + "num_input_tokens_seen": 2545418240, + "step": 4855 + }, + { + "epoch": 0.2356920726717224, + "grad_norm": 0.26171875, + "learning_rate": 4.54326379097484e-05, + "loss": 2.7799, + "num_input_tokens_seen": 2548039680, + "step": 4860 + }, + { + "epoch": 0.23593455422796902, + "grad_norm": 0.26953125, + "learning_rate": 4.542108021615308e-05, + "loss": 2.8001, + "num_input_tokens_seen": 2550661120, + "step": 4865 + }, + { + "epoch": 0.23617703578421567, + "grad_norm": 0.26953125, + "learning_rate": 4.540950939167542e-05, + "loss": 2.7806, + "num_input_tokens_seen": 2553282560, + "step": 4870 + }, + { + "epoch": 0.2364195173404623, + "grad_norm": 0.2578125, + "learning_rate": 4.539792544375554e-05, + "loss": 2.7804, + "num_input_tokens_seen": 2555904000, + "step": 4875 + }, + { + "epoch": 0.2366619988967089, + "grad_norm": 0.263671875, + "learning_rate": 4.538632837984199e-05, + "loss": 2.7779, + "num_input_tokens_seen": 2558525440, + "step": 4880 + }, + { + "epoch": 0.23690448045295553, + "grad_norm": 0.251953125, + "learning_rate": 4.537471820739176e-05, + "loss": 2.7868, + "num_input_tokens_seen": 2561146880, + "step": 4885 + }, + { + "epoch": 0.23714696200920218, + "grad_norm": 0.24609375, + "learning_rate": 4.536309493387025e-05, + "loss": 2.7886, + "num_input_tokens_seen": 2563768320, + "step": 4890 + }, + { + "epoch": 0.2373894435654488, + "grad_norm": 0.2734375, + "learning_rate": 4.5351458566751317e-05, + "loss": 2.7962, + "num_input_tokens_seen": 2566389760, + "step": 4895 + }, + { + "epoch": 0.23763192512169543, + "grad_norm": 0.2578125, + "learning_rate": 4.53398091135172e-05, + "loss": 2.7919, + "num_input_tokens_seen": 2569011200, + "step": 4900 + }, + { + "epoch": 0.23787440667794205, + "grad_norm": 0.251953125, + "learning_rate": 4.532814658165858e-05, + "loss": 2.7918, + "num_input_tokens_seen": 2571632640, + "step": 4905 + }, + { + "epoch": 0.2381168882341887, + "grad_norm": 0.2578125, + "learning_rate": 4.5316470978674536e-05, + "loss": 2.7812, + "num_input_tokens_seen": 2574254080, + "step": 4910 + }, + { + "epoch": 0.23835936979043532, + "grad_norm": 0.2578125, + "learning_rate": 4.530478231207255e-05, + "loss": 2.791, + "num_input_tokens_seen": 2576875520, + "step": 4915 + }, + { + "epoch": 0.23860185134668194, + "grad_norm": 0.2451171875, + "learning_rate": 4.5293080589368513e-05, + "loss": 2.7838, + "num_input_tokens_seen": 2579496960, + "step": 4920 + }, + { + "epoch": 0.23884433290292856, + "grad_norm": 0.248046875, + "learning_rate": 4.52813658180867e-05, + "loss": 2.7861, + "num_input_tokens_seen": 2582118400, + "step": 4925 + }, + { + "epoch": 0.2390868144591752, + "grad_norm": 0.25, + "learning_rate": 4.52696380057598e-05, + "loss": 2.7665, + "num_input_tokens_seen": 2584739840, + "step": 4930 + }, + { + "epoch": 0.23932929601542183, + "grad_norm": 0.2470703125, + "learning_rate": 4.5257897159928844e-05, + "loss": 2.7923, + "num_input_tokens_seen": 2587361280, + "step": 4935 + }, + { + "epoch": 0.23957177757166845, + "grad_norm": 0.25390625, + "learning_rate": 4.524614328814327e-05, + "loss": 2.7835, + "num_input_tokens_seen": 2589982720, + "step": 4940 + }, + { + "epoch": 0.23981425912791507, + "grad_norm": 0.28515625, + "learning_rate": 4.523437639796092e-05, + "loss": 2.787, + "num_input_tokens_seen": 2592604160, + "step": 4945 + }, + { + "epoch": 0.24005674068416172, + "grad_norm": 0.265625, + "learning_rate": 4.5222596496947954e-05, + "loss": 2.7922, + "num_input_tokens_seen": 2595225600, + "step": 4950 + }, + { + "epoch": 0.24029922224040834, + "grad_norm": 0.255859375, + "learning_rate": 4.521080359267893e-05, + "loss": 2.7975, + "num_input_tokens_seen": 2597847040, + "step": 4955 + }, + { + "epoch": 0.24054170379665496, + "grad_norm": 0.259765625, + "learning_rate": 4.519899769273676e-05, + "loss": 2.7844, + "num_input_tokens_seen": 2600468480, + "step": 4960 + }, + { + "epoch": 0.2407841853529016, + "grad_norm": 0.248046875, + "learning_rate": 4.518717880471271e-05, + "loss": 2.7835, + "num_input_tokens_seen": 2603089920, + "step": 4965 + }, + { + "epoch": 0.24102666690914823, + "grad_norm": 0.25390625, + "learning_rate": 4.517534693620639e-05, + "loss": 2.79, + "num_input_tokens_seen": 2605711360, + "step": 4970 + }, + { + "epoch": 0.24126914846539485, + "grad_norm": 0.2578125, + "learning_rate": 4.516350209482577e-05, + "loss": 2.7865, + "num_input_tokens_seen": 2608332800, + "step": 4975 + }, + { + "epoch": 0.24151163002164147, + "grad_norm": 0.259765625, + "learning_rate": 4.515164428818717e-05, + "loss": 2.7822, + "num_input_tokens_seen": 2610954240, + "step": 4980 + }, + { + "epoch": 0.24175411157788812, + "grad_norm": 0.25390625, + "learning_rate": 4.513977352391522e-05, + "loss": 2.7986, + "num_input_tokens_seen": 2613575680, + "step": 4985 + }, + { + "epoch": 0.24199659313413474, + "grad_norm": 0.26171875, + "learning_rate": 4.5127889809642896e-05, + "loss": 2.7906, + "num_input_tokens_seen": 2616197120, + "step": 4990 + }, + { + "epoch": 0.24223907469038136, + "grad_norm": 0.26171875, + "learning_rate": 4.511599315301151e-05, + "loss": 2.7838, + "num_input_tokens_seen": 2618818560, + "step": 4995 + }, + { + "epoch": 0.24248155624662798, + "grad_norm": 0.2578125, + "learning_rate": 4.5104083561670686e-05, + "loss": 2.7927, + "num_input_tokens_seen": 2621440000, + "step": 5000 + }, + { + "epoch": 0.24272403780287463, + "grad_norm": 0.263671875, + "learning_rate": 4.5092161043278344e-05, + "loss": 2.7774, + "num_input_tokens_seen": 2624061440, + "step": 5005 + }, + { + "epoch": 0.24296651935912125, + "grad_norm": 0.26171875, + "learning_rate": 4.508022560550077e-05, + "loss": 2.7792, + "num_input_tokens_seen": 2626682880, + "step": 5010 + }, + { + "epoch": 0.24320900091536787, + "grad_norm": 0.26171875, + "learning_rate": 4.506827725601251e-05, + "loss": 2.7848, + "num_input_tokens_seen": 2629304320, + "step": 5015 + }, + { + "epoch": 0.2434514824716145, + "grad_norm": 0.25, + "learning_rate": 4.5056316002496424e-05, + "loss": 2.7974, + "num_input_tokens_seen": 2631925760, + "step": 5020 + }, + { + "epoch": 0.24369396402786114, + "grad_norm": 0.271484375, + "learning_rate": 4.504434185264368e-05, + "loss": 2.8029, + "num_input_tokens_seen": 2634547200, + "step": 5025 + }, + { + "epoch": 0.24393644558410776, + "grad_norm": 0.25390625, + "learning_rate": 4.5032354814153724e-05, + "loss": 2.7913, + "num_input_tokens_seen": 2637168640, + "step": 5030 + }, + { + "epoch": 0.24417892714035438, + "grad_norm": 0.25390625, + "learning_rate": 4.502035489473429e-05, + "loss": 2.794, + "num_input_tokens_seen": 2639790080, + "step": 5035 + }, + { + "epoch": 0.244421408696601, + "grad_norm": 0.265625, + "learning_rate": 4.500834210210143e-05, + "loss": 2.7875, + "num_input_tokens_seen": 2642411520, + "step": 5040 + }, + { + "epoch": 0.24466389025284765, + "grad_norm": 0.2470703125, + "learning_rate": 4.499631644397941e-05, + "loss": 2.788, + "num_input_tokens_seen": 2645032960, + "step": 5045 + }, + { + "epoch": 0.24490637180909428, + "grad_norm": 0.259765625, + "learning_rate": 4.498427792810084e-05, + "loss": 2.7795, + "num_input_tokens_seen": 2647654400, + "step": 5050 + }, + { + "epoch": 0.2451488533653409, + "grad_norm": 0.2421875, + "learning_rate": 4.497222656220652e-05, + "loss": 2.7842, + "num_input_tokens_seen": 2650275840, + "step": 5055 + }, + { + "epoch": 0.24539133492158752, + "grad_norm": 0.251953125, + "learning_rate": 4.496016235404559e-05, + "loss": 2.8141, + "num_input_tokens_seen": 2652897280, + "step": 5060 + }, + { + "epoch": 0.24563381647783417, + "grad_norm": 0.263671875, + "learning_rate": 4.4948085311375386e-05, + "loss": 2.7749, + "num_input_tokens_seen": 2655518720, + "step": 5065 + }, + { + "epoch": 0.2458762980340808, + "grad_norm": 0.251953125, + "learning_rate": 4.4935995441961535e-05, + "loss": 2.7758, + "num_input_tokens_seen": 2658140160, + "step": 5070 + }, + { + "epoch": 0.2461187795903274, + "grad_norm": 0.2578125, + "learning_rate": 4.4923892753577895e-05, + "loss": 2.7871, + "num_input_tokens_seen": 2660761600, + "step": 5075 + }, + { + "epoch": 0.24636126114657403, + "grad_norm": 0.25390625, + "learning_rate": 4.491177725400657e-05, + "loss": 2.7752, + "num_input_tokens_seen": 2663383040, + "step": 5080 + }, + { + "epoch": 0.24660374270282068, + "grad_norm": 0.255859375, + "learning_rate": 4.489964895103791e-05, + "loss": 2.7848, + "num_input_tokens_seen": 2666004480, + "step": 5085 + }, + { + "epoch": 0.2468462242590673, + "grad_norm": 0.255859375, + "learning_rate": 4.488750785247048e-05, + "loss": 2.7906, + "num_input_tokens_seen": 2668625920, + "step": 5090 + }, + { + "epoch": 0.24708870581531392, + "grad_norm": 0.259765625, + "learning_rate": 4.487535396611108e-05, + "loss": 2.7912, + "num_input_tokens_seen": 2671247360, + "step": 5095 + }, + { + "epoch": 0.24733118737156054, + "grad_norm": 0.2578125, + "learning_rate": 4.486318729977474e-05, + "loss": 2.78, + "num_input_tokens_seen": 2673868800, + "step": 5100 + }, + { + "epoch": 0.24733118737156054, + "eval_accuracy": 0.45466047874938936, + "eval_loss": 2.751500129699707, + "eval_runtime": 5.8324, + "eval_samples_per_second": 51.437, + "eval_steps_per_second": 6.515, + "num_input_tokens_seen": 2673868800, + "step": 5100 + }, + { + "epoch": 0.2475736689278072, + "grad_norm": 0.25, + "learning_rate": 4.48510078612847e-05, + "loss": 2.7894, + "num_input_tokens_seen": 2676490240, + "step": 5105 + }, + { + "epoch": 0.2478161504840538, + "grad_norm": 0.24609375, + "learning_rate": 4.4838815658472425e-05, + "loss": 2.7768, + "num_input_tokens_seen": 2679111680, + "step": 5110 + }, + { + "epoch": 0.24805863204030043, + "grad_norm": 0.24609375, + "learning_rate": 4.482661069917756e-05, + "loss": 2.7889, + "num_input_tokens_seen": 2681733120, + "step": 5115 + }, + { + "epoch": 0.24830111359654705, + "grad_norm": 0.259765625, + "learning_rate": 4.481439299124799e-05, + "loss": 2.7859, + "num_input_tokens_seen": 2684354560, + "step": 5120 + }, + { + "epoch": 0.2485435951527937, + "grad_norm": 0.25390625, + "learning_rate": 4.4802162542539774e-05, + "loss": 2.7794, + "num_input_tokens_seen": 2686976000, + "step": 5125 + }, + { + "epoch": 0.24878607670904032, + "grad_norm": 0.2734375, + "learning_rate": 4.478991936091714e-05, + "loss": 2.7955, + "num_input_tokens_seen": 2689597440, + "step": 5130 + }, + { + "epoch": 0.24902855826528694, + "grad_norm": 0.26171875, + "learning_rate": 4.477766345425257e-05, + "loss": 2.7731, + "num_input_tokens_seen": 2692218880, + "step": 5135 + }, + { + "epoch": 0.24927103982153356, + "grad_norm": 0.251953125, + "learning_rate": 4.476539483042666e-05, + "loss": 2.7784, + "num_input_tokens_seen": 2694840320, + "step": 5140 + }, + { + "epoch": 0.2495135213777802, + "grad_norm": 0.2578125, + "learning_rate": 4.475311349732823e-05, + "loss": 2.7798, + "num_input_tokens_seen": 2697461760, + "step": 5145 + }, + { + "epoch": 0.24975600293402683, + "grad_norm": 0.275390625, + "learning_rate": 4.4740819462854245e-05, + "loss": 2.7754, + "num_input_tokens_seen": 2700083200, + "step": 5150 + }, + { + "epoch": 0.24999848449027345, + "grad_norm": 0.25390625, + "learning_rate": 4.4728512734909844e-05, + "loss": 2.7828, + "num_input_tokens_seen": 2702704640, + "step": 5155 + }, + { + "epoch": 0.2502409660465201, + "grad_norm": 0.255859375, + "learning_rate": 4.471619332140833e-05, + "loss": 2.7775, + "num_input_tokens_seen": 2705326080, + "step": 5160 + }, + { + "epoch": 0.2504834476027667, + "grad_norm": 0.265625, + "learning_rate": 4.470386123027117e-05, + "loss": 2.7935, + "num_input_tokens_seen": 2707947520, + "step": 5165 + }, + { + "epoch": 0.2507259291590133, + "grad_norm": 0.251953125, + "learning_rate": 4.469151646942797e-05, + "loss": 2.7943, + "num_input_tokens_seen": 2710568960, + "step": 5170 + }, + { + "epoch": 0.25096841071526, + "grad_norm": 0.255859375, + "learning_rate": 4.467915904681649e-05, + "loss": 2.7892, + "num_input_tokens_seen": 2713190400, + "step": 5175 + }, + { + "epoch": 0.2512108922715066, + "grad_norm": 0.25390625, + "learning_rate": 4.466678897038263e-05, + "loss": 2.7888, + "num_input_tokens_seen": 2715811840, + "step": 5180 + }, + { + "epoch": 0.25145337382775323, + "grad_norm": 0.26953125, + "learning_rate": 4.465440624808043e-05, + "loss": 2.7868, + "num_input_tokens_seen": 2718433280, + "step": 5185 + }, + { + "epoch": 0.25169585538399986, + "grad_norm": 0.2470703125, + "learning_rate": 4.4642010887872056e-05, + "loss": 2.7908, + "num_input_tokens_seen": 2721054720, + "step": 5190 + }, + { + "epoch": 0.2519383369402465, + "grad_norm": 0.255859375, + "learning_rate": 4.4629602897727804e-05, + "loss": 2.7858, + "num_input_tokens_seen": 2723676160, + "step": 5195 + }, + { + "epoch": 0.2521808184964931, + "grad_norm": 0.255859375, + "learning_rate": 4.461718228562608e-05, + "loss": 2.7764, + "num_input_tokens_seen": 2726297600, + "step": 5200 + }, + { + "epoch": 0.2524233000527397, + "grad_norm": 0.2578125, + "learning_rate": 4.460474905955342e-05, + "loss": 2.7833, + "num_input_tokens_seen": 2728919040, + "step": 5205 + }, + { + "epoch": 0.25266578160898634, + "grad_norm": 0.255859375, + "learning_rate": 4.4592303227504476e-05, + "loss": 2.7957, + "num_input_tokens_seen": 2731540480, + "step": 5210 + }, + { + "epoch": 0.252908263165233, + "grad_norm": 0.25390625, + "learning_rate": 4.457984479748197e-05, + "loss": 2.778, + "num_input_tokens_seen": 2734161920, + "step": 5215 + }, + { + "epoch": 0.25315074472147964, + "grad_norm": 0.263671875, + "learning_rate": 4.456737377749678e-05, + "loss": 2.8094, + "num_input_tokens_seen": 2736783360, + "step": 5220 + }, + { + "epoch": 0.25339322627772626, + "grad_norm": 0.244140625, + "learning_rate": 4.455489017556784e-05, + "loss": 2.7743, + "num_input_tokens_seen": 2739404800, + "step": 5225 + }, + { + "epoch": 0.2536357078339729, + "grad_norm": 0.251953125, + "learning_rate": 4.454239399972218e-05, + "loss": 2.7785, + "num_input_tokens_seen": 2742026240, + "step": 5230 + }, + { + "epoch": 0.2538781893902195, + "grad_norm": 0.255859375, + "learning_rate": 4.452988525799492e-05, + "loss": 2.779, + "num_input_tokens_seen": 2744647680, + "step": 5235 + }, + { + "epoch": 0.2541206709464661, + "grad_norm": 0.255859375, + "learning_rate": 4.451736395842926e-05, + "loss": 2.7809, + "num_input_tokens_seen": 2747269120, + "step": 5240 + }, + { + "epoch": 0.25436315250271274, + "grad_norm": 0.255859375, + "learning_rate": 4.450483010907648e-05, + "loss": 2.7811, + "num_input_tokens_seen": 2749890560, + "step": 5245 + }, + { + "epoch": 0.2546056340589594, + "grad_norm": 0.25, + "learning_rate": 4.449228371799591e-05, + "loss": 2.7724, + "num_input_tokens_seen": 2752512000, + "step": 5250 + }, + { + "epoch": 0.25484811561520604, + "grad_norm": 0.251953125, + "learning_rate": 4.447972479325497e-05, + "loss": 2.7836, + "num_input_tokens_seen": 2755133440, + "step": 5255 + }, + { + "epoch": 0.25509059717145266, + "grad_norm": 0.2578125, + "learning_rate": 4.446715334292913e-05, + "loss": 2.7972, + "num_input_tokens_seen": 2757754880, + "step": 5260 + }, + { + "epoch": 0.2553330787276993, + "grad_norm": 0.2578125, + "learning_rate": 4.445456937510188e-05, + "loss": 2.7827, + "num_input_tokens_seen": 2760376320, + "step": 5265 + }, + { + "epoch": 0.2555755602839459, + "grad_norm": 0.25, + "learning_rate": 4.4441972897864833e-05, + "loss": 2.7784, + "num_input_tokens_seen": 2762997760, + "step": 5270 + }, + { + "epoch": 0.2558180418401925, + "grad_norm": 0.2451171875, + "learning_rate": 4.442936391931759e-05, + "loss": 2.7896, + "num_input_tokens_seen": 2765619200, + "step": 5275 + }, + { + "epoch": 0.25606052339643914, + "grad_norm": 0.26171875, + "learning_rate": 4.4416742447567784e-05, + "loss": 2.7843, + "num_input_tokens_seen": 2768240640, + "step": 5280 + }, + { + "epoch": 0.25630300495268576, + "grad_norm": 0.255859375, + "learning_rate": 4.440410849073112e-05, + "loss": 2.785, + "num_input_tokens_seen": 2770862080, + "step": 5285 + }, + { + "epoch": 0.25654548650893244, + "grad_norm": 0.2578125, + "learning_rate": 4.43914620569313e-05, + "loss": 2.7713, + "num_input_tokens_seen": 2773483520, + "step": 5290 + }, + { + "epoch": 0.25678796806517906, + "grad_norm": 0.26953125, + "learning_rate": 4.4378803154300066e-05, + "loss": 2.7791, + "num_input_tokens_seen": 2776104960, + "step": 5295 + }, + { + "epoch": 0.2570304496214257, + "grad_norm": 0.26171875, + "learning_rate": 4.4366131790977174e-05, + "loss": 2.8016, + "num_input_tokens_seen": 2778726400, + "step": 5300 + }, + { + "epoch": 0.2572729311776723, + "grad_norm": 0.26171875, + "learning_rate": 4.435344797511038e-05, + "loss": 2.7764, + "num_input_tokens_seen": 2781347840, + "step": 5305 + }, + { + "epoch": 0.2575154127339189, + "grad_norm": 0.2470703125, + "learning_rate": 4.4340751714855475e-05, + "loss": 2.7858, + "num_input_tokens_seen": 2783969280, + "step": 5310 + }, + { + "epoch": 0.25775789429016555, + "grad_norm": 0.259765625, + "learning_rate": 4.432804301837621e-05, + "loss": 2.7711, + "num_input_tokens_seen": 2786590720, + "step": 5315 + }, + { + "epoch": 0.25800037584641217, + "grad_norm": 0.263671875, + "learning_rate": 4.4315321893844375e-05, + "loss": 2.7839, + "num_input_tokens_seen": 2789212160, + "step": 5320 + }, + { + "epoch": 0.2582428574026588, + "grad_norm": 0.25, + "learning_rate": 4.430258834943972e-05, + "loss": 2.7914, + "num_input_tokens_seen": 2791833600, + "step": 5325 + }, + { + "epoch": 0.25848533895890546, + "grad_norm": 0.25390625, + "learning_rate": 4.4289842393350004e-05, + "loss": 2.7876, + "num_input_tokens_seen": 2794455040, + "step": 5330 + }, + { + "epoch": 0.2587278205151521, + "grad_norm": 0.2490234375, + "learning_rate": 4.4277084033770946e-05, + "loss": 2.7891, + "num_input_tokens_seen": 2797076480, + "step": 5335 + }, + { + "epoch": 0.2589703020713987, + "grad_norm": 0.2490234375, + "learning_rate": 4.426431327890626e-05, + "loss": 2.7691, + "num_input_tokens_seen": 2799697920, + "step": 5340 + }, + { + "epoch": 0.2592127836276453, + "grad_norm": 0.263671875, + "learning_rate": 4.425153013696762e-05, + "loss": 2.7807, + "num_input_tokens_seen": 2802319360, + "step": 5345 + }, + { + "epoch": 0.25945526518389195, + "grad_norm": 0.248046875, + "learning_rate": 4.423873461617467e-05, + "loss": 2.7843, + "num_input_tokens_seen": 2804940800, + "step": 5350 + }, + { + "epoch": 0.25969774674013857, + "grad_norm": 0.255859375, + "learning_rate": 4.4225926724755e-05, + "loss": 2.7738, + "num_input_tokens_seen": 2807562240, + "step": 5355 + }, + { + "epoch": 0.2599402282963852, + "grad_norm": 0.255859375, + "learning_rate": 4.421310647094417e-05, + "loss": 2.7872, + "num_input_tokens_seen": 2810183680, + "step": 5360 + }, + { + "epoch": 0.2601827098526318, + "grad_norm": 0.255859375, + "learning_rate": 4.420027386298568e-05, + "loss": 2.7801, + "num_input_tokens_seen": 2812805120, + "step": 5365 + }, + { + "epoch": 0.2604251914088785, + "grad_norm": 0.25390625, + "learning_rate": 4.418742890913097e-05, + "loss": 2.7878, + "num_input_tokens_seen": 2815426560, + "step": 5370 + }, + { + "epoch": 0.2606676729651251, + "grad_norm": 0.255859375, + "learning_rate": 4.417457161763945e-05, + "loss": 2.7867, + "num_input_tokens_seen": 2818048000, + "step": 5375 + }, + { + "epoch": 0.26091015452137173, + "grad_norm": 0.255859375, + "learning_rate": 4.4161701996778415e-05, + "loss": 2.7785, + "num_input_tokens_seen": 2820669440, + "step": 5380 + }, + { + "epoch": 0.26115263607761835, + "grad_norm": 0.25390625, + "learning_rate": 4.4148820054823125e-05, + "loss": 2.7838, + "num_input_tokens_seen": 2823290880, + "step": 5385 + }, + { + "epoch": 0.26139511763386497, + "grad_norm": 0.2578125, + "learning_rate": 4.4135925800056744e-05, + "loss": 2.79, + "num_input_tokens_seen": 2825912320, + "step": 5390 + }, + { + "epoch": 0.2616375991901116, + "grad_norm": 0.267578125, + "learning_rate": 4.412301924077036e-05, + "loss": 2.7882, + "num_input_tokens_seen": 2828533760, + "step": 5395 + }, + { + "epoch": 0.2618800807463582, + "grad_norm": 0.265625, + "learning_rate": 4.411010038526297e-05, + "loss": 2.7937, + "num_input_tokens_seen": 2831155200, + "step": 5400 + }, + { + "epoch": 0.2618800807463582, + "eval_accuracy": 0.45476958150138413, + "eval_loss": 2.750626564025879, + "eval_runtime": 5.809, + "eval_samples_per_second": 51.644, + "eval_steps_per_second": 6.542, + "num_input_tokens_seen": 2831155200, + "step": 5400 + }, + { + "epoch": 0.26212256230260483, + "grad_norm": 0.259765625, + "learning_rate": 4.409716924184148e-05, + "loss": 2.7784, + "num_input_tokens_seen": 2833776640, + "step": 5405 + }, + { + "epoch": 0.2623650438588515, + "grad_norm": 0.25390625, + "learning_rate": 4.4084225818820694e-05, + "loss": 2.7894, + "num_input_tokens_seen": 2836398080, + "step": 5410 + }, + { + "epoch": 0.26260752541509813, + "grad_norm": 0.251953125, + "learning_rate": 4.407127012452332e-05, + "loss": 2.778, + "num_input_tokens_seen": 2839019520, + "step": 5415 + }, + { + "epoch": 0.26285000697134475, + "grad_norm": 0.2578125, + "learning_rate": 4.4058302167279944e-05, + "loss": 2.7857, + "num_input_tokens_seen": 2841640960, + "step": 5420 + }, + { + "epoch": 0.2630924885275914, + "grad_norm": 0.26171875, + "learning_rate": 4.404532195542905e-05, + "loss": 2.7766, + "num_input_tokens_seen": 2844262400, + "step": 5425 + }, + { + "epoch": 0.263334970083838, + "grad_norm": 0.2470703125, + "learning_rate": 4.4032329497316985e-05, + "loss": 2.7816, + "num_input_tokens_seen": 2846883840, + "step": 5430 + }, + { + "epoch": 0.2635774516400846, + "grad_norm": 0.248046875, + "learning_rate": 4.401932480129799e-05, + "loss": 2.8017, + "num_input_tokens_seen": 2849505280, + "step": 5435 + }, + { + "epoch": 0.26381993319633124, + "grad_norm": 0.248046875, + "learning_rate": 4.400630787573416e-05, + "loss": 2.7787, + "num_input_tokens_seen": 2852126720, + "step": 5440 + }, + { + "epoch": 0.26406241475257786, + "grad_norm": 0.251953125, + "learning_rate": 4.399327872899547e-05, + "loss": 2.7934, + "num_input_tokens_seen": 2854748160, + "step": 5445 + }, + { + "epoch": 0.26430489630882453, + "grad_norm": 0.26171875, + "learning_rate": 4.398023736945973e-05, + "loss": 2.7813, + "num_input_tokens_seen": 2857369600, + "step": 5450 + }, + { + "epoch": 0.26454737786507115, + "grad_norm": 0.255859375, + "learning_rate": 4.396718380551263e-05, + "loss": 2.7795, + "num_input_tokens_seen": 2859991040, + "step": 5455 + }, + { + "epoch": 0.2647898594213178, + "grad_norm": 0.259765625, + "learning_rate": 4.3954118045547675e-05, + "loss": 2.7947, + "num_input_tokens_seen": 2862612480, + "step": 5460 + }, + { + "epoch": 0.2650323409775644, + "grad_norm": 0.259765625, + "learning_rate": 4.394104009796623e-05, + "loss": 2.7809, + "num_input_tokens_seen": 2865233920, + "step": 5465 + }, + { + "epoch": 0.265274822533811, + "grad_norm": 0.251953125, + "learning_rate": 4.392794997117753e-05, + "loss": 2.7705, + "num_input_tokens_seen": 2867855360, + "step": 5470 + }, + { + "epoch": 0.26551730409005764, + "grad_norm": 0.259765625, + "learning_rate": 4.391484767359858e-05, + "loss": 2.788, + "num_input_tokens_seen": 2870476800, + "step": 5475 + }, + { + "epoch": 0.26575978564630426, + "grad_norm": 0.25, + "learning_rate": 4.390173321365423e-05, + "loss": 2.7913, + "num_input_tokens_seen": 2873098240, + "step": 5480 + }, + { + "epoch": 0.2660022672025509, + "grad_norm": 0.26171875, + "learning_rate": 4.388860659977719e-05, + "loss": 2.7851, + "num_input_tokens_seen": 2875719680, + "step": 5485 + }, + { + "epoch": 0.26624474875879756, + "grad_norm": 0.259765625, + "learning_rate": 4.387546784040794e-05, + "loss": 2.7839, + "num_input_tokens_seen": 2878341120, + "step": 5490 + }, + { + "epoch": 0.2664872303150442, + "grad_norm": 0.2470703125, + "learning_rate": 4.3862316943994766e-05, + "loss": 2.7604, + "num_input_tokens_seen": 2880962560, + "step": 5495 + }, + { + "epoch": 0.2667297118712908, + "grad_norm": 0.2578125, + "learning_rate": 4.3849153918993815e-05, + "loss": 2.7719, + "num_input_tokens_seen": 2883584000, + "step": 5500 + }, + { + "epoch": 0.2669721934275374, + "grad_norm": 0.25390625, + "learning_rate": 4.383597877386896e-05, + "loss": 2.7873, + "num_input_tokens_seen": 2886205440, + "step": 5505 + }, + { + "epoch": 0.26721467498378404, + "grad_norm": 0.25, + "learning_rate": 4.382279151709192e-05, + "loss": 2.7806, + "num_input_tokens_seen": 2888826880, + "step": 5510 + }, + { + "epoch": 0.26745715654003066, + "grad_norm": 0.263671875, + "learning_rate": 4.380959215714218e-05, + "loss": 2.7814, + "num_input_tokens_seen": 2891448320, + "step": 5515 + }, + { + "epoch": 0.2676996380962773, + "grad_norm": 0.26953125, + "learning_rate": 4.3796380702507014e-05, + "loss": 2.7763, + "num_input_tokens_seen": 2894069760, + "step": 5520 + }, + { + "epoch": 0.2679421196525239, + "grad_norm": 0.267578125, + "learning_rate": 4.3783157161681466e-05, + "loss": 2.7891, + "num_input_tokens_seen": 2896691200, + "step": 5525 + }, + { + "epoch": 0.2681846012087706, + "grad_norm": 0.26171875, + "learning_rate": 4.376992154316835e-05, + "loss": 2.7868, + "num_input_tokens_seen": 2899312640, + "step": 5530 + }, + { + "epoch": 0.2684270827650172, + "grad_norm": 0.26171875, + "learning_rate": 4.375667385547826e-05, + "loss": 2.791, + "num_input_tokens_seen": 2901934080, + "step": 5535 + }, + { + "epoch": 0.2686695643212638, + "grad_norm": 0.24609375, + "learning_rate": 4.3743414107129546e-05, + "loss": 2.7801, + "num_input_tokens_seen": 2904555520, + "step": 5540 + }, + { + "epoch": 0.26891204587751044, + "grad_norm": 0.251953125, + "learning_rate": 4.3730142306648294e-05, + "loss": 2.7961, + "num_input_tokens_seen": 2907176960, + "step": 5545 + }, + { + "epoch": 0.26915452743375706, + "grad_norm": 0.2578125, + "learning_rate": 4.3716858462568365e-05, + "loss": 2.7786, + "num_input_tokens_seen": 2909798400, + "step": 5550 + }, + { + "epoch": 0.2693970089900037, + "grad_norm": 0.255859375, + "learning_rate": 4.370356258343135e-05, + "loss": 2.7981, + "num_input_tokens_seen": 2912419840, + "step": 5555 + }, + { + "epoch": 0.2696394905462503, + "grad_norm": 0.255859375, + "learning_rate": 4.369025467778659e-05, + "loss": 2.7813, + "num_input_tokens_seen": 2915041280, + "step": 5560 + }, + { + "epoch": 0.269881972102497, + "grad_norm": 0.251953125, + "learning_rate": 4.3676934754191145e-05, + "loss": 2.7738, + "num_input_tokens_seen": 2917662720, + "step": 5565 + }, + { + "epoch": 0.2701244536587436, + "grad_norm": 0.251953125, + "learning_rate": 4.3663602821209805e-05, + "loss": 2.7794, + "num_input_tokens_seen": 2920284160, + "step": 5570 + }, + { + "epoch": 0.2703669352149902, + "grad_norm": 0.259765625, + "learning_rate": 4.36502588874151e-05, + "loss": 2.7904, + "num_input_tokens_seen": 2922905600, + "step": 5575 + }, + { + "epoch": 0.27060941677123684, + "grad_norm": 0.2578125, + "learning_rate": 4.363690296138725e-05, + "loss": 2.7908, + "num_input_tokens_seen": 2925527040, + "step": 5580 + }, + { + "epoch": 0.27085189832748346, + "grad_norm": 0.26953125, + "learning_rate": 4.36235350517142e-05, + "loss": 2.7871, + "num_input_tokens_seen": 2928148480, + "step": 5585 + }, + { + "epoch": 0.2710943798837301, + "grad_norm": 0.26171875, + "learning_rate": 4.3610155166991605e-05, + "loss": 2.778, + "num_input_tokens_seen": 2930769920, + "step": 5590 + }, + { + "epoch": 0.2713368614399767, + "grad_norm": 0.2578125, + "learning_rate": 4.359676331582282e-05, + "loss": 2.7791, + "num_input_tokens_seen": 2933391360, + "step": 5595 + }, + { + "epoch": 0.2715793429962233, + "grad_norm": 0.2470703125, + "learning_rate": 4.358335950681888e-05, + "loss": 2.7795, + "num_input_tokens_seen": 2936012800, + "step": 5600 + }, + { + "epoch": 0.27182182455247, + "grad_norm": 0.259765625, + "learning_rate": 4.356994374859852e-05, + "loss": 2.7851, + "num_input_tokens_seen": 2938634240, + "step": 5605 + }, + { + "epoch": 0.2720643061087166, + "grad_norm": 0.255859375, + "learning_rate": 4.355651604978815e-05, + "loss": 2.7762, + "num_input_tokens_seen": 2941255680, + "step": 5610 + }, + { + "epoch": 0.27230678766496325, + "grad_norm": 0.25390625, + "learning_rate": 4.3543076419021874e-05, + "loss": 2.776, + "num_input_tokens_seen": 2943877120, + "step": 5615 + }, + { + "epoch": 0.27254926922120987, + "grad_norm": 0.25390625, + "learning_rate": 4.3529624864941456e-05, + "loss": 2.7986, + "num_input_tokens_seen": 2946498560, + "step": 5620 + }, + { + "epoch": 0.2727917507774565, + "grad_norm": 0.2490234375, + "learning_rate": 4.351616139619632e-05, + "loss": 2.7924, + "num_input_tokens_seen": 2949120000, + "step": 5625 + }, + { + "epoch": 0.2730342323337031, + "grad_norm": 0.25390625, + "learning_rate": 4.350268602144358e-05, + "loss": 2.7873, + "num_input_tokens_seen": 2951741440, + "step": 5630 + }, + { + "epoch": 0.27327671388994973, + "grad_norm": 0.26171875, + "learning_rate": 4.3489198749347976e-05, + "loss": 2.7869, + "num_input_tokens_seen": 2954362880, + "step": 5635 + }, + { + "epoch": 0.27351919544619635, + "grad_norm": 0.255859375, + "learning_rate": 4.347569958858191e-05, + "loss": 2.786, + "num_input_tokens_seen": 2956984320, + "step": 5640 + }, + { + "epoch": 0.273761677002443, + "grad_norm": 0.26171875, + "learning_rate": 4.3462188547825415e-05, + "loss": 2.779, + "num_input_tokens_seen": 2959605760, + "step": 5645 + }, + { + "epoch": 0.27400415855868965, + "grad_norm": 0.25, + "learning_rate": 4.3448665635766187e-05, + "loss": 2.7765, + "num_input_tokens_seen": 2962227200, + "step": 5650 + }, + { + "epoch": 0.27424664011493627, + "grad_norm": 0.251953125, + "learning_rate": 4.343513086109955e-05, + "loss": 2.7761, + "num_input_tokens_seen": 2964848640, + "step": 5655 + }, + { + "epoch": 0.2744891216711829, + "grad_norm": 0.267578125, + "learning_rate": 4.342158423252843e-05, + "loss": 2.79, + "num_input_tokens_seen": 2967470080, + "step": 5660 + }, + { + "epoch": 0.2747316032274295, + "grad_norm": 0.2578125, + "learning_rate": 4.3408025758763403e-05, + "loss": 2.7829, + "num_input_tokens_seen": 2970091520, + "step": 5665 + }, + { + "epoch": 0.27497408478367613, + "grad_norm": 0.259765625, + "learning_rate": 4.339445544852265e-05, + "loss": 2.7945, + "num_input_tokens_seen": 2972712960, + "step": 5670 + }, + { + "epoch": 0.27521656633992275, + "grad_norm": 0.25, + "learning_rate": 4.338087331053198e-05, + "loss": 2.7835, + "num_input_tokens_seen": 2975334400, + "step": 5675 + }, + { + "epoch": 0.2754590478961694, + "grad_norm": 0.240234375, + "learning_rate": 4.336727935352477e-05, + "loss": 2.7977, + "num_input_tokens_seen": 2977955840, + "step": 5680 + }, + { + "epoch": 0.27570152945241605, + "grad_norm": 0.251953125, + "learning_rate": 4.335367358624204e-05, + "loss": 2.7855, + "num_input_tokens_seen": 2980577280, + "step": 5685 + }, + { + "epoch": 0.27594401100866267, + "grad_norm": 0.259765625, + "learning_rate": 4.334005601743236e-05, + "loss": 2.7781, + "num_input_tokens_seen": 2983198720, + "step": 5690 + }, + { + "epoch": 0.2761864925649093, + "grad_norm": 0.259765625, + "learning_rate": 4.3326426655851936e-05, + "loss": 2.7889, + "num_input_tokens_seen": 2985820160, + "step": 5695 + }, + { + "epoch": 0.2764289741211559, + "grad_norm": 0.24609375, + "learning_rate": 4.331278551026453e-05, + "loss": 2.7717, + "num_input_tokens_seen": 2988441600, + "step": 5700 + }, + { + "epoch": 0.2764289741211559, + "eval_accuracy": 0.4547614395049666, + "eval_loss": 2.749791145324707, + "eval_runtime": 5.8606, + "eval_samples_per_second": 51.189, + "eval_steps_per_second": 6.484, + "num_input_tokens_seen": 2988441600, + "step": 5700 + }, + { + "epoch": 0.27667145567740253, + "grad_norm": 0.267578125, + "learning_rate": 4.329913258944146e-05, + "loss": 2.7882, + "num_input_tokens_seen": 2991063040, + "step": 5705 + }, + { + "epoch": 0.27691393723364915, + "grad_norm": 0.251953125, + "learning_rate": 4.328546790216167e-05, + "loss": 2.7819, + "num_input_tokens_seen": 2993684480, + "step": 5710 + }, + { + "epoch": 0.2771564187898958, + "grad_norm": 0.2578125, + "learning_rate": 4.327179145721161e-05, + "loss": 2.7812, + "num_input_tokens_seen": 2996305920, + "step": 5715 + }, + { + "epoch": 0.2773989003461424, + "grad_norm": 0.255859375, + "learning_rate": 4.325810326338535e-05, + "loss": 2.8015, + "num_input_tokens_seen": 2998927360, + "step": 5720 + }, + { + "epoch": 0.2776413819023891, + "grad_norm": 0.255859375, + "learning_rate": 4.3244403329484456e-05, + "loss": 2.79, + "num_input_tokens_seen": 3001548800, + "step": 5725 + }, + { + "epoch": 0.2778838634586357, + "grad_norm": 0.263671875, + "learning_rate": 4.323069166431809e-05, + "loss": 2.7898, + "num_input_tokens_seen": 3004170240, + "step": 5730 + }, + { + "epoch": 0.2781263450148823, + "grad_norm": 0.255859375, + "learning_rate": 4.321696827670293e-05, + "loss": 2.7953, + "num_input_tokens_seen": 3006791680, + "step": 5735 + }, + { + "epoch": 0.27836882657112894, + "grad_norm": 0.248046875, + "learning_rate": 4.320323317546321e-05, + "loss": 2.7848, + "num_input_tokens_seen": 3009413120, + "step": 5740 + }, + { + "epoch": 0.27861130812737556, + "grad_norm": 0.259765625, + "learning_rate": 4.3189486369430674e-05, + "loss": 2.7783, + "num_input_tokens_seen": 3012034560, + "step": 5745 + }, + { + "epoch": 0.2788537896836222, + "grad_norm": 0.255859375, + "learning_rate": 4.317572786744461e-05, + "loss": 2.7932, + "num_input_tokens_seen": 3014656000, + "step": 5750 + }, + { + "epoch": 0.2790962712398688, + "grad_norm": 0.259765625, + "learning_rate": 4.3161957678351825e-05, + "loss": 2.7695, + "num_input_tokens_seen": 3017277440, + "step": 5755 + }, + { + "epoch": 0.2793387527961154, + "grad_norm": 0.26171875, + "learning_rate": 4.314817581100662e-05, + "loss": 2.7823, + "num_input_tokens_seen": 3019898880, + "step": 5760 + }, + { + "epoch": 0.2795812343523621, + "grad_norm": 0.251953125, + "learning_rate": 4.313438227427084e-05, + "loss": 2.7727, + "num_input_tokens_seen": 3022520320, + "step": 5765 + }, + { + "epoch": 0.2798237159086087, + "grad_norm": 0.248046875, + "learning_rate": 4.312057707701382e-05, + "loss": 2.7871, + "num_input_tokens_seen": 3025141760, + "step": 5770 + }, + { + "epoch": 0.28006619746485534, + "grad_norm": 0.25390625, + "learning_rate": 4.3106760228112365e-05, + "loss": 2.7925, + "num_input_tokens_seen": 3027763200, + "step": 5775 + }, + { + "epoch": 0.28030867902110196, + "grad_norm": 0.2578125, + "learning_rate": 4.309293173645082e-05, + "loss": 2.787, + "num_input_tokens_seen": 3030384640, + "step": 5780 + }, + { + "epoch": 0.2805511605773486, + "grad_norm": 0.259765625, + "learning_rate": 4.307909161092096e-05, + "loss": 2.7925, + "num_input_tokens_seen": 3033006080, + "step": 5785 + }, + { + "epoch": 0.2807936421335952, + "grad_norm": 0.263671875, + "learning_rate": 4.30652398604221e-05, + "loss": 2.7864, + "num_input_tokens_seen": 3035627520, + "step": 5790 + }, + { + "epoch": 0.2810361236898418, + "grad_norm": 0.2578125, + "learning_rate": 4.3051376493860995e-05, + "loss": 2.7921, + "num_input_tokens_seen": 3038248960, + "step": 5795 + }, + { + "epoch": 0.28127860524608844, + "grad_norm": 0.255859375, + "learning_rate": 4.303750152015188e-05, + "loss": 2.7858, + "num_input_tokens_seen": 3040870400, + "step": 5800 + }, + { + "epoch": 0.2815210868023351, + "grad_norm": 0.265625, + "learning_rate": 4.302361494821644e-05, + "loss": 2.7797, + "num_input_tokens_seen": 3043491840, + "step": 5805 + }, + { + "epoch": 0.28176356835858174, + "grad_norm": 0.25, + "learning_rate": 4.3009716786983834e-05, + "loss": 2.775, + "num_input_tokens_seen": 3046113280, + "step": 5810 + }, + { + "epoch": 0.28200604991482836, + "grad_norm": 0.2490234375, + "learning_rate": 4.299580704539067e-05, + "loss": 2.7769, + "num_input_tokens_seen": 3048734720, + "step": 5815 + }, + { + "epoch": 0.282248531471075, + "grad_norm": 0.26171875, + "learning_rate": 4.298188573238098e-05, + "loss": 2.781, + "num_input_tokens_seen": 3051356160, + "step": 5820 + }, + { + "epoch": 0.2824910130273216, + "grad_norm": 0.265625, + "learning_rate": 4.2967952856906276e-05, + "loss": 2.7854, + "num_input_tokens_seen": 3053977600, + "step": 5825 + }, + { + "epoch": 0.2827334945835682, + "grad_norm": 0.2470703125, + "learning_rate": 4.295400842792549e-05, + "loss": 2.7853, + "num_input_tokens_seen": 3056599040, + "step": 5830 + }, + { + "epoch": 0.28297597613981484, + "grad_norm": 0.255859375, + "learning_rate": 4.2940052454404954e-05, + "loss": 2.7991, + "num_input_tokens_seen": 3059220480, + "step": 5835 + }, + { + "epoch": 0.2832184576960615, + "grad_norm": 0.255859375, + "learning_rate": 4.2926084945318454e-05, + "loss": 2.7957, + "num_input_tokens_seen": 3061841920, + "step": 5840 + }, + { + "epoch": 0.28346093925230814, + "grad_norm": 0.255859375, + "learning_rate": 4.2912105909647194e-05, + "loss": 2.7762, + "num_input_tokens_seen": 3064463360, + "step": 5845 + }, + { + "epoch": 0.28370342080855476, + "grad_norm": 0.26171875, + "learning_rate": 4.289811535637978e-05, + "loss": 2.7674, + "num_input_tokens_seen": 3067084800, + "step": 5850 + }, + { + "epoch": 0.2839459023648014, + "grad_norm": 0.244140625, + "learning_rate": 4.288411329451222e-05, + "loss": 2.7743, + "num_input_tokens_seen": 3069706240, + "step": 5855 + }, + { + "epoch": 0.284188383921048, + "grad_norm": 0.251953125, + "learning_rate": 4.287009973304792e-05, + "loss": 2.7811, + "num_input_tokens_seen": 3072327680, + "step": 5860 + }, + { + "epoch": 0.2844308654772946, + "grad_norm": 0.255859375, + "learning_rate": 4.2856074680997705e-05, + "loss": 2.7976, + "num_input_tokens_seen": 3074949120, + "step": 5865 + }, + { + "epoch": 0.28467334703354125, + "grad_norm": 0.25390625, + "learning_rate": 4.284203814737976e-05, + "loss": 2.7823, + "num_input_tokens_seen": 3077570560, + "step": 5870 + }, + { + "epoch": 0.28491582858978787, + "grad_norm": 0.251953125, + "learning_rate": 4.282799014121967e-05, + "loss": 2.7877, + "num_input_tokens_seen": 3080192000, + "step": 5875 + }, + { + "epoch": 0.28515831014603454, + "grad_norm": 0.25390625, + "learning_rate": 4.281393067155038e-05, + "loss": 2.7856, + "num_input_tokens_seen": 3082813440, + "step": 5880 + }, + { + "epoch": 0.28540079170228116, + "grad_norm": 0.251953125, + "learning_rate": 4.279985974741223e-05, + "loss": 2.7772, + "num_input_tokens_seen": 3085434880, + "step": 5885 + }, + { + "epoch": 0.2856432732585278, + "grad_norm": 0.25390625, + "learning_rate": 4.2785777377852904e-05, + "loss": 2.7833, + "num_input_tokens_seen": 3088056320, + "step": 5890 + }, + { + "epoch": 0.2858857548147744, + "grad_norm": 0.265625, + "learning_rate": 4.277168357192746e-05, + "loss": 2.7821, + "num_input_tokens_seen": 3090677760, + "step": 5895 + }, + { + "epoch": 0.286128236371021, + "grad_norm": 0.26171875, + "learning_rate": 4.27575783386983e-05, + "loss": 2.7749, + "num_input_tokens_seen": 3093299200, + "step": 5900 + }, + { + "epoch": 0.28637071792726765, + "grad_norm": 0.2578125, + "learning_rate": 4.2743461687235176e-05, + "loss": 2.7761, + "num_input_tokens_seen": 3095920640, + "step": 5905 + }, + { + "epoch": 0.28661319948351427, + "grad_norm": 0.275390625, + "learning_rate": 4.272933362661518e-05, + "loss": 2.7836, + "num_input_tokens_seen": 3098542080, + "step": 5910 + }, + { + "epoch": 0.2868556810397609, + "grad_norm": 0.265625, + "learning_rate": 4.2715194165922754e-05, + "loss": 2.795, + "num_input_tokens_seen": 3101163520, + "step": 5915 + }, + { + "epoch": 0.28709816259600757, + "grad_norm": 0.25390625, + "learning_rate": 4.2701043314249644e-05, + "loss": 2.7857, + "num_input_tokens_seen": 3103784960, + "step": 5920 + }, + { + "epoch": 0.2873406441522542, + "grad_norm": 0.25390625, + "learning_rate": 4.268688108069496e-05, + "loss": 2.7809, + "num_input_tokens_seen": 3106406400, + "step": 5925 + }, + { + "epoch": 0.2875831257085008, + "grad_norm": 0.25390625, + "learning_rate": 4.267270747436508e-05, + "loss": 2.7832, + "num_input_tokens_seen": 3109027840, + "step": 5930 + }, + { + "epoch": 0.28782560726474743, + "grad_norm": 0.26171875, + "learning_rate": 4.2658522504373736e-05, + "loss": 2.7991, + "num_input_tokens_seen": 3111649280, + "step": 5935 + }, + { + "epoch": 0.28806808882099405, + "grad_norm": 0.251953125, + "learning_rate": 4.264432617984195e-05, + "loss": 2.7908, + "num_input_tokens_seen": 3114270720, + "step": 5940 + }, + { + "epoch": 0.28831057037724067, + "grad_norm": 0.25, + "learning_rate": 4.263011850989805e-05, + "loss": 2.7745, + "num_input_tokens_seen": 3116892160, + "step": 5945 + }, + { + "epoch": 0.2885530519334873, + "grad_norm": 0.25, + "learning_rate": 4.2615899503677656e-05, + "loss": 2.7903, + "num_input_tokens_seen": 3119513600, + "step": 5950 + }, + { + "epoch": 0.2887955334897339, + "grad_norm": 0.25, + "learning_rate": 4.260166917032368e-05, + "loss": 2.7734, + "num_input_tokens_seen": 3122135040, + "step": 5955 + }, + { + "epoch": 0.2890380150459806, + "grad_norm": 0.265625, + "learning_rate": 4.25874275189863e-05, + "loss": 2.7876, + "num_input_tokens_seen": 3124756480, + "step": 5960 + }, + { + "epoch": 0.2892804966022272, + "grad_norm": 0.267578125, + "learning_rate": 4.2573174558823004e-05, + "loss": 2.7744, + "num_input_tokens_seen": 3127377920, + "step": 5965 + }, + { + "epoch": 0.28952297815847383, + "grad_norm": 0.265625, + "learning_rate": 4.2558910298998535e-05, + "loss": 2.7804, + "num_input_tokens_seen": 3129999360, + "step": 5970 + }, + { + "epoch": 0.28976545971472045, + "grad_norm": 0.25, + "learning_rate": 4.2544634748684886e-05, + "loss": 2.7791, + "num_input_tokens_seen": 3132620800, + "step": 5975 + }, + { + "epoch": 0.2900079412709671, + "grad_norm": 0.26171875, + "learning_rate": 4.253034791706134e-05, + "loss": 2.7848, + "num_input_tokens_seen": 3135242240, + "step": 5980 + }, + { + "epoch": 0.2902504228272137, + "grad_norm": 0.2578125, + "learning_rate": 4.251604981331441e-05, + "loss": 2.7896, + "num_input_tokens_seen": 3137863680, + "step": 5985 + }, + { + "epoch": 0.2904929043834603, + "grad_norm": 0.2470703125, + "learning_rate": 4.2501740446637874e-05, + "loss": 2.7807, + "num_input_tokens_seen": 3140485120, + "step": 5990 + }, + { + "epoch": 0.29073538593970694, + "grad_norm": 0.25390625, + "learning_rate": 4.248741982623274e-05, + "loss": 2.7796, + "num_input_tokens_seen": 3143106560, + "step": 5995 + }, + { + "epoch": 0.2909778674959536, + "grad_norm": 0.25390625, + "learning_rate": 4.2473087961307256e-05, + "loss": 2.7832, + "num_input_tokens_seen": 3145728000, + "step": 6000 + }, + { + "epoch": 0.2909778674959536, + "eval_accuracy": 0.4548298322748738, + "eval_loss": 2.7490339279174805, + "eval_runtime": 5.8334, + "eval_samples_per_second": 51.428, + "eval_steps_per_second": 6.514, + "num_input_tokens_seen": 3145728000, + "step": 6000 + }, + { + "epoch": 0.29122034905220023, + "grad_norm": 0.2470703125, + "learning_rate": 4.24587448610769e-05, + "loss": 2.7829, + "num_input_tokens_seen": 3148349440, + "step": 6005 + }, + { + "epoch": 0.29146283060844685, + "grad_norm": 0.259765625, + "learning_rate": 4.244439053476438e-05, + "loss": 2.7767, + "num_input_tokens_seen": 3150970880, + "step": 6010 + }, + { + "epoch": 0.2917053121646935, + "grad_norm": 0.259765625, + "learning_rate": 4.24300249915996e-05, + "loss": 2.8001, + "num_input_tokens_seen": 3153592320, + "step": 6015 + }, + { + "epoch": 0.2919477937209401, + "grad_norm": 0.26171875, + "learning_rate": 4.2415648240819726e-05, + "loss": 2.7843, + "num_input_tokens_seen": 3156213760, + "step": 6020 + }, + { + "epoch": 0.2921902752771867, + "grad_norm": 0.251953125, + "learning_rate": 4.2401260291669074e-05, + "loss": 2.7845, + "num_input_tokens_seen": 3158835200, + "step": 6025 + }, + { + "epoch": 0.29243275683343334, + "grad_norm": 0.255859375, + "learning_rate": 4.238686115339919e-05, + "loss": 2.7737, + "num_input_tokens_seen": 3161456640, + "step": 6030 + }, + { + "epoch": 0.29267523838967996, + "grad_norm": 0.2578125, + "learning_rate": 4.2372450835268816e-05, + "loss": 2.7867, + "num_input_tokens_seen": 3164078080, + "step": 6035 + }, + { + "epoch": 0.29291771994592664, + "grad_norm": 0.265625, + "learning_rate": 4.235802934654388e-05, + "loss": 2.796, + "num_input_tokens_seen": 3166699520, + "step": 6040 + }, + { + "epoch": 0.29316020150217326, + "grad_norm": 0.25390625, + "learning_rate": 4.234359669649747e-05, + "loss": 2.781, + "num_input_tokens_seen": 3169320960, + "step": 6045 + }, + { + "epoch": 0.2934026830584199, + "grad_norm": 0.25, + "learning_rate": 4.2329152894409895e-05, + "loss": 2.7849, + "num_input_tokens_seen": 3171942400, + "step": 6050 + }, + { + "epoch": 0.2936451646146665, + "grad_norm": 0.251953125, + "learning_rate": 4.231469794956859e-05, + "loss": 2.7789, + "num_input_tokens_seen": 3174563840, + "step": 6055 + }, + { + "epoch": 0.2938876461709131, + "grad_norm": 0.263671875, + "learning_rate": 4.230023187126818e-05, + "loss": 2.7813, + "num_input_tokens_seen": 3177185280, + "step": 6060 + }, + { + "epoch": 0.29413012772715974, + "grad_norm": 0.2578125, + "learning_rate": 4.2285754668810454e-05, + "loss": 2.7695, + "num_input_tokens_seen": 3179806720, + "step": 6065 + }, + { + "epoch": 0.29437260928340636, + "grad_norm": 0.25390625, + "learning_rate": 4.227126635150434e-05, + "loss": 2.7834, + "num_input_tokens_seen": 3182428160, + "step": 6070 + }, + { + "epoch": 0.294615090839653, + "grad_norm": 0.2578125, + "learning_rate": 4.2256766928665895e-05, + "loss": 2.7758, + "num_input_tokens_seen": 3185049600, + "step": 6075 + }, + { + "epoch": 0.29485757239589966, + "grad_norm": 0.265625, + "learning_rate": 4.224225640961837e-05, + "loss": 2.7878, + "num_input_tokens_seen": 3187671040, + "step": 6080 + }, + { + "epoch": 0.2951000539521463, + "grad_norm": 0.251953125, + "learning_rate": 4.222773480369211e-05, + "loss": 2.7845, + "num_input_tokens_seen": 3190292480, + "step": 6085 + }, + { + "epoch": 0.2953425355083929, + "grad_norm": 0.24609375, + "learning_rate": 4.221320212022458e-05, + "loss": 2.7655, + "num_input_tokens_seen": 3192913920, + "step": 6090 + }, + { + "epoch": 0.2955850170646395, + "grad_norm": 0.255859375, + "learning_rate": 4.219865836856042e-05, + "loss": 2.7847, + "num_input_tokens_seen": 3195535360, + "step": 6095 + }, + { + "epoch": 0.29582749862088614, + "grad_norm": 0.259765625, + "learning_rate": 4.218410355805132e-05, + "loss": 2.7824, + "num_input_tokens_seen": 3198156800, + "step": 6100 + }, + { + "epoch": 0.29606998017713276, + "grad_norm": 0.263671875, + "learning_rate": 4.216953769805613e-05, + "loss": 2.757, + "num_input_tokens_seen": 3200778240, + "step": 6105 + }, + { + "epoch": 0.2963124617333794, + "grad_norm": 0.255859375, + "learning_rate": 4.21549607979408e-05, + "loss": 2.7787, + "num_input_tokens_seen": 3203399680, + "step": 6110 + }, + { + "epoch": 0.296554943289626, + "grad_norm": 0.25390625, + "learning_rate": 4.2140372867078345e-05, + "loss": 2.798, + "num_input_tokens_seen": 3206021120, + "step": 6115 + }, + { + "epoch": 0.2967974248458727, + "grad_norm": 0.24609375, + "learning_rate": 4.212577391484891e-05, + "loss": 2.7794, + "num_input_tokens_seen": 3208642560, + "step": 6120 + }, + { + "epoch": 0.2970399064021193, + "grad_norm": 0.25390625, + "learning_rate": 4.21111639506397e-05, + "loss": 2.7806, + "num_input_tokens_seen": 3211264000, + "step": 6125 + }, + { + "epoch": 0.2972823879583659, + "grad_norm": 0.259765625, + "learning_rate": 4.209654298384503e-05, + "loss": 2.777, + "num_input_tokens_seen": 3213885440, + "step": 6130 + }, + { + "epoch": 0.29752486951461254, + "grad_norm": 0.26171875, + "learning_rate": 4.208191102386627e-05, + "loss": 2.7611, + "num_input_tokens_seen": 3216506880, + "step": 6135 + }, + { + "epoch": 0.29776735107085917, + "grad_norm": 0.26953125, + "learning_rate": 4.2067268080111856e-05, + "loss": 2.7974, + "num_input_tokens_seen": 3219128320, + "step": 6140 + }, + { + "epoch": 0.2980098326271058, + "grad_norm": 0.255859375, + "learning_rate": 4.205261416199729e-05, + "loss": 2.7778, + "num_input_tokens_seen": 3221749760, + "step": 6145 + }, + { + "epoch": 0.2982523141833524, + "grad_norm": 0.25, + "learning_rate": 4.203794927894514e-05, + "loss": 2.7706, + "num_input_tokens_seen": 3224371200, + "step": 6150 + }, + { + "epoch": 0.2984947957395991, + "grad_norm": 0.2578125, + "learning_rate": 4.2023273440385014e-05, + "loss": 2.7762, + "num_input_tokens_seen": 3226992640, + "step": 6155 + }, + { + "epoch": 0.2987372772958457, + "grad_norm": 0.255859375, + "learning_rate": 4.2008586655753566e-05, + "loss": 2.777, + "num_input_tokens_seen": 3229614080, + "step": 6160 + }, + { + "epoch": 0.2989797588520923, + "grad_norm": 0.2578125, + "learning_rate": 4.199388893449449e-05, + "loss": 2.786, + "num_input_tokens_seen": 3232235520, + "step": 6165 + }, + { + "epoch": 0.29922224040833895, + "grad_norm": 0.26953125, + "learning_rate": 4.1979180286058515e-05, + "loss": 2.7808, + "num_input_tokens_seen": 3234856960, + "step": 6170 + }, + { + "epoch": 0.29946472196458557, + "grad_norm": 0.25390625, + "learning_rate": 4.196446071990341e-05, + "loss": 2.7693, + "num_input_tokens_seen": 3237478400, + "step": 6175 + }, + { + "epoch": 0.2997072035208322, + "grad_norm": 0.26171875, + "learning_rate": 4.194973024549392e-05, + "loss": 2.7775, + "num_input_tokens_seen": 3240099840, + "step": 6180 + }, + { + "epoch": 0.2999496850770788, + "grad_norm": 0.255859375, + "learning_rate": 4.193498887230184e-05, + "loss": 2.7776, + "num_input_tokens_seen": 3242721280, + "step": 6185 + }, + { + "epoch": 0.30019216663332543, + "grad_norm": 0.259765625, + "learning_rate": 4.1920236609805986e-05, + "loss": 2.7848, + "num_input_tokens_seen": 3245342720, + "step": 6190 + }, + { + "epoch": 0.3004346481895721, + "grad_norm": 0.263671875, + "learning_rate": 4.190547346749213e-05, + "loss": 2.7975, + "num_input_tokens_seen": 3247964160, + "step": 6195 + }, + { + "epoch": 0.3006771297458187, + "grad_norm": 0.265625, + "learning_rate": 4.1890699454853067e-05, + "loss": 2.7854, + "num_input_tokens_seen": 3250585600, + "step": 6200 + }, + { + "epoch": 0.30091961130206535, + "grad_norm": 0.251953125, + "learning_rate": 4.18759145813886e-05, + "loss": 2.7638, + "num_input_tokens_seen": 3253207040, + "step": 6205 + }, + { + "epoch": 0.30116209285831197, + "grad_norm": 0.2734375, + "learning_rate": 4.186111885660547e-05, + "loss": 2.7825, + "num_input_tokens_seen": 3255828480, + "step": 6210 + }, + { + "epoch": 0.3014045744145586, + "grad_norm": 0.26171875, + "learning_rate": 4.184631229001744e-05, + "loss": 2.7761, + "num_input_tokens_seen": 3258449920, + "step": 6215 + }, + { + "epoch": 0.3016470559708052, + "grad_norm": 0.251953125, + "learning_rate": 4.1831494891145215e-05, + "loss": 2.7822, + "num_input_tokens_seen": 3261071360, + "step": 6220 + }, + { + "epoch": 0.30188953752705183, + "grad_norm": 0.251953125, + "learning_rate": 4.1816666669516474e-05, + "loss": 2.7881, + "num_input_tokens_seen": 3263692800, + "step": 6225 + }, + { + "epoch": 0.30213201908329845, + "grad_norm": 0.25, + "learning_rate": 4.180182763466586e-05, + "loss": 2.7894, + "num_input_tokens_seen": 3266314240, + "step": 6230 + }, + { + "epoch": 0.30237450063954513, + "grad_norm": 0.24609375, + "learning_rate": 4.178697779613497e-05, + "loss": 2.776, + "num_input_tokens_seen": 3268935680, + "step": 6235 + }, + { + "epoch": 0.30261698219579175, + "grad_norm": 0.255859375, + "learning_rate": 4.177211716347234e-05, + "loss": 2.7779, + "num_input_tokens_seen": 3271557120, + "step": 6240 + }, + { + "epoch": 0.30285946375203837, + "grad_norm": 0.25, + "learning_rate": 4.1757245746233435e-05, + "loss": 2.7768, + "num_input_tokens_seen": 3274178560, + "step": 6245 + }, + { + "epoch": 0.303101945308285, + "grad_norm": 0.2490234375, + "learning_rate": 4.174236355398069e-05, + "loss": 2.7701, + "num_input_tokens_seen": 3276800000, + "step": 6250 + }, + { + "epoch": 0.3033444268645316, + "grad_norm": 0.251953125, + "learning_rate": 4.172747059628345e-05, + "loss": 2.7897, + "num_input_tokens_seen": 3279421440, + "step": 6255 + }, + { + "epoch": 0.30358690842077823, + "grad_norm": 0.251953125, + "learning_rate": 4.171256688271795e-05, + "loss": 2.7801, + "num_input_tokens_seen": 3282042880, + "step": 6260 + }, + { + "epoch": 0.30382938997702486, + "grad_norm": 0.2578125, + "learning_rate": 4.1697652422867403e-05, + "loss": 2.7879, + "num_input_tokens_seen": 3284664320, + "step": 6265 + }, + { + "epoch": 0.3040718715332715, + "grad_norm": 0.259765625, + "learning_rate": 4.1682727226321885e-05, + "loss": 2.7765, + "num_input_tokens_seen": 3287285760, + "step": 6270 + }, + { + "epoch": 0.30431435308951815, + "grad_norm": 0.2490234375, + "learning_rate": 4.166779130267839e-05, + "loss": 2.7829, + "num_input_tokens_seen": 3289907200, + "step": 6275 + }, + { + "epoch": 0.3045568346457648, + "grad_norm": 0.2490234375, + "learning_rate": 4.1652844661540825e-05, + "loss": 2.7875, + "num_input_tokens_seen": 3292528640, + "step": 6280 + }, + { + "epoch": 0.3047993162020114, + "grad_norm": 0.263671875, + "learning_rate": 4.163788731251995e-05, + "loss": 2.7917, + "num_input_tokens_seen": 3295150080, + "step": 6285 + }, + { + "epoch": 0.305041797758258, + "grad_norm": 0.251953125, + "learning_rate": 4.1622919265233456e-05, + "loss": 2.7794, + "num_input_tokens_seen": 3297771520, + "step": 6290 + }, + { + "epoch": 0.30528427931450464, + "grad_norm": 0.255859375, + "learning_rate": 4.1607940529305876e-05, + "loss": 2.7816, + "num_input_tokens_seen": 3300392960, + "step": 6295 + }, + { + "epoch": 0.30552676087075126, + "grad_norm": 0.259765625, + "learning_rate": 4.159295111436864e-05, + "loss": 2.768, + "num_input_tokens_seen": 3303014400, + "step": 6300 + }, + { + "epoch": 0.30552676087075126, + "eval_accuracy": 0.45503989578244586, + "eval_loss": 2.7481751441955566, + "eval_runtime": 5.8537, + "eval_samples_per_second": 51.25, + "eval_steps_per_second": 6.492, + "num_input_tokens_seen": 3303014400, + "step": 6300 + }, + { + "epoch": 0.3057692424269979, + "grad_norm": 0.248046875, + "learning_rate": 4.1577951030060034e-05, + "loss": 2.7833, + "num_input_tokens_seen": 3305635840, + "step": 6305 + }, + { + "epoch": 0.3060117239832445, + "grad_norm": 0.251953125, + "learning_rate": 4.1562940286025195e-05, + "loss": 2.7898, + "num_input_tokens_seen": 3308257280, + "step": 6310 + }, + { + "epoch": 0.3062542055394912, + "grad_norm": 0.263671875, + "learning_rate": 4.1547918891916144e-05, + "loss": 2.7797, + "num_input_tokens_seen": 3310878720, + "step": 6315 + }, + { + "epoch": 0.3064966870957378, + "grad_norm": 0.255859375, + "learning_rate": 4.153288685739172e-05, + "loss": 2.78, + "num_input_tokens_seen": 3313500160, + "step": 6320 + }, + { + "epoch": 0.3067391686519844, + "grad_norm": 0.251953125, + "learning_rate": 4.1517844192117614e-05, + "loss": 2.7905, + "num_input_tokens_seen": 3316121600, + "step": 6325 + }, + { + "epoch": 0.30698165020823104, + "grad_norm": 0.255859375, + "learning_rate": 4.150279090576636e-05, + "loss": 2.7758, + "num_input_tokens_seen": 3318743040, + "step": 6330 + }, + { + "epoch": 0.30722413176447766, + "grad_norm": 0.255859375, + "learning_rate": 4.148772700801731e-05, + "loss": 2.7731, + "num_input_tokens_seen": 3321364480, + "step": 6335 + }, + { + "epoch": 0.3074666133207243, + "grad_norm": 0.265625, + "learning_rate": 4.1472652508556646e-05, + "loss": 2.7888, + "num_input_tokens_seen": 3323985920, + "step": 6340 + }, + { + "epoch": 0.3077090948769709, + "grad_norm": 0.251953125, + "learning_rate": 4.145756741707737e-05, + "loss": 2.7849, + "num_input_tokens_seen": 3326607360, + "step": 6345 + }, + { + "epoch": 0.3079515764332175, + "grad_norm": 0.259765625, + "learning_rate": 4.144247174327929e-05, + "loss": 2.7932, + "num_input_tokens_seen": 3329228800, + "step": 6350 + }, + { + "epoch": 0.3081940579894642, + "grad_norm": 0.2421875, + "learning_rate": 4.1427365496869005e-05, + "loss": 2.7765, + "num_input_tokens_seen": 3331850240, + "step": 6355 + }, + { + "epoch": 0.3084365395457108, + "grad_norm": 0.25390625, + "learning_rate": 4.141224868755994e-05, + "loss": 2.7782, + "num_input_tokens_seen": 3334471680, + "step": 6360 + }, + { + "epoch": 0.30867902110195744, + "grad_norm": 0.255859375, + "learning_rate": 4.13971213250723e-05, + "loss": 2.7946, + "num_input_tokens_seen": 3337093120, + "step": 6365 + }, + { + "epoch": 0.30892150265820406, + "grad_norm": 0.259765625, + "learning_rate": 4.1381983419133056e-05, + "loss": 2.7732, + "num_input_tokens_seen": 3339714560, + "step": 6370 + }, + { + "epoch": 0.3091639842144507, + "grad_norm": 0.259765625, + "learning_rate": 4.1366834979476e-05, + "loss": 2.7807, + "num_input_tokens_seen": 3342336000, + "step": 6375 + }, + { + "epoch": 0.3094064657706973, + "grad_norm": 0.25390625, + "learning_rate": 4.135167601584166e-05, + "loss": 2.7756, + "num_input_tokens_seen": 3344957440, + "step": 6380 + }, + { + "epoch": 0.3096489473269439, + "grad_norm": 0.259765625, + "learning_rate": 4.133650653797734e-05, + "loss": 2.7711, + "num_input_tokens_seen": 3347578880, + "step": 6385 + }, + { + "epoch": 0.30989142888319055, + "grad_norm": 0.248046875, + "learning_rate": 4.132132655563711e-05, + "loss": 2.7809, + "num_input_tokens_seen": 3350200320, + "step": 6390 + }, + { + "epoch": 0.3101339104394372, + "grad_norm": 0.255859375, + "learning_rate": 4.1306136078581814e-05, + "loss": 2.7921, + "num_input_tokens_seen": 3352821760, + "step": 6395 + }, + { + "epoch": 0.31037639199568384, + "grad_norm": 0.251953125, + "learning_rate": 4.129093511657899e-05, + "loss": 2.7853, + "num_input_tokens_seen": 3355443200, + "step": 6400 + }, + { + "epoch": 0.31061887355193046, + "grad_norm": 0.251953125, + "learning_rate": 4.1275723679402984e-05, + "loss": 2.7895, + "num_input_tokens_seen": 3358064640, + "step": 6405 + }, + { + "epoch": 0.3108613551081771, + "grad_norm": 0.267578125, + "learning_rate": 4.126050177683483e-05, + "loss": 2.7789, + "num_input_tokens_seen": 3360686080, + "step": 6410 + }, + { + "epoch": 0.3111038366644237, + "grad_norm": 0.25390625, + "learning_rate": 4.12452694186623e-05, + "loss": 2.774, + "num_input_tokens_seen": 3363307520, + "step": 6415 + }, + { + "epoch": 0.3113463182206703, + "grad_norm": 0.25390625, + "learning_rate": 4.123002661467992e-05, + "loss": 2.7821, + "num_input_tokens_seen": 3365928960, + "step": 6420 + }, + { + "epoch": 0.31158879977691695, + "grad_norm": 0.25390625, + "learning_rate": 4.1214773374688877e-05, + "loss": 2.777, + "num_input_tokens_seen": 3368550400, + "step": 6425 + }, + { + "epoch": 0.3118312813331636, + "grad_norm": 0.25, + "learning_rate": 4.119950970849712e-05, + "loss": 2.7865, + "num_input_tokens_seen": 3371171840, + "step": 6430 + }, + { + "epoch": 0.31207376288941024, + "grad_norm": 0.255859375, + "learning_rate": 4.118423562591928e-05, + "loss": 2.7788, + "num_input_tokens_seen": 3373793280, + "step": 6435 + }, + { + "epoch": 0.31231624444565687, + "grad_norm": 0.259765625, + "learning_rate": 4.1168951136776676e-05, + "loss": 2.7915, + "num_input_tokens_seen": 3376414720, + "step": 6440 + }, + { + "epoch": 0.3125587260019035, + "grad_norm": 0.263671875, + "learning_rate": 4.1153656250897344e-05, + "loss": 2.7871, + "num_input_tokens_seen": 3379036160, + "step": 6445 + }, + { + "epoch": 0.3128012075581501, + "grad_norm": 0.251953125, + "learning_rate": 4.113835097811598e-05, + "loss": 2.7868, + "num_input_tokens_seen": 3381657600, + "step": 6450 + }, + { + "epoch": 0.31304368911439673, + "grad_norm": 0.2470703125, + "learning_rate": 4.112303532827398e-05, + "loss": 2.7706, + "num_input_tokens_seen": 3384279040, + "step": 6455 + }, + { + "epoch": 0.31328617067064335, + "grad_norm": 0.251953125, + "learning_rate": 4.11077093112194e-05, + "loss": 2.7861, + "num_input_tokens_seen": 3386900480, + "step": 6460 + }, + { + "epoch": 0.31352865222688997, + "grad_norm": 0.25, + "learning_rate": 4.1092372936806964e-05, + "loss": 2.7775, + "num_input_tokens_seen": 3389521920, + "step": 6465 + }, + { + "epoch": 0.31377113378313665, + "grad_norm": 0.2578125, + "learning_rate": 4.107702621489805e-05, + "loss": 2.7656, + "num_input_tokens_seen": 3392143360, + "step": 6470 + }, + { + "epoch": 0.31401361533938327, + "grad_norm": 0.25, + "learning_rate": 4.106166915536071e-05, + "loss": 2.7714, + "num_input_tokens_seen": 3394764800, + "step": 6475 + }, + { + "epoch": 0.3142560968956299, + "grad_norm": 0.244140625, + "learning_rate": 4.104630176806962e-05, + "loss": 2.7862, + "num_input_tokens_seen": 3397386240, + "step": 6480 + }, + { + "epoch": 0.3144985784518765, + "grad_norm": 0.2578125, + "learning_rate": 4.103092406290611e-05, + "loss": 2.7868, + "num_input_tokens_seen": 3400007680, + "step": 6485 + }, + { + "epoch": 0.31474106000812313, + "grad_norm": 0.267578125, + "learning_rate": 4.101553604975813e-05, + "loss": 2.7785, + "num_input_tokens_seen": 3402629120, + "step": 6490 + }, + { + "epoch": 0.31498354156436975, + "grad_norm": 0.248046875, + "learning_rate": 4.100013773852027e-05, + "loss": 2.7844, + "num_input_tokens_seen": 3405250560, + "step": 6495 + }, + { + "epoch": 0.31522602312061637, + "grad_norm": 0.25, + "learning_rate": 4.098472913909376e-05, + "loss": 2.7916, + "num_input_tokens_seen": 3407872000, + "step": 6500 + }, + { + "epoch": 0.315468504676863, + "grad_norm": 0.267578125, + "learning_rate": 4.096931026138642e-05, + "loss": 2.7721, + "num_input_tokens_seen": 3410493440, + "step": 6505 + }, + { + "epoch": 0.31571098623310967, + "grad_norm": 0.25, + "learning_rate": 4.095388111531266e-05, + "loss": 2.7925, + "num_input_tokens_seen": 3413114880, + "step": 6510 + }, + { + "epoch": 0.3159534677893563, + "grad_norm": 0.2578125, + "learning_rate": 4.093844171079355e-05, + "loss": 2.7947, + "num_input_tokens_seen": 3415736320, + "step": 6515 + }, + { + "epoch": 0.3161959493456029, + "grad_norm": 0.2578125, + "learning_rate": 4.09229920577567e-05, + "loss": 2.7875, + "num_input_tokens_seen": 3418357760, + "step": 6520 + }, + { + "epoch": 0.31643843090184953, + "grad_norm": 0.255859375, + "learning_rate": 4.090753216613635e-05, + "loss": 2.793, + "num_input_tokens_seen": 3420979200, + "step": 6525 + }, + { + "epoch": 0.31668091245809615, + "grad_norm": 0.2578125, + "learning_rate": 4.0892062045873296e-05, + "loss": 2.788, + "num_input_tokens_seen": 3423600640, + "step": 6530 + }, + { + "epoch": 0.3169233940143428, + "grad_norm": 0.251953125, + "learning_rate": 4.087658170691493e-05, + "loss": 2.7857, + "num_input_tokens_seen": 3426222080, + "step": 6535 + }, + { + "epoch": 0.3171658755705894, + "grad_norm": 0.2421875, + "learning_rate": 4.0861091159215194e-05, + "loss": 2.7709, + "num_input_tokens_seen": 3428843520, + "step": 6540 + }, + { + "epoch": 0.317408357126836, + "grad_norm": 0.2490234375, + "learning_rate": 4.0845590412734625e-05, + "loss": 2.7727, + "num_input_tokens_seen": 3431464960, + "step": 6545 + }, + { + "epoch": 0.3176508386830827, + "grad_norm": 0.248046875, + "learning_rate": 4.083007947744029e-05, + "loss": 2.7918, + "num_input_tokens_seen": 3434086400, + "step": 6550 + }, + { + "epoch": 0.3178933202393293, + "grad_norm": 0.263671875, + "learning_rate": 4.081455836330581e-05, + "loss": 2.7887, + "num_input_tokens_seen": 3436707840, + "step": 6555 + }, + { + "epoch": 0.31813580179557593, + "grad_norm": 0.251953125, + "learning_rate": 4.079902708031137e-05, + "loss": 2.7735, + "num_input_tokens_seen": 3439329280, + "step": 6560 + }, + { + "epoch": 0.31837828335182256, + "grad_norm": 0.25390625, + "learning_rate": 4.078348563844368e-05, + "loss": 2.7832, + "num_input_tokens_seen": 3441950720, + "step": 6565 + }, + { + "epoch": 0.3186207649080692, + "grad_norm": 0.24609375, + "learning_rate": 4.076793404769599e-05, + "loss": 2.7718, + "num_input_tokens_seen": 3444572160, + "step": 6570 + }, + { + "epoch": 0.3188632464643158, + "grad_norm": 0.251953125, + "learning_rate": 4.075237231806806e-05, + "loss": 2.7927, + "num_input_tokens_seen": 3447193600, + "step": 6575 + }, + { + "epoch": 0.3191057280205624, + "grad_norm": 0.259765625, + "learning_rate": 4.0736800459566175e-05, + "loss": 2.7647, + "num_input_tokens_seen": 3449815040, + "step": 6580 + }, + { + "epoch": 0.31934820957680904, + "grad_norm": 0.26953125, + "learning_rate": 4.0721218482203146e-05, + "loss": 2.7822, + "num_input_tokens_seen": 3452436480, + "step": 6585 + }, + { + "epoch": 0.3195906911330557, + "grad_norm": 0.255859375, + "learning_rate": 4.0705626395998294e-05, + "loss": 2.7805, + "num_input_tokens_seen": 3455057920, + "step": 6590 + }, + { + "epoch": 0.31983317268930234, + "grad_norm": 0.255859375, + "learning_rate": 4.0690024210977405e-05, + "loss": 2.7821, + "num_input_tokens_seen": 3457679360, + "step": 6595 + }, + { + "epoch": 0.32007565424554896, + "grad_norm": 0.25, + "learning_rate": 4.06744119371728e-05, + "loss": 2.7653, + "num_input_tokens_seen": 3460300800, + "step": 6600 + }, + { + "epoch": 0.32007565424554896, + "eval_accuracy": 0.4551441133365901, + "eval_loss": 2.7476091384887695, + "eval_runtime": 5.9773, + "eval_samples_per_second": 50.19, + "eval_steps_per_second": 6.357, + "num_input_tokens_seen": 3460300800, + "step": 6600 + }, + { + "epoch": 0.3203181358017956, + "grad_norm": 0.271484375, + "learning_rate": 4.0658789584623246e-05, + "loss": 2.7825, + "num_input_tokens_seen": 3462922240, + "step": 6605 + }, + { + "epoch": 0.3205606173580422, + "grad_norm": 0.251953125, + "learning_rate": 4.064315716337404e-05, + "loss": 2.7973, + "num_input_tokens_seen": 3465543680, + "step": 6610 + }, + { + "epoch": 0.3208030989142888, + "grad_norm": 0.255859375, + "learning_rate": 4.062751468347691e-05, + "loss": 2.7879, + "num_input_tokens_seen": 3468165120, + "step": 6615 + }, + { + "epoch": 0.32104558047053544, + "grad_norm": 0.265625, + "learning_rate": 4.0611862154990074e-05, + "loss": 2.7872, + "num_input_tokens_seen": 3470786560, + "step": 6620 + }, + { + "epoch": 0.32128806202678206, + "grad_norm": 0.25, + "learning_rate": 4.059619958797821e-05, + "loss": 2.7656, + "num_input_tokens_seen": 3473408000, + "step": 6625 + }, + { + "epoch": 0.32153054358302874, + "grad_norm": 0.26171875, + "learning_rate": 4.0580526992512435e-05, + "loss": 2.7809, + "num_input_tokens_seen": 3476029440, + "step": 6630 + }, + { + "epoch": 0.32177302513927536, + "grad_norm": 0.2578125, + "learning_rate": 4.056484437867033e-05, + "loss": 2.7756, + "num_input_tokens_seen": 3478650880, + "step": 6635 + }, + { + "epoch": 0.322015506695522, + "grad_norm": 0.2578125, + "learning_rate": 4.054915175653592e-05, + "loss": 2.786, + "num_input_tokens_seen": 3481272320, + "step": 6640 + }, + { + "epoch": 0.3222579882517686, + "grad_norm": 0.265625, + "learning_rate": 4.053344913619965e-05, + "loss": 2.7802, + "num_input_tokens_seen": 3483893760, + "step": 6645 + }, + { + "epoch": 0.3225004698080152, + "grad_norm": 0.263671875, + "learning_rate": 4.051773652775842e-05, + "loss": 2.794, + "num_input_tokens_seen": 3486515200, + "step": 6650 + }, + { + "epoch": 0.32274295136426184, + "grad_norm": 0.25390625, + "learning_rate": 4.0502013941315516e-05, + "loss": 2.7916, + "num_input_tokens_seen": 3489136640, + "step": 6655 + }, + { + "epoch": 0.32298543292050846, + "grad_norm": 0.2451171875, + "learning_rate": 4.048628138698067e-05, + "loss": 2.78, + "num_input_tokens_seen": 3491758080, + "step": 6660 + }, + { + "epoch": 0.3232279144767551, + "grad_norm": 0.2578125, + "learning_rate": 4.047053887487e-05, + "loss": 2.7756, + "num_input_tokens_seen": 3494379520, + "step": 6665 + }, + { + "epoch": 0.32347039603300176, + "grad_norm": 0.248046875, + "learning_rate": 4.045478641510606e-05, + "loss": 2.7777, + "num_input_tokens_seen": 3497000960, + "step": 6670 + }, + { + "epoch": 0.3237128775892484, + "grad_norm": 0.2578125, + "learning_rate": 4.0439024017817774e-05, + "loss": 2.7721, + "num_input_tokens_seen": 3499622400, + "step": 6675 + }, + { + "epoch": 0.323955359145495, + "grad_norm": 0.251953125, + "learning_rate": 4.042325169314045e-05, + "loss": 2.7897, + "num_input_tokens_seen": 3502243840, + "step": 6680 + }, + { + "epoch": 0.3241978407017416, + "grad_norm": 0.2578125, + "learning_rate": 4.0407469451215804e-05, + "loss": 2.7719, + "num_input_tokens_seen": 3504865280, + "step": 6685 + }, + { + "epoch": 0.32444032225798825, + "grad_norm": 0.263671875, + "learning_rate": 4.039167730219191e-05, + "loss": 2.7843, + "num_input_tokens_seen": 3507486720, + "step": 6690 + }, + { + "epoch": 0.32468280381423487, + "grad_norm": 0.265625, + "learning_rate": 4.037587525622322e-05, + "loss": 2.7853, + "num_input_tokens_seen": 3510108160, + "step": 6695 + }, + { + "epoch": 0.3249252853704815, + "grad_norm": 0.263671875, + "learning_rate": 4.036006332347055e-05, + "loss": 2.7768, + "num_input_tokens_seen": 3512729600, + "step": 6700 + }, + { + "epoch": 0.32516776692672816, + "grad_norm": 0.2578125, + "learning_rate": 4.0344241514101075e-05, + "loss": 2.7949, + "num_input_tokens_seen": 3515351040, + "step": 6705 + }, + { + "epoch": 0.3254102484829748, + "grad_norm": 0.26171875, + "learning_rate": 4.0328409838288304e-05, + "loss": 2.7683, + "num_input_tokens_seen": 3517972480, + "step": 6710 + }, + { + "epoch": 0.3256527300392214, + "grad_norm": 0.2578125, + "learning_rate": 4.031256830621212e-05, + "loss": 2.7793, + "num_input_tokens_seen": 3520593920, + "step": 6715 + }, + { + "epoch": 0.325895211595468, + "grad_norm": 0.251953125, + "learning_rate": 4.029671692805872e-05, + "loss": 2.7752, + "num_input_tokens_seen": 3523215360, + "step": 6720 + }, + { + "epoch": 0.32613769315171465, + "grad_norm": 0.259765625, + "learning_rate": 4.0280855714020625e-05, + "loss": 2.7741, + "num_input_tokens_seen": 3525836800, + "step": 6725 + }, + { + "epoch": 0.32638017470796127, + "grad_norm": 0.251953125, + "learning_rate": 4.026498467429672e-05, + "loss": 2.7804, + "num_input_tokens_seen": 3528458240, + "step": 6730 + }, + { + "epoch": 0.3266226562642079, + "grad_norm": 0.25, + "learning_rate": 4.024910381909218e-05, + "loss": 2.7863, + "num_input_tokens_seen": 3531079680, + "step": 6735 + }, + { + "epoch": 0.3268651378204545, + "grad_norm": 0.2490234375, + "learning_rate": 4.0233213158618475e-05, + "loss": 2.7718, + "num_input_tokens_seen": 3533701120, + "step": 6740 + }, + { + "epoch": 0.3271076193767012, + "grad_norm": 0.2578125, + "learning_rate": 4.021731270309341e-05, + "loss": 2.7902, + "num_input_tokens_seen": 3536322560, + "step": 6745 + }, + { + "epoch": 0.3273501009329478, + "grad_norm": 0.26171875, + "learning_rate": 4.020140246274109e-05, + "loss": 2.7941, + "num_input_tokens_seen": 3538944000, + "step": 6750 + }, + { + "epoch": 0.32759258248919443, + "grad_norm": 0.265625, + "learning_rate": 4.018548244779187e-05, + "loss": 2.7729, + "num_input_tokens_seen": 3541565440, + "step": 6755 + }, + { + "epoch": 0.32783506404544105, + "grad_norm": 0.25390625, + "learning_rate": 4.0169552668482445e-05, + "loss": 2.7828, + "num_input_tokens_seen": 3544186880, + "step": 6760 + }, + { + "epoch": 0.32807754560168767, + "grad_norm": 0.2578125, + "learning_rate": 4.0153613135055755e-05, + "loss": 2.7757, + "num_input_tokens_seen": 3546808320, + "step": 6765 + }, + { + "epoch": 0.3283200271579343, + "grad_norm": 0.255859375, + "learning_rate": 4.013766385776102e-05, + "loss": 2.7726, + "num_input_tokens_seen": 3549429760, + "step": 6770 + }, + { + "epoch": 0.3285625087141809, + "grad_norm": 0.2578125, + "learning_rate": 4.012170484685371e-05, + "loss": 2.7886, + "num_input_tokens_seen": 3552051200, + "step": 6775 + }, + { + "epoch": 0.32880499027042753, + "grad_norm": 0.2578125, + "learning_rate": 4.01057361125956e-05, + "loss": 2.7867, + "num_input_tokens_seen": 3554672640, + "step": 6780 + }, + { + "epoch": 0.3290474718266742, + "grad_norm": 0.25390625, + "learning_rate": 4.0089757665254655e-05, + "loss": 2.7773, + "num_input_tokens_seen": 3557294080, + "step": 6785 + }, + { + "epoch": 0.32928995338292083, + "grad_norm": 0.25390625, + "learning_rate": 4.0073769515105134e-05, + "loss": 2.7719, + "num_input_tokens_seen": 3559915520, + "step": 6790 + }, + { + "epoch": 0.32953243493916745, + "grad_norm": 0.25, + "learning_rate": 4.0057771672427515e-05, + "loss": 2.799, + "num_input_tokens_seen": 3562536960, + "step": 6795 + }, + { + "epoch": 0.32977491649541407, + "grad_norm": 0.244140625, + "learning_rate": 4.004176414750851e-05, + "loss": 2.7746, + "num_input_tokens_seen": 3565158400, + "step": 6800 + }, + { + "epoch": 0.3300173980516607, + "grad_norm": 0.25, + "learning_rate": 4.002574695064106e-05, + "loss": 2.7851, + "num_input_tokens_seen": 3567779840, + "step": 6805 + }, + { + "epoch": 0.3302598796079073, + "grad_norm": 0.25390625, + "learning_rate": 4.000972009212431e-05, + "loss": 2.7826, + "num_input_tokens_seen": 3570401280, + "step": 6810 + }, + { + "epoch": 0.33050236116415393, + "grad_norm": 0.263671875, + "learning_rate": 3.999368358226365e-05, + "loss": 2.7749, + "num_input_tokens_seen": 3573022720, + "step": 6815 + }, + { + "epoch": 0.33074484272040056, + "grad_norm": 0.255859375, + "learning_rate": 3.997763743137064e-05, + "loss": 2.7771, + "num_input_tokens_seen": 3575644160, + "step": 6820 + }, + { + "epoch": 0.33098732427664723, + "grad_norm": 0.26171875, + "learning_rate": 3.996158164976307e-05, + "loss": 2.7887, + "num_input_tokens_seen": 3578265600, + "step": 6825 + }, + { + "epoch": 0.33122980583289385, + "grad_norm": 0.2578125, + "learning_rate": 3.994551624776489e-05, + "loss": 2.7924, + "num_input_tokens_seen": 3580887040, + "step": 6830 + }, + { + "epoch": 0.3314722873891405, + "grad_norm": 0.2490234375, + "learning_rate": 3.992944123570627e-05, + "loss": 2.7865, + "num_input_tokens_seen": 3583508480, + "step": 6835 + }, + { + "epoch": 0.3317147689453871, + "grad_norm": 0.255859375, + "learning_rate": 3.991335662392353e-05, + "loss": 2.7756, + "num_input_tokens_seen": 3586129920, + "step": 6840 + }, + { + "epoch": 0.3319572505016337, + "grad_norm": 0.2490234375, + "learning_rate": 3.989726242275918e-05, + "loss": 2.7716, + "num_input_tokens_seen": 3588751360, + "step": 6845 + }, + { + "epoch": 0.33219973205788034, + "grad_norm": 0.255859375, + "learning_rate": 3.988115864256191e-05, + "loss": 2.7869, + "num_input_tokens_seen": 3591372800, + "step": 6850 + }, + { + "epoch": 0.33244221361412696, + "grad_norm": 0.259765625, + "learning_rate": 3.986504529368653e-05, + "loss": 2.8014, + "num_input_tokens_seen": 3593994240, + "step": 6855 + }, + { + "epoch": 0.3326846951703736, + "grad_norm": 0.251953125, + "learning_rate": 3.984892238649403e-05, + "loss": 2.782, + "num_input_tokens_seen": 3596615680, + "step": 6860 + }, + { + "epoch": 0.33292717672662026, + "grad_norm": 0.2451171875, + "learning_rate": 3.983278993135154e-05, + "loss": 2.7856, + "num_input_tokens_seen": 3599237120, + "step": 6865 + }, + { + "epoch": 0.3331696582828669, + "grad_norm": 0.251953125, + "learning_rate": 3.9816647938632326e-05, + "loss": 2.7754, + "num_input_tokens_seen": 3601858560, + "step": 6870 + }, + { + "epoch": 0.3334121398391135, + "grad_norm": 0.2470703125, + "learning_rate": 3.9800496418715805e-05, + "loss": 2.79, + "num_input_tokens_seen": 3604480000, + "step": 6875 + }, + { + "epoch": 0.3336546213953601, + "grad_norm": 0.25, + "learning_rate": 3.9784335381987485e-05, + "loss": 2.7723, + "num_input_tokens_seen": 3607101440, + "step": 6880 + }, + { + "epoch": 0.33389710295160674, + "grad_norm": 0.26171875, + "learning_rate": 3.9768164838839026e-05, + "loss": 2.7887, + "num_input_tokens_seen": 3609722880, + "step": 6885 + }, + { + "epoch": 0.33413958450785336, + "grad_norm": 0.248046875, + "learning_rate": 3.97519847996682e-05, + "loss": 2.7714, + "num_input_tokens_seen": 3612344320, + "step": 6890 + }, + { + "epoch": 0.3343820660641, + "grad_norm": 0.2490234375, + "learning_rate": 3.973579527487884e-05, + "loss": 2.7813, + "num_input_tokens_seen": 3614965760, + "step": 6895 + }, + { + "epoch": 0.3346245476203466, + "grad_norm": 0.251953125, + "learning_rate": 3.971959627488094e-05, + "loss": 2.7843, + "num_input_tokens_seen": 3617587200, + "step": 6900 + }, + { + "epoch": 0.3346245476203466, + "eval_accuracy": 0.45507409216739947, + "eval_loss": 2.7469868659973145, + "eval_runtime": 5.8611, + "eval_samples_per_second": 51.185, + "eval_steps_per_second": 6.483, + "num_input_tokens_seen": 3617587200, + "step": 6900 + }, + { + "epoch": 0.3348670291765933, + "grad_norm": 0.25390625, + "learning_rate": 3.9703387810090555e-05, + "loss": 2.7783, + "num_input_tokens_seen": 3620208640, + "step": 6905 + }, + { + "epoch": 0.3351095107328399, + "grad_norm": 0.251953125, + "learning_rate": 3.968716989092982e-05, + "loss": 2.7871, + "num_input_tokens_seen": 3622830080, + "step": 6910 + }, + { + "epoch": 0.3353519922890865, + "grad_norm": 0.25390625, + "learning_rate": 3.9670942527826956e-05, + "loss": 2.7889, + "num_input_tokens_seen": 3625451520, + "step": 6915 + }, + { + "epoch": 0.33559447384533314, + "grad_norm": 0.2392578125, + "learning_rate": 3.965470573121627e-05, + "loss": 2.7702, + "num_input_tokens_seen": 3628072960, + "step": 6920 + }, + { + "epoch": 0.33583695540157976, + "grad_norm": 0.244140625, + "learning_rate": 3.9638459511538116e-05, + "loss": 2.7807, + "num_input_tokens_seen": 3630694400, + "step": 6925 + }, + { + "epoch": 0.3360794369578264, + "grad_norm": 0.26171875, + "learning_rate": 3.9622203879238925e-05, + "loss": 2.7756, + "num_input_tokens_seen": 3633315840, + "step": 6930 + }, + { + "epoch": 0.336321918514073, + "grad_norm": 0.25390625, + "learning_rate": 3.960593884477116e-05, + "loss": 2.7712, + "num_input_tokens_seen": 3635937280, + "step": 6935 + }, + { + "epoch": 0.3365644000703196, + "grad_norm": 0.248046875, + "learning_rate": 3.958966441859334e-05, + "loss": 2.7854, + "num_input_tokens_seen": 3638558720, + "step": 6940 + }, + { + "epoch": 0.3368068816265663, + "grad_norm": 0.25, + "learning_rate": 3.957338061117003e-05, + "loss": 2.7908, + "num_input_tokens_seen": 3641180160, + "step": 6945 + }, + { + "epoch": 0.3370493631828129, + "grad_norm": 0.2470703125, + "learning_rate": 3.955708743297182e-05, + "loss": 2.764, + "num_input_tokens_seen": 3643801600, + "step": 6950 + }, + { + "epoch": 0.33729184473905954, + "grad_norm": 0.25, + "learning_rate": 3.954078489447531e-05, + "loss": 2.7863, + "num_input_tokens_seen": 3646423040, + "step": 6955 + }, + { + "epoch": 0.33753432629530616, + "grad_norm": 0.26171875, + "learning_rate": 3.952447300616315e-05, + "loss": 2.7685, + "num_input_tokens_seen": 3649044480, + "step": 6960 + }, + { + "epoch": 0.3377768078515528, + "grad_norm": 0.26171875, + "learning_rate": 3.9508151778523996e-05, + "loss": 2.7669, + "num_input_tokens_seen": 3651665920, + "step": 6965 + }, + { + "epoch": 0.3380192894077994, + "grad_norm": 0.259765625, + "learning_rate": 3.949182122205247e-05, + "loss": 2.7774, + "num_input_tokens_seen": 3654287360, + "step": 6970 + }, + { + "epoch": 0.338261770964046, + "grad_norm": 0.259765625, + "learning_rate": 3.947548134724924e-05, + "loss": 2.7872, + "num_input_tokens_seen": 3656908800, + "step": 6975 + }, + { + "epoch": 0.33850425252029265, + "grad_norm": 0.2578125, + "learning_rate": 3.945913216462095e-05, + "loss": 2.7829, + "num_input_tokens_seen": 3659530240, + "step": 6980 + }, + { + "epoch": 0.3387467340765393, + "grad_norm": 0.251953125, + "learning_rate": 3.944277368468022e-05, + "loss": 2.791, + "num_input_tokens_seen": 3662151680, + "step": 6985 + }, + { + "epoch": 0.33898921563278595, + "grad_norm": 0.2490234375, + "learning_rate": 3.942640591794565e-05, + "loss": 2.7832, + "num_input_tokens_seen": 3664773120, + "step": 6990 + }, + { + "epoch": 0.33923169718903257, + "grad_norm": 0.25390625, + "learning_rate": 3.9410028874941836e-05, + "loss": 2.7644, + "num_input_tokens_seen": 3667394560, + "step": 6995 + }, + { + "epoch": 0.3394741787452792, + "grad_norm": 0.25, + "learning_rate": 3.93936425661993e-05, + "loss": 2.7861, + "num_input_tokens_seen": 3670016000, + "step": 7000 + }, + { + "epoch": 0.3397166603015258, + "grad_norm": 0.251953125, + "learning_rate": 3.9377247002254546e-05, + "loss": 2.7767, + "num_input_tokens_seen": 3672637440, + "step": 7005 + }, + { + "epoch": 0.33995914185777243, + "grad_norm": 0.2490234375, + "learning_rate": 3.936084219365003e-05, + "loss": 2.7971, + "num_input_tokens_seen": 3675258880, + "step": 7010 + }, + { + "epoch": 0.34020162341401905, + "grad_norm": 0.2470703125, + "learning_rate": 3.9344428150934135e-05, + "loss": 2.7858, + "num_input_tokens_seen": 3677880320, + "step": 7015 + }, + { + "epoch": 0.3404441049702657, + "grad_norm": 0.251953125, + "learning_rate": 3.9328004884661205e-05, + "loss": 2.7868, + "num_input_tokens_seen": 3680501760, + "step": 7020 + }, + { + "epoch": 0.34068658652651235, + "grad_norm": 0.25, + "learning_rate": 3.9311572405391495e-05, + "loss": 2.7893, + "num_input_tokens_seen": 3683123200, + "step": 7025 + }, + { + "epoch": 0.34092906808275897, + "grad_norm": 0.2451171875, + "learning_rate": 3.9295130723691206e-05, + "loss": 2.7755, + "num_input_tokens_seen": 3685744640, + "step": 7030 + }, + { + "epoch": 0.3411715496390056, + "grad_norm": 0.26171875, + "learning_rate": 3.927867985013241e-05, + "loss": 2.7824, + "num_input_tokens_seen": 3688366080, + "step": 7035 + }, + { + "epoch": 0.3414140311952522, + "grad_norm": 0.25390625, + "learning_rate": 3.926221979529316e-05, + "loss": 2.7815, + "num_input_tokens_seen": 3690987520, + "step": 7040 + }, + { + "epoch": 0.34165651275149883, + "grad_norm": 0.2490234375, + "learning_rate": 3.924575056975737e-05, + "loss": 2.7821, + "num_input_tokens_seen": 3693608960, + "step": 7045 + }, + { + "epoch": 0.34189899430774545, + "grad_norm": 0.25, + "learning_rate": 3.922927218411482e-05, + "loss": 2.7823, + "num_input_tokens_seen": 3696230400, + "step": 7050 + }, + { + "epoch": 0.3421414758639921, + "grad_norm": 0.25, + "learning_rate": 3.921278464896124e-05, + "loss": 2.7939, + "num_input_tokens_seen": 3698851840, + "step": 7055 + }, + { + "epoch": 0.34238395742023875, + "grad_norm": 0.2431640625, + "learning_rate": 3.919628797489823e-05, + "loss": 2.787, + "num_input_tokens_seen": 3701473280, + "step": 7060 + }, + { + "epoch": 0.34262643897648537, + "grad_norm": 0.255859375, + "learning_rate": 3.9179782172533216e-05, + "loss": 2.7935, + "num_input_tokens_seen": 3704094720, + "step": 7065 + }, + { + "epoch": 0.342868920532732, + "grad_norm": 0.255859375, + "learning_rate": 3.916326725247957e-05, + "loss": 2.7681, + "num_input_tokens_seen": 3706716160, + "step": 7070 + }, + { + "epoch": 0.3431114020889786, + "grad_norm": 0.251953125, + "learning_rate": 3.9146743225356483e-05, + "loss": 2.7899, + "num_input_tokens_seen": 3709337600, + "step": 7075 + }, + { + "epoch": 0.34335388364522523, + "grad_norm": 0.2470703125, + "learning_rate": 3.9130210101788994e-05, + "loss": 2.797, + "num_input_tokens_seen": 3711959040, + "step": 7080 + }, + { + "epoch": 0.34359636520147185, + "grad_norm": 0.265625, + "learning_rate": 3.9113667892408015e-05, + "loss": 2.7703, + "num_input_tokens_seen": 3714580480, + "step": 7085 + }, + { + "epoch": 0.3438388467577185, + "grad_norm": 0.255859375, + "learning_rate": 3.909711660785028e-05, + "loss": 2.7729, + "num_input_tokens_seen": 3717201920, + "step": 7090 + }, + { + "epoch": 0.3440813283139651, + "grad_norm": 0.25, + "learning_rate": 3.908055625875838e-05, + "loss": 2.7915, + "num_input_tokens_seen": 3719823360, + "step": 7095 + }, + { + "epoch": 0.34432380987021177, + "grad_norm": 0.25, + "learning_rate": 3.906398685578071e-05, + "loss": 2.7717, + "num_input_tokens_seen": 3722444800, + "step": 7100 + }, + { + "epoch": 0.3445662914264584, + "grad_norm": 0.2431640625, + "learning_rate": 3.9047408409571516e-05, + "loss": 2.7824, + "num_input_tokens_seen": 3725066240, + "step": 7105 + }, + { + "epoch": 0.344808772982705, + "grad_norm": 0.259765625, + "learning_rate": 3.903082093079083e-05, + "loss": 2.774, + "num_input_tokens_seen": 3727687680, + "step": 7110 + }, + { + "epoch": 0.34505125453895164, + "grad_norm": 0.25390625, + "learning_rate": 3.901422443010451e-05, + "loss": 2.7723, + "num_input_tokens_seen": 3730309120, + "step": 7115 + }, + { + "epoch": 0.34529373609519826, + "grad_norm": 0.2470703125, + "learning_rate": 3.8997618918184206e-05, + "loss": 2.7748, + "num_input_tokens_seen": 3732930560, + "step": 7120 + }, + { + "epoch": 0.3455362176514449, + "grad_norm": 0.267578125, + "learning_rate": 3.898100440570737e-05, + "loss": 2.7756, + "num_input_tokens_seen": 3735552000, + "step": 7125 + }, + { + "epoch": 0.3457786992076915, + "grad_norm": 0.259765625, + "learning_rate": 3.896438090335724e-05, + "loss": 2.7742, + "num_input_tokens_seen": 3738173440, + "step": 7130 + }, + { + "epoch": 0.3460211807639381, + "grad_norm": 0.255859375, + "learning_rate": 3.8947748421822826e-05, + "loss": 2.7715, + "num_input_tokens_seen": 3740794880, + "step": 7135 + }, + { + "epoch": 0.3462636623201848, + "grad_norm": 0.25, + "learning_rate": 3.893110697179892e-05, + "loss": 2.7701, + "num_input_tokens_seen": 3743416320, + "step": 7140 + }, + { + "epoch": 0.3465061438764314, + "grad_norm": 0.255859375, + "learning_rate": 3.891445656398608e-05, + "loss": 2.7833, + "num_input_tokens_seen": 3746037760, + "step": 7145 + }, + { + "epoch": 0.34674862543267804, + "grad_norm": 0.255859375, + "learning_rate": 3.8897797209090616e-05, + "loss": 2.777, + "num_input_tokens_seen": 3748659200, + "step": 7150 + }, + { + "epoch": 0.34699110698892466, + "grad_norm": 0.25390625, + "learning_rate": 3.8881128917824606e-05, + "loss": 2.7722, + "num_input_tokens_seen": 3751280640, + "step": 7155 + }, + { + "epoch": 0.3472335885451713, + "grad_norm": 0.2451171875, + "learning_rate": 3.886445170090586e-05, + "loss": 2.7933, + "num_input_tokens_seen": 3753902080, + "step": 7160 + }, + { + "epoch": 0.3474760701014179, + "grad_norm": 0.255859375, + "learning_rate": 3.884776556905793e-05, + "loss": 2.7814, + "num_input_tokens_seen": 3756523520, + "step": 7165 + }, + { + "epoch": 0.3477185516576645, + "grad_norm": 0.248046875, + "learning_rate": 3.883107053301012e-05, + "loss": 2.7849, + "num_input_tokens_seen": 3759144960, + "step": 7170 + }, + { + "epoch": 0.34796103321391114, + "grad_norm": 0.255859375, + "learning_rate": 3.8814366603497415e-05, + "loss": 2.7713, + "num_input_tokens_seen": 3761766400, + "step": 7175 + }, + { + "epoch": 0.3482035147701578, + "grad_norm": 0.26171875, + "learning_rate": 3.8797653791260565e-05, + "loss": 2.7698, + "num_input_tokens_seen": 3764387840, + "step": 7180 + }, + { + "epoch": 0.34844599632640444, + "grad_norm": 0.251953125, + "learning_rate": 3.878093210704602e-05, + "loss": 2.7729, + "num_input_tokens_seen": 3767009280, + "step": 7185 + }, + { + "epoch": 0.34868847788265106, + "grad_norm": 0.255859375, + "learning_rate": 3.8764201561605904e-05, + "loss": 2.7877, + "num_input_tokens_seen": 3769630720, + "step": 7190 + }, + { + "epoch": 0.3489309594388977, + "grad_norm": 0.265625, + "learning_rate": 3.874746216569808e-05, + "loss": 2.7723, + "num_input_tokens_seen": 3772252160, + "step": 7195 + }, + { + "epoch": 0.3491734409951443, + "grad_norm": 0.25390625, + "learning_rate": 3.873071393008608e-05, + "loss": 2.7765, + "num_input_tokens_seen": 3774873600, + "step": 7200 + }, + { + "epoch": 0.3491734409951443, + "eval_accuracy": 0.4550219833903273, + "eval_loss": 2.7463502883911133, + "eval_runtime": 5.8692, + "eval_samples_per_second": 51.114, + "eval_steps_per_second": 6.474, + "num_input_tokens_seen": 3774873600, + "step": 7200 + }, + { + "epoch": 0.3494159225513909, + "grad_norm": 0.2578125, + "learning_rate": 3.871395686553912e-05, + "loss": 2.7959, + "num_input_tokens_seen": 3777495040, + "step": 7205 + }, + { + "epoch": 0.34965840410763754, + "grad_norm": 0.255859375, + "learning_rate": 3.869719098283211e-05, + "loss": 2.7815, + "num_input_tokens_seen": 3780116480, + "step": 7210 + }, + { + "epoch": 0.34990088566388416, + "grad_norm": 0.25, + "learning_rate": 3.868041629274561e-05, + "loss": 2.774, + "num_input_tokens_seen": 3782737920, + "step": 7215 + }, + { + "epoch": 0.35014336722013084, + "grad_norm": 0.25, + "learning_rate": 3.866363280606584e-05, + "loss": 2.7742, + "num_input_tokens_seen": 3785359360, + "step": 7220 + }, + { + "epoch": 0.35038584877637746, + "grad_norm": 0.251953125, + "learning_rate": 3.86468405335847e-05, + "loss": 2.7824, + "num_input_tokens_seen": 3787980800, + "step": 7225 + }, + { + "epoch": 0.3506283303326241, + "grad_norm": 0.26171875, + "learning_rate": 3.863003948609972e-05, + "loss": 2.7731, + "num_input_tokens_seen": 3790602240, + "step": 7230 + }, + { + "epoch": 0.3508708118888707, + "grad_norm": 0.26171875, + "learning_rate": 3.861322967441409e-05, + "loss": 2.7944, + "num_input_tokens_seen": 3793223680, + "step": 7235 + }, + { + "epoch": 0.3511132934451173, + "grad_norm": 0.2490234375, + "learning_rate": 3.8596411109336604e-05, + "loss": 2.7845, + "num_input_tokens_seen": 3795845120, + "step": 7240 + }, + { + "epoch": 0.35135577500136395, + "grad_norm": 0.2490234375, + "learning_rate": 3.8579583801681725e-05, + "loss": 2.7809, + "num_input_tokens_seen": 3798466560, + "step": 7245 + }, + { + "epoch": 0.35159825655761057, + "grad_norm": 0.2578125, + "learning_rate": 3.8562747762269504e-05, + "loss": 2.7752, + "num_input_tokens_seen": 3801088000, + "step": 7250 + }, + { + "epoch": 0.3518407381138572, + "grad_norm": 0.26171875, + "learning_rate": 3.854590300192562e-05, + "loss": 2.7826, + "num_input_tokens_seen": 3803709440, + "step": 7255 + }, + { + "epoch": 0.35208321967010386, + "grad_norm": 0.26171875, + "learning_rate": 3.8529049531481364e-05, + "loss": 2.7801, + "num_input_tokens_seen": 3806330880, + "step": 7260 + }, + { + "epoch": 0.3523257012263505, + "grad_norm": 0.2490234375, + "learning_rate": 3.8512187361773625e-05, + "loss": 2.7696, + "num_input_tokens_seen": 3808952320, + "step": 7265 + }, + { + "epoch": 0.3525681827825971, + "grad_norm": 0.24609375, + "learning_rate": 3.849531650364488e-05, + "loss": 2.7825, + "num_input_tokens_seen": 3811573760, + "step": 7270 + }, + { + "epoch": 0.3528106643388437, + "grad_norm": 0.255859375, + "learning_rate": 3.847843696794319e-05, + "loss": 2.7795, + "num_input_tokens_seen": 3814195200, + "step": 7275 + }, + { + "epoch": 0.35305314589509035, + "grad_norm": 0.2470703125, + "learning_rate": 3.846154876552222e-05, + "loss": 2.7835, + "num_input_tokens_seen": 3816816640, + "step": 7280 + }, + { + "epoch": 0.35329562745133697, + "grad_norm": 0.265625, + "learning_rate": 3.844465190724116e-05, + "loss": 2.7722, + "num_input_tokens_seen": 3819438080, + "step": 7285 + }, + { + "epoch": 0.3535381090075836, + "grad_norm": 0.2470703125, + "learning_rate": 3.8427746403964836e-05, + "loss": 2.7649, + "num_input_tokens_seen": 3822059520, + "step": 7290 + }, + { + "epoch": 0.35378059056383027, + "grad_norm": 0.255859375, + "learning_rate": 3.8410832266563555e-05, + "loss": 2.7647, + "num_input_tokens_seen": 3824680960, + "step": 7295 + }, + { + "epoch": 0.3540230721200769, + "grad_norm": 0.265625, + "learning_rate": 3.839390950591324e-05, + "loss": 2.7675, + "num_input_tokens_seen": 3827302400, + "step": 7300 + }, + { + "epoch": 0.3542655536763235, + "grad_norm": 0.26171875, + "learning_rate": 3.837697813289531e-05, + "loss": 2.7912, + "num_input_tokens_seen": 3829923840, + "step": 7305 + }, + { + "epoch": 0.35450803523257013, + "grad_norm": 0.251953125, + "learning_rate": 3.836003815839676e-05, + "loss": 2.7815, + "num_input_tokens_seen": 3832545280, + "step": 7310 + }, + { + "epoch": 0.35475051678881675, + "grad_norm": 0.267578125, + "learning_rate": 3.834308959331009e-05, + "loss": 2.7858, + "num_input_tokens_seen": 3835166720, + "step": 7315 + }, + { + "epoch": 0.35499299834506337, + "grad_norm": 0.26953125, + "learning_rate": 3.832613244853335e-05, + "loss": 2.7794, + "num_input_tokens_seen": 3837788160, + "step": 7320 + }, + { + "epoch": 0.35523547990131, + "grad_norm": 0.279296875, + "learning_rate": 3.8309166734970064e-05, + "loss": 2.7687, + "num_input_tokens_seen": 3840409600, + "step": 7325 + }, + { + "epoch": 0.3554779614575566, + "grad_norm": 0.26171875, + "learning_rate": 3.829219246352931e-05, + "loss": 2.7792, + "num_input_tokens_seen": 3843031040, + "step": 7330 + }, + { + "epoch": 0.3557204430138033, + "grad_norm": 0.28125, + "learning_rate": 3.827520964512564e-05, + "loss": 2.7775, + "num_input_tokens_seen": 3845652480, + "step": 7335 + }, + { + "epoch": 0.3559629245700499, + "grad_norm": 0.2578125, + "learning_rate": 3.8258218290679124e-05, + "loss": 2.7859, + "num_input_tokens_seen": 3848273920, + "step": 7340 + }, + { + "epoch": 0.35620540612629653, + "grad_norm": 0.263671875, + "learning_rate": 3.8241218411115306e-05, + "loss": 2.7754, + "num_input_tokens_seen": 3850895360, + "step": 7345 + }, + { + "epoch": 0.35644788768254315, + "grad_norm": 0.25390625, + "learning_rate": 3.8224210017365205e-05, + "loss": 2.7796, + "num_input_tokens_seen": 3853516800, + "step": 7350 + }, + { + "epoch": 0.3566903692387898, + "grad_norm": 0.2578125, + "learning_rate": 3.820719312036535e-05, + "loss": 2.7865, + "num_input_tokens_seen": 3856138240, + "step": 7355 + }, + { + "epoch": 0.3569328507950364, + "grad_norm": 0.251953125, + "learning_rate": 3.819016773105768e-05, + "loss": 2.7742, + "num_input_tokens_seen": 3858759680, + "step": 7360 + }, + { + "epoch": 0.357175332351283, + "grad_norm": 0.248046875, + "learning_rate": 3.817313386038964e-05, + "loss": 2.7804, + "num_input_tokens_seen": 3861381120, + "step": 7365 + }, + { + "epoch": 0.35741781390752964, + "grad_norm": 0.2451171875, + "learning_rate": 3.815609151931412e-05, + "loss": 2.7833, + "num_input_tokens_seen": 3864002560, + "step": 7370 + }, + { + "epoch": 0.3576602954637763, + "grad_norm": 0.251953125, + "learning_rate": 3.813904071878945e-05, + "loss": 2.7833, + "num_input_tokens_seen": 3866624000, + "step": 7375 + }, + { + "epoch": 0.35790277702002293, + "grad_norm": 0.248046875, + "learning_rate": 3.81219814697794e-05, + "loss": 2.7804, + "num_input_tokens_seen": 3869245440, + "step": 7380 + }, + { + "epoch": 0.35814525857626955, + "grad_norm": 0.255859375, + "learning_rate": 3.810491378325318e-05, + "loss": 2.7805, + "num_input_tokens_seen": 3871866880, + "step": 7385 + }, + { + "epoch": 0.3583877401325162, + "grad_norm": 0.265625, + "learning_rate": 3.80878376701854e-05, + "loss": 2.787, + "num_input_tokens_seen": 3874488320, + "step": 7390 + }, + { + "epoch": 0.3586302216887628, + "grad_norm": 0.267578125, + "learning_rate": 3.807075314155613e-05, + "loss": 2.7921, + "num_input_tokens_seen": 3877109760, + "step": 7395 + }, + { + "epoch": 0.3588727032450094, + "grad_norm": 0.259765625, + "learning_rate": 3.8053660208350815e-05, + "loss": 2.7695, + "num_input_tokens_seen": 3879731200, + "step": 7400 + }, + { + "epoch": 0.35911518480125604, + "grad_norm": 0.259765625, + "learning_rate": 3.803655888156033e-05, + "loss": 2.7807, + "num_input_tokens_seen": 3882352640, + "step": 7405 + }, + { + "epoch": 0.35935766635750266, + "grad_norm": 0.255859375, + "learning_rate": 3.801944917218092e-05, + "loss": 2.7863, + "num_input_tokens_seen": 3884974080, + "step": 7410 + }, + { + "epoch": 0.35960014791374934, + "grad_norm": 0.255859375, + "learning_rate": 3.800233109121425e-05, + "loss": 2.7784, + "num_input_tokens_seen": 3887595520, + "step": 7415 + }, + { + "epoch": 0.35984262946999596, + "grad_norm": 0.2490234375, + "learning_rate": 3.798520464966734e-05, + "loss": 2.7841, + "num_input_tokens_seen": 3890216960, + "step": 7420 + }, + { + "epoch": 0.3600851110262426, + "grad_norm": 0.2451171875, + "learning_rate": 3.7968069858552604e-05, + "loss": 2.7715, + "num_input_tokens_seen": 3892838400, + "step": 7425 + }, + { + "epoch": 0.3603275925824892, + "grad_norm": 0.255859375, + "learning_rate": 3.795092672888782e-05, + "loss": 2.7832, + "num_input_tokens_seen": 3895459840, + "step": 7430 + }, + { + "epoch": 0.3605700741387358, + "grad_norm": 0.26171875, + "learning_rate": 3.7933775271696136e-05, + "loss": 2.7782, + "num_input_tokens_seen": 3898081280, + "step": 7435 + }, + { + "epoch": 0.36081255569498244, + "grad_norm": 0.263671875, + "learning_rate": 3.791661549800604e-05, + "loss": 2.7799, + "num_input_tokens_seen": 3900702720, + "step": 7440 + }, + { + "epoch": 0.36105503725122906, + "grad_norm": 0.2470703125, + "learning_rate": 3.789944741885136e-05, + "loss": 2.7891, + "num_input_tokens_seen": 3903324160, + "step": 7445 + }, + { + "epoch": 0.3612975188074757, + "grad_norm": 0.26171875, + "learning_rate": 3.78822710452713e-05, + "loss": 2.784, + "num_input_tokens_seen": 3905945600, + "step": 7450 + }, + { + "epoch": 0.36154000036372236, + "grad_norm": 0.25390625, + "learning_rate": 3.786508638831036e-05, + "loss": 2.776, + "num_input_tokens_seen": 3908567040, + "step": 7455 + }, + { + "epoch": 0.361782481919969, + "grad_norm": 0.248046875, + "learning_rate": 3.78478934590184e-05, + "loss": 2.7755, + "num_input_tokens_seen": 3911188480, + "step": 7460 + }, + { + "epoch": 0.3620249634762156, + "grad_norm": 0.244140625, + "learning_rate": 3.783069226845056e-05, + "loss": 2.7844, + "num_input_tokens_seen": 3913809920, + "step": 7465 + }, + { + "epoch": 0.3622674450324622, + "grad_norm": 0.25390625, + "learning_rate": 3.7813482827667325e-05, + "loss": 2.7889, + "num_input_tokens_seen": 3916431360, + "step": 7470 + }, + { + "epoch": 0.36250992658870884, + "grad_norm": 0.251953125, + "learning_rate": 3.779626514773448e-05, + "loss": 2.7728, + "num_input_tokens_seen": 3919052800, + "step": 7475 + }, + { + "epoch": 0.36275240814495546, + "grad_norm": 0.26953125, + "learning_rate": 3.777903923972307e-05, + "loss": 2.7841, + "num_input_tokens_seen": 3921674240, + "step": 7480 + }, + { + "epoch": 0.3629948897012021, + "grad_norm": 0.259765625, + "learning_rate": 3.77618051147095e-05, + "loss": 2.7764, + "num_input_tokens_seen": 3924295680, + "step": 7485 + }, + { + "epoch": 0.3632373712574487, + "grad_norm": 0.26171875, + "learning_rate": 3.77445627837754e-05, + "loss": 2.7899, + "num_input_tokens_seen": 3926917120, + "step": 7490 + }, + { + "epoch": 0.3634798528136954, + "grad_norm": 0.2578125, + "learning_rate": 3.77273122580077e-05, + "loss": 2.7736, + "num_input_tokens_seen": 3929538560, + "step": 7495 + }, + { + "epoch": 0.363722334369942, + "grad_norm": 0.25, + "learning_rate": 3.771005354849859e-05, + "loss": 2.7778, + "num_input_tokens_seen": 3932160000, + "step": 7500 + }, + { + "epoch": 0.363722334369942, + "eval_accuracy": 0.45522390490148185, + "eval_loss": 2.7460319995880127, + "eval_runtime": 5.7761, + "eval_samples_per_second": 51.938, + "eval_steps_per_second": 6.579, + "num_input_tokens_seen": 3932160000, + "step": 7500 + }, + { + "epoch": 0.3639648159261886, + "grad_norm": 0.2451171875, + "learning_rate": 3.769278666634555e-05, + "loss": 2.786, + "num_input_tokens_seen": 3934781440, + "step": 7505 + }, + { + "epoch": 0.36420729748243524, + "grad_norm": 0.2578125, + "learning_rate": 3.767551162265126e-05, + "loss": 2.7914, + "num_input_tokens_seen": 3937402880, + "step": 7510 + }, + { + "epoch": 0.36444977903868186, + "grad_norm": 0.255859375, + "learning_rate": 3.7658228428523714e-05, + "loss": 2.767, + "num_input_tokens_seen": 3940024320, + "step": 7515 + }, + { + "epoch": 0.3646922605949285, + "grad_norm": 0.263671875, + "learning_rate": 3.764093709507609e-05, + "loss": 2.7903, + "num_input_tokens_seen": 3942645760, + "step": 7520 + }, + { + "epoch": 0.3649347421511751, + "grad_norm": 0.263671875, + "learning_rate": 3.7623637633426835e-05, + "loss": 2.7748, + "num_input_tokens_seen": 3945267200, + "step": 7525 + }, + { + "epoch": 0.3651772237074217, + "grad_norm": 0.26171875, + "learning_rate": 3.760633005469961e-05, + "loss": 2.7837, + "num_input_tokens_seen": 3947888640, + "step": 7530 + }, + { + "epoch": 0.3654197052636684, + "grad_norm": 0.259765625, + "learning_rate": 3.758901437002329e-05, + "loss": 2.7716, + "num_input_tokens_seen": 3950510080, + "step": 7535 + }, + { + "epoch": 0.365662186819915, + "grad_norm": 0.263671875, + "learning_rate": 3.7571690590531975e-05, + "loss": 2.7612, + "num_input_tokens_seen": 3953131520, + "step": 7540 + }, + { + "epoch": 0.36590466837616165, + "grad_norm": 0.25, + "learning_rate": 3.755435872736496e-05, + "loss": 2.7766, + "num_input_tokens_seen": 3955752960, + "step": 7545 + }, + { + "epoch": 0.36614714993240827, + "grad_norm": 0.259765625, + "learning_rate": 3.7537018791666746e-05, + "loss": 2.7842, + "num_input_tokens_seen": 3958374400, + "step": 7550 + }, + { + "epoch": 0.3663896314886549, + "grad_norm": 0.25390625, + "learning_rate": 3.751967079458702e-05, + "loss": 2.7718, + "num_input_tokens_seen": 3960995840, + "step": 7555 + }, + { + "epoch": 0.3666321130449015, + "grad_norm": 0.25390625, + "learning_rate": 3.750231474728065e-05, + "loss": 2.7661, + "num_input_tokens_seen": 3963617280, + "step": 7560 + }, + { + "epoch": 0.36687459460114813, + "grad_norm": 0.25, + "learning_rate": 3.7484950660907683e-05, + "loss": 2.7754, + "num_input_tokens_seen": 3966238720, + "step": 7565 + }, + { + "epoch": 0.36711707615739475, + "grad_norm": 0.2490234375, + "learning_rate": 3.746757854663333e-05, + "loss": 2.7748, + "num_input_tokens_seen": 3968860160, + "step": 7570 + }, + { + "epoch": 0.3673595577136414, + "grad_norm": 0.25390625, + "learning_rate": 3.745019841562798e-05, + "loss": 2.7734, + "num_input_tokens_seen": 3971481600, + "step": 7575 + }, + { + "epoch": 0.36760203926988805, + "grad_norm": 0.25390625, + "learning_rate": 3.7432810279067153e-05, + "loss": 2.7905, + "num_input_tokens_seen": 3974103040, + "step": 7580 + }, + { + "epoch": 0.36784452082613467, + "grad_norm": 0.263671875, + "learning_rate": 3.741541414813155e-05, + "loss": 2.7909, + "num_input_tokens_seen": 3976724480, + "step": 7585 + }, + { + "epoch": 0.3680870023823813, + "grad_norm": 0.25390625, + "learning_rate": 3.739801003400697e-05, + "loss": 2.7889, + "num_input_tokens_seen": 3979345920, + "step": 7590 + }, + { + "epoch": 0.3683294839386279, + "grad_norm": 0.2578125, + "learning_rate": 3.7380597947884374e-05, + "loss": 2.8, + "num_input_tokens_seen": 3981967360, + "step": 7595 + }, + { + "epoch": 0.36857196549487453, + "grad_norm": 0.25, + "learning_rate": 3.736317790095985e-05, + "loss": 2.7804, + "num_input_tokens_seen": 3984588800, + "step": 7600 + }, + { + "epoch": 0.36881444705112115, + "grad_norm": 0.25390625, + "learning_rate": 3.734574990443459e-05, + "loss": 2.7764, + "num_input_tokens_seen": 3987210240, + "step": 7605 + }, + { + "epoch": 0.36905692860736783, + "grad_norm": 0.265625, + "learning_rate": 3.73283139695149e-05, + "loss": 2.7686, + "num_input_tokens_seen": 3989831680, + "step": 7610 + }, + { + "epoch": 0.36929941016361445, + "grad_norm": 0.2578125, + "learning_rate": 3.731087010741222e-05, + "loss": 2.7881, + "num_input_tokens_seen": 3992453120, + "step": 7615 + }, + { + "epoch": 0.36954189171986107, + "grad_norm": 0.255859375, + "learning_rate": 3.7293418329343026e-05, + "loss": 2.7763, + "num_input_tokens_seen": 3995074560, + "step": 7620 + }, + { + "epoch": 0.3697843732761077, + "grad_norm": 0.26953125, + "learning_rate": 3.7275958646528944e-05, + "loss": 2.7661, + "num_input_tokens_seen": 3997696000, + "step": 7625 + }, + { + "epoch": 0.3700268548323543, + "grad_norm": 0.25390625, + "learning_rate": 3.725849107019666e-05, + "loss": 2.7846, + "num_input_tokens_seen": 4000317440, + "step": 7630 + }, + { + "epoch": 0.37026933638860093, + "grad_norm": 0.25390625, + "learning_rate": 3.7241015611577926e-05, + "loss": 2.7895, + "num_input_tokens_seen": 4002938880, + "step": 7635 + }, + { + "epoch": 0.37051181794484755, + "grad_norm": 0.255859375, + "learning_rate": 3.7223532281909574e-05, + "loss": 2.783, + "num_input_tokens_seen": 4005560320, + "step": 7640 + }, + { + "epoch": 0.3707542995010942, + "grad_norm": 0.26171875, + "learning_rate": 3.7206041092433495e-05, + "loss": 2.7809, + "num_input_tokens_seen": 4008181760, + "step": 7645 + }, + { + "epoch": 0.37099678105734085, + "grad_norm": 0.2578125, + "learning_rate": 3.7188542054396625e-05, + "loss": 2.765, + "num_input_tokens_seen": 4010803200, + "step": 7650 + }, + { + "epoch": 0.3712392626135875, + "grad_norm": 0.25, + "learning_rate": 3.7171035179050964e-05, + "loss": 2.7904, + "num_input_tokens_seen": 4013424640, + "step": 7655 + }, + { + "epoch": 0.3714817441698341, + "grad_norm": 0.25390625, + "learning_rate": 3.7153520477653545e-05, + "loss": 2.7808, + "num_input_tokens_seen": 4016046080, + "step": 7660 + }, + { + "epoch": 0.3717242257260807, + "grad_norm": 0.25390625, + "learning_rate": 3.713599796146644e-05, + "loss": 2.785, + "num_input_tokens_seen": 4018667520, + "step": 7665 + }, + { + "epoch": 0.37196670728232734, + "grad_norm": 0.259765625, + "learning_rate": 3.7118467641756705e-05, + "loss": 2.7786, + "num_input_tokens_seen": 4021288960, + "step": 7670 + }, + { + "epoch": 0.37220918883857396, + "grad_norm": 0.25390625, + "learning_rate": 3.710092952979647e-05, + "loss": 2.7713, + "num_input_tokens_seen": 4023910400, + "step": 7675 + }, + { + "epoch": 0.3724516703948206, + "grad_norm": 0.2392578125, + "learning_rate": 3.708338363686285e-05, + "loss": 2.778, + "num_input_tokens_seen": 4026531840, + "step": 7680 + }, + { + "epoch": 0.3726941519510672, + "grad_norm": 0.25390625, + "learning_rate": 3.706582997423794e-05, + "loss": 2.7676, + "num_input_tokens_seen": 4029153280, + "step": 7685 + }, + { + "epoch": 0.3729366335073139, + "grad_norm": 0.2451171875, + "learning_rate": 3.704826855320889e-05, + "loss": 2.7797, + "num_input_tokens_seen": 4031774720, + "step": 7690 + }, + { + "epoch": 0.3731791150635605, + "grad_norm": 0.259765625, + "learning_rate": 3.703069938506778e-05, + "loss": 2.7757, + "num_input_tokens_seen": 4034396160, + "step": 7695 + }, + { + "epoch": 0.3734215966198071, + "grad_norm": 0.2578125, + "learning_rate": 3.70131224811117e-05, + "loss": 2.7846, + "num_input_tokens_seen": 4037017600, + "step": 7700 + }, + { + "epoch": 0.37366407817605374, + "grad_norm": 0.25390625, + "learning_rate": 3.6995537852642714e-05, + "loss": 2.7727, + "num_input_tokens_seen": 4039639040, + "step": 7705 + }, + { + "epoch": 0.37390655973230036, + "grad_norm": 0.251953125, + "learning_rate": 3.697794551096784e-05, + "loss": 2.777, + "num_input_tokens_seen": 4042260480, + "step": 7710 + }, + { + "epoch": 0.374149041288547, + "grad_norm": 0.25390625, + "learning_rate": 3.696034546739907e-05, + "loss": 2.774, + "num_input_tokens_seen": 4044881920, + "step": 7715 + }, + { + "epoch": 0.3743915228447936, + "grad_norm": 0.26171875, + "learning_rate": 3.6942737733253345e-05, + "loss": 2.7813, + "num_input_tokens_seen": 4047503360, + "step": 7720 + }, + { + "epoch": 0.3746340044010402, + "grad_norm": 0.25390625, + "learning_rate": 3.6925122319852546e-05, + "loss": 2.7747, + "num_input_tokens_seen": 4050124800, + "step": 7725 + }, + { + "epoch": 0.3748764859572869, + "grad_norm": 0.259765625, + "learning_rate": 3.690749923852349e-05, + "loss": 2.7741, + "num_input_tokens_seen": 4052746240, + "step": 7730 + }, + { + "epoch": 0.3751189675135335, + "grad_norm": 0.25, + "learning_rate": 3.688986850059792e-05, + "loss": 2.7742, + "num_input_tokens_seen": 4055367680, + "step": 7735 + }, + { + "epoch": 0.37536144906978014, + "grad_norm": 0.2470703125, + "learning_rate": 3.6872230117412534e-05, + "loss": 2.7745, + "num_input_tokens_seen": 4057989120, + "step": 7740 + }, + { + "epoch": 0.37560393062602676, + "grad_norm": 0.26953125, + "learning_rate": 3.68545841003089e-05, + "loss": 2.7821, + "num_input_tokens_seen": 4060610560, + "step": 7745 + }, + { + "epoch": 0.3758464121822734, + "grad_norm": 0.25, + "learning_rate": 3.6836930460633534e-05, + "loss": 2.7731, + "num_input_tokens_seen": 4063232000, + "step": 7750 + }, + { + "epoch": 0.37608889373852, + "grad_norm": 0.25, + "learning_rate": 3.6819269209737836e-05, + "loss": 2.7888, + "num_input_tokens_seen": 4065853440, + "step": 7755 + }, + { + "epoch": 0.3763313752947666, + "grad_norm": 0.267578125, + "learning_rate": 3.680160035897809e-05, + "loss": 2.7733, + "num_input_tokens_seen": 4068474880, + "step": 7760 + }, + { + "epoch": 0.37657385685101324, + "grad_norm": 0.2578125, + "learning_rate": 3.678392391971548e-05, + "loss": 2.7787, + "num_input_tokens_seen": 4071096320, + "step": 7765 + }, + { + "epoch": 0.3768163384072599, + "grad_norm": 0.2578125, + "learning_rate": 3.676623990331607e-05, + "loss": 2.7751, + "num_input_tokens_seen": 4073717760, + "step": 7770 + }, + { + "epoch": 0.37705881996350654, + "grad_norm": 0.251953125, + "learning_rate": 3.674854832115079e-05, + "loss": 2.7868, + "num_input_tokens_seen": 4076339200, + "step": 7775 + }, + { + "epoch": 0.37730130151975316, + "grad_norm": 0.248046875, + "learning_rate": 3.673084918459544e-05, + "loss": 2.7887, + "num_input_tokens_seen": 4078960640, + "step": 7780 + }, + { + "epoch": 0.3775437830759998, + "grad_norm": 0.265625, + "learning_rate": 3.671314250503068e-05, + "loss": 2.7777, + "num_input_tokens_seen": 4081582080, + "step": 7785 + }, + { + "epoch": 0.3777862646322464, + "grad_norm": 0.251953125, + "learning_rate": 3.669542829384201e-05, + "loss": 2.7748, + "num_input_tokens_seen": 4084203520, + "step": 7790 + }, + { + "epoch": 0.378028746188493, + "grad_norm": 0.2578125, + "learning_rate": 3.6677706562419786e-05, + "loss": 2.7666, + "num_input_tokens_seen": 4086824960, + "step": 7795 + }, + { + "epoch": 0.37827122774473965, + "grad_norm": 0.2578125, + "learning_rate": 3.6659977322159185e-05, + "loss": 2.7655, + "num_input_tokens_seen": 4089446400, + "step": 7800 + }, + { + "epoch": 0.37827122774473965, + "eval_accuracy": 0.4553134668620746, + "eval_loss": 2.7454605102539062, + "eval_runtime": 6.4652, + "eval_samples_per_second": 46.402, + "eval_steps_per_second": 5.878, + "num_input_tokens_seen": 4089446400, + "step": 7800 + }, + { + "epoch": 0.37851370930098627, + "grad_norm": 0.25, + "learning_rate": 3.664224058446022e-05, + "loss": 2.7846, + "num_input_tokens_seen": 4092067840, + "step": 7805 + }, + { + "epoch": 0.37875619085723294, + "grad_norm": 0.26953125, + "learning_rate": 3.662449636072772e-05, + "loss": 2.7803, + "num_input_tokens_seen": 4094689280, + "step": 7810 + }, + { + "epoch": 0.37899867241347956, + "grad_norm": 0.25390625, + "learning_rate": 3.660674466237134e-05, + "loss": 2.7708, + "num_input_tokens_seen": 4097310720, + "step": 7815 + }, + { + "epoch": 0.3792411539697262, + "grad_norm": 0.251953125, + "learning_rate": 3.658898550080554e-05, + "loss": 2.7793, + "num_input_tokens_seen": 4099932160, + "step": 7820 + }, + { + "epoch": 0.3794836355259728, + "grad_norm": 0.2578125, + "learning_rate": 3.657121888744955e-05, + "loss": 2.7767, + "num_input_tokens_seen": 4102553600, + "step": 7825 + }, + { + "epoch": 0.3797261170822194, + "grad_norm": 0.265625, + "learning_rate": 3.655344483372743e-05, + "loss": 2.7808, + "num_input_tokens_seen": 4105175040, + "step": 7830 + }, + { + "epoch": 0.37996859863846605, + "grad_norm": 0.25, + "learning_rate": 3.6535663351068006e-05, + "loss": 2.7628, + "num_input_tokens_seen": 4107796480, + "step": 7835 + }, + { + "epoch": 0.38021108019471267, + "grad_norm": 0.251953125, + "learning_rate": 3.6517874450904885e-05, + "loss": 2.7667, + "num_input_tokens_seen": 4110417920, + "step": 7840 + }, + { + "epoch": 0.3804535617509593, + "grad_norm": 0.248046875, + "learning_rate": 3.6500078144676425e-05, + "loss": 2.7799, + "num_input_tokens_seen": 4113039360, + "step": 7845 + }, + { + "epoch": 0.38069604330720597, + "grad_norm": 0.251953125, + "learning_rate": 3.648227444382578e-05, + "loss": 2.7923, + "num_input_tokens_seen": 4115660800, + "step": 7850 + }, + { + "epoch": 0.3809385248634526, + "grad_norm": 0.25, + "learning_rate": 3.6464463359800834e-05, + "loss": 2.7731, + "num_input_tokens_seen": 4118282240, + "step": 7855 + }, + { + "epoch": 0.3811810064196992, + "grad_norm": 0.248046875, + "learning_rate": 3.644664490405422e-05, + "loss": 2.787, + "num_input_tokens_seen": 4120903680, + "step": 7860 + }, + { + "epoch": 0.38142348797594583, + "grad_norm": 0.251953125, + "learning_rate": 3.642881908804334e-05, + "loss": 2.7851, + "num_input_tokens_seen": 4123525120, + "step": 7865 + }, + { + "epoch": 0.38166596953219245, + "grad_norm": 0.251953125, + "learning_rate": 3.641098592323027e-05, + "loss": 2.7845, + "num_input_tokens_seen": 4126146560, + "step": 7870 + }, + { + "epoch": 0.38190845108843907, + "grad_norm": 0.251953125, + "learning_rate": 3.639314542108187e-05, + "loss": 2.7905, + "num_input_tokens_seen": 4128768000, + "step": 7875 + }, + { + "epoch": 0.3821509326446857, + "grad_norm": 0.248046875, + "learning_rate": 3.637529759306969e-05, + "loss": 2.7899, + "num_input_tokens_seen": 4131389440, + "step": 7880 + }, + { + "epoch": 0.38239341420093237, + "grad_norm": 0.25390625, + "learning_rate": 3.635744245066999e-05, + "loss": 2.7767, + "num_input_tokens_seen": 4134010880, + "step": 7885 + }, + { + "epoch": 0.382635895757179, + "grad_norm": 0.259765625, + "learning_rate": 3.633958000536375e-05, + "loss": 2.7913, + "num_input_tokens_seen": 4136632320, + "step": 7890 + }, + { + "epoch": 0.3828783773134256, + "grad_norm": 0.25, + "learning_rate": 3.6321710268636623e-05, + "loss": 2.7836, + "num_input_tokens_seen": 4139253760, + "step": 7895 + }, + { + "epoch": 0.38312085886967223, + "grad_norm": 0.2470703125, + "learning_rate": 3.6303833251978966e-05, + "loss": 2.7747, + "num_input_tokens_seen": 4141875200, + "step": 7900 + }, + { + "epoch": 0.38336334042591885, + "grad_norm": 0.255859375, + "learning_rate": 3.62859489668858e-05, + "loss": 2.7795, + "num_input_tokens_seen": 4144496640, + "step": 7905 + }, + { + "epoch": 0.3836058219821655, + "grad_norm": 0.26953125, + "learning_rate": 3.626805742485686e-05, + "loss": 2.7651, + "num_input_tokens_seen": 4147118080, + "step": 7910 + }, + { + "epoch": 0.3838483035384121, + "grad_norm": 0.259765625, + "learning_rate": 3.625015863739649e-05, + "loss": 2.7785, + "num_input_tokens_seen": 4149739520, + "step": 7915 + }, + { + "epoch": 0.3840907850946587, + "grad_norm": 0.255859375, + "learning_rate": 3.623225261601375e-05, + "loss": 2.7778, + "num_input_tokens_seen": 4152360960, + "step": 7920 + }, + { + "epoch": 0.3843332666509054, + "grad_norm": 0.251953125, + "learning_rate": 3.62143393722223e-05, + "loss": 2.8004, + "num_input_tokens_seen": 4154982400, + "step": 7925 + }, + { + "epoch": 0.384575748207152, + "grad_norm": 0.265625, + "learning_rate": 3.619641891754048e-05, + "loss": 2.7604, + "num_input_tokens_seen": 4157603840, + "step": 7930 + }, + { + "epoch": 0.38481822976339863, + "grad_norm": 0.263671875, + "learning_rate": 3.6178491263491236e-05, + "loss": 2.7872, + "num_input_tokens_seen": 4160225280, + "step": 7935 + }, + { + "epoch": 0.38506071131964525, + "grad_norm": 0.2412109375, + "learning_rate": 3.61605564216022e-05, + "loss": 2.7679, + "num_input_tokens_seen": 4162846720, + "step": 7940 + }, + { + "epoch": 0.3853031928758919, + "grad_norm": 0.259765625, + "learning_rate": 3.6142614403405553e-05, + "loss": 2.7745, + "num_input_tokens_seen": 4165468160, + "step": 7945 + }, + { + "epoch": 0.3855456744321385, + "grad_norm": 0.251953125, + "learning_rate": 3.612466522043813e-05, + "loss": 2.7776, + "num_input_tokens_seen": 4168089600, + "step": 7950 + }, + { + "epoch": 0.3857881559883851, + "grad_norm": 0.25, + "learning_rate": 3.610670888424139e-05, + "loss": 2.7669, + "num_input_tokens_seen": 4170711040, + "step": 7955 + }, + { + "epoch": 0.38603063754463174, + "grad_norm": 0.25, + "learning_rate": 3.608874540636134e-05, + "loss": 2.7912, + "num_input_tokens_seen": 4173332480, + "step": 7960 + }, + { + "epoch": 0.3862731191008784, + "grad_norm": 0.2490234375, + "learning_rate": 3.607077479834863e-05, + "loss": 2.7734, + "num_input_tokens_seen": 4175953920, + "step": 7965 + }, + { + "epoch": 0.38651560065712504, + "grad_norm": 0.26171875, + "learning_rate": 3.605279707175846e-05, + "loss": 2.7793, + "num_input_tokens_seen": 4178575360, + "step": 7970 + }, + { + "epoch": 0.38675808221337166, + "grad_norm": 0.24609375, + "learning_rate": 3.603481223815064e-05, + "loss": 2.7734, + "num_input_tokens_seen": 4181196800, + "step": 7975 + }, + { + "epoch": 0.3870005637696183, + "grad_norm": 0.26171875, + "learning_rate": 3.6016820309089504e-05, + "loss": 2.7832, + "num_input_tokens_seen": 4183818240, + "step": 7980 + }, + { + "epoch": 0.3872430453258649, + "grad_norm": 0.248046875, + "learning_rate": 3.599882129614399e-05, + "loss": 2.7774, + "num_input_tokens_seen": 4186439680, + "step": 7985 + }, + { + "epoch": 0.3874855268821115, + "grad_norm": 0.25, + "learning_rate": 3.598081521088758e-05, + "loss": 2.7679, + "num_input_tokens_seen": 4189061120, + "step": 7990 + }, + { + "epoch": 0.38772800843835814, + "grad_norm": 0.2431640625, + "learning_rate": 3.5962802064898295e-05, + "loss": 2.7803, + "num_input_tokens_seen": 4191682560, + "step": 7995 + }, + { + "epoch": 0.38797048999460476, + "grad_norm": 0.25, + "learning_rate": 3.5944781869758695e-05, + "loss": 2.7687, + "num_input_tokens_seen": 4194304000, + "step": 8000 + }, + { + "epoch": 0.38821297155085144, + "grad_norm": 0.251953125, + "learning_rate": 3.592675463705588e-05, + "loss": 2.7724, + "num_input_tokens_seen": 4196925440, + "step": 8005 + }, + { + "epoch": 0.38845545310709806, + "grad_norm": 0.265625, + "learning_rate": 3.5908720378381475e-05, + "loss": 2.7742, + "num_input_tokens_seen": 4199546880, + "step": 8010 + }, + { + "epoch": 0.3886979346633447, + "grad_norm": 0.25390625, + "learning_rate": 3.5890679105331624e-05, + "loss": 2.7902, + "num_input_tokens_seen": 4202168320, + "step": 8015 + }, + { + "epoch": 0.3889404162195913, + "grad_norm": 0.2578125, + "learning_rate": 3.587263082950698e-05, + "loss": 2.7769, + "num_input_tokens_seen": 4204789760, + "step": 8020 + }, + { + "epoch": 0.3891828977758379, + "grad_norm": 0.255859375, + "learning_rate": 3.5854575562512695e-05, + "loss": 2.7873, + "num_input_tokens_seen": 4207411200, + "step": 8025 + }, + { + "epoch": 0.38942537933208454, + "grad_norm": 0.25390625, + "learning_rate": 3.583651331595841e-05, + "loss": 2.776, + "num_input_tokens_seen": 4210032640, + "step": 8030 + }, + { + "epoch": 0.38966786088833116, + "grad_norm": 0.255859375, + "learning_rate": 3.581844410145827e-05, + "loss": 2.7833, + "num_input_tokens_seen": 4212654080, + "step": 8035 + }, + { + "epoch": 0.3899103424445778, + "grad_norm": 0.2431640625, + "learning_rate": 3.5800367930630905e-05, + "loss": 2.7792, + "num_input_tokens_seen": 4215275520, + "step": 8040 + }, + { + "epoch": 0.39015282400082446, + "grad_norm": 0.251953125, + "learning_rate": 3.578228481509938e-05, + "loss": 2.7588, + "num_input_tokens_seen": 4217896960, + "step": 8045 + }, + { + "epoch": 0.3903953055570711, + "grad_norm": 0.2431640625, + "learning_rate": 3.5764194766491263e-05, + "loss": 2.7772, + "num_input_tokens_seen": 4220518400, + "step": 8050 + }, + { + "epoch": 0.3906377871133177, + "grad_norm": 0.263671875, + "learning_rate": 3.574609779643858e-05, + "loss": 2.7874, + "num_input_tokens_seen": 4223139840, + "step": 8055 + }, + { + "epoch": 0.3908802686695643, + "grad_norm": 0.2490234375, + "learning_rate": 3.572799391657778e-05, + "loss": 2.7831, + "num_input_tokens_seen": 4225761280, + "step": 8060 + }, + { + "epoch": 0.39112275022581094, + "grad_norm": 0.2578125, + "learning_rate": 3.570988313854979e-05, + "loss": 2.7774, + "num_input_tokens_seen": 4228382720, + "step": 8065 + }, + { + "epoch": 0.39136523178205757, + "grad_norm": 0.2578125, + "learning_rate": 3.569176547399993e-05, + "loss": 2.7759, + "num_input_tokens_seen": 4231004160, + "step": 8070 + }, + { + "epoch": 0.3916077133383042, + "grad_norm": 0.25, + "learning_rate": 3.5673640934577976e-05, + "loss": 2.7794, + "num_input_tokens_seen": 4233625600, + "step": 8075 + }, + { + "epoch": 0.3918501948945508, + "grad_norm": 0.25390625, + "learning_rate": 3.565550953193814e-05, + "loss": 2.7808, + "num_input_tokens_seen": 4236247040, + "step": 8080 + }, + { + "epoch": 0.3920926764507975, + "grad_norm": 0.2490234375, + "learning_rate": 3.5637371277739006e-05, + "loss": 2.7834, + "num_input_tokens_seen": 4238868480, + "step": 8085 + }, + { + "epoch": 0.3923351580070441, + "grad_norm": 0.251953125, + "learning_rate": 3.56192261836436e-05, + "loss": 2.7626, + "num_input_tokens_seen": 4241489920, + "step": 8090 + }, + { + "epoch": 0.3925776395632907, + "grad_norm": 0.2451171875, + "learning_rate": 3.560107426131932e-05, + "loss": 2.776, + "num_input_tokens_seen": 4244111360, + "step": 8095 + }, + { + "epoch": 0.39282012111953735, + "grad_norm": 0.251953125, + "learning_rate": 3.558291552243798e-05, + "loss": 2.7943, + "num_input_tokens_seen": 4246732800, + "step": 8100 + }, + { + "epoch": 0.39282012111953735, + "eval_accuracy": 0.45537208923628075, + "eval_loss": 2.7449140548706055, + "eval_runtime": 5.8144, + "eval_samples_per_second": 51.596, + "eval_steps_per_second": 6.536, + "num_input_tokens_seen": 4246732800, + "step": 8100 + }, + { + "epoch": 0.39306260267578397, + "grad_norm": 0.2421875, + "learning_rate": 3.5564749978675734e-05, + "loss": 2.762, + "num_input_tokens_seen": 4249354240, + "step": 8105 + }, + { + "epoch": 0.3933050842320306, + "grad_norm": 0.2578125, + "learning_rate": 3.554657764171317e-05, + "loss": 2.7846, + "num_input_tokens_seen": 4251975680, + "step": 8110 + }, + { + "epoch": 0.3935475657882772, + "grad_norm": 0.251953125, + "learning_rate": 3.5528398523235194e-05, + "loss": 2.7749, + "num_input_tokens_seen": 4254597120, + "step": 8115 + }, + { + "epoch": 0.39379004734452383, + "grad_norm": 0.255859375, + "learning_rate": 3.551021263493111e-05, + "loss": 2.7709, + "num_input_tokens_seen": 4257218560, + "step": 8120 + }, + { + "epoch": 0.3940325289007705, + "grad_norm": 0.25390625, + "learning_rate": 3.5492019988494555e-05, + "loss": 2.7745, + "num_input_tokens_seen": 4259840000, + "step": 8125 + }, + { + "epoch": 0.3942750104570171, + "grad_norm": 0.251953125, + "learning_rate": 3.54738205956235e-05, + "loss": 2.7728, + "num_input_tokens_seen": 4262461440, + "step": 8130 + }, + { + "epoch": 0.39451749201326375, + "grad_norm": 0.259765625, + "learning_rate": 3.545561446802028e-05, + "loss": 2.7631, + "num_input_tokens_seen": 4265082880, + "step": 8135 + }, + { + "epoch": 0.39475997356951037, + "grad_norm": 0.25, + "learning_rate": 3.5437401617391544e-05, + "loss": 2.7748, + "num_input_tokens_seen": 4267704320, + "step": 8140 + }, + { + "epoch": 0.395002455125757, + "grad_norm": 0.26171875, + "learning_rate": 3.5419182055448287e-05, + "loss": 2.7731, + "num_input_tokens_seen": 4270325760, + "step": 8145 + }, + { + "epoch": 0.3952449366820036, + "grad_norm": 0.25, + "learning_rate": 3.540095579390577e-05, + "loss": 2.7674, + "num_input_tokens_seen": 4272947200, + "step": 8150 + }, + { + "epoch": 0.39548741823825023, + "grad_norm": 0.2451171875, + "learning_rate": 3.538272284448362e-05, + "loss": 2.7688, + "num_input_tokens_seen": 4275568640, + "step": 8155 + }, + { + "epoch": 0.3957298997944969, + "grad_norm": 0.2470703125, + "learning_rate": 3.5364483218905714e-05, + "loss": 2.7853, + "num_input_tokens_seen": 4278190080, + "step": 8160 + }, + { + "epoch": 0.39597238135074353, + "grad_norm": 0.2578125, + "learning_rate": 3.534623692890027e-05, + "loss": 2.7762, + "num_input_tokens_seen": 4280811520, + "step": 8165 + }, + { + "epoch": 0.39621486290699015, + "grad_norm": 0.251953125, + "learning_rate": 3.532798398619975e-05, + "loss": 2.7617, + "num_input_tokens_seen": 4283432960, + "step": 8170 + }, + { + "epoch": 0.39645734446323677, + "grad_norm": 0.25, + "learning_rate": 3.530972440254092e-05, + "loss": 2.7788, + "num_input_tokens_seen": 4286054400, + "step": 8175 + }, + { + "epoch": 0.3966998260194834, + "grad_norm": 0.26171875, + "learning_rate": 3.5291458189664796e-05, + "loss": 2.7804, + "num_input_tokens_seen": 4288675840, + "step": 8180 + }, + { + "epoch": 0.39694230757573, + "grad_norm": 0.26171875, + "learning_rate": 3.527318535931667e-05, + "loss": 2.8013, + "num_input_tokens_seen": 4291297280, + "step": 8185 + }, + { + "epoch": 0.39718478913197663, + "grad_norm": 0.248046875, + "learning_rate": 3.525490592324609e-05, + "loss": 2.7848, + "num_input_tokens_seen": 4293918720, + "step": 8190 + }, + { + "epoch": 0.39742727068822326, + "grad_norm": 0.2470703125, + "learning_rate": 3.5236619893206854e-05, + "loss": 2.8064, + "num_input_tokens_seen": 4296540160, + "step": 8195 + }, + { + "epoch": 0.39766975224446993, + "grad_norm": 0.2412109375, + "learning_rate": 3.5218327280956975e-05, + "loss": 2.7625, + "num_input_tokens_seen": 4299161600, + "step": 8200 + }, + { + "epoch": 0.39791223380071655, + "grad_norm": 0.255859375, + "learning_rate": 3.520002809825874e-05, + "loss": 2.7813, + "num_input_tokens_seen": 4301783040, + "step": 8205 + }, + { + "epoch": 0.3981547153569632, + "grad_norm": 0.2578125, + "learning_rate": 3.518172235687862e-05, + "loss": 2.7884, + "num_input_tokens_seen": 4304404480, + "step": 8210 + }, + { + "epoch": 0.3983971969132098, + "grad_norm": 0.259765625, + "learning_rate": 3.516341006858733e-05, + "loss": 2.7766, + "num_input_tokens_seen": 4307025920, + "step": 8215 + }, + { + "epoch": 0.3986396784694564, + "grad_norm": 0.248046875, + "learning_rate": 3.514509124515979e-05, + "loss": 2.7809, + "num_input_tokens_seen": 4309647360, + "step": 8220 + }, + { + "epoch": 0.39888216002570304, + "grad_norm": 0.248046875, + "learning_rate": 3.5126765898375105e-05, + "loss": 2.7709, + "num_input_tokens_seen": 4312268800, + "step": 8225 + }, + { + "epoch": 0.39912464158194966, + "grad_norm": 0.251953125, + "learning_rate": 3.510843404001659e-05, + "loss": 2.7879, + "num_input_tokens_seen": 4314890240, + "step": 8230 + }, + { + "epoch": 0.3993671231381963, + "grad_norm": 0.259765625, + "learning_rate": 3.509009568187176e-05, + "loss": 2.7797, + "num_input_tokens_seen": 4317511680, + "step": 8235 + }, + { + "epoch": 0.39960960469444295, + "grad_norm": 0.251953125, + "learning_rate": 3.5071750835732276e-05, + "loss": 2.7759, + "num_input_tokens_seen": 4320133120, + "step": 8240 + }, + { + "epoch": 0.3998520862506896, + "grad_norm": 0.2578125, + "learning_rate": 3.505339951339399e-05, + "loss": 2.7877, + "num_input_tokens_seen": 4322754560, + "step": 8245 + }, + { + "epoch": 0.4000945678069362, + "grad_norm": 0.26171875, + "learning_rate": 3.503504172665694e-05, + "loss": 2.7775, + "num_input_tokens_seen": 4325376000, + "step": 8250 + }, + { + "epoch": 0.4003370493631828, + "grad_norm": 0.24609375, + "learning_rate": 3.5016677487325265e-05, + "loss": 2.7786, + "num_input_tokens_seen": 4327997440, + "step": 8255 + }, + { + "epoch": 0.40057953091942944, + "grad_norm": 0.255859375, + "learning_rate": 3.499830680720731e-05, + "loss": 2.7727, + "num_input_tokens_seen": 4330618880, + "step": 8260 + }, + { + "epoch": 0.40082201247567606, + "grad_norm": 0.251953125, + "learning_rate": 3.497992969811553e-05, + "loss": 2.7847, + "num_input_tokens_seen": 4333240320, + "step": 8265 + }, + { + "epoch": 0.4010644940319227, + "grad_norm": 0.25390625, + "learning_rate": 3.496154617186651e-05, + "loss": 2.7776, + "num_input_tokens_seen": 4335861760, + "step": 8270 + }, + { + "epoch": 0.4013069755881693, + "grad_norm": 0.26171875, + "learning_rate": 3.494315624028098e-05, + "loss": 2.761, + "num_input_tokens_seen": 4338483200, + "step": 8275 + }, + { + "epoch": 0.401549457144416, + "grad_norm": 0.25390625, + "learning_rate": 3.49247599151838e-05, + "loss": 2.7779, + "num_input_tokens_seen": 4341104640, + "step": 8280 + }, + { + "epoch": 0.4017919387006626, + "grad_norm": 0.259765625, + "learning_rate": 3.4906357208403896e-05, + "loss": 2.7821, + "num_input_tokens_seen": 4343726080, + "step": 8285 + }, + { + "epoch": 0.4020344202569092, + "grad_norm": 0.248046875, + "learning_rate": 3.488794813177433e-05, + "loss": 2.7739, + "num_input_tokens_seen": 4346347520, + "step": 8290 + }, + { + "epoch": 0.40227690181315584, + "grad_norm": 0.2490234375, + "learning_rate": 3.486953269713226e-05, + "loss": 2.78, + "num_input_tokens_seen": 4348968960, + "step": 8295 + }, + { + "epoch": 0.40251938336940246, + "grad_norm": 0.2490234375, + "learning_rate": 3.4851110916318924e-05, + "loss": 2.7706, + "num_input_tokens_seen": 4351590400, + "step": 8300 + }, + { + "epoch": 0.4027618649256491, + "grad_norm": 0.248046875, + "learning_rate": 3.483268280117964e-05, + "loss": 2.7744, + "num_input_tokens_seen": 4354211840, + "step": 8305 + }, + { + "epoch": 0.4030043464818957, + "grad_norm": 0.255859375, + "learning_rate": 3.4814248363563794e-05, + "loss": 2.7724, + "num_input_tokens_seen": 4356833280, + "step": 8310 + }, + { + "epoch": 0.4032468280381423, + "grad_norm": 0.263671875, + "learning_rate": 3.4795807615324864e-05, + "loss": 2.7788, + "num_input_tokens_seen": 4359454720, + "step": 8315 + }, + { + "epoch": 0.403489309594389, + "grad_norm": 0.265625, + "learning_rate": 3.477736056832035e-05, + "loss": 2.7863, + "num_input_tokens_seen": 4362076160, + "step": 8320 + }, + { + "epoch": 0.4037317911506356, + "grad_norm": 0.263671875, + "learning_rate": 3.4758907234411824e-05, + "loss": 2.7689, + "num_input_tokens_seen": 4364697600, + "step": 8325 + }, + { + "epoch": 0.40397427270688224, + "grad_norm": 0.255859375, + "learning_rate": 3.474044762546489e-05, + "loss": 2.7864, + "num_input_tokens_seen": 4367319040, + "step": 8330 + }, + { + "epoch": 0.40421675426312886, + "grad_norm": 0.244140625, + "learning_rate": 3.4721981753349205e-05, + "loss": 2.7751, + "num_input_tokens_seen": 4369940480, + "step": 8335 + }, + { + "epoch": 0.4044592358193755, + "grad_norm": 0.248046875, + "learning_rate": 3.4703509629938425e-05, + "loss": 2.7779, + "num_input_tokens_seen": 4372561920, + "step": 8340 + }, + { + "epoch": 0.4047017173756221, + "grad_norm": 0.2451171875, + "learning_rate": 3.468503126711025e-05, + "loss": 2.7835, + "num_input_tokens_seen": 4375183360, + "step": 8345 + }, + { + "epoch": 0.4049441989318687, + "grad_norm": 0.25, + "learning_rate": 3.466654667674638e-05, + "loss": 2.7711, + "num_input_tokens_seen": 4377804800, + "step": 8350 + }, + { + "epoch": 0.40518668048811535, + "grad_norm": 0.263671875, + "learning_rate": 3.4648055870732524e-05, + "loss": 2.7625, + "num_input_tokens_seen": 4380426240, + "step": 8355 + }, + { + "epoch": 0.405429162044362, + "grad_norm": 0.25, + "learning_rate": 3.462955886095839e-05, + "loss": 2.7825, + "num_input_tokens_seen": 4383047680, + "step": 8360 + }, + { + "epoch": 0.40567164360060864, + "grad_norm": 0.26171875, + "learning_rate": 3.461105565931766e-05, + "loss": 2.7853, + "num_input_tokens_seen": 4385669120, + "step": 8365 + }, + { + "epoch": 0.40591412515685527, + "grad_norm": 0.25390625, + "learning_rate": 3.4592546277708016e-05, + "loss": 2.7774, + "num_input_tokens_seen": 4388290560, + "step": 8370 + }, + { + "epoch": 0.4061566067131019, + "grad_norm": 0.244140625, + "learning_rate": 3.4574030728031107e-05, + "loss": 2.7627, + "num_input_tokens_seen": 4390912000, + "step": 8375 + }, + { + "epoch": 0.4063990882693485, + "grad_norm": 0.255859375, + "learning_rate": 3.455550902219254e-05, + "loss": 2.7861, + "num_input_tokens_seen": 4393533440, + "step": 8380 + }, + { + "epoch": 0.40664156982559513, + "grad_norm": 0.26171875, + "learning_rate": 3.4536981172101895e-05, + "loss": 2.7708, + "num_input_tokens_seen": 4396154880, + "step": 8385 + }, + { + "epoch": 0.40688405138184175, + "grad_norm": 0.255859375, + "learning_rate": 3.451844718967269e-05, + "loss": 2.7833, + "num_input_tokens_seen": 4398776320, + "step": 8390 + }, + { + "epoch": 0.40712653293808837, + "grad_norm": 0.251953125, + "learning_rate": 3.44999070868224e-05, + "loss": 2.764, + "num_input_tokens_seen": 4401397760, + "step": 8395 + }, + { + "epoch": 0.40736901449433505, + "grad_norm": 0.2490234375, + "learning_rate": 3.448136087547242e-05, + "loss": 2.7715, + "num_input_tokens_seen": 4404019200, + "step": 8400 + }, + { + "epoch": 0.40736901449433505, + "eval_accuracy": 0.4552369320957499, + "eval_loss": 2.744718313217163, + "eval_runtime": 5.8034, + "eval_samples_per_second": 51.694, + "eval_steps_per_second": 6.548, + "num_input_tokens_seen": 4404019200, + "step": 8400 + }, + { + "epoch": 0.40761149605058167, + "grad_norm": 0.263671875, + "learning_rate": 3.4462808567548084e-05, + "loss": 2.7877, + "num_input_tokens_seen": 4406640640, + "step": 8405 + }, + { + "epoch": 0.4078539776068283, + "grad_norm": 0.2578125, + "learning_rate": 3.444425017497864e-05, + "loss": 2.7712, + "num_input_tokens_seen": 4409262080, + "step": 8410 + }, + { + "epoch": 0.4080964591630749, + "grad_norm": 0.2451171875, + "learning_rate": 3.442568570969724e-05, + "loss": 2.771, + "num_input_tokens_seen": 4411883520, + "step": 8415 + }, + { + "epoch": 0.40833894071932153, + "grad_norm": 0.25390625, + "learning_rate": 3.440711518364097e-05, + "loss": 2.767, + "num_input_tokens_seen": 4414504960, + "step": 8420 + }, + { + "epoch": 0.40858142227556815, + "grad_norm": 0.255859375, + "learning_rate": 3.4388538608750784e-05, + "loss": 2.7818, + "num_input_tokens_seen": 4417126400, + "step": 8425 + }, + { + "epoch": 0.40882390383181477, + "grad_norm": 0.25, + "learning_rate": 3.4369955996971536e-05, + "loss": 2.7995, + "num_input_tokens_seen": 4419747840, + "step": 8430 + }, + { + "epoch": 0.4090663853880614, + "grad_norm": 0.2578125, + "learning_rate": 3.435136736025198e-05, + "loss": 2.78, + "num_input_tokens_seen": 4422369280, + "step": 8435 + }, + { + "epoch": 0.40930886694430807, + "grad_norm": 0.2451171875, + "learning_rate": 3.433277271054469e-05, + "loss": 2.773, + "num_input_tokens_seen": 4424990720, + "step": 8440 + }, + { + "epoch": 0.4095513485005547, + "grad_norm": 0.2490234375, + "learning_rate": 3.431417205980616e-05, + "loss": 2.7688, + "num_input_tokens_seen": 4427612160, + "step": 8445 + }, + { + "epoch": 0.4097938300568013, + "grad_norm": 0.26171875, + "learning_rate": 3.4295565419996735e-05, + "loss": 2.7796, + "num_input_tokens_seen": 4430233600, + "step": 8450 + }, + { + "epoch": 0.41003631161304793, + "grad_norm": 0.2578125, + "learning_rate": 3.42769528030806e-05, + "loss": 2.7825, + "num_input_tokens_seen": 4432855040, + "step": 8455 + }, + { + "epoch": 0.41027879316929455, + "grad_norm": 0.251953125, + "learning_rate": 3.425833422102576e-05, + "loss": 2.7858, + "num_input_tokens_seen": 4435476480, + "step": 8460 + }, + { + "epoch": 0.4105212747255412, + "grad_norm": 0.255859375, + "learning_rate": 3.42397096858041e-05, + "loss": 2.7787, + "num_input_tokens_seen": 4438097920, + "step": 8465 + }, + { + "epoch": 0.4107637562817878, + "grad_norm": 0.25390625, + "learning_rate": 3.4221079209391314e-05, + "loss": 2.7618, + "num_input_tokens_seen": 4440719360, + "step": 8470 + }, + { + "epoch": 0.41100623783803447, + "grad_norm": 0.2578125, + "learning_rate": 3.420244280376691e-05, + "loss": 2.7701, + "num_input_tokens_seen": 4443340800, + "step": 8475 + }, + { + "epoch": 0.4112487193942811, + "grad_norm": 0.26171875, + "learning_rate": 3.418380048091421e-05, + "loss": 2.7797, + "num_input_tokens_seen": 4445962240, + "step": 8480 + }, + { + "epoch": 0.4114912009505277, + "grad_norm": 0.255859375, + "learning_rate": 3.4165152252820346e-05, + "loss": 2.7982, + "num_input_tokens_seen": 4448583680, + "step": 8485 + }, + { + "epoch": 0.41173368250677433, + "grad_norm": 0.255859375, + "learning_rate": 3.414649813147625e-05, + "loss": 2.786, + "num_input_tokens_seen": 4451205120, + "step": 8490 + }, + { + "epoch": 0.41197616406302096, + "grad_norm": 0.259765625, + "learning_rate": 3.412783812887663e-05, + "loss": 2.7949, + "num_input_tokens_seen": 4453826560, + "step": 8495 + }, + { + "epoch": 0.4122186456192676, + "grad_norm": 0.255859375, + "learning_rate": 3.410917225701999e-05, + "loss": 2.775, + "num_input_tokens_seen": 4456448000, + "step": 8500 + }, + { + "epoch": 0.4124611271755142, + "grad_norm": 0.248046875, + "learning_rate": 3.4090500527908604e-05, + "loss": 2.7739, + "num_input_tokens_seen": 4459069440, + "step": 8505 + }, + { + "epoch": 0.4127036087317608, + "grad_norm": 0.255859375, + "learning_rate": 3.407182295354851e-05, + "loss": 2.7748, + "num_input_tokens_seen": 4461690880, + "step": 8510 + }, + { + "epoch": 0.4129460902880075, + "grad_norm": 0.2470703125, + "learning_rate": 3.4053139545949503e-05, + "loss": 2.7677, + "num_input_tokens_seen": 4464312320, + "step": 8515 + }, + { + "epoch": 0.4131885718442541, + "grad_norm": 0.240234375, + "learning_rate": 3.4034450317125135e-05, + "loss": 2.7693, + "num_input_tokens_seen": 4466933760, + "step": 8520 + }, + { + "epoch": 0.41343105340050074, + "grad_norm": 0.26171875, + "learning_rate": 3.4015755279092685e-05, + "loss": 2.7851, + "num_input_tokens_seen": 4469555200, + "step": 8525 + }, + { + "epoch": 0.41367353495674736, + "grad_norm": 0.26171875, + "learning_rate": 3.399705444387319e-05, + "loss": 2.7801, + "num_input_tokens_seen": 4472176640, + "step": 8530 + }, + { + "epoch": 0.413916016512994, + "grad_norm": 0.2470703125, + "learning_rate": 3.39783478234914e-05, + "loss": 2.7755, + "num_input_tokens_seen": 4474798080, + "step": 8535 + }, + { + "epoch": 0.4141584980692406, + "grad_norm": 0.255859375, + "learning_rate": 3.39596354299758e-05, + "loss": 2.7839, + "num_input_tokens_seen": 4477419520, + "step": 8540 + }, + { + "epoch": 0.4144009796254872, + "grad_norm": 0.259765625, + "learning_rate": 3.3940917275358565e-05, + "loss": 2.7853, + "num_input_tokens_seen": 4480040960, + "step": 8545 + }, + { + "epoch": 0.41464346118173384, + "grad_norm": 0.251953125, + "learning_rate": 3.392219337167559e-05, + "loss": 2.7964, + "num_input_tokens_seen": 4482662400, + "step": 8550 + }, + { + "epoch": 0.4148859427379805, + "grad_norm": 0.259765625, + "learning_rate": 3.390346373096645e-05, + "loss": 2.7604, + "num_input_tokens_seen": 4485283840, + "step": 8555 + }, + { + "epoch": 0.41512842429422714, + "grad_norm": 0.26953125, + "learning_rate": 3.3884728365274435e-05, + "loss": 2.779, + "num_input_tokens_seen": 4487905280, + "step": 8560 + }, + { + "epoch": 0.41537090585047376, + "grad_norm": 0.26171875, + "learning_rate": 3.38659872866465e-05, + "loss": 2.7849, + "num_input_tokens_seen": 4490526720, + "step": 8565 + }, + { + "epoch": 0.4156133874067204, + "grad_norm": 0.255859375, + "learning_rate": 3.384724050713327e-05, + "loss": 2.7886, + "num_input_tokens_seen": 4493148160, + "step": 8570 + }, + { + "epoch": 0.415855868962967, + "grad_norm": 0.251953125, + "learning_rate": 3.382848803878905e-05, + "loss": 2.777, + "num_input_tokens_seen": 4495769600, + "step": 8575 + }, + { + "epoch": 0.4160983505192136, + "grad_norm": 0.251953125, + "learning_rate": 3.3809729893671796e-05, + "loss": 2.7784, + "num_input_tokens_seen": 4498391040, + "step": 8580 + }, + { + "epoch": 0.41634083207546024, + "grad_norm": 0.2578125, + "learning_rate": 3.379096608384309e-05, + "loss": 2.7808, + "num_input_tokens_seen": 4501012480, + "step": 8585 + }, + { + "epoch": 0.41658331363170686, + "grad_norm": 0.25390625, + "learning_rate": 3.3772196621368216e-05, + "loss": 2.7823, + "num_input_tokens_seen": 4503633920, + "step": 8590 + }, + { + "epoch": 0.41682579518795354, + "grad_norm": 0.25, + "learning_rate": 3.375342151831603e-05, + "loss": 2.7768, + "num_input_tokens_seen": 4506255360, + "step": 8595 + }, + { + "epoch": 0.41706827674420016, + "grad_norm": 0.25390625, + "learning_rate": 3.3734640786759035e-05, + "loss": 2.7818, + "num_input_tokens_seen": 4508876800, + "step": 8600 + }, + { + "epoch": 0.4173107583004468, + "grad_norm": 0.25390625, + "learning_rate": 3.3715854438773374e-05, + "loss": 2.782, + "num_input_tokens_seen": 4511498240, + "step": 8605 + }, + { + "epoch": 0.4175532398566934, + "grad_norm": 0.25390625, + "learning_rate": 3.369706248643879e-05, + "loss": 2.7812, + "num_input_tokens_seen": 4514119680, + "step": 8610 + }, + { + "epoch": 0.41779572141294, + "grad_norm": 0.2470703125, + "learning_rate": 3.367826494183861e-05, + "loss": 2.7964, + "num_input_tokens_seen": 4516741120, + "step": 8615 + }, + { + "epoch": 0.41803820296918665, + "grad_norm": 0.25, + "learning_rate": 3.365946181705979e-05, + "loss": 2.8021, + "num_input_tokens_seen": 4519362560, + "step": 8620 + }, + { + "epoch": 0.41828068452543327, + "grad_norm": 0.259765625, + "learning_rate": 3.364065312419285e-05, + "loss": 2.7659, + "num_input_tokens_seen": 4521984000, + "step": 8625 + }, + { + "epoch": 0.4185231660816799, + "grad_norm": 0.251953125, + "learning_rate": 3.3621838875331886e-05, + "loss": 2.769, + "num_input_tokens_seen": 4524605440, + "step": 8630 + }, + { + "epoch": 0.41876564763792656, + "grad_norm": 0.251953125, + "learning_rate": 3.360301908257459e-05, + "loss": 2.7614, + "num_input_tokens_seen": 4527226880, + "step": 8635 + }, + { + "epoch": 0.4190081291941732, + "grad_norm": 0.25, + "learning_rate": 3.35841937580222e-05, + "loss": 2.7745, + "num_input_tokens_seen": 4529848320, + "step": 8640 + }, + { + "epoch": 0.4192506107504198, + "grad_norm": 0.2470703125, + "learning_rate": 3.356536291377953e-05, + "loss": 2.7709, + "num_input_tokens_seen": 4532469760, + "step": 8645 + }, + { + "epoch": 0.4194930923066664, + "grad_norm": 0.251953125, + "learning_rate": 3.3546526561954914e-05, + "loss": 2.7621, + "num_input_tokens_seen": 4535091200, + "step": 8650 + }, + { + "epoch": 0.41973557386291305, + "grad_norm": 0.25390625, + "learning_rate": 3.3527684714660255e-05, + "loss": 2.7709, + "num_input_tokens_seen": 4537712640, + "step": 8655 + }, + { + "epoch": 0.41997805541915967, + "grad_norm": 0.251953125, + "learning_rate": 3.350883738401098e-05, + "loss": 2.7755, + "num_input_tokens_seen": 4540334080, + "step": 8660 + }, + { + "epoch": 0.4202205369754063, + "grad_norm": 0.244140625, + "learning_rate": 3.348998458212603e-05, + "loss": 2.7841, + "num_input_tokens_seen": 4542955520, + "step": 8665 + }, + { + "epoch": 0.4204630185316529, + "grad_norm": 0.25390625, + "learning_rate": 3.347112632112788e-05, + "loss": 2.7876, + "num_input_tokens_seen": 4545576960, + "step": 8670 + }, + { + "epoch": 0.4207055000878996, + "grad_norm": 0.263671875, + "learning_rate": 3.345226261314251e-05, + "loss": 2.7698, + "num_input_tokens_seen": 4548198400, + "step": 8675 + }, + { + "epoch": 0.4209479816441462, + "grad_norm": 0.2578125, + "learning_rate": 3.34333934702994e-05, + "loss": 2.7672, + "num_input_tokens_seen": 4550819840, + "step": 8680 + }, + { + "epoch": 0.42119046320039283, + "grad_norm": 0.271484375, + "learning_rate": 3.3414518904731537e-05, + "loss": 2.7668, + "num_input_tokens_seen": 4553441280, + "step": 8685 + }, + { + "epoch": 0.42143294475663945, + "grad_norm": 0.259765625, + "learning_rate": 3.339563892857538e-05, + "loss": 2.767, + "num_input_tokens_seen": 4556062720, + "step": 8690 + }, + { + "epoch": 0.42167542631288607, + "grad_norm": 0.2578125, + "learning_rate": 3.3376753553970864e-05, + "loss": 2.7825, + "num_input_tokens_seen": 4558684160, + "step": 8695 + }, + { + "epoch": 0.4219179078691327, + "grad_norm": 0.263671875, + "learning_rate": 3.33578627930614e-05, + "loss": 2.7828, + "num_input_tokens_seen": 4561305600, + "step": 8700 + }, + { + "epoch": 0.4219179078691327, + "eval_accuracy": 0.4554388536069044, + "eval_loss": 2.744344472885132, + "eval_runtime": 6.4252, + "eval_samples_per_second": 46.691, + "eval_steps_per_second": 5.914, + "num_input_tokens_seen": 4561305600, + "step": 8700 + }, + { + "epoch": 0.4221603894253793, + "grad_norm": 0.244140625, + "learning_rate": 3.333896665799388e-05, + "loss": 2.7885, + "num_input_tokens_seen": 4563927040, + "step": 8705 + }, + { + "epoch": 0.42240287098162593, + "grad_norm": 0.25390625, + "learning_rate": 3.332006516091863e-05, + "loss": 2.7741, + "num_input_tokens_seen": 4566548480, + "step": 8710 + }, + { + "epoch": 0.4226453525378726, + "grad_norm": 0.24609375, + "learning_rate": 3.330115831398944e-05, + "loss": 2.7702, + "num_input_tokens_seen": 4569169920, + "step": 8715 + }, + { + "epoch": 0.42288783409411923, + "grad_norm": 0.2490234375, + "learning_rate": 3.328224612936351e-05, + "loss": 2.764, + "num_input_tokens_seen": 4571791360, + "step": 8720 + }, + { + "epoch": 0.42313031565036585, + "grad_norm": 0.251953125, + "learning_rate": 3.326332861920151e-05, + "loss": 2.7803, + "num_input_tokens_seen": 4574412800, + "step": 8725 + }, + { + "epoch": 0.4233727972066125, + "grad_norm": 0.2470703125, + "learning_rate": 3.324440579566751e-05, + "loss": 2.7652, + "num_input_tokens_seen": 4577034240, + "step": 8730 + }, + { + "epoch": 0.4236152787628591, + "grad_norm": 0.259765625, + "learning_rate": 3.3225477670929e-05, + "loss": 2.7707, + "num_input_tokens_seen": 4579655680, + "step": 8735 + }, + { + "epoch": 0.4238577603191057, + "grad_norm": 0.251953125, + "learning_rate": 3.32065442571569e-05, + "loss": 2.7752, + "num_input_tokens_seen": 4582277120, + "step": 8740 + }, + { + "epoch": 0.42410024187535234, + "grad_norm": 0.251953125, + "learning_rate": 3.318760556652551e-05, + "loss": 2.7779, + "num_input_tokens_seen": 4584898560, + "step": 8745 + }, + { + "epoch": 0.424342723431599, + "grad_norm": 0.255859375, + "learning_rate": 3.31686616112125e-05, + "loss": 2.7869, + "num_input_tokens_seen": 4587520000, + "step": 8750 + }, + { + "epoch": 0.42458520498784563, + "grad_norm": 0.2470703125, + "learning_rate": 3.314971240339898e-05, + "loss": 2.7758, + "num_input_tokens_seen": 4590141440, + "step": 8755 + }, + { + "epoch": 0.42482768654409225, + "grad_norm": 0.248046875, + "learning_rate": 3.31307579552694e-05, + "loss": 2.7691, + "num_input_tokens_seen": 4592762880, + "step": 8760 + }, + { + "epoch": 0.4250701681003389, + "grad_norm": 0.25390625, + "learning_rate": 3.3111798279011594e-05, + "loss": 2.7729, + "num_input_tokens_seen": 4595384320, + "step": 8765 + }, + { + "epoch": 0.4253126496565855, + "grad_norm": 0.24609375, + "learning_rate": 3.309283338681674e-05, + "loss": 2.7736, + "num_input_tokens_seen": 4598005760, + "step": 8770 + }, + { + "epoch": 0.4255551312128321, + "grad_norm": 0.2578125, + "learning_rate": 3.3073863290879395e-05, + "loss": 2.7777, + "num_input_tokens_seen": 4600627200, + "step": 8775 + }, + { + "epoch": 0.42579761276907874, + "grad_norm": 0.255859375, + "learning_rate": 3.305488800339744e-05, + "loss": 2.7623, + "num_input_tokens_seen": 4603248640, + "step": 8780 + }, + { + "epoch": 0.42604009432532536, + "grad_norm": 0.255859375, + "learning_rate": 3.303590753657211e-05, + "loss": 2.7881, + "num_input_tokens_seen": 4605870080, + "step": 8785 + }, + { + "epoch": 0.42628257588157203, + "grad_norm": 0.263671875, + "learning_rate": 3.3016921902607954e-05, + "loss": 2.7774, + "num_input_tokens_seen": 4608491520, + "step": 8790 + }, + { + "epoch": 0.42652505743781866, + "grad_norm": 0.25390625, + "learning_rate": 3.299793111371287e-05, + "loss": 2.7848, + "num_input_tokens_seen": 4611112960, + "step": 8795 + }, + { + "epoch": 0.4267675389940653, + "grad_norm": 0.25390625, + "learning_rate": 3.297893518209804e-05, + "loss": 2.7778, + "num_input_tokens_seen": 4613734400, + "step": 8800 + }, + { + "epoch": 0.4270100205503119, + "grad_norm": 0.25, + "learning_rate": 3.295993411997798e-05, + "loss": 2.7885, + "num_input_tokens_seen": 4616355840, + "step": 8805 + }, + { + "epoch": 0.4272525021065585, + "grad_norm": 0.265625, + "learning_rate": 3.294092793957047e-05, + "loss": 2.7789, + "num_input_tokens_seen": 4618977280, + "step": 8810 + }, + { + "epoch": 0.42749498366280514, + "grad_norm": 0.2490234375, + "learning_rate": 3.292191665309663e-05, + "loss": 2.7705, + "num_input_tokens_seen": 4621598720, + "step": 8815 + }, + { + "epoch": 0.42773746521905176, + "grad_norm": 0.25390625, + "learning_rate": 3.2902900272780814e-05, + "loss": 2.7909, + "num_input_tokens_seen": 4624220160, + "step": 8820 + }, + { + "epoch": 0.4279799467752984, + "grad_norm": 0.251953125, + "learning_rate": 3.2883878810850687e-05, + "loss": 2.7787, + "num_input_tokens_seen": 4626841600, + "step": 8825 + }, + { + "epoch": 0.42822242833154506, + "grad_norm": 0.25390625, + "learning_rate": 3.286485227953716e-05, + "loss": 2.7682, + "num_input_tokens_seen": 4629463040, + "step": 8830 + }, + { + "epoch": 0.4284649098877917, + "grad_norm": 0.259765625, + "learning_rate": 3.284582069107441e-05, + "loss": 2.7724, + "num_input_tokens_seen": 4632084480, + "step": 8835 + }, + { + "epoch": 0.4287073914440383, + "grad_norm": 0.255859375, + "learning_rate": 3.2826784057699876e-05, + "loss": 2.7818, + "num_input_tokens_seen": 4634705920, + "step": 8840 + }, + { + "epoch": 0.4289498730002849, + "grad_norm": 0.251953125, + "learning_rate": 3.2807742391654234e-05, + "loss": 2.7841, + "num_input_tokens_seen": 4637327360, + "step": 8845 + }, + { + "epoch": 0.42919235455653154, + "grad_norm": 0.25390625, + "learning_rate": 3.278869570518138e-05, + "loss": 2.7849, + "num_input_tokens_seen": 4639948800, + "step": 8850 + }, + { + "epoch": 0.42943483611277816, + "grad_norm": 0.244140625, + "learning_rate": 3.2769644010528476e-05, + "loss": 2.7787, + "num_input_tokens_seen": 4642570240, + "step": 8855 + }, + { + "epoch": 0.4296773176690248, + "grad_norm": 0.24609375, + "learning_rate": 3.275058731994586e-05, + "loss": 2.7786, + "num_input_tokens_seen": 4645191680, + "step": 8860 + }, + { + "epoch": 0.4299197992252714, + "grad_norm": 0.25390625, + "learning_rate": 3.273152564568711e-05, + "loss": 2.7695, + "num_input_tokens_seen": 4647813120, + "step": 8865 + }, + { + "epoch": 0.4301622807815181, + "grad_norm": 0.259765625, + "learning_rate": 3.2712459000008996e-05, + "loss": 2.7708, + "num_input_tokens_seen": 4650434560, + "step": 8870 + }, + { + "epoch": 0.4304047623377647, + "grad_norm": 0.2578125, + "learning_rate": 3.269338739517149e-05, + "loss": 2.7691, + "num_input_tokens_seen": 4653056000, + "step": 8875 + }, + { + "epoch": 0.4306472438940113, + "grad_norm": 0.2451171875, + "learning_rate": 3.2674310843437774e-05, + "loss": 2.7763, + "num_input_tokens_seen": 4655677440, + "step": 8880 + }, + { + "epoch": 0.43088972545025794, + "grad_norm": 0.251953125, + "learning_rate": 3.265522935707417e-05, + "loss": 2.7744, + "num_input_tokens_seen": 4658298880, + "step": 8885 + }, + { + "epoch": 0.43113220700650456, + "grad_norm": 0.25390625, + "learning_rate": 3.2636142948350196e-05, + "loss": 2.782, + "num_input_tokens_seen": 4660920320, + "step": 8890 + }, + { + "epoch": 0.4313746885627512, + "grad_norm": 0.2578125, + "learning_rate": 3.261705162953853e-05, + "loss": 2.7692, + "num_input_tokens_seen": 4663541760, + "step": 8895 + }, + { + "epoch": 0.4316171701189978, + "grad_norm": 0.24609375, + "learning_rate": 3.259795541291503e-05, + "loss": 2.7821, + "num_input_tokens_seen": 4666163200, + "step": 8900 + }, + { + "epoch": 0.4318596516752444, + "grad_norm": 0.2578125, + "learning_rate": 3.2578854310758656e-05, + "loss": 2.7684, + "num_input_tokens_seen": 4668784640, + "step": 8905 + }, + { + "epoch": 0.4321021332314911, + "grad_norm": 0.25, + "learning_rate": 3.255974833535154e-05, + "loss": 2.783, + "num_input_tokens_seen": 4671406080, + "step": 8910 + }, + { + "epoch": 0.4323446147877377, + "grad_norm": 0.25, + "learning_rate": 3.2540637498978963e-05, + "loss": 2.7794, + "num_input_tokens_seen": 4674027520, + "step": 8915 + }, + { + "epoch": 0.43258709634398435, + "grad_norm": 0.255859375, + "learning_rate": 3.25215218139293e-05, + "loss": 2.7685, + "num_input_tokens_seen": 4676648960, + "step": 8920 + }, + { + "epoch": 0.43282957790023097, + "grad_norm": 0.2431640625, + "learning_rate": 3.250240129249405e-05, + "loss": 2.7674, + "num_input_tokens_seen": 4679270400, + "step": 8925 + }, + { + "epoch": 0.4330720594564776, + "grad_norm": 0.251953125, + "learning_rate": 3.2483275946967825e-05, + "loss": 2.7863, + "num_input_tokens_seen": 4681891840, + "step": 8930 + }, + { + "epoch": 0.4333145410127242, + "grad_norm": 0.2470703125, + "learning_rate": 3.246414578964837e-05, + "loss": 2.7796, + "num_input_tokens_seen": 4684513280, + "step": 8935 + }, + { + "epoch": 0.43355702256897083, + "grad_norm": 0.255859375, + "learning_rate": 3.244501083283647e-05, + "loss": 2.7703, + "num_input_tokens_seen": 4687134720, + "step": 8940 + }, + { + "epoch": 0.43379950412521745, + "grad_norm": 0.25, + "learning_rate": 3.242587108883602e-05, + "loss": 2.7785, + "num_input_tokens_seen": 4689756160, + "step": 8945 + }, + { + "epoch": 0.4340419856814641, + "grad_norm": 0.25, + "learning_rate": 3.240672656995402e-05, + "loss": 2.7784, + "num_input_tokens_seen": 4692377600, + "step": 8950 + }, + { + "epoch": 0.43428446723771075, + "grad_norm": 0.255859375, + "learning_rate": 3.2387577288500484e-05, + "loss": 2.7953, + "num_input_tokens_seen": 4694999040, + "step": 8955 + }, + { + "epoch": 0.43452694879395737, + "grad_norm": 0.24609375, + "learning_rate": 3.236842325678854e-05, + "loss": 2.7773, + "num_input_tokens_seen": 4697620480, + "step": 8960 + }, + { + "epoch": 0.434769430350204, + "grad_norm": 0.25, + "learning_rate": 3.2349264487134354e-05, + "loss": 2.7707, + "num_input_tokens_seen": 4700241920, + "step": 8965 + }, + { + "epoch": 0.4350119119064506, + "grad_norm": 0.267578125, + "learning_rate": 3.233010099185711e-05, + "loss": 2.7754, + "num_input_tokens_seen": 4702863360, + "step": 8970 + }, + { + "epoch": 0.43525439346269723, + "grad_norm": 0.26171875, + "learning_rate": 3.231093278327908e-05, + "loss": 2.7774, + "num_input_tokens_seen": 4705484800, + "step": 8975 + }, + { + "epoch": 0.43549687501894385, + "grad_norm": 0.25390625, + "learning_rate": 3.229175987372553e-05, + "loss": 2.7743, + "num_input_tokens_seen": 4708106240, + "step": 8980 + }, + { + "epoch": 0.4357393565751905, + "grad_norm": 0.26171875, + "learning_rate": 3.2272582275524765e-05, + "loss": 2.7851, + "num_input_tokens_seen": 4710727680, + "step": 8985 + }, + { + "epoch": 0.43598183813143715, + "grad_norm": 0.244140625, + "learning_rate": 3.22534000010081e-05, + "loss": 2.7697, + "num_input_tokens_seen": 4713349120, + "step": 8990 + }, + { + "epoch": 0.43622431968768377, + "grad_norm": 0.25390625, + "learning_rate": 3.2234213062509865e-05, + "loss": 2.7605, + "num_input_tokens_seen": 4715970560, + "step": 8995 + }, + { + "epoch": 0.4364668012439304, + "grad_norm": 0.251953125, + "learning_rate": 3.221502147236737e-05, + "loss": 2.7883, + "num_input_tokens_seen": 4718592000, + "step": 9000 + }, + { + "epoch": 0.4364668012439304, + "eval_accuracy": 0.4555984367366878, + "eval_loss": 2.7440221309661865, + "eval_runtime": 5.8804, + "eval_samples_per_second": 51.017, + "eval_steps_per_second": 6.462, + "num_input_tokens_seen": 4718592000, + "step": 9000 + }, + { + "epoch": 0.436709282800177, + "grad_norm": 0.255859375, + "learning_rate": 3.219582524292093e-05, + "loss": 2.7729, + "num_input_tokens_seen": 4721213440, + "step": 9005 + }, + { + "epoch": 0.43695176435642363, + "grad_norm": 0.251953125, + "learning_rate": 3.217662438651383e-05, + "loss": 2.7722, + "num_input_tokens_seen": 4723834880, + "step": 9010 + }, + { + "epoch": 0.43719424591267025, + "grad_norm": 0.259765625, + "learning_rate": 3.2157418915492367e-05, + "loss": 2.7743, + "num_input_tokens_seen": 4726456320, + "step": 9015 + }, + { + "epoch": 0.4374367274689169, + "grad_norm": 0.2578125, + "learning_rate": 3.213820884220575e-05, + "loss": 2.7798, + "num_input_tokens_seen": 4729077760, + "step": 9020 + }, + { + "epoch": 0.4376792090251635, + "grad_norm": 0.25390625, + "learning_rate": 3.211899417900621e-05, + "loss": 2.7814, + "num_input_tokens_seen": 4731699200, + "step": 9025 + }, + { + "epoch": 0.4379216905814102, + "grad_norm": 0.2490234375, + "learning_rate": 3.2099774938248864e-05, + "loss": 2.7762, + "num_input_tokens_seen": 4734320640, + "step": 9030 + }, + { + "epoch": 0.4381641721376568, + "grad_norm": 0.248046875, + "learning_rate": 3.208055113229183e-05, + "loss": 2.7765, + "num_input_tokens_seen": 4736942080, + "step": 9035 + }, + { + "epoch": 0.4384066536939034, + "grad_norm": 0.25390625, + "learning_rate": 3.2061322773496106e-05, + "loss": 2.7686, + "num_input_tokens_seen": 4739563520, + "step": 9040 + }, + { + "epoch": 0.43864913525015004, + "grad_norm": 0.255859375, + "learning_rate": 3.2042089874225665e-05, + "loss": 2.7919, + "num_input_tokens_seen": 4742184960, + "step": 9045 + }, + { + "epoch": 0.43889161680639666, + "grad_norm": 0.2470703125, + "learning_rate": 3.202285244684738e-05, + "loss": 2.7705, + "num_input_tokens_seen": 4744806400, + "step": 9050 + }, + { + "epoch": 0.4391340983626433, + "grad_norm": 0.24609375, + "learning_rate": 3.200361050373105e-05, + "loss": 2.7638, + "num_input_tokens_seen": 4747427840, + "step": 9055 + }, + { + "epoch": 0.4393765799188899, + "grad_norm": 0.251953125, + "learning_rate": 3.198436405724934e-05, + "loss": 2.7729, + "num_input_tokens_seen": 4750049280, + "step": 9060 + }, + { + "epoch": 0.4396190614751366, + "grad_norm": 0.2578125, + "learning_rate": 3.1965113119777844e-05, + "loss": 2.7669, + "num_input_tokens_seen": 4752670720, + "step": 9065 + }, + { + "epoch": 0.4398615430313832, + "grad_norm": 0.25390625, + "learning_rate": 3.194585770369504e-05, + "loss": 2.7682, + "num_input_tokens_seen": 4755292160, + "step": 9070 + }, + { + "epoch": 0.4401040245876298, + "grad_norm": 0.2578125, + "learning_rate": 3.1926597821382295e-05, + "loss": 2.7861, + "num_input_tokens_seen": 4757913600, + "step": 9075 + }, + { + "epoch": 0.44034650614387644, + "grad_norm": 0.255859375, + "learning_rate": 3.19073334852238e-05, + "loss": 2.7897, + "num_input_tokens_seen": 4760535040, + "step": 9080 + }, + { + "epoch": 0.44058898770012306, + "grad_norm": 0.248046875, + "learning_rate": 3.188806470760667e-05, + "loss": 2.7811, + "num_input_tokens_seen": 4763156480, + "step": 9085 + }, + { + "epoch": 0.4408314692563697, + "grad_norm": 0.255859375, + "learning_rate": 3.1868791500920836e-05, + "loss": 2.7823, + "num_input_tokens_seen": 4765777920, + "step": 9090 + }, + { + "epoch": 0.4410739508126163, + "grad_norm": 0.2412109375, + "learning_rate": 3.18495138775591e-05, + "loss": 2.7865, + "num_input_tokens_seen": 4768399360, + "step": 9095 + }, + { + "epoch": 0.4413164323688629, + "grad_norm": 0.2490234375, + "learning_rate": 3.183023184991709e-05, + "loss": 2.764, + "num_input_tokens_seen": 4771020800, + "step": 9100 + }, + { + "epoch": 0.4415589139251096, + "grad_norm": 0.255859375, + "learning_rate": 3.181094543039328e-05, + "loss": 2.7795, + "num_input_tokens_seen": 4773642240, + "step": 9105 + }, + { + "epoch": 0.4418013954813562, + "grad_norm": 0.2431640625, + "learning_rate": 3.179165463138893e-05, + "loss": 2.7685, + "num_input_tokens_seen": 4776263680, + "step": 9110 + }, + { + "epoch": 0.44204387703760284, + "grad_norm": 0.24609375, + "learning_rate": 3.177235946530818e-05, + "loss": 2.7632, + "num_input_tokens_seen": 4778885120, + "step": 9115 + }, + { + "epoch": 0.44228635859384946, + "grad_norm": 0.2578125, + "learning_rate": 3.175305994455791e-05, + "loss": 2.7808, + "num_input_tokens_seen": 4781506560, + "step": 9120 + }, + { + "epoch": 0.4425288401500961, + "grad_norm": 0.25, + "learning_rate": 3.1733756081547864e-05, + "loss": 2.7894, + "num_input_tokens_seen": 4784128000, + "step": 9125 + }, + { + "epoch": 0.4427713217063427, + "grad_norm": 0.251953125, + "learning_rate": 3.171444788869052e-05, + "loss": 2.7745, + "num_input_tokens_seen": 4786749440, + "step": 9130 + }, + { + "epoch": 0.4430138032625893, + "grad_norm": 0.25, + "learning_rate": 3.1695135378401185e-05, + "loss": 2.7749, + "num_input_tokens_seen": 4789370880, + "step": 9135 + }, + { + "epoch": 0.44325628481883594, + "grad_norm": 0.259765625, + "learning_rate": 3.167581856309792e-05, + "loss": 2.782, + "num_input_tokens_seen": 4791992320, + "step": 9140 + }, + { + "epoch": 0.4434987663750826, + "grad_norm": 0.251953125, + "learning_rate": 3.1656497455201546e-05, + "loss": 2.785, + "num_input_tokens_seen": 4794613760, + "step": 9145 + }, + { + "epoch": 0.44374124793132924, + "grad_norm": 0.25390625, + "learning_rate": 3.163717206713567e-05, + "loss": 2.7691, + "num_input_tokens_seen": 4797235200, + "step": 9150 + }, + { + "epoch": 0.44398372948757586, + "grad_norm": 0.240234375, + "learning_rate": 3.161784241132663e-05, + "loss": 2.7753, + "num_input_tokens_seen": 4799856640, + "step": 9155 + }, + { + "epoch": 0.4442262110438225, + "grad_norm": 0.244140625, + "learning_rate": 3.159850850020352e-05, + "loss": 2.78, + "num_input_tokens_seen": 4802478080, + "step": 9160 + }, + { + "epoch": 0.4444686926000691, + "grad_norm": 0.263671875, + "learning_rate": 3.157917034619817e-05, + "loss": 2.7739, + "num_input_tokens_seen": 4805099520, + "step": 9165 + }, + { + "epoch": 0.4447111741563157, + "grad_norm": 0.2470703125, + "learning_rate": 3.155982796174512e-05, + "loss": 2.7739, + "num_input_tokens_seen": 4807720960, + "step": 9170 + }, + { + "epoch": 0.44495365571256235, + "grad_norm": 0.25, + "learning_rate": 3.154048135928165e-05, + "loss": 2.7868, + "num_input_tokens_seen": 4810342400, + "step": 9175 + }, + { + "epoch": 0.44519613726880897, + "grad_norm": 0.2470703125, + "learning_rate": 3.1521130551247755e-05, + "loss": 2.7715, + "num_input_tokens_seen": 4812963840, + "step": 9180 + }, + { + "epoch": 0.44543861882505564, + "grad_norm": 0.248046875, + "learning_rate": 3.150177555008612e-05, + "loss": 2.7747, + "num_input_tokens_seen": 4815585280, + "step": 9185 + }, + { + "epoch": 0.44568110038130226, + "grad_norm": 0.25, + "learning_rate": 3.148241636824213e-05, + "loss": 2.7619, + "num_input_tokens_seen": 4818206720, + "step": 9190 + }, + { + "epoch": 0.4459235819375489, + "grad_norm": 0.25390625, + "learning_rate": 3.146305301816386e-05, + "loss": 2.7788, + "num_input_tokens_seen": 4820828160, + "step": 9195 + }, + { + "epoch": 0.4461660634937955, + "grad_norm": 0.26171875, + "learning_rate": 3.1443685512302065e-05, + "loss": 2.7807, + "num_input_tokens_seen": 4823449600, + "step": 9200 + }, + { + "epoch": 0.4464085450500421, + "grad_norm": 0.255859375, + "learning_rate": 3.142431386311018e-05, + "loss": 2.781, + "num_input_tokens_seen": 4826071040, + "step": 9205 + }, + { + "epoch": 0.44665102660628875, + "grad_norm": 0.2578125, + "learning_rate": 3.140493808304429e-05, + "loss": 2.7921, + "num_input_tokens_seen": 4828692480, + "step": 9210 + }, + { + "epoch": 0.44689350816253537, + "grad_norm": 0.251953125, + "learning_rate": 3.138555818456314e-05, + "loss": 2.7781, + "num_input_tokens_seen": 4831313920, + "step": 9215 + }, + { + "epoch": 0.447135989718782, + "grad_norm": 0.25, + "learning_rate": 3.136617418012813e-05, + "loss": 2.7862, + "num_input_tokens_seen": 4833935360, + "step": 9220 + }, + { + "epoch": 0.44737847127502867, + "grad_norm": 0.248046875, + "learning_rate": 3.134678608220329e-05, + "loss": 2.7759, + "num_input_tokens_seen": 4836556800, + "step": 9225 + }, + { + "epoch": 0.4476209528312753, + "grad_norm": 0.24609375, + "learning_rate": 3.13273939032553e-05, + "loss": 2.7752, + "num_input_tokens_seen": 4839178240, + "step": 9230 + }, + { + "epoch": 0.4478634343875219, + "grad_norm": 0.2470703125, + "learning_rate": 3.130799765575344e-05, + "loss": 2.7787, + "num_input_tokens_seen": 4841799680, + "step": 9235 + }, + { + "epoch": 0.44810591594376853, + "grad_norm": 0.251953125, + "learning_rate": 3.128859735216963e-05, + "loss": 2.7803, + "num_input_tokens_seen": 4844421120, + "step": 9240 + }, + { + "epoch": 0.44834839750001515, + "grad_norm": 0.271484375, + "learning_rate": 3.126919300497839e-05, + "loss": 2.7779, + "num_input_tokens_seen": 4847042560, + "step": 9245 + }, + { + "epoch": 0.44859087905626177, + "grad_norm": 0.255859375, + "learning_rate": 3.124978462665681e-05, + "loss": 2.7938, + "num_input_tokens_seen": 4849664000, + "step": 9250 + }, + { + "epoch": 0.4488333606125084, + "grad_norm": 0.25390625, + "learning_rate": 3.123037222968463e-05, + "loss": 2.7858, + "num_input_tokens_seen": 4852285440, + "step": 9255 + }, + { + "epoch": 0.449075842168755, + "grad_norm": 0.25390625, + "learning_rate": 3.121095582654412e-05, + "loss": 2.7809, + "num_input_tokens_seen": 4854906880, + "step": 9260 + }, + { + "epoch": 0.4493183237250017, + "grad_norm": 0.2451171875, + "learning_rate": 3.119153542972017e-05, + "loss": 2.7655, + "num_input_tokens_seen": 4857528320, + "step": 9265 + }, + { + "epoch": 0.4495608052812483, + "grad_norm": 0.251953125, + "learning_rate": 3.117211105170019e-05, + "loss": 2.7759, + "num_input_tokens_seen": 4860149760, + "step": 9270 + }, + { + "epoch": 0.44980328683749493, + "grad_norm": 0.263671875, + "learning_rate": 3.11526827049742e-05, + "loss": 2.7733, + "num_input_tokens_seen": 4862771200, + "step": 9275 + }, + { + "epoch": 0.45004576839374155, + "grad_norm": 0.244140625, + "learning_rate": 3.113325040203474e-05, + "loss": 2.7925, + "num_input_tokens_seen": 4865392640, + "step": 9280 + }, + { + "epoch": 0.4502882499499882, + "grad_norm": 0.25390625, + "learning_rate": 3.1113814155376897e-05, + "loss": 2.7686, + "num_input_tokens_seen": 4868014080, + "step": 9285 + }, + { + "epoch": 0.4505307315062348, + "grad_norm": 0.2451171875, + "learning_rate": 3.1094373977498306e-05, + "loss": 2.7807, + "num_input_tokens_seen": 4870635520, + "step": 9290 + }, + { + "epoch": 0.4507732130624814, + "grad_norm": 0.2421875, + "learning_rate": 3.107492988089912e-05, + "loss": 2.7717, + "num_input_tokens_seen": 4873256960, + "step": 9295 + }, + { + "epoch": 0.45101569461872804, + "grad_norm": 0.2470703125, + "learning_rate": 3.105548187808202e-05, + "loss": 2.7627, + "num_input_tokens_seen": 4875878400, + "step": 9300 + }, + { + "epoch": 0.45101569461872804, + "eval_accuracy": 0.45560657873310534, + "eval_loss": 2.7437386512756348, + "eval_runtime": 5.8688, + "eval_samples_per_second": 51.118, + "eval_steps_per_second": 6.475, + "num_input_tokens_seen": 4875878400, + "step": 9300 + }, + { + "epoch": 0.4512581761749747, + "grad_norm": 0.244140625, + "learning_rate": 3.103602998155219e-05, + "loss": 2.7802, + "num_input_tokens_seen": 4878499840, + "step": 9305 + }, + { + "epoch": 0.45150065773122133, + "grad_norm": 0.2578125, + "learning_rate": 3.1016574203817316e-05, + "loss": 2.7694, + "num_input_tokens_seen": 4881121280, + "step": 9310 + }, + { + "epoch": 0.45174313928746795, + "grad_norm": 0.25390625, + "learning_rate": 3.099711455738759e-05, + "loss": 2.7778, + "num_input_tokens_seen": 4883742720, + "step": 9315 + }, + { + "epoch": 0.4519856208437146, + "grad_norm": 0.2470703125, + "learning_rate": 3.097765105477569e-05, + "loss": 2.7555, + "num_input_tokens_seen": 4886364160, + "step": 9320 + }, + { + "epoch": 0.4522281023999612, + "grad_norm": 0.2470703125, + "learning_rate": 3.0958183708496756e-05, + "loss": 2.7702, + "num_input_tokens_seen": 4888985600, + "step": 9325 + }, + { + "epoch": 0.4524705839562078, + "grad_norm": 0.240234375, + "learning_rate": 3.093871253106843e-05, + "loss": 2.781, + "num_input_tokens_seen": 4891607040, + "step": 9330 + }, + { + "epoch": 0.45271306551245444, + "grad_norm": 0.251953125, + "learning_rate": 3.0919237535010805e-05, + "loss": 2.7824, + "num_input_tokens_seen": 4894228480, + "step": 9335 + }, + { + "epoch": 0.4529555470687011, + "grad_norm": 0.24609375, + "learning_rate": 3.08997587328464e-05, + "loss": 2.7741, + "num_input_tokens_seen": 4896849920, + "step": 9340 + }, + { + "epoch": 0.45319802862494774, + "grad_norm": 0.2421875, + "learning_rate": 3.088027613710022e-05, + "loss": 2.7757, + "num_input_tokens_seen": 4899471360, + "step": 9345 + }, + { + "epoch": 0.45344051018119436, + "grad_norm": 0.25390625, + "learning_rate": 3.0860789760299705e-05, + "loss": 2.7778, + "num_input_tokens_seen": 4902092800, + "step": 9350 + }, + { + "epoch": 0.453682991737441, + "grad_norm": 0.2451171875, + "learning_rate": 3.08412996149747e-05, + "loss": 2.7714, + "num_input_tokens_seen": 4904714240, + "step": 9355 + }, + { + "epoch": 0.4539254732936876, + "grad_norm": 0.2490234375, + "learning_rate": 3.0821805713657504e-05, + "loss": 2.7841, + "num_input_tokens_seen": 4907335680, + "step": 9360 + }, + { + "epoch": 0.4541679548499342, + "grad_norm": 0.25, + "learning_rate": 3.0802308068882817e-05, + "loss": 2.7705, + "num_input_tokens_seen": 4909957120, + "step": 9365 + }, + { + "epoch": 0.45441043640618084, + "grad_norm": 0.248046875, + "learning_rate": 3.078280669318774e-05, + "loss": 2.7756, + "num_input_tokens_seen": 4912578560, + "step": 9370 + }, + { + "epoch": 0.45465291796242746, + "grad_norm": 0.2490234375, + "learning_rate": 3.076330159911178e-05, + "loss": 2.7805, + "num_input_tokens_seen": 4915200000, + "step": 9375 + }, + { + "epoch": 0.45489539951867414, + "grad_norm": 0.2431640625, + "learning_rate": 3.074379279919683e-05, + "loss": 2.7801, + "num_input_tokens_seen": 4917821440, + "step": 9380 + }, + { + "epoch": 0.45513788107492076, + "grad_norm": 0.25, + "learning_rate": 3.072428030598719e-05, + "loss": 2.771, + "num_input_tokens_seen": 4920442880, + "step": 9385 + }, + { + "epoch": 0.4553803626311674, + "grad_norm": 0.259765625, + "learning_rate": 3.07047641320295e-05, + "loss": 2.7795, + "num_input_tokens_seen": 4923064320, + "step": 9390 + }, + { + "epoch": 0.455622844187414, + "grad_norm": 0.24609375, + "learning_rate": 3.0685244289872777e-05, + "loss": 2.7682, + "num_input_tokens_seen": 4925685760, + "step": 9395 + }, + { + "epoch": 0.4558653257436606, + "grad_norm": 0.251953125, + "learning_rate": 3.066572079206841e-05, + "loss": 2.7947, + "num_input_tokens_seen": 4928307200, + "step": 9400 + }, + { + "epoch": 0.45610780729990724, + "grad_norm": 0.263671875, + "learning_rate": 3.064619365117013e-05, + "loss": 2.7797, + "num_input_tokens_seen": 4930928640, + "step": 9405 + }, + { + "epoch": 0.45635028885615386, + "grad_norm": 0.2431640625, + "learning_rate": 3.0626662879734015e-05, + "loss": 2.7681, + "num_input_tokens_seen": 4933550080, + "step": 9410 + }, + { + "epoch": 0.4565927704124005, + "grad_norm": 0.25, + "learning_rate": 3.060712849031846e-05, + "loss": 2.7614, + "num_input_tokens_seen": 4936171520, + "step": 9415 + }, + { + "epoch": 0.45683525196864716, + "grad_norm": 0.25390625, + "learning_rate": 3.058759049548422e-05, + "loss": 2.7784, + "num_input_tokens_seen": 4938792960, + "step": 9420 + }, + { + "epoch": 0.4570777335248938, + "grad_norm": 0.255859375, + "learning_rate": 3.056804890779433e-05, + "loss": 2.7753, + "num_input_tokens_seen": 4941414400, + "step": 9425 + }, + { + "epoch": 0.4573202150811404, + "grad_norm": 0.2490234375, + "learning_rate": 3.054850373981415e-05, + "loss": 2.7847, + "num_input_tokens_seen": 4944035840, + "step": 9430 + }, + { + "epoch": 0.457562696637387, + "grad_norm": 0.25, + "learning_rate": 3.052895500411136e-05, + "loss": 2.7763, + "num_input_tokens_seen": 4946657280, + "step": 9435 + }, + { + "epoch": 0.45780517819363364, + "grad_norm": 0.24609375, + "learning_rate": 3.0509402713255913e-05, + "loss": 2.7803, + "num_input_tokens_seen": 4949278720, + "step": 9440 + }, + { + "epoch": 0.45804765974988026, + "grad_norm": 0.2490234375, + "learning_rate": 3.048984687982006e-05, + "loss": 2.7799, + "num_input_tokens_seen": 4951900160, + "step": 9445 + }, + { + "epoch": 0.4582901413061269, + "grad_norm": 0.255859375, + "learning_rate": 3.0470287516378315e-05, + "loss": 2.7765, + "num_input_tokens_seen": 4954521600, + "step": 9450 + }, + { + "epoch": 0.4585326228623735, + "grad_norm": 0.248046875, + "learning_rate": 3.045072463550747e-05, + "loss": 2.7775, + "num_input_tokens_seen": 4957143040, + "step": 9455 + }, + { + "epoch": 0.4587751044186202, + "grad_norm": 0.26171875, + "learning_rate": 3.043115824978659e-05, + "loss": 2.7816, + "num_input_tokens_seen": 4959764480, + "step": 9460 + }, + { + "epoch": 0.4590175859748668, + "grad_norm": 0.2578125, + "learning_rate": 3.041158837179698e-05, + "loss": 2.7748, + "num_input_tokens_seen": 4962385920, + "step": 9465 + }, + { + "epoch": 0.4592600675311134, + "grad_norm": 0.259765625, + "learning_rate": 3.039201501412218e-05, + "loss": 2.7679, + "num_input_tokens_seen": 4965007360, + "step": 9470 + }, + { + "epoch": 0.45950254908736005, + "grad_norm": 0.2578125, + "learning_rate": 3.0372438189348013e-05, + "loss": 2.7898, + "num_input_tokens_seen": 4967628800, + "step": 9475 + }, + { + "epoch": 0.45974503064360667, + "grad_norm": 0.2451171875, + "learning_rate": 3.0352857910062466e-05, + "loss": 2.7744, + "num_input_tokens_seen": 4970250240, + "step": 9480 + }, + { + "epoch": 0.4599875121998533, + "grad_norm": 0.2470703125, + "learning_rate": 3.03332741888558e-05, + "loss": 2.7674, + "num_input_tokens_seen": 4972871680, + "step": 9485 + }, + { + "epoch": 0.4602299937560999, + "grad_norm": 0.2490234375, + "learning_rate": 3.0313687038320464e-05, + "loss": 2.7796, + "num_input_tokens_seen": 4975493120, + "step": 9490 + }, + { + "epoch": 0.46047247531234653, + "grad_norm": 0.25, + "learning_rate": 3.029409647105112e-05, + "loss": 2.779, + "num_input_tokens_seen": 4978114560, + "step": 9495 + }, + { + "epoch": 0.4607149568685932, + "grad_norm": 0.248046875, + "learning_rate": 3.0274502499644625e-05, + "loss": 2.7835, + "num_input_tokens_seen": 4980736000, + "step": 9500 + }, + { + "epoch": 0.4609574384248398, + "grad_norm": 0.2470703125, + "learning_rate": 3.0254905136700036e-05, + "loss": 2.7697, + "num_input_tokens_seen": 4983357440, + "step": 9505 + }, + { + "epoch": 0.46119991998108645, + "grad_norm": 0.255859375, + "learning_rate": 3.0235304394818553e-05, + "loss": 2.7712, + "num_input_tokens_seen": 4985978880, + "step": 9510 + }, + { + "epoch": 0.46144240153733307, + "grad_norm": 0.255859375, + "learning_rate": 3.0215700286603606e-05, + "loss": 2.7681, + "num_input_tokens_seen": 4988600320, + "step": 9515 + }, + { + "epoch": 0.4616848830935797, + "grad_norm": 0.251953125, + "learning_rate": 3.0196092824660732e-05, + "loss": 2.7777, + "num_input_tokens_seen": 4991221760, + "step": 9520 + }, + { + "epoch": 0.4619273646498263, + "grad_norm": 0.251953125, + "learning_rate": 3.0176482021597675e-05, + "loss": 2.7898, + "num_input_tokens_seen": 4993843200, + "step": 9525 + }, + { + "epoch": 0.46216984620607293, + "grad_norm": 0.2470703125, + "learning_rate": 3.0156867890024286e-05, + "loss": 2.7796, + "num_input_tokens_seen": 4996464640, + "step": 9530 + }, + { + "epoch": 0.46241232776231955, + "grad_norm": 0.2451171875, + "learning_rate": 3.0137250442552594e-05, + "loss": 2.7688, + "num_input_tokens_seen": 4999086080, + "step": 9535 + }, + { + "epoch": 0.46265480931856623, + "grad_norm": 0.25390625, + "learning_rate": 3.011762969179672e-05, + "loss": 2.7564, + "num_input_tokens_seen": 5001707520, + "step": 9540 + }, + { + "epoch": 0.46289729087481285, + "grad_norm": 0.24609375, + "learning_rate": 3.0098005650372933e-05, + "loss": 2.7553, + "num_input_tokens_seen": 5004328960, + "step": 9545 + }, + { + "epoch": 0.46313977243105947, + "grad_norm": 0.248046875, + "learning_rate": 3.007837833089963e-05, + "loss": 2.7563, + "num_input_tokens_seen": 5006950400, + "step": 9550 + }, + { + "epoch": 0.4633822539873061, + "grad_norm": 0.24609375, + "learning_rate": 3.005874774599729e-05, + "loss": 2.7804, + "num_input_tokens_seen": 5009571840, + "step": 9555 + }, + { + "epoch": 0.4636247355435527, + "grad_norm": 0.251953125, + "learning_rate": 3.00391139082885e-05, + "loss": 2.7697, + "num_input_tokens_seen": 5012193280, + "step": 9560 + }, + { + "epoch": 0.46386721709979933, + "grad_norm": 0.25, + "learning_rate": 3.0019476830397942e-05, + "loss": 2.7715, + "num_input_tokens_seen": 5014814720, + "step": 9565 + }, + { + "epoch": 0.46410969865604595, + "grad_norm": 0.25, + "learning_rate": 2.9999836524952385e-05, + "loss": 2.7585, + "num_input_tokens_seen": 5017436160, + "step": 9570 + }, + { + "epoch": 0.4643521802122926, + "grad_norm": 0.25, + "learning_rate": 2.9980193004580648e-05, + "loss": 2.7711, + "num_input_tokens_seen": 5020057600, + "step": 9575 + }, + { + "epoch": 0.46459466176853925, + "grad_norm": 0.25390625, + "learning_rate": 2.9960546281913664e-05, + "loss": 2.7938, + "num_input_tokens_seen": 5022679040, + "step": 9580 + }, + { + "epoch": 0.4648371433247859, + "grad_norm": 0.25, + "learning_rate": 2.9940896369584394e-05, + "loss": 2.775, + "num_input_tokens_seen": 5025300480, + "step": 9585 + }, + { + "epoch": 0.4650796248810325, + "grad_norm": 0.25390625, + "learning_rate": 2.992124328022784e-05, + "loss": 2.775, + "num_input_tokens_seen": 5027921920, + "step": 9590 + }, + { + "epoch": 0.4653221064372791, + "grad_norm": 0.25390625, + "learning_rate": 2.9901587026481072e-05, + "loss": 2.7668, + "num_input_tokens_seen": 5030543360, + "step": 9595 + }, + { + "epoch": 0.46556458799352574, + "grad_norm": 0.2578125, + "learning_rate": 2.9881927620983174e-05, + "loss": 2.7841, + "num_input_tokens_seen": 5033164800, + "step": 9600 + }, + { + "epoch": 0.46556458799352574, + "eval_accuracy": 0.45567334310372903, + "eval_loss": 2.743530511856079, + "eval_runtime": 6.337, + "eval_samples_per_second": 47.341, + "eval_steps_per_second": 5.997, + "num_input_tokens_seen": 5033164800, + "step": 9600 + }, + { + "epoch": 0.46580706954977236, + "grad_norm": 0.25, + "learning_rate": 2.9862265076375285e-05, + "loss": 2.7953, + "num_input_tokens_seen": 5035786240, + "step": 9605 + }, + { + "epoch": 0.466049551106019, + "grad_norm": 0.2470703125, + "learning_rate": 2.9842599405300524e-05, + "loss": 2.7845, + "num_input_tokens_seen": 5038407680, + "step": 9610 + }, + { + "epoch": 0.46629203266226565, + "grad_norm": 0.2490234375, + "learning_rate": 2.9822930620404065e-05, + "loss": 2.7753, + "num_input_tokens_seen": 5041029120, + "step": 9615 + }, + { + "epoch": 0.4665345142185123, + "grad_norm": 0.24609375, + "learning_rate": 2.9803258734333033e-05, + "loss": 2.7817, + "num_input_tokens_seen": 5043650560, + "step": 9620 + }, + { + "epoch": 0.4667769957747589, + "grad_norm": 0.25, + "learning_rate": 2.9783583759736587e-05, + "loss": 2.7734, + "num_input_tokens_seen": 5046272000, + "step": 9625 + }, + { + "epoch": 0.4670194773310055, + "grad_norm": 0.25, + "learning_rate": 2.976390570926586e-05, + "loss": 2.7708, + "num_input_tokens_seen": 5048893440, + "step": 9630 + }, + { + "epoch": 0.46726195888725214, + "grad_norm": 0.2451171875, + "learning_rate": 2.9744224595573956e-05, + "loss": 2.7804, + "num_input_tokens_seen": 5051514880, + "step": 9635 + }, + { + "epoch": 0.46750444044349876, + "grad_norm": 0.248046875, + "learning_rate": 2.9724540431315962e-05, + "loss": 2.774, + "num_input_tokens_seen": 5054136320, + "step": 9640 + }, + { + "epoch": 0.4677469219997454, + "grad_norm": 0.25390625, + "learning_rate": 2.970485322914891e-05, + "loss": 2.7814, + "num_input_tokens_seen": 5056757760, + "step": 9645 + }, + { + "epoch": 0.467989403555992, + "grad_norm": 0.251953125, + "learning_rate": 2.9685163001731803e-05, + "loss": 2.7618, + "num_input_tokens_seen": 5059379200, + "step": 9650 + }, + { + "epoch": 0.4682318851122387, + "grad_norm": 0.24609375, + "learning_rate": 2.9665469761725567e-05, + "loss": 2.7925, + "num_input_tokens_seen": 5062000640, + "step": 9655 + }, + { + "epoch": 0.4684743666684853, + "grad_norm": 0.259765625, + "learning_rate": 2.964577352179309e-05, + "loss": 2.7791, + "num_input_tokens_seen": 5064622080, + "step": 9660 + }, + { + "epoch": 0.4687168482247319, + "grad_norm": 0.251953125, + "learning_rate": 2.9626074294599177e-05, + "loss": 2.7738, + "num_input_tokens_seen": 5067243520, + "step": 9665 + }, + { + "epoch": 0.46895932978097854, + "grad_norm": 0.25, + "learning_rate": 2.9606372092810554e-05, + "loss": 2.7793, + "num_input_tokens_seen": 5069864960, + "step": 9670 + }, + { + "epoch": 0.46920181133722516, + "grad_norm": 0.255859375, + "learning_rate": 2.9586666929095857e-05, + "loss": 2.7774, + "num_input_tokens_seen": 5072486400, + "step": 9675 + }, + { + "epoch": 0.4694442928934718, + "grad_norm": 0.248046875, + "learning_rate": 2.9566958816125628e-05, + "loss": 2.7787, + "num_input_tokens_seen": 5075107840, + "step": 9680 + }, + { + "epoch": 0.4696867744497184, + "grad_norm": 0.251953125, + "learning_rate": 2.954724776657231e-05, + "loss": 2.7714, + "num_input_tokens_seen": 5077729280, + "step": 9685 + }, + { + "epoch": 0.469929256005965, + "grad_norm": 0.251953125, + "learning_rate": 2.952753379311023e-05, + "loss": 2.7759, + "num_input_tokens_seen": 5080350720, + "step": 9690 + }, + { + "epoch": 0.4701717375622117, + "grad_norm": 0.263671875, + "learning_rate": 2.9507816908415598e-05, + "loss": 2.784, + "num_input_tokens_seen": 5082972160, + "step": 9695 + }, + { + "epoch": 0.4704142191184583, + "grad_norm": 0.244140625, + "learning_rate": 2.948809712516649e-05, + "loss": 2.7722, + "num_input_tokens_seen": 5085593600, + "step": 9700 + }, + { + "epoch": 0.47065670067470494, + "grad_norm": 0.259765625, + "learning_rate": 2.9468374456042857e-05, + "loss": 2.7904, + "num_input_tokens_seen": 5088215040, + "step": 9705 + }, + { + "epoch": 0.47089918223095156, + "grad_norm": 0.2578125, + "learning_rate": 2.9448648913726495e-05, + "loss": 2.771, + "num_input_tokens_seen": 5090836480, + "step": 9710 + }, + { + "epoch": 0.4711416637871982, + "grad_norm": 0.251953125, + "learning_rate": 2.942892051090104e-05, + "loss": 2.784, + "num_input_tokens_seen": 5093457920, + "step": 9715 + }, + { + "epoch": 0.4713841453434448, + "grad_norm": 0.25, + "learning_rate": 2.9409189260252e-05, + "loss": 2.7794, + "num_input_tokens_seen": 5096079360, + "step": 9720 + }, + { + "epoch": 0.4716266268996914, + "grad_norm": 0.251953125, + "learning_rate": 2.9389455174466684e-05, + "loss": 2.7578, + "num_input_tokens_seen": 5098700800, + "step": 9725 + }, + { + "epoch": 0.47186910845593805, + "grad_norm": 0.251953125, + "learning_rate": 2.936971826623423e-05, + "loss": 2.7874, + "num_input_tokens_seen": 5101322240, + "step": 9730 + }, + { + "epoch": 0.4721115900121847, + "grad_norm": 0.2578125, + "learning_rate": 2.9349978548245587e-05, + "loss": 2.7734, + "num_input_tokens_seen": 5103943680, + "step": 9735 + }, + { + "epoch": 0.47235407156843134, + "grad_norm": 0.25390625, + "learning_rate": 2.9330236033193538e-05, + "loss": 2.7948, + "num_input_tokens_seen": 5106565120, + "step": 9740 + }, + { + "epoch": 0.47259655312467796, + "grad_norm": 0.27734375, + "learning_rate": 2.931049073377261e-05, + "loss": 2.7769, + "num_input_tokens_seen": 5109186560, + "step": 9745 + }, + { + "epoch": 0.4728390346809246, + "grad_norm": 0.2392578125, + "learning_rate": 2.9290742662679183e-05, + "loss": 2.7823, + "num_input_tokens_seen": 5111808000, + "step": 9750 + }, + { + "epoch": 0.4730815162371712, + "grad_norm": 0.25, + "learning_rate": 2.927099183261138e-05, + "loss": 2.7735, + "num_input_tokens_seen": 5114429440, + "step": 9755 + }, + { + "epoch": 0.4733239977934178, + "grad_norm": 0.2490234375, + "learning_rate": 2.9251238256269116e-05, + "loss": 2.7771, + "num_input_tokens_seen": 5117050880, + "step": 9760 + }, + { + "epoch": 0.47356647934966445, + "grad_norm": 0.251953125, + "learning_rate": 2.923148194635405e-05, + "loss": 2.7808, + "num_input_tokens_seen": 5119672320, + "step": 9765 + }, + { + "epoch": 0.47380896090591107, + "grad_norm": 0.248046875, + "learning_rate": 2.9211722915569622e-05, + "loss": 2.7758, + "num_input_tokens_seen": 5122293760, + "step": 9770 + }, + { + "epoch": 0.47405144246215775, + "grad_norm": 0.255859375, + "learning_rate": 2.9191961176621007e-05, + "loss": 2.7765, + "num_input_tokens_seen": 5124915200, + "step": 9775 + }, + { + "epoch": 0.47429392401840437, + "grad_norm": 0.251953125, + "learning_rate": 2.9172196742215135e-05, + "loss": 2.7782, + "num_input_tokens_seen": 5127536640, + "step": 9780 + }, + { + "epoch": 0.474536405574651, + "grad_norm": 0.2451171875, + "learning_rate": 2.9152429625060664e-05, + "loss": 2.7861, + "num_input_tokens_seen": 5130158080, + "step": 9785 + }, + { + "epoch": 0.4747788871308976, + "grad_norm": 0.2490234375, + "learning_rate": 2.913265983786796e-05, + "loss": 2.7683, + "num_input_tokens_seen": 5132779520, + "step": 9790 + }, + { + "epoch": 0.47502136868714423, + "grad_norm": 0.26953125, + "learning_rate": 2.911288739334912e-05, + "loss": 2.7809, + "num_input_tokens_seen": 5135400960, + "step": 9795 + }, + { + "epoch": 0.47526385024339085, + "grad_norm": 0.255859375, + "learning_rate": 2.9093112304217962e-05, + "loss": 2.776, + "num_input_tokens_seen": 5138022400, + "step": 9800 + }, + { + "epoch": 0.47550633179963747, + "grad_norm": 0.26171875, + "learning_rate": 2.907333458318998e-05, + "loss": 2.7738, + "num_input_tokens_seen": 5140643840, + "step": 9805 + }, + { + "epoch": 0.4757488133558841, + "grad_norm": 0.2470703125, + "learning_rate": 2.905355424298239e-05, + "loss": 2.7948, + "num_input_tokens_seen": 5143265280, + "step": 9810 + }, + { + "epoch": 0.47599129491213077, + "grad_norm": 0.2431640625, + "learning_rate": 2.903377129631406e-05, + "loss": 2.7737, + "num_input_tokens_seen": 5145886720, + "step": 9815 + }, + { + "epoch": 0.4762337764683774, + "grad_norm": 0.2431640625, + "learning_rate": 2.9013985755905544e-05, + "loss": 2.7716, + "num_input_tokens_seen": 5148508160, + "step": 9820 + }, + { + "epoch": 0.476476258024624, + "grad_norm": 0.24609375, + "learning_rate": 2.899419763447908e-05, + "loss": 2.7773, + "num_input_tokens_seen": 5151129600, + "step": 9825 + }, + { + "epoch": 0.47671873958087063, + "grad_norm": 0.251953125, + "learning_rate": 2.8974406944758548e-05, + "loss": 2.779, + "num_input_tokens_seen": 5153751040, + "step": 9830 + }, + { + "epoch": 0.47696122113711725, + "grad_norm": 0.251953125, + "learning_rate": 2.8954613699469485e-05, + "loss": 2.767, + "num_input_tokens_seen": 5156372480, + "step": 9835 + }, + { + "epoch": 0.4772037026933639, + "grad_norm": 0.25, + "learning_rate": 2.8934817911339075e-05, + "loss": 2.79, + "num_input_tokens_seen": 5158993920, + "step": 9840 + }, + { + "epoch": 0.4774461842496105, + "grad_norm": 0.25, + "learning_rate": 2.8915019593096138e-05, + "loss": 2.7798, + "num_input_tokens_seen": 5161615360, + "step": 9845 + }, + { + "epoch": 0.4776886658058571, + "grad_norm": 0.255859375, + "learning_rate": 2.8895218757471105e-05, + "loss": 2.7674, + "num_input_tokens_seen": 5164236800, + "step": 9850 + }, + { + "epoch": 0.4779311473621038, + "grad_norm": 0.2431640625, + "learning_rate": 2.8875415417196038e-05, + "loss": 2.7766, + "num_input_tokens_seen": 5166858240, + "step": 9855 + }, + { + "epoch": 0.4781736289183504, + "grad_norm": 0.2470703125, + "learning_rate": 2.8855609585004613e-05, + "loss": 2.7881, + "num_input_tokens_seen": 5169479680, + "step": 9860 + }, + { + "epoch": 0.47841611047459703, + "grad_norm": 0.236328125, + "learning_rate": 2.8835801273632112e-05, + "loss": 2.7514, + "num_input_tokens_seen": 5172101120, + "step": 9865 + }, + { + "epoch": 0.47865859203084365, + "grad_norm": 0.2470703125, + "learning_rate": 2.8815990495815394e-05, + "loss": 2.7634, + "num_input_tokens_seen": 5174722560, + "step": 9870 + }, + { + "epoch": 0.4789010735870903, + "grad_norm": 0.2578125, + "learning_rate": 2.8796177264292905e-05, + "loss": 2.7633, + "num_input_tokens_seen": 5177344000, + "step": 9875 + }, + { + "epoch": 0.4791435551433369, + "grad_norm": 0.2578125, + "learning_rate": 2.8776361591804703e-05, + "loss": 2.7783, + "num_input_tokens_seen": 5179965440, + "step": 9880 + }, + { + "epoch": 0.4793860366995835, + "grad_norm": 0.244140625, + "learning_rate": 2.875654349109235e-05, + "loss": 2.7606, + "num_input_tokens_seen": 5182586880, + "step": 9885 + }, + { + "epoch": 0.47962851825583014, + "grad_norm": 0.25390625, + "learning_rate": 2.873672297489905e-05, + "loss": 2.7646, + "num_input_tokens_seen": 5185208320, + "step": 9890 + }, + { + "epoch": 0.4798709998120768, + "grad_norm": 0.25, + "learning_rate": 2.8716900055969497e-05, + "loss": 2.7728, + "num_input_tokens_seen": 5187829760, + "step": 9895 + }, + { + "epoch": 0.48011348136832344, + "grad_norm": 0.25390625, + "learning_rate": 2.869707474704995e-05, + "loss": 2.7734, + "num_input_tokens_seen": 5190451200, + "step": 9900 + }, + { + "epoch": 0.48011348136832344, + "eval_accuracy": 0.4556896270965641, + "eval_loss": 2.743288516998291, + "eval_runtime": 6.1957, + "eval_samples_per_second": 48.421, + "eval_steps_per_second": 6.133, + "num_input_tokens_seen": 5190451200, + "step": 9900 + }, + { + "epoch": 0.48035596292457006, + "grad_norm": 0.2470703125, + "learning_rate": 2.8677247060888217e-05, + "loss": 2.7768, + "num_input_tokens_seen": 5193072640, + "step": 9905 + }, + { + "epoch": 0.4805984444808167, + "grad_norm": 0.244140625, + "learning_rate": 2.8657417010233616e-05, + "loss": 2.775, + "num_input_tokens_seen": 5195694080, + "step": 9910 + }, + { + "epoch": 0.4808409260370633, + "grad_norm": 0.23828125, + "learning_rate": 2.8637584607836997e-05, + "loss": 2.7721, + "num_input_tokens_seen": 5198315520, + "step": 9915 + }, + { + "epoch": 0.4810834075933099, + "grad_norm": 0.26171875, + "learning_rate": 2.8617749866450716e-05, + "loss": 2.775, + "num_input_tokens_seen": 5200936960, + "step": 9920 + }, + { + "epoch": 0.48132588914955654, + "grad_norm": 0.248046875, + "learning_rate": 2.8597912798828647e-05, + "loss": 2.7946, + "num_input_tokens_seen": 5203558400, + "step": 9925 + }, + { + "epoch": 0.4815683707058032, + "grad_norm": 0.2431640625, + "learning_rate": 2.8578073417726132e-05, + "loss": 2.7977, + "num_input_tokens_seen": 5206179840, + "step": 9930 + }, + { + "epoch": 0.48181085226204984, + "grad_norm": 0.2451171875, + "learning_rate": 2.8558231735900028e-05, + "loss": 2.7676, + "num_input_tokens_seen": 5208801280, + "step": 9935 + }, + { + "epoch": 0.48205333381829646, + "grad_norm": 0.255859375, + "learning_rate": 2.8538387766108655e-05, + "loss": 2.7659, + "num_input_tokens_seen": 5211422720, + "step": 9940 + }, + { + "epoch": 0.4822958153745431, + "grad_norm": 0.251953125, + "learning_rate": 2.8518541521111813e-05, + "loss": 2.7696, + "num_input_tokens_seen": 5214044160, + "step": 9945 + }, + { + "epoch": 0.4825382969307897, + "grad_norm": 0.251953125, + "learning_rate": 2.849869301367076e-05, + "loss": 2.7786, + "num_input_tokens_seen": 5216665600, + "step": 9950 + }, + { + "epoch": 0.4827807784870363, + "grad_norm": 0.255859375, + "learning_rate": 2.8478842256548215e-05, + "loss": 2.7738, + "num_input_tokens_seen": 5219287040, + "step": 9955 + }, + { + "epoch": 0.48302326004328294, + "grad_norm": 0.255859375, + "learning_rate": 2.8458989262508334e-05, + "loss": 2.7723, + "num_input_tokens_seen": 5221908480, + "step": 9960 + }, + { + "epoch": 0.48326574159952956, + "grad_norm": 0.2470703125, + "learning_rate": 2.8439134044316716e-05, + "loss": 2.7677, + "num_input_tokens_seen": 5224529920, + "step": 9965 + }, + { + "epoch": 0.48350822315577624, + "grad_norm": 0.248046875, + "learning_rate": 2.8419276614740397e-05, + "loss": 2.7737, + "num_input_tokens_seen": 5227151360, + "step": 9970 + }, + { + "epoch": 0.48375070471202286, + "grad_norm": 0.24609375, + "learning_rate": 2.8399416986547817e-05, + "loss": 2.7726, + "num_input_tokens_seen": 5229772800, + "step": 9975 + }, + { + "epoch": 0.4839931862682695, + "grad_norm": 0.24609375, + "learning_rate": 2.8379555172508853e-05, + "loss": 2.777, + "num_input_tokens_seen": 5232394240, + "step": 9980 + }, + { + "epoch": 0.4842356678245161, + "grad_norm": 0.25, + "learning_rate": 2.835969118539477e-05, + "loss": 2.7792, + "num_input_tokens_seen": 5235015680, + "step": 9985 + }, + { + "epoch": 0.4844781493807627, + "grad_norm": 0.248046875, + "learning_rate": 2.8339825037978234e-05, + "loss": 2.7743, + "num_input_tokens_seen": 5237637120, + "step": 9990 + }, + { + "epoch": 0.48472063093700934, + "grad_norm": 0.251953125, + "learning_rate": 2.831995674303331e-05, + "loss": 2.7795, + "num_input_tokens_seen": 5240258560, + "step": 9995 + }, + { + "epoch": 0.48496311249325597, + "grad_norm": 0.259765625, + "learning_rate": 2.830008631333543e-05, + "loss": 2.779, + "num_input_tokens_seen": 5242880000, + "step": 10000 + }, + { + "epoch": 0.4852055940495026, + "grad_norm": 0.2578125, + "learning_rate": 2.8280213761661394e-05, + "loss": 2.793, + "num_input_tokens_seen": 5245501440, + "step": 10005 + }, + { + "epoch": 0.48544807560574926, + "grad_norm": 0.2470703125, + "learning_rate": 2.8260339100789397e-05, + "loss": 2.7825, + "num_input_tokens_seen": 5248122880, + "step": 10010 + }, + { + "epoch": 0.4856905571619959, + "grad_norm": 0.25, + "learning_rate": 2.8240462343498963e-05, + "loss": 2.7842, + "num_input_tokens_seen": 5250744320, + "step": 10015 + }, + { + "epoch": 0.4859330387182425, + "grad_norm": 0.255859375, + "learning_rate": 2.822058350257097e-05, + "loss": 2.7721, + "num_input_tokens_seen": 5253365760, + "step": 10020 + }, + { + "epoch": 0.4861755202744891, + "grad_norm": 0.2412109375, + "learning_rate": 2.8200702590787637e-05, + "loss": 2.7691, + "num_input_tokens_seen": 5255987200, + "step": 10025 + }, + { + "epoch": 0.48641800183073575, + "grad_norm": 0.25, + "learning_rate": 2.8180819620932503e-05, + "loss": 2.7571, + "num_input_tokens_seen": 5258608640, + "step": 10030 + }, + { + "epoch": 0.48666048338698237, + "grad_norm": 0.255859375, + "learning_rate": 2.816093460579047e-05, + "loss": 2.7751, + "num_input_tokens_seen": 5261230080, + "step": 10035 + }, + { + "epoch": 0.486902964943229, + "grad_norm": 0.251953125, + "learning_rate": 2.8141047558147704e-05, + "loss": 2.771, + "num_input_tokens_seen": 5263851520, + "step": 10040 + }, + { + "epoch": 0.4871454464994756, + "grad_norm": 0.2451171875, + "learning_rate": 2.8121158490791706e-05, + "loss": 2.788, + "num_input_tokens_seen": 5266472960, + "step": 10045 + }, + { + "epoch": 0.4873879280557223, + "grad_norm": 0.259765625, + "learning_rate": 2.810126741651128e-05, + "loss": 2.7768, + "num_input_tokens_seen": 5269094400, + "step": 10050 + }, + { + "epoch": 0.4876304096119689, + "grad_norm": 0.25, + "learning_rate": 2.808137434809649e-05, + "loss": 2.7901, + "num_input_tokens_seen": 5271715840, + "step": 10055 + }, + { + "epoch": 0.48787289116821553, + "grad_norm": 0.25390625, + "learning_rate": 2.806147929833872e-05, + "loss": 2.7703, + "num_input_tokens_seen": 5274337280, + "step": 10060 + }, + { + "epoch": 0.48811537272446215, + "grad_norm": 0.2490234375, + "learning_rate": 2.804158228003062e-05, + "loss": 2.7837, + "num_input_tokens_seen": 5276958720, + "step": 10065 + }, + { + "epoch": 0.48835785428070877, + "grad_norm": 0.259765625, + "learning_rate": 2.8021683305966078e-05, + "loss": 2.7862, + "num_input_tokens_seen": 5279580160, + "step": 10070 + }, + { + "epoch": 0.4886003358369554, + "grad_norm": 0.2451171875, + "learning_rate": 2.8001782388940268e-05, + "loss": 2.7691, + "num_input_tokens_seen": 5282201600, + "step": 10075 + }, + { + "epoch": 0.488842817393202, + "grad_norm": 0.2451171875, + "learning_rate": 2.79818795417496e-05, + "loss": 2.7753, + "num_input_tokens_seen": 5284823040, + "step": 10080 + }, + { + "epoch": 0.48908529894944863, + "grad_norm": 0.25, + "learning_rate": 2.7961974777191736e-05, + "loss": 2.7761, + "num_input_tokens_seen": 5287444480, + "step": 10085 + }, + { + "epoch": 0.4893277805056953, + "grad_norm": 0.25390625, + "learning_rate": 2.7942068108065555e-05, + "loss": 2.7633, + "num_input_tokens_seen": 5290065920, + "step": 10090 + }, + { + "epoch": 0.48957026206194193, + "grad_norm": 0.251953125, + "learning_rate": 2.7922159547171184e-05, + "loss": 2.7823, + "num_input_tokens_seen": 5292687360, + "step": 10095 + }, + { + "epoch": 0.48981274361818855, + "grad_norm": 0.251953125, + "learning_rate": 2.7902249107309943e-05, + "loss": 2.7701, + "num_input_tokens_seen": 5295308800, + "step": 10100 + }, + { + "epoch": 0.49005522517443517, + "grad_norm": 0.2470703125, + "learning_rate": 2.788233680128436e-05, + "loss": 2.7816, + "num_input_tokens_seen": 5297930240, + "step": 10105 + }, + { + "epoch": 0.4902977067306818, + "grad_norm": 0.2412109375, + "learning_rate": 2.7862422641898182e-05, + "loss": 2.7677, + "num_input_tokens_seen": 5300551680, + "step": 10110 + }, + { + "epoch": 0.4905401882869284, + "grad_norm": 0.265625, + "learning_rate": 2.7842506641956346e-05, + "loss": 2.7749, + "num_input_tokens_seen": 5303173120, + "step": 10115 + }, + { + "epoch": 0.49078266984317503, + "grad_norm": 0.25, + "learning_rate": 2.782258881426495e-05, + "loss": 2.7748, + "num_input_tokens_seen": 5305794560, + "step": 10120 + }, + { + "epoch": 0.49102515139942166, + "grad_norm": 0.248046875, + "learning_rate": 2.7802669171631297e-05, + "loss": 2.7566, + "num_input_tokens_seen": 5308416000, + "step": 10125 + }, + { + "epoch": 0.49126763295566833, + "grad_norm": 0.25390625, + "learning_rate": 2.7782747726863827e-05, + "loss": 2.7735, + "num_input_tokens_seen": 5311037440, + "step": 10130 + }, + { + "epoch": 0.49151011451191495, + "grad_norm": 0.25390625, + "learning_rate": 2.776282449277216e-05, + "loss": 2.7642, + "num_input_tokens_seen": 5313658880, + "step": 10135 + }, + { + "epoch": 0.4917525960681616, + "grad_norm": 0.2431640625, + "learning_rate": 2.7742899482167063e-05, + "loss": 2.7836, + "num_input_tokens_seen": 5316280320, + "step": 10140 + }, + { + "epoch": 0.4919950776244082, + "grad_norm": 0.2412109375, + "learning_rate": 2.7722972707860435e-05, + "loss": 2.7686, + "num_input_tokens_seen": 5318901760, + "step": 10145 + }, + { + "epoch": 0.4922375591806548, + "grad_norm": 0.248046875, + "learning_rate": 2.770304418266532e-05, + "loss": 2.7712, + "num_input_tokens_seen": 5321523200, + "step": 10150 + }, + { + "epoch": 0.49248004073690144, + "grad_norm": 0.2421875, + "learning_rate": 2.768311391939589e-05, + "loss": 2.7749, + "num_input_tokens_seen": 5324144640, + "step": 10155 + }, + { + "epoch": 0.49272252229314806, + "grad_norm": 0.24609375, + "learning_rate": 2.7663181930867428e-05, + "loss": 2.7686, + "num_input_tokens_seen": 5326766080, + "step": 10160 + }, + { + "epoch": 0.4929650038493947, + "grad_norm": 0.251953125, + "learning_rate": 2.7643248229896314e-05, + "loss": 2.7831, + "num_input_tokens_seen": 5329387520, + "step": 10165 + }, + { + "epoch": 0.49320748540564135, + "grad_norm": 0.255859375, + "learning_rate": 2.762331282930005e-05, + "loss": 2.7871, + "num_input_tokens_seen": 5332008960, + "step": 10170 + }, + { + "epoch": 0.493449966961888, + "grad_norm": 0.25390625, + "learning_rate": 2.7603375741897235e-05, + "loss": 2.7793, + "num_input_tokens_seen": 5334630400, + "step": 10175 + }, + { + "epoch": 0.4936924485181346, + "grad_norm": 0.251953125, + "learning_rate": 2.7583436980507528e-05, + "loss": 2.776, + "num_input_tokens_seen": 5337251840, + "step": 10180 + }, + { + "epoch": 0.4939349300743812, + "grad_norm": 0.259765625, + "learning_rate": 2.756349655795168e-05, + "loss": 2.7764, + "num_input_tokens_seen": 5339873280, + "step": 10185 + }, + { + "epoch": 0.49417741163062784, + "grad_norm": 0.24609375, + "learning_rate": 2.754355448705151e-05, + "loss": 2.7685, + "num_input_tokens_seen": 5342494720, + "step": 10190 + }, + { + "epoch": 0.49441989318687446, + "grad_norm": 0.24609375, + "learning_rate": 2.7523610780629893e-05, + "loss": 2.7762, + "num_input_tokens_seen": 5345116160, + "step": 10195 + }, + { + "epoch": 0.4946623747431211, + "grad_norm": 0.2412109375, + "learning_rate": 2.7503665451510746e-05, + "loss": 2.7829, + "num_input_tokens_seen": 5347737600, + "step": 10200 + }, + { + "epoch": 0.4946623747431211, + "eval_accuracy": 0.45571893828366716, + "eval_loss": 2.742992639541626, + "eval_runtime": 5.8537, + "eval_samples_per_second": 51.25, + "eval_steps_per_second": 6.492, + "num_input_tokens_seen": 5347737600, + "step": 10200 + }, + { + "epoch": 0.49490485629936776, + "grad_norm": 0.25, + "learning_rate": 2.7483718512519062e-05, + "loss": 2.7699, + "num_input_tokens_seen": 5350359040, + "step": 10205 + }, + { + "epoch": 0.4951473378556144, + "grad_norm": 0.25390625, + "learning_rate": 2.7463769976480837e-05, + "loss": 2.7728, + "num_input_tokens_seen": 5352980480, + "step": 10210 + }, + { + "epoch": 0.495389819411861, + "grad_norm": 0.2470703125, + "learning_rate": 2.7443819856223097e-05, + "loss": 2.778, + "num_input_tokens_seen": 5355601920, + "step": 10215 + }, + { + "epoch": 0.4956323009681076, + "grad_norm": 0.2431640625, + "learning_rate": 2.7423868164573907e-05, + "loss": 2.7786, + "num_input_tokens_seen": 5358223360, + "step": 10220 + }, + { + "epoch": 0.49587478252435424, + "grad_norm": 0.255859375, + "learning_rate": 2.7403914914362317e-05, + "loss": 2.7663, + "num_input_tokens_seen": 5360844800, + "step": 10225 + }, + { + "epoch": 0.49611726408060086, + "grad_norm": 0.2373046875, + "learning_rate": 2.73839601184184e-05, + "loss": 2.7782, + "num_input_tokens_seen": 5363466240, + "step": 10230 + }, + { + "epoch": 0.4963597456368475, + "grad_norm": 0.25390625, + "learning_rate": 2.7364003789573216e-05, + "loss": 2.7852, + "num_input_tokens_seen": 5366087680, + "step": 10235 + }, + { + "epoch": 0.4966022271930941, + "grad_norm": 0.25390625, + "learning_rate": 2.7344045940658807e-05, + "loss": 2.7884, + "num_input_tokens_seen": 5368709120, + "step": 10240 + }, + { + "epoch": 0.4968447087493408, + "grad_norm": 0.251953125, + "learning_rate": 2.732408658450819e-05, + "loss": 2.7718, + "num_input_tokens_seen": 5371330560, + "step": 10245 + }, + { + "epoch": 0.4970871903055874, + "grad_norm": 0.2470703125, + "learning_rate": 2.730412573395536e-05, + "loss": 2.7532, + "num_input_tokens_seen": 5373952000, + "step": 10250 + }, + { + "epoch": 0.497329671861834, + "grad_norm": 0.2431640625, + "learning_rate": 2.7284163401835274e-05, + "loss": 2.7714, + "num_input_tokens_seen": 5376573440, + "step": 10255 + }, + { + "epoch": 0.49757215341808064, + "grad_norm": 0.259765625, + "learning_rate": 2.726419960098382e-05, + "loss": 2.7828, + "num_input_tokens_seen": 5379194880, + "step": 10260 + }, + { + "epoch": 0.49781463497432726, + "grad_norm": 0.255859375, + "learning_rate": 2.7244234344237868e-05, + "loss": 2.773, + "num_input_tokens_seen": 5381816320, + "step": 10265 + }, + { + "epoch": 0.4980571165305739, + "grad_norm": 0.251953125, + "learning_rate": 2.722426764443519e-05, + "loss": 2.7669, + "num_input_tokens_seen": 5384437760, + "step": 10270 + }, + { + "epoch": 0.4982995980868205, + "grad_norm": 0.248046875, + "learning_rate": 2.72042995144145e-05, + "loss": 2.765, + "num_input_tokens_seen": 5387059200, + "step": 10275 + }, + { + "epoch": 0.4985420796430671, + "grad_norm": 0.24609375, + "learning_rate": 2.718432996701543e-05, + "loss": 2.7815, + "num_input_tokens_seen": 5389680640, + "step": 10280 + }, + { + "epoch": 0.4987845611993138, + "grad_norm": 0.2470703125, + "learning_rate": 2.7164359015078533e-05, + "loss": 2.7781, + "num_input_tokens_seen": 5392302080, + "step": 10285 + }, + { + "epoch": 0.4990270427555604, + "grad_norm": 0.255859375, + "learning_rate": 2.7144386671445242e-05, + "loss": 2.7668, + "num_input_tokens_seen": 5394923520, + "step": 10290 + }, + { + "epoch": 0.49926952431180704, + "grad_norm": 0.244140625, + "learning_rate": 2.7124412948957913e-05, + "loss": 2.7715, + "num_input_tokens_seen": 5397544960, + "step": 10295 + }, + { + "epoch": 0.49951200586805367, + "grad_norm": 0.25390625, + "learning_rate": 2.7104437860459763e-05, + "loss": 2.7773, + "num_input_tokens_seen": 5400166400, + "step": 10300 + }, + { + "epoch": 0.4997544874243003, + "grad_norm": 0.248046875, + "learning_rate": 2.7084461418794903e-05, + "loss": 2.7761, + "num_input_tokens_seen": 5402787840, + "step": 10305 + }, + { + "epoch": 0.4999969689805469, + "grad_norm": 0.251953125, + "learning_rate": 2.7064483636808313e-05, + "loss": 2.7861, + "num_input_tokens_seen": 5405409280, + "step": 10310 + }, + { + "epoch": 0.5002394505367935, + "grad_norm": 0.25, + "learning_rate": 2.7044504527345826e-05, + "loss": 2.7889, + "num_input_tokens_seen": 5408030720, + "step": 10315 + }, + { + "epoch": 0.5004819320930401, + "grad_norm": 0.244140625, + "learning_rate": 2.702452410325414e-05, + "loss": 2.7849, + "num_input_tokens_seen": 5410652160, + "step": 10320 + }, + { + "epoch": 0.5007244136492868, + "grad_norm": 0.251953125, + "learning_rate": 2.7004542377380788e-05, + "loss": 2.7642, + "num_input_tokens_seen": 5413273600, + "step": 10325 + }, + { + "epoch": 0.5009668952055334, + "grad_norm": 0.2490234375, + "learning_rate": 2.698455936257415e-05, + "loss": 2.763, + "num_input_tokens_seen": 5415895040, + "step": 10330 + }, + { + "epoch": 0.50120937676178, + "grad_norm": 0.2470703125, + "learning_rate": 2.6964575071683423e-05, + "loss": 2.7908, + "num_input_tokens_seen": 5418516480, + "step": 10335 + }, + { + "epoch": 0.5014518583180266, + "grad_norm": 0.2451171875, + "learning_rate": 2.694458951755863e-05, + "loss": 2.7781, + "num_input_tokens_seen": 5421137920, + "step": 10340 + }, + { + "epoch": 0.5016943398742734, + "grad_norm": 0.2421875, + "learning_rate": 2.6924602713050623e-05, + "loss": 2.7945, + "num_input_tokens_seen": 5423759360, + "step": 10345 + }, + { + "epoch": 0.50193682143052, + "grad_norm": 0.248046875, + "learning_rate": 2.6904614671011025e-05, + "loss": 2.7726, + "num_input_tokens_seen": 5426380800, + "step": 10350 + }, + { + "epoch": 0.5021793029867666, + "grad_norm": 0.251953125, + "learning_rate": 2.688462540429228e-05, + "loss": 2.7678, + "num_input_tokens_seen": 5429002240, + "step": 10355 + }, + { + "epoch": 0.5024217845430132, + "grad_norm": 0.255859375, + "learning_rate": 2.686463492574761e-05, + "loss": 2.7737, + "num_input_tokens_seen": 5431623680, + "step": 10360 + }, + { + "epoch": 0.5026642660992598, + "grad_norm": 0.248046875, + "learning_rate": 2.6844643248231016e-05, + "loss": 2.782, + "num_input_tokens_seen": 5434245120, + "step": 10365 + }, + { + "epoch": 0.5029067476555065, + "grad_norm": 0.2421875, + "learning_rate": 2.6824650384597272e-05, + "loss": 2.771, + "num_input_tokens_seen": 5436866560, + "step": 10370 + }, + { + "epoch": 0.5031492292117531, + "grad_norm": 0.2470703125, + "learning_rate": 2.6804656347701923e-05, + "loss": 2.7808, + "num_input_tokens_seen": 5439488000, + "step": 10375 + }, + { + "epoch": 0.5033917107679997, + "grad_norm": 0.25, + "learning_rate": 2.6784661150401248e-05, + "loss": 2.7828, + "num_input_tokens_seen": 5442109440, + "step": 10380 + }, + { + "epoch": 0.5036341923242463, + "grad_norm": 0.251953125, + "learning_rate": 2.6764664805552287e-05, + "loss": 2.7549, + "num_input_tokens_seen": 5444730880, + "step": 10385 + }, + { + "epoch": 0.503876673880493, + "grad_norm": 0.251953125, + "learning_rate": 2.6744667326012822e-05, + "loss": 2.7733, + "num_input_tokens_seen": 5447352320, + "step": 10390 + }, + { + "epoch": 0.5041191554367396, + "grad_norm": 0.2421875, + "learning_rate": 2.672466872464134e-05, + "loss": 2.7889, + "num_input_tokens_seen": 5449973760, + "step": 10395 + }, + { + "epoch": 0.5043616369929862, + "grad_norm": 0.25390625, + "learning_rate": 2.6704669014297083e-05, + "loss": 2.7714, + "num_input_tokens_seen": 5452595200, + "step": 10400 + }, + { + "epoch": 0.5046041185492328, + "grad_norm": 0.2578125, + "learning_rate": 2.6684668207839996e-05, + "loss": 2.7692, + "num_input_tokens_seen": 5455216640, + "step": 10405 + }, + { + "epoch": 0.5048466001054794, + "grad_norm": 0.2490234375, + "learning_rate": 2.6664666318130704e-05, + "loss": 2.7891, + "num_input_tokens_seen": 5457838080, + "step": 10410 + }, + { + "epoch": 0.5050890816617261, + "grad_norm": 0.2578125, + "learning_rate": 2.6644663358030552e-05, + "loss": 2.7877, + "num_input_tokens_seen": 5460459520, + "step": 10415 + }, + { + "epoch": 0.5053315632179727, + "grad_norm": 0.255859375, + "learning_rate": 2.6624659340401576e-05, + "loss": 2.7846, + "num_input_tokens_seen": 5463080960, + "step": 10420 + }, + { + "epoch": 0.5055740447742194, + "grad_norm": 0.2578125, + "learning_rate": 2.6604654278106477e-05, + "loss": 2.7814, + "num_input_tokens_seen": 5465702400, + "step": 10425 + }, + { + "epoch": 0.505816526330466, + "grad_norm": 0.25, + "learning_rate": 2.6584648184008642e-05, + "loss": 2.7728, + "num_input_tokens_seen": 5468323840, + "step": 10430 + }, + { + "epoch": 0.5060590078867127, + "grad_norm": 0.24609375, + "learning_rate": 2.6564641070972117e-05, + "loss": 2.7744, + "num_input_tokens_seen": 5470945280, + "step": 10435 + }, + { + "epoch": 0.5063014894429593, + "grad_norm": 0.259765625, + "learning_rate": 2.6544632951861586e-05, + "loss": 2.7633, + "num_input_tokens_seen": 5473566720, + "step": 10440 + }, + { + "epoch": 0.5065439709992059, + "grad_norm": 0.2421875, + "learning_rate": 2.6524623839542408e-05, + "loss": 2.7841, + "num_input_tokens_seen": 5476188160, + "step": 10445 + }, + { + "epoch": 0.5067864525554525, + "grad_norm": 0.244140625, + "learning_rate": 2.6504613746880557e-05, + "loss": 2.7543, + "num_input_tokens_seen": 5478809600, + "step": 10450 + }, + { + "epoch": 0.5070289341116991, + "grad_norm": 0.25390625, + "learning_rate": 2.648460268674266e-05, + "loss": 2.7903, + "num_input_tokens_seen": 5481431040, + "step": 10455 + }, + { + "epoch": 0.5072714156679458, + "grad_norm": 0.251953125, + "learning_rate": 2.6464590671995943e-05, + "loss": 2.7832, + "num_input_tokens_seen": 5484052480, + "step": 10460 + }, + { + "epoch": 0.5075138972241924, + "grad_norm": 0.255859375, + "learning_rate": 2.6444577715508268e-05, + "loss": 2.7857, + "num_input_tokens_seen": 5486673920, + "step": 10465 + }, + { + "epoch": 0.507756378780439, + "grad_norm": 0.2451171875, + "learning_rate": 2.642456383014808e-05, + "loss": 2.7707, + "num_input_tokens_seen": 5489295360, + "step": 10470 + }, + { + "epoch": 0.5079988603366856, + "grad_norm": 0.25390625, + "learning_rate": 2.6404549028784438e-05, + "loss": 2.7745, + "num_input_tokens_seen": 5491916800, + "step": 10475 + }, + { + "epoch": 0.5082413418929322, + "grad_norm": 0.2421875, + "learning_rate": 2.6384533324286982e-05, + "loss": 2.7767, + "num_input_tokens_seen": 5494538240, + "step": 10480 + }, + { + "epoch": 0.5084838234491789, + "grad_norm": 0.2412109375, + "learning_rate": 2.636451672952594e-05, + "loss": 2.7725, + "num_input_tokens_seen": 5497159680, + "step": 10485 + }, + { + "epoch": 0.5087263050054255, + "grad_norm": 0.251953125, + "learning_rate": 2.63444992573721e-05, + "loss": 2.7672, + "num_input_tokens_seen": 5499781120, + "step": 10490 + }, + { + "epoch": 0.5089687865616721, + "grad_norm": 0.259765625, + "learning_rate": 2.632448092069683e-05, + "loss": 2.7843, + "num_input_tokens_seen": 5502402560, + "step": 10495 + }, + { + "epoch": 0.5092112681179188, + "grad_norm": 0.2490234375, + "learning_rate": 2.6304461732372047e-05, + "loss": 2.781, + "num_input_tokens_seen": 5505024000, + "step": 10500 + }, + { + "epoch": 0.5092112681179188, + "eval_accuracy": 0.45571568148510017, + "eval_loss": 2.7428910732269287, + "eval_runtime": 6.7158, + "eval_samples_per_second": 44.671, + "eval_steps_per_second": 5.658, + "num_input_tokens_seen": 5505024000, + "step": 10500 + }, + { + "epoch": 0.5094537496741655, + "grad_norm": 0.251953125, + "learning_rate": 2.62844417052702e-05, + "loss": 2.786, + "num_input_tokens_seen": 5507645440, + "step": 10505 + }, + { + "epoch": 0.5096962312304121, + "grad_norm": 0.25, + "learning_rate": 2.6264420852264297e-05, + "loss": 2.7865, + "num_input_tokens_seen": 5510266880, + "step": 10510 + }, + { + "epoch": 0.5099387127866587, + "grad_norm": 0.2470703125, + "learning_rate": 2.624439918622789e-05, + "loss": 2.7763, + "num_input_tokens_seen": 5512888320, + "step": 10515 + }, + { + "epoch": 0.5101811943429053, + "grad_norm": 0.244140625, + "learning_rate": 2.6224376720035014e-05, + "loss": 2.7807, + "num_input_tokens_seen": 5515509760, + "step": 10520 + }, + { + "epoch": 0.5104236758991519, + "grad_norm": 0.2578125, + "learning_rate": 2.620435346656025e-05, + "loss": 2.7708, + "num_input_tokens_seen": 5518131200, + "step": 10525 + }, + { + "epoch": 0.5106661574553986, + "grad_norm": 0.265625, + "learning_rate": 2.6184329438678677e-05, + "loss": 2.7785, + "num_input_tokens_seen": 5520752640, + "step": 10530 + }, + { + "epoch": 0.5109086390116452, + "grad_norm": 0.251953125, + "learning_rate": 2.6164304649265852e-05, + "loss": 2.789, + "num_input_tokens_seen": 5523374080, + "step": 10535 + }, + { + "epoch": 0.5111511205678918, + "grad_norm": 0.2451171875, + "learning_rate": 2.614427911119786e-05, + "loss": 2.7725, + "num_input_tokens_seen": 5525995520, + "step": 10540 + }, + { + "epoch": 0.5113936021241384, + "grad_norm": 0.24609375, + "learning_rate": 2.6124252837351247e-05, + "loss": 2.7703, + "num_input_tokens_seen": 5528616960, + "step": 10545 + }, + { + "epoch": 0.511636083680385, + "grad_norm": 0.2412109375, + "learning_rate": 2.6104225840603024e-05, + "loss": 2.7767, + "num_input_tokens_seen": 5531238400, + "step": 10550 + }, + { + "epoch": 0.5118785652366317, + "grad_norm": 0.24609375, + "learning_rate": 2.608419813383067e-05, + "loss": 2.7687, + "num_input_tokens_seen": 5533859840, + "step": 10555 + }, + { + "epoch": 0.5121210467928783, + "grad_norm": 0.251953125, + "learning_rate": 2.6064169729912145e-05, + "loss": 2.777, + "num_input_tokens_seen": 5536481280, + "step": 10560 + }, + { + "epoch": 0.5123635283491249, + "grad_norm": 0.2353515625, + "learning_rate": 2.604414064172581e-05, + "loss": 2.7811, + "num_input_tokens_seen": 5539102720, + "step": 10565 + }, + { + "epoch": 0.5126060099053715, + "grad_norm": 0.25, + "learning_rate": 2.602411088215052e-05, + "loss": 2.7831, + "num_input_tokens_seen": 5541724160, + "step": 10570 + }, + { + "epoch": 0.5128484914616182, + "grad_norm": 0.251953125, + "learning_rate": 2.6004080464065532e-05, + "loss": 2.7783, + "num_input_tokens_seen": 5544345600, + "step": 10575 + }, + { + "epoch": 0.5130909730178649, + "grad_norm": 0.26171875, + "learning_rate": 2.598404940035052e-05, + "loss": 2.7667, + "num_input_tokens_seen": 5546967040, + "step": 10580 + }, + { + "epoch": 0.5133334545741115, + "grad_norm": 0.251953125, + "learning_rate": 2.5964017703885586e-05, + "loss": 2.7711, + "num_input_tokens_seen": 5549588480, + "step": 10585 + }, + { + "epoch": 0.5135759361303581, + "grad_norm": 0.2431640625, + "learning_rate": 2.5943985387551234e-05, + "loss": 2.7631, + "num_input_tokens_seen": 5552209920, + "step": 10590 + }, + { + "epoch": 0.5138184176866047, + "grad_norm": 0.2470703125, + "learning_rate": 2.5923952464228373e-05, + "loss": 2.7706, + "num_input_tokens_seen": 5554831360, + "step": 10595 + }, + { + "epoch": 0.5140608992428514, + "grad_norm": 0.248046875, + "learning_rate": 2.59039189467983e-05, + "loss": 2.7733, + "num_input_tokens_seen": 5557452800, + "step": 10600 + }, + { + "epoch": 0.514303380799098, + "grad_norm": 0.2578125, + "learning_rate": 2.5883884848142693e-05, + "loss": 2.7746, + "num_input_tokens_seen": 5560074240, + "step": 10605 + }, + { + "epoch": 0.5145458623553446, + "grad_norm": 0.255859375, + "learning_rate": 2.586385018114359e-05, + "loss": 2.7857, + "num_input_tokens_seen": 5562695680, + "step": 10610 + }, + { + "epoch": 0.5147883439115912, + "grad_norm": 0.248046875, + "learning_rate": 2.5843814958683423e-05, + "loss": 2.7802, + "num_input_tokens_seen": 5565317120, + "step": 10615 + }, + { + "epoch": 0.5150308254678378, + "grad_norm": 0.24609375, + "learning_rate": 2.5823779193644953e-05, + "loss": 2.7669, + "num_input_tokens_seen": 5567938560, + "step": 10620 + }, + { + "epoch": 0.5152733070240845, + "grad_norm": 0.26171875, + "learning_rate": 2.5803742898911316e-05, + "loss": 2.782, + "num_input_tokens_seen": 5570560000, + "step": 10625 + }, + { + "epoch": 0.5155157885803311, + "grad_norm": 0.2470703125, + "learning_rate": 2.578370608736596e-05, + "loss": 2.7886, + "num_input_tokens_seen": 5573181440, + "step": 10630 + }, + { + "epoch": 0.5157582701365777, + "grad_norm": 0.2470703125, + "learning_rate": 2.576366877189269e-05, + "loss": 2.7732, + "num_input_tokens_seen": 5575802880, + "step": 10635 + }, + { + "epoch": 0.5160007516928243, + "grad_norm": 0.2470703125, + "learning_rate": 2.5743630965375624e-05, + "loss": 2.7792, + "num_input_tokens_seen": 5578424320, + "step": 10640 + }, + { + "epoch": 0.516243233249071, + "grad_norm": 0.2451171875, + "learning_rate": 2.5723592680699194e-05, + "loss": 2.7736, + "num_input_tokens_seen": 5581045760, + "step": 10645 + }, + { + "epoch": 0.5164857148053176, + "grad_norm": 0.24609375, + "learning_rate": 2.5703553930748138e-05, + "loss": 2.7993, + "num_input_tokens_seen": 5583667200, + "step": 10650 + }, + { + "epoch": 0.5167281963615642, + "grad_norm": 0.248046875, + "learning_rate": 2.5683514728407505e-05, + "loss": 2.7852, + "num_input_tokens_seen": 5586288640, + "step": 10655 + }, + { + "epoch": 0.5169706779178109, + "grad_norm": 0.24609375, + "learning_rate": 2.5663475086562628e-05, + "loss": 2.7788, + "num_input_tokens_seen": 5588910080, + "step": 10660 + }, + { + "epoch": 0.5172131594740575, + "grad_norm": 0.248046875, + "learning_rate": 2.564343501809912e-05, + "loss": 2.775, + "num_input_tokens_seen": 5591531520, + "step": 10665 + }, + { + "epoch": 0.5174556410303042, + "grad_norm": 0.251953125, + "learning_rate": 2.562339453590287e-05, + "loss": 2.7701, + "num_input_tokens_seen": 5594152960, + "step": 10670 + }, + { + "epoch": 0.5176981225865508, + "grad_norm": 0.2451171875, + "learning_rate": 2.5603353652860034e-05, + "loss": 2.7722, + "num_input_tokens_seen": 5596774400, + "step": 10675 + }, + { + "epoch": 0.5179406041427974, + "grad_norm": 0.2431640625, + "learning_rate": 2.5583312381857017e-05, + "loss": 2.7631, + "num_input_tokens_seen": 5599395840, + "step": 10680 + }, + { + "epoch": 0.518183085699044, + "grad_norm": 0.2412109375, + "learning_rate": 2.5563270735780504e-05, + "loss": 2.7686, + "num_input_tokens_seen": 5602017280, + "step": 10685 + }, + { + "epoch": 0.5184255672552907, + "grad_norm": 0.259765625, + "learning_rate": 2.554322872751737e-05, + "loss": 2.7939, + "num_input_tokens_seen": 5604638720, + "step": 10690 + }, + { + "epoch": 0.5186680488115373, + "grad_norm": 0.265625, + "learning_rate": 2.5523186369954777e-05, + "loss": 2.7759, + "num_input_tokens_seen": 5607260160, + "step": 10695 + }, + { + "epoch": 0.5189105303677839, + "grad_norm": 0.2490234375, + "learning_rate": 2.5503143675980072e-05, + "loss": 2.7725, + "num_input_tokens_seen": 5609881600, + "step": 10700 + }, + { + "epoch": 0.5191530119240305, + "grad_norm": 0.25, + "learning_rate": 2.5483100658480828e-05, + "loss": 2.7973, + "num_input_tokens_seen": 5612503040, + "step": 10705 + }, + { + "epoch": 0.5193954934802771, + "grad_norm": 0.25, + "learning_rate": 2.546305733034484e-05, + "loss": 2.7867, + "num_input_tokens_seen": 5615124480, + "step": 10710 + }, + { + "epoch": 0.5196379750365238, + "grad_norm": 0.2578125, + "learning_rate": 2.54430137044601e-05, + "loss": 2.767, + "num_input_tokens_seen": 5617745920, + "step": 10715 + }, + { + "epoch": 0.5198804565927704, + "grad_norm": 0.25390625, + "learning_rate": 2.5422969793714773e-05, + "loss": 2.7687, + "num_input_tokens_seen": 5620367360, + "step": 10720 + }, + { + "epoch": 0.520122938149017, + "grad_norm": 0.2451171875, + "learning_rate": 2.540292561099722e-05, + "loss": 2.7716, + "num_input_tokens_seen": 5622988800, + "step": 10725 + }, + { + "epoch": 0.5203654197052636, + "grad_norm": 0.25390625, + "learning_rate": 2.5382881169195982e-05, + "loss": 2.7808, + "num_input_tokens_seen": 5625610240, + "step": 10730 + }, + { + "epoch": 0.5206079012615102, + "grad_norm": 0.2470703125, + "learning_rate": 2.5362836481199752e-05, + "loss": 2.7769, + "num_input_tokens_seen": 5628231680, + "step": 10735 + }, + { + "epoch": 0.520850382817757, + "grad_norm": 0.251953125, + "learning_rate": 2.53427915598974e-05, + "loss": 2.7741, + "num_input_tokens_seen": 5630853120, + "step": 10740 + }, + { + "epoch": 0.5210928643740036, + "grad_norm": 0.251953125, + "learning_rate": 2.532274641817793e-05, + "loss": 2.777, + "num_input_tokens_seen": 5633474560, + "step": 10745 + }, + { + "epoch": 0.5213353459302502, + "grad_norm": 0.2490234375, + "learning_rate": 2.530270106893049e-05, + "loss": 2.7766, + "num_input_tokens_seen": 5636096000, + "step": 10750 + }, + { + "epoch": 0.5215778274864968, + "grad_norm": 0.24609375, + "learning_rate": 2.5282655525044375e-05, + "loss": 2.771, + "num_input_tokens_seen": 5638717440, + "step": 10755 + }, + { + "epoch": 0.5218203090427435, + "grad_norm": 0.255859375, + "learning_rate": 2.5262609799408983e-05, + "loss": 2.7734, + "num_input_tokens_seen": 5641338880, + "step": 10760 + }, + { + "epoch": 0.5220627905989901, + "grad_norm": 0.2431640625, + "learning_rate": 2.5242563904913846e-05, + "loss": 2.7722, + "num_input_tokens_seen": 5643960320, + "step": 10765 + }, + { + "epoch": 0.5223052721552367, + "grad_norm": 0.25, + "learning_rate": 2.5222517854448603e-05, + "loss": 2.7821, + "num_input_tokens_seen": 5646581760, + "step": 10770 + }, + { + "epoch": 0.5225477537114833, + "grad_norm": 0.255859375, + "learning_rate": 2.5202471660902992e-05, + "loss": 2.7856, + "num_input_tokens_seen": 5649203200, + "step": 10775 + }, + { + "epoch": 0.5227902352677299, + "grad_norm": 0.244140625, + "learning_rate": 2.518242533716683e-05, + "loss": 2.7788, + "num_input_tokens_seen": 5651824640, + "step": 10780 + }, + { + "epoch": 0.5230327168239766, + "grad_norm": 0.2578125, + "learning_rate": 2.516237889613004e-05, + "loss": 2.7902, + "num_input_tokens_seen": 5654446080, + "step": 10785 + }, + { + "epoch": 0.5232751983802232, + "grad_norm": 0.25390625, + "learning_rate": 2.51423323506826e-05, + "loss": 2.7657, + "num_input_tokens_seen": 5657067520, + "step": 10790 + }, + { + "epoch": 0.5235176799364698, + "grad_norm": 0.2451171875, + "learning_rate": 2.5122285713714573e-05, + "loss": 2.7771, + "num_input_tokens_seen": 5659688960, + "step": 10795 + }, + { + "epoch": 0.5237601614927164, + "grad_norm": 0.2451171875, + "learning_rate": 2.510223899811606e-05, + "loss": 2.7757, + "num_input_tokens_seen": 5662310400, + "step": 10800 + }, + { + "epoch": 0.5237601614927164, + "eval_accuracy": 0.4557319654779352, + "eval_loss": 2.74280047416687, + "eval_runtime": 6.1505, + "eval_samples_per_second": 48.777, + "eval_steps_per_second": 6.178, + "num_input_tokens_seen": 5662310400, + "step": 10800 + }, + { + "epoch": 0.524002643048963, + "grad_norm": 0.2412109375, + "learning_rate": 2.5082192216777232e-05, + "loss": 2.8038, + "num_input_tokens_seen": 5664931840, + "step": 10805 + }, + { + "epoch": 0.5242451246052097, + "grad_norm": 0.251953125, + "learning_rate": 2.5062145382588304e-05, + "loss": 2.7682, + "num_input_tokens_seen": 5667553280, + "step": 10810 + }, + { + "epoch": 0.5244876061614564, + "grad_norm": 0.255859375, + "learning_rate": 2.50420985084395e-05, + "loss": 2.781, + "num_input_tokens_seen": 5670174720, + "step": 10815 + }, + { + "epoch": 0.524730087717703, + "grad_norm": 0.2451171875, + "learning_rate": 2.5022051607221086e-05, + "loss": 2.7693, + "num_input_tokens_seen": 5672796160, + "step": 10820 + }, + { + "epoch": 0.5249725692739496, + "grad_norm": 0.255859375, + "learning_rate": 2.500200469182336e-05, + "loss": 2.778, + "num_input_tokens_seen": 5675417600, + "step": 10825 + }, + { + "epoch": 0.5252150508301963, + "grad_norm": 0.263671875, + "learning_rate": 2.4981957775136602e-05, + "loss": 2.7754, + "num_input_tokens_seen": 5678039040, + "step": 10830 + }, + { + "epoch": 0.5254575323864429, + "grad_norm": 0.2470703125, + "learning_rate": 2.4961910870051105e-05, + "loss": 2.7791, + "num_input_tokens_seen": 5680660480, + "step": 10835 + }, + { + "epoch": 0.5257000139426895, + "grad_norm": 0.2578125, + "learning_rate": 2.4941863989457158e-05, + "loss": 2.7608, + "num_input_tokens_seen": 5683281920, + "step": 10840 + }, + { + "epoch": 0.5259424954989361, + "grad_norm": 0.2431640625, + "learning_rate": 2.4921817146245035e-05, + "loss": 2.7732, + "num_input_tokens_seen": 5685903360, + "step": 10845 + }, + { + "epoch": 0.5261849770551827, + "grad_norm": 0.248046875, + "learning_rate": 2.490177035330497e-05, + "loss": 2.7663, + "num_input_tokens_seen": 5688524800, + "step": 10850 + }, + { + "epoch": 0.5264274586114294, + "grad_norm": 0.25, + "learning_rate": 2.4881723623527182e-05, + "loss": 2.7741, + "num_input_tokens_seen": 5691146240, + "step": 10855 + }, + { + "epoch": 0.526669940167676, + "grad_norm": 0.25390625, + "learning_rate": 2.4861676969801846e-05, + "loss": 2.7632, + "num_input_tokens_seen": 5693767680, + "step": 10860 + }, + { + "epoch": 0.5269124217239226, + "grad_norm": 0.2470703125, + "learning_rate": 2.484163040501908e-05, + "loss": 2.7633, + "num_input_tokens_seen": 5696389120, + "step": 10865 + }, + { + "epoch": 0.5271549032801692, + "grad_norm": 0.2451171875, + "learning_rate": 2.4821583942068956e-05, + "loss": 2.7789, + "num_input_tokens_seen": 5699010560, + "step": 10870 + }, + { + "epoch": 0.5273973848364159, + "grad_norm": 0.2451171875, + "learning_rate": 2.4801537593841475e-05, + "loss": 2.7693, + "num_input_tokens_seen": 5701632000, + "step": 10875 + }, + { + "epoch": 0.5276398663926625, + "grad_norm": 0.25, + "learning_rate": 2.4781491373226568e-05, + "loss": 2.7647, + "num_input_tokens_seen": 5704253440, + "step": 10880 + }, + { + "epoch": 0.5278823479489091, + "grad_norm": 0.255859375, + "learning_rate": 2.476144529311407e-05, + "loss": 2.7807, + "num_input_tokens_seen": 5706874880, + "step": 10885 + }, + { + "epoch": 0.5281248295051557, + "grad_norm": 0.24609375, + "learning_rate": 2.4741399366393738e-05, + "loss": 2.7813, + "num_input_tokens_seen": 5709496320, + "step": 10890 + }, + { + "epoch": 0.5283673110614024, + "grad_norm": 0.251953125, + "learning_rate": 2.4721353605955232e-05, + "loss": 2.767, + "num_input_tokens_seen": 5712117760, + "step": 10895 + }, + { + "epoch": 0.5286097926176491, + "grad_norm": 0.251953125, + "learning_rate": 2.4701308024688102e-05, + "loss": 2.7709, + "num_input_tokens_seen": 5714739200, + "step": 10900 + }, + { + "epoch": 0.5288522741738957, + "grad_norm": 0.25390625, + "learning_rate": 2.468126263548178e-05, + "loss": 2.7725, + "num_input_tokens_seen": 5717360640, + "step": 10905 + }, + { + "epoch": 0.5290947557301423, + "grad_norm": 0.2451171875, + "learning_rate": 2.4661217451225585e-05, + "loss": 2.7726, + "num_input_tokens_seen": 5719982080, + "step": 10910 + }, + { + "epoch": 0.5293372372863889, + "grad_norm": 0.2412109375, + "learning_rate": 2.464117248480868e-05, + "loss": 2.7946, + "num_input_tokens_seen": 5722603520, + "step": 10915 + }, + { + "epoch": 0.5295797188426355, + "grad_norm": 0.2470703125, + "learning_rate": 2.4621127749120114e-05, + "loss": 2.781, + "num_input_tokens_seen": 5725224960, + "step": 10920 + }, + { + "epoch": 0.5298222003988822, + "grad_norm": 0.25, + "learning_rate": 2.4601083257048774e-05, + "loss": 2.7655, + "num_input_tokens_seen": 5727846400, + "step": 10925 + }, + { + "epoch": 0.5300646819551288, + "grad_norm": 0.2412109375, + "learning_rate": 2.4581039021483396e-05, + "loss": 2.7875, + "num_input_tokens_seen": 5730467840, + "step": 10930 + }, + { + "epoch": 0.5303071635113754, + "grad_norm": 0.240234375, + "learning_rate": 2.4560995055312546e-05, + "loss": 2.7598, + "num_input_tokens_seen": 5733089280, + "step": 10935 + }, + { + "epoch": 0.530549645067622, + "grad_norm": 0.251953125, + "learning_rate": 2.4540951371424632e-05, + "loss": 2.7835, + "num_input_tokens_seen": 5735710720, + "step": 10940 + }, + { + "epoch": 0.5307921266238687, + "grad_norm": 0.2490234375, + "learning_rate": 2.452090798270785e-05, + "loss": 2.7637, + "num_input_tokens_seen": 5738332160, + "step": 10945 + }, + { + "epoch": 0.5310346081801153, + "grad_norm": 0.251953125, + "learning_rate": 2.450086490205023e-05, + "loss": 2.7786, + "num_input_tokens_seen": 5740953600, + "step": 10950 + }, + { + "epoch": 0.5312770897363619, + "grad_norm": 0.251953125, + "learning_rate": 2.4480822142339606e-05, + "loss": 2.7929, + "num_input_tokens_seen": 5743575040, + "step": 10955 + }, + { + "epoch": 0.5315195712926085, + "grad_norm": 0.2421875, + "learning_rate": 2.4460779716463585e-05, + "loss": 2.7764, + "num_input_tokens_seen": 5746196480, + "step": 10960 + }, + { + "epoch": 0.5317620528488551, + "grad_norm": 0.2451171875, + "learning_rate": 2.444073763730958e-05, + "loss": 2.771, + "num_input_tokens_seen": 5748817920, + "step": 10965 + }, + { + "epoch": 0.5320045344051018, + "grad_norm": 0.244140625, + "learning_rate": 2.4420695917764787e-05, + "loss": 2.7694, + "num_input_tokens_seen": 5751439360, + "step": 10970 + }, + { + "epoch": 0.5322470159613485, + "grad_norm": 0.244140625, + "learning_rate": 2.4400654570716132e-05, + "loss": 2.79, + "num_input_tokens_seen": 5754060800, + "step": 10975 + }, + { + "epoch": 0.5324894975175951, + "grad_norm": 0.25, + "learning_rate": 2.438061360905034e-05, + "loss": 2.7789, + "num_input_tokens_seen": 5756682240, + "step": 10980 + }, + { + "epoch": 0.5327319790738417, + "grad_norm": 0.2431640625, + "learning_rate": 2.436057304565387e-05, + "loss": 2.7824, + "num_input_tokens_seen": 5759303680, + "step": 10985 + }, + { + "epoch": 0.5329744606300884, + "grad_norm": 0.248046875, + "learning_rate": 2.4340532893412927e-05, + "loss": 2.767, + "num_input_tokens_seen": 5761925120, + "step": 10990 + }, + { + "epoch": 0.533216942186335, + "grad_norm": 0.2490234375, + "learning_rate": 2.4320493165213464e-05, + "loss": 2.7737, + "num_input_tokens_seen": 5764546560, + "step": 10995 + }, + { + "epoch": 0.5334594237425816, + "grad_norm": 0.2451171875, + "learning_rate": 2.4300453873941158e-05, + "loss": 2.7691, + "num_input_tokens_seen": 5767168000, + "step": 11000 + }, + { + "epoch": 0.5337019052988282, + "grad_norm": 0.2451171875, + "learning_rate": 2.428041503248138e-05, + "loss": 2.7648, + "num_input_tokens_seen": 5769789440, + "step": 11005 + }, + { + "epoch": 0.5339443868550748, + "grad_norm": 0.255859375, + "learning_rate": 2.4260376653719232e-05, + "loss": 2.7915, + "num_input_tokens_seen": 5772410880, + "step": 11010 + }, + { + "epoch": 0.5341868684113215, + "grad_norm": 0.2431640625, + "learning_rate": 2.4240338750539526e-05, + "loss": 2.7746, + "num_input_tokens_seen": 5775032320, + "step": 11015 + }, + { + "epoch": 0.5344293499675681, + "grad_norm": 0.2470703125, + "learning_rate": 2.422030133582675e-05, + "loss": 2.7697, + "num_input_tokens_seen": 5777653760, + "step": 11020 + }, + { + "epoch": 0.5346718315238147, + "grad_norm": 0.2431640625, + "learning_rate": 2.4200264422465097e-05, + "loss": 2.7823, + "num_input_tokens_seen": 5780275200, + "step": 11025 + }, + { + "epoch": 0.5349143130800613, + "grad_norm": 0.2490234375, + "learning_rate": 2.4180228023338423e-05, + "loss": 2.77, + "num_input_tokens_seen": 5782896640, + "step": 11030 + }, + { + "epoch": 0.5351567946363079, + "grad_norm": 0.251953125, + "learning_rate": 2.4160192151330274e-05, + "loss": 2.7643, + "num_input_tokens_seen": 5785518080, + "step": 11035 + }, + { + "epoch": 0.5353992761925546, + "grad_norm": 0.240234375, + "learning_rate": 2.4140156819323812e-05, + "loss": 2.7711, + "num_input_tokens_seen": 5788139520, + "step": 11040 + }, + { + "epoch": 0.5356417577488012, + "grad_norm": 0.2470703125, + "learning_rate": 2.4120122040201888e-05, + "loss": 2.7834, + "num_input_tokens_seen": 5790760960, + "step": 11045 + }, + { + "epoch": 0.5358842393050478, + "grad_norm": 0.2412109375, + "learning_rate": 2.4100087826847e-05, + "loss": 2.7871, + "num_input_tokens_seen": 5793382400, + "step": 11050 + }, + { + "epoch": 0.5361267208612945, + "grad_norm": 0.26171875, + "learning_rate": 2.4080054192141273e-05, + "loss": 2.7802, + "num_input_tokens_seen": 5796003840, + "step": 11055 + }, + { + "epoch": 0.5363692024175412, + "grad_norm": 0.2431640625, + "learning_rate": 2.4060021148966455e-05, + "loss": 2.7675, + "num_input_tokens_seen": 5798625280, + "step": 11060 + }, + { + "epoch": 0.5366116839737878, + "grad_norm": 0.2421875, + "learning_rate": 2.4039988710203927e-05, + "loss": 2.7878, + "num_input_tokens_seen": 5801246720, + "step": 11065 + }, + { + "epoch": 0.5368541655300344, + "grad_norm": 0.244140625, + "learning_rate": 2.4019956888734653e-05, + "loss": 2.7701, + "num_input_tokens_seen": 5803868160, + "step": 11070 + }, + { + "epoch": 0.537096647086281, + "grad_norm": 0.2412109375, + "learning_rate": 2.399992569743923e-05, + "loss": 2.7742, + "num_input_tokens_seen": 5806489600, + "step": 11075 + }, + { + "epoch": 0.5373391286425276, + "grad_norm": 0.244140625, + "learning_rate": 2.3979895149197844e-05, + "loss": 2.7734, + "num_input_tokens_seen": 5809111040, + "step": 11080 + }, + { + "epoch": 0.5375816101987743, + "grad_norm": 0.2490234375, + "learning_rate": 2.3959865256890258e-05, + "loss": 2.7798, + "num_input_tokens_seen": 5811732480, + "step": 11085 + }, + { + "epoch": 0.5378240917550209, + "grad_norm": 0.244140625, + "learning_rate": 2.3939836033395818e-05, + "loss": 2.7678, + "num_input_tokens_seen": 5814353920, + "step": 11090 + }, + { + "epoch": 0.5380665733112675, + "grad_norm": 0.25390625, + "learning_rate": 2.391980749159345e-05, + "loss": 2.7885, + "num_input_tokens_seen": 5816975360, + "step": 11095 + }, + { + "epoch": 0.5383090548675141, + "grad_norm": 0.251953125, + "learning_rate": 2.3899779644361612e-05, + "loss": 2.779, + "num_input_tokens_seen": 5819596800, + "step": 11100 + }, + { + "epoch": 0.5383090548675141, + "eval_accuracy": 0.4559078326005537, + "eval_loss": 2.7425777912139893, + "eval_runtime": 6.3102, + "eval_samples_per_second": 47.542, + "eval_steps_per_second": 6.022, + "num_input_tokens_seen": 5819596800, + "step": 11100 + }, + { + "epoch": 0.5385515364237607, + "grad_norm": 0.2412109375, + "learning_rate": 2.3879752504578347e-05, + "loss": 2.7573, + "num_input_tokens_seen": 5822218240, + "step": 11105 + }, + { + "epoch": 0.5387940179800074, + "grad_norm": 0.2578125, + "learning_rate": 2.385972608512123e-05, + "loss": 2.781, + "num_input_tokens_seen": 5824839680, + "step": 11110 + }, + { + "epoch": 0.539036499536254, + "grad_norm": 0.244140625, + "learning_rate": 2.3839700398867372e-05, + "loss": 2.7713, + "num_input_tokens_seen": 5827461120, + "step": 11115 + }, + { + "epoch": 0.5392789810925006, + "grad_norm": 0.248046875, + "learning_rate": 2.3819675458693423e-05, + "loss": 2.7705, + "num_input_tokens_seen": 5830082560, + "step": 11120 + }, + { + "epoch": 0.5395214626487472, + "grad_norm": 0.26171875, + "learning_rate": 2.3799651277475537e-05, + "loss": 2.7725, + "num_input_tokens_seen": 5832704000, + "step": 11125 + }, + { + "epoch": 0.539763944204994, + "grad_norm": 0.2470703125, + "learning_rate": 2.3779627868089386e-05, + "loss": 2.7977, + "num_input_tokens_seen": 5835325440, + "step": 11130 + }, + { + "epoch": 0.5400064257612406, + "grad_norm": 0.25390625, + "learning_rate": 2.375960524341015e-05, + "loss": 2.7826, + "num_input_tokens_seen": 5837946880, + "step": 11135 + }, + { + "epoch": 0.5402489073174872, + "grad_norm": 0.248046875, + "learning_rate": 2.37395834163125e-05, + "loss": 2.7648, + "num_input_tokens_seen": 5840568320, + "step": 11140 + }, + { + "epoch": 0.5404913888737338, + "grad_norm": 0.2392578125, + "learning_rate": 2.3719562399670604e-05, + "loss": 2.7645, + "num_input_tokens_seen": 5843189760, + "step": 11145 + }, + { + "epoch": 0.5407338704299804, + "grad_norm": 0.25, + "learning_rate": 2.369954220635809e-05, + "loss": 2.7695, + "num_input_tokens_seen": 5845811200, + "step": 11150 + }, + { + "epoch": 0.5409763519862271, + "grad_norm": 0.2490234375, + "learning_rate": 2.367952284924808e-05, + "loss": 2.7828, + "num_input_tokens_seen": 5848432640, + "step": 11155 + }, + { + "epoch": 0.5412188335424737, + "grad_norm": 0.24609375, + "learning_rate": 2.365950434121314e-05, + "loss": 2.7753, + "num_input_tokens_seen": 5851054080, + "step": 11160 + }, + { + "epoch": 0.5414613150987203, + "grad_norm": 0.2451171875, + "learning_rate": 2.3639486695125284e-05, + "loss": 2.7763, + "num_input_tokens_seen": 5853675520, + "step": 11165 + }, + { + "epoch": 0.5417037966549669, + "grad_norm": 0.25, + "learning_rate": 2.3619469923856e-05, + "loss": 2.7781, + "num_input_tokens_seen": 5856296960, + "step": 11170 + }, + { + "epoch": 0.5419462782112136, + "grad_norm": 0.2412109375, + "learning_rate": 2.359945404027619e-05, + "loss": 2.7736, + "num_input_tokens_seen": 5858918400, + "step": 11175 + }, + { + "epoch": 0.5421887597674602, + "grad_norm": 0.23828125, + "learning_rate": 2.3579439057256198e-05, + "loss": 2.784, + "num_input_tokens_seen": 5861539840, + "step": 11180 + }, + { + "epoch": 0.5424312413237068, + "grad_norm": 0.2470703125, + "learning_rate": 2.355942498766578e-05, + "loss": 2.7725, + "num_input_tokens_seen": 5864161280, + "step": 11185 + }, + { + "epoch": 0.5426737228799534, + "grad_norm": 0.240234375, + "learning_rate": 2.3539411844374104e-05, + "loss": 2.7762, + "num_input_tokens_seen": 5866782720, + "step": 11190 + }, + { + "epoch": 0.5429162044362, + "grad_norm": 0.2490234375, + "learning_rate": 2.351939964024975e-05, + "loss": 2.7915, + "num_input_tokens_seen": 5869404160, + "step": 11195 + }, + { + "epoch": 0.5431586859924467, + "grad_norm": 0.240234375, + "learning_rate": 2.3499388388160686e-05, + "loss": 2.7741, + "num_input_tokens_seen": 5872025600, + "step": 11200 + }, + { + "epoch": 0.5434011675486933, + "grad_norm": 0.255859375, + "learning_rate": 2.3479378100974277e-05, + "loss": 2.7715, + "num_input_tokens_seen": 5874647040, + "step": 11205 + }, + { + "epoch": 0.54364364910494, + "grad_norm": 0.255859375, + "learning_rate": 2.3459368791557267e-05, + "loss": 2.7739, + "num_input_tokens_seen": 5877268480, + "step": 11210 + }, + { + "epoch": 0.5438861306611866, + "grad_norm": 0.255859375, + "learning_rate": 2.3439360472775758e-05, + "loss": 2.7784, + "num_input_tokens_seen": 5879889920, + "step": 11215 + }, + { + "epoch": 0.5441286122174332, + "grad_norm": 0.2412109375, + "learning_rate": 2.3419353157495236e-05, + "loss": 2.7808, + "num_input_tokens_seen": 5882511360, + "step": 11220 + }, + { + "epoch": 0.5443710937736799, + "grad_norm": 0.2470703125, + "learning_rate": 2.3399346858580514e-05, + "loss": 2.762, + "num_input_tokens_seen": 5885132800, + "step": 11225 + }, + { + "epoch": 0.5446135753299265, + "grad_norm": 0.251953125, + "learning_rate": 2.3379341588895778e-05, + "loss": 2.7911, + "num_input_tokens_seen": 5887754240, + "step": 11230 + }, + { + "epoch": 0.5448560568861731, + "grad_norm": 0.251953125, + "learning_rate": 2.3359337361304536e-05, + "loss": 2.7616, + "num_input_tokens_seen": 5890375680, + "step": 11235 + }, + { + "epoch": 0.5450985384424197, + "grad_norm": 0.25, + "learning_rate": 2.3339334188669638e-05, + "loss": 2.7732, + "num_input_tokens_seen": 5892997120, + "step": 11240 + }, + { + "epoch": 0.5453410199986664, + "grad_norm": 0.25390625, + "learning_rate": 2.3319332083853246e-05, + "loss": 2.7783, + "num_input_tokens_seen": 5895618560, + "step": 11245 + }, + { + "epoch": 0.545583501554913, + "grad_norm": 0.248046875, + "learning_rate": 2.3299331059716843e-05, + "loss": 2.7717, + "num_input_tokens_seen": 5898240000, + "step": 11250 + }, + { + "epoch": 0.5458259831111596, + "grad_norm": 0.25390625, + "learning_rate": 2.3279331129121202e-05, + "loss": 2.7611, + "num_input_tokens_seen": 5900861440, + "step": 11255 + }, + { + "epoch": 0.5460684646674062, + "grad_norm": 0.2431640625, + "learning_rate": 2.325933230492641e-05, + "loss": 2.7695, + "num_input_tokens_seen": 5903482880, + "step": 11260 + }, + { + "epoch": 0.5463109462236528, + "grad_norm": 0.2470703125, + "learning_rate": 2.3239334599991833e-05, + "loss": 2.7831, + "num_input_tokens_seen": 5906104320, + "step": 11265 + }, + { + "epoch": 0.5465534277798995, + "grad_norm": 0.251953125, + "learning_rate": 2.3219338027176124e-05, + "loss": 2.7846, + "num_input_tokens_seen": 5908725760, + "step": 11270 + }, + { + "epoch": 0.5467959093361461, + "grad_norm": 0.24609375, + "learning_rate": 2.31993425993372e-05, + "loss": 2.7985, + "num_input_tokens_seen": 5911347200, + "step": 11275 + }, + { + "epoch": 0.5470383908923927, + "grad_norm": 0.25, + "learning_rate": 2.3179348329332266e-05, + "loss": 2.7801, + "num_input_tokens_seen": 5913968640, + "step": 11280 + }, + { + "epoch": 0.5472808724486393, + "grad_norm": 0.2470703125, + "learning_rate": 2.315935523001773e-05, + "loss": 2.7719, + "num_input_tokens_seen": 5916590080, + "step": 11285 + }, + { + "epoch": 0.547523354004886, + "grad_norm": 0.248046875, + "learning_rate": 2.3139363314249304e-05, + "loss": 2.7801, + "num_input_tokens_seen": 5919211520, + "step": 11290 + }, + { + "epoch": 0.5477658355611327, + "grad_norm": 0.2451171875, + "learning_rate": 2.3119372594881902e-05, + "loss": 2.7853, + "num_input_tokens_seen": 5921832960, + "step": 11295 + }, + { + "epoch": 0.5480083171173793, + "grad_norm": 0.2451171875, + "learning_rate": 2.3099383084769686e-05, + "loss": 2.7669, + "num_input_tokens_seen": 5924454400, + "step": 11300 + }, + { + "epoch": 0.5482507986736259, + "grad_norm": 0.25, + "learning_rate": 2.3079394796766036e-05, + "loss": 2.786, + "num_input_tokens_seen": 5927075840, + "step": 11305 + }, + { + "epoch": 0.5484932802298725, + "grad_norm": 0.2490234375, + "learning_rate": 2.305940774372356e-05, + "loss": 2.7816, + "num_input_tokens_seen": 5929697280, + "step": 11310 + }, + { + "epoch": 0.5487357617861192, + "grad_norm": 0.24609375, + "learning_rate": 2.3039421938494036e-05, + "loss": 2.7675, + "num_input_tokens_seen": 5932318720, + "step": 11315 + }, + { + "epoch": 0.5489782433423658, + "grad_norm": 0.2412109375, + "learning_rate": 2.3019437393928467e-05, + "loss": 2.7605, + "num_input_tokens_seen": 5934940160, + "step": 11320 + }, + { + "epoch": 0.5492207248986124, + "grad_norm": 0.2490234375, + "learning_rate": 2.2999454122877047e-05, + "loss": 2.7724, + "num_input_tokens_seen": 5937561600, + "step": 11325 + }, + { + "epoch": 0.549463206454859, + "grad_norm": 0.25, + "learning_rate": 2.297947213818914e-05, + "loss": 2.7615, + "num_input_tokens_seen": 5940183040, + "step": 11330 + }, + { + "epoch": 0.5497056880111056, + "grad_norm": 0.2470703125, + "learning_rate": 2.2959491452713287e-05, + "loss": 2.7897, + "num_input_tokens_seen": 5942804480, + "step": 11335 + }, + { + "epoch": 0.5499481695673523, + "grad_norm": 0.25390625, + "learning_rate": 2.2939512079297208e-05, + "loss": 2.7792, + "num_input_tokens_seen": 5945425920, + "step": 11340 + }, + { + "epoch": 0.5501906511235989, + "grad_norm": 0.240234375, + "learning_rate": 2.2919534030787743e-05, + "loss": 2.7748, + "num_input_tokens_seen": 5948047360, + "step": 11345 + }, + { + "epoch": 0.5504331326798455, + "grad_norm": 0.2451171875, + "learning_rate": 2.289955732003091e-05, + "loss": 2.765, + "num_input_tokens_seen": 5950668800, + "step": 11350 + }, + { + "epoch": 0.5506756142360921, + "grad_norm": 0.25390625, + "learning_rate": 2.2879581959871856e-05, + "loss": 2.7712, + "num_input_tokens_seen": 5953290240, + "step": 11355 + }, + { + "epoch": 0.5509180957923387, + "grad_norm": 0.2470703125, + "learning_rate": 2.2859607963154872e-05, + "loss": 2.769, + "num_input_tokens_seen": 5955911680, + "step": 11360 + }, + { + "epoch": 0.5511605773485855, + "grad_norm": 0.2412109375, + "learning_rate": 2.283963534272336e-05, + "loss": 2.7719, + "num_input_tokens_seen": 5958533120, + "step": 11365 + }, + { + "epoch": 0.5514030589048321, + "grad_norm": 0.2392578125, + "learning_rate": 2.2819664111419835e-05, + "loss": 2.7636, + "num_input_tokens_seen": 5961154560, + "step": 11370 + }, + { + "epoch": 0.5516455404610787, + "grad_norm": 0.23828125, + "learning_rate": 2.2799694282085937e-05, + "loss": 2.7702, + "num_input_tokens_seen": 5963776000, + "step": 11375 + }, + { + "epoch": 0.5518880220173253, + "grad_norm": 0.2431640625, + "learning_rate": 2.2779725867562373e-05, + "loss": 2.7808, + "num_input_tokens_seen": 5966397440, + "step": 11380 + }, + { + "epoch": 0.552130503573572, + "grad_norm": 0.244140625, + "learning_rate": 2.2759758880688958e-05, + "loss": 2.765, + "num_input_tokens_seen": 5969018880, + "step": 11385 + }, + { + "epoch": 0.5523729851298186, + "grad_norm": 0.2421875, + "learning_rate": 2.2739793334304605e-05, + "loss": 2.7826, + "num_input_tokens_seen": 5971640320, + "step": 11390 + }, + { + "epoch": 0.5526154666860652, + "grad_norm": 0.2470703125, + "learning_rate": 2.2719829241247277e-05, + "loss": 2.7833, + "num_input_tokens_seen": 5974261760, + "step": 11395 + }, + { + "epoch": 0.5528579482423118, + "grad_norm": 0.2490234375, + "learning_rate": 2.2699866614354013e-05, + "loss": 2.7771, + "num_input_tokens_seen": 5976883200, + "step": 11400 + }, + { + "epoch": 0.5528579482423118, + "eval_accuracy": 0.45590946099983715, + "eval_loss": 2.742483377456665, + "eval_runtime": 5.8696, + "eval_samples_per_second": 51.111, + "eval_steps_per_second": 6.474, + "num_input_tokens_seen": 5976883200, + "step": 11400 + }, + { + "epoch": 0.5531004297985584, + "grad_norm": 0.24609375, + "learning_rate": 2.2679905466460915e-05, + "loss": 2.7636, + "num_input_tokens_seen": 5979504640, + "step": 11405 + }, + { + "epoch": 0.5533429113548051, + "grad_norm": 0.25, + "learning_rate": 2.26599458104031e-05, + "loss": 2.7702, + "num_input_tokens_seen": 5982126080, + "step": 11410 + }, + { + "epoch": 0.5535853929110517, + "grad_norm": 0.2451171875, + "learning_rate": 2.2639987659014775e-05, + "loss": 2.7812, + "num_input_tokens_seen": 5984747520, + "step": 11415 + }, + { + "epoch": 0.5538278744672983, + "grad_norm": 0.251953125, + "learning_rate": 2.2620031025129145e-05, + "loss": 2.7808, + "num_input_tokens_seen": 5987368960, + "step": 11420 + }, + { + "epoch": 0.5540703560235449, + "grad_norm": 0.24609375, + "learning_rate": 2.2600075921578463e-05, + "loss": 2.7771, + "num_input_tokens_seen": 5989990400, + "step": 11425 + }, + { + "epoch": 0.5543128375797916, + "grad_norm": 0.2490234375, + "learning_rate": 2.258012236119397e-05, + "loss": 2.7849, + "num_input_tokens_seen": 5992611840, + "step": 11430 + }, + { + "epoch": 0.5545553191360382, + "grad_norm": 0.244140625, + "learning_rate": 2.2560170356805947e-05, + "loss": 2.7683, + "num_input_tokens_seen": 5995233280, + "step": 11435 + }, + { + "epoch": 0.5547978006922848, + "grad_norm": 0.2392578125, + "learning_rate": 2.2540219921243642e-05, + "loss": 2.7702, + "num_input_tokens_seen": 5997854720, + "step": 11440 + }, + { + "epoch": 0.5550402822485315, + "grad_norm": 0.2470703125, + "learning_rate": 2.2520271067335314e-05, + "loss": 2.7836, + "num_input_tokens_seen": 6000476160, + "step": 11445 + }, + { + "epoch": 0.5552827638047781, + "grad_norm": 0.25390625, + "learning_rate": 2.2500323807908206e-05, + "loss": 2.7707, + "num_input_tokens_seen": 6003097600, + "step": 11450 + }, + { + "epoch": 0.5555252453610248, + "grad_norm": 0.25390625, + "learning_rate": 2.2480378155788525e-05, + "loss": 2.7719, + "num_input_tokens_seen": 6005719040, + "step": 11455 + }, + { + "epoch": 0.5557677269172714, + "grad_norm": 0.244140625, + "learning_rate": 2.2460434123801454e-05, + "loss": 2.7837, + "num_input_tokens_seen": 6008340480, + "step": 11460 + }, + { + "epoch": 0.556010208473518, + "grad_norm": 0.25, + "learning_rate": 2.2440491724771133e-05, + "loss": 2.7644, + "num_input_tokens_seen": 6010961920, + "step": 11465 + }, + { + "epoch": 0.5562526900297646, + "grad_norm": 0.24609375, + "learning_rate": 2.242055097152064e-05, + "loss": 2.7792, + "num_input_tokens_seen": 6013583360, + "step": 11470 + }, + { + "epoch": 0.5564951715860113, + "grad_norm": 0.251953125, + "learning_rate": 2.2400611876872007e-05, + "loss": 2.7749, + "num_input_tokens_seen": 6016204800, + "step": 11475 + }, + { + "epoch": 0.5567376531422579, + "grad_norm": 0.2470703125, + "learning_rate": 2.2380674453646204e-05, + "loss": 2.774, + "num_input_tokens_seen": 6018826240, + "step": 11480 + }, + { + "epoch": 0.5569801346985045, + "grad_norm": 0.255859375, + "learning_rate": 2.236073871466311e-05, + "loss": 2.7635, + "num_input_tokens_seen": 6021447680, + "step": 11485 + }, + { + "epoch": 0.5572226162547511, + "grad_norm": 0.25, + "learning_rate": 2.2340804672741535e-05, + "loss": 2.7605, + "num_input_tokens_seen": 6024069120, + "step": 11490 + }, + { + "epoch": 0.5574650978109977, + "grad_norm": 0.25, + "learning_rate": 2.2320872340699198e-05, + "loss": 2.7857, + "num_input_tokens_seen": 6026690560, + "step": 11495 + }, + { + "epoch": 0.5577075793672444, + "grad_norm": 0.24609375, + "learning_rate": 2.23009417313527e-05, + "loss": 2.768, + "num_input_tokens_seen": 6029312000, + "step": 11500 + }, + { + "epoch": 0.557950060923491, + "grad_norm": 0.2490234375, + "learning_rate": 2.2281012857517553e-05, + "loss": 2.7794, + "num_input_tokens_seen": 6031933440, + "step": 11505 + }, + { + "epoch": 0.5581925424797376, + "grad_norm": 0.2490234375, + "learning_rate": 2.2261085732008148e-05, + "loss": 2.7714, + "num_input_tokens_seen": 6034554880, + "step": 11510 + }, + { + "epoch": 0.5584350240359842, + "grad_norm": 0.2490234375, + "learning_rate": 2.2241160367637754e-05, + "loss": 2.7871, + "num_input_tokens_seen": 6037176320, + "step": 11515 + }, + { + "epoch": 0.5586775055922308, + "grad_norm": 0.24609375, + "learning_rate": 2.22212367772185e-05, + "loss": 2.7635, + "num_input_tokens_seen": 6039797760, + "step": 11520 + }, + { + "epoch": 0.5589199871484776, + "grad_norm": 0.251953125, + "learning_rate": 2.220131497356139e-05, + "loss": 2.7709, + "num_input_tokens_seen": 6042419200, + "step": 11525 + }, + { + "epoch": 0.5591624687047242, + "grad_norm": 0.2412109375, + "learning_rate": 2.2181394969476257e-05, + "loss": 2.7635, + "num_input_tokens_seen": 6045040640, + "step": 11530 + }, + { + "epoch": 0.5594049502609708, + "grad_norm": 0.251953125, + "learning_rate": 2.216147677777179e-05, + "loss": 2.7799, + "num_input_tokens_seen": 6047662080, + "step": 11535 + }, + { + "epoch": 0.5596474318172174, + "grad_norm": 0.24609375, + "learning_rate": 2.2141560411255515e-05, + "loss": 2.7749, + "num_input_tokens_seen": 6050283520, + "step": 11540 + }, + { + "epoch": 0.559889913373464, + "grad_norm": 0.25, + "learning_rate": 2.212164588273377e-05, + "loss": 2.7767, + "num_input_tokens_seen": 6052904960, + "step": 11545 + }, + { + "epoch": 0.5601323949297107, + "grad_norm": 0.2451171875, + "learning_rate": 2.2101733205011737e-05, + "loss": 2.793, + "num_input_tokens_seen": 6055526400, + "step": 11550 + }, + { + "epoch": 0.5603748764859573, + "grad_norm": 0.2421875, + "learning_rate": 2.2081822390893382e-05, + "loss": 2.7892, + "num_input_tokens_seen": 6058147840, + "step": 11555 + }, + { + "epoch": 0.5606173580422039, + "grad_norm": 0.2431640625, + "learning_rate": 2.2061913453181494e-05, + "loss": 2.7631, + "num_input_tokens_seen": 6060769280, + "step": 11560 + }, + { + "epoch": 0.5608598395984505, + "grad_norm": 0.2431640625, + "learning_rate": 2.2042006404677627e-05, + "loss": 2.7831, + "num_input_tokens_seen": 6063390720, + "step": 11565 + }, + { + "epoch": 0.5611023211546972, + "grad_norm": 0.251953125, + "learning_rate": 2.2022101258182147e-05, + "loss": 2.7708, + "num_input_tokens_seen": 6066012160, + "step": 11570 + }, + { + "epoch": 0.5613448027109438, + "grad_norm": 0.25390625, + "learning_rate": 2.200219802649419e-05, + "loss": 2.7789, + "num_input_tokens_seen": 6068633600, + "step": 11575 + }, + { + "epoch": 0.5615872842671904, + "grad_norm": 0.2490234375, + "learning_rate": 2.1982296722411657e-05, + "loss": 2.7754, + "num_input_tokens_seen": 6071255040, + "step": 11580 + }, + { + "epoch": 0.561829765823437, + "grad_norm": 0.251953125, + "learning_rate": 2.1962397358731206e-05, + "loss": 2.7727, + "num_input_tokens_seen": 6073876480, + "step": 11585 + }, + { + "epoch": 0.5620722473796836, + "grad_norm": 0.244140625, + "learning_rate": 2.1942499948248264e-05, + "loss": 2.7907, + "num_input_tokens_seen": 6076497920, + "step": 11590 + }, + { + "epoch": 0.5623147289359303, + "grad_norm": 0.2431640625, + "learning_rate": 2.192260450375698e-05, + "loss": 2.7701, + "num_input_tokens_seen": 6079119360, + "step": 11595 + }, + { + "epoch": 0.5625572104921769, + "grad_norm": 0.2431640625, + "learning_rate": 2.1902711038050248e-05, + "loss": 2.7711, + "num_input_tokens_seen": 6081740800, + "step": 11600 + }, + { + "epoch": 0.5627996920484236, + "grad_norm": 0.2451171875, + "learning_rate": 2.1882819563919695e-05, + "loss": 2.7881, + "num_input_tokens_seen": 6084362240, + "step": 11605 + }, + { + "epoch": 0.5630421736046702, + "grad_norm": 0.236328125, + "learning_rate": 2.1862930094155666e-05, + "loss": 2.783, + "num_input_tokens_seen": 6086983680, + "step": 11610 + }, + { + "epoch": 0.5632846551609169, + "grad_norm": 0.2431640625, + "learning_rate": 2.1843042641547205e-05, + "loss": 2.7647, + "num_input_tokens_seen": 6089605120, + "step": 11615 + }, + { + "epoch": 0.5635271367171635, + "grad_norm": 0.23828125, + "learning_rate": 2.1823157218882096e-05, + "loss": 2.7783, + "num_input_tokens_seen": 6092226560, + "step": 11620 + }, + { + "epoch": 0.5637696182734101, + "grad_norm": 0.2431640625, + "learning_rate": 2.1803273838946755e-05, + "loss": 2.7705, + "num_input_tokens_seen": 6094848000, + "step": 11625 + }, + { + "epoch": 0.5640120998296567, + "grad_norm": 0.2490234375, + "learning_rate": 2.1783392514526336e-05, + "loss": 2.7722, + "num_input_tokens_seen": 6097469440, + "step": 11630 + }, + { + "epoch": 0.5642545813859033, + "grad_norm": 0.2470703125, + "learning_rate": 2.176351325840465e-05, + "loss": 2.7664, + "num_input_tokens_seen": 6100090880, + "step": 11635 + }, + { + "epoch": 0.56449706294215, + "grad_norm": 0.2431640625, + "learning_rate": 2.174363608336418e-05, + "loss": 2.7781, + "num_input_tokens_seen": 6102712320, + "step": 11640 + }, + { + "epoch": 0.5647395444983966, + "grad_norm": 0.2431640625, + "learning_rate": 2.172376100218609e-05, + "loss": 2.7783, + "num_input_tokens_seen": 6105333760, + "step": 11645 + }, + { + "epoch": 0.5649820260546432, + "grad_norm": 0.2431640625, + "learning_rate": 2.1703888027650182e-05, + "loss": 2.7825, + "num_input_tokens_seen": 6107955200, + "step": 11650 + }, + { + "epoch": 0.5652245076108898, + "grad_norm": 0.244140625, + "learning_rate": 2.1684017172534883e-05, + "loss": 2.772, + "num_input_tokens_seen": 6110576640, + "step": 11655 + }, + { + "epoch": 0.5654669891671364, + "grad_norm": 0.25, + "learning_rate": 2.1664148449617282e-05, + "loss": 2.7866, + "num_input_tokens_seen": 6113198080, + "step": 11660 + }, + { + "epoch": 0.5657094707233831, + "grad_norm": 0.240234375, + "learning_rate": 2.16442818716731e-05, + "loss": 2.766, + "num_input_tokens_seen": 6115819520, + "step": 11665 + }, + { + "epoch": 0.5659519522796297, + "grad_norm": 0.2421875, + "learning_rate": 2.162441745147666e-05, + "loss": 2.775, + "num_input_tokens_seen": 6118440960, + "step": 11670 + }, + { + "epoch": 0.5661944338358763, + "grad_norm": 0.2412109375, + "learning_rate": 2.1604555201800924e-05, + "loss": 2.7797, + "num_input_tokens_seen": 6121062400, + "step": 11675 + }, + { + "epoch": 0.566436915392123, + "grad_norm": 0.2490234375, + "learning_rate": 2.1584695135417434e-05, + "loss": 2.7739, + "num_input_tokens_seen": 6123683840, + "step": 11680 + }, + { + "epoch": 0.5666793969483697, + "grad_norm": 0.24609375, + "learning_rate": 2.156483726509635e-05, + "loss": 2.7712, + "num_input_tokens_seen": 6126305280, + "step": 11685 + }, + { + "epoch": 0.5669218785046163, + "grad_norm": 0.244140625, + "learning_rate": 2.1544981603606384e-05, + "loss": 2.7839, + "num_input_tokens_seen": 6128926720, + "step": 11690 + }, + { + "epoch": 0.5671643600608629, + "grad_norm": 0.240234375, + "learning_rate": 2.1525128163714855e-05, + "loss": 2.7843, + "num_input_tokens_seen": 6131548160, + "step": 11695 + }, + { + "epoch": 0.5674068416171095, + "grad_norm": 0.2470703125, + "learning_rate": 2.150527695818766e-05, + "loss": 2.7828, + "num_input_tokens_seen": 6134169600, + "step": 11700 + }, + { + "epoch": 0.5674068416171095, + "eval_accuracy": 0.4560267057482495, + "eval_loss": 2.7423644065856934, + "eval_runtime": 5.8594, + "eval_samples_per_second": 51.199, + "eval_steps_per_second": 6.485, + "num_input_tokens_seen": 6134169600, + "step": 11700 + }, + { + "epoch": 0.5676493231733561, + "grad_norm": 0.24609375, + "learning_rate": 2.1485427999789247e-05, + "loss": 2.7716, + "num_input_tokens_seen": 6136791040, + "step": 11705 + }, + { + "epoch": 0.5678918047296028, + "grad_norm": 0.2470703125, + "learning_rate": 2.1465581301282617e-05, + "loss": 2.7704, + "num_input_tokens_seen": 6139412480, + "step": 11710 + }, + { + "epoch": 0.5681342862858494, + "grad_norm": 0.25, + "learning_rate": 2.144573687542933e-05, + "loss": 2.7742, + "num_input_tokens_seen": 6142033920, + "step": 11715 + }, + { + "epoch": 0.568376767842096, + "grad_norm": 0.248046875, + "learning_rate": 2.1425894734989453e-05, + "loss": 2.7573, + "num_input_tokens_seen": 6144655360, + "step": 11720 + }, + { + "epoch": 0.5686192493983426, + "grad_norm": 0.2431640625, + "learning_rate": 2.1406054892721626e-05, + "loss": 2.7731, + "num_input_tokens_seen": 6147276800, + "step": 11725 + }, + { + "epoch": 0.5688617309545893, + "grad_norm": 0.2470703125, + "learning_rate": 2.1386217361382983e-05, + "loss": 2.7861, + "num_input_tokens_seen": 6149898240, + "step": 11730 + }, + { + "epoch": 0.5691042125108359, + "grad_norm": 0.26171875, + "learning_rate": 2.136638215372919e-05, + "loss": 2.7702, + "num_input_tokens_seen": 6152519680, + "step": 11735 + }, + { + "epoch": 0.5693466940670825, + "grad_norm": 0.2451171875, + "learning_rate": 2.13465492825144e-05, + "loss": 2.7707, + "num_input_tokens_seen": 6155141120, + "step": 11740 + }, + { + "epoch": 0.5695891756233291, + "grad_norm": 0.2490234375, + "learning_rate": 2.132671876049129e-05, + "loss": 2.7842, + "num_input_tokens_seen": 6157762560, + "step": 11745 + }, + { + "epoch": 0.5698316571795757, + "grad_norm": 0.2431640625, + "learning_rate": 2.130689060041098e-05, + "loss": 2.7857, + "num_input_tokens_seen": 6160384000, + "step": 11750 + }, + { + "epoch": 0.5700741387358224, + "grad_norm": 0.2470703125, + "learning_rate": 2.1287064815023125e-05, + "loss": 2.7658, + "num_input_tokens_seen": 6163005440, + "step": 11755 + }, + { + "epoch": 0.5703166202920691, + "grad_norm": 0.2373046875, + "learning_rate": 2.126724141707582e-05, + "loss": 2.7904, + "num_input_tokens_seen": 6165626880, + "step": 11760 + }, + { + "epoch": 0.5705591018483157, + "grad_norm": 0.2470703125, + "learning_rate": 2.1247420419315638e-05, + "loss": 2.7774, + "num_input_tokens_seen": 6168248320, + "step": 11765 + }, + { + "epoch": 0.5708015834045623, + "grad_norm": 0.251953125, + "learning_rate": 2.1227601834487602e-05, + "loss": 2.765, + "num_input_tokens_seen": 6170869760, + "step": 11770 + }, + { + "epoch": 0.571044064960809, + "grad_norm": 0.2421875, + "learning_rate": 2.120778567533519e-05, + "loss": 2.7586, + "num_input_tokens_seen": 6173491200, + "step": 11775 + }, + { + "epoch": 0.5712865465170556, + "grad_norm": 0.25390625, + "learning_rate": 2.118797195460031e-05, + "loss": 2.7777, + "num_input_tokens_seen": 6176112640, + "step": 11780 + }, + { + "epoch": 0.5715290280733022, + "grad_norm": 0.2412109375, + "learning_rate": 2.116816068502331e-05, + "loss": 2.7851, + "num_input_tokens_seen": 6178734080, + "step": 11785 + }, + { + "epoch": 0.5717715096295488, + "grad_norm": 0.25, + "learning_rate": 2.114835187934296e-05, + "loss": 2.7728, + "num_input_tokens_seen": 6181355520, + "step": 11790 + }, + { + "epoch": 0.5720139911857954, + "grad_norm": 0.251953125, + "learning_rate": 2.1128545550296448e-05, + "loss": 2.7659, + "num_input_tokens_seen": 6183976960, + "step": 11795 + }, + { + "epoch": 0.572256472742042, + "grad_norm": 0.255859375, + "learning_rate": 2.1108741710619367e-05, + "loss": 2.7775, + "num_input_tokens_seen": 6186598400, + "step": 11800 + }, + { + "epoch": 0.5724989542982887, + "grad_norm": 0.2412109375, + "learning_rate": 2.1088940373045717e-05, + "loss": 2.7777, + "num_input_tokens_seen": 6189219840, + "step": 11805 + }, + { + "epoch": 0.5727414358545353, + "grad_norm": 0.2490234375, + "learning_rate": 2.106914155030787e-05, + "loss": 2.7961, + "num_input_tokens_seen": 6191841280, + "step": 11810 + }, + { + "epoch": 0.5729839174107819, + "grad_norm": 0.2431640625, + "learning_rate": 2.1049345255136595e-05, + "loss": 2.7603, + "num_input_tokens_seen": 6194462720, + "step": 11815 + }, + { + "epoch": 0.5732263989670285, + "grad_norm": 0.26171875, + "learning_rate": 2.1029551500261035e-05, + "loss": 2.7735, + "num_input_tokens_seen": 6197084160, + "step": 11820 + }, + { + "epoch": 0.5734688805232752, + "grad_norm": 0.248046875, + "learning_rate": 2.10097602984087e-05, + "loss": 2.7866, + "num_input_tokens_seen": 6199705600, + "step": 11825 + }, + { + "epoch": 0.5737113620795218, + "grad_norm": 0.24609375, + "learning_rate": 2.0989971662305458e-05, + "loss": 2.7766, + "num_input_tokens_seen": 6202327040, + "step": 11830 + }, + { + "epoch": 0.5739538436357684, + "grad_norm": 0.244140625, + "learning_rate": 2.0970185604675523e-05, + "loss": 2.7827, + "num_input_tokens_seen": 6204948480, + "step": 11835 + }, + { + "epoch": 0.5741963251920151, + "grad_norm": 0.248046875, + "learning_rate": 2.095040213824146e-05, + "loss": 2.7836, + "num_input_tokens_seen": 6207569920, + "step": 11840 + }, + { + "epoch": 0.5744388067482618, + "grad_norm": 0.2421875, + "learning_rate": 2.093062127572415e-05, + "loss": 2.7746, + "num_input_tokens_seen": 6210191360, + "step": 11845 + }, + { + "epoch": 0.5746812883045084, + "grad_norm": 0.25, + "learning_rate": 2.0910843029842818e-05, + "loss": 2.7715, + "num_input_tokens_seen": 6212812800, + "step": 11850 + }, + { + "epoch": 0.574923769860755, + "grad_norm": 0.2451171875, + "learning_rate": 2.0891067413315002e-05, + "loss": 2.7736, + "num_input_tokens_seen": 6215434240, + "step": 11855 + }, + { + "epoch": 0.5751662514170016, + "grad_norm": 0.244140625, + "learning_rate": 2.0871294438856543e-05, + "loss": 2.7608, + "num_input_tokens_seen": 6218055680, + "step": 11860 + }, + { + "epoch": 0.5754087329732482, + "grad_norm": 0.244140625, + "learning_rate": 2.0851524119181585e-05, + "loss": 2.7733, + "num_input_tokens_seen": 6220677120, + "step": 11865 + }, + { + "epoch": 0.5756512145294949, + "grad_norm": 0.2451171875, + "learning_rate": 2.083175646700258e-05, + "loss": 2.782, + "num_input_tokens_seen": 6223298560, + "step": 11870 + }, + { + "epoch": 0.5758936960857415, + "grad_norm": 0.2578125, + "learning_rate": 2.081199149503024e-05, + "loss": 2.7788, + "num_input_tokens_seen": 6225920000, + "step": 11875 + }, + { + "epoch": 0.5761361776419881, + "grad_norm": 0.251953125, + "learning_rate": 2.0792229215973567e-05, + "loss": 2.7713, + "num_input_tokens_seen": 6228541440, + "step": 11880 + }, + { + "epoch": 0.5763786591982347, + "grad_norm": 0.2421875, + "learning_rate": 2.0772469642539834e-05, + "loss": 2.7621, + "num_input_tokens_seen": 6231162880, + "step": 11885 + }, + { + "epoch": 0.5766211407544813, + "grad_norm": 0.2470703125, + "learning_rate": 2.0752712787434565e-05, + "loss": 2.7741, + "num_input_tokens_seen": 6233784320, + "step": 11890 + }, + { + "epoch": 0.576863622310728, + "grad_norm": 0.255859375, + "learning_rate": 2.0732958663361545e-05, + "loss": 2.7787, + "num_input_tokens_seen": 6236405760, + "step": 11895 + }, + { + "epoch": 0.5771061038669746, + "grad_norm": 0.2490234375, + "learning_rate": 2.0713207283022808e-05, + "loss": 2.7852, + "num_input_tokens_seen": 6239027200, + "step": 11900 + }, + { + "epoch": 0.5773485854232212, + "grad_norm": 0.244140625, + "learning_rate": 2.0693458659118596e-05, + "loss": 2.7799, + "num_input_tokens_seen": 6241648640, + "step": 11905 + }, + { + "epoch": 0.5775910669794678, + "grad_norm": 0.2470703125, + "learning_rate": 2.067371280434741e-05, + "loss": 2.7902, + "num_input_tokens_seen": 6244270080, + "step": 11910 + }, + { + "epoch": 0.5778335485357144, + "grad_norm": 0.25, + "learning_rate": 2.0653969731405954e-05, + "loss": 2.7689, + "num_input_tokens_seen": 6246891520, + "step": 11915 + }, + { + "epoch": 0.5780760300919612, + "grad_norm": 0.2470703125, + "learning_rate": 2.063422945298915e-05, + "loss": 2.7857, + "num_input_tokens_seen": 6249512960, + "step": 11920 + }, + { + "epoch": 0.5783185116482078, + "grad_norm": 0.25390625, + "learning_rate": 2.0614491981790125e-05, + "loss": 2.7829, + "num_input_tokens_seen": 6252134400, + "step": 11925 + }, + { + "epoch": 0.5785609932044544, + "grad_norm": 0.2490234375, + "learning_rate": 2.059475733050019e-05, + "loss": 2.7723, + "num_input_tokens_seen": 6254755840, + "step": 11930 + }, + { + "epoch": 0.578803474760701, + "grad_norm": 0.24609375, + "learning_rate": 2.0575025511808847e-05, + "loss": 2.7719, + "num_input_tokens_seen": 6257377280, + "step": 11935 + }, + { + "epoch": 0.5790459563169477, + "grad_norm": 0.251953125, + "learning_rate": 2.0555296538403786e-05, + "loss": 2.7708, + "num_input_tokens_seen": 6259998720, + "step": 11940 + }, + { + "epoch": 0.5792884378731943, + "grad_norm": 0.2451171875, + "learning_rate": 2.0535570422970854e-05, + "loss": 2.7837, + "num_input_tokens_seen": 6262620160, + "step": 11945 + }, + { + "epoch": 0.5795309194294409, + "grad_norm": 0.248046875, + "learning_rate": 2.051584717819407e-05, + "loss": 2.775, + "num_input_tokens_seen": 6265241600, + "step": 11950 + }, + { + "epoch": 0.5797734009856875, + "grad_norm": 0.255859375, + "learning_rate": 2.0496126816755598e-05, + "loss": 2.7863, + "num_input_tokens_seen": 6267863040, + "step": 11955 + }, + { + "epoch": 0.5800158825419341, + "grad_norm": 0.25, + "learning_rate": 2.0476409351335772e-05, + "loss": 2.7821, + "num_input_tokens_seen": 6270484480, + "step": 11960 + }, + { + "epoch": 0.5802583640981808, + "grad_norm": 0.255859375, + "learning_rate": 2.0456694794613026e-05, + "loss": 2.791, + "num_input_tokens_seen": 6273105920, + "step": 11965 + }, + { + "epoch": 0.5805008456544274, + "grad_norm": 0.25, + "learning_rate": 2.043698315926395e-05, + "loss": 2.7786, + "num_input_tokens_seen": 6275727360, + "step": 11970 + }, + { + "epoch": 0.580743327210674, + "grad_norm": 0.2392578125, + "learning_rate": 2.0417274457963248e-05, + "loss": 2.7756, + "num_input_tokens_seen": 6278348800, + "step": 11975 + }, + { + "epoch": 0.5809858087669206, + "grad_norm": 0.24609375, + "learning_rate": 2.0397568703383734e-05, + "loss": 2.793, + "num_input_tokens_seen": 6280970240, + "step": 11980 + }, + { + "epoch": 0.5812282903231673, + "grad_norm": 0.248046875, + "learning_rate": 2.037786590819634e-05, + "loss": 2.7853, + "num_input_tokens_seen": 6283591680, + "step": 11985 + }, + { + "epoch": 0.5814707718794139, + "grad_norm": 0.251953125, + "learning_rate": 2.035816608507008e-05, + "loss": 2.7757, + "num_input_tokens_seen": 6286213120, + "step": 11990 + }, + { + "epoch": 0.5817132534356606, + "grad_norm": 0.24609375, + "learning_rate": 2.0338469246672085e-05, + "loss": 2.7765, + "num_input_tokens_seen": 6288834560, + "step": 11995 + }, + { + "epoch": 0.5819557349919072, + "grad_norm": 0.2451171875, + "learning_rate": 2.0318775405667512e-05, + "loss": 2.7814, + "num_input_tokens_seen": 6291456000, + "step": 12000 + }, + { + "epoch": 0.5819557349919072, + "eval_accuracy": 0.45578733105357433, + "eval_loss": 2.742259979248047, + "eval_runtime": 5.8929, + "eval_samples_per_second": 50.909, + "eval_steps_per_second": 6.448, + "num_input_tokens_seen": 6291456000, + "step": 12000 + }, + { + "epoch": 0.5821982165481538, + "grad_norm": 0.255859375, + "learning_rate": 2.0299084574719634e-05, + "loss": 2.7799, + "num_input_tokens_seen": 6294077440, + "step": 12005 + }, + { + "epoch": 0.5824406981044005, + "grad_norm": 0.2451171875, + "learning_rate": 2.0279396766489787e-05, + "loss": 2.7878, + "num_input_tokens_seen": 6296698880, + "step": 12010 + }, + { + "epoch": 0.5826831796606471, + "grad_norm": 0.24609375, + "learning_rate": 2.0259711993637354e-05, + "loss": 2.7795, + "num_input_tokens_seen": 6299320320, + "step": 12015 + }, + { + "epoch": 0.5829256612168937, + "grad_norm": 0.251953125, + "learning_rate": 2.024003026881976e-05, + "loss": 2.7704, + "num_input_tokens_seen": 6301941760, + "step": 12020 + }, + { + "epoch": 0.5831681427731403, + "grad_norm": 0.2392578125, + "learning_rate": 2.0220351604692497e-05, + "loss": 2.7574, + "num_input_tokens_seen": 6304563200, + "step": 12025 + }, + { + "epoch": 0.583410624329387, + "grad_norm": 0.2421875, + "learning_rate": 2.0200676013909042e-05, + "loss": 2.7772, + "num_input_tokens_seen": 6307184640, + "step": 12030 + }, + { + "epoch": 0.5836531058856336, + "grad_norm": 0.248046875, + "learning_rate": 2.0181003509120927e-05, + "loss": 2.8018, + "num_input_tokens_seen": 6309806080, + "step": 12035 + }, + { + "epoch": 0.5838955874418802, + "grad_norm": 0.2392578125, + "learning_rate": 2.0161334102977708e-05, + "loss": 2.7769, + "num_input_tokens_seen": 6312427520, + "step": 12040 + }, + { + "epoch": 0.5841380689981268, + "grad_norm": 0.24609375, + "learning_rate": 2.0141667808126935e-05, + "loss": 2.7753, + "num_input_tokens_seen": 6315048960, + "step": 12045 + }, + { + "epoch": 0.5843805505543734, + "grad_norm": 0.2412109375, + "learning_rate": 2.0122004637214154e-05, + "loss": 2.7767, + "num_input_tokens_seen": 6317670400, + "step": 12050 + }, + { + "epoch": 0.5846230321106201, + "grad_norm": 0.2451171875, + "learning_rate": 2.0102344602882916e-05, + "loss": 2.7824, + "num_input_tokens_seen": 6320291840, + "step": 12055 + }, + { + "epoch": 0.5848655136668667, + "grad_norm": 0.248046875, + "learning_rate": 2.0082687717774725e-05, + "loss": 2.7849, + "num_input_tokens_seen": 6322913280, + "step": 12060 + }, + { + "epoch": 0.5851079952231133, + "grad_norm": 0.2392578125, + "learning_rate": 2.0063033994529096e-05, + "loss": 2.7667, + "num_input_tokens_seen": 6325534720, + "step": 12065 + }, + { + "epoch": 0.5853504767793599, + "grad_norm": 0.251953125, + "learning_rate": 2.0043383445783498e-05, + "loss": 2.7749, + "num_input_tokens_seen": 6328156160, + "step": 12070 + }, + { + "epoch": 0.5855929583356067, + "grad_norm": 0.244140625, + "learning_rate": 2.002373608417335e-05, + "loss": 2.7736, + "num_input_tokens_seen": 6330777600, + "step": 12075 + }, + { + "epoch": 0.5858354398918533, + "grad_norm": 0.25390625, + "learning_rate": 2.0004091922332034e-05, + "loss": 2.7702, + "num_input_tokens_seen": 6333399040, + "step": 12080 + }, + { + "epoch": 0.5860779214480999, + "grad_norm": 0.2431640625, + "learning_rate": 1.998445097289087e-05, + "loss": 2.7792, + "num_input_tokens_seen": 6336020480, + "step": 12085 + }, + { + "epoch": 0.5863204030043465, + "grad_norm": 0.244140625, + "learning_rate": 1.9964813248479102e-05, + "loss": 2.7845, + "num_input_tokens_seen": 6338641920, + "step": 12090 + }, + { + "epoch": 0.5865628845605931, + "grad_norm": 0.25, + "learning_rate": 1.9945178761723915e-05, + "loss": 2.7684, + "num_input_tokens_seen": 6341263360, + "step": 12095 + }, + { + "epoch": 0.5868053661168398, + "grad_norm": 0.2470703125, + "learning_rate": 1.992554752525041e-05, + "loss": 2.7881, + "num_input_tokens_seen": 6343884800, + "step": 12100 + }, + { + "epoch": 0.5870478476730864, + "grad_norm": 0.251953125, + "learning_rate": 1.990591955168159e-05, + "loss": 2.7734, + "num_input_tokens_seen": 6346506240, + "step": 12105 + }, + { + "epoch": 0.587290329229333, + "grad_norm": 0.25390625, + "learning_rate": 1.9886294853638364e-05, + "loss": 2.7566, + "num_input_tokens_seen": 6349127680, + "step": 12110 + }, + { + "epoch": 0.5875328107855796, + "grad_norm": 0.25390625, + "learning_rate": 1.9866673443739548e-05, + "loss": 2.7629, + "num_input_tokens_seen": 6351749120, + "step": 12115 + }, + { + "epoch": 0.5877752923418262, + "grad_norm": 0.25, + "learning_rate": 1.9847055334601814e-05, + "loss": 2.784, + "num_input_tokens_seen": 6354370560, + "step": 12120 + }, + { + "epoch": 0.5880177738980729, + "grad_norm": 0.2470703125, + "learning_rate": 1.9827440538839737e-05, + "loss": 2.7889, + "num_input_tokens_seen": 6356992000, + "step": 12125 + }, + { + "epoch": 0.5882602554543195, + "grad_norm": 0.2421875, + "learning_rate": 1.980782906906575e-05, + "loss": 2.7823, + "num_input_tokens_seen": 6359613440, + "step": 12130 + }, + { + "epoch": 0.5885027370105661, + "grad_norm": 0.24609375, + "learning_rate": 1.978822093789016e-05, + "loss": 2.7825, + "num_input_tokens_seen": 6362234880, + "step": 12135 + }, + { + "epoch": 0.5887452185668127, + "grad_norm": 0.2490234375, + "learning_rate": 1.9768616157921107e-05, + "loss": 2.7652, + "num_input_tokens_seen": 6364856320, + "step": 12140 + }, + { + "epoch": 0.5889877001230593, + "grad_norm": 0.2421875, + "learning_rate": 1.9749014741764596e-05, + "loss": 2.7771, + "num_input_tokens_seen": 6367477760, + "step": 12145 + }, + { + "epoch": 0.589230181679306, + "grad_norm": 0.2490234375, + "learning_rate": 1.972941670202446e-05, + "loss": 2.767, + "num_input_tokens_seen": 6370099200, + "step": 12150 + }, + { + "epoch": 0.5894726632355527, + "grad_norm": 0.251953125, + "learning_rate": 1.970982205130235e-05, + "loss": 2.7573, + "num_input_tokens_seen": 6372720640, + "step": 12155 + }, + { + "epoch": 0.5897151447917993, + "grad_norm": 0.251953125, + "learning_rate": 1.9690230802197757e-05, + "loss": 2.7676, + "num_input_tokens_seen": 6375342080, + "step": 12160 + }, + { + "epoch": 0.5899576263480459, + "grad_norm": 0.25390625, + "learning_rate": 1.9670642967307976e-05, + "loss": 2.78, + "num_input_tokens_seen": 6377963520, + "step": 12165 + }, + { + "epoch": 0.5902001079042926, + "grad_norm": 0.2431640625, + "learning_rate": 1.9651058559228107e-05, + "loss": 2.7715, + "num_input_tokens_seen": 6380584960, + "step": 12170 + }, + { + "epoch": 0.5904425894605392, + "grad_norm": 0.24609375, + "learning_rate": 1.963147759055105e-05, + "loss": 2.764, + "num_input_tokens_seen": 6383206400, + "step": 12175 + }, + { + "epoch": 0.5906850710167858, + "grad_norm": 0.2431640625, + "learning_rate": 1.961190007386749e-05, + "loss": 2.7718, + "num_input_tokens_seen": 6385827840, + "step": 12180 + }, + { + "epoch": 0.5909275525730324, + "grad_norm": 0.25390625, + "learning_rate": 1.9592326021765887e-05, + "loss": 2.7784, + "num_input_tokens_seen": 6388449280, + "step": 12185 + }, + { + "epoch": 0.591170034129279, + "grad_norm": 0.2392578125, + "learning_rate": 1.957275544683248e-05, + "loss": 2.777, + "num_input_tokens_seen": 6391070720, + "step": 12190 + }, + { + "epoch": 0.5914125156855257, + "grad_norm": 0.25, + "learning_rate": 1.9553188361651276e-05, + "loss": 2.7786, + "num_input_tokens_seen": 6393692160, + "step": 12195 + }, + { + "epoch": 0.5916549972417723, + "grad_norm": 0.2451171875, + "learning_rate": 1.953362477880403e-05, + "loss": 2.7628, + "num_input_tokens_seen": 6396313600, + "step": 12200 + }, + { + "epoch": 0.5918974787980189, + "grad_norm": 0.2431640625, + "learning_rate": 1.9514064710870248e-05, + "loss": 2.7649, + "num_input_tokens_seen": 6398935040, + "step": 12205 + }, + { + "epoch": 0.5921399603542655, + "grad_norm": 0.248046875, + "learning_rate": 1.9494508170427183e-05, + "loss": 2.7757, + "num_input_tokens_seen": 6401556480, + "step": 12210 + }, + { + "epoch": 0.5923824419105121, + "grad_norm": 0.24609375, + "learning_rate": 1.9474955170049802e-05, + "loss": 2.7814, + "num_input_tokens_seen": 6404177920, + "step": 12215 + }, + { + "epoch": 0.5926249234667588, + "grad_norm": 0.248046875, + "learning_rate": 1.945540572231081e-05, + "loss": 2.7674, + "num_input_tokens_seen": 6406799360, + "step": 12220 + }, + { + "epoch": 0.5928674050230054, + "grad_norm": 0.25390625, + "learning_rate": 1.9435859839780623e-05, + "loss": 2.7723, + "num_input_tokens_seen": 6409420800, + "step": 12225 + }, + { + "epoch": 0.593109886579252, + "grad_norm": 0.248046875, + "learning_rate": 1.9416317535027374e-05, + "loss": 2.7838, + "num_input_tokens_seen": 6412042240, + "step": 12230 + }, + { + "epoch": 0.5933523681354987, + "grad_norm": 0.24609375, + "learning_rate": 1.9396778820616876e-05, + "loss": 2.769, + "num_input_tokens_seen": 6414663680, + "step": 12235 + }, + { + "epoch": 0.5935948496917454, + "grad_norm": 0.2470703125, + "learning_rate": 1.937724370911266e-05, + "loss": 2.7728, + "num_input_tokens_seen": 6417285120, + "step": 12240 + }, + { + "epoch": 0.593837331247992, + "grad_norm": 0.2421875, + "learning_rate": 1.9357712213075907e-05, + "loss": 2.7887, + "num_input_tokens_seen": 6419906560, + "step": 12245 + }, + { + "epoch": 0.5940798128042386, + "grad_norm": 0.251953125, + "learning_rate": 1.9338184345065495e-05, + "loss": 2.7656, + "num_input_tokens_seen": 6422528000, + "step": 12250 + }, + { + "epoch": 0.5943222943604852, + "grad_norm": 0.244140625, + "learning_rate": 1.9318660117637978e-05, + "loss": 2.7674, + "num_input_tokens_seen": 6425149440, + "step": 12255 + }, + { + "epoch": 0.5945647759167318, + "grad_norm": 0.2490234375, + "learning_rate": 1.9299139543347542e-05, + "loss": 2.7721, + "num_input_tokens_seen": 6427770880, + "step": 12260 + }, + { + "epoch": 0.5948072574729785, + "grad_norm": 0.244140625, + "learning_rate": 1.927962263474604e-05, + "loss": 2.7753, + "num_input_tokens_seen": 6430392320, + "step": 12265 + }, + { + "epoch": 0.5950497390292251, + "grad_norm": 0.24609375, + "learning_rate": 1.9260109404382985e-05, + "loss": 2.7697, + "num_input_tokens_seen": 6433013760, + "step": 12270 + }, + { + "epoch": 0.5952922205854717, + "grad_norm": 0.24609375, + "learning_rate": 1.9240599864805485e-05, + "loss": 2.7753, + "num_input_tokens_seen": 6435635200, + "step": 12275 + }, + { + "epoch": 0.5955347021417183, + "grad_norm": 0.2392578125, + "learning_rate": 1.92210940285583e-05, + "loss": 2.7812, + "num_input_tokens_seen": 6438256640, + "step": 12280 + }, + { + "epoch": 0.595777183697965, + "grad_norm": 0.240234375, + "learning_rate": 1.9201591908183808e-05, + "loss": 2.7734, + "num_input_tokens_seen": 6440878080, + "step": 12285 + }, + { + "epoch": 0.5960196652542116, + "grad_norm": 0.2470703125, + "learning_rate": 1.9182093516221995e-05, + "loss": 2.7799, + "num_input_tokens_seen": 6443499520, + "step": 12290 + }, + { + "epoch": 0.5962621468104582, + "grad_norm": 0.2451171875, + "learning_rate": 1.916259886521044e-05, + "loss": 2.7748, + "num_input_tokens_seen": 6446120960, + "step": 12295 + }, + { + "epoch": 0.5965046283667048, + "grad_norm": 0.240234375, + "learning_rate": 1.914310796768434e-05, + "loss": 2.7735, + "num_input_tokens_seen": 6448742400, + "step": 12300 + }, + { + "epoch": 0.5965046283667048, + "eval_accuracy": 0.45588014981273406, + "eval_loss": 2.742222309112549, + "eval_runtime": 5.8637, + "eval_samples_per_second": 51.162, + "eval_steps_per_second": 6.481, + "num_input_tokens_seen": 6448742400, + "step": 12300 + }, + { + "epoch": 0.5967471099229514, + "grad_norm": 0.2431640625, + "learning_rate": 1.9123620836176467e-05, + "loss": 2.7739, + "num_input_tokens_seen": 6451363840, + "step": 12305 + }, + { + "epoch": 0.5969895914791982, + "grad_norm": 0.2353515625, + "learning_rate": 1.9104137483217148e-05, + "loss": 2.7703, + "num_input_tokens_seen": 6453985280, + "step": 12310 + }, + { + "epoch": 0.5972320730354448, + "grad_norm": 0.2431640625, + "learning_rate": 1.9084657921334314e-05, + "loss": 2.7687, + "num_input_tokens_seen": 6456606720, + "step": 12315 + }, + { + "epoch": 0.5974745545916914, + "grad_norm": 0.2451171875, + "learning_rate": 1.9065182163053435e-05, + "loss": 2.7785, + "num_input_tokens_seen": 6459228160, + "step": 12320 + }, + { + "epoch": 0.597717036147938, + "grad_norm": 0.2490234375, + "learning_rate": 1.904571022089756e-05, + "loss": 2.7631, + "num_input_tokens_seen": 6461849600, + "step": 12325 + }, + { + "epoch": 0.5979595177041847, + "grad_norm": 0.2451171875, + "learning_rate": 1.9026242107387266e-05, + "loss": 2.7736, + "num_input_tokens_seen": 6464471040, + "step": 12330 + }, + { + "epoch": 0.5982019992604313, + "grad_norm": 0.2431640625, + "learning_rate": 1.9006777835040675e-05, + "loss": 2.7736, + "num_input_tokens_seen": 6467092480, + "step": 12335 + }, + { + "epoch": 0.5984444808166779, + "grad_norm": 0.248046875, + "learning_rate": 1.8987317416373418e-05, + "loss": 2.7852, + "num_input_tokens_seen": 6469713920, + "step": 12340 + }, + { + "epoch": 0.5986869623729245, + "grad_norm": 0.2412109375, + "learning_rate": 1.8967860863898677e-05, + "loss": 2.7914, + "num_input_tokens_seen": 6472335360, + "step": 12345 + }, + { + "epoch": 0.5989294439291711, + "grad_norm": 0.251953125, + "learning_rate": 1.894840819012714e-05, + "loss": 2.7865, + "num_input_tokens_seen": 6474956800, + "step": 12350 + }, + { + "epoch": 0.5991719254854178, + "grad_norm": 0.255859375, + "learning_rate": 1.892895940756699e-05, + "loss": 2.7708, + "num_input_tokens_seen": 6477578240, + "step": 12355 + }, + { + "epoch": 0.5994144070416644, + "grad_norm": 0.2373046875, + "learning_rate": 1.8909514528723933e-05, + "loss": 2.7799, + "num_input_tokens_seen": 6480199680, + "step": 12360 + }, + { + "epoch": 0.599656888597911, + "grad_norm": 0.2451171875, + "learning_rate": 1.8890073566101138e-05, + "loss": 2.7649, + "num_input_tokens_seen": 6482821120, + "step": 12365 + }, + { + "epoch": 0.5998993701541576, + "grad_norm": 0.2373046875, + "learning_rate": 1.887063653219925e-05, + "loss": 2.7881, + "num_input_tokens_seen": 6485442560, + "step": 12370 + }, + { + "epoch": 0.6001418517104042, + "grad_norm": 0.251953125, + "learning_rate": 1.8851203439516423e-05, + "loss": 2.7742, + "num_input_tokens_seen": 6488064000, + "step": 12375 + }, + { + "epoch": 0.6003843332666509, + "grad_norm": 0.2421875, + "learning_rate": 1.8831774300548252e-05, + "loss": 2.7796, + "num_input_tokens_seen": 6490685440, + "step": 12380 + }, + { + "epoch": 0.6006268148228975, + "grad_norm": 0.24609375, + "learning_rate": 1.8812349127787792e-05, + "loss": 2.7747, + "num_input_tokens_seen": 6493306880, + "step": 12385 + }, + { + "epoch": 0.6008692963791442, + "grad_norm": 0.24609375, + "learning_rate": 1.8792927933725555e-05, + "loss": 2.7859, + "num_input_tokens_seen": 6495928320, + "step": 12390 + }, + { + "epoch": 0.6011117779353908, + "grad_norm": 0.2451171875, + "learning_rate": 1.8773510730849497e-05, + "loss": 2.7795, + "num_input_tokens_seen": 6498549760, + "step": 12395 + }, + { + "epoch": 0.6013542594916375, + "grad_norm": 0.24609375, + "learning_rate": 1.8754097531644975e-05, + "loss": 2.7734, + "num_input_tokens_seen": 6501171200, + "step": 12400 + }, + { + "epoch": 0.6015967410478841, + "grad_norm": 0.2421875, + "learning_rate": 1.873468834859482e-05, + "loss": 2.7718, + "num_input_tokens_seen": 6503792640, + "step": 12405 + }, + { + "epoch": 0.6018392226041307, + "grad_norm": 0.24609375, + "learning_rate": 1.8715283194179254e-05, + "loss": 2.7756, + "num_input_tokens_seen": 6506414080, + "step": 12410 + }, + { + "epoch": 0.6020817041603773, + "grad_norm": 0.25, + "learning_rate": 1.869588208087591e-05, + "loss": 2.7581, + "num_input_tokens_seen": 6509035520, + "step": 12415 + }, + { + "epoch": 0.6023241857166239, + "grad_norm": 0.244140625, + "learning_rate": 1.8676485021159827e-05, + "loss": 2.769, + "num_input_tokens_seen": 6511656960, + "step": 12420 + }, + { + "epoch": 0.6025666672728706, + "grad_norm": 0.2421875, + "learning_rate": 1.8657092027503444e-05, + "loss": 2.7913, + "num_input_tokens_seen": 6514278400, + "step": 12425 + }, + { + "epoch": 0.6028091488291172, + "grad_norm": 0.248046875, + "learning_rate": 1.863770311237656e-05, + "loss": 2.783, + "num_input_tokens_seen": 6516899840, + "step": 12430 + }, + { + "epoch": 0.6030516303853638, + "grad_norm": 0.248046875, + "learning_rate": 1.8618318288246373e-05, + "loss": 2.7735, + "num_input_tokens_seen": 6519521280, + "step": 12435 + }, + { + "epoch": 0.6032941119416104, + "grad_norm": 0.23828125, + "learning_rate": 1.8598937567577456e-05, + "loss": 2.7734, + "num_input_tokens_seen": 6522142720, + "step": 12440 + }, + { + "epoch": 0.603536593497857, + "grad_norm": 0.244140625, + "learning_rate": 1.8579560962831727e-05, + "loss": 2.7794, + "num_input_tokens_seen": 6524764160, + "step": 12445 + }, + { + "epoch": 0.6037790750541037, + "grad_norm": 0.2470703125, + "learning_rate": 1.8560188486468463e-05, + "loss": 2.7717, + "num_input_tokens_seen": 6527385600, + "step": 12450 + }, + { + "epoch": 0.6040215566103503, + "grad_norm": 0.2490234375, + "learning_rate": 1.8540820150944292e-05, + "loss": 2.7831, + "num_input_tokens_seen": 6530007040, + "step": 12455 + }, + { + "epoch": 0.6042640381665969, + "grad_norm": 0.251953125, + "learning_rate": 1.8521455968713176e-05, + "loss": 2.7735, + "num_input_tokens_seen": 6532628480, + "step": 12460 + }, + { + "epoch": 0.6045065197228435, + "grad_norm": 0.248046875, + "learning_rate": 1.85020959522264e-05, + "loss": 2.7771, + "num_input_tokens_seen": 6535249920, + "step": 12465 + }, + { + "epoch": 0.6047490012790903, + "grad_norm": 0.2431640625, + "learning_rate": 1.8482740113932573e-05, + "loss": 2.7757, + "num_input_tokens_seen": 6537871360, + "step": 12470 + }, + { + "epoch": 0.6049914828353369, + "grad_norm": 0.251953125, + "learning_rate": 1.8463388466277625e-05, + "loss": 2.7683, + "num_input_tokens_seen": 6540492800, + "step": 12475 + }, + { + "epoch": 0.6052339643915835, + "grad_norm": 0.2490234375, + "learning_rate": 1.844404102170479e-05, + "loss": 2.7574, + "num_input_tokens_seen": 6543114240, + "step": 12480 + }, + { + "epoch": 0.6054764459478301, + "grad_norm": 0.248046875, + "learning_rate": 1.842469779265459e-05, + "loss": 2.7632, + "num_input_tokens_seen": 6545735680, + "step": 12485 + }, + { + "epoch": 0.6057189275040767, + "grad_norm": 0.251953125, + "learning_rate": 1.8405358791564846e-05, + "loss": 2.7693, + "num_input_tokens_seen": 6548357120, + "step": 12490 + }, + { + "epoch": 0.6059614090603234, + "grad_norm": 0.2431640625, + "learning_rate": 1.8386024030870653e-05, + "loss": 2.7743, + "num_input_tokens_seen": 6550978560, + "step": 12495 + }, + { + "epoch": 0.60620389061657, + "grad_norm": 0.23828125, + "learning_rate": 1.8366693523004385e-05, + "loss": 2.764, + "num_input_tokens_seen": 6553600000, + "step": 12500 + }, + { + "epoch": 0.6064463721728166, + "grad_norm": 0.251953125, + "learning_rate": 1.834736728039568e-05, + "loss": 2.7782, + "num_input_tokens_seen": 6556221440, + "step": 12505 + }, + { + "epoch": 0.6066888537290632, + "grad_norm": 0.2431640625, + "learning_rate": 1.8328045315471432e-05, + "loss": 2.7852, + "num_input_tokens_seen": 6558842880, + "step": 12510 + }, + { + "epoch": 0.6069313352853098, + "grad_norm": 0.2431640625, + "learning_rate": 1.8308727640655786e-05, + "loss": 2.7706, + "num_input_tokens_seen": 6561464320, + "step": 12515 + }, + { + "epoch": 0.6071738168415565, + "grad_norm": 0.240234375, + "learning_rate": 1.828941426837013e-05, + "loss": 2.7746, + "num_input_tokens_seen": 6564085760, + "step": 12520 + }, + { + "epoch": 0.6074162983978031, + "grad_norm": 0.24609375, + "learning_rate": 1.8270105211033082e-05, + "loss": 2.7705, + "num_input_tokens_seen": 6566707200, + "step": 12525 + }, + { + "epoch": 0.6076587799540497, + "grad_norm": 0.2431640625, + "learning_rate": 1.8250800481060482e-05, + "loss": 2.7724, + "num_input_tokens_seen": 6569328640, + "step": 12530 + }, + { + "epoch": 0.6079012615102963, + "grad_norm": 0.2412109375, + "learning_rate": 1.8231500090865395e-05, + "loss": 2.7723, + "num_input_tokens_seen": 6571950080, + "step": 12535 + }, + { + "epoch": 0.608143743066543, + "grad_norm": 0.25, + "learning_rate": 1.821220405285809e-05, + "loss": 2.7829, + "num_input_tokens_seen": 6574571520, + "step": 12540 + }, + { + "epoch": 0.6083862246227897, + "grad_norm": 0.2451171875, + "learning_rate": 1.8192912379446047e-05, + "loss": 2.7766, + "num_input_tokens_seen": 6577192960, + "step": 12545 + }, + { + "epoch": 0.6086287061790363, + "grad_norm": 0.25, + "learning_rate": 1.8173625083033935e-05, + "loss": 2.775, + "num_input_tokens_seen": 6579814400, + "step": 12550 + }, + { + "epoch": 0.6088711877352829, + "grad_norm": 0.2451171875, + "learning_rate": 1.81543421760236e-05, + "loss": 2.7615, + "num_input_tokens_seen": 6582435840, + "step": 12555 + }, + { + "epoch": 0.6091136692915295, + "grad_norm": 0.2373046875, + "learning_rate": 1.8135063670814062e-05, + "loss": 2.7693, + "num_input_tokens_seen": 6585057280, + "step": 12560 + }, + { + "epoch": 0.6093561508477762, + "grad_norm": 0.2470703125, + "learning_rate": 1.8115789579801534e-05, + "loss": 2.7738, + "num_input_tokens_seen": 6587678720, + "step": 12565 + }, + { + "epoch": 0.6095986324040228, + "grad_norm": 0.2421875, + "learning_rate": 1.8096519915379376e-05, + "loss": 2.7831, + "num_input_tokens_seen": 6590300160, + "step": 12570 + }, + { + "epoch": 0.6098411139602694, + "grad_norm": 0.25, + "learning_rate": 1.80772546899381e-05, + "loss": 2.7983, + "num_input_tokens_seen": 6592921600, + "step": 12575 + }, + { + "epoch": 0.610083595516516, + "grad_norm": 0.263671875, + "learning_rate": 1.8057993915865372e-05, + "loss": 2.7901, + "num_input_tokens_seen": 6595543040, + "step": 12580 + }, + { + "epoch": 0.6103260770727627, + "grad_norm": 0.2431640625, + "learning_rate": 1.8038737605545977e-05, + "loss": 2.7852, + "num_input_tokens_seen": 6598164480, + "step": 12585 + }, + { + "epoch": 0.6105685586290093, + "grad_norm": 0.236328125, + "learning_rate": 1.8019485771361854e-05, + "loss": 2.7747, + "num_input_tokens_seen": 6600785920, + "step": 12590 + }, + { + "epoch": 0.6108110401852559, + "grad_norm": 0.259765625, + "learning_rate": 1.8000238425692052e-05, + "loss": 2.7769, + "num_input_tokens_seen": 6603407360, + "step": 12595 + }, + { + "epoch": 0.6110535217415025, + "grad_norm": 0.25, + "learning_rate": 1.7980995580912728e-05, + "loss": 2.7848, + "num_input_tokens_seen": 6606028800, + "step": 12600 + }, + { + "epoch": 0.6110535217415025, + "eval_accuracy": 0.4559224881941052, + "eval_loss": 2.7420172691345215, + "eval_runtime": 5.8785, + "eval_samples_per_second": 51.034, + "eval_steps_per_second": 6.464, + "num_input_tokens_seen": 6606028800, + "step": 12600 + }, + { + "epoch": 0.6112960032977491, + "grad_norm": 0.2431640625, + "learning_rate": 1.7961757249397153e-05, + "loss": 2.7728, + "num_input_tokens_seen": 6608650240, + "step": 12605 + }, + { + "epoch": 0.6115384848539958, + "grad_norm": 0.2451171875, + "learning_rate": 1.7942523443515703e-05, + "loss": 2.7823, + "num_input_tokens_seen": 6611271680, + "step": 12610 + }, + { + "epoch": 0.6117809664102424, + "grad_norm": 0.2490234375, + "learning_rate": 1.792329417563584e-05, + "loss": 2.7777, + "num_input_tokens_seen": 6613893120, + "step": 12615 + }, + { + "epoch": 0.612023447966489, + "grad_norm": 0.240234375, + "learning_rate": 1.7904069458122085e-05, + "loss": 2.7676, + "num_input_tokens_seen": 6616514560, + "step": 12620 + }, + { + "epoch": 0.6122659295227357, + "grad_norm": 0.25, + "learning_rate": 1.788484930333606e-05, + "loss": 2.7701, + "num_input_tokens_seen": 6619136000, + "step": 12625 + }, + { + "epoch": 0.6125084110789824, + "grad_norm": 0.24609375, + "learning_rate": 1.786563372363644e-05, + "loss": 2.7708, + "num_input_tokens_seen": 6621757440, + "step": 12630 + }, + { + "epoch": 0.612750892635229, + "grad_norm": 0.25390625, + "learning_rate": 1.7846422731378976e-05, + "loss": 2.7842, + "num_input_tokens_seen": 6624378880, + "step": 12635 + }, + { + "epoch": 0.6129933741914756, + "grad_norm": 0.24609375, + "learning_rate": 1.7827216338916444e-05, + "loss": 2.783, + "num_input_tokens_seen": 6627000320, + "step": 12640 + }, + { + "epoch": 0.6132358557477222, + "grad_norm": 0.240234375, + "learning_rate": 1.780801455859869e-05, + "loss": 2.7665, + "num_input_tokens_seen": 6629621760, + "step": 12645 + }, + { + "epoch": 0.6134783373039688, + "grad_norm": 0.244140625, + "learning_rate": 1.778881740277256e-05, + "loss": 2.7907, + "num_input_tokens_seen": 6632243200, + "step": 12650 + }, + { + "epoch": 0.6137208188602155, + "grad_norm": 0.240234375, + "learning_rate": 1.7769624883781952e-05, + "loss": 2.7794, + "num_input_tokens_seen": 6634864640, + "step": 12655 + }, + { + "epoch": 0.6139633004164621, + "grad_norm": 0.248046875, + "learning_rate": 1.7750437013967773e-05, + "loss": 2.7774, + "num_input_tokens_seen": 6637486080, + "step": 12660 + }, + { + "epoch": 0.6142057819727087, + "grad_norm": 0.2431640625, + "learning_rate": 1.7731253805667946e-05, + "loss": 2.7857, + "num_input_tokens_seen": 6640107520, + "step": 12665 + }, + { + "epoch": 0.6144482635289553, + "grad_norm": 0.2373046875, + "learning_rate": 1.7712075271217403e-05, + "loss": 2.7687, + "num_input_tokens_seen": 6642728960, + "step": 12670 + }, + { + "epoch": 0.6146907450852019, + "grad_norm": 0.240234375, + "learning_rate": 1.769290142294806e-05, + "loss": 2.7591, + "num_input_tokens_seen": 6645350400, + "step": 12675 + }, + { + "epoch": 0.6149332266414486, + "grad_norm": 0.2412109375, + "learning_rate": 1.7673732273188807e-05, + "loss": 2.776, + "num_input_tokens_seen": 6647971840, + "step": 12680 + }, + { + "epoch": 0.6151757081976952, + "grad_norm": 0.244140625, + "learning_rate": 1.765456783426553e-05, + "loss": 2.7742, + "num_input_tokens_seen": 6650593280, + "step": 12685 + }, + { + "epoch": 0.6154181897539418, + "grad_norm": 0.251953125, + "learning_rate": 1.763540811850109e-05, + "loss": 2.7853, + "num_input_tokens_seen": 6653214720, + "step": 12690 + }, + { + "epoch": 0.6156606713101884, + "grad_norm": 0.2578125, + "learning_rate": 1.7616253138215307e-05, + "loss": 2.7748, + "num_input_tokens_seen": 6655836160, + "step": 12695 + }, + { + "epoch": 0.615903152866435, + "grad_norm": 0.2421875, + "learning_rate": 1.7597102905724944e-05, + "loss": 2.7738, + "num_input_tokens_seen": 6658457600, + "step": 12700 + }, + { + "epoch": 0.6161456344226818, + "grad_norm": 0.2470703125, + "learning_rate": 1.757795743334374e-05, + "loss": 2.784, + "num_input_tokens_seen": 6661079040, + "step": 12705 + }, + { + "epoch": 0.6163881159789284, + "grad_norm": 0.24609375, + "learning_rate": 1.755881673338232e-05, + "loss": 2.7678, + "num_input_tokens_seen": 6663700480, + "step": 12710 + }, + { + "epoch": 0.616630597535175, + "grad_norm": 0.2490234375, + "learning_rate": 1.7539680818148303e-05, + "loss": 2.7799, + "num_input_tokens_seen": 6666321920, + "step": 12715 + }, + { + "epoch": 0.6168730790914216, + "grad_norm": 0.25, + "learning_rate": 1.7520549699946184e-05, + "loss": 2.7806, + "num_input_tokens_seen": 6668943360, + "step": 12720 + }, + { + "epoch": 0.6171155606476683, + "grad_norm": 0.2470703125, + "learning_rate": 1.7501423391077403e-05, + "loss": 2.7836, + "num_input_tokens_seen": 6671564800, + "step": 12725 + }, + { + "epoch": 0.6173580422039149, + "grad_norm": 0.2392578125, + "learning_rate": 1.7482301903840292e-05, + "loss": 2.7726, + "num_input_tokens_seen": 6674186240, + "step": 12730 + }, + { + "epoch": 0.6176005237601615, + "grad_norm": 0.2451171875, + "learning_rate": 1.74631852505301e-05, + "loss": 2.7747, + "num_input_tokens_seen": 6676807680, + "step": 12735 + }, + { + "epoch": 0.6178430053164081, + "grad_norm": 0.251953125, + "learning_rate": 1.7444073443438923e-05, + "loss": 2.7729, + "num_input_tokens_seen": 6679429120, + "step": 12740 + }, + { + "epoch": 0.6180854868726547, + "grad_norm": 0.24609375, + "learning_rate": 1.7424966494855798e-05, + "loss": 2.7976, + "num_input_tokens_seen": 6682050560, + "step": 12745 + }, + { + "epoch": 0.6183279684289014, + "grad_norm": 0.2421875, + "learning_rate": 1.740586441706661e-05, + "loss": 2.7827, + "num_input_tokens_seen": 6684672000, + "step": 12750 + }, + { + "epoch": 0.618570449985148, + "grad_norm": 0.2353515625, + "learning_rate": 1.7386767222354105e-05, + "loss": 2.7784, + "num_input_tokens_seen": 6687293440, + "step": 12755 + }, + { + "epoch": 0.6188129315413946, + "grad_norm": 0.2412109375, + "learning_rate": 1.7367674922997907e-05, + "loss": 2.7745, + "num_input_tokens_seen": 6689914880, + "step": 12760 + }, + { + "epoch": 0.6190554130976412, + "grad_norm": 0.248046875, + "learning_rate": 1.734858753127448e-05, + "loss": 2.7726, + "num_input_tokens_seen": 6692536320, + "step": 12765 + }, + { + "epoch": 0.6192978946538878, + "grad_norm": 0.248046875, + "learning_rate": 1.7329505059457143e-05, + "loss": 2.7715, + "num_input_tokens_seen": 6695157760, + "step": 12770 + }, + { + "epoch": 0.6195403762101345, + "grad_norm": 0.24609375, + "learning_rate": 1.7310427519816036e-05, + "loss": 2.761, + "num_input_tokens_seen": 6697779200, + "step": 12775 + }, + { + "epoch": 0.6197828577663811, + "grad_norm": 0.25, + "learning_rate": 1.7291354924618136e-05, + "loss": 2.7712, + "num_input_tokens_seen": 6700400640, + "step": 12780 + }, + { + "epoch": 0.6200253393226278, + "grad_norm": 0.251953125, + "learning_rate": 1.7272287286127247e-05, + "loss": 2.7901, + "num_input_tokens_seen": 6703022080, + "step": 12785 + }, + { + "epoch": 0.6202678208788744, + "grad_norm": 0.2421875, + "learning_rate": 1.725322461660398e-05, + "loss": 2.8016, + "num_input_tokens_seen": 6705643520, + "step": 12790 + }, + { + "epoch": 0.6205103024351211, + "grad_norm": 0.25, + "learning_rate": 1.7234166928305744e-05, + "loss": 2.7778, + "num_input_tokens_seen": 6708264960, + "step": 12795 + }, + { + "epoch": 0.6207527839913677, + "grad_norm": 0.2412109375, + "learning_rate": 1.7215114233486762e-05, + "loss": 2.7917, + "num_input_tokens_seen": 6710886400, + "step": 12800 + }, + { + "epoch": 0.6209952655476143, + "grad_norm": 0.25, + "learning_rate": 1.7196066544398026e-05, + "loss": 2.7839, + "num_input_tokens_seen": 6713507840, + "step": 12805 + }, + { + "epoch": 0.6212377471038609, + "grad_norm": 0.24609375, + "learning_rate": 1.7177023873287324e-05, + "loss": 2.7706, + "num_input_tokens_seen": 6716129280, + "step": 12810 + }, + { + "epoch": 0.6214802286601075, + "grad_norm": 0.2470703125, + "learning_rate": 1.715798623239921e-05, + "loss": 2.766, + "num_input_tokens_seen": 6718750720, + "step": 12815 + }, + { + "epoch": 0.6217227102163542, + "grad_norm": 0.24609375, + "learning_rate": 1.7138953633975007e-05, + "loss": 2.7767, + "num_input_tokens_seen": 6721372160, + "step": 12820 + }, + { + "epoch": 0.6219651917726008, + "grad_norm": 0.2470703125, + "learning_rate": 1.71199260902528e-05, + "loss": 2.7666, + "num_input_tokens_seen": 6723993600, + "step": 12825 + }, + { + "epoch": 0.6222076733288474, + "grad_norm": 0.240234375, + "learning_rate": 1.7100903613467416e-05, + "loss": 2.7815, + "num_input_tokens_seen": 6726615040, + "step": 12830 + }, + { + "epoch": 0.622450154885094, + "grad_norm": 0.240234375, + "learning_rate": 1.7081886215850424e-05, + "loss": 2.7743, + "num_input_tokens_seen": 6729236480, + "step": 12835 + }, + { + "epoch": 0.6226926364413407, + "grad_norm": 0.24609375, + "learning_rate": 1.7062873909630127e-05, + "loss": 2.7768, + "num_input_tokens_seen": 6731857920, + "step": 12840 + }, + { + "epoch": 0.6229351179975873, + "grad_norm": 0.2451171875, + "learning_rate": 1.7043866707031562e-05, + "loss": 2.7609, + "num_input_tokens_seen": 6734479360, + "step": 12845 + }, + { + "epoch": 0.6231775995538339, + "grad_norm": 0.240234375, + "learning_rate": 1.702486462027648e-05, + "loss": 2.7722, + "num_input_tokens_seen": 6737100800, + "step": 12850 + }, + { + "epoch": 0.6234200811100805, + "grad_norm": 0.2421875, + "learning_rate": 1.7005867661583336e-05, + "loss": 2.7858, + "num_input_tokens_seen": 6739722240, + "step": 12855 + }, + { + "epoch": 0.6236625626663272, + "grad_norm": 0.244140625, + "learning_rate": 1.6986875843167306e-05, + "loss": 2.7844, + "num_input_tokens_seen": 6742343680, + "step": 12860 + }, + { + "epoch": 0.6239050442225739, + "grad_norm": 0.244140625, + "learning_rate": 1.696788917724023e-05, + "loss": 2.7749, + "num_input_tokens_seen": 6744965120, + "step": 12865 + }, + { + "epoch": 0.6241475257788205, + "grad_norm": 0.25, + "learning_rate": 1.694890767601066e-05, + "loss": 2.7744, + "num_input_tokens_seen": 6747586560, + "step": 12870 + }, + { + "epoch": 0.6243900073350671, + "grad_norm": 0.244140625, + "learning_rate": 1.6929931351683824e-05, + "loss": 2.7816, + "num_input_tokens_seen": 6750208000, + "step": 12875 + }, + { + "epoch": 0.6246324888913137, + "grad_norm": 0.2451171875, + "learning_rate": 1.691096021646162e-05, + "loss": 2.7778, + "num_input_tokens_seen": 6752829440, + "step": 12880 + }, + { + "epoch": 0.6248749704475604, + "grad_norm": 0.248046875, + "learning_rate": 1.6891994282542595e-05, + "loss": 2.78, + "num_input_tokens_seen": 6755450880, + "step": 12885 + }, + { + "epoch": 0.625117452003807, + "grad_norm": 0.2451171875, + "learning_rate": 1.687303356212198e-05, + "loss": 2.7668, + "num_input_tokens_seen": 6758072320, + "step": 12890 + }, + { + "epoch": 0.6253599335600536, + "grad_norm": 0.25, + "learning_rate": 1.6854078067391617e-05, + "loss": 2.7691, + "num_input_tokens_seen": 6760693760, + "step": 12895 + }, + { + "epoch": 0.6256024151163002, + "grad_norm": 0.2421875, + "learning_rate": 1.6835127810540018e-05, + "loss": 2.7748, + "num_input_tokens_seen": 6763315200, + "step": 12900 + }, + { + "epoch": 0.6256024151163002, + "eval_accuracy": 0.4559110893991207, + "eval_loss": 2.742025375366211, + "eval_runtime": 5.8351, + "eval_samples_per_second": 51.413, + "eval_steps_per_second": 6.512, + "num_input_tokens_seen": 6763315200, + "step": 12900 + }, + { + "epoch": 0.6258448966725468, + "grad_norm": 0.240234375, + "learning_rate": 1.681618280375232e-05, + "loss": 2.7705, + "num_input_tokens_seen": 6765936640, + "step": 12905 + }, + { + "epoch": 0.6260873782287935, + "grad_norm": 0.2412109375, + "learning_rate": 1.6797243059210273e-05, + "loss": 2.7711, + "num_input_tokens_seen": 6768558080, + "step": 12910 + }, + { + "epoch": 0.6263298597850401, + "grad_norm": 0.240234375, + "learning_rate": 1.6778308589092255e-05, + "loss": 2.7629, + "num_input_tokens_seen": 6771179520, + "step": 12915 + }, + { + "epoch": 0.6265723413412867, + "grad_norm": 0.2431640625, + "learning_rate": 1.675937940557325e-05, + "loss": 2.7645, + "num_input_tokens_seen": 6773800960, + "step": 12920 + }, + { + "epoch": 0.6268148228975333, + "grad_norm": 0.24609375, + "learning_rate": 1.6740455520824852e-05, + "loss": 2.7832, + "num_input_tokens_seen": 6776422400, + "step": 12925 + }, + { + "epoch": 0.6270573044537799, + "grad_norm": 0.255859375, + "learning_rate": 1.6721536947015216e-05, + "loss": 2.7708, + "num_input_tokens_seen": 6779043840, + "step": 12930 + }, + { + "epoch": 0.6272997860100266, + "grad_norm": 0.2373046875, + "learning_rate": 1.670262369630911e-05, + "loss": 2.7812, + "num_input_tokens_seen": 6781665280, + "step": 12935 + }, + { + "epoch": 0.6275422675662733, + "grad_norm": 0.236328125, + "learning_rate": 1.6683715780867882e-05, + "loss": 2.7591, + "num_input_tokens_seen": 6784286720, + "step": 12940 + }, + { + "epoch": 0.6277847491225199, + "grad_norm": 0.2470703125, + "learning_rate": 1.6664813212849424e-05, + "loss": 2.7775, + "num_input_tokens_seen": 6786908160, + "step": 12945 + }, + { + "epoch": 0.6280272306787665, + "grad_norm": 0.2470703125, + "learning_rate": 1.664591600440822e-05, + "loss": 2.7786, + "num_input_tokens_seen": 6789529600, + "step": 12950 + }, + { + "epoch": 0.6282697122350132, + "grad_norm": 0.248046875, + "learning_rate": 1.6627024167695296e-05, + "loss": 2.7621, + "num_input_tokens_seen": 6792151040, + "step": 12955 + }, + { + "epoch": 0.6285121937912598, + "grad_norm": 0.2421875, + "learning_rate": 1.660813771485821e-05, + "loss": 2.7683, + "num_input_tokens_seen": 6794772480, + "step": 12960 + }, + { + "epoch": 0.6287546753475064, + "grad_norm": 0.23828125, + "learning_rate": 1.6589256658041062e-05, + "loss": 2.7679, + "num_input_tokens_seen": 6797393920, + "step": 12965 + }, + { + "epoch": 0.628997156903753, + "grad_norm": 0.2421875, + "learning_rate": 1.6570381009384506e-05, + "loss": 2.7724, + "num_input_tokens_seen": 6800015360, + "step": 12970 + }, + { + "epoch": 0.6292396384599996, + "grad_norm": 0.2421875, + "learning_rate": 1.65515107810257e-05, + "loss": 2.7841, + "num_input_tokens_seen": 6802636800, + "step": 12975 + }, + { + "epoch": 0.6294821200162463, + "grad_norm": 0.2392578125, + "learning_rate": 1.653264598509831e-05, + "loss": 2.7713, + "num_input_tokens_seen": 6805258240, + "step": 12980 + }, + { + "epoch": 0.6297246015724929, + "grad_norm": 0.244140625, + "learning_rate": 1.6513786633732537e-05, + "loss": 2.7771, + "num_input_tokens_seen": 6807879680, + "step": 12985 + }, + { + "epoch": 0.6299670831287395, + "grad_norm": 0.2451171875, + "learning_rate": 1.6494932739055035e-05, + "loss": 2.7914, + "num_input_tokens_seen": 6810501120, + "step": 12990 + }, + { + "epoch": 0.6302095646849861, + "grad_norm": 0.2412109375, + "learning_rate": 1.6476084313188988e-05, + "loss": 2.7755, + "num_input_tokens_seen": 6813122560, + "step": 12995 + }, + { + "epoch": 0.6304520462412327, + "grad_norm": 0.248046875, + "learning_rate": 1.6457241368254056e-05, + "loss": 2.7811, + "num_input_tokens_seen": 6815744000, + "step": 13000 + }, + { + "epoch": 0.6306945277974794, + "grad_norm": 0.25, + "learning_rate": 1.6438403916366368e-05, + "loss": 2.7864, + "num_input_tokens_seen": 6818365440, + "step": 13005 + }, + { + "epoch": 0.630937009353726, + "grad_norm": 0.240234375, + "learning_rate": 1.6419571969638525e-05, + "loss": 2.7722, + "num_input_tokens_seen": 6820986880, + "step": 13010 + }, + { + "epoch": 0.6311794909099726, + "grad_norm": 0.2431640625, + "learning_rate": 1.6400745540179592e-05, + "loss": 2.7862, + "num_input_tokens_seen": 6823608320, + "step": 13015 + }, + { + "epoch": 0.6314219724662193, + "grad_norm": 0.2421875, + "learning_rate": 1.6381924640095065e-05, + "loss": 2.7735, + "num_input_tokens_seen": 6826229760, + "step": 13020 + }, + { + "epoch": 0.631664454022466, + "grad_norm": 0.2451171875, + "learning_rate": 1.6363109281486904e-05, + "loss": 2.7704, + "num_input_tokens_seen": 6828851200, + "step": 13025 + }, + { + "epoch": 0.6319069355787126, + "grad_norm": 0.2451171875, + "learning_rate": 1.634429947645351e-05, + "loss": 2.7741, + "num_input_tokens_seen": 6831472640, + "step": 13030 + }, + { + "epoch": 0.6321494171349592, + "grad_norm": 0.244140625, + "learning_rate": 1.6325495237089704e-05, + "loss": 2.7672, + "num_input_tokens_seen": 6834094080, + "step": 13035 + }, + { + "epoch": 0.6323918986912058, + "grad_norm": 0.2431640625, + "learning_rate": 1.630669657548673e-05, + "loss": 2.7789, + "num_input_tokens_seen": 6836715520, + "step": 13040 + }, + { + "epoch": 0.6326343802474524, + "grad_norm": 0.2421875, + "learning_rate": 1.628790350373225e-05, + "loss": 2.7799, + "num_input_tokens_seen": 6839336960, + "step": 13045 + }, + { + "epoch": 0.6328768618036991, + "grad_norm": 0.2421875, + "learning_rate": 1.626911603391031e-05, + "loss": 2.7767, + "num_input_tokens_seen": 6841958400, + "step": 13050 + }, + { + "epoch": 0.6331193433599457, + "grad_norm": 0.251953125, + "learning_rate": 1.6250334178101378e-05, + "loss": 2.7711, + "num_input_tokens_seen": 6844579840, + "step": 13055 + }, + { + "epoch": 0.6333618249161923, + "grad_norm": 0.2431640625, + "learning_rate": 1.6231557948382314e-05, + "loss": 2.7827, + "num_input_tokens_seen": 6847201280, + "step": 13060 + }, + { + "epoch": 0.6336043064724389, + "grad_norm": 0.248046875, + "learning_rate": 1.6212787356826344e-05, + "loss": 2.7837, + "num_input_tokens_seen": 6849822720, + "step": 13065 + }, + { + "epoch": 0.6338467880286855, + "grad_norm": 0.248046875, + "learning_rate": 1.6194022415503072e-05, + "loss": 2.7746, + "num_input_tokens_seen": 6852444160, + "step": 13070 + }, + { + "epoch": 0.6340892695849322, + "grad_norm": 0.2431640625, + "learning_rate": 1.6175263136478478e-05, + "loss": 2.786, + "num_input_tokens_seen": 6855065600, + "step": 13075 + }, + { + "epoch": 0.6343317511411788, + "grad_norm": 0.25, + "learning_rate": 1.61565095318149e-05, + "loss": 2.7771, + "num_input_tokens_seen": 6857687040, + "step": 13080 + }, + { + "epoch": 0.6345742326974254, + "grad_norm": 0.2490234375, + "learning_rate": 1.6137761613571012e-05, + "loss": 2.7756, + "num_input_tokens_seen": 6860308480, + "step": 13085 + }, + { + "epoch": 0.634816714253672, + "grad_norm": 0.259765625, + "learning_rate": 1.611901939380185e-05, + "loss": 2.793, + "num_input_tokens_seen": 6862929920, + "step": 13090 + }, + { + "epoch": 0.6350591958099187, + "grad_norm": 0.240234375, + "learning_rate": 1.610028288455878e-05, + "loss": 2.7822, + "num_input_tokens_seen": 6865551360, + "step": 13095 + }, + { + "epoch": 0.6353016773661654, + "grad_norm": 0.2421875, + "learning_rate": 1.6081552097889484e-05, + "loss": 2.7829, + "num_input_tokens_seen": 6868172800, + "step": 13100 + }, + { + "epoch": 0.635544158922412, + "grad_norm": 0.2373046875, + "learning_rate": 1.6062827045837993e-05, + "loss": 2.7773, + "num_input_tokens_seen": 6870794240, + "step": 13105 + }, + { + "epoch": 0.6357866404786586, + "grad_norm": 0.2431640625, + "learning_rate": 1.604410774044462e-05, + "loss": 2.7833, + "num_input_tokens_seen": 6873415680, + "step": 13110 + }, + { + "epoch": 0.6360291220349052, + "grad_norm": 0.24609375, + "learning_rate": 1.6025394193745994e-05, + "loss": 2.7855, + "num_input_tokens_seen": 6876037120, + "step": 13115 + }, + { + "epoch": 0.6362716035911519, + "grad_norm": 0.2490234375, + "learning_rate": 1.6006686417775046e-05, + "loss": 2.785, + "num_input_tokens_seen": 6878658560, + "step": 13120 + }, + { + "epoch": 0.6365140851473985, + "grad_norm": 0.2421875, + "learning_rate": 1.5987984424560994e-05, + "loss": 2.7609, + "num_input_tokens_seen": 6881280000, + "step": 13125 + }, + { + "epoch": 0.6367565667036451, + "grad_norm": 0.2421875, + "learning_rate": 1.5969288226129337e-05, + "loss": 2.7638, + "num_input_tokens_seen": 6883901440, + "step": 13130 + }, + { + "epoch": 0.6369990482598917, + "grad_norm": 0.263671875, + "learning_rate": 1.5950597834501845e-05, + "loss": 2.7693, + "num_input_tokens_seen": 6886522880, + "step": 13135 + }, + { + "epoch": 0.6372415298161384, + "grad_norm": 0.240234375, + "learning_rate": 1.593191326169657e-05, + "loss": 2.775, + "num_input_tokens_seen": 6889144320, + "step": 13140 + }, + { + "epoch": 0.637484011372385, + "grad_norm": 0.2421875, + "learning_rate": 1.5913234519727783e-05, + "loss": 2.7841, + "num_input_tokens_seen": 6891765760, + "step": 13145 + }, + { + "epoch": 0.6377264929286316, + "grad_norm": 0.23828125, + "learning_rate": 1.5894561620606053e-05, + "loss": 2.7787, + "num_input_tokens_seen": 6894387200, + "step": 13150 + }, + { + "epoch": 0.6379689744848782, + "grad_norm": 0.2412109375, + "learning_rate": 1.587589457633816e-05, + "loss": 2.785, + "num_input_tokens_seen": 6897008640, + "step": 13155 + }, + { + "epoch": 0.6382114560411248, + "grad_norm": 0.251953125, + "learning_rate": 1.5857233398927136e-05, + "loss": 2.7756, + "num_input_tokens_seen": 6899630080, + "step": 13160 + }, + { + "epoch": 0.6384539375973715, + "grad_norm": 0.2421875, + "learning_rate": 1.5838578100372236e-05, + "loss": 2.769, + "num_input_tokens_seen": 6902251520, + "step": 13165 + }, + { + "epoch": 0.6386964191536181, + "grad_norm": 0.2490234375, + "learning_rate": 1.5819928692668935e-05, + "loss": 2.7676, + "num_input_tokens_seen": 6904872960, + "step": 13170 + }, + { + "epoch": 0.6389389007098648, + "grad_norm": 0.26171875, + "learning_rate": 1.5801285187808905e-05, + "loss": 2.7651, + "num_input_tokens_seen": 6907494400, + "step": 13175 + }, + { + "epoch": 0.6391813822661114, + "grad_norm": 0.234375, + "learning_rate": 1.5782647597780054e-05, + "loss": 2.7674, + "num_input_tokens_seen": 6910115840, + "step": 13180 + }, + { + "epoch": 0.639423863822358, + "grad_norm": 0.2392578125, + "learning_rate": 1.5764015934566455e-05, + "loss": 2.7672, + "num_input_tokens_seen": 6912737280, + "step": 13185 + }, + { + "epoch": 0.6396663453786047, + "grad_norm": 0.2470703125, + "learning_rate": 1.5745390210148396e-05, + "loss": 2.7798, + "num_input_tokens_seen": 6915358720, + "step": 13190 + }, + { + "epoch": 0.6399088269348513, + "grad_norm": 0.240234375, + "learning_rate": 1.5726770436502323e-05, + "loss": 2.7751, + "num_input_tokens_seen": 6917980160, + "step": 13195 + }, + { + "epoch": 0.6401513084910979, + "grad_norm": 0.251953125, + "learning_rate": 1.5708156625600885e-05, + "loss": 2.7697, + "num_input_tokens_seen": 6920601600, + "step": 13200 + }, + { + "epoch": 0.6401513084910979, + "eval_accuracy": 0.4559745969711773, + "eval_loss": 2.741948127746582, + "eval_runtime": 5.8815, + "eval_samples_per_second": 51.008, + "eval_steps_per_second": 6.461, + "num_input_tokens_seen": 6920601600, + "step": 13200 + }, + { + "epoch": 0.6403937900473445, + "grad_norm": 0.251953125, + "learning_rate": 1.5689548789412854e-05, + "loss": 2.7679, + "num_input_tokens_seen": 6923223040, + "step": 13205 + }, + { + "epoch": 0.6406362716035912, + "grad_norm": 0.2431640625, + "learning_rate": 1.56709469399032e-05, + "loss": 2.7839, + "num_input_tokens_seen": 6925844480, + "step": 13210 + }, + { + "epoch": 0.6408787531598378, + "grad_norm": 0.240234375, + "learning_rate": 1.5652351089033028e-05, + "loss": 2.771, + "num_input_tokens_seen": 6928465920, + "step": 13215 + }, + { + "epoch": 0.6411212347160844, + "grad_norm": 0.24609375, + "learning_rate": 1.5633761248759583e-05, + "loss": 2.7905, + "num_input_tokens_seen": 6931087360, + "step": 13220 + }, + { + "epoch": 0.641363716272331, + "grad_norm": 0.2421875, + "learning_rate": 1.561517743103625e-05, + "loss": 2.7569, + "num_input_tokens_seen": 6933708800, + "step": 13225 + }, + { + "epoch": 0.6416061978285776, + "grad_norm": 0.2431640625, + "learning_rate": 1.5596599647812543e-05, + "loss": 2.7699, + "num_input_tokens_seen": 6936330240, + "step": 13230 + }, + { + "epoch": 0.6418486793848243, + "grad_norm": 0.24609375, + "learning_rate": 1.557802791103409e-05, + "loss": 2.776, + "num_input_tokens_seen": 6938951680, + "step": 13235 + }, + { + "epoch": 0.6420911609410709, + "grad_norm": 0.2412109375, + "learning_rate": 1.555946223264263e-05, + "loss": 2.7667, + "num_input_tokens_seen": 6941573120, + "step": 13240 + }, + { + "epoch": 0.6423336424973175, + "grad_norm": 0.248046875, + "learning_rate": 1.5540902624576015e-05, + "loss": 2.7752, + "num_input_tokens_seen": 6944194560, + "step": 13245 + }, + { + "epoch": 0.6425761240535641, + "grad_norm": 0.2431640625, + "learning_rate": 1.5522349098768185e-05, + "loss": 2.7761, + "num_input_tokens_seen": 6946816000, + "step": 13250 + }, + { + "epoch": 0.6428186056098109, + "grad_norm": 0.240234375, + "learning_rate": 1.5503801667149175e-05, + "loss": 2.7806, + "num_input_tokens_seen": 6949437440, + "step": 13255 + }, + { + "epoch": 0.6430610871660575, + "grad_norm": 0.25, + "learning_rate": 1.5485260341645108e-05, + "loss": 2.762, + "num_input_tokens_seen": 6952058880, + "step": 13260 + }, + { + "epoch": 0.6433035687223041, + "grad_norm": 0.244140625, + "learning_rate": 1.546672513417817e-05, + "loss": 2.7758, + "num_input_tokens_seen": 6954680320, + "step": 13265 + }, + { + "epoch": 0.6435460502785507, + "grad_norm": 0.24609375, + "learning_rate": 1.5448196056666607e-05, + "loss": 2.772, + "num_input_tokens_seen": 6957301760, + "step": 13270 + }, + { + "epoch": 0.6437885318347973, + "grad_norm": 0.2373046875, + "learning_rate": 1.5429673121024733e-05, + "loss": 2.7636, + "num_input_tokens_seen": 6959923200, + "step": 13275 + }, + { + "epoch": 0.644031013391044, + "grad_norm": 0.244140625, + "learning_rate": 1.541115633916291e-05, + "loss": 2.7764, + "num_input_tokens_seen": 6962544640, + "step": 13280 + }, + { + "epoch": 0.6442734949472906, + "grad_norm": 0.244140625, + "learning_rate": 1.5392645722987553e-05, + "loss": 2.7853, + "num_input_tokens_seen": 6965166080, + "step": 13285 + }, + { + "epoch": 0.6445159765035372, + "grad_norm": 0.25, + "learning_rate": 1.53741412844011e-05, + "loss": 2.7587, + "num_input_tokens_seen": 6967787520, + "step": 13290 + }, + { + "epoch": 0.6447584580597838, + "grad_norm": 0.2451171875, + "learning_rate": 1.535564303530203e-05, + "loss": 2.7621, + "num_input_tokens_seen": 6970408960, + "step": 13295 + }, + { + "epoch": 0.6450009396160304, + "grad_norm": 0.2490234375, + "learning_rate": 1.533715098758481e-05, + "loss": 2.7802, + "num_input_tokens_seen": 6973030400, + "step": 13300 + }, + { + "epoch": 0.6452434211722771, + "grad_norm": 0.2431640625, + "learning_rate": 1.531866515313996e-05, + "loss": 2.7959, + "num_input_tokens_seen": 6975651840, + "step": 13305 + }, + { + "epoch": 0.6454859027285237, + "grad_norm": 0.2470703125, + "learning_rate": 1.5300185543853975e-05, + "loss": 2.7798, + "num_input_tokens_seen": 6978273280, + "step": 13310 + }, + { + "epoch": 0.6457283842847703, + "grad_norm": 0.23828125, + "learning_rate": 1.5281712171609376e-05, + "loss": 2.7766, + "num_input_tokens_seen": 6980894720, + "step": 13315 + }, + { + "epoch": 0.6459708658410169, + "grad_norm": 0.251953125, + "learning_rate": 1.5263245048284645e-05, + "loss": 2.765, + "num_input_tokens_seen": 6983516160, + "step": 13320 + }, + { + "epoch": 0.6462133473972635, + "grad_norm": 0.23828125, + "learning_rate": 1.524478418575427e-05, + "loss": 2.7727, + "num_input_tokens_seen": 6986137600, + "step": 13325 + }, + { + "epoch": 0.6464558289535102, + "grad_norm": 0.2490234375, + "learning_rate": 1.5226329595888683e-05, + "loss": 2.7955, + "num_input_tokens_seen": 6988759040, + "step": 13330 + }, + { + "epoch": 0.6466983105097569, + "grad_norm": 0.244140625, + "learning_rate": 1.5207881290554307e-05, + "loss": 2.7674, + "num_input_tokens_seen": 6991380480, + "step": 13335 + }, + { + "epoch": 0.6469407920660035, + "grad_norm": 0.24609375, + "learning_rate": 1.5189439281613524e-05, + "loss": 2.7589, + "num_input_tokens_seen": 6994001920, + "step": 13340 + }, + { + "epoch": 0.6471832736222501, + "grad_norm": 0.2431640625, + "learning_rate": 1.517100358092466e-05, + "loss": 2.7761, + "num_input_tokens_seen": 6996623360, + "step": 13345 + }, + { + "epoch": 0.6474257551784968, + "grad_norm": 0.240234375, + "learning_rate": 1.515257420034198e-05, + "loss": 2.7635, + "num_input_tokens_seen": 6999244800, + "step": 13350 + }, + { + "epoch": 0.6476682367347434, + "grad_norm": 0.244140625, + "learning_rate": 1.5134151151715702e-05, + "loss": 2.7863, + "num_input_tokens_seen": 7001866240, + "step": 13355 + }, + { + "epoch": 0.64791071829099, + "grad_norm": 0.2470703125, + "learning_rate": 1.5115734446891943e-05, + "loss": 2.7819, + "num_input_tokens_seen": 7004487680, + "step": 13360 + }, + { + "epoch": 0.6481531998472366, + "grad_norm": 0.24609375, + "learning_rate": 1.5097324097712778e-05, + "loss": 2.7658, + "num_input_tokens_seen": 7007109120, + "step": 13365 + }, + { + "epoch": 0.6483956814034832, + "grad_norm": 0.248046875, + "learning_rate": 1.5078920116016165e-05, + "loss": 2.7801, + "num_input_tokens_seen": 7009730560, + "step": 13370 + }, + { + "epoch": 0.6486381629597299, + "grad_norm": 0.251953125, + "learning_rate": 1.5060522513635986e-05, + "loss": 2.7734, + "num_input_tokens_seen": 7012352000, + "step": 13375 + }, + { + "epoch": 0.6488806445159765, + "grad_norm": 0.2490234375, + "learning_rate": 1.5042131302402013e-05, + "loss": 2.7738, + "num_input_tokens_seen": 7014973440, + "step": 13380 + }, + { + "epoch": 0.6491231260722231, + "grad_norm": 0.248046875, + "learning_rate": 1.5023746494139915e-05, + "loss": 2.7794, + "num_input_tokens_seen": 7017594880, + "step": 13385 + }, + { + "epoch": 0.6493656076284697, + "grad_norm": 0.2431640625, + "learning_rate": 1.5005368100671219e-05, + "loss": 2.7745, + "num_input_tokens_seen": 7020216320, + "step": 13390 + }, + { + "epoch": 0.6496080891847164, + "grad_norm": 0.240234375, + "learning_rate": 1.4986996133813367e-05, + "loss": 2.776, + "num_input_tokens_seen": 7022837760, + "step": 13395 + }, + { + "epoch": 0.649850570740963, + "grad_norm": 0.2431640625, + "learning_rate": 1.4968630605379641e-05, + "loss": 2.7723, + "num_input_tokens_seen": 7025459200, + "step": 13400 + }, + { + "epoch": 0.6500930522972096, + "grad_norm": 0.2373046875, + "learning_rate": 1.495027152717919e-05, + "loss": 2.7722, + "num_input_tokens_seen": 7028080640, + "step": 13405 + }, + { + "epoch": 0.6503355338534563, + "grad_norm": 0.2431640625, + "learning_rate": 1.4931918911017023e-05, + "loss": 2.7961, + "num_input_tokens_seen": 7030702080, + "step": 13410 + }, + { + "epoch": 0.650578015409703, + "grad_norm": 0.25, + "learning_rate": 1.491357276869398e-05, + "loss": 2.7773, + "num_input_tokens_seen": 7033323520, + "step": 13415 + }, + { + "epoch": 0.6508204969659496, + "grad_norm": 0.2421875, + "learning_rate": 1.4895233112006749e-05, + "loss": 2.7686, + "num_input_tokens_seen": 7035944960, + "step": 13420 + }, + { + "epoch": 0.6510629785221962, + "grad_norm": 0.24609375, + "learning_rate": 1.4876899952747838e-05, + "loss": 2.7715, + "num_input_tokens_seen": 7038566400, + "step": 13425 + }, + { + "epoch": 0.6513054600784428, + "grad_norm": 0.244140625, + "learning_rate": 1.4858573302705592e-05, + "loss": 2.7805, + "num_input_tokens_seen": 7041187840, + "step": 13430 + }, + { + "epoch": 0.6515479416346894, + "grad_norm": 0.2421875, + "learning_rate": 1.4840253173664154e-05, + "loss": 2.7632, + "num_input_tokens_seen": 7043809280, + "step": 13435 + }, + { + "epoch": 0.651790423190936, + "grad_norm": 0.2412109375, + "learning_rate": 1.4821939577403483e-05, + "loss": 2.7731, + "num_input_tokens_seen": 7046430720, + "step": 13440 + }, + { + "epoch": 0.6520329047471827, + "grad_norm": 0.2451171875, + "learning_rate": 1.4803632525699338e-05, + "loss": 2.7714, + "num_input_tokens_seen": 7049052160, + "step": 13445 + }, + { + "epoch": 0.6522753863034293, + "grad_norm": 0.2392578125, + "learning_rate": 1.4785332030323273e-05, + "loss": 2.7744, + "num_input_tokens_seen": 7051673600, + "step": 13450 + }, + { + "epoch": 0.6525178678596759, + "grad_norm": 0.2421875, + "learning_rate": 1.4767038103042613e-05, + "loss": 2.7845, + "num_input_tokens_seen": 7054295040, + "step": 13455 + }, + { + "epoch": 0.6527603494159225, + "grad_norm": 0.2431640625, + "learning_rate": 1.4748750755620466e-05, + "loss": 2.7849, + "num_input_tokens_seen": 7056916480, + "step": 13460 + }, + { + "epoch": 0.6530028309721692, + "grad_norm": 0.251953125, + "learning_rate": 1.4730469999815716e-05, + "loss": 2.7664, + "num_input_tokens_seen": 7059537920, + "step": 13465 + }, + { + "epoch": 0.6532453125284158, + "grad_norm": 0.25, + "learning_rate": 1.4712195847383003e-05, + "loss": 2.7801, + "num_input_tokens_seen": 7062159360, + "step": 13470 + }, + { + "epoch": 0.6534877940846624, + "grad_norm": 0.240234375, + "learning_rate": 1.4693928310072719e-05, + "loss": 2.7584, + "num_input_tokens_seen": 7064780800, + "step": 13475 + }, + { + "epoch": 0.653730275640909, + "grad_norm": 0.244140625, + "learning_rate": 1.4675667399631012e-05, + "loss": 2.7703, + "num_input_tokens_seen": 7067402240, + "step": 13480 + }, + { + "epoch": 0.6539727571971556, + "grad_norm": 0.2431640625, + "learning_rate": 1.4657413127799752e-05, + "loss": 2.7785, + "num_input_tokens_seen": 7070023680, + "step": 13485 + }, + { + "epoch": 0.6542152387534024, + "grad_norm": 0.2431640625, + "learning_rate": 1.4639165506316554e-05, + "loss": 2.781, + "num_input_tokens_seen": 7072645120, + "step": 13490 + }, + { + "epoch": 0.654457720309649, + "grad_norm": 0.2431640625, + "learning_rate": 1.4620924546914749e-05, + "loss": 2.7606, + "num_input_tokens_seen": 7075266560, + "step": 13495 + }, + { + "epoch": 0.6547002018658956, + "grad_norm": 0.265625, + "learning_rate": 1.4602690261323399e-05, + "loss": 2.7689, + "num_input_tokens_seen": 7077888000, + "step": 13500 + }, + { + "epoch": 0.6547002018658956, + "eval_accuracy": 0.45599088096401236, + "eval_loss": 2.7419145107269287, + "eval_runtime": 5.8941, + "eval_samples_per_second": 50.898, + "eval_steps_per_second": 6.447, + "num_input_tokens_seen": 7077888000, + "step": 13500 + }, + { + "epoch": 0.6549426834221422, + "grad_norm": 0.25390625, + "learning_rate": 1.4584462661267251e-05, + "loss": 2.7809, + "num_input_tokens_seen": 7080509440, + "step": 13505 + }, + { + "epoch": 0.6551851649783889, + "grad_norm": 0.2431640625, + "learning_rate": 1.456624175846678e-05, + "loss": 2.7776, + "num_input_tokens_seen": 7083130880, + "step": 13510 + }, + { + "epoch": 0.6554276465346355, + "grad_norm": 0.2412109375, + "learning_rate": 1.4548027564638125e-05, + "loss": 2.7563, + "num_input_tokens_seen": 7085752320, + "step": 13515 + }, + { + "epoch": 0.6556701280908821, + "grad_norm": 0.2451171875, + "learning_rate": 1.4529820091493123e-05, + "loss": 2.7728, + "num_input_tokens_seen": 7088373760, + "step": 13520 + }, + { + "epoch": 0.6559126096471287, + "grad_norm": 0.2412109375, + "learning_rate": 1.4511619350739313e-05, + "loss": 2.7903, + "num_input_tokens_seen": 7090995200, + "step": 13525 + }, + { + "epoch": 0.6561550912033753, + "grad_norm": 0.2451171875, + "learning_rate": 1.4493425354079876e-05, + "loss": 2.7614, + "num_input_tokens_seen": 7093616640, + "step": 13530 + }, + { + "epoch": 0.656397572759622, + "grad_norm": 0.2431640625, + "learning_rate": 1.4475238113213662e-05, + "loss": 2.7707, + "num_input_tokens_seen": 7096238080, + "step": 13535 + }, + { + "epoch": 0.6566400543158686, + "grad_norm": 0.2421875, + "learning_rate": 1.4457057639835197e-05, + "loss": 2.7766, + "num_input_tokens_seen": 7098859520, + "step": 13540 + }, + { + "epoch": 0.6568825358721152, + "grad_norm": 0.2421875, + "learning_rate": 1.4438883945634618e-05, + "loss": 2.7718, + "num_input_tokens_seen": 7101480960, + "step": 13545 + }, + { + "epoch": 0.6571250174283618, + "grad_norm": 0.2421875, + "learning_rate": 1.4420717042297727e-05, + "loss": 2.7778, + "num_input_tokens_seen": 7104102400, + "step": 13550 + }, + { + "epoch": 0.6573674989846084, + "grad_norm": 0.2470703125, + "learning_rate": 1.4402556941505969e-05, + "loss": 2.7788, + "num_input_tokens_seen": 7106723840, + "step": 13555 + }, + { + "epoch": 0.6576099805408551, + "grad_norm": 0.2451171875, + "learning_rate": 1.4384403654936387e-05, + "loss": 2.779, + "num_input_tokens_seen": 7109345280, + "step": 13560 + }, + { + "epoch": 0.6578524620971017, + "grad_norm": 0.24609375, + "learning_rate": 1.4366257194261671e-05, + "loss": 2.7803, + "num_input_tokens_seen": 7111966720, + "step": 13565 + }, + { + "epoch": 0.6580949436533484, + "grad_norm": 0.2373046875, + "learning_rate": 1.4348117571150102e-05, + "loss": 2.7755, + "num_input_tokens_seen": 7114588160, + "step": 13570 + }, + { + "epoch": 0.658337425209595, + "grad_norm": 0.2451171875, + "learning_rate": 1.4329984797265572e-05, + "loss": 2.7782, + "num_input_tokens_seen": 7117209600, + "step": 13575 + }, + { + "epoch": 0.6585799067658417, + "grad_norm": 0.248046875, + "learning_rate": 1.431185888426757e-05, + "loss": 2.7722, + "num_input_tokens_seen": 7119831040, + "step": 13580 + }, + { + "epoch": 0.6588223883220883, + "grad_norm": 0.248046875, + "learning_rate": 1.4293739843811171e-05, + "loss": 2.7824, + "num_input_tokens_seen": 7122452480, + "step": 13585 + }, + { + "epoch": 0.6590648698783349, + "grad_norm": 0.24609375, + "learning_rate": 1.4275627687547028e-05, + "loss": 2.7731, + "num_input_tokens_seen": 7125073920, + "step": 13590 + }, + { + "epoch": 0.6593073514345815, + "grad_norm": 0.2431640625, + "learning_rate": 1.4257522427121379e-05, + "loss": 2.7752, + "num_input_tokens_seen": 7127695360, + "step": 13595 + }, + { + "epoch": 0.6595498329908281, + "grad_norm": 0.2431640625, + "learning_rate": 1.4239424074176009e-05, + "loss": 2.7782, + "num_input_tokens_seen": 7130316800, + "step": 13600 + }, + { + "epoch": 0.6597923145470748, + "grad_norm": 0.240234375, + "learning_rate": 1.422133264034829e-05, + "loss": 2.7719, + "num_input_tokens_seen": 7132938240, + "step": 13605 + }, + { + "epoch": 0.6600347961033214, + "grad_norm": 0.25, + "learning_rate": 1.4203248137271102e-05, + "loss": 2.7672, + "num_input_tokens_seen": 7135559680, + "step": 13610 + }, + { + "epoch": 0.660277277659568, + "grad_norm": 0.2412109375, + "learning_rate": 1.4185170576572907e-05, + "loss": 2.783, + "num_input_tokens_seen": 7138181120, + "step": 13615 + }, + { + "epoch": 0.6605197592158146, + "grad_norm": 0.244140625, + "learning_rate": 1.416709996987769e-05, + "loss": 2.7854, + "num_input_tokens_seen": 7140802560, + "step": 13620 + }, + { + "epoch": 0.6607622407720612, + "grad_norm": 0.248046875, + "learning_rate": 1.414903632880496e-05, + "loss": 2.7667, + "num_input_tokens_seen": 7143424000, + "step": 13625 + }, + { + "epoch": 0.6610047223283079, + "grad_norm": 0.251953125, + "learning_rate": 1.4130979664969756e-05, + "loss": 2.7828, + "num_input_tokens_seen": 7146045440, + "step": 13630 + }, + { + "epoch": 0.6612472038845545, + "grad_norm": 0.248046875, + "learning_rate": 1.4112929989982623e-05, + "loss": 2.7924, + "num_input_tokens_seen": 7148666880, + "step": 13635 + }, + { + "epoch": 0.6614896854408011, + "grad_norm": 0.251953125, + "learning_rate": 1.4094887315449617e-05, + "loss": 2.7629, + "num_input_tokens_seen": 7151288320, + "step": 13640 + }, + { + "epoch": 0.6617321669970477, + "grad_norm": 0.234375, + "learning_rate": 1.407685165297229e-05, + "loss": 2.7658, + "num_input_tokens_seen": 7153909760, + "step": 13645 + }, + { + "epoch": 0.6619746485532945, + "grad_norm": 0.2373046875, + "learning_rate": 1.4058823014147683e-05, + "loss": 2.7697, + "num_input_tokens_seen": 7156531200, + "step": 13650 + }, + { + "epoch": 0.6622171301095411, + "grad_norm": 0.2490234375, + "learning_rate": 1.4040801410568327e-05, + "loss": 2.7729, + "num_input_tokens_seen": 7159152640, + "step": 13655 + }, + { + "epoch": 0.6624596116657877, + "grad_norm": 0.2421875, + "learning_rate": 1.4022786853822224e-05, + "loss": 2.762, + "num_input_tokens_seen": 7161774080, + "step": 13660 + }, + { + "epoch": 0.6627020932220343, + "grad_norm": 0.2470703125, + "learning_rate": 1.4004779355492858e-05, + "loss": 2.7642, + "num_input_tokens_seen": 7164395520, + "step": 13665 + }, + { + "epoch": 0.662944574778281, + "grad_norm": 0.2451171875, + "learning_rate": 1.3986778927159141e-05, + "loss": 2.7636, + "num_input_tokens_seen": 7167016960, + "step": 13670 + }, + { + "epoch": 0.6631870563345276, + "grad_norm": 0.2451171875, + "learning_rate": 1.3968785580395474e-05, + "loss": 2.7879, + "num_input_tokens_seen": 7169638400, + "step": 13675 + }, + { + "epoch": 0.6634295378907742, + "grad_norm": 0.25, + "learning_rate": 1.395079932677169e-05, + "loss": 2.7893, + "num_input_tokens_seen": 7172259840, + "step": 13680 + }, + { + "epoch": 0.6636720194470208, + "grad_norm": 0.2470703125, + "learning_rate": 1.3932820177853063e-05, + "loss": 2.7727, + "num_input_tokens_seen": 7174881280, + "step": 13685 + }, + { + "epoch": 0.6639145010032674, + "grad_norm": 0.25, + "learning_rate": 1.3914848145200293e-05, + "loss": 2.7701, + "num_input_tokens_seen": 7177502720, + "step": 13690 + }, + { + "epoch": 0.664156982559514, + "grad_norm": 0.2431640625, + "learning_rate": 1.3896883240369518e-05, + "loss": 2.7756, + "num_input_tokens_seen": 7180124160, + "step": 13695 + }, + { + "epoch": 0.6643994641157607, + "grad_norm": 0.2490234375, + "learning_rate": 1.3878925474912283e-05, + "loss": 2.7799, + "num_input_tokens_seen": 7182745600, + "step": 13700 + }, + { + "epoch": 0.6646419456720073, + "grad_norm": 0.2421875, + "learning_rate": 1.3860974860375536e-05, + "loss": 2.7682, + "num_input_tokens_seen": 7185367040, + "step": 13705 + }, + { + "epoch": 0.6648844272282539, + "grad_norm": 0.2451171875, + "learning_rate": 1.3843031408301644e-05, + "loss": 2.7735, + "num_input_tokens_seen": 7187988480, + "step": 13710 + }, + { + "epoch": 0.6651269087845005, + "grad_norm": 0.240234375, + "learning_rate": 1.382509513022835e-05, + "loss": 2.7747, + "num_input_tokens_seen": 7190609920, + "step": 13715 + }, + { + "epoch": 0.6653693903407472, + "grad_norm": 0.2431640625, + "learning_rate": 1.3807166037688801e-05, + "loss": 2.7777, + "num_input_tokens_seen": 7193231360, + "step": 13720 + }, + { + "epoch": 0.6656118718969939, + "grad_norm": 0.2412109375, + "learning_rate": 1.3789244142211511e-05, + "loss": 2.772, + "num_input_tokens_seen": 7195852800, + "step": 13725 + }, + { + "epoch": 0.6658543534532405, + "grad_norm": 0.2431640625, + "learning_rate": 1.3771329455320381e-05, + "loss": 2.7602, + "num_input_tokens_seen": 7198474240, + "step": 13730 + }, + { + "epoch": 0.6660968350094871, + "grad_norm": 0.2451171875, + "learning_rate": 1.3753421988534648e-05, + "loss": 2.7839, + "num_input_tokens_seen": 7201095680, + "step": 13735 + }, + { + "epoch": 0.6663393165657338, + "grad_norm": 0.2451171875, + "learning_rate": 1.3735521753368932e-05, + "loss": 2.7655, + "num_input_tokens_seen": 7203717120, + "step": 13740 + }, + { + "epoch": 0.6665817981219804, + "grad_norm": 0.2490234375, + "learning_rate": 1.3717628761333202e-05, + "loss": 2.7761, + "num_input_tokens_seen": 7206338560, + "step": 13745 + }, + { + "epoch": 0.666824279678227, + "grad_norm": 0.23828125, + "learning_rate": 1.3699743023932751e-05, + "loss": 2.7763, + "num_input_tokens_seen": 7208960000, + "step": 13750 + }, + { + "epoch": 0.6670667612344736, + "grad_norm": 0.244140625, + "learning_rate": 1.3681864552668239e-05, + "loss": 2.7852, + "num_input_tokens_seen": 7211581440, + "step": 13755 + }, + { + "epoch": 0.6673092427907202, + "grad_norm": 0.2451171875, + "learning_rate": 1.3663993359035637e-05, + "loss": 2.7769, + "num_input_tokens_seen": 7214202880, + "step": 13760 + }, + { + "epoch": 0.6675517243469669, + "grad_norm": 0.244140625, + "learning_rate": 1.3646129454526213e-05, + "loss": 2.7822, + "num_input_tokens_seen": 7216824320, + "step": 13765 + }, + { + "epoch": 0.6677942059032135, + "grad_norm": 0.2421875, + "learning_rate": 1.3628272850626577e-05, + "loss": 2.7662, + "num_input_tokens_seen": 7219445760, + "step": 13770 + }, + { + "epoch": 0.6680366874594601, + "grad_norm": 0.23828125, + "learning_rate": 1.361042355881864e-05, + "loss": 2.781, + "num_input_tokens_seen": 7222067200, + "step": 13775 + }, + { + "epoch": 0.6682791690157067, + "grad_norm": 0.240234375, + "learning_rate": 1.3592581590579608e-05, + "loss": 2.776, + "num_input_tokens_seen": 7224688640, + "step": 13780 + }, + { + "epoch": 0.6685216505719533, + "grad_norm": 0.2412109375, + "learning_rate": 1.3574746957381979e-05, + "loss": 2.7814, + "num_input_tokens_seen": 7227310080, + "step": 13785 + }, + { + "epoch": 0.6687641321282, + "grad_norm": 0.2392578125, + "learning_rate": 1.3556919670693541e-05, + "loss": 2.7791, + "num_input_tokens_seen": 7229931520, + "step": 13790 + }, + { + "epoch": 0.6690066136844466, + "grad_norm": 0.248046875, + "learning_rate": 1.3539099741977334e-05, + "loss": 2.7707, + "num_input_tokens_seen": 7232552960, + "step": 13795 + }, + { + "epoch": 0.6692490952406932, + "grad_norm": 0.2412109375, + "learning_rate": 1.3521287182691695e-05, + "loss": 2.7747, + "num_input_tokens_seen": 7235174400, + "step": 13800 + }, + { + "epoch": 0.6692490952406932, + "eval_accuracy": 0.45594365738479076, + "eval_loss": 2.741860866546631, + "eval_runtime": 5.8127, + "eval_samples_per_second": 51.611, + "eval_steps_per_second": 6.537, + "num_input_tokens_seen": 7235174400, + "step": 13800 + }, + { + "epoch": 0.6694915767969399, + "grad_norm": 0.248046875, + "learning_rate": 1.3503482004290194e-05, + "loss": 2.7805, + "num_input_tokens_seen": 7237795840, + "step": 13805 + }, + { + "epoch": 0.6697340583531866, + "grad_norm": 0.2421875, + "learning_rate": 1.3485684218221694e-05, + "loss": 2.7735, + "num_input_tokens_seen": 7240417280, + "step": 13810 + }, + { + "epoch": 0.6699765399094332, + "grad_norm": 0.2431640625, + "learning_rate": 1.3467893835930281e-05, + "loss": 2.7828, + "num_input_tokens_seen": 7243038720, + "step": 13815 + }, + { + "epoch": 0.6702190214656798, + "grad_norm": 0.25390625, + "learning_rate": 1.3450110868855283e-05, + "loss": 2.7697, + "num_input_tokens_seen": 7245660160, + "step": 13820 + }, + { + "epoch": 0.6704615030219264, + "grad_norm": 0.2412109375, + "learning_rate": 1.3432335328431244e-05, + "loss": 2.7886, + "num_input_tokens_seen": 7248281600, + "step": 13825 + }, + { + "epoch": 0.670703984578173, + "grad_norm": 0.2431640625, + "learning_rate": 1.3414567226087954e-05, + "loss": 2.7661, + "num_input_tokens_seen": 7250903040, + "step": 13830 + }, + { + "epoch": 0.6709464661344197, + "grad_norm": 0.2470703125, + "learning_rate": 1.3396806573250418e-05, + "loss": 2.7697, + "num_input_tokens_seen": 7253524480, + "step": 13835 + }, + { + "epoch": 0.6711889476906663, + "grad_norm": 0.259765625, + "learning_rate": 1.337905338133884e-05, + "loss": 2.7628, + "num_input_tokens_seen": 7256145920, + "step": 13840 + }, + { + "epoch": 0.6714314292469129, + "grad_norm": 0.23828125, + "learning_rate": 1.3361307661768647e-05, + "loss": 2.7721, + "num_input_tokens_seen": 7258767360, + "step": 13845 + }, + { + "epoch": 0.6716739108031595, + "grad_norm": 0.2470703125, + "learning_rate": 1.3343569425950442e-05, + "loss": 2.7688, + "num_input_tokens_seen": 7261388800, + "step": 13850 + }, + { + "epoch": 0.6719163923594061, + "grad_norm": 0.248046875, + "learning_rate": 1.3325838685289998e-05, + "loss": 2.7698, + "num_input_tokens_seen": 7264010240, + "step": 13855 + }, + { + "epoch": 0.6721588739156528, + "grad_norm": 0.2392578125, + "learning_rate": 1.3308115451188327e-05, + "loss": 2.7731, + "num_input_tokens_seen": 7266631680, + "step": 13860 + }, + { + "epoch": 0.6724013554718994, + "grad_norm": 0.23828125, + "learning_rate": 1.3290399735041564e-05, + "loss": 2.7781, + "num_input_tokens_seen": 7269253120, + "step": 13865 + }, + { + "epoch": 0.672643837028146, + "grad_norm": 0.2412109375, + "learning_rate": 1.3272691548241023e-05, + "loss": 2.7817, + "num_input_tokens_seen": 7271874560, + "step": 13870 + }, + { + "epoch": 0.6728863185843926, + "grad_norm": 0.2431640625, + "learning_rate": 1.3254990902173186e-05, + "loss": 2.7869, + "num_input_tokens_seen": 7274496000, + "step": 13875 + }, + { + "epoch": 0.6731288001406392, + "grad_norm": 0.23828125, + "learning_rate": 1.3237297808219676e-05, + "loss": 2.7643, + "num_input_tokens_seen": 7277117440, + "step": 13880 + }, + { + "epoch": 0.673371281696886, + "grad_norm": 0.236328125, + "learning_rate": 1.3219612277757271e-05, + "loss": 2.7731, + "num_input_tokens_seen": 7279738880, + "step": 13885 + }, + { + "epoch": 0.6736137632531326, + "grad_norm": 0.2412109375, + "learning_rate": 1.3201934322157861e-05, + "loss": 2.7812, + "num_input_tokens_seen": 7282360320, + "step": 13890 + }, + { + "epoch": 0.6738562448093792, + "grad_norm": 0.2373046875, + "learning_rate": 1.318426395278849e-05, + "loss": 2.7731, + "num_input_tokens_seen": 7284981760, + "step": 13895 + }, + { + "epoch": 0.6740987263656258, + "grad_norm": 0.244140625, + "learning_rate": 1.3166601181011312e-05, + "loss": 2.7771, + "num_input_tokens_seen": 7287603200, + "step": 13900 + }, + { + "epoch": 0.6743412079218725, + "grad_norm": 0.2431640625, + "learning_rate": 1.3148946018183612e-05, + "loss": 2.7676, + "num_input_tokens_seen": 7290224640, + "step": 13905 + }, + { + "epoch": 0.6745836894781191, + "grad_norm": 0.244140625, + "learning_rate": 1.3131298475657755e-05, + "loss": 2.7656, + "num_input_tokens_seen": 7292846080, + "step": 13910 + }, + { + "epoch": 0.6748261710343657, + "grad_norm": 0.2421875, + "learning_rate": 1.3113658564781233e-05, + "loss": 2.77, + "num_input_tokens_seen": 7295467520, + "step": 13915 + }, + { + "epoch": 0.6750686525906123, + "grad_norm": 0.23828125, + "learning_rate": 1.3096026296896612e-05, + "loss": 2.7662, + "num_input_tokens_seen": 7298088960, + "step": 13920 + }, + { + "epoch": 0.675311134146859, + "grad_norm": 0.2490234375, + "learning_rate": 1.3078401683341554e-05, + "loss": 2.7743, + "num_input_tokens_seen": 7300710400, + "step": 13925 + }, + { + "epoch": 0.6755536157031056, + "grad_norm": 0.25, + "learning_rate": 1.3060784735448794e-05, + "loss": 2.7882, + "num_input_tokens_seen": 7303331840, + "step": 13930 + }, + { + "epoch": 0.6757960972593522, + "grad_norm": 0.2451171875, + "learning_rate": 1.3043175464546142e-05, + "loss": 2.7775, + "num_input_tokens_seen": 7305953280, + "step": 13935 + }, + { + "epoch": 0.6760385788155988, + "grad_norm": 0.2470703125, + "learning_rate": 1.302557388195647e-05, + "loss": 2.7761, + "num_input_tokens_seen": 7308574720, + "step": 13940 + }, + { + "epoch": 0.6762810603718454, + "grad_norm": 0.24609375, + "learning_rate": 1.3007979998997711e-05, + "loss": 2.7898, + "num_input_tokens_seen": 7311196160, + "step": 13945 + }, + { + "epoch": 0.676523541928092, + "grad_norm": 0.240234375, + "learning_rate": 1.2990393826982828e-05, + "loss": 2.769, + "num_input_tokens_seen": 7313817600, + "step": 13950 + }, + { + "epoch": 0.6767660234843387, + "grad_norm": 0.240234375, + "learning_rate": 1.2972815377219843e-05, + "loss": 2.771, + "num_input_tokens_seen": 7316439040, + "step": 13955 + }, + { + "epoch": 0.6770085050405853, + "grad_norm": 0.2412109375, + "learning_rate": 1.2955244661011811e-05, + "loss": 2.7778, + "num_input_tokens_seen": 7319060480, + "step": 13960 + }, + { + "epoch": 0.677250986596832, + "grad_norm": 0.2421875, + "learning_rate": 1.2937681689656817e-05, + "loss": 2.7733, + "num_input_tokens_seen": 7321681920, + "step": 13965 + }, + { + "epoch": 0.6774934681530786, + "grad_norm": 0.2412109375, + "learning_rate": 1.2920126474447958e-05, + "loss": 2.7849, + "num_input_tokens_seen": 7324303360, + "step": 13970 + }, + { + "epoch": 0.6777359497093253, + "grad_norm": 0.2431640625, + "learning_rate": 1.2902579026673345e-05, + "loss": 2.7883, + "num_input_tokens_seen": 7326924800, + "step": 13975 + }, + { + "epoch": 0.6779784312655719, + "grad_norm": 0.2431640625, + "learning_rate": 1.2885039357616102e-05, + "loss": 2.7775, + "num_input_tokens_seen": 7329546240, + "step": 13980 + }, + { + "epoch": 0.6782209128218185, + "grad_norm": 0.2412109375, + "learning_rate": 1.2867507478554341e-05, + "loss": 2.7754, + "num_input_tokens_seen": 7332167680, + "step": 13985 + }, + { + "epoch": 0.6784633943780651, + "grad_norm": 0.248046875, + "learning_rate": 1.2849983400761173e-05, + "loss": 2.7754, + "num_input_tokens_seen": 7334789120, + "step": 13990 + }, + { + "epoch": 0.6787058759343118, + "grad_norm": 0.2421875, + "learning_rate": 1.283246713550469e-05, + "loss": 2.7898, + "num_input_tokens_seen": 7337410560, + "step": 13995 + }, + { + "epoch": 0.6789483574905584, + "grad_norm": 0.244140625, + "learning_rate": 1.2814958694047955e-05, + "loss": 2.7864, + "num_input_tokens_seen": 7340032000, + "step": 14000 + }, + { + "epoch": 0.679190839046805, + "grad_norm": 0.2490234375, + "learning_rate": 1.2797458087649022e-05, + "loss": 2.7635, + "num_input_tokens_seen": 7342653440, + "step": 14005 + }, + { + "epoch": 0.6794333206030516, + "grad_norm": 0.2392578125, + "learning_rate": 1.2779965327560867e-05, + "loss": 2.7807, + "num_input_tokens_seen": 7345274880, + "step": 14010 + }, + { + "epoch": 0.6796758021592982, + "grad_norm": 0.2431640625, + "learning_rate": 1.2762480425031454e-05, + "loss": 2.7861, + "num_input_tokens_seen": 7347896320, + "step": 14015 + }, + { + "epoch": 0.6799182837155449, + "grad_norm": 0.244140625, + "learning_rate": 1.2745003391303684e-05, + "loss": 2.787, + "num_input_tokens_seen": 7350517760, + "step": 14020 + }, + { + "epoch": 0.6801607652717915, + "grad_norm": 0.244140625, + "learning_rate": 1.2727534237615404e-05, + "loss": 2.7763, + "num_input_tokens_seen": 7353139200, + "step": 14025 + }, + { + "epoch": 0.6804032468280381, + "grad_norm": 0.2490234375, + "learning_rate": 1.2710072975199383e-05, + "loss": 2.7791, + "num_input_tokens_seen": 7355760640, + "step": 14030 + }, + { + "epoch": 0.6806457283842847, + "grad_norm": 0.2412109375, + "learning_rate": 1.2692619615283318e-05, + "loss": 2.767, + "num_input_tokens_seen": 7358382080, + "step": 14035 + }, + { + "epoch": 0.6808882099405315, + "grad_norm": 0.2421875, + "learning_rate": 1.2675174169089854e-05, + "loss": 2.762, + "num_input_tokens_seen": 7361003520, + "step": 14040 + }, + { + "epoch": 0.6811306914967781, + "grad_norm": 0.2431640625, + "learning_rate": 1.2657736647836491e-05, + "loss": 2.7783, + "num_input_tokens_seen": 7363624960, + "step": 14045 + }, + { + "epoch": 0.6813731730530247, + "grad_norm": 0.240234375, + "learning_rate": 1.2640307062735679e-05, + "loss": 2.7651, + "num_input_tokens_seen": 7366246400, + "step": 14050 + }, + { + "epoch": 0.6816156546092713, + "grad_norm": 0.2421875, + "learning_rate": 1.2622885424994746e-05, + "loss": 2.7771, + "num_input_tokens_seen": 7368867840, + "step": 14055 + }, + { + "epoch": 0.6818581361655179, + "grad_norm": 0.2373046875, + "learning_rate": 1.2605471745815917e-05, + "loss": 2.7671, + "num_input_tokens_seen": 7371489280, + "step": 14060 + }, + { + "epoch": 0.6821006177217646, + "grad_norm": 0.2421875, + "learning_rate": 1.2588066036396292e-05, + "loss": 2.7893, + "num_input_tokens_seen": 7374110720, + "step": 14065 + }, + { + "epoch": 0.6823430992780112, + "grad_norm": 0.240234375, + "learning_rate": 1.2570668307927868e-05, + "loss": 2.7845, + "num_input_tokens_seen": 7376732160, + "step": 14070 + }, + { + "epoch": 0.6825855808342578, + "grad_norm": 0.2470703125, + "learning_rate": 1.2553278571597467e-05, + "loss": 2.772, + "num_input_tokens_seen": 7379353600, + "step": 14075 + }, + { + "epoch": 0.6828280623905044, + "grad_norm": 0.255859375, + "learning_rate": 1.2535896838586813e-05, + "loss": 2.7774, + "num_input_tokens_seen": 7381975040, + "step": 14080 + }, + { + "epoch": 0.683070543946751, + "grad_norm": 0.244140625, + "learning_rate": 1.2518523120072467e-05, + "loss": 2.7781, + "num_input_tokens_seen": 7384596480, + "step": 14085 + }, + { + "epoch": 0.6833130255029977, + "grad_norm": 0.2373046875, + "learning_rate": 1.250115742722583e-05, + "loss": 2.7711, + "num_input_tokens_seen": 7387217920, + "step": 14090 + }, + { + "epoch": 0.6835555070592443, + "grad_norm": 0.2421875, + "learning_rate": 1.2483799771213168e-05, + "loss": 2.7704, + "num_input_tokens_seen": 7389839360, + "step": 14095 + }, + { + "epoch": 0.6837979886154909, + "grad_norm": 0.236328125, + "learning_rate": 1.2466450163195564e-05, + "loss": 2.786, + "num_input_tokens_seen": 7392460800, + "step": 14100 + }, + { + "epoch": 0.6837979886154909, + "eval_accuracy": 0.4560804429246051, + "eval_loss": 2.7417774200439453, + "eval_runtime": 5.8578, + "eval_samples_per_second": 51.213, + "eval_steps_per_second": 6.487, + "num_input_tokens_seen": 7392460800, + "step": 14100 + }, + { + "epoch": 0.6840404701717375, + "grad_norm": 0.2451171875, + "learning_rate": 1.2449108614328905e-05, + "loss": 2.7715, + "num_input_tokens_seen": 7395082240, + "step": 14105 + }, + { + "epoch": 0.6842829517279841, + "grad_norm": 0.2431640625, + "learning_rate": 1.2431775135763927e-05, + "loss": 2.7642, + "num_input_tokens_seen": 7397703680, + "step": 14110 + }, + { + "epoch": 0.6845254332842308, + "grad_norm": 0.2392578125, + "learning_rate": 1.241444973864616e-05, + "loss": 2.7718, + "num_input_tokens_seen": 7400325120, + "step": 14115 + }, + { + "epoch": 0.6847679148404775, + "grad_norm": 0.2412109375, + "learning_rate": 1.2397132434115952e-05, + "loss": 2.7672, + "num_input_tokens_seen": 7402946560, + "step": 14120 + }, + { + "epoch": 0.6850103963967241, + "grad_norm": 0.24609375, + "learning_rate": 1.2379823233308426e-05, + "loss": 2.7775, + "num_input_tokens_seen": 7405568000, + "step": 14125 + }, + { + "epoch": 0.6852528779529707, + "grad_norm": 0.23828125, + "learning_rate": 1.2362522147353525e-05, + "loss": 2.7756, + "num_input_tokens_seen": 7408189440, + "step": 14130 + }, + { + "epoch": 0.6854953595092174, + "grad_norm": 0.2470703125, + "learning_rate": 1.2345229187375934e-05, + "loss": 2.7837, + "num_input_tokens_seen": 7410810880, + "step": 14135 + }, + { + "epoch": 0.685737841065464, + "grad_norm": 0.2421875, + "learning_rate": 1.2327944364495133e-05, + "loss": 2.7667, + "num_input_tokens_seen": 7413432320, + "step": 14140 + }, + { + "epoch": 0.6859803226217106, + "grad_norm": 0.2412109375, + "learning_rate": 1.2310667689825393e-05, + "loss": 2.7667, + "num_input_tokens_seen": 7416053760, + "step": 14145 + }, + { + "epoch": 0.6862228041779572, + "grad_norm": 0.2470703125, + "learning_rate": 1.229339917447571e-05, + "loss": 2.7726, + "num_input_tokens_seen": 7418675200, + "step": 14150 + }, + { + "epoch": 0.6864652857342038, + "grad_norm": 0.2392578125, + "learning_rate": 1.2276138829549852e-05, + "loss": 2.781, + "num_input_tokens_seen": 7421296640, + "step": 14155 + }, + { + "epoch": 0.6867077672904505, + "grad_norm": 0.244140625, + "learning_rate": 1.2258886666146336e-05, + "loss": 2.7706, + "num_input_tokens_seen": 7423918080, + "step": 14160 + }, + { + "epoch": 0.6869502488466971, + "grad_norm": 0.24609375, + "learning_rate": 1.2241642695358391e-05, + "loss": 2.7736, + "num_input_tokens_seen": 7426539520, + "step": 14165 + }, + { + "epoch": 0.6871927304029437, + "grad_norm": 0.248046875, + "learning_rate": 1.2224406928274013e-05, + "loss": 2.7664, + "num_input_tokens_seen": 7429160960, + "step": 14170 + }, + { + "epoch": 0.6874352119591903, + "grad_norm": 0.251953125, + "learning_rate": 1.2207179375975899e-05, + "loss": 2.7743, + "num_input_tokens_seen": 7431782400, + "step": 14175 + }, + { + "epoch": 0.687677693515437, + "grad_norm": 0.24609375, + "learning_rate": 1.2189960049541482e-05, + "loss": 2.7699, + "num_input_tokens_seen": 7434403840, + "step": 14180 + }, + { + "epoch": 0.6879201750716836, + "grad_norm": 0.2490234375, + "learning_rate": 1.217274896004289e-05, + "loss": 2.7784, + "num_input_tokens_seen": 7437025280, + "step": 14185 + }, + { + "epoch": 0.6881626566279302, + "grad_norm": 0.2451171875, + "learning_rate": 1.2155546118546965e-05, + "loss": 2.7813, + "num_input_tokens_seen": 7439646720, + "step": 14190 + }, + { + "epoch": 0.6884051381841768, + "grad_norm": 0.244140625, + "learning_rate": 1.2138351536115238e-05, + "loss": 2.7777, + "num_input_tokens_seen": 7442268160, + "step": 14195 + }, + { + "epoch": 0.6886476197404235, + "grad_norm": 0.2578125, + "learning_rate": 1.212116522380394e-05, + "loss": 2.7671, + "num_input_tokens_seen": 7444889600, + "step": 14200 + }, + { + "epoch": 0.6888901012966702, + "grad_norm": 0.2421875, + "learning_rate": 1.210398719266397e-05, + "loss": 2.7755, + "num_input_tokens_seen": 7447511040, + "step": 14205 + }, + { + "epoch": 0.6891325828529168, + "grad_norm": 0.248046875, + "learning_rate": 1.2086817453740914e-05, + "loss": 2.7646, + "num_input_tokens_seen": 7450132480, + "step": 14210 + }, + { + "epoch": 0.6893750644091634, + "grad_norm": 0.240234375, + "learning_rate": 1.2069656018075018e-05, + "loss": 2.7783, + "num_input_tokens_seen": 7452753920, + "step": 14215 + }, + { + "epoch": 0.68961754596541, + "grad_norm": 0.2470703125, + "learning_rate": 1.2052502896701195e-05, + "loss": 2.7845, + "num_input_tokens_seen": 7455375360, + "step": 14220 + }, + { + "epoch": 0.6898600275216566, + "grad_norm": 0.2470703125, + "learning_rate": 1.2035358100649019e-05, + "loss": 2.7704, + "num_input_tokens_seen": 7457996800, + "step": 14225 + }, + { + "epoch": 0.6901025090779033, + "grad_norm": 0.236328125, + "learning_rate": 1.2018221640942681e-05, + "loss": 2.7633, + "num_input_tokens_seen": 7460618240, + "step": 14230 + }, + { + "epoch": 0.6903449906341499, + "grad_norm": 0.24609375, + "learning_rate": 1.2001093528601043e-05, + "loss": 2.7743, + "num_input_tokens_seen": 7463239680, + "step": 14235 + }, + { + "epoch": 0.6905874721903965, + "grad_norm": 0.2412109375, + "learning_rate": 1.1983973774637585e-05, + "loss": 2.7795, + "num_input_tokens_seen": 7465861120, + "step": 14240 + }, + { + "epoch": 0.6908299537466431, + "grad_norm": 0.244140625, + "learning_rate": 1.196686239006042e-05, + "loss": 2.7712, + "num_input_tokens_seen": 7468482560, + "step": 14245 + }, + { + "epoch": 0.6910724353028898, + "grad_norm": 0.2421875, + "learning_rate": 1.1949759385872273e-05, + "loss": 2.7846, + "num_input_tokens_seen": 7471104000, + "step": 14250 + }, + { + "epoch": 0.6913149168591364, + "grad_norm": 0.24609375, + "learning_rate": 1.1932664773070481e-05, + "loss": 2.7697, + "num_input_tokens_seen": 7473725440, + "step": 14255 + }, + { + "epoch": 0.691557398415383, + "grad_norm": 0.248046875, + "learning_rate": 1.1915578562646992e-05, + "loss": 2.7537, + "num_input_tokens_seen": 7476346880, + "step": 14260 + }, + { + "epoch": 0.6917998799716296, + "grad_norm": 0.2431640625, + "learning_rate": 1.1898500765588342e-05, + "loss": 2.7691, + "num_input_tokens_seen": 7478968320, + "step": 14265 + }, + { + "epoch": 0.6920423615278762, + "grad_norm": 0.2431640625, + "learning_rate": 1.188143139287566e-05, + "loss": 2.762, + "num_input_tokens_seen": 7481589760, + "step": 14270 + }, + { + "epoch": 0.6922848430841229, + "grad_norm": 0.244140625, + "learning_rate": 1.1864370455484663e-05, + "loss": 2.7646, + "num_input_tokens_seen": 7484211200, + "step": 14275 + }, + { + "epoch": 0.6925273246403696, + "grad_norm": 0.244140625, + "learning_rate": 1.1847317964385643e-05, + "loss": 2.777, + "num_input_tokens_seen": 7486832640, + "step": 14280 + }, + { + "epoch": 0.6927698061966162, + "grad_norm": 0.2421875, + "learning_rate": 1.1830273930543462e-05, + "loss": 2.7653, + "num_input_tokens_seen": 7489454080, + "step": 14285 + }, + { + "epoch": 0.6930122877528628, + "grad_norm": 0.2421875, + "learning_rate": 1.1813238364917523e-05, + "loss": 2.7679, + "num_input_tokens_seen": 7492075520, + "step": 14290 + }, + { + "epoch": 0.6932547693091095, + "grad_norm": 0.2451171875, + "learning_rate": 1.1796211278461811e-05, + "loss": 2.7715, + "num_input_tokens_seen": 7494696960, + "step": 14295 + }, + { + "epoch": 0.6934972508653561, + "grad_norm": 0.2421875, + "learning_rate": 1.177919268212485e-05, + "loss": 2.7786, + "num_input_tokens_seen": 7497318400, + "step": 14300 + }, + { + "epoch": 0.6937397324216027, + "grad_norm": 0.240234375, + "learning_rate": 1.1762182586849708e-05, + "loss": 2.7742, + "num_input_tokens_seen": 7499939840, + "step": 14305 + }, + { + "epoch": 0.6939822139778493, + "grad_norm": 0.248046875, + "learning_rate": 1.1745181003573971e-05, + "loss": 2.7611, + "num_input_tokens_seen": 7502561280, + "step": 14310 + }, + { + "epoch": 0.6942246955340959, + "grad_norm": 0.25, + "learning_rate": 1.1728187943229776e-05, + "loss": 2.7864, + "num_input_tokens_seen": 7505182720, + "step": 14315 + }, + { + "epoch": 0.6944671770903426, + "grad_norm": 0.25390625, + "learning_rate": 1.171120341674376e-05, + "loss": 2.7645, + "num_input_tokens_seen": 7507804160, + "step": 14320 + }, + { + "epoch": 0.6947096586465892, + "grad_norm": 0.23828125, + "learning_rate": 1.169422743503708e-05, + "loss": 2.7846, + "num_input_tokens_seen": 7510425600, + "step": 14325 + }, + { + "epoch": 0.6949521402028358, + "grad_norm": 0.23828125, + "learning_rate": 1.1677260009025403e-05, + "loss": 2.7721, + "num_input_tokens_seen": 7513047040, + "step": 14330 + }, + { + "epoch": 0.6951946217590824, + "grad_norm": 0.2470703125, + "learning_rate": 1.1660301149618885e-05, + "loss": 2.7866, + "num_input_tokens_seen": 7515668480, + "step": 14335 + }, + { + "epoch": 0.695437103315329, + "grad_norm": 0.2392578125, + "learning_rate": 1.1643350867722184e-05, + "loss": 2.775, + "num_input_tokens_seen": 7518289920, + "step": 14340 + }, + { + "epoch": 0.6956795848715757, + "grad_norm": 0.2431640625, + "learning_rate": 1.1626409174234432e-05, + "loss": 2.7889, + "num_input_tokens_seen": 7520911360, + "step": 14345 + }, + { + "epoch": 0.6959220664278223, + "grad_norm": 0.2451171875, + "learning_rate": 1.1609476080049252e-05, + "loss": 2.7905, + "num_input_tokens_seen": 7523532800, + "step": 14350 + }, + { + "epoch": 0.696164547984069, + "grad_norm": 0.2421875, + "learning_rate": 1.1592551596054717e-05, + "loss": 2.7599, + "num_input_tokens_seen": 7526154240, + "step": 14355 + }, + { + "epoch": 0.6964070295403156, + "grad_norm": 0.2431640625, + "learning_rate": 1.1575635733133383e-05, + "loss": 2.7664, + "num_input_tokens_seen": 7528775680, + "step": 14360 + }, + { + "epoch": 0.6966495110965623, + "grad_norm": 0.25, + "learning_rate": 1.1558728502162256e-05, + "loss": 2.783, + "num_input_tokens_seen": 7531397120, + "step": 14365 + }, + { + "epoch": 0.6968919926528089, + "grad_norm": 0.2392578125, + "learning_rate": 1.1541829914012789e-05, + "loss": 2.7766, + "num_input_tokens_seen": 7534018560, + "step": 14370 + }, + { + "epoch": 0.6971344742090555, + "grad_norm": 0.244140625, + "learning_rate": 1.1524939979550873e-05, + "loss": 2.7805, + "num_input_tokens_seen": 7536640000, + "step": 14375 + }, + { + "epoch": 0.6973769557653021, + "grad_norm": 0.24609375, + "learning_rate": 1.1508058709636869e-05, + "loss": 2.7912, + "num_input_tokens_seen": 7539261440, + "step": 14380 + }, + { + "epoch": 0.6976194373215487, + "grad_norm": 0.2431640625, + "learning_rate": 1.149118611512551e-05, + "loss": 2.7833, + "num_input_tokens_seen": 7541882880, + "step": 14385 + }, + { + "epoch": 0.6978619188777954, + "grad_norm": 0.2431640625, + "learning_rate": 1.147432220686599e-05, + "loss": 2.7638, + "num_input_tokens_seen": 7544504320, + "step": 14390 + }, + { + "epoch": 0.698104400434042, + "grad_norm": 0.23828125, + "learning_rate": 1.1457466995701907e-05, + "loss": 2.7693, + "num_input_tokens_seen": 7547125760, + "step": 14395 + }, + { + "epoch": 0.6983468819902886, + "grad_norm": 0.2470703125, + "learning_rate": 1.144062049247127e-05, + "loss": 2.7801, + "num_input_tokens_seen": 7549747200, + "step": 14400 + }, + { + "epoch": 0.6983468819902886, + "eval_accuracy": 0.4559990229604299, + "eval_loss": 2.7416889667510986, + "eval_runtime": 5.8935, + "eval_samples_per_second": 50.903, + "eval_steps_per_second": 6.448, + "num_input_tokens_seen": 7549747200, + "step": 14400 + }, + { + "epoch": 0.6985893635465352, + "grad_norm": 0.2431640625, + "learning_rate": 1.1423782708006478e-05, + "loss": 2.7737, + "num_input_tokens_seen": 7552368640, + "step": 14405 + }, + { + "epoch": 0.6988318451027818, + "grad_norm": 0.2470703125, + "learning_rate": 1.140695365313435e-05, + "loss": 2.7775, + "num_input_tokens_seen": 7554990080, + "step": 14410 + }, + { + "epoch": 0.6990743266590285, + "grad_norm": 0.2412109375, + "learning_rate": 1.1390133338676054e-05, + "loss": 2.7722, + "num_input_tokens_seen": 7557611520, + "step": 14415 + }, + { + "epoch": 0.6993168082152751, + "grad_norm": 0.2412109375, + "learning_rate": 1.137332177544716e-05, + "loss": 2.7895, + "num_input_tokens_seen": 7560232960, + "step": 14420 + }, + { + "epoch": 0.6995592897715217, + "grad_norm": 0.24609375, + "learning_rate": 1.1356518974257607e-05, + "loss": 2.7665, + "num_input_tokens_seen": 7562854400, + "step": 14425 + }, + { + "epoch": 0.6998017713277683, + "grad_norm": 0.240234375, + "learning_rate": 1.1339724945911714e-05, + "loss": 2.7744, + "num_input_tokens_seen": 7565475840, + "step": 14430 + }, + { + "epoch": 0.7000442528840151, + "grad_norm": 0.2392578125, + "learning_rate": 1.1322939701208141e-05, + "loss": 2.7759, + "num_input_tokens_seen": 7568097280, + "step": 14435 + }, + { + "epoch": 0.7002867344402617, + "grad_norm": 0.2431640625, + "learning_rate": 1.1306163250939913e-05, + "loss": 2.771, + "num_input_tokens_seen": 7570718720, + "step": 14440 + }, + { + "epoch": 0.7005292159965083, + "grad_norm": 0.2431640625, + "learning_rate": 1.1289395605894374e-05, + "loss": 2.7742, + "num_input_tokens_seen": 7573340160, + "step": 14445 + }, + { + "epoch": 0.7007716975527549, + "grad_norm": 0.2490234375, + "learning_rate": 1.1272636776853231e-05, + "loss": 2.7796, + "num_input_tokens_seen": 7575961600, + "step": 14450 + }, + { + "epoch": 0.7010141791090015, + "grad_norm": 0.236328125, + "learning_rate": 1.125588677459252e-05, + "loss": 2.7699, + "num_input_tokens_seen": 7578583040, + "step": 14455 + }, + { + "epoch": 0.7012566606652482, + "grad_norm": 0.244140625, + "learning_rate": 1.1239145609882596e-05, + "loss": 2.7712, + "num_input_tokens_seen": 7581204480, + "step": 14460 + }, + { + "epoch": 0.7014991422214948, + "grad_norm": 0.2431640625, + "learning_rate": 1.1222413293488134e-05, + "loss": 2.7802, + "num_input_tokens_seen": 7583825920, + "step": 14465 + }, + { + "epoch": 0.7017416237777414, + "grad_norm": 0.2578125, + "learning_rate": 1.1205689836168123e-05, + "loss": 2.7592, + "num_input_tokens_seen": 7586447360, + "step": 14470 + }, + { + "epoch": 0.701984105333988, + "grad_norm": 0.25, + "learning_rate": 1.1188975248675837e-05, + "loss": 2.7796, + "num_input_tokens_seen": 7589068800, + "step": 14475 + }, + { + "epoch": 0.7022265868902346, + "grad_norm": 0.240234375, + "learning_rate": 1.117226954175886e-05, + "loss": 2.7748, + "num_input_tokens_seen": 7591690240, + "step": 14480 + }, + { + "epoch": 0.7024690684464813, + "grad_norm": 0.2373046875, + "learning_rate": 1.115557272615908e-05, + "loss": 2.7782, + "num_input_tokens_seen": 7594311680, + "step": 14485 + }, + { + "epoch": 0.7027115500027279, + "grad_norm": 0.244140625, + "learning_rate": 1.113888481261265e-05, + "loss": 2.776, + "num_input_tokens_seen": 7596933120, + "step": 14490 + }, + { + "epoch": 0.7029540315589745, + "grad_norm": 0.234375, + "learning_rate": 1.1122205811850001e-05, + "loss": 2.7818, + "num_input_tokens_seen": 7599554560, + "step": 14495 + }, + { + "epoch": 0.7031965131152211, + "grad_norm": 0.23828125, + "learning_rate": 1.1105535734595832e-05, + "loss": 2.7658, + "num_input_tokens_seen": 7602176000, + "step": 14500 + }, + { + "epoch": 0.7034389946714678, + "grad_norm": 0.251953125, + "learning_rate": 1.1088874591569119e-05, + "loss": 2.7687, + "num_input_tokens_seen": 7604797440, + "step": 14505 + }, + { + "epoch": 0.7036814762277144, + "grad_norm": 0.26171875, + "learning_rate": 1.1072222393483061e-05, + "loss": 2.7809, + "num_input_tokens_seen": 7607418880, + "step": 14510 + }, + { + "epoch": 0.7039239577839611, + "grad_norm": 0.2431640625, + "learning_rate": 1.1055579151045137e-05, + "loss": 2.7769, + "num_input_tokens_seen": 7610040320, + "step": 14515 + }, + { + "epoch": 0.7041664393402077, + "grad_norm": 0.248046875, + "learning_rate": 1.1038944874957058e-05, + "loss": 2.7791, + "num_input_tokens_seen": 7612661760, + "step": 14520 + }, + { + "epoch": 0.7044089208964543, + "grad_norm": 0.2451171875, + "learning_rate": 1.102231957591476e-05, + "loss": 2.7721, + "num_input_tokens_seen": 7615283200, + "step": 14525 + }, + { + "epoch": 0.704651402452701, + "grad_norm": 0.2431640625, + "learning_rate": 1.1005703264608422e-05, + "loss": 2.7704, + "num_input_tokens_seen": 7617904640, + "step": 14530 + }, + { + "epoch": 0.7048938840089476, + "grad_norm": 0.2353515625, + "learning_rate": 1.098909595172243e-05, + "loss": 2.7726, + "num_input_tokens_seen": 7620526080, + "step": 14535 + }, + { + "epoch": 0.7051363655651942, + "grad_norm": 0.2412109375, + "learning_rate": 1.0972497647935395e-05, + "loss": 2.7839, + "num_input_tokens_seen": 7623147520, + "step": 14540 + }, + { + "epoch": 0.7053788471214408, + "grad_norm": 0.2451171875, + "learning_rate": 1.0955908363920128e-05, + "loss": 2.7762, + "num_input_tokens_seen": 7625768960, + "step": 14545 + }, + { + "epoch": 0.7056213286776875, + "grad_norm": 0.25, + "learning_rate": 1.0939328110343645e-05, + "loss": 2.7703, + "num_input_tokens_seen": 7628390400, + "step": 14550 + }, + { + "epoch": 0.7058638102339341, + "grad_norm": 0.240234375, + "learning_rate": 1.0922756897867148e-05, + "loss": 2.7857, + "num_input_tokens_seen": 7631011840, + "step": 14555 + }, + { + "epoch": 0.7061062917901807, + "grad_norm": 0.2431640625, + "learning_rate": 1.090619473714603e-05, + "loss": 2.772, + "num_input_tokens_seen": 7633633280, + "step": 14560 + }, + { + "epoch": 0.7063487733464273, + "grad_norm": 0.2412109375, + "learning_rate": 1.0889641638829881e-05, + "loss": 2.7785, + "num_input_tokens_seen": 7636254720, + "step": 14565 + }, + { + "epoch": 0.7065912549026739, + "grad_norm": 0.2490234375, + "learning_rate": 1.0873097613562421e-05, + "loss": 2.7744, + "num_input_tokens_seen": 7638876160, + "step": 14570 + }, + { + "epoch": 0.7068337364589206, + "grad_norm": 0.24609375, + "learning_rate": 1.0856562671981574e-05, + "loss": 2.7844, + "num_input_tokens_seen": 7641497600, + "step": 14575 + }, + { + "epoch": 0.7070762180151672, + "grad_norm": 0.240234375, + "learning_rate": 1.0840036824719407e-05, + "loss": 2.7699, + "num_input_tokens_seen": 7644119040, + "step": 14580 + }, + { + "epoch": 0.7073186995714138, + "grad_norm": 0.2431640625, + "learning_rate": 1.082352008240215e-05, + "loss": 2.7642, + "num_input_tokens_seen": 7646740480, + "step": 14585 + }, + { + "epoch": 0.7075611811276605, + "grad_norm": 0.25, + "learning_rate": 1.0807012455650164e-05, + "loss": 2.779, + "num_input_tokens_seen": 7649361920, + "step": 14590 + }, + { + "epoch": 0.7078036626839072, + "grad_norm": 0.2431640625, + "learning_rate": 1.0790513955077963e-05, + "loss": 2.7698, + "num_input_tokens_seen": 7651983360, + "step": 14595 + }, + { + "epoch": 0.7080461442401538, + "grad_norm": 0.244140625, + "learning_rate": 1.0774024591294184e-05, + "loss": 2.7717, + "num_input_tokens_seen": 7654604800, + "step": 14600 + }, + { + "epoch": 0.7082886257964004, + "grad_norm": 0.251953125, + "learning_rate": 1.075754437490159e-05, + "loss": 2.7837, + "num_input_tokens_seen": 7657226240, + "step": 14605 + }, + { + "epoch": 0.708531107352647, + "grad_norm": 0.2421875, + "learning_rate": 1.074107331649706e-05, + "loss": 2.7815, + "num_input_tokens_seen": 7659847680, + "step": 14610 + }, + { + "epoch": 0.7087735889088936, + "grad_norm": 0.240234375, + "learning_rate": 1.0724611426671596e-05, + "loss": 2.7753, + "num_input_tokens_seen": 7662469120, + "step": 14615 + }, + { + "epoch": 0.7090160704651403, + "grad_norm": 0.2470703125, + "learning_rate": 1.0708158716010288e-05, + "loss": 2.7691, + "num_input_tokens_seen": 7665090560, + "step": 14620 + }, + { + "epoch": 0.7092585520213869, + "grad_norm": 0.24609375, + "learning_rate": 1.0691715195092348e-05, + "loss": 2.7713, + "num_input_tokens_seen": 7667712000, + "step": 14625 + }, + { + "epoch": 0.7095010335776335, + "grad_norm": 0.2412109375, + "learning_rate": 1.0675280874491036e-05, + "loss": 2.7836, + "num_input_tokens_seen": 7670333440, + "step": 14630 + }, + { + "epoch": 0.7097435151338801, + "grad_norm": 0.2412109375, + "learning_rate": 1.065885576477374e-05, + "loss": 2.7788, + "num_input_tokens_seen": 7672954880, + "step": 14635 + }, + { + "epoch": 0.7099859966901267, + "grad_norm": 0.2431640625, + "learning_rate": 1.0642439876501903e-05, + "loss": 2.7919, + "num_input_tokens_seen": 7675576320, + "step": 14640 + }, + { + "epoch": 0.7102284782463734, + "grad_norm": 0.2412109375, + "learning_rate": 1.062603322023105e-05, + "loss": 2.7742, + "num_input_tokens_seen": 7678197760, + "step": 14645 + }, + { + "epoch": 0.71047095980262, + "grad_norm": 0.248046875, + "learning_rate": 1.0609635806510757e-05, + "loss": 2.7755, + "num_input_tokens_seen": 7680819200, + "step": 14650 + }, + { + "epoch": 0.7107134413588666, + "grad_norm": 0.2431640625, + "learning_rate": 1.0593247645884666e-05, + "loss": 2.7858, + "num_input_tokens_seen": 7683440640, + "step": 14655 + }, + { + "epoch": 0.7109559229151132, + "grad_norm": 0.2451171875, + "learning_rate": 1.0576868748890468e-05, + "loss": 2.7576, + "num_input_tokens_seen": 7686062080, + "step": 14660 + }, + { + "epoch": 0.7111984044713598, + "grad_norm": 0.23828125, + "learning_rate": 1.0560499126059894e-05, + "loss": 2.7726, + "num_input_tokens_seen": 7688683520, + "step": 14665 + }, + { + "epoch": 0.7114408860276066, + "grad_norm": 0.2412109375, + "learning_rate": 1.0544138787918716e-05, + "loss": 2.781, + "num_input_tokens_seen": 7691304960, + "step": 14670 + }, + { + "epoch": 0.7116833675838532, + "grad_norm": 0.2333984375, + "learning_rate": 1.0527787744986733e-05, + "loss": 2.7759, + "num_input_tokens_seen": 7693926400, + "step": 14675 + }, + { + "epoch": 0.7119258491400998, + "grad_norm": 0.248046875, + "learning_rate": 1.051144600777777e-05, + "loss": 2.7842, + "num_input_tokens_seen": 7696547840, + "step": 14680 + }, + { + "epoch": 0.7121683306963464, + "grad_norm": 0.2421875, + "learning_rate": 1.0495113586799663e-05, + "loss": 2.7737, + "num_input_tokens_seen": 7699169280, + "step": 14685 + }, + { + "epoch": 0.7124108122525931, + "grad_norm": 0.2373046875, + "learning_rate": 1.047879049255427e-05, + "loss": 2.7812, + "num_input_tokens_seen": 7701790720, + "step": 14690 + }, + { + "epoch": 0.7126532938088397, + "grad_norm": 0.244140625, + "learning_rate": 1.0462476735537427e-05, + "loss": 2.7582, + "num_input_tokens_seen": 7704412160, + "step": 14695 + }, + { + "epoch": 0.7128957753650863, + "grad_norm": 0.240234375, + "learning_rate": 1.0446172326238987e-05, + "loss": 2.7658, + "num_input_tokens_seen": 7707033600, + "step": 14700 + }, + { + "epoch": 0.7128957753650863, + "eval_accuracy": 0.45605764533463605, + "eval_loss": 2.7416930198669434, + "eval_runtime": 5.8258, + "eval_samples_per_second": 51.495, + "eval_steps_per_second": 6.523, + "num_input_tokens_seen": 7707033600, + "step": 14700 + }, + { + "epoch": 0.7131382569213329, + "grad_norm": 0.2431640625, + "learning_rate": 1.0429877275142793e-05, + "loss": 2.7716, + "num_input_tokens_seen": 7709655040, + "step": 14705 + }, + { + "epoch": 0.7133807384775795, + "grad_norm": 0.25, + "learning_rate": 1.041359159272666e-05, + "loss": 2.7776, + "num_input_tokens_seen": 7712276480, + "step": 14710 + }, + { + "epoch": 0.7136232200338262, + "grad_norm": 0.2451171875, + "learning_rate": 1.0397315289462379e-05, + "loss": 2.7718, + "num_input_tokens_seen": 7714897920, + "step": 14715 + }, + { + "epoch": 0.7138657015900728, + "grad_norm": 0.2490234375, + "learning_rate": 1.0381048375815736e-05, + "loss": 2.7821, + "num_input_tokens_seen": 7717519360, + "step": 14720 + }, + { + "epoch": 0.7141081831463194, + "grad_norm": 0.240234375, + "learning_rate": 1.0364790862246435e-05, + "loss": 2.7742, + "num_input_tokens_seen": 7720140800, + "step": 14725 + }, + { + "epoch": 0.714350664702566, + "grad_norm": 0.2451171875, + "learning_rate": 1.0348542759208166e-05, + "loss": 2.7596, + "num_input_tokens_seen": 7722762240, + "step": 14730 + }, + { + "epoch": 0.7145931462588127, + "grad_norm": 0.2421875, + "learning_rate": 1.0332304077148564e-05, + "loss": 2.7709, + "num_input_tokens_seen": 7725383680, + "step": 14735 + }, + { + "epoch": 0.7148356278150593, + "grad_norm": 0.2421875, + "learning_rate": 1.03160748265092e-05, + "loss": 2.7716, + "num_input_tokens_seen": 7728005120, + "step": 14740 + }, + { + "epoch": 0.7150781093713059, + "grad_norm": 0.2431640625, + "learning_rate": 1.0299855017725585e-05, + "loss": 2.7775, + "num_input_tokens_seen": 7730626560, + "step": 14745 + }, + { + "epoch": 0.7153205909275526, + "grad_norm": 0.244140625, + "learning_rate": 1.0283644661227168e-05, + "loss": 2.7817, + "num_input_tokens_seen": 7733248000, + "step": 14750 + }, + { + "epoch": 0.7155630724837992, + "grad_norm": 0.244140625, + "learning_rate": 1.026744376743729e-05, + "loss": 2.7767, + "num_input_tokens_seen": 7735869440, + "step": 14755 + }, + { + "epoch": 0.7158055540400459, + "grad_norm": 0.2412109375, + "learning_rate": 1.0251252346773235e-05, + "loss": 2.7886, + "num_input_tokens_seen": 7738490880, + "step": 14760 + }, + { + "epoch": 0.7160480355962925, + "grad_norm": 0.2373046875, + "learning_rate": 1.023507040964618e-05, + "loss": 2.7864, + "num_input_tokens_seen": 7741112320, + "step": 14765 + }, + { + "epoch": 0.7162905171525391, + "grad_norm": 0.2490234375, + "learning_rate": 1.021889796646123e-05, + "loss": 2.7857, + "num_input_tokens_seen": 7743733760, + "step": 14770 + }, + { + "epoch": 0.7165329987087857, + "grad_norm": 0.2490234375, + "learning_rate": 1.020273502761736e-05, + "loss": 2.7647, + "num_input_tokens_seen": 7746355200, + "step": 14775 + }, + { + "epoch": 0.7167754802650323, + "grad_norm": 0.2451171875, + "learning_rate": 1.0186581603507444e-05, + "loss": 2.7694, + "num_input_tokens_seen": 7748976640, + "step": 14780 + }, + { + "epoch": 0.717017961821279, + "grad_norm": 0.236328125, + "learning_rate": 1.0170437704518224e-05, + "loss": 2.7795, + "num_input_tokens_seen": 7751598080, + "step": 14785 + }, + { + "epoch": 0.7172604433775256, + "grad_norm": 0.2451171875, + "learning_rate": 1.0154303341030334e-05, + "loss": 2.779, + "num_input_tokens_seen": 7754219520, + "step": 14790 + }, + { + "epoch": 0.7175029249337722, + "grad_norm": 0.2431640625, + "learning_rate": 1.0138178523418266e-05, + "loss": 2.775, + "num_input_tokens_seen": 7756840960, + "step": 14795 + }, + { + "epoch": 0.7177454064900188, + "grad_norm": 0.2451171875, + "learning_rate": 1.0122063262050386e-05, + "loss": 2.7736, + "num_input_tokens_seen": 7759462400, + "step": 14800 + }, + { + "epoch": 0.7179878880462655, + "grad_norm": 0.2470703125, + "learning_rate": 1.0105957567288904e-05, + "loss": 2.7799, + "num_input_tokens_seen": 7762083840, + "step": 14805 + }, + { + "epoch": 0.7182303696025121, + "grad_norm": 0.24609375, + "learning_rate": 1.0089861449489881e-05, + "loss": 2.7856, + "num_input_tokens_seen": 7764705280, + "step": 14810 + }, + { + "epoch": 0.7184728511587587, + "grad_norm": 0.244140625, + "learning_rate": 1.0073774919003235e-05, + "loss": 2.776, + "num_input_tokens_seen": 7767326720, + "step": 14815 + }, + { + "epoch": 0.7187153327150053, + "grad_norm": 0.24609375, + "learning_rate": 1.0057697986172677e-05, + "loss": 2.7797, + "num_input_tokens_seen": 7769948160, + "step": 14820 + }, + { + "epoch": 0.7189578142712519, + "grad_norm": 0.2431640625, + "learning_rate": 1.00416306613358e-05, + "loss": 2.7675, + "num_input_tokens_seen": 7772569600, + "step": 14825 + }, + { + "epoch": 0.7192002958274987, + "grad_norm": 0.244140625, + "learning_rate": 1.002557295482399e-05, + "loss": 2.7745, + "num_input_tokens_seen": 7775191040, + "step": 14830 + }, + { + "epoch": 0.7194427773837453, + "grad_norm": 0.2412109375, + "learning_rate": 1.0009524876962445e-05, + "loss": 2.7749, + "num_input_tokens_seen": 7777812480, + "step": 14835 + }, + { + "epoch": 0.7196852589399919, + "grad_norm": 0.2490234375, + "learning_rate": 9.993486438070187e-06, + "loss": 2.7682, + "num_input_tokens_seen": 7780433920, + "step": 14840 + }, + { + "epoch": 0.7199277404962385, + "grad_norm": 0.240234375, + "learning_rate": 9.97745764846004e-06, + "loss": 2.783, + "num_input_tokens_seen": 7783055360, + "step": 14845 + }, + { + "epoch": 0.7201702220524852, + "grad_norm": 0.2490234375, + "learning_rate": 9.961438518438596e-06, + "loss": 2.765, + "num_input_tokens_seen": 7785676800, + "step": 14850 + }, + { + "epoch": 0.7204127036087318, + "grad_norm": 0.2392578125, + "learning_rate": 9.945429058306262e-06, + "loss": 2.7655, + "num_input_tokens_seen": 7788298240, + "step": 14855 + }, + { + "epoch": 0.7206551851649784, + "grad_norm": 0.2412109375, + "learning_rate": 9.92942927835723e-06, + "loss": 2.7745, + "num_input_tokens_seen": 7790919680, + "step": 14860 + }, + { + "epoch": 0.720897666721225, + "grad_norm": 0.2373046875, + "learning_rate": 9.91343918887945e-06, + "loss": 2.7714, + "num_input_tokens_seen": 7793541120, + "step": 14865 + }, + { + "epoch": 0.7211401482774716, + "grad_norm": 0.25390625, + "learning_rate": 9.897458800154654e-06, + "loss": 2.7894, + "num_input_tokens_seen": 7796162560, + "step": 14870 + }, + { + "epoch": 0.7213826298337183, + "grad_norm": 0.23828125, + "learning_rate": 9.881488122458332e-06, + "loss": 2.7655, + "num_input_tokens_seen": 7798784000, + "step": 14875 + }, + { + "epoch": 0.7216251113899649, + "grad_norm": 0.2412109375, + "learning_rate": 9.865527166059726e-06, + "loss": 2.7617, + "num_input_tokens_seen": 7801405440, + "step": 14880 + }, + { + "epoch": 0.7218675929462115, + "grad_norm": 0.2431640625, + "learning_rate": 9.849575941221837e-06, + "loss": 2.7722, + "num_input_tokens_seen": 7804026880, + "step": 14885 + }, + { + "epoch": 0.7221100745024581, + "grad_norm": 0.2373046875, + "learning_rate": 9.833634458201397e-06, + "loss": 2.7608, + "num_input_tokens_seen": 7806648320, + "step": 14890 + }, + { + "epoch": 0.7223525560587047, + "grad_norm": 0.2421875, + "learning_rate": 9.817702727248878e-06, + "loss": 2.7713, + "num_input_tokens_seen": 7809269760, + "step": 14895 + }, + { + "epoch": 0.7225950376149514, + "grad_norm": 0.244140625, + "learning_rate": 9.80178075860849e-06, + "loss": 2.7885, + "num_input_tokens_seen": 7811891200, + "step": 14900 + }, + { + "epoch": 0.7228375191711981, + "grad_norm": 0.2421875, + "learning_rate": 9.785868562518161e-06, + "loss": 2.771, + "num_input_tokens_seen": 7814512640, + "step": 14905 + }, + { + "epoch": 0.7230800007274447, + "grad_norm": 0.2421875, + "learning_rate": 9.769966149209517e-06, + "loss": 2.7874, + "num_input_tokens_seen": 7817134080, + "step": 14910 + }, + { + "epoch": 0.7233224822836913, + "grad_norm": 0.2392578125, + "learning_rate": 9.754073528907918e-06, + "loss": 2.771, + "num_input_tokens_seen": 7819755520, + "step": 14915 + }, + { + "epoch": 0.723564963839938, + "grad_norm": 0.251953125, + "learning_rate": 9.738190711832415e-06, + "loss": 2.7903, + "num_input_tokens_seen": 7822376960, + "step": 14920 + }, + { + "epoch": 0.7238074453961846, + "grad_norm": 0.24609375, + "learning_rate": 9.722317708195767e-06, + "loss": 2.7652, + "num_input_tokens_seen": 7824998400, + "step": 14925 + }, + { + "epoch": 0.7240499269524312, + "grad_norm": 0.2431640625, + "learning_rate": 9.706454528204406e-06, + "loss": 2.7664, + "num_input_tokens_seen": 7827619840, + "step": 14930 + }, + { + "epoch": 0.7242924085086778, + "grad_norm": 0.240234375, + "learning_rate": 9.690601182058462e-06, + "loss": 2.7806, + "num_input_tokens_seen": 7830241280, + "step": 14935 + }, + { + "epoch": 0.7245348900649244, + "grad_norm": 0.2490234375, + "learning_rate": 9.674757679951733e-06, + "loss": 2.78, + "num_input_tokens_seen": 7832862720, + "step": 14940 + }, + { + "epoch": 0.7247773716211711, + "grad_norm": 0.2431640625, + "learning_rate": 9.65892403207169e-06, + "loss": 2.7743, + "num_input_tokens_seen": 7835484160, + "step": 14945 + }, + { + "epoch": 0.7250198531774177, + "grad_norm": 0.236328125, + "learning_rate": 9.643100248599465e-06, + "loss": 2.773, + "num_input_tokens_seen": 7838105600, + "step": 14950 + }, + { + "epoch": 0.7252623347336643, + "grad_norm": 0.2421875, + "learning_rate": 9.627286339709857e-06, + "loss": 2.792, + "num_input_tokens_seen": 7840727040, + "step": 14955 + }, + { + "epoch": 0.7255048162899109, + "grad_norm": 0.2392578125, + "learning_rate": 9.611482315571301e-06, + "loss": 2.7856, + "num_input_tokens_seen": 7843348480, + "step": 14960 + }, + { + "epoch": 0.7257472978461575, + "grad_norm": 0.248046875, + "learning_rate": 9.595688186345889e-06, + "loss": 2.787, + "num_input_tokens_seen": 7845969920, + "step": 14965 + }, + { + "epoch": 0.7259897794024042, + "grad_norm": 0.2373046875, + "learning_rate": 9.57990396218935e-06, + "loss": 2.7743, + "num_input_tokens_seen": 7848591360, + "step": 14970 + }, + { + "epoch": 0.7262322609586508, + "grad_norm": 0.24609375, + "learning_rate": 9.564129653251023e-06, + "loss": 2.7873, + "num_input_tokens_seen": 7851212800, + "step": 14975 + }, + { + "epoch": 0.7264747425148974, + "grad_norm": 0.25390625, + "learning_rate": 9.548365269673892e-06, + "loss": 2.7697, + "num_input_tokens_seen": 7853834240, + "step": 14980 + }, + { + "epoch": 0.7267172240711441, + "grad_norm": 0.2490234375, + "learning_rate": 9.532610821594562e-06, + "loss": 2.7779, + "num_input_tokens_seen": 7856455680, + "step": 14985 + }, + { + "epoch": 0.7269597056273908, + "grad_norm": 0.2490234375, + "learning_rate": 9.516866319143236e-06, + "loss": 2.779, + "num_input_tokens_seen": 7859077120, + "step": 14990 + }, + { + "epoch": 0.7272021871836374, + "grad_norm": 0.2412109375, + "learning_rate": 9.50113177244373e-06, + "loss": 2.7757, + "num_input_tokens_seen": 7861698560, + "step": 14995 + }, + { + "epoch": 0.727444668739884, + "grad_norm": 0.24609375, + "learning_rate": 9.485407191613455e-06, + "loss": 2.7717, + "num_input_tokens_seen": 7864320000, + "step": 15000 + }, + { + "epoch": 0.727444668739884, + "eval_accuracy": 0.45601693535254845, + "eval_loss": 2.7416605949401855, + "eval_runtime": 5.8717, + "eval_samples_per_second": 51.092, + "eval_steps_per_second": 6.472, + "num_input_tokens_seen": 7864320000, + "step": 15000 + }, + { + "epoch": 0.7276871502961306, + "grad_norm": 0.2421875, + "learning_rate": 9.469692586763412e-06, + "loss": 2.7943, + "num_input_tokens_seen": 7866941440, + "step": 15005 + }, + { + "epoch": 0.7279296318523772, + "grad_norm": 0.2421875, + "learning_rate": 9.453987967998196e-06, + "loss": 2.7836, + "num_input_tokens_seen": 7869562880, + "step": 15010 + }, + { + "epoch": 0.7281721134086239, + "grad_norm": 0.240234375, + "learning_rate": 9.438293345415972e-06, + "loss": 2.7705, + "num_input_tokens_seen": 7872184320, + "step": 15015 + }, + { + "epoch": 0.7284145949648705, + "grad_norm": 0.2470703125, + "learning_rate": 9.422608729108482e-06, + "loss": 2.7605, + "num_input_tokens_seen": 7874805760, + "step": 15020 + }, + { + "epoch": 0.7286570765211171, + "grad_norm": 0.2421875, + "learning_rate": 9.406934129161026e-06, + "loss": 2.7745, + "num_input_tokens_seen": 7877427200, + "step": 15025 + }, + { + "epoch": 0.7288995580773637, + "grad_norm": 0.240234375, + "learning_rate": 9.39126955565249e-06, + "loss": 2.7718, + "num_input_tokens_seen": 7880048640, + "step": 15030 + }, + { + "epoch": 0.7291420396336104, + "grad_norm": 0.2392578125, + "learning_rate": 9.375615018655265e-06, + "loss": 2.7636, + "num_input_tokens_seen": 7882670080, + "step": 15035 + }, + { + "epoch": 0.729384521189857, + "grad_norm": 0.2373046875, + "learning_rate": 9.35997052823533e-06, + "loss": 2.7707, + "num_input_tokens_seen": 7885291520, + "step": 15040 + }, + { + "epoch": 0.7296270027461036, + "grad_norm": 0.2470703125, + "learning_rate": 9.344336094452188e-06, + "loss": 2.7737, + "num_input_tokens_seen": 7887912960, + "step": 15045 + }, + { + "epoch": 0.7298694843023502, + "grad_norm": 0.24609375, + "learning_rate": 9.328711727358872e-06, + "loss": 2.7785, + "num_input_tokens_seen": 7890534400, + "step": 15050 + }, + { + "epoch": 0.7301119658585968, + "grad_norm": 0.244140625, + "learning_rate": 9.313097437001964e-06, + "loss": 2.7749, + "num_input_tokens_seen": 7893155840, + "step": 15055 + }, + { + "epoch": 0.7303544474148435, + "grad_norm": 0.2451171875, + "learning_rate": 9.297493233421548e-06, + "loss": 2.7579, + "num_input_tokens_seen": 7895777280, + "step": 15060 + }, + { + "epoch": 0.7305969289710902, + "grad_norm": 0.2412109375, + "learning_rate": 9.281899126651208e-06, + "loss": 2.7679, + "num_input_tokens_seen": 7898398720, + "step": 15065 + }, + { + "epoch": 0.7308394105273368, + "grad_norm": 0.2373046875, + "learning_rate": 9.266315126718064e-06, + "loss": 2.7735, + "num_input_tokens_seen": 7901020160, + "step": 15070 + }, + { + "epoch": 0.7310818920835834, + "grad_norm": 0.2392578125, + "learning_rate": 9.250741243642724e-06, + "loss": 2.7689, + "num_input_tokens_seen": 7903641600, + "step": 15075 + }, + { + "epoch": 0.73132437363983, + "grad_norm": 0.2451171875, + "learning_rate": 9.235177487439289e-06, + "loss": 2.7721, + "num_input_tokens_seen": 7906263040, + "step": 15080 + }, + { + "epoch": 0.7315668551960767, + "grad_norm": 0.251953125, + "learning_rate": 9.219623868115354e-06, + "loss": 2.768, + "num_input_tokens_seen": 7908884480, + "step": 15085 + }, + { + "epoch": 0.7318093367523233, + "grad_norm": 0.240234375, + "learning_rate": 9.204080395672004e-06, + "loss": 2.7745, + "num_input_tokens_seen": 7911505920, + "step": 15090 + }, + { + "epoch": 0.7320518183085699, + "grad_norm": 0.2421875, + "learning_rate": 9.188547080103767e-06, + "loss": 2.7722, + "num_input_tokens_seen": 7914127360, + "step": 15095 + }, + { + "epoch": 0.7322942998648165, + "grad_norm": 0.2412109375, + "learning_rate": 9.173023931398677e-06, + "loss": 2.7603, + "num_input_tokens_seen": 7916748800, + "step": 15100 + }, + { + "epoch": 0.7325367814210632, + "grad_norm": 0.25, + "learning_rate": 9.157510959538204e-06, + "loss": 2.7757, + "num_input_tokens_seen": 7919370240, + "step": 15105 + }, + { + "epoch": 0.7327792629773098, + "grad_norm": 0.2431640625, + "learning_rate": 9.142008174497302e-06, + "loss": 2.7881, + "num_input_tokens_seen": 7921991680, + "step": 15110 + }, + { + "epoch": 0.7330217445335564, + "grad_norm": 0.2421875, + "learning_rate": 9.126515586244353e-06, + "loss": 2.7824, + "num_input_tokens_seen": 7924613120, + "step": 15115 + }, + { + "epoch": 0.733264226089803, + "grad_norm": 0.2412109375, + "learning_rate": 9.111033204741182e-06, + "loss": 2.7632, + "num_input_tokens_seen": 7927234560, + "step": 15120 + }, + { + "epoch": 0.7335067076460496, + "grad_norm": 0.2470703125, + "learning_rate": 9.095561039943073e-06, + "loss": 2.7759, + "num_input_tokens_seen": 7929856000, + "step": 15125 + }, + { + "epoch": 0.7337491892022963, + "grad_norm": 0.240234375, + "learning_rate": 9.080099101798703e-06, + "loss": 2.7831, + "num_input_tokens_seen": 7932477440, + "step": 15130 + }, + { + "epoch": 0.7339916707585429, + "grad_norm": 0.25390625, + "learning_rate": 9.06464740025021e-06, + "loss": 2.7837, + "num_input_tokens_seen": 7935098880, + "step": 15135 + }, + { + "epoch": 0.7342341523147895, + "grad_norm": 0.2421875, + "learning_rate": 9.049205945233124e-06, + "loss": 2.7663, + "num_input_tokens_seen": 7937720320, + "step": 15140 + }, + { + "epoch": 0.7344766338710362, + "grad_norm": 0.2431640625, + "learning_rate": 9.033774746676404e-06, + "loss": 2.7727, + "num_input_tokens_seen": 7940341760, + "step": 15145 + }, + { + "epoch": 0.7347191154272829, + "grad_norm": 0.2392578125, + "learning_rate": 9.01835381450241e-06, + "loss": 2.778, + "num_input_tokens_seen": 7942963200, + "step": 15150 + }, + { + "epoch": 0.7349615969835295, + "grad_norm": 0.248046875, + "learning_rate": 9.0029431586269e-06, + "loss": 2.7732, + "num_input_tokens_seen": 7945584640, + "step": 15155 + }, + { + "epoch": 0.7352040785397761, + "grad_norm": 0.25390625, + "learning_rate": 8.987542788959e-06, + "loss": 2.7764, + "num_input_tokens_seen": 7948206080, + "step": 15160 + }, + { + "epoch": 0.7354465600960227, + "grad_norm": 0.234375, + "learning_rate": 8.97215271540127e-06, + "loss": 2.7871, + "num_input_tokens_seen": 7950827520, + "step": 15165 + }, + { + "epoch": 0.7356890416522693, + "grad_norm": 0.23828125, + "learning_rate": 8.956772947849613e-06, + "loss": 2.7671, + "num_input_tokens_seen": 7953448960, + "step": 15170 + }, + { + "epoch": 0.735931523208516, + "grad_norm": 0.240234375, + "learning_rate": 8.941403496193315e-06, + "loss": 2.7794, + "num_input_tokens_seen": 7956070400, + "step": 15175 + }, + { + "epoch": 0.7361740047647626, + "grad_norm": 0.2421875, + "learning_rate": 8.926044370315035e-06, + "loss": 2.7523, + "num_input_tokens_seen": 7958691840, + "step": 15180 + }, + { + "epoch": 0.7364164863210092, + "grad_norm": 0.255859375, + "learning_rate": 8.910695580090789e-06, + "loss": 2.7807, + "num_input_tokens_seen": 7961313280, + "step": 15185 + }, + { + "epoch": 0.7366589678772558, + "grad_norm": 0.24609375, + "learning_rate": 8.895357135389931e-06, + "loss": 2.783, + "num_input_tokens_seen": 7963934720, + "step": 15190 + }, + { + "epoch": 0.7369014494335024, + "grad_norm": 0.2421875, + "learning_rate": 8.880029046075186e-06, + "loss": 2.7765, + "num_input_tokens_seen": 7966556160, + "step": 15195 + }, + { + "epoch": 0.7371439309897491, + "grad_norm": 0.23828125, + "learning_rate": 8.86471132200261e-06, + "loss": 2.7609, + "num_input_tokens_seen": 7969177600, + "step": 15200 + }, + { + "epoch": 0.7373864125459957, + "grad_norm": 0.2431640625, + "learning_rate": 8.8494039730216e-06, + "loss": 2.7844, + "num_input_tokens_seen": 7971799040, + "step": 15205 + }, + { + "epoch": 0.7376288941022423, + "grad_norm": 0.240234375, + "learning_rate": 8.834107008974871e-06, + "loss": 2.7716, + "num_input_tokens_seen": 7974420480, + "step": 15210 + }, + { + "epoch": 0.7378713756584889, + "grad_norm": 0.23828125, + "learning_rate": 8.818820439698466e-06, + "loss": 2.7762, + "num_input_tokens_seen": 7977041920, + "step": 15215 + }, + { + "epoch": 0.7381138572147357, + "grad_norm": 0.24609375, + "learning_rate": 8.803544275021749e-06, + "loss": 2.7816, + "num_input_tokens_seen": 7979663360, + "step": 15220 + }, + { + "epoch": 0.7383563387709823, + "grad_norm": 0.244140625, + "learning_rate": 8.78827852476739e-06, + "loss": 2.7946, + "num_input_tokens_seen": 7982284800, + "step": 15225 + }, + { + "epoch": 0.7385988203272289, + "grad_norm": 0.2431640625, + "learning_rate": 8.77302319875136e-06, + "loss": 2.7671, + "num_input_tokens_seen": 7984906240, + "step": 15230 + }, + { + "epoch": 0.7388413018834755, + "grad_norm": 0.2392578125, + "learning_rate": 8.757778306782927e-06, + "loss": 2.7829, + "num_input_tokens_seen": 7987527680, + "step": 15235 + }, + { + "epoch": 0.7390837834397221, + "grad_norm": 0.2490234375, + "learning_rate": 8.742543858664653e-06, + "loss": 2.7741, + "num_input_tokens_seen": 7990149120, + "step": 15240 + }, + { + "epoch": 0.7393262649959688, + "grad_norm": 0.25, + "learning_rate": 8.727319864192394e-06, + "loss": 2.7885, + "num_input_tokens_seen": 7992770560, + "step": 15245 + }, + { + "epoch": 0.7395687465522154, + "grad_norm": 0.236328125, + "learning_rate": 8.712106333155257e-06, + "loss": 2.7774, + "num_input_tokens_seen": 7995392000, + "step": 15250 + }, + { + "epoch": 0.739811228108462, + "grad_norm": 0.2421875, + "learning_rate": 8.696903275335642e-06, + "loss": 2.7666, + "num_input_tokens_seen": 7998013440, + "step": 15255 + }, + { + "epoch": 0.7400537096647086, + "grad_norm": 0.2412109375, + "learning_rate": 8.681710700509213e-06, + "loss": 2.7855, + "num_input_tokens_seen": 8000634880, + "step": 15260 + }, + { + "epoch": 0.7402961912209552, + "grad_norm": 0.24609375, + "learning_rate": 8.666528618444892e-06, + "loss": 2.7845, + "num_input_tokens_seen": 8003256320, + "step": 15265 + }, + { + "epoch": 0.7405386727772019, + "grad_norm": 0.2431640625, + "learning_rate": 8.65135703890485e-06, + "loss": 2.7693, + "num_input_tokens_seen": 8005877760, + "step": 15270 + }, + { + "epoch": 0.7407811543334485, + "grad_norm": 0.24609375, + "learning_rate": 8.636195971644511e-06, + "loss": 2.7681, + "num_input_tokens_seen": 8008499200, + "step": 15275 + }, + { + "epoch": 0.7410236358896951, + "grad_norm": 0.25, + "learning_rate": 8.621045426412533e-06, + "loss": 2.7567, + "num_input_tokens_seen": 8011120640, + "step": 15280 + }, + { + "epoch": 0.7412661174459417, + "grad_norm": 0.24609375, + "learning_rate": 8.605905412950812e-06, + "loss": 2.7772, + "num_input_tokens_seen": 8013742080, + "step": 15285 + }, + { + "epoch": 0.7415085990021884, + "grad_norm": 0.2373046875, + "learning_rate": 8.590775940994472e-06, + "loss": 2.7667, + "num_input_tokens_seen": 8016363520, + "step": 15290 + }, + { + "epoch": 0.741751080558435, + "grad_norm": 0.2353515625, + "learning_rate": 8.575657020271857e-06, + "loss": 2.765, + "num_input_tokens_seen": 8018984960, + "step": 15295 + }, + { + "epoch": 0.7419935621146817, + "grad_norm": 0.240234375, + "learning_rate": 8.56054866050453e-06, + "loss": 2.7717, + "num_input_tokens_seen": 8021606400, + "step": 15300 + }, + { + "epoch": 0.7419935621146817, + "eval_accuracy": 0.456018563751832, + "eval_loss": 2.7416841983795166, + "eval_runtime": 5.8648, + "eval_samples_per_second": 51.152, + "eval_steps_per_second": 6.479, + "num_input_tokens_seen": 8021606400, + "step": 15300 + }, + { + "epoch": 0.7422360436709283, + "grad_norm": 0.2392578125, + "learning_rate": 8.54545087140726e-06, + "loss": 2.7912, + "num_input_tokens_seen": 8024227840, + "step": 15305 + }, + { + "epoch": 0.742478525227175, + "grad_norm": 0.25390625, + "learning_rate": 8.530363662688023e-06, + "loss": 2.776, + "num_input_tokens_seen": 8026849280, + "step": 15310 + }, + { + "epoch": 0.7427210067834216, + "grad_norm": 0.244140625, + "learning_rate": 8.515287044047982e-06, + "loss": 2.783, + "num_input_tokens_seen": 8029470720, + "step": 15315 + }, + { + "epoch": 0.7429634883396682, + "grad_norm": 0.2412109375, + "learning_rate": 8.500221025181496e-06, + "loss": 2.7864, + "num_input_tokens_seen": 8032092160, + "step": 15320 + }, + { + "epoch": 0.7432059698959148, + "grad_norm": 0.244140625, + "learning_rate": 8.485165615776114e-06, + "loss": 2.7776, + "num_input_tokens_seen": 8034713600, + "step": 15325 + }, + { + "epoch": 0.7434484514521614, + "grad_norm": 0.2470703125, + "learning_rate": 8.47012082551256e-06, + "loss": 2.7796, + "num_input_tokens_seen": 8037335040, + "step": 15330 + }, + { + "epoch": 0.743690933008408, + "grad_norm": 0.2431640625, + "learning_rate": 8.455086664064713e-06, + "loss": 2.7897, + "num_input_tokens_seen": 8039956480, + "step": 15335 + }, + { + "epoch": 0.7439334145646547, + "grad_norm": 0.2392578125, + "learning_rate": 8.440063141099666e-06, + "loss": 2.7773, + "num_input_tokens_seen": 8042577920, + "step": 15340 + }, + { + "epoch": 0.7441758961209013, + "grad_norm": 0.2451171875, + "learning_rate": 8.425050266277609e-06, + "loss": 2.7693, + "num_input_tokens_seen": 8045199360, + "step": 15345 + }, + { + "epoch": 0.7444183776771479, + "grad_norm": 0.2431640625, + "learning_rate": 8.410048049251922e-06, + "loss": 2.7884, + "num_input_tokens_seen": 8047820800, + "step": 15350 + }, + { + "epoch": 0.7446608592333945, + "grad_norm": 0.2431640625, + "learning_rate": 8.395056499669127e-06, + "loss": 2.7588, + "num_input_tokens_seen": 8050442240, + "step": 15355 + }, + { + "epoch": 0.7449033407896412, + "grad_norm": 0.2392578125, + "learning_rate": 8.380075627168884e-06, + "loss": 2.76, + "num_input_tokens_seen": 8053063680, + "step": 15360 + }, + { + "epoch": 0.7451458223458878, + "grad_norm": 0.244140625, + "learning_rate": 8.365105441383986e-06, + "loss": 2.7677, + "num_input_tokens_seen": 8055685120, + "step": 15365 + }, + { + "epoch": 0.7453883039021344, + "grad_norm": 0.23828125, + "learning_rate": 8.350145951940364e-06, + "loss": 2.7764, + "num_input_tokens_seen": 8058306560, + "step": 15370 + }, + { + "epoch": 0.745630785458381, + "grad_norm": 0.2412109375, + "learning_rate": 8.335197168457048e-06, + "loss": 2.7667, + "num_input_tokens_seen": 8060928000, + "step": 15375 + }, + { + "epoch": 0.7458732670146277, + "grad_norm": 0.248046875, + "learning_rate": 8.32025910054621e-06, + "loss": 2.7695, + "num_input_tokens_seen": 8063549440, + "step": 15380 + }, + { + "epoch": 0.7461157485708744, + "grad_norm": 0.2451171875, + "learning_rate": 8.305331757813115e-06, + "loss": 2.7786, + "num_input_tokens_seen": 8066170880, + "step": 15385 + }, + { + "epoch": 0.746358230127121, + "grad_norm": 0.2412109375, + "learning_rate": 8.290415149856134e-06, + "loss": 2.758, + "num_input_tokens_seen": 8068792320, + "step": 15390 + }, + { + "epoch": 0.7466007116833676, + "grad_norm": 0.240234375, + "learning_rate": 8.275509286266755e-06, + "loss": 2.7753, + "num_input_tokens_seen": 8071413760, + "step": 15395 + }, + { + "epoch": 0.7468431932396142, + "grad_norm": 0.24609375, + "learning_rate": 8.260614176629544e-06, + "loss": 2.7729, + "num_input_tokens_seen": 8074035200, + "step": 15400 + }, + { + "epoch": 0.7470856747958609, + "grad_norm": 0.248046875, + "learning_rate": 8.245729830522128e-06, + "loss": 2.7754, + "num_input_tokens_seen": 8076656640, + "step": 15405 + }, + { + "epoch": 0.7473281563521075, + "grad_norm": 0.2412109375, + "learning_rate": 8.230856257515245e-06, + "loss": 2.7642, + "num_input_tokens_seen": 8079278080, + "step": 15410 + }, + { + "epoch": 0.7475706379083541, + "grad_norm": 0.2431640625, + "learning_rate": 8.215993467172697e-06, + "loss": 2.7596, + "num_input_tokens_seen": 8081899520, + "step": 15415 + }, + { + "epoch": 0.7478131194646007, + "grad_norm": 0.23828125, + "learning_rate": 8.201141469051351e-06, + "loss": 2.7666, + "num_input_tokens_seen": 8084520960, + "step": 15420 + }, + { + "epoch": 0.7480556010208473, + "grad_norm": 0.25390625, + "learning_rate": 8.186300272701134e-06, + "loss": 2.7715, + "num_input_tokens_seen": 8087142400, + "step": 15425 + }, + { + "epoch": 0.748298082577094, + "grad_norm": 0.2470703125, + "learning_rate": 8.171469887665035e-06, + "loss": 2.7671, + "num_input_tokens_seen": 8089763840, + "step": 15430 + }, + { + "epoch": 0.7485405641333406, + "grad_norm": 0.2392578125, + "learning_rate": 8.15665032347907e-06, + "loss": 2.78, + "num_input_tokens_seen": 8092385280, + "step": 15435 + }, + { + "epoch": 0.7487830456895872, + "grad_norm": 0.2421875, + "learning_rate": 8.141841589672316e-06, + "loss": 2.7722, + "num_input_tokens_seen": 8095006720, + "step": 15440 + }, + { + "epoch": 0.7490255272458338, + "grad_norm": 0.2392578125, + "learning_rate": 8.127043695766879e-06, + "loss": 2.7718, + "num_input_tokens_seen": 8097628160, + "step": 15445 + }, + { + "epoch": 0.7492680088020804, + "grad_norm": 0.2392578125, + "learning_rate": 8.11225665127791e-06, + "loss": 2.7835, + "num_input_tokens_seen": 8100249600, + "step": 15450 + }, + { + "epoch": 0.7495104903583272, + "grad_norm": 0.2431640625, + "learning_rate": 8.097480465713563e-06, + "loss": 2.7689, + "num_input_tokens_seen": 8102871040, + "step": 15455 + }, + { + "epoch": 0.7497529719145738, + "grad_norm": 0.23828125, + "learning_rate": 8.082715148575018e-06, + "loss": 2.7796, + "num_input_tokens_seen": 8105492480, + "step": 15460 + }, + { + "epoch": 0.7499954534708204, + "grad_norm": 0.244140625, + "learning_rate": 8.067960709356478e-06, + "loss": 2.7587, + "num_input_tokens_seen": 8108113920, + "step": 15465 + }, + { + "epoch": 0.750237935027067, + "grad_norm": 0.244140625, + "learning_rate": 8.053217157545117e-06, + "loss": 2.7646, + "num_input_tokens_seen": 8110735360, + "step": 15470 + }, + { + "epoch": 0.7504804165833137, + "grad_norm": 0.2451171875, + "learning_rate": 8.038484502621144e-06, + "loss": 2.7692, + "num_input_tokens_seen": 8113356800, + "step": 15475 + }, + { + "epoch": 0.7507228981395603, + "grad_norm": 0.2412109375, + "learning_rate": 8.023762754057748e-06, + "loss": 2.7649, + "num_input_tokens_seen": 8115978240, + "step": 15480 + }, + { + "epoch": 0.7509653796958069, + "grad_norm": 0.244140625, + "learning_rate": 8.009051921321101e-06, + "loss": 2.7863, + "num_input_tokens_seen": 8118599680, + "step": 15485 + }, + { + "epoch": 0.7512078612520535, + "grad_norm": 0.240234375, + "learning_rate": 7.994352013870366e-06, + "loss": 2.7695, + "num_input_tokens_seen": 8121221120, + "step": 15490 + }, + { + "epoch": 0.7514503428083001, + "grad_norm": 0.2451171875, + "learning_rate": 7.979663041157673e-06, + "loss": 2.7841, + "num_input_tokens_seen": 8123842560, + "step": 15495 + }, + { + "epoch": 0.7516928243645468, + "grad_norm": 0.2412109375, + "learning_rate": 7.964985012628123e-06, + "loss": 2.7864, + "num_input_tokens_seen": 8126464000, + "step": 15500 + }, + { + "epoch": 0.7519353059207934, + "grad_norm": 0.2431640625, + "learning_rate": 7.950317937719782e-06, + "loss": 2.7813, + "num_input_tokens_seen": 8129085440, + "step": 15505 + }, + { + "epoch": 0.75217778747704, + "grad_norm": 0.244140625, + "learning_rate": 7.935661825863669e-06, + "loss": 2.7689, + "num_input_tokens_seen": 8131706880, + "step": 15510 + }, + { + "epoch": 0.7524202690332866, + "grad_norm": 0.2451171875, + "learning_rate": 7.921016686483757e-06, + "loss": 2.779, + "num_input_tokens_seen": 8134328320, + "step": 15515 + }, + { + "epoch": 0.7526627505895332, + "grad_norm": 0.2421875, + "learning_rate": 7.906382528996958e-06, + "loss": 2.7874, + "num_input_tokens_seen": 8136949760, + "step": 15520 + }, + { + "epoch": 0.7529052321457799, + "grad_norm": 0.2490234375, + "learning_rate": 7.891759362813142e-06, + "loss": 2.7839, + "num_input_tokens_seen": 8139571200, + "step": 15525 + }, + { + "epoch": 0.7531477137020265, + "grad_norm": 0.2421875, + "learning_rate": 7.877147197335075e-06, + "loss": 2.7597, + "num_input_tokens_seen": 8142192640, + "step": 15530 + }, + { + "epoch": 0.7533901952582732, + "grad_norm": 0.2431640625, + "learning_rate": 7.862546041958482e-06, + "loss": 2.7801, + "num_input_tokens_seen": 8144814080, + "step": 15535 + }, + { + "epoch": 0.7536326768145198, + "grad_norm": 0.24609375, + "learning_rate": 7.847955906071994e-06, + "loss": 2.7751, + "num_input_tokens_seen": 8147435520, + "step": 15540 + }, + { + "epoch": 0.7538751583707665, + "grad_norm": 0.2490234375, + "learning_rate": 7.833376799057163e-06, + "loss": 2.7744, + "num_input_tokens_seen": 8150056960, + "step": 15545 + }, + { + "epoch": 0.7541176399270131, + "grad_norm": 0.2353515625, + "learning_rate": 7.81880873028844e-06, + "loss": 2.7735, + "num_input_tokens_seen": 8152678400, + "step": 15550 + }, + { + "epoch": 0.7543601214832597, + "grad_norm": 0.2470703125, + "learning_rate": 7.804251709133192e-06, + "loss": 2.7793, + "num_input_tokens_seen": 8155299840, + "step": 15555 + }, + { + "epoch": 0.7546026030395063, + "grad_norm": 0.2373046875, + "learning_rate": 7.789705744951672e-06, + "loss": 2.7715, + "num_input_tokens_seen": 8157921280, + "step": 15560 + }, + { + "epoch": 0.754845084595753, + "grad_norm": 0.24609375, + "learning_rate": 7.775170847097026e-06, + "loss": 2.7799, + "num_input_tokens_seen": 8160542720, + "step": 15565 + }, + { + "epoch": 0.7550875661519996, + "grad_norm": 0.240234375, + "learning_rate": 7.760647024915283e-06, + "loss": 2.7828, + "num_input_tokens_seen": 8163164160, + "step": 15570 + }, + { + "epoch": 0.7553300477082462, + "grad_norm": 0.25390625, + "learning_rate": 7.746134287745349e-06, + "loss": 2.784, + "num_input_tokens_seen": 8165785600, + "step": 15575 + }, + { + "epoch": 0.7555725292644928, + "grad_norm": 0.2470703125, + "learning_rate": 7.731632644919012e-06, + "loss": 2.7859, + "num_input_tokens_seen": 8168407040, + "step": 15580 + }, + { + "epoch": 0.7558150108207394, + "grad_norm": 0.23828125, + "learning_rate": 7.717142105760921e-06, + "loss": 2.7775, + "num_input_tokens_seen": 8171028480, + "step": 15585 + }, + { + "epoch": 0.756057492376986, + "grad_norm": 0.2421875, + "learning_rate": 7.702662679588572e-06, + "loss": 2.7839, + "num_input_tokens_seen": 8173649920, + "step": 15590 + }, + { + "epoch": 0.7562999739332327, + "grad_norm": 0.2431640625, + "learning_rate": 7.688194375712327e-06, + "loss": 2.7691, + "num_input_tokens_seen": 8176271360, + "step": 15595 + }, + { + "epoch": 0.7565424554894793, + "grad_norm": 0.2421875, + "learning_rate": 7.673737203435405e-06, + "loss": 2.777, + "num_input_tokens_seen": 8178892800, + "step": 15600 + }, + { + "epoch": 0.7565424554894793, + "eval_accuracy": 0.45587363621560006, + "eval_loss": 2.741650342941284, + "eval_runtime": 5.8981, + "eval_samples_per_second": 50.864, + "eval_steps_per_second": 6.443, + "num_input_tokens_seen": 8178892800, + "step": 15600 + }, + { + "epoch": 0.7567849370457259, + "grad_norm": 0.2490234375, + "learning_rate": 7.65929117205385e-06, + "loss": 2.7818, + "num_input_tokens_seen": 8181514240, + "step": 15605 + }, + { + "epoch": 0.7570274186019725, + "grad_norm": 0.2451171875, + "learning_rate": 7.644856290856559e-06, + "loss": 2.777, + "num_input_tokens_seen": 8184135680, + "step": 15610 + }, + { + "epoch": 0.7572699001582193, + "grad_norm": 0.2451171875, + "learning_rate": 7.630432569125245e-06, + "loss": 2.7863, + "num_input_tokens_seen": 8186757120, + "step": 15615 + }, + { + "epoch": 0.7575123817144659, + "grad_norm": 0.23828125, + "learning_rate": 7.616020016134451e-06, + "loss": 2.7721, + "num_input_tokens_seen": 8189378560, + "step": 15620 + }, + { + "epoch": 0.7577548632707125, + "grad_norm": 0.2353515625, + "learning_rate": 7.601618641151542e-06, + "loss": 2.7765, + "num_input_tokens_seen": 8192000000, + "step": 15625 + }, + { + "epoch": 0.7579973448269591, + "grad_norm": 0.25, + "learning_rate": 7.587228453436693e-06, + "loss": 2.7773, + "num_input_tokens_seen": 8194621440, + "step": 15630 + }, + { + "epoch": 0.7582398263832058, + "grad_norm": 0.244140625, + "learning_rate": 7.572849462242879e-06, + "loss": 2.7723, + "num_input_tokens_seen": 8197242880, + "step": 15635 + }, + { + "epoch": 0.7584823079394524, + "grad_norm": 0.2412109375, + "learning_rate": 7.558481676815887e-06, + "loss": 2.7852, + "num_input_tokens_seen": 8199864320, + "step": 15640 + }, + { + "epoch": 0.758724789495699, + "grad_norm": 0.2412109375, + "learning_rate": 7.544125106394289e-06, + "loss": 2.7693, + "num_input_tokens_seen": 8202485760, + "step": 15645 + }, + { + "epoch": 0.7589672710519456, + "grad_norm": 0.2353515625, + "learning_rate": 7.52977976020946e-06, + "loss": 2.7761, + "num_input_tokens_seen": 8205107200, + "step": 15650 + }, + { + "epoch": 0.7592097526081922, + "grad_norm": 0.2421875, + "learning_rate": 7.5154456474855305e-06, + "loss": 2.7785, + "num_input_tokens_seen": 8207728640, + "step": 15655 + }, + { + "epoch": 0.7594522341644389, + "grad_norm": 0.244140625, + "learning_rate": 7.501122777439435e-06, + "loss": 2.7717, + "num_input_tokens_seen": 8210350080, + "step": 15660 + }, + { + "epoch": 0.7596947157206855, + "grad_norm": 0.2421875, + "learning_rate": 7.486811159280863e-06, + "loss": 2.7675, + "num_input_tokens_seen": 8212971520, + "step": 15665 + }, + { + "epoch": 0.7599371972769321, + "grad_norm": 0.251953125, + "learning_rate": 7.472510802212279e-06, + "loss": 2.7714, + "num_input_tokens_seen": 8215592960, + "step": 15670 + }, + { + "epoch": 0.7601796788331787, + "grad_norm": 0.234375, + "learning_rate": 7.458221715428893e-06, + "loss": 2.7733, + "num_input_tokens_seen": 8218214400, + "step": 15675 + }, + { + "epoch": 0.7604221603894253, + "grad_norm": 0.2412109375, + "learning_rate": 7.443943908118703e-06, + "loss": 2.777, + "num_input_tokens_seen": 8220835840, + "step": 15680 + }, + { + "epoch": 0.760664641945672, + "grad_norm": 0.2451171875, + "learning_rate": 7.4296773894624e-06, + "loss": 2.773, + "num_input_tokens_seen": 8223457280, + "step": 15685 + }, + { + "epoch": 0.7609071235019186, + "grad_norm": 0.24609375, + "learning_rate": 7.415422168633457e-06, + "loss": 2.7707, + "num_input_tokens_seen": 8226078720, + "step": 15690 + }, + { + "epoch": 0.7611496050581653, + "grad_norm": 0.23828125, + "learning_rate": 7.4011782547980665e-06, + "loss": 2.7761, + "num_input_tokens_seen": 8228700160, + "step": 15695 + }, + { + "epoch": 0.7613920866144119, + "grad_norm": 0.24609375, + "learning_rate": 7.386945657115158e-06, + "loss": 2.7625, + "num_input_tokens_seen": 8231321600, + "step": 15700 + }, + { + "epoch": 0.7616345681706586, + "grad_norm": 0.2412109375, + "learning_rate": 7.372724384736377e-06, + "loss": 2.7709, + "num_input_tokens_seen": 8233943040, + "step": 15705 + }, + { + "epoch": 0.7618770497269052, + "grad_norm": 0.24609375, + "learning_rate": 7.358514446806103e-06, + "loss": 2.7756, + "num_input_tokens_seen": 8236564480, + "step": 15710 + }, + { + "epoch": 0.7621195312831518, + "grad_norm": 0.2392578125, + "learning_rate": 7.3443158524613946e-06, + "loss": 2.7763, + "num_input_tokens_seen": 8239185920, + "step": 15715 + }, + { + "epoch": 0.7623620128393984, + "grad_norm": 0.2431640625, + "learning_rate": 7.330128610832049e-06, + "loss": 2.7757, + "num_input_tokens_seen": 8241807360, + "step": 15720 + }, + { + "epoch": 0.762604494395645, + "grad_norm": 0.240234375, + "learning_rate": 7.3159527310405454e-06, + "loss": 2.7879, + "num_input_tokens_seen": 8244428800, + "step": 15725 + }, + { + "epoch": 0.7628469759518917, + "grad_norm": 0.251953125, + "learning_rate": 7.301788222202063e-06, + "loss": 2.7797, + "num_input_tokens_seen": 8247050240, + "step": 15730 + }, + { + "epoch": 0.7630894575081383, + "grad_norm": 0.25, + "learning_rate": 7.28763509342448e-06, + "loss": 2.787, + "num_input_tokens_seen": 8249671680, + "step": 15735 + }, + { + "epoch": 0.7633319390643849, + "grad_norm": 0.2392578125, + "learning_rate": 7.273493353808347e-06, + "loss": 2.7699, + "num_input_tokens_seen": 8252293120, + "step": 15740 + }, + { + "epoch": 0.7635744206206315, + "grad_norm": 0.2392578125, + "learning_rate": 7.259363012446876e-06, + "loss": 2.7927, + "num_input_tokens_seen": 8254914560, + "step": 15745 + }, + { + "epoch": 0.7638169021768781, + "grad_norm": 0.2451171875, + "learning_rate": 7.245244078425975e-06, + "loss": 2.769, + "num_input_tokens_seen": 8257536000, + "step": 15750 + }, + { + "epoch": 0.7640593837331248, + "grad_norm": 0.240234375, + "learning_rate": 7.231136560824206e-06, + "loss": 2.7718, + "num_input_tokens_seen": 8260157440, + "step": 15755 + }, + { + "epoch": 0.7643018652893714, + "grad_norm": 0.25, + "learning_rate": 7.217040468712788e-06, + "loss": 2.7671, + "num_input_tokens_seen": 8262778880, + "step": 15760 + }, + { + "epoch": 0.764544346845618, + "grad_norm": 0.2353515625, + "learning_rate": 7.202955811155601e-06, + "loss": 2.7749, + "num_input_tokens_seen": 8265400320, + "step": 15765 + }, + { + "epoch": 0.7647868284018647, + "grad_norm": 0.2373046875, + "learning_rate": 7.188882597209162e-06, + "loss": 2.7727, + "num_input_tokens_seen": 8268021760, + "step": 15770 + }, + { + "epoch": 0.7650293099581114, + "grad_norm": 0.23828125, + "learning_rate": 7.174820835922649e-06, + "loss": 2.7711, + "num_input_tokens_seen": 8270643200, + "step": 15775 + }, + { + "epoch": 0.765271791514358, + "grad_norm": 0.240234375, + "learning_rate": 7.160770536337838e-06, + "loss": 2.7879, + "num_input_tokens_seen": 8273264640, + "step": 15780 + }, + { + "epoch": 0.7655142730706046, + "grad_norm": 0.24609375, + "learning_rate": 7.146731707489179e-06, + "loss": 2.7784, + "num_input_tokens_seen": 8275886080, + "step": 15785 + }, + { + "epoch": 0.7657567546268512, + "grad_norm": 0.244140625, + "learning_rate": 7.132704358403724e-06, + "loss": 2.7661, + "num_input_tokens_seen": 8278507520, + "step": 15790 + }, + { + "epoch": 0.7659992361830978, + "grad_norm": 0.23828125, + "learning_rate": 7.118688498101145e-06, + "loss": 2.7736, + "num_input_tokens_seen": 8281128960, + "step": 15795 + }, + { + "epoch": 0.7662417177393445, + "grad_norm": 0.248046875, + "learning_rate": 7.104684135593726e-06, + "loss": 2.7734, + "num_input_tokens_seen": 8283750400, + "step": 15800 + }, + { + "epoch": 0.7664841992955911, + "grad_norm": 0.2431640625, + "learning_rate": 7.0906912798863666e-06, + "loss": 2.7645, + "num_input_tokens_seen": 8286371840, + "step": 15805 + }, + { + "epoch": 0.7667266808518377, + "grad_norm": 0.240234375, + "learning_rate": 7.076709939976548e-06, + "loss": 2.7691, + "num_input_tokens_seen": 8288993280, + "step": 15810 + }, + { + "epoch": 0.7669691624080843, + "grad_norm": 0.240234375, + "learning_rate": 7.062740124854367e-06, + "loss": 2.7679, + "num_input_tokens_seen": 8291614720, + "step": 15815 + }, + { + "epoch": 0.767211643964331, + "grad_norm": 0.244140625, + "learning_rate": 7.0487818435025e-06, + "loss": 2.7658, + "num_input_tokens_seen": 8294236160, + "step": 15820 + }, + { + "epoch": 0.7674541255205776, + "grad_norm": 0.2421875, + "learning_rate": 7.034835104896209e-06, + "loss": 2.7741, + "num_input_tokens_seen": 8296857600, + "step": 15825 + }, + { + "epoch": 0.7676966070768242, + "grad_norm": 0.2431640625, + "learning_rate": 7.020899918003337e-06, + "loss": 2.7861, + "num_input_tokens_seen": 8299479040, + "step": 15830 + }, + { + "epoch": 0.7679390886330708, + "grad_norm": 0.2373046875, + "learning_rate": 7.006976291784295e-06, + "loss": 2.7906, + "num_input_tokens_seen": 8302100480, + "step": 15835 + }, + { + "epoch": 0.7681815701893174, + "grad_norm": 0.2470703125, + "learning_rate": 6.99306423519206e-06, + "loss": 2.763, + "num_input_tokens_seen": 8304721920, + "step": 15840 + }, + { + "epoch": 0.768424051745564, + "grad_norm": 0.2451171875, + "learning_rate": 6.9791637571721744e-06, + "loss": 2.7909, + "num_input_tokens_seen": 8307343360, + "step": 15845 + }, + { + "epoch": 0.7686665333018108, + "grad_norm": 0.2421875, + "learning_rate": 6.96527486666273e-06, + "loss": 2.7655, + "num_input_tokens_seen": 8309964800, + "step": 15850 + }, + { + "epoch": 0.7689090148580574, + "grad_norm": 0.25, + "learning_rate": 6.951397572594373e-06, + "loss": 2.7775, + "num_input_tokens_seen": 8312586240, + "step": 15855 + }, + { + "epoch": 0.769151496414304, + "grad_norm": 0.244140625, + "learning_rate": 6.937531883890286e-06, + "loss": 2.7707, + "num_input_tokens_seen": 8315207680, + "step": 15860 + }, + { + "epoch": 0.7693939779705506, + "grad_norm": 0.24609375, + "learning_rate": 6.923677809466206e-06, + "loss": 2.7756, + "num_input_tokens_seen": 8317829120, + "step": 15865 + }, + { + "epoch": 0.7696364595267973, + "grad_norm": 0.2451171875, + "learning_rate": 6.909835358230371e-06, + "loss": 2.7584, + "num_input_tokens_seen": 8320450560, + "step": 15870 + }, + { + "epoch": 0.7698789410830439, + "grad_norm": 0.2412109375, + "learning_rate": 6.896004539083573e-06, + "loss": 2.7819, + "num_input_tokens_seen": 8323072000, + "step": 15875 + }, + { + "epoch": 0.7701214226392905, + "grad_norm": 0.2451171875, + "learning_rate": 6.8821853609191165e-06, + "loss": 2.7654, + "num_input_tokens_seen": 8325693440, + "step": 15880 + }, + { + "epoch": 0.7703639041955371, + "grad_norm": 0.2421875, + "learning_rate": 6.868377832622813e-06, + "loss": 2.7798, + "num_input_tokens_seen": 8328314880, + "step": 15885 + }, + { + "epoch": 0.7706063857517838, + "grad_norm": 0.2412109375, + "learning_rate": 6.854581963072998e-06, + "loss": 2.7789, + "num_input_tokens_seen": 8330936320, + "step": 15890 + }, + { + "epoch": 0.7708488673080304, + "grad_norm": 0.2431640625, + "learning_rate": 6.840797761140497e-06, + "loss": 2.7896, + "num_input_tokens_seen": 8333557760, + "step": 15895 + }, + { + "epoch": 0.771091348864277, + "grad_norm": 0.2392578125, + "learning_rate": 6.827025235688641e-06, + "loss": 2.7793, + "num_input_tokens_seen": 8336179200, + "step": 15900 + }, + { + "epoch": 0.771091348864277, + "eval_accuracy": 0.45600227975899693, + "eval_loss": 2.7416441440582275, + "eval_runtime": 5.8798, + "eval_samples_per_second": 51.022, + "eval_steps_per_second": 6.463, + "num_input_tokens_seen": 8336179200, + "step": 15900 + }, + { + "epoch": 0.7713338304205236, + "grad_norm": 0.24609375, + "learning_rate": 6.813264395573246e-06, + "loss": 2.7759, + "num_input_tokens_seen": 8338800640, + "step": 15905 + }, + { + "epoch": 0.7715763119767702, + "grad_norm": 0.248046875, + "learning_rate": 6.7995152496426215e-06, + "loss": 2.7731, + "num_input_tokens_seen": 8341422080, + "step": 15910 + }, + { + "epoch": 0.7718187935330169, + "grad_norm": 0.2412109375, + "learning_rate": 6.785777806737554e-06, + "loss": 2.7777, + "num_input_tokens_seen": 8344043520, + "step": 15915 + }, + { + "epoch": 0.7720612750892635, + "grad_norm": 0.251953125, + "learning_rate": 6.772052075691304e-06, + "loss": 2.7744, + "num_input_tokens_seen": 8346664960, + "step": 15920 + }, + { + "epoch": 0.7723037566455101, + "grad_norm": 0.251953125, + "learning_rate": 6.758338065329603e-06, + "loss": 2.7757, + "num_input_tokens_seen": 8349286400, + "step": 15925 + }, + { + "epoch": 0.7725462382017568, + "grad_norm": 0.2451171875, + "learning_rate": 6.744635784470654e-06, + "loss": 2.7793, + "num_input_tokens_seen": 8351907840, + "step": 15930 + }, + { + "epoch": 0.7727887197580035, + "grad_norm": 0.251953125, + "learning_rate": 6.730945241925093e-06, + "loss": 2.7813, + "num_input_tokens_seen": 8354529280, + "step": 15935 + }, + { + "epoch": 0.7730312013142501, + "grad_norm": 0.2353515625, + "learning_rate": 6.717266446496034e-06, + "loss": 2.764, + "num_input_tokens_seen": 8357150720, + "step": 15940 + }, + { + "epoch": 0.7732736828704967, + "grad_norm": 0.2431640625, + "learning_rate": 6.703599406979025e-06, + "loss": 2.7767, + "num_input_tokens_seen": 8359772160, + "step": 15945 + }, + { + "epoch": 0.7735161644267433, + "grad_norm": 0.251953125, + "learning_rate": 6.689944132162057e-06, + "loss": 2.7694, + "num_input_tokens_seen": 8362393600, + "step": 15950 + }, + { + "epoch": 0.7737586459829899, + "grad_norm": 0.244140625, + "learning_rate": 6.676300630825563e-06, + "loss": 2.7759, + "num_input_tokens_seen": 8365015040, + "step": 15955 + }, + { + "epoch": 0.7740011275392366, + "grad_norm": 0.2412109375, + "learning_rate": 6.662668911742395e-06, + "loss": 2.778, + "num_input_tokens_seen": 8367636480, + "step": 15960 + }, + { + "epoch": 0.7742436090954832, + "grad_norm": 0.25, + "learning_rate": 6.649048983677833e-06, + "loss": 2.7658, + "num_input_tokens_seen": 8370257920, + "step": 15965 + }, + { + "epoch": 0.7744860906517298, + "grad_norm": 0.234375, + "learning_rate": 6.63544085538958e-06, + "loss": 2.765, + "num_input_tokens_seen": 8372879360, + "step": 15970 + }, + { + "epoch": 0.7747285722079764, + "grad_norm": 0.2421875, + "learning_rate": 6.621844535627744e-06, + "loss": 2.7716, + "num_input_tokens_seen": 8375500800, + "step": 15975 + }, + { + "epoch": 0.774971053764223, + "grad_norm": 0.2412109375, + "learning_rate": 6.608260033134847e-06, + "loss": 2.7742, + "num_input_tokens_seen": 8378122240, + "step": 15980 + }, + { + "epoch": 0.7752135353204697, + "grad_norm": 0.2392578125, + "learning_rate": 6.594687356645807e-06, + "loss": 2.765, + "num_input_tokens_seen": 8380743680, + "step": 15985 + }, + { + "epoch": 0.7754560168767163, + "grad_norm": 0.23828125, + "learning_rate": 6.5811265148879444e-06, + "loss": 2.7757, + "num_input_tokens_seen": 8383365120, + "step": 15990 + }, + { + "epoch": 0.7756984984329629, + "grad_norm": 0.2353515625, + "learning_rate": 6.5675775165809585e-06, + "loss": 2.7782, + "num_input_tokens_seen": 8385986560, + "step": 15995 + }, + { + "epoch": 0.7759409799892095, + "grad_norm": 0.2392578125, + "learning_rate": 6.554040370436939e-06, + "loss": 2.7818, + "num_input_tokens_seen": 8388608000, + "step": 16000 + }, + { + "epoch": 0.7761834615454561, + "grad_norm": 0.2373046875, + "learning_rate": 6.540515085160359e-06, + "loss": 2.7716, + "num_input_tokens_seen": 8391229440, + "step": 16005 + }, + { + "epoch": 0.7764259431017029, + "grad_norm": 0.240234375, + "learning_rate": 6.527001669448063e-06, + "loss": 2.7779, + "num_input_tokens_seen": 8393850880, + "step": 16010 + }, + { + "epoch": 0.7766684246579495, + "grad_norm": 0.2353515625, + "learning_rate": 6.51350013198925e-06, + "loss": 2.7789, + "num_input_tokens_seen": 8396472320, + "step": 16015 + }, + { + "epoch": 0.7769109062141961, + "grad_norm": 0.236328125, + "learning_rate": 6.500010481465521e-06, + "loss": 2.7627, + "num_input_tokens_seen": 8399093760, + "step": 16020 + }, + { + "epoch": 0.7771533877704427, + "grad_norm": 0.2451171875, + "learning_rate": 6.4865327265507815e-06, + "loss": 2.7756, + "num_input_tokens_seen": 8401715200, + "step": 16025 + }, + { + "epoch": 0.7773958693266894, + "grad_norm": 0.2412109375, + "learning_rate": 6.473066875911315e-06, + "loss": 2.7769, + "num_input_tokens_seen": 8404336640, + "step": 16030 + }, + { + "epoch": 0.777638350882936, + "grad_norm": 0.23828125, + "learning_rate": 6.459612938205755e-06, + "loss": 2.7742, + "num_input_tokens_seen": 8406958080, + "step": 16035 + }, + { + "epoch": 0.7778808324391826, + "grad_norm": 0.2451171875, + "learning_rate": 6.446170922085063e-06, + "loss": 2.7723, + "num_input_tokens_seen": 8409579520, + "step": 16040 + }, + { + "epoch": 0.7781233139954292, + "grad_norm": 0.2451171875, + "learning_rate": 6.432740836192541e-06, + "loss": 2.7519, + "num_input_tokens_seen": 8412200960, + "step": 16045 + }, + { + "epoch": 0.7783657955516758, + "grad_norm": 0.2431640625, + "learning_rate": 6.419322689163826e-06, + "loss": 2.7786, + "num_input_tokens_seen": 8414822400, + "step": 16050 + }, + { + "epoch": 0.7786082771079225, + "grad_norm": 0.2421875, + "learning_rate": 6.4059164896268534e-06, + "loss": 2.776, + "num_input_tokens_seen": 8417443840, + "step": 16055 + }, + { + "epoch": 0.7788507586641691, + "grad_norm": 0.25390625, + "learning_rate": 6.392522246201901e-06, + "loss": 2.7704, + "num_input_tokens_seen": 8420065280, + "step": 16060 + }, + { + "epoch": 0.7790932402204157, + "grad_norm": 0.2421875, + "learning_rate": 6.379139967501555e-06, + "loss": 2.7883, + "num_input_tokens_seen": 8422686720, + "step": 16065 + }, + { + "epoch": 0.7793357217766623, + "grad_norm": 0.2373046875, + "learning_rate": 6.365769662130694e-06, + "loss": 2.7877, + "num_input_tokens_seen": 8425308160, + "step": 16070 + }, + { + "epoch": 0.779578203332909, + "grad_norm": 0.2470703125, + "learning_rate": 6.352411338686523e-06, + "loss": 2.7648, + "num_input_tokens_seen": 8427929600, + "step": 16075 + }, + { + "epoch": 0.7798206848891556, + "grad_norm": 0.2470703125, + "learning_rate": 6.339065005758521e-06, + "loss": 2.7677, + "num_input_tokens_seen": 8430551040, + "step": 16080 + }, + { + "epoch": 0.7800631664454023, + "grad_norm": 0.2373046875, + "learning_rate": 6.325730671928468e-06, + "loss": 2.7849, + "num_input_tokens_seen": 8433172480, + "step": 16085 + }, + { + "epoch": 0.7803056480016489, + "grad_norm": 0.2431640625, + "learning_rate": 6.312408345770413e-06, + "loss": 2.7696, + "num_input_tokens_seen": 8435793920, + "step": 16090 + }, + { + "epoch": 0.7805481295578955, + "grad_norm": 0.2431640625, + "learning_rate": 6.299098035850701e-06, + "loss": 2.7739, + "num_input_tokens_seen": 8438415360, + "step": 16095 + }, + { + "epoch": 0.7807906111141422, + "grad_norm": 0.2431640625, + "learning_rate": 6.2857997507279445e-06, + "loss": 2.7591, + "num_input_tokens_seen": 8441036800, + "step": 16100 + }, + { + "epoch": 0.7810330926703888, + "grad_norm": 0.2431640625, + "learning_rate": 6.272513498953022e-06, + "loss": 2.7713, + "num_input_tokens_seen": 8443658240, + "step": 16105 + }, + { + "epoch": 0.7812755742266354, + "grad_norm": 0.2451171875, + "learning_rate": 6.259239289069083e-06, + "loss": 2.7859, + "num_input_tokens_seen": 8446279680, + "step": 16110 + }, + { + "epoch": 0.781518055782882, + "grad_norm": 0.2412109375, + "learning_rate": 6.245977129611527e-06, + "loss": 2.7817, + "num_input_tokens_seen": 8448901120, + "step": 16115 + }, + { + "epoch": 0.7817605373391286, + "grad_norm": 0.2431640625, + "learning_rate": 6.2327270291079876e-06, + "loss": 2.7595, + "num_input_tokens_seen": 8451522560, + "step": 16120 + }, + { + "epoch": 0.7820030188953753, + "grad_norm": 0.2431640625, + "learning_rate": 6.219488996078385e-06, + "loss": 2.7827, + "num_input_tokens_seen": 8454144000, + "step": 16125 + }, + { + "epoch": 0.7822455004516219, + "grad_norm": 0.2373046875, + "learning_rate": 6.206263039034846e-06, + "loss": 2.7706, + "num_input_tokens_seen": 8456765440, + "step": 16130 + }, + { + "epoch": 0.7824879820078685, + "grad_norm": 0.2392578125, + "learning_rate": 6.193049166481749e-06, + "loss": 2.7797, + "num_input_tokens_seen": 8459386880, + "step": 16135 + }, + { + "epoch": 0.7827304635641151, + "grad_norm": 0.2421875, + "learning_rate": 6.179847386915691e-06, + "loss": 2.7743, + "num_input_tokens_seen": 8462008320, + "step": 16140 + }, + { + "epoch": 0.7829729451203618, + "grad_norm": 0.248046875, + "learning_rate": 6.16665770882551e-06, + "loss": 2.7944, + "num_input_tokens_seen": 8464629760, + "step": 16145 + }, + { + "epoch": 0.7832154266766084, + "grad_norm": 0.2490234375, + "learning_rate": 6.1534801406922385e-06, + "loss": 2.7896, + "num_input_tokens_seen": 8467251200, + "step": 16150 + }, + { + "epoch": 0.783457908232855, + "grad_norm": 0.2421875, + "learning_rate": 6.140314690989138e-06, + "loss": 2.7812, + "num_input_tokens_seen": 8469872640, + "step": 16155 + }, + { + "epoch": 0.7837003897891016, + "grad_norm": 0.24609375, + "learning_rate": 6.1271613681816776e-06, + "loss": 2.7842, + "num_input_tokens_seen": 8472494080, + "step": 16160 + }, + { + "epoch": 0.7839428713453483, + "grad_norm": 0.2373046875, + "learning_rate": 6.114020180727525e-06, + "loss": 2.7922, + "num_input_tokens_seen": 8475115520, + "step": 16165 + }, + { + "epoch": 0.784185352901595, + "grad_norm": 0.2412109375, + "learning_rate": 6.100891137076548e-06, + "loss": 2.785, + "num_input_tokens_seen": 8477736960, + "step": 16170 + }, + { + "epoch": 0.7844278344578416, + "grad_norm": 0.2490234375, + "learning_rate": 6.087774245670802e-06, + "loss": 2.7698, + "num_input_tokens_seen": 8480358400, + "step": 16175 + }, + { + "epoch": 0.7846703160140882, + "grad_norm": 0.236328125, + "learning_rate": 6.074669514944528e-06, + "loss": 2.7688, + "num_input_tokens_seen": 8482979840, + "step": 16180 + }, + { + "epoch": 0.7849127975703348, + "grad_norm": 0.2431640625, + "learning_rate": 6.061576953324155e-06, + "loss": 2.7773, + "num_input_tokens_seen": 8485601280, + "step": 16185 + }, + { + "epoch": 0.7851552791265815, + "grad_norm": 0.25, + "learning_rate": 6.048496569228279e-06, + "loss": 2.7752, + "num_input_tokens_seen": 8488222720, + "step": 16190 + }, + { + "epoch": 0.7853977606828281, + "grad_norm": 0.2421875, + "learning_rate": 6.03542837106767e-06, + "loss": 2.7706, + "num_input_tokens_seen": 8490844160, + "step": 16195 + }, + { + "epoch": 0.7856402422390747, + "grad_norm": 0.2451171875, + "learning_rate": 6.0223723672452605e-06, + "loss": 2.7718, + "num_input_tokens_seen": 8493465600, + "step": 16200 + }, + { + "epoch": 0.7856402422390747, + "eval_accuracy": 0.4559338869890897, + "eval_loss": 2.741624355316162, + "eval_runtime": 5.8381, + "eval_samples_per_second": 51.387, + "eval_steps_per_second": 6.509, + "num_input_tokens_seen": 8493465600, + "step": 16200 + }, + { + "epoch": 0.7858827237953213, + "grad_norm": 0.240234375, + "learning_rate": 6.0093285661561495e-06, + "loss": 2.7792, + "num_input_tokens_seen": 8496087040, + "step": 16205 + }, + { + "epoch": 0.7861252053515679, + "grad_norm": 0.24609375, + "learning_rate": 5.996296976187568e-06, + "loss": 2.7785, + "num_input_tokens_seen": 8498708480, + "step": 16210 + }, + { + "epoch": 0.7863676869078146, + "grad_norm": 0.2451171875, + "learning_rate": 5.983277605718921e-06, + "loss": 2.7751, + "num_input_tokens_seen": 8501329920, + "step": 16215 + }, + { + "epoch": 0.7866101684640612, + "grad_norm": 0.251953125, + "learning_rate": 5.9702704631217385e-06, + "loss": 2.7609, + "num_input_tokens_seen": 8503951360, + "step": 16220 + }, + { + "epoch": 0.7868526500203078, + "grad_norm": 0.240234375, + "learning_rate": 5.9572755567596975e-06, + "loss": 2.7857, + "num_input_tokens_seen": 8506572800, + "step": 16225 + }, + { + "epoch": 0.7870951315765544, + "grad_norm": 0.25, + "learning_rate": 5.944292894988607e-06, + "loss": 2.7803, + "num_input_tokens_seen": 8509194240, + "step": 16230 + }, + { + "epoch": 0.787337613132801, + "grad_norm": 0.2412109375, + "learning_rate": 5.931322486156396e-06, + "loss": 2.7821, + "num_input_tokens_seen": 8511815680, + "step": 16235 + }, + { + "epoch": 0.7875800946890477, + "grad_norm": 0.23828125, + "learning_rate": 5.918364338603119e-06, + "loss": 2.7767, + "num_input_tokens_seen": 8514437120, + "step": 16240 + }, + { + "epoch": 0.7878225762452944, + "grad_norm": 0.2451171875, + "learning_rate": 5.905418460660947e-06, + "loss": 2.7744, + "num_input_tokens_seen": 8517058560, + "step": 16245 + }, + { + "epoch": 0.788065057801541, + "grad_norm": 0.2412109375, + "learning_rate": 5.892484860654162e-06, + "loss": 2.7776, + "num_input_tokens_seen": 8519680000, + "step": 16250 + }, + { + "epoch": 0.7883075393577876, + "grad_norm": 0.2470703125, + "learning_rate": 5.879563546899148e-06, + "loss": 2.7839, + "num_input_tokens_seen": 8522301440, + "step": 16255 + }, + { + "epoch": 0.7885500209140343, + "grad_norm": 0.2451171875, + "learning_rate": 5.8666545277043875e-06, + "loss": 2.7745, + "num_input_tokens_seen": 8524922880, + "step": 16260 + }, + { + "epoch": 0.7887925024702809, + "grad_norm": 0.2392578125, + "learning_rate": 5.853757811370467e-06, + "loss": 2.7805, + "num_input_tokens_seen": 8527544320, + "step": 16265 + }, + { + "epoch": 0.7890349840265275, + "grad_norm": 0.2451171875, + "learning_rate": 5.840873406190056e-06, + "loss": 2.7676, + "num_input_tokens_seen": 8530165760, + "step": 16270 + }, + { + "epoch": 0.7892774655827741, + "grad_norm": 0.2412109375, + "learning_rate": 5.828001320447898e-06, + "loss": 2.795, + "num_input_tokens_seen": 8532787200, + "step": 16275 + }, + { + "epoch": 0.7895199471390207, + "grad_norm": 0.248046875, + "learning_rate": 5.815141562420834e-06, + "loss": 2.7721, + "num_input_tokens_seen": 8535408640, + "step": 16280 + }, + { + "epoch": 0.7897624286952674, + "grad_norm": 0.240234375, + "learning_rate": 5.802294140377762e-06, + "loss": 2.7863, + "num_input_tokens_seen": 8538030080, + "step": 16285 + }, + { + "epoch": 0.790004910251514, + "grad_norm": 0.2412109375, + "learning_rate": 5.78945906257966e-06, + "loss": 2.7674, + "num_input_tokens_seen": 8540651520, + "step": 16290 + }, + { + "epoch": 0.7902473918077606, + "grad_norm": 0.24609375, + "learning_rate": 5.776636337279561e-06, + "loss": 2.7645, + "num_input_tokens_seen": 8543272960, + "step": 16295 + }, + { + "epoch": 0.7904898733640072, + "grad_norm": 0.2392578125, + "learning_rate": 5.7638259727225585e-06, + "loss": 2.7767, + "num_input_tokens_seen": 8545894400, + "step": 16300 + }, + { + "epoch": 0.7907323549202538, + "grad_norm": 0.2421875, + "learning_rate": 5.751027977145795e-06, + "loss": 2.7812, + "num_input_tokens_seen": 8548515840, + "step": 16305 + }, + { + "epoch": 0.7909748364765005, + "grad_norm": 0.2412109375, + "learning_rate": 5.738242358778467e-06, + "loss": 2.7753, + "num_input_tokens_seen": 8551137280, + "step": 16310 + }, + { + "epoch": 0.7912173180327471, + "grad_norm": 0.2412109375, + "learning_rate": 5.725469125841801e-06, + "loss": 2.7749, + "num_input_tokens_seen": 8553758720, + "step": 16315 + }, + { + "epoch": 0.7914597995889938, + "grad_norm": 0.2490234375, + "learning_rate": 5.712708286549071e-06, + "loss": 2.7923, + "num_input_tokens_seen": 8556380160, + "step": 16320 + }, + { + "epoch": 0.7917022811452404, + "grad_norm": 0.236328125, + "learning_rate": 5.699959849105571e-06, + "loss": 2.7709, + "num_input_tokens_seen": 8559001600, + "step": 16325 + }, + { + "epoch": 0.7919447627014871, + "grad_norm": 0.2470703125, + "learning_rate": 5.687223821708637e-06, + "loss": 2.7688, + "num_input_tokens_seen": 8561623040, + "step": 16330 + }, + { + "epoch": 0.7921872442577337, + "grad_norm": 0.2470703125, + "learning_rate": 5.674500212547598e-06, + "loss": 2.7836, + "num_input_tokens_seen": 8564244480, + "step": 16335 + }, + { + "epoch": 0.7924297258139803, + "grad_norm": 0.2431640625, + "learning_rate": 5.661789029803824e-06, + "loss": 2.7867, + "num_input_tokens_seen": 8566865920, + "step": 16340 + }, + { + "epoch": 0.7926722073702269, + "grad_norm": 0.2421875, + "learning_rate": 5.649090281650682e-06, + "loss": 2.7848, + "num_input_tokens_seen": 8569487360, + "step": 16345 + }, + { + "epoch": 0.7929146889264735, + "grad_norm": 0.240234375, + "learning_rate": 5.636403976253548e-06, + "loss": 2.7859, + "num_input_tokens_seen": 8572108800, + "step": 16350 + }, + { + "epoch": 0.7931571704827202, + "grad_norm": 0.2421875, + "learning_rate": 5.623730121769788e-06, + "loss": 2.774, + "num_input_tokens_seen": 8574730240, + "step": 16355 + }, + { + "epoch": 0.7933996520389668, + "grad_norm": 0.232421875, + "learning_rate": 5.611068726348795e-06, + "loss": 2.7726, + "num_input_tokens_seen": 8577351680, + "step": 16360 + }, + { + "epoch": 0.7936421335952134, + "grad_norm": 0.2451171875, + "learning_rate": 5.598419798131896e-06, + "loss": 2.777, + "num_input_tokens_seen": 8579973120, + "step": 16365 + }, + { + "epoch": 0.79388461515146, + "grad_norm": 0.240234375, + "learning_rate": 5.585783345252446e-06, + "loss": 2.7749, + "num_input_tokens_seen": 8582594560, + "step": 16370 + }, + { + "epoch": 0.7941270967077066, + "grad_norm": 0.2373046875, + "learning_rate": 5.57315937583576e-06, + "loss": 2.7805, + "num_input_tokens_seen": 8585216000, + "step": 16375 + }, + { + "epoch": 0.7943695782639533, + "grad_norm": 0.2451171875, + "learning_rate": 5.560547897999127e-06, + "loss": 2.7796, + "num_input_tokens_seen": 8587837440, + "step": 16380 + }, + { + "epoch": 0.7946120598201999, + "grad_norm": 0.2421875, + "learning_rate": 5.547948919851811e-06, + "loss": 2.7696, + "num_input_tokens_seen": 8590458880, + "step": 16385 + }, + { + "epoch": 0.7948545413764465, + "grad_norm": 0.2333984375, + "learning_rate": 5.535362449495032e-06, + "loss": 2.7814, + "num_input_tokens_seen": 8593080320, + "step": 16390 + }, + { + "epoch": 0.7950970229326931, + "grad_norm": 0.236328125, + "learning_rate": 5.522788495021975e-06, + "loss": 2.7723, + "num_input_tokens_seen": 8595701760, + "step": 16395 + }, + { + "epoch": 0.7953395044889399, + "grad_norm": 0.2373046875, + "learning_rate": 5.510227064517756e-06, + "loss": 2.7899, + "num_input_tokens_seen": 8598323200, + "step": 16400 + }, + { + "epoch": 0.7955819860451865, + "grad_norm": 0.2451171875, + "learning_rate": 5.4976781660594555e-06, + "loss": 2.7542, + "num_input_tokens_seen": 8600944640, + "step": 16405 + }, + { + "epoch": 0.7958244676014331, + "grad_norm": 0.2333984375, + "learning_rate": 5.485141807716107e-06, + "loss": 2.7663, + "num_input_tokens_seen": 8603566080, + "step": 16410 + }, + { + "epoch": 0.7960669491576797, + "grad_norm": 0.2392578125, + "learning_rate": 5.472617997548662e-06, + "loss": 2.7791, + "num_input_tokens_seen": 8606187520, + "step": 16415 + }, + { + "epoch": 0.7963094307139263, + "grad_norm": 0.236328125, + "learning_rate": 5.460106743610008e-06, + "loss": 2.7766, + "num_input_tokens_seen": 8608808960, + "step": 16420 + }, + { + "epoch": 0.796551912270173, + "grad_norm": 0.2431640625, + "learning_rate": 5.4476080539449665e-06, + "loss": 2.78, + "num_input_tokens_seen": 8611430400, + "step": 16425 + }, + { + "epoch": 0.7967943938264196, + "grad_norm": 0.23828125, + "learning_rate": 5.435121936590256e-06, + "loss": 2.7582, + "num_input_tokens_seen": 8614051840, + "step": 16430 + }, + { + "epoch": 0.7970368753826662, + "grad_norm": 0.2431640625, + "learning_rate": 5.422648399574543e-06, + "loss": 2.7646, + "num_input_tokens_seen": 8616673280, + "step": 16435 + }, + { + "epoch": 0.7972793569389128, + "grad_norm": 0.2412109375, + "learning_rate": 5.4101874509183805e-06, + "loss": 2.7804, + "num_input_tokens_seen": 8619294720, + "step": 16440 + }, + { + "epoch": 0.7975218384951595, + "grad_norm": 0.25, + "learning_rate": 5.3977390986342415e-06, + "loss": 2.7669, + "num_input_tokens_seen": 8621916160, + "step": 16445 + }, + { + "epoch": 0.7977643200514061, + "grad_norm": 0.2451171875, + "learning_rate": 5.385303350726495e-06, + "loss": 2.7883, + "num_input_tokens_seen": 8624537600, + "step": 16450 + }, + { + "epoch": 0.7980068016076527, + "grad_norm": 0.2392578125, + "learning_rate": 5.372880215191409e-06, + "loss": 2.7731, + "num_input_tokens_seen": 8627159040, + "step": 16455 + }, + { + "epoch": 0.7982492831638993, + "grad_norm": 0.2421875, + "learning_rate": 5.360469700017118e-06, + "loss": 2.7927, + "num_input_tokens_seen": 8629780480, + "step": 16460 + }, + { + "epoch": 0.7984917647201459, + "grad_norm": 0.2431640625, + "learning_rate": 5.348071813183681e-06, + "loss": 2.7798, + "num_input_tokens_seen": 8632401920, + "step": 16465 + }, + { + "epoch": 0.7987342462763926, + "grad_norm": 0.2490234375, + "learning_rate": 5.335686562663011e-06, + "loss": 2.7614, + "num_input_tokens_seen": 8635023360, + "step": 16470 + }, + { + "epoch": 0.7989767278326392, + "grad_norm": 0.2392578125, + "learning_rate": 5.3233139564189016e-06, + "loss": 2.7756, + "num_input_tokens_seen": 8637644800, + "step": 16475 + }, + { + "epoch": 0.7992192093888859, + "grad_norm": 0.236328125, + "learning_rate": 5.310954002407012e-06, + "loss": 2.7858, + "num_input_tokens_seen": 8640266240, + "step": 16480 + }, + { + "epoch": 0.7994616909451325, + "grad_norm": 0.2353515625, + "learning_rate": 5.298606708574883e-06, + "loss": 2.7759, + "num_input_tokens_seen": 8642887680, + "step": 16485 + }, + { + "epoch": 0.7997041725013792, + "grad_norm": 0.244140625, + "learning_rate": 5.286272082861885e-06, + "loss": 2.7638, + "num_input_tokens_seen": 8645509120, + "step": 16490 + }, + { + "epoch": 0.7999466540576258, + "grad_norm": 0.2451171875, + "learning_rate": 5.2739501331992654e-06, + "loss": 2.7705, + "num_input_tokens_seen": 8648130560, + "step": 16495 + }, + { + "epoch": 0.8001891356138724, + "grad_norm": 0.2431640625, + "learning_rate": 5.261640867510118e-06, + "loss": 2.7757, + "num_input_tokens_seen": 8650752000, + "step": 16500 + }, + { + "epoch": 0.8001891356138724, + "eval_accuracy": 0.45599088096401236, + "eval_loss": 2.7416229248046875, + "eval_runtime": 5.8629, + "eval_samples_per_second": 51.169, + "eval_steps_per_second": 6.481, + "num_input_tokens_seen": 8650752000, + "step": 16500 + }, + { + "epoch": 0.800431617170119, + "grad_norm": 0.248046875, + "learning_rate": 5.249344293709374e-06, + "loss": 2.7701, + "num_input_tokens_seen": 8653373440, + "step": 16505 + }, + { + "epoch": 0.8006740987263656, + "grad_norm": 0.2431640625, + "learning_rate": 5.23706041970381e-06, + "loss": 2.7628, + "num_input_tokens_seen": 8655994880, + "step": 16510 + }, + { + "epoch": 0.8009165802826123, + "grad_norm": 0.2392578125, + "learning_rate": 5.224789253392032e-06, + "loss": 2.7705, + "num_input_tokens_seen": 8658616320, + "step": 16515 + }, + { + "epoch": 0.8011590618388589, + "grad_norm": 0.24609375, + "learning_rate": 5.212530802664478e-06, + "loss": 2.77, + "num_input_tokens_seen": 8661237760, + "step": 16520 + }, + { + "epoch": 0.8014015433951055, + "grad_norm": 0.2470703125, + "learning_rate": 5.200285075403408e-06, + "loss": 2.766, + "num_input_tokens_seen": 8663859200, + "step": 16525 + }, + { + "epoch": 0.8016440249513521, + "grad_norm": 0.2431640625, + "learning_rate": 5.188052079482899e-06, + "loss": 2.7898, + "num_input_tokens_seen": 8666480640, + "step": 16530 + }, + { + "epoch": 0.8018865065075987, + "grad_norm": 0.2421875, + "learning_rate": 5.175831822768848e-06, + "loss": 2.7635, + "num_input_tokens_seen": 8669102080, + "step": 16535 + }, + { + "epoch": 0.8021289880638454, + "grad_norm": 0.244140625, + "learning_rate": 5.163624313118956e-06, + "loss": 2.7692, + "num_input_tokens_seen": 8671723520, + "step": 16540 + }, + { + "epoch": 0.802371469620092, + "grad_norm": 0.240234375, + "learning_rate": 5.151429558382725e-06, + "loss": 2.7643, + "num_input_tokens_seen": 8674344960, + "step": 16545 + }, + { + "epoch": 0.8026139511763386, + "grad_norm": 0.251953125, + "learning_rate": 5.13924756640147e-06, + "loss": 2.7837, + "num_input_tokens_seen": 8676966400, + "step": 16550 + }, + { + "epoch": 0.8028564327325852, + "grad_norm": 0.240234375, + "learning_rate": 5.127078345008268e-06, + "loss": 2.7662, + "num_input_tokens_seen": 8679587840, + "step": 16555 + }, + { + "epoch": 0.803098914288832, + "grad_norm": 0.2421875, + "learning_rate": 5.1149219020280164e-06, + "loss": 2.78, + "num_input_tokens_seen": 8682209280, + "step": 16560 + }, + { + "epoch": 0.8033413958450786, + "grad_norm": 0.244140625, + "learning_rate": 5.1027782452773815e-06, + "loss": 2.7752, + "num_input_tokens_seen": 8684830720, + "step": 16565 + }, + { + "epoch": 0.8035838774013252, + "grad_norm": 0.2412109375, + "learning_rate": 5.0906473825648144e-06, + "loss": 2.7597, + "num_input_tokens_seen": 8687452160, + "step": 16570 + }, + { + "epoch": 0.8038263589575718, + "grad_norm": 0.26171875, + "learning_rate": 5.0785293216905314e-06, + "loss": 2.7859, + "num_input_tokens_seen": 8690073600, + "step": 16575 + }, + { + "epoch": 0.8040688405138184, + "grad_norm": 0.2412109375, + "learning_rate": 5.066424070446521e-06, + "loss": 2.7611, + "num_input_tokens_seen": 8692695040, + "step": 16580 + }, + { + "epoch": 0.8043113220700651, + "grad_norm": 0.2421875, + "learning_rate": 5.054331636616541e-06, + "loss": 2.7694, + "num_input_tokens_seen": 8695316480, + "step": 16585 + }, + { + "epoch": 0.8045538036263117, + "grad_norm": 0.240234375, + "learning_rate": 5.042252027976097e-06, + "loss": 2.7625, + "num_input_tokens_seen": 8697937920, + "step": 16590 + }, + { + "epoch": 0.8047962851825583, + "grad_norm": 0.2451171875, + "learning_rate": 5.030185252292452e-06, + "loss": 2.7724, + "num_input_tokens_seen": 8700559360, + "step": 16595 + }, + { + "epoch": 0.8050387667388049, + "grad_norm": 0.2412109375, + "learning_rate": 5.018131317324623e-06, + "loss": 2.7761, + "num_input_tokens_seen": 8703180800, + "step": 16600 + }, + { + "epoch": 0.8052812482950515, + "grad_norm": 0.2431640625, + "learning_rate": 5.006090230823366e-06, + "loss": 2.7766, + "num_input_tokens_seen": 8705802240, + "step": 16605 + }, + { + "epoch": 0.8055237298512982, + "grad_norm": 0.240234375, + "learning_rate": 4.994062000531175e-06, + "loss": 2.78, + "num_input_tokens_seen": 8708423680, + "step": 16610 + }, + { + "epoch": 0.8057662114075448, + "grad_norm": 0.244140625, + "learning_rate": 4.982046634182269e-06, + "loss": 2.7699, + "num_input_tokens_seen": 8711045120, + "step": 16615 + }, + { + "epoch": 0.8060086929637914, + "grad_norm": 0.2490234375, + "learning_rate": 4.970044139502608e-06, + "loss": 2.7882, + "num_input_tokens_seen": 8713666560, + "step": 16620 + }, + { + "epoch": 0.806251174520038, + "grad_norm": 0.2314453125, + "learning_rate": 4.958054524209873e-06, + "loss": 2.7768, + "num_input_tokens_seen": 8716288000, + "step": 16625 + }, + { + "epoch": 0.8064936560762846, + "grad_norm": 0.2392578125, + "learning_rate": 4.946077796013462e-06, + "loss": 2.7735, + "num_input_tokens_seen": 8718909440, + "step": 16630 + }, + { + "epoch": 0.8067361376325314, + "grad_norm": 0.2412109375, + "learning_rate": 4.934113962614484e-06, + "loss": 2.7737, + "num_input_tokens_seen": 8721530880, + "step": 16635 + }, + { + "epoch": 0.806978619188778, + "grad_norm": 0.25, + "learning_rate": 4.922163031705762e-06, + "loss": 2.7673, + "num_input_tokens_seen": 8724152320, + "step": 16640 + }, + { + "epoch": 0.8072211007450246, + "grad_norm": 0.24609375, + "learning_rate": 4.910225010971817e-06, + "loss": 2.7684, + "num_input_tokens_seen": 8726773760, + "step": 16645 + }, + { + "epoch": 0.8074635823012712, + "grad_norm": 0.23828125, + "learning_rate": 4.8982999080888684e-06, + "loss": 2.7862, + "num_input_tokens_seen": 8729395200, + "step": 16650 + }, + { + "epoch": 0.8077060638575179, + "grad_norm": 0.24609375, + "learning_rate": 4.886387730724837e-06, + "loss": 2.7664, + "num_input_tokens_seen": 8732016640, + "step": 16655 + }, + { + "epoch": 0.8079485454137645, + "grad_norm": 0.2421875, + "learning_rate": 4.874488486539325e-06, + "loss": 2.7739, + "num_input_tokens_seen": 8734638080, + "step": 16660 + }, + { + "epoch": 0.8081910269700111, + "grad_norm": 0.255859375, + "learning_rate": 4.862602183183623e-06, + "loss": 2.777, + "num_input_tokens_seen": 8737259520, + "step": 16665 + }, + { + "epoch": 0.8084335085262577, + "grad_norm": 0.240234375, + "learning_rate": 4.850728828300702e-06, + "loss": 2.7794, + "num_input_tokens_seen": 8739880960, + "step": 16670 + }, + { + "epoch": 0.8086759900825043, + "grad_norm": 0.2470703125, + "learning_rate": 4.838868429525189e-06, + "loss": 2.7603, + "num_input_tokens_seen": 8742502400, + "step": 16675 + }, + { + "epoch": 0.808918471638751, + "grad_norm": 0.240234375, + "learning_rate": 4.827020994483405e-06, + "loss": 2.779, + "num_input_tokens_seen": 8745123840, + "step": 16680 + }, + { + "epoch": 0.8091609531949976, + "grad_norm": 0.2431640625, + "learning_rate": 4.815186530793325e-06, + "loss": 2.7771, + "num_input_tokens_seen": 8747745280, + "step": 16685 + }, + { + "epoch": 0.8094034347512442, + "grad_norm": 0.24609375, + "learning_rate": 4.803365046064573e-06, + "loss": 2.7663, + "num_input_tokens_seen": 8750366720, + "step": 16690 + }, + { + "epoch": 0.8096459163074908, + "grad_norm": 0.2431640625, + "learning_rate": 4.791556547898454e-06, + "loss": 2.7674, + "num_input_tokens_seen": 8752988160, + "step": 16695 + }, + { + "epoch": 0.8098883978637375, + "grad_norm": 0.2373046875, + "learning_rate": 4.779761043887898e-06, + "loss": 2.7811, + "num_input_tokens_seen": 8755609600, + "step": 16700 + }, + { + "epoch": 0.8101308794199841, + "grad_norm": 0.2451171875, + "learning_rate": 4.767978541617493e-06, + "loss": 2.7719, + "num_input_tokens_seen": 8758231040, + "step": 16705 + }, + { + "epoch": 0.8103733609762307, + "grad_norm": 0.24609375, + "learning_rate": 4.756209048663454e-06, + "loss": 2.765, + "num_input_tokens_seen": 8760852480, + "step": 16710 + }, + { + "epoch": 0.8106158425324774, + "grad_norm": 0.2451171875, + "learning_rate": 4.744452572593638e-06, + "loss": 2.7773, + "num_input_tokens_seen": 8763473920, + "step": 16715 + }, + { + "epoch": 0.810858324088724, + "grad_norm": 0.2451171875, + "learning_rate": 4.732709120967541e-06, + "loss": 2.7854, + "num_input_tokens_seen": 8766095360, + "step": 16720 + }, + { + "epoch": 0.8111008056449707, + "grad_norm": 0.2412109375, + "learning_rate": 4.720978701336268e-06, + "loss": 2.7703, + "num_input_tokens_seen": 8768716800, + "step": 16725 + }, + { + "epoch": 0.8113432872012173, + "grad_norm": 0.240234375, + "learning_rate": 4.709261321242556e-06, + "loss": 2.7705, + "num_input_tokens_seen": 8771338240, + "step": 16730 + }, + { + "epoch": 0.8115857687574639, + "grad_norm": 0.2431640625, + "learning_rate": 4.697556988220758e-06, + "loss": 2.7867, + "num_input_tokens_seen": 8773959680, + "step": 16735 + }, + { + "epoch": 0.8118282503137105, + "grad_norm": 0.2431640625, + "learning_rate": 4.685865709796822e-06, + "loss": 2.7696, + "num_input_tokens_seen": 8776581120, + "step": 16740 + }, + { + "epoch": 0.8120707318699572, + "grad_norm": 0.25, + "learning_rate": 4.6741874934883165e-06, + "loss": 2.7622, + "num_input_tokens_seen": 8779202560, + "step": 16745 + }, + { + "epoch": 0.8123132134262038, + "grad_norm": 0.2431640625, + "learning_rate": 4.662522346804413e-06, + "loss": 2.7774, + "num_input_tokens_seen": 8781824000, + "step": 16750 + }, + { + "epoch": 0.8125556949824504, + "grad_norm": 0.2431640625, + "learning_rate": 4.650870277245872e-06, + "loss": 2.7759, + "num_input_tokens_seen": 8784445440, + "step": 16755 + }, + { + "epoch": 0.812798176538697, + "grad_norm": 0.2451171875, + "learning_rate": 4.639231292305049e-06, + "loss": 2.7717, + "num_input_tokens_seen": 8787066880, + "step": 16760 + }, + { + "epoch": 0.8130406580949436, + "grad_norm": 0.2431640625, + "learning_rate": 4.627605399465887e-06, + "loss": 2.7829, + "num_input_tokens_seen": 8789688320, + "step": 16765 + }, + { + "epoch": 0.8132831396511903, + "grad_norm": 0.25, + "learning_rate": 4.615992606203898e-06, + "loss": 2.7757, + "num_input_tokens_seen": 8792309760, + "step": 16770 + }, + { + "epoch": 0.8135256212074369, + "grad_norm": 0.2470703125, + "learning_rate": 4.604392919986183e-06, + "loss": 2.7807, + "num_input_tokens_seen": 8794931200, + "step": 16775 + }, + { + "epoch": 0.8137681027636835, + "grad_norm": 0.2431640625, + "learning_rate": 4.592806348271414e-06, + "loss": 2.7764, + "num_input_tokens_seen": 8797552640, + "step": 16780 + }, + { + "epoch": 0.8140105843199301, + "grad_norm": 0.240234375, + "learning_rate": 4.5812328985098325e-06, + "loss": 2.7687, + "num_input_tokens_seen": 8800174080, + "step": 16785 + }, + { + "epoch": 0.8142530658761767, + "grad_norm": 0.2470703125, + "learning_rate": 4.569672578143236e-06, + "loss": 2.7812, + "num_input_tokens_seen": 8802795520, + "step": 16790 + }, + { + "epoch": 0.8144955474324235, + "grad_norm": 0.2373046875, + "learning_rate": 4.55812539460499e-06, + "loss": 2.7851, + "num_input_tokens_seen": 8805416960, + "step": 16795 + }, + { + "epoch": 0.8147380289886701, + "grad_norm": 0.2451171875, + "learning_rate": 4.546591355319988e-06, + "loss": 2.7763, + "num_input_tokens_seen": 8808038400, + "step": 16800 + }, + { + "epoch": 0.8147380289886701, + "eval_accuracy": 0.4559338869890897, + "eval_loss": 2.741605520248413, + "eval_runtime": 5.8207, + "eval_samples_per_second": 51.54, + "eval_steps_per_second": 6.528, + "num_input_tokens_seen": 8808038400, + "step": 16800 + }, + { + "epoch": 0.8149805105449167, + "grad_norm": 0.244140625, + "learning_rate": 4.535070467704705e-06, + "loss": 2.7509, + "num_input_tokens_seen": 8810659840, + "step": 16805 + }, + { + "epoch": 0.8152229921011633, + "grad_norm": 0.240234375, + "learning_rate": 4.523562739167139e-06, + "loss": 2.7738, + "num_input_tokens_seen": 8813281280, + "step": 16810 + }, + { + "epoch": 0.81546547365741, + "grad_norm": 0.240234375, + "learning_rate": 4.512068177106834e-06, + "loss": 2.7704, + "num_input_tokens_seen": 8815902720, + "step": 16815 + }, + { + "epoch": 0.8157079552136566, + "grad_norm": 0.23828125, + "learning_rate": 4.500586788914862e-06, + "loss": 2.7768, + "num_input_tokens_seen": 8818524160, + "step": 16820 + }, + { + "epoch": 0.8159504367699032, + "grad_norm": 0.2412109375, + "learning_rate": 4.489118581973837e-06, + "loss": 2.7685, + "num_input_tokens_seen": 8821145600, + "step": 16825 + }, + { + "epoch": 0.8161929183261498, + "grad_norm": 0.2431640625, + "learning_rate": 4.477663563657871e-06, + "loss": 2.7703, + "num_input_tokens_seen": 8823767040, + "step": 16830 + }, + { + "epoch": 0.8164353998823964, + "grad_norm": 0.2412109375, + "learning_rate": 4.4662217413326244e-06, + "loss": 2.7731, + "num_input_tokens_seen": 8826388480, + "step": 16835 + }, + { + "epoch": 0.8166778814386431, + "grad_norm": 0.23828125, + "learning_rate": 4.454793122355253e-06, + "loss": 2.7761, + "num_input_tokens_seen": 8829009920, + "step": 16840 + }, + { + "epoch": 0.8169203629948897, + "grad_norm": 0.2431640625, + "learning_rate": 4.443377714074437e-06, + "loss": 2.7776, + "num_input_tokens_seen": 8831631360, + "step": 16845 + }, + { + "epoch": 0.8171628445511363, + "grad_norm": 0.240234375, + "learning_rate": 4.4319755238303535e-06, + "loss": 2.7614, + "num_input_tokens_seen": 8834252800, + "step": 16850 + }, + { + "epoch": 0.8174053261073829, + "grad_norm": 0.2470703125, + "learning_rate": 4.420586558954678e-06, + "loss": 2.7568, + "num_input_tokens_seen": 8836874240, + "step": 16855 + }, + { + "epoch": 0.8176478076636295, + "grad_norm": 0.2421875, + "learning_rate": 4.4092108267705935e-06, + "loss": 2.7777, + "num_input_tokens_seen": 8839495680, + "step": 16860 + }, + { + "epoch": 0.8178902892198762, + "grad_norm": 0.248046875, + "learning_rate": 4.39784833459276e-06, + "loss": 2.7866, + "num_input_tokens_seen": 8842117120, + "step": 16865 + }, + { + "epoch": 0.8181327707761228, + "grad_norm": 0.25, + "learning_rate": 4.386499089727336e-06, + "loss": 2.7815, + "num_input_tokens_seen": 8844738560, + "step": 16870 + }, + { + "epoch": 0.8183752523323695, + "grad_norm": 0.2353515625, + "learning_rate": 4.375163099471954e-06, + "loss": 2.7827, + "num_input_tokens_seen": 8847360000, + "step": 16875 + }, + { + "epoch": 0.8186177338886161, + "grad_norm": 0.2451171875, + "learning_rate": 4.36384037111573e-06, + "loss": 2.7754, + "num_input_tokens_seen": 8849981440, + "step": 16880 + }, + { + "epoch": 0.8188602154448628, + "grad_norm": 0.2431640625, + "learning_rate": 4.3525309119392454e-06, + "loss": 2.7746, + "num_input_tokens_seen": 8852602880, + "step": 16885 + }, + { + "epoch": 0.8191026970011094, + "grad_norm": 0.240234375, + "learning_rate": 4.34123472921456e-06, + "loss": 2.7826, + "num_input_tokens_seen": 8855224320, + "step": 16890 + }, + { + "epoch": 0.819345178557356, + "grad_norm": 0.2421875, + "learning_rate": 4.3299518302051785e-06, + "loss": 2.7716, + "num_input_tokens_seen": 8857845760, + "step": 16895 + }, + { + "epoch": 0.8195876601136026, + "grad_norm": 0.244140625, + "learning_rate": 4.318682222166082e-06, + "loss": 2.7766, + "num_input_tokens_seen": 8860467200, + "step": 16900 + }, + { + "epoch": 0.8198301416698492, + "grad_norm": 0.2490234375, + "learning_rate": 4.3074259123436985e-06, + "loss": 2.7779, + "num_input_tokens_seen": 8863088640, + "step": 16905 + }, + { + "epoch": 0.8200726232260959, + "grad_norm": 0.2431640625, + "learning_rate": 4.296182907975907e-06, + "loss": 2.7793, + "num_input_tokens_seen": 8865710080, + "step": 16910 + }, + { + "epoch": 0.8203151047823425, + "grad_norm": 0.236328125, + "learning_rate": 4.284953216292028e-06, + "loss": 2.7676, + "num_input_tokens_seen": 8868331520, + "step": 16915 + }, + { + "epoch": 0.8205575863385891, + "grad_norm": 0.24609375, + "learning_rate": 4.273736844512824e-06, + "loss": 2.7754, + "num_input_tokens_seen": 8870952960, + "step": 16920 + }, + { + "epoch": 0.8208000678948357, + "grad_norm": 0.23828125, + "learning_rate": 4.262533799850494e-06, + "loss": 2.7845, + "num_input_tokens_seen": 8873574400, + "step": 16925 + }, + { + "epoch": 0.8210425494510823, + "grad_norm": 0.2392578125, + "learning_rate": 4.251344089508661e-06, + "loss": 2.77, + "num_input_tokens_seen": 8876195840, + "step": 16930 + }, + { + "epoch": 0.821285031007329, + "grad_norm": 0.2431640625, + "learning_rate": 4.240167720682384e-06, + "loss": 2.7704, + "num_input_tokens_seen": 8878817280, + "step": 16935 + }, + { + "epoch": 0.8215275125635756, + "grad_norm": 0.2451171875, + "learning_rate": 4.229004700558134e-06, + "loss": 2.7912, + "num_input_tokens_seen": 8881438720, + "step": 16940 + }, + { + "epoch": 0.8217699941198222, + "grad_norm": 0.236328125, + "learning_rate": 4.217855036313806e-06, + "loss": 2.7766, + "num_input_tokens_seen": 8884060160, + "step": 16945 + }, + { + "epoch": 0.8220124756760689, + "grad_norm": 0.248046875, + "learning_rate": 4.206718735118706e-06, + "loss": 2.782, + "num_input_tokens_seen": 8886681600, + "step": 16950 + }, + { + "epoch": 0.8222549572323156, + "grad_norm": 0.2451171875, + "learning_rate": 4.1955958041335395e-06, + "loss": 2.774, + "num_input_tokens_seen": 8889303040, + "step": 16955 + }, + { + "epoch": 0.8224974387885622, + "grad_norm": 0.2470703125, + "learning_rate": 4.18448625051042e-06, + "loss": 2.7903, + "num_input_tokens_seen": 8891924480, + "step": 16960 + }, + { + "epoch": 0.8227399203448088, + "grad_norm": 0.244140625, + "learning_rate": 4.173390081392864e-06, + "loss": 2.7809, + "num_input_tokens_seen": 8894545920, + "step": 16965 + }, + { + "epoch": 0.8229824019010554, + "grad_norm": 0.2412109375, + "learning_rate": 4.162307303915777e-06, + "loss": 2.7625, + "num_input_tokens_seen": 8897167360, + "step": 16970 + }, + { + "epoch": 0.823224883457302, + "grad_norm": 0.2431640625, + "learning_rate": 4.151237925205448e-06, + "loss": 2.7629, + "num_input_tokens_seen": 8899788800, + "step": 16975 + }, + { + "epoch": 0.8234673650135487, + "grad_norm": 0.2373046875, + "learning_rate": 4.140181952379574e-06, + "loss": 2.7776, + "num_input_tokens_seen": 8902410240, + "step": 16980 + }, + { + "epoch": 0.8237098465697953, + "grad_norm": 0.2470703125, + "learning_rate": 4.129139392547199e-06, + "loss": 2.7826, + "num_input_tokens_seen": 8905031680, + "step": 16985 + }, + { + "epoch": 0.8239523281260419, + "grad_norm": 0.2392578125, + "learning_rate": 4.11811025280876e-06, + "loss": 2.765, + "num_input_tokens_seen": 8907653120, + "step": 16990 + }, + { + "epoch": 0.8241948096822885, + "grad_norm": 0.2451171875, + "learning_rate": 4.107094540256065e-06, + "loss": 2.7687, + "num_input_tokens_seen": 8910274560, + "step": 16995 + }, + { + "epoch": 0.8244372912385352, + "grad_norm": 0.2490234375, + "learning_rate": 4.09609226197229e-06, + "loss": 2.7773, + "num_input_tokens_seen": 8912896000, + "step": 17000 + }, + { + "epoch": 0.8246797727947818, + "grad_norm": 0.2412109375, + "learning_rate": 4.085103425031961e-06, + "loss": 2.7758, + "num_input_tokens_seen": 8915517440, + "step": 17005 + }, + { + "epoch": 0.8249222543510284, + "grad_norm": 0.2451171875, + "learning_rate": 4.074128036500977e-06, + "loss": 2.7864, + "num_input_tokens_seen": 8918138880, + "step": 17010 + }, + { + "epoch": 0.825164735907275, + "grad_norm": 0.2392578125, + "learning_rate": 4.06316610343658e-06, + "loss": 2.7751, + "num_input_tokens_seen": 8920760320, + "step": 17015 + }, + { + "epoch": 0.8254072174635216, + "grad_norm": 0.2412109375, + "learning_rate": 4.052217632887354e-06, + "loss": 2.7696, + "num_input_tokens_seen": 8923381760, + "step": 17020 + }, + { + "epoch": 0.8256496990197683, + "grad_norm": 0.2431640625, + "learning_rate": 4.041282631893239e-06, + "loss": 2.7632, + "num_input_tokens_seen": 8926003200, + "step": 17025 + }, + { + "epoch": 0.825892180576015, + "grad_norm": 0.2431640625, + "learning_rate": 4.030361107485503e-06, + "loss": 2.7936, + "num_input_tokens_seen": 8928624640, + "step": 17030 + }, + { + "epoch": 0.8261346621322616, + "grad_norm": 0.251953125, + "learning_rate": 4.019453066686768e-06, + "loss": 2.7729, + "num_input_tokens_seen": 8931246080, + "step": 17035 + }, + { + "epoch": 0.8263771436885082, + "grad_norm": 0.244140625, + "learning_rate": 4.008558516510966e-06, + "loss": 2.7768, + "num_input_tokens_seen": 8933867520, + "step": 17040 + }, + { + "epoch": 0.8266196252447549, + "grad_norm": 0.240234375, + "learning_rate": 3.997677463963364e-06, + "loss": 2.7713, + "num_input_tokens_seen": 8936488960, + "step": 17045 + }, + { + "epoch": 0.8268621068010015, + "grad_norm": 0.240234375, + "learning_rate": 3.986809916040538e-06, + "loss": 2.767, + "num_input_tokens_seen": 8939110400, + "step": 17050 + }, + { + "epoch": 0.8271045883572481, + "grad_norm": 0.2412109375, + "learning_rate": 3.975955879730392e-06, + "loss": 2.7786, + "num_input_tokens_seen": 8941731840, + "step": 17055 + }, + { + "epoch": 0.8273470699134947, + "grad_norm": 0.2421875, + "learning_rate": 3.965115362012145e-06, + "loss": 2.7786, + "num_input_tokens_seen": 8944353280, + "step": 17060 + }, + { + "epoch": 0.8275895514697413, + "grad_norm": 0.2451171875, + "learning_rate": 3.95428836985631e-06, + "loss": 2.7854, + "num_input_tokens_seen": 8946974720, + "step": 17065 + }, + { + "epoch": 0.827832033025988, + "grad_norm": 0.2431640625, + "learning_rate": 3.943474910224717e-06, + "loss": 2.7802, + "num_input_tokens_seen": 8949596160, + "step": 17070 + }, + { + "epoch": 0.8280745145822346, + "grad_norm": 0.24609375, + "learning_rate": 3.932674990070492e-06, + "loss": 2.7667, + "num_input_tokens_seen": 8952217600, + "step": 17075 + }, + { + "epoch": 0.8283169961384812, + "grad_norm": 0.2412109375, + "learning_rate": 3.9218886163380406e-06, + "loss": 2.778, + "num_input_tokens_seen": 8954839040, + "step": 17080 + }, + { + "epoch": 0.8285594776947278, + "grad_norm": 0.2412109375, + "learning_rate": 3.911115795963066e-06, + "loss": 2.757, + "num_input_tokens_seen": 8957460480, + "step": 17085 + }, + { + "epoch": 0.8288019592509744, + "grad_norm": 0.2431640625, + "learning_rate": 3.900356535872574e-06, + "loss": 2.7719, + "num_input_tokens_seen": 8960081920, + "step": 17090 + }, + { + "epoch": 0.8290444408072211, + "grad_norm": 0.244140625, + "learning_rate": 3.889610842984826e-06, + "loss": 2.7834, + "num_input_tokens_seen": 8962703360, + "step": 17095 + }, + { + "epoch": 0.8292869223634677, + "grad_norm": 0.2421875, + "learning_rate": 3.878878724209373e-06, + "loss": 2.7581, + "num_input_tokens_seen": 8965324800, + "step": 17100 + }, + { + "epoch": 0.8292869223634677, + "eval_accuracy": 0.45594365738479076, + "eval_loss": 2.7415835857391357, + "eval_runtime": 5.7955, + "eval_samples_per_second": 51.764, + "eval_steps_per_second": 6.557, + "num_input_tokens_seen": 8965324800, + "step": 17100 + }, + { + "epoch": 0.8295294039197143, + "grad_norm": 0.236328125, + "learning_rate": 3.868160186447039e-06, + "loss": 2.7817, + "num_input_tokens_seen": 8967946240, + "step": 17105 + }, + { + "epoch": 0.829771885475961, + "grad_norm": 0.2421875, + "learning_rate": 3.857455236589902e-06, + "loss": 2.771, + "num_input_tokens_seen": 8970567680, + "step": 17110 + }, + { + "epoch": 0.8300143670322077, + "grad_norm": 0.25390625, + "learning_rate": 3.846763881521315e-06, + "loss": 2.7916, + "num_input_tokens_seen": 8973189120, + "step": 17115 + }, + { + "epoch": 0.8302568485884543, + "grad_norm": 0.2431640625, + "learning_rate": 3.836086128115884e-06, + "loss": 2.7745, + "num_input_tokens_seen": 8975810560, + "step": 17120 + }, + { + "epoch": 0.8304993301447009, + "grad_norm": 0.2431640625, + "learning_rate": 3.825421983239477e-06, + "loss": 2.7617, + "num_input_tokens_seen": 8978432000, + "step": 17125 + }, + { + "epoch": 0.8307418117009475, + "grad_norm": 0.2451171875, + "learning_rate": 3.8147714537492e-06, + "loss": 2.7784, + "num_input_tokens_seen": 8981053440, + "step": 17130 + }, + { + "epoch": 0.8309842932571941, + "grad_norm": 0.23828125, + "learning_rate": 3.804134546493415e-06, + "loss": 2.7833, + "num_input_tokens_seen": 8983674880, + "step": 17135 + }, + { + "epoch": 0.8312267748134408, + "grad_norm": 0.2431640625, + "learning_rate": 3.793511268311717e-06, + "loss": 2.7544, + "num_input_tokens_seen": 8986296320, + "step": 17140 + }, + { + "epoch": 0.8314692563696874, + "grad_norm": 0.2421875, + "learning_rate": 3.7829016260349405e-06, + "loss": 2.785, + "num_input_tokens_seen": 8988917760, + "step": 17145 + }, + { + "epoch": 0.831711737925934, + "grad_norm": 0.240234375, + "learning_rate": 3.772305626485151e-06, + "loss": 2.7674, + "num_input_tokens_seen": 8991539200, + "step": 17150 + }, + { + "epoch": 0.8319542194821806, + "grad_norm": 0.244140625, + "learning_rate": 3.7617232764756455e-06, + "loss": 2.7631, + "num_input_tokens_seen": 8994160640, + "step": 17155 + }, + { + "epoch": 0.8321967010384272, + "grad_norm": 0.2490234375, + "learning_rate": 3.7511545828109396e-06, + "loss": 2.7729, + "num_input_tokens_seen": 8996782080, + "step": 17160 + }, + { + "epoch": 0.8324391825946739, + "grad_norm": 0.23828125, + "learning_rate": 3.7405995522867692e-06, + "loss": 2.7832, + "num_input_tokens_seen": 8999403520, + "step": 17165 + }, + { + "epoch": 0.8326816641509205, + "grad_norm": 0.2412109375, + "learning_rate": 3.730058191690089e-06, + "loss": 2.7824, + "num_input_tokens_seen": 9002024960, + "step": 17170 + }, + { + "epoch": 0.8329241457071671, + "grad_norm": 0.2421875, + "learning_rate": 3.7195305077990544e-06, + "loss": 2.7762, + "num_input_tokens_seen": 9004646400, + "step": 17175 + }, + { + "epoch": 0.8331666272634137, + "grad_norm": 0.2392578125, + "learning_rate": 3.7090165073830315e-06, + "loss": 2.7828, + "num_input_tokens_seen": 9007267840, + "step": 17180 + }, + { + "epoch": 0.8334091088196603, + "grad_norm": 0.2451171875, + "learning_rate": 3.6985161972025896e-06, + "loss": 2.7801, + "num_input_tokens_seen": 9009889280, + "step": 17185 + }, + { + "epoch": 0.8336515903759071, + "grad_norm": 0.2451171875, + "learning_rate": 3.6880295840094947e-06, + "loss": 2.7699, + "num_input_tokens_seen": 9012510720, + "step": 17190 + }, + { + "epoch": 0.8338940719321537, + "grad_norm": 0.2353515625, + "learning_rate": 3.677556674546706e-06, + "loss": 2.7702, + "num_input_tokens_seen": 9015132160, + "step": 17195 + }, + { + "epoch": 0.8341365534884003, + "grad_norm": 0.23828125, + "learning_rate": 3.6670974755483673e-06, + "loss": 2.7698, + "num_input_tokens_seen": 9017753600, + "step": 17200 + }, + { + "epoch": 0.8343790350446469, + "grad_norm": 0.25390625, + "learning_rate": 3.65665199373981e-06, + "loss": 2.7663, + "num_input_tokens_seen": 9020375040, + "step": 17205 + }, + { + "epoch": 0.8346215166008936, + "grad_norm": 0.2421875, + "learning_rate": 3.6462202358375468e-06, + "loss": 2.77, + "num_input_tokens_seen": 9022996480, + "step": 17210 + }, + { + "epoch": 0.8348639981571402, + "grad_norm": 0.2421875, + "learning_rate": 3.6358022085492576e-06, + "loss": 2.781, + "num_input_tokens_seen": 9025617920, + "step": 17215 + }, + { + "epoch": 0.8351064797133868, + "grad_norm": 0.244140625, + "learning_rate": 3.625397918573806e-06, + "loss": 2.7623, + "num_input_tokens_seen": 9028239360, + "step": 17220 + }, + { + "epoch": 0.8353489612696334, + "grad_norm": 0.2431640625, + "learning_rate": 3.615007372601209e-06, + "loss": 2.7775, + "num_input_tokens_seen": 9030860800, + "step": 17225 + }, + { + "epoch": 0.83559144282588, + "grad_norm": 0.2470703125, + "learning_rate": 3.604630577312662e-06, + "loss": 2.7881, + "num_input_tokens_seen": 9033482240, + "step": 17230 + }, + { + "epoch": 0.8358339243821267, + "grad_norm": 0.2431640625, + "learning_rate": 3.594267539380497e-06, + "loss": 2.7761, + "num_input_tokens_seen": 9036103680, + "step": 17235 + }, + { + "epoch": 0.8360764059383733, + "grad_norm": 0.2421875, + "learning_rate": 3.5839182654682197e-06, + "loss": 2.7679, + "num_input_tokens_seen": 9038725120, + "step": 17240 + }, + { + "epoch": 0.8363188874946199, + "grad_norm": 0.2412109375, + "learning_rate": 3.5735827622304763e-06, + "loss": 2.7748, + "num_input_tokens_seen": 9041346560, + "step": 17245 + }, + { + "epoch": 0.8365613690508665, + "grad_norm": 0.23828125, + "learning_rate": 3.563261036313059e-06, + "loss": 2.7666, + "num_input_tokens_seen": 9043968000, + "step": 17250 + }, + { + "epoch": 0.8368038506071132, + "grad_norm": 0.244140625, + "learning_rate": 3.552953094352904e-06, + "loss": 2.7714, + "num_input_tokens_seen": 9046589440, + "step": 17255 + }, + { + "epoch": 0.8370463321633598, + "grad_norm": 0.2373046875, + "learning_rate": 3.5426589429780803e-06, + "loss": 2.7695, + "num_input_tokens_seen": 9049210880, + "step": 17260 + }, + { + "epoch": 0.8372888137196065, + "grad_norm": 0.244140625, + "learning_rate": 3.5323785888077942e-06, + "loss": 2.774, + "num_input_tokens_seen": 9051832320, + "step": 17265 + }, + { + "epoch": 0.8375312952758531, + "grad_norm": 0.2421875, + "learning_rate": 3.522112038452377e-06, + "loss": 2.7805, + "num_input_tokens_seen": 9054453760, + "step": 17270 + }, + { + "epoch": 0.8377737768320997, + "grad_norm": 0.25, + "learning_rate": 3.5118592985132843e-06, + "loss": 2.7826, + "num_input_tokens_seen": 9057075200, + "step": 17275 + }, + { + "epoch": 0.8380162583883464, + "grad_norm": 0.251953125, + "learning_rate": 3.5016203755830924e-06, + "loss": 2.7708, + "num_input_tokens_seen": 9059696640, + "step": 17280 + }, + { + "epoch": 0.838258739944593, + "grad_norm": 0.236328125, + "learning_rate": 3.4913952762454904e-06, + "loss": 2.7727, + "num_input_tokens_seen": 9062318080, + "step": 17285 + }, + { + "epoch": 0.8385012215008396, + "grad_norm": 0.23828125, + "learning_rate": 3.4811840070752914e-06, + "loss": 2.7693, + "num_input_tokens_seen": 9064939520, + "step": 17290 + }, + { + "epoch": 0.8387437030570862, + "grad_norm": 0.2373046875, + "learning_rate": 3.4709865746383906e-06, + "loss": 2.769, + "num_input_tokens_seen": 9067560960, + "step": 17295 + }, + { + "epoch": 0.8389861846133329, + "grad_norm": 0.2412109375, + "learning_rate": 3.4608029854918095e-06, + "loss": 2.7697, + "num_input_tokens_seen": 9070182400, + "step": 17300 + }, + { + "epoch": 0.8392286661695795, + "grad_norm": 0.244140625, + "learning_rate": 3.4506332461836543e-06, + "loss": 2.7832, + "num_input_tokens_seen": 9072803840, + "step": 17305 + }, + { + "epoch": 0.8394711477258261, + "grad_norm": 0.2412109375, + "learning_rate": 3.4404773632531363e-06, + "loss": 2.7537, + "num_input_tokens_seen": 9075425280, + "step": 17310 + }, + { + "epoch": 0.8397136292820727, + "grad_norm": 0.2431640625, + "learning_rate": 3.430335343230545e-06, + "loss": 2.7754, + "num_input_tokens_seen": 9078046720, + "step": 17315 + }, + { + "epoch": 0.8399561108383193, + "grad_norm": 0.2412109375, + "learning_rate": 3.420207192637273e-06, + "loss": 2.7824, + "num_input_tokens_seen": 9080668160, + "step": 17320 + }, + { + "epoch": 0.840198592394566, + "grad_norm": 0.2451171875, + "learning_rate": 3.4100929179857827e-06, + "loss": 2.7847, + "num_input_tokens_seen": 9083289600, + "step": 17325 + }, + { + "epoch": 0.8404410739508126, + "grad_norm": 0.2392578125, + "learning_rate": 3.399992525779608e-06, + "loss": 2.772, + "num_input_tokens_seen": 9085911040, + "step": 17330 + }, + { + "epoch": 0.8406835555070592, + "grad_norm": 0.2421875, + "learning_rate": 3.389906022513367e-06, + "loss": 2.7597, + "num_input_tokens_seen": 9088532480, + "step": 17335 + }, + { + "epoch": 0.8409260370633058, + "grad_norm": 0.2412109375, + "learning_rate": 3.379833414672748e-06, + "loss": 2.7796, + "num_input_tokens_seen": 9091153920, + "step": 17340 + }, + { + "epoch": 0.8411685186195526, + "grad_norm": 0.23828125, + "learning_rate": 3.3697747087344996e-06, + "loss": 2.7706, + "num_input_tokens_seen": 9093775360, + "step": 17345 + }, + { + "epoch": 0.8414110001757992, + "grad_norm": 0.2470703125, + "learning_rate": 3.359729911166429e-06, + "loss": 2.7746, + "num_input_tokens_seen": 9096396800, + "step": 17350 + }, + { + "epoch": 0.8416534817320458, + "grad_norm": 0.2421875, + "learning_rate": 3.349699028427414e-06, + "loss": 2.7739, + "num_input_tokens_seen": 9099018240, + "step": 17355 + }, + { + "epoch": 0.8418959632882924, + "grad_norm": 0.2470703125, + "learning_rate": 3.339682066967362e-06, + "loss": 2.7794, + "num_input_tokens_seen": 9101639680, + "step": 17360 + }, + { + "epoch": 0.842138444844539, + "grad_norm": 0.2421875, + "learning_rate": 3.329679033227248e-06, + "loss": 2.7765, + "num_input_tokens_seen": 9104261120, + "step": 17365 + }, + { + "epoch": 0.8423809264007857, + "grad_norm": 0.240234375, + "learning_rate": 3.319689933639078e-06, + "loss": 2.7758, + "num_input_tokens_seen": 9106882560, + "step": 17370 + }, + { + "epoch": 0.8426234079570323, + "grad_norm": 0.2392578125, + "learning_rate": 3.3097147746259187e-06, + "loss": 2.7804, + "num_input_tokens_seen": 9109504000, + "step": 17375 + }, + { + "epoch": 0.8428658895132789, + "grad_norm": 0.2431640625, + "learning_rate": 3.299753562601854e-06, + "loss": 2.7664, + "num_input_tokens_seen": 9112125440, + "step": 17380 + }, + { + "epoch": 0.8431083710695255, + "grad_norm": 0.234375, + "learning_rate": 3.2898063039720096e-06, + "loss": 2.7705, + "num_input_tokens_seen": 9114746880, + "step": 17385 + }, + { + "epoch": 0.8433508526257721, + "grad_norm": 0.240234375, + "learning_rate": 3.279873005132525e-06, + "loss": 2.7841, + "num_input_tokens_seen": 9117368320, + "step": 17390 + }, + { + "epoch": 0.8435933341820188, + "grad_norm": 0.24609375, + "learning_rate": 3.26995367247058e-06, + "loss": 2.7931, + "num_input_tokens_seen": 9119989760, + "step": 17395 + }, + { + "epoch": 0.8438358157382654, + "grad_norm": 0.2373046875, + "learning_rate": 3.2600483123643665e-06, + "loss": 2.7719, + "num_input_tokens_seen": 9122611200, + "step": 17400 + }, + { + "epoch": 0.8438358157382654, + "eval_accuracy": 0.45597296857189384, + "eval_loss": 2.741610288619995, + "eval_runtime": 5.8647, + "eval_samples_per_second": 51.153, + "eval_steps_per_second": 6.479, + "num_input_tokens_seen": 9122611200, + "step": 17400 + }, + { + "epoch": 0.844078297294512, + "grad_norm": 0.236328125, + "learning_rate": 3.2501569311830904e-06, + "loss": 2.7818, + "num_input_tokens_seen": 9125232640, + "step": 17405 + }, + { + "epoch": 0.8443207788507586, + "grad_norm": 0.244140625, + "learning_rate": 3.2402795352869774e-06, + "loss": 2.7849, + "num_input_tokens_seen": 9127854080, + "step": 17410 + }, + { + "epoch": 0.8445632604070052, + "grad_norm": 0.2451171875, + "learning_rate": 3.2304161310272556e-06, + "loss": 2.7755, + "num_input_tokens_seen": 9130475520, + "step": 17415 + }, + { + "epoch": 0.8448057419632519, + "grad_norm": 0.25, + "learning_rate": 3.220566724746141e-06, + "loss": 2.7745, + "num_input_tokens_seen": 9133096960, + "step": 17420 + }, + { + "epoch": 0.8450482235194986, + "grad_norm": 0.240234375, + "learning_rate": 3.2107313227768803e-06, + "loss": 2.7735, + "num_input_tokens_seen": 9135718400, + "step": 17425 + }, + { + "epoch": 0.8452907050757452, + "grad_norm": 0.2392578125, + "learning_rate": 3.200909931443691e-06, + "loss": 2.7659, + "num_input_tokens_seen": 9138339840, + "step": 17430 + }, + { + "epoch": 0.8455331866319918, + "grad_norm": 0.2373046875, + "learning_rate": 3.1911025570617924e-06, + "loss": 2.7857, + "num_input_tokens_seen": 9140961280, + "step": 17435 + }, + { + "epoch": 0.8457756681882385, + "grad_norm": 0.244140625, + "learning_rate": 3.1813092059373857e-06, + "loss": 2.7647, + "num_input_tokens_seen": 9143582720, + "step": 17440 + }, + { + "epoch": 0.8460181497444851, + "grad_norm": 0.2451171875, + "learning_rate": 3.1715298843676606e-06, + "loss": 2.7746, + "num_input_tokens_seen": 9146204160, + "step": 17445 + }, + { + "epoch": 0.8462606313007317, + "grad_norm": 0.2451171875, + "learning_rate": 3.1617645986407733e-06, + "loss": 2.7904, + "num_input_tokens_seen": 9148825600, + "step": 17450 + }, + { + "epoch": 0.8465031128569783, + "grad_norm": 0.2431640625, + "learning_rate": 3.152013355035871e-06, + "loss": 2.7764, + "num_input_tokens_seen": 9151447040, + "step": 17455 + }, + { + "epoch": 0.846745594413225, + "grad_norm": 0.2373046875, + "learning_rate": 3.1422761598230598e-06, + "loss": 2.7668, + "num_input_tokens_seen": 9154068480, + "step": 17460 + }, + { + "epoch": 0.8469880759694716, + "grad_norm": 0.244140625, + "learning_rate": 3.1325530192634207e-06, + "loss": 2.76, + "num_input_tokens_seen": 9156689920, + "step": 17465 + }, + { + "epoch": 0.8472305575257182, + "grad_norm": 0.2421875, + "learning_rate": 3.1228439396089936e-06, + "loss": 2.7694, + "num_input_tokens_seen": 9159311360, + "step": 17470 + }, + { + "epoch": 0.8474730390819648, + "grad_norm": 0.2353515625, + "learning_rate": 3.1131489271027743e-06, + "loss": 2.763, + "num_input_tokens_seen": 9161932800, + "step": 17475 + }, + { + "epoch": 0.8477155206382114, + "grad_norm": 0.248046875, + "learning_rate": 3.10346798797872e-06, + "loss": 2.7746, + "num_input_tokens_seen": 9164554240, + "step": 17480 + }, + { + "epoch": 0.847958002194458, + "grad_norm": 0.2412109375, + "learning_rate": 3.093801128461735e-06, + "loss": 2.7769, + "num_input_tokens_seen": 9167175680, + "step": 17485 + }, + { + "epoch": 0.8482004837507047, + "grad_norm": 0.240234375, + "learning_rate": 3.0841483547676656e-06, + "loss": 2.7659, + "num_input_tokens_seen": 9169797120, + "step": 17490 + }, + { + "epoch": 0.8484429653069513, + "grad_norm": 0.23828125, + "learning_rate": 3.0745096731033124e-06, + "loss": 2.7704, + "num_input_tokens_seen": 9172418560, + "step": 17495 + }, + { + "epoch": 0.848685446863198, + "grad_norm": 0.23828125, + "learning_rate": 3.0648850896664054e-06, + "loss": 2.7727, + "num_input_tokens_seen": 9175040000, + "step": 17500 + }, + { + "epoch": 0.8489279284194446, + "grad_norm": 0.244140625, + "learning_rate": 3.0552746106456087e-06, + "loss": 2.7707, + "num_input_tokens_seen": 9177661440, + "step": 17505 + }, + { + "epoch": 0.8491704099756913, + "grad_norm": 0.240234375, + "learning_rate": 3.0456782422205313e-06, + "loss": 2.7882, + "num_input_tokens_seen": 9180282880, + "step": 17510 + }, + { + "epoch": 0.8494128915319379, + "grad_norm": 0.24609375, + "learning_rate": 3.0360959905616825e-06, + "loss": 2.7695, + "num_input_tokens_seen": 9182904320, + "step": 17515 + }, + { + "epoch": 0.8496553730881845, + "grad_norm": 0.2470703125, + "learning_rate": 3.026527861830519e-06, + "loss": 2.7869, + "num_input_tokens_seen": 9185525760, + "step": 17520 + }, + { + "epoch": 0.8498978546444311, + "grad_norm": 0.255859375, + "learning_rate": 3.016973862179406e-06, + "loss": 2.7681, + "num_input_tokens_seen": 9188147200, + "step": 17525 + }, + { + "epoch": 0.8501403362006777, + "grad_norm": 0.2451171875, + "learning_rate": 3.007433997751624e-06, + "loss": 2.7779, + "num_input_tokens_seen": 9190768640, + "step": 17530 + }, + { + "epoch": 0.8503828177569244, + "grad_norm": 0.2373046875, + "learning_rate": 2.9979082746813684e-06, + "loss": 2.7594, + "num_input_tokens_seen": 9193390080, + "step": 17535 + }, + { + "epoch": 0.850625299313171, + "grad_norm": 0.2470703125, + "learning_rate": 2.9883966990937374e-06, + "loss": 2.7668, + "num_input_tokens_seen": 9196011520, + "step": 17540 + }, + { + "epoch": 0.8508677808694176, + "grad_norm": 0.2412109375, + "learning_rate": 2.9788992771047324e-06, + "loss": 2.7633, + "num_input_tokens_seen": 9198632960, + "step": 17545 + }, + { + "epoch": 0.8511102624256642, + "grad_norm": 0.240234375, + "learning_rate": 2.9694160148212554e-06, + "loss": 2.7752, + "num_input_tokens_seen": 9201254400, + "step": 17550 + }, + { + "epoch": 0.8513527439819109, + "grad_norm": 0.24609375, + "learning_rate": 2.9599469183411027e-06, + "loss": 2.7721, + "num_input_tokens_seen": 9203875840, + "step": 17555 + }, + { + "epoch": 0.8515952255381575, + "grad_norm": 0.2412109375, + "learning_rate": 2.9504919937529657e-06, + "loss": 2.7741, + "num_input_tokens_seen": 9206497280, + "step": 17560 + }, + { + "epoch": 0.8518377070944041, + "grad_norm": 0.2412109375, + "learning_rate": 2.941051247136417e-06, + "loss": 2.7825, + "num_input_tokens_seen": 9209118720, + "step": 17565 + }, + { + "epoch": 0.8520801886506507, + "grad_norm": 0.236328125, + "learning_rate": 2.931624684561923e-06, + "loss": 2.7743, + "num_input_tokens_seen": 9211740160, + "step": 17570 + }, + { + "epoch": 0.8523226702068973, + "grad_norm": 0.2392578125, + "learning_rate": 2.9222123120908108e-06, + "loss": 2.784, + "num_input_tokens_seen": 9214361600, + "step": 17575 + }, + { + "epoch": 0.8525651517631441, + "grad_norm": 0.23828125, + "learning_rate": 2.912814135775299e-06, + "loss": 2.7787, + "num_input_tokens_seen": 9216983040, + "step": 17580 + }, + { + "epoch": 0.8528076333193907, + "grad_norm": 0.2451171875, + "learning_rate": 2.903430161658477e-06, + "loss": 2.7779, + "num_input_tokens_seen": 9219604480, + "step": 17585 + }, + { + "epoch": 0.8530501148756373, + "grad_norm": 0.236328125, + "learning_rate": 2.8940603957742952e-06, + "loss": 2.7788, + "num_input_tokens_seen": 9222225920, + "step": 17590 + }, + { + "epoch": 0.8532925964318839, + "grad_norm": 0.2421875, + "learning_rate": 2.884704844147576e-06, + "loss": 2.7872, + "num_input_tokens_seen": 9224847360, + "step": 17595 + }, + { + "epoch": 0.8535350779881306, + "grad_norm": 0.2470703125, + "learning_rate": 2.8753635127939937e-06, + "loss": 2.7634, + "num_input_tokens_seen": 9227468800, + "step": 17600 + }, + { + "epoch": 0.8537775595443772, + "grad_norm": 0.232421875, + "learning_rate": 2.8660364077200824e-06, + "loss": 2.776, + "num_input_tokens_seen": 9230090240, + "step": 17605 + }, + { + "epoch": 0.8540200411006238, + "grad_norm": 0.23828125, + "learning_rate": 2.8567235349232334e-06, + "loss": 2.7772, + "num_input_tokens_seen": 9232711680, + "step": 17610 + }, + { + "epoch": 0.8542625226568704, + "grad_norm": 0.248046875, + "learning_rate": 2.8474249003916762e-06, + "loss": 2.7883, + "num_input_tokens_seen": 9235333120, + "step": 17615 + }, + { + "epoch": 0.854505004213117, + "grad_norm": 0.2431640625, + "learning_rate": 2.838140510104498e-06, + "loss": 2.7755, + "num_input_tokens_seen": 9237954560, + "step": 17620 + }, + { + "epoch": 0.8547474857693637, + "grad_norm": 0.240234375, + "learning_rate": 2.828870370031614e-06, + "loss": 2.7773, + "num_input_tokens_seen": 9240576000, + "step": 17625 + }, + { + "epoch": 0.8549899673256103, + "grad_norm": 0.2421875, + "learning_rate": 2.8196144861337896e-06, + "loss": 2.7741, + "num_input_tokens_seen": 9243197440, + "step": 17630 + }, + { + "epoch": 0.8552324488818569, + "grad_norm": 0.2412109375, + "learning_rate": 2.8103728643626064e-06, + "loss": 2.7591, + "num_input_tokens_seen": 9245818880, + "step": 17635 + }, + { + "epoch": 0.8554749304381035, + "grad_norm": 0.2392578125, + "learning_rate": 2.8011455106604882e-06, + "loss": 2.7689, + "num_input_tokens_seen": 9248440320, + "step": 17640 + }, + { + "epoch": 0.8557174119943501, + "grad_norm": 0.24609375, + "learning_rate": 2.7919324309606836e-06, + "loss": 2.782, + "num_input_tokens_seen": 9251061760, + "step": 17645 + }, + { + "epoch": 0.8559598935505968, + "grad_norm": 0.240234375, + "learning_rate": 2.7827336311872563e-06, + "loss": 2.7647, + "num_input_tokens_seen": 9253683200, + "step": 17650 + }, + { + "epoch": 0.8562023751068434, + "grad_norm": 0.23828125, + "learning_rate": 2.773549117255095e-06, + "loss": 2.7577, + "num_input_tokens_seen": 9256304640, + "step": 17655 + }, + { + "epoch": 0.8564448566630901, + "grad_norm": 0.2412109375, + "learning_rate": 2.7643788950698996e-06, + "loss": 2.7694, + "num_input_tokens_seen": 9258926080, + "step": 17660 + }, + { + "epoch": 0.8566873382193367, + "grad_norm": 0.25390625, + "learning_rate": 2.7552229705281903e-06, + "loss": 2.7797, + "num_input_tokens_seen": 9261547520, + "step": 17665 + }, + { + "epoch": 0.8569298197755834, + "grad_norm": 0.2392578125, + "learning_rate": 2.7460813495172655e-06, + "loss": 2.7775, + "num_input_tokens_seen": 9264168960, + "step": 17670 + }, + { + "epoch": 0.85717230133183, + "grad_norm": 0.236328125, + "learning_rate": 2.736954037915254e-06, + "loss": 2.7733, + "num_input_tokens_seen": 9266790400, + "step": 17675 + }, + { + "epoch": 0.8574147828880766, + "grad_norm": 0.2421875, + "learning_rate": 2.7278410415910753e-06, + "loss": 2.7777, + "num_input_tokens_seen": 9269411840, + "step": 17680 + }, + { + "epoch": 0.8576572644443232, + "grad_norm": 0.248046875, + "learning_rate": 2.7187423664044392e-06, + "loss": 2.7751, + "num_input_tokens_seen": 9272033280, + "step": 17685 + }, + { + "epoch": 0.8578997460005698, + "grad_norm": 0.240234375, + "learning_rate": 2.709658018205852e-06, + "loss": 2.7821, + "num_input_tokens_seen": 9274654720, + "step": 17690 + }, + { + "epoch": 0.8581422275568165, + "grad_norm": 0.2431640625, + "learning_rate": 2.7005880028366127e-06, + "loss": 2.7772, + "num_input_tokens_seen": 9277276160, + "step": 17695 + }, + { + "epoch": 0.8583847091130631, + "grad_norm": 0.240234375, + "learning_rate": 2.6915323261287902e-06, + "loss": 2.7609, + "num_input_tokens_seen": 9279897600, + "step": 17700 + }, + { + "epoch": 0.8583847091130631, + "eval_accuracy": 0.4559599413776258, + "eval_loss": 2.7415964603424072, + "eval_runtime": 5.9052, + "eval_samples_per_second": 50.803, + "eval_steps_per_second": 6.435, + "num_input_tokens_seen": 9279897600, + "step": 17700 + }, + { + "epoch": 0.8586271906693097, + "grad_norm": 0.240234375, + "learning_rate": 2.68249099390524e-06, + "loss": 2.7497, + "num_input_tokens_seen": 9282519040, + "step": 17705 + }, + { + "epoch": 0.8588696722255563, + "grad_norm": 0.2421875, + "learning_rate": 2.6734640119795956e-06, + "loss": 2.7744, + "num_input_tokens_seen": 9285140480, + "step": 17710 + }, + { + "epoch": 0.859112153781803, + "grad_norm": 0.2421875, + "learning_rate": 2.6644513861562692e-06, + "loss": 2.7646, + "num_input_tokens_seen": 9287761920, + "step": 17715 + }, + { + "epoch": 0.8593546353380496, + "grad_norm": 0.2412109375, + "learning_rate": 2.6554531222304334e-06, + "loss": 2.7827, + "num_input_tokens_seen": 9290383360, + "step": 17720 + }, + { + "epoch": 0.8595971168942962, + "grad_norm": 0.2490234375, + "learning_rate": 2.6464692259880326e-06, + "loss": 2.7679, + "num_input_tokens_seen": 9293004800, + "step": 17725 + }, + { + "epoch": 0.8598395984505428, + "grad_norm": 0.24609375, + "learning_rate": 2.637499703205759e-06, + "loss": 2.7727, + "num_input_tokens_seen": 9295626240, + "step": 17730 + }, + { + "epoch": 0.8600820800067894, + "grad_norm": 0.236328125, + "learning_rate": 2.628544559651075e-06, + "loss": 2.7729, + "num_input_tokens_seen": 9298247680, + "step": 17735 + }, + { + "epoch": 0.8603245615630362, + "grad_norm": 0.2373046875, + "learning_rate": 2.619603801082193e-06, + "loss": 2.7817, + "num_input_tokens_seen": 9300869120, + "step": 17740 + }, + { + "epoch": 0.8605670431192828, + "grad_norm": 0.236328125, + "learning_rate": 2.6106774332480795e-06, + "loss": 2.7744, + "num_input_tokens_seen": 9303490560, + "step": 17745 + }, + { + "epoch": 0.8608095246755294, + "grad_norm": 0.244140625, + "learning_rate": 2.6017654618884446e-06, + "loss": 2.7788, + "num_input_tokens_seen": 9306112000, + "step": 17750 + }, + { + "epoch": 0.861052006231776, + "grad_norm": 0.2353515625, + "learning_rate": 2.592867892733744e-06, + "loss": 2.7656, + "num_input_tokens_seen": 9308733440, + "step": 17755 + }, + { + "epoch": 0.8612944877880226, + "grad_norm": 0.244140625, + "learning_rate": 2.58398473150516e-06, + "loss": 2.7706, + "num_input_tokens_seen": 9311354880, + "step": 17760 + }, + { + "epoch": 0.8615369693442693, + "grad_norm": 0.232421875, + "learning_rate": 2.5751159839146306e-06, + "loss": 2.785, + "num_input_tokens_seen": 9313976320, + "step": 17765 + }, + { + "epoch": 0.8617794509005159, + "grad_norm": 0.244140625, + "learning_rate": 2.566261655664812e-06, + "loss": 2.7827, + "num_input_tokens_seen": 9316597760, + "step": 17770 + }, + { + "epoch": 0.8620219324567625, + "grad_norm": 0.2373046875, + "learning_rate": 2.557421752449096e-06, + "loss": 2.7805, + "num_input_tokens_seen": 9319219200, + "step": 17775 + }, + { + "epoch": 0.8622644140130091, + "grad_norm": 0.2373046875, + "learning_rate": 2.5485962799515926e-06, + "loss": 2.7767, + "num_input_tokens_seen": 9321840640, + "step": 17780 + }, + { + "epoch": 0.8625068955692557, + "grad_norm": 0.24609375, + "learning_rate": 2.5397852438471424e-06, + "loss": 2.7797, + "num_input_tokens_seen": 9324462080, + "step": 17785 + }, + { + "epoch": 0.8627493771255024, + "grad_norm": 0.2353515625, + "learning_rate": 2.5309886498012858e-06, + "loss": 2.7749, + "num_input_tokens_seen": 9327083520, + "step": 17790 + }, + { + "epoch": 0.862991858681749, + "grad_norm": 0.23828125, + "learning_rate": 2.5222065034702953e-06, + "loss": 2.7847, + "num_input_tokens_seen": 9329704960, + "step": 17795 + }, + { + "epoch": 0.8632343402379956, + "grad_norm": 0.2412109375, + "learning_rate": 2.5134388105011423e-06, + "loss": 2.7751, + "num_input_tokens_seen": 9332326400, + "step": 17800 + }, + { + "epoch": 0.8634768217942422, + "grad_norm": 0.2392578125, + "learning_rate": 2.50468557653151e-06, + "loss": 2.7782, + "num_input_tokens_seen": 9334947840, + "step": 17805 + }, + { + "epoch": 0.8637193033504889, + "grad_norm": 0.2421875, + "learning_rate": 2.495946807189781e-06, + "loss": 2.7754, + "num_input_tokens_seen": 9337569280, + "step": 17810 + }, + { + "epoch": 0.8639617849067356, + "grad_norm": 0.2421875, + "learning_rate": 2.4872225080950413e-06, + "loss": 2.7742, + "num_input_tokens_seen": 9340190720, + "step": 17815 + }, + { + "epoch": 0.8642042664629822, + "grad_norm": 0.240234375, + "learning_rate": 2.4785126848570677e-06, + "loss": 2.7639, + "num_input_tokens_seen": 9342812160, + "step": 17820 + }, + { + "epoch": 0.8644467480192288, + "grad_norm": 0.234375, + "learning_rate": 2.4698173430763333e-06, + "loss": 2.7779, + "num_input_tokens_seen": 9345433600, + "step": 17825 + }, + { + "epoch": 0.8646892295754754, + "grad_norm": 0.23828125, + "learning_rate": 2.4611364883439956e-06, + "loss": 2.7735, + "num_input_tokens_seen": 9348055040, + "step": 17830 + }, + { + "epoch": 0.8649317111317221, + "grad_norm": 0.2392578125, + "learning_rate": 2.4524701262418987e-06, + "loss": 2.7766, + "num_input_tokens_seen": 9350676480, + "step": 17835 + }, + { + "epoch": 0.8651741926879687, + "grad_norm": 0.2373046875, + "learning_rate": 2.4438182623425674e-06, + "loss": 2.7746, + "num_input_tokens_seen": 9353297920, + "step": 17840 + }, + { + "epoch": 0.8654166742442153, + "grad_norm": 0.23828125, + "learning_rate": 2.4351809022092027e-06, + "loss": 2.772, + "num_input_tokens_seen": 9355919360, + "step": 17845 + }, + { + "epoch": 0.8656591558004619, + "grad_norm": 0.23828125, + "learning_rate": 2.4265580513956887e-06, + "loss": 2.7688, + "num_input_tokens_seen": 9358540800, + "step": 17850 + }, + { + "epoch": 0.8659016373567086, + "grad_norm": 0.2431640625, + "learning_rate": 2.417949715446563e-06, + "loss": 2.7797, + "num_input_tokens_seen": 9361162240, + "step": 17855 + }, + { + "epoch": 0.8661441189129552, + "grad_norm": 0.2470703125, + "learning_rate": 2.409355899897045e-06, + "loss": 2.7749, + "num_input_tokens_seen": 9363783680, + "step": 17860 + }, + { + "epoch": 0.8663866004692018, + "grad_norm": 0.2431640625, + "learning_rate": 2.4007766102730064e-06, + "loss": 2.7594, + "num_input_tokens_seen": 9366405120, + "step": 17865 + }, + { + "epoch": 0.8666290820254484, + "grad_norm": 0.244140625, + "learning_rate": 2.392211852090989e-06, + "loss": 2.7497, + "num_input_tokens_seen": 9369026560, + "step": 17870 + }, + { + "epoch": 0.866871563581695, + "grad_norm": 0.2373046875, + "learning_rate": 2.383661630858186e-06, + "loss": 2.7691, + "num_input_tokens_seen": 9371648000, + "step": 17875 + }, + { + "epoch": 0.8671140451379417, + "grad_norm": 0.2373046875, + "learning_rate": 2.3751259520724434e-06, + "loss": 2.7691, + "num_input_tokens_seen": 9374269440, + "step": 17880 + }, + { + "epoch": 0.8673565266941883, + "grad_norm": 0.24609375, + "learning_rate": 2.3666048212222512e-06, + "loss": 2.7712, + "num_input_tokens_seen": 9376890880, + "step": 17885 + }, + { + "epoch": 0.8675990082504349, + "grad_norm": 0.2373046875, + "learning_rate": 2.358098243786755e-06, + "loss": 2.7797, + "num_input_tokens_seen": 9379512320, + "step": 17890 + }, + { + "epoch": 0.8678414898066816, + "grad_norm": 0.25, + "learning_rate": 2.3496062252357343e-06, + "loss": 2.7765, + "num_input_tokens_seen": 9382133760, + "step": 17895 + }, + { + "epoch": 0.8680839713629283, + "grad_norm": 0.244140625, + "learning_rate": 2.3411287710296104e-06, + "loss": 2.7689, + "num_input_tokens_seen": 9384755200, + "step": 17900 + }, + { + "epoch": 0.8683264529191749, + "grad_norm": 0.2431640625, + "learning_rate": 2.3326658866194422e-06, + "loss": 2.7686, + "num_input_tokens_seen": 9387376640, + "step": 17905 + }, + { + "epoch": 0.8685689344754215, + "grad_norm": 0.2431640625, + "learning_rate": 2.3242175774469215e-06, + "loss": 2.774, + "num_input_tokens_seen": 9389998080, + "step": 17910 + }, + { + "epoch": 0.8688114160316681, + "grad_norm": 0.2421875, + "learning_rate": 2.31578384894435e-06, + "loss": 2.7767, + "num_input_tokens_seen": 9392619520, + "step": 17915 + }, + { + "epoch": 0.8690538975879147, + "grad_norm": 0.2392578125, + "learning_rate": 2.3073647065346788e-06, + "loss": 2.7694, + "num_input_tokens_seen": 9395240960, + "step": 17920 + }, + { + "epoch": 0.8692963791441614, + "grad_norm": 0.251953125, + "learning_rate": 2.2989601556314634e-06, + "loss": 2.769, + "num_input_tokens_seen": 9397862400, + "step": 17925 + }, + { + "epoch": 0.869538860700408, + "grad_norm": 0.23828125, + "learning_rate": 2.2905702016388864e-06, + "loss": 2.7607, + "num_input_tokens_seen": 9400483840, + "step": 17930 + }, + { + "epoch": 0.8697813422566546, + "grad_norm": 0.2431640625, + "learning_rate": 2.2821948499517383e-06, + "loss": 2.7688, + "num_input_tokens_seen": 9403105280, + "step": 17935 + }, + { + "epoch": 0.8700238238129012, + "grad_norm": 0.248046875, + "learning_rate": 2.2738341059554274e-06, + "loss": 2.7786, + "num_input_tokens_seen": 9405726720, + "step": 17940 + }, + { + "epoch": 0.8702663053691478, + "grad_norm": 0.251953125, + "learning_rate": 2.2654879750259567e-06, + "loss": 2.7688, + "num_input_tokens_seen": 9408348160, + "step": 17945 + }, + { + "epoch": 0.8705087869253945, + "grad_norm": 0.2392578125, + "learning_rate": 2.257156462529947e-06, + "loss": 2.7638, + "num_input_tokens_seen": 9410969600, + "step": 17950 + }, + { + "epoch": 0.8707512684816411, + "grad_norm": 0.2421875, + "learning_rate": 2.2488395738246127e-06, + "loss": 2.7643, + "num_input_tokens_seen": 9413591040, + "step": 17955 + }, + { + "epoch": 0.8709937500378877, + "grad_norm": 0.251953125, + "learning_rate": 2.2405373142577598e-06, + "loss": 2.7659, + "num_input_tokens_seen": 9416212480, + "step": 17960 + }, + { + "epoch": 0.8712362315941343, + "grad_norm": 0.23046875, + "learning_rate": 2.2322496891678008e-06, + "loss": 2.7605, + "num_input_tokens_seen": 9418833920, + "step": 17965 + }, + { + "epoch": 0.871478713150381, + "grad_norm": 0.240234375, + "learning_rate": 2.2239767038837235e-06, + "loss": 2.7799, + "num_input_tokens_seen": 9421455360, + "step": 17970 + }, + { + "epoch": 0.8717211947066277, + "grad_norm": 0.2451171875, + "learning_rate": 2.2157183637251166e-06, + "loss": 2.7794, + "num_input_tokens_seen": 9424076800, + "step": 17975 + }, + { + "epoch": 0.8719636762628743, + "grad_norm": 0.2490234375, + "learning_rate": 2.2074746740021357e-06, + "loss": 2.7728, + "num_input_tokens_seen": 9426698240, + "step": 17980 + }, + { + "epoch": 0.8722061578191209, + "grad_norm": 0.240234375, + "learning_rate": 2.199245640015529e-06, + "loss": 2.7765, + "num_input_tokens_seen": 9429319680, + "step": 17985 + }, + { + "epoch": 0.8724486393753675, + "grad_norm": 0.2470703125, + "learning_rate": 2.1910312670566173e-06, + "loss": 2.7675, + "num_input_tokens_seen": 9431941120, + "step": 17990 + }, + { + "epoch": 0.8726911209316142, + "grad_norm": 0.2412109375, + "learning_rate": 2.1828315604072892e-06, + "loss": 2.7799, + "num_input_tokens_seen": 9434562560, + "step": 17995 + }, + { + "epoch": 0.8729336024878608, + "grad_norm": 0.23828125, + "learning_rate": 2.1746465253400155e-06, + "loss": 2.7753, + "num_input_tokens_seen": 9437184000, + "step": 18000 + }, + { + "epoch": 0.8729336024878608, + "eval_accuracy": 0.45591271779840414, + "eval_loss": 2.741586685180664, + "eval_runtime": 5.9258, + "eval_samples_per_second": 50.626, + "eval_steps_per_second": 6.413, + "num_input_tokens_seen": 9437184000, + "step": 18000 + }, + { + "epoch": 0.8731760840441074, + "grad_norm": 0.2392578125, + "learning_rate": 2.1664761671178286e-06, + "loss": 2.7627, + "num_input_tokens_seen": 9439805440, + "step": 18005 + }, + { + "epoch": 0.873418565600354, + "grad_norm": 0.2431640625, + "learning_rate": 2.1583204909943033e-06, + "loss": 2.7716, + "num_input_tokens_seen": 9442426880, + "step": 18010 + }, + { + "epoch": 0.8736610471566006, + "grad_norm": 0.2431640625, + "learning_rate": 2.1501795022136032e-06, + "loss": 2.7705, + "num_input_tokens_seen": 9445048320, + "step": 18015 + }, + { + "epoch": 0.8739035287128473, + "grad_norm": 0.2373046875, + "learning_rate": 2.1420532060104304e-06, + "loss": 2.7747, + "num_input_tokens_seen": 9447669760, + "step": 18020 + }, + { + "epoch": 0.8741460102690939, + "grad_norm": 0.248046875, + "learning_rate": 2.133941607610043e-06, + "loss": 2.779, + "num_input_tokens_seen": 9450291200, + "step": 18025 + }, + { + "epoch": 0.8743884918253405, + "grad_norm": 0.2451171875, + "learning_rate": 2.1258447122282534e-06, + "loss": 2.7875, + "num_input_tokens_seen": 9452912640, + "step": 18030 + }, + { + "epoch": 0.8746309733815871, + "grad_norm": 0.2490234375, + "learning_rate": 2.1177625250714207e-06, + "loss": 2.7781, + "num_input_tokens_seen": 9455534080, + "step": 18035 + }, + { + "epoch": 0.8748734549378337, + "grad_norm": 0.240234375, + "learning_rate": 2.1096950513364273e-06, + "loss": 2.7652, + "num_input_tokens_seen": 9458155520, + "step": 18040 + }, + { + "epoch": 0.8751159364940804, + "grad_norm": 0.2431640625, + "learning_rate": 2.101642296210715e-06, + "loss": 2.7838, + "num_input_tokens_seen": 9460776960, + "step": 18045 + }, + { + "epoch": 0.875358418050327, + "grad_norm": 0.2353515625, + "learning_rate": 2.093604264872262e-06, + "loss": 2.7826, + "num_input_tokens_seen": 9463398400, + "step": 18050 + }, + { + "epoch": 0.8756008996065737, + "grad_norm": 0.2431640625, + "learning_rate": 2.0855809624895695e-06, + "loss": 2.7639, + "num_input_tokens_seen": 9466019840, + "step": 18055 + }, + { + "epoch": 0.8758433811628203, + "grad_norm": 0.244140625, + "learning_rate": 2.077572394221669e-06, + "loss": 2.7792, + "num_input_tokens_seen": 9468641280, + "step": 18060 + }, + { + "epoch": 0.876085862719067, + "grad_norm": 0.25, + "learning_rate": 2.069578565218125e-06, + "loss": 2.7733, + "num_input_tokens_seen": 9471262720, + "step": 18065 + }, + { + "epoch": 0.8763283442753136, + "grad_norm": 0.2333984375, + "learning_rate": 2.0615994806190143e-06, + "loss": 2.7806, + "num_input_tokens_seen": 9473884160, + "step": 18070 + }, + { + "epoch": 0.8765708258315602, + "grad_norm": 0.236328125, + "learning_rate": 2.0536351455549383e-06, + "loss": 2.7733, + "num_input_tokens_seen": 9476505600, + "step": 18075 + }, + { + "epoch": 0.8768133073878068, + "grad_norm": 0.240234375, + "learning_rate": 2.045685565147015e-06, + "loss": 2.7788, + "num_input_tokens_seen": 9479127040, + "step": 18080 + }, + { + "epoch": 0.8770557889440534, + "grad_norm": 0.2412109375, + "learning_rate": 2.037750744506878e-06, + "loss": 2.7666, + "num_input_tokens_seen": 9481748480, + "step": 18085 + }, + { + "epoch": 0.8772982705003001, + "grad_norm": 0.2470703125, + "learning_rate": 2.0298306887366616e-06, + "loss": 2.7749, + "num_input_tokens_seen": 9484369920, + "step": 18090 + }, + { + "epoch": 0.8775407520565467, + "grad_norm": 0.2431640625, + "learning_rate": 2.0219254029290174e-06, + "loss": 2.7599, + "num_input_tokens_seen": 9486991360, + "step": 18095 + }, + { + "epoch": 0.8777832336127933, + "grad_norm": 0.2421875, + "learning_rate": 2.014034892167083e-06, + "loss": 2.7807, + "num_input_tokens_seen": 9489612800, + "step": 18100 + }, + { + "epoch": 0.8780257151690399, + "grad_norm": 0.240234375, + "learning_rate": 2.006159161524515e-06, + "loss": 2.791, + "num_input_tokens_seen": 9492234240, + "step": 18105 + }, + { + "epoch": 0.8782681967252866, + "grad_norm": 0.244140625, + "learning_rate": 1.9982982160654586e-06, + "loss": 2.769, + "num_input_tokens_seen": 9494855680, + "step": 18110 + }, + { + "epoch": 0.8785106782815332, + "grad_norm": 0.2412109375, + "learning_rate": 1.9904520608445444e-06, + "loss": 2.7708, + "num_input_tokens_seen": 9497477120, + "step": 18115 + }, + { + "epoch": 0.8787531598377798, + "grad_norm": 0.2412109375, + "learning_rate": 1.9826207009069038e-06, + "loss": 2.78, + "num_input_tokens_seen": 9500098560, + "step": 18120 + }, + { + "epoch": 0.8789956413940264, + "grad_norm": 0.2431640625, + "learning_rate": 1.9748041412881473e-06, + "loss": 2.7839, + "num_input_tokens_seen": 9502720000, + "step": 18125 + }, + { + "epoch": 0.8792381229502731, + "grad_norm": 0.236328125, + "learning_rate": 1.96700238701438e-06, + "loss": 2.7713, + "num_input_tokens_seen": 9505341440, + "step": 18130 + }, + { + "epoch": 0.8794806045065198, + "grad_norm": 0.2470703125, + "learning_rate": 1.9592154431021666e-06, + "loss": 2.7611, + "num_input_tokens_seen": 9507962880, + "step": 18135 + }, + { + "epoch": 0.8797230860627664, + "grad_norm": 0.248046875, + "learning_rate": 1.951443314558565e-06, + "loss": 2.7761, + "num_input_tokens_seen": 9510584320, + "step": 18140 + }, + { + "epoch": 0.879965567619013, + "grad_norm": 0.2431640625, + "learning_rate": 1.9436860063811042e-06, + "loss": 2.7841, + "num_input_tokens_seen": 9513205760, + "step": 18145 + }, + { + "epoch": 0.8802080491752596, + "grad_norm": 0.2421875, + "learning_rate": 1.9359435235577818e-06, + "loss": 2.7725, + "num_input_tokens_seen": 9515827200, + "step": 18150 + }, + { + "epoch": 0.8804505307315063, + "grad_norm": 0.2431640625, + "learning_rate": 1.9282158710670627e-06, + "loss": 2.7834, + "num_input_tokens_seen": 9518448640, + "step": 18155 + }, + { + "epoch": 0.8806930122877529, + "grad_norm": 0.25, + "learning_rate": 1.9205030538778756e-06, + "loss": 2.7838, + "num_input_tokens_seen": 9521070080, + "step": 18160 + }, + { + "epoch": 0.8809354938439995, + "grad_norm": 0.2421875, + "learning_rate": 1.9128050769496086e-06, + "loss": 2.7681, + "num_input_tokens_seen": 9523691520, + "step": 18165 + }, + { + "epoch": 0.8811779754002461, + "grad_norm": 0.244140625, + "learning_rate": 1.9051219452321106e-06, + "loss": 2.7725, + "num_input_tokens_seen": 9526312960, + "step": 18170 + }, + { + "epoch": 0.8814204569564927, + "grad_norm": 0.2353515625, + "learning_rate": 1.8974536636656825e-06, + "loss": 2.7649, + "num_input_tokens_seen": 9528934400, + "step": 18175 + }, + { + "epoch": 0.8816629385127394, + "grad_norm": 0.244140625, + "learning_rate": 1.8898002371810774e-06, + "loss": 2.7761, + "num_input_tokens_seen": 9531555840, + "step": 18180 + }, + { + "epoch": 0.881905420068986, + "grad_norm": 0.2421875, + "learning_rate": 1.8821616706995004e-06, + "loss": 2.7944, + "num_input_tokens_seen": 9534177280, + "step": 18185 + }, + { + "epoch": 0.8821479016252326, + "grad_norm": 0.2470703125, + "learning_rate": 1.8745379691325947e-06, + "loss": 2.7843, + "num_input_tokens_seen": 9536798720, + "step": 18190 + }, + { + "epoch": 0.8823903831814792, + "grad_norm": 0.244140625, + "learning_rate": 1.866929137382445e-06, + "loss": 2.7689, + "num_input_tokens_seen": 9539420160, + "step": 18195 + }, + { + "epoch": 0.8826328647377258, + "grad_norm": 0.2392578125, + "learning_rate": 1.8593351803415788e-06, + "loss": 2.7867, + "num_input_tokens_seen": 9542041600, + "step": 18200 + }, + { + "epoch": 0.8828753462939725, + "grad_norm": 0.2451171875, + "learning_rate": 1.8517561028929597e-06, + "loss": 2.7613, + "num_input_tokens_seen": 9544663040, + "step": 18205 + }, + { + "epoch": 0.8831178278502192, + "grad_norm": 0.240234375, + "learning_rate": 1.8441919099099813e-06, + "loss": 2.7643, + "num_input_tokens_seen": 9547284480, + "step": 18210 + }, + { + "epoch": 0.8833603094064658, + "grad_norm": 0.232421875, + "learning_rate": 1.8366426062564696e-06, + "loss": 2.7662, + "num_input_tokens_seen": 9549905920, + "step": 18215 + }, + { + "epoch": 0.8836027909627124, + "grad_norm": 0.2431640625, + "learning_rate": 1.8291081967866692e-06, + "loss": 2.7678, + "num_input_tokens_seen": 9552527360, + "step": 18220 + }, + { + "epoch": 0.8838452725189591, + "grad_norm": 0.2470703125, + "learning_rate": 1.8215886863452548e-06, + "loss": 2.7744, + "num_input_tokens_seen": 9555148800, + "step": 18225 + }, + { + "epoch": 0.8840877540752057, + "grad_norm": 0.236328125, + "learning_rate": 1.8140840797673198e-06, + "loss": 2.7649, + "num_input_tokens_seen": 9557770240, + "step": 18230 + }, + { + "epoch": 0.8843302356314523, + "grad_norm": 0.244140625, + "learning_rate": 1.8065943818783736e-06, + "loss": 2.777, + "num_input_tokens_seen": 9560391680, + "step": 18235 + }, + { + "epoch": 0.8845727171876989, + "grad_norm": 0.2470703125, + "learning_rate": 1.7991195974943364e-06, + "loss": 2.7729, + "num_input_tokens_seen": 9563013120, + "step": 18240 + }, + { + "epoch": 0.8848151987439455, + "grad_norm": 0.251953125, + "learning_rate": 1.791659731421541e-06, + "loss": 2.7799, + "num_input_tokens_seen": 9565634560, + "step": 18245 + }, + { + "epoch": 0.8850576803001922, + "grad_norm": 0.2412109375, + "learning_rate": 1.7842147884567368e-06, + "loss": 2.7769, + "num_input_tokens_seen": 9568256000, + "step": 18250 + }, + { + "epoch": 0.8853001618564388, + "grad_norm": 0.240234375, + "learning_rate": 1.7767847733870523e-06, + "loss": 2.7899, + "num_input_tokens_seen": 9570877440, + "step": 18255 + }, + { + "epoch": 0.8855426434126854, + "grad_norm": 0.2470703125, + "learning_rate": 1.769369690990047e-06, + "loss": 2.7757, + "num_input_tokens_seen": 9573498880, + "step": 18260 + }, + { + "epoch": 0.885785124968932, + "grad_norm": 0.2431640625, + "learning_rate": 1.7619695460336593e-06, + "loss": 2.7993, + "num_input_tokens_seen": 9576120320, + "step": 18265 + }, + { + "epoch": 0.8860276065251786, + "grad_norm": 0.2373046875, + "learning_rate": 1.7545843432762305e-06, + "loss": 2.7668, + "num_input_tokens_seen": 9578741760, + "step": 18270 + }, + { + "epoch": 0.8862700880814253, + "grad_norm": 0.248046875, + "learning_rate": 1.7472140874664921e-06, + "loss": 2.7855, + "num_input_tokens_seen": 9581363200, + "step": 18275 + }, + { + "epoch": 0.8865125696376719, + "grad_norm": 0.2392578125, + "learning_rate": 1.7398587833435593e-06, + "loss": 2.7856, + "num_input_tokens_seen": 9583984640, + "step": 18280 + }, + { + "epoch": 0.8867550511939185, + "grad_norm": 0.2412109375, + "learning_rate": 1.732518435636954e-06, + "loss": 2.778, + "num_input_tokens_seen": 9586606080, + "step": 18285 + }, + { + "epoch": 0.8869975327501652, + "grad_norm": 0.244140625, + "learning_rate": 1.7251930490665509e-06, + "loss": 2.7769, + "num_input_tokens_seen": 9589227520, + "step": 18290 + }, + { + "epoch": 0.8872400143064119, + "grad_norm": 0.24609375, + "learning_rate": 1.7178826283426235e-06, + "loss": 2.7797, + "num_input_tokens_seen": 9591848960, + "step": 18295 + }, + { + "epoch": 0.8874824958626585, + "grad_norm": 0.2451171875, + "learning_rate": 1.7105871781658178e-06, + "loss": 2.7674, + "num_input_tokens_seen": 9594470400, + "step": 18300 + }, + { + "epoch": 0.8874824958626585, + "eval_accuracy": 0.4559631981761928, + "eval_loss": 2.741525888442993, + "eval_runtime": 5.9551, + "eval_samples_per_second": 50.377, + "eval_steps_per_second": 6.381, + "num_input_tokens_seen": 9594470400, + "step": 18300 + }, + { + "epoch": 0.8877249774189051, + "grad_norm": 0.2451171875, + "learning_rate": 1.703306703227156e-06, + "loss": 2.7576, + "num_input_tokens_seen": 9597091840, + "step": 18305 + }, + { + "epoch": 0.8879674589751517, + "grad_norm": 0.240234375, + "learning_rate": 1.6960412082080295e-06, + "loss": 2.7891, + "num_input_tokens_seen": 9599713280, + "step": 18310 + }, + { + "epoch": 0.8882099405313983, + "grad_norm": 0.2431640625, + "learning_rate": 1.688790697780196e-06, + "loss": 2.7742, + "num_input_tokens_seen": 9602334720, + "step": 18315 + }, + { + "epoch": 0.888452422087645, + "grad_norm": 0.248046875, + "learning_rate": 1.6815551766057757e-06, + "loss": 2.7634, + "num_input_tokens_seen": 9604956160, + "step": 18320 + }, + { + "epoch": 0.8886949036438916, + "grad_norm": 0.2373046875, + "learning_rate": 1.6743346493372587e-06, + "loss": 2.7755, + "num_input_tokens_seen": 9607577600, + "step": 18325 + }, + { + "epoch": 0.8889373852001382, + "grad_norm": 0.2431640625, + "learning_rate": 1.6671291206174805e-06, + "loss": 2.7775, + "num_input_tokens_seen": 9610199040, + "step": 18330 + }, + { + "epoch": 0.8891798667563848, + "grad_norm": 0.244140625, + "learning_rate": 1.6599385950796547e-06, + "loss": 2.7788, + "num_input_tokens_seen": 9612820480, + "step": 18335 + }, + { + "epoch": 0.8894223483126315, + "grad_norm": 0.2421875, + "learning_rate": 1.6527630773473247e-06, + "loss": 2.7869, + "num_input_tokens_seen": 9615441920, + "step": 18340 + }, + { + "epoch": 0.8896648298688781, + "grad_norm": 0.2431640625, + "learning_rate": 1.6456025720343993e-06, + "loss": 2.7755, + "num_input_tokens_seen": 9618063360, + "step": 18345 + }, + { + "epoch": 0.8899073114251247, + "grad_norm": 0.236328125, + "learning_rate": 1.6384570837451236e-06, + "loss": 2.7877, + "num_input_tokens_seen": 9620684800, + "step": 18350 + }, + { + "epoch": 0.8901497929813713, + "grad_norm": 0.2490234375, + "learning_rate": 1.6313266170740915e-06, + "loss": 2.7855, + "num_input_tokens_seen": 9623306240, + "step": 18355 + }, + { + "epoch": 0.8903922745376179, + "grad_norm": 0.2412109375, + "learning_rate": 1.6242111766062384e-06, + "loss": 2.7603, + "num_input_tokens_seen": 9625927680, + "step": 18360 + }, + { + "epoch": 0.8906347560938647, + "grad_norm": 0.23828125, + "learning_rate": 1.6171107669168378e-06, + "loss": 2.7888, + "num_input_tokens_seen": 9628549120, + "step": 18365 + }, + { + "epoch": 0.8908772376501113, + "grad_norm": 0.2421875, + "learning_rate": 1.6100253925714987e-06, + "loss": 2.7791, + "num_input_tokens_seen": 9631170560, + "step": 18370 + }, + { + "epoch": 0.8911197192063579, + "grad_norm": 0.2431640625, + "learning_rate": 1.6029550581261598e-06, + "loss": 2.7952, + "num_input_tokens_seen": 9633792000, + "step": 18375 + }, + { + "epoch": 0.8913622007626045, + "grad_norm": 0.240234375, + "learning_rate": 1.5958997681270876e-06, + "loss": 2.7777, + "num_input_tokens_seen": 9636413440, + "step": 18380 + }, + { + "epoch": 0.8916046823188511, + "grad_norm": 0.240234375, + "learning_rate": 1.588859527110878e-06, + "loss": 2.7779, + "num_input_tokens_seen": 9639034880, + "step": 18385 + }, + { + "epoch": 0.8918471638750978, + "grad_norm": 0.2373046875, + "learning_rate": 1.5818343396044543e-06, + "loss": 2.7775, + "num_input_tokens_seen": 9641656320, + "step": 18390 + }, + { + "epoch": 0.8920896454313444, + "grad_norm": 0.248046875, + "learning_rate": 1.574824210125056e-06, + "loss": 2.7864, + "num_input_tokens_seen": 9644277760, + "step": 18395 + }, + { + "epoch": 0.892332126987591, + "grad_norm": 0.236328125, + "learning_rate": 1.5678291431802355e-06, + "loss": 2.7854, + "num_input_tokens_seen": 9646899200, + "step": 18400 + }, + { + "epoch": 0.8925746085438376, + "grad_norm": 0.2490234375, + "learning_rate": 1.560849143267873e-06, + "loss": 2.7777, + "num_input_tokens_seen": 9649520640, + "step": 18405 + }, + { + "epoch": 0.8928170901000843, + "grad_norm": 0.2392578125, + "learning_rate": 1.5538842148761418e-06, + "loss": 2.7695, + "num_input_tokens_seen": 9652142080, + "step": 18410 + }, + { + "epoch": 0.8930595716563309, + "grad_norm": 0.2412109375, + "learning_rate": 1.5469343624835403e-06, + "loss": 2.7667, + "num_input_tokens_seen": 9654763520, + "step": 18415 + }, + { + "epoch": 0.8933020532125775, + "grad_norm": 0.2392578125, + "learning_rate": 1.5399995905588633e-06, + "loss": 2.7874, + "num_input_tokens_seen": 9657384960, + "step": 18420 + }, + { + "epoch": 0.8935445347688241, + "grad_norm": 0.2490234375, + "learning_rate": 1.5330799035612187e-06, + "loss": 2.7733, + "num_input_tokens_seen": 9660006400, + "step": 18425 + }, + { + "epoch": 0.8937870163250707, + "grad_norm": 0.24609375, + "learning_rate": 1.5261753059400003e-06, + "loss": 2.7813, + "num_input_tokens_seen": 9662627840, + "step": 18430 + }, + { + "epoch": 0.8940294978813174, + "grad_norm": 0.23828125, + "learning_rate": 1.519285802134915e-06, + "loss": 2.7694, + "num_input_tokens_seen": 9665249280, + "step": 18435 + }, + { + "epoch": 0.894271979437564, + "grad_norm": 0.2412109375, + "learning_rate": 1.512411396575955e-06, + "loss": 2.7832, + "num_input_tokens_seen": 9667870720, + "step": 18440 + }, + { + "epoch": 0.8945144609938107, + "grad_norm": 0.25, + "learning_rate": 1.5055520936834016e-06, + "loss": 2.7733, + "num_input_tokens_seen": 9670492160, + "step": 18445 + }, + { + "epoch": 0.8947569425500573, + "grad_norm": 0.25, + "learning_rate": 1.498707897867835e-06, + "loss": 2.7886, + "num_input_tokens_seen": 9673113600, + "step": 18450 + }, + { + "epoch": 0.894999424106304, + "grad_norm": 0.2392578125, + "learning_rate": 1.491878813530115e-06, + "loss": 2.7727, + "num_input_tokens_seen": 9675735040, + "step": 18455 + }, + { + "epoch": 0.8952419056625506, + "grad_norm": 0.2451171875, + "learning_rate": 1.485064845061382e-06, + "loss": 2.7743, + "num_input_tokens_seen": 9678356480, + "step": 18460 + }, + { + "epoch": 0.8954843872187972, + "grad_norm": 0.248046875, + "learning_rate": 1.4782659968430646e-06, + "loss": 2.7796, + "num_input_tokens_seen": 9680977920, + "step": 18465 + }, + { + "epoch": 0.8957268687750438, + "grad_norm": 0.236328125, + "learning_rate": 1.4714822732468658e-06, + "loss": 2.7592, + "num_input_tokens_seen": 9683599360, + "step": 18470 + }, + { + "epoch": 0.8959693503312904, + "grad_norm": 0.251953125, + "learning_rate": 1.4647136786347548e-06, + "loss": 2.7815, + "num_input_tokens_seen": 9686220800, + "step": 18475 + }, + { + "epoch": 0.8962118318875371, + "grad_norm": 0.2392578125, + "learning_rate": 1.4579602173589862e-06, + "loss": 2.7768, + "num_input_tokens_seen": 9688842240, + "step": 18480 + }, + { + "epoch": 0.8964543134437837, + "grad_norm": 0.2421875, + "learning_rate": 1.4512218937620752e-06, + "loss": 2.7648, + "num_input_tokens_seen": 9691463680, + "step": 18485 + }, + { + "epoch": 0.8966967950000303, + "grad_norm": 0.2451171875, + "learning_rate": 1.444498712176809e-06, + "loss": 2.7862, + "num_input_tokens_seen": 9694085120, + "step": 18490 + }, + { + "epoch": 0.8969392765562769, + "grad_norm": 0.2412109375, + "learning_rate": 1.4377906769262317e-06, + "loss": 2.7747, + "num_input_tokens_seen": 9696706560, + "step": 18495 + }, + { + "epoch": 0.8971817581125235, + "grad_norm": 0.234375, + "learning_rate": 1.4310977923236547e-06, + "loss": 2.78, + "num_input_tokens_seen": 9699328000, + "step": 18500 + }, + { + "epoch": 0.8974242396687702, + "grad_norm": 0.244140625, + "learning_rate": 1.4244200626726462e-06, + "loss": 2.7744, + "num_input_tokens_seen": 9701949440, + "step": 18505 + }, + { + "epoch": 0.8976667212250168, + "grad_norm": 0.2353515625, + "learning_rate": 1.4177574922670218e-06, + "loss": 2.769, + "num_input_tokens_seen": 9704570880, + "step": 18510 + }, + { + "epoch": 0.8979092027812634, + "grad_norm": 0.25, + "learning_rate": 1.4111100853908627e-06, + "loss": 2.7785, + "num_input_tokens_seen": 9707192320, + "step": 18515 + }, + { + "epoch": 0.89815168433751, + "grad_norm": 0.2451171875, + "learning_rate": 1.4044778463184915e-06, + "loss": 2.7619, + "num_input_tokens_seen": 9709813760, + "step": 18520 + }, + { + "epoch": 0.8983941658937568, + "grad_norm": 0.240234375, + "learning_rate": 1.3978607793144776e-06, + "loss": 2.7747, + "num_input_tokens_seen": 9712435200, + "step": 18525 + }, + { + "epoch": 0.8986366474500034, + "grad_norm": 0.2373046875, + "learning_rate": 1.3912588886336398e-06, + "loss": 2.7821, + "num_input_tokens_seen": 9715056640, + "step": 18530 + }, + { + "epoch": 0.89887912900625, + "grad_norm": 0.25, + "learning_rate": 1.3846721785210292e-06, + "loss": 2.7866, + "num_input_tokens_seen": 9717678080, + "step": 18535 + }, + { + "epoch": 0.8991216105624966, + "grad_norm": 0.2431640625, + "learning_rate": 1.3781006532119445e-06, + "loss": 2.7765, + "num_input_tokens_seen": 9720299520, + "step": 18540 + }, + { + "epoch": 0.8993640921187432, + "grad_norm": 0.2470703125, + "learning_rate": 1.3715443169319191e-06, + "loss": 2.7961, + "num_input_tokens_seen": 9722920960, + "step": 18545 + }, + { + "epoch": 0.8996065736749899, + "grad_norm": 0.24609375, + "learning_rate": 1.3650031738967172e-06, + "loss": 2.7744, + "num_input_tokens_seen": 9725542400, + "step": 18550 + }, + { + "epoch": 0.8998490552312365, + "grad_norm": 0.2392578125, + "learning_rate": 1.3584772283123353e-06, + "loss": 2.777, + "num_input_tokens_seen": 9728163840, + "step": 18555 + }, + { + "epoch": 0.9000915367874831, + "grad_norm": 0.240234375, + "learning_rate": 1.3519664843749947e-06, + "loss": 2.7662, + "num_input_tokens_seen": 9730785280, + "step": 18560 + }, + { + "epoch": 0.9003340183437297, + "grad_norm": 0.2470703125, + "learning_rate": 1.345470946271149e-06, + "loss": 2.7771, + "num_input_tokens_seen": 9733406720, + "step": 18565 + }, + { + "epoch": 0.9005764998999763, + "grad_norm": 0.244140625, + "learning_rate": 1.3389906181774658e-06, + "loss": 2.7581, + "num_input_tokens_seen": 9736028160, + "step": 18570 + }, + { + "epoch": 0.900818981456223, + "grad_norm": 0.2412109375, + "learning_rate": 1.332525504260837e-06, + "loss": 2.7769, + "num_input_tokens_seen": 9738649600, + "step": 18575 + }, + { + "epoch": 0.9010614630124696, + "grad_norm": 0.23828125, + "learning_rate": 1.3260756086783732e-06, + "loss": 2.7798, + "num_input_tokens_seen": 9741271040, + "step": 18580 + }, + { + "epoch": 0.9013039445687162, + "grad_norm": 0.2333984375, + "learning_rate": 1.3196409355773986e-06, + "loss": 2.7692, + "num_input_tokens_seen": 9743892480, + "step": 18585 + }, + { + "epoch": 0.9015464261249628, + "grad_norm": 0.2392578125, + "learning_rate": 1.3132214890954453e-06, + "loss": 2.7824, + "num_input_tokens_seen": 9746513920, + "step": 18590 + }, + { + "epoch": 0.9017889076812095, + "grad_norm": 0.240234375, + "learning_rate": 1.3068172733602613e-06, + "loss": 2.7804, + "num_input_tokens_seen": 9749135360, + "step": 18595 + }, + { + "epoch": 0.9020313892374561, + "grad_norm": 0.2412109375, + "learning_rate": 1.3004282924897915e-06, + "loss": 2.7601, + "num_input_tokens_seen": 9751756800, + "step": 18600 + }, + { + "epoch": 0.9020313892374561, + "eval_accuracy": 0.4559957661618629, + "eval_loss": 2.741598606109619, + "eval_runtime": 5.8657, + "eval_samples_per_second": 51.145, + "eval_steps_per_second": 6.478, + "num_input_tokens_seen": 9751756800, + "step": 18600 + }, + { + "epoch": 0.9022738707937028, + "grad_norm": 0.2392578125, + "learning_rate": 1.294054550592194e-06, + "loss": 2.7642, + "num_input_tokens_seen": 9754378240, + "step": 18605 + }, + { + "epoch": 0.9025163523499494, + "grad_norm": 0.240234375, + "learning_rate": 1.2876960517658242e-06, + "loss": 2.7627, + "num_input_tokens_seen": 9756999680, + "step": 18610 + }, + { + "epoch": 0.902758833906196, + "grad_norm": 0.2421875, + "learning_rate": 1.2813528000992337e-06, + "loss": 2.7592, + "num_input_tokens_seen": 9759621120, + "step": 18615 + }, + { + "epoch": 0.9030013154624427, + "grad_norm": 0.248046875, + "learning_rate": 1.2750247996711789e-06, + "loss": 2.7852, + "num_input_tokens_seen": 9762242560, + "step": 18620 + }, + { + "epoch": 0.9032437970186893, + "grad_norm": 0.2421875, + "learning_rate": 1.2687120545506054e-06, + "loss": 2.7813, + "num_input_tokens_seen": 9764864000, + "step": 18625 + }, + { + "epoch": 0.9034862785749359, + "grad_norm": 0.240234375, + "learning_rate": 1.262414568796641e-06, + "loss": 2.7722, + "num_input_tokens_seen": 9767485440, + "step": 18630 + }, + { + "epoch": 0.9037287601311825, + "grad_norm": 0.2412109375, + "learning_rate": 1.2561323464586105e-06, + "loss": 2.7861, + "num_input_tokens_seen": 9770106880, + "step": 18635 + }, + { + "epoch": 0.9039712416874292, + "grad_norm": 0.2431640625, + "learning_rate": 1.2498653915760216e-06, + "loss": 2.7734, + "num_input_tokens_seen": 9772728320, + "step": 18640 + }, + { + "epoch": 0.9042137232436758, + "grad_norm": 0.2451171875, + "learning_rate": 1.2436137081785677e-06, + "loss": 2.7927, + "num_input_tokens_seen": 9775349760, + "step": 18645 + }, + { + "epoch": 0.9044562047999224, + "grad_norm": 0.2392578125, + "learning_rate": 1.2373773002861161e-06, + "loss": 2.7753, + "num_input_tokens_seen": 9777971200, + "step": 18650 + }, + { + "epoch": 0.904698686356169, + "grad_norm": 0.23828125, + "learning_rate": 1.231156171908726e-06, + "loss": 2.7794, + "num_input_tokens_seen": 9780592640, + "step": 18655 + }, + { + "epoch": 0.9049411679124156, + "grad_norm": 0.24609375, + "learning_rate": 1.2249503270466089e-06, + "loss": 2.7738, + "num_input_tokens_seen": 9783214080, + "step": 18660 + }, + { + "epoch": 0.9051836494686623, + "grad_norm": 0.236328125, + "learning_rate": 1.2187597696901698e-06, + "loss": 2.7723, + "num_input_tokens_seen": 9785835520, + "step": 18665 + }, + { + "epoch": 0.9054261310249089, + "grad_norm": 0.2412109375, + "learning_rate": 1.212584503819969e-06, + "loss": 2.7683, + "num_input_tokens_seen": 9788456960, + "step": 18670 + }, + { + "epoch": 0.9056686125811555, + "grad_norm": 0.2431640625, + "learning_rate": 1.2064245334067526e-06, + "loss": 2.791, + "num_input_tokens_seen": 9791078400, + "step": 18675 + }, + { + "epoch": 0.9059110941374022, + "grad_norm": 0.248046875, + "learning_rate": 1.2002798624114102e-06, + "loss": 2.7764, + "num_input_tokens_seen": 9793699840, + "step": 18680 + }, + { + "epoch": 0.9061535756936488, + "grad_norm": 0.2353515625, + "learning_rate": 1.1941504947850125e-06, + "loss": 2.7651, + "num_input_tokens_seen": 9796321280, + "step": 18685 + }, + { + "epoch": 0.9063960572498955, + "grad_norm": 0.2421875, + "learning_rate": 1.18803643446877e-06, + "loss": 2.7711, + "num_input_tokens_seen": 9798942720, + "step": 18690 + }, + { + "epoch": 0.9066385388061421, + "grad_norm": 0.240234375, + "learning_rate": 1.1819376853940688e-06, + "loss": 2.7745, + "num_input_tokens_seen": 9801564160, + "step": 18695 + }, + { + "epoch": 0.9068810203623887, + "grad_norm": 0.251953125, + "learning_rate": 1.1758542514824416e-06, + "loss": 2.7726, + "num_input_tokens_seen": 9804185600, + "step": 18700 + }, + { + "epoch": 0.9071235019186353, + "grad_norm": 0.240234375, + "learning_rate": 1.169786136645573e-06, + "loss": 2.7618, + "num_input_tokens_seen": 9806807040, + "step": 18705 + }, + { + "epoch": 0.907365983474882, + "grad_norm": 0.248046875, + "learning_rate": 1.1637333447853006e-06, + "loss": 2.7598, + "num_input_tokens_seen": 9809428480, + "step": 18710 + }, + { + "epoch": 0.9076084650311286, + "grad_norm": 0.2431640625, + "learning_rate": 1.1576958797936105e-06, + "loss": 2.7753, + "num_input_tokens_seen": 9812049920, + "step": 18715 + }, + { + "epoch": 0.9078509465873752, + "grad_norm": 0.2412109375, + "learning_rate": 1.1516737455526228e-06, + "loss": 2.7694, + "num_input_tokens_seen": 9814671360, + "step": 18720 + }, + { + "epoch": 0.9080934281436218, + "grad_norm": 0.2373046875, + "learning_rate": 1.1456669459346091e-06, + "loss": 2.7757, + "num_input_tokens_seen": 9817292800, + "step": 18725 + }, + { + "epoch": 0.9083359096998684, + "grad_norm": 0.2412109375, + "learning_rate": 1.139675484801986e-06, + "loss": 2.7682, + "num_input_tokens_seen": 9819914240, + "step": 18730 + }, + { + "epoch": 0.9085783912561151, + "grad_norm": 0.2412109375, + "learning_rate": 1.133699366007293e-06, + "loss": 2.7597, + "num_input_tokens_seen": 9822535680, + "step": 18735 + }, + { + "epoch": 0.9088208728123617, + "grad_norm": 0.25, + "learning_rate": 1.1277385933932183e-06, + "loss": 2.777, + "num_input_tokens_seen": 9825157120, + "step": 18740 + }, + { + "epoch": 0.9090633543686083, + "grad_norm": 0.2353515625, + "learning_rate": 1.1217931707925704e-06, + "loss": 2.784, + "num_input_tokens_seen": 9827778560, + "step": 18745 + }, + { + "epoch": 0.9093058359248549, + "grad_norm": 0.23828125, + "learning_rate": 1.1158631020282972e-06, + "loss": 2.7776, + "num_input_tokens_seen": 9830400000, + "step": 18750 + }, + { + "epoch": 0.9095483174811015, + "grad_norm": 0.2412109375, + "learning_rate": 1.1099483909134678e-06, + "loss": 2.7726, + "num_input_tokens_seen": 9833021440, + "step": 18755 + }, + { + "epoch": 0.9097907990373483, + "grad_norm": 0.240234375, + "learning_rate": 1.1040490412512787e-06, + "loss": 2.7638, + "num_input_tokens_seen": 9835642880, + "step": 18760 + }, + { + "epoch": 0.9100332805935949, + "grad_norm": 0.244140625, + "learning_rate": 1.0981650568350487e-06, + "loss": 2.7806, + "num_input_tokens_seen": 9838264320, + "step": 18765 + }, + { + "epoch": 0.9102757621498415, + "grad_norm": 0.2412109375, + "learning_rate": 1.0922964414482151e-06, + "loss": 2.7563, + "num_input_tokens_seen": 9840885760, + "step": 18770 + }, + { + "epoch": 0.9105182437060881, + "grad_norm": 0.240234375, + "learning_rate": 1.086443198864337e-06, + "loss": 2.7906, + "num_input_tokens_seen": 9843507200, + "step": 18775 + }, + { + "epoch": 0.9107607252623348, + "grad_norm": 0.2353515625, + "learning_rate": 1.0806053328470843e-06, + "loss": 2.7728, + "num_input_tokens_seen": 9846128640, + "step": 18780 + }, + { + "epoch": 0.9110032068185814, + "grad_norm": 0.244140625, + "learning_rate": 1.0747828471502435e-06, + "loss": 2.7883, + "num_input_tokens_seen": 9848750080, + "step": 18785 + }, + { + "epoch": 0.911245688374828, + "grad_norm": 0.2392578125, + "learning_rate": 1.0689757455177057e-06, + "loss": 2.7661, + "num_input_tokens_seen": 9851371520, + "step": 18790 + }, + { + "epoch": 0.9114881699310746, + "grad_norm": 0.2431640625, + "learning_rate": 1.0631840316834785e-06, + "loss": 2.7722, + "num_input_tokens_seen": 9853992960, + "step": 18795 + }, + { + "epoch": 0.9117306514873212, + "grad_norm": 0.24609375, + "learning_rate": 1.0574077093716661e-06, + "loss": 2.7715, + "num_input_tokens_seen": 9856614400, + "step": 18800 + }, + { + "epoch": 0.9119731330435679, + "grad_norm": 0.244140625, + "learning_rate": 1.051646782296481e-06, + "loss": 2.7717, + "num_input_tokens_seen": 9859235840, + "step": 18805 + }, + { + "epoch": 0.9122156145998145, + "grad_norm": 0.2421875, + "learning_rate": 1.0459012541622376e-06, + "loss": 2.7854, + "num_input_tokens_seen": 9861857280, + "step": 18810 + }, + { + "epoch": 0.9124580961560611, + "grad_norm": 0.23828125, + "learning_rate": 1.040171128663342e-06, + "loss": 2.7665, + "num_input_tokens_seen": 9864478720, + "step": 18815 + }, + { + "epoch": 0.9127005777123077, + "grad_norm": 0.2392578125, + "learning_rate": 1.0344564094843023e-06, + "loss": 2.7631, + "num_input_tokens_seen": 9867100160, + "step": 18820 + }, + { + "epoch": 0.9129430592685543, + "grad_norm": 0.2431640625, + "learning_rate": 1.028757100299721e-06, + "loss": 2.7683, + "num_input_tokens_seen": 9869721600, + "step": 18825 + }, + { + "epoch": 0.913185540824801, + "grad_norm": 0.2421875, + "learning_rate": 1.0230732047742857e-06, + "loss": 2.7905, + "num_input_tokens_seen": 9872343040, + "step": 18830 + }, + { + "epoch": 0.9134280223810476, + "grad_norm": 0.2470703125, + "learning_rate": 1.0174047265627818e-06, + "loss": 2.7753, + "num_input_tokens_seen": 9874964480, + "step": 18835 + }, + { + "epoch": 0.9136705039372943, + "grad_norm": 0.2421875, + "learning_rate": 1.011751669310071e-06, + "loss": 2.77, + "num_input_tokens_seen": 9877585920, + "step": 18840 + }, + { + "epoch": 0.9139129854935409, + "grad_norm": 0.2373046875, + "learning_rate": 1.006114036651107e-06, + "loss": 2.7594, + "num_input_tokens_seen": 9880207360, + "step": 18845 + }, + { + "epoch": 0.9141554670497876, + "grad_norm": 0.25, + "learning_rate": 1.0004918322109258e-06, + "loss": 2.7693, + "num_input_tokens_seen": 9882828800, + "step": 18850 + }, + { + "epoch": 0.9143979486060342, + "grad_norm": 0.2412109375, + "learning_rate": 9.948850596046332e-07, + "loss": 2.7856, + "num_input_tokens_seen": 9885450240, + "step": 18855 + }, + { + "epoch": 0.9146404301622808, + "grad_norm": 0.236328125, + "learning_rate": 9.892937224374261e-07, + "loss": 2.7889, + "num_input_tokens_seen": 9888071680, + "step": 18860 + }, + { + "epoch": 0.9148829117185274, + "grad_norm": 0.2421875, + "learning_rate": 9.837178243045641e-07, + "loss": 2.7731, + "num_input_tokens_seen": 9890693120, + "step": 18865 + }, + { + "epoch": 0.915125393274774, + "grad_norm": 0.2451171875, + "learning_rate": 9.781573687913909e-07, + "loss": 2.7776, + "num_input_tokens_seen": 9893314560, + "step": 18870 + }, + { + "epoch": 0.9153678748310207, + "grad_norm": 0.2412109375, + "learning_rate": 9.726123594733072e-07, + "loss": 2.7727, + "num_input_tokens_seen": 9895936000, + "step": 18875 + }, + { + "epoch": 0.9156103563872673, + "grad_norm": 0.244140625, + "learning_rate": 9.67082799915789e-07, + "loss": 2.7768, + "num_input_tokens_seen": 9898557440, + "step": 18880 + }, + { + "epoch": 0.9158528379435139, + "grad_norm": 0.2412109375, + "learning_rate": 9.615686936743834e-07, + "loss": 2.7858, + "num_input_tokens_seen": 9901178880, + "step": 18885 + }, + { + "epoch": 0.9160953194997605, + "grad_norm": 0.2421875, + "learning_rate": 9.560700442946906e-07, + "loss": 2.7612, + "num_input_tokens_seen": 9903800320, + "step": 18890 + }, + { + "epoch": 0.9163378010560072, + "grad_norm": 0.2470703125, + "learning_rate": 9.50586855312377e-07, + "loss": 2.7813, + "num_input_tokens_seen": 9906421760, + "step": 18895 + }, + { + "epoch": 0.9165802826122538, + "grad_norm": 0.23828125, + "learning_rate": 9.451191302531693e-07, + "loss": 2.7823, + "num_input_tokens_seen": 9909043200, + "step": 18900 + }, + { + "epoch": 0.9165802826122538, + "eval_accuracy": 0.45599088096401236, + "eval_loss": 2.7415921688079834, + "eval_runtime": 5.8578, + "eval_samples_per_second": 51.213, + "eval_steps_per_second": 6.487, + "num_input_tokens_seen": 9909043200, + "step": 18900 + }, + { + "epoch": 0.9168227641685004, + "grad_norm": 0.236328125, + "learning_rate": 9.396668726328467e-07, + "loss": 2.767, + "num_input_tokens_seen": 9911664640, + "step": 18905 + }, + { + "epoch": 0.917065245724747, + "grad_norm": 0.240234375, + "learning_rate": 9.342300859572467e-07, + "loss": 2.756, + "num_input_tokens_seen": 9914286080, + "step": 18910 + }, + { + "epoch": 0.9173077272809936, + "grad_norm": 0.23828125, + "learning_rate": 9.288087737222562e-07, + "loss": 2.7867, + "num_input_tokens_seen": 9916907520, + "step": 18915 + }, + { + "epoch": 0.9175502088372404, + "grad_norm": 0.2451171875, + "learning_rate": 9.234029394138116e-07, + "loss": 2.7726, + "num_input_tokens_seen": 9919528960, + "step": 18920 + }, + { + "epoch": 0.917792690393487, + "grad_norm": 0.2451171875, + "learning_rate": 9.18012586507902e-07, + "loss": 2.7795, + "num_input_tokens_seen": 9922150400, + "step": 18925 + }, + { + "epoch": 0.9180351719497336, + "grad_norm": 0.240234375, + "learning_rate": 9.126377184705576e-07, + "loss": 2.7705, + "num_input_tokens_seen": 9924771840, + "step": 18930 + }, + { + "epoch": 0.9182776535059802, + "grad_norm": 0.244140625, + "learning_rate": 9.072783387578499e-07, + "loss": 2.7789, + "num_input_tokens_seen": 9927393280, + "step": 18935 + }, + { + "epoch": 0.9185201350622269, + "grad_norm": 0.2451171875, + "learning_rate": 9.019344508158945e-07, + "loss": 2.7743, + "num_input_tokens_seen": 9930014720, + "step": 18940 + }, + { + "epoch": 0.9187626166184735, + "grad_norm": 0.248046875, + "learning_rate": 8.9660605808084e-07, + "loss": 2.774, + "num_input_tokens_seen": 9932636160, + "step": 18945 + }, + { + "epoch": 0.9190050981747201, + "grad_norm": 0.2392578125, + "learning_rate": 8.912931639788847e-07, + "loss": 2.7861, + "num_input_tokens_seen": 9935257600, + "step": 18950 + }, + { + "epoch": 0.9192475797309667, + "grad_norm": 0.2431640625, + "learning_rate": 8.859957719262429e-07, + "loss": 2.7704, + "num_input_tokens_seen": 9937879040, + "step": 18955 + }, + { + "epoch": 0.9194900612872133, + "grad_norm": 0.251953125, + "learning_rate": 8.807138853291818e-07, + "loss": 2.7902, + "num_input_tokens_seen": 9940500480, + "step": 18960 + }, + { + "epoch": 0.91973254284346, + "grad_norm": 0.2431640625, + "learning_rate": 8.754475075839846e-07, + "loss": 2.7725, + "num_input_tokens_seen": 9943121920, + "step": 18965 + }, + { + "epoch": 0.9199750243997066, + "grad_norm": 0.2392578125, + "learning_rate": 8.701966420769591e-07, + "loss": 2.7651, + "num_input_tokens_seen": 9945743360, + "step": 18970 + }, + { + "epoch": 0.9202175059559532, + "grad_norm": 0.2431640625, + "learning_rate": 8.649612921844491e-07, + "loss": 2.7622, + "num_input_tokens_seen": 9948364800, + "step": 18975 + }, + { + "epoch": 0.9204599875121998, + "grad_norm": 0.244140625, + "learning_rate": 8.597414612728172e-07, + "loss": 2.7767, + "num_input_tokens_seen": 9950986240, + "step": 18980 + }, + { + "epoch": 0.9207024690684464, + "grad_norm": 0.2431640625, + "learning_rate": 8.54537152698448e-07, + "loss": 2.7723, + "num_input_tokens_seen": 9953607680, + "step": 18985 + }, + { + "epoch": 0.9209449506246931, + "grad_norm": 0.24609375, + "learning_rate": 8.493483698077398e-07, + "loss": 2.7736, + "num_input_tokens_seen": 9956229120, + "step": 18990 + }, + { + "epoch": 0.9211874321809398, + "grad_norm": 0.2412109375, + "learning_rate": 8.441751159371209e-07, + "loss": 2.7762, + "num_input_tokens_seen": 9958850560, + "step": 18995 + }, + { + "epoch": 0.9214299137371864, + "grad_norm": 0.255859375, + "learning_rate": 8.390173944130192e-07, + "loss": 2.7613, + "num_input_tokens_seen": 9961472000, + "step": 19000 + }, + { + "epoch": 0.921672395293433, + "grad_norm": 0.2392578125, + "learning_rate": 8.338752085518819e-07, + "loss": 2.7686, + "num_input_tokens_seen": 9964093440, + "step": 19005 + }, + { + "epoch": 0.9219148768496797, + "grad_norm": 0.240234375, + "learning_rate": 8.28748561660167e-07, + "loss": 2.7632, + "num_input_tokens_seen": 9966714880, + "step": 19010 + }, + { + "epoch": 0.9221573584059263, + "grad_norm": 0.25, + "learning_rate": 8.23637457034343e-07, + "loss": 2.7734, + "num_input_tokens_seen": 9969336320, + "step": 19015 + }, + { + "epoch": 0.9223998399621729, + "grad_norm": 0.2421875, + "learning_rate": 8.185418979608811e-07, + "loss": 2.7805, + "num_input_tokens_seen": 9971957760, + "step": 19020 + }, + { + "epoch": 0.9226423215184195, + "grad_norm": 0.2314453125, + "learning_rate": 8.134618877162631e-07, + "loss": 2.7605, + "num_input_tokens_seen": 9974579200, + "step": 19025 + }, + { + "epoch": 0.9228848030746661, + "grad_norm": 0.2421875, + "learning_rate": 8.083974295669566e-07, + "loss": 2.7682, + "num_input_tokens_seen": 9977200640, + "step": 19030 + }, + { + "epoch": 0.9231272846309128, + "grad_norm": 0.2392578125, + "learning_rate": 8.033485267694457e-07, + "loss": 2.7782, + "num_input_tokens_seen": 9979822080, + "step": 19035 + }, + { + "epoch": 0.9233697661871594, + "grad_norm": 0.2392578125, + "learning_rate": 7.983151825702085e-07, + "loss": 2.7764, + "num_input_tokens_seen": 9982443520, + "step": 19040 + }, + { + "epoch": 0.923612247743406, + "grad_norm": 0.2392578125, + "learning_rate": 7.932974002057115e-07, + "loss": 2.7665, + "num_input_tokens_seen": 9985064960, + "step": 19045 + }, + { + "epoch": 0.9238547292996526, + "grad_norm": 0.2431640625, + "learning_rate": 7.882951829024237e-07, + "loss": 2.7671, + "num_input_tokens_seen": 9987686400, + "step": 19050 + }, + { + "epoch": 0.9240972108558992, + "grad_norm": 0.2421875, + "learning_rate": 7.833085338768003e-07, + "loss": 2.7761, + "num_input_tokens_seen": 9990307840, + "step": 19055 + }, + { + "epoch": 0.9243396924121459, + "grad_norm": 0.234375, + "learning_rate": 7.783374563352902e-07, + "loss": 2.7673, + "num_input_tokens_seen": 9992929280, + "step": 19060 + }, + { + "epoch": 0.9245821739683925, + "grad_norm": 0.240234375, + "learning_rate": 7.73381953474328e-07, + "loss": 2.7913, + "num_input_tokens_seen": 9995550720, + "step": 19065 + }, + { + "epoch": 0.9248246555246391, + "grad_norm": 0.2451171875, + "learning_rate": 7.68442028480329e-07, + "loss": 2.7814, + "num_input_tokens_seen": 9998172160, + "step": 19070 + }, + { + "epoch": 0.9250671370808858, + "grad_norm": 0.2412109375, + "learning_rate": 7.635176845296966e-07, + "loss": 2.7733, + "num_input_tokens_seen": 10000793600, + "step": 19075 + }, + { + "epoch": 0.9253096186371325, + "grad_norm": 0.2412109375, + "learning_rate": 7.586089247888173e-07, + "loss": 2.7804, + "num_input_tokens_seen": 10003415040, + "step": 19080 + }, + { + "epoch": 0.9255521001933791, + "grad_norm": 0.2373046875, + "learning_rate": 7.537157524140554e-07, + "loss": 2.7858, + "num_input_tokens_seen": 10006036480, + "step": 19085 + }, + { + "epoch": 0.9257945817496257, + "grad_norm": 0.2431640625, + "learning_rate": 7.488381705517494e-07, + "loss": 2.7652, + "num_input_tokens_seen": 10008657920, + "step": 19090 + }, + { + "epoch": 0.9260370633058723, + "grad_norm": 0.2412109375, + "learning_rate": 7.439761823382129e-07, + "loss": 2.7694, + "num_input_tokens_seen": 10011279360, + "step": 19095 + }, + { + "epoch": 0.9262795448621189, + "grad_norm": 0.240234375, + "learning_rate": 7.391297908997341e-07, + "loss": 2.762, + "num_input_tokens_seen": 10013900800, + "step": 19100 + }, + { + "epoch": 0.9265220264183656, + "grad_norm": 0.2431640625, + "learning_rate": 7.342989993525784e-07, + "loss": 2.7734, + "num_input_tokens_seen": 10016522240, + "step": 19105 + }, + { + "epoch": 0.9267645079746122, + "grad_norm": 0.2353515625, + "learning_rate": 7.294838108029722e-07, + "loss": 2.768, + "num_input_tokens_seen": 10019143680, + "step": 19110 + }, + { + "epoch": 0.9270069895308588, + "grad_norm": 0.25, + "learning_rate": 7.246842283471084e-07, + "loss": 2.7644, + "num_input_tokens_seen": 10021765120, + "step": 19115 + }, + { + "epoch": 0.9272494710871054, + "grad_norm": 0.236328125, + "learning_rate": 7.199002550711542e-07, + "loss": 2.7643, + "num_input_tokens_seen": 10024386560, + "step": 19120 + }, + { + "epoch": 0.927491952643352, + "grad_norm": 0.248046875, + "learning_rate": 7.151318940512325e-07, + "loss": 2.7756, + "num_input_tokens_seen": 10027008000, + "step": 19125 + }, + { + "epoch": 0.9277344341995987, + "grad_norm": 0.2392578125, + "learning_rate": 7.103791483534267e-07, + "loss": 2.7716, + "num_input_tokens_seen": 10029629440, + "step": 19130 + }, + { + "epoch": 0.9279769157558453, + "grad_norm": 0.244140625, + "learning_rate": 7.056420210337866e-07, + "loss": 2.7803, + "num_input_tokens_seen": 10032250880, + "step": 19135 + }, + { + "epoch": 0.9282193973120919, + "grad_norm": 0.2392578125, + "learning_rate": 7.009205151383119e-07, + "loss": 2.778, + "num_input_tokens_seen": 10034872320, + "step": 19140 + }, + { + "epoch": 0.9284618788683385, + "grad_norm": 0.2412109375, + "learning_rate": 6.962146337029573e-07, + "loss": 2.7613, + "num_input_tokens_seen": 10037493760, + "step": 19145 + }, + { + "epoch": 0.9287043604245852, + "grad_norm": 0.244140625, + "learning_rate": 6.915243797536442e-07, + "loss": 2.7707, + "num_input_tokens_seen": 10040115200, + "step": 19150 + }, + { + "epoch": 0.9289468419808319, + "grad_norm": 0.2353515625, + "learning_rate": 6.868497563062237e-07, + "loss": 2.7964, + "num_input_tokens_seen": 10042736640, + "step": 19155 + }, + { + "epoch": 0.9291893235370785, + "grad_norm": 0.251953125, + "learning_rate": 6.821907663665111e-07, + "loss": 2.7723, + "num_input_tokens_seen": 10045358080, + "step": 19160 + }, + { + "epoch": 0.9294318050933251, + "grad_norm": 0.2470703125, + "learning_rate": 6.775474129302711e-07, + "loss": 2.7703, + "num_input_tokens_seen": 10047979520, + "step": 19165 + }, + { + "epoch": 0.9296742866495717, + "grad_norm": 0.2412109375, + "learning_rate": 6.729196989832043e-07, + "loss": 2.7921, + "num_input_tokens_seen": 10050600960, + "step": 19170 + }, + { + "epoch": 0.9299167682058184, + "grad_norm": 0.244140625, + "learning_rate": 6.683076275009581e-07, + "loss": 2.7561, + "num_input_tokens_seen": 10053222400, + "step": 19175 + }, + { + "epoch": 0.930159249762065, + "grad_norm": 0.2431640625, + "learning_rate": 6.637112014491298e-07, + "loss": 2.7668, + "num_input_tokens_seen": 10055843840, + "step": 19180 + }, + { + "epoch": 0.9304017313183116, + "grad_norm": 0.2353515625, + "learning_rate": 6.591304237832441e-07, + "loss": 2.7632, + "num_input_tokens_seen": 10058465280, + "step": 19185 + }, + { + "epoch": 0.9306442128745582, + "grad_norm": 0.23828125, + "learning_rate": 6.545652974487754e-07, + "loss": 2.7815, + "num_input_tokens_seen": 10061086720, + "step": 19190 + }, + { + "epoch": 0.9308866944308049, + "grad_norm": 0.2412109375, + "learning_rate": 6.500158253811228e-07, + "loss": 2.7864, + "num_input_tokens_seen": 10063708160, + "step": 19195 + }, + { + "epoch": 0.9311291759870515, + "grad_norm": 0.248046875, + "learning_rate": 6.45482010505627e-07, + "loss": 2.7767, + "num_input_tokens_seen": 10066329600, + "step": 19200 + }, + { + "epoch": 0.9311291759870515, + "eval_accuracy": 0.45601367855398145, + "eval_loss": 2.7415668964385986, + "eval_runtime": 5.8962, + "eval_samples_per_second": 50.88, + "eval_steps_per_second": 6.445, + "num_input_tokens_seen": 10066329600, + "step": 19200 + }, + { + "epoch": 0.9313716575432981, + "grad_norm": 0.244140625, + "learning_rate": 6.409638557375613e-07, + "loss": 2.7922, + "num_input_tokens_seen": 10068951040, + "step": 19205 + }, + { + "epoch": 0.9316141390995447, + "grad_norm": 0.24609375, + "learning_rate": 6.364613639821243e-07, + "loss": 2.7786, + "num_input_tokens_seen": 10071572480, + "step": 19210 + }, + { + "epoch": 0.9318566206557913, + "grad_norm": 0.251953125, + "learning_rate": 6.319745381344527e-07, + "loss": 2.7652, + "num_input_tokens_seen": 10074193920, + "step": 19215 + }, + { + "epoch": 0.932099102212038, + "grad_norm": 0.2392578125, + "learning_rate": 6.275033810795944e-07, + "loss": 2.7671, + "num_input_tokens_seen": 10076815360, + "step": 19220 + }, + { + "epoch": 0.9323415837682846, + "grad_norm": 0.244140625, + "learning_rate": 6.230478956925384e-07, + "loss": 2.7792, + "num_input_tokens_seen": 10079436800, + "step": 19225 + }, + { + "epoch": 0.9325840653245313, + "grad_norm": 0.2451171875, + "learning_rate": 6.186080848381876e-07, + "loss": 2.774, + "num_input_tokens_seen": 10082058240, + "step": 19230 + }, + { + "epoch": 0.9328265468807779, + "grad_norm": 0.2421875, + "learning_rate": 6.141839513713666e-07, + "loss": 2.7823, + "num_input_tokens_seen": 10084679680, + "step": 19235 + }, + { + "epoch": 0.9330690284370246, + "grad_norm": 0.2421875, + "learning_rate": 6.097754981368192e-07, + "loss": 2.7738, + "num_input_tokens_seen": 10087301120, + "step": 19240 + }, + { + "epoch": 0.9333115099932712, + "grad_norm": 0.2412109375, + "learning_rate": 6.05382727969217e-07, + "loss": 2.7682, + "num_input_tokens_seen": 10089922560, + "step": 19245 + }, + { + "epoch": 0.9335539915495178, + "grad_norm": 0.2353515625, + "learning_rate": 6.010056436931311e-07, + "loss": 2.777, + "num_input_tokens_seen": 10092544000, + "step": 19250 + }, + { + "epoch": 0.9337964731057644, + "grad_norm": 0.2373046875, + "learning_rate": 5.966442481230544e-07, + "loss": 2.7631, + "num_input_tokens_seen": 10095165440, + "step": 19255 + }, + { + "epoch": 0.934038954662011, + "grad_norm": 0.248046875, + "learning_rate": 5.922985440633965e-07, + "loss": 2.7715, + "num_input_tokens_seen": 10097786880, + "step": 19260 + }, + { + "epoch": 0.9342814362182577, + "grad_norm": 0.240234375, + "learning_rate": 5.879685343084668e-07, + "loss": 2.7787, + "num_input_tokens_seen": 10100408320, + "step": 19265 + }, + { + "epoch": 0.9345239177745043, + "grad_norm": 0.2421875, + "learning_rate": 5.836542216424907e-07, + "loss": 2.7713, + "num_input_tokens_seen": 10103029760, + "step": 19270 + }, + { + "epoch": 0.9347663993307509, + "grad_norm": 0.2431640625, + "learning_rate": 5.793556088396018e-07, + "loss": 2.7874, + "num_input_tokens_seen": 10105651200, + "step": 19275 + }, + { + "epoch": 0.9350088808869975, + "grad_norm": 0.240234375, + "learning_rate": 5.750726986638283e-07, + "loss": 2.7682, + "num_input_tokens_seen": 10108272640, + "step": 19280 + }, + { + "epoch": 0.9352513624432441, + "grad_norm": 0.25390625, + "learning_rate": 5.708054938691115e-07, + "loss": 2.7856, + "num_input_tokens_seen": 10110894080, + "step": 19285 + }, + { + "epoch": 0.9354938439994908, + "grad_norm": 0.2470703125, + "learning_rate": 5.665539971992928e-07, + "loss": 2.7906, + "num_input_tokens_seen": 10113515520, + "step": 19290 + }, + { + "epoch": 0.9357363255557374, + "grad_norm": 0.2470703125, + "learning_rate": 5.623182113881048e-07, + "loss": 2.7751, + "num_input_tokens_seen": 10116136960, + "step": 19295 + }, + { + "epoch": 0.935978807111984, + "grad_norm": 0.244140625, + "learning_rate": 5.580981391591911e-07, + "loss": 2.7692, + "num_input_tokens_seen": 10118758400, + "step": 19300 + }, + { + "epoch": 0.9362212886682306, + "grad_norm": 0.2470703125, + "learning_rate": 5.538937832260838e-07, + "loss": 2.7718, + "num_input_tokens_seen": 10121379840, + "step": 19305 + }, + { + "epoch": 0.9364637702244774, + "grad_norm": 0.23828125, + "learning_rate": 5.497051462922093e-07, + "loss": 2.787, + "num_input_tokens_seen": 10124001280, + "step": 19310 + }, + { + "epoch": 0.936706251780724, + "grad_norm": 0.248046875, + "learning_rate": 5.455322310508826e-07, + "loss": 2.7713, + "num_input_tokens_seen": 10126622720, + "step": 19315 + }, + { + "epoch": 0.9369487333369706, + "grad_norm": 0.2431640625, + "learning_rate": 5.413750401853213e-07, + "loss": 2.7678, + "num_input_tokens_seen": 10129244160, + "step": 19320 + }, + { + "epoch": 0.9371912148932172, + "grad_norm": 0.2431640625, + "learning_rate": 5.372335763686203e-07, + "loss": 2.7763, + "num_input_tokens_seen": 10131865600, + "step": 19325 + }, + { + "epoch": 0.9374336964494638, + "grad_norm": 0.2373046875, + "learning_rate": 5.331078422637692e-07, + "loss": 2.7631, + "num_input_tokens_seen": 10134487040, + "step": 19330 + }, + { + "epoch": 0.9376761780057105, + "grad_norm": 0.2421875, + "learning_rate": 5.289978405236429e-07, + "loss": 2.77, + "num_input_tokens_seen": 10137108480, + "step": 19335 + }, + { + "epoch": 0.9379186595619571, + "grad_norm": 0.240234375, + "learning_rate": 5.249035737909913e-07, + "loss": 2.7744, + "num_input_tokens_seen": 10139729920, + "step": 19340 + }, + { + "epoch": 0.9381611411182037, + "grad_norm": 0.236328125, + "learning_rate": 5.208250446984586e-07, + "loss": 2.7839, + "num_input_tokens_seen": 10142351360, + "step": 19345 + }, + { + "epoch": 0.9384036226744503, + "grad_norm": 0.2421875, + "learning_rate": 5.167622558685609e-07, + "loss": 2.7684, + "num_input_tokens_seen": 10144972800, + "step": 19350 + }, + { + "epoch": 0.9386461042306969, + "grad_norm": 0.240234375, + "learning_rate": 5.127152099137028e-07, + "loss": 2.765, + "num_input_tokens_seen": 10147594240, + "step": 19355 + }, + { + "epoch": 0.9388885857869436, + "grad_norm": 0.248046875, + "learning_rate": 5.086839094361557e-07, + "loss": 2.7745, + "num_input_tokens_seen": 10150215680, + "step": 19360 + }, + { + "epoch": 0.9391310673431902, + "grad_norm": 0.2392578125, + "learning_rate": 5.046683570280708e-07, + "loss": 2.7613, + "num_input_tokens_seen": 10152837120, + "step": 19365 + }, + { + "epoch": 0.9393735488994368, + "grad_norm": 0.2412109375, + "learning_rate": 5.006685552714802e-07, + "loss": 2.7718, + "num_input_tokens_seen": 10155458560, + "step": 19370 + }, + { + "epoch": 0.9396160304556834, + "grad_norm": 0.2353515625, + "learning_rate": 4.966845067382708e-07, + "loss": 2.761, + "num_input_tokens_seen": 10158080000, + "step": 19375 + }, + { + "epoch": 0.93985851201193, + "grad_norm": 0.2412109375, + "learning_rate": 4.927162139902186e-07, + "loss": 2.7875, + "num_input_tokens_seen": 10160701440, + "step": 19380 + }, + { + "epoch": 0.9401009935681767, + "grad_norm": 0.240234375, + "learning_rate": 4.887636795789574e-07, + "loss": 2.7785, + "num_input_tokens_seen": 10163322880, + "step": 19385 + }, + { + "epoch": 0.9403434751244234, + "grad_norm": 0.240234375, + "learning_rate": 4.848269060459904e-07, + "loss": 2.7833, + "num_input_tokens_seen": 10165944320, + "step": 19390 + }, + { + "epoch": 0.94058595668067, + "grad_norm": 0.248046875, + "learning_rate": 4.809058959226925e-07, + "loss": 2.781, + "num_input_tokens_seen": 10168565760, + "step": 19395 + }, + { + "epoch": 0.9408284382369166, + "grad_norm": 0.251953125, + "learning_rate": 4.770006517302917e-07, + "loss": 2.7736, + "num_input_tokens_seen": 10171187200, + "step": 19400 + }, + { + "epoch": 0.9410709197931633, + "grad_norm": 0.2451171875, + "learning_rate": 4.7311117597989007e-07, + "loss": 2.7747, + "num_input_tokens_seen": 10173808640, + "step": 19405 + }, + { + "epoch": 0.9413134013494099, + "grad_norm": 0.2421875, + "learning_rate": 4.692374711724401e-07, + "loss": 2.7705, + "num_input_tokens_seen": 10176430080, + "step": 19410 + }, + { + "epoch": 0.9415558829056565, + "grad_norm": 0.2412109375, + "learning_rate": 4.6537953979876035e-07, + "loss": 2.7655, + "num_input_tokens_seen": 10179051520, + "step": 19415 + }, + { + "epoch": 0.9417983644619031, + "grad_norm": 0.25, + "learning_rate": 4.615373843395249e-07, + "loss": 2.7514, + "num_input_tokens_seen": 10181672960, + "step": 19420 + }, + { + "epoch": 0.9420408460181497, + "grad_norm": 0.2412109375, + "learning_rate": 4.577110072652657e-07, + "loss": 2.7658, + "num_input_tokens_seen": 10184294400, + "step": 19425 + }, + { + "epoch": 0.9422833275743964, + "grad_norm": 0.248046875, + "learning_rate": 4.5390041103636484e-07, + "loss": 2.7706, + "num_input_tokens_seen": 10186915840, + "step": 19430 + }, + { + "epoch": 0.942525809130643, + "grad_norm": 0.2373046875, + "learning_rate": 4.501055981030594e-07, + "loss": 2.7677, + "num_input_tokens_seen": 10189537280, + "step": 19435 + }, + { + "epoch": 0.9427682906868896, + "grad_norm": 0.2421875, + "learning_rate": 4.463265709054365e-07, + "loss": 2.7691, + "num_input_tokens_seen": 10192158720, + "step": 19440 + }, + { + "epoch": 0.9430107722431362, + "grad_norm": 0.2421875, + "learning_rate": 4.425633318734357e-07, + "loss": 2.7725, + "num_input_tokens_seen": 10194780160, + "step": 19445 + }, + { + "epoch": 0.9432532537993829, + "grad_norm": 0.244140625, + "learning_rate": 4.3881588342684357e-07, + "loss": 2.7773, + "num_input_tokens_seen": 10197401600, + "step": 19450 + }, + { + "epoch": 0.9434957353556295, + "grad_norm": 0.2431640625, + "learning_rate": 4.350842279752937e-07, + "loss": 2.7832, + "num_input_tokens_seen": 10200023040, + "step": 19455 + }, + { + "epoch": 0.9437382169118761, + "grad_norm": 0.2392578125, + "learning_rate": 4.3136836791826395e-07, + "loss": 2.7669, + "num_input_tokens_seen": 10202644480, + "step": 19460 + }, + { + "epoch": 0.9439806984681227, + "grad_norm": 0.2451171875, + "learning_rate": 4.276683056450737e-07, + "loss": 2.7769, + "num_input_tokens_seen": 10205265920, + "step": 19465 + }, + { + "epoch": 0.9442231800243694, + "grad_norm": 0.23828125, + "learning_rate": 4.239840435348863e-07, + "loss": 2.7749, + "num_input_tokens_seen": 10207887360, + "step": 19470 + }, + { + "epoch": 0.9444656615806161, + "grad_norm": 0.2392578125, + "learning_rate": 4.203155839567069e-07, + "loss": 2.7921, + "num_input_tokens_seen": 10210508800, + "step": 19475 + }, + { + "epoch": 0.9447081431368627, + "grad_norm": 0.2421875, + "learning_rate": 4.166629292693791e-07, + "loss": 2.7716, + "num_input_tokens_seen": 10213130240, + "step": 19480 + }, + { + "epoch": 0.9449506246931093, + "grad_norm": 0.236328125, + "learning_rate": 4.1302608182157697e-07, + "loss": 2.7795, + "num_input_tokens_seen": 10215751680, + "step": 19485 + }, + { + "epoch": 0.9451931062493559, + "grad_norm": 0.2431640625, + "learning_rate": 4.0940504395182435e-07, + "loss": 2.7746, + "num_input_tokens_seen": 10218373120, + "step": 19490 + }, + { + "epoch": 0.9454355878056026, + "grad_norm": 0.244140625, + "learning_rate": 4.0579981798846423e-07, + "loss": 2.769, + "num_input_tokens_seen": 10220994560, + "step": 19495 + }, + { + "epoch": 0.9456780693618492, + "grad_norm": 0.2470703125, + "learning_rate": 4.0221040624968397e-07, + "loss": 2.7759, + "num_input_tokens_seen": 10223616000, + "step": 19500 + }, + { + "epoch": 0.9456780693618492, + "eval_accuracy": 0.4559973945611464, + "eval_loss": 2.7415828704833984, + "eval_runtime": 5.8729, + "eval_samples_per_second": 51.082, + "eval_steps_per_second": 6.47, + "num_input_tokens_seen": 10223616000, + "step": 19500 + }, + { + "epoch": 0.9459205509180958, + "grad_norm": 0.244140625, + "learning_rate": 3.986368110434929e-07, + "loss": 2.7689, + "num_input_tokens_seen": 10226237440, + "step": 19505 + }, + { + "epoch": 0.9461630324743424, + "grad_norm": 0.24609375, + "learning_rate": 3.950790346677391e-07, + "loss": 2.7671, + "num_input_tokens_seen": 10228858880, + "step": 19510 + }, + { + "epoch": 0.946405514030589, + "grad_norm": 0.244140625, + "learning_rate": 3.915370794100953e-07, + "loss": 2.7709, + "num_input_tokens_seen": 10231480320, + "step": 19515 + }, + { + "epoch": 0.9466479955868357, + "grad_norm": 0.240234375, + "learning_rate": 3.880109475480592e-07, + "loss": 2.7691, + "num_input_tokens_seen": 10234101760, + "step": 19520 + }, + { + "epoch": 0.9468904771430823, + "grad_norm": 0.236328125, + "learning_rate": 3.84500641348956e-07, + "loss": 2.7686, + "num_input_tokens_seen": 10236723200, + "step": 19525 + }, + { + "epoch": 0.9471329586993289, + "grad_norm": 0.236328125, + "learning_rate": 3.81006163069933e-07, + "loss": 2.7714, + "num_input_tokens_seen": 10239344640, + "step": 19530 + }, + { + "epoch": 0.9473754402555755, + "grad_norm": 0.2353515625, + "learning_rate": 3.775275149579649e-07, + "loss": 2.7857, + "num_input_tokens_seen": 10241966080, + "step": 19535 + }, + { + "epoch": 0.9476179218118221, + "grad_norm": 0.2470703125, + "learning_rate": 3.740646992498431e-07, + "loss": 2.7812, + "num_input_tokens_seen": 10244587520, + "step": 19540 + }, + { + "epoch": 0.9478604033680689, + "grad_norm": 0.2490234375, + "learning_rate": 3.706177181721782e-07, + "loss": 2.7824, + "num_input_tokens_seen": 10247208960, + "step": 19545 + }, + { + "epoch": 0.9481028849243155, + "grad_norm": 0.2421875, + "learning_rate": 3.6718657394140264e-07, + "loss": 2.7808, + "num_input_tokens_seen": 10249830400, + "step": 19550 + }, + { + "epoch": 0.9483453664805621, + "grad_norm": 0.2392578125, + "learning_rate": 3.6377126876376286e-07, + "loss": 2.7664, + "num_input_tokens_seen": 10252451840, + "step": 19555 + }, + { + "epoch": 0.9485878480368087, + "grad_norm": 0.248046875, + "learning_rate": 3.6037180483532163e-07, + "loss": 2.7897, + "num_input_tokens_seen": 10255073280, + "step": 19560 + }, + { + "epoch": 0.9488303295930554, + "grad_norm": 0.236328125, + "learning_rate": 3.569881843419526e-07, + "loss": 2.779, + "num_input_tokens_seen": 10257694720, + "step": 19565 + }, + { + "epoch": 0.949072811149302, + "grad_norm": 0.2412109375, + "learning_rate": 3.5362040945934873e-07, + "loss": 2.7659, + "num_input_tokens_seen": 10260316160, + "step": 19570 + }, + { + "epoch": 0.9493152927055486, + "grad_norm": 0.2431640625, + "learning_rate": 3.5026848235300836e-07, + "loss": 2.7726, + "num_input_tokens_seen": 10262937600, + "step": 19575 + }, + { + "epoch": 0.9495577742617952, + "grad_norm": 0.240234375, + "learning_rate": 3.4693240517824076e-07, + "loss": 2.7706, + "num_input_tokens_seen": 10265559040, + "step": 19580 + }, + { + "epoch": 0.9498002558180418, + "grad_norm": 0.2451171875, + "learning_rate": 3.4361218008016893e-07, + "loss": 2.7792, + "num_input_tokens_seen": 10268180480, + "step": 19585 + }, + { + "epoch": 0.9500427373742885, + "grad_norm": 0.25, + "learning_rate": 3.4030780919371284e-07, + "loss": 2.7616, + "num_input_tokens_seen": 10270801920, + "step": 19590 + }, + { + "epoch": 0.9502852189305351, + "grad_norm": 0.244140625, + "learning_rate": 3.3701929464360905e-07, + "loss": 2.7807, + "num_input_tokens_seen": 10273423360, + "step": 19595 + }, + { + "epoch": 0.9505277004867817, + "grad_norm": 0.244140625, + "learning_rate": 3.3374663854438825e-07, + "loss": 2.7795, + "num_input_tokens_seen": 10276044800, + "step": 19600 + }, + { + "epoch": 0.9507701820430283, + "grad_norm": 0.23828125, + "learning_rate": 3.304898430003894e-07, + "loss": 2.7736, + "num_input_tokens_seen": 10278666240, + "step": 19605 + }, + { + "epoch": 0.9510126635992749, + "grad_norm": 0.248046875, + "learning_rate": 3.272489101057541e-07, + "loss": 2.7625, + "num_input_tokens_seen": 10281287680, + "step": 19610 + }, + { + "epoch": 0.9512551451555216, + "grad_norm": 0.23828125, + "learning_rate": 3.2402384194442635e-07, + "loss": 2.7683, + "num_input_tokens_seen": 10283909120, + "step": 19615 + }, + { + "epoch": 0.9514976267117682, + "grad_norm": 0.23828125, + "learning_rate": 3.2081464059013635e-07, + "loss": 2.7761, + "num_input_tokens_seen": 10286530560, + "step": 19620 + }, + { + "epoch": 0.9517401082680149, + "grad_norm": 0.240234375, + "learning_rate": 3.1762130810642497e-07, + "loss": 2.7908, + "num_input_tokens_seen": 10289152000, + "step": 19625 + }, + { + "epoch": 0.9519825898242615, + "grad_norm": 0.2421875, + "learning_rate": 3.144438465466276e-07, + "loss": 2.7889, + "num_input_tokens_seen": 10291773440, + "step": 19630 + }, + { + "epoch": 0.9522250713805082, + "grad_norm": 0.2353515625, + "learning_rate": 3.1128225795386545e-07, + "loss": 2.7676, + "num_input_tokens_seen": 10294394880, + "step": 19635 + }, + { + "epoch": 0.9524675529367548, + "grad_norm": 0.240234375, + "learning_rate": 3.0813654436106787e-07, + "loss": 2.7925, + "num_input_tokens_seen": 10297016320, + "step": 19640 + }, + { + "epoch": 0.9527100344930014, + "grad_norm": 0.24609375, + "learning_rate": 3.0500670779094186e-07, + "loss": 2.7682, + "num_input_tokens_seen": 10299637760, + "step": 19645 + }, + { + "epoch": 0.952952516049248, + "grad_norm": 0.240234375, + "learning_rate": 3.0189275025599706e-07, + "loss": 2.7753, + "num_input_tokens_seen": 10302259200, + "step": 19650 + }, + { + "epoch": 0.9531949976054946, + "grad_norm": 0.23828125, + "learning_rate": 2.9879467375852065e-07, + "loss": 2.7743, + "num_input_tokens_seen": 10304880640, + "step": 19655 + }, + { + "epoch": 0.9534374791617413, + "grad_norm": 0.24609375, + "learning_rate": 2.957124802906025e-07, + "loss": 2.7639, + "num_input_tokens_seen": 10307502080, + "step": 19660 + }, + { + "epoch": 0.9536799607179879, + "grad_norm": 0.2392578125, + "learning_rate": 2.926461718341073e-07, + "loss": 2.7674, + "num_input_tokens_seen": 10310123520, + "step": 19665 + }, + { + "epoch": 0.9539224422742345, + "grad_norm": 0.244140625, + "learning_rate": 2.895957503606939e-07, + "loss": 2.7662, + "num_input_tokens_seen": 10312744960, + "step": 19670 + }, + { + "epoch": 0.9541649238304811, + "grad_norm": 0.244140625, + "learning_rate": 2.8656121783180447e-07, + "loss": 2.7779, + "num_input_tokens_seen": 10315366400, + "step": 19675 + }, + { + "epoch": 0.9544074053867277, + "grad_norm": 0.234375, + "learning_rate": 2.835425761986532e-07, + "loss": 2.7761, + "num_input_tokens_seen": 10317987840, + "step": 19680 + }, + { + "epoch": 0.9546498869429744, + "grad_norm": 0.2373046875, + "learning_rate": 2.805398274022514e-07, + "loss": 2.7715, + "num_input_tokens_seen": 10320609280, + "step": 19685 + }, + { + "epoch": 0.954892368499221, + "grad_norm": 0.25, + "learning_rate": 2.775529733733878e-07, + "loss": 2.7503, + "num_input_tokens_seen": 10323230720, + "step": 19690 + }, + { + "epoch": 0.9551348500554676, + "grad_norm": 0.2392578125, + "learning_rate": 2.7458201603262344e-07, + "loss": 2.7773, + "num_input_tokens_seen": 10325852160, + "step": 19695 + }, + { + "epoch": 0.9553773316117142, + "grad_norm": 0.234375, + "learning_rate": 2.7162695729030517e-07, + "loss": 2.7728, + "num_input_tokens_seen": 10328473600, + "step": 19700 + }, + { + "epoch": 0.955619813167961, + "grad_norm": 0.2353515625, + "learning_rate": 2.6868779904655475e-07, + "loss": 2.7788, + "num_input_tokens_seen": 10331095040, + "step": 19705 + }, + { + "epoch": 0.9558622947242076, + "grad_norm": 0.240234375, + "learning_rate": 2.657645431912714e-07, + "loss": 2.7761, + "num_input_tokens_seen": 10333716480, + "step": 19710 + }, + { + "epoch": 0.9561047762804542, + "grad_norm": 0.23828125, + "learning_rate": 2.628571916041184e-07, + "loss": 2.7804, + "num_input_tokens_seen": 10336337920, + "step": 19715 + }, + { + "epoch": 0.9563472578367008, + "grad_norm": 0.2412109375, + "learning_rate": 2.5996574615455015e-07, + "loss": 2.7756, + "num_input_tokens_seen": 10338959360, + "step": 19720 + }, + { + "epoch": 0.9565897393929474, + "grad_norm": 0.2392578125, + "learning_rate": 2.570902087017768e-07, + "loss": 2.7901, + "num_input_tokens_seen": 10341580800, + "step": 19725 + }, + { + "epoch": 0.9568322209491941, + "grad_norm": 0.2470703125, + "learning_rate": 2.5423058109479427e-07, + "loss": 2.7712, + "num_input_tokens_seen": 10344202240, + "step": 19730 + }, + { + "epoch": 0.9570747025054407, + "grad_norm": 0.2373046875, + "learning_rate": 2.513868651723539e-07, + "loss": 2.7652, + "num_input_tokens_seen": 10346823680, + "step": 19735 + }, + { + "epoch": 0.9573171840616873, + "grad_norm": 0.2431640625, + "learning_rate": 2.485590627629902e-07, + "loss": 2.7714, + "num_input_tokens_seen": 10349445120, + "step": 19740 + }, + { + "epoch": 0.9575596656179339, + "grad_norm": 0.24609375, + "learning_rate": 2.457471756849905e-07, + "loss": 2.7796, + "num_input_tokens_seen": 10352066560, + "step": 19745 + }, + { + "epoch": 0.9578021471741806, + "grad_norm": 0.234375, + "learning_rate": 2.429512057464195e-07, + "loss": 2.7815, + "num_input_tokens_seen": 10354688000, + "step": 19750 + }, + { + "epoch": 0.9580446287304272, + "grad_norm": 0.2373046875, + "learning_rate": 2.40171154745103e-07, + "loss": 2.7736, + "num_input_tokens_seen": 10357309440, + "step": 19755 + }, + { + "epoch": 0.9582871102866738, + "grad_norm": 0.2392578125, + "learning_rate": 2.3740702446863327e-07, + "loss": 2.7773, + "num_input_tokens_seen": 10359930880, + "step": 19760 + }, + { + "epoch": 0.9585295918429204, + "grad_norm": 0.24609375, + "learning_rate": 2.3465881669435807e-07, + "loss": 2.7771, + "num_input_tokens_seen": 10362552320, + "step": 19765 + }, + { + "epoch": 0.958772073399167, + "grad_norm": 0.2353515625, + "learning_rate": 2.3192653318939994e-07, + "loss": 2.7682, + "num_input_tokens_seen": 10365173760, + "step": 19770 + }, + { + "epoch": 0.9590145549554137, + "grad_norm": 0.2412109375, + "learning_rate": 2.2921017571062575e-07, + "loss": 2.7875, + "num_input_tokens_seen": 10367795200, + "step": 19775 + }, + { + "epoch": 0.9592570365116603, + "grad_norm": 0.2451171875, + "learning_rate": 2.2650974600467444e-07, + "loss": 2.7694, + "num_input_tokens_seen": 10370416640, + "step": 19780 + }, + { + "epoch": 0.959499518067907, + "grad_norm": 0.251953125, + "learning_rate": 2.238252458079404e-07, + "loss": 2.7656, + "num_input_tokens_seen": 10373038080, + "step": 19785 + }, + { + "epoch": 0.9597419996241536, + "grad_norm": 0.2421875, + "learning_rate": 2.2115667684657337e-07, + "loss": 2.7817, + "num_input_tokens_seen": 10375659520, + "step": 19790 + }, + { + "epoch": 0.9599844811804003, + "grad_norm": 0.2412109375, + "learning_rate": 2.1850404083647857e-07, + "loss": 2.7811, + "num_input_tokens_seen": 10378280960, + "step": 19795 + }, + { + "epoch": 0.9602269627366469, + "grad_norm": 0.248046875, + "learning_rate": 2.158673394833166e-07, + "loss": 2.7722, + "num_input_tokens_seen": 10380902400, + "step": 19800 + }, + { + "epoch": 0.9602269627366469, + "eval_accuracy": 0.4560039081582804, + "eval_loss": 2.741548776626587, + "eval_runtime": 5.9407, + "eval_samples_per_second": 50.499, + "eval_steps_per_second": 6.397, + "num_input_tokens_seen": 10380902400, + "step": 19800 + }, + { + "epoch": 0.9604694442928935, + "grad_norm": 0.2412109375, + "learning_rate": 2.1324657448250628e-07, + "loss": 2.7782, + "num_input_tokens_seen": 10383523840, + "step": 19805 + }, + { + "epoch": 0.9607119258491401, + "grad_norm": 0.2412109375, + "learning_rate": 2.106417475192135e-07, + "loss": 2.7799, + "num_input_tokens_seen": 10386145280, + "step": 19810 + }, + { + "epoch": 0.9609544074053867, + "grad_norm": 0.236328125, + "learning_rate": 2.0805286026835958e-07, + "loss": 2.7777, + "num_input_tokens_seen": 10388766720, + "step": 19815 + }, + { + "epoch": 0.9611968889616334, + "grad_norm": 0.255859375, + "learning_rate": 2.054799143946129e-07, + "loss": 2.7767, + "num_input_tokens_seen": 10391388160, + "step": 19820 + }, + { + "epoch": 0.96143937051788, + "grad_norm": 0.2412109375, + "learning_rate": 2.029229115523973e-07, + "loss": 2.7986, + "num_input_tokens_seen": 10394009600, + "step": 19825 + }, + { + "epoch": 0.9616818520741266, + "grad_norm": 0.2421875, + "learning_rate": 2.003818533858809e-07, + "loss": 2.7712, + "num_input_tokens_seen": 10396631040, + "step": 19830 + }, + { + "epoch": 0.9619243336303732, + "grad_norm": 0.2392578125, + "learning_rate": 1.9785674152897616e-07, + "loss": 2.7747, + "num_input_tokens_seen": 10399252480, + "step": 19835 + }, + { + "epoch": 0.9621668151866198, + "grad_norm": 0.2353515625, + "learning_rate": 1.9534757760534817e-07, + "loss": 2.7692, + "num_input_tokens_seen": 10401873920, + "step": 19840 + }, + { + "epoch": 0.9624092967428665, + "grad_norm": 0.2431640625, + "learning_rate": 1.9285436322840633e-07, + "loss": 2.762, + "num_input_tokens_seen": 10404495360, + "step": 19845 + }, + { + "epoch": 0.9626517782991131, + "grad_norm": 0.2431640625, + "learning_rate": 1.9037710000130438e-07, + "loss": 2.7655, + "num_input_tokens_seen": 10407116800, + "step": 19850 + }, + { + "epoch": 0.9628942598553597, + "grad_norm": 0.24609375, + "learning_rate": 1.87915789516932e-07, + "loss": 2.7766, + "num_input_tokens_seen": 10409738240, + "step": 19855 + }, + { + "epoch": 0.9631367414116064, + "grad_norm": 0.2421875, + "learning_rate": 1.8547043335793435e-07, + "loss": 2.7652, + "num_input_tokens_seen": 10412359680, + "step": 19860 + }, + { + "epoch": 0.963379222967853, + "grad_norm": 0.248046875, + "learning_rate": 1.830410330966842e-07, + "loss": 2.779, + "num_input_tokens_seen": 10414981120, + "step": 19865 + }, + { + "epoch": 0.9636217045240997, + "grad_norm": 0.2421875, + "learning_rate": 1.8062759029530696e-07, + "loss": 2.7745, + "num_input_tokens_seen": 10417602560, + "step": 19870 + }, + { + "epoch": 0.9638641860803463, + "grad_norm": 0.2373046875, + "learning_rate": 1.7823010650565852e-07, + "loss": 2.7718, + "num_input_tokens_seen": 10420224000, + "step": 19875 + }, + { + "epoch": 0.9641066676365929, + "grad_norm": 0.25, + "learning_rate": 1.758485832693335e-07, + "loss": 2.7801, + "num_input_tokens_seen": 10422845440, + "step": 19880 + }, + { + "epoch": 0.9643491491928395, + "grad_norm": 0.2421875, + "learning_rate": 1.7348302211767087e-07, + "loss": 2.7739, + "num_input_tokens_seen": 10425466880, + "step": 19885 + }, + { + "epoch": 0.9645916307490862, + "grad_norm": 0.248046875, + "learning_rate": 1.7113342457173996e-07, + "loss": 2.7781, + "num_input_tokens_seen": 10428088320, + "step": 19890 + }, + { + "epoch": 0.9648341123053328, + "grad_norm": 0.2412109375, + "learning_rate": 1.6879979214234898e-07, + "loss": 2.7845, + "num_input_tokens_seen": 10430709760, + "step": 19895 + }, + { + "epoch": 0.9650765938615794, + "grad_norm": 0.2451171875, + "learning_rate": 1.664821263300309e-07, + "loss": 2.7658, + "num_input_tokens_seen": 10433331200, + "step": 19900 + }, + { + "epoch": 0.965319075417826, + "grad_norm": 0.244140625, + "learning_rate": 1.641804286250659e-07, + "loss": 2.753, + "num_input_tokens_seen": 10435952640, + "step": 19905 + }, + { + "epoch": 0.9655615569740726, + "grad_norm": 0.2421875, + "learning_rate": 1.6189470050745615e-07, + "loss": 2.7827, + "num_input_tokens_seen": 10438574080, + "step": 19910 + }, + { + "epoch": 0.9658040385303193, + "grad_norm": 0.2431640625, + "learning_rate": 1.596249434469399e-07, + "loss": 2.7746, + "num_input_tokens_seen": 10441195520, + "step": 19915 + }, + { + "epoch": 0.9660465200865659, + "grad_norm": 0.2373046875, + "learning_rate": 1.573711589029858e-07, + "loss": 2.7763, + "num_input_tokens_seen": 10443816960, + "step": 19920 + }, + { + "epoch": 0.9662890016428125, + "grad_norm": 0.2412109375, + "learning_rate": 1.5513334832479575e-07, + "loss": 2.7699, + "num_input_tokens_seen": 10446438400, + "step": 19925 + }, + { + "epoch": 0.9665314831990591, + "grad_norm": 0.2421875, + "learning_rate": 1.5291151315129093e-07, + "loss": 2.7926, + "num_input_tokens_seen": 10449059840, + "step": 19930 + }, + { + "epoch": 0.9667739647553057, + "grad_norm": 0.240234375, + "learning_rate": 1.5070565481112297e-07, + "loss": 2.7908, + "num_input_tokens_seen": 10451681280, + "step": 19935 + }, + { + "epoch": 0.9670164463115525, + "grad_norm": 0.2421875, + "learning_rate": 1.4851577472267952e-07, + "loss": 2.7765, + "num_input_tokens_seen": 10454302720, + "step": 19940 + }, + { + "epoch": 0.9672589278677991, + "grad_norm": 0.2421875, + "learning_rate": 1.46341874294062e-07, + "loss": 2.7696, + "num_input_tokens_seen": 10456924160, + "step": 19945 + }, + { + "epoch": 0.9675014094240457, + "grad_norm": 0.2431640625, + "learning_rate": 1.441839549231022e-07, + "loss": 2.7869, + "num_input_tokens_seen": 10459545600, + "step": 19950 + }, + { + "epoch": 0.9677438909802923, + "grad_norm": 0.2431640625, + "learning_rate": 1.4204201799735973e-07, + "loss": 2.7702, + "num_input_tokens_seen": 10462167040, + "step": 19955 + }, + { + "epoch": 0.967986372536539, + "grad_norm": 0.2490234375, + "learning_rate": 1.399160648941078e-07, + "loss": 2.7836, + "num_input_tokens_seen": 10464788480, + "step": 19960 + }, + { + "epoch": 0.9682288540927856, + "grad_norm": 0.2431640625, + "learning_rate": 1.3780609698035296e-07, + "loss": 2.762, + "num_input_tokens_seen": 10467409920, + "step": 19965 + }, + { + "epoch": 0.9684713356490322, + "grad_norm": 0.2353515625, + "learning_rate": 1.357121156128127e-07, + "loss": 2.7724, + "num_input_tokens_seen": 10470031360, + "step": 19970 + }, + { + "epoch": 0.9687138172052788, + "grad_norm": 0.236328125, + "learning_rate": 1.3363412213793226e-07, + "loss": 2.7693, + "num_input_tokens_seen": 10472652800, + "step": 19975 + }, + { + "epoch": 0.9689562987615254, + "grad_norm": 0.2421875, + "learning_rate": 1.3157211789187885e-07, + "loss": 2.7787, + "num_input_tokens_seen": 10475274240, + "step": 19980 + }, + { + "epoch": 0.9691987803177721, + "grad_norm": 0.2373046875, + "learning_rate": 1.29526104200528e-07, + "loss": 2.7758, + "num_input_tokens_seen": 10477895680, + "step": 19985 + }, + { + "epoch": 0.9694412618740187, + "grad_norm": 0.2431640625, + "learning_rate": 1.2749608237948296e-07, + "loss": 2.7705, + "num_input_tokens_seen": 10480517120, + "step": 19990 + }, + { + "epoch": 0.9696837434302653, + "grad_norm": 0.2412109375, + "learning_rate": 1.2548205373405508e-07, + "loss": 2.7784, + "num_input_tokens_seen": 10483138560, + "step": 19995 + }, + { + "epoch": 0.9699262249865119, + "grad_norm": 0.2451171875, + "learning_rate": 1.2348401955928623e-07, + "loss": 2.7826, + "num_input_tokens_seen": 10485760000, + "step": 20000 + }, + { + "epoch": 0.9701687065427586, + "grad_norm": 0.2373046875, + "learning_rate": 1.215019811399154e-07, + "loss": 2.7817, + "num_input_tokens_seen": 10488381440, + "step": 20005 + }, + { + "epoch": 0.9704111880990052, + "grad_norm": 0.2421875, + "learning_rate": 1.1953593975041477e-07, + "loss": 2.771, + "num_input_tokens_seen": 10491002880, + "step": 20010 + }, + { + "epoch": 0.9706536696552518, + "grad_norm": 0.244140625, + "learning_rate": 1.1758589665495368e-07, + "loss": 2.7607, + "num_input_tokens_seen": 10493624320, + "step": 20015 + }, + { + "epoch": 0.9708961512114985, + "grad_norm": 0.23828125, + "learning_rate": 1.1565185310742632e-07, + "loss": 2.7761, + "num_input_tokens_seen": 10496245760, + "step": 20020 + }, + { + "epoch": 0.9711386327677451, + "grad_norm": 0.2431640625, + "learning_rate": 1.1373381035143239e-07, + "loss": 2.7788, + "num_input_tokens_seen": 10498867200, + "step": 20025 + }, + { + "epoch": 0.9713811143239918, + "grad_norm": 0.2431640625, + "learning_rate": 1.1183176962028808e-07, + "loss": 2.7646, + "num_input_tokens_seen": 10501488640, + "step": 20030 + }, + { + "epoch": 0.9716235958802384, + "grad_norm": 0.2490234375, + "learning_rate": 1.0994573213701509e-07, + "loss": 2.7843, + "num_input_tokens_seen": 10504110080, + "step": 20035 + }, + { + "epoch": 0.971866077436485, + "grad_norm": 0.232421875, + "learning_rate": 1.0807569911434611e-07, + "loss": 2.781, + "num_input_tokens_seen": 10506731520, + "step": 20040 + }, + { + "epoch": 0.9721085589927316, + "grad_norm": 0.2392578125, + "learning_rate": 1.0622167175472763e-07, + "loss": 2.7677, + "num_input_tokens_seen": 10509352960, + "step": 20045 + }, + { + "epoch": 0.9723510405489783, + "grad_norm": 0.2421875, + "learning_rate": 1.0438365125031157e-07, + "loss": 2.7833, + "num_input_tokens_seen": 10511974400, + "step": 20050 + }, + { + "epoch": 0.9725935221052249, + "grad_norm": 0.2421875, + "learning_rate": 1.0256163878295255e-07, + "loss": 2.7805, + "num_input_tokens_seen": 10514595840, + "step": 20055 + }, + { + "epoch": 0.9728360036614715, + "grad_norm": 0.2431640625, + "learning_rate": 1.0075563552421896e-07, + "loss": 2.7851, + "num_input_tokens_seen": 10517217280, + "step": 20060 + }, + { + "epoch": 0.9730784852177181, + "grad_norm": 0.240234375, + "learning_rate": 9.89656426353791e-08, + "loss": 2.7839, + "num_input_tokens_seen": 10519838720, + "step": 20065 + }, + { + "epoch": 0.9733209667739647, + "grad_norm": 0.25, + "learning_rate": 9.71916612674123e-08, + "loss": 2.7593, + "num_input_tokens_seen": 10522460160, + "step": 20070 + }, + { + "epoch": 0.9735634483302114, + "grad_norm": 0.2412109375, + "learning_rate": 9.543369256100055e-08, + "loss": 2.7544, + "num_input_tokens_seen": 10525081600, + "step": 20075 + }, + { + "epoch": 0.973805929886458, + "grad_norm": 0.2451171875, + "learning_rate": 9.369173764652572e-08, + "loss": 2.7705, + "num_input_tokens_seen": 10527703040, + "step": 20080 + }, + { + "epoch": 0.9740484114427046, + "grad_norm": 0.23828125, + "learning_rate": 9.196579764407797e-08, + "loss": 2.7658, + "num_input_tokens_seen": 10530324480, + "step": 20085 + }, + { + "epoch": 0.9742908929989512, + "grad_norm": 0.2412109375, + "learning_rate": 9.025587366344456e-08, + "loss": 2.7833, + "num_input_tokens_seen": 10532945920, + "step": 20090 + }, + { + "epoch": 0.9745333745551978, + "grad_norm": 0.24609375, + "learning_rate": 8.856196680412099e-08, + "loss": 2.7785, + "num_input_tokens_seen": 10535567360, + "step": 20095 + }, + { + "epoch": 0.9747758561114446, + "grad_norm": 0.255859375, + "learning_rate": 8.688407815529709e-08, + "loss": 2.7764, + "num_input_tokens_seen": 10538188800, + "step": 20100 + }, + { + "epoch": 0.9747758561114446, + "eval_accuracy": 0.4559745969711773, + "eval_loss": 2.741562604904175, + "eval_runtime": 5.908, + "eval_samples_per_second": 50.778, + "eval_steps_per_second": 6.432, + "num_input_tokens_seen": 10538188800, + "step": 20100 + }, + { + "epoch": 0.9750183376676912, + "grad_norm": 0.2421875, + "learning_rate": 8.522220879586818e-08, + "loss": 2.771, + "num_input_tokens_seen": 10540810240, + "step": 20105 + }, + { + "epoch": 0.9752608192239378, + "grad_norm": 0.236328125, + "learning_rate": 8.357635979442668e-08, + "loss": 2.7807, + "num_input_tokens_seen": 10543431680, + "step": 20110 + }, + { + "epoch": 0.9755033007801844, + "grad_norm": 0.248046875, + "learning_rate": 8.194653220926219e-08, + "loss": 2.7655, + "num_input_tokens_seen": 10546053120, + "step": 20115 + }, + { + "epoch": 0.9757457823364311, + "grad_norm": 0.2451171875, + "learning_rate": 8.033272708836414e-08, + "loss": 2.7725, + "num_input_tokens_seen": 10548674560, + "step": 20120 + }, + { + "epoch": 0.9759882638926777, + "grad_norm": 0.25, + "learning_rate": 7.873494546941917e-08, + "loss": 2.7939, + "num_input_tokens_seen": 10551296000, + "step": 20125 + }, + { + "epoch": 0.9762307454489243, + "grad_norm": 0.2392578125, + "learning_rate": 7.7153188379811e-08, + "loss": 2.7654, + "num_input_tokens_seen": 10553917440, + "step": 20130 + }, + { + "epoch": 0.9764732270051709, + "grad_norm": 0.2451171875, + "learning_rate": 7.558745683662049e-08, + "loss": 2.7739, + "num_input_tokens_seen": 10556538880, + "step": 20135 + }, + { + "epoch": 0.9767157085614175, + "grad_norm": 0.23828125, + "learning_rate": 7.40377518466201e-08, + "loss": 2.7829, + "num_input_tokens_seen": 10559160320, + "step": 20140 + }, + { + "epoch": 0.9769581901176642, + "grad_norm": 0.248046875, + "learning_rate": 7.250407440628493e-08, + "loss": 2.7772, + "num_input_tokens_seen": 10561781760, + "step": 20145 + }, + { + "epoch": 0.9772006716739108, + "grad_norm": 0.248046875, + "learning_rate": 7.098642550177336e-08, + "loss": 2.7752, + "num_input_tokens_seen": 10564403200, + "step": 20150 + }, + { + "epoch": 0.9774431532301574, + "grad_norm": 0.2392578125, + "learning_rate": 6.948480610894648e-08, + "loss": 2.7711, + "num_input_tokens_seen": 10567024640, + "step": 20155 + }, + { + "epoch": 0.977685634786404, + "grad_norm": 0.23828125, + "learning_rate": 6.799921719335411e-08, + "loss": 2.7867, + "num_input_tokens_seen": 10569646080, + "step": 20160 + }, + { + "epoch": 0.9779281163426506, + "grad_norm": 0.240234375, + "learning_rate": 6.65296597102405e-08, + "loss": 2.78, + "num_input_tokens_seen": 10572267520, + "step": 20165 + }, + { + "epoch": 0.9781705978988973, + "grad_norm": 0.2431640625, + "learning_rate": 6.507613460453865e-08, + "loss": 2.7664, + "num_input_tokens_seen": 10574888960, + "step": 20170 + }, + { + "epoch": 0.978413079455144, + "grad_norm": 0.2392578125, + "learning_rate": 6.363864281087595e-08, + "loss": 2.7752, + "num_input_tokens_seen": 10577510400, + "step": 20175 + }, + { + "epoch": 0.9786555610113906, + "grad_norm": 0.2421875, + "learning_rate": 6.221718525356579e-08, + "loss": 2.7806, + "num_input_tokens_seen": 10580131840, + "step": 20180 + }, + { + "epoch": 0.9788980425676372, + "grad_norm": 0.240234375, + "learning_rate": 6.081176284661594e-08, + "loss": 2.7785, + "num_input_tokens_seen": 10582753280, + "step": 20185 + }, + { + "epoch": 0.9791405241238839, + "grad_norm": 0.23828125, + "learning_rate": 5.9422376493722953e-08, + "loss": 2.7799, + "num_input_tokens_seen": 10585374720, + "step": 20190 + }, + { + "epoch": 0.9793830056801305, + "grad_norm": 0.244140625, + "learning_rate": 5.804902708826665e-08, + "loss": 2.7928, + "num_input_tokens_seen": 10587996160, + "step": 20195 + }, + { + "epoch": 0.9796254872363771, + "grad_norm": 0.2421875, + "learning_rate": 5.669171551332675e-08, + "loss": 2.7681, + "num_input_tokens_seen": 10590617600, + "step": 20200 + }, + { + "epoch": 0.9798679687926237, + "grad_norm": 0.2392578125, + "learning_rate": 5.535044264165512e-08, + "loss": 2.7884, + "num_input_tokens_seen": 10593239040, + "step": 20205 + }, + { + "epoch": 0.9801104503488703, + "grad_norm": 0.2412109375, + "learning_rate": 5.402520933570354e-08, + "loss": 2.7794, + "num_input_tokens_seen": 10595860480, + "step": 20210 + }, + { + "epoch": 0.980352931905117, + "grad_norm": 0.23828125, + "learning_rate": 5.271601644760426e-08, + "loss": 2.7662, + "num_input_tokens_seen": 10598481920, + "step": 20215 + }, + { + "epoch": 0.9805954134613636, + "grad_norm": 0.248046875, + "learning_rate": 5.1422864819178354e-08, + "loss": 2.7762, + "num_input_tokens_seen": 10601103360, + "step": 20220 + }, + { + "epoch": 0.9808378950176102, + "grad_norm": 0.244140625, + "learning_rate": 5.0145755281924556e-08, + "loss": 2.7787, + "num_input_tokens_seen": 10603724800, + "step": 20225 + }, + { + "epoch": 0.9810803765738568, + "grad_norm": 0.2431640625, + "learning_rate": 4.888468865703877e-08, + "loss": 2.7703, + "num_input_tokens_seen": 10606346240, + "step": 20230 + }, + { + "epoch": 0.9813228581301034, + "grad_norm": 0.2392578125, + "learning_rate": 4.76396657553918e-08, + "loss": 2.7845, + "num_input_tokens_seen": 10608967680, + "step": 20235 + }, + { + "epoch": 0.9815653396863501, + "grad_norm": 0.23828125, + "learning_rate": 4.6410687377540505e-08, + "loss": 2.7719, + "num_input_tokens_seen": 10611589120, + "step": 20240 + }, + { + "epoch": 0.9818078212425967, + "grad_norm": 0.2490234375, + "learning_rate": 4.519775431372775e-08, + "loss": 2.779, + "num_input_tokens_seen": 10614210560, + "step": 20245 + }, + { + "epoch": 0.9820503027988433, + "grad_norm": 0.240234375, + "learning_rate": 4.40008673438741e-08, + "loss": 2.77, + "num_input_tokens_seen": 10616832000, + "step": 20250 + }, + { + "epoch": 0.98229278435509, + "grad_norm": 0.232421875, + "learning_rate": 4.282002723758616e-08, + "loss": 2.7562, + "num_input_tokens_seen": 10619453440, + "step": 20255 + }, + { + "epoch": 0.9825352659113367, + "grad_norm": 0.244140625, + "learning_rate": 4.165523475415656e-08, + "loss": 2.7774, + "num_input_tokens_seen": 10622074880, + "step": 20260 + }, + { + "epoch": 0.9827777474675833, + "grad_norm": 0.2490234375, + "learning_rate": 4.05064906425473e-08, + "loss": 2.7768, + "num_input_tokens_seen": 10624696320, + "step": 20265 + }, + { + "epoch": 0.9830202290238299, + "grad_norm": 0.2333984375, + "learning_rate": 3.937379564141197e-08, + "loss": 2.7821, + "num_input_tokens_seen": 10627317760, + "step": 20270 + }, + { + "epoch": 0.9832627105800765, + "grad_norm": 0.24609375, + "learning_rate": 3.8257150479079073e-08, + "loss": 2.7754, + "num_input_tokens_seen": 10629939200, + "step": 20275 + }, + { + "epoch": 0.9835051921363231, + "grad_norm": 0.2333984375, + "learning_rate": 3.71565558735576e-08, + "loss": 2.7737, + "num_input_tokens_seen": 10632560640, + "step": 20280 + }, + { + "epoch": 0.9837476736925698, + "grad_norm": 0.240234375, + "learning_rate": 3.6072012532539776e-08, + "loss": 2.7817, + "num_input_tokens_seen": 10635182080, + "step": 20285 + }, + { + "epoch": 0.9839901552488164, + "grad_norm": 0.2431640625, + "learning_rate": 3.500352115339001e-08, + "loss": 2.7746, + "num_input_tokens_seen": 10637803520, + "step": 20290 + }, + { + "epoch": 0.984232636805063, + "grad_norm": 0.2392578125, + "learning_rate": 3.39510824231587e-08, + "loss": 2.763, + "num_input_tokens_seen": 10640424960, + "step": 20295 + }, + { + "epoch": 0.9844751183613096, + "grad_norm": 0.24609375, + "learning_rate": 3.2914697018565645e-08, + "loss": 2.7655, + "num_input_tokens_seen": 10643046400, + "step": 20300 + }, + { + "epoch": 0.9847175999175563, + "grad_norm": 0.240234375, + "learning_rate": 3.189436560601944e-08, + "loss": 2.7836, + "num_input_tokens_seen": 10645667840, + "step": 20305 + }, + { + "epoch": 0.9849600814738029, + "grad_norm": 0.2373046875, + "learning_rate": 3.0890088841595276e-08, + "loss": 2.7791, + "num_input_tokens_seen": 10648289280, + "step": 20310 + }, + { + "epoch": 0.9852025630300495, + "grad_norm": 0.2431640625, + "learning_rate": 2.990186737104883e-08, + "loss": 2.7727, + "num_input_tokens_seen": 10650910720, + "step": 20315 + }, + { + "epoch": 0.9854450445862961, + "grad_norm": 0.244140625, + "learning_rate": 2.8929701829816247e-08, + "loss": 2.779, + "num_input_tokens_seen": 10653532160, + "step": 20320 + }, + { + "epoch": 0.9856875261425427, + "grad_norm": 0.248046875, + "learning_rate": 2.7973592843003048e-08, + "loss": 2.7835, + "num_input_tokens_seen": 10656153600, + "step": 20325 + }, + { + "epoch": 0.9859300076987894, + "grad_norm": 0.2431640625, + "learning_rate": 2.7033541025395237e-08, + "loss": 2.7604, + "num_input_tokens_seen": 10658775040, + "step": 20330 + }, + { + "epoch": 0.9861724892550361, + "grad_norm": 0.2353515625, + "learning_rate": 2.610954698145096e-08, + "loss": 2.7776, + "num_input_tokens_seen": 10661396480, + "step": 20335 + }, + { + "epoch": 0.9864149708112827, + "grad_norm": 0.24609375, + "learning_rate": 2.5201611305303296e-08, + "loss": 2.7612, + "num_input_tokens_seen": 10664017920, + "step": 20340 + }, + { + "epoch": 0.9866574523675293, + "grad_norm": 0.2392578125, + "learning_rate": 2.4309734580760248e-08, + "loss": 2.7758, + "num_input_tokens_seen": 10666639360, + "step": 20345 + }, + { + "epoch": 0.986899933923776, + "grad_norm": 0.2392578125, + "learning_rate": 2.343391738130751e-08, + "loss": 2.7584, + "num_input_tokens_seen": 10669260800, + "step": 20350 + }, + { + "epoch": 0.9871424154800226, + "grad_norm": 0.2451171875, + "learning_rate": 2.2574160270100174e-08, + "loss": 2.7869, + "num_input_tokens_seen": 10671882240, + "step": 20355 + }, + { + "epoch": 0.9873848970362692, + "grad_norm": 0.2421875, + "learning_rate": 2.1730463799965463e-08, + "loss": 2.7762, + "num_input_tokens_seen": 10674503680, + "step": 20360 + }, + { + "epoch": 0.9876273785925158, + "grad_norm": 0.2392578125, + "learning_rate": 2.090282851340275e-08, + "loss": 2.7877, + "num_input_tokens_seen": 10677125120, + "step": 20365 + }, + { + "epoch": 0.9878698601487624, + "grad_norm": 0.2470703125, + "learning_rate": 2.0091254942594674e-08, + "loss": 2.7722, + "num_input_tokens_seen": 10679746560, + "step": 20370 + }, + { + "epoch": 0.9881123417050091, + "grad_norm": 0.2353515625, + "learning_rate": 1.9295743609382134e-08, + "loss": 2.7713, + "num_input_tokens_seen": 10682368000, + "step": 20375 + }, + { + "epoch": 0.9883548232612557, + "grad_norm": 0.2333984375, + "learning_rate": 1.8516295025283735e-08, + "loss": 2.7799, + "num_input_tokens_seen": 10684989440, + "step": 20380 + }, + { + "epoch": 0.9885973048175023, + "grad_norm": 0.2412109375, + "learning_rate": 1.7752909691493013e-08, + "loss": 2.7776, + "num_input_tokens_seen": 10687610880, + "step": 20385 + }, + { + "epoch": 0.9888397863737489, + "grad_norm": 0.24609375, + "learning_rate": 1.7005588098870094e-08, + "loss": 2.7842, + "num_input_tokens_seen": 10690232320, + "step": 20390 + }, + { + "epoch": 0.9890822679299955, + "grad_norm": 0.240234375, + "learning_rate": 1.6274330727947262e-08, + "loss": 2.7785, + "num_input_tokens_seen": 10692853760, + "step": 20395 + }, + { + "epoch": 0.9893247494862422, + "grad_norm": 0.2353515625, + "learning_rate": 1.5559138048928946e-08, + "loss": 2.7724, + "num_input_tokens_seen": 10695475200, + "step": 20400 + }, + { + "epoch": 0.9893247494862422, + "eval_accuracy": 0.4559290017912392, + "eval_loss": 2.741572380065918, + "eval_runtime": 5.8791, + "eval_samples_per_second": 51.028, + "eval_steps_per_second": 6.464, + "num_input_tokens_seen": 10695475200, + "step": 20400 + }, + { + "epoch": 0.9895672310424888, + "grad_norm": 0.24609375, + "learning_rate": 1.4860010521683399e-08, + "loss": 2.7642, + "num_input_tokens_seen": 10698096640, + "step": 20405 + }, + { + "epoch": 0.9898097125987355, + "grad_norm": 0.2451171875, + "learning_rate": 1.4176948595762129e-08, + "loss": 2.7712, + "num_input_tokens_seen": 10700718080, + "step": 20410 + }, + { + "epoch": 0.9900521941549821, + "grad_norm": 0.2470703125, + "learning_rate": 1.3509952710372143e-08, + "loss": 2.7852, + "num_input_tokens_seen": 10703339520, + "step": 20415 + }, + { + "epoch": 0.9902946757112288, + "grad_norm": 0.25390625, + "learning_rate": 1.2859023294395367e-08, + "loss": 2.7632, + "num_input_tokens_seen": 10705960960, + "step": 20420 + }, + { + "epoch": 0.9905371572674754, + "grad_norm": 0.23828125, + "learning_rate": 1.222416076638866e-08, + "loss": 2.7766, + "num_input_tokens_seen": 10708582400, + "step": 20425 + }, + { + "epoch": 0.990779638823722, + "grad_norm": 0.240234375, + "learning_rate": 1.1605365534569924e-08, + "loss": 2.7627, + "num_input_tokens_seen": 10711203840, + "step": 20430 + }, + { + "epoch": 0.9910221203799686, + "grad_norm": 0.244140625, + "learning_rate": 1.1002637996826437e-08, + "loss": 2.774, + "num_input_tokens_seen": 10713825280, + "step": 20435 + }, + { + "epoch": 0.9912646019362152, + "grad_norm": 0.2392578125, + "learning_rate": 1.041597854071763e-08, + "loss": 2.7797, + "num_input_tokens_seen": 10716446720, + "step": 20440 + }, + { + "epoch": 0.9915070834924619, + "grad_norm": 0.2490234375, + "learning_rate": 9.845387543469532e-09, + "loss": 2.7749, + "num_input_tokens_seen": 10719068160, + "step": 20445 + }, + { + "epoch": 0.9917495650487085, + "grad_norm": 0.234375, + "learning_rate": 9.290865371977542e-09, + "loss": 2.7768, + "num_input_tokens_seen": 10721689600, + "step": 20450 + }, + { + "epoch": 0.9919920466049551, + "grad_norm": 0.2392578125, + "learning_rate": 8.752412382798114e-09, + "loss": 2.7748, + "num_input_tokens_seen": 10724311040, + "step": 20455 + }, + { + "epoch": 0.9922345281612017, + "grad_norm": 0.2412109375, + "learning_rate": 8.230028922162625e-09, + "loss": 2.7646, + "num_input_tokens_seen": 10726932480, + "step": 20460 + }, + { + "epoch": 0.9924770097174483, + "grad_norm": 0.240234375, + "learning_rate": 7.723715325966274e-09, + "loss": 2.7714, + "num_input_tokens_seen": 10729553920, + "step": 20465 + }, + { + "epoch": 0.992719491273695, + "grad_norm": 0.2392578125, + "learning_rate": 7.233471919773638e-09, + "loss": 2.7831, + "num_input_tokens_seen": 10732175360, + "step": 20470 + }, + { + "epoch": 0.9929619728299416, + "grad_norm": 0.244140625, + "learning_rate": 6.759299018813114e-09, + "loss": 2.7739, + "num_input_tokens_seen": 10734796800, + "step": 20475 + }, + { + "epoch": 0.9932044543861882, + "grad_norm": 0.24609375, + "learning_rate": 6.301196927979702e-09, + "loss": 2.7904, + "num_input_tokens_seen": 10737418240, + "step": 20480 + }, + { + "epoch": 0.9934469359424348, + "grad_norm": 0.244140625, + "learning_rate": 5.859165941837774e-09, + "loss": 2.7693, + "num_input_tokens_seen": 10740039680, + "step": 20485 + }, + { + "epoch": 0.9936894174986816, + "grad_norm": 0.2392578125, + "learning_rate": 5.433206344615527e-09, + "loss": 2.7836, + "num_input_tokens_seen": 10742661120, + "step": 20490 + }, + { + "epoch": 0.9939318990549282, + "grad_norm": 0.2431640625, + "learning_rate": 5.0233184102049805e-09, + "loss": 2.7773, + "num_input_tokens_seen": 10745282560, + "step": 20495 + }, + { + "epoch": 0.9941743806111748, + "grad_norm": 0.25, + "learning_rate": 4.6295024021703045e-09, + "loss": 2.7832, + "num_input_tokens_seen": 10747904000, + "step": 20500 + }, + { + "epoch": 0.9944168621674214, + "grad_norm": 0.2421875, + "learning_rate": 4.251758573736719e-09, + "loss": 2.7662, + "num_input_tokens_seen": 10750525440, + "step": 20505 + }, + { + "epoch": 0.994659343723668, + "grad_norm": 0.2421875, + "learning_rate": 3.8900871677960415e-09, + "loss": 2.781, + "num_input_tokens_seen": 10753146880, + "step": 20510 + }, + { + "epoch": 0.9949018252799147, + "grad_norm": 0.248046875, + "learning_rate": 3.5444884169039126e-09, + "loss": 2.7735, + "num_input_tokens_seen": 10755768320, + "step": 20515 + }, + { + "epoch": 0.9951443068361613, + "grad_norm": 0.240234375, + "learning_rate": 3.2149625432825736e-09, + "loss": 2.7776, + "num_input_tokens_seen": 10758389760, + "step": 20520 + }, + { + "epoch": 0.9953867883924079, + "grad_norm": 0.25, + "learning_rate": 2.901509758820864e-09, + "loss": 2.7735, + "num_input_tokens_seen": 10761011200, + "step": 20525 + }, + { + "epoch": 0.9956292699486545, + "grad_norm": 0.2392578125, + "learning_rate": 2.6041302650686717e-09, + "loss": 2.7768, + "num_input_tokens_seen": 10763632640, + "step": 20530 + }, + { + "epoch": 0.9958717515049011, + "grad_norm": 0.2451171875, + "learning_rate": 2.3228242532452592e-09, + "loss": 2.7728, + "num_input_tokens_seen": 10766254080, + "step": 20535 + }, + { + "epoch": 0.9961142330611478, + "grad_norm": 0.25, + "learning_rate": 2.0575919042309378e-09, + "loss": 2.7801, + "num_input_tokens_seen": 10768875520, + "step": 20540 + }, + { + "epoch": 0.9963567146173944, + "grad_norm": 0.2412109375, + "learning_rate": 1.8084333885698413e-09, + "loss": 2.7743, + "num_input_tokens_seen": 10771496960, + "step": 20545 + }, + { + "epoch": 0.996599196173641, + "grad_norm": 0.2373046875, + "learning_rate": 1.575348866475479e-09, + "loss": 2.7709, + "num_input_tokens_seen": 10774118400, + "step": 20550 + }, + { + "epoch": 0.9968416777298876, + "grad_norm": 0.236328125, + "learning_rate": 1.3583384878224081e-09, + "loss": 2.7763, + "num_input_tokens_seen": 10776739840, + "step": 20555 + }, + { + "epoch": 0.9970841592861343, + "grad_norm": 0.23828125, + "learning_rate": 1.1574023921490095e-09, + "loss": 2.7781, + "num_input_tokens_seen": 10779361280, + "step": 20560 + }, + { + "epoch": 0.9973266408423809, + "grad_norm": 0.24609375, + "learning_rate": 9.725407086574878e-10, + "loss": 2.7687, + "num_input_tokens_seen": 10781982720, + "step": 20565 + }, + { + "epoch": 0.9975691223986276, + "grad_norm": 0.2421875, + "learning_rate": 8.037535562166465e-10, + "loss": 2.7821, + "num_input_tokens_seen": 10784604160, + "step": 20570 + }, + { + "epoch": 0.9978116039548742, + "grad_norm": 0.24609375, + "learning_rate": 6.510410433563374e-10, + "loss": 2.786, + "num_input_tokens_seen": 10787225600, + "step": 20575 + }, + { + "epoch": 0.9980540855111208, + "grad_norm": 0.2490234375, + "learning_rate": 5.144032682730116e-10, + "loss": 2.7812, + "num_input_tokens_seen": 10789847040, + "step": 20580 + }, + { + "epoch": 0.9982965670673675, + "grad_norm": 0.2421875, + "learning_rate": 3.9384031882416797e-10, + "loss": 2.786, + "num_input_tokens_seen": 10792468480, + "step": 20585 + }, + { + "epoch": 0.9985390486236141, + "grad_norm": 0.240234375, + "learning_rate": 2.8935227253112927e-10, + "loss": 2.7736, + "num_input_tokens_seen": 10795089920, + "step": 20590 + }, + { + "epoch": 0.9987815301798607, + "grad_norm": 0.2392578125, + "learning_rate": 2.0093919658459303e-10, + "loss": 2.7753, + "num_input_tokens_seen": 10797711360, + "step": 20595 + }, + { + "epoch": 0.9990240117361073, + "grad_norm": 0.2392578125, + "learning_rate": 1.2860114783352917e-10, + "loss": 2.765, + "num_input_tokens_seen": 10800332800, + "step": 20600 + }, + { + "epoch": 0.999266493292354, + "grad_norm": 0.2373046875, + "learning_rate": 7.233817278795574e-11, + "loss": 2.7774, + "num_input_tokens_seen": 10802954240, + "step": 20605 + }, + { + "epoch": 0.9995089748486006, + "grad_norm": 0.2392578125, + "learning_rate": 3.215030763004112e-11, + "loss": 2.7738, + "num_input_tokens_seen": 10805575680, + "step": 20610 + }, + { + "epoch": 0.9997514564048472, + "grad_norm": 0.2431640625, + "learning_rate": 8.037578200226214e-12, + "loss": 2.7771, + "num_input_tokens_seen": 10808197120, + "step": 20615 + }, + { + "epoch": 0.9999939379610938, + "grad_norm": 0.2353515625, + "learning_rate": 0.0, + "loss": 2.7621, + "num_input_tokens_seen": 10810818560, + "step": 20620 + }, + { + "epoch": 0.9999939379610938, + "num_input_tokens_seen": 10810818560, + "step": 20620, + "total_flos": 1.199767969182253e+19, + "train_loss": 2.7848671950609445, + "train_runtime": 243040.2706, + "train_samples_per_second": 21.72, + "train_steps_per_second": 0.085 + } + ], + "logging_steps": 5, + "max_steps": 20620, + "num_input_tokens_seen": 10810818560, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.199767969182253e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}