diff --git "a/Llama-3.3-70B-Instruct/checkpoint-2500/trainer_state.json" "b/Llama-3.3-70B-Instruct/checkpoint-2500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/Llama-3.3-70B-Instruct/checkpoint-2500/trainer_state.json" @@ -0,0 +1,25034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.7942025661468506, + "epoch": 0.00040004000400040005, + "grad_norm": 0.47672003507614136, + "learning_rate": 0.0, + "loss": 2.2188, + "mean_token_accuracy": 0.5192891135811806, + "num_tokens": 8850.0, + "step": 1 + }, + { + "entropy": 1.739880234003067, + "epoch": 0.0008000800080008001, + "grad_norm": 0.4743156433105469, + "learning_rate": 2.666666666666667e-06, + "loss": 2.1894, + "mean_token_accuracy": 0.5170402973890305, + "num_tokens": 18057.0, + "step": 2 + }, + { + "entropy": 1.7690136432647705, + "epoch": 0.0012001200120012002, + "grad_norm": 0.5005162358283997, + "learning_rate": 5.333333333333334e-06, + "loss": 2.2131, + "mean_token_accuracy": 0.5172632932662964, + "num_tokens": 26915.0, + "step": 3 + }, + { + "entropy": 1.866851270198822, + "epoch": 0.0016001600160016002, + "grad_norm": 0.438918799161911, + "learning_rate": 8.000000000000001e-06, + "loss": 2.2875, + "mean_token_accuracy": 0.5107089728116989, + "num_tokens": 35231.0, + "step": 4 + }, + { + "entropy": 1.8996970057487488, + "epoch": 0.002000200020002, + "grad_norm": 0.4285155236721039, + "learning_rate": 1.0666666666666667e-05, + "loss": 2.2935, + "mean_token_accuracy": 0.5128495469689369, + "num_tokens": 43540.0, + "step": 5 + }, + { + "entropy": 1.797807365655899, + "epoch": 0.0024002400240024004, + "grad_norm": 0.4465991258621216, + "learning_rate": 1.3333333333333333e-05, + "loss": 2.1917, + "mean_token_accuracy": 0.5254444032907486, + "num_tokens": 52236.0, + "step": 6 + }, + { + "entropy": 1.8983636498451233, + "epoch": 0.0028002800280028, + "grad_norm": 0.4536067545413971, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.2677, + "mean_token_accuracy": 0.5144101679325104, + "num_tokens": 60443.0, + "step": 7 + }, + { + "entropy": 1.8427878618240356, + "epoch": 0.0032003200320032004, + "grad_norm": 0.5053722858428955, + "learning_rate": 1.866666666666667e-05, + "loss": 2.2356, + "mean_token_accuracy": 0.5142018273472786, + "num_tokens": 69155.0, + "step": 8 + }, + { + "entropy": 1.8648996651172638, + "epoch": 0.0036003600360036, + "grad_norm": 0.5287893414497375, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.2435, + "mean_token_accuracy": 0.5086963996291161, + "num_tokens": 77156.0, + "step": 9 + }, + { + "entropy": 1.886999636888504, + "epoch": 0.004000400040004, + "grad_norm": 0.43816184997558594, + "learning_rate": 2.4e-05, + "loss": 2.1821, + "mean_token_accuracy": 0.5133799910545349, + "num_tokens": 85650.0, + "step": 10 + }, + { + "entropy": 2.0165862143039703, + "epoch": 0.0044004400440044, + "grad_norm": 0.3899831175804138, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.1903, + "mean_token_accuracy": 0.5218925848603249, + "num_tokens": 93953.0, + "step": 11 + }, + { + "entropy": 2.033858895301819, + "epoch": 0.004800480048004801, + "grad_norm": 0.43466004729270935, + "learning_rate": 2.9333333333333336e-05, + "loss": 2.0937, + "mean_token_accuracy": 0.5258676409721375, + "num_tokens": 102592.0, + "step": 12 + }, + { + "entropy": 2.2364404797554016, + "epoch": 0.005200520052005201, + "grad_norm": 0.39024344086647034, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1801, + "mean_token_accuracy": 0.5228476375341415, + "num_tokens": 110784.0, + "step": 13 + }, + { + "entropy": 2.1504173278808594, + "epoch": 0.0056005600560056, + "grad_norm": 0.389006644487381, + "learning_rate": 3.466666666666667e-05, + "loss": 2.0215, + "mean_token_accuracy": 0.5430122464895248, + "num_tokens": 120082.0, + "step": 14 + }, + { + "entropy": 2.2962915897369385, + "epoch": 0.006000600060006, + "grad_norm": 0.4784089922904968, + "learning_rate": 3.733333333333334e-05, + "loss": 2.061, + "mean_token_accuracy": 0.531621664762497, + "num_tokens": 128363.0, + "step": 15 + }, + { + "entropy": 2.342404544353485, + "epoch": 0.006400640064006401, + "grad_norm": 0.5089271068572998, + "learning_rate": 4e-05, + "loss": 2.07, + "mean_token_accuracy": 0.5325157046318054, + "num_tokens": 136997.0, + "step": 16 + }, + { + "entropy": 2.283275544643402, + "epoch": 0.006800680068006801, + "grad_norm": 0.5488889813423157, + "learning_rate": 4.266666666666667e-05, + "loss": 2.0056, + "mean_token_accuracy": 0.5334787666797638, + "num_tokens": 145030.0, + "step": 17 + }, + { + "entropy": 2.050345718860626, + "epoch": 0.0072007200720072, + "grad_norm": 0.5031075477600098, + "learning_rate": 4.5333333333333335e-05, + "loss": 1.9162, + "mean_token_accuracy": 0.5427921563386917, + "num_tokens": 153623.0, + "step": 18 + }, + { + "entropy": 1.9828232526779175, + "epoch": 0.007600760076007601, + "grad_norm": 0.5337665677070618, + "learning_rate": 4.8e-05, + "loss": 1.9185, + "mean_token_accuracy": 0.5508822798728943, + "num_tokens": 161947.0, + "step": 19 + }, + { + "entropy": 1.8197293877601624, + "epoch": 0.008000800080008, + "grad_norm": 0.4948204755783081, + "learning_rate": 5.0666666666666674e-05, + "loss": 1.857, + "mean_token_accuracy": 0.552571251988411, + "num_tokens": 170516.0, + "step": 20 + }, + { + "entropy": 1.789840191602707, + "epoch": 0.0084008400840084, + "grad_norm": 0.4926859438419342, + "learning_rate": 5.333333333333333e-05, + "loss": 1.886, + "mean_token_accuracy": 0.5518065690994263, + "num_tokens": 178469.0, + "step": 21 + }, + { + "entropy": 1.6451906859874725, + "epoch": 0.0088008800880088, + "grad_norm": 0.4017632007598877, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.7526, + "mean_token_accuracy": 0.5742013603448868, + "num_tokens": 186348.0, + "step": 22 + }, + { + "entropy": 1.6792134046554565, + "epoch": 0.0092009200920092, + "grad_norm": 0.6260354518890381, + "learning_rate": 5.866666666666667e-05, + "loss": 1.8468, + "mean_token_accuracy": 0.5656454414129257, + "num_tokens": 195071.0, + "step": 23 + }, + { + "entropy": 1.647391676902771, + "epoch": 0.009600960096009602, + "grad_norm": 0.46580520272254944, + "learning_rate": 6.133333333333334e-05, + "loss": 1.7595, + "mean_token_accuracy": 0.567480742931366, + "num_tokens": 202951.0, + "step": 24 + }, + { + "entropy": 1.6090652346611023, + "epoch": 0.010001000100010001, + "grad_norm": 0.4587379992008209, + "learning_rate": 6.400000000000001e-05, + "loss": 1.6638, + "mean_token_accuracy": 0.5937570631504059, + "num_tokens": 211268.0, + "step": 25 + }, + { + "entropy": 1.6326420307159424, + "epoch": 0.010401040104010401, + "grad_norm": 0.44421494007110596, + "learning_rate": 6.666666666666667e-05, + "loss": 1.6439, + "mean_token_accuracy": 0.5923638790845871, + "num_tokens": 219692.0, + "step": 26 + }, + { + "entropy": 1.7234179377555847, + "epoch": 0.010801080108010801, + "grad_norm": 0.4389747381210327, + "learning_rate": 6.933333333333334e-05, + "loss": 1.7108, + "mean_token_accuracy": 0.5803089290857315, + "num_tokens": 228047.0, + "step": 27 + }, + { + "entropy": 1.6885777115821838, + "epoch": 0.0112011201120112, + "grad_norm": 0.4335879981517792, + "learning_rate": 7.2e-05, + "loss": 1.6299, + "mean_token_accuracy": 0.586303323507309, + "num_tokens": 236376.0, + "step": 28 + }, + { + "entropy": 1.6646342873573303, + "epoch": 0.0116011601160116, + "grad_norm": 0.38126322627067566, + "learning_rate": 7.466666666666667e-05, + "loss": 1.6067, + "mean_token_accuracy": 0.5964086949825287, + "num_tokens": 245092.0, + "step": 29 + }, + { + "entropy": 1.6213374137878418, + "epoch": 0.012001200120012, + "grad_norm": 0.39270561933517456, + "learning_rate": 7.733333333333333e-05, + "loss": 1.5822, + "mean_token_accuracy": 0.6026028245687485, + "num_tokens": 253673.0, + "step": 30 + }, + { + "entropy": 1.5640352368354797, + "epoch": 0.012401240124012402, + "grad_norm": 0.3869155943393707, + "learning_rate": 8e-05, + "loss": 1.5011, + "mean_token_accuracy": 0.6241087764501572, + "num_tokens": 262625.0, + "step": 31 + }, + { + "entropy": 1.520020067691803, + "epoch": 0.012801280128012802, + "grad_norm": 0.3769737184047699, + "learning_rate": 8.266666666666667e-05, + "loss": 1.5088, + "mean_token_accuracy": 0.6204348653554916, + "num_tokens": 271309.0, + "step": 32 + }, + { + "entropy": 1.5669251084327698, + "epoch": 0.013201320132013201, + "grad_norm": 0.4119971692562103, + "learning_rate": 8.533333333333334e-05, + "loss": 1.598, + "mean_token_accuracy": 0.6009179204702377, + "num_tokens": 279702.0, + "step": 33 + }, + { + "entropy": 1.4570423662662506, + "epoch": 0.013601360136013601, + "grad_norm": 0.39608579874038696, + "learning_rate": 8.800000000000001e-05, + "loss": 1.4757, + "mean_token_accuracy": 0.6308933645486832, + "num_tokens": 288493.0, + "step": 34 + }, + { + "entropy": 1.4845676720142365, + "epoch": 0.014001400140014001, + "grad_norm": 0.37827152013778687, + "learning_rate": 9.066666666666667e-05, + "loss": 1.5051, + "mean_token_accuracy": 0.6212253570556641, + "num_tokens": 296999.0, + "step": 35 + }, + { + "entropy": 1.5079152584075928, + "epoch": 0.0144014401440144, + "grad_norm": 0.39496058225631714, + "learning_rate": 9.333333333333334e-05, + "loss": 1.5177, + "mean_token_accuracy": 0.6146594285964966, + "num_tokens": 305146.0, + "step": 36 + }, + { + "entropy": 1.4583857357501984, + "epoch": 0.014801480148014802, + "grad_norm": 0.41785281896591187, + "learning_rate": 9.6e-05, + "loss": 1.4723, + "mean_token_accuracy": 0.6168077737092972, + "num_tokens": 313647.0, + "step": 37 + }, + { + "entropy": 1.3630880415439606, + "epoch": 0.015201520152015202, + "grad_norm": 0.3789471983909607, + "learning_rate": 9.866666666666668e-05, + "loss": 1.3449, + "mean_token_accuracy": 0.6459334343671799, + "num_tokens": 322633.0, + "step": 38 + }, + { + "entropy": 1.4223653674125671, + "epoch": 0.015601560156015602, + "grad_norm": 0.4337131381034851, + "learning_rate": 0.00010133333333333335, + "loss": 1.4755, + "mean_token_accuracy": 0.6144974380731583, + "num_tokens": 331687.0, + "step": 39 + }, + { + "entropy": 1.3911584913730621, + "epoch": 0.016001600160016, + "grad_norm": 0.41617903113365173, + "learning_rate": 0.00010400000000000001, + "loss": 1.3826, + "mean_token_accuracy": 0.6441078633069992, + "num_tokens": 339868.0, + "step": 40 + }, + { + "entropy": 1.4160181879997253, + "epoch": 0.016401640164016403, + "grad_norm": 0.43531423807144165, + "learning_rate": 0.00010666666666666667, + "loss": 1.4294, + "mean_token_accuracy": 0.6320265531539917, + "num_tokens": 348029.0, + "step": 41 + }, + { + "entropy": 1.482937514781952, + "epoch": 0.0168016801680168, + "grad_norm": 0.4324755072593689, + "learning_rate": 0.00010933333333333333, + "loss": 1.5147, + "mean_token_accuracy": 0.6166313588619232, + "num_tokens": 356240.0, + "step": 42 + }, + { + "entropy": 1.4201266169548035, + "epoch": 0.017201720172017203, + "grad_norm": 0.3948879837989807, + "learning_rate": 0.00011200000000000001, + "loss": 1.3994, + "mean_token_accuracy": 0.6290998160839081, + "num_tokens": 364425.0, + "step": 43 + }, + { + "entropy": 1.357359528541565, + "epoch": 0.0176017601760176, + "grad_norm": 0.41655364632606506, + "learning_rate": 0.00011466666666666667, + "loss": 1.2924, + "mean_token_accuracy": 0.6492937654256821, + "num_tokens": 373138.0, + "step": 44 + }, + { + "entropy": 1.391854703426361, + "epoch": 0.018001800180018002, + "grad_norm": 0.417074590921402, + "learning_rate": 0.00011733333333333334, + "loss": 1.3507, + "mean_token_accuracy": 0.6494302302598953, + "num_tokens": 382100.0, + "step": 45 + }, + { + "entropy": 1.4749327600002289, + "epoch": 0.0184018401840184, + "grad_norm": 0.41923800110816956, + "learning_rate": 0.00012, + "loss": 1.5085, + "mean_token_accuracy": 0.612814411520958, + "num_tokens": 390052.0, + "step": 46 + }, + { + "entropy": 1.4137325286865234, + "epoch": 0.018801880188018802, + "grad_norm": 0.3833743929862976, + "learning_rate": 0.00012266666666666668, + "loss": 1.3916, + "mean_token_accuracy": 0.6410449594259262, + "num_tokens": 398110.0, + "step": 47 + }, + { + "entropy": 1.3919320702552795, + "epoch": 0.019201920192019203, + "grad_norm": 0.37842363119125366, + "learning_rate": 0.00012533333333333334, + "loss": 1.4084, + "mean_token_accuracy": 0.6312015205621719, + "num_tokens": 406666.0, + "step": 48 + }, + { + "entropy": 1.3608618378639221, + "epoch": 0.0196019601960196, + "grad_norm": 0.4568133056163788, + "learning_rate": 0.00012800000000000002, + "loss": 1.368, + "mean_token_accuracy": 0.6458054482936859, + "num_tokens": 415283.0, + "step": 49 + }, + { + "entropy": 1.3759468793869019, + "epoch": 0.020002000200020003, + "grad_norm": 0.3905130922794342, + "learning_rate": 0.00013066666666666668, + "loss": 1.3781, + "mean_token_accuracy": 0.6408856809139252, + "num_tokens": 423867.0, + "step": 50 + }, + { + "entropy": 1.3894509375095367, + "epoch": 0.0204020402040204, + "grad_norm": 0.39885976910591125, + "learning_rate": 0.00013333333333333334, + "loss": 1.3832, + "mean_token_accuracy": 0.6394526213407516, + "num_tokens": 432299.0, + "step": 51 + }, + { + "entropy": 1.3620089888572693, + "epoch": 0.020802080208020803, + "grad_norm": 0.44015854597091675, + "learning_rate": 0.00013600000000000003, + "loss": 1.3381, + "mean_token_accuracy": 0.6432337760925293, + "num_tokens": 440734.0, + "step": 52 + }, + { + "entropy": 1.3622656762599945, + "epoch": 0.0212021202120212, + "grad_norm": 0.49739453196525574, + "learning_rate": 0.00013866666666666669, + "loss": 1.3649, + "mean_token_accuracy": 0.6373352855443954, + "num_tokens": 448710.0, + "step": 53 + }, + { + "entropy": 1.2986978590488434, + "epoch": 0.021602160216021602, + "grad_norm": 0.37318113446235657, + "learning_rate": 0.00014133333333333334, + "loss": 1.3366, + "mean_token_accuracy": 0.6431873738765717, + "num_tokens": 457247.0, + "step": 54 + }, + { + "entropy": 1.2725946605205536, + "epoch": 0.022002200220022004, + "grad_norm": 0.4199654757976532, + "learning_rate": 0.000144, + "loss": 1.3302, + "mean_token_accuracy": 0.6447762101888657, + "num_tokens": 465701.0, + "step": 55 + }, + { + "entropy": 1.2967428863048553, + "epoch": 0.0224022402240224, + "grad_norm": 0.40956538915634155, + "learning_rate": 0.00014666666666666666, + "loss": 1.3352, + "mean_token_accuracy": 0.6408500224351883, + "num_tokens": 474476.0, + "step": 56 + }, + { + "entropy": 1.3544551134109497, + "epoch": 0.022802280228022803, + "grad_norm": 0.39519739151000977, + "learning_rate": 0.00014933333333333335, + "loss": 1.3406, + "mean_token_accuracy": 0.6500163674354553, + "num_tokens": 482570.0, + "step": 57 + }, + { + "entropy": 1.3824973404407501, + "epoch": 0.0232023202320232, + "grad_norm": 0.3799802362918854, + "learning_rate": 0.000152, + "loss": 1.3278, + "mean_token_accuracy": 0.6473122090101242, + "num_tokens": 491111.0, + "step": 58 + }, + { + "entropy": 1.3626296520233154, + "epoch": 0.023602360236023603, + "grad_norm": 0.3700718879699707, + "learning_rate": 0.00015466666666666667, + "loss": 1.3304, + "mean_token_accuracy": 0.645874097943306, + "num_tokens": 500032.0, + "step": 59 + }, + { + "entropy": 1.3258526921272278, + "epoch": 0.024002400240024, + "grad_norm": 0.366222620010376, + "learning_rate": 0.00015733333333333333, + "loss": 1.3073, + "mean_token_accuracy": 0.6523128002882004, + "num_tokens": 508045.0, + "step": 60 + }, + { + "entropy": 1.2787662744522095, + "epoch": 0.024402440244024402, + "grad_norm": 0.37774235010147095, + "learning_rate": 0.00016, + "loss": 1.2839, + "mean_token_accuracy": 0.657956600189209, + "num_tokens": 516334.0, + "step": 61 + }, + { + "entropy": 1.2824394404888153, + "epoch": 0.024802480248024804, + "grad_norm": 0.3594248294830322, + "learning_rate": 0.00016266666666666667, + "loss": 1.3335, + "mean_token_accuracy": 0.6513591110706329, + "num_tokens": 524762.0, + "step": 62 + }, + { + "entropy": 1.2761549651622772, + "epoch": 0.025202520252025202, + "grad_norm": 0.38247525691986084, + "learning_rate": 0.00016533333333333333, + "loss": 1.322, + "mean_token_accuracy": 0.6528888940811157, + "num_tokens": 533302.0, + "step": 63 + }, + { + "entropy": 1.285708099603653, + "epoch": 0.025602560256025603, + "grad_norm": 0.4210297167301178, + "learning_rate": 0.000168, + "loss": 1.2522, + "mean_token_accuracy": 0.6581785976886749, + "num_tokens": 542110.0, + "step": 64 + }, + { + "entropy": 1.3535743653774261, + "epoch": 0.026002600260026, + "grad_norm": 0.3659783601760864, + "learning_rate": 0.00017066666666666668, + "loss": 1.3343, + "mean_token_accuracy": 0.6510991156101227, + "num_tokens": 550717.0, + "step": 65 + }, + { + "entropy": 1.3446696996688843, + "epoch": 0.026402640264026403, + "grad_norm": 0.35590988397598267, + "learning_rate": 0.00017333333333333334, + "loss": 1.3224, + "mean_token_accuracy": 0.6442483812570572, + "num_tokens": 559025.0, + "step": 66 + }, + { + "entropy": 1.3695125877857208, + "epoch": 0.0268026802680268, + "grad_norm": 0.3491916358470917, + "learning_rate": 0.00017600000000000002, + "loss": 1.3288, + "mean_token_accuracy": 0.6431872397661209, + "num_tokens": 567724.0, + "step": 67 + }, + { + "entropy": 1.3363787531852722, + "epoch": 0.027202720272027203, + "grad_norm": 0.3625618517398834, + "learning_rate": 0.00017866666666666668, + "loss": 1.2804, + "mean_token_accuracy": 0.6557945609092712, + "num_tokens": 576144.0, + "step": 68 + }, + { + "entropy": 1.3033888339996338, + "epoch": 0.027602760276027604, + "grad_norm": 0.35051390528678894, + "learning_rate": 0.00018133333333333334, + "loss": 1.2841, + "mean_token_accuracy": 0.6544656604528427, + "num_tokens": 584831.0, + "step": 69 + }, + { + "entropy": 1.3235229551792145, + "epoch": 0.028002800280028002, + "grad_norm": 0.3980117738246918, + "learning_rate": 0.00018400000000000003, + "loss": 1.3492, + "mean_token_accuracy": 0.6482396423816681, + "num_tokens": 593412.0, + "step": 70 + }, + { + "entropy": 1.2970213294029236, + "epoch": 0.028402840284028404, + "grad_norm": 0.3519047796726227, + "learning_rate": 0.0001866666666666667, + "loss": 1.3083, + "mean_token_accuracy": 0.6536522507667542, + "num_tokens": 601675.0, + "step": 71 + }, + { + "entropy": 1.2363843321800232, + "epoch": 0.0288028802880288, + "grad_norm": 0.356121689081192, + "learning_rate": 0.00018933333333333335, + "loss": 1.2331, + "mean_token_accuracy": 0.6689527034759521, + "num_tokens": 610155.0, + "step": 72 + }, + { + "entropy": 1.2743788659572601, + "epoch": 0.029202920292029203, + "grad_norm": 0.352166086435318, + "learning_rate": 0.000192, + "loss": 1.2953, + "mean_token_accuracy": 0.6543757170438766, + "num_tokens": 619084.0, + "step": 73 + }, + { + "entropy": 1.251781314611435, + "epoch": 0.029602960296029605, + "grad_norm": 0.3690275251865387, + "learning_rate": 0.0001946666666666667, + "loss": 1.249, + "mean_token_accuracy": 0.6584222465753555, + "num_tokens": 627717.0, + "step": 74 + }, + { + "entropy": 1.3367043435573578, + "epoch": 0.030003000300030003, + "grad_norm": 0.3400121331214905, + "learning_rate": 0.00019733333333333335, + "loss": 1.2895, + "mean_token_accuracy": 0.6532490998506546, + "num_tokens": 637070.0, + "step": 75 + }, + { + "entropy": 1.2800488770008087, + "epoch": 0.030403040304030404, + "grad_norm": 0.34383344650268555, + "learning_rate": 0.0002, + "loss": 1.2733, + "mean_token_accuracy": 0.6612512767314911, + "num_tokens": 646123.0, + "step": 76 + }, + { + "entropy": 1.328520268201828, + "epoch": 0.030803080308030802, + "grad_norm": 0.3561513125896454, + "learning_rate": 0.00019999992447535154, + "loss": 1.3263, + "mean_token_accuracy": 0.6502320766448975, + "num_tokens": 654808.0, + "step": 77 + }, + { + "entropy": 1.2899321019649506, + "epoch": 0.031203120312031204, + "grad_norm": 0.3678707480430603, + "learning_rate": 0.00019999969790153286, + "loss": 1.3406, + "mean_token_accuracy": 0.6464085876941681, + "num_tokens": 663045.0, + "step": 78 + }, + { + "entropy": 1.3219149708747864, + "epoch": 0.0316031603160316, + "grad_norm": 0.38404518365859985, + "learning_rate": 0.00019999932027892428, + "loss": 1.302, + "mean_token_accuracy": 0.6544652730226517, + "num_tokens": 671266.0, + "step": 79 + }, + { + "entropy": 1.227865844964981, + "epoch": 0.032003200320032, + "grad_norm": 0.3195721209049225, + "learning_rate": 0.0001999987916081595, + "loss": 1.2129, + "mean_token_accuracy": 0.6690118610858917, + "num_tokens": 680536.0, + "step": 80 + }, + { + "entropy": 1.2681958079338074, + "epoch": 0.032403240324032405, + "grad_norm": 0.33165785670280457, + "learning_rate": 0.00019999811189012589, + "loss": 1.2616, + "mean_token_accuracy": 0.6542633771896362, + "num_tokens": 689078.0, + "step": 81 + }, + { + "entropy": 1.2480992376804352, + "epoch": 0.032803280328032806, + "grad_norm": 0.3365044891834259, + "learning_rate": 0.00019999728112596419, + "loss": 1.2532, + "mean_token_accuracy": 0.6593984663486481, + "num_tokens": 697600.0, + "step": 82 + }, + { + "entropy": 1.2559486627578735, + "epoch": 0.0332033203320332, + "grad_norm": 0.3525690734386444, + "learning_rate": 0.0001999962993170687, + "loss": 1.2407, + "mean_token_accuracy": 0.6652248501777649, + "num_tokens": 706449.0, + "step": 83 + }, + { + "entropy": 1.2723756432533264, + "epoch": 0.0336033603360336, + "grad_norm": 0.3243389129638672, + "learning_rate": 0.00019999516646508717, + "loss": 1.2759, + "mean_token_accuracy": 0.6553087830543518, + "num_tokens": 715261.0, + "step": 84 + }, + { + "entropy": 1.286735862493515, + "epoch": 0.034003400340034004, + "grad_norm": 0.3348769247531891, + "learning_rate": 0.000199993882571921, + "loss": 1.3288, + "mean_token_accuracy": 0.6503776162862778, + "num_tokens": 723935.0, + "step": 85 + }, + { + "entropy": 1.2838447391986847, + "epoch": 0.034403440344034406, + "grad_norm": 0.31921443343162537, + "learning_rate": 0.0001999924476397249, + "loss": 1.2712, + "mean_token_accuracy": 0.6571811884641647, + "num_tokens": 732552.0, + "step": 86 + }, + { + "entropy": 1.2601779401302338, + "epoch": 0.0348034803480348, + "grad_norm": 0.3210558593273163, + "learning_rate": 0.0001999908616709071, + "loss": 1.2409, + "mean_token_accuracy": 0.6692058891057968, + "num_tokens": 741619.0, + "step": 87 + }, + { + "entropy": 1.2706993520259857, + "epoch": 0.0352035203520352, + "grad_norm": 0.3449415862560272, + "learning_rate": 0.00019998912466812952, + "loss": 1.2301, + "mean_token_accuracy": 0.6645237505435944, + "num_tokens": 750045.0, + "step": 88 + }, + { + "entropy": 1.264108419418335, + "epoch": 0.0356035603560356, + "grad_norm": 0.3272925913333893, + "learning_rate": 0.00019998723663430733, + "loss": 1.2593, + "mean_token_accuracy": 0.6653023958206177, + "num_tokens": 758535.0, + "step": 89 + }, + { + "entropy": 1.174435406923294, + "epoch": 0.036003600360036005, + "grad_norm": 0.3484836518764496, + "learning_rate": 0.00019998519757260928, + "loss": 1.1771, + "mean_token_accuracy": 0.6722908169031143, + "num_tokens": 766995.0, + "step": 90 + }, + { + "entropy": 1.2018343806266785, + "epoch": 0.036403640364036406, + "grad_norm": 0.3412557542324066, + "learning_rate": 0.00019998300748645754, + "loss": 1.2204, + "mean_token_accuracy": 0.6707678735256195, + "num_tokens": 775542.0, + "step": 91 + }, + { + "entropy": 1.3117725551128387, + "epoch": 0.0368036803680368, + "grad_norm": 0.3464583158493042, + "learning_rate": 0.00019998066637952783, + "loss": 1.304, + "mean_token_accuracy": 0.645479291677475, + "num_tokens": 783830.0, + "step": 92 + }, + { + "entropy": 1.266638070344925, + "epoch": 0.0372037203720372, + "grad_norm": 0.35132962465286255, + "learning_rate": 0.0001999781742557493, + "loss": 1.2571, + "mean_token_accuracy": 0.6589740812778473, + "num_tokens": 792085.0, + "step": 93 + }, + { + "entropy": 1.266037255525589, + "epoch": 0.037603760376037604, + "grad_norm": 0.3320970833301544, + "learning_rate": 0.00019997553111930448, + "loss": 1.2761, + "mean_token_accuracy": 0.654522180557251, + "num_tokens": 800687.0, + "step": 94 + }, + { + "entropy": 1.324877679347992, + "epoch": 0.038003800380038005, + "grad_norm": 0.34410229325294495, + "learning_rate": 0.00019997273697462952, + "loss": 1.3059, + "mean_token_accuracy": 0.6469769328832626, + "num_tokens": 808479.0, + "step": 95 + }, + { + "entropy": 1.24421826004982, + "epoch": 0.03840384038403841, + "grad_norm": 0.3413639962673187, + "learning_rate": 0.00019996979182641383, + "loss": 1.2116, + "mean_token_accuracy": 0.6725156307220459, + "num_tokens": 817193.0, + "step": 96 + }, + { + "entropy": 1.2131675779819489, + "epoch": 0.0388038803880388, + "grad_norm": 0.31536421179771423, + "learning_rate": 0.00019996669567960031, + "loss": 1.2337, + "mean_token_accuracy": 0.6649139970541, + "num_tokens": 825915.0, + "step": 97 + }, + { + "entropy": 1.2785483300685883, + "epoch": 0.0392039203920392, + "grad_norm": 0.3453619182109833, + "learning_rate": 0.00019996344853938534, + "loss": 1.2257, + "mean_token_accuracy": 0.6682975143194199, + "num_tokens": 833771.0, + "step": 98 + }, + { + "entropy": 1.2706316709518433, + "epoch": 0.039603960396039604, + "grad_norm": 0.34687721729278564, + "learning_rate": 0.00019996005041121871, + "loss": 1.2578, + "mean_token_accuracy": 0.6584849059581757, + "num_tokens": 842093.0, + "step": 99 + }, + { + "entropy": 1.310558557510376, + "epoch": 0.040004000400040006, + "grad_norm": 0.34193679690361023, + "learning_rate": 0.0001999565013008035, + "loss": 1.338, + "mean_token_accuracy": 0.6487725079059601, + "num_tokens": 850079.0, + "step": 100 + }, + { + "entropy": 1.2646283209323883, + "epoch": 0.04040404040404041, + "grad_norm": 0.3951033651828766, + "learning_rate": 0.00019995280121409636, + "loss": 1.3172, + "mean_token_accuracy": 0.6424316316843033, + "num_tokens": 858250.0, + "step": 101 + }, + { + "entropy": 1.2900939583778381, + "epoch": 0.0408040804080408, + "grad_norm": 0.3364447057247162, + "learning_rate": 0.00019994895015730717, + "loss": 1.2487, + "mean_token_accuracy": 0.6626600474119186, + "num_tokens": 866623.0, + "step": 102 + }, + { + "entropy": 1.294897198677063, + "epoch": 0.041204120412041204, + "grad_norm": 0.3506770431995392, + "learning_rate": 0.00019994494813689928, + "loss": 1.2672, + "mean_token_accuracy": 0.6523661762475967, + "num_tokens": 875370.0, + "step": 103 + }, + { + "entropy": 1.2744373679161072, + "epoch": 0.041604160416041605, + "grad_norm": 0.31772273778915405, + "learning_rate": 0.00019994079515958942, + "loss": 1.2437, + "mean_token_accuracy": 0.6669129282236099, + "num_tokens": 884081.0, + "step": 104 + }, + { + "entropy": 1.2323677241802216, + "epoch": 0.04200420042004201, + "grad_norm": 0.31223100423812866, + "learning_rate": 0.00019993649123234758, + "loss": 1.2034, + "mean_token_accuracy": 0.6670378148555756, + "num_tokens": 892383.0, + "step": 105 + }, + { + "entropy": 1.1459662318229675, + "epoch": 0.0424042404240424, + "grad_norm": 0.3307859003543854, + "learning_rate": 0.00019993203636239717, + "loss": 1.2135, + "mean_token_accuracy": 0.6718799471855164, + "num_tokens": 900628.0, + "step": 106 + }, + { + "entropy": 1.2268281877040863, + "epoch": 0.0428042804280428, + "grad_norm": 0.35912272334098816, + "learning_rate": 0.00019992743055721493, + "loss": 1.2609, + "mean_token_accuracy": 0.6666164696216583, + "num_tokens": 909062.0, + "step": 107 + }, + { + "entropy": 1.200032651424408, + "epoch": 0.043204320432043204, + "grad_norm": 0.35117003321647644, + "learning_rate": 0.00019992267382453092, + "loss": 1.2047, + "mean_token_accuracy": 0.6681774854660034, + "num_tokens": 918221.0, + "step": 108 + }, + { + "entropy": 1.3714069724082947, + "epoch": 0.043604360436043606, + "grad_norm": 0.33686235547065735, + "learning_rate": 0.0001999177661723284, + "loss": 1.2777, + "mean_token_accuracy": 0.655053585767746, + "num_tokens": 926443.0, + "step": 109 + }, + { + "entropy": 1.3487186133861542, + "epoch": 0.04400440044004401, + "grad_norm": 0.3200630843639374, + "learning_rate": 0.0001999127076088441, + "loss": 1.3107, + "mean_token_accuracy": 0.6602136790752411, + "num_tokens": 934650.0, + "step": 110 + }, + { + "entropy": 1.2584488987922668, + "epoch": 0.0444044404440444, + "grad_norm": 0.31613630056381226, + "learning_rate": 0.0001999074981425679, + "loss": 1.2226, + "mean_token_accuracy": 0.6622737497091293, + "num_tokens": 942947.0, + "step": 111 + }, + { + "entropy": 1.1936236023902893, + "epoch": 0.0448044804480448, + "grad_norm": 0.316254198551178, + "learning_rate": 0.00019990213778224298, + "loss": 1.2106, + "mean_token_accuracy": 0.6652569025754929, + "num_tokens": 951465.0, + "step": 112 + }, + { + "entropy": 1.165192574262619, + "epoch": 0.045204520452045205, + "grad_norm": 0.31257057189941406, + "learning_rate": 0.00019989662653686576, + "loss": 1.2065, + "mean_token_accuracy": 0.6672259867191315, + "num_tokens": 960215.0, + "step": 113 + }, + { + "entropy": 1.180109590291977, + "epoch": 0.045604560456045606, + "grad_norm": 0.3332797884941101, + "learning_rate": 0.00019989096441568591, + "loss": 1.2285, + "mean_token_accuracy": 0.6671265214681625, + "num_tokens": 968893.0, + "step": 114 + }, + { + "entropy": 1.220985621213913, + "epoch": 0.04600460046004601, + "grad_norm": 0.3698706030845642, + "learning_rate": 0.0001998851514282063, + "loss": 1.2314, + "mean_token_accuracy": 0.6654269397258759, + "num_tokens": 976891.0, + "step": 115 + }, + { + "entropy": 1.2753552794456482, + "epoch": 0.0464046404640464, + "grad_norm": 0.32274726033210754, + "learning_rate": 0.00019987918758418308, + "loss": 1.2811, + "mean_token_accuracy": 0.6611100733280182, + "num_tokens": 984914.0, + "step": 116 + }, + { + "entropy": 1.308321624994278, + "epoch": 0.046804680468046804, + "grad_norm": 0.33258453011512756, + "learning_rate": 0.00019987307289362545, + "loss": 1.2541, + "mean_token_accuracy": 0.6605920940637589, + "num_tokens": 993096.0, + "step": 117 + }, + { + "entropy": 1.2893326878547668, + "epoch": 0.047204720472047206, + "grad_norm": 0.33915621042251587, + "learning_rate": 0.00019986680736679586, + "loss": 1.2511, + "mean_token_accuracy": 0.6640890389680862, + "num_tokens": 1001323.0, + "step": 118 + }, + { + "entropy": 1.30213862657547, + "epoch": 0.04760476047604761, + "grad_norm": 0.3717119097709656, + "learning_rate": 0.00019986039101420994, + "loss": 1.3143, + "mean_token_accuracy": 0.649169459939003, + "num_tokens": 1009892.0, + "step": 119 + }, + { + "entropy": 1.3021227717399597, + "epoch": 0.048004800480048, + "grad_norm": 0.32890114188194275, + "learning_rate": 0.0001998538238466364, + "loss": 1.2351, + "mean_token_accuracy": 0.6693892329931259, + "num_tokens": 1017992.0, + "step": 120 + }, + { + "entropy": 1.2010404765605927, + "epoch": 0.0484048404840484, + "grad_norm": 0.3222126066684723, + "learning_rate": 0.00019984710587509706, + "loss": 1.1934, + "mean_token_accuracy": 0.6745197772979736, + "num_tokens": 1026224.0, + "step": 121 + }, + { + "entropy": 1.2384890913963318, + "epoch": 0.048804880488048805, + "grad_norm": 0.32965728640556335, + "learning_rate": 0.00019984023711086687, + "loss": 1.2587, + "mean_token_accuracy": 0.6567209810018539, + "num_tokens": 1034674.0, + "step": 122 + }, + { + "entropy": 1.1893330216407776, + "epoch": 0.049204920492049206, + "grad_norm": 0.3488786518573761, + "learning_rate": 0.0001998332175654739, + "loss": 1.1999, + "mean_token_accuracy": 0.6683076322078705, + "num_tokens": 1042546.0, + "step": 123 + }, + { + "entropy": 1.2300190329551697, + "epoch": 0.04960496049604961, + "grad_norm": 0.33502018451690674, + "learning_rate": 0.00019982604725069918, + "loss": 1.2714, + "mean_token_accuracy": 0.6550982743501663, + "num_tokens": 1051075.0, + "step": 124 + }, + { + "entropy": 1.263420820236206, + "epoch": 0.05000500050005, + "grad_norm": 0.35562458634376526, + "learning_rate": 0.00019981872617857684, + "loss": 1.2535, + "mean_token_accuracy": 0.6570105701684952, + "num_tokens": 1059384.0, + "step": 125 + }, + { + "entropy": 1.2463673949241638, + "epoch": 0.050405040504050404, + "grad_norm": 0.3122851252555847, + "learning_rate": 0.00019981125436139405, + "loss": 1.2035, + "mean_token_accuracy": 0.6734038293361664, + "num_tokens": 1068524.0, + "step": 126 + }, + { + "entropy": 1.3272143006324768, + "epoch": 0.050805080508050805, + "grad_norm": 0.37185049057006836, + "learning_rate": 0.00019980363181169096, + "loss": 1.2723, + "mean_token_accuracy": 0.6541654914617538, + "num_tokens": 1076256.0, + "step": 127 + }, + { + "entropy": 1.2414169907569885, + "epoch": 0.05120512051205121, + "grad_norm": 0.32138875126838684, + "learning_rate": 0.00019979585854226065, + "loss": 1.1992, + "mean_token_accuracy": 0.6784048974514008, + "num_tokens": 1084784.0, + "step": 128 + }, + { + "entropy": 1.1664628982543945, + "epoch": 0.05160516051605161, + "grad_norm": 0.31607839465141296, + "learning_rate": 0.00019978793456614918, + "loss": 1.1728, + "mean_token_accuracy": 0.6773318648338318, + "num_tokens": 1094177.0, + "step": 129 + }, + { + "entropy": 1.1460879147052765, + "epoch": 0.052005200520052, + "grad_norm": 0.3119550347328186, + "learning_rate": 0.0001997798598966556, + "loss": 1.1576, + "mean_token_accuracy": 0.6763872653245926, + "num_tokens": 1102808.0, + "step": 130 + }, + { + "entropy": 1.1866309642791748, + "epoch": 0.052405240524052404, + "grad_norm": 0.3441757261753082, + "learning_rate": 0.00019977163454733184, + "loss": 1.2228, + "mean_token_accuracy": 0.6688681393861771, + "num_tokens": 1111447.0, + "step": 131 + }, + { + "entropy": 1.1310507953166962, + "epoch": 0.052805280528052806, + "grad_norm": 0.3540189862251282, + "learning_rate": 0.00019976325853198268, + "loss": 1.1514, + "mean_token_accuracy": 0.6831837445497513, + "num_tokens": 1120000.0, + "step": 132 + }, + { + "entropy": 1.19211745262146, + "epoch": 0.05320532053205321, + "grad_norm": 0.3323245942592621, + "learning_rate": 0.00019975473186466583, + "loss": 1.2119, + "mean_token_accuracy": 0.6718263179063797, + "num_tokens": 1128658.0, + "step": 133 + }, + { + "entropy": 1.1928575336933136, + "epoch": 0.0536053605360536, + "grad_norm": 0.34882429242134094, + "learning_rate": 0.0001997460545596918, + "loss": 1.2066, + "mean_token_accuracy": 0.6791622638702393, + "num_tokens": 1137143.0, + "step": 134 + }, + { + "entropy": 1.226127952337265, + "epoch": 0.054005400540054004, + "grad_norm": 0.3233380913734436, + "learning_rate": 0.00019973722663162396, + "loss": 1.1884, + "mean_token_accuracy": 0.6750646978616714, + "num_tokens": 1145501.0, + "step": 135 + }, + { + "entropy": 1.2761054337024689, + "epoch": 0.054405440544054405, + "grad_norm": 0.308118611574173, + "learning_rate": 0.00019972824809527838, + "loss": 1.224, + "mean_token_accuracy": 0.6631017774343491, + "num_tokens": 1153912.0, + "step": 136 + }, + { + "entropy": 1.3157364130020142, + "epoch": 0.05480548054805481, + "grad_norm": 0.33582690358161926, + "learning_rate": 0.00019971911896572405, + "loss": 1.2701, + "mean_token_accuracy": 0.6578985750675201, + "num_tokens": 1161769.0, + "step": 137 + }, + { + "entropy": 1.2075002789497375, + "epoch": 0.05520552055205521, + "grad_norm": 0.3170996606349945, + "learning_rate": 0.00019970983925828256, + "loss": 1.1906, + "mean_token_accuracy": 0.6732707768678665, + "num_tokens": 1170319.0, + "step": 138 + }, + { + "entropy": 1.1732978522777557, + "epoch": 0.0556055605560556, + "grad_norm": 0.32156452536582947, + "learning_rate": 0.0001997004089885283, + "loss": 1.1782, + "mean_token_accuracy": 0.6732619553804398, + "num_tokens": 1178801.0, + "step": 139 + }, + { + "entropy": 1.1573354601860046, + "epoch": 0.056005600560056004, + "grad_norm": 0.33083587884902954, + "learning_rate": 0.00019969082817228832, + "loss": 1.2067, + "mean_token_accuracy": 0.6737565696239471, + "num_tokens": 1186994.0, + "step": 140 + }, + { + "entropy": 1.211174637079239, + "epoch": 0.056405640564056406, + "grad_norm": 0.34685665369033813, + "learning_rate": 0.00019968109682564237, + "loss": 1.2586, + "mean_token_accuracy": 0.6569341272115707, + "num_tokens": 1194743.0, + "step": 141 + }, + { + "entropy": 1.2521505057811737, + "epoch": 0.05680568056805681, + "grad_norm": 0.35258418321609497, + "learning_rate": 0.00019967121496492282, + "loss": 1.2599, + "mean_token_accuracy": 0.6645904332399368, + "num_tokens": 1202435.0, + "step": 142 + }, + { + "entropy": 1.2398549616336823, + "epoch": 0.05720572057205721, + "grad_norm": 0.3388517200946808, + "learning_rate": 0.00019966118260671465, + "loss": 1.2081, + "mean_token_accuracy": 0.6675426363945007, + "num_tokens": 1210326.0, + "step": 143 + }, + { + "entropy": 1.297620803117752, + "epoch": 0.0576057605760576, + "grad_norm": 0.34630584716796875, + "learning_rate": 0.0001996509997678554, + "loss": 1.2857, + "mean_token_accuracy": 0.6573289930820465, + "num_tokens": 1218682.0, + "step": 144 + }, + { + "entropy": 1.248921811580658, + "epoch": 0.058005800580058005, + "grad_norm": 0.33417370915412903, + "learning_rate": 0.00019964066646543517, + "loss": 1.2036, + "mean_token_accuracy": 0.6730931401252747, + "num_tokens": 1227725.0, + "step": 145 + }, + { + "entropy": 1.2742219269275665, + "epoch": 0.058405840584058406, + "grad_norm": 0.31867334246635437, + "learning_rate": 0.00019963018271679667, + "loss": 1.2356, + "mean_token_accuracy": 0.6603083312511444, + "num_tokens": 1236112.0, + "step": 146 + }, + { + "entropy": 1.2454158961772919, + "epoch": 0.05880588058805881, + "grad_norm": 0.31619757413864136, + "learning_rate": 0.000199619548539535, + "loss": 1.2272, + "mean_token_accuracy": 0.664936900138855, + "num_tokens": 1244932.0, + "step": 147 + }, + { + "entropy": 1.1861615478992462, + "epoch": 0.05920592059205921, + "grad_norm": 0.3590589761734009, + "learning_rate": 0.00019960876395149778, + "loss": 1.2122, + "mean_token_accuracy": 0.6684562414884567, + "num_tokens": 1253316.0, + "step": 148 + }, + { + "entropy": 1.1777002215385437, + "epoch": 0.059605960596059604, + "grad_norm": 0.3057377338409424, + "learning_rate": 0.00019959782897078504, + "loss": 1.1483, + "mean_token_accuracy": 0.6810255944728851, + "num_tokens": 1261895.0, + "step": 149 + }, + { + "entropy": 1.2077372670173645, + "epoch": 0.060006000600060005, + "grad_norm": 0.32661283016204834, + "learning_rate": 0.00019958674361574927, + "loss": 1.2242, + "mean_token_accuracy": 0.6603673696517944, + "num_tokens": 1270647.0, + "step": 150 + }, + { + "entropy": 1.2129946649074554, + "epoch": 0.06040604060406041, + "grad_norm": 0.33181479573249817, + "learning_rate": 0.00019957550790499526, + "loss": 1.214, + "mean_token_accuracy": 0.6734245270490646, + "num_tokens": 1279483.0, + "step": 151 + }, + { + "entropy": 1.2279469072818756, + "epoch": 0.06080608060806081, + "grad_norm": 0.36564233899116516, + "learning_rate": 0.00019956412185738025, + "loss": 1.2227, + "mean_token_accuracy": 0.664169505238533, + "num_tokens": 1288062.0, + "step": 152 + }, + { + "entropy": 1.1853630542755127, + "epoch": 0.0612061206120612, + "grad_norm": 0.3081769645214081, + "learning_rate": 0.0001995525854920137, + "loss": 1.2009, + "mean_token_accuracy": 0.6692493110895157, + "num_tokens": 1296644.0, + "step": 153 + }, + { + "entropy": 1.1182245910167694, + "epoch": 0.061606160616061605, + "grad_norm": 0.28534799814224243, + "learning_rate": 0.00019954089882825738, + "loss": 1.0659, + "mean_token_accuracy": 0.7025346755981445, + "num_tokens": 1305683.0, + "step": 154 + }, + { + "entropy": 1.1886220276355743, + "epoch": 0.062006200620062006, + "grad_norm": 0.3182019293308258, + "learning_rate": 0.0001995290618857253, + "loss": 1.1576, + "mean_token_accuracy": 0.6741877645254135, + "num_tokens": 1314385.0, + "step": 155 + }, + { + "entropy": 1.2045941054821014, + "epoch": 0.06240624062406241, + "grad_norm": 0.3276945948600769, + "learning_rate": 0.0001995170746842838, + "loss": 1.165, + "mean_token_accuracy": 0.6834963709115982, + "num_tokens": 1322826.0, + "step": 156 + }, + { + "entropy": 1.2731471955776215, + "epoch": 0.0628062806280628, + "grad_norm": 0.3397105932235718, + "learning_rate": 0.00019950493724405117, + "loss": 1.2985, + "mean_token_accuracy": 0.648296907544136, + "num_tokens": 1331327.0, + "step": 157 + }, + { + "entropy": 1.1947194337844849, + "epoch": 0.0632063206320632, + "grad_norm": 0.2986201047897339, + "learning_rate": 0.00019949264958539807, + "loss": 1.205, + "mean_token_accuracy": 0.6792440861463547, + "num_tokens": 1340147.0, + "step": 158 + }, + { + "entropy": 1.1570270955562592, + "epoch": 0.0636063606360636, + "grad_norm": 0.3215077519416809, + "learning_rate": 0.00019948021172894718, + "loss": 1.1681, + "mean_token_accuracy": 0.6815727949142456, + "num_tokens": 1348989.0, + "step": 159 + }, + { + "entropy": 1.122036024928093, + "epoch": 0.064006400640064, + "grad_norm": 0.3120049238204956, + "learning_rate": 0.00019946762369557323, + "loss": 1.1377, + "mean_token_accuracy": 0.6871893852949142, + "num_tokens": 1357863.0, + "step": 160 + }, + { + "entropy": 1.2672194242477417, + "epoch": 0.06440644064406441, + "grad_norm": 0.33700302243232727, + "learning_rate": 0.00019945488550640313, + "loss": 1.2532, + "mean_token_accuracy": 0.664255827665329, + "num_tokens": 1365945.0, + "step": 161 + }, + { + "entropy": 1.1509548127651215, + "epoch": 0.06480648064806481, + "grad_norm": 0.3201735019683838, + "learning_rate": 0.00019944199718281559, + "loss": 1.1387, + "mean_token_accuracy": 0.6814217865467072, + "num_tokens": 1375147.0, + "step": 162 + }, + { + "entropy": 1.1635609865188599, + "epoch": 0.06520652065206521, + "grad_norm": 0.2953193187713623, + "learning_rate": 0.0001994289587464415, + "loss": 1.1817, + "mean_token_accuracy": 0.6780352145433426, + "num_tokens": 1383893.0, + "step": 163 + }, + { + "entropy": 1.1869005262851715, + "epoch": 0.06560656065606561, + "grad_norm": 0.30155807733535767, + "learning_rate": 0.00019941577021916355, + "loss": 1.1834, + "mean_token_accuracy": 0.6724350303411484, + "num_tokens": 1392477.0, + "step": 164 + }, + { + "entropy": 1.1506932377815247, + "epoch": 0.066006600660066, + "grad_norm": 0.31121376156806946, + "learning_rate": 0.00019940243162311642, + "loss": 1.1673, + "mean_token_accuracy": 0.6797937452793121, + "num_tokens": 1400899.0, + "step": 165 + }, + { + "entropy": 1.2660083770751953, + "epoch": 0.0664066406640664, + "grad_norm": 0.3299071788787842, + "learning_rate": 0.00019938894298068661, + "loss": 1.2725, + "mean_token_accuracy": 0.6537068784236908, + "num_tokens": 1409546.0, + "step": 166 + }, + { + "entropy": 1.2500199675559998, + "epoch": 0.0668066806680668, + "grad_norm": 0.3030771017074585, + "learning_rate": 0.00019937530431451243, + "loss": 1.1776, + "mean_token_accuracy": 0.6745365858078003, + "num_tokens": 1417712.0, + "step": 167 + }, + { + "entropy": 1.2582001090049744, + "epoch": 0.0672067206720672, + "grad_norm": 0.30366259813308716, + "learning_rate": 0.00019936151564748403, + "loss": 1.2339, + "mean_token_accuracy": 0.6664343029260635, + "num_tokens": 1426352.0, + "step": 168 + }, + { + "entropy": 1.2371725142002106, + "epoch": 0.0676067606760676, + "grad_norm": 0.3065868616104126, + "learning_rate": 0.00019934757700274325, + "loss": 1.223, + "mean_token_accuracy": 0.6679128706455231, + "num_tokens": 1434986.0, + "step": 169 + }, + { + "entropy": 1.2751116156578064, + "epoch": 0.06800680068006801, + "grad_norm": 0.3346325755119324, + "learning_rate": 0.00019933348840368368, + "loss": 1.2569, + "mean_token_accuracy": 0.6594884544610977, + "num_tokens": 1442823.0, + "step": 170 + }, + { + "entropy": 1.1633991301059723, + "epoch": 0.06840684068406841, + "grad_norm": 0.3242139518260956, + "learning_rate": 0.0001993192498739506, + "loss": 1.1805, + "mean_token_accuracy": 0.6728992164134979, + "num_tokens": 1451134.0, + "step": 171 + }, + { + "entropy": 1.2180014848709106, + "epoch": 0.06880688068806881, + "grad_norm": 0.3972644507884979, + "learning_rate": 0.0001993048614374409, + "loss": 1.2393, + "mean_token_accuracy": 0.6580066382884979, + "num_tokens": 1459262.0, + "step": 172 + }, + { + "entropy": 1.1176005005836487, + "epoch": 0.06920692069206921, + "grad_norm": 0.3137458264827728, + "learning_rate": 0.00019929032311830303, + "loss": 1.1644, + "mean_token_accuracy": 0.6814699321985245, + "num_tokens": 1467853.0, + "step": 173 + }, + { + "entropy": 1.1198759078979492, + "epoch": 0.0696069606960696, + "grad_norm": 0.3517007529735565, + "learning_rate": 0.000199275634940937, + "loss": 1.1312, + "mean_token_accuracy": 0.6874582916498184, + "num_tokens": 1476497.0, + "step": 174 + }, + { + "entropy": 1.2389306426048279, + "epoch": 0.07000700070007, + "grad_norm": 0.32016775012016296, + "learning_rate": 0.00019926079692999445, + "loss": 1.214, + "mean_token_accuracy": 0.6705743223428726, + "num_tokens": 1484294.0, + "step": 175 + }, + { + "entropy": 1.3337944746017456, + "epoch": 0.0704070407040704, + "grad_norm": 0.33495742082595825, + "learning_rate": 0.00019924580911037827, + "loss": 1.2954, + "mean_token_accuracy": 0.6510952711105347, + "num_tokens": 1492575.0, + "step": 176 + }, + { + "entropy": 1.2905775010585785, + "epoch": 0.0708070807080708, + "grad_norm": 0.3236202001571655, + "learning_rate": 0.00019923067150724296, + "loss": 1.219, + "mean_token_accuracy": 0.6705390512943268, + "num_tokens": 1500716.0, + "step": 177 + }, + { + "entropy": 1.2353481650352478, + "epoch": 0.0712071207120712, + "grad_norm": 0.3262037932872772, + "learning_rate": 0.00019921538414599437, + "loss": 1.2076, + "mean_token_accuracy": 0.6677059978246689, + "num_tokens": 1509105.0, + "step": 178 + }, + { + "entropy": 1.2299005091190338, + "epoch": 0.07160716071607161, + "grad_norm": 0.3147687315940857, + "learning_rate": 0.00019919994705228965, + "loss": 1.2301, + "mean_token_accuracy": 0.6644129753112793, + "num_tokens": 1516981.0, + "step": 179 + }, + { + "entropy": 1.1565956473350525, + "epoch": 0.07200720072007201, + "grad_norm": 0.31962037086486816, + "learning_rate": 0.00019918436025203728, + "loss": 1.2013, + "mean_token_accuracy": 0.6825570911169052, + "num_tokens": 1524951.0, + "step": 180 + }, + { + "entropy": 1.1386863589286804, + "epoch": 0.07240724072407241, + "grad_norm": 0.30647844076156616, + "learning_rate": 0.00019916862377139695, + "loss": 1.1697, + "mean_token_accuracy": 0.6716460883617401, + "num_tokens": 1533450.0, + "step": 181 + }, + { + "entropy": 1.1206298768520355, + "epoch": 0.07280728072807281, + "grad_norm": 0.2919379472732544, + "learning_rate": 0.00019915273763677959, + "loss": 1.1221, + "mean_token_accuracy": 0.6845085620880127, + "num_tokens": 1542345.0, + "step": 182 + }, + { + "entropy": 1.1708945035934448, + "epoch": 0.07320732073207321, + "grad_norm": 0.3223237097263336, + "learning_rate": 0.00019913670187484737, + "loss": 1.1722, + "mean_token_accuracy": 0.681228905916214, + "num_tokens": 1551016.0, + "step": 183 + }, + { + "entropy": 1.1606915593147278, + "epoch": 0.0736073607360736, + "grad_norm": 0.3167206943035126, + "learning_rate": 0.00019912051651251346, + "loss": 1.1381, + "mean_token_accuracy": 0.686376079916954, + "num_tokens": 1560201.0, + "step": 184 + }, + { + "entropy": 1.2089463472366333, + "epoch": 0.074007400740074, + "grad_norm": 0.331546813249588, + "learning_rate": 0.00019910418157694217, + "loss": 1.1998, + "mean_token_accuracy": 0.6701401472091675, + "num_tokens": 1568847.0, + "step": 185 + }, + { + "entropy": 1.2552906274795532, + "epoch": 0.0744074407440744, + "grad_norm": 0.3218790292739868, + "learning_rate": 0.00019908769709554887, + "loss": 1.2302, + "mean_token_accuracy": 0.6671873778104782, + "num_tokens": 1577212.0, + "step": 186 + }, + { + "entropy": 1.0971337109804153, + "epoch": 0.0748074807480748, + "grad_norm": 0.2888547480106354, + "learning_rate": 0.00019907106309599985, + "loss": 1.1053, + "mean_token_accuracy": 0.6914333999156952, + "num_tokens": 1586544.0, + "step": 187 + }, + { + "entropy": 1.1342568099498749, + "epoch": 0.07520752075207521, + "grad_norm": 0.3135220408439636, + "learning_rate": 0.00019905427960621245, + "loss": 1.1553, + "mean_token_accuracy": 0.678636908531189, + "num_tokens": 1595573.0, + "step": 188 + }, + { + "entropy": 1.2157914340496063, + "epoch": 0.07560756075607561, + "grad_norm": 0.32912546396255493, + "learning_rate": 0.00019903734665435472, + "loss": 1.2219, + "mean_token_accuracy": 0.6693233996629715, + "num_tokens": 1603723.0, + "step": 189 + }, + { + "entropy": 1.1541197896003723, + "epoch": 0.07600760076007601, + "grad_norm": 0.31249913573265076, + "learning_rate": 0.00019902026426884574, + "loss": 1.1311, + "mean_token_accuracy": 0.6898495107889175, + "num_tokens": 1612212.0, + "step": 190 + }, + { + "entropy": 1.211905598640442, + "epoch": 0.07640764076407641, + "grad_norm": 0.3106580078601837, + "learning_rate": 0.00019900303247835527, + "loss": 1.168, + "mean_token_accuracy": 0.675964280962944, + "num_tokens": 1620162.0, + "step": 191 + }, + { + "entropy": 1.2080174088478088, + "epoch": 0.07680768076807681, + "grad_norm": 0.32318130135536194, + "learning_rate": 0.00019898565131180393, + "loss": 1.1781, + "mean_token_accuracy": 0.6760376244783401, + "num_tokens": 1628883.0, + "step": 192 + }, + { + "entropy": 1.2078506350517273, + "epoch": 0.0772077207720772, + "grad_norm": 0.33328673243522644, + "learning_rate": 0.0001989681207983629, + "loss": 1.2092, + "mean_token_accuracy": 0.6628051847219467, + "num_tokens": 1637332.0, + "step": 193 + }, + { + "entropy": 1.210196852684021, + "epoch": 0.0776077607760776, + "grad_norm": 0.32340574264526367, + "learning_rate": 0.00019895044096745416, + "loss": 1.2329, + "mean_token_accuracy": 0.6619292944669724, + "num_tokens": 1645906.0, + "step": 194 + }, + { + "entropy": 1.1815847158432007, + "epoch": 0.078007800780078, + "grad_norm": 0.3175504505634308, + "learning_rate": 0.00019893261184875016, + "loss": 1.2045, + "mean_token_accuracy": 0.6673628389835358, + "num_tokens": 1654114.0, + "step": 195 + }, + { + "entropy": 1.1910730004310608, + "epoch": 0.0784078407840784, + "grad_norm": 0.3114391565322876, + "learning_rate": 0.00019891463347217395, + "loss": 1.1889, + "mean_token_accuracy": 0.6714468449354172, + "num_tokens": 1662666.0, + "step": 196 + }, + { + "entropy": 1.1541639566421509, + "epoch": 0.07880788078807881, + "grad_norm": 0.3364032506942749, + "learning_rate": 0.0001988965058678992, + "loss": 1.1622, + "mean_token_accuracy": 0.67988321185112, + "num_tokens": 1671435.0, + "step": 197 + }, + { + "entropy": 1.222437858581543, + "epoch": 0.07920792079207921, + "grad_norm": 0.3355000913143158, + "learning_rate": 0.00019887822906634983, + "loss": 1.1804, + "mean_token_accuracy": 0.6725995391607285, + "num_tokens": 1679662.0, + "step": 198 + }, + { + "entropy": 1.2075644731521606, + "epoch": 0.07960796079607961, + "grad_norm": 0.33377805352211, + "learning_rate": 0.00019885980309820032, + "loss": 1.1547, + "mean_token_accuracy": 0.6831348687410355, + "num_tokens": 1687663.0, + "step": 199 + }, + { + "entropy": 1.248348981142044, + "epoch": 0.08000800080008001, + "grad_norm": 0.3341095447540283, + "learning_rate": 0.0001988412279943754, + "loss": 1.2665, + "mean_token_accuracy": 0.6561878323554993, + "num_tokens": 1696479.0, + "step": 200 + }, + { + "entropy": 1.224026381969452, + "epoch": 0.08040804080408041, + "grad_norm": 0.33011487126350403, + "learning_rate": 0.00019882250378605015, + "loss": 1.2181, + "mean_token_accuracy": 0.6664289385080338, + "num_tokens": 1704885.0, + "step": 201 + }, + { + "entropy": 1.1437757015228271, + "epoch": 0.08080808080808081, + "grad_norm": 0.31265076994895935, + "learning_rate": 0.00019880363050464993, + "loss": 1.1773, + "mean_token_accuracy": 0.6812110096216202, + "num_tokens": 1713409.0, + "step": 202 + }, + { + "entropy": 1.2059556543827057, + "epoch": 0.0812081208120812, + "grad_norm": 0.315448135137558, + "learning_rate": 0.00019878460818185023, + "loss": 1.2278, + "mean_token_accuracy": 0.6699778735637665, + "num_tokens": 1721548.0, + "step": 203 + }, + { + "entropy": 1.2078820168972015, + "epoch": 0.0816081608160816, + "grad_norm": 0.3079279363155365, + "learning_rate": 0.00019876543684957667, + "loss": 1.1845, + "mean_token_accuracy": 0.6785111278295517, + "num_tokens": 1729809.0, + "step": 204 + }, + { + "entropy": 1.199218899011612, + "epoch": 0.082008200820082, + "grad_norm": 0.3043046295642853, + "learning_rate": 0.000198746116540005, + "loss": 1.1722, + "mean_token_accuracy": 0.6754065752029419, + "num_tokens": 1738734.0, + "step": 205 + }, + { + "entropy": 1.2172024846076965, + "epoch": 0.08240824082408241, + "grad_norm": 0.313902884721756, + "learning_rate": 0.00019872664728556101, + "loss": 1.1869, + "mean_token_accuracy": 0.6728281825780869, + "num_tokens": 1746870.0, + "step": 206 + }, + { + "entropy": 1.1678736209869385, + "epoch": 0.08280828082808281, + "grad_norm": 0.3191705644130707, + "learning_rate": 0.00019870702911892042, + "loss": 1.1546, + "mean_token_accuracy": 0.6843972355127335, + "num_tokens": 1755295.0, + "step": 207 + }, + { + "entropy": 1.279354214668274, + "epoch": 0.08320832083208321, + "grad_norm": 0.3313900828361511, + "learning_rate": 0.0001986872620730089, + "loss": 1.2558, + "mean_token_accuracy": 0.659809798002243, + "num_tokens": 1763606.0, + "step": 208 + }, + { + "entropy": 1.078108698129654, + "epoch": 0.08360836083608361, + "grad_norm": 0.283428430557251, + "learning_rate": 0.00019866734618100202, + "loss": 1.1032, + "mean_token_accuracy": 0.69297856092453, + "num_tokens": 1772887.0, + "step": 209 + }, + { + "entropy": 1.186295509338379, + "epoch": 0.08400840084008401, + "grad_norm": 0.35003766417503357, + "learning_rate": 0.0001986472814763251, + "loss": 1.2374, + "mean_token_accuracy": 0.6684627532958984, + "num_tokens": 1781067.0, + "step": 210 + }, + { + "entropy": 1.1557523012161255, + "epoch": 0.08440844084408441, + "grad_norm": 0.31848254799842834, + "learning_rate": 0.00019862706799265322, + "loss": 1.1854, + "mean_token_accuracy": 0.6773674935102463, + "num_tokens": 1789844.0, + "step": 211 + }, + { + "entropy": 1.218627154827118, + "epoch": 0.0848084808480848, + "grad_norm": 0.3408789038658142, + "learning_rate": 0.00019860670576391128, + "loss": 1.1708, + "mean_token_accuracy": 0.6817043423652649, + "num_tokens": 1798509.0, + "step": 212 + }, + { + "entropy": 1.2130761444568634, + "epoch": 0.0852085208520852, + "grad_norm": 0.7527572512626648, + "learning_rate": 0.0001985861948242736, + "loss": 1.2157, + "mean_token_accuracy": 0.6661449372768402, + "num_tokens": 1807202.0, + "step": 213 + }, + { + "entropy": 1.2128455638885498, + "epoch": 0.0856085608560856, + "grad_norm": 0.29946374893188477, + "learning_rate": 0.00019856553520816435, + "loss": 1.1896, + "mean_token_accuracy": 0.6733538210391998, + "num_tokens": 1816131.0, + "step": 214 + }, + { + "entropy": 1.2612944841384888, + "epoch": 0.086008600860086, + "grad_norm": 0.32515719532966614, + "learning_rate": 0.00019854472695025698, + "loss": 1.2329, + "mean_token_accuracy": 0.669788658618927, + "num_tokens": 1824283.0, + "step": 215 + }, + { + "entropy": 1.1807590425014496, + "epoch": 0.08640864086408641, + "grad_norm": 0.3279406726360321, + "learning_rate": 0.0001985237700854746, + "loss": 1.1565, + "mean_token_accuracy": 0.6816118210554123, + "num_tokens": 1833322.0, + "step": 216 + }, + { + "entropy": 1.2046120464801788, + "epoch": 0.08680868086808681, + "grad_norm": 0.2987005412578583, + "learning_rate": 0.00019850266464898955, + "loss": 1.179, + "mean_token_accuracy": 0.6783045381307602, + "num_tokens": 1842092.0, + "step": 217 + }, + { + "entropy": 1.1976227462291718, + "epoch": 0.08720872087208721, + "grad_norm": 0.30504319071769714, + "learning_rate": 0.00019848141067622374, + "loss": 1.1589, + "mean_token_accuracy": 0.6762242764234543, + "num_tokens": 1850740.0, + "step": 218 + }, + { + "entropy": 1.2001455426216125, + "epoch": 0.08760876087608761, + "grad_norm": 0.35163310170173645, + "learning_rate": 0.0001984600082028482, + "loss": 1.1941, + "mean_token_accuracy": 0.6701504737138748, + "num_tokens": 1858729.0, + "step": 219 + }, + { + "entropy": 1.0998838245868683, + "epoch": 0.08800880088008801, + "grad_norm": 0.3166980445384979, + "learning_rate": 0.0001984384572647832, + "loss": 1.1238, + "mean_token_accuracy": 0.683118149638176, + "num_tokens": 1867218.0, + "step": 220 + }, + { + "entropy": 1.1223637461662292, + "epoch": 0.0884088408840884, + "grad_norm": 0.3210962116718292, + "learning_rate": 0.0001984167578981983, + "loss": 1.158, + "mean_token_accuracy": 0.685064285993576, + "num_tokens": 1875656.0, + "step": 221 + }, + { + "entropy": 1.1469238698482513, + "epoch": 0.0888088808880888, + "grad_norm": 0.37055703997612, + "learning_rate": 0.00019839491013951213, + "loss": 1.1976, + "mean_token_accuracy": 0.66952283680439, + "num_tokens": 1884042.0, + "step": 222 + }, + { + "entropy": 1.2010729908943176, + "epoch": 0.0892089208920892, + "grad_norm": 0.30089443922042847, + "learning_rate": 0.00019837291402539223, + "loss": 1.1677, + "mean_token_accuracy": 0.6765223145484924, + "num_tokens": 1892519.0, + "step": 223 + }, + { + "entropy": 1.222718983888626, + "epoch": 0.0896089608960896, + "grad_norm": 0.3071632981300354, + "learning_rate": 0.00019835076959275532, + "loss": 1.1918, + "mean_token_accuracy": 0.6696299612522125, + "num_tokens": 1900924.0, + "step": 224 + }, + { + "entropy": 1.216365933418274, + "epoch": 0.09000900090009001, + "grad_norm": 0.3337574303150177, + "learning_rate": 0.00019832847687876692, + "loss": 1.1572, + "mean_token_accuracy": 0.6832773238420486, + "num_tokens": 1909276.0, + "step": 225 + }, + { + "entropy": 1.1910041272640228, + "epoch": 0.09040904090409041, + "grad_norm": 0.3146218955516815, + "learning_rate": 0.0001983060359208415, + "loss": 1.1782, + "mean_token_accuracy": 0.679167777299881, + "num_tokens": 1918407.0, + "step": 226 + }, + { + "entropy": 1.162790209054947, + "epoch": 0.09080908090809081, + "grad_norm": 0.2975619435310364, + "learning_rate": 0.0001982834467566423, + "loss": 1.1683, + "mean_token_accuracy": 0.6799277067184448, + "num_tokens": 1927282.0, + "step": 227 + }, + { + "entropy": 1.192271113395691, + "epoch": 0.09120912091209121, + "grad_norm": 0.3205324113368988, + "learning_rate": 0.0001982607094240813, + "loss": 1.1681, + "mean_token_accuracy": 0.6754294186830521, + "num_tokens": 1935737.0, + "step": 228 + }, + { + "entropy": 1.1858693957328796, + "epoch": 0.09160916091609161, + "grad_norm": 0.3366444706916809, + "learning_rate": 0.00019823782396131902, + "loss": 1.1944, + "mean_token_accuracy": 0.6657039225101471, + "num_tokens": 1943472.0, + "step": 229 + }, + { + "entropy": 1.1361185312271118, + "epoch": 0.09200920092009202, + "grad_norm": 0.31257081031799316, + "learning_rate": 0.00019821479040676488, + "loss": 1.1529, + "mean_token_accuracy": 0.6812857985496521, + "num_tokens": 1952251.0, + "step": 230 + }, + { + "entropy": 1.2052267491817474, + "epoch": 0.0924092409240924, + "grad_norm": 0.3371609151363373, + "learning_rate": 0.0001981916087990766, + "loss": 1.2363, + "mean_token_accuracy": 0.6580934226512909, + "num_tokens": 1960349.0, + "step": 231 + }, + { + "entropy": 1.1373478174209595, + "epoch": 0.0928092809280928, + "grad_norm": 0.30473393201828003, + "learning_rate": 0.00019816827917716048, + "loss": 1.1727, + "mean_token_accuracy": 0.6796131581068039, + "num_tokens": 1969233.0, + "step": 232 + }, + { + "entropy": 1.1681481301784515, + "epoch": 0.0932093209320932, + "grad_norm": 0.3225601315498352, + "learning_rate": 0.0001981448015801712, + "loss": 1.1528, + "mean_token_accuracy": 0.6749817878007889, + "num_tokens": 1977270.0, + "step": 233 + }, + { + "entropy": 1.2196559309959412, + "epoch": 0.09360936093609361, + "grad_norm": 0.33247852325439453, + "learning_rate": 0.00019812117604751185, + "loss": 1.1834, + "mean_token_accuracy": 0.6816778779029846, + "num_tokens": 1985087.0, + "step": 234 + }, + { + "entropy": 1.218104362487793, + "epoch": 0.09400940094009401, + "grad_norm": 0.3164643347263336, + "learning_rate": 0.00019809740261883372, + "loss": 1.1791, + "mean_token_accuracy": 0.6742540150880814, + "num_tokens": 1993142.0, + "step": 235 + }, + { + "entropy": 1.2172793745994568, + "epoch": 0.09440944094409441, + "grad_norm": 0.31248074769973755, + "learning_rate": 0.0001980734813340364, + "loss": 1.2067, + "mean_token_accuracy": 0.6745200008153915, + "num_tokens": 2001487.0, + "step": 236 + }, + { + "entropy": 1.203236162662506, + "epoch": 0.09480948094809481, + "grad_norm": 0.32407742738723755, + "learning_rate": 0.0001980494122332676, + "loss": 1.1664, + "mean_token_accuracy": 0.6777038276195526, + "num_tokens": 2010136.0, + "step": 237 + }, + { + "entropy": 1.1953341364860535, + "epoch": 0.09520952095209521, + "grad_norm": 0.3571881651878357, + "learning_rate": 0.00019802519535692302, + "loss": 1.1651, + "mean_token_accuracy": 0.6782020479440689, + "num_tokens": 2018515.0, + "step": 238 + }, + { + "entropy": 1.208018183708191, + "epoch": 0.09560956095609562, + "grad_norm": 0.3488442599773407, + "learning_rate": 0.00019800083074564658, + "loss": 1.2217, + "mean_token_accuracy": 0.6720796823501587, + "num_tokens": 2026942.0, + "step": 239 + }, + { + "entropy": 1.1499423384666443, + "epoch": 0.096009600960096, + "grad_norm": 0.30266088247299194, + "learning_rate": 0.00019797631844032992, + "loss": 1.1776, + "mean_token_accuracy": 0.6771319806575775, + "num_tokens": 2035674.0, + "step": 240 + }, + { + "entropy": 1.1237535774707794, + "epoch": 0.0964096409640964, + "grad_norm": 0.3096405863761902, + "learning_rate": 0.00019795165848211278, + "loss": 1.1122, + "mean_token_accuracy": 0.6934310793876648, + "num_tokens": 2044052.0, + "step": 241 + }, + { + "entropy": 1.1529573500156403, + "epoch": 0.0968096809680968, + "grad_norm": 0.3192532956600189, + "learning_rate": 0.0001979268509123825, + "loss": 1.1804, + "mean_token_accuracy": 0.6760334223508835, + "num_tokens": 2052448.0, + "step": 242 + }, + { + "entropy": 1.2383974194526672, + "epoch": 0.09720972097209721, + "grad_norm": 0.3160487711429596, + "learning_rate": 0.00019790189577277432, + "loss": 1.2465, + "mean_token_accuracy": 0.6652619689702988, + "num_tokens": 2060776.0, + "step": 243 + }, + { + "entropy": 1.2161905169487, + "epoch": 0.09760976097609761, + "grad_norm": 0.32217562198638916, + "learning_rate": 0.00019787679310517107, + "loss": 1.1872, + "mean_token_accuracy": 0.6732243746519089, + "num_tokens": 2068794.0, + "step": 244 + }, + { + "entropy": 1.1646412014961243, + "epoch": 0.09800980098009801, + "grad_norm": 0.3009166419506073, + "learning_rate": 0.00019785154295170316, + "loss": 1.1652, + "mean_token_accuracy": 0.6807472556829453, + "num_tokens": 2077262.0, + "step": 245 + }, + { + "entropy": 1.2155237197875977, + "epoch": 0.09840984098409841, + "grad_norm": 0.3069799840450287, + "learning_rate": 0.00019782614535474862, + "loss": 1.216, + "mean_token_accuracy": 0.6698369234800339, + "num_tokens": 2085649.0, + "step": 246 + }, + { + "entropy": 1.1119366884231567, + "epoch": 0.09880988098809881, + "grad_norm": 0.30247923731803894, + "learning_rate": 0.00019780060035693285, + "loss": 1.1038, + "mean_token_accuracy": 0.6942414045333862, + "num_tokens": 2094198.0, + "step": 247 + }, + { + "entropy": 1.2534517645835876, + "epoch": 0.09920992099209922, + "grad_norm": 0.3274390697479248, + "learning_rate": 0.0001977749080011287, + "loss": 1.2635, + "mean_token_accuracy": 0.6554094851016998, + "num_tokens": 2102101.0, + "step": 248 + }, + { + "entropy": 1.1967229545116425, + "epoch": 0.09960996099609962, + "grad_norm": 0.29584378004074097, + "learning_rate": 0.00019774906833045625, + "loss": 1.1822, + "mean_token_accuracy": 0.6769470870494843, + "num_tokens": 2110466.0, + "step": 249 + }, + { + "entropy": 1.1380691528320312, + "epoch": 0.1000100010001, + "grad_norm": 0.28823035955429077, + "learning_rate": 0.00019772308138828299, + "loss": 1.0987, + "mean_token_accuracy": 0.6907877773046494, + "num_tokens": 2119656.0, + "step": 250 + }, + { + "entropy": 1.155064195394516, + "epoch": 0.1004100410041004, + "grad_norm": 0.3187693655490875, + "learning_rate": 0.00019769694721822337, + "loss": 1.1542, + "mean_token_accuracy": 0.6734511256217957, + "num_tokens": 2128073.0, + "step": 251 + }, + { + "entropy": 1.1665138900279999, + "epoch": 0.10081008100810081, + "grad_norm": 0.30443915724754333, + "learning_rate": 0.00019767066586413905, + "loss": 1.2047, + "mean_token_accuracy": 0.6689727902412415, + "num_tokens": 2136624.0, + "step": 252 + }, + { + "entropy": 1.1986846625804901, + "epoch": 0.10121012101210121, + "grad_norm": 0.2993563413619995, + "learning_rate": 0.0001976442373701387, + "loss": 1.1885, + "mean_token_accuracy": 0.6774641126394272, + "num_tokens": 2144946.0, + "step": 253 + }, + { + "entropy": 1.1575412154197693, + "epoch": 0.10161016101610161, + "grad_norm": 0.31819280982017517, + "learning_rate": 0.00019761766178057796, + "loss": 1.1617, + "mean_token_accuracy": 0.6737077832221985, + "num_tokens": 2153241.0, + "step": 254 + }, + { + "entropy": 1.1932867169380188, + "epoch": 0.10201020102010201, + "grad_norm": 0.33500298857688904, + "learning_rate": 0.00019759093914005932, + "loss": 1.1739, + "mean_token_accuracy": 0.6722579598426819, + "num_tokens": 2161532.0, + "step": 255 + }, + { + "entropy": 1.2010496854782104, + "epoch": 0.10241024102410241, + "grad_norm": 0.3177407681941986, + "learning_rate": 0.00019756406949343204, + "loss": 1.1888, + "mean_token_accuracy": 0.6757108420133591, + "num_tokens": 2170296.0, + "step": 256 + }, + { + "entropy": 1.1958762109279633, + "epoch": 0.10281028102810282, + "grad_norm": 0.30990293622016907, + "learning_rate": 0.00019753705288579217, + "loss": 1.1797, + "mean_token_accuracy": 0.6757787764072418, + "num_tokens": 2178618.0, + "step": 257 + }, + { + "entropy": 1.1743170619010925, + "epoch": 0.10321032103210322, + "grad_norm": 0.3038559854030609, + "learning_rate": 0.00019750988936248235, + "loss": 1.169, + "mean_token_accuracy": 0.6733282506465912, + "num_tokens": 2187168.0, + "step": 258 + }, + { + "entropy": 1.1737709939479828, + "epoch": 0.1036103610361036, + "grad_norm": 0.321360319852829, + "learning_rate": 0.0001974825789690918, + "loss": 1.1957, + "mean_token_accuracy": 0.6770029366016388, + "num_tokens": 2195246.0, + "step": 259 + }, + { + "entropy": 1.172276645898819, + "epoch": 0.104010401040104, + "grad_norm": 0.3069777488708496, + "learning_rate": 0.00019745512175145627, + "loss": 1.2094, + "mean_token_accuracy": 0.6666506826877594, + "num_tokens": 2203717.0, + "step": 260 + }, + { + "entropy": 1.3047214448451996, + "epoch": 0.10441044104410441, + "grad_norm": 0.3076897859573364, + "learning_rate": 0.0001974275177556579, + "loss": 1.301, + "mean_token_accuracy": 0.6500514298677444, + "num_tokens": 2212037.0, + "step": 261 + }, + { + "entropy": 1.1853089034557343, + "epoch": 0.10481048104810481, + "grad_norm": 0.30814552307128906, + "learning_rate": 0.00019739976702802517, + "loss": 1.121, + "mean_token_accuracy": 0.6797177791595459, + "num_tokens": 2220415.0, + "step": 262 + }, + { + "entropy": 1.14727121591568, + "epoch": 0.10521052105210521, + "grad_norm": 0.3139231503009796, + "learning_rate": 0.0001973718696151329, + "loss": 1.0951, + "mean_token_accuracy": 0.6984894424676895, + "num_tokens": 2228773.0, + "step": 263 + }, + { + "entropy": 1.1453731060028076, + "epoch": 0.10561056105610561, + "grad_norm": 0.3104467988014221, + "learning_rate": 0.00019734382556380194, + "loss": 1.145, + "mean_token_accuracy": 0.6833966672420502, + "num_tokens": 2236602.0, + "step": 264 + }, + { + "entropy": 1.129274994134903, + "epoch": 0.10601060106010601, + "grad_norm": 0.29663506150245667, + "learning_rate": 0.0001973156349210994, + "loss": 1.1386, + "mean_token_accuracy": 0.6783726066350937, + "num_tokens": 2245313.0, + "step": 265 + }, + { + "entropy": 1.1950629949569702, + "epoch": 0.10641064106410641, + "grad_norm": 0.3033241033554077, + "learning_rate": 0.0001972872977343383, + "loss": 1.2095, + "mean_token_accuracy": 0.6765413582324982, + "num_tokens": 2254362.0, + "step": 266 + }, + { + "entropy": 1.2014857530593872, + "epoch": 0.10681068106810682, + "grad_norm": 0.31535446643829346, + "learning_rate": 0.00019725881405107778, + "loss": 1.2053, + "mean_token_accuracy": 0.6713583916425705, + "num_tokens": 2262331.0, + "step": 267 + }, + { + "entropy": 1.1801405549049377, + "epoch": 0.1072107210721072, + "grad_norm": 0.30611008405685425, + "learning_rate": 0.0001972301839191226, + "loss": 1.1823, + "mean_token_accuracy": 0.6748154610395432, + "num_tokens": 2270765.0, + "step": 268 + }, + { + "entropy": 1.1290169060230255, + "epoch": 0.1076107610761076, + "grad_norm": 0.30215638875961304, + "learning_rate": 0.00019720140738652345, + "loss": 1.1209, + "mean_token_accuracy": 0.6912433356046677, + "num_tokens": 2279593.0, + "step": 269 + }, + { + "entropy": 1.1610883474349976, + "epoch": 0.10801080108010801, + "grad_norm": 0.30377084016799927, + "learning_rate": 0.00019717248450157681, + "loss": 1.1863, + "mean_token_accuracy": 0.6740070879459381, + "num_tokens": 2288100.0, + "step": 270 + }, + { + "entropy": 1.1068450212478638, + "epoch": 0.10841084108410841, + "grad_norm": 0.3132963478565216, + "learning_rate": 0.00019714341531282462, + "loss": 1.0841, + "mean_token_accuracy": 0.6911667734384537, + "num_tokens": 2296290.0, + "step": 271 + }, + { + "entropy": 1.168148934841156, + "epoch": 0.10881088108810881, + "grad_norm": 0.3282947242259979, + "learning_rate": 0.0001971141998690545, + "loss": 1.1941, + "mean_token_accuracy": 0.673908457159996, + "num_tokens": 2304766.0, + "step": 272 + }, + { + "entropy": 1.1689501702785492, + "epoch": 0.10921092109210921, + "grad_norm": 0.2957140803337097, + "learning_rate": 0.00019708483821929943, + "loss": 1.1398, + "mean_token_accuracy": 0.6831405013799667, + "num_tokens": 2313114.0, + "step": 273 + }, + { + "entropy": 1.1905297338962555, + "epoch": 0.10961096109610961, + "grad_norm": 0.29807668924331665, + "learning_rate": 0.00019705533041283779, + "loss": 1.1736, + "mean_token_accuracy": 0.6775653660297394, + "num_tokens": 2321660.0, + "step": 274 + }, + { + "entropy": 1.1815482079982758, + "epoch": 0.11001100110011001, + "grad_norm": 0.29083186388015747, + "learning_rate": 0.00019702567649919337, + "loss": 1.1603, + "mean_token_accuracy": 0.6754807829856873, + "num_tokens": 2330342.0, + "step": 275 + }, + { + "entropy": 1.1261299550533295, + "epoch": 0.11041104110411042, + "grad_norm": 0.2901794910430908, + "learning_rate": 0.00019699587652813503, + "loss": 1.1284, + "mean_token_accuracy": 0.691281333565712, + "num_tokens": 2338852.0, + "step": 276 + }, + { + "entropy": 1.184859186410904, + "epoch": 0.11081108110811082, + "grad_norm": 0.310745507478714, + "learning_rate": 0.00019696593054967682, + "loss": 1.2127, + "mean_token_accuracy": 0.6673152446746826, + "num_tokens": 2346809.0, + "step": 277 + }, + { + "entropy": 1.1188380122184753, + "epoch": 0.1112111211121112, + "grad_norm": 0.29587554931640625, + "learning_rate": 0.00019693583861407786, + "loss": 1.0981, + "mean_token_accuracy": 0.6947813928127289, + "num_tokens": 2355532.0, + "step": 278 + }, + { + "entropy": 1.172318309545517, + "epoch": 0.1116111611161116, + "grad_norm": 0.3138435482978821, + "learning_rate": 0.00019690560077184223, + "loss": 1.1441, + "mean_token_accuracy": 0.6789282411336899, + "num_tokens": 2363938.0, + "step": 279 + }, + { + "entropy": 1.1374418139457703, + "epoch": 0.11201120112011201, + "grad_norm": 0.34152451157569885, + "learning_rate": 0.0001968752170737188, + "loss": 1.1081, + "mean_token_accuracy": 0.6848500221967697, + "num_tokens": 2372334.0, + "step": 280 + }, + { + "entropy": 1.1317946314811707, + "epoch": 0.11241124112411241, + "grad_norm": 0.29949530959129333, + "learning_rate": 0.0001968446875707014, + "loss": 1.1138, + "mean_token_accuracy": 0.6870416551828384, + "num_tokens": 2380730.0, + "step": 281 + }, + { + "entropy": 1.0892143547534943, + "epoch": 0.11281128112811281, + "grad_norm": 0.3009011447429657, + "learning_rate": 0.00019681401231402842, + "loss": 1.0712, + "mean_token_accuracy": 0.6998904794454575, + "num_tokens": 2389463.0, + "step": 282 + }, + { + "entropy": 1.1513322591781616, + "epoch": 0.11321132113211321, + "grad_norm": 0.29763105511665344, + "learning_rate": 0.00019678319135518294, + "loss": 1.1861, + "mean_token_accuracy": 0.6697124987840652, + "num_tokens": 2398473.0, + "step": 283 + }, + { + "entropy": 1.1688634753227234, + "epoch": 0.11361136113611361, + "grad_norm": 0.33001646399497986, + "learning_rate": 0.00019675222474589257, + "loss": 1.2012, + "mean_token_accuracy": 0.673338770866394, + "num_tokens": 2406493.0, + "step": 284 + }, + { + "entropy": 1.1393934190273285, + "epoch": 0.11401140114011402, + "grad_norm": 0.2978336215019226, + "learning_rate": 0.00019672111253812933, + "loss": 1.1566, + "mean_token_accuracy": 0.6849386692047119, + "num_tokens": 2414963.0, + "step": 285 + }, + { + "entropy": 1.1978220045566559, + "epoch": 0.11441144114411442, + "grad_norm": 0.296939879655838, + "learning_rate": 0.00019668985478410968, + "loss": 1.1508, + "mean_token_accuracy": 0.6871092170476913, + "num_tokens": 2423476.0, + "step": 286 + }, + { + "entropy": 1.1493785977363586, + "epoch": 0.1148114811481148, + "grad_norm": 0.3038109242916107, + "learning_rate": 0.00019665845153629425, + "loss": 1.1429, + "mean_token_accuracy": 0.6873074918985367, + "num_tokens": 2432015.0, + "step": 287 + }, + { + "entropy": 1.1764490902423859, + "epoch": 0.1152115211521152, + "grad_norm": 0.28137773275375366, + "learning_rate": 0.00019662690284738793, + "loss": 1.1206, + "mean_token_accuracy": 0.6875211298465729, + "num_tokens": 2440577.0, + "step": 288 + }, + { + "entropy": 1.1811064779758453, + "epoch": 0.11561156115611561, + "grad_norm": 0.2927968502044678, + "learning_rate": 0.00019659520877033976, + "loss": 1.1828, + "mean_token_accuracy": 0.67679663002491, + "num_tokens": 2449585.0, + "step": 289 + }, + { + "entropy": 1.1157205402851105, + "epoch": 0.11601160116011601, + "grad_norm": 0.2844160199165344, + "learning_rate": 0.0001965633693583426, + "loss": 1.1127, + "mean_token_accuracy": 0.6861093044281006, + "num_tokens": 2458691.0, + "step": 290 + }, + { + "entropy": 1.1210555136203766, + "epoch": 0.11641164116411641, + "grad_norm": 0.30678603053092957, + "learning_rate": 0.0001965313846648334, + "loss": 1.1495, + "mean_token_accuracy": 0.6870106756687164, + "num_tokens": 2466917.0, + "step": 291 + }, + { + "entropy": 1.1256535351276398, + "epoch": 0.11681168116811681, + "grad_norm": 0.31176719069480896, + "learning_rate": 0.00019649925474349292, + "loss": 1.1516, + "mean_token_accuracy": 0.679766371846199, + "num_tokens": 2475064.0, + "step": 292 + }, + { + "entropy": 1.1276935040950775, + "epoch": 0.11721172117211721, + "grad_norm": 0.29645654559135437, + "learning_rate": 0.00019646697964824562, + "loss": 1.1372, + "mean_token_accuracy": 0.6837837547063828, + "num_tokens": 2483736.0, + "step": 293 + }, + { + "entropy": 1.1446107029914856, + "epoch": 0.11761176117611762, + "grad_norm": 0.2959735691547394, + "learning_rate": 0.00019643455943325953, + "loss": 1.1344, + "mean_token_accuracy": 0.6885244697332382, + "num_tokens": 2492223.0, + "step": 294 + }, + { + "entropy": 1.1486328840255737, + "epoch": 0.11801180118011802, + "grad_norm": 0.35478872060775757, + "learning_rate": 0.00019640199415294645, + "loss": 1.1195, + "mean_token_accuracy": 0.6887603253126144, + "num_tokens": 2500600.0, + "step": 295 + }, + { + "entropy": 1.126534789800644, + "epoch": 0.11841184118411842, + "grad_norm": 0.2932710349559784, + "learning_rate": 0.00019636928386196145, + "loss": 1.1047, + "mean_token_accuracy": 0.696495532989502, + "num_tokens": 2509047.0, + "step": 296 + }, + { + "entropy": 1.1546699106693268, + "epoch": 0.1188118811881188, + "grad_norm": 0.2861276865005493, + "learning_rate": 0.00019633642861520306, + "loss": 1.1463, + "mean_token_accuracy": 0.6796572506427765, + "num_tokens": 2517885.0, + "step": 297 + }, + { + "entropy": 1.1594507992267609, + "epoch": 0.11921192119211921, + "grad_norm": 0.5982229709625244, + "learning_rate": 0.0001963034284678131, + "loss": 1.1527, + "mean_token_accuracy": 0.6782443970441818, + "num_tokens": 2525962.0, + "step": 298 + }, + { + "entropy": 1.1879192888736725, + "epoch": 0.11961196119611961, + "grad_norm": 0.30875492095947266, + "learning_rate": 0.00019627028347517648, + "loss": 1.1854, + "mean_token_accuracy": 0.675933450460434, + "num_tokens": 2534220.0, + "step": 299 + }, + { + "entropy": 1.1593869030475616, + "epoch": 0.12001200120012001, + "grad_norm": 0.3053128719329834, + "learning_rate": 0.00019623699369292137, + "loss": 1.1617, + "mean_token_accuracy": 0.677645817399025, + "num_tokens": 2542206.0, + "step": 300 + }, + { + "entropy": 1.1326042711734772, + "epoch": 0.12041204120412041, + "grad_norm": 0.3102218508720398, + "learning_rate": 0.00019620355917691884, + "loss": 1.1384, + "mean_token_accuracy": 0.6767238080501556, + "num_tokens": 2550584.0, + "step": 301 + }, + { + "entropy": 1.1040166020393372, + "epoch": 0.12081208120812081, + "grad_norm": 0.3166041970252991, + "learning_rate": 0.00019616997998328292, + "loss": 1.1206, + "mean_token_accuracy": 0.6878381818532944, + "num_tokens": 2558969.0, + "step": 302 + }, + { + "entropy": 1.1306456625461578, + "epoch": 0.12121212121212122, + "grad_norm": 0.31803345680236816, + "learning_rate": 0.00019613625616837034, + "loss": 1.1286, + "mean_token_accuracy": 0.6829645335674286, + "num_tokens": 2567510.0, + "step": 303 + }, + { + "entropy": 1.2087586522102356, + "epoch": 0.12161216121612162, + "grad_norm": 0.313399076461792, + "learning_rate": 0.0001961023877887807, + "loss": 1.2, + "mean_token_accuracy": 0.6653729230165482, + "num_tokens": 2575393.0, + "step": 304 + }, + { + "entropy": 1.1803353130817413, + "epoch": 0.12201220122012202, + "grad_norm": 0.2919938862323761, + "learning_rate": 0.0001960683749013562, + "loss": 1.1749, + "mean_token_accuracy": 0.6795784384012222, + "num_tokens": 2583973.0, + "step": 305 + }, + { + "entropy": 1.206252634525299, + "epoch": 0.1224122412241224, + "grad_norm": 0.30734333395957947, + "learning_rate": 0.00019603421756318146, + "loss": 1.2079, + "mean_token_accuracy": 0.6748498380184174, + "num_tokens": 2592413.0, + "step": 306 + }, + { + "entropy": 1.1237642168998718, + "epoch": 0.12281228122812281, + "grad_norm": 0.2940463721752167, + "learning_rate": 0.00019599991583158367, + "loss": 1.0924, + "mean_token_accuracy": 0.6870536357164383, + "num_tokens": 2601189.0, + "step": 307 + }, + { + "entropy": 1.1055436730384827, + "epoch": 0.12321232123212321, + "grad_norm": 0.2887219488620758, + "learning_rate": 0.00019596546976413226, + "loss": 1.1143, + "mean_token_accuracy": 0.6970756649971008, + "num_tokens": 2610378.0, + "step": 308 + }, + { + "entropy": 1.1455924063920975, + "epoch": 0.12361236123612361, + "grad_norm": 0.30642586946487427, + "learning_rate": 0.00019593087941863893, + "loss": 1.1163, + "mean_token_accuracy": 0.6846802532672882, + "num_tokens": 2618765.0, + "step": 309 + }, + { + "entropy": 1.1495613157749176, + "epoch": 0.12401240124012401, + "grad_norm": 0.2958558201789856, + "learning_rate": 0.00019589614485315766, + "loss": 1.1277, + "mean_token_accuracy": 0.692332923412323, + "num_tokens": 2627306.0, + "step": 310 + }, + { + "entropy": 1.1369233131408691, + "epoch": 0.12441244124412441, + "grad_norm": 0.2962513566017151, + "learning_rate": 0.0001958612661259842, + "loss": 1.1458, + "mean_token_accuracy": 0.6847312748432159, + "num_tokens": 2635802.0, + "step": 311 + }, + { + "entropy": 1.1192970275878906, + "epoch": 0.12481248124812482, + "grad_norm": 0.3100016117095947, + "learning_rate": 0.00019582624329565656, + "loss": 1.1479, + "mean_token_accuracy": 0.679630234837532, + "num_tokens": 2644316.0, + "step": 312 + }, + { + "entropy": 1.1962910890579224, + "epoch": 0.1252125212521252, + "grad_norm": 0.3248625099658966, + "learning_rate": 0.0001957910764209543, + "loss": 1.2285, + "mean_token_accuracy": 0.6648171693086624, + "num_tokens": 2652787.0, + "step": 313 + }, + { + "entropy": 1.1034400761127472, + "epoch": 0.1256125612561256, + "grad_norm": 0.2892885208129883, + "learning_rate": 0.00019575576556089897, + "loss": 1.1218, + "mean_token_accuracy": 0.685823604464531, + "num_tokens": 2661638.0, + "step": 314 + }, + { + "entropy": 1.1764290630817413, + "epoch": 0.126012601260126, + "grad_norm": 0.2998030483722687, + "learning_rate": 0.00019572031077475367, + "loss": 1.0975, + "mean_token_accuracy": 0.6871052384376526, + "num_tokens": 2670313.0, + "step": 315 + }, + { + "entropy": 1.2649544775485992, + "epoch": 0.1264126412641264, + "grad_norm": 0.31360095739364624, + "learning_rate": 0.0001956847121220231, + "loss": 1.2167, + "mean_token_accuracy": 0.660548061132431, + "num_tokens": 2678587.0, + "step": 316 + }, + { + "entropy": 1.1531548500061035, + "epoch": 0.1268126812681268, + "grad_norm": 0.3179381787776947, + "learning_rate": 0.0001956489696624533, + "loss": 1.1596, + "mean_token_accuracy": 0.6832859367132187, + "num_tokens": 2686845.0, + "step": 317 + }, + { + "entropy": 1.1491257846355438, + "epoch": 0.1272127212721272, + "grad_norm": 0.3010673224925995, + "learning_rate": 0.00019561308345603188, + "loss": 1.1856, + "mean_token_accuracy": 0.6756436675786972, + "num_tokens": 2695519.0, + "step": 318 + }, + { + "entropy": 1.099882572889328, + "epoch": 0.1276127612761276, + "grad_norm": 0.3057318925857544, + "learning_rate": 0.0001955770535629875, + "loss": 1.1369, + "mean_token_accuracy": 0.6802153438329697, + "num_tokens": 2704317.0, + "step": 319 + }, + { + "entropy": 1.1104555130004883, + "epoch": 0.128012801280128, + "grad_norm": 0.30537816882133484, + "learning_rate": 0.00019554088004379, + "loss": 1.0916, + "mean_token_accuracy": 0.6971182078123093, + "num_tokens": 2712576.0, + "step": 320 + }, + { + "entropy": 1.1894198954105377, + "epoch": 0.12841284128412842, + "grad_norm": 0.2941950261592865, + "learning_rate": 0.00019550456295915042, + "loss": 1.1728, + "mean_token_accuracy": 0.6762441992759705, + "num_tokens": 2721000.0, + "step": 321 + }, + { + "entropy": 1.1880941092967987, + "epoch": 0.12881288128812882, + "grad_norm": 0.3045370280742645, + "learning_rate": 0.00019546810237002066, + "loss": 1.1695, + "mean_token_accuracy": 0.6775896400213242, + "num_tokens": 2729281.0, + "step": 322 + }, + { + "entropy": 1.1603459417819977, + "epoch": 0.12921292129212922, + "grad_norm": 0.29477667808532715, + "learning_rate": 0.00019543149833759334, + "loss": 1.13, + "mean_token_accuracy": 0.6883135735988617, + "num_tokens": 2737775.0, + "step": 323 + }, + { + "entropy": 1.148952156305313, + "epoch": 0.12961296129612962, + "grad_norm": 0.2921348214149475, + "learning_rate": 0.000195394750923302, + "loss": 1.1492, + "mean_token_accuracy": 0.6808929741382599, + "num_tokens": 2746681.0, + "step": 324 + }, + { + "entropy": 1.2179997265338898, + "epoch": 0.13001300130013002, + "grad_norm": 0.3009890019893646, + "learning_rate": 0.0001953578601888208, + "loss": 1.2338, + "mean_token_accuracy": 0.6610979735851288, + "num_tokens": 2755045.0, + "step": 325 + }, + { + "entropy": 1.2134989798069, + "epoch": 0.13041304130413042, + "grad_norm": 0.3033868968486786, + "learning_rate": 0.00019532082619606436, + "loss": 1.2165, + "mean_token_accuracy": 0.6606318801641464, + "num_tokens": 2763287.0, + "step": 326 + }, + { + "entropy": 1.0881072580814362, + "epoch": 0.13081308130813082, + "grad_norm": 0.2861042022705078, + "learning_rate": 0.0001952836490071878, + "loss": 1.0643, + "mean_token_accuracy": 0.6997469067573547, + "num_tokens": 2772109.0, + "step": 327 + }, + { + "entropy": 1.2652019262313843, + "epoch": 0.13121312131213123, + "grad_norm": 0.3063291311264038, + "learning_rate": 0.00019524632868458649, + "loss": 1.2374, + "mean_token_accuracy": 0.6631722450256348, + "num_tokens": 2780001.0, + "step": 328 + }, + { + "entropy": 1.1232223510742188, + "epoch": 0.1316131613161316, + "grad_norm": 0.2938007712364197, + "learning_rate": 0.00019520886529089616, + "loss": 1.1047, + "mean_token_accuracy": 0.6943131983280182, + "num_tokens": 2788572.0, + "step": 329 + }, + { + "entropy": 1.182855635881424, + "epoch": 0.132013201320132, + "grad_norm": 0.2949009835720062, + "learning_rate": 0.00019517125888899255, + "loss": 1.1657, + "mean_token_accuracy": 0.6759148836135864, + "num_tokens": 2797349.0, + "step": 330 + }, + { + "entropy": 1.1421308815479279, + "epoch": 0.1324132413241324, + "grad_norm": 0.3349224328994751, + "learning_rate": 0.00019513350954199142, + "loss": 1.1379, + "mean_token_accuracy": 0.6823170036077499, + "num_tokens": 2805345.0, + "step": 331 + }, + { + "entropy": 1.0656911730766296, + "epoch": 0.1328132813281328, + "grad_norm": 0.3012828230857849, + "learning_rate": 0.00019509561731324848, + "loss": 1.0942, + "mean_token_accuracy": 0.6952732652425766, + "num_tokens": 2814123.0, + "step": 332 + }, + { + "entropy": 1.0468103885650635, + "epoch": 0.1332133213321332, + "grad_norm": 0.30162152647972107, + "learning_rate": 0.0001950575822663592, + "loss": 1.1012, + "mean_token_accuracy": 0.6894596368074417, + "num_tokens": 2823120.0, + "step": 333 + }, + { + "entropy": 1.089416727423668, + "epoch": 0.1336133613361336, + "grad_norm": 0.3064773976802826, + "learning_rate": 0.00019501940446515882, + "loss": 1.1036, + "mean_token_accuracy": 0.6885414123535156, + "num_tokens": 2831735.0, + "step": 334 + }, + { + "entropy": 1.1649364531040192, + "epoch": 0.134013401340134, + "grad_norm": 0.35003024339675903, + "learning_rate": 0.00019498108397372212, + "loss": 1.1766, + "mean_token_accuracy": 0.6764324754476547, + "num_tokens": 2839670.0, + "step": 335 + }, + { + "entropy": 1.1590066254138947, + "epoch": 0.1344134413441344, + "grad_norm": 0.26645922660827637, + "learning_rate": 0.0001949426208563633, + "loss": 1.1091, + "mean_token_accuracy": 0.6905470341444016, + "num_tokens": 2848911.0, + "step": 336 + }, + { + "entropy": 1.251402735710144, + "epoch": 0.1348134813481348, + "grad_norm": 0.31132251024246216, + "learning_rate": 0.000194904015177636, + "loss": 1.1918, + "mean_token_accuracy": 0.6727328300476074, + "num_tokens": 2857199.0, + "step": 337 + }, + { + "entropy": 1.220662236213684, + "epoch": 0.1352135213521352, + "grad_norm": 0.3061762750148773, + "learning_rate": 0.00019486526700233315, + "loss": 1.1868, + "mean_token_accuracy": 0.672507032752037, + "num_tokens": 2865223.0, + "step": 338 + }, + { + "entropy": 1.0638089627027512, + "epoch": 0.13561356135613561, + "grad_norm": 0.29525840282440186, + "learning_rate": 0.00019482637639548682, + "loss": 1.0514, + "mean_token_accuracy": 0.7034783512353897, + "num_tokens": 2873440.0, + "step": 339 + }, + { + "entropy": 1.1221419274806976, + "epoch": 0.13601360136013602, + "grad_norm": 0.2899990379810333, + "learning_rate": 0.00019478734342236808, + "loss": 1.1505, + "mean_token_accuracy": 0.675692155957222, + "num_tokens": 2882408.0, + "step": 340 + }, + { + "entropy": 1.145202785730362, + "epoch": 0.13641364136413642, + "grad_norm": 0.2904442250728607, + "learning_rate": 0.0001947481681484869, + "loss": 1.1848, + "mean_token_accuracy": 0.6750968992710114, + "num_tokens": 2891461.0, + "step": 341 + }, + { + "entropy": 1.081279844045639, + "epoch": 0.13681368136813682, + "grad_norm": 0.30348628759384155, + "learning_rate": 0.00019470885063959225, + "loss": 1.0734, + "mean_token_accuracy": 0.6975607126951218, + "num_tokens": 2900223.0, + "step": 342 + }, + { + "entropy": 1.0558022856712341, + "epoch": 0.13721372137213722, + "grad_norm": 0.28773176670074463, + "learning_rate": 0.00019466939096167164, + "loss": 1.0604, + "mean_token_accuracy": 0.6948001831769943, + "num_tokens": 2909084.0, + "step": 343 + }, + { + "entropy": 1.1171001195907593, + "epoch": 0.13761376137613762, + "grad_norm": 0.29017966985702515, + "learning_rate": 0.00019462978918095128, + "loss": 1.1181, + "mean_token_accuracy": 0.68596550822258, + "num_tokens": 2917795.0, + "step": 344 + }, + { + "entropy": 1.1633701920509338, + "epoch": 0.13801380138013802, + "grad_norm": 0.28877806663513184, + "learning_rate": 0.00019459004536389587, + "loss": 1.1716, + "mean_token_accuracy": 0.6693498939275742, + "num_tokens": 2925764.0, + "step": 345 + }, + { + "entropy": 1.2091334760189056, + "epoch": 0.13841384138413843, + "grad_norm": 0.3057492971420288, + "learning_rate": 0.00019455015957720842, + "loss": 1.2115, + "mean_token_accuracy": 0.6683546006679535, + "num_tokens": 2934337.0, + "step": 346 + }, + { + "entropy": 1.117457777261734, + "epoch": 0.13881388138813883, + "grad_norm": 0.3619987964630127, + "learning_rate": 0.0001945101318878303, + "loss": 1.0944, + "mean_token_accuracy": 0.6917587071657181, + "num_tokens": 2942882.0, + "step": 347 + }, + { + "entropy": 1.1964794397354126, + "epoch": 0.1392139213921392, + "grad_norm": 0.29087069630622864, + "learning_rate": 0.000194469962362941, + "loss": 1.1536, + "mean_token_accuracy": 0.6789288967847824, + "num_tokens": 2951358.0, + "step": 348 + }, + { + "entropy": 1.1352568864822388, + "epoch": 0.1396139613961396, + "grad_norm": 0.30058935284614563, + "learning_rate": 0.00019442965106995807, + "loss": 1.1042, + "mean_token_accuracy": 0.6969415545463562, + "num_tokens": 2959902.0, + "step": 349 + }, + { + "entropy": 1.1815881133079529, + "epoch": 0.14001400140014, + "grad_norm": 0.29818278551101685, + "learning_rate": 0.00019438919807653694, + "loss": 1.1937, + "mean_token_accuracy": 0.6777724772691727, + "num_tokens": 2968375.0, + "step": 350 + }, + { + "entropy": 1.1138464957475662, + "epoch": 0.1404140414041404, + "grad_norm": 0.29378682374954224, + "learning_rate": 0.00019434860345057096, + "loss": 1.136, + "mean_token_accuracy": 0.6846367418766022, + "num_tokens": 2976891.0, + "step": 351 + }, + { + "entropy": 1.1382241249084473, + "epoch": 0.1408140814081408, + "grad_norm": 0.298759788274765, + "learning_rate": 0.00019430786726019102, + "loss": 1.1675, + "mean_token_accuracy": 0.6828837245702744, + "num_tokens": 2984891.0, + "step": 352 + }, + { + "entropy": 1.2404142022132874, + "epoch": 0.1412141214121412, + "grad_norm": 0.3150947093963623, + "learning_rate": 0.00019426698957376585, + "loss": 1.2342, + "mean_token_accuracy": 0.6579574644565582, + "num_tokens": 2993072.0, + "step": 353 + }, + { + "entropy": 1.1687238216400146, + "epoch": 0.1416141614161416, + "grad_norm": 0.29389873147010803, + "learning_rate": 0.00019422597045990142, + "loss": 1.1767, + "mean_token_accuracy": 0.6675811409950256, + "num_tokens": 3001760.0, + "step": 354 + }, + { + "entropy": 1.1566392183303833, + "epoch": 0.142014201420142, + "grad_norm": 0.288309246301651, + "learning_rate": 0.00019418480998744118, + "loss": 1.1291, + "mean_token_accuracy": 0.6857695430517197, + "num_tokens": 3010111.0, + "step": 355 + }, + { + "entropy": 1.1949766874313354, + "epoch": 0.1424142414241424, + "grad_norm": 0.29533353447914124, + "learning_rate": 0.00019414350822546584, + "loss": 1.1664, + "mean_token_accuracy": 0.6795456558465958, + "num_tokens": 3018712.0, + "step": 356 + }, + { + "entropy": 1.1488195657730103, + "epoch": 0.14281428142814281, + "grad_norm": 0.3124019205570221, + "learning_rate": 0.00019410206524329314, + "loss": 1.129, + "mean_token_accuracy": 0.6900259405374527, + "num_tokens": 3026707.0, + "step": 357 + }, + { + "entropy": 1.1078391075134277, + "epoch": 0.14321432143214322, + "grad_norm": 0.4887332618236542, + "learning_rate": 0.00019406048111047792, + "loss": 1.1122, + "mean_token_accuracy": 0.6845664978027344, + "num_tokens": 3035277.0, + "step": 358 + }, + { + "entropy": 1.1673301458358765, + "epoch": 0.14361436143614362, + "grad_norm": 0.30997899174690247, + "learning_rate": 0.0001940187558968119, + "loss": 1.1427, + "mean_token_accuracy": 0.6802043169736862, + "num_tokens": 3043456.0, + "step": 359 + }, + { + "entropy": 1.1499980092048645, + "epoch": 0.14401440144014402, + "grad_norm": 0.3066644072532654, + "learning_rate": 0.00019397688967232352, + "loss": 1.1497, + "mean_token_accuracy": 0.6805084347724915, + "num_tokens": 3051649.0, + "step": 360 + }, + { + "entropy": 1.131559580564499, + "epoch": 0.14441444144414442, + "grad_norm": 0.296249657869339, + "learning_rate": 0.000193934882507278, + "loss": 1.1349, + "mean_token_accuracy": 0.6809341907501221, + "num_tokens": 3060190.0, + "step": 361 + }, + { + "entropy": 1.1443010866641998, + "epoch": 0.14481448144814482, + "grad_norm": 0.31838539242744446, + "learning_rate": 0.00019389273447217704, + "loss": 1.1696, + "mean_token_accuracy": 0.6759007275104523, + "num_tokens": 3068580.0, + "step": 362 + }, + { + "entropy": 1.133973866701126, + "epoch": 0.14521452145214522, + "grad_norm": 0.2861894965171814, + "learning_rate": 0.0001938504456377587, + "loss": 1.1291, + "mean_token_accuracy": 0.6851497888565063, + "num_tokens": 3077427.0, + "step": 363 + }, + { + "entropy": 1.136247158050537, + "epoch": 0.14561456145614562, + "grad_norm": 0.2967614531517029, + "learning_rate": 0.00019380801607499746, + "loss": 1.0995, + "mean_token_accuracy": 0.6911982148885727, + "num_tokens": 3085196.0, + "step": 364 + }, + { + "entropy": 1.184772402048111, + "epoch": 0.14601460146014603, + "grad_norm": 0.3119775354862213, + "learning_rate": 0.00019376544585510393, + "loss": 1.2257, + "mean_token_accuracy": 0.666557103395462, + "num_tokens": 3093621.0, + "step": 365 + }, + { + "entropy": 1.1576828956604004, + "epoch": 0.14641464146414643, + "grad_norm": 0.3863295018672943, + "learning_rate": 0.0001937227350495248, + "loss": 1.1722, + "mean_token_accuracy": 0.6755800992250443, + "num_tokens": 3102047.0, + "step": 366 + }, + { + "entropy": 1.0888293087482452, + "epoch": 0.1468146814681468, + "grad_norm": 0.2931033670902252, + "learning_rate": 0.00019367988372994265, + "loss": 1.0546, + "mean_token_accuracy": 0.6972876787185669, + "num_tokens": 3110407.0, + "step": 367 + }, + { + "entropy": 1.171687364578247, + "epoch": 0.1472147214721472, + "grad_norm": 0.43645840883255005, + "learning_rate": 0.000193636891968276, + "loss": 1.1192, + "mean_token_accuracy": 0.6813657730817795, + "num_tokens": 3118726.0, + "step": 368 + }, + { + "entropy": 1.1906355917453766, + "epoch": 0.1476147614761476, + "grad_norm": 0.30559539794921875, + "learning_rate": 0.00019359375983667902, + "loss": 1.1854, + "mean_token_accuracy": 0.6698572039604187, + "num_tokens": 3126856.0, + "step": 369 + }, + { + "entropy": 1.1418620645999908, + "epoch": 0.148014801480148, + "grad_norm": 0.31266874074935913, + "learning_rate": 0.00019355048740754145, + "loss": 1.1375, + "mean_token_accuracy": 0.678287535905838, + "num_tokens": 3135201.0, + "step": 370 + }, + { + "entropy": 1.1904971301555634, + "epoch": 0.1484148414841484, + "grad_norm": 0.3213047981262207, + "learning_rate": 0.00019350707475348852, + "loss": 1.1842, + "mean_token_accuracy": 0.6759228259325027, + "num_tokens": 3143256.0, + "step": 371 + }, + { + "entropy": 1.1902599036693573, + "epoch": 0.1488148814881488, + "grad_norm": 0.5613988041877747, + "learning_rate": 0.00019346352194738077, + "loss": 1.2442, + "mean_token_accuracy": 0.6619480550289154, + "num_tokens": 3150704.0, + "step": 372 + }, + { + "entropy": 1.0474575012922287, + "epoch": 0.1492149214921492, + "grad_norm": 0.2898733615875244, + "learning_rate": 0.00019341982906231407, + "loss": 1.0636, + "mean_token_accuracy": 0.6995494663715363, + "num_tokens": 3159711.0, + "step": 373 + }, + { + "entropy": 1.226840317249298, + "epoch": 0.1496149614961496, + "grad_norm": 0.314718633890152, + "learning_rate": 0.0001933759961716192, + "loss": 1.1882, + "mean_token_accuracy": 0.6709526926279068, + "num_tokens": 3167294.0, + "step": 374 + }, + { + "entropy": 1.1560609936714172, + "epoch": 0.15001500150015, + "grad_norm": 0.29525458812713623, + "learning_rate": 0.00019333202334886207, + "loss": 1.1088, + "mean_token_accuracy": 0.6907341927289963, + "num_tokens": 3175676.0, + "step": 375 + }, + { + "entropy": 1.1789807677268982, + "epoch": 0.15041504150415042, + "grad_norm": 0.2906891405582428, + "learning_rate": 0.0001932879106678434, + "loss": 1.1488, + "mean_token_accuracy": 0.6830808073282242, + "num_tokens": 3184781.0, + "step": 376 + }, + { + "entropy": 1.2095182836055756, + "epoch": 0.15081508150815082, + "grad_norm": 0.29173582792282104, + "learning_rate": 0.00019324365820259858, + "loss": 1.1471, + "mean_token_accuracy": 0.6814120411872864, + "num_tokens": 3193359.0, + "step": 377 + }, + { + "entropy": 1.1557953655719757, + "epoch": 0.15121512151215122, + "grad_norm": 0.30150917172431946, + "learning_rate": 0.0001931992660273977, + "loss": 1.1842, + "mean_token_accuracy": 0.6736668199300766, + "num_tokens": 3201977.0, + "step": 378 + }, + { + "entropy": 1.1141368001699448, + "epoch": 0.15161516151615162, + "grad_norm": 0.3033373951911926, + "learning_rate": 0.00019315473421674525, + "loss": 1.1433, + "mean_token_accuracy": 0.6801392734050751, + "num_tokens": 3210612.0, + "step": 379 + }, + { + "entropy": 1.0636587738990784, + "epoch": 0.15201520152015202, + "grad_norm": 0.2994931936264038, + "learning_rate": 0.00019311006284538013, + "loss": 1.0722, + "mean_token_accuracy": 0.6968654096126556, + "num_tokens": 3219123.0, + "step": 380 + }, + { + "entropy": 1.2064105868339539, + "epoch": 0.15241524152415242, + "grad_norm": 0.3521154820919037, + "learning_rate": 0.00019306525198827548, + "loss": 1.2385, + "mean_token_accuracy": 0.6615314930677414, + "num_tokens": 3227445.0, + "step": 381 + }, + { + "entropy": 1.127672255039215, + "epoch": 0.15281528152815282, + "grad_norm": 0.2892846465110779, + "learning_rate": 0.00019302030172063837, + "loss": 1.1389, + "mean_token_accuracy": 0.6847521215677261, + "num_tokens": 3236240.0, + "step": 382 + }, + { + "entropy": 1.1575649082660675, + "epoch": 0.15321532153215323, + "grad_norm": 0.31099551916122437, + "learning_rate": 0.0001929752121179101, + "loss": 1.1524, + "mean_token_accuracy": 0.6786007881164551, + "num_tokens": 3244515.0, + "step": 383 + }, + { + "entropy": 1.1269442737102509, + "epoch": 0.15361536153615363, + "grad_norm": 0.2906751036643982, + "learning_rate": 0.0001929299832557657, + "loss": 1.0972, + "mean_token_accuracy": 0.6957235038280487, + "num_tokens": 3253311.0, + "step": 384 + }, + { + "entropy": 1.2260091006755829, + "epoch": 0.15401540154015403, + "grad_norm": 0.2963874638080597, + "learning_rate": 0.00019288461521011388, + "loss": 1.1781, + "mean_token_accuracy": 0.6785955429077148, + "num_tokens": 3261634.0, + "step": 385 + }, + { + "entropy": 1.1854043006896973, + "epoch": 0.1544154415441544, + "grad_norm": 0.30083367228507996, + "learning_rate": 0.00019283910805709698, + "loss": 1.1677, + "mean_token_accuracy": 0.6692470908164978, + "num_tokens": 3270087.0, + "step": 386 + }, + { + "entropy": 1.2266800105571747, + "epoch": 0.1548154815481548, + "grad_norm": 0.3198303282260895, + "learning_rate": 0.00019279346187309085, + "loss": 1.2064, + "mean_token_accuracy": 0.6682067066431046, + "num_tokens": 3278271.0, + "step": 387 + }, + { + "entropy": 1.1660953760147095, + "epoch": 0.1552155215521552, + "grad_norm": 0.33573225140571594, + "learning_rate": 0.00019274767673470463, + "loss": 1.1942, + "mean_token_accuracy": 0.6672907918691635, + "num_tokens": 3286608.0, + "step": 388 + }, + { + "entropy": 1.0843549370765686, + "epoch": 0.1556155615561556, + "grad_norm": 0.30995887517929077, + "learning_rate": 0.00019270175271878068, + "loss": 1.0992, + "mean_token_accuracy": 0.6958242803812027, + "num_tokens": 3295009.0, + "step": 389 + }, + { + "entropy": 1.128290981054306, + "epoch": 0.156015601560156, + "grad_norm": 0.3144836127758026, + "learning_rate": 0.00019265568990239445, + "loss": 1.137, + "mean_token_accuracy": 0.6823694556951523, + "num_tokens": 3303299.0, + "step": 390 + }, + { + "entropy": 1.195746123790741, + "epoch": 0.1564156415641564, + "grad_norm": 0.30768823623657227, + "learning_rate": 0.00019260948836285439, + "loss": 1.1869, + "mean_token_accuracy": 0.6803343147039413, + "num_tokens": 3311591.0, + "step": 391 + }, + { + "entropy": 1.1737743616104126, + "epoch": 0.1568156815681568, + "grad_norm": 0.29867610335350037, + "learning_rate": 0.00019256314817770164, + "loss": 1.1703, + "mean_token_accuracy": 0.6784539520740509, + "num_tokens": 3320022.0, + "step": 392 + }, + { + "entropy": 1.2264443039894104, + "epoch": 0.1572157215721572, + "grad_norm": 0.30367588996887207, + "learning_rate": 0.00019251666942471016, + "loss": 1.1963, + "mean_token_accuracy": 0.6694721430540085, + "num_tokens": 3328671.0, + "step": 393 + }, + { + "entropy": 1.1673425137996674, + "epoch": 0.15761576157615761, + "grad_norm": 0.312225341796875, + "learning_rate": 0.00019247005218188645, + "loss": 1.1641, + "mean_token_accuracy": 0.6831966638565063, + "num_tokens": 3336686.0, + "step": 394 + }, + { + "entropy": 1.1570010483264923, + "epoch": 0.15801580158015802, + "grad_norm": 0.325536847114563, + "learning_rate": 0.00019242329652746938, + "loss": 1.1245, + "mean_token_accuracy": 0.6909505128860474, + "num_tokens": 3344988.0, + "step": 395 + }, + { + "entropy": 1.118729829788208, + "epoch": 0.15841584158415842, + "grad_norm": 0.31520524621009827, + "learning_rate": 0.00019237640253993017, + "loss": 1.1096, + "mean_token_accuracy": 0.686091959476471, + "num_tokens": 3353202.0, + "step": 396 + }, + { + "entropy": 1.1297271251678467, + "epoch": 0.15881588158815882, + "grad_norm": 0.31851935386657715, + "learning_rate": 0.00019232937029797217, + "loss": 1.1385, + "mean_token_accuracy": 0.6839326471090317, + "num_tokens": 3362000.0, + "step": 397 + }, + { + "entropy": 1.111870676279068, + "epoch": 0.15921592159215922, + "grad_norm": 0.29706814885139465, + "learning_rate": 0.00019228219988053085, + "loss": 1.132, + "mean_token_accuracy": 0.6736722886562347, + "num_tokens": 3370452.0, + "step": 398 + }, + { + "entropy": 1.0942797362804413, + "epoch": 0.15961596159615962, + "grad_norm": 0.3211657702922821, + "learning_rate": 0.00019223489136677347, + "loss": 1.1642, + "mean_token_accuracy": 0.6759698241949081, + "num_tokens": 3378774.0, + "step": 399 + }, + { + "entropy": 1.1003531515598297, + "epoch": 0.16001600160016002, + "grad_norm": 0.2938557267189026, + "learning_rate": 0.00019218744483609918, + "loss": 1.0841, + "mean_token_accuracy": 0.689574733376503, + "num_tokens": 3387752.0, + "step": 400 + }, + { + "entropy": 1.1808100640773773, + "epoch": 0.16041604160416043, + "grad_norm": 0.3016187250614166, + "learning_rate": 0.00019213986036813863, + "loss": 1.1379, + "mean_token_accuracy": 0.6819901168346405, + "num_tokens": 3395722.0, + "step": 401 + }, + { + "entropy": 1.1858965158462524, + "epoch": 0.16081608160816083, + "grad_norm": 0.2888219952583313, + "learning_rate": 0.00019209213804275408, + "loss": 1.1126, + "mean_token_accuracy": 0.6891250312328339, + "num_tokens": 3404658.0, + "step": 402 + }, + { + "entropy": 1.1066676825284958, + "epoch": 0.16121612161216123, + "grad_norm": 0.2900371551513672, + "learning_rate": 0.00019204427794003911, + "loss": 1.0613, + "mean_token_accuracy": 0.6994702219963074, + "num_tokens": 3413044.0, + "step": 403 + }, + { + "entropy": 1.0648207068443298, + "epoch": 0.16161616161616163, + "grad_norm": 0.2870444357395172, + "learning_rate": 0.00019199628014031857, + "loss": 1.0816, + "mean_token_accuracy": 0.6926587671041489, + "num_tokens": 3421932.0, + "step": 404 + }, + { + "entropy": 1.1214756965637207, + "epoch": 0.162016201620162, + "grad_norm": 0.3146369755268097, + "learning_rate": 0.00019194814472414844, + "loss": 1.1529, + "mean_token_accuracy": 0.679986834526062, + "num_tokens": 3429660.0, + "step": 405 + }, + { + "entropy": 1.0432531386613846, + "epoch": 0.1624162416241624, + "grad_norm": 0.3081408441066742, + "learning_rate": 0.00019189987177231554, + "loss": 1.0802, + "mean_token_accuracy": 0.6946697533130646, + "num_tokens": 3437779.0, + "step": 406 + }, + { + "entropy": 1.1035350263118744, + "epoch": 0.1628162816281628, + "grad_norm": 0.3021145761013031, + "learning_rate": 0.00019185146136583761, + "loss": 1.1354, + "mean_token_accuracy": 0.6885717958211899, + "num_tokens": 3446116.0, + "step": 407 + }, + { + "entropy": 1.1501671075820923, + "epoch": 0.1632163216321632, + "grad_norm": 0.41734570264816284, + "learning_rate": 0.00019180291358596312, + "loss": 1.1233, + "mean_token_accuracy": 0.6793646067380905, + "num_tokens": 3454845.0, + "step": 408 + }, + { + "entropy": 1.1991091966629028, + "epoch": 0.1636163616361636, + "grad_norm": 0.29790523648262024, + "learning_rate": 0.00019175422851417103, + "loss": 1.1549, + "mean_token_accuracy": 0.6777328252792358, + "num_tokens": 3463400.0, + "step": 409 + }, + { + "entropy": 1.1822619140148163, + "epoch": 0.164016401640164, + "grad_norm": 0.31777262687683105, + "learning_rate": 0.00019170540623217065, + "loss": 1.1476, + "mean_token_accuracy": 0.6912225484848022, + "num_tokens": 3471177.0, + "step": 410 + }, + { + "entropy": 1.1974277198314667, + "epoch": 0.1644164416441644, + "grad_norm": 0.30301401019096375, + "learning_rate": 0.00019165644682190178, + "loss": 1.1863, + "mean_token_accuracy": 0.6698818802833557, + "num_tokens": 3479462.0, + "step": 411 + }, + { + "entropy": 1.1671889424324036, + "epoch": 0.16481648164816481, + "grad_norm": 0.3080313801765442, + "learning_rate": 0.0001916073503655342, + "loss": 1.1485, + "mean_token_accuracy": 0.6848516017198563, + "num_tokens": 3487668.0, + "step": 412 + }, + { + "entropy": 1.1198955476284027, + "epoch": 0.16521652165216522, + "grad_norm": 0.282215416431427, + "learning_rate": 0.00019155811694546773, + "loss": 1.117, + "mean_token_accuracy": 0.6849533915519714, + "num_tokens": 3496407.0, + "step": 413 + }, + { + "entropy": 1.1208362877368927, + "epoch": 0.16561656165616562, + "grad_norm": 0.2846994996070862, + "learning_rate": 0.0001915087466443321, + "loss": 1.1486, + "mean_token_accuracy": 0.6762874126434326, + "num_tokens": 3505305.0, + "step": 414 + }, + { + "entropy": 1.1050612926483154, + "epoch": 0.16601660166016602, + "grad_norm": 0.2926284670829773, + "learning_rate": 0.00019145923954498674, + "loss": 1.1086, + "mean_token_accuracy": 0.6887543201446533, + "num_tokens": 3513791.0, + "step": 415 + }, + { + "entropy": 1.1567849516868591, + "epoch": 0.16641664166416642, + "grad_norm": 0.3551363945007324, + "learning_rate": 0.00019140959573052068, + "loss": 1.1884, + "mean_token_accuracy": 0.6731236577033997, + "num_tokens": 3522187.0, + "step": 416 + }, + { + "entropy": 1.0714478492736816, + "epoch": 0.16681668166816682, + "grad_norm": 0.2826900780200958, + "learning_rate": 0.00019135981528425238, + "loss": 1.07, + "mean_token_accuracy": 0.6979558169841766, + "num_tokens": 3530921.0, + "step": 417 + }, + { + "entropy": 1.1964420974254608, + "epoch": 0.16721672167216722, + "grad_norm": 0.283438116312027, + "learning_rate": 0.0001913098982897297, + "loss": 1.2064, + "mean_token_accuracy": 0.6715447902679443, + "num_tokens": 3539583.0, + "step": 418 + }, + { + "entropy": 1.1429602801799774, + "epoch": 0.16761676167616762, + "grad_norm": 0.27956098318099976, + "learning_rate": 0.0001912598448307295, + "loss": 1.103, + "mean_token_accuracy": 0.692705973982811, + "num_tokens": 3548027.0, + "step": 419 + }, + { + "entropy": 1.1086672246456146, + "epoch": 0.16801680168016803, + "grad_norm": 0.30192887783050537, + "learning_rate": 0.0001912096549912579, + "loss": 1.0665, + "mean_token_accuracy": 0.6996335387229919, + "num_tokens": 3556575.0, + "step": 420 + }, + { + "entropy": 1.122267097234726, + "epoch": 0.16841684168416843, + "grad_norm": 0.28671419620513916, + "learning_rate": 0.0001911593288555497, + "loss": 1.0995, + "mean_token_accuracy": 0.6916577368974686, + "num_tokens": 3564842.0, + "step": 421 + }, + { + "entropy": 1.1425860822200775, + "epoch": 0.16881688168816883, + "grad_norm": 0.31337839365005493, + "learning_rate": 0.0001911088665080685, + "loss": 1.1492, + "mean_token_accuracy": 0.6899708062410355, + "num_tokens": 3573378.0, + "step": 422 + }, + { + "entropy": 1.1819129288196564, + "epoch": 0.1692169216921692, + "grad_norm": 0.3169664442539215, + "learning_rate": 0.00019105826803350668, + "loss": 1.2067, + "mean_token_accuracy": 0.6600329726934433, + "num_tokens": 3581995.0, + "step": 423 + }, + { + "entropy": 1.1388654112815857, + "epoch": 0.1696169616961696, + "grad_norm": 0.3174993097782135, + "learning_rate": 0.00019100753351678485, + "loss": 1.1679, + "mean_token_accuracy": 0.6717206537723541, + "num_tokens": 3590053.0, + "step": 424 + }, + { + "entropy": 1.0764131546020508, + "epoch": 0.17001700170017, + "grad_norm": 0.27433347702026367, + "learning_rate": 0.0001909566630430521, + "loss": 1.0583, + "mean_token_accuracy": 0.698042631149292, + "num_tokens": 3598969.0, + "step": 425 + }, + { + "entropy": 1.1677474975585938, + "epoch": 0.1704170417041704, + "grad_norm": 0.28440240025520325, + "learning_rate": 0.0001909056566976856, + "loss": 1.1686, + "mean_token_accuracy": 0.6792843639850616, + "num_tokens": 3608017.0, + "step": 426 + }, + { + "entropy": 1.0982355326414108, + "epoch": 0.1708170817081708, + "grad_norm": 0.281744122505188, + "learning_rate": 0.00019085451456629063, + "loss": 1.0735, + "mean_token_accuracy": 0.6970892697572708, + "num_tokens": 3616898.0, + "step": 427 + }, + { + "entropy": 1.1331664025783539, + "epoch": 0.1712171217121712, + "grad_norm": 0.29245954751968384, + "learning_rate": 0.00019080323673470028, + "loss": 1.1029, + "mean_token_accuracy": 0.6925027072429657, + "num_tokens": 3625372.0, + "step": 428 + }, + { + "entropy": 1.165515422821045, + "epoch": 0.1716171617161716, + "grad_norm": 0.314475953578949, + "learning_rate": 0.00019075182328897553, + "loss": 1.159, + "mean_token_accuracy": 0.6840381771326065, + "num_tokens": 3633550.0, + "step": 429 + }, + { + "entropy": 1.2059255242347717, + "epoch": 0.172017201720172, + "grad_norm": 0.29410937428474426, + "learning_rate": 0.00019070027431540484, + "loss": 1.1995, + "mean_token_accuracy": 0.667696550488472, + "num_tokens": 3641944.0, + "step": 430 + }, + { + "entropy": 1.160342425107956, + "epoch": 0.17241724172417242, + "grad_norm": 0.29798951745033264, + "learning_rate": 0.00019064858990050412, + "loss": 1.1249, + "mean_token_accuracy": 0.6896940916776657, + "num_tokens": 3650633.0, + "step": 431 + }, + { + "entropy": 1.097832590341568, + "epoch": 0.17281728172817282, + "grad_norm": 0.3146847188472748, + "learning_rate": 0.0001905967701310167, + "loss": 1.084, + "mean_token_accuracy": 0.6950473189353943, + "num_tokens": 3659275.0, + "step": 432 + }, + { + "entropy": 1.1250872611999512, + "epoch": 0.17321732173217322, + "grad_norm": 0.29490962624549866, + "learning_rate": 0.00019054481509391303, + "loss": 1.1453, + "mean_token_accuracy": 0.6784237176179886, + "num_tokens": 3667707.0, + "step": 433 + }, + { + "entropy": 1.11842879652977, + "epoch": 0.17361736173617362, + "grad_norm": 0.3015720844268799, + "learning_rate": 0.00019049272487639053, + "loss": 1.1348, + "mean_token_accuracy": 0.6827126741409302, + "num_tokens": 3676215.0, + "step": 434 + }, + { + "entropy": 1.1079545319080353, + "epoch": 0.17401740174017402, + "grad_norm": 0.2959752380847931, + "learning_rate": 0.00019044049956587359, + "loss": 1.1308, + "mean_token_accuracy": 0.6799913793802261, + "num_tokens": 3684832.0, + "step": 435 + }, + { + "entropy": 1.0760809183120728, + "epoch": 0.17441744174417442, + "grad_norm": 0.28142601251602173, + "learning_rate": 0.0001903881392500132, + "loss": 1.057, + "mean_token_accuracy": 0.7040259689092636, + "num_tokens": 3693191.0, + "step": 436 + }, + { + "entropy": 1.1367475986480713, + "epoch": 0.17481748174817482, + "grad_norm": 0.2840285301208496, + "learning_rate": 0.00019033564401668712, + "loss": 1.1166, + "mean_token_accuracy": 0.6871612221002579, + "num_tokens": 3701978.0, + "step": 437 + }, + { + "entropy": 1.0345291048288345, + "epoch": 0.17521752175217523, + "grad_norm": 0.27927252650260925, + "learning_rate": 0.00019028301395399935, + "loss": 1.0161, + "mean_token_accuracy": 0.7020839005708694, + "num_tokens": 3711010.0, + "step": 438 + }, + { + "entropy": 1.1218744814395905, + "epoch": 0.17561756175617563, + "grad_norm": 0.28972747921943665, + "learning_rate": 0.00019023024915028035, + "loss": 1.1142, + "mean_token_accuracy": 0.6823008805513382, + "num_tokens": 3719811.0, + "step": 439 + }, + { + "entropy": 1.112653136253357, + "epoch": 0.17601760176017603, + "grad_norm": 0.2937675714492798, + "learning_rate": 0.0001901773496940866, + "loss": 1.099, + "mean_token_accuracy": 0.6938609182834625, + "num_tokens": 3728397.0, + "step": 440 + }, + { + "entropy": 1.0891221165657043, + "epoch": 0.17641764176417643, + "grad_norm": 0.2878448963165283, + "learning_rate": 0.00019012431567420058, + "loss": 1.0985, + "mean_token_accuracy": 0.6925668865442276, + "num_tokens": 3737299.0, + "step": 441 + }, + { + "entropy": 1.099565714597702, + "epoch": 0.1768176817681768, + "grad_norm": 0.307413786649704, + "learning_rate": 0.00019007114717963067, + "loss": 1.1189, + "mean_token_accuracy": 0.6934941560029984, + "num_tokens": 3746139.0, + "step": 442 + }, + { + "entropy": 1.1932236850261688, + "epoch": 0.1772177217721772, + "grad_norm": 0.3038841485977173, + "learning_rate": 0.00019001784429961086, + "loss": 1.1788, + "mean_token_accuracy": 0.6709124445915222, + "num_tokens": 3754953.0, + "step": 443 + }, + { + "entropy": 1.0702079832553864, + "epoch": 0.1776177617761776, + "grad_norm": 0.2820574939250946, + "learning_rate": 0.0001899644071236008, + "loss": 1.0416, + "mean_token_accuracy": 0.7032249569892883, + "num_tokens": 3763751.0, + "step": 444 + }, + { + "entropy": 1.2229497730731964, + "epoch": 0.178017801780178, + "grad_norm": 0.3014878034591675, + "learning_rate": 0.00018991083574128545, + "loss": 1.2192, + "mean_token_accuracy": 0.6651740819215775, + "num_tokens": 3771604.0, + "step": 445 + }, + { + "entropy": 1.1150319874286652, + "epoch": 0.1784178417841784, + "grad_norm": 0.2991960644721985, + "learning_rate": 0.000189857130242575, + "loss": 1.09, + "mean_token_accuracy": 0.6914113610982895, + "num_tokens": 3780403.0, + "step": 446 + }, + { + "entropy": 1.1689063012599945, + "epoch": 0.1788178817881788, + "grad_norm": 0.2982667088508606, + "learning_rate": 0.0001898032907176048, + "loss": 1.1627, + "mean_token_accuracy": 0.6814263015985489, + "num_tokens": 3788759.0, + "step": 447 + }, + { + "entropy": 1.1139529049396515, + "epoch": 0.1792179217921792, + "grad_norm": 0.29409554600715637, + "learning_rate": 0.00018974931725673509, + "loss": 1.1114, + "mean_token_accuracy": 0.6805879026651382, + "num_tokens": 3796931.0, + "step": 448 + }, + { + "entropy": 1.1041430234909058, + "epoch": 0.17961796179617961, + "grad_norm": 0.2944853901863098, + "learning_rate": 0.00018969520995055085, + "loss": 1.1119, + "mean_token_accuracy": 0.6940512806177139, + "num_tokens": 3805323.0, + "step": 449 + }, + { + "entropy": 1.1486750543117523, + "epoch": 0.18001800180018002, + "grad_norm": 0.302370548248291, + "learning_rate": 0.00018964096888986182, + "loss": 1.1553, + "mean_token_accuracy": 0.6763848960399628, + "num_tokens": 3813607.0, + "step": 450 + }, + { + "entropy": 1.1423940062522888, + "epoch": 0.18041804180418042, + "grad_norm": 0.28140193223953247, + "learning_rate": 0.00018958659416570212, + "loss": 1.1566, + "mean_token_accuracy": 0.6711086183786392, + "num_tokens": 3822080.0, + "step": 451 + }, + { + "entropy": 1.0220871269702911, + "epoch": 0.18081808180818082, + "grad_norm": 0.2903229892253876, + "learning_rate": 0.00018953208586933027, + "loss": 1.0243, + "mean_token_accuracy": 0.7029541581869125, + "num_tokens": 3830561.0, + "step": 452 + }, + { + "entropy": 1.1911540031433105, + "epoch": 0.18121812181218122, + "grad_norm": 0.3021875321865082, + "learning_rate": 0.0001894774440922289, + "loss": 1.1799, + "mean_token_accuracy": 0.6771095544099808, + "num_tokens": 3838855.0, + "step": 453 + }, + { + "entropy": 1.1234095692634583, + "epoch": 0.18161816181618162, + "grad_norm": 0.30030199885368347, + "learning_rate": 0.00018942266892610474, + "loss": 1.1306, + "mean_token_accuracy": 0.688039630651474, + "num_tokens": 3847225.0, + "step": 454 + }, + { + "entropy": 1.2189615964889526, + "epoch": 0.18201820182018202, + "grad_norm": 0.2934826910495758, + "learning_rate": 0.00018936776046288832, + "loss": 1.192, + "mean_token_accuracy": 0.6768446713685989, + "num_tokens": 3855549.0, + "step": 455 + }, + { + "entropy": 1.090735375881195, + "epoch": 0.18241824182418243, + "grad_norm": 0.2921765148639679, + "learning_rate": 0.0001893127187947339, + "loss": 1.0824, + "mean_token_accuracy": 0.6897251307964325, + "num_tokens": 3863912.0, + "step": 456 + }, + { + "entropy": 1.0907158553600311, + "epoch": 0.18281828182818283, + "grad_norm": 0.28869226574897766, + "learning_rate": 0.00018925754401401935, + "loss": 1.1011, + "mean_token_accuracy": 0.6976663619279861, + "num_tokens": 3872222.0, + "step": 457 + }, + { + "entropy": 1.0765265822410583, + "epoch": 0.18321832183218323, + "grad_norm": 0.27985134720802307, + "learning_rate": 0.0001892022362133459, + "loss": 1.0954, + "mean_token_accuracy": 0.6934731006622314, + "num_tokens": 3880811.0, + "step": 458 + }, + { + "entropy": 1.1287130117416382, + "epoch": 0.18361836183618363, + "grad_norm": 0.2834780216217041, + "learning_rate": 0.000189146795485538, + "loss": 1.1133, + "mean_token_accuracy": 0.6809262037277222, + "num_tokens": 3889241.0, + "step": 459 + }, + { + "entropy": 1.1771635711193085, + "epoch": 0.18401840184018403, + "grad_norm": 0.2930743992328644, + "learning_rate": 0.00018909122192364334, + "loss": 1.1473, + "mean_token_accuracy": 0.6786583662033081, + "num_tokens": 3897826.0, + "step": 460 + }, + { + "entropy": 1.156456857919693, + "epoch": 0.1844184418441844, + "grad_norm": 0.31029045581817627, + "learning_rate": 0.00018903551562093237, + "loss": 1.1329, + "mean_token_accuracy": 0.6835081726312637, + "num_tokens": 3906455.0, + "step": 461 + }, + { + "entropy": 1.197271704673767, + "epoch": 0.1848184818481848, + "grad_norm": 0.28894633054733276, + "learning_rate": 0.00018897967667089839, + "loss": 1.1518, + "mean_token_accuracy": 0.6705130338668823, + "num_tokens": 3914939.0, + "step": 462 + }, + { + "entropy": 1.187122493982315, + "epoch": 0.1852185218521852, + "grad_norm": 0.2882704734802246, + "learning_rate": 0.0001889237051672574, + "loss": 1.172, + "mean_token_accuracy": 0.6756406724452972, + "num_tokens": 3923526.0, + "step": 463 + }, + { + "entropy": 1.1045761406421661, + "epoch": 0.1856185618561856, + "grad_norm": 0.290786474943161, + "learning_rate": 0.00018886760120394774, + "loss": 1.1039, + "mean_token_accuracy": 0.6829386353492737, + "num_tokens": 3931690.0, + "step": 464 + }, + { + "entropy": 1.0771204233169556, + "epoch": 0.186018601860186, + "grad_norm": 0.29037660360336304, + "learning_rate": 0.00018881136487513016, + "loss": 1.0961, + "mean_token_accuracy": 0.6865667402744293, + "num_tokens": 3940222.0, + "step": 465 + }, + { + "entropy": 1.0926263481378555, + "epoch": 0.1864186418641864, + "grad_norm": 0.28368324041366577, + "learning_rate": 0.0001887549962751875, + "loss": 1.1276, + "mean_token_accuracy": 0.6901869177818298, + "num_tokens": 3948870.0, + "step": 466 + }, + { + "entropy": 1.0631737411022186, + "epoch": 0.18681868186818681, + "grad_norm": 0.28324657678604126, + "learning_rate": 0.00018869849549872465, + "loss": 1.0782, + "mean_token_accuracy": 0.6920218467712402, + "num_tokens": 3957291.0, + "step": 467 + }, + { + "entropy": 1.1629198789596558, + "epoch": 0.18721872187218722, + "grad_norm": 0.28869321942329407, + "learning_rate": 0.00018864186264056827, + "loss": 1.1439, + "mean_token_accuracy": 0.6795201748609543, + "num_tokens": 3966005.0, + "step": 468 + }, + { + "entropy": 1.1176329255104065, + "epoch": 0.18761876187618762, + "grad_norm": 0.30285438895225525, + "learning_rate": 0.00018858509779576678, + "loss": 1.1113, + "mean_token_accuracy": 0.6858499944210052, + "num_tokens": 3974237.0, + "step": 469 + }, + { + "entropy": 1.1664519608020782, + "epoch": 0.18801880188018802, + "grad_norm": 0.29232847690582275, + "learning_rate": 0.00018852820105959002, + "loss": 1.1352, + "mean_token_accuracy": 0.6848191022872925, + "num_tokens": 3982719.0, + "step": 470 + }, + { + "entropy": 1.0966509878635406, + "epoch": 0.18841884188418842, + "grad_norm": 0.28050824999809265, + "learning_rate": 0.00018847117252752924, + "loss": 1.103, + "mean_token_accuracy": 0.6891407370567322, + "num_tokens": 3991387.0, + "step": 471 + }, + { + "entropy": 1.0832321643829346, + "epoch": 0.18881888188818882, + "grad_norm": 0.30679091811180115, + "learning_rate": 0.00018841401229529692, + "loss": 1.0987, + "mean_token_accuracy": 0.6983061581850052, + "num_tokens": 3999901.0, + "step": 472 + }, + { + "entropy": 1.1181371808052063, + "epoch": 0.18921892189218922, + "grad_norm": 0.29978105425834656, + "learning_rate": 0.00018835672045882648, + "loss": 1.1526, + "mean_token_accuracy": 0.6812323331832886, + "num_tokens": 4008189.0, + "step": 473 + }, + { + "entropy": 1.094124659895897, + "epoch": 0.18961896189618963, + "grad_norm": 0.2761591672897339, + "learning_rate": 0.00018829929711427232, + "loss": 1.088, + "mean_token_accuracy": 0.6916481256484985, + "num_tokens": 4017035.0, + "step": 474 + }, + { + "entropy": 1.174016386270523, + "epoch": 0.19001900190019003, + "grad_norm": 0.2957269549369812, + "learning_rate": 0.0001882417423580095, + "loss": 1.15, + "mean_token_accuracy": 0.687277153134346, + "num_tokens": 4025132.0, + "step": 475 + }, + { + "entropy": 1.141076147556305, + "epoch": 0.19041904190419043, + "grad_norm": 0.29672884941101074, + "learning_rate": 0.0001881840562866336, + "loss": 1.0997, + "mean_token_accuracy": 0.6899784505367279, + "num_tokens": 4033594.0, + "step": 476 + }, + { + "entropy": 1.103248655796051, + "epoch": 0.19081908190819083, + "grad_norm": 0.2912473976612091, + "learning_rate": 0.00018812623899696067, + "loss": 1.0915, + "mean_token_accuracy": 0.6886222809553146, + "num_tokens": 4042053.0, + "step": 477 + }, + { + "entropy": 1.170788824558258, + "epoch": 0.19121912191219123, + "grad_norm": 0.2797233462333679, + "learning_rate": 0.0001880682905860269, + "loss": 1.1159, + "mean_token_accuracy": 0.6844299733638763, + "num_tokens": 4050555.0, + "step": 478 + }, + { + "entropy": 1.160698264837265, + "epoch": 0.19161916191619163, + "grad_norm": 0.2921246886253357, + "learning_rate": 0.00018801021115108862, + "loss": 1.1606, + "mean_token_accuracy": 0.6748001426458359, + "num_tokens": 4059040.0, + "step": 479 + }, + { + "entropy": 1.0824988782405853, + "epoch": 0.192019201920192, + "grad_norm": 0.29058167338371277, + "learning_rate": 0.000187952000789622, + "loss": 1.1117, + "mean_token_accuracy": 0.6949323862791061, + "num_tokens": 4067919.0, + "step": 480 + }, + { + "entropy": 1.1407755315303802, + "epoch": 0.1924192419241924, + "grad_norm": 0.3058508634567261, + "learning_rate": 0.00018789365959932303, + "loss": 1.1914, + "mean_token_accuracy": 0.6748262792825699, + "num_tokens": 4076495.0, + "step": 481 + }, + { + "entropy": 1.1213767230510712, + "epoch": 0.1928192819281928, + "grad_norm": 0.2868844270706177, + "learning_rate": 0.00018783518767810715, + "loss": 1.117, + "mean_token_accuracy": 0.6884360611438751, + "num_tokens": 4084846.0, + "step": 482 + }, + { + "entropy": 1.1594094932079315, + "epoch": 0.1932193219321932, + "grad_norm": 0.29103291034698486, + "learning_rate": 0.0001877765851241093, + "loss": 1.1595, + "mean_token_accuracy": 0.6784193813800812, + "num_tokens": 4093093.0, + "step": 483 + }, + { + "entropy": 1.0897391140460968, + "epoch": 0.1936193619361936, + "grad_norm": 0.29071077704429626, + "learning_rate": 0.00018771785203568366, + "loss": 1.0775, + "mean_token_accuracy": 0.6933843791484833, + "num_tokens": 4101392.0, + "step": 484 + }, + { + "entropy": 1.05050827562809, + "epoch": 0.19401940194019401, + "grad_norm": 0.2660689949989319, + "learning_rate": 0.00018765898851140345, + "loss": 1.003, + "mean_token_accuracy": 0.7151510417461395, + "num_tokens": 4110388.0, + "step": 485 + }, + { + "entropy": 1.1417682468891144, + "epoch": 0.19441944194419442, + "grad_norm": 0.2760656774044037, + "learning_rate": 0.00018759999465006087, + "loss": 1.1208, + "mean_token_accuracy": 0.6870895624160767, + "num_tokens": 4119451.0, + "step": 486 + }, + { + "entropy": 1.1158250570297241, + "epoch": 0.19481948194819482, + "grad_norm": 0.27844175696372986, + "learning_rate": 0.00018754087055066675, + "loss": 1.0741, + "mean_token_accuracy": 0.7000212967395782, + "num_tokens": 4127997.0, + "step": 487 + }, + { + "entropy": 1.0569812506437302, + "epoch": 0.19521952195219522, + "grad_norm": 0.28110507130622864, + "learning_rate": 0.00018748161631245065, + "loss": 1.0375, + "mean_token_accuracy": 0.7026449292898178, + "num_tokens": 4136878.0, + "step": 488 + }, + { + "entropy": 1.084457129240036, + "epoch": 0.19561956195619562, + "grad_norm": 0.26859092712402344, + "learning_rate": 0.00018742223203486042, + "loss": 1.0676, + "mean_token_accuracy": 0.6930870711803436, + "num_tokens": 4146324.0, + "step": 489 + }, + { + "entropy": 1.0949542820453644, + "epoch": 0.19601960196019602, + "grad_norm": 0.28605908155441284, + "learning_rate": 0.00018736271781756223, + "loss": 1.125, + "mean_token_accuracy": 0.6920661330223083, + "num_tokens": 4154496.0, + "step": 490 + }, + { + "entropy": 1.1369201838970184, + "epoch": 0.19641964196419642, + "grad_norm": 0.3030281364917755, + "learning_rate": 0.00018730307376044027, + "loss": 1.119, + "mean_token_accuracy": 0.6900736391544342, + "num_tokens": 4163381.0, + "step": 491 + }, + { + "entropy": 1.1063465178012848, + "epoch": 0.19681968196819682, + "grad_norm": 0.29392218589782715, + "learning_rate": 0.00018724329996359676, + "loss": 1.1376, + "mean_token_accuracy": 0.6872988492250443, + "num_tokens": 4172190.0, + "step": 492 + }, + { + "entropy": 1.1071143746376038, + "epoch": 0.19721972197219723, + "grad_norm": 0.28501084446907043, + "learning_rate": 0.00018718339652735154, + "loss": 1.1166, + "mean_token_accuracy": 0.6885866820812225, + "num_tokens": 4180585.0, + "step": 493 + }, + { + "entropy": 1.1584193706512451, + "epoch": 0.19761976197619763, + "grad_norm": 0.29230597615242004, + "learning_rate": 0.00018712336355224205, + "loss": 1.1594, + "mean_token_accuracy": 0.6756969690322876, + "num_tokens": 4188810.0, + "step": 494 + }, + { + "entropy": 1.0776985734701157, + "epoch": 0.19801980198019803, + "grad_norm": 0.2801620662212372, + "learning_rate": 0.0001870632011390232, + "loss": 1.0296, + "mean_token_accuracy": 0.7065073400735855, + "num_tokens": 4197309.0, + "step": 495 + }, + { + "entropy": 1.1805840134620667, + "epoch": 0.19841984198419843, + "grad_norm": 0.3022160530090332, + "learning_rate": 0.00018700290938866712, + "loss": 1.1913, + "mean_token_accuracy": 0.6692783236503601, + "num_tokens": 4205630.0, + "step": 496 + }, + { + "entropy": 1.0833539962768555, + "epoch": 0.19881988198819883, + "grad_norm": 0.306426078081131, + "learning_rate": 0.00018694248840236296, + "loss": 1.0954, + "mean_token_accuracy": 0.6928739845752716, + "num_tokens": 4214058.0, + "step": 497 + }, + { + "entropy": 1.0818894803524017, + "epoch": 0.19921992199219923, + "grad_norm": 0.2984001934528351, + "learning_rate": 0.00018688193828151682, + "loss": 1.0926, + "mean_token_accuracy": 0.6913997977972031, + "num_tokens": 4222853.0, + "step": 498 + }, + { + "entropy": 1.0889964997768402, + "epoch": 0.1996199619961996, + "grad_norm": 0.2939610481262207, + "learning_rate": 0.0001868212591277515, + "loss": 1.0606, + "mean_token_accuracy": 0.6952404677867889, + "num_tokens": 4231395.0, + "step": 499 + }, + { + "entropy": 1.0546657741069794, + "epoch": 0.2000200020002, + "grad_norm": 0.28841695189476013, + "learning_rate": 0.00018676045104290637, + "loss": 1.0682, + "mean_token_accuracy": 0.6971585303544998, + "num_tokens": 4240525.0, + "step": 500 + }, + { + "entropy": 1.0262439101934433, + "epoch": 0.2004200420042004, + "grad_norm": 0.26636794209480286, + "learning_rate": 0.00018669951412903725, + "loss": 1.0214, + "mean_token_accuracy": 0.7044205665588379, + "num_tokens": 4249084.0, + "step": 501 + }, + { + "entropy": 1.0894670486450195, + "epoch": 0.2008200820082008, + "grad_norm": 0.2951303720474243, + "learning_rate": 0.00018663844848841604, + "loss": 1.105, + "mean_token_accuracy": 0.6910962909460068, + "num_tokens": 4257560.0, + "step": 502 + }, + { + "entropy": 1.0957359969615936, + "epoch": 0.2012201220122012, + "grad_norm": 0.2873813211917877, + "learning_rate": 0.0001865772542235308, + "loss": 1.1213, + "mean_token_accuracy": 0.6891957819461823, + "num_tokens": 4265965.0, + "step": 503 + }, + { + "entropy": 1.1311607211828232, + "epoch": 0.20162016201620162, + "grad_norm": 0.27662351727485657, + "learning_rate": 0.0001865159314370854, + "loss": 1.1205, + "mean_token_accuracy": 0.68673375248909, + "num_tokens": 4274589.0, + "step": 504 + }, + { + "entropy": 1.1106986701488495, + "epoch": 0.20202020202020202, + "grad_norm": 0.28083881735801697, + "learning_rate": 0.00018645448023199944, + "loss": 1.0789, + "mean_token_accuracy": 0.6987014263868332, + "num_tokens": 4283584.0, + "step": 505 + }, + { + "entropy": 1.1293477416038513, + "epoch": 0.20242024202420242, + "grad_norm": 0.2881603240966797, + "learning_rate": 0.00018639290071140811, + "loss": 1.1016, + "mean_token_accuracy": 0.6911357790231705, + "num_tokens": 4292407.0, + "step": 506 + }, + { + "entropy": 1.1496111154556274, + "epoch": 0.20282028202820282, + "grad_norm": 0.29198089241981506, + "learning_rate": 0.00018633119297866183, + "loss": 1.1281, + "mean_token_accuracy": 0.6901517957448959, + "num_tokens": 4300732.0, + "step": 507 + }, + { + "entropy": 1.152004212141037, + "epoch": 0.20322032203220322, + "grad_norm": 0.2951393723487854, + "learning_rate": 0.00018626935713732625, + "loss": 1.1192, + "mean_token_accuracy": 0.687685951590538, + "num_tokens": 4308893.0, + "step": 508 + }, + { + "entropy": 1.1889654099941254, + "epoch": 0.20362036203620362, + "grad_norm": 0.2978850305080414, + "learning_rate": 0.00018620739329118212, + "loss": 1.1862, + "mean_token_accuracy": 0.6711174994707108, + "num_tokens": 4317109.0, + "step": 509 + }, + { + "entropy": 1.1013084948062897, + "epoch": 0.20402040204020402, + "grad_norm": 0.5161345601081848, + "learning_rate": 0.00018614530154422484, + "loss": 1.0913, + "mean_token_accuracy": 0.7007726728916168, + "num_tokens": 4325247.0, + "step": 510 + }, + { + "entropy": 1.1211208403110504, + "epoch": 0.20442044204420443, + "grad_norm": 0.3235238194465637, + "learning_rate": 0.00018608308200066464, + "loss": 1.1299, + "mean_token_accuracy": 0.6799585521221161, + "num_tokens": 4333072.0, + "step": 511 + }, + { + "entropy": 1.0299284011125565, + "epoch": 0.20482048204820483, + "grad_norm": 0.2876908779144287, + "learning_rate": 0.00018602073476492616, + "loss": 1.043, + "mean_token_accuracy": 0.7059151381254196, + "num_tokens": 4341981.0, + "step": 512 + }, + { + "entropy": 1.017932489514351, + "epoch": 0.20522052205220523, + "grad_norm": 0.280977725982666, + "learning_rate": 0.00018595825994164835, + "loss": 1.0268, + "mean_token_accuracy": 0.7061383426189423, + "num_tokens": 4350628.0, + "step": 513 + }, + { + "entropy": 1.1029777228832245, + "epoch": 0.20562056205620563, + "grad_norm": 0.31565791368484497, + "learning_rate": 0.0001858956576356843, + "loss": 1.1273, + "mean_token_accuracy": 0.6841246038675308, + "num_tokens": 4359006.0, + "step": 514 + }, + { + "entropy": 1.0328813642263412, + "epoch": 0.20602060206020603, + "grad_norm": 0.29239556193351746, + "learning_rate": 0.00018583292795210104, + "loss": 1.0174, + "mean_token_accuracy": 0.7031887769699097, + "num_tokens": 4367605.0, + "step": 515 + }, + { + "entropy": 1.1951524019241333, + "epoch": 0.20642064206420643, + "grad_norm": 0.30958452820777893, + "learning_rate": 0.00018577007099617945, + "loss": 1.1925, + "mean_token_accuracy": 0.6704075187444687, + "num_tokens": 4376023.0, + "step": 516 + }, + { + "entropy": 1.0933611989021301, + "epoch": 0.2068206820682068, + "grad_norm": 0.29714569449424744, + "learning_rate": 0.0001857070868734139, + "loss": 1.0989, + "mean_token_accuracy": 0.6912316530942917, + "num_tokens": 4384573.0, + "step": 517 + }, + { + "entropy": 1.2128576934337616, + "epoch": 0.2072207220722072, + "grad_norm": 0.29116129875183105, + "learning_rate": 0.0001856439756895123, + "loss": 1.1684, + "mean_token_accuracy": 0.6702051758766174, + "num_tokens": 4392513.0, + "step": 518 + }, + { + "entropy": 1.0891981720924377, + "epoch": 0.2076207620762076, + "grad_norm": 0.27925246953964233, + "learning_rate": 0.0001855807375503957, + "loss": 1.0479, + "mean_token_accuracy": 0.7012913972139359, + "num_tokens": 4401143.0, + "step": 519 + }, + { + "entropy": 1.1257116198539734, + "epoch": 0.208020802080208, + "grad_norm": 0.2964246869087219, + "learning_rate": 0.00018551737256219838, + "loss": 1.1196, + "mean_token_accuracy": 0.6898142993450165, + "num_tokens": 4409657.0, + "step": 520 + }, + { + "entropy": 1.1296975314617157, + "epoch": 0.2084208420842084, + "grad_norm": 0.30565595626831055, + "learning_rate": 0.00018545388083126736, + "loss": 1.1382, + "mean_token_accuracy": 0.6826473474502563, + "num_tokens": 4417741.0, + "step": 521 + }, + { + "entropy": 1.0755872130393982, + "epoch": 0.20882088208820881, + "grad_norm": 0.29240506887435913, + "learning_rate": 0.0001853902624641624, + "loss": 1.1055, + "mean_token_accuracy": 0.6916440278291702, + "num_tokens": 4426230.0, + "step": 522 + }, + { + "entropy": 1.052267149090767, + "epoch": 0.20922092209220922, + "grad_norm": 0.2946294844150543, + "learning_rate": 0.00018532651756765587, + "loss": 1.0449, + "mean_token_accuracy": 0.7057672888040543, + "num_tokens": 4434607.0, + "step": 523 + }, + { + "entropy": 1.10684734582901, + "epoch": 0.20962096209620962, + "grad_norm": 0.30656522512435913, + "learning_rate": 0.00018526264624873245, + "loss": 1.0888, + "mean_token_accuracy": 0.6939042061567307, + "num_tokens": 4442338.0, + "step": 524 + }, + { + "entropy": 1.1186622977256775, + "epoch": 0.21002100210021002, + "grad_norm": 0.29416730999946594, + "learning_rate": 0.000185198648614589, + "loss": 1.1268, + "mean_token_accuracy": 0.6892451792955399, + "num_tokens": 4450857.0, + "step": 525 + }, + { + "entropy": 1.0635767579078674, + "epoch": 0.21042104210421042, + "grad_norm": 0.2916122376918793, + "learning_rate": 0.0001851345247726344, + "loss": 1.0295, + "mean_token_accuracy": 0.7052903473377228, + "num_tokens": 4459303.0, + "step": 526 + }, + { + "entropy": 1.1515401303768158, + "epoch": 0.21082108210821082, + "grad_norm": 0.3031848073005676, + "learning_rate": 0.00018507027483048933, + "loss": 1.1575, + "mean_token_accuracy": 0.6841557770967484, + "num_tokens": 4467401.0, + "step": 527 + }, + { + "entropy": 1.147591769695282, + "epoch": 0.21122112211221122, + "grad_norm": 0.28864341974258423, + "learning_rate": 0.00018500589889598607, + "loss": 1.121, + "mean_token_accuracy": 0.6861714273691177, + "num_tokens": 4476169.0, + "step": 528 + }, + { + "entropy": 1.055920347571373, + "epoch": 0.21162116211621163, + "grad_norm": 0.3123279809951782, + "learning_rate": 0.00018494139707716844, + "loss": 1.0688, + "mean_token_accuracy": 0.6997086852788925, + "num_tokens": 4484032.0, + "step": 529 + }, + { + "entropy": 1.0830091834068298, + "epoch": 0.21202120212021203, + "grad_norm": 0.3152300715446472, + "learning_rate": 0.00018487676948229144, + "loss": 1.1229, + "mean_token_accuracy": 0.6820532232522964, + "num_tokens": 4491892.0, + "step": 530 + }, + { + "entropy": 1.068447858095169, + "epoch": 0.21242124212421243, + "grad_norm": 0.28578075766563416, + "learning_rate": 0.00018481201621982127, + "loss": 1.0712, + "mean_token_accuracy": 0.6947162747383118, + "num_tokens": 4500697.0, + "step": 531 + }, + { + "entropy": 1.1199464201927185, + "epoch": 0.21282128212821283, + "grad_norm": 0.30974507331848145, + "learning_rate": 0.00018474713739843494, + "loss": 1.1072, + "mean_token_accuracy": 0.6834284961223602, + "num_tokens": 4508847.0, + "step": 532 + }, + { + "entropy": 1.187211662530899, + "epoch": 0.21322132213221323, + "grad_norm": 0.31375956535339355, + "learning_rate": 0.00018468213312702026, + "loss": 1.1774, + "mean_token_accuracy": 0.673480212688446, + "num_tokens": 4517091.0, + "step": 533 + }, + { + "entropy": 1.1514391899108887, + "epoch": 0.21362136213621363, + "grad_norm": 0.30260926485061646, + "learning_rate": 0.00018461700351467551, + "loss": 1.1146, + "mean_token_accuracy": 0.6896267980337143, + "num_tokens": 4525142.0, + "step": 534 + }, + { + "entropy": 1.1040992140769958, + "epoch": 0.21402140214021403, + "grad_norm": 0.30096668004989624, + "learning_rate": 0.00018455174867070944, + "loss": 1.0594, + "mean_token_accuracy": 0.7020068466663361, + "num_tokens": 4533646.0, + "step": 535 + }, + { + "entropy": 1.0806946754455566, + "epoch": 0.2144214421442144, + "grad_norm": 0.28161150217056274, + "learning_rate": 0.00018448636870464086, + "loss": 1.0793, + "mean_token_accuracy": 0.6937113851308823, + "num_tokens": 4542649.0, + "step": 536 + }, + { + "entropy": 1.083901584148407, + "epoch": 0.2148214821482148, + "grad_norm": 0.2811968922615051, + "learning_rate": 0.00018442086372619872, + "loss": 1.1117, + "mean_token_accuracy": 0.6878249049186707, + "num_tokens": 4551189.0, + "step": 537 + }, + { + "entropy": 1.0583822429180145, + "epoch": 0.2152215221522152, + "grad_norm": 0.3026861250400543, + "learning_rate": 0.00018435523384532158, + "loss": 1.0557, + "mean_token_accuracy": 0.7094537913799286, + "num_tokens": 4559796.0, + "step": 538 + }, + { + "entropy": 1.1090587675571442, + "epoch": 0.2156215621562156, + "grad_norm": 0.29840296506881714, + "learning_rate": 0.00018428947917215784, + "loss": 1.1148, + "mean_token_accuracy": 0.6940523684024811, + "num_tokens": 4568046.0, + "step": 539 + }, + { + "entropy": 1.1120129823684692, + "epoch": 0.21602160216021601, + "grad_norm": 0.29248231649398804, + "learning_rate": 0.00018422359981706523, + "loss": 1.1115, + "mean_token_accuracy": 0.6920116245746613, + "num_tokens": 4576386.0, + "step": 540 + }, + { + "entropy": 1.1465645730495453, + "epoch": 0.21642164216421642, + "grad_norm": 0.2965434789657593, + "learning_rate": 0.00018415759589061073, + "loss": 1.1503, + "mean_token_accuracy": 0.6868140697479248, + "num_tokens": 4585011.0, + "step": 541 + }, + { + "entropy": 1.093808352947235, + "epoch": 0.21682168216821682, + "grad_norm": 0.2780090272426605, + "learning_rate": 0.0001840914675035704, + "loss": 1.0829, + "mean_token_accuracy": 0.6833611726760864, + "num_tokens": 4593590.0, + "step": 542 + }, + { + "entropy": 1.130886971950531, + "epoch": 0.21722172217221722, + "grad_norm": 0.27986133098602295, + "learning_rate": 0.00018402521476692927, + "loss": 1.1197, + "mean_token_accuracy": 0.6846215426921844, + "num_tokens": 4602083.0, + "step": 543 + }, + { + "entropy": 1.0494624972343445, + "epoch": 0.21762176217621762, + "grad_norm": 0.2879413664340973, + "learning_rate": 0.00018395883779188092, + "loss": 1.0301, + "mean_token_accuracy": 0.7053423970937729, + "num_tokens": 4611011.0, + "step": 544 + }, + { + "entropy": 1.1110905408859253, + "epoch": 0.21802180218021802, + "grad_norm": 0.3037598729133606, + "learning_rate": 0.00018389233668982756, + "loss": 1.0923, + "mean_token_accuracy": 0.6870447546243668, + "num_tokens": 4619178.0, + "step": 545 + }, + { + "entropy": 1.1053301692008972, + "epoch": 0.21842184218421842, + "grad_norm": 0.28943902254104614, + "learning_rate": 0.00018382571157237967, + "loss": 1.0879, + "mean_token_accuracy": 0.6886709928512573, + "num_tokens": 4627556.0, + "step": 546 + }, + { + "entropy": 1.1321443915367126, + "epoch": 0.21882188218821882, + "grad_norm": 0.32171791791915894, + "learning_rate": 0.0001837589625513559, + "loss": 1.1438, + "mean_token_accuracy": 0.6873240023851395, + "num_tokens": 4635484.0, + "step": 547 + }, + { + "entropy": 1.0812406539916992, + "epoch": 0.21922192219221923, + "grad_norm": 0.2908923625946045, + "learning_rate": 0.0001836920897387828, + "loss": 1.104, + "mean_token_accuracy": 0.684965506196022, + "num_tokens": 4643939.0, + "step": 548 + }, + { + "entropy": 0.9942754209041595, + "epoch": 0.21962196219621963, + "grad_norm": 0.28201499581336975, + "learning_rate": 0.0001836250932468948, + "loss": 1.0129, + "mean_token_accuracy": 0.714864045381546, + "num_tokens": 4652659.0, + "step": 549 + }, + { + "entropy": 1.0881839394569397, + "epoch": 0.22002200220022003, + "grad_norm": 0.3206377327442169, + "learning_rate": 0.00018355797318813378, + "loss": 1.0986, + "mean_token_accuracy": 0.6900723427534103, + "num_tokens": 4661980.0, + "step": 550 + }, + { + "entropy": 1.1605649590492249, + "epoch": 0.22042204220422043, + "grad_norm": 0.29112040996551514, + "learning_rate": 0.00018349072967514896, + "loss": 1.136, + "mean_token_accuracy": 0.6774939745664597, + "num_tokens": 4670381.0, + "step": 551 + }, + { + "entropy": 1.1249060332775116, + "epoch": 0.22082208220822083, + "grad_norm": 0.28106650710105896, + "learning_rate": 0.00018342336282079703, + "loss": 1.0801, + "mean_token_accuracy": 0.6969916224479675, + "num_tokens": 4678379.0, + "step": 552 + }, + { + "entropy": 1.1373517513275146, + "epoch": 0.22122212221222123, + "grad_norm": 0.29038748145103455, + "learning_rate": 0.0001833558727381413, + "loss": 1.1249, + "mean_token_accuracy": 0.685653880238533, + "num_tokens": 4687106.0, + "step": 553 + }, + { + "entropy": 1.1011317372322083, + "epoch": 0.22162216221622164, + "grad_norm": 0.4556286036968231, + "learning_rate": 0.00018328825954045224, + "loss": 1.0865, + "mean_token_accuracy": 0.6966835558414459, + "num_tokens": 4695525.0, + "step": 554 + }, + { + "entropy": 1.1565632820129395, + "epoch": 0.222022202220222, + "grad_norm": 0.30804142355918884, + "learning_rate": 0.0001832205233412067, + "loss": 1.1458, + "mean_token_accuracy": 0.6828287094831467, + "num_tokens": 4703439.0, + "step": 555 + }, + { + "entropy": 1.1422687768936157, + "epoch": 0.2224222422242224, + "grad_norm": 0.3176911473274231, + "learning_rate": 0.0001831526642540882, + "loss": 1.1593, + "mean_token_accuracy": 0.6771116405725479, + "num_tokens": 4711333.0, + "step": 556 + }, + { + "entropy": 1.1018043160438538, + "epoch": 0.2228222822282228, + "grad_norm": 0.2886631488800049, + "learning_rate": 0.00018308468239298627, + "loss": 1.1393, + "mean_token_accuracy": 0.6870120018720627, + "num_tokens": 4719833.0, + "step": 557 + }, + { + "entropy": 1.1138159930706024, + "epoch": 0.2232223222322232, + "grad_norm": 0.27667999267578125, + "learning_rate": 0.00018301657787199663, + "loss": 1.1306, + "mean_token_accuracy": 0.687031626701355, + "num_tokens": 4728767.0, + "step": 558 + }, + { + "entropy": 1.1035625040531158, + "epoch": 0.22362236223622362, + "grad_norm": 0.2869037687778473, + "learning_rate": 0.00018294835080542087, + "loss": 1.0772, + "mean_token_accuracy": 0.6963266283273697, + "num_tokens": 4737151.0, + "step": 559 + }, + { + "entropy": 1.2061055600643158, + "epoch": 0.22402240224022402, + "grad_norm": 0.2924922704696655, + "learning_rate": 0.00018288000130776616, + "loss": 1.1747, + "mean_token_accuracy": 0.6786110997200012, + "num_tokens": 4745324.0, + "step": 560 + }, + { + "entropy": 1.174630492925644, + "epoch": 0.22442244224422442, + "grad_norm": 0.2991234362125397, + "learning_rate": 0.00018281152949374527, + "loss": 1.1306, + "mean_token_accuracy": 0.680591881275177, + "num_tokens": 4753729.0, + "step": 561 + }, + { + "entropy": 1.2046549618244171, + "epoch": 0.22482248224822482, + "grad_norm": 0.3071587085723877, + "learning_rate": 0.00018274293547827618, + "loss": 1.1628, + "mean_token_accuracy": 0.6722983866930008, + "num_tokens": 4761781.0, + "step": 562 + }, + { + "entropy": 1.1196357905864716, + "epoch": 0.22522252225222522, + "grad_norm": 0.2907363176345825, + "learning_rate": 0.0001826742193764819, + "loss": 1.1153, + "mean_token_accuracy": 0.6894876211881638, + "num_tokens": 4770708.0, + "step": 563 + }, + { + "entropy": 1.0620996952056885, + "epoch": 0.22562256225622562, + "grad_norm": 0.2802564799785614, + "learning_rate": 0.0001826053813036905, + "loss": 1.0624, + "mean_token_accuracy": 0.6971993446350098, + "num_tokens": 4779276.0, + "step": 564 + }, + { + "entropy": 1.0771796256303787, + "epoch": 0.22602260226022602, + "grad_norm": 0.2916572093963623, + "learning_rate": 0.00018253642137543464, + "loss": 1.1013, + "mean_token_accuracy": 0.6843440979719162, + "num_tokens": 4787447.0, + "step": 565 + }, + { + "entropy": 1.0631023943424225, + "epoch": 0.22642264226422643, + "grad_norm": 0.3070925772190094, + "learning_rate": 0.00018246733970745153, + "loss": 1.1064, + "mean_token_accuracy": 0.6869925558567047, + "num_tokens": 4796372.0, + "step": 566 + }, + { + "entropy": 1.1108772158622742, + "epoch": 0.22682268226822683, + "grad_norm": 0.3028910458087921, + "learning_rate": 0.00018239813641568274, + "loss": 1.1657, + "mean_token_accuracy": 0.6808956861495972, + "num_tokens": 4804762.0, + "step": 567 + }, + { + "entropy": 1.079510360956192, + "epoch": 0.22722272227222723, + "grad_norm": 0.280043363571167, + "learning_rate": 0.00018232881161627386, + "loss": 1.113, + "mean_token_accuracy": 0.6885326355695724, + "num_tokens": 4814036.0, + "step": 568 + }, + { + "entropy": 1.1098740994930267, + "epoch": 0.22762276227622763, + "grad_norm": 0.2952498197555542, + "learning_rate": 0.0001822593654255745, + "loss": 1.0887, + "mean_token_accuracy": 0.6888742446899414, + "num_tokens": 4822598.0, + "step": 569 + }, + { + "entropy": 1.1819509863853455, + "epoch": 0.22802280228022803, + "grad_norm": 0.314456969499588, + "learning_rate": 0.00018218979796013796, + "loss": 1.1136, + "mean_token_accuracy": 0.6837149411439896, + "num_tokens": 4830904.0, + "step": 570 + }, + { + "entropy": 1.211066722869873, + "epoch": 0.22842284228422843, + "grad_norm": 0.2834117114543915, + "learning_rate": 0.00018212010933672113, + "loss": 1.1476, + "mean_token_accuracy": 0.6775008738040924, + "num_tokens": 4839677.0, + "step": 571 + }, + { + "entropy": 1.1728490889072418, + "epoch": 0.22882288228822883, + "grad_norm": 0.3034587800502777, + "learning_rate": 0.00018205029967228415, + "loss": 1.1481, + "mean_token_accuracy": 0.6803870648145676, + "num_tokens": 4847588.0, + "step": 572 + }, + { + "entropy": 1.1998023986816406, + "epoch": 0.22922292229222924, + "grad_norm": 0.306550532579422, + "learning_rate": 0.00018198036908399038, + "loss": 1.2199, + "mean_token_accuracy": 0.6653586775064468, + "num_tokens": 4855843.0, + "step": 573 + }, + { + "entropy": 1.0715917497873306, + "epoch": 0.2296229622962296, + "grad_norm": 0.29695501923561096, + "learning_rate": 0.00018191031768920613, + "loss": 1.0415, + "mean_token_accuracy": 0.6996985673904419, + "num_tokens": 4863980.0, + "step": 574 + }, + { + "entropy": 1.093893676996231, + "epoch": 0.23002300230023, + "grad_norm": 0.283250093460083, + "learning_rate": 0.00018184014560550046, + "loss": 1.0919, + "mean_token_accuracy": 0.6895446330308914, + "num_tokens": 4872715.0, + "step": 575 + }, + { + "entropy": 1.0581503957509995, + "epoch": 0.2304230423042304, + "grad_norm": 0.2988159656524658, + "learning_rate": 0.00018176985295064487, + "loss": 1.1007, + "mean_token_accuracy": 0.6927359253168106, + "num_tokens": 4881374.0, + "step": 576 + }, + { + "entropy": 1.0833570659160614, + "epoch": 0.23082308230823081, + "grad_norm": 0.30794355273246765, + "learning_rate": 0.00018169943984261343, + "loss": 1.0973, + "mean_token_accuracy": 0.6918238550424576, + "num_tokens": 4890619.0, + "step": 577 + }, + { + "entropy": 1.101259171962738, + "epoch": 0.23122312231223122, + "grad_norm": 0.3109910488128662, + "learning_rate": 0.00018162890639958224, + "loss": 1.156, + "mean_token_accuracy": 0.6827266663312912, + "num_tokens": 4898828.0, + "step": 578 + }, + { + "entropy": 1.0989751517772675, + "epoch": 0.23162316231623162, + "grad_norm": 0.3026745021343231, + "learning_rate": 0.0001815582527399293, + "loss": 1.0911, + "mean_token_accuracy": 0.6918766647577286, + "num_tokens": 4906748.0, + "step": 579 + }, + { + "entropy": 1.0654544532299042, + "epoch": 0.23202320232023202, + "grad_norm": 0.27107569575309753, + "learning_rate": 0.00018148747898223455, + "loss": 1.0198, + "mean_token_accuracy": 0.7077686190605164, + "num_tokens": 4915151.0, + "step": 580 + }, + { + "entropy": 1.1444284319877625, + "epoch": 0.23242324232423242, + "grad_norm": 0.2838669419288635, + "learning_rate": 0.00018141658524527932, + "loss": 1.1231, + "mean_token_accuracy": 0.6798969656229019, + "num_tokens": 4923719.0, + "step": 581 + }, + { + "entropy": 1.2222770154476166, + "epoch": 0.23282328232823282, + "grad_norm": 0.30396589636802673, + "learning_rate": 0.0001813455716480464, + "loss": 1.1758, + "mean_token_accuracy": 0.676687553524971, + "num_tokens": 4931410.0, + "step": 582 + }, + { + "entropy": 1.11738121509552, + "epoch": 0.23322332233223322, + "grad_norm": 0.27988696098327637, + "learning_rate": 0.00018127443830971975, + "loss": 1.0998, + "mean_token_accuracy": 0.6916311383247375, + "num_tokens": 4940311.0, + "step": 583 + }, + { + "entropy": 1.113107830286026, + "epoch": 0.23362336233623363, + "grad_norm": 0.2928112745285034, + "learning_rate": 0.00018120318534968426, + "loss": 1.115, + "mean_token_accuracy": 0.6865992546081543, + "num_tokens": 4948772.0, + "step": 584 + }, + { + "entropy": 1.1359272599220276, + "epoch": 0.23402340234023403, + "grad_norm": 0.3683016002178192, + "learning_rate": 0.00018113181288752553, + "loss": 1.1288, + "mean_token_accuracy": 0.6873686760663986, + "num_tokens": 4957234.0, + "step": 585 + }, + { + "entropy": 1.093790888786316, + "epoch": 0.23442344234423443, + "grad_norm": 0.2831033766269684, + "learning_rate": 0.00018106032104302984, + "loss": 1.0756, + "mean_token_accuracy": 0.6959938704967499, + "num_tokens": 4965643.0, + "step": 586 + }, + { + "entropy": 1.105111837387085, + "epoch": 0.23482348234823483, + "grad_norm": 0.3010050654411316, + "learning_rate": 0.00018098870993618377, + "loss": 1.1216, + "mean_token_accuracy": 0.6800498068332672, + "num_tokens": 4973675.0, + "step": 587 + }, + { + "entropy": 1.0883657187223434, + "epoch": 0.23522352235223523, + "grad_norm": 0.2842932939529419, + "learning_rate": 0.00018091697968717406, + "loss": 1.1282, + "mean_token_accuracy": 0.6877963691949844, + "num_tokens": 4982827.0, + "step": 588 + }, + { + "entropy": 1.1229371428489685, + "epoch": 0.23562356235623563, + "grad_norm": 0.28499653935432434, + "learning_rate": 0.00018084513041638744, + "loss": 1.1214, + "mean_token_accuracy": 0.6902081966400146, + "num_tokens": 4991354.0, + "step": 589 + }, + { + "entropy": 1.1008753180503845, + "epoch": 0.23602360236023603, + "grad_norm": 0.2883416414260864, + "learning_rate": 0.00018077316224441036, + "loss": 1.1068, + "mean_token_accuracy": 0.6919411867856979, + "num_tokens": 4999539.0, + "step": 590 + }, + { + "entropy": 1.15566685795784, + "epoch": 0.23642364236423644, + "grad_norm": 0.29074421525001526, + "learning_rate": 0.0001807010752920288, + "loss": 1.1387, + "mean_token_accuracy": 0.6731951385736465, + "num_tokens": 5008212.0, + "step": 591 + }, + { + "entropy": 1.1169771254062653, + "epoch": 0.23682368236823684, + "grad_norm": 0.28261739015579224, + "learning_rate": 0.00018062886968022823, + "loss": 1.104, + "mean_token_accuracy": 0.6941290497779846, + "num_tokens": 5016866.0, + "step": 592 + }, + { + "entropy": 1.1271122097969055, + "epoch": 0.2372237223722372, + "grad_norm": 0.28691786527633667, + "learning_rate": 0.00018055654553019305, + "loss": 1.1201, + "mean_token_accuracy": 0.684315949678421, + "num_tokens": 5025079.0, + "step": 593 + }, + { + "entropy": 1.1080365180969238, + "epoch": 0.2376237623762376, + "grad_norm": 0.2816408574581146, + "learning_rate": 0.00018048410296330684, + "loss": 1.1097, + "mean_token_accuracy": 0.690007746219635, + "num_tokens": 5033953.0, + "step": 594 + }, + { + "entropy": 1.1288244724273682, + "epoch": 0.23802380238023801, + "grad_norm": 0.29708895087242126, + "learning_rate": 0.00018041154210115173, + "loss": 1.1102, + "mean_token_accuracy": 0.6903732120990753, + "num_tokens": 5042392.0, + "step": 595 + }, + { + "entropy": 1.1925886869430542, + "epoch": 0.23842384238423842, + "grad_norm": 0.29431167244911194, + "learning_rate": 0.0001803388630655085, + "loss": 1.1543, + "mean_token_accuracy": 0.6811997294425964, + "num_tokens": 5050808.0, + "step": 596 + }, + { + "entropy": 1.0786546766757965, + "epoch": 0.23882388238823882, + "grad_norm": 0.2869158089160919, + "learning_rate": 0.0001802660659783562, + "loss": 1.0753, + "mean_token_accuracy": 0.6927260458469391, + "num_tokens": 5059301.0, + "step": 597 + }, + { + "entropy": 1.0867765545845032, + "epoch": 0.23922392239223922, + "grad_norm": 0.291801393032074, + "learning_rate": 0.0001801931509618721, + "loss": 1.1104, + "mean_token_accuracy": 0.689553901553154, + "num_tokens": 5067726.0, + "step": 598 + }, + { + "entropy": 1.02767014503479, + "epoch": 0.23962396239623962, + "grad_norm": 0.2894926369190216, + "learning_rate": 0.00018012011813843128, + "loss": 1.0422, + "mean_token_accuracy": 0.7023687213659286, + "num_tokens": 5075977.0, + "step": 599 + }, + { + "entropy": 1.0798131823539734, + "epoch": 0.24002400240024002, + "grad_norm": 0.29748791456222534, + "learning_rate": 0.0001800469676306066, + "loss": 1.0712, + "mean_token_accuracy": 0.6980320513248444, + "num_tokens": 5084198.0, + "step": 600 + }, + { + "entropy": 1.0867089629173279, + "epoch": 0.24042404240424042, + "grad_norm": 0.288310706615448, + "learning_rate": 0.00017997369956116845, + "loss": 1.0574, + "mean_token_accuracy": 0.6984989941120148, + "num_tokens": 5092776.0, + "step": 601 + }, + { + "entropy": 1.1064570248126984, + "epoch": 0.24082408240824082, + "grad_norm": 0.3069087862968445, + "learning_rate": 0.00017990031405308446, + "loss": 1.1216, + "mean_token_accuracy": 0.6880150139331818, + "num_tokens": 5101715.0, + "step": 602 + }, + { + "entropy": 1.0482181906700134, + "epoch": 0.24122412241224123, + "grad_norm": 0.2942591905593872, + "learning_rate": 0.00017982681122951944, + "loss": 1.0567, + "mean_token_accuracy": 0.6972368657588959, + "num_tokens": 5109937.0, + "step": 603 + }, + { + "entropy": 1.1223880648612976, + "epoch": 0.24162416241624163, + "grad_norm": 0.2994782328605652, + "learning_rate": 0.00017975319121383502, + "loss": 1.1291, + "mean_token_accuracy": 0.6875152140855789, + "num_tokens": 5118109.0, + "step": 604 + }, + { + "entropy": 1.0888287127017975, + "epoch": 0.24202420242024203, + "grad_norm": 0.29462310671806335, + "learning_rate": 0.0001796794541295896, + "loss": 1.0608, + "mean_token_accuracy": 0.6931962221860886, + "num_tokens": 5126797.0, + "step": 605 + }, + { + "entropy": 1.0591014921665192, + "epoch": 0.24242424242424243, + "grad_norm": 0.29095667600631714, + "learning_rate": 0.00017960560010053795, + "loss": 1.0672, + "mean_token_accuracy": 0.698840007185936, + "num_tokens": 5135620.0, + "step": 606 + }, + { + "entropy": 1.1206754446029663, + "epoch": 0.24282428242824283, + "grad_norm": 0.3482569456100464, + "learning_rate": 0.00017953162925063123, + "loss": 1.1526, + "mean_token_accuracy": 0.6860251873731613, + "num_tokens": 5144271.0, + "step": 607 + }, + { + "entropy": 1.0811570882797241, + "epoch": 0.24322432243224323, + "grad_norm": 0.284400999546051, + "learning_rate": 0.00017945754170401655, + "loss": 1.0643, + "mean_token_accuracy": 0.6971986293792725, + "num_tokens": 5152932.0, + "step": 608 + }, + { + "entropy": 1.119075819849968, + "epoch": 0.24362436243624364, + "grad_norm": 0.2821342647075653, + "learning_rate": 0.00017938333758503696, + "loss": 1.1306, + "mean_token_accuracy": 0.6846693158149719, + "num_tokens": 5161586.0, + "step": 609 + }, + { + "entropy": 1.0774531364440918, + "epoch": 0.24402440244024404, + "grad_norm": 0.27296972274780273, + "learning_rate": 0.00017930901701823114, + "loss": 1.0712, + "mean_token_accuracy": 0.6962114125490189, + "num_tokens": 5170238.0, + "step": 610 + }, + { + "entropy": 1.0940572917461395, + "epoch": 0.2444244424442444, + "grad_norm": 0.27615806460380554, + "learning_rate": 0.0001792345801283332, + "loss": 1.0685, + "mean_token_accuracy": 0.6920359581708908, + "num_tokens": 5178768.0, + "step": 611 + }, + { + "entropy": 1.0925448536872864, + "epoch": 0.2448244824482448, + "grad_norm": 0.30458247661590576, + "learning_rate": 0.0001791600270402724, + "loss": 1.0697, + "mean_token_accuracy": 0.6994819045066833, + "num_tokens": 5187099.0, + "step": 612 + }, + { + "entropy": 1.2210338413715363, + "epoch": 0.2452245224522452, + "grad_norm": 0.32235804200172424, + "learning_rate": 0.00017908535787917318, + "loss": 1.2142, + "mean_token_accuracy": 0.6628595441579819, + "num_tokens": 5195299.0, + "step": 613 + }, + { + "entropy": 1.1216653883457184, + "epoch": 0.24562456245624562, + "grad_norm": 0.27859562635421753, + "learning_rate": 0.00017901057277035462, + "loss": 1.097, + "mean_token_accuracy": 0.6943556666374207, + "num_tokens": 5204125.0, + "step": 614 + }, + { + "entropy": 1.081338807940483, + "epoch": 0.24602460246024602, + "grad_norm": 0.2708154320716858, + "learning_rate": 0.00017893567183933055, + "loss": 1.061, + "mean_token_accuracy": 0.7044036388397217, + "num_tokens": 5212778.0, + "step": 615 + }, + { + "entropy": 1.1373992264270782, + "epoch": 0.24642464246424642, + "grad_norm": 0.3036040961742401, + "learning_rate": 0.00017886065521180905, + "loss": 1.1584, + "mean_token_accuracy": 0.6825258284807205, + "num_tokens": 5221291.0, + "step": 616 + }, + { + "entropy": 1.024650439620018, + "epoch": 0.24682468246824682, + "grad_norm": 0.2775669991970062, + "learning_rate": 0.0001787855230136925, + "loss": 1.025, + "mean_token_accuracy": 0.7081364095211029, + "num_tokens": 5230205.0, + "step": 617 + }, + { + "entropy": 1.1333202421665192, + "epoch": 0.24722472247224722, + "grad_norm": 0.3236136734485626, + "learning_rate": 0.00017871027537107715, + "loss": 1.148, + "mean_token_accuracy": 0.6804903447628021, + "num_tokens": 5238426.0, + "step": 618 + }, + { + "entropy": 1.1499110460281372, + "epoch": 0.24762476247624762, + "grad_norm": 0.29872870445251465, + "learning_rate": 0.00017863491241025303, + "loss": 1.1332, + "mean_token_accuracy": 0.6817525327205658, + "num_tokens": 5246612.0, + "step": 619 + }, + { + "entropy": 1.1693151593208313, + "epoch": 0.24802480248024802, + "grad_norm": 0.30231523513793945, + "learning_rate": 0.00017855943425770373, + "loss": 1.1831, + "mean_token_accuracy": 0.6763779520988464, + "num_tokens": 5254504.0, + "step": 620 + }, + { + "entropy": 1.1351750791072845, + "epoch": 0.24842484248424843, + "grad_norm": 0.3206844627857208, + "learning_rate": 0.00017848384104010622, + "loss": 1.1089, + "mean_token_accuracy": 0.6927164494991302, + "num_tokens": 5262338.0, + "step": 621 + }, + { + "entropy": 1.113035649061203, + "epoch": 0.24882488248824883, + "grad_norm": 0.29072827100753784, + "learning_rate": 0.00017840813288433043, + "loss": 1.0959, + "mean_token_accuracy": 0.6916071623563766, + "num_tokens": 5270905.0, + "step": 622 + }, + { + "entropy": 1.1295787394046783, + "epoch": 0.24922492249224923, + "grad_norm": 0.29548099637031555, + "learning_rate": 0.00017833230991743935, + "loss": 1.1163, + "mean_token_accuracy": 0.6818942427635193, + "num_tokens": 5279331.0, + "step": 623 + }, + { + "entropy": 1.104911208152771, + "epoch": 0.24962496249624963, + "grad_norm": 0.2787920832633972, + "learning_rate": 0.00017825637226668857, + "loss": 1.0958, + "mean_token_accuracy": 0.6957663297653198, + "num_tokens": 5287769.0, + "step": 624 + }, + { + "entropy": 1.1054737269878387, + "epoch": 0.25002500250025, + "grad_norm": 0.28175434470176697, + "learning_rate": 0.0001781803200595262, + "loss": 1.0872, + "mean_token_accuracy": 0.6950071603059769, + "num_tokens": 5296433.0, + "step": 625 + }, + { + "entropy": 1.131270483136177, + "epoch": 0.2504250425042504, + "grad_norm": 0.2943648099899292, + "learning_rate": 0.00017810415342359257, + "loss": 1.1122, + "mean_token_accuracy": 0.6866819262504578, + "num_tokens": 5304657.0, + "step": 626 + }, + { + "entropy": 1.0925902724266052, + "epoch": 0.2508250825082508, + "grad_norm": 0.29442209005355835, + "learning_rate": 0.00017802787248672014, + "loss": 1.1154, + "mean_token_accuracy": 0.6824711561203003, + "num_tokens": 5313139.0, + "step": 627 + }, + { + "entropy": 1.143124371767044, + "epoch": 0.2512251225122512, + "grad_norm": 0.29950326681137085, + "learning_rate": 0.00017795147737693313, + "loss": 1.1756, + "mean_token_accuracy": 0.6731129884719849, + "num_tokens": 5321350.0, + "step": 628 + }, + { + "entropy": 1.0993148982524872, + "epoch": 0.2516251625162516, + "grad_norm": 0.30338600277900696, + "learning_rate": 0.00017787496822244746, + "loss": 1.0981, + "mean_token_accuracy": 0.6886722892522812, + "num_tokens": 5329313.0, + "step": 629 + }, + { + "entropy": 1.1802239120006561, + "epoch": 0.252025202520252, + "grad_norm": 0.29385799169540405, + "learning_rate": 0.0001777983451516703, + "loss": 1.1945, + "mean_token_accuracy": 0.6706182807683945, + "num_tokens": 5337729.0, + "step": 630 + }, + { + "entropy": 1.0929347574710846, + "epoch": 0.2524252425242524, + "grad_norm": 0.26575225591659546, + "learning_rate": 0.00017772160829320011, + "loss": 1.0616, + "mean_token_accuracy": 0.6999123692512512, + "num_tokens": 5346716.0, + "step": 631 + }, + { + "entropy": 1.1688721179962158, + "epoch": 0.2528252825282528, + "grad_norm": 0.31775790452957153, + "learning_rate": 0.0001776447577758264, + "loss": 1.151, + "mean_token_accuracy": 0.6752089411020279, + "num_tokens": 5354899.0, + "step": 632 + }, + { + "entropy": 1.1078169494867325, + "epoch": 0.2532253225322532, + "grad_norm": 0.27552899718284607, + "learning_rate": 0.00017756779372852933, + "loss": 1.0546, + "mean_token_accuracy": 0.6943843811750412, + "num_tokens": 5363530.0, + "step": 633 + }, + { + "entropy": 1.1095669269561768, + "epoch": 0.2536253625362536, + "grad_norm": 0.2830117344856262, + "learning_rate": 0.00017749071628047955, + "loss": 1.0784, + "mean_token_accuracy": 0.697126716375351, + "num_tokens": 5371827.0, + "step": 634 + }, + { + "entropy": 1.0879399627447128, + "epoch": 0.254025402540254, + "grad_norm": 0.30304059386253357, + "learning_rate": 0.00017741352556103817, + "loss": 1.0702, + "mean_token_accuracy": 0.6919046491384506, + "num_tokens": 5380347.0, + "step": 635 + }, + { + "entropy": 1.0853265523910522, + "epoch": 0.2544254425442544, + "grad_norm": 0.2943440079689026, + "learning_rate": 0.00017733622169975637, + "loss": 1.0854, + "mean_token_accuracy": 0.6890179812908173, + "num_tokens": 5388763.0, + "step": 636 + }, + { + "entropy": 1.0664988607168198, + "epoch": 0.2548254825482548, + "grad_norm": 0.2727278470993042, + "learning_rate": 0.00017725880482637513, + "loss": 1.0667, + "mean_token_accuracy": 0.6993520259857178, + "num_tokens": 5397563.0, + "step": 637 + }, + { + "entropy": 1.0242808163166046, + "epoch": 0.2552255225522552, + "grad_norm": 0.27862703800201416, + "learning_rate": 0.0001771812750708252, + "loss": 1.0348, + "mean_token_accuracy": 0.7033644020557404, + "num_tokens": 5406556.0, + "step": 638 + }, + { + "entropy": 1.053386002779007, + "epoch": 0.2556255625562556, + "grad_norm": 0.28933432698249817, + "learning_rate": 0.00017710363256322678, + "loss": 1.0792, + "mean_token_accuracy": 0.6895127892494202, + "num_tokens": 5415237.0, + "step": 639 + }, + { + "entropy": 1.1210744678974152, + "epoch": 0.256025602560256, + "grad_norm": 0.30109062790870667, + "learning_rate": 0.0001770258774338892, + "loss": 1.1524, + "mean_token_accuracy": 0.6795550584793091, + "num_tokens": 5423641.0, + "step": 640 + }, + { + "entropy": 1.0831237733364105, + "epoch": 0.25642564256425643, + "grad_norm": 0.2758908271789551, + "learning_rate": 0.00017694800981331093, + "loss": 1.0824, + "mean_token_accuracy": 0.695437878370285, + "num_tokens": 5432211.0, + "step": 641 + }, + { + "entropy": 1.1548921763896942, + "epoch": 0.25682568256825683, + "grad_norm": 0.2990299463272095, + "learning_rate": 0.00017687002983217913, + "loss": 1.1408, + "mean_token_accuracy": 0.6746811717748642, + "num_tokens": 5440127.0, + "step": 642 + }, + { + "entropy": 1.1489346325397491, + "epoch": 0.25722572257225723, + "grad_norm": 0.3072243928909302, + "learning_rate": 0.00017679193762136966, + "loss": 1.0941, + "mean_token_accuracy": 0.6883689165115356, + "num_tokens": 5448376.0, + "step": 643 + }, + { + "entropy": 1.1230524778366089, + "epoch": 0.25762576257625763, + "grad_norm": 0.2968003749847412, + "learning_rate": 0.0001767137333119466, + "loss": 1.1107, + "mean_token_accuracy": 0.6885704845190048, + "num_tokens": 5456809.0, + "step": 644 + }, + { + "entropy": 1.143693596124649, + "epoch": 0.25802580258025803, + "grad_norm": 0.28341495990753174, + "learning_rate": 0.00017663541703516234, + "loss": 1.1154, + "mean_token_accuracy": 0.6828522235155106, + "num_tokens": 5465050.0, + "step": 645 + }, + { + "entropy": 1.170055627822876, + "epoch": 0.25842584258425844, + "grad_norm": 0.30185526609420776, + "learning_rate": 0.00017655698892245697, + "loss": 1.1687, + "mean_token_accuracy": 0.6691694855690002, + "num_tokens": 5472891.0, + "step": 646 + }, + { + "entropy": 1.0880989134311676, + "epoch": 0.25882588258825884, + "grad_norm": 0.2795546352863312, + "learning_rate": 0.0001764784491054585, + "loss": 1.0972, + "mean_token_accuracy": 0.695548877120018, + "num_tokens": 5481495.0, + "step": 647 + }, + { + "entropy": 1.1015138924121857, + "epoch": 0.25922592259225924, + "grad_norm": 0.29018738865852356, + "learning_rate": 0.00017639979771598224, + "loss": 1.1197, + "mean_token_accuracy": 0.6800331175327301, + "num_tokens": 5489680.0, + "step": 648 + }, + { + "entropy": 0.9927779585123062, + "epoch": 0.25962596259625964, + "grad_norm": 0.2688060402870178, + "learning_rate": 0.00017632103488603079, + "loss": 1.0086, + "mean_token_accuracy": 0.7094009071588516, + "num_tokens": 5498800.0, + "step": 649 + }, + { + "entropy": 1.0720245838165283, + "epoch": 0.26002600260026004, + "grad_norm": 0.28839677572250366, + "learning_rate": 0.0001762421607477939, + "loss": 1.0876, + "mean_token_accuracy": 0.7007628977298737, + "num_tokens": 5507502.0, + "step": 650 + }, + { + "entropy": 1.122203767299652, + "epoch": 0.26042604260426044, + "grad_norm": 0.29731228947639465, + "learning_rate": 0.00017616317543364804, + "loss": 1.1028, + "mean_token_accuracy": 0.6887506395578384, + "num_tokens": 5516053.0, + "step": 651 + }, + { + "entropy": 1.110014021396637, + "epoch": 0.26082608260826085, + "grad_norm": 0.27744606137275696, + "learning_rate": 0.00017608407907615626, + "loss": 1.0957, + "mean_token_accuracy": 0.6962503045797348, + "num_tokens": 5524681.0, + "step": 652 + }, + { + "entropy": 1.1026263982057571, + "epoch": 0.26122612261226125, + "grad_norm": 0.28282374143600464, + "learning_rate": 0.00017600487180806796, + "loss": 1.0764, + "mean_token_accuracy": 0.6935568153858185, + "num_tokens": 5533370.0, + "step": 653 + }, + { + "entropy": 1.1099156141281128, + "epoch": 0.26162616261626165, + "grad_norm": 0.28605568408966064, + "learning_rate": 0.00017592555376231875, + "loss": 1.0807, + "mean_token_accuracy": 0.6910391598939896, + "num_tokens": 5541384.0, + "step": 654 + }, + { + "entropy": 1.092376857995987, + "epoch": 0.26202620262026205, + "grad_norm": 0.2738531231880188, + "learning_rate": 0.0001758461250720302, + "loss": 1.0773, + "mean_token_accuracy": 0.6990807056427002, + "num_tokens": 5550252.0, + "step": 655 + }, + { + "entropy": 1.0651516020298004, + "epoch": 0.26242624262426245, + "grad_norm": 0.2816525995731354, + "learning_rate": 0.00017576658587050933, + "loss": 1.0526, + "mean_token_accuracy": 0.6976151317358017, + "num_tokens": 5558861.0, + "step": 656 + }, + { + "entropy": 1.0657248795032501, + "epoch": 0.26282628262826285, + "grad_norm": 0.2783919870853424, + "learning_rate": 0.00017568693629124902, + "loss": 1.0721, + "mean_token_accuracy": 0.6979729235172272, + "num_tokens": 5567291.0, + "step": 657 + }, + { + "entropy": 1.083315372467041, + "epoch": 0.2632263226322632, + "grad_norm": 0.2877815067768097, + "learning_rate": 0.00017560717646792704, + "loss": 1.0905, + "mean_token_accuracy": 0.6918681263923645, + "num_tokens": 5575781.0, + "step": 658 + }, + { + "entropy": 1.1818140149116516, + "epoch": 0.2636263626362636, + "grad_norm": 0.2953146696090698, + "learning_rate": 0.00017552730653440642, + "loss": 1.1789, + "mean_token_accuracy": 0.6852588057518005, + "num_tokens": 5584446.0, + "step": 659 + }, + { + "entropy": 1.0801471322774887, + "epoch": 0.264026402640264, + "grad_norm": 0.2707977294921875, + "learning_rate": 0.00017544732662473484, + "loss": 1.0574, + "mean_token_accuracy": 0.7031010687351227, + "num_tokens": 5593177.0, + "step": 660 + }, + { + "entropy": 1.0957476496696472, + "epoch": 0.2644264426442644, + "grad_norm": 0.37613731622695923, + "learning_rate": 0.0001753672368731447, + "loss": 1.0782, + "mean_token_accuracy": 0.6951645314693451, + "num_tokens": 5602128.0, + "step": 661 + }, + { + "entropy": 1.1321073472499847, + "epoch": 0.2648264826482648, + "grad_norm": 0.29851654171943665, + "learning_rate": 0.00017528703741405264, + "loss": 1.1152, + "mean_token_accuracy": 0.6860791891813278, + "num_tokens": 5610626.0, + "step": 662 + }, + { + "entropy": 1.2013218402862549, + "epoch": 0.2652265226522652, + "grad_norm": 0.3014949560165405, + "learning_rate": 0.00017520672838205944, + "loss": 1.2006, + "mean_token_accuracy": 0.6706251055002213, + "num_tokens": 5618853.0, + "step": 663 + }, + { + "entropy": 1.0558192133903503, + "epoch": 0.2656265626562656, + "grad_norm": 0.28242382407188416, + "learning_rate": 0.00017512630991194978, + "loss": 1.0378, + "mean_token_accuracy": 0.7036908119916916, + "num_tokens": 5627482.0, + "step": 664 + }, + { + "entropy": 1.0262843817472458, + "epoch": 0.266026602660266, + "grad_norm": 0.28988927602767944, + "learning_rate": 0.0001750457821386921, + "loss": 1.077, + "mean_token_accuracy": 0.6931398808956146, + "num_tokens": 5636270.0, + "step": 665 + }, + { + "entropy": 1.095171958208084, + "epoch": 0.2664266426642664, + "grad_norm": 0.3045620918273926, + "learning_rate": 0.0001749651451974382, + "loss": 1.1113, + "mean_token_accuracy": 0.6824102252721786, + "num_tokens": 5644085.0, + "step": 666 + }, + { + "entropy": 1.127378910779953, + "epoch": 0.2668266826682668, + "grad_norm": 0.2747885286808014, + "learning_rate": 0.00017488439922352307, + "loss": 1.1299, + "mean_token_accuracy": 0.6829666942358017, + "num_tokens": 5652696.0, + "step": 667 + }, + { + "entropy": 1.0842707455158234, + "epoch": 0.2672267226722672, + "grad_norm": 0.26867473125457764, + "learning_rate": 0.00017480354435246478, + "loss": 1.0853, + "mean_token_accuracy": 0.690593883395195, + "num_tokens": 5661554.0, + "step": 668 + }, + { + "entropy": 1.112890213727951, + "epoch": 0.2676267626762676, + "grad_norm": 0.29059144854545593, + "learning_rate": 0.00017472258071996407, + "loss": 1.0908, + "mean_token_accuracy": 0.6880731582641602, + "num_tokens": 5670132.0, + "step": 669 + }, + { + "entropy": 1.145853877067566, + "epoch": 0.268026802680268, + "grad_norm": 0.29494479298591614, + "learning_rate": 0.00017464150846190437, + "loss": 1.1019, + "mean_token_accuracy": 0.6911070793867111, + "num_tokens": 5678381.0, + "step": 670 + }, + { + "entropy": 1.142566204071045, + "epoch": 0.2684268426842684, + "grad_norm": 0.28019067645072937, + "learning_rate": 0.00017456032771435122, + "loss": 1.124, + "mean_token_accuracy": 0.6836016476154327, + "num_tokens": 5687057.0, + "step": 671 + }, + { + "entropy": 1.0647007524967194, + "epoch": 0.2688268826882688, + "grad_norm": 0.2911562919616699, + "learning_rate": 0.00017447903861355239, + "loss": 1.0921, + "mean_token_accuracy": 0.6882173418998718, + "num_tokens": 5695515.0, + "step": 672 + }, + { + "entropy": 1.0366332828998566, + "epoch": 0.2692269226922692, + "grad_norm": 0.3142814040184021, + "learning_rate": 0.00017439764129593746, + "loss": 1.051, + "mean_token_accuracy": 0.7006321102380753, + "num_tokens": 5703354.0, + "step": 673 + }, + { + "entropy": 1.1360971927642822, + "epoch": 0.2696269626962696, + "grad_norm": 0.30167388916015625, + "learning_rate": 0.00017431613589811762, + "loss": 1.1498, + "mean_token_accuracy": 0.6700090169906616, + "num_tokens": 5711576.0, + "step": 674 + }, + { + "entropy": 1.0242177248001099, + "epoch": 0.27002700270027, + "grad_norm": 0.2788066267967224, + "learning_rate": 0.0001742345225568854, + "loss": 1.0338, + "mean_token_accuracy": 0.7015746533870697, + "num_tokens": 5720715.0, + "step": 675 + }, + { + "entropy": 1.1082607805728912, + "epoch": 0.2704270427042704, + "grad_norm": 0.2898354232311249, + "learning_rate": 0.00017415280140921463, + "loss": 1.1171, + "mean_token_accuracy": 0.6895153373479843, + "num_tokens": 5729509.0, + "step": 676 + }, + { + "entropy": 1.1076451241970062, + "epoch": 0.2708270827082708, + "grad_norm": 0.2778789699077606, + "learning_rate": 0.00017407097259225998, + "loss": 1.0859, + "mean_token_accuracy": 0.6947187334299088, + "num_tokens": 5738253.0, + "step": 677 + }, + { + "entropy": 1.0562281757593155, + "epoch": 0.27122712271227123, + "grad_norm": 0.26829174160957336, + "learning_rate": 0.00017398903624335683, + "loss": 1.0027, + "mean_token_accuracy": 0.7157648652791977, + "num_tokens": 5747099.0, + "step": 678 + }, + { + "entropy": 1.0607249587774277, + "epoch": 0.27162716271627163, + "grad_norm": 0.2683940827846527, + "learning_rate": 0.00017390699250002104, + "loss": 1.0232, + "mean_token_accuracy": 0.7041188776493073, + "num_tokens": 5755713.0, + "step": 679 + }, + { + "entropy": 1.0563473254442215, + "epoch": 0.27202720272027203, + "grad_norm": 0.2920173108577728, + "learning_rate": 0.0001738248414999487, + "loss": 1.0611, + "mean_token_accuracy": 0.6956310868263245, + "num_tokens": 5764388.0, + "step": 680 + }, + { + "entropy": 1.0579988360404968, + "epoch": 0.27242724272427243, + "grad_norm": 0.33409756422042847, + "learning_rate": 0.000173742583381016, + "loss": 1.0374, + "mean_token_accuracy": 0.7006884515285492, + "num_tokens": 5772727.0, + "step": 681 + }, + { + "entropy": 1.0571569800376892, + "epoch": 0.27282728272827284, + "grad_norm": 0.2924825847148895, + "learning_rate": 0.00017366021828127875, + "loss": 1.1006, + "mean_token_accuracy": 0.6947790086269379, + "num_tokens": 5780842.0, + "step": 682 + }, + { + "entropy": 1.0650258660316467, + "epoch": 0.27322732273227324, + "grad_norm": 0.28513267636299133, + "learning_rate": 0.00017357774633897248, + "loss": 1.0666, + "mean_token_accuracy": 0.690870001912117, + "num_tokens": 5789436.0, + "step": 683 + }, + { + "entropy": 1.1257043778896332, + "epoch": 0.27362736273627364, + "grad_norm": 0.28454840183258057, + "learning_rate": 0.00017349516769251194, + "loss": 1.1533, + "mean_token_accuracy": 0.6815509647130966, + "num_tokens": 5798195.0, + "step": 684 + }, + { + "entropy": 1.1244366765022278, + "epoch": 0.27402740274027404, + "grad_norm": 0.29272058606147766, + "learning_rate": 0.00017341248248049093, + "loss": 1.1247, + "mean_token_accuracy": 0.6854085922241211, + "num_tokens": 5806251.0, + "step": 685 + }, + { + "entropy": 1.0767890512943268, + "epoch": 0.27442744274427444, + "grad_norm": 0.2824472188949585, + "learning_rate": 0.00017332969084168215, + "loss": 1.0563, + "mean_token_accuracy": 0.7033595740795135, + "num_tokens": 5814255.0, + "step": 686 + }, + { + "entropy": 1.0219203233718872, + "epoch": 0.27482748274827484, + "grad_norm": 0.27691930532455444, + "learning_rate": 0.00017324679291503702, + "loss": 0.9937, + "mean_token_accuracy": 0.7168900966644287, + "num_tokens": 5823084.0, + "step": 687 + }, + { + "entropy": 1.1252110302448273, + "epoch": 0.27522752275227524, + "grad_norm": 0.2884853780269623, + "learning_rate": 0.00017316378883968514, + "loss": 1.1066, + "mean_token_accuracy": 0.6892745494842529, + "num_tokens": 5831387.0, + "step": 688 + }, + { + "entropy": 1.105349838733673, + "epoch": 0.27562756275627565, + "grad_norm": 0.2879672646522522, + "learning_rate": 0.0001730806787549344, + "loss": 1.1072, + "mean_token_accuracy": 0.6888235807418823, + "num_tokens": 5839551.0, + "step": 689 + }, + { + "entropy": 1.131139874458313, + "epoch": 0.27602760276027605, + "grad_norm": 0.30160000920295715, + "learning_rate": 0.00017299746280027058, + "loss": 1.1623, + "mean_token_accuracy": 0.6782182604074478, + "num_tokens": 5847254.0, + "step": 690 + }, + { + "entropy": 1.0617091208696365, + "epoch": 0.27642764276427645, + "grad_norm": 0.2791113257408142, + "learning_rate": 0.00017291414111535717, + "loss": 1.0536, + "mean_token_accuracy": 0.6977207660675049, + "num_tokens": 5855858.0, + "step": 691 + }, + { + "entropy": 1.1499719023704529, + "epoch": 0.27682768276827685, + "grad_norm": 0.2876189649105072, + "learning_rate": 0.00017283071384003505, + "loss": 1.1294, + "mean_token_accuracy": 0.6871919184923172, + "num_tokens": 5864094.0, + "step": 692 + }, + { + "entropy": 1.1329189538955688, + "epoch": 0.27722772277227725, + "grad_norm": 0.2858215570449829, + "learning_rate": 0.00017274718111432236, + "loss": 1.1429, + "mean_token_accuracy": 0.6772816032171249, + "num_tokens": 5872247.0, + "step": 693 + }, + { + "entropy": 1.1265292763710022, + "epoch": 0.27762776277627765, + "grad_norm": 0.3134947121143341, + "learning_rate": 0.00017266354307841415, + "loss": 1.0869, + "mean_token_accuracy": 0.6974654197692871, + "num_tokens": 5880536.0, + "step": 694 + }, + { + "entropy": 1.0841507613658905, + "epoch": 0.27802780278027805, + "grad_norm": 0.6050116419792175, + "learning_rate": 0.00017257979987268235, + "loss": 1.0632, + "mean_token_accuracy": 0.6975610852241516, + "num_tokens": 5889501.0, + "step": 695 + }, + { + "entropy": 1.1113188862800598, + "epoch": 0.2784278427842784, + "grad_norm": 0.2796708047389984, + "learning_rate": 0.00017249595163767526, + "loss": 1.0941, + "mean_token_accuracy": 0.684520959854126, + "num_tokens": 5898282.0, + "step": 696 + }, + { + "entropy": 1.1131321787834167, + "epoch": 0.2788278827882788, + "grad_norm": 0.28280454874038696, + "learning_rate": 0.00017241199851411755, + "loss": 1.0935, + "mean_token_accuracy": 0.6851336359977722, + "num_tokens": 5906844.0, + "step": 697 + }, + { + "entropy": 1.0409544110298157, + "epoch": 0.2792279227922792, + "grad_norm": 0.2832724153995514, + "learning_rate": 0.0001723279406429099, + "loss": 1.0492, + "mean_token_accuracy": 0.7042761296033859, + "num_tokens": 5915393.0, + "step": 698 + }, + { + "entropy": 1.0621929168701172, + "epoch": 0.2796279627962796, + "grad_norm": 0.29447096586227417, + "learning_rate": 0.0001722437781651287, + "loss": 1.0545, + "mean_token_accuracy": 0.6977622658014297, + "num_tokens": 5923692.0, + "step": 699 + }, + { + "entropy": 1.1395678222179413, + "epoch": 0.28002800280028, + "grad_norm": 0.31043651700019836, + "learning_rate": 0.0001721595112220261, + "loss": 1.138, + "mean_token_accuracy": 0.6795233935117722, + "num_tokens": 5931810.0, + "step": 700 + }, + { + "entropy": 1.027369812130928, + "epoch": 0.2804280428042804, + "grad_norm": 0.2683529853820801, + "learning_rate": 0.00017207513995502939, + "loss": 1.0294, + "mean_token_accuracy": 0.7050721347332001, + "num_tokens": 5940621.0, + "step": 701 + }, + { + "entropy": 1.0491604506969452, + "epoch": 0.2808280828082808, + "grad_norm": 0.32895132899284363, + "learning_rate": 0.00017199066450574103, + "loss": 1.0606, + "mean_token_accuracy": 0.7005548626184464, + "num_tokens": 5949502.0, + "step": 702 + }, + { + "entropy": 1.0781410187482834, + "epoch": 0.2812281228122812, + "grad_norm": 0.28801974654197693, + "learning_rate": 0.00017190608501593832, + "loss": 1.0628, + "mean_token_accuracy": 0.6932818591594696, + "num_tokens": 5957775.0, + "step": 703 + }, + { + "entropy": 1.0721731036901474, + "epoch": 0.2816281628162816, + "grad_norm": 0.29104089736938477, + "learning_rate": 0.00017182140162757317, + "loss": 1.064, + "mean_token_accuracy": 0.6989105641841888, + "num_tokens": 5966616.0, + "step": 704 + }, + { + "entropy": 1.1266924142837524, + "epoch": 0.282028202820282, + "grad_norm": 0.6884987950325012, + "learning_rate": 0.0001717366144827719, + "loss": 1.1258, + "mean_token_accuracy": 0.6810166537761688, + "num_tokens": 5974939.0, + "step": 705 + }, + { + "entropy": 1.0985174477100372, + "epoch": 0.2824282428242824, + "grad_norm": 0.28093576431274414, + "learning_rate": 0.00017165172372383492, + "loss": 1.1116, + "mean_token_accuracy": 0.6828480958938599, + "num_tokens": 5983292.0, + "step": 706 + }, + { + "entropy": 1.0656504034996033, + "epoch": 0.2828282828282828, + "grad_norm": 0.2942239046096802, + "learning_rate": 0.00017156672949323657, + "loss": 1.0699, + "mean_token_accuracy": 0.6981309652328491, + "num_tokens": 5991826.0, + "step": 707 + }, + { + "entropy": 1.1407359838485718, + "epoch": 0.2832283228322832, + "grad_norm": 0.29357463121414185, + "learning_rate": 0.00017148163193362483, + "loss": 1.1394, + "mean_token_accuracy": 0.6824936717748642, + "num_tokens": 5999949.0, + "step": 708 + }, + { + "entropy": 1.0804316103458405, + "epoch": 0.2836283628362836, + "grad_norm": 0.2689114212989807, + "learning_rate": 0.00017139643118782108, + "loss": 1.0767, + "mean_token_accuracy": 0.6897957473993301, + "num_tokens": 6008835.0, + "step": 709 + }, + { + "entropy": 1.101131021976471, + "epoch": 0.284028402840284, + "grad_norm": 0.28493279218673706, + "learning_rate": 0.00017131112739881996, + "loss": 1.1052, + "mean_token_accuracy": 0.6911808103322983, + "num_tokens": 6017021.0, + "step": 710 + }, + { + "entropy": 1.1580690145492554, + "epoch": 0.2844284428442844, + "grad_norm": 0.3016761243343353, + "learning_rate": 0.00017122572070978894, + "loss": 1.1283, + "mean_token_accuracy": 0.686494454741478, + "num_tokens": 6025161.0, + "step": 711 + }, + { + "entropy": 1.1091115474700928, + "epoch": 0.2848284828482848, + "grad_norm": 0.2701038420200348, + "learning_rate": 0.00017114021126406827, + "loss": 1.0878, + "mean_token_accuracy": 0.6891522109508514, + "num_tokens": 6034151.0, + "step": 712 + }, + { + "entropy": 1.1154497265815735, + "epoch": 0.2852285228522852, + "grad_norm": 0.30466794967651367, + "learning_rate": 0.00017105459920517064, + "loss": 1.0921, + "mean_token_accuracy": 0.685915395617485, + "num_tokens": 6042283.0, + "step": 713 + }, + { + "entropy": 1.0915995389223099, + "epoch": 0.28562856285628563, + "grad_norm": 0.2853371202945709, + "learning_rate": 0.000170968884676781, + "loss": 1.0873, + "mean_token_accuracy": 0.6917872279882431, + "num_tokens": 6050337.0, + "step": 714 + }, + { + "entropy": 1.1279576122760773, + "epoch": 0.28602860286028603, + "grad_norm": 0.29733631014823914, + "learning_rate": 0.0001708830678227561, + "loss": 1.1515, + "mean_token_accuracy": 0.6802446395158768, + "num_tokens": 6058694.0, + "step": 715 + }, + { + "entropy": 1.0887485444545746, + "epoch": 0.28642864286428643, + "grad_norm": 0.30301228165626526, + "learning_rate": 0.00017079714878712467, + "loss": 1.0889, + "mean_token_accuracy": 0.6893156468868256, + "num_tokens": 6067275.0, + "step": 716 + }, + { + "entropy": 1.072039246559143, + "epoch": 0.28682868286828683, + "grad_norm": 0.29240816831588745, + "learning_rate": 0.0001707111277140868, + "loss": 1.0517, + "mean_token_accuracy": 0.697363555431366, + "num_tokens": 6075637.0, + "step": 717 + }, + { + "entropy": 1.1270709931850433, + "epoch": 0.28722872287228723, + "grad_norm": 0.3018655776977539, + "learning_rate": 0.00017062500474801384, + "loss": 1.0939, + "mean_token_accuracy": 0.6922550350427628, + "num_tokens": 6083361.0, + "step": 718 + }, + { + "entropy": 1.0984892398118973, + "epoch": 0.28762876287628764, + "grad_norm": 0.29802289605140686, + "learning_rate": 0.00017053878003344815, + "loss": 1.0963, + "mean_token_accuracy": 0.6809424310922623, + "num_tokens": 6091835.0, + "step": 719 + }, + { + "entropy": 1.1713212132453918, + "epoch": 0.28802880288028804, + "grad_norm": 0.33682453632354736, + "learning_rate": 0.00017045245371510287, + "loss": 1.1858, + "mean_token_accuracy": 0.6739713847637177, + "num_tokens": 6099900.0, + "step": 720 + }, + { + "entropy": 1.122830256819725, + "epoch": 0.28842884288428844, + "grad_norm": 0.3071252107620239, + "learning_rate": 0.0001703660259378617, + "loss": 1.1379, + "mean_token_accuracy": 0.6854307949542999, + "num_tokens": 6108329.0, + "step": 721 + }, + { + "entropy": 1.0424728989601135, + "epoch": 0.28882888288828884, + "grad_norm": 0.30790361762046814, + "learning_rate": 0.00017027949684677855, + "loss": 1.0526, + "mean_token_accuracy": 0.6959585100412369, + "num_tokens": 6116855.0, + "step": 722 + }, + { + "entropy": 1.0758645981550217, + "epoch": 0.28922892289228924, + "grad_norm": 0.2846760153770447, + "learning_rate": 0.0001701928665870774, + "loss": 1.0731, + "mean_token_accuracy": 0.6950832605361938, + "num_tokens": 6125086.0, + "step": 723 + }, + { + "entropy": 1.0865240395069122, + "epoch": 0.28962896289628964, + "grad_norm": 0.2850196361541748, + "learning_rate": 0.00017010613530415205, + "loss": 1.0926, + "mean_token_accuracy": 0.6894428730010986, + "num_tokens": 6133780.0, + "step": 724 + }, + { + "entropy": 1.0899761021137238, + "epoch": 0.29002900290029004, + "grad_norm": 0.28621774911880493, + "learning_rate": 0.0001700193031435658, + "loss": 1.1001, + "mean_token_accuracy": 0.6898534744977951, + "num_tokens": 6142262.0, + "step": 725 + }, + { + "entropy": 1.1941851675510406, + "epoch": 0.29042904290429045, + "grad_norm": 0.3125251233577728, + "learning_rate": 0.0001699323702510513, + "loss": 1.1832, + "mean_token_accuracy": 0.6761420220136642, + "num_tokens": 6150245.0, + "step": 726 + }, + { + "entropy": 1.1271757185459137, + "epoch": 0.29082908290829085, + "grad_norm": 0.28578904271125793, + "learning_rate": 0.00016984533677251025, + "loss": 1.0737, + "mean_token_accuracy": 0.6905937194824219, + "num_tokens": 6159218.0, + "step": 727 + }, + { + "entropy": 1.098642647266388, + "epoch": 0.29122912291229125, + "grad_norm": 0.2851860225200653, + "learning_rate": 0.00016975820285401312, + "loss": 1.0452, + "mean_token_accuracy": 0.7096526771783829, + "num_tokens": 6167338.0, + "step": 728 + }, + { + "entropy": 1.1110363900661469, + "epoch": 0.29162916291629165, + "grad_norm": 0.2929728329181671, + "learning_rate": 0.000169670968641799, + "loss": 1.1248, + "mean_token_accuracy": 0.6792106926441193, + "num_tokens": 6176005.0, + "step": 729 + }, + { + "entropy": 1.1245019137859344, + "epoch": 0.29202920292029205, + "grad_norm": 0.3063944876194, + "learning_rate": 0.00016958363428227536, + "loss": 1.1492, + "mean_token_accuracy": 0.6829055845737457, + "num_tokens": 6184025.0, + "step": 730 + }, + { + "entropy": 1.1318716704845428, + "epoch": 0.29242924292429245, + "grad_norm": 0.29579731822013855, + "learning_rate": 0.0001694961999220176, + "loss": 1.1696, + "mean_token_accuracy": 0.6744921058416367, + "num_tokens": 6192317.0, + "step": 731 + }, + { + "entropy": 1.010225847363472, + "epoch": 0.29282928292829286, + "grad_norm": 0.2861863672733307, + "learning_rate": 0.00016940866570776904, + "loss": 1.0189, + "mean_token_accuracy": 0.7033442556858063, + "num_tokens": 6200610.0, + "step": 732 + }, + { + "entropy": 1.0887831151485443, + "epoch": 0.29322932293229326, + "grad_norm": 0.28155192732810974, + "learning_rate": 0.00016932103178644064, + "loss": 1.1011, + "mean_token_accuracy": 0.6909735649824142, + "num_tokens": 6208897.0, + "step": 733 + }, + { + "entropy": 1.1203160285949707, + "epoch": 0.2936293629362936, + "grad_norm": 0.28371837735176086, + "learning_rate": 0.00016923329830511062, + "loss": 1.1141, + "mean_token_accuracy": 0.693743571639061, + "num_tokens": 6218081.0, + "step": 734 + }, + { + "entropy": 1.14133021235466, + "epoch": 0.294029402940294, + "grad_norm": 0.320532888174057, + "learning_rate": 0.00016914546541102433, + "loss": 1.0939, + "mean_token_accuracy": 0.6943132728338242, + "num_tokens": 6226339.0, + "step": 735 + }, + { + "entropy": 1.149755209684372, + "epoch": 0.2944294429442944, + "grad_norm": 0.30331504344940186, + "learning_rate": 0.00016905753325159397, + "loss": 1.1189, + "mean_token_accuracy": 0.6852170079946518, + "num_tokens": 6234335.0, + "step": 736 + }, + { + "entropy": 1.1070905029773712, + "epoch": 0.2948294829482948, + "grad_norm": 0.29018259048461914, + "learning_rate": 0.0001689695019743983, + "loss": 1.1001, + "mean_token_accuracy": 0.6850669682025909, + "num_tokens": 6242493.0, + "step": 737 + }, + { + "entropy": 1.1114173233509064, + "epoch": 0.2952295229522952, + "grad_norm": 0.29160076379776, + "learning_rate": 0.00016888137172718244, + "loss": 1.0858, + "mean_token_accuracy": 0.6926248967647552, + "num_tokens": 6250672.0, + "step": 738 + }, + { + "entropy": 1.1600793302059174, + "epoch": 0.2956295629562956, + "grad_norm": 0.3031260371208191, + "learning_rate": 0.00016879314265785765, + "loss": 1.1883, + "mean_token_accuracy": 0.6790725439786911, + "num_tokens": 6258626.0, + "step": 739 + }, + { + "entropy": 1.0873356759548187, + "epoch": 0.296029602960296, + "grad_norm": 0.28583449125289917, + "learning_rate": 0.0001687048149145011, + "loss": 1.0791, + "mean_token_accuracy": 0.692878320813179, + "num_tokens": 6266650.0, + "step": 740 + }, + { + "entropy": 1.099444955587387, + "epoch": 0.2964296429642964, + "grad_norm": 0.2827002704143524, + "learning_rate": 0.00016861638864535539, + "loss": 1.1162, + "mean_token_accuracy": 0.6850938946008682, + "num_tokens": 6275680.0, + "step": 741 + }, + { + "entropy": 1.134610891342163, + "epoch": 0.2968296829682968, + "grad_norm": 0.2991955280303955, + "learning_rate": 0.00016852786399882864, + "loss": 1.1229, + "mean_token_accuracy": 0.6870722472667694, + "num_tokens": 6283858.0, + "step": 742 + }, + { + "entropy": 1.0856669247150421, + "epoch": 0.2972297229722972, + "grad_norm": 0.28329774737358093, + "learning_rate": 0.00016843924112349402, + "loss": 1.07, + "mean_token_accuracy": 0.6930081695318222, + "num_tokens": 6292296.0, + "step": 743 + }, + { + "entropy": 1.1310625076293945, + "epoch": 0.2976297629762976, + "grad_norm": 0.28257298469543457, + "learning_rate": 0.00016835052016808954, + "loss": 1.102, + "mean_token_accuracy": 0.6885871589183807, + "num_tokens": 6300757.0, + "step": 744 + }, + { + "entropy": 1.1257978677749634, + "epoch": 0.298029802980298, + "grad_norm": 0.2671114504337311, + "learning_rate": 0.0001682617012815179, + "loss": 1.1301, + "mean_token_accuracy": 0.6859237104654312, + "num_tokens": 6309897.0, + "step": 745 + }, + { + "entropy": 1.0704855620861053, + "epoch": 0.2984298429842984, + "grad_norm": 0.2760133743286133, + "learning_rate": 0.00016817278461284604, + "loss": 1.0525, + "mean_token_accuracy": 0.6995566487312317, + "num_tokens": 6318736.0, + "step": 746 + }, + { + "entropy": 1.093798190355301, + "epoch": 0.2988298829882988, + "grad_norm": 0.28297320008277893, + "learning_rate": 0.00016808377031130506, + "loss": 1.0673, + "mean_token_accuracy": 0.6956609338521957, + "num_tokens": 6327224.0, + "step": 747 + }, + { + "entropy": 1.041995644569397, + "epoch": 0.2992299229922992, + "grad_norm": 0.2877747416496277, + "learning_rate": 0.00016799465852629, + "loss": 1.0494, + "mean_token_accuracy": 0.7076048702001572, + "num_tokens": 6335509.0, + "step": 748 + }, + { + "entropy": 1.0487827211618423, + "epoch": 0.2996299629962996, + "grad_norm": 0.2713603973388672, + "learning_rate": 0.00016790544940735946, + "loss": 1.0288, + "mean_token_accuracy": 0.7073977887630463, + "num_tokens": 6344202.0, + "step": 749 + }, + { + "entropy": 1.0660082548856735, + "epoch": 0.3000300030003, + "grad_norm": 0.2747448682785034, + "learning_rate": 0.0001678161431042353, + "loss": 1.0377, + "mean_token_accuracy": 0.7035159468650818, + "num_tokens": 6352913.0, + "step": 750 + }, + { + "entropy": 1.0016452819108963, + "epoch": 0.30043004300430043, + "grad_norm": 0.2882155776023865, + "learning_rate": 0.0001677267397668026, + "loss": 0.9705, + "mean_token_accuracy": 0.7112107425928116, + "num_tokens": 6360938.0, + "step": 751 + }, + { + "entropy": 1.1216872036457062, + "epoch": 0.30083008300830083, + "grad_norm": 0.3194028437137604, + "learning_rate": 0.00016763723954510927, + "loss": 1.1309, + "mean_token_accuracy": 0.6844885051250458, + "num_tokens": 6368453.0, + "step": 752 + }, + { + "entropy": 1.1009182035923004, + "epoch": 0.30123012301230123, + "grad_norm": 0.2956380546092987, + "learning_rate": 0.00016754764258936585, + "loss": 1.1303, + "mean_token_accuracy": 0.6881340146064758, + "num_tokens": 6376852.0, + "step": 753 + }, + { + "entropy": 1.0586482286453247, + "epoch": 0.30163016301630163, + "grad_norm": 0.2933512330055237, + "learning_rate": 0.0001674579490499451, + "loss": 1.1076, + "mean_token_accuracy": 0.6953225135803223, + "num_tokens": 6385605.0, + "step": 754 + }, + { + "entropy": 1.0930490493774414, + "epoch": 0.30203020302030203, + "grad_norm": 0.2938788831233978, + "learning_rate": 0.0001673681590773821, + "loss": 1.1208, + "mean_token_accuracy": 0.6880796253681183, + "num_tokens": 6393893.0, + "step": 755 + }, + { + "entropy": 1.1133641004562378, + "epoch": 0.30243024302430244, + "grad_norm": 0.2956829369068146, + "learning_rate": 0.00016727827282237353, + "loss": 1.1267, + "mean_token_accuracy": 0.6788761764764786, + "num_tokens": 6401834.0, + "step": 756 + }, + { + "entropy": 1.0259417593479156, + "epoch": 0.30283028302830284, + "grad_norm": 0.27117884159088135, + "learning_rate": 0.0001671882904357779, + "loss": 0.9955, + "mean_token_accuracy": 0.7157447934150696, + "num_tokens": 6410323.0, + "step": 757 + }, + { + "entropy": 1.1092566549777985, + "epoch": 0.30323032303230324, + "grad_norm": 0.27452918887138367, + "learning_rate": 0.00016709821206861486, + "loss": 1.0647, + "mean_token_accuracy": 0.6969586163759232, + "num_tokens": 6418684.0, + "step": 758 + }, + { + "entropy": 1.0679335445165634, + "epoch": 0.30363036303630364, + "grad_norm": 0.289108008146286, + "learning_rate": 0.00016700803787206528, + "loss": 1.0644, + "mean_token_accuracy": 0.6940675526857376, + "num_tokens": 6426713.0, + "step": 759 + }, + { + "entropy": 1.1053009331226349, + "epoch": 0.30403040304030404, + "grad_norm": 0.26713645458221436, + "learning_rate": 0.0001669177679974708, + "loss": 1.1088, + "mean_token_accuracy": 0.6891184747219086, + "num_tokens": 6435379.0, + "step": 760 + }, + { + "entropy": 1.041573852300644, + "epoch": 0.30443044304430444, + "grad_norm": 0.25917789340019226, + "learning_rate": 0.00016682740259633367, + "loss": 1.0105, + "mean_token_accuracy": 0.70865198969841, + "num_tokens": 6444521.0, + "step": 761 + }, + { + "entropy": 1.1321850717067719, + "epoch": 0.30483048304830485, + "grad_norm": 0.27994304895401, + "learning_rate": 0.00016673694182031635, + "loss": 1.1047, + "mean_token_accuracy": 0.6865262985229492, + "num_tokens": 6452940.0, + "step": 762 + }, + { + "entropy": 1.110209971666336, + "epoch": 0.30523052305230525, + "grad_norm": 0.28705158829689026, + "learning_rate": 0.00016664638582124166, + "loss": 1.127, + "mean_token_accuracy": 0.6904176324605942, + "num_tokens": 6461829.0, + "step": 763 + }, + { + "entropy": 1.0719735622406006, + "epoch": 0.30563056305630565, + "grad_norm": 0.28352898359298706, + "learning_rate": 0.0001665557347510919, + "loss": 1.1221, + "mean_token_accuracy": 0.6906540542840958, + "num_tokens": 6470104.0, + "step": 764 + }, + { + "entropy": 1.062385305762291, + "epoch": 0.30603060306030605, + "grad_norm": 0.29124680161476135, + "learning_rate": 0.0001664649887620091, + "loss": 1.0674, + "mean_token_accuracy": 0.6923017352819443, + "num_tokens": 6478093.0, + "step": 765 + }, + { + "entropy": 1.0240153968334198, + "epoch": 0.30643064306430645, + "grad_norm": 0.28491702675819397, + "learning_rate": 0.0001663741480062946, + "loss": 1.0333, + "mean_token_accuracy": 0.70469731092453, + "num_tokens": 6486649.0, + "step": 766 + }, + { + "entropy": 1.0592333674430847, + "epoch": 0.30683068306830685, + "grad_norm": 0.2836754620075226, + "learning_rate": 0.0001662832126364087, + "loss": 1.0646, + "mean_token_accuracy": 0.7026871144771576, + "num_tokens": 6494743.0, + "step": 767 + }, + { + "entropy": 1.094917356967926, + "epoch": 0.30723072307230725, + "grad_norm": 0.2856273353099823, + "learning_rate": 0.0001661921828049706, + "loss": 1.0838, + "mean_token_accuracy": 0.6910727024078369, + "num_tokens": 6503286.0, + "step": 768 + }, + { + "entropy": 1.1565030813217163, + "epoch": 0.30763076307630766, + "grad_norm": 0.2800493538379669, + "learning_rate": 0.00016610105866475804, + "loss": 1.1374, + "mean_token_accuracy": 0.6781891137361526, + "num_tokens": 6511830.0, + "step": 769 + }, + { + "entropy": 1.1632217466831207, + "epoch": 0.30803080308030806, + "grad_norm": 0.2924068570137024, + "learning_rate": 0.0001660098403687069, + "loss": 1.1591, + "mean_token_accuracy": 0.6806275993585587, + "num_tokens": 6520491.0, + "step": 770 + }, + { + "entropy": 1.086751252412796, + "epoch": 0.3084308430843084, + "grad_norm": 0.2794789969921112, + "learning_rate": 0.00016591852806991118, + "loss": 1.0644, + "mean_token_accuracy": 0.6998393088579178, + "num_tokens": 6528906.0, + "step": 771 + }, + { + "entropy": 1.149342566728592, + "epoch": 0.3088308830883088, + "grad_norm": 0.30849379301071167, + "learning_rate": 0.00016582712192162268, + "loss": 1.1226, + "mean_token_accuracy": 0.6808988600969315, + "num_tokens": 6537265.0, + "step": 772 + }, + { + "entropy": 1.1131223142147064, + "epoch": 0.3092309230923092, + "grad_norm": 0.29778435826301575, + "learning_rate": 0.00016573562207725068, + "loss": 1.128, + "mean_token_accuracy": 0.6877892315387726, + "num_tokens": 6545129.0, + "step": 773 + }, + { + "entropy": 1.0878064036369324, + "epoch": 0.3096309630963096, + "grad_norm": 0.28501614928245544, + "learning_rate": 0.00016564402869036167, + "loss": 1.0878, + "mean_token_accuracy": 0.6953048408031464, + "num_tokens": 6553283.0, + "step": 774 + }, + { + "entropy": 1.045453280210495, + "epoch": 0.31003100310031, + "grad_norm": 0.2768403887748718, + "learning_rate": 0.00016555234191467918, + "loss": 1.0292, + "mean_token_accuracy": 0.697388768196106, + "num_tokens": 6561455.0, + "step": 775 + }, + { + "entropy": 1.1227385699748993, + "epoch": 0.3104310431043104, + "grad_norm": 0.29729950428009033, + "learning_rate": 0.0001654605619040835, + "loss": 1.1477, + "mean_token_accuracy": 0.6787715554237366, + "num_tokens": 6570138.0, + "step": 776 + }, + { + "entropy": 1.064333364367485, + "epoch": 0.3108310831083108, + "grad_norm": 0.27092444896698, + "learning_rate": 0.00016536868881261124, + "loss": 1.0404, + "mean_token_accuracy": 0.7003030031919479, + "num_tokens": 6578952.0, + "step": 777 + }, + { + "entropy": 1.124427080154419, + "epoch": 0.3112311231123112, + "grad_norm": 0.2813462018966675, + "learning_rate": 0.0001652767227944555, + "loss": 1.0937, + "mean_token_accuracy": 0.6872838139533997, + "num_tokens": 6586835.0, + "step": 778 + }, + { + "entropy": 1.0890995860099792, + "epoch": 0.3116311631163116, + "grad_norm": 0.2971119284629822, + "learning_rate": 0.00016518466400396517, + "loss": 1.0661, + "mean_token_accuracy": 0.7000496536493301, + "num_tokens": 6595159.0, + "step": 779 + }, + { + "entropy": 1.1272623836994171, + "epoch": 0.312031203120312, + "grad_norm": 0.28871408104896545, + "learning_rate": 0.00016509251259564482, + "loss": 1.129, + "mean_token_accuracy": 0.6788710057735443, + "num_tokens": 6603340.0, + "step": 780 + }, + { + "entropy": 1.0659702718257904, + "epoch": 0.3124312431243124, + "grad_norm": 0.29359593987464905, + "learning_rate": 0.00016500026872415453, + "loss": 1.0992, + "mean_token_accuracy": 0.6883674561977386, + "num_tokens": 6611730.0, + "step": 781 + }, + { + "entropy": 0.9968152344226837, + "epoch": 0.3128312831283128, + "grad_norm": 0.2773456871509552, + "learning_rate": 0.00016490793254430954, + "loss": 1.0233, + "mean_token_accuracy": 0.7069334089756012, + "num_tokens": 6620199.0, + "step": 782 + }, + { + "entropy": 1.1206261813640594, + "epoch": 0.3132313231323132, + "grad_norm": 0.2897351384162903, + "learning_rate": 0.00016481550421108002, + "loss": 1.1346, + "mean_token_accuracy": 0.6881282329559326, + "num_tokens": 6628435.0, + "step": 783 + }, + { + "entropy": 1.0389655977487564, + "epoch": 0.3136313631363136, + "grad_norm": 0.26741456985473633, + "learning_rate": 0.0001647229838795908, + "loss": 1.0191, + "mean_token_accuracy": 0.7035332918167114, + "num_tokens": 6637235.0, + "step": 784 + }, + { + "entropy": 1.0993463099002838, + "epoch": 0.314031403140314, + "grad_norm": 0.31597909331321716, + "learning_rate": 0.0001646303717051211, + "loss": 1.0889, + "mean_token_accuracy": 0.6884125471115112, + "num_tokens": 6645716.0, + "step": 785 + }, + { + "entropy": 1.1081691682338715, + "epoch": 0.3144314431443144, + "grad_norm": 0.2796851694583893, + "learning_rate": 0.0001645376678431043, + "loss": 1.076, + "mean_token_accuracy": 0.6917168349027634, + "num_tokens": 6654145.0, + "step": 786 + }, + { + "entropy": 1.1828051209449768, + "epoch": 0.31483148314831483, + "grad_norm": 0.28401651978492737, + "learning_rate": 0.00016444487244912773, + "loss": 1.1821, + "mean_token_accuracy": 0.6732564568519592, + "num_tokens": 6662504.0, + "step": 787 + }, + { + "entropy": 1.128604382276535, + "epoch": 0.31523152315231523, + "grad_norm": 0.27873510122299194, + "learning_rate": 0.00016435198567893216, + "loss": 1.118, + "mean_token_accuracy": 0.6792010068893433, + "num_tokens": 6671479.0, + "step": 788 + }, + { + "entropy": 1.1247271299362183, + "epoch": 0.31563156315631563, + "grad_norm": 0.2788650393486023, + "learning_rate": 0.00016425900768841188, + "loss": 1.1009, + "mean_token_accuracy": 0.691927507519722, + "num_tokens": 6680257.0, + "step": 789 + }, + { + "entropy": 1.0758639425039291, + "epoch": 0.31603160316031603, + "grad_norm": 0.28194695711135864, + "learning_rate": 0.0001641659386336142, + "loss": 1.0888, + "mean_token_accuracy": 0.6895868927240372, + "num_tokens": 6688483.0, + "step": 790 + }, + { + "entropy": 1.0651999115943909, + "epoch": 0.31643164316431643, + "grad_norm": 0.2820742130279541, + "learning_rate": 0.00016407277867073935, + "loss": 1.0638, + "mean_token_accuracy": 0.7065901160240173, + "num_tokens": 6696578.0, + "step": 791 + }, + { + "entropy": 1.09292770922184, + "epoch": 0.31683168316831684, + "grad_norm": 0.3139644265174866, + "learning_rate": 0.00016397952795614, + "loss": 1.087, + "mean_token_accuracy": 0.6921844035387039, + "num_tokens": 6704077.0, + "step": 792 + }, + { + "entropy": 1.0606204271316528, + "epoch": 0.31723172317231724, + "grad_norm": 0.2857057452201843, + "learning_rate": 0.00016388618664632122, + "loss": 1.0396, + "mean_token_accuracy": 0.7051282078027725, + "num_tokens": 6712476.0, + "step": 793 + }, + { + "entropy": 1.0821565091609955, + "epoch": 0.31763176317631764, + "grad_norm": 0.3205065429210663, + "learning_rate": 0.00016379275489794014, + "loss": 1.0877, + "mean_token_accuracy": 0.6946270614862442, + "num_tokens": 6721063.0, + "step": 794 + }, + { + "entropy": 0.9861936420202255, + "epoch": 0.31803180318031804, + "grad_norm": 0.27878350019454956, + "learning_rate": 0.00016369923286780564, + "loss": 0.99, + "mean_token_accuracy": 0.7179621458053589, + "num_tokens": 6730152.0, + "step": 795 + }, + { + "entropy": 1.039843127131462, + "epoch": 0.31843184318431844, + "grad_norm": 0.280078262090683, + "learning_rate": 0.00016360562071287814, + "loss": 1.0454, + "mean_token_accuracy": 0.7031548768281937, + "num_tokens": 6738925.0, + "step": 796 + }, + { + "entropy": 1.1295160055160522, + "epoch": 0.31883188318831884, + "grad_norm": 0.283812552690506, + "learning_rate": 0.0001635119185902693, + "loss": 1.1148, + "mean_token_accuracy": 0.6886333674192429, + "num_tokens": 6747506.0, + "step": 797 + }, + { + "entropy": 1.0683157593011856, + "epoch": 0.31923192319231924, + "grad_norm": 0.27449098229408264, + "learning_rate": 0.00016341812665724174, + "loss": 1.0546, + "mean_token_accuracy": 0.7023375034332275, + "num_tokens": 6755942.0, + "step": 798 + }, + { + "entropy": 1.1760147511959076, + "epoch": 0.31963196319631965, + "grad_norm": 0.47003164887428284, + "learning_rate": 0.00016332424507120892, + "loss": 1.1797, + "mean_token_accuracy": 0.6720208674669266, + "num_tokens": 6764505.0, + "step": 799 + }, + { + "entropy": 1.0772924423217773, + "epoch": 0.32003200320032005, + "grad_norm": 0.27927297353744507, + "learning_rate": 0.00016323027398973468, + "loss": 1.0921, + "mean_token_accuracy": 0.6903192400932312, + "num_tokens": 6773032.0, + "step": 800 + }, + { + "entropy": 1.1052745580673218, + "epoch": 0.32043204320432045, + "grad_norm": 0.27928632497787476, + "learning_rate": 0.00016313621357053306, + "loss": 1.1058, + "mean_token_accuracy": 0.6886380016803741, + "num_tokens": 6781616.0, + "step": 801 + }, + { + "entropy": 1.1271063834428787, + "epoch": 0.32083208320832085, + "grad_norm": 0.2914389967918396, + "learning_rate": 0.00016304206397146806, + "loss": 1.086, + "mean_token_accuracy": 0.694695845246315, + "num_tokens": 6789756.0, + "step": 802 + }, + { + "entropy": 1.040708601474762, + "epoch": 0.32123212321232125, + "grad_norm": 0.27147534489631653, + "learning_rate": 0.00016294782535055343, + "loss": 1.0009, + "mean_token_accuracy": 0.7129931598901749, + "num_tokens": 6798848.0, + "step": 803 + }, + { + "entropy": 1.0811657011508942, + "epoch": 0.32163216321632165, + "grad_norm": 0.3022315204143524, + "learning_rate": 0.00016285349786595215, + "loss": 1.0647, + "mean_token_accuracy": 0.6991878598928452, + "num_tokens": 6806919.0, + "step": 804 + }, + { + "entropy": 1.135229378938675, + "epoch": 0.32203220322032206, + "grad_norm": 0.29291465878486633, + "learning_rate": 0.0001627590816759765, + "loss": 1.1024, + "mean_token_accuracy": 0.6896921247243881, + "num_tokens": 6814990.0, + "step": 805 + }, + { + "entropy": 1.0377559065818787, + "epoch": 0.32243224322432246, + "grad_norm": 0.27473223209381104, + "learning_rate": 0.00016266457693908762, + "loss": 1.0429, + "mean_token_accuracy": 0.7063091546297073, + "num_tokens": 6823272.0, + "step": 806 + }, + { + "entropy": 1.0943911969661713, + "epoch": 0.32283228322832286, + "grad_norm": 0.29321154952049255, + "learning_rate": 0.00016256998381389515, + "loss": 1.0941, + "mean_token_accuracy": 0.6934124380350113, + "num_tokens": 6831083.0, + "step": 807 + }, + { + "entropy": 1.0594700127840042, + "epoch": 0.32323232323232326, + "grad_norm": 0.2797102630138397, + "learning_rate": 0.00016247530245915717, + "loss": 1.0382, + "mean_token_accuracy": 0.7056247442960739, + "num_tokens": 6839763.0, + "step": 808 + }, + { + "entropy": 1.1099072992801666, + "epoch": 0.3236323632363236, + "grad_norm": 0.3049979507923126, + "learning_rate": 0.00016238053303377977, + "loss": 1.1221, + "mean_token_accuracy": 0.6817839741706848, + "num_tokens": 6847359.0, + "step": 809 + }, + { + "entropy": 1.09202378988266, + "epoch": 0.324032403240324, + "grad_norm": 0.2954918444156647, + "learning_rate": 0.00016228567569681704, + "loss": 1.1315, + "mean_token_accuracy": 0.6827646791934967, + "num_tokens": 6855784.0, + "step": 810 + }, + { + "entropy": 1.0541664212942123, + "epoch": 0.3244324432443244, + "grad_norm": 0.27822670340538025, + "learning_rate": 0.00016219073060747032, + "loss": 1.0442, + "mean_token_accuracy": 0.701767086982727, + "num_tokens": 6864119.0, + "step": 811 + }, + { + "entropy": 1.0675218999385834, + "epoch": 0.3248324832483248, + "grad_norm": 0.28887197375297546, + "learning_rate": 0.0001620956979250884, + "loss": 1.0772, + "mean_token_accuracy": 0.6991849839687347, + "num_tokens": 6872758.0, + "step": 812 + }, + { + "entropy": 1.0720003098249435, + "epoch": 0.3252325232523252, + "grad_norm": 0.2933163046836853, + "learning_rate": 0.00016200057780916714, + "loss": 1.0587, + "mean_token_accuracy": 0.6943808794021606, + "num_tokens": 6881116.0, + "step": 813 + }, + { + "entropy": 1.0669308304786682, + "epoch": 0.3256325632563256, + "grad_norm": 0.28430935740470886, + "learning_rate": 0.00016190537041934895, + "loss": 1.0457, + "mean_token_accuracy": 0.6947089582681656, + "num_tokens": 6890191.0, + "step": 814 + }, + { + "entropy": 1.0767860561609268, + "epoch": 0.326032603260326, + "grad_norm": 0.2899090647697449, + "learning_rate": 0.0001618100759154229, + "loss": 1.0592, + "mean_token_accuracy": 0.6964694112539291, + "num_tokens": 6898441.0, + "step": 815 + }, + { + "entropy": 1.0372967422008514, + "epoch": 0.3264326432643264, + "grad_norm": 0.26370012760162354, + "learning_rate": 0.00016171469445732414, + "loss": 1.0413, + "mean_token_accuracy": 0.7014907449483871, + "num_tokens": 6907393.0, + "step": 816 + }, + { + "entropy": 1.1349544823169708, + "epoch": 0.3268326832683268, + "grad_norm": 0.3018488883972168, + "learning_rate": 0.0001616192262051339, + "loss": 1.1484, + "mean_token_accuracy": 0.6796787530183792, + "num_tokens": 6915634.0, + "step": 817 + }, + { + "entropy": 1.0578317642211914, + "epoch": 0.3272327232723272, + "grad_norm": 0.2874612808227539, + "learning_rate": 0.00016152367131907884, + "loss": 1.0546, + "mean_token_accuracy": 0.7041870951652527, + "num_tokens": 6924310.0, + "step": 818 + }, + { + "entropy": 1.0999657213687897, + "epoch": 0.3276327632763276, + "grad_norm": 0.2909809648990631, + "learning_rate": 0.00016142802995953125, + "loss": 1.0544, + "mean_token_accuracy": 0.6940512210130692, + "num_tokens": 6932221.0, + "step": 819 + }, + { + "entropy": 1.125622183084488, + "epoch": 0.328032803280328, + "grad_norm": 0.27627530694007874, + "learning_rate": 0.00016133230228700847, + "loss": 1.089, + "mean_token_accuracy": 0.6924064308404922, + "num_tokens": 6940568.0, + "step": 820 + }, + { + "entropy": 1.087424635887146, + "epoch": 0.3284328432843284, + "grad_norm": 0.29548829793930054, + "learning_rate": 0.00016123648846217266, + "loss": 1.081, + "mean_token_accuracy": 0.6912556141614914, + "num_tokens": 6948622.0, + "step": 821 + }, + { + "entropy": 1.085502713918686, + "epoch": 0.3288328832883288, + "grad_norm": 0.2892586886882782, + "learning_rate": 0.0001611405886458306, + "loss": 1.1124, + "mean_token_accuracy": 0.6914178133010864, + "num_tokens": 6957285.0, + "step": 822 + }, + { + "entropy": 1.0553169250488281, + "epoch": 0.3292329232923292, + "grad_norm": 0.2946050465106964, + "learning_rate": 0.00016104460299893347, + "loss": 1.0632, + "mean_token_accuracy": 0.7002202570438385, + "num_tokens": 6965695.0, + "step": 823 + }, + { + "entropy": 1.030663713812828, + "epoch": 0.32963296329632963, + "grad_norm": 0.3052348792552948, + "learning_rate": 0.0001609485316825764, + "loss": 1.0493, + "mean_token_accuracy": 0.7051063477993011, + "num_tokens": 6974226.0, + "step": 824 + }, + { + "entropy": 1.1097224652767181, + "epoch": 0.33003300330033003, + "grad_norm": 0.30264967679977417, + "learning_rate": 0.00016085237485799828, + "loss": 1.1389, + "mean_token_accuracy": 0.6822833269834518, + "num_tokens": 6982021.0, + "step": 825 + }, + { + "entropy": 1.128312110900879, + "epoch": 0.33043304330433043, + "grad_norm": 0.2931164801120758, + "learning_rate": 0.00016075613268658157, + "loss": 1.1506, + "mean_token_accuracy": 0.6762620359659195, + "num_tokens": 6990298.0, + "step": 826 + }, + { + "entropy": 1.101962387561798, + "epoch": 0.33083308330833083, + "grad_norm": 0.27896103262901306, + "learning_rate": 0.00016065980532985205, + "loss": 1.094, + "mean_token_accuracy": 0.6888219267129898, + "num_tokens": 6998254.0, + "step": 827 + }, + { + "entropy": 1.1733005940914154, + "epoch": 0.33123312331233123, + "grad_norm": 0.2988252639770508, + "learning_rate": 0.00016056339294947828, + "loss": 1.1204, + "mean_token_accuracy": 0.6861321032047272, + "num_tokens": 7006271.0, + "step": 828 + }, + { + "entropy": 1.1069701611995697, + "epoch": 0.33163316331633164, + "grad_norm": 0.28877320885658264, + "learning_rate": 0.0001604668957072717, + "loss": 1.0698, + "mean_token_accuracy": 0.6956338733434677, + "num_tokens": 7014834.0, + "step": 829 + }, + { + "entropy": 1.0836674869060516, + "epoch": 0.33203320332033204, + "grad_norm": 0.27765166759490967, + "learning_rate": 0.00016037031376518604, + "loss": 1.0627, + "mean_token_accuracy": 0.697109192609787, + "num_tokens": 7023451.0, + "step": 830 + }, + { + "entropy": 1.063753291964531, + "epoch": 0.33243324332433244, + "grad_norm": 0.28693315386772156, + "learning_rate": 0.00016027364728531725, + "loss": 1.0482, + "mean_token_accuracy": 0.6978173106908798, + "num_tokens": 7031712.0, + "step": 831 + }, + { + "entropy": 1.1097122728824615, + "epoch": 0.33283328332833284, + "grad_norm": 0.2758033573627472, + "learning_rate": 0.00016017689642990317, + "loss": 1.0938, + "mean_token_accuracy": 0.6949873864650726, + "num_tokens": 7040458.0, + "step": 832 + }, + { + "entropy": 1.0569183826446533, + "epoch": 0.33323332333233324, + "grad_norm": 0.29176971316337585, + "learning_rate": 0.00016008006136132322, + "loss": 1.0602, + "mean_token_accuracy": 0.6964530944824219, + "num_tokens": 7048362.0, + "step": 833 + }, + { + "entropy": 1.0865219980478287, + "epoch": 0.33363336333633364, + "grad_norm": 0.29049283266067505, + "learning_rate": 0.00015998314224209813, + "loss": 1.1176, + "mean_token_accuracy": 0.6898873746395111, + "num_tokens": 7056670.0, + "step": 834 + }, + { + "entropy": 1.1126761436462402, + "epoch": 0.33403340334033405, + "grad_norm": 0.29394441843032837, + "learning_rate": 0.00015988613923488977, + "loss": 1.1482, + "mean_token_accuracy": 0.6840843707323074, + "num_tokens": 7065032.0, + "step": 835 + }, + { + "entropy": 1.045666515827179, + "epoch": 0.33443344334433445, + "grad_norm": 0.2821688652038574, + "learning_rate": 0.00015978905250250077, + "loss": 1.0387, + "mean_token_accuracy": 0.7068224251270294, + "num_tokens": 7073623.0, + "step": 836 + }, + { + "entropy": 1.0684998333454132, + "epoch": 0.33483348334833485, + "grad_norm": 0.2719772458076477, + "learning_rate": 0.00015969188220787427, + "loss": 1.0812, + "mean_token_accuracy": 0.6889199614524841, + "num_tokens": 7082422.0, + "step": 837 + }, + { + "entropy": 1.0654973536729813, + "epoch": 0.33523352335233525, + "grad_norm": 0.2691149413585663, + "learning_rate": 0.00015959462851409362, + "loss": 1.0652, + "mean_token_accuracy": 0.7005332261323929, + "num_tokens": 7091003.0, + "step": 838 + }, + { + "entropy": 1.0488538593053818, + "epoch": 0.33563356335633565, + "grad_norm": 0.2605346441268921, + "learning_rate": 0.0001594972915843822, + "loss": 1.0211, + "mean_token_accuracy": 0.7114692479372025, + "num_tokens": 7100016.0, + "step": 839 + }, + { + "entropy": 1.0963743925094604, + "epoch": 0.33603360336033605, + "grad_norm": 0.27182987332344055, + "learning_rate": 0.00015939987158210305, + "loss": 1.0643, + "mean_token_accuracy": 0.6910522431135178, + "num_tokens": 7108835.0, + "step": 840 + }, + { + "entropy": 1.0916715860366821, + "epoch": 0.33643364336433645, + "grad_norm": 0.2755962908267975, + "learning_rate": 0.00015930236867075866, + "loss": 1.0926, + "mean_token_accuracy": 0.7015759348869324, + "num_tokens": 7117237.0, + "step": 841 + }, + { + "entropy": 1.1315371990203857, + "epoch": 0.33683368336833686, + "grad_norm": 0.3034347891807556, + "learning_rate": 0.00015920478301399067, + "loss": 1.1157, + "mean_token_accuracy": 0.6885997503995895, + "num_tokens": 7124962.0, + "step": 842 + }, + { + "entropy": 1.0618019700050354, + "epoch": 0.33723372337233726, + "grad_norm": 0.2787151336669922, + "learning_rate": 0.00015910711477557958, + "loss": 1.045, + "mean_token_accuracy": 0.7079458236694336, + "num_tokens": 7133411.0, + "step": 843 + }, + { + "entropy": 1.0296967178583145, + "epoch": 0.33763376337633766, + "grad_norm": 0.2781917154788971, + "learning_rate": 0.00015900936411944442, + "loss": 1.013, + "mean_token_accuracy": 0.7101086229085922, + "num_tokens": 7141823.0, + "step": 844 + }, + { + "entropy": 1.0734319388866425, + "epoch": 0.33803380338033806, + "grad_norm": 0.2912190854549408, + "learning_rate": 0.00015891153120964268, + "loss": 1.0788, + "mean_token_accuracy": 0.6987548768520355, + "num_tokens": 7150448.0, + "step": 845 + }, + { + "entropy": 1.0448311865329742, + "epoch": 0.3384338433843384, + "grad_norm": 0.2986731231212616, + "learning_rate": 0.0001588136162103698, + "loss": 1.0754, + "mean_token_accuracy": 0.6940102428197861, + "num_tokens": 7158805.0, + "step": 846 + }, + { + "entropy": 1.0017264634370804, + "epoch": 0.3388338833883388, + "grad_norm": 0.2855966091156006, + "learning_rate": 0.00015871561928595905, + "loss": 0.9981, + "mean_token_accuracy": 0.7037784457206726, + "num_tokens": 7167706.0, + "step": 847 + }, + { + "entropy": 1.0189149230718613, + "epoch": 0.3392339233923392, + "grad_norm": 0.537360668182373, + "learning_rate": 0.00015861754060088115, + "loss": 1.0022, + "mean_token_accuracy": 0.7124061584472656, + "num_tokens": 7176198.0, + "step": 848 + }, + { + "entropy": 1.1068767607212067, + "epoch": 0.3396339633963396, + "grad_norm": 0.28569576144218445, + "learning_rate": 0.00015851938031974402, + "loss": 1.1023, + "mean_token_accuracy": 0.6933800876140594, + "num_tokens": 7184584.0, + "step": 849 + }, + { + "entropy": 1.066732108592987, + "epoch": 0.34003400340034, + "grad_norm": 0.39018917083740234, + "learning_rate": 0.00015842113860729264, + "loss": 1.0651, + "mean_token_accuracy": 0.6958982199430466, + "num_tokens": 7193139.0, + "step": 850 + }, + { + "entropy": 1.0800736099481583, + "epoch": 0.3404340434043404, + "grad_norm": 0.2878192663192749, + "learning_rate": 0.00015832281562840856, + "loss": 1.0673, + "mean_token_accuracy": 0.6933024674654007, + "num_tokens": 7201410.0, + "step": 851 + }, + { + "entropy": 1.014617919921875, + "epoch": 0.3408340834083408, + "grad_norm": 0.26971009373664856, + "learning_rate": 0.0001582244115481097, + "loss": 1.0191, + "mean_token_accuracy": 0.7077207118272781, + "num_tokens": 7210125.0, + "step": 852 + }, + { + "entropy": 1.0885990262031555, + "epoch": 0.3412341234123412, + "grad_norm": 0.27623969316482544, + "learning_rate": 0.0001581259265315502, + "loss": 1.0661, + "mean_token_accuracy": 0.6973797082901001, + "num_tokens": 7218646.0, + "step": 853 + }, + { + "entropy": 1.1416289806365967, + "epoch": 0.3416341634163416, + "grad_norm": 0.2842303216457367, + "learning_rate": 0.00015802736074401993, + "loss": 1.1511, + "mean_token_accuracy": 0.6813026964664459, + "num_tokens": 7227152.0, + "step": 854 + }, + { + "entropy": 1.038357600569725, + "epoch": 0.342034203420342, + "grad_norm": 0.282131165266037, + "learning_rate": 0.00015792871435094441, + "loss": 1.0278, + "mean_token_accuracy": 0.7014752775430679, + "num_tokens": 7235267.0, + "step": 855 + }, + { + "entropy": 1.1013155579566956, + "epoch": 0.3424342434243424, + "grad_norm": 0.2835225760936737, + "learning_rate": 0.00015782998751788434, + "loss": 1.0874, + "mean_token_accuracy": 0.693645104765892, + "num_tokens": 7243328.0, + "step": 856 + }, + { + "entropy": 1.059894472360611, + "epoch": 0.3428342834283428, + "grad_norm": 0.28241097927093506, + "learning_rate": 0.00015773118041053553, + "loss": 1.0265, + "mean_token_accuracy": 0.7004377990961075, + "num_tokens": 7251677.0, + "step": 857 + }, + { + "entropy": 1.0202269852161407, + "epoch": 0.3432343234323432, + "grad_norm": 0.283618688583374, + "learning_rate": 0.00015763229319472848, + "loss": 1.0223, + "mean_token_accuracy": 0.7050719112157822, + "num_tokens": 7259971.0, + "step": 858 + }, + { + "entropy": 1.116369366645813, + "epoch": 0.3436343634363436, + "grad_norm": 0.3200959861278534, + "learning_rate": 0.00015753332603642806, + "loss": 1.1532, + "mean_token_accuracy": 0.674355760216713, + "num_tokens": 7267967.0, + "step": 859 + }, + { + "entropy": 1.0624642968177795, + "epoch": 0.344034403440344, + "grad_norm": 0.29742002487182617, + "learning_rate": 0.0001574342791017335, + "loss": 1.0547, + "mean_token_accuracy": 0.696457028388977, + "num_tokens": 7276332.0, + "step": 860 + }, + { + "entropy": 1.066754311323166, + "epoch": 0.34443444344434443, + "grad_norm": 0.2807653844356537, + "learning_rate": 0.00015733515255687765, + "loss": 1.0588, + "mean_token_accuracy": 0.6977074891328812, + "num_tokens": 7284858.0, + "step": 861 + }, + { + "entropy": 1.1038541793823242, + "epoch": 0.34483448344834483, + "grad_norm": 0.2900453507900238, + "learning_rate": 0.00015723594656822724, + "loss": 1.0969, + "mean_token_accuracy": 0.69842229783535, + "num_tokens": 7293225.0, + "step": 862 + }, + { + "entropy": 1.177257478237152, + "epoch": 0.34523452345234523, + "grad_norm": 0.2871565520763397, + "learning_rate": 0.00015713666130228213, + "loss": 1.1619, + "mean_token_accuracy": 0.6842920780181885, + "num_tokens": 7301308.0, + "step": 863 + }, + { + "entropy": 1.008369117975235, + "epoch": 0.34563456345634563, + "grad_norm": 0.2610812187194824, + "learning_rate": 0.00015703729692567538, + "loss": 0.99, + "mean_token_accuracy": 0.7193544209003448, + "num_tokens": 7310533.0, + "step": 864 + }, + { + "entropy": 1.149832397699356, + "epoch": 0.34603460346034604, + "grad_norm": 0.30235856771469116, + "learning_rate": 0.00015693785360517272, + "loss": 1.1233, + "mean_token_accuracy": 0.6869359463453293, + "num_tokens": 7318428.0, + "step": 865 + }, + { + "entropy": 1.0422538667917252, + "epoch": 0.34643464346434644, + "grad_norm": 0.28922754526138306, + "learning_rate": 0.00015683833150767243, + "loss": 1.0495, + "mean_token_accuracy": 0.704685240983963, + "num_tokens": 7326598.0, + "step": 866 + }, + { + "entropy": 1.148168683052063, + "epoch": 0.34683468346834684, + "grad_norm": 0.3053227961063385, + "learning_rate": 0.00015673873080020495, + "loss": 1.169, + "mean_token_accuracy": 0.6736445873975754, + "num_tokens": 7334421.0, + "step": 867 + }, + { + "entropy": 1.0667313486337662, + "epoch": 0.34723472347234724, + "grad_norm": 0.2832089960575104, + "learning_rate": 0.00015663905164993273, + "loss": 1.0555, + "mean_token_accuracy": 0.7032380551099777, + "num_tokens": 7342749.0, + "step": 868 + }, + { + "entropy": 1.096857875585556, + "epoch": 0.34763476347634764, + "grad_norm": 0.31131234765052795, + "learning_rate": 0.00015653929422414985, + "loss": 1.0936, + "mean_token_accuracy": 0.6929104775190353, + "num_tokens": 7350582.0, + "step": 869 + }, + { + "entropy": 1.098680704832077, + "epoch": 0.34803480348034804, + "grad_norm": 0.27815455198287964, + "learning_rate": 0.00015643945869028166, + "loss": 1.0877, + "mean_token_accuracy": 0.6938360929489136, + "num_tokens": 7359143.0, + "step": 870 + }, + { + "entropy": 1.0873310565948486, + "epoch": 0.34843484348434844, + "grad_norm": 0.2917746603488922, + "learning_rate": 0.00015633954521588483, + "loss": 1.117, + "mean_token_accuracy": 0.6897922307252884, + "num_tokens": 7367909.0, + "step": 871 + }, + { + "entropy": 1.057071477174759, + "epoch": 0.34883488348834885, + "grad_norm": 0.2769242525100708, + "learning_rate": 0.00015623955396864653, + "loss": 1.0681, + "mean_token_accuracy": 0.6951342225074768, + "num_tokens": 7376423.0, + "step": 872 + }, + { + "entropy": 1.086044579744339, + "epoch": 0.34923492349234925, + "grad_norm": 0.2736515998840332, + "learning_rate": 0.00015613948511638472, + "loss": 1.0553, + "mean_token_accuracy": 0.7070475369691849, + "num_tokens": 7385063.0, + "step": 873 + }, + { + "entropy": 1.1222947537899017, + "epoch": 0.34963496349634965, + "grad_norm": 0.28925344347953796, + "learning_rate": 0.0001560393388270475, + "loss": 1.1254, + "mean_token_accuracy": 0.6802875846624374, + "num_tokens": 7393125.0, + "step": 874 + }, + { + "entropy": 1.1106415390968323, + "epoch": 0.35003500350035005, + "grad_norm": 0.2733059227466583, + "learning_rate": 0.00015593911526871285, + "loss": 1.0946, + "mean_token_accuracy": 0.6951452642679214, + "num_tokens": 7401571.0, + "step": 875 + }, + { + "entropy": 1.1220622211694717, + "epoch": 0.35043504350435045, + "grad_norm": 0.298059344291687, + "learning_rate": 0.00015583881460958868, + "loss": 1.0862, + "mean_token_accuracy": 0.6904398500919342, + "num_tokens": 7409882.0, + "step": 876 + }, + { + "entropy": 1.0419830232858658, + "epoch": 0.35083508350835085, + "grad_norm": 0.2728874087333679, + "learning_rate": 0.00015573843701801202, + "loss": 1.0041, + "mean_token_accuracy": 0.7102435976266861, + "num_tokens": 7418236.0, + "step": 877 + }, + { + "entropy": 1.0908347070217133, + "epoch": 0.35123512351235125, + "grad_norm": 0.28049230575561523, + "learning_rate": 0.00015563798266244924, + "loss": 1.1089, + "mean_token_accuracy": 0.6883530765771866, + "num_tokens": 7426726.0, + "step": 878 + }, + { + "entropy": 1.1272117793560028, + "epoch": 0.35163516351635166, + "grad_norm": 0.2921125590801239, + "learning_rate": 0.00015553745171149538, + "loss": 1.1567, + "mean_token_accuracy": 0.6770938485860825, + "num_tokens": 7434853.0, + "step": 879 + }, + { + "entropy": 1.0632250159978867, + "epoch": 0.35203520352035206, + "grad_norm": 0.27936652302742004, + "learning_rate": 0.00015543684433387412, + "loss": 1.0579, + "mean_token_accuracy": 0.708150640130043, + "num_tokens": 7443189.0, + "step": 880 + }, + { + "entropy": 1.0871057212352753, + "epoch": 0.35243524352435246, + "grad_norm": 0.6187314391136169, + "learning_rate": 0.0001553361606984374, + "loss": 1.0604, + "mean_token_accuracy": 0.6939795911312103, + "num_tokens": 7451483.0, + "step": 881 + }, + { + "entropy": 1.0746873915195465, + "epoch": 0.35283528352835286, + "grad_norm": 0.7716014981269836, + "learning_rate": 0.0001552354009741651, + "loss": 1.0564, + "mean_token_accuracy": 0.7002977579832077, + "num_tokens": 7459979.0, + "step": 882 + }, + { + "entropy": 1.0537183582782745, + "epoch": 0.35323532353235326, + "grad_norm": 0.27472689747810364, + "learning_rate": 0.0001551345653301649, + "loss": 1.0433, + "mean_token_accuracy": 0.7038500010967255, + "num_tokens": 7468789.0, + "step": 883 + }, + { + "entropy": 1.1011912524700165, + "epoch": 0.3536353635363536, + "grad_norm": 0.360210657119751, + "learning_rate": 0.00015503365393567176, + "loss": 1.0947, + "mean_token_accuracy": 0.6963257640600204, + "num_tokens": 7477409.0, + "step": 884 + }, + { + "entropy": 1.1495015323162079, + "epoch": 0.354035403540354, + "grad_norm": 0.29732295870780945, + "learning_rate": 0.0001549326669600479, + "loss": 1.1287, + "mean_token_accuracy": 0.6785084009170532, + "num_tokens": 7485711.0, + "step": 885 + }, + { + "entropy": 1.108874648809433, + "epoch": 0.3544354435443544, + "grad_norm": 0.3098621368408203, + "learning_rate": 0.00015483160457278238, + "loss": 1.1087, + "mean_token_accuracy": 0.6923833787441254, + "num_tokens": 7493530.0, + "step": 886 + }, + { + "entropy": 1.0474091470241547, + "epoch": 0.3548354835483548, + "grad_norm": 0.2870088219642639, + "learning_rate": 0.00015473046694349066, + "loss": 1.0534, + "mean_token_accuracy": 0.6980384141206741, + "num_tokens": 7501732.0, + "step": 887 + }, + { + "entropy": 1.0329750925302505, + "epoch": 0.3552355235523552, + "grad_norm": 0.3107498586177826, + "learning_rate": 0.0001546292542419147, + "loss": 1.0566, + "mean_token_accuracy": 0.6981071829795837, + "num_tokens": 7510442.0, + "step": 888 + }, + { + "entropy": 1.0194391161203384, + "epoch": 0.3556355635563556, + "grad_norm": 0.3267696797847748, + "learning_rate": 0.0001545279666379223, + "loss": 1.0501, + "mean_token_accuracy": 0.701299399137497, + "num_tokens": 7518738.0, + "step": 889 + }, + { + "entropy": 1.0635437816381454, + "epoch": 0.356035603560356, + "grad_norm": 0.3418838083744049, + "learning_rate": 0.0001544266043015071, + "loss": 1.0778, + "mean_token_accuracy": 0.6955218762159348, + "num_tokens": 7527025.0, + "step": 890 + }, + { + "entropy": 1.0254553109407425, + "epoch": 0.3564356435643564, + "grad_norm": 0.3042067885398865, + "learning_rate": 0.00015432516740278805, + "loss": 1.0357, + "mean_token_accuracy": 0.7102135270833969, + "num_tokens": 7535610.0, + "step": 891 + }, + { + "entropy": 1.1282758712768555, + "epoch": 0.3568356835683568, + "grad_norm": 0.4066726267337799, + "learning_rate": 0.00015422365611200928, + "loss": 1.114, + "mean_token_accuracy": 0.6894948929548264, + "num_tokens": 7543754.0, + "step": 892 + }, + { + "entropy": 1.0651661455631256, + "epoch": 0.3572357235723572, + "grad_norm": 0.28593435883522034, + "learning_rate": 0.00015412207059953986, + "loss": 1.0305, + "mean_token_accuracy": 0.7083335816860199, + "num_tokens": 7552042.0, + "step": 893 + }, + { + "entropy": 1.0586566478013992, + "epoch": 0.3576357635763576, + "grad_norm": 0.27906718850135803, + "learning_rate": 0.00015402041103587322, + "loss": 1.0557, + "mean_token_accuracy": 0.7047479301691055, + "num_tokens": 7560468.0, + "step": 894 + }, + { + "entropy": 1.0721811354160309, + "epoch": 0.358035803580358, + "grad_norm": 0.2834453284740448, + "learning_rate": 0.0001539186775916273, + "loss": 1.0397, + "mean_token_accuracy": 0.7045527845621109, + "num_tokens": 7568795.0, + "step": 895 + }, + { + "entropy": 1.0335296243429184, + "epoch": 0.3584358435843584, + "grad_norm": 0.27770015597343445, + "learning_rate": 0.00015381687043754388, + "loss": 0.9887, + "mean_token_accuracy": 0.7156312018632889, + "num_tokens": 7577229.0, + "step": 896 + }, + { + "entropy": 1.0903809666633606, + "epoch": 0.35883588358835883, + "grad_norm": 0.2974568009376526, + "learning_rate": 0.00015371498974448854, + "loss": 1.1003, + "mean_token_accuracy": 0.69133260846138, + "num_tokens": 7585531.0, + "step": 897 + }, + { + "entropy": 1.0514666438102722, + "epoch": 0.35923592359235923, + "grad_norm": 0.2924467921257019, + "learning_rate": 0.00015361303568345023, + "loss": 1.0391, + "mean_token_accuracy": 0.703434020280838, + "num_tokens": 7594483.0, + "step": 898 + }, + { + "entropy": 1.077709585428238, + "epoch": 0.35963596359635963, + "grad_norm": 0.29305940866470337, + "learning_rate": 0.00015351100842554103, + "loss": 1.0843, + "mean_token_accuracy": 0.6986344307661057, + "num_tokens": 7603375.0, + "step": 899 + }, + { + "entropy": 1.0290792882442474, + "epoch": 0.36003600360036003, + "grad_norm": 0.29663342237472534, + "learning_rate": 0.000153408908141996, + "loss": 1.042, + "mean_token_accuracy": 0.6997575759887695, + "num_tokens": 7611926.0, + "step": 900 + }, + { + "entropy": 1.0703576654195786, + "epoch": 0.36043604360436043, + "grad_norm": 0.30351656675338745, + "learning_rate": 0.0001533067350041725, + "loss": 1.0628, + "mean_token_accuracy": 0.6892614960670471, + "num_tokens": 7619932.0, + "step": 901 + }, + { + "entropy": 1.0805663019418716, + "epoch": 0.36083608360836084, + "grad_norm": 0.29656746983528137, + "learning_rate": 0.00015320448918355035, + "loss": 1.0862, + "mean_token_accuracy": 0.6913487464189529, + "num_tokens": 7628049.0, + "step": 902 + }, + { + "entropy": 1.0883187055587769, + "epoch": 0.36123612361236124, + "grad_norm": 0.3021232783794403, + "learning_rate": 0.00015310217085173138, + "loss": 1.1159, + "mean_token_accuracy": 0.6815456300973892, + "num_tokens": 7636056.0, + "step": 903 + }, + { + "entropy": 1.0530999600887299, + "epoch": 0.36163616361636164, + "grad_norm": 0.2987232804298401, + "learning_rate": 0.00015299978018043907, + "loss": 1.0699, + "mean_token_accuracy": 0.6974405944347382, + "num_tokens": 7644312.0, + "step": 904 + }, + { + "entropy": 1.1183582544326782, + "epoch": 0.36203620362036204, + "grad_norm": 0.3061085343360901, + "learning_rate": 0.00015289731734151825, + "loss": 1.1329, + "mean_token_accuracy": 0.6872691959142685, + "num_tokens": 7652522.0, + "step": 905 + }, + { + "entropy": 1.1113695204257965, + "epoch": 0.36243624362436244, + "grad_norm": 0.28838422894477844, + "learning_rate": 0.0001527947825069349, + "loss": 1.0724, + "mean_token_accuracy": 0.6988827735185623, + "num_tokens": 7660856.0, + "step": 906 + }, + { + "entropy": 1.1046296060085297, + "epoch": 0.36283628362836284, + "grad_norm": 0.2806946039199829, + "learning_rate": 0.00015269217584877587, + "loss": 1.1038, + "mean_token_accuracy": 0.6897422969341278, + "num_tokens": 7669535.0, + "step": 907 + }, + { + "entropy": 1.136566162109375, + "epoch": 0.36323632363236324, + "grad_norm": 0.36092713475227356, + "learning_rate": 0.00015258949753924856, + "loss": 1.1109, + "mean_token_accuracy": 0.6891215145587921, + "num_tokens": 7678514.0, + "step": 908 + }, + { + "entropy": 1.0875504910945892, + "epoch": 0.36363636363636365, + "grad_norm": 0.28009262681007385, + "learning_rate": 0.00015248674775068056, + "loss": 1.0703, + "mean_token_accuracy": 0.7001459747552872, + "num_tokens": 7686709.0, + "step": 909 + }, + { + "entropy": 1.0666522532701492, + "epoch": 0.36403640364036405, + "grad_norm": 0.26744940876960754, + "learning_rate": 0.0001523839266555195, + "loss": 1.0488, + "mean_token_accuracy": 0.7005031406879425, + "num_tokens": 7695309.0, + "step": 910 + }, + { + "entropy": 1.0553468465805054, + "epoch": 0.36443644364436445, + "grad_norm": 0.5155460834503174, + "learning_rate": 0.00015228103442633262, + "loss": 1.0373, + "mean_token_accuracy": 0.7081728577613831, + "num_tokens": 7704491.0, + "step": 911 + }, + { + "entropy": 1.118385136127472, + "epoch": 0.36483648364836485, + "grad_norm": 0.29043763875961304, + "learning_rate": 0.0001521780712358066, + "loss": 1.116, + "mean_token_accuracy": 0.6923219412565231, + "num_tokens": 7712576.0, + "step": 912 + }, + { + "entropy": 1.0479677617549896, + "epoch": 0.36523652365236525, + "grad_norm": 0.2803533673286438, + "learning_rate": 0.00015207503725674714, + "loss": 1.0644, + "mean_token_accuracy": 0.7001222670078278, + "num_tokens": 7721282.0, + "step": 913 + }, + { + "entropy": 1.0972586572170258, + "epoch": 0.36563656365636565, + "grad_norm": 0.29950302839279175, + "learning_rate": 0.00015197193266207882, + "loss": 1.0928, + "mean_token_accuracy": 0.6889860928058624, + "num_tokens": 7729578.0, + "step": 914 + }, + { + "entropy": 1.0062318444252014, + "epoch": 0.36603660366036606, + "grad_norm": 0.2677382528781891, + "learning_rate": 0.00015186875762484474, + "loss": 0.9679, + "mean_token_accuracy": 0.7164999395608902, + "num_tokens": 7738697.0, + "step": 915 + }, + { + "entropy": 1.1188140213489532, + "epoch": 0.36643664366436646, + "grad_norm": 0.2942350208759308, + "learning_rate": 0.00015176551231820616, + "loss": 1.0911, + "mean_token_accuracy": 0.6896274834871292, + "num_tokens": 7746855.0, + "step": 916 + }, + { + "entropy": 1.1077412962913513, + "epoch": 0.36683668366836686, + "grad_norm": 0.303210973739624, + "learning_rate": 0.00015166219691544234, + "loss": 1.1176, + "mean_token_accuracy": 0.6888789385557175, + "num_tokens": 7755286.0, + "step": 917 + }, + { + "entropy": 1.0603497922420502, + "epoch": 0.36723672367236726, + "grad_norm": 0.5446072816848755, + "learning_rate": 0.00015155881158995015, + "loss": 1.0986, + "mean_token_accuracy": 0.694971576333046, + "num_tokens": 7763579.0, + "step": 918 + }, + { + "entropy": 1.0791847109794617, + "epoch": 0.36763676367636766, + "grad_norm": 0.3222018778324127, + "learning_rate": 0.00015145535651524386, + "loss": 1.0854, + "mean_token_accuracy": 0.6917635649442673, + "num_tokens": 7771570.0, + "step": 919 + }, + { + "entropy": 1.0514254868030548, + "epoch": 0.36803680368036806, + "grad_norm": 0.3010207414627075, + "learning_rate": 0.00015135183186495467, + "loss": 1.0795, + "mean_token_accuracy": 0.6944009959697723, + "num_tokens": 7779285.0, + "step": 920 + }, + { + "entropy": 1.0478507727384567, + "epoch": 0.3684368436843684, + "grad_norm": 0.28592145442962646, + "learning_rate": 0.0001512482378128307, + "loss": 1.0585, + "mean_token_accuracy": 0.6918022036552429, + "num_tokens": 7787857.0, + "step": 921 + }, + { + "entropy": 1.0478255897760391, + "epoch": 0.3688368836883688, + "grad_norm": 0.28327181935310364, + "learning_rate": 0.00015114457453273652, + "loss": 1.0153, + "mean_token_accuracy": 0.7086196690797806, + "num_tokens": 7796395.0, + "step": 922 + }, + { + "entropy": 1.100317269563675, + "epoch": 0.3692369236923692, + "grad_norm": 0.28867024183273315, + "learning_rate": 0.0001510408421986528, + "loss": 1.0998, + "mean_token_accuracy": 0.6896563619375229, + "num_tokens": 7805226.0, + "step": 923 + }, + { + "entropy": 1.080020621418953, + "epoch": 0.3696369636963696, + "grad_norm": 0.2910299599170685, + "learning_rate": 0.00015093704098467618, + "loss": 1.0933, + "mean_token_accuracy": 0.6951110363006592, + "num_tokens": 7813827.0, + "step": 924 + }, + { + "entropy": 1.0695101618766785, + "epoch": 0.37003700370037, + "grad_norm": 0.2869861423969269, + "learning_rate": 0.0001508331710650189, + "loss": 1.0529, + "mean_token_accuracy": 0.6969519108533859, + "num_tokens": 7822356.0, + "step": 925 + }, + { + "entropy": 1.1095967292785645, + "epoch": 0.3704370437043704, + "grad_norm": 0.29034659266471863, + "learning_rate": 0.0001507292326140085, + "loss": 1.072, + "mean_token_accuracy": 0.6946455985307693, + "num_tokens": 7830583.0, + "step": 926 + }, + { + "entropy": 1.1217932105064392, + "epoch": 0.3708370837083708, + "grad_norm": 0.29387617111206055, + "learning_rate": 0.00015062522580608752, + "loss": 1.1192, + "mean_token_accuracy": 0.6784617453813553, + "num_tokens": 7839048.0, + "step": 927 + }, + { + "entropy": 1.130223274230957, + "epoch": 0.3712371237123712, + "grad_norm": 0.2845928370952606, + "learning_rate": 0.00015052115081581327, + "loss": 1.129, + "mean_token_accuracy": 0.6883646845817566, + "num_tokens": 7848074.0, + "step": 928 + }, + { + "entropy": 1.1167317032814026, + "epoch": 0.3716371637163716, + "grad_norm": 0.2936380207538605, + "learning_rate": 0.00015041700781785738, + "loss": 1.0962, + "mean_token_accuracy": 0.6888892650604248, + "num_tokens": 7856034.0, + "step": 929 + }, + { + "entropy": 1.075846090912819, + "epoch": 0.372037203720372, + "grad_norm": 0.27974218130111694, + "learning_rate": 0.00015031279698700582, + "loss": 1.0586, + "mean_token_accuracy": 0.7014043778181076, + "num_tokens": 7864995.0, + "step": 930 + }, + { + "entropy": 1.0563067346811295, + "epoch": 0.3724372437243724, + "grad_norm": 0.2781369090080261, + "learning_rate": 0.00015020851849815824, + "loss": 1.0347, + "mean_token_accuracy": 0.7019869238138199, + "num_tokens": 7873511.0, + "step": 931 + }, + { + "entropy": 1.0313759744167328, + "epoch": 0.3728372837283728, + "grad_norm": 0.2687495946884155, + "learning_rate": 0.0001501041725263278, + "loss": 1.0243, + "mean_token_accuracy": 0.7043203860521317, + "num_tokens": 7882234.0, + "step": 932 + }, + { + "entropy": 1.0888259410858154, + "epoch": 0.3732373237323732, + "grad_norm": 0.2853333353996277, + "learning_rate": 0.00014999975924664117, + "loss": 1.0905, + "mean_token_accuracy": 0.6935629546642303, + "num_tokens": 7890888.0, + "step": 933 + }, + { + "entropy": 1.1315006613731384, + "epoch": 0.37363736373637363, + "grad_norm": 0.2985461354255676, + "learning_rate": 0.00014989527883433766, + "loss": 1.1348, + "mean_token_accuracy": 0.6838447153568268, + "num_tokens": 7899031.0, + "step": 934 + }, + { + "entropy": 1.0415536165237427, + "epoch": 0.37403740374037403, + "grad_norm": 0.2740395665168762, + "learning_rate": 0.00014979073146476957, + "loss": 1.0351, + "mean_token_accuracy": 0.7056934386491776, + "num_tokens": 7908145.0, + "step": 935 + }, + { + "entropy": 1.0645959973335266, + "epoch": 0.37443744374437443, + "grad_norm": 0.2813519835472107, + "learning_rate": 0.00014968611731340127, + "loss": 1.0825, + "mean_token_accuracy": 0.6910433173179626, + "num_tokens": 7916852.0, + "step": 936 + }, + { + "entropy": 1.126308649778366, + "epoch": 0.37483748374837483, + "grad_norm": 0.3061455190181732, + "learning_rate": 0.00014958143655580946, + "loss": 1.152, + "mean_token_accuracy": 0.6772955060005188, + "num_tokens": 7924675.0, + "step": 937 + }, + { + "entropy": 1.043073609471321, + "epoch": 0.37523752375237523, + "grad_norm": 0.28251731395721436, + "learning_rate": 0.0001494766893676825, + "loss": 1.0443, + "mean_token_accuracy": 0.7035960704088211, + "num_tokens": 7933505.0, + "step": 938 + }, + { + "entropy": 1.0509282499551773, + "epoch": 0.37563756375637564, + "grad_norm": 0.2664625644683838, + "learning_rate": 0.00014937187592482023, + "loss": 1.0323, + "mean_token_accuracy": 0.7018367052078247, + "num_tokens": 7942577.0, + "step": 939 + }, + { + "entropy": 1.0920611917972565, + "epoch": 0.37603760376037604, + "grad_norm": 0.28298696875572205, + "learning_rate": 0.00014926699640313378, + "loss": 1.0707, + "mean_token_accuracy": 0.692681297659874, + "num_tokens": 7950911.0, + "step": 940 + }, + { + "entropy": 1.07115238904953, + "epoch": 0.37643764376437644, + "grad_norm": 0.2810773253440857, + "learning_rate": 0.00014916205097864507, + "loss": 1.0388, + "mean_token_accuracy": 0.7026078999042511, + "num_tokens": 7959296.0, + "step": 941 + }, + { + "entropy": 1.124040812253952, + "epoch": 0.37683768376837684, + "grad_norm": 0.2959596812725067, + "learning_rate": 0.0001490570398274868, + "loss": 1.0967, + "mean_token_accuracy": 0.6954542547464371, + "num_tokens": 7968025.0, + "step": 942 + }, + { + "entropy": 1.091953307390213, + "epoch": 0.37723772377237724, + "grad_norm": 0.2868008613586426, + "learning_rate": 0.00014895196312590174, + "loss": 1.051, + "mean_token_accuracy": 0.6980269998311996, + "num_tokens": 7977035.0, + "step": 943 + }, + { + "entropy": 1.1035798490047455, + "epoch": 0.37763776377637764, + "grad_norm": 0.2808520495891571, + "learning_rate": 0.00014884682105024288, + "loss": 1.0924, + "mean_token_accuracy": 0.6858002245426178, + "num_tokens": 7985284.0, + "step": 944 + }, + { + "entropy": 1.1127160340547562, + "epoch": 0.37803780378037805, + "grad_norm": 0.30014780163764954, + "learning_rate": 0.00014874161377697275, + "loss": 1.097, + "mean_token_accuracy": 0.6914821416139603, + "num_tokens": 7993547.0, + "step": 945 + }, + { + "entropy": 1.028899535536766, + "epoch": 0.37843784378437845, + "grad_norm": 0.2756885588169098, + "learning_rate": 0.00014863634148266355, + "loss": 1.0497, + "mean_token_accuracy": 0.6963004320859909, + "num_tokens": 8002631.0, + "step": 946 + }, + { + "entropy": 1.1323621571063995, + "epoch": 0.37883788378837885, + "grad_norm": 0.2932755947113037, + "learning_rate": 0.0001485310043439963, + "loss": 1.1345, + "mean_token_accuracy": 0.6825042515993118, + "num_tokens": 8010885.0, + "step": 947 + }, + { + "entropy": 0.9664577692747116, + "epoch": 0.37923792379237925, + "grad_norm": 0.2600715458393097, + "learning_rate": 0.00014842560253776116, + "loss": 0.9722, + "mean_token_accuracy": 0.7187516391277313, + "num_tokens": 8019671.0, + "step": 948 + }, + { + "entropy": 1.0276516675949097, + "epoch": 0.37963796379637965, + "grad_norm": 0.27302515506744385, + "learning_rate": 0.00014832013624085656, + "loss": 1.046, + "mean_token_accuracy": 0.7035380154848099, + "num_tokens": 8028487.0, + "step": 949 + }, + { + "entropy": 0.9928306490182877, + "epoch": 0.38003800380038005, + "grad_norm": 0.2691206932067871, + "learning_rate": 0.00014821460563028927, + "loss": 0.9891, + "mean_token_accuracy": 0.7167342454195023, + "num_tokens": 8037532.0, + "step": 950 + }, + { + "entropy": 1.0739206820726395, + "epoch": 0.38043804380438045, + "grad_norm": 0.28372377157211304, + "learning_rate": 0.00014810901088317414, + "loss": 1.0932, + "mean_token_accuracy": 0.6955729722976685, + "num_tokens": 8046444.0, + "step": 951 + }, + { + "entropy": 0.9927343726158142, + "epoch": 0.38083808380838086, + "grad_norm": 0.26338404417037964, + "learning_rate": 0.00014800335217673335, + "loss": 0.9804, + "mean_token_accuracy": 0.7078811824321747, + "num_tokens": 8055410.0, + "step": 952 + }, + { + "entropy": 1.1160430908203125, + "epoch": 0.38123812381238126, + "grad_norm": 0.2774490416049957, + "learning_rate": 0.00014789762968829678, + "loss": 1.0933, + "mean_token_accuracy": 0.6894954890012741, + "num_tokens": 8063907.0, + "step": 953 + }, + { + "entropy": 1.103876382112503, + "epoch": 0.38163816381638166, + "grad_norm": 0.2906893193721771, + "learning_rate": 0.00014779184359530102, + "loss": 1.0719, + "mean_token_accuracy": 0.6967519670724869, + "num_tokens": 8072357.0, + "step": 954 + }, + { + "entropy": 1.074035719037056, + "epoch": 0.38203820382038206, + "grad_norm": 0.31264954805374146, + "learning_rate": 0.0001476859940752897, + "loss": 1.0714, + "mean_token_accuracy": 0.7001534551382065, + "num_tokens": 8080607.0, + "step": 955 + }, + { + "entropy": 1.1370624005794525, + "epoch": 0.38243824382438246, + "grad_norm": 0.2863542437553406, + "learning_rate": 0.00014758008130591268, + "loss": 1.1447, + "mean_token_accuracy": 0.6822319477796555, + "num_tokens": 8089363.0, + "step": 956 + }, + { + "entropy": 1.0512937009334564, + "epoch": 0.38283828382838286, + "grad_norm": 0.29377084970474243, + "learning_rate": 0.0001474741054649261, + "loss": 1.0675, + "mean_token_accuracy": 0.7034764885902405, + "num_tokens": 8097615.0, + "step": 957 + }, + { + "entropy": 1.0039722472429276, + "epoch": 0.38323832383238327, + "grad_norm": 0.27713558077812195, + "learning_rate": 0.00014736806673019194, + "loss": 0.9879, + "mean_token_accuracy": 0.7184752523899078, + "num_tokens": 8106089.0, + "step": 958 + }, + { + "entropy": 1.066543161869049, + "epoch": 0.3836383638363836, + "grad_norm": 0.30339565873146057, + "learning_rate": 0.0001472619652796777, + "loss": 1.0867, + "mean_token_accuracy": 0.6899354606866837, + "num_tokens": 8114473.0, + "step": 959 + }, + { + "entropy": 0.999586284160614, + "epoch": 0.384038403840384, + "grad_norm": 0.27283790707588196, + "learning_rate": 0.00014715580129145612, + "loss": 0.9943, + "mean_token_accuracy": 0.7097521424293518, + "num_tokens": 8123795.0, + "step": 960 + }, + { + "entropy": 1.036647766828537, + "epoch": 0.3844384438443844, + "grad_norm": 0.27897384762763977, + "learning_rate": 0.00014704957494370496, + "loss": 1.0193, + "mean_token_accuracy": 0.706264078617096, + "num_tokens": 8132070.0, + "step": 961 + }, + { + "entropy": 1.0590356588363647, + "epoch": 0.3848384838483848, + "grad_norm": 0.27091649174690247, + "learning_rate": 0.00014694328641470661, + "loss": 1.0589, + "mean_token_accuracy": 0.7038314491510391, + "num_tokens": 8140787.0, + "step": 962 + }, + { + "entropy": 0.9753771871328354, + "epoch": 0.3852385238523852, + "grad_norm": 0.2739521563053131, + "learning_rate": 0.00014683693588284782, + "loss": 0.993, + "mean_token_accuracy": 0.715995579957962, + "num_tokens": 8149604.0, + "step": 963 + }, + { + "entropy": 1.0164705663919449, + "epoch": 0.3856385638563856, + "grad_norm": 0.2854687571525574, + "learning_rate": 0.00014673052352661938, + "loss": 1.0331, + "mean_token_accuracy": 0.7069838047027588, + "num_tokens": 8157679.0, + "step": 964 + }, + { + "entropy": 1.0143132507801056, + "epoch": 0.386038603860386, + "grad_norm": 0.2912575900554657, + "learning_rate": 0.0001466240495246159, + "loss": 1.0315, + "mean_token_accuracy": 0.6974526792764664, + "num_tokens": 8166431.0, + "step": 965 + }, + { + "entropy": 1.1075432002544403, + "epoch": 0.3864386438643864, + "grad_norm": 0.30956971645355225, + "learning_rate": 0.0001465175140555354, + "loss": 1.0957, + "mean_token_accuracy": 0.6925620138645172, + "num_tokens": 8174677.0, + "step": 966 + }, + { + "entropy": 1.0882379412651062, + "epoch": 0.3868386838683868, + "grad_norm": 0.27571985125541687, + "learning_rate": 0.00014641091729817904, + "loss": 1.0479, + "mean_token_accuracy": 0.6960329711437225, + "num_tokens": 8183154.0, + "step": 967 + }, + { + "entropy": 1.0937244892120361, + "epoch": 0.3872387238723872, + "grad_norm": 0.2793300747871399, + "learning_rate": 0.00014630425943145097, + "loss": 1.0643, + "mean_token_accuracy": 0.6938710659742355, + "num_tokens": 8191787.0, + "step": 968 + }, + { + "entropy": 1.1157473027706146, + "epoch": 0.3876387638763876, + "grad_norm": 0.2754529118537903, + "learning_rate": 0.00014619754063435766, + "loss": 1.0876, + "mean_token_accuracy": 0.6887020170688629, + "num_tokens": 8200517.0, + "step": 969 + }, + { + "entropy": 1.0828372240066528, + "epoch": 0.38803880388038803, + "grad_norm": 0.2722908854484558, + "learning_rate": 0.00014609076108600816, + "loss": 1.0581, + "mean_token_accuracy": 0.698416456580162, + "num_tokens": 8209360.0, + "step": 970 + }, + { + "entropy": 1.0577463954687119, + "epoch": 0.38843884388438843, + "grad_norm": 0.2847084701061249, + "learning_rate": 0.0001459839209656132, + "loss": 1.0557, + "mean_token_accuracy": 0.6946806460618973, + "num_tokens": 8217237.0, + "step": 971 + }, + { + "entropy": 1.0650206506252289, + "epoch": 0.38883888388838883, + "grad_norm": 0.27916163206100464, + "learning_rate": 0.00014587702045248534, + "loss": 1.0815, + "mean_token_accuracy": 0.689092680811882, + "num_tokens": 8225607.0, + "step": 972 + }, + { + "entropy": 1.0789687633514404, + "epoch": 0.38923892389238923, + "grad_norm": 0.2880748510360718, + "learning_rate": 0.00014577005972603841, + "loss": 1.0924, + "mean_token_accuracy": 0.6933542042970657, + "num_tokens": 8234136.0, + "step": 973 + }, + { + "entropy": 1.101224035024643, + "epoch": 0.38963896389638963, + "grad_norm": 0.2778017520904541, + "learning_rate": 0.00014566303896578733, + "loss": 1.1119, + "mean_token_accuracy": 0.6942825764417648, + "num_tokens": 8242315.0, + "step": 974 + }, + { + "entropy": 1.0111819058656693, + "epoch": 0.39003900390039004, + "grad_norm": 0.2581962049007416, + "learning_rate": 0.00014555595835134778, + "loss": 1.0169, + "mean_token_accuracy": 0.7096911817789078, + "num_tokens": 8251477.0, + "step": 975 + }, + { + "entropy": 1.0564925074577332, + "epoch": 0.39043904390439044, + "grad_norm": 0.2755223512649536, + "learning_rate": 0.00014544881806243583, + "loss": 1.0526, + "mean_token_accuracy": 0.701107919216156, + "num_tokens": 8260126.0, + "step": 976 + }, + { + "entropy": 1.1073401868343353, + "epoch": 0.39083908390839084, + "grad_norm": 0.2744315564632416, + "learning_rate": 0.00014534161827886789, + "loss": 1.0984, + "mean_token_accuracy": 0.699100449681282, + "num_tokens": 8268590.0, + "step": 977 + }, + { + "entropy": 1.0539606809616089, + "epoch": 0.39123912391239124, + "grad_norm": 0.25940003991127014, + "learning_rate": 0.00014523435918055994, + "loss": 1.0384, + "mean_token_accuracy": 0.6963090151548386, + "num_tokens": 8277319.0, + "step": 978 + }, + { + "entropy": 1.0601672530174255, + "epoch": 0.39163916391639164, + "grad_norm": 0.28110653162002563, + "learning_rate": 0.0001451270409475278, + "loss": 1.0504, + "mean_token_accuracy": 0.7001344114542007, + "num_tokens": 8285416.0, + "step": 979 + }, + { + "entropy": 1.0488615334033966, + "epoch": 0.39203920392039204, + "grad_norm": 0.2758018672466278, + "learning_rate": 0.0001450196637598863, + "loss": 1.0669, + "mean_token_accuracy": 0.6955180466175079, + "num_tokens": 8294416.0, + "step": 980 + }, + { + "entropy": 1.117949515581131, + "epoch": 0.39243924392439244, + "grad_norm": 0.28350189328193665, + "learning_rate": 0.00014491222779784937, + "loss": 1.0906, + "mean_token_accuracy": 0.6835110485553741, + "num_tokens": 8302573.0, + "step": 981 + }, + { + "entropy": 1.0202916115522385, + "epoch": 0.39283928392839285, + "grad_norm": 0.2640414535999298, + "learning_rate": 0.00014480473324172955, + "loss": 1.0212, + "mean_token_accuracy": 0.7104804664850235, + "num_tokens": 8311587.0, + "step": 982 + }, + { + "entropy": 1.0706768333911896, + "epoch": 0.39323932393239325, + "grad_norm": 0.28815537691116333, + "learning_rate": 0.0001446971802719376, + "loss": 1.0872, + "mean_token_accuracy": 0.6972401142120361, + "num_tokens": 8320166.0, + "step": 983 + }, + { + "entropy": 1.1208703815937042, + "epoch": 0.39363936393639365, + "grad_norm": 0.2919588088989258, + "learning_rate": 0.00014458956906898248, + "loss": 1.1294, + "mean_token_accuracy": 0.6903548091650009, + "num_tokens": 8328361.0, + "step": 984 + }, + { + "entropy": 1.0810618698596954, + "epoch": 0.39403940394039405, + "grad_norm": 0.27421703934669495, + "learning_rate": 0.00014448189981347082, + "loss": 1.0735, + "mean_token_accuracy": 0.6916549801826477, + "num_tokens": 8337258.0, + "step": 985 + }, + { + "entropy": 1.094863623380661, + "epoch": 0.39443944394439445, + "grad_norm": 0.2965344190597534, + "learning_rate": 0.00014437417268610666, + "loss": 1.0839, + "mean_token_accuracy": 0.6977574676275253, + "num_tokens": 8345002.0, + "step": 986 + }, + { + "entropy": 1.080283299088478, + "epoch": 0.39483948394839485, + "grad_norm": 0.28247109055519104, + "learning_rate": 0.0001442663878676912, + "loss": 1.0663, + "mean_token_accuracy": 0.7037357091903687, + "num_tokens": 8353390.0, + "step": 987 + }, + { + "entropy": 1.121001273393631, + "epoch": 0.39523952395239526, + "grad_norm": 0.2788020074367523, + "learning_rate": 0.00014415854553912245, + "loss": 1.108, + "mean_token_accuracy": 0.6830210089683533, + "num_tokens": 8361205.0, + "step": 988 + }, + { + "entropy": 1.1115176677703857, + "epoch": 0.39563956395639566, + "grad_norm": 0.2805117964744568, + "learning_rate": 0.00014405064588139495, + "loss": 1.089, + "mean_token_accuracy": 0.6899392008781433, + "num_tokens": 8369550.0, + "step": 989 + }, + { + "entropy": 1.0624737441539764, + "epoch": 0.39603960396039606, + "grad_norm": 0.2841082215309143, + "learning_rate": 0.0001439426890755994, + "loss": 1.0458, + "mean_token_accuracy": 0.7019693553447723, + "num_tokens": 8377848.0, + "step": 990 + }, + { + "entropy": 1.06924706697464, + "epoch": 0.39643964396439646, + "grad_norm": 0.3040372431278229, + "learning_rate": 0.00014383467530292251, + "loss": 1.061, + "mean_token_accuracy": 0.6970397382974625, + "num_tokens": 8385571.0, + "step": 991 + }, + { + "entropy": 1.044583037495613, + "epoch": 0.39683968396839686, + "grad_norm": 0.2906639277935028, + "learning_rate": 0.00014372660474464656, + "loss": 1.0526, + "mean_token_accuracy": 0.7003406286239624, + "num_tokens": 8393697.0, + "step": 992 + }, + { + "entropy": 1.1328657269477844, + "epoch": 0.39723972397239726, + "grad_norm": 0.3031742572784424, + "learning_rate": 0.00014361847758214913, + "loss": 1.1476, + "mean_token_accuracy": 0.6823091059923172, + "num_tokens": 8401849.0, + "step": 993 + }, + { + "entropy": 1.0815677642822266, + "epoch": 0.39763976397639766, + "grad_norm": 0.27805039286613464, + "learning_rate": 0.00014351029399690274, + "loss": 1.1008, + "mean_token_accuracy": 0.6917327642440796, + "num_tokens": 8410629.0, + "step": 994 + }, + { + "entropy": 1.0821519196033478, + "epoch": 0.39803980398039807, + "grad_norm": 0.2780751883983612, + "learning_rate": 0.0001434020541704747, + "loss": 1.0726, + "mean_token_accuracy": 0.6965376734733582, + "num_tokens": 8419235.0, + "step": 995 + }, + { + "entropy": 1.057197779417038, + "epoch": 0.39843984398439847, + "grad_norm": 0.27425616979599, + "learning_rate": 0.00014329375828452668, + "loss": 1.051, + "mean_token_accuracy": 0.7033206820487976, + "num_tokens": 8428037.0, + "step": 996 + }, + { + "entropy": 1.041703313589096, + "epoch": 0.3988398839883988, + "grad_norm": 0.30760639905929565, + "learning_rate": 0.00014318540652081443, + "loss": 1.0806, + "mean_token_accuracy": 0.7055813670158386, + "num_tokens": 8436169.0, + "step": 997 + }, + { + "entropy": 1.0875943005084991, + "epoch": 0.3992399239923992, + "grad_norm": 0.3065730929374695, + "learning_rate": 0.00014307699906118747, + "loss": 1.0873, + "mean_token_accuracy": 0.6942591369152069, + "num_tokens": 8443897.0, + "step": 998 + }, + { + "entropy": 1.0474766790866852, + "epoch": 0.3996399639963996, + "grad_norm": 0.26605191826820374, + "learning_rate": 0.00014296853608758875, + "loss": 1.0185, + "mean_token_accuracy": 0.7072035074234009, + "num_tokens": 8452578.0, + "step": 999 + }, + { + "entropy": 1.0564881265163422, + "epoch": 0.4000400040004, + "grad_norm": 0.37414297461509705, + "learning_rate": 0.00014286001778205455, + "loss": 1.0458, + "mean_token_accuracy": 0.7062159180641174, + "num_tokens": 8461870.0, + "step": 1000 + }, + { + "entropy": 1.1129979491233826, + "epoch": 0.4004400440044004, + "grad_norm": 0.29261261224746704, + "learning_rate": 0.0001427514443267139, + "loss": 1.1069, + "mean_token_accuracy": 0.6929229497909546, + "num_tokens": 8470164.0, + "step": 1001 + }, + { + "entropy": 1.0556962490081787, + "epoch": 0.4008400840084008, + "grad_norm": 0.2848963141441345, + "learning_rate": 0.0001426428159037883, + "loss": 1.0388, + "mean_token_accuracy": 0.7014807015657425, + "num_tokens": 8478437.0, + "step": 1002 + }, + { + "entropy": 1.0442138612270355, + "epoch": 0.4012401240124012, + "grad_norm": 0.28170058131217957, + "learning_rate": 0.0001425341326955917, + "loss": 1.0305, + "mean_token_accuracy": 0.7050503641366959, + "num_tokens": 8486487.0, + "step": 1003 + }, + { + "entropy": 1.0774947702884674, + "epoch": 0.4016401640164016, + "grad_norm": 0.26965805888175964, + "learning_rate": 0.0001424253948845299, + "loss": 1.0456, + "mean_token_accuracy": 0.6997365057468414, + "num_tokens": 8495046.0, + "step": 1004 + }, + { + "entropy": 1.0515923351049423, + "epoch": 0.402040204020402, + "grad_norm": 0.2713454067707062, + "learning_rate": 0.00014231660265310027, + "loss": 1.0548, + "mean_token_accuracy": 0.698898434638977, + "num_tokens": 8503858.0, + "step": 1005 + }, + { + "entropy": 1.0121905505657196, + "epoch": 0.4024402440244024, + "grad_norm": 0.271133154630661, + "learning_rate": 0.00014220775618389164, + "loss": 1.0199, + "mean_token_accuracy": 0.701520636677742, + "num_tokens": 8512516.0, + "step": 1006 + }, + { + "entropy": 1.0727417469024658, + "epoch": 0.40284028402840283, + "grad_norm": 0.2647912800312042, + "learning_rate": 0.00014209885565958383, + "loss": 1.0383, + "mean_token_accuracy": 0.7043611109256744, + "num_tokens": 8521192.0, + "step": 1007 + }, + { + "entropy": 1.1496848165988922, + "epoch": 0.40324032403240323, + "grad_norm": 0.2885759174823761, + "learning_rate": 0.00014198990126294736, + "loss": 1.1768, + "mean_token_accuracy": 0.6770444214344025, + "num_tokens": 8529616.0, + "step": 1008 + }, + { + "entropy": 1.033217117190361, + "epoch": 0.40364036403640363, + "grad_norm": 0.2660588324069977, + "learning_rate": 0.00014188089317684313, + "loss": 1.0525, + "mean_token_accuracy": 0.6963855475187302, + "num_tokens": 8538526.0, + "step": 1009 + }, + { + "entropy": 1.1125969886779785, + "epoch": 0.40404040404040403, + "grad_norm": 0.2757780849933624, + "learning_rate": 0.00014177183158422225, + "loss": 1.1339, + "mean_token_accuracy": 0.6857900768518448, + "num_tokens": 8547026.0, + "step": 1010 + }, + { + "entropy": 1.0918416678905487, + "epoch": 0.40444044404440443, + "grad_norm": 0.28014034032821655, + "learning_rate": 0.0001416627166681255, + "loss": 1.1054, + "mean_token_accuracy": 0.6861250847578049, + "num_tokens": 8555349.0, + "step": 1011 + }, + { + "entropy": 1.0753582417964935, + "epoch": 0.40484048404840484, + "grad_norm": 0.3646489381790161, + "learning_rate": 0.00014155354861168336, + "loss": 1.0491, + "mean_token_accuracy": 0.7051423341035843, + "num_tokens": 8563911.0, + "step": 1012 + }, + { + "entropy": 1.0683763027191162, + "epoch": 0.40524052405240524, + "grad_norm": 0.3172706663608551, + "learning_rate": 0.00014144432759811528, + "loss": 1.0421, + "mean_token_accuracy": 0.7051568329334259, + "num_tokens": 8571945.0, + "step": 1013 + }, + { + "entropy": 1.0641514360904694, + "epoch": 0.40564056405640564, + "grad_norm": 0.28131359815597534, + "learning_rate": 0.00014133505381072964, + "loss": 1.0352, + "mean_token_accuracy": 0.7000925838947296, + "num_tokens": 8580376.0, + "step": 1014 + }, + { + "entropy": 1.0351819843053818, + "epoch": 0.40604060406040604, + "grad_norm": 0.28761935234069824, + "learning_rate": 0.0001412257274329235, + "loss": 1.0113, + "mean_token_accuracy": 0.7115456312894821, + "num_tokens": 8588367.0, + "step": 1015 + }, + { + "entropy": 1.0777496695518494, + "epoch": 0.40644064406440644, + "grad_norm": 0.28460052609443665, + "learning_rate": 0.00014111634864818207, + "loss": 1.0776, + "mean_token_accuracy": 0.7007474452257156, + "num_tokens": 8596782.0, + "step": 1016 + }, + { + "entropy": 1.0805185437202454, + "epoch": 0.40684068406840684, + "grad_norm": 0.28818634152412415, + "learning_rate": 0.00014100691764007856, + "loss": 1.0846, + "mean_token_accuracy": 0.6960993409156799, + "num_tokens": 8605342.0, + "step": 1017 + }, + { + "entropy": 1.0042619407176971, + "epoch": 0.40724072407240725, + "grad_norm": 0.29423537850379944, + "learning_rate": 0.0001408974345922738, + "loss": 1.021, + "mean_token_accuracy": 0.7037978619337082, + "num_tokens": 8613979.0, + "step": 1018 + }, + { + "entropy": 1.1245154440402985, + "epoch": 0.40764076407640765, + "grad_norm": 0.2859850525856018, + "learning_rate": 0.00014078789968851604, + "loss": 1.125, + "mean_token_accuracy": 0.6848827451467514, + "num_tokens": 8622298.0, + "step": 1019 + }, + { + "entropy": 1.0281118154525757, + "epoch": 0.40804080408040805, + "grad_norm": 0.2765551209449768, + "learning_rate": 0.00014067831311264045, + "loss": 1.0267, + "mean_token_accuracy": 0.707539513707161, + "num_tokens": 8631045.0, + "step": 1020 + }, + { + "entropy": 1.0507335513830185, + "epoch": 0.40844084408440845, + "grad_norm": 0.2800913453102112, + "learning_rate": 0.00014056867504856906, + "loss": 1.0506, + "mean_token_accuracy": 0.6984911859035492, + "num_tokens": 8639547.0, + "step": 1021 + }, + { + "entropy": 1.0513606369495392, + "epoch": 0.40884088408840885, + "grad_norm": 0.27509069442749023, + "learning_rate": 0.0001404589856803101, + "loss": 1.028, + "mean_token_accuracy": 0.7054529637098312, + "num_tokens": 8647815.0, + "step": 1022 + }, + { + "entropy": 1.1418087482452393, + "epoch": 0.40924092409240925, + "grad_norm": 0.2814921438694, + "learning_rate": 0.00014034924519195816, + "loss": 1.1206, + "mean_token_accuracy": 0.6775806397199631, + "num_tokens": 8656195.0, + "step": 1023 + }, + { + "entropy": 1.029011845588684, + "epoch": 0.40964096409640965, + "grad_norm": 0.2578503489494324, + "learning_rate": 0.00014023945376769346, + "loss": 1.0126, + "mean_token_accuracy": 0.7048416584730148, + "num_tokens": 8665264.0, + "step": 1024 + }, + { + "entropy": 1.1143612414598465, + "epoch": 0.41004100410041006, + "grad_norm": 0.3079990744590759, + "learning_rate": 0.00014012961159178168, + "loss": 1.1241, + "mean_token_accuracy": 0.6879839301109314, + "num_tokens": 8672905.0, + "step": 1025 + }, + { + "entropy": 1.0813851356506348, + "epoch": 0.41044104410441046, + "grad_norm": 0.28009915351867676, + "learning_rate": 0.0001400197188485739, + "loss": 1.0658, + "mean_token_accuracy": 0.6991480588912964, + "num_tokens": 8681000.0, + "step": 1026 + }, + { + "entropy": 1.064486175775528, + "epoch": 0.41084108410841086, + "grad_norm": 0.28381359577178955, + "learning_rate": 0.00013990977572250575, + "loss": 1.0433, + "mean_token_accuracy": 0.6988044679164886, + "num_tokens": 8689443.0, + "step": 1027 + }, + { + "entropy": 1.0475238263607025, + "epoch": 0.41124112411241126, + "grad_norm": 0.2739580571651459, + "learning_rate": 0.00013979978239809767, + "loss": 1.0467, + "mean_token_accuracy": 0.7031860649585724, + "num_tokens": 8697903.0, + "step": 1028 + }, + { + "entropy": 1.0483440160751343, + "epoch": 0.41164116411641166, + "grad_norm": 0.40402668714523315, + "learning_rate": 0.00013968973905995426, + "loss": 1.0334, + "mean_token_accuracy": 0.7062229514122009, + "num_tokens": 8706237.0, + "step": 1029 + }, + { + "entropy": 1.0165838450193405, + "epoch": 0.41204120412041206, + "grad_norm": 0.2735487222671509, + "learning_rate": 0.00013957964589276405, + "loss": 1.016, + "mean_token_accuracy": 0.7004164904356003, + "num_tokens": 8714634.0, + "step": 1030 + }, + { + "entropy": 1.1147255897521973, + "epoch": 0.41244124412441246, + "grad_norm": 0.31641674041748047, + "learning_rate": 0.0001394695030812992, + "loss": 1.1297, + "mean_token_accuracy": 0.6843569129705429, + "num_tokens": 8722324.0, + "step": 1031 + }, + { + "entropy": 1.0679795444011688, + "epoch": 0.41284128412841287, + "grad_norm": 0.2834770083427429, + "learning_rate": 0.00013935931081041525, + "loss": 1.0972, + "mean_token_accuracy": 0.6912588328123093, + "num_tokens": 8730804.0, + "step": 1032 + }, + { + "entropy": 1.0787553638219833, + "epoch": 0.41324132413241327, + "grad_norm": 0.30491507053375244, + "learning_rate": 0.00013924906926505065, + "loss": 1.0907, + "mean_token_accuracy": 0.6908698379993439, + "num_tokens": 8738790.0, + "step": 1033 + }, + { + "entropy": 1.0311247110366821, + "epoch": 0.4136413641364136, + "grad_norm": 0.26184898614883423, + "learning_rate": 0.00013913877863022664, + "loss": 1.0272, + "mean_token_accuracy": 0.7079911082983017, + "num_tokens": 8747750.0, + "step": 1034 + }, + { + "entropy": 1.041083186864853, + "epoch": 0.414041404140414, + "grad_norm": 0.27645936608314514, + "learning_rate": 0.00013902843909104678, + "loss": 1.0389, + "mean_token_accuracy": 0.7058251202106476, + "num_tokens": 8756091.0, + "step": 1035 + }, + { + "entropy": 1.0887877643108368, + "epoch": 0.4144414441444144, + "grad_norm": 0.9865003228187561, + "learning_rate": 0.00013891805083269672, + "loss": 1.0759, + "mean_token_accuracy": 0.6894579231739044, + "num_tokens": 8764862.0, + "step": 1036 + }, + { + "entropy": 1.0995206534862518, + "epoch": 0.4148414841484148, + "grad_norm": 0.30941081047058105, + "learning_rate": 0.00013880761404044394, + "loss": 1.0385, + "mean_token_accuracy": 0.7029267102479935, + "num_tokens": 8773401.0, + "step": 1037 + }, + { + "entropy": 1.086568921804428, + "epoch": 0.4152415241524152, + "grad_norm": 0.2822316586971283, + "learning_rate": 0.00013869712889963726, + "loss": 1.035, + "mean_token_accuracy": 0.7046221941709518, + "num_tokens": 8781883.0, + "step": 1038 + }, + { + "entropy": 1.0994454473257065, + "epoch": 0.4156415641564156, + "grad_norm": 0.2749359607696533, + "learning_rate": 0.00013858659559570676, + "loss": 1.0608, + "mean_token_accuracy": 0.6954016536474228, + "num_tokens": 8790239.0, + "step": 1039 + }, + { + "entropy": 1.0364155620336533, + "epoch": 0.416041604160416, + "grad_norm": 0.2692890465259552, + "learning_rate": 0.00013847601431416333, + "loss": 1.0023, + "mean_token_accuracy": 0.709053099155426, + "num_tokens": 8798973.0, + "step": 1040 + }, + { + "entropy": 1.0574538707733154, + "epoch": 0.4164416441644164, + "grad_norm": 0.27456435561180115, + "learning_rate": 0.00013836538524059829, + "loss": 1.0525, + "mean_token_accuracy": 0.7031033635139465, + "num_tokens": 8807732.0, + "step": 1041 + }, + { + "entropy": 1.0957830548286438, + "epoch": 0.4168416841684168, + "grad_norm": 0.35573887825012207, + "learning_rate": 0.00013825470856068325, + "loss": 1.1048, + "mean_token_accuracy": 0.6894460171461105, + "num_tokens": 8815712.0, + "step": 1042 + }, + { + "entropy": 0.9589244276285172, + "epoch": 0.4172417241724172, + "grad_norm": 0.3061108887195587, + "learning_rate": 0.0001381439844601698, + "loss": 0.9884, + "mean_token_accuracy": 0.7132687419652939, + "num_tokens": 8824617.0, + "step": 1043 + }, + { + "entropy": 1.0054947882890701, + "epoch": 0.41764176417641763, + "grad_norm": 0.3023373484611511, + "learning_rate": 0.00013803321312488888, + "loss": 1.0485, + "mean_token_accuracy": 0.696058988571167, + "num_tokens": 8833194.0, + "step": 1044 + }, + { + "entropy": 0.9493149220943451, + "epoch": 0.41804180418041803, + "grad_norm": 0.2729393243789673, + "learning_rate": 0.00013792239474075095, + "loss": 0.9627, + "mean_token_accuracy": 0.7210550159215927, + "num_tokens": 8841862.0, + "step": 1045 + }, + { + "entropy": 0.9781219363212585, + "epoch": 0.41844184418441843, + "grad_norm": 0.2843363583087921, + "learning_rate": 0.00013781152949374526, + "loss": 1.003, + "mean_token_accuracy": 0.7074914574623108, + "num_tokens": 8850472.0, + "step": 1046 + }, + { + "entropy": 1.0082087069749832, + "epoch": 0.41884188418841883, + "grad_norm": 0.2942999005317688, + "learning_rate": 0.00013770061756993986, + "loss": 1.0391, + "mean_token_accuracy": 0.7023905217647552, + "num_tokens": 8858856.0, + "step": 1047 + }, + { + "entropy": 1.0682973861694336, + "epoch": 0.41924192419241924, + "grad_norm": 0.28278663754463196, + "learning_rate": 0.00013758965915548107, + "loss": 1.073, + "mean_token_accuracy": 0.6936942636966705, + "num_tokens": 8867291.0, + "step": 1048 + }, + { + "entropy": 1.1049893498420715, + "epoch": 0.41964196419641964, + "grad_norm": 0.27693507075309753, + "learning_rate": 0.0001374786544365931, + "loss": 1.0577, + "mean_token_accuracy": 0.6980084627866745, + "num_tokens": 8875638.0, + "step": 1049 + }, + { + "entropy": 1.1289234459400177, + "epoch": 0.42004200420042004, + "grad_norm": 0.28352290391921997, + "learning_rate": 0.00013736760359957817, + "loss": 1.1144, + "mean_token_accuracy": 0.6885149627923965, + "num_tokens": 8884057.0, + "step": 1050 + }, + { + "entropy": 1.0156038105487823, + "epoch": 0.42044204420442044, + "grad_norm": 0.27039048075675964, + "learning_rate": 0.00013725650683081556, + "loss": 0.991, + "mean_token_accuracy": 0.7194395214319229, + "num_tokens": 8893066.0, + "step": 1051 + }, + { + "entropy": 1.0353493839502335, + "epoch": 0.42084208420842084, + "grad_norm": 0.26982343196868896, + "learning_rate": 0.00013714536431676198, + "loss": 1.0154, + "mean_token_accuracy": 0.7048549950122833, + "num_tokens": 8901237.0, + "step": 1052 + }, + { + "entropy": 1.073839321732521, + "epoch": 0.42124212421242124, + "grad_norm": 0.28820687532424927, + "learning_rate": 0.00013703417624395066, + "loss": 1.0322, + "mean_token_accuracy": 0.7007460743188858, + "num_tokens": 8909157.0, + "step": 1053 + }, + { + "entropy": 1.0300287902355194, + "epoch": 0.42164216421642164, + "grad_norm": 0.28941988945007324, + "learning_rate": 0.00013692294279899137, + "loss": 1.0218, + "mean_token_accuracy": 0.7061614096164703, + "num_tokens": 8917158.0, + "step": 1054 + }, + { + "entropy": 1.0641874819993973, + "epoch": 0.42204220422042205, + "grad_norm": 0.29239678382873535, + "learning_rate": 0.00013681166416857008, + "loss": 1.0559, + "mean_token_accuracy": 0.699013888835907, + "num_tokens": 8925044.0, + "step": 1055 + }, + { + "entropy": 1.0947994887828827, + "epoch": 0.42244224422442245, + "grad_norm": 0.28455162048339844, + "learning_rate": 0.00013670034053944852, + "loss": 1.089, + "mean_token_accuracy": 0.6942250430583954, + "num_tokens": 8933342.0, + "step": 1056 + }, + { + "entropy": 1.0030068457126617, + "epoch": 0.42284228422842285, + "grad_norm": 0.26619860529899597, + "learning_rate": 0.00013658897209846402, + "loss": 0.9902, + "mean_token_accuracy": 0.7261633425951004, + "num_tokens": 8941689.0, + "step": 1057 + }, + { + "entropy": 1.0688401758670807, + "epoch": 0.42324232423242325, + "grad_norm": 0.27335381507873535, + "learning_rate": 0.00013647755903252904, + "loss": 1.0855, + "mean_token_accuracy": 0.6985708028078079, + "num_tokens": 8950937.0, + "step": 1058 + }, + { + "entropy": 1.0137758702039719, + "epoch": 0.42364236423642365, + "grad_norm": 0.2710599899291992, + "learning_rate": 0.00013636610152863098, + "loss": 1.0078, + "mean_token_accuracy": 0.7089750915765762, + "num_tokens": 8959935.0, + "step": 1059 + }, + { + "entropy": 1.0563341677188873, + "epoch": 0.42404240424042405, + "grad_norm": 0.27815142273902893, + "learning_rate": 0.00013625459977383182, + "loss": 1.0616, + "mean_token_accuracy": 0.6942877024412155, + "num_tokens": 8968796.0, + "step": 1060 + }, + { + "entropy": 1.045782208442688, + "epoch": 0.42444244424442445, + "grad_norm": 0.2918384075164795, + "learning_rate": 0.0001361430539552678, + "loss": 1.0415, + "mean_token_accuracy": 0.701434388756752, + "num_tokens": 8977061.0, + "step": 1061 + }, + { + "entropy": 0.9723037481307983, + "epoch": 0.42484248424842486, + "grad_norm": 0.2681103050708771, + "learning_rate": 0.00013603146426014912, + "loss": 0.9757, + "mean_token_accuracy": 0.7213161736726761, + "num_tokens": 8985899.0, + "step": 1062 + }, + { + "entropy": 1.1067449450492859, + "epoch": 0.42524252425242526, + "grad_norm": 0.2858854830265045, + "learning_rate": 0.0001359198308757596, + "loss": 1.1232, + "mean_token_accuracy": 0.6929030865430832, + "num_tokens": 8994564.0, + "step": 1063 + }, + { + "entropy": 1.0669856071472168, + "epoch": 0.42564256425642566, + "grad_norm": 0.2883087992668152, + "learning_rate": 0.0001358081539894564, + "loss": 1.0588, + "mean_token_accuracy": 0.7063491493463516, + "num_tokens": 9002479.0, + "step": 1064 + }, + { + "entropy": 1.0177536457777023, + "epoch": 0.42604260426042606, + "grad_norm": 0.2760503590106964, + "learning_rate": 0.0001356964337886697, + "loss": 1.0312, + "mean_token_accuracy": 0.7046210169792175, + "num_tokens": 9011211.0, + "step": 1065 + }, + { + "entropy": 1.0280053317546844, + "epoch": 0.42644264426442646, + "grad_norm": 0.2825869023799896, + "learning_rate": 0.00013558467046090238, + "loss": 1.0205, + "mean_token_accuracy": 0.7057770639657974, + "num_tokens": 9020262.0, + "step": 1066 + }, + { + "entropy": 0.9986815601587296, + "epoch": 0.42684268426842686, + "grad_norm": 0.28064122796058655, + "learning_rate": 0.00013547286419372963, + "loss": 0.9991, + "mean_token_accuracy": 0.7176034897565842, + "num_tokens": 9028584.0, + "step": 1067 + }, + { + "entropy": 1.0227905958890915, + "epoch": 0.42724272427242727, + "grad_norm": 0.26080429553985596, + "learning_rate": 0.00013536101517479883, + "loss": 0.9936, + "mean_token_accuracy": 0.7132055163383484, + "num_tokens": 9037597.0, + "step": 1068 + }, + { + "entropy": 1.0808660089969635, + "epoch": 0.42764276427642767, + "grad_norm": 0.2762328088283539, + "learning_rate": 0.00013524912359182896, + "loss": 1.0631, + "mean_token_accuracy": 0.6888589859008789, + "num_tokens": 9046079.0, + "step": 1069 + }, + { + "entropy": 1.0591990798711777, + "epoch": 0.42804280428042807, + "grad_norm": 0.2844753563404083, + "learning_rate": 0.0001351371896326106, + "loss": 1.036, + "mean_token_accuracy": 0.7079353630542755, + "num_tokens": 9054405.0, + "step": 1070 + }, + { + "entropy": 1.0911228507757187, + "epoch": 0.42844284428442847, + "grad_norm": 0.29953333735466003, + "learning_rate": 0.00013502521348500532, + "loss": 1.1155, + "mean_token_accuracy": 0.6843238472938538, + "num_tokens": 9062871.0, + "step": 1071 + }, + { + "entropy": 1.0610769093036652, + "epoch": 0.4288428842884288, + "grad_norm": 0.2912682294845581, + "learning_rate": 0.00013491319533694558, + "loss": 1.0741, + "mean_token_accuracy": 0.6926146000623703, + "num_tokens": 9071278.0, + "step": 1072 + }, + { + "entropy": 0.9504960030317307, + "epoch": 0.4292429242924292, + "grad_norm": 0.2632628381252289, + "learning_rate": 0.00013480113537643425, + "loss": 0.9364, + "mean_token_accuracy": 0.7235381752252579, + "num_tokens": 9080175.0, + "step": 1073 + }, + { + "entropy": 1.0200758129358292, + "epoch": 0.4296429642964296, + "grad_norm": 0.27097657322883606, + "learning_rate": 0.00013468903379154447, + "loss": 1.0006, + "mean_token_accuracy": 0.7189835608005524, + "num_tokens": 9089060.0, + "step": 1074 + }, + { + "entropy": 1.0200105756521225, + "epoch": 0.43004300430043, + "grad_norm": 0.2758127450942993, + "learning_rate": 0.0001345768907704192, + "loss": 1.0336, + "mean_token_accuracy": 0.7064051777124405, + "num_tokens": 9097828.0, + "step": 1075 + }, + { + "entropy": 1.0201881229877472, + "epoch": 0.4304430443044304, + "grad_norm": 0.2691255211830139, + "learning_rate": 0.0001344647065012709, + "loss": 1.0275, + "mean_token_accuracy": 0.7121488749980927, + "num_tokens": 9106731.0, + "step": 1076 + }, + { + "entropy": 1.1221066415309906, + "epoch": 0.4308430843084308, + "grad_norm": 0.2993243336677551, + "learning_rate": 0.00013435248117238123, + "loss": 1.1421, + "mean_token_accuracy": 0.6776039004325867, + "num_tokens": 9115035.0, + "step": 1077 + }, + { + "entropy": 1.0720972120761871, + "epoch": 0.4312431243124312, + "grad_norm": 0.2779398560523987, + "learning_rate": 0.00013424021497210098, + "loss": 1.0716, + "mean_token_accuracy": 0.6933643668889999, + "num_tokens": 9123642.0, + "step": 1078 + }, + { + "entropy": 1.0738157331943512, + "epoch": 0.4316431643164316, + "grad_norm": 0.29163047671318054, + "learning_rate": 0.00013412790808884923, + "loss": 1.0381, + "mean_token_accuracy": 0.7066617906093597, + "num_tokens": 9131864.0, + "step": 1079 + }, + { + "entropy": 1.109875112771988, + "epoch": 0.43204320432043203, + "grad_norm": 0.2966859042644501, + "learning_rate": 0.00013401556071111358, + "loss": 1.1007, + "mean_token_accuracy": 0.687449187040329, + "num_tokens": 9140269.0, + "step": 1080 + }, + { + "entropy": 1.107882410287857, + "epoch": 0.43244324432443243, + "grad_norm": 0.28887271881103516, + "learning_rate": 0.0001339031730274495, + "loss": 1.0736, + "mean_token_accuracy": 0.6897438317537308, + "num_tokens": 9148732.0, + "step": 1081 + }, + { + "entropy": 1.0696381032466888, + "epoch": 0.43284328432843283, + "grad_norm": 0.2797048091888428, + "learning_rate": 0.00013379074522648, + "loss": 1.0449, + "mean_token_accuracy": 0.7049586474895477, + "num_tokens": 9157526.0, + "step": 1082 + }, + { + "entropy": 1.0501088351011276, + "epoch": 0.43324332433243323, + "grad_norm": 0.27868708968162537, + "learning_rate": 0.0001336782774968957, + "loss": 1.0028, + "mean_token_accuracy": 0.7111310511827469, + "num_tokens": 9165865.0, + "step": 1083 + }, + { + "entropy": 1.0442237854003906, + "epoch": 0.43364336433643363, + "grad_norm": 0.27140721678733826, + "learning_rate": 0.0001335657700274539, + "loss": 1.0335, + "mean_token_accuracy": 0.7035298943519592, + "num_tokens": 9174174.0, + "step": 1084 + }, + { + "entropy": 1.0614895671606064, + "epoch": 0.43404340434043404, + "grad_norm": 0.2773546874523163, + "learning_rate": 0.00013345322300697886, + "loss": 1.0804, + "mean_token_accuracy": 0.6917308121919632, + "num_tokens": 9183266.0, + "step": 1085 + }, + { + "entropy": 1.008152112364769, + "epoch": 0.43444344434443444, + "grad_norm": 0.2809455096721649, + "learning_rate": 0.00013334063662436108, + "loss": 1.0219, + "mean_token_accuracy": 0.708100438117981, + "num_tokens": 9192072.0, + "step": 1086 + }, + { + "entropy": 1.0305002331733704, + "epoch": 0.43484348434843484, + "grad_norm": 0.2724243104457855, + "learning_rate": 0.00013322801106855717, + "loss": 1.0223, + "mean_token_accuracy": 0.710504487156868, + "num_tokens": 9200477.0, + "step": 1087 + }, + { + "entropy": 1.0657826513051987, + "epoch": 0.43524352435243524, + "grad_norm": 0.27904054522514343, + "learning_rate": 0.0001331153465285894, + "loss": 1.0891, + "mean_token_accuracy": 0.6961464732885361, + "num_tokens": 9209069.0, + "step": 1088 + }, + { + "entropy": 1.0728546530008316, + "epoch": 0.43564356435643564, + "grad_norm": 0.27009743452072144, + "learning_rate": 0.00013300264319354566, + "loss": 1.0889, + "mean_token_accuracy": 0.6917052567005157, + "num_tokens": 9217591.0, + "step": 1089 + }, + { + "entropy": 1.0864580869674683, + "epoch": 0.43604360436043604, + "grad_norm": 0.2904224991798401, + "learning_rate": 0.00013288990125257883, + "loss": 1.0739, + "mean_token_accuracy": 0.6920597553253174, + "num_tokens": 9225945.0, + "step": 1090 + }, + { + "entropy": 1.117465227842331, + "epoch": 0.43644364436443644, + "grad_norm": 0.2865927815437317, + "learning_rate": 0.00013277712089490646, + "loss": 1.1253, + "mean_token_accuracy": 0.6862503290176392, + "num_tokens": 9234548.0, + "step": 1091 + }, + { + "entropy": 1.0954654812812805, + "epoch": 0.43684368436843685, + "grad_norm": 0.26863545179367065, + "learning_rate": 0.0001326643023098108, + "loss": 1.0697, + "mean_token_accuracy": 0.6935992538928986, + "num_tokens": 9243416.0, + "step": 1092 + }, + { + "entropy": 1.1119301319122314, + "epoch": 0.43724372437243725, + "grad_norm": 0.276238352060318, + "learning_rate": 0.0001325514456866382, + "loss": 1.0758, + "mean_token_accuracy": 0.6904617846012115, + "num_tokens": 9251689.0, + "step": 1093 + }, + { + "entropy": 1.032733991742134, + "epoch": 0.43764376437643765, + "grad_norm": 0.2758907675743103, + "learning_rate": 0.0001324385512147987, + "loss": 1.0232, + "mean_token_accuracy": 0.706854984164238, + "num_tokens": 9260113.0, + "step": 1094 + }, + { + "entropy": 1.0635818541049957, + "epoch": 0.43804380438043805, + "grad_norm": 0.28907299041748047, + "learning_rate": 0.00013232561908376603, + "loss": 1.0692, + "mean_token_accuracy": 0.6992032378911972, + "num_tokens": 9268211.0, + "step": 1095 + }, + { + "entropy": 1.0752255469560623, + "epoch": 0.43844384438443845, + "grad_norm": 0.2850283980369568, + "learning_rate": 0.0001322126494830771, + "loss": 1.053, + "mean_token_accuracy": 0.6939669996500015, + "num_tokens": 9276457.0, + "step": 1096 + }, + { + "entropy": 1.008211925625801, + "epoch": 0.43884388438843885, + "grad_norm": 0.2715367078781128, + "learning_rate": 0.0001320996426023316, + "loss": 0.9909, + "mean_token_accuracy": 0.7140958458185196, + "num_tokens": 9285285.0, + "step": 1097 + }, + { + "entropy": 1.022285670042038, + "epoch": 0.43924392439243926, + "grad_norm": 0.2728317081928253, + "learning_rate": 0.0001319865986311919, + "loss": 1.0249, + "mean_token_accuracy": 0.705712080001831, + "num_tokens": 9294523.0, + "step": 1098 + }, + { + "entropy": 1.0105868428945541, + "epoch": 0.43964396439643966, + "grad_norm": 0.2848093807697296, + "learning_rate": 0.0001318735177593826, + "loss": 1.0378, + "mean_token_accuracy": 0.7051835805177689, + "num_tokens": 9303287.0, + "step": 1099 + }, + { + "entropy": 1.0157582014799118, + "epoch": 0.44004400440044006, + "grad_norm": 0.27847665548324585, + "learning_rate": 0.0001317604001766902, + "loss": 0.9756, + "mean_token_accuracy": 0.7164225578308105, + "num_tokens": 9311887.0, + "step": 1100 + }, + { + "entropy": 0.978501096367836, + "epoch": 0.44044404440444046, + "grad_norm": 0.27765464782714844, + "learning_rate": 0.00013164724607296285, + "loss": 0.9991, + "mean_token_accuracy": 0.7061675041913986, + "num_tokens": 9320467.0, + "step": 1101 + }, + { + "entropy": 0.9910889863967896, + "epoch": 0.44084408440844086, + "grad_norm": 3.775918483734131, + "learning_rate": 0.0001315340556381099, + "loss": 0.9972, + "mean_token_accuracy": 0.7156453728675842, + "num_tokens": 9329391.0, + "step": 1102 + }, + { + "entropy": 1.0246813148260117, + "epoch": 0.44124412441244126, + "grad_norm": 0.27218103408813477, + "learning_rate": 0.0001314208290621018, + "loss": 1.0317, + "mean_token_accuracy": 0.7027438580989838, + "num_tokens": 9338010.0, + "step": 1103 + }, + { + "entropy": 1.0212002247571945, + "epoch": 0.44164416441644166, + "grad_norm": 0.2997588813304901, + "learning_rate": 0.0001313075665349696, + "loss": 1.0397, + "mean_token_accuracy": 0.7081174552440643, + "num_tokens": 9346696.0, + "step": 1104 + }, + { + "entropy": 0.9461395740509033, + "epoch": 0.44204420442044207, + "grad_norm": 0.26615843176841736, + "learning_rate": 0.00013119426824680466, + "loss": 0.9504, + "mean_token_accuracy": 0.7210829704999924, + "num_tokens": 9355659.0, + "step": 1105 + }, + { + "entropy": 1.004364088177681, + "epoch": 0.44244424442444247, + "grad_norm": 0.26553845405578613, + "learning_rate": 0.00013108093438775838, + "loss": 1.0097, + "mean_token_accuracy": 0.7155507951974869, + "num_tokens": 9364135.0, + "step": 1106 + }, + { + "entropy": 1.0103053450584412, + "epoch": 0.44284428442844287, + "grad_norm": 0.27923908829689026, + "learning_rate": 0.00013096756514804195, + "loss": 1.0138, + "mean_token_accuracy": 0.7101486325263977, + "num_tokens": 9372527.0, + "step": 1107 + }, + { + "entropy": 1.0269133895635605, + "epoch": 0.44324432443244327, + "grad_norm": 0.2674485743045807, + "learning_rate": 0.00013085416071792583, + "loss": 0.9751, + "mean_token_accuracy": 0.7137272208929062, + "num_tokens": 9381125.0, + "step": 1108 + }, + { + "entropy": 1.0579135715961456, + "epoch": 0.4436443644364436, + "grad_norm": 0.26586848497390747, + "learning_rate": 0.00013074072128773948, + "loss": 1.0219, + "mean_token_accuracy": 0.7124139070510864, + "num_tokens": 9390032.0, + "step": 1109 + }, + { + "entropy": 1.0446073412895203, + "epoch": 0.444044404440444, + "grad_norm": 0.2824837565422058, + "learning_rate": 0.00013062724704787128, + "loss": 1.0082, + "mean_token_accuracy": 0.7081329077482224, + "num_tokens": 9398519.0, + "step": 1110 + }, + { + "entropy": 1.061277151107788, + "epoch": 0.4444444444444444, + "grad_norm": 0.2836729884147644, + "learning_rate": 0.00013051373818876794, + "loss": 1.0535, + "mean_token_accuracy": 0.6958392560482025, + "num_tokens": 9406556.0, + "step": 1111 + }, + { + "entropy": 1.0872269868850708, + "epoch": 0.4448444844484448, + "grad_norm": 0.29192447662353516, + "learning_rate": 0.0001304001949009342, + "loss": 1.0609, + "mean_token_accuracy": 0.6928906589746475, + "num_tokens": 9414514.0, + "step": 1112 + }, + { + "entropy": 0.9852748960256577, + "epoch": 0.4452445244524452, + "grad_norm": 0.2699228823184967, + "learning_rate": 0.00013028661737493274, + "loss": 0.9536, + "mean_token_accuracy": 0.7223382592201233, + "num_tokens": 9423170.0, + "step": 1113 + }, + { + "entropy": 1.057640478014946, + "epoch": 0.4456445644564456, + "grad_norm": 0.2802077531814575, + "learning_rate": 0.0001301730058013836, + "loss": 1.0995, + "mean_token_accuracy": 0.6964540928602219, + "num_tokens": 9431975.0, + "step": 1114 + }, + { + "entropy": 0.9976014345884323, + "epoch": 0.446044604460446, + "grad_norm": 0.3011581301689148, + "learning_rate": 0.00013005936037096397, + "loss": 1.0274, + "mean_token_accuracy": 0.7089135646820068, + "num_tokens": 9441094.0, + "step": 1115 + }, + { + "entropy": 0.961296558380127, + "epoch": 0.4464446444644464, + "grad_norm": 0.2698821723461151, + "learning_rate": 0.0001299456812744079, + "loss": 0.9653, + "mean_token_accuracy": 0.7177059799432755, + "num_tokens": 9450028.0, + "step": 1116 + }, + { + "entropy": 0.9484410583972931, + "epoch": 0.44684468446844683, + "grad_norm": 0.27624866366386414, + "learning_rate": 0.00012983196870250586, + "loss": 0.9737, + "mean_token_accuracy": 0.7100328207015991, + "num_tokens": 9458937.0, + "step": 1117 + }, + { + "entropy": 1.068996086716652, + "epoch": 0.44724472447244723, + "grad_norm": 0.29967278242111206, + "learning_rate": 0.00012971822284610465, + "loss": 1.0742, + "mean_token_accuracy": 0.6923859566450119, + "num_tokens": 9467077.0, + "step": 1118 + }, + { + "entropy": 1.0366225838661194, + "epoch": 0.44764476447644763, + "grad_norm": 0.2766386866569519, + "learning_rate": 0.00012960444389610676, + "loss": 1.0371, + "mean_token_accuracy": 0.706090047955513, + "num_tokens": 9475481.0, + "step": 1119 + }, + { + "entropy": 1.0347706973552704, + "epoch": 0.44804480448044803, + "grad_norm": 0.2707776129245758, + "learning_rate": 0.00012949063204347036, + "loss": 1.0249, + "mean_token_accuracy": 0.7036603093147278, + "num_tokens": 9484221.0, + "step": 1120 + }, + { + "entropy": 1.0510926991701126, + "epoch": 0.44844484448444843, + "grad_norm": 0.28996744751930237, + "learning_rate": 0.00012937678747920874, + "loss": 1.0355, + "mean_token_accuracy": 0.7060926407575607, + "num_tokens": 9492893.0, + "step": 1121 + }, + { + "entropy": 1.105236679315567, + "epoch": 0.44884488448844884, + "grad_norm": 0.2742345631122589, + "learning_rate": 0.0001292629103943902, + "loss": 1.0782, + "mean_token_accuracy": 0.6901605576276779, + "num_tokens": 9501744.0, + "step": 1122 + }, + { + "entropy": 1.07321497797966, + "epoch": 0.44924492449244924, + "grad_norm": 0.28404390811920166, + "learning_rate": 0.00012914900098013753, + "loss": 1.0669, + "mean_token_accuracy": 0.7028540223836899, + "num_tokens": 9510027.0, + "step": 1123 + }, + { + "entropy": 1.1015549898147583, + "epoch": 0.44964496449644964, + "grad_norm": 0.2823750972747803, + "learning_rate": 0.00012903505942762775, + "loss": 1.1081, + "mean_token_accuracy": 0.6984256058931351, + "num_tokens": 9518840.0, + "step": 1124 + }, + { + "entropy": 1.052079826593399, + "epoch": 0.45004500450045004, + "grad_norm": 0.25515690445899963, + "learning_rate": 0.00012892108592809197, + "loss": 1.0425, + "mean_token_accuracy": 0.7003075927495956, + "num_tokens": 9528091.0, + "step": 1125 + }, + { + "entropy": 1.0781229138374329, + "epoch": 0.45044504450445044, + "grad_norm": 0.2739901840686798, + "learning_rate": 0.00012880708067281477, + "loss": 1.0819, + "mean_token_accuracy": 0.6893839240074158, + "num_tokens": 9536899.0, + "step": 1126 + }, + { + "entropy": 1.075578659772873, + "epoch": 0.45084508450845084, + "grad_norm": 0.26945534348487854, + "learning_rate": 0.0001286930438531341, + "loss": 1.0433, + "mean_token_accuracy": 0.7041110694408417, + "num_tokens": 9545360.0, + "step": 1127 + }, + { + "entropy": 1.0728818476200104, + "epoch": 0.45124512451245125, + "grad_norm": 0.2697220742702484, + "learning_rate": 0.00012857897566044083, + "loss": 1.0389, + "mean_token_accuracy": 0.7042641639709473, + "num_tokens": 9553461.0, + "step": 1128 + }, + { + "entropy": 1.0571295022964478, + "epoch": 0.45164516451645165, + "grad_norm": 0.28375330567359924, + "learning_rate": 0.00012846487628617853, + "loss": 1.0462, + "mean_token_accuracy": 0.7024986445903778, + "num_tokens": 9561895.0, + "step": 1129 + }, + { + "entropy": 1.1276902556419373, + "epoch": 0.45204520452045205, + "grad_norm": 0.2849422097206116, + "learning_rate": 0.00012835074592184318, + "loss": 1.1399, + "mean_token_accuracy": 0.6824175119400024, + "num_tokens": 9570485.0, + "step": 1130 + }, + { + "entropy": 1.1072853207588196, + "epoch": 0.45244524452445245, + "grad_norm": 0.29853615164756775, + "learning_rate": 0.00012823658475898252, + "loss": 1.1147, + "mean_token_accuracy": 0.6860185712575912, + "num_tokens": 9578491.0, + "step": 1131 + }, + { + "entropy": 1.0149793177843094, + "epoch": 0.45284528452845285, + "grad_norm": 0.28600630164146423, + "learning_rate": 0.00012812239298919627, + "loss": 1.014, + "mean_token_accuracy": 0.704537644982338, + "num_tokens": 9586838.0, + "step": 1132 + }, + { + "entropy": 1.1063433587551117, + "epoch": 0.45324532453245325, + "grad_norm": 0.3022083640098572, + "learning_rate": 0.00012800817080413533, + "loss": 1.1467, + "mean_token_accuracy": 0.6827908307313919, + "num_tokens": 9595267.0, + "step": 1133 + }, + { + "entropy": 1.0249161571264267, + "epoch": 0.45364536453645365, + "grad_norm": 0.2835840880870819, + "learning_rate": 0.00012789391839550178, + "loss": 1.0456, + "mean_token_accuracy": 0.7070167511701584, + "num_tokens": 9604011.0, + "step": 1134 + }, + { + "entropy": 1.0561366230249405, + "epoch": 0.45404540454045406, + "grad_norm": 0.2859001159667969, + "learning_rate": 0.00012777963595504824, + "loss": 1.0675, + "mean_token_accuracy": 0.6980157792568207, + "num_tokens": 9612216.0, + "step": 1135 + }, + { + "entropy": 1.0399661362171173, + "epoch": 0.45444544454445446, + "grad_norm": 0.29342570900917053, + "learning_rate": 0.00012766532367457798, + "loss": 1.0374, + "mean_token_accuracy": 0.7106993645429611, + "num_tokens": 9620504.0, + "step": 1136 + }, + { + "entropy": 1.0340932309627533, + "epoch": 0.45484548454845486, + "grad_norm": 0.2839612662792206, + "learning_rate": 0.00012755098174594413, + "loss": 1.0287, + "mean_token_accuracy": 0.7077745944261551, + "num_tokens": 9629453.0, + "step": 1137 + }, + { + "entropy": 1.100700169801712, + "epoch": 0.45524552455245526, + "grad_norm": 0.2859300971031189, + "learning_rate": 0.00012743661036104966, + "loss": 1.0797, + "mean_token_accuracy": 0.7003969699144363, + "num_tokens": 9637783.0, + "step": 1138 + }, + { + "entropy": 1.0822851955890656, + "epoch": 0.45564556455645566, + "grad_norm": 0.2751210033893585, + "learning_rate": 0.00012732220971184706, + "loss": 1.0728, + "mean_token_accuracy": 0.6973112225532532, + "num_tokens": 9646257.0, + "step": 1139 + }, + { + "entropy": 1.104035496711731, + "epoch": 0.45604560456045606, + "grad_norm": 0.28797420859336853, + "learning_rate": 0.00012720777999033776, + "loss": 1.0954, + "mean_token_accuracy": 0.6958659440279007, + "num_tokens": 9654530.0, + "step": 1140 + }, + { + "entropy": 1.0562431812286377, + "epoch": 0.45644564456445647, + "grad_norm": 0.26881882548332214, + "learning_rate": 0.00012709332138857214, + "loss": 1.0295, + "mean_token_accuracy": 0.7127672582864761, + "num_tokens": 9663434.0, + "step": 1141 + }, + { + "entropy": 1.0703443586826324, + "epoch": 0.45684568456845687, + "grad_norm": 0.2677030563354492, + "learning_rate": 0.00012697883409864896, + "loss": 1.0616, + "mean_token_accuracy": 0.6976016610860825, + "num_tokens": 9672461.0, + "step": 1142 + }, + { + "entropy": 1.0949411541223526, + "epoch": 0.45724572457245727, + "grad_norm": 0.28501182794570923, + "learning_rate": 0.00012686431831271524, + "loss": 1.103, + "mean_token_accuracy": 0.6902806162834167, + "num_tokens": 9680528.0, + "step": 1143 + }, + { + "entropy": 1.005786269903183, + "epoch": 0.45764576457645767, + "grad_norm": 0.25482404232025146, + "learning_rate": 0.00012674977422296566, + "loss": 0.9894, + "mean_token_accuracy": 0.7139841914176941, + "num_tokens": 9689527.0, + "step": 1144 + }, + { + "entropy": 1.1074579656124115, + "epoch": 0.45804580458045807, + "grad_norm": 0.2734793424606323, + "learning_rate": 0.0001266352020216425, + "loss": 1.1117, + "mean_token_accuracy": 0.6888532489538193, + "num_tokens": 9698009.0, + "step": 1145 + }, + { + "entropy": 1.0540065169334412, + "epoch": 0.4584458445844585, + "grad_norm": 0.28011709451675415, + "learning_rate": 0.00012652060190103525, + "loss": 1.0338, + "mean_token_accuracy": 0.706258237361908, + "num_tokens": 9706216.0, + "step": 1146 + }, + { + "entropy": 1.0336142927408218, + "epoch": 0.4588458845884588, + "grad_norm": 0.269119530916214, + "learning_rate": 0.00012640597405348021, + "loss": 1.0131, + "mean_token_accuracy": 0.7098353654146194, + "num_tokens": 9714798.0, + "step": 1147 + }, + { + "entropy": 1.0047630667686462, + "epoch": 0.4592459245924592, + "grad_norm": 0.28839540481567383, + "learning_rate": 0.0001262913186713602, + "loss": 1.0122, + "mean_token_accuracy": 0.7081219106912613, + "num_tokens": 9722990.0, + "step": 1148 + }, + { + "entropy": 1.026741698384285, + "epoch": 0.4596459645964596, + "grad_norm": 0.2674139142036438, + "learning_rate": 0.00012617663594710428, + "loss": 1.0042, + "mean_token_accuracy": 0.7052233070135117, + "num_tokens": 9731343.0, + "step": 1149 + }, + { + "entropy": 1.0402609556913376, + "epoch": 0.46004600460046, + "grad_norm": 0.26873916387557983, + "learning_rate": 0.0001260619260731874, + "loss": 1.028, + "mean_token_accuracy": 0.7058455795049667, + "num_tokens": 9739848.0, + "step": 1150 + }, + { + "entropy": 1.0488994717597961, + "epoch": 0.4604460446044604, + "grad_norm": 0.27595794200897217, + "learning_rate": 0.00012594718924213008, + "loss": 1.046, + "mean_token_accuracy": 0.6971295177936554, + "num_tokens": 9748360.0, + "step": 1151 + }, + { + "entropy": 1.0770216286182404, + "epoch": 0.4608460846084608, + "grad_norm": 0.2854619026184082, + "learning_rate": 0.000125832425646498, + "loss": 1.0968, + "mean_token_accuracy": 0.6906839609146118, + "num_tokens": 9757092.0, + "step": 1152 + }, + { + "entropy": 1.0207491368055344, + "epoch": 0.46124612461246123, + "grad_norm": 0.31677699089050293, + "learning_rate": 0.00012571763547890194, + "loss": 1.018, + "mean_token_accuracy": 0.7025559991598129, + "num_tokens": 9765842.0, + "step": 1153 + }, + { + "entropy": 1.1116138696670532, + "epoch": 0.46164616461646163, + "grad_norm": 0.2786920666694641, + "learning_rate": 0.0001256028189319971, + "loss": 1.1158, + "mean_token_accuracy": 0.6885127276182175, + "num_tokens": 9774402.0, + "step": 1154 + }, + { + "entropy": 1.0323432981967926, + "epoch": 0.46204620462046203, + "grad_norm": 0.28469038009643555, + "learning_rate": 0.00012548797619848302, + "loss": 1.0265, + "mean_token_accuracy": 0.7077233344316483, + "num_tokens": 9782160.0, + "step": 1155 + }, + { + "entropy": 1.046909049153328, + "epoch": 0.46244624462446243, + "grad_norm": 0.27737781405448914, + "learning_rate": 0.00012537310747110322, + "loss": 1.0367, + "mean_token_accuracy": 0.7028863728046417, + "num_tokens": 9790295.0, + "step": 1156 + }, + { + "entropy": 1.1289311945438385, + "epoch": 0.46284628462846283, + "grad_norm": 0.2808120548725128, + "learning_rate": 0.00012525821294264483, + "loss": 1.095, + "mean_token_accuracy": 0.6931657195091248, + "num_tokens": 9798709.0, + "step": 1157 + }, + { + "entropy": 1.1053792238235474, + "epoch": 0.46324632463246324, + "grad_norm": 0.28860142827033997, + "learning_rate": 0.00012514329280593822, + "loss": 1.0952, + "mean_token_accuracy": 0.6944225430488586, + "num_tokens": 9807172.0, + "step": 1158 + }, + { + "entropy": 1.0881357640028, + "epoch": 0.46364636463646364, + "grad_norm": 0.27057912945747375, + "learning_rate": 0.0001250283472538568, + "loss": 1.0431, + "mean_token_accuracy": 0.7000266313552856, + "num_tokens": 9816365.0, + "step": 1159 + }, + { + "entropy": 1.1066660284996033, + "epoch": 0.46404640464046404, + "grad_norm": 0.28617826104164124, + "learning_rate": 0.00012491337647931668, + "loss": 1.0889, + "mean_token_accuracy": 0.691657304763794, + "num_tokens": 9824363.0, + "step": 1160 + }, + { + "entropy": 1.001677080988884, + "epoch": 0.46444644464446444, + "grad_norm": 0.27024298906326294, + "learning_rate": 0.00012479838067527615, + "loss": 0.9825, + "mean_token_accuracy": 0.7192128002643585, + "num_tokens": 9832812.0, + "step": 1161 + }, + { + "entropy": 1.0446459203958511, + "epoch": 0.46484648464846484, + "grad_norm": 0.276801735162735, + "learning_rate": 0.0001246833600347357, + "loss": 1.0475, + "mean_token_accuracy": 0.7010513544082642, + "num_tokens": 9841496.0, + "step": 1162 + }, + { + "entropy": 1.050861656665802, + "epoch": 0.46524652465246524, + "grad_norm": 0.27767911553382874, + "learning_rate": 0.00012456831475073733, + "loss": 1.0362, + "mean_token_accuracy": 0.7021040171384811, + "num_tokens": 9849719.0, + "step": 1163 + }, + { + "entropy": 1.0699202418327332, + "epoch": 0.46564656465646564, + "grad_norm": 0.28933465480804443, + "learning_rate": 0.0001244532450163645, + "loss": 1.0832, + "mean_token_accuracy": 0.692017450928688, + "num_tokens": 9857945.0, + "step": 1164 + }, + { + "entropy": 1.0126815885305405, + "epoch": 0.46604660466046605, + "grad_norm": 0.31153056025505066, + "learning_rate": 0.00012433815102474172, + "loss": 1.0226, + "mean_token_accuracy": 0.7060019373893738, + "num_tokens": 9867012.0, + "step": 1165 + }, + { + "entropy": 1.0489352345466614, + "epoch": 0.46644664466446645, + "grad_norm": 0.27665770053863525, + "learning_rate": 0.00012422303296903407, + "loss": 1.0633, + "mean_token_accuracy": 0.6941277831792831, + "num_tokens": 9875589.0, + "step": 1166 + }, + { + "entropy": 0.9545290023088455, + "epoch": 0.46684668466846685, + "grad_norm": 0.27470943331718445, + "learning_rate": 0.0001241078910424473, + "loss": 0.9548, + "mean_token_accuracy": 0.7193215638399124, + "num_tokens": 9883814.0, + "step": 1167 + }, + { + "entropy": 1.022799789905548, + "epoch": 0.46724672467246725, + "grad_norm": 0.277195543050766, + "learning_rate": 0.00012399272543822687, + "loss": 1.0367, + "mean_token_accuracy": 0.6937270313501358, + "num_tokens": 9892320.0, + "step": 1168 + }, + { + "entropy": 1.0433712303638458, + "epoch": 0.46764676467646765, + "grad_norm": 0.2765945494174957, + "learning_rate": 0.00012387753634965823, + "loss": 1.0369, + "mean_token_accuracy": 0.7028937488794327, + "num_tokens": 9900730.0, + "step": 1169 + }, + { + "entropy": 1.0591256469488144, + "epoch": 0.46804680468046805, + "grad_norm": 0.2880633771419525, + "learning_rate": 0.00012376232397006616, + "loss": 1.0813, + "mean_token_accuracy": 0.694982573390007, + "num_tokens": 9908657.0, + "step": 1170 + }, + { + "entropy": 1.0674507915973663, + "epoch": 0.46844684468446846, + "grad_norm": 0.28274503350257874, + "learning_rate": 0.00012364708849281453, + "loss": 1.0586, + "mean_token_accuracy": 0.6906309872865677, + "num_tokens": 9916969.0, + "step": 1171 + }, + { + "entropy": 1.049183338880539, + "epoch": 0.46884688468846886, + "grad_norm": 0.2749859094619751, + "learning_rate": 0.000123531830111306, + "loss": 1.0575, + "mean_token_accuracy": 0.7043784409761429, + "num_tokens": 9925646.0, + "step": 1172 + }, + { + "entropy": 0.9694782495498657, + "epoch": 0.46924692469246926, + "grad_norm": 0.25871407985687256, + "learning_rate": 0.00012341654901898162, + "loss": 0.9384, + "mean_token_accuracy": 0.7253002226352692, + "num_tokens": 9934372.0, + "step": 1173 + }, + { + "entropy": 1.0723985135555267, + "epoch": 0.46964696469646966, + "grad_norm": 0.27434173226356506, + "learning_rate": 0.0001233012454093206, + "loss": 1.0564, + "mean_token_accuracy": 0.7020715475082397, + "num_tokens": 9942756.0, + "step": 1174 + }, + { + "entropy": 1.0543183088302612, + "epoch": 0.47004700470047006, + "grad_norm": 0.266204297542572, + "learning_rate": 0.00012318591947583994, + "loss": 1.0166, + "mean_token_accuracy": 0.7074469327926636, + "num_tokens": 9951531.0, + "step": 1175 + }, + { + "entropy": 1.047581046819687, + "epoch": 0.47044704470447046, + "grad_norm": 0.28069522976875305, + "learning_rate": 0.00012307057141209415, + "loss": 1.0526, + "mean_token_accuracy": 0.6994545012712479, + "num_tokens": 9959470.0, + "step": 1176 + }, + { + "entropy": 1.0692080706357956, + "epoch": 0.47084708470847086, + "grad_norm": 0.28120487928390503, + "learning_rate": 0.00012295520141167472, + "loss": 1.0844, + "mean_token_accuracy": 0.6893647909164429, + "num_tokens": 9967904.0, + "step": 1177 + }, + { + "entropy": 1.0024761706590652, + "epoch": 0.47124712471247127, + "grad_norm": 0.26624584197998047, + "learning_rate": 0.00012283980966821013, + "loss": 0.9834, + "mean_token_accuracy": 0.7126830667257309, + "num_tokens": 9976789.0, + "step": 1178 + }, + { + "entropy": 1.0361256003379822, + "epoch": 0.47164716471647167, + "grad_norm": 0.27504217624664307, + "learning_rate": 0.00012272439637536534, + "loss": 1.0445, + "mean_token_accuracy": 0.7050766050815582, + "num_tokens": 9985459.0, + "step": 1179 + }, + { + "entropy": 1.0629410147666931, + "epoch": 0.47204720472047207, + "grad_norm": 0.27471524477005005, + "learning_rate": 0.00012260896172684127, + "loss": 1.055, + "mean_token_accuracy": 0.6989741027355194, + "num_tokens": 9993929.0, + "step": 1180 + }, + { + "entropy": 1.0462766587734222, + "epoch": 0.47244724472447247, + "grad_norm": 0.2865782678127289, + "learning_rate": 0.00012249350591637503, + "loss": 1.042, + "mean_token_accuracy": 0.6999727338552475, + "num_tokens": 10001748.0, + "step": 1181 + }, + { + "entropy": 1.1149352192878723, + "epoch": 0.47284728472847287, + "grad_norm": 0.28089362382888794, + "learning_rate": 0.00012237802913773888, + "loss": 1.1013, + "mean_token_accuracy": 0.6825429499149323, + "num_tokens": 10010101.0, + "step": 1182 + }, + { + "entropy": 1.07693812251091, + "epoch": 0.4732473247324733, + "grad_norm": 0.2729126513004303, + "learning_rate": 0.00012226253158474057, + "loss": 1.0438, + "mean_token_accuracy": 0.6978691816329956, + "num_tokens": 10018398.0, + "step": 1183 + }, + { + "entropy": 1.05135877430439, + "epoch": 0.4736473647364737, + "grad_norm": 0.2718242108821869, + "learning_rate": 0.0001221470134512225, + "loss": 1.0607, + "mean_token_accuracy": 0.7074489444494247, + "num_tokens": 10026781.0, + "step": 1184 + }, + { + "entropy": 1.0263345539569855, + "epoch": 0.474047404740474, + "grad_norm": 0.6789280772209167, + "learning_rate": 0.00012203147493106177, + "loss": 1.0024, + "mean_token_accuracy": 0.7096521854400635, + "num_tokens": 10035261.0, + "step": 1185 + }, + { + "entropy": 1.0280724912881851, + "epoch": 0.4744474447444744, + "grad_norm": 0.27539801597595215, + "learning_rate": 0.0001219159162181696, + "loss": 1.0143, + "mean_token_accuracy": 0.7133204191923141, + "num_tokens": 10043921.0, + "step": 1186 + }, + { + "entropy": 0.9636574983596802, + "epoch": 0.4748474847484748, + "grad_norm": 0.28285086154937744, + "learning_rate": 0.00012180033750649112, + "loss": 0.9457, + "mean_token_accuracy": 0.7227327674627304, + "num_tokens": 10052454.0, + "step": 1187 + }, + { + "entropy": 1.0144190043210983, + "epoch": 0.4752475247524752, + "grad_norm": 0.269448846578598, + "learning_rate": 0.00012168473899000501, + "loss": 1.0292, + "mean_token_accuracy": 0.7110306620597839, + "num_tokens": 10061638.0, + "step": 1188 + }, + { + "entropy": 1.0919167697429657, + "epoch": 0.4756475647564756, + "grad_norm": 0.28879913687705994, + "learning_rate": 0.0001215691208627233, + "loss": 1.1159, + "mean_token_accuracy": 0.6884833872318268, + "num_tokens": 10069893.0, + "step": 1189 + }, + { + "entropy": 1.085253357887268, + "epoch": 0.47604760476047603, + "grad_norm": 0.28444284200668335, + "learning_rate": 0.00012145348331869075, + "loss": 1.1204, + "mean_token_accuracy": 0.6920699924230576, + "num_tokens": 10078415.0, + "step": 1190 + }, + { + "entropy": 0.953729435801506, + "epoch": 0.47644764476447643, + "grad_norm": 0.3254362642765045, + "learning_rate": 0.00012133782655198482, + "loss": 0.9629, + "mean_token_accuracy": 0.7239073067903519, + "num_tokens": 10087226.0, + "step": 1191 + }, + { + "entropy": 1.0226439833641052, + "epoch": 0.47684768476847683, + "grad_norm": 0.26624399423599243, + "learning_rate": 0.00012122215075671526, + "loss": 1.0328, + "mean_token_accuracy": 0.7047979682683945, + "num_tokens": 10096193.0, + "step": 1192 + }, + { + "entropy": 1.0484410375356674, + "epoch": 0.47724772477247723, + "grad_norm": 0.2892693877220154, + "learning_rate": 0.00012110645612702365, + "loss": 1.0451, + "mean_token_accuracy": 0.7011135369539261, + "num_tokens": 10104621.0, + "step": 1193 + }, + { + "entropy": 1.0708759278059006, + "epoch": 0.47764776477647763, + "grad_norm": 0.27555426955223083, + "learning_rate": 0.00012099074285708329, + "loss": 1.0454, + "mean_token_accuracy": 0.7028330713510513, + "num_tokens": 10112773.0, + "step": 1194 + }, + { + "entropy": 1.040980964899063, + "epoch": 0.47804780478047804, + "grad_norm": 0.2761571407318115, + "learning_rate": 0.00012087501114109867, + "loss": 1.0273, + "mean_token_accuracy": 0.7079716473817825, + "num_tokens": 10121521.0, + "step": 1195 + }, + { + "entropy": 1.089932233095169, + "epoch": 0.47844784478447844, + "grad_norm": 0.2951667904853821, + "learning_rate": 0.00012075926117330531, + "loss": 1.0858, + "mean_token_accuracy": 0.6962671279907227, + "num_tokens": 10129267.0, + "step": 1196 + }, + { + "entropy": 1.032705768942833, + "epoch": 0.47884788478847884, + "grad_norm": 0.2880473732948303, + "learning_rate": 0.00012064349314796932, + "loss": 1.0112, + "mean_token_accuracy": 0.7052841782569885, + "num_tokens": 10137491.0, + "step": 1197 + }, + { + "entropy": 1.0878003239631653, + "epoch": 0.47924792479247924, + "grad_norm": 0.28499460220336914, + "learning_rate": 0.00012052770725938718, + "loss": 1.0814, + "mean_token_accuracy": 0.6916963458061218, + "num_tokens": 10145930.0, + "step": 1198 + }, + { + "entropy": 1.0935112237930298, + "epoch": 0.47964796479647964, + "grad_norm": 0.28688332438468933, + "learning_rate": 0.0001204119037018852, + "loss": 1.1054, + "mean_token_accuracy": 0.6919155716896057, + "num_tokens": 10154236.0, + "step": 1199 + }, + { + "entropy": 1.0754076540470123, + "epoch": 0.48004800480048004, + "grad_norm": 0.2808684706687927, + "learning_rate": 0.00012029608266981957, + "loss": 1.0406, + "mean_token_accuracy": 0.6963625550270081, + "num_tokens": 10162432.0, + "step": 1200 + }, + { + "entropy": 1.0716854929924011, + "epoch": 0.48044804480448045, + "grad_norm": 0.2715063989162445, + "learning_rate": 0.0001201802443575756, + "loss": 1.031, + "mean_token_accuracy": 0.7017153948545456, + "num_tokens": 10171054.0, + "step": 1201 + }, + { + "entropy": 1.0666370242834091, + "epoch": 0.48084808480848085, + "grad_norm": 0.29229459166526794, + "learning_rate": 0.00012006438895956771, + "loss": 1.0635, + "mean_token_accuracy": 0.6983866840600967, + "num_tokens": 10179542.0, + "step": 1202 + }, + { + "entropy": 1.0412549823522568, + "epoch": 0.48124812481248125, + "grad_norm": 0.2688295245170593, + "learning_rate": 0.00011994851667023894, + "loss": 1.0186, + "mean_token_accuracy": 0.7059452533721924, + "num_tokens": 10188289.0, + "step": 1203 + }, + { + "entropy": 1.0636851787567139, + "epoch": 0.48164816481648165, + "grad_norm": 0.30092504620552063, + "learning_rate": 0.00011983262768406079, + "loss": 1.0644, + "mean_token_accuracy": 0.6932414025068283, + "num_tokens": 10196491.0, + "step": 1204 + }, + { + "entropy": 0.975030779838562, + "epoch": 0.48204820482048205, + "grad_norm": 0.2549944519996643, + "learning_rate": 0.00011971672219553263, + "loss": 0.9944, + "mean_token_accuracy": 0.7116395086050034, + "num_tokens": 10206189.0, + "step": 1205 + }, + { + "entropy": 1.1082092225551605, + "epoch": 0.48244824482448245, + "grad_norm": 0.2835574150085449, + "learning_rate": 0.00011960080039918158, + "loss": 1.074, + "mean_token_accuracy": 0.7005282640457153, + "num_tokens": 10214005.0, + "step": 1206 + }, + { + "entropy": 1.0646294951438904, + "epoch": 0.48284828482848285, + "grad_norm": 0.26771602034568787, + "learning_rate": 0.00011948486248956226, + "loss": 1.0154, + "mean_token_accuracy": 0.705711156129837, + "num_tokens": 10222793.0, + "step": 1207 + }, + { + "entropy": 1.0140301436185837, + "epoch": 0.48324832483248326, + "grad_norm": 0.33192697167396545, + "learning_rate": 0.00011936890866125611, + "loss": 1.0, + "mean_token_accuracy": 0.7157297134399414, + "num_tokens": 10232230.0, + "step": 1208 + }, + { + "entropy": 1.043394774198532, + "epoch": 0.48364836483648366, + "grad_norm": 0.2728542685508728, + "learning_rate": 0.00011925293910887145, + "loss": 1.0055, + "mean_token_accuracy": 0.7038876116275787, + "num_tokens": 10240781.0, + "step": 1209 + }, + { + "entropy": 1.029653251171112, + "epoch": 0.48404840484048406, + "grad_norm": 0.2822779417037964, + "learning_rate": 0.00011913695402704295, + "loss": 1.0383, + "mean_token_accuracy": 0.7005468308925629, + "num_tokens": 10248807.0, + "step": 1210 + }, + { + "entropy": 1.0139478743076324, + "epoch": 0.48444844484448446, + "grad_norm": 0.2801291048526764, + "learning_rate": 0.00011902095361043131, + "loss": 1.0368, + "mean_token_accuracy": 0.7071302235126495, + "num_tokens": 10257395.0, + "step": 1211 + }, + { + "entropy": 1.0725680589675903, + "epoch": 0.48484848484848486, + "grad_norm": 0.3040355145931244, + "learning_rate": 0.00011890493805372298, + "loss": 1.1203, + "mean_token_accuracy": 0.6856051832437515, + "num_tokens": 10265509.0, + "step": 1212 + }, + { + "entropy": 1.0069702416658401, + "epoch": 0.48524852485248526, + "grad_norm": 0.2773860692977905, + "learning_rate": 0.00011878890755162988, + "loss": 1.0282, + "mean_token_accuracy": 0.7073894739151001, + "num_tokens": 10274035.0, + "step": 1213 + }, + { + "entropy": 1.047236606478691, + "epoch": 0.48564856485648566, + "grad_norm": 0.2850005328655243, + "learning_rate": 0.00011867286229888891, + "loss": 1.0494, + "mean_token_accuracy": 0.7006160318851471, + "num_tokens": 10282244.0, + "step": 1214 + }, + { + "entropy": 1.070036232471466, + "epoch": 0.48604860486048607, + "grad_norm": 0.2913147509098053, + "learning_rate": 0.0001185568024902618, + "loss": 1.0774, + "mean_token_accuracy": 0.6965321898460388, + "num_tokens": 10289870.0, + "step": 1215 + }, + { + "entropy": 1.037326529622078, + "epoch": 0.48644864486448647, + "grad_norm": 0.2713771164417267, + "learning_rate": 0.00011844072832053467, + "loss": 1.0376, + "mean_token_accuracy": 0.7031806707382202, + "num_tokens": 10298756.0, + "step": 1216 + }, + { + "entropy": 1.0518364310264587, + "epoch": 0.48684868486848687, + "grad_norm": 0.2933819890022278, + "learning_rate": 0.00011832463998451779, + "loss": 1.0361, + "mean_token_accuracy": 0.6981233209371567, + "num_tokens": 10306270.0, + "step": 1217 + }, + { + "entropy": 1.0379336476325989, + "epoch": 0.48724872487248727, + "grad_norm": 0.27227991819381714, + "learning_rate": 0.00011820853767704516, + "loss": 1.0288, + "mean_token_accuracy": 0.7028944045305252, + "num_tokens": 10314656.0, + "step": 1218 + }, + { + "entropy": 1.0987060964107513, + "epoch": 0.4876487648764877, + "grad_norm": 0.26948776841163635, + "learning_rate": 0.00011809242159297427, + "loss": 1.1052, + "mean_token_accuracy": 0.6947290152311325, + "num_tokens": 10323627.0, + "step": 1219 + }, + { + "entropy": 1.0974067151546478, + "epoch": 0.4880488048804881, + "grad_norm": 0.27042073011398315, + "learning_rate": 0.00011797629192718567, + "loss": 1.0819, + "mean_token_accuracy": 0.6888881325721741, + "num_tokens": 10332314.0, + "step": 1220 + }, + { + "entropy": 1.0214581340551376, + "epoch": 0.4884488448844885, + "grad_norm": 0.2635052502155304, + "learning_rate": 0.00011786014887458276, + "loss": 1.0123, + "mean_token_accuracy": 0.7094965726137161, + "num_tokens": 10341196.0, + "step": 1221 + }, + { + "entropy": 1.0715034306049347, + "epoch": 0.4888488848884888, + "grad_norm": 0.2929968237876892, + "learning_rate": 0.00011774399263009139, + "loss": 1.0545, + "mean_token_accuracy": 0.7037334144115448, + "num_tokens": 10349562.0, + "step": 1222 + }, + { + "entropy": 1.0725442618131638, + "epoch": 0.4892489248924892, + "grad_norm": 0.26310214400291443, + "learning_rate": 0.00011762782338865959, + "loss": 1.0541, + "mean_token_accuracy": 0.703344076871872, + "num_tokens": 10358235.0, + "step": 1223 + }, + { + "entropy": 1.051314577460289, + "epoch": 0.4896489648964896, + "grad_norm": 0.2607339918613434, + "learning_rate": 0.0001175116413452571, + "loss": 1.0018, + "mean_token_accuracy": 0.7108362466096878, + "num_tokens": 10367389.0, + "step": 1224 + }, + { + "entropy": 1.1295893788337708, + "epoch": 0.49004900490049, + "grad_norm": 0.2898363471031189, + "learning_rate": 0.0001173954466948752, + "loss": 1.1392, + "mean_token_accuracy": 0.683552473783493, + "num_tokens": 10375471.0, + "step": 1225 + }, + { + "entropy": 0.9824663400650024, + "epoch": 0.4904490449044904, + "grad_norm": 0.2647607624530792, + "learning_rate": 0.0001172792396325264, + "loss": 0.9565, + "mean_token_accuracy": 0.719332680106163, + "num_tokens": 10384714.0, + "step": 1226 + }, + { + "entropy": 1.0681991577148438, + "epoch": 0.49084908490849083, + "grad_norm": 0.2768138349056244, + "learning_rate": 0.00011716302035324391, + "loss": 1.0779, + "mean_token_accuracy": 0.6966089904308319, + "num_tokens": 10392747.0, + "step": 1227 + }, + { + "entropy": 1.0471950471401215, + "epoch": 0.49124912491249123, + "grad_norm": 0.2710064947605133, + "learning_rate": 0.00011704678905208157, + "loss": 1.048, + "mean_token_accuracy": 0.6996793001890182, + "num_tokens": 10401255.0, + "step": 1228 + }, + { + "entropy": 1.0608052462339401, + "epoch": 0.49164916491649163, + "grad_norm": 0.2862861454486847, + "learning_rate": 0.00011693054592411335, + "loss": 1.0695, + "mean_token_accuracy": 0.69645856320858, + "num_tokens": 10409512.0, + "step": 1229 + }, + { + "entropy": 1.105454921722412, + "epoch": 0.49204920492049203, + "grad_norm": 0.28317776322364807, + "learning_rate": 0.00011681429116443299, + "loss": 1.1115, + "mean_token_accuracy": 0.6897053122520447, + "num_tokens": 10417719.0, + "step": 1230 + }, + { + "entropy": 1.0387613475322723, + "epoch": 0.49244924492449244, + "grad_norm": 0.28159594535827637, + "learning_rate": 0.00011669802496815387, + "loss": 1.0256, + "mean_token_accuracy": 0.7071799635887146, + "num_tokens": 10425958.0, + "step": 1231 + }, + { + "entropy": 1.0679793953895569, + "epoch": 0.49284928492849284, + "grad_norm": 0.30830681324005127, + "learning_rate": 0.00011658174753040849, + "loss": 1.0412, + "mean_token_accuracy": 0.6991352289915085, + "num_tokens": 10433869.0, + "step": 1232 + }, + { + "entropy": 0.9922660440206528, + "epoch": 0.49324932493249324, + "grad_norm": 0.27384141087532043, + "learning_rate": 0.00011646545904634827, + "loss": 0.9771, + "mean_token_accuracy": 0.7225242108106613, + "num_tokens": 10442113.0, + "step": 1233 + }, + { + "entropy": 1.0876508951187134, + "epoch": 0.49364936493649364, + "grad_norm": 0.31505510210990906, + "learning_rate": 0.00011634915971114311, + "loss": 1.1156, + "mean_token_accuracy": 0.6855295896530151, + "num_tokens": 10450432.0, + "step": 1234 + }, + { + "entropy": 1.0242749601602554, + "epoch": 0.49404940494049404, + "grad_norm": 0.27038609981536865, + "learning_rate": 0.00011623284971998117, + "loss": 1.0281, + "mean_token_accuracy": 0.7091301083564758, + "num_tokens": 10458876.0, + "step": 1235 + }, + { + "entropy": 1.053745061159134, + "epoch": 0.49444944494449444, + "grad_norm": 0.30394574999809265, + "learning_rate": 0.00011611652926806847, + "loss": 1.0607, + "mean_token_accuracy": 0.6978646367788315, + "num_tokens": 10467151.0, + "step": 1236 + }, + { + "entropy": 0.965473547577858, + "epoch": 0.49484948494849484, + "grad_norm": 0.2717350423336029, + "learning_rate": 0.00011600019855062858, + "loss": 0.9774, + "mean_token_accuracy": 0.7160736620426178, + "num_tokens": 10475832.0, + "step": 1237 + }, + { + "entropy": 0.9652626514434814, + "epoch": 0.49524952495249525, + "grad_norm": 0.2608972191810608, + "learning_rate": 0.00011588385776290236, + "loss": 0.9607, + "mean_token_accuracy": 0.7265899926424026, + "num_tokens": 10484661.0, + "step": 1238 + }, + { + "entropy": 1.047718033194542, + "epoch": 0.49564956495649565, + "grad_norm": 0.2587495446205139, + "learning_rate": 0.00011576750710014745, + "loss": 1.0143, + "mean_token_accuracy": 0.707798570394516, + "num_tokens": 10493523.0, + "step": 1239 + }, + { + "entropy": 1.065189689397812, + "epoch": 0.49604960496049605, + "grad_norm": 0.2733771800994873, + "learning_rate": 0.00011565114675763822, + "loss": 1.0581, + "mean_token_accuracy": 0.6994892358779907, + "num_tokens": 10502261.0, + "step": 1240 + }, + { + "entropy": 1.0445655137300491, + "epoch": 0.49644964496449645, + "grad_norm": 0.2651207447052002, + "learning_rate": 0.00011553477693066514, + "loss": 1.0238, + "mean_token_accuracy": 0.7065546363592148, + "num_tokens": 10510310.0, + "step": 1241 + }, + { + "entropy": 1.0850439071655273, + "epoch": 0.49684968496849685, + "grad_norm": 0.2725779414176941, + "learning_rate": 0.00011541839781453469, + "loss": 1.0541, + "mean_token_accuracy": 0.6929461508989334, + "num_tokens": 10518886.0, + "step": 1242 + }, + { + "entropy": 1.0431684851646423, + "epoch": 0.49724972497249725, + "grad_norm": 0.2630937099456787, + "learning_rate": 0.00011530200960456889, + "loss": 1.0124, + "mean_token_accuracy": 0.711039587855339, + "num_tokens": 10527636.0, + "step": 1243 + }, + { + "entropy": 1.0256138890981674, + "epoch": 0.49764976497649765, + "grad_norm": 0.26908180117607117, + "learning_rate": 0.00011518561249610507, + "loss": 1.0171, + "mean_token_accuracy": 0.7215728908777237, + "num_tokens": 10536277.0, + "step": 1244 + }, + { + "entropy": 1.0535799711942673, + "epoch": 0.49804980498049806, + "grad_norm": 0.263375848531723, + "learning_rate": 0.00011506920668449544, + "loss": 1.0596, + "mean_token_accuracy": 0.704230546951294, + "num_tokens": 10545191.0, + "step": 1245 + }, + { + "entropy": 1.0292758494615555, + "epoch": 0.49844984498449846, + "grad_norm": 0.26524925231933594, + "learning_rate": 0.00011495279236510686, + "loss": 1.0175, + "mean_token_accuracy": 0.7077385634183884, + "num_tokens": 10553758.0, + "step": 1246 + }, + { + "entropy": 1.1013007760047913, + "epoch": 0.49884988498849886, + "grad_norm": 0.27598246932029724, + "learning_rate": 0.00011483636973332045, + "loss": 1.1049, + "mean_token_accuracy": 0.6921682953834534, + "num_tokens": 10562162.0, + "step": 1247 + }, + { + "entropy": 1.031398817896843, + "epoch": 0.49924992499249926, + "grad_norm": 0.27202895283699036, + "learning_rate": 0.00011471993898453127, + "loss": 1.0397, + "mean_token_accuracy": 0.7061713486909866, + "num_tokens": 10570664.0, + "step": 1248 + }, + { + "entropy": 0.9900976419448853, + "epoch": 0.49964996499649966, + "grad_norm": 0.2520087957382202, + "learning_rate": 0.00011460350031414806, + "loss": 0.9704, + "mean_token_accuracy": 0.7221287339925766, + "num_tokens": 10579689.0, + "step": 1249 + }, + { + "entropy": 1.0074497312307358, + "epoch": 0.5000500050005, + "grad_norm": 0.2785857021808624, + "learning_rate": 0.00011448705391759274, + "loss": 1.0116, + "mean_token_accuracy": 0.7141975462436676, + "num_tokens": 10587781.0, + "step": 1250 + }, + { + "entropy": 1.0371801257133484, + "epoch": 0.5004500450045004, + "grad_norm": 0.28973206877708435, + "learning_rate": 0.00011437059999030035, + "loss": 1.0654, + "mean_token_accuracy": 0.7015185356140137, + "num_tokens": 10596047.0, + "step": 1251 + }, + { + "entropy": 1.0523166060447693, + "epoch": 0.5008500850085008, + "grad_norm": 0.27080467343330383, + "learning_rate": 0.00011425413872771846, + "loss": 1.0577, + "mean_token_accuracy": 0.7006728053092957, + "num_tokens": 10604789.0, + "step": 1252 + }, + { + "entropy": 1.0120891481637955, + "epoch": 0.5012501250125012, + "grad_norm": 0.27563995122909546, + "learning_rate": 0.00011413767032530693, + "loss": 1.0228, + "mean_token_accuracy": 0.7076671272516251, + "num_tokens": 10613888.0, + "step": 1253 + }, + { + "entropy": 1.0062814950942993, + "epoch": 0.5016501650165016, + "grad_norm": 0.28668683767318726, + "learning_rate": 0.00011402119497853774, + "loss": 0.9828, + "mean_token_accuracy": 0.7231711149215698, + "num_tokens": 10622254.0, + "step": 1254 + }, + { + "entropy": 1.0650264471769333, + "epoch": 0.502050205020502, + "grad_norm": 0.2679363787174225, + "learning_rate": 0.00011390471288289434, + "loss": 1.0545, + "mean_token_accuracy": 0.7008716613054276, + "num_tokens": 10631022.0, + "step": 1255 + }, + { + "entropy": 1.0252198576927185, + "epoch": 0.5024502450245024, + "grad_norm": 0.27498337626457214, + "learning_rate": 0.0001137882242338717, + "loss": 0.993, + "mean_token_accuracy": 0.7149540930986404, + "num_tokens": 10639344.0, + "step": 1256 + }, + { + "entropy": 1.1092785596847534, + "epoch": 0.5028502850285028, + "grad_norm": 0.2966563403606415, + "learning_rate": 0.0001136717292269756, + "loss": 1.1098, + "mean_token_accuracy": 0.6908989101648331, + "num_tokens": 10646839.0, + "step": 1257 + }, + { + "entropy": 1.057497262954712, + "epoch": 0.5032503250325032, + "grad_norm": 0.27825766801834106, + "learning_rate": 0.00011355522805772267, + "loss": 1.0333, + "mean_token_accuracy": 0.7044285386800766, + "num_tokens": 10655070.0, + "step": 1258 + }, + { + "entropy": 1.0453435629606247, + "epoch": 0.5036503650365036, + "grad_norm": 0.27042120695114136, + "learning_rate": 0.00011343872092163976, + "loss": 1.0087, + "mean_token_accuracy": 0.7102141380310059, + "num_tokens": 10663512.0, + "step": 1259 + }, + { + "entropy": 1.018640398979187, + "epoch": 0.504050405040504, + "grad_norm": 0.2638070285320282, + "learning_rate": 0.00011332220801426374, + "loss": 1.0107, + "mean_token_accuracy": 0.7088742554187775, + "num_tokens": 10672566.0, + "step": 1260 + }, + { + "entropy": 1.0925059020519257, + "epoch": 0.5044504450445044, + "grad_norm": 0.27771055698394775, + "learning_rate": 0.00011320568953114123, + "loss": 1.0988, + "mean_token_accuracy": 0.686533659696579, + "num_tokens": 10681194.0, + "step": 1261 + }, + { + "entropy": 1.0306050777435303, + "epoch": 0.5048504850485048, + "grad_norm": 0.26585057377815247, + "learning_rate": 0.00011308916566782817, + "loss": 1.0178, + "mean_token_accuracy": 0.7127333581447601, + "num_tokens": 10689825.0, + "step": 1262 + }, + { + "entropy": 1.045543909072876, + "epoch": 0.5052505250525052, + "grad_norm": 0.277130126953125, + "learning_rate": 0.00011297263661988952, + "loss": 1.0407, + "mean_token_accuracy": 0.6991783976554871, + "num_tokens": 10698303.0, + "step": 1263 + }, + { + "entropy": 1.0117101669311523, + "epoch": 0.5056505650565056, + "grad_norm": 0.2671206593513489, + "learning_rate": 0.00011285610258289895, + "loss": 1.0342, + "mean_token_accuracy": 0.7066033482551575, + "num_tokens": 10707420.0, + "step": 1264 + }, + { + "entropy": 1.0369693338871002, + "epoch": 0.506050605060506, + "grad_norm": 0.31301817297935486, + "learning_rate": 0.00011273956375243855, + "loss": 1.0624, + "mean_token_accuracy": 0.6983072012662888, + "num_tokens": 10716011.0, + "step": 1265 + }, + { + "entropy": 1.036535769701004, + "epoch": 0.5064506450645064, + "grad_norm": 0.2716148793697357, + "learning_rate": 0.0001126230203240984, + "loss": 1.0456, + "mean_token_accuracy": 0.7010018676519394, + "num_tokens": 10724738.0, + "step": 1266 + }, + { + "entropy": 1.108202874660492, + "epoch": 0.5068506850685068, + "grad_norm": 0.28018778562545776, + "learning_rate": 0.00011250647249347625, + "loss": 1.1027, + "mean_token_accuracy": 0.6950458139181137, + "num_tokens": 10732976.0, + "step": 1267 + }, + { + "entropy": 0.9922303408384323, + "epoch": 0.5072507250725072, + "grad_norm": 0.2626727223396301, + "learning_rate": 0.00011238992045617738, + "loss": 0.9886, + "mean_token_accuracy": 0.7123332321643829, + "num_tokens": 10741560.0, + "step": 1268 + }, + { + "entropy": 0.9976973235607147, + "epoch": 0.5076507650765076, + "grad_norm": 0.26754385232925415, + "learning_rate": 0.000112273364407814, + "loss": 0.9935, + "mean_token_accuracy": 0.7140461802482605, + "num_tokens": 10750145.0, + "step": 1269 + }, + { + "entropy": 1.079247385263443, + "epoch": 0.508050805080508, + "grad_norm": 0.27707529067993164, + "learning_rate": 0.0001121568045440051, + "loss": 1.073, + "mean_token_accuracy": 0.6975164860486984, + "num_tokens": 10758419.0, + "step": 1270 + }, + { + "entropy": 1.0950802564620972, + "epoch": 0.5084508450845084, + "grad_norm": 0.27835536003112793, + "learning_rate": 0.00011204024106037609, + "loss": 1.0931, + "mean_token_accuracy": 0.691558301448822, + "num_tokens": 10766993.0, + "step": 1271 + }, + { + "entropy": 1.0205723345279694, + "epoch": 0.5088508850885088, + "grad_norm": 0.27321383357048035, + "learning_rate": 0.00011192367415255846, + "loss": 0.9912, + "mean_token_accuracy": 0.7130928039550781, + "num_tokens": 10775273.0, + "step": 1272 + }, + { + "entropy": 1.0592113733291626, + "epoch": 0.5092509250925092, + "grad_norm": 0.27176037430763245, + "learning_rate": 0.00011180710401618942, + "loss": 1.075, + "mean_token_accuracy": 0.6953422427177429, + "num_tokens": 10783957.0, + "step": 1273 + }, + { + "entropy": 1.0103185027837753, + "epoch": 0.5096509650965096, + "grad_norm": 0.27208417654037476, + "learning_rate": 0.00011169053084691156, + "loss": 0.998, + "mean_token_accuracy": 0.7147505730390549, + "num_tokens": 10792683.0, + "step": 1274 + }, + { + "entropy": 1.0187568217515945, + "epoch": 0.51005100510051, + "grad_norm": 0.2991833984851837, + "learning_rate": 0.00011157395484037265, + "loss": 1.003, + "mean_token_accuracy": 0.7118721753358841, + "num_tokens": 10801284.0, + "step": 1275 + }, + { + "entropy": 0.9936603307723999, + "epoch": 0.5104510451045104, + "grad_norm": 0.27325525879859924, + "learning_rate": 0.00011145737619222516, + "loss": 0.9992, + "mean_token_accuracy": 0.7168545573949814, + "num_tokens": 10810063.0, + "step": 1276 + }, + { + "entropy": 1.049889400601387, + "epoch": 0.5108510851085108, + "grad_norm": 0.2775174677371979, + "learning_rate": 0.00011134079509812598, + "loss": 1.0235, + "mean_token_accuracy": 0.7104092687368393, + "num_tokens": 10818601.0, + "step": 1277 + }, + { + "entropy": 0.992682933807373, + "epoch": 0.5112511251125113, + "grad_norm": 0.25520840287208557, + "learning_rate": 0.00011122421175373621, + "loss": 0.9986, + "mean_token_accuracy": 0.7169641852378845, + "num_tokens": 10827626.0, + "step": 1278 + }, + { + "entropy": 1.030014991760254, + "epoch": 0.5116511651165117, + "grad_norm": 0.2913283407688141, + "learning_rate": 0.00011110762635472059, + "loss": 1.0282, + "mean_token_accuracy": 0.7072141766548157, + "num_tokens": 10835314.0, + "step": 1279 + }, + { + "entropy": 1.0588295012712479, + "epoch": 0.512051205120512, + "grad_norm": 0.2976827025413513, + "learning_rate": 0.0001109910390967474, + "loss": 1.0447, + "mean_token_accuracy": 0.7050375491380692, + "num_tokens": 10843318.0, + "step": 1280 + }, + { + "entropy": 1.0610300153493881, + "epoch": 0.5124512451245125, + "grad_norm": 0.2706773281097412, + "learning_rate": 0.00011087445017548796, + "loss": 1.0866, + "mean_token_accuracy": 0.6970679312944412, + "num_tokens": 10851648.0, + "step": 1281 + }, + { + "entropy": 1.0447220504283905, + "epoch": 0.5128512851285129, + "grad_norm": 0.27463045716285706, + "learning_rate": 0.00011075785978661652, + "loss": 1.0721, + "mean_token_accuracy": 0.6946658194065094, + "num_tokens": 10860122.0, + "step": 1282 + }, + { + "entropy": 1.1017592251300812, + "epoch": 0.5132513251325133, + "grad_norm": 0.28269118070602417, + "learning_rate": 0.00011064126812580965, + "loss": 1.1284, + "mean_token_accuracy": 0.6818974614143372, + "num_tokens": 10868435.0, + "step": 1283 + }, + { + "entropy": 1.0387571156024933, + "epoch": 0.5136513651365137, + "grad_norm": 0.2689679265022278, + "learning_rate": 0.00011052467538874611, + "loss": 1.0228, + "mean_token_accuracy": 0.7006640285253525, + "num_tokens": 10876528.0, + "step": 1284 + }, + { + "entropy": 1.0352961868047714, + "epoch": 0.5140514051405141, + "grad_norm": 0.27280402183532715, + "learning_rate": 0.00011040808177110649, + "loss": 1.0215, + "mean_token_accuracy": 0.709163710474968, + "num_tokens": 10884585.0, + "step": 1285 + }, + { + "entropy": 1.1173689365386963, + "epoch": 0.5144514451445145, + "grad_norm": 0.2804111838340759, + "learning_rate": 0.00011029148746857281, + "loss": 1.0957, + "mean_token_accuracy": 0.6961445212364197, + "num_tokens": 10892591.0, + "step": 1286 + }, + { + "entropy": 1.1053667068481445, + "epoch": 0.5148514851485149, + "grad_norm": 0.28149059414863586, + "learning_rate": 0.00011017489267682826, + "loss": 1.093, + "mean_token_accuracy": 0.6881065517663956, + "num_tokens": 10900626.0, + "step": 1287 + }, + { + "entropy": 0.9665865749120712, + "epoch": 0.5152515251525153, + "grad_norm": 0.26786574721336365, + "learning_rate": 0.00011005829759155686, + "loss": 0.9195, + "mean_token_accuracy": 0.729648694396019, + "num_tokens": 10909634.0, + "step": 1288 + }, + { + "entropy": 1.075927883386612, + "epoch": 0.5156515651565157, + "grad_norm": 0.2875480651855469, + "learning_rate": 0.00010994170240844315, + "loss": 1.0704, + "mean_token_accuracy": 0.6970002949237823, + "num_tokens": 10917670.0, + "step": 1289 + }, + { + "entropy": 1.0841463804244995, + "epoch": 0.5160516051605161, + "grad_norm": 0.2773706316947937, + "learning_rate": 0.00010982510732317175, + "loss": 1.0792, + "mean_token_accuracy": 0.6925313770771027, + "num_tokens": 10926167.0, + "step": 1290 + }, + { + "entropy": 1.0074465870857239, + "epoch": 0.5164516451645165, + "grad_norm": 0.2599256932735443, + "learning_rate": 0.00010970851253142724, + "loss": 1.0116, + "mean_token_accuracy": 0.7092914432287216, + "num_tokens": 10935080.0, + "step": 1291 + }, + { + "entropy": 1.0281668603420258, + "epoch": 0.5168516851685169, + "grad_norm": 0.28148072957992554, + "learning_rate": 0.00010959191822889354, + "loss": 1.0283, + "mean_token_accuracy": 0.7076976746320724, + "num_tokens": 10943078.0, + "step": 1292 + }, + { + "entropy": 0.9694829136133194, + "epoch": 0.5172517251725173, + "grad_norm": 0.2585827112197876, + "learning_rate": 0.00010947532461125394, + "loss": 0.9766, + "mean_token_accuracy": 0.7176331132650375, + "num_tokens": 10952192.0, + "step": 1293 + }, + { + "entropy": 0.9736534804105759, + "epoch": 0.5176517651765177, + "grad_norm": 0.27464941143989563, + "learning_rate": 0.00010935873187419037, + "loss": 0.9817, + "mean_token_accuracy": 0.7154617607593536, + "num_tokens": 10960724.0, + "step": 1294 + }, + { + "entropy": 1.0511492788791656, + "epoch": 0.5180518051805181, + "grad_norm": 0.28993791341781616, + "learning_rate": 0.00010924214021338349, + "loss": 1.0457, + "mean_token_accuracy": 0.69554802775383, + "num_tokens": 10968711.0, + "step": 1295 + }, + { + "entropy": 1.0582757592201233, + "epoch": 0.5184518451845185, + "grad_norm": 0.27050113677978516, + "learning_rate": 0.00010912554982451206, + "loss": 1.0472, + "mean_token_accuracy": 0.6984061449766159, + "num_tokens": 10977194.0, + "step": 1296 + }, + { + "entropy": 1.1033360809087753, + "epoch": 0.5188518851885189, + "grad_norm": 0.28481534123420715, + "learning_rate": 0.00010900896090325265, + "loss": 1.1028, + "mean_token_accuracy": 0.6904129832983017, + "num_tokens": 10985213.0, + "step": 1297 + }, + { + "entropy": 1.1085593104362488, + "epoch": 0.5192519251925193, + "grad_norm": 0.28705355525016785, + "learning_rate": 0.00010889237364527942, + "loss": 1.0829, + "mean_token_accuracy": 0.6932549774646759, + "num_tokens": 10992859.0, + "step": 1298 + }, + { + "entropy": 1.0766243934631348, + "epoch": 0.5196519651965197, + "grad_norm": 0.2605285346508026, + "learning_rate": 0.0001087757882462638, + "loss": 1.0576, + "mean_token_accuracy": 0.7053855359554291, + "num_tokens": 11001783.0, + "step": 1299 + }, + { + "entropy": 1.0741495192050934, + "epoch": 0.5200520052005201, + "grad_norm": 0.2878367006778717, + "learning_rate": 0.00010865920490187402, + "loss": 1.0623, + "mean_token_accuracy": 0.6969932615756989, + "num_tokens": 11010176.0, + "step": 1300 + }, + { + "entropy": 0.9880045801401138, + "epoch": 0.5204520452045205, + "grad_norm": 0.2559458017349243, + "learning_rate": 0.00010854262380777486, + "loss": 0.9526, + "mean_token_accuracy": 0.7208569049835205, + "num_tokens": 11018723.0, + "step": 1301 + }, + { + "entropy": 1.0562446415424347, + "epoch": 0.5208520852085209, + "grad_norm": 0.2802310883998871, + "learning_rate": 0.0001084260451596274, + "loss": 1.0222, + "mean_token_accuracy": 0.7001920342445374, + "num_tokens": 11026854.0, + "step": 1302 + }, + { + "entropy": 1.0014354437589645, + "epoch": 0.5212521252125213, + "grad_norm": 0.26478952169418335, + "learning_rate": 0.00010830946915308846, + "loss": 0.9914, + "mean_token_accuracy": 0.7087557017803192, + "num_tokens": 11035486.0, + "step": 1303 + }, + { + "entropy": 1.0268438011407852, + "epoch": 0.5216521652165217, + "grad_norm": 0.2702075242996216, + "learning_rate": 0.00010819289598381059, + "loss": 1.0282, + "mean_token_accuracy": 0.7052408754825592, + "num_tokens": 11044128.0, + "step": 1304 + }, + { + "entropy": 0.9880654811859131, + "epoch": 0.5220522052205221, + "grad_norm": 0.2600576877593994, + "learning_rate": 0.00010807632584744156, + "loss": 1.0115, + "mean_token_accuracy": 0.7128196358680725, + "num_tokens": 11053243.0, + "step": 1305 + }, + { + "entropy": 1.0364452451467514, + "epoch": 0.5224522452245225, + "grad_norm": 0.2720400393009186, + "learning_rate": 0.00010795975893962392, + "loss": 1.0348, + "mean_token_accuracy": 0.6999799758195877, + "num_tokens": 11062202.0, + "step": 1306 + }, + { + "entropy": 1.0009511858224869, + "epoch": 0.5228522852285229, + "grad_norm": 0.26903238892555237, + "learning_rate": 0.0001078431954559949, + "loss": 1.0151, + "mean_token_accuracy": 0.7087407410144806, + "num_tokens": 11071007.0, + "step": 1307 + }, + { + "entropy": 1.0160877853631973, + "epoch": 0.5232523252325233, + "grad_norm": 0.26749610900878906, + "learning_rate": 0.00010772663559218601, + "loss": 1.003, + "mean_token_accuracy": 0.7074311375617981, + "num_tokens": 11079596.0, + "step": 1308 + }, + { + "entropy": 1.0440826714038849, + "epoch": 0.5236523652365237, + "grad_norm": 0.28473010659217834, + "learning_rate": 0.00010761007954382265, + "loss": 1.0687, + "mean_token_accuracy": 0.6992804110050201, + "num_tokens": 11088330.0, + "step": 1309 + }, + { + "entropy": 0.9919937252998352, + "epoch": 0.5240524052405241, + "grad_norm": 0.27380526065826416, + "learning_rate": 0.00010749352750652377, + "loss": 1.0275, + "mean_token_accuracy": 0.7110543847084045, + "num_tokens": 11097128.0, + "step": 1310 + }, + { + "entropy": 0.9802242964506149, + "epoch": 0.5244524452445245, + "grad_norm": 0.2662859261035919, + "learning_rate": 0.00010737697967590165, + "loss": 0.9454, + "mean_token_accuracy": 0.7290502190589905, + "num_tokens": 11105631.0, + "step": 1311 + }, + { + "entropy": 1.1597804725170135, + "epoch": 0.5248524852485249, + "grad_norm": 0.29659485816955566, + "learning_rate": 0.00010726043624756146, + "loss": 1.1839, + "mean_token_accuracy": 0.6738938242197037, + "num_tokens": 11113946.0, + "step": 1312 + }, + { + "entropy": 1.070378065109253, + "epoch": 0.5252525252525253, + "grad_norm": 0.2672848403453827, + "learning_rate": 0.00010714389741710104, + "loss": 1.0617, + "mean_token_accuracy": 0.7019895911216736, + "num_tokens": 11122433.0, + "step": 1313 + }, + { + "entropy": 1.0828298181295395, + "epoch": 0.5256525652565257, + "grad_norm": 0.2810417711734772, + "learning_rate": 0.00010702736338011052, + "loss": 1.0658, + "mean_token_accuracy": 0.6972759962081909, + "num_tokens": 11130701.0, + "step": 1314 + }, + { + "entropy": 1.024320736527443, + "epoch": 0.5260526052605261, + "grad_norm": 0.26652878522872925, + "learning_rate": 0.00010691083433217186, + "loss": 0.9959, + "mean_token_accuracy": 0.7137831896543503, + "num_tokens": 11139619.0, + "step": 1315 + }, + { + "entropy": 1.1144897043704987, + "epoch": 0.5264526452645264, + "grad_norm": 0.2799079418182373, + "learning_rate": 0.0001067943104688588, + "loss": 1.1029, + "mean_token_accuracy": 0.6890621185302734, + "num_tokens": 11147939.0, + "step": 1316 + }, + { + "entropy": 1.0311930030584335, + "epoch": 0.5268526852685268, + "grad_norm": 0.2706814706325531, + "learning_rate": 0.00010667779198573627, + "loss": 1.0117, + "mean_token_accuracy": 0.7063543796539307, + "num_tokens": 11156374.0, + "step": 1317 + }, + { + "entropy": 1.0864450633525848, + "epoch": 0.5272527252725272, + "grad_norm": 0.2820771336555481, + "learning_rate": 0.00010656127907836026, + "loss": 1.0661, + "mean_token_accuracy": 0.6956162005662918, + "num_tokens": 11164693.0, + "step": 1318 + }, + { + "entropy": 1.0353470146656036, + "epoch": 0.5276527652765276, + "grad_norm": 0.27724042534828186, + "learning_rate": 0.00010644477194227734, + "loss": 1.0276, + "mean_token_accuracy": 0.7052579969167709, + "num_tokens": 11173114.0, + "step": 1319 + }, + { + "entropy": 1.0413073599338531, + "epoch": 0.528052805280528, + "grad_norm": 0.28393012285232544, + "learning_rate": 0.0001063282707730244, + "loss": 1.0337, + "mean_token_accuracy": 0.7042751908302307, + "num_tokens": 11181564.0, + "step": 1320 + }, + { + "entropy": 1.0246710628271103, + "epoch": 0.5284528452845284, + "grad_norm": 0.29631638526916504, + "learning_rate": 0.00010621177576612835, + "loss": 1.0218, + "mean_token_accuracy": 0.7020586580038071, + "num_tokens": 11189040.0, + "step": 1321 + }, + { + "entropy": 1.0028817057609558, + "epoch": 0.5288528852885288, + "grad_norm": 0.2566564381122589, + "learning_rate": 0.00010609528711710565, + "loss": 1.0002, + "mean_token_accuracy": 0.7111796587705612, + "num_tokens": 11197996.0, + "step": 1322 + }, + { + "entropy": 0.9709349572658539, + "epoch": 0.5292529252925292, + "grad_norm": 0.27473345398902893, + "learning_rate": 0.00010597880502146229, + "loss": 0.9939, + "mean_token_accuracy": 0.715512290596962, + "num_tokens": 11206382.0, + "step": 1323 + }, + { + "entropy": 0.9763269275426865, + "epoch": 0.5296529652965296, + "grad_norm": 0.2653784453868866, + "learning_rate": 0.0001058623296746931, + "loss": 0.9679, + "mean_token_accuracy": 0.7186905294656754, + "num_tokens": 11214886.0, + "step": 1324 + }, + { + "entropy": 1.0729904472827911, + "epoch": 0.53005300530053, + "grad_norm": 0.28102266788482666, + "learning_rate": 0.00010574586127228159, + "loss": 1.0775, + "mean_token_accuracy": 0.6961369812488556, + "num_tokens": 11223201.0, + "step": 1325 + }, + { + "entropy": 1.0208614021539688, + "epoch": 0.5304530453045304, + "grad_norm": 0.27487441897392273, + "learning_rate": 0.0001056294000096997, + "loss": 1.0063, + "mean_token_accuracy": 0.7091988325119019, + "num_tokens": 11231346.0, + "step": 1326 + }, + { + "entropy": 1.012270748615265, + "epoch": 0.5308530853085308, + "grad_norm": 0.2647680938243866, + "learning_rate": 0.00010551294608240727, + "loss": 0.9747, + "mean_token_accuracy": 0.7185727059841156, + "num_tokens": 11239642.0, + "step": 1327 + }, + { + "entropy": 1.033418208360672, + "epoch": 0.5312531253125312, + "grad_norm": 0.26809975504875183, + "learning_rate": 0.00010539649968585197, + "loss": 1.0179, + "mean_token_accuracy": 0.714934840798378, + "num_tokens": 11248410.0, + "step": 1328 + }, + { + "entropy": 1.065150409936905, + "epoch": 0.5316531653165316, + "grad_norm": 0.28084465861320496, + "learning_rate": 0.00010528006101546877, + "loss": 1.0387, + "mean_token_accuracy": 0.7042829692363739, + "num_tokens": 11256588.0, + "step": 1329 + }, + { + "entropy": 1.0852755010128021, + "epoch": 0.532053205320532, + "grad_norm": 0.27670884132385254, + "learning_rate": 0.0001051636302666796, + "loss": 1.0864, + "mean_token_accuracy": 0.6874248832464218, + "num_tokens": 11264481.0, + "step": 1330 + }, + { + "entropy": 1.0997845232486725, + "epoch": 0.5324532453245324, + "grad_norm": 0.2876107394695282, + "learning_rate": 0.00010504720763489315, + "loss": 1.0561, + "mean_token_accuracy": 0.6976020336151123, + "num_tokens": 11272272.0, + "step": 1331 + }, + { + "entropy": 1.032047763466835, + "epoch": 0.5328532853285328, + "grad_norm": 0.2897169888019562, + "learning_rate": 0.0001049307933155046, + "loss": 1.0592, + "mean_token_accuracy": 0.7061616331338882, + "num_tokens": 11281105.0, + "step": 1332 + }, + { + "entropy": 1.0849225223064423, + "epoch": 0.5332533253325332, + "grad_norm": 0.2792850434780121, + "learning_rate": 0.00010481438750389496, + "loss": 1.1047, + "mean_token_accuracy": 0.6897002756595612, + "num_tokens": 11289351.0, + "step": 1333 + }, + { + "entropy": 0.9816831797361374, + "epoch": 0.5336533653365336, + "grad_norm": 0.26215890049934387, + "learning_rate": 0.00010469799039543113, + "loss": 0.9764, + "mean_token_accuracy": 0.7185827791690826, + "num_tokens": 11298473.0, + "step": 1334 + }, + { + "entropy": 1.0031894445419312, + "epoch": 0.534053405340534, + "grad_norm": 0.26744338870048523, + "learning_rate": 0.00010458160218546536, + "loss": 0.9861, + "mean_token_accuracy": 0.7116531431674957, + "num_tokens": 11307265.0, + "step": 1335 + }, + { + "entropy": 1.017135813832283, + "epoch": 0.5344534453445344, + "grad_norm": 0.27627044916152954, + "learning_rate": 0.00010446522306933488, + "loss": 1.0104, + "mean_token_accuracy": 0.7080786973237991, + "num_tokens": 11315804.0, + "step": 1336 + }, + { + "entropy": 1.042254090309143, + "epoch": 0.5348534853485348, + "grad_norm": 0.2655766010284424, + "learning_rate": 0.00010434885324236181, + "loss": 1.0097, + "mean_token_accuracy": 0.7134181559085846, + "num_tokens": 11324475.0, + "step": 1337 + }, + { + "entropy": 1.0531800091266632, + "epoch": 0.5352535253525352, + "grad_norm": 0.27266669273376465, + "learning_rate": 0.00010423249289985258, + "loss": 1.0647, + "mean_token_accuracy": 0.6915110796689987, + "num_tokens": 11333071.0, + "step": 1338 + }, + { + "entropy": 0.9756899923086166, + "epoch": 0.5356535653565356, + "grad_norm": 0.2634299397468567, + "learning_rate": 0.00010411614223709767, + "loss": 0.9531, + "mean_token_accuracy": 0.7148783951997757, + "num_tokens": 11341501.0, + "step": 1339 + }, + { + "entropy": 0.9587087482213974, + "epoch": 0.536053605360536, + "grad_norm": 0.2695959806442261, + "learning_rate": 0.00010399980144937147, + "loss": 0.985, + "mean_token_accuracy": 0.7126426845788956, + "num_tokens": 11350100.0, + "step": 1340 + }, + { + "entropy": 0.9378992319107056, + "epoch": 0.5364536453645364, + "grad_norm": 0.25868654251098633, + "learning_rate": 0.00010388347073193154, + "loss": 0.9311, + "mean_token_accuracy": 0.7284160107374191, + "num_tokens": 11359045.0, + "step": 1341 + }, + { + "entropy": 1.0406609773635864, + "epoch": 0.5368536853685368, + "grad_norm": 0.2725643217563629, + "learning_rate": 0.00010376715028001887, + "loss": 1.037, + "mean_token_accuracy": 0.7020009607076645, + "num_tokens": 11367364.0, + "step": 1342 + }, + { + "entropy": 0.9961258918046951, + "epoch": 0.5372537253725372, + "grad_norm": 0.2746151387691498, + "learning_rate": 0.00010365084028885693, + "loss": 1.0186, + "mean_token_accuracy": 0.7039443403482437, + "num_tokens": 11376051.0, + "step": 1343 + }, + { + "entropy": 1.0464848279953003, + "epoch": 0.5376537653765376, + "grad_norm": 0.26878607273101807, + "learning_rate": 0.00010353454095365179, + "loss": 1.0621, + "mean_token_accuracy": 0.6985943615436554, + "num_tokens": 11384597.0, + "step": 1344 + }, + { + "entropy": 1.041772037744522, + "epoch": 0.538053805380538, + "grad_norm": 0.2740170359611511, + "learning_rate": 0.00010341825246959154, + "loss": 1.0314, + "mean_token_accuracy": 0.7068160176277161, + "num_tokens": 11392872.0, + "step": 1345 + }, + { + "entropy": 1.029935359954834, + "epoch": 0.5384538453845384, + "grad_norm": 0.262114942073822, + "learning_rate": 0.00010330197503184615, + "loss": 1.0266, + "mean_token_accuracy": 0.7030296176671982, + "num_tokens": 11401502.0, + "step": 1346 + }, + { + "entropy": 1.004430666565895, + "epoch": 0.5388538853885388, + "grad_norm": 0.26751378178596497, + "learning_rate": 0.00010318570883556705, + "loss": 0.9679, + "mean_token_accuracy": 0.711813285946846, + "num_tokens": 11409898.0, + "step": 1347 + }, + { + "entropy": 1.0005677491426468, + "epoch": 0.5392539253925392, + "grad_norm": 0.2700152099132538, + "learning_rate": 0.00010306945407588671, + "loss": 0.9796, + "mean_token_accuracy": 0.723316490650177, + "num_tokens": 11418488.0, + "step": 1348 + }, + { + "entropy": 1.0096771270036697, + "epoch": 0.5396539653965396, + "grad_norm": 0.9679727554321289, + "learning_rate": 0.00010295321094791845, + "loss": 1.0243, + "mean_token_accuracy": 0.7018518298864365, + "num_tokens": 11427051.0, + "step": 1349 + }, + { + "entropy": 1.089304506778717, + "epoch": 0.54005400540054, + "grad_norm": 0.2839471995830536, + "learning_rate": 0.0001028369796467561, + "loss": 1.088, + "mean_token_accuracy": 0.6944049447774887, + "num_tokens": 11435001.0, + "step": 1350 + }, + { + "entropy": 1.0282764285802841, + "epoch": 0.5404540454045405, + "grad_norm": 0.26269182562828064, + "learning_rate": 0.00010272076036747365, + "loss": 1.0266, + "mean_token_accuracy": 0.7040301859378815, + "num_tokens": 11443763.0, + "step": 1351 + }, + { + "entropy": 1.0533899366855621, + "epoch": 0.5408540854085409, + "grad_norm": 0.32176774740219116, + "learning_rate": 0.00010260455330512482, + "loss": 1.0087, + "mean_token_accuracy": 0.7061284184455872, + "num_tokens": 11451705.0, + "step": 1352 + }, + { + "entropy": 1.0775714367628098, + "epoch": 0.5412541254125413, + "grad_norm": 0.28142815828323364, + "learning_rate": 0.00010248835865474296, + "loss": 1.0776, + "mean_token_accuracy": 0.6951066851615906, + "num_tokens": 11459765.0, + "step": 1353 + }, + { + "entropy": 1.020856887102127, + "epoch": 0.5416541654165417, + "grad_norm": 0.2611066401004791, + "learning_rate": 0.00010237217661134046, + "loss": 1.0336, + "mean_token_accuracy": 0.7059824913740158, + "num_tokens": 11468926.0, + "step": 1354 + }, + { + "entropy": 1.1061126589775085, + "epoch": 0.5420542054205421, + "grad_norm": 0.2862061858177185, + "learning_rate": 0.00010225600736990859, + "loss": 1.1091, + "mean_token_accuracy": 0.6876944750547409, + "num_tokens": 11476954.0, + "step": 1355 + }, + { + "entropy": 1.0608557611703873, + "epoch": 0.5424542454245425, + "grad_norm": 0.2664395868778229, + "learning_rate": 0.00010213985112541726, + "loss": 1.0603, + "mean_token_accuracy": 0.7015472501516342, + "num_tokens": 11485670.0, + "step": 1356 + }, + { + "entropy": 1.0586974024772644, + "epoch": 0.5428542854285429, + "grad_norm": 0.2706284523010254, + "learning_rate": 0.00010202370807281434, + "loss": 1.0314, + "mean_token_accuracy": 0.7011439502239227, + "num_tokens": 11494032.0, + "step": 1357 + }, + { + "entropy": 1.050519347190857, + "epoch": 0.5432543254325433, + "grad_norm": 0.2765798270702362, + "learning_rate": 0.00010190757840702577, + "loss": 1.0447, + "mean_token_accuracy": 0.6995206028223038, + "num_tokens": 11502197.0, + "step": 1358 + }, + { + "entropy": 1.0513278990983963, + "epoch": 0.5436543654365437, + "grad_norm": 0.27133309841156006, + "learning_rate": 0.00010179146232295485, + "loss": 1.0583, + "mean_token_accuracy": 0.7067796587944031, + "num_tokens": 11510786.0, + "step": 1359 + }, + { + "entropy": 1.0462380051612854, + "epoch": 0.5440544054405441, + "grad_norm": 0.26026999950408936, + "learning_rate": 0.00010167536001548223, + "loss": 1.0099, + "mean_token_accuracy": 0.7059488594532013, + "num_tokens": 11519593.0, + "step": 1360 + }, + { + "entropy": 1.0743788182735443, + "epoch": 0.5444544454445445, + "grad_norm": 0.2812996804714203, + "learning_rate": 0.00010155927167946535, + "loss": 1.0645, + "mean_token_accuracy": 0.6941137313842773, + "num_tokens": 11528166.0, + "step": 1361 + }, + { + "entropy": 1.055093988776207, + "epoch": 0.5448544854485449, + "grad_norm": 0.2756078839302063, + "learning_rate": 0.00010144319750973826, + "loss": 1.0534, + "mean_token_accuracy": 0.7102519571781158, + "num_tokens": 11536684.0, + "step": 1362 + }, + { + "entropy": 1.0474064648151398, + "epoch": 0.5452545254525453, + "grad_norm": 0.2721620202064514, + "learning_rate": 0.00010132713770111113, + "loss": 1.016, + "mean_token_accuracy": 0.7052261829376221, + "num_tokens": 11544811.0, + "step": 1363 + }, + { + "entropy": 1.0915633738040924, + "epoch": 0.5456545654565457, + "grad_norm": 0.27760884165763855, + "learning_rate": 0.00010121109244837014, + "loss": 1.0599, + "mean_token_accuracy": 0.6947304308414459, + "num_tokens": 11552786.0, + "step": 1364 + }, + { + "entropy": 1.0741884410381317, + "epoch": 0.5460546054605461, + "grad_norm": 0.837235152721405, + "learning_rate": 0.00010109506194627703, + "loss": 1.0674, + "mean_token_accuracy": 0.7048592269420624, + "num_tokens": 11561648.0, + "step": 1365 + }, + { + "entropy": 0.9851405918598175, + "epoch": 0.5464546454645465, + "grad_norm": 0.2715883255004883, + "learning_rate": 0.00010097904638956872, + "loss": 0.9863, + "mean_token_accuracy": 0.7147142440080643, + "num_tokens": 11570055.0, + "step": 1366 + }, + { + "entropy": 1.0060627907514572, + "epoch": 0.5468546854685469, + "grad_norm": 0.27613934874534607, + "learning_rate": 0.00010086304597295708, + "loss": 0.9965, + "mean_token_accuracy": 0.7056114673614502, + "num_tokens": 11578684.0, + "step": 1367 + }, + { + "entropy": 0.9418110847473145, + "epoch": 0.5472547254725473, + "grad_norm": 0.2585083842277527, + "learning_rate": 0.00010074706089112858, + "loss": 0.9512, + "mean_token_accuracy": 0.7222006767988205, + "num_tokens": 11587949.0, + "step": 1368 + }, + { + "entropy": 1.0209503471851349, + "epoch": 0.5476547654765477, + "grad_norm": 0.27833181619644165, + "learning_rate": 0.0001006310913387439, + "loss": 1.0355, + "mean_token_accuracy": 0.7012454718351364, + "num_tokens": 11596206.0, + "step": 1369 + }, + { + "entropy": 0.9860797673463821, + "epoch": 0.5480548054805481, + "grad_norm": 0.26168200373649597, + "learning_rate": 0.00010051513751043778, + "loss": 0.9982, + "mean_token_accuracy": 0.7086170762777328, + "num_tokens": 11605345.0, + "step": 1370 + }, + { + "entropy": 1.041551724076271, + "epoch": 0.5484548454845485, + "grad_norm": 0.283366858959198, + "learning_rate": 0.00010039919960081843, + "loss": 1.0305, + "mean_token_accuracy": 0.7039183974266052, + "num_tokens": 11613425.0, + "step": 1371 + }, + { + "entropy": 1.062867134809494, + "epoch": 0.5488548854885489, + "grad_norm": 0.27917370200157166, + "learning_rate": 0.00010028327780446742, + "loss": 1.09, + "mean_token_accuracy": 0.6929521709680557, + "num_tokens": 11621896.0, + "step": 1372 + }, + { + "entropy": 1.0244268774986267, + "epoch": 0.5492549254925493, + "grad_norm": 0.35438400506973267, + "learning_rate": 0.00010016737231593925, + "loss": 1.0194, + "mean_token_accuracy": 0.7129424512386322, + "num_tokens": 11630351.0, + "step": 1373 + }, + { + "entropy": 1.04619961977005, + "epoch": 0.5496549654965497, + "grad_norm": 0.2773226499557495, + "learning_rate": 0.00010005148332976104, + "loss": 1.0304, + "mean_token_accuracy": 0.7031717002391815, + "num_tokens": 11638395.0, + "step": 1374 + }, + { + "entropy": 1.0367863774299622, + "epoch": 0.5500550055005501, + "grad_norm": 0.28236082196235657, + "learning_rate": 9.993561104043232e-05, + "loss": 1.0375, + "mean_token_accuracy": 0.7051004767417908, + "num_tokens": 11647143.0, + "step": 1375 + }, + { + "entropy": 1.090814620256424, + "epoch": 0.5504550455045505, + "grad_norm": 0.2837185859680176, + "learning_rate": 9.981975564242443e-05, + "loss": 1.0839, + "mean_token_accuracy": 0.6908467561006546, + "num_tokens": 11655518.0, + "step": 1376 + }, + { + "entropy": 1.0528740286827087, + "epoch": 0.5508550855085509, + "grad_norm": 0.2918621301651001, + "learning_rate": 9.970391733018048e-05, + "loss": 1.0352, + "mean_token_accuracy": 0.7053203582763672, + "num_tokens": 11663283.0, + "step": 1377 + }, + { + "entropy": 1.0195574909448624, + "epoch": 0.5512551255125513, + "grad_norm": 0.27379870414733887, + "learning_rate": 9.958809629811478e-05, + "loss": 0.989, + "mean_token_accuracy": 0.7122834622859955, + "num_tokens": 11671987.0, + "step": 1378 + }, + { + "entropy": 1.0768060982227325, + "epoch": 0.5516551655165517, + "grad_norm": 0.2727426588535309, + "learning_rate": 9.947229274061285e-05, + "loss": 1.059, + "mean_token_accuracy": 0.7000274807214737, + "num_tokens": 11680286.0, + "step": 1379 + }, + { + "entropy": 1.0743230283260345, + "epoch": 0.5520552055205521, + "grad_norm": 0.28503862023353577, + "learning_rate": 9.935650685203069e-05, + "loss": 1.0638, + "mean_token_accuracy": 0.700126513838768, + "num_tokens": 11688778.0, + "step": 1380 + }, + { + "entropy": 1.069139450788498, + "epoch": 0.5524552455245525, + "grad_norm": 0.26232168078422546, + "learning_rate": 9.924073882669471e-05, + "loss": 1.0481, + "mean_token_accuracy": 0.702171802520752, + "num_tokens": 11698135.0, + "step": 1381 + }, + { + "entropy": 1.0853570103645325, + "epoch": 0.5528552855285529, + "grad_norm": 0.30101171135902405, + "learning_rate": 9.912498885890137e-05, + "loss": 1.1139, + "mean_token_accuracy": 0.6967767179012299, + "num_tokens": 11706337.0, + "step": 1382 + }, + { + "entropy": 1.0328886955976486, + "epoch": 0.5532553255325533, + "grad_norm": 0.27778464555740356, + "learning_rate": 9.900925714291671e-05, + "loss": 1.0045, + "mean_token_accuracy": 0.7103684544563293, + "num_tokens": 11714413.0, + "step": 1383 + }, + { + "entropy": 1.0112949907779694, + "epoch": 0.5536553655365537, + "grad_norm": 0.2873450517654419, + "learning_rate": 9.889354387297636e-05, + "loss": 0.9922, + "mean_token_accuracy": 0.7146047055721283, + "num_tokens": 11722121.0, + "step": 1384 + }, + { + "entropy": 1.074519470334053, + "epoch": 0.5540554055405541, + "grad_norm": 0.28095701336860657, + "learning_rate": 9.877784924328476e-05, + "loss": 1.0812, + "mean_token_accuracy": 0.6868065893650055, + "num_tokens": 11730383.0, + "step": 1385 + }, + { + "entropy": 1.022032842040062, + "epoch": 0.5544554455445545, + "grad_norm": 0.29711857438087463, + "learning_rate": 9.86621734480152e-05, + "loss": 1.0182, + "mean_token_accuracy": 0.7119694799184799, + "num_tokens": 11738939.0, + "step": 1386 + }, + { + "entropy": 1.0074398219585419, + "epoch": 0.5548554855485549, + "grad_norm": 0.25953930616378784, + "learning_rate": 9.85465166813093e-05, + "loss": 1.0292, + "mean_token_accuracy": 0.707723006606102, + "num_tokens": 11748108.0, + "step": 1387 + }, + { + "entropy": 0.9986685365438461, + "epoch": 0.5552555255525553, + "grad_norm": 0.2853337228298187, + "learning_rate": 9.843087913727671e-05, + "loss": 1.018, + "mean_token_accuracy": 0.7085285782814026, + "num_tokens": 11756454.0, + "step": 1388 + }, + { + "entropy": 1.0449780225753784, + "epoch": 0.5556555655565557, + "grad_norm": 0.3201696574687958, + "learning_rate": 9.8315261009995e-05, + "loss": 1.0503, + "mean_token_accuracy": 0.7019772082567215, + "num_tokens": 11764520.0, + "step": 1389 + }, + { + "entropy": 1.0016443133354187, + "epoch": 0.5560556055605561, + "grad_norm": 0.2722102105617523, + "learning_rate": 9.819966249350892e-05, + "loss": 0.9785, + "mean_token_accuracy": 0.7104931026697159, + "num_tokens": 11773234.0, + "step": 1390 + }, + { + "entropy": 1.0610063672065735, + "epoch": 0.5564556455645564, + "grad_norm": 0.27778545022010803, + "learning_rate": 9.808408378183045e-05, + "loss": 1.0518, + "mean_token_accuracy": 0.7019657343626022, + "num_tokens": 11781823.0, + "step": 1391 + }, + { + "entropy": 1.0407916605472565, + "epoch": 0.5568556855685568, + "grad_norm": 0.2848045825958252, + "learning_rate": 9.796852506893823e-05, + "loss": 1.0163, + "mean_token_accuracy": 0.7181297093629837, + "num_tokens": 11790243.0, + "step": 1392 + }, + { + "entropy": 1.031977429986, + "epoch": 0.5572557255725572, + "grad_norm": 0.2774193286895752, + "learning_rate": 9.785298654877751e-05, + "loss": 1.0411, + "mean_token_accuracy": 0.7074624449014664, + "num_tokens": 11799338.0, + "step": 1393 + }, + { + "entropy": 1.0174032896757126, + "epoch": 0.5576557655765576, + "grad_norm": 0.26397672295570374, + "learning_rate": 9.773746841525946e-05, + "loss": 1.0232, + "mean_token_accuracy": 0.7066680341959, + "num_tokens": 11808204.0, + "step": 1394 + }, + { + "entropy": 1.0628741383552551, + "epoch": 0.558055805580558, + "grad_norm": 0.271405965089798, + "learning_rate": 9.762197086226114e-05, + "loss": 1.0445, + "mean_token_accuracy": 0.6998479664325714, + "num_tokens": 11816702.0, + "step": 1395 + }, + { + "entropy": 0.9912206828594208, + "epoch": 0.5584558455845584, + "grad_norm": 0.2695449888706207, + "learning_rate": 9.750649408362502e-05, + "loss": 0.9858, + "mean_token_accuracy": 0.7102565914392471, + "num_tokens": 11824938.0, + "step": 1396 + }, + { + "entropy": 1.085845246911049, + "epoch": 0.5588558855885588, + "grad_norm": 0.28878486156463623, + "learning_rate": 9.739103827315872e-05, + "loss": 1.0856, + "mean_token_accuracy": 0.6961684226989746, + "num_tokens": 11833055.0, + "step": 1397 + }, + { + "entropy": 1.0149171352386475, + "epoch": 0.5592559255925592, + "grad_norm": 0.27814212441444397, + "learning_rate": 9.72756036246347e-05, + "loss": 0.9873, + "mean_token_accuracy": 0.7214252650737762, + "num_tokens": 11841358.0, + "step": 1398 + }, + { + "entropy": 0.978545218706131, + "epoch": 0.5596559655965596, + "grad_norm": 0.2617882788181305, + "learning_rate": 9.716019033178986e-05, + "loss": 0.9557, + "mean_token_accuracy": 0.7193311303853989, + "num_tokens": 11849864.0, + "step": 1399 + }, + { + "entropy": 1.0388765186071396, + "epoch": 0.56005600560056, + "grad_norm": 0.27917811274528503, + "learning_rate": 9.70447985883253e-05, + "loss": 1.0561, + "mean_token_accuracy": 0.7081725299358368, + "num_tokens": 11857832.0, + "step": 1400 + }, + { + "entropy": 1.0027438551187515, + "epoch": 0.5604560456045604, + "grad_norm": 0.2660987675189972, + "learning_rate": 9.692942858790591e-05, + "loss": 0.9864, + "mean_token_accuracy": 0.7185069024562836, + "num_tokens": 11866681.0, + "step": 1401 + }, + { + "entropy": 1.0447413921356201, + "epoch": 0.5608560856085608, + "grad_norm": 0.29531651735305786, + "learning_rate": 9.681408052416005e-05, + "loss": 1.0843, + "mean_token_accuracy": 0.6928424537181854, + "num_tokens": 11875227.0, + "step": 1402 + }, + { + "entropy": 1.0347902327775955, + "epoch": 0.5612561256125612, + "grad_norm": 0.274422287940979, + "learning_rate": 9.669875459067941e-05, + "loss": 1.0359, + "mean_token_accuracy": 0.708528995513916, + "num_tokens": 11883626.0, + "step": 1403 + }, + { + "entropy": 1.0262574702501297, + "epoch": 0.5616561656165616, + "grad_norm": 0.2685241401195526, + "learning_rate": 9.658345098101842e-05, + "loss": 0.9927, + "mean_token_accuracy": 0.7188732028007507, + "num_tokens": 11892093.0, + "step": 1404 + }, + { + "entropy": 0.9697751253843307, + "epoch": 0.562056205620562, + "grad_norm": 0.25515463948249817, + "learning_rate": 9.646816988869405e-05, + "loss": 0.9417, + "mean_token_accuracy": 0.722034215927124, + "num_tokens": 11900926.0, + "step": 1405 + }, + { + "entropy": 1.0813296437263489, + "epoch": 0.5624562456245624, + "grad_norm": 0.2833826243877411, + "learning_rate": 9.635291150718549e-05, + "loss": 1.0443, + "mean_token_accuracy": 0.7046888172626495, + "num_tokens": 11909087.0, + "step": 1406 + }, + { + "entropy": 1.034473478794098, + "epoch": 0.5628562856285628, + "grad_norm": 0.2755141258239746, + "learning_rate": 9.623767602993388e-05, + "loss": 0.9953, + "mean_token_accuracy": 0.7113285213708878, + "num_tokens": 11917725.0, + "step": 1407 + }, + { + "entropy": 1.0415022820234299, + "epoch": 0.5632563256325632, + "grad_norm": 0.2981629967689514, + "learning_rate": 9.612246365034179e-05, + "loss": 1.0263, + "mean_token_accuracy": 0.706835612654686, + "num_tokens": 11925619.0, + "step": 1408 + }, + { + "entropy": 0.9672080725431442, + "epoch": 0.5636563656365636, + "grad_norm": 0.2576509118080139, + "learning_rate": 9.600727456177317e-05, + "loss": 0.9674, + "mean_token_accuracy": 0.7194074839353561, + "num_tokens": 11934428.0, + "step": 1409 + }, + { + "entropy": 1.0478382408618927, + "epoch": 0.564056405640564, + "grad_norm": 0.2701607942581177, + "learning_rate": 9.589210895755276e-05, + "loss": 1.0464, + "mean_token_accuracy": 0.705416277050972, + "num_tokens": 11943161.0, + "step": 1410 + }, + { + "entropy": 1.0357476025819778, + "epoch": 0.5644564456445644, + "grad_norm": 0.28543442487716675, + "learning_rate": 9.577696703096591e-05, + "loss": 1.0479, + "mean_token_accuracy": 0.700953334569931, + "num_tokens": 11951642.0, + "step": 1411 + }, + { + "entropy": 1.0380288660526276, + "epoch": 0.5648564856485648, + "grad_norm": 0.2840733528137207, + "learning_rate": 9.566184897525832e-05, + "loss": 1.0468, + "mean_token_accuracy": 0.6977168768644333, + "num_tokens": 11959731.0, + "step": 1412 + }, + { + "entropy": 0.9919664859771729, + "epoch": 0.5652565256525652, + "grad_norm": 0.2730622887611389, + "learning_rate": 9.554675498363553e-05, + "loss": 1.0138, + "mean_token_accuracy": 0.7065150737762451, + "num_tokens": 11968244.0, + "step": 1413 + }, + { + "entropy": 1.027969315648079, + "epoch": 0.5656565656565656, + "grad_norm": 0.3118537664413452, + "learning_rate": 9.543168524926272e-05, + "loss": 1.0657, + "mean_token_accuracy": 0.6998304575681686, + "num_tokens": 11976500.0, + "step": 1414 + }, + { + "entropy": 0.974114865064621, + "epoch": 0.566056605660566, + "grad_norm": 0.2740616202354431, + "learning_rate": 9.531663996526437e-05, + "loss": 0.9782, + "mean_token_accuracy": 0.7178534716367722, + "num_tokens": 11985707.0, + "step": 1415 + }, + { + "entropy": 1.0297018885612488, + "epoch": 0.5664566456645664, + "grad_norm": 0.28345346450805664, + "learning_rate": 9.520161932472387e-05, + "loss": 1.0354, + "mean_token_accuracy": 0.7030294239521027, + "num_tokens": 11993730.0, + "step": 1416 + }, + { + "entropy": 0.9867464303970337, + "epoch": 0.5668566856685668, + "grad_norm": 0.26605159044265747, + "learning_rate": 9.508662352068336e-05, + "loss": 0.9617, + "mean_token_accuracy": 0.7241377532482147, + "num_tokens": 12002330.0, + "step": 1417 + }, + { + "entropy": 0.9821142554283142, + "epoch": 0.5672567256725672, + "grad_norm": 0.2651683986186981, + "learning_rate": 9.497165274614321e-05, + "loss": 0.9684, + "mean_token_accuracy": 0.7132197320461273, + "num_tokens": 12011321.0, + "step": 1418 + }, + { + "entropy": 1.057016059756279, + "epoch": 0.5676567656765676, + "grad_norm": 0.2834841310977936, + "learning_rate": 9.485670719406182e-05, + "loss": 1.043, + "mean_token_accuracy": 0.7110841572284698, + "num_tokens": 12019069.0, + "step": 1419 + }, + { + "entropy": 1.0375907570123672, + "epoch": 0.568056805680568, + "grad_norm": 0.274261474609375, + "learning_rate": 9.47417870573552e-05, + "loss": 1.012, + "mean_token_accuracy": 0.6996363401412964, + "num_tokens": 12027468.0, + "step": 1420 + }, + { + "entropy": 0.9975807368755341, + "epoch": 0.5684568456845684, + "grad_norm": 0.2589299976825714, + "learning_rate": 9.462689252889676e-05, + "loss": 0.9705, + "mean_token_accuracy": 0.7124952524900436, + "num_tokens": 12035913.0, + "step": 1421 + }, + { + "entropy": 1.0123953074216843, + "epoch": 0.5688568856885688, + "grad_norm": 0.27150651812553406, + "learning_rate": 9.4512023801517e-05, + "loss": 0.9997, + "mean_token_accuracy": 0.7083428651094437, + "num_tokens": 12044329.0, + "step": 1422 + }, + { + "entropy": 1.014956459403038, + "epoch": 0.5692569256925692, + "grad_norm": 0.26314038038253784, + "learning_rate": 9.439718106800293e-05, + "loss": 1.0147, + "mean_token_accuracy": 0.7100427746772766, + "num_tokens": 12053065.0, + "step": 1423 + }, + { + "entropy": 0.9784752130508423, + "epoch": 0.5696569656965697, + "grad_norm": 0.28080591559410095, + "learning_rate": 9.428236452109811e-05, + "loss": 0.9811, + "mean_token_accuracy": 0.7160182595252991, + "num_tokens": 12061243.0, + "step": 1424 + }, + { + "entropy": 1.018771931529045, + "epoch": 0.57005700570057, + "grad_norm": 0.2826145589351654, + "learning_rate": 9.416757435350198e-05, + "loss": 1.0439, + "mean_token_accuracy": 0.7015163451433182, + "num_tokens": 12069347.0, + "step": 1425 + }, + { + "entropy": 0.951250284910202, + "epoch": 0.5704570457045705, + "grad_norm": 0.26394495368003845, + "learning_rate": 9.405281075786995e-05, + "loss": 0.9595, + "mean_token_accuracy": 0.7201048731803894, + "num_tokens": 12077883.0, + "step": 1426 + }, + { + "entropy": 0.9762776345014572, + "epoch": 0.5708570857085709, + "grad_norm": 0.272976815700531, + "learning_rate": 9.393807392681262e-05, + "loss": 0.9858, + "mean_token_accuracy": 0.7145939618349075, + "num_tokens": 12086303.0, + "step": 1427 + }, + { + "entropy": 1.045241117477417, + "epoch": 0.5712571257125713, + "grad_norm": 0.27602118253707886, + "learning_rate": 9.382336405289575e-05, + "loss": 1.0496, + "mean_token_accuracy": 0.7051439881324768, + "num_tokens": 12094352.0, + "step": 1428 + }, + { + "entropy": 1.068339616060257, + "epoch": 0.5716571657165717, + "grad_norm": 0.28006798028945923, + "learning_rate": 9.370868132863983e-05, + "loss": 1.069, + "mean_token_accuracy": 0.6973517686128616, + "num_tokens": 12102866.0, + "step": 1429 + }, + { + "entropy": 1.0083343386650085, + "epoch": 0.5720572057205721, + "grad_norm": 0.2719053626060486, + "learning_rate": 9.35940259465198e-05, + "loss": 0.9864, + "mean_token_accuracy": 0.7176250368356705, + "num_tokens": 12111492.0, + "step": 1430 + }, + { + "entropy": 1.0729386806488037, + "epoch": 0.5724572457245725, + "grad_norm": 0.28686070442199707, + "learning_rate": 9.347939809896475e-05, + "loss": 1.0771, + "mean_token_accuracy": 0.6966959685087204, + "num_tokens": 12119860.0, + "step": 1431 + }, + { + "entropy": 1.028654858469963, + "epoch": 0.5728572857285729, + "grad_norm": 0.267328143119812, + "learning_rate": 9.336479797835751e-05, + "loss": 1.0126, + "mean_token_accuracy": 0.7092250138521194, + "num_tokens": 12128211.0, + "step": 1432 + }, + { + "entropy": 1.0675599575042725, + "epoch": 0.5732573257325733, + "grad_norm": 0.2720050513744354, + "learning_rate": 9.32502257770344e-05, + "loss": 1.059, + "mean_token_accuracy": 0.6944447904825211, + "num_tokens": 12136496.0, + "step": 1433 + }, + { + "entropy": 1.0634731203317642, + "epoch": 0.5736573657365737, + "grad_norm": 0.27224984765052795, + "learning_rate": 9.313568168728477e-05, + "loss": 1.0343, + "mean_token_accuracy": 0.6975902765989304, + "num_tokens": 12144990.0, + "step": 1434 + }, + { + "entropy": 1.0794326663017273, + "epoch": 0.5740574057405741, + "grad_norm": 0.2870640456676483, + "learning_rate": 9.302116590135104e-05, + "loss": 1.0537, + "mean_token_accuracy": 0.7010065019130707, + "num_tokens": 12152721.0, + "step": 1435 + }, + { + "entropy": 0.9973935335874557, + "epoch": 0.5744574457445745, + "grad_norm": 0.2561502456665039, + "learning_rate": 9.290667861142788e-05, + "loss": 0.974, + "mean_token_accuracy": 0.7223565578460693, + "num_tokens": 12161617.0, + "step": 1436 + }, + { + "entropy": 1.0682669281959534, + "epoch": 0.5748574857485749, + "grad_norm": 0.2843833863735199, + "learning_rate": 9.27922200096623e-05, + "loss": 1.0432, + "mean_token_accuracy": 0.701179713010788, + "num_tokens": 12169335.0, + "step": 1437 + }, + { + "entropy": 1.023609682917595, + "epoch": 0.5752575257525753, + "grad_norm": 0.26150861382484436, + "learning_rate": 9.2677790288153e-05, + "loss": 0.9991, + "mean_token_accuracy": 0.7105388194322586, + "num_tokens": 12178101.0, + "step": 1438 + }, + { + "entropy": 1.0566135942935944, + "epoch": 0.5756575657565757, + "grad_norm": 0.27409628033638, + "learning_rate": 9.256338963895035e-05, + "loss": 1.0372, + "mean_token_accuracy": 0.7043111026287079, + "num_tokens": 12186502.0, + "step": 1439 + }, + { + "entropy": 1.0071329474449158, + "epoch": 0.5760576057605761, + "grad_norm": 0.2723731994628906, + "learning_rate": 9.24490182540559e-05, + "loss": 1.0062, + "mean_token_accuracy": 0.7074045389890671, + "num_tokens": 12195376.0, + "step": 1440 + }, + { + "entropy": 1.0620497465133667, + "epoch": 0.5764576457645765, + "grad_norm": 0.28541213274002075, + "learning_rate": 9.233467632542203e-05, + "loss": 1.0632, + "mean_token_accuracy": 0.6984334588050842, + "num_tokens": 12203614.0, + "step": 1441 + }, + { + "entropy": 1.0232061743736267, + "epoch": 0.5768576857685769, + "grad_norm": 0.299996942281723, + "learning_rate": 9.222036404495176e-05, + "loss": 1.0225, + "mean_token_accuracy": 0.7038890570402145, + "num_tokens": 12212355.0, + "step": 1442 + }, + { + "entropy": 1.0834916532039642, + "epoch": 0.5772577257725773, + "grad_norm": 0.29710832238197327, + "learning_rate": 9.210608160449825e-05, + "loss": 1.0882, + "mean_token_accuracy": 0.6926500052213669, + "num_tokens": 12220876.0, + "step": 1443 + }, + { + "entropy": 0.9862376004457474, + "epoch": 0.5776577657765777, + "grad_norm": 0.27031514048576355, + "learning_rate": 9.199182919586466e-05, + "loss": 1.0187, + "mean_token_accuracy": 0.7079644799232483, + "num_tokens": 12229523.0, + "step": 1444 + }, + { + "entropy": 1.022323340177536, + "epoch": 0.5780578057805781, + "grad_norm": 0.2753877639770508, + "learning_rate": 9.187760701080375e-05, + "loss": 1.0466, + "mean_token_accuracy": 0.6970907300710678, + "num_tokens": 12238070.0, + "step": 1445 + }, + { + "entropy": 1.017315074801445, + "epoch": 0.5784578457845785, + "grad_norm": 0.2782132625579834, + "learning_rate": 9.17634152410175e-05, + "loss": 1.0013, + "mean_token_accuracy": 0.7108462601900101, + "num_tokens": 12246344.0, + "step": 1446 + }, + { + "entropy": 1.0499614477157593, + "epoch": 0.5788578857885789, + "grad_norm": 0.27874356508255005, + "learning_rate": 9.164925407815687e-05, + "loss": 1.0627, + "mean_token_accuracy": 0.6959181725978851, + "num_tokens": 12254441.0, + "step": 1447 + }, + { + "entropy": 1.016101136803627, + "epoch": 0.5792579257925793, + "grad_norm": 0.2705437242984772, + "learning_rate": 9.153512371382145e-05, + "loss": 1.0061, + "mean_token_accuracy": 0.7082370519638062, + "num_tokens": 12263183.0, + "step": 1448 + }, + { + "entropy": 1.0472546815872192, + "epoch": 0.5796579657965797, + "grad_norm": 0.26952528953552246, + "learning_rate": 9.14210243395592e-05, + "loss": 1.0547, + "mean_token_accuracy": 0.702600434422493, + "num_tokens": 12271464.0, + "step": 1449 + }, + { + "entropy": 1.0200649946928024, + "epoch": 0.5800580058005801, + "grad_norm": 0.2601044774055481, + "learning_rate": 9.130695614686593e-05, + "loss": 1.0235, + "mean_token_accuracy": 0.7050524055957794, + "num_tokens": 12280351.0, + "step": 1450 + }, + { + "entropy": 1.0527155250310898, + "epoch": 0.5804580458045805, + "grad_norm": 0.2646445631980896, + "learning_rate": 9.119291932718525e-05, + "loss": 1.0119, + "mean_token_accuracy": 0.7147404998540878, + "num_tokens": 12288860.0, + "step": 1451 + }, + { + "entropy": 1.0286945551633835, + "epoch": 0.5808580858085809, + "grad_norm": 0.2639307379722595, + "learning_rate": 9.107891407190807e-05, + "loss": 1.0031, + "mean_token_accuracy": 0.7100252509117126, + "num_tokens": 12297417.0, + "step": 1452 + }, + { + "entropy": 1.0290382355451584, + "epoch": 0.5812581258125813, + "grad_norm": 0.28312456607818604, + "learning_rate": 9.096494057237223e-05, + "loss": 1.0279, + "mean_token_accuracy": 0.7009018808603287, + "num_tokens": 12305741.0, + "step": 1453 + }, + { + "entropy": 1.1024768650531769, + "epoch": 0.5816581658165817, + "grad_norm": 0.2756511867046356, + "learning_rate": 9.085099901986252e-05, + "loss": 1.0893, + "mean_token_accuracy": 0.6902471333742142, + "num_tokens": 12314136.0, + "step": 1454 + }, + { + "entropy": 1.0359731316566467, + "epoch": 0.5820582058205821, + "grad_norm": 0.2630627155303955, + "learning_rate": 9.073708960560982e-05, + "loss": 1.0205, + "mean_token_accuracy": 0.7086706906557083, + "num_tokens": 12323001.0, + "step": 1455 + }, + { + "entropy": 1.0457979142665863, + "epoch": 0.5824582458245825, + "grad_norm": 0.27305370569229126, + "learning_rate": 9.062321252079126e-05, + "loss": 1.0109, + "mean_token_accuracy": 0.7109446823596954, + "num_tokens": 12330758.0, + "step": 1456 + }, + { + "entropy": 1.0786918103694916, + "epoch": 0.5828582858285829, + "grad_norm": 0.2732437252998352, + "learning_rate": 9.050936795652969e-05, + "loss": 1.0814, + "mean_token_accuracy": 0.6952021420001984, + "num_tokens": 12339068.0, + "step": 1457 + }, + { + "entropy": 1.0849244892597198, + "epoch": 0.5832583258325833, + "grad_norm": 0.2845315635204315, + "learning_rate": 9.039555610389326e-05, + "loss": 1.0735, + "mean_token_accuracy": 0.6946322470903397, + "num_tokens": 12347267.0, + "step": 1458 + }, + { + "entropy": 0.9938002228736877, + "epoch": 0.5836583658365837, + "grad_norm": 0.2742798328399658, + "learning_rate": 9.028177715389538e-05, + "loss": 0.9809, + "mean_token_accuracy": 0.7214517593383789, + "num_tokens": 12355895.0, + "step": 1459 + }, + { + "entropy": 0.9613644331693649, + "epoch": 0.5840584058405841, + "grad_norm": 0.268015593290329, + "learning_rate": 9.016803129749413e-05, + "loss": 0.9906, + "mean_token_accuracy": 0.7143443375825882, + "num_tokens": 12364641.0, + "step": 1460 + }, + { + "entropy": 1.0632926225662231, + "epoch": 0.5844584458445845, + "grad_norm": 0.2709764540195465, + "learning_rate": 9.005431872559213e-05, + "loss": 1.0558, + "mean_token_accuracy": 0.6997475773096085, + "num_tokens": 12372935.0, + "step": 1461 + }, + { + "entropy": 1.0536611080169678, + "epoch": 0.5848584858485849, + "grad_norm": 0.27664220333099365, + "learning_rate": 8.994063962903605e-05, + "loss": 1.0373, + "mean_token_accuracy": 0.7011358588933945, + "num_tokens": 12380977.0, + "step": 1462 + }, + { + "entropy": 1.018237143754959, + "epoch": 0.5852585258525853, + "grad_norm": 0.26905640959739685, + "learning_rate": 8.98269941986164e-05, + "loss": 1.0191, + "mean_token_accuracy": 0.706588625907898, + "num_tokens": 12389637.0, + "step": 1463 + }, + { + "entropy": 1.1027434766292572, + "epoch": 0.5856585658565857, + "grad_norm": 0.2918579876422882, + "learning_rate": 8.971338262506728e-05, + "loss": 1.1046, + "mean_token_accuracy": 0.6864562034606934, + "num_tokens": 12397560.0, + "step": 1464 + }, + { + "entropy": 1.033021241426468, + "epoch": 0.5860586058605861, + "grad_norm": 0.2683352530002594, + "learning_rate": 8.95998050990658e-05, + "loss": 1.015, + "mean_token_accuracy": 0.7026935517787933, + "num_tokens": 12405995.0, + "step": 1465 + }, + { + "entropy": 1.0776333510875702, + "epoch": 0.5864586458645865, + "grad_norm": 0.27302148938179016, + "learning_rate": 8.948626181123212e-05, + "loss": 1.0895, + "mean_token_accuracy": 0.7019542455673218, + "num_tokens": 12414269.0, + "step": 1466 + }, + { + "entropy": 0.9995755553245544, + "epoch": 0.5868586858685868, + "grad_norm": 0.26064565777778625, + "learning_rate": 8.937275295212874e-05, + "loss": 0.9818, + "mean_token_accuracy": 0.7125249952077866, + "num_tokens": 12423052.0, + "step": 1467 + }, + { + "entropy": 1.0405015349388123, + "epoch": 0.5872587258725872, + "grad_norm": 0.2737524211406708, + "learning_rate": 8.925927871226054e-05, + "loss": 1.0347, + "mean_token_accuracy": 0.7030630558729172, + "num_tokens": 12431298.0, + "step": 1468 + }, + { + "entropy": 1.0388181060552597, + "epoch": 0.5876587658765876, + "grad_norm": 0.2806417644023895, + "learning_rate": 8.91458392820742e-05, + "loss": 1.0506, + "mean_token_accuracy": 0.6979903727769852, + "num_tokens": 12439714.0, + "step": 1469 + }, + { + "entropy": 1.0125087350606918, + "epoch": 0.588058805880588, + "grad_norm": 0.2720699608325958, + "learning_rate": 8.903243485195807e-05, + "loss": 1.012, + "mean_token_accuracy": 0.7127858400344849, + "num_tokens": 12447754.0, + "step": 1470 + }, + { + "entropy": 0.9408924430608749, + "epoch": 0.5884588458845884, + "grad_norm": 0.25853264331817627, + "learning_rate": 8.891906561224161e-05, + "loss": 0.9322, + "mean_token_accuracy": 0.7274314314126968, + "num_tokens": 12456662.0, + "step": 1471 + }, + { + "entropy": 1.0014375895261765, + "epoch": 0.5888588858885888, + "grad_norm": 0.25125372409820557, + "learning_rate": 8.880573175319535e-05, + "loss": 0.999, + "mean_token_accuracy": 0.7154225707054138, + "num_tokens": 12465587.0, + "step": 1472 + }, + { + "entropy": 1.0588608533143997, + "epoch": 0.5892589258925892, + "grad_norm": 0.2735210657119751, + "learning_rate": 8.869243346503044e-05, + "loss": 1.0749, + "mean_token_accuracy": 0.6971971541643143, + "num_tokens": 12473766.0, + "step": 1473 + }, + { + "entropy": 1.0112529695034027, + "epoch": 0.5896589658965896, + "grad_norm": 0.2681281566619873, + "learning_rate": 8.857917093789823e-05, + "loss": 1.0381, + "mean_token_accuracy": 0.7023854553699493, + "num_tokens": 12482669.0, + "step": 1474 + }, + { + "entropy": 1.0297947525978088, + "epoch": 0.59005900590059, + "grad_norm": 0.2669883668422699, + "learning_rate": 8.846594436189015e-05, + "loss": 1.0175, + "mean_token_accuracy": 0.706328734755516, + "num_tokens": 12490956.0, + "step": 1475 + }, + { + "entropy": 1.0480745136737823, + "epoch": 0.5904590459045904, + "grad_norm": 0.28414660692214966, + "learning_rate": 8.835275392703721e-05, + "loss": 1.0322, + "mean_token_accuracy": 0.7039245367050171, + "num_tokens": 12498814.0, + "step": 1476 + }, + { + "entropy": 1.0192039608955383, + "epoch": 0.5908590859085908, + "grad_norm": 0.2634120583534241, + "learning_rate": 8.82395998233098e-05, + "loss": 0.9925, + "mean_token_accuracy": 0.7203864604234695, + "num_tokens": 12507501.0, + "step": 1477 + }, + { + "entropy": 1.059102863073349, + "epoch": 0.5912591259125912, + "grad_norm": 0.29845336079597473, + "learning_rate": 8.81264822406174e-05, + "loss": 1.0238, + "mean_token_accuracy": 0.7050190567970276, + "num_tokens": 12516278.0, + "step": 1478 + }, + { + "entropy": 1.061889424920082, + "epoch": 0.5916591659165916, + "grad_norm": 0.2791065573692322, + "learning_rate": 8.801340136880812e-05, + "loss": 1.046, + "mean_token_accuracy": 0.7031095772981644, + "num_tokens": 12524364.0, + "step": 1479 + }, + { + "entropy": 1.0364331305027008, + "epoch": 0.592059205920592, + "grad_norm": 0.27705058455467224, + "learning_rate": 8.790035739766842e-05, + "loss": 1.0256, + "mean_token_accuracy": 0.7009413093328476, + "num_tokens": 12532418.0, + "step": 1480 + }, + { + "entropy": 1.062977910041809, + "epoch": 0.5924592459245924, + "grad_norm": 0.27835676074028015, + "learning_rate": 8.77873505169229e-05, + "loss": 1.0643, + "mean_token_accuracy": 0.6955220699310303, + "num_tokens": 12540931.0, + "step": 1481 + }, + { + "entropy": 1.0140399634838104, + "epoch": 0.5928592859285928, + "grad_norm": 0.27127519249916077, + "learning_rate": 8.7674380916234e-05, + "loss": 1.0061, + "mean_token_accuracy": 0.708841398358345, + "num_tokens": 12549027.0, + "step": 1482 + }, + { + "entropy": 1.0879786908626556, + "epoch": 0.5932593259325932, + "grad_norm": 0.2876133322715759, + "learning_rate": 8.756144878520131e-05, + "loss": 1.1027, + "mean_token_accuracy": 0.696457713842392, + "num_tokens": 12557333.0, + "step": 1483 + }, + { + "entropy": 0.9865775853395462, + "epoch": 0.5936593659365936, + "grad_norm": 0.2730310559272766, + "learning_rate": 8.744855431336185e-05, + "loss": 1.0229, + "mean_token_accuracy": 0.7061936110258102, + "num_tokens": 12565824.0, + "step": 1484 + }, + { + "entropy": 1.0515245646238327, + "epoch": 0.594059405940594, + "grad_norm": 0.3587878346443176, + "learning_rate": 8.733569769018921e-05, + "loss": 1.043, + "mean_token_accuracy": 0.7014940679073334, + "num_tokens": 12574251.0, + "step": 1485 + }, + { + "entropy": 1.0622943490743637, + "epoch": 0.5944594459445944, + "grad_norm": 0.28619709610939026, + "learning_rate": 8.722287910509354e-05, + "loss": 1.0568, + "mean_token_accuracy": 0.7075288891792297, + "num_tokens": 12582665.0, + "step": 1486 + }, + { + "entropy": 0.9456561505794525, + "epoch": 0.5948594859485948, + "grad_norm": 0.2537696957588196, + "learning_rate": 8.71100987474212e-05, + "loss": 0.9159, + "mean_token_accuracy": 0.7290042191743851, + "num_tokens": 12591691.0, + "step": 1487 + }, + { + "entropy": 1.0309175848960876, + "epoch": 0.5952595259525952, + "grad_norm": 0.27209457755088806, + "learning_rate": 8.699735680645433e-05, + "loss": 1.0201, + "mean_token_accuracy": 0.6985432356595993, + "num_tokens": 12600291.0, + "step": 1488 + }, + { + "entropy": 1.0240623950958252, + "epoch": 0.5956595659565956, + "grad_norm": 0.2610955536365509, + "learning_rate": 8.68846534714106e-05, + "loss": 1.0045, + "mean_token_accuracy": 0.7094159424304962, + "num_tokens": 12609167.0, + "step": 1489 + }, + { + "entropy": 1.0332093834877014, + "epoch": 0.596059605960596, + "grad_norm": 0.27262207865715027, + "learning_rate": 8.67719889314429e-05, + "loss": 1.0105, + "mean_token_accuracy": 0.7129793018102646, + "num_tokens": 12617540.0, + "step": 1490 + }, + { + "entropy": 1.058265596628189, + "epoch": 0.5964596459645964, + "grad_norm": 0.27217963337898254, + "learning_rate": 8.665936337563896e-05, + "loss": 1.0502, + "mean_token_accuracy": 0.6983201503753662, + "num_tokens": 12626182.0, + "step": 1491 + }, + { + "entropy": 0.9456982165575027, + "epoch": 0.5968596859685968, + "grad_norm": 0.2769935131072998, + "learning_rate": 8.654677699302116e-05, + "loss": 0.9529, + "mean_token_accuracy": 0.7268555164337158, + "num_tokens": 12634170.0, + "step": 1492 + }, + { + "entropy": 0.9828983396291733, + "epoch": 0.5972597259725972, + "grad_norm": 0.2708134949207306, + "learning_rate": 8.643422997254613e-05, + "loss": 0.9747, + "mean_token_accuracy": 0.7163682281970978, + "num_tokens": 12642913.0, + "step": 1493 + }, + { + "entropy": 1.0749245584011078, + "epoch": 0.5976597659765976, + "grad_norm": 0.271984338760376, + "learning_rate": 8.632172250310436e-05, + "loss": 1.0666, + "mean_token_accuracy": 0.6928652822971344, + "num_tokens": 12651484.0, + "step": 1494 + }, + { + "entropy": 1.0282465517520905, + "epoch": 0.598059805980598, + "grad_norm": 0.2757219970226288, + "learning_rate": 8.620925477351998e-05, + "loss": 1.0138, + "mean_token_accuracy": 0.7086197435855865, + "num_tokens": 12659761.0, + "step": 1495 + }, + { + "entropy": 0.9936039745807648, + "epoch": 0.5984598459845984, + "grad_norm": 0.25934457778930664, + "learning_rate": 8.609682697255055e-05, + "loss": 0.9725, + "mean_token_accuracy": 0.7205819189548492, + "num_tokens": 12668256.0, + "step": 1496 + }, + { + "entropy": 0.9988133311271667, + "epoch": 0.5988598859885989, + "grad_norm": 0.26249489188194275, + "learning_rate": 8.598443928888644e-05, + "loss": 0.9836, + "mean_token_accuracy": 0.7176023125648499, + "num_tokens": 12676880.0, + "step": 1497 + }, + { + "entropy": 1.0762478858232498, + "epoch": 0.5992599259925993, + "grad_norm": 0.31243762373924255, + "learning_rate": 8.587209191115078e-05, + "loss": 1.0264, + "mean_token_accuracy": 0.6947596222162247, + "num_tokens": 12685335.0, + "step": 1498 + }, + { + "entropy": 1.0324297845363617, + "epoch": 0.5996599659965997, + "grad_norm": 0.28065040707588196, + "learning_rate": 8.575978502789908e-05, + "loss": 1.0241, + "mean_token_accuracy": 0.7071689367294312, + "num_tokens": 12693468.0, + "step": 1499 + }, + { + "entropy": 1.0827740728855133, + "epoch": 0.6000600060006, + "grad_norm": 0.2789984941482544, + "learning_rate": 8.564751882761877e-05, + "loss": 1.0838, + "mean_token_accuracy": 0.694503903388977, + "num_tokens": 12701768.0, + "step": 1500 + }, + { + "entropy": 1.013455092906952, + "epoch": 0.6004600460046005, + "grad_norm": 0.29872167110443115, + "learning_rate": 8.553529349872916e-05, + "loss": 1.0109, + "mean_token_accuracy": 0.7092700153589249, + "num_tokens": 12709959.0, + "step": 1501 + }, + { + "entropy": 1.0475957095623016, + "epoch": 0.6008600860086009, + "grad_norm": 0.28878945112228394, + "learning_rate": 8.542310922958084e-05, + "loss": 1.0784, + "mean_token_accuracy": 0.6954021900892258, + "num_tokens": 12717660.0, + "step": 1502 + }, + { + "entropy": 1.1063966155052185, + "epoch": 0.6012601260126013, + "grad_norm": 0.28748103976249695, + "learning_rate": 8.531096620845555e-05, + "loss": 1.1068, + "mean_token_accuracy": 0.6871594041585922, + "num_tokens": 12725730.0, + "step": 1503 + }, + { + "entropy": 0.9524229764938354, + "epoch": 0.6016601660166017, + "grad_norm": 0.25865939259529114, + "learning_rate": 8.519886462356578e-05, + "loss": 0.9638, + "mean_token_accuracy": 0.7249769419431686, + "num_tokens": 12735001.0, + "step": 1504 + }, + { + "entropy": 1.0370503216981888, + "epoch": 0.6020602060206021, + "grad_norm": 0.31152400374412537, + "learning_rate": 8.508680466305443e-05, + "loss": 1.024, + "mean_token_accuracy": 0.7025562822818756, + "num_tokens": 12743591.0, + "step": 1505 + }, + { + "entropy": 1.018994227051735, + "epoch": 0.6024602460246025, + "grad_norm": 0.26931026577949524, + "learning_rate": 8.497478651499471e-05, + "loss": 0.9877, + "mean_token_accuracy": 0.7157720029354095, + "num_tokens": 12751944.0, + "step": 1506 + }, + { + "entropy": 1.016649305820465, + "epoch": 0.6028602860286029, + "grad_norm": 0.2694644331932068, + "learning_rate": 8.486281036738943e-05, + "loss": 1.0248, + "mean_token_accuracy": 0.7083172798156738, + "num_tokens": 12760864.0, + "step": 1507 + }, + { + "entropy": 1.0304310470819473, + "epoch": 0.6032603260326033, + "grad_norm": 0.2672014534473419, + "learning_rate": 8.475087640817108e-05, + "loss": 1.0325, + "mean_token_accuracy": 0.7036655992269516, + "num_tokens": 12769469.0, + "step": 1508 + }, + { + "entropy": 1.040782168507576, + "epoch": 0.6036603660366037, + "grad_norm": 0.2753554880619049, + "learning_rate": 8.46389848252012e-05, + "loss": 1.0326, + "mean_token_accuracy": 0.7016676664352417, + "num_tokens": 12777814.0, + "step": 1509 + }, + { + "entropy": 1.0358385145664215, + "epoch": 0.6040604060406041, + "grad_norm": 0.2651107907295227, + "learning_rate": 8.452713580627039e-05, + "loss": 1.0173, + "mean_token_accuracy": 0.7018054127693176, + "num_tokens": 12786330.0, + "step": 1510 + }, + { + "entropy": 1.0044610053300858, + "epoch": 0.6044604460446045, + "grad_norm": 0.2694355249404907, + "learning_rate": 8.441532953909763e-05, + "loss": 0.9893, + "mean_token_accuracy": 0.7137744575738907, + "num_tokens": 12794673.0, + "step": 1511 + }, + { + "entropy": 1.0558724254369736, + "epoch": 0.6048604860486049, + "grad_norm": 0.2795282006263733, + "learning_rate": 8.430356621133031e-05, + "loss": 1.0407, + "mean_token_accuracy": 0.705982118844986, + "num_tokens": 12803052.0, + "step": 1512 + }, + { + "entropy": 1.0269375890493393, + "epoch": 0.6052605260526053, + "grad_norm": 0.2889425754547119, + "learning_rate": 8.419184601054362e-05, + "loss": 1.0355, + "mean_token_accuracy": 0.7007274180650711, + "num_tokens": 12810813.0, + "step": 1513 + }, + { + "entropy": 1.0418251603841782, + "epoch": 0.6056605660566057, + "grad_norm": 0.27037250995635986, + "learning_rate": 8.408016912424039e-05, + "loss": 1.0251, + "mean_token_accuracy": 0.6994795799255371, + "num_tokens": 12819095.0, + "step": 1514 + }, + { + "entropy": 1.0403810739517212, + "epoch": 0.6060606060606061, + "grad_norm": 0.2668045163154602, + "learning_rate": 8.396853573985089e-05, + "loss": 1.0721, + "mean_token_accuracy": 0.6973385959863663, + "num_tokens": 12827832.0, + "step": 1515 + }, + { + "entropy": 1.0411263704299927, + "epoch": 0.6064606460646065, + "grad_norm": 0.2765207290649414, + "learning_rate": 8.38569460447322e-05, + "loss": 1.025, + "mean_token_accuracy": 0.7055956721305847, + "num_tokens": 12835890.0, + "step": 1516 + }, + { + "entropy": 1.0502442419528961, + "epoch": 0.6068606860686069, + "grad_norm": 0.2856801748275757, + "learning_rate": 8.374540022616821e-05, + "loss": 1.0448, + "mean_token_accuracy": 0.7053952515125275, + "num_tokens": 12844017.0, + "step": 1517 + }, + { + "entropy": 1.01394422352314, + "epoch": 0.6072607260726073, + "grad_norm": 0.26755231618881226, + "learning_rate": 8.363389847136907e-05, + "loss": 1.0064, + "mean_token_accuracy": 0.7070586681365967, + "num_tokens": 12852626.0, + "step": 1518 + }, + { + "entropy": 0.9804023802280426, + "epoch": 0.6076607660766077, + "grad_norm": 0.26526373624801636, + "learning_rate": 8.3522440967471e-05, + "loss": 1.0061, + "mean_token_accuracy": 0.7098437249660492, + "num_tokens": 12861881.0, + "step": 1519 + }, + { + "entropy": 1.0379801392555237, + "epoch": 0.6080608060806081, + "grad_norm": 0.26620543003082275, + "learning_rate": 8.341102790153601e-05, + "loss": 1.0311, + "mean_token_accuracy": 0.7044303268194199, + "num_tokens": 12870118.0, + "step": 1520 + }, + { + "entropy": 0.967769056558609, + "epoch": 0.6084608460846085, + "grad_norm": 0.25754064321517944, + "learning_rate": 8.329965946055152e-05, + "loss": 0.9434, + "mean_token_accuracy": 0.7191864252090454, + "num_tokens": 12878439.0, + "step": 1521 + }, + { + "entropy": 1.0572359263896942, + "epoch": 0.6088608860886089, + "grad_norm": 0.26906558871269226, + "learning_rate": 8.318833583142996e-05, + "loss": 1.0365, + "mean_token_accuracy": 0.6988092958927155, + "num_tokens": 12886648.0, + "step": 1522 + }, + { + "entropy": 1.0653652846813202, + "epoch": 0.6092609260926093, + "grad_norm": 0.2928856909275055, + "learning_rate": 8.307705720100866e-05, + "loss": 1.0706, + "mean_token_accuracy": 0.6963316053152084, + "num_tokens": 12894835.0, + "step": 1523 + }, + { + "entropy": 1.0382554978132248, + "epoch": 0.6096609660966097, + "grad_norm": 0.2624090611934662, + "learning_rate": 8.296582375604938e-05, + "loss": 1.0437, + "mean_token_accuracy": 0.7062574923038483, + "num_tokens": 12903691.0, + "step": 1524 + }, + { + "entropy": 1.0709368288516998, + "epoch": 0.6100610061006101, + "grad_norm": 0.274172306060791, + "learning_rate": 8.285463568323804e-05, + "loss": 1.0612, + "mean_token_accuracy": 0.6956117451190948, + "num_tokens": 12911911.0, + "step": 1525 + }, + { + "entropy": 0.9851703196763992, + "epoch": 0.6104610461046105, + "grad_norm": 0.2564312517642975, + "learning_rate": 8.274349316918446e-05, + "loss": 0.9524, + "mean_token_accuracy": 0.7253227233886719, + "num_tokens": 12920257.0, + "step": 1526 + }, + { + "entropy": 1.0781344771385193, + "epoch": 0.6108610861086109, + "grad_norm": 0.26705825328826904, + "learning_rate": 8.26323964004219e-05, + "loss": 1.0668, + "mean_token_accuracy": 0.7006201595067978, + "num_tokens": 12928604.0, + "step": 1527 + }, + { + "entropy": 1.0537561178207397, + "epoch": 0.6112611261126113, + "grad_norm": 0.272816926240921, + "learning_rate": 8.252134556340689e-05, + "loss": 1.044, + "mean_token_accuracy": 0.7040509581565857, + "num_tokens": 12936661.0, + "step": 1528 + }, + { + "entropy": 0.9914866387844086, + "epoch": 0.6116611661166117, + "grad_norm": 0.2764982581138611, + "learning_rate": 8.241034084451897e-05, + "loss": 0.9914, + "mean_token_accuracy": 0.7138096541166306, + "num_tokens": 12944739.0, + "step": 1529 + }, + { + "entropy": 1.0856202393770218, + "epoch": 0.6120612061206121, + "grad_norm": 0.2835068702697754, + "learning_rate": 8.229938243006015e-05, + "loss": 1.0692, + "mean_token_accuracy": 0.7003717571496964, + "num_tokens": 12953180.0, + "step": 1530 + }, + { + "entropy": 1.0558515936136246, + "epoch": 0.6124612461246125, + "grad_norm": 0.2758465111255646, + "learning_rate": 8.218847050625476e-05, + "loss": 1.0325, + "mean_token_accuracy": 0.6995532363653183, + "num_tokens": 12961250.0, + "step": 1531 + }, + { + "entropy": 1.0177436172962189, + "epoch": 0.6128612861286129, + "grad_norm": 0.28152191638946533, + "learning_rate": 8.20776052592491e-05, + "loss": 1.026, + "mean_token_accuracy": 0.7140059620141983, + "num_tokens": 12969512.0, + "step": 1532 + }, + { + "entropy": 0.9774316251277924, + "epoch": 0.6132613261326133, + "grad_norm": 0.2512759864330292, + "learning_rate": 8.196678687511115e-05, + "loss": 0.9613, + "mean_token_accuracy": 0.7146789878606796, + "num_tokens": 12978373.0, + "step": 1533 + }, + { + "entropy": 1.0321957021951675, + "epoch": 0.6136613661366137, + "grad_norm": 0.27034422755241394, + "learning_rate": 8.185601553983024e-05, + "loss": 1.0185, + "mean_token_accuracy": 0.7069217562675476, + "num_tokens": 12986768.0, + "step": 1534 + }, + { + "entropy": 1.0240458995103836, + "epoch": 0.6140614061406141, + "grad_norm": 0.2746856212615967, + "learning_rate": 8.174529143931677e-05, + "loss": 1.0082, + "mean_token_accuracy": 0.7121590375900269, + "num_tokens": 12994746.0, + "step": 1535 + }, + { + "entropy": 1.0193761140108109, + "epoch": 0.6144614461446145, + "grad_norm": 0.27232956886291504, + "learning_rate": 8.163461475940175e-05, + "loss": 1.0132, + "mean_token_accuracy": 0.7069653272628784, + "num_tokens": 13003164.0, + "step": 1536 + }, + { + "entropy": 1.0601644665002823, + "epoch": 0.6148614861486149, + "grad_norm": 0.2746070921421051, + "learning_rate": 8.15239856858367e-05, + "loss": 1.0566, + "mean_token_accuracy": 0.7062048316001892, + "num_tokens": 13011370.0, + "step": 1537 + }, + { + "entropy": 0.9899421185255051, + "epoch": 0.6152615261526153, + "grad_norm": 0.2799742817878723, + "learning_rate": 8.141340440429323e-05, + "loss": 0.9763, + "mean_token_accuracy": 0.7185059487819672, + "num_tokens": 13019460.0, + "step": 1538 + }, + { + "entropy": 1.0483266413211823, + "epoch": 0.6156615661566157, + "grad_norm": 0.2726534605026245, + "learning_rate": 8.130287110036277e-05, + "loss": 1.065, + "mean_token_accuracy": 0.7006007134914398, + "num_tokens": 13028092.0, + "step": 1539 + }, + { + "entropy": 0.985085055232048, + "epoch": 0.6160616061606161, + "grad_norm": 0.2885760962963104, + "learning_rate": 8.11923859595561e-05, + "loss": 0.9872, + "mean_token_accuracy": 0.7170431017875671, + "num_tokens": 13036463.0, + "step": 1540 + }, + { + "entropy": 1.0274955481290817, + "epoch": 0.6164616461646165, + "grad_norm": 0.2816045582294464, + "learning_rate": 8.108194916730331e-05, + "loss": 1.0341, + "mean_token_accuracy": 0.6977733969688416, + "num_tokens": 13044855.0, + "step": 1541 + }, + { + "entropy": 1.0117756128311157, + "epoch": 0.6168616861686168, + "grad_norm": 0.27995944023132324, + "learning_rate": 8.097156090895323e-05, + "loss": 1.024, + "mean_token_accuracy": 0.7018360942602158, + "num_tokens": 13053160.0, + "step": 1542 + }, + { + "entropy": 1.0170161128044128, + "epoch": 0.6172617261726172, + "grad_norm": 0.27276766300201416, + "learning_rate": 8.08612213697734e-05, + "loss": 1.0125, + "mean_token_accuracy": 0.7152312844991684, + "num_tokens": 13061388.0, + "step": 1543 + }, + { + "entropy": 1.0133970826864243, + "epoch": 0.6176617661766176, + "grad_norm": 0.26618948578834534, + "learning_rate": 8.075093073494937e-05, + "loss": 1.019, + "mean_token_accuracy": 0.7065234184265137, + "num_tokens": 13070066.0, + "step": 1544 + }, + { + "entropy": 0.9973821491003036, + "epoch": 0.618061806180618, + "grad_norm": 0.26258668303489685, + "learning_rate": 8.064068918958476e-05, + "loss": 0.9739, + "mean_token_accuracy": 0.7232815474271774, + "num_tokens": 13078406.0, + "step": 1545 + }, + { + "entropy": 1.0118751972913742, + "epoch": 0.6184618461846184, + "grad_norm": 0.2750121057033539, + "learning_rate": 8.053049691870082e-05, + "loss": 1.0024, + "mean_token_accuracy": 0.7085379064083099, + "num_tokens": 13086673.0, + "step": 1546 + }, + { + "entropy": 0.9946679323911667, + "epoch": 0.6188618861886188, + "grad_norm": 0.2601962089538574, + "learning_rate": 8.042035410723596e-05, + "loss": 0.99, + "mean_token_accuracy": 0.7155250161886215, + "num_tokens": 13095588.0, + "step": 1547 + }, + { + "entropy": 1.0767863094806671, + "epoch": 0.6192619261926192, + "grad_norm": 0.2891894280910492, + "learning_rate": 8.031026094004577e-05, + "loss": 1.0602, + "mean_token_accuracy": 0.6983335912227631, + "num_tokens": 13103897.0, + "step": 1548 + }, + { + "entropy": 1.0941403806209564, + "epoch": 0.6196619661966196, + "grad_norm": 0.2829514741897583, + "learning_rate": 8.020021760190237e-05, + "loss": 1.0723, + "mean_token_accuracy": 0.6875952482223511, + "num_tokens": 13111878.0, + "step": 1549 + }, + { + "entropy": 1.0751255452632904, + "epoch": 0.62006200620062, + "grad_norm": 0.27135440707206726, + "learning_rate": 8.009022427749431e-05, + "loss": 1.0622, + "mean_token_accuracy": 0.7019540965557098, + "num_tokens": 13120181.0, + "step": 1550 + }, + { + "entropy": 1.0659511983394623, + "epoch": 0.6204620462046204, + "grad_norm": 0.2654080092906952, + "learning_rate": 7.998028115142617e-05, + "loss": 1.0583, + "mean_token_accuracy": 0.7013959288597107, + "num_tokens": 13128574.0, + "step": 1551 + }, + { + "entropy": 0.9782145470380783, + "epoch": 0.6208620862086208, + "grad_norm": 0.26692116260528564, + "learning_rate": 7.98703884082183e-05, + "loss": 0.9575, + "mean_token_accuracy": 0.7152103185653687, + "num_tokens": 13136708.0, + "step": 1552 + }, + { + "entropy": 1.0135638266801834, + "epoch": 0.6212621262126212, + "grad_norm": 0.26060575246810913, + "learning_rate": 7.976054623230657e-05, + "loss": 0.9797, + "mean_token_accuracy": 0.7244757264852524, + "num_tokens": 13145158.0, + "step": 1553 + }, + { + "entropy": 0.9974073618650436, + "epoch": 0.6216621662166216, + "grad_norm": 0.27319982647895813, + "learning_rate": 7.965075480804185e-05, + "loss": 0.9896, + "mean_token_accuracy": 0.7175292372703552, + "num_tokens": 13153808.0, + "step": 1554 + }, + { + "entropy": 1.0359916388988495, + "epoch": 0.622062206220622, + "grad_norm": 0.269796222448349, + "learning_rate": 7.95410143196899e-05, + "loss": 1.0701, + "mean_token_accuracy": 0.6935649365186691, + "num_tokens": 13162452.0, + "step": 1555 + }, + { + "entropy": 0.9728094786405563, + "epoch": 0.6224622462246224, + "grad_norm": 0.2637464702129364, + "learning_rate": 7.943132495143095e-05, + "loss": 0.961, + "mean_token_accuracy": 0.7222893387079239, + "num_tokens": 13171207.0, + "step": 1556 + }, + { + "entropy": 0.9538765847682953, + "epoch": 0.6228622862286228, + "grad_norm": 0.2502013146877289, + "learning_rate": 7.932168688735955e-05, + "loss": 0.9404, + "mean_token_accuracy": 0.7296594232320786, + "num_tokens": 13180300.0, + "step": 1557 + }, + { + "entropy": 1.0499037504196167, + "epoch": 0.6232623262326232, + "grad_norm": 0.2804265022277832, + "learning_rate": 7.921210031148396e-05, + "loss": 1.0968, + "mean_token_accuracy": 0.6903265118598938, + "num_tokens": 13188671.0, + "step": 1558 + }, + { + "entropy": 1.0219275802373886, + "epoch": 0.6236623662366236, + "grad_norm": 0.27089381217956543, + "learning_rate": 7.910256540772623e-05, + "loss": 1.0194, + "mean_token_accuracy": 0.705549344420433, + "num_tokens": 13197076.0, + "step": 1559 + }, + { + "entropy": 0.9941435605287552, + "epoch": 0.624062406240624, + "grad_norm": 0.2763214707374573, + "learning_rate": 7.899308235992149e-05, + "loss": 0.9864, + "mean_token_accuracy": 0.7113952338695526, + "num_tokens": 13205980.0, + "step": 1560 + }, + { + "entropy": 1.0282153934240341, + "epoch": 0.6244624462446244, + "grad_norm": 0.2696799635887146, + "learning_rate": 7.888365135181794e-05, + "loss": 1.0325, + "mean_token_accuracy": 0.7014058232307434, + "num_tokens": 13214566.0, + "step": 1561 + }, + { + "entropy": 1.0038295835256577, + "epoch": 0.6248624862486248, + "grad_norm": 0.26871535181999207, + "learning_rate": 7.87742725670765e-05, + "loss": 0.9762, + "mean_token_accuracy": 0.7163118869066238, + "num_tokens": 13223057.0, + "step": 1562 + }, + { + "entropy": 1.0745265483856201, + "epoch": 0.6252625262526252, + "grad_norm": 0.29424744844436646, + "learning_rate": 7.866494618927036e-05, + "loss": 1.0382, + "mean_token_accuracy": 0.7064184695482254, + "num_tokens": 13231148.0, + "step": 1563 + }, + { + "entropy": 0.9946384131908417, + "epoch": 0.6256625662566256, + "grad_norm": 0.3346986472606659, + "learning_rate": 7.855567240188475e-05, + "loss": 0.9923, + "mean_token_accuracy": 0.7161682844161987, + "num_tokens": 13240039.0, + "step": 1564 + }, + { + "entropy": 1.0118414759635925, + "epoch": 0.626062606260626, + "grad_norm": 0.25484490394592285, + "learning_rate": 7.844645138831667e-05, + "loss": 0.987, + "mean_token_accuracy": 0.7205919176340103, + "num_tokens": 13248948.0, + "step": 1565 + }, + { + "entropy": 0.9960702657699585, + "epoch": 0.6264626462646264, + "grad_norm": 0.2648907005786896, + "learning_rate": 7.833728333187446e-05, + "loss": 0.9631, + "mean_token_accuracy": 0.7116621434688568, + "num_tokens": 13257211.0, + "step": 1566 + }, + { + "entropy": 0.9825841188430786, + "epoch": 0.6268626862686268, + "grad_norm": 0.2625282406806946, + "learning_rate": 7.822816841577775e-05, + "loss": 0.9795, + "mean_token_accuracy": 0.7155411392450333, + "num_tokens": 13265851.0, + "step": 1567 + }, + { + "entropy": 1.0454929769039154, + "epoch": 0.6272627262726272, + "grad_norm": 0.293143093585968, + "learning_rate": 7.81191068231569e-05, + "loss": 1.0431, + "mean_token_accuracy": 0.7011211663484573, + "num_tokens": 13273945.0, + "step": 1568 + }, + { + "entropy": 1.0135682225227356, + "epoch": 0.6276627662766276, + "grad_norm": 0.3275827169418335, + "learning_rate": 7.80100987370527e-05, + "loss": 1.0311, + "mean_token_accuracy": 0.7014229595661163, + "num_tokens": 13281869.0, + "step": 1569 + }, + { + "entropy": 1.0300889909267426, + "epoch": 0.628062806280628, + "grad_norm": 0.2853155732154846, + "learning_rate": 7.790114434041618e-05, + "loss": 1.0506, + "mean_token_accuracy": 0.7058955878019333, + "num_tokens": 13289941.0, + "step": 1570 + }, + { + "entropy": 1.031609669327736, + "epoch": 0.6284628462846285, + "grad_norm": 0.2898198962211609, + "learning_rate": 7.779224381610837e-05, + "loss": 1.049, + "mean_token_accuracy": 0.6984302401542664, + "num_tokens": 13297915.0, + "step": 1571 + }, + { + "entropy": 1.0139999240636826, + "epoch": 0.6288628862886289, + "grad_norm": 0.27943116426467896, + "learning_rate": 7.768339734689975e-05, + "loss": 1.0191, + "mean_token_accuracy": 0.7088859528303146, + "num_tokens": 13306418.0, + "step": 1572 + }, + { + "entropy": 0.9866802096366882, + "epoch": 0.6292629262926293, + "grad_norm": 0.27253884077072144, + "learning_rate": 7.757460511547014e-05, + "loss": 0.9733, + "mean_token_accuracy": 0.7180858701467514, + "num_tokens": 13314614.0, + "step": 1573 + }, + { + "entropy": 1.0598840117454529, + "epoch": 0.6296629662966297, + "grad_norm": 0.2764323353767395, + "learning_rate": 7.746586730440832e-05, + "loss": 1.0658, + "mean_token_accuracy": 0.7007716596126556, + "num_tokens": 13322681.0, + "step": 1574 + }, + { + "entropy": 0.9893881529569626, + "epoch": 0.6300630063006301, + "grad_norm": 0.2750706374645233, + "learning_rate": 7.735718409621172e-05, + "loss": 0.9657, + "mean_token_accuracy": 0.7191077917814255, + "num_tokens": 13330674.0, + "step": 1575 + }, + { + "entropy": 0.9749124646186829, + "epoch": 0.6304630463046305, + "grad_norm": 0.2668512761592865, + "learning_rate": 7.724855567328613e-05, + "loss": 0.9696, + "mean_token_accuracy": 0.7182539701461792, + "num_tokens": 13339522.0, + "step": 1576 + }, + { + "entropy": 1.0415545254945755, + "epoch": 0.6308630863086309, + "grad_norm": 0.2874346077442169, + "learning_rate": 7.713998221794545e-05, + "loss": 1.0609, + "mean_token_accuracy": 0.7055009007453918, + "num_tokens": 13347407.0, + "step": 1577 + }, + { + "entropy": 1.0135153234004974, + "epoch": 0.6312631263126313, + "grad_norm": 0.26647090911865234, + "learning_rate": 7.703146391241125e-05, + "loss": 0.9931, + "mean_token_accuracy": 0.7190768420696259, + "num_tokens": 13355687.0, + "step": 1578 + }, + { + "entropy": 1.0682182312011719, + "epoch": 0.6316631663166317, + "grad_norm": 0.2694565951824188, + "learning_rate": 7.69230009388126e-05, + "loss": 1.0432, + "mean_token_accuracy": 0.7075686752796173, + "num_tokens": 13364295.0, + "step": 1579 + }, + { + "entropy": 1.0312392264604568, + "epoch": 0.6320632063206321, + "grad_norm": 0.2587052285671234, + "learning_rate": 7.681459347918558e-05, + "loss": 1.0107, + "mean_token_accuracy": 0.7058295011520386, + "num_tokens": 13373238.0, + "step": 1580 + }, + { + "entropy": 1.0258017629384995, + "epoch": 0.6324632463246325, + "grad_norm": 0.2643885910511017, + "learning_rate": 7.670624171547334e-05, + "loss": 1.01, + "mean_token_accuracy": 0.7163285911083221, + "num_tokens": 13381853.0, + "step": 1581 + }, + { + "entropy": 1.0313010215759277, + "epoch": 0.6328632863286329, + "grad_norm": 0.27118563652038574, + "learning_rate": 7.659794582952531e-05, + "loss": 1.0362, + "mean_token_accuracy": 0.7045850604772568, + "num_tokens": 13390184.0, + "step": 1582 + }, + { + "entropy": 1.00785331428051, + "epoch": 0.6332633263326333, + "grad_norm": 0.2647589147090912, + "learning_rate": 7.648970600309731e-05, + "loss": 1.0044, + "mean_token_accuracy": 0.7158198356628418, + "num_tokens": 13398817.0, + "step": 1583 + }, + { + "entropy": 1.0150828510522842, + "epoch": 0.6336633663366337, + "grad_norm": 0.2572803497314453, + "learning_rate": 7.638152241785091e-05, + "loss": 0.9581, + "mean_token_accuracy": 0.7213051170110703, + "num_tokens": 13407527.0, + "step": 1584 + }, + { + "entropy": 1.0241198241710663, + "epoch": 0.6340634063406341, + "grad_norm": 0.2671639025211334, + "learning_rate": 7.627339525535344e-05, + "loss": 1.0008, + "mean_token_accuracy": 0.716378390789032, + "num_tokens": 13416206.0, + "step": 1585 + }, + { + "entropy": 0.9638293385505676, + "epoch": 0.6344634463446345, + "grad_norm": 0.2568714916706085, + "learning_rate": 7.61653246970775e-05, + "loss": 0.9437, + "mean_token_accuracy": 0.7253225445747375, + "num_tokens": 13425007.0, + "step": 1586 + }, + { + "entropy": 1.0536439418792725, + "epoch": 0.6348634863486349, + "grad_norm": 0.28028926253318787, + "learning_rate": 7.605731092440063e-05, + "loss": 1.0611, + "mean_token_accuracy": 0.6932785362005234, + "num_tokens": 13433083.0, + "step": 1587 + }, + { + "entropy": 0.9797328561544418, + "epoch": 0.6352635263526353, + "grad_norm": 0.2622891962528229, + "learning_rate": 7.594935411860509e-05, + "loss": 0.989, + "mean_token_accuracy": 0.711808443069458, + "num_tokens": 13441708.0, + "step": 1588 + }, + { + "entropy": 1.0745765268802643, + "epoch": 0.6356635663566357, + "grad_norm": 0.2960025370121002, + "learning_rate": 7.584145446087756e-05, + "loss": 1.1009, + "mean_token_accuracy": 0.6931444555521011, + "num_tokens": 13449558.0, + "step": 1589 + }, + { + "entropy": 0.9538265764713287, + "epoch": 0.6360636063606361, + "grad_norm": 0.26100143790245056, + "learning_rate": 7.573361213230883e-05, + "loss": 0.9343, + "mean_token_accuracy": 0.7261921465396881, + "num_tokens": 13458607.0, + "step": 1590 + }, + { + "entropy": 1.0278007835149765, + "epoch": 0.6364636463646365, + "grad_norm": 0.27330920100212097, + "learning_rate": 7.562582731389336e-05, + "loss": 1.0378, + "mean_token_accuracy": 0.695374146103859, + "num_tokens": 13467504.0, + "step": 1591 + }, + { + "entropy": 1.0486839711666107, + "epoch": 0.6368636863686369, + "grad_norm": 0.2801864445209503, + "learning_rate": 7.551810018652923e-05, + "loss": 1.0382, + "mean_token_accuracy": 0.7030842453241348, + "num_tokens": 13475907.0, + "step": 1592 + }, + { + "entropy": 1.045127511024475, + "epoch": 0.6372637263726373, + "grad_norm": 0.2759176194667816, + "learning_rate": 7.541043093101754e-05, + "loss": 1.0447, + "mean_token_accuracy": 0.6948913931846619, + "num_tokens": 13484282.0, + "step": 1593 + }, + { + "entropy": 1.036905661225319, + "epoch": 0.6376637663766377, + "grad_norm": 0.2795928418636322, + "learning_rate": 7.530281972806241e-05, + "loss": 1.0349, + "mean_token_accuracy": 0.702324241399765, + "num_tokens": 13492805.0, + "step": 1594 + }, + { + "entropy": 0.9836885780096054, + "epoch": 0.6380638063806381, + "grad_norm": 0.26184821128845215, + "learning_rate": 7.519526675827049e-05, + "loss": 0.9879, + "mean_token_accuracy": 0.7129903584718704, + "num_tokens": 13501585.0, + "step": 1595 + }, + { + "entropy": 1.0157838761806488, + "epoch": 0.6384638463846385, + "grad_norm": 0.27548947930336, + "learning_rate": 7.508777220215062e-05, + "loss": 1.0059, + "mean_token_accuracy": 0.7044865936040878, + "num_tokens": 13509958.0, + "step": 1596 + }, + { + "entropy": 0.9340676069259644, + "epoch": 0.6388638863886389, + "grad_norm": 0.2639513313770294, + "learning_rate": 7.498033624011369e-05, + "loss": 0.907, + "mean_token_accuracy": 0.728716105222702, + "num_tokens": 13518413.0, + "step": 1597 + }, + { + "entropy": 1.0446272939443588, + "epoch": 0.6392639263926393, + "grad_norm": 0.2784963846206665, + "learning_rate": 7.487295905247223e-05, + "loss": 1.045, + "mean_token_accuracy": 0.7025579214096069, + "num_tokens": 13527021.0, + "step": 1598 + }, + { + "entropy": 1.024623617529869, + "epoch": 0.6396639663966397, + "grad_norm": 0.2779615819454193, + "learning_rate": 7.476564081944004e-05, + "loss": 1.0174, + "mean_token_accuracy": 0.7070256471633911, + "num_tokens": 13535341.0, + "step": 1599 + }, + { + "entropy": 1.0059790164232254, + "epoch": 0.6400640064006401, + "grad_norm": 0.27965015172958374, + "learning_rate": 7.465838172113214e-05, + "loss": 0.9881, + "mean_token_accuracy": 0.7175890058279037, + "num_tokens": 13543993.0, + "step": 1600 + }, + { + "entropy": 0.9897790998220444, + "epoch": 0.6404640464046405, + "grad_norm": 0.272169291973114, + "learning_rate": 7.455118193756419e-05, + "loss": 0.9669, + "mean_token_accuracy": 0.7144671529531479, + "num_tokens": 13552010.0, + "step": 1601 + }, + { + "entropy": 1.001438856124878, + "epoch": 0.6408640864086409, + "grad_norm": 0.26202327013015747, + "learning_rate": 7.444404164865227e-05, + "loss": 1.0109, + "mean_token_accuracy": 0.7063065022230148, + "num_tokens": 13560954.0, + "step": 1602 + }, + { + "entropy": 1.1066432297229767, + "epoch": 0.6412641264126413, + "grad_norm": 0.28559938073158264, + "learning_rate": 7.433696103421269e-05, + "loss": 1.1105, + "mean_token_accuracy": 0.6887119114398956, + "num_tokens": 13569195.0, + "step": 1603 + }, + { + "entropy": 1.0169396549463272, + "epoch": 0.6416641664166417, + "grad_norm": 0.26923683285713196, + "learning_rate": 7.422994027396162e-05, + "loss": 1.0162, + "mean_token_accuracy": 0.7034173458814621, + "num_tokens": 13577758.0, + "step": 1604 + }, + { + "entropy": 1.0646847486495972, + "epoch": 0.6420642064206421, + "grad_norm": 0.270306795835495, + "learning_rate": 7.412297954751468e-05, + "loss": 1.0604, + "mean_token_accuracy": 0.6981649547815323, + "num_tokens": 13586217.0, + "step": 1605 + }, + { + "entropy": 1.0149510949850082, + "epoch": 0.6424642464246425, + "grad_norm": 0.2878839373588562, + "learning_rate": 7.401607903438679e-05, + "loss": 1.0228, + "mean_token_accuracy": 0.705387219786644, + "num_tokens": 13594423.0, + "step": 1606 + }, + { + "entropy": 1.0177496373653412, + "epoch": 0.6428642864286429, + "grad_norm": 0.2713722288608551, + "learning_rate": 7.390923891399186e-05, + "loss": 0.9927, + "mean_token_accuracy": 0.7136061638593674, + "num_tokens": 13602600.0, + "step": 1607 + }, + { + "entropy": 0.9525837898254395, + "epoch": 0.6432643264326433, + "grad_norm": 0.29586541652679443, + "learning_rate": 7.380245936564232e-05, + "loss": 0.9576, + "mean_token_accuracy": 0.7250726222991943, + "num_tokens": 13611083.0, + "step": 1608 + }, + { + "entropy": 0.9942192137241364, + "epoch": 0.6436643664366437, + "grad_norm": 0.27431654930114746, + "learning_rate": 7.369574056854906e-05, + "loss": 0.9736, + "mean_token_accuracy": 0.7180140018463135, + "num_tokens": 13619984.0, + "step": 1609 + }, + { + "entropy": 0.9461335688829422, + "epoch": 0.6440644064406441, + "grad_norm": 0.2640526592731476, + "learning_rate": 7.3589082701821e-05, + "loss": 0.9329, + "mean_token_accuracy": 0.7252113670110703, + "num_tokens": 13628746.0, + "step": 1610 + }, + { + "entropy": 1.0502605885267258, + "epoch": 0.6444644464446445, + "grad_norm": 0.2833975553512573, + "learning_rate": 7.348248594446465e-05, + "loss": 1.0499, + "mean_token_accuracy": 0.699222132563591, + "num_tokens": 13636877.0, + "step": 1611 + }, + { + "entropy": 0.9922800660133362, + "epoch": 0.6448644864486449, + "grad_norm": 0.2834934592247009, + "learning_rate": 7.337595047538414e-05, + "loss": 1.0232, + "mean_token_accuracy": 0.711656853556633, + "num_tokens": 13645328.0, + "step": 1612 + }, + { + "entropy": 1.055614948272705, + "epoch": 0.6452645264526453, + "grad_norm": 0.27245664596557617, + "learning_rate": 7.326947647338063e-05, + "loss": 1.0397, + "mean_token_accuracy": 0.7050585448741913, + "num_tokens": 13653571.0, + "step": 1613 + }, + { + "entropy": 1.1498643159866333, + "epoch": 0.6456645664566457, + "grad_norm": 0.2952424883842468, + "learning_rate": 7.316306411715221e-05, + "loss": 1.1557, + "mean_token_accuracy": 0.6816436797380447, + "num_tokens": 13661753.0, + "step": 1614 + }, + { + "entropy": 0.968589186668396, + "epoch": 0.6460646064606461, + "grad_norm": 0.2699323296546936, + "learning_rate": 7.305671358529342e-05, + "loss": 0.9542, + "mean_token_accuracy": 0.7205090671777725, + "num_tokens": 13670209.0, + "step": 1615 + }, + { + "entropy": 1.0400151312351227, + "epoch": 0.6464646464646465, + "grad_norm": 0.26660025119781494, + "learning_rate": 7.295042505629508e-05, + "loss": 1.0378, + "mean_token_accuracy": 0.709984079003334, + "num_tokens": 13678837.0, + "step": 1616 + }, + { + "entropy": 0.9415567070245743, + "epoch": 0.6468646864686468, + "grad_norm": 0.2540196478366852, + "learning_rate": 7.284419870854389e-05, + "loss": 0.9151, + "mean_token_accuracy": 0.7249320894479752, + "num_tokens": 13687859.0, + "step": 1617 + }, + { + "entropy": 0.9984412640333176, + "epoch": 0.6472647264726472, + "grad_norm": 0.255563348531723, + "learning_rate": 7.273803472032233e-05, + "loss": 1.0019, + "mean_token_accuracy": 0.7149690836668015, + "num_tokens": 13696943.0, + "step": 1618 + }, + { + "entropy": 1.0580152124166489, + "epoch": 0.6476647664766476, + "grad_norm": 0.27433598041534424, + "learning_rate": 7.263193326980809e-05, + "loss": 1.0531, + "mean_token_accuracy": 0.6998144388198853, + "num_tokens": 13705037.0, + "step": 1619 + }, + { + "entropy": 0.990716278553009, + "epoch": 0.648064806480648, + "grad_norm": 0.27015000581741333, + "learning_rate": 7.25258945350739e-05, + "loss": 0.9924, + "mean_token_accuracy": 0.7103289216756821, + "num_tokens": 13713516.0, + "step": 1620 + }, + { + "entropy": 0.974513903260231, + "epoch": 0.6484648464846484, + "grad_norm": 0.2904779016971588, + "learning_rate": 7.241991869408736e-05, + "loss": 0.9813, + "mean_token_accuracy": 0.7205400764942169, + "num_tokens": 13722206.0, + "step": 1621 + }, + { + "entropy": 1.0186756998300552, + "epoch": 0.6488648864886488, + "grad_norm": 0.26860541105270386, + "learning_rate": 7.231400592471032e-05, + "loss": 1.0145, + "mean_token_accuracy": 0.7109009027481079, + "num_tokens": 13730656.0, + "step": 1622 + }, + { + "entropy": 1.0084389001131058, + "epoch": 0.6492649264926492, + "grad_norm": 0.26772019267082214, + "learning_rate": 7.220815640469896e-05, + "loss": 1.0143, + "mean_token_accuracy": 0.7064168900251389, + "num_tokens": 13738909.0, + "step": 1623 + }, + { + "entropy": 1.031301110982895, + "epoch": 0.6496649664966496, + "grad_norm": 0.276083379983902, + "learning_rate": 7.210237031170327e-05, + "loss": 1.0221, + "mean_token_accuracy": 0.7155923545360565, + "num_tokens": 13747732.0, + "step": 1624 + }, + { + "entropy": 1.0622113943099976, + "epoch": 0.65006500650065, + "grad_norm": 0.2660558223724365, + "learning_rate": 7.199664782326665e-05, + "loss": 1.0143, + "mean_token_accuracy": 0.7118292450904846, + "num_tokens": 13756144.0, + "step": 1625 + }, + { + "entropy": 1.0278472006320953, + "epoch": 0.6504650465046504, + "grad_norm": 0.2955634593963623, + "learning_rate": 7.189098911682592e-05, + "loss": 1.0088, + "mean_token_accuracy": 0.7070183306932449, + "num_tokens": 13764311.0, + "step": 1626 + }, + { + "entropy": 1.0049774646759033, + "epoch": 0.6508650865086508, + "grad_norm": 0.2686450183391571, + "learning_rate": 7.178539436971072e-05, + "loss": 0.9736, + "mean_token_accuracy": 0.7252427935600281, + "num_tokens": 13772678.0, + "step": 1627 + }, + { + "entropy": 0.9852571785449982, + "epoch": 0.6512651265126512, + "grad_norm": 0.25933679938316345, + "learning_rate": 7.167986375914346e-05, + "loss": 0.9629, + "mean_token_accuracy": 0.7202611118555069, + "num_tokens": 13781195.0, + "step": 1628 + }, + { + "entropy": 1.0735368728637695, + "epoch": 0.6516651665166516, + "grad_norm": 0.30488941073417664, + "learning_rate": 7.157439746223886e-05, + "loss": 1.0725, + "mean_token_accuracy": 0.6960095465183258, + "num_tokens": 13789719.0, + "step": 1629 + }, + { + "entropy": 0.9660122990608215, + "epoch": 0.652065206520652, + "grad_norm": 0.2656802237033844, + "learning_rate": 7.146899565600372e-05, + "loss": 0.9692, + "mean_token_accuracy": 0.7151325345039368, + "num_tokens": 13798530.0, + "step": 1630 + }, + { + "entropy": 0.9490653723478317, + "epoch": 0.6524652465246524, + "grad_norm": 0.27268168330192566, + "learning_rate": 7.136365851733649e-05, + "loss": 0.9483, + "mean_token_accuracy": 0.7265384644269943, + "num_tokens": 13807042.0, + "step": 1631 + }, + { + "entropy": 0.9809153825044632, + "epoch": 0.6528652865286528, + "grad_norm": 0.2744136154651642, + "learning_rate": 7.125838622302723e-05, + "loss": 1.0233, + "mean_token_accuracy": 0.7129384577274323, + "num_tokens": 13815560.0, + "step": 1632 + }, + { + "entropy": 1.0176912099123, + "epoch": 0.6532653265326532, + "grad_norm": 0.26673975586891174, + "learning_rate": 7.115317894975717e-05, + "loss": 1.0126, + "mean_token_accuracy": 0.712721660733223, + "num_tokens": 13824129.0, + "step": 1633 + }, + { + "entropy": 0.9501180797815323, + "epoch": 0.6536653665366536, + "grad_norm": 0.257599800825119, + "learning_rate": 7.104803687409829e-05, + "loss": 0.9556, + "mean_token_accuracy": 0.7245234847068787, + "num_tokens": 13832880.0, + "step": 1634 + }, + { + "entropy": 1.0651895105838776, + "epoch": 0.654065406540654, + "grad_norm": 0.26873040199279785, + "learning_rate": 7.094296017251325e-05, + "loss": 1.0579, + "mean_token_accuracy": 0.698590487241745, + "num_tokens": 13841503.0, + "step": 1635 + }, + { + "entropy": 1.0840655267238617, + "epoch": 0.6544654465446544, + "grad_norm": 0.2880321145057678, + "learning_rate": 7.083794902135492e-05, + "loss": 1.0866, + "mean_token_accuracy": 0.6838184744119644, + "num_tokens": 13849420.0, + "step": 1636 + }, + { + "entropy": 0.9999715089797974, + "epoch": 0.6548654865486548, + "grad_norm": 0.26062822341918945, + "learning_rate": 7.073300359686623e-05, + "loss": 0.9826, + "mean_token_accuracy": 0.7190447300672531, + "num_tokens": 13858139.0, + "step": 1637 + }, + { + "entropy": 1.0640488415956497, + "epoch": 0.6552655265526552, + "grad_norm": 0.27405962347984314, + "learning_rate": 7.06281240751798e-05, + "loss": 1.0309, + "mean_token_accuracy": 0.6959473788738251, + "num_tokens": 13866414.0, + "step": 1638 + }, + { + "entropy": 1.0301588326692581, + "epoch": 0.6556655665566556, + "grad_norm": 0.2698734998703003, + "learning_rate": 7.052331063231756e-05, + "loss": 1.0018, + "mean_token_accuracy": 0.7170789837837219, + "num_tokens": 13875000.0, + "step": 1639 + }, + { + "entropy": 1.0607908368110657, + "epoch": 0.656065606560656, + "grad_norm": 0.27450287342071533, + "learning_rate": 7.041856344419057e-05, + "loss": 1.0541, + "mean_token_accuracy": 0.7017786651849747, + "num_tokens": 13883097.0, + "step": 1640 + }, + { + "entropy": 1.0127398818731308, + "epoch": 0.6564656465646564, + "grad_norm": 0.25994282960891724, + "learning_rate": 7.031388268659873e-05, + "loss": 0.9852, + "mean_token_accuracy": 0.7097787111997604, + "num_tokens": 13891637.0, + "step": 1641 + }, + { + "entropy": 1.0960640162229538, + "epoch": 0.6568656865686568, + "grad_norm": 0.2895820140838623, + "learning_rate": 7.020926853523046e-05, + "loss": 1.0666, + "mean_token_accuracy": 0.6987890899181366, + "num_tokens": 13899373.0, + "step": 1642 + }, + { + "entropy": 1.0522951185703278, + "epoch": 0.6572657265726572, + "grad_norm": 0.2727259695529938, + "learning_rate": 7.010472116566236e-05, + "loss": 1.0474, + "mean_token_accuracy": 0.6956601291894913, + "num_tokens": 13907797.0, + "step": 1643 + }, + { + "entropy": 1.0702660530805588, + "epoch": 0.6576657665766577, + "grad_norm": 0.2680768072605133, + "learning_rate": 7.000024075335889e-05, + "loss": 1.0309, + "mean_token_accuracy": 0.708165630698204, + "num_tokens": 13916312.0, + "step": 1644 + }, + { + "entropy": 1.0627661794424057, + "epoch": 0.658065806580658, + "grad_norm": 0.3145293891429901, + "learning_rate": 6.98958274736722e-05, + "loss": 1.0796, + "mean_token_accuracy": 0.6964007914066315, + "num_tokens": 13924352.0, + "step": 1645 + }, + { + "entropy": 0.9756138473749161, + "epoch": 0.6584658465846585, + "grad_norm": 0.27289339900016785, + "learning_rate": 6.97914815018418e-05, + "loss": 0.961, + "mean_token_accuracy": 0.7181849479675293, + "num_tokens": 13932442.0, + "step": 1646 + }, + { + "entropy": 0.9699608683586121, + "epoch": 0.6588658865886589, + "grad_norm": 0.25802376866340637, + "learning_rate": 6.968720301299419e-05, + "loss": 0.9615, + "mean_token_accuracy": 0.7225667983293533, + "num_tokens": 13941262.0, + "step": 1647 + }, + { + "entropy": 1.0219633281230927, + "epoch": 0.6592659265926593, + "grad_norm": 0.3511042296886444, + "learning_rate": 6.958299218214264e-05, + "loss": 1.0131, + "mean_token_accuracy": 0.7100156396627426, + "num_tokens": 13949827.0, + "step": 1648 + }, + { + "entropy": 1.0462029874324799, + "epoch": 0.6596659665966597, + "grad_norm": 0.2791305184364319, + "learning_rate": 6.947884918418676e-05, + "loss": 1.0671, + "mean_token_accuracy": 0.7006105333566666, + "num_tokens": 13957756.0, + "step": 1649 + }, + { + "entropy": 1.0098459422588348, + "epoch": 0.6600660066006601, + "grad_norm": 0.4169783294200897, + "learning_rate": 6.937477419391246e-05, + "loss": 1.0015, + "mean_token_accuracy": 0.7111160457134247, + "num_tokens": 13966719.0, + "step": 1650 + }, + { + "entropy": 1.0646288245916367, + "epoch": 0.6604660466046605, + "grad_norm": 0.27171847224235535, + "learning_rate": 6.927076738599152e-05, + "loss": 1.0602, + "mean_token_accuracy": 0.697830319404602, + "num_tokens": 13975314.0, + "step": 1651 + }, + { + "entropy": 1.0919907093048096, + "epoch": 0.6608660866086609, + "grad_norm": 0.29635563492774963, + "learning_rate": 6.916682893498112e-05, + "loss": 1.0749, + "mean_token_accuracy": 0.697185218334198, + "num_tokens": 13983357.0, + "step": 1652 + }, + { + "entropy": 0.9884568750858307, + "epoch": 0.6612661266126613, + "grad_norm": 0.25534170866012573, + "learning_rate": 6.906295901532384e-05, + "loss": 0.962, + "mean_token_accuracy": 0.7231160551309586, + "num_tokens": 13992096.0, + "step": 1653 + }, + { + "entropy": 0.9850638955831528, + "epoch": 0.6616661666166617, + "grad_norm": 0.25842273235321045, + "learning_rate": 6.895915780134723e-05, + "loss": 0.9632, + "mean_token_accuracy": 0.7194929718971252, + "num_tokens": 14001129.0, + "step": 1654 + }, + { + "entropy": 0.9968108385801315, + "epoch": 0.6620662066206621, + "grad_norm": 0.2713010311126709, + "learning_rate": 6.885542546726348e-05, + "loss": 0.9887, + "mean_token_accuracy": 0.7222445607185364, + "num_tokens": 14009495.0, + "step": 1655 + }, + { + "entropy": 0.9788186699151993, + "epoch": 0.6624662466246625, + "grad_norm": 0.26947879791259766, + "learning_rate": 6.87517621871693e-05, + "loss": 0.9889, + "mean_token_accuracy": 0.7264079749584198, + "num_tokens": 14017790.0, + "step": 1656 + }, + { + "entropy": 0.9631485044956207, + "epoch": 0.6628662866286629, + "grad_norm": 0.25087204575538635, + "learning_rate": 6.864816813504535e-05, + "loss": 0.9651, + "mean_token_accuracy": 0.7197734266519547, + "num_tokens": 14026861.0, + "step": 1657 + }, + { + "entropy": 1.079709142446518, + "epoch": 0.6632663266326633, + "grad_norm": 0.2889139652252197, + "learning_rate": 6.854464348475616e-05, + "loss": 1.0842, + "mean_token_accuracy": 0.6918900012969971, + "num_tokens": 14034862.0, + "step": 1658 + }, + { + "entropy": 1.0417302548885345, + "epoch": 0.6636663666366637, + "grad_norm": 0.2676405906677246, + "learning_rate": 6.844118841004985e-05, + "loss": 1.0144, + "mean_token_accuracy": 0.7100715786218643, + "num_tokens": 14043494.0, + "step": 1659 + }, + { + "entropy": 1.0130602270364761, + "epoch": 0.6640664066406641, + "grad_norm": 0.27214542031288147, + "learning_rate": 6.833780308455766e-05, + "loss": 0.9982, + "mean_token_accuracy": 0.7130683213472366, + "num_tokens": 14051752.0, + "step": 1660 + }, + { + "entropy": 1.0701176971197128, + "epoch": 0.6644664466446645, + "grad_norm": 0.300357848405838, + "learning_rate": 6.823448768179386e-05, + "loss": 1.0575, + "mean_token_accuracy": 0.7042479515075684, + "num_tokens": 14060043.0, + "step": 1661 + }, + { + "entropy": 1.0084046572446823, + "epoch": 0.6648664866486649, + "grad_norm": 0.2671807110309601, + "learning_rate": 6.813124237515531e-05, + "loss": 0.9842, + "mean_token_accuracy": 0.7157805114984512, + "num_tokens": 14068497.0, + "step": 1662 + }, + { + "entropy": 0.981174111366272, + "epoch": 0.6652665266526653, + "grad_norm": 0.26440972089767456, + "learning_rate": 6.80280673379212e-05, + "loss": 0.9979, + "mean_token_accuracy": 0.7082539051771164, + "num_tokens": 14077218.0, + "step": 1663 + }, + { + "entropy": 1.025006741285324, + "epoch": 0.6656665666566657, + "grad_norm": 0.27925798296928406, + "learning_rate": 6.792496274325287e-05, + "loss": 1.0551, + "mean_token_accuracy": 0.6998407542705536, + "num_tokens": 14086070.0, + "step": 1664 + }, + { + "entropy": 0.9587132632732391, + "epoch": 0.6660666066606661, + "grad_norm": 0.26028576493263245, + "learning_rate": 6.782192876419343e-05, + "loss": 0.959, + "mean_token_accuracy": 0.7174195349216461, + "num_tokens": 14095006.0, + "step": 1665 + }, + { + "entropy": 0.985969752073288, + "epoch": 0.6664666466646665, + "grad_norm": 0.25720733404159546, + "learning_rate": 6.771896557366741e-05, + "loss": 0.9688, + "mean_token_accuracy": 0.7136876732110977, + "num_tokens": 14104064.0, + "step": 1666 + }, + { + "entropy": 1.065902903676033, + "epoch": 0.6668666866686669, + "grad_norm": 0.2784879207611084, + "learning_rate": 6.76160733444805e-05, + "loss": 1.0457, + "mean_token_accuracy": 0.7069020867347717, + "num_tokens": 14112007.0, + "step": 1667 + }, + { + "entropy": 1.0856628715991974, + "epoch": 0.6672667266726673, + "grad_norm": 0.3326982259750366, + "learning_rate": 6.751325224931946e-05, + "loss": 1.0598, + "mean_token_accuracy": 0.6970634609460831, + "num_tokens": 14120171.0, + "step": 1668 + }, + { + "entropy": 0.997719332575798, + "epoch": 0.6676667666766677, + "grad_norm": 0.2628896236419678, + "learning_rate": 6.741050246075145e-05, + "loss": 0.985, + "mean_token_accuracy": 0.7180589735507965, + "num_tokens": 14128514.0, + "step": 1669 + }, + { + "entropy": 1.0338243544101715, + "epoch": 0.6680668066806681, + "grad_norm": 0.25854727625846863, + "learning_rate": 6.730782415122414e-05, + "loss": 0.9783, + "mean_token_accuracy": 0.7106697857379913, + "num_tokens": 14137474.0, + "step": 1670 + }, + { + "entropy": 0.9883899241685867, + "epoch": 0.6684668466846685, + "grad_norm": 0.26366254687309265, + "learning_rate": 6.720521749306515e-05, + "loss": 0.9534, + "mean_token_accuracy": 0.7206936627626419, + "num_tokens": 14145995.0, + "step": 1671 + }, + { + "entropy": 1.0207647532224655, + "epoch": 0.6688668866886689, + "grad_norm": 0.2655137777328491, + "learning_rate": 6.710268265848178e-05, + "loss": 1.0277, + "mean_token_accuracy": 0.7113603800535202, + "num_tokens": 14154774.0, + "step": 1672 + }, + { + "entropy": 0.9538712203502655, + "epoch": 0.6692669266926693, + "grad_norm": 0.25187575817108154, + "learning_rate": 6.700021981956097e-05, + "loss": 0.9553, + "mean_token_accuracy": 0.7227759063243866, + "num_tokens": 14164088.0, + "step": 1673 + }, + { + "entropy": 1.0018862634897232, + "epoch": 0.6696669666966697, + "grad_norm": 0.27122962474823, + "learning_rate": 6.68978291482686e-05, + "loss": 1.0079, + "mean_token_accuracy": 0.7081445306539536, + "num_tokens": 14172576.0, + "step": 1674 + }, + { + "entropy": 0.973240464925766, + "epoch": 0.6700670067006701, + "grad_norm": 0.2733827233314514, + "learning_rate": 6.679551081644969e-05, + "loss": 1.0043, + "mean_token_accuracy": 0.7077698558568954, + "num_tokens": 14181014.0, + "step": 1675 + }, + { + "entropy": 0.9457416981458664, + "epoch": 0.6704670467046705, + "grad_norm": 0.25809189677238464, + "learning_rate": 6.669326499582755e-05, + "loss": 0.9401, + "mean_token_accuracy": 0.7268485575914383, + "num_tokens": 14189859.0, + "step": 1676 + }, + { + "entropy": 0.997815266251564, + "epoch": 0.6708670867086709, + "grad_norm": 0.2723693251609802, + "learning_rate": 6.659109185800407e-05, + "loss": 1.0145, + "mean_token_accuracy": 0.7113381922245026, + "num_tokens": 14198079.0, + "step": 1677 + }, + { + "entropy": 0.9542207270860672, + "epoch": 0.6712671267126713, + "grad_norm": 0.2691044509410858, + "learning_rate": 6.648899157445895e-05, + "loss": 0.9359, + "mean_token_accuracy": 0.7276975959539413, + "num_tokens": 14206551.0, + "step": 1678 + }, + { + "entropy": 1.0198415368795395, + "epoch": 0.6716671667166717, + "grad_norm": 0.2660420536994934, + "learning_rate": 6.63869643165498e-05, + "loss": 1.022, + "mean_token_accuracy": 0.7114955633878708, + "num_tokens": 14215400.0, + "step": 1679 + }, + { + "entropy": 0.9750362634658813, + "epoch": 0.6720672067206721, + "grad_norm": 0.26570627093315125, + "learning_rate": 6.628501025551151e-05, + "loss": 0.989, + "mean_token_accuracy": 0.7151263505220413, + "num_tokens": 14224113.0, + "step": 1680 + }, + { + "entropy": 1.017204463481903, + "epoch": 0.6724672467246725, + "grad_norm": 0.2774355113506317, + "learning_rate": 6.618312956245614e-05, + "loss": 1.0181, + "mean_token_accuracy": 0.7077021151781082, + "num_tokens": 14232086.0, + "step": 1681 + }, + { + "entropy": 0.9677879959344864, + "epoch": 0.6728672867286729, + "grad_norm": 0.32099971175193787, + "learning_rate": 6.608132240837275e-05, + "loss": 0.9422, + "mean_token_accuracy": 0.7280850857496262, + "num_tokens": 14240459.0, + "step": 1682 + }, + { + "entropy": 1.060670018196106, + "epoch": 0.6732673267326733, + "grad_norm": 0.2783302366733551, + "learning_rate": 6.597958896412679e-05, + "loss": 1.0447, + "mean_token_accuracy": 0.6954112499952316, + "num_tokens": 14248407.0, + "step": 1683 + }, + { + "entropy": 1.0016647726297379, + "epoch": 0.6736673667366737, + "grad_norm": 0.26950353384017944, + "learning_rate": 6.587792940046018e-05, + "loss": 0.9878, + "mean_token_accuracy": 0.7129129469394684, + "num_tokens": 14256627.0, + "step": 1684 + }, + { + "entropy": 0.9813555777072906, + "epoch": 0.6740674067406741, + "grad_norm": 0.2673988938331604, + "learning_rate": 6.57763438879907e-05, + "loss": 0.9653, + "mean_token_accuracy": 0.7173869907855988, + "num_tokens": 14265263.0, + "step": 1685 + }, + { + "entropy": 1.0122930258512497, + "epoch": 0.6744674467446745, + "grad_norm": 0.26911890506744385, + "learning_rate": 6.567483259721197e-05, + "loss": 1.0097, + "mean_token_accuracy": 0.7106683850288391, + "num_tokens": 14273662.0, + "step": 1686 + }, + { + "entropy": 1.021504744887352, + "epoch": 0.6748674867486749, + "grad_norm": 0.2620517313480377, + "learning_rate": 6.557339569849294e-05, + "loss": 1.0034, + "mean_token_accuracy": 0.7133632749319077, + "num_tokens": 14282262.0, + "step": 1687 + }, + { + "entropy": 0.9968883991241455, + "epoch": 0.6752675267526753, + "grad_norm": 0.26752281188964844, + "learning_rate": 6.547203336207771e-05, + "loss": 1.0046, + "mean_token_accuracy": 0.7056008726358414, + "num_tokens": 14290789.0, + "step": 1688 + }, + { + "entropy": 0.9682450890541077, + "epoch": 0.6756675667566757, + "grad_norm": 0.260568767786026, + "learning_rate": 6.537074575808535e-05, + "loss": 0.9443, + "mean_token_accuracy": 0.7242724299430847, + "num_tokens": 14299181.0, + "step": 1689 + }, + { + "entropy": 1.050703912973404, + "epoch": 0.6760676067606761, + "grad_norm": 0.2962005138397217, + "learning_rate": 6.526953305650938e-05, + "loss": 1.0294, + "mean_token_accuracy": 0.7044802010059357, + "num_tokens": 14307321.0, + "step": 1690 + }, + { + "entropy": 0.9813262969255447, + "epoch": 0.6764676467646765, + "grad_norm": 0.253266841173172, + "learning_rate": 6.516839542721769e-05, + "loss": 0.9626, + "mean_token_accuracy": 0.7241119593381882, + "num_tokens": 14316479.0, + "step": 1691 + }, + { + "entropy": 0.9923852384090424, + "epoch": 0.6768676867686768, + "grad_norm": 0.26217323541641235, + "learning_rate": 6.50673330399521e-05, + "loss": 0.982, + "mean_token_accuracy": 0.7107029408216476, + "num_tokens": 14325282.0, + "step": 1692 + }, + { + "entropy": 1.0181312710046768, + "epoch": 0.6772677267726772, + "grad_norm": 0.2839345335960388, + "learning_rate": 6.496634606432825e-05, + "loss": 1.0202, + "mean_token_accuracy": 0.7026841938495636, + "num_tokens": 14333438.0, + "step": 1693 + }, + { + "entropy": 0.999792292714119, + "epoch": 0.6776677667766776, + "grad_norm": 0.264396607875824, + "learning_rate": 6.486543466983513e-05, + "loss": 0.9893, + "mean_token_accuracy": 0.716277465224266, + "num_tokens": 14342032.0, + "step": 1694 + }, + { + "entropy": 0.9855992943048477, + "epoch": 0.678067806780678, + "grad_norm": 0.26503491401672363, + "learning_rate": 6.476459902583492e-05, + "loss": 0.979, + "mean_token_accuracy": 0.7194619327783585, + "num_tokens": 14350564.0, + "step": 1695 + }, + { + "entropy": 1.0777447074651718, + "epoch": 0.6784678467846784, + "grad_norm": 0.29566964507102966, + "learning_rate": 6.466383930156264e-05, + "loss": 1.0768, + "mean_token_accuracy": 0.687192901968956, + "num_tokens": 14358466.0, + "step": 1696 + }, + { + "entropy": 0.9953999668359756, + "epoch": 0.6788678867886788, + "grad_norm": 0.28468579053878784, + "learning_rate": 6.45631556661259e-05, + "loss": 0.9989, + "mean_token_accuracy": 0.7100808173418045, + "num_tokens": 14366611.0, + "step": 1697 + }, + { + "entropy": 1.0714083313941956, + "epoch": 0.6792679267926792, + "grad_norm": 0.3021942973136902, + "learning_rate": 6.446254828850465e-05, + "loss": 1.0653, + "mean_token_accuracy": 0.6963720619678497, + "num_tokens": 14374060.0, + "step": 1698 + }, + { + "entropy": 0.9291437566280365, + "epoch": 0.6796679667966796, + "grad_norm": 0.2660292685031891, + "learning_rate": 6.436201733755078e-05, + "loss": 0.9191, + "mean_token_accuracy": 0.7243731766939163, + "num_tokens": 14382764.0, + "step": 1699 + }, + { + "entropy": 1.0128522515296936, + "epoch": 0.68006800680068, + "grad_norm": 0.2768755853176117, + "learning_rate": 6.426156298198799e-05, + "loss": 1.0262, + "mean_token_accuracy": 0.7017718255519867, + "num_tokens": 14390964.0, + "step": 1700 + }, + { + "entropy": 1.0110303908586502, + "epoch": 0.6804680468046804, + "grad_norm": 0.2666500210762024, + "learning_rate": 6.416118539041135e-05, + "loss": 0.9815, + "mean_token_accuracy": 0.7174566239118576, + "num_tokens": 14399490.0, + "step": 1701 + }, + { + "entropy": 1.0420980006456375, + "epoch": 0.6808680868086808, + "grad_norm": 0.2784508764743805, + "learning_rate": 6.406088473128715e-05, + "loss": 1.0386, + "mean_token_accuracy": 0.6995486319065094, + "num_tokens": 14407652.0, + "step": 1702 + }, + { + "entropy": 1.012734666466713, + "epoch": 0.6812681268126812, + "grad_norm": 0.8273247480392456, + "learning_rate": 6.396066117295255e-05, + "loss": 1.0101, + "mean_token_accuracy": 0.7025185227394104, + "num_tokens": 14415881.0, + "step": 1703 + }, + { + "entropy": 1.0576525628566742, + "epoch": 0.6816681668166816, + "grad_norm": 0.27910560369491577, + "learning_rate": 6.38605148836153e-05, + "loss": 1.0523, + "mean_token_accuracy": 0.6959236860275269, + "num_tokens": 14424010.0, + "step": 1704 + }, + { + "entropy": 1.0119645297527313, + "epoch": 0.682068206820682, + "grad_norm": 0.2754209637641907, + "learning_rate": 6.37604460313535e-05, + "loss": 1.0317, + "mean_token_accuracy": 0.7020839750766754, + "num_tokens": 14432429.0, + "step": 1705 + }, + { + "entropy": 1.0269517749547958, + "epoch": 0.6824682468246824, + "grad_norm": 0.27835139632225037, + "learning_rate": 6.36604547841152e-05, + "loss": 1.041, + "mean_token_accuracy": 0.7017010748386383, + "num_tokens": 14440902.0, + "step": 1706 + }, + { + "entropy": 0.9707730263471603, + "epoch": 0.6828682868286828, + "grad_norm": 0.26176971197128296, + "learning_rate": 6.356054130971829e-05, + "loss": 0.9572, + "mean_token_accuracy": 0.7232692986726761, + "num_tokens": 14449785.0, + "step": 1707 + }, + { + "entropy": 0.9665889739990234, + "epoch": 0.6832683268326832, + "grad_norm": 0.27762916684150696, + "learning_rate": 6.346070577585016e-05, + "loss": 0.9367, + "mean_token_accuracy": 0.7257691025733948, + "num_tokens": 14458021.0, + "step": 1708 + }, + { + "entropy": 1.0595256239175797, + "epoch": 0.6836683668366836, + "grad_norm": 0.27217841148376465, + "learning_rate": 6.336094835006728e-05, + "loss": 1.0478, + "mean_token_accuracy": 0.7001218944787979, + "num_tokens": 14466628.0, + "step": 1709 + }, + { + "entropy": 1.0229178220033646, + "epoch": 0.684068406840684, + "grad_norm": 0.27034687995910645, + "learning_rate": 6.326126919979508e-05, + "loss": 1.001, + "mean_token_accuracy": 0.7214628607034683, + "num_tokens": 14475005.0, + "step": 1710 + }, + { + "entropy": 1.0742666274309158, + "epoch": 0.6844684468446844, + "grad_norm": 0.26247477531433105, + "learning_rate": 6.31616684923276e-05, + "loss": 1.036, + "mean_token_accuracy": 0.7063274681568146, + "num_tokens": 14483906.0, + "step": 1711 + }, + { + "entropy": 1.0338006019592285, + "epoch": 0.6848684868486848, + "grad_norm": 0.27423030138015747, + "learning_rate": 6.306214639482728e-05, + "loss": 1.0224, + "mean_token_accuracy": 0.6980048716068268, + "num_tokens": 14492413.0, + "step": 1712 + }, + { + "entropy": 1.0091119557619095, + "epoch": 0.6852685268526852, + "grad_norm": 0.2621704638004303, + "learning_rate": 6.296270307432464e-05, + "loss": 0.9878, + "mean_token_accuracy": 0.7100193202495575, + "num_tokens": 14501474.0, + "step": 1713 + }, + { + "entropy": 0.9728426784276962, + "epoch": 0.6856685668566856, + "grad_norm": 0.26898258924484253, + "learning_rate": 6.28633386977179e-05, + "loss": 0.9701, + "mean_token_accuracy": 0.7212784141302109, + "num_tokens": 14510063.0, + "step": 1714 + }, + { + "entropy": 1.0189223885536194, + "epoch": 0.686068606860686, + "grad_norm": 0.2636651396751404, + "learning_rate": 6.276405343177282e-05, + "loss": 0.9824, + "mean_token_accuracy": 0.712503045797348, + "num_tokens": 14518253.0, + "step": 1715 + }, + { + "entropy": 1.0570265352725983, + "epoch": 0.6864686468646864, + "grad_norm": 0.28508809208869934, + "learning_rate": 6.266484744312233e-05, + "loss": 1.0374, + "mean_token_accuracy": 0.7031292468309402, + "num_tokens": 14526507.0, + "step": 1716 + }, + { + "entropy": 0.9812213033437729, + "epoch": 0.6868686868686869, + "grad_norm": 0.2743547856807709, + "learning_rate": 6.256572089826653e-05, + "loss": 0.9905, + "mean_token_accuracy": 0.7102383077144623, + "num_tokens": 14535088.0, + "step": 1717 + }, + { + "entropy": 1.006604164838791, + "epoch": 0.6872687268726873, + "grad_norm": 0.2674405872821808, + "learning_rate": 6.246667396357194e-05, + "loss": 1.0127, + "mean_token_accuracy": 0.7061139792203903, + "num_tokens": 14543937.0, + "step": 1718 + }, + { + "entropy": 0.9494961500167847, + "epoch": 0.6876687668766877, + "grad_norm": 0.28018060326576233, + "learning_rate": 6.236770680527157e-05, + "loss": 0.9356, + "mean_token_accuracy": 0.7249604016542435, + "num_tokens": 14552910.0, + "step": 1719 + }, + { + "entropy": 1.0812702625989914, + "epoch": 0.688068806880688, + "grad_norm": 0.29672771692276, + "learning_rate": 6.226881958946449e-05, + "loss": 1.0624, + "mean_token_accuracy": 0.6920069754123688, + "num_tokens": 14560737.0, + "step": 1720 + }, + { + "entropy": 0.9836938381195068, + "epoch": 0.6884688468846885, + "grad_norm": 0.3399813175201416, + "learning_rate": 6.217001248211567e-05, + "loss": 0.9976, + "mean_token_accuracy": 0.7139115333557129, + "num_tokens": 14568579.0, + "step": 1721 + }, + { + "entropy": 0.987627312541008, + "epoch": 0.6888688868886889, + "grad_norm": 0.2826646566390991, + "learning_rate": 6.207128564905562e-05, + "loss": 0.9966, + "mean_token_accuracy": 0.7129846215248108, + "num_tokens": 14576635.0, + "step": 1722 + }, + { + "entropy": 1.0464074462652206, + "epoch": 0.6892689268926893, + "grad_norm": 0.27533891797065735, + "learning_rate": 6.197263925598011e-05, + "loss": 1.0384, + "mean_token_accuracy": 0.7028329521417618, + "num_tokens": 14585007.0, + "step": 1723 + }, + { + "entropy": 0.9444098025560379, + "epoch": 0.6896689668966897, + "grad_norm": 0.2756772041320801, + "learning_rate": 6.187407346844985e-05, + "loss": 0.9534, + "mean_token_accuracy": 0.7279033213853836, + "num_tokens": 14593529.0, + "step": 1724 + }, + { + "entropy": 0.9967755675315857, + "epoch": 0.6900690069006901, + "grad_norm": 0.26740506291389465, + "learning_rate": 6.177558845189028e-05, + "loss": 1.033, + "mean_token_accuracy": 0.7090263813734055, + "num_tokens": 14602491.0, + "step": 1725 + }, + { + "entropy": 1.03732468187809, + "epoch": 0.6904690469046905, + "grad_norm": 0.2828628718852997, + "learning_rate": 6.167718437159147e-05, + "loss": 1.0351, + "mean_token_accuracy": 0.7089848220348358, + "num_tokens": 14610761.0, + "step": 1726 + }, + { + "entropy": 0.9896856844425201, + "epoch": 0.6908690869086909, + "grad_norm": 0.27672529220581055, + "learning_rate": 6.157886139270737e-05, + "loss": 0.9693, + "mean_token_accuracy": 0.7154839187860489, + "num_tokens": 14618923.0, + "step": 1727 + }, + { + "entropy": 1.0823362171649933, + "epoch": 0.6912691269126913, + "grad_norm": 0.28537455201148987, + "learning_rate": 6.1480619680256e-05, + "loss": 1.0526, + "mean_token_accuracy": 0.7022448480129242, + "num_tokens": 14626897.0, + "step": 1728 + }, + { + "entropy": 0.9889559894800186, + "epoch": 0.6916691669166917, + "grad_norm": 0.2795410454273224, + "learning_rate": 6.138245939911889e-05, + "loss": 0.9725, + "mean_token_accuracy": 0.7180508375167847, + "num_tokens": 14635293.0, + "step": 1729 + }, + { + "entropy": 1.0522978901863098, + "epoch": 0.6920692069206921, + "grad_norm": 0.28873902559280396, + "learning_rate": 6.128438071404096e-05, + "loss": 1.0489, + "mean_token_accuracy": 0.7007178366184235, + "num_tokens": 14643015.0, + "step": 1730 + }, + { + "entropy": 1.0208117067813873, + "epoch": 0.6924692469246925, + "grad_norm": 0.27143338322639465, + "learning_rate": 6.11863837896302e-05, + "loss": 1.0067, + "mean_token_accuracy": 0.7131657749414444, + "num_tokens": 14651696.0, + "step": 1731 + }, + { + "entropy": 0.9751124382019043, + "epoch": 0.6928692869286929, + "grad_norm": 0.26996859908103943, + "learning_rate": 6.108846879035734e-05, + "loss": 0.9877, + "mean_token_accuracy": 0.7182182818651199, + "num_tokens": 14660330.0, + "step": 1732 + }, + { + "entropy": 1.0386084169149399, + "epoch": 0.6932693269326933, + "grad_norm": 0.2620426416397095, + "learning_rate": 6.0990635880555605e-05, + "loss": 1.0419, + "mean_token_accuracy": 0.6971972435712814, + "num_tokens": 14668896.0, + "step": 1733 + }, + { + "entropy": 1.030503273010254, + "epoch": 0.6936693669366937, + "grad_norm": 0.2773856520652771, + "learning_rate": 6.089288522442047e-05, + "loss": 1.0502, + "mean_token_accuracy": 0.7009502202272415, + "num_tokens": 14677147.0, + "step": 1734 + }, + { + "entropy": 1.068430334329605, + "epoch": 0.6940694069406941, + "grad_norm": 0.2722591757774353, + "learning_rate": 6.0795216986009343e-05, + "loss": 1.0541, + "mean_token_accuracy": 0.6991900503635406, + "num_tokens": 14685288.0, + "step": 1735 + }, + { + "entropy": 1.0391612648963928, + "epoch": 0.6944694469446945, + "grad_norm": 0.2577204406261444, + "learning_rate": 6.069763132924136e-05, + "loss": 1.0077, + "mean_token_accuracy": 0.7030885368585587, + "num_tokens": 14694075.0, + "step": 1736 + }, + { + "entropy": 0.9834558367729187, + "epoch": 0.6948694869486949, + "grad_norm": 0.2547317445278168, + "learning_rate": 6.0600128417896985e-05, + "loss": 0.9785, + "mean_token_accuracy": 0.7158985435962677, + "num_tokens": 14703458.0, + "step": 1737 + }, + { + "entropy": 1.0091035515069962, + "epoch": 0.6952695269526953, + "grad_norm": 0.2737278938293457, + "learning_rate": 6.0502708415617825e-05, + "loss": 0.9951, + "mean_token_accuracy": 0.7118311822414398, + "num_tokens": 14712631.0, + "step": 1738 + }, + { + "entropy": 1.079922005534172, + "epoch": 0.6956695669566957, + "grad_norm": 0.27021172642707825, + "learning_rate": 6.040537148590639e-05, + "loss": 1.0725, + "mean_token_accuracy": 0.6975164413452148, + "num_tokens": 14721104.0, + "step": 1739 + }, + { + "entropy": 1.0949593782424927, + "epoch": 0.6960696069606961, + "grad_norm": 0.29049038887023926, + "learning_rate": 6.030811779212575e-05, + "loss": 1.0846, + "mean_token_accuracy": 0.6940733790397644, + "num_tokens": 14729081.0, + "step": 1740 + }, + { + "entropy": 1.0108844488859177, + "epoch": 0.6964696469646965, + "grad_norm": 0.27273714542388916, + "learning_rate": 6.021094749749926e-05, + "loss": 0.9855, + "mean_token_accuracy": 0.7195252031087875, + "num_tokens": 14737319.0, + "step": 1741 + }, + { + "entropy": 1.0772527605295181, + "epoch": 0.6968696869686969, + "grad_norm": 0.2999832034111023, + "learning_rate": 6.011386076511023e-05, + "loss": 1.0622, + "mean_token_accuracy": 0.6960748881101608, + "num_tokens": 14745832.0, + "step": 1742 + }, + { + "entropy": 1.020627036690712, + "epoch": 0.6972697269726973, + "grad_norm": 0.3140758275985718, + "learning_rate": 6.001685775790189e-05, + "loss": 1.007, + "mean_token_accuracy": 0.7125598192214966, + "num_tokens": 14754440.0, + "step": 1743 + }, + { + "entropy": 1.0556238293647766, + "epoch": 0.6976697669766977, + "grad_norm": 0.27166274189949036, + "learning_rate": 5.99199386386768e-05, + "loss": 1.0331, + "mean_token_accuracy": 0.703571617603302, + "num_tokens": 14762634.0, + "step": 1744 + }, + { + "entropy": 0.9904855936765671, + "epoch": 0.6980698069806981, + "grad_norm": 0.26129502058029175, + "learning_rate": 5.982310357009685e-05, + "loss": 0.9904, + "mean_token_accuracy": 0.7075749039649963, + "num_tokens": 14771138.0, + "step": 1745 + }, + { + "entropy": 1.0195970982313156, + "epoch": 0.6984698469846985, + "grad_norm": 0.2795777916908264, + "learning_rate": 5.9726352714682775e-05, + "loss": 1.0274, + "mean_token_accuracy": 0.7089248299598694, + "num_tokens": 14779566.0, + "step": 1746 + }, + { + "entropy": 0.99138243496418, + "epoch": 0.6988698869886989, + "grad_norm": 0.26780256628990173, + "learning_rate": 5.962968623481396e-05, + "loss": 1.0081, + "mean_token_accuracy": 0.7046643495559692, + "num_tokens": 14788536.0, + "step": 1747 + }, + { + "entropy": 1.0575937926769257, + "epoch": 0.6992699269926993, + "grad_norm": 0.29785388708114624, + "learning_rate": 5.9533104292728324e-05, + "loss": 1.0661, + "mean_token_accuracy": 0.6998780220746994, + "num_tokens": 14796213.0, + "step": 1748 + }, + { + "entropy": 0.9873844385147095, + "epoch": 0.6996699669966997, + "grad_norm": 0.2899465560913086, + "learning_rate": 5.94366070505217e-05, + "loss": 0.9922, + "mean_token_accuracy": 0.706996351480484, + "num_tokens": 14804370.0, + "step": 1749 + }, + { + "entropy": 0.9575809687376022, + "epoch": 0.7000700070007001, + "grad_norm": 0.274586021900177, + "learning_rate": 5.9340194670147973e-05, + "loss": 0.9724, + "mean_token_accuracy": 0.721545085310936, + "num_tokens": 14813051.0, + "step": 1750 + }, + { + "entropy": 0.9586543887853622, + "epoch": 0.7004700470047005, + "grad_norm": 0.28914564847946167, + "learning_rate": 5.924386731341842e-05, + "loss": 0.9796, + "mean_token_accuracy": 0.7135427743196487, + "num_tokens": 14821172.0, + "step": 1751 + }, + { + "entropy": 1.0004163980484009, + "epoch": 0.7008700870087009, + "grad_norm": 0.27769964933395386, + "learning_rate": 5.9147625142001764e-05, + "loss": 0.9951, + "mean_token_accuracy": 0.7122161984443665, + "num_tokens": 14829772.0, + "step": 1752 + }, + { + "entropy": 1.0019551217556, + "epoch": 0.7012701270127013, + "grad_norm": 0.2939901053905487, + "learning_rate": 5.905146831742362e-05, + "loss": 1.0219, + "mean_token_accuracy": 0.7102178186178207, + "num_tokens": 14837988.0, + "step": 1753 + }, + { + "entropy": 0.9679895341396332, + "epoch": 0.7016701670167017, + "grad_norm": 0.2724636197090149, + "learning_rate": 5.8955397001066536e-05, + "loss": 0.9812, + "mean_token_accuracy": 0.7152968049049377, + "num_tokens": 14846910.0, + "step": 1754 + }, + { + "entropy": 0.97493676841259, + "epoch": 0.7020702070207021, + "grad_norm": 0.2568642497062683, + "learning_rate": 5.8859411354169416e-05, + "loss": 0.95, + "mean_token_accuracy": 0.7257141470909119, + "num_tokens": 14855827.0, + "step": 1755 + }, + { + "entropy": 0.9756352007389069, + "epoch": 0.7024702470247025, + "grad_norm": 0.2747461497783661, + "learning_rate": 5.8763511537827355e-05, + "loss": 0.9653, + "mean_token_accuracy": 0.7180861979722977, + "num_tokens": 14864251.0, + "step": 1756 + }, + { + "entropy": 1.0415329039096832, + "epoch": 0.7028702870287029, + "grad_norm": 0.25803324580192566, + "learning_rate": 5.866769771299157e-05, + "loss": 1.0031, + "mean_token_accuracy": 0.707523301243782, + "num_tokens": 14872957.0, + "step": 1757 + }, + { + "entropy": 1.0091711729764938, + "epoch": 0.7032703270327033, + "grad_norm": 0.29207488894462585, + "learning_rate": 5.8571970040468746e-05, + "loss": 0.9835, + "mean_token_accuracy": 0.716585099697113, + "num_tokens": 14882012.0, + "step": 1758 + }, + { + "entropy": 0.9983259588479996, + "epoch": 0.7036703670367037, + "grad_norm": 0.25907623767852783, + "learning_rate": 5.847632868092119e-05, + "loss": 0.9707, + "mean_token_accuracy": 0.7173559665679932, + "num_tokens": 14891230.0, + "step": 1759 + }, + { + "entropy": 1.0519304722547531, + "epoch": 0.7040704070407041, + "grad_norm": 0.28447479009628296, + "learning_rate": 5.838077379486613e-05, + "loss": 1.0175, + "mean_token_accuracy": 0.7079452872276306, + "num_tokens": 14899730.0, + "step": 1760 + }, + { + "entropy": 1.0942412167787552, + "epoch": 0.7044704470447045, + "grad_norm": 0.36881446838378906, + "learning_rate": 5.828530554267586e-05, + "loss": 1.0674, + "mean_token_accuracy": 0.6937002837657928, + "num_tokens": 14907376.0, + "step": 1761 + }, + { + "entropy": 1.0107639878988266, + "epoch": 0.7048704870487049, + "grad_norm": 0.25858837366104126, + "learning_rate": 5.818992408457713e-05, + "loss": 0.9887, + "mean_token_accuracy": 0.7183995991945267, + "num_tokens": 14916470.0, + "step": 1762 + }, + { + "entropy": 0.9500168710947037, + "epoch": 0.7052705270527053, + "grad_norm": 0.26319435238838196, + "learning_rate": 5.8094629580651064e-05, + "loss": 0.9498, + "mean_token_accuracy": 0.7253866195678711, + "num_tokens": 14925458.0, + "step": 1763 + }, + { + "entropy": 1.045823484659195, + "epoch": 0.7056705670567057, + "grad_norm": 0.2821919322013855, + "learning_rate": 5.799942219083292e-05, + "loss": 1.0512, + "mean_token_accuracy": 0.7003477662801743, + "num_tokens": 14933502.0, + "step": 1764 + }, + { + "entropy": 1.0344386994838715, + "epoch": 0.7060706070607061, + "grad_norm": 0.27198636531829834, + "learning_rate": 5.79043020749116e-05, + "loss": 1.038, + "mean_token_accuracy": 0.7082102447748184, + "num_tokens": 14941873.0, + "step": 1765 + }, + { + "entropy": 1.0037032216787338, + "epoch": 0.7064706470647065, + "grad_norm": 0.2968989610671997, + "learning_rate": 5.780926939252973e-05, + "loss": 0.9974, + "mean_token_accuracy": 0.713927611708641, + "num_tokens": 14950671.0, + "step": 1766 + }, + { + "entropy": 0.978719636797905, + "epoch": 0.7068706870687068, + "grad_norm": 0.270354688167572, + "learning_rate": 5.7714324303183e-05, + "loss": 0.9816, + "mean_token_accuracy": 0.7204948514699936, + "num_tokens": 14959650.0, + "step": 1767 + }, + { + "entropy": 0.9896121025085449, + "epoch": 0.7072707270727072, + "grad_norm": 0.2604040503501892, + "learning_rate": 5.761946696622023e-05, + "loss": 1.003, + "mean_token_accuracy": 0.7133939117193222, + "num_tokens": 14968597.0, + "step": 1768 + }, + { + "entropy": 0.9682731628417969, + "epoch": 0.7076707670767076, + "grad_norm": 0.28701862692832947, + "learning_rate": 5.752469754084284e-05, + "loss": 0.9833, + "mean_token_accuracy": 0.7168468236923218, + "num_tokens": 14977616.0, + "step": 1769 + }, + { + "entropy": 0.9932399839162827, + "epoch": 0.708070807080708, + "grad_norm": 0.34703361988067627, + "learning_rate": 5.743001618610488e-05, + "loss": 0.9832, + "mean_token_accuracy": 0.7117743045091629, + "num_tokens": 14986044.0, + "step": 1770 + }, + { + "entropy": 1.0184959173202515, + "epoch": 0.7084708470847084, + "grad_norm": 0.26878491044044495, + "learning_rate": 5.733542306091242e-05, + "loss": 1.0217, + "mean_token_accuracy": 0.7011438608169556, + "num_tokens": 14994772.0, + "step": 1771 + }, + { + "entropy": 0.9796338677406311, + "epoch": 0.7088708870887088, + "grad_norm": 0.26278552412986755, + "learning_rate": 5.7240918324023494e-05, + "loss": 0.9794, + "mean_token_accuracy": 0.7240202724933624, + "num_tokens": 15003572.0, + "step": 1772 + }, + { + "entropy": 0.990461066365242, + "epoch": 0.7092709270927092, + "grad_norm": 0.26781025528907776, + "learning_rate": 5.7146502134047875e-05, + "loss": 0.9902, + "mean_token_accuracy": 0.7067979574203491, + "num_tokens": 15012301.0, + "step": 1773 + }, + { + "entropy": 1.0179385840892792, + "epoch": 0.7096709670967096, + "grad_norm": 0.2828406095504761, + "learning_rate": 5.70521746494466e-05, + "loss": 1.0076, + "mean_token_accuracy": 0.7100344747304916, + "num_tokens": 15020408.0, + "step": 1774 + }, + { + "entropy": 1.003205344080925, + "epoch": 0.71007100710071, + "grad_norm": 0.26414579153060913, + "learning_rate": 5.6957936028531957e-05, + "loss": 0.9925, + "mean_token_accuracy": 0.7125920504331589, + "num_tokens": 15028884.0, + "step": 1775 + }, + { + "entropy": 0.9922258406877518, + "epoch": 0.7104710471047104, + "grad_norm": 0.27857187390327454, + "learning_rate": 5.686378642946699e-05, + "loss": 0.983, + "mean_token_accuracy": 0.7044591158628464, + "num_tokens": 15037063.0, + "step": 1776 + }, + { + "entropy": 1.017089143395424, + "epoch": 0.7108710871087108, + "grad_norm": 0.27137330174446106, + "learning_rate": 5.676972601026536e-05, + "loss": 1.0123, + "mean_token_accuracy": 0.7080775946378708, + "num_tokens": 15045701.0, + "step": 1777 + }, + { + "entropy": 1.0307128429412842, + "epoch": 0.7112711271127112, + "grad_norm": 0.29888835549354553, + "learning_rate": 5.667575492879109e-05, + "loss": 1.0471, + "mean_token_accuracy": 0.7057403326034546, + "num_tokens": 15053676.0, + "step": 1778 + }, + { + "entropy": 0.9718946516513824, + "epoch": 0.7116711671167116, + "grad_norm": 0.2679046392440796, + "learning_rate": 5.6581873342758286e-05, + "loss": 0.9758, + "mean_token_accuracy": 0.7167105078697205, + "num_tokens": 15062802.0, + "step": 1779 + }, + { + "entropy": 0.9497180134057999, + "epoch": 0.712071207120712, + "grad_norm": 0.2675117254257202, + "learning_rate": 5.6488081409730755e-05, + "loss": 0.9477, + "mean_token_accuracy": 0.728540375828743, + "num_tokens": 15071359.0, + "step": 1780 + }, + { + "entropy": 1.0519812256097794, + "epoch": 0.7124712471247124, + "grad_norm": 0.2799879014492035, + "learning_rate": 5.63943792871219e-05, + "loss": 1.0514, + "mean_token_accuracy": 0.7074429243803024, + "num_tokens": 15080072.0, + "step": 1781 + }, + { + "entropy": 1.0389542430639267, + "epoch": 0.7128712871287128, + "grad_norm": 0.27244290709495544, + "learning_rate": 5.630076713219436e-05, + "loss": 1.0447, + "mean_token_accuracy": 0.6993273347616196, + "num_tokens": 15088332.0, + "step": 1782 + }, + { + "entropy": 0.990209087729454, + "epoch": 0.7132713271327132, + "grad_norm": 0.29146119952201843, + "learning_rate": 5.6207245102059856e-05, + "loss": 0.9815, + "mean_token_accuracy": 0.7138239592313766, + "num_tokens": 15096462.0, + "step": 1783 + }, + { + "entropy": 1.0564813762903214, + "epoch": 0.7136713671367136, + "grad_norm": 0.2933248281478882, + "learning_rate": 5.6113813353678804e-05, + "loss": 1.0356, + "mean_token_accuracy": 0.7028339803218842, + "num_tokens": 15104143.0, + "step": 1784 + }, + { + "entropy": 0.9876126497983932, + "epoch": 0.714071407140714, + "grad_norm": 0.2617014944553375, + "learning_rate": 5.602047204386005e-05, + "loss": 0.9254, + "mean_token_accuracy": 0.7260624468326569, + "num_tokens": 15112327.0, + "step": 1785 + }, + { + "entropy": 1.0618331730365753, + "epoch": 0.7144714471447144, + "grad_norm": 0.28332602977752686, + "learning_rate": 5.592722132926069e-05, + "loss": 1.0622, + "mean_token_accuracy": 0.696587011218071, + "num_tokens": 15120083.0, + "step": 1786 + }, + { + "entropy": 1.023923709988594, + "epoch": 0.7148714871487148, + "grad_norm": 0.2692929208278656, + "learning_rate": 5.5834061366385803e-05, + "loss": 1.0013, + "mean_token_accuracy": 0.7142926305532455, + "num_tokens": 15128498.0, + "step": 1787 + }, + { + "entropy": 1.004018858075142, + "epoch": 0.7152715271527152, + "grad_norm": 0.2672841548919678, + "learning_rate": 5.5740992311588156e-05, + "loss": 0.9659, + "mean_token_accuracy": 0.7099398225545883, + "num_tokens": 15136904.0, + "step": 1788 + }, + { + "entropy": 1.0538471639156342, + "epoch": 0.7156715671567156, + "grad_norm": 0.26868534088134766, + "learning_rate": 5.564801432106788e-05, + "loss": 1.0294, + "mean_token_accuracy": 0.7028517872095108, + "num_tokens": 15145680.0, + "step": 1789 + }, + { + "entropy": 0.9799318313598633, + "epoch": 0.716071607160716, + "grad_norm": 0.26032477617263794, + "learning_rate": 5.555512755087233e-05, + "loss": 0.9495, + "mean_token_accuracy": 0.7227335125207901, + "num_tokens": 15154280.0, + "step": 1790 + }, + { + "entropy": 1.0494555830955505, + "epoch": 0.7164716471647165, + "grad_norm": 0.2679308354854584, + "learning_rate": 5.5462332156895716e-05, + "loss": 1.0489, + "mean_token_accuracy": 0.7021457850933075, + "num_tokens": 15163007.0, + "step": 1791 + }, + { + "entropy": 1.0031582117080688, + "epoch": 0.7168716871687169, + "grad_norm": 0.2884860038757324, + "learning_rate": 5.5369628294878904e-05, + "loss": 0.9951, + "mean_token_accuracy": 0.7163615226745605, + "num_tokens": 15171235.0, + "step": 1792 + }, + { + "entropy": 0.9432924091815948, + "epoch": 0.7172717271727173, + "grad_norm": 0.2778607904911041, + "learning_rate": 5.527701612040923e-05, + "loss": 0.9528, + "mean_token_accuracy": 0.7242428511381149, + "num_tokens": 15180066.0, + "step": 1793 + }, + { + "entropy": 1.009034276008606, + "epoch": 0.7176717671767177, + "grad_norm": 0.2800300121307373, + "learning_rate": 5.518449578892002e-05, + "loss": 1.0083, + "mean_token_accuracy": 0.708551898598671, + "num_tokens": 15188193.0, + "step": 1794 + }, + { + "entropy": 1.0099316984415054, + "epoch": 0.7180718071807181, + "grad_norm": 0.2840254604816437, + "learning_rate": 5.509206745569049e-05, + "loss": 0.9836, + "mean_token_accuracy": 0.7170423716306686, + "num_tokens": 15196097.0, + "step": 1795 + }, + { + "entropy": 0.9986254423856735, + "epoch": 0.7184718471847185, + "grad_norm": 0.33301305770874023, + "learning_rate": 5.499973127584548e-05, + "loss": 1.0318, + "mean_token_accuracy": 0.7075883597135544, + "num_tokens": 15204216.0, + "step": 1796 + }, + { + "entropy": 0.9982891827821732, + "epoch": 0.7188718871887189, + "grad_norm": 0.27618408203125, + "learning_rate": 5.490748740435519e-05, + "loss": 1.0249, + "mean_token_accuracy": 0.7009115666151047, + "num_tokens": 15212539.0, + "step": 1797 + }, + { + "entropy": 0.9678696244955063, + "epoch": 0.7192719271927193, + "grad_norm": 0.2656507194042206, + "learning_rate": 5.481533599603486e-05, + "loss": 0.9852, + "mean_token_accuracy": 0.7157487571239471, + "num_tokens": 15221297.0, + "step": 1798 + }, + { + "entropy": 1.0259725600481033, + "epoch": 0.7196719671967197, + "grad_norm": 0.28802576661109924, + "learning_rate": 5.472327720554451e-05, + "loss": 1.031, + "mean_token_accuracy": 0.7029864639043808, + "num_tokens": 15229288.0, + "step": 1799 + }, + { + "entropy": 1.0600991249084473, + "epoch": 0.7200720072007201, + "grad_norm": 0.28619131445884705, + "learning_rate": 5.463131118738876e-05, + "loss": 1.0689, + "mean_token_accuracy": 0.7005249708890915, + "num_tokens": 15237216.0, + "step": 1800 + }, + { + "entropy": 0.9764162600040436, + "epoch": 0.7204720472047205, + "grad_norm": 0.26304998993873596, + "learning_rate": 5.453943809591654e-05, + "loss": 0.9712, + "mean_token_accuracy": 0.7151237577199936, + "num_tokens": 15246136.0, + "step": 1801 + }, + { + "entropy": 0.9988236576318741, + "epoch": 0.7208720872087209, + "grad_norm": 0.2807040512561798, + "learning_rate": 5.444765808532084e-05, + "loss": 1.0067, + "mean_token_accuracy": 0.706684798002243, + "num_tokens": 15254540.0, + "step": 1802 + }, + { + "entropy": 1.0510361343622208, + "epoch": 0.7212721272127213, + "grad_norm": 0.2888534963130951, + "learning_rate": 5.435597130963836e-05, + "loss": 1.0524, + "mean_token_accuracy": 0.6980572938919067, + "num_tokens": 15262314.0, + "step": 1803 + }, + { + "entropy": 0.9987234771251678, + "epoch": 0.7216721672167217, + "grad_norm": 0.2656492292881012, + "learning_rate": 5.426437792274934e-05, + "loss": 0.9988, + "mean_token_accuracy": 0.7166716009378433, + "num_tokens": 15270785.0, + "step": 1804 + }, + { + "entropy": 0.9834738671779633, + "epoch": 0.7220722072207221, + "grad_norm": 0.25241294503211975, + "learning_rate": 5.417287807837731e-05, + "loss": 0.9541, + "mean_token_accuracy": 0.7222414165735245, + "num_tokens": 15279683.0, + "step": 1805 + }, + { + "entropy": 0.996497631072998, + "epoch": 0.7224722472247225, + "grad_norm": 0.2662406861782074, + "learning_rate": 5.408147193008883e-05, + "loss": 0.9612, + "mean_token_accuracy": 0.7186982780694962, + "num_tokens": 15288112.0, + "step": 1806 + }, + { + "entropy": 1.1060806512832642, + "epoch": 0.7228722872287229, + "grad_norm": 0.2686406075954437, + "learning_rate": 5.3990159631293145e-05, + "loss": 1.0839, + "mean_token_accuracy": 0.699811652302742, + "num_tokens": 15296612.0, + "step": 1807 + }, + { + "entropy": 0.9561125785112381, + "epoch": 0.7232723272327233, + "grad_norm": 0.25071460008621216, + "learning_rate": 5.3898941335242005e-05, + "loss": 0.9193, + "mean_token_accuracy": 0.7281172424554825, + "num_tokens": 15305775.0, + "step": 1808 + }, + { + "entropy": 1.0753975808620453, + "epoch": 0.7236723672367237, + "grad_norm": 0.26959118247032166, + "learning_rate": 5.380781719502939e-05, + "loss": 1.0395, + "mean_token_accuracy": 0.6984004378318787, + "num_tokens": 15314122.0, + "step": 1809 + }, + { + "entropy": 1.1326527893543243, + "epoch": 0.7240724072407241, + "grad_norm": 0.286304771900177, + "learning_rate": 5.371678736359129e-05, + "loss": 1.113, + "mean_token_accuracy": 0.6936499923467636, + "num_tokens": 15322165.0, + "step": 1810 + }, + { + "entropy": 0.981751024723053, + "epoch": 0.7244724472447245, + "grad_norm": 0.2600105106830597, + "learning_rate": 5.3625851993705434e-05, + "loss": 0.974, + "mean_token_accuracy": 0.7217129468917847, + "num_tokens": 15330440.0, + "step": 1811 + }, + { + "entropy": 1.010168731212616, + "epoch": 0.7248724872487249, + "grad_norm": 0.2711855471134186, + "learning_rate": 5.353501123799094e-05, + "loss": 1.0229, + "mean_token_accuracy": 0.7068586051464081, + "num_tokens": 15339036.0, + "step": 1812 + }, + { + "entropy": 0.9361576586961746, + "epoch": 0.7252725272527253, + "grad_norm": 0.2616553008556366, + "learning_rate": 5.344426524890813e-05, + "loss": 0.9302, + "mean_token_accuracy": 0.7275435030460358, + "num_tokens": 15347515.0, + "step": 1813 + }, + { + "entropy": 1.0117815881967545, + "epoch": 0.7256725672567257, + "grad_norm": 0.2709094285964966, + "learning_rate": 5.335361417875835e-05, + "loss": 1.0028, + "mean_token_accuracy": 0.7136920392513275, + "num_tokens": 15355966.0, + "step": 1814 + }, + { + "entropy": 0.984452024102211, + "epoch": 0.7260726072607261, + "grad_norm": 0.2603929340839386, + "learning_rate": 5.326305817968362e-05, + "loss": 0.9608, + "mean_token_accuracy": 0.714958980679512, + "num_tokens": 15364671.0, + "step": 1815 + }, + { + "entropy": 0.9944485574960709, + "epoch": 0.7264726472647265, + "grad_norm": 0.28045469522476196, + "learning_rate": 5.317259740366638e-05, + "loss": 0.9823, + "mean_token_accuracy": 0.7156690210103989, + "num_tokens": 15372862.0, + "step": 1816 + }, + { + "entropy": 1.0883727371692657, + "epoch": 0.7268726872687269, + "grad_norm": 0.27829018235206604, + "learning_rate": 5.308223200252924e-05, + "loss": 1.1059, + "mean_token_accuracy": 0.6830866485834122, + "num_tokens": 15381097.0, + "step": 1817 + }, + { + "entropy": 1.0059540271759033, + "epoch": 0.7272727272727273, + "grad_norm": 0.29057520627975464, + "learning_rate": 5.299196212793474e-05, + "loss": 1.0114, + "mean_token_accuracy": 0.7061628550291061, + "num_tokens": 15389085.0, + "step": 1818 + }, + { + "entropy": 1.0167541205883026, + "epoch": 0.7276727672767277, + "grad_norm": 0.28216996788978577, + "learning_rate": 5.290178793138514e-05, + "loss": 1.0236, + "mean_token_accuracy": 0.7119895964860916, + "num_tokens": 15397413.0, + "step": 1819 + }, + { + "entropy": 1.0413401126861572, + "epoch": 0.7280728072807281, + "grad_norm": 0.28212466835975647, + "learning_rate": 5.281170956422212e-05, + "loss": 1.0602, + "mean_token_accuracy": 0.7006165534257889, + "num_tokens": 15406255.0, + "step": 1820 + }, + { + "entropy": 1.0448304861783981, + "epoch": 0.7284728472847285, + "grad_norm": 0.33545419573783875, + "learning_rate": 5.272172717762649e-05, + "loss": 1.036, + "mean_token_accuracy": 0.7051610499620438, + "num_tokens": 15414562.0, + "step": 1821 + }, + { + "entropy": 0.9899641573429108, + "epoch": 0.7288728872887289, + "grad_norm": 0.3488135039806366, + "learning_rate": 5.263184092261793e-05, + "loss": 0.9772, + "mean_token_accuracy": 0.7190723717212677, + "num_tokens": 15422963.0, + "step": 1822 + }, + { + "entropy": 1.005012184381485, + "epoch": 0.7292729272927293, + "grad_norm": 0.25827670097351074, + "learning_rate": 5.2542050950054925e-05, + "loss": 0.9892, + "mean_token_accuracy": 0.7183214128017426, + "num_tokens": 15431800.0, + "step": 1823 + }, + { + "entropy": 0.9645077586174011, + "epoch": 0.7296729672967297, + "grad_norm": 0.2731607258319855, + "learning_rate": 5.245235741063419e-05, + "loss": 0.9445, + "mean_token_accuracy": 0.7265629917383194, + "num_tokens": 15440368.0, + "step": 1824 + }, + { + "entropy": 0.9944679439067841, + "epoch": 0.7300730073007301, + "grad_norm": 0.25965458154678345, + "learning_rate": 5.236276045489075e-05, + "loss": 0.972, + "mean_token_accuracy": 0.7143032103776932, + "num_tokens": 15449115.0, + "step": 1825 + }, + { + "entropy": 1.0461916625499725, + "epoch": 0.7304730473047305, + "grad_norm": 0.29211509227752686, + "learning_rate": 5.227326023319743e-05, + "loss": 1.0405, + "mean_token_accuracy": 0.7003129124641418, + "num_tokens": 15456967.0, + "step": 1826 + }, + { + "entropy": 1.056776612997055, + "epoch": 0.7308730873087309, + "grad_norm": 0.27467435598373413, + "learning_rate": 5.2183856895764724e-05, + "loss": 1.0183, + "mean_token_accuracy": 0.7041320502758026, + "num_tokens": 15465130.0, + "step": 1827 + }, + { + "entropy": 0.977705329656601, + "epoch": 0.7312731273127313, + "grad_norm": 0.27317294478416443, + "learning_rate": 5.2094550592640545e-05, + "loss": 0.9779, + "mean_token_accuracy": 0.719263955950737, + "num_tokens": 15473632.0, + "step": 1828 + }, + { + "entropy": 1.0370897054672241, + "epoch": 0.7316731673167317, + "grad_norm": 0.281005859375, + "learning_rate": 5.200534147370999e-05, + "loss": 1.0404, + "mean_token_accuracy": 0.7069350332021713, + "num_tokens": 15481927.0, + "step": 1829 + }, + { + "entropy": 1.0068767219781876, + "epoch": 0.7320732073207321, + "grad_norm": 0.2759975790977478, + "learning_rate": 5.191622968869496e-05, + "loss": 1.0152, + "mean_token_accuracy": 0.7068546414375305, + "num_tokens": 15490190.0, + "step": 1830 + }, + { + "entropy": 0.9722078442573547, + "epoch": 0.7324732473247325, + "grad_norm": 0.27043867111206055, + "learning_rate": 5.1827215387154005e-05, + "loss": 0.9704, + "mean_token_accuracy": 0.7117862403392792, + "num_tokens": 15498557.0, + "step": 1831 + }, + { + "entropy": 1.0644948482513428, + "epoch": 0.7328732873287329, + "grad_norm": 0.28107914328575134, + "learning_rate": 5.1738298718482145e-05, + "loss": 1.0814, + "mean_token_accuracy": 0.6910319030284882, + "num_tokens": 15506849.0, + "step": 1832 + }, + { + "entropy": 1.0685735642910004, + "epoch": 0.7332733273327333, + "grad_norm": 0.28577595949172974, + "learning_rate": 5.1649479831910465e-05, + "loss": 1.0934, + "mean_token_accuracy": 0.6918471455574036, + "num_tokens": 15515198.0, + "step": 1833 + }, + { + "entropy": 1.0507451742887497, + "epoch": 0.7336733673367337, + "grad_norm": 0.27079933881759644, + "learning_rate": 5.156075887650601e-05, + "loss": 1.0396, + "mean_token_accuracy": 0.6995644718408585, + "num_tokens": 15523462.0, + "step": 1834 + }, + { + "entropy": 0.9441281706094742, + "epoch": 0.7340734073407341, + "grad_norm": 0.24909089505672455, + "learning_rate": 5.1472136001171355e-05, + "loss": 0.9166, + "mean_token_accuracy": 0.7352582365274429, + "num_tokens": 15532457.0, + "step": 1835 + }, + { + "entropy": 1.017049789428711, + "epoch": 0.7344734473447345, + "grad_norm": 0.2845374047756195, + "learning_rate": 5.1383611354644635e-05, + "loss": 0.9849, + "mean_token_accuracy": 0.7082022130489349, + "num_tokens": 15540454.0, + "step": 1836 + }, + { + "entropy": 1.0176618993282318, + "epoch": 0.7348734873487349, + "grad_norm": 0.29254093766212463, + "learning_rate": 5.129518508549895e-05, + "loss": 1.0197, + "mean_token_accuracy": 0.7119490802288055, + "num_tokens": 15549135.0, + "step": 1837 + }, + { + "entropy": 0.956862673163414, + "epoch": 0.7352735273527353, + "grad_norm": 0.2697528302669525, + "learning_rate": 5.1206857342142345e-05, + "loss": 0.9325, + "mean_token_accuracy": 0.735243484377861, + "num_tokens": 15557664.0, + "step": 1838 + }, + { + "entropy": 1.0698621571063995, + "epoch": 0.7356735673567357, + "grad_norm": 0.28195714950561523, + "learning_rate": 5.1118628272817595e-05, + "loss": 1.024, + "mean_token_accuracy": 0.7082817703485489, + "num_tokens": 15565839.0, + "step": 1839 + }, + { + "entropy": 0.9802502542734146, + "epoch": 0.7360736073607361, + "grad_norm": 0.26460593938827515, + "learning_rate": 5.1030498025601733e-05, + "loss": 0.9781, + "mean_token_accuracy": 0.7241973578929901, + "num_tokens": 15574279.0, + "step": 1840 + }, + { + "entropy": 1.1119203567504883, + "epoch": 0.7364736473647365, + "grad_norm": 0.2905811667442322, + "learning_rate": 5.094246674840607e-05, + "loss": 1.0962, + "mean_token_accuracy": 0.6892087161540985, + "num_tokens": 15582432.0, + "step": 1841 + }, + { + "entropy": 1.0160206407308578, + "epoch": 0.7368736873687368, + "grad_norm": 0.2707589566707611, + "learning_rate": 5.0854534588975665e-05, + "loss": 1.0089, + "mean_token_accuracy": 0.7130383402109146, + "num_tokens": 15590930.0, + "step": 1842 + }, + { + "entropy": 1.0062616467475891, + "epoch": 0.7372737273727372, + "grad_norm": 0.2603473961353302, + "learning_rate": 5.0766701694889397e-05, + "loss": 1.006, + "mean_token_accuracy": 0.7103791981935501, + "num_tokens": 15599983.0, + "step": 1843 + }, + { + "entropy": 0.9740394353866577, + "epoch": 0.7376737673767376, + "grad_norm": 0.28221848607063293, + "learning_rate": 5.0678968213559354e-05, + "loss": 0.9587, + "mean_token_accuracy": 0.7255450040102005, + "num_tokens": 15608457.0, + "step": 1844 + }, + { + "entropy": 0.9599699825048447, + "epoch": 0.738073807380738, + "grad_norm": 0.27412593364715576, + "learning_rate": 5.059133429223097e-05, + "loss": 0.9478, + "mean_token_accuracy": 0.7196529805660248, + "num_tokens": 15617394.0, + "step": 1845 + }, + { + "entropy": 0.9502219259738922, + "epoch": 0.7384738473847384, + "grad_norm": 0.2622402608394623, + "learning_rate": 5.050380007798246e-05, + "loss": 0.9574, + "mean_token_accuracy": 0.7234981656074524, + "num_tokens": 15626538.0, + "step": 1846 + }, + { + "entropy": 0.998581051826477, + "epoch": 0.7388738873887388, + "grad_norm": 0.271270215511322, + "learning_rate": 5.0416365717724665e-05, + "loss": 1.0105, + "mean_token_accuracy": 0.7077009230852127, + "num_tokens": 15635162.0, + "step": 1847 + }, + { + "entropy": 0.9634309262037277, + "epoch": 0.7392739273927392, + "grad_norm": 0.27745920419692993, + "learning_rate": 5.0329031358201015e-05, + "loss": 1.0002, + "mean_token_accuracy": 0.7089777886867523, + "num_tokens": 15643837.0, + "step": 1848 + }, + { + "entropy": 1.0162126570940018, + "epoch": 0.7396739673967396, + "grad_norm": 0.27735885977745056, + "learning_rate": 5.024179714598689e-05, + "loss": 1.0298, + "mean_token_accuracy": 0.7059819251298904, + "num_tokens": 15652237.0, + "step": 1849 + }, + { + "entropy": 1.0339131653308868, + "epoch": 0.74007400740074, + "grad_norm": 0.28404727578163147, + "learning_rate": 5.015466322748978e-05, + "loss": 1.0362, + "mean_token_accuracy": 0.7047589719295502, + "num_tokens": 15660286.0, + "step": 1850 + }, + { + "entropy": 0.9711570888757706, + "epoch": 0.7404740474047404, + "grad_norm": 0.26058757305145264, + "learning_rate": 5.006762974894872e-05, + "loss": 0.9581, + "mean_token_accuracy": 0.7212998569011688, + "num_tokens": 15669186.0, + "step": 1851 + }, + { + "entropy": 1.0277953743934631, + "epoch": 0.7408740874087408, + "grad_norm": 0.27081987261772156, + "learning_rate": 4.99806968564342e-05, + "loss": 1.0342, + "mean_token_accuracy": 0.7055795639753342, + "num_tokens": 15677400.0, + "step": 1852 + }, + { + "entropy": 0.9815719574689865, + "epoch": 0.7412741274127412, + "grad_norm": 0.26796236634254456, + "learning_rate": 4.9893864695847956e-05, + "loss": 0.9678, + "mean_token_accuracy": 0.71828992664814, + "num_tokens": 15686031.0, + "step": 1853 + }, + { + "entropy": 0.9924973100423813, + "epoch": 0.7416741674167416, + "grad_norm": 0.2643292248249054, + "learning_rate": 4.98071334129226e-05, + "loss": 0.9768, + "mean_token_accuracy": 0.7220223546028137, + "num_tokens": 15694807.0, + "step": 1854 + }, + { + "entropy": 1.0251242369413376, + "epoch": 0.742074207420742, + "grad_norm": 0.2991834282875061, + "learning_rate": 4.972050315322147e-05, + "loss": 0.9928, + "mean_token_accuracy": 0.7162717431783676, + "num_tokens": 15703344.0, + "step": 1855 + }, + { + "entropy": 1.0116465091705322, + "epoch": 0.7424742474247424, + "grad_norm": 0.27762356400489807, + "learning_rate": 4.9633974062138325e-05, + "loss": 0.9823, + "mean_token_accuracy": 0.7113604992628098, + "num_tokens": 15711478.0, + "step": 1856 + }, + { + "entropy": 1.0004895776510239, + "epoch": 0.7428742874287428, + "grad_norm": 0.2630826234817505, + "learning_rate": 4.954754628489713e-05, + "loss": 0.9747, + "mean_token_accuracy": 0.7103775888681412, + "num_tokens": 15720319.0, + "step": 1857 + }, + { + "entropy": 1.0129406601190567, + "epoch": 0.7432743274327432, + "grad_norm": 0.2682814300060272, + "learning_rate": 4.946121996655185e-05, + "loss": 1.0119, + "mean_token_accuracy": 0.7098504602909088, + "num_tokens": 15729007.0, + "step": 1858 + }, + { + "entropy": 1.0161653012037277, + "epoch": 0.7436743674367436, + "grad_norm": 0.2788386344909668, + "learning_rate": 4.9374995251986176e-05, + "loss": 1.0179, + "mean_token_accuracy": 0.7106742113828659, + "num_tokens": 15736920.0, + "step": 1859 + }, + { + "entropy": 0.9509435147047043, + "epoch": 0.744074407440744, + "grad_norm": 0.2619096040725708, + "learning_rate": 4.928887228591322e-05, + "loss": 0.9406, + "mean_token_accuracy": 0.7242615818977356, + "num_tokens": 15745497.0, + "step": 1860 + }, + { + "entropy": 1.0005668848752975, + "epoch": 0.7444744474447444, + "grad_norm": 0.2754281759262085, + "learning_rate": 4.920285121287533e-05, + "loss": 0.9943, + "mean_token_accuracy": 0.7127867043018341, + "num_tokens": 15753599.0, + "step": 1861 + }, + { + "entropy": 1.0017747282981873, + "epoch": 0.7448744874487448, + "grad_norm": 0.2939200699329376, + "learning_rate": 4.9116932177243885e-05, + "loss": 1.0248, + "mean_token_accuracy": 0.7136629968881607, + "num_tokens": 15761824.0, + "step": 1862 + }, + { + "entropy": 1.0249672681093216, + "epoch": 0.7452745274527453, + "grad_norm": 0.27471113204956055, + "learning_rate": 4.903111532321904e-05, + "loss": 1.0218, + "mean_token_accuracy": 0.7043896913528442, + "num_tokens": 15770307.0, + "step": 1863 + }, + { + "entropy": 1.009742945432663, + "epoch": 0.7456745674567457, + "grad_norm": 0.27961236238479614, + "learning_rate": 4.894540079482938e-05, + "loss": 1.0081, + "mean_token_accuracy": 0.712365448474884, + "num_tokens": 15778325.0, + "step": 1864 + }, + { + "entropy": 1.0164244323968887, + "epoch": 0.746074607460746, + "grad_norm": 0.28695395588874817, + "learning_rate": 4.8859788735931766e-05, + "loss": 1.0286, + "mean_token_accuracy": 0.706282764673233, + "num_tokens": 15787042.0, + "step": 1865 + }, + { + "entropy": 1.0082041770219803, + "epoch": 0.7464746474647465, + "grad_norm": 0.26509660482406616, + "learning_rate": 4.8774279290211086e-05, + "loss": 0.9933, + "mean_token_accuracy": 0.7103676646947861, + "num_tokens": 15795732.0, + "step": 1866 + }, + { + "entropy": 1.0401188731193542, + "epoch": 0.7468746874687469, + "grad_norm": 0.26926189661026, + "learning_rate": 4.868887260118006e-05, + "loss": 1.0332, + "mean_token_accuracy": 0.7102043181657791, + "num_tokens": 15804075.0, + "step": 1867 + }, + { + "entropy": 0.9792807251214981, + "epoch": 0.7472747274727473, + "grad_norm": 0.2628943920135498, + "learning_rate": 4.860356881217893e-05, + "loss": 0.9745, + "mean_token_accuracy": 0.7135739922523499, + "num_tokens": 15812816.0, + "step": 1868 + }, + { + "entropy": 0.9877045750617981, + "epoch": 0.7476747674767477, + "grad_norm": 0.26374948024749756, + "learning_rate": 4.851836806637521e-05, + "loss": 0.9923, + "mean_token_accuracy": 0.7132317572832108, + "num_tokens": 15821452.0, + "step": 1869 + }, + { + "entropy": 1.0461764335632324, + "epoch": 0.7480748074807481, + "grad_norm": 0.28898099064826965, + "learning_rate": 4.843327050676345e-05, + "loss": 1.0186, + "mean_token_accuracy": 0.7102282494306564, + "num_tokens": 15829742.0, + "step": 1870 + }, + { + "entropy": 0.9934623837471008, + "epoch": 0.7484748474847485, + "grad_norm": 0.2716710567474365, + "learning_rate": 4.834827627616507e-05, + "loss": 0.9525, + "mean_token_accuracy": 0.7150904685258865, + "num_tokens": 15838382.0, + "step": 1871 + }, + { + "entropy": 0.9838602244853973, + "epoch": 0.7488748874887489, + "grad_norm": 0.27053049206733704, + "learning_rate": 4.826338551722811e-05, + "loss": 0.9553, + "mean_token_accuracy": 0.7207248359918594, + "num_tokens": 15846607.0, + "step": 1872 + }, + { + "entropy": 1.0216495096683502, + "epoch": 0.7492749274927493, + "grad_norm": 0.28442302346229553, + "learning_rate": 4.817859837242685e-05, + "loss": 1.0245, + "mean_token_accuracy": 0.7057902067899704, + "num_tokens": 15854387.0, + "step": 1873 + }, + { + "entropy": 0.9820602685213089, + "epoch": 0.7496749674967497, + "grad_norm": 0.27369996905326843, + "learning_rate": 4.8093914984061725e-05, + "loss": 0.9653, + "mean_token_accuracy": 0.7155852615833282, + "num_tokens": 15862731.0, + "step": 1874 + }, + { + "entropy": 1.0199142843484879, + "epoch": 0.7500750075007501, + "grad_norm": 0.2855715751647949, + "learning_rate": 4.8009335494259e-05, + "loss": 1.0152, + "mean_token_accuracy": 0.7063765376806259, + "num_tokens": 15870566.0, + "step": 1875 + }, + { + "entropy": 1.0198665708303452, + "epoch": 0.7504750475047505, + "grad_norm": 0.2721792161464691, + "learning_rate": 4.7924860044970615e-05, + "loss": 0.9984, + "mean_token_accuracy": 0.7152540385723114, + "num_tokens": 15879099.0, + "step": 1876 + }, + { + "entropy": 1.0618086457252502, + "epoch": 0.7508750875087509, + "grad_norm": 0.27827250957489014, + "learning_rate": 4.7840488777973915e-05, + "loss": 1.0724, + "mean_token_accuracy": 0.6893226355314255, + "num_tokens": 15887326.0, + "step": 1877 + }, + { + "entropy": 0.9397483468055725, + "epoch": 0.7512751275127513, + "grad_norm": 0.43993356823921204, + "learning_rate": 4.775622183487131e-05, + "loss": 0.9454, + "mean_token_accuracy": 0.730973482131958, + "num_tokens": 15895870.0, + "step": 1878 + }, + { + "entropy": 0.9606287926435471, + "epoch": 0.7516751675167517, + "grad_norm": 0.2690504491329193, + "learning_rate": 4.7672059357090126e-05, + "loss": 0.9634, + "mean_token_accuracy": 0.7162826806306839, + "num_tokens": 15904497.0, + "step": 1879 + }, + { + "entropy": 0.9990681856870651, + "epoch": 0.7520752075207521, + "grad_norm": 0.2679041922092438, + "learning_rate": 4.7588001485882434e-05, + "loss": 1.0203, + "mean_token_accuracy": 0.7068963646888733, + "num_tokens": 15913204.0, + "step": 1880 + }, + { + "entropy": 0.9585936516523361, + "epoch": 0.7524752475247525, + "grad_norm": 0.26224806904792786, + "learning_rate": 4.7504048362324736e-05, + "loss": 0.9405, + "mean_token_accuracy": 0.726496234536171, + "num_tokens": 15921941.0, + "step": 1881 + }, + { + "entropy": 0.9815015941858292, + "epoch": 0.7528752875287529, + "grad_norm": 0.27497175335884094, + "learning_rate": 4.742020012731768e-05, + "loss": 0.9997, + "mean_token_accuracy": 0.7151841968297958, + "num_tokens": 15930177.0, + "step": 1882 + }, + { + "entropy": 0.9939230531454086, + "epoch": 0.7532753275327533, + "grad_norm": 0.2709159255027771, + "learning_rate": 4.733645692158588e-05, + "loss": 0.9818, + "mean_token_accuracy": 0.7145293056964874, + "num_tokens": 15938584.0, + "step": 1883 + }, + { + "entropy": 0.9190539568662643, + "epoch": 0.7536753675367537, + "grad_norm": 0.2666955292224884, + "learning_rate": 4.725281888567768e-05, + "loss": 0.9117, + "mean_token_accuracy": 0.7326617985963821, + "num_tokens": 15947234.0, + "step": 1884 + }, + { + "entropy": 1.031174674630165, + "epoch": 0.7540754075407541, + "grad_norm": 0.27601635456085205, + "learning_rate": 4.716928615996495e-05, + "loss": 1.0639, + "mean_token_accuracy": 0.6970148980617523, + "num_tokens": 15955968.0, + "step": 1885 + }, + { + "entropy": 1.0320014953613281, + "epoch": 0.7544754475447545, + "grad_norm": 0.2994645833969116, + "learning_rate": 4.7085858884642844e-05, + "loss": 1.0134, + "mean_token_accuracy": 0.7071530073881149, + "num_tokens": 15964191.0, + "step": 1886 + }, + { + "entropy": 0.9597108364105225, + "epoch": 0.7548754875487549, + "grad_norm": 0.27724534273147583, + "learning_rate": 4.700253719972943e-05, + "loss": 0.975, + "mean_token_accuracy": 0.7223957926034927, + "num_tokens": 15972645.0, + "step": 1887 + }, + { + "entropy": 0.955593153834343, + "epoch": 0.7552755275527553, + "grad_norm": 0.26258203387260437, + "learning_rate": 4.6919321245065606e-05, + "loss": 0.9674, + "mean_token_accuracy": 0.7175847887992859, + "num_tokens": 15981096.0, + "step": 1888 + }, + { + "entropy": 0.9830133765935898, + "epoch": 0.7556755675567557, + "grad_norm": 0.27360105514526367, + "learning_rate": 4.683621116031487e-05, + "loss": 0.9948, + "mean_token_accuracy": 0.7130820006132126, + "num_tokens": 15989504.0, + "step": 1889 + }, + { + "entropy": 1.0339231044054031, + "epoch": 0.7560756075607561, + "grad_norm": 0.2787891924381256, + "learning_rate": 4.6753207084963006e-05, + "loss": 1.0311, + "mean_token_accuracy": 0.7053349763154984, + "num_tokens": 15997579.0, + "step": 1890 + }, + { + "entropy": 0.9780951589345932, + "epoch": 0.7564756475647565, + "grad_norm": 0.27285584807395935, + "learning_rate": 4.667030915831785e-05, + "loss": 0.987, + "mean_token_accuracy": 0.7080614268779755, + "num_tokens": 16006083.0, + "step": 1891 + }, + { + "entropy": 1.009865179657936, + "epoch": 0.7568756875687569, + "grad_norm": 0.2621045410633087, + "learning_rate": 4.658751751950912e-05, + "loss": 0.9829, + "mean_token_accuracy": 0.7100094705820084, + "num_tokens": 16014573.0, + "step": 1892 + }, + { + "entropy": 0.9707681834697723, + "epoch": 0.7572757275727573, + "grad_norm": 0.27257344126701355, + "learning_rate": 4.6504832307488095e-05, + "loss": 0.994, + "mean_token_accuracy": 0.7105017453432083, + "num_tokens": 16023410.0, + "step": 1893 + }, + { + "entropy": 1.110841542482376, + "epoch": 0.7576757675767577, + "grad_norm": 0.2954443395137787, + "learning_rate": 4.642225366102751e-05, + "loss": 1.1003, + "mean_token_accuracy": 0.6916090846061707, + "num_tokens": 16031587.0, + "step": 1894 + }, + { + "entropy": 0.9800740778446198, + "epoch": 0.7580758075807581, + "grad_norm": 0.26876506209373474, + "learning_rate": 4.6339781718721247e-05, + "loss": 0.9511, + "mean_token_accuracy": 0.7219282537698746, + "num_tokens": 16040031.0, + "step": 1895 + }, + { + "entropy": 1.0435761511325836, + "epoch": 0.7584758475847585, + "grad_norm": 0.28686362504959106, + "learning_rate": 4.625741661898403e-05, + "loss": 1.0193, + "mean_token_accuracy": 0.7097796499729156, + "num_tokens": 16048046.0, + "step": 1896 + }, + { + "entropy": 0.9513996839523315, + "epoch": 0.7588758875887589, + "grad_norm": 0.25612878799438477, + "learning_rate": 4.617515850005129e-05, + "loss": 0.9352, + "mean_token_accuracy": 0.7250348627567291, + "num_tokens": 16057223.0, + "step": 1897 + }, + { + "entropy": 1.0131810307502747, + "epoch": 0.7592759275927593, + "grad_norm": 0.27990591526031494, + "learning_rate": 4.609300749997899e-05, + "loss": 0.9905, + "mean_token_accuracy": 0.7139701694250107, + "num_tokens": 16065653.0, + "step": 1898 + }, + { + "entropy": 0.9703545868396759, + "epoch": 0.7596759675967597, + "grad_norm": 0.27052369713783264, + "learning_rate": 4.601096375664319e-05, + "loss": 0.9778, + "mean_token_accuracy": 0.7232963591814041, + "num_tokens": 16074036.0, + "step": 1899 + }, + { + "entropy": 0.957008957862854, + "epoch": 0.7600760076007601, + "grad_norm": 0.25624436140060425, + "learning_rate": 4.592902740774003e-05, + "loss": 0.9277, + "mean_token_accuracy": 0.7242179960012436, + "num_tokens": 16082761.0, + "step": 1900 + }, + { + "entropy": 0.9569432586431503, + "epoch": 0.7604760476047605, + "grad_norm": 0.2628895044326782, + "learning_rate": 4.5847198590785394e-05, + "loss": 0.9451, + "mean_token_accuracy": 0.7298707962036133, + "num_tokens": 16091432.0, + "step": 1901 + }, + { + "entropy": 1.0490912795066833, + "epoch": 0.7608760876087609, + "grad_norm": 0.2754358947277069, + "learning_rate": 4.5765477443114605e-05, + "loss": 1.0366, + "mean_token_accuracy": 0.7074502110481262, + "num_tokens": 16099911.0, + "step": 1902 + }, + { + "entropy": 1.043460726737976, + "epoch": 0.7612761276127613, + "grad_norm": 0.28964099287986755, + "learning_rate": 4.568386410188239e-05, + "loss": 1.0273, + "mean_token_accuracy": 0.7074701637029648, + "num_tokens": 16108278.0, + "step": 1903 + }, + { + "entropy": 0.9766198992729187, + "epoch": 0.7616761676167617, + "grad_norm": 0.27041909098625183, + "learning_rate": 4.560235870406255e-05, + "loss": 0.9675, + "mean_token_accuracy": 0.7158672213554382, + "num_tokens": 16116506.0, + "step": 1904 + }, + { + "entropy": 0.9987732619047165, + "epoch": 0.7620762076207621, + "grad_norm": 0.27061161398887634, + "learning_rate": 4.5520961386447615e-05, + "loss": 1.0059, + "mean_token_accuracy": 0.7086792439222336, + "num_tokens": 16125191.0, + "step": 1905 + }, + { + "entropy": 1.0371974408626556, + "epoch": 0.7624762476247625, + "grad_norm": 0.2819973826408386, + "learning_rate": 4.543967228564878e-05, + "loss": 1.0305, + "mean_token_accuracy": 0.7030836045742035, + "num_tokens": 16133274.0, + "step": 1906 + }, + { + "entropy": 1.0003003627061844, + "epoch": 0.7628762876287629, + "grad_norm": 0.27602076530456543, + "learning_rate": 4.535849153809566e-05, + "loss": 1.0019, + "mean_token_accuracy": 0.709424614906311, + "num_tokens": 16141821.0, + "step": 1907 + }, + { + "entropy": 1.0470286309719086, + "epoch": 0.7632763276327633, + "grad_norm": 0.28162887692451477, + "learning_rate": 4.527741928003591e-05, + "loss": 1.0439, + "mean_token_accuracy": 0.7074371576309204, + "num_tokens": 16150027.0, + "step": 1908 + }, + { + "entropy": 0.9791869074106216, + "epoch": 0.7636763676367637, + "grad_norm": 0.2656901776790619, + "learning_rate": 4.519645564753524e-05, + "loss": 0.9987, + "mean_token_accuracy": 0.7083437889814377, + "num_tokens": 16158785.0, + "step": 1909 + }, + { + "entropy": 0.9447029531002045, + "epoch": 0.7640764076407641, + "grad_norm": 0.2460569143295288, + "learning_rate": 4.5115600776476965e-05, + "loss": 0.9327, + "mean_token_accuracy": 0.7275337874889374, + "num_tokens": 16167836.0, + "step": 1910 + }, + { + "entropy": 1.0694310367107391, + "epoch": 0.7644764476447645, + "grad_norm": 0.26458919048309326, + "learning_rate": 4.503485480256182e-05, + "loss": 1.0647, + "mean_token_accuracy": 0.6963541060686111, + "num_tokens": 16176394.0, + "step": 1911 + }, + { + "entropy": 1.0426035374403, + "epoch": 0.7648764876487649, + "grad_norm": 0.2894461750984192, + "learning_rate": 4.495421786130791e-05, + "loss": 1.0623, + "mean_token_accuracy": 0.6906712204217911, + "num_tokens": 16184366.0, + "step": 1912 + }, + { + "entropy": 0.9670244306325912, + "epoch": 0.7652765276527653, + "grad_norm": 0.26042723655700684, + "learning_rate": 4.4873690088050216e-05, + "loss": 0.9548, + "mean_token_accuracy": 0.7282930314540863, + "num_tokens": 16193060.0, + "step": 1913 + }, + { + "entropy": 0.9921520948410034, + "epoch": 0.7656765676567657, + "grad_norm": 0.262412428855896, + "learning_rate": 4.47932716179406e-05, + "loss": 0.9846, + "mean_token_accuracy": 0.7203300595283508, + "num_tokens": 16201655.0, + "step": 1914 + }, + { + "entropy": 1.0915074795484543, + "epoch": 0.7660766076607661, + "grad_norm": 0.292971670627594, + "learning_rate": 4.4712962585947374e-05, + "loss": 1.0753, + "mean_token_accuracy": 0.6966143548488617, + "num_tokens": 16210144.0, + "step": 1915 + }, + { + "entropy": 1.0475950688123703, + "epoch": 0.7664766476647665, + "grad_norm": 0.277006596326828, + "learning_rate": 4.463276312685532e-05, + "loss": 1.0298, + "mean_token_accuracy": 0.6964527070522308, + "num_tokens": 16218483.0, + "step": 1916 + }, + { + "entropy": 1.0462198555469513, + "epoch": 0.7668766876687669, + "grad_norm": 0.2759718894958496, + "learning_rate": 4.455267337526518e-05, + "loss": 1.0278, + "mean_token_accuracy": 0.7054939270019531, + "num_tokens": 16226805.0, + "step": 1917 + }, + { + "entropy": 1.0280572772026062, + "epoch": 0.7672767276727672, + "grad_norm": 0.2560069262981415, + "learning_rate": 4.44726934655936e-05, + "loss": 0.9866, + "mean_token_accuracy": 0.713211715221405, + "num_tokens": 16235866.0, + "step": 1918 + }, + { + "entropy": 1.0016445219516754, + "epoch": 0.7676767676767676, + "grad_norm": 0.25520822405815125, + "learning_rate": 4.439282353207298e-05, + "loss": 0.9807, + "mean_token_accuracy": 0.7121459543704987, + "num_tokens": 16244744.0, + "step": 1919 + }, + { + "entropy": 1.0172135829925537, + "epoch": 0.768076807680768, + "grad_norm": 0.2599153220653534, + "learning_rate": 4.4313063708751e-05, + "loss": 0.9982, + "mean_token_accuracy": 0.7122907936573029, + "num_tokens": 16253744.0, + "step": 1920 + }, + { + "entropy": 1.0128151327371597, + "epoch": 0.7684768476847684, + "grad_norm": 0.2659081518650055, + "learning_rate": 4.423341412949067e-05, + "loss": 0.9942, + "mean_token_accuracy": 0.7118425518274307, + "num_tokens": 16262386.0, + "step": 1921 + }, + { + "entropy": 1.026433140039444, + "epoch": 0.7688768876887688, + "grad_norm": 0.2670094966888428, + "learning_rate": 4.4153874927969845e-05, + "loss": 1.0156, + "mean_token_accuracy": 0.7092668265104294, + "num_tokens": 16270640.0, + "step": 1922 + }, + { + "entropy": 1.0363490730524063, + "epoch": 0.7692769276927692, + "grad_norm": 0.27132827043533325, + "learning_rate": 4.407444623768125e-05, + "loss": 1.0132, + "mean_token_accuracy": 0.7055574208498001, + "num_tokens": 16278876.0, + "step": 1923 + }, + { + "entropy": 0.9468106627464294, + "epoch": 0.7696769676967696, + "grad_norm": 0.26042991876602173, + "learning_rate": 4.3995128191932047e-05, + "loss": 0.9167, + "mean_token_accuracy": 0.7311969697475433, + "num_tokens": 16287548.0, + "step": 1924 + }, + { + "entropy": 1.0284443199634552, + "epoch": 0.77007700770077, + "grad_norm": 0.27916139364242554, + "learning_rate": 4.391592092384378e-05, + "loss": 1.0266, + "mean_token_accuracy": 0.704498752951622, + "num_tokens": 16295955.0, + "step": 1925 + }, + { + "entropy": 0.9854629188776016, + "epoch": 0.7704770477047704, + "grad_norm": 0.25771310925483704, + "learning_rate": 4.383682456635199e-05, + "loss": 0.9709, + "mean_token_accuracy": 0.718521773815155, + "num_tokens": 16304862.0, + "step": 1926 + }, + { + "entropy": 0.9819846153259277, + "epoch": 0.7708770877087708, + "grad_norm": 0.2799040675163269, + "learning_rate": 4.3757839252206096e-05, + "loss": 0.9819, + "mean_token_accuracy": 0.7141459286212921, + "num_tokens": 16313266.0, + "step": 1927 + }, + { + "entropy": 0.9515479207038879, + "epoch": 0.7712771277127712, + "grad_norm": 0.2587561011314392, + "learning_rate": 4.367896511396923e-05, + "loss": 0.9541, + "mean_token_accuracy": 0.7254490554332733, + "num_tokens": 16322317.0, + "step": 1928 + }, + { + "entropy": 1.0280934870243073, + "epoch": 0.7716771677167716, + "grad_norm": 0.2749437093734741, + "learning_rate": 4.36002022840178e-05, + "loss": 1.0228, + "mean_token_accuracy": 0.7083956003189087, + "num_tokens": 16330454.0, + "step": 1929 + }, + { + "entropy": 0.9578103572130203, + "epoch": 0.772077207720772, + "grad_norm": 0.2623768746852875, + "learning_rate": 4.352155089454154e-05, + "loss": 0.9617, + "mean_token_accuracy": 0.7173874974250793, + "num_tokens": 16339834.0, + "step": 1930 + }, + { + "entropy": 1.0000793635845184, + "epoch": 0.7724772477247724, + "grad_norm": 0.2740638852119446, + "learning_rate": 4.344301107754306e-05, + "loss": 0.9928, + "mean_token_accuracy": 0.7162513881921768, + "num_tokens": 16348024.0, + "step": 1931 + }, + { + "entropy": 1.0323897451162338, + "epoch": 0.7728772877287728, + "grad_norm": 0.27349600195884705, + "learning_rate": 4.33645829648377e-05, + "loss": 1.0267, + "mean_token_accuracy": 0.7085477560758591, + "num_tokens": 16356599.0, + "step": 1932 + }, + { + "entropy": 1.0117439031600952, + "epoch": 0.7732773277327732, + "grad_norm": 0.29218804836273193, + "learning_rate": 4.328626668805339e-05, + "loss": 1.0217, + "mean_token_accuracy": 0.7080961018800735, + "num_tokens": 16364845.0, + "step": 1933 + }, + { + "entropy": 1.0359125137329102, + "epoch": 0.7736773677367736, + "grad_norm": 0.28453513979911804, + "learning_rate": 4.3208062378630375e-05, + "loss": 1.0283, + "mean_token_accuracy": 0.705451488494873, + "num_tokens": 16372909.0, + "step": 1934 + }, + { + "entropy": 1.005581259727478, + "epoch": 0.774077407740774, + "grad_norm": 0.26857098937034607, + "learning_rate": 4.312997016782091e-05, + "loss": 1.0014, + "mean_token_accuracy": 0.711369127035141, + "num_tokens": 16381566.0, + "step": 1935 + }, + { + "entropy": 0.9689909517765045, + "epoch": 0.7744774477447744, + "grad_norm": 0.2715790569782257, + "learning_rate": 4.305199018668912e-05, + "loss": 0.984, + "mean_token_accuracy": 0.7201101034879684, + "num_tokens": 16390258.0, + "step": 1936 + }, + { + "entropy": 1.0137456953525543, + "epoch": 0.7748774877487749, + "grad_norm": 0.2683742940425873, + "learning_rate": 4.2974122566110844e-05, + "loss": 0.9983, + "mean_token_accuracy": 0.7097334861755371, + "num_tokens": 16398853.0, + "step": 1937 + }, + { + "entropy": 0.9720441848039627, + "epoch": 0.7752775277527753, + "grad_norm": 0.26082566380500793, + "learning_rate": 4.2896367436773245e-05, + "loss": 0.969, + "mean_token_accuracy": 0.7214338928461075, + "num_tokens": 16407799.0, + "step": 1938 + }, + { + "entropy": 0.9808981567621231, + "epoch": 0.7756775677567757, + "grad_norm": 0.2646254301071167, + "learning_rate": 4.281872492917481e-05, + "loss": 0.9799, + "mean_token_accuracy": 0.7232773154973984, + "num_tokens": 16416416.0, + "step": 1939 + }, + { + "entropy": 0.9311534464359283, + "epoch": 0.7760776077607761, + "grad_norm": 0.2656506597995758, + "learning_rate": 4.274119517362489e-05, + "loss": 0.9242, + "mean_token_accuracy": 0.7243125587701797, + "num_tokens": 16425361.0, + "step": 1940 + }, + { + "entropy": 1.016297996044159, + "epoch": 0.7764776477647765, + "grad_norm": 0.27736979722976685, + "learning_rate": 4.266377830024366e-05, + "loss": 0.986, + "mean_token_accuracy": 0.711946576833725, + "num_tokens": 16433386.0, + "step": 1941 + }, + { + "entropy": 0.9611827433109283, + "epoch": 0.7768776877687769, + "grad_norm": 0.2671481966972351, + "learning_rate": 4.258647443896182e-05, + "loss": 0.9524, + "mean_token_accuracy": 0.7246019542217255, + "num_tokens": 16442011.0, + "step": 1942 + }, + { + "entropy": 0.9082488715648651, + "epoch": 0.7772777277727773, + "grad_norm": 0.27807632088661194, + "learning_rate": 4.250928371952047e-05, + "loss": 0.8695, + "mean_token_accuracy": 0.736086055636406, + "num_tokens": 16451073.0, + "step": 1943 + }, + { + "entropy": 1.0304416120052338, + "epoch": 0.7776777677767777, + "grad_norm": 0.27288827300071716, + "learning_rate": 4.243220627147072e-05, + "loss": 1.0216, + "mean_token_accuracy": 0.7135422825813293, + "num_tokens": 16459229.0, + "step": 1944 + }, + { + "entropy": 0.9455744475126266, + "epoch": 0.7780778077807781, + "grad_norm": 0.2790471911430359, + "learning_rate": 4.235524222417363e-05, + "loss": 0.948, + "mean_token_accuracy": 0.7159022688865662, + "num_tokens": 16467087.0, + "step": 1945 + }, + { + "entropy": 1.0211792439222336, + "epoch": 0.7784778477847785, + "grad_norm": 0.27309513092041016, + "learning_rate": 4.2278391706799874e-05, + "loss": 1.018, + "mean_token_accuracy": 0.7086057513952255, + "num_tokens": 16475758.0, + "step": 1946 + }, + { + "entropy": 0.9979550689458847, + "epoch": 0.7788778877887789, + "grad_norm": 0.2779162526130676, + "learning_rate": 4.220165484832973e-05, + "loss": 1.0023, + "mean_token_accuracy": 0.7086283564567566, + "num_tokens": 16484312.0, + "step": 1947 + }, + { + "entropy": 0.9505866169929504, + "epoch": 0.7792779277927793, + "grad_norm": 0.26656338572502136, + "learning_rate": 4.212503177755257e-05, + "loss": 0.9636, + "mean_token_accuracy": 0.7210749685764313, + "num_tokens": 16493313.0, + "step": 1948 + }, + { + "entropy": 1.0232224017381668, + "epoch": 0.7796779677967797, + "grad_norm": 0.32780158519744873, + "learning_rate": 4.2048522623066874e-05, + "loss": 1.0308, + "mean_token_accuracy": 0.702821210026741, + "num_tokens": 16501539.0, + "step": 1949 + }, + { + "entropy": 0.924216017127037, + "epoch": 0.7800780078007801, + "grad_norm": 0.25110381841659546, + "learning_rate": 4.197212751327986e-05, + "loss": 0.934, + "mean_token_accuracy": 0.7253291010856628, + "num_tokens": 16510871.0, + "step": 1950 + }, + { + "entropy": 1.0044145286083221, + "epoch": 0.7804780478047805, + "grad_norm": 0.26127859950065613, + "learning_rate": 4.1895846576407424e-05, + "loss": 0.9783, + "mean_token_accuracy": 0.7151003181934357, + "num_tokens": 16519253.0, + "step": 1951 + }, + { + "entropy": 1.0258438885211945, + "epoch": 0.7808780878087809, + "grad_norm": 0.2743532061576843, + "learning_rate": 4.181967994047383e-05, + "loss": 0.9981, + "mean_token_accuracy": 0.7089982479810715, + "num_tokens": 16527817.0, + "step": 1952 + }, + { + "entropy": 1.0317530930042267, + "epoch": 0.7812781278127813, + "grad_norm": 0.26868540048599243, + "learning_rate": 4.174362773331149e-05, + "loss": 1.0002, + "mean_token_accuracy": 0.7139078229665756, + "num_tokens": 16535884.0, + "step": 1953 + }, + { + "entropy": 0.968609943985939, + "epoch": 0.7816781678167817, + "grad_norm": 0.2641928791999817, + "learning_rate": 4.166769008256071e-05, + "loss": 0.9353, + "mean_token_accuracy": 0.7213132083415985, + "num_tokens": 16544289.0, + "step": 1954 + }, + { + "entropy": 0.9925180822610855, + "epoch": 0.7820782078207821, + "grad_norm": 0.27063336968421936, + "learning_rate": 4.1591867115669566e-05, + "loss": 0.9812, + "mean_token_accuracy": 0.7163388580083847, + "num_tokens": 16552613.0, + "step": 1955 + }, + { + "entropy": 0.9772813022136688, + "epoch": 0.7824782478247825, + "grad_norm": 0.2611922025680542, + "learning_rate": 4.1516158959893805e-05, + "loss": 0.9841, + "mean_token_accuracy": 0.7160541415214539, + "num_tokens": 16561498.0, + "step": 1956 + }, + { + "entropy": 1.0075124353170395, + "epoch": 0.7828782878287829, + "grad_norm": 0.2790793180465698, + "learning_rate": 4.144056574229627e-05, + "loss": 1.0054, + "mean_token_accuracy": 0.7092307507991791, + "num_tokens": 16569728.0, + "step": 1957 + }, + { + "entropy": 0.9551549553871155, + "epoch": 0.7832783278327833, + "grad_norm": 0.25812435150146484, + "learning_rate": 4.1365087589747e-05, + "loss": 0.9536, + "mean_token_accuracy": 0.7238068729639053, + "num_tokens": 16578910.0, + "step": 1958 + }, + { + "entropy": 0.987942636013031, + "epoch": 0.7836783678367837, + "grad_norm": 0.2676485478878021, + "learning_rate": 4.128972462892286e-05, + "loss": 0.9687, + "mean_token_accuracy": 0.716212660074234, + "num_tokens": 16587211.0, + "step": 1959 + }, + { + "entropy": 0.9852515012025833, + "epoch": 0.7840784078407841, + "grad_norm": 0.27511346340179443, + "learning_rate": 4.12144769863075e-05, + "loss": 0.9598, + "mean_token_accuracy": 0.7187111079692841, + "num_tokens": 16596050.0, + "step": 1960 + }, + { + "entropy": 0.9317512512207031, + "epoch": 0.7844784478447845, + "grad_norm": 0.2653049826622009, + "learning_rate": 4.1139344788190945e-05, + "loss": 0.9359, + "mean_token_accuracy": 0.7221696078777313, + "num_tokens": 16604975.0, + "step": 1961 + }, + { + "entropy": 1.0157409012317657, + "epoch": 0.7848784878487849, + "grad_norm": 0.2787778079509735, + "learning_rate": 4.1064328160669474e-05, + "loss": 1.0122, + "mean_token_accuracy": 0.7127035409212112, + "num_tokens": 16613360.0, + "step": 1962 + }, + { + "entropy": 1.1090724468231201, + "epoch": 0.7852785278527853, + "grad_norm": 0.29519206285476685, + "learning_rate": 4.09894272296454e-05, + "loss": 1.1014, + "mean_token_accuracy": 0.695744514465332, + "num_tokens": 16621401.0, + "step": 1963 + }, + { + "entropy": 0.966411218047142, + "epoch": 0.7856785678567857, + "grad_norm": 0.26962825655937195, + "learning_rate": 4.0914642120826835e-05, + "loss": 0.9565, + "mean_token_accuracy": 0.7174637019634247, + "num_tokens": 16629812.0, + "step": 1964 + }, + { + "entropy": 1.0061307102441788, + "epoch": 0.7860786078607861, + "grad_norm": 0.268784761428833, + "learning_rate": 4.083997295972761e-05, + "loss": 1.0125, + "mean_token_accuracy": 0.713878944516182, + "num_tokens": 16638099.0, + "step": 1965 + }, + { + "entropy": 1.016554832458496, + "epoch": 0.7864786478647865, + "grad_norm": 0.27387306094169617, + "learning_rate": 4.076541987166684e-05, + "loss": 1.0167, + "mean_token_accuracy": 0.7045346647500992, + "num_tokens": 16646546.0, + "step": 1966 + }, + { + "entropy": 1.0119438171386719, + "epoch": 0.7868786878687869, + "grad_norm": 0.2817784547805786, + "learning_rate": 4.069098298176888e-05, + "loss": 0.9897, + "mean_token_accuracy": 0.7150414735078812, + "num_tokens": 16654516.0, + "step": 1967 + }, + { + "entropy": 1.003562331199646, + "epoch": 0.7872787278727873, + "grad_norm": 0.27532950043678284, + "learning_rate": 4.061666241496305e-05, + "loss": 1.0028, + "mean_token_accuracy": 0.7141562849283218, + "num_tokens": 16662395.0, + "step": 1968 + }, + { + "entropy": 0.9775811582803726, + "epoch": 0.7876787678767877, + "grad_norm": 0.27311626076698303, + "learning_rate": 4.054245829598345e-05, + "loss": 0.979, + "mean_token_accuracy": 0.7140754014253616, + "num_tokens": 16670653.0, + "step": 1969 + }, + { + "entropy": 1.0271909534931183, + "epoch": 0.7880788078807881, + "grad_norm": 0.2770668864250183, + "learning_rate": 4.0468370749368786e-05, + "loss": 1.0283, + "mean_token_accuracy": 0.7098975032567978, + "num_tokens": 16678841.0, + "step": 1970 + }, + { + "entropy": 0.9999922960996628, + "epoch": 0.7884788478847885, + "grad_norm": 0.27080389857292175, + "learning_rate": 4.039439989946206e-05, + "loss": 1.0194, + "mean_token_accuracy": 0.7124370634555817, + "num_tokens": 16687177.0, + "step": 1971 + }, + { + "entropy": 0.9522688090801239, + "epoch": 0.7888788878887889, + "grad_norm": 0.25555744767189026, + "learning_rate": 4.032054587041042e-05, + "loss": 0.9471, + "mean_token_accuracy": 0.726886659860611, + "num_tokens": 16696048.0, + "step": 1972 + }, + { + "entropy": 0.9692239016294479, + "epoch": 0.7892789278927893, + "grad_norm": 0.2677621841430664, + "learning_rate": 4.024680878616499e-05, + "loss": 0.9558, + "mean_token_accuracy": 0.7195596694946289, + "num_tokens": 16704418.0, + "step": 1973 + }, + { + "entropy": 0.9616890549659729, + "epoch": 0.7896789678967897, + "grad_norm": 0.2704871594905853, + "learning_rate": 4.017318877048056e-05, + "loss": 0.9583, + "mean_token_accuracy": 0.716183066368103, + "num_tokens": 16712897.0, + "step": 1974 + }, + { + "entropy": 1.0769119262695312, + "epoch": 0.7900790079007901, + "grad_norm": 0.28852173686027527, + "learning_rate": 4.009968594691555e-05, + "loss": 1.1032, + "mean_token_accuracy": 0.6949293166399002, + "num_tokens": 16721064.0, + "step": 1975 + }, + { + "entropy": 0.9898983836174011, + "epoch": 0.7904790479047905, + "grad_norm": 0.29250386357307434, + "learning_rate": 4.002630043883159e-05, + "loss": 0.9795, + "mean_token_accuracy": 0.7169605493545532, + "num_tokens": 16728909.0, + "step": 1976 + }, + { + "entropy": 1.0148055404424667, + "epoch": 0.7908790879087909, + "grad_norm": 0.2684784233570099, + "learning_rate": 3.995303236939341e-05, + "loss": 1.0182, + "mean_token_accuracy": 0.7063656002283096, + "num_tokens": 16738054.0, + "step": 1977 + }, + { + "entropy": 1.0401248186826706, + "epoch": 0.7912791279127913, + "grad_norm": 0.277036190032959, + "learning_rate": 3.9879881861568736e-05, + "loss": 1.0037, + "mean_token_accuracy": 0.7143462896347046, + "num_tokens": 16746093.0, + "step": 1978 + }, + { + "entropy": 1.038544237613678, + "epoch": 0.7916791679167917, + "grad_norm": 0.2686924636363983, + "learning_rate": 3.98068490381279e-05, + "loss": 1.0112, + "mean_token_accuracy": 0.7029880285263062, + "num_tokens": 16754423.0, + "step": 1979 + }, + { + "entropy": 1.0010389536619186, + "epoch": 0.7920792079207921, + "grad_norm": 0.2896207571029663, + "learning_rate": 3.97339340216438e-05, + "loss": 0.9903, + "mean_token_accuracy": 0.7086462378501892, + "num_tokens": 16762672.0, + "step": 1980 + }, + { + "entropy": 1.0201686918735504, + "epoch": 0.7924792479247925, + "grad_norm": 0.26131436228752136, + "learning_rate": 3.96611369344915e-05, + "loss": 1.0047, + "mean_token_accuracy": 0.7100804895162582, + "num_tokens": 16771269.0, + "step": 1981 + }, + { + "entropy": 0.9933746755123138, + "epoch": 0.7928792879287929, + "grad_norm": 0.27140122652053833, + "learning_rate": 3.958845789884829e-05, + "loss": 0.9832, + "mean_token_accuracy": 0.7210771441459656, + "num_tokens": 16779655.0, + "step": 1982 + }, + { + "entropy": 1.0229352116584778, + "epoch": 0.7932793279327933, + "grad_norm": 0.29264238476753235, + "learning_rate": 3.951589703669317e-05, + "loss": 1.0021, + "mean_token_accuracy": 0.7064402997493744, + "num_tokens": 16787371.0, + "step": 1983 + }, + { + "entropy": 0.9796026796102524, + "epoch": 0.7936793679367937, + "grad_norm": 0.2693035900592804, + "learning_rate": 3.944345446980694e-05, + "loss": 0.9876, + "mean_token_accuracy": 0.7197673469781876, + "num_tokens": 16795684.0, + "step": 1984 + }, + { + "entropy": 0.9865094423294067, + "epoch": 0.7940794079407941, + "grad_norm": 0.2630591094493866, + "learning_rate": 3.93711303197718e-05, + "loss": 0.9765, + "mean_token_accuracy": 0.7150878310203552, + "num_tokens": 16804384.0, + "step": 1985 + }, + { + "entropy": 1.061917930841446, + "epoch": 0.7944794479447945, + "grad_norm": 0.2843262553215027, + "learning_rate": 3.929892470797119e-05, + "loss": 1.095, + "mean_token_accuracy": 0.6901217699050903, + "num_tokens": 16812725.0, + "step": 1986 + }, + { + "entropy": 0.9478074610233307, + "epoch": 0.7948794879487949, + "grad_norm": 0.2727768123149872, + "learning_rate": 3.9226837755589665e-05, + "loss": 0.9281, + "mean_token_accuracy": 0.7279782295227051, + "num_tokens": 16821079.0, + "step": 1987 + }, + { + "entropy": 1.0381271839141846, + "epoch": 0.7952795279527953, + "grad_norm": 0.2811879813671112, + "learning_rate": 3.915486958361256e-05, + "loss": 1.0465, + "mean_token_accuracy": 0.6986168771982193, + "num_tokens": 16829503.0, + "step": 1988 + }, + { + "entropy": 0.9512267708778381, + "epoch": 0.7956795679567957, + "grad_norm": 0.2610643804073334, + "learning_rate": 3.9083020312825944e-05, + "loss": 0.9577, + "mean_token_accuracy": 0.7150786072015762, + "num_tokens": 16838256.0, + "step": 1989 + }, + { + "entropy": 0.979460746049881, + "epoch": 0.7960796079607961, + "grad_norm": 0.27445539832115173, + "learning_rate": 3.901129006381623e-05, + "loss": 0.9913, + "mean_token_accuracy": 0.7098689675331116, + "num_tokens": 16846664.0, + "step": 1990 + }, + { + "entropy": 1.009248524904251, + "epoch": 0.7964796479647965, + "grad_norm": 0.2770299017429352, + "learning_rate": 3.893967895697017e-05, + "loss": 0.9883, + "mean_token_accuracy": 0.717975303530693, + "num_tokens": 16854642.0, + "step": 1991 + }, + { + "entropy": 0.9395347088575363, + "epoch": 0.7968796879687969, + "grad_norm": 0.26004788279533386, + "learning_rate": 3.886818711247451e-05, + "loss": 0.9232, + "mean_token_accuracy": 0.7248664647340775, + "num_tokens": 16863502.0, + "step": 1992 + }, + { + "entropy": 0.9856249690055847, + "epoch": 0.7972797279727972, + "grad_norm": 0.2668319046497345, + "learning_rate": 3.879681465031577e-05, + "loss": 0.9497, + "mean_token_accuracy": 0.7299875915050507, + "num_tokens": 16871661.0, + "step": 1993 + }, + { + "entropy": 1.0015004724264145, + "epoch": 0.7976797679767976, + "grad_norm": 0.26891690492630005, + "learning_rate": 3.872556169028027e-05, + "loss": 0.9771, + "mean_token_accuracy": 0.7114444971084595, + "num_tokens": 16880070.0, + "step": 1994 + }, + { + "entropy": 0.981217086315155, + "epoch": 0.798079807980798, + "grad_norm": 0.2644566297531128, + "learning_rate": 3.86544283519536e-05, + "loss": 0.9556, + "mean_token_accuracy": 0.722639188170433, + "num_tokens": 16888561.0, + "step": 1995 + }, + { + "entropy": 1.0057890564203262, + "epoch": 0.7984798479847984, + "grad_norm": 0.2662646770477295, + "learning_rate": 3.858341475472071e-05, + "loss": 0.991, + "mean_token_accuracy": 0.717515230178833, + "num_tokens": 16897089.0, + "step": 1996 + }, + { + "entropy": 1.0025110095739365, + "epoch": 0.7988798879887988, + "grad_norm": 0.278546541929245, + "learning_rate": 3.851252101776546e-05, + "loss": 1.0042, + "mean_token_accuracy": 0.7160844504833221, + "num_tokens": 16905223.0, + "step": 1997 + }, + { + "entropy": 1.0081955790519714, + "epoch": 0.7992799279927992, + "grad_norm": 0.2812212407588959, + "learning_rate": 3.84417472600707e-05, + "loss": 1.0406, + "mean_token_accuracy": 0.7019616961479187, + "num_tokens": 16913937.0, + "step": 1998 + }, + { + "entropy": 0.9704654067754745, + "epoch": 0.7996799679967996, + "grad_norm": 0.2580846846103668, + "learning_rate": 3.837109360041778e-05, + "loss": 0.964, + "mean_token_accuracy": 0.7241959124803543, + "num_tokens": 16923011.0, + "step": 1999 + }, + { + "entropy": 1.0043010711669922, + "epoch": 0.8000800080008, + "grad_norm": 0.2711995542049408, + "learning_rate": 3.830056015738657e-05, + "loss": 0.9877, + "mean_token_accuracy": 0.7192713171243668, + "num_tokens": 16931363.0, + "step": 2000 + }, + { + "entropy": 0.9635373055934906, + "epoch": 0.8004800480048004, + "grad_norm": 0.2713865339756012, + "learning_rate": 3.8230147049355147e-05, + "loss": 0.9352, + "mean_token_accuracy": 0.7228316217660904, + "num_tokens": 16939713.0, + "step": 2001 + }, + { + "entropy": 1.0078991651535034, + "epoch": 0.8008800880088008, + "grad_norm": 0.2812260091304779, + "learning_rate": 3.8159854394499586e-05, + "loss": 1.0054, + "mean_token_accuracy": 0.7135846018791199, + "num_tokens": 16947936.0, + "step": 2002 + }, + { + "entropy": 1.0065274685621262, + "epoch": 0.8012801280128012, + "grad_norm": 0.26969239115715027, + "learning_rate": 3.80896823107939e-05, + "loss": 1.0145, + "mean_token_accuracy": 0.7034779191017151, + "num_tokens": 16956339.0, + "step": 2003 + }, + { + "entropy": 1.00691257417202, + "epoch": 0.8016801680168016, + "grad_norm": 0.2723468542098999, + "learning_rate": 3.801963091600964e-05, + "loss": 1.0111, + "mean_token_accuracy": 0.7163326442241669, + "num_tokens": 16964985.0, + "step": 2004 + }, + { + "entropy": 1.0333246290683746, + "epoch": 0.802080208020802, + "grad_norm": 0.27602705359458923, + "learning_rate": 3.794970032771589e-05, + "loss": 1.0249, + "mean_token_accuracy": 0.7057675272226334, + "num_tokens": 16972966.0, + "step": 2005 + }, + { + "entropy": 1.0007657259702682, + "epoch": 0.8024802480248024, + "grad_norm": 0.26916199922561646, + "learning_rate": 3.7879890663278916e-05, + "loss": 1.0171, + "mean_token_accuracy": 0.7068461924791336, + "num_tokens": 16981807.0, + "step": 2006 + }, + { + "entropy": 0.92459736764431, + "epoch": 0.8028802880288028, + "grad_norm": 0.25741347670555115, + "learning_rate": 3.781020203986205e-05, + "loss": 0.9352, + "mean_token_accuracy": 0.7312233299016953, + "num_tokens": 16990719.0, + "step": 2007 + }, + { + "entropy": 1.0224385410547256, + "epoch": 0.8032803280328032, + "grad_norm": 0.2788792550563812, + "learning_rate": 3.7740634574425506e-05, + "loss": 1.0308, + "mean_token_accuracy": 0.7040697932243347, + "num_tokens": 16999029.0, + "step": 2008 + }, + { + "entropy": 0.9978064298629761, + "epoch": 0.8036803680368036, + "grad_norm": 0.2727094888687134, + "learning_rate": 3.767118838372615e-05, + "loss": 0.9978, + "mean_token_accuracy": 0.7079032957553864, + "num_tokens": 17007737.0, + "step": 2009 + }, + { + "entropy": 0.9661367386579514, + "epoch": 0.804080408040804, + "grad_norm": 0.26969122886657715, + "learning_rate": 3.7601863584317285e-05, + "loss": 0.9547, + "mean_token_accuracy": 0.723625510931015, + "num_tokens": 17016262.0, + "step": 2010 + }, + { + "entropy": 0.9814732074737549, + "epoch": 0.8044804480448045, + "grad_norm": 0.2953326106071472, + "learning_rate": 3.753266029254846e-05, + "loss": 0.9663, + "mean_token_accuracy": 0.7209582924842834, + "num_tokens": 17024304.0, + "step": 2011 + }, + { + "entropy": 0.9851955622434616, + "epoch": 0.8048804880488049, + "grad_norm": 0.2813256084918976, + "learning_rate": 3.746357862456538e-05, + "loss": 0.9685, + "mean_token_accuracy": 0.7190544307231903, + "num_tokens": 17032402.0, + "step": 2012 + }, + { + "entropy": 1.0187128186225891, + "epoch": 0.8052805280528053, + "grad_norm": 0.26697590947151184, + "learning_rate": 3.739461869630951e-05, + "loss": 1.0075, + "mean_token_accuracy": 0.7025318443775177, + "num_tokens": 17041180.0, + "step": 2013 + }, + { + "entropy": 0.9576444774866104, + "epoch": 0.8056805680568057, + "grad_norm": 0.26378315687179565, + "learning_rate": 3.7325780623518116e-05, + "loss": 0.9556, + "mean_token_accuracy": 0.7270702868700027, + "num_tokens": 17049763.0, + "step": 2014 + }, + { + "entropy": 1.0027414560317993, + "epoch": 0.8060806080608061, + "grad_norm": 0.2704271376132965, + "learning_rate": 3.725706452172387e-05, + "loss": 1.0004, + "mean_token_accuracy": 0.7113276720046997, + "num_tokens": 17058120.0, + "step": 2015 + }, + { + "entropy": 1.0297887027263641, + "epoch": 0.8064806480648065, + "grad_norm": 0.27671483159065247, + "learning_rate": 3.718847050625475e-05, + "loss": 0.9968, + "mean_token_accuracy": 0.7076510936021805, + "num_tokens": 17066404.0, + "step": 2016 + }, + { + "entropy": 1.0210773199796677, + "epoch": 0.8068806880688069, + "grad_norm": 0.2748508155345917, + "learning_rate": 3.7119998692233825e-05, + "loss": 0.9969, + "mean_token_accuracy": 0.7072038799524307, + "num_tokens": 17074759.0, + "step": 2017 + }, + { + "entropy": 0.9729053229093552, + "epoch": 0.8072807280728073, + "grad_norm": 0.27099964022636414, + "learning_rate": 3.705164919457914e-05, + "loss": 0.9706, + "mean_token_accuracy": 0.7229728996753693, + "num_tokens": 17083756.0, + "step": 2018 + }, + { + "entropy": 0.9743250012397766, + "epoch": 0.8076807680768077, + "grad_norm": 0.28145357966423035, + "learning_rate": 3.6983422128003384e-05, + "loss": 0.9703, + "mean_token_accuracy": 0.718558207154274, + "num_tokens": 17092240.0, + "step": 2019 + }, + { + "entropy": 1.0586475431919098, + "epoch": 0.8080808080808081, + "grad_norm": 0.2733355462551117, + "learning_rate": 3.691531760701376e-05, + "loss": 1.0613, + "mean_token_accuracy": 0.7037104517221451, + "num_tokens": 17100454.0, + "step": 2020 + }, + { + "entropy": 1.039633885025978, + "epoch": 0.8084808480848085, + "grad_norm": 0.276384562253952, + "learning_rate": 3.684733574591183e-05, + "loss": 1.0328, + "mean_token_accuracy": 0.7017520070075989, + "num_tokens": 17109136.0, + "step": 2021 + }, + { + "entropy": 1.0072901844978333, + "epoch": 0.8088808880888089, + "grad_norm": 0.26822522282600403, + "learning_rate": 3.6779476658793275e-05, + "loss": 1.0106, + "mean_token_accuracy": 0.7135123759508133, + "num_tokens": 17117724.0, + "step": 2022 + }, + { + "entropy": 0.9677459597587585, + "epoch": 0.8092809280928093, + "grad_norm": 0.2706298828125, + "learning_rate": 3.67117404595478e-05, + "loss": 0.9633, + "mean_token_accuracy": 0.7211137413978577, + "num_tokens": 17126483.0, + "step": 2023 + }, + { + "entropy": 1.0450319945812225, + "epoch": 0.8096809680968097, + "grad_norm": 0.28412947058677673, + "learning_rate": 3.6644127261858715e-05, + "loss": 1.0322, + "mean_token_accuracy": 0.7079120427370071, + "num_tokens": 17134545.0, + "step": 2024 + }, + { + "entropy": 0.9584459215402603, + "epoch": 0.8100810081008101, + "grad_norm": 0.2641433775424957, + "learning_rate": 3.657663717920301e-05, + "loss": 0.9721, + "mean_token_accuracy": 0.725227415561676, + "num_tokens": 17143655.0, + "step": 2025 + }, + { + "entropy": 1.0107422322034836, + "epoch": 0.8104810481048105, + "grad_norm": 0.29035574197769165, + "learning_rate": 3.650927032485101e-05, + "loss": 0.9965, + "mean_token_accuracy": 0.7132721692323685, + "num_tokens": 17152205.0, + "step": 2026 + }, + { + "entropy": 0.9810872226953506, + "epoch": 0.8108810881088109, + "grad_norm": 0.2626109719276428, + "learning_rate": 3.6442026811866246e-05, + "loss": 0.9901, + "mean_token_accuracy": 0.7163696587085724, + "num_tokens": 17161110.0, + "step": 2027 + }, + { + "entropy": 1.0157355666160583, + "epoch": 0.8112811281128113, + "grad_norm": 0.27817121148109436, + "learning_rate": 3.637490675310521e-05, + "loss": 1.015, + "mean_token_accuracy": 0.7027568519115448, + "num_tokens": 17169499.0, + "step": 2028 + }, + { + "entropy": 0.9125346094369888, + "epoch": 0.8116811681168117, + "grad_norm": 0.2570400536060333, + "learning_rate": 3.63079102612172e-05, + "loss": 0.899, + "mean_token_accuracy": 0.7330032587051392, + "num_tokens": 17178316.0, + "step": 2029 + }, + { + "entropy": 0.9756700098514557, + "epoch": 0.8120812081208121, + "grad_norm": 0.27032753825187683, + "learning_rate": 3.624103744864412e-05, + "loss": 0.9661, + "mean_token_accuracy": 0.7205221056938171, + "num_tokens": 17186813.0, + "step": 2030 + }, + { + "entropy": 0.9386903941631317, + "epoch": 0.8124812481248125, + "grad_norm": 0.2541970908641815, + "learning_rate": 3.617428842762033e-05, + "loss": 0.9078, + "mean_token_accuracy": 0.7335416674613953, + "num_tokens": 17195759.0, + "step": 2031 + }, + { + "entropy": 0.9615795612335205, + "epoch": 0.8128812881288129, + "grad_norm": 0.2687077820301056, + "learning_rate": 3.6107663310172466e-05, + "loss": 0.9687, + "mean_token_accuracy": 0.7204932868480682, + "num_tokens": 17204389.0, + "step": 2032 + }, + { + "entropy": 0.989258661866188, + "epoch": 0.8132813281328133, + "grad_norm": 0.27682381868362427, + "learning_rate": 3.604116220811911e-05, + "loss": 0.9825, + "mean_token_accuracy": 0.7154345959424973, + "num_tokens": 17212858.0, + "step": 2033 + }, + { + "entropy": 0.94056037068367, + "epoch": 0.8136813681368137, + "grad_norm": 0.2550995945930481, + "learning_rate": 3.597478523307075e-05, + "loss": 0.9285, + "mean_token_accuracy": 0.7282757312059402, + "num_tokens": 17221792.0, + "step": 2034 + }, + { + "entropy": 0.9527640044689178, + "epoch": 0.8140814081408141, + "grad_norm": 0.26832711696624756, + "learning_rate": 3.590853249642958e-05, + "loss": 0.9366, + "mean_token_accuracy": 0.7245687991380692, + "num_tokens": 17230163.0, + "step": 2035 + }, + { + "entropy": 0.9529808014631271, + "epoch": 0.8144814481448145, + "grad_norm": 0.2673467993736267, + "learning_rate": 3.584240410938928e-05, + "loss": 0.9474, + "mean_token_accuracy": 0.726386770606041, + "num_tokens": 17238450.0, + "step": 2036 + }, + { + "entropy": 1.0539619475603104, + "epoch": 0.8148814881488149, + "grad_norm": 0.27707141637802124, + "learning_rate": 3.57764001829348e-05, + "loss": 1.0435, + "mean_token_accuracy": 0.7051855176687241, + "num_tokens": 17246946.0, + "step": 2037 + }, + { + "entropy": 1.009418860077858, + "epoch": 0.8152815281528153, + "grad_norm": 0.28269508481025696, + "learning_rate": 3.5710520827842173e-05, + "loss": 0.9882, + "mean_token_accuracy": 0.7132804542779922, + "num_tokens": 17255305.0, + "step": 2038 + }, + { + "entropy": 1.019100084900856, + "epoch": 0.8156815681568157, + "grad_norm": 0.2711150348186493, + "learning_rate": 3.564476615467843e-05, + "loss": 1.0194, + "mean_token_accuracy": 0.7061824649572372, + "num_tokens": 17263864.0, + "step": 2039 + }, + { + "entropy": 1.0736053884029388, + "epoch": 0.8160816081608161, + "grad_norm": 0.2723540961742401, + "learning_rate": 3.557913627380132e-05, + "loss": 1.0576, + "mean_token_accuracy": 0.7029701471328735, + "num_tokens": 17272116.0, + "step": 2040 + }, + { + "entropy": 1.0191252380609512, + "epoch": 0.8164816481648165, + "grad_norm": 0.27950039505958557, + "learning_rate": 3.551363129535915e-05, + "loss": 0.9975, + "mean_token_accuracy": 0.7131896466016769, + "num_tokens": 17280206.0, + "step": 2041 + }, + { + "entropy": 0.9911477714776993, + "epoch": 0.8168816881688169, + "grad_norm": 0.2681981921195984, + "learning_rate": 3.544825132929061e-05, + "loss": 1.004, + "mean_token_accuracy": 0.70693239569664, + "num_tokens": 17288478.0, + "step": 2042 + }, + { + "entropy": 1.000399798154831, + "epoch": 0.8172817281728173, + "grad_norm": 0.28713151812553406, + "learning_rate": 3.538299648532451e-05, + "loss": 0.9949, + "mean_token_accuracy": 0.7131373733282089, + "num_tokens": 17297250.0, + "step": 2043 + }, + { + "entropy": 0.9497536718845367, + "epoch": 0.8176817681768177, + "grad_norm": 0.27053046226501465, + "learning_rate": 3.531786687297975e-05, + "loss": 0.9356, + "mean_token_accuracy": 0.728474423289299, + "num_tokens": 17305502.0, + "step": 2044 + }, + { + "entropy": 0.9798545986413956, + "epoch": 0.8180818081808181, + "grad_norm": 0.27028828859329224, + "learning_rate": 3.5252862601565075e-05, + "loss": 0.9758, + "mean_token_accuracy": 0.7154517769813538, + "num_tokens": 17313764.0, + "step": 2045 + }, + { + "entropy": 1.0283550471067429, + "epoch": 0.8184818481848185, + "grad_norm": 0.2827967405319214, + "learning_rate": 3.518798378017876e-05, + "loss": 1.0318, + "mean_token_accuracy": 0.6984479278326035, + "num_tokens": 17322128.0, + "step": 2046 + }, + { + "entropy": 0.9976890832185745, + "epoch": 0.8188818881888189, + "grad_norm": 0.2819385528564453, + "learning_rate": 3.5123230517708575e-05, + "loss": 1.015, + "mean_token_accuracy": 0.7138688266277313, + "num_tokens": 17330576.0, + "step": 2047 + }, + { + "entropy": 1.0195940285921097, + "epoch": 0.8192819281928193, + "grad_norm": 0.2740410566329956, + "learning_rate": 3.505860292283158e-05, + "loss": 0.9985, + "mean_token_accuracy": 0.7097611874341965, + "num_tokens": 17339575.0, + "step": 2048 + }, + { + "entropy": 1.0220210999250412, + "epoch": 0.8196819681968197, + "grad_norm": 0.2828964293003082, + "learning_rate": 3.499410110401393e-05, + "loss": 1.0345, + "mean_token_accuracy": 0.7077115327119827, + "num_tokens": 17348115.0, + "step": 2049 + }, + { + "entropy": 0.9760714322328568, + "epoch": 0.8200820082008201, + "grad_norm": 0.26418930292129517, + "learning_rate": 3.492972516951069e-05, + "loss": 0.9806, + "mean_token_accuracy": 0.7120557576417923, + "num_tokens": 17356430.0, + "step": 2050 + }, + { + "entropy": 0.9735960364341736, + "epoch": 0.8204820482048205, + "grad_norm": 0.2626088261604309, + "learning_rate": 3.486547522736562e-05, + "loss": 0.9682, + "mean_token_accuracy": 0.7207171022891998, + "num_tokens": 17365071.0, + "step": 2051 + }, + { + "entropy": 1.011793702840805, + "epoch": 0.8208820882088209, + "grad_norm": 0.2711929976940155, + "learning_rate": 3.4801351385411e-05, + "loss": 0.9851, + "mean_token_accuracy": 0.7158538699150085, + "num_tokens": 17373555.0, + "step": 2052 + }, + { + "entropy": 1.0244064629077911, + "epoch": 0.8212821282128213, + "grad_norm": 0.27666717767715454, + "learning_rate": 3.473735375126757e-05, + "loss": 1.0138, + "mean_token_accuracy": 0.7107319235801697, + "num_tokens": 17381949.0, + "step": 2053 + }, + { + "entropy": 0.97138811647892, + "epoch": 0.8216821682168217, + "grad_norm": 0.2735407054424286, + "learning_rate": 3.467348243234414e-05, + "loss": 0.945, + "mean_token_accuracy": 0.7178121209144592, + "num_tokens": 17390588.0, + "step": 2054 + }, + { + "entropy": 0.9377847164869308, + "epoch": 0.8220822082208221, + "grad_norm": 0.2633952498435974, + "learning_rate": 3.4609737535837626e-05, + "loss": 0.9113, + "mean_token_accuracy": 0.7370244413614273, + "num_tokens": 17399643.0, + "step": 2055 + }, + { + "entropy": 0.991148442029953, + "epoch": 0.8224822482248225, + "grad_norm": 0.2799851596355438, + "learning_rate": 3.454611916873268e-05, + "loss": 0.9963, + "mean_token_accuracy": 0.7115125805139542, + "num_tokens": 17407903.0, + "step": 2056 + }, + { + "entropy": 0.9597851932048798, + "epoch": 0.8228822882288229, + "grad_norm": 0.2782130241394043, + "learning_rate": 3.448262743780164e-05, + "loss": 0.945, + "mean_token_accuracy": 0.7253594845533371, + "num_tokens": 17416514.0, + "step": 2057 + }, + { + "entropy": 1.0511965602636337, + "epoch": 0.8232823282328233, + "grad_norm": 0.27706894278526306, + "learning_rate": 3.441926244960428e-05, + "loss": 1.0374, + "mean_token_accuracy": 0.6994187384843826, + "num_tokens": 17424902.0, + "step": 2058 + }, + { + "entropy": 0.9959340244531631, + "epoch": 0.8236823682368237, + "grad_norm": 0.27629873156547546, + "learning_rate": 3.435602431048772e-05, + "loss": 0.9763, + "mean_token_accuracy": 0.7143826335668564, + "num_tokens": 17432846.0, + "step": 2059 + }, + { + "entropy": 1.017971083521843, + "epoch": 0.8240824082408241, + "grad_norm": 0.2750287652015686, + "learning_rate": 3.4292913126586134e-05, + "loss": 1.0108, + "mean_token_accuracy": 0.7165692001581192, + "num_tokens": 17440984.0, + "step": 2060 + }, + { + "entropy": 1.0105548053979874, + "epoch": 0.8244824482448245, + "grad_norm": 0.26840513944625854, + "learning_rate": 3.4229929003820574e-05, + "loss": 0.9918, + "mean_token_accuracy": 0.7102389186620712, + "num_tokens": 17449525.0, + "step": 2061 + }, + { + "entropy": 0.9582232981920242, + "epoch": 0.8248824882488249, + "grad_norm": 0.43530935049057007, + "learning_rate": 3.416707204789897e-05, + "loss": 0.9651, + "mean_token_accuracy": 0.7199460566043854, + "num_tokens": 17458407.0, + "step": 2062 + }, + { + "entropy": 1.001932516694069, + "epoch": 0.8252825282528253, + "grad_norm": 0.2566477358341217, + "learning_rate": 3.410434236431572e-05, + "loss": 0.9741, + "mean_token_accuracy": 0.7102504223585129, + "num_tokens": 17467438.0, + "step": 2063 + }, + { + "entropy": 1.065904676914215, + "epoch": 0.8256825682568257, + "grad_norm": 0.27691999077796936, + "learning_rate": 3.404174005835167e-05, + "loss": 1.0313, + "mean_token_accuracy": 0.70762038230896, + "num_tokens": 17475648.0, + "step": 2064 + }, + { + "entropy": 1.0009035468101501, + "epoch": 0.8260826082608261, + "grad_norm": 0.2606492042541504, + "learning_rate": 3.397926523507387e-05, + "loss": 0.994, + "mean_token_accuracy": 0.7153479903936386, + "num_tokens": 17484860.0, + "step": 2065 + }, + { + "entropy": 1.0189557075500488, + "epoch": 0.8264826482648265, + "grad_norm": 0.27343183755874634, + "learning_rate": 3.3916917999335374e-05, + "loss": 1.0146, + "mean_token_accuracy": 0.7130550295114517, + "num_tokens": 17493480.0, + "step": 2066 + }, + { + "entropy": 0.9680293947458267, + "epoch": 0.8268826882688269, + "grad_norm": 0.2784017324447632, + "learning_rate": 3.385469845577518e-05, + "loss": 0.9646, + "mean_token_accuracy": 0.7227997481822968, + "num_tokens": 17501210.0, + "step": 2067 + }, + { + "entropy": 1.0657645165920258, + "epoch": 0.8272827282728272, + "grad_norm": 0.2874472439289093, + "learning_rate": 3.3792606708817915e-05, + "loss": 1.0634, + "mean_token_accuracy": 0.7018077522516251, + "num_tokens": 17508972.0, + "step": 2068 + }, + { + "entropy": 0.9324468076229095, + "epoch": 0.8276827682768276, + "grad_norm": 0.2767597734928131, + "learning_rate": 3.3730642862673756e-05, + "loss": 0.9196, + "mean_token_accuracy": 0.735635831952095, + "num_tokens": 17517354.0, + "step": 2069 + }, + { + "entropy": 0.9914631545543671, + "epoch": 0.828082808280828, + "grad_norm": 0.2730765640735626, + "learning_rate": 3.366880702133818e-05, + "loss": 0.9844, + "mean_token_accuracy": 0.717356875538826, + "num_tokens": 17525728.0, + "step": 2070 + }, + { + "entropy": 0.9847633242607117, + "epoch": 0.8284828482848284, + "grad_norm": 0.29539087414741516, + "learning_rate": 3.36070992885919e-05, + "loss": 0.99, + "mean_token_accuracy": 0.7104747742414474, + "num_tokens": 17533668.0, + "step": 2071 + }, + { + "entropy": 1.0158415734767914, + "epoch": 0.8288828882888288, + "grad_norm": 0.2817295491695404, + "learning_rate": 3.354551976800055e-05, + "loss": 1.0288, + "mean_token_accuracy": 0.7023755311965942, + "num_tokens": 17542184.0, + "step": 2072 + }, + { + "entropy": 0.9625203460454941, + "epoch": 0.8292829282928292, + "grad_norm": 0.2756093740463257, + "learning_rate": 3.348406856291463e-05, + "loss": 0.9806, + "mean_token_accuracy": 0.71139857172966, + "num_tokens": 17550425.0, + "step": 2073 + }, + { + "entropy": 0.9610450565814972, + "epoch": 0.8296829682968296, + "grad_norm": 0.2707453966140747, + "learning_rate": 3.3422745776469245e-05, + "loss": 0.9576, + "mean_token_accuracy": 0.7208888977766037, + "num_tokens": 17558824.0, + "step": 2074 + }, + { + "entropy": 0.9931362271308899, + "epoch": 0.83008300830083, + "grad_norm": 0.28165721893310547, + "learning_rate": 3.336155151158399e-05, + "loss": 1.0019, + "mean_token_accuracy": 0.7113382965326309, + "num_tokens": 17567377.0, + "step": 2075 + }, + { + "entropy": 0.9668380320072174, + "epoch": 0.8304830483048304, + "grad_norm": 0.27379605174064636, + "learning_rate": 3.3300485870962776e-05, + "loss": 0.9564, + "mean_token_accuracy": 0.719313234090805, + "num_tokens": 17575704.0, + "step": 2076 + }, + { + "entropy": 0.9624220579862595, + "epoch": 0.8308830883088308, + "grad_norm": 0.2773764431476593, + "learning_rate": 3.3239548957093614e-05, + "loss": 0.9457, + "mean_token_accuracy": 0.7229533940553665, + "num_tokens": 17584408.0, + "step": 2077 + }, + { + "entropy": 0.9824807643890381, + "epoch": 0.8312831283128312, + "grad_norm": 0.2664353549480438, + "learning_rate": 3.317874087224851e-05, + "loss": 0.9685, + "mean_token_accuracy": 0.7247719168663025, + "num_tokens": 17592686.0, + "step": 2078 + }, + { + "entropy": 1.0105426758527756, + "epoch": 0.8316831683168316, + "grad_norm": 0.2658998668193817, + "learning_rate": 3.311806171848319e-05, + "loss": 0.9957, + "mean_token_accuracy": 0.7090546786785126, + "num_tokens": 17601541.0, + "step": 2079 + }, + { + "entropy": 1.0066430270671844, + "epoch": 0.832083208320832, + "grad_norm": 0.2909826636314392, + "learning_rate": 3.3057511597637055e-05, + "loss": 0.9938, + "mean_token_accuracy": 0.7174773663282394, + "num_tokens": 17610117.0, + "step": 2080 + }, + { + "entropy": 0.9916943311691284, + "epoch": 0.8324832483248324, + "grad_norm": 0.2579765021800995, + "learning_rate": 3.2997090611332906e-05, + "loss": 0.9751, + "mean_token_accuracy": 0.7166784852743149, + "num_tokens": 17618841.0, + "step": 2081 + }, + { + "entropy": 0.9940323084592819, + "epoch": 0.8328832883288328, + "grad_norm": 0.2621162235736847, + "learning_rate": 3.2936798860976804e-05, + "loss": 0.9705, + "mean_token_accuracy": 0.7259275764226913, + "num_tokens": 17627510.0, + "step": 2082 + }, + { + "entropy": 1.0059725642204285, + "epoch": 0.8332833283328333, + "grad_norm": 0.2642369568347931, + "learning_rate": 3.2876636447757955e-05, + "loss": 0.9915, + "mean_token_accuracy": 0.7130823880434036, + "num_tokens": 17636194.0, + "step": 2083 + }, + { + "entropy": 0.9440857470035553, + "epoch": 0.8336833683368337, + "grad_norm": 0.2581683099269867, + "learning_rate": 3.281660347264849e-05, + "loss": 0.9495, + "mean_token_accuracy": 0.7229010760784149, + "num_tokens": 17645200.0, + "step": 2084 + }, + { + "entropy": 0.9961853176355362, + "epoch": 0.834083408340834, + "grad_norm": 0.2759520709514618, + "learning_rate": 3.275670003640328e-05, + "loss": 0.9801, + "mean_token_accuracy": 0.7187341302633286, + "num_tokens": 17653461.0, + "step": 2085 + }, + { + "entropy": 1.0344959497451782, + "epoch": 0.8344834483448345, + "grad_norm": 0.27701860666275024, + "learning_rate": 3.269692623955972e-05, + "loss": 1.0255, + "mean_token_accuracy": 0.7085929214954376, + "num_tokens": 17661384.0, + "step": 2086 + }, + { + "entropy": 0.9866359680891037, + "epoch": 0.8348834883488349, + "grad_norm": 0.27445974946022034, + "learning_rate": 3.26372821824378e-05, + "loss": 0.9585, + "mean_token_accuracy": 0.7200487852096558, + "num_tokens": 17669484.0, + "step": 2087 + }, + { + "entropy": 0.9635800272226334, + "epoch": 0.8352835283528353, + "grad_norm": 0.2594285309314728, + "learning_rate": 3.257776796513959e-05, + "loss": 0.9508, + "mean_token_accuracy": 0.7214874774217606, + "num_tokens": 17678223.0, + "step": 2088 + }, + { + "entropy": 0.9684711694717407, + "epoch": 0.8356835683568357, + "grad_norm": 0.2751005291938782, + "learning_rate": 3.251838368754937e-05, + "loss": 0.9684, + "mean_token_accuracy": 0.7256612181663513, + "num_tokens": 17686839.0, + "step": 2089 + }, + { + "entropy": 1.0673697143793106, + "epoch": 0.8360836083608361, + "grad_norm": 0.28038227558135986, + "learning_rate": 3.245912944933327e-05, + "loss": 1.0235, + "mean_token_accuracy": 0.7030902355909348, + "num_tokens": 17695254.0, + "step": 2090 + }, + { + "entropy": 0.9623095542192459, + "epoch": 0.8364836483648365, + "grad_norm": 0.28984689712524414, + "learning_rate": 3.240000534993914e-05, + "loss": 0.9747, + "mean_token_accuracy": 0.7110813707113266, + "num_tokens": 17704061.0, + "step": 2091 + }, + { + "entropy": 1.0633483678102493, + "epoch": 0.8368836883688369, + "grad_norm": 0.8798717260360718, + "learning_rate": 3.234101148859654e-05, + "loss": 1.0512, + "mean_token_accuracy": 0.7025868892669678, + "num_tokens": 17712302.0, + "step": 2092 + }, + { + "entropy": 1.03202024102211, + "epoch": 0.8372837283728373, + "grad_norm": 0.2923773527145386, + "learning_rate": 3.228214796431635e-05, + "loss": 1.0332, + "mean_token_accuracy": 0.7073835581541061, + "num_tokens": 17720296.0, + "step": 2093 + }, + { + "entropy": 0.9463024139404297, + "epoch": 0.8376837683768377, + "grad_norm": 0.27599889039993286, + "learning_rate": 3.2223414875890715e-05, + "loss": 0.9679, + "mean_token_accuracy": 0.7202658504247665, + "num_tokens": 17728788.0, + "step": 2094 + }, + { + "entropy": 1.0158434212207794, + "epoch": 0.8380838083808381, + "grad_norm": 0.29166507720947266, + "learning_rate": 3.216481232189289e-05, + "loss": 1.0126, + "mean_token_accuracy": 0.7013648748397827, + "num_tokens": 17736358.0, + "step": 2095 + }, + { + "entropy": 0.9892474412918091, + "epoch": 0.8384838483848385, + "grad_norm": 0.2672601044178009, + "learning_rate": 3.210634040067701e-05, + "loss": 0.9929, + "mean_token_accuracy": 0.7188314646482468, + "num_tokens": 17745167.0, + "step": 2096 + }, + { + "entropy": 0.9320423752069473, + "epoch": 0.8388838883888389, + "grad_norm": 0.24849964678287506, + "learning_rate": 3.2047999210378e-05, + "loss": 0.9238, + "mean_token_accuracy": 0.7290596067905426, + "num_tokens": 17754597.0, + "step": 2097 + }, + { + "entropy": 1.0247018188238144, + "epoch": 0.8392839283928393, + "grad_norm": 0.29533711075782776, + "learning_rate": 3.19897888489114e-05, + "loss": 1.029, + "mean_token_accuracy": 0.7054344117641449, + "num_tokens": 17762676.0, + "step": 2098 + }, + { + "entropy": 0.989239364862442, + "epoch": 0.8396839683968397, + "grad_norm": 0.2893329858779907, + "learning_rate": 3.193170941397312e-05, + "loss": 0.9532, + "mean_token_accuracy": 0.7169255912303925, + "num_tokens": 17770546.0, + "step": 2099 + }, + { + "entropy": 1.0008102357387543, + "epoch": 0.8400840084008401, + "grad_norm": 0.2754252254962921, + "learning_rate": 3.187376100303936e-05, + "loss": 0.9987, + "mean_token_accuracy": 0.7139949649572372, + "num_tokens": 17779400.0, + "step": 2100 + }, + { + "entropy": 1.0641101747751236, + "epoch": 0.8404840484048405, + "grad_norm": 0.42326244711875916, + "learning_rate": 3.1815943713366404e-05, + "loss": 1.0492, + "mean_token_accuracy": 0.7013251036405563, + "num_tokens": 17787809.0, + "step": 2101 + }, + { + "entropy": 0.9669213443994522, + "epoch": 0.8408840884088409, + "grad_norm": 0.2707522511482239, + "learning_rate": 3.1758257641990516e-05, + "loss": 0.9489, + "mean_token_accuracy": 0.7224874347448349, + "num_tokens": 17796644.0, + "step": 2102 + }, + { + "entropy": 0.9760477691888809, + "epoch": 0.8412841284128413, + "grad_norm": 0.2864903509616852, + "learning_rate": 3.1700702885727694e-05, + "loss": 0.9759, + "mean_token_accuracy": 0.7200045883655548, + "num_tokens": 17804960.0, + "step": 2103 + }, + { + "entropy": 0.9742254018783569, + "epoch": 0.8416841684168417, + "grad_norm": 0.2591839134693146, + "learning_rate": 3.1643279541173536e-05, + "loss": 0.9548, + "mean_token_accuracy": 0.7120869308710098, + "num_tokens": 17813891.0, + "step": 2104 + }, + { + "entropy": 0.9866287559270859, + "epoch": 0.8420842084208421, + "grad_norm": 0.2730100750923157, + "learning_rate": 3.1585987704703104e-05, + "loss": 1.0004, + "mean_token_accuracy": 0.7125454992055893, + "num_tokens": 17822443.0, + "step": 2105 + }, + { + "entropy": 0.951354056596756, + "epoch": 0.8424842484248425, + "grad_norm": 0.2735122740268707, + "learning_rate": 3.1528827472470766e-05, + "loss": 0.9736, + "mean_token_accuracy": 0.713584765791893, + "num_tokens": 17831317.0, + "step": 2106 + }, + { + "entropy": 0.978367954492569, + "epoch": 0.8428842884288429, + "grad_norm": 0.2732836604118347, + "learning_rate": 3.1471798940410016e-05, + "loss": 0.9708, + "mean_token_accuracy": 0.7119806408882141, + "num_tokens": 17839839.0, + "step": 2107 + }, + { + "entropy": 0.9020749479532242, + "epoch": 0.8432843284328433, + "grad_norm": 0.2637436091899872, + "learning_rate": 3.141490220423326e-05, + "loss": 0.9104, + "mean_token_accuracy": 0.7338805794715881, + "num_tokens": 17848659.0, + "step": 2108 + }, + { + "entropy": 1.0166822969913483, + "epoch": 0.8436843684368437, + "grad_norm": 0.27595508098602295, + "learning_rate": 3.135813735943174e-05, + "loss": 0.9894, + "mean_token_accuracy": 0.7151621133089066, + "num_tokens": 17857284.0, + "step": 2109 + }, + { + "entropy": 1.0501763224601746, + "epoch": 0.8440844084408441, + "grad_norm": 0.3070598840713501, + "learning_rate": 3.130150450127536e-05, + "loss": 1.0674, + "mean_token_accuracy": 0.7024436146020889, + "num_tokens": 17865438.0, + "step": 2110 + }, + { + "entropy": 1.015353575348854, + "epoch": 0.8444844484448445, + "grad_norm": 0.27963122725486755, + "learning_rate": 3.1245003724812515e-05, + "loss": 0.9957, + "mean_token_accuracy": 0.7134422063827515, + "num_tokens": 17873866.0, + "step": 2111 + }, + { + "entropy": 0.9441725760698318, + "epoch": 0.8448844884488449, + "grad_norm": 0.2618277668952942, + "learning_rate": 3.118863512486987e-05, + "loss": 0.9086, + "mean_token_accuracy": 0.7279205322265625, + "num_tokens": 17882602.0, + "step": 2112 + }, + { + "entropy": 0.9971486032009125, + "epoch": 0.8452845284528453, + "grad_norm": 0.263083815574646, + "learning_rate": 3.11323987960523e-05, + "loss": 0.9903, + "mean_token_accuracy": 0.7106924206018448, + "num_tokens": 17891440.0, + "step": 2113 + }, + { + "entropy": 0.982336163520813, + "epoch": 0.8456845684568457, + "grad_norm": 0.2669735550880432, + "learning_rate": 3.107629483274263e-05, + "loss": 0.968, + "mean_token_accuracy": 0.7132144868373871, + "num_tokens": 17900249.0, + "step": 2114 + }, + { + "entropy": 1.0065630078315735, + "epoch": 0.8460846084608461, + "grad_norm": 0.30517905950546265, + "learning_rate": 3.102032332910161e-05, + "loss": 0.9651, + "mean_token_accuracy": 0.7078851014375687, + "num_tokens": 17908385.0, + "step": 2115 + }, + { + "entropy": 1.0191326141357422, + "epoch": 0.8464846484648465, + "grad_norm": 0.26707661151885986, + "learning_rate": 3.096448437906767e-05, + "loss": 0.9956, + "mean_token_accuracy": 0.7159338295459747, + "num_tokens": 17917259.0, + "step": 2116 + }, + { + "entropy": 0.9746287018060684, + "epoch": 0.8468846884688469, + "grad_norm": 0.2584819197654724, + "learning_rate": 3.0908778076356684e-05, + "loss": 0.9467, + "mean_token_accuracy": 0.7253472954034805, + "num_tokens": 17925994.0, + "step": 2117 + }, + { + "entropy": 1.0090270191431046, + "epoch": 0.8472847284728473, + "grad_norm": 0.2734073996543884, + "learning_rate": 3.0853204514462e-05, + "loss": 0.9692, + "mean_token_accuracy": 0.7133794575929642, + "num_tokens": 17934362.0, + "step": 2118 + }, + { + "entropy": 0.9983870387077332, + "epoch": 0.8476847684768477, + "grad_norm": 0.26347869634628296, + "learning_rate": 3.079776378665412e-05, + "loss": 0.9823, + "mean_token_accuracy": 0.7129227966070175, + "num_tokens": 17943341.0, + "step": 2119 + }, + { + "entropy": 0.9843608438968658, + "epoch": 0.8480848084808481, + "grad_norm": 0.27451270818710327, + "learning_rate": 3.074245598598067e-05, + "loss": 0.9872, + "mean_token_accuracy": 0.7140398174524307, + "num_tokens": 17952069.0, + "step": 2120 + }, + { + "entropy": 0.9673940539360046, + "epoch": 0.8484848484848485, + "grad_norm": 0.317597359418869, + "learning_rate": 3.068728120526613e-05, + "loss": 0.9688, + "mean_token_accuracy": 0.719158872961998, + "num_tokens": 17960476.0, + "step": 2121 + }, + { + "entropy": 0.9594562947750092, + "epoch": 0.8488848884888489, + "grad_norm": 0.2942793369293213, + "learning_rate": 3.063223953711172e-05, + "loss": 0.9554, + "mean_token_accuracy": 0.7194225639104843, + "num_tokens": 17969101.0, + "step": 2122 + }, + { + "entropy": 0.9935691058635712, + "epoch": 0.8492849284928493, + "grad_norm": 0.2690642178058624, + "learning_rate": 3.057733107389528e-05, + "loss": 0.9854, + "mean_token_accuracy": 0.7167229503393173, + "num_tokens": 17977385.0, + "step": 2123 + }, + { + "entropy": 0.9991063326597214, + "epoch": 0.8496849684968497, + "grad_norm": 0.27631133794784546, + "learning_rate": 3.0522555907771086e-05, + "loss": 0.9991, + "mean_token_accuracy": 0.7149428129196167, + "num_tokens": 17985598.0, + "step": 2124 + }, + { + "entropy": 0.9635533839464188, + "epoch": 0.8500850085008501, + "grad_norm": 0.27808842062950134, + "learning_rate": 3.0467914130669738e-05, + "loss": 0.9731, + "mean_token_accuracy": 0.7171238511800766, + "num_tokens": 17993885.0, + "step": 2125 + }, + { + "entropy": 1.004217341542244, + "epoch": 0.8504850485048505, + "grad_norm": 0.2750919759273529, + "learning_rate": 3.041340583429789e-05, + "loss": 1.0007, + "mean_token_accuracy": 0.715006560087204, + "num_tokens": 18002119.0, + "step": 2126 + }, + { + "entropy": 1.0279668420553207, + "epoch": 0.8508850885088509, + "grad_norm": 0.2880896329879761, + "learning_rate": 3.0359031110138186e-05, + "loss": 1.0195, + "mean_token_accuracy": 0.7084156274795532, + "num_tokens": 18010351.0, + "step": 2127 + }, + { + "entropy": 0.9716871529817581, + "epoch": 0.8512851285128513, + "grad_norm": 0.27417635917663574, + "learning_rate": 3.030479004944917e-05, + "loss": 0.9924, + "mean_token_accuracy": 0.7134369313716888, + "num_tokens": 18018495.0, + "step": 2128 + }, + { + "entropy": 1.0350272506475449, + "epoch": 0.8516851685168517, + "grad_norm": 0.2761119306087494, + "learning_rate": 3.0250682743264942e-05, + "loss": 0.9978, + "mean_token_accuracy": 0.7072334438562393, + "num_tokens": 18026752.0, + "step": 2129 + }, + { + "entropy": 0.9716001451015472, + "epoch": 0.8520852085208521, + "grad_norm": 0.2638903260231018, + "learning_rate": 3.0196709282395218e-05, + "loss": 0.9613, + "mean_token_accuracy": 0.717596709728241, + "num_tokens": 18035745.0, + "step": 2130 + }, + { + "entropy": 1.0161199867725372, + "epoch": 0.8524852485248525, + "grad_norm": 0.28679123520851135, + "learning_rate": 3.0142869757425018e-05, + "loss": 1.0337, + "mean_token_accuracy": 0.7060398608446121, + "num_tokens": 18044034.0, + "step": 2131 + }, + { + "entropy": 1.0533818304538727, + "epoch": 0.8528852885288529, + "grad_norm": 0.3014936149120331, + "learning_rate": 3.008916425871457e-05, + "loss": 1.0699, + "mean_token_accuracy": 0.6964138448238373, + "num_tokens": 18051505.0, + "step": 2132 + }, + { + "entropy": 0.9945299923419952, + "epoch": 0.8532853285328533, + "grad_norm": 0.273005872964859, + "learning_rate": 3.00355928763992e-05, + "loss": 0.9719, + "mean_token_accuracy": 0.7163169234991074, + "num_tokens": 18059802.0, + "step": 2133 + }, + { + "entropy": 0.9948825985193253, + "epoch": 0.8536853685368537, + "grad_norm": 0.29109591245651245, + "learning_rate": 2.9982155700389155e-05, + "loss": 0.9771, + "mean_token_accuracy": 0.7187463045120239, + "num_tokens": 18068178.0, + "step": 2134 + }, + { + "entropy": 1.0293094664812088, + "epoch": 0.8540854085408541, + "grad_norm": 0.2749274671077728, + "learning_rate": 2.992885282036937e-05, + "loss": 1.0078, + "mean_token_accuracy": 0.7097570449113846, + "num_tokens": 18076820.0, + "step": 2135 + }, + { + "entropy": 1.0526157766580582, + "epoch": 0.8544854485448545, + "grad_norm": 0.28302180767059326, + "learning_rate": 2.9875684325799435e-05, + "loss": 1.047, + "mean_token_accuracy": 0.7043512165546417, + "num_tokens": 18084952.0, + "step": 2136 + }, + { + "entropy": 0.9516177028417587, + "epoch": 0.8548854885488549, + "grad_norm": 0.2609308362007141, + "learning_rate": 2.9822650305913418e-05, + "loss": 0.9578, + "mean_token_accuracy": 0.7261099219322205, + "num_tokens": 18093687.0, + "step": 2137 + }, + { + "entropy": 1.0099827349185944, + "epoch": 0.8552855285528553, + "grad_norm": 0.26971903443336487, + "learning_rate": 2.9769750849719635e-05, + "loss": 1.0015, + "mean_token_accuracy": 0.7061579674482346, + "num_tokens": 18102460.0, + "step": 2138 + }, + { + "entropy": 0.9993090331554413, + "epoch": 0.8556855685568557, + "grad_norm": 0.28215447068214417, + "learning_rate": 2.9716986046000627e-05, + "loss": 0.9991, + "mean_token_accuracy": 0.7060889005661011, + "num_tokens": 18110990.0, + "step": 2139 + }, + { + "entropy": 1.0264450907707214, + "epoch": 0.8560856085608561, + "grad_norm": 0.2870274782180786, + "learning_rate": 2.966435598331289e-05, + "loss": 1.0203, + "mean_token_accuracy": 0.7105409353971481, + "num_tokens": 18118864.0, + "step": 2140 + }, + { + "entropy": 1.0282356441020966, + "epoch": 0.8564856485648565, + "grad_norm": 0.27672097086906433, + "learning_rate": 2.9611860749986787e-05, + "loss": 1.0265, + "mean_token_accuracy": 0.7136689871549606, + "num_tokens": 18127338.0, + "step": 2141 + }, + { + "entropy": 0.964838981628418, + "epoch": 0.8568856885688569, + "grad_norm": 0.28100237250328064, + "learning_rate": 2.9559500434126443e-05, + "loss": 0.9605, + "mean_token_accuracy": 0.7258533388376236, + "num_tokens": 18135334.0, + "step": 2142 + }, + { + "entropy": 1.0913909375667572, + "epoch": 0.8572857285728572, + "grad_norm": 0.2851468622684479, + "learning_rate": 2.950727512360947e-05, + "loss": 1.0994, + "mean_token_accuracy": 0.6915436089038849, + "num_tokens": 18143396.0, + "step": 2143 + }, + { + "entropy": 1.0187119543552399, + "epoch": 0.8576857685768576, + "grad_norm": 0.27882036566734314, + "learning_rate": 2.9455184906086985e-05, + "loss": 1.0088, + "mean_token_accuracy": 0.7103749662637711, + "num_tokens": 18151780.0, + "step": 2144 + }, + { + "entropy": 1.0597985535860062, + "epoch": 0.858085808580858, + "grad_norm": 0.2869499921798706, + "learning_rate": 2.94032298689833e-05, + "loss": 1.0613, + "mean_token_accuracy": 0.6957389265298843, + "num_tokens": 18159986.0, + "step": 2145 + }, + { + "entropy": 1.0229371786117554, + "epoch": 0.8584858485848584, + "grad_norm": 0.28310292959213257, + "learning_rate": 2.9351410099495897e-05, + "loss": 1.0125, + "mean_token_accuracy": 0.7124859392642975, + "num_tokens": 18167982.0, + "step": 2146 + }, + { + "entropy": 1.0382138192653656, + "epoch": 0.8588858885888588, + "grad_norm": 0.2825257182121277, + "learning_rate": 2.9299725684595188e-05, + "loss": 1.0214, + "mean_token_accuracy": 0.7089748978614807, + "num_tokens": 18176253.0, + "step": 2147 + }, + { + "entropy": 0.941718190908432, + "epoch": 0.8592859285928592, + "grad_norm": 0.2738848030567169, + "learning_rate": 2.9248176711024487e-05, + "loss": 0.9303, + "mean_token_accuracy": 0.7288882583379745, + "num_tokens": 18184710.0, + "step": 2148 + }, + { + "entropy": 0.9622817188501358, + "epoch": 0.8596859685968596, + "grad_norm": 0.27742713689804077, + "learning_rate": 2.9196763265299742e-05, + "loss": 0.9728, + "mean_token_accuracy": 0.7219050526618958, + "num_tokens": 18193294.0, + "step": 2149 + }, + { + "entropy": 0.9239647835493088, + "epoch": 0.86008600860086, + "grad_norm": 0.28026142716407776, + "learning_rate": 2.9145485433709414e-05, + "loss": 0.9275, + "mean_token_accuracy": 0.7269940376281738, + "num_tokens": 18201770.0, + "step": 2150 + }, + { + "entropy": 1.0369948595762253, + "epoch": 0.8604860486048604, + "grad_norm": 0.2716243863105774, + "learning_rate": 2.9094343302314432e-05, + "loss": 1.0422, + "mean_token_accuracy": 0.6954501569271088, + "num_tokens": 18210526.0, + "step": 2151 + }, + { + "entropy": 1.0153424739837646, + "epoch": 0.8608860886088608, + "grad_norm": 0.27101534605026245, + "learning_rate": 2.9043336956947926e-05, + "loss": 0.9848, + "mean_token_accuracy": 0.7124868631362915, + "num_tokens": 18218969.0, + "step": 2152 + }, + { + "entropy": 0.9554331749677658, + "epoch": 0.8612861286128612, + "grad_norm": 0.2736670970916748, + "learning_rate": 2.8992466483215164e-05, + "loss": 0.9453, + "mean_token_accuracy": 0.727552741765976, + "num_tokens": 18227260.0, + "step": 2153 + }, + { + "entropy": 0.9299874305725098, + "epoch": 0.8616861686168616, + "grad_norm": 0.2705608308315277, + "learning_rate": 2.894173196649333e-05, + "loss": 0.9302, + "mean_token_accuracy": 0.7337226867675781, + "num_tokens": 18235646.0, + "step": 2154 + }, + { + "entropy": 0.9814595133066177, + "epoch": 0.862086208620862, + "grad_norm": 0.2662833631038666, + "learning_rate": 2.8891133491931493e-05, + "loss": 0.9683, + "mean_token_accuracy": 0.7199150025844574, + "num_tokens": 18243935.0, + "step": 2155 + }, + { + "entropy": 0.9749583452939987, + "epoch": 0.8624862486248625, + "grad_norm": 0.263253390789032, + "learning_rate": 2.884067114445036e-05, + "loss": 0.9411, + "mean_token_accuracy": 0.7275680005550385, + "num_tokens": 18252209.0, + "step": 2156 + }, + { + "entropy": 1.0012638121843338, + "epoch": 0.8628862886288629, + "grad_norm": 0.2764817774295807, + "learning_rate": 2.879034500874213e-05, + "loss": 0.9755, + "mean_token_accuracy": 0.7161267846822739, + "num_tokens": 18260485.0, + "step": 2157 + }, + { + "entropy": 0.9523182362318039, + "epoch": 0.8632863286328633, + "grad_norm": 0.27631309628486633, + "learning_rate": 2.8740155169270512e-05, + "loss": 0.958, + "mean_token_accuracy": 0.7227918207645416, + "num_tokens": 18268859.0, + "step": 2158 + }, + { + "entropy": 1.0415673404932022, + "epoch": 0.8636863686368637, + "grad_norm": 0.2767874598503113, + "learning_rate": 2.8690101710270324e-05, + "loss": 1.0535, + "mean_token_accuracy": 0.7011487782001495, + "num_tokens": 18277221.0, + "step": 2159 + }, + { + "entropy": 0.9915614873170853, + "epoch": 0.8640864086408641, + "grad_norm": 0.29252851009368896, + "learning_rate": 2.864018471574763e-05, + "loss": 0.9861, + "mean_token_accuracy": 0.7134775966405869, + "num_tokens": 18284946.0, + "step": 2160 + }, + { + "entropy": 0.9464387148618698, + "epoch": 0.8644864486448645, + "grad_norm": 0.2662280201911926, + "learning_rate": 2.859040426947934e-05, + "loss": 0.965, + "mean_token_accuracy": 0.7173628509044647, + "num_tokens": 18293553.0, + "step": 2161 + }, + { + "entropy": 0.9689690619707108, + "epoch": 0.8648864886488649, + "grad_norm": 0.26857635378837585, + "learning_rate": 2.8540760455013282e-05, + "loss": 0.96, + "mean_token_accuracy": 0.7226193845272064, + "num_tokens": 18302193.0, + "step": 2162 + }, + { + "entropy": 1.0029151290655136, + "epoch": 0.8652865286528653, + "grad_norm": 0.2796790599822998, + "learning_rate": 2.849125335566791e-05, + "loss": 0.97, + "mean_token_accuracy": 0.7172962576150894, + "num_tokens": 18310507.0, + "step": 2163 + }, + { + "entropy": 0.9760487377643585, + "epoch": 0.8656865686568657, + "grad_norm": 0.2637076675891876, + "learning_rate": 2.8441883054532293e-05, + "loss": 0.9546, + "mean_token_accuracy": 0.7192352414131165, + "num_tokens": 18318970.0, + "step": 2164 + }, + { + "entropy": 0.9584449529647827, + "epoch": 0.8660866086608661, + "grad_norm": 0.26355016231536865, + "learning_rate": 2.8392649634465835e-05, + "loss": 0.9509, + "mean_token_accuracy": 0.7185089588165283, + "num_tokens": 18327700.0, + "step": 2165 + }, + { + "entropy": 1.049717366695404, + "epoch": 0.8664866486648665, + "grad_norm": 0.29061606526374817, + "learning_rate": 2.834355317809824e-05, + "loss": 1.0225, + "mean_token_accuracy": 0.7090509980916977, + "num_tokens": 18335894.0, + "step": 2166 + }, + { + "entropy": 0.9913681596517563, + "epoch": 0.8668866886688669, + "grad_norm": 0.26392555236816406, + "learning_rate": 2.829459376782937e-05, + "loss": 0.9747, + "mean_token_accuracy": 0.7136086523532867, + "num_tokens": 18344529.0, + "step": 2167 + }, + { + "entropy": 0.9642865359783173, + "epoch": 0.8672867286728673, + "grad_norm": 0.25934305787086487, + "learning_rate": 2.824577148582901e-05, + "loss": 0.944, + "mean_token_accuracy": 0.726129800081253, + "num_tokens": 18353113.0, + "step": 2168 + }, + { + "entropy": 0.9423133283853531, + "epoch": 0.8676867686768677, + "grad_norm": 0.2827508747577667, + "learning_rate": 2.8197086414036894e-05, + "loss": 0.9541, + "mean_token_accuracy": 0.7242550104856491, + "num_tokens": 18361728.0, + "step": 2169 + }, + { + "entropy": 1.0420671701431274, + "epoch": 0.8680868086808681, + "grad_norm": 0.2840612530708313, + "learning_rate": 2.8148538634162398e-05, + "loss": 1.0169, + "mean_token_accuracy": 0.7035350203514099, + "num_tokens": 18369584.0, + "step": 2170 + }, + { + "entropy": 1.0695709139108658, + "epoch": 0.8684868486848685, + "grad_norm": 0.28362417221069336, + "learning_rate": 2.8100128227684468e-05, + "loss": 1.0471, + "mean_token_accuracy": 0.6974106878042221, + "num_tokens": 18377546.0, + "step": 2171 + }, + { + "entropy": 0.983172282576561, + "epoch": 0.8688868886888689, + "grad_norm": 0.2956814467906952, + "learning_rate": 2.8051855275851558e-05, + "loss": 0.9387, + "mean_token_accuracy": 0.7204053848981857, + "num_tokens": 18385560.0, + "step": 2172 + }, + { + "entropy": 0.932046964764595, + "epoch": 0.8692869286928693, + "grad_norm": 0.26182591915130615, + "learning_rate": 2.800371985968141e-05, + "loss": 0.9499, + "mean_token_accuracy": 0.7206301689147949, + "num_tokens": 18394576.0, + "step": 2173 + }, + { + "entropy": 1.0572405755519867, + "epoch": 0.8696869686968697, + "grad_norm": 0.28390926122665405, + "learning_rate": 2.7955722059960894e-05, + "loss": 1.0569, + "mean_token_accuracy": 0.6974535435438156, + "num_tokens": 18402577.0, + "step": 2174 + }, + { + "entropy": 0.9815231263637543, + "epoch": 0.8700870087008701, + "grad_norm": 0.2790232002735138, + "learning_rate": 2.790786195724595e-05, + "loss": 0.9971, + "mean_token_accuracy": 0.7073682099580765, + "num_tokens": 18410618.0, + "step": 2175 + }, + { + "entropy": 0.9327611476182938, + "epoch": 0.8704870487048705, + "grad_norm": 0.25862571597099304, + "learning_rate": 2.78601396318614e-05, + "loss": 0.9226, + "mean_token_accuracy": 0.7243570238351822, + "num_tokens": 18419380.0, + "step": 2176 + }, + { + "entropy": 0.9254949539899826, + "epoch": 0.8708870887088709, + "grad_norm": 0.26269569993019104, + "learning_rate": 2.7812555163900844e-05, + "loss": 0.9325, + "mean_token_accuracy": 0.7311051785945892, + "num_tokens": 18428025.0, + "step": 2177 + }, + { + "entropy": 0.9135541617870331, + "epoch": 0.8712871287128713, + "grad_norm": 0.25966623425483704, + "learning_rate": 2.776510863322654e-05, + "loss": 0.8852, + "mean_token_accuracy": 0.7377202808856964, + "num_tokens": 18436802.0, + "step": 2178 + }, + { + "entropy": 0.89944988489151, + "epoch": 0.8716871687168717, + "grad_norm": 0.2622186243534088, + "learning_rate": 2.7717800119469177e-05, + "loss": 0.8889, + "mean_token_accuracy": 0.7363101989030838, + "num_tokens": 18445611.0, + "step": 2179 + }, + { + "entropy": 0.9706479012966156, + "epoch": 0.8720872087208721, + "grad_norm": 0.26975756883621216, + "learning_rate": 2.7670629702027834e-05, + "loss": 0.9472, + "mean_token_accuracy": 0.7252829074859619, + "num_tokens": 18454204.0, + "step": 2180 + }, + { + "entropy": 1.0048835277557373, + "epoch": 0.8724872487248725, + "grad_norm": 0.2846691608428955, + "learning_rate": 2.762359746006985e-05, + "loss": 0.9811, + "mean_token_accuracy": 0.7185728698968887, + "num_tokens": 18462461.0, + "step": 2181 + }, + { + "entropy": 0.9324361234903336, + "epoch": 0.8728872887288729, + "grad_norm": 0.2700250446796417, + "learning_rate": 2.757670347253064e-05, + "loss": 0.9294, + "mean_token_accuracy": 0.7228376716375351, + "num_tokens": 18471452.0, + "step": 2182 + }, + { + "entropy": 0.9833199083805084, + "epoch": 0.8732873287328733, + "grad_norm": 0.2866455316543579, + "learning_rate": 2.7529947818113573e-05, + "loss": 0.9853, + "mean_token_accuracy": 0.7095342129468918, + "num_tokens": 18480025.0, + "step": 2183 + }, + { + "entropy": 0.9470414966344833, + "epoch": 0.8736873687368737, + "grad_norm": 0.272357314825058, + "learning_rate": 2.7483330575289857e-05, + "loss": 0.9394, + "mean_token_accuracy": 0.7261380702257156, + "num_tokens": 18488396.0, + "step": 2184 + }, + { + "entropy": 1.0140750706195831, + "epoch": 0.8740874087408741, + "grad_norm": 0.2741706073284149, + "learning_rate": 2.7436851822298366e-05, + "loss": 1.0304, + "mean_token_accuracy": 0.7084567248821259, + "num_tokens": 18497139.0, + "step": 2185 + }, + { + "entropy": 1.0197201818227768, + "epoch": 0.8744874487448745, + "grad_norm": 0.29986757040023804, + "learning_rate": 2.739051163714563e-05, + "loss": 1.0247, + "mean_token_accuracy": 0.7002385556697845, + "num_tokens": 18504666.0, + "step": 2186 + }, + { + "entropy": 0.9356887936592102, + "epoch": 0.8748874887488749, + "grad_norm": 0.2690699100494385, + "learning_rate": 2.7344310097605557e-05, + "loss": 0.9244, + "mean_token_accuracy": 0.730567991733551, + "num_tokens": 18513057.0, + "step": 2187 + }, + { + "entropy": 1.0004611015319824, + "epoch": 0.8752875287528753, + "grad_norm": 0.2861672639846802, + "learning_rate": 2.729824728121934e-05, + "loss": 0.9907, + "mean_token_accuracy": 0.7126326858997345, + "num_tokens": 18521387.0, + "step": 2188 + }, + { + "entropy": 0.9684345722198486, + "epoch": 0.8756875687568757, + "grad_norm": 0.2717658579349518, + "learning_rate": 2.7252323265295404e-05, + "loss": 0.956, + "mean_token_accuracy": 0.7117027640342712, + "num_tokens": 18529873.0, + "step": 2189 + }, + { + "entropy": 0.9778385311365128, + "epoch": 0.8760876087608761, + "grad_norm": 0.27245545387268066, + "learning_rate": 2.720653812690917e-05, + "loss": 0.9745, + "mean_token_accuracy": 0.709427073597908, + "num_tokens": 18538818.0, + "step": 2190 + }, + { + "entropy": 0.9669632613658905, + "epoch": 0.8764876487648765, + "grad_norm": 0.26816239953041077, + "learning_rate": 2.7160891942903045e-05, + "loss": 0.9804, + "mean_token_accuracy": 0.7194944471120834, + "num_tokens": 18547313.0, + "step": 2191 + }, + { + "entropy": 1.0532904863357544, + "epoch": 0.8768876887688769, + "grad_norm": 0.2953072786331177, + "learning_rate": 2.7115384789886156e-05, + "loss": 1.0531, + "mean_token_accuracy": 0.6962666511535645, + "num_tokens": 18554975.0, + "step": 2192 + }, + { + "entropy": 1.0173091888427734, + "epoch": 0.8772877287728773, + "grad_norm": 0.2817433774471283, + "learning_rate": 2.707001674423434e-05, + "loss": 1.019, + "mean_token_accuracy": 0.7059982717037201, + "num_tokens": 18563364.0, + "step": 2193 + }, + { + "entropy": 0.9924028366804123, + "epoch": 0.8776877687768777, + "grad_norm": 0.2706199288368225, + "learning_rate": 2.7024787882089886e-05, + "loss": 0.9907, + "mean_token_accuracy": 0.7131213247776031, + "num_tokens": 18571551.0, + "step": 2194 + }, + { + "entropy": 0.985996350646019, + "epoch": 0.8780878087808781, + "grad_norm": 0.2814847528934479, + "learning_rate": 2.697969827936163e-05, + "loss": 0.9794, + "mean_token_accuracy": 0.7160655558109283, + "num_tokens": 18579751.0, + "step": 2195 + }, + { + "entropy": 0.9783134758472443, + "epoch": 0.8784878487848785, + "grad_norm": 0.26095759868621826, + "learning_rate": 2.6934748011724565e-05, + "loss": 0.9828, + "mean_token_accuracy": 0.7168547958135605, + "num_tokens": 18588714.0, + "step": 2196 + }, + { + "entropy": 0.9713470190763474, + "epoch": 0.8788878887888789, + "grad_norm": 0.2688662111759186, + "learning_rate": 2.6889937154619877e-05, + "loss": 0.9438, + "mean_token_accuracy": 0.7216840088367462, + "num_tokens": 18597164.0, + "step": 2197 + }, + { + "entropy": 0.9978623539209366, + "epoch": 0.8792879287928793, + "grad_norm": 0.2630065977573395, + "learning_rate": 2.6845265783254747e-05, + "loss": 0.9872, + "mean_token_accuracy": 0.7164197117090225, + "num_tokens": 18605965.0, + "step": 2198 + }, + { + "entropy": 1.0386171787977219, + "epoch": 0.8796879687968797, + "grad_norm": 0.2761189341545105, + "learning_rate": 2.6800733972602305e-05, + "loss": 1.0229, + "mean_token_accuracy": 0.704839438199997, + "num_tokens": 18614234.0, + "step": 2199 + }, + { + "entropy": 0.9876759797334671, + "epoch": 0.8800880088008801, + "grad_norm": 0.2752560079097748, + "learning_rate": 2.675634179740143e-05, + "loss": 0.9794, + "mean_token_accuracy": 0.7118789851665497, + "num_tokens": 18622596.0, + "step": 2200 + }, + { + "entropy": 0.986650213599205, + "epoch": 0.8804880488048805, + "grad_norm": 0.2920258939266205, + "learning_rate": 2.6712089332156633e-05, + "loss": 0.9817, + "mean_token_accuracy": 0.7178271263837814, + "num_tokens": 18631279.0, + "step": 2201 + }, + { + "entropy": 0.9731039553880692, + "epoch": 0.8808880888088809, + "grad_norm": 0.9087457060813904, + "learning_rate": 2.666797665113796e-05, + "loss": 0.9683, + "mean_token_accuracy": 0.7138585150241852, + "num_tokens": 18639804.0, + "step": 2202 + }, + { + "entropy": 0.9395751804113388, + "epoch": 0.8812881288128813, + "grad_norm": 0.2754904627799988, + "learning_rate": 2.6624003828380817e-05, + "loss": 0.9355, + "mean_token_accuracy": 0.7262525409460068, + "num_tokens": 18648139.0, + "step": 2203 + }, + { + "entropy": 0.9982647448778152, + "epoch": 0.8816881688168817, + "grad_norm": 0.27934756875038147, + "learning_rate": 2.658017093768595e-05, + "loss": 0.9875, + "mean_token_accuracy": 0.716104120016098, + "num_tokens": 18656147.0, + "step": 2204 + }, + { + "entropy": 0.9956264644861221, + "epoch": 0.8820882088208821, + "grad_norm": 0.2969841957092285, + "learning_rate": 2.6536478052619217e-05, + "loss": 0.9816, + "mean_token_accuracy": 0.7126675695180893, + "num_tokens": 18664238.0, + "step": 2205 + }, + { + "entropy": 0.9727550446987152, + "epoch": 0.8824882488248825, + "grad_norm": 0.2762468755245209, + "learning_rate": 2.64929252465115e-05, + "loss": 0.9846, + "mean_token_accuracy": 0.7177352160215378, + "num_tokens": 18672564.0, + "step": 2206 + }, + { + "entropy": 0.9850186109542847, + "epoch": 0.8828882888288829, + "grad_norm": 0.27210354804992676, + "learning_rate": 2.6449512592458565e-05, + "loss": 0.9741, + "mean_token_accuracy": 0.720297172665596, + "num_tokens": 18681004.0, + "step": 2207 + }, + { + "entropy": 0.9614686220884323, + "epoch": 0.8832883288328833, + "grad_norm": 0.264845073223114, + "learning_rate": 2.6406240163320994e-05, + "loss": 0.9533, + "mean_token_accuracy": 0.722334548830986, + "num_tokens": 18689885.0, + "step": 2208 + }, + { + "entropy": 1.03648242354393, + "epoch": 0.8836883688368837, + "grad_norm": 0.2741720676422119, + "learning_rate": 2.6363108031724017e-05, + "loss": 1.0362, + "mean_token_accuracy": 0.7057934403419495, + "num_tokens": 18698555.0, + "step": 2209 + }, + { + "entropy": 1.02606800198555, + "epoch": 0.8840884088408841, + "grad_norm": 0.277656227350235, + "learning_rate": 2.6320116270057383e-05, + "loss": 1.0237, + "mean_token_accuracy": 0.7055680453777313, + "num_tokens": 18706741.0, + "step": 2210 + }, + { + "entropy": 0.9356466084718704, + "epoch": 0.8844884488448845, + "grad_norm": 0.26150599122047424, + "learning_rate": 2.6277264950475245e-05, + "loss": 0.9447, + "mean_token_accuracy": 0.7282193601131439, + "num_tokens": 18716143.0, + "step": 2211 + }, + { + "entropy": 1.056101143360138, + "epoch": 0.8848884888488849, + "grad_norm": 0.2866343557834625, + "learning_rate": 2.6234554144896108e-05, + "loss": 1.0791, + "mean_token_accuracy": 0.6931745857000351, + "num_tokens": 18724514.0, + "step": 2212 + }, + { + "entropy": 0.9648923426866531, + "epoch": 0.8852885288528853, + "grad_norm": 0.2688848376274109, + "learning_rate": 2.6191983925002543e-05, + "loss": 0.9715, + "mean_token_accuracy": 0.7164793908596039, + "num_tokens": 18733181.0, + "step": 2213 + }, + { + "entropy": 1.062942996621132, + "epoch": 0.8856885688568857, + "grad_norm": 0.29393380880355835, + "learning_rate": 2.6149554362241306e-05, + "loss": 1.0713, + "mean_token_accuracy": 0.7015903443098068, + "num_tokens": 18741344.0, + "step": 2214 + }, + { + "entropy": 0.9797134846448898, + "epoch": 0.8860886088608861, + "grad_norm": 0.2761925458908081, + "learning_rate": 2.6107265527822995e-05, + "loss": 0.985, + "mean_token_accuracy": 0.7158007919788361, + "num_tokens": 18749851.0, + "step": 2215 + }, + { + "entropy": 1.0139793455600739, + "epoch": 0.8864886488648865, + "grad_norm": 0.268429160118103, + "learning_rate": 2.6065117492721998e-05, + "loss": 1.005, + "mean_token_accuracy": 0.7136865556240082, + "num_tokens": 18758598.0, + "step": 2216 + }, + { + "entropy": 1.0130019783973694, + "epoch": 0.8868886888688869, + "grad_norm": 0.2725018560886383, + "learning_rate": 2.6023110327676487e-05, + "loss": 1.0144, + "mean_token_accuracy": 0.7079477310180664, + "num_tokens": 18767308.0, + "step": 2217 + }, + { + "entropy": 0.932479053735733, + "epoch": 0.8872887288728872, + "grad_norm": 0.26445889472961426, + "learning_rate": 2.598124410318813e-05, + "loss": 0.9335, + "mean_token_accuracy": 0.7274748682975769, + "num_tokens": 18775415.0, + "step": 2218 + }, + { + "entropy": 0.87993124127388, + "epoch": 0.8876887688768876, + "grad_norm": 0.2508007884025574, + "learning_rate": 2.5939518889522102e-05, + "loss": 0.8472, + "mean_token_accuracy": 0.7505761086940765, + "num_tokens": 18784389.0, + "step": 2219 + }, + { + "entropy": 1.043866142630577, + "epoch": 0.888088808880888, + "grad_norm": 0.27530816197395325, + "learning_rate": 2.5897934756706876e-05, + "loss": 1.0335, + "mean_token_accuracy": 0.7001327872276306, + "num_tokens": 18792928.0, + "step": 2220 + }, + { + "entropy": 1.0333503186702728, + "epoch": 0.8884888488848884, + "grad_norm": 0.27308905124664307, + "learning_rate": 2.585649177453417e-05, + "loss": 1.041, + "mean_token_accuracy": 0.7032440900802612, + "num_tokens": 18801503.0, + "step": 2221 + }, + { + "entropy": 0.9919215440750122, + "epoch": 0.8888888888888888, + "grad_norm": 0.26338210701942444, + "learning_rate": 2.5815190012558794e-05, + "loss": 0.9699, + "mean_token_accuracy": 0.7216924577951431, + "num_tokens": 18810316.0, + "step": 2222 + }, + { + "entropy": 1.0359559506177902, + "epoch": 0.8892889288928892, + "grad_norm": 0.29428067803382874, + "learning_rate": 2.5774029540098575e-05, + "loss": 1.0227, + "mean_token_accuracy": 0.713511124253273, + "num_tokens": 18819367.0, + "step": 2223 + }, + { + "entropy": 1.0021261870861053, + "epoch": 0.8896889688968896, + "grad_norm": 0.2801613509654999, + "learning_rate": 2.5733010426234155e-05, + "loss": 0.9997, + "mean_token_accuracy": 0.7141763269901276, + "num_tokens": 18827438.0, + "step": 2224 + }, + { + "entropy": 0.9627701044082642, + "epoch": 0.89008900890089, + "grad_norm": 0.2627711594104767, + "learning_rate": 2.5692132739808966e-05, + "loss": 0.9481, + "mean_token_accuracy": 0.7173673510551453, + "num_tokens": 18836180.0, + "step": 2225 + }, + { + "entropy": 1.069669246673584, + "epoch": 0.8904890489048904, + "grad_norm": 0.278899610042572, + "learning_rate": 2.5651396549429086e-05, + "loss": 1.0737, + "mean_token_accuracy": 0.6924329102039337, + "num_tokens": 18844442.0, + "step": 2226 + }, + { + "entropy": 0.9717269390821457, + "epoch": 0.8908890889088908, + "grad_norm": 0.2696807384490967, + "learning_rate": 2.561080192346307e-05, + "loss": 0.9448, + "mean_token_accuracy": 0.7184331566095352, + "num_tokens": 18852872.0, + "step": 2227 + }, + { + "entropy": 0.9667552411556244, + "epoch": 0.8912891289128912, + "grad_norm": 0.2702970802783966, + "learning_rate": 2.5570348930041955e-05, + "loss": 0.9628, + "mean_token_accuracy": 0.7182015180587769, + "num_tokens": 18861143.0, + "step": 2228 + }, + { + "entropy": 1.0737375617027283, + "epoch": 0.8916891689168917, + "grad_norm": 0.3116845488548279, + "learning_rate": 2.5530037637059e-05, + "loss": 1.0372, + "mean_token_accuracy": 0.7034114301204681, + "num_tokens": 18869658.0, + "step": 2229 + }, + { + "entropy": 0.9506860226392746, + "epoch": 0.892089208920892, + "grad_norm": 0.26155102252960205, + "learning_rate": 2.5489868112169714e-05, + "loss": 0.962, + "mean_token_accuracy": 0.7216337621212006, + "num_tokens": 18878905.0, + "step": 2230 + }, + { + "entropy": 1.0561456680297852, + "epoch": 0.8924892489248925, + "grad_norm": 0.28905296325683594, + "learning_rate": 2.5449840422791597e-05, + "loss": 1.0526, + "mean_token_accuracy": 0.7033811956644058, + "num_tokens": 18887172.0, + "step": 2231 + }, + { + "entropy": 1.0258281230926514, + "epoch": 0.8928892889288929, + "grad_norm": 0.2898278832435608, + "learning_rate": 2.5409954636104155e-05, + "loss": 1.0517, + "mean_token_accuracy": 0.7023575901985168, + "num_tokens": 18895121.0, + "step": 2232 + }, + { + "entropy": 1.0229663848876953, + "epoch": 0.8932893289328933, + "grad_norm": 0.2888786494731903, + "learning_rate": 2.5370210819048727e-05, + "loss": 1.0131, + "mean_token_accuracy": 0.7120652347803116, + "num_tokens": 18902964.0, + "step": 2233 + }, + { + "entropy": 0.9883149564266205, + "epoch": 0.8936893689368937, + "grad_norm": 0.27499204874038696, + "learning_rate": 2.533060903832838e-05, + "loss": 0.9839, + "mean_token_accuracy": 0.7089062333106995, + "num_tokens": 18911544.0, + "step": 2234 + }, + { + "entropy": 0.9663169384002686, + "epoch": 0.8940894089408941, + "grad_norm": 0.26842859387397766, + "learning_rate": 2.5291149360407786e-05, + "loss": 0.9655, + "mean_token_accuracy": 0.7180008888244629, + "num_tokens": 18920143.0, + "step": 2235 + }, + { + "entropy": 0.9023353457450867, + "epoch": 0.8944894489448945, + "grad_norm": 0.2601992189884186, + "learning_rate": 2.5251831851513114e-05, + "loss": 0.8981, + "mean_token_accuracy": 0.7359243631362915, + "num_tokens": 18928799.0, + "step": 2236 + }, + { + "entropy": 0.9708894640207291, + "epoch": 0.8948894889488949, + "grad_norm": 0.2788752317428589, + "learning_rate": 2.5212656577631967e-05, + "loss": 0.9441, + "mean_token_accuracy": 0.7214881479740143, + "num_tokens": 18936978.0, + "step": 2237 + }, + { + "entropy": 1.0582826137542725, + "epoch": 0.8952895289528953, + "grad_norm": 0.2946871817111969, + "learning_rate": 2.5173623604513196e-05, + "loss": 1.0374, + "mean_token_accuracy": 0.7023088335990906, + "num_tokens": 18944615.0, + "step": 2238 + }, + { + "entropy": 0.9690254032611847, + "epoch": 0.8956895689568957, + "grad_norm": 0.27072715759277344, + "learning_rate": 2.513473299766685e-05, + "loss": 0.947, + "mean_token_accuracy": 0.7224423438310623, + "num_tokens": 18952922.0, + "step": 2239 + }, + { + "entropy": 0.9651824086904526, + "epoch": 0.8960896089608961, + "grad_norm": 0.2778373658657074, + "learning_rate": 2.5095984822364005e-05, + "loss": 0.9374, + "mean_token_accuracy": 0.7249150723218918, + "num_tokens": 18961109.0, + "step": 2240 + }, + { + "entropy": 1.0088869333267212, + "epoch": 0.8964896489648965, + "grad_norm": 0.2761286795139313, + "learning_rate": 2.505737914363672e-05, + "loss": 1.0065, + "mean_token_accuracy": 0.7075244188308716, + "num_tokens": 18969951.0, + "step": 2241 + }, + { + "entropy": 0.9694850146770477, + "epoch": 0.8968896889688969, + "grad_norm": 0.3218696117401123, + "learning_rate": 2.5018916026277905e-05, + "loss": 0.923, + "mean_token_accuracy": 0.7256114631891251, + "num_tokens": 18978028.0, + "step": 2242 + }, + { + "entropy": 1.0194066166877747, + "epoch": 0.8972897289728973, + "grad_norm": 0.2797492444515228, + "learning_rate": 2.4980595534841162e-05, + "loss": 0.9674, + "mean_token_accuracy": 0.7165752500295639, + "num_tokens": 18985872.0, + "step": 2243 + }, + { + "entropy": 0.99969682097435, + "epoch": 0.8976897689768977, + "grad_norm": 0.31677117943763733, + "learning_rate": 2.4942417733640792e-05, + "loss": 0.9827, + "mean_token_accuracy": 0.7128837704658508, + "num_tokens": 18994479.0, + "step": 2244 + }, + { + "entropy": 1.0108253061771393, + "epoch": 0.8980898089808981, + "grad_norm": 0.28153467178344727, + "learning_rate": 2.4904382686751544e-05, + "loss": 1.0113, + "mean_token_accuracy": 0.7091988027095795, + "num_tokens": 19002728.0, + "step": 2245 + }, + { + "entropy": 0.9804398417472839, + "epoch": 0.8984898489848985, + "grad_norm": 0.26510095596313477, + "learning_rate": 2.4866490458008593e-05, + "loss": 0.9774, + "mean_token_accuracy": 0.7151379138231277, + "num_tokens": 19011318.0, + "step": 2246 + }, + { + "entropy": 0.9866217970848083, + "epoch": 0.8988898889888989, + "grad_norm": 0.2682114541530609, + "learning_rate": 2.482874111100746e-05, + "loss": 1.0065, + "mean_token_accuracy": 0.7112932205200195, + "num_tokens": 19020252.0, + "step": 2247 + }, + { + "entropy": 0.9449676722288132, + "epoch": 0.8992899289928993, + "grad_norm": 0.2661628723144531, + "learning_rate": 2.4791134709103847e-05, + "loss": 0.9294, + "mean_token_accuracy": 0.7302011847496033, + "num_tokens": 19028873.0, + "step": 2248 + }, + { + "entropy": 0.9688703715801239, + "epoch": 0.8996899689968997, + "grad_norm": 0.7239289879798889, + "learning_rate": 2.475367131541351e-05, + "loss": 0.9929, + "mean_token_accuracy": 0.7183982878923416, + "num_tokens": 19037392.0, + "step": 2249 + }, + { + "entropy": 1.0238465517759323, + "epoch": 0.9000900090009001, + "grad_norm": 0.2872447073459625, + "learning_rate": 2.471635099281225e-05, + "loss": 1.0406, + "mean_token_accuracy": 0.6974342912435532, + "num_tokens": 19045887.0, + "step": 2250 + }, + { + "entropy": 0.9326125681400299, + "epoch": 0.9004900490049005, + "grad_norm": 0.26564764976501465, + "learning_rate": 2.4679173803935662e-05, + "loss": 0.9359, + "mean_token_accuracy": 0.728179857134819, + "num_tokens": 19054954.0, + "step": 2251 + }, + { + "entropy": 0.9703081697225571, + "epoch": 0.9008900890089009, + "grad_norm": 0.2698241174221039, + "learning_rate": 2.464213981117922e-05, + "loss": 0.9478, + "mean_token_accuracy": 0.721969798207283, + "num_tokens": 19063275.0, + "step": 2252 + }, + { + "entropy": 0.9978289753198624, + "epoch": 0.9012901290129013, + "grad_norm": 0.26937800645828247, + "learning_rate": 2.4605249076698024e-05, + "loss": 0.988, + "mean_token_accuracy": 0.7167630344629288, + "num_tokens": 19072271.0, + "step": 2253 + }, + { + "entropy": 1.0496771037578583, + "epoch": 0.9016901690169017, + "grad_norm": 0.2990860044956207, + "learning_rate": 2.4568501662406707e-05, + "loss": 1.0594, + "mean_token_accuracy": 0.6951986998319626, + "num_tokens": 19080241.0, + "step": 2254 + }, + { + "entropy": 0.9984473437070847, + "epoch": 0.9020902090209021, + "grad_norm": 0.708158552646637, + "learning_rate": 2.4531897629979386e-05, + "loss": 0.9879, + "mean_token_accuracy": 0.7197008579969406, + "num_tokens": 19088791.0, + "step": 2255 + }, + { + "entropy": 0.9427698403596878, + "epoch": 0.9024902490249025, + "grad_norm": 0.277446985244751, + "learning_rate": 2.4495437040849574e-05, + "loss": 0.9303, + "mean_token_accuracy": 0.7229152172803879, + "num_tokens": 19097256.0, + "step": 2256 + }, + { + "entropy": 0.9827859252691269, + "epoch": 0.9028902890289029, + "grad_norm": 0.2789204716682434, + "learning_rate": 2.445911995620999e-05, + "loss": 1.0079, + "mean_token_accuracy": 0.7090061157941818, + "num_tokens": 19105885.0, + "step": 2257 + }, + { + "entropy": 1.0108841061592102, + "epoch": 0.9032903290329033, + "grad_norm": 0.26850205659866333, + "learning_rate": 2.4422946437012536e-05, + "loss": 1.0236, + "mean_token_accuracy": 0.7104100286960602, + "num_tokens": 19115007.0, + "step": 2258 + }, + { + "entropy": 0.9476294368505478, + "epoch": 0.9036903690369037, + "grad_norm": 0.2696217894554138, + "learning_rate": 2.4386916543968142e-05, + "loss": 0.943, + "mean_token_accuracy": 0.7236519455909729, + "num_tokens": 19123454.0, + "step": 2259 + }, + { + "entropy": 0.9873618334531784, + "epoch": 0.9040904090409041, + "grad_norm": 0.28916820883750916, + "learning_rate": 2.4351030337546692e-05, + "loss": 0.9666, + "mean_token_accuracy": 0.7205635011196136, + "num_tokens": 19131701.0, + "step": 2260 + }, + { + "entropy": 0.9932132512331009, + "epoch": 0.9044904490449045, + "grad_norm": 0.2720109224319458, + "learning_rate": 2.431528787797692e-05, + "loss": 0.9913, + "mean_token_accuracy": 0.717472180724144, + "num_tokens": 19140111.0, + "step": 2261 + }, + { + "entropy": 0.9586424082517624, + "epoch": 0.9048904890489049, + "grad_norm": 0.2670898735523224, + "learning_rate": 2.4279689225246332e-05, + "loss": 0.9482, + "mean_token_accuracy": 0.7239266484975815, + "num_tokens": 19148767.0, + "step": 2262 + }, + { + "entropy": 1.0042395889759064, + "epoch": 0.9052905290529053, + "grad_norm": 0.2839438021183014, + "learning_rate": 2.424423443910105e-05, + "loss": 0.9855, + "mean_token_accuracy": 0.7169470191001892, + "num_tokens": 19157452.0, + "step": 2263 + }, + { + "entropy": 0.9648073464632034, + "epoch": 0.9056905690569057, + "grad_norm": 0.2718361020088196, + "learning_rate": 2.4208923579045713e-05, + "loss": 0.9433, + "mean_token_accuracy": 0.7234213948249817, + "num_tokens": 19165644.0, + "step": 2264 + }, + { + "entropy": 0.9273928254842758, + "epoch": 0.9060906090609061, + "grad_norm": 0.2682346999645233, + "learning_rate": 2.4173756704343466e-05, + "loss": 0.9151, + "mean_token_accuracy": 0.7302965372800827, + "num_tokens": 19174249.0, + "step": 2265 + }, + { + "entropy": 0.9811443537473679, + "epoch": 0.9064906490649065, + "grad_norm": 0.30159369111061096, + "learning_rate": 2.413873387401579e-05, + "loss": 0.9816, + "mean_token_accuracy": 0.7216386198997498, + "num_tokens": 19182338.0, + "step": 2266 + }, + { + "entropy": 1.0018774569034576, + "epoch": 0.9068906890689069, + "grad_norm": 0.2812402546405792, + "learning_rate": 2.4103855146842362e-05, + "loss": 0.9969, + "mean_token_accuracy": 0.7099272310733795, + "num_tokens": 19190458.0, + "step": 2267 + }, + { + "entropy": 1.0026495605707169, + "epoch": 0.9072907290729073, + "grad_norm": 0.27345219254493713, + "learning_rate": 2.4069120581361058e-05, + "loss": 0.989, + "mean_token_accuracy": 0.7142843753099442, + "num_tokens": 19198874.0, + "step": 2268 + }, + { + "entropy": 0.921228438615799, + "epoch": 0.9076907690769077, + "grad_norm": 0.24714891612529755, + "learning_rate": 2.4034530235867763e-05, + "loss": 0.9083, + "mean_token_accuracy": 0.7291627377271652, + "num_tokens": 19208150.0, + "step": 2269 + }, + { + "entropy": 0.9894952327013016, + "epoch": 0.9080908090809081, + "grad_norm": 0.2700880765914917, + "learning_rate": 2.4000084168416354e-05, + "loss": 0.9998, + "mean_token_accuracy": 0.7132097780704498, + "num_tokens": 19216873.0, + "step": 2270 + }, + { + "entropy": 1.0085628032684326, + "epoch": 0.9084908490849085, + "grad_norm": 0.2816976010799408, + "learning_rate": 2.3965782436818565e-05, + "loss": 1.0359, + "mean_token_accuracy": 0.7083477526903152, + "num_tokens": 19225197.0, + "step": 2271 + }, + { + "entropy": 0.9701843708753586, + "epoch": 0.9088908890889089, + "grad_norm": 0.26289743185043335, + "learning_rate": 2.3931625098643836e-05, + "loss": 0.9496, + "mean_token_accuracy": 0.7221592664718628, + "num_tokens": 19234125.0, + "step": 2272 + }, + { + "entropy": 0.9146246761083603, + "epoch": 0.9092909290929093, + "grad_norm": 0.2602092921733856, + "learning_rate": 2.3897612211219303e-05, + "loss": 0.902, + "mean_token_accuracy": 0.7362543046474457, + "num_tokens": 19243135.0, + "step": 2273 + }, + { + "entropy": 0.8980617672204971, + "epoch": 0.9096909690969097, + "grad_norm": 0.2603513300418854, + "learning_rate": 2.386374383162967e-05, + "loss": 0.899, + "mean_token_accuracy": 0.7336671501398087, + "num_tokens": 19252038.0, + "step": 2274 + }, + { + "entropy": 1.0436131060123444, + "epoch": 0.9100910091009101, + "grad_norm": 0.29944920539855957, + "learning_rate": 2.3830020016717115e-05, + "loss": 1.0393, + "mean_token_accuracy": 0.6970981061458588, + "num_tokens": 19259950.0, + "step": 2275 + }, + { + "entropy": 1.0258405208587646, + "epoch": 0.9104910491049105, + "grad_norm": 0.27880239486694336, + "learning_rate": 2.3796440823081167e-05, + "loss": 1.0354, + "mean_token_accuracy": 0.7028295993804932, + "num_tokens": 19268453.0, + "step": 2276 + }, + { + "entropy": 0.9721348583698273, + "epoch": 0.9108910891089109, + "grad_norm": 0.26681485772132874, + "learning_rate": 2.3763006307078645e-05, + "loss": 0.9756, + "mean_token_accuracy": 0.7119433879852295, + "num_tokens": 19276707.0, + "step": 2277 + }, + { + "entropy": 1.017838940024376, + "epoch": 0.9112911291129113, + "grad_norm": 0.2884996831417084, + "learning_rate": 2.3729716524823526e-05, + "loss": 0.9922, + "mean_token_accuracy": 0.7157018482685089, + "num_tokens": 19284943.0, + "step": 2278 + }, + { + "entropy": 1.0605157166719437, + "epoch": 0.9116911691169117, + "grad_norm": 0.280300110578537, + "learning_rate": 2.369657153218693e-05, + "loss": 1.0486, + "mean_token_accuracy": 0.7065360695123672, + "num_tokens": 19293031.0, + "step": 2279 + }, + { + "entropy": 1.0589729994535446, + "epoch": 0.9120912091209121, + "grad_norm": 0.27290043234825134, + "learning_rate": 2.3663571384796957e-05, + "loss": 1.0459, + "mean_token_accuracy": 0.7018911391496658, + "num_tokens": 19301651.0, + "step": 2280 + }, + { + "entropy": 0.9826093465089798, + "epoch": 0.9124912491249125, + "grad_norm": 0.2680128812789917, + "learning_rate": 2.363071613803856e-05, + "loss": 0.9573, + "mean_token_accuracy": 0.7138935178518295, + "num_tokens": 19310086.0, + "step": 2281 + }, + { + "entropy": 0.971860259771347, + "epoch": 0.9128912891289129, + "grad_norm": 0.27887630462646484, + "learning_rate": 2.3598005847053554e-05, + "loss": 0.9886, + "mean_token_accuracy": 0.7163799405097961, + "num_tokens": 19318399.0, + "step": 2282 + }, + { + "entropy": 1.021589994430542, + "epoch": 0.9132913291329133, + "grad_norm": 0.26739901304244995, + "learning_rate": 2.3565440566740454e-05, + "loss": 0.9989, + "mean_token_accuracy": 0.7092142999172211, + "num_tokens": 19327156.0, + "step": 2283 + }, + { + "entropy": 0.9474208950996399, + "epoch": 0.9136913691369137, + "grad_norm": 0.2645983397960663, + "learning_rate": 2.353302035175441e-05, + "loss": 0.9137, + "mean_token_accuracy": 0.7302363365888596, + "num_tokens": 19335324.0, + "step": 2284 + }, + { + "entropy": 0.9661424905061722, + "epoch": 0.9140914091409141, + "grad_norm": 0.2717445194721222, + "learning_rate": 2.3500745256507096e-05, + "loss": 0.9452, + "mean_token_accuracy": 0.7226364761590958, + "num_tokens": 19343501.0, + "step": 2285 + }, + { + "entropy": 0.9596416503190994, + "epoch": 0.9144914491449145, + "grad_norm": 0.2595052719116211, + "learning_rate": 2.346861533516661e-05, + "loss": 0.9479, + "mean_token_accuracy": 0.7214115411043167, + "num_tokens": 19352435.0, + "step": 2286 + }, + { + "entropy": 0.9727799147367477, + "epoch": 0.9148914891489149, + "grad_norm": 0.43755877017974854, + "learning_rate": 2.3436630641657426e-05, + "loss": 0.954, + "mean_token_accuracy": 0.7276749312877655, + "num_tokens": 19360557.0, + "step": 2287 + }, + { + "entropy": 1.0018401890993118, + "epoch": 0.9152915291529153, + "grad_norm": 0.345107764005661, + "learning_rate": 2.340479122966028e-05, + "loss": 0.972, + "mean_token_accuracy": 0.7154461145401001, + "num_tokens": 19368917.0, + "step": 2288 + }, + { + "entropy": 1.012854054570198, + "epoch": 0.9156915691569157, + "grad_norm": 0.28347092866897583, + "learning_rate": 2.337309715261207e-05, + "loss": 1.0066, + "mean_token_accuracy": 0.7090090662240982, + "num_tokens": 19376882.0, + "step": 2289 + }, + { + "entropy": 0.9660994559526443, + "epoch": 0.9160916091609161, + "grad_norm": 0.2826177477836609, + "learning_rate": 2.334154846370578e-05, + "loss": 0.9565, + "mean_token_accuracy": 0.7186962813138962, + "num_tokens": 19385186.0, + "step": 2290 + }, + { + "entropy": 0.954737514257431, + "epoch": 0.9164916491649165, + "grad_norm": 0.27803346514701843, + "learning_rate": 2.3310145215890347e-05, + "loss": 0.9511, + "mean_token_accuracy": 0.7256408333778381, + "num_tokens": 19393487.0, + "step": 2291 + }, + { + "entropy": 0.8998585194349289, + "epoch": 0.916891689168917, + "grad_norm": 0.26049819588661194, + "learning_rate": 2.3278887461870674e-05, + "loss": 0.8833, + "mean_token_accuracy": 0.7403996884822845, + "num_tokens": 19402617.0, + "step": 2292 + }, + { + "entropy": 1.041173830628395, + "epoch": 0.9172917291729173, + "grad_norm": 0.2888355553150177, + "learning_rate": 2.324777525410744e-05, + "loss": 1.0644, + "mean_token_accuracy": 0.6953865587711334, + "num_tokens": 19410688.0, + "step": 2293 + }, + { + "entropy": 0.9584503918886185, + "epoch": 0.9176917691769176, + "grad_norm": 0.2723056972026825, + "learning_rate": 2.3216808644817065e-05, + "loss": 0.9493, + "mean_token_accuracy": 0.7216292470693588, + "num_tokens": 19419116.0, + "step": 2294 + }, + { + "entropy": 0.9782332926988602, + "epoch": 0.918091809180918, + "grad_norm": 0.27995359897613525, + "learning_rate": 2.3185987685971583e-05, + "loss": 0.9641, + "mean_token_accuracy": 0.7185875177383423, + "num_tokens": 19427020.0, + "step": 2295 + }, + { + "entropy": 0.9136113524436951, + "epoch": 0.9184918491849184, + "grad_norm": 0.25388288497924805, + "learning_rate": 2.31553124292986e-05, + "loss": 0.9257, + "mean_token_accuracy": 0.7280556410551071, + "num_tokens": 19436014.0, + "step": 2296 + }, + { + "entropy": 1.0487522184848785, + "epoch": 0.9188918891889188, + "grad_norm": 0.28332212567329407, + "learning_rate": 2.312478292628119e-05, + "loss": 1.0536, + "mean_token_accuracy": 0.708299070596695, + "num_tokens": 19444183.0, + "step": 2297 + }, + { + "entropy": 0.9704254120588303, + "epoch": 0.9192919291929192, + "grad_norm": 0.2901343107223511, + "learning_rate": 2.3094399228157802e-05, + "loss": 0.9713, + "mean_token_accuracy": 0.7209707945585251, + "num_tokens": 19453048.0, + "step": 2298 + }, + { + "entropy": 1.0115653425455093, + "epoch": 0.9196919691969196, + "grad_norm": 0.2903510332107544, + "learning_rate": 2.3064161385922155e-05, + "loss": 1.0413, + "mean_token_accuracy": 0.7076993882656097, + "num_tokens": 19461076.0, + "step": 2299 + }, + { + "entropy": 1.0295554995536804, + "epoch": 0.92009200920092, + "grad_norm": 0.2709880769252777, + "learning_rate": 2.30340694503232e-05, + "loss": 1.0323, + "mean_token_accuracy": 0.7053157240152359, + "num_tokens": 19469570.0, + "step": 2300 + }, + { + "entropy": 1.0212224423885345, + "epoch": 0.9204920492049204, + "grad_norm": 0.27854815125465393, + "learning_rate": 2.3004123471865e-05, + "loss": 1.0287, + "mean_token_accuracy": 0.6994395107030869, + "num_tokens": 19477832.0, + "step": 2301 + }, + { + "entropy": 0.9922874569892883, + "epoch": 0.9208920892089208, + "grad_norm": 0.264057993888855, + "learning_rate": 2.2974323500806645e-05, + "loss": 0.9669, + "mean_token_accuracy": 0.7249541133642197, + "num_tokens": 19486779.0, + "step": 2302 + }, + { + "entropy": 0.95318603515625, + "epoch": 0.9212921292129213, + "grad_norm": 0.26305684447288513, + "learning_rate": 2.2944669587162208e-05, + "loss": 0.9176, + "mean_token_accuracy": 0.7292317003011703, + "num_tokens": 19495514.0, + "step": 2303 + }, + { + "entropy": 0.9749800115823746, + "epoch": 0.9216921692169217, + "grad_norm": 0.26093053817749023, + "learning_rate": 2.2915161780700612e-05, + "loss": 0.9422, + "mean_token_accuracy": 0.7223565578460693, + "num_tokens": 19504188.0, + "step": 2304 + }, + { + "entropy": 0.912230372428894, + "epoch": 0.922092209220922, + "grad_norm": 0.2507255971431732, + "learning_rate": 2.2885800130945527e-05, + "loss": 0.898, + "mean_token_accuracy": 0.7360979169607162, + "num_tokens": 19513064.0, + "step": 2305 + }, + { + "entropy": 0.9483017772436142, + "epoch": 0.9224922492249225, + "grad_norm": 0.26613110303878784, + "learning_rate": 2.2856584687175384e-05, + "loss": 0.9589, + "mean_token_accuracy": 0.7217685580253601, + "num_tokens": 19521910.0, + "step": 2306 + }, + { + "entropy": 0.9551469832658768, + "epoch": 0.9228922892289229, + "grad_norm": 0.26871639490127563, + "learning_rate": 2.2827515498423204e-05, + "loss": 0.9429, + "mean_token_accuracy": 0.7248653769493103, + "num_tokens": 19530313.0, + "step": 2307 + }, + { + "entropy": 0.999338909983635, + "epoch": 0.9232923292329233, + "grad_norm": 0.403361976146698, + "learning_rate": 2.2798592613476548e-05, + "loss": 1.0084, + "mean_token_accuracy": 0.7132540941238403, + "num_tokens": 19538710.0, + "step": 2308 + }, + { + "entropy": 0.9814122468233109, + "epoch": 0.9236923692369237, + "grad_norm": 0.2683962881565094, + "learning_rate": 2.276981608087743e-05, + "loss": 0.9686, + "mean_token_accuracy": 0.720722883939743, + "num_tokens": 19546993.0, + "step": 2309 + }, + { + "entropy": 0.9732807576656342, + "epoch": 0.9240924092409241, + "grad_norm": 0.2812177836894989, + "learning_rate": 2.2741185948922253e-05, + "loss": 0.953, + "mean_token_accuracy": 0.7221063822507858, + "num_tokens": 19555783.0, + "step": 2310 + }, + { + "entropy": 1.0106874257326126, + "epoch": 0.9244924492449245, + "grad_norm": 0.2853434085845947, + "learning_rate": 2.2712702265661688e-05, + "loss": 0.9725, + "mean_token_accuracy": 0.7165887653827667, + "num_tokens": 19563966.0, + "step": 2311 + }, + { + "entropy": 0.943798765540123, + "epoch": 0.9248924892489249, + "grad_norm": 0.2996790409088135, + "learning_rate": 2.2684365078900597e-05, + "loss": 0.9275, + "mean_token_accuracy": 0.7276749759912491, + "num_tokens": 19572654.0, + "step": 2312 + }, + { + "entropy": 0.970567137002945, + "epoch": 0.9252925292529253, + "grad_norm": 0.2730634808540344, + "learning_rate": 2.2656174436198062e-05, + "loss": 0.9726, + "mean_token_accuracy": 0.7224336862564087, + "num_tokens": 19581333.0, + "step": 2313 + }, + { + "entropy": 0.9818492382764816, + "epoch": 0.9256925692569257, + "grad_norm": 0.26741307973861694, + "learning_rate": 2.2628130384867116e-05, + "loss": 0.9769, + "mean_token_accuracy": 0.7179026156663895, + "num_tokens": 19589951.0, + "step": 2314 + }, + { + "entropy": 1.0244228690862656, + "epoch": 0.9260926092609261, + "grad_norm": 0.2780371308326721, + "learning_rate": 2.260023297197483e-05, + "loss": 1.0343, + "mean_token_accuracy": 0.7006355226039886, + "num_tokens": 19598105.0, + "step": 2315 + }, + { + "entropy": 1.0183679014444351, + "epoch": 0.9264926492649265, + "grad_norm": 0.2769782841205597, + "learning_rate": 2.2572482244342112e-05, + "loss": 1.0175, + "mean_token_accuracy": 0.716263547539711, + "num_tokens": 19606544.0, + "step": 2316 + }, + { + "entropy": 0.9957336634397507, + "epoch": 0.9268926892689269, + "grad_norm": 0.27466949820518494, + "learning_rate": 2.254487824854376e-05, + "loss": 0.9907, + "mean_token_accuracy": 0.7170198857784271, + "num_tokens": 19614949.0, + "step": 2317 + }, + { + "entropy": 1.0327147841453552, + "epoch": 0.9272927292729273, + "grad_norm": 0.28644171357154846, + "learning_rate": 2.2517421030908222e-05, + "loss": 1.044, + "mean_token_accuracy": 0.7020446062088013, + "num_tokens": 19622997.0, + "step": 2318 + }, + { + "entropy": 1.038459062576294, + "epoch": 0.9276927692769277, + "grad_norm": 0.285683274269104, + "learning_rate": 2.2490110637517685e-05, + "loss": 1.0463, + "mean_token_accuracy": 0.7004373371601105, + "num_tokens": 19631749.0, + "step": 2319 + }, + { + "entropy": 0.9613916277885437, + "epoch": 0.9280928092809281, + "grad_norm": 0.25578829646110535, + "learning_rate": 2.2462947114207854e-05, + "loss": 0.9527, + "mean_token_accuracy": 0.7301425039768219, + "num_tokens": 19641303.0, + "step": 2320 + }, + { + "entropy": 0.9866868853569031, + "epoch": 0.9284928492849285, + "grad_norm": 0.26847824454307556, + "learning_rate": 2.2435930506567965e-05, + "loss": 0.9878, + "mean_token_accuracy": 0.7204152345657349, + "num_tokens": 19649782.0, + "step": 2321 + }, + { + "entropy": 0.9710914194583893, + "epoch": 0.9288928892889289, + "grad_norm": 0.26206812262535095, + "learning_rate": 2.2409060859940695e-05, + "loss": 0.9485, + "mean_token_accuracy": 0.7232882082462311, + "num_tokens": 19658626.0, + "step": 2322 + }, + { + "entropy": 1.0226665139198303, + "epoch": 0.9292929292929293, + "grad_norm": 0.2804519534111023, + "learning_rate": 2.238233821942204e-05, + "loss": 0.986, + "mean_token_accuracy": 0.7142560333013535, + "num_tokens": 19666757.0, + "step": 2323 + }, + { + "entropy": 0.9637891501188278, + "epoch": 0.9296929692969297, + "grad_norm": 0.2797803282737732, + "learning_rate": 2.2355762629861316e-05, + "loss": 0.9424, + "mean_token_accuracy": 0.7185655832290649, + "num_tokens": 19675497.0, + "step": 2324 + }, + { + "entropy": 0.9991123974323273, + "epoch": 0.9300930093009301, + "grad_norm": 0.27481308579444885, + "learning_rate": 2.2329334135860984e-05, + "loss": 0.995, + "mean_token_accuracy": 0.7101224809885025, + "num_tokens": 19683888.0, + "step": 2325 + }, + { + "entropy": 1.0052718073129654, + "epoch": 0.9304930493049305, + "grad_norm": 0.2845103442668915, + "learning_rate": 2.2303052781776664e-05, + "loss": 1.0042, + "mean_token_accuracy": 0.7074219584465027, + "num_tokens": 19692145.0, + "step": 2326 + }, + { + "entropy": 0.9744933843612671, + "epoch": 0.9308930893089309, + "grad_norm": 0.27387216687202454, + "learning_rate": 2.227691861171703e-05, + "loss": 0.959, + "mean_token_accuracy": 0.7219040244817734, + "num_tokens": 19700754.0, + "step": 2327 + }, + { + "entropy": 0.9641012251377106, + "epoch": 0.9312931293129313, + "grad_norm": 0.26637667417526245, + "learning_rate": 2.2250931669543738e-05, + "loss": 0.9535, + "mean_token_accuracy": 0.7160860002040863, + "num_tokens": 19709017.0, + "step": 2328 + }, + { + "entropy": 0.9845741540193558, + "epoch": 0.9316931693169317, + "grad_norm": 0.27426692843437195, + "learning_rate": 2.222509199887132e-05, + "loss": 0.9772, + "mean_token_accuracy": 0.7194301187992096, + "num_tokens": 19717625.0, + "step": 2329 + }, + { + "entropy": 0.9977894574403763, + "epoch": 0.9320932093209321, + "grad_norm": 0.2805306017398834, + "learning_rate": 2.219939964306716e-05, + "loss": 1.032, + "mean_token_accuracy": 0.7036940306425095, + "num_tokens": 19726276.0, + "step": 2330 + }, + { + "entropy": 0.9852121770381927, + "epoch": 0.9324932493249325, + "grad_norm": 0.2779141068458557, + "learning_rate": 2.217385464525139e-05, + "loss": 0.9711, + "mean_token_accuracy": 0.7175202965736389, + "num_tokens": 19734130.0, + "step": 2331 + }, + { + "entropy": 0.8873776495456696, + "epoch": 0.9328932893289329, + "grad_norm": 0.2514788508415222, + "learning_rate": 2.2148457048296855e-05, + "loss": 0.9002, + "mean_token_accuracy": 0.7339921742677689, + "num_tokens": 19743431.0, + "step": 2332 + }, + { + "entropy": 0.9724482893943787, + "epoch": 0.9332933293329333, + "grad_norm": 0.2966557741165161, + "learning_rate": 2.212320689482896e-05, + "loss": 0.9661, + "mean_token_accuracy": 0.715372234582901, + "num_tokens": 19751987.0, + "step": 2333 + }, + { + "entropy": 0.9664241522550583, + "epoch": 0.9336933693369337, + "grad_norm": 0.28058329224586487, + "learning_rate": 2.2098104227225697e-05, + "loss": 0.9684, + "mean_token_accuracy": 0.7205367088317871, + "num_tokens": 19760305.0, + "step": 2334 + }, + { + "entropy": 0.9810750633478165, + "epoch": 0.9340934093409341, + "grad_norm": 0.27565303444862366, + "learning_rate": 2.2073149087617506e-05, + "loss": 0.9809, + "mean_token_accuracy": 0.7130838334560394, + "num_tokens": 19768369.0, + "step": 2335 + }, + { + "entropy": 0.9952069073915482, + "epoch": 0.9344934493449345, + "grad_norm": 0.2863253057003021, + "learning_rate": 2.204834151788723e-05, + "loss": 0.9933, + "mean_token_accuracy": 0.7132820039987564, + "num_tokens": 19776780.0, + "step": 2336 + }, + { + "entropy": 0.9877703338861465, + "epoch": 0.9348934893489349, + "grad_norm": 0.2666737735271454, + "learning_rate": 2.2023681559670066e-05, + "loss": 0.9919, + "mean_token_accuracy": 0.7156518548727036, + "num_tokens": 19785530.0, + "step": 2337 + }, + { + "entropy": 0.9513733983039856, + "epoch": 0.9352935293529353, + "grad_norm": 0.26828715205192566, + "learning_rate": 2.1999169254353453e-05, + "loss": 0.9301, + "mean_token_accuracy": 0.7273254543542862, + "num_tokens": 19793764.0, + "step": 2338 + }, + { + "entropy": 1.0556580275297165, + "epoch": 0.9356935693569357, + "grad_norm": 0.28253740072250366, + "learning_rate": 2.1974804643076995e-05, + "loss": 1.0472, + "mean_token_accuracy": 0.706752598285675, + "num_tokens": 19802093.0, + "step": 2339 + }, + { + "entropy": 0.9990429878234863, + "epoch": 0.9360936093609361, + "grad_norm": 0.29421958327293396, + "learning_rate": 2.1950587766732432e-05, + "loss": 0.9721, + "mean_token_accuracy": 0.7191312462091446, + "num_tokens": 19810448.0, + "step": 2340 + }, + { + "entropy": 0.964095801115036, + "epoch": 0.9364936493649365, + "grad_norm": 0.2708730697631836, + "learning_rate": 2.192651866596361e-05, + "loss": 0.9627, + "mean_token_accuracy": 0.7125521749258041, + "num_tokens": 19818949.0, + "step": 2341 + }, + { + "entropy": 0.9399523735046387, + "epoch": 0.9368936893689369, + "grad_norm": 0.27227523922920227, + "learning_rate": 2.1902597381166288e-05, + "loss": 0.9104, + "mean_token_accuracy": 0.7366859614849091, + "num_tokens": 19827553.0, + "step": 2342 + }, + { + "entropy": 0.9567810297012329, + "epoch": 0.9372937293729373, + "grad_norm": 0.27436161041259766, + "learning_rate": 2.1878823952488174e-05, + "loss": 0.9465, + "mean_token_accuracy": 0.7260952144861221, + "num_tokens": 19835974.0, + "step": 2343 + }, + { + "entropy": 0.9675860404968262, + "epoch": 0.9376937693769377, + "grad_norm": 0.2518310546875, + "learning_rate": 2.1855198419828812e-05, + "loss": 0.9689, + "mean_token_accuracy": 0.7226860225200653, + "num_tokens": 19845547.0, + "step": 2344 + }, + { + "entropy": 1.0453436076641083, + "epoch": 0.9380938093809381, + "grad_norm": 0.299511581659317, + "learning_rate": 2.1831720822839536e-05, + "loss": 1.049, + "mean_token_accuracy": 0.6997034549713135, + "num_tokens": 19853066.0, + "step": 2345 + }, + { + "entropy": 0.9644649773836136, + "epoch": 0.9384938493849385, + "grad_norm": 0.264595091342926, + "learning_rate": 2.1808391200923413e-05, + "loss": 0.957, + "mean_token_accuracy": 0.7219135612249374, + "num_tokens": 19862224.0, + "step": 2346 + }, + { + "entropy": 1.0132753252983093, + "epoch": 0.9388938893889389, + "grad_norm": 0.37538883090019226, + "learning_rate": 2.1785209593235134e-05, + "loss": 0.9839, + "mean_token_accuracy": 0.7137714922428131, + "num_tokens": 19870806.0, + "step": 2347 + }, + { + "entropy": 0.9609867632389069, + "epoch": 0.9392939293929393, + "grad_norm": 0.27916139364242554, + "learning_rate": 2.1762176038680977e-05, + "loss": 0.9626, + "mean_token_accuracy": 0.7172022461891174, + "num_tokens": 19879461.0, + "step": 2348 + }, + { + "entropy": 1.0132823437452316, + "epoch": 0.9396939693969397, + "grad_norm": 0.2786515951156616, + "learning_rate": 2.173929057591874e-05, + "loss": 1.0221, + "mean_token_accuracy": 0.7039013057947159, + "num_tokens": 19888227.0, + "step": 2349 + }, + { + "entropy": 1.0180546641349792, + "epoch": 0.9400940094009401, + "grad_norm": 0.27992039918899536, + "learning_rate": 2.1716553243357698e-05, + "loss": 1.0204, + "mean_token_accuracy": 0.7067273855209351, + "num_tokens": 19896720.0, + "step": 2350 + }, + { + "entropy": 1.0408493876457214, + "epoch": 0.9404940494049405, + "grad_norm": 0.2790587246417999, + "learning_rate": 2.169396407915849e-05, + "loss": 1.0501, + "mean_token_accuracy": 0.705540657043457, + "num_tokens": 19905013.0, + "step": 2351 + }, + { + "entropy": 1.0345445573329926, + "epoch": 0.9408940894089409, + "grad_norm": 0.273943156003952, + "learning_rate": 2.167152312123308e-05, + "loss": 1.0173, + "mean_token_accuracy": 0.7121898382902145, + "num_tokens": 19913538.0, + "step": 2352 + }, + { + "entropy": 0.9663247913122177, + "epoch": 0.9412941294129413, + "grad_norm": 0.2653229534626007, + "learning_rate": 2.1649230407244698e-05, + "loss": 0.9419, + "mean_token_accuracy": 0.726273849606514, + "num_tokens": 19922256.0, + "step": 2353 + }, + { + "entropy": 0.9664977788925171, + "epoch": 0.9416941694169417, + "grad_norm": 0.2674228549003601, + "learning_rate": 2.1627085974607782e-05, + "loss": 0.9565, + "mean_token_accuracy": 0.7184318602085114, + "num_tokens": 19931171.0, + "step": 2354 + }, + { + "entropy": 0.9625849276781082, + "epoch": 0.9420942094209421, + "grad_norm": 0.2613987922668457, + "learning_rate": 2.1605089860487907e-05, + "loss": 0.9363, + "mean_token_accuracy": 0.7207747399806976, + "num_tokens": 19939833.0, + "step": 2355 + }, + { + "entropy": 0.9748395383358002, + "epoch": 0.9424942494249425, + "grad_norm": 0.28217843174934387, + "learning_rate": 2.1583242101801697e-05, + "loss": 0.974, + "mean_token_accuracy": 0.7144483029842377, + "num_tokens": 19948227.0, + "step": 2356 + }, + { + "entropy": 0.9736945480108261, + "epoch": 0.9428942894289429, + "grad_norm": 0.26603707671165466, + "learning_rate": 2.1561542735216794e-05, + "loss": 0.9388, + "mean_token_accuracy": 0.7239621728658676, + "num_tokens": 19956596.0, + "step": 2357 + }, + { + "entropy": 0.960863932967186, + "epoch": 0.9432943294329433, + "grad_norm": 0.26408496499061584, + "learning_rate": 2.153999179715183e-05, + "loss": 0.9218, + "mean_token_accuracy": 0.7286081165075302, + "num_tokens": 19964970.0, + "step": 2358 + }, + { + "entropy": 1.0230866521596909, + "epoch": 0.9436943694369437, + "grad_norm": 0.28705504536628723, + "learning_rate": 2.151858932377627e-05, + "loss": 1.0234, + "mean_token_accuracy": 0.713117703795433, + "num_tokens": 19973359.0, + "step": 2359 + }, + { + "entropy": 0.9959753602743149, + "epoch": 0.9440944094409441, + "grad_norm": 0.2832450866699219, + "learning_rate": 2.1497335351010447e-05, + "loss": 0.9769, + "mean_token_accuracy": 0.7197229862213135, + "num_tokens": 19981896.0, + "step": 2360 + }, + { + "entropy": 0.9927230030298233, + "epoch": 0.9444944494449445, + "grad_norm": 0.2726488709449768, + "learning_rate": 2.1476229914525436e-05, + "loss": 0.9978, + "mean_token_accuracy": 0.7118695974349976, + "num_tokens": 19990415.0, + "step": 2361 + }, + { + "entropy": 0.9793954789638519, + "epoch": 0.9448944894489449, + "grad_norm": 0.2690231204032898, + "learning_rate": 2.1455273049743027e-05, + "loss": 0.9323, + "mean_token_accuracy": 0.7275529503822327, + "num_tokens": 19998944.0, + "step": 2362 + }, + { + "entropy": 1.03518944978714, + "epoch": 0.9452945294529453, + "grad_norm": 0.29312771558761597, + "learning_rate": 2.1434464791835663e-05, + "loss": 1.0349, + "mean_token_accuracy": 0.7070243060588837, + "num_tokens": 20006851.0, + "step": 2363 + }, + { + "entropy": 0.9960467368364334, + "epoch": 0.9456945694569457, + "grad_norm": 0.3007810115814209, + "learning_rate": 2.141380517572639e-05, + "loss": 0.9844, + "mean_token_accuracy": 0.7212018370628357, + "num_tokens": 20015222.0, + "step": 2364 + }, + { + "entropy": 0.939300462603569, + "epoch": 0.9460946094609461, + "grad_norm": 0.2736949920654297, + "learning_rate": 2.1393294236088748e-05, + "loss": 0.9436, + "mean_token_accuracy": 0.7257458716630936, + "num_tokens": 20023283.0, + "step": 2365 + }, + { + "entropy": 0.9431233108043671, + "epoch": 0.9464946494649465, + "grad_norm": 0.2756460905075073, + "learning_rate": 2.137293200734678e-05, + "loss": 0.9486, + "mean_token_accuracy": 0.719787061214447, + "num_tokens": 20031340.0, + "step": 2366 + }, + { + "entropy": 1.0425260961055756, + "epoch": 0.946894689468947, + "grad_norm": 0.2824780344963074, + "learning_rate": 2.1352718523674924e-05, + "loss": 1.0311, + "mean_token_accuracy": 0.7027536332607269, + "num_tokens": 20039699.0, + "step": 2367 + }, + { + "entropy": 0.9542181193828583, + "epoch": 0.9472947294729473, + "grad_norm": 0.27516576647758484, + "learning_rate": 2.1332653818997998e-05, + "loss": 0.9212, + "mean_token_accuracy": 0.7305203378200531, + "num_tokens": 20047734.0, + "step": 2368 + }, + { + "entropy": 0.9827549904584885, + "epoch": 0.9476947694769476, + "grad_norm": 0.2946023643016815, + "learning_rate": 2.1312737926991104e-05, + "loss": 0.9632, + "mean_token_accuracy": 0.7185213714838028, + "num_tokens": 20055792.0, + "step": 2369 + }, + { + "entropy": 0.9645734429359436, + "epoch": 0.948094809480948, + "grad_norm": 0.27533453702926636, + "learning_rate": 2.1292970881079605e-05, + "loss": 0.9631, + "mean_token_accuracy": 0.7234614342451096, + "num_tokens": 20064318.0, + "step": 2370 + }, + { + "entropy": 0.9942159205675125, + "epoch": 0.9484948494849484, + "grad_norm": 0.2776359021663666, + "learning_rate": 2.1273352714439017e-05, + "loss": 0.992, + "mean_token_accuracy": 0.7136521339416504, + "num_tokens": 20072459.0, + "step": 2371 + }, + { + "entropy": 1.0424735695123672, + "epoch": 0.9488948894889488, + "grad_norm": 0.2981562912464142, + "learning_rate": 2.1253883459995015e-05, + "loss": 1.0385, + "mean_token_accuracy": 0.7040879279375076, + "num_tokens": 20080409.0, + "step": 2372 + }, + { + "entropy": 0.9822475463151932, + "epoch": 0.9492949294929492, + "grad_norm": 0.27773499488830566, + "learning_rate": 2.1234563150423348e-05, + "loss": 0.9827, + "mean_token_accuracy": 0.71448814868927, + "num_tokens": 20088946.0, + "step": 2373 + }, + { + "entropy": 1.0132358819246292, + "epoch": 0.9496949694969496, + "grad_norm": 0.29248520731925964, + "learning_rate": 2.1215391818149793e-05, + "loss": 0.9987, + "mean_token_accuracy": 0.7130976021289825, + "num_tokens": 20096666.0, + "step": 2374 + }, + { + "entropy": 0.9485601186752319, + "epoch": 0.95009500950095, + "grad_norm": 0.2719818949699402, + "learning_rate": 2.1196369495350073e-05, + "loss": 0.938, + "mean_token_accuracy": 0.7244333475828171, + "num_tokens": 20105065.0, + "step": 2375 + }, + { + "entropy": 0.9716391265392303, + "epoch": 0.9504950495049505, + "grad_norm": 0.3735992908477783, + "learning_rate": 2.1177496213949837e-05, + "loss": 0.9396, + "mean_token_accuracy": 0.727724626660347, + "num_tokens": 20113407.0, + "step": 2376 + }, + { + "entropy": 1.0061940997838974, + "epoch": 0.9508950895089509, + "grad_norm": 0.2700502276420593, + "learning_rate": 2.1158772005624612e-05, + "loss": 0.9892, + "mean_token_accuracy": 0.7175813615322113, + "num_tokens": 20122031.0, + "step": 2377 + }, + { + "entropy": 0.9904760718345642, + "epoch": 0.9512951295129513, + "grad_norm": 0.27231210470199585, + "learning_rate": 2.1140196901799703e-05, + "loss": 0.9975, + "mean_token_accuracy": 0.714013934135437, + "num_tokens": 20131056.0, + "step": 2378 + }, + { + "entropy": 1.017128050327301, + "epoch": 0.9516951695169517, + "grad_norm": 0.2766156494617462, + "learning_rate": 2.112177093365018e-05, + "loss": 1.0452, + "mean_token_accuracy": 0.6952586770057678, + "num_tokens": 20139703.0, + "step": 2379 + }, + { + "entropy": 0.9598387330770493, + "epoch": 0.9520952095209521, + "grad_norm": 0.28724250197410583, + "learning_rate": 2.110349413210082e-05, + "loss": 0.9782, + "mean_token_accuracy": 0.7183012068271637, + "num_tokens": 20147994.0, + "step": 2380 + }, + { + "entropy": 0.9710117876529694, + "epoch": 0.9524952495249525, + "grad_norm": 0.3157157599925995, + "learning_rate": 2.1085366527826043e-05, + "loss": 0.9735, + "mean_token_accuracy": 0.712793618440628, + "num_tokens": 20156598.0, + "step": 2381 + }, + { + "entropy": 0.9960335046052933, + "epoch": 0.9528952895289529, + "grad_norm": 0.28033870458602905, + "learning_rate": 2.106738815124987e-05, + "loss": 0.9862, + "mean_token_accuracy": 0.7139060944318771, + "num_tokens": 20165046.0, + "step": 2382 + }, + { + "entropy": 0.996942862868309, + "epoch": 0.9532953295329533, + "grad_norm": 0.2790244519710541, + "learning_rate": 2.1049559032545866e-05, + "loss": 1.0096, + "mean_token_accuracy": 0.7061429917812347, + "num_tokens": 20173340.0, + "step": 2383 + }, + { + "entropy": 0.964057669043541, + "epoch": 0.9536953695369537, + "grad_norm": 0.2753821313381195, + "learning_rate": 2.1031879201637095e-05, + "loss": 0.9802, + "mean_token_accuracy": 0.7193226218223572, + "num_tokens": 20182269.0, + "step": 2384 + }, + { + "entropy": 1.0183822959661484, + "epoch": 0.9540954095409541, + "grad_norm": 0.2773980498313904, + "learning_rate": 2.1014348688196087e-05, + "loss": 1.0008, + "mean_token_accuracy": 0.7045246809720993, + "num_tokens": 20190678.0, + "step": 2385 + }, + { + "entropy": 1.0287631154060364, + "epoch": 0.9544954495449545, + "grad_norm": 0.2752661406993866, + "learning_rate": 2.099696752164472e-05, + "loss": 1.0133, + "mean_token_accuracy": 0.7137541323900223, + "num_tokens": 20199210.0, + "step": 2386 + }, + { + "entropy": 0.944546103477478, + "epoch": 0.9548954895489549, + "grad_norm": 0.2641877233982086, + "learning_rate": 2.097973573115427e-05, + "loss": 0.96, + "mean_token_accuracy": 0.7231301069259644, + "num_tokens": 20207767.0, + "step": 2387 + }, + { + "entropy": 0.9959643632173538, + "epoch": 0.9552955295529553, + "grad_norm": 0.27660784125328064, + "learning_rate": 2.0962653345645296e-05, + "loss": 1.0108, + "mean_token_accuracy": 0.7122054696083069, + "num_tokens": 20216197.0, + "step": 2388 + }, + { + "entropy": 0.959757387638092, + "epoch": 0.9556955695569557, + "grad_norm": 0.26441681385040283, + "learning_rate": 2.0945720393787582e-05, + "loss": 0.949, + "mean_token_accuracy": 0.7258389443159103, + "num_tokens": 20225453.0, + "step": 2389 + }, + { + "entropy": 0.983758345246315, + "epoch": 0.9560956095609561, + "grad_norm": 0.2762783169746399, + "learning_rate": 2.092893690400015e-05, + "loss": 0.9727, + "mean_token_accuracy": 0.7112600058317184, + "num_tokens": 20233809.0, + "step": 2390 + }, + { + "entropy": 0.9165778309106827, + "epoch": 0.9564956495649565, + "grad_norm": 0.26227495074272156, + "learning_rate": 2.091230290445114e-05, + "loss": 0.8951, + "mean_token_accuracy": 0.7351661920547485, + "num_tokens": 20242629.0, + "step": 2391 + }, + { + "entropy": 0.9548200964927673, + "epoch": 0.9568956895689569, + "grad_norm": 0.260128915309906, + "learning_rate": 2.0895818423057832e-05, + "loss": 0.9339, + "mean_token_accuracy": 0.7276741862297058, + "num_tokens": 20251681.0, + "step": 2392 + }, + { + "entropy": 1.0740379095077515, + "epoch": 0.9572957295729573, + "grad_norm": 0.28067219257354736, + "learning_rate": 2.0879483487486562e-05, + "loss": 1.055, + "mean_token_accuracy": 0.6991075277328491, + "num_tokens": 20259717.0, + "step": 2393 + }, + { + "entropy": 0.9968211948871613, + "epoch": 0.9576957695769577, + "grad_norm": 0.2671966254711151, + "learning_rate": 2.0863298125152643e-05, + "loss": 0.9866, + "mean_token_accuracy": 0.7100729495286942, + "num_tokens": 20268169.0, + "step": 2394 + }, + { + "entropy": 1.0036395192146301, + "epoch": 0.9580958095809581, + "grad_norm": 0.27651920914649963, + "learning_rate": 2.0847262363220415e-05, + "loss": 0.9798, + "mean_token_accuracy": 0.7135031670331955, + "num_tokens": 20276721.0, + "step": 2395 + }, + { + "entropy": 0.9911931902170181, + "epoch": 0.9584958495849585, + "grad_norm": 0.26799485087394714, + "learning_rate": 2.0831376228603072e-05, + "loss": 0.9726, + "mean_token_accuracy": 0.7173360884189606, + "num_tokens": 20285480.0, + "step": 2396 + }, + { + "entropy": 0.9950195848941803, + "epoch": 0.9588958895889589, + "grad_norm": 0.27110594511032104, + "learning_rate": 2.081563974796273e-05, + "loss": 0.9756, + "mean_token_accuracy": 0.7186154872179031, + "num_tokens": 20293789.0, + "step": 2397 + }, + { + "entropy": 0.9782933741807938, + "epoch": 0.9592959295929593, + "grad_norm": 0.2800324857234955, + "learning_rate": 2.0800052947710344e-05, + "loss": 0.9438, + "mean_token_accuracy": 0.722812220454216, + "num_tokens": 20302263.0, + "step": 2398 + }, + { + "entropy": 1.0024579167366028, + "epoch": 0.9596959695969597, + "grad_norm": 0.2720474302768707, + "learning_rate": 2.0784615854005616e-05, + "loss": 0.9937, + "mean_token_accuracy": 0.7157480418682098, + "num_tokens": 20311146.0, + "step": 2399 + }, + { + "entropy": 1.0033270418643951, + "epoch": 0.9600960096009601, + "grad_norm": 0.3063996732234955, + "learning_rate": 2.0769328492757027e-05, + "loss": 0.9838, + "mean_token_accuracy": 0.716188833117485, + "num_tokens": 20319199.0, + "step": 2400 + }, + { + "entropy": 0.9949142783880234, + "epoch": 0.9604960496049605, + "grad_norm": 0.2749100625514984, + "learning_rate": 2.0754190889621745e-05, + "loss": 0.9781, + "mean_token_accuracy": 0.7132418155670166, + "num_tokens": 20327851.0, + "step": 2401 + }, + { + "entropy": 0.9816398024559021, + "epoch": 0.9608960896089609, + "grad_norm": 0.2839379608631134, + "learning_rate": 2.073920307000559e-05, + "loss": 0.9753, + "mean_token_accuracy": 0.7201118767261505, + "num_tokens": 20336273.0, + "step": 2402 + }, + { + "entropy": 0.9545728862285614, + "epoch": 0.9612961296129613, + "grad_norm": 0.26824185252189636, + "learning_rate": 2.0724365059062996e-05, + "loss": 0.9521, + "mean_token_accuracy": 0.7189462631940842, + "num_tokens": 20344903.0, + "step": 2403 + }, + { + "entropy": 0.9927709251642227, + "epoch": 0.9616961696169617, + "grad_norm": 0.270592600107193, + "learning_rate": 2.0709676881697005e-05, + "loss": 0.9811, + "mean_token_accuracy": 0.7200025469064713, + "num_tokens": 20353506.0, + "step": 2404 + }, + { + "entropy": 0.9889578968286514, + "epoch": 0.9620962096209621, + "grad_norm": 0.2813127338886261, + "learning_rate": 2.0695138562559116e-05, + "loss": 0.9886, + "mean_token_accuracy": 0.7160389274358749, + "num_tokens": 20361900.0, + "step": 2405 + }, + { + "entropy": 0.9087959676980972, + "epoch": 0.9624962496249625, + "grad_norm": 0.26845166087150574, + "learning_rate": 2.0680750126049406e-05, + "loss": 0.8988, + "mean_token_accuracy": 0.7330845445394516, + "num_tokens": 20370557.0, + "step": 2406 + }, + { + "entropy": 0.9479487389326096, + "epoch": 0.9628962896289629, + "grad_norm": 0.2901884913444519, + "learning_rate": 2.0666511596316317e-05, + "loss": 0.9431, + "mean_token_accuracy": 0.7246840000152588, + "num_tokens": 20378906.0, + "step": 2407 + }, + { + "entropy": 1.013877585530281, + "epoch": 0.9632963296329633, + "grad_norm": 0.3049609959125519, + "learning_rate": 2.065242299725676e-05, + "loss": 1.0312, + "mean_token_accuracy": 0.703667089343071, + "num_tokens": 20387033.0, + "step": 2408 + }, + { + "entropy": 0.9906213879585266, + "epoch": 0.9636963696369637, + "grad_norm": 0.2845284640789032, + "learning_rate": 2.0638484352515992e-05, + "loss": 1.0047, + "mean_token_accuracy": 0.7039853781461716, + "num_tokens": 20395051.0, + "step": 2409 + }, + { + "entropy": 1.002466693520546, + "epoch": 0.9640964096409641, + "grad_norm": 0.31863096356391907, + "learning_rate": 2.062469568548757e-05, + "loss": 0.9962, + "mean_token_accuracy": 0.7128480970859528, + "num_tokens": 20403512.0, + "step": 2410 + }, + { + "entropy": 0.9596149772405624, + "epoch": 0.9644964496449645, + "grad_norm": 0.27144333720207214, + "learning_rate": 2.061105701931339e-05, + "loss": 0.9559, + "mean_token_accuracy": 0.7172646522521973, + "num_tokens": 20412153.0, + "step": 2411 + }, + { + "entropy": 0.9948017299175262, + "epoch": 0.9648964896489649, + "grad_norm": 0.41683128476142883, + "learning_rate": 2.0597568376883576e-05, + "loss": 0.9825, + "mean_token_accuracy": 0.7179954349994659, + "num_tokens": 20420648.0, + "step": 2412 + }, + { + "entropy": 0.986390233039856, + "epoch": 0.9652965296529653, + "grad_norm": 0.2858465015888214, + "learning_rate": 2.058422978083645e-05, + "loss": 0.9933, + "mean_token_accuracy": 0.7189463526010513, + "num_tokens": 20429303.0, + "step": 2413 + }, + { + "entropy": 0.9524229913949966, + "epoch": 0.9656965696569657, + "grad_norm": 0.2609981894493103, + "learning_rate": 2.0571041253558515e-05, + "loss": 0.9336, + "mean_token_accuracy": 0.7246441096067429, + "num_tokens": 20438012.0, + "step": 2414 + }, + { + "entropy": 0.9558107107877731, + "epoch": 0.9660966096609661, + "grad_norm": 0.2726347744464874, + "learning_rate": 2.0558002817184415e-05, + "loss": 0.9599, + "mean_token_accuracy": 0.724203035235405, + "num_tokens": 20447405.0, + "step": 2415 + }, + { + "entropy": 0.9882210046052933, + "epoch": 0.9664966496649665, + "grad_norm": 0.27617108821868896, + "learning_rate": 2.05451144935969e-05, + "loss": 0.9788, + "mean_token_accuracy": 0.7152123898267746, + "num_tokens": 20455703.0, + "step": 2416 + }, + { + "entropy": 0.9659506529569626, + "epoch": 0.9668966896689669, + "grad_norm": 0.2855198383331299, + "learning_rate": 2.053237630442677e-05, + "loss": 0.983, + "mean_token_accuracy": 0.7135730385780334, + "num_tokens": 20463695.0, + "step": 2417 + }, + { + "entropy": 1.0079179406166077, + "epoch": 0.9672967296729673, + "grad_norm": 0.2796943485736847, + "learning_rate": 2.0519788271052847e-05, + "loss": 0.9919, + "mean_token_accuracy": 0.7062940448522568, + "num_tokens": 20471958.0, + "step": 2418 + }, + { + "entropy": 0.9693615585565567, + "epoch": 0.9676967696769677, + "grad_norm": 0.2655262351036072, + "learning_rate": 2.0507350414601943e-05, + "loss": 0.9701, + "mean_token_accuracy": 0.7185338884592056, + "num_tokens": 20481061.0, + "step": 2419 + }, + { + "entropy": 0.9539649784564972, + "epoch": 0.9680968096809681, + "grad_norm": 0.2730397880077362, + "learning_rate": 2.0495062755948847e-05, + "loss": 0.9366, + "mean_token_accuracy": 0.7266983389854431, + "num_tokens": 20489761.0, + "step": 2420 + }, + { + "entropy": 0.9920918941497803, + "epoch": 0.9684968496849685, + "grad_norm": 0.2750607132911682, + "learning_rate": 2.0482925315716232e-05, + "loss": 0.9772, + "mean_token_accuracy": 0.7113059461116791, + "num_tokens": 20497905.0, + "step": 2421 + }, + { + "entropy": 1.0253147333860397, + "epoch": 0.9688968896889689, + "grad_norm": 0.3007024824619293, + "learning_rate": 2.0470938114274697e-05, + "loss": 1.0315, + "mean_token_accuracy": 0.7078686952590942, + "num_tokens": 20505899.0, + "step": 2422 + }, + { + "entropy": 1.0094438046216965, + "epoch": 0.9692969296929693, + "grad_norm": 0.2836493253707886, + "learning_rate": 2.0459101171742647e-05, + "loss": 0.9891, + "mean_token_accuracy": 0.7134174853563309, + "num_tokens": 20513953.0, + "step": 2423 + }, + { + "entropy": 0.955817848443985, + "epoch": 0.9696969696969697, + "grad_norm": 0.2712668478488922, + "learning_rate": 2.0447414507986318e-05, + "loss": 0.9558, + "mean_token_accuracy": 0.7231526523828506, + "num_tokens": 20522153.0, + "step": 2424 + }, + { + "entropy": 1.0050187855958939, + "epoch": 0.9700970097009701, + "grad_norm": 0.27840614318847656, + "learning_rate": 2.0435878142619764e-05, + "loss": 0.9768, + "mean_token_accuracy": 0.7130183726549149, + "num_tokens": 20530592.0, + "step": 2425 + }, + { + "entropy": 1.0189116597175598, + "epoch": 0.9704970497049705, + "grad_norm": 0.2866596579551697, + "learning_rate": 2.0424492095004746e-05, + "loss": 0.9993, + "mean_token_accuracy": 0.7079493552446365, + "num_tokens": 20539139.0, + "step": 2426 + }, + { + "entropy": 1.0096582174301147, + "epoch": 0.9708970897089709, + "grad_norm": 0.2881893813610077, + "learning_rate": 2.041325638425076e-05, + "loss": 0.9951, + "mean_token_accuracy": 0.7045658081769943, + "num_tokens": 20547462.0, + "step": 2427 + }, + { + "entropy": 1.0677605867385864, + "epoch": 0.9712971297129713, + "grad_norm": 0.30001285672187805, + "learning_rate": 2.040217102921498e-05, + "loss": 1.0286, + "mean_token_accuracy": 0.7036509662866592, + "num_tokens": 20555121.0, + "step": 2428 + }, + { + "entropy": 0.9506776034832001, + "epoch": 0.9716971697169717, + "grad_norm": 0.2731800377368927, + "learning_rate": 2.0391236048502242e-05, + "loss": 0.9339, + "mean_token_accuracy": 0.7271087169647217, + "num_tokens": 20563685.0, + "step": 2429 + }, + { + "entropy": 0.9550203680992126, + "epoch": 0.9720972097209721, + "grad_norm": 0.2724727392196655, + "learning_rate": 2.0380451460465018e-05, + "loss": 0.9502, + "mean_token_accuracy": 0.7215763628482819, + "num_tokens": 20572689.0, + "step": 2430 + }, + { + "entropy": 0.9121042639017105, + "epoch": 0.9724972497249725, + "grad_norm": 0.2900939881801605, + "learning_rate": 2.0369817283203334e-05, + "loss": 0.9254, + "mean_token_accuracy": 0.732434019446373, + "num_tokens": 20582292.0, + "step": 2431 + }, + { + "entropy": 0.9434911757707596, + "epoch": 0.9728972897289729, + "grad_norm": 0.2622632682323456, + "learning_rate": 2.0359333534564824e-05, + "loss": 0.938, + "mean_token_accuracy": 0.7272087335586548, + "num_tokens": 20591541.0, + "step": 2432 + }, + { + "entropy": 0.9906485825777054, + "epoch": 0.9732973297329733, + "grad_norm": 0.28157860040664673, + "learning_rate": 2.0349000232144613e-05, + "loss": 0.9934, + "mean_token_accuracy": 0.7150762975215912, + "num_tokens": 20599540.0, + "step": 2433 + }, + { + "entropy": 0.9661772102117538, + "epoch": 0.9736973697369737, + "grad_norm": 0.2621872127056122, + "learning_rate": 2.0338817393285355e-05, + "loss": 0.9652, + "mean_token_accuracy": 0.7222464829683304, + "num_tokens": 20608550.0, + "step": 2434 + }, + { + "entropy": 0.9918557405471802, + "epoch": 0.9740974097409741, + "grad_norm": 0.27559375762939453, + "learning_rate": 2.0328785035077176e-05, + "loss": 0.9989, + "mean_token_accuracy": 0.713169515132904, + "num_tokens": 20617041.0, + "step": 2435 + }, + { + "entropy": 1.0233299136161804, + "epoch": 0.9744974497449745, + "grad_norm": 0.30615246295928955, + "learning_rate": 2.031890317435762e-05, + "loss": 1.0095, + "mean_token_accuracy": 0.7100730687379837, + "num_tokens": 20624757.0, + "step": 2436 + }, + { + "entropy": 0.9643544405698776, + "epoch": 0.9748974897489749, + "grad_norm": 0.2745738625526428, + "learning_rate": 2.0309171827711674e-05, + "loss": 0.9641, + "mean_token_accuracy": 0.720672994852066, + "num_tokens": 20633470.0, + "step": 2437 + }, + { + "entropy": 0.9720654040575027, + "epoch": 0.9752975297529753, + "grad_norm": 0.2879060208797455, + "learning_rate": 2.0299591011471713e-05, + "loss": 0.9946, + "mean_token_accuracy": 0.716559961438179, + "num_tokens": 20641828.0, + "step": 2438 + }, + { + "entropy": 0.9831184446811676, + "epoch": 0.9756975697569757, + "grad_norm": 0.2751798927783966, + "learning_rate": 2.029016074171745e-05, + "loss": 0.969, + "mean_token_accuracy": 0.7220011800527573, + "num_tokens": 20650260.0, + "step": 2439 + }, + { + "entropy": 0.9996557533740997, + "epoch": 0.9760976097609761, + "grad_norm": 0.28198355436325073, + "learning_rate": 2.0280881034275973e-05, + "loss": 0.9777, + "mean_token_accuracy": 0.7079188078641891, + "num_tokens": 20658449.0, + "step": 2440 + }, + { + "entropy": 0.9666790813207626, + "epoch": 0.9764976497649765, + "grad_norm": 0.26889392733573914, + "learning_rate": 2.0271751904721624e-05, + "loss": 0.9708, + "mean_token_accuracy": 0.7175091207027435, + "num_tokens": 20667169.0, + "step": 2441 + }, + { + "entropy": 0.9895788580179214, + "epoch": 0.976897689768977, + "grad_norm": 0.2628633677959442, + "learning_rate": 2.026277336837607e-05, + "loss": 0.9729, + "mean_token_accuracy": 0.7148797661066055, + "num_tokens": 20676384.0, + "step": 2442 + }, + { + "entropy": 0.9558957517147064, + "epoch": 0.9772977297729774, + "grad_norm": 0.2795051634311676, + "learning_rate": 2.0253945440308203e-05, + "loss": 0.9522, + "mean_token_accuracy": 0.7188701331615448, + "num_tokens": 20684734.0, + "step": 2443 + }, + { + "entropy": 0.9548690766096115, + "epoch": 0.9776977697769776, + "grad_norm": 0.2819373607635498, + "learning_rate": 2.0245268135334184e-05, + "loss": 0.9453, + "mean_token_accuracy": 0.7260729223489761, + "num_tokens": 20692801.0, + "step": 2444 + }, + { + "entropy": 1.0021701008081436, + "epoch": 0.978097809780978, + "grad_norm": 0.27613645792007446, + "learning_rate": 2.0236741468017333e-05, + "loss": 1.0107, + "mean_token_accuracy": 0.7153311520814896, + "num_tokens": 20701885.0, + "step": 2445 + }, + { + "entropy": 1.0036893337965012, + "epoch": 0.9784978497849784, + "grad_norm": 0.2793533504009247, + "learning_rate": 2.0228365452668185e-05, + "loss": 1.0002, + "mean_token_accuracy": 0.7130041867494583, + "num_tokens": 20710144.0, + "step": 2446 + }, + { + "entropy": 1.0133201628923416, + "epoch": 0.9788978897889788, + "grad_norm": 0.2862626314163208, + "learning_rate": 2.022014010334442e-05, + "loss": 1.0096, + "mean_token_accuracy": 0.7105721235275269, + "num_tokens": 20718096.0, + "step": 2447 + }, + { + "entropy": 0.9612603336572647, + "epoch": 0.9792979297929792, + "grad_norm": 0.2879790663719177, + "learning_rate": 2.0212065433850848e-05, + "loss": 0.9447, + "mean_token_accuracy": 0.7260202467441559, + "num_tokens": 20726644.0, + "step": 2448 + }, + { + "entropy": 1.0567083954811096, + "epoch": 0.9796979697969797, + "grad_norm": 0.2972792983055115, + "learning_rate": 2.02041414577394e-05, + "loss": 1.0506, + "mean_token_accuracy": 0.7030387818813324, + "num_tokens": 20734884.0, + "step": 2449 + }, + { + "entropy": 1.022926077246666, + "epoch": 0.98009800980098, + "grad_norm": 0.2758072316646576, + "learning_rate": 2.0196368188309066e-05, + "loss": 1.035, + "mean_token_accuracy": 0.7045374661684036, + "num_tokens": 20743699.0, + "step": 2450 + }, + { + "entropy": 0.9960336089134216, + "epoch": 0.9804980498049805, + "grad_norm": 0.2660010755062103, + "learning_rate": 2.0188745638605954e-05, + "loss": 0.9734, + "mean_token_accuracy": 0.7160214334726334, + "num_tokens": 20752452.0, + "step": 2451 + }, + { + "entropy": 0.9995721727609634, + "epoch": 0.9808980898089809, + "grad_norm": 0.28082606196403503, + "learning_rate": 2.0181273821423156e-05, + "loss": 0.9971, + "mean_token_accuracy": 0.7084928452968597, + "num_tokens": 20760836.0, + "step": 2452 + }, + { + "entropy": 0.9560614675283432, + "epoch": 0.9812981298129813, + "grad_norm": 0.2678420841693878, + "learning_rate": 2.0173952749300828e-05, + "loss": 0.9405, + "mean_token_accuracy": 0.7232837229967117, + "num_tokens": 20769225.0, + "step": 2453 + }, + { + "entropy": 0.9495169818401337, + "epoch": 0.9816981698169817, + "grad_norm": 0.2654285430908203, + "learning_rate": 2.0166782434526122e-05, + "loss": 0.9558, + "mean_token_accuracy": 0.7211422771215439, + "num_tokens": 20777875.0, + "step": 2454 + }, + { + "entropy": 1.018739253282547, + "epoch": 0.9820982098209821, + "grad_norm": 0.2949577867984772, + "learning_rate": 2.015976288913313e-05, + "loss": 1.0094, + "mean_token_accuracy": 0.7056130915880203, + "num_tokens": 20785839.0, + "step": 2455 + }, + { + "entropy": 0.9763962775468826, + "epoch": 0.9824982498249825, + "grad_norm": 0.2666266858577728, + "learning_rate": 2.0152894124902965e-05, + "loss": 0.978, + "mean_token_accuracy": 0.7198098599910736, + "num_tokens": 20794554.0, + "step": 2456 + }, + { + "entropy": 1.009479433298111, + "epoch": 0.9828982898289829, + "grad_norm": 0.2788332998752594, + "learning_rate": 2.0146176153363624e-05, + "loss": 0.985, + "mean_token_accuracy": 0.7122802287340164, + "num_tokens": 20802351.0, + "step": 2457 + }, + { + "entropy": 0.9735076129436493, + "epoch": 0.9832983298329833, + "grad_norm": 0.2884746789932251, + "learning_rate": 2.013960898579007e-05, + "loss": 0.958, + "mean_token_accuracy": 0.7136785238981247, + "num_tokens": 20810842.0, + "step": 2458 + }, + { + "entropy": 0.965003564953804, + "epoch": 0.9836983698369837, + "grad_norm": 0.27916714549064636, + "learning_rate": 2.0133192633204132e-05, + "loss": 0.96, + "mean_token_accuracy": 0.7118100225925446, + "num_tokens": 20819466.0, + "step": 2459 + }, + { + "entropy": 0.9633422493934631, + "epoch": 0.9840984098409841, + "grad_norm": 0.2712916433811188, + "learning_rate": 2.012692710637456e-05, + "loss": 0.9457, + "mean_token_accuracy": 0.7228126376867294, + "num_tokens": 20827682.0, + "step": 2460 + }, + { + "entropy": 0.9507994949817657, + "epoch": 0.9844984498449845, + "grad_norm": 0.25448885560035706, + "learning_rate": 2.012081241581693e-05, + "loss": 0.9113, + "mean_token_accuracy": 0.7316667288541794, + "num_tokens": 20836917.0, + "step": 2461 + }, + { + "entropy": 0.9506393224000931, + "epoch": 0.9848984898489849, + "grad_norm": 0.2657206952571869, + "learning_rate": 2.011484857179368e-05, + "loss": 0.9309, + "mean_token_accuracy": 0.7254895865917206, + "num_tokens": 20845279.0, + "step": 2462 + }, + { + "entropy": 1.0227565467357635, + "epoch": 0.9852985298529853, + "grad_norm": 0.29193630814552307, + "learning_rate": 2.0109035584314094e-05, + "loss": 1.012, + "mean_token_accuracy": 0.7041127979755402, + "num_tokens": 20853231.0, + "step": 2463 + }, + { + "entropy": 0.9397972375154495, + "epoch": 0.9856985698569857, + "grad_norm": 0.2575511038303375, + "learning_rate": 2.0103373463134245e-05, + "loss": 0.913, + "mean_token_accuracy": 0.7340124100446701, + "num_tokens": 20862221.0, + "step": 2464 + }, + { + "entropy": 0.9329790025949478, + "epoch": 0.9860986098609861, + "grad_norm": 0.2540740966796875, + "learning_rate": 2.009786221775703e-05, + "loss": 0.9173, + "mean_token_accuracy": 0.7351619005203247, + "num_tokens": 20871476.0, + "step": 2465 + }, + { + "entropy": 1.0199775248765945, + "epoch": 0.9864986498649865, + "grad_norm": 0.2796716094017029, + "learning_rate": 2.0092501857432107e-05, + "loss": 1.0179, + "mean_token_accuracy": 0.6999193280935287, + "num_tokens": 20880090.0, + "step": 2466 + }, + { + "entropy": 0.9695195406675339, + "epoch": 0.9868986898689869, + "grad_norm": 0.27739787101745605, + "learning_rate": 2.00872923911559e-05, + "loss": 0.9746, + "mean_token_accuracy": 0.7206200659275055, + "num_tokens": 20888419.0, + "step": 2467 + }, + { + "entropy": 1.0019965767860413, + "epoch": 0.9872987298729873, + "grad_norm": 0.27934759855270386, + "learning_rate": 2.0082233827671596e-05, + "loss": 1.0108, + "mean_token_accuracy": 0.7085425704717636, + "num_tokens": 20897079.0, + "step": 2468 + }, + { + "entropy": 0.9811052680015564, + "epoch": 0.9876987698769877, + "grad_norm": 0.2722490131855011, + "learning_rate": 2.0077326175469102e-05, + "loss": 0.9928, + "mean_token_accuracy": 0.7087418138980865, + "num_tokens": 20905361.0, + "step": 2469 + }, + { + "entropy": 0.9794397950172424, + "epoch": 0.9880988098809881, + "grad_norm": 0.28395360708236694, + "learning_rate": 2.007256944278507e-05, + "loss": 0.97, + "mean_token_accuracy": 0.7176538854837418, + "num_tokens": 20913774.0, + "step": 2470 + }, + { + "entropy": 0.9159517139196396, + "epoch": 0.9884988498849885, + "grad_norm": 0.2760423719882965, + "learning_rate": 2.0067963637602835e-05, + "loss": 0.8928, + "mean_token_accuracy": 0.7376326471567154, + "num_tokens": 20922389.0, + "step": 2471 + }, + { + "entropy": 0.9945347309112549, + "epoch": 0.9888988898889889, + "grad_norm": 0.27142515778541565, + "learning_rate": 2.0063508767652435e-05, + "loss": 1.0018, + "mean_token_accuracy": 0.7107403725385666, + "num_tokens": 20931060.0, + "step": 2472 + }, + { + "entropy": 1.0348006933927536, + "epoch": 0.9892989298929893, + "grad_norm": 0.32605183124542236, + "learning_rate": 2.0059204840410603e-05, + "loss": 1.0557, + "mean_token_accuracy": 0.6948443055152893, + "num_tokens": 20939119.0, + "step": 2473 + }, + { + "entropy": 0.9200804829597473, + "epoch": 0.9896989698969897, + "grad_norm": 0.2669326663017273, + "learning_rate": 2.005505186310073e-05, + "loss": 0.9071, + "mean_token_accuracy": 0.7361815869808197, + "num_tokens": 20947775.0, + "step": 2474 + }, + { + "entropy": 1.0185156613588333, + "epoch": 0.9900990099009901, + "grad_norm": 0.29418912529945374, + "learning_rate": 2.0051049842692848e-05, + "loss": 1.0021, + "mean_token_accuracy": 0.709437221288681, + "num_tokens": 20956088.0, + "step": 2475 + }, + { + "entropy": 0.9796518385410309, + "epoch": 0.9904990499049905, + "grad_norm": 0.280699223279953, + "learning_rate": 2.0047198785903658e-05, + "loss": 1.0058, + "mean_token_accuracy": 0.7055308222770691, + "num_tokens": 20964706.0, + "step": 2476 + }, + { + "entropy": 0.907961905002594, + "epoch": 0.9908990899089909, + "grad_norm": 0.2608216404914856, + "learning_rate": 2.0043498699196492e-05, + "loss": 0.8956, + "mean_token_accuracy": 0.7403623461723328, + "num_tokens": 20973542.0, + "step": 2477 + }, + { + "entropy": 0.9948689937591553, + "epoch": 0.9912991299129913, + "grad_norm": 0.3232859969139099, + "learning_rate": 2.0039949588781304e-05, + "loss": 0.9522, + "mean_token_accuracy": 0.7226274609565735, + "num_tokens": 20981410.0, + "step": 2478 + }, + { + "entropy": 0.985618531703949, + "epoch": 0.9916991699169917, + "grad_norm": 0.27184760570526123, + "learning_rate": 2.003655146061465e-05, + "loss": 0.9866, + "mean_token_accuracy": 0.7221778035163879, + "num_tokens": 20989949.0, + "step": 2479 + }, + { + "entropy": 0.9985558688640594, + "epoch": 0.9920992099209921, + "grad_norm": 0.2757585346698761, + "learning_rate": 2.0033304320399687e-05, + "loss": 0.9884, + "mean_token_accuracy": 0.7159407138824463, + "num_tokens": 20998697.0, + "step": 2480 + }, + { + "entropy": 0.9517724364995956, + "epoch": 0.9924992499249925, + "grad_norm": 0.2671005129814148, + "learning_rate": 2.0030208173586187e-05, + "loss": 0.9376, + "mean_token_accuracy": 0.7269866764545441, + "num_tokens": 21007282.0, + "step": 2481 + }, + { + "entropy": 0.9665936082601547, + "epoch": 0.9928992899289929, + "grad_norm": 0.2887428104877472, + "learning_rate": 2.0027263025370483e-05, + "loss": 0.9592, + "mean_token_accuracy": 0.7233573496341705, + "num_tokens": 21015666.0, + "step": 2482 + }, + { + "entropy": 0.9012265652418137, + "epoch": 0.9932993299329933, + "grad_norm": 0.25956735014915466, + "learning_rate": 2.00244688806955e-05, + "loss": 0.892, + "mean_token_accuracy": 0.7305159866809845, + "num_tokens": 21024972.0, + "step": 2483 + }, + { + "entropy": 0.9850020110607147, + "epoch": 0.9936993699369937, + "grad_norm": 0.3015810549259186, + "learning_rate": 2.0021825744250704e-05, + "loss": 0.9908, + "mean_token_accuracy": 0.7177742719650269, + "num_tokens": 21033250.0, + "step": 2484 + }, + { + "entropy": 0.9780118018388748, + "epoch": 0.9940994099409941, + "grad_norm": 0.2778491675853729, + "learning_rate": 2.0019333620472163e-05, + "loss": 0.9794, + "mean_token_accuracy": 0.7102801352739334, + "num_tokens": 21041875.0, + "step": 2485 + }, + { + "entropy": 1.0078285485506058, + "epoch": 0.9944994499449945, + "grad_norm": 0.2893142104148865, + "learning_rate": 2.0016992513542472e-05, + "loss": 0.9978, + "mean_token_accuracy": 0.7134518623352051, + "num_tokens": 21050268.0, + "step": 2486 + }, + { + "entropy": 0.9275661110877991, + "epoch": 0.9948994899489949, + "grad_norm": 0.2969188988208771, + "learning_rate": 2.0014802427390747e-05, + "loss": 0.9134, + "mean_token_accuracy": 0.7350037544965744, + "num_tokens": 21059382.0, + "step": 2487 + }, + { + "entropy": 0.9648205041885376, + "epoch": 0.9952995299529953, + "grad_norm": 0.2793810963630676, + "learning_rate": 2.0012763365692676e-05, + "loss": 0.9747, + "mean_token_accuracy": 0.7179182171821594, + "num_tokens": 21067996.0, + "step": 2488 + }, + { + "entropy": 0.9704231321811676, + "epoch": 0.9956995699569957, + "grad_norm": 0.26565852761268616, + "learning_rate": 2.0010875331870473e-05, + "loss": 0.9492, + "mean_token_accuracy": 0.7252022475004196, + "num_tokens": 21076822.0, + "step": 2489 + }, + { + "entropy": 0.9329577386379242, + "epoch": 0.9960996099609961, + "grad_norm": 0.27157798409461975, + "learning_rate": 2.000913832909288e-05, + "loss": 0.9031, + "mean_token_accuracy": 0.7388479858636856, + "num_tokens": 21085550.0, + "step": 2490 + }, + { + "entropy": 0.9128458350896835, + "epoch": 0.9964996499649965, + "grad_norm": 0.2784136235713959, + "learning_rate": 2.0007552360275137e-05, + "loss": 0.9138, + "mean_token_accuracy": 0.736114427447319, + "num_tokens": 21094195.0, + "step": 2491 + }, + { + "entropy": 0.9786698520183563, + "epoch": 0.9968996899689969, + "grad_norm": 0.31316542625427246, + "learning_rate": 2.000611742807902e-05, + "loss": 0.9745, + "mean_token_accuracy": 0.7177849858999252, + "num_tokens": 21102853.0, + "step": 2492 + }, + { + "entropy": 0.9893364608287811, + "epoch": 0.9972997299729973, + "grad_norm": 0.2826778292655945, + "learning_rate": 2.000483353491282e-05, + "loss": 0.9644, + "mean_token_accuracy": 0.7182137966156006, + "num_tokens": 21111108.0, + "step": 2493 + }, + { + "entropy": 0.8975424766540527, + "epoch": 0.9976997699769977, + "grad_norm": 0.2523338794708252, + "learning_rate": 2.0003700682931318e-05, + "loss": 0.9028, + "mean_token_accuracy": 0.7352657467126846, + "num_tokens": 21120445.0, + "step": 2494 + }, + { + "entropy": 0.9527763277292252, + "epoch": 0.9980998099809981, + "grad_norm": 0.3071540296077728, + "learning_rate": 2.0002718874035826e-05, + "loss": 0.9377, + "mean_token_accuracy": 0.7227376848459244, + "num_tokens": 21128543.0, + "step": 2495 + }, + { + "entropy": 1.0110877454280853, + "epoch": 0.9984998499849985, + "grad_norm": 0.2756589353084564, + "learning_rate": 2.000188810987411e-05, + "loss": 1.0089, + "mean_token_accuracy": 0.7205765396356583, + "num_tokens": 21136655.0, + "step": 2496 + }, + { + "entropy": 0.9824761748313904, + "epoch": 0.9988998899889989, + "grad_norm": 0.2756806015968323, + "learning_rate": 2.0001208391840494e-05, + "loss": 1.0084, + "mean_token_accuracy": 0.7160620838403702, + "num_tokens": 21145194.0, + "step": 2497 + }, + { + "entropy": 0.9683626592159271, + "epoch": 0.9992999299929993, + "grad_norm": 0.27295100688934326, + "learning_rate": 2.000067972107574e-05, + "loss": 0.9762, + "mean_token_accuracy": 0.7167785614728928, + "num_tokens": 21154405.0, + "step": 2498 + }, + { + "entropy": 0.9322485774755478, + "epoch": 0.9996999699969997, + "grad_norm": 0.2707328796386719, + "learning_rate": 2.0000302098467144e-05, + "loss": 0.9356, + "mean_token_accuracy": 0.7272337526082993, + "num_tokens": 21163775.0, + "step": 2499 + }, + { + "entropy": 0.9817899664243063, + "epoch": 1.0, + "grad_norm": 0.3254396915435791, + "learning_rate": 2.0000075524648477e-05, + "loss": 0.9914, + "mean_token_accuracy": 0.7150203982988993, + "num_tokens": 21170109.0, + "step": 2500 + } + ], + "logging_steps": 1, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0378361243855094e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}