diff --git "a/Llama-3.3-70B-Instruct/checkpoint-500/trainer_state.json" "b/Llama-3.3-70B-Instruct/checkpoint-500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/Llama-3.3-70B-Instruct/checkpoint-500/trainer_state.json" @@ -0,0 +1,5034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2000200020002, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.7942025661468506, + "epoch": 0.00040004000400040005, + "grad_norm": 0.47672003507614136, + "learning_rate": 0.0, + "loss": 2.2188, + "mean_token_accuracy": 0.5192891135811806, + "num_tokens": 8850.0, + "step": 1 + }, + { + "entropy": 1.739880234003067, + "epoch": 0.0008000800080008001, + "grad_norm": 0.4743156433105469, + "learning_rate": 2.666666666666667e-06, + "loss": 2.1894, + "mean_token_accuracy": 0.5170402973890305, + "num_tokens": 18057.0, + "step": 2 + }, + { + "entropy": 1.7690136432647705, + "epoch": 0.0012001200120012002, + "grad_norm": 0.5005162358283997, + "learning_rate": 5.333333333333334e-06, + "loss": 2.2131, + "mean_token_accuracy": 0.5172632932662964, + "num_tokens": 26915.0, + "step": 3 + }, + { + "entropy": 1.866851270198822, + "epoch": 0.0016001600160016002, + "grad_norm": 0.438918799161911, + "learning_rate": 8.000000000000001e-06, + "loss": 2.2875, + "mean_token_accuracy": 0.5107089728116989, + "num_tokens": 35231.0, + "step": 4 + }, + { + "entropy": 1.8996970057487488, + "epoch": 0.002000200020002, + "grad_norm": 0.4285155236721039, + "learning_rate": 1.0666666666666667e-05, + "loss": 2.2935, + "mean_token_accuracy": 0.5128495469689369, + "num_tokens": 43540.0, + "step": 5 + }, + { + "entropy": 1.797807365655899, + "epoch": 0.0024002400240024004, + "grad_norm": 0.4465991258621216, + "learning_rate": 1.3333333333333333e-05, + "loss": 2.1917, + "mean_token_accuracy": 0.5254444032907486, + "num_tokens": 52236.0, + "step": 6 + }, + { + "entropy": 1.8983636498451233, + "epoch": 0.0028002800280028, + "grad_norm": 0.4536067545413971, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.2677, + "mean_token_accuracy": 0.5144101679325104, + "num_tokens": 60443.0, + "step": 7 + }, + { + "entropy": 1.8427878618240356, + "epoch": 0.0032003200320032004, + "grad_norm": 0.5053722858428955, + "learning_rate": 1.866666666666667e-05, + "loss": 2.2356, + "mean_token_accuracy": 0.5142018273472786, + "num_tokens": 69155.0, + "step": 8 + }, + { + "entropy": 1.8648996651172638, + "epoch": 0.0036003600360036, + "grad_norm": 0.5287893414497375, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.2435, + "mean_token_accuracy": 0.5086963996291161, + "num_tokens": 77156.0, + "step": 9 + }, + { + "entropy": 1.886999636888504, + "epoch": 0.004000400040004, + "grad_norm": 0.43816184997558594, + "learning_rate": 2.4e-05, + "loss": 2.1821, + "mean_token_accuracy": 0.5133799910545349, + "num_tokens": 85650.0, + "step": 10 + }, + { + "entropy": 2.0165862143039703, + "epoch": 0.0044004400440044, + "grad_norm": 0.3899831175804138, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.1903, + "mean_token_accuracy": 0.5218925848603249, + "num_tokens": 93953.0, + "step": 11 + }, + { + "entropy": 2.033858895301819, + "epoch": 0.004800480048004801, + "grad_norm": 0.43466004729270935, + "learning_rate": 2.9333333333333336e-05, + "loss": 2.0937, + "mean_token_accuracy": 0.5258676409721375, + "num_tokens": 102592.0, + "step": 12 + }, + { + "entropy": 2.2364404797554016, + "epoch": 0.005200520052005201, + "grad_norm": 0.39024344086647034, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1801, + "mean_token_accuracy": 0.5228476375341415, + "num_tokens": 110784.0, + "step": 13 + }, + { + "entropy": 2.1504173278808594, + "epoch": 0.0056005600560056, + "grad_norm": 0.389006644487381, + "learning_rate": 3.466666666666667e-05, + "loss": 2.0215, + "mean_token_accuracy": 0.5430122464895248, + "num_tokens": 120082.0, + "step": 14 + }, + { + "entropy": 2.2962915897369385, + "epoch": 0.006000600060006, + "grad_norm": 0.4784089922904968, + "learning_rate": 3.733333333333334e-05, + "loss": 2.061, + "mean_token_accuracy": 0.531621664762497, + "num_tokens": 128363.0, + "step": 15 + }, + { + "entropy": 2.342404544353485, + "epoch": 0.006400640064006401, + "grad_norm": 0.5089271068572998, + "learning_rate": 4e-05, + "loss": 2.07, + "mean_token_accuracy": 0.5325157046318054, + "num_tokens": 136997.0, + "step": 16 + }, + { + "entropy": 2.283275544643402, + "epoch": 0.006800680068006801, + "grad_norm": 0.5488889813423157, + "learning_rate": 4.266666666666667e-05, + "loss": 2.0056, + "mean_token_accuracy": 0.5334787666797638, + "num_tokens": 145030.0, + "step": 17 + }, + { + "entropy": 2.050345718860626, + "epoch": 0.0072007200720072, + "grad_norm": 0.5031075477600098, + "learning_rate": 4.5333333333333335e-05, + "loss": 1.9162, + "mean_token_accuracy": 0.5427921563386917, + "num_tokens": 153623.0, + "step": 18 + }, + { + "entropy": 1.9828232526779175, + "epoch": 0.007600760076007601, + "grad_norm": 0.5337665677070618, + "learning_rate": 4.8e-05, + "loss": 1.9185, + "mean_token_accuracy": 0.5508822798728943, + "num_tokens": 161947.0, + "step": 19 + }, + { + "entropy": 1.8197293877601624, + "epoch": 0.008000800080008, + "grad_norm": 0.4948204755783081, + "learning_rate": 5.0666666666666674e-05, + "loss": 1.857, + "mean_token_accuracy": 0.552571251988411, + "num_tokens": 170516.0, + "step": 20 + }, + { + "entropy": 1.789840191602707, + "epoch": 0.0084008400840084, + "grad_norm": 0.4926859438419342, + "learning_rate": 5.333333333333333e-05, + "loss": 1.886, + "mean_token_accuracy": 0.5518065690994263, + "num_tokens": 178469.0, + "step": 21 + }, + { + "entropy": 1.6451906859874725, + "epoch": 0.0088008800880088, + "grad_norm": 0.4017632007598877, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.7526, + "mean_token_accuracy": 0.5742013603448868, + "num_tokens": 186348.0, + "step": 22 + }, + { + "entropy": 1.6792134046554565, + "epoch": 0.0092009200920092, + "grad_norm": 0.6260354518890381, + "learning_rate": 5.866666666666667e-05, + "loss": 1.8468, + "mean_token_accuracy": 0.5656454414129257, + "num_tokens": 195071.0, + "step": 23 + }, + { + "entropy": 1.647391676902771, + "epoch": 0.009600960096009602, + "grad_norm": 0.46580520272254944, + "learning_rate": 6.133333333333334e-05, + "loss": 1.7595, + "mean_token_accuracy": 0.567480742931366, + "num_tokens": 202951.0, + "step": 24 + }, + { + "entropy": 1.6090652346611023, + "epoch": 0.010001000100010001, + "grad_norm": 0.4587379992008209, + "learning_rate": 6.400000000000001e-05, + "loss": 1.6638, + "mean_token_accuracy": 0.5937570631504059, + "num_tokens": 211268.0, + "step": 25 + }, + { + "entropy": 1.6326420307159424, + "epoch": 0.010401040104010401, + "grad_norm": 0.44421494007110596, + "learning_rate": 6.666666666666667e-05, + "loss": 1.6439, + "mean_token_accuracy": 0.5923638790845871, + "num_tokens": 219692.0, + "step": 26 + }, + { + "entropy": 1.7234179377555847, + "epoch": 0.010801080108010801, + "grad_norm": 0.4389747381210327, + "learning_rate": 6.933333333333334e-05, + "loss": 1.7108, + "mean_token_accuracy": 0.5803089290857315, + "num_tokens": 228047.0, + "step": 27 + }, + { + "entropy": 1.6885777115821838, + "epoch": 0.0112011201120112, + "grad_norm": 0.4335879981517792, + "learning_rate": 7.2e-05, + "loss": 1.6299, + "mean_token_accuracy": 0.586303323507309, + "num_tokens": 236376.0, + "step": 28 + }, + { + "entropy": 1.6646342873573303, + "epoch": 0.0116011601160116, + "grad_norm": 0.38126322627067566, + "learning_rate": 7.466666666666667e-05, + "loss": 1.6067, + "mean_token_accuracy": 0.5964086949825287, + "num_tokens": 245092.0, + "step": 29 + }, + { + "entropy": 1.6213374137878418, + "epoch": 0.012001200120012, + "grad_norm": 0.39270561933517456, + "learning_rate": 7.733333333333333e-05, + "loss": 1.5822, + "mean_token_accuracy": 0.6026028245687485, + "num_tokens": 253673.0, + "step": 30 + }, + { + "entropy": 1.5640352368354797, + "epoch": 0.012401240124012402, + "grad_norm": 0.3869155943393707, + "learning_rate": 8e-05, + "loss": 1.5011, + "mean_token_accuracy": 0.6241087764501572, + "num_tokens": 262625.0, + "step": 31 + }, + { + "entropy": 1.520020067691803, + "epoch": 0.012801280128012802, + "grad_norm": 0.3769737184047699, + "learning_rate": 8.266666666666667e-05, + "loss": 1.5088, + "mean_token_accuracy": 0.6204348653554916, + "num_tokens": 271309.0, + "step": 32 + }, + { + "entropy": 1.5669251084327698, + "epoch": 0.013201320132013201, + "grad_norm": 0.4119971692562103, + "learning_rate": 8.533333333333334e-05, + "loss": 1.598, + "mean_token_accuracy": 0.6009179204702377, + "num_tokens": 279702.0, + "step": 33 + }, + { + "entropy": 1.4570423662662506, + "epoch": 0.013601360136013601, + "grad_norm": 0.39608579874038696, + "learning_rate": 8.800000000000001e-05, + "loss": 1.4757, + "mean_token_accuracy": 0.6308933645486832, + "num_tokens": 288493.0, + "step": 34 + }, + { + "entropy": 1.4845676720142365, + "epoch": 0.014001400140014001, + "grad_norm": 0.37827152013778687, + "learning_rate": 9.066666666666667e-05, + "loss": 1.5051, + "mean_token_accuracy": 0.6212253570556641, + "num_tokens": 296999.0, + "step": 35 + }, + { + "entropy": 1.5079152584075928, + "epoch": 0.0144014401440144, + "grad_norm": 0.39496058225631714, + "learning_rate": 9.333333333333334e-05, + "loss": 1.5177, + "mean_token_accuracy": 0.6146594285964966, + "num_tokens": 305146.0, + "step": 36 + }, + { + "entropy": 1.4583857357501984, + "epoch": 0.014801480148014802, + "grad_norm": 0.41785281896591187, + "learning_rate": 9.6e-05, + "loss": 1.4723, + "mean_token_accuracy": 0.6168077737092972, + "num_tokens": 313647.0, + "step": 37 + }, + { + "entropy": 1.3630880415439606, + "epoch": 0.015201520152015202, + "grad_norm": 0.3789471983909607, + "learning_rate": 9.866666666666668e-05, + "loss": 1.3449, + "mean_token_accuracy": 0.6459334343671799, + "num_tokens": 322633.0, + "step": 38 + }, + { + "entropy": 1.4223653674125671, + "epoch": 0.015601560156015602, + "grad_norm": 0.4337131381034851, + "learning_rate": 0.00010133333333333335, + "loss": 1.4755, + "mean_token_accuracy": 0.6144974380731583, + "num_tokens": 331687.0, + "step": 39 + }, + { + "entropy": 1.3911584913730621, + "epoch": 0.016001600160016, + "grad_norm": 0.41617903113365173, + "learning_rate": 0.00010400000000000001, + "loss": 1.3826, + "mean_token_accuracy": 0.6441078633069992, + "num_tokens": 339868.0, + "step": 40 + }, + { + "entropy": 1.4160181879997253, + "epoch": 0.016401640164016403, + "grad_norm": 0.43531423807144165, + "learning_rate": 0.00010666666666666667, + "loss": 1.4294, + "mean_token_accuracy": 0.6320265531539917, + "num_tokens": 348029.0, + "step": 41 + }, + { + "entropy": 1.482937514781952, + "epoch": 0.0168016801680168, + "grad_norm": 0.4324755072593689, + "learning_rate": 0.00010933333333333333, + "loss": 1.5147, + "mean_token_accuracy": 0.6166313588619232, + "num_tokens": 356240.0, + "step": 42 + }, + { + "entropy": 1.4201266169548035, + "epoch": 0.017201720172017203, + "grad_norm": 0.3948879837989807, + "learning_rate": 0.00011200000000000001, + "loss": 1.3994, + "mean_token_accuracy": 0.6290998160839081, + "num_tokens": 364425.0, + "step": 43 + }, + { + "entropy": 1.357359528541565, + "epoch": 0.0176017601760176, + "grad_norm": 0.41655364632606506, + "learning_rate": 0.00011466666666666667, + "loss": 1.2924, + "mean_token_accuracy": 0.6492937654256821, + "num_tokens": 373138.0, + "step": 44 + }, + { + "entropy": 1.391854703426361, + "epoch": 0.018001800180018002, + "grad_norm": 0.417074590921402, + "learning_rate": 0.00011733333333333334, + "loss": 1.3507, + "mean_token_accuracy": 0.6494302302598953, + "num_tokens": 382100.0, + "step": 45 + }, + { + "entropy": 1.4749327600002289, + "epoch": 0.0184018401840184, + "grad_norm": 0.41923800110816956, + "learning_rate": 0.00012, + "loss": 1.5085, + "mean_token_accuracy": 0.612814411520958, + "num_tokens": 390052.0, + "step": 46 + }, + { + "entropy": 1.4137325286865234, + "epoch": 0.018801880188018802, + "grad_norm": 0.3833743929862976, + "learning_rate": 0.00012266666666666668, + "loss": 1.3916, + "mean_token_accuracy": 0.6410449594259262, + "num_tokens": 398110.0, + "step": 47 + }, + { + "entropy": 1.3919320702552795, + "epoch": 0.019201920192019203, + "grad_norm": 0.37842363119125366, + "learning_rate": 0.00012533333333333334, + "loss": 1.4084, + "mean_token_accuracy": 0.6312015205621719, + "num_tokens": 406666.0, + "step": 48 + }, + { + "entropy": 1.3608618378639221, + "epoch": 0.0196019601960196, + "grad_norm": 0.4568133056163788, + "learning_rate": 0.00012800000000000002, + "loss": 1.368, + "mean_token_accuracy": 0.6458054482936859, + "num_tokens": 415283.0, + "step": 49 + }, + { + "entropy": 1.3759468793869019, + "epoch": 0.020002000200020003, + "grad_norm": 0.3905130922794342, + "learning_rate": 0.00013066666666666668, + "loss": 1.3781, + "mean_token_accuracy": 0.6408856809139252, + "num_tokens": 423867.0, + "step": 50 + }, + { + "entropy": 1.3894509375095367, + "epoch": 0.0204020402040204, + "grad_norm": 0.39885976910591125, + "learning_rate": 0.00013333333333333334, + "loss": 1.3832, + "mean_token_accuracy": 0.6394526213407516, + "num_tokens": 432299.0, + "step": 51 + }, + { + "entropy": 1.3620089888572693, + "epoch": 0.020802080208020803, + "grad_norm": 0.44015854597091675, + "learning_rate": 0.00013600000000000003, + "loss": 1.3381, + "mean_token_accuracy": 0.6432337760925293, + "num_tokens": 440734.0, + "step": 52 + }, + { + "entropy": 1.3622656762599945, + "epoch": 0.0212021202120212, + "grad_norm": 0.49739453196525574, + "learning_rate": 0.00013866666666666669, + "loss": 1.3649, + "mean_token_accuracy": 0.6373352855443954, + "num_tokens": 448710.0, + "step": 53 + }, + { + "entropy": 1.2986978590488434, + "epoch": 0.021602160216021602, + "grad_norm": 0.37318113446235657, + "learning_rate": 0.00014133333333333334, + "loss": 1.3366, + "mean_token_accuracy": 0.6431873738765717, + "num_tokens": 457247.0, + "step": 54 + }, + { + "entropy": 1.2725946605205536, + "epoch": 0.022002200220022004, + "grad_norm": 0.4199654757976532, + "learning_rate": 0.000144, + "loss": 1.3302, + "mean_token_accuracy": 0.6447762101888657, + "num_tokens": 465701.0, + "step": 55 + }, + { + "entropy": 1.2967428863048553, + "epoch": 0.0224022402240224, + "grad_norm": 0.40956538915634155, + "learning_rate": 0.00014666666666666666, + "loss": 1.3352, + "mean_token_accuracy": 0.6408500224351883, + "num_tokens": 474476.0, + "step": 56 + }, + { + "entropy": 1.3544551134109497, + "epoch": 0.022802280228022803, + "grad_norm": 0.39519739151000977, + "learning_rate": 0.00014933333333333335, + "loss": 1.3406, + "mean_token_accuracy": 0.6500163674354553, + "num_tokens": 482570.0, + "step": 57 + }, + { + "entropy": 1.3824973404407501, + "epoch": 0.0232023202320232, + "grad_norm": 0.3799802362918854, + "learning_rate": 0.000152, + "loss": 1.3278, + "mean_token_accuracy": 0.6473122090101242, + "num_tokens": 491111.0, + "step": 58 + }, + { + "entropy": 1.3626296520233154, + "epoch": 0.023602360236023603, + "grad_norm": 0.3700718879699707, + "learning_rate": 0.00015466666666666667, + "loss": 1.3304, + "mean_token_accuracy": 0.645874097943306, + "num_tokens": 500032.0, + "step": 59 + }, + { + "entropy": 1.3258526921272278, + "epoch": 0.024002400240024, + "grad_norm": 0.366222620010376, + "learning_rate": 0.00015733333333333333, + "loss": 1.3073, + "mean_token_accuracy": 0.6523128002882004, + "num_tokens": 508045.0, + "step": 60 + }, + { + "entropy": 1.2787662744522095, + "epoch": 0.024402440244024402, + "grad_norm": 0.37774235010147095, + "learning_rate": 0.00016, + "loss": 1.2839, + "mean_token_accuracy": 0.657956600189209, + "num_tokens": 516334.0, + "step": 61 + }, + { + "entropy": 1.2824394404888153, + "epoch": 0.024802480248024804, + "grad_norm": 0.3594248294830322, + "learning_rate": 0.00016266666666666667, + "loss": 1.3335, + "mean_token_accuracy": 0.6513591110706329, + "num_tokens": 524762.0, + "step": 62 + }, + { + "entropy": 1.2761549651622772, + "epoch": 0.025202520252025202, + "grad_norm": 0.38247525691986084, + "learning_rate": 0.00016533333333333333, + "loss": 1.322, + "mean_token_accuracy": 0.6528888940811157, + "num_tokens": 533302.0, + "step": 63 + }, + { + "entropy": 1.285708099603653, + "epoch": 0.025602560256025603, + "grad_norm": 0.4210297167301178, + "learning_rate": 0.000168, + "loss": 1.2522, + "mean_token_accuracy": 0.6581785976886749, + "num_tokens": 542110.0, + "step": 64 + }, + { + "entropy": 1.3535743653774261, + "epoch": 0.026002600260026, + "grad_norm": 0.3659783601760864, + "learning_rate": 0.00017066666666666668, + "loss": 1.3343, + "mean_token_accuracy": 0.6510991156101227, + "num_tokens": 550717.0, + "step": 65 + }, + { + "entropy": 1.3446696996688843, + "epoch": 0.026402640264026403, + "grad_norm": 0.35590988397598267, + "learning_rate": 0.00017333333333333334, + "loss": 1.3224, + "mean_token_accuracy": 0.6442483812570572, + "num_tokens": 559025.0, + "step": 66 + }, + { + "entropy": 1.3695125877857208, + "epoch": 0.0268026802680268, + "grad_norm": 0.3491916358470917, + "learning_rate": 0.00017600000000000002, + "loss": 1.3288, + "mean_token_accuracy": 0.6431872397661209, + "num_tokens": 567724.0, + "step": 67 + }, + { + "entropy": 1.3363787531852722, + "epoch": 0.027202720272027203, + "grad_norm": 0.3625618517398834, + "learning_rate": 0.00017866666666666668, + "loss": 1.2804, + "mean_token_accuracy": 0.6557945609092712, + "num_tokens": 576144.0, + "step": 68 + }, + { + "entropy": 1.3033888339996338, + "epoch": 0.027602760276027604, + "grad_norm": 0.35051390528678894, + "learning_rate": 0.00018133333333333334, + "loss": 1.2841, + "mean_token_accuracy": 0.6544656604528427, + "num_tokens": 584831.0, + "step": 69 + }, + { + "entropy": 1.3235229551792145, + "epoch": 0.028002800280028002, + "grad_norm": 0.3980117738246918, + "learning_rate": 0.00018400000000000003, + "loss": 1.3492, + "mean_token_accuracy": 0.6482396423816681, + "num_tokens": 593412.0, + "step": 70 + }, + { + "entropy": 1.2970213294029236, + "epoch": 0.028402840284028404, + "grad_norm": 0.3519047796726227, + "learning_rate": 0.0001866666666666667, + "loss": 1.3083, + "mean_token_accuracy": 0.6536522507667542, + "num_tokens": 601675.0, + "step": 71 + }, + { + "entropy": 1.2363843321800232, + "epoch": 0.0288028802880288, + "grad_norm": 0.356121689081192, + "learning_rate": 0.00018933333333333335, + "loss": 1.2331, + "mean_token_accuracy": 0.6689527034759521, + "num_tokens": 610155.0, + "step": 72 + }, + { + "entropy": 1.2743788659572601, + "epoch": 0.029202920292029203, + "grad_norm": 0.352166086435318, + "learning_rate": 0.000192, + "loss": 1.2953, + "mean_token_accuracy": 0.6543757170438766, + "num_tokens": 619084.0, + "step": 73 + }, + { + "entropy": 1.251781314611435, + "epoch": 0.029602960296029605, + "grad_norm": 0.3690275251865387, + "learning_rate": 0.0001946666666666667, + "loss": 1.249, + "mean_token_accuracy": 0.6584222465753555, + "num_tokens": 627717.0, + "step": 74 + }, + { + "entropy": 1.3367043435573578, + "epoch": 0.030003000300030003, + "grad_norm": 0.3400121331214905, + "learning_rate": 0.00019733333333333335, + "loss": 1.2895, + "mean_token_accuracy": 0.6532490998506546, + "num_tokens": 637070.0, + "step": 75 + }, + { + "entropy": 1.2800488770008087, + "epoch": 0.030403040304030404, + "grad_norm": 0.34383344650268555, + "learning_rate": 0.0002, + "loss": 1.2733, + "mean_token_accuracy": 0.6612512767314911, + "num_tokens": 646123.0, + "step": 76 + }, + { + "entropy": 1.328520268201828, + "epoch": 0.030803080308030802, + "grad_norm": 0.3561513125896454, + "learning_rate": 0.00019999992447535154, + "loss": 1.3263, + "mean_token_accuracy": 0.6502320766448975, + "num_tokens": 654808.0, + "step": 77 + }, + { + "entropy": 1.2899321019649506, + "epoch": 0.031203120312031204, + "grad_norm": 0.3678707480430603, + "learning_rate": 0.00019999969790153286, + "loss": 1.3406, + "mean_token_accuracy": 0.6464085876941681, + "num_tokens": 663045.0, + "step": 78 + }, + { + "entropy": 1.3219149708747864, + "epoch": 0.0316031603160316, + "grad_norm": 0.38404518365859985, + "learning_rate": 0.00019999932027892428, + "loss": 1.302, + "mean_token_accuracy": 0.6544652730226517, + "num_tokens": 671266.0, + "step": 79 + }, + { + "entropy": 1.227865844964981, + "epoch": 0.032003200320032, + "grad_norm": 0.3195721209049225, + "learning_rate": 0.0001999987916081595, + "loss": 1.2129, + "mean_token_accuracy": 0.6690118610858917, + "num_tokens": 680536.0, + "step": 80 + }, + { + "entropy": 1.2681958079338074, + "epoch": 0.032403240324032405, + "grad_norm": 0.33165785670280457, + "learning_rate": 0.00019999811189012589, + "loss": 1.2616, + "mean_token_accuracy": 0.6542633771896362, + "num_tokens": 689078.0, + "step": 81 + }, + { + "entropy": 1.2480992376804352, + "epoch": 0.032803280328032806, + "grad_norm": 0.3365044891834259, + "learning_rate": 0.00019999728112596419, + "loss": 1.2532, + "mean_token_accuracy": 0.6593984663486481, + "num_tokens": 697600.0, + "step": 82 + }, + { + "entropy": 1.2559486627578735, + "epoch": 0.0332033203320332, + "grad_norm": 0.3525690734386444, + "learning_rate": 0.0001999962993170687, + "loss": 1.2407, + "mean_token_accuracy": 0.6652248501777649, + "num_tokens": 706449.0, + "step": 83 + }, + { + "entropy": 1.2723756432533264, + "epoch": 0.0336033603360336, + "grad_norm": 0.3243389129638672, + "learning_rate": 0.00019999516646508717, + "loss": 1.2759, + "mean_token_accuracy": 0.6553087830543518, + "num_tokens": 715261.0, + "step": 84 + }, + { + "entropy": 1.286735862493515, + "epoch": 0.034003400340034004, + "grad_norm": 0.3348769247531891, + "learning_rate": 0.000199993882571921, + "loss": 1.3288, + "mean_token_accuracy": 0.6503776162862778, + "num_tokens": 723935.0, + "step": 85 + }, + { + "entropy": 1.2838447391986847, + "epoch": 0.034403440344034406, + "grad_norm": 0.31921443343162537, + "learning_rate": 0.0001999924476397249, + "loss": 1.2712, + "mean_token_accuracy": 0.6571811884641647, + "num_tokens": 732552.0, + "step": 86 + }, + { + "entropy": 1.2601779401302338, + "epoch": 0.0348034803480348, + "grad_norm": 0.3210558593273163, + "learning_rate": 0.0001999908616709071, + "loss": 1.2409, + "mean_token_accuracy": 0.6692058891057968, + "num_tokens": 741619.0, + "step": 87 + }, + { + "entropy": 1.2706993520259857, + "epoch": 0.0352035203520352, + "grad_norm": 0.3449415862560272, + "learning_rate": 0.00019998912466812952, + "loss": 1.2301, + "mean_token_accuracy": 0.6645237505435944, + "num_tokens": 750045.0, + "step": 88 + }, + { + "entropy": 1.264108419418335, + "epoch": 0.0356035603560356, + "grad_norm": 0.3272925913333893, + "learning_rate": 0.00019998723663430733, + "loss": 1.2593, + "mean_token_accuracy": 0.6653023958206177, + "num_tokens": 758535.0, + "step": 89 + }, + { + "entropy": 1.174435406923294, + "epoch": 0.036003600360036005, + "grad_norm": 0.3484836518764496, + "learning_rate": 0.00019998519757260928, + "loss": 1.1771, + "mean_token_accuracy": 0.6722908169031143, + "num_tokens": 766995.0, + "step": 90 + }, + { + "entropy": 1.2018343806266785, + "epoch": 0.036403640364036406, + "grad_norm": 0.3412557542324066, + "learning_rate": 0.00019998300748645754, + "loss": 1.2204, + "mean_token_accuracy": 0.6707678735256195, + "num_tokens": 775542.0, + "step": 91 + }, + { + "entropy": 1.3117725551128387, + "epoch": 0.0368036803680368, + "grad_norm": 0.3464583158493042, + "learning_rate": 0.00019998066637952783, + "loss": 1.304, + "mean_token_accuracy": 0.645479291677475, + "num_tokens": 783830.0, + "step": 92 + }, + { + "entropy": 1.266638070344925, + "epoch": 0.0372037203720372, + "grad_norm": 0.35132962465286255, + "learning_rate": 0.0001999781742557493, + "loss": 1.2571, + "mean_token_accuracy": 0.6589740812778473, + "num_tokens": 792085.0, + "step": 93 + }, + { + "entropy": 1.266037255525589, + "epoch": 0.037603760376037604, + "grad_norm": 0.3320970833301544, + "learning_rate": 0.00019997553111930448, + "loss": 1.2761, + "mean_token_accuracy": 0.654522180557251, + "num_tokens": 800687.0, + "step": 94 + }, + { + "entropy": 1.324877679347992, + "epoch": 0.038003800380038005, + "grad_norm": 0.34410229325294495, + "learning_rate": 0.00019997273697462952, + "loss": 1.3059, + "mean_token_accuracy": 0.6469769328832626, + "num_tokens": 808479.0, + "step": 95 + }, + { + "entropy": 1.24421826004982, + "epoch": 0.03840384038403841, + "grad_norm": 0.3413639962673187, + "learning_rate": 0.00019996979182641383, + "loss": 1.2116, + "mean_token_accuracy": 0.6725156307220459, + "num_tokens": 817193.0, + "step": 96 + }, + { + "entropy": 1.2131675779819489, + "epoch": 0.0388038803880388, + "grad_norm": 0.31536421179771423, + "learning_rate": 0.00019996669567960031, + "loss": 1.2337, + "mean_token_accuracy": 0.6649139970541, + "num_tokens": 825915.0, + "step": 97 + }, + { + "entropy": 1.2785483300685883, + "epoch": 0.0392039203920392, + "grad_norm": 0.3453619182109833, + "learning_rate": 0.00019996344853938534, + "loss": 1.2257, + "mean_token_accuracy": 0.6682975143194199, + "num_tokens": 833771.0, + "step": 98 + }, + { + "entropy": 1.2706316709518433, + "epoch": 0.039603960396039604, + "grad_norm": 0.34687721729278564, + "learning_rate": 0.00019996005041121871, + "loss": 1.2578, + "mean_token_accuracy": 0.6584849059581757, + "num_tokens": 842093.0, + "step": 99 + }, + { + "entropy": 1.310558557510376, + "epoch": 0.040004000400040006, + "grad_norm": 0.34193679690361023, + "learning_rate": 0.0001999565013008035, + "loss": 1.338, + "mean_token_accuracy": 0.6487725079059601, + "num_tokens": 850079.0, + "step": 100 + }, + { + "entropy": 1.2646283209323883, + "epoch": 0.04040404040404041, + "grad_norm": 0.3951033651828766, + "learning_rate": 0.00019995280121409636, + "loss": 1.3172, + "mean_token_accuracy": 0.6424316316843033, + "num_tokens": 858250.0, + "step": 101 + }, + { + "entropy": 1.2900939583778381, + "epoch": 0.0408040804080408, + "grad_norm": 0.3364447057247162, + "learning_rate": 0.00019994895015730717, + "loss": 1.2487, + "mean_token_accuracy": 0.6626600474119186, + "num_tokens": 866623.0, + "step": 102 + }, + { + "entropy": 1.294897198677063, + "epoch": 0.041204120412041204, + "grad_norm": 0.3506770431995392, + "learning_rate": 0.00019994494813689928, + "loss": 1.2672, + "mean_token_accuracy": 0.6523661762475967, + "num_tokens": 875370.0, + "step": 103 + }, + { + "entropy": 1.2744373679161072, + "epoch": 0.041604160416041605, + "grad_norm": 0.31772273778915405, + "learning_rate": 0.00019994079515958942, + "loss": 1.2437, + "mean_token_accuracy": 0.6669129282236099, + "num_tokens": 884081.0, + "step": 104 + }, + { + "entropy": 1.2323677241802216, + "epoch": 0.04200420042004201, + "grad_norm": 0.31223100423812866, + "learning_rate": 0.00019993649123234758, + "loss": 1.2034, + "mean_token_accuracy": 0.6670378148555756, + "num_tokens": 892383.0, + "step": 105 + }, + { + "entropy": 1.1459662318229675, + "epoch": 0.0424042404240424, + "grad_norm": 0.3307859003543854, + "learning_rate": 0.00019993203636239717, + "loss": 1.2135, + "mean_token_accuracy": 0.6718799471855164, + "num_tokens": 900628.0, + "step": 106 + }, + { + "entropy": 1.2268281877040863, + "epoch": 0.0428042804280428, + "grad_norm": 0.35912272334098816, + "learning_rate": 0.00019992743055721493, + "loss": 1.2609, + "mean_token_accuracy": 0.6666164696216583, + "num_tokens": 909062.0, + "step": 107 + }, + { + "entropy": 1.200032651424408, + "epoch": 0.043204320432043204, + "grad_norm": 0.35117003321647644, + "learning_rate": 0.00019992267382453092, + "loss": 1.2047, + "mean_token_accuracy": 0.6681774854660034, + "num_tokens": 918221.0, + "step": 108 + }, + { + "entropy": 1.3714069724082947, + "epoch": 0.043604360436043606, + "grad_norm": 0.33686235547065735, + "learning_rate": 0.0001999177661723284, + "loss": 1.2777, + "mean_token_accuracy": 0.655053585767746, + "num_tokens": 926443.0, + "step": 109 + }, + { + "entropy": 1.3487186133861542, + "epoch": 0.04400440044004401, + "grad_norm": 0.3200630843639374, + "learning_rate": 0.0001999127076088441, + "loss": 1.3107, + "mean_token_accuracy": 0.6602136790752411, + "num_tokens": 934650.0, + "step": 110 + }, + { + "entropy": 1.2584488987922668, + "epoch": 0.0444044404440444, + "grad_norm": 0.31613630056381226, + "learning_rate": 0.0001999074981425679, + "loss": 1.2226, + "mean_token_accuracy": 0.6622737497091293, + "num_tokens": 942947.0, + "step": 111 + }, + { + "entropy": 1.1936236023902893, + "epoch": 0.0448044804480448, + "grad_norm": 0.316254198551178, + "learning_rate": 0.00019990213778224298, + "loss": 1.2106, + "mean_token_accuracy": 0.6652569025754929, + "num_tokens": 951465.0, + "step": 112 + }, + { + "entropy": 1.165192574262619, + "epoch": 0.045204520452045205, + "grad_norm": 0.31257057189941406, + "learning_rate": 0.00019989662653686576, + "loss": 1.2065, + "mean_token_accuracy": 0.6672259867191315, + "num_tokens": 960215.0, + "step": 113 + }, + { + "entropy": 1.180109590291977, + "epoch": 0.045604560456045606, + "grad_norm": 0.3332797884941101, + "learning_rate": 0.00019989096441568591, + "loss": 1.2285, + "mean_token_accuracy": 0.6671265214681625, + "num_tokens": 968893.0, + "step": 114 + }, + { + "entropy": 1.220985621213913, + "epoch": 0.04600460046004601, + "grad_norm": 0.3698706030845642, + "learning_rate": 0.0001998851514282063, + "loss": 1.2314, + "mean_token_accuracy": 0.6654269397258759, + "num_tokens": 976891.0, + "step": 115 + }, + { + "entropy": 1.2753552794456482, + "epoch": 0.0464046404640464, + "grad_norm": 0.32274726033210754, + "learning_rate": 0.00019987918758418308, + "loss": 1.2811, + "mean_token_accuracy": 0.6611100733280182, + "num_tokens": 984914.0, + "step": 116 + }, + { + "entropy": 1.308321624994278, + "epoch": 0.046804680468046804, + "grad_norm": 0.33258453011512756, + "learning_rate": 0.00019987307289362545, + "loss": 1.2541, + "mean_token_accuracy": 0.6605920940637589, + "num_tokens": 993096.0, + "step": 117 + }, + { + "entropy": 1.2893326878547668, + "epoch": 0.047204720472047206, + "grad_norm": 0.33915621042251587, + "learning_rate": 0.00019986680736679586, + "loss": 1.2511, + "mean_token_accuracy": 0.6640890389680862, + "num_tokens": 1001323.0, + "step": 118 + }, + { + "entropy": 1.30213862657547, + "epoch": 0.04760476047604761, + "grad_norm": 0.3717119097709656, + "learning_rate": 0.00019986039101420994, + "loss": 1.3143, + "mean_token_accuracy": 0.649169459939003, + "num_tokens": 1009892.0, + "step": 119 + }, + { + "entropy": 1.3021227717399597, + "epoch": 0.048004800480048, + "grad_norm": 0.32890114188194275, + "learning_rate": 0.0001998538238466364, + "loss": 1.2351, + "mean_token_accuracy": 0.6693892329931259, + "num_tokens": 1017992.0, + "step": 120 + }, + { + "entropy": 1.2010404765605927, + "epoch": 0.0484048404840484, + "grad_norm": 0.3222126066684723, + "learning_rate": 0.00019984710587509706, + "loss": 1.1934, + "mean_token_accuracy": 0.6745197772979736, + "num_tokens": 1026224.0, + "step": 121 + }, + { + "entropy": 1.2384890913963318, + "epoch": 0.048804880488048805, + "grad_norm": 0.32965728640556335, + "learning_rate": 0.00019984023711086687, + "loss": 1.2587, + "mean_token_accuracy": 0.6567209810018539, + "num_tokens": 1034674.0, + "step": 122 + }, + { + "entropy": 1.1893330216407776, + "epoch": 0.049204920492049206, + "grad_norm": 0.3488786518573761, + "learning_rate": 0.0001998332175654739, + "loss": 1.1999, + "mean_token_accuracy": 0.6683076322078705, + "num_tokens": 1042546.0, + "step": 123 + }, + { + "entropy": 1.2300190329551697, + "epoch": 0.04960496049604961, + "grad_norm": 0.33502018451690674, + "learning_rate": 0.00019982604725069918, + "loss": 1.2714, + "mean_token_accuracy": 0.6550982743501663, + "num_tokens": 1051075.0, + "step": 124 + }, + { + "entropy": 1.263420820236206, + "epoch": 0.05000500050005, + "grad_norm": 0.35562458634376526, + "learning_rate": 0.00019981872617857684, + "loss": 1.2535, + "mean_token_accuracy": 0.6570105701684952, + "num_tokens": 1059384.0, + "step": 125 + }, + { + "entropy": 1.2463673949241638, + "epoch": 0.050405040504050404, + "grad_norm": 0.3122851252555847, + "learning_rate": 0.00019981125436139405, + "loss": 1.2035, + "mean_token_accuracy": 0.6734038293361664, + "num_tokens": 1068524.0, + "step": 126 + }, + { + "entropy": 1.3272143006324768, + "epoch": 0.050805080508050805, + "grad_norm": 0.37185049057006836, + "learning_rate": 0.00019980363181169096, + "loss": 1.2723, + "mean_token_accuracy": 0.6541654914617538, + "num_tokens": 1076256.0, + "step": 127 + }, + { + "entropy": 1.2414169907569885, + "epoch": 0.05120512051205121, + "grad_norm": 0.32138875126838684, + "learning_rate": 0.00019979585854226065, + "loss": 1.1992, + "mean_token_accuracy": 0.6784048974514008, + "num_tokens": 1084784.0, + "step": 128 + }, + { + "entropy": 1.1664628982543945, + "epoch": 0.05160516051605161, + "grad_norm": 0.31607839465141296, + "learning_rate": 0.00019978793456614918, + "loss": 1.1728, + "mean_token_accuracy": 0.6773318648338318, + "num_tokens": 1094177.0, + "step": 129 + }, + { + "entropy": 1.1460879147052765, + "epoch": 0.052005200520052, + "grad_norm": 0.3119550347328186, + "learning_rate": 0.0001997798598966556, + "loss": 1.1576, + "mean_token_accuracy": 0.6763872653245926, + "num_tokens": 1102808.0, + "step": 130 + }, + { + "entropy": 1.1866309642791748, + "epoch": 0.052405240524052404, + "grad_norm": 0.3441757261753082, + "learning_rate": 0.00019977163454733184, + "loss": 1.2228, + "mean_token_accuracy": 0.6688681393861771, + "num_tokens": 1111447.0, + "step": 131 + }, + { + "entropy": 1.1310507953166962, + "epoch": 0.052805280528052806, + "grad_norm": 0.3540189862251282, + "learning_rate": 0.00019976325853198268, + "loss": 1.1514, + "mean_token_accuracy": 0.6831837445497513, + "num_tokens": 1120000.0, + "step": 132 + }, + { + "entropy": 1.19211745262146, + "epoch": 0.05320532053205321, + "grad_norm": 0.3323245942592621, + "learning_rate": 0.00019975473186466583, + "loss": 1.2119, + "mean_token_accuracy": 0.6718263179063797, + "num_tokens": 1128658.0, + "step": 133 + }, + { + "entropy": 1.1928575336933136, + "epoch": 0.0536053605360536, + "grad_norm": 0.34882429242134094, + "learning_rate": 0.0001997460545596918, + "loss": 1.2066, + "mean_token_accuracy": 0.6791622638702393, + "num_tokens": 1137143.0, + "step": 134 + }, + { + "entropy": 1.226127952337265, + "epoch": 0.054005400540054004, + "grad_norm": 0.3233380913734436, + "learning_rate": 0.00019973722663162396, + "loss": 1.1884, + "mean_token_accuracy": 0.6750646978616714, + "num_tokens": 1145501.0, + "step": 135 + }, + { + "entropy": 1.2761054337024689, + "epoch": 0.054405440544054405, + "grad_norm": 0.308118611574173, + "learning_rate": 0.00019972824809527838, + "loss": 1.224, + "mean_token_accuracy": 0.6631017774343491, + "num_tokens": 1153912.0, + "step": 136 + }, + { + "entropy": 1.3157364130020142, + "epoch": 0.05480548054805481, + "grad_norm": 0.33582690358161926, + "learning_rate": 0.00019971911896572405, + "loss": 1.2701, + "mean_token_accuracy": 0.6578985750675201, + "num_tokens": 1161769.0, + "step": 137 + }, + { + "entropy": 1.2075002789497375, + "epoch": 0.05520552055205521, + "grad_norm": 0.3170996606349945, + "learning_rate": 0.00019970983925828256, + "loss": 1.1906, + "mean_token_accuracy": 0.6732707768678665, + "num_tokens": 1170319.0, + "step": 138 + }, + { + "entropy": 1.1732978522777557, + "epoch": 0.0556055605560556, + "grad_norm": 0.32156452536582947, + "learning_rate": 0.0001997004089885283, + "loss": 1.1782, + "mean_token_accuracy": 0.6732619553804398, + "num_tokens": 1178801.0, + "step": 139 + }, + { + "entropy": 1.1573354601860046, + "epoch": 0.056005600560056004, + "grad_norm": 0.33083587884902954, + "learning_rate": 0.00019969082817228832, + "loss": 1.2067, + "mean_token_accuracy": 0.6737565696239471, + "num_tokens": 1186994.0, + "step": 140 + }, + { + "entropy": 1.211174637079239, + "epoch": 0.056405640564056406, + "grad_norm": 0.34685665369033813, + "learning_rate": 0.00019968109682564237, + "loss": 1.2586, + "mean_token_accuracy": 0.6569341272115707, + "num_tokens": 1194743.0, + "step": 141 + }, + { + "entropy": 1.2521505057811737, + "epoch": 0.05680568056805681, + "grad_norm": 0.35258418321609497, + "learning_rate": 0.00019967121496492282, + "loss": 1.2599, + "mean_token_accuracy": 0.6645904332399368, + "num_tokens": 1202435.0, + "step": 142 + }, + { + "entropy": 1.2398549616336823, + "epoch": 0.05720572057205721, + "grad_norm": 0.3388517200946808, + "learning_rate": 0.00019966118260671465, + "loss": 1.2081, + "mean_token_accuracy": 0.6675426363945007, + "num_tokens": 1210326.0, + "step": 143 + }, + { + "entropy": 1.297620803117752, + "epoch": 0.0576057605760576, + "grad_norm": 0.34630584716796875, + "learning_rate": 0.0001996509997678554, + "loss": 1.2857, + "mean_token_accuracy": 0.6573289930820465, + "num_tokens": 1218682.0, + "step": 144 + }, + { + "entropy": 1.248921811580658, + "epoch": 0.058005800580058005, + "grad_norm": 0.33417370915412903, + "learning_rate": 0.00019964066646543517, + "loss": 1.2036, + "mean_token_accuracy": 0.6730931401252747, + "num_tokens": 1227725.0, + "step": 145 + }, + { + "entropy": 1.2742219269275665, + "epoch": 0.058405840584058406, + "grad_norm": 0.31867334246635437, + "learning_rate": 0.00019963018271679667, + "loss": 1.2356, + "mean_token_accuracy": 0.6603083312511444, + "num_tokens": 1236112.0, + "step": 146 + }, + { + "entropy": 1.2454158961772919, + "epoch": 0.05880588058805881, + "grad_norm": 0.31619757413864136, + "learning_rate": 0.000199619548539535, + "loss": 1.2272, + "mean_token_accuracy": 0.664936900138855, + "num_tokens": 1244932.0, + "step": 147 + }, + { + "entropy": 1.1861615478992462, + "epoch": 0.05920592059205921, + "grad_norm": 0.3590589761734009, + "learning_rate": 0.00019960876395149778, + "loss": 1.2122, + "mean_token_accuracy": 0.6684562414884567, + "num_tokens": 1253316.0, + "step": 148 + }, + { + "entropy": 1.1777002215385437, + "epoch": 0.059605960596059604, + "grad_norm": 0.3057377338409424, + "learning_rate": 0.00019959782897078504, + "loss": 1.1483, + "mean_token_accuracy": 0.6810255944728851, + "num_tokens": 1261895.0, + "step": 149 + }, + { + "entropy": 1.2077372670173645, + "epoch": 0.060006000600060005, + "grad_norm": 0.32661283016204834, + "learning_rate": 0.00019958674361574927, + "loss": 1.2242, + "mean_token_accuracy": 0.6603673696517944, + "num_tokens": 1270647.0, + "step": 150 + }, + { + "entropy": 1.2129946649074554, + "epoch": 0.06040604060406041, + "grad_norm": 0.33181479573249817, + "learning_rate": 0.00019957550790499526, + "loss": 1.214, + "mean_token_accuracy": 0.6734245270490646, + "num_tokens": 1279483.0, + "step": 151 + }, + { + "entropy": 1.2279469072818756, + "epoch": 0.06080608060806081, + "grad_norm": 0.36564233899116516, + "learning_rate": 0.00019956412185738025, + "loss": 1.2227, + "mean_token_accuracy": 0.664169505238533, + "num_tokens": 1288062.0, + "step": 152 + }, + { + "entropy": 1.1853630542755127, + "epoch": 0.0612061206120612, + "grad_norm": 0.3081769645214081, + "learning_rate": 0.0001995525854920137, + "loss": 1.2009, + "mean_token_accuracy": 0.6692493110895157, + "num_tokens": 1296644.0, + "step": 153 + }, + { + "entropy": 1.1182245910167694, + "epoch": 0.061606160616061605, + "grad_norm": 0.28534799814224243, + "learning_rate": 0.00019954089882825738, + "loss": 1.0659, + "mean_token_accuracy": 0.7025346755981445, + "num_tokens": 1305683.0, + "step": 154 + }, + { + "entropy": 1.1886220276355743, + "epoch": 0.062006200620062006, + "grad_norm": 0.3182019293308258, + "learning_rate": 0.0001995290618857253, + "loss": 1.1576, + "mean_token_accuracy": 0.6741877645254135, + "num_tokens": 1314385.0, + "step": 155 + }, + { + "entropy": 1.2045941054821014, + "epoch": 0.06240624062406241, + "grad_norm": 0.3276945948600769, + "learning_rate": 0.0001995170746842838, + "loss": 1.165, + "mean_token_accuracy": 0.6834963709115982, + "num_tokens": 1322826.0, + "step": 156 + }, + { + "entropy": 1.2731471955776215, + "epoch": 0.0628062806280628, + "grad_norm": 0.3397105932235718, + "learning_rate": 0.00019950493724405117, + "loss": 1.2985, + "mean_token_accuracy": 0.648296907544136, + "num_tokens": 1331327.0, + "step": 157 + }, + { + "entropy": 1.1947194337844849, + "epoch": 0.0632063206320632, + "grad_norm": 0.2986201047897339, + "learning_rate": 0.00019949264958539807, + "loss": 1.205, + "mean_token_accuracy": 0.6792440861463547, + "num_tokens": 1340147.0, + "step": 158 + }, + { + "entropy": 1.1570270955562592, + "epoch": 0.0636063606360636, + "grad_norm": 0.3215077519416809, + "learning_rate": 0.00019948021172894718, + "loss": 1.1681, + "mean_token_accuracy": 0.6815727949142456, + "num_tokens": 1348989.0, + "step": 159 + }, + { + "entropy": 1.122036024928093, + "epoch": 0.064006400640064, + "grad_norm": 0.3120049238204956, + "learning_rate": 0.00019946762369557323, + "loss": 1.1377, + "mean_token_accuracy": 0.6871893852949142, + "num_tokens": 1357863.0, + "step": 160 + }, + { + "entropy": 1.2672194242477417, + "epoch": 0.06440644064406441, + "grad_norm": 0.33700302243232727, + "learning_rate": 0.00019945488550640313, + "loss": 1.2532, + "mean_token_accuracy": 0.664255827665329, + "num_tokens": 1365945.0, + "step": 161 + }, + { + "entropy": 1.1509548127651215, + "epoch": 0.06480648064806481, + "grad_norm": 0.3201735019683838, + "learning_rate": 0.00019944199718281559, + "loss": 1.1387, + "mean_token_accuracy": 0.6814217865467072, + "num_tokens": 1375147.0, + "step": 162 + }, + { + "entropy": 1.1635609865188599, + "epoch": 0.06520652065206521, + "grad_norm": 0.2953193187713623, + "learning_rate": 0.0001994289587464415, + "loss": 1.1817, + "mean_token_accuracy": 0.6780352145433426, + "num_tokens": 1383893.0, + "step": 163 + }, + { + "entropy": 1.1869005262851715, + "epoch": 0.06560656065606561, + "grad_norm": 0.30155807733535767, + "learning_rate": 0.00019941577021916355, + "loss": 1.1834, + "mean_token_accuracy": 0.6724350303411484, + "num_tokens": 1392477.0, + "step": 164 + }, + { + "entropy": 1.1506932377815247, + "epoch": 0.066006600660066, + "grad_norm": 0.31121376156806946, + "learning_rate": 0.00019940243162311642, + "loss": 1.1673, + "mean_token_accuracy": 0.6797937452793121, + "num_tokens": 1400899.0, + "step": 165 + }, + { + "entropy": 1.2660083770751953, + "epoch": 0.0664066406640664, + "grad_norm": 0.3299071788787842, + "learning_rate": 0.00019938894298068661, + "loss": 1.2725, + "mean_token_accuracy": 0.6537068784236908, + "num_tokens": 1409546.0, + "step": 166 + }, + { + "entropy": 1.2500199675559998, + "epoch": 0.0668066806680668, + "grad_norm": 0.3030771017074585, + "learning_rate": 0.00019937530431451243, + "loss": 1.1776, + "mean_token_accuracy": 0.6745365858078003, + "num_tokens": 1417712.0, + "step": 167 + }, + { + "entropy": 1.2582001090049744, + "epoch": 0.0672067206720672, + "grad_norm": 0.30366259813308716, + "learning_rate": 0.00019936151564748403, + "loss": 1.2339, + "mean_token_accuracy": 0.6664343029260635, + "num_tokens": 1426352.0, + "step": 168 + }, + { + "entropy": 1.2371725142002106, + "epoch": 0.0676067606760676, + "grad_norm": 0.3065868616104126, + "learning_rate": 0.00019934757700274325, + "loss": 1.223, + "mean_token_accuracy": 0.6679128706455231, + "num_tokens": 1434986.0, + "step": 169 + }, + { + "entropy": 1.2751116156578064, + "epoch": 0.06800680068006801, + "grad_norm": 0.3346325755119324, + "learning_rate": 0.00019933348840368368, + "loss": 1.2569, + "mean_token_accuracy": 0.6594884544610977, + "num_tokens": 1442823.0, + "step": 170 + }, + { + "entropy": 1.1633991301059723, + "epoch": 0.06840684068406841, + "grad_norm": 0.3242139518260956, + "learning_rate": 0.0001993192498739506, + "loss": 1.1805, + "mean_token_accuracy": 0.6728992164134979, + "num_tokens": 1451134.0, + "step": 171 + }, + { + "entropy": 1.2180014848709106, + "epoch": 0.06880688068806881, + "grad_norm": 0.3972644507884979, + "learning_rate": 0.0001993048614374409, + "loss": 1.2393, + "mean_token_accuracy": 0.6580066382884979, + "num_tokens": 1459262.0, + "step": 172 + }, + { + "entropy": 1.1176005005836487, + "epoch": 0.06920692069206921, + "grad_norm": 0.3137458264827728, + "learning_rate": 0.00019929032311830303, + "loss": 1.1644, + "mean_token_accuracy": 0.6814699321985245, + "num_tokens": 1467853.0, + "step": 173 + }, + { + "entropy": 1.1198759078979492, + "epoch": 0.0696069606960696, + "grad_norm": 0.3517007529735565, + "learning_rate": 0.000199275634940937, + "loss": 1.1312, + "mean_token_accuracy": 0.6874582916498184, + "num_tokens": 1476497.0, + "step": 174 + }, + { + "entropy": 1.2389306426048279, + "epoch": 0.07000700070007, + "grad_norm": 0.32016775012016296, + "learning_rate": 0.00019926079692999445, + "loss": 1.214, + "mean_token_accuracy": 0.6705743223428726, + "num_tokens": 1484294.0, + "step": 175 + }, + { + "entropy": 1.3337944746017456, + "epoch": 0.0704070407040704, + "grad_norm": 0.33495742082595825, + "learning_rate": 0.00019924580911037827, + "loss": 1.2954, + "mean_token_accuracy": 0.6510952711105347, + "num_tokens": 1492575.0, + "step": 176 + }, + { + "entropy": 1.2905775010585785, + "epoch": 0.0708070807080708, + "grad_norm": 0.3236202001571655, + "learning_rate": 0.00019923067150724296, + "loss": 1.219, + "mean_token_accuracy": 0.6705390512943268, + "num_tokens": 1500716.0, + "step": 177 + }, + { + "entropy": 1.2353481650352478, + "epoch": 0.0712071207120712, + "grad_norm": 0.3262037932872772, + "learning_rate": 0.00019921538414599437, + "loss": 1.2076, + "mean_token_accuracy": 0.6677059978246689, + "num_tokens": 1509105.0, + "step": 178 + }, + { + "entropy": 1.2299005091190338, + "epoch": 0.07160716071607161, + "grad_norm": 0.3147687315940857, + "learning_rate": 0.00019919994705228965, + "loss": 1.2301, + "mean_token_accuracy": 0.6644129753112793, + "num_tokens": 1516981.0, + "step": 179 + }, + { + "entropy": 1.1565956473350525, + "epoch": 0.07200720072007201, + "grad_norm": 0.31962037086486816, + "learning_rate": 0.00019918436025203728, + "loss": 1.2013, + "mean_token_accuracy": 0.6825570911169052, + "num_tokens": 1524951.0, + "step": 180 + }, + { + "entropy": 1.1386863589286804, + "epoch": 0.07240724072407241, + "grad_norm": 0.30647844076156616, + "learning_rate": 0.00019916862377139695, + "loss": 1.1697, + "mean_token_accuracy": 0.6716460883617401, + "num_tokens": 1533450.0, + "step": 181 + }, + { + "entropy": 1.1206298768520355, + "epoch": 0.07280728072807281, + "grad_norm": 0.2919379472732544, + "learning_rate": 0.00019915273763677959, + "loss": 1.1221, + "mean_token_accuracy": 0.6845085620880127, + "num_tokens": 1542345.0, + "step": 182 + }, + { + "entropy": 1.1708945035934448, + "epoch": 0.07320732073207321, + "grad_norm": 0.3223237097263336, + "learning_rate": 0.00019913670187484737, + "loss": 1.1722, + "mean_token_accuracy": 0.681228905916214, + "num_tokens": 1551016.0, + "step": 183 + }, + { + "entropy": 1.1606915593147278, + "epoch": 0.0736073607360736, + "grad_norm": 0.3167206943035126, + "learning_rate": 0.00019912051651251346, + "loss": 1.1381, + "mean_token_accuracy": 0.686376079916954, + "num_tokens": 1560201.0, + "step": 184 + }, + { + "entropy": 1.2089463472366333, + "epoch": 0.074007400740074, + "grad_norm": 0.331546813249588, + "learning_rate": 0.00019910418157694217, + "loss": 1.1998, + "mean_token_accuracy": 0.6701401472091675, + "num_tokens": 1568847.0, + "step": 185 + }, + { + "entropy": 1.2552906274795532, + "epoch": 0.0744074407440744, + "grad_norm": 0.3218790292739868, + "learning_rate": 0.00019908769709554887, + "loss": 1.2302, + "mean_token_accuracy": 0.6671873778104782, + "num_tokens": 1577212.0, + "step": 186 + }, + { + "entropy": 1.0971337109804153, + "epoch": 0.0748074807480748, + "grad_norm": 0.2888547480106354, + "learning_rate": 0.00019907106309599985, + "loss": 1.1053, + "mean_token_accuracy": 0.6914333999156952, + "num_tokens": 1586544.0, + "step": 187 + }, + { + "entropy": 1.1342568099498749, + "epoch": 0.07520752075207521, + "grad_norm": 0.3135220408439636, + "learning_rate": 0.00019905427960621245, + "loss": 1.1553, + "mean_token_accuracy": 0.678636908531189, + "num_tokens": 1595573.0, + "step": 188 + }, + { + "entropy": 1.2157914340496063, + "epoch": 0.07560756075607561, + "grad_norm": 0.32912546396255493, + "learning_rate": 0.00019903734665435472, + "loss": 1.2219, + "mean_token_accuracy": 0.6693233996629715, + "num_tokens": 1603723.0, + "step": 189 + }, + { + "entropy": 1.1541197896003723, + "epoch": 0.07600760076007601, + "grad_norm": 0.31249913573265076, + "learning_rate": 0.00019902026426884574, + "loss": 1.1311, + "mean_token_accuracy": 0.6898495107889175, + "num_tokens": 1612212.0, + "step": 190 + }, + { + "entropy": 1.211905598640442, + "epoch": 0.07640764076407641, + "grad_norm": 0.3106580078601837, + "learning_rate": 0.00019900303247835527, + "loss": 1.168, + "mean_token_accuracy": 0.675964280962944, + "num_tokens": 1620162.0, + "step": 191 + }, + { + "entropy": 1.2080174088478088, + "epoch": 0.07680768076807681, + "grad_norm": 0.32318130135536194, + "learning_rate": 0.00019898565131180393, + "loss": 1.1781, + "mean_token_accuracy": 0.6760376244783401, + "num_tokens": 1628883.0, + "step": 192 + }, + { + "entropy": 1.2078506350517273, + "epoch": 0.0772077207720772, + "grad_norm": 0.33328673243522644, + "learning_rate": 0.0001989681207983629, + "loss": 1.2092, + "mean_token_accuracy": 0.6628051847219467, + "num_tokens": 1637332.0, + "step": 193 + }, + { + "entropy": 1.210196852684021, + "epoch": 0.0776077607760776, + "grad_norm": 0.32340574264526367, + "learning_rate": 0.00019895044096745416, + "loss": 1.2329, + "mean_token_accuracy": 0.6619292944669724, + "num_tokens": 1645906.0, + "step": 194 + }, + { + "entropy": 1.1815847158432007, + "epoch": 0.078007800780078, + "grad_norm": 0.3175504505634308, + "learning_rate": 0.00019893261184875016, + "loss": 1.2045, + "mean_token_accuracy": 0.6673628389835358, + "num_tokens": 1654114.0, + "step": 195 + }, + { + "entropy": 1.1910730004310608, + "epoch": 0.0784078407840784, + "grad_norm": 0.3114391565322876, + "learning_rate": 0.00019891463347217395, + "loss": 1.1889, + "mean_token_accuracy": 0.6714468449354172, + "num_tokens": 1662666.0, + "step": 196 + }, + { + "entropy": 1.1541639566421509, + "epoch": 0.07880788078807881, + "grad_norm": 0.3364032506942749, + "learning_rate": 0.0001988965058678992, + "loss": 1.1622, + "mean_token_accuracy": 0.67988321185112, + "num_tokens": 1671435.0, + "step": 197 + }, + { + "entropy": 1.222437858581543, + "epoch": 0.07920792079207921, + "grad_norm": 0.3355000913143158, + "learning_rate": 0.00019887822906634983, + "loss": 1.1804, + "mean_token_accuracy": 0.6725995391607285, + "num_tokens": 1679662.0, + "step": 198 + }, + { + "entropy": 1.2075644731521606, + "epoch": 0.07960796079607961, + "grad_norm": 0.33377805352211, + "learning_rate": 0.00019885980309820032, + "loss": 1.1547, + "mean_token_accuracy": 0.6831348687410355, + "num_tokens": 1687663.0, + "step": 199 + }, + { + "entropy": 1.248348981142044, + "epoch": 0.08000800080008001, + "grad_norm": 0.3341095447540283, + "learning_rate": 0.0001988412279943754, + "loss": 1.2665, + "mean_token_accuracy": 0.6561878323554993, + "num_tokens": 1696479.0, + "step": 200 + }, + { + "entropy": 1.224026381969452, + "epoch": 0.08040804080408041, + "grad_norm": 0.33011487126350403, + "learning_rate": 0.00019882250378605015, + "loss": 1.2181, + "mean_token_accuracy": 0.6664289385080338, + "num_tokens": 1704885.0, + "step": 201 + }, + { + "entropy": 1.1437757015228271, + "epoch": 0.08080808080808081, + "grad_norm": 0.31265076994895935, + "learning_rate": 0.00019880363050464993, + "loss": 1.1773, + "mean_token_accuracy": 0.6812110096216202, + "num_tokens": 1713409.0, + "step": 202 + }, + { + "entropy": 1.2059556543827057, + "epoch": 0.0812081208120812, + "grad_norm": 0.315448135137558, + "learning_rate": 0.00019878460818185023, + "loss": 1.2278, + "mean_token_accuracy": 0.6699778735637665, + "num_tokens": 1721548.0, + "step": 203 + }, + { + "entropy": 1.2078820168972015, + "epoch": 0.0816081608160816, + "grad_norm": 0.3079279363155365, + "learning_rate": 0.00019876543684957667, + "loss": 1.1845, + "mean_token_accuracy": 0.6785111278295517, + "num_tokens": 1729809.0, + "step": 204 + }, + { + "entropy": 1.199218899011612, + "epoch": 0.082008200820082, + "grad_norm": 0.3043046295642853, + "learning_rate": 0.000198746116540005, + "loss": 1.1722, + "mean_token_accuracy": 0.6754065752029419, + "num_tokens": 1738734.0, + "step": 205 + }, + { + "entropy": 1.2172024846076965, + "epoch": 0.08240824082408241, + "grad_norm": 0.313902884721756, + "learning_rate": 0.00019872664728556101, + "loss": 1.1869, + "mean_token_accuracy": 0.6728281825780869, + "num_tokens": 1746870.0, + "step": 206 + }, + { + "entropy": 1.1678736209869385, + "epoch": 0.08280828082808281, + "grad_norm": 0.3191705644130707, + "learning_rate": 0.00019870702911892042, + "loss": 1.1546, + "mean_token_accuracy": 0.6843972355127335, + "num_tokens": 1755295.0, + "step": 207 + }, + { + "entropy": 1.279354214668274, + "epoch": 0.08320832083208321, + "grad_norm": 0.3313900828361511, + "learning_rate": 0.0001986872620730089, + "loss": 1.2558, + "mean_token_accuracy": 0.659809798002243, + "num_tokens": 1763606.0, + "step": 208 + }, + { + "entropy": 1.078108698129654, + "epoch": 0.08360836083608361, + "grad_norm": 0.283428430557251, + "learning_rate": 0.00019866734618100202, + "loss": 1.1032, + "mean_token_accuracy": 0.69297856092453, + "num_tokens": 1772887.0, + "step": 209 + }, + { + "entropy": 1.186295509338379, + "epoch": 0.08400840084008401, + "grad_norm": 0.35003766417503357, + "learning_rate": 0.0001986472814763251, + "loss": 1.2374, + "mean_token_accuracy": 0.6684627532958984, + "num_tokens": 1781067.0, + "step": 210 + }, + { + "entropy": 1.1557523012161255, + "epoch": 0.08440844084408441, + "grad_norm": 0.31848254799842834, + "learning_rate": 0.00019862706799265322, + "loss": 1.1854, + "mean_token_accuracy": 0.6773674935102463, + "num_tokens": 1789844.0, + "step": 211 + }, + { + "entropy": 1.218627154827118, + "epoch": 0.0848084808480848, + "grad_norm": 0.3408789038658142, + "learning_rate": 0.00019860670576391128, + "loss": 1.1708, + "mean_token_accuracy": 0.6817043423652649, + "num_tokens": 1798509.0, + "step": 212 + }, + { + "entropy": 1.2130761444568634, + "epoch": 0.0852085208520852, + "grad_norm": 0.7527572512626648, + "learning_rate": 0.0001985861948242736, + "loss": 1.2157, + "mean_token_accuracy": 0.6661449372768402, + "num_tokens": 1807202.0, + "step": 213 + }, + { + "entropy": 1.2128455638885498, + "epoch": 0.0856085608560856, + "grad_norm": 0.29946374893188477, + "learning_rate": 0.00019856553520816435, + "loss": 1.1896, + "mean_token_accuracy": 0.6733538210391998, + "num_tokens": 1816131.0, + "step": 214 + }, + { + "entropy": 1.2612944841384888, + "epoch": 0.086008600860086, + "grad_norm": 0.32515719532966614, + "learning_rate": 0.00019854472695025698, + "loss": 1.2329, + "mean_token_accuracy": 0.669788658618927, + "num_tokens": 1824283.0, + "step": 215 + }, + { + "entropy": 1.1807590425014496, + "epoch": 0.08640864086408641, + "grad_norm": 0.3279406726360321, + "learning_rate": 0.0001985237700854746, + "loss": 1.1565, + "mean_token_accuracy": 0.6816118210554123, + "num_tokens": 1833322.0, + "step": 216 + }, + { + "entropy": 1.2046120464801788, + "epoch": 0.08680868086808681, + "grad_norm": 0.2987005412578583, + "learning_rate": 0.00019850266464898955, + "loss": 1.179, + "mean_token_accuracy": 0.6783045381307602, + "num_tokens": 1842092.0, + "step": 217 + }, + { + "entropy": 1.1976227462291718, + "epoch": 0.08720872087208721, + "grad_norm": 0.30504319071769714, + "learning_rate": 0.00019848141067622374, + "loss": 1.1589, + "mean_token_accuracy": 0.6762242764234543, + "num_tokens": 1850740.0, + "step": 218 + }, + { + "entropy": 1.2001455426216125, + "epoch": 0.08760876087608761, + "grad_norm": 0.35163310170173645, + "learning_rate": 0.0001984600082028482, + "loss": 1.1941, + "mean_token_accuracy": 0.6701504737138748, + "num_tokens": 1858729.0, + "step": 219 + }, + { + "entropy": 1.0998838245868683, + "epoch": 0.08800880088008801, + "grad_norm": 0.3166980445384979, + "learning_rate": 0.0001984384572647832, + "loss": 1.1238, + "mean_token_accuracy": 0.683118149638176, + "num_tokens": 1867218.0, + "step": 220 + }, + { + "entropy": 1.1223637461662292, + "epoch": 0.0884088408840884, + "grad_norm": 0.3210962116718292, + "learning_rate": 0.0001984167578981983, + "loss": 1.158, + "mean_token_accuracy": 0.685064285993576, + "num_tokens": 1875656.0, + "step": 221 + }, + { + "entropy": 1.1469238698482513, + "epoch": 0.0888088808880888, + "grad_norm": 0.37055703997612, + "learning_rate": 0.00019839491013951213, + "loss": 1.1976, + "mean_token_accuracy": 0.66952283680439, + "num_tokens": 1884042.0, + "step": 222 + }, + { + "entropy": 1.2010729908943176, + "epoch": 0.0892089208920892, + "grad_norm": 0.30089443922042847, + "learning_rate": 0.00019837291402539223, + "loss": 1.1677, + "mean_token_accuracy": 0.6765223145484924, + "num_tokens": 1892519.0, + "step": 223 + }, + { + "entropy": 1.222718983888626, + "epoch": 0.0896089608960896, + "grad_norm": 0.3071632981300354, + "learning_rate": 0.00019835076959275532, + "loss": 1.1918, + "mean_token_accuracy": 0.6696299612522125, + "num_tokens": 1900924.0, + "step": 224 + }, + { + "entropy": 1.216365933418274, + "epoch": 0.09000900090009001, + "grad_norm": 0.3337574303150177, + "learning_rate": 0.00019832847687876692, + "loss": 1.1572, + "mean_token_accuracy": 0.6832773238420486, + "num_tokens": 1909276.0, + "step": 225 + }, + { + "entropy": 1.1910041272640228, + "epoch": 0.09040904090409041, + "grad_norm": 0.3146218955516815, + "learning_rate": 0.0001983060359208415, + "loss": 1.1782, + "mean_token_accuracy": 0.679167777299881, + "num_tokens": 1918407.0, + "step": 226 + }, + { + "entropy": 1.162790209054947, + "epoch": 0.09080908090809081, + "grad_norm": 0.2975619435310364, + "learning_rate": 0.0001982834467566423, + "loss": 1.1683, + "mean_token_accuracy": 0.6799277067184448, + "num_tokens": 1927282.0, + "step": 227 + }, + { + "entropy": 1.192271113395691, + "epoch": 0.09120912091209121, + "grad_norm": 0.3205324113368988, + "learning_rate": 0.0001982607094240813, + "loss": 1.1681, + "mean_token_accuracy": 0.6754294186830521, + "num_tokens": 1935737.0, + "step": 228 + }, + { + "entropy": 1.1858693957328796, + "epoch": 0.09160916091609161, + "grad_norm": 0.3366444706916809, + "learning_rate": 0.00019823782396131902, + "loss": 1.1944, + "mean_token_accuracy": 0.6657039225101471, + "num_tokens": 1943472.0, + "step": 229 + }, + { + "entropy": 1.1361185312271118, + "epoch": 0.09200920092009202, + "grad_norm": 0.31257081031799316, + "learning_rate": 0.00019821479040676488, + "loss": 1.1529, + "mean_token_accuracy": 0.6812857985496521, + "num_tokens": 1952251.0, + "step": 230 + }, + { + "entropy": 1.2052267491817474, + "epoch": 0.0924092409240924, + "grad_norm": 0.3371609151363373, + "learning_rate": 0.0001981916087990766, + "loss": 1.2363, + "mean_token_accuracy": 0.6580934226512909, + "num_tokens": 1960349.0, + "step": 231 + }, + { + "entropy": 1.1373478174209595, + "epoch": 0.0928092809280928, + "grad_norm": 0.30473393201828003, + "learning_rate": 0.00019816827917716048, + "loss": 1.1727, + "mean_token_accuracy": 0.6796131581068039, + "num_tokens": 1969233.0, + "step": 232 + }, + { + "entropy": 1.1681481301784515, + "epoch": 0.0932093209320932, + "grad_norm": 0.3225601315498352, + "learning_rate": 0.0001981448015801712, + "loss": 1.1528, + "mean_token_accuracy": 0.6749817878007889, + "num_tokens": 1977270.0, + "step": 233 + }, + { + "entropy": 1.2196559309959412, + "epoch": 0.09360936093609361, + "grad_norm": 0.33247852325439453, + "learning_rate": 0.00019812117604751185, + "loss": 1.1834, + "mean_token_accuracy": 0.6816778779029846, + "num_tokens": 1985087.0, + "step": 234 + }, + { + "entropy": 1.218104362487793, + "epoch": 0.09400940094009401, + "grad_norm": 0.3164643347263336, + "learning_rate": 0.00019809740261883372, + "loss": 1.1791, + "mean_token_accuracy": 0.6742540150880814, + "num_tokens": 1993142.0, + "step": 235 + }, + { + "entropy": 1.2172793745994568, + "epoch": 0.09440944094409441, + "grad_norm": 0.31248074769973755, + "learning_rate": 0.0001980734813340364, + "loss": 1.2067, + "mean_token_accuracy": 0.6745200008153915, + "num_tokens": 2001487.0, + "step": 236 + }, + { + "entropy": 1.203236162662506, + "epoch": 0.09480948094809481, + "grad_norm": 0.32407742738723755, + "learning_rate": 0.0001980494122332676, + "loss": 1.1664, + "mean_token_accuracy": 0.6777038276195526, + "num_tokens": 2010136.0, + "step": 237 + }, + { + "entropy": 1.1953341364860535, + "epoch": 0.09520952095209521, + "grad_norm": 0.3571881651878357, + "learning_rate": 0.00019802519535692302, + "loss": 1.1651, + "mean_token_accuracy": 0.6782020479440689, + "num_tokens": 2018515.0, + "step": 238 + }, + { + "entropy": 1.208018183708191, + "epoch": 0.09560956095609562, + "grad_norm": 0.3488442599773407, + "learning_rate": 0.00019800083074564658, + "loss": 1.2217, + "mean_token_accuracy": 0.6720796823501587, + "num_tokens": 2026942.0, + "step": 239 + }, + { + "entropy": 1.1499423384666443, + "epoch": 0.096009600960096, + "grad_norm": 0.30266088247299194, + "learning_rate": 0.00019797631844032992, + "loss": 1.1776, + "mean_token_accuracy": 0.6771319806575775, + "num_tokens": 2035674.0, + "step": 240 + }, + { + "entropy": 1.1237535774707794, + "epoch": 0.0964096409640964, + "grad_norm": 0.3096405863761902, + "learning_rate": 0.00019795165848211278, + "loss": 1.1122, + "mean_token_accuracy": 0.6934310793876648, + "num_tokens": 2044052.0, + "step": 241 + }, + { + "entropy": 1.1529573500156403, + "epoch": 0.0968096809680968, + "grad_norm": 0.3192532956600189, + "learning_rate": 0.0001979268509123825, + "loss": 1.1804, + "mean_token_accuracy": 0.6760334223508835, + "num_tokens": 2052448.0, + "step": 242 + }, + { + "entropy": 1.2383974194526672, + "epoch": 0.09720972097209721, + "grad_norm": 0.3160487711429596, + "learning_rate": 0.00019790189577277432, + "loss": 1.2465, + "mean_token_accuracy": 0.6652619689702988, + "num_tokens": 2060776.0, + "step": 243 + }, + { + "entropy": 1.2161905169487, + "epoch": 0.09760976097609761, + "grad_norm": 0.32217562198638916, + "learning_rate": 0.00019787679310517107, + "loss": 1.1872, + "mean_token_accuracy": 0.6732243746519089, + "num_tokens": 2068794.0, + "step": 244 + }, + { + "entropy": 1.1646412014961243, + "epoch": 0.09800980098009801, + "grad_norm": 0.3009166419506073, + "learning_rate": 0.00019785154295170316, + "loss": 1.1652, + "mean_token_accuracy": 0.6807472556829453, + "num_tokens": 2077262.0, + "step": 245 + }, + { + "entropy": 1.2155237197875977, + "epoch": 0.09840984098409841, + "grad_norm": 0.3069799840450287, + "learning_rate": 0.00019782614535474862, + "loss": 1.216, + "mean_token_accuracy": 0.6698369234800339, + "num_tokens": 2085649.0, + "step": 246 + }, + { + "entropy": 1.1119366884231567, + "epoch": 0.09880988098809881, + "grad_norm": 0.30247923731803894, + "learning_rate": 0.00019780060035693285, + "loss": 1.1038, + "mean_token_accuracy": 0.6942414045333862, + "num_tokens": 2094198.0, + "step": 247 + }, + { + "entropy": 1.2534517645835876, + "epoch": 0.09920992099209922, + "grad_norm": 0.3274390697479248, + "learning_rate": 0.0001977749080011287, + "loss": 1.2635, + "mean_token_accuracy": 0.6554094851016998, + "num_tokens": 2102101.0, + "step": 248 + }, + { + "entropy": 1.1967229545116425, + "epoch": 0.09960996099609962, + "grad_norm": 0.29584378004074097, + "learning_rate": 0.00019774906833045625, + "loss": 1.1822, + "mean_token_accuracy": 0.6769470870494843, + "num_tokens": 2110466.0, + "step": 249 + }, + { + "entropy": 1.1380691528320312, + "epoch": 0.1000100010001, + "grad_norm": 0.28823035955429077, + "learning_rate": 0.00019772308138828299, + "loss": 1.0987, + "mean_token_accuracy": 0.6907877773046494, + "num_tokens": 2119656.0, + "step": 250 + }, + { + "entropy": 1.155064195394516, + "epoch": 0.1004100410041004, + "grad_norm": 0.3187693655490875, + "learning_rate": 0.00019769694721822337, + "loss": 1.1542, + "mean_token_accuracy": 0.6734511256217957, + "num_tokens": 2128073.0, + "step": 251 + }, + { + "entropy": 1.1665138900279999, + "epoch": 0.10081008100810081, + "grad_norm": 0.30443915724754333, + "learning_rate": 0.00019767066586413905, + "loss": 1.2047, + "mean_token_accuracy": 0.6689727902412415, + "num_tokens": 2136624.0, + "step": 252 + }, + { + "entropy": 1.1986846625804901, + "epoch": 0.10121012101210121, + "grad_norm": 0.2993563413619995, + "learning_rate": 0.0001976442373701387, + "loss": 1.1885, + "mean_token_accuracy": 0.6774641126394272, + "num_tokens": 2144946.0, + "step": 253 + }, + { + "entropy": 1.1575412154197693, + "epoch": 0.10161016101610161, + "grad_norm": 0.31819280982017517, + "learning_rate": 0.00019761766178057796, + "loss": 1.1617, + "mean_token_accuracy": 0.6737077832221985, + "num_tokens": 2153241.0, + "step": 254 + }, + { + "entropy": 1.1932867169380188, + "epoch": 0.10201020102010201, + "grad_norm": 0.33500298857688904, + "learning_rate": 0.00019759093914005932, + "loss": 1.1739, + "mean_token_accuracy": 0.6722579598426819, + "num_tokens": 2161532.0, + "step": 255 + }, + { + "entropy": 1.2010496854782104, + "epoch": 0.10241024102410241, + "grad_norm": 0.3177407681941986, + "learning_rate": 0.00019756406949343204, + "loss": 1.1888, + "mean_token_accuracy": 0.6757108420133591, + "num_tokens": 2170296.0, + "step": 256 + }, + { + "entropy": 1.1958762109279633, + "epoch": 0.10281028102810282, + "grad_norm": 0.30990293622016907, + "learning_rate": 0.00019753705288579217, + "loss": 1.1797, + "mean_token_accuracy": 0.6757787764072418, + "num_tokens": 2178618.0, + "step": 257 + }, + { + "entropy": 1.1743170619010925, + "epoch": 0.10321032103210322, + "grad_norm": 0.3038559854030609, + "learning_rate": 0.00019750988936248235, + "loss": 1.169, + "mean_token_accuracy": 0.6733282506465912, + "num_tokens": 2187168.0, + "step": 258 + }, + { + "entropy": 1.1737709939479828, + "epoch": 0.1036103610361036, + "grad_norm": 0.321360319852829, + "learning_rate": 0.0001974825789690918, + "loss": 1.1957, + "mean_token_accuracy": 0.6770029366016388, + "num_tokens": 2195246.0, + "step": 259 + }, + { + "entropy": 1.172276645898819, + "epoch": 0.104010401040104, + "grad_norm": 0.3069777488708496, + "learning_rate": 0.00019745512175145627, + "loss": 1.2094, + "mean_token_accuracy": 0.6666506826877594, + "num_tokens": 2203717.0, + "step": 260 + }, + { + "entropy": 1.3047214448451996, + "epoch": 0.10441044104410441, + "grad_norm": 0.3076897859573364, + "learning_rate": 0.0001974275177556579, + "loss": 1.301, + "mean_token_accuracy": 0.6500514298677444, + "num_tokens": 2212037.0, + "step": 261 + }, + { + "entropy": 1.1853089034557343, + "epoch": 0.10481048104810481, + "grad_norm": 0.30814552307128906, + "learning_rate": 0.00019739976702802517, + "loss": 1.121, + "mean_token_accuracy": 0.6797177791595459, + "num_tokens": 2220415.0, + "step": 262 + }, + { + "entropy": 1.14727121591568, + "epoch": 0.10521052105210521, + "grad_norm": 0.3139231503009796, + "learning_rate": 0.0001973718696151329, + "loss": 1.0951, + "mean_token_accuracy": 0.6984894424676895, + "num_tokens": 2228773.0, + "step": 263 + }, + { + "entropy": 1.1453731060028076, + "epoch": 0.10561056105610561, + "grad_norm": 0.3104467988014221, + "learning_rate": 0.00019734382556380194, + "loss": 1.145, + "mean_token_accuracy": 0.6833966672420502, + "num_tokens": 2236602.0, + "step": 264 + }, + { + "entropy": 1.129274994134903, + "epoch": 0.10601060106010601, + "grad_norm": 0.29663506150245667, + "learning_rate": 0.0001973156349210994, + "loss": 1.1386, + "mean_token_accuracy": 0.6783726066350937, + "num_tokens": 2245313.0, + "step": 265 + }, + { + "entropy": 1.1950629949569702, + "epoch": 0.10641064106410641, + "grad_norm": 0.3033241033554077, + "learning_rate": 0.0001972872977343383, + "loss": 1.2095, + "mean_token_accuracy": 0.6765413582324982, + "num_tokens": 2254362.0, + "step": 266 + }, + { + "entropy": 1.2014857530593872, + "epoch": 0.10681068106810682, + "grad_norm": 0.31535446643829346, + "learning_rate": 0.00019725881405107778, + "loss": 1.2053, + "mean_token_accuracy": 0.6713583916425705, + "num_tokens": 2262331.0, + "step": 267 + }, + { + "entropy": 1.1801405549049377, + "epoch": 0.1072107210721072, + "grad_norm": 0.30611008405685425, + "learning_rate": 0.0001972301839191226, + "loss": 1.1823, + "mean_token_accuracy": 0.6748154610395432, + "num_tokens": 2270765.0, + "step": 268 + }, + { + "entropy": 1.1290169060230255, + "epoch": 0.1076107610761076, + "grad_norm": 0.30215638875961304, + "learning_rate": 0.00019720140738652345, + "loss": 1.1209, + "mean_token_accuracy": 0.6912433356046677, + "num_tokens": 2279593.0, + "step": 269 + }, + { + "entropy": 1.1610883474349976, + "epoch": 0.10801080108010801, + "grad_norm": 0.30377084016799927, + "learning_rate": 0.00019717248450157681, + "loss": 1.1863, + "mean_token_accuracy": 0.6740070879459381, + "num_tokens": 2288100.0, + "step": 270 + }, + { + "entropy": 1.1068450212478638, + "epoch": 0.10841084108410841, + "grad_norm": 0.3132963478565216, + "learning_rate": 0.00019714341531282462, + "loss": 1.0841, + "mean_token_accuracy": 0.6911667734384537, + "num_tokens": 2296290.0, + "step": 271 + }, + { + "entropy": 1.168148934841156, + "epoch": 0.10881088108810881, + "grad_norm": 0.3282947242259979, + "learning_rate": 0.0001971141998690545, + "loss": 1.1941, + "mean_token_accuracy": 0.673908457159996, + "num_tokens": 2304766.0, + "step": 272 + }, + { + "entropy": 1.1689501702785492, + "epoch": 0.10921092109210921, + "grad_norm": 0.2957140803337097, + "learning_rate": 0.00019708483821929943, + "loss": 1.1398, + "mean_token_accuracy": 0.6831405013799667, + "num_tokens": 2313114.0, + "step": 273 + }, + { + "entropy": 1.1905297338962555, + "epoch": 0.10961096109610961, + "grad_norm": 0.29807668924331665, + "learning_rate": 0.00019705533041283779, + "loss": 1.1736, + "mean_token_accuracy": 0.6775653660297394, + "num_tokens": 2321660.0, + "step": 274 + }, + { + "entropy": 1.1815482079982758, + "epoch": 0.11001100110011001, + "grad_norm": 0.29083186388015747, + "learning_rate": 0.00019702567649919337, + "loss": 1.1603, + "mean_token_accuracy": 0.6754807829856873, + "num_tokens": 2330342.0, + "step": 275 + }, + { + "entropy": 1.1261299550533295, + "epoch": 0.11041104110411042, + "grad_norm": 0.2901794910430908, + "learning_rate": 0.00019699587652813503, + "loss": 1.1284, + "mean_token_accuracy": 0.691281333565712, + "num_tokens": 2338852.0, + "step": 276 + }, + { + "entropy": 1.184859186410904, + "epoch": 0.11081108110811082, + "grad_norm": 0.310745507478714, + "learning_rate": 0.00019696593054967682, + "loss": 1.2127, + "mean_token_accuracy": 0.6673152446746826, + "num_tokens": 2346809.0, + "step": 277 + }, + { + "entropy": 1.1188380122184753, + "epoch": 0.1112111211121112, + "grad_norm": 0.29587554931640625, + "learning_rate": 0.00019693583861407786, + "loss": 1.0981, + "mean_token_accuracy": 0.6947813928127289, + "num_tokens": 2355532.0, + "step": 278 + }, + { + "entropy": 1.172318309545517, + "epoch": 0.1116111611161116, + "grad_norm": 0.3138435482978821, + "learning_rate": 0.00019690560077184223, + "loss": 1.1441, + "mean_token_accuracy": 0.6789282411336899, + "num_tokens": 2363938.0, + "step": 279 + }, + { + "entropy": 1.1374418139457703, + "epoch": 0.11201120112011201, + "grad_norm": 0.34152451157569885, + "learning_rate": 0.0001968752170737188, + "loss": 1.1081, + "mean_token_accuracy": 0.6848500221967697, + "num_tokens": 2372334.0, + "step": 280 + }, + { + "entropy": 1.1317946314811707, + "epoch": 0.11241124112411241, + "grad_norm": 0.29949530959129333, + "learning_rate": 0.0001968446875707014, + "loss": 1.1138, + "mean_token_accuracy": 0.6870416551828384, + "num_tokens": 2380730.0, + "step": 281 + }, + { + "entropy": 1.0892143547534943, + "epoch": 0.11281128112811281, + "grad_norm": 0.3009011447429657, + "learning_rate": 0.00019681401231402842, + "loss": 1.0712, + "mean_token_accuracy": 0.6998904794454575, + "num_tokens": 2389463.0, + "step": 282 + }, + { + "entropy": 1.1513322591781616, + "epoch": 0.11321132113211321, + "grad_norm": 0.29763105511665344, + "learning_rate": 0.00019678319135518294, + "loss": 1.1861, + "mean_token_accuracy": 0.6697124987840652, + "num_tokens": 2398473.0, + "step": 283 + }, + { + "entropy": 1.1688634753227234, + "epoch": 0.11361136113611361, + "grad_norm": 0.33001646399497986, + "learning_rate": 0.00019675222474589257, + "loss": 1.2012, + "mean_token_accuracy": 0.673338770866394, + "num_tokens": 2406493.0, + "step": 284 + }, + { + "entropy": 1.1393934190273285, + "epoch": 0.11401140114011402, + "grad_norm": 0.2978336215019226, + "learning_rate": 0.00019672111253812933, + "loss": 1.1566, + "mean_token_accuracy": 0.6849386692047119, + "num_tokens": 2414963.0, + "step": 285 + }, + { + "entropy": 1.1978220045566559, + "epoch": 0.11441144114411442, + "grad_norm": 0.296939879655838, + "learning_rate": 0.00019668985478410968, + "loss": 1.1508, + "mean_token_accuracy": 0.6871092170476913, + "num_tokens": 2423476.0, + "step": 286 + }, + { + "entropy": 1.1493785977363586, + "epoch": 0.1148114811481148, + "grad_norm": 0.3038109242916107, + "learning_rate": 0.00019665845153629425, + "loss": 1.1429, + "mean_token_accuracy": 0.6873074918985367, + "num_tokens": 2432015.0, + "step": 287 + }, + { + "entropy": 1.1764490902423859, + "epoch": 0.1152115211521152, + "grad_norm": 0.28137773275375366, + "learning_rate": 0.00019662690284738793, + "loss": 1.1206, + "mean_token_accuracy": 0.6875211298465729, + "num_tokens": 2440577.0, + "step": 288 + }, + { + "entropy": 1.1811064779758453, + "epoch": 0.11561156115611561, + "grad_norm": 0.2927968502044678, + "learning_rate": 0.00019659520877033976, + "loss": 1.1828, + "mean_token_accuracy": 0.67679663002491, + "num_tokens": 2449585.0, + "step": 289 + }, + { + "entropy": 1.1157205402851105, + "epoch": 0.11601160116011601, + "grad_norm": 0.2844160199165344, + "learning_rate": 0.0001965633693583426, + "loss": 1.1127, + "mean_token_accuracy": 0.6861093044281006, + "num_tokens": 2458691.0, + "step": 290 + }, + { + "entropy": 1.1210555136203766, + "epoch": 0.11641164116411641, + "grad_norm": 0.30678603053092957, + "learning_rate": 0.0001965313846648334, + "loss": 1.1495, + "mean_token_accuracy": 0.6870106756687164, + "num_tokens": 2466917.0, + "step": 291 + }, + { + "entropy": 1.1256535351276398, + "epoch": 0.11681168116811681, + "grad_norm": 0.31176719069480896, + "learning_rate": 0.00019649925474349292, + "loss": 1.1516, + "mean_token_accuracy": 0.679766371846199, + "num_tokens": 2475064.0, + "step": 292 + }, + { + "entropy": 1.1276935040950775, + "epoch": 0.11721172117211721, + "grad_norm": 0.29645654559135437, + "learning_rate": 0.00019646697964824562, + "loss": 1.1372, + "mean_token_accuracy": 0.6837837547063828, + "num_tokens": 2483736.0, + "step": 293 + }, + { + "entropy": 1.1446107029914856, + "epoch": 0.11761176117611762, + "grad_norm": 0.2959735691547394, + "learning_rate": 0.00019643455943325953, + "loss": 1.1344, + "mean_token_accuracy": 0.6885244697332382, + "num_tokens": 2492223.0, + "step": 294 + }, + { + "entropy": 1.1486328840255737, + "epoch": 0.11801180118011802, + "grad_norm": 0.35478872060775757, + "learning_rate": 0.00019640199415294645, + "loss": 1.1195, + "mean_token_accuracy": 0.6887603253126144, + "num_tokens": 2500600.0, + "step": 295 + }, + { + "entropy": 1.126534789800644, + "epoch": 0.11841184118411842, + "grad_norm": 0.2932710349559784, + "learning_rate": 0.00019636928386196145, + "loss": 1.1047, + "mean_token_accuracy": 0.696495532989502, + "num_tokens": 2509047.0, + "step": 296 + }, + { + "entropy": 1.1546699106693268, + "epoch": 0.1188118811881188, + "grad_norm": 0.2861276865005493, + "learning_rate": 0.00019633642861520306, + "loss": 1.1463, + "mean_token_accuracy": 0.6796572506427765, + "num_tokens": 2517885.0, + "step": 297 + }, + { + "entropy": 1.1594507992267609, + "epoch": 0.11921192119211921, + "grad_norm": 0.5982229709625244, + "learning_rate": 0.0001963034284678131, + "loss": 1.1527, + "mean_token_accuracy": 0.6782443970441818, + "num_tokens": 2525962.0, + "step": 298 + }, + { + "entropy": 1.1879192888736725, + "epoch": 0.11961196119611961, + "grad_norm": 0.30875492095947266, + "learning_rate": 0.00019627028347517648, + "loss": 1.1854, + "mean_token_accuracy": 0.675933450460434, + "num_tokens": 2534220.0, + "step": 299 + }, + { + "entropy": 1.1593869030475616, + "epoch": 0.12001200120012001, + "grad_norm": 0.3053128719329834, + "learning_rate": 0.00019623699369292137, + "loss": 1.1617, + "mean_token_accuracy": 0.677645817399025, + "num_tokens": 2542206.0, + "step": 300 + }, + { + "entropy": 1.1326042711734772, + "epoch": 0.12041204120412041, + "grad_norm": 0.3102218508720398, + "learning_rate": 0.00019620355917691884, + "loss": 1.1384, + "mean_token_accuracy": 0.6767238080501556, + "num_tokens": 2550584.0, + "step": 301 + }, + { + "entropy": 1.1040166020393372, + "epoch": 0.12081208120812081, + "grad_norm": 0.3166041970252991, + "learning_rate": 0.00019616997998328292, + "loss": 1.1206, + "mean_token_accuracy": 0.6878381818532944, + "num_tokens": 2558969.0, + "step": 302 + }, + { + "entropy": 1.1306456625461578, + "epoch": 0.12121212121212122, + "grad_norm": 0.31803345680236816, + "learning_rate": 0.00019613625616837034, + "loss": 1.1286, + "mean_token_accuracy": 0.6829645335674286, + "num_tokens": 2567510.0, + "step": 303 + }, + { + "entropy": 1.2087586522102356, + "epoch": 0.12161216121612162, + "grad_norm": 0.313399076461792, + "learning_rate": 0.0001961023877887807, + "loss": 1.2, + "mean_token_accuracy": 0.6653729230165482, + "num_tokens": 2575393.0, + "step": 304 + }, + { + "entropy": 1.1803353130817413, + "epoch": 0.12201220122012202, + "grad_norm": 0.2919938862323761, + "learning_rate": 0.0001960683749013562, + "loss": 1.1749, + "mean_token_accuracy": 0.6795784384012222, + "num_tokens": 2583973.0, + "step": 305 + }, + { + "entropy": 1.206252634525299, + "epoch": 0.1224122412241224, + "grad_norm": 0.30734333395957947, + "learning_rate": 0.00019603421756318146, + "loss": 1.2079, + "mean_token_accuracy": 0.6748498380184174, + "num_tokens": 2592413.0, + "step": 306 + }, + { + "entropy": 1.1237642168998718, + "epoch": 0.12281228122812281, + "grad_norm": 0.2940463721752167, + "learning_rate": 0.00019599991583158367, + "loss": 1.0924, + "mean_token_accuracy": 0.6870536357164383, + "num_tokens": 2601189.0, + "step": 307 + }, + { + "entropy": 1.1055436730384827, + "epoch": 0.12321232123212321, + "grad_norm": 0.2887219488620758, + "learning_rate": 0.00019596546976413226, + "loss": 1.1143, + "mean_token_accuracy": 0.6970756649971008, + "num_tokens": 2610378.0, + "step": 308 + }, + { + "entropy": 1.1455924063920975, + "epoch": 0.12361236123612361, + "grad_norm": 0.30642586946487427, + "learning_rate": 0.00019593087941863893, + "loss": 1.1163, + "mean_token_accuracy": 0.6846802532672882, + "num_tokens": 2618765.0, + "step": 309 + }, + { + "entropy": 1.1495613157749176, + "epoch": 0.12401240124012401, + "grad_norm": 0.2958558201789856, + "learning_rate": 0.00019589614485315766, + "loss": 1.1277, + "mean_token_accuracy": 0.692332923412323, + "num_tokens": 2627306.0, + "step": 310 + }, + { + "entropy": 1.1369233131408691, + "epoch": 0.12441244124412441, + "grad_norm": 0.2962513566017151, + "learning_rate": 0.0001958612661259842, + "loss": 1.1458, + "mean_token_accuracy": 0.6847312748432159, + "num_tokens": 2635802.0, + "step": 311 + }, + { + "entropy": 1.1192970275878906, + "epoch": 0.12481248124812482, + "grad_norm": 0.3100016117095947, + "learning_rate": 0.00019582624329565656, + "loss": 1.1479, + "mean_token_accuracy": 0.679630234837532, + "num_tokens": 2644316.0, + "step": 312 + }, + { + "entropy": 1.1962910890579224, + "epoch": 0.1252125212521252, + "grad_norm": 0.3248625099658966, + "learning_rate": 0.0001957910764209543, + "loss": 1.2285, + "mean_token_accuracy": 0.6648171693086624, + "num_tokens": 2652787.0, + "step": 313 + }, + { + "entropy": 1.1034400761127472, + "epoch": 0.1256125612561256, + "grad_norm": 0.2892885208129883, + "learning_rate": 0.00019575576556089897, + "loss": 1.1218, + "mean_token_accuracy": 0.685823604464531, + "num_tokens": 2661638.0, + "step": 314 + }, + { + "entropy": 1.1764290630817413, + "epoch": 0.126012601260126, + "grad_norm": 0.2998030483722687, + "learning_rate": 0.00019572031077475367, + "loss": 1.0975, + "mean_token_accuracy": 0.6871052384376526, + "num_tokens": 2670313.0, + "step": 315 + }, + { + "entropy": 1.2649544775485992, + "epoch": 0.1264126412641264, + "grad_norm": 0.31360095739364624, + "learning_rate": 0.0001956847121220231, + "loss": 1.2167, + "mean_token_accuracy": 0.660548061132431, + "num_tokens": 2678587.0, + "step": 316 + }, + { + "entropy": 1.1531548500061035, + "epoch": 0.1268126812681268, + "grad_norm": 0.3179381787776947, + "learning_rate": 0.0001956489696624533, + "loss": 1.1596, + "mean_token_accuracy": 0.6832859367132187, + "num_tokens": 2686845.0, + "step": 317 + }, + { + "entropy": 1.1491257846355438, + "epoch": 0.1272127212721272, + "grad_norm": 0.3010673224925995, + "learning_rate": 0.00019561308345603188, + "loss": 1.1856, + "mean_token_accuracy": 0.6756436675786972, + "num_tokens": 2695519.0, + "step": 318 + }, + { + "entropy": 1.099882572889328, + "epoch": 0.1276127612761276, + "grad_norm": 0.3057318925857544, + "learning_rate": 0.0001955770535629875, + "loss": 1.1369, + "mean_token_accuracy": 0.6802153438329697, + "num_tokens": 2704317.0, + "step": 319 + }, + { + "entropy": 1.1104555130004883, + "epoch": 0.128012801280128, + "grad_norm": 0.30537816882133484, + "learning_rate": 0.00019554088004379, + "loss": 1.0916, + "mean_token_accuracy": 0.6971182078123093, + "num_tokens": 2712576.0, + "step": 320 + }, + { + "entropy": 1.1894198954105377, + "epoch": 0.12841284128412842, + "grad_norm": 0.2941950261592865, + "learning_rate": 0.00019550456295915042, + "loss": 1.1728, + "mean_token_accuracy": 0.6762441992759705, + "num_tokens": 2721000.0, + "step": 321 + }, + { + "entropy": 1.1880941092967987, + "epoch": 0.12881288128812882, + "grad_norm": 0.3045370280742645, + "learning_rate": 0.00019546810237002066, + "loss": 1.1695, + "mean_token_accuracy": 0.6775896400213242, + "num_tokens": 2729281.0, + "step": 322 + }, + { + "entropy": 1.1603459417819977, + "epoch": 0.12921292129212922, + "grad_norm": 0.29477667808532715, + "learning_rate": 0.00019543149833759334, + "loss": 1.13, + "mean_token_accuracy": 0.6883135735988617, + "num_tokens": 2737775.0, + "step": 323 + }, + { + "entropy": 1.148952156305313, + "epoch": 0.12961296129612962, + "grad_norm": 0.2921348214149475, + "learning_rate": 0.000195394750923302, + "loss": 1.1492, + "mean_token_accuracy": 0.6808929741382599, + "num_tokens": 2746681.0, + "step": 324 + }, + { + "entropy": 1.2179997265338898, + "epoch": 0.13001300130013002, + "grad_norm": 0.3009890019893646, + "learning_rate": 0.0001953578601888208, + "loss": 1.2338, + "mean_token_accuracy": 0.6610979735851288, + "num_tokens": 2755045.0, + "step": 325 + }, + { + "entropy": 1.2134989798069, + "epoch": 0.13041304130413042, + "grad_norm": 0.3033868968486786, + "learning_rate": 0.00019532082619606436, + "loss": 1.2165, + "mean_token_accuracy": 0.6606318801641464, + "num_tokens": 2763287.0, + "step": 326 + }, + { + "entropy": 1.0881072580814362, + "epoch": 0.13081308130813082, + "grad_norm": 0.2861042022705078, + "learning_rate": 0.0001952836490071878, + "loss": 1.0643, + "mean_token_accuracy": 0.6997469067573547, + "num_tokens": 2772109.0, + "step": 327 + }, + { + "entropy": 1.2652019262313843, + "epoch": 0.13121312131213123, + "grad_norm": 0.3063291311264038, + "learning_rate": 0.00019524632868458649, + "loss": 1.2374, + "mean_token_accuracy": 0.6631722450256348, + "num_tokens": 2780001.0, + "step": 328 + }, + { + "entropy": 1.1232223510742188, + "epoch": 0.1316131613161316, + "grad_norm": 0.2938007712364197, + "learning_rate": 0.00019520886529089616, + "loss": 1.1047, + "mean_token_accuracy": 0.6943131983280182, + "num_tokens": 2788572.0, + "step": 329 + }, + { + "entropy": 1.182855635881424, + "epoch": 0.132013201320132, + "grad_norm": 0.2949009835720062, + "learning_rate": 0.00019517125888899255, + "loss": 1.1657, + "mean_token_accuracy": 0.6759148836135864, + "num_tokens": 2797349.0, + "step": 330 + }, + { + "entropy": 1.1421308815479279, + "epoch": 0.1324132413241324, + "grad_norm": 0.3349224328994751, + "learning_rate": 0.00019513350954199142, + "loss": 1.1379, + "mean_token_accuracy": 0.6823170036077499, + "num_tokens": 2805345.0, + "step": 331 + }, + { + "entropy": 1.0656911730766296, + "epoch": 0.1328132813281328, + "grad_norm": 0.3012828230857849, + "learning_rate": 0.00019509561731324848, + "loss": 1.0942, + "mean_token_accuracy": 0.6952732652425766, + "num_tokens": 2814123.0, + "step": 332 + }, + { + "entropy": 1.0468103885650635, + "epoch": 0.1332133213321332, + "grad_norm": 0.30162152647972107, + "learning_rate": 0.0001950575822663592, + "loss": 1.1012, + "mean_token_accuracy": 0.6894596368074417, + "num_tokens": 2823120.0, + "step": 333 + }, + { + "entropy": 1.089416727423668, + "epoch": 0.1336133613361336, + "grad_norm": 0.3064773976802826, + "learning_rate": 0.00019501940446515882, + "loss": 1.1036, + "mean_token_accuracy": 0.6885414123535156, + "num_tokens": 2831735.0, + "step": 334 + }, + { + "entropy": 1.1649364531040192, + "epoch": 0.134013401340134, + "grad_norm": 0.35003024339675903, + "learning_rate": 0.00019498108397372212, + "loss": 1.1766, + "mean_token_accuracy": 0.6764324754476547, + "num_tokens": 2839670.0, + "step": 335 + }, + { + "entropy": 1.1590066254138947, + "epoch": 0.1344134413441344, + "grad_norm": 0.26645922660827637, + "learning_rate": 0.0001949426208563633, + "loss": 1.1091, + "mean_token_accuracy": 0.6905470341444016, + "num_tokens": 2848911.0, + "step": 336 + }, + { + "entropy": 1.251402735710144, + "epoch": 0.1348134813481348, + "grad_norm": 0.31132251024246216, + "learning_rate": 0.000194904015177636, + "loss": 1.1918, + "mean_token_accuracy": 0.6727328300476074, + "num_tokens": 2857199.0, + "step": 337 + }, + { + "entropy": 1.220662236213684, + "epoch": 0.1352135213521352, + "grad_norm": 0.3061762750148773, + "learning_rate": 0.00019486526700233315, + "loss": 1.1868, + "mean_token_accuracy": 0.672507032752037, + "num_tokens": 2865223.0, + "step": 338 + }, + { + "entropy": 1.0638089627027512, + "epoch": 0.13561356135613561, + "grad_norm": 0.29525840282440186, + "learning_rate": 0.00019482637639548682, + "loss": 1.0514, + "mean_token_accuracy": 0.7034783512353897, + "num_tokens": 2873440.0, + "step": 339 + }, + { + "entropy": 1.1221419274806976, + "epoch": 0.13601360136013602, + "grad_norm": 0.2899990379810333, + "learning_rate": 0.00019478734342236808, + "loss": 1.1505, + "mean_token_accuracy": 0.675692155957222, + "num_tokens": 2882408.0, + "step": 340 + }, + { + "entropy": 1.145202785730362, + "epoch": 0.13641364136413642, + "grad_norm": 0.2904442250728607, + "learning_rate": 0.0001947481681484869, + "loss": 1.1848, + "mean_token_accuracy": 0.6750968992710114, + "num_tokens": 2891461.0, + "step": 341 + }, + { + "entropy": 1.081279844045639, + "epoch": 0.13681368136813682, + "grad_norm": 0.30348628759384155, + "learning_rate": 0.00019470885063959225, + "loss": 1.0734, + "mean_token_accuracy": 0.6975607126951218, + "num_tokens": 2900223.0, + "step": 342 + }, + { + "entropy": 1.0558022856712341, + "epoch": 0.13721372137213722, + "grad_norm": 0.28773176670074463, + "learning_rate": 0.00019466939096167164, + "loss": 1.0604, + "mean_token_accuracy": 0.6948001831769943, + "num_tokens": 2909084.0, + "step": 343 + }, + { + "entropy": 1.1171001195907593, + "epoch": 0.13761376137613762, + "grad_norm": 0.29017966985702515, + "learning_rate": 0.00019462978918095128, + "loss": 1.1181, + "mean_token_accuracy": 0.68596550822258, + "num_tokens": 2917795.0, + "step": 344 + }, + { + "entropy": 1.1633701920509338, + "epoch": 0.13801380138013802, + "grad_norm": 0.28877806663513184, + "learning_rate": 0.00019459004536389587, + "loss": 1.1716, + "mean_token_accuracy": 0.6693498939275742, + "num_tokens": 2925764.0, + "step": 345 + }, + { + "entropy": 1.2091334760189056, + "epoch": 0.13841384138413843, + "grad_norm": 0.3057492971420288, + "learning_rate": 0.00019455015957720842, + "loss": 1.2115, + "mean_token_accuracy": 0.6683546006679535, + "num_tokens": 2934337.0, + "step": 346 + }, + { + "entropy": 1.117457777261734, + "epoch": 0.13881388138813883, + "grad_norm": 0.3619987964630127, + "learning_rate": 0.0001945101318878303, + "loss": 1.0944, + "mean_token_accuracy": 0.6917587071657181, + "num_tokens": 2942882.0, + "step": 347 + }, + { + "entropy": 1.1964794397354126, + "epoch": 0.1392139213921392, + "grad_norm": 0.29087069630622864, + "learning_rate": 0.000194469962362941, + "loss": 1.1536, + "mean_token_accuracy": 0.6789288967847824, + "num_tokens": 2951358.0, + "step": 348 + }, + { + "entropy": 1.1352568864822388, + "epoch": 0.1396139613961396, + "grad_norm": 0.30058935284614563, + "learning_rate": 0.00019442965106995807, + "loss": 1.1042, + "mean_token_accuracy": 0.6969415545463562, + "num_tokens": 2959902.0, + "step": 349 + }, + { + "entropy": 1.1815881133079529, + "epoch": 0.14001400140014, + "grad_norm": 0.29818278551101685, + "learning_rate": 0.00019438919807653694, + "loss": 1.1937, + "mean_token_accuracy": 0.6777724772691727, + "num_tokens": 2968375.0, + "step": 350 + }, + { + "entropy": 1.1138464957475662, + "epoch": 0.1404140414041404, + "grad_norm": 0.29378682374954224, + "learning_rate": 0.00019434860345057096, + "loss": 1.136, + "mean_token_accuracy": 0.6846367418766022, + "num_tokens": 2976891.0, + "step": 351 + }, + { + "entropy": 1.1382241249084473, + "epoch": 0.1408140814081408, + "grad_norm": 0.298759788274765, + "learning_rate": 0.00019430786726019102, + "loss": 1.1675, + "mean_token_accuracy": 0.6828837245702744, + "num_tokens": 2984891.0, + "step": 352 + }, + { + "entropy": 1.2404142022132874, + "epoch": 0.1412141214121412, + "grad_norm": 0.3150947093963623, + "learning_rate": 0.00019426698957376585, + "loss": 1.2342, + "mean_token_accuracy": 0.6579574644565582, + "num_tokens": 2993072.0, + "step": 353 + }, + { + "entropy": 1.1687238216400146, + "epoch": 0.1416141614161416, + "grad_norm": 0.29389873147010803, + "learning_rate": 0.00019422597045990142, + "loss": 1.1767, + "mean_token_accuracy": 0.6675811409950256, + "num_tokens": 3001760.0, + "step": 354 + }, + { + "entropy": 1.1566392183303833, + "epoch": 0.142014201420142, + "grad_norm": 0.288309246301651, + "learning_rate": 0.00019418480998744118, + "loss": 1.1291, + "mean_token_accuracy": 0.6857695430517197, + "num_tokens": 3010111.0, + "step": 355 + }, + { + "entropy": 1.1949766874313354, + "epoch": 0.1424142414241424, + "grad_norm": 0.29533353447914124, + "learning_rate": 0.00019414350822546584, + "loss": 1.1664, + "mean_token_accuracy": 0.6795456558465958, + "num_tokens": 3018712.0, + "step": 356 + }, + { + "entropy": 1.1488195657730103, + "epoch": 0.14281428142814281, + "grad_norm": 0.3124019205570221, + "learning_rate": 0.00019410206524329314, + "loss": 1.129, + "mean_token_accuracy": 0.6900259405374527, + "num_tokens": 3026707.0, + "step": 357 + }, + { + "entropy": 1.1078391075134277, + "epoch": 0.14321432143214322, + "grad_norm": 0.4887332618236542, + "learning_rate": 0.00019406048111047792, + "loss": 1.1122, + "mean_token_accuracy": 0.6845664978027344, + "num_tokens": 3035277.0, + "step": 358 + }, + { + "entropy": 1.1673301458358765, + "epoch": 0.14361436143614362, + "grad_norm": 0.30997899174690247, + "learning_rate": 0.0001940187558968119, + "loss": 1.1427, + "mean_token_accuracy": 0.6802043169736862, + "num_tokens": 3043456.0, + "step": 359 + }, + { + "entropy": 1.1499980092048645, + "epoch": 0.14401440144014402, + "grad_norm": 0.3066644072532654, + "learning_rate": 0.00019397688967232352, + "loss": 1.1497, + "mean_token_accuracy": 0.6805084347724915, + "num_tokens": 3051649.0, + "step": 360 + }, + { + "entropy": 1.131559580564499, + "epoch": 0.14441444144414442, + "grad_norm": 0.296249657869339, + "learning_rate": 0.000193934882507278, + "loss": 1.1349, + "mean_token_accuracy": 0.6809341907501221, + "num_tokens": 3060190.0, + "step": 361 + }, + { + "entropy": 1.1443010866641998, + "epoch": 0.14481448144814482, + "grad_norm": 0.31838539242744446, + "learning_rate": 0.00019389273447217704, + "loss": 1.1696, + "mean_token_accuracy": 0.6759007275104523, + "num_tokens": 3068580.0, + "step": 362 + }, + { + "entropy": 1.133973866701126, + "epoch": 0.14521452145214522, + "grad_norm": 0.2861894965171814, + "learning_rate": 0.0001938504456377587, + "loss": 1.1291, + "mean_token_accuracy": 0.6851497888565063, + "num_tokens": 3077427.0, + "step": 363 + }, + { + "entropy": 1.136247158050537, + "epoch": 0.14561456145614562, + "grad_norm": 0.2967614531517029, + "learning_rate": 0.00019380801607499746, + "loss": 1.0995, + "mean_token_accuracy": 0.6911982148885727, + "num_tokens": 3085196.0, + "step": 364 + }, + { + "entropy": 1.184772402048111, + "epoch": 0.14601460146014603, + "grad_norm": 0.3119775354862213, + "learning_rate": 0.00019376544585510393, + "loss": 1.2257, + "mean_token_accuracy": 0.666557103395462, + "num_tokens": 3093621.0, + "step": 365 + }, + { + "entropy": 1.1576828956604004, + "epoch": 0.14641464146414643, + "grad_norm": 0.3863295018672943, + "learning_rate": 0.0001937227350495248, + "loss": 1.1722, + "mean_token_accuracy": 0.6755800992250443, + "num_tokens": 3102047.0, + "step": 366 + }, + { + "entropy": 1.0888293087482452, + "epoch": 0.1468146814681468, + "grad_norm": 0.2931033670902252, + "learning_rate": 0.00019367988372994265, + "loss": 1.0546, + "mean_token_accuracy": 0.6972876787185669, + "num_tokens": 3110407.0, + "step": 367 + }, + { + "entropy": 1.171687364578247, + "epoch": 0.1472147214721472, + "grad_norm": 0.43645840883255005, + "learning_rate": 0.000193636891968276, + "loss": 1.1192, + "mean_token_accuracy": 0.6813657730817795, + "num_tokens": 3118726.0, + "step": 368 + }, + { + "entropy": 1.1906355917453766, + "epoch": 0.1476147614761476, + "grad_norm": 0.30559539794921875, + "learning_rate": 0.00019359375983667902, + "loss": 1.1854, + "mean_token_accuracy": 0.6698572039604187, + "num_tokens": 3126856.0, + "step": 369 + }, + { + "entropy": 1.1418620645999908, + "epoch": 0.148014801480148, + "grad_norm": 0.31266874074935913, + "learning_rate": 0.00019355048740754145, + "loss": 1.1375, + "mean_token_accuracy": 0.678287535905838, + "num_tokens": 3135201.0, + "step": 370 + }, + { + "entropy": 1.1904971301555634, + "epoch": 0.1484148414841484, + "grad_norm": 0.3213047981262207, + "learning_rate": 0.00019350707475348852, + "loss": 1.1842, + "mean_token_accuracy": 0.6759228259325027, + "num_tokens": 3143256.0, + "step": 371 + }, + { + "entropy": 1.1902599036693573, + "epoch": 0.1488148814881488, + "grad_norm": 0.5613988041877747, + "learning_rate": 0.00019346352194738077, + "loss": 1.2442, + "mean_token_accuracy": 0.6619480550289154, + "num_tokens": 3150704.0, + "step": 372 + }, + { + "entropy": 1.0474575012922287, + "epoch": 0.1492149214921492, + "grad_norm": 0.2898733615875244, + "learning_rate": 0.00019341982906231407, + "loss": 1.0636, + "mean_token_accuracy": 0.6995494663715363, + "num_tokens": 3159711.0, + "step": 373 + }, + { + "entropy": 1.226840317249298, + "epoch": 0.1496149614961496, + "grad_norm": 0.314718633890152, + "learning_rate": 0.0001933759961716192, + "loss": 1.1882, + "mean_token_accuracy": 0.6709526926279068, + "num_tokens": 3167294.0, + "step": 374 + }, + { + "entropy": 1.1560609936714172, + "epoch": 0.15001500150015, + "grad_norm": 0.29525458812713623, + "learning_rate": 0.00019333202334886207, + "loss": 1.1088, + "mean_token_accuracy": 0.6907341927289963, + "num_tokens": 3175676.0, + "step": 375 + }, + { + "entropy": 1.1789807677268982, + "epoch": 0.15041504150415042, + "grad_norm": 0.2906891405582428, + "learning_rate": 0.0001932879106678434, + "loss": 1.1488, + "mean_token_accuracy": 0.6830808073282242, + "num_tokens": 3184781.0, + "step": 376 + }, + { + "entropy": 1.2095182836055756, + "epoch": 0.15081508150815082, + "grad_norm": 0.29173582792282104, + "learning_rate": 0.00019324365820259858, + "loss": 1.1471, + "mean_token_accuracy": 0.6814120411872864, + "num_tokens": 3193359.0, + "step": 377 + }, + { + "entropy": 1.1557953655719757, + "epoch": 0.15121512151215122, + "grad_norm": 0.30150917172431946, + "learning_rate": 0.0001931992660273977, + "loss": 1.1842, + "mean_token_accuracy": 0.6736668199300766, + "num_tokens": 3201977.0, + "step": 378 + }, + { + "entropy": 1.1141368001699448, + "epoch": 0.15161516151615162, + "grad_norm": 0.3033373951911926, + "learning_rate": 0.00019315473421674525, + "loss": 1.1433, + "mean_token_accuracy": 0.6801392734050751, + "num_tokens": 3210612.0, + "step": 379 + }, + { + "entropy": 1.0636587738990784, + "epoch": 0.15201520152015202, + "grad_norm": 0.2994931936264038, + "learning_rate": 0.00019311006284538013, + "loss": 1.0722, + "mean_token_accuracy": 0.6968654096126556, + "num_tokens": 3219123.0, + "step": 380 + }, + { + "entropy": 1.2064105868339539, + "epoch": 0.15241524152415242, + "grad_norm": 0.3521154820919037, + "learning_rate": 0.00019306525198827548, + "loss": 1.2385, + "mean_token_accuracy": 0.6615314930677414, + "num_tokens": 3227445.0, + "step": 381 + }, + { + "entropy": 1.127672255039215, + "epoch": 0.15281528152815282, + "grad_norm": 0.2892846465110779, + "learning_rate": 0.00019302030172063837, + "loss": 1.1389, + "mean_token_accuracy": 0.6847521215677261, + "num_tokens": 3236240.0, + "step": 382 + }, + { + "entropy": 1.1575649082660675, + "epoch": 0.15321532153215323, + "grad_norm": 0.31099551916122437, + "learning_rate": 0.0001929752121179101, + "loss": 1.1524, + "mean_token_accuracy": 0.6786007881164551, + "num_tokens": 3244515.0, + "step": 383 + }, + { + "entropy": 1.1269442737102509, + "epoch": 0.15361536153615363, + "grad_norm": 0.2906751036643982, + "learning_rate": 0.0001929299832557657, + "loss": 1.0972, + "mean_token_accuracy": 0.6957235038280487, + "num_tokens": 3253311.0, + "step": 384 + }, + { + "entropy": 1.2260091006755829, + "epoch": 0.15401540154015403, + "grad_norm": 0.2963874638080597, + "learning_rate": 0.00019288461521011388, + "loss": 1.1781, + "mean_token_accuracy": 0.6785955429077148, + "num_tokens": 3261634.0, + "step": 385 + }, + { + "entropy": 1.1854043006896973, + "epoch": 0.1544154415441544, + "grad_norm": 0.30083367228507996, + "learning_rate": 0.00019283910805709698, + "loss": 1.1677, + "mean_token_accuracy": 0.6692470908164978, + "num_tokens": 3270087.0, + "step": 386 + }, + { + "entropy": 1.2266800105571747, + "epoch": 0.1548154815481548, + "grad_norm": 0.3198303282260895, + "learning_rate": 0.00019279346187309085, + "loss": 1.2064, + "mean_token_accuracy": 0.6682067066431046, + "num_tokens": 3278271.0, + "step": 387 + }, + { + "entropy": 1.1660953760147095, + "epoch": 0.1552155215521552, + "grad_norm": 0.33573225140571594, + "learning_rate": 0.00019274767673470463, + "loss": 1.1942, + "mean_token_accuracy": 0.6672907918691635, + "num_tokens": 3286608.0, + "step": 388 + }, + { + "entropy": 1.0843549370765686, + "epoch": 0.1556155615561556, + "grad_norm": 0.30995887517929077, + "learning_rate": 0.00019270175271878068, + "loss": 1.0992, + "mean_token_accuracy": 0.6958242803812027, + "num_tokens": 3295009.0, + "step": 389 + }, + { + "entropy": 1.128290981054306, + "epoch": 0.156015601560156, + "grad_norm": 0.3144836127758026, + "learning_rate": 0.00019265568990239445, + "loss": 1.137, + "mean_token_accuracy": 0.6823694556951523, + "num_tokens": 3303299.0, + "step": 390 + }, + { + "entropy": 1.195746123790741, + "epoch": 0.1564156415641564, + "grad_norm": 0.30768823623657227, + "learning_rate": 0.00019260948836285439, + "loss": 1.1869, + "mean_token_accuracy": 0.6803343147039413, + "num_tokens": 3311591.0, + "step": 391 + }, + { + "entropy": 1.1737743616104126, + "epoch": 0.1568156815681568, + "grad_norm": 0.29867610335350037, + "learning_rate": 0.00019256314817770164, + "loss": 1.1703, + "mean_token_accuracy": 0.6784539520740509, + "num_tokens": 3320022.0, + "step": 392 + }, + { + "entropy": 1.2264443039894104, + "epoch": 0.1572157215721572, + "grad_norm": 0.30367588996887207, + "learning_rate": 0.00019251666942471016, + "loss": 1.1963, + "mean_token_accuracy": 0.6694721430540085, + "num_tokens": 3328671.0, + "step": 393 + }, + { + "entropy": 1.1673425137996674, + "epoch": 0.15761576157615761, + "grad_norm": 0.312225341796875, + "learning_rate": 0.00019247005218188645, + "loss": 1.1641, + "mean_token_accuracy": 0.6831966638565063, + "num_tokens": 3336686.0, + "step": 394 + }, + { + "entropy": 1.1570010483264923, + "epoch": 0.15801580158015802, + "grad_norm": 0.325536847114563, + "learning_rate": 0.00019242329652746938, + "loss": 1.1245, + "mean_token_accuracy": 0.6909505128860474, + "num_tokens": 3344988.0, + "step": 395 + }, + { + "entropy": 1.118729829788208, + "epoch": 0.15841584158415842, + "grad_norm": 0.31520524621009827, + "learning_rate": 0.00019237640253993017, + "loss": 1.1096, + "mean_token_accuracy": 0.686091959476471, + "num_tokens": 3353202.0, + "step": 396 + }, + { + "entropy": 1.1297271251678467, + "epoch": 0.15881588158815882, + "grad_norm": 0.31851935386657715, + "learning_rate": 0.00019232937029797217, + "loss": 1.1385, + "mean_token_accuracy": 0.6839326471090317, + "num_tokens": 3362000.0, + "step": 397 + }, + { + "entropy": 1.111870676279068, + "epoch": 0.15921592159215922, + "grad_norm": 0.29706814885139465, + "learning_rate": 0.00019228219988053085, + "loss": 1.132, + "mean_token_accuracy": 0.6736722886562347, + "num_tokens": 3370452.0, + "step": 398 + }, + { + "entropy": 1.0942797362804413, + "epoch": 0.15961596159615962, + "grad_norm": 0.3211657702922821, + "learning_rate": 0.00019223489136677347, + "loss": 1.1642, + "mean_token_accuracy": 0.6759698241949081, + "num_tokens": 3378774.0, + "step": 399 + }, + { + "entropy": 1.1003531515598297, + "epoch": 0.16001600160016002, + "grad_norm": 0.2938557267189026, + "learning_rate": 0.00019218744483609918, + "loss": 1.0841, + "mean_token_accuracy": 0.689574733376503, + "num_tokens": 3387752.0, + "step": 400 + }, + { + "entropy": 1.1808100640773773, + "epoch": 0.16041604160416043, + "grad_norm": 0.3016187250614166, + "learning_rate": 0.00019213986036813863, + "loss": 1.1379, + "mean_token_accuracy": 0.6819901168346405, + "num_tokens": 3395722.0, + "step": 401 + }, + { + "entropy": 1.1858965158462524, + "epoch": 0.16081608160816083, + "grad_norm": 0.2888219952583313, + "learning_rate": 0.00019209213804275408, + "loss": 1.1126, + "mean_token_accuracy": 0.6891250312328339, + "num_tokens": 3404658.0, + "step": 402 + }, + { + "entropy": 1.1066676825284958, + "epoch": 0.16121612161216123, + "grad_norm": 0.2900371551513672, + "learning_rate": 0.00019204427794003911, + "loss": 1.0613, + "mean_token_accuracy": 0.6994702219963074, + "num_tokens": 3413044.0, + "step": 403 + }, + { + "entropy": 1.0648207068443298, + "epoch": 0.16161616161616163, + "grad_norm": 0.2870444357395172, + "learning_rate": 0.00019199628014031857, + "loss": 1.0816, + "mean_token_accuracy": 0.6926587671041489, + "num_tokens": 3421932.0, + "step": 404 + }, + { + "entropy": 1.1214756965637207, + "epoch": 0.162016201620162, + "grad_norm": 0.3146369755268097, + "learning_rate": 0.00019194814472414844, + "loss": 1.1529, + "mean_token_accuracy": 0.679986834526062, + "num_tokens": 3429660.0, + "step": 405 + }, + { + "entropy": 1.0432531386613846, + "epoch": 0.1624162416241624, + "grad_norm": 0.3081408441066742, + "learning_rate": 0.00019189987177231554, + "loss": 1.0802, + "mean_token_accuracy": 0.6946697533130646, + "num_tokens": 3437779.0, + "step": 406 + }, + { + "entropy": 1.1035350263118744, + "epoch": 0.1628162816281628, + "grad_norm": 0.3021145761013031, + "learning_rate": 0.00019185146136583761, + "loss": 1.1354, + "mean_token_accuracy": 0.6885717958211899, + "num_tokens": 3446116.0, + "step": 407 + }, + { + "entropy": 1.1501671075820923, + "epoch": 0.1632163216321632, + "grad_norm": 0.41734570264816284, + "learning_rate": 0.00019180291358596312, + "loss": 1.1233, + "mean_token_accuracy": 0.6793646067380905, + "num_tokens": 3454845.0, + "step": 408 + }, + { + "entropy": 1.1991091966629028, + "epoch": 0.1636163616361636, + "grad_norm": 0.29790523648262024, + "learning_rate": 0.00019175422851417103, + "loss": 1.1549, + "mean_token_accuracy": 0.6777328252792358, + "num_tokens": 3463400.0, + "step": 409 + }, + { + "entropy": 1.1822619140148163, + "epoch": 0.164016401640164, + "grad_norm": 0.31777262687683105, + "learning_rate": 0.00019170540623217065, + "loss": 1.1476, + "mean_token_accuracy": 0.6912225484848022, + "num_tokens": 3471177.0, + "step": 410 + }, + { + "entropy": 1.1974277198314667, + "epoch": 0.1644164416441644, + "grad_norm": 0.30301401019096375, + "learning_rate": 0.00019165644682190178, + "loss": 1.1863, + "mean_token_accuracy": 0.6698818802833557, + "num_tokens": 3479462.0, + "step": 411 + }, + { + "entropy": 1.1671889424324036, + "epoch": 0.16481648164816481, + "grad_norm": 0.3080313801765442, + "learning_rate": 0.0001916073503655342, + "loss": 1.1485, + "mean_token_accuracy": 0.6848516017198563, + "num_tokens": 3487668.0, + "step": 412 + }, + { + "entropy": 1.1198955476284027, + "epoch": 0.16521652165216522, + "grad_norm": 0.282215416431427, + "learning_rate": 0.00019155811694546773, + "loss": 1.117, + "mean_token_accuracy": 0.6849533915519714, + "num_tokens": 3496407.0, + "step": 413 + }, + { + "entropy": 1.1208362877368927, + "epoch": 0.16561656165616562, + "grad_norm": 0.2846994996070862, + "learning_rate": 0.0001915087466443321, + "loss": 1.1486, + "mean_token_accuracy": 0.6762874126434326, + "num_tokens": 3505305.0, + "step": 414 + }, + { + "entropy": 1.1050612926483154, + "epoch": 0.16601660166016602, + "grad_norm": 0.2926284670829773, + "learning_rate": 0.00019145923954498674, + "loss": 1.1086, + "mean_token_accuracy": 0.6887543201446533, + "num_tokens": 3513791.0, + "step": 415 + }, + { + "entropy": 1.1567849516868591, + "epoch": 0.16641664166416642, + "grad_norm": 0.3551363945007324, + "learning_rate": 0.00019140959573052068, + "loss": 1.1884, + "mean_token_accuracy": 0.6731236577033997, + "num_tokens": 3522187.0, + "step": 416 + }, + { + "entropy": 1.0714478492736816, + "epoch": 0.16681668166816682, + "grad_norm": 0.2826900780200958, + "learning_rate": 0.00019135981528425238, + "loss": 1.07, + "mean_token_accuracy": 0.6979558169841766, + "num_tokens": 3530921.0, + "step": 417 + }, + { + "entropy": 1.1964420974254608, + "epoch": 0.16721672167216722, + "grad_norm": 0.283438116312027, + "learning_rate": 0.0001913098982897297, + "loss": 1.2064, + "mean_token_accuracy": 0.6715447902679443, + "num_tokens": 3539583.0, + "step": 418 + }, + { + "entropy": 1.1429602801799774, + "epoch": 0.16761676167616762, + "grad_norm": 0.27956098318099976, + "learning_rate": 0.0001912598448307295, + "loss": 1.103, + "mean_token_accuracy": 0.692705973982811, + "num_tokens": 3548027.0, + "step": 419 + }, + { + "entropy": 1.1086672246456146, + "epoch": 0.16801680168016803, + "grad_norm": 0.30192887783050537, + "learning_rate": 0.0001912096549912579, + "loss": 1.0665, + "mean_token_accuracy": 0.6996335387229919, + "num_tokens": 3556575.0, + "step": 420 + }, + { + "entropy": 1.122267097234726, + "epoch": 0.16841684168416843, + "grad_norm": 0.28671419620513916, + "learning_rate": 0.0001911593288555497, + "loss": 1.0995, + "mean_token_accuracy": 0.6916577368974686, + "num_tokens": 3564842.0, + "step": 421 + }, + { + "entropy": 1.1425860822200775, + "epoch": 0.16881688168816883, + "grad_norm": 0.31337839365005493, + "learning_rate": 0.0001911088665080685, + "loss": 1.1492, + "mean_token_accuracy": 0.6899708062410355, + "num_tokens": 3573378.0, + "step": 422 + }, + { + "entropy": 1.1819129288196564, + "epoch": 0.1692169216921692, + "grad_norm": 0.3169664442539215, + "learning_rate": 0.00019105826803350668, + "loss": 1.2067, + "mean_token_accuracy": 0.6600329726934433, + "num_tokens": 3581995.0, + "step": 423 + }, + { + "entropy": 1.1388654112815857, + "epoch": 0.1696169616961696, + "grad_norm": 0.3174993097782135, + "learning_rate": 0.00019100753351678485, + "loss": 1.1679, + "mean_token_accuracy": 0.6717206537723541, + "num_tokens": 3590053.0, + "step": 424 + }, + { + "entropy": 1.0764131546020508, + "epoch": 0.17001700170017, + "grad_norm": 0.27433347702026367, + "learning_rate": 0.0001909566630430521, + "loss": 1.0583, + "mean_token_accuracy": 0.698042631149292, + "num_tokens": 3598969.0, + "step": 425 + }, + { + "entropy": 1.1677474975585938, + "epoch": 0.1704170417041704, + "grad_norm": 0.28440240025520325, + "learning_rate": 0.0001909056566976856, + "loss": 1.1686, + "mean_token_accuracy": 0.6792843639850616, + "num_tokens": 3608017.0, + "step": 426 + }, + { + "entropy": 1.0982355326414108, + "epoch": 0.1708170817081708, + "grad_norm": 0.281744122505188, + "learning_rate": 0.00019085451456629063, + "loss": 1.0735, + "mean_token_accuracy": 0.6970892697572708, + "num_tokens": 3616898.0, + "step": 427 + }, + { + "entropy": 1.1331664025783539, + "epoch": 0.1712171217121712, + "grad_norm": 0.29245954751968384, + "learning_rate": 0.00019080323673470028, + "loss": 1.1029, + "mean_token_accuracy": 0.6925027072429657, + "num_tokens": 3625372.0, + "step": 428 + }, + { + "entropy": 1.165515422821045, + "epoch": 0.1716171617161716, + "grad_norm": 0.314475953578949, + "learning_rate": 0.00019075182328897553, + "loss": 1.159, + "mean_token_accuracy": 0.6840381771326065, + "num_tokens": 3633550.0, + "step": 429 + }, + { + "entropy": 1.2059255242347717, + "epoch": 0.172017201720172, + "grad_norm": 0.29410937428474426, + "learning_rate": 0.00019070027431540484, + "loss": 1.1995, + "mean_token_accuracy": 0.667696550488472, + "num_tokens": 3641944.0, + "step": 430 + }, + { + "entropy": 1.160342425107956, + "epoch": 0.17241724172417242, + "grad_norm": 0.29798951745033264, + "learning_rate": 0.00019064858990050412, + "loss": 1.1249, + "mean_token_accuracy": 0.6896940916776657, + "num_tokens": 3650633.0, + "step": 431 + }, + { + "entropy": 1.097832590341568, + "epoch": 0.17281728172817282, + "grad_norm": 0.3146847188472748, + "learning_rate": 0.0001905967701310167, + "loss": 1.084, + "mean_token_accuracy": 0.6950473189353943, + "num_tokens": 3659275.0, + "step": 432 + }, + { + "entropy": 1.1250872611999512, + "epoch": 0.17321732173217322, + "grad_norm": 0.29490962624549866, + "learning_rate": 0.00019054481509391303, + "loss": 1.1453, + "mean_token_accuracy": 0.6784237176179886, + "num_tokens": 3667707.0, + "step": 433 + }, + { + "entropy": 1.11842879652977, + "epoch": 0.17361736173617362, + "grad_norm": 0.3015720844268799, + "learning_rate": 0.00019049272487639053, + "loss": 1.1348, + "mean_token_accuracy": 0.6827126741409302, + "num_tokens": 3676215.0, + "step": 434 + }, + { + "entropy": 1.1079545319080353, + "epoch": 0.17401740174017402, + "grad_norm": 0.2959752380847931, + "learning_rate": 0.00019044049956587359, + "loss": 1.1308, + "mean_token_accuracy": 0.6799913793802261, + "num_tokens": 3684832.0, + "step": 435 + }, + { + "entropy": 1.0760809183120728, + "epoch": 0.17441744174417442, + "grad_norm": 0.28142601251602173, + "learning_rate": 0.0001903881392500132, + "loss": 1.057, + "mean_token_accuracy": 0.7040259689092636, + "num_tokens": 3693191.0, + "step": 436 + }, + { + "entropy": 1.1367475986480713, + "epoch": 0.17481748174817482, + "grad_norm": 0.2840285301208496, + "learning_rate": 0.00019033564401668712, + "loss": 1.1166, + "mean_token_accuracy": 0.6871612221002579, + "num_tokens": 3701978.0, + "step": 437 + }, + { + "entropy": 1.0345291048288345, + "epoch": 0.17521752175217523, + "grad_norm": 0.27927252650260925, + "learning_rate": 0.00019028301395399935, + "loss": 1.0161, + "mean_token_accuracy": 0.7020839005708694, + "num_tokens": 3711010.0, + "step": 438 + }, + { + "entropy": 1.1218744814395905, + "epoch": 0.17561756175617563, + "grad_norm": 0.28972747921943665, + "learning_rate": 0.00019023024915028035, + "loss": 1.1142, + "mean_token_accuracy": 0.6823008805513382, + "num_tokens": 3719811.0, + "step": 439 + }, + { + "entropy": 1.112653136253357, + "epoch": 0.17601760176017603, + "grad_norm": 0.2937675714492798, + "learning_rate": 0.0001901773496940866, + "loss": 1.099, + "mean_token_accuracy": 0.6938609182834625, + "num_tokens": 3728397.0, + "step": 440 + }, + { + "entropy": 1.0891221165657043, + "epoch": 0.17641764176417643, + "grad_norm": 0.2878448963165283, + "learning_rate": 0.00019012431567420058, + "loss": 1.0985, + "mean_token_accuracy": 0.6925668865442276, + "num_tokens": 3737299.0, + "step": 441 + }, + { + "entropy": 1.099565714597702, + "epoch": 0.1768176817681768, + "grad_norm": 0.307413786649704, + "learning_rate": 0.00019007114717963067, + "loss": 1.1189, + "mean_token_accuracy": 0.6934941560029984, + "num_tokens": 3746139.0, + "step": 442 + }, + { + "entropy": 1.1932236850261688, + "epoch": 0.1772177217721772, + "grad_norm": 0.3038841485977173, + "learning_rate": 0.00019001784429961086, + "loss": 1.1788, + "mean_token_accuracy": 0.6709124445915222, + "num_tokens": 3754953.0, + "step": 443 + }, + { + "entropy": 1.0702079832553864, + "epoch": 0.1776177617761776, + "grad_norm": 0.2820574939250946, + "learning_rate": 0.0001899644071236008, + "loss": 1.0416, + "mean_token_accuracy": 0.7032249569892883, + "num_tokens": 3763751.0, + "step": 444 + }, + { + "entropy": 1.2229497730731964, + "epoch": 0.178017801780178, + "grad_norm": 0.3014878034591675, + "learning_rate": 0.00018991083574128545, + "loss": 1.2192, + "mean_token_accuracy": 0.6651740819215775, + "num_tokens": 3771604.0, + "step": 445 + }, + { + "entropy": 1.1150319874286652, + "epoch": 0.1784178417841784, + "grad_norm": 0.2991960644721985, + "learning_rate": 0.000189857130242575, + "loss": 1.09, + "mean_token_accuracy": 0.6914113610982895, + "num_tokens": 3780403.0, + "step": 446 + }, + { + "entropy": 1.1689063012599945, + "epoch": 0.1788178817881788, + "grad_norm": 0.2982667088508606, + "learning_rate": 0.0001898032907176048, + "loss": 1.1627, + "mean_token_accuracy": 0.6814263015985489, + "num_tokens": 3788759.0, + "step": 447 + }, + { + "entropy": 1.1139529049396515, + "epoch": 0.1792179217921792, + "grad_norm": 0.29409554600715637, + "learning_rate": 0.00018974931725673509, + "loss": 1.1114, + "mean_token_accuracy": 0.6805879026651382, + "num_tokens": 3796931.0, + "step": 448 + }, + { + "entropy": 1.1041430234909058, + "epoch": 0.17961796179617961, + "grad_norm": 0.2944853901863098, + "learning_rate": 0.00018969520995055085, + "loss": 1.1119, + "mean_token_accuracy": 0.6940512806177139, + "num_tokens": 3805323.0, + "step": 449 + }, + { + "entropy": 1.1486750543117523, + "epoch": 0.18001800180018002, + "grad_norm": 0.302370548248291, + "learning_rate": 0.00018964096888986182, + "loss": 1.1553, + "mean_token_accuracy": 0.6763848960399628, + "num_tokens": 3813607.0, + "step": 450 + }, + { + "entropy": 1.1423940062522888, + "epoch": 0.18041804180418042, + "grad_norm": 0.28140193223953247, + "learning_rate": 0.00018958659416570212, + "loss": 1.1566, + "mean_token_accuracy": 0.6711086183786392, + "num_tokens": 3822080.0, + "step": 451 + }, + { + "entropy": 1.0220871269702911, + "epoch": 0.18081808180818082, + "grad_norm": 0.2903229892253876, + "learning_rate": 0.00018953208586933027, + "loss": 1.0243, + "mean_token_accuracy": 0.7029541581869125, + "num_tokens": 3830561.0, + "step": 452 + }, + { + "entropy": 1.1911540031433105, + "epoch": 0.18121812181218122, + "grad_norm": 0.3021875321865082, + "learning_rate": 0.0001894774440922289, + "loss": 1.1799, + "mean_token_accuracy": 0.6771095544099808, + "num_tokens": 3838855.0, + "step": 453 + }, + { + "entropy": 1.1234095692634583, + "epoch": 0.18161816181618162, + "grad_norm": 0.30030199885368347, + "learning_rate": 0.00018942266892610474, + "loss": 1.1306, + "mean_token_accuracy": 0.688039630651474, + "num_tokens": 3847225.0, + "step": 454 + }, + { + "entropy": 1.2189615964889526, + "epoch": 0.18201820182018202, + "grad_norm": 0.2934826910495758, + "learning_rate": 0.00018936776046288832, + "loss": 1.192, + "mean_token_accuracy": 0.6768446713685989, + "num_tokens": 3855549.0, + "step": 455 + }, + { + "entropy": 1.090735375881195, + "epoch": 0.18241824182418243, + "grad_norm": 0.2921765148639679, + "learning_rate": 0.0001893127187947339, + "loss": 1.0824, + "mean_token_accuracy": 0.6897251307964325, + "num_tokens": 3863912.0, + "step": 456 + }, + { + "entropy": 1.0907158553600311, + "epoch": 0.18281828182818283, + "grad_norm": 0.28869226574897766, + "learning_rate": 0.00018925754401401935, + "loss": 1.1011, + "mean_token_accuracy": 0.6976663619279861, + "num_tokens": 3872222.0, + "step": 457 + }, + { + "entropy": 1.0765265822410583, + "epoch": 0.18321832183218323, + "grad_norm": 0.27985134720802307, + "learning_rate": 0.0001892022362133459, + "loss": 1.0954, + "mean_token_accuracy": 0.6934731006622314, + "num_tokens": 3880811.0, + "step": 458 + }, + { + "entropy": 1.1287130117416382, + "epoch": 0.18361836183618363, + "grad_norm": 0.2834780216217041, + "learning_rate": 0.000189146795485538, + "loss": 1.1133, + "mean_token_accuracy": 0.6809262037277222, + "num_tokens": 3889241.0, + "step": 459 + }, + { + "entropy": 1.1771635711193085, + "epoch": 0.18401840184018403, + "grad_norm": 0.2930743992328644, + "learning_rate": 0.00018909122192364334, + "loss": 1.1473, + "mean_token_accuracy": 0.6786583662033081, + "num_tokens": 3897826.0, + "step": 460 + }, + { + "entropy": 1.156456857919693, + "epoch": 0.1844184418441844, + "grad_norm": 0.31029045581817627, + "learning_rate": 0.00018903551562093237, + "loss": 1.1329, + "mean_token_accuracy": 0.6835081726312637, + "num_tokens": 3906455.0, + "step": 461 + }, + { + "entropy": 1.197271704673767, + "epoch": 0.1848184818481848, + "grad_norm": 0.28894633054733276, + "learning_rate": 0.00018897967667089839, + "loss": 1.1518, + "mean_token_accuracy": 0.6705130338668823, + "num_tokens": 3914939.0, + "step": 462 + }, + { + "entropy": 1.187122493982315, + "epoch": 0.1852185218521852, + "grad_norm": 0.2882704734802246, + "learning_rate": 0.0001889237051672574, + "loss": 1.172, + "mean_token_accuracy": 0.6756406724452972, + "num_tokens": 3923526.0, + "step": 463 + }, + { + "entropy": 1.1045761406421661, + "epoch": 0.1856185618561856, + "grad_norm": 0.290786474943161, + "learning_rate": 0.00018886760120394774, + "loss": 1.1039, + "mean_token_accuracy": 0.6829386353492737, + "num_tokens": 3931690.0, + "step": 464 + }, + { + "entropy": 1.0771204233169556, + "epoch": 0.186018601860186, + "grad_norm": 0.29037660360336304, + "learning_rate": 0.00018881136487513016, + "loss": 1.0961, + "mean_token_accuracy": 0.6865667402744293, + "num_tokens": 3940222.0, + "step": 465 + }, + { + "entropy": 1.0926263481378555, + "epoch": 0.1864186418641864, + "grad_norm": 0.28368324041366577, + "learning_rate": 0.0001887549962751875, + "loss": 1.1276, + "mean_token_accuracy": 0.6901869177818298, + "num_tokens": 3948870.0, + "step": 466 + }, + { + "entropy": 1.0631737411022186, + "epoch": 0.18681868186818681, + "grad_norm": 0.28324657678604126, + "learning_rate": 0.00018869849549872465, + "loss": 1.0782, + "mean_token_accuracy": 0.6920218467712402, + "num_tokens": 3957291.0, + "step": 467 + }, + { + "entropy": 1.1629198789596558, + "epoch": 0.18721872187218722, + "grad_norm": 0.28869321942329407, + "learning_rate": 0.00018864186264056827, + "loss": 1.1439, + "mean_token_accuracy": 0.6795201748609543, + "num_tokens": 3966005.0, + "step": 468 + }, + { + "entropy": 1.1176329255104065, + "epoch": 0.18761876187618762, + "grad_norm": 0.30285438895225525, + "learning_rate": 0.00018858509779576678, + "loss": 1.1113, + "mean_token_accuracy": 0.6858499944210052, + "num_tokens": 3974237.0, + "step": 469 + }, + { + "entropy": 1.1664519608020782, + "epoch": 0.18801880188018802, + "grad_norm": 0.29232847690582275, + "learning_rate": 0.00018852820105959002, + "loss": 1.1352, + "mean_token_accuracy": 0.6848191022872925, + "num_tokens": 3982719.0, + "step": 470 + }, + { + "entropy": 1.0966509878635406, + "epoch": 0.18841884188418842, + "grad_norm": 0.28050824999809265, + "learning_rate": 0.00018847117252752924, + "loss": 1.103, + "mean_token_accuracy": 0.6891407370567322, + "num_tokens": 3991387.0, + "step": 471 + }, + { + "entropy": 1.0832321643829346, + "epoch": 0.18881888188818882, + "grad_norm": 0.30679091811180115, + "learning_rate": 0.00018841401229529692, + "loss": 1.0987, + "mean_token_accuracy": 0.6983061581850052, + "num_tokens": 3999901.0, + "step": 472 + }, + { + "entropy": 1.1181371808052063, + "epoch": 0.18921892189218922, + "grad_norm": 0.29978105425834656, + "learning_rate": 0.00018835672045882648, + "loss": 1.1526, + "mean_token_accuracy": 0.6812323331832886, + "num_tokens": 4008189.0, + "step": 473 + }, + { + "entropy": 1.094124659895897, + "epoch": 0.18961896189618963, + "grad_norm": 0.2761591672897339, + "learning_rate": 0.00018829929711427232, + "loss": 1.088, + "mean_token_accuracy": 0.6916481256484985, + "num_tokens": 4017035.0, + "step": 474 + }, + { + "entropy": 1.174016386270523, + "epoch": 0.19001900190019003, + "grad_norm": 0.2957269549369812, + "learning_rate": 0.0001882417423580095, + "loss": 1.15, + "mean_token_accuracy": 0.687277153134346, + "num_tokens": 4025132.0, + "step": 475 + }, + { + "entropy": 1.141076147556305, + "epoch": 0.19041904190419043, + "grad_norm": 0.29672884941101074, + "learning_rate": 0.0001881840562866336, + "loss": 1.0997, + "mean_token_accuracy": 0.6899784505367279, + "num_tokens": 4033594.0, + "step": 476 + }, + { + "entropy": 1.103248655796051, + "epoch": 0.19081908190819083, + "grad_norm": 0.2912473976612091, + "learning_rate": 0.00018812623899696067, + "loss": 1.0915, + "mean_token_accuracy": 0.6886222809553146, + "num_tokens": 4042053.0, + "step": 477 + }, + { + "entropy": 1.170788824558258, + "epoch": 0.19121912191219123, + "grad_norm": 0.2797233462333679, + "learning_rate": 0.0001880682905860269, + "loss": 1.1159, + "mean_token_accuracy": 0.6844299733638763, + "num_tokens": 4050555.0, + "step": 478 + }, + { + "entropy": 1.160698264837265, + "epoch": 0.19161916191619163, + "grad_norm": 0.2921246886253357, + "learning_rate": 0.00018801021115108862, + "loss": 1.1606, + "mean_token_accuracy": 0.6748001426458359, + "num_tokens": 4059040.0, + "step": 479 + }, + { + "entropy": 1.0824988782405853, + "epoch": 0.192019201920192, + "grad_norm": 0.29058167338371277, + "learning_rate": 0.000187952000789622, + "loss": 1.1117, + "mean_token_accuracy": 0.6949323862791061, + "num_tokens": 4067919.0, + "step": 480 + }, + { + "entropy": 1.1407755315303802, + "epoch": 0.1924192419241924, + "grad_norm": 0.3058508634567261, + "learning_rate": 0.00018789365959932303, + "loss": 1.1914, + "mean_token_accuracy": 0.6748262792825699, + "num_tokens": 4076495.0, + "step": 481 + }, + { + "entropy": 1.1213767230510712, + "epoch": 0.1928192819281928, + "grad_norm": 0.2868844270706177, + "learning_rate": 0.00018783518767810715, + "loss": 1.117, + "mean_token_accuracy": 0.6884360611438751, + "num_tokens": 4084846.0, + "step": 482 + }, + { + "entropy": 1.1594094932079315, + "epoch": 0.1932193219321932, + "grad_norm": 0.29103291034698486, + "learning_rate": 0.0001877765851241093, + "loss": 1.1595, + "mean_token_accuracy": 0.6784193813800812, + "num_tokens": 4093093.0, + "step": 483 + }, + { + "entropy": 1.0897391140460968, + "epoch": 0.1936193619361936, + "grad_norm": 0.29071077704429626, + "learning_rate": 0.00018771785203568366, + "loss": 1.0775, + "mean_token_accuracy": 0.6933843791484833, + "num_tokens": 4101392.0, + "step": 484 + }, + { + "entropy": 1.05050827562809, + "epoch": 0.19401940194019401, + "grad_norm": 0.2660689949989319, + "learning_rate": 0.00018765898851140345, + "loss": 1.003, + "mean_token_accuracy": 0.7151510417461395, + "num_tokens": 4110388.0, + "step": 485 + }, + { + "entropy": 1.1417682468891144, + "epoch": 0.19441944194419442, + "grad_norm": 0.2760656774044037, + "learning_rate": 0.00018759999465006087, + "loss": 1.1208, + "mean_token_accuracy": 0.6870895624160767, + "num_tokens": 4119451.0, + "step": 486 + }, + { + "entropy": 1.1158250570297241, + "epoch": 0.19481948194819482, + "grad_norm": 0.27844175696372986, + "learning_rate": 0.00018754087055066675, + "loss": 1.0741, + "mean_token_accuracy": 0.7000212967395782, + "num_tokens": 4127997.0, + "step": 487 + }, + { + "entropy": 1.0569812506437302, + "epoch": 0.19521952195219522, + "grad_norm": 0.28110507130622864, + "learning_rate": 0.00018748161631245065, + "loss": 1.0375, + "mean_token_accuracy": 0.7026449292898178, + "num_tokens": 4136878.0, + "step": 488 + }, + { + "entropy": 1.084457129240036, + "epoch": 0.19561956195619562, + "grad_norm": 0.26859092712402344, + "learning_rate": 0.00018742223203486042, + "loss": 1.0676, + "mean_token_accuracy": 0.6930870711803436, + "num_tokens": 4146324.0, + "step": 489 + }, + { + "entropy": 1.0949542820453644, + "epoch": 0.19601960196019602, + "grad_norm": 0.28605908155441284, + "learning_rate": 0.00018736271781756223, + "loss": 1.125, + "mean_token_accuracy": 0.6920661330223083, + "num_tokens": 4154496.0, + "step": 490 + }, + { + "entropy": 1.1369201838970184, + "epoch": 0.19641964196419642, + "grad_norm": 0.3030281364917755, + "learning_rate": 0.00018730307376044027, + "loss": 1.119, + "mean_token_accuracy": 0.6900736391544342, + "num_tokens": 4163381.0, + "step": 491 + }, + { + "entropy": 1.1063465178012848, + "epoch": 0.19681968196819682, + "grad_norm": 0.29392218589782715, + "learning_rate": 0.00018724329996359676, + "loss": 1.1376, + "mean_token_accuracy": 0.6872988492250443, + "num_tokens": 4172190.0, + "step": 492 + }, + { + "entropy": 1.1071143746376038, + "epoch": 0.19721972197219723, + "grad_norm": 0.28501084446907043, + "learning_rate": 0.00018718339652735154, + "loss": 1.1166, + "mean_token_accuracy": 0.6885866820812225, + "num_tokens": 4180585.0, + "step": 493 + }, + { + "entropy": 1.1584193706512451, + "epoch": 0.19761976197619763, + "grad_norm": 0.29230597615242004, + "learning_rate": 0.00018712336355224205, + "loss": 1.1594, + "mean_token_accuracy": 0.6756969690322876, + "num_tokens": 4188810.0, + "step": 494 + }, + { + "entropy": 1.0776985734701157, + "epoch": 0.19801980198019803, + "grad_norm": 0.2801620662212372, + "learning_rate": 0.0001870632011390232, + "loss": 1.0296, + "mean_token_accuracy": 0.7065073400735855, + "num_tokens": 4197309.0, + "step": 495 + }, + { + "entropy": 1.1805840134620667, + "epoch": 0.19841984198419843, + "grad_norm": 0.3022160530090332, + "learning_rate": 0.00018700290938866712, + "loss": 1.1913, + "mean_token_accuracy": 0.6692783236503601, + "num_tokens": 4205630.0, + "step": 496 + }, + { + "entropy": 1.0833539962768555, + "epoch": 0.19881988198819883, + "grad_norm": 0.306426078081131, + "learning_rate": 0.00018694248840236296, + "loss": 1.0954, + "mean_token_accuracy": 0.6928739845752716, + "num_tokens": 4214058.0, + "step": 497 + }, + { + "entropy": 1.0818894803524017, + "epoch": 0.19921992199219923, + "grad_norm": 0.2984001934528351, + "learning_rate": 0.00018688193828151682, + "loss": 1.0926, + "mean_token_accuracy": 0.6913997977972031, + "num_tokens": 4222853.0, + "step": 498 + }, + { + "entropy": 1.0889964997768402, + "epoch": 0.1996199619961996, + "grad_norm": 0.2939610481262207, + "learning_rate": 0.0001868212591277515, + "loss": 1.0606, + "mean_token_accuracy": 0.6952404677867889, + "num_tokens": 4231395.0, + "step": 499 + }, + { + "entropy": 1.0546657741069794, + "epoch": 0.2000200020002, + "grad_norm": 0.28841695189476013, + "learning_rate": 0.00018676045104290637, + "loss": 1.0682, + "mean_token_accuracy": 0.6971585303544998, + "num_tokens": 4240525.0, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.077959380899922e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}