| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.2000200020002, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.7942025661468506, | |
| "epoch": 0.00040004000400040005, | |
| "grad_norm": 0.47672003507614136, | |
| "learning_rate": 0.0, | |
| "loss": 2.2188, | |
| "mean_token_accuracy": 0.5192891135811806, | |
| "num_tokens": 8850.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.739880234003067, | |
| "epoch": 0.0008000800080008001, | |
| "grad_norm": 0.4743156433105469, | |
| "learning_rate": 2.666666666666667e-06, | |
| "loss": 2.1894, | |
| "mean_token_accuracy": 0.5170402973890305, | |
| "num_tokens": 18057.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.7690136432647705, | |
| "epoch": 0.0012001200120012002, | |
| "grad_norm": 0.5005162358283997, | |
| "learning_rate": 5.333333333333334e-06, | |
| "loss": 2.2131, | |
| "mean_token_accuracy": 0.5172632932662964, | |
| "num_tokens": 26915.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.866851270198822, | |
| "epoch": 0.0016001600160016002, | |
| "grad_norm": 0.438918799161911, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 2.2875, | |
| "mean_token_accuracy": 0.5107089728116989, | |
| "num_tokens": 35231.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.8996970057487488, | |
| "epoch": 0.002000200020002, | |
| "grad_norm": 0.4285155236721039, | |
| "learning_rate": 1.0666666666666667e-05, | |
| "loss": 2.2935, | |
| "mean_token_accuracy": 0.5128495469689369, | |
| "num_tokens": 43540.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.797807365655899, | |
| "epoch": 0.0024002400240024004, | |
| "grad_norm": 0.4465991258621216, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 2.1917, | |
| "mean_token_accuracy": 0.5254444032907486, | |
| "num_tokens": 52236.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.8983636498451233, | |
| "epoch": 0.0028002800280028, | |
| "grad_norm": 0.4536067545413971, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 2.2677, | |
| "mean_token_accuracy": 0.5144101679325104, | |
| "num_tokens": 60443.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.8427878618240356, | |
| "epoch": 0.0032003200320032004, | |
| "grad_norm": 0.5053722858428955, | |
| "learning_rate": 1.866666666666667e-05, | |
| "loss": 2.2356, | |
| "mean_token_accuracy": 0.5142018273472786, | |
| "num_tokens": 69155.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.8648996651172638, | |
| "epoch": 0.0036003600360036, | |
| "grad_norm": 0.5287893414497375, | |
| "learning_rate": 2.1333333333333335e-05, | |
| "loss": 2.2435, | |
| "mean_token_accuracy": 0.5086963996291161, | |
| "num_tokens": 77156.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 1.886999636888504, | |
| "epoch": 0.004000400040004, | |
| "grad_norm": 0.43816184997558594, | |
| "learning_rate": 2.4e-05, | |
| "loss": 2.1821, | |
| "mean_token_accuracy": 0.5133799910545349, | |
| "num_tokens": 85650.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 2.0165862143039703, | |
| "epoch": 0.0044004400440044, | |
| "grad_norm": 0.3899831175804138, | |
| "learning_rate": 2.6666666666666667e-05, | |
| "loss": 2.1903, | |
| "mean_token_accuracy": 0.5218925848603249, | |
| "num_tokens": 93953.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 2.033858895301819, | |
| "epoch": 0.004800480048004801, | |
| "grad_norm": 0.43466004729270935, | |
| "learning_rate": 2.9333333333333336e-05, | |
| "loss": 2.0937, | |
| "mean_token_accuracy": 0.5258676409721375, | |
| "num_tokens": 102592.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 2.2364404797554016, | |
| "epoch": 0.005200520052005201, | |
| "grad_norm": 0.39024344086647034, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 2.1801, | |
| "mean_token_accuracy": 0.5228476375341415, | |
| "num_tokens": 110784.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 2.1504173278808594, | |
| "epoch": 0.0056005600560056, | |
| "grad_norm": 0.389006644487381, | |
| "learning_rate": 3.466666666666667e-05, | |
| "loss": 2.0215, | |
| "mean_token_accuracy": 0.5430122464895248, | |
| "num_tokens": 120082.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 2.2962915897369385, | |
| "epoch": 0.006000600060006, | |
| "grad_norm": 0.4784089922904968, | |
| "learning_rate": 3.733333333333334e-05, | |
| "loss": 2.061, | |
| "mean_token_accuracy": 0.531621664762497, | |
| "num_tokens": 128363.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 2.342404544353485, | |
| "epoch": 0.006400640064006401, | |
| "grad_norm": 0.5089271068572998, | |
| "learning_rate": 4e-05, | |
| "loss": 2.07, | |
| "mean_token_accuracy": 0.5325157046318054, | |
| "num_tokens": 136997.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 2.283275544643402, | |
| "epoch": 0.006800680068006801, | |
| "grad_norm": 0.5488889813423157, | |
| "learning_rate": 4.266666666666667e-05, | |
| "loss": 2.0056, | |
| "mean_token_accuracy": 0.5334787666797638, | |
| "num_tokens": 145030.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 2.050345718860626, | |
| "epoch": 0.0072007200720072, | |
| "grad_norm": 0.5031075477600098, | |
| "learning_rate": 4.5333333333333335e-05, | |
| "loss": 1.9162, | |
| "mean_token_accuracy": 0.5427921563386917, | |
| "num_tokens": 153623.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 1.9828232526779175, | |
| "epoch": 0.007600760076007601, | |
| "grad_norm": 0.5337665677070618, | |
| "learning_rate": 4.8e-05, | |
| "loss": 1.9185, | |
| "mean_token_accuracy": 0.5508822798728943, | |
| "num_tokens": 161947.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 1.8197293877601624, | |
| "epoch": 0.008000800080008, | |
| "grad_norm": 0.4948204755783081, | |
| "learning_rate": 5.0666666666666674e-05, | |
| "loss": 1.857, | |
| "mean_token_accuracy": 0.552571251988411, | |
| "num_tokens": 170516.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.789840191602707, | |
| "epoch": 0.0084008400840084, | |
| "grad_norm": 0.4926859438419342, | |
| "learning_rate": 5.333333333333333e-05, | |
| "loss": 1.886, | |
| "mean_token_accuracy": 0.5518065690994263, | |
| "num_tokens": 178469.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 1.6451906859874725, | |
| "epoch": 0.0088008800880088, | |
| "grad_norm": 0.4017632007598877, | |
| "learning_rate": 5.6000000000000006e-05, | |
| "loss": 1.7526, | |
| "mean_token_accuracy": 0.5742013603448868, | |
| "num_tokens": 186348.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 1.6792134046554565, | |
| "epoch": 0.0092009200920092, | |
| "grad_norm": 0.6260354518890381, | |
| "learning_rate": 5.866666666666667e-05, | |
| "loss": 1.8468, | |
| "mean_token_accuracy": 0.5656454414129257, | |
| "num_tokens": 195071.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 1.647391676902771, | |
| "epoch": 0.009600960096009602, | |
| "grad_norm": 0.46580520272254944, | |
| "learning_rate": 6.133333333333334e-05, | |
| "loss": 1.7595, | |
| "mean_token_accuracy": 0.567480742931366, | |
| "num_tokens": 202951.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 1.6090652346611023, | |
| "epoch": 0.010001000100010001, | |
| "grad_norm": 0.4587379992008209, | |
| "learning_rate": 6.400000000000001e-05, | |
| "loss": 1.6638, | |
| "mean_token_accuracy": 0.5937570631504059, | |
| "num_tokens": 211268.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 1.6326420307159424, | |
| "epoch": 0.010401040104010401, | |
| "grad_norm": 0.44421494007110596, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 1.6439, | |
| "mean_token_accuracy": 0.5923638790845871, | |
| "num_tokens": 219692.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 1.7234179377555847, | |
| "epoch": 0.010801080108010801, | |
| "grad_norm": 0.4389747381210327, | |
| "learning_rate": 6.933333333333334e-05, | |
| "loss": 1.7108, | |
| "mean_token_accuracy": 0.5803089290857315, | |
| "num_tokens": 228047.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 1.6885777115821838, | |
| "epoch": 0.0112011201120112, | |
| "grad_norm": 0.4335879981517792, | |
| "learning_rate": 7.2e-05, | |
| "loss": 1.6299, | |
| "mean_token_accuracy": 0.586303323507309, | |
| "num_tokens": 236376.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 1.6646342873573303, | |
| "epoch": 0.0116011601160116, | |
| "grad_norm": 0.38126322627067566, | |
| "learning_rate": 7.466666666666667e-05, | |
| "loss": 1.6067, | |
| "mean_token_accuracy": 0.5964086949825287, | |
| "num_tokens": 245092.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 1.6213374137878418, | |
| "epoch": 0.012001200120012, | |
| "grad_norm": 0.39270561933517456, | |
| "learning_rate": 7.733333333333333e-05, | |
| "loss": 1.5822, | |
| "mean_token_accuracy": 0.6026028245687485, | |
| "num_tokens": 253673.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.5640352368354797, | |
| "epoch": 0.012401240124012402, | |
| "grad_norm": 0.3869155943393707, | |
| "learning_rate": 8e-05, | |
| "loss": 1.5011, | |
| "mean_token_accuracy": 0.6241087764501572, | |
| "num_tokens": 262625.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 1.520020067691803, | |
| "epoch": 0.012801280128012802, | |
| "grad_norm": 0.3769737184047699, | |
| "learning_rate": 8.266666666666667e-05, | |
| "loss": 1.5088, | |
| "mean_token_accuracy": 0.6204348653554916, | |
| "num_tokens": 271309.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 1.5669251084327698, | |
| "epoch": 0.013201320132013201, | |
| "grad_norm": 0.4119971692562103, | |
| "learning_rate": 8.533333333333334e-05, | |
| "loss": 1.598, | |
| "mean_token_accuracy": 0.6009179204702377, | |
| "num_tokens": 279702.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 1.4570423662662506, | |
| "epoch": 0.013601360136013601, | |
| "grad_norm": 0.39608579874038696, | |
| "learning_rate": 8.800000000000001e-05, | |
| "loss": 1.4757, | |
| "mean_token_accuracy": 0.6308933645486832, | |
| "num_tokens": 288493.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 1.4845676720142365, | |
| "epoch": 0.014001400140014001, | |
| "grad_norm": 0.37827152013778687, | |
| "learning_rate": 9.066666666666667e-05, | |
| "loss": 1.5051, | |
| "mean_token_accuracy": 0.6212253570556641, | |
| "num_tokens": 296999.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 1.5079152584075928, | |
| "epoch": 0.0144014401440144, | |
| "grad_norm": 0.39496058225631714, | |
| "learning_rate": 9.333333333333334e-05, | |
| "loss": 1.5177, | |
| "mean_token_accuracy": 0.6146594285964966, | |
| "num_tokens": 305146.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 1.4583857357501984, | |
| "epoch": 0.014801480148014802, | |
| "grad_norm": 0.41785281896591187, | |
| "learning_rate": 9.6e-05, | |
| "loss": 1.4723, | |
| "mean_token_accuracy": 0.6168077737092972, | |
| "num_tokens": 313647.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 1.3630880415439606, | |
| "epoch": 0.015201520152015202, | |
| "grad_norm": 0.3789471983909607, | |
| "learning_rate": 9.866666666666668e-05, | |
| "loss": 1.3449, | |
| "mean_token_accuracy": 0.6459334343671799, | |
| "num_tokens": 322633.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 1.4223653674125671, | |
| "epoch": 0.015601560156015602, | |
| "grad_norm": 0.4337131381034851, | |
| "learning_rate": 0.00010133333333333335, | |
| "loss": 1.4755, | |
| "mean_token_accuracy": 0.6144974380731583, | |
| "num_tokens": 331687.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 1.3911584913730621, | |
| "epoch": 0.016001600160016, | |
| "grad_norm": 0.41617903113365173, | |
| "learning_rate": 0.00010400000000000001, | |
| "loss": 1.3826, | |
| "mean_token_accuracy": 0.6441078633069992, | |
| "num_tokens": 339868.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.4160181879997253, | |
| "epoch": 0.016401640164016403, | |
| "grad_norm": 0.43531423807144165, | |
| "learning_rate": 0.00010666666666666667, | |
| "loss": 1.4294, | |
| "mean_token_accuracy": 0.6320265531539917, | |
| "num_tokens": 348029.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 1.482937514781952, | |
| "epoch": 0.0168016801680168, | |
| "grad_norm": 0.4324755072593689, | |
| "learning_rate": 0.00010933333333333333, | |
| "loss": 1.5147, | |
| "mean_token_accuracy": 0.6166313588619232, | |
| "num_tokens": 356240.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 1.4201266169548035, | |
| "epoch": 0.017201720172017203, | |
| "grad_norm": 0.3948879837989807, | |
| "learning_rate": 0.00011200000000000001, | |
| "loss": 1.3994, | |
| "mean_token_accuracy": 0.6290998160839081, | |
| "num_tokens": 364425.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 1.357359528541565, | |
| "epoch": 0.0176017601760176, | |
| "grad_norm": 0.41655364632606506, | |
| "learning_rate": 0.00011466666666666667, | |
| "loss": 1.2924, | |
| "mean_token_accuracy": 0.6492937654256821, | |
| "num_tokens": 373138.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 1.391854703426361, | |
| "epoch": 0.018001800180018002, | |
| "grad_norm": 0.417074590921402, | |
| "learning_rate": 0.00011733333333333334, | |
| "loss": 1.3507, | |
| "mean_token_accuracy": 0.6494302302598953, | |
| "num_tokens": 382100.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 1.4749327600002289, | |
| "epoch": 0.0184018401840184, | |
| "grad_norm": 0.41923800110816956, | |
| "learning_rate": 0.00012, | |
| "loss": 1.5085, | |
| "mean_token_accuracy": 0.612814411520958, | |
| "num_tokens": 390052.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 1.4137325286865234, | |
| "epoch": 0.018801880188018802, | |
| "grad_norm": 0.3833743929862976, | |
| "learning_rate": 0.00012266666666666668, | |
| "loss": 1.3916, | |
| "mean_token_accuracy": 0.6410449594259262, | |
| "num_tokens": 398110.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 1.3919320702552795, | |
| "epoch": 0.019201920192019203, | |
| "grad_norm": 0.37842363119125366, | |
| "learning_rate": 0.00012533333333333334, | |
| "loss": 1.4084, | |
| "mean_token_accuracy": 0.6312015205621719, | |
| "num_tokens": 406666.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 1.3608618378639221, | |
| "epoch": 0.0196019601960196, | |
| "grad_norm": 0.4568133056163788, | |
| "learning_rate": 0.00012800000000000002, | |
| "loss": 1.368, | |
| "mean_token_accuracy": 0.6458054482936859, | |
| "num_tokens": 415283.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 1.3759468793869019, | |
| "epoch": 0.020002000200020003, | |
| "grad_norm": 0.3905130922794342, | |
| "learning_rate": 0.00013066666666666668, | |
| "loss": 1.3781, | |
| "mean_token_accuracy": 0.6408856809139252, | |
| "num_tokens": 423867.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.3894509375095367, | |
| "epoch": 0.0204020402040204, | |
| "grad_norm": 0.39885976910591125, | |
| "learning_rate": 0.00013333333333333334, | |
| "loss": 1.3832, | |
| "mean_token_accuracy": 0.6394526213407516, | |
| "num_tokens": 432299.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 1.3620089888572693, | |
| "epoch": 0.020802080208020803, | |
| "grad_norm": 0.44015854597091675, | |
| "learning_rate": 0.00013600000000000003, | |
| "loss": 1.3381, | |
| "mean_token_accuracy": 0.6432337760925293, | |
| "num_tokens": 440734.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 1.3622656762599945, | |
| "epoch": 0.0212021202120212, | |
| "grad_norm": 0.49739453196525574, | |
| "learning_rate": 0.00013866666666666669, | |
| "loss": 1.3649, | |
| "mean_token_accuracy": 0.6373352855443954, | |
| "num_tokens": 448710.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 1.2986978590488434, | |
| "epoch": 0.021602160216021602, | |
| "grad_norm": 0.37318113446235657, | |
| "learning_rate": 0.00014133333333333334, | |
| "loss": 1.3366, | |
| "mean_token_accuracy": 0.6431873738765717, | |
| "num_tokens": 457247.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 1.2725946605205536, | |
| "epoch": 0.022002200220022004, | |
| "grad_norm": 0.4199654757976532, | |
| "learning_rate": 0.000144, | |
| "loss": 1.3302, | |
| "mean_token_accuracy": 0.6447762101888657, | |
| "num_tokens": 465701.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 1.2967428863048553, | |
| "epoch": 0.0224022402240224, | |
| "grad_norm": 0.40956538915634155, | |
| "learning_rate": 0.00014666666666666666, | |
| "loss": 1.3352, | |
| "mean_token_accuracy": 0.6408500224351883, | |
| "num_tokens": 474476.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 1.3544551134109497, | |
| "epoch": 0.022802280228022803, | |
| "grad_norm": 0.39519739151000977, | |
| "learning_rate": 0.00014933333333333335, | |
| "loss": 1.3406, | |
| "mean_token_accuracy": 0.6500163674354553, | |
| "num_tokens": 482570.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 1.3824973404407501, | |
| "epoch": 0.0232023202320232, | |
| "grad_norm": 0.3799802362918854, | |
| "learning_rate": 0.000152, | |
| "loss": 1.3278, | |
| "mean_token_accuracy": 0.6473122090101242, | |
| "num_tokens": 491111.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 1.3626296520233154, | |
| "epoch": 0.023602360236023603, | |
| "grad_norm": 0.3700718879699707, | |
| "learning_rate": 0.00015466666666666667, | |
| "loss": 1.3304, | |
| "mean_token_accuracy": 0.645874097943306, | |
| "num_tokens": 500032.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 1.3258526921272278, | |
| "epoch": 0.024002400240024, | |
| "grad_norm": 0.366222620010376, | |
| "learning_rate": 0.00015733333333333333, | |
| "loss": 1.3073, | |
| "mean_token_accuracy": 0.6523128002882004, | |
| "num_tokens": 508045.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.2787662744522095, | |
| "epoch": 0.024402440244024402, | |
| "grad_norm": 0.37774235010147095, | |
| "learning_rate": 0.00016, | |
| "loss": 1.2839, | |
| "mean_token_accuracy": 0.657956600189209, | |
| "num_tokens": 516334.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 1.2824394404888153, | |
| "epoch": 0.024802480248024804, | |
| "grad_norm": 0.3594248294830322, | |
| "learning_rate": 0.00016266666666666667, | |
| "loss": 1.3335, | |
| "mean_token_accuracy": 0.6513591110706329, | |
| "num_tokens": 524762.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 1.2761549651622772, | |
| "epoch": 0.025202520252025202, | |
| "grad_norm": 0.38247525691986084, | |
| "learning_rate": 0.00016533333333333333, | |
| "loss": 1.322, | |
| "mean_token_accuracy": 0.6528888940811157, | |
| "num_tokens": 533302.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 1.285708099603653, | |
| "epoch": 0.025602560256025603, | |
| "grad_norm": 0.4210297167301178, | |
| "learning_rate": 0.000168, | |
| "loss": 1.2522, | |
| "mean_token_accuracy": 0.6581785976886749, | |
| "num_tokens": 542110.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 1.3535743653774261, | |
| "epoch": 0.026002600260026, | |
| "grad_norm": 0.3659783601760864, | |
| "learning_rate": 0.00017066666666666668, | |
| "loss": 1.3343, | |
| "mean_token_accuracy": 0.6510991156101227, | |
| "num_tokens": 550717.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 1.3446696996688843, | |
| "epoch": 0.026402640264026403, | |
| "grad_norm": 0.35590988397598267, | |
| "learning_rate": 0.00017333333333333334, | |
| "loss": 1.3224, | |
| "mean_token_accuracy": 0.6442483812570572, | |
| "num_tokens": 559025.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 1.3695125877857208, | |
| "epoch": 0.0268026802680268, | |
| "grad_norm": 0.3491916358470917, | |
| "learning_rate": 0.00017600000000000002, | |
| "loss": 1.3288, | |
| "mean_token_accuracy": 0.6431872397661209, | |
| "num_tokens": 567724.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 1.3363787531852722, | |
| "epoch": 0.027202720272027203, | |
| "grad_norm": 0.3625618517398834, | |
| "learning_rate": 0.00017866666666666668, | |
| "loss": 1.2804, | |
| "mean_token_accuracy": 0.6557945609092712, | |
| "num_tokens": 576144.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 1.3033888339996338, | |
| "epoch": 0.027602760276027604, | |
| "grad_norm": 0.35051390528678894, | |
| "learning_rate": 0.00018133333333333334, | |
| "loss": 1.2841, | |
| "mean_token_accuracy": 0.6544656604528427, | |
| "num_tokens": 584831.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 1.3235229551792145, | |
| "epoch": 0.028002800280028002, | |
| "grad_norm": 0.3980117738246918, | |
| "learning_rate": 0.00018400000000000003, | |
| "loss": 1.3492, | |
| "mean_token_accuracy": 0.6482396423816681, | |
| "num_tokens": 593412.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.2970213294029236, | |
| "epoch": 0.028402840284028404, | |
| "grad_norm": 0.3519047796726227, | |
| "learning_rate": 0.0001866666666666667, | |
| "loss": 1.3083, | |
| "mean_token_accuracy": 0.6536522507667542, | |
| "num_tokens": 601675.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 1.2363843321800232, | |
| "epoch": 0.0288028802880288, | |
| "grad_norm": 0.356121689081192, | |
| "learning_rate": 0.00018933333333333335, | |
| "loss": 1.2331, | |
| "mean_token_accuracy": 0.6689527034759521, | |
| "num_tokens": 610155.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 1.2743788659572601, | |
| "epoch": 0.029202920292029203, | |
| "grad_norm": 0.352166086435318, | |
| "learning_rate": 0.000192, | |
| "loss": 1.2953, | |
| "mean_token_accuracy": 0.6543757170438766, | |
| "num_tokens": 619084.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 1.251781314611435, | |
| "epoch": 0.029602960296029605, | |
| "grad_norm": 0.3690275251865387, | |
| "learning_rate": 0.0001946666666666667, | |
| "loss": 1.249, | |
| "mean_token_accuracy": 0.6584222465753555, | |
| "num_tokens": 627717.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 1.3367043435573578, | |
| "epoch": 0.030003000300030003, | |
| "grad_norm": 0.3400121331214905, | |
| "learning_rate": 0.00019733333333333335, | |
| "loss": 1.2895, | |
| "mean_token_accuracy": 0.6532490998506546, | |
| "num_tokens": 637070.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 1.2800488770008087, | |
| "epoch": 0.030403040304030404, | |
| "grad_norm": 0.34383344650268555, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2733, | |
| "mean_token_accuracy": 0.6612512767314911, | |
| "num_tokens": 646123.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 1.328520268201828, | |
| "epoch": 0.030803080308030802, | |
| "grad_norm": 0.3561513125896454, | |
| "learning_rate": 0.00019999992447535154, | |
| "loss": 1.3263, | |
| "mean_token_accuracy": 0.6502320766448975, | |
| "num_tokens": 654808.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 1.2899321019649506, | |
| "epoch": 0.031203120312031204, | |
| "grad_norm": 0.3678707480430603, | |
| "learning_rate": 0.00019999969790153286, | |
| "loss": 1.3406, | |
| "mean_token_accuracy": 0.6464085876941681, | |
| "num_tokens": 663045.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 1.3219149708747864, | |
| "epoch": 0.0316031603160316, | |
| "grad_norm": 0.38404518365859985, | |
| "learning_rate": 0.00019999932027892428, | |
| "loss": 1.302, | |
| "mean_token_accuracy": 0.6544652730226517, | |
| "num_tokens": 671266.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 1.227865844964981, | |
| "epoch": 0.032003200320032, | |
| "grad_norm": 0.3195721209049225, | |
| "learning_rate": 0.0001999987916081595, | |
| "loss": 1.2129, | |
| "mean_token_accuracy": 0.6690118610858917, | |
| "num_tokens": 680536.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.2681958079338074, | |
| "epoch": 0.032403240324032405, | |
| "grad_norm": 0.33165785670280457, | |
| "learning_rate": 0.00019999811189012589, | |
| "loss": 1.2616, | |
| "mean_token_accuracy": 0.6542633771896362, | |
| "num_tokens": 689078.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 1.2480992376804352, | |
| "epoch": 0.032803280328032806, | |
| "grad_norm": 0.3365044891834259, | |
| "learning_rate": 0.00019999728112596419, | |
| "loss": 1.2532, | |
| "mean_token_accuracy": 0.6593984663486481, | |
| "num_tokens": 697600.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 1.2559486627578735, | |
| "epoch": 0.0332033203320332, | |
| "grad_norm": 0.3525690734386444, | |
| "learning_rate": 0.0001999962993170687, | |
| "loss": 1.2407, | |
| "mean_token_accuracy": 0.6652248501777649, | |
| "num_tokens": 706449.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 1.2723756432533264, | |
| "epoch": 0.0336033603360336, | |
| "grad_norm": 0.3243389129638672, | |
| "learning_rate": 0.00019999516646508717, | |
| "loss": 1.2759, | |
| "mean_token_accuracy": 0.6553087830543518, | |
| "num_tokens": 715261.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 1.286735862493515, | |
| "epoch": 0.034003400340034004, | |
| "grad_norm": 0.3348769247531891, | |
| "learning_rate": 0.000199993882571921, | |
| "loss": 1.3288, | |
| "mean_token_accuracy": 0.6503776162862778, | |
| "num_tokens": 723935.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 1.2838447391986847, | |
| "epoch": 0.034403440344034406, | |
| "grad_norm": 0.31921443343162537, | |
| "learning_rate": 0.0001999924476397249, | |
| "loss": 1.2712, | |
| "mean_token_accuracy": 0.6571811884641647, | |
| "num_tokens": 732552.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 1.2601779401302338, | |
| "epoch": 0.0348034803480348, | |
| "grad_norm": 0.3210558593273163, | |
| "learning_rate": 0.0001999908616709071, | |
| "loss": 1.2409, | |
| "mean_token_accuracy": 0.6692058891057968, | |
| "num_tokens": 741619.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 1.2706993520259857, | |
| "epoch": 0.0352035203520352, | |
| "grad_norm": 0.3449415862560272, | |
| "learning_rate": 0.00019998912466812952, | |
| "loss": 1.2301, | |
| "mean_token_accuracy": 0.6645237505435944, | |
| "num_tokens": 750045.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 1.264108419418335, | |
| "epoch": 0.0356035603560356, | |
| "grad_norm": 0.3272925913333893, | |
| "learning_rate": 0.00019998723663430733, | |
| "loss": 1.2593, | |
| "mean_token_accuracy": 0.6653023958206177, | |
| "num_tokens": 758535.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 1.174435406923294, | |
| "epoch": 0.036003600360036005, | |
| "grad_norm": 0.3484836518764496, | |
| "learning_rate": 0.00019998519757260928, | |
| "loss": 1.1771, | |
| "mean_token_accuracy": 0.6722908169031143, | |
| "num_tokens": 766995.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.2018343806266785, | |
| "epoch": 0.036403640364036406, | |
| "grad_norm": 0.3412557542324066, | |
| "learning_rate": 0.00019998300748645754, | |
| "loss": 1.2204, | |
| "mean_token_accuracy": 0.6707678735256195, | |
| "num_tokens": 775542.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 1.3117725551128387, | |
| "epoch": 0.0368036803680368, | |
| "grad_norm": 0.3464583158493042, | |
| "learning_rate": 0.00019998066637952783, | |
| "loss": 1.304, | |
| "mean_token_accuracy": 0.645479291677475, | |
| "num_tokens": 783830.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 1.266638070344925, | |
| "epoch": 0.0372037203720372, | |
| "grad_norm": 0.35132962465286255, | |
| "learning_rate": 0.0001999781742557493, | |
| "loss": 1.2571, | |
| "mean_token_accuracy": 0.6589740812778473, | |
| "num_tokens": 792085.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 1.266037255525589, | |
| "epoch": 0.037603760376037604, | |
| "grad_norm": 0.3320970833301544, | |
| "learning_rate": 0.00019997553111930448, | |
| "loss": 1.2761, | |
| "mean_token_accuracy": 0.654522180557251, | |
| "num_tokens": 800687.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 1.324877679347992, | |
| "epoch": 0.038003800380038005, | |
| "grad_norm": 0.34410229325294495, | |
| "learning_rate": 0.00019997273697462952, | |
| "loss": 1.3059, | |
| "mean_token_accuracy": 0.6469769328832626, | |
| "num_tokens": 808479.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 1.24421826004982, | |
| "epoch": 0.03840384038403841, | |
| "grad_norm": 0.3413639962673187, | |
| "learning_rate": 0.00019996979182641383, | |
| "loss": 1.2116, | |
| "mean_token_accuracy": 0.6725156307220459, | |
| "num_tokens": 817193.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 1.2131675779819489, | |
| "epoch": 0.0388038803880388, | |
| "grad_norm": 0.31536421179771423, | |
| "learning_rate": 0.00019996669567960031, | |
| "loss": 1.2337, | |
| "mean_token_accuracy": 0.6649139970541, | |
| "num_tokens": 825915.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 1.2785483300685883, | |
| "epoch": 0.0392039203920392, | |
| "grad_norm": 0.3453619182109833, | |
| "learning_rate": 0.00019996344853938534, | |
| "loss": 1.2257, | |
| "mean_token_accuracy": 0.6682975143194199, | |
| "num_tokens": 833771.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 1.2706316709518433, | |
| "epoch": 0.039603960396039604, | |
| "grad_norm": 0.34687721729278564, | |
| "learning_rate": 0.00019996005041121871, | |
| "loss": 1.2578, | |
| "mean_token_accuracy": 0.6584849059581757, | |
| "num_tokens": 842093.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 1.310558557510376, | |
| "epoch": 0.040004000400040006, | |
| "grad_norm": 0.34193679690361023, | |
| "learning_rate": 0.0001999565013008035, | |
| "loss": 1.338, | |
| "mean_token_accuracy": 0.6487725079059601, | |
| "num_tokens": 850079.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.2646283209323883, | |
| "epoch": 0.04040404040404041, | |
| "grad_norm": 0.3951033651828766, | |
| "learning_rate": 0.00019995280121409636, | |
| "loss": 1.3172, | |
| "mean_token_accuracy": 0.6424316316843033, | |
| "num_tokens": 858250.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 1.2900939583778381, | |
| "epoch": 0.0408040804080408, | |
| "grad_norm": 0.3364447057247162, | |
| "learning_rate": 0.00019994895015730717, | |
| "loss": 1.2487, | |
| "mean_token_accuracy": 0.6626600474119186, | |
| "num_tokens": 866623.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 1.294897198677063, | |
| "epoch": 0.041204120412041204, | |
| "grad_norm": 0.3506770431995392, | |
| "learning_rate": 0.00019994494813689928, | |
| "loss": 1.2672, | |
| "mean_token_accuracy": 0.6523661762475967, | |
| "num_tokens": 875370.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 1.2744373679161072, | |
| "epoch": 0.041604160416041605, | |
| "grad_norm": 0.31772273778915405, | |
| "learning_rate": 0.00019994079515958942, | |
| "loss": 1.2437, | |
| "mean_token_accuracy": 0.6669129282236099, | |
| "num_tokens": 884081.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 1.2323677241802216, | |
| "epoch": 0.04200420042004201, | |
| "grad_norm": 0.31223100423812866, | |
| "learning_rate": 0.00019993649123234758, | |
| "loss": 1.2034, | |
| "mean_token_accuracy": 0.6670378148555756, | |
| "num_tokens": 892383.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 1.1459662318229675, | |
| "epoch": 0.0424042404240424, | |
| "grad_norm": 0.3307859003543854, | |
| "learning_rate": 0.00019993203636239717, | |
| "loss": 1.2135, | |
| "mean_token_accuracy": 0.6718799471855164, | |
| "num_tokens": 900628.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 1.2268281877040863, | |
| "epoch": 0.0428042804280428, | |
| "grad_norm": 0.35912272334098816, | |
| "learning_rate": 0.00019992743055721493, | |
| "loss": 1.2609, | |
| "mean_token_accuracy": 0.6666164696216583, | |
| "num_tokens": 909062.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 1.200032651424408, | |
| "epoch": 0.043204320432043204, | |
| "grad_norm": 0.35117003321647644, | |
| "learning_rate": 0.00019992267382453092, | |
| "loss": 1.2047, | |
| "mean_token_accuracy": 0.6681774854660034, | |
| "num_tokens": 918221.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 1.3714069724082947, | |
| "epoch": 0.043604360436043606, | |
| "grad_norm": 0.33686235547065735, | |
| "learning_rate": 0.0001999177661723284, | |
| "loss": 1.2777, | |
| "mean_token_accuracy": 0.655053585767746, | |
| "num_tokens": 926443.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 1.3487186133861542, | |
| "epoch": 0.04400440044004401, | |
| "grad_norm": 0.3200630843639374, | |
| "learning_rate": 0.0001999127076088441, | |
| "loss": 1.3107, | |
| "mean_token_accuracy": 0.6602136790752411, | |
| "num_tokens": 934650.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.2584488987922668, | |
| "epoch": 0.0444044404440444, | |
| "grad_norm": 0.31613630056381226, | |
| "learning_rate": 0.0001999074981425679, | |
| "loss": 1.2226, | |
| "mean_token_accuracy": 0.6622737497091293, | |
| "num_tokens": 942947.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 1.1936236023902893, | |
| "epoch": 0.0448044804480448, | |
| "grad_norm": 0.316254198551178, | |
| "learning_rate": 0.00019990213778224298, | |
| "loss": 1.2106, | |
| "mean_token_accuracy": 0.6652569025754929, | |
| "num_tokens": 951465.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 1.165192574262619, | |
| "epoch": 0.045204520452045205, | |
| "grad_norm": 0.31257057189941406, | |
| "learning_rate": 0.00019989662653686576, | |
| "loss": 1.2065, | |
| "mean_token_accuracy": 0.6672259867191315, | |
| "num_tokens": 960215.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 1.180109590291977, | |
| "epoch": 0.045604560456045606, | |
| "grad_norm": 0.3332797884941101, | |
| "learning_rate": 0.00019989096441568591, | |
| "loss": 1.2285, | |
| "mean_token_accuracy": 0.6671265214681625, | |
| "num_tokens": 968893.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 1.220985621213913, | |
| "epoch": 0.04600460046004601, | |
| "grad_norm": 0.3698706030845642, | |
| "learning_rate": 0.0001998851514282063, | |
| "loss": 1.2314, | |
| "mean_token_accuracy": 0.6654269397258759, | |
| "num_tokens": 976891.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 1.2753552794456482, | |
| "epoch": 0.0464046404640464, | |
| "grad_norm": 0.32274726033210754, | |
| "learning_rate": 0.00019987918758418308, | |
| "loss": 1.2811, | |
| "mean_token_accuracy": 0.6611100733280182, | |
| "num_tokens": 984914.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 1.308321624994278, | |
| "epoch": 0.046804680468046804, | |
| "grad_norm": 0.33258453011512756, | |
| "learning_rate": 0.00019987307289362545, | |
| "loss": 1.2541, | |
| "mean_token_accuracy": 0.6605920940637589, | |
| "num_tokens": 993096.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 1.2893326878547668, | |
| "epoch": 0.047204720472047206, | |
| "grad_norm": 0.33915621042251587, | |
| "learning_rate": 0.00019986680736679586, | |
| "loss": 1.2511, | |
| "mean_token_accuracy": 0.6640890389680862, | |
| "num_tokens": 1001323.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 1.30213862657547, | |
| "epoch": 0.04760476047604761, | |
| "grad_norm": 0.3717119097709656, | |
| "learning_rate": 0.00019986039101420994, | |
| "loss": 1.3143, | |
| "mean_token_accuracy": 0.649169459939003, | |
| "num_tokens": 1009892.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 1.3021227717399597, | |
| "epoch": 0.048004800480048, | |
| "grad_norm": 0.32890114188194275, | |
| "learning_rate": 0.0001998538238466364, | |
| "loss": 1.2351, | |
| "mean_token_accuracy": 0.6693892329931259, | |
| "num_tokens": 1017992.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.2010404765605927, | |
| "epoch": 0.0484048404840484, | |
| "grad_norm": 0.3222126066684723, | |
| "learning_rate": 0.00019984710587509706, | |
| "loss": 1.1934, | |
| "mean_token_accuracy": 0.6745197772979736, | |
| "num_tokens": 1026224.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 1.2384890913963318, | |
| "epoch": 0.048804880488048805, | |
| "grad_norm": 0.32965728640556335, | |
| "learning_rate": 0.00019984023711086687, | |
| "loss": 1.2587, | |
| "mean_token_accuracy": 0.6567209810018539, | |
| "num_tokens": 1034674.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 1.1893330216407776, | |
| "epoch": 0.049204920492049206, | |
| "grad_norm": 0.3488786518573761, | |
| "learning_rate": 0.0001998332175654739, | |
| "loss": 1.1999, | |
| "mean_token_accuracy": 0.6683076322078705, | |
| "num_tokens": 1042546.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 1.2300190329551697, | |
| "epoch": 0.04960496049604961, | |
| "grad_norm": 0.33502018451690674, | |
| "learning_rate": 0.00019982604725069918, | |
| "loss": 1.2714, | |
| "mean_token_accuracy": 0.6550982743501663, | |
| "num_tokens": 1051075.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 1.263420820236206, | |
| "epoch": 0.05000500050005, | |
| "grad_norm": 0.35562458634376526, | |
| "learning_rate": 0.00019981872617857684, | |
| "loss": 1.2535, | |
| "mean_token_accuracy": 0.6570105701684952, | |
| "num_tokens": 1059384.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 1.2463673949241638, | |
| "epoch": 0.050405040504050404, | |
| "grad_norm": 0.3122851252555847, | |
| "learning_rate": 0.00019981125436139405, | |
| "loss": 1.2035, | |
| "mean_token_accuracy": 0.6734038293361664, | |
| "num_tokens": 1068524.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 1.3272143006324768, | |
| "epoch": 0.050805080508050805, | |
| "grad_norm": 0.37185049057006836, | |
| "learning_rate": 0.00019980363181169096, | |
| "loss": 1.2723, | |
| "mean_token_accuracy": 0.6541654914617538, | |
| "num_tokens": 1076256.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 1.2414169907569885, | |
| "epoch": 0.05120512051205121, | |
| "grad_norm": 0.32138875126838684, | |
| "learning_rate": 0.00019979585854226065, | |
| "loss": 1.1992, | |
| "mean_token_accuracy": 0.6784048974514008, | |
| "num_tokens": 1084784.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 1.1664628982543945, | |
| "epoch": 0.05160516051605161, | |
| "grad_norm": 0.31607839465141296, | |
| "learning_rate": 0.00019978793456614918, | |
| "loss": 1.1728, | |
| "mean_token_accuracy": 0.6773318648338318, | |
| "num_tokens": 1094177.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 1.1460879147052765, | |
| "epoch": 0.052005200520052, | |
| "grad_norm": 0.3119550347328186, | |
| "learning_rate": 0.0001997798598966556, | |
| "loss": 1.1576, | |
| "mean_token_accuracy": 0.6763872653245926, | |
| "num_tokens": 1102808.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.1866309642791748, | |
| "epoch": 0.052405240524052404, | |
| "grad_norm": 0.3441757261753082, | |
| "learning_rate": 0.00019977163454733184, | |
| "loss": 1.2228, | |
| "mean_token_accuracy": 0.6688681393861771, | |
| "num_tokens": 1111447.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 1.1310507953166962, | |
| "epoch": 0.052805280528052806, | |
| "grad_norm": 0.3540189862251282, | |
| "learning_rate": 0.00019976325853198268, | |
| "loss": 1.1514, | |
| "mean_token_accuracy": 0.6831837445497513, | |
| "num_tokens": 1120000.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 1.19211745262146, | |
| "epoch": 0.05320532053205321, | |
| "grad_norm": 0.3323245942592621, | |
| "learning_rate": 0.00019975473186466583, | |
| "loss": 1.2119, | |
| "mean_token_accuracy": 0.6718263179063797, | |
| "num_tokens": 1128658.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 1.1928575336933136, | |
| "epoch": 0.0536053605360536, | |
| "grad_norm": 0.34882429242134094, | |
| "learning_rate": 0.0001997460545596918, | |
| "loss": 1.2066, | |
| "mean_token_accuracy": 0.6791622638702393, | |
| "num_tokens": 1137143.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 1.226127952337265, | |
| "epoch": 0.054005400540054004, | |
| "grad_norm": 0.3233380913734436, | |
| "learning_rate": 0.00019973722663162396, | |
| "loss": 1.1884, | |
| "mean_token_accuracy": 0.6750646978616714, | |
| "num_tokens": 1145501.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 1.2761054337024689, | |
| "epoch": 0.054405440544054405, | |
| "grad_norm": 0.308118611574173, | |
| "learning_rate": 0.00019972824809527838, | |
| "loss": 1.224, | |
| "mean_token_accuracy": 0.6631017774343491, | |
| "num_tokens": 1153912.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 1.3157364130020142, | |
| "epoch": 0.05480548054805481, | |
| "grad_norm": 0.33582690358161926, | |
| "learning_rate": 0.00019971911896572405, | |
| "loss": 1.2701, | |
| "mean_token_accuracy": 0.6578985750675201, | |
| "num_tokens": 1161769.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 1.2075002789497375, | |
| "epoch": 0.05520552055205521, | |
| "grad_norm": 0.3170996606349945, | |
| "learning_rate": 0.00019970983925828256, | |
| "loss": 1.1906, | |
| "mean_token_accuracy": 0.6732707768678665, | |
| "num_tokens": 1170319.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 1.1732978522777557, | |
| "epoch": 0.0556055605560556, | |
| "grad_norm": 0.32156452536582947, | |
| "learning_rate": 0.0001997004089885283, | |
| "loss": 1.1782, | |
| "mean_token_accuracy": 0.6732619553804398, | |
| "num_tokens": 1178801.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 1.1573354601860046, | |
| "epoch": 0.056005600560056004, | |
| "grad_norm": 0.33083587884902954, | |
| "learning_rate": 0.00019969082817228832, | |
| "loss": 1.2067, | |
| "mean_token_accuracy": 0.6737565696239471, | |
| "num_tokens": 1186994.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.211174637079239, | |
| "epoch": 0.056405640564056406, | |
| "grad_norm": 0.34685665369033813, | |
| "learning_rate": 0.00019968109682564237, | |
| "loss": 1.2586, | |
| "mean_token_accuracy": 0.6569341272115707, | |
| "num_tokens": 1194743.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 1.2521505057811737, | |
| "epoch": 0.05680568056805681, | |
| "grad_norm": 0.35258418321609497, | |
| "learning_rate": 0.00019967121496492282, | |
| "loss": 1.2599, | |
| "mean_token_accuracy": 0.6645904332399368, | |
| "num_tokens": 1202435.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 1.2398549616336823, | |
| "epoch": 0.05720572057205721, | |
| "grad_norm": 0.3388517200946808, | |
| "learning_rate": 0.00019966118260671465, | |
| "loss": 1.2081, | |
| "mean_token_accuracy": 0.6675426363945007, | |
| "num_tokens": 1210326.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 1.297620803117752, | |
| "epoch": 0.0576057605760576, | |
| "grad_norm": 0.34630584716796875, | |
| "learning_rate": 0.0001996509997678554, | |
| "loss": 1.2857, | |
| "mean_token_accuracy": 0.6573289930820465, | |
| "num_tokens": 1218682.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 1.248921811580658, | |
| "epoch": 0.058005800580058005, | |
| "grad_norm": 0.33417370915412903, | |
| "learning_rate": 0.00019964066646543517, | |
| "loss": 1.2036, | |
| "mean_token_accuracy": 0.6730931401252747, | |
| "num_tokens": 1227725.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 1.2742219269275665, | |
| "epoch": 0.058405840584058406, | |
| "grad_norm": 0.31867334246635437, | |
| "learning_rate": 0.00019963018271679667, | |
| "loss": 1.2356, | |
| "mean_token_accuracy": 0.6603083312511444, | |
| "num_tokens": 1236112.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 1.2454158961772919, | |
| "epoch": 0.05880588058805881, | |
| "grad_norm": 0.31619757413864136, | |
| "learning_rate": 0.000199619548539535, | |
| "loss": 1.2272, | |
| "mean_token_accuracy": 0.664936900138855, | |
| "num_tokens": 1244932.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 1.1861615478992462, | |
| "epoch": 0.05920592059205921, | |
| "grad_norm": 0.3590589761734009, | |
| "learning_rate": 0.00019960876395149778, | |
| "loss": 1.2122, | |
| "mean_token_accuracy": 0.6684562414884567, | |
| "num_tokens": 1253316.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 1.1777002215385437, | |
| "epoch": 0.059605960596059604, | |
| "grad_norm": 0.3057377338409424, | |
| "learning_rate": 0.00019959782897078504, | |
| "loss": 1.1483, | |
| "mean_token_accuracy": 0.6810255944728851, | |
| "num_tokens": 1261895.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 1.2077372670173645, | |
| "epoch": 0.060006000600060005, | |
| "grad_norm": 0.32661283016204834, | |
| "learning_rate": 0.00019958674361574927, | |
| "loss": 1.2242, | |
| "mean_token_accuracy": 0.6603673696517944, | |
| "num_tokens": 1270647.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.2129946649074554, | |
| "epoch": 0.06040604060406041, | |
| "grad_norm": 0.33181479573249817, | |
| "learning_rate": 0.00019957550790499526, | |
| "loss": 1.214, | |
| "mean_token_accuracy": 0.6734245270490646, | |
| "num_tokens": 1279483.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 1.2279469072818756, | |
| "epoch": 0.06080608060806081, | |
| "grad_norm": 0.36564233899116516, | |
| "learning_rate": 0.00019956412185738025, | |
| "loss": 1.2227, | |
| "mean_token_accuracy": 0.664169505238533, | |
| "num_tokens": 1288062.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 1.1853630542755127, | |
| "epoch": 0.0612061206120612, | |
| "grad_norm": 0.3081769645214081, | |
| "learning_rate": 0.0001995525854920137, | |
| "loss": 1.2009, | |
| "mean_token_accuracy": 0.6692493110895157, | |
| "num_tokens": 1296644.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 1.1182245910167694, | |
| "epoch": 0.061606160616061605, | |
| "grad_norm": 0.28534799814224243, | |
| "learning_rate": 0.00019954089882825738, | |
| "loss": 1.0659, | |
| "mean_token_accuracy": 0.7025346755981445, | |
| "num_tokens": 1305683.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 1.1886220276355743, | |
| "epoch": 0.062006200620062006, | |
| "grad_norm": 0.3182019293308258, | |
| "learning_rate": 0.0001995290618857253, | |
| "loss": 1.1576, | |
| "mean_token_accuracy": 0.6741877645254135, | |
| "num_tokens": 1314385.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 1.2045941054821014, | |
| "epoch": 0.06240624062406241, | |
| "grad_norm": 0.3276945948600769, | |
| "learning_rate": 0.0001995170746842838, | |
| "loss": 1.165, | |
| "mean_token_accuracy": 0.6834963709115982, | |
| "num_tokens": 1322826.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 1.2731471955776215, | |
| "epoch": 0.0628062806280628, | |
| "grad_norm": 0.3397105932235718, | |
| "learning_rate": 0.00019950493724405117, | |
| "loss": 1.2985, | |
| "mean_token_accuracy": 0.648296907544136, | |
| "num_tokens": 1331327.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 1.1947194337844849, | |
| "epoch": 0.0632063206320632, | |
| "grad_norm": 0.2986201047897339, | |
| "learning_rate": 0.00019949264958539807, | |
| "loss": 1.205, | |
| "mean_token_accuracy": 0.6792440861463547, | |
| "num_tokens": 1340147.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 1.1570270955562592, | |
| "epoch": 0.0636063606360636, | |
| "grad_norm": 0.3215077519416809, | |
| "learning_rate": 0.00019948021172894718, | |
| "loss": 1.1681, | |
| "mean_token_accuracy": 0.6815727949142456, | |
| "num_tokens": 1348989.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 1.122036024928093, | |
| "epoch": 0.064006400640064, | |
| "grad_norm": 0.3120049238204956, | |
| "learning_rate": 0.00019946762369557323, | |
| "loss": 1.1377, | |
| "mean_token_accuracy": 0.6871893852949142, | |
| "num_tokens": 1357863.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.2672194242477417, | |
| "epoch": 0.06440644064406441, | |
| "grad_norm": 0.33700302243232727, | |
| "learning_rate": 0.00019945488550640313, | |
| "loss": 1.2532, | |
| "mean_token_accuracy": 0.664255827665329, | |
| "num_tokens": 1365945.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 1.1509548127651215, | |
| "epoch": 0.06480648064806481, | |
| "grad_norm": 0.3201735019683838, | |
| "learning_rate": 0.00019944199718281559, | |
| "loss": 1.1387, | |
| "mean_token_accuracy": 0.6814217865467072, | |
| "num_tokens": 1375147.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 1.1635609865188599, | |
| "epoch": 0.06520652065206521, | |
| "grad_norm": 0.2953193187713623, | |
| "learning_rate": 0.0001994289587464415, | |
| "loss": 1.1817, | |
| "mean_token_accuracy": 0.6780352145433426, | |
| "num_tokens": 1383893.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 1.1869005262851715, | |
| "epoch": 0.06560656065606561, | |
| "grad_norm": 0.30155807733535767, | |
| "learning_rate": 0.00019941577021916355, | |
| "loss": 1.1834, | |
| "mean_token_accuracy": 0.6724350303411484, | |
| "num_tokens": 1392477.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 1.1506932377815247, | |
| "epoch": 0.066006600660066, | |
| "grad_norm": 0.31121376156806946, | |
| "learning_rate": 0.00019940243162311642, | |
| "loss": 1.1673, | |
| "mean_token_accuracy": 0.6797937452793121, | |
| "num_tokens": 1400899.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 1.2660083770751953, | |
| "epoch": 0.0664066406640664, | |
| "grad_norm": 0.3299071788787842, | |
| "learning_rate": 0.00019938894298068661, | |
| "loss": 1.2725, | |
| "mean_token_accuracy": 0.6537068784236908, | |
| "num_tokens": 1409546.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 1.2500199675559998, | |
| "epoch": 0.0668066806680668, | |
| "grad_norm": 0.3030771017074585, | |
| "learning_rate": 0.00019937530431451243, | |
| "loss": 1.1776, | |
| "mean_token_accuracy": 0.6745365858078003, | |
| "num_tokens": 1417712.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 1.2582001090049744, | |
| "epoch": 0.0672067206720672, | |
| "grad_norm": 0.30366259813308716, | |
| "learning_rate": 0.00019936151564748403, | |
| "loss": 1.2339, | |
| "mean_token_accuracy": 0.6664343029260635, | |
| "num_tokens": 1426352.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 1.2371725142002106, | |
| "epoch": 0.0676067606760676, | |
| "grad_norm": 0.3065868616104126, | |
| "learning_rate": 0.00019934757700274325, | |
| "loss": 1.223, | |
| "mean_token_accuracy": 0.6679128706455231, | |
| "num_tokens": 1434986.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 1.2751116156578064, | |
| "epoch": 0.06800680068006801, | |
| "grad_norm": 0.3346325755119324, | |
| "learning_rate": 0.00019933348840368368, | |
| "loss": 1.2569, | |
| "mean_token_accuracy": 0.6594884544610977, | |
| "num_tokens": 1442823.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.1633991301059723, | |
| "epoch": 0.06840684068406841, | |
| "grad_norm": 0.3242139518260956, | |
| "learning_rate": 0.0001993192498739506, | |
| "loss": 1.1805, | |
| "mean_token_accuracy": 0.6728992164134979, | |
| "num_tokens": 1451134.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 1.2180014848709106, | |
| "epoch": 0.06880688068806881, | |
| "grad_norm": 0.3972644507884979, | |
| "learning_rate": 0.0001993048614374409, | |
| "loss": 1.2393, | |
| "mean_token_accuracy": 0.6580066382884979, | |
| "num_tokens": 1459262.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 1.1176005005836487, | |
| "epoch": 0.06920692069206921, | |
| "grad_norm": 0.3137458264827728, | |
| "learning_rate": 0.00019929032311830303, | |
| "loss": 1.1644, | |
| "mean_token_accuracy": 0.6814699321985245, | |
| "num_tokens": 1467853.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 1.1198759078979492, | |
| "epoch": 0.0696069606960696, | |
| "grad_norm": 0.3517007529735565, | |
| "learning_rate": 0.000199275634940937, | |
| "loss": 1.1312, | |
| "mean_token_accuracy": 0.6874582916498184, | |
| "num_tokens": 1476497.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 1.2389306426048279, | |
| "epoch": 0.07000700070007, | |
| "grad_norm": 0.32016775012016296, | |
| "learning_rate": 0.00019926079692999445, | |
| "loss": 1.214, | |
| "mean_token_accuracy": 0.6705743223428726, | |
| "num_tokens": 1484294.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 1.3337944746017456, | |
| "epoch": 0.0704070407040704, | |
| "grad_norm": 0.33495742082595825, | |
| "learning_rate": 0.00019924580911037827, | |
| "loss": 1.2954, | |
| "mean_token_accuracy": 0.6510952711105347, | |
| "num_tokens": 1492575.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 1.2905775010585785, | |
| "epoch": 0.0708070807080708, | |
| "grad_norm": 0.3236202001571655, | |
| "learning_rate": 0.00019923067150724296, | |
| "loss": 1.219, | |
| "mean_token_accuracy": 0.6705390512943268, | |
| "num_tokens": 1500716.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 1.2353481650352478, | |
| "epoch": 0.0712071207120712, | |
| "grad_norm": 0.3262037932872772, | |
| "learning_rate": 0.00019921538414599437, | |
| "loss": 1.2076, | |
| "mean_token_accuracy": 0.6677059978246689, | |
| "num_tokens": 1509105.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 1.2299005091190338, | |
| "epoch": 0.07160716071607161, | |
| "grad_norm": 0.3147687315940857, | |
| "learning_rate": 0.00019919994705228965, | |
| "loss": 1.2301, | |
| "mean_token_accuracy": 0.6644129753112793, | |
| "num_tokens": 1516981.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 1.1565956473350525, | |
| "epoch": 0.07200720072007201, | |
| "grad_norm": 0.31962037086486816, | |
| "learning_rate": 0.00019918436025203728, | |
| "loss": 1.2013, | |
| "mean_token_accuracy": 0.6825570911169052, | |
| "num_tokens": 1524951.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.1386863589286804, | |
| "epoch": 0.07240724072407241, | |
| "grad_norm": 0.30647844076156616, | |
| "learning_rate": 0.00019916862377139695, | |
| "loss": 1.1697, | |
| "mean_token_accuracy": 0.6716460883617401, | |
| "num_tokens": 1533450.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 1.1206298768520355, | |
| "epoch": 0.07280728072807281, | |
| "grad_norm": 0.2919379472732544, | |
| "learning_rate": 0.00019915273763677959, | |
| "loss": 1.1221, | |
| "mean_token_accuracy": 0.6845085620880127, | |
| "num_tokens": 1542345.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 1.1708945035934448, | |
| "epoch": 0.07320732073207321, | |
| "grad_norm": 0.3223237097263336, | |
| "learning_rate": 0.00019913670187484737, | |
| "loss": 1.1722, | |
| "mean_token_accuracy": 0.681228905916214, | |
| "num_tokens": 1551016.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 1.1606915593147278, | |
| "epoch": 0.0736073607360736, | |
| "grad_norm": 0.3167206943035126, | |
| "learning_rate": 0.00019912051651251346, | |
| "loss": 1.1381, | |
| "mean_token_accuracy": 0.686376079916954, | |
| "num_tokens": 1560201.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 1.2089463472366333, | |
| "epoch": 0.074007400740074, | |
| "grad_norm": 0.331546813249588, | |
| "learning_rate": 0.00019910418157694217, | |
| "loss": 1.1998, | |
| "mean_token_accuracy": 0.6701401472091675, | |
| "num_tokens": 1568847.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 1.2552906274795532, | |
| "epoch": 0.0744074407440744, | |
| "grad_norm": 0.3218790292739868, | |
| "learning_rate": 0.00019908769709554887, | |
| "loss": 1.2302, | |
| "mean_token_accuracy": 0.6671873778104782, | |
| "num_tokens": 1577212.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 1.0971337109804153, | |
| "epoch": 0.0748074807480748, | |
| "grad_norm": 0.2888547480106354, | |
| "learning_rate": 0.00019907106309599985, | |
| "loss": 1.1053, | |
| "mean_token_accuracy": 0.6914333999156952, | |
| "num_tokens": 1586544.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 1.1342568099498749, | |
| "epoch": 0.07520752075207521, | |
| "grad_norm": 0.3135220408439636, | |
| "learning_rate": 0.00019905427960621245, | |
| "loss": 1.1553, | |
| "mean_token_accuracy": 0.678636908531189, | |
| "num_tokens": 1595573.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 1.2157914340496063, | |
| "epoch": 0.07560756075607561, | |
| "grad_norm": 0.32912546396255493, | |
| "learning_rate": 0.00019903734665435472, | |
| "loss": 1.2219, | |
| "mean_token_accuracy": 0.6693233996629715, | |
| "num_tokens": 1603723.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 1.1541197896003723, | |
| "epoch": 0.07600760076007601, | |
| "grad_norm": 0.31249913573265076, | |
| "learning_rate": 0.00019902026426884574, | |
| "loss": 1.1311, | |
| "mean_token_accuracy": 0.6898495107889175, | |
| "num_tokens": 1612212.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.211905598640442, | |
| "epoch": 0.07640764076407641, | |
| "grad_norm": 0.3106580078601837, | |
| "learning_rate": 0.00019900303247835527, | |
| "loss": 1.168, | |
| "mean_token_accuracy": 0.675964280962944, | |
| "num_tokens": 1620162.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 1.2080174088478088, | |
| "epoch": 0.07680768076807681, | |
| "grad_norm": 0.32318130135536194, | |
| "learning_rate": 0.00019898565131180393, | |
| "loss": 1.1781, | |
| "mean_token_accuracy": 0.6760376244783401, | |
| "num_tokens": 1628883.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 1.2078506350517273, | |
| "epoch": 0.0772077207720772, | |
| "grad_norm": 0.33328673243522644, | |
| "learning_rate": 0.0001989681207983629, | |
| "loss": 1.2092, | |
| "mean_token_accuracy": 0.6628051847219467, | |
| "num_tokens": 1637332.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 1.210196852684021, | |
| "epoch": 0.0776077607760776, | |
| "grad_norm": 0.32340574264526367, | |
| "learning_rate": 0.00019895044096745416, | |
| "loss": 1.2329, | |
| "mean_token_accuracy": 0.6619292944669724, | |
| "num_tokens": 1645906.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 1.1815847158432007, | |
| "epoch": 0.078007800780078, | |
| "grad_norm": 0.3175504505634308, | |
| "learning_rate": 0.00019893261184875016, | |
| "loss": 1.2045, | |
| "mean_token_accuracy": 0.6673628389835358, | |
| "num_tokens": 1654114.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 1.1910730004310608, | |
| "epoch": 0.0784078407840784, | |
| "grad_norm": 0.3114391565322876, | |
| "learning_rate": 0.00019891463347217395, | |
| "loss": 1.1889, | |
| "mean_token_accuracy": 0.6714468449354172, | |
| "num_tokens": 1662666.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 1.1541639566421509, | |
| "epoch": 0.07880788078807881, | |
| "grad_norm": 0.3364032506942749, | |
| "learning_rate": 0.0001988965058678992, | |
| "loss": 1.1622, | |
| "mean_token_accuracy": 0.67988321185112, | |
| "num_tokens": 1671435.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 1.222437858581543, | |
| "epoch": 0.07920792079207921, | |
| "grad_norm": 0.3355000913143158, | |
| "learning_rate": 0.00019887822906634983, | |
| "loss": 1.1804, | |
| "mean_token_accuracy": 0.6725995391607285, | |
| "num_tokens": 1679662.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 1.2075644731521606, | |
| "epoch": 0.07960796079607961, | |
| "grad_norm": 0.33377805352211, | |
| "learning_rate": 0.00019885980309820032, | |
| "loss": 1.1547, | |
| "mean_token_accuracy": 0.6831348687410355, | |
| "num_tokens": 1687663.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 1.248348981142044, | |
| "epoch": 0.08000800080008001, | |
| "grad_norm": 0.3341095447540283, | |
| "learning_rate": 0.0001988412279943754, | |
| "loss": 1.2665, | |
| "mean_token_accuracy": 0.6561878323554993, | |
| "num_tokens": 1696479.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.224026381969452, | |
| "epoch": 0.08040804080408041, | |
| "grad_norm": 0.33011487126350403, | |
| "learning_rate": 0.00019882250378605015, | |
| "loss": 1.2181, | |
| "mean_token_accuracy": 0.6664289385080338, | |
| "num_tokens": 1704885.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 1.1437757015228271, | |
| "epoch": 0.08080808080808081, | |
| "grad_norm": 0.31265076994895935, | |
| "learning_rate": 0.00019880363050464993, | |
| "loss": 1.1773, | |
| "mean_token_accuracy": 0.6812110096216202, | |
| "num_tokens": 1713409.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 1.2059556543827057, | |
| "epoch": 0.0812081208120812, | |
| "grad_norm": 0.315448135137558, | |
| "learning_rate": 0.00019878460818185023, | |
| "loss": 1.2278, | |
| "mean_token_accuracy": 0.6699778735637665, | |
| "num_tokens": 1721548.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 1.2078820168972015, | |
| "epoch": 0.0816081608160816, | |
| "grad_norm": 0.3079279363155365, | |
| "learning_rate": 0.00019876543684957667, | |
| "loss": 1.1845, | |
| "mean_token_accuracy": 0.6785111278295517, | |
| "num_tokens": 1729809.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 1.199218899011612, | |
| "epoch": 0.082008200820082, | |
| "grad_norm": 0.3043046295642853, | |
| "learning_rate": 0.000198746116540005, | |
| "loss": 1.1722, | |
| "mean_token_accuracy": 0.6754065752029419, | |
| "num_tokens": 1738734.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 1.2172024846076965, | |
| "epoch": 0.08240824082408241, | |
| "grad_norm": 0.313902884721756, | |
| "learning_rate": 0.00019872664728556101, | |
| "loss": 1.1869, | |
| "mean_token_accuracy": 0.6728281825780869, | |
| "num_tokens": 1746870.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 1.1678736209869385, | |
| "epoch": 0.08280828082808281, | |
| "grad_norm": 0.3191705644130707, | |
| "learning_rate": 0.00019870702911892042, | |
| "loss": 1.1546, | |
| "mean_token_accuracy": 0.6843972355127335, | |
| "num_tokens": 1755295.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 1.279354214668274, | |
| "epoch": 0.08320832083208321, | |
| "grad_norm": 0.3313900828361511, | |
| "learning_rate": 0.0001986872620730089, | |
| "loss": 1.2558, | |
| "mean_token_accuracy": 0.659809798002243, | |
| "num_tokens": 1763606.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 1.078108698129654, | |
| "epoch": 0.08360836083608361, | |
| "grad_norm": 0.283428430557251, | |
| "learning_rate": 0.00019866734618100202, | |
| "loss": 1.1032, | |
| "mean_token_accuracy": 0.69297856092453, | |
| "num_tokens": 1772887.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 1.186295509338379, | |
| "epoch": 0.08400840084008401, | |
| "grad_norm": 0.35003766417503357, | |
| "learning_rate": 0.0001986472814763251, | |
| "loss": 1.2374, | |
| "mean_token_accuracy": 0.6684627532958984, | |
| "num_tokens": 1781067.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.1557523012161255, | |
| "epoch": 0.08440844084408441, | |
| "grad_norm": 0.31848254799842834, | |
| "learning_rate": 0.00019862706799265322, | |
| "loss": 1.1854, | |
| "mean_token_accuracy": 0.6773674935102463, | |
| "num_tokens": 1789844.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 1.218627154827118, | |
| "epoch": 0.0848084808480848, | |
| "grad_norm": 0.3408789038658142, | |
| "learning_rate": 0.00019860670576391128, | |
| "loss": 1.1708, | |
| "mean_token_accuracy": 0.6817043423652649, | |
| "num_tokens": 1798509.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 1.2130761444568634, | |
| "epoch": 0.0852085208520852, | |
| "grad_norm": 0.7527572512626648, | |
| "learning_rate": 0.0001985861948242736, | |
| "loss": 1.2157, | |
| "mean_token_accuracy": 0.6661449372768402, | |
| "num_tokens": 1807202.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 1.2128455638885498, | |
| "epoch": 0.0856085608560856, | |
| "grad_norm": 0.29946374893188477, | |
| "learning_rate": 0.00019856553520816435, | |
| "loss": 1.1896, | |
| "mean_token_accuracy": 0.6733538210391998, | |
| "num_tokens": 1816131.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 1.2612944841384888, | |
| "epoch": 0.086008600860086, | |
| "grad_norm": 0.32515719532966614, | |
| "learning_rate": 0.00019854472695025698, | |
| "loss": 1.2329, | |
| "mean_token_accuracy": 0.669788658618927, | |
| "num_tokens": 1824283.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 1.1807590425014496, | |
| "epoch": 0.08640864086408641, | |
| "grad_norm": 0.3279406726360321, | |
| "learning_rate": 0.0001985237700854746, | |
| "loss": 1.1565, | |
| "mean_token_accuracy": 0.6816118210554123, | |
| "num_tokens": 1833322.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 1.2046120464801788, | |
| "epoch": 0.08680868086808681, | |
| "grad_norm": 0.2987005412578583, | |
| "learning_rate": 0.00019850266464898955, | |
| "loss": 1.179, | |
| "mean_token_accuracy": 0.6783045381307602, | |
| "num_tokens": 1842092.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 1.1976227462291718, | |
| "epoch": 0.08720872087208721, | |
| "grad_norm": 0.30504319071769714, | |
| "learning_rate": 0.00019848141067622374, | |
| "loss": 1.1589, | |
| "mean_token_accuracy": 0.6762242764234543, | |
| "num_tokens": 1850740.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 1.2001455426216125, | |
| "epoch": 0.08760876087608761, | |
| "grad_norm": 0.35163310170173645, | |
| "learning_rate": 0.0001984600082028482, | |
| "loss": 1.1941, | |
| "mean_token_accuracy": 0.6701504737138748, | |
| "num_tokens": 1858729.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 1.0998838245868683, | |
| "epoch": 0.08800880088008801, | |
| "grad_norm": 0.3166980445384979, | |
| "learning_rate": 0.0001984384572647832, | |
| "loss": 1.1238, | |
| "mean_token_accuracy": 0.683118149638176, | |
| "num_tokens": 1867218.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.1223637461662292, | |
| "epoch": 0.0884088408840884, | |
| "grad_norm": 0.3210962116718292, | |
| "learning_rate": 0.0001984167578981983, | |
| "loss": 1.158, | |
| "mean_token_accuracy": 0.685064285993576, | |
| "num_tokens": 1875656.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 1.1469238698482513, | |
| "epoch": 0.0888088808880888, | |
| "grad_norm": 0.37055703997612, | |
| "learning_rate": 0.00019839491013951213, | |
| "loss": 1.1976, | |
| "mean_token_accuracy": 0.66952283680439, | |
| "num_tokens": 1884042.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 1.2010729908943176, | |
| "epoch": 0.0892089208920892, | |
| "grad_norm": 0.30089443922042847, | |
| "learning_rate": 0.00019837291402539223, | |
| "loss": 1.1677, | |
| "mean_token_accuracy": 0.6765223145484924, | |
| "num_tokens": 1892519.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 1.222718983888626, | |
| "epoch": 0.0896089608960896, | |
| "grad_norm": 0.3071632981300354, | |
| "learning_rate": 0.00019835076959275532, | |
| "loss": 1.1918, | |
| "mean_token_accuracy": 0.6696299612522125, | |
| "num_tokens": 1900924.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 1.216365933418274, | |
| "epoch": 0.09000900090009001, | |
| "grad_norm": 0.3337574303150177, | |
| "learning_rate": 0.00019832847687876692, | |
| "loss": 1.1572, | |
| "mean_token_accuracy": 0.6832773238420486, | |
| "num_tokens": 1909276.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 1.1910041272640228, | |
| "epoch": 0.09040904090409041, | |
| "grad_norm": 0.3146218955516815, | |
| "learning_rate": 0.0001983060359208415, | |
| "loss": 1.1782, | |
| "mean_token_accuracy": 0.679167777299881, | |
| "num_tokens": 1918407.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 1.162790209054947, | |
| "epoch": 0.09080908090809081, | |
| "grad_norm": 0.2975619435310364, | |
| "learning_rate": 0.0001982834467566423, | |
| "loss": 1.1683, | |
| "mean_token_accuracy": 0.6799277067184448, | |
| "num_tokens": 1927282.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 1.192271113395691, | |
| "epoch": 0.09120912091209121, | |
| "grad_norm": 0.3205324113368988, | |
| "learning_rate": 0.0001982607094240813, | |
| "loss": 1.1681, | |
| "mean_token_accuracy": 0.6754294186830521, | |
| "num_tokens": 1935737.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 1.1858693957328796, | |
| "epoch": 0.09160916091609161, | |
| "grad_norm": 0.3366444706916809, | |
| "learning_rate": 0.00019823782396131902, | |
| "loss": 1.1944, | |
| "mean_token_accuracy": 0.6657039225101471, | |
| "num_tokens": 1943472.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 1.1361185312271118, | |
| "epoch": 0.09200920092009202, | |
| "grad_norm": 0.31257081031799316, | |
| "learning_rate": 0.00019821479040676488, | |
| "loss": 1.1529, | |
| "mean_token_accuracy": 0.6812857985496521, | |
| "num_tokens": 1952251.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.2052267491817474, | |
| "epoch": 0.0924092409240924, | |
| "grad_norm": 0.3371609151363373, | |
| "learning_rate": 0.0001981916087990766, | |
| "loss": 1.2363, | |
| "mean_token_accuracy": 0.6580934226512909, | |
| "num_tokens": 1960349.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 1.1373478174209595, | |
| "epoch": 0.0928092809280928, | |
| "grad_norm": 0.30473393201828003, | |
| "learning_rate": 0.00019816827917716048, | |
| "loss": 1.1727, | |
| "mean_token_accuracy": 0.6796131581068039, | |
| "num_tokens": 1969233.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 1.1681481301784515, | |
| "epoch": 0.0932093209320932, | |
| "grad_norm": 0.3225601315498352, | |
| "learning_rate": 0.0001981448015801712, | |
| "loss": 1.1528, | |
| "mean_token_accuracy": 0.6749817878007889, | |
| "num_tokens": 1977270.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 1.2196559309959412, | |
| "epoch": 0.09360936093609361, | |
| "grad_norm": 0.33247852325439453, | |
| "learning_rate": 0.00019812117604751185, | |
| "loss": 1.1834, | |
| "mean_token_accuracy": 0.6816778779029846, | |
| "num_tokens": 1985087.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 1.218104362487793, | |
| "epoch": 0.09400940094009401, | |
| "grad_norm": 0.3164643347263336, | |
| "learning_rate": 0.00019809740261883372, | |
| "loss": 1.1791, | |
| "mean_token_accuracy": 0.6742540150880814, | |
| "num_tokens": 1993142.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 1.2172793745994568, | |
| "epoch": 0.09440944094409441, | |
| "grad_norm": 0.31248074769973755, | |
| "learning_rate": 0.0001980734813340364, | |
| "loss": 1.2067, | |
| "mean_token_accuracy": 0.6745200008153915, | |
| "num_tokens": 2001487.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 1.203236162662506, | |
| "epoch": 0.09480948094809481, | |
| "grad_norm": 0.32407742738723755, | |
| "learning_rate": 0.0001980494122332676, | |
| "loss": 1.1664, | |
| "mean_token_accuracy": 0.6777038276195526, | |
| "num_tokens": 2010136.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 1.1953341364860535, | |
| "epoch": 0.09520952095209521, | |
| "grad_norm": 0.3571881651878357, | |
| "learning_rate": 0.00019802519535692302, | |
| "loss": 1.1651, | |
| "mean_token_accuracy": 0.6782020479440689, | |
| "num_tokens": 2018515.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 1.208018183708191, | |
| "epoch": 0.09560956095609562, | |
| "grad_norm": 0.3488442599773407, | |
| "learning_rate": 0.00019800083074564658, | |
| "loss": 1.2217, | |
| "mean_token_accuracy": 0.6720796823501587, | |
| "num_tokens": 2026942.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 1.1499423384666443, | |
| "epoch": 0.096009600960096, | |
| "grad_norm": 0.30266088247299194, | |
| "learning_rate": 0.00019797631844032992, | |
| "loss": 1.1776, | |
| "mean_token_accuracy": 0.6771319806575775, | |
| "num_tokens": 2035674.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.1237535774707794, | |
| "epoch": 0.0964096409640964, | |
| "grad_norm": 0.3096405863761902, | |
| "learning_rate": 0.00019795165848211278, | |
| "loss": 1.1122, | |
| "mean_token_accuracy": 0.6934310793876648, | |
| "num_tokens": 2044052.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 1.1529573500156403, | |
| "epoch": 0.0968096809680968, | |
| "grad_norm": 0.3192532956600189, | |
| "learning_rate": 0.0001979268509123825, | |
| "loss": 1.1804, | |
| "mean_token_accuracy": 0.6760334223508835, | |
| "num_tokens": 2052448.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 1.2383974194526672, | |
| "epoch": 0.09720972097209721, | |
| "grad_norm": 0.3160487711429596, | |
| "learning_rate": 0.00019790189577277432, | |
| "loss": 1.2465, | |
| "mean_token_accuracy": 0.6652619689702988, | |
| "num_tokens": 2060776.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 1.2161905169487, | |
| "epoch": 0.09760976097609761, | |
| "grad_norm": 0.32217562198638916, | |
| "learning_rate": 0.00019787679310517107, | |
| "loss": 1.1872, | |
| "mean_token_accuracy": 0.6732243746519089, | |
| "num_tokens": 2068794.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 1.1646412014961243, | |
| "epoch": 0.09800980098009801, | |
| "grad_norm": 0.3009166419506073, | |
| "learning_rate": 0.00019785154295170316, | |
| "loss": 1.1652, | |
| "mean_token_accuracy": 0.6807472556829453, | |
| "num_tokens": 2077262.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 1.2155237197875977, | |
| "epoch": 0.09840984098409841, | |
| "grad_norm": 0.3069799840450287, | |
| "learning_rate": 0.00019782614535474862, | |
| "loss": 1.216, | |
| "mean_token_accuracy": 0.6698369234800339, | |
| "num_tokens": 2085649.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 1.1119366884231567, | |
| "epoch": 0.09880988098809881, | |
| "grad_norm": 0.30247923731803894, | |
| "learning_rate": 0.00019780060035693285, | |
| "loss": 1.1038, | |
| "mean_token_accuracy": 0.6942414045333862, | |
| "num_tokens": 2094198.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 1.2534517645835876, | |
| "epoch": 0.09920992099209922, | |
| "grad_norm": 0.3274390697479248, | |
| "learning_rate": 0.0001977749080011287, | |
| "loss": 1.2635, | |
| "mean_token_accuracy": 0.6554094851016998, | |
| "num_tokens": 2102101.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 1.1967229545116425, | |
| "epoch": 0.09960996099609962, | |
| "grad_norm": 0.29584378004074097, | |
| "learning_rate": 0.00019774906833045625, | |
| "loss": 1.1822, | |
| "mean_token_accuracy": 0.6769470870494843, | |
| "num_tokens": 2110466.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 1.1380691528320312, | |
| "epoch": 0.1000100010001, | |
| "grad_norm": 0.28823035955429077, | |
| "learning_rate": 0.00019772308138828299, | |
| "loss": 1.0987, | |
| "mean_token_accuracy": 0.6907877773046494, | |
| "num_tokens": 2119656.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.155064195394516, | |
| "epoch": 0.1004100410041004, | |
| "grad_norm": 0.3187693655490875, | |
| "learning_rate": 0.00019769694721822337, | |
| "loss": 1.1542, | |
| "mean_token_accuracy": 0.6734511256217957, | |
| "num_tokens": 2128073.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 1.1665138900279999, | |
| "epoch": 0.10081008100810081, | |
| "grad_norm": 0.30443915724754333, | |
| "learning_rate": 0.00019767066586413905, | |
| "loss": 1.2047, | |
| "mean_token_accuracy": 0.6689727902412415, | |
| "num_tokens": 2136624.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 1.1986846625804901, | |
| "epoch": 0.10121012101210121, | |
| "grad_norm": 0.2993563413619995, | |
| "learning_rate": 0.0001976442373701387, | |
| "loss": 1.1885, | |
| "mean_token_accuracy": 0.6774641126394272, | |
| "num_tokens": 2144946.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 1.1575412154197693, | |
| "epoch": 0.10161016101610161, | |
| "grad_norm": 0.31819280982017517, | |
| "learning_rate": 0.00019761766178057796, | |
| "loss": 1.1617, | |
| "mean_token_accuracy": 0.6737077832221985, | |
| "num_tokens": 2153241.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 1.1932867169380188, | |
| "epoch": 0.10201020102010201, | |
| "grad_norm": 0.33500298857688904, | |
| "learning_rate": 0.00019759093914005932, | |
| "loss": 1.1739, | |
| "mean_token_accuracy": 0.6722579598426819, | |
| "num_tokens": 2161532.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 1.2010496854782104, | |
| "epoch": 0.10241024102410241, | |
| "grad_norm": 0.3177407681941986, | |
| "learning_rate": 0.00019756406949343204, | |
| "loss": 1.1888, | |
| "mean_token_accuracy": 0.6757108420133591, | |
| "num_tokens": 2170296.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 1.1958762109279633, | |
| "epoch": 0.10281028102810282, | |
| "grad_norm": 0.30990293622016907, | |
| "learning_rate": 0.00019753705288579217, | |
| "loss": 1.1797, | |
| "mean_token_accuracy": 0.6757787764072418, | |
| "num_tokens": 2178618.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 1.1743170619010925, | |
| "epoch": 0.10321032103210322, | |
| "grad_norm": 0.3038559854030609, | |
| "learning_rate": 0.00019750988936248235, | |
| "loss": 1.169, | |
| "mean_token_accuracy": 0.6733282506465912, | |
| "num_tokens": 2187168.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 1.1737709939479828, | |
| "epoch": 0.1036103610361036, | |
| "grad_norm": 0.321360319852829, | |
| "learning_rate": 0.0001974825789690918, | |
| "loss": 1.1957, | |
| "mean_token_accuracy": 0.6770029366016388, | |
| "num_tokens": 2195246.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 1.172276645898819, | |
| "epoch": 0.104010401040104, | |
| "grad_norm": 0.3069777488708496, | |
| "learning_rate": 0.00019745512175145627, | |
| "loss": 1.2094, | |
| "mean_token_accuracy": 0.6666506826877594, | |
| "num_tokens": 2203717.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.3047214448451996, | |
| "epoch": 0.10441044104410441, | |
| "grad_norm": 0.3076897859573364, | |
| "learning_rate": 0.0001974275177556579, | |
| "loss": 1.301, | |
| "mean_token_accuracy": 0.6500514298677444, | |
| "num_tokens": 2212037.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 1.1853089034557343, | |
| "epoch": 0.10481048104810481, | |
| "grad_norm": 0.30814552307128906, | |
| "learning_rate": 0.00019739976702802517, | |
| "loss": 1.121, | |
| "mean_token_accuracy": 0.6797177791595459, | |
| "num_tokens": 2220415.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 1.14727121591568, | |
| "epoch": 0.10521052105210521, | |
| "grad_norm": 0.3139231503009796, | |
| "learning_rate": 0.0001973718696151329, | |
| "loss": 1.0951, | |
| "mean_token_accuracy": 0.6984894424676895, | |
| "num_tokens": 2228773.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 1.1453731060028076, | |
| "epoch": 0.10561056105610561, | |
| "grad_norm": 0.3104467988014221, | |
| "learning_rate": 0.00019734382556380194, | |
| "loss": 1.145, | |
| "mean_token_accuracy": 0.6833966672420502, | |
| "num_tokens": 2236602.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 1.129274994134903, | |
| "epoch": 0.10601060106010601, | |
| "grad_norm": 0.29663506150245667, | |
| "learning_rate": 0.0001973156349210994, | |
| "loss": 1.1386, | |
| "mean_token_accuracy": 0.6783726066350937, | |
| "num_tokens": 2245313.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 1.1950629949569702, | |
| "epoch": 0.10641064106410641, | |
| "grad_norm": 0.3033241033554077, | |
| "learning_rate": 0.0001972872977343383, | |
| "loss": 1.2095, | |
| "mean_token_accuracy": 0.6765413582324982, | |
| "num_tokens": 2254362.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 1.2014857530593872, | |
| "epoch": 0.10681068106810682, | |
| "grad_norm": 0.31535446643829346, | |
| "learning_rate": 0.00019725881405107778, | |
| "loss": 1.2053, | |
| "mean_token_accuracy": 0.6713583916425705, | |
| "num_tokens": 2262331.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 1.1801405549049377, | |
| "epoch": 0.1072107210721072, | |
| "grad_norm": 0.30611008405685425, | |
| "learning_rate": 0.0001972301839191226, | |
| "loss": 1.1823, | |
| "mean_token_accuracy": 0.6748154610395432, | |
| "num_tokens": 2270765.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 1.1290169060230255, | |
| "epoch": 0.1076107610761076, | |
| "grad_norm": 0.30215638875961304, | |
| "learning_rate": 0.00019720140738652345, | |
| "loss": 1.1209, | |
| "mean_token_accuracy": 0.6912433356046677, | |
| "num_tokens": 2279593.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 1.1610883474349976, | |
| "epoch": 0.10801080108010801, | |
| "grad_norm": 0.30377084016799927, | |
| "learning_rate": 0.00019717248450157681, | |
| "loss": 1.1863, | |
| "mean_token_accuracy": 0.6740070879459381, | |
| "num_tokens": 2288100.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.1068450212478638, | |
| "epoch": 0.10841084108410841, | |
| "grad_norm": 0.3132963478565216, | |
| "learning_rate": 0.00019714341531282462, | |
| "loss": 1.0841, | |
| "mean_token_accuracy": 0.6911667734384537, | |
| "num_tokens": 2296290.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 1.168148934841156, | |
| "epoch": 0.10881088108810881, | |
| "grad_norm": 0.3282947242259979, | |
| "learning_rate": 0.0001971141998690545, | |
| "loss": 1.1941, | |
| "mean_token_accuracy": 0.673908457159996, | |
| "num_tokens": 2304766.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 1.1689501702785492, | |
| "epoch": 0.10921092109210921, | |
| "grad_norm": 0.2957140803337097, | |
| "learning_rate": 0.00019708483821929943, | |
| "loss": 1.1398, | |
| "mean_token_accuracy": 0.6831405013799667, | |
| "num_tokens": 2313114.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 1.1905297338962555, | |
| "epoch": 0.10961096109610961, | |
| "grad_norm": 0.29807668924331665, | |
| "learning_rate": 0.00019705533041283779, | |
| "loss": 1.1736, | |
| "mean_token_accuracy": 0.6775653660297394, | |
| "num_tokens": 2321660.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 1.1815482079982758, | |
| "epoch": 0.11001100110011001, | |
| "grad_norm": 0.29083186388015747, | |
| "learning_rate": 0.00019702567649919337, | |
| "loss": 1.1603, | |
| "mean_token_accuracy": 0.6754807829856873, | |
| "num_tokens": 2330342.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 1.1261299550533295, | |
| "epoch": 0.11041104110411042, | |
| "grad_norm": 0.2901794910430908, | |
| "learning_rate": 0.00019699587652813503, | |
| "loss": 1.1284, | |
| "mean_token_accuracy": 0.691281333565712, | |
| "num_tokens": 2338852.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 1.184859186410904, | |
| "epoch": 0.11081108110811082, | |
| "grad_norm": 0.310745507478714, | |
| "learning_rate": 0.00019696593054967682, | |
| "loss": 1.2127, | |
| "mean_token_accuracy": 0.6673152446746826, | |
| "num_tokens": 2346809.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 1.1188380122184753, | |
| "epoch": 0.1112111211121112, | |
| "grad_norm": 0.29587554931640625, | |
| "learning_rate": 0.00019693583861407786, | |
| "loss": 1.0981, | |
| "mean_token_accuracy": 0.6947813928127289, | |
| "num_tokens": 2355532.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 1.172318309545517, | |
| "epoch": 0.1116111611161116, | |
| "grad_norm": 0.3138435482978821, | |
| "learning_rate": 0.00019690560077184223, | |
| "loss": 1.1441, | |
| "mean_token_accuracy": 0.6789282411336899, | |
| "num_tokens": 2363938.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 1.1374418139457703, | |
| "epoch": 0.11201120112011201, | |
| "grad_norm": 0.34152451157569885, | |
| "learning_rate": 0.0001968752170737188, | |
| "loss": 1.1081, | |
| "mean_token_accuracy": 0.6848500221967697, | |
| "num_tokens": 2372334.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.1317946314811707, | |
| "epoch": 0.11241124112411241, | |
| "grad_norm": 0.29949530959129333, | |
| "learning_rate": 0.0001968446875707014, | |
| "loss": 1.1138, | |
| "mean_token_accuracy": 0.6870416551828384, | |
| "num_tokens": 2380730.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 1.0892143547534943, | |
| "epoch": 0.11281128112811281, | |
| "grad_norm": 0.3009011447429657, | |
| "learning_rate": 0.00019681401231402842, | |
| "loss": 1.0712, | |
| "mean_token_accuracy": 0.6998904794454575, | |
| "num_tokens": 2389463.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 1.1513322591781616, | |
| "epoch": 0.11321132113211321, | |
| "grad_norm": 0.29763105511665344, | |
| "learning_rate": 0.00019678319135518294, | |
| "loss": 1.1861, | |
| "mean_token_accuracy": 0.6697124987840652, | |
| "num_tokens": 2398473.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 1.1688634753227234, | |
| "epoch": 0.11361136113611361, | |
| "grad_norm": 0.33001646399497986, | |
| "learning_rate": 0.00019675222474589257, | |
| "loss": 1.2012, | |
| "mean_token_accuracy": 0.673338770866394, | |
| "num_tokens": 2406493.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 1.1393934190273285, | |
| "epoch": 0.11401140114011402, | |
| "grad_norm": 0.2978336215019226, | |
| "learning_rate": 0.00019672111253812933, | |
| "loss": 1.1566, | |
| "mean_token_accuracy": 0.6849386692047119, | |
| "num_tokens": 2414963.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 1.1978220045566559, | |
| "epoch": 0.11441144114411442, | |
| "grad_norm": 0.296939879655838, | |
| "learning_rate": 0.00019668985478410968, | |
| "loss": 1.1508, | |
| "mean_token_accuracy": 0.6871092170476913, | |
| "num_tokens": 2423476.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 1.1493785977363586, | |
| "epoch": 0.1148114811481148, | |
| "grad_norm": 0.3038109242916107, | |
| "learning_rate": 0.00019665845153629425, | |
| "loss": 1.1429, | |
| "mean_token_accuracy": 0.6873074918985367, | |
| "num_tokens": 2432015.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 1.1764490902423859, | |
| "epoch": 0.1152115211521152, | |
| "grad_norm": 0.28137773275375366, | |
| "learning_rate": 0.00019662690284738793, | |
| "loss": 1.1206, | |
| "mean_token_accuracy": 0.6875211298465729, | |
| "num_tokens": 2440577.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 1.1811064779758453, | |
| "epoch": 0.11561156115611561, | |
| "grad_norm": 0.2927968502044678, | |
| "learning_rate": 0.00019659520877033976, | |
| "loss": 1.1828, | |
| "mean_token_accuracy": 0.67679663002491, | |
| "num_tokens": 2449585.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 1.1157205402851105, | |
| "epoch": 0.11601160116011601, | |
| "grad_norm": 0.2844160199165344, | |
| "learning_rate": 0.0001965633693583426, | |
| "loss": 1.1127, | |
| "mean_token_accuracy": 0.6861093044281006, | |
| "num_tokens": 2458691.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.1210555136203766, | |
| "epoch": 0.11641164116411641, | |
| "grad_norm": 0.30678603053092957, | |
| "learning_rate": 0.0001965313846648334, | |
| "loss": 1.1495, | |
| "mean_token_accuracy": 0.6870106756687164, | |
| "num_tokens": 2466917.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 1.1256535351276398, | |
| "epoch": 0.11681168116811681, | |
| "grad_norm": 0.31176719069480896, | |
| "learning_rate": 0.00019649925474349292, | |
| "loss": 1.1516, | |
| "mean_token_accuracy": 0.679766371846199, | |
| "num_tokens": 2475064.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 1.1276935040950775, | |
| "epoch": 0.11721172117211721, | |
| "grad_norm": 0.29645654559135437, | |
| "learning_rate": 0.00019646697964824562, | |
| "loss": 1.1372, | |
| "mean_token_accuracy": 0.6837837547063828, | |
| "num_tokens": 2483736.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 1.1446107029914856, | |
| "epoch": 0.11761176117611762, | |
| "grad_norm": 0.2959735691547394, | |
| "learning_rate": 0.00019643455943325953, | |
| "loss": 1.1344, | |
| "mean_token_accuracy": 0.6885244697332382, | |
| "num_tokens": 2492223.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 1.1486328840255737, | |
| "epoch": 0.11801180118011802, | |
| "grad_norm": 0.35478872060775757, | |
| "learning_rate": 0.00019640199415294645, | |
| "loss": 1.1195, | |
| "mean_token_accuracy": 0.6887603253126144, | |
| "num_tokens": 2500600.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 1.126534789800644, | |
| "epoch": 0.11841184118411842, | |
| "grad_norm": 0.2932710349559784, | |
| "learning_rate": 0.00019636928386196145, | |
| "loss": 1.1047, | |
| "mean_token_accuracy": 0.696495532989502, | |
| "num_tokens": 2509047.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 1.1546699106693268, | |
| "epoch": 0.1188118811881188, | |
| "grad_norm": 0.2861276865005493, | |
| "learning_rate": 0.00019633642861520306, | |
| "loss": 1.1463, | |
| "mean_token_accuracy": 0.6796572506427765, | |
| "num_tokens": 2517885.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 1.1594507992267609, | |
| "epoch": 0.11921192119211921, | |
| "grad_norm": 0.5982229709625244, | |
| "learning_rate": 0.0001963034284678131, | |
| "loss": 1.1527, | |
| "mean_token_accuracy": 0.6782443970441818, | |
| "num_tokens": 2525962.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 1.1879192888736725, | |
| "epoch": 0.11961196119611961, | |
| "grad_norm": 0.30875492095947266, | |
| "learning_rate": 0.00019627028347517648, | |
| "loss": 1.1854, | |
| "mean_token_accuracy": 0.675933450460434, | |
| "num_tokens": 2534220.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 1.1593869030475616, | |
| "epoch": 0.12001200120012001, | |
| "grad_norm": 0.3053128719329834, | |
| "learning_rate": 0.00019623699369292137, | |
| "loss": 1.1617, | |
| "mean_token_accuracy": 0.677645817399025, | |
| "num_tokens": 2542206.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.1326042711734772, | |
| "epoch": 0.12041204120412041, | |
| "grad_norm": 0.3102218508720398, | |
| "learning_rate": 0.00019620355917691884, | |
| "loss": 1.1384, | |
| "mean_token_accuracy": 0.6767238080501556, | |
| "num_tokens": 2550584.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 1.1040166020393372, | |
| "epoch": 0.12081208120812081, | |
| "grad_norm": 0.3166041970252991, | |
| "learning_rate": 0.00019616997998328292, | |
| "loss": 1.1206, | |
| "mean_token_accuracy": 0.6878381818532944, | |
| "num_tokens": 2558969.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 1.1306456625461578, | |
| "epoch": 0.12121212121212122, | |
| "grad_norm": 0.31803345680236816, | |
| "learning_rate": 0.00019613625616837034, | |
| "loss": 1.1286, | |
| "mean_token_accuracy": 0.6829645335674286, | |
| "num_tokens": 2567510.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 1.2087586522102356, | |
| "epoch": 0.12161216121612162, | |
| "grad_norm": 0.313399076461792, | |
| "learning_rate": 0.0001961023877887807, | |
| "loss": 1.2, | |
| "mean_token_accuracy": 0.6653729230165482, | |
| "num_tokens": 2575393.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 1.1803353130817413, | |
| "epoch": 0.12201220122012202, | |
| "grad_norm": 0.2919938862323761, | |
| "learning_rate": 0.0001960683749013562, | |
| "loss": 1.1749, | |
| "mean_token_accuracy": 0.6795784384012222, | |
| "num_tokens": 2583973.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 1.206252634525299, | |
| "epoch": 0.1224122412241224, | |
| "grad_norm": 0.30734333395957947, | |
| "learning_rate": 0.00019603421756318146, | |
| "loss": 1.2079, | |
| "mean_token_accuracy": 0.6748498380184174, | |
| "num_tokens": 2592413.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 1.1237642168998718, | |
| "epoch": 0.12281228122812281, | |
| "grad_norm": 0.2940463721752167, | |
| "learning_rate": 0.00019599991583158367, | |
| "loss": 1.0924, | |
| "mean_token_accuracy": 0.6870536357164383, | |
| "num_tokens": 2601189.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 1.1055436730384827, | |
| "epoch": 0.12321232123212321, | |
| "grad_norm": 0.2887219488620758, | |
| "learning_rate": 0.00019596546976413226, | |
| "loss": 1.1143, | |
| "mean_token_accuracy": 0.6970756649971008, | |
| "num_tokens": 2610378.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 1.1455924063920975, | |
| "epoch": 0.12361236123612361, | |
| "grad_norm": 0.30642586946487427, | |
| "learning_rate": 0.00019593087941863893, | |
| "loss": 1.1163, | |
| "mean_token_accuracy": 0.6846802532672882, | |
| "num_tokens": 2618765.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 1.1495613157749176, | |
| "epoch": 0.12401240124012401, | |
| "grad_norm": 0.2958558201789856, | |
| "learning_rate": 0.00019589614485315766, | |
| "loss": 1.1277, | |
| "mean_token_accuracy": 0.692332923412323, | |
| "num_tokens": 2627306.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.1369233131408691, | |
| "epoch": 0.12441244124412441, | |
| "grad_norm": 0.2962513566017151, | |
| "learning_rate": 0.0001958612661259842, | |
| "loss": 1.1458, | |
| "mean_token_accuracy": 0.6847312748432159, | |
| "num_tokens": 2635802.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 1.1192970275878906, | |
| "epoch": 0.12481248124812482, | |
| "grad_norm": 0.3100016117095947, | |
| "learning_rate": 0.00019582624329565656, | |
| "loss": 1.1479, | |
| "mean_token_accuracy": 0.679630234837532, | |
| "num_tokens": 2644316.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 1.1962910890579224, | |
| "epoch": 0.1252125212521252, | |
| "grad_norm": 0.3248625099658966, | |
| "learning_rate": 0.0001957910764209543, | |
| "loss": 1.2285, | |
| "mean_token_accuracy": 0.6648171693086624, | |
| "num_tokens": 2652787.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 1.1034400761127472, | |
| "epoch": 0.1256125612561256, | |
| "grad_norm": 0.2892885208129883, | |
| "learning_rate": 0.00019575576556089897, | |
| "loss": 1.1218, | |
| "mean_token_accuracy": 0.685823604464531, | |
| "num_tokens": 2661638.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 1.1764290630817413, | |
| "epoch": 0.126012601260126, | |
| "grad_norm": 0.2998030483722687, | |
| "learning_rate": 0.00019572031077475367, | |
| "loss": 1.0975, | |
| "mean_token_accuracy": 0.6871052384376526, | |
| "num_tokens": 2670313.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 1.2649544775485992, | |
| "epoch": 0.1264126412641264, | |
| "grad_norm": 0.31360095739364624, | |
| "learning_rate": 0.0001956847121220231, | |
| "loss": 1.2167, | |
| "mean_token_accuracy": 0.660548061132431, | |
| "num_tokens": 2678587.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 1.1531548500061035, | |
| "epoch": 0.1268126812681268, | |
| "grad_norm": 0.3179381787776947, | |
| "learning_rate": 0.0001956489696624533, | |
| "loss": 1.1596, | |
| "mean_token_accuracy": 0.6832859367132187, | |
| "num_tokens": 2686845.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 1.1491257846355438, | |
| "epoch": 0.1272127212721272, | |
| "grad_norm": 0.3010673224925995, | |
| "learning_rate": 0.00019561308345603188, | |
| "loss": 1.1856, | |
| "mean_token_accuracy": 0.6756436675786972, | |
| "num_tokens": 2695519.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 1.099882572889328, | |
| "epoch": 0.1276127612761276, | |
| "grad_norm": 0.3057318925857544, | |
| "learning_rate": 0.0001955770535629875, | |
| "loss": 1.1369, | |
| "mean_token_accuracy": 0.6802153438329697, | |
| "num_tokens": 2704317.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 1.1104555130004883, | |
| "epoch": 0.128012801280128, | |
| "grad_norm": 0.30537816882133484, | |
| "learning_rate": 0.00019554088004379, | |
| "loss": 1.0916, | |
| "mean_token_accuracy": 0.6971182078123093, | |
| "num_tokens": 2712576.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.1894198954105377, | |
| "epoch": 0.12841284128412842, | |
| "grad_norm": 0.2941950261592865, | |
| "learning_rate": 0.00019550456295915042, | |
| "loss": 1.1728, | |
| "mean_token_accuracy": 0.6762441992759705, | |
| "num_tokens": 2721000.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 1.1880941092967987, | |
| "epoch": 0.12881288128812882, | |
| "grad_norm": 0.3045370280742645, | |
| "learning_rate": 0.00019546810237002066, | |
| "loss": 1.1695, | |
| "mean_token_accuracy": 0.6775896400213242, | |
| "num_tokens": 2729281.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 1.1603459417819977, | |
| "epoch": 0.12921292129212922, | |
| "grad_norm": 0.29477667808532715, | |
| "learning_rate": 0.00019543149833759334, | |
| "loss": 1.13, | |
| "mean_token_accuracy": 0.6883135735988617, | |
| "num_tokens": 2737775.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 1.148952156305313, | |
| "epoch": 0.12961296129612962, | |
| "grad_norm": 0.2921348214149475, | |
| "learning_rate": 0.000195394750923302, | |
| "loss": 1.1492, | |
| "mean_token_accuracy": 0.6808929741382599, | |
| "num_tokens": 2746681.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 1.2179997265338898, | |
| "epoch": 0.13001300130013002, | |
| "grad_norm": 0.3009890019893646, | |
| "learning_rate": 0.0001953578601888208, | |
| "loss": 1.2338, | |
| "mean_token_accuracy": 0.6610979735851288, | |
| "num_tokens": 2755045.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 1.2134989798069, | |
| "epoch": 0.13041304130413042, | |
| "grad_norm": 0.3033868968486786, | |
| "learning_rate": 0.00019532082619606436, | |
| "loss": 1.2165, | |
| "mean_token_accuracy": 0.6606318801641464, | |
| "num_tokens": 2763287.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 1.0881072580814362, | |
| "epoch": 0.13081308130813082, | |
| "grad_norm": 0.2861042022705078, | |
| "learning_rate": 0.0001952836490071878, | |
| "loss": 1.0643, | |
| "mean_token_accuracy": 0.6997469067573547, | |
| "num_tokens": 2772109.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 1.2652019262313843, | |
| "epoch": 0.13121312131213123, | |
| "grad_norm": 0.3063291311264038, | |
| "learning_rate": 0.00019524632868458649, | |
| "loss": 1.2374, | |
| "mean_token_accuracy": 0.6631722450256348, | |
| "num_tokens": 2780001.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 1.1232223510742188, | |
| "epoch": 0.1316131613161316, | |
| "grad_norm": 0.2938007712364197, | |
| "learning_rate": 0.00019520886529089616, | |
| "loss": 1.1047, | |
| "mean_token_accuracy": 0.6943131983280182, | |
| "num_tokens": 2788572.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 1.182855635881424, | |
| "epoch": 0.132013201320132, | |
| "grad_norm": 0.2949009835720062, | |
| "learning_rate": 0.00019517125888899255, | |
| "loss": 1.1657, | |
| "mean_token_accuracy": 0.6759148836135864, | |
| "num_tokens": 2797349.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.1421308815479279, | |
| "epoch": 0.1324132413241324, | |
| "grad_norm": 0.3349224328994751, | |
| "learning_rate": 0.00019513350954199142, | |
| "loss": 1.1379, | |
| "mean_token_accuracy": 0.6823170036077499, | |
| "num_tokens": 2805345.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 1.0656911730766296, | |
| "epoch": 0.1328132813281328, | |
| "grad_norm": 0.3012828230857849, | |
| "learning_rate": 0.00019509561731324848, | |
| "loss": 1.0942, | |
| "mean_token_accuracy": 0.6952732652425766, | |
| "num_tokens": 2814123.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 1.0468103885650635, | |
| "epoch": 0.1332133213321332, | |
| "grad_norm": 0.30162152647972107, | |
| "learning_rate": 0.0001950575822663592, | |
| "loss": 1.1012, | |
| "mean_token_accuracy": 0.6894596368074417, | |
| "num_tokens": 2823120.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 1.089416727423668, | |
| "epoch": 0.1336133613361336, | |
| "grad_norm": 0.3064773976802826, | |
| "learning_rate": 0.00019501940446515882, | |
| "loss": 1.1036, | |
| "mean_token_accuracy": 0.6885414123535156, | |
| "num_tokens": 2831735.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 1.1649364531040192, | |
| "epoch": 0.134013401340134, | |
| "grad_norm": 0.35003024339675903, | |
| "learning_rate": 0.00019498108397372212, | |
| "loss": 1.1766, | |
| "mean_token_accuracy": 0.6764324754476547, | |
| "num_tokens": 2839670.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 1.1590066254138947, | |
| "epoch": 0.1344134413441344, | |
| "grad_norm": 0.26645922660827637, | |
| "learning_rate": 0.0001949426208563633, | |
| "loss": 1.1091, | |
| "mean_token_accuracy": 0.6905470341444016, | |
| "num_tokens": 2848911.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 1.251402735710144, | |
| "epoch": 0.1348134813481348, | |
| "grad_norm": 0.31132251024246216, | |
| "learning_rate": 0.000194904015177636, | |
| "loss": 1.1918, | |
| "mean_token_accuracy": 0.6727328300476074, | |
| "num_tokens": 2857199.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 1.220662236213684, | |
| "epoch": 0.1352135213521352, | |
| "grad_norm": 0.3061762750148773, | |
| "learning_rate": 0.00019486526700233315, | |
| "loss": 1.1868, | |
| "mean_token_accuracy": 0.672507032752037, | |
| "num_tokens": 2865223.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 1.0638089627027512, | |
| "epoch": 0.13561356135613561, | |
| "grad_norm": 0.29525840282440186, | |
| "learning_rate": 0.00019482637639548682, | |
| "loss": 1.0514, | |
| "mean_token_accuracy": 0.7034783512353897, | |
| "num_tokens": 2873440.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 1.1221419274806976, | |
| "epoch": 0.13601360136013602, | |
| "grad_norm": 0.2899990379810333, | |
| "learning_rate": 0.00019478734342236808, | |
| "loss": 1.1505, | |
| "mean_token_accuracy": 0.675692155957222, | |
| "num_tokens": 2882408.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.145202785730362, | |
| "epoch": 0.13641364136413642, | |
| "grad_norm": 0.2904442250728607, | |
| "learning_rate": 0.0001947481681484869, | |
| "loss": 1.1848, | |
| "mean_token_accuracy": 0.6750968992710114, | |
| "num_tokens": 2891461.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 1.081279844045639, | |
| "epoch": 0.13681368136813682, | |
| "grad_norm": 0.30348628759384155, | |
| "learning_rate": 0.00019470885063959225, | |
| "loss": 1.0734, | |
| "mean_token_accuracy": 0.6975607126951218, | |
| "num_tokens": 2900223.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 1.0558022856712341, | |
| "epoch": 0.13721372137213722, | |
| "grad_norm": 0.28773176670074463, | |
| "learning_rate": 0.00019466939096167164, | |
| "loss": 1.0604, | |
| "mean_token_accuracy": 0.6948001831769943, | |
| "num_tokens": 2909084.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 1.1171001195907593, | |
| "epoch": 0.13761376137613762, | |
| "grad_norm": 0.29017966985702515, | |
| "learning_rate": 0.00019462978918095128, | |
| "loss": 1.1181, | |
| "mean_token_accuracy": 0.68596550822258, | |
| "num_tokens": 2917795.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 1.1633701920509338, | |
| "epoch": 0.13801380138013802, | |
| "grad_norm": 0.28877806663513184, | |
| "learning_rate": 0.00019459004536389587, | |
| "loss": 1.1716, | |
| "mean_token_accuracy": 0.6693498939275742, | |
| "num_tokens": 2925764.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 1.2091334760189056, | |
| "epoch": 0.13841384138413843, | |
| "grad_norm": 0.3057492971420288, | |
| "learning_rate": 0.00019455015957720842, | |
| "loss": 1.2115, | |
| "mean_token_accuracy": 0.6683546006679535, | |
| "num_tokens": 2934337.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 1.117457777261734, | |
| "epoch": 0.13881388138813883, | |
| "grad_norm": 0.3619987964630127, | |
| "learning_rate": 0.0001945101318878303, | |
| "loss": 1.0944, | |
| "mean_token_accuracy": 0.6917587071657181, | |
| "num_tokens": 2942882.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 1.1964794397354126, | |
| "epoch": 0.1392139213921392, | |
| "grad_norm": 0.29087069630622864, | |
| "learning_rate": 0.000194469962362941, | |
| "loss": 1.1536, | |
| "mean_token_accuracy": 0.6789288967847824, | |
| "num_tokens": 2951358.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 1.1352568864822388, | |
| "epoch": 0.1396139613961396, | |
| "grad_norm": 0.30058935284614563, | |
| "learning_rate": 0.00019442965106995807, | |
| "loss": 1.1042, | |
| "mean_token_accuracy": 0.6969415545463562, | |
| "num_tokens": 2959902.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 1.1815881133079529, | |
| "epoch": 0.14001400140014, | |
| "grad_norm": 0.29818278551101685, | |
| "learning_rate": 0.00019438919807653694, | |
| "loss": 1.1937, | |
| "mean_token_accuracy": 0.6777724772691727, | |
| "num_tokens": 2968375.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.1138464957475662, | |
| "epoch": 0.1404140414041404, | |
| "grad_norm": 0.29378682374954224, | |
| "learning_rate": 0.00019434860345057096, | |
| "loss": 1.136, | |
| "mean_token_accuracy": 0.6846367418766022, | |
| "num_tokens": 2976891.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 1.1382241249084473, | |
| "epoch": 0.1408140814081408, | |
| "grad_norm": 0.298759788274765, | |
| "learning_rate": 0.00019430786726019102, | |
| "loss": 1.1675, | |
| "mean_token_accuracy": 0.6828837245702744, | |
| "num_tokens": 2984891.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 1.2404142022132874, | |
| "epoch": 0.1412141214121412, | |
| "grad_norm": 0.3150947093963623, | |
| "learning_rate": 0.00019426698957376585, | |
| "loss": 1.2342, | |
| "mean_token_accuracy": 0.6579574644565582, | |
| "num_tokens": 2993072.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 1.1687238216400146, | |
| "epoch": 0.1416141614161416, | |
| "grad_norm": 0.29389873147010803, | |
| "learning_rate": 0.00019422597045990142, | |
| "loss": 1.1767, | |
| "mean_token_accuracy": 0.6675811409950256, | |
| "num_tokens": 3001760.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 1.1566392183303833, | |
| "epoch": 0.142014201420142, | |
| "grad_norm": 0.288309246301651, | |
| "learning_rate": 0.00019418480998744118, | |
| "loss": 1.1291, | |
| "mean_token_accuracy": 0.6857695430517197, | |
| "num_tokens": 3010111.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 1.1949766874313354, | |
| "epoch": 0.1424142414241424, | |
| "grad_norm": 0.29533353447914124, | |
| "learning_rate": 0.00019414350822546584, | |
| "loss": 1.1664, | |
| "mean_token_accuracy": 0.6795456558465958, | |
| "num_tokens": 3018712.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 1.1488195657730103, | |
| "epoch": 0.14281428142814281, | |
| "grad_norm": 0.3124019205570221, | |
| "learning_rate": 0.00019410206524329314, | |
| "loss": 1.129, | |
| "mean_token_accuracy": 0.6900259405374527, | |
| "num_tokens": 3026707.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 1.1078391075134277, | |
| "epoch": 0.14321432143214322, | |
| "grad_norm": 0.4887332618236542, | |
| "learning_rate": 0.00019406048111047792, | |
| "loss": 1.1122, | |
| "mean_token_accuracy": 0.6845664978027344, | |
| "num_tokens": 3035277.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 1.1673301458358765, | |
| "epoch": 0.14361436143614362, | |
| "grad_norm": 0.30997899174690247, | |
| "learning_rate": 0.0001940187558968119, | |
| "loss": 1.1427, | |
| "mean_token_accuracy": 0.6802043169736862, | |
| "num_tokens": 3043456.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 1.1499980092048645, | |
| "epoch": 0.14401440144014402, | |
| "grad_norm": 0.3066644072532654, | |
| "learning_rate": 0.00019397688967232352, | |
| "loss": 1.1497, | |
| "mean_token_accuracy": 0.6805084347724915, | |
| "num_tokens": 3051649.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.131559580564499, | |
| "epoch": 0.14441444144414442, | |
| "grad_norm": 0.296249657869339, | |
| "learning_rate": 0.000193934882507278, | |
| "loss": 1.1349, | |
| "mean_token_accuracy": 0.6809341907501221, | |
| "num_tokens": 3060190.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 1.1443010866641998, | |
| "epoch": 0.14481448144814482, | |
| "grad_norm": 0.31838539242744446, | |
| "learning_rate": 0.00019389273447217704, | |
| "loss": 1.1696, | |
| "mean_token_accuracy": 0.6759007275104523, | |
| "num_tokens": 3068580.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 1.133973866701126, | |
| "epoch": 0.14521452145214522, | |
| "grad_norm": 0.2861894965171814, | |
| "learning_rate": 0.0001938504456377587, | |
| "loss": 1.1291, | |
| "mean_token_accuracy": 0.6851497888565063, | |
| "num_tokens": 3077427.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 1.136247158050537, | |
| "epoch": 0.14561456145614562, | |
| "grad_norm": 0.2967614531517029, | |
| "learning_rate": 0.00019380801607499746, | |
| "loss": 1.0995, | |
| "mean_token_accuracy": 0.6911982148885727, | |
| "num_tokens": 3085196.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 1.184772402048111, | |
| "epoch": 0.14601460146014603, | |
| "grad_norm": 0.3119775354862213, | |
| "learning_rate": 0.00019376544585510393, | |
| "loss": 1.2257, | |
| "mean_token_accuracy": 0.666557103395462, | |
| "num_tokens": 3093621.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 1.1576828956604004, | |
| "epoch": 0.14641464146414643, | |
| "grad_norm": 0.3863295018672943, | |
| "learning_rate": 0.0001937227350495248, | |
| "loss": 1.1722, | |
| "mean_token_accuracy": 0.6755800992250443, | |
| "num_tokens": 3102047.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 1.0888293087482452, | |
| "epoch": 0.1468146814681468, | |
| "grad_norm": 0.2931033670902252, | |
| "learning_rate": 0.00019367988372994265, | |
| "loss": 1.0546, | |
| "mean_token_accuracy": 0.6972876787185669, | |
| "num_tokens": 3110407.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 1.171687364578247, | |
| "epoch": 0.1472147214721472, | |
| "grad_norm": 0.43645840883255005, | |
| "learning_rate": 0.000193636891968276, | |
| "loss": 1.1192, | |
| "mean_token_accuracy": 0.6813657730817795, | |
| "num_tokens": 3118726.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 1.1906355917453766, | |
| "epoch": 0.1476147614761476, | |
| "grad_norm": 0.30559539794921875, | |
| "learning_rate": 0.00019359375983667902, | |
| "loss": 1.1854, | |
| "mean_token_accuracy": 0.6698572039604187, | |
| "num_tokens": 3126856.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 1.1418620645999908, | |
| "epoch": 0.148014801480148, | |
| "grad_norm": 0.31266874074935913, | |
| "learning_rate": 0.00019355048740754145, | |
| "loss": 1.1375, | |
| "mean_token_accuracy": 0.678287535905838, | |
| "num_tokens": 3135201.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.1904971301555634, | |
| "epoch": 0.1484148414841484, | |
| "grad_norm": 0.3213047981262207, | |
| "learning_rate": 0.00019350707475348852, | |
| "loss": 1.1842, | |
| "mean_token_accuracy": 0.6759228259325027, | |
| "num_tokens": 3143256.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 1.1902599036693573, | |
| "epoch": 0.1488148814881488, | |
| "grad_norm": 0.5613988041877747, | |
| "learning_rate": 0.00019346352194738077, | |
| "loss": 1.2442, | |
| "mean_token_accuracy": 0.6619480550289154, | |
| "num_tokens": 3150704.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 1.0474575012922287, | |
| "epoch": 0.1492149214921492, | |
| "grad_norm": 0.2898733615875244, | |
| "learning_rate": 0.00019341982906231407, | |
| "loss": 1.0636, | |
| "mean_token_accuracy": 0.6995494663715363, | |
| "num_tokens": 3159711.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 1.226840317249298, | |
| "epoch": 0.1496149614961496, | |
| "grad_norm": 0.314718633890152, | |
| "learning_rate": 0.0001933759961716192, | |
| "loss": 1.1882, | |
| "mean_token_accuracy": 0.6709526926279068, | |
| "num_tokens": 3167294.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 1.1560609936714172, | |
| "epoch": 0.15001500150015, | |
| "grad_norm": 0.29525458812713623, | |
| "learning_rate": 0.00019333202334886207, | |
| "loss": 1.1088, | |
| "mean_token_accuracy": 0.6907341927289963, | |
| "num_tokens": 3175676.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 1.1789807677268982, | |
| "epoch": 0.15041504150415042, | |
| "grad_norm": 0.2906891405582428, | |
| "learning_rate": 0.0001932879106678434, | |
| "loss": 1.1488, | |
| "mean_token_accuracy": 0.6830808073282242, | |
| "num_tokens": 3184781.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 1.2095182836055756, | |
| "epoch": 0.15081508150815082, | |
| "grad_norm": 0.29173582792282104, | |
| "learning_rate": 0.00019324365820259858, | |
| "loss": 1.1471, | |
| "mean_token_accuracy": 0.6814120411872864, | |
| "num_tokens": 3193359.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 1.1557953655719757, | |
| "epoch": 0.15121512151215122, | |
| "grad_norm": 0.30150917172431946, | |
| "learning_rate": 0.0001931992660273977, | |
| "loss": 1.1842, | |
| "mean_token_accuracy": 0.6736668199300766, | |
| "num_tokens": 3201977.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 1.1141368001699448, | |
| "epoch": 0.15161516151615162, | |
| "grad_norm": 0.3033373951911926, | |
| "learning_rate": 0.00019315473421674525, | |
| "loss": 1.1433, | |
| "mean_token_accuracy": 0.6801392734050751, | |
| "num_tokens": 3210612.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 1.0636587738990784, | |
| "epoch": 0.15201520152015202, | |
| "grad_norm": 0.2994931936264038, | |
| "learning_rate": 0.00019311006284538013, | |
| "loss": 1.0722, | |
| "mean_token_accuracy": 0.6968654096126556, | |
| "num_tokens": 3219123.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.2064105868339539, | |
| "epoch": 0.15241524152415242, | |
| "grad_norm": 0.3521154820919037, | |
| "learning_rate": 0.00019306525198827548, | |
| "loss": 1.2385, | |
| "mean_token_accuracy": 0.6615314930677414, | |
| "num_tokens": 3227445.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 1.127672255039215, | |
| "epoch": 0.15281528152815282, | |
| "grad_norm": 0.2892846465110779, | |
| "learning_rate": 0.00019302030172063837, | |
| "loss": 1.1389, | |
| "mean_token_accuracy": 0.6847521215677261, | |
| "num_tokens": 3236240.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 1.1575649082660675, | |
| "epoch": 0.15321532153215323, | |
| "grad_norm": 0.31099551916122437, | |
| "learning_rate": 0.0001929752121179101, | |
| "loss": 1.1524, | |
| "mean_token_accuracy": 0.6786007881164551, | |
| "num_tokens": 3244515.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 1.1269442737102509, | |
| "epoch": 0.15361536153615363, | |
| "grad_norm": 0.2906751036643982, | |
| "learning_rate": 0.0001929299832557657, | |
| "loss": 1.0972, | |
| "mean_token_accuracy": 0.6957235038280487, | |
| "num_tokens": 3253311.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 1.2260091006755829, | |
| "epoch": 0.15401540154015403, | |
| "grad_norm": 0.2963874638080597, | |
| "learning_rate": 0.00019288461521011388, | |
| "loss": 1.1781, | |
| "mean_token_accuracy": 0.6785955429077148, | |
| "num_tokens": 3261634.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 1.1854043006896973, | |
| "epoch": 0.1544154415441544, | |
| "grad_norm": 0.30083367228507996, | |
| "learning_rate": 0.00019283910805709698, | |
| "loss": 1.1677, | |
| "mean_token_accuracy": 0.6692470908164978, | |
| "num_tokens": 3270087.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 1.2266800105571747, | |
| "epoch": 0.1548154815481548, | |
| "grad_norm": 0.3198303282260895, | |
| "learning_rate": 0.00019279346187309085, | |
| "loss": 1.2064, | |
| "mean_token_accuracy": 0.6682067066431046, | |
| "num_tokens": 3278271.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 1.1660953760147095, | |
| "epoch": 0.1552155215521552, | |
| "grad_norm": 0.33573225140571594, | |
| "learning_rate": 0.00019274767673470463, | |
| "loss": 1.1942, | |
| "mean_token_accuracy": 0.6672907918691635, | |
| "num_tokens": 3286608.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 1.0843549370765686, | |
| "epoch": 0.1556155615561556, | |
| "grad_norm": 0.30995887517929077, | |
| "learning_rate": 0.00019270175271878068, | |
| "loss": 1.0992, | |
| "mean_token_accuracy": 0.6958242803812027, | |
| "num_tokens": 3295009.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 1.128290981054306, | |
| "epoch": 0.156015601560156, | |
| "grad_norm": 0.3144836127758026, | |
| "learning_rate": 0.00019265568990239445, | |
| "loss": 1.137, | |
| "mean_token_accuracy": 0.6823694556951523, | |
| "num_tokens": 3303299.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.195746123790741, | |
| "epoch": 0.1564156415641564, | |
| "grad_norm": 0.30768823623657227, | |
| "learning_rate": 0.00019260948836285439, | |
| "loss": 1.1869, | |
| "mean_token_accuracy": 0.6803343147039413, | |
| "num_tokens": 3311591.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 1.1737743616104126, | |
| "epoch": 0.1568156815681568, | |
| "grad_norm": 0.29867610335350037, | |
| "learning_rate": 0.00019256314817770164, | |
| "loss": 1.1703, | |
| "mean_token_accuracy": 0.6784539520740509, | |
| "num_tokens": 3320022.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 1.2264443039894104, | |
| "epoch": 0.1572157215721572, | |
| "grad_norm": 0.30367588996887207, | |
| "learning_rate": 0.00019251666942471016, | |
| "loss": 1.1963, | |
| "mean_token_accuracy": 0.6694721430540085, | |
| "num_tokens": 3328671.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 1.1673425137996674, | |
| "epoch": 0.15761576157615761, | |
| "grad_norm": 0.312225341796875, | |
| "learning_rate": 0.00019247005218188645, | |
| "loss": 1.1641, | |
| "mean_token_accuracy": 0.6831966638565063, | |
| "num_tokens": 3336686.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 1.1570010483264923, | |
| "epoch": 0.15801580158015802, | |
| "grad_norm": 0.325536847114563, | |
| "learning_rate": 0.00019242329652746938, | |
| "loss": 1.1245, | |
| "mean_token_accuracy": 0.6909505128860474, | |
| "num_tokens": 3344988.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 1.118729829788208, | |
| "epoch": 0.15841584158415842, | |
| "grad_norm": 0.31520524621009827, | |
| "learning_rate": 0.00019237640253993017, | |
| "loss": 1.1096, | |
| "mean_token_accuracy": 0.686091959476471, | |
| "num_tokens": 3353202.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 1.1297271251678467, | |
| "epoch": 0.15881588158815882, | |
| "grad_norm": 0.31851935386657715, | |
| "learning_rate": 0.00019232937029797217, | |
| "loss": 1.1385, | |
| "mean_token_accuracy": 0.6839326471090317, | |
| "num_tokens": 3362000.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 1.111870676279068, | |
| "epoch": 0.15921592159215922, | |
| "grad_norm": 0.29706814885139465, | |
| "learning_rate": 0.00019228219988053085, | |
| "loss": 1.132, | |
| "mean_token_accuracy": 0.6736722886562347, | |
| "num_tokens": 3370452.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 1.0942797362804413, | |
| "epoch": 0.15961596159615962, | |
| "grad_norm": 0.3211657702922821, | |
| "learning_rate": 0.00019223489136677347, | |
| "loss": 1.1642, | |
| "mean_token_accuracy": 0.6759698241949081, | |
| "num_tokens": 3378774.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 1.1003531515598297, | |
| "epoch": 0.16001600160016002, | |
| "grad_norm": 0.2938557267189026, | |
| "learning_rate": 0.00019218744483609918, | |
| "loss": 1.0841, | |
| "mean_token_accuracy": 0.689574733376503, | |
| "num_tokens": 3387752.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.1808100640773773, | |
| "epoch": 0.16041604160416043, | |
| "grad_norm": 0.3016187250614166, | |
| "learning_rate": 0.00019213986036813863, | |
| "loss": 1.1379, | |
| "mean_token_accuracy": 0.6819901168346405, | |
| "num_tokens": 3395722.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 1.1858965158462524, | |
| "epoch": 0.16081608160816083, | |
| "grad_norm": 0.2888219952583313, | |
| "learning_rate": 0.00019209213804275408, | |
| "loss": 1.1126, | |
| "mean_token_accuracy": 0.6891250312328339, | |
| "num_tokens": 3404658.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 1.1066676825284958, | |
| "epoch": 0.16121612161216123, | |
| "grad_norm": 0.2900371551513672, | |
| "learning_rate": 0.00019204427794003911, | |
| "loss": 1.0613, | |
| "mean_token_accuracy": 0.6994702219963074, | |
| "num_tokens": 3413044.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 1.0648207068443298, | |
| "epoch": 0.16161616161616163, | |
| "grad_norm": 0.2870444357395172, | |
| "learning_rate": 0.00019199628014031857, | |
| "loss": 1.0816, | |
| "mean_token_accuracy": 0.6926587671041489, | |
| "num_tokens": 3421932.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 1.1214756965637207, | |
| "epoch": 0.162016201620162, | |
| "grad_norm": 0.3146369755268097, | |
| "learning_rate": 0.00019194814472414844, | |
| "loss": 1.1529, | |
| "mean_token_accuracy": 0.679986834526062, | |
| "num_tokens": 3429660.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 1.0432531386613846, | |
| "epoch": 0.1624162416241624, | |
| "grad_norm": 0.3081408441066742, | |
| "learning_rate": 0.00019189987177231554, | |
| "loss": 1.0802, | |
| "mean_token_accuracy": 0.6946697533130646, | |
| "num_tokens": 3437779.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 1.1035350263118744, | |
| "epoch": 0.1628162816281628, | |
| "grad_norm": 0.3021145761013031, | |
| "learning_rate": 0.00019185146136583761, | |
| "loss": 1.1354, | |
| "mean_token_accuracy": 0.6885717958211899, | |
| "num_tokens": 3446116.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 1.1501671075820923, | |
| "epoch": 0.1632163216321632, | |
| "grad_norm": 0.41734570264816284, | |
| "learning_rate": 0.00019180291358596312, | |
| "loss": 1.1233, | |
| "mean_token_accuracy": 0.6793646067380905, | |
| "num_tokens": 3454845.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 1.1991091966629028, | |
| "epoch": 0.1636163616361636, | |
| "grad_norm": 0.29790523648262024, | |
| "learning_rate": 0.00019175422851417103, | |
| "loss": 1.1549, | |
| "mean_token_accuracy": 0.6777328252792358, | |
| "num_tokens": 3463400.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 1.1822619140148163, | |
| "epoch": 0.164016401640164, | |
| "grad_norm": 0.31777262687683105, | |
| "learning_rate": 0.00019170540623217065, | |
| "loss": 1.1476, | |
| "mean_token_accuracy": 0.6912225484848022, | |
| "num_tokens": 3471177.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.1974277198314667, | |
| "epoch": 0.1644164416441644, | |
| "grad_norm": 0.30301401019096375, | |
| "learning_rate": 0.00019165644682190178, | |
| "loss": 1.1863, | |
| "mean_token_accuracy": 0.6698818802833557, | |
| "num_tokens": 3479462.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 1.1671889424324036, | |
| "epoch": 0.16481648164816481, | |
| "grad_norm": 0.3080313801765442, | |
| "learning_rate": 0.0001916073503655342, | |
| "loss": 1.1485, | |
| "mean_token_accuracy": 0.6848516017198563, | |
| "num_tokens": 3487668.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 1.1198955476284027, | |
| "epoch": 0.16521652165216522, | |
| "grad_norm": 0.282215416431427, | |
| "learning_rate": 0.00019155811694546773, | |
| "loss": 1.117, | |
| "mean_token_accuracy": 0.6849533915519714, | |
| "num_tokens": 3496407.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 1.1208362877368927, | |
| "epoch": 0.16561656165616562, | |
| "grad_norm": 0.2846994996070862, | |
| "learning_rate": 0.0001915087466443321, | |
| "loss": 1.1486, | |
| "mean_token_accuracy": 0.6762874126434326, | |
| "num_tokens": 3505305.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 1.1050612926483154, | |
| "epoch": 0.16601660166016602, | |
| "grad_norm": 0.2926284670829773, | |
| "learning_rate": 0.00019145923954498674, | |
| "loss": 1.1086, | |
| "mean_token_accuracy": 0.6887543201446533, | |
| "num_tokens": 3513791.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 1.1567849516868591, | |
| "epoch": 0.16641664166416642, | |
| "grad_norm": 0.3551363945007324, | |
| "learning_rate": 0.00019140959573052068, | |
| "loss": 1.1884, | |
| "mean_token_accuracy": 0.6731236577033997, | |
| "num_tokens": 3522187.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 1.0714478492736816, | |
| "epoch": 0.16681668166816682, | |
| "grad_norm": 0.2826900780200958, | |
| "learning_rate": 0.00019135981528425238, | |
| "loss": 1.07, | |
| "mean_token_accuracy": 0.6979558169841766, | |
| "num_tokens": 3530921.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 1.1964420974254608, | |
| "epoch": 0.16721672167216722, | |
| "grad_norm": 0.283438116312027, | |
| "learning_rate": 0.0001913098982897297, | |
| "loss": 1.2064, | |
| "mean_token_accuracy": 0.6715447902679443, | |
| "num_tokens": 3539583.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 1.1429602801799774, | |
| "epoch": 0.16761676167616762, | |
| "grad_norm": 0.27956098318099976, | |
| "learning_rate": 0.0001912598448307295, | |
| "loss": 1.103, | |
| "mean_token_accuracy": 0.692705973982811, | |
| "num_tokens": 3548027.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 1.1086672246456146, | |
| "epoch": 0.16801680168016803, | |
| "grad_norm": 0.30192887783050537, | |
| "learning_rate": 0.0001912096549912579, | |
| "loss": 1.0665, | |
| "mean_token_accuracy": 0.6996335387229919, | |
| "num_tokens": 3556575.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.122267097234726, | |
| "epoch": 0.16841684168416843, | |
| "grad_norm": 0.28671419620513916, | |
| "learning_rate": 0.0001911593288555497, | |
| "loss": 1.0995, | |
| "mean_token_accuracy": 0.6916577368974686, | |
| "num_tokens": 3564842.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 1.1425860822200775, | |
| "epoch": 0.16881688168816883, | |
| "grad_norm": 0.31337839365005493, | |
| "learning_rate": 0.0001911088665080685, | |
| "loss": 1.1492, | |
| "mean_token_accuracy": 0.6899708062410355, | |
| "num_tokens": 3573378.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 1.1819129288196564, | |
| "epoch": 0.1692169216921692, | |
| "grad_norm": 0.3169664442539215, | |
| "learning_rate": 0.00019105826803350668, | |
| "loss": 1.2067, | |
| "mean_token_accuracy": 0.6600329726934433, | |
| "num_tokens": 3581995.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 1.1388654112815857, | |
| "epoch": 0.1696169616961696, | |
| "grad_norm": 0.3174993097782135, | |
| "learning_rate": 0.00019100753351678485, | |
| "loss": 1.1679, | |
| "mean_token_accuracy": 0.6717206537723541, | |
| "num_tokens": 3590053.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 1.0764131546020508, | |
| "epoch": 0.17001700170017, | |
| "grad_norm": 0.27433347702026367, | |
| "learning_rate": 0.0001909566630430521, | |
| "loss": 1.0583, | |
| "mean_token_accuracy": 0.698042631149292, | |
| "num_tokens": 3598969.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 1.1677474975585938, | |
| "epoch": 0.1704170417041704, | |
| "grad_norm": 0.28440240025520325, | |
| "learning_rate": 0.0001909056566976856, | |
| "loss": 1.1686, | |
| "mean_token_accuracy": 0.6792843639850616, | |
| "num_tokens": 3608017.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 1.0982355326414108, | |
| "epoch": 0.1708170817081708, | |
| "grad_norm": 0.281744122505188, | |
| "learning_rate": 0.00019085451456629063, | |
| "loss": 1.0735, | |
| "mean_token_accuracy": 0.6970892697572708, | |
| "num_tokens": 3616898.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 1.1331664025783539, | |
| "epoch": 0.1712171217121712, | |
| "grad_norm": 0.29245954751968384, | |
| "learning_rate": 0.00019080323673470028, | |
| "loss": 1.1029, | |
| "mean_token_accuracy": 0.6925027072429657, | |
| "num_tokens": 3625372.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 1.165515422821045, | |
| "epoch": 0.1716171617161716, | |
| "grad_norm": 0.314475953578949, | |
| "learning_rate": 0.00019075182328897553, | |
| "loss": 1.159, | |
| "mean_token_accuracy": 0.6840381771326065, | |
| "num_tokens": 3633550.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 1.2059255242347717, | |
| "epoch": 0.172017201720172, | |
| "grad_norm": 0.29410937428474426, | |
| "learning_rate": 0.00019070027431540484, | |
| "loss": 1.1995, | |
| "mean_token_accuracy": 0.667696550488472, | |
| "num_tokens": 3641944.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.160342425107956, | |
| "epoch": 0.17241724172417242, | |
| "grad_norm": 0.29798951745033264, | |
| "learning_rate": 0.00019064858990050412, | |
| "loss": 1.1249, | |
| "mean_token_accuracy": 0.6896940916776657, | |
| "num_tokens": 3650633.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 1.097832590341568, | |
| "epoch": 0.17281728172817282, | |
| "grad_norm": 0.3146847188472748, | |
| "learning_rate": 0.0001905967701310167, | |
| "loss": 1.084, | |
| "mean_token_accuracy": 0.6950473189353943, | |
| "num_tokens": 3659275.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 1.1250872611999512, | |
| "epoch": 0.17321732173217322, | |
| "grad_norm": 0.29490962624549866, | |
| "learning_rate": 0.00019054481509391303, | |
| "loss": 1.1453, | |
| "mean_token_accuracy": 0.6784237176179886, | |
| "num_tokens": 3667707.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 1.11842879652977, | |
| "epoch": 0.17361736173617362, | |
| "grad_norm": 0.3015720844268799, | |
| "learning_rate": 0.00019049272487639053, | |
| "loss": 1.1348, | |
| "mean_token_accuracy": 0.6827126741409302, | |
| "num_tokens": 3676215.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 1.1079545319080353, | |
| "epoch": 0.17401740174017402, | |
| "grad_norm": 0.2959752380847931, | |
| "learning_rate": 0.00019044049956587359, | |
| "loss": 1.1308, | |
| "mean_token_accuracy": 0.6799913793802261, | |
| "num_tokens": 3684832.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 1.0760809183120728, | |
| "epoch": 0.17441744174417442, | |
| "grad_norm": 0.28142601251602173, | |
| "learning_rate": 0.0001903881392500132, | |
| "loss": 1.057, | |
| "mean_token_accuracy": 0.7040259689092636, | |
| "num_tokens": 3693191.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 1.1367475986480713, | |
| "epoch": 0.17481748174817482, | |
| "grad_norm": 0.2840285301208496, | |
| "learning_rate": 0.00019033564401668712, | |
| "loss": 1.1166, | |
| "mean_token_accuracy": 0.6871612221002579, | |
| "num_tokens": 3701978.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 1.0345291048288345, | |
| "epoch": 0.17521752175217523, | |
| "grad_norm": 0.27927252650260925, | |
| "learning_rate": 0.00019028301395399935, | |
| "loss": 1.0161, | |
| "mean_token_accuracy": 0.7020839005708694, | |
| "num_tokens": 3711010.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 1.1218744814395905, | |
| "epoch": 0.17561756175617563, | |
| "grad_norm": 0.28972747921943665, | |
| "learning_rate": 0.00019023024915028035, | |
| "loss": 1.1142, | |
| "mean_token_accuracy": 0.6823008805513382, | |
| "num_tokens": 3719811.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 1.112653136253357, | |
| "epoch": 0.17601760176017603, | |
| "grad_norm": 0.2937675714492798, | |
| "learning_rate": 0.0001901773496940866, | |
| "loss": 1.099, | |
| "mean_token_accuracy": 0.6938609182834625, | |
| "num_tokens": 3728397.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.0891221165657043, | |
| "epoch": 0.17641764176417643, | |
| "grad_norm": 0.2878448963165283, | |
| "learning_rate": 0.00019012431567420058, | |
| "loss": 1.0985, | |
| "mean_token_accuracy": 0.6925668865442276, | |
| "num_tokens": 3737299.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 1.099565714597702, | |
| "epoch": 0.1768176817681768, | |
| "grad_norm": 0.307413786649704, | |
| "learning_rate": 0.00019007114717963067, | |
| "loss": 1.1189, | |
| "mean_token_accuracy": 0.6934941560029984, | |
| "num_tokens": 3746139.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 1.1932236850261688, | |
| "epoch": 0.1772177217721772, | |
| "grad_norm": 0.3038841485977173, | |
| "learning_rate": 0.00019001784429961086, | |
| "loss": 1.1788, | |
| "mean_token_accuracy": 0.6709124445915222, | |
| "num_tokens": 3754953.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 1.0702079832553864, | |
| "epoch": 0.1776177617761776, | |
| "grad_norm": 0.2820574939250946, | |
| "learning_rate": 0.0001899644071236008, | |
| "loss": 1.0416, | |
| "mean_token_accuracy": 0.7032249569892883, | |
| "num_tokens": 3763751.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 1.2229497730731964, | |
| "epoch": 0.178017801780178, | |
| "grad_norm": 0.3014878034591675, | |
| "learning_rate": 0.00018991083574128545, | |
| "loss": 1.2192, | |
| "mean_token_accuracy": 0.6651740819215775, | |
| "num_tokens": 3771604.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 1.1150319874286652, | |
| "epoch": 0.1784178417841784, | |
| "grad_norm": 0.2991960644721985, | |
| "learning_rate": 0.000189857130242575, | |
| "loss": 1.09, | |
| "mean_token_accuracy": 0.6914113610982895, | |
| "num_tokens": 3780403.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 1.1689063012599945, | |
| "epoch": 0.1788178817881788, | |
| "grad_norm": 0.2982667088508606, | |
| "learning_rate": 0.0001898032907176048, | |
| "loss": 1.1627, | |
| "mean_token_accuracy": 0.6814263015985489, | |
| "num_tokens": 3788759.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 1.1139529049396515, | |
| "epoch": 0.1792179217921792, | |
| "grad_norm": 0.29409554600715637, | |
| "learning_rate": 0.00018974931725673509, | |
| "loss": 1.1114, | |
| "mean_token_accuracy": 0.6805879026651382, | |
| "num_tokens": 3796931.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 1.1041430234909058, | |
| "epoch": 0.17961796179617961, | |
| "grad_norm": 0.2944853901863098, | |
| "learning_rate": 0.00018969520995055085, | |
| "loss": 1.1119, | |
| "mean_token_accuracy": 0.6940512806177139, | |
| "num_tokens": 3805323.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 1.1486750543117523, | |
| "epoch": 0.18001800180018002, | |
| "grad_norm": 0.302370548248291, | |
| "learning_rate": 0.00018964096888986182, | |
| "loss": 1.1553, | |
| "mean_token_accuracy": 0.6763848960399628, | |
| "num_tokens": 3813607.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.1423940062522888, | |
| "epoch": 0.18041804180418042, | |
| "grad_norm": 0.28140193223953247, | |
| "learning_rate": 0.00018958659416570212, | |
| "loss": 1.1566, | |
| "mean_token_accuracy": 0.6711086183786392, | |
| "num_tokens": 3822080.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 1.0220871269702911, | |
| "epoch": 0.18081808180818082, | |
| "grad_norm": 0.2903229892253876, | |
| "learning_rate": 0.00018953208586933027, | |
| "loss": 1.0243, | |
| "mean_token_accuracy": 0.7029541581869125, | |
| "num_tokens": 3830561.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 1.1911540031433105, | |
| "epoch": 0.18121812181218122, | |
| "grad_norm": 0.3021875321865082, | |
| "learning_rate": 0.0001894774440922289, | |
| "loss": 1.1799, | |
| "mean_token_accuracy": 0.6771095544099808, | |
| "num_tokens": 3838855.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 1.1234095692634583, | |
| "epoch": 0.18161816181618162, | |
| "grad_norm": 0.30030199885368347, | |
| "learning_rate": 0.00018942266892610474, | |
| "loss": 1.1306, | |
| "mean_token_accuracy": 0.688039630651474, | |
| "num_tokens": 3847225.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 1.2189615964889526, | |
| "epoch": 0.18201820182018202, | |
| "grad_norm": 0.2934826910495758, | |
| "learning_rate": 0.00018936776046288832, | |
| "loss": 1.192, | |
| "mean_token_accuracy": 0.6768446713685989, | |
| "num_tokens": 3855549.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 1.090735375881195, | |
| "epoch": 0.18241824182418243, | |
| "grad_norm": 0.2921765148639679, | |
| "learning_rate": 0.0001893127187947339, | |
| "loss": 1.0824, | |
| "mean_token_accuracy": 0.6897251307964325, | |
| "num_tokens": 3863912.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 1.0907158553600311, | |
| "epoch": 0.18281828182818283, | |
| "grad_norm": 0.28869226574897766, | |
| "learning_rate": 0.00018925754401401935, | |
| "loss": 1.1011, | |
| "mean_token_accuracy": 0.6976663619279861, | |
| "num_tokens": 3872222.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 1.0765265822410583, | |
| "epoch": 0.18321832183218323, | |
| "grad_norm": 0.27985134720802307, | |
| "learning_rate": 0.0001892022362133459, | |
| "loss": 1.0954, | |
| "mean_token_accuracy": 0.6934731006622314, | |
| "num_tokens": 3880811.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 1.1287130117416382, | |
| "epoch": 0.18361836183618363, | |
| "grad_norm": 0.2834780216217041, | |
| "learning_rate": 0.000189146795485538, | |
| "loss": 1.1133, | |
| "mean_token_accuracy": 0.6809262037277222, | |
| "num_tokens": 3889241.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 1.1771635711193085, | |
| "epoch": 0.18401840184018403, | |
| "grad_norm": 0.2930743992328644, | |
| "learning_rate": 0.00018909122192364334, | |
| "loss": 1.1473, | |
| "mean_token_accuracy": 0.6786583662033081, | |
| "num_tokens": 3897826.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.156456857919693, | |
| "epoch": 0.1844184418441844, | |
| "grad_norm": 0.31029045581817627, | |
| "learning_rate": 0.00018903551562093237, | |
| "loss": 1.1329, | |
| "mean_token_accuracy": 0.6835081726312637, | |
| "num_tokens": 3906455.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 1.197271704673767, | |
| "epoch": 0.1848184818481848, | |
| "grad_norm": 0.28894633054733276, | |
| "learning_rate": 0.00018897967667089839, | |
| "loss": 1.1518, | |
| "mean_token_accuracy": 0.6705130338668823, | |
| "num_tokens": 3914939.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 1.187122493982315, | |
| "epoch": 0.1852185218521852, | |
| "grad_norm": 0.2882704734802246, | |
| "learning_rate": 0.0001889237051672574, | |
| "loss": 1.172, | |
| "mean_token_accuracy": 0.6756406724452972, | |
| "num_tokens": 3923526.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 1.1045761406421661, | |
| "epoch": 0.1856185618561856, | |
| "grad_norm": 0.290786474943161, | |
| "learning_rate": 0.00018886760120394774, | |
| "loss": 1.1039, | |
| "mean_token_accuracy": 0.6829386353492737, | |
| "num_tokens": 3931690.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 1.0771204233169556, | |
| "epoch": 0.186018601860186, | |
| "grad_norm": 0.29037660360336304, | |
| "learning_rate": 0.00018881136487513016, | |
| "loss": 1.0961, | |
| "mean_token_accuracy": 0.6865667402744293, | |
| "num_tokens": 3940222.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 1.0926263481378555, | |
| "epoch": 0.1864186418641864, | |
| "grad_norm": 0.28368324041366577, | |
| "learning_rate": 0.0001887549962751875, | |
| "loss": 1.1276, | |
| "mean_token_accuracy": 0.6901869177818298, | |
| "num_tokens": 3948870.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 1.0631737411022186, | |
| "epoch": 0.18681868186818681, | |
| "grad_norm": 0.28324657678604126, | |
| "learning_rate": 0.00018869849549872465, | |
| "loss": 1.0782, | |
| "mean_token_accuracy": 0.6920218467712402, | |
| "num_tokens": 3957291.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 1.1629198789596558, | |
| "epoch": 0.18721872187218722, | |
| "grad_norm": 0.28869321942329407, | |
| "learning_rate": 0.00018864186264056827, | |
| "loss": 1.1439, | |
| "mean_token_accuracy": 0.6795201748609543, | |
| "num_tokens": 3966005.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 1.1176329255104065, | |
| "epoch": 0.18761876187618762, | |
| "grad_norm": 0.30285438895225525, | |
| "learning_rate": 0.00018858509779576678, | |
| "loss": 1.1113, | |
| "mean_token_accuracy": 0.6858499944210052, | |
| "num_tokens": 3974237.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 1.1664519608020782, | |
| "epoch": 0.18801880188018802, | |
| "grad_norm": 0.29232847690582275, | |
| "learning_rate": 0.00018852820105959002, | |
| "loss": 1.1352, | |
| "mean_token_accuracy": 0.6848191022872925, | |
| "num_tokens": 3982719.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.0966509878635406, | |
| "epoch": 0.18841884188418842, | |
| "grad_norm": 0.28050824999809265, | |
| "learning_rate": 0.00018847117252752924, | |
| "loss": 1.103, | |
| "mean_token_accuracy": 0.6891407370567322, | |
| "num_tokens": 3991387.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 1.0832321643829346, | |
| "epoch": 0.18881888188818882, | |
| "grad_norm": 0.30679091811180115, | |
| "learning_rate": 0.00018841401229529692, | |
| "loss": 1.0987, | |
| "mean_token_accuracy": 0.6983061581850052, | |
| "num_tokens": 3999901.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 1.1181371808052063, | |
| "epoch": 0.18921892189218922, | |
| "grad_norm": 0.29978105425834656, | |
| "learning_rate": 0.00018835672045882648, | |
| "loss": 1.1526, | |
| "mean_token_accuracy": 0.6812323331832886, | |
| "num_tokens": 4008189.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 1.094124659895897, | |
| "epoch": 0.18961896189618963, | |
| "grad_norm": 0.2761591672897339, | |
| "learning_rate": 0.00018829929711427232, | |
| "loss": 1.088, | |
| "mean_token_accuracy": 0.6916481256484985, | |
| "num_tokens": 4017035.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 1.174016386270523, | |
| "epoch": 0.19001900190019003, | |
| "grad_norm": 0.2957269549369812, | |
| "learning_rate": 0.0001882417423580095, | |
| "loss": 1.15, | |
| "mean_token_accuracy": 0.687277153134346, | |
| "num_tokens": 4025132.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 1.141076147556305, | |
| "epoch": 0.19041904190419043, | |
| "grad_norm": 0.29672884941101074, | |
| "learning_rate": 0.0001881840562866336, | |
| "loss": 1.0997, | |
| "mean_token_accuracy": 0.6899784505367279, | |
| "num_tokens": 4033594.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 1.103248655796051, | |
| "epoch": 0.19081908190819083, | |
| "grad_norm": 0.2912473976612091, | |
| "learning_rate": 0.00018812623899696067, | |
| "loss": 1.0915, | |
| "mean_token_accuracy": 0.6886222809553146, | |
| "num_tokens": 4042053.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 1.170788824558258, | |
| "epoch": 0.19121912191219123, | |
| "grad_norm": 0.2797233462333679, | |
| "learning_rate": 0.0001880682905860269, | |
| "loss": 1.1159, | |
| "mean_token_accuracy": 0.6844299733638763, | |
| "num_tokens": 4050555.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 1.160698264837265, | |
| "epoch": 0.19161916191619163, | |
| "grad_norm": 0.2921246886253357, | |
| "learning_rate": 0.00018801021115108862, | |
| "loss": 1.1606, | |
| "mean_token_accuracy": 0.6748001426458359, | |
| "num_tokens": 4059040.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 1.0824988782405853, | |
| "epoch": 0.192019201920192, | |
| "grad_norm": 0.29058167338371277, | |
| "learning_rate": 0.000187952000789622, | |
| "loss": 1.1117, | |
| "mean_token_accuracy": 0.6949323862791061, | |
| "num_tokens": 4067919.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.1407755315303802, | |
| "epoch": 0.1924192419241924, | |
| "grad_norm": 0.3058508634567261, | |
| "learning_rate": 0.00018789365959932303, | |
| "loss": 1.1914, | |
| "mean_token_accuracy": 0.6748262792825699, | |
| "num_tokens": 4076495.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 1.1213767230510712, | |
| "epoch": 0.1928192819281928, | |
| "grad_norm": 0.2868844270706177, | |
| "learning_rate": 0.00018783518767810715, | |
| "loss": 1.117, | |
| "mean_token_accuracy": 0.6884360611438751, | |
| "num_tokens": 4084846.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 1.1594094932079315, | |
| "epoch": 0.1932193219321932, | |
| "grad_norm": 0.29103291034698486, | |
| "learning_rate": 0.0001877765851241093, | |
| "loss": 1.1595, | |
| "mean_token_accuracy": 0.6784193813800812, | |
| "num_tokens": 4093093.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 1.0897391140460968, | |
| "epoch": 0.1936193619361936, | |
| "grad_norm": 0.29071077704429626, | |
| "learning_rate": 0.00018771785203568366, | |
| "loss": 1.0775, | |
| "mean_token_accuracy": 0.6933843791484833, | |
| "num_tokens": 4101392.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 1.05050827562809, | |
| "epoch": 0.19401940194019401, | |
| "grad_norm": 0.2660689949989319, | |
| "learning_rate": 0.00018765898851140345, | |
| "loss": 1.003, | |
| "mean_token_accuracy": 0.7151510417461395, | |
| "num_tokens": 4110388.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 1.1417682468891144, | |
| "epoch": 0.19441944194419442, | |
| "grad_norm": 0.2760656774044037, | |
| "learning_rate": 0.00018759999465006087, | |
| "loss": 1.1208, | |
| "mean_token_accuracy": 0.6870895624160767, | |
| "num_tokens": 4119451.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 1.1158250570297241, | |
| "epoch": 0.19481948194819482, | |
| "grad_norm": 0.27844175696372986, | |
| "learning_rate": 0.00018754087055066675, | |
| "loss": 1.0741, | |
| "mean_token_accuracy": 0.7000212967395782, | |
| "num_tokens": 4127997.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 1.0569812506437302, | |
| "epoch": 0.19521952195219522, | |
| "grad_norm": 0.28110507130622864, | |
| "learning_rate": 0.00018748161631245065, | |
| "loss": 1.0375, | |
| "mean_token_accuracy": 0.7026449292898178, | |
| "num_tokens": 4136878.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 1.084457129240036, | |
| "epoch": 0.19561956195619562, | |
| "grad_norm": 0.26859092712402344, | |
| "learning_rate": 0.00018742223203486042, | |
| "loss": 1.0676, | |
| "mean_token_accuracy": 0.6930870711803436, | |
| "num_tokens": 4146324.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 1.0949542820453644, | |
| "epoch": 0.19601960196019602, | |
| "grad_norm": 0.28605908155441284, | |
| "learning_rate": 0.00018736271781756223, | |
| "loss": 1.125, | |
| "mean_token_accuracy": 0.6920661330223083, | |
| "num_tokens": 4154496.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.1369201838970184, | |
| "epoch": 0.19641964196419642, | |
| "grad_norm": 0.3030281364917755, | |
| "learning_rate": 0.00018730307376044027, | |
| "loss": 1.119, | |
| "mean_token_accuracy": 0.6900736391544342, | |
| "num_tokens": 4163381.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 1.1063465178012848, | |
| "epoch": 0.19681968196819682, | |
| "grad_norm": 0.29392218589782715, | |
| "learning_rate": 0.00018724329996359676, | |
| "loss": 1.1376, | |
| "mean_token_accuracy": 0.6872988492250443, | |
| "num_tokens": 4172190.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 1.1071143746376038, | |
| "epoch": 0.19721972197219723, | |
| "grad_norm": 0.28501084446907043, | |
| "learning_rate": 0.00018718339652735154, | |
| "loss": 1.1166, | |
| "mean_token_accuracy": 0.6885866820812225, | |
| "num_tokens": 4180585.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 1.1584193706512451, | |
| "epoch": 0.19761976197619763, | |
| "grad_norm": 0.29230597615242004, | |
| "learning_rate": 0.00018712336355224205, | |
| "loss": 1.1594, | |
| "mean_token_accuracy": 0.6756969690322876, | |
| "num_tokens": 4188810.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 1.0776985734701157, | |
| "epoch": 0.19801980198019803, | |
| "grad_norm": 0.2801620662212372, | |
| "learning_rate": 0.0001870632011390232, | |
| "loss": 1.0296, | |
| "mean_token_accuracy": 0.7065073400735855, | |
| "num_tokens": 4197309.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 1.1805840134620667, | |
| "epoch": 0.19841984198419843, | |
| "grad_norm": 0.3022160530090332, | |
| "learning_rate": 0.00018700290938866712, | |
| "loss": 1.1913, | |
| "mean_token_accuracy": 0.6692783236503601, | |
| "num_tokens": 4205630.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 1.0833539962768555, | |
| "epoch": 0.19881988198819883, | |
| "grad_norm": 0.306426078081131, | |
| "learning_rate": 0.00018694248840236296, | |
| "loss": 1.0954, | |
| "mean_token_accuracy": 0.6928739845752716, | |
| "num_tokens": 4214058.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 1.0818894803524017, | |
| "epoch": 0.19921992199219923, | |
| "grad_norm": 0.2984001934528351, | |
| "learning_rate": 0.00018688193828151682, | |
| "loss": 1.0926, | |
| "mean_token_accuracy": 0.6913997977972031, | |
| "num_tokens": 4222853.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 1.0889964997768402, | |
| "epoch": 0.1996199619961996, | |
| "grad_norm": 0.2939610481262207, | |
| "learning_rate": 0.0001868212591277515, | |
| "loss": 1.0606, | |
| "mean_token_accuracy": 0.6952404677867889, | |
| "num_tokens": 4231395.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 1.0546657741069794, | |
| "epoch": 0.2000200020002, | |
| "grad_norm": 0.28841695189476013, | |
| "learning_rate": 0.00018676045104290637, | |
| "loss": 1.0682, | |
| "mean_token_accuracy": 0.6971585303544998, | |
| "num_tokens": 4240525.0, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 2500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.077959380899922e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |