michaelwaves's picture
Add files using upload-large-folder tool
6f60be0 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2000200020002,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.7942025661468506,
"epoch": 0.00040004000400040005,
"grad_norm": 0.47672003507614136,
"learning_rate": 0.0,
"loss": 2.2188,
"mean_token_accuracy": 0.5192891135811806,
"num_tokens": 8850.0,
"step": 1
},
{
"entropy": 1.739880234003067,
"epoch": 0.0008000800080008001,
"grad_norm": 0.4743156433105469,
"learning_rate": 2.666666666666667e-06,
"loss": 2.1894,
"mean_token_accuracy": 0.5170402973890305,
"num_tokens": 18057.0,
"step": 2
},
{
"entropy": 1.7690136432647705,
"epoch": 0.0012001200120012002,
"grad_norm": 0.5005162358283997,
"learning_rate": 5.333333333333334e-06,
"loss": 2.2131,
"mean_token_accuracy": 0.5172632932662964,
"num_tokens": 26915.0,
"step": 3
},
{
"entropy": 1.866851270198822,
"epoch": 0.0016001600160016002,
"grad_norm": 0.438918799161911,
"learning_rate": 8.000000000000001e-06,
"loss": 2.2875,
"mean_token_accuracy": 0.5107089728116989,
"num_tokens": 35231.0,
"step": 4
},
{
"entropy": 1.8996970057487488,
"epoch": 0.002000200020002,
"grad_norm": 0.4285155236721039,
"learning_rate": 1.0666666666666667e-05,
"loss": 2.2935,
"mean_token_accuracy": 0.5128495469689369,
"num_tokens": 43540.0,
"step": 5
},
{
"entropy": 1.797807365655899,
"epoch": 0.0024002400240024004,
"grad_norm": 0.4465991258621216,
"learning_rate": 1.3333333333333333e-05,
"loss": 2.1917,
"mean_token_accuracy": 0.5254444032907486,
"num_tokens": 52236.0,
"step": 6
},
{
"entropy": 1.8983636498451233,
"epoch": 0.0028002800280028,
"grad_norm": 0.4536067545413971,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.2677,
"mean_token_accuracy": 0.5144101679325104,
"num_tokens": 60443.0,
"step": 7
},
{
"entropy": 1.8427878618240356,
"epoch": 0.0032003200320032004,
"grad_norm": 0.5053722858428955,
"learning_rate": 1.866666666666667e-05,
"loss": 2.2356,
"mean_token_accuracy": 0.5142018273472786,
"num_tokens": 69155.0,
"step": 8
},
{
"entropy": 1.8648996651172638,
"epoch": 0.0036003600360036,
"grad_norm": 0.5287893414497375,
"learning_rate": 2.1333333333333335e-05,
"loss": 2.2435,
"mean_token_accuracy": 0.5086963996291161,
"num_tokens": 77156.0,
"step": 9
},
{
"entropy": 1.886999636888504,
"epoch": 0.004000400040004,
"grad_norm": 0.43816184997558594,
"learning_rate": 2.4e-05,
"loss": 2.1821,
"mean_token_accuracy": 0.5133799910545349,
"num_tokens": 85650.0,
"step": 10
},
{
"entropy": 2.0165862143039703,
"epoch": 0.0044004400440044,
"grad_norm": 0.3899831175804138,
"learning_rate": 2.6666666666666667e-05,
"loss": 2.1903,
"mean_token_accuracy": 0.5218925848603249,
"num_tokens": 93953.0,
"step": 11
},
{
"entropy": 2.033858895301819,
"epoch": 0.004800480048004801,
"grad_norm": 0.43466004729270935,
"learning_rate": 2.9333333333333336e-05,
"loss": 2.0937,
"mean_token_accuracy": 0.5258676409721375,
"num_tokens": 102592.0,
"step": 12
},
{
"entropy": 2.2364404797554016,
"epoch": 0.005200520052005201,
"grad_norm": 0.39024344086647034,
"learning_rate": 3.2000000000000005e-05,
"loss": 2.1801,
"mean_token_accuracy": 0.5228476375341415,
"num_tokens": 110784.0,
"step": 13
},
{
"entropy": 2.1504173278808594,
"epoch": 0.0056005600560056,
"grad_norm": 0.389006644487381,
"learning_rate": 3.466666666666667e-05,
"loss": 2.0215,
"mean_token_accuracy": 0.5430122464895248,
"num_tokens": 120082.0,
"step": 14
},
{
"entropy": 2.2962915897369385,
"epoch": 0.006000600060006,
"grad_norm": 0.4784089922904968,
"learning_rate": 3.733333333333334e-05,
"loss": 2.061,
"mean_token_accuracy": 0.531621664762497,
"num_tokens": 128363.0,
"step": 15
},
{
"entropy": 2.342404544353485,
"epoch": 0.006400640064006401,
"grad_norm": 0.5089271068572998,
"learning_rate": 4e-05,
"loss": 2.07,
"mean_token_accuracy": 0.5325157046318054,
"num_tokens": 136997.0,
"step": 16
},
{
"entropy": 2.283275544643402,
"epoch": 0.006800680068006801,
"grad_norm": 0.5488889813423157,
"learning_rate": 4.266666666666667e-05,
"loss": 2.0056,
"mean_token_accuracy": 0.5334787666797638,
"num_tokens": 145030.0,
"step": 17
},
{
"entropy": 2.050345718860626,
"epoch": 0.0072007200720072,
"grad_norm": 0.5031075477600098,
"learning_rate": 4.5333333333333335e-05,
"loss": 1.9162,
"mean_token_accuracy": 0.5427921563386917,
"num_tokens": 153623.0,
"step": 18
},
{
"entropy": 1.9828232526779175,
"epoch": 0.007600760076007601,
"grad_norm": 0.5337665677070618,
"learning_rate": 4.8e-05,
"loss": 1.9185,
"mean_token_accuracy": 0.5508822798728943,
"num_tokens": 161947.0,
"step": 19
},
{
"entropy": 1.8197293877601624,
"epoch": 0.008000800080008,
"grad_norm": 0.4948204755783081,
"learning_rate": 5.0666666666666674e-05,
"loss": 1.857,
"mean_token_accuracy": 0.552571251988411,
"num_tokens": 170516.0,
"step": 20
},
{
"entropy": 1.789840191602707,
"epoch": 0.0084008400840084,
"grad_norm": 0.4926859438419342,
"learning_rate": 5.333333333333333e-05,
"loss": 1.886,
"mean_token_accuracy": 0.5518065690994263,
"num_tokens": 178469.0,
"step": 21
},
{
"entropy": 1.6451906859874725,
"epoch": 0.0088008800880088,
"grad_norm": 0.4017632007598877,
"learning_rate": 5.6000000000000006e-05,
"loss": 1.7526,
"mean_token_accuracy": 0.5742013603448868,
"num_tokens": 186348.0,
"step": 22
},
{
"entropy": 1.6792134046554565,
"epoch": 0.0092009200920092,
"grad_norm": 0.6260354518890381,
"learning_rate": 5.866666666666667e-05,
"loss": 1.8468,
"mean_token_accuracy": 0.5656454414129257,
"num_tokens": 195071.0,
"step": 23
},
{
"entropy": 1.647391676902771,
"epoch": 0.009600960096009602,
"grad_norm": 0.46580520272254944,
"learning_rate": 6.133333333333334e-05,
"loss": 1.7595,
"mean_token_accuracy": 0.567480742931366,
"num_tokens": 202951.0,
"step": 24
},
{
"entropy": 1.6090652346611023,
"epoch": 0.010001000100010001,
"grad_norm": 0.4587379992008209,
"learning_rate": 6.400000000000001e-05,
"loss": 1.6638,
"mean_token_accuracy": 0.5937570631504059,
"num_tokens": 211268.0,
"step": 25
},
{
"entropy": 1.6326420307159424,
"epoch": 0.010401040104010401,
"grad_norm": 0.44421494007110596,
"learning_rate": 6.666666666666667e-05,
"loss": 1.6439,
"mean_token_accuracy": 0.5923638790845871,
"num_tokens": 219692.0,
"step": 26
},
{
"entropy": 1.7234179377555847,
"epoch": 0.010801080108010801,
"grad_norm": 0.4389747381210327,
"learning_rate": 6.933333333333334e-05,
"loss": 1.7108,
"mean_token_accuracy": 0.5803089290857315,
"num_tokens": 228047.0,
"step": 27
},
{
"entropy": 1.6885777115821838,
"epoch": 0.0112011201120112,
"grad_norm": 0.4335879981517792,
"learning_rate": 7.2e-05,
"loss": 1.6299,
"mean_token_accuracy": 0.586303323507309,
"num_tokens": 236376.0,
"step": 28
},
{
"entropy": 1.6646342873573303,
"epoch": 0.0116011601160116,
"grad_norm": 0.38126322627067566,
"learning_rate": 7.466666666666667e-05,
"loss": 1.6067,
"mean_token_accuracy": 0.5964086949825287,
"num_tokens": 245092.0,
"step": 29
},
{
"entropy": 1.6213374137878418,
"epoch": 0.012001200120012,
"grad_norm": 0.39270561933517456,
"learning_rate": 7.733333333333333e-05,
"loss": 1.5822,
"mean_token_accuracy": 0.6026028245687485,
"num_tokens": 253673.0,
"step": 30
},
{
"entropy": 1.5640352368354797,
"epoch": 0.012401240124012402,
"grad_norm": 0.3869155943393707,
"learning_rate": 8e-05,
"loss": 1.5011,
"mean_token_accuracy": 0.6241087764501572,
"num_tokens": 262625.0,
"step": 31
},
{
"entropy": 1.520020067691803,
"epoch": 0.012801280128012802,
"grad_norm": 0.3769737184047699,
"learning_rate": 8.266666666666667e-05,
"loss": 1.5088,
"mean_token_accuracy": 0.6204348653554916,
"num_tokens": 271309.0,
"step": 32
},
{
"entropy": 1.5669251084327698,
"epoch": 0.013201320132013201,
"grad_norm": 0.4119971692562103,
"learning_rate": 8.533333333333334e-05,
"loss": 1.598,
"mean_token_accuracy": 0.6009179204702377,
"num_tokens": 279702.0,
"step": 33
},
{
"entropy": 1.4570423662662506,
"epoch": 0.013601360136013601,
"grad_norm": 0.39608579874038696,
"learning_rate": 8.800000000000001e-05,
"loss": 1.4757,
"mean_token_accuracy": 0.6308933645486832,
"num_tokens": 288493.0,
"step": 34
},
{
"entropy": 1.4845676720142365,
"epoch": 0.014001400140014001,
"grad_norm": 0.37827152013778687,
"learning_rate": 9.066666666666667e-05,
"loss": 1.5051,
"mean_token_accuracy": 0.6212253570556641,
"num_tokens": 296999.0,
"step": 35
},
{
"entropy": 1.5079152584075928,
"epoch": 0.0144014401440144,
"grad_norm": 0.39496058225631714,
"learning_rate": 9.333333333333334e-05,
"loss": 1.5177,
"mean_token_accuracy": 0.6146594285964966,
"num_tokens": 305146.0,
"step": 36
},
{
"entropy": 1.4583857357501984,
"epoch": 0.014801480148014802,
"grad_norm": 0.41785281896591187,
"learning_rate": 9.6e-05,
"loss": 1.4723,
"mean_token_accuracy": 0.6168077737092972,
"num_tokens": 313647.0,
"step": 37
},
{
"entropy": 1.3630880415439606,
"epoch": 0.015201520152015202,
"grad_norm": 0.3789471983909607,
"learning_rate": 9.866666666666668e-05,
"loss": 1.3449,
"mean_token_accuracy": 0.6459334343671799,
"num_tokens": 322633.0,
"step": 38
},
{
"entropy": 1.4223653674125671,
"epoch": 0.015601560156015602,
"grad_norm": 0.4337131381034851,
"learning_rate": 0.00010133333333333335,
"loss": 1.4755,
"mean_token_accuracy": 0.6144974380731583,
"num_tokens": 331687.0,
"step": 39
},
{
"entropy": 1.3911584913730621,
"epoch": 0.016001600160016,
"grad_norm": 0.41617903113365173,
"learning_rate": 0.00010400000000000001,
"loss": 1.3826,
"mean_token_accuracy": 0.6441078633069992,
"num_tokens": 339868.0,
"step": 40
},
{
"entropy": 1.4160181879997253,
"epoch": 0.016401640164016403,
"grad_norm": 0.43531423807144165,
"learning_rate": 0.00010666666666666667,
"loss": 1.4294,
"mean_token_accuracy": 0.6320265531539917,
"num_tokens": 348029.0,
"step": 41
},
{
"entropy": 1.482937514781952,
"epoch": 0.0168016801680168,
"grad_norm": 0.4324755072593689,
"learning_rate": 0.00010933333333333333,
"loss": 1.5147,
"mean_token_accuracy": 0.6166313588619232,
"num_tokens": 356240.0,
"step": 42
},
{
"entropy": 1.4201266169548035,
"epoch": 0.017201720172017203,
"grad_norm": 0.3948879837989807,
"learning_rate": 0.00011200000000000001,
"loss": 1.3994,
"mean_token_accuracy": 0.6290998160839081,
"num_tokens": 364425.0,
"step": 43
},
{
"entropy": 1.357359528541565,
"epoch": 0.0176017601760176,
"grad_norm": 0.41655364632606506,
"learning_rate": 0.00011466666666666667,
"loss": 1.2924,
"mean_token_accuracy": 0.6492937654256821,
"num_tokens": 373138.0,
"step": 44
},
{
"entropy": 1.391854703426361,
"epoch": 0.018001800180018002,
"grad_norm": 0.417074590921402,
"learning_rate": 0.00011733333333333334,
"loss": 1.3507,
"mean_token_accuracy": 0.6494302302598953,
"num_tokens": 382100.0,
"step": 45
},
{
"entropy": 1.4749327600002289,
"epoch": 0.0184018401840184,
"grad_norm": 0.41923800110816956,
"learning_rate": 0.00012,
"loss": 1.5085,
"mean_token_accuracy": 0.612814411520958,
"num_tokens": 390052.0,
"step": 46
},
{
"entropy": 1.4137325286865234,
"epoch": 0.018801880188018802,
"grad_norm": 0.3833743929862976,
"learning_rate": 0.00012266666666666668,
"loss": 1.3916,
"mean_token_accuracy": 0.6410449594259262,
"num_tokens": 398110.0,
"step": 47
},
{
"entropy": 1.3919320702552795,
"epoch": 0.019201920192019203,
"grad_norm": 0.37842363119125366,
"learning_rate": 0.00012533333333333334,
"loss": 1.4084,
"mean_token_accuracy": 0.6312015205621719,
"num_tokens": 406666.0,
"step": 48
},
{
"entropy": 1.3608618378639221,
"epoch": 0.0196019601960196,
"grad_norm": 0.4568133056163788,
"learning_rate": 0.00012800000000000002,
"loss": 1.368,
"mean_token_accuracy": 0.6458054482936859,
"num_tokens": 415283.0,
"step": 49
},
{
"entropy": 1.3759468793869019,
"epoch": 0.020002000200020003,
"grad_norm": 0.3905130922794342,
"learning_rate": 0.00013066666666666668,
"loss": 1.3781,
"mean_token_accuracy": 0.6408856809139252,
"num_tokens": 423867.0,
"step": 50
},
{
"entropy": 1.3894509375095367,
"epoch": 0.0204020402040204,
"grad_norm": 0.39885976910591125,
"learning_rate": 0.00013333333333333334,
"loss": 1.3832,
"mean_token_accuracy": 0.6394526213407516,
"num_tokens": 432299.0,
"step": 51
},
{
"entropy": 1.3620089888572693,
"epoch": 0.020802080208020803,
"grad_norm": 0.44015854597091675,
"learning_rate": 0.00013600000000000003,
"loss": 1.3381,
"mean_token_accuracy": 0.6432337760925293,
"num_tokens": 440734.0,
"step": 52
},
{
"entropy": 1.3622656762599945,
"epoch": 0.0212021202120212,
"grad_norm": 0.49739453196525574,
"learning_rate": 0.00013866666666666669,
"loss": 1.3649,
"mean_token_accuracy": 0.6373352855443954,
"num_tokens": 448710.0,
"step": 53
},
{
"entropy": 1.2986978590488434,
"epoch": 0.021602160216021602,
"grad_norm": 0.37318113446235657,
"learning_rate": 0.00014133333333333334,
"loss": 1.3366,
"mean_token_accuracy": 0.6431873738765717,
"num_tokens": 457247.0,
"step": 54
},
{
"entropy": 1.2725946605205536,
"epoch": 0.022002200220022004,
"grad_norm": 0.4199654757976532,
"learning_rate": 0.000144,
"loss": 1.3302,
"mean_token_accuracy": 0.6447762101888657,
"num_tokens": 465701.0,
"step": 55
},
{
"entropy": 1.2967428863048553,
"epoch": 0.0224022402240224,
"grad_norm": 0.40956538915634155,
"learning_rate": 0.00014666666666666666,
"loss": 1.3352,
"mean_token_accuracy": 0.6408500224351883,
"num_tokens": 474476.0,
"step": 56
},
{
"entropy": 1.3544551134109497,
"epoch": 0.022802280228022803,
"grad_norm": 0.39519739151000977,
"learning_rate": 0.00014933333333333335,
"loss": 1.3406,
"mean_token_accuracy": 0.6500163674354553,
"num_tokens": 482570.0,
"step": 57
},
{
"entropy": 1.3824973404407501,
"epoch": 0.0232023202320232,
"grad_norm": 0.3799802362918854,
"learning_rate": 0.000152,
"loss": 1.3278,
"mean_token_accuracy": 0.6473122090101242,
"num_tokens": 491111.0,
"step": 58
},
{
"entropy": 1.3626296520233154,
"epoch": 0.023602360236023603,
"grad_norm": 0.3700718879699707,
"learning_rate": 0.00015466666666666667,
"loss": 1.3304,
"mean_token_accuracy": 0.645874097943306,
"num_tokens": 500032.0,
"step": 59
},
{
"entropy": 1.3258526921272278,
"epoch": 0.024002400240024,
"grad_norm": 0.366222620010376,
"learning_rate": 0.00015733333333333333,
"loss": 1.3073,
"mean_token_accuracy": 0.6523128002882004,
"num_tokens": 508045.0,
"step": 60
},
{
"entropy": 1.2787662744522095,
"epoch": 0.024402440244024402,
"grad_norm": 0.37774235010147095,
"learning_rate": 0.00016,
"loss": 1.2839,
"mean_token_accuracy": 0.657956600189209,
"num_tokens": 516334.0,
"step": 61
},
{
"entropy": 1.2824394404888153,
"epoch": 0.024802480248024804,
"grad_norm": 0.3594248294830322,
"learning_rate": 0.00016266666666666667,
"loss": 1.3335,
"mean_token_accuracy": 0.6513591110706329,
"num_tokens": 524762.0,
"step": 62
},
{
"entropy": 1.2761549651622772,
"epoch": 0.025202520252025202,
"grad_norm": 0.38247525691986084,
"learning_rate": 0.00016533333333333333,
"loss": 1.322,
"mean_token_accuracy": 0.6528888940811157,
"num_tokens": 533302.0,
"step": 63
},
{
"entropy": 1.285708099603653,
"epoch": 0.025602560256025603,
"grad_norm": 0.4210297167301178,
"learning_rate": 0.000168,
"loss": 1.2522,
"mean_token_accuracy": 0.6581785976886749,
"num_tokens": 542110.0,
"step": 64
},
{
"entropy": 1.3535743653774261,
"epoch": 0.026002600260026,
"grad_norm": 0.3659783601760864,
"learning_rate": 0.00017066666666666668,
"loss": 1.3343,
"mean_token_accuracy": 0.6510991156101227,
"num_tokens": 550717.0,
"step": 65
},
{
"entropy": 1.3446696996688843,
"epoch": 0.026402640264026403,
"grad_norm": 0.35590988397598267,
"learning_rate": 0.00017333333333333334,
"loss": 1.3224,
"mean_token_accuracy": 0.6442483812570572,
"num_tokens": 559025.0,
"step": 66
},
{
"entropy": 1.3695125877857208,
"epoch": 0.0268026802680268,
"grad_norm": 0.3491916358470917,
"learning_rate": 0.00017600000000000002,
"loss": 1.3288,
"mean_token_accuracy": 0.6431872397661209,
"num_tokens": 567724.0,
"step": 67
},
{
"entropy": 1.3363787531852722,
"epoch": 0.027202720272027203,
"grad_norm": 0.3625618517398834,
"learning_rate": 0.00017866666666666668,
"loss": 1.2804,
"mean_token_accuracy": 0.6557945609092712,
"num_tokens": 576144.0,
"step": 68
},
{
"entropy": 1.3033888339996338,
"epoch": 0.027602760276027604,
"grad_norm": 0.35051390528678894,
"learning_rate": 0.00018133333333333334,
"loss": 1.2841,
"mean_token_accuracy": 0.6544656604528427,
"num_tokens": 584831.0,
"step": 69
},
{
"entropy": 1.3235229551792145,
"epoch": 0.028002800280028002,
"grad_norm": 0.3980117738246918,
"learning_rate": 0.00018400000000000003,
"loss": 1.3492,
"mean_token_accuracy": 0.6482396423816681,
"num_tokens": 593412.0,
"step": 70
},
{
"entropy": 1.2970213294029236,
"epoch": 0.028402840284028404,
"grad_norm": 0.3519047796726227,
"learning_rate": 0.0001866666666666667,
"loss": 1.3083,
"mean_token_accuracy": 0.6536522507667542,
"num_tokens": 601675.0,
"step": 71
},
{
"entropy": 1.2363843321800232,
"epoch": 0.0288028802880288,
"grad_norm": 0.356121689081192,
"learning_rate": 0.00018933333333333335,
"loss": 1.2331,
"mean_token_accuracy": 0.6689527034759521,
"num_tokens": 610155.0,
"step": 72
},
{
"entropy": 1.2743788659572601,
"epoch": 0.029202920292029203,
"grad_norm": 0.352166086435318,
"learning_rate": 0.000192,
"loss": 1.2953,
"mean_token_accuracy": 0.6543757170438766,
"num_tokens": 619084.0,
"step": 73
},
{
"entropy": 1.251781314611435,
"epoch": 0.029602960296029605,
"grad_norm": 0.3690275251865387,
"learning_rate": 0.0001946666666666667,
"loss": 1.249,
"mean_token_accuracy": 0.6584222465753555,
"num_tokens": 627717.0,
"step": 74
},
{
"entropy": 1.3367043435573578,
"epoch": 0.030003000300030003,
"grad_norm": 0.3400121331214905,
"learning_rate": 0.00019733333333333335,
"loss": 1.2895,
"mean_token_accuracy": 0.6532490998506546,
"num_tokens": 637070.0,
"step": 75
},
{
"entropy": 1.2800488770008087,
"epoch": 0.030403040304030404,
"grad_norm": 0.34383344650268555,
"learning_rate": 0.0002,
"loss": 1.2733,
"mean_token_accuracy": 0.6612512767314911,
"num_tokens": 646123.0,
"step": 76
},
{
"entropy": 1.328520268201828,
"epoch": 0.030803080308030802,
"grad_norm": 0.3561513125896454,
"learning_rate": 0.00019999992447535154,
"loss": 1.3263,
"mean_token_accuracy": 0.6502320766448975,
"num_tokens": 654808.0,
"step": 77
},
{
"entropy": 1.2899321019649506,
"epoch": 0.031203120312031204,
"grad_norm": 0.3678707480430603,
"learning_rate": 0.00019999969790153286,
"loss": 1.3406,
"mean_token_accuracy": 0.6464085876941681,
"num_tokens": 663045.0,
"step": 78
},
{
"entropy": 1.3219149708747864,
"epoch": 0.0316031603160316,
"grad_norm": 0.38404518365859985,
"learning_rate": 0.00019999932027892428,
"loss": 1.302,
"mean_token_accuracy": 0.6544652730226517,
"num_tokens": 671266.0,
"step": 79
},
{
"entropy": 1.227865844964981,
"epoch": 0.032003200320032,
"grad_norm": 0.3195721209049225,
"learning_rate": 0.0001999987916081595,
"loss": 1.2129,
"mean_token_accuracy": 0.6690118610858917,
"num_tokens": 680536.0,
"step": 80
},
{
"entropy": 1.2681958079338074,
"epoch": 0.032403240324032405,
"grad_norm": 0.33165785670280457,
"learning_rate": 0.00019999811189012589,
"loss": 1.2616,
"mean_token_accuracy": 0.6542633771896362,
"num_tokens": 689078.0,
"step": 81
},
{
"entropy": 1.2480992376804352,
"epoch": 0.032803280328032806,
"grad_norm": 0.3365044891834259,
"learning_rate": 0.00019999728112596419,
"loss": 1.2532,
"mean_token_accuracy": 0.6593984663486481,
"num_tokens": 697600.0,
"step": 82
},
{
"entropy": 1.2559486627578735,
"epoch": 0.0332033203320332,
"grad_norm": 0.3525690734386444,
"learning_rate": 0.0001999962993170687,
"loss": 1.2407,
"mean_token_accuracy": 0.6652248501777649,
"num_tokens": 706449.0,
"step": 83
},
{
"entropy": 1.2723756432533264,
"epoch": 0.0336033603360336,
"grad_norm": 0.3243389129638672,
"learning_rate": 0.00019999516646508717,
"loss": 1.2759,
"mean_token_accuracy": 0.6553087830543518,
"num_tokens": 715261.0,
"step": 84
},
{
"entropy": 1.286735862493515,
"epoch": 0.034003400340034004,
"grad_norm": 0.3348769247531891,
"learning_rate": 0.000199993882571921,
"loss": 1.3288,
"mean_token_accuracy": 0.6503776162862778,
"num_tokens": 723935.0,
"step": 85
},
{
"entropy": 1.2838447391986847,
"epoch": 0.034403440344034406,
"grad_norm": 0.31921443343162537,
"learning_rate": 0.0001999924476397249,
"loss": 1.2712,
"mean_token_accuracy": 0.6571811884641647,
"num_tokens": 732552.0,
"step": 86
},
{
"entropy": 1.2601779401302338,
"epoch": 0.0348034803480348,
"grad_norm": 0.3210558593273163,
"learning_rate": 0.0001999908616709071,
"loss": 1.2409,
"mean_token_accuracy": 0.6692058891057968,
"num_tokens": 741619.0,
"step": 87
},
{
"entropy": 1.2706993520259857,
"epoch": 0.0352035203520352,
"grad_norm": 0.3449415862560272,
"learning_rate": 0.00019998912466812952,
"loss": 1.2301,
"mean_token_accuracy": 0.6645237505435944,
"num_tokens": 750045.0,
"step": 88
},
{
"entropy": 1.264108419418335,
"epoch": 0.0356035603560356,
"grad_norm": 0.3272925913333893,
"learning_rate": 0.00019998723663430733,
"loss": 1.2593,
"mean_token_accuracy": 0.6653023958206177,
"num_tokens": 758535.0,
"step": 89
},
{
"entropy": 1.174435406923294,
"epoch": 0.036003600360036005,
"grad_norm": 0.3484836518764496,
"learning_rate": 0.00019998519757260928,
"loss": 1.1771,
"mean_token_accuracy": 0.6722908169031143,
"num_tokens": 766995.0,
"step": 90
},
{
"entropy": 1.2018343806266785,
"epoch": 0.036403640364036406,
"grad_norm": 0.3412557542324066,
"learning_rate": 0.00019998300748645754,
"loss": 1.2204,
"mean_token_accuracy": 0.6707678735256195,
"num_tokens": 775542.0,
"step": 91
},
{
"entropy": 1.3117725551128387,
"epoch": 0.0368036803680368,
"grad_norm": 0.3464583158493042,
"learning_rate": 0.00019998066637952783,
"loss": 1.304,
"mean_token_accuracy": 0.645479291677475,
"num_tokens": 783830.0,
"step": 92
},
{
"entropy": 1.266638070344925,
"epoch": 0.0372037203720372,
"grad_norm": 0.35132962465286255,
"learning_rate": 0.0001999781742557493,
"loss": 1.2571,
"mean_token_accuracy": 0.6589740812778473,
"num_tokens": 792085.0,
"step": 93
},
{
"entropy": 1.266037255525589,
"epoch": 0.037603760376037604,
"grad_norm": 0.3320970833301544,
"learning_rate": 0.00019997553111930448,
"loss": 1.2761,
"mean_token_accuracy": 0.654522180557251,
"num_tokens": 800687.0,
"step": 94
},
{
"entropy": 1.324877679347992,
"epoch": 0.038003800380038005,
"grad_norm": 0.34410229325294495,
"learning_rate": 0.00019997273697462952,
"loss": 1.3059,
"mean_token_accuracy": 0.6469769328832626,
"num_tokens": 808479.0,
"step": 95
},
{
"entropy": 1.24421826004982,
"epoch": 0.03840384038403841,
"grad_norm": 0.3413639962673187,
"learning_rate": 0.00019996979182641383,
"loss": 1.2116,
"mean_token_accuracy": 0.6725156307220459,
"num_tokens": 817193.0,
"step": 96
},
{
"entropy": 1.2131675779819489,
"epoch": 0.0388038803880388,
"grad_norm": 0.31536421179771423,
"learning_rate": 0.00019996669567960031,
"loss": 1.2337,
"mean_token_accuracy": 0.6649139970541,
"num_tokens": 825915.0,
"step": 97
},
{
"entropy": 1.2785483300685883,
"epoch": 0.0392039203920392,
"grad_norm": 0.3453619182109833,
"learning_rate": 0.00019996344853938534,
"loss": 1.2257,
"mean_token_accuracy": 0.6682975143194199,
"num_tokens": 833771.0,
"step": 98
},
{
"entropy": 1.2706316709518433,
"epoch": 0.039603960396039604,
"grad_norm": 0.34687721729278564,
"learning_rate": 0.00019996005041121871,
"loss": 1.2578,
"mean_token_accuracy": 0.6584849059581757,
"num_tokens": 842093.0,
"step": 99
},
{
"entropy": 1.310558557510376,
"epoch": 0.040004000400040006,
"grad_norm": 0.34193679690361023,
"learning_rate": 0.0001999565013008035,
"loss": 1.338,
"mean_token_accuracy": 0.6487725079059601,
"num_tokens": 850079.0,
"step": 100
},
{
"entropy": 1.2646283209323883,
"epoch": 0.04040404040404041,
"grad_norm": 0.3951033651828766,
"learning_rate": 0.00019995280121409636,
"loss": 1.3172,
"mean_token_accuracy": 0.6424316316843033,
"num_tokens": 858250.0,
"step": 101
},
{
"entropy": 1.2900939583778381,
"epoch": 0.0408040804080408,
"grad_norm": 0.3364447057247162,
"learning_rate": 0.00019994895015730717,
"loss": 1.2487,
"mean_token_accuracy": 0.6626600474119186,
"num_tokens": 866623.0,
"step": 102
},
{
"entropy": 1.294897198677063,
"epoch": 0.041204120412041204,
"grad_norm": 0.3506770431995392,
"learning_rate": 0.00019994494813689928,
"loss": 1.2672,
"mean_token_accuracy": 0.6523661762475967,
"num_tokens": 875370.0,
"step": 103
},
{
"entropy": 1.2744373679161072,
"epoch": 0.041604160416041605,
"grad_norm": 0.31772273778915405,
"learning_rate": 0.00019994079515958942,
"loss": 1.2437,
"mean_token_accuracy": 0.6669129282236099,
"num_tokens": 884081.0,
"step": 104
},
{
"entropy": 1.2323677241802216,
"epoch": 0.04200420042004201,
"grad_norm": 0.31223100423812866,
"learning_rate": 0.00019993649123234758,
"loss": 1.2034,
"mean_token_accuracy": 0.6670378148555756,
"num_tokens": 892383.0,
"step": 105
},
{
"entropy": 1.1459662318229675,
"epoch": 0.0424042404240424,
"grad_norm": 0.3307859003543854,
"learning_rate": 0.00019993203636239717,
"loss": 1.2135,
"mean_token_accuracy": 0.6718799471855164,
"num_tokens": 900628.0,
"step": 106
},
{
"entropy": 1.2268281877040863,
"epoch": 0.0428042804280428,
"grad_norm": 0.35912272334098816,
"learning_rate": 0.00019992743055721493,
"loss": 1.2609,
"mean_token_accuracy": 0.6666164696216583,
"num_tokens": 909062.0,
"step": 107
},
{
"entropy": 1.200032651424408,
"epoch": 0.043204320432043204,
"grad_norm": 0.35117003321647644,
"learning_rate": 0.00019992267382453092,
"loss": 1.2047,
"mean_token_accuracy": 0.6681774854660034,
"num_tokens": 918221.0,
"step": 108
},
{
"entropy": 1.3714069724082947,
"epoch": 0.043604360436043606,
"grad_norm": 0.33686235547065735,
"learning_rate": 0.0001999177661723284,
"loss": 1.2777,
"mean_token_accuracy": 0.655053585767746,
"num_tokens": 926443.0,
"step": 109
},
{
"entropy": 1.3487186133861542,
"epoch": 0.04400440044004401,
"grad_norm": 0.3200630843639374,
"learning_rate": 0.0001999127076088441,
"loss": 1.3107,
"mean_token_accuracy": 0.6602136790752411,
"num_tokens": 934650.0,
"step": 110
},
{
"entropy": 1.2584488987922668,
"epoch": 0.0444044404440444,
"grad_norm": 0.31613630056381226,
"learning_rate": 0.0001999074981425679,
"loss": 1.2226,
"mean_token_accuracy": 0.6622737497091293,
"num_tokens": 942947.0,
"step": 111
},
{
"entropy": 1.1936236023902893,
"epoch": 0.0448044804480448,
"grad_norm": 0.316254198551178,
"learning_rate": 0.00019990213778224298,
"loss": 1.2106,
"mean_token_accuracy": 0.6652569025754929,
"num_tokens": 951465.0,
"step": 112
},
{
"entropy": 1.165192574262619,
"epoch": 0.045204520452045205,
"grad_norm": 0.31257057189941406,
"learning_rate": 0.00019989662653686576,
"loss": 1.2065,
"mean_token_accuracy": 0.6672259867191315,
"num_tokens": 960215.0,
"step": 113
},
{
"entropy": 1.180109590291977,
"epoch": 0.045604560456045606,
"grad_norm": 0.3332797884941101,
"learning_rate": 0.00019989096441568591,
"loss": 1.2285,
"mean_token_accuracy": 0.6671265214681625,
"num_tokens": 968893.0,
"step": 114
},
{
"entropy": 1.220985621213913,
"epoch": 0.04600460046004601,
"grad_norm": 0.3698706030845642,
"learning_rate": 0.0001998851514282063,
"loss": 1.2314,
"mean_token_accuracy": 0.6654269397258759,
"num_tokens": 976891.0,
"step": 115
},
{
"entropy": 1.2753552794456482,
"epoch": 0.0464046404640464,
"grad_norm": 0.32274726033210754,
"learning_rate": 0.00019987918758418308,
"loss": 1.2811,
"mean_token_accuracy": 0.6611100733280182,
"num_tokens": 984914.0,
"step": 116
},
{
"entropy": 1.308321624994278,
"epoch": 0.046804680468046804,
"grad_norm": 0.33258453011512756,
"learning_rate": 0.00019987307289362545,
"loss": 1.2541,
"mean_token_accuracy": 0.6605920940637589,
"num_tokens": 993096.0,
"step": 117
},
{
"entropy": 1.2893326878547668,
"epoch": 0.047204720472047206,
"grad_norm": 0.33915621042251587,
"learning_rate": 0.00019986680736679586,
"loss": 1.2511,
"mean_token_accuracy": 0.6640890389680862,
"num_tokens": 1001323.0,
"step": 118
},
{
"entropy": 1.30213862657547,
"epoch": 0.04760476047604761,
"grad_norm": 0.3717119097709656,
"learning_rate": 0.00019986039101420994,
"loss": 1.3143,
"mean_token_accuracy": 0.649169459939003,
"num_tokens": 1009892.0,
"step": 119
},
{
"entropy": 1.3021227717399597,
"epoch": 0.048004800480048,
"grad_norm": 0.32890114188194275,
"learning_rate": 0.0001998538238466364,
"loss": 1.2351,
"mean_token_accuracy": 0.6693892329931259,
"num_tokens": 1017992.0,
"step": 120
},
{
"entropy": 1.2010404765605927,
"epoch": 0.0484048404840484,
"grad_norm": 0.3222126066684723,
"learning_rate": 0.00019984710587509706,
"loss": 1.1934,
"mean_token_accuracy": 0.6745197772979736,
"num_tokens": 1026224.0,
"step": 121
},
{
"entropy": 1.2384890913963318,
"epoch": 0.048804880488048805,
"grad_norm": 0.32965728640556335,
"learning_rate": 0.00019984023711086687,
"loss": 1.2587,
"mean_token_accuracy": 0.6567209810018539,
"num_tokens": 1034674.0,
"step": 122
},
{
"entropy": 1.1893330216407776,
"epoch": 0.049204920492049206,
"grad_norm": 0.3488786518573761,
"learning_rate": 0.0001998332175654739,
"loss": 1.1999,
"mean_token_accuracy": 0.6683076322078705,
"num_tokens": 1042546.0,
"step": 123
},
{
"entropy": 1.2300190329551697,
"epoch": 0.04960496049604961,
"grad_norm": 0.33502018451690674,
"learning_rate": 0.00019982604725069918,
"loss": 1.2714,
"mean_token_accuracy": 0.6550982743501663,
"num_tokens": 1051075.0,
"step": 124
},
{
"entropy": 1.263420820236206,
"epoch": 0.05000500050005,
"grad_norm": 0.35562458634376526,
"learning_rate": 0.00019981872617857684,
"loss": 1.2535,
"mean_token_accuracy": 0.6570105701684952,
"num_tokens": 1059384.0,
"step": 125
},
{
"entropy": 1.2463673949241638,
"epoch": 0.050405040504050404,
"grad_norm": 0.3122851252555847,
"learning_rate": 0.00019981125436139405,
"loss": 1.2035,
"mean_token_accuracy": 0.6734038293361664,
"num_tokens": 1068524.0,
"step": 126
},
{
"entropy": 1.3272143006324768,
"epoch": 0.050805080508050805,
"grad_norm": 0.37185049057006836,
"learning_rate": 0.00019980363181169096,
"loss": 1.2723,
"mean_token_accuracy": 0.6541654914617538,
"num_tokens": 1076256.0,
"step": 127
},
{
"entropy": 1.2414169907569885,
"epoch": 0.05120512051205121,
"grad_norm": 0.32138875126838684,
"learning_rate": 0.00019979585854226065,
"loss": 1.1992,
"mean_token_accuracy": 0.6784048974514008,
"num_tokens": 1084784.0,
"step": 128
},
{
"entropy": 1.1664628982543945,
"epoch": 0.05160516051605161,
"grad_norm": 0.31607839465141296,
"learning_rate": 0.00019978793456614918,
"loss": 1.1728,
"mean_token_accuracy": 0.6773318648338318,
"num_tokens": 1094177.0,
"step": 129
},
{
"entropy": 1.1460879147052765,
"epoch": 0.052005200520052,
"grad_norm": 0.3119550347328186,
"learning_rate": 0.0001997798598966556,
"loss": 1.1576,
"mean_token_accuracy": 0.6763872653245926,
"num_tokens": 1102808.0,
"step": 130
},
{
"entropy": 1.1866309642791748,
"epoch": 0.052405240524052404,
"grad_norm": 0.3441757261753082,
"learning_rate": 0.00019977163454733184,
"loss": 1.2228,
"mean_token_accuracy": 0.6688681393861771,
"num_tokens": 1111447.0,
"step": 131
},
{
"entropy": 1.1310507953166962,
"epoch": 0.052805280528052806,
"grad_norm": 0.3540189862251282,
"learning_rate": 0.00019976325853198268,
"loss": 1.1514,
"mean_token_accuracy": 0.6831837445497513,
"num_tokens": 1120000.0,
"step": 132
},
{
"entropy": 1.19211745262146,
"epoch": 0.05320532053205321,
"grad_norm": 0.3323245942592621,
"learning_rate": 0.00019975473186466583,
"loss": 1.2119,
"mean_token_accuracy": 0.6718263179063797,
"num_tokens": 1128658.0,
"step": 133
},
{
"entropy": 1.1928575336933136,
"epoch": 0.0536053605360536,
"grad_norm": 0.34882429242134094,
"learning_rate": 0.0001997460545596918,
"loss": 1.2066,
"mean_token_accuracy": 0.6791622638702393,
"num_tokens": 1137143.0,
"step": 134
},
{
"entropy": 1.226127952337265,
"epoch": 0.054005400540054004,
"grad_norm": 0.3233380913734436,
"learning_rate": 0.00019973722663162396,
"loss": 1.1884,
"mean_token_accuracy": 0.6750646978616714,
"num_tokens": 1145501.0,
"step": 135
},
{
"entropy": 1.2761054337024689,
"epoch": 0.054405440544054405,
"grad_norm": 0.308118611574173,
"learning_rate": 0.00019972824809527838,
"loss": 1.224,
"mean_token_accuracy": 0.6631017774343491,
"num_tokens": 1153912.0,
"step": 136
},
{
"entropy": 1.3157364130020142,
"epoch": 0.05480548054805481,
"grad_norm": 0.33582690358161926,
"learning_rate": 0.00019971911896572405,
"loss": 1.2701,
"mean_token_accuracy": 0.6578985750675201,
"num_tokens": 1161769.0,
"step": 137
},
{
"entropy": 1.2075002789497375,
"epoch": 0.05520552055205521,
"grad_norm": 0.3170996606349945,
"learning_rate": 0.00019970983925828256,
"loss": 1.1906,
"mean_token_accuracy": 0.6732707768678665,
"num_tokens": 1170319.0,
"step": 138
},
{
"entropy": 1.1732978522777557,
"epoch": 0.0556055605560556,
"grad_norm": 0.32156452536582947,
"learning_rate": 0.0001997004089885283,
"loss": 1.1782,
"mean_token_accuracy": 0.6732619553804398,
"num_tokens": 1178801.0,
"step": 139
},
{
"entropy": 1.1573354601860046,
"epoch": 0.056005600560056004,
"grad_norm": 0.33083587884902954,
"learning_rate": 0.00019969082817228832,
"loss": 1.2067,
"mean_token_accuracy": 0.6737565696239471,
"num_tokens": 1186994.0,
"step": 140
},
{
"entropy": 1.211174637079239,
"epoch": 0.056405640564056406,
"grad_norm": 0.34685665369033813,
"learning_rate": 0.00019968109682564237,
"loss": 1.2586,
"mean_token_accuracy": 0.6569341272115707,
"num_tokens": 1194743.0,
"step": 141
},
{
"entropy": 1.2521505057811737,
"epoch": 0.05680568056805681,
"grad_norm": 0.35258418321609497,
"learning_rate": 0.00019967121496492282,
"loss": 1.2599,
"mean_token_accuracy": 0.6645904332399368,
"num_tokens": 1202435.0,
"step": 142
},
{
"entropy": 1.2398549616336823,
"epoch": 0.05720572057205721,
"grad_norm": 0.3388517200946808,
"learning_rate": 0.00019966118260671465,
"loss": 1.2081,
"mean_token_accuracy": 0.6675426363945007,
"num_tokens": 1210326.0,
"step": 143
},
{
"entropy": 1.297620803117752,
"epoch": 0.0576057605760576,
"grad_norm": 0.34630584716796875,
"learning_rate": 0.0001996509997678554,
"loss": 1.2857,
"mean_token_accuracy": 0.6573289930820465,
"num_tokens": 1218682.0,
"step": 144
},
{
"entropy": 1.248921811580658,
"epoch": 0.058005800580058005,
"grad_norm": 0.33417370915412903,
"learning_rate": 0.00019964066646543517,
"loss": 1.2036,
"mean_token_accuracy": 0.6730931401252747,
"num_tokens": 1227725.0,
"step": 145
},
{
"entropy": 1.2742219269275665,
"epoch": 0.058405840584058406,
"grad_norm": 0.31867334246635437,
"learning_rate": 0.00019963018271679667,
"loss": 1.2356,
"mean_token_accuracy": 0.6603083312511444,
"num_tokens": 1236112.0,
"step": 146
},
{
"entropy": 1.2454158961772919,
"epoch": 0.05880588058805881,
"grad_norm": 0.31619757413864136,
"learning_rate": 0.000199619548539535,
"loss": 1.2272,
"mean_token_accuracy": 0.664936900138855,
"num_tokens": 1244932.0,
"step": 147
},
{
"entropy": 1.1861615478992462,
"epoch": 0.05920592059205921,
"grad_norm": 0.3590589761734009,
"learning_rate": 0.00019960876395149778,
"loss": 1.2122,
"mean_token_accuracy": 0.6684562414884567,
"num_tokens": 1253316.0,
"step": 148
},
{
"entropy": 1.1777002215385437,
"epoch": 0.059605960596059604,
"grad_norm": 0.3057377338409424,
"learning_rate": 0.00019959782897078504,
"loss": 1.1483,
"mean_token_accuracy": 0.6810255944728851,
"num_tokens": 1261895.0,
"step": 149
},
{
"entropy": 1.2077372670173645,
"epoch": 0.060006000600060005,
"grad_norm": 0.32661283016204834,
"learning_rate": 0.00019958674361574927,
"loss": 1.2242,
"mean_token_accuracy": 0.6603673696517944,
"num_tokens": 1270647.0,
"step": 150
},
{
"entropy": 1.2129946649074554,
"epoch": 0.06040604060406041,
"grad_norm": 0.33181479573249817,
"learning_rate": 0.00019957550790499526,
"loss": 1.214,
"mean_token_accuracy": 0.6734245270490646,
"num_tokens": 1279483.0,
"step": 151
},
{
"entropy": 1.2279469072818756,
"epoch": 0.06080608060806081,
"grad_norm": 0.36564233899116516,
"learning_rate": 0.00019956412185738025,
"loss": 1.2227,
"mean_token_accuracy": 0.664169505238533,
"num_tokens": 1288062.0,
"step": 152
},
{
"entropy": 1.1853630542755127,
"epoch": 0.0612061206120612,
"grad_norm": 0.3081769645214081,
"learning_rate": 0.0001995525854920137,
"loss": 1.2009,
"mean_token_accuracy": 0.6692493110895157,
"num_tokens": 1296644.0,
"step": 153
},
{
"entropy": 1.1182245910167694,
"epoch": 0.061606160616061605,
"grad_norm": 0.28534799814224243,
"learning_rate": 0.00019954089882825738,
"loss": 1.0659,
"mean_token_accuracy": 0.7025346755981445,
"num_tokens": 1305683.0,
"step": 154
},
{
"entropy": 1.1886220276355743,
"epoch": 0.062006200620062006,
"grad_norm": 0.3182019293308258,
"learning_rate": 0.0001995290618857253,
"loss": 1.1576,
"mean_token_accuracy": 0.6741877645254135,
"num_tokens": 1314385.0,
"step": 155
},
{
"entropy": 1.2045941054821014,
"epoch": 0.06240624062406241,
"grad_norm": 0.3276945948600769,
"learning_rate": 0.0001995170746842838,
"loss": 1.165,
"mean_token_accuracy": 0.6834963709115982,
"num_tokens": 1322826.0,
"step": 156
},
{
"entropy": 1.2731471955776215,
"epoch": 0.0628062806280628,
"grad_norm": 0.3397105932235718,
"learning_rate": 0.00019950493724405117,
"loss": 1.2985,
"mean_token_accuracy": 0.648296907544136,
"num_tokens": 1331327.0,
"step": 157
},
{
"entropy": 1.1947194337844849,
"epoch": 0.0632063206320632,
"grad_norm": 0.2986201047897339,
"learning_rate": 0.00019949264958539807,
"loss": 1.205,
"mean_token_accuracy": 0.6792440861463547,
"num_tokens": 1340147.0,
"step": 158
},
{
"entropy": 1.1570270955562592,
"epoch": 0.0636063606360636,
"grad_norm": 0.3215077519416809,
"learning_rate": 0.00019948021172894718,
"loss": 1.1681,
"mean_token_accuracy": 0.6815727949142456,
"num_tokens": 1348989.0,
"step": 159
},
{
"entropy": 1.122036024928093,
"epoch": 0.064006400640064,
"grad_norm": 0.3120049238204956,
"learning_rate": 0.00019946762369557323,
"loss": 1.1377,
"mean_token_accuracy": 0.6871893852949142,
"num_tokens": 1357863.0,
"step": 160
},
{
"entropy": 1.2672194242477417,
"epoch": 0.06440644064406441,
"grad_norm": 0.33700302243232727,
"learning_rate": 0.00019945488550640313,
"loss": 1.2532,
"mean_token_accuracy": 0.664255827665329,
"num_tokens": 1365945.0,
"step": 161
},
{
"entropy": 1.1509548127651215,
"epoch": 0.06480648064806481,
"grad_norm": 0.3201735019683838,
"learning_rate": 0.00019944199718281559,
"loss": 1.1387,
"mean_token_accuracy": 0.6814217865467072,
"num_tokens": 1375147.0,
"step": 162
},
{
"entropy": 1.1635609865188599,
"epoch": 0.06520652065206521,
"grad_norm": 0.2953193187713623,
"learning_rate": 0.0001994289587464415,
"loss": 1.1817,
"mean_token_accuracy": 0.6780352145433426,
"num_tokens": 1383893.0,
"step": 163
},
{
"entropy": 1.1869005262851715,
"epoch": 0.06560656065606561,
"grad_norm": 0.30155807733535767,
"learning_rate": 0.00019941577021916355,
"loss": 1.1834,
"mean_token_accuracy": 0.6724350303411484,
"num_tokens": 1392477.0,
"step": 164
},
{
"entropy": 1.1506932377815247,
"epoch": 0.066006600660066,
"grad_norm": 0.31121376156806946,
"learning_rate": 0.00019940243162311642,
"loss": 1.1673,
"mean_token_accuracy": 0.6797937452793121,
"num_tokens": 1400899.0,
"step": 165
},
{
"entropy": 1.2660083770751953,
"epoch": 0.0664066406640664,
"grad_norm": 0.3299071788787842,
"learning_rate": 0.00019938894298068661,
"loss": 1.2725,
"mean_token_accuracy": 0.6537068784236908,
"num_tokens": 1409546.0,
"step": 166
},
{
"entropy": 1.2500199675559998,
"epoch": 0.0668066806680668,
"grad_norm": 0.3030771017074585,
"learning_rate": 0.00019937530431451243,
"loss": 1.1776,
"mean_token_accuracy": 0.6745365858078003,
"num_tokens": 1417712.0,
"step": 167
},
{
"entropy": 1.2582001090049744,
"epoch": 0.0672067206720672,
"grad_norm": 0.30366259813308716,
"learning_rate": 0.00019936151564748403,
"loss": 1.2339,
"mean_token_accuracy": 0.6664343029260635,
"num_tokens": 1426352.0,
"step": 168
},
{
"entropy": 1.2371725142002106,
"epoch": 0.0676067606760676,
"grad_norm": 0.3065868616104126,
"learning_rate": 0.00019934757700274325,
"loss": 1.223,
"mean_token_accuracy": 0.6679128706455231,
"num_tokens": 1434986.0,
"step": 169
},
{
"entropy": 1.2751116156578064,
"epoch": 0.06800680068006801,
"grad_norm": 0.3346325755119324,
"learning_rate": 0.00019933348840368368,
"loss": 1.2569,
"mean_token_accuracy": 0.6594884544610977,
"num_tokens": 1442823.0,
"step": 170
},
{
"entropy": 1.1633991301059723,
"epoch": 0.06840684068406841,
"grad_norm": 0.3242139518260956,
"learning_rate": 0.0001993192498739506,
"loss": 1.1805,
"mean_token_accuracy": 0.6728992164134979,
"num_tokens": 1451134.0,
"step": 171
},
{
"entropy": 1.2180014848709106,
"epoch": 0.06880688068806881,
"grad_norm": 0.3972644507884979,
"learning_rate": 0.0001993048614374409,
"loss": 1.2393,
"mean_token_accuracy": 0.6580066382884979,
"num_tokens": 1459262.0,
"step": 172
},
{
"entropy": 1.1176005005836487,
"epoch": 0.06920692069206921,
"grad_norm": 0.3137458264827728,
"learning_rate": 0.00019929032311830303,
"loss": 1.1644,
"mean_token_accuracy": 0.6814699321985245,
"num_tokens": 1467853.0,
"step": 173
},
{
"entropy": 1.1198759078979492,
"epoch": 0.0696069606960696,
"grad_norm": 0.3517007529735565,
"learning_rate": 0.000199275634940937,
"loss": 1.1312,
"mean_token_accuracy": 0.6874582916498184,
"num_tokens": 1476497.0,
"step": 174
},
{
"entropy": 1.2389306426048279,
"epoch": 0.07000700070007,
"grad_norm": 0.32016775012016296,
"learning_rate": 0.00019926079692999445,
"loss": 1.214,
"mean_token_accuracy": 0.6705743223428726,
"num_tokens": 1484294.0,
"step": 175
},
{
"entropy": 1.3337944746017456,
"epoch": 0.0704070407040704,
"grad_norm": 0.33495742082595825,
"learning_rate": 0.00019924580911037827,
"loss": 1.2954,
"mean_token_accuracy": 0.6510952711105347,
"num_tokens": 1492575.0,
"step": 176
},
{
"entropy": 1.2905775010585785,
"epoch": 0.0708070807080708,
"grad_norm": 0.3236202001571655,
"learning_rate": 0.00019923067150724296,
"loss": 1.219,
"mean_token_accuracy": 0.6705390512943268,
"num_tokens": 1500716.0,
"step": 177
},
{
"entropy": 1.2353481650352478,
"epoch": 0.0712071207120712,
"grad_norm": 0.3262037932872772,
"learning_rate": 0.00019921538414599437,
"loss": 1.2076,
"mean_token_accuracy": 0.6677059978246689,
"num_tokens": 1509105.0,
"step": 178
},
{
"entropy": 1.2299005091190338,
"epoch": 0.07160716071607161,
"grad_norm": 0.3147687315940857,
"learning_rate": 0.00019919994705228965,
"loss": 1.2301,
"mean_token_accuracy": 0.6644129753112793,
"num_tokens": 1516981.0,
"step": 179
},
{
"entropy": 1.1565956473350525,
"epoch": 0.07200720072007201,
"grad_norm": 0.31962037086486816,
"learning_rate": 0.00019918436025203728,
"loss": 1.2013,
"mean_token_accuracy": 0.6825570911169052,
"num_tokens": 1524951.0,
"step": 180
},
{
"entropy": 1.1386863589286804,
"epoch": 0.07240724072407241,
"grad_norm": 0.30647844076156616,
"learning_rate": 0.00019916862377139695,
"loss": 1.1697,
"mean_token_accuracy": 0.6716460883617401,
"num_tokens": 1533450.0,
"step": 181
},
{
"entropy": 1.1206298768520355,
"epoch": 0.07280728072807281,
"grad_norm": 0.2919379472732544,
"learning_rate": 0.00019915273763677959,
"loss": 1.1221,
"mean_token_accuracy": 0.6845085620880127,
"num_tokens": 1542345.0,
"step": 182
},
{
"entropy": 1.1708945035934448,
"epoch": 0.07320732073207321,
"grad_norm": 0.3223237097263336,
"learning_rate": 0.00019913670187484737,
"loss": 1.1722,
"mean_token_accuracy": 0.681228905916214,
"num_tokens": 1551016.0,
"step": 183
},
{
"entropy": 1.1606915593147278,
"epoch": 0.0736073607360736,
"grad_norm": 0.3167206943035126,
"learning_rate": 0.00019912051651251346,
"loss": 1.1381,
"mean_token_accuracy": 0.686376079916954,
"num_tokens": 1560201.0,
"step": 184
},
{
"entropy": 1.2089463472366333,
"epoch": 0.074007400740074,
"grad_norm": 0.331546813249588,
"learning_rate": 0.00019910418157694217,
"loss": 1.1998,
"mean_token_accuracy": 0.6701401472091675,
"num_tokens": 1568847.0,
"step": 185
},
{
"entropy": 1.2552906274795532,
"epoch": 0.0744074407440744,
"grad_norm": 0.3218790292739868,
"learning_rate": 0.00019908769709554887,
"loss": 1.2302,
"mean_token_accuracy": 0.6671873778104782,
"num_tokens": 1577212.0,
"step": 186
},
{
"entropy": 1.0971337109804153,
"epoch": 0.0748074807480748,
"grad_norm": 0.2888547480106354,
"learning_rate": 0.00019907106309599985,
"loss": 1.1053,
"mean_token_accuracy": 0.6914333999156952,
"num_tokens": 1586544.0,
"step": 187
},
{
"entropy": 1.1342568099498749,
"epoch": 0.07520752075207521,
"grad_norm": 0.3135220408439636,
"learning_rate": 0.00019905427960621245,
"loss": 1.1553,
"mean_token_accuracy": 0.678636908531189,
"num_tokens": 1595573.0,
"step": 188
},
{
"entropy": 1.2157914340496063,
"epoch": 0.07560756075607561,
"grad_norm": 0.32912546396255493,
"learning_rate": 0.00019903734665435472,
"loss": 1.2219,
"mean_token_accuracy": 0.6693233996629715,
"num_tokens": 1603723.0,
"step": 189
},
{
"entropy": 1.1541197896003723,
"epoch": 0.07600760076007601,
"grad_norm": 0.31249913573265076,
"learning_rate": 0.00019902026426884574,
"loss": 1.1311,
"mean_token_accuracy": 0.6898495107889175,
"num_tokens": 1612212.0,
"step": 190
},
{
"entropy": 1.211905598640442,
"epoch": 0.07640764076407641,
"grad_norm": 0.3106580078601837,
"learning_rate": 0.00019900303247835527,
"loss": 1.168,
"mean_token_accuracy": 0.675964280962944,
"num_tokens": 1620162.0,
"step": 191
},
{
"entropy": 1.2080174088478088,
"epoch": 0.07680768076807681,
"grad_norm": 0.32318130135536194,
"learning_rate": 0.00019898565131180393,
"loss": 1.1781,
"mean_token_accuracy": 0.6760376244783401,
"num_tokens": 1628883.0,
"step": 192
},
{
"entropy": 1.2078506350517273,
"epoch": 0.0772077207720772,
"grad_norm": 0.33328673243522644,
"learning_rate": 0.0001989681207983629,
"loss": 1.2092,
"mean_token_accuracy": 0.6628051847219467,
"num_tokens": 1637332.0,
"step": 193
},
{
"entropy": 1.210196852684021,
"epoch": 0.0776077607760776,
"grad_norm": 0.32340574264526367,
"learning_rate": 0.00019895044096745416,
"loss": 1.2329,
"mean_token_accuracy": 0.6619292944669724,
"num_tokens": 1645906.0,
"step": 194
},
{
"entropy": 1.1815847158432007,
"epoch": 0.078007800780078,
"grad_norm": 0.3175504505634308,
"learning_rate": 0.00019893261184875016,
"loss": 1.2045,
"mean_token_accuracy": 0.6673628389835358,
"num_tokens": 1654114.0,
"step": 195
},
{
"entropy": 1.1910730004310608,
"epoch": 0.0784078407840784,
"grad_norm": 0.3114391565322876,
"learning_rate": 0.00019891463347217395,
"loss": 1.1889,
"mean_token_accuracy": 0.6714468449354172,
"num_tokens": 1662666.0,
"step": 196
},
{
"entropy": 1.1541639566421509,
"epoch": 0.07880788078807881,
"grad_norm": 0.3364032506942749,
"learning_rate": 0.0001988965058678992,
"loss": 1.1622,
"mean_token_accuracy": 0.67988321185112,
"num_tokens": 1671435.0,
"step": 197
},
{
"entropy": 1.222437858581543,
"epoch": 0.07920792079207921,
"grad_norm": 0.3355000913143158,
"learning_rate": 0.00019887822906634983,
"loss": 1.1804,
"mean_token_accuracy": 0.6725995391607285,
"num_tokens": 1679662.0,
"step": 198
},
{
"entropy": 1.2075644731521606,
"epoch": 0.07960796079607961,
"grad_norm": 0.33377805352211,
"learning_rate": 0.00019885980309820032,
"loss": 1.1547,
"mean_token_accuracy": 0.6831348687410355,
"num_tokens": 1687663.0,
"step": 199
},
{
"entropy": 1.248348981142044,
"epoch": 0.08000800080008001,
"grad_norm": 0.3341095447540283,
"learning_rate": 0.0001988412279943754,
"loss": 1.2665,
"mean_token_accuracy": 0.6561878323554993,
"num_tokens": 1696479.0,
"step": 200
},
{
"entropy": 1.224026381969452,
"epoch": 0.08040804080408041,
"grad_norm": 0.33011487126350403,
"learning_rate": 0.00019882250378605015,
"loss": 1.2181,
"mean_token_accuracy": 0.6664289385080338,
"num_tokens": 1704885.0,
"step": 201
},
{
"entropy": 1.1437757015228271,
"epoch": 0.08080808080808081,
"grad_norm": 0.31265076994895935,
"learning_rate": 0.00019880363050464993,
"loss": 1.1773,
"mean_token_accuracy": 0.6812110096216202,
"num_tokens": 1713409.0,
"step": 202
},
{
"entropy": 1.2059556543827057,
"epoch": 0.0812081208120812,
"grad_norm": 0.315448135137558,
"learning_rate": 0.00019878460818185023,
"loss": 1.2278,
"mean_token_accuracy": 0.6699778735637665,
"num_tokens": 1721548.0,
"step": 203
},
{
"entropy": 1.2078820168972015,
"epoch": 0.0816081608160816,
"grad_norm": 0.3079279363155365,
"learning_rate": 0.00019876543684957667,
"loss": 1.1845,
"mean_token_accuracy": 0.6785111278295517,
"num_tokens": 1729809.0,
"step": 204
},
{
"entropy": 1.199218899011612,
"epoch": 0.082008200820082,
"grad_norm": 0.3043046295642853,
"learning_rate": 0.000198746116540005,
"loss": 1.1722,
"mean_token_accuracy": 0.6754065752029419,
"num_tokens": 1738734.0,
"step": 205
},
{
"entropy": 1.2172024846076965,
"epoch": 0.08240824082408241,
"grad_norm": 0.313902884721756,
"learning_rate": 0.00019872664728556101,
"loss": 1.1869,
"mean_token_accuracy": 0.6728281825780869,
"num_tokens": 1746870.0,
"step": 206
},
{
"entropy": 1.1678736209869385,
"epoch": 0.08280828082808281,
"grad_norm": 0.3191705644130707,
"learning_rate": 0.00019870702911892042,
"loss": 1.1546,
"mean_token_accuracy": 0.6843972355127335,
"num_tokens": 1755295.0,
"step": 207
},
{
"entropy": 1.279354214668274,
"epoch": 0.08320832083208321,
"grad_norm": 0.3313900828361511,
"learning_rate": 0.0001986872620730089,
"loss": 1.2558,
"mean_token_accuracy": 0.659809798002243,
"num_tokens": 1763606.0,
"step": 208
},
{
"entropy": 1.078108698129654,
"epoch": 0.08360836083608361,
"grad_norm": 0.283428430557251,
"learning_rate": 0.00019866734618100202,
"loss": 1.1032,
"mean_token_accuracy": 0.69297856092453,
"num_tokens": 1772887.0,
"step": 209
},
{
"entropy": 1.186295509338379,
"epoch": 0.08400840084008401,
"grad_norm": 0.35003766417503357,
"learning_rate": 0.0001986472814763251,
"loss": 1.2374,
"mean_token_accuracy": 0.6684627532958984,
"num_tokens": 1781067.0,
"step": 210
},
{
"entropy": 1.1557523012161255,
"epoch": 0.08440844084408441,
"grad_norm": 0.31848254799842834,
"learning_rate": 0.00019862706799265322,
"loss": 1.1854,
"mean_token_accuracy": 0.6773674935102463,
"num_tokens": 1789844.0,
"step": 211
},
{
"entropy": 1.218627154827118,
"epoch": 0.0848084808480848,
"grad_norm": 0.3408789038658142,
"learning_rate": 0.00019860670576391128,
"loss": 1.1708,
"mean_token_accuracy": 0.6817043423652649,
"num_tokens": 1798509.0,
"step": 212
},
{
"entropy": 1.2130761444568634,
"epoch": 0.0852085208520852,
"grad_norm": 0.7527572512626648,
"learning_rate": 0.0001985861948242736,
"loss": 1.2157,
"mean_token_accuracy": 0.6661449372768402,
"num_tokens": 1807202.0,
"step": 213
},
{
"entropy": 1.2128455638885498,
"epoch": 0.0856085608560856,
"grad_norm": 0.29946374893188477,
"learning_rate": 0.00019856553520816435,
"loss": 1.1896,
"mean_token_accuracy": 0.6733538210391998,
"num_tokens": 1816131.0,
"step": 214
},
{
"entropy": 1.2612944841384888,
"epoch": 0.086008600860086,
"grad_norm": 0.32515719532966614,
"learning_rate": 0.00019854472695025698,
"loss": 1.2329,
"mean_token_accuracy": 0.669788658618927,
"num_tokens": 1824283.0,
"step": 215
},
{
"entropy": 1.1807590425014496,
"epoch": 0.08640864086408641,
"grad_norm": 0.3279406726360321,
"learning_rate": 0.0001985237700854746,
"loss": 1.1565,
"mean_token_accuracy": 0.6816118210554123,
"num_tokens": 1833322.0,
"step": 216
},
{
"entropy": 1.2046120464801788,
"epoch": 0.08680868086808681,
"grad_norm": 0.2987005412578583,
"learning_rate": 0.00019850266464898955,
"loss": 1.179,
"mean_token_accuracy": 0.6783045381307602,
"num_tokens": 1842092.0,
"step": 217
},
{
"entropy": 1.1976227462291718,
"epoch": 0.08720872087208721,
"grad_norm": 0.30504319071769714,
"learning_rate": 0.00019848141067622374,
"loss": 1.1589,
"mean_token_accuracy": 0.6762242764234543,
"num_tokens": 1850740.0,
"step": 218
},
{
"entropy": 1.2001455426216125,
"epoch": 0.08760876087608761,
"grad_norm": 0.35163310170173645,
"learning_rate": 0.0001984600082028482,
"loss": 1.1941,
"mean_token_accuracy": 0.6701504737138748,
"num_tokens": 1858729.0,
"step": 219
},
{
"entropy": 1.0998838245868683,
"epoch": 0.08800880088008801,
"grad_norm": 0.3166980445384979,
"learning_rate": 0.0001984384572647832,
"loss": 1.1238,
"mean_token_accuracy": 0.683118149638176,
"num_tokens": 1867218.0,
"step": 220
},
{
"entropy": 1.1223637461662292,
"epoch": 0.0884088408840884,
"grad_norm": 0.3210962116718292,
"learning_rate": 0.0001984167578981983,
"loss": 1.158,
"mean_token_accuracy": 0.685064285993576,
"num_tokens": 1875656.0,
"step": 221
},
{
"entropy": 1.1469238698482513,
"epoch": 0.0888088808880888,
"grad_norm": 0.37055703997612,
"learning_rate": 0.00019839491013951213,
"loss": 1.1976,
"mean_token_accuracy": 0.66952283680439,
"num_tokens": 1884042.0,
"step": 222
},
{
"entropy": 1.2010729908943176,
"epoch": 0.0892089208920892,
"grad_norm": 0.30089443922042847,
"learning_rate": 0.00019837291402539223,
"loss": 1.1677,
"mean_token_accuracy": 0.6765223145484924,
"num_tokens": 1892519.0,
"step": 223
},
{
"entropy": 1.222718983888626,
"epoch": 0.0896089608960896,
"grad_norm": 0.3071632981300354,
"learning_rate": 0.00019835076959275532,
"loss": 1.1918,
"mean_token_accuracy": 0.6696299612522125,
"num_tokens": 1900924.0,
"step": 224
},
{
"entropy": 1.216365933418274,
"epoch": 0.09000900090009001,
"grad_norm": 0.3337574303150177,
"learning_rate": 0.00019832847687876692,
"loss": 1.1572,
"mean_token_accuracy": 0.6832773238420486,
"num_tokens": 1909276.0,
"step": 225
},
{
"entropy": 1.1910041272640228,
"epoch": 0.09040904090409041,
"grad_norm": 0.3146218955516815,
"learning_rate": 0.0001983060359208415,
"loss": 1.1782,
"mean_token_accuracy": 0.679167777299881,
"num_tokens": 1918407.0,
"step": 226
},
{
"entropy": 1.162790209054947,
"epoch": 0.09080908090809081,
"grad_norm": 0.2975619435310364,
"learning_rate": 0.0001982834467566423,
"loss": 1.1683,
"mean_token_accuracy": 0.6799277067184448,
"num_tokens": 1927282.0,
"step": 227
},
{
"entropy": 1.192271113395691,
"epoch": 0.09120912091209121,
"grad_norm": 0.3205324113368988,
"learning_rate": 0.0001982607094240813,
"loss": 1.1681,
"mean_token_accuracy": 0.6754294186830521,
"num_tokens": 1935737.0,
"step": 228
},
{
"entropy": 1.1858693957328796,
"epoch": 0.09160916091609161,
"grad_norm": 0.3366444706916809,
"learning_rate": 0.00019823782396131902,
"loss": 1.1944,
"mean_token_accuracy": 0.6657039225101471,
"num_tokens": 1943472.0,
"step": 229
},
{
"entropy": 1.1361185312271118,
"epoch": 0.09200920092009202,
"grad_norm": 0.31257081031799316,
"learning_rate": 0.00019821479040676488,
"loss": 1.1529,
"mean_token_accuracy": 0.6812857985496521,
"num_tokens": 1952251.0,
"step": 230
},
{
"entropy": 1.2052267491817474,
"epoch": 0.0924092409240924,
"grad_norm": 0.3371609151363373,
"learning_rate": 0.0001981916087990766,
"loss": 1.2363,
"mean_token_accuracy": 0.6580934226512909,
"num_tokens": 1960349.0,
"step": 231
},
{
"entropy": 1.1373478174209595,
"epoch": 0.0928092809280928,
"grad_norm": 0.30473393201828003,
"learning_rate": 0.00019816827917716048,
"loss": 1.1727,
"mean_token_accuracy": 0.6796131581068039,
"num_tokens": 1969233.0,
"step": 232
},
{
"entropy": 1.1681481301784515,
"epoch": 0.0932093209320932,
"grad_norm": 0.3225601315498352,
"learning_rate": 0.0001981448015801712,
"loss": 1.1528,
"mean_token_accuracy": 0.6749817878007889,
"num_tokens": 1977270.0,
"step": 233
},
{
"entropy": 1.2196559309959412,
"epoch": 0.09360936093609361,
"grad_norm": 0.33247852325439453,
"learning_rate": 0.00019812117604751185,
"loss": 1.1834,
"mean_token_accuracy": 0.6816778779029846,
"num_tokens": 1985087.0,
"step": 234
},
{
"entropy": 1.218104362487793,
"epoch": 0.09400940094009401,
"grad_norm": 0.3164643347263336,
"learning_rate": 0.00019809740261883372,
"loss": 1.1791,
"mean_token_accuracy": 0.6742540150880814,
"num_tokens": 1993142.0,
"step": 235
},
{
"entropy": 1.2172793745994568,
"epoch": 0.09440944094409441,
"grad_norm": 0.31248074769973755,
"learning_rate": 0.0001980734813340364,
"loss": 1.2067,
"mean_token_accuracy": 0.6745200008153915,
"num_tokens": 2001487.0,
"step": 236
},
{
"entropy": 1.203236162662506,
"epoch": 0.09480948094809481,
"grad_norm": 0.32407742738723755,
"learning_rate": 0.0001980494122332676,
"loss": 1.1664,
"mean_token_accuracy": 0.6777038276195526,
"num_tokens": 2010136.0,
"step": 237
},
{
"entropy": 1.1953341364860535,
"epoch": 0.09520952095209521,
"grad_norm": 0.3571881651878357,
"learning_rate": 0.00019802519535692302,
"loss": 1.1651,
"mean_token_accuracy": 0.6782020479440689,
"num_tokens": 2018515.0,
"step": 238
},
{
"entropy": 1.208018183708191,
"epoch": 0.09560956095609562,
"grad_norm": 0.3488442599773407,
"learning_rate": 0.00019800083074564658,
"loss": 1.2217,
"mean_token_accuracy": 0.6720796823501587,
"num_tokens": 2026942.0,
"step": 239
},
{
"entropy": 1.1499423384666443,
"epoch": 0.096009600960096,
"grad_norm": 0.30266088247299194,
"learning_rate": 0.00019797631844032992,
"loss": 1.1776,
"mean_token_accuracy": 0.6771319806575775,
"num_tokens": 2035674.0,
"step": 240
},
{
"entropy": 1.1237535774707794,
"epoch": 0.0964096409640964,
"grad_norm": 0.3096405863761902,
"learning_rate": 0.00019795165848211278,
"loss": 1.1122,
"mean_token_accuracy": 0.6934310793876648,
"num_tokens": 2044052.0,
"step": 241
},
{
"entropy": 1.1529573500156403,
"epoch": 0.0968096809680968,
"grad_norm": 0.3192532956600189,
"learning_rate": 0.0001979268509123825,
"loss": 1.1804,
"mean_token_accuracy": 0.6760334223508835,
"num_tokens": 2052448.0,
"step": 242
},
{
"entropy": 1.2383974194526672,
"epoch": 0.09720972097209721,
"grad_norm": 0.3160487711429596,
"learning_rate": 0.00019790189577277432,
"loss": 1.2465,
"mean_token_accuracy": 0.6652619689702988,
"num_tokens": 2060776.0,
"step": 243
},
{
"entropy": 1.2161905169487,
"epoch": 0.09760976097609761,
"grad_norm": 0.32217562198638916,
"learning_rate": 0.00019787679310517107,
"loss": 1.1872,
"mean_token_accuracy": 0.6732243746519089,
"num_tokens": 2068794.0,
"step": 244
},
{
"entropy": 1.1646412014961243,
"epoch": 0.09800980098009801,
"grad_norm": 0.3009166419506073,
"learning_rate": 0.00019785154295170316,
"loss": 1.1652,
"mean_token_accuracy": 0.6807472556829453,
"num_tokens": 2077262.0,
"step": 245
},
{
"entropy": 1.2155237197875977,
"epoch": 0.09840984098409841,
"grad_norm": 0.3069799840450287,
"learning_rate": 0.00019782614535474862,
"loss": 1.216,
"mean_token_accuracy": 0.6698369234800339,
"num_tokens": 2085649.0,
"step": 246
},
{
"entropy": 1.1119366884231567,
"epoch": 0.09880988098809881,
"grad_norm": 0.30247923731803894,
"learning_rate": 0.00019780060035693285,
"loss": 1.1038,
"mean_token_accuracy": 0.6942414045333862,
"num_tokens": 2094198.0,
"step": 247
},
{
"entropy": 1.2534517645835876,
"epoch": 0.09920992099209922,
"grad_norm": 0.3274390697479248,
"learning_rate": 0.0001977749080011287,
"loss": 1.2635,
"mean_token_accuracy": 0.6554094851016998,
"num_tokens": 2102101.0,
"step": 248
},
{
"entropy": 1.1967229545116425,
"epoch": 0.09960996099609962,
"grad_norm": 0.29584378004074097,
"learning_rate": 0.00019774906833045625,
"loss": 1.1822,
"mean_token_accuracy": 0.6769470870494843,
"num_tokens": 2110466.0,
"step": 249
},
{
"entropy": 1.1380691528320312,
"epoch": 0.1000100010001,
"grad_norm": 0.28823035955429077,
"learning_rate": 0.00019772308138828299,
"loss": 1.0987,
"mean_token_accuracy": 0.6907877773046494,
"num_tokens": 2119656.0,
"step": 250
},
{
"entropy": 1.155064195394516,
"epoch": 0.1004100410041004,
"grad_norm": 0.3187693655490875,
"learning_rate": 0.00019769694721822337,
"loss": 1.1542,
"mean_token_accuracy": 0.6734511256217957,
"num_tokens": 2128073.0,
"step": 251
},
{
"entropy": 1.1665138900279999,
"epoch": 0.10081008100810081,
"grad_norm": 0.30443915724754333,
"learning_rate": 0.00019767066586413905,
"loss": 1.2047,
"mean_token_accuracy": 0.6689727902412415,
"num_tokens": 2136624.0,
"step": 252
},
{
"entropy": 1.1986846625804901,
"epoch": 0.10121012101210121,
"grad_norm": 0.2993563413619995,
"learning_rate": 0.0001976442373701387,
"loss": 1.1885,
"mean_token_accuracy": 0.6774641126394272,
"num_tokens": 2144946.0,
"step": 253
},
{
"entropy": 1.1575412154197693,
"epoch": 0.10161016101610161,
"grad_norm": 0.31819280982017517,
"learning_rate": 0.00019761766178057796,
"loss": 1.1617,
"mean_token_accuracy": 0.6737077832221985,
"num_tokens": 2153241.0,
"step": 254
},
{
"entropy": 1.1932867169380188,
"epoch": 0.10201020102010201,
"grad_norm": 0.33500298857688904,
"learning_rate": 0.00019759093914005932,
"loss": 1.1739,
"mean_token_accuracy": 0.6722579598426819,
"num_tokens": 2161532.0,
"step": 255
},
{
"entropy": 1.2010496854782104,
"epoch": 0.10241024102410241,
"grad_norm": 0.3177407681941986,
"learning_rate": 0.00019756406949343204,
"loss": 1.1888,
"mean_token_accuracy": 0.6757108420133591,
"num_tokens": 2170296.0,
"step": 256
},
{
"entropy": 1.1958762109279633,
"epoch": 0.10281028102810282,
"grad_norm": 0.30990293622016907,
"learning_rate": 0.00019753705288579217,
"loss": 1.1797,
"mean_token_accuracy": 0.6757787764072418,
"num_tokens": 2178618.0,
"step": 257
},
{
"entropy": 1.1743170619010925,
"epoch": 0.10321032103210322,
"grad_norm": 0.3038559854030609,
"learning_rate": 0.00019750988936248235,
"loss": 1.169,
"mean_token_accuracy": 0.6733282506465912,
"num_tokens": 2187168.0,
"step": 258
},
{
"entropy": 1.1737709939479828,
"epoch": 0.1036103610361036,
"grad_norm": 0.321360319852829,
"learning_rate": 0.0001974825789690918,
"loss": 1.1957,
"mean_token_accuracy": 0.6770029366016388,
"num_tokens": 2195246.0,
"step": 259
},
{
"entropy": 1.172276645898819,
"epoch": 0.104010401040104,
"grad_norm": 0.3069777488708496,
"learning_rate": 0.00019745512175145627,
"loss": 1.2094,
"mean_token_accuracy": 0.6666506826877594,
"num_tokens": 2203717.0,
"step": 260
},
{
"entropy": 1.3047214448451996,
"epoch": 0.10441044104410441,
"grad_norm": 0.3076897859573364,
"learning_rate": 0.0001974275177556579,
"loss": 1.301,
"mean_token_accuracy": 0.6500514298677444,
"num_tokens": 2212037.0,
"step": 261
},
{
"entropy": 1.1853089034557343,
"epoch": 0.10481048104810481,
"grad_norm": 0.30814552307128906,
"learning_rate": 0.00019739976702802517,
"loss": 1.121,
"mean_token_accuracy": 0.6797177791595459,
"num_tokens": 2220415.0,
"step": 262
},
{
"entropy": 1.14727121591568,
"epoch": 0.10521052105210521,
"grad_norm": 0.3139231503009796,
"learning_rate": 0.0001973718696151329,
"loss": 1.0951,
"mean_token_accuracy": 0.6984894424676895,
"num_tokens": 2228773.0,
"step": 263
},
{
"entropy": 1.1453731060028076,
"epoch": 0.10561056105610561,
"grad_norm": 0.3104467988014221,
"learning_rate": 0.00019734382556380194,
"loss": 1.145,
"mean_token_accuracy": 0.6833966672420502,
"num_tokens": 2236602.0,
"step": 264
},
{
"entropy": 1.129274994134903,
"epoch": 0.10601060106010601,
"grad_norm": 0.29663506150245667,
"learning_rate": 0.0001973156349210994,
"loss": 1.1386,
"mean_token_accuracy": 0.6783726066350937,
"num_tokens": 2245313.0,
"step": 265
},
{
"entropy": 1.1950629949569702,
"epoch": 0.10641064106410641,
"grad_norm": 0.3033241033554077,
"learning_rate": 0.0001972872977343383,
"loss": 1.2095,
"mean_token_accuracy": 0.6765413582324982,
"num_tokens": 2254362.0,
"step": 266
},
{
"entropy": 1.2014857530593872,
"epoch": 0.10681068106810682,
"grad_norm": 0.31535446643829346,
"learning_rate": 0.00019725881405107778,
"loss": 1.2053,
"mean_token_accuracy": 0.6713583916425705,
"num_tokens": 2262331.0,
"step": 267
},
{
"entropy": 1.1801405549049377,
"epoch": 0.1072107210721072,
"grad_norm": 0.30611008405685425,
"learning_rate": 0.0001972301839191226,
"loss": 1.1823,
"mean_token_accuracy": 0.6748154610395432,
"num_tokens": 2270765.0,
"step": 268
},
{
"entropy": 1.1290169060230255,
"epoch": 0.1076107610761076,
"grad_norm": 0.30215638875961304,
"learning_rate": 0.00019720140738652345,
"loss": 1.1209,
"mean_token_accuracy": 0.6912433356046677,
"num_tokens": 2279593.0,
"step": 269
},
{
"entropy": 1.1610883474349976,
"epoch": 0.10801080108010801,
"grad_norm": 0.30377084016799927,
"learning_rate": 0.00019717248450157681,
"loss": 1.1863,
"mean_token_accuracy": 0.6740070879459381,
"num_tokens": 2288100.0,
"step": 270
},
{
"entropy": 1.1068450212478638,
"epoch": 0.10841084108410841,
"grad_norm": 0.3132963478565216,
"learning_rate": 0.00019714341531282462,
"loss": 1.0841,
"mean_token_accuracy": 0.6911667734384537,
"num_tokens": 2296290.0,
"step": 271
},
{
"entropy": 1.168148934841156,
"epoch": 0.10881088108810881,
"grad_norm": 0.3282947242259979,
"learning_rate": 0.0001971141998690545,
"loss": 1.1941,
"mean_token_accuracy": 0.673908457159996,
"num_tokens": 2304766.0,
"step": 272
},
{
"entropy": 1.1689501702785492,
"epoch": 0.10921092109210921,
"grad_norm": 0.2957140803337097,
"learning_rate": 0.00019708483821929943,
"loss": 1.1398,
"mean_token_accuracy": 0.6831405013799667,
"num_tokens": 2313114.0,
"step": 273
},
{
"entropy": 1.1905297338962555,
"epoch": 0.10961096109610961,
"grad_norm": 0.29807668924331665,
"learning_rate": 0.00019705533041283779,
"loss": 1.1736,
"mean_token_accuracy": 0.6775653660297394,
"num_tokens": 2321660.0,
"step": 274
},
{
"entropy": 1.1815482079982758,
"epoch": 0.11001100110011001,
"grad_norm": 0.29083186388015747,
"learning_rate": 0.00019702567649919337,
"loss": 1.1603,
"mean_token_accuracy": 0.6754807829856873,
"num_tokens": 2330342.0,
"step": 275
},
{
"entropy": 1.1261299550533295,
"epoch": 0.11041104110411042,
"grad_norm": 0.2901794910430908,
"learning_rate": 0.00019699587652813503,
"loss": 1.1284,
"mean_token_accuracy": 0.691281333565712,
"num_tokens": 2338852.0,
"step": 276
},
{
"entropy": 1.184859186410904,
"epoch": 0.11081108110811082,
"grad_norm": 0.310745507478714,
"learning_rate": 0.00019696593054967682,
"loss": 1.2127,
"mean_token_accuracy": 0.6673152446746826,
"num_tokens": 2346809.0,
"step": 277
},
{
"entropy": 1.1188380122184753,
"epoch": 0.1112111211121112,
"grad_norm": 0.29587554931640625,
"learning_rate": 0.00019693583861407786,
"loss": 1.0981,
"mean_token_accuracy": 0.6947813928127289,
"num_tokens": 2355532.0,
"step": 278
},
{
"entropy": 1.172318309545517,
"epoch": 0.1116111611161116,
"grad_norm": 0.3138435482978821,
"learning_rate": 0.00019690560077184223,
"loss": 1.1441,
"mean_token_accuracy": 0.6789282411336899,
"num_tokens": 2363938.0,
"step": 279
},
{
"entropy": 1.1374418139457703,
"epoch": 0.11201120112011201,
"grad_norm": 0.34152451157569885,
"learning_rate": 0.0001968752170737188,
"loss": 1.1081,
"mean_token_accuracy": 0.6848500221967697,
"num_tokens": 2372334.0,
"step": 280
},
{
"entropy": 1.1317946314811707,
"epoch": 0.11241124112411241,
"grad_norm": 0.29949530959129333,
"learning_rate": 0.0001968446875707014,
"loss": 1.1138,
"mean_token_accuracy": 0.6870416551828384,
"num_tokens": 2380730.0,
"step": 281
},
{
"entropy": 1.0892143547534943,
"epoch": 0.11281128112811281,
"grad_norm": 0.3009011447429657,
"learning_rate": 0.00019681401231402842,
"loss": 1.0712,
"mean_token_accuracy": 0.6998904794454575,
"num_tokens": 2389463.0,
"step": 282
},
{
"entropy": 1.1513322591781616,
"epoch": 0.11321132113211321,
"grad_norm": 0.29763105511665344,
"learning_rate": 0.00019678319135518294,
"loss": 1.1861,
"mean_token_accuracy": 0.6697124987840652,
"num_tokens": 2398473.0,
"step": 283
},
{
"entropy": 1.1688634753227234,
"epoch": 0.11361136113611361,
"grad_norm": 0.33001646399497986,
"learning_rate": 0.00019675222474589257,
"loss": 1.2012,
"mean_token_accuracy": 0.673338770866394,
"num_tokens": 2406493.0,
"step": 284
},
{
"entropy": 1.1393934190273285,
"epoch": 0.11401140114011402,
"grad_norm": 0.2978336215019226,
"learning_rate": 0.00019672111253812933,
"loss": 1.1566,
"mean_token_accuracy": 0.6849386692047119,
"num_tokens": 2414963.0,
"step": 285
},
{
"entropy": 1.1978220045566559,
"epoch": 0.11441144114411442,
"grad_norm": 0.296939879655838,
"learning_rate": 0.00019668985478410968,
"loss": 1.1508,
"mean_token_accuracy": 0.6871092170476913,
"num_tokens": 2423476.0,
"step": 286
},
{
"entropy": 1.1493785977363586,
"epoch": 0.1148114811481148,
"grad_norm": 0.3038109242916107,
"learning_rate": 0.00019665845153629425,
"loss": 1.1429,
"mean_token_accuracy": 0.6873074918985367,
"num_tokens": 2432015.0,
"step": 287
},
{
"entropy": 1.1764490902423859,
"epoch": 0.1152115211521152,
"grad_norm": 0.28137773275375366,
"learning_rate": 0.00019662690284738793,
"loss": 1.1206,
"mean_token_accuracy": 0.6875211298465729,
"num_tokens": 2440577.0,
"step": 288
},
{
"entropy": 1.1811064779758453,
"epoch": 0.11561156115611561,
"grad_norm": 0.2927968502044678,
"learning_rate": 0.00019659520877033976,
"loss": 1.1828,
"mean_token_accuracy": 0.67679663002491,
"num_tokens": 2449585.0,
"step": 289
},
{
"entropy": 1.1157205402851105,
"epoch": 0.11601160116011601,
"grad_norm": 0.2844160199165344,
"learning_rate": 0.0001965633693583426,
"loss": 1.1127,
"mean_token_accuracy": 0.6861093044281006,
"num_tokens": 2458691.0,
"step": 290
},
{
"entropy": 1.1210555136203766,
"epoch": 0.11641164116411641,
"grad_norm": 0.30678603053092957,
"learning_rate": 0.0001965313846648334,
"loss": 1.1495,
"mean_token_accuracy": 0.6870106756687164,
"num_tokens": 2466917.0,
"step": 291
},
{
"entropy": 1.1256535351276398,
"epoch": 0.11681168116811681,
"grad_norm": 0.31176719069480896,
"learning_rate": 0.00019649925474349292,
"loss": 1.1516,
"mean_token_accuracy": 0.679766371846199,
"num_tokens": 2475064.0,
"step": 292
},
{
"entropy": 1.1276935040950775,
"epoch": 0.11721172117211721,
"grad_norm": 0.29645654559135437,
"learning_rate": 0.00019646697964824562,
"loss": 1.1372,
"mean_token_accuracy": 0.6837837547063828,
"num_tokens": 2483736.0,
"step": 293
},
{
"entropy": 1.1446107029914856,
"epoch": 0.11761176117611762,
"grad_norm": 0.2959735691547394,
"learning_rate": 0.00019643455943325953,
"loss": 1.1344,
"mean_token_accuracy": 0.6885244697332382,
"num_tokens": 2492223.0,
"step": 294
},
{
"entropy": 1.1486328840255737,
"epoch": 0.11801180118011802,
"grad_norm": 0.35478872060775757,
"learning_rate": 0.00019640199415294645,
"loss": 1.1195,
"mean_token_accuracy": 0.6887603253126144,
"num_tokens": 2500600.0,
"step": 295
},
{
"entropy": 1.126534789800644,
"epoch": 0.11841184118411842,
"grad_norm": 0.2932710349559784,
"learning_rate": 0.00019636928386196145,
"loss": 1.1047,
"mean_token_accuracy": 0.696495532989502,
"num_tokens": 2509047.0,
"step": 296
},
{
"entropy": 1.1546699106693268,
"epoch": 0.1188118811881188,
"grad_norm": 0.2861276865005493,
"learning_rate": 0.00019633642861520306,
"loss": 1.1463,
"mean_token_accuracy": 0.6796572506427765,
"num_tokens": 2517885.0,
"step": 297
},
{
"entropy": 1.1594507992267609,
"epoch": 0.11921192119211921,
"grad_norm": 0.5982229709625244,
"learning_rate": 0.0001963034284678131,
"loss": 1.1527,
"mean_token_accuracy": 0.6782443970441818,
"num_tokens": 2525962.0,
"step": 298
},
{
"entropy": 1.1879192888736725,
"epoch": 0.11961196119611961,
"grad_norm": 0.30875492095947266,
"learning_rate": 0.00019627028347517648,
"loss": 1.1854,
"mean_token_accuracy": 0.675933450460434,
"num_tokens": 2534220.0,
"step": 299
},
{
"entropy": 1.1593869030475616,
"epoch": 0.12001200120012001,
"grad_norm": 0.3053128719329834,
"learning_rate": 0.00019623699369292137,
"loss": 1.1617,
"mean_token_accuracy": 0.677645817399025,
"num_tokens": 2542206.0,
"step": 300
},
{
"entropy": 1.1326042711734772,
"epoch": 0.12041204120412041,
"grad_norm": 0.3102218508720398,
"learning_rate": 0.00019620355917691884,
"loss": 1.1384,
"mean_token_accuracy": 0.6767238080501556,
"num_tokens": 2550584.0,
"step": 301
},
{
"entropy": 1.1040166020393372,
"epoch": 0.12081208120812081,
"grad_norm": 0.3166041970252991,
"learning_rate": 0.00019616997998328292,
"loss": 1.1206,
"mean_token_accuracy": 0.6878381818532944,
"num_tokens": 2558969.0,
"step": 302
},
{
"entropy": 1.1306456625461578,
"epoch": 0.12121212121212122,
"grad_norm": 0.31803345680236816,
"learning_rate": 0.00019613625616837034,
"loss": 1.1286,
"mean_token_accuracy": 0.6829645335674286,
"num_tokens": 2567510.0,
"step": 303
},
{
"entropy": 1.2087586522102356,
"epoch": 0.12161216121612162,
"grad_norm": 0.313399076461792,
"learning_rate": 0.0001961023877887807,
"loss": 1.2,
"mean_token_accuracy": 0.6653729230165482,
"num_tokens": 2575393.0,
"step": 304
},
{
"entropy": 1.1803353130817413,
"epoch": 0.12201220122012202,
"grad_norm": 0.2919938862323761,
"learning_rate": 0.0001960683749013562,
"loss": 1.1749,
"mean_token_accuracy": 0.6795784384012222,
"num_tokens": 2583973.0,
"step": 305
},
{
"entropy": 1.206252634525299,
"epoch": 0.1224122412241224,
"grad_norm": 0.30734333395957947,
"learning_rate": 0.00019603421756318146,
"loss": 1.2079,
"mean_token_accuracy": 0.6748498380184174,
"num_tokens": 2592413.0,
"step": 306
},
{
"entropy": 1.1237642168998718,
"epoch": 0.12281228122812281,
"grad_norm": 0.2940463721752167,
"learning_rate": 0.00019599991583158367,
"loss": 1.0924,
"mean_token_accuracy": 0.6870536357164383,
"num_tokens": 2601189.0,
"step": 307
},
{
"entropy": 1.1055436730384827,
"epoch": 0.12321232123212321,
"grad_norm": 0.2887219488620758,
"learning_rate": 0.00019596546976413226,
"loss": 1.1143,
"mean_token_accuracy": 0.6970756649971008,
"num_tokens": 2610378.0,
"step": 308
},
{
"entropy": 1.1455924063920975,
"epoch": 0.12361236123612361,
"grad_norm": 0.30642586946487427,
"learning_rate": 0.00019593087941863893,
"loss": 1.1163,
"mean_token_accuracy": 0.6846802532672882,
"num_tokens": 2618765.0,
"step": 309
},
{
"entropy": 1.1495613157749176,
"epoch": 0.12401240124012401,
"grad_norm": 0.2958558201789856,
"learning_rate": 0.00019589614485315766,
"loss": 1.1277,
"mean_token_accuracy": 0.692332923412323,
"num_tokens": 2627306.0,
"step": 310
},
{
"entropy": 1.1369233131408691,
"epoch": 0.12441244124412441,
"grad_norm": 0.2962513566017151,
"learning_rate": 0.0001958612661259842,
"loss": 1.1458,
"mean_token_accuracy": 0.6847312748432159,
"num_tokens": 2635802.0,
"step": 311
},
{
"entropy": 1.1192970275878906,
"epoch": 0.12481248124812482,
"grad_norm": 0.3100016117095947,
"learning_rate": 0.00019582624329565656,
"loss": 1.1479,
"mean_token_accuracy": 0.679630234837532,
"num_tokens": 2644316.0,
"step": 312
},
{
"entropy": 1.1962910890579224,
"epoch": 0.1252125212521252,
"grad_norm": 0.3248625099658966,
"learning_rate": 0.0001957910764209543,
"loss": 1.2285,
"mean_token_accuracy": 0.6648171693086624,
"num_tokens": 2652787.0,
"step": 313
},
{
"entropy": 1.1034400761127472,
"epoch": 0.1256125612561256,
"grad_norm": 0.2892885208129883,
"learning_rate": 0.00019575576556089897,
"loss": 1.1218,
"mean_token_accuracy": 0.685823604464531,
"num_tokens": 2661638.0,
"step": 314
},
{
"entropy": 1.1764290630817413,
"epoch": 0.126012601260126,
"grad_norm": 0.2998030483722687,
"learning_rate": 0.00019572031077475367,
"loss": 1.0975,
"mean_token_accuracy": 0.6871052384376526,
"num_tokens": 2670313.0,
"step": 315
},
{
"entropy": 1.2649544775485992,
"epoch": 0.1264126412641264,
"grad_norm": 0.31360095739364624,
"learning_rate": 0.0001956847121220231,
"loss": 1.2167,
"mean_token_accuracy": 0.660548061132431,
"num_tokens": 2678587.0,
"step": 316
},
{
"entropy": 1.1531548500061035,
"epoch": 0.1268126812681268,
"grad_norm": 0.3179381787776947,
"learning_rate": 0.0001956489696624533,
"loss": 1.1596,
"mean_token_accuracy": 0.6832859367132187,
"num_tokens": 2686845.0,
"step": 317
},
{
"entropy": 1.1491257846355438,
"epoch": 0.1272127212721272,
"grad_norm": 0.3010673224925995,
"learning_rate": 0.00019561308345603188,
"loss": 1.1856,
"mean_token_accuracy": 0.6756436675786972,
"num_tokens": 2695519.0,
"step": 318
},
{
"entropy": 1.099882572889328,
"epoch": 0.1276127612761276,
"grad_norm": 0.3057318925857544,
"learning_rate": 0.0001955770535629875,
"loss": 1.1369,
"mean_token_accuracy": 0.6802153438329697,
"num_tokens": 2704317.0,
"step": 319
},
{
"entropy": 1.1104555130004883,
"epoch": 0.128012801280128,
"grad_norm": 0.30537816882133484,
"learning_rate": 0.00019554088004379,
"loss": 1.0916,
"mean_token_accuracy": 0.6971182078123093,
"num_tokens": 2712576.0,
"step": 320
},
{
"entropy": 1.1894198954105377,
"epoch": 0.12841284128412842,
"grad_norm": 0.2941950261592865,
"learning_rate": 0.00019550456295915042,
"loss": 1.1728,
"mean_token_accuracy": 0.6762441992759705,
"num_tokens": 2721000.0,
"step": 321
},
{
"entropy": 1.1880941092967987,
"epoch": 0.12881288128812882,
"grad_norm": 0.3045370280742645,
"learning_rate": 0.00019546810237002066,
"loss": 1.1695,
"mean_token_accuracy": 0.6775896400213242,
"num_tokens": 2729281.0,
"step": 322
},
{
"entropy": 1.1603459417819977,
"epoch": 0.12921292129212922,
"grad_norm": 0.29477667808532715,
"learning_rate": 0.00019543149833759334,
"loss": 1.13,
"mean_token_accuracy": 0.6883135735988617,
"num_tokens": 2737775.0,
"step": 323
},
{
"entropy": 1.148952156305313,
"epoch": 0.12961296129612962,
"grad_norm": 0.2921348214149475,
"learning_rate": 0.000195394750923302,
"loss": 1.1492,
"mean_token_accuracy": 0.6808929741382599,
"num_tokens": 2746681.0,
"step": 324
},
{
"entropy": 1.2179997265338898,
"epoch": 0.13001300130013002,
"grad_norm": 0.3009890019893646,
"learning_rate": 0.0001953578601888208,
"loss": 1.2338,
"mean_token_accuracy": 0.6610979735851288,
"num_tokens": 2755045.0,
"step": 325
},
{
"entropy": 1.2134989798069,
"epoch": 0.13041304130413042,
"grad_norm": 0.3033868968486786,
"learning_rate": 0.00019532082619606436,
"loss": 1.2165,
"mean_token_accuracy": 0.6606318801641464,
"num_tokens": 2763287.0,
"step": 326
},
{
"entropy": 1.0881072580814362,
"epoch": 0.13081308130813082,
"grad_norm": 0.2861042022705078,
"learning_rate": 0.0001952836490071878,
"loss": 1.0643,
"mean_token_accuracy": 0.6997469067573547,
"num_tokens": 2772109.0,
"step": 327
},
{
"entropy": 1.2652019262313843,
"epoch": 0.13121312131213123,
"grad_norm": 0.3063291311264038,
"learning_rate": 0.00019524632868458649,
"loss": 1.2374,
"mean_token_accuracy": 0.6631722450256348,
"num_tokens": 2780001.0,
"step": 328
},
{
"entropy": 1.1232223510742188,
"epoch": 0.1316131613161316,
"grad_norm": 0.2938007712364197,
"learning_rate": 0.00019520886529089616,
"loss": 1.1047,
"mean_token_accuracy": 0.6943131983280182,
"num_tokens": 2788572.0,
"step": 329
},
{
"entropy": 1.182855635881424,
"epoch": 0.132013201320132,
"grad_norm": 0.2949009835720062,
"learning_rate": 0.00019517125888899255,
"loss": 1.1657,
"mean_token_accuracy": 0.6759148836135864,
"num_tokens": 2797349.0,
"step": 330
},
{
"entropy": 1.1421308815479279,
"epoch": 0.1324132413241324,
"grad_norm": 0.3349224328994751,
"learning_rate": 0.00019513350954199142,
"loss": 1.1379,
"mean_token_accuracy": 0.6823170036077499,
"num_tokens": 2805345.0,
"step": 331
},
{
"entropy": 1.0656911730766296,
"epoch": 0.1328132813281328,
"grad_norm": 0.3012828230857849,
"learning_rate": 0.00019509561731324848,
"loss": 1.0942,
"mean_token_accuracy": 0.6952732652425766,
"num_tokens": 2814123.0,
"step": 332
},
{
"entropy": 1.0468103885650635,
"epoch": 0.1332133213321332,
"grad_norm": 0.30162152647972107,
"learning_rate": 0.0001950575822663592,
"loss": 1.1012,
"mean_token_accuracy": 0.6894596368074417,
"num_tokens": 2823120.0,
"step": 333
},
{
"entropy": 1.089416727423668,
"epoch": 0.1336133613361336,
"grad_norm": 0.3064773976802826,
"learning_rate": 0.00019501940446515882,
"loss": 1.1036,
"mean_token_accuracy": 0.6885414123535156,
"num_tokens": 2831735.0,
"step": 334
},
{
"entropy": 1.1649364531040192,
"epoch": 0.134013401340134,
"grad_norm": 0.35003024339675903,
"learning_rate": 0.00019498108397372212,
"loss": 1.1766,
"mean_token_accuracy": 0.6764324754476547,
"num_tokens": 2839670.0,
"step": 335
},
{
"entropy": 1.1590066254138947,
"epoch": 0.1344134413441344,
"grad_norm": 0.26645922660827637,
"learning_rate": 0.0001949426208563633,
"loss": 1.1091,
"mean_token_accuracy": 0.6905470341444016,
"num_tokens": 2848911.0,
"step": 336
},
{
"entropy": 1.251402735710144,
"epoch": 0.1348134813481348,
"grad_norm": 0.31132251024246216,
"learning_rate": 0.000194904015177636,
"loss": 1.1918,
"mean_token_accuracy": 0.6727328300476074,
"num_tokens": 2857199.0,
"step": 337
},
{
"entropy": 1.220662236213684,
"epoch": 0.1352135213521352,
"grad_norm": 0.3061762750148773,
"learning_rate": 0.00019486526700233315,
"loss": 1.1868,
"mean_token_accuracy": 0.672507032752037,
"num_tokens": 2865223.0,
"step": 338
},
{
"entropy": 1.0638089627027512,
"epoch": 0.13561356135613561,
"grad_norm": 0.29525840282440186,
"learning_rate": 0.00019482637639548682,
"loss": 1.0514,
"mean_token_accuracy": 0.7034783512353897,
"num_tokens": 2873440.0,
"step": 339
},
{
"entropy": 1.1221419274806976,
"epoch": 0.13601360136013602,
"grad_norm": 0.2899990379810333,
"learning_rate": 0.00019478734342236808,
"loss": 1.1505,
"mean_token_accuracy": 0.675692155957222,
"num_tokens": 2882408.0,
"step": 340
},
{
"entropy": 1.145202785730362,
"epoch": 0.13641364136413642,
"grad_norm": 0.2904442250728607,
"learning_rate": 0.0001947481681484869,
"loss": 1.1848,
"mean_token_accuracy": 0.6750968992710114,
"num_tokens": 2891461.0,
"step": 341
},
{
"entropy": 1.081279844045639,
"epoch": 0.13681368136813682,
"grad_norm": 0.30348628759384155,
"learning_rate": 0.00019470885063959225,
"loss": 1.0734,
"mean_token_accuracy": 0.6975607126951218,
"num_tokens": 2900223.0,
"step": 342
},
{
"entropy": 1.0558022856712341,
"epoch": 0.13721372137213722,
"grad_norm": 0.28773176670074463,
"learning_rate": 0.00019466939096167164,
"loss": 1.0604,
"mean_token_accuracy": 0.6948001831769943,
"num_tokens": 2909084.0,
"step": 343
},
{
"entropy": 1.1171001195907593,
"epoch": 0.13761376137613762,
"grad_norm": 0.29017966985702515,
"learning_rate": 0.00019462978918095128,
"loss": 1.1181,
"mean_token_accuracy": 0.68596550822258,
"num_tokens": 2917795.0,
"step": 344
},
{
"entropy": 1.1633701920509338,
"epoch": 0.13801380138013802,
"grad_norm": 0.28877806663513184,
"learning_rate": 0.00019459004536389587,
"loss": 1.1716,
"mean_token_accuracy": 0.6693498939275742,
"num_tokens": 2925764.0,
"step": 345
},
{
"entropy": 1.2091334760189056,
"epoch": 0.13841384138413843,
"grad_norm": 0.3057492971420288,
"learning_rate": 0.00019455015957720842,
"loss": 1.2115,
"mean_token_accuracy": 0.6683546006679535,
"num_tokens": 2934337.0,
"step": 346
},
{
"entropy": 1.117457777261734,
"epoch": 0.13881388138813883,
"grad_norm": 0.3619987964630127,
"learning_rate": 0.0001945101318878303,
"loss": 1.0944,
"mean_token_accuracy": 0.6917587071657181,
"num_tokens": 2942882.0,
"step": 347
},
{
"entropy": 1.1964794397354126,
"epoch": 0.1392139213921392,
"grad_norm": 0.29087069630622864,
"learning_rate": 0.000194469962362941,
"loss": 1.1536,
"mean_token_accuracy": 0.6789288967847824,
"num_tokens": 2951358.0,
"step": 348
},
{
"entropy": 1.1352568864822388,
"epoch": 0.1396139613961396,
"grad_norm": 0.30058935284614563,
"learning_rate": 0.00019442965106995807,
"loss": 1.1042,
"mean_token_accuracy": 0.6969415545463562,
"num_tokens": 2959902.0,
"step": 349
},
{
"entropy": 1.1815881133079529,
"epoch": 0.14001400140014,
"grad_norm": 0.29818278551101685,
"learning_rate": 0.00019438919807653694,
"loss": 1.1937,
"mean_token_accuracy": 0.6777724772691727,
"num_tokens": 2968375.0,
"step": 350
},
{
"entropy": 1.1138464957475662,
"epoch": 0.1404140414041404,
"grad_norm": 0.29378682374954224,
"learning_rate": 0.00019434860345057096,
"loss": 1.136,
"mean_token_accuracy": 0.6846367418766022,
"num_tokens": 2976891.0,
"step": 351
},
{
"entropy": 1.1382241249084473,
"epoch": 0.1408140814081408,
"grad_norm": 0.298759788274765,
"learning_rate": 0.00019430786726019102,
"loss": 1.1675,
"mean_token_accuracy": 0.6828837245702744,
"num_tokens": 2984891.0,
"step": 352
},
{
"entropy": 1.2404142022132874,
"epoch": 0.1412141214121412,
"grad_norm": 0.3150947093963623,
"learning_rate": 0.00019426698957376585,
"loss": 1.2342,
"mean_token_accuracy": 0.6579574644565582,
"num_tokens": 2993072.0,
"step": 353
},
{
"entropy": 1.1687238216400146,
"epoch": 0.1416141614161416,
"grad_norm": 0.29389873147010803,
"learning_rate": 0.00019422597045990142,
"loss": 1.1767,
"mean_token_accuracy": 0.6675811409950256,
"num_tokens": 3001760.0,
"step": 354
},
{
"entropy": 1.1566392183303833,
"epoch": 0.142014201420142,
"grad_norm": 0.288309246301651,
"learning_rate": 0.00019418480998744118,
"loss": 1.1291,
"mean_token_accuracy": 0.6857695430517197,
"num_tokens": 3010111.0,
"step": 355
},
{
"entropy": 1.1949766874313354,
"epoch": 0.1424142414241424,
"grad_norm": 0.29533353447914124,
"learning_rate": 0.00019414350822546584,
"loss": 1.1664,
"mean_token_accuracy": 0.6795456558465958,
"num_tokens": 3018712.0,
"step": 356
},
{
"entropy": 1.1488195657730103,
"epoch": 0.14281428142814281,
"grad_norm": 0.3124019205570221,
"learning_rate": 0.00019410206524329314,
"loss": 1.129,
"mean_token_accuracy": 0.6900259405374527,
"num_tokens": 3026707.0,
"step": 357
},
{
"entropy": 1.1078391075134277,
"epoch": 0.14321432143214322,
"grad_norm": 0.4887332618236542,
"learning_rate": 0.00019406048111047792,
"loss": 1.1122,
"mean_token_accuracy": 0.6845664978027344,
"num_tokens": 3035277.0,
"step": 358
},
{
"entropy": 1.1673301458358765,
"epoch": 0.14361436143614362,
"grad_norm": 0.30997899174690247,
"learning_rate": 0.0001940187558968119,
"loss": 1.1427,
"mean_token_accuracy": 0.6802043169736862,
"num_tokens": 3043456.0,
"step": 359
},
{
"entropy": 1.1499980092048645,
"epoch": 0.14401440144014402,
"grad_norm": 0.3066644072532654,
"learning_rate": 0.00019397688967232352,
"loss": 1.1497,
"mean_token_accuracy": 0.6805084347724915,
"num_tokens": 3051649.0,
"step": 360
},
{
"entropy": 1.131559580564499,
"epoch": 0.14441444144414442,
"grad_norm": 0.296249657869339,
"learning_rate": 0.000193934882507278,
"loss": 1.1349,
"mean_token_accuracy": 0.6809341907501221,
"num_tokens": 3060190.0,
"step": 361
},
{
"entropy": 1.1443010866641998,
"epoch": 0.14481448144814482,
"grad_norm": 0.31838539242744446,
"learning_rate": 0.00019389273447217704,
"loss": 1.1696,
"mean_token_accuracy": 0.6759007275104523,
"num_tokens": 3068580.0,
"step": 362
},
{
"entropy": 1.133973866701126,
"epoch": 0.14521452145214522,
"grad_norm": 0.2861894965171814,
"learning_rate": 0.0001938504456377587,
"loss": 1.1291,
"mean_token_accuracy": 0.6851497888565063,
"num_tokens": 3077427.0,
"step": 363
},
{
"entropy": 1.136247158050537,
"epoch": 0.14561456145614562,
"grad_norm": 0.2967614531517029,
"learning_rate": 0.00019380801607499746,
"loss": 1.0995,
"mean_token_accuracy": 0.6911982148885727,
"num_tokens": 3085196.0,
"step": 364
},
{
"entropy": 1.184772402048111,
"epoch": 0.14601460146014603,
"grad_norm": 0.3119775354862213,
"learning_rate": 0.00019376544585510393,
"loss": 1.2257,
"mean_token_accuracy": 0.666557103395462,
"num_tokens": 3093621.0,
"step": 365
},
{
"entropy": 1.1576828956604004,
"epoch": 0.14641464146414643,
"grad_norm": 0.3863295018672943,
"learning_rate": 0.0001937227350495248,
"loss": 1.1722,
"mean_token_accuracy": 0.6755800992250443,
"num_tokens": 3102047.0,
"step": 366
},
{
"entropy": 1.0888293087482452,
"epoch": 0.1468146814681468,
"grad_norm": 0.2931033670902252,
"learning_rate": 0.00019367988372994265,
"loss": 1.0546,
"mean_token_accuracy": 0.6972876787185669,
"num_tokens": 3110407.0,
"step": 367
},
{
"entropy": 1.171687364578247,
"epoch": 0.1472147214721472,
"grad_norm": 0.43645840883255005,
"learning_rate": 0.000193636891968276,
"loss": 1.1192,
"mean_token_accuracy": 0.6813657730817795,
"num_tokens": 3118726.0,
"step": 368
},
{
"entropy": 1.1906355917453766,
"epoch": 0.1476147614761476,
"grad_norm": 0.30559539794921875,
"learning_rate": 0.00019359375983667902,
"loss": 1.1854,
"mean_token_accuracy": 0.6698572039604187,
"num_tokens": 3126856.0,
"step": 369
},
{
"entropy": 1.1418620645999908,
"epoch": 0.148014801480148,
"grad_norm": 0.31266874074935913,
"learning_rate": 0.00019355048740754145,
"loss": 1.1375,
"mean_token_accuracy": 0.678287535905838,
"num_tokens": 3135201.0,
"step": 370
},
{
"entropy": 1.1904971301555634,
"epoch": 0.1484148414841484,
"grad_norm": 0.3213047981262207,
"learning_rate": 0.00019350707475348852,
"loss": 1.1842,
"mean_token_accuracy": 0.6759228259325027,
"num_tokens": 3143256.0,
"step": 371
},
{
"entropy": 1.1902599036693573,
"epoch": 0.1488148814881488,
"grad_norm": 0.5613988041877747,
"learning_rate": 0.00019346352194738077,
"loss": 1.2442,
"mean_token_accuracy": 0.6619480550289154,
"num_tokens": 3150704.0,
"step": 372
},
{
"entropy": 1.0474575012922287,
"epoch": 0.1492149214921492,
"grad_norm": 0.2898733615875244,
"learning_rate": 0.00019341982906231407,
"loss": 1.0636,
"mean_token_accuracy": 0.6995494663715363,
"num_tokens": 3159711.0,
"step": 373
},
{
"entropy": 1.226840317249298,
"epoch": 0.1496149614961496,
"grad_norm": 0.314718633890152,
"learning_rate": 0.0001933759961716192,
"loss": 1.1882,
"mean_token_accuracy": 0.6709526926279068,
"num_tokens": 3167294.0,
"step": 374
},
{
"entropy": 1.1560609936714172,
"epoch": 0.15001500150015,
"grad_norm": 0.29525458812713623,
"learning_rate": 0.00019333202334886207,
"loss": 1.1088,
"mean_token_accuracy": 0.6907341927289963,
"num_tokens": 3175676.0,
"step": 375
},
{
"entropy": 1.1789807677268982,
"epoch": 0.15041504150415042,
"grad_norm": 0.2906891405582428,
"learning_rate": 0.0001932879106678434,
"loss": 1.1488,
"mean_token_accuracy": 0.6830808073282242,
"num_tokens": 3184781.0,
"step": 376
},
{
"entropy": 1.2095182836055756,
"epoch": 0.15081508150815082,
"grad_norm": 0.29173582792282104,
"learning_rate": 0.00019324365820259858,
"loss": 1.1471,
"mean_token_accuracy": 0.6814120411872864,
"num_tokens": 3193359.0,
"step": 377
},
{
"entropy": 1.1557953655719757,
"epoch": 0.15121512151215122,
"grad_norm": 0.30150917172431946,
"learning_rate": 0.0001931992660273977,
"loss": 1.1842,
"mean_token_accuracy": 0.6736668199300766,
"num_tokens": 3201977.0,
"step": 378
},
{
"entropy": 1.1141368001699448,
"epoch": 0.15161516151615162,
"grad_norm": 0.3033373951911926,
"learning_rate": 0.00019315473421674525,
"loss": 1.1433,
"mean_token_accuracy": 0.6801392734050751,
"num_tokens": 3210612.0,
"step": 379
},
{
"entropy": 1.0636587738990784,
"epoch": 0.15201520152015202,
"grad_norm": 0.2994931936264038,
"learning_rate": 0.00019311006284538013,
"loss": 1.0722,
"mean_token_accuracy": 0.6968654096126556,
"num_tokens": 3219123.0,
"step": 380
},
{
"entropy": 1.2064105868339539,
"epoch": 0.15241524152415242,
"grad_norm": 0.3521154820919037,
"learning_rate": 0.00019306525198827548,
"loss": 1.2385,
"mean_token_accuracy": 0.6615314930677414,
"num_tokens": 3227445.0,
"step": 381
},
{
"entropy": 1.127672255039215,
"epoch": 0.15281528152815282,
"grad_norm": 0.2892846465110779,
"learning_rate": 0.00019302030172063837,
"loss": 1.1389,
"mean_token_accuracy": 0.6847521215677261,
"num_tokens": 3236240.0,
"step": 382
},
{
"entropy": 1.1575649082660675,
"epoch": 0.15321532153215323,
"grad_norm": 0.31099551916122437,
"learning_rate": 0.0001929752121179101,
"loss": 1.1524,
"mean_token_accuracy": 0.6786007881164551,
"num_tokens": 3244515.0,
"step": 383
},
{
"entropy": 1.1269442737102509,
"epoch": 0.15361536153615363,
"grad_norm": 0.2906751036643982,
"learning_rate": 0.0001929299832557657,
"loss": 1.0972,
"mean_token_accuracy": 0.6957235038280487,
"num_tokens": 3253311.0,
"step": 384
},
{
"entropy": 1.2260091006755829,
"epoch": 0.15401540154015403,
"grad_norm": 0.2963874638080597,
"learning_rate": 0.00019288461521011388,
"loss": 1.1781,
"mean_token_accuracy": 0.6785955429077148,
"num_tokens": 3261634.0,
"step": 385
},
{
"entropy": 1.1854043006896973,
"epoch": 0.1544154415441544,
"grad_norm": 0.30083367228507996,
"learning_rate": 0.00019283910805709698,
"loss": 1.1677,
"mean_token_accuracy": 0.6692470908164978,
"num_tokens": 3270087.0,
"step": 386
},
{
"entropy": 1.2266800105571747,
"epoch": 0.1548154815481548,
"grad_norm": 0.3198303282260895,
"learning_rate": 0.00019279346187309085,
"loss": 1.2064,
"mean_token_accuracy": 0.6682067066431046,
"num_tokens": 3278271.0,
"step": 387
},
{
"entropy": 1.1660953760147095,
"epoch": 0.1552155215521552,
"grad_norm": 0.33573225140571594,
"learning_rate": 0.00019274767673470463,
"loss": 1.1942,
"mean_token_accuracy": 0.6672907918691635,
"num_tokens": 3286608.0,
"step": 388
},
{
"entropy": 1.0843549370765686,
"epoch": 0.1556155615561556,
"grad_norm": 0.30995887517929077,
"learning_rate": 0.00019270175271878068,
"loss": 1.0992,
"mean_token_accuracy": 0.6958242803812027,
"num_tokens": 3295009.0,
"step": 389
},
{
"entropy": 1.128290981054306,
"epoch": 0.156015601560156,
"grad_norm": 0.3144836127758026,
"learning_rate": 0.00019265568990239445,
"loss": 1.137,
"mean_token_accuracy": 0.6823694556951523,
"num_tokens": 3303299.0,
"step": 390
},
{
"entropy": 1.195746123790741,
"epoch": 0.1564156415641564,
"grad_norm": 0.30768823623657227,
"learning_rate": 0.00019260948836285439,
"loss": 1.1869,
"mean_token_accuracy": 0.6803343147039413,
"num_tokens": 3311591.0,
"step": 391
},
{
"entropy": 1.1737743616104126,
"epoch": 0.1568156815681568,
"grad_norm": 0.29867610335350037,
"learning_rate": 0.00019256314817770164,
"loss": 1.1703,
"mean_token_accuracy": 0.6784539520740509,
"num_tokens": 3320022.0,
"step": 392
},
{
"entropy": 1.2264443039894104,
"epoch": 0.1572157215721572,
"grad_norm": 0.30367588996887207,
"learning_rate": 0.00019251666942471016,
"loss": 1.1963,
"mean_token_accuracy": 0.6694721430540085,
"num_tokens": 3328671.0,
"step": 393
},
{
"entropy": 1.1673425137996674,
"epoch": 0.15761576157615761,
"grad_norm": 0.312225341796875,
"learning_rate": 0.00019247005218188645,
"loss": 1.1641,
"mean_token_accuracy": 0.6831966638565063,
"num_tokens": 3336686.0,
"step": 394
},
{
"entropy": 1.1570010483264923,
"epoch": 0.15801580158015802,
"grad_norm": 0.325536847114563,
"learning_rate": 0.00019242329652746938,
"loss": 1.1245,
"mean_token_accuracy": 0.6909505128860474,
"num_tokens": 3344988.0,
"step": 395
},
{
"entropy": 1.118729829788208,
"epoch": 0.15841584158415842,
"grad_norm": 0.31520524621009827,
"learning_rate": 0.00019237640253993017,
"loss": 1.1096,
"mean_token_accuracy": 0.686091959476471,
"num_tokens": 3353202.0,
"step": 396
},
{
"entropy": 1.1297271251678467,
"epoch": 0.15881588158815882,
"grad_norm": 0.31851935386657715,
"learning_rate": 0.00019232937029797217,
"loss": 1.1385,
"mean_token_accuracy": 0.6839326471090317,
"num_tokens": 3362000.0,
"step": 397
},
{
"entropy": 1.111870676279068,
"epoch": 0.15921592159215922,
"grad_norm": 0.29706814885139465,
"learning_rate": 0.00019228219988053085,
"loss": 1.132,
"mean_token_accuracy": 0.6736722886562347,
"num_tokens": 3370452.0,
"step": 398
},
{
"entropy": 1.0942797362804413,
"epoch": 0.15961596159615962,
"grad_norm": 0.3211657702922821,
"learning_rate": 0.00019223489136677347,
"loss": 1.1642,
"mean_token_accuracy": 0.6759698241949081,
"num_tokens": 3378774.0,
"step": 399
},
{
"entropy": 1.1003531515598297,
"epoch": 0.16001600160016002,
"grad_norm": 0.2938557267189026,
"learning_rate": 0.00019218744483609918,
"loss": 1.0841,
"mean_token_accuracy": 0.689574733376503,
"num_tokens": 3387752.0,
"step": 400
},
{
"entropy": 1.1808100640773773,
"epoch": 0.16041604160416043,
"grad_norm": 0.3016187250614166,
"learning_rate": 0.00019213986036813863,
"loss": 1.1379,
"mean_token_accuracy": 0.6819901168346405,
"num_tokens": 3395722.0,
"step": 401
},
{
"entropy": 1.1858965158462524,
"epoch": 0.16081608160816083,
"grad_norm": 0.2888219952583313,
"learning_rate": 0.00019209213804275408,
"loss": 1.1126,
"mean_token_accuracy": 0.6891250312328339,
"num_tokens": 3404658.0,
"step": 402
},
{
"entropy": 1.1066676825284958,
"epoch": 0.16121612161216123,
"grad_norm": 0.2900371551513672,
"learning_rate": 0.00019204427794003911,
"loss": 1.0613,
"mean_token_accuracy": 0.6994702219963074,
"num_tokens": 3413044.0,
"step": 403
},
{
"entropy": 1.0648207068443298,
"epoch": 0.16161616161616163,
"grad_norm": 0.2870444357395172,
"learning_rate": 0.00019199628014031857,
"loss": 1.0816,
"mean_token_accuracy": 0.6926587671041489,
"num_tokens": 3421932.0,
"step": 404
},
{
"entropy": 1.1214756965637207,
"epoch": 0.162016201620162,
"grad_norm": 0.3146369755268097,
"learning_rate": 0.00019194814472414844,
"loss": 1.1529,
"mean_token_accuracy": 0.679986834526062,
"num_tokens": 3429660.0,
"step": 405
},
{
"entropy": 1.0432531386613846,
"epoch": 0.1624162416241624,
"grad_norm": 0.3081408441066742,
"learning_rate": 0.00019189987177231554,
"loss": 1.0802,
"mean_token_accuracy": 0.6946697533130646,
"num_tokens": 3437779.0,
"step": 406
},
{
"entropy": 1.1035350263118744,
"epoch": 0.1628162816281628,
"grad_norm": 0.3021145761013031,
"learning_rate": 0.00019185146136583761,
"loss": 1.1354,
"mean_token_accuracy": 0.6885717958211899,
"num_tokens": 3446116.0,
"step": 407
},
{
"entropy": 1.1501671075820923,
"epoch": 0.1632163216321632,
"grad_norm": 0.41734570264816284,
"learning_rate": 0.00019180291358596312,
"loss": 1.1233,
"mean_token_accuracy": 0.6793646067380905,
"num_tokens": 3454845.0,
"step": 408
},
{
"entropy": 1.1991091966629028,
"epoch": 0.1636163616361636,
"grad_norm": 0.29790523648262024,
"learning_rate": 0.00019175422851417103,
"loss": 1.1549,
"mean_token_accuracy": 0.6777328252792358,
"num_tokens": 3463400.0,
"step": 409
},
{
"entropy": 1.1822619140148163,
"epoch": 0.164016401640164,
"grad_norm": 0.31777262687683105,
"learning_rate": 0.00019170540623217065,
"loss": 1.1476,
"mean_token_accuracy": 0.6912225484848022,
"num_tokens": 3471177.0,
"step": 410
},
{
"entropy": 1.1974277198314667,
"epoch": 0.1644164416441644,
"grad_norm": 0.30301401019096375,
"learning_rate": 0.00019165644682190178,
"loss": 1.1863,
"mean_token_accuracy": 0.6698818802833557,
"num_tokens": 3479462.0,
"step": 411
},
{
"entropy": 1.1671889424324036,
"epoch": 0.16481648164816481,
"grad_norm": 0.3080313801765442,
"learning_rate": 0.0001916073503655342,
"loss": 1.1485,
"mean_token_accuracy": 0.6848516017198563,
"num_tokens": 3487668.0,
"step": 412
},
{
"entropy": 1.1198955476284027,
"epoch": 0.16521652165216522,
"grad_norm": 0.282215416431427,
"learning_rate": 0.00019155811694546773,
"loss": 1.117,
"mean_token_accuracy": 0.6849533915519714,
"num_tokens": 3496407.0,
"step": 413
},
{
"entropy": 1.1208362877368927,
"epoch": 0.16561656165616562,
"grad_norm": 0.2846994996070862,
"learning_rate": 0.0001915087466443321,
"loss": 1.1486,
"mean_token_accuracy": 0.6762874126434326,
"num_tokens": 3505305.0,
"step": 414
},
{
"entropy": 1.1050612926483154,
"epoch": 0.16601660166016602,
"grad_norm": 0.2926284670829773,
"learning_rate": 0.00019145923954498674,
"loss": 1.1086,
"mean_token_accuracy": 0.6887543201446533,
"num_tokens": 3513791.0,
"step": 415
},
{
"entropy": 1.1567849516868591,
"epoch": 0.16641664166416642,
"grad_norm": 0.3551363945007324,
"learning_rate": 0.00019140959573052068,
"loss": 1.1884,
"mean_token_accuracy": 0.6731236577033997,
"num_tokens": 3522187.0,
"step": 416
},
{
"entropy": 1.0714478492736816,
"epoch": 0.16681668166816682,
"grad_norm": 0.2826900780200958,
"learning_rate": 0.00019135981528425238,
"loss": 1.07,
"mean_token_accuracy": 0.6979558169841766,
"num_tokens": 3530921.0,
"step": 417
},
{
"entropy": 1.1964420974254608,
"epoch": 0.16721672167216722,
"grad_norm": 0.283438116312027,
"learning_rate": 0.0001913098982897297,
"loss": 1.2064,
"mean_token_accuracy": 0.6715447902679443,
"num_tokens": 3539583.0,
"step": 418
},
{
"entropy": 1.1429602801799774,
"epoch": 0.16761676167616762,
"grad_norm": 0.27956098318099976,
"learning_rate": 0.0001912598448307295,
"loss": 1.103,
"mean_token_accuracy": 0.692705973982811,
"num_tokens": 3548027.0,
"step": 419
},
{
"entropy": 1.1086672246456146,
"epoch": 0.16801680168016803,
"grad_norm": 0.30192887783050537,
"learning_rate": 0.0001912096549912579,
"loss": 1.0665,
"mean_token_accuracy": 0.6996335387229919,
"num_tokens": 3556575.0,
"step": 420
},
{
"entropy": 1.122267097234726,
"epoch": 0.16841684168416843,
"grad_norm": 0.28671419620513916,
"learning_rate": 0.0001911593288555497,
"loss": 1.0995,
"mean_token_accuracy": 0.6916577368974686,
"num_tokens": 3564842.0,
"step": 421
},
{
"entropy": 1.1425860822200775,
"epoch": 0.16881688168816883,
"grad_norm": 0.31337839365005493,
"learning_rate": 0.0001911088665080685,
"loss": 1.1492,
"mean_token_accuracy": 0.6899708062410355,
"num_tokens": 3573378.0,
"step": 422
},
{
"entropy": 1.1819129288196564,
"epoch": 0.1692169216921692,
"grad_norm": 0.3169664442539215,
"learning_rate": 0.00019105826803350668,
"loss": 1.2067,
"mean_token_accuracy": 0.6600329726934433,
"num_tokens": 3581995.0,
"step": 423
},
{
"entropy": 1.1388654112815857,
"epoch": 0.1696169616961696,
"grad_norm": 0.3174993097782135,
"learning_rate": 0.00019100753351678485,
"loss": 1.1679,
"mean_token_accuracy": 0.6717206537723541,
"num_tokens": 3590053.0,
"step": 424
},
{
"entropy": 1.0764131546020508,
"epoch": 0.17001700170017,
"grad_norm": 0.27433347702026367,
"learning_rate": 0.0001909566630430521,
"loss": 1.0583,
"mean_token_accuracy": 0.698042631149292,
"num_tokens": 3598969.0,
"step": 425
},
{
"entropy": 1.1677474975585938,
"epoch": 0.1704170417041704,
"grad_norm": 0.28440240025520325,
"learning_rate": 0.0001909056566976856,
"loss": 1.1686,
"mean_token_accuracy": 0.6792843639850616,
"num_tokens": 3608017.0,
"step": 426
},
{
"entropy": 1.0982355326414108,
"epoch": 0.1708170817081708,
"grad_norm": 0.281744122505188,
"learning_rate": 0.00019085451456629063,
"loss": 1.0735,
"mean_token_accuracy": 0.6970892697572708,
"num_tokens": 3616898.0,
"step": 427
},
{
"entropy": 1.1331664025783539,
"epoch": 0.1712171217121712,
"grad_norm": 0.29245954751968384,
"learning_rate": 0.00019080323673470028,
"loss": 1.1029,
"mean_token_accuracy": 0.6925027072429657,
"num_tokens": 3625372.0,
"step": 428
},
{
"entropy": 1.165515422821045,
"epoch": 0.1716171617161716,
"grad_norm": 0.314475953578949,
"learning_rate": 0.00019075182328897553,
"loss": 1.159,
"mean_token_accuracy": 0.6840381771326065,
"num_tokens": 3633550.0,
"step": 429
},
{
"entropy": 1.2059255242347717,
"epoch": 0.172017201720172,
"grad_norm": 0.29410937428474426,
"learning_rate": 0.00019070027431540484,
"loss": 1.1995,
"mean_token_accuracy": 0.667696550488472,
"num_tokens": 3641944.0,
"step": 430
},
{
"entropy": 1.160342425107956,
"epoch": 0.17241724172417242,
"grad_norm": 0.29798951745033264,
"learning_rate": 0.00019064858990050412,
"loss": 1.1249,
"mean_token_accuracy": 0.6896940916776657,
"num_tokens": 3650633.0,
"step": 431
},
{
"entropy": 1.097832590341568,
"epoch": 0.17281728172817282,
"grad_norm": 0.3146847188472748,
"learning_rate": 0.0001905967701310167,
"loss": 1.084,
"mean_token_accuracy": 0.6950473189353943,
"num_tokens": 3659275.0,
"step": 432
},
{
"entropy": 1.1250872611999512,
"epoch": 0.17321732173217322,
"grad_norm": 0.29490962624549866,
"learning_rate": 0.00019054481509391303,
"loss": 1.1453,
"mean_token_accuracy": 0.6784237176179886,
"num_tokens": 3667707.0,
"step": 433
},
{
"entropy": 1.11842879652977,
"epoch": 0.17361736173617362,
"grad_norm": 0.3015720844268799,
"learning_rate": 0.00019049272487639053,
"loss": 1.1348,
"mean_token_accuracy": 0.6827126741409302,
"num_tokens": 3676215.0,
"step": 434
},
{
"entropy": 1.1079545319080353,
"epoch": 0.17401740174017402,
"grad_norm": 0.2959752380847931,
"learning_rate": 0.00019044049956587359,
"loss": 1.1308,
"mean_token_accuracy": 0.6799913793802261,
"num_tokens": 3684832.0,
"step": 435
},
{
"entropy": 1.0760809183120728,
"epoch": 0.17441744174417442,
"grad_norm": 0.28142601251602173,
"learning_rate": 0.0001903881392500132,
"loss": 1.057,
"mean_token_accuracy": 0.7040259689092636,
"num_tokens": 3693191.0,
"step": 436
},
{
"entropy": 1.1367475986480713,
"epoch": 0.17481748174817482,
"grad_norm": 0.2840285301208496,
"learning_rate": 0.00019033564401668712,
"loss": 1.1166,
"mean_token_accuracy": 0.6871612221002579,
"num_tokens": 3701978.0,
"step": 437
},
{
"entropy": 1.0345291048288345,
"epoch": 0.17521752175217523,
"grad_norm": 0.27927252650260925,
"learning_rate": 0.00019028301395399935,
"loss": 1.0161,
"mean_token_accuracy": 0.7020839005708694,
"num_tokens": 3711010.0,
"step": 438
},
{
"entropy": 1.1218744814395905,
"epoch": 0.17561756175617563,
"grad_norm": 0.28972747921943665,
"learning_rate": 0.00019023024915028035,
"loss": 1.1142,
"mean_token_accuracy": 0.6823008805513382,
"num_tokens": 3719811.0,
"step": 439
},
{
"entropy": 1.112653136253357,
"epoch": 0.17601760176017603,
"grad_norm": 0.2937675714492798,
"learning_rate": 0.0001901773496940866,
"loss": 1.099,
"mean_token_accuracy": 0.6938609182834625,
"num_tokens": 3728397.0,
"step": 440
},
{
"entropy": 1.0891221165657043,
"epoch": 0.17641764176417643,
"grad_norm": 0.2878448963165283,
"learning_rate": 0.00019012431567420058,
"loss": 1.0985,
"mean_token_accuracy": 0.6925668865442276,
"num_tokens": 3737299.0,
"step": 441
},
{
"entropy": 1.099565714597702,
"epoch": 0.1768176817681768,
"grad_norm": 0.307413786649704,
"learning_rate": 0.00019007114717963067,
"loss": 1.1189,
"mean_token_accuracy": 0.6934941560029984,
"num_tokens": 3746139.0,
"step": 442
},
{
"entropy": 1.1932236850261688,
"epoch": 0.1772177217721772,
"grad_norm": 0.3038841485977173,
"learning_rate": 0.00019001784429961086,
"loss": 1.1788,
"mean_token_accuracy": 0.6709124445915222,
"num_tokens": 3754953.0,
"step": 443
},
{
"entropy": 1.0702079832553864,
"epoch": 0.1776177617761776,
"grad_norm": 0.2820574939250946,
"learning_rate": 0.0001899644071236008,
"loss": 1.0416,
"mean_token_accuracy": 0.7032249569892883,
"num_tokens": 3763751.0,
"step": 444
},
{
"entropy": 1.2229497730731964,
"epoch": 0.178017801780178,
"grad_norm": 0.3014878034591675,
"learning_rate": 0.00018991083574128545,
"loss": 1.2192,
"mean_token_accuracy": 0.6651740819215775,
"num_tokens": 3771604.0,
"step": 445
},
{
"entropy": 1.1150319874286652,
"epoch": 0.1784178417841784,
"grad_norm": 0.2991960644721985,
"learning_rate": 0.000189857130242575,
"loss": 1.09,
"mean_token_accuracy": 0.6914113610982895,
"num_tokens": 3780403.0,
"step": 446
},
{
"entropy": 1.1689063012599945,
"epoch": 0.1788178817881788,
"grad_norm": 0.2982667088508606,
"learning_rate": 0.0001898032907176048,
"loss": 1.1627,
"mean_token_accuracy": 0.6814263015985489,
"num_tokens": 3788759.0,
"step": 447
},
{
"entropy": 1.1139529049396515,
"epoch": 0.1792179217921792,
"grad_norm": 0.29409554600715637,
"learning_rate": 0.00018974931725673509,
"loss": 1.1114,
"mean_token_accuracy": 0.6805879026651382,
"num_tokens": 3796931.0,
"step": 448
},
{
"entropy": 1.1041430234909058,
"epoch": 0.17961796179617961,
"grad_norm": 0.2944853901863098,
"learning_rate": 0.00018969520995055085,
"loss": 1.1119,
"mean_token_accuracy": 0.6940512806177139,
"num_tokens": 3805323.0,
"step": 449
},
{
"entropy": 1.1486750543117523,
"epoch": 0.18001800180018002,
"grad_norm": 0.302370548248291,
"learning_rate": 0.00018964096888986182,
"loss": 1.1553,
"mean_token_accuracy": 0.6763848960399628,
"num_tokens": 3813607.0,
"step": 450
},
{
"entropy": 1.1423940062522888,
"epoch": 0.18041804180418042,
"grad_norm": 0.28140193223953247,
"learning_rate": 0.00018958659416570212,
"loss": 1.1566,
"mean_token_accuracy": 0.6711086183786392,
"num_tokens": 3822080.0,
"step": 451
},
{
"entropy": 1.0220871269702911,
"epoch": 0.18081808180818082,
"grad_norm": 0.2903229892253876,
"learning_rate": 0.00018953208586933027,
"loss": 1.0243,
"mean_token_accuracy": 0.7029541581869125,
"num_tokens": 3830561.0,
"step": 452
},
{
"entropy": 1.1911540031433105,
"epoch": 0.18121812181218122,
"grad_norm": 0.3021875321865082,
"learning_rate": 0.0001894774440922289,
"loss": 1.1799,
"mean_token_accuracy": 0.6771095544099808,
"num_tokens": 3838855.0,
"step": 453
},
{
"entropy": 1.1234095692634583,
"epoch": 0.18161816181618162,
"grad_norm": 0.30030199885368347,
"learning_rate": 0.00018942266892610474,
"loss": 1.1306,
"mean_token_accuracy": 0.688039630651474,
"num_tokens": 3847225.0,
"step": 454
},
{
"entropy": 1.2189615964889526,
"epoch": 0.18201820182018202,
"grad_norm": 0.2934826910495758,
"learning_rate": 0.00018936776046288832,
"loss": 1.192,
"mean_token_accuracy": 0.6768446713685989,
"num_tokens": 3855549.0,
"step": 455
},
{
"entropy": 1.090735375881195,
"epoch": 0.18241824182418243,
"grad_norm": 0.2921765148639679,
"learning_rate": 0.0001893127187947339,
"loss": 1.0824,
"mean_token_accuracy": 0.6897251307964325,
"num_tokens": 3863912.0,
"step": 456
},
{
"entropy": 1.0907158553600311,
"epoch": 0.18281828182818283,
"grad_norm": 0.28869226574897766,
"learning_rate": 0.00018925754401401935,
"loss": 1.1011,
"mean_token_accuracy": 0.6976663619279861,
"num_tokens": 3872222.0,
"step": 457
},
{
"entropy": 1.0765265822410583,
"epoch": 0.18321832183218323,
"grad_norm": 0.27985134720802307,
"learning_rate": 0.0001892022362133459,
"loss": 1.0954,
"mean_token_accuracy": 0.6934731006622314,
"num_tokens": 3880811.0,
"step": 458
},
{
"entropy": 1.1287130117416382,
"epoch": 0.18361836183618363,
"grad_norm": 0.2834780216217041,
"learning_rate": 0.000189146795485538,
"loss": 1.1133,
"mean_token_accuracy": 0.6809262037277222,
"num_tokens": 3889241.0,
"step": 459
},
{
"entropy": 1.1771635711193085,
"epoch": 0.18401840184018403,
"grad_norm": 0.2930743992328644,
"learning_rate": 0.00018909122192364334,
"loss": 1.1473,
"mean_token_accuracy": 0.6786583662033081,
"num_tokens": 3897826.0,
"step": 460
},
{
"entropy": 1.156456857919693,
"epoch": 0.1844184418441844,
"grad_norm": 0.31029045581817627,
"learning_rate": 0.00018903551562093237,
"loss": 1.1329,
"mean_token_accuracy": 0.6835081726312637,
"num_tokens": 3906455.0,
"step": 461
},
{
"entropy": 1.197271704673767,
"epoch": 0.1848184818481848,
"grad_norm": 0.28894633054733276,
"learning_rate": 0.00018897967667089839,
"loss": 1.1518,
"mean_token_accuracy": 0.6705130338668823,
"num_tokens": 3914939.0,
"step": 462
},
{
"entropy": 1.187122493982315,
"epoch": 0.1852185218521852,
"grad_norm": 0.2882704734802246,
"learning_rate": 0.0001889237051672574,
"loss": 1.172,
"mean_token_accuracy": 0.6756406724452972,
"num_tokens": 3923526.0,
"step": 463
},
{
"entropy": 1.1045761406421661,
"epoch": 0.1856185618561856,
"grad_norm": 0.290786474943161,
"learning_rate": 0.00018886760120394774,
"loss": 1.1039,
"mean_token_accuracy": 0.6829386353492737,
"num_tokens": 3931690.0,
"step": 464
},
{
"entropy": 1.0771204233169556,
"epoch": 0.186018601860186,
"grad_norm": 0.29037660360336304,
"learning_rate": 0.00018881136487513016,
"loss": 1.0961,
"mean_token_accuracy": 0.6865667402744293,
"num_tokens": 3940222.0,
"step": 465
},
{
"entropy": 1.0926263481378555,
"epoch": 0.1864186418641864,
"grad_norm": 0.28368324041366577,
"learning_rate": 0.0001887549962751875,
"loss": 1.1276,
"mean_token_accuracy": 0.6901869177818298,
"num_tokens": 3948870.0,
"step": 466
},
{
"entropy": 1.0631737411022186,
"epoch": 0.18681868186818681,
"grad_norm": 0.28324657678604126,
"learning_rate": 0.00018869849549872465,
"loss": 1.0782,
"mean_token_accuracy": 0.6920218467712402,
"num_tokens": 3957291.0,
"step": 467
},
{
"entropy": 1.1629198789596558,
"epoch": 0.18721872187218722,
"grad_norm": 0.28869321942329407,
"learning_rate": 0.00018864186264056827,
"loss": 1.1439,
"mean_token_accuracy": 0.6795201748609543,
"num_tokens": 3966005.0,
"step": 468
},
{
"entropy": 1.1176329255104065,
"epoch": 0.18761876187618762,
"grad_norm": 0.30285438895225525,
"learning_rate": 0.00018858509779576678,
"loss": 1.1113,
"mean_token_accuracy": 0.6858499944210052,
"num_tokens": 3974237.0,
"step": 469
},
{
"entropy": 1.1664519608020782,
"epoch": 0.18801880188018802,
"grad_norm": 0.29232847690582275,
"learning_rate": 0.00018852820105959002,
"loss": 1.1352,
"mean_token_accuracy": 0.6848191022872925,
"num_tokens": 3982719.0,
"step": 470
},
{
"entropy": 1.0966509878635406,
"epoch": 0.18841884188418842,
"grad_norm": 0.28050824999809265,
"learning_rate": 0.00018847117252752924,
"loss": 1.103,
"mean_token_accuracy": 0.6891407370567322,
"num_tokens": 3991387.0,
"step": 471
},
{
"entropy": 1.0832321643829346,
"epoch": 0.18881888188818882,
"grad_norm": 0.30679091811180115,
"learning_rate": 0.00018841401229529692,
"loss": 1.0987,
"mean_token_accuracy": 0.6983061581850052,
"num_tokens": 3999901.0,
"step": 472
},
{
"entropy": 1.1181371808052063,
"epoch": 0.18921892189218922,
"grad_norm": 0.29978105425834656,
"learning_rate": 0.00018835672045882648,
"loss": 1.1526,
"mean_token_accuracy": 0.6812323331832886,
"num_tokens": 4008189.0,
"step": 473
},
{
"entropy": 1.094124659895897,
"epoch": 0.18961896189618963,
"grad_norm": 0.2761591672897339,
"learning_rate": 0.00018829929711427232,
"loss": 1.088,
"mean_token_accuracy": 0.6916481256484985,
"num_tokens": 4017035.0,
"step": 474
},
{
"entropy": 1.174016386270523,
"epoch": 0.19001900190019003,
"grad_norm": 0.2957269549369812,
"learning_rate": 0.0001882417423580095,
"loss": 1.15,
"mean_token_accuracy": 0.687277153134346,
"num_tokens": 4025132.0,
"step": 475
},
{
"entropy": 1.141076147556305,
"epoch": 0.19041904190419043,
"grad_norm": 0.29672884941101074,
"learning_rate": 0.0001881840562866336,
"loss": 1.0997,
"mean_token_accuracy": 0.6899784505367279,
"num_tokens": 4033594.0,
"step": 476
},
{
"entropy": 1.103248655796051,
"epoch": 0.19081908190819083,
"grad_norm": 0.2912473976612091,
"learning_rate": 0.00018812623899696067,
"loss": 1.0915,
"mean_token_accuracy": 0.6886222809553146,
"num_tokens": 4042053.0,
"step": 477
},
{
"entropy": 1.170788824558258,
"epoch": 0.19121912191219123,
"grad_norm": 0.2797233462333679,
"learning_rate": 0.0001880682905860269,
"loss": 1.1159,
"mean_token_accuracy": 0.6844299733638763,
"num_tokens": 4050555.0,
"step": 478
},
{
"entropy": 1.160698264837265,
"epoch": 0.19161916191619163,
"grad_norm": 0.2921246886253357,
"learning_rate": 0.00018801021115108862,
"loss": 1.1606,
"mean_token_accuracy": 0.6748001426458359,
"num_tokens": 4059040.0,
"step": 479
},
{
"entropy": 1.0824988782405853,
"epoch": 0.192019201920192,
"grad_norm": 0.29058167338371277,
"learning_rate": 0.000187952000789622,
"loss": 1.1117,
"mean_token_accuracy": 0.6949323862791061,
"num_tokens": 4067919.0,
"step": 480
},
{
"entropy": 1.1407755315303802,
"epoch": 0.1924192419241924,
"grad_norm": 0.3058508634567261,
"learning_rate": 0.00018789365959932303,
"loss": 1.1914,
"mean_token_accuracy": 0.6748262792825699,
"num_tokens": 4076495.0,
"step": 481
},
{
"entropy": 1.1213767230510712,
"epoch": 0.1928192819281928,
"grad_norm": 0.2868844270706177,
"learning_rate": 0.00018783518767810715,
"loss": 1.117,
"mean_token_accuracy": 0.6884360611438751,
"num_tokens": 4084846.0,
"step": 482
},
{
"entropy": 1.1594094932079315,
"epoch": 0.1932193219321932,
"grad_norm": 0.29103291034698486,
"learning_rate": 0.0001877765851241093,
"loss": 1.1595,
"mean_token_accuracy": 0.6784193813800812,
"num_tokens": 4093093.0,
"step": 483
},
{
"entropy": 1.0897391140460968,
"epoch": 0.1936193619361936,
"grad_norm": 0.29071077704429626,
"learning_rate": 0.00018771785203568366,
"loss": 1.0775,
"mean_token_accuracy": 0.6933843791484833,
"num_tokens": 4101392.0,
"step": 484
},
{
"entropy": 1.05050827562809,
"epoch": 0.19401940194019401,
"grad_norm": 0.2660689949989319,
"learning_rate": 0.00018765898851140345,
"loss": 1.003,
"mean_token_accuracy": 0.7151510417461395,
"num_tokens": 4110388.0,
"step": 485
},
{
"entropy": 1.1417682468891144,
"epoch": 0.19441944194419442,
"grad_norm": 0.2760656774044037,
"learning_rate": 0.00018759999465006087,
"loss": 1.1208,
"mean_token_accuracy": 0.6870895624160767,
"num_tokens": 4119451.0,
"step": 486
},
{
"entropy": 1.1158250570297241,
"epoch": 0.19481948194819482,
"grad_norm": 0.27844175696372986,
"learning_rate": 0.00018754087055066675,
"loss": 1.0741,
"mean_token_accuracy": 0.7000212967395782,
"num_tokens": 4127997.0,
"step": 487
},
{
"entropy": 1.0569812506437302,
"epoch": 0.19521952195219522,
"grad_norm": 0.28110507130622864,
"learning_rate": 0.00018748161631245065,
"loss": 1.0375,
"mean_token_accuracy": 0.7026449292898178,
"num_tokens": 4136878.0,
"step": 488
},
{
"entropy": 1.084457129240036,
"epoch": 0.19561956195619562,
"grad_norm": 0.26859092712402344,
"learning_rate": 0.00018742223203486042,
"loss": 1.0676,
"mean_token_accuracy": 0.6930870711803436,
"num_tokens": 4146324.0,
"step": 489
},
{
"entropy": 1.0949542820453644,
"epoch": 0.19601960196019602,
"grad_norm": 0.28605908155441284,
"learning_rate": 0.00018736271781756223,
"loss": 1.125,
"mean_token_accuracy": 0.6920661330223083,
"num_tokens": 4154496.0,
"step": 490
},
{
"entropy": 1.1369201838970184,
"epoch": 0.19641964196419642,
"grad_norm": 0.3030281364917755,
"learning_rate": 0.00018730307376044027,
"loss": 1.119,
"mean_token_accuracy": 0.6900736391544342,
"num_tokens": 4163381.0,
"step": 491
},
{
"entropy": 1.1063465178012848,
"epoch": 0.19681968196819682,
"grad_norm": 0.29392218589782715,
"learning_rate": 0.00018724329996359676,
"loss": 1.1376,
"mean_token_accuracy": 0.6872988492250443,
"num_tokens": 4172190.0,
"step": 492
},
{
"entropy": 1.1071143746376038,
"epoch": 0.19721972197219723,
"grad_norm": 0.28501084446907043,
"learning_rate": 0.00018718339652735154,
"loss": 1.1166,
"mean_token_accuracy": 0.6885866820812225,
"num_tokens": 4180585.0,
"step": 493
},
{
"entropy": 1.1584193706512451,
"epoch": 0.19761976197619763,
"grad_norm": 0.29230597615242004,
"learning_rate": 0.00018712336355224205,
"loss": 1.1594,
"mean_token_accuracy": 0.6756969690322876,
"num_tokens": 4188810.0,
"step": 494
},
{
"entropy": 1.0776985734701157,
"epoch": 0.19801980198019803,
"grad_norm": 0.2801620662212372,
"learning_rate": 0.0001870632011390232,
"loss": 1.0296,
"mean_token_accuracy": 0.7065073400735855,
"num_tokens": 4197309.0,
"step": 495
},
{
"entropy": 1.1805840134620667,
"epoch": 0.19841984198419843,
"grad_norm": 0.3022160530090332,
"learning_rate": 0.00018700290938866712,
"loss": 1.1913,
"mean_token_accuracy": 0.6692783236503601,
"num_tokens": 4205630.0,
"step": 496
},
{
"entropy": 1.0833539962768555,
"epoch": 0.19881988198819883,
"grad_norm": 0.306426078081131,
"learning_rate": 0.00018694248840236296,
"loss": 1.0954,
"mean_token_accuracy": 0.6928739845752716,
"num_tokens": 4214058.0,
"step": 497
},
{
"entropy": 1.0818894803524017,
"epoch": 0.19921992199219923,
"grad_norm": 0.2984001934528351,
"learning_rate": 0.00018688193828151682,
"loss": 1.0926,
"mean_token_accuracy": 0.6913997977972031,
"num_tokens": 4222853.0,
"step": 498
},
{
"entropy": 1.0889964997768402,
"epoch": 0.1996199619961996,
"grad_norm": 0.2939610481262207,
"learning_rate": 0.0001868212591277515,
"loss": 1.0606,
"mean_token_accuracy": 0.6952404677867889,
"num_tokens": 4231395.0,
"step": 499
},
{
"entropy": 1.0546657741069794,
"epoch": 0.2000200020002,
"grad_norm": 0.28841695189476013,
"learning_rate": 0.00018676045104290637,
"loss": 1.0682,
"mean_token_accuracy": 0.6971585303544998,
"num_tokens": 4240525.0,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 2500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.077959380899922e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}