diff --git "a/checkpoint-1080/trainer_state.json" "b/checkpoint-1080/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1080/trainer_state.json" @@ -0,0 +1,10867 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8, + "eval_steps": 360, + "global_step": 1080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.1883099973201752, + "epoch": 0.0016666666666666668, + "grad_norm": 0.31513479351997375, + "learning_rate": 0.0, + "loss": 2.0341, + "mean_token_accuracy": 0.5869412161409855, + "num_tokens": 12531.0, + "step": 1 + }, + { + "entropy": 1.2275140136480331, + "epoch": 0.0033333333333333335, + "grad_norm": 0.3110893964767456, + "learning_rate": 2.2222222222222225e-06, + "loss": 2.0736, + "mean_token_accuracy": 0.5821192935109138, + "num_tokens": 25080.0, + "step": 2 + }, + { + "entropy": 1.1326133534312248, + "epoch": 0.005, + "grad_norm": 0.32383376359939575, + "learning_rate": 4.444444444444445e-06, + "loss": 1.9606, + "mean_token_accuracy": 0.6036411970853806, + "num_tokens": 37694.0, + "step": 3 + }, + { + "entropy": 1.2363431453704834, + "epoch": 0.006666666666666667, + "grad_norm": 0.3201318383216858, + "learning_rate": 6.666666666666667e-06, + "loss": 2.0939, + "mean_token_accuracy": 0.5772385410964489, + "num_tokens": 50110.0, + "step": 4 + }, + { + "entropy": 1.1547652631998062, + "epoch": 0.008333333333333333, + "grad_norm": 0.3156285881996155, + "learning_rate": 8.88888888888889e-06, + "loss": 2.0016, + "mean_token_accuracy": 0.603407584130764, + "num_tokens": 62553.0, + "step": 5 + }, + { + "entropy": 1.221859723329544, + "epoch": 0.01, + "grad_norm": 0.32244929671287537, + "learning_rate": 1.1111111111111112e-05, + "loss": 2.0972, + "mean_token_accuracy": 0.577469028532505, + "num_tokens": 75072.0, + "step": 6 + }, + { + "entropy": 1.2124661356210709, + "epoch": 0.011666666666666667, + "grad_norm": 0.32469165325164795, + "learning_rate": 1.3333333333333333e-05, + "loss": 2.028, + "mean_token_accuracy": 0.5872330367565155, + "num_tokens": 87639.0, + "step": 7 + }, + { + "entropy": 1.1714164167642593, + "epoch": 0.013333333333333334, + "grad_norm": 0.3768642246723175, + "learning_rate": 1.5555555555555555e-05, + "loss": 2.0241, + "mean_token_accuracy": 0.585505448281765, + "num_tokens": 100287.0, + "step": 8 + }, + { + "entropy": 1.1553556099534035, + "epoch": 0.015, + "grad_norm": 0.37776222825050354, + "learning_rate": 1.777777777777778e-05, + "loss": 1.9622, + "mean_token_accuracy": 0.600151389837265, + "num_tokens": 113153.0, + "step": 9 + }, + { + "entropy": 1.1338117942214012, + "epoch": 0.016666666666666666, + "grad_norm": 0.3281095027923584, + "learning_rate": 2e-05, + "loss": 1.9465, + "mean_token_accuracy": 0.6039463356137276, + "num_tokens": 125548.0, + "step": 10 + }, + { + "entropy": 1.124964714050293, + "epoch": 0.018333333333333333, + "grad_norm": 0.35819536447525024, + "learning_rate": 2.2222222222222223e-05, + "loss": 1.9556, + "mean_token_accuracy": 0.6038774251937866, + "num_tokens": 138024.0, + "step": 11 + }, + { + "entropy": 1.247290499508381, + "epoch": 0.02, + "grad_norm": 0.40456947684288025, + "learning_rate": 2.4444444444444445e-05, + "loss": 2.0633, + "mean_token_accuracy": 0.5761790797114372, + "num_tokens": 150424.0, + "step": 12 + }, + { + "entropy": 1.2814347296953201, + "epoch": 0.021666666666666667, + "grad_norm": 0.40442585945129395, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.1256, + "mean_token_accuracy": 0.5704136714339256, + "num_tokens": 162786.0, + "step": 13 + }, + { + "entropy": 1.1805027946829796, + "epoch": 0.023333333333333334, + "grad_norm": 0.43369293212890625, + "learning_rate": 2.8888888888888888e-05, + "loss": 1.9641, + "mean_token_accuracy": 0.5983459949493408, + "num_tokens": 175429.0, + "step": 14 + }, + { + "entropy": 1.2493249326944351, + "epoch": 0.025, + "grad_norm": 0.4346776008605957, + "learning_rate": 3.111111111111111e-05, + "loss": 2.0526, + "mean_token_accuracy": 0.5874154344201088, + "num_tokens": 187863.0, + "step": 15 + }, + { + "entropy": 1.1698118671774864, + "epoch": 0.02666666666666667, + "grad_norm": 0.4407603144645691, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.9261, + "mean_token_accuracy": 0.6163465455174446, + "num_tokens": 200573.0, + "step": 16 + }, + { + "entropy": 1.189228169620037, + "epoch": 0.028333333333333332, + "grad_norm": 0.4984031617641449, + "learning_rate": 3.555555555555556e-05, + "loss": 1.9534, + "mean_token_accuracy": 0.5960768610239029, + "num_tokens": 213121.0, + "step": 17 + }, + { + "entropy": 1.234485238790512, + "epoch": 0.03, + "grad_norm": 0.545619547367096, + "learning_rate": 3.777777777777778e-05, + "loss": 2.0114, + "mean_token_accuracy": 0.582790918648243, + "num_tokens": 225055.0, + "step": 18 + }, + { + "entropy": 1.2693497836589813, + "epoch": 0.03166666666666667, + "grad_norm": 0.5483012199401855, + "learning_rate": 4e-05, + "loss": 2.0327, + "mean_token_accuracy": 0.5864806175231934, + "num_tokens": 237675.0, + "step": 19 + }, + { + "entropy": 1.3061316907405853, + "epoch": 0.03333333333333333, + "grad_norm": 0.6728277206420898, + "learning_rate": 4.222222222222222e-05, + "loss": 2.0134, + "mean_token_accuracy": 0.581995002925396, + "num_tokens": 250378.0, + "step": 20 + }, + { + "entropy": 1.3692396432161331, + "epoch": 0.035, + "grad_norm": 0.6627784371376038, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.0612, + "mean_token_accuracy": 0.5739701353013515, + "num_tokens": 262331.0, + "step": 21 + }, + { + "entropy": 1.240286722779274, + "epoch": 0.03666666666666667, + "grad_norm": 0.6716634631156921, + "learning_rate": 4.666666666666667e-05, + "loss": 1.8739, + "mean_token_accuracy": 0.6026649698615074, + "num_tokens": 274867.0, + "step": 22 + }, + { + "entropy": 1.1950730830430984, + "epoch": 0.03833333333333333, + "grad_norm": 0.6836673617362976, + "learning_rate": 4.888888888888889e-05, + "loss": 1.7975, + "mean_token_accuracy": 0.616993211209774, + "num_tokens": 287375.0, + "step": 23 + }, + { + "entropy": 1.3049941956996918, + "epoch": 0.04, + "grad_norm": 0.7128519415855408, + "learning_rate": 5.111111111111111e-05, + "loss": 1.9134, + "mean_token_accuracy": 0.5976476445794106, + "num_tokens": 299836.0, + "step": 24 + }, + { + "entropy": 1.3695744276046753, + "epoch": 0.041666666666666664, + "grad_norm": 1.17112398147583, + "learning_rate": 5.333333333333333e-05, + "loss": 1.9372, + "mean_token_accuracy": 0.5883589163422585, + "num_tokens": 312372.0, + "step": 25 + }, + { + "entropy": 1.1785986423492432, + "epoch": 0.043333333333333335, + "grad_norm": 0.625380277633667, + "learning_rate": 5.555555555555556e-05, + "loss": 1.6627, + "mean_token_accuracy": 0.6405750215053558, + "num_tokens": 324984.0, + "step": 26 + }, + { + "entropy": 1.2021356672048569, + "epoch": 0.045, + "grad_norm": 0.7430734038352966, + "learning_rate": 5.7777777777777776e-05, + "loss": 1.6448, + "mean_token_accuracy": 0.6367640718817711, + "num_tokens": 337559.0, + "step": 27 + }, + { + "entropy": 1.3892791867256165, + "epoch": 0.04666666666666667, + "grad_norm": 0.7540942430496216, + "learning_rate": 6e-05, + "loss": 1.8673, + "mean_token_accuracy": 0.6006477549672127, + "num_tokens": 349939.0, + "step": 28 + }, + { + "entropy": 1.2635403275489807, + "epoch": 0.04833333333333333, + "grad_norm": 0.5877339839935303, + "learning_rate": 6.222222222222222e-05, + "loss": 1.6482, + "mean_token_accuracy": 0.6342698633670807, + "num_tokens": 362517.0, + "step": 29 + }, + { + "entropy": 1.1776714846491814, + "epoch": 0.05, + "grad_norm": 0.6017478108406067, + "learning_rate": 6.444444444444446e-05, + "loss": 1.4892, + "mean_token_accuracy": 0.6633898839354515, + "num_tokens": 375048.0, + "step": 30 + }, + { + "entropy": 1.3733180239796638, + "epoch": 0.051666666666666666, + "grad_norm": 0.4876861274242401, + "learning_rate": 6.666666666666667e-05, + "loss": 1.6464, + "mean_token_accuracy": 0.6362894400954247, + "num_tokens": 387684.0, + "step": 31 + }, + { + "entropy": 1.3747638314962387, + "epoch": 0.05333333333333334, + "grad_norm": 0.5345461368560791, + "learning_rate": 6.88888888888889e-05, + "loss": 1.6545, + "mean_token_accuracy": 0.6348208487033844, + "num_tokens": 400150.0, + "step": 32 + }, + { + "entropy": 1.2881105542182922, + "epoch": 0.055, + "grad_norm": 0.5066477656364441, + "learning_rate": 7.111111111111112e-05, + "loss": 1.4894, + "mean_token_accuracy": 0.6688544601202011, + "num_tokens": 412800.0, + "step": 33 + }, + { + "entropy": 1.546808585524559, + "epoch": 0.056666666666666664, + "grad_norm": 0.46314069628715515, + "learning_rate": 7.333333333333333e-05, + "loss": 1.7549, + "mean_token_accuracy": 0.6177285388112068, + "num_tokens": 425287.0, + "step": 34 + }, + { + "entropy": 1.343202069401741, + "epoch": 0.058333333333333334, + "grad_norm": 0.4545861780643463, + "learning_rate": 7.555555555555556e-05, + "loss": 1.4505, + "mean_token_accuracy": 0.6723693758249283, + "num_tokens": 437762.0, + "step": 35 + }, + { + "entropy": 1.4139084219932556, + "epoch": 0.06, + "grad_norm": 0.48711973428726196, + "learning_rate": 7.777777777777778e-05, + "loss": 1.5045, + "mean_token_accuracy": 0.6590398252010345, + "num_tokens": 450223.0, + "step": 36 + }, + { + "entropy": 1.3924538046121597, + "epoch": 0.06166666666666667, + "grad_norm": 0.47148916125297546, + "learning_rate": 8e-05, + "loss": 1.4763, + "mean_token_accuracy": 0.671143427491188, + "num_tokens": 462605.0, + "step": 37 + }, + { + "entropy": 1.3940246105194092, + "epoch": 0.06333333333333334, + "grad_norm": 0.532814621925354, + "learning_rate": 8.222222222222222e-05, + "loss": 1.4072, + "mean_token_accuracy": 0.6783580556511879, + "num_tokens": 475221.0, + "step": 38 + }, + { + "entropy": 1.4382469952106476, + "epoch": 0.065, + "grad_norm": 0.4498147666454315, + "learning_rate": 8.444444444444444e-05, + "loss": 1.4616, + "mean_token_accuracy": 0.6687446236610413, + "num_tokens": 487648.0, + "step": 39 + }, + { + "entropy": 1.3836999088525772, + "epoch": 0.06666666666666667, + "grad_norm": 0.4689527153968811, + "learning_rate": 8.666666666666667e-05, + "loss": 1.3839, + "mean_token_accuracy": 0.6905905231833458, + "num_tokens": 500175.0, + "step": 40 + }, + { + "entropy": 1.3075188547372818, + "epoch": 0.06833333333333333, + "grad_norm": 0.45789051055908203, + "learning_rate": 8.888888888888889e-05, + "loss": 1.2932, + "mean_token_accuracy": 0.7051132321357727, + "num_tokens": 512860.0, + "step": 41 + }, + { + "entropy": 1.2035221755504608, + "epoch": 0.07, + "grad_norm": 0.48102283477783203, + "learning_rate": 9.111111111111112e-05, + "loss": 1.2177, + "mean_token_accuracy": 0.7276435941457748, + "num_tokens": 525540.0, + "step": 42 + }, + { + "entropy": 1.2424119412899017, + "epoch": 0.07166666666666667, + "grad_norm": 0.48856499791145325, + "learning_rate": 9.333333333333334e-05, + "loss": 1.244, + "mean_token_accuracy": 0.7171106263995171, + "num_tokens": 538098.0, + "step": 43 + }, + { + "entropy": 1.2872763872146606, + "epoch": 0.07333333333333333, + "grad_norm": 0.46277451515197754, + "learning_rate": 9.555555555555557e-05, + "loss": 1.3318, + "mean_token_accuracy": 0.7009768709540367, + "num_tokens": 550788.0, + "step": 44 + }, + { + "entropy": 1.3774635940790176, + "epoch": 0.075, + "grad_norm": 0.48092466592788696, + "learning_rate": 9.777777777777778e-05, + "loss": 1.4124, + "mean_token_accuracy": 0.6832538694143295, + "num_tokens": 563184.0, + "step": 45 + }, + { + "entropy": 1.1826415210962296, + "epoch": 0.07666666666666666, + "grad_norm": 0.395088255405426, + "learning_rate": 0.0001, + "loss": 1.2277, + "mean_token_accuracy": 0.7197900637984276, + "num_tokens": 575833.0, + "step": 46 + }, + { + "entropy": 1.2140811532735825, + "epoch": 0.07833333333333334, + "grad_norm": 0.3708420693874359, + "learning_rate": 0.00010222222222222222, + "loss": 1.3184, + "mean_token_accuracy": 0.7010470479726791, + "num_tokens": 588340.0, + "step": 47 + }, + { + "entropy": 1.1609551459550858, + "epoch": 0.08, + "grad_norm": 0.29149869084358215, + "learning_rate": 0.00010444444444444445, + "loss": 1.2231, + "mean_token_accuracy": 0.7239813506603241, + "num_tokens": 600817.0, + "step": 48 + }, + { + "entropy": 1.094582460820675, + "epoch": 0.08166666666666667, + "grad_norm": 0.31041696667671204, + "learning_rate": 0.00010666666666666667, + "loss": 1.1622, + "mean_token_accuracy": 0.7331580147147179, + "num_tokens": 613072.0, + "step": 49 + }, + { + "entropy": 1.2046773582696915, + "epoch": 0.08333333333333333, + "grad_norm": 0.34948456287384033, + "learning_rate": 0.00010888888888888889, + "loss": 1.3382, + "mean_token_accuracy": 0.7008231580257416, + "num_tokens": 625279.0, + "step": 50 + }, + { + "entropy": 1.1427595689892769, + "epoch": 0.085, + "grad_norm": 0.30553025007247925, + "learning_rate": 0.00011111111111111112, + "loss": 1.1741, + "mean_token_accuracy": 0.7270924001932144, + "num_tokens": 637559.0, + "step": 51 + }, + { + "entropy": 1.0716800168156624, + "epoch": 0.08666666666666667, + "grad_norm": 0.3007242679595947, + "learning_rate": 0.00011333333333333334, + "loss": 1.1271, + "mean_token_accuracy": 0.7399003505706787, + "num_tokens": 650264.0, + "step": 52 + }, + { + "entropy": 1.3152555897831917, + "epoch": 0.08833333333333333, + "grad_norm": 0.4258122146129608, + "learning_rate": 0.00011555555555555555, + "loss": 1.3586, + "mean_token_accuracy": 0.6957122161984444, + "num_tokens": 662826.0, + "step": 53 + }, + { + "entropy": 1.2838200628757477, + "epoch": 0.09, + "grad_norm": 0.32665765285491943, + "learning_rate": 0.00011777777777777779, + "loss": 1.3561, + "mean_token_accuracy": 0.6944003701210022, + "num_tokens": 675585.0, + "step": 54 + }, + { + "entropy": 1.0607134401798248, + "epoch": 0.09166666666666666, + "grad_norm": 0.30055567622184753, + "learning_rate": 0.00012, + "loss": 1.1042, + "mean_token_accuracy": 0.7346132323145866, + "num_tokens": 688083.0, + "step": 55 + }, + { + "entropy": 1.2985290735960007, + "epoch": 0.09333333333333334, + "grad_norm": 0.2808513343334198, + "learning_rate": 0.00012222222222222224, + "loss": 1.346, + "mean_token_accuracy": 0.6928954645991325, + "num_tokens": 700448.0, + "step": 56 + }, + { + "entropy": 1.1469447389245033, + "epoch": 0.095, + "grad_norm": 0.3286835551261902, + "learning_rate": 0.00012444444444444444, + "loss": 1.168, + "mean_token_accuracy": 0.7282102331519127, + "num_tokens": 713117.0, + "step": 57 + }, + { + "entropy": 1.211227871477604, + "epoch": 0.09666666666666666, + "grad_norm": 0.2934599220752716, + "learning_rate": 0.00012666666666666666, + "loss": 1.2521, + "mean_token_accuracy": 0.7141993716359138, + "num_tokens": 725605.0, + "step": 58 + }, + { + "entropy": 1.1839049607515335, + "epoch": 0.09833333333333333, + "grad_norm": 0.3096744120121002, + "learning_rate": 0.00012888888888888892, + "loss": 1.2122, + "mean_token_accuracy": 0.7217304483056068, + "num_tokens": 738207.0, + "step": 59 + }, + { + "entropy": 0.9791679158806801, + "epoch": 0.1, + "grad_norm": 0.2698657810688019, + "learning_rate": 0.00013111111111111111, + "loss": 1.0068, + "mean_token_accuracy": 0.7672298699617386, + "num_tokens": 750830.0, + "step": 60 + }, + { + "entropy": 1.1061433926224709, + "epoch": 0.10166666666666667, + "grad_norm": 0.32427090406417847, + "learning_rate": 0.00013333333333333334, + "loss": 1.151, + "mean_token_accuracy": 0.7332676723599434, + "num_tokens": 763526.0, + "step": 61 + }, + { + "entropy": 1.0617150589823723, + "epoch": 0.10333333333333333, + "grad_norm": 0.2928755581378937, + "learning_rate": 0.00013555555555555556, + "loss": 1.0784, + "mean_token_accuracy": 0.7479719892144203, + "num_tokens": 776010.0, + "step": 62 + }, + { + "entropy": 1.0685010254383087, + "epoch": 0.105, + "grad_norm": 0.35550597310066223, + "learning_rate": 0.0001377777777777778, + "loss": 1.1365, + "mean_token_accuracy": 0.7320908978581429, + "num_tokens": 788523.0, + "step": 63 + }, + { + "entropy": 1.2497737780213356, + "epoch": 0.10666666666666667, + "grad_norm": 0.32172951102256775, + "learning_rate": 0.00014, + "loss": 1.277, + "mean_token_accuracy": 0.7090576663613319, + "num_tokens": 801026.0, + "step": 64 + }, + { + "entropy": 1.212308518588543, + "epoch": 0.10833333333333334, + "grad_norm": 0.30178961157798767, + "learning_rate": 0.00014222222222222224, + "loss": 1.2619, + "mean_token_accuracy": 0.712481826543808, + "num_tokens": 813435.0, + "step": 65 + }, + { + "entropy": 1.197321593761444, + "epoch": 0.11, + "grad_norm": 0.28265058994293213, + "learning_rate": 0.00014444444444444444, + "loss": 1.251, + "mean_token_accuracy": 0.7203180119395256, + "num_tokens": 825830.0, + "step": 66 + }, + { + "entropy": 1.0528302267193794, + "epoch": 0.11166666666666666, + "grad_norm": 0.2757464349269867, + "learning_rate": 0.00014666666666666666, + "loss": 1.0901, + "mean_token_accuracy": 0.75026024132967, + "num_tokens": 838764.0, + "step": 67 + }, + { + "entropy": 1.2962219715118408, + "epoch": 0.11333333333333333, + "grad_norm": 0.3328060209751129, + "learning_rate": 0.0001488888888888889, + "loss": 1.3067, + "mean_token_accuracy": 0.704713948071003, + "num_tokens": 851146.0, + "step": 68 + }, + { + "entropy": 1.1621850430965424, + "epoch": 0.115, + "grad_norm": 0.32625430822372437, + "learning_rate": 0.0001511111111111111, + "loss": 1.1708, + "mean_token_accuracy": 0.7335870340466499, + "num_tokens": 863579.0, + "step": 69 + }, + { + "entropy": 1.1265803426504135, + "epoch": 0.11666666666666667, + "grad_norm": 0.34571829438209534, + "learning_rate": 0.00015333333333333334, + "loss": 1.1124, + "mean_token_accuracy": 0.7329849451780319, + "num_tokens": 875785.0, + "step": 70 + }, + { + "entropy": 1.0563486441969872, + "epoch": 0.11833333333333333, + "grad_norm": 0.34876880049705505, + "learning_rate": 0.00015555555555555556, + "loss": 1.0546, + "mean_token_accuracy": 0.7550168186426163, + "num_tokens": 888578.0, + "step": 71 + }, + { + "entropy": 1.1433459967374802, + "epoch": 0.12, + "grad_norm": 0.39345014095306396, + "learning_rate": 0.0001577777777777778, + "loss": 1.1505, + "mean_token_accuracy": 0.7349046617746353, + "num_tokens": 901173.0, + "step": 72 + }, + { + "entropy": 1.2071392461657524, + "epoch": 0.12166666666666667, + "grad_norm": 0.3662756681442261, + "learning_rate": 0.00016, + "loss": 1.2599, + "mean_token_accuracy": 0.7073846533894539, + "num_tokens": 913731.0, + "step": 73 + }, + { + "entropy": 1.1201880425214767, + "epoch": 0.12333333333333334, + "grad_norm": 0.34027227759361267, + "learning_rate": 0.00016222222222222224, + "loss": 1.1595, + "mean_token_accuracy": 0.7306345105171204, + "num_tokens": 926306.0, + "step": 74 + }, + { + "entropy": 1.1337391138076782, + "epoch": 0.125, + "grad_norm": 0.45978736877441406, + "learning_rate": 0.00016444444444444444, + "loss": 1.2174, + "mean_token_accuracy": 0.7151773795485497, + "num_tokens": 938678.0, + "step": 75 + }, + { + "entropy": 1.1473943069577217, + "epoch": 0.12666666666666668, + "grad_norm": 0.662312388420105, + "learning_rate": 0.0001666666666666667, + "loss": 1.2348, + "mean_token_accuracy": 0.7175232917070389, + "num_tokens": 951039.0, + "step": 76 + }, + { + "entropy": 1.0714805275201797, + "epoch": 0.12833333333333333, + "grad_norm": 0.30166077613830566, + "learning_rate": 0.00016888888888888889, + "loss": 1.1602, + "mean_token_accuracy": 0.7314794659614563, + "num_tokens": 963428.0, + "step": 77 + }, + { + "entropy": 1.2219713553786278, + "epoch": 0.13, + "grad_norm": 0.269293874502182, + "learning_rate": 0.0001711111111111111, + "loss": 1.2639, + "mean_token_accuracy": 0.7121034041047096, + "num_tokens": 975918.0, + "step": 78 + }, + { + "entropy": 1.1331642344594002, + "epoch": 0.13166666666666665, + "grad_norm": 0.289949893951416, + "learning_rate": 0.00017333333333333334, + "loss": 1.202, + "mean_token_accuracy": 0.720516249537468, + "num_tokens": 988256.0, + "step": 79 + }, + { + "entropy": 1.199205830693245, + "epoch": 0.13333333333333333, + "grad_norm": 0.384086549282074, + "learning_rate": 0.00017555555555555556, + "loss": 1.2254, + "mean_token_accuracy": 0.7111634537577629, + "num_tokens": 1000806.0, + "step": 80 + }, + { + "entropy": 1.0977754443883896, + "epoch": 0.135, + "grad_norm": 0.2883825898170471, + "learning_rate": 0.00017777777777777779, + "loss": 1.1091, + "mean_token_accuracy": 0.7458898946642876, + "num_tokens": 1013837.0, + "step": 81 + }, + { + "entropy": 1.168524369597435, + "epoch": 0.13666666666666666, + "grad_norm": 0.38602787256240845, + "learning_rate": 0.00018, + "loss": 1.1589, + "mean_token_accuracy": 0.7247432917356491, + "num_tokens": 1026588.0, + "step": 82 + }, + { + "entropy": 1.1804613769054413, + "epoch": 0.13833333333333334, + "grad_norm": 0.4096560478210449, + "learning_rate": 0.00018222222222222224, + "loss": 1.1928, + "mean_token_accuracy": 0.7234074100852013, + "num_tokens": 1038970.0, + "step": 83 + }, + { + "entropy": 1.220597304403782, + "epoch": 0.14, + "grad_norm": 0.34397998452186584, + "learning_rate": 0.00018444444444444446, + "loss": 1.2601, + "mean_token_accuracy": 0.7074739634990692, + "num_tokens": 1051554.0, + "step": 84 + }, + { + "entropy": 0.9805111661553383, + "epoch": 0.14166666666666666, + "grad_norm": 0.5971760153770447, + "learning_rate": 0.0001866666666666667, + "loss": 1.0317, + "mean_token_accuracy": 0.7597767561674118, + "num_tokens": 1064327.0, + "step": 85 + }, + { + "entropy": 1.1483803391456604, + "epoch": 0.14333333333333334, + "grad_norm": 0.3001822233200073, + "learning_rate": 0.00018888888888888888, + "loss": 1.2179, + "mean_token_accuracy": 0.7179044857621193, + "num_tokens": 1076977.0, + "step": 86 + }, + { + "entropy": 1.1420434266328812, + "epoch": 0.145, + "grad_norm": 0.5787074565887451, + "learning_rate": 0.00019111111111111114, + "loss": 1.1715, + "mean_token_accuracy": 0.7311113104224205, + "num_tokens": 1089404.0, + "step": 87 + }, + { + "entropy": 1.2739483416080475, + "epoch": 0.14666666666666667, + "grad_norm": 0.4274302124977112, + "learning_rate": 0.00019333333333333333, + "loss": 1.3089, + "mean_token_accuracy": 0.6991681382060051, + "num_tokens": 1101887.0, + "step": 88 + }, + { + "entropy": 1.0925018265843391, + "epoch": 0.14833333333333334, + "grad_norm": 0.3573361933231354, + "learning_rate": 0.00019555555555555556, + "loss": 1.1291, + "mean_token_accuracy": 0.7353638261556625, + "num_tokens": 1114866.0, + "step": 89 + }, + { + "entropy": 1.3409122675657272, + "epoch": 0.15, + "grad_norm": 0.486543208360672, + "learning_rate": 0.00019777777777777778, + "loss": 1.3796, + "mean_token_accuracy": 0.6867416799068451, + "num_tokens": 1127433.0, + "step": 90 + }, + { + "entropy": 1.087234228849411, + "epoch": 0.15166666666666667, + "grad_norm": 0.3652050793170929, + "learning_rate": 0.0002, + "loss": 1.1062, + "mean_token_accuracy": 0.7423817366361618, + "num_tokens": 1139907.0, + "step": 91 + }, + { + "entropy": 1.2978694140911102, + "epoch": 0.15333333333333332, + "grad_norm": 0.42118868231773376, + "learning_rate": 0.0001998830409356725, + "loss": 1.3141, + "mean_token_accuracy": 0.6988303884863853, + "num_tokens": 1152242.0, + "step": 92 + }, + { + "entropy": 0.991526797413826, + "epoch": 0.155, + "grad_norm": 0.7384818196296692, + "learning_rate": 0.00019976608187134506, + "loss": 1.0096, + "mean_token_accuracy": 0.760047473013401, + "num_tokens": 1164770.0, + "step": 93 + }, + { + "entropy": 1.1662617474794388, + "epoch": 0.15666666666666668, + "grad_norm": 0.33492511510849, + "learning_rate": 0.00019964912280701755, + "loss": 1.1827, + "mean_token_accuracy": 0.7286000698804855, + "num_tokens": 1177211.0, + "step": 94 + }, + { + "entropy": 0.9847327098250389, + "epoch": 0.15833333333333333, + "grad_norm": 0.5884389877319336, + "learning_rate": 0.00019953216374269005, + "loss": 1.0243, + "mean_token_accuracy": 0.7573123648762703, + "num_tokens": 1189909.0, + "step": 95 + }, + { + "entropy": 1.0150301530957222, + "epoch": 0.16, + "grad_norm": 0.38942259550094604, + "learning_rate": 0.0001994152046783626, + "loss": 1.0619, + "mean_token_accuracy": 0.7528154477477074, + "num_tokens": 1202264.0, + "step": 96 + }, + { + "entropy": 1.0576318800449371, + "epoch": 0.16166666666666665, + "grad_norm": 0.46958282589912415, + "learning_rate": 0.0001992982456140351, + "loss": 1.0924, + "mean_token_accuracy": 0.7439210563898087, + "num_tokens": 1214929.0, + "step": 97 + }, + { + "entropy": 1.098344899713993, + "epoch": 0.16333333333333333, + "grad_norm": 0.45931607484817505, + "learning_rate": 0.0001991812865497076, + "loss": 1.1208, + "mean_token_accuracy": 0.731578603386879, + "num_tokens": 1227226.0, + "step": 98 + }, + { + "entropy": 1.154830977320671, + "epoch": 0.165, + "grad_norm": 0.3942425549030304, + "learning_rate": 0.00019906432748538014, + "loss": 1.1668, + "mean_token_accuracy": 0.7254332229495049, + "num_tokens": 1239739.0, + "step": 99 + }, + { + "entropy": 1.0297606065869331, + "epoch": 0.16666666666666666, + "grad_norm": 0.316371351480484, + "learning_rate": 0.00019894736842105264, + "loss": 1.0509, + "mean_token_accuracy": 0.7479109838604927, + "num_tokens": 1252407.0, + "step": 100 + }, + { + "entropy": 1.0756673142313957, + "epoch": 0.16833333333333333, + "grad_norm": 0.7114313244819641, + "learning_rate": 0.00019883040935672513, + "loss": 1.0986, + "mean_token_accuracy": 0.7459438368678093, + "num_tokens": 1264801.0, + "step": 101 + }, + { + "entropy": 0.973985955119133, + "epoch": 0.17, + "grad_norm": 0.4525020122528076, + "learning_rate": 0.00019871345029239768, + "loss": 0.9842, + "mean_token_accuracy": 0.7690640017390251, + "num_tokens": 1277422.0, + "step": 102 + }, + { + "entropy": 1.0724262371659279, + "epoch": 0.17166666666666666, + "grad_norm": 0.5805977582931519, + "learning_rate": 0.00019859649122807018, + "loss": 1.1007, + "mean_token_accuracy": 0.7470309287309647, + "num_tokens": 1289893.0, + "step": 103 + }, + { + "entropy": 1.093192383646965, + "epoch": 0.17333333333333334, + "grad_norm": 0.391726553440094, + "learning_rate": 0.0001984795321637427, + "loss": 1.1168, + "mean_token_accuracy": 0.7345704063773155, + "num_tokens": 1302535.0, + "step": 104 + }, + { + "entropy": 1.030884176492691, + "epoch": 0.175, + "grad_norm": 0.4719918668270111, + "learning_rate": 0.00019836257309941522, + "loss": 1.0402, + "mean_token_accuracy": 0.7528453394770622, + "num_tokens": 1315365.0, + "step": 105 + }, + { + "entropy": 1.022902749478817, + "epoch": 0.17666666666666667, + "grad_norm": 0.4833899736404419, + "learning_rate": 0.00019824561403508772, + "loss": 1.0441, + "mean_token_accuracy": 0.7529502660036087, + "num_tokens": 1327961.0, + "step": 106 + }, + { + "entropy": 1.1461059749126434, + "epoch": 0.17833333333333334, + "grad_norm": 0.33028945326805115, + "learning_rate": 0.00019812865497076024, + "loss": 1.189, + "mean_token_accuracy": 0.7240455821156502, + "num_tokens": 1340552.0, + "step": 107 + }, + { + "entropy": 1.0696540772914886, + "epoch": 0.18, + "grad_norm": 0.39427050948143005, + "learning_rate": 0.00019801169590643277, + "loss": 1.0901, + "mean_token_accuracy": 0.7402398586273193, + "num_tokens": 1352996.0, + "step": 108 + }, + { + "entropy": 1.2089848518371582, + "epoch": 0.18166666666666667, + "grad_norm": 0.49633994698524475, + "learning_rate": 0.00019789473684210526, + "loss": 1.2587, + "mean_token_accuracy": 0.7089790180325508, + "num_tokens": 1365591.0, + "step": 109 + }, + { + "entropy": 1.1154760420322418, + "epoch": 0.18333333333333332, + "grad_norm": 0.3988918662071228, + "learning_rate": 0.00019777777777777778, + "loss": 1.1758, + "mean_token_accuracy": 0.728231742978096, + "num_tokens": 1378160.0, + "step": 110 + }, + { + "entropy": 1.1121669262647629, + "epoch": 0.185, + "grad_norm": 0.38943156599998474, + "learning_rate": 0.0001976608187134503, + "loss": 1.1454, + "mean_token_accuracy": 0.7311063706874847, + "num_tokens": 1390786.0, + "step": 111 + }, + { + "entropy": 1.0965967029333115, + "epoch": 0.18666666666666668, + "grad_norm": 0.5972121357917786, + "learning_rate": 0.0001975438596491228, + "loss": 1.1035, + "mean_token_accuracy": 0.7371370121836662, + "num_tokens": 1403473.0, + "step": 112 + }, + { + "entropy": 0.9760538339614868, + "epoch": 0.18833333333333332, + "grad_norm": 0.32120487093925476, + "learning_rate": 0.00019742690058479533, + "loss": 0.9931, + "mean_token_accuracy": 0.7640335187315941, + "num_tokens": 1415959.0, + "step": 113 + }, + { + "entropy": 1.1588884815573692, + "epoch": 0.19, + "grad_norm": 0.6615768074989319, + "learning_rate": 0.00019730994152046785, + "loss": 1.1925, + "mean_token_accuracy": 0.7154464647173882, + "num_tokens": 1428492.0, + "step": 114 + }, + { + "entropy": 1.0532007440924644, + "epoch": 0.19166666666666668, + "grad_norm": 0.4122265875339508, + "learning_rate": 0.00019719298245614035, + "loss": 1.0635, + "mean_token_accuracy": 0.7488418594002724, + "num_tokens": 1440911.0, + "step": 115 + }, + { + "entropy": 1.2064021080732346, + "epoch": 0.19333333333333333, + "grad_norm": 0.4100744128227234, + "learning_rate": 0.00019707602339181287, + "loss": 1.2092, + "mean_token_accuracy": 0.7215066328644753, + "num_tokens": 1453573.0, + "step": 116 + }, + { + "entropy": 0.980360358953476, + "epoch": 0.195, + "grad_norm": 0.4821430444717407, + "learning_rate": 0.0001969590643274854, + "loss": 0.9949, + "mean_token_accuracy": 0.76118303835392, + "num_tokens": 1466030.0, + "step": 117 + }, + { + "entropy": 1.1557006016373634, + "epoch": 0.19666666666666666, + "grad_norm": 0.4301220774650574, + "learning_rate": 0.0001968421052631579, + "loss": 1.207, + "mean_token_accuracy": 0.7246038690209389, + "num_tokens": 1478409.0, + "step": 118 + }, + { + "entropy": 0.994934193789959, + "epoch": 0.19833333333333333, + "grad_norm": 0.33380362391471863, + "learning_rate": 0.0001967251461988304, + "loss": 1.0463, + "mean_token_accuracy": 0.7522767782211304, + "num_tokens": 1490956.0, + "step": 119 + }, + { + "entropy": 1.0550952181220055, + "epoch": 0.2, + "grad_norm": 0.43042588233947754, + "learning_rate": 0.00019660818713450293, + "loss": 1.1072, + "mean_token_accuracy": 0.7346431165933609, + "num_tokens": 1503441.0, + "step": 120 + }, + { + "entropy": 1.3361243903636932, + "epoch": 0.20166666666666666, + "grad_norm": 0.40949326753616333, + "learning_rate": 0.00019649122807017543, + "loss": 1.4268, + "mean_token_accuracy": 0.6733630150556564, + "num_tokens": 1515872.0, + "step": 121 + }, + { + "entropy": 0.9732860177755356, + "epoch": 0.20333333333333334, + "grad_norm": 0.35589590668678284, + "learning_rate": 0.00019637426900584798, + "loss": 1.0341, + "mean_token_accuracy": 0.759332112967968, + "num_tokens": 1528695.0, + "step": 122 + }, + { + "entropy": 1.0260741487145424, + "epoch": 0.205, + "grad_norm": 0.4480772614479065, + "learning_rate": 0.00019625730994152048, + "loss": 1.0118, + "mean_token_accuracy": 0.7584766522049904, + "num_tokens": 1541198.0, + "step": 123 + }, + { + "entropy": 1.1190374419093132, + "epoch": 0.20666666666666667, + "grad_norm": 0.7018595933914185, + "learning_rate": 0.000196140350877193, + "loss": 1.1102, + "mean_token_accuracy": 0.7376047149300575, + "num_tokens": 1553802.0, + "step": 124 + }, + { + "entropy": 1.1323942467570305, + "epoch": 0.20833333333333334, + "grad_norm": 0.32317179441452026, + "learning_rate": 0.00019602339181286552, + "loss": 1.1489, + "mean_token_accuracy": 0.7312273606657982, + "num_tokens": 1566099.0, + "step": 125 + }, + { + "entropy": 1.2469940930604935, + "epoch": 0.21, + "grad_norm": 0.4663606286048889, + "learning_rate": 0.00019590643274853802, + "loss": 1.2628, + "mean_token_accuracy": 0.7112779766321182, + "num_tokens": 1578467.0, + "step": 126 + }, + { + "entropy": 1.117019146680832, + "epoch": 0.21166666666666667, + "grad_norm": 0.5317836403846741, + "learning_rate": 0.00019578947368421054, + "loss": 1.1085, + "mean_token_accuracy": 0.7378982827067375, + "num_tokens": 1591089.0, + "step": 127 + }, + { + "entropy": 1.2175140753388405, + "epoch": 0.21333333333333335, + "grad_norm": 0.38012969493865967, + "learning_rate": 0.00019567251461988306, + "loss": 1.2299, + "mean_token_accuracy": 0.717160701751709, + "num_tokens": 1603396.0, + "step": 128 + }, + { + "entropy": 1.1809884086251259, + "epoch": 0.215, + "grad_norm": 0.40074971318244934, + "learning_rate": 0.00019555555555555556, + "loss": 1.1807, + "mean_token_accuracy": 0.7240558713674545, + "num_tokens": 1615698.0, + "step": 129 + }, + { + "entropy": 0.9892362505197525, + "epoch": 0.21666666666666667, + "grad_norm": 0.44255295395851135, + "learning_rate": 0.00019543859649122808, + "loss": 1.0268, + "mean_token_accuracy": 0.7633307725191116, + "num_tokens": 1628080.0, + "step": 130 + }, + { + "entropy": 1.1162428334355354, + "epoch": 0.21833333333333332, + "grad_norm": 0.3332391679286957, + "learning_rate": 0.0001953216374269006, + "loss": 1.171, + "mean_token_accuracy": 0.7306881099939346, + "num_tokens": 1640529.0, + "step": 131 + }, + { + "entropy": 1.0377461314201355, + "epoch": 0.22, + "grad_norm": 0.33684980869293213, + "learning_rate": 0.0001952046783625731, + "loss": 1.1064, + "mean_token_accuracy": 0.7430030331015587, + "num_tokens": 1653382.0, + "step": 132 + }, + { + "entropy": 1.156599409878254, + "epoch": 0.22166666666666668, + "grad_norm": 0.42173266410827637, + "learning_rate": 0.00019508771929824562, + "loss": 1.2381, + "mean_token_accuracy": 0.7101339101791382, + "num_tokens": 1666061.0, + "step": 133 + }, + { + "entropy": 1.1487335935235023, + "epoch": 0.22333333333333333, + "grad_norm": 0.5505969524383545, + "learning_rate": 0.00019497076023391815, + "loss": 1.1949, + "mean_token_accuracy": 0.7228164002299309, + "num_tokens": 1678760.0, + "step": 134 + }, + { + "entropy": 1.1205493286252022, + "epoch": 0.225, + "grad_norm": 0.4036419987678528, + "learning_rate": 0.00019485380116959064, + "loss": 1.1353, + "mean_token_accuracy": 0.7325499951839447, + "num_tokens": 1691086.0, + "step": 135 + }, + { + "entropy": 1.023528330028057, + "epoch": 0.22666666666666666, + "grad_norm": 0.4935557246208191, + "learning_rate": 0.00019473684210526317, + "loss": 1.0273, + "mean_token_accuracy": 0.7585752308368683, + "num_tokens": 1703917.0, + "step": 136 + }, + { + "entropy": 1.3035095483064651, + "epoch": 0.22833333333333333, + "grad_norm": 0.4646526277065277, + "learning_rate": 0.0001946198830409357, + "loss": 1.337, + "mean_token_accuracy": 0.6854947060346603, + "num_tokens": 1716473.0, + "step": 137 + }, + { + "entropy": 1.0940191820263863, + "epoch": 0.23, + "grad_norm": 0.5709595680236816, + "learning_rate": 0.00019450292397660819, + "loss": 1.1138, + "mean_token_accuracy": 0.7348800003528595, + "num_tokens": 1728903.0, + "step": 138 + }, + { + "entropy": 1.1010525897145271, + "epoch": 0.23166666666666666, + "grad_norm": 0.28354716300964355, + "learning_rate": 0.0001943859649122807, + "loss": 1.1311, + "mean_token_accuracy": 0.7405002191662788, + "num_tokens": 1741242.0, + "step": 139 + }, + { + "entropy": 1.167225994169712, + "epoch": 0.23333333333333334, + "grad_norm": 0.47398078441619873, + "learning_rate": 0.00019426900584795323, + "loss": 1.1551, + "mean_token_accuracy": 0.7253175228834152, + "num_tokens": 1753803.0, + "step": 140 + }, + { + "entropy": 1.153997391462326, + "epoch": 0.235, + "grad_norm": 0.42761462926864624, + "learning_rate": 0.00019415204678362573, + "loss": 1.1576, + "mean_token_accuracy": 0.7320474088191986, + "num_tokens": 1766016.0, + "step": 141 + }, + { + "entropy": 1.118414282798767, + "epoch": 0.23666666666666666, + "grad_norm": 0.45717811584472656, + "learning_rate": 0.00019403508771929825, + "loss": 1.1184, + "mean_token_accuracy": 0.7379914745688438, + "num_tokens": 1778602.0, + "step": 142 + }, + { + "entropy": 1.0639612078666687, + "epoch": 0.23833333333333334, + "grad_norm": 0.4153079688549042, + "learning_rate": 0.00019391812865497077, + "loss": 1.0899, + "mean_token_accuracy": 0.7416495755314827, + "num_tokens": 1791202.0, + "step": 143 + }, + { + "entropy": 1.0359639376401901, + "epoch": 0.24, + "grad_norm": 0.46512192487716675, + "learning_rate": 0.0001938011695906433, + "loss": 1.0682, + "mean_token_accuracy": 0.7477246001362801, + "num_tokens": 1803814.0, + "step": 144 + }, + { + "entropy": 1.1540048494935036, + "epoch": 0.24166666666666667, + "grad_norm": 0.5459341406822205, + "learning_rate": 0.0001936842105263158, + "loss": 1.22, + "mean_token_accuracy": 0.7181112244725227, + "num_tokens": 1816320.0, + "step": 145 + }, + { + "entropy": 1.0917329415678978, + "epoch": 0.24333333333333335, + "grad_norm": 0.448598712682724, + "learning_rate": 0.00019356725146198832, + "loss": 1.1382, + "mean_token_accuracy": 0.7411187067627907, + "num_tokens": 1828742.0, + "step": 146 + }, + { + "entropy": 1.0613715201616287, + "epoch": 0.245, + "grad_norm": 0.40278568863868713, + "learning_rate": 0.00019345029239766084, + "loss": 1.1119, + "mean_token_accuracy": 0.7407647371292114, + "num_tokens": 1841462.0, + "step": 147 + }, + { + "entropy": 1.0917879864573479, + "epoch": 0.24666666666666667, + "grad_norm": 0.47940903902053833, + "learning_rate": 0.00019333333333333333, + "loss": 1.1205, + "mean_token_accuracy": 0.7301802858710289, + "num_tokens": 1854255.0, + "step": 148 + }, + { + "entropy": 1.018385998904705, + "epoch": 0.24833333333333332, + "grad_norm": 0.4194647967815399, + "learning_rate": 0.00019321637426900586, + "loss": 1.0555, + "mean_token_accuracy": 0.7508860379457474, + "num_tokens": 1866779.0, + "step": 149 + }, + { + "entropy": 1.0104970261454582, + "epoch": 0.25, + "grad_norm": 0.32308053970336914, + "learning_rate": 0.00019309941520467838, + "loss": 1.0155, + "mean_token_accuracy": 0.7584084123373032, + "num_tokens": 1879629.0, + "step": 150 + }, + { + "entropy": 1.1477160826325417, + "epoch": 0.25166666666666665, + "grad_norm": 0.2899821400642395, + "learning_rate": 0.00019298245614035088, + "loss": 1.1195, + "mean_token_accuracy": 0.7364438623189926, + "num_tokens": 1892328.0, + "step": 151 + }, + { + "entropy": 1.1384098306298256, + "epoch": 0.25333333333333335, + "grad_norm": 0.4167255163192749, + "learning_rate": 0.0001928654970760234, + "loss": 1.1635, + "mean_token_accuracy": 0.7330257892608643, + "num_tokens": 1904903.0, + "step": 152 + }, + { + "entropy": 1.0779243260622025, + "epoch": 0.255, + "grad_norm": 0.3709876537322998, + "learning_rate": 0.00019274853801169592, + "loss": 1.1139, + "mean_token_accuracy": 0.7434241771697998, + "num_tokens": 1917471.0, + "step": 153 + }, + { + "entropy": 1.2101422995328903, + "epoch": 0.25666666666666665, + "grad_norm": 0.3245011568069458, + "learning_rate": 0.00019263157894736842, + "loss": 1.2267, + "mean_token_accuracy": 0.7105641290545464, + "num_tokens": 1930428.0, + "step": 154 + }, + { + "entropy": 1.1308674216270447, + "epoch": 0.25833333333333336, + "grad_norm": 0.5401122570037842, + "learning_rate": 0.00019251461988304094, + "loss": 1.1383, + "mean_token_accuracy": 0.7337904721498489, + "num_tokens": 1942865.0, + "step": 155 + }, + { + "entropy": 1.109563060104847, + "epoch": 0.26, + "grad_norm": 0.36856064200401306, + "learning_rate": 0.00019239766081871346, + "loss": 1.1064, + "mean_token_accuracy": 0.7388525083661079, + "num_tokens": 1955547.0, + "step": 156 + }, + { + "entropy": 0.8726945370435715, + "epoch": 0.26166666666666666, + "grad_norm": 0.3195601999759674, + "learning_rate": 0.00019228070175438596, + "loss": 0.8676, + "mean_token_accuracy": 0.7892613261938095, + "num_tokens": 1968285.0, + "step": 157 + }, + { + "entropy": 1.2257354855537415, + "epoch": 0.2633333333333333, + "grad_norm": 0.44171369075775146, + "learning_rate": 0.00019216374269005848, + "loss": 1.2279, + "mean_token_accuracy": 0.7155179604887962, + "num_tokens": 1980596.0, + "step": 158 + }, + { + "entropy": 1.1729735136032104, + "epoch": 0.265, + "grad_norm": 0.39025864005088806, + "learning_rate": 0.000192046783625731, + "loss": 1.1974, + "mean_token_accuracy": 0.7177061587572098, + "num_tokens": 1993239.0, + "step": 159 + }, + { + "entropy": 1.1277462840080261, + "epoch": 0.26666666666666666, + "grad_norm": 0.3358098566532135, + "learning_rate": 0.0001919298245614035, + "loss": 1.1326, + "mean_token_accuracy": 0.7304977104067802, + "num_tokens": 2005914.0, + "step": 160 + }, + { + "entropy": 1.1848002821207047, + "epoch": 0.2683333333333333, + "grad_norm": 0.395328551530838, + "learning_rate": 0.00019181286549707603, + "loss": 1.1864, + "mean_token_accuracy": 0.7176820933818817, + "num_tokens": 2018380.0, + "step": 161 + }, + { + "entropy": 1.1120817065238953, + "epoch": 0.27, + "grad_norm": 0.40584561228752136, + "learning_rate": 0.00019169590643274855, + "loss": 1.1281, + "mean_token_accuracy": 0.7257193401455879, + "num_tokens": 2030805.0, + "step": 162 + }, + { + "entropy": 1.1387057602405548, + "epoch": 0.27166666666666667, + "grad_norm": 0.4128866493701935, + "learning_rate": 0.00019157894736842104, + "loss": 1.1639, + "mean_token_accuracy": 0.7216651067137718, + "num_tokens": 2043168.0, + "step": 163 + }, + { + "entropy": 0.9902090951800346, + "epoch": 0.2733333333333333, + "grad_norm": 0.4670630097389221, + "learning_rate": 0.0001914619883040936, + "loss": 1.0027, + "mean_token_accuracy": 0.7596156373620033, + "num_tokens": 2055446.0, + "step": 164 + }, + { + "entropy": 1.1425034403800964, + "epoch": 0.275, + "grad_norm": 0.46681496500968933, + "learning_rate": 0.0001913450292397661, + "loss": 1.1538, + "mean_token_accuracy": 0.7130375802516937, + "num_tokens": 2067917.0, + "step": 165 + }, + { + "entropy": 1.0597015172243118, + "epoch": 0.27666666666666667, + "grad_norm": 0.47518017888069153, + "learning_rate": 0.0001912280701754386, + "loss": 1.0724, + "mean_token_accuracy": 0.7431479915976524, + "num_tokens": 2080651.0, + "step": 166 + }, + { + "entropy": 0.9730403125286102, + "epoch": 0.2783333333333333, + "grad_norm": 0.733403742313385, + "learning_rate": 0.00019111111111111114, + "loss": 0.963, + "mean_token_accuracy": 0.7653697729110718, + "num_tokens": 2093312.0, + "step": 167 + }, + { + "entropy": 1.0079271346330643, + "epoch": 0.28, + "grad_norm": 0.3541069030761719, + "learning_rate": 0.00019099415204678363, + "loss": 0.9992, + "mean_token_accuracy": 0.7640089094638824, + "num_tokens": 2105865.0, + "step": 168 + }, + { + "entropy": 1.1272375360131264, + "epoch": 0.2816666666666667, + "grad_norm": 13.837325096130371, + "learning_rate": 0.00019087719298245616, + "loss": 1.1298, + "mean_token_accuracy": 0.7283240929245949, + "num_tokens": 2118722.0, + "step": 169 + }, + { + "entropy": 1.1928050369024277, + "epoch": 0.2833333333333333, + "grad_norm": 0.6107990145683289, + "learning_rate": 0.00019076023391812868, + "loss": 1.1818, + "mean_token_accuracy": 0.7213939651846886, + "num_tokens": 2131235.0, + "step": 170 + }, + { + "entropy": 1.1104667708277702, + "epoch": 0.285, + "grad_norm": 0.5343078374862671, + "learning_rate": 0.00019064327485380117, + "loss": 1.1138, + "mean_token_accuracy": 0.7371880561113358, + "num_tokens": 2143714.0, + "step": 171 + }, + { + "entropy": 1.0297225266695023, + "epoch": 0.2866666666666667, + "grad_norm": 0.3125198781490326, + "learning_rate": 0.0001905263157894737, + "loss": 1.0159, + "mean_token_accuracy": 0.7521278113126755, + "num_tokens": 2156404.0, + "step": 172 + }, + { + "entropy": 1.0767759680747986, + "epoch": 0.28833333333333333, + "grad_norm": 0.34481510519981384, + "learning_rate": 0.00019040935672514622, + "loss": 1.0683, + "mean_token_accuracy": 0.7420379370450974, + "num_tokens": 2169219.0, + "step": 173 + }, + { + "entropy": 0.9333702996373177, + "epoch": 0.29, + "grad_norm": 0.48810121417045593, + "learning_rate": 0.00019029239766081872, + "loss": 0.9179, + "mean_token_accuracy": 0.7737013623118401, + "num_tokens": 2181773.0, + "step": 174 + }, + { + "entropy": 1.0734611302614212, + "epoch": 0.2916666666666667, + "grad_norm": 0.7126191854476929, + "learning_rate": 0.00019017543859649124, + "loss": 1.1299, + "mean_token_accuracy": 0.7278291434049606, + "num_tokens": 2194152.0, + "step": 175 + }, + { + "entropy": 1.0925179943442345, + "epoch": 0.29333333333333333, + "grad_norm": 0.4982717037200928, + "learning_rate": 0.00019005847953216376, + "loss": 1.1199, + "mean_token_accuracy": 0.7315082252025604, + "num_tokens": 2206831.0, + "step": 176 + }, + { + "entropy": 1.0705517753958702, + "epoch": 0.295, + "grad_norm": 0.3316156566143036, + "learning_rate": 0.00018994152046783626, + "loss": 1.0693, + "mean_token_accuracy": 0.7411659136414528, + "num_tokens": 2219538.0, + "step": 177 + }, + { + "entropy": 1.1149278432130814, + "epoch": 0.2966666666666667, + "grad_norm": 0.3496127128601074, + "learning_rate": 0.00018982456140350878, + "loss": 1.1338, + "mean_token_accuracy": 0.7307734712958336, + "num_tokens": 2232143.0, + "step": 178 + }, + { + "entropy": 1.054306723177433, + "epoch": 0.29833333333333334, + "grad_norm": 0.40683513879776, + "learning_rate": 0.0001897076023391813, + "loss": 1.0751, + "mean_token_accuracy": 0.7427195087075233, + "num_tokens": 2244992.0, + "step": 179 + }, + { + "entropy": 1.1751435473561287, + "epoch": 0.3, + "grad_norm": 0.41907891631126404, + "learning_rate": 0.0001895906432748538, + "loss": 1.198, + "mean_token_accuracy": 0.7172646000981331, + "num_tokens": 2257612.0, + "step": 180 + }, + { + "entropy": 1.1129144579172134, + "epoch": 0.3016666666666667, + "grad_norm": 0.3949214220046997, + "learning_rate": 0.00018947368421052632, + "loss": 1.0958, + "mean_token_accuracy": 0.7357127368450165, + "num_tokens": 2270112.0, + "step": 181 + }, + { + "entropy": 1.179383508861065, + "epoch": 0.30333333333333334, + "grad_norm": 0.3359801471233368, + "learning_rate": 0.00018935672514619885, + "loss": 1.1459, + "mean_token_accuracy": 0.7238163203001022, + "num_tokens": 2282533.0, + "step": 182 + }, + { + "entropy": 1.1705670580267906, + "epoch": 0.305, + "grad_norm": 0.3245558440685272, + "learning_rate": 0.00018923976608187134, + "loss": 1.1676, + "mean_token_accuracy": 0.7258678451180458, + "num_tokens": 2295083.0, + "step": 183 + }, + { + "entropy": 1.0460694283246994, + "epoch": 0.30666666666666664, + "grad_norm": 0.36032363772392273, + "learning_rate": 0.0001891228070175439, + "loss": 1.0124, + "mean_token_accuracy": 0.7577406391501427, + "num_tokens": 2307968.0, + "step": 184 + }, + { + "entropy": 1.1906094327569008, + "epoch": 0.30833333333333335, + "grad_norm": 0.3748197853565216, + "learning_rate": 0.0001890058479532164, + "loss": 1.1827, + "mean_token_accuracy": 0.7173843756318092, + "num_tokens": 2320581.0, + "step": 185 + }, + { + "entropy": 1.0227904915809631, + "epoch": 0.31, + "grad_norm": 0.3617149591445923, + "learning_rate": 0.00018888888888888888, + "loss": 1.0322, + "mean_token_accuracy": 0.7545242831110954, + "num_tokens": 2333156.0, + "step": 186 + }, + { + "entropy": 1.0227366983890533, + "epoch": 0.31166666666666665, + "grad_norm": 0.32479673624038696, + "learning_rate": 0.00018877192982456143, + "loss": 1.0207, + "mean_token_accuracy": 0.7527587786316872, + "num_tokens": 2345652.0, + "step": 187 + }, + { + "entropy": 1.1130978390574455, + "epoch": 0.31333333333333335, + "grad_norm": 0.30757713317871094, + "learning_rate": 0.00018865497076023393, + "loss": 1.1258, + "mean_token_accuracy": 0.7307859510183334, + "num_tokens": 2358342.0, + "step": 188 + }, + { + "entropy": 1.1219107881188393, + "epoch": 0.315, + "grad_norm": 0.39883914589881897, + "learning_rate": 0.00018853801169590643, + "loss": 1.1484, + "mean_token_accuracy": 0.7276914939284325, + "num_tokens": 2370759.0, + "step": 189 + }, + { + "entropy": 1.0377257764339447, + "epoch": 0.31666666666666665, + "grad_norm": 0.3542444705963135, + "learning_rate": 0.00018842105263157898, + "loss": 1.043, + "mean_token_accuracy": 0.755888819694519, + "num_tokens": 2383289.0, + "step": 190 + }, + { + "entropy": 1.0208693370223045, + "epoch": 0.31833333333333336, + "grad_norm": 0.34766149520874023, + "learning_rate": 0.00018830409356725147, + "loss": 0.9811, + "mean_token_accuracy": 0.7647728249430656, + "num_tokens": 2395819.0, + "step": 191 + }, + { + "entropy": 1.116250567138195, + "epoch": 0.32, + "grad_norm": 0.32695358991622925, + "learning_rate": 0.00018818713450292397, + "loss": 1.1336, + "mean_token_accuracy": 0.735762432217598, + "num_tokens": 2408408.0, + "step": 192 + }, + { + "entropy": 1.1328190714120865, + "epoch": 0.32166666666666666, + "grad_norm": 0.36984795331954956, + "learning_rate": 0.00018807017543859652, + "loss": 1.1403, + "mean_token_accuracy": 0.7324722409248352, + "num_tokens": 2420683.0, + "step": 193 + }, + { + "entropy": 1.1805738806724548, + "epoch": 0.3233333333333333, + "grad_norm": 0.44600820541381836, + "learning_rate": 0.00018795321637426901, + "loss": 1.1871, + "mean_token_accuracy": 0.7195275351405144, + "num_tokens": 2433445.0, + "step": 194 + }, + { + "entropy": 1.1397397369146347, + "epoch": 0.325, + "grad_norm": 0.415822833776474, + "learning_rate": 0.0001878362573099415, + "loss": 1.1042, + "mean_token_accuracy": 0.7342669293284416, + "num_tokens": 2446015.0, + "step": 195 + }, + { + "entropy": 1.045611895620823, + "epoch": 0.32666666666666666, + "grad_norm": 0.33927756547927856, + "learning_rate": 0.00018771929824561406, + "loss": 1.0144, + "mean_token_accuracy": 0.7533715888857841, + "num_tokens": 2458426.0, + "step": 196 + }, + { + "entropy": 1.2224002107977867, + "epoch": 0.3283333333333333, + "grad_norm": 0.3514537215232849, + "learning_rate": 0.00018760233918128656, + "loss": 1.198, + "mean_token_accuracy": 0.7203835994005203, + "num_tokens": 2470769.0, + "step": 197 + }, + { + "entropy": 1.091364249587059, + "epoch": 0.33, + "grad_norm": 0.34935829043388367, + "learning_rate": 0.00018748538011695905, + "loss": 1.0844, + "mean_token_accuracy": 0.7347274050116539, + "num_tokens": 2483142.0, + "step": 198 + }, + { + "entropy": 0.9068185538053513, + "epoch": 0.33166666666666667, + "grad_norm": 0.3763393759727478, + "learning_rate": 0.0001873684210526316, + "loss": 0.9062, + "mean_token_accuracy": 0.7771774157881737, + "num_tokens": 2496030.0, + "step": 199 + }, + { + "entropy": 1.1054254174232483, + "epoch": 0.3333333333333333, + "grad_norm": 0.3939693570137024, + "learning_rate": 0.0001872514619883041, + "loss": 1.1066, + "mean_token_accuracy": 0.7332883253693581, + "num_tokens": 2508253.0, + "step": 200 + }, + { + "entropy": 1.0377614423632622, + "epoch": 0.335, + "grad_norm": 0.3749332129955292, + "learning_rate": 0.0001871345029239766, + "loss": 1.0401, + "mean_token_accuracy": 0.7471970319747925, + "num_tokens": 2520898.0, + "step": 201 + }, + { + "entropy": 1.0859627276659012, + "epoch": 0.33666666666666667, + "grad_norm": 0.3993607759475708, + "learning_rate": 0.00018701754385964914, + "loss": 1.0711, + "mean_token_accuracy": 0.7411193326115608, + "num_tokens": 2533581.0, + "step": 202 + }, + { + "entropy": 1.148285612463951, + "epoch": 0.3383333333333333, + "grad_norm": 0.3228718042373657, + "learning_rate": 0.00018690058479532164, + "loss": 1.1805, + "mean_token_accuracy": 0.7245375514030457, + "num_tokens": 2546041.0, + "step": 203 + }, + { + "entropy": 0.9141278266906738, + "epoch": 0.34, + "grad_norm": 0.36570003628730774, + "learning_rate": 0.00018678362573099416, + "loss": 0.8984, + "mean_token_accuracy": 0.7771345153450966, + "num_tokens": 2558399.0, + "step": 204 + }, + { + "entropy": 1.099015660583973, + "epoch": 0.3416666666666667, + "grad_norm": 0.3728063404560089, + "learning_rate": 0.0001866666666666667, + "loss": 1.1063, + "mean_token_accuracy": 0.7349843755364418, + "num_tokens": 2570753.0, + "step": 205 + }, + { + "entropy": 1.0622873678803444, + "epoch": 0.3433333333333333, + "grad_norm": 0.3592352271080017, + "learning_rate": 0.00018654970760233918, + "loss": 1.0546, + "mean_token_accuracy": 0.7437913119792938, + "num_tokens": 2583317.0, + "step": 206 + }, + { + "entropy": 1.0601943358778954, + "epoch": 0.345, + "grad_norm": 0.3239583373069763, + "learning_rate": 0.0001864327485380117, + "loss": 1.0572, + "mean_token_accuracy": 0.7508477568626404, + "num_tokens": 2596203.0, + "step": 207 + }, + { + "entropy": 1.0785654410719872, + "epoch": 0.3466666666666667, + "grad_norm": 0.3292888104915619, + "learning_rate": 0.00018631578947368423, + "loss": 1.0613, + "mean_token_accuracy": 0.7437207996845245, + "num_tokens": 2608688.0, + "step": 208 + }, + { + "entropy": 1.133879691362381, + "epoch": 0.34833333333333333, + "grad_norm": 0.46649813652038574, + "learning_rate": 0.00018619883040935672, + "loss": 1.1081, + "mean_token_accuracy": 0.7371273636817932, + "num_tokens": 2621110.0, + "step": 209 + }, + { + "entropy": 1.0522583425045013, + "epoch": 0.35, + "grad_norm": 0.44118815660476685, + "learning_rate": 0.00018608187134502925, + "loss": 1.0359, + "mean_token_accuracy": 0.7514612525701523, + "num_tokens": 2633452.0, + "step": 210 + }, + { + "entropy": 1.1348539143800735, + "epoch": 0.3516666666666667, + "grad_norm": 0.3560009300708771, + "learning_rate": 0.00018596491228070177, + "loss": 1.1507, + "mean_token_accuracy": 0.7266515046358109, + "num_tokens": 2645787.0, + "step": 211 + }, + { + "entropy": 1.0085133984684944, + "epoch": 0.35333333333333333, + "grad_norm": 0.39134904742240906, + "learning_rate": 0.00018584795321637427, + "loss": 1.0109, + "mean_token_accuracy": 0.7624108791351318, + "num_tokens": 2658104.0, + "step": 212 + }, + { + "entropy": 0.9829937592148781, + "epoch": 0.355, + "grad_norm": 0.3849150836467743, + "learning_rate": 0.0001857309941520468, + "loss": 0.9771, + "mean_token_accuracy": 0.7678255960345268, + "num_tokens": 2670756.0, + "step": 213 + }, + { + "entropy": 1.1382714584469795, + "epoch": 0.3566666666666667, + "grad_norm": 0.3976750075817108, + "learning_rate": 0.0001856140350877193, + "loss": 1.13, + "mean_token_accuracy": 0.7280794978141785, + "num_tokens": 2683295.0, + "step": 214 + }, + { + "entropy": 1.217024527490139, + "epoch": 0.35833333333333334, + "grad_norm": 0.36979228258132935, + "learning_rate": 0.0001854970760233918, + "loss": 1.2515, + "mean_token_accuracy": 0.7075021639466286, + "num_tokens": 2695796.0, + "step": 215 + }, + { + "entropy": 1.189859315752983, + "epoch": 0.36, + "grad_norm": 0.3615109324455261, + "learning_rate": 0.00018538011695906433, + "loss": 1.1924, + "mean_token_accuracy": 0.7155702859163284, + "num_tokens": 2708402.0, + "step": 216 + }, + { + "entropy": 1.1346632614731789, + "epoch": 0.3616666666666667, + "grad_norm": 0.31497734785079956, + "learning_rate": 0.00018526315789473685, + "loss": 1.1414, + "mean_token_accuracy": 0.7290553748607635, + "num_tokens": 2721310.0, + "step": 217 + }, + { + "entropy": 1.0656853094696999, + "epoch": 0.36333333333333334, + "grad_norm": 0.36364349722862244, + "learning_rate": 0.00018514619883040935, + "loss": 1.0464, + "mean_token_accuracy": 0.7473271563649178, + "num_tokens": 2733663.0, + "step": 218 + }, + { + "entropy": 1.267977461218834, + "epoch": 0.365, + "grad_norm": 0.37948599457740784, + "learning_rate": 0.00018502923976608187, + "loss": 1.268, + "mean_token_accuracy": 0.7058289349079132, + "num_tokens": 2746382.0, + "step": 219 + }, + { + "entropy": 1.1866832301020622, + "epoch": 0.36666666666666664, + "grad_norm": 0.34032464027404785, + "learning_rate": 0.0001849122807017544, + "loss": 1.2023, + "mean_token_accuracy": 0.7172464728355408, + "num_tokens": 2758970.0, + "step": 220 + }, + { + "entropy": 1.0964962020516396, + "epoch": 0.36833333333333335, + "grad_norm": 0.34577351808547974, + "learning_rate": 0.0001847953216374269, + "loss": 1.0842, + "mean_token_accuracy": 0.744616910815239, + "num_tokens": 2771288.0, + "step": 221 + }, + { + "entropy": 1.0916159451007843, + "epoch": 0.37, + "grad_norm": 0.32731419801712036, + "learning_rate": 0.00018467836257309942, + "loss": 1.074, + "mean_token_accuracy": 0.739419586956501, + "num_tokens": 2783755.0, + "step": 222 + }, + { + "entropy": 1.1499411910772324, + "epoch": 0.37166666666666665, + "grad_norm": 0.3359861671924591, + "learning_rate": 0.00018456140350877194, + "loss": 1.1354, + "mean_token_accuracy": 0.7299651131033897, + "num_tokens": 2796265.0, + "step": 223 + }, + { + "entropy": 1.2112242728471756, + "epoch": 0.37333333333333335, + "grad_norm": 0.3740575909614563, + "learning_rate": 0.00018444444444444446, + "loss": 1.2154, + "mean_token_accuracy": 0.7131304666399956, + "num_tokens": 2808704.0, + "step": 224 + }, + { + "entropy": 1.105809710919857, + "epoch": 0.375, + "grad_norm": 0.3910123109817505, + "learning_rate": 0.00018432748538011698, + "loss": 1.1179, + "mean_token_accuracy": 0.7317347005009651, + "num_tokens": 2821062.0, + "step": 225 + }, + { + "entropy": 1.1065769121050835, + "epoch": 0.37666666666666665, + "grad_norm": 0.37251028418540955, + "learning_rate": 0.00018421052631578948, + "loss": 1.1162, + "mean_token_accuracy": 0.73953577876091, + "num_tokens": 2833606.0, + "step": 226 + }, + { + "entropy": 1.0926253944635391, + "epoch": 0.37833333333333335, + "grad_norm": 0.3471030294895172, + "learning_rate": 0.000184093567251462, + "loss": 1.0956, + "mean_token_accuracy": 0.7368374243378639, + "num_tokens": 2846358.0, + "step": 227 + }, + { + "entropy": 1.0553656443953514, + "epoch": 0.38, + "grad_norm": 0.3455545902252197, + "learning_rate": 0.00018397660818713453, + "loss": 1.0357, + "mean_token_accuracy": 0.7497663721442223, + "num_tokens": 2858901.0, + "step": 228 + }, + { + "entropy": 1.1180525943636894, + "epoch": 0.38166666666666665, + "grad_norm": 0.426946759223938, + "learning_rate": 0.00018385964912280702, + "loss": 1.1552, + "mean_token_accuracy": 0.7236368507146835, + "num_tokens": 2871380.0, + "step": 229 + }, + { + "entropy": 1.0417871698737144, + "epoch": 0.38333333333333336, + "grad_norm": 0.446646511554718, + "learning_rate": 0.00018374269005847955, + "loss": 1.021, + "mean_token_accuracy": 0.7536017820239067, + "num_tokens": 2883953.0, + "step": 230 + }, + { + "entropy": 1.0821654945611954, + "epoch": 0.385, + "grad_norm": 0.3330538272857666, + "learning_rate": 0.00018362573099415207, + "loss": 1.091, + "mean_token_accuracy": 0.7424333989620209, + "num_tokens": 2896783.0, + "step": 231 + }, + { + "entropy": 1.261072151362896, + "epoch": 0.38666666666666666, + "grad_norm": 0.40883293747901917, + "learning_rate": 0.00018350877192982456, + "loss": 1.2329, + "mean_token_accuracy": 0.7064631283283234, + "num_tokens": 2909412.0, + "step": 232 + }, + { + "entropy": 1.219589687883854, + "epoch": 0.3883333333333333, + "grad_norm": 0.3914692997932434, + "learning_rate": 0.0001833918128654971, + "loss": 1.219, + "mean_token_accuracy": 0.7204131335020065, + "num_tokens": 2921869.0, + "step": 233 + }, + { + "entropy": 1.2079207003116608, + "epoch": 0.39, + "grad_norm": 0.3635447025299072, + "learning_rate": 0.0001832748538011696, + "loss": 1.1823, + "mean_token_accuracy": 0.7185709178447723, + "num_tokens": 2934474.0, + "step": 234 + }, + { + "entropy": 1.0661711767315865, + "epoch": 0.39166666666666666, + "grad_norm": 0.4999198615550995, + "learning_rate": 0.0001831578947368421, + "loss": 1.0022, + "mean_token_accuracy": 0.7570779994130135, + "num_tokens": 2947214.0, + "step": 235 + }, + { + "entropy": 0.9990430921316147, + "epoch": 0.3933333333333333, + "grad_norm": 0.4003547728061676, + "learning_rate": 0.00018304093567251463, + "loss": 0.9514, + "mean_token_accuracy": 0.7665991857647896, + "num_tokens": 2959663.0, + "step": 236 + }, + { + "entropy": 1.0679278895258904, + "epoch": 0.395, + "grad_norm": 0.38362008333206177, + "learning_rate": 0.00018292397660818715, + "loss": 1.0679, + "mean_token_accuracy": 0.7407658472657204, + "num_tokens": 2972227.0, + "step": 237 + }, + { + "entropy": 1.0128286629915237, + "epoch": 0.39666666666666667, + "grad_norm": 0.3753218948841095, + "learning_rate": 0.00018280701754385965, + "loss": 1.0775, + "mean_token_accuracy": 0.7455108985304832, + "num_tokens": 2984809.0, + "step": 238 + }, + { + "entropy": 1.2022801265120506, + "epoch": 0.3983333333333333, + "grad_norm": 0.4850371778011322, + "learning_rate": 0.00018269005847953217, + "loss": 1.2478, + "mean_token_accuracy": 0.7030462697148323, + "num_tokens": 2997396.0, + "step": 239 + }, + { + "entropy": 1.0180853754281998, + "epoch": 0.4, + "grad_norm": 0.3701488971710205, + "learning_rate": 0.0001825730994152047, + "loss": 1.0154, + "mean_token_accuracy": 0.7558140829205513, + "num_tokens": 3009921.0, + "step": 240 + }, + { + "entropy": 1.0700580030679703, + "epoch": 0.40166666666666667, + "grad_norm": 0.3202139437198639, + "learning_rate": 0.0001824561403508772, + "loss": 1.0867, + "mean_token_accuracy": 0.7432869449257851, + "num_tokens": 3022468.0, + "step": 241 + }, + { + "entropy": 1.0071598812937737, + "epoch": 0.4033333333333333, + "grad_norm": 0.42016497254371643, + "learning_rate": 0.0001823391812865497, + "loss": 1.0128, + "mean_token_accuracy": 0.7569889947772026, + "num_tokens": 3035124.0, + "step": 242 + }, + { + "entropy": 1.1294294893741608, + "epoch": 0.405, + "grad_norm": 0.3567630648612976, + "learning_rate": 0.00018222222222222224, + "loss": 1.1274, + "mean_token_accuracy": 0.7349538579583168, + "num_tokens": 3047710.0, + "step": 243 + }, + { + "entropy": 1.1060679331421852, + "epoch": 0.4066666666666667, + "grad_norm": 0.34659335017204285, + "learning_rate": 0.00018210526315789476, + "loss": 1.1141, + "mean_token_accuracy": 0.7334257215261459, + "num_tokens": 3060375.0, + "step": 244 + }, + { + "entropy": 1.210126355290413, + "epoch": 0.4083333333333333, + "grad_norm": 0.4510009288787842, + "learning_rate": 0.00018198830409356726, + "loss": 1.1887, + "mean_token_accuracy": 0.7228007987141609, + "num_tokens": 3072667.0, + "step": 245 + }, + { + "entropy": 0.9602404832839966, + "epoch": 0.41, + "grad_norm": 0.4799667298793793, + "learning_rate": 0.00018187134502923978, + "loss": 0.928, + "mean_token_accuracy": 0.7675070241093636, + "num_tokens": 3085435.0, + "step": 246 + }, + { + "entropy": 1.0962599590420723, + "epoch": 0.4116666666666667, + "grad_norm": 0.3477798402309418, + "learning_rate": 0.0001817543859649123, + "loss": 1.0924, + "mean_token_accuracy": 0.7440480887889862, + "num_tokens": 3097879.0, + "step": 247 + }, + { + "entropy": 1.1427634581923485, + "epoch": 0.41333333333333333, + "grad_norm": 0.343257337808609, + "learning_rate": 0.0001816374269005848, + "loss": 1.1493, + "mean_token_accuracy": 0.7298817038536072, + "num_tokens": 3110255.0, + "step": 248 + }, + { + "entropy": 1.0447398945689201, + "epoch": 0.415, + "grad_norm": 0.49843472242355347, + "learning_rate": 0.00018152046783625732, + "loss": 1.0298, + "mean_token_accuracy": 0.7523345276713371, + "num_tokens": 3122802.0, + "step": 249 + }, + { + "entropy": 1.1335030645132065, + "epoch": 0.4166666666666667, + "grad_norm": 0.3733726739883423, + "learning_rate": 0.00018140350877192984, + "loss": 1.1147, + "mean_token_accuracy": 0.7301384806632996, + "num_tokens": 3135401.0, + "step": 250 + }, + { + "entropy": 1.1334904357790947, + "epoch": 0.41833333333333333, + "grad_norm": 0.3926542103290558, + "learning_rate": 0.00018128654970760234, + "loss": 1.1316, + "mean_token_accuracy": 0.735007993876934, + "num_tokens": 3147925.0, + "step": 251 + }, + { + "entropy": 0.9874943792819977, + "epoch": 0.42, + "grad_norm": 0.5496231317520142, + "learning_rate": 0.00018116959064327486, + "loss": 0.9596, + "mean_token_accuracy": 0.7662070468068123, + "num_tokens": 3160379.0, + "step": 252 + }, + { + "entropy": 0.9868237897753716, + "epoch": 0.4216666666666667, + "grad_norm": 0.46299904584884644, + "learning_rate": 0.00018105263157894739, + "loss": 0.9998, + "mean_token_accuracy": 0.7589811682701111, + "num_tokens": 3172837.0, + "step": 253 + }, + { + "entropy": 0.9789915308356285, + "epoch": 0.42333333333333334, + "grad_norm": 0.3225744068622589, + "learning_rate": 0.00018093567251461988, + "loss": 0.9982, + "mean_token_accuracy": 0.7591699734330177, + "num_tokens": 3185252.0, + "step": 254 + }, + { + "entropy": 1.1388946995139122, + "epoch": 0.425, + "grad_norm": 0.4054366648197174, + "learning_rate": 0.0001808187134502924, + "loss": 1.149, + "mean_token_accuracy": 0.7230332866311073, + "num_tokens": 3197900.0, + "step": 255 + }, + { + "entropy": 1.0267946869134903, + "epoch": 0.4266666666666667, + "grad_norm": 0.43072929978370667, + "learning_rate": 0.00018070175438596493, + "loss": 1.0254, + "mean_token_accuracy": 0.7437686920166016, + "num_tokens": 3210354.0, + "step": 256 + }, + { + "entropy": 1.151397317647934, + "epoch": 0.42833333333333334, + "grad_norm": 0.5497655272483826, + "learning_rate": 0.00018058479532163742, + "loss": 1.1575, + "mean_token_accuracy": 0.7231586053967476, + "num_tokens": 3222849.0, + "step": 257 + }, + { + "entropy": 0.9719003960490227, + "epoch": 0.43, + "grad_norm": 0.39777350425720215, + "learning_rate": 0.00018046783625730995, + "loss": 0.9829, + "mean_token_accuracy": 0.76484714448452, + "num_tokens": 3235515.0, + "step": 258 + }, + { + "entropy": 1.1252076029777527, + "epoch": 0.43166666666666664, + "grad_norm": 0.5205410718917847, + "learning_rate": 0.00018035087719298247, + "loss": 1.1477, + "mean_token_accuracy": 0.73708376288414, + "num_tokens": 3248200.0, + "step": 259 + }, + { + "entropy": 1.1057978570461273, + "epoch": 0.43333333333333335, + "grad_norm": 0.35812073945999146, + "learning_rate": 0.00018023391812865497, + "loss": 1.0672, + "mean_token_accuracy": 0.7432841360569, + "num_tokens": 3260782.0, + "step": 260 + }, + { + "entropy": 1.1699188724160194, + "epoch": 0.435, + "grad_norm": 0.4195549786090851, + "learning_rate": 0.0001801169590643275, + "loss": 1.1592, + "mean_token_accuracy": 0.720735527575016, + "num_tokens": 3273231.0, + "step": 261 + }, + { + "entropy": 1.0379428714513779, + "epoch": 0.43666666666666665, + "grad_norm": 0.7551639080047607, + "learning_rate": 0.00018, + "loss": 1.0302, + "mean_token_accuracy": 0.7501674890518188, + "num_tokens": 3285787.0, + "step": 262 + }, + { + "entropy": 1.071130983531475, + "epoch": 0.43833333333333335, + "grad_norm": 0.4712306261062622, + "learning_rate": 0.0001798830409356725, + "loss": 1.0726, + "mean_token_accuracy": 0.7447422966361046, + "num_tokens": 3298694.0, + "step": 263 + }, + { + "entropy": 1.2623703926801682, + "epoch": 0.44, + "grad_norm": 0.33710184693336487, + "learning_rate": 0.00017976608187134503, + "loss": 1.2761, + "mean_token_accuracy": 0.7032147943973541, + "num_tokens": 3311387.0, + "step": 264 + }, + { + "entropy": 1.1410105228424072, + "epoch": 0.44166666666666665, + "grad_norm": 0.6147916913032532, + "learning_rate": 0.00017964912280701755, + "loss": 1.1392, + "mean_token_accuracy": 0.7260777652263641, + "num_tokens": 3324108.0, + "step": 265 + }, + { + "entropy": 1.1146948486566544, + "epoch": 0.44333333333333336, + "grad_norm": 0.4302304983139038, + "learning_rate": 0.00017953216374269005, + "loss": 1.1261, + "mean_token_accuracy": 0.7268940955400467, + "num_tokens": 3336483.0, + "step": 266 + }, + { + "entropy": 1.1186750009655952, + "epoch": 0.445, + "grad_norm": 0.33564555644989014, + "learning_rate": 0.0001794152046783626, + "loss": 1.1414, + "mean_token_accuracy": 0.7282785773277283, + "num_tokens": 3349243.0, + "step": 267 + }, + { + "entropy": 1.135681688785553, + "epoch": 0.44666666666666666, + "grad_norm": 0.3261569142341614, + "learning_rate": 0.0001792982456140351, + "loss": 1.112, + "mean_token_accuracy": 0.7374716177582741, + "num_tokens": 3361738.0, + "step": 268 + }, + { + "entropy": 1.017588496208191, + "epoch": 0.4483333333333333, + "grad_norm": 0.3962899446487427, + "learning_rate": 0.00017918128654970762, + "loss": 0.9988, + "mean_token_accuracy": 0.760860413312912, + "num_tokens": 3374177.0, + "step": 269 + }, + { + "entropy": 1.1711449921131134, + "epoch": 0.45, + "grad_norm": 0.43412500619888306, + "learning_rate": 0.00017906432748538014, + "loss": 1.1504, + "mean_token_accuracy": 0.724379375576973, + "num_tokens": 3386986.0, + "step": 270 + }, + { + "entropy": 0.9756506755948067, + "epoch": 0.45166666666666666, + "grad_norm": 0.3354060649871826, + "learning_rate": 0.00017894736842105264, + "loss": 0.9702, + "mean_token_accuracy": 0.7611405923962593, + "num_tokens": 3399391.0, + "step": 271 + }, + { + "entropy": 1.1847113892436028, + "epoch": 0.4533333333333333, + "grad_norm": 0.3396911919116974, + "learning_rate": 0.00017883040935672516, + "loss": 1.1831, + "mean_token_accuracy": 0.7153457403182983, + "num_tokens": 3411895.0, + "step": 272 + }, + { + "entropy": 1.11505925655365, + "epoch": 0.455, + "grad_norm": 0.4412688910961151, + "learning_rate": 0.00017871345029239768, + "loss": 1.1175, + "mean_token_accuracy": 0.7404645457863808, + "num_tokens": 3424508.0, + "step": 273 + }, + { + "entropy": 1.1529017016291618, + "epoch": 0.45666666666666667, + "grad_norm": 0.37485653162002563, + "learning_rate": 0.00017859649122807018, + "loss": 1.1488, + "mean_token_accuracy": 0.7300072684884071, + "num_tokens": 3437006.0, + "step": 274 + }, + { + "entropy": 1.0116957277059555, + "epoch": 0.4583333333333333, + "grad_norm": 0.3534213900566101, + "learning_rate": 0.0001784795321637427, + "loss": 1.0102, + "mean_token_accuracy": 0.7638874277472496, + "num_tokens": 3449704.0, + "step": 275 + }, + { + "entropy": 0.9486217200756073, + "epoch": 0.46, + "grad_norm": 0.4339519143104553, + "learning_rate": 0.00017836257309941523, + "loss": 0.9528, + "mean_token_accuracy": 0.7683136314153671, + "num_tokens": 3462323.0, + "step": 276 + }, + { + "entropy": 1.1378100514411926, + "epoch": 0.46166666666666667, + "grad_norm": 0.410174697637558, + "learning_rate": 0.00017824561403508772, + "loss": 1.143, + "mean_token_accuracy": 0.7271644920110703, + "num_tokens": 3474728.0, + "step": 277 + }, + { + "entropy": 1.035399042069912, + "epoch": 0.4633333333333333, + "grad_norm": 0.36434057354927063, + "learning_rate": 0.00017812865497076024, + "loss": 1.0336, + "mean_token_accuracy": 0.7525843381881714, + "num_tokens": 3487241.0, + "step": 278 + }, + { + "entropy": 1.035888947546482, + "epoch": 0.465, + "grad_norm": 0.3117313086986542, + "learning_rate": 0.00017801169590643277, + "loss": 1.0461, + "mean_token_accuracy": 0.7474090084433556, + "num_tokens": 3499778.0, + "step": 279 + }, + { + "entropy": 0.9683891534805298, + "epoch": 0.4666666666666667, + "grad_norm": 0.3829636871814728, + "learning_rate": 0.00017789473684210526, + "loss": 0.9695, + "mean_token_accuracy": 0.7638266384601593, + "num_tokens": 3512453.0, + "step": 280 + }, + { + "entropy": 1.0482271388173103, + "epoch": 0.4683333333333333, + "grad_norm": 0.4052916169166565, + "learning_rate": 0.00017777777777777779, + "loss": 1.0454, + "mean_token_accuracy": 0.7466916441917419, + "num_tokens": 3525098.0, + "step": 281 + }, + { + "entropy": 0.9748341217637062, + "epoch": 0.47, + "grad_norm": 0.44575342535972595, + "learning_rate": 0.0001776608187134503, + "loss": 0.9795, + "mean_token_accuracy": 0.7599806860089302, + "num_tokens": 3537524.0, + "step": 282 + }, + { + "entropy": 1.153599664568901, + "epoch": 0.4716666666666667, + "grad_norm": 0.37257784605026245, + "learning_rate": 0.0001775438596491228, + "loss": 1.1518, + "mean_token_accuracy": 0.7194614708423615, + "num_tokens": 3550045.0, + "step": 283 + }, + { + "entropy": 1.004349708557129, + "epoch": 0.47333333333333333, + "grad_norm": 0.38646551966667175, + "learning_rate": 0.00017742690058479533, + "loss": 0.9497, + "mean_token_accuracy": 0.7630625516176224, + "num_tokens": 3562622.0, + "step": 284 + }, + { + "entropy": 0.9980553165078163, + "epoch": 0.475, + "grad_norm": 0.49010729789733887, + "learning_rate": 0.00017730994152046785, + "loss": 0.975, + "mean_token_accuracy": 0.7679460272192955, + "num_tokens": 3575340.0, + "step": 285 + }, + { + "entropy": 1.088503360748291, + "epoch": 0.4766666666666667, + "grad_norm": 0.5452380180358887, + "learning_rate": 0.00017719298245614035, + "loss": 1.1096, + "mean_token_accuracy": 0.7412427291274071, + "num_tokens": 3587845.0, + "step": 286 + }, + { + "entropy": 1.0458046644926071, + "epoch": 0.47833333333333333, + "grad_norm": 0.6659599542617798, + "learning_rate": 0.0001770760233918129, + "loss": 1.048, + "mean_token_accuracy": 0.7454439774155617, + "num_tokens": 3600369.0, + "step": 287 + }, + { + "entropy": 1.1275902390480042, + "epoch": 0.48, + "grad_norm": 0.4208016097545624, + "learning_rate": 0.0001769590643274854, + "loss": 1.1682, + "mean_token_accuracy": 0.7288567647337914, + "num_tokens": 3612814.0, + "step": 288 + }, + { + "entropy": 1.1426914036273956, + "epoch": 0.4816666666666667, + "grad_norm": 0.5147913694381714, + "learning_rate": 0.0001768421052631579, + "loss": 1.1828, + "mean_token_accuracy": 0.723309837281704, + "num_tokens": 3625180.0, + "step": 289 + }, + { + "entropy": 1.0466126427054405, + "epoch": 0.48333333333333334, + "grad_norm": 0.5052932500839233, + "learning_rate": 0.00017672514619883044, + "loss": 1.0574, + "mean_token_accuracy": 0.7450472787022591, + "num_tokens": 3637813.0, + "step": 290 + }, + { + "entropy": 1.0919866040349007, + "epoch": 0.485, + "grad_norm": 0.3577198386192322, + "learning_rate": 0.00017660818713450294, + "loss": 1.0871, + "mean_token_accuracy": 0.7373141944408417, + "num_tokens": 3650370.0, + "step": 291 + }, + { + "entropy": 1.0942333936691284, + "epoch": 0.4866666666666667, + "grad_norm": 0.438251256942749, + "learning_rate": 0.00017649122807017543, + "loss": 1.088, + "mean_token_accuracy": 0.7383796274662018, + "num_tokens": 3663061.0, + "step": 292 + }, + { + "entropy": 0.9413135126233101, + "epoch": 0.48833333333333334, + "grad_norm": 0.44067561626434326, + "learning_rate": 0.00017637426900584798, + "loss": 0.9098, + "mean_token_accuracy": 0.7814988046884537, + "num_tokens": 3675856.0, + "step": 293 + }, + { + "entropy": 1.0296655967831612, + "epoch": 0.49, + "grad_norm": 0.3933659791946411, + "learning_rate": 0.00017625730994152048, + "loss": 1.0182, + "mean_token_accuracy": 0.7536342963576317, + "num_tokens": 3688322.0, + "step": 294 + }, + { + "entropy": 1.203233003616333, + "epoch": 0.49166666666666664, + "grad_norm": 0.3295867443084717, + "learning_rate": 0.00017614035087719297, + "loss": 1.1907, + "mean_token_accuracy": 0.7196495458483696, + "num_tokens": 3700776.0, + "step": 295 + }, + { + "entropy": 0.968670666217804, + "epoch": 0.49333333333333335, + "grad_norm": 0.3442942202091217, + "learning_rate": 0.00017602339181286552, + "loss": 0.9705, + "mean_token_accuracy": 0.7706414982676506, + "num_tokens": 3712967.0, + "step": 296 + }, + { + "entropy": 1.0014533996582031, + "epoch": 0.495, + "grad_norm": 0.41025930643081665, + "learning_rate": 0.00017590643274853802, + "loss": 0.9909, + "mean_token_accuracy": 0.7594960108399391, + "num_tokens": 3725548.0, + "step": 297 + }, + { + "entropy": 0.9800790995359421, + "epoch": 0.49666666666666665, + "grad_norm": 0.35484063625335693, + "learning_rate": 0.00017578947368421052, + "loss": 0.9636, + "mean_token_accuracy": 0.765408881008625, + "num_tokens": 3738348.0, + "step": 298 + }, + { + "entropy": 1.0904498919844627, + "epoch": 0.49833333333333335, + "grad_norm": 0.3571205139160156, + "learning_rate": 0.00017567251461988307, + "loss": 1.1312, + "mean_token_accuracy": 0.735343262553215, + "num_tokens": 3750904.0, + "step": 299 + }, + { + "entropy": 0.8727906718850136, + "epoch": 0.5, + "grad_norm": 0.3699484169483185, + "learning_rate": 0.00017555555555555556, + "loss": 0.8909, + "mean_token_accuracy": 0.7798029482364655, + "num_tokens": 3763871.0, + "step": 300 + }, + { + "entropy": 1.032260425388813, + "epoch": 0.5016666666666667, + "grad_norm": 0.31954166293144226, + "learning_rate": 0.00017543859649122806, + "loss": 1.0508, + "mean_token_accuracy": 0.7496752962470055, + "num_tokens": 3776386.0, + "step": 301 + }, + { + "entropy": 0.9070574641227722, + "epoch": 0.5033333333333333, + "grad_norm": 0.3604190945625305, + "learning_rate": 0.0001753216374269006, + "loss": 0.9088, + "mean_token_accuracy": 0.7788211852312088, + "num_tokens": 3789088.0, + "step": 302 + }, + { + "entropy": 1.0767110586166382, + "epoch": 0.505, + "grad_norm": 0.3933933675289154, + "learning_rate": 0.0001752046783625731, + "loss": 1.0663, + "mean_token_accuracy": 0.7495614886283875, + "num_tokens": 3801836.0, + "step": 303 + }, + { + "entropy": 1.0157058015465736, + "epoch": 0.5066666666666667, + "grad_norm": 0.3408040404319763, + "learning_rate": 0.0001750877192982456, + "loss": 1.0113, + "mean_token_accuracy": 0.746182844042778, + "num_tokens": 3814328.0, + "step": 304 + }, + { + "entropy": 1.2159147933125496, + "epoch": 0.5083333333333333, + "grad_norm": 0.3778250217437744, + "learning_rate": 0.00017497076023391815, + "loss": 1.2088, + "mean_token_accuracy": 0.7146832719445229, + "num_tokens": 3827003.0, + "step": 305 + }, + { + "entropy": 1.103552520275116, + "epoch": 0.51, + "grad_norm": 0.3790249526500702, + "learning_rate": 0.00017485380116959065, + "loss": 1.0762, + "mean_token_accuracy": 0.7435255199670792, + "num_tokens": 3839330.0, + "step": 306 + }, + { + "entropy": 0.9754323288798332, + "epoch": 0.5116666666666667, + "grad_norm": 0.3479340970516205, + "learning_rate": 0.00017473684210526317, + "loss": 0.9591, + "mean_token_accuracy": 0.7697796747088432, + "num_tokens": 3851732.0, + "step": 307 + }, + { + "entropy": 1.0752006620168686, + "epoch": 0.5133333333333333, + "grad_norm": 0.36727213859558105, + "learning_rate": 0.0001746198830409357, + "loss": 1.076, + "mean_token_accuracy": 0.745739258825779, + "num_tokens": 3864226.0, + "step": 308 + }, + { + "entropy": 1.0623352229595184, + "epoch": 0.515, + "grad_norm": 0.5051237940788269, + "learning_rate": 0.0001745029239766082, + "loss": 1.0474, + "mean_token_accuracy": 0.7463502958416939, + "num_tokens": 3876593.0, + "step": 309 + }, + { + "entropy": 1.212167464196682, + "epoch": 0.5166666666666667, + "grad_norm": 0.39934271574020386, + "learning_rate": 0.0001743859649122807, + "loss": 1.2181, + "mean_token_accuracy": 0.7197644412517548, + "num_tokens": 3889321.0, + "step": 310 + }, + { + "entropy": 1.1309688091278076, + "epoch": 0.5183333333333333, + "grad_norm": 0.3818771243095398, + "learning_rate": 0.00017426900584795323, + "loss": 1.1528, + "mean_token_accuracy": 0.7310786545276642, + "num_tokens": 3901680.0, + "step": 311 + }, + { + "entropy": 1.0219652131199837, + "epoch": 0.52, + "grad_norm": 0.38632193207740784, + "learning_rate": 0.00017415204678362573, + "loss": 0.9868, + "mean_token_accuracy": 0.75808484852314, + "num_tokens": 3914432.0, + "step": 312 + }, + { + "entropy": 1.2867136895656586, + "epoch": 0.5216666666666666, + "grad_norm": 0.3463575839996338, + "learning_rate": 0.00017403508771929825, + "loss": 1.3031, + "mean_token_accuracy": 0.6908155083656311, + "num_tokens": 3926978.0, + "step": 313 + }, + { + "entropy": 1.0409365370869637, + "epoch": 0.5233333333333333, + "grad_norm": 0.3130481541156769, + "learning_rate": 0.00017391812865497078, + "loss": 1.0231, + "mean_token_accuracy": 0.7590188607573509, + "num_tokens": 3939332.0, + "step": 314 + }, + { + "entropy": 1.1452669128775597, + "epoch": 0.525, + "grad_norm": 0.3694944679737091, + "learning_rate": 0.00017380116959064327, + "loss": 1.0962, + "mean_token_accuracy": 0.7332403659820557, + "num_tokens": 3951894.0, + "step": 315 + }, + { + "entropy": 1.03856360912323, + "epoch": 0.5266666666666666, + "grad_norm": 0.363551527261734, + "learning_rate": 0.0001736842105263158, + "loss": 1.0056, + "mean_token_accuracy": 0.7608824819326401, + "num_tokens": 3964580.0, + "step": 316 + }, + { + "entropy": 1.2034416571259499, + "epoch": 0.5283333333333333, + "grad_norm": 0.4383612275123596, + "learning_rate": 0.00017356725146198832, + "loss": 1.1768, + "mean_token_accuracy": 0.7235340550541878, + "num_tokens": 3976928.0, + "step": 317 + }, + { + "entropy": 1.0424899756908417, + "epoch": 0.53, + "grad_norm": 0.4106066823005676, + "learning_rate": 0.0001734502923976608, + "loss": 1.0418, + "mean_token_accuracy": 0.7455758079886436, + "num_tokens": 3989698.0, + "step": 318 + }, + { + "entropy": 1.0708102360367775, + "epoch": 0.5316666666666666, + "grad_norm": 0.3656046688556671, + "learning_rate": 0.00017333333333333334, + "loss": 1.0867, + "mean_token_accuracy": 0.739653930068016, + "num_tokens": 4001878.0, + "step": 319 + }, + { + "entropy": 1.0080599710345268, + "epoch": 0.5333333333333333, + "grad_norm": 0.39112988114356995, + "learning_rate": 0.00017321637426900586, + "loss": 1.0352, + "mean_token_accuracy": 0.753649964928627, + "num_tokens": 4014519.0, + "step": 320 + }, + { + "entropy": 1.184828795492649, + "epoch": 0.535, + "grad_norm": 0.7222509980201721, + "learning_rate": 0.00017309941520467836, + "loss": 1.2284, + "mean_token_accuracy": 0.7146416530013084, + "num_tokens": 4026885.0, + "step": 321 + }, + { + "entropy": 1.1485476717352867, + "epoch": 0.5366666666666666, + "grad_norm": 0.4598398506641388, + "learning_rate": 0.00017298245614035088, + "loss": 1.1697, + "mean_token_accuracy": 0.7225939184427261, + "num_tokens": 4039427.0, + "step": 322 + }, + { + "entropy": 1.0673358216881752, + "epoch": 0.5383333333333333, + "grad_norm": 0.3209613263607025, + "learning_rate": 0.0001728654970760234, + "loss": 1.0625, + "mean_token_accuracy": 0.7471172362565994, + "num_tokens": 4052273.0, + "step": 323 + }, + { + "entropy": 1.103698968887329, + "epoch": 0.54, + "grad_norm": 0.4327990412712097, + "learning_rate": 0.0001727485380116959, + "loss": 1.1195, + "mean_token_accuracy": 0.7404018118977547, + "num_tokens": 4064623.0, + "step": 324 + }, + { + "entropy": 0.978014774620533, + "epoch": 0.5416666666666666, + "grad_norm": 0.48123371601104736, + "learning_rate": 0.00017263157894736842, + "loss": 0.9427, + "mean_token_accuracy": 0.76978749781847, + "num_tokens": 4077102.0, + "step": 325 + }, + { + "entropy": 1.196822389960289, + "epoch": 0.5433333333333333, + "grad_norm": 0.4324086308479309, + "learning_rate": 0.00017251461988304094, + "loss": 1.1894, + "mean_token_accuracy": 0.722084753215313, + "num_tokens": 4089599.0, + "step": 326 + }, + { + "entropy": 1.194992557168007, + "epoch": 0.545, + "grad_norm": 0.31563282012939453, + "learning_rate": 0.00017239766081871347, + "loss": 1.1794, + "mean_token_accuracy": 0.7149224281311035, + "num_tokens": 4101954.0, + "step": 327 + }, + { + "entropy": 1.1106646209955215, + "epoch": 0.5466666666666666, + "grad_norm": 0.3899517357349396, + "learning_rate": 0.000172280701754386, + "loss": 1.0646, + "mean_token_accuracy": 0.7455310076475143, + "num_tokens": 4114386.0, + "step": 328 + }, + { + "entropy": 1.0722733810544014, + "epoch": 0.5483333333333333, + "grad_norm": 0.4056398570537567, + "learning_rate": 0.00017216374269005849, + "loss": 1.0508, + "mean_token_accuracy": 0.7531849294900894, + "num_tokens": 4126890.0, + "step": 329 + }, + { + "entropy": 1.180145487189293, + "epoch": 0.55, + "grad_norm": 0.4245923161506653, + "learning_rate": 0.000172046783625731, + "loss": 1.1845, + "mean_token_accuracy": 0.7225575000047684, + "num_tokens": 4139334.0, + "step": 330 + }, + { + "entropy": 1.0642430186271667, + "epoch": 0.5516666666666666, + "grad_norm": 0.33601292967796326, + "learning_rate": 0.00017192982456140353, + "loss": 1.0667, + "mean_token_accuracy": 0.7459002658724785, + "num_tokens": 4152125.0, + "step": 331 + }, + { + "entropy": 1.1839115172624588, + "epoch": 0.5533333333333333, + "grad_norm": 0.41593268513679504, + "learning_rate": 0.00017181286549707603, + "loss": 1.1974, + "mean_token_accuracy": 0.7193257734179497, + "num_tokens": 4164994.0, + "step": 332 + }, + { + "entropy": 1.013751097023487, + "epoch": 0.555, + "grad_norm": 0.5192084312438965, + "learning_rate": 0.00017169590643274855, + "loss": 1.0265, + "mean_token_accuracy": 0.7517165392637253, + "num_tokens": 4177479.0, + "step": 333 + }, + { + "entropy": 1.0605740025639534, + "epoch": 0.5566666666666666, + "grad_norm": 0.48862767219543457, + "learning_rate": 0.00017157894736842107, + "loss": 1.087, + "mean_token_accuracy": 0.7423926591873169, + "num_tokens": 4189789.0, + "step": 334 + }, + { + "entropy": 0.9547868818044662, + "epoch": 0.5583333333333333, + "grad_norm": 0.3805651068687439, + "learning_rate": 0.00017146198830409357, + "loss": 0.9617, + "mean_token_accuracy": 0.7645149901509285, + "num_tokens": 4202221.0, + "step": 335 + }, + { + "entropy": 1.1145575419068336, + "epoch": 0.56, + "grad_norm": 0.4303564429283142, + "learning_rate": 0.0001713450292397661, + "loss": 1.1506, + "mean_token_accuracy": 0.7287673428654671, + "num_tokens": 4214721.0, + "step": 336 + }, + { + "entropy": 1.0919224098324776, + "epoch": 0.5616666666666666, + "grad_norm": 0.3729289770126343, + "learning_rate": 0.00017122807017543862, + "loss": 1.106, + "mean_token_accuracy": 0.7369844168424606, + "num_tokens": 4227452.0, + "step": 337 + }, + { + "entropy": 1.1665791720151901, + "epoch": 0.5633333333333334, + "grad_norm": 0.4484737813472748, + "learning_rate": 0.0001711111111111111, + "loss": 1.1683, + "mean_token_accuracy": 0.7236492782831192, + "num_tokens": 4240203.0, + "step": 338 + }, + { + "entropy": 1.0452025011181831, + "epoch": 0.565, + "grad_norm": 0.45528316497802734, + "learning_rate": 0.00017099415204678363, + "loss": 1.025, + "mean_token_accuracy": 0.7446356862783432, + "num_tokens": 4252566.0, + "step": 339 + }, + { + "entropy": 1.2566066607832909, + "epoch": 0.5666666666666667, + "grad_norm": 0.33891013264656067, + "learning_rate": 0.00017087719298245616, + "loss": 1.2393, + "mean_token_accuracy": 0.7148144468665123, + "num_tokens": 4264997.0, + "step": 340 + }, + { + "entropy": 1.083845317363739, + "epoch": 0.5683333333333334, + "grad_norm": 0.3768393099308014, + "learning_rate": 0.00017076023391812865, + "loss": 1.0713, + "mean_token_accuracy": 0.7463406249880791, + "num_tokens": 4277370.0, + "step": 341 + }, + { + "entropy": 1.0517774820327759, + "epoch": 0.57, + "grad_norm": 0.41828426718711853, + "learning_rate": 0.00017064327485380118, + "loss": 1.0319, + "mean_token_accuracy": 0.7481891736388206, + "num_tokens": 4289689.0, + "step": 342 + }, + { + "entropy": 1.0900000855326653, + "epoch": 0.5716666666666667, + "grad_norm": 0.34837833046913147, + "learning_rate": 0.0001705263157894737, + "loss": 1.0622, + "mean_token_accuracy": 0.7406937256455421, + "num_tokens": 4302237.0, + "step": 343 + }, + { + "entropy": 1.1605029106140137, + "epoch": 0.5733333333333334, + "grad_norm": 0.4013015329837799, + "learning_rate": 0.0001704093567251462, + "loss": 1.155, + "mean_token_accuracy": 0.7249374613165855, + "num_tokens": 4314466.0, + "step": 344 + }, + { + "entropy": 1.1528219133615494, + "epoch": 0.575, + "grad_norm": 0.3658445477485657, + "learning_rate": 0.00017029239766081872, + "loss": 1.1334, + "mean_token_accuracy": 0.7317510321736336, + "num_tokens": 4326923.0, + "step": 345 + }, + { + "entropy": 0.9520823210477829, + "epoch": 0.5766666666666667, + "grad_norm": 0.3611161410808563, + "learning_rate": 0.00017017543859649124, + "loss": 0.9579, + "mean_token_accuracy": 0.7622030377388, + "num_tokens": 4339742.0, + "step": 346 + }, + { + "entropy": 1.050878219306469, + "epoch": 0.5783333333333334, + "grad_norm": 0.34087908267974854, + "learning_rate": 0.00017005847953216376, + "loss": 1.0817, + "mean_token_accuracy": 0.7354206740856171, + "num_tokens": 4352376.0, + "step": 347 + }, + { + "entropy": 0.9521585963666439, + "epoch": 0.58, + "grad_norm": 0.33794334530830383, + "learning_rate": 0.00016994152046783626, + "loss": 0.9943, + "mean_token_accuracy": 0.756027527153492, + "num_tokens": 4365003.0, + "step": 348 + }, + { + "entropy": 1.0915799364447594, + "epoch": 0.5816666666666667, + "grad_norm": 0.40143731236457825, + "learning_rate": 0.00016982456140350878, + "loss": 1.0984, + "mean_token_accuracy": 0.7424183115363121, + "num_tokens": 4377303.0, + "step": 349 + }, + { + "entropy": 0.9480448961257935, + "epoch": 0.5833333333333334, + "grad_norm": 0.4219553470611572, + "learning_rate": 0.0001697076023391813, + "loss": 0.9469, + "mean_token_accuracy": 0.770964540541172, + "num_tokens": 4390015.0, + "step": 350 + }, + { + "entropy": 1.147115521132946, + "epoch": 0.585, + "grad_norm": 0.3754667043685913, + "learning_rate": 0.0001695906432748538, + "loss": 1.1543, + "mean_token_accuracy": 0.7254085168242455, + "num_tokens": 4402733.0, + "step": 351 + }, + { + "entropy": 1.253534510731697, + "epoch": 0.5866666666666667, + "grad_norm": 5.795986652374268, + "learning_rate": 0.00016947368421052633, + "loss": 1.2822, + "mean_token_accuracy": 0.7053791284561157, + "num_tokens": 4415352.0, + "step": 352 + }, + { + "entropy": 1.1879041716456413, + "epoch": 0.5883333333333334, + "grad_norm": 0.3433243930339813, + "learning_rate": 0.00016935672514619885, + "loss": 1.1967, + "mean_token_accuracy": 0.716492310166359, + "num_tokens": 4428044.0, + "step": 353 + }, + { + "entropy": 1.1484075263142586, + "epoch": 0.59, + "grad_norm": 0.3310488760471344, + "learning_rate": 0.00016923976608187134, + "loss": 1.1446, + "mean_token_accuracy": 0.733469732105732, + "num_tokens": 4440325.0, + "step": 354 + }, + { + "entropy": 1.0385578274726868, + "epoch": 0.5916666666666667, + "grad_norm": 0.36995747685432434, + "learning_rate": 0.00016912280701754387, + "loss": 1.0412, + "mean_token_accuracy": 0.7494801208376884, + "num_tokens": 4452827.0, + "step": 355 + }, + { + "entropy": 1.0025876611471176, + "epoch": 0.5933333333333334, + "grad_norm": 0.3320871889591217, + "learning_rate": 0.0001690058479532164, + "loss": 1.0015, + "mean_token_accuracy": 0.7598021700978279, + "num_tokens": 4465154.0, + "step": 356 + }, + { + "entropy": 0.9714228957891464, + "epoch": 0.595, + "grad_norm": 0.3272383511066437, + "learning_rate": 0.00016888888888888889, + "loss": 0.9632, + "mean_token_accuracy": 0.7730072066187859, + "num_tokens": 4477940.0, + "step": 357 + }, + { + "entropy": 0.9843970835208893, + "epoch": 0.5966666666666667, + "grad_norm": 0.331716924905777, + "learning_rate": 0.0001687719298245614, + "loss": 0.9832, + "mean_token_accuracy": 0.7648526951670647, + "num_tokens": 4490324.0, + "step": 358 + }, + { + "entropy": 1.0355599969625473, + "epoch": 0.5983333333333334, + "grad_norm": 0.328522652387619, + "learning_rate": 0.00016865497076023393, + "loss": 1.0229, + "mean_token_accuracy": 0.7536440342664719, + "num_tokens": 4502903.0, + "step": 359 + }, + { + "entropy": 1.1647923961281776, + "epoch": 0.6, + "grad_norm": 0.35590431094169617, + "learning_rate": 0.00016853801169590643, + "loss": 1.1555, + "mean_token_accuracy": 0.7302551120519638, + "num_tokens": 4515421.0, + "step": 360 + }, + { + "epoch": 0.6, + "eval_entropy": 1.1506038707096735, + "eval_loss": 1.147754430770874, + "eval_mean_token_accuracy": 0.7277677131266367, + "eval_num_tokens": 4515421.0, + "eval_runtime": 2668.7626, + "eval_samples_per_second": 1.874, + "eval_steps_per_second": 0.937, + "step": 360 + }, + { + "entropy": 1.0822272449731827, + "epoch": 0.6016666666666667, + "grad_norm": 0.3461451232433319, + "learning_rate": 0.00016842105263157895, + "loss": 1.0538, + "mean_token_accuracy": 0.7523676231503487, + "num_tokens": 4527959.0, + "step": 361 + }, + { + "entropy": 0.9786264225840569, + "epoch": 0.6033333333333334, + "grad_norm": 0.34319812059402466, + "learning_rate": 0.00016830409356725147, + "loss": 0.9882, + "mean_token_accuracy": 0.7686471715569496, + "num_tokens": 4540528.0, + "step": 362 + }, + { + "entropy": 1.0155667290091515, + "epoch": 0.605, + "grad_norm": 0.424983412027359, + "learning_rate": 0.00016818713450292397, + "loss": 0.9836, + "mean_token_accuracy": 0.7593458294868469, + "num_tokens": 4553090.0, + "step": 363 + }, + { + "entropy": 1.1010105088353157, + "epoch": 0.6066666666666667, + "grad_norm": 0.35969412326812744, + "learning_rate": 0.0001680701754385965, + "loss": 1.093, + "mean_token_accuracy": 0.7331129014492035, + "num_tokens": 4565595.0, + "step": 364 + }, + { + "entropy": 1.0532179400324821, + "epoch": 0.6083333333333333, + "grad_norm": 0.3190361559391022, + "learning_rate": 0.00016795321637426902, + "loss": 1.0569, + "mean_token_accuracy": 0.7494515404105186, + "num_tokens": 4577921.0, + "step": 365 + }, + { + "entropy": 1.1271280273795128, + "epoch": 0.61, + "grad_norm": 0.43350517749786377, + "learning_rate": 0.0001678362573099415, + "loss": 1.1579, + "mean_token_accuracy": 0.729541227221489, + "num_tokens": 4590555.0, + "step": 366 + }, + { + "entropy": 1.1090258359909058, + "epoch": 0.6116666666666667, + "grad_norm": 0.4741346538066864, + "learning_rate": 0.00016771929824561406, + "loss": 1.1043, + "mean_token_accuracy": 0.7302690967917442, + "num_tokens": 4603046.0, + "step": 367 + }, + { + "entropy": 1.0478442907333374, + "epoch": 0.6133333333333333, + "grad_norm": 0.4757939279079437, + "learning_rate": 0.00016760233918128656, + "loss": 1.0388, + "mean_token_accuracy": 0.7470605447888374, + "num_tokens": 4615534.0, + "step": 368 + }, + { + "entropy": 1.1102875471115112, + "epoch": 0.615, + "grad_norm": 0.34702980518341064, + "learning_rate": 0.00016748538011695905, + "loss": 1.1123, + "mean_token_accuracy": 0.7333204820752144, + "num_tokens": 4628160.0, + "step": 369 + }, + { + "entropy": 1.037328228354454, + "epoch": 0.6166666666666667, + "grad_norm": 0.3619624972343445, + "learning_rate": 0.0001673684210526316, + "loss": 1.0005, + "mean_token_accuracy": 0.7570820525288582, + "num_tokens": 4640833.0, + "step": 370 + }, + { + "entropy": 1.138997420668602, + "epoch": 0.6183333333333333, + "grad_norm": 0.5491196513175964, + "learning_rate": 0.0001672514619883041, + "loss": 1.1537, + "mean_token_accuracy": 0.7282362058758736, + "num_tokens": 4653071.0, + "step": 371 + }, + { + "entropy": 1.0007020235061646, + "epoch": 0.62, + "grad_norm": 0.33528798818588257, + "learning_rate": 0.00016713450292397662, + "loss": 0.9821, + "mean_token_accuracy": 0.7631944566965103, + "num_tokens": 4665686.0, + "step": 372 + }, + { + "entropy": 1.0706755891442299, + "epoch": 0.6216666666666667, + "grad_norm": 0.5516906976699829, + "learning_rate": 0.00016701754385964915, + "loss": 1.0717, + "mean_token_accuracy": 0.7461396679282188, + "num_tokens": 4678137.0, + "step": 373 + }, + { + "entropy": 1.0305498763918877, + "epoch": 0.6233333333333333, + "grad_norm": 0.4353298544883728, + "learning_rate": 0.00016690058479532164, + "loss": 1.0007, + "mean_token_accuracy": 0.7523628026247025, + "num_tokens": 4690948.0, + "step": 374 + }, + { + "entropy": 1.0295665189623833, + "epoch": 0.625, + "grad_norm": 0.33500105142593384, + "learning_rate": 0.00016678362573099417, + "loss": 1.0147, + "mean_token_accuracy": 0.7575552314519882, + "num_tokens": 4703464.0, + "step": 375 + }, + { + "entropy": 1.261782169342041, + "epoch": 0.6266666666666667, + "grad_norm": 0.3233359158039093, + "learning_rate": 0.0001666666666666667, + "loss": 1.2725, + "mean_token_accuracy": 0.6999640017747879, + "num_tokens": 4716083.0, + "step": 376 + }, + { + "entropy": 1.0816773176193237, + "epoch": 0.6283333333333333, + "grad_norm": 0.36155110597610474, + "learning_rate": 0.00016654970760233918, + "loss": 1.0828, + "mean_token_accuracy": 0.7376217916607857, + "num_tokens": 4728514.0, + "step": 377 + }, + { + "entropy": 0.9655618295073509, + "epoch": 0.63, + "grad_norm": 0.4339034855365753, + "learning_rate": 0.0001664327485380117, + "loss": 0.9711, + "mean_token_accuracy": 0.761038102209568, + "num_tokens": 4740972.0, + "step": 378 + }, + { + "entropy": 1.0789566859602928, + "epoch": 0.6316666666666667, + "grad_norm": 1.1587278842926025, + "learning_rate": 0.00016631578947368423, + "loss": 1.0795, + "mean_token_accuracy": 0.7451052665710449, + "num_tokens": 4753592.0, + "step": 379 + }, + { + "entropy": 1.1246184334158897, + "epoch": 0.6333333333333333, + "grad_norm": 0.3487575948238373, + "learning_rate": 0.00016619883040935673, + "loss": 1.1458, + "mean_token_accuracy": 0.7255075052380562, + "num_tokens": 4766271.0, + "step": 380 + }, + { + "entropy": 0.9961473643779755, + "epoch": 0.635, + "grad_norm": 0.3496881425380707, + "learning_rate": 0.00016608187134502925, + "loss": 0.9703, + "mean_token_accuracy": 0.7653394937515259, + "num_tokens": 4778509.0, + "step": 381 + }, + { + "entropy": 1.0803877338767052, + "epoch": 0.6366666666666667, + "grad_norm": 0.42682647705078125, + "learning_rate": 0.00016596491228070177, + "loss": 1.0697, + "mean_token_accuracy": 0.7415556833148003, + "num_tokens": 4790982.0, + "step": 382 + }, + { + "entropy": 1.23675137758255, + "epoch": 0.6383333333333333, + "grad_norm": 0.48014000058174133, + "learning_rate": 0.00016584795321637427, + "loss": 1.2376, + "mean_token_accuracy": 0.7128911018371582, + "num_tokens": 4803455.0, + "step": 383 + }, + { + "entropy": 1.1108812019228935, + "epoch": 0.64, + "grad_norm": 0.38982605934143066, + "learning_rate": 0.0001657309941520468, + "loss": 1.1064, + "mean_token_accuracy": 0.7403939291834831, + "num_tokens": 4815947.0, + "step": 384 + }, + { + "entropy": 1.0063207522034645, + "epoch": 0.6416666666666667, + "grad_norm": 0.350036084651947, + "learning_rate": 0.00016561403508771931, + "loss": 0.9986, + "mean_token_accuracy": 0.7624973133206367, + "num_tokens": 4828658.0, + "step": 385 + }, + { + "entropy": 1.0487488061189651, + "epoch": 0.6433333333333333, + "grad_norm": 0.41516250371932983, + "learning_rate": 0.0001654970760233918, + "loss": 1.0116, + "mean_token_accuracy": 0.751943901181221, + "num_tokens": 4841203.0, + "step": 386 + }, + { + "entropy": 1.055719830095768, + "epoch": 0.645, + "grad_norm": 0.3185690939426422, + "learning_rate": 0.00016538011695906433, + "loss": 1.051, + "mean_token_accuracy": 0.7423844113945961, + "num_tokens": 4853646.0, + "step": 387 + }, + { + "entropy": 1.0037056729197502, + "epoch": 0.6466666666666666, + "grad_norm": 0.38994213938713074, + "learning_rate": 0.00016526315789473686, + "loss": 1.0071, + "mean_token_accuracy": 0.759202741086483, + "num_tokens": 4866080.0, + "step": 388 + }, + { + "entropy": 1.2100782170891762, + "epoch": 0.6483333333333333, + "grad_norm": 0.3755752742290497, + "learning_rate": 0.00016514619883040935, + "loss": 1.2323, + "mean_token_accuracy": 0.6998582407832146, + "num_tokens": 4878573.0, + "step": 389 + }, + { + "entropy": 0.964059017598629, + "epoch": 0.65, + "grad_norm": 0.47546395659446716, + "learning_rate": 0.0001650292397660819, + "loss": 0.9479, + "mean_token_accuracy": 0.7689564228057861, + "num_tokens": 4891251.0, + "step": 390 + }, + { + "entropy": 1.0938122794032097, + "epoch": 0.6516666666666666, + "grad_norm": 0.31692859530448914, + "learning_rate": 0.0001649122807017544, + "loss": 1.112, + "mean_token_accuracy": 0.734823040664196, + "num_tokens": 4903849.0, + "step": 391 + }, + { + "entropy": 1.2476731687784195, + "epoch": 0.6533333333333333, + "grad_norm": 0.3951464295387268, + "learning_rate": 0.0001647953216374269, + "loss": 1.2505, + "mean_token_accuracy": 0.7056182846426964, + "num_tokens": 4916605.0, + "step": 392 + }, + { + "entropy": 1.0463567823171616, + "epoch": 0.655, + "grad_norm": 0.3860931992530823, + "learning_rate": 0.00016467836257309944, + "loss": 1.0277, + "mean_token_accuracy": 0.7453161776065826, + "num_tokens": 4929002.0, + "step": 393 + }, + { + "entropy": 1.1087151244282722, + "epoch": 0.6566666666666666, + "grad_norm": 0.34755659103393555, + "learning_rate": 0.00016456140350877194, + "loss": 1.09, + "mean_token_accuracy": 0.7396251112222672, + "num_tokens": 4941434.0, + "step": 394 + }, + { + "entropy": 1.2049994841217995, + "epoch": 0.6583333333333333, + "grad_norm": 0.39863044023513794, + "learning_rate": 0.00016444444444444444, + "loss": 1.1623, + "mean_token_accuracy": 0.7202948480844498, + "num_tokens": 4953876.0, + "step": 395 + }, + { + "entropy": 1.0984854400157928, + "epoch": 0.66, + "grad_norm": 0.38562482595443726, + "learning_rate": 0.00016432748538011699, + "loss": 1.0935, + "mean_token_accuracy": 0.7386272475123405, + "num_tokens": 4966358.0, + "step": 396 + }, + { + "entropy": 1.0148514583706856, + "epoch": 0.6616666666666666, + "grad_norm": 0.3571801781654358, + "learning_rate": 0.00016421052631578948, + "loss": 1.0091, + "mean_token_accuracy": 0.7530024722218513, + "num_tokens": 4979075.0, + "step": 397 + }, + { + "entropy": 1.1101968213915825, + "epoch": 0.6633333333333333, + "grad_norm": 0.37536031007766724, + "learning_rate": 0.00016409356725146198, + "loss": 1.1301, + "mean_token_accuracy": 0.7349445819854736, + "num_tokens": 4991282.0, + "step": 398 + }, + { + "entropy": 1.0139321312308311, + "epoch": 0.665, + "grad_norm": 0.3825819194316864, + "learning_rate": 0.00016397660818713453, + "loss": 1.0468, + "mean_token_accuracy": 0.7493115812540054, + "num_tokens": 5003653.0, + "step": 399 + }, + { + "entropy": 0.9064864292740822, + "epoch": 0.6666666666666666, + "grad_norm": 0.37946513295173645, + "learning_rate": 0.00016385964912280702, + "loss": 0.9145, + "mean_token_accuracy": 0.7758191227912903, + "num_tokens": 5016221.0, + "step": 400 + }, + { + "entropy": 1.0882866755127907, + "epoch": 0.6683333333333333, + "grad_norm": 0.32677122950553894, + "learning_rate": 0.00016374269005847952, + "loss": 1.08, + "mean_token_accuracy": 0.7416599541902542, + "num_tokens": 5028657.0, + "step": 401 + }, + { + "entropy": 1.1420171335339546, + "epoch": 0.67, + "grad_norm": 0.34600427746772766, + "learning_rate": 0.00016362573099415207, + "loss": 1.1095, + "mean_token_accuracy": 0.7331014648079872, + "num_tokens": 5041256.0, + "step": 402 + }, + { + "entropy": 1.2241623848676682, + "epoch": 0.6716666666666666, + "grad_norm": 0.38560718297958374, + "learning_rate": 0.00016350877192982457, + "loss": 1.2226, + "mean_token_accuracy": 0.7100344523787498, + "num_tokens": 5053831.0, + "step": 403 + }, + { + "entropy": 0.9865270294249058, + "epoch": 0.6733333333333333, + "grad_norm": 0.3292683959007263, + "learning_rate": 0.00016339181286549706, + "loss": 0.9717, + "mean_token_accuracy": 0.7668761685490608, + "num_tokens": 5066367.0, + "step": 404 + }, + { + "entropy": 1.0482201799750328, + "epoch": 0.675, + "grad_norm": 0.5308311581611633, + "learning_rate": 0.0001632748538011696, + "loss": 1.0109, + "mean_token_accuracy": 0.7543414235115051, + "num_tokens": 5079159.0, + "step": 405 + }, + { + "entropy": 1.0350475907325745, + "epoch": 0.6766666666666666, + "grad_norm": 0.34866708517074585, + "learning_rate": 0.0001631578947368421, + "loss": 1.0179, + "mean_token_accuracy": 0.7569469437003136, + "num_tokens": 5091744.0, + "step": 406 + }, + { + "entropy": 1.0928167328238487, + "epoch": 0.6783333333333333, + "grad_norm": 0.38286733627319336, + "learning_rate": 0.00016304093567251463, + "loss": 1.0727, + "mean_token_accuracy": 0.7430339977145195, + "num_tokens": 5104211.0, + "step": 407 + }, + { + "entropy": 1.1038938909769058, + "epoch": 0.68, + "grad_norm": 0.3320970833301544, + "learning_rate": 0.00016292397660818715, + "loss": 1.1293, + "mean_token_accuracy": 0.7329866364598274, + "num_tokens": 5116819.0, + "step": 408 + }, + { + "entropy": 1.042891263961792, + "epoch": 0.6816666666666666, + "grad_norm": 0.38465678691864014, + "learning_rate": 0.00016280701754385965, + "loss": 1.0725, + "mean_token_accuracy": 0.747420534491539, + "num_tokens": 5129352.0, + "step": 409 + }, + { + "entropy": 1.083320826292038, + "epoch": 0.6833333333333333, + "grad_norm": 0.35532549023628235, + "learning_rate": 0.00016269005847953217, + "loss": 1.081, + "mean_token_accuracy": 0.7357378974556923, + "num_tokens": 5141789.0, + "step": 410 + }, + { + "entropy": 1.0968919321894646, + "epoch": 0.685, + "grad_norm": 0.37026599049568176, + "learning_rate": 0.0001625730994152047, + "loss": 1.0946, + "mean_token_accuracy": 0.7453345954418182, + "num_tokens": 5154353.0, + "step": 411 + }, + { + "entropy": 1.1025621965527534, + "epoch": 0.6866666666666666, + "grad_norm": 0.38080209493637085, + "learning_rate": 0.0001624561403508772, + "loss": 1.1408, + "mean_token_accuracy": 0.7331846132874489, + "num_tokens": 5167104.0, + "step": 412 + }, + { + "entropy": 0.9193796887993813, + "epoch": 0.6883333333333334, + "grad_norm": 0.4362526834011078, + "learning_rate": 0.00016233918128654972, + "loss": 0.9057, + "mean_token_accuracy": 0.7817874625325203, + "num_tokens": 5179789.0, + "step": 413 + }, + { + "entropy": 1.1486607491970062, + "epoch": 0.69, + "grad_norm": 0.43080970644950867, + "learning_rate": 0.00016222222222222224, + "loss": 1.1557, + "mean_token_accuracy": 0.7257590591907501, + "num_tokens": 5192550.0, + "step": 414 + }, + { + "entropy": 1.1312269866466522, + "epoch": 0.6916666666666667, + "grad_norm": 0.33069857954978943, + "learning_rate": 0.00016210526315789473, + "loss": 1.1336, + "mean_token_accuracy": 0.7364302352070808, + "num_tokens": 5205107.0, + "step": 415 + }, + { + "entropy": 1.1854391023516655, + "epoch": 0.6933333333333334, + "grad_norm": 0.5881476998329163, + "learning_rate": 0.00016198830409356726, + "loss": 1.1957, + "mean_token_accuracy": 0.7226643934845924, + "num_tokens": 5217821.0, + "step": 416 + }, + { + "entropy": 1.153312124311924, + "epoch": 0.695, + "grad_norm": 0.506568431854248, + "learning_rate": 0.00016187134502923978, + "loss": 1.1506, + "mean_token_accuracy": 0.7315347641706467, + "num_tokens": 5230369.0, + "step": 417 + }, + { + "entropy": 1.0305966809391975, + "epoch": 0.6966666666666667, + "grad_norm": 0.37616896629333496, + "learning_rate": 0.00016175438596491228, + "loss": 1.0028, + "mean_token_accuracy": 0.7581542059779167, + "num_tokens": 5243034.0, + "step": 418 + }, + { + "entropy": 1.184016190469265, + "epoch": 0.6983333333333334, + "grad_norm": 0.3351752460002899, + "learning_rate": 0.0001616374269005848, + "loss": 1.1678, + "mean_token_accuracy": 0.7203118875622749, + "num_tokens": 5255634.0, + "step": 419 + }, + { + "entropy": 1.1107853651046753, + "epoch": 0.7, + "grad_norm": 0.5095561742782593, + "learning_rate": 0.00016152046783625732, + "loss": 1.1212, + "mean_token_accuracy": 0.7350276410579681, + "num_tokens": 5268326.0, + "step": 420 + }, + { + "entropy": 1.0543791353702545, + "epoch": 0.7016666666666667, + "grad_norm": 0.4145483672618866, + "learning_rate": 0.00016140350877192982, + "loss": 1.0542, + "mean_token_accuracy": 0.7537256851792336, + "num_tokens": 5281032.0, + "step": 421 + }, + { + "entropy": 1.0694367215037346, + "epoch": 0.7033333333333334, + "grad_norm": 0.4112991690635681, + "learning_rate": 0.00016128654970760234, + "loss": 1.0322, + "mean_token_accuracy": 0.7503436282277107, + "num_tokens": 5293573.0, + "step": 422 + }, + { + "entropy": 1.116036280989647, + "epoch": 0.705, + "grad_norm": 0.614185094833374, + "learning_rate": 0.00016116959064327486, + "loss": 1.1065, + "mean_token_accuracy": 0.7300728484988213, + "num_tokens": 5306194.0, + "step": 423 + }, + { + "entropy": 1.042599968612194, + "epoch": 0.7066666666666667, + "grad_norm": 0.9990386366844177, + "learning_rate": 0.00016105263157894736, + "loss": 1.0299, + "mean_token_accuracy": 0.7559118717908859, + "num_tokens": 5318533.0, + "step": 424 + }, + { + "entropy": 1.119065299630165, + "epoch": 0.7083333333333334, + "grad_norm": 0.48257675766944885, + "learning_rate": 0.00016093567251461988, + "loss": 1.1202, + "mean_token_accuracy": 0.7334137335419655, + "num_tokens": 5331208.0, + "step": 425 + }, + { + "entropy": 1.1456444934010506, + "epoch": 0.71, + "grad_norm": 0.3156353533267975, + "learning_rate": 0.0001608187134502924, + "loss": 1.1071, + "mean_token_accuracy": 0.7339949384331703, + "num_tokens": 5343567.0, + "step": 426 + }, + { + "entropy": 1.1843851804733276, + "epoch": 0.7116666666666667, + "grad_norm": 0.4149649143218994, + "learning_rate": 0.00016070175438596493, + "loss": 1.2098, + "mean_token_accuracy": 0.7177803814411163, + "num_tokens": 5355834.0, + "step": 427 + }, + { + "entropy": 1.0262960121035576, + "epoch": 0.7133333333333334, + "grad_norm": 0.47338202595710754, + "learning_rate": 0.00016058479532163743, + "loss": 1.0406, + "mean_token_accuracy": 0.7452673614025116, + "num_tokens": 5368384.0, + "step": 428 + }, + { + "entropy": 1.0742842629551888, + "epoch": 0.715, + "grad_norm": 0.4460916519165039, + "learning_rate": 0.00016046783625730995, + "loss": 1.0959, + "mean_token_accuracy": 0.7396591976284981, + "num_tokens": 5380753.0, + "step": 429 + }, + { + "entropy": 0.9951488599181175, + "epoch": 0.7166666666666667, + "grad_norm": 0.3373461961746216, + "learning_rate": 0.00016035087719298247, + "loss": 0.9911, + "mean_token_accuracy": 0.7618727758526802, + "num_tokens": 5393349.0, + "step": 430 + }, + { + "entropy": 1.121171198785305, + "epoch": 0.7183333333333334, + "grad_norm": 0.4891211688518524, + "learning_rate": 0.000160233918128655, + "loss": 1.1048, + "mean_token_accuracy": 0.7355387806892395, + "num_tokens": 5405990.0, + "step": 431 + }, + { + "entropy": 1.1132899299263954, + "epoch": 0.72, + "grad_norm": 0.4116326570510864, + "learning_rate": 0.0001601169590643275, + "loss": 1.0877, + "mean_token_accuracy": 0.732665404677391, + "num_tokens": 5418879.0, + "step": 432 + }, + { + "entropy": 1.0193076133728027, + "epoch": 0.7216666666666667, + "grad_norm": 0.4597654640674591, + "learning_rate": 0.00016, + "loss": 0.9755, + "mean_token_accuracy": 0.7617568150162697, + "num_tokens": 5431625.0, + "step": 433 + }, + { + "entropy": 1.102819487452507, + "epoch": 0.7233333333333334, + "grad_norm": 0.3408207595348358, + "learning_rate": 0.00015988304093567254, + "loss": 1.1085, + "mean_token_accuracy": 0.7342933788895607, + "num_tokens": 5444373.0, + "step": 434 + }, + { + "entropy": 1.1037746369838715, + "epoch": 0.725, + "grad_norm": 0.3480106294155121, + "learning_rate": 0.00015976608187134503, + "loss": 1.0826, + "mean_token_accuracy": 0.7379643693566322, + "num_tokens": 5456763.0, + "step": 435 + }, + { + "entropy": 1.0365785732865334, + "epoch": 0.7266666666666667, + "grad_norm": 0.6273422241210938, + "learning_rate": 0.00015964912280701756, + "loss": 1.046, + "mean_token_accuracy": 0.7498196363449097, + "num_tokens": 5469348.0, + "step": 436 + }, + { + "entropy": 1.0879196152091026, + "epoch": 0.7283333333333334, + "grad_norm": 0.4042668342590332, + "learning_rate": 0.00015953216374269008, + "loss": 1.0885, + "mean_token_accuracy": 0.746056891977787, + "num_tokens": 5481851.0, + "step": 437 + }, + { + "entropy": 1.0286137238144875, + "epoch": 0.73, + "grad_norm": 0.3042530119419098, + "learning_rate": 0.00015941520467836257, + "loss": 1.0445, + "mean_token_accuracy": 0.7513305693864822, + "num_tokens": 5494525.0, + "step": 438 + }, + { + "entropy": 1.0987597107887268, + "epoch": 0.7316666666666667, + "grad_norm": 0.3762005567550659, + "learning_rate": 0.0001592982456140351, + "loss": 1.0922, + "mean_token_accuracy": 0.7425737306475639, + "num_tokens": 5507144.0, + "step": 439 + }, + { + "entropy": 1.127465382218361, + "epoch": 0.7333333333333333, + "grad_norm": 0.5283383131027222, + "learning_rate": 0.00015918128654970762, + "loss": 1.1109, + "mean_token_accuracy": 0.7371382638812065, + "num_tokens": 5519953.0, + "step": 440 + }, + { + "entropy": 1.2035595402121544, + "epoch": 0.735, + "grad_norm": 0.4114871025085449, + "learning_rate": 0.00015906432748538012, + "loss": 1.2056, + "mean_token_accuracy": 0.7149497643113136, + "num_tokens": 5532573.0, + "step": 441 + }, + { + "entropy": 1.0220743417739868, + "epoch": 0.7366666666666667, + "grad_norm": 0.3489610552787781, + "learning_rate": 0.00015894736842105264, + "loss": 0.9867, + "mean_token_accuracy": 0.7571059986948967, + "num_tokens": 5545041.0, + "step": 442 + }, + { + "entropy": 0.8919041678309441, + "epoch": 0.7383333333333333, + "grad_norm": 0.44151565432548523, + "learning_rate": 0.00015883040935672516, + "loss": 0.8787, + "mean_token_accuracy": 0.7870308607816696, + "num_tokens": 5557653.0, + "step": 443 + }, + { + "entropy": 1.0376613810658455, + "epoch": 0.74, + "grad_norm": 0.418760746717453, + "learning_rate": 0.00015871345029239766, + "loss": 1.0233, + "mean_token_accuracy": 0.756847932934761, + "num_tokens": 5570252.0, + "step": 444 + }, + { + "entropy": 1.2031901478767395, + "epoch": 0.7416666666666667, + "grad_norm": 0.3491179049015045, + "learning_rate": 0.00015859649122807018, + "loss": 1.2008, + "mean_token_accuracy": 0.7186564728617668, + "num_tokens": 5582827.0, + "step": 445 + }, + { + "entropy": 1.010102555155754, + "epoch": 0.7433333333333333, + "grad_norm": 0.3810936212539673, + "learning_rate": 0.0001584795321637427, + "loss": 0.984, + "mean_token_accuracy": 0.7599733769893646, + "num_tokens": 5595401.0, + "step": 446 + }, + { + "entropy": 1.075411356985569, + "epoch": 0.745, + "grad_norm": 0.43530669808387756, + "learning_rate": 0.0001583625730994152, + "loss": 1.1041, + "mean_token_accuracy": 0.7356607168912888, + "num_tokens": 5607942.0, + "step": 447 + }, + { + "entropy": 0.9819160103797913, + "epoch": 0.7466666666666667, + "grad_norm": 0.40340355038642883, + "learning_rate": 0.00015824561403508772, + "loss": 1.0154, + "mean_token_accuracy": 0.7579329013824463, + "num_tokens": 5620235.0, + "step": 448 + }, + { + "entropy": 1.0106851756572723, + "epoch": 0.7483333333333333, + "grad_norm": 0.34522029757499695, + "learning_rate": 0.00015812865497076025, + "loss": 1.0051, + "mean_token_accuracy": 0.756496749818325, + "num_tokens": 5633075.0, + "step": 449 + }, + { + "entropy": 1.1666646376252174, + "epoch": 0.75, + "grad_norm": 0.31447482109069824, + "learning_rate": 0.00015801169590643277, + "loss": 1.1664, + "mean_token_accuracy": 0.7229723930358887, + "num_tokens": 5645725.0, + "step": 450 + }, + { + "entropy": 0.9302625432610512, + "epoch": 0.7516666666666667, + "grad_norm": 0.33545219898223877, + "learning_rate": 0.00015789473684210527, + "loss": 0.9136, + "mean_token_accuracy": 0.7761102318763733, + "num_tokens": 5658368.0, + "step": 451 + }, + { + "entropy": 1.0969965159893036, + "epoch": 0.7533333333333333, + "grad_norm": 0.3889496624469757, + "learning_rate": 0.0001577777777777778, + "loss": 1.0953, + "mean_token_accuracy": 0.7352195754647255, + "num_tokens": 5671044.0, + "step": 452 + }, + { + "entropy": 1.110503688454628, + "epoch": 0.755, + "grad_norm": 0.36706921458244324, + "learning_rate": 0.0001576608187134503, + "loss": 1.1267, + "mean_token_accuracy": 0.734914131462574, + "num_tokens": 5683498.0, + "step": 453 + }, + { + "entropy": 1.1655322015285492, + "epoch": 0.7566666666666667, + "grad_norm": 0.3752106726169586, + "learning_rate": 0.0001575438596491228, + "loss": 1.1619, + "mean_token_accuracy": 0.7204124853014946, + "num_tokens": 5695777.0, + "step": 454 + }, + { + "entropy": 1.0306529253721237, + "epoch": 0.7583333333333333, + "grad_norm": 0.32710975408554077, + "learning_rate": 0.00015742690058479533, + "loss": 1.0054, + "mean_token_accuracy": 0.7572605907917023, + "num_tokens": 5708165.0, + "step": 455 + }, + { + "entropy": 1.2195520401000977, + "epoch": 0.76, + "grad_norm": 0.44669049978256226, + "learning_rate": 0.00015730994152046785, + "loss": 1.2255, + "mean_token_accuracy": 0.7160174250602722, + "num_tokens": 5720892.0, + "step": 456 + }, + { + "entropy": 1.0541856065392494, + "epoch": 0.7616666666666667, + "grad_norm": 0.34097859263420105, + "learning_rate": 0.00015719298245614035, + "loss": 1.043, + "mean_token_accuracy": 0.7480626776814461, + "num_tokens": 5733726.0, + "step": 457 + }, + { + "entropy": 1.1402226686477661, + "epoch": 0.7633333333333333, + "grad_norm": 0.8296970725059509, + "learning_rate": 0.00015707602339181287, + "loss": 1.0894, + "mean_token_accuracy": 0.7324612215161324, + "num_tokens": 5746163.0, + "step": 458 + }, + { + "entropy": 1.223743736743927, + "epoch": 0.765, + "grad_norm": 0.31552544236183167, + "learning_rate": 0.0001569590643274854, + "loss": 1.2019, + "mean_token_accuracy": 0.7200068011879921, + "num_tokens": 5758756.0, + "step": 459 + }, + { + "entropy": 1.0422032475471497, + "epoch": 0.7666666666666667, + "grad_norm": 0.33989831805229187, + "learning_rate": 0.0001568421052631579, + "loss": 1.0594, + "mean_token_accuracy": 0.7472169026732445, + "num_tokens": 5771343.0, + "step": 460 + }, + { + "entropy": 0.9894929677248001, + "epoch": 0.7683333333333333, + "grad_norm": 0.3527223765850067, + "learning_rate": 0.00015672514619883041, + "loss": 0.9773, + "mean_token_accuracy": 0.7673698663711548, + "num_tokens": 5783675.0, + "step": 461 + }, + { + "entropy": 1.0100511014461517, + "epoch": 0.77, + "grad_norm": 0.3458307981491089, + "learning_rate": 0.00015660818713450294, + "loss": 1.0227, + "mean_token_accuracy": 0.7536729276180267, + "num_tokens": 5796529.0, + "step": 462 + }, + { + "entropy": 1.1438388898968697, + "epoch": 0.7716666666666666, + "grad_norm": 0.3677491843700409, + "learning_rate": 0.00015649122807017543, + "loss": 1.1439, + "mean_token_accuracy": 0.7285068556666374, + "num_tokens": 5808813.0, + "step": 463 + }, + { + "entropy": 1.1831453144550323, + "epoch": 0.7733333333333333, + "grad_norm": 0.3970377445220947, + "learning_rate": 0.00015637426900584796, + "loss": 1.2011, + "mean_token_accuracy": 0.7191813364624977, + "num_tokens": 5821697.0, + "step": 464 + }, + { + "entropy": 1.0034284479916096, + "epoch": 0.775, + "grad_norm": 0.3054230809211731, + "learning_rate": 0.00015625730994152048, + "loss": 0.9897, + "mean_token_accuracy": 0.7667393088340759, + "num_tokens": 5834449.0, + "step": 465 + }, + { + "entropy": 1.0818930864334106, + "epoch": 0.7766666666666666, + "grad_norm": 1.377423644065857, + "learning_rate": 0.00015614035087719297, + "loss": 1.0527, + "mean_token_accuracy": 0.7446305453777313, + "num_tokens": 5847123.0, + "step": 466 + }, + { + "entropy": 1.2061632052063942, + "epoch": 0.7783333333333333, + "grad_norm": 0.36091288924217224, + "learning_rate": 0.0001560233918128655, + "loss": 1.225, + "mean_token_accuracy": 0.7133935913443565, + "num_tokens": 5859520.0, + "step": 467 + }, + { + "entropy": 0.9294106736779213, + "epoch": 0.78, + "grad_norm": 0.37731432914733887, + "learning_rate": 0.00015590643274853802, + "loss": 0.9141, + "mean_token_accuracy": 0.7746392264962196, + "num_tokens": 5872398.0, + "step": 468 + }, + { + "entropy": 1.1364581286907196, + "epoch": 0.7816666666666666, + "grad_norm": 0.3194917142391205, + "learning_rate": 0.00015578947368421052, + "loss": 1.1358, + "mean_token_accuracy": 0.7337777689099312, + "num_tokens": 5884964.0, + "step": 469 + }, + { + "entropy": 1.0378025621175766, + "epoch": 0.7833333333333333, + "grad_norm": 0.28554415702819824, + "learning_rate": 0.00015567251461988307, + "loss": 1.0323, + "mean_token_accuracy": 0.7506354302167892, + "num_tokens": 5897550.0, + "step": 470 + }, + { + "entropy": 1.0520753636956215, + "epoch": 0.785, + "grad_norm": 0.3658890128135681, + "learning_rate": 0.00015555555555555556, + "loss": 1.0722, + "mean_token_accuracy": 0.7424618750810623, + "num_tokens": 5910216.0, + "step": 471 + }, + { + "entropy": 1.0621570646762848, + "epoch": 0.7866666666666666, + "grad_norm": 0.5311276912689209, + "learning_rate": 0.00015543859649122806, + "loss": 1.0606, + "mean_token_accuracy": 0.7419012188911438, + "num_tokens": 5922884.0, + "step": 472 + }, + { + "entropy": 1.1522653177380562, + "epoch": 0.7883333333333333, + "grad_norm": 0.4982028603553772, + "learning_rate": 0.0001553216374269006, + "loss": 1.1606, + "mean_token_accuracy": 0.725653164088726, + "num_tokens": 5935273.0, + "step": 473 + }, + { + "entropy": 1.0625966489315033, + "epoch": 0.79, + "grad_norm": 0.4393594264984131, + "learning_rate": 0.0001552046783625731, + "loss": 1.0302, + "mean_token_accuracy": 0.748927153646946, + "num_tokens": 5947999.0, + "step": 474 + }, + { + "entropy": 1.1587400287389755, + "epoch": 0.7916666666666666, + "grad_norm": 4.552830696105957, + "learning_rate": 0.00015508771929824563, + "loss": 1.1518, + "mean_token_accuracy": 0.7315473929047585, + "num_tokens": 5960567.0, + "step": 475 + }, + { + "entropy": 0.9702800586819649, + "epoch": 0.7933333333333333, + "grad_norm": 0.5625200271606445, + "learning_rate": 0.00015497076023391815, + "loss": 0.9613, + "mean_token_accuracy": 0.7680332958698273, + "num_tokens": 5972922.0, + "step": 476 + }, + { + "entropy": 1.066114716231823, + "epoch": 0.795, + "grad_norm": 0.41820967197418213, + "learning_rate": 0.00015485380116959065, + "loss": 1.053, + "mean_token_accuracy": 0.7436480596661568, + "num_tokens": 5985492.0, + "step": 477 + }, + { + "entropy": 1.121892273426056, + "epoch": 0.7966666666666666, + "grad_norm": 0.43082207441329956, + "learning_rate": 0.00015473684210526317, + "loss": 1.135, + "mean_token_accuracy": 0.726841926574707, + "num_tokens": 5997846.0, + "step": 478 + }, + { + "entropy": 0.9342339262366295, + "epoch": 0.7983333333333333, + "grad_norm": 0.7145920395851135, + "learning_rate": 0.0001546198830409357, + "loss": 0.9339, + "mean_token_accuracy": 0.7741554453969002, + "num_tokens": 6010409.0, + "step": 479 + }, + { + "entropy": 1.0903588011860847, + "epoch": 0.8, + "grad_norm": 0.5080858469009399, + "learning_rate": 0.0001545029239766082, + "loss": 1.0853, + "mean_token_accuracy": 0.7472739815711975, + "num_tokens": 6022911.0, + "step": 480 + }, + { + "entropy": 1.0341752544045448, + "epoch": 0.8016666666666666, + "grad_norm": 0.3803417384624481, + "learning_rate": 0.0001543859649122807, + "loss": 1.0156, + "mean_token_accuracy": 0.7580198422074318, + "num_tokens": 6035536.0, + "step": 481 + }, + { + "entropy": 0.9857856929302216, + "epoch": 0.8033333333333333, + "grad_norm": 0.5306246876716614, + "learning_rate": 0.00015426900584795324, + "loss": 0.9642, + "mean_token_accuracy": 0.7662367448210716, + "num_tokens": 6048113.0, + "step": 482 + }, + { + "entropy": 1.0703137665987015, + "epoch": 0.805, + "grad_norm": 0.49546366930007935, + "learning_rate": 0.00015415204678362573, + "loss": 1.0364, + "mean_token_accuracy": 0.7481872513890266, + "num_tokens": 6060978.0, + "step": 483 + }, + { + "entropy": 1.1418191492557526, + "epoch": 0.8066666666666666, + "grad_norm": 0.39517495036125183, + "learning_rate": 0.00015403508771929825, + "loss": 1.1241, + "mean_token_accuracy": 0.7268876954913139, + "num_tokens": 6073300.0, + "step": 484 + }, + { + "entropy": 1.0620516315102577, + "epoch": 0.8083333333333333, + "grad_norm": 0.33354660868644714, + "learning_rate": 0.00015391812865497078, + "loss": 1.0627, + "mean_token_accuracy": 0.7447437271475792, + "num_tokens": 6085969.0, + "step": 485 + }, + { + "entropy": 1.1370228081941605, + "epoch": 0.81, + "grad_norm": 0.4519972801208496, + "learning_rate": 0.00015380116959064327, + "loss": 1.1648, + "mean_token_accuracy": 0.727062314748764, + "num_tokens": 6098405.0, + "step": 486 + }, + { + "entropy": 1.116714984178543, + "epoch": 0.8116666666666666, + "grad_norm": 0.3834986984729767, + "learning_rate": 0.0001536842105263158, + "loss": 1.1237, + "mean_token_accuracy": 0.7360536903142929, + "num_tokens": 6110790.0, + "step": 487 + }, + { + "entropy": 1.1766095086932182, + "epoch": 0.8133333333333334, + "grad_norm": 0.3951236605644226, + "learning_rate": 0.00015356725146198832, + "loss": 1.2038, + "mean_token_accuracy": 0.7180357128381729, + "num_tokens": 6123104.0, + "step": 488 + }, + { + "entropy": 1.0317028015851974, + "epoch": 0.815, + "grad_norm": 0.31130674481391907, + "learning_rate": 0.00015345029239766081, + "loss": 1.018, + "mean_token_accuracy": 0.7511586546897888, + "num_tokens": 6135499.0, + "step": 489 + }, + { + "entropy": 1.0695944800972939, + "epoch": 0.8166666666666667, + "grad_norm": 0.41369152069091797, + "learning_rate": 0.00015333333333333334, + "loss": 1.0622, + "mean_token_accuracy": 0.743607684969902, + "num_tokens": 6147940.0, + "step": 490 + }, + { + "entropy": 0.973351001739502, + "epoch": 0.8183333333333334, + "grad_norm": 0.4467531442642212, + "learning_rate": 0.00015321637426900586, + "loss": 0.9566, + "mean_token_accuracy": 0.7704202383756638, + "num_tokens": 6160394.0, + "step": 491 + }, + { + "entropy": 1.0809285417199135, + "epoch": 0.82, + "grad_norm": 0.37388381361961365, + "learning_rate": 0.00015309941520467836, + "loss": 1.0841, + "mean_token_accuracy": 0.7388180121779442, + "num_tokens": 6172807.0, + "step": 492 + }, + { + "entropy": 1.1079175993800163, + "epoch": 0.8216666666666667, + "grad_norm": 0.2852994203567505, + "learning_rate": 0.0001529824561403509, + "loss": 1.1107, + "mean_token_accuracy": 0.7399151921272278, + "num_tokens": 6185562.0, + "step": 493 + }, + { + "entropy": 1.051186740398407, + "epoch": 0.8233333333333334, + "grad_norm": 0.35674288868904114, + "learning_rate": 0.0001528654970760234, + "loss": 1.036, + "mean_token_accuracy": 0.7519017159938812, + "num_tokens": 6198341.0, + "step": 494 + }, + { + "entropy": 1.112432986497879, + "epoch": 0.825, + "grad_norm": 0.47925853729248047, + "learning_rate": 0.0001527485380116959, + "loss": 1.0952, + "mean_token_accuracy": 0.7376601323485374, + "num_tokens": 6210805.0, + "step": 495 + }, + { + "entropy": 1.136579304933548, + "epoch": 0.8266666666666667, + "grad_norm": 0.3833317160606384, + "learning_rate": 0.00015263157894736845, + "loss": 1.1597, + "mean_token_accuracy": 0.7291416153311729, + "num_tokens": 6223352.0, + "step": 496 + }, + { + "entropy": 0.9923493564128876, + "epoch": 0.8283333333333334, + "grad_norm": 0.3395523726940155, + "learning_rate": 0.00015251461988304094, + "loss": 0.9754, + "mean_token_accuracy": 0.7562162950634956, + "num_tokens": 6235932.0, + "step": 497 + }, + { + "entropy": 1.0614645034074783, + "epoch": 0.83, + "grad_norm": 0.3680817782878876, + "learning_rate": 0.00015239766081871344, + "loss": 1.0593, + "mean_token_accuracy": 0.7413745895028114, + "num_tokens": 6248285.0, + "step": 498 + }, + { + "entropy": 0.9913108944892883, + "epoch": 0.8316666666666667, + "grad_norm": 0.3792550265789032, + "learning_rate": 0.000152280701754386, + "loss": 0.9644, + "mean_token_accuracy": 0.7651605606079102, + "num_tokens": 6261083.0, + "step": 499 + }, + { + "entropy": 1.07400331646204, + "epoch": 0.8333333333333334, + "grad_norm": 0.40772777795791626, + "learning_rate": 0.0001521637426900585, + "loss": 1.0577, + "mean_token_accuracy": 0.7462185472249985, + "num_tokens": 6273916.0, + "step": 500 + }, + { + "entropy": 1.1181387081742287, + "epoch": 0.835, + "grad_norm": 0.33552879095077515, + "learning_rate": 0.00015204678362573098, + "loss": 1.0951, + "mean_token_accuracy": 0.7397628724575043, + "num_tokens": 6286445.0, + "step": 501 + }, + { + "entropy": 1.1042726710438728, + "epoch": 0.8366666666666667, + "grad_norm": 0.36931732296943665, + "learning_rate": 0.00015192982456140353, + "loss": 1.1075, + "mean_token_accuracy": 0.7333213239908218, + "num_tokens": 6298861.0, + "step": 502 + }, + { + "entropy": 1.0911922678351402, + "epoch": 0.8383333333333334, + "grad_norm": 0.72311931848526, + "learning_rate": 0.00015181286549707603, + "loss": 1.0636, + "mean_token_accuracy": 0.7454661652445793, + "num_tokens": 6311619.0, + "step": 503 + }, + { + "entropy": 0.9951315149664879, + "epoch": 0.84, + "grad_norm": 0.40310415625572205, + "learning_rate": 0.00015169590643274852, + "loss": 1.0008, + "mean_token_accuracy": 0.7613855600357056, + "num_tokens": 6324047.0, + "step": 504 + }, + { + "entropy": 0.9872754141688347, + "epoch": 0.8416666666666667, + "grad_norm": 0.3429985046386719, + "learning_rate": 0.00015157894736842108, + "loss": 0.9861, + "mean_token_accuracy": 0.7662120833992958, + "num_tokens": 6336544.0, + "step": 505 + }, + { + "entropy": 1.0408969223499298, + "epoch": 0.8433333333333334, + "grad_norm": 0.36479246616363525, + "learning_rate": 0.00015146198830409357, + "loss": 1.0479, + "mean_token_accuracy": 0.7461319342255592, + "num_tokens": 6349291.0, + "step": 506 + }, + { + "entropy": 1.1309384107589722, + "epoch": 0.845, + "grad_norm": 0.3272276818752289, + "learning_rate": 0.00015134502923976607, + "loss": 1.1283, + "mean_token_accuracy": 0.732874296605587, + "num_tokens": 6361636.0, + "step": 507 + }, + { + "entropy": 1.025696039199829, + "epoch": 0.8466666666666667, + "grad_norm": 0.30461591482162476, + "learning_rate": 0.00015122807017543862, + "loss": 1.008, + "mean_token_accuracy": 0.754217803478241, + "num_tokens": 6374330.0, + "step": 508 + }, + { + "entropy": 0.9422205537557602, + "epoch": 0.8483333333333334, + "grad_norm": 0.34366074204444885, + "learning_rate": 0.0001511111111111111, + "loss": 0.9349, + "mean_token_accuracy": 0.7703981176018715, + "num_tokens": 6386892.0, + "step": 509 + }, + { + "entropy": 1.0122661367058754, + "epoch": 0.85, + "grad_norm": 0.35145244002342224, + "learning_rate": 0.00015099415204678364, + "loss": 1.011, + "mean_token_accuracy": 0.749379850924015, + "num_tokens": 6399805.0, + "step": 510 + }, + { + "entropy": 0.980590432882309, + "epoch": 0.8516666666666667, + "grad_norm": 0.4205039441585541, + "learning_rate": 0.00015087719298245616, + "loss": 0.9929, + "mean_token_accuracy": 0.7566556483507156, + "num_tokens": 6412486.0, + "step": 511 + }, + { + "entropy": 1.0508618205785751, + "epoch": 0.8533333333333334, + "grad_norm": 0.40067777037620544, + "learning_rate": 0.00015076023391812865, + "loss": 1.0756, + "mean_token_accuracy": 0.7451096102595329, + "num_tokens": 6425135.0, + "step": 512 + }, + { + "entropy": 0.8931740894913673, + "epoch": 0.855, + "grad_norm": 0.3182304799556732, + "learning_rate": 0.00015064327485380118, + "loss": 0.9151, + "mean_token_accuracy": 0.7833112999796867, + "num_tokens": 6437931.0, + "step": 513 + }, + { + "entropy": 1.039138525724411, + "epoch": 0.8566666666666667, + "grad_norm": 0.35510554909706116, + "learning_rate": 0.0001505263157894737, + "loss": 1.0345, + "mean_token_accuracy": 0.7493056431412697, + "num_tokens": 6450669.0, + "step": 514 + }, + { + "entropy": 0.9968626797199249, + "epoch": 0.8583333333333333, + "grad_norm": 0.4760706424713135, + "learning_rate": 0.0001504093567251462, + "loss": 0.9712, + "mean_token_accuracy": 0.7673781663179398, + "num_tokens": 6463375.0, + "step": 515 + }, + { + "entropy": 1.2044792175292969, + "epoch": 0.86, + "grad_norm": 0.32037344574928284, + "learning_rate": 0.00015029239766081872, + "loss": 1.1869, + "mean_token_accuracy": 0.7217776477336884, + "num_tokens": 6476101.0, + "step": 516 + }, + { + "entropy": 1.0119360834360123, + "epoch": 0.8616666666666667, + "grad_norm": 0.3619234263896942, + "learning_rate": 0.00015017543859649124, + "loss": 1.0088, + "mean_token_accuracy": 0.7546406164765358, + "num_tokens": 6488799.0, + "step": 517 + }, + { + "entropy": 0.9528638869524002, + "epoch": 0.8633333333333333, + "grad_norm": 0.3633744716644287, + "learning_rate": 0.00015005847953216374, + "loss": 0.9415, + "mean_token_accuracy": 0.7687265649437904, + "num_tokens": 6501728.0, + "step": 518 + }, + { + "entropy": 0.9753984436392784, + "epoch": 0.865, + "grad_norm": 0.35341712832450867, + "learning_rate": 0.00014994152046783626, + "loss": 0.962, + "mean_token_accuracy": 0.7667989581823349, + "num_tokens": 6514229.0, + "step": 519 + }, + { + "entropy": 1.1342968195676804, + "epoch": 0.8666666666666667, + "grad_norm": 0.35511359572410583, + "learning_rate": 0.00014982456140350878, + "loss": 1.1138, + "mean_token_accuracy": 0.7273155152797699, + "num_tokens": 6526771.0, + "step": 520 + }, + { + "entropy": 1.1301146745681763, + "epoch": 0.8683333333333333, + "grad_norm": 0.54920893907547, + "learning_rate": 0.00014970760233918128, + "loss": 1.125, + "mean_token_accuracy": 0.726063072681427, + "num_tokens": 6539121.0, + "step": 521 + }, + { + "entropy": 1.108246959745884, + "epoch": 0.87, + "grad_norm": 0.4356490671634674, + "learning_rate": 0.0001495906432748538, + "loss": 1.1239, + "mean_token_accuracy": 0.7272974625229836, + "num_tokens": 6551645.0, + "step": 522 + }, + { + "entropy": 1.0279260724782944, + "epoch": 0.8716666666666667, + "grad_norm": 0.3281795084476471, + "learning_rate": 0.00014947368421052633, + "loss": 1.0203, + "mean_token_accuracy": 0.7577808052301407, + "num_tokens": 6564090.0, + "step": 523 + }, + { + "entropy": 1.0240607187151909, + "epoch": 0.8733333333333333, + "grad_norm": 0.38245126605033875, + "learning_rate": 0.00014935672514619882, + "loss": 1.0336, + "mean_token_accuracy": 0.754253052175045, + "num_tokens": 6576574.0, + "step": 524 + }, + { + "entropy": 1.0713168308138847, + "epoch": 0.875, + "grad_norm": 0.4837334454059601, + "learning_rate": 0.00014923976608187135, + "loss": 1.0833, + "mean_token_accuracy": 0.7464174851775169, + "num_tokens": 6589106.0, + "step": 525 + }, + { + "entropy": 1.0884229391813278, + "epoch": 0.8766666666666667, + "grad_norm": 0.2943100035190582, + "learning_rate": 0.00014912280701754387, + "loss": 1.1, + "mean_token_accuracy": 0.7427090853452682, + "num_tokens": 6601258.0, + "step": 526 + }, + { + "entropy": 1.2013995423913002, + "epoch": 0.8783333333333333, + "grad_norm": 0.37657371163368225, + "learning_rate": 0.00014900584795321636, + "loss": 1.2225, + "mean_token_accuracy": 0.7097373679280281, + "num_tokens": 6613781.0, + "step": 527 + }, + { + "entropy": 1.1369323432445526, + "epoch": 0.88, + "grad_norm": 0.47003990411758423, + "learning_rate": 0.0001488888888888889, + "loss": 1.1584, + "mean_token_accuracy": 0.7299115657806396, + "num_tokens": 6626534.0, + "step": 528 + }, + { + "entropy": 0.957214891910553, + "epoch": 0.8816666666666667, + "grad_norm": 0.31164368987083435, + "learning_rate": 0.0001487719298245614, + "loss": 0.9584, + "mean_token_accuracy": 0.7725479602813721, + "num_tokens": 6639006.0, + "step": 529 + }, + { + "entropy": 1.051169142127037, + "epoch": 0.8833333333333333, + "grad_norm": 0.3350316882133484, + "learning_rate": 0.00014865497076023393, + "loss": 1.0422, + "mean_token_accuracy": 0.7498800754547119, + "num_tokens": 6651122.0, + "step": 530 + }, + { + "entropy": 1.1519297286868095, + "epoch": 0.885, + "grad_norm": 0.7732307314872742, + "learning_rate": 0.00014853801169590643, + "loss": 1.1511, + "mean_token_accuracy": 0.7270490527153015, + "num_tokens": 6663635.0, + "step": 531 + }, + { + "entropy": 0.9805091023445129, + "epoch": 0.8866666666666667, + "grad_norm": 0.4671543836593628, + "learning_rate": 0.00014842105263157895, + "loss": 0.9635, + "mean_token_accuracy": 0.7717467620968819, + "num_tokens": 6676184.0, + "step": 532 + }, + { + "entropy": 1.125210352241993, + "epoch": 0.8883333333333333, + "grad_norm": 0.3196561932563782, + "learning_rate": 0.00014830409356725148, + "loss": 1.1003, + "mean_token_accuracy": 0.7358310669660568, + "num_tokens": 6688989.0, + "step": 533 + }, + { + "entropy": 0.9900911301374435, + "epoch": 0.89, + "grad_norm": 0.3698914647102356, + "learning_rate": 0.000148187134502924, + "loss": 0.9624, + "mean_token_accuracy": 0.7684887275099754, + "num_tokens": 6701563.0, + "step": 534 + }, + { + "entropy": 1.1607655212283134, + "epoch": 0.8916666666666667, + "grad_norm": 0.36797863245010376, + "learning_rate": 0.0001480701754385965, + "loss": 1.1324, + "mean_token_accuracy": 0.726492203772068, + "num_tokens": 6714026.0, + "step": 535 + }, + { + "entropy": 1.122398853302002, + "epoch": 0.8933333333333333, + "grad_norm": 0.32914605736732483, + "learning_rate": 0.00014795321637426902, + "loss": 1.0988, + "mean_token_accuracy": 0.7404111847281456, + "num_tokens": 6726144.0, + "step": 536 + }, + { + "entropy": 1.1616889387369156, + "epoch": 0.895, + "grad_norm": 0.35523873567581177, + "learning_rate": 0.00014783625730994154, + "loss": 1.2038, + "mean_token_accuracy": 0.7175712808966637, + "num_tokens": 6738984.0, + "step": 537 + }, + { + "entropy": 1.038257472217083, + "epoch": 0.8966666666666666, + "grad_norm": 0.3147364556789398, + "learning_rate": 0.00014771929824561404, + "loss": 1.0323, + "mean_token_accuracy": 0.755009114742279, + "num_tokens": 6751404.0, + "step": 538 + }, + { + "entropy": 1.0097395554184914, + "epoch": 0.8983333333333333, + "grad_norm": 0.38519787788391113, + "learning_rate": 0.00014760233918128656, + "loss": 1.0577, + "mean_token_accuracy": 0.7535994872450829, + "num_tokens": 6763959.0, + "step": 539 + }, + { + "entropy": 1.1510286554694176, + "epoch": 0.9, + "grad_norm": 0.5456175804138184, + "learning_rate": 0.00014748538011695908, + "loss": 1.1552, + "mean_token_accuracy": 0.7292575761675835, + "num_tokens": 6776521.0, + "step": 540 + }, + { + "entropy": 1.0821500197052956, + "epoch": 0.9016666666666666, + "grad_norm": 0.358005166053772, + "learning_rate": 0.00014736842105263158, + "loss": 1.0762, + "mean_token_accuracy": 0.7408832535147667, + "num_tokens": 6789060.0, + "step": 541 + }, + { + "entropy": 1.111682377755642, + "epoch": 0.9033333333333333, + "grad_norm": 0.30763792991638184, + "learning_rate": 0.0001472514619883041, + "loss": 1.1123, + "mean_token_accuracy": 0.7355025187134743, + "num_tokens": 6801425.0, + "step": 542 + }, + { + "entropy": 0.953965000808239, + "epoch": 0.905, + "grad_norm": 0.4438503384590149, + "learning_rate": 0.00014713450292397662, + "loss": 0.9275, + "mean_token_accuracy": 0.7724104151129723, + "num_tokens": 6814332.0, + "step": 543 + }, + { + "entropy": 1.0561645030975342, + "epoch": 0.9066666666666666, + "grad_norm": 0.5390836596488953, + "learning_rate": 0.00014701754385964912, + "loss": 1.035, + "mean_token_accuracy": 0.7513830289244652, + "num_tokens": 6826918.0, + "step": 544 + }, + { + "entropy": 1.144611619412899, + "epoch": 0.9083333333333333, + "grad_norm": 0.6197843551635742, + "learning_rate": 0.00014690058479532164, + "loss": 1.0745, + "mean_token_accuracy": 0.736310139298439, + "num_tokens": 6839687.0, + "step": 545 + }, + { + "entropy": 1.1314348950982094, + "epoch": 0.91, + "grad_norm": 0.4797782003879547, + "learning_rate": 0.00014678362573099417, + "loss": 1.1071, + "mean_token_accuracy": 0.7364379167556763, + "num_tokens": 6852267.0, + "step": 546 + }, + { + "entropy": 1.0520601645112038, + "epoch": 0.9116666666666666, + "grad_norm": 0.40423494577407837, + "learning_rate": 0.00014666666666666666, + "loss": 1.0716, + "mean_token_accuracy": 0.7455736324191093, + "num_tokens": 6864783.0, + "step": 547 + }, + { + "entropy": 1.0261236801743507, + "epoch": 0.9133333333333333, + "grad_norm": 0.38641560077667236, + "learning_rate": 0.00014654970760233919, + "loss": 1.0317, + "mean_token_accuracy": 0.7592292949557304, + "num_tokens": 6877450.0, + "step": 548 + }, + { + "entropy": 0.996756412088871, + "epoch": 0.915, + "grad_norm": 0.45368504524230957, + "learning_rate": 0.0001464327485380117, + "loss": 1.0146, + "mean_token_accuracy": 0.7616635635495186, + "num_tokens": 6889999.0, + "step": 549 + }, + { + "entropy": 1.1064547002315521, + "epoch": 0.9166666666666666, + "grad_norm": 1.2801135778427124, + "learning_rate": 0.00014631578947368423, + "loss": 1.1435, + "mean_token_accuracy": 0.7284985184669495, + "num_tokens": 6902795.0, + "step": 550 + }, + { + "entropy": 1.0498828887939453, + "epoch": 0.9183333333333333, + "grad_norm": 0.33059030771255493, + "learning_rate": 0.00014619883040935673, + "loss": 1.0641, + "mean_token_accuracy": 0.7469140291213989, + "num_tokens": 6915455.0, + "step": 551 + }, + { + "entropy": 1.0957630798220634, + "epoch": 0.92, + "grad_norm": 0.4147501289844513, + "learning_rate": 0.00014608187134502925, + "loss": 1.1234, + "mean_token_accuracy": 0.7397946789860725, + "num_tokens": 6927897.0, + "step": 552 + }, + { + "entropy": 1.0699311718344688, + "epoch": 0.9216666666666666, + "grad_norm": 0.5108545422554016, + "learning_rate": 0.00014596491228070177, + "loss": 1.0595, + "mean_token_accuracy": 0.745146743953228, + "num_tokens": 6940318.0, + "step": 553 + }, + { + "entropy": 1.1025114730000496, + "epoch": 0.9233333333333333, + "grad_norm": 0.4209730625152588, + "learning_rate": 0.00014584795321637427, + "loss": 1.0914, + "mean_token_accuracy": 0.738226130604744, + "num_tokens": 6952938.0, + "step": 554 + }, + { + "entropy": 1.115426942706108, + "epoch": 0.925, + "grad_norm": 0.6522241830825806, + "learning_rate": 0.0001457309941520468, + "loss": 1.105, + "mean_token_accuracy": 0.7367953211069107, + "num_tokens": 6965483.0, + "step": 555 + }, + { + "entropy": 1.0508125722408295, + "epoch": 0.9266666666666666, + "grad_norm": 0.4295806884765625, + "learning_rate": 0.00014561403508771932, + "loss": 1.0411, + "mean_token_accuracy": 0.7544974535703659, + "num_tokens": 6978017.0, + "step": 556 + }, + { + "entropy": 0.9892331138253212, + "epoch": 0.9283333333333333, + "grad_norm": 0.37067389488220215, + "learning_rate": 0.0001454970760233918, + "loss": 0.9823, + "mean_token_accuracy": 0.765356183052063, + "num_tokens": 6990616.0, + "step": 557 + }, + { + "entropy": 1.095439076423645, + "epoch": 0.93, + "grad_norm": 0.5443627238273621, + "learning_rate": 0.00014538011695906433, + "loss": 1.0851, + "mean_token_accuracy": 0.7415796294808388, + "num_tokens": 7003399.0, + "step": 558 + }, + { + "entropy": 1.1333895400166512, + "epoch": 0.9316666666666666, + "grad_norm": 0.5122875571250916, + "learning_rate": 0.00014526315789473686, + "loss": 1.1032, + "mean_token_accuracy": 0.7338119447231293, + "num_tokens": 7015872.0, + "step": 559 + }, + { + "entropy": 1.1227083802223206, + "epoch": 0.9333333333333333, + "grad_norm": 0.3637396991252899, + "learning_rate": 0.00014514619883040935, + "loss": 1.0848, + "mean_token_accuracy": 0.7447784096002579, + "num_tokens": 7028607.0, + "step": 560 + }, + { + "entropy": 1.014683723449707, + "epoch": 0.935, + "grad_norm": 0.44447654485702515, + "learning_rate": 0.00014502923976608188, + "loss": 1.016, + "mean_token_accuracy": 0.7610589489340782, + "num_tokens": 7041311.0, + "step": 561 + }, + { + "entropy": 1.0506829991936684, + "epoch": 0.9366666666666666, + "grad_norm": 0.6196885704994202, + "learning_rate": 0.0001449122807017544, + "loss": 1.067, + "mean_token_accuracy": 0.749383956193924, + "num_tokens": 7054093.0, + "step": 562 + }, + { + "entropy": 1.099650725722313, + "epoch": 0.9383333333333334, + "grad_norm": 0.4171883165836334, + "learning_rate": 0.0001447953216374269, + "loss": 1.0832, + "mean_token_accuracy": 0.7402654960751534, + "num_tokens": 7066465.0, + "step": 563 + }, + { + "entropy": 1.1942984014749527, + "epoch": 0.94, + "grad_norm": 0.33442190289497375, + "learning_rate": 0.00014467836257309942, + "loss": 1.1844, + "mean_token_accuracy": 0.7190638408064842, + "num_tokens": 7079064.0, + "step": 564 + }, + { + "entropy": 1.024195820093155, + "epoch": 0.9416666666666667, + "grad_norm": 0.7153050303459167, + "learning_rate": 0.00014456140350877194, + "loss": 1.0327, + "mean_token_accuracy": 0.7473888471722603, + "num_tokens": 7091659.0, + "step": 565 + }, + { + "entropy": 1.0238566473126411, + "epoch": 0.9433333333333334, + "grad_norm": 0.39869338274002075, + "learning_rate": 0.00014444444444444444, + "loss": 1.0185, + "mean_token_accuracy": 0.7563499286770821, + "num_tokens": 7104066.0, + "step": 566 + }, + { + "entropy": 1.0474332720041275, + "epoch": 0.945, + "grad_norm": 0.3648073971271515, + "learning_rate": 0.00014432748538011696, + "loss": 1.0393, + "mean_token_accuracy": 0.7489822506904602, + "num_tokens": 7116617.0, + "step": 567 + }, + { + "entropy": 1.050786353647709, + "epoch": 0.9466666666666667, + "grad_norm": 0.4042346477508545, + "learning_rate": 0.00014421052631578948, + "loss": 1.0455, + "mean_token_accuracy": 0.755046546459198, + "num_tokens": 7128987.0, + "step": 568 + }, + { + "entropy": 1.034841150045395, + "epoch": 0.9483333333333334, + "grad_norm": 0.4897487163543701, + "learning_rate": 0.00014409356725146198, + "loss": 1.0298, + "mean_token_accuracy": 0.7585836425423622, + "num_tokens": 7141286.0, + "step": 569 + }, + { + "entropy": 0.9620069712400436, + "epoch": 0.95, + "grad_norm": 0.31527337431907654, + "learning_rate": 0.00014397660818713453, + "loss": 0.9693, + "mean_token_accuracy": 0.7721031606197357, + "num_tokens": 7153845.0, + "step": 570 + }, + { + "entropy": 1.0886893197894096, + "epoch": 0.9516666666666667, + "grad_norm": 0.322122186422348, + "learning_rate": 0.00014385964912280703, + "loss": 1.1107, + "mean_token_accuracy": 0.7335245460271835, + "num_tokens": 7166461.0, + "step": 571 + }, + { + "entropy": 1.077827326953411, + "epoch": 0.9533333333333334, + "grad_norm": 0.342206209897995, + "learning_rate": 0.00014374269005847952, + "loss": 1.1087, + "mean_token_accuracy": 0.7354479655623436, + "num_tokens": 7179412.0, + "step": 572 + }, + { + "entropy": 0.9776570126414299, + "epoch": 0.955, + "grad_norm": 0.36812132596969604, + "learning_rate": 0.00014362573099415207, + "loss": 0.9973, + "mean_token_accuracy": 0.7615081444382668, + "num_tokens": 7191954.0, + "step": 573 + }, + { + "entropy": 1.0732961222529411, + "epoch": 0.9566666666666667, + "grad_norm": 0.3786291182041168, + "learning_rate": 0.00014350877192982457, + "loss": 1.0882, + "mean_token_accuracy": 0.7390999048948288, + "num_tokens": 7204637.0, + "step": 574 + }, + { + "entropy": 1.1834777668118477, + "epoch": 0.9583333333333334, + "grad_norm": 0.5077952146530151, + "learning_rate": 0.00014339181286549706, + "loss": 1.1723, + "mean_token_accuracy": 0.721016451716423, + "num_tokens": 7217015.0, + "step": 575 + }, + { + "entropy": 1.0592687726020813, + "epoch": 0.96, + "grad_norm": 0.37753114104270935, + "learning_rate": 0.00014327485380116961, + "loss": 1.03, + "mean_token_accuracy": 0.759399339556694, + "num_tokens": 7229837.0, + "step": 576 + }, + { + "entropy": 1.1041856706142426, + "epoch": 0.9616666666666667, + "grad_norm": 0.6394175291061401, + "learning_rate": 0.0001431578947368421, + "loss": 1.1038, + "mean_token_accuracy": 0.734016478061676, + "num_tokens": 7242931.0, + "step": 577 + }, + { + "entropy": 0.9978655651211739, + "epoch": 0.9633333333333334, + "grad_norm": 0.506219208240509, + "learning_rate": 0.00014304093567251463, + "loss": 0.9685, + "mean_token_accuracy": 0.7632486671209335, + "num_tokens": 7255579.0, + "step": 578 + }, + { + "entropy": 1.181634321808815, + "epoch": 0.965, + "grad_norm": 0.2994771897792816, + "learning_rate": 0.00014292397660818716, + "loss": 1.1717, + "mean_token_accuracy": 0.7260859459638596, + "num_tokens": 7268114.0, + "step": 579 + }, + { + "entropy": 1.074332445859909, + "epoch": 0.9666666666666667, + "grad_norm": 0.4389213025569916, + "learning_rate": 0.00014280701754385965, + "loss": 1.0814, + "mean_token_accuracy": 0.7418688982725143, + "num_tokens": 7280696.0, + "step": 580 + }, + { + "entropy": 1.1216760650277138, + "epoch": 0.9683333333333334, + "grad_norm": 0.445691853761673, + "learning_rate": 0.00014269005847953217, + "loss": 1.1332, + "mean_token_accuracy": 0.7311763614416122, + "num_tokens": 7293554.0, + "step": 581 + }, + { + "entropy": 1.0460280254483223, + "epoch": 0.97, + "grad_norm": 0.4536060690879822, + "learning_rate": 0.0001425730994152047, + "loss": 1.0734, + "mean_token_accuracy": 0.7441517636179924, + "num_tokens": 7306011.0, + "step": 582 + }, + { + "entropy": 0.9758674651384354, + "epoch": 0.9716666666666667, + "grad_norm": 0.3244321048259735, + "learning_rate": 0.0001424561403508772, + "loss": 0.9507, + "mean_token_accuracy": 0.7699640765786171, + "num_tokens": 7318847.0, + "step": 583 + }, + { + "entropy": 1.0076914280653, + "epoch": 0.9733333333333334, + "grad_norm": 0.3470350503921509, + "learning_rate": 0.00014233918128654972, + "loss": 1.0062, + "mean_token_accuracy": 0.7613753005862236, + "num_tokens": 7331344.0, + "step": 584 + }, + { + "entropy": 1.0001231580972672, + "epoch": 0.975, + "grad_norm": 0.47927960753440857, + "learning_rate": 0.00014222222222222224, + "loss": 0.9762, + "mean_token_accuracy": 0.7627003714442253, + "num_tokens": 7344081.0, + "step": 585 + }, + { + "entropy": 1.0094628855586052, + "epoch": 0.9766666666666667, + "grad_norm": 0.3564485013484955, + "learning_rate": 0.00014210526315789474, + "loss": 0.9876, + "mean_token_accuracy": 0.7616174221038818, + "num_tokens": 7356857.0, + "step": 586 + }, + { + "entropy": 1.1501418203115463, + "epoch": 0.9783333333333334, + "grad_norm": 0.4095627963542938, + "learning_rate": 0.00014198830409356726, + "loss": 1.1615, + "mean_token_accuracy": 0.7238694280385971, + "num_tokens": 7369198.0, + "step": 587 + }, + { + "entropy": 0.9223703965544701, + "epoch": 0.98, + "grad_norm": 0.5018359422683716, + "learning_rate": 0.00014187134502923978, + "loss": 0.8939, + "mean_token_accuracy": 0.7803265228867531, + "num_tokens": 7381601.0, + "step": 588 + }, + { + "entropy": 1.0110130235552788, + "epoch": 0.9816666666666667, + "grad_norm": 0.311329185962677, + "learning_rate": 0.00014175438596491228, + "loss": 1.0116, + "mean_token_accuracy": 0.7552156001329422, + "num_tokens": 7393736.0, + "step": 589 + }, + { + "entropy": 1.1393386349081993, + "epoch": 0.9833333333333333, + "grad_norm": 0.37725117802619934, + "learning_rate": 0.0001416374269005848, + "loss": 1.1883, + "mean_token_accuracy": 0.7277703955769539, + "num_tokens": 7406423.0, + "step": 590 + }, + { + "entropy": 1.0763930901885033, + "epoch": 0.985, + "grad_norm": 0.39282989501953125, + "learning_rate": 0.00014152046783625732, + "loss": 1.0786, + "mean_token_accuracy": 0.7410120218992233, + "num_tokens": 7419133.0, + "step": 591 + }, + { + "entropy": 1.016746073961258, + "epoch": 0.9866666666666667, + "grad_norm": 0.3819109797477722, + "learning_rate": 0.00014140350877192982, + "loss": 1.0051, + "mean_token_accuracy": 0.7601424679160118, + "num_tokens": 7431867.0, + "step": 592 + }, + { + "entropy": 1.1087006330490112, + "epoch": 0.9883333333333333, + "grad_norm": 0.2999494969844818, + "learning_rate": 0.00014128654970760234, + "loss": 1.1193, + "mean_token_accuracy": 0.7328962907195091, + "num_tokens": 7444366.0, + "step": 593 + }, + { + "entropy": 1.0927283689379692, + "epoch": 0.99, + "grad_norm": 0.38941463828086853, + "learning_rate": 0.00014116959064327487, + "loss": 1.0806, + "mean_token_accuracy": 0.7454044669866562, + "num_tokens": 7457006.0, + "step": 594 + }, + { + "entropy": 1.0681473091244698, + "epoch": 0.9916666666666667, + "grad_norm": 0.38292431831359863, + "learning_rate": 0.00014105263157894736, + "loss": 1.0783, + "mean_token_accuracy": 0.7404307276010513, + "num_tokens": 7469273.0, + "step": 595 + }, + { + "entropy": 1.133509248495102, + "epoch": 0.9933333333333333, + "grad_norm": 0.3152455985546112, + "learning_rate": 0.0001409356725146199, + "loss": 1.1119, + "mean_token_accuracy": 0.7372884005308151, + "num_tokens": 7481852.0, + "step": 596 + }, + { + "entropy": 1.166313149034977, + "epoch": 0.995, + "grad_norm": 0.3544023633003235, + "learning_rate": 0.0001408187134502924, + "loss": 1.1587, + "mean_token_accuracy": 0.7204631865024567, + "num_tokens": 7494352.0, + "step": 597 + }, + { + "entropy": 1.128667414188385, + "epoch": 0.9966666666666667, + "grad_norm": 0.3434421718120575, + "learning_rate": 0.0001407017543859649, + "loss": 1.1268, + "mean_token_accuracy": 0.7295839041471481, + "num_tokens": 7506807.0, + "step": 598 + }, + { + "entropy": 0.996250681579113, + "epoch": 0.9983333333333333, + "grad_norm": 0.29589951038360596, + "learning_rate": 0.00014058479532163745, + "loss": 0.9773, + "mean_token_accuracy": 0.7680518105626106, + "num_tokens": 7519235.0, + "step": 599 + }, + { + "entropy": 1.0180830582976341, + "epoch": 1.0, + "grad_norm": 0.32254987955093384, + "learning_rate": 0.00014046783625730995, + "loss": 0.9957, + "mean_token_accuracy": 0.7620798721909523, + "num_tokens": 7531661.0, + "step": 600 + }, + { + "entropy": 1.1241462379693985, + "epoch": 1.0016666666666667, + "grad_norm": 0.3471614718437195, + "learning_rate": 0.00014035087719298245, + "loss": 1.11, + "mean_token_accuracy": 0.736703634262085, + "num_tokens": 7544221.0, + "step": 601 + }, + { + "entropy": 1.0065209418535233, + "epoch": 1.0033333333333334, + "grad_norm": 0.3413226008415222, + "learning_rate": 0.000140233918128655, + "loss": 0.9849, + "mean_token_accuracy": 0.7642301768064499, + "num_tokens": 7556642.0, + "step": 602 + }, + { + "entropy": 1.2092658504843712, + "epoch": 1.005, + "grad_norm": 0.29548096656799316, + "learning_rate": 0.0001401169590643275, + "loss": 1.195, + "mean_token_accuracy": 0.719109907746315, + "num_tokens": 7569043.0, + "step": 603 + }, + { + "entropy": 1.1198002099990845, + "epoch": 1.0066666666666666, + "grad_norm": 0.45361337065696716, + "learning_rate": 0.00014, + "loss": 1.1547, + "mean_token_accuracy": 0.7289656400680542, + "num_tokens": 7581641.0, + "step": 604 + }, + { + "entropy": 1.211386151611805, + "epoch": 1.0083333333333333, + "grad_norm": 0.3899124264717102, + "learning_rate": 0.00013988304093567254, + "loss": 1.2301, + "mean_token_accuracy": 0.713324747979641, + "num_tokens": 7594098.0, + "step": 605 + }, + { + "entropy": 1.0142735317349434, + "epoch": 1.01, + "grad_norm": 0.2941846549510956, + "learning_rate": 0.00013976608187134503, + "loss": 1.0154, + "mean_token_accuracy": 0.7602197378873825, + "num_tokens": 7606658.0, + "step": 606 + }, + { + "entropy": 0.9515361040830612, + "epoch": 1.0116666666666667, + "grad_norm": 0.3320436477661133, + "learning_rate": 0.00013964912280701753, + "loss": 0.9443, + "mean_token_accuracy": 0.7731414288282394, + "num_tokens": 7619292.0, + "step": 607 + }, + { + "entropy": 0.9823862388730049, + "epoch": 1.0133333333333334, + "grad_norm": 0.45267120003700256, + "learning_rate": 0.00013953216374269008, + "loss": 0.984, + "mean_token_accuracy": 0.7694632411003113, + "num_tokens": 7631882.0, + "step": 608 + }, + { + "entropy": 1.0947947576642036, + "epoch": 1.015, + "grad_norm": 0.34952351450920105, + "learning_rate": 0.00013941520467836258, + "loss": 1.0837, + "mean_token_accuracy": 0.7458218857645988, + "num_tokens": 7644653.0, + "step": 609 + }, + { + "entropy": 1.0054941028356552, + "epoch": 1.0166666666666666, + "grad_norm": 0.4093509316444397, + "learning_rate": 0.00013929824561403507, + "loss": 0.9902, + "mean_token_accuracy": 0.7687611132860184, + "num_tokens": 7657159.0, + "step": 610 + }, + { + "entropy": 1.1774860545992851, + "epoch": 1.0183333333333333, + "grad_norm": 0.3505180776119232, + "learning_rate": 0.00013918128654970762, + "loss": 1.1882, + "mean_token_accuracy": 0.7232249453663826, + "num_tokens": 7669875.0, + "step": 611 + }, + { + "entropy": 1.0407119169831276, + "epoch": 1.02, + "grad_norm": 0.34889596700668335, + "learning_rate": 0.00013906432748538012, + "loss": 1.0021, + "mean_token_accuracy": 0.7623367980122566, + "num_tokens": 7682410.0, + "step": 612 + }, + { + "entropy": 0.9798188135027885, + "epoch": 1.0216666666666667, + "grad_norm": 0.34436383843421936, + "learning_rate": 0.00013894736842105264, + "loss": 0.9655, + "mean_token_accuracy": 0.7712270691990852, + "num_tokens": 7695026.0, + "step": 613 + }, + { + "entropy": 1.0230854898691177, + "epoch": 1.0233333333333334, + "grad_norm": 0.30474236607551575, + "learning_rate": 0.00013883040935672516, + "loss": 1.0168, + "mean_token_accuracy": 0.7577306106686592, + "num_tokens": 7707287.0, + "step": 614 + }, + { + "entropy": 1.0280113369226456, + "epoch": 1.025, + "grad_norm": 0.3403511345386505, + "learning_rate": 0.00013871345029239766, + "loss": 1.0075, + "mean_token_accuracy": 0.7558683082461357, + "num_tokens": 7720140.0, + "step": 615 + }, + { + "entropy": 1.0159951895475388, + "epoch": 1.0266666666666666, + "grad_norm": 0.4081074297428131, + "learning_rate": 0.00013859649122807018, + "loss": 1.0084, + "mean_token_accuracy": 0.7621957957744598, + "num_tokens": 7732366.0, + "step": 616 + }, + { + "entropy": 1.110385812819004, + "epoch": 1.0283333333333333, + "grad_norm": 0.35655292868614197, + "learning_rate": 0.0001384795321637427, + "loss": 1.1002, + "mean_token_accuracy": 0.7427221015095711, + "num_tokens": 7745123.0, + "step": 617 + }, + { + "entropy": 0.8933522030711174, + "epoch": 1.03, + "grad_norm": 0.44295790791511536, + "learning_rate": 0.0001383625730994152, + "loss": 0.8798, + "mean_token_accuracy": 0.7849943116307259, + "num_tokens": 7757701.0, + "step": 618 + }, + { + "entropy": 1.1147135198116302, + "epoch": 1.0316666666666667, + "grad_norm": 0.40337374806404114, + "learning_rate": 0.00013824561403508772, + "loss": 1.1254, + "mean_token_accuracy": 0.7315419912338257, + "num_tokens": 7770054.0, + "step": 619 + }, + { + "entropy": 0.8377392664551735, + "epoch": 1.0333333333333334, + "grad_norm": 0.47487035393714905, + "learning_rate": 0.00013812865497076025, + "loss": 0.8388, + "mean_token_accuracy": 0.7951571643352509, + "num_tokens": 7782805.0, + "step": 620 + }, + { + "entropy": 1.1692701056599617, + "epoch": 1.035, + "grad_norm": 0.31510019302368164, + "learning_rate": 0.00013801169590643274, + "loss": 1.1988, + "mean_token_accuracy": 0.7232685908675194, + "num_tokens": 7795507.0, + "step": 621 + }, + { + "entropy": 1.06218171864748, + "epoch": 1.0366666666666666, + "grad_norm": 0.5403528809547424, + "learning_rate": 0.00013789473684210527, + "loss": 1.0487, + "mean_token_accuracy": 0.7484084740281105, + "num_tokens": 7807693.0, + "step": 622 + }, + { + "entropy": 0.9076170176267624, + "epoch": 1.0383333333333333, + "grad_norm": 0.3648722171783447, + "learning_rate": 0.0001377777777777778, + "loss": 0.9021, + "mean_token_accuracy": 0.7875769883394241, + "num_tokens": 7820364.0, + "step": 623 + }, + { + "entropy": 1.0128463730216026, + "epoch": 1.04, + "grad_norm": 0.35606029629707336, + "learning_rate": 0.00013766081871345029, + "loss": 1.0369, + "mean_token_accuracy": 0.7596360221505165, + "num_tokens": 7833058.0, + "step": 624 + }, + { + "entropy": 1.127178505063057, + "epoch": 1.0416666666666667, + "grad_norm": 0.3587397038936615, + "learning_rate": 0.0001375438596491228, + "loss": 1.1201, + "mean_token_accuracy": 0.7313251048326492, + "num_tokens": 7845496.0, + "step": 625 + }, + { + "entropy": 0.9487481713294983, + "epoch": 1.0433333333333334, + "grad_norm": 0.4658021926879883, + "learning_rate": 0.00013742690058479533, + "loss": 0.94, + "mean_token_accuracy": 0.778320774435997, + "num_tokens": 7858159.0, + "step": 626 + }, + { + "entropy": 1.1421342343091965, + "epoch": 1.045, + "grad_norm": 0.361741840839386, + "learning_rate": 0.00013730994152046783, + "loss": 1.1211, + "mean_token_accuracy": 0.729691281914711, + "num_tokens": 7870529.0, + "step": 627 + }, + { + "entropy": 1.1835851818323135, + "epoch": 1.0466666666666666, + "grad_norm": 0.41982337832450867, + "learning_rate": 0.00013719298245614035, + "loss": 1.1648, + "mean_token_accuracy": 0.7253150641918182, + "num_tokens": 7882888.0, + "step": 628 + }, + { + "entropy": 1.0434592813253403, + "epoch": 1.0483333333333333, + "grad_norm": 0.3684554100036621, + "learning_rate": 0.00013707602339181287, + "loss": 1.0318, + "mean_token_accuracy": 0.7509391456842422, + "num_tokens": 7895483.0, + "step": 629 + }, + { + "entropy": 1.0862918049097061, + "epoch": 1.05, + "grad_norm": 0.3779102563858032, + "learning_rate": 0.00013695906432748537, + "loss": 1.0687, + "mean_token_accuracy": 0.7403496354818344, + "num_tokens": 7908116.0, + "step": 630 + }, + { + "entropy": 1.0131681859493256, + "epoch": 1.0516666666666667, + "grad_norm": 0.3559149503707886, + "learning_rate": 0.0001368421052631579, + "loss": 0.9913, + "mean_token_accuracy": 0.7679590433835983, + "num_tokens": 7920785.0, + "step": 631 + }, + { + "entropy": 0.9745831564068794, + "epoch": 1.0533333333333332, + "grad_norm": 0.344426691532135, + "learning_rate": 0.00013672514619883042, + "loss": 0.9656, + "mean_token_accuracy": 0.7636517956852913, + "num_tokens": 7933396.0, + "step": 632 + }, + { + "entropy": 0.8822155594825745, + "epoch": 1.055, + "grad_norm": 0.41743558645248413, + "learning_rate": 0.00013660818713450294, + "loss": 0.881, + "mean_token_accuracy": 0.7784114480018616, + "num_tokens": 7945821.0, + "step": 633 + }, + { + "entropy": 0.966235339641571, + "epoch": 1.0566666666666666, + "grad_norm": 0.4000849723815918, + "learning_rate": 0.00013649122807017543, + "loss": 0.9528, + "mean_token_accuracy": 0.7646506726741791, + "num_tokens": 7958541.0, + "step": 634 + }, + { + "entropy": 1.165940783917904, + "epoch": 1.0583333333333333, + "grad_norm": 0.527256965637207, + "learning_rate": 0.00013637426900584796, + "loss": 1.1738, + "mean_token_accuracy": 0.7271719202399254, + "num_tokens": 7970896.0, + "step": 635 + }, + { + "entropy": 1.0754114240407944, + "epoch": 1.06, + "grad_norm": 0.38329169154167175, + "learning_rate": 0.00013625730994152048, + "loss": 1.0728, + "mean_token_accuracy": 0.7432007640600204, + "num_tokens": 7983186.0, + "step": 636 + }, + { + "entropy": 0.948194220662117, + "epoch": 1.0616666666666668, + "grad_norm": 0.37484967708587646, + "learning_rate": 0.000136140350877193, + "loss": 0.9581, + "mean_token_accuracy": 0.7700267806649208, + "num_tokens": 7995749.0, + "step": 637 + }, + { + "entropy": 1.0313308089971542, + "epoch": 1.0633333333333332, + "grad_norm": 0.356343537569046, + "learning_rate": 0.0001360233918128655, + "loss": 1.056, + "mean_token_accuracy": 0.7483900561928749, + "num_tokens": 8008421.0, + "step": 638 + }, + { + "entropy": 0.9141811951994896, + "epoch": 1.065, + "grad_norm": 0.5060378313064575, + "learning_rate": 0.00013590643274853802, + "loss": 0.9068, + "mean_token_accuracy": 0.7790825441479683, + "num_tokens": 8021117.0, + "step": 639 + }, + { + "entropy": 0.9227776229381561, + "epoch": 1.0666666666666667, + "grad_norm": 0.3761998414993286, + "learning_rate": 0.00013578947368421055, + "loss": 0.9256, + "mean_token_accuracy": 0.7767295092344284, + "num_tokens": 8033634.0, + "step": 640 + }, + { + "entropy": 1.018035314977169, + "epoch": 1.0683333333333334, + "grad_norm": 0.33513742685317993, + "learning_rate": 0.00013567251461988304, + "loss": 1.0056, + "mean_token_accuracy": 0.7575684189796448, + "num_tokens": 8046127.0, + "step": 641 + }, + { + "entropy": 0.9292529672384262, + "epoch": 1.07, + "grad_norm": 1.8294329643249512, + "learning_rate": 0.00013555555555555556, + "loss": 0.9008, + "mean_token_accuracy": 0.7848961800336838, + "num_tokens": 8058773.0, + "step": 642 + }, + { + "entropy": 1.1138723865151405, + "epoch": 1.0716666666666668, + "grad_norm": 0.44567447900772095, + "learning_rate": 0.0001354385964912281, + "loss": 1.0769, + "mean_token_accuracy": 0.7429224848747253, + "num_tokens": 8071248.0, + "step": 643 + }, + { + "entropy": 1.099638320505619, + "epoch": 1.0733333333333333, + "grad_norm": 0.3087984621524811, + "learning_rate": 0.00013532163742690058, + "loss": 1.0883, + "mean_token_accuracy": 0.7450162544846535, + "num_tokens": 8083877.0, + "step": 644 + }, + { + "entropy": 1.0040937513113022, + "epoch": 1.075, + "grad_norm": 0.4541734457015991, + "learning_rate": 0.0001352046783625731, + "loss": 0.9684, + "mean_token_accuracy": 0.7722791135311127, + "num_tokens": 8096576.0, + "step": 645 + }, + { + "entropy": 0.9650404900312424, + "epoch": 1.0766666666666667, + "grad_norm": 0.48237717151641846, + "learning_rate": 0.00013508771929824563, + "loss": 0.9591, + "mean_token_accuracy": 0.7744063958525658, + "num_tokens": 8109075.0, + "step": 646 + }, + { + "entropy": 0.9337432831525803, + "epoch": 1.0783333333333334, + "grad_norm": 0.43152591586112976, + "learning_rate": 0.00013497076023391813, + "loss": 0.9395, + "mean_token_accuracy": 0.7721759006381035, + "num_tokens": 8121886.0, + "step": 647 + }, + { + "entropy": 1.1030597686767578, + "epoch": 1.08, + "grad_norm": 0.6193279027938843, + "learning_rate": 0.00013485380116959065, + "loss": 1.1343, + "mean_token_accuracy": 0.7327957898378372, + "num_tokens": 8134506.0, + "step": 648 + }, + { + "entropy": 1.0601794198155403, + "epoch": 1.0816666666666666, + "grad_norm": 0.32980358600616455, + "learning_rate": 0.00013473684210526317, + "loss": 1.0671, + "mean_token_accuracy": 0.7458383813500404, + "num_tokens": 8146814.0, + "step": 649 + }, + { + "entropy": 1.0408655479550362, + "epoch": 1.0833333333333333, + "grad_norm": 0.5926406383514404, + "learning_rate": 0.00013461988304093567, + "loss": 1.0491, + "mean_token_accuracy": 0.7476745769381523, + "num_tokens": 8159248.0, + "step": 650 + }, + { + "entropy": 1.0431873723864555, + "epoch": 1.085, + "grad_norm": 0.4574386775493622, + "learning_rate": 0.0001345029239766082, + "loss": 1.0554, + "mean_token_accuracy": 0.7551540955901146, + "num_tokens": 8171730.0, + "step": 651 + }, + { + "entropy": 1.086970031261444, + "epoch": 1.0866666666666667, + "grad_norm": 0.36465033888816833, + "learning_rate": 0.00013438596491228071, + "loss": 1.0768, + "mean_token_accuracy": 0.7430087327957153, + "num_tokens": 8184030.0, + "step": 652 + }, + { + "entropy": 0.9763112142682076, + "epoch": 1.0883333333333334, + "grad_norm": 0.4279404878616333, + "learning_rate": 0.00013426900584795324, + "loss": 0.9455, + "mean_token_accuracy": 0.7743237987160683, + "num_tokens": 8196477.0, + "step": 653 + }, + { + "entropy": 0.9648814126849174, + "epoch": 1.09, + "grad_norm": 0.38802456855773926, + "learning_rate": 0.00013415204678362573, + "loss": 0.9467, + "mean_token_accuracy": 0.7732060924172401, + "num_tokens": 8209184.0, + "step": 654 + }, + { + "entropy": 0.9697625562548637, + "epoch": 1.0916666666666666, + "grad_norm": 0.5765432119369507, + "learning_rate": 0.00013403508771929826, + "loss": 0.9383, + "mean_token_accuracy": 0.7735694646835327, + "num_tokens": 8221636.0, + "step": 655 + }, + { + "entropy": 1.032564863562584, + "epoch": 1.0933333333333333, + "grad_norm": 0.32644009590148926, + "learning_rate": 0.00013391812865497078, + "loss": 0.998, + "mean_token_accuracy": 0.7601174414157867, + "num_tokens": 8234233.0, + "step": 656 + }, + { + "entropy": 0.9871031567454338, + "epoch": 1.095, + "grad_norm": 0.37354862689971924, + "learning_rate": 0.00013380116959064327, + "loss": 0.9648, + "mean_token_accuracy": 0.7655048966407776, + "num_tokens": 8246750.0, + "step": 657 + }, + { + "entropy": 0.987128734588623, + "epoch": 1.0966666666666667, + "grad_norm": 0.35152730345726013, + "learning_rate": 0.0001336842105263158, + "loss": 0.9735, + "mean_token_accuracy": 0.7622586041688919, + "num_tokens": 8259211.0, + "step": 658 + }, + { + "entropy": 1.0830785781145096, + "epoch": 1.0983333333333334, + "grad_norm": 0.36088189482688904, + "learning_rate": 0.00013356725146198832, + "loss": 1.1024, + "mean_token_accuracy": 0.7382792606949806, + "num_tokens": 8271621.0, + "step": 659 + }, + { + "entropy": 1.0922202467918396, + "epoch": 1.1, + "grad_norm": 0.43077078461647034, + "learning_rate": 0.00013345029239766082, + "loss": 1.0774, + "mean_token_accuracy": 0.7471672371029854, + "num_tokens": 8284002.0, + "step": 660 + }, + { + "entropy": 1.1043326631188393, + "epoch": 1.1016666666666666, + "grad_norm": 0.35150665044784546, + "learning_rate": 0.00013333333333333334, + "loss": 1.0946, + "mean_token_accuracy": 0.7388827204704285, + "num_tokens": 8296436.0, + "step": 661 + }, + { + "entropy": 0.9849683940410614, + "epoch": 1.1033333333333333, + "grad_norm": 0.3895672857761383, + "learning_rate": 0.00013321637426900586, + "loss": 0.9802, + "mean_token_accuracy": 0.7634393870830536, + "num_tokens": 8309100.0, + "step": 662 + }, + { + "entropy": 1.1219883561134338, + "epoch": 1.105, + "grad_norm": 0.3513847887516022, + "learning_rate": 0.00013309941520467836, + "loss": 1.1286, + "mean_token_accuracy": 0.7343808338046074, + "num_tokens": 8321419.0, + "step": 663 + }, + { + "entropy": 1.055749535560608, + "epoch": 1.1066666666666667, + "grad_norm": 0.3936227858066559, + "learning_rate": 0.00013298245614035088, + "loss": 1.0459, + "mean_token_accuracy": 0.7490439489483833, + "num_tokens": 8333640.0, + "step": 664 + }, + { + "entropy": 0.9720334634184837, + "epoch": 1.1083333333333334, + "grad_norm": 0.3029753267765045, + "learning_rate": 0.0001328654970760234, + "loss": 0.9635, + "mean_token_accuracy": 0.7607943564653397, + "num_tokens": 8346373.0, + "step": 665 + }, + { + "entropy": 1.1321242824196815, + "epoch": 1.11, + "grad_norm": 0.47852373123168945, + "learning_rate": 0.0001327485380116959, + "loss": 1.1348, + "mean_token_accuracy": 0.7242363542318344, + "num_tokens": 8358810.0, + "step": 666 + }, + { + "entropy": 0.9721924886107445, + "epoch": 1.1116666666666666, + "grad_norm": 0.9138944149017334, + "learning_rate": 0.00013263157894736842, + "loss": 0.9505, + "mean_token_accuracy": 0.7762376815080643, + "num_tokens": 8371471.0, + "step": 667 + }, + { + "entropy": 0.9570804685354233, + "epoch": 1.1133333333333333, + "grad_norm": 0.42705467343330383, + "learning_rate": 0.00013251461988304095, + "loss": 0.9619, + "mean_token_accuracy": 0.7696249037981033, + "num_tokens": 8384041.0, + "step": 668 + }, + { + "entropy": 1.0321296378970146, + "epoch": 1.115, + "grad_norm": 0.382088303565979, + "learning_rate": 0.00013239766081871344, + "loss": 1.0341, + "mean_token_accuracy": 0.7548602446913719, + "num_tokens": 8396378.0, + "step": 669 + }, + { + "entropy": 1.1567051485180855, + "epoch": 1.1166666666666667, + "grad_norm": 0.3601890206336975, + "learning_rate": 0.00013228070175438597, + "loss": 1.1401, + "mean_token_accuracy": 0.7298571541905403, + "num_tokens": 8408835.0, + "step": 670 + }, + { + "entropy": 1.1171844527125359, + "epoch": 1.1183333333333334, + "grad_norm": 0.35644233226776123, + "learning_rate": 0.0001321637426900585, + "loss": 1.1135, + "mean_token_accuracy": 0.7378202676773071, + "num_tokens": 8421299.0, + "step": 671 + }, + { + "entropy": 1.0986279770731926, + "epoch": 1.12, + "grad_norm": 0.36225011944770813, + "learning_rate": 0.00013204678362573098, + "loss": 1.0893, + "mean_token_accuracy": 0.7424418106675148, + "num_tokens": 8433566.0, + "step": 672 + }, + { + "entropy": 0.9406923651695251, + "epoch": 1.1216666666666666, + "grad_norm": 0.36922165751457214, + "learning_rate": 0.00013192982456140353, + "loss": 0.92, + "mean_token_accuracy": 0.7776926532387733, + "num_tokens": 8445982.0, + "step": 673 + }, + { + "entropy": 0.982761062681675, + "epoch": 1.1233333333333333, + "grad_norm": 0.4194695055484772, + "learning_rate": 0.00013181286549707603, + "loss": 0.9678, + "mean_token_accuracy": 0.7725221887230873, + "num_tokens": 8458397.0, + "step": 674 + }, + { + "entropy": 1.0702436491847038, + "epoch": 1.125, + "grad_norm": 0.3364623785018921, + "learning_rate": 0.00013169590643274853, + "loss": 1.0965, + "mean_token_accuracy": 0.7367196604609489, + "num_tokens": 8470996.0, + "step": 675 + }, + { + "entropy": 1.117453172802925, + "epoch": 1.1266666666666667, + "grad_norm": 0.35230013728141785, + "learning_rate": 0.00013157894736842108, + "loss": 1.1182, + "mean_token_accuracy": 0.736903615295887, + "num_tokens": 8483765.0, + "step": 676 + }, + { + "entropy": 1.088931068778038, + "epoch": 1.1283333333333334, + "grad_norm": 0.37009745836257935, + "learning_rate": 0.00013146198830409357, + "loss": 1.1245, + "mean_token_accuracy": 0.7352629750967026, + "num_tokens": 8496465.0, + "step": 677 + }, + { + "entropy": 1.0094858780503273, + "epoch": 1.13, + "grad_norm": 0.3854370713233948, + "learning_rate": 0.00013134502923976607, + "loss": 1.0274, + "mean_token_accuracy": 0.7591015845537186, + "num_tokens": 8509037.0, + "step": 678 + }, + { + "entropy": 1.1349955797195435, + "epoch": 1.1316666666666666, + "grad_norm": 0.38764774799346924, + "learning_rate": 0.00013122807017543862, + "loss": 1.1466, + "mean_token_accuracy": 0.7316305935382843, + "num_tokens": 8521698.0, + "step": 679 + }, + { + "entropy": 0.9581394642591476, + "epoch": 1.1333333333333333, + "grad_norm": 0.3510202169418335, + "learning_rate": 0.00013111111111111111, + "loss": 0.9305, + "mean_token_accuracy": 0.7765649557113647, + "num_tokens": 8534228.0, + "step": 680 + }, + { + "entropy": 1.0585462525486946, + "epoch": 1.135, + "grad_norm": 0.6673049330711365, + "learning_rate": 0.00013099415204678364, + "loss": 1.0507, + "mean_token_accuracy": 0.7504134178161621, + "num_tokens": 8546770.0, + "step": 681 + }, + { + "entropy": 1.122896485030651, + "epoch": 1.1366666666666667, + "grad_norm": 0.34946927428245544, + "learning_rate": 0.00013087719298245616, + "loss": 1.1012, + "mean_token_accuracy": 0.7360977083444595, + "num_tokens": 8559354.0, + "step": 682 + }, + { + "entropy": 1.0782023295760155, + "epoch": 1.1383333333333334, + "grad_norm": 0.34809041023254395, + "learning_rate": 0.00013076023391812866, + "loss": 1.0375, + "mean_token_accuracy": 0.7485805526375771, + "num_tokens": 8571826.0, + "step": 683 + }, + { + "entropy": 0.932886391878128, + "epoch": 1.1400000000000001, + "grad_norm": 0.520618736743927, + "learning_rate": 0.00013064327485380118, + "loss": 0.9169, + "mean_token_accuracy": 0.7839078307151794, + "num_tokens": 8584449.0, + "step": 684 + }, + { + "entropy": 0.9941971525549889, + "epoch": 1.1416666666666666, + "grad_norm": 0.3270869851112366, + "learning_rate": 0.0001305263157894737, + "loss": 0.999, + "mean_token_accuracy": 0.769331268966198, + "num_tokens": 8596790.0, + "step": 685 + }, + { + "entropy": 1.0389942303299904, + "epoch": 1.1433333333333333, + "grad_norm": 0.37851324677467346, + "learning_rate": 0.0001304093567251462, + "loss": 1.0379, + "mean_token_accuracy": 0.7544435933232307, + "num_tokens": 8609395.0, + "step": 686 + }, + { + "entropy": 0.9451502487063408, + "epoch": 1.145, + "grad_norm": 0.5139569640159607, + "learning_rate": 0.00013029239766081872, + "loss": 0.9436, + "mean_token_accuracy": 0.7673129811882973, + "num_tokens": 8622265.0, + "step": 687 + }, + { + "entropy": 1.1505193412303925, + "epoch": 1.1466666666666667, + "grad_norm": 0.6956432461738586, + "learning_rate": 0.00013017543859649124, + "loss": 1.1384, + "mean_token_accuracy": 0.7312004566192627, + "num_tokens": 8634777.0, + "step": 688 + }, + { + "entropy": 1.1453742682933807, + "epoch": 1.1483333333333334, + "grad_norm": 0.8315491676330566, + "learning_rate": 0.00013005847953216374, + "loss": 1.1393, + "mean_token_accuracy": 0.7327683791518211, + "num_tokens": 8647293.0, + "step": 689 + }, + { + "entropy": 0.924394853413105, + "epoch": 1.15, + "grad_norm": 0.43016624450683594, + "learning_rate": 0.00012994152046783626, + "loss": 0.9055, + "mean_token_accuracy": 0.7811589166522026, + "num_tokens": 8659936.0, + "step": 690 + }, + { + "entropy": 1.1657907515764236, + "epoch": 1.1516666666666666, + "grad_norm": 0.3746880292892456, + "learning_rate": 0.0001298245614035088, + "loss": 1.1476, + "mean_token_accuracy": 0.7288665175437927, + "num_tokens": 8672578.0, + "step": 691 + }, + { + "entropy": 1.1058854684233665, + "epoch": 1.1533333333333333, + "grad_norm": 0.5405567288398743, + "learning_rate": 0.00012970760233918128, + "loss": 1.1241, + "mean_token_accuracy": 0.7345702573657036, + "num_tokens": 8684983.0, + "step": 692 + }, + { + "entropy": 0.9981840997934341, + "epoch": 1.155, + "grad_norm": 0.7133622765541077, + "learning_rate": 0.0001295906432748538, + "loss": 0.99, + "mean_token_accuracy": 0.7627017050981522, + "num_tokens": 8697774.0, + "step": 693 + }, + { + "entropy": 1.0237222835421562, + "epoch": 1.1566666666666667, + "grad_norm": 0.4631465673446655, + "learning_rate": 0.00012947368421052633, + "loss": 0.9932, + "mean_token_accuracy": 0.7638612240552902, + "num_tokens": 8709955.0, + "step": 694 + }, + { + "entropy": 1.0166835561394691, + "epoch": 1.1583333333333332, + "grad_norm": 0.4396076798439026, + "learning_rate": 0.00012935672514619882, + "loss": 1.0102, + "mean_token_accuracy": 0.759076252579689, + "num_tokens": 8722501.0, + "step": 695 + }, + { + "entropy": 1.139280654489994, + "epoch": 1.16, + "grad_norm": 0.5104596614837646, + "learning_rate": 0.00012923976608187135, + "loss": 1.1324, + "mean_token_accuracy": 0.7318560257554054, + "num_tokens": 8735256.0, + "step": 696 + }, + { + "entropy": 0.996860571205616, + "epoch": 1.1616666666666666, + "grad_norm": 0.41430947184562683, + "learning_rate": 0.00012912280701754387, + "loss": 0.9843, + "mean_token_accuracy": 0.7681520283222198, + "num_tokens": 8748117.0, + "step": 697 + }, + { + "entropy": 1.088785506784916, + "epoch": 1.1633333333333333, + "grad_norm": 0.4602546989917755, + "learning_rate": 0.00012900584795321637, + "loss": 1.0874, + "mean_token_accuracy": 0.7467052713036537, + "num_tokens": 8760752.0, + "step": 698 + }, + { + "entropy": 1.0092452727258205, + "epoch": 1.165, + "grad_norm": 0.43864133954048157, + "learning_rate": 0.00012888888888888892, + "loss": 0.9948, + "mean_token_accuracy": 0.7639948949217796, + "num_tokens": 8773427.0, + "step": 699 + }, + { + "entropy": 1.0612296387553215, + "epoch": 1.1666666666666667, + "grad_norm": 0.424075722694397, + "learning_rate": 0.0001287719298245614, + "loss": 1.0701, + "mean_token_accuracy": 0.7479716464877129, + "num_tokens": 8786118.0, + "step": 700 + }, + { + "entropy": 1.0840974599123, + "epoch": 1.1683333333333334, + "grad_norm": 0.561173677444458, + "learning_rate": 0.0001286549707602339, + "loss": 1.07, + "mean_token_accuracy": 0.7422567680478096, + "num_tokens": 8798822.0, + "step": 701 + }, + { + "entropy": 1.059173308312893, + "epoch": 1.17, + "grad_norm": 0.33119791746139526, + "learning_rate": 0.00012853801169590646, + "loss": 1.0362, + "mean_token_accuracy": 0.7531376257538795, + "num_tokens": 8811334.0, + "step": 702 + }, + { + "entropy": 0.9581945165991783, + "epoch": 1.1716666666666666, + "grad_norm": 2.1980700492858887, + "learning_rate": 0.00012842105263157895, + "loss": 0.9294, + "mean_token_accuracy": 0.7717323303222656, + "num_tokens": 8824060.0, + "step": 703 + }, + { + "entropy": 1.000770427286625, + "epoch": 1.1733333333333333, + "grad_norm": 0.344192773103714, + "learning_rate": 0.00012830409356725145, + "loss": 1.0056, + "mean_token_accuracy": 0.7614035829901695, + "num_tokens": 8836623.0, + "step": 704 + }, + { + "entropy": 1.109620176255703, + "epoch": 1.175, + "grad_norm": 0.5027482509613037, + "learning_rate": 0.000128187134502924, + "loss": 1.1039, + "mean_token_accuracy": 0.7359057664871216, + "num_tokens": 8848962.0, + "step": 705 + }, + { + "entropy": 1.1144047752022743, + "epoch": 1.1766666666666667, + "grad_norm": 0.41070157289505005, + "learning_rate": 0.0001280701754385965, + "loss": 1.132, + "mean_token_accuracy": 0.73667923361063, + "num_tokens": 8861555.0, + "step": 706 + }, + { + "entropy": 0.9361367151141167, + "epoch": 1.1783333333333332, + "grad_norm": 0.3477269113063812, + "learning_rate": 0.000127953216374269, + "loss": 0.9413, + "mean_token_accuracy": 0.7774360477924347, + "num_tokens": 8874006.0, + "step": 707 + }, + { + "entropy": 1.1030670925974846, + "epoch": 1.18, + "grad_norm": 1.3029204607009888, + "learning_rate": 0.00012783625730994154, + "loss": 1.0991, + "mean_token_accuracy": 0.7363706007599831, + "num_tokens": 8886475.0, + "step": 708 + }, + { + "entropy": 0.9754893407225609, + "epoch": 1.1816666666666666, + "grad_norm": 0.42802709341049194, + "learning_rate": 0.00012771929824561404, + "loss": 0.9424, + "mean_token_accuracy": 0.7736692875623703, + "num_tokens": 8898956.0, + "step": 709 + }, + { + "entropy": 0.9986881986260414, + "epoch": 1.1833333333333333, + "grad_norm": 0.4046630561351776, + "learning_rate": 0.00012760233918128653, + "loss": 1.0008, + "mean_token_accuracy": 0.7546000257134438, + "num_tokens": 8911710.0, + "step": 710 + }, + { + "entropy": 1.1963095217943192, + "epoch": 1.185, + "grad_norm": 0.36986932158470154, + "learning_rate": 0.00012748538011695908, + "loss": 1.1593, + "mean_token_accuracy": 0.721351720392704, + "num_tokens": 8924129.0, + "step": 711 + }, + { + "entropy": 0.9531427696347237, + "epoch": 1.1866666666666668, + "grad_norm": 0.9468092322349548, + "learning_rate": 0.00012736842105263158, + "loss": 0.9382, + "mean_token_accuracy": 0.774338111281395, + "num_tokens": 8936909.0, + "step": 712 + }, + { + "entropy": 0.9403403848409653, + "epoch": 1.1883333333333332, + "grad_norm": 0.35958191752433777, + "learning_rate": 0.0001272514619883041, + "loss": 0.9279, + "mean_token_accuracy": 0.7800295427441597, + "num_tokens": 8949251.0, + "step": 713 + }, + { + "entropy": 1.0490553975105286, + "epoch": 1.19, + "grad_norm": 0.5040456652641296, + "learning_rate": 0.00012713450292397663, + "loss": 1.0677, + "mean_token_accuracy": 0.7471382990479469, + "num_tokens": 8961862.0, + "step": 714 + }, + { + "entropy": 0.9729638993740082, + "epoch": 1.1916666666666667, + "grad_norm": 0.37353086471557617, + "learning_rate": 0.00012701754385964912, + "loss": 0.9867, + "mean_token_accuracy": 0.7674241736531258, + "num_tokens": 8974543.0, + "step": 715 + }, + { + "entropy": 1.1073362156748772, + "epoch": 1.1933333333333334, + "grad_norm": 0.37765026092529297, + "learning_rate": 0.00012690058479532165, + "loss": 1.1119, + "mean_token_accuracy": 0.7373607456684113, + "num_tokens": 8987091.0, + "step": 716 + }, + { + "entropy": 1.0764212757349014, + "epoch": 1.195, + "grad_norm": 0.4003337323665619, + "learning_rate": 0.00012678362573099417, + "loss": 1.0417, + "mean_token_accuracy": 0.7387293726205826, + "num_tokens": 8999414.0, + "step": 717 + }, + { + "entropy": 0.9788024201989174, + "epoch": 1.1966666666666668, + "grad_norm": 0.33461993932724, + "learning_rate": 0.00012666666666666666, + "loss": 0.9539, + "mean_token_accuracy": 0.7684841901063919, + "num_tokens": 9011861.0, + "step": 718 + }, + { + "entropy": 1.1892686560750008, + "epoch": 1.1983333333333333, + "grad_norm": 0.3961053192615509, + "learning_rate": 0.0001265497076023392, + "loss": 1.2005, + "mean_token_accuracy": 0.7188596576452255, + "num_tokens": 9024474.0, + "step": 719 + }, + { + "entropy": 1.0204068645834923, + "epoch": 1.2, + "grad_norm": 0.3742503821849823, + "learning_rate": 0.0001264327485380117, + "loss": 1.0065, + "mean_token_accuracy": 0.7588150277733803, + "num_tokens": 9037064.0, + "step": 720 + }, + { + "epoch": 1.2, + "eval_entropy": 1.1187498391699382, + "eval_loss": 1.1341508626937866, + "eval_mean_token_accuracy": 0.7310460867499504, + "eval_num_tokens": 9037064.0, + "eval_runtime": 2668.7172, + "eval_samples_per_second": 1.874, + "eval_steps_per_second": 0.937, + "step": 720 + }, + { + "entropy": 1.1895209550857544, + "epoch": 1.2016666666666667, + "grad_norm": 0.3838255703449249, + "learning_rate": 0.0001263157894736842, + "loss": 1.2079, + "mean_token_accuracy": 0.7193019464612007, + "num_tokens": 9049604.0, + "step": 721 + }, + { + "entropy": 1.0349683538079262, + "epoch": 1.2033333333333334, + "grad_norm": 0.4433048665523529, + "learning_rate": 0.00012619883040935673, + "loss": 1.0267, + "mean_token_accuracy": 0.7509428858757019, + "num_tokens": 9062116.0, + "step": 722 + }, + { + "entropy": 1.030206061899662, + "epoch": 1.205, + "grad_norm": 0.34358423948287964, + "learning_rate": 0.00012608187134502925, + "loss": 1.0245, + "mean_token_accuracy": 0.7634887248277664, + "num_tokens": 9074529.0, + "step": 723 + }, + { + "entropy": 1.141561210155487, + "epoch": 1.2066666666666666, + "grad_norm": 0.3249812424182892, + "learning_rate": 0.00012596491228070175, + "loss": 1.1205, + "mean_token_accuracy": 0.7343723922967911, + "num_tokens": 9087182.0, + "step": 724 + }, + { + "entropy": 1.0616471990942955, + "epoch": 1.2083333333333333, + "grad_norm": 0.4340597093105316, + "learning_rate": 0.00012584795321637427, + "loss": 1.0581, + "mean_token_accuracy": 0.7507943511009216, + "num_tokens": 9099877.0, + "step": 725 + }, + { + "entropy": 0.9706320464611053, + "epoch": 1.21, + "grad_norm": 0.3568764328956604, + "learning_rate": 0.0001257309941520468, + "loss": 0.9761, + "mean_token_accuracy": 0.7712465897202492, + "num_tokens": 9112601.0, + "step": 726 + }, + { + "entropy": 0.91465774923563, + "epoch": 1.2116666666666667, + "grad_norm": 0.3260438144207001, + "learning_rate": 0.0001256140350877193, + "loss": 0.9254, + "mean_token_accuracy": 0.7811517640948296, + "num_tokens": 9125033.0, + "step": 727 + }, + { + "entropy": 0.9401550590991974, + "epoch": 1.2133333333333334, + "grad_norm": 0.3260049819946289, + "learning_rate": 0.00012549707602339181, + "loss": 0.9633, + "mean_token_accuracy": 0.7732428461313248, + "num_tokens": 9137669.0, + "step": 728 + }, + { + "entropy": 0.9846240356564522, + "epoch": 1.215, + "grad_norm": 0.5016320943832397, + "learning_rate": 0.00012538011695906434, + "loss": 1.0008, + "mean_token_accuracy": 0.7595839649438858, + "num_tokens": 9150317.0, + "step": 729 + }, + { + "entropy": 0.9101727679371834, + "epoch": 1.2166666666666668, + "grad_norm": 0.3477831184864044, + "learning_rate": 0.00012526315789473683, + "loss": 0.9248, + "mean_token_accuracy": 0.7770381346344948, + "num_tokens": 9162864.0, + "step": 730 + }, + { + "entropy": 1.055095262825489, + "epoch": 1.2183333333333333, + "grad_norm": 0.3225356936454773, + "learning_rate": 0.00012514619883040936, + "loss": 1.0594, + "mean_token_accuracy": 0.7478219419717789, + "num_tokens": 9175636.0, + "step": 731 + }, + { + "entropy": 0.9906914457678795, + "epoch": 1.22, + "grad_norm": 0.36654067039489746, + "learning_rate": 0.00012502923976608188, + "loss": 0.9822, + "mean_token_accuracy": 0.7651053443551064, + "num_tokens": 9188056.0, + "step": 732 + }, + { + "entropy": 1.0227198526263237, + "epoch": 1.2216666666666667, + "grad_norm": 0.42129281163215637, + "learning_rate": 0.0001249122807017544, + "loss": 1.0036, + "mean_token_accuracy": 0.7546610608696938, + "num_tokens": 9200761.0, + "step": 733 + }, + { + "entropy": 1.051247701048851, + "epoch": 1.2233333333333334, + "grad_norm": 0.39238080382347107, + "learning_rate": 0.0001247953216374269, + "loss": 1.0324, + "mean_token_accuracy": 0.7527084350585938, + "num_tokens": 9213241.0, + "step": 734 + }, + { + "entropy": 1.1549249365925789, + "epoch": 1.225, + "grad_norm": 0.3630923926830292, + "learning_rate": 0.00012467836257309942, + "loss": 1.1407, + "mean_token_accuracy": 0.7309977412223816, + "num_tokens": 9225826.0, + "step": 735 + }, + { + "entropy": 0.9571069180965424, + "epoch": 1.2266666666666666, + "grad_norm": 0.4624479115009308, + "learning_rate": 0.00012456140350877194, + "loss": 0.9133, + "mean_token_accuracy": 0.7807331830263138, + "num_tokens": 9238128.0, + "step": 736 + }, + { + "entropy": 1.1259456798434258, + "epoch": 1.2283333333333333, + "grad_norm": 0.41041669249534607, + "learning_rate": 0.00012444444444444444, + "loss": 1.1123, + "mean_token_accuracy": 0.7379928156733513, + "num_tokens": 9250736.0, + "step": 737 + }, + { + "entropy": 1.0745483115315437, + "epoch": 1.23, + "grad_norm": 0.3722630739212036, + "learning_rate": 0.00012432748538011696, + "loss": 1.0743, + "mean_token_accuracy": 0.7459522411227226, + "num_tokens": 9263669.0, + "step": 738 + }, + { + "entropy": 1.017277792096138, + "epoch": 1.2316666666666667, + "grad_norm": 0.5359635949134827, + "learning_rate": 0.00012421052631578949, + "loss": 0.9979, + "mean_token_accuracy": 0.756466805934906, + "num_tokens": 9276278.0, + "step": 739 + }, + { + "entropy": 1.0748402923345566, + "epoch": 1.2333333333333334, + "grad_norm": 0.35720473527908325, + "learning_rate": 0.000124093567251462, + "loss": 1.1062, + "mean_token_accuracy": 0.7410350739955902, + "num_tokens": 9288698.0, + "step": 740 + }, + { + "entropy": 1.130936212837696, + "epoch": 1.2349999999999999, + "grad_norm": 0.37070757150650024, + "learning_rate": 0.0001239766081871345, + "loss": 1.1396, + "mean_token_accuracy": 0.7324651181697845, + "num_tokens": 9301584.0, + "step": 741 + }, + { + "entropy": 1.0489218086004257, + "epoch": 1.2366666666666666, + "grad_norm": 0.3306441605091095, + "learning_rate": 0.00012385964912280703, + "loss": 1.0664, + "mean_token_accuracy": 0.7477298304438591, + "num_tokens": 9314084.0, + "step": 742 + }, + { + "entropy": 0.9938762001693249, + "epoch": 1.2383333333333333, + "grad_norm": 0.31619134545326233, + "learning_rate": 0.00012374269005847955, + "loss": 0.9852, + "mean_token_accuracy": 0.7605732753872871, + "num_tokens": 9326397.0, + "step": 743 + }, + { + "entropy": 1.0824184268712997, + "epoch": 1.24, + "grad_norm": 0.3798287510871887, + "learning_rate": 0.00012362573099415205, + "loss": 1.0686, + "mean_token_accuracy": 0.7485281676054001, + "num_tokens": 9339046.0, + "step": 744 + }, + { + "entropy": 1.069664090871811, + "epoch": 1.2416666666666667, + "grad_norm": 0.306922972202301, + "learning_rate": 0.00012350877192982457, + "loss": 1.0744, + "mean_token_accuracy": 0.7507193833589554, + "num_tokens": 9351671.0, + "step": 745 + }, + { + "entropy": 1.0598416477441788, + "epoch": 1.2433333333333334, + "grad_norm": 0.33934125304222107, + "learning_rate": 0.0001233918128654971, + "loss": 1.0267, + "mean_token_accuracy": 0.7564558759331703, + "num_tokens": 9364530.0, + "step": 746 + }, + { + "entropy": 0.9628029838204384, + "epoch": 1.245, + "grad_norm": 0.41921910643577576, + "learning_rate": 0.0001232748538011696, + "loss": 0.9517, + "mean_token_accuracy": 0.7717362120747566, + "num_tokens": 9376918.0, + "step": 747 + }, + { + "entropy": 1.0656858682632446, + "epoch": 1.2466666666666666, + "grad_norm": 0.38583341240882874, + "learning_rate": 0.0001231578947368421, + "loss": 1.0376, + "mean_token_accuracy": 0.7540598586201668, + "num_tokens": 9389510.0, + "step": 748 + }, + { + "entropy": 1.0888371095061302, + "epoch": 1.2483333333333333, + "grad_norm": 0.34380537271499634, + "learning_rate": 0.00012304093567251463, + "loss": 1.0715, + "mean_token_accuracy": 0.7473909482359886, + "num_tokens": 9402055.0, + "step": 749 + }, + { + "entropy": 1.0373852625489235, + "epoch": 1.25, + "grad_norm": 0.3231388330459595, + "learning_rate": 0.00012292397660818713, + "loss": 1.0305, + "mean_token_accuracy": 0.7523825243115425, + "num_tokens": 9414512.0, + "step": 750 + }, + { + "entropy": 1.106944017112255, + "epoch": 1.2516666666666667, + "grad_norm": 0.36373844742774963, + "learning_rate": 0.00012280701754385965, + "loss": 1.0971, + "mean_token_accuracy": 0.7417397871613503, + "num_tokens": 9426843.0, + "step": 751 + }, + { + "entropy": 0.9788102731108665, + "epoch": 1.2533333333333334, + "grad_norm": 0.31296437978744507, + "learning_rate": 0.00012269005847953218, + "loss": 0.9652, + "mean_token_accuracy": 0.7717032134532928, + "num_tokens": 9439540.0, + "step": 752 + }, + { + "entropy": 0.9851587638258934, + "epoch": 1.255, + "grad_norm": 0.39397764205932617, + "learning_rate": 0.0001225730994152047, + "loss": 0.9935, + "mean_token_accuracy": 0.7582743614912033, + "num_tokens": 9451949.0, + "step": 753 + }, + { + "entropy": 1.1130240112543106, + "epoch": 1.2566666666666666, + "grad_norm": 1.2226158380508423, + "learning_rate": 0.0001224561403508772, + "loss": 1.1278, + "mean_token_accuracy": 0.7374890893697739, + "num_tokens": 9464393.0, + "step": 754 + }, + { + "entropy": 0.9969649165868759, + "epoch": 1.2583333333333333, + "grad_norm": 0.5840109586715698, + "learning_rate": 0.00012233918128654972, + "loss": 1.0028, + "mean_token_accuracy": 0.7622894421219826, + "num_tokens": 9477027.0, + "step": 755 + }, + { + "entropy": 1.0583245903253555, + "epoch": 1.26, + "grad_norm": 0.49765530228614807, + "learning_rate": 0.00012222222222222224, + "loss": 1.056, + "mean_token_accuracy": 0.7438866198062897, + "num_tokens": 9489474.0, + "step": 756 + }, + { + "entropy": 0.9789082854986191, + "epoch": 1.2616666666666667, + "grad_norm": 0.394092321395874, + "learning_rate": 0.00012210526315789474, + "loss": 0.9702, + "mean_token_accuracy": 0.7751469835639, + "num_tokens": 9502055.0, + "step": 757 + }, + { + "entropy": 0.9848510399460793, + "epoch": 1.2633333333333332, + "grad_norm": 0.34688401222229004, + "learning_rate": 0.00012198830409356725, + "loss": 0.9855, + "mean_token_accuracy": 0.7628518790006638, + "num_tokens": 9514580.0, + "step": 758 + }, + { + "entropy": 1.0338635221123695, + "epoch": 1.2650000000000001, + "grad_norm": 0.33692267537117004, + "learning_rate": 0.00012187134502923978, + "loss": 1.0188, + "mean_token_accuracy": 0.7581898495554924, + "num_tokens": 9527194.0, + "step": 759 + }, + { + "entropy": 0.9540547728538513, + "epoch": 1.2666666666666666, + "grad_norm": 0.38330939412117004, + "learning_rate": 0.00012175438596491229, + "loss": 0.9213, + "mean_token_accuracy": 0.7747813165187836, + "num_tokens": 9539878.0, + "step": 760 + }, + { + "entropy": 0.9945897087454796, + "epoch": 1.2683333333333333, + "grad_norm": 0.4628215730190277, + "learning_rate": 0.00012163742690058479, + "loss": 0.9471, + "mean_token_accuracy": 0.7692919299006462, + "num_tokens": 9552618.0, + "step": 761 + }, + { + "entropy": 0.8448145538568497, + "epoch": 1.27, + "grad_norm": 0.545650064945221, + "learning_rate": 0.00012152046783625733, + "loss": 0.8138, + "mean_token_accuracy": 0.8002117648720741, + "num_tokens": 9565245.0, + "step": 762 + }, + { + "entropy": 1.096286728978157, + "epoch": 1.2716666666666667, + "grad_norm": 0.3716736137866974, + "learning_rate": 0.00012140350877192984, + "loss": 1.104, + "mean_token_accuracy": 0.7303317859768867, + "num_tokens": 9577885.0, + "step": 763 + }, + { + "entropy": 1.1562151238322258, + "epoch": 1.2733333333333334, + "grad_norm": 0.5344386100769043, + "learning_rate": 0.00012128654970760233, + "loss": 1.1596, + "mean_token_accuracy": 0.7277534902095795, + "num_tokens": 9590261.0, + "step": 764 + }, + { + "entropy": 0.9411380141973495, + "epoch": 1.275, + "grad_norm": 0.5676725506782532, + "learning_rate": 0.00012116959064327487, + "loss": 0.959, + "mean_token_accuracy": 0.7726512774825096, + "num_tokens": 9602848.0, + "step": 765 + }, + { + "entropy": 0.9246381893754005, + "epoch": 1.2766666666666666, + "grad_norm": 0.6678740382194519, + "learning_rate": 0.00012105263157894738, + "loss": 0.9213, + "mean_token_accuracy": 0.7782176956534386, + "num_tokens": 9615488.0, + "step": 766 + }, + { + "entropy": 0.9777566716074944, + "epoch": 1.2783333333333333, + "grad_norm": 0.46021127700805664, + "learning_rate": 0.00012093567251461989, + "loss": 0.9976, + "mean_token_accuracy": 0.7602128386497498, + "num_tokens": 9628260.0, + "step": 767 + }, + { + "entropy": 1.0313529521226883, + "epoch": 1.28, + "grad_norm": 0.37414881587028503, + "learning_rate": 0.00012081871345029241, + "loss": 1.0473, + "mean_token_accuracy": 0.7518272027373314, + "num_tokens": 9640722.0, + "step": 768 + }, + { + "entropy": 1.0204561650753021, + "epoch": 1.2816666666666667, + "grad_norm": 0.5116375088691711, + "learning_rate": 0.00012070175438596492, + "loss": 1.0463, + "mean_token_accuracy": 0.7471587732434273, + "num_tokens": 9653280.0, + "step": 769 + }, + { + "entropy": 0.9984316229820251, + "epoch": 1.2833333333333332, + "grad_norm": 0.48678651452064514, + "learning_rate": 0.00012058479532163743, + "loss": 1.0045, + "mean_token_accuracy": 0.7568108588457108, + "num_tokens": 9665894.0, + "step": 770 + }, + { + "entropy": 0.8993102163076401, + "epoch": 1.285, + "grad_norm": 0.33669960498809814, + "learning_rate": 0.00012046783625730995, + "loss": 0.8766, + "mean_token_accuracy": 0.7823315188288689, + "num_tokens": 9678454.0, + "step": 771 + }, + { + "entropy": 1.0980085879564285, + "epoch": 1.2866666666666666, + "grad_norm": 0.3783906102180481, + "learning_rate": 0.00012035087719298246, + "loss": 1.0533, + "mean_token_accuracy": 0.7443738207221031, + "num_tokens": 9691210.0, + "step": 772 + }, + { + "entropy": 0.9627801030874252, + "epoch": 1.2883333333333333, + "grad_norm": 0.3682050108909607, + "learning_rate": 0.00012023391812865498, + "loss": 0.9516, + "mean_token_accuracy": 0.7748104557394981, + "num_tokens": 9703463.0, + "step": 773 + }, + { + "entropy": 1.0400335937738419, + "epoch": 1.29, + "grad_norm": 0.3542267680168152, + "learning_rate": 0.0001201169590643275, + "loss": 0.9967, + "mean_token_accuracy": 0.7683819159865379, + "num_tokens": 9716027.0, + "step": 774 + }, + { + "entropy": 0.9925283342599869, + "epoch": 1.2916666666666667, + "grad_norm": 0.3950151801109314, + "learning_rate": 0.00012, + "loss": 0.9498, + "mean_token_accuracy": 0.7683768942952156, + "num_tokens": 9728291.0, + "step": 775 + }, + { + "entropy": 1.1487708538770676, + "epoch": 1.2933333333333334, + "grad_norm": 0.3613208532333374, + "learning_rate": 0.00011988304093567253, + "loss": 1.1255, + "mean_token_accuracy": 0.7375266328454018, + "num_tokens": 9740911.0, + "step": 776 + }, + { + "entropy": 0.9904927983880043, + "epoch": 1.295, + "grad_norm": 0.3638273775577545, + "learning_rate": 0.00011976608187134504, + "loss": 0.9816, + "mean_token_accuracy": 0.7648102417588234, + "num_tokens": 9753671.0, + "step": 777 + }, + { + "entropy": 0.9700521230697632, + "epoch": 1.2966666666666666, + "grad_norm": 0.44246748089790344, + "learning_rate": 0.00011964912280701755, + "loss": 0.9678, + "mean_token_accuracy": 0.7655287235975266, + "num_tokens": 9766264.0, + "step": 778 + }, + { + "entropy": 1.1071206703782082, + "epoch": 1.2983333333333333, + "grad_norm": 0.41459017992019653, + "learning_rate": 0.00011953216374269007, + "loss": 1.1313, + "mean_token_accuracy": 0.7323150411248207, + "num_tokens": 9778506.0, + "step": 779 + }, + { + "entropy": 0.9841584786772728, + "epoch": 1.3, + "grad_norm": 0.4696158468723297, + "learning_rate": 0.00011941520467836258, + "loss": 0.9963, + "mean_token_accuracy": 0.7636822164058685, + "num_tokens": 9791182.0, + "step": 780 + }, + { + "entropy": 0.9327910616993904, + "epoch": 1.3016666666666667, + "grad_norm": 0.7686042785644531, + "learning_rate": 0.00011929824561403509, + "loss": 0.9611, + "mean_token_accuracy": 0.7661004289984703, + "num_tokens": 9803792.0, + "step": 781 + }, + { + "entropy": 1.0880418792366982, + "epoch": 1.3033333333333332, + "grad_norm": 0.36805927753448486, + "learning_rate": 0.00011918128654970761, + "loss": 1.1148, + "mean_token_accuracy": 0.734060674905777, + "num_tokens": 9816277.0, + "step": 782 + }, + { + "entropy": 0.9658422768115997, + "epoch": 1.305, + "grad_norm": 0.6255410313606262, + "learning_rate": 0.00011906432748538012, + "loss": 0.9658, + "mean_token_accuracy": 0.7605624049901962, + "num_tokens": 9828950.0, + "step": 783 + }, + { + "entropy": 0.9103493466973305, + "epoch": 1.3066666666666666, + "grad_norm": 0.5097252130508423, + "learning_rate": 0.00011894736842105263, + "loss": 0.8901, + "mean_token_accuracy": 0.7865965068340302, + "num_tokens": 9841394.0, + "step": 784 + }, + { + "entropy": 0.9112555459141731, + "epoch": 1.3083333333333333, + "grad_norm": 0.4171690046787262, + "learning_rate": 0.00011883040935672515, + "loss": 0.9152, + "mean_token_accuracy": 0.7758986055850983, + "num_tokens": 9854242.0, + "step": 785 + }, + { + "entropy": 0.9862824454903603, + "epoch": 1.31, + "grad_norm": 0.7817356586456299, + "learning_rate": 0.00011871345029239766, + "loss": 0.9753, + "mean_token_accuracy": 0.7611427754163742, + "num_tokens": 9866662.0, + "step": 786 + }, + { + "entropy": 1.1108715310692787, + "epoch": 1.3116666666666665, + "grad_norm": 0.3802628517150879, + "learning_rate": 0.00011859649122807017, + "loss": 1.1302, + "mean_token_accuracy": 0.7318329811096191, + "num_tokens": 9879163.0, + "step": 787 + }, + { + "entropy": 1.0539921298623085, + "epoch": 1.3133333333333335, + "grad_norm": 0.37499651312828064, + "learning_rate": 0.00011847953216374271, + "loss": 1.0575, + "mean_token_accuracy": 0.7487984895706177, + "num_tokens": 9891670.0, + "step": 788 + }, + { + "entropy": 1.0349926948547363, + "epoch": 1.315, + "grad_norm": 0.571121871471405, + "learning_rate": 0.0001183625730994152, + "loss": 1.0738, + "mean_token_accuracy": 0.7484195157885551, + "num_tokens": 9903992.0, + "step": 789 + }, + { + "entropy": 0.9439081102609634, + "epoch": 1.3166666666666667, + "grad_norm": 0.4074806272983551, + "learning_rate": 0.00011824561403508771, + "loss": 0.92, + "mean_token_accuracy": 0.7820922136306763, + "num_tokens": 9916572.0, + "step": 790 + }, + { + "entropy": 0.9308940395712852, + "epoch": 1.3183333333333334, + "grad_norm": 0.5895468592643738, + "learning_rate": 0.00011812865497076025, + "loss": 0.9146, + "mean_token_accuracy": 0.78362637758255, + "num_tokens": 9928971.0, + "step": 791 + }, + { + "entropy": 1.1400123611092567, + "epoch": 1.32, + "grad_norm": 0.4412790536880493, + "learning_rate": 0.00011801169590643275, + "loss": 1.1464, + "mean_token_accuracy": 0.7309730723500252, + "num_tokens": 9941405.0, + "step": 792 + }, + { + "entropy": 0.9348255023360252, + "epoch": 1.3216666666666668, + "grad_norm": 0.3576184809207916, + "learning_rate": 0.00011789473684210525, + "loss": 0.9154, + "mean_token_accuracy": 0.7779194116592407, + "num_tokens": 9953803.0, + "step": 793 + }, + { + "entropy": 0.8785227425396442, + "epoch": 1.3233333333333333, + "grad_norm": 0.42946749925613403, + "learning_rate": 0.00011777777777777779, + "loss": 0.8667, + "mean_token_accuracy": 0.7964329123497009, + "num_tokens": 9966445.0, + "step": 794 + }, + { + "entropy": 1.033266007900238, + "epoch": 1.325, + "grad_norm": 0.758540153503418, + "learning_rate": 0.00011766081871345029, + "loss": 1.0317, + "mean_token_accuracy": 0.7529370561242104, + "num_tokens": 9978921.0, + "step": 795 + }, + { + "entropy": 1.023316115140915, + "epoch": 1.3266666666666667, + "grad_norm": 0.32743772864341736, + "learning_rate": 0.00011754385964912282, + "loss": 1.0038, + "mean_token_accuracy": 0.7565391361713409, + "num_tokens": 9991602.0, + "step": 796 + }, + { + "entropy": 1.0393253713846207, + "epoch": 1.3283333333333334, + "grad_norm": 0.769688606262207, + "learning_rate": 0.00011742690058479533, + "loss": 0.9982, + "mean_token_accuracy": 0.7604466900229454, + "num_tokens": 10004156.0, + "step": 797 + }, + { + "entropy": 1.0976822525262833, + "epoch": 1.33, + "grad_norm": 0.4449421763420105, + "learning_rate": 0.00011730994152046784, + "loss": 1.0685, + "mean_token_accuracy": 0.7425640299916267, + "num_tokens": 10016749.0, + "step": 798 + }, + { + "entropy": 1.133506491780281, + "epoch": 1.3316666666666666, + "grad_norm": 0.33224308490753174, + "learning_rate": 0.00011719298245614037, + "loss": 1.1163, + "mean_token_accuracy": 0.7311032935976982, + "num_tokens": 10029307.0, + "step": 799 + }, + { + "entropy": 0.9819313511252403, + "epoch": 1.3333333333333333, + "grad_norm": 0.4555339217185974, + "learning_rate": 0.00011707602339181288, + "loss": 0.9963, + "mean_token_accuracy": 0.7624331265687943, + "num_tokens": 10041671.0, + "step": 800 + }, + { + "entropy": 0.8490017838776112, + "epoch": 1.335, + "grad_norm": 0.46284276247024536, + "learning_rate": 0.00011695906432748539, + "loss": 0.8662, + "mean_token_accuracy": 0.7898927256464958, + "num_tokens": 10054407.0, + "step": 801 + }, + { + "entropy": 0.9487814530730247, + "epoch": 1.3366666666666667, + "grad_norm": 0.36387118697166443, + "learning_rate": 0.00011684210526315791, + "loss": 0.939, + "mean_token_accuracy": 0.7769873812794685, + "num_tokens": 10067181.0, + "step": 802 + }, + { + "entropy": 1.049319937825203, + "epoch": 1.3383333333333334, + "grad_norm": 0.3920729458332062, + "learning_rate": 0.00011672514619883042, + "loss": 1.0566, + "mean_token_accuracy": 0.7499385103583336, + "num_tokens": 10079648.0, + "step": 803 + }, + { + "entropy": 0.9268646985292435, + "epoch": 1.34, + "grad_norm": 0.37466734647750854, + "learning_rate": 0.00011660818713450293, + "loss": 0.8918, + "mean_token_accuracy": 0.7850276306271553, + "num_tokens": 10092233.0, + "step": 804 + }, + { + "entropy": 1.1615970730781555, + "epoch": 1.3416666666666668, + "grad_norm": 0.3827607333660126, + "learning_rate": 0.00011649122807017545, + "loss": 1.1766, + "mean_token_accuracy": 0.7194992825388908, + "num_tokens": 10105123.0, + "step": 805 + }, + { + "entropy": 1.0954640060663223, + "epoch": 1.3433333333333333, + "grad_norm": 0.3855399787425995, + "learning_rate": 0.00011637426900584796, + "loss": 1.1217, + "mean_token_accuracy": 0.7424816787242889, + "num_tokens": 10117720.0, + "step": 806 + }, + { + "entropy": 1.0013692080974579, + "epoch": 1.345, + "grad_norm": 0.48803019523620605, + "learning_rate": 0.00011625730994152047, + "loss": 1.0108, + "mean_token_accuracy": 0.7596750035881996, + "num_tokens": 10130194.0, + "step": 807 + }, + { + "entropy": 0.9630342796444893, + "epoch": 1.3466666666666667, + "grad_norm": 0.5283374786376953, + "learning_rate": 0.00011614035087719299, + "loss": 0.941, + "mean_token_accuracy": 0.7686321437358856, + "num_tokens": 10142856.0, + "step": 808 + }, + { + "entropy": 1.0880045965313911, + "epoch": 1.3483333333333334, + "grad_norm": 0.49206939339637756, + "learning_rate": 0.0001160233918128655, + "loss": 1.0745, + "mean_token_accuracy": 0.7435120195150375, + "num_tokens": 10155326.0, + "step": 809 + }, + { + "entropy": 1.082480400800705, + "epoch": 1.35, + "grad_norm": 0.41293981671333313, + "learning_rate": 0.00011590643274853801, + "loss": 1.0921, + "mean_token_accuracy": 0.7371596023440361, + "num_tokens": 10167789.0, + "step": 810 + }, + { + "entropy": 1.0294490680098534, + "epoch": 1.3516666666666666, + "grad_norm": 0.4838137626647949, + "learning_rate": 0.00011578947368421053, + "loss": 1.0284, + "mean_token_accuracy": 0.7529405504465103, + "num_tokens": 10180153.0, + "step": 811 + }, + { + "entropy": 1.0858699977397919, + "epoch": 1.3533333333333333, + "grad_norm": 0.43557876348495483, + "learning_rate": 0.00011567251461988304, + "loss": 1.0725, + "mean_token_accuracy": 0.7415995746850967, + "num_tokens": 10192703.0, + "step": 812 + }, + { + "entropy": 0.9729868844151497, + "epoch": 1.355, + "grad_norm": 0.5065525770187378, + "learning_rate": 0.00011555555555555555, + "loss": 0.9715, + "mean_token_accuracy": 0.7719387412071228, + "num_tokens": 10205169.0, + "step": 813 + }, + { + "entropy": 1.004656471312046, + "epoch": 1.3566666666666667, + "grad_norm": 0.4453844726085663, + "learning_rate": 0.00011543859649122808, + "loss": 1.0198, + "mean_token_accuracy": 0.7633600905537605, + "num_tokens": 10217893.0, + "step": 814 + }, + { + "entropy": 1.1781500577926636, + "epoch": 1.3583333333333334, + "grad_norm": 0.4585513770580292, + "learning_rate": 0.00011532163742690059, + "loss": 1.1959, + "mean_token_accuracy": 0.7197646796703339, + "num_tokens": 10230751.0, + "step": 815 + }, + { + "entropy": 1.1357240229845047, + "epoch": 1.3599999999999999, + "grad_norm": 0.4589082598686218, + "learning_rate": 0.00011520467836257311, + "loss": 1.1389, + "mean_token_accuracy": 0.7281661555171013, + "num_tokens": 10242960.0, + "step": 816 + }, + { + "entropy": 0.977837011218071, + "epoch": 1.3616666666666668, + "grad_norm": 0.35289326310157776, + "learning_rate": 0.00011508771929824562, + "loss": 0.993, + "mean_token_accuracy": 0.763344369828701, + "num_tokens": 10255565.0, + "step": 817 + }, + { + "entropy": 0.9367702156305313, + "epoch": 1.3633333333333333, + "grad_norm": 0.3776535093784332, + "learning_rate": 0.00011497076023391813, + "loss": 0.9283, + "mean_token_accuracy": 0.7799378857016563, + "num_tokens": 10267996.0, + "step": 818 + }, + { + "entropy": 1.1042609736323357, + "epoch": 1.365, + "grad_norm": 0.35418426990509033, + "learning_rate": 0.00011485380116959066, + "loss": 1.0843, + "mean_token_accuracy": 0.7454045936465263, + "num_tokens": 10280299.0, + "step": 819 + }, + { + "entropy": 1.0538883432745934, + "epoch": 1.3666666666666667, + "grad_norm": 0.48952603340148926, + "learning_rate": 0.00011473684210526316, + "loss": 1.0108, + "mean_token_accuracy": 0.7581906095147133, + "num_tokens": 10292891.0, + "step": 820 + }, + { + "entropy": 1.0277271196246147, + "epoch": 1.3683333333333334, + "grad_norm": 0.36572200059890747, + "learning_rate": 0.00011461988304093567, + "loss": 0.9814, + "mean_token_accuracy": 0.7698658257722855, + "num_tokens": 10305316.0, + "step": 821 + }, + { + "entropy": 1.2049788609147072, + "epoch": 1.37, + "grad_norm": 0.32481735944747925, + "learning_rate": 0.0001145029239766082, + "loss": 1.1662, + "mean_token_accuracy": 0.7243218049407005, + "num_tokens": 10317552.0, + "step": 822 + }, + { + "entropy": 1.0278822854161263, + "epoch": 1.3716666666666666, + "grad_norm": 0.5740171074867249, + "learning_rate": 0.0001143859649122807, + "loss": 0.9984, + "mean_token_accuracy": 0.7567669078707695, + "num_tokens": 10330519.0, + "step": 823 + }, + { + "entropy": 1.1082218512892723, + "epoch": 1.3733333333333333, + "grad_norm": 0.4737369418144226, + "learning_rate": 0.00011426900584795321, + "loss": 1.136, + "mean_token_accuracy": 0.7322841584682465, + "num_tokens": 10343070.0, + "step": 824 + }, + { + "entropy": 0.9878677502274513, + "epoch": 1.375, + "grad_norm": 0.338487446308136, + "learning_rate": 0.00011415204678362575, + "loss": 1.0096, + "mean_token_accuracy": 0.7636533156037331, + "num_tokens": 10355763.0, + "step": 825 + }, + { + "entropy": 1.0612533316016197, + "epoch": 1.3766666666666667, + "grad_norm": 0.9741193056106567, + "learning_rate": 0.00011403508771929824, + "loss": 1.0522, + "mean_token_accuracy": 0.7476543188095093, + "num_tokens": 10368182.0, + "step": 826 + }, + { + "entropy": 0.9444241896271706, + "epoch": 1.3783333333333334, + "grad_norm": 0.395882248878479, + "learning_rate": 0.00011391812865497075, + "loss": 0.9429, + "mean_token_accuracy": 0.7722252234816551, + "num_tokens": 10380635.0, + "step": 827 + }, + { + "entropy": 0.9475426152348518, + "epoch": 1.38, + "grad_norm": 0.5885340571403503, + "learning_rate": 0.00011380116959064329, + "loss": 0.9643, + "mean_token_accuracy": 0.7732816264033318, + "num_tokens": 10393166.0, + "step": 828 + }, + { + "entropy": 1.027025744318962, + "epoch": 1.3816666666666666, + "grad_norm": 0.4447604715824127, + "learning_rate": 0.0001136842105263158, + "loss": 1.0358, + "mean_token_accuracy": 0.7527819871902466, + "num_tokens": 10405898.0, + "step": 829 + }, + { + "entropy": 1.028792716562748, + "epoch": 1.3833333333333333, + "grad_norm": 4.340502738952637, + "learning_rate": 0.0001135672514619883, + "loss": 1.0166, + "mean_token_accuracy": 0.7590840607881546, + "num_tokens": 10418260.0, + "step": 830 + }, + { + "entropy": 0.9294743090867996, + "epoch": 1.385, + "grad_norm": 0.3686159551143646, + "learning_rate": 0.00011345029239766083, + "loss": 0.9325, + "mean_token_accuracy": 0.7747152373194695, + "num_tokens": 10430739.0, + "step": 831 + }, + { + "entropy": 1.0500045493245125, + "epoch": 1.3866666666666667, + "grad_norm": 0.6629594564437866, + "learning_rate": 0.00011333333333333334, + "loss": 1.054, + "mean_token_accuracy": 0.7504790723323822, + "num_tokens": 10443176.0, + "step": 832 + }, + { + "entropy": 0.943693071603775, + "epoch": 1.3883333333333332, + "grad_norm": 0.6458562016487122, + "learning_rate": 0.00011321637426900584, + "loss": 0.9239, + "mean_token_accuracy": 0.7761659324169159, + "num_tokens": 10456060.0, + "step": 833 + }, + { + "entropy": 0.9273689910769463, + "epoch": 1.3900000000000001, + "grad_norm": 0.5584622621536255, + "learning_rate": 0.00011309941520467837, + "loss": 0.8941, + "mean_token_accuracy": 0.7786312326788902, + "num_tokens": 10468776.0, + "step": 834 + }, + { + "entropy": 1.0498060882091522, + "epoch": 1.3916666666666666, + "grad_norm": 0.8290840983390808, + "learning_rate": 0.00011298245614035088, + "loss": 1.0591, + "mean_token_accuracy": 0.7479442059993744, + "num_tokens": 10481203.0, + "step": 835 + }, + { + "entropy": 0.9626110792160034, + "epoch": 1.3933333333333333, + "grad_norm": 0.47482046484947205, + "learning_rate": 0.0001128654970760234, + "loss": 0.945, + "mean_token_accuracy": 0.7699298560619354, + "num_tokens": 10493832.0, + "step": 836 + }, + { + "entropy": 1.0231076627969742, + "epoch": 1.395, + "grad_norm": 0.3916915953159332, + "learning_rate": 0.00011274853801169592, + "loss": 1.0063, + "mean_token_accuracy": 0.7689137682318687, + "num_tokens": 10506284.0, + "step": 837 + }, + { + "entropy": 1.1151341199874878, + "epoch": 1.3966666666666667, + "grad_norm": 0.5548978447914124, + "learning_rate": 0.00011263157894736843, + "loss": 1.109, + "mean_token_accuracy": 0.7429637610912323, + "num_tokens": 10518567.0, + "step": 838 + }, + { + "entropy": 1.066704586148262, + "epoch": 1.3983333333333334, + "grad_norm": 0.5807605981826782, + "learning_rate": 0.00011251461988304095, + "loss": 1.0531, + "mean_token_accuracy": 0.7495111003518105, + "num_tokens": 10531465.0, + "step": 839 + }, + { + "entropy": 0.9433969557285309, + "epoch": 1.4, + "grad_norm": 0.35977932810783386, + "learning_rate": 0.00011239766081871346, + "loss": 0.9214, + "mean_token_accuracy": 0.7798573076725006, + "num_tokens": 10543955.0, + "step": 840 + }, + { + "entropy": 1.0538764372467995, + "epoch": 1.4016666666666666, + "grad_norm": 0.5120930671691895, + "learning_rate": 0.00011228070175438597, + "loss": 1.0399, + "mean_token_accuracy": 0.7506494671106339, + "num_tokens": 10556527.0, + "step": 841 + }, + { + "entropy": 1.0073631629347801, + "epoch": 1.4033333333333333, + "grad_norm": 0.5792893767356873, + "learning_rate": 0.00011216374269005849, + "loss": 1.0071, + "mean_token_accuracy": 0.7614312022924423, + "num_tokens": 10569305.0, + "step": 842 + }, + { + "entropy": 1.1166088730096817, + "epoch": 1.405, + "grad_norm": 0.3618125319480896, + "learning_rate": 0.000112046783625731, + "loss": 1.0961, + "mean_token_accuracy": 0.7349946275353432, + "num_tokens": 10581842.0, + "step": 843 + }, + { + "entropy": 0.9652634114027023, + "epoch": 1.4066666666666667, + "grad_norm": 0.413026362657547, + "learning_rate": 0.00011192982456140351, + "loss": 0.9539, + "mean_token_accuracy": 0.7748262882232666, + "num_tokens": 10594474.0, + "step": 844 + }, + { + "entropy": 1.0343084707856178, + "epoch": 1.4083333333333332, + "grad_norm": 0.44033581018447876, + "learning_rate": 0.00011181286549707603, + "loss": 1.0534, + "mean_token_accuracy": 0.7520105093717575, + "num_tokens": 10607053.0, + "step": 845 + }, + { + "entropy": 0.9925975799560547, + "epoch": 1.41, + "grad_norm": 0.3442208170890808, + "learning_rate": 0.00011169590643274854, + "loss": 0.9824, + "mean_token_accuracy": 0.767107367515564, + "num_tokens": 10619633.0, + "step": 846 + }, + { + "entropy": 0.9882001951336861, + "epoch": 1.4116666666666666, + "grad_norm": 0.372429221868515, + "learning_rate": 0.00011157894736842105, + "loss": 0.997, + "mean_token_accuracy": 0.754954032599926, + "num_tokens": 10631881.0, + "step": 847 + }, + { + "entropy": 1.216900959610939, + "epoch": 1.4133333333333333, + "grad_norm": 0.32626840472221375, + "learning_rate": 0.00011146198830409357, + "loss": 1.1901, + "mean_token_accuracy": 0.7172373160719872, + "num_tokens": 10644620.0, + "step": 848 + }, + { + "entropy": 1.0781096443533897, + "epoch": 1.415, + "grad_norm": 0.48700082302093506, + "learning_rate": 0.00011134502923976608, + "loss": 1.0506, + "mean_token_accuracy": 0.7515213713049889, + "num_tokens": 10657275.0, + "step": 849 + }, + { + "entropy": 1.006898395717144, + "epoch": 1.4166666666666667, + "grad_norm": 0.4172718822956085, + "learning_rate": 0.0001112280701754386, + "loss": 1.0189, + "mean_token_accuracy": 0.7525632977485657, + "num_tokens": 10669659.0, + "step": 850 + }, + { + "entropy": 0.9942312985658646, + "epoch": 1.4183333333333334, + "grad_norm": 0.3973284959793091, + "learning_rate": 0.00011111111111111112, + "loss": 1.0055, + "mean_token_accuracy": 0.7587251886725426, + "num_tokens": 10682439.0, + "step": 851 + }, + { + "entropy": 1.0647492855787277, + "epoch": 1.42, + "grad_norm": 0.36533141136169434, + "learning_rate": 0.00011099415204678363, + "loss": 1.0481, + "mean_token_accuracy": 0.7488600835204124, + "num_tokens": 10694974.0, + "step": 852 + }, + { + "entropy": 0.965335488319397, + "epoch": 1.4216666666666666, + "grad_norm": 0.3551306426525116, + "learning_rate": 0.00011087719298245614, + "loss": 0.9249, + "mean_token_accuracy": 0.7786455601453781, + "num_tokens": 10707370.0, + "step": 853 + }, + { + "entropy": 0.985395722091198, + "epoch": 1.4233333333333333, + "grad_norm": 0.3487420082092285, + "learning_rate": 0.00011076023391812866, + "loss": 0.9543, + "mean_token_accuracy": 0.7682265117764473, + "num_tokens": 10719952.0, + "step": 854 + }, + { + "entropy": 1.171361893415451, + "epoch": 1.425, + "grad_norm": 0.3417138159275055, + "learning_rate": 0.00011064327485380117, + "loss": 1.1721, + "mean_token_accuracy": 0.7295513302087784, + "num_tokens": 10732226.0, + "step": 855 + }, + { + "entropy": 0.8770536556839943, + "epoch": 1.4266666666666667, + "grad_norm": 0.3753270208835602, + "learning_rate": 0.0001105263157894737, + "loss": 0.8653, + "mean_token_accuracy": 0.7902911081910133, + "num_tokens": 10744771.0, + "step": 856 + }, + { + "entropy": 0.993248276412487, + "epoch": 1.4283333333333332, + "grad_norm": 0.3620983958244324, + "learning_rate": 0.0001104093567251462, + "loss": 1.0105, + "mean_token_accuracy": 0.7606182098388672, + "num_tokens": 10757054.0, + "step": 857 + }, + { + "entropy": 1.0585757717490196, + "epoch": 1.43, + "grad_norm": 0.6057856678962708, + "learning_rate": 0.00011029239766081871, + "loss": 1.0547, + "mean_token_accuracy": 0.748723529279232, + "num_tokens": 10769655.0, + "step": 858 + }, + { + "entropy": 1.044035878032446, + "epoch": 1.4316666666666666, + "grad_norm": 0.48957177996635437, + "learning_rate": 0.00011017543859649125, + "loss": 1.0722, + "mean_token_accuracy": 0.7449115514755249, + "num_tokens": 10782498.0, + "step": 859 + }, + { + "entropy": 1.0420118942856789, + "epoch": 1.4333333333333333, + "grad_norm": 1.1686972379684448, + "learning_rate": 0.00011005847953216376, + "loss": 1.0403, + "mean_token_accuracy": 0.7532968968153, + "num_tokens": 10794988.0, + "step": 860 + }, + { + "entropy": 0.9224732890725136, + "epoch": 1.435, + "grad_norm": 0.554107129573822, + "learning_rate": 0.00010994152046783625, + "loss": 0.8907, + "mean_token_accuracy": 0.7824530005455017, + "num_tokens": 10807684.0, + "step": 861 + }, + { + "entropy": 1.0446735471487045, + "epoch": 1.4366666666666665, + "grad_norm": 0.5582395195960999, + "learning_rate": 0.00010982456140350879, + "loss": 1.0294, + "mean_token_accuracy": 0.7591670379042625, + "num_tokens": 10820456.0, + "step": 862 + }, + { + "entropy": 1.1306973919272423, + "epoch": 1.4383333333333335, + "grad_norm": 0.6018702387809753, + "learning_rate": 0.0001097076023391813, + "loss": 1.0978, + "mean_token_accuracy": 0.7418882921338081, + "num_tokens": 10832758.0, + "step": 863 + }, + { + "entropy": 1.0693649873137474, + "epoch": 1.44, + "grad_norm": 0.5192097425460815, + "learning_rate": 0.0001095906432748538, + "loss": 1.0388, + "mean_token_accuracy": 0.7521316781640053, + "num_tokens": 10845340.0, + "step": 864 + }, + { + "entropy": 1.1522692888975143, + "epoch": 1.4416666666666667, + "grad_norm": 0.6497692465782166, + "learning_rate": 0.00010947368421052633, + "loss": 1.1314, + "mean_token_accuracy": 0.7329999729990959, + "num_tokens": 10857967.0, + "step": 865 + }, + { + "entropy": 1.0177212581038475, + "epoch": 1.4433333333333334, + "grad_norm": 0.5419860482215881, + "learning_rate": 0.00010935672514619884, + "loss": 1.0031, + "mean_token_accuracy": 0.7599886953830719, + "num_tokens": 10870669.0, + "step": 866 + }, + { + "entropy": 1.023183934390545, + "epoch": 1.445, + "grad_norm": 0.5257622003555298, + "learning_rate": 0.00010923976608187134, + "loss": 0.9979, + "mean_token_accuracy": 0.7620133087038994, + "num_tokens": 10883191.0, + "step": 867 + }, + { + "entropy": 1.0132087841629982, + "epoch": 1.4466666666666668, + "grad_norm": 0.5330016016960144, + "learning_rate": 0.00010912280701754387, + "loss": 1.0076, + "mean_token_accuracy": 0.7519190832972527, + "num_tokens": 10895676.0, + "step": 868 + }, + { + "entropy": 0.9974637776613235, + "epoch": 1.4483333333333333, + "grad_norm": 0.36778372526168823, + "learning_rate": 0.00010900584795321638, + "loss": 0.9833, + "mean_token_accuracy": 0.7621881663799286, + "num_tokens": 10908336.0, + "step": 869 + }, + { + "entropy": 0.8968958966434002, + "epoch": 1.45, + "grad_norm": 0.5859230756759644, + "learning_rate": 0.00010888888888888889, + "loss": 0.8937, + "mean_token_accuracy": 0.7845388129353523, + "num_tokens": 10920842.0, + "step": 870 + }, + { + "entropy": 1.1610392034053802, + "epoch": 1.4516666666666667, + "grad_norm": 0.6320472359657288, + "learning_rate": 0.00010877192982456141, + "loss": 1.1792, + "mean_token_accuracy": 0.7180443182587624, + "num_tokens": 10933308.0, + "step": 871 + }, + { + "entropy": 0.9494692981243134, + "epoch": 1.4533333333333334, + "grad_norm": 0.34578004479408264, + "learning_rate": 0.00010865497076023392, + "loss": 0.9557, + "mean_token_accuracy": 0.7738220021128654, + "num_tokens": 10945907.0, + "step": 872 + }, + { + "entropy": 1.1724983602762222, + "epoch": 1.455, + "grad_norm": 0.36840155720710754, + "learning_rate": 0.00010853801169590643, + "loss": 1.2108, + "mean_token_accuracy": 0.7143998891115189, + "num_tokens": 10958532.0, + "step": 873 + }, + { + "entropy": 0.9114683270454407, + "epoch": 1.4566666666666666, + "grad_norm": 0.49961942434310913, + "learning_rate": 0.00010842105263157896, + "loss": 0.9228, + "mean_token_accuracy": 0.7720942944288254, + "num_tokens": 10970796.0, + "step": 874 + }, + { + "entropy": 1.1112416312098503, + "epoch": 1.4583333333333333, + "grad_norm": 0.41927391290664673, + "learning_rate": 0.00010830409356725147, + "loss": 1.1155, + "mean_token_accuracy": 0.7332775890827179, + "num_tokens": 10983480.0, + "step": 875 + }, + { + "entropy": 1.108654335141182, + "epoch": 1.46, + "grad_norm": 0.3839800953865051, + "learning_rate": 0.00010818713450292399, + "loss": 1.1028, + "mean_token_accuracy": 0.744315542280674, + "num_tokens": 10996321.0, + "step": 876 + }, + { + "entropy": 1.0760968998074532, + "epoch": 1.4616666666666667, + "grad_norm": 0.3334137499332428, + "learning_rate": 0.0001080701754385965, + "loss": 1.0539, + "mean_token_accuracy": 0.7508803084492683, + "num_tokens": 11008706.0, + "step": 877 + }, + { + "entropy": 0.9972855970263481, + "epoch": 1.4633333333333334, + "grad_norm": 0.42082127928733826, + "learning_rate": 0.00010795321637426901, + "loss": 0.9675, + "mean_token_accuracy": 0.7756324484944344, + "num_tokens": 11021264.0, + "step": 878 + }, + { + "entropy": 1.0188745334744453, + "epoch": 1.465, + "grad_norm": 0.4535685181617737, + "learning_rate": 0.00010783625730994153, + "loss": 1.0095, + "mean_token_accuracy": 0.761257492005825, + "num_tokens": 11033968.0, + "step": 879 + }, + { + "entropy": 1.1675554513931274, + "epoch": 1.4666666666666668, + "grad_norm": 0.41163545846939087, + "learning_rate": 0.00010771929824561404, + "loss": 1.1454, + "mean_token_accuracy": 0.7286913469433784, + "num_tokens": 11046089.0, + "step": 880 + }, + { + "entropy": 0.9919754564762115, + "epoch": 1.4683333333333333, + "grad_norm": 0.37073564529418945, + "learning_rate": 0.00010760233918128655, + "loss": 0.9886, + "mean_token_accuracy": 0.7659579887986183, + "num_tokens": 11058355.0, + "step": 881 + }, + { + "entropy": 1.1535531505942345, + "epoch": 1.47, + "grad_norm": 0.43245184421539307, + "learning_rate": 0.00010748538011695907, + "loss": 1.1231, + "mean_token_accuracy": 0.7396808043122292, + "num_tokens": 11070883.0, + "step": 882 + }, + { + "entropy": 1.0976762846112251, + "epoch": 1.4716666666666667, + "grad_norm": 0.40902963280677795, + "learning_rate": 0.00010736842105263158, + "loss": 1.0822, + "mean_token_accuracy": 0.7501776218414307, + "num_tokens": 11083431.0, + "step": 883 + }, + { + "entropy": 1.0290912240743637, + "epoch": 1.4733333333333334, + "grad_norm": 0.4858871400356293, + "learning_rate": 0.00010725146198830409, + "loss": 1.0126, + "mean_token_accuracy": 0.7592322379350662, + "num_tokens": 11096095.0, + "step": 884 + }, + { + "entropy": 0.8518183901906013, + "epoch": 1.475, + "grad_norm": 0.3919861614704132, + "learning_rate": 0.00010713450292397661, + "loss": 0.8265, + "mean_token_accuracy": 0.7959297299385071, + "num_tokens": 11108770.0, + "step": 885 + }, + { + "entropy": 1.1525486186146736, + "epoch": 1.4766666666666666, + "grad_norm": 0.4030328691005707, + "learning_rate": 0.00010701754385964912, + "loss": 1.1595, + "mean_token_accuracy": 0.7353719994425774, + "num_tokens": 11121258.0, + "step": 886 + }, + { + "entropy": 1.0715966746211052, + "epoch": 1.4783333333333333, + "grad_norm": 0.41663604974746704, + "learning_rate": 0.00010690058479532163, + "loss": 1.0769, + "mean_token_accuracy": 0.7448949441313744, + "num_tokens": 11133725.0, + "step": 887 + }, + { + "entropy": 0.8812004327774048, + "epoch": 1.48, + "grad_norm": 0.37869131565093994, + "learning_rate": 0.00010678362573099416, + "loss": 0.8715, + "mean_token_accuracy": 0.7830679789185524, + "num_tokens": 11146636.0, + "step": 888 + }, + { + "entropy": 1.017636887729168, + "epoch": 1.4816666666666667, + "grad_norm": 0.36093369126319885, + "learning_rate": 0.00010666666666666667, + "loss": 1.0347, + "mean_token_accuracy": 0.7492915317416191, + "num_tokens": 11159517.0, + "step": 889 + }, + { + "entropy": 1.1328649371862411, + "epoch": 1.4833333333333334, + "grad_norm": 0.3405679762363434, + "learning_rate": 0.00010654970760233918, + "loss": 1.1572, + "mean_token_accuracy": 0.7216387167572975, + "num_tokens": 11172100.0, + "step": 890 + }, + { + "entropy": 1.0226778164505959, + "epoch": 1.4849999999999999, + "grad_norm": 0.37923672795295715, + "learning_rate": 0.00010643274853801171, + "loss": 1.026, + "mean_token_accuracy": 0.7598460465669632, + "num_tokens": 11184561.0, + "step": 891 + }, + { + "entropy": 1.0817178189754486, + "epoch": 1.4866666666666668, + "grad_norm": 0.3810464143753052, + "learning_rate": 0.00010631578947368421, + "loss": 1.0715, + "mean_token_accuracy": 0.746548056602478, + "num_tokens": 11196944.0, + "step": 892 + }, + { + "entropy": 0.9835236445069313, + "epoch": 1.4883333333333333, + "grad_norm": 0.37003716826438904, + "learning_rate": 0.00010619883040935672, + "loss": 0.9567, + "mean_token_accuracy": 0.7729001268744469, + "num_tokens": 11209530.0, + "step": 893 + }, + { + "entropy": 1.1025512740015984, + "epoch": 1.49, + "grad_norm": 0.3993394672870636, + "learning_rate": 0.00010608187134502925, + "loss": 1.0974, + "mean_token_accuracy": 0.7313189208507538, + "num_tokens": 11221987.0, + "step": 894 + }, + { + "entropy": 1.0139773339033127, + "epoch": 1.4916666666666667, + "grad_norm": 0.31166166067123413, + "learning_rate": 0.00010596491228070175, + "loss": 1.0037, + "mean_token_accuracy": 0.7613530680537224, + "num_tokens": 11234942.0, + "step": 895 + }, + { + "entropy": 1.0583973452448845, + "epoch": 1.4933333333333334, + "grad_norm": 0.3589918613433838, + "learning_rate": 0.00010584795321637429, + "loss": 1.0734, + "mean_token_accuracy": 0.7417832165956497, + "num_tokens": 11247788.0, + "step": 896 + }, + { + "entropy": 1.1149739176034927, + "epoch": 1.495, + "grad_norm": 0.4475265145301819, + "learning_rate": 0.0001057309941520468, + "loss": 1.1002, + "mean_token_accuracy": 0.7413612082600594, + "num_tokens": 11260738.0, + "step": 897 + }, + { + "entropy": 1.0767404064536095, + "epoch": 1.4966666666666666, + "grad_norm": 0.43724361062049866, + "learning_rate": 0.00010561403508771929, + "loss": 1.077, + "mean_token_accuracy": 0.7468773946166039, + "num_tokens": 11273395.0, + "step": 898 + }, + { + "entropy": 1.1489032730460167, + "epoch": 1.4983333333333333, + "grad_norm": 0.328046053647995, + "learning_rate": 0.00010549707602339183, + "loss": 1.1427, + "mean_token_accuracy": 0.7237901836633682, + "num_tokens": 11285769.0, + "step": 899 + }, + { + "entropy": 0.9795516058802605, + "epoch": 1.5, + "grad_norm": 0.3441937565803528, + "learning_rate": 0.00010538011695906434, + "loss": 0.9833, + "mean_token_accuracy": 0.7675316259264946, + "num_tokens": 11298603.0, + "step": 900 + }, + { + "entropy": 1.0521475449204445, + "epoch": 1.5016666666666667, + "grad_norm": 0.36500778794288635, + "learning_rate": 0.00010526315789473685, + "loss": 1.0503, + "mean_token_accuracy": 0.7476348206400871, + "num_tokens": 11311395.0, + "step": 901 + }, + { + "entropy": 0.9882413372397423, + "epoch": 1.5033333333333334, + "grad_norm": 0.31784096360206604, + "learning_rate": 0.00010514619883040937, + "loss": 0.9629, + "mean_token_accuracy": 0.7727819085121155, + "num_tokens": 11324223.0, + "step": 902 + }, + { + "entropy": 1.0452478006482124, + "epoch": 1.505, + "grad_norm": 0.3701280355453491, + "learning_rate": 0.00010502923976608188, + "loss": 1.0313, + "mean_token_accuracy": 0.7561671435832977, + "num_tokens": 11336798.0, + "step": 903 + }, + { + "entropy": 1.1844883859157562, + "epoch": 1.5066666666666668, + "grad_norm": 0.3551271855831146, + "learning_rate": 0.00010491228070175439, + "loss": 1.166, + "mean_token_accuracy": 0.7237709909677505, + "num_tokens": 11349352.0, + "step": 904 + }, + { + "entropy": 1.0555674508213997, + "epoch": 1.5083333333333333, + "grad_norm": 0.34529635310173035, + "learning_rate": 0.00010479532163742691, + "loss": 1.0294, + "mean_token_accuracy": 0.7504124864935875, + "num_tokens": 11362065.0, + "step": 905 + }, + { + "entropy": 0.9649133235216141, + "epoch": 1.51, + "grad_norm": 0.43623584508895874, + "learning_rate": 0.00010467836257309942, + "loss": 0.9695, + "mean_token_accuracy": 0.7675874829292297, + "num_tokens": 11374499.0, + "step": 906 + }, + { + "entropy": 1.0951272398233414, + "epoch": 1.5116666666666667, + "grad_norm": 0.4360898733139038, + "learning_rate": 0.00010456140350877193, + "loss": 1.0914, + "mean_token_accuracy": 0.7397733628749847, + "num_tokens": 11387186.0, + "step": 907 + }, + { + "entropy": 0.97959403693676, + "epoch": 1.5133333333333332, + "grad_norm": 0.39979809522628784, + "learning_rate": 0.00010444444444444445, + "loss": 0.9577, + "mean_token_accuracy": 0.7697319462895393, + "num_tokens": 11399647.0, + "step": 908 + }, + { + "entropy": 0.997077964246273, + "epoch": 1.5150000000000001, + "grad_norm": 0.4603921175003052, + "learning_rate": 0.00010432748538011696, + "loss": 0.991, + "mean_token_accuracy": 0.7578662112355232, + "num_tokens": 11412192.0, + "step": 909 + }, + { + "entropy": 0.9444845467805862, + "epoch": 1.5166666666666666, + "grad_norm": 0.4429384768009186, + "learning_rate": 0.00010421052631578947, + "loss": 0.9316, + "mean_token_accuracy": 0.7735821157693863, + "num_tokens": 11424501.0, + "step": 910 + }, + { + "entropy": 1.052556574344635, + "epoch": 1.5183333333333333, + "grad_norm": 0.3348616659641266, + "learning_rate": 0.000104093567251462, + "loss": 1.0526, + "mean_token_accuracy": 0.7529369965195656, + "num_tokens": 11437123.0, + "step": 911 + }, + { + "entropy": 1.0511446967720985, + "epoch": 1.52, + "grad_norm": 0.46372920274734497, + "learning_rate": 0.0001039766081871345, + "loss": 1.0789, + "mean_token_accuracy": 0.7440238147974014, + "num_tokens": 11449811.0, + "step": 912 + }, + { + "entropy": 0.9433198198676109, + "epoch": 1.5216666666666665, + "grad_norm": 0.4527278244495392, + "learning_rate": 0.00010385964912280702, + "loss": 0.9384, + "mean_token_accuracy": 0.7744319513440132, + "num_tokens": 11462288.0, + "step": 913 + }, + { + "entropy": 0.9461892023682594, + "epoch": 1.5233333333333334, + "grad_norm": 0.569244384765625, + "learning_rate": 0.00010374269005847954, + "loss": 0.9473, + "mean_token_accuracy": 0.7712215408682823, + "num_tokens": 11474652.0, + "step": 914 + }, + { + "entropy": 0.9832625687122345, + "epoch": 1.525, + "grad_norm": 0.47432512044906616, + "learning_rate": 0.00010362573099415205, + "loss": 0.9741, + "mean_token_accuracy": 0.7722496688365936, + "num_tokens": 11487038.0, + "step": 915 + }, + { + "entropy": 1.113365113735199, + "epoch": 1.5266666666666666, + "grad_norm": 0.5353997945785522, + "learning_rate": 0.00010350877192982457, + "loss": 1.1232, + "mean_token_accuracy": 0.7369033172726631, + "num_tokens": 11499827.0, + "step": 916 + }, + { + "entropy": 1.045794539153576, + "epoch": 1.5283333333333333, + "grad_norm": 0.4842732548713684, + "learning_rate": 0.00010339181286549708, + "loss": 1.0219, + "mean_token_accuracy": 0.7538414746522903, + "num_tokens": 11512411.0, + "step": 917 + }, + { + "entropy": 1.008032351732254, + "epoch": 1.53, + "grad_norm": 0.4723028242588043, + "learning_rate": 0.00010327485380116959, + "loss": 0.9897, + "mean_token_accuracy": 0.7631094679236412, + "num_tokens": 11525202.0, + "step": 918 + }, + { + "entropy": 1.2124775275588036, + "epoch": 1.5316666666666667, + "grad_norm": 0.6457427740097046, + "learning_rate": 0.00010315789473684211, + "loss": 1.2234, + "mean_token_accuracy": 0.7147088348865509, + "num_tokens": 11537747.0, + "step": 919 + }, + { + "entropy": 1.0353680774569511, + "epoch": 1.5333333333333332, + "grad_norm": 0.5149745345115662, + "learning_rate": 0.00010304093567251462, + "loss": 1.0249, + "mean_token_accuracy": 0.7582436203956604, + "num_tokens": 11550393.0, + "step": 920 + }, + { + "entropy": 1.0725176259875298, + "epoch": 1.5350000000000001, + "grad_norm": 0.3201546370983124, + "learning_rate": 0.00010292397660818713, + "loss": 1.08, + "mean_token_accuracy": 0.7444223612546921, + "num_tokens": 11562681.0, + "step": 921 + }, + { + "entropy": 1.0299292877316475, + "epoch": 1.5366666666666666, + "grad_norm": 0.6729260087013245, + "learning_rate": 0.00010280701754385967, + "loss": 1.0095, + "mean_token_accuracy": 0.7589081674814224, + "num_tokens": 11575287.0, + "step": 922 + }, + { + "entropy": 1.129386618733406, + "epoch": 1.5383333333333333, + "grad_norm": 0.647361159324646, + "learning_rate": 0.00010269005847953216, + "loss": 1.1285, + "mean_token_accuracy": 0.7395085915923119, + "num_tokens": 11587776.0, + "step": 923 + }, + { + "entropy": 0.901118665933609, + "epoch": 1.54, + "grad_norm": 0.4085787832736969, + "learning_rate": 0.00010257309941520467, + "loss": 0.8937, + "mean_token_accuracy": 0.7793251350522041, + "num_tokens": 11600608.0, + "step": 924 + }, + { + "entropy": 1.0647885873913765, + "epoch": 1.5416666666666665, + "grad_norm": 0.38735586404800415, + "learning_rate": 0.00010245614035087721, + "loss": 1.0462, + "mean_token_accuracy": 0.740901917219162, + "num_tokens": 11613369.0, + "step": 925 + }, + { + "entropy": 0.9393143653869629, + "epoch": 1.5433333333333334, + "grad_norm": 0.6960055232048035, + "learning_rate": 0.00010233918128654971, + "loss": 0.9437, + "mean_token_accuracy": 0.7748453468084335, + "num_tokens": 11625871.0, + "step": 926 + }, + { + "entropy": 1.1271546632051468, + "epoch": 1.545, + "grad_norm": 0.5912812948226929, + "learning_rate": 0.00010222222222222222, + "loss": 1.1254, + "mean_token_accuracy": 0.7337081283330917, + "num_tokens": 11638374.0, + "step": 927 + }, + { + "entropy": 0.9807042926549911, + "epoch": 1.5466666666666666, + "grad_norm": 0.3811970055103302, + "learning_rate": 0.00010210526315789475, + "loss": 0.9585, + "mean_token_accuracy": 0.7691030874848366, + "num_tokens": 11651054.0, + "step": 928 + }, + { + "entropy": 1.0841076374053955, + "epoch": 1.5483333333333333, + "grad_norm": 0.5395686626434326, + "learning_rate": 0.00010198830409356725, + "loss": 1.0496, + "mean_token_accuracy": 0.7462358325719833, + "num_tokens": 11663627.0, + "step": 929 + }, + { + "entropy": 1.065460205078125, + "epoch": 1.55, + "grad_norm": 0.763958752155304, + "learning_rate": 0.00010187134502923976, + "loss": 1.0523, + "mean_token_accuracy": 0.7501270100474358, + "num_tokens": 11675897.0, + "step": 930 + }, + { + "entropy": 1.0842286050319672, + "epoch": 1.5516666666666667, + "grad_norm": 0.5680394172668457, + "learning_rate": 0.0001017543859649123, + "loss": 1.0838, + "mean_token_accuracy": 0.7463953346014023, + "num_tokens": 11688459.0, + "step": 931 + }, + { + "entropy": 1.0065980926156044, + "epoch": 1.5533333333333332, + "grad_norm": 0.5923241376876831, + "learning_rate": 0.0001016374269005848, + "loss": 0.9959, + "mean_token_accuracy": 0.7607046961784363, + "num_tokens": 11700966.0, + "step": 932 + }, + { + "entropy": 1.0660686418414116, + "epoch": 1.5550000000000002, + "grad_norm": 0.6443043351173401, + "learning_rate": 0.0001015204678362573, + "loss": 1.0844, + "mean_token_accuracy": 0.7445439100265503, + "num_tokens": 11713700.0, + "step": 933 + }, + { + "entropy": 1.0937560498714447, + "epoch": 1.5566666666666666, + "grad_norm": 0.5176796317100525, + "learning_rate": 0.00010140350877192984, + "loss": 1.1249, + "mean_token_accuracy": 0.7354537099599838, + "num_tokens": 11726035.0, + "step": 934 + }, + { + "entropy": 0.9690373241901398, + "epoch": 1.5583333333333333, + "grad_norm": 0.36154699325561523, + "learning_rate": 0.00010128654970760235, + "loss": 0.9794, + "mean_token_accuracy": 0.7714982256293297, + "num_tokens": 11738341.0, + "step": 935 + }, + { + "entropy": 1.124791868031025, + "epoch": 1.56, + "grad_norm": 0.47166070342063904, + "learning_rate": 0.00010116959064327487, + "loss": 1.1183, + "mean_token_accuracy": 0.7350871786475182, + "num_tokens": 11750883.0, + "step": 936 + }, + { + "entropy": 1.076091207563877, + "epoch": 1.5616666666666665, + "grad_norm": 0.5033275485038757, + "learning_rate": 0.00010105263157894738, + "loss": 1.0704, + "mean_token_accuracy": 0.7419762536883354, + "num_tokens": 11763453.0, + "step": 937 + }, + { + "entropy": 1.044884666800499, + "epoch": 1.5633333333333335, + "grad_norm": 0.381427139043808, + "learning_rate": 0.00010093567251461989, + "loss": 1.0389, + "mean_token_accuracy": 0.7595488056540489, + "num_tokens": 11775807.0, + "step": 938 + }, + { + "entropy": 1.0149907171726227, + "epoch": 1.565, + "grad_norm": 0.3665209114551544, + "learning_rate": 0.00010081871345029241, + "loss": 1.0111, + "mean_token_accuracy": 0.7635852620005608, + "num_tokens": 11788144.0, + "step": 939 + }, + { + "entropy": 0.9833221808075905, + "epoch": 1.5666666666666667, + "grad_norm": 0.3676835894584656, + "learning_rate": 0.00010070175438596492, + "loss": 0.9584, + "mean_token_accuracy": 0.7688944488763809, + "num_tokens": 11800736.0, + "step": 940 + }, + { + "entropy": 1.0265733674168587, + "epoch": 1.5683333333333334, + "grad_norm": 0.356206476688385, + "learning_rate": 0.00010058479532163743, + "loss": 1.0307, + "mean_token_accuracy": 0.7622543349862099, + "num_tokens": 11813252.0, + "step": 941 + }, + { + "entropy": 1.0316286012530327, + "epoch": 1.5699999999999998, + "grad_norm": 0.5001046061515808, + "learning_rate": 0.00010046783625730995, + "loss": 1.0018, + "mean_token_accuracy": 0.762552946805954, + "num_tokens": 11825972.0, + "step": 942 + }, + { + "entropy": 1.077604465186596, + "epoch": 1.5716666666666668, + "grad_norm": 0.4228179454803467, + "learning_rate": 0.00010035087719298246, + "loss": 1.0474, + "mean_token_accuracy": 0.7495718151330948, + "num_tokens": 11838741.0, + "step": 943 + }, + { + "entropy": 1.116953693330288, + "epoch": 1.5733333333333333, + "grad_norm": 0.35452526807785034, + "learning_rate": 0.00010023391812865497, + "loss": 1.109, + "mean_token_accuracy": 0.7296589389443398, + "num_tokens": 11851287.0, + "step": 944 + }, + { + "entropy": 1.0866753309965134, + "epoch": 1.575, + "grad_norm": 0.40109559893608093, + "learning_rate": 0.0001001169590643275, + "loss": 1.0577, + "mean_token_accuracy": 0.7466104477643967, + "num_tokens": 11864035.0, + "step": 945 + }, + { + "entropy": 1.0261227637529373, + "epoch": 1.5766666666666667, + "grad_norm": 0.6341944336891174, + "learning_rate": 0.0001, + "loss": 1.0223, + "mean_token_accuracy": 0.7552044317126274, + "num_tokens": 11876551.0, + "step": 946 + }, + { + "entropy": 1.0396640598773956, + "epoch": 1.5783333333333334, + "grad_norm": 0.387890487909317, + "learning_rate": 9.988304093567253e-05, + "loss": 1.0433, + "mean_token_accuracy": 0.7529490813612938, + "num_tokens": 11889267.0, + "step": 947 + }, + { + "entropy": 0.9809285327792168, + "epoch": 1.58, + "grad_norm": 0.3864838480949402, + "learning_rate": 9.976608187134502e-05, + "loss": 0.9764, + "mean_token_accuracy": 0.758466362953186, + "num_tokens": 11902019.0, + "step": 948 + }, + { + "entropy": 1.022968828678131, + "epoch": 1.5816666666666666, + "grad_norm": 0.4740675687789917, + "learning_rate": 9.964912280701755e-05, + "loss": 1.0368, + "mean_token_accuracy": 0.7580825462937355, + "num_tokens": 11914538.0, + "step": 949 + }, + { + "entropy": 1.0423406288027763, + "epoch": 1.5833333333333335, + "grad_norm": 0.46600988507270813, + "learning_rate": 9.953216374269007e-05, + "loss": 1.0365, + "mean_token_accuracy": 0.7558559775352478, + "num_tokens": 11926973.0, + "step": 950 + }, + { + "entropy": 1.0362046658992767, + "epoch": 1.585, + "grad_norm": 0.3281984329223633, + "learning_rate": 9.941520467836257e-05, + "loss": 1.0365, + "mean_token_accuracy": 0.7519606500864029, + "num_tokens": 11939555.0, + "step": 951 + }, + { + "entropy": 1.0146067589521408, + "epoch": 1.5866666666666667, + "grad_norm": 0.33721277117729187, + "learning_rate": 9.929824561403509e-05, + "loss": 1.0087, + "mean_token_accuracy": 0.7585414201021194, + "num_tokens": 11951842.0, + "step": 952 + }, + { + "entropy": 1.0265194848179817, + "epoch": 1.5883333333333334, + "grad_norm": 0.5035887956619263, + "learning_rate": 9.918128654970761e-05, + "loss": 1.0271, + "mean_token_accuracy": 0.7587100341916084, + "num_tokens": 11964140.0, + "step": 953 + }, + { + "entropy": 1.0434783324599266, + "epoch": 1.5899999999999999, + "grad_norm": 0.35144132375717163, + "learning_rate": 9.906432748538012e-05, + "loss": 1.015, + "mean_token_accuracy": 0.7533115297555923, + "num_tokens": 11976609.0, + "step": 954 + }, + { + "entropy": 0.9975638315081596, + "epoch": 1.5916666666666668, + "grad_norm": 0.35626283288002014, + "learning_rate": 9.894736842105263e-05, + "loss": 0.9785, + "mean_token_accuracy": 0.7682247906923294, + "num_tokens": 11989066.0, + "step": 955 + }, + { + "entropy": 1.0139445587992668, + "epoch": 1.5933333333333333, + "grad_norm": 0.34869149327278137, + "learning_rate": 9.883040935672515e-05, + "loss": 0.9703, + "mean_token_accuracy": 0.7643293812870979, + "num_tokens": 12001706.0, + "step": 956 + }, + { + "entropy": 1.0372228473424911, + "epoch": 1.595, + "grad_norm": 0.3563248813152313, + "learning_rate": 9.871345029239766e-05, + "loss": 1.0117, + "mean_token_accuracy": 0.7578230872750282, + "num_tokens": 12014447.0, + "step": 957 + }, + { + "entropy": 0.8905011862516403, + "epoch": 1.5966666666666667, + "grad_norm": 0.39937323331832886, + "learning_rate": 9.859649122807017e-05, + "loss": 0.8516, + "mean_token_accuracy": 0.7892890945076942, + "num_tokens": 12027001.0, + "step": 958 + }, + { + "entropy": 1.0956081077456474, + "epoch": 1.5983333333333334, + "grad_norm": 0.3379671275615692, + "learning_rate": 9.84795321637427e-05, + "loss": 1.1264, + "mean_token_accuracy": 0.7354054301977158, + "num_tokens": 12039455.0, + "step": 959 + }, + { + "entropy": 1.1044630855321884, + "epoch": 1.6, + "grad_norm": 0.4157054126262665, + "learning_rate": 9.83625730994152e-05, + "loss": 1.1178, + "mean_token_accuracy": 0.7352635785937309, + "num_tokens": 12052156.0, + "step": 960 + }, + { + "entropy": 0.989508643746376, + "epoch": 1.6016666666666666, + "grad_norm": 0.3482710123062134, + "learning_rate": 9.824561403508771e-05, + "loss": 1.0037, + "mean_token_accuracy": 0.7680085748434067, + "num_tokens": 12064784.0, + "step": 961 + }, + { + "entropy": 0.8837302699685097, + "epoch": 1.6033333333333335, + "grad_norm": 0.39792779088020325, + "learning_rate": 9.812865497076024e-05, + "loss": 0.8757, + "mean_token_accuracy": 0.794054351747036, + "num_tokens": 12077182.0, + "step": 962 + }, + { + "entropy": 1.075769916176796, + "epoch": 1.605, + "grad_norm": 0.4095625877380371, + "learning_rate": 9.801169590643276e-05, + "loss": 1.1094, + "mean_token_accuracy": 0.7338564768433571, + "num_tokens": 12089748.0, + "step": 963 + }, + { + "entropy": 1.0048549994826317, + "epoch": 1.6066666666666667, + "grad_norm": 0.5313544273376465, + "learning_rate": 9.789473684210527e-05, + "loss": 1.0222, + "mean_token_accuracy": 0.7637491151690483, + "num_tokens": 12102208.0, + "step": 964 + }, + { + "entropy": 0.9381846338510513, + "epoch": 1.6083333333333334, + "grad_norm": 0.7829982042312622, + "learning_rate": 9.777777777777778e-05, + "loss": 0.9526, + "mean_token_accuracy": 0.7717723697423935, + "num_tokens": 12115050.0, + "step": 965 + }, + { + "entropy": 0.988870695233345, + "epoch": 1.6099999999999999, + "grad_norm": 0.3945852220058441, + "learning_rate": 9.76608187134503e-05, + "loss": 1.0066, + "mean_token_accuracy": 0.7594099268317223, + "num_tokens": 12127483.0, + "step": 966 + }, + { + "entropy": 0.8450613245368004, + "epoch": 1.6116666666666668, + "grad_norm": 0.7763268351554871, + "learning_rate": 9.754385964912281e-05, + "loss": 0.8352, + "mean_token_accuracy": 0.8030087202787399, + "num_tokens": 12140018.0, + "step": 967 + }, + { + "entropy": 0.9965870380401611, + "epoch": 1.6133333333333333, + "grad_norm": 0.3498965799808502, + "learning_rate": 9.742690058479532e-05, + "loss": 0.9648, + "mean_token_accuracy": 0.7675676867365837, + "num_tokens": 12152304.0, + "step": 968 + }, + { + "entropy": 0.8886230438947678, + "epoch": 1.615, + "grad_norm": 0.35514211654663086, + "learning_rate": 9.730994152046784e-05, + "loss": 0.8499, + "mean_token_accuracy": 0.7898500040173531, + "num_tokens": 12165239.0, + "step": 969 + }, + { + "entropy": 1.0475463792681694, + "epoch": 1.6166666666666667, + "grad_norm": 0.32757648825645447, + "learning_rate": 9.719298245614035e-05, + "loss": 1.0377, + "mean_token_accuracy": 0.7567182034254074, + "num_tokens": 12177732.0, + "step": 970 + }, + { + "entropy": 1.077672116458416, + "epoch": 1.6183333333333332, + "grad_norm": 0.3843959867954254, + "learning_rate": 9.707602339181286e-05, + "loss": 1.0523, + "mean_token_accuracy": 0.7472524493932724, + "num_tokens": 12190337.0, + "step": 971 + }, + { + "entropy": 0.96406065300107, + "epoch": 1.62, + "grad_norm": 0.4375675320625305, + "learning_rate": 9.695906432748539e-05, + "loss": 0.9323, + "mean_token_accuracy": 0.7749380320310593, + "num_tokens": 12202681.0, + "step": 972 + }, + { + "entropy": 0.9428460970520973, + "epoch": 1.6216666666666666, + "grad_norm": 0.42265263199806213, + "learning_rate": 9.68421052631579e-05, + "loss": 0.9253, + "mean_token_accuracy": 0.7729097902774811, + "num_tokens": 12215372.0, + "step": 973 + }, + { + "entropy": 0.8881849274039268, + "epoch": 1.6233333333333333, + "grad_norm": 0.3061010241508484, + "learning_rate": 9.672514619883042e-05, + "loss": 0.8821, + "mean_token_accuracy": 0.7929625511169434, + "num_tokens": 12227794.0, + "step": 974 + }, + { + "entropy": 1.2201930955052376, + "epoch": 1.625, + "grad_norm": 0.4130735695362091, + "learning_rate": 9.660818713450293e-05, + "loss": 1.202, + "mean_token_accuracy": 0.7246149331331253, + "num_tokens": 12240350.0, + "step": 975 + }, + { + "entropy": 0.9785461276769638, + "epoch": 1.6266666666666667, + "grad_norm": 0.37417230010032654, + "learning_rate": 9.649122807017544e-05, + "loss": 0.9684, + "mean_token_accuracy": 0.7681845799088478, + "num_tokens": 12252809.0, + "step": 976 + }, + { + "entropy": 0.9370051473379135, + "epoch": 1.6283333333333334, + "grad_norm": 0.4027501344680786, + "learning_rate": 9.637426900584796e-05, + "loss": 0.9468, + "mean_token_accuracy": 0.7657081261277199, + "num_tokens": 12265800.0, + "step": 977 + }, + { + "entropy": 0.8598027527332306, + "epoch": 1.63, + "grad_norm": 0.38356924057006836, + "learning_rate": 9.625730994152047e-05, + "loss": 0.8497, + "mean_token_accuracy": 0.7939843013882637, + "num_tokens": 12278392.0, + "step": 978 + }, + { + "entropy": 0.9169039651751518, + "epoch": 1.6316666666666668, + "grad_norm": 0.3893364369869232, + "learning_rate": 9.614035087719298e-05, + "loss": 0.947, + "mean_token_accuracy": 0.7715698778629303, + "num_tokens": 12290956.0, + "step": 979 + }, + { + "entropy": 1.0241443440318108, + "epoch": 1.6333333333333333, + "grad_norm": 0.3506811559200287, + "learning_rate": 9.60233918128655e-05, + "loss": 1.0449, + "mean_token_accuracy": 0.752354621887207, + "num_tokens": 12303340.0, + "step": 980 + }, + { + "entropy": 1.1178838685154915, + "epoch": 1.635, + "grad_norm": 0.42362692952156067, + "learning_rate": 9.590643274853801e-05, + "loss": 1.1287, + "mean_token_accuracy": 0.7328075543045998, + "num_tokens": 12315539.0, + "step": 981 + }, + { + "entropy": 0.9596805796027184, + "epoch": 1.6366666666666667, + "grad_norm": 0.3150109648704529, + "learning_rate": 9.578947368421052e-05, + "loss": 0.9699, + "mean_token_accuracy": 0.7640129998326302, + "num_tokens": 12328171.0, + "step": 982 + }, + { + "entropy": 0.8891875892877579, + "epoch": 1.6383333333333332, + "grad_norm": 0.38294193148612976, + "learning_rate": 9.567251461988305e-05, + "loss": 0.8698, + "mean_token_accuracy": 0.7974608764052391, + "num_tokens": 12340881.0, + "step": 983 + }, + { + "entropy": 1.0465619787573814, + "epoch": 1.6400000000000001, + "grad_norm": 0.40911293029785156, + "learning_rate": 9.555555555555557e-05, + "loss": 1.0587, + "mean_token_accuracy": 0.7567858397960663, + "num_tokens": 12353378.0, + "step": 984 + }, + { + "entropy": 1.0272936820983887, + "epoch": 1.6416666666666666, + "grad_norm": 0.3643612861633301, + "learning_rate": 9.543859649122808e-05, + "loss": 1.0141, + "mean_token_accuracy": 0.7541831061244011, + "num_tokens": 12365690.0, + "step": 985 + }, + { + "entropy": 1.1167894005775452, + "epoch": 1.6433333333333333, + "grad_norm": 0.4006604850292206, + "learning_rate": 9.532163742690059e-05, + "loss": 1.1211, + "mean_token_accuracy": 0.7414597794413567, + "num_tokens": 12378138.0, + "step": 986 + }, + { + "entropy": 0.9925767779350281, + "epoch": 1.645, + "grad_norm": 0.390142023563385, + "learning_rate": 9.520467836257311e-05, + "loss": 0.9605, + "mean_token_accuracy": 0.768660306930542, + "num_tokens": 12390642.0, + "step": 987 + }, + { + "entropy": 0.9583388492465019, + "epoch": 1.6466666666666665, + "grad_norm": 0.36772555112838745, + "learning_rate": 9.508771929824562e-05, + "loss": 0.9473, + "mean_token_accuracy": 0.7698638662695885, + "num_tokens": 12403040.0, + "step": 988 + }, + { + "entropy": 1.0084424540400505, + "epoch": 1.6483333333333334, + "grad_norm": 0.3564258813858032, + "learning_rate": 9.497076023391813e-05, + "loss": 1.0044, + "mean_token_accuracy": 0.7635295242071152, + "num_tokens": 12415673.0, + "step": 989 + }, + { + "entropy": 0.9867688044905663, + "epoch": 1.65, + "grad_norm": 0.3392479717731476, + "learning_rate": 9.485380116959065e-05, + "loss": 1.0133, + "mean_token_accuracy": 0.7665977254509926, + "num_tokens": 12428197.0, + "step": 990 + }, + { + "entropy": 1.0522988960146904, + "epoch": 1.6516666666666666, + "grad_norm": 0.40604665875434875, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9979, + "mean_token_accuracy": 0.7608874589204788, + "num_tokens": 12440702.0, + "step": 991 + }, + { + "entropy": 1.0473966524004936, + "epoch": 1.6533333333333333, + "grad_norm": 0.3371841311454773, + "learning_rate": 9.461988304093567e-05, + "loss": 1.0415, + "mean_token_accuracy": 0.75101687759161, + "num_tokens": 12453412.0, + "step": 992 + }, + { + "entropy": 0.9661148190498352, + "epoch": 1.655, + "grad_norm": 0.5349663496017456, + "learning_rate": 9.45029239766082e-05, + "loss": 0.9644, + "mean_token_accuracy": 0.7739033624529839, + "num_tokens": 12465842.0, + "step": 993 + }, + { + "entropy": 1.040437489748001, + "epoch": 1.6566666666666667, + "grad_norm": 0.3611598610877991, + "learning_rate": 9.438596491228072e-05, + "loss": 1.0096, + "mean_token_accuracy": 0.7568608149886131, + "num_tokens": 12478560.0, + "step": 994 + }, + { + "entropy": 0.9655180498957634, + "epoch": 1.6583333333333332, + "grad_norm": 0.3291683793067932, + "learning_rate": 9.426900584795321e-05, + "loss": 0.9887, + "mean_token_accuracy": 0.7650616839528084, + "num_tokens": 12490947.0, + "step": 995 + }, + { + "entropy": 0.9577772319316864, + "epoch": 1.6600000000000001, + "grad_norm": 0.463991641998291, + "learning_rate": 9.415204678362574e-05, + "loss": 0.9601, + "mean_token_accuracy": 0.769118033349514, + "num_tokens": 12503752.0, + "step": 996 + }, + { + "entropy": 1.115663342177868, + "epoch": 1.6616666666666666, + "grad_norm": 0.4828460216522217, + "learning_rate": 9.403508771929826e-05, + "loss": 1.1281, + "mean_token_accuracy": 0.7351868525147438, + "num_tokens": 12516063.0, + "step": 997 + }, + { + "entropy": 1.104174591600895, + "epoch": 1.6633333333333333, + "grad_norm": 0.3741133511066437, + "learning_rate": 9.391812865497076e-05, + "loss": 1.114, + "mean_token_accuracy": 0.7381681129336357, + "num_tokens": 12528649.0, + "step": 998 + }, + { + "entropy": 1.0054408684372902, + "epoch": 1.665, + "grad_norm": 0.34373944997787476, + "learning_rate": 9.380116959064328e-05, + "loss": 1.0193, + "mean_token_accuracy": 0.7606060951948166, + "num_tokens": 12541056.0, + "step": 999 + }, + { + "entropy": 1.148328110575676, + "epoch": 1.6666666666666665, + "grad_norm": 0.32720351219177246, + "learning_rate": 9.36842105263158e-05, + "loss": 1.1651, + "mean_token_accuracy": 0.7306461036205292, + "num_tokens": 12553340.0, + "step": 1000 + }, + { + "entropy": 1.0338936150074005, + "epoch": 1.6683333333333334, + "grad_norm": 0.33580687642097473, + "learning_rate": 9.35672514619883e-05, + "loss": 1.0078, + "mean_token_accuracy": 0.7570296004414558, + "num_tokens": 12565983.0, + "step": 1001 + }, + { + "entropy": 0.877336673438549, + "epoch": 1.67, + "grad_norm": 0.36133208870887756, + "learning_rate": 9.345029239766082e-05, + "loss": 0.8854, + "mean_token_accuracy": 0.7905719429254532, + "num_tokens": 12578466.0, + "step": 1002 + }, + { + "entropy": 1.157466672360897, + "epoch": 1.6716666666666666, + "grad_norm": 0.4128221571445465, + "learning_rate": 9.333333333333334e-05, + "loss": 1.1676, + "mean_token_accuracy": 0.726306177675724, + "num_tokens": 12591221.0, + "step": 1003 + }, + { + "entropy": 1.0364705994725227, + "epoch": 1.6733333333333333, + "grad_norm": 0.35205385088920593, + "learning_rate": 9.321637426900585e-05, + "loss": 1.0262, + "mean_token_accuracy": 0.7624763324856758, + "num_tokens": 12603804.0, + "step": 1004 + }, + { + "entropy": 1.1215010583400726, + "epoch": 1.675, + "grad_norm": 0.42232760787010193, + "learning_rate": 9.309941520467836e-05, + "loss": 1.1199, + "mean_token_accuracy": 0.736581914126873, + "num_tokens": 12616411.0, + "step": 1005 + }, + { + "entropy": 0.9427232444286346, + "epoch": 1.6766666666666667, + "grad_norm": 0.34733209013938904, + "learning_rate": 9.298245614035089e-05, + "loss": 0.9315, + "mean_token_accuracy": 0.7790528759360313, + "num_tokens": 12628828.0, + "step": 1006 + }, + { + "entropy": 0.9177378788590431, + "epoch": 1.6783333333333332, + "grad_norm": 0.35805800557136536, + "learning_rate": 9.28654970760234e-05, + "loss": 0.9019, + "mean_token_accuracy": 0.7822014093399048, + "num_tokens": 12641546.0, + "step": 1007 + }, + { + "entropy": 0.9339721202850342, + "epoch": 1.6800000000000002, + "grad_norm": 0.3551904559135437, + "learning_rate": 9.27485380116959e-05, + "loss": 0.9064, + "mean_token_accuracy": 0.7829243242740631, + "num_tokens": 12654056.0, + "step": 1008 + }, + { + "entropy": 1.0404436141252518, + "epoch": 1.6816666666666666, + "grad_norm": 0.3631914556026459, + "learning_rate": 9.263157894736843e-05, + "loss": 0.9982, + "mean_token_accuracy": 0.7624745666980743, + "num_tokens": 12666398.0, + "step": 1009 + }, + { + "entropy": 0.8712766841053963, + "epoch": 1.6833333333333333, + "grad_norm": 0.33432894945144653, + "learning_rate": 9.251461988304094e-05, + "loss": 0.8554, + "mean_token_accuracy": 0.7947898954153061, + "num_tokens": 12679178.0, + "step": 1010 + }, + { + "entropy": 1.0278621464967728, + "epoch": 1.685, + "grad_norm": 0.33881428837776184, + "learning_rate": 9.239766081871345e-05, + "loss": 1.0208, + "mean_token_accuracy": 0.7516693696379662, + "num_tokens": 12691644.0, + "step": 1011 + }, + { + "entropy": 1.0649035423994064, + "epoch": 1.6866666666666665, + "grad_norm": 0.37799784541130066, + "learning_rate": 9.228070175438597e-05, + "loss": 1.0772, + "mean_token_accuracy": 0.7425209805369377, + "num_tokens": 12704296.0, + "step": 1012 + }, + { + "entropy": 0.9589243307709694, + "epoch": 1.6883333333333335, + "grad_norm": 0.3711475431919098, + "learning_rate": 9.216374269005849e-05, + "loss": 0.9257, + "mean_token_accuracy": 0.7744899317622185, + "num_tokens": 12716590.0, + "step": 1013 + }, + { + "entropy": 1.00307896733284, + "epoch": 1.69, + "grad_norm": 0.4023561179637909, + "learning_rate": 9.2046783625731e-05, + "loss": 1.0033, + "mean_token_accuracy": 0.758160911500454, + "num_tokens": 12728910.0, + "step": 1014 + }, + { + "entropy": 0.9863514825701714, + "epoch": 1.6916666666666667, + "grad_norm": 0.3267923891544342, + "learning_rate": 9.192982456140351e-05, + "loss": 0.9921, + "mean_token_accuracy": 0.7703479081392288, + "num_tokens": 12741483.0, + "step": 1015 + }, + { + "entropy": 0.9462117999792099, + "epoch": 1.6933333333333334, + "grad_norm": 0.5054460167884827, + "learning_rate": 9.181286549707603e-05, + "loss": 0.9359, + "mean_token_accuracy": 0.778098352253437, + "num_tokens": 12754055.0, + "step": 1016 + }, + { + "entropy": 1.0157199203968048, + "epoch": 1.6949999999999998, + "grad_norm": 0.4718424379825592, + "learning_rate": 9.169590643274854e-05, + "loss": 1.0238, + "mean_token_accuracy": 0.7569868713617325, + "num_tokens": 12766637.0, + "step": 1017 + }, + { + "entropy": 1.164522334933281, + "epoch": 1.6966666666666668, + "grad_norm": 0.41534754633903503, + "learning_rate": 9.157894736842105e-05, + "loss": 1.1519, + "mean_token_accuracy": 0.7301210761070251, + "num_tokens": 12779298.0, + "step": 1018 + }, + { + "entropy": 0.9943608194589615, + "epoch": 1.6983333333333333, + "grad_norm": 0.3897961676120758, + "learning_rate": 9.146198830409358e-05, + "loss": 0.9711, + "mean_token_accuracy": 0.7657627090811729, + "num_tokens": 12792031.0, + "step": 1019 + }, + { + "entropy": 1.0494454652071, + "epoch": 1.7, + "grad_norm": 0.37525609135627747, + "learning_rate": 9.134502923976609e-05, + "loss": 1.0261, + "mean_token_accuracy": 0.7546698749065399, + "num_tokens": 12804790.0, + "step": 1020 + }, + { + "entropy": 1.0734568759799004, + "epoch": 1.7016666666666667, + "grad_norm": 0.4507957994937897, + "learning_rate": 9.12280701754386e-05, + "loss": 1.0734, + "mean_token_accuracy": 0.7483791783452034, + "num_tokens": 12817417.0, + "step": 1021 + }, + { + "entropy": 1.0368591323494911, + "epoch": 1.7033333333333334, + "grad_norm": 0.36012768745422363, + "learning_rate": 9.111111111111112e-05, + "loss": 1.0285, + "mean_token_accuracy": 0.7521537616848946, + "num_tokens": 12829969.0, + "step": 1022 + }, + { + "entropy": 1.1286500170826912, + "epoch": 1.705, + "grad_norm": 0.3422996401786804, + "learning_rate": 9.099415204678363e-05, + "loss": 1.1183, + "mean_token_accuracy": 0.7346495315432549, + "num_tokens": 12842365.0, + "step": 1023 + }, + { + "entropy": 1.2009264305233955, + "epoch": 1.7066666666666666, + "grad_norm": 0.45700791478157043, + "learning_rate": 9.087719298245615e-05, + "loss": 1.2134, + "mean_token_accuracy": 0.7141278833150864, + "num_tokens": 12855028.0, + "step": 1024 + }, + { + "entropy": 1.1235537379980087, + "epoch": 1.7083333333333335, + "grad_norm": 0.39127787947654724, + "learning_rate": 9.076023391812866e-05, + "loss": 1.0881, + "mean_token_accuracy": 0.7371407151222229, + "num_tokens": 12867632.0, + "step": 1025 + }, + { + "entropy": 1.0213222280144691, + "epoch": 1.71, + "grad_norm": 0.6702572107315063, + "learning_rate": 9.064327485380117e-05, + "loss": 0.9986, + "mean_token_accuracy": 0.7571739181876183, + "num_tokens": 12880333.0, + "step": 1026 + }, + { + "entropy": 1.0406272113323212, + "epoch": 1.7116666666666667, + "grad_norm": 0.5308269262313843, + "learning_rate": 9.052631578947369e-05, + "loss": 1.0334, + "mean_token_accuracy": 0.7501184120774269, + "num_tokens": 12892992.0, + "step": 1027 + }, + { + "entropy": 1.0075775384902954, + "epoch": 1.7133333333333334, + "grad_norm": 0.41957348585128784, + "learning_rate": 9.04093567251462e-05, + "loss": 1.0008, + "mean_token_accuracy": 0.7609769701957703, + "num_tokens": 12905703.0, + "step": 1028 + }, + { + "entropy": 0.9897546097636223, + "epoch": 1.7149999999999999, + "grad_norm": 0.35864391922950745, + "learning_rate": 9.029239766081871e-05, + "loss": 1.0039, + "mean_token_accuracy": 0.7623355314135551, + "num_tokens": 12918300.0, + "step": 1029 + }, + { + "entropy": 0.9119668006896973, + "epoch": 1.7166666666666668, + "grad_norm": 0.5414547324180603, + "learning_rate": 9.017543859649123e-05, + "loss": 0.9076, + "mean_token_accuracy": 0.7755086645483971, + "num_tokens": 12930883.0, + "step": 1030 + }, + { + "entropy": 0.9837629571557045, + "epoch": 1.7183333333333333, + "grad_norm": 0.4744820296764374, + "learning_rate": 9.005847953216374e-05, + "loss": 0.9837, + "mean_token_accuracy": 0.7667191326618195, + "num_tokens": 12943412.0, + "step": 1031 + }, + { + "entropy": 1.1909087374806404, + "epoch": 1.72, + "grad_norm": 0.31805115938186646, + "learning_rate": 8.994152046783625e-05, + "loss": 1.2015, + "mean_token_accuracy": 0.7204370275139809, + "num_tokens": 12955839.0, + "step": 1032 + }, + { + "entropy": 1.009498618543148, + "epoch": 1.7216666666666667, + "grad_norm": 0.47378993034362793, + "learning_rate": 8.982456140350878e-05, + "loss": 1.033, + "mean_token_accuracy": 0.7578397020697594, + "num_tokens": 12968328.0, + "step": 1033 + }, + { + "entropy": 1.0100986510515213, + "epoch": 1.7233333333333334, + "grad_norm": 0.3783584535121918, + "learning_rate": 8.97076023391813e-05, + "loss": 0.978, + "mean_token_accuracy": 0.7699412554502487, + "num_tokens": 12981120.0, + "step": 1034 + }, + { + "entropy": 0.9874220192432404, + "epoch": 1.725, + "grad_norm": 0.3560539484024048, + "learning_rate": 8.959064327485381e-05, + "loss": 0.9914, + "mean_token_accuracy": 0.7623036429286003, + "num_tokens": 12994015.0, + "step": 1035 + }, + { + "entropy": 1.0502874776721, + "epoch": 1.7266666666666666, + "grad_norm": 0.3287025988101959, + "learning_rate": 8.947368421052632e-05, + "loss": 1.0022, + "mean_token_accuracy": 0.7620445415377617, + "num_tokens": 13006422.0, + "step": 1036 + }, + { + "entropy": 1.1608180850744247, + "epoch": 1.7283333333333335, + "grad_norm": 0.39951977133750916, + "learning_rate": 8.935672514619884e-05, + "loss": 1.1191, + "mean_token_accuracy": 0.7336246818304062, + "num_tokens": 13018958.0, + "step": 1037 + }, + { + "entropy": 1.0154462233185768, + "epoch": 1.73, + "grad_norm": 0.3510722517967224, + "learning_rate": 8.923976608187135e-05, + "loss": 0.9876, + "mean_token_accuracy": 0.7671427950263023, + "num_tokens": 13031565.0, + "step": 1038 + }, + { + "entropy": 1.1321008205413818, + "epoch": 1.7316666666666667, + "grad_norm": 0.34480804204940796, + "learning_rate": 8.912280701754386e-05, + "loss": 1.1336, + "mean_token_accuracy": 0.7281079292297363, + "num_tokens": 13043941.0, + "step": 1039 + }, + { + "entropy": 1.0491846576333046, + "epoch": 1.7333333333333334, + "grad_norm": 0.35498106479644775, + "learning_rate": 8.900584795321638e-05, + "loss": 1.0446, + "mean_token_accuracy": 0.752373032271862, + "num_tokens": 13056405.0, + "step": 1040 + }, + { + "entropy": 1.184033825993538, + "epoch": 1.7349999999999999, + "grad_norm": 0.37634584307670593, + "learning_rate": 8.888888888888889e-05, + "loss": 1.1856, + "mean_token_accuracy": 0.7222929745912552, + "num_tokens": 13068870.0, + "step": 1041 + }, + { + "entropy": 0.9711630195379257, + "epoch": 1.7366666666666668, + "grad_norm": 0.40814366936683655, + "learning_rate": 8.87719298245614e-05, + "loss": 0.9579, + "mean_token_accuracy": 0.7746291309595108, + "num_tokens": 13081147.0, + "step": 1042 + }, + { + "entropy": 0.9708935245871544, + "epoch": 1.7383333333333333, + "grad_norm": 0.3281678557395935, + "learning_rate": 8.865497076023393e-05, + "loss": 0.9806, + "mean_token_accuracy": 0.7686115130782127, + "num_tokens": 13093666.0, + "step": 1043 + }, + { + "entropy": 1.0442078933119774, + "epoch": 1.74, + "grad_norm": 0.4168369174003601, + "learning_rate": 8.853801169590645e-05, + "loss": 1.0271, + "mean_token_accuracy": 0.7528911083936691, + "num_tokens": 13106165.0, + "step": 1044 + }, + { + "entropy": 1.1267547607421875, + "epoch": 1.7416666666666667, + "grad_norm": 0.47203320264816284, + "learning_rate": 8.842105263157894e-05, + "loss": 1.1349, + "mean_token_accuracy": 0.7340073138475418, + "num_tokens": 13118860.0, + "step": 1045 + }, + { + "entropy": 0.9274444133043289, + "epoch": 1.7433333333333332, + "grad_norm": 0.5290305614471436, + "learning_rate": 8.830409356725147e-05, + "loss": 0.9147, + "mean_token_accuracy": 0.7824391052126884, + "num_tokens": 13131405.0, + "step": 1046 + }, + { + "entropy": 0.9275342971086502, + "epoch": 1.745, + "grad_norm": 0.5540999174118042, + "learning_rate": 8.818713450292399e-05, + "loss": 0.9224, + "mean_token_accuracy": 0.7760967463254929, + "num_tokens": 13143980.0, + "step": 1047 + }, + { + "entropy": 1.041547805070877, + "epoch": 1.7466666666666666, + "grad_norm": 0.4107725918292999, + "learning_rate": 8.807017543859649e-05, + "loss": 1.0375, + "mean_token_accuracy": 0.7471731752157211, + "num_tokens": 13156625.0, + "step": 1048 + }, + { + "entropy": 1.0120449364185333, + "epoch": 1.7483333333333333, + "grad_norm": 0.3597240149974823, + "learning_rate": 8.795321637426901e-05, + "loss": 1.0087, + "mean_token_accuracy": 0.7546082735061646, + "num_tokens": 13169318.0, + "step": 1049 + }, + { + "entropy": 0.9050641730427742, + "epoch": 1.75, + "grad_norm": 0.36369770765304565, + "learning_rate": 8.783625730994153e-05, + "loss": 0.893, + "mean_token_accuracy": 0.7888905927538872, + "num_tokens": 13181903.0, + "step": 1050 + }, + { + "entropy": 1.096294365823269, + "epoch": 1.7516666666666667, + "grad_norm": 0.3386310040950775, + "learning_rate": 8.771929824561403e-05, + "loss": 1.0779, + "mean_token_accuracy": 0.746938169002533, + "num_tokens": 13194482.0, + "step": 1051 + }, + { + "entropy": 0.9471368938684464, + "epoch": 1.7533333333333334, + "grad_norm": 0.3317272961139679, + "learning_rate": 8.760233918128655e-05, + "loss": 0.9503, + "mean_token_accuracy": 0.7712305337190628, + "num_tokens": 13206825.0, + "step": 1052 + }, + { + "entropy": 0.9653787389397621, + "epoch": 1.755, + "grad_norm": 0.3206188678741455, + "learning_rate": 8.748538011695907e-05, + "loss": 0.9591, + "mean_token_accuracy": 0.776692196726799, + "num_tokens": 13219412.0, + "step": 1053 + }, + { + "entropy": 0.9689168483018875, + "epoch": 1.7566666666666668, + "grad_norm": 0.311023473739624, + "learning_rate": 8.736842105263158e-05, + "loss": 0.9732, + "mean_token_accuracy": 0.7670754492282867, + "num_tokens": 13232192.0, + "step": 1054 + }, + { + "entropy": 1.0213828086853027, + "epoch": 1.7583333333333333, + "grad_norm": 0.3334166705608368, + "learning_rate": 8.72514619883041e-05, + "loss": 1.0207, + "mean_token_accuracy": 0.7589734867215157, + "num_tokens": 13244638.0, + "step": 1055 + }, + { + "entropy": 1.1304619312286377, + "epoch": 1.76, + "grad_norm": 0.3724319636821747, + "learning_rate": 8.713450292397662e-05, + "loss": 1.146, + "mean_token_accuracy": 0.725090965628624, + "num_tokens": 13257030.0, + "step": 1056 + }, + { + "entropy": 1.1443024575710297, + "epoch": 1.7616666666666667, + "grad_norm": 0.3518555760383606, + "learning_rate": 8.701754385964913e-05, + "loss": 1.1252, + "mean_token_accuracy": 0.7382890284061432, + "num_tokens": 13269515.0, + "step": 1057 + }, + { + "entropy": 1.135504774749279, + "epoch": 1.7633333333333332, + "grad_norm": 0.38220056891441345, + "learning_rate": 8.690058479532164e-05, + "loss": 1.1137, + "mean_token_accuracy": 0.7388380318880081, + "num_tokens": 13282187.0, + "step": 1058 + }, + { + "entropy": 1.0930515304207802, + "epoch": 1.7650000000000001, + "grad_norm": 0.42755579948425293, + "learning_rate": 8.678362573099416e-05, + "loss": 1.0558, + "mean_token_accuracy": 0.7507267519831657, + "num_tokens": 13294816.0, + "step": 1059 + }, + { + "entropy": 1.0393217131495476, + "epoch": 1.7666666666666666, + "grad_norm": 0.34985482692718506, + "learning_rate": 8.666666666666667e-05, + "loss": 1.0237, + "mean_token_accuracy": 0.7579164057970047, + "num_tokens": 13307462.0, + "step": 1060 + }, + { + "entropy": 1.183841995894909, + "epoch": 1.7683333333333333, + "grad_norm": 0.3967154324054718, + "learning_rate": 8.654970760233918e-05, + "loss": 1.1676, + "mean_token_accuracy": 0.7235167846083641, + "num_tokens": 13319817.0, + "step": 1061 + }, + { + "entropy": 1.0351843312382698, + "epoch": 1.77, + "grad_norm": 0.4487072825431824, + "learning_rate": 8.64327485380117e-05, + "loss": 1.0107, + "mean_token_accuracy": 0.7641036361455917, + "num_tokens": 13332520.0, + "step": 1062 + }, + { + "entropy": 1.0560117810964584, + "epoch": 1.7716666666666665, + "grad_norm": 0.3373064398765564, + "learning_rate": 8.631578947368421e-05, + "loss": 1.0522, + "mean_token_accuracy": 0.7510386854410172, + "num_tokens": 13345292.0, + "step": 1063 + }, + { + "entropy": 0.826760470867157, + "epoch": 1.7733333333333334, + "grad_norm": 0.33728596568107605, + "learning_rate": 8.619883040935673e-05, + "loss": 0.8244, + "mean_token_accuracy": 0.8044012635946274, + "num_tokens": 13357697.0, + "step": 1064 + }, + { + "entropy": 1.1935306042432785, + "epoch": 1.775, + "grad_norm": 0.519767701625824, + "learning_rate": 8.608187134502924e-05, + "loss": 1.2138, + "mean_token_accuracy": 0.7204710319638252, + "num_tokens": 13370108.0, + "step": 1065 + }, + { + "entropy": 0.9616173505783081, + "epoch": 1.7766666666666666, + "grad_norm": 0.3920440375804901, + "learning_rate": 8.596491228070177e-05, + "loss": 0.9605, + "mean_token_accuracy": 0.7691715583205223, + "num_tokens": 13382572.0, + "step": 1066 + }, + { + "entropy": 0.9955972135066986, + "epoch": 1.7783333333333333, + "grad_norm": 0.31550100445747375, + "learning_rate": 8.584795321637428e-05, + "loss": 0.9734, + "mean_token_accuracy": 0.7609797418117523, + "num_tokens": 13395370.0, + "step": 1067 + }, + { + "entropy": 0.9564253985881805, + "epoch": 1.78, + "grad_norm": 0.49894317984580994, + "learning_rate": 8.573099415204678e-05, + "loss": 0.9603, + "mean_token_accuracy": 0.768037311732769, + "num_tokens": 13407807.0, + "step": 1068 + }, + { + "entropy": 0.9421810433268547, + "epoch": 1.7816666666666667, + "grad_norm": 0.8620859980583191, + "learning_rate": 8.561403508771931e-05, + "loss": 0.9432, + "mean_token_accuracy": 0.7720376253128052, + "num_tokens": 13420682.0, + "step": 1069 + }, + { + "entropy": 0.9941176548600197, + "epoch": 1.7833333333333332, + "grad_norm": 0.32992666959762573, + "learning_rate": 8.549707602339182e-05, + "loss": 0.9752, + "mean_token_accuracy": 0.7662701159715652, + "num_tokens": 13433310.0, + "step": 1070 + }, + { + "entropy": 0.9855979382991791, + "epoch": 1.7850000000000001, + "grad_norm": 0.8007895946502686, + "learning_rate": 8.538011695906433e-05, + "loss": 0.9685, + "mean_token_accuracy": 0.76729516685009, + "num_tokens": 13445953.0, + "step": 1071 + }, + { + "entropy": 1.029094435274601, + "epoch": 1.7866666666666666, + "grad_norm": 0.7585043907165527, + "learning_rate": 8.526315789473685e-05, + "loss": 1.041, + "mean_token_accuracy": 0.7518841326236725, + "num_tokens": 13458298.0, + "step": 1072 + }, + { + "entropy": 1.072975106537342, + "epoch": 1.7883333333333333, + "grad_norm": 0.35254842042922974, + "learning_rate": 8.514619883040936e-05, + "loss": 1.0819, + "mean_token_accuracy": 0.7426500543951988, + "num_tokens": 13470965.0, + "step": 1073 + }, + { + "entropy": 0.9032114669680595, + "epoch": 1.79, + "grad_norm": 0.3355119824409485, + "learning_rate": 8.502923976608188e-05, + "loss": 0.9127, + "mean_token_accuracy": 0.7799766734242439, + "num_tokens": 13483518.0, + "step": 1074 + }, + { + "entropy": 0.9452763050794601, + "epoch": 1.7916666666666665, + "grad_norm": 0.7401320338249207, + "learning_rate": 8.491228070175439e-05, + "loss": 0.9524, + "mean_token_accuracy": 0.773579441010952, + "num_tokens": 13496222.0, + "step": 1075 + }, + { + "entropy": 1.1635581478476524, + "epoch": 1.7933333333333334, + "grad_norm": 0.39469605684280396, + "learning_rate": 8.47953216374269e-05, + "loss": 1.1658, + "mean_token_accuracy": 0.7263970300555229, + "num_tokens": 13508916.0, + "step": 1076 + }, + { + "entropy": 0.9113326743245125, + "epoch": 1.795, + "grad_norm": 0.32929447293281555, + "learning_rate": 8.467836257309942e-05, + "loss": 0.8863, + "mean_token_accuracy": 0.7842982038855553, + "num_tokens": 13521457.0, + "step": 1077 + }, + { + "entropy": 1.1759463623166084, + "epoch": 1.7966666666666666, + "grad_norm": 0.40519341826438904, + "learning_rate": 8.456140350877193e-05, + "loss": 1.1794, + "mean_token_accuracy": 0.7177974060177803, + "num_tokens": 13534057.0, + "step": 1078 + }, + { + "entropy": 1.0619780719280243, + "epoch": 1.7983333333333333, + "grad_norm": 0.4279235601425171, + "learning_rate": 8.444444444444444e-05, + "loss": 1.071, + "mean_token_accuracy": 0.7492412179708481, + "num_tokens": 13546665.0, + "step": 1079 + }, + { + "entropy": 1.0635966658592224, + "epoch": 1.8, + "grad_norm": 0.7339469790458679, + "learning_rate": 8.432748538011697e-05, + "loss": 1.0278, + "mean_token_accuracy": 0.7510287240147591, + "num_tokens": 13559479.0, + "step": 1080 + }, + { + "epoch": 1.8, + "eval_entropy": 1.1268045465048195, + "eval_loss": 1.1247466802597046, + "eval_mean_token_accuracy": 0.7332956134105196, + "eval_num_tokens": 13559479.0, + "eval_runtime": 2667.8789, + "eval_samples_per_second": 1.875, + "eval_steps_per_second": 0.937, + "step": 1080 + } + ], + "logging_steps": 1, + "max_steps": 1800, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 360, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.822611035064173e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}