diff --git "a/checkpoint-1800/trainer_state.json" "b/checkpoint-1800/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1800/trainer_state.json" @@ -0,0 +1,18089 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 360, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.1883099973201752, + "epoch": 0.0016666666666666668, + "grad_norm": 0.31513479351997375, + "learning_rate": 0.0, + "loss": 2.0341, + "mean_token_accuracy": 0.5869412161409855, + "num_tokens": 12531.0, + "step": 1 + }, + { + "entropy": 1.2275140136480331, + "epoch": 0.0033333333333333335, + "grad_norm": 0.3110893964767456, + "learning_rate": 2.2222222222222225e-06, + "loss": 2.0736, + "mean_token_accuracy": 0.5821192935109138, + "num_tokens": 25080.0, + "step": 2 + }, + { + "entropy": 1.1326133534312248, + "epoch": 0.005, + "grad_norm": 0.32383376359939575, + "learning_rate": 4.444444444444445e-06, + "loss": 1.9606, + "mean_token_accuracy": 0.6036411970853806, + "num_tokens": 37694.0, + "step": 3 + }, + { + "entropy": 1.2363431453704834, + "epoch": 0.006666666666666667, + "grad_norm": 0.3201318383216858, + "learning_rate": 6.666666666666667e-06, + "loss": 2.0939, + "mean_token_accuracy": 0.5772385410964489, + "num_tokens": 50110.0, + "step": 4 + }, + { + "entropy": 1.1547652631998062, + "epoch": 0.008333333333333333, + "grad_norm": 0.3156285881996155, + "learning_rate": 8.88888888888889e-06, + "loss": 2.0016, + "mean_token_accuracy": 0.603407584130764, + "num_tokens": 62553.0, + "step": 5 + }, + { + "entropy": 1.221859723329544, + "epoch": 0.01, + "grad_norm": 0.32244929671287537, + "learning_rate": 1.1111111111111112e-05, + "loss": 2.0972, + "mean_token_accuracy": 0.577469028532505, + "num_tokens": 75072.0, + "step": 6 + }, + { + "entropy": 1.2124661356210709, + "epoch": 0.011666666666666667, + "grad_norm": 0.32469165325164795, + "learning_rate": 1.3333333333333333e-05, + "loss": 2.028, + "mean_token_accuracy": 0.5872330367565155, + "num_tokens": 87639.0, + "step": 7 + }, + { + "entropy": 1.1714164167642593, + "epoch": 0.013333333333333334, + "grad_norm": 0.3768642246723175, + "learning_rate": 1.5555555555555555e-05, + "loss": 2.0241, + "mean_token_accuracy": 0.585505448281765, + "num_tokens": 100287.0, + "step": 8 + }, + { + "entropy": 1.1553556099534035, + "epoch": 0.015, + "grad_norm": 0.37776222825050354, + "learning_rate": 1.777777777777778e-05, + "loss": 1.9622, + "mean_token_accuracy": 0.600151389837265, + "num_tokens": 113153.0, + "step": 9 + }, + { + "entropy": 1.1338117942214012, + "epoch": 0.016666666666666666, + "grad_norm": 0.3281095027923584, + "learning_rate": 2e-05, + "loss": 1.9465, + "mean_token_accuracy": 0.6039463356137276, + "num_tokens": 125548.0, + "step": 10 + }, + { + "entropy": 1.124964714050293, + "epoch": 0.018333333333333333, + "grad_norm": 0.35819536447525024, + "learning_rate": 2.2222222222222223e-05, + "loss": 1.9556, + "mean_token_accuracy": 0.6038774251937866, + "num_tokens": 138024.0, + "step": 11 + }, + { + "entropy": 1.247290499508381, + "epoch": 0.02, + "grad_norm": 0.40456947684288025, + "learning_rate": 2.4444444444444445e-05, + "loss": 2.0633, + "mean_token_accuracy": 0.5761790797114372, + "num_tokens": 150424.0, + "step": 12 + }, + { + "entropy": 1.2814347296953201, + "epoch": 0.021666666666666667, + "grad_norm": 0.40442585945129395, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.1256, + "mean_token_accuracy": 0.5704136714339256, + "num_tokens": 162786.0, + "step": 13 + }, + { + "entropy": 1.1805027946829796, + "epoch": 0.023333333333333334, + "grad_norm": 0.43369293212890625, + "learning_rate": 2.8888888888888888e-05, + "loss": 1.9641, + "mean_token_accuracy": 0.5983459949493408, + "num_tokens": 175429.0, + "step": 14 + }, + { + "entropy": 1.2493249326944351, + "epoch": 0.025, + "grad_norm": 0.4346776008605957, + "learning_rate": 3.111111111111111e-05, + "loss": 2.0526, + "mean_token_accuracy": 0.5874154344201088, + "num_tokens": 187863.0, + "step": 15 + }, + { + "entropy": 1.1698118671774864, + "epoch": 0.02666666666666667, + "grad_norm": 0.4407603144645691, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.9261, + "mean_token_accuracy": 0.6163465455174446, + "num_tokens": 200573.0, + "step": 16 + }, + { + "entropy": 1.189228169620037, + "epoch": 0.028333333333333332, + "grad_norm": 0.4984031617641449, + "learning_rate": 3.555555555555556e-05, + "loss": 1.9534, + "mean_token_accuracy": 0.5960768610239029, + "num_tokens": 213121.0, + "step": 17 + }, + { + "entropy": 1.234485238790512, + "epoch": 0.03, + "grad_norm": 0.545619547367096, + "learning_rate": 3.777777777777778e-05, + "loss": 2.0114, + "mean_token_accuracy": 0.582790918648243, + "num_tokens": 225055.0, + "step": 18 + }, + { + "entropy": 1.2693497836589813, + "epoch": 0.03166666666666667, + "grad_norm": 0.5483012199401855, + "learning_rate": 4e-05, + "loss": 2.0327, + "mean_token_accuracy": 0.5864806175231934, + "num_tokens": 237675.0, + "step": 19 + }, + { + "entropy": 1.3061316907405853, + "epoch": 0.03333333333333333, + "grad_norm": 0.6728277206420898, + "learning_rate": 4.222222222222222e-05, + "loss": 2.0134, + "mean_token_accuracy": 0.581995002925396, + "num_tokens": 250378.0, + "step": 20 + }, + { + "entropy": 1.3692396432161331, + "epoch": 0.035, + "grad_norm": 0.6627784371376038, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.0612, + "mean_token_accuracy": 0.5739701353013515, + "num_tokens": 262331.0, + "step": 21 + }, + { + "entropy": 1.240286722779274, + "epoch": 0.03666666666666667, + "grad_norm": 0.6716634631156921, + "learning_rate": 4.666666666666667e-05, + "loss": 1.8739, + "mean_token_accuracy": 0.6026649698615074, + "num_tokens": 274867.0, + "step": 22 + }, + { + "entropy": 1.1950730830430984, + "epoch": 0.03833333333333333, + "grad_norm": 0.6836673617362976, + "learning_rate": 4.888888888888889e-05, + "loss": 1.7975, + "mean_token_accuracy": 0.616993211209774, + "num_tokens": 287375.0, + "step": 23 + }, + { + "entropy": 1.3049941956996918, + "epoch": 0.04, + "grad_norm": 0.7128519415855408, + "learning_rate": 5.111111111111111e-05, + "loss": 1.9134, + "mean_token_accuracy": 0.5976476445794106, + "num_tokens": 299836.0, + "step": 24 + }, + { + "entropy": 1.3695744276046753, + "epoch": 0.041666666666666664, + "grad_norm": 1.17112398147583, + "learning_rate": 5.333333333333333e-05, + "loss": 1.9372, + "mean_token_accuracy": 0.5883589163422585, + "num_tokens": 312372.0, + "step": 25 + }, + { + "entropy": 1.1785986423492432, + "epoch": 0.043333333333333335, + "grad_norm": 0.625380277633667, + "learning_rate": 5.555555555555556e-05, + "loss": 1.6627, + "mean_token_accuracy": 0.6405750215053558, + "num_tokens": 324984.0, + "step": 26 + }, + { + "entropy": 1.2021356672048569, + "epoch": 0.045, + "grad_norm": 0.7430734038352966, + "learning_rate": 5.7777777777777776e-05, + "loss": 1.6448, + "mean_token_accuracy": 0.6367640718817711, + "num_tokens": 337559.0, + "step": 27 + }, + { + "entropy": 1.3892791867256165, + "epoch": 0.04666666666666667, + "grad_norm": 0.7540942430496216, + "learning_rate": 6e-05, + "loss": 1.8673, + "mean_token_accuracy": 0.6006477549672127, + "num_tokens": 349939.0, + "step": 28 + }, + { + "entropy": 1.2635403275489807, + "epoch": 0.04833333333333333, + "grad_norm": 0.5877339839935303, + "learning_rate": 6.222222222222222e-05, + "loss": 1.6482, + "mean_token_accuracy": 0.6342698633670807, + "num_tokens": 362517.0, + "step": 29 + }, + { + "entropy": 1.1776714846491814, + "epoch": 0.05, + "grad_norm": 0.6017478108406067, + "learning_rate": 6.444444444444446e-05, + "loss": 1.4892, + "mean_token_accuracy": 0.6633898839354515, + "num_tokens": 375048.0, + "step": 30 + }, + { + "entropy": 1.3733180239796638, + "epoch": 0.051666666666666666, + "grad_norm": 0.4876861274242401, + "learning_rate": 6.666666666666667e-05, + "loss": 1.6464, + "mean_token_accuracy": 0.6362894400954247, + "num_tokens": 387684.0, + "step": 31 + }, + { + "entropy": 1.3747638314962387, + "epoch": 0.05333333333333334, + "grad_norm": 0.5345461368560791, + "learning_rate": 6.88888888888889e-05, + "loss": 1.6545, + "mean_token_accuracy": 0.6348208487033844, + "num_tokens": 400150.0, + "step": 32 + }, + { + "entropy": 1.2881105542182922, + "epoch": 0.055, + "grad_norm": 0.5066477656364441, + "learning_rate": 7.111111111111112e-05, + "loss": 1.4894, + "mean_token_accuracy": 0.6688544601202011, + "num_tokens": 412800.0, + "step": 33 + }, + { + "entropy": 1.546808585524559, + "epoch": 0.056666666666666664, + "grad_norm": 0.46314069628715515, + "learning_rate": 7.333333333333333e-05, + "loss": 1.7549, + "mean_token_accuracy": 0.6177285388112068, + "num_tokens": 425287.0, + "step": 34 + }, + { + "entropy": 1.343202069401741, + "epoch": 0.058333333333333334, + "grad_norm": 0.4545861780643463, + "learning_rate": 7.555555555555556e-05, + "loss": 1.4505, + "mean_token_accuracy": 0.6723693758249283, + "num_tokens": 437762.0, + "step": 35 + }, + { + "entropy": 1.4139084219932556, + "epoch": 0.06, + "grad_norm": 0.48711973428726196, + "learning_rate": 7.777777777777778e-05, + "loss": 1.5045, + "mean_token_accuracy": 0.6590398252010345, + "num_tokens": 450223.0, + "step": 36 + }, + { + "entropy": 1.3924538046121597, + "epoch": 0.06166666666666667, + "grad_norm": 0.47148916125297546, + "learning_rate": 8e-05, + "loss": 1.4763, + "mean_token_accuracy": 0.671143427491188, + "num_tokens": 462605.0, + "step": 37 + }, + { + "entropy": 1.3940246105194092, + "epoch": 0.06333333333333334, + "grad_norm": 0.532814621925354, + "learning_rate": 8.222222222222222e-05, + "loss": 1.4072, + "mean_token_accuracy": 0.6783580556511879, + "num_tokens": 475221.0, + "step": 38 + }, + { + "entropy": 1.4382469952106476, + "epoch": 0.065, + "grad_norm": 0.4498147666454315, + "learning_rate": 8.444444444444444e-05, + "loss": 1.4616, + "mean_token_accuracy": 0.6687446236610413, + "num_tokens": 487648.0, + "step": 39 + }, + { + "entropy": 1.3836999088525772, + "epoch": 0.06666666666666667, + "grad_norm": 0.4689527153968811, + "learning_rate": 8.666666666666667e-05, + "loss": 1.3839, + "mean_token_accuracy": 0.6905905231833458, + "num_tokens": 500175.0, + "step": 40 + }, + { + "entropy": 1.3075188547372818, + "epoch": 0.06833333333333333, + "grad_norm": 0.45789051055908203, + "learning_rate": 8.888888888888889e-05, + "loss": 1.2932, + "mean_token_accuracy": 0.7051132321357727, + "num_tokens": 512860.0, + "step": 41 + }, + { + "entropy": 1.2035221755504608, + "epoch": 0.07, + "grad_norm": 0.48102283477783203, + "learning_rate": 9.111111111111112e-05, + "loss": 1.2177, + "mean_token_accuracy": 0.7276435941457748, + "num_tokens": 525540.0, + "step": 42 + }, + { + "entropy": 1.2424119412899017, + "epoch": 0.07166666666666667, + "grad_norm": 0.48856499791145325, + "learning_rate": 9.333333333333334e-05, + "loss": 1.244, + "mean_token_accuracy": 0.7171106263995171, + "num_tokens": 538098.0, + "step": 43 + }, + { + "entropy": 1.2872763872146606, + "epoch": 0.07333333333333333, + "grad_norm": 0.46277451515197754, + "learning_rate": 9.555555555555557e-05, + "loss": 1.3318, + "mean_token_accuracy": 0.7009768709540367, + "num_tokens": 550788.0, + "step": 44 + }, + { + "entropy": 1.3774635940790176, + "epoch": 0.075, + "grad_norm": 0.48092466592788696, + "learning_rate": 9.777777777777778e-05, + "loss": 1.4124, + "mean_token_accuracy": 0.6832538694143295, + "num_tokens": 563184.0, + "step": 45 + }, + { + "entropy": 1.1826415210962296, + "epoch": 0.07666666666666666, + "grad_norm": 0.395088255405426, + "learning_rate": 0.0001, + "loss": 1.2277, + "mean_token_accuracy": 0.7197900637984276, + "num_tokens": 575833.0, + "step": 46 + }, + { + "entropy": 1.2140811532735825, + "epoch": 0.07833333333333334, + "grad_norm": 0.3708420693874359, + "learning_rate": 0.00010222222222222222, + "loss": 1.3184, + "mean_token_accuracy": 0.7010470479726791, + "num_tokens": 588340.0, + "step": 47 + }, + { + "entropy": 1.1609551459550858, + "epoch": 0.08, + "grad_norm": 0.29149869084358215, + "learning_rate": 0.00010444444444444445, + "loss": 1.2231, + "mean_token_accuracy": 0.7239813506603241, + "num_tokens": 600817.0, + "step": 48 + }, + { + "entropy": 1.094582460820675, + "epoch": 0.08166666666666667, + "grad_norm": 0.31041696667671204, + "learning_rate": 0.00010666666666666667, + "loss": 1.1622, + "mean_token_accuracy": 0.7331580147147179, + "num_tokens": 613072.0, + "step": 49 + }, + { + "entropy": 1.2046773582696915, + "epoch": 0.08333333333333333, + "grad_norm": 0.34948456287384033, + "learning_rate": 0.00010888888888888889, + "loss": 1.3382, + "mean_token_accuracy": 0.7008231580257416, + "num_tokens": 625279.0, + "step": 50 + }, + { + "entropy": 1.1427595689892769, + "epoch": 0.085, + "grad_norm": 0.30553025007247925, + "learning_rate": 0.00011111111111111112, + "loss": 1.1741, + "mean_token_accuracy": 0.7270924001932144, + "num_tokens": 637559.0, + "step": 51 + }, + { + "entropy": 1.0716800168156624, + "epoch": 0.08666666666666667, + "grad_norm": 0.3007242679595947, + "learning_rate": 0.00011333333333333334, + "loss": 1.1271, + "mean_token_accuracy": 0.7399003505706787, + "num_tokens": 650264.0, + "step": 52 + }, + { + "entropy": 1.3152555897831917, + "epoch": 0.08833333333333333, + "grad_norm": 0.4258122146129608, + "learning_rate": 0.00011555555555555555, + "loss": 1.3586, + "mean_token_accuracy": 0.6957122161984444, + "num_tokens": 662826.0, + "step": 53 + }, + { + "entropy": 1.2838200628757477, + "epoch": 0.09, + "grad_norm": 0.32665765285491943, + "learning_rate": 0.00011777777777777779, + "loss": 1.3561, + "mean_token_accuracy": 0.6944003701210022, + "num_tokens": 675585.0, + "step": 54 + }, + { + "entropy": 1.0607134401798248, + "epoch": 0.09166666666666666, + "grad_norm": 0.30055567622184753, + "learning_rate": 0.00012, + "loss": 1.1042, + "mean_token_accuracy": 0.7346132323145866, + "num_tokens": 688083.0, + "step": 55 + }, + { + "entropy": 1.2985290735960007, + "epoch": 0.09333333333333334, + "grad_norm": 0.2808513343334198, + "learning_rate": 0.00012222222222222224, + "loss": 1.346, + "mean_token_accuracy": 0.6928954645991325, + "num_tokens": 700448.0, + "step": 56 + }, + { + "entropy": 1.1469447389245033, + "epoch": 0.095, + "grad_norm": 0.3286835551261902, + "learning_rate": 0.00012444444444444444, + "loss": 1.168, + "mean_token_accuracy": 0.7282102331519127, + "num_tokens": 713117.0, + "step": 57 + }, + { + "entropy": 1.211227871477604, + "epoch": 0.09666666666666666, + "grad_norm": 0.2934599220752716, + "learning_rate": 0.00012666666666666666, + "loss": 1.2521, + "mean_token_accuracy": 0.7141993716359138, + "num_tokens": 725605.0, + "step": 58 + }, + { + "entropy": 1.1839049607515335, + "epoch": 0.09833333333333333, + "grad_norm": 0.3096744120121002, + "learning_rate": 0.00012888888888888892, + "loss": 1.2122, + "mean_token_accuracy": 0.7217304483056068, + "num_tokens": 738207.0, + "step": 59 + }, + { + "entropy": 0.9791679158806801, + "epoch": 0.1, + "grad_norm": 0.2698657810688019, + "learning_rate": 0.00013111111111111111, + "loss": 1.0068, + "mean_token_accuracy": 0.7672298699617386, + "num_tokens": 750830.0, + "step": 60 + }, + { + "entropy": 1.1061433926224709, + "epoch": 0.10166666666666667, + "grad_norm": 0.32427090406417847, + "learning_rate": 0.00013333333333333334, + "loss": 1.151, + "mean_token_accuracy": 0.7332676723599434, + "num_tokens": 763526.0, + "step": 61 + }, + { + "entropy": 1.0617150589823723, + "epoch": 0.10333333333333333, + "grad_norm": 0.2928755581378937, + "learning_rate": 0.00013555555555555556, + "loss": 1.0784, + "mean_token_accuracy": 0.7479719892144203, + "num_tokens": 776010.0, + "step": 62 + }, + { + "entropy": 1.0685010254383087, + "epoch": 0.105, + "grad_norm": 0.35550597310066223, + "learning_rate": 0.0001377777777777778, + "loss": 1.1365, + "mean_token_accuracy": 0.7320908978581429, + "num_tokens": 788523.0, + "step": 63 + }, + { + "entropy": 1.2497737780213356, + "epoch": 0.10666666666666667, + "grad_norm": 0.32172951102256775, + "learning_rate": 0.00014, + "loss": 1.277, + "mean_token_accuracy": 0.7090576663613319, + "num_tokens": 801026.0, + "step": 64 + }, + { + "entropy": 1.212308518588543, + "epoch": 0.10833333333333334, + "grad_norm": 0.30178961157798767, + "learning_rate": 0.00014222222222222224, + "loss": 1.2619, + "mean_token_accuracy": 0.712481826543808, + "num_tokens": 813435.0, + "step": 65 + }, + { + "entropy": 1.197321593761444, + "epoch": 0.11, + "grad_norm": 0.28265058994293213, + "learning_rate": 0.00014444444444444444, + "loss": 1.251, + "mean_token_accuracy": 0.7203180119395256, + "num_tokens": 825830.0, + "step": 66 + }, + { + "entropy": 1.0528302267193794, + "epoch": 0.11166666666666666, + "grad_norm": 0.2757464349269867, + "learning_rate": 0.00014666666666666666, + "loss": 1.0901, + "mean_token_accuracy": 0.75026024132967, + "num_tokens": 838764.0, + "step": 67 + }, + { + "entropy": 1.2962219715118408, + "epoch": 0.11333333333333333, + "grad_norm": 0.3328060209751129, + "learning_rate": 0.0001488888888888889, + "loss": 1.3067, + "mean_token_accuracy": 0.704713948071003, + "num_tokens": 851146.0, + "step": 68 + }, + { + "entropy": 1.1621850430965424, + "epoch": 0.115, + "grad_norm": 0.32625430822372437, + "learning_rate": 0.0001511111111111111, + "loss": 1.1708, + "mean_token_accuracy": 0.7335870340466499, + "num_tokens": 863579.0, + "step": 69 + }, + { + "entropy": 1.1265803426504135, + "epoch": 0.11666666666666667, + "grad_norm": 0.34571829438209534, + "learning_rate": 0.00015333333333333334, + "loss": 1.1124, + "mean_token_accuracy": 0.7329849451780319, + "num_tokens": 875785.0, + "step": 70 + }, + { + "entropy": 1.0563486441969872, + "epoch": 0.11833333333333333, + "grad_norm": 0.34876880049705505, + "learning_rate": 0.00015555555555555556, + "loss": 1.0546, + "mean_token_accuracy": 0.7550168186426163, + "num_tokens": 888578.0, + "step": 71 + }, + { + "entropy": 1.1433459967374802, + "epoch": 0.12, + "grad_norm": 0.39345014095306396, + "learning_rate": 0.0001577777777777778, + "loss": 1.1505, + "mean_token_accuracy": 0.7349046617746353, + "num_tokens": 901173.0, + "step": 72 + }, + { + "entropy": 1.2071392461657524, + "epoch": 0.12166666666666667, + "grad_norm": 0.3662756681442261, + "learning_rate": 0.00016, + "loss": 1.2599, + "mean_token_accuracy": 0.7073846533894539, + "num_tokens": 913731.0, + "step": 73 + }, + { + "entropy": 1.1201880425214767, + "epoch": 0.12333333333333334, + "grad_norm": 0.34027227759361267, + "learning_rate": 0.00016222222222222224, + "loss": 1.1595, + "mean_token_accuracy": 0.7306345105171204, + "num_tokens": 926306.0, + "step": 74 + }, + { + "entropy": 1.1337391138076782, + "epoch": 0.125, + "grad_norm": 0.45978736877441406, + "learning_rate": 0.00016444444444444444, + "loss": 1.2174, + "mean_token_accuracy": 0.7151773795485497, + "num_tokens": 938678.0, + "step": 75 + }, + { + "entropy": 1.1473943069577217, + "epoch": 0.12666666666666668, + "grad_norm": 0.662312388420105, + "learning_rate": 0.0001666666666666667, + "loss": 1.2348, + "mean_token_accuracy": 0.7175232917070389, + "num_tokens": 951039.0, + "step": 76 + }, + { + "entropy": 1.0714805275201797, + "epoch": 0.12833333333333333, + "grad_norm": 0.30166077613830566, + "learning_rate": 0.00016888888888888889, + "loss": 1.1602, + "mean_token_accuracy": 0.7314794659614563, + "num_tokens": 963428.0, + "step": 77 + }, + { + "entropy": 1.2219713553786278, + "epoch": 0.13, + "grad_norm": 0.269293874502182, + "learning_rate": 0.0001711111111111111, + "loss": 1.2639, + "mean_token_accuracy": 0.7121034041047096, + "num_tokens": 975918.0, + "step": 78 + }, + { + "entropy": 1.1331642344594002, + "epoch": 0.13166666666666665, + "grad_norm": 0.289949893951416, + "learning_rate": 0.00017333333333333334, + "loss": 1.202, + "mean_token_accuracy": 0.720516249537468, + "num_tokens": 988256.0, + "step": 79 + }, + { + "entropy": 1.199205830693245, + "epoch": 0.13333333333333333, + "grad_norm": 0.384086549282074, + "learning_rate": 0.00017555555555555556, + "loss": 1.2254, + "mean_token_accuracy": 0.7111634537577629, + "num_tokens": 1000806.0, + "step": 80 + }, + { + "entropy": 1.0977754443883896, + "epoch": 0.135, + "grad_norm": 0.2883825898170471, + "learning_rate": 0.00017777777777777779, + "loss": 1.1091, + "mean_token_accuracy": 0.7458898946642876, + "num_tokens": 1013837.0, + "step": 81 + }, + { + "entropy": 1.168524369597435, + "epoch": 0.13666666666666666, + "grad_norm": 0.38602787256240845, + "learning_rate": 0.00018, + "loss": 1.1589, + "mean_token_accuracy": 0.7247432917356491, + "num_tokens": 1026588.0, + "step": 82 + }, + { + "entropy": 1.1804613769054413, + "epoch": 0.13833333333333334, + "grad_norm": 0.4096560478210449, + "learning_rate": 0.00018222222222222224, + "loss": 1.1928, + "mean_token_accuracy": 0.7234074100852013, + "num_tokens": 1038970.0, + "step": 83 + }, + { + "entropy": 1.220597304403782, + "epoch": 0.14, + "grad_norm": 0.34397998452186584, + "learning_rate": 0.00018444444444444446, + "loss": 1.2601, + "mean_token_accuracy": 0.7074739634990692, + "num_tokens": 1051554.0, + "step": 84 + }, + { + "entropy": 0.9805111661553383, + "epoch": 0.14166666666666666, + "grad_norm": 0.5971760153770447, + "learning_rate": 0.0001866666666666667, + "loss": 1.0317, + "mean_token_accuracy": 0.7597767561674118, + "num_tokens": 1064327.0, + "step": 85 + }, + { + "entropy": 1.1483803391456604, + "epoch": 0.14333333333333334, + "grad_norm": 0.3001822233200073, + "learning_rate": 0.00018888888888888888, + "loss": 1.2179, + "mean_token_accuracy": 0.7179044857621193, + "num_tokens": 1076977.0, + "step": 86 + }, + { + "entropy": 1.1420434266328812, + "epoch": 0.145, + "grad_norm": 0.5787074565887451, + "learning_rate": 0.00019111111111111114, + "loss": 1.1715, + "mean_token_accuracy": 0.7311113104224205, + "num_tokens": 1089404.0, + "step": 87 + }, + { + "entropy": 1.2739483416080475, + "epoch": 0.14666666666666667, + "grad_norm": 0.4274302124977112, + "learning_rate": 0.00019333333333333333, + "loss": 1.3089, + "mean_token_accuracy": 0.6991681382060051, + "num_tokens": 1101887.0, + "step": 88 + }, + { + "entropy": 1.0925018265843391, + "epoch": 0.14833333333333334, + "grad_norm": 0.3573361933231354, + "learning_rate": 0.00019555555555555556, + "loss": 1.1291, + "mean_token_accuracy": 0.7353638261556625, + "num_tokens": 1114866.0, + "step": 89 + }, + { + "entropy": 1.3409122675657272, + "epoch": 0.15, + "grad_norm": 0.486543208360672, + "learning_rate": 0.00019777777777777778, + "loss": 1.3796, + "mean_token_accuracy": 0.6867416799068451, + "num_tokens": 1127433.0, + "step": 90 + }, + { + "entropy": 1.087234228849411, + "epoch": 0.15166666666666667, + "grad_norm": 0.3652050793170929, + "learning_rate": 0.0002, + "loss": 1.1062, + "mean_token_accuracy": 0.7423817366361618, + "num_tokens": 1139907.0, + "step": 91 + }, + { + "entropy": 1.2978694140911102, + "epoch": 0.15333333333333332, + "grad_norm": 0.42118868231773376, + "learning_rate": 0.0001998830409356725, + "loss": 1.3141, + "mean_token_accuracy": 0.6988303884863853, + "num_tokens": 1152242.0, + "step": 92 + }, + { + "entropy": 0.991526797413826, + "epoch": 0.155, + "grad_norm": 0.7384818196296692, + "learning_rate": 0.00019976608187134506, + "loss": 1.0096, + "mean_token_accuracy": 0.760047473013401, + "num_tokens": 1164770.0, + "step": 93 + }, + { + "entropy": 1.1662617474794388, + "epoch": 0.15666666666666668, + "grad_norm": 0.33492511510849, + "learning_rate": 0.00019964912280701755, + "loss": 1.1827, + "mean_token_accuracy": 0.7286000698804855, + "num_tokens": 1177211.0, + "step": 94 + }, + { + "entropy": 0.9847327098250389, + "epoch": 0.15833333333333333, + "grad_norm": 0.5884389877319336, + "learning_rate": 0.00019953216374269005, + "loss": 1.0243, + "mean_token_accuracy": 0.7573123648762703, + "num_tokens": 1189909.0, + "step": 95 + }, + { + "entropy": 1.0150301530957222, + "epoch": 0.16, + "grad_norm": 0.38942259550094604, + "learning_rate": 0.0001994152046783626, + "loss": 1.0619, + "mean_token_accuracy": 0.7528154477477074, + "num_tokens": 1202264.0, + "step": 96 + }, + { + "entropy": 1.0576318800449371, + "epoch": 0.16166666666666665, + "grad_norm": 0.46958282589912415, + "learning_rate": 0.0001992982456140351, + "loss": 1.0924, + "mean_token_accuracy": 0.7439210563898087, + "num_tokens": 1214929.0, + "step": 97 + }, + { + "entropy": 1.098344899713993, + "epoch": 0.16333333333333333, + "grad_norm": 0.45931607484817505, + "learning_rate": 0.0001991812865497076, + "loss": 1.1208, + "mean_token_accuracy": 0.731578603386879, + "num_tokens": 1227226.0, + "step": 98 + }, + { + "entropy": 1.154830977320671, + "epoch": 0.165, + "grad_norm": 0.3942425549030304, + "learning_rate": 0.00019906432748538014, + "loss": 1.1668, + "mean_token_accuracy": 0.7254332229495049, + "num_tokens": 1239739.0, + "step": 99 + }, + { + "entropy": 1.0297606065869331, + "epoch": 0.16666666666666666, + "grad_norm": 0.316371351480484, + "learning_rate": 0.00019894736842105264, + "loss": 1.0509, + "mean_token_accuracy": 0.7479109838604927, + "num_tokens": 1252407.0, + "step": 100 + }, + { + "entropy": 1.0756673142313957, + "epoch": 0.16833333333333333, + "grad_norm": 0.7114313244819641, + "learning_rate": 0.00019883040935672513, + "loss": 1.0986, + "mean_token_accuracy": 0.7459438368678093, + "num_tokens": 1264801.0, + "step": 101 + }, + { + "entropy": 0.973985955119133, + "epoch": 0.17, + "grad_norm": 0.4525020122528076, + "learning_rate": 0.00019871345029239768, + "loss": 0.9842, + "mean_token_accuracy": 0.7690640017390251, + "num_tokens": 1277422.0, + "step": 102 + }, + { + "entropy": 1.0724262371659279, + "epoch": 0.17166666666666666, + "grad_norm": 0.5805977582931519, + "learning_rate": 0.00019859649122807018, + "loss": 1.1007, + "mean_token_accuracy": 0.7470309287309647, + "num_tokens": 1289893.0, + "step": 103 + }, + { + "entropy": 1.093192383646965, + "epoch": 0.17333333333333334, + "grad_norm": 0.391726553440094, + "learning_rate": 0.0001984795321637427, + "loss": 1.1168, + "mean_token_accuracy": 0.7345704063773155, + "num_tokens": 1302535.0, + "step": 104 + }, + { + "entropy": 1.030884176492691, + "epoch": 0.175, + "grad_norm": 0.4719918668270111, + "learning_rate": 0.00019836257309941522, + "loss": 1.0402, + "mean_token_accuracy": 0.7528453394770622, + "num_tokens": 1315365.0, + "step": 105 + }, + { + "entropy": 1.022902749478817, + "epoch": 0.17666666666666667, + "grad_norm": 0.4833899736404419, + "learning_rate": 0.00019824561403508772, + "loss": 1.0441, + "mean_token_accuracy": 0.7529502660036087, + "num_tokens": 1327961.0, + "step": 106 + }, + { + "entropy": 1.1461059749126434, + "epoch": 0.17833333333333334, + "grad_norm": 0.33028945326805115, + "learning_rate": 0.00019812865497076024, + "loss": 1.189, + "mean_token_accuracy": 0.7240455821156502, + "num_tokens": 1340552.0, + "step": 107 + }, + { + "entropy": 1.0696540772914886, + "epoch": 0.18, + "grad_norm": 0.39427050948143005, + "learning_rate": 0.00019801169590643277, + "loss": 1.0901, + "mean_token_accuracy": 0.7402398586273193, + "num_tokens": 1352996.0, + "step": 108 + }, + { + "entropy": 1.2089848518371582, + "epoch": 0.18166666666666667, + "grad_norm": 0.49633994698524475, + "learning_rate": 0.00019789473684210526, + "loss": 1.2587, + "mean_token_accuracy": 0.7089790180325508, + "num_tokens": 1365591.0, + "step": 109 + }, + { + "entropy": 1.1154760420322418, + "epoch": 0.18333333333333332, + "grad_norm": 0.3988918662071228, + "learning_rate": 0.00019777777777777778, + "loss": 1.1758, + "mean_token_accuracy": 0.728231742978096, + "num_tokens": 1378160.0, + "step": 110 + }, + { + "entropy": 1.1121669262647629, + "epoch": 0.185, + "grad_norm": 0.38943156599998474, + "learning_rate": 0.0001976608187134503, + "loss": 1.1454, + "mean_token_accuracy": 0.7311063706874847, + "num_tokens": 1390786.0, + "step": 111 + }, + { + "entropy": 1.0965967029333115, + "epoch": 0.18666666666666668, + "grad_norm": 0.5972121357917786, + "learning_rate": 0.0001975438596491228, + "loss": 1.1035, + "mean_token_accuracy": 0.7371370121836662, + "num_tokens": 1403473.0, + "step": 112 + }, + { + "entropy": 0.9760538339614868, + "epoch": 0.18833333333333332, + "grad_norm": 0.32120487093925476, + "learning_rate": 0.00019742690058479533, + "loss": 0.9931, + "mean_token_accuracy": 0.7640335187315941, + "num_tokens": 1415959.0, + "step": 113 + }, + { + "entropy": 1.1588884815573692, + "epoch": 0.19, + "grad_norm": 0.6615768074989319, + "learning_rate": 0.00019730994152046785, + "loss": 1.1925, + "mean_token_accuracy": 0.7154464647173882, + "num_tokens": 1428492.0, + "step": 114 + }, + { + "entropy": 1.0532007440924644, + "epoch": 0.19166666666666668, + "grad_norm": 0.4122265875339508, + "learning_rate": 0.00019719298245614035, + "loss": 1.0635, + "mean_token_accuracy": 0.7488418594002724, + "num_tokens": 1440911.0, + "step": 115 + }, + { + "entropy": 1.2064021080732346, + "epoch": 0.19333333333333333, + "grad_norm": 0.4100744128227234, + "learning_rate": 0.00019707602339181287, + "loss": 1.2092, + "mean_token_accuracy": 0.7215066328644753, + "num_tokens": 1453573.0, + "step": 116 + }, + { + "entropy": 0.980360358953476, + "epoch": 0.195, + "grad_norm": 0.4821430444717407, + "learning_rate": 0.0001969590643274854, + "loss": 0.9949, + "mean_token_accuracy": 0.76118303835392, + "num_tokens": 1466030.0, + "step": 117 + }, + { + "entropy": 1.1557006016373634, + "epoch": 0.19666666666666666, + "grad_norm": 0.4301220774650574, + "learning_rate": 0.0001968421052631579, + "loss": 1.207, + "mean_token_accuracy": 0.7246038690209389, + "num_tokens": 1478409.0, + "step": 118 + }, + { + "entropy": 0.994934193789959, + "epoch": 0.19833333333333333, + "grad_norm": 0.33380362391471863, + "learning_rate": 0.0001967251461988304, + "loss": 1.0463, + "mean_token_accuracy": 0.7522767782211304, + "num_tokens": 1490956.0, + "step": 119 + }, + { + "entropy": 1.0550952181220055, + "epoch": 0.2, + "grad_norm": 0.43042588233947754, + "learning_rate": 0.00019660818713450293, + "loss": 1.1072, + "mean_token_accuracy": 0.7346431165933609, + "num_tokens": 1503441.0, + "step": 120 + }, + { + "entropy": 1.3361243903636932, + "epoch": 0.20166666666666666, + "grad_norm": 0.40949326753616333, + "learning_rate": 0.00019649122807017543, + "loss": 1.4268, + "mean_token_accuracy": 0.6733630150556564, + "num_tokens": 1515872.0, + "step": 121 + }, + { + "entropy": 0.9732860177755356, + "epoch": 0.20333333333333334, + "grad_norm": 0.35589590668678284, + "learning_rate": 0.00019637426900584798, + "loss": 1.0341, + "mean_token_accuracy": 0.759332112967968, + "num_tokens": 1528695.0, + "step": 122 + }, + { + "entropy": 1.0260741487145424, + "epoch": 0.205, + "grad_norm": 0.4480772614479065, + "learning_rate": 0.00019625730994152048, + "loss": 1.0118, + "mean_token_accuracy": 0.7584766522049904, + "num_tokens": 1541198.0, + "step": 123 + }, + { + "entropy": 1.1190374419093132, + "epoch": 0.20666666666666667, + "grad_norm": 0.7018595933914185, + "learning_rate": 0.000196140350877193, + "loss": 1.1102, + "mean_token_accuracy": 0.7376047149300575, + "num_tokens": 1553802.0, + "step": 124 + }, + { + "entropy": 1.1323942467570305, + "epoch": 0.20833333333333334, + "grad_norm": 0.32317179441452026, + "learning_rate": 0.00019602339181286552, + "loss": 1.1489, + "mean_token_accuracy": 0.7312273606657982, + "num_tokens": 1566099.0, + "step": 125 + }, + { + "entropy": 1.2469940930604935, + "epoch": 0.21, + "grad_norm": 0.4663606286048889, + "learning_rate": 0.00019590643274853802, + "loss": 1.2628, + "mean_token_accuracy": 0.7112779766321182, + "num_tokens": 1578467.0, + "step": 126 + }, + { + "entropy": 1.117019146680832, + "epoch": 0.21166666666666667, + "grad_norm": 0.5317836403846741, + "learning_rate": 0.00019578947368421054, + "loss": 1.1085, + "mean_token_accuracy": 0.7378982827067375, + "num_tokens": 1591089.0, + "step": 127 + }, + { + "entropy": 1.2175140753388405, + "epoch": 0.21333333333333335, + "grad_norm": 0.38012969493865967, + "learning_rate": 0.00019567251461988306, + "loss": 1.2299, + "mean_token_accuracy": 0.717160701751709, + "num_tokens": 1603396.0, + "step": 128 + }, + { + "entropy": 1.1809884086251259, + "epoch": 0.215, + "grad_norm": 0.40074971318244934, + "learning_rate": 0.00019555555555555556, + "loss": 1.1807, + "mean_token_accuracy": 0.7240558713674545, + "num_tokens": 1615698.0, + "step": 129 + }, + { + "entropy": 0.9892362505197525, + "epoch": 0.21666666666666667, + "grad_norm": 0.44255295395851135, + "learning_rate": 0.00019543859649122808, + "loss": 1.0268, + "mean_token_accuracy": 0.7633307725191116, + "num_tokens": 1628080.0, + "step": 130 + }, + { + "entropy": 1.1162428334355354, + "epoch": 0.21833333333333332, + "grad_norm": 0.3332391679286957, + "learning_rate": 0.0001953216374269006, + "loss": 1.171, + "mean_token_accuracy": 0.7306881099939346, + "num_tokens": 1640529.0, + "step": 131 + }, + { + "entropy": 1.0377461314201355, + "epoch": 0.22, + "grad_norm": 0.33684980869293213, + "learning_rate": 0.0001952046783625731, + "loss": 1.1064, + "mean_token_accuracy": 0.7430030331015587, + "num_tokens": 1653382.0, + "step": 132 + }, + { + "entropy": 1.156599409878254, + "epoch": 0.22166666666666668, + "grad_norm": 0.42173266410827637, + "learning_rate": 0.00019508771929824562, + "loss": 1.2381, + "mean_token_accuracy": 0.7101339101791382, + "num_tokens": 1666061.0, + "step": 133 + }, + { + "entropy": 1.1487335935235023, + "epoch": 0.22333333333333333, + "grad_norm": 0.5505969524383545, + "learning_rate": 0.00019497076023391815, + "loss": 1.1949, + "mean_token_accuracy": 0.7228164002299309, + "num_tokens": 1678760.0, + "step": 134 + }, + { + "entropy": 1.1205493286252022, + "epoch": 0.225, + "grad_norm": 0.4036419987678528, + "learning_rate": 0.00019485380116959064, + "loss": 1.1353, + "mean_token_accuracy": 0.7325499951839447, + "num_tokens": 1691086.0, + "step": 135 + }, + { + "entropy": 1.023528330028057, + "epoch": 0.22666666666666666, + "grad_norm": 0.4935557246208191, + "learning_rate": 0.00019473684210526317, + "loss": 1.0273, + "mean_token_accuracy": 0.7585752308368683, + "num_tokens": 1703917.0, + "step": 136 + }, + { + "entropy": 1.3035095483064651, + "epoch": 0.22833333333333333, + "grad_norm": 0.4646526277065277, + "learning_rate": 0.0001946198830409357, + "loss": 1.337, + "mean_token_accuracy": 0.6854947060346603, + "num_tokens": 1716473.0, + "step": 137 + }, + { + "entropy": 1.0940191820263863, + "epoch": 0.23, + "grad_norm": 0.5709595680236816, + "learning_rate": 0.00019450292397660819, + "loss": 1.1138, + "mean_token_accuracy": 0.7348800003528595, + "num_tokens": 1728903.0, + "step": 138 + }, + { + "entropy": 1.1010525897145271, + "epoch": 0.23166666666666666, + "grad_norm": 0.28354716300964355, + "learning_rate": 0.0001943859649122807, + "loss": 1.1311, + "mean_token_accuracy": 0.7405002191662788, + "num_tokens": 1741242.0, + "step": 139 + }, + { + "entropy": 1.167225994169712, + "epoch": 0.23333333333333334, + "grad_norm": 0.47398078441619873, + "learning_rate": 0.00019426900584795323, + "loss": 1.1551, + "mean_token_accuracy": 0.7253175228834152, + "num_tokens": 1753803.0, + "step": 140 + }, + { + "entropy": 1.153997391462326, + "epoch": 0.235, + "grad_norm": 0.42761462926864624, + "learning_rate": 0.00019415204678362573, + "loss": 1.1576, + "mean_token_accuracy": 0.7320474088191986, + "num_tokens": 1766016.0, + "step": 141 + }, + { + "entropy": 1.118414282798767, + "epoch": 0.23666666666666666, + "grad_norm": 0.45717811584472656, + "learning_rate": 0.00019403508771929825, + "loss": 1.1184, + "mean_token_accuracy": 0.7379914745688438, + "num_tokens": 1778602.0, + "step": 142 + }, + { + "entropy": 1.0639612078666687, + "epoch": 0.23833333333333334, + "grad_norm": 0.4153079688549042, + "learning_rate": 0.00019391812865497077, + "loss": 1.0899, + "mean_token_accuracy": 0.7416495755314827, + "num_tokens": 1791202.0, + "step": 143 + }, + { + "entropy": 1.0359639376401901, + "epoch": 0.24, + "grad_norm": 0.46512192487716675, + "learning_rate": 0.0001938011695906433, + "loss": 1.0682, + "mean_token_accuracy": 0.7477246001362801, + "num_tokens": 1803814.0, + "step": 144 + }, + { + "entropy": 1.1540048494935036, + "epoch": 0.24166666666666667, + "grad_norm": 0.5459341406822205, + "learning_rate": 0.0001936842105263158, + "loss": 1.22, + "mean_token_accuracy": 0.7181112244725227, + "num_tokens": 1816320.0, + "step": 145 + }, + { + "entropy": 1.0917329415678978, + "epoch": 0.24333333333333335, + "grad_norm": 0.448598712682724, + "learning_rate": 0.00019356725146198832, + "loss": 1.1382, + "mean_token_accuracy": 0.7411187067627907, + "num_tokens": 1828742.0, + "step": 146 + }, + { + "entropy": 1.0613715201616287, + "epoch": 0.245, + "grad_norm": 0.40278568863868713, + "learning_rate": 0.00019345029239766084, + "loss": 1.1119, + "mean_token_accuracy": 0.7407647371292114, + "num_tokens": 1841462.0, + "step": 147 + }, + { + "entropy": 1.0917879864573479, + "epoch": 0.24666666666666667, + "grad_norm": 0.47940903902053833, + "learning_rate": 0.00019333333333333333, + "loss": 1.1205, + "mean_token_accuracy": 0.7301802858710289, + "num_tokens": 1854255.0, + "step": 148 + }, + { + "entropy": 1.018385998904705, + "epoch": 0.24833333333333332, + "grad_norm": 0.4194647967815399, + "learning_rate": 0.00019321637426900586, + "loss": 1.0555, + "mean_token_accuracy": 0.7508860379457474, + "num_tokens": 1866779.0, + "step": 149 + }, + { + "entropy": 1.0104970261454582, + "epoch": 0.25, + "grad_norm": 0.32308053970336914, + "learning_rate": 0.00019309941520467838, + "loss": 1.0155, + "mean_token_accuracy": 0.7584084123373032, + "num_tokens": 1879629.0, + "step": 150 + }, + { + "entropy": 1.1477160826325417, + "epoch": 0.25166666666666665, + "grad_norm": 0.2899821400642395, + "learning_rate": 0.00019298245614035088, + "loss": 1.1195, + "mean_token_accuracy": 0.7364438623189926, + "num_tokens": 1892328.0, + "step": 151 + }, + { + "entropy": 1.1384098306298256, + "epoch": 0.25333333333333335, + "grad_norm": 0.4167255163192749, + "learning_rate": 0.0001928654970760234, + "loss": 1.1635, + "mean_token_accuracy": 0.7330257892608643, + "num_tokens": 1904903.0, + "step": 152 + }, + { + "entropy": 1.0779243260622025, + "epoch": 0.255, + "grad_norm": 0.3709876537322998, + "learning_rate": 0.00019274853801169592, + "loss": 1.1139, + "mean_token_accuracy": 0.7434241771697998, + "num_tokens": 1917471.0, + "step": 153 + }, + { + "entropy": 1.2101422995328903, + "epoch": 0.25666666666666665, + "grad_norm": 0.3245011568069458, + "learning_rate": 0.00019263157894736842, + "loss": 1.2267, + "mean_token_accuracy": 0.7105641290545464, + "num_tokens": 1930428.0, + "step": 154 + }, + { + "entropy": 1.1308674216270447, + "epoch": 0.25833333333333336, + "grad_norm": 0.5401122570037842, + "learning_rate": 0.00019251461988304094, + "loss": 1.1383, + "mean_token_accuracy": 0.7337904721498489, + "num_tokens": 1942865.0, + "step": 155 + }, + { + "entropy": 1.109563060104847, + "epoch": 0.26, + "grad_norm": 0.36856064200401306, + "learning_rate": 0.00019239766081871346, + "loss": 1.1064, + "mean_token_accuracy": 0.7388525083661079, + "num_tokens": 1955547.0, + "step": 156 + }, + { + "entropy": 0.8726945370435715, + "epoch": 0.26166666666666666, + "grad_norm": 0.3195601999759674, + "learning_rate": 0.00019228070175438596, + "loss": 0.8676, + "mean_token_accuracy": 0.7892613261938095, + "num_tokens": 1968285.0, + "step": 157 + }, + { + "entropy": 1.2257354855537415, + "epoch": 0.2633333333333333, + "grad_norm": 0.44171369075775146, + "learning_rate": 0.00019216374269005848, + "loss": 1.2279, + "mean_token_accuracy": 0.7155179604887962, + "num_tokens": 1980596.0, + "step": 158 + }, + { + "entropy": 1.1729735136032104, + "epoch": 0.265, + "grad_norm": 0.39025864005088806, + "learning_rate": 0.000192046783625731, + "loss": 1.1974, + "mean_token_accuracy": 0.7177061587572098, + "num_tokens": 1993239.0, + "step": 159 + }, + { + "entropy": 1.1277462840080261, + "epoch": 0.26666666666666666, + "grad_norm": 0.3358098566532135, + "learning_rate": 0.0001919298245614035, + "loss": 1.1326, + "mean_token_accuracy": 0.7304977104067802, + "num_tokens": 2005914.0, + "step": 160 + }, + { + "entropy": 1.1848002821207047, + "epoch": 0.2683333333333333, + "grad_norm": 0.395328551530838, + "learning_rate": 0.00019181286549707603, + "loss": 1.1864, + "mean_token_accuracy": 0.7176820933818817, + "num_tokens": 2018380.0, + "step": 161 + }, + { + "entropy": 1.1120817065238953, + "epoch": 0.27, + "grad_norm": 0.40584561228752136, + "learning_rate": 0.00019169590643274855, + "loss": 1.1281, + "mean_token_accuracy": 0.7257193401455879, + "num_tokens": 2030805.0, + "step": 162 + }, + { + "entropy": 1.1387057602405548, + "epoch": 0.27166666666666667, + "grad_norm": 0.4128866493701935, + "learning_rate": 0.00019157894736842104, + "loss": 1.1639, + "mean_token_accuracy": 0.7216651067137718, + "num_tokens": 2043168.0, + "step": 163 + }, + { + "entropy": 0.9902090951800346, + "epoch": 0.2733333333333333, + "grad_norm": 0.4670630097389221, + "learning_rate": 0.0001914619883040936, + "loss": 1.0027, + "mean_token_accuracy": 0.7596156373620033, + "num_tokens": 2055446.0, + "step": 164 + }, + { + "entropy": 1.1425034403800964, + "epoch": 0.275, + "grad_norm": 0.46681496500968933, + "learning_rate": 0.0001913450292397661, + "loss": 1.1538, + "mean_token_accuracy": 0.7130375802516937, + "num_tokens": 2067917.0, + "step": 165 + }, + { + "entropy": 1.0597015172243118, + "epoch": 0.27666666666666667, + "grad_norm": 0.47518017888069153, + "learning_rate": 0.0001912280701754386, + "loss": 1.0724, + "mean_token_accuracy": 0.7431479915976524, + "num_tokens": 2080651.0, + "step": 166 + }, + { + "entropy": 0.9730403125286102, + "epoch": 0.2783333333333333, + "grad_norm": 0.733403742313385, + "learning_rate": 0.00019111111111111114, + "loss": 0.963, + "mean_token_accuracy": 0.7653697729110718, + "num_tokens": 2093312.0, + "step": 167 + }, + { + "entropy": 1.0079271346330643, + "epoch": 0.28, + "grad_norm": 0.3541069030761719, + "learning_rate": 0.00019099415204678363, + "loss": 0.9992, + "mean_token_accuracy": 0.7640089094638824, + "num_tokens": 2105865.0, + "step": 168 + }, + { + "entropy": 1.1272375360131264, + "epoch": 0.2816666666666667, + "grad_norm": 13.837325096130371, + "learning_rate": 0.00019087719298245616, + "loss": 1.1298, + "mean_token_accuracy": 0.7283240929245949, + "num_tokens": 2118722.0, + "step": 169 + }, + { + "entropy": 1.1928050369024277, + "epoch": 0.2833333333333333, + "grad_norm": 0.6107990145683289, + "learning_rate": 0.00019076023391812868, + "loss": 1.1818, + "mean_token_accuracy": 0.7213939651846886, + "num_tokens": 2131235.0, + "step": 170 + }, + { + "entropy": 1.1104667708277702, + "epoch": 0.285, + "grad_norm": 0.5343078374862671, + "learning_rate": 0.00019064327485380117, + "loss": 1.1138, + "mean_token_accuracy": 0.7371880561113358, + "num_tokens": 2143714.0, + "step": 171 + }, + { + "entropy": 1.0297225266695023, + "epoch": 0.2866666666666667, + "grad_norm": 0.3125198781490326, + "learning_rate": 0.0001905263157894737, + "loss": 1.0159, + "mean_token_accuracy": 0.7521278113126755, + "num_tokens": 2156404.0, + "step": 172 + }, + { + "entropy": 1.0767759680747986, + "epoch": 0.28833333333333333, + "grad_norm": 0.34481510519981384, + "learning_rate": 0.00019040935672514622, + "loss": 1.0683, + "mean_token_accuracy": 0.7420379370450974, + "num_tokens": 2169219.0, + "step": 173 + }, + { + "entropy": 0.9333702996373177, + "epoch": 0.29, + "grad_norm": 0.48810121417045593, + "learning_rate": 0.00019029239766081872, + "loss": 0.9179, + "mean_token_accuracy": 0.7737013623118401, + "num_tokens": 2181773.0, + "step": 174 + }, + { + "entropy": 1.0734611302614212, + "epoch": 0.2916666666666667, + "grad_norm": 0.7126191854476929, + "learning_rate": 0.00019017543859649124, + "loss": 1.1299, + "mean_token_accuracy": 0.7278291434049606, + "num_tokens": 2194152.0, + "step": 175 + }, + { + "entropy": 1.0925179943442345, + "epoch": 0.29333333333333333, + "grad_norm": 0.4982717037200928, + "learning_rate": 0.00019005847953216376, + "loss": 1.1199, + "mean_token_accuracy": 0.7315082252025604, + "num_tokens": 2206831.0, + "step": 176 + }, + { + "entropy": 1.0705517753958702, + "epoch": 0.295, + "grad_norm": 0.3316156566143036, + "learning_rate": 0.00018994152046783626, + "loss": 1.0693, + "mean_token_accuracy": 0.7411659136414528, + "num_tokens": 2219538.0, + "step": 177 + }, + { + "entropy": 1.1149278432130814, + "epoch": 0.2966666666666667, + "grad_norm": 0.3496127128601074, + "learning_rate": 0.00018982456140350878, + "loss": 1.1338, + "mean_token_accuracy": 0.7307734712958336, + "num_tokens": 2232143.0, + "step": 178 + }, + { + "entropy": 1.054306723177433, + "epoch": 0.29833333333333334, + "grad_norm": 0.40683513879776, + "learning_rate": 0.0001897076023391813, + "loss": 1.0751, + "mean_token_accuracy": 0.7427195087075233, + "num_tokens": 2244992.0, + "step": 179 + }, + { + "entropy": 1.1751435473561287, + "epoch": 0.3, + "grad_norm": 0.41907891631126404, + "learning_rate": 0.0001895906432748538, + "loss": 1.198, + "mean_token_accuracy": 0.7172646000981331, + "num_tokens": 2257612.0, + "step": 180 + }, + { + "entropy": 1.1129144579172134, + "epoch": 0.3016666666666667, + "grad_norm": 0.3949214220046997, + "learning_rate": 0.00018947368421052632, + "loss": 1.0958, + "mean_token_accuracy": 0.7357127368450165, + "num_tokens": 2270112.0, + "step": 181 + }, + { + "entropy": 1.179383508861065, + "epoch": 0.30333333333333334, + "grad_norm": 0.3359801471233368, + "learning_rate": 0.00018935672514619885, + "loss": 1.1459, + "mean_token_accuracy": 0.7238163203001022, + "num_tokens": 2282533.0, + "step": 182 + }, + { + "entropy": 1.1705670580267906, + "epoch": 0.305, + "grad_norm": 0.3245558440685272, + "learning_rate": 0.00018923976608187134, + "loss": 1.1676, + "mean_token_accuracy": 0.7258678451180458, + "num_tokens": 2295083.0, + "step": 183 + }, + { + "entropy": 1.0460694283246994, + "epoch": 0.30666666666666664, + "grad_norm": 0.36032363772392273, + "learning_rate": 0.0001891228070175439, + "loss": 1.0124, + "mean_token_accuracy": 0.7577406391501427, + "num_tokens": 2307968.0, + "step": 184 + }, + { + "entropy": 1.1906094327569008, + "epoch": 0.30833333333333335, + "grad_norm": 0.3748197853565216, + "learning_rate": 0.0001890058479532164, + "loss": 1.1827, + "mean_token_accuracy": 0.7173843756318092, + "num_tokens": 2320581.0, + "step": 185 + }, + { + "entropy": 1.0227904915809631, + "epoch": 0.31, + "grad_norm": 0.3617149591445923, + "learning_rate": 0.00018888888888888888, + "loss": 1.0322, + "mean_token_accuracy": 0.7545242831110954, + "num_tokens": 2333156.0, + "step": 186 + }, + { + "entropy": 1.0227366983890533, + "epoch": 0.31166666666666665, + "grad_norm": 0.32479673624038696, + "learning_rate": 0.00018877192982456143, + "loss": 1.0207, + "mean_token_accuracy": 0.7527587786316872, + "num_tokens": 2345652.0, + "step": 187 + }, + { + "entropy": 1.1130978390574455, + "epoch": 0.31333333333333335, + "grad_norm": 0.30757713317871094, + "learning_rate": 0.00018865497076023393, + "loss": 1.1258, + "mean_token_accuracy": 0.7307859510183334, + "num_tokens": 2358342.0, + "step": 188 + }, + { + "entropy": 1.1219107881188393, + "epoch": 0.315, + "grad_norm": 0.39883914589881897, + "learning_rate": 0.00018853801169590643, + "loss": 1.1484, + "mean_token_accuracy": 0.7276914939284325, + "num_tokens": 2370759.0, + "step": 189 + }, + { + "entropy": 1.0377257764339447, + "epoch": 0.31666666666666665, + "grad_norm": 0.3542444705963135, + "learning_rate": 0.00018842105263157898, + "loss": 1.043, + "mean_token_accuracy": 0.755888819694519, + "num_tokens": 2383289.0, + "step": 190 + }, + { + "entropy": 1.0208693370223045, + "epoch": 0.31833333333333336, + "grad_norm": 0.34766149520874023, + "learning_rate": 0.00018830409356725147, + "loss": 0.9811, + "mean_token_accuracy": 0.7647728249430656, + "num_tokens": 2395819.0, + "step": 191 + }, + { + "entropy": 1.116250567138195, + "epoch": 0.32, + "grad_norm": 0.32695358991622925, + "learning_rate": 0.00018818713450292397, + "loss": 1.1336, + "mean_token_accuracy": 0.735762432217598, + "num_tokens": 2408408.0, + "step": 192 + }, + { + "entropy": 1.1328190714120865, + "epoch": 0.32166666666666666, + "grad_norm": 0.36984795331954956, + "learning_rate": 0.00018807017543859652, + "loss": 1.1403, + "mean_token_accuracy": 0.7324722409248352, + "num_tokens": 2420683.0, + "step": 193 + }, + { + "entropy": 1.1805738806724548, + "epoch": 0.3233333333333333, + "grad_norm": 0.44600820541381836, + "learning_rate": 0.00018795321637426901, + "loss": 1.1871, + "mean_token_accuracy": 0.7195275351405144, + "num_tokens": 2433445.0, + "step": 194 + }, + { + "entropy": 1.1397397369146347, + "epoch": 0.325, + "grad_norm": 0.415822833776474, + "learning_rate": 0.0001878362573099415, + "loss": 1.1042, + "mean_token_accuracy": 0.7342669293284416, + "num_tokens": 2446015.0, + "step": 195 + }, + { + "entropy": 1.045611895620823, + "epoch": 0.32666666666666666, + "grad_norm": 0.33927756547927856, + "learning_rate": 0.00018771929824561406, + "loss": 1.0144, + "mean_token_accuracy": 0.7533715888857841, + "num_tokens": 2458426.0, + "step": 196 + }, + { + "entropy": 1.2224002107977867, + "epoch": 0.3283333333333333, + "grad_norm": 0.3514537215232849, + "learning_rate": 0.00018760233918128656, + "loss": 1.198, + "mean_token_accuracy": 0.7203835994005203, + "num_tokens": 2470769.0, + "step": 197 + }, + { + "entropy": 1.091364249587059, + "epoch": 0.33, + "grad_norm": 0.34935829043388367, + "learning_rate": 0.00018748538011695905, + "loss": 1.0844, + "mean_token_accuracy": 0.7347274050116539, + "num_tokens": 2483142.0, + "step": 198 + }, + { + "entropy": 0.9068185538053513, + "epoch": 0.33166666666666667, + "grad_norm": 0.3763393759727478, + "learning_rate": 0.0001873684210526316, + "loss": 0.9062, + "mean_token_accuracy": 0.7771774157881737, + "num_tokens": 2496030.0, + "step": 199 + }, + { + "entropy": 1.1054254174232483, + "epoch": 0.3333333333333333, + "grad_norm": 0.3939693570137024, + "learning_rate": 0.0001872514619883041, + "loss": 1.1066, + "mean_token_accuracy": 0.7332883253693581, + "num_tokens": 2508253.0, + "step": 200 + }, + { + "entropy": 1.0377614423632622, + "epoch": 0.335, + "grad_norm": 0.3749332129955292, + "learning_rate": 0.0001871345029239766, + "loss": 1.0401, + "mean_token_accuracy": 0.7471970319747925, + "num_tokens": 2520898.0, + "step": 201 + }, + { + "entropy": 1.0859627276659012, + "epoch": 0.33666666666666667, + "grad_norm": 0.3993607759475708, + "learning_rate": 0.00018701754385964914, + "loss": 1.0711, + "mean_token_accuracy": 0.7411193326115608, + "num_tokens": 2533581.0, + "step": 202 + }, + { + "entropy": 1.148285612463951, + "epoch": 0.3383333333333333, + "grad_norm": 0.3228718042373657, + "learning_rate": 0.00018690058479532164, + "loss": 1.1805, + "mean_token_accuracy": 0.7245375514030457, + "num_tokens": 2546041.0, + "step": 203 + }, + { + "entropy": 0.9141278266906738, + "epoch": 0.34, + "grad_norm": 0.36570003628730774, + "learning_rate": 0.00018678362573099416, + "loss": 0.8984, + "mean_token_accuracy": 0.7771345153450966, + "num_tokens": 2558399.0, + "step": 204 + }, + { + "entropy": 1.099015660583973, + "epoch": 0.3416666666666667, + "grad_norm": 0.3728063404560089, + "learning_rate": 0.0001866666666666667, + "loss": 1.1063, + "mean_token_accuracy": 0.7349843755364418, + "num_tokens": 2570753.0, + "step": 205 + }, + { + "entropy": 1.0622873678803444, + "epoch": 0.3433333333333333, + "grad_norm": 0.3592352271080017, + "learning_rate": 0.00018654970760233918, + "loss": 1.0546, + "mean_token_accuracy": 0.7437913119792938, + "num_tokens": 2583317.0, + "step": 206 + }, + { + "entropy": 1.0601943358778954, + "epoch": 0.345, + "grad_norm": 0.3239583373069763, + "learning_rate": 0.0001864327485380117, + "loss": 1.0572, + "mean_token_accuracy": 0.7508477568626404, + "num_tokens": 2596203.0, + "step": 207 + }, + { + "entropy": 1.0785654410719872, + "epoch": 0.3466666666666667, + "grad_norm": 0.3292888104915619, + "learning_rate": 0.00018631578947368423, + "loss": 1.0613, + "mean_token_accuracy": 0.7437207996845245, + "num_tokens": 2608688.0, + "step": 208 + }, + { + "entropy": 1.133879691362381, + "epoch": 0.34833333333333333, + "grad_norm": 0.46649813652038574, + "learning_rate": 0.00018619883040935672, + "loss": 1.1081, + "mean_token_accuracy": 0.7371273636817932, + "num_tokens": 2621110.0, + "step": 209 + }, + { + "entropy": 1.0522583425045013, + "epoch": 0.35, + "grad_norm": 0.44118815660476685, + "learning_rate": 0.00018608187134502925, + "loss": 1.0359, + "mean_token_accuracy": 0.7514612525701523, + "num_tokens": 2633452.0, + "step": 210 + }, + { + "entropy": 1.1348539143800735, + "epoch": 0.3516666666666667, + "grad_norm": 0.3560009300708771, + "learning_rate": 0.00018596491228070177, + "loss": 1.1507, + "mean_token_accuracy": 0.7266515046358109, + "num_tokens": 2645787.0, + "step": 211 + }, + { + "entropy": 1.0085133984684944, + "epoch": 0.35333333333333333, + "grad_norm": 0.39134904742240906, + "learning_rate": 0.00018584795321637427, + "loss": 1.0109, + "mean_token_accuracy": 0.7624108791351318, + "num_tokens": 2658104.0, + "step": 212 + }, + { + "entropy": 0.9829937592148781, + "epoch": 0.355, + "grad_norm": 0.3849150836467743, + "learning_rate": 0.0001857309941520468, + "loss": 0.9771, + "mean_token_accuracy": 0.7678255960345268, + "num_tokens": 2670756.0, + "step": 213 + }, + { + "entropy": 1.1382714584469795, + "epoch": 0.3566666666666667, + "grad_norm": 0.3976750075817108, + "learning_rate": 0.0001856140350877193, + "loss": 1.13, + "mean_token_accuracy": 0.7280794978141785, + "num_tokens": 2683295.0, + "step": 214 + }, + { + "entropy": 1.217024527490139, + "epoch": 0.35833333333333334, + "grad_norm": 0.36979228258132935, + "learning_rate": 0.0001854970760233918, + "loss": 1.2515, + "mean_token_accuracy": 0.7075021639466286, + "num_tokens": 2695796.0, + "step": 215 + }, + { + "entropy": 1.189859315752983, + "epoch": 0.36, + "grad_norm": 0.3615109324455261, + "learning_rate": 0.00018538011695906433, + "loss": 1.1924, + "mean_token_accuracy": 0.7155702859163284, + "num_tokens": 2708402.0, + "step": 216 + }, + { + "entropy": 1.1346632614731789, + "epoch": 0.3616666666666667, + "grad_norm": 0.31497734785079956, + "learning_rate": 0.00018526315789473685, + "loss": 1.1414, + "mean_token_accuracy": 0.7290553748607635, + "num_tokens": 2721310.0, + "step": 217 + }, + { + "entropy": 1.0656853094696999, + "epoch": 0.36333333333333334, + "grad_norm": 0.36364349722862244, + "learning_rate": 0.00018514619883040935, + "loss": 1.0464, + "mean_token_accuracy": 0.7473271563649178, + "num_tokens": 2733663.0, + "step": 218 + }, + { + "entropy": 1.267977461218834, + "epoch": 0.365, + "grad_norm": 0.37948599457740784, + "learning_rate": 0.00018502923976608187, + "loss": 1.268, + "mean_token_accuracy": 0.7058289349079132, + "num_tokens": 2746382.0, + "step": 219 + }, + { + "entropy": 1.1866832301020622, + "epoch": 0.36666666666666664, + "grad_norm": 0.34032464027404785, + "learning_rate": 0.0001849122807017544, + "loss": 1.2023, + "mean_token_accuracy": 0.7172464728355408, + "num_tokens": 2758970.0, + "step": 220 + }, + { + "entropy": 1.0964962020516396, + "epoch": 0.36833333333333335, + "grad_norm": 0.34577351808547974, + "learning_rate": 0.0001847953216374269, + "loss": 1.0842, + "mean_token_accuracy": 0.744616910815239, + "num_tokens": 2771288.0, + "step": 221 + }, + { + "entropy": 1.0916159451007843, + "epoch": 0.37, + "grad_norm": 0.32731419801712036, + "learning_rate": 0.00018467836257309942, + "loss": 1.074, + "mean_token_accuracy": 0.739419586956501, + "num_tokens": 2783755.0, + "step": 222 + }, + { + "entropy": 1.1499411910772324, + "epoch": 0.37166666666666665, + "grad_norm": 0.3359861671924591, + "learning_rate": 0.00018456140350877194, + "loss": 1.1354, + "mean_token_accuracy": 0.7299651131033897, + "num_tokens": 2796265.0, + "step": 223 + }, + { + "entropy": 1.2112242728471756, + "epoch": 0.37333333333333335, + "grad_norm": 0.3740575909614563, + "learning_rate": 0.00018444444444444446, + "loss": 1.2154, + "mean_token_accuracy": 0.7131304666399956, + "num_tokens": 2808704.0, + "step": 224 + }, + { + "entropy": 1.105809710919857, + "epoch": 0.375, + "grad_norm": 0.3910123109817505, + "learning_rate": 0.00018432748538011698, + "loss": 1.1179, + "mean_token_accuracy": 0.7317347005009651, + "num_tokens": 2821062.0, + "step": 225 + }, + { + "entropy": 1.1065769121050835, + "epoch": 0.37666666666666665, + "grad_norm": 0.37251028418540955, + "learning_rate": 0.00018421052631578948, + "loss": 1.1162, + "mean_token_accuracy": 0.73953577876091, + "num_tokens": 2833606.0, + "step": 226 + }, + { + "entropy": 1.0926253944635391, + "epoch": 0.37833333333333335, + "grad_norm": 0.3471030294895172, + "learning_rate": 0.000184093567251462, + "loss": 1.0956, + "mean_token_accuracy": 0.7368374243378639, + "num_tokens": 2846358.0, + "step": 227 + }, + { + "entropy": 1.0553656443953514, + "epoch": 0.38, + "grad_norm": 0.3455545902252197, + "learning_rate": 0.00018397660818713453, + "loss": 1.0357, + "mean_token_accuracy": 0.7497663721442223, + "num_tokens": 2858901.0, + "step": 228 + }, + { + "entropy": 1.1180525943636894, + "epoch": 0.38166666666666665, + "grad_norm": 0.426946759223938, + "learning_rate": 0.00018385964912280702, + "loss": 1.1552, + "mean_token_accuracy": 0.7236368507146835, + "num_tokens": 2871380.0, + "step": 229 + }, + { + "entropy": 1.0417871698737144, + "epoch": 0.38333333333333336, + "grad_norm": 0.446646511554718, + "learning_rate": 0.00018374269005847955, + "loss": 1.021, + "mean_token_accuracy": 0.7536017820239067, + "num_tokens": 2883953.0, + "step": 230 + }, + { + "entropy": 1.0821654945611954, + "epoch": 0.385, + "grad_norm": 0.3330538272857666, + "learning_rate": 0.00018362573099415207, + "loss": 1.091, + "mean_token_accuracy": 0.7424333989620209, + "num_tokens": 2896783.0, + "step": 231 + }, + { + "entropy": 1.261072151362896, + "epoch": 0.38666666666666666, + "grad_norm": 0.40883293747901917, + "learning_rate": 0.00018350877192982456, + "loss": 1.2329, + "mean_token_accuracy": 0.7064631283283234, + "num_tokens": 2909412.0, + "step": 232 + }, + { + "entropy": 1.219589687883854, + "epoch": 0.3883333333333333, + "grad_norm": 0.3914692997932434, + "learning_rate": 0.0001833918128654971, + "loss": 1.219, + "mean_token_accuracy": 0.7204131335020065, + "num_tokens": 2921869.0, + "step": 233 + }, + { + "entropy": 1.2079207003116608, + "epoch": 0.39, + "grad_norm": 0.3635447025299072, + "learning_rate": 0.0001832748538011696, + "loss": 1.1823, + "mean_token_accuracy": 0.7185709178447723, + "num_tokens": 2934474.0, + "step": 234 + }, + { + "entropy": 1.0661711767315865, + "epoch": 0.39166666666666666, + "grad_norm": 0.4999198615550995, + "learning_rate": 0.0001831578947368421, + "loss": 1.0022, + "mean_token_accuracy": 0.7570779994130135, + "num_tokens": 2947214.0, + "step": 235 + }, + { + "entropy": 0.9990430921316147, + "epoch": 0.3933333333333333, + "grad_norm": 0.4003547728061676, + "learning_rate": 0.00018304093567251463, + "loss": 0.9514, + "mean_token_accuracy": 0.7665991857647896, + "num_tokens": 2959663.0, + "step": 236 + }, + { + "entropy": 1.0679278895258904, + "epoch": 0.395, + "grad_norm": 0.38362008333206177, + "learning_rate": 0.00018292397660818715, + "loss": 1.0679, + "mean_token_accuracy": 0.7407658472657204, + "num_tokens": 2972227.0, + "step": 237 + }, + { + "entropy": 1.0128286629915237, + "epoch": 0.39666666666666667, + "grad_norm": 0.3753218948841095, + "learning_rate": 0.00018280701754385965, + "loss": 1.0775, + "mean_token_accuracy": 0.7455108985304832, + "num_tokens": 2984809.0, + "step": 238 + }, + { + "entropy": 1.2022801265120506, + "epoch": 0.3983333333333333, + "grad_norm": 0.4850371778011322, + "learning_rate": 0.00018269005847953217, + "loss": 1.2478, + "mean_token_accuracy": 0.7030462697148323, + "num_tokens": 2997396.0, + "step": 239 + }, + { + "entropy": 1.0180853754281998, + "epoch": 0.4, + "grad_norm": 0.3701488971710205, + "learning_rate": 0.0001825730994152047, + "loss": 1.0154, + "mean_token_accuracy": 0.7558140829205513, + "num_tokens": 3009921.0, + "step": 240 + }, + { + "entropy": 1.0700580030679703, + "epoch": 0.40166666666666667, + "grad_norm": 0.3202139437198639, + "learning_rate": 0.0001824561403508772, + "loss": 1.0867, + "mean_token_accuracy": 0.7432869449257851, + "num_tokens": 3022468.0, + "step": 241 + }, + { + "entropy": 1.0071598812937737, + "epoch": 0.4033333333333333, + "grad_norm": 0.42016497254371643, + "learning_rate": 0.0001823391812865497, + "loss": 1.0128, + "mean_token_accuracy": 0.7569889947772026, + "num_tokens": 3035124.0, + "step": 242 + }, + { + "entropy": 1.1294294893741608, + "epoch": 0.405, + "grad_norm": 0.3567630648612976, + "learning_rate": 0.00018222222222222224, + "loss": 1.1274, + "mean_token_accuracy": 0.7349538579583168, + "num_tokens": 3047710.0, + "step": 243 + }, + { + "entropy": 1.1060679331421852, + "epoch": 0.4066666666666667, + "grad_norm": 0.34659335017204285, + "learning_rate": 0.00018210526315789476, + "loss": 1.1141, + "mean_token_accuracy": 0.7334257215261459, + "num_tokens": 3060375.0, + "step": 244 + }, + { + "entropy": 1.210126355290413, + "epoch": 0.4083333333333333, + "grad_norm": 0.4510009288787842, + "learning_rate": 0.00018198830409356726, + "loss": 1.1887, + "mean_token_accuracy": 0.7228007987141609, + "num_tokens": 3072667.0, + "step": 245 + }, + { + "entropy": 0.9602404832839966, + "epoch": 0.41, + "grad_norm": 0.4799667298793793, + "learning_rate": 0.00018187134502923978, + "loss": 0.928, + "mean_token_accuracy": 0.7675070241093636, + "num_tokens": 3085435.0, + "step": 246 + }, + { + "entropy": 1.0962599590420723, + "epoch": 0.4116666666666667, + "grad_norm": 0.3477798402309418, + "learning_rate": 0.0001817543859649123, + "loss": 1.0924, + "mean_token_accuracy": 0.7440480887889862, + "num_tokens": 3097879.0, + "step": 247 + }, + { + "entropy": 1.1427634581923485, + "epoch": 0.41333333333333333, + "grad_norm": 0.343257337808609, + "learning_rate": 0.0001816374269005848, + "loss": 1.1493, + "mean_token_accuracy": 0.7298817038536072, + "num_tokens": 3110255.0, + "step": 248 + }, + { + "entropy": 1.0447398945689201, + "epoch": 0.415, + "grad_norm": 0.49843472242355347, + "learning_rate": 0.00018152046783625732, + "loss": 1.0298, + "mean_token_accuracy": 0.7523345276713371, + "num_tokens": 3122802.0, + "step": 249 + }, + { + "entropy": 1.1335030645132065, + "epoch": 0.4166666666666667, + "grad_norm": 0.3733726739883423, + "learning_rate": 0.00018140350877192984, + "loss": 1.1147, + "mean_token_accuracy": 0.7301384806632996, + "num_tokens": 3135401.0, + "step": 250 + }, + { + "entropy": 1.1334904357790947, + "epoch": 0.41833333333333333, + "grad_norm": 0.3926542103290558, + "learning_rate": 0.00018128654970760234, + "loss": 1.1316, + "mean_token_accuracy": 0.735007993876934, + "num_tokens": 3147925.0, + "step": 251 + }, + { + "entropy": 0.9874943792819977, + "epoch": 0.42, + "grad_norm": 0.5496231317520142, + "learning_rate": 0.00018116959064327486, + "loss": 0.9596, + "mean_token_accuracy": 0.7662070468068123, + "num_tokens": 3160379.0, + "step": 252 + }, + { + "entropy": 0.9868237897753716, + "epoch": 0.4216666666666667, + "grad_norm": 0.46299904584884644, + "learning_rate": 0.00018105263157894739, + "loss": 0.9998, + "mean_token_accuracy": 0.7589811682701111, + "num_tokens": 3172837.0, + "step": 253 + }, + { + "entropy": 0.9789915308356285, + "epoch": 0.42333333333333334, + "grad_norm": 0.3225744068622589, + "learning_rate": 0.00018093567251461988, + "loss": 0.9982, + "mean_token_accuracy": 0.7591699734330177, + "num_tokens": 3185252.0, + "step": 254 + }, + { + "entropy": 1.1388946995139122, + "epoch": 0.425, + "grad_norm": 0.4054366648197174, + "learning_rate": 0.0001808187134502924, + "loss": 1.149, + "mean_token_accuracy": 0.7230332866311073, + "num_tokens": 3197900.0, + "step": 255 + }, + { + "entropy": 1.0267946869134903, + "epoch": 0.4266666666666667, + "grad_norm": 0.43072929978370667, + "learning_rate": 0.00018070175438596493, + "loss": 1.0254, + "mean_token_accuracy": 0.7437686920166016, + "num_tokens": 3210354.0, + "step": 256 + }, + { + "entropy": 1.151397317647934, + "epoch": 0.42833333333333334, + "grad_norm": 0.5497655272483826, + "learning_rate": 0.00018058479532163742, + "loss": 1.1575, + "mean_token_accuracy": 0.7231586053967476, + "num_tokens": 3222849.0, + "step": 257 + }, + { + "entropy": 0.9719003960490227, + "epoch": 0.43, + "grad_norm": 0.39777350425720215, + "learning_rate": 0.00018046783625730995, + "loss": 0.9829, + "mean_token_accuracy": 0.76484714448452, + "num_tokens": 3235515.0, + "step": 258 + }, + { + "entropy": 1.1252076029777527, + "epoch": 0.43166666666666664, + "grad_norm": 0.5205410718917847, + "learning_rate": 0.00018035087719298247, + "loss": 1.1477, + "mean_token_accuracy": 0.73708376288414, + "num_tokens": 3248200.0, + "step": 259 + }, + { + "entropy": 1.1057978570461273, + "epoch": 0.43333333333333335, + "grad_norm": 0.35812073945999146, + "learning_rate": 0.00018023391812865497, + "loss": 1.0672, + "mean_token_accuracy": 0.7432841360569, + "num_tokens": 3260782.0, + "step": 260 + }, + { + "entropy": 1.1699188724160194, + "epoch": 0.435, + "grad_norm": 0.4195549786090851, + "learning_rate": 0.0001801169590643275, + "loss": 1.1592, + "mean_token_accuracy": 0.720735527575016, + "num_tokens": 3273231.0, + "step": 261 + }, + { + "entropy": 1.0379428714513779, + "epoch": 0.43666666666666665, + "grad_norm": 0.7551639080047607, + "learning_rate": 0.00018, + "loss": 1.0302, + "mean_token_accuracy": 0.7501674890518188, + "num_tokens": 3285787.0, + "step": 262 + }, + { + "entropy": 1.071130983531475, + "epoch": 0.43833333333333335, + "grad_norm": 0.4712306261062622, + "learning_rate": 0.0001798830409356725, + "loss": 1.0726, + "mean_token_accuracy": 0.7447422966361046, + "num_tokens": 3298694.0, + "step": 263 + }, + { + "entropy": 1.2623703926801682, + "epoch": 0.44, + "grad_norm": 0.33710184693336487, + "learning_rate": 0.00017976608187134503, + "loss": 1.2761, + "mean_token_accuracy": 0.7032147943973541, + "num_tokens": 3311387.0, + "step": 264 + }, + { + "entropy": 1.1410105228424072, + "epoch": 0.44166666666666665, + "grad_norm": 0.6147916913032532, + "learning_rate": 0.00017964912280701755, + "loss": 1.1392, + "mean_token_accuracy": 0.7260777652263641, + "num_tokens": 3324108.0, + "step": 265 + }, + { + "entropy": 1.1146948486566544, + "epoch": 0.44333333333333336, + "grad_norm": 0.4302304983139038, + "learning_rate": 0.00017953216374269005, + "loss": 1.1261, + "mean_token_accuracy": 0.7268940955400467, + "num_tokens": 3336483.0, + "step": 266 + }, + { + "entropy": 1.1186750009655952, + "epoch": 0.445, + "grad_norm": 0.33564555644989014, + "learning_rate": 0.0001794152046783626, + "loss": 1.1414, + "mean_token_accuracy": 0.7282785773277283, + "num_tokens": 3349243.0, + "step": 267 + }, + { + "entropy": 1.135681688785553, + "epoch": 0.44666666666666666, + "grad_norm": 0.3261569142341614, + "learning_rate": 0.0001792982456140351, + "loss": 1.112, + "mean_token_accuracy": 0.7374716177582741, + "num_tokens": 3361738.0, + "step": 268 + }, + { + "entropy": 1.017588496208191, + "epoch": 0.4483333333333333, + "grad_norm": 0.3962899446487427, + "learning_rate": 0.00017918128654970762, + "loss": 0.9988, + "mean_token_accuracy": 0.760860413312912, + "num_tokens": 3374177.0, + "step": 269 + }, + { + "entropy": 1.1711449921131134, + "epoch": 0.45, + "grad_norm": 0.43412500619888306, + "learning_rate": 0.00017906432748538014, + "loss": 1.1504, + "mean_token_accuracy": 0.724379375576973, + "num_tokens": 3386986.0, + "step": 270 + }, + { + "entropy": 0.9756506755948067, + "epoch": 0.45166666666666666, + "grad_norm": 0.3354060649871826, + "learning_rate": 0.00017894736842105264, + "loss": 0.9702, + "mean_token_accuracy": 0.7611405923962593, + "num_tokens": 3399391.0, + "step": 271 + }, + { + "entropy": 1.1847113892436028, + "epoch": 0.4533333333333333, + "grad_norm": 0.3396911919116974, + "learning_rate": 0.00017883040935672516, + "loss": 1.1831, + "mean_token_accuracy": 0.7153457403182983, + "num_tokens": 3411895.0, + "step": 272 + }, + { + "entropy": 1.11505925655365, + "epoch": 0.455, + "grad_norm": 0.4412688910961151, + "learning_rate": 0.00017871345029239768, + "loss": 1.1175, + "mean_token_accuracy": 0.7404645457863808, + "num_tokens": 3424508.0, + "step": 273 + }, + { + "entropy": 1.1529017016291618, + "epoch": 0.45666666666666667, + "grad_norm": 0.37485653162002563, + "learning_rate": 0.00017859649122807018, + "loss": 1.1488, + "mean_token_accuracy": 0.7300072684884071, + "num_tokens": 3437006.0, + "step": 274 + }, + { + "entropy": 1.0116957277059555, + "epoch": 0.4583333333333333, + "grad_norm": 0.3534213900566101, + "learning_rate": 0.0001784795321637427, + "loss": 1.0102, + "mean_token_accuracy": 0.7638874277472496, + "num_tokens": 3449704.0, + "step": 275 + }, + { + "entropy": 0.9486217200756073, + "epoch": 0.46, + "grad_norm": 0.4339519143104553, + "learning_rate": 0.00017836257309941523, + "loss": 0.9528, + "mean_token_accuracy": 0.7683136314153671, + "num_tokens": 3462323.0, + "step": 276 + }, + { + "entropy": 1.1378100514411926, + "epoch": 0.46166666666666667, + "grad_norm": 0.410174697637558, + "learning_rate": 0.00017824561403508772, + "loss": 1.143, + "mean_token_accuracy": 0.7271644920110703, + "num_tokens": 3474728.0, + "step": 277 + }, + { + "entropy": 1.035399042069912, + "epoch": 0.4633333333333333, + "grad_norm": 0.36434057354927063, + "learning_rate": 0.00017812865497076024, + "loss": 1.0336, + "mean_token_accuracy": 0.7525843381881714, + "num_tokens": 3487241.0, + "step": 278 + }, + { + "entropy": 1.035888947546482, + "epoch": 0.465, + "grad_norm": 0.3117313086986542, + "learning_rate": 0.00017801169590643277, + "loss": 1.0461, + "mean_token_accuracy": 0.7474090084433556, + "num_tokens": 3499778.0, + "step": 279 + }, + { + "entropy": 0.9683891534805298, + "epoch": 0.4666666666666667, + "grad_norm": 0.3829636871814728, + "learning_rate": 0.00017789473684210526, + "loss": 0.9695, + "mean_token_accuracy": 0.7638266384601593, + "num_tokens": 3512453.0, + "step": 280 + }, + { + "entropy": 1.0482271388173103, + "epoch": 0.4683333333333333, + "grad_norm": 0.4052916169166565, + "learning_rate": 0.00017777777777777779, + "loss": 1.0454, + "mean_token_accuracy": 0.7466916441917419, + "num_tokens": 3525098.0, + "step": 281 + }, + { + "entropy": 0.9748341217637062, + "epoch": 0.47, + "grad_norm": 0.44575342535972595, + "learning_rate": 0.0001776608187134503, + "loss": 0.9795, + "mean_token_accuracy": 0.7599806860089302, + "num_tokens": 3537524.0, + "step": 282 + }, + { + "entropy": 1.153599664568901, + "epoch": 0.4716666666666667, + "grad_norm": 0.37257784605026245, + "learning_rate": 0.0001775438596491228, + "loss": 1.1518, + "mean_token_accuracy": 0.7194614708423615, + "num_tokens": 3550045.0, + "step": 283 + }, + { + "entropy": 1.004349708557129, + "epoch": 0.47333333333333333, + "grad_norm": 0.38646551966667175, + "learning_rate": 0.00017742690058479533, + "loss": 0.9497, + "mean_token_accuracy": 0.7630625516176224, + "num_tokens": 3562622.0, + "step": 284 + }, + { + "entropy": 0.9980553165078163, + "epoch": 0.475, + "grad_norm": 0.49010729789733887, + "learning_rate": 0.00017730994152046785, + "loss": 0.975, + "mean_token_accuracy": 0.7679460272192955, + "num_tokens": 3575340.0, + "step": 285 + }, + { + "entropy": 1.088503360748291, + "epoch": 0.4766666666666667, + "grad_norm": 0.5452380180358887, + "learning_rate": 0.00017719298245614035, + "loss": 1.1096, + "mean_token_accuracy": 0.7412427291274071, + "num_tokens": 3587845.0, + "step": 286 + }, + { + "entropy": 1.0458046644926071, + "epoch": 0.47833333333333333, + "grad_norm": 0.6659599542617798, + "learning_rate": 0.0001770760233918129, + "loss": 1.048, + "mean_token_accuracy": 0.7454439774155617, + "num_tokens": 3600369.0, + "step": 287 + }, + { + "entropy": 1.1275902390480042, + "epoch": 0.48, + "grad_norm": 0.4208016097545624, + "learning_rate": 0.0001769590643274854, + "loss": 1.1682, + "mean_token_accuracy": 0.7288567647337914, + "num_tokens": 3612814.0, + "step": 288 + }, + { + "entropy": 1.1426914036273956, + "epoch": 0.4816666666666667, + "grad_norm": 0.5147913694381714, + "learning_rate": 0.0001768421052631579, + "loss": 1.1828, + "mean_token_accuracy": 0.723309837281704, + "num_tokens": 3625180.0, + "step": 289 + }, + { + "entropy": 1.0466126427054405, + "epoch": 0.48333333333333334, + "grad_norm": 0.5052932500839233, + "learning_rate": 0.00017672514619883044, + "loss": 1.0574, + "mean_token_accuracy": 0.7450472787022591, + "num_tokens": 3637813.0, + "step": 290 + }, + { + "entropy": 1.0919866040349007, + "epoch": 0.485, + "grad_norm": 0.3577198386192322, + "learning_rate": 0.00017660818713450294, + "loss": 1.0871, + "mean_token_accuracy": 0.7373141944408417, + "num_tokens": 3650370.0, + "step": 291 + }, + { + "entropy": 1.0942333936691284, + "epoch": 0.4866666666666667, + "grad_norm": 0.438251256942749, + "learning_rate": 0.00017649122807017543, + "loss": 1.088, + "mean_token_accuracy": 0.7383796274662018, + "num_tokens": 3663061.0, + "step": 292 + }, + { + "entropy": 0.9413135126233101, + "epoch": 0.48833333333333334, + "grad_norm": 0.44067561626434326, + "learning_rate": 0.00017637426900584798, + "loss": 0.9098, + "mean_token_accuracy": 0.7814988046884537, + "num_tokens": 3675856.0, + "step": 293 + }, + { + "entropy": 1.0296655967831612, + "epoch": 0.49, + "grad_norm": 0.3933659791946411, + "learning_rate": 0.00017625730994152048, + "loss": 1.0182, + "mean_token_accuracy": 0.7536342963576317, + "num_tokens": 3688322.0, + "step": 294 + }, + { + "entropy": 1.203233003616333, + "epoch": 0.49166666666666664, + "grad_norm": 0.3295867443084717, + "learning_rate": 0.00017614035087719297, + "loss": 1.1907, + "mean_token_accuracy": 0.7196495458483696, + "num_tokens": 3700776.0, + "step": 295 + }, + { + "entropy": 0.968670666217804, + "epoch": 0.49333333333333335, + "grad_norm": 0.3442942202091217, + "learning_rate": 0.00017602339181286552, + "loss": 0.9705, + "mean_token_accuracy": 0.7706414982676506, + "num_tokens": 3712967.0, + "step": 296 + }, + { + "entropy": 1.0014533996582031, + "epoch": 0.495, + "grad_norm": 0.41025930643081665, + "learning_rate": 0.00017590643274853802, + "loss": 0.9909, + "mean_token_accuracy": 0.7594960108399391, + "num_tokens": 3725548.0, + "step": 297 + }, + { + "entropy": 0.9800790995359421, + "epoch": 0.49666666666666665, + "grad_norm": 0.35484063625335693, + "learning_rate": 0.00017578947368421052, + "loss": 0.9636, + "mean_token_accuracy": 0.765408881008625, + "num_tokens": 3738348.0, + "step": 298 + }, + { + "entropy": 1.0904498919844627, + "epoch": 0.49833333333333335, + "grad_norm": 0.3571205139160156, + "learning_rate": 0.00017567251461988307, + "loss": 1.1312, + "mean_token_accuracy": 0.735343262553215, + "num_tokens": 3750904.0, + "step": 299 + }, + { + "entropy": 0.8727906718850136, + "epoch": 0.5, + "grad_norm": 0.3699484169483185, + "learning_rate": 0.00017555555555555556, + "loss": 0.8909, + "mean_token_accuracy": 0.7798029482364655, + "num_tokens": 3763871.0, + "step": 300 + }, + { + "entropy": 1.032260425388813, + "epoch": 0.5016666666666667, + "grad_norm": 0.31954166293144226, + "learning_rate": 0.00017543859649122806, + "loss": 1.0508, + "mean_token_accuracy": 0.7496752962470055, + "num_tokens": 3776386.0, + "step": 301 + }, + { + "entropy": 0.9070574641227722, + "epoch": 0.5033333333333333, + "grad_norm": 0.3604190945625305, + "learning_rate": 0.0001753216374269006, + "loss": 0.9088, + "mean_token_accuracy": 0.7788211852312088, + "num_tokens": 3789088.0, + "step": 302 + }, + { + "entropy": 1.0767110586166382, + "epoch": 0.505, + "grad_norm": 0.3933933675289154, + "learning_rate": 0.0001752046783625731, + "loss": 1.0663, + "mean_token_accuracy": 0.7495614886283875, + "num_tokens": 3801836.0, + "step": 303 + }, + { + "entropy": 1.0157058015465736, + "epoch": 0.5066666666666667, + "grad_norm": 0.3408040404319763, + "learning_rate": 0.0001750877192982456, + "loss": 1.0113, + "mean_token_accuracy": 0.746182844042778, + "num_tokens": 3814328.0, + "step": 304 + }, + { + "entropy": 1.2159147933125496, + "epoch": 0.5083333333333333, + "grad_norm": 0.3778250217437744, + "learning_rate": 0.00017497076023391815, + "loss": 1.2088, + "mean_token_accuracy": 0.7146832719445229, + "num_tokens": 3827003.0, + "step": 305 + }, + { + "entropy": 1.103552520275116, + "epoch": 0.51, + "grad_norm": 0.3790249526500702, + "learning_rate": 0.00017485380116959065, + "loss": 1.0762, + "mean_token_accuracy": 0.7435255199670792, + "num_tokens": 3839330.0, + "step": 306 + }, + { + "entropy": 0.9754323288798332, + "epoch": 0.5116666666666667, + "grad_norm": 0.3479340970516205, + "learning_rate": 0.00017473684210526317, + "loss": 0.9591, + "mean_token_accuracy": 0.7697796747088432, + "num_tokens": 3851732.0, + "step": 307 + }, + { + "entropy": 1.0752006620168686, + "epoch": 0.5133333333333333, + "grad_norm": 0.36727213859558105, + "learning_rate": 0.0001746198830409357, + "loss": 1.076, + "mean_token_accuracy": 0.745739258825779, + "num_tokens": 3864226.0, + "step": 308 + }, + { + "entropy": 1.0623352229595184, + "epoch": 0.515, + "grad_norm": 0.5051237940788269, + "learning_rate": 0.0001745029239766082, + "loss": 1.0474, + "mean_token_accuracy": 0.7463502958416939, + "num_tokens": 3876593.0, + "step": 309 + }, + { + "entropy": 1.212167464196682, + "epoch": 0.5166666666666667, + "grad_norm": 0.39934271574020386, + "learning_rate": 0.0001743859649122807, + "loss": 1.2181, + "mean_token_accuracy": 0.7197644412517548, + "num_tokens": 3889321.0, + "step": 310 + }, + { + "entropy": 1.1309688091278076, + "epoch": 0.5183333333333333, + "grad_norm": 0.3818771243095398, + "learning_rate": 0.00017426900584795323, + "loss": 1.1528, + "mean_token_accuracy": 0.7310786545276642, + "num_tokens": 3901680.0, + "step": 311 + }, + { + "entropy": 1.0219652131199837, + "epoch": 0.52, + "grad_norm": 0.38632193207740784, + "learning_rate": 0.00017415204678362573, + "loss": 0.9868, + "mean_token_accuracy": 0.75808484852314, + "num_tokens": 3914432.0, + "step": 312 + }, + { + "entropy": 1.2867136895656586, + "epoch": 0.5216666666666666, + "grad_norm": 0.3463575839996338, + "learning_rate": 0.00017403508771929825, + "loss": 1.3031, + "mean_token_accuracy": 0.6908155083656311, + "num_tokens": 3926978.0, + "step": 313 + }, + { + "entropy": 1.0409365370869637, + "epoch": 0.5233333333333333, + "grad_norm": 0.3130481541156769, + "learning_rate": 0.00017391812865497078, + "loss": 1.0231, + "mean_token_accuracy": 0.7590188607573509, + "num_tokens": 3939332.0, + "step": 314 + }, + { + "entropy": 1.1452669128775597, + "epoch": 0.525, + "grad_norm": 0.3694944679737091, + "learning_rate": 0.00017380116959064327, + "loss": 1.0962, + "mean_token_accuracy": 0.7332403659820557, + "num_tokens": 3951894.0, + "step": 315 + }, + { + "entropy": 1.03856360912323, + "epoch": 0.5266666666666666, + "grad_norm": 0.363551527261734, + "learning_rate": 0.0001736842105263158, + "loss": 1.0056, + "mean_token_accuracy": 0.7608824819326401, + "num_tokens": 3964580.0, + "step": 316 + }, + { + "entropy": 1.2034416571259499, + "epoch": 0.5283333333333333, + "grad_norm": 0.4383612275123596, + "learning_rate": 0.00017356725146198832, + "loss": 1.1768, + "mean_token_accuracy": 0.7235340550541878, + "num_tokens": 3976928.0, + "step": 317 + }, + { + "entropy": 1.0424899756908417, + "epoch": 0.53, + "grad_norm": 0.4106066823005676, + "learning_rate": 0.0001734502923976608, + "loss": 1.0418, + "mean_token_accuracy": 0.7455758079886436, + "num_tokens": 3989698.0, + "step": 318 + }, + { + "entropy": 1.0708102360367775, + "epoch": 0.5316666666666666, + "grad_norm": 0.3656046688556671, + "learning_rate": 0.00017333333333333334, + "loss": 1.0867, + "mean_token_accuracy": 0.739653930068016, + "num_tokens": 4001878.0, + "step": 319 + }, + { + "entropy": 1.0080599710345268, + "epoch": 0.5333333333333333, + "grad_norm": 0.39112988114356995, + "learning_rate": 0.00017321637426900586, + "loss": 1.0352, + "mean_token_accuracy": 0.753649964928627, + "num_tokens": 4014519.0, + "step": 320 + }, + { + "entropy": 1.184828795492649, + "epoch": 0.535, + "grad_norm": 0.7222509980201721, + "learning_rate": 0.00017309941520467836, + "loss": 1.2284, + "mean_token_accuracy": 0.7146416530013084, + "num_tokens": 4026885.0, + "step": 321 + }, + { + "entropy": 1.1485476717352867, + "epoch": 0.5366666666666666, + "grad_norm": 0.4598398506641388, + "learning_rate": 0.00017298245614035088, + "loss": 1.1697, + "mean_token_accuracy": 0.7225939184427261, + "num_tokens": 4039427.0, + "step": 322 + }, + { + "entropy": 1.0673358216881752, + "epoch": 0.5383333333333333, + "grad_norm": 0.3209613263607025, + "learning_rate": 0.0001728654970760234, + "loss": 1.0625, + "mean_token_accuracy": 0.7471172362565994, + "num_tokens": 4052273.0, + "step": 323 + }, + { + "entropy": 1.103698968887329, + "epoch": 0.54, + "grad_norm": 0.4327990412712097, + "learning_rate": 0.0001727485380116959, + "loss": 1.1195, + "mean_token_accuracy": 0.7404018118977547, + "num_tokens": 4064623.0, + "step": 324 + }, + { + "entropy": 0.978014774620533, + "epoch": 0.5416666666666666, + "grad_norm": 0.48123371601104736, + "learning_rate": 0.00017263157894736842, + "loss": 0.9427, + "mean_token_accuracy": 0.76978749781847, + "num_tokens": 4077102.0, + "step": 325 + }, + { + "entropy": 1.196822389960289, + "epoch": 0.5433333333333333, + "grad_norm": 0.4324086308479309, + "learning_rate": 0.00017251461988304094, + "loss": 1.1894, + "mean_token_accuracy": 0.722084753215313, + "num_tokens": 4089599.0, + "step": 326 + }, + { + "entropy": 1.194992557168007, + "epoch": 0.545, + "grad_norm": 0.31563282012939453, + "learning_rate": 0.00017239766081871347, + "loss": 1.1794, + "mean_token_accuracy": 0.7149224281311035, + "num_tokens": 4101954.0, + "step": 327 + }, + { + "entropy": 1.1106646209955215, + "epoch": 0.5466666666666666, + "grad_norm": 0.3899517357349396, + "learning_rate": 0.000172280701754386, + "loss": 1.0646, + "mean_token_accuracy": 0.7455310076475143, + "num_tokens": 4114386.0, + "step": 328 + }, + { + "entropy": 1.0722733810544014, + "epoch": 0.5483333333333333, + "grad_norm": 0.4056398570537567, + "learning_rate": 0.00017216374269005849, + "loss": 1.0508, + "mean_token_accuracy": 0.7531849294900894, + "num_tokens": 4126890.0, + "step": 329 + }, + { + "entropy": 1.180145487189293, + "epoch": 0.55, + "grad_norm": 0.4245923161506653, + "learning_rate": 0.000172046783625731, + "loss": 1.1845, + "mean_token_accuracy": 0.7225575000047684, + "num_tokens": 4139334.0, + "step": 330 + }, + { + "entropy": 1.0642430186271667, + "epoch": 0.5516666666666666, + "grad_norm": 0.33601292967796326, + "learning_rate": 0.00017192982456140353, + "loss": 1.0667, + "mean_token_accuracy": 0.7459002658724785, + "num_tokens": 4152125.0, + "step": 331 + }, + { + "entropy": 1.1839115172624588, + "epoch": 0.5533333333333333, + "grad_norm": 0.41593268513679504, + "learning_rate": 0.00017181286549707603, + "loss": 1.1974, + "mean_token_accuracy": 0.7193257734179497, + "num_tokens": 4164994.0, + "step": 332 + }, + { + "entropy": 1.013751097023487, + "epoch": 0.555, + "grad_norm": 0.5192084312438965, + "learning_rate": 0.00017169590643274855, + "loss": 1.0265, + "mean_token_accuracy": 0.7517165392637253, + "num_tokens": 4177479.0, + "step": 333 + }, + { + "entropy": 1.0605740025639534, + "epoch": 0.5566666666666666, + "grad_norm": 0.48862767219543457, + "learning_rate": 0.00017157894736842107, + "loss": 1.087, + "mean_token_accuracy": 0.7423926591873169, + "num_tokens": 4189789.0, + "step": 334 + }, + { + "entropy": 0.9547868818044662, + "epoch": 0.5583333333333333, + "grad_norm": 0.3805651068687439, + "learning_rate": 0.00017146198830409357, + "loss": 0.9617, + "mean_token_accuracy": 0.7645149901509285, + "num_tokens": 4202221.0, + "step": 335 + }, + { + "entropy": 1.1145575419068336, + "epoch": 0.56, + "grad_norm": 0.4303564429283142, + "learning_rate": 0.0001713450292397661, + "loss": 1.1506, + "mean_token_accuracy": 0.7287673428654671, + "num_tokens": 4214721.0, + "step": 336 + }, + { + "entropy": 1.0919224098324776, + "epoch": 0.5616666666666666, + "grad_norm": 0.3729289770126343, + "learning_rate": 0.00017122807017543862, + "loss": 1.106, + "mean_token_accuracy": 0.7369844168424606, + "num_tokens": 4227452.0, + "step": 337 + }, + { + "entropy": 1.1665791720151901, + "epoch": 0.5633333333333334, + "grad_norm": 0.4484737813472748, + "learning_rate": 0.0001711111111111111, + "loss": 1.1683, + "mean_token_accuracy": 0.7236492782831192, + "num_tokens": 4240203.0, + "step": 338 + }, + { + "entropy": 1.0452025011181831, + "epoch": 0.565, + "grad_norm": 0.45528316497802734, + "learning_rate": 0.00017099415204678363, + "loss": 1.025, + "mean_token_accuracy": 0.7446356862783432, + "num_tokens": 4252566.0, + "step": 339 + }, + { + "entropy": 1.2566066607832909, + "epoch": 0.5666666666666667, + "grad_norm": 0.33891013264656067, + "learning_rate": 0.00017087719298245616, + "loss": 1.2393, + "mean_token_accuracy": 0.7148144468665123, + "num_tokens": 4264997.0, + "step": 340 + }, + { + "entropy": 1.083845317363739, + "epoch": 0.5683333333333334, + "grad_norm": 0.3768393099308014, + "learning_rate": 0.00017076023391812865, + "loss": 1.0713, + "mean_token_accuracy": 0.7463406249880791, + "num_tokens": 4277370.0, + "step": 341 + }, + { + "entropy": 1.0517774820327759, + "epoch": 0.57, + "grad_norm": 0.41828426718711853, + "learning_rate": 0.00017064327485380118, + "loss": 1.0319, + "mean_token_accuracy": 0.7481891736388206, + "num_tokens": 4289689.0, + "step": 342 + }, + { + "entropy": 1.0900000855326653, + "epoch": 0.5716666666666667, + "grad_norm": 0.34837833046913147, + "learning_rate": 0.0001705263157894737, + "loss": 1.0622, + "mean_token_accuracy": 0.7406937256455421, + "num_tokens": 4302237.0, + "step": 343 + }, + { + "entropy": 1.1605029106140137, + "epoch": 0.5733333333333334, + "grad_norm": 0.4013015329837799, + "learning_rate": 0.0001704093567251462, + "loss": 1.155, + "mean_token_accuracy": 0.7249374613165855, + "num_tokens": 4314466.0, + "step": 344 + }, + { + "entropy": 1.1528219133615494, + "epoch": 0.575, + "grad_norm": 0.3658445477485657, + "learning_rate": 0.00017029239766081872, + "loss": 1.1334, + "mean_token_accuracy": 0.7317510321736336, + "num_tokens": 4326923.0, + "step": 345 + }, + { + "entropy": 0.9520823210477829, + "epoch": 0.5766666666666667, + "grad_norm": 0.3611161410808563, + "learning_rate": 0.00017017543859649124, + "loss": 0.9579, + "mean_token_accuracy": 0.7622030377388, + "num_tokens": 4339742.0, + "step": 346 + }, + { + "entropy": 1.050878219306469, + "epoch": 0.5783333333333334, + "grad_norm": 0.34087908267974854, + "learning_rate": 0.00017005847953216376, + "loss": 1.0817, + "mean_token_accuracy": 0.7354206740856171, + "num_tokens": 4352376.0, + "step": 347 + }, + { + "entropy": 0.9521585963666439, + "epoch": 0.58, + "grad_norm": 0.33794334530830383, + "learning_rate": 0.00016994152046783626, + "loss": 0.9943, + "mean_token_accuracy": 0.756027527153492, + "num_tokens": 4365003.0, + "step": 348 + }, + { + "entropy": 1.0915799364447594, + "epoch": 0.5816666666666667, + "grad_norm": 0.40143731236457825, + "learning_rate": 0.00016982456140350878, + "loss": 1.0984, + "mean_token_accuracy": 0.7424183115363121, + "num_tokens": 4377303.0, + "step": 349 + }, + { + "entropy": 0.9480448961257935, + "epoch": 0.5833333333333334, + "grad_norm": 0.4219553470611572, + "learning_rate": 0.0001697076023391813, + "loss": 0.9469, + "mean_token_accuracy": 0.770964540541172, + "num_tokens": 4390015.0, + "step": 350 + }, + { + "entropy": 1.147115521132946, + "epoch": 0.585, + "grad_norm": 0.3754667043685913, + "learning_rate": 0.0001695906432748538, + "loss": 1.1543, + "mean_token_accuracy": 0.7254085168242455, + "num_tokens": 4402733.0, + "step": 351 + }, + { + "entropy": 1.253534510731697, + "epoch": 0.5866666666666667, + "grad_norm": 5.795986652374268, + "learning_rate": 0.00016947368421052633, + "loss": 1.2822, + "mean_token_accuracy": 0.7053791284561157, + "num_tokens": 4415352.0, + "step": 352 + }, + { + "entropy": 1.1879041716456413, + "epoch": 0.5883333333333334, + "grad_norm": 0.3433243930339813, + "learning_rate": 0.00016935672514619885, + "loss": 1.1967, + "mean_token_accuracy": 0.716492310166359, + "num_tokens": 4428044.0, + "step": 353 + }, + { + "entropy": 1.1484075263142586, + "epoch": 0.59, + "grad_norm": 0.3310488760471344, + "learning_rate": 0.00016923976608187134, + "loss": 1.1446, + "mean_token_accuracy": 0.733469732105732, + "num_tokens": 4440325.0, + "step": 354 + }, + { + "entropy": 1.0385578274726868, + "epoch": 0.5916666666666667, + "grad_norm": 0.36995747685432434, + "learning_rate": 0.00016912280701754387, + "loss": 1.0412, + "mean_token_accuracy": 0.7494801208376884, + "num_tokens": 4452827.0, + "step": 355 + }, + { + "entropy": 1.0025876611471176, + "epoch": 0.5933333333333334, + "grad_norm": 0.3320871889591217, + "learning_rate": 0.0001690058479532164, + "loss": 1.0015, + "mean_token_accuracy": 0.7598021700978279, + "num_tokens": 4465154.0, + "step": 356 + }, + { + "entropy": 0.9714228957891464, + "epoch": 0.595, + "grad_norm": 0.3272383511066437, + "learning_rate": 0.00016888888888888889, + "loss": 0.9632, + "mean_token_accuracy": 0.7730072066187859, + "num_tokens": 4477940.0, + "step": 357 + }, + { + "entropy": 0.9843970835208893, + "epoch": 0.5966666666666667, + "grad_norm": 0.331716924905777, + "learning_rate": 0.0001687719298245614, + "loss": 0.9832, + "mean_token_accuracy": 0.7648526951670647, + "num_tokens": 4490324.0, + "step": 358 + }, + { + "entropy": 1.0355599969625473, + "epoch": 0.5983333333333334, + "grad_norm": 0.328522652387619, + "learning_rate": 0.00016865497076023393, + "loss": 1.0229, + "mean_token_accuracy": 0.7536440342664719, + "num_tokens": 4502903.0, + "step": 359 + }, + { + "entropy": 1.1647923961281776, + "epoch": 0.6, + "grad_norm": 0.35590431094169617, + "learning_rate": 0.00016853801169590643, + "loss": 1.1555, + "mean_token_accuracy": 0.7302551120519638, + "num_tokens": 4515421.0, + "step": 360 + }, + { + "epoch": 0.6, + "eval_entropy": 1.1506038707096735, + "eval_loss": 1.147754430770874, + "eval_mean_token_accuracy": 0.7277677131266367, + "eval_num_tokens": 4515421.0, + "eval_runtime": 2668.7626, + "eval_samples_per_second": 1.874, + "eval_steps_per_second": 0.937, + "step": 360 + }, + { + "entropy": 1.0822272449731827, + "epoch": 0.6016666666666667, + "grad_norm": 0.3461451232433319, + "learning_rate": 0.00016842105263157895, + "loss": 1.0538, + "mean_token_accuracy": 0.7523676231503487, + "num_tokens": 4527959.0, + "step": 361 + }, + { + "entropy": 0.9786264225840569, + "epoch": 0.6033333333333334, + "grad_norm": 0.34319812059402466, + "learning_rate": 0.00016830409356725147, + "loss": 0.9882, + "mean_token_accuracy": 0.7686471715569496, + "num_tokens": 4540528.0, + "step": 362 + }, + { + "entropy": 1.0155667290091515, + "epoch": 0.605, + "grad_norm": 0.424983412027359, + "learning_rate": 0.00016818713450292397, + "loss": 0.9836, + "mean_token_accuracy": 0.7593458294868469, + "num_tokens": 4553090.0, + "step": 363 + }, + { + "entropy": 1.1010105088353157, + "epoch": 0.6066666666666667, + "grad_norm": 0.35969412326812744, + "learning_rate": 0.0001680701754385965, + "loss": 1.093, + "mean_token_accuracy": 0.7331129014492035, + "num_tokens": 4565595.0, + "step": 364 + }, + { + "entropy": 1.0532179400324821, + "epoch": 0.6083333333333333, + "grad_norm": 0.3190361559391022, + "learning_rate": 0.00016795321637426902, + "loss": 1.0569, + "mean_token_accuracy": 0.7494515404105186, + "num_tokens": 4577921.0, + "step": 365 + }, + { + "entropy": 1.1271280273795128, + "epoch": 0.61, + "grad_norm": 0.43350517749786377, + "learning_rate": 0.0001678362573099415, + "loss": 1.1579, + "mean_token_accuracy": 0.729541227221489, + "num_tokens": 4590555.0, + "step": 366 + }, + { + "entropy": 1.1090258359909058, + "epoch": 0.6116666666666667, + "grad_norm": 0.4741346538066864, + "learning_rate": 0.00016771929824561406, + "loss": 1.1043, + "mean_token_accuracy": 0.7302690967917442, + "num_tokens": 4603046.0, + "step": 367 + }, + { + "entropy": 1.0478442907333374, + "epoch": 0.6133333333333333, + "grad_norm": 0.4757939279079437, + "learning_rate": 0.00016760233918128656, + "loss": 1.0388, + "mean_token_accuracy": 0.7470605447888374, + "num_tokens": 4615534.0, + "step": 368 + }, + { + "entropy": 1.1102875471115112, + "epoch": 0.615, + "grad_norm": 0.34702980518341064, + "learning_rate": 0.00016748538011695905, + "loss": 1.1123, + "mean_token_accuracy": 0.7333204820752144, + "num_tokens": 4628160.0, + "step": 369 + }, + { + "entropy": 1.037328228354454, + "epoch": 0.6166666666666667, + "grad_norm": 0.3619624972343445, + "learning_rate": 0.0001673684210526316, + "loss": 1.0005, + "mean_token_accuracy": 0.7570820525288582, + "num_tokens": 4640833.0, + "step": 370 + }, + { + "entropy": 1.138997420668602, + "epoch": 0.6183333333333333, + "grad_norm": 0.5491196513175964, + "learning_rate": 0.0001672514619883041, + "loss": 1.1537, + "mean_token_accuracy": 0.7282362058758736, + "num_tokens": 4653071.0, + "step": 371 + }, + { + "entropy": 1.0007020235061646, + "epoch": 0.62, + "grad_norm": 0.33528798818588257, + "learning_rate": 0.00016713450292397662, + "loss": 0.9821, + "mean_token_accuracy": 0.7631944566965103, + "num_tokens": 4665686.0, + "step": 372 + }, + { + "entropy": 1.0706755891442299, + "epoch": 0.6216666666666667, + "grad_norm": 0.5516906976699829, + "learning_rate": 0.00016701754385964915, + "loss": 1.0717, + "mean_token_accuracy": 0.7461396679282188, + "num_tokens": 4678137.0, + "step": 373 + }, + { + "entropy": 1.0305498763918877, + "epoch": 0.6233333333333333, + "grad_norm": 0.4353298544883728, + "learning_rate": 0.00016690058479532164, + "loss": 1.0007, + "mean_token_accuracy": 0.7523628026247025, + "num_tokens": 4690948.0, + "step": 374 + }, + { + "entropy": 1.0295665189623833, + "epoch": 0.625, + "grad_norm": 0.33500105142593384, + "learning_rate": 0.00016678362573099417, + "loss": 1.0147, + "mean_token_accuracy": 0.7575552314519882, + "num_tokens": 4703464.0, + "step": 375 + }, + { + "entropy": 1.261782169342041, + "epoch": 0.6266666666666667, + "grad_norm": 0.3233359158039093, + "learning_rate": 0.0001666666666666667, + "loss": 1.2725, + "mean_token_accuracy": 0.6999640017747879, + "num_tokens": 4716083.0, + "step": 376 + }, + { + "entropy": 1.0816773176193237, + "epoch": 0.6283333333333333, + "grad_norm": 0.36155110597610474, + "learning_rate": 0.00016654970760233918, + "loss": 1.0828, + "mean_token_accuracy": 0.7376217916607857, + "num_tokens": 4728514.0, + "step": 377 + }, + { + "entropy": 0.9655618295073509, + "epoch": 0.63, + "grad_norm": 0.4339034855365753, + "learning_rate": 0.0001664327485380117, + "loss": 0.9711, + "mean_token_accuracy": 0.761038102209568, + "num_tokens": 4740972.0, + "step": 378 + }, + { + "entropy": 1.0789566859602928, + "epoch": 0.6316666666666667, + "grad_norm": 1.1587278842926025, + "learning_rate": 0.00016631578947368423, + "loss": 1.0795, + "mean_token_accuracy": 0.7451052665710449, + "num_tokens": 4753592.0, + "step": 379 + }, + { + "entropy": 1.1246184334158897, + "epoch": 0.6333333333333333, + "grad_norm": 0.3487575948238373, + "learning_rate": 0.00016619883040935673, + "loss": 1.1458, + "mean_token_accuracy": 0.7255075052380562, + "num_tokens": 4766271.0, + "step": 380 + }, + { + "entropy": 0.9961473643779755, + "epoch": 0.635, + "grad_norm": 0.3496881425380707, + "learning_rate": 0.00016608187134502925, + "loss": 0.9703, + "mean_token_accuracy": 0.7653394937515259, + "num_tokens": 4778509.0, + "step": 381 + }, + { + "entropy": 1.0803877338767052, + "epoch": 0.6366666666666667, + "grad_norm": 0.42682647705078125, + "learning_rate": 0.00016596491228070177, + "loss": 1.0697, + "mean_token_accuracy": 0.7415556833148003, + "num_tokens": 4790982.0, + "step": 382 + }, + { + "entropy": 1.23675137758255, + "epoch": 0.6383333333333333, + "grad_norm": 0.48014000058174133, + "learning_rate": 0.00016584795321637427, + "loss": 1.2376, + "mean_token_accuracy": 0.7128911018371582, + "num_tokens": 4803455.0, + "step": 383 + }, + { + "entropy": 1.1108812019228935, + "epoch": 0.64, + "grad_norm": 0.38982605934143066, + "learning_rate": 0.0001657309941520468, + "loss": 1.1064, + "mean_token_accuracy": 0.7403939291834831, + "num_tokens": 4815947.0, + "step": 384 + }, + { + "entropy": 1.0063207522034645, + "epoch": 0.6416666666666667, + "grad_norm": 0.350036084651947, + "learning_rate": 0.00016561403508771931, + "loss": 0.9986, + "mean_token_accuracy": 0.7624973133206367, + "num_tokens": 4828658.0, + "step": 385 + }, + { + "entropy": 1.0487488061189651, + "epoch": 0.6433333333333333, + "grad_norm": 0.41516250371932983, + "learning_rate": 0.0001654970760233918, + "loss": 1.0116, + "mean_token_accuracy": 0.751943901181221, + "num_tokens": 4841203.0, + "step": 386 + }, + { + "entropy": 1.055719830095768, + "epoch": 0.645, + "grad_norm": 0.3185690939426422, + "learning_rate": 0.00016538011695906433, + "loss": 1.051, + "mean_token_accuracy": 0.7423844113945961, + "num_tokens": 4853646.0, + "step": 387 + }, + { + "entropy": 1.0037056729197502, + "epoch": 0.6466666666666666, + "grad_norm": 0.38994213938713074, + "learning_rate": 0.00016526315789473686, + "loss": 1.0071, + "mean_token_accuracy": 0.759202741086483, + "num_tokens": 4866080.0, + "step": 388 + }, + { + "entropy": 1.2100782170891762, + "epoch": 0.6483333333333333, + "grad_norm": 0.3755752742290497, + "learning_rate": 0.00016514619883040935, + "loss": 1.2323, + "mean_token_accuracy": 0.6998582407832146, + "num_tokens": 4878573.0, + "step": 389 + }, + { + "entropy": 0.964059017598629, + "epoch": 0.65, + "grad_norm": 0.47546395659446716, + "learning_rate": 0.0001650292397660819, + "loss": 0.9479, + "mean_token_accuracy": 0.7689564228057861, + "num_tokens": 4891251.0, + "step": 390 + }, + { + "entropy": 1.0938122794032097, + "epoch": 0.6516666666666666, + "grad_norm": 0.31692859530448914, + "learning_rate": 0.0001649122807017544, + "loss": 1.112, + "mean_token_accuracy": 0.734823040664196, + "num_tokens": 4903849.0, + "step": 391 + }, + { + "entropy": 1.2476731687784195, + "epoch": 0.6533333333333333, + "grad_norm": 0.3951464295387268, + "learning_rate": 0.0001647953216374269, + "loss": 1.2505, + "mean_token_accuracy": 0.7056182846426964, + "num_tokens": 4916605.0, + "step": 392 + }, + { + "entropy": 1.0463567823171616, + "epoch": 0.655, + "grad_norm": 0.3860931992530823, + "learning_rate": 0.00016467836257309944, + "loss": 1.0277, + "mean_token_accuracy": 0.7453161776065826, + "num_tokens": 4929002.0, + "step": 393 + }, + { + "entropy": 1.1087151244282722, + "epoch": 0.6566666666666666, + "grad_norm": 0.34755659103393555, + "learning_rate": 0.00016456140350877194, + "loss": 1.09, + "mean_token_accuracy": 0.7396251112222672, + "num_tokens": 4941434.0, + "step": 394 + }, + { + "entropy": 1.2049994841217995, + "epoch": 0.6583333333333333, + "grad_norm": 0.39863044023513794, + "learning_rate": 0.00016444444444444444, + "loss": 1.1623, + "mean_token_accuracy": 0.7202948480844498, + "num_tokens": 4953876.0, + "step": 395 + }, + { + "entropy": 1.0984854400157928, + "epoch": 0.66, + "grad_norm": 0.38562482595443726, + "learning_rate": 0.00016432748538011699, + "loss": 1.0935, + "mean_token_accuracy": 0.7386272475123405, + "num_tokens": 4966358.0, + "step": 396 + }, + { + "entropy": 1.0148514583706856, + "epoch": 0.6616666666666666, + "grad_norm": 0.3571801781654358, + "learning_rate": 0.00016421052631578948, + "loss": 1.0091, + "mean_token_accuracy": 0.7530024722218513, + "num_tokens": 4979075.0, + "step": 397 + }, + { + "entropy": 1.1101968213915825, + "epoch": 0.6633333333333333, + "grad_norm": 0.37536031007766724, + "learning_rate": 0.00016409356725146198, + "loss": 1.1301, + "mean_token_accuracy": 0.7349445819854736, + "num_tokens": 4991282.0, + "step": 398 + }, + { + "entropy": 1.0139321312308311, + "epoch": 0.665, + "grad_norm": 0.3825819194316864, + "learning_rate": 0.00016397660818713453, + "loss": 1.0468, + "mean_token_accuracy": 0.7493115812540054, + "num_tokens": 5003653.0, + "step": 399 + }, + { + "entropy": 0.9064864292740822, + "epoch": 0.6666666666666666, + "grad_norm": 0.37946513295173645, + "learning_rate": 0.00016385964912280702, + "loss": 0.9145, + "mean_token_accuracy": 0.7758191227912903, + "num_tokens": 5016221.0, + "step": 400 + }, + { + "entropy": 1.0882866755127907, + "epoch": 0.6683333333333333, + "grad_norm": 0.32677122950553894, + "learning_rate": 0.00016374269005847952, + "loss": 1.08, + "mean_token_accuracy": 0.7416599541902542, + "num_tokens": 5028657.0, + "step": 401 + }, + { + "entropy": 1.1420171335339546, + "epoch": 0.67, + "grad_norm": 0.34600427746772766, + "learning_rate": 0.00016362573099415207, + "loss": 1.1095, + "mean_token_accuracy": 0.7331014648079872, + "num_tokens": 5041256.0, + "step": 402 + }, + { + "entropy": 1.2241623848676682, + "epoch": 0.6716666666666666, + "grad_norm": 0.38560718297958374, + "learning_rate": 0.00016350877192982457, + "loss": 1.2226, + "mean_token_accuracy": 0.7100344523787498, + "num_tokens": 5053831.0, + "step": 403 + }, + { + "entropy": 0.9865270294249058, + "epoch": 0.6733333333333333, + "grad_norm": 0.3292683959007263, + "learning_rate": 0.00016339181286549706, + "loss": 0.9717, + "mean_token_accuracy": 0.7668761685490608, + "num_tokens": 5066367.0, + "step": 404 + }, + { + "entropy": 1.0482201799750328, + "epoch": 0.675, + "grad_norm": 0.5308311581611633, + "learning_rate": 0.0001632748538011696, + "loss": 1.0109, + "mean_token_accuracy": 0.7543414235115051, + "num_tokens": 5079159.0, + "step": 405 + }, + { + "entropy": 1.0350475907325745, + "epoch": 0.6766666666666666, + "grad_norm": 0.34866708517074585, + "learning_rate": 0.0001631578947368421, + "loss": 1.0179, + "mean_token_accuracy": 0.7569469437003136, + "num_tokens": 5091744.0, + "step": 406 + }, + { + "entropy": 1.0928167328238487, + "epoch": 0.6783333333333333, + "grad_norm": 0.38286733627319336, + "learning_rate": 0.00016304093567251463, + "loss": 1.0727, + "mean_token_accuracy": 0.7430339977145195, + "num_tokens": 5104211.0, + "step": 407 + }, + { + "entropy": 1.1038938909769058, + "epoch": 0.68, + "grad_norm": 0.3320970833301544, + "learning_rate": 0.00016292397660818715, + "loss": 1.1293, + "mean_token_accuracy": 0.7329866364598274, + "num_tokens": 5116819.0, + "step": 408 + }, + { + "entropy": 1.042891263961792, + "epoch": 0.6816666666666666, + "grad_norm": 0.38465678691864014, + "learning_rate": 0.00016280701754385965, + "loss": 1.0725, + "mean_token_accuracy": 0.747420534491539, + "num_tokens": 5129352.0, + "step": 409 + }, + { + "entropy": 1.083320826292038, + "epoch": 0.6833333333333333, + "grad_norm": 0.35532549023628235, + "learning_rate": 0.00016269005847953217, + "loss": 1.081, + "mean_token_accuracy": 0.7357378974556923, + "num_tokens": 5141789.0, + "step": 410 + }, + { + "entropy": 1.0968919321894646, + "epoch": 0.685, + "grad_norm": 0.37026599049568176, + "learning_rate": 0.0001625730994152047, + "loss": 1.0946, + "mean_token_accuracy": 0.7453345954418182, + "num_tokens": 5154353.0, + "step": 411 + }, + { + "entropy": 1.1025621965527534, + "epoch": 0.6866666666666666, + "grad_norm": 0.38080209493637085, + "learning_rate": 0.0001624561403508772, + "loss": 1.1408, + "mean_token_accuracy": 0.7331846132874489, + "num_tokens": 5167104.0, + "step": 412 + }, + { + "entropy": 0.9193796887993813, + "epoch": 0.6883333333333334, + "grad_norm": 0.4362526834011078, + "learning_rate": 0.00016233918128654972, + "loss": 0.9057, + "mean_token_accuracy": 0.7817874625325203, + "num_tokens": 5179789.0, + "step": 413 + }, + { + "entropy": 1.1486607491970062, + "epoch": 0.69, + "grad_norm": 0.43080970644950867, + "learning_rate": 0.00016222222222222224, + "loss": 1.1557, + "mean_token_accuracy": 0.7257590591907501, + "num_tokens": 5192550.0, + "step": 414 + }, + { + "entropy": 1.1312269866466522, + "epoch": 0.6916666666666667, + "grad_norm": 0.33069857954978943, + "learning_rate": 0.00016210526315789473, + "loss": 1.1336, + "mean_token_accuracy": 0.7364302352070808, + "num_tokens": 5205107.0, + "step": 415 + }, + { + "entropy": 1.1854391023516655, + "epoch": 0.6933333333333334, + "grad_norm": 0.5881476998329163, + "learning_rate": 0.00016198830409356726, + "loss": 1.1957, + "mean_token_accuracy": 0.7226643934845924, + "num_tokens": 5217821.0, + "step": 416 + }, + { + "entropy": 1.153312124311924, + "epoch": 0.695, + "grad_norm": 0.506568431854248, + "learning_rate": 0.00016187134502923978, + "loss": 1.1506, + "mean_token_accuracy": 0.7315347641706467, + "num_tokens": 5230369.0, + "step": 417 + }, + { + "entropy": 1.0305966809391975, + "epoch": 0.6966666666666667, + "grad_norm": 0.37616896629333496, + "learning_rate": 0.00016175438596491228, + "loss": 1.0028, + "mean_token_accuracy": 0.7581542059779167, + "num_tokens": 5243034.0, + "step": 418 + }, + { + "entropy": 1.184016190469265, + "epoch": 0.6983333333333334, + "grad_norm": 0.3351752460002899, + "learning_rate": 0.0001616374269005848, + "loss": 1.1678, + "mean_token_accuracy": 0.7203118875622749, + "num_tokens": 5255634.0, + "step": 419 + }, + { + "entropy": 1.1107853651046753, + "epoch": 0.7, + "grad_norm": 0.5095561742782593, + "learning_rate": 0.00016152046783625732, + "loss": 1.1212, + "mean_token_accuracy": 0.7350276410579681, + "num_tokens": 5268326.0, + "step": 420 + }, + { + "entropy": 1.0543791353702545, + "epoch": 0.7016666666666667, + "grad_norm": 0.4145483672618866, + "learning_rate": 0.00016140350877192982, + "loss": 1.0542, + "mean_token_accuracy": 0.7537256851792336, + "num_tokens": 5281032.0, + "step": 421 + }, + { + "entropy": 1.0694367215037346, + "epoch": 0.7033333333333334, + "grad_norm": 0.4112991690635681, + "learning_rate": 0.00016128654970760234, + "loss": 1.0322, + "mean_token_accuracy": 0.7503436282277107, + "num_tokens": 5293573.0, + "step": 422 + }, + { + "entropy": 1.116036280989647, + "epoch": 0.705, + "grad_norm": 0.614185094833374, + "learning_rate": 0.00016116959064327486, + "loss": 1.1065, + "mean_token_accuracy": 0.7300728484988213, + "num_tokens": 5306194.0, + "step": 423 + }, + { + "entropy": 1.042599968612194, + "epoch": 0.7066666666666667, + "grad_norm": 0.9990386366844177, + "learning_rate": 0.00016105263157894736, + "loss": 1.0299, + "mean_token_accuracy": 0.7559118717908859, + "num_tokens": 5318533.0, + "step": 424 + }, + { + "entropy": 1.119065299630165, + "epoch": 0.7083333333333334, + "grad_norm": 0.48257675766944885, + "learning_rate": 0.00016093567251461988, + "loss": 1.1202, + "mean_token_accuracy": 0.7334137335419655, + "num_tokens": 5331208.0, + "step": 425 + }, + { + "entropy": 1.1456444934010506, + "epoch": 0.71, + "grad_norm": 0.3156353533267975, + "learning_rate": 0.0001608187134502924, + "loss": 1.1071, + "mean_token_accuracy": 0.7339949384331703, + "num_tokens": 5343567.0, + "step": 426 + }, + { + "entropy": 1.1843851804733276, + "epoch": 0.7116666666666667, + "grad_norm": 0.4149649143218994, + "learning_rate": 0.00016070175438596493, + "loss": 1.2098, + "mean_token_accuracy": 0.7177803814411163, + "num_tokens": 5355834.0, + "step": 427 + }, + { + "entropy": 1.0262960121035576, + "epoch": 0.7133333333333334, + "grad_norm": 0.47338202595710754, + "learning_rate": 0.00016058479532163743, + "loss": 1.0406, + "mean_token_accuracy": 0.7452673614025116, + "num_tokens": 5368384.0, + "step": 428 + }, + { + "entropy": 1.0742842629551888, + "epoch": 0.715, + "grad_norm": 0.4460916519165039, + "learning_rate": 0.00016046783625730995, + "loss": 1.0959, + "mean_token_accuracy": 0.7396591976284981, + "num_tokens": 5380753.0, + "step": 429 + }, + { + "entropy": 0.9951488599181175, + "epoch": 0.7166666666666667, + "grad_norm": 0.3373461961746216, + "learning_rate": 0.00016035087719298247, + "loss": 0.9911, + "mean_token_accuracy": 0.7618727758526802, + "num_tokens": 5393349.0, + "step": 430 + }, + { + "entropy": 1.121171198785305, + "epoch": 0.7183333333333334, + "grad_norm": 0.4891211688518524, + "learning_rate": 0.000160233918128655, + "loss": 1.1048, + "mean_token_accuracy": 0.7355387806892395, + "num_tokens": 5405990.0, + "step": 431 + }, + { + "entropy": 1.1132899299263954, + "epoch": 0.72, + "grad_norm": 0.4116326570510864, + "learning_rate": 0.0001601169590643275, + "loss": 1.0877, + "mean_token_accuracy": 0.732665404677391, + "num_tokens": 5418879.0, + "step": 432 + }, + { + "entropy": 1.0193076133728027, + "epoch": 0.7216666666666667, + "grad_norm": 0.4597654640674591, + "learning_rate": 0.00016, + "loss": 0.9755, + "mean_token_accuracy": 0.7617568150162697, + "num_tokens": 5431625.0, + "step": 433 + }, + { + "entropy": 1.102819487452507, + "epoch": 0.7233333333333334, + "grad_norm": 0.3408207595348358, + "learning_rate": 0.00015988304093567254, + "loss": 1.1085, + "mean_token_accuracy": 0.7342933788895607, + "num_tokens": 5444373.0, + "step": 434 + }, + { + "entropy": 1.1037746369838715, + "epoch": 0.725, + "grad_norm": 0.3480106294155121, + "learning_rate": 0.00015976608187134503, + "loss": 1.0826, + "mean_token_accuracy": 0.7379643693566322, + "num_tokens": 5456763.0, + "step": 435 + }, + { + "entropy": 1.0365785732865334, + "epoch": 0.7266666666666667, + "grad_norm": 0.6273422241210938, + "learning_rate": 0.00015964912280701756, + "loss": 1.046, + "mean_token_accuracy": 0.7498196363449097, + "num_tokens": 5469348.0, + "step": 436 + }, + { + "entropy": 1.0879196152091026, + "epoch": 0.7283333333333334, + "grad_norm": 0.4042668342590332, + "learning_rate": 0.00015953216374269008, + "loss": 1.0885, + "mean_token_accuracy": 0.746056891977787, + "num_tokens": 5481851.0, + "step": 437 + }, + { + "entropy": 1.0286137238144875, + "epoch": 0.73, + "grad_norm": 0.3042530119419098, + "learning_rate": 0.00015941520467836257, + "loss": 1.0445, + "mean_token_accuracy": 0.7513305693864822, + "num_tokens": 5494525.0, + "step": 438 + }, + { + "entropy": 1.0987597107887268, + "epoch": 0.7316666666666667, + "grad_norm": 0.3762005567550659, + "learning_rate": 0.0001592982456140351, + "loss": 1.0922, + "mean_token_accuracy": 0.7425737306475639, + "num_tokens": 5507144.0, + "step": 439 + }, + { + "entropy": 1.127465382218361, + "epoch": 0.7333333333333333, + "grad_norm": 0.5283383131027222, + "learning_rate": 0.00015918128654970762, + "loss": 1.1109, + "mean_token_accuracy": 0.7371382638812065, + "num_tokens": 5519953.0, + "step": 440 + }, + { + "entropy": 1.2035595402121544, + "epoch": 0.735, + "grad_norm": 0.4114871025085449, + "learning_rate": 0.00015906432748538012, + "loss": 1.2056, + "mean_token_accuracy": 0.7149497643113136, + "num_tokens": 5532573.0, + "step": 441 + }, + { + "entropy": 1.0220743417739868, + "epoch": 0.7366666666666667, + "grad_norm": 0.3489610552787781, + "learning_rate": 0.00015894736842105264, + "loss": 0.9867, + "mean_token_accuracy": 0.7571059986948967, + "num_tokens": 5545041.0, + "step": 442 + }, + { + "entropy": 0.8919041678309441, + "epoch": 0.7383333333333333, + "grad_norm": 0.44151565432548523, + "learning_rate": 0.00015883040935672516, + "loss": 0.8787, + "mean_token_accuracy": 0.7870308607816696, + "num_tokens": 5557653.0, + "step": 443 + }, + { + "entropy": 1.0376613810658455, + "epoch": 0.74, + "grad_norm": 0.418760746717453, + "learning_rate": 0.00015871345029239766, + "loss": 1.0233, + "mean_token_accuracy": 0.756847932934761, + "num_tokens": 5570252.0, + "step": 444 + }, + { + "entropy": 1.2031901478767395, + "epoch": 0.7416666666666667, + "grad_norm": 0.3491179049015045, + "learning_rate": 0.00015859649122807018, + "loss": 1.2008, + "mean_token_accuracy": 0.7186564728617668, + "num_tokens": 5582827.0, + "step": 445 + }, + { + "entropy": 1.010102555155754, + "epoch": 0.7433333333333333, + "grad_norm": 0.3810936212539673, + "learning_rate": 0.0001584795321637427, + "loss": 0.984, + "mean_token_accuracy": 0.7599733769893646, + "num_tokens": 5595401.0, + "step": 446 + }, + { + "entropy": 1.075411356985569, + "epoch": 0.745, + "grad_norm": 0.43530669808387756, + "learning_rate": 0.0001583625730994152, + "loss": 1.1041, + "mean_token_accuracy": 0.7356607168912888, + "num_tokens": 5607942.0, + "step": 447 + }, + { + "entropy": 0.9819160103797913, + "epoch": 0.7466666666666667, + "grad_norm": 0.40340355038642883, + "learning_rate": 0.00015824561403508772, + "loss": 1.0154, + "mean_token_accuracy": 0.7579329013824463, + "num_tokens": 5620235.0, + "step": 448 + }, + { + "entropy": 1.0106851756572723, + "epoch": 0.7483333333333333, + "grad_norm": 0.34522029757499695, + "learning_rate": 0.00015812865497076025, + "loss": 1.0051, + "mean_token_accuracy": 0.756496749818325, + "num_tokens": 5633075.0, + "step": 449 + }, + { + "entropy": 1.1666646376252174, + "epoch": 0.75, + "grad_norm": 0.31447482109069824, + "learning_rate": 0.00015801169590643277, + "loss": 1.1664, + "mean_token_accuracy": 0.7229723930358887, + "num_tokens": 5645725.0, + "step": 450 + }, + { + "entropy": 0.9302625432610512, + "epoch": 0.7516666666666667, + "grad_norm": 0.33545219898223877, + "learning_rate": 0.00015789473684210527, + "loss": 0.9136, + "mean_token_accuracy": 0.7761102318763733, + "num_tokens": 5658368.0, + "step": 451 + }, + { + "entropy": 1.0969965159893036, + "epoch": 0.7533333333333333, + "grad_norm": 0.3889496624469757, + "learning_rate": 0.0001577777777777778, + "loss": 1.0953, + "mean_token_accuracy": 0.7352195754647255, + "num_tokens": 5671044.0, + "step": 452 + }, + { + "entropy": 1.110503688454628, + "epoch": 0.755, + "grad_norm": 0.36706921458244324, + "learning_rate": 0.0001576608187134503, + "loss": 1.1267, + "mean_token_accuracy": 0.734914131462574, + "num_tokens": 5683498.0, + "step": 453 + }, + { + "entropy": 1.1655322015285492, + "epoch": 0.7566666666666667, + "grad_norm": 0.3752106726169586, + "learning_rate": 0.0001575438596491228, + "loss": 1.1619, + "mean_token_accuracy": 0.7204124853014946, + "num_tokens": 5695777.0, + "step": 454 + }, + { + "entropy": 1.0306529253721237, + "epoch": 0.7583333333333333, + "grad_norm": 0.32710975408554077, + "learning_rate": 0.00015742690058479533, + "loss": 1.0054, + "mean_token_accuracy": 0.7572605907917023, + "num_tokens": 5708165.0, + "step": 455 + }, + { + "entropy": 1.2195520401000977, + "epoch": 0.76, + "grad_norm": 0.44669049978256226, + "learning_rate": 0.00015730994152046785, + "loss": 1.2255, + "mean_token_accuracy": 0.7160174250602722, + "num_tokens": 5720892.0, + "step": 456 + }, + { + "entropy": 1.0541856065392494, + "epoch": 0.7616666666666667, + "grad_norm": 0.34097859263420105, + "learning_rate": 0.00015719298245614035, + "loss": 1.043, + "mean_token_accuracy": 0.7480626776814461, + "num_tokens": 5733726.0, + "step": 457 + }, + { + "entropy": 1.1402226686477661, + "epoch": 0.7633333333333333, + "grad_norm": 0.8296970725059509, + "learning_rate": 0.00015707602339181287, + "loss": 1.0894, + "mean_token_accuracy": 0.7324612215161324, + "num_tokens": 5746163.0, + "step": 458 + }, + { + "entropy": 1.223743736743927, + "epoch": 0.765, + "grad_norm": 0.31552544236183167, + "learning_rate": 0.0001569590643274854, + "loss": 1.2019, + "mean_token_accuracy": 0.7200068011879921, + "num_tokens": 5758756.0, + "step": 459 + }, + { + "entropy": 1.0422032475471497, + "epoch": 0.7666666666666667, + "grad_norm": 0.33989831805229187, + "learning_rate": 0.0001568421052631579, + "loss": 1.0594, + "mean_token_accuracy": 0.7472169026732445, + "num_tokens": 5771343.0, + "step": 460 + }, + { + "entropy": 0.9894929677248001, + "epoch": 0.7683333333333333, + "grad_norm": 0.3527223765850067, + "learning_rate": 0.00015672514619883041, + "loss": 0.9773, + "mean_token_accuracy": 0.7673698663711548, + "num_tokens": 5783675.0, + "step": 461 + }, + { + "entropy": 1.0100511014461517, + "epoch": 0.77, + "grad_norm": 0.3458307981491089, + "learning_rate": 0.00015660818713450294, + "loss": 1.0227, + "mean_token_accuracy": 0.7536729276180267, + "num_tokens": 5796529.0, + "step": 462 + }, + { + "entropy": 1.1438388898968697, + "epoch": 0.7716666666666666, + "grad_norm": 0.3677491843700409, + "learning_rate": 0.00015649122807017543, + "loss": 1.1439, + "mean_token_accuracy": 0.7285068556666374, + "num_tokens": 5808813.0, + "step": 463 + }, + { + "entropy": 1.1831453144550323, + "epoch": 0.7733333333333333, + "grad_norm": 0.3970377445220947, + "learning_rate": 0.00015637426900584796, + "loss": 1.2011, + "mean_token_accuracy": 0.7191813364624977, + "num_tokens": 5821697.0, + "step": 464 + }, + { + "entropy": 1.0034284479916096, + "epoch": 0.775, + "grad_norm": 0.3054230809211731, + "learning_rate": 0.00015625730994152048, + "loss": 0.9897, + "mean_token_accuracy": 0.7667393088340759, + "num_tokens": 5834449.0, + "step": 465 + }, + { + "entropy": 1.0818930864334106, + "epoch": 0.7766666666666666, + "grad_norm": 1.377423644065857, + "learning_rate": 0.00015614035087719297, + "loss": 1.0527, + "mean_token_accuracy": 0.7446305453777313, + "num_tokens": 5847123.0, + "step": 466 + }, + { + "entropy": 1.2061632052063942, + "epoch": 0.7783333333333333, + "grad_norm": 0.36091288924217224, + "learning_rate": 0.0001560233918128655, + "loss": 1.225, + "mean_token_accuracy": 0.7133935913443565, + "num_tokens": 5859520.0, + "step": 467 + }, + { + "entropy": 0.9294106736779213, + "epoch": 0.78, + "grad_norm": 0.37731432914733887, + "learning_rate": 0.00015590643274853802, + "loss": 0.9141, + "mean_token_accuracy": 0.7746392264962196, + "num_tokens": 5872398.0, + "step": 468 + }, + { + "entropy": 1.1364581286907196, + "epoch": 0.7816666666666666, + "grad_norm": 0.3194917142391205, + "learning_rate": 0.00015578947368421052, + "loss": 1.1358, + "mean_token_accuracy": 0.7337777689099312, + "num_tokens": 5884964.0, + "step": 469 + }, + { + "entropy": 1.0378025621175766, + "epoch": 0.7833333333333333, + "grad_norm": 0.28554415702819824, + "learning_rate": 0.00015567251461988307, + "loss": 1.0323, + "mean_token_accuracy": 0.7506354302167892, + "num_tokens": 5897550.0, + "step": 470 + }, + { + "entropy": 1.0520753636956215, + "epoch": 0.785, + "grad_norm": 0.3658890128135681, + "learning_rate": 0.00015555555555555556, + "loss": 1.0722, + "mean_token_accuracy": 0.7424618750810623, + "num_tokens": 5910216.0, + "step": 471 + }, + { + "entropy": 1.0621570646762848, + "epoch": 0.7866666666666666, + "grad_norm": 0.5311276912689209, + "learning_rate": 0.00015543859649122806, + "loss": 1.0606, + "mean_token_accuracy": 0.7419012188911438, + "num_tokens": 5922884.0, + "step": 472 + }, + { + "entropy": 1.1522653177380562, + "epoch": 0.7883333333333333, + "grad_norm": 0.4982028603553772, + "learning_rate": 0.0001553216374269006, + "loss": 1.1606, + "mean_token_accuracy": 0.725653164088726, + "num_tokens": 5935273.0, + "step": 473 + }, + { + "entropy": 1.0625966489315033, + "epoch": 0.79, + "grad_norm": 0.4393594264984131, + "learning_rate": 0.0001552046783625731, + "loss": 1.0302, + "mean_token_accuracy": 0.748927153646946, + "num_tokens": 5947999.0, + "step": 474 + }, + { + "entropy": 1.1587400287389755, + "epoch": 0.7916666666666666, + "grad_norm": 4.552830696105957, + "learning_rate": 0.00015508771929824563, + "loss": 1.1518, + "mean_token_accuracy": 0.7315473929047585, + "num_tokens": 5960567.0, + "step": 475 + }, + { + "entropy": 0.9702800586819649, + "epoch": 0.7933333333333333, + "grad_norm": 0.5625200271606445, + "learning_rate": 0.00015497076023391815, + "loss": 0.9613, + "mean_token_accuracy": 0.7680332958698273, + "num_tokens": 5972922.0, + "step": 476 + }, + { + "entropy": 1.066114716231823, + "epoch": 0.795, + "grad_norm": 0.41820967197418213, + "learning_rate": 0.00015485380116959065, + "loss": 1.053, + "mean_token_accuracy": 0.7436480596661568, + "num_tokens": 5985492.0, + "step": 477 + }, + { + "entropy": 1.121892273426056, + "epoch": 0.7966666666666666, + "grad_norm": 0.43082207441329956, + "learning_rate": 0.00015473684210526317, + "loss": 1.135, + "mean_token_accuracy": 0.726841926574707, + "num_tokens": 5997846.0, + "step": 478 + }, + { + "entropy": 0.9342339262366295, + "epoch": 0.7983333333333333, + "grad_norm": 0.7145920395851135, + "learning_rate": 0.0001546198830409357, + "loss": 0.9339, + "mean_token_accuracy": 0.7741554453969002, + "num_tokens": 6010409.0, + "step": 479 + }, + { + "entropy": 1.0903588011860847, + "epoch": 0.8, + "grad_norm": 0.5080858469009399, + "learning_rate": 0.0001545029239766082, + "loss": 1.0853, + "mean_token_accuracy": 0.7472739815711975, + "num_tokens": 6022911.0, + "step": 480 + }, + { + "entropy": 1.0341752544045448, + "epoch": 0.8016666666666666, + "grad_norm": 0.3803417384624481, + "learning_rate": 0.0001543859649122807, + "loss": 1.0156, + "mean_token_accuracy": 0.7580198422074318, + "num_tokens": 6035536.0, + "step": 481 + }, + { + "entropy": 0.9857856929302216, + "epoch": 0.8033333333333333, + "grad_norm": 0.5306246876716614, + "learning_rate": 0.00015426900584795324, + "loss": 0.9642, + "mean_token_accuracy": 0.7662367448210716, + "num_tokens": 6048113.0, + "step": 482 + }, + { + "entropy": 1.0703137665987015, + "epoch": 0.805, + "grad_norm": 0.49546366930007935, + "learning_rate": 0.00015415204678362573, + "loss": 1.0364, + "mean_token_accuracy": 0.7481872513890266, + "num_tokens": 6060978.0, + "step": 483 + }, + { + "entropy": 1.1418191492557526, + "epoch": 0.8066666666666666, + "grad_norm": 0.39517495036125183, + "learning_rate": 0.00015403508771929825, + "loss": 1.1241, + "mean_token_accuracy": 0.7268876954913139, + "num_tokens": 6073300.0, + "step": 484 + }, + { + "entropy": 1.0620516315102577, + "epoch": 0.8083333333333333, + "grad_norm": 0.33354660868644714, + "learning_rate": 0.00015391812865497078, + "loss": 1.0627, + "mean_token_accuracy": 0.7447437271475792, + "num_tokens": 6085969.0, + "step": 485 + }, + { + "entropy": 1.1370228081941605, + "epoch": 0.81, + "grad_norm": 0.4519972801208496, + "learning_rate": 0.00015380116959064327, + "loss": 1.1648, + "mean_token_accuracy": 0.727062314748764, + "num_tokens": 6098405.0, + "step": 486 + }, + { + "entropy": 1.116714984178543, + "epoch": 0.8116666666666666, + "grad_norm": 0.3834986984729767, + "learning_rate": 0.0001536842105263158, + "loss": 1.1237, + "mean_token_accuracy": 0.7360536903142929, + "num_tokens": 6110790.0, + "step": 487 + }, + { + "entropy": 1.1766095086932182, + "epoch": 0.8133333333333334, + "grad_norm": 0.3951236605644226, + "learning_rate": 0.00015356725146198832, + "loss": 1.2038, + "mean_token_accuracy": 0.7180357128381729, + "num_tokens": 6123104.0, + "step": 488 + }, + { + "entropy": 1.0317028015851974, + "epoch": 0.815, + "grad_norm": 0.31130674481391907, + "learning_rate": 0.00015345029239766081, + "loss": 1.018, + "mean_token_accuracy": 0.7511586546897888, + "num_tokens": 6135499.0, + "step": 489 + }, + { + "entropy": 1.0695944800972939, + "epoch": 0.8166666666666667, + "grad_norm": 0.41369152069091797, + "learning_rate": 0.00015333333333333334, + "loss": 1.0622, + "mean_token_accuracy": 0.743607684969902, + "num_tokens": 6147940.0, + "step": 490 + }, + { + "entropy": 0.973351001739502, + "epoch": 0.8183333333333334, + "grad_norm": 0.4467531442642212, + "learning_rate": 0.00015321637426900586, + "loss": 0.9566, + "mean_token_accuracy": 0.7704202383756638, + "num_tokens": 6160394.0, + "step": 491 + }, + { + "entropy": 1.0809285417199135, + "epoch": 0.82, + "grad_norm": 0.37388381361961365, + "learning_rate": 0.00015309941520467836, + "loss": 1.0841, + "mean_token_accuracy": 0.7388180121779442, + "num_tokens": 6172807.0, + "step": 492 + }, + { + "entropy": 1.1079175993800163, + "epoch": 0.8216666666666667, + "grad_norm": 0.2852994203567505, + "learning_rate": 0.0001529824561403509, + "loss": 1.1107, + "mean_token_accuracy": 0.7399151921272278, + "num_tokens": 6185562.0, + "step": 493 + }, + { + "entropy": 1.051186740398407, + "epoch": 0.8233333333333334, + "grad_norm": 0.35674288868904114, + "learning_rate": 0.0001528654970760234, + "loss": 1.036, + "mean_token_accuracy": 0.7519017159938812, + "num_tokens": 6198341.0, + "step": 494 + }, + { + "entropy": 1.112432986497879, + "epoch": 0.825, + "grad_norm": 0.47925853729248047, + "learning_rate": 0.0001527485380116959, + "loss": 1.0952, + "mean_token_accuracy": 0.7376601323485374, + "num_tokens": 6210805.0, + "step": 495 + }, + { + "entropy": 1.136579304933548, + "epoch": 0.8266666666666667, + "grad_norm": 0.3833317160606384, + "learning_rate": 0.00015263157894736845, + "loss": 1.1597, + "mean_token_accuracy": 0.7291416153311729, + "num_tokens": 6223352.0, + "step": 496 + }, + { + "entropy": 0.9923493564128876, + "epoch": 0.8283333333333334, + "grad_norm": 0.3395523726940155, + "learning_rate": 0.00015251461988304094, + "loss": 0.9754, + "mean_token_accuracy": 0.7562162950634956, + "num_tokens": 6235932.0, + "step": 497 + }, + { + "entropy": 1.0614645034074783, + "epoch": 0.83, + "grad_norm": 0.3680817782878876, + "learning_rate": 0.00015239766081871344, + "loss": 1.0593, + "mean_token_accuracy": 0.7413745895028114, + "num_tokens": 6248285.0, + "step": 498 + }, + { + "entropy": 0.9913108944892883, + "epoch": 0.8316666666666667, + "grad_norm": 0.3792550265789032, + "learning_rate": 0.000152280701754386, + "loss": 0.9644, + "mean_token_accuracy": 0.7651605606079102, + "num_tokens": 6261083.0, + "step": 499 + }, + { + "entropy": 1.07400331646204, + "epoch": 0.8333333333333334, + "grad_norm": 0.40772777795791626, + "learning_rate": 0.0001521637426900585, + "loss": 1.0577, + "mean_token_accuracy": 0.7462185472249985, + "num_tokens": 6273916.0, + "step": 500 + }, + { + "entropy": 1.1181387081742287, + "epoch": 0.835, + "grad_norm": 0.33552879095077515, + "learning_rate": 0.00015204678362573098, + "loss": 1.0951, + "mean_token_accuracy": 0.7397628724575043, + "num_tokens": 6286445.0, + "step": 501 + }, + { + "entropy": 1.1042726710438728, + "epoch": 0.8366666666666667, + "grad_norm": 0.36931732296943665, + "learning_rate": 0.00015192982456140353, + "loss": 1.1075, + "mean_token_accuracy": 0.7333213239908218, + "num_tokens": 6298861.0, + "step": 502 + }, + { + "entropy": 1.0911922678351402, + "epoch": 0.8383333333333334, + "grad_norm": 0.72311931848526, + "learning_rate": 0.00015181286549707603, + "loss": 1.0636, + "mean_token_accuracy": 0.7454661652445793, + "num_tokens": 6311619.0, + "step": 503 + }, + { + "entropy": 0.9951315149664879, + "epoch": 0.84, + "grad_norm": 0.40310415625572205, + "learning_rate": 0.00015169590643274852, + "loss": 1.0008, + "mean_token_accuracy": 0.7613855600357056, + "num_tokens": 6324047.0, + "step": 504 + }, + { + "entropy": 0.9872754141688347, + "epoch": 0.8416666666666667, + "grad_norm": 0.3429985046386719, + "learning_rate": 0.00015157894736842108, + "loss": 0.9861, + "mean_token_accuracy": 0.7662120833992958, + "num_tokens": 6336544.0, + "step": 505 + }, + { + "entropy": 1.0408969223499298, + "epoch": 0.8433333333333334, + "grad_norm": 0.36479246616363525, + "learning_rate": 0.00015146198830409357, + "loss": 1.0479, + "mean_token_accuracy": 0.7461319342255592, + "num_tokens": 6349291.0, + "step": 506 + }, + { + "entropy": 1.1309384107589722, + "epoch": 0.845, + "grad_norm": 0.3272276818752289, + "learning_rate": 0.00015134502923976607, + "loss": 1.1283, + "mean_token_accuracy": 0.732874296605587, + "num_tokens": 6361636.0, + "step": 507 + }, + { + "entropy": 1.025696039199829, + "epoch": 0.8466666666666667, + "grad_norm": 0.30461591482162476, + "learning_rate": 0.00015122807017543862, + "loss": 1.008, + "mean_token_accuracy": 0.754217803478241, + "num_tokens": 6374330.0, + "step": 508 + }, + { + "entropy": 0.9422205537557602, + "epoch": 0.8483333333333334, + "grad_norm": 0.34366074204444885, + "learning_rate": 0.0001511111111111111, + "loss": 0.9349, + "mean_token_accuracy": 0.7703981176018715, + "num_tokens": 6386892.0, + "step": 509 + }, + { + "entropy": 1.0122661367058754, + "epoch": 0.85, + "grad_norm": 0.35145244002342224, + "learning_rate": 0.00015099415204678364, + "loss": 1.011, + "mean_token_accuracy": 0.749379850924015, + "num_tokens": 6399805.0, + "step": 510 + }, + { + "entropy": 0.980590432882309, + "epoch": 0.8516666666666667, + "grad_norm": 0.4205039441585541, + "learning_rate": 0.00015087719298245616, + "loss": 0.9929, + "mean_token_accuracy": 0.7566556483507156, + "num_tokens": 6412486.0, + "step": 511 + }, + { + "entropy": 1.0508618205785751, + "epoch": 0.8533333333333334, + "grad_norm": 0.40067777037620544, + "learning_rate": 0.00015076023391812865, + "loss": 1.0756, + "mean_token_accuracy": 0.7451096102595329, + "num_tokens": 6425135.0, + "step": 512 + }, + { + "entropy": 0.8931740894913673, + "epoch": 0.855, + "grad_norm": 0.3182304799556732, + "learning_rate": 0.00015064327485380118, + "loss": 0.9151, + "mean_token_accuracy": 0.7833112999796867, + "num_tokens": 6437931.0, + "step": 513 + }, + { + "entropy": 1.039138525724411, + "epoch": 0.8566666666666667, + "grad_norm": 0.35510554909706116, + "learning_rate": 0.0001505263157894737, + "loss": 1.0345, + "mean_token_accuracy": 0.7493056431412697, + "num_tokens": 6450669.0, + "step": 514 + }, + { + "entropy": 0.9968626797199249, + "epoch": 0.8583333333333333, + "grad_norm": 0.4760706424713135, + "learning_rate": 0.0001504093567251462, + "loss": 0.9712, + "mean_token_accuracy": 0.7673781663179398, + "num_tokens": 6463375.0, + "step": 515 + }, + { + "entropy": 1.2044792175292969, + "epoch": 0.86, + "grad_norm": 0.32037344574928284, + "learning_rate": 0.00015029239766081872, + "loss": 1.1869, + "mean_token_accuracy": 0.7217776477336884, + "num_tokens": 6476101.0, + "step": 516 + }, + { + "entropy": 1.0119360834360123, + "epoch": 0.8616666666666667, + "grad_norm": 0.3619234263896942, + "learning_rate": 0.00015017543859649124, + "loss": 1.0088, + "mean_token_accuracy": 0.7546406164765358, + "num_tokens": 6488799.0, + "step": 517 + }, + { + "entropy": 0.9528638869524002, + "epoch": 0.8633333333333333, + "grad_norm": 0.3633744716644287, + "learning_rate": 0.00015005847953216374, + "loss": 0.9415, + "mean_token_accuracy": 0.7687265649437904, + "num_tokens": 6501728.0, + "step": 518 + }, + { + "entropy": 0.9753984436392784, + "epoch": 0.865, + "grad_norm": 0.35341712832450867, + "learning_rate": 0.00014994152046783626, + "loss": 0.962, + "mean_token_accuracy": 0.7667989581823349, + "num_tokens": 6514229.0, + "step": 519 + }, + { + "entropy": 1.1342968195676804, + "epoch": 0.8666666666666667, + "grad_norm": 0.35511359572410583, + "learning_rate": 0.00014982456140350878, + "loss": 1.1138, + "mean_token_accuracy": 0.7273155152797699, + "num_tokens": 6526771.0, + "step": 520 + }, + { + "entropy": 1.1301146745681763, + "epoch": 0.8683333333333333, + "grad_norm": 0.54920893907547, + "learning_rate": 0.00014970760233918128, + "loss": 1.125, + "mean_token_accuracy": 0.726063072681427, + "num_tokens": 6539121.0, + "step": 521 + }, + { + "entropy": 1.108246959745884, + "epoch": 0.87, + "grad_norm": 0.4356490671634674, + "learning_rate": 0.0001495906432748538, + "loss": 1.1239, + "mean_token_accuracy": 0.7272974625229836, + "num_tokens": 6551645.0, + "step": 522 + }, + { + "entropy": 1.0279260724782944, + "epoch": 0.8716666666666667, + "grad_norm": 0.3281795084476471, + "learning_rate": 0.00014947368421052633, + "loss": 1.0203, + "mean_token_accuracy": 0.7577808052301407, + "num_tokens": 6564090.0, + "step": 523 + }, + { + "entropy": 1.0240607187151909, + "epoch": 0.8733333333333333, + "grad_norm": 0.38245126605033875, + "learning_rate": 0.00014935672514619882, + "loss": 1.0336, + "mean_token_accuracy": 0.754253052175045, + "num_tokens": 6576574.0, + "step": 524 + }, + { + "entropy": 1.0713168308138847, + "epoch": 0.875, + "grad_norm": 0.4837334454059601, + "learning_rate": 0.00014923976608187135, + "loss": 1.0833, + "mean_token_accuracy": 0.7464174851775169, + "num_tokens": 6589106.0, + "step": 525 + }, + { + "entropy": 1.0884229391813278, + "epoch": 0.8766666666666667, + "grad_norm": 0.2943100035190582, + "learning_rate": 0.00014912280701754387, + "loss": 1.1, + "mean_token_accuracy": 0.7427090853452682, + "num_tokens": 6601258.0, + "step": 526 + }, + { + "entropy": 1.2013995423913002, + "epoch": 0.8783333333333333, + "grad_norm": 0.37657371163368225, + "learning_rate": 0.00014900584795321636, + "loss": 1.2225, + "mean_token_accuracy": 0.7097373679280281, + "num_tokens": 6613781.0, + "step": 527 + }, + { + "entropy": 1.1369323432445526, + "epoch": 0.88, + "grad_norm": 0.47003990411758423, + "learning_rate": 0.0001488888888888889, + "loss": 1.1584, + "mean_token_accuracy": 0.7299115657806396, + "num_tokens": 6626534.0, + "step": 528 + }, + { + "entropy": 0.957214891910553, + "epoch": 0.8816666666666667, + "grad_norm": 0.31164368987083435, + "learning_rate": 0.0001487719298245614, + "loss": 0.9584, + "mean_token_accuracy": 0.7725479602813721, + "num_tokens": 6639006.0, + "step": 529 + }, + { + "entropy": 1.051169142127037, + "epoch": 0.8833333333333333, + "grad_norm": 0.3350316882133484, + "learning_rate": 0.00014865497076023393, + "loss": 1.0422, + "mean_token_accuracy": 0.7498800754547119, + "num_tokens": 6651122.0, + "step": 530 + }, + { + "entropy": 1.1519297286868095, + "epoch": 0.885, + "grad_norm": 0.7732307314872742, + "learning_rate": 0.00014853801169590643, + "loss": 1.1511, + "mean_token_accuracy": 0.7270490527153015, + "num_tokens": 6663635.0, + "step": 531 + }, + { + "entropy": 0.9805091023445129, + "epoch": 0.8866666666666667, + "grad_norm": 0.4671543836593628, + "learning_rate": 0.00014842105263157895, + "loss": 0.9635, + "mean_token_accuracy": 0.7717467620968819, + "num_tokens": 6676184.0, + "step": 532 + }, + { + "entropy": 1.125210352241993, + "epoch": 0.8883333333333333, + "grad_norm": 0.3196561932563782, + "learning_rate": 0.00014830409356725148, + "loss": 1.1003, + "mean_token_accuracy": 0.7358310669660568, + "num_tokens": 6688989.0, + "step": 533 + }, + { + "entropy": 0.9900911301374435, + "epoch": 0.89, + "grad_norm": 0.3698914647102356, + "learning_rate": 0.000148187134502924, + "loss": 0.9624, + "mean_token_accuracy": 0.7684887275099754, + "num_tokens": 6701563.0, + "step": 534 + }, + { + "entropy": 1.1607655212283134, + "epoch": 0.8916666666666667, + "grad_norm": 0.36797863245010376, + "learning_rate": 0.0001480701754385965, + "loss": 1.1324, + "mean_token_accuracy": 0.726492203772068, + "num_tokens": 6714026.0, + "step": 535 + }, + { + "entropy": 1.122398853302002, + "epoch": 0.8933333333333333, + "grad_norm": 0.32914605736732483, + "learning_rate": 0.00014795321637426902, + "loss": 1.0988, + "mean_token_accuracy": 0.7404111847281456, + "num_tokens": 6726144.0, + "step": 536 + }, + { + "entropy": 1.1616889387369156, + "epoch": 0.895, + "grad_norm": 0.35523873567581177, + "learning_rate": 0.00014783625730994154, + "loss": 1.2038, + "mean_token_accuracy": 0.7175712808966637, + "num_tokens": 6738984.0, + "step": 537 + }, + { + "entropy": 1.038257472217083, + "epoch": 0.8966666666666666, + "grad_norm": 0.3147364556789398, + "learning_rate": 0.00014771929824561404, + "loss": 1.0323, + "mean_token_accuracy": 0.755009114742279, + "num_tokens": 6751404.0, + "step": 538 + }, + { + "entropy": 1.0097395554184914, + "epoch": 0.8983333333333333, + "grad_norm": 0.38519787788391113, + "learning_rate": 0.00014760233918128656, + "loss": 1.0577, + "mean_token_accuracy": 0.7535994872450829, + "num_tokens": 6763959.0, + "step": 539 + }, + { + "entropy": 1.1510286554694176, + "epoch": 0.9, + "grad_norm": 0.5456175804138184, + "learning_rate": 0.00014748538011695908, + "loss": 1.1552, + "mean_token_accuracy": 0.7292575761675835, + "num_tokens": 6776521.0, + "step": 540 + }, + { + "entropy": 1.0821500197052956, + "epoch": 0.9016666666666666, + "grad_norm": 0.358005166053772, + "learning_rate": 0.00014736842105263158, + "loss": 1.0762, + "mean_token_accuracy": 0.7408832535147667, + "num_tokens": 6789060.0, + "step": 541 + }, + { + "entropy": 1.111682377755642, + "epoch": 0.9033333333333333, + "grad_norm": 0.30763792991638184, + "learning_rate": 0.0001472514619883041, + "loss": 1.1123, + "mean_token_accuracy": 0.7355025187134743, + "num_tokens": 6801425.0, + "step": 542 + }, + { + "entropy": 0.953965000808239, + "epoch": 0.905, + "grad_norm": 0.4438503384590149, + "learning_rate": 0.00014713450292397662, + "loss": 0.9275, + "mean_token_accuracy": 0.7724104151129723, + "num_tokens": 6814332.0, + "step": 543 + }, + { + "entropy": 1.0561645030975342, + "epoch": 0.9066666666666666, + "grad_norm": 0.5390836596488953, + "learning_rate": 0.00014701754385964912, + "loss": 1.035, + "mean_token_accuracy": 0.7513830289244652, + "num_tokens": 6826918.0, + "step": 544 + }, + { + "entropy": 1.144611619412899, + "epoch": 0.9083333333333333, + "grad_norm": 0.6197843551635742, + "learning_rate": 0.00014690058479532164, + "loss": 1.0745, + "mean_token_accuracy": 0.736310139298439, + "num_tokens": 6839687.0, + "step": 545 + }, + { + "entropy": 1.1314348950982094, + "epoch": 0.91, + "grad_norm": 0.4797782003879547, + "learning_rate": 0.00014678362573099417, + "loss": 1.1071, + "mean_token_accuracy": 0.7364379167556763, + "num_tokens": 6852267.0, + "step": 546 + }, + { + "entropy": 1.0520601645112038, + "epoch": 0.9116666666666666, + "grad_norm": 0.40423494577407837, + "learning_rate": 0.00014666666666666666, + "loss": 1.0716, + "mean_token_accuracy": 0.7455736324191093, + "num_tokens": 6864783.0, + "step": 547 + }, + { + "entropy": 1.0261236801743507, + "epoch": 0.9133333333333333, + "grad_norm": 0.38641560077667236, + "learning_rate": 0.00014654970760233919, + "loss": 1.0317, + "mean_token_accuracy": 0.7592292949557304, + "num_tokens": 6877450.0, + "step": 548 + }, + { + "entropy": 0.996756412088871, + "epoch": 0.915, + "grad_norm": 0.45368504524230957, + "learning_rate": 0.0001464327485380117, + "loss": 1.0146, + "mean_token_accuracy": 0.7616635635495186, + "num_tokens": 6889999.0, + "step": 549 + }, + { + "entropy": 1.1064547002315521, + "epoch": 0.9166666666666666, + "grad_norm": 1.2801135778427124, + "learning_rate": 0.00014631578947368423, + "loss": 1.1435, + "mean_token_accuracy": 0.7284985184669495, + "num_tokens": 6902795.0, + "step": 550 + }, + { + "entropy": 1.0498828887939453, + "epoch": 0.9183333333333333, + "grad_norm": 0.33059030771255493, + "learning_rate": 0.00014619883040935673, + "loss": 1.0641, + "mean_token_accuracy": 0.7469140291213989, + "num_tokens": 6915455.0, + "step": 551 + }, + { + "entropy": 1.0957630798220634, + "epoch": 0.92, + "grad_norm": 0.4147501289844513, + "learning_rate": 0.00014608187134502925, + "loss": 1.1234, + "mean_token_accuracy": 0.7397946789860725, + "num_tokens": 6927897.0, + "step": 552 + }, + { + "entropy": 1.0699311718344688, + "epoch": 0.9216666666666666, + "grad_norm": 0.5108545422554016, + "learning_rate": 0.00014596491228070177, + "loss": 1.0595, + "mean_token_accuracy": 0.745146743953228, + "num_tokens": 6940318.0, + "step": 553 + }, + { + "entropy": 1.1025114730000496, + "epoch": 0.9233333333333333, + "grad_norm": 0.4209730625152588, + "learning_rate": 0.00014584795321637427, + "loss": 1.0914, + "mean_token_accuracy": 0.738226130604744, + "num_tokens": 6952938.0, + "step": 554 + }, + { + "entropy": 1.115426942706108, + "epoch": 0.925, + "grad_norm": 0.6522241830825806, + "learning_rate": 0.0001457309941520468, + "loss": 1.105, + "mean_token_accuracy": 0.7367953211069107, + "num_tokens": 6965483.0, + "step": 555 + }, + { + "entropy": 1.0508125722408295, + "epoch": 0.9266666666666666, + "grad_norm": 0.4295806884765625, + "learning_rate": 0.00014561403508771932, + "loss": 1.0411, + "mean_token_accuracy": 0.7544974535703659, + "num_tokens": 6978017.0, + "step": 556 + }, + { + "entropy": 0.9892331138253212, + "epoch": 0.9283333333333333, + "grad_norm": 0.37067389488220215, + "learning_rate": 0.0001454970760233918, + "loss": 0.9823, + "mean_token_accuracy": 0.765356183052063, + "num_tokens": 6990616.0, + "step": 557 + }, + { + "entropy": 1.095439076423645, + "epoch": 0.93, + "grad_norm": 0.5443627238273621, + "learning_rate": 0.00014538011695906433, + "loss": 1.0851, + "mean_token_accuracy": 0.7415796294808388, + "num_tokens": 7003399.0, + "step": 558 + }, + { + "entropy": 1.1333895400166512, + "epoch": 0.9316666666666666, + "grad_norm": 0.5122875571250916, + "learning_rate": 0.00014526315789473686, + "loss": 1.1032, + "mean_token_accuracy": 0.7338119447231293, + "num_tokens": 7015872.0, + "step": 559 + }, + { + "entropy": 1.1227083802223206, + "epoch": 0.9333333333333333, + "grad_norm": 0.3637396991252899, + "learning_rate": 0.00014514619883040935, + "loss": 1.0848, + "mean_token_accuracy": 0.7447784096002579, + "num_tokens": 7028607.0, + "step": 560 + }, + { + "entropy": 1.014683723449707, + "epoch": 0.935, + "grad_norm": 0.44447654485702515, + "learning_rate": 0.00014502923976608188, + "loss": 1.016, + "mean_token_accuracy": 0.7610589489340782, + "num_tokens": 7041311.0, + "step": 561 + }, + { + "entropy": 1.0506829991936684, + "epoch": 0.9366666666666666, + "grad_norm": 0.6196885704994202, + "learning_rate": 0.0001449122807017544, + "loss": 1.067, + "mean_token_accuracy": 0.749383956193924, + "num_tokens": 7054093.0, + "step": 562 + }, + { + "entropy": 1.099650725722313, + "epoch": 0.9383333333333334, + "grad_norm": 0.4171883165836334, + "learning_rate": 0.0001447953216374269, + "loss": 1.0832, + "mean_token_accuracy": 0.7402654960751534, + "num_tokens": 7066465.0, + "step": 563 + }, + { + "entropy": 1.1942984014749527, + "epoch": 0.94, + "grad_norm": 0.33442190289497375, + "learning_rate": 0.00014467836257309942, + "loss": 1.1844, + "mean_token_accuracy": 0.7190638408064842, + "num_tokens": 7079064.0, + "step": 564 + }, + { + "entropy": 1.024195820093155, + "epoch": 0.9416666666666667, + "grad_norm": 0.7153050303459167, + "learning_rate": 0.00014456140350877194, + "loss": 1.0327, + "mean_token_accuracy": 0.7473888471722603, + "num_tokens": 7091659.0, + "step": 565 + }, + { + "entropy": 1.0238566473126411, + "epoch": 0.9433333333333334, + "grad_norm": 0.39869338274002075, + "learning_rate": 0.00014444444444444444, + "loss": 1.0185, + "mean_token_accuracy": 0.7563499286770821, + "num_tokens": 7104066.0, + "step": 566 + }, + { + "entropy": 1.0474332720041275, + "epoch": 0.945, + "grad_norm": 0.3648073971271515, + "learning_rate": 0.00014432748538011696, + "loss": 1.0393, + "mean_token_accuracy": 0.7489822506904602, + "num_tokens": 7116617.0, + "step": 567 + }, + { + "entropy": 1.050786353647709, + "epoch": 0.9466666666666667, + "grad_norm": 0.4042346477508545, + "learning_rate": 0.00014421052631578948, + "loss": 1.0455, + "mean_token_accuracy": 0.755046546459198, + "num_tokens": 7128987.0, + "step": 568 + }, + { + "entropy": 1.034841150045395, + "epoch": 0.9483333333333334, + "grad_norm": 0.4897487163543701, + "learning_rate": 0.00014409356725146198, + "loss": 1.0298, + "mean_token_accuracy": 0.7585836425423622, + "num_tokens": 7141286.0, + "step": 569 + }, + { + "entropy": 0.9620069712400436, + "epoch": 0.95, + "grad_norm": 0.31527337431907654, + "learning_rate": 0.00014397660818713453, + "loss": 0.9693, + "mean_token_accuracy": 0.7721031606197357, + "num_tokens": 7153845.0, + "step": 570 + }, + { + "entropy": 1.0886893197894096, + "epoch": 0.9516666666666667, + "grad_norm": 0.322122186422348, + "learning_rate": 0.00014385964912280703, + "loss": 1.1107, + "mean_token_accuracy": 0.7335245460271835, + "num_tokens": 7166461.0, + "step": 571 + }, + { + "entropy": 1.077827326953411, + "epoch": 0.9533333333333334, + "grad_norm": 0.342206209897995, + "learning_rate": 0.00014374269005847952, + "loss": 1.1087, + "mean_token_accuracy": 0.7354479655623436, + "num_tokens": 7179412.0, + "step": 572 + }, + { + "entropy": 0.9776570126414299, + "epoch": 0.955, + "grad_norm": 0.36812132596969604, + "learning_rate": 0.00014362573099415207, + "loss": 0.9973, + "mean_token_accuracy": 0.7615081444382668, + "num_tokens": 7191954.0, + "step": 573 + }, + { + "entropy": 1.0732961222529411, + "epoch": 0.9566666666666667, + "grad_norm": 0.3786291182041168, + "learning_rate": 0.00014350877192982457, + "loss": 1.0882, + "mean_token_accuracy": 0.7390999048948288, + "num_tokens": 7204637.0, + "step": 574 + }, + { + "entropy": 1.1834777668118477, + "epoch": 0.9583333333333334, + "grad_norm": 0.5077952146530151, + "learning_rate": 0.00014339181286549706, + "loss": 1.1723, + "mean_token_accuracy": 0.721016451716423, + "num_tokens": 7217015.0, + "step": 575 + }, + { + "entropy": 1.0592687726020813, + "epoch": 0.96, + "grad_norm": 0.37753114104270935, + "learning_rate": 0.00014327485380116961, + "loss": 1.03, + "mean_token_accuracy": 0.759399339556694, + "num_tokens": 7229837.0, + "step": 576 + }, + { + "entropy": 1.1041856706142426, + "epoch": 0.9616666666666667, + "grad_norm": 0.6394175291061401, + "learning_rate": 0.0001431578947368421, + "loss": 1.1038, + "mean_token_accuracy": 0.734016478061676, + "num_tokens": 7242931.0, + "step": 577 + }, + { + "entropy": 0.9978655651211739, + "epoch": 0.9633333333333334, + "grad_norm": 0.506219208240509, + "learning_rate": 0.00014304093567251463, + "loss": 0.9685, + "mean_token_accuracy": 0.7632486671209335, + "num_tokens": 7255579.0, + "step": 578 + }, + { + "entropy": 1.181634321808815, + "epoch": 0.965, + "grad_norm": 0.2994771897792816, + "learning_rate": 0.00014292397660818716, + "loss": 1.1717, + "mean_token_accuracy": 0.7260859459638596, + "num_tokens": 7268114.0, + "step": 579 + }, + { + "entropy": 1.074332445859909, + "epoch": 0.9666666666666667, + "grad_norm": 0.4389213025569916, + "learning_rate": 0.00014280701754385965, + "loss": 1.0814, + "mean_token_accuracy": 0.7418688982725143, + "num_tokens": 7280696.0, + "step": 580 + }, + { + "entropy": 1.1216760650277138, + "epoch": 0.9683333333333334, + "grad_norm": 0.445691853761673, + "learning_rate": 0.00014269005847953217, + "loss": 1.1332, + "mean_token_accuracy": 0.7311763614416122, + "num_tokens": 7293554.0, + "step": 581 + }, + { + "entropy": 1.0460280254483223, + "epoch": 0.97, + "grad_norm": 0.4536060690879822, + "learning_rate": 0.0001425730994152047, + "loss": 1.0734, + "mean_token_accuracy": 0.7441517636179924, + "num_tokens": 7306011.0, + "step": 582 + }, + { + "entropy": 0.9758674651384354, + "epoch": 0.9716666666666667, + "grad_norm": 0.3244321048259735, + "learning_rate": 0.0001424561403508772, + "loss": 0.9507, + "mean_token_accuracy": 0.7699640765786171, + "num_tokens": 7318847.0, + "step": 583 + }, + { + "entropy": 1.0076914280653, + "epoch": 0.9733333333333334, + "grad_norm": 0.3470350503921509, + "learning_rate": 0.00014233918128654972, + "loss": 1.0062, + "mean_token_accuracy": 0.7613753005862236, + "num_tokens": 7331344.0, + "step": 584 + }, + { + "entropy": 1.0001231580972672, + "epoch": 0.975, + "grad_norm": 0.47927960753440857, + "learning_rate": 0.00014222222222222224, + "loss": 0.9762, + "mean_token_accuracy": 0.7627003714442253, + "num_tokens": 7344081.0, + "step": 585 + }, + { + "entropy": 1.0094628855586052, + "epoch": 0.9766666666666667, + "grad_norm": 0.3564485013484955, + "learning_rate": 0.00014210526315789474, + "loss": 0.9876, + "mean_token_accuracy": 0.7616174221038818, + "num_tokens": 7356857.0, + "step": 586 + }, + { + "entropy": 1.1501418203115463, + "epoch": 0.9783333333333334, + "grad_norm": 0.4095627963542938, + "learning_rate": 0.00014198830409356726, + "loss": 1.1615, + "mean_token_accuracy": 0.7238694280385971, + "num_tokens": 7369198.0, + "step": 587 + }, + { + "entropy": 0.9223703965544701, + "epoch": 0.98, + "grad_norm": 0.5018359422683716, + "learning_rate": 0.00014187134502923978, + "loss": 0.8939, + "mean_token_accuracy": 0.7803265228867531, + "num_tokens": 7381601.0, + "step": 588 + }, + { + "entropy": 1.0110130235552788, + "epoch": 0.9816666666666667, + "grad_norm": 0.311329185962677, + "learning_rate": 0.00014175438596491228, + "loss": 1.0116, + "mean_token_accuracy": 0.7552156001329422, + "num_tokens": 7393736.0, + "step": 589 + }, + { + "entropy": 1.1393386349081993, + "epoch": 0.9833333333333333, + "grad_norm": 0.37725117802619934, + "learning_rate": 0.0001416374269005848, + "loss": 1.1883, + "mean_token_accuracy": 0.7277703955769539, + "num_tokens": 7406423.0, + "step": 590 + }, + { + "entropy": 1.0763930901885033, + "epoch": 0.985, + "grad_norm": 0.39282989501953125, + "learning_rate": 0.00014152046783625732, + "loss": 1.0786, + "mean_token_accuracy": 0.7410120218992233, + "num_tokens": 7419133.0, + "step": 591 + }, + { + "entropy": 1.016746073961258, + "epoch": 0.9866666666666667, + "grad_norm": 0.3819109797477722, + "learning_rate": 0.00014140350877192982, + "loss": 1.0051, + "mean_token_accuracy": 0.7601424679160118, + "num_tokens": 7431867.0, + "step": 592 + }, + { + "entropy": 1.1087006330490112, + "epoch": 0.9883333333333333, + "grad_norm": 0.2999494969844818, + "learning_rate": 0.00014128654970760234, + "loss": 1.1193, + "mean_token_accuracy": 0.7328962907195091, + "num_tokens": 7444366.0, + "step": 593 + }, + { + "entropy": 1.0927283689379692, + "epoch": 0.99, + "grad_norm": 0.38941463828086853, + "learning_rate": 0.00014116959064327487, + "loss": 1.0806, + "mean_token_accuracy": 0.7454044669866562, + "num_tokens": 7457006.0, + "step": 594 + }, + { + "entropy": 1.0681473091244698, + "epoch": 0.9916666666666667, + "grad_norm": 0.38292431831359863, + "learning_rate": 0.00014105263157894736, + "loss": 1.0783, + "mean_token_accuracy": 0.7404307276010513, + "num_tokens": 7469273.0, + "step": 595 + }, + { + "entropy": 1.133509248495102, + "epoch": 0.9933333333333333, + "grad_norm": 0.3152455985546112, + "learning_rate": 0.0001409356725146199, + "loss": 1.1119, + "mean_token_accuracy": 0.7372884005308151, + "num_tokens": 7481852.0, + "step": 596 + }, + { + "entropy": 1.166313149034977, + "epoch": 0.995, + "grad_norm": 0.3544023633003235, + "learning_rate": 0.0001408187134502924, + "loss": 1.1587, + "mean_token_accuracy": 0.7204631865024567, + "num_tokens": 7494352.0, + "step": 597 + }, + { + "entropy": 1.128667414188385, + "epoch": 0.9966666666666667, + "grad_norm": 0.3434421718120575, + "learning_rate": 0.0001407017543859649, + "loss": 1.1268, + "mean_token_accuracy": 0.7295839041471481, + "num_tokens": 7506807.0, + "step": 598 + }, + { + "entropy": 0.996250681579113, + "epoch": 0.9983333333333333, + "grad_norm": 0.29589951038360596, + "learning_rate": 0.00014058479532163745, + "loss": 0.9773, + "mean_token_accuracy": 0.7680518105626106, + "num_tokens": 7519235.0, + "step": 599 + }, + { + "entropy": 1.0180830582976341, + "epoch": 1.0, + "grad_norm": 0.32254987955093384, + "learning_rate": 0.00014046783625730995, + "loss": 0.9957, + "mean_token_accuracy": 0.7620798721909523, + "num_tokens": 7531661.0, + "step": 600 + }, + { + "entropy": 1.1241462379693985, + "epoch": 1.0016666666666667, + "grad_norm": 0.3471614718437195, + "learning_rate": 0.00014035087719298245, + "loss": 1.11, + "mean_token_accuracy": 0.736703634262085, + "num_tokens": 7544221.0, + "step": 601 + }, + { + "entropy": 1.0065209418535233, + "epoch": 1.0033333333333334, + "grad_norm": 0.3413226008415222, + "learning_rate": 0.000140233918128655, + "loss": 0.9849, + "mean_token_accuracy": 0.7642301768064499, + "num_tokens": 7556642.0, + "step": 602 + }, + { + "entropy": 1.2092658504843712, + "epoch": 1.005, + "grad_norm": 0.29548096656799316, + "learning_rate": 0.0001401169590643275, + "loss": 1.195, + "mean_token_accuracy": 0.719109907746315, + "num_tokens": 7569043.0, + "step": 603 + }, + { + "entropy": 1.1198002099990845, + "epoch": 1.0066666666666666, + "grad_norm": 0.45361337065696716, + "learning_rate": 0.00014, + "loss": 1.1547, + "mean_token_accuracy": 0.7289656400680542, + "num_tokens": 7581641.0, + "step": 604 + }, + { + "entropy": 1.211386151611805, + "epoch": 1.0083333333333333, + "grad_norm": 0.3899124264717102, + "learning_rate": 0.00013988304093567254, + "loss": 1.2301, + "mean_token_accuracy": 0.713324747979641, + "num_tokens": 7594098.0, + "step": 605 + }, + { + "entropy": 1.0142735317349434, + "epoch": 1.01, + "grad_norm": 0.2941846549510956, + "learning_rate": 0.00013976608187134503, + "loss": 1.0154, + "mean_token_accuracy": 0.7602197378873825, + "num_tokens": 7606658.0, + "step": 606 + }, + { + "entropy": 0.9515361040830612, + "epoch": 1.0116666666666667, + "grad_norm": 0.3320436477661133, + "learning_rate": 0.00013964912280701753, + "loss": 0.9443, + "mean_token_accuracy": 0.7731414288282394, + "num_tokens": 7619292.0, + "step": 607 + }, + { + "entropy": 0.9823862388730049, + "epoch": 1.0133333333333334, + "grad_norm": 0.45267120003700256, + "learning_rate": 0.00013953216374269008, + "loss": 0.984, + "mean_token_accuracy": 0.7694632411003113, + "num_tokens": 7631882.0, + "step": 608 + }, + { + "entropy": 1.0947947576642036, + "epoch": 1.015, + "grad_norm": 0.34952351450920105, + "learning_rate": 0.00013941520467836258, + "loss": 1.0837, + "mean_token_accuracy": 0.7458218857645988, + "num_tokens": 7644653.0, + "step": 609 + }, + { + "entropy": 1.0054941028356552, + "epoch": 1.0166666666666666, + "grad_norm": 0.4093509316444397, + "learning_rate": 0.00013929824561403507, + "loss": 0.9902, + "mean_token_accuracy": 0.7687611132860184, + "num_tokens": 7657159.0, + "step": 610 + }, + { + "entropy": 1.1774860545992851, + "epoch": 1.0183333333333333, + "grad_norm": 0.3505180776119232, + "learning_rate": 0.00013918128654970762, + "loss": 1.1882, + "mean_token_accuracy": 0.7232249453663826, + "num_tokens": 7669875.0, + "step": 611 + }, + { + "entropy": 1.0407119169831276, + "epoch": 1.02, + "grad_norm": 0.34889596700668335, + "learning_rate": 0.00013906432748538012, + "loss": 1.0021, + "mean_token_accuracy": 0.7623367980122566, + "num_tokens": 7682410.0, + "step": 612 + }, + { + "entropy": 0.9798188135027885, + "epoch": 1.0216666666666667, + "grad_norm": 0.34436383843421936, + "learning_rate": 0.00013894736842105264, + "loss": 0.9655, + "mean_token_accuracy": 0.7712270691990852, + "num_tokens": 7695026.0, + "step": 613 + }, + { + "entropy": 1.0230854898691177, + "epoch": 1.0233333333333334, + "grad_norm": 0.30474236607551575, + "learning_rate": 0.00013883040935672516, + "loss": 1.0168, + "mean_token_accuracy": 0.7577306106686592, + "num_tokens": 7707287.0, + "step": 614 + }, + { + "entropy": 1.0280113369226456, + "epoch": 1.025, + "grad_norm": 0.3403511345386505, + "learning_rate": 0.00013871345029239766, + "loss": 1.0075, + "mean_token_accuracy": 0.7558683082461357, + "num_tokens": 7720140.0, + "step": 615 + }, + { + "entropy": 1.0159951895475388, + "epoch": 1.0266666666666666, + "grad_norm": 0.4081074297428131, + "learning_rate": 0.00013859649122807018, + "loss": 1.0084, + "mean_token_accuracy": 0.7621957957744598, + "num_tokens": 7732366.0, + "step": 616 + }, + { + "entropy": 1.110385812819004, + "epoch": 1.0283333333333333, + "grad_norm": 0.35655292868614197, + "learning_rate": 0.0001384795321637427, + "loss": 1.1002, + "mean_token_accuracy": 0.7427221015095711, + "num_tokens": 7745123.0, + "step": 617 + }, + { + "entropy": 0.8933522030711174, + "epoch": 1.03, + "grad_norm": 0.44295790791511536, + "learning_rate": 0.0001383625730994152, + "loss": 0.8798, + "mean_token_accuracy": 0.7849943116307259, + "num_tokens": 7757701.0, + "step": 618 + }, + { + "entropy": 1.1147135198116302, + "epoch": 1.0316666666666667, + "grad_norm": 0.40337374806404114, + "learning_rate": 0.00013824561403508772, + "loss": 1.1254, + "mean_token_accuracy": 0.7315419912338257, + "num_tokens": 7770054.0, + "step": 619 + }, + { + "entropy": 0.8377392664551735, + "epoch": 1.0333333333333334, + "grad_norm": 0.47487035393714905, + "learning_rate": 0.00013812865497076025, + "loss": 0.8388, + "mean_token_accuracy": 0.7951571643352509, + "num_tokens": 7782805.0, + "step": 620 + }, + { + "entropy": 1.1692701056599617, + "epoch": 1.035, + "grad_norm": 0.31510019302368164, + "learning_rate": 0.00013801169590643274, + "loss": 1.1988, + "mean_token_accuracy": 0.7232685908675194, + "num_tokens": 7795507.0, + "step": 621 + }, + { + "entropy": 1.06218171864748, + "epoch": 1.0366666666666666, + "grad_norm": 0.5403528809547424, + "learning_rate": 0.00013789473684210527, + "loss": 1.0487, + "mean_token_accuracy": 0.7484084740281105, + "num_tokens": 7807693.0, + "step": 622 + }, + { + "entropy": 0.9076170176267624, + "epoch": 1.0383333333333333, + "grad_norm": 0.3648722171783447, + "learning_rate": 0.0001377777777777778, + "loss": 0.9021, + "mean_token_accuracy": 0.7875769883394241, + "num_tokens": 7820364.0, + "step": 623 + }, + { + "entropy": 1.0128463730216026, + "epoch": 1.04, + "grad_norm": 0.35606029629707336, + "learning_rate": 0.00013766081871345029, + "loss": 1.0369, + "mean_token_accuracy": 0.7596360221505165, + "num_tokens": 7833058.0, + "step": 624 + }, + { + "entropy": 1.127178505063057, + "epoch": 1.0416666666666667, + "grad_norm": 0.3587397038936615, + "learning_rate": 0.0001375438596491228, + "loss": 1.1201, + "mean_token_accuracy": 0.7313251048326492, + "num_tokens": 7845496.0, + "step": 625 + }, + { + "entropy": 0.9487481713294983, + "epoch": 1.0433333333333334, + "grad_norm": 0.4658021926879883, + "learning_rate": 0.00013742690058479533, + "loss": 0.94, + "mean_token_accuracy": 0.778320774435997, + "num_tokens": 7858159.0, + "step": 626 + }, + { + "entropy": 1.1421342343091965, + "epoch": 1.045, + "grad_norm": 0.361741840839386, + "learning_rate": 0.00013730994152046783, + "loss": 1.1211, + "mean_token_accuracy": 0.729691281914711, + "num_tokens": 7870529.0, + "step": 627 + }, + { + "entropy": 1.1835851818323135, + "epoch": 1.0466666666666666, + "grad_norm": 0.41982337832450867, + "learning_rate": 0.00013719298245614035, + "loss": 1.1648, + "mean_token_accuracy": 0.7253150641918182, + "num_tokens": 7882888.0, + "step": 628 + }, + { + "entropy": 1.0434592813253403, + "epoch": 1.0483333333333333, + "grad_norm": 0.3684554100036621, + "learning_rate": 0.00013707602339181287, + "loss": 1.0318, + "mean_token_accuracy": 0.7509391456842422, + "num_tokens": 7895483.0, + "step": 629 + }, + { + "entropy": 1.0862918049097061, + "epoch": 1.05, + "grad_norm": 0.3779102563858032, + "learning_rate": 0.00013695906432748537, + "loss": 1.0687, + "mean_token_accuracy": 0.7403496354818344, + "num_tokens": 7908116.0, + "step": 630 + }, + { + "entropy": 1.0131681859493256, + "epoch": 1.0516666666666667, + "grad_norm": 0.3559149503707886, + "learning_rate": 0.0001368421052631579, + "loss": 0.9913, + "mean_token_accuracy": 0.7679590433835983, + "num_tokens": 7920785.0, + "step": 631 + }, + { + "entropy": 0.9745831564068794, + "epoch": 1.0533333333333332, + "grad_norm": 0.344426691532135, + "learning_rate": 0.00013672514619883042, + "loss": 0.9656, + "mean_token_accuracy": 0.7636517956852913, + "num_tokens": 7933396.0, + "step": 632 + }, + { + "entropy": 0.8822155594825745, + "epoch": 1.055, + "grad_norm": 0.41743558645248413, + "learning_rate": 0.00013660818713450294, + "loss": 0.881, + "mean_token_accuracy": 0.7784114480018616, + "num_tokens": 7945821.0, + "step": 633 + }, + { + "entropy": 0.966235339641571, + "epoch": 1.0566666666666666, + "grad_norm": 0.4000849723815918, + "learning_rate": 0.00013649122807017543, + "loss": 0.9528, + "mean_token_accuracy": 0.7646506726741791, + "num_tokens": 7958541.0, + "step": 634 + }, + { + "entropy": 1.165940783917904, + "epoch": 1.0583333333333333, + "grad_norm": 0.527256965637207, + "learning_rate": 0.00013637426900584796, + "loss": 1.1738, + "mean_token_accuracy": 0.7271719202399254, + "num_tokens": 7970896.0, + "step": 635 + }, + { + "entropy": 1.0754114240407944, + "epoch": 1.06, + "grad_norm": 0.38329169154167175, + "learning_rate": 0.00013625730994152048, + "loss": 1.0728, + "mean_token_accuracy": 0.7432007640600204, + "num_tokens": 7983186.0, + "step": 636 + }, + { + "entropy": 0.948194220662117, + "epoch": 1.0616666666666668, + "grad_norm": 0.37484967708587646, + "learning_rate": 0.000136140350877193, + "loss": 0.9581, + "mean_token_accuracy": 0.7700267806649208, + "num_tokens": 7995749.0, + "step": 637 + }, + { + "entropy": 1.0313308089971542, + "epoch": 1.0633333333333332, + "grad_norm": 0.356343537569046, + "learning_rate": 0.0001360233918128655, + "loss": 1.056, + "mean_token_accuracy": 0.7483900561928749, + "num_tokens": 8008421.0, + "step": 638 + }, + { + "entropy": 0.9141811951994896, + "epoch": 1.065, + "grad_norm": 0.5060378313064575, + "learning_rate": 0.00013590643274853802, + "loss": 0.9068, + "mean_token_accuracy": 0.7790825441479683, + "num_tokens": 8021117.0, + "step": 639 + }, + { + "entropy": 0.9227776229381561, + "epoch": 1.0666666666666667, + "grad_norm": 0.3761998414993286, + "learning_rate": 0.00013578947368421055, + "loss": 0.9256, + "mean_token_accuracy": 0.7767295092344284, + "num_tokens": 8033634.0, + "step": 640 + }, + { + "entropy": 1.018035314977169, + "epoch": 1.0683333333333334, + "grad_norm": 0.33513742685317993, + "learning_rate": 0.00013567251461988304, + "loss": 1.0056, + "mean_token_accuracy": 0.7575684189796448, + "num_tokens": 8046127.0, + "step": 641 + }, + { + "entropy": 0.9292529672384262, + "epoch": 1.07, + "grad_norm": 1.8294329643249512, + "learning_rate": 0.00013555555555555556, + "loss": 0.9008, + "mean_token_accuracy": 0.7848961800336838, + "num_tokens": 8058773.0, + "step": 642 + }, + { + "entropy": 1.1138723865151405, + "epoch": 1.0716666666666668, + "grad_norm": 0.44567447900772095, + "learning_rate": 0.0001354385964912281, + "loss": 1.0769, + "mean_token_accuracy": 0.7429224848747253, + "num_tokens": 8071248.0, + "step": 643 + }, + { + "entropy": 1.099638320505619, + "epoch": 1.0733333333333333, + "grad_norm": 0.3087984621524811, + "learning_rate": 0.00013532163742690058, + "loss": 1.0883, + "mean_token_accuracy": 0.7450162544846535, + "num_tokens": 8083877.0, + "step": 644 + }, + { + "entropy": 1.0040937513113022, + "epoch": 1.075, + "grad_norm": 0.4541734457015991, + "learning_rate": 0.0001352046783625731, + "loss": 0.9684, + "mean_token_accuracy": 0.7722791135311127, + "num_tokens": 8096576.0, + "step": 645 + }, + { + "entropy": 0.9650404900312424, + "epoch": 1.0766666666666667, + "grad_norm": 0.48237717151641846, + "learning_rate": 0.00013508771929824563, + "loss": 0.9591, + "mean_token_accuracy": 0.7744063958525658, + "num_tokens": 8109075.0, + "step": 646 + }, + { + "entropy": 0.9337432831525803, + "epoch": 1.0783333333333334, + "grad_norm": 0.43152591586112976, + "learning_rate": 0.00013497076023391813, + "loss": 0.9395, + "mean_token_accuracy": 0.7721759006381035, + "num_tokens": 8121886.0, + "step": 647 + }, + { + "entropy": 1.1030597686767578, + "epoch": 1.08, + "grad_norm": 0.6193279027938843, + "learning_rate": 0.00013485380116959065, + "loss": 1.1343, + "mean_token_accuracy": 0.7327957898378372, + "num_tokens": 8134506.0, + "step": 648 + }, + { + "entropy": 1.0601794198155403, + "epoch": 1.0816666666666666, + "grad_norm": 0.32980358600616455, + "learning_rate": 0.00013473684210526317, + "loss": 1.0671, + "mean_token_accuracy": 0.7458383813500404, + "num_tokens": 8146814.0, + "step": 649 + }, + { + "entropy": 1.0408655479550362, + "epoch": 1.0833333333333333, + "grad_norm": 0.5926406383514404, + "learning_rate": 0.00013461988304093567, + "loss": 1.0491, + "mean_token_accuracy": 0.7476745769381523, + "num_tokens": 8159248.0, + "step": 650 + }, + { + "entropy": 1.0431873723864555, + "epoch": 1.085, + "grad_norm": 0.4574386775493622, + "learning_rate": 0.0001345029239766082, + "loss": 1.0554, + "mean_token_accuracy": 0.7551540955901146, + "num_tokens": 8171730.0, + "step": 651 + }, + { + "entropy": 1.086970031261444, + "epoch": 1.0866666666666667, + "grad_norm": 0.36465033888816833, + "learning_rate": 0.00013438596491228071, + "loss": 1.0768, + "mean_token_accuracy": 0.7430087327957153, + "num_tokens": 8184030.0, + "step": 652 + }, + { + "entropy": 0.9763112142682076, + "epoch": 1.0883333333333334, + "grad_norm": 0.4279404878616333, + "learning_rate": 0.00013426900584795324, + "loss": 0.9455, + "mean_token_accuracy": 0.7743237987160683, + "num_tokens": 8196477.0, + "step": 653 + }, + { + "entropy": 0.9648814126849174, + "epoch": 1.09, + "grad_norm": 0.38802456855773926, + "learning_rate": 0.00013415204678362573, + "loss": 0.9467, + "mean_token_accuracy": 0.7732060924172401, + "num_tokens": 8209184.0, + "step": 654 + }, + { + "entropy": 0.9697625562548637, + "epoch": 1.0916666666666666, + "grad_norm": 0.5765432119369507, + "learning_rate": 0.00013403508771929826, + "loss": 0.9383, + "mean_token_accuracy": 0.7735694646835327, + "num_tokens": 8221636.0, + "step": 655 + }, + { + "entropy": 1.032564863562584, + "epoch": 1.0933333333333333, + "grad_norm": 0.32644009590148926, + "learning_rate": 0.00013391812865497078, + "loss": 0.998, + "mean_token_accuracy": 0.7601174414157867, + "num_tokens": 8234233.0, + "step": 656 + }, + { + "entropy": 0.9871031567454338, + "epoch": 1.095, + "grad_norm": 0.37354862689971924, + "learning_rate": 0.00013380116959064327, + "loss": 0.9648, + "mean_token_accuracy": 0.7655048966407776, + "num_tokens": 8246750.0, + "step": 657 + }, + { + "entropy": 0.987128734588623, + "epoch": 1.0966666666666667, + "grad_norm": 0.35152730345726013, + "learning_rate": 0.0001336842105263158, + "loss": 0.9735, + "mean_token_accuracy": 0.7622586041688919, + "num_tokens": 8259211.0, + "step": 658 + }, + { + "entropy": 1.0830785781145096, + "epoch": 1.0983333333333334, + "grad_norm": 0.36088189482688904, + "learning_rate": 0.00013356725146198832, + "loss": 1.1024, + "mean_token_accuracy": 0.7382792606949806, + "num_tokens": 8271621.0, + "step": 659 + }, + { + "entropy": 1.0922202467918396, + "epoch": 1.1, + "grad_norm": 0.43077078461647034, + "learning_rate": 0.00013345029239766082, + "loss": 1.0774, + "mean_token_accuracy": 0.7471672371029854, + "num_tokens": 8284002.0, + "step": 660 + }, + { + "entropy": 1.1043326631188393, + "epoch": 1.1016666666666666, + "grad_norm": 0.35150665044784546, + "learning_rate": 0.00013333333333333334, + "loss": 1.0946, + "mean_token_accuracy": 0.7388827204704285, + "num_tokens": 8296436.0, + "step": 661 + }, + { + "entropy": 0.9849683940410614, + "epoch": 1.1033333333333333, + "grad_norm": 0.3895672857761383, + "learning_rate": 0.00013321637426900586, + "loss": 0.9802, + "mean_token_accuracy": 0.7634393870830536, + "num_tokens": 8309100.0, + "step": 662 + }, + { + "entropy": 1.1219883561134338, + "epoch": 1.105, + "grad_norm": 0.3513847887516022, + "learning_rate": 0.00013309941520467836, + "loss": 1.1286, + "mean_token_accuracy": 0.7343808338046074, + "num_tokens": 8321419.0, + "step": 663 + }, + { + "entropy": 1.055749535560608, + "epoch": 1.1066666666666667, + "grad_norm": 0.3936227858066559, + "learning_rate": 0.00013298245614035088, + "loss": 1.0459, + "mean_token_accuracy": 0.7490439489483833, + "num_tokens": 8333640.0, + "step": 664 + }, + { + "entropy": 0.9720334634184837, + "epoch": 1.1083333333333334, + "grad_norm": 0.3029753267765045, + "learning_rate": 0.0001328654970760234, + "loss": 0.9635, + "mean_token_accuracy": 0.7607943564653397, + "num_tokens": 8346373.0, + "step": 665 + }, + { + "entropy": 1.1321242824196815, + "epoch": 1.11, + "grad_norm": 0.47852373123168945, + "learning_rate": 0.0001327485380116959, + "loss": 1.1348, + "mean_token_accuracy": 0.7242363542318344, + "num_tokens": 8358810.0, + "step": 666 + }, + { + "entropy": 0.9721924886107445, + "epoch": 1.1116666666666666, + "grad_norm": 0.9138944149017334, + "learning_rate": 0.00013263157894736842, + "loss": 0.9505, + "mean_token_accuracy": 0.7762376815080643, + "num_tokens": 8371471.0, + "step": 667 + }, + { + "entropy": 0.9570804685354233, + "epoch": 1.1133333333333333, + "grad_norm": 0.42705467343330383, + "learning_rate": 0.00013251461988304095, + "loss": 0.9619, + "mean_token_accuracy": 0.7696249037981033, + "num_tokens": 8384041.0, + "step": 668 + }, + { + "entropy": 1.0321296378970146, + "epoch": 1.115, + "grad_norm": 0.382088303565979, + "learning_rate": 0.00013239766081871344, + "loss": 1.0341, + "mean_token_accuracy": 0.7548602446913719, + "num_tokens": 8396378.0, + "step": 669 + }, + { + "entropy": 1.1567051485180855, + "epoch": 1.1166666666666667, + "grad_norm": 0.3601890206336975, + "learning_rate": 0.00013228070175438597, + "loss": 1.1401, + "mean_token_accuracy": 0.7298571541905403, + "num_tokens": 8408835.0, + "step": 670 + }, + { + "entropy": 1.1171844527125359, + "epoch": 1.1183333333333334, + "grad_norm": 0.35644233226776123, + "learning_rate": 0.0001321637426900585, + "loss": 1.1135, + "mean_token_accuracy": 0.7378202676773071, + "num_tokens": 8421299.0, + "step": 671 + }, + { + "entropy": 1.0986279770731926, + "epoch": 1.12, + "grad_norm": 0.36225011944770813, + "learning_rate": 0.00013204678362573098, + "loss": 1.0893, + "mean_token_accuracy": 0.7424418106675148, + "num_tokens": 8433566.0, + "step": 672 + }, + { + "entropy": 0.9406923651695251, + "epoch": 1.1216666666666666, + "grad_norm": 0.36922165751457214, + "learning_rate": 0.00013192982456140353, + "loss": 0.92, + "mean_token_accuracy": 0.7776926532387733, + "num_tokens": 8445982.0, + "step": 673 + }, + { + "entropy": 0.982761062681675, + "epoch": 1.1233333333333333, + "grad_norm": 0.4194695055484772, + "learning_rate": 0.00013181286549707603, + "loss": 0.9678, + "mean_token_accuracy": 0.7725221887230873, + "num_tokens": 8458397.0, + "step": 674 + }, + { + "entropy": 1.0702436491847038, + "epoch": 1.125, + "grad_norm": 0.3364623785018921, + "learning_rate": 0.00013169590643274853, + "loss": 1.0965, + "mean_token_accuracy": 0.7367196604609489, + "num_tokens": 8470996.0, + "step": 675 + }, + { + "entropy": 1.117453172802925, + "epoch": 1.1266666666666667, + "grad_norm": 0.35230013728141785, + "learning_rate": 0.00013157894736842108, + "loss": 1.1182, + "mean_token_accuracy": 0.736903615295887, + "num_tokens": 8483765.0, + "step": 676 + }, + { + "entropy": 1.088931068778038, + "epoch": 1.1283333333333334, + "grad_norm": 0.37009745836257935, + "learning_rate": 0.00013146198830409357, + "loss": 1.1245, + "mean_token_accuracy": 0.7352629750967026, + "num_tokens": 8496465.0, + "step": 677 + }, + { + "entropy": 1.0094858780503273, + "epoch": 1.13, + "grad_norm": 0.3854370713233948, + "learning_rate": 0.00013134502923976607, + "loss": 1.0274, + "mean_token_accuracy": 0.7591015845537186, + "num_tokens": 8509037.0, + "step": 678 + }, + { + "entropy": 1.1349955797195435, + "epoch": 1.1316666666666666, + "grad_norm": 0.38764774799346924, + "learning_rate": 0.00013122807017543862, + "loss": 1.1466, + "mean_token_accuracy": 0.7316305935382843, + "num_tokens": 8521698.0, + "step": 679 + }, + { + "entropy": 0.9581394642591476, + "epoch": 1.1333333333333333, + "grad_norm": 0.3510202169418335, + "learning_rate": 0.00013111111111111111, + "loss": 0.9305, + "mean_token_accuracy": 0.7765649557113647, + "num_tokens": 8534228.0, + "step": 680 + }, + { + "entropy": 1.0585462525486946, + "epoch": 1.135, + "grad_norm": 0.6673049330711365, + "learning_rate": 0.00013099415204678364, + "loss": 1.0507, + "mean_token_accuracy": 0.7504134178161621, + "num_tokens": 8546770.0, + "step": 681 + }, + { + "entropy": 1.122896485030651, + "epoch": 1.1366666666666667, + "grad_norm": 0.34946927428245544, + "learning_rate": 0.00013087719298245616, + "loss": 1.1012, + "mean_token_accuracy": 0.7360977083444595, + "num_tokens": 8559354.0, + "step": 682 + }, + { + "entropy": 1.0782023295760155, + "epoch": 1.1383333333333334, + "grad_norm": 0.34809041023254395, + "learning_rate": 0.00013076023391812866, + "loss": 1.0375, + "mean_token_accuracy": 0.7485805526375771, + "num_tokens": 8571826.0, + "step": 683 + }, + { + "entropy": 0.932886391878128, + "epoch": 1.1400000000000001, + "grad_norm": 0.520618736743927, + "learning_rate": 0.00013064327485380118, + "loss": 0.9169, + "mean_token_accuracy": 0.7839078307151794, + "num_tokens": 8584449.0, + "step": 684 + }, + { + "entropy": 0.9941971525549889, + "epoch": 1.1416666666666666, + "grad_norm": 0.3270869851112366, + "learning_rate": 0.0001305263157894737, + "loss": 0.999, + "mean_token_accuracy": 0.769331268966198, + "num_tokens": 8596790.0, + "step": 685 + }, + { + "entropy": 1.0389942303299904, + "epoch": 1.1433333333333333, + "grad_norm": 0.37851324677467346, + "learning_rate": 0.0001304093567251462, + "loss": 1.0379, + "mean_token_accuracy": 0.7544435933232307, + "num_tokens": 8609395.0, + "step": 686 + }, + { + "entropy": 0.9451502487063408, + "epoch": 1.145, + "grad_norm": 0.5139569640159607, + "learning_rate": 0.00013029239766081872, + "loss": 0.9436, + "mean_token_accuracy": 0.7673129811882973, + "num_tokens": 8622265.0, + "step": 687 + }, + { + "entropy": 1.1505193412303925, + "epoch": 1.1466666666666667, + "grad_norm": 0.6956432461738586, + "learning_rate": 0.00013017543859649124, + "loss": 1.1384, + "mean_token_accuracy": 0.7312004566192627, + "num_tokens": 8634777.0, + "step": 688 + }, + { + "entropy": 1.1453742682933807, + "epoch": 1.1483333333333334, + "grad_norm": 0.8315491676330566, + "learning_rate": 0.00013005847953216374, + "loss": 1.1393, + "mean_token_accuracy": 0.7327683791518211, + "num_tokens": 8647293.0, + "step": 689 + }, + { + "entropy": 0.924394853413105, + "epoch": 1.15, + "grad_norm": 0.43016624450683594, + "learning_rate": 0.00012994152046783626, + "loss": 0.9055, + "mean_token_accuracy": 0.7811589166522026, + "num_tokens": 8659936.0, + "step": 690 + }, + { + "entropy": 1.1657907515764236, + "epoch": 1.1516666666666666, + "grad_norm": 0.3746880292892456, + "learning_rate": 0.0001298245614035088, + "loss": 1.1476, + "mean_token_accuracy": 0.7288665175437927, + "num_tokens": 8672578.0, + "step": 691 + }, + { + "entropy": 1.1058854684233665, + "epoch": 1.1533333333333333, + "grad_norm": 0.5405567288398743, + "learning_rate": 0.00012970760233918128, + "loss": 1.1241, + "mean_token_accuracy": 0.7345702573657036, + "num_tokens": 8684983.0, + "step": 692 + }, + { + "entropy": 0.9981840997934341, + "epoch": 1.155, + "grad_norm": 0.7133622765541077, + "learning_rate": 0.0001295906432748538, + "loss": 0.99, + "mean_token_accuracy": 0.7627017050981522, + "num_tokens": 8697774.0, + "step": 693 + }, + { + "entropy": 1.0237222835421562, + "epoch": 1.1566666666666667, + "grad_norm": 0.4631465673446655, + "learning_rate": 0.00012947368421052633, + "loss": 0.9932, + "mean_token_accuracy": 0.7638612240552902, + "num_tokens": 8709955.0, + "step": 694 + }, + { + "entropy": 1.0166835561394691, + "epoch": 1.1583333333333332, + "grad_norm": 0.4396076798439026, + "learning_rate": 0.00012935672514619882, + "loss": 1.0102, + "mean_token_accuracy": 0.759076252579689, + "num_tokens": 8722501.0, + "step": 695 + }, + { + "entropy": 1.139280654489994, + "epoch": 1.16, + "grad_norm": 0.5104596614837646, + "learning_rate": 0.00012923976608187135, + "loss": 1.1324, + "mean_token_accuracy": 0.7318560257554054, + "num_tokens": 8735256.0, + "step": 696 + }, + { + "entropy": 0.996860571205616, + "epoch": 1.1616666666666666, + "grad_norm": 0.41430947184562683, + "learning_rate": 0.00012912280701754387, + "loss": 0.9843, + "mean_token_accuracy": 0.7681520283222198, + "num_tokens": 8748117.0, + "step": 697 + }, + { + "entropy": 1.088785506784916, + "epoch": 1.1633333333333333, + "grad_norm": 0.4602546989917755, + "learning_rate": 0.00012900584795321637, + "loss": 1.0874, + "mean_token_accuracy": 0.7467052713036537, + "num_tokens": 8760752.0, + "step": 698 + }, + { + "entropy": 1.0092452727258205, + "epoch": 1.165, + "grad_norm": 0.43864133954048157, + "learning_rate": 0.00012888888888888892, + "loss": 0.9948, + "mean_token_accuracy": 0.7639948949217796, + "num_tokens": 8773427.0, + "step": 699 + }, + { + "entropy": 1.0612296387553215, + "epoch": 1.1666666666666667, + "grad_norm": 0.424075722694397, + "learning_rate": 0.0001287719298245614, + "loss": 1.0701, + "mean_token_accuracy": 0.7479716464877129, + "num_tokens": 8786118.0, + "step": 700 + }, + { + "entropy": 1.0840974599123, + "epoch": 1.1683333333333334, + "grad_norm": 0.561173677444458, + "learning_rate": 0.0001286549707602339, + "loss": 1.07, + "mean_token_accuracy": 0.7422567680478096, + "num_tokens": 8798822.0, + "step": 701 + }, + { + "entropy": 1.059173308312893, + "epoch": 1.17, + "grad_norm": 0.33119791746139526, + "learning_rate": 0.00012853801169590646, + "loss": 1.0362, + "mean_token_accuracy": 0.7531376257538795, + "num_tokens": 8811334.0, + "step": 702 + }, + { + "entropy": 0.9581945165991783, + "epoch": 1.1716666666666666, + "grad_norm": 2.1980700492858887, + "learning_rate": 0.00012842105263157895, + "loss": 0.9294, + "mean_token_accuracy": 0.7717323303222656, + "num_tokens": 8824060.0, + "step": 703 + }, + { + "entropy": 1.000770427286625, + "epoch": 1.1733333333333333, + "grad_norm": 0.344192773103714, + "learning_rate": 0.00012830409356725145, + "loss": 1.0056, + "mean_token_accuracy": 0.7614035829901695, + "num_tokens": 8836623.0, + "step": 704 + }, + { + "entropy": 1.109620176255703, + "epoch": 1.175, + "grad_norm": 0.5027482509613037, + "learning_rate": 0.000128187134502924, + "loss": 1.1039, + "mean_token_accuracy": 0.7359057664871216, + "num_tokens": 8848962.0, + "step": 705 + }, + { + "entropy": 1.1144047752022743, + "epoch": 1.1766666666666667, + "grad_norm": 0.41070157289505005, + "learning_rate": 0.0001280701754385965, + "loss": 1.132, + "mean_token_accuracy": 0.73667923361063, + "num_tokens": 8861555.0, + "step": 706 + }, + { + "entropy": 0.9361367151141167, + "epoch": 1.1783333333333332, + "grad_norm": 0.3477269113063812, + "learning_rate": 0.000127953216374269, + "loss": 0.9413, + "mean_token_accuracy": 0.7774360477924347, + "num_tokens": 8874006.0, + "step": 707 + }, + { + "entropy": 1.1030670925974846, + "epoch": 1.18, + "grad_norm": 1.3029204607009888, + "learning_rate": 0.00012783625730994154, + "loss": 1.0991, + "mean_token_accuracy": 0.7363706007599831, + "num_tokens": 8886475.0, + "step": 708 + }, + { + "entropy": 0.9754893407225609, + "epoch": 1.1816666666666666, + "grad_norm": 0.42802709341049194, + "learning_rate": 0.00012771929824561404, + "loss": 0.9424, + "mean_token_accuracy": 0.7736692875623703, + "num_tokens": 8898956.0, + "step": 709 + }, + { + "entropy": 0.9986881986260414, + "epoch": 1.1833333333333333, + "grad_norm": 0.4046630561351776, + "learning_rate": 0.00012760233918128653, + "loss": 1.0008, + "mean_token_accuracy": 0.7546000257134438, + "num_tokens": 8911710.0, + "step": 710 + }, + { + "entropy": 1.1963095217943192, + "epoch": 1.185, + "grad_norm": 0.36986932158470154, + "learning_rate": 0.00012748538011695908, + "loss": 1.1593, + "mean_token_accuracy": 0.721351720392704, + "num_tokens": 8924129.0, + "step": 711 + }, + { + "entropy": 0.9531427696347237, + "epoch": 1.1866666666666668, + "grad_norm": 0.9468092322349548, + "learning_rate": 0.00012736842105263158, + "loss": 0.9382, + "mean_token_accuracy": 0.774338111281395, + "num_tokens": 8936909.0, + "step": 712 + }, + { + "entropy": 0.9403403848409653, + "epoch": 1.1883333333333332, + "grad_norm": 0.35958191752433777, + "learning_rate": 0.0001272514619883041, + "loss": 0.9279, + "mean_token_accuracy": 0.7800295427441597, + "num_tokens": 8949251.0, + "step": 713 + }, + { + "entropy": 1.0490553975105286, + "epoch": 1.19, + "grad_norm": 0.5040456652641296, + "learning_rate": 0.00012713450292397663, + "loss": 1.0677, + "mean_token_accuracy": 0.7471382990479469, + "num_tokens": 8961862.0, + "step": 714 + }, + { + "entropy": 0.9729638993740082, + "epoch": 1.1916666666666667, + "grad_norm": 0.37353086471557617, + "learning_rate": 0.00012701754385964912, + "loss": 0.9867, + "mean_token_accuracy": 0.7674241736531258, + "num_tokens": 8974543.0, + "step": 715 + }, + { + "entropy": 1.1073362156748772, + "epoch": 1.1933333333333334, + "grad_norm": 0.37765026092529297, + "learning_rate": 0.00012690058479532165, + "loss": 1.1119, + "mean_token_accuracy": 0.7373607456684113, + "num_tokens": 8987091.0, + "step": 716 + }, + { + "entropy": 1.0764212757349014, + "epoch": 1.195, + "grad_norm": 0.4003337323665619, + "learning_rate": 0.00012678362573099417, + "loss": 1.0417, + "mean_token_accuracy": 0.7387293726205826, + "num_tokens": 8999414.0, + "step": 717 + }, + { + "entropy": 0.9788024201989174, + "epoch": 1.1966666666666668, + "grad_norm": 0.33461993932724, + "learning_rate": 0.00012666666666666666, + "loss": 0.9539, + "mean_token_accuracy": 0.7684841901063919, + "num_tokens": 9011861.0, + "step": 718 + }, + { + "entropy": 1.1892686560750008, + "epoch": 1.1983333333333333, + "grad_norm": 0.3961053192615509, + "learning_rate": 0.0001265497076023392, + "loss": 1.2005, + "mean_token_accuracy": 0.7188596576452255, + "num_tokens": 9024474.0, + "step": 719 + }, + { + "entropy": 1.0204068645834923, + "epoch": 1.2, + "grad_norm": 0.3742503821849823, + "learning_rate": 0.0001264327485380117, + "loss": 1.0065, + "mean_token_accuracy": 0.7588150277733803, + "num_tokens": 9037064.0, + "step": 720 + }, + { + "epoch": 1.2, + "eval_entropy": 1.1187498391699382, + "eval_loss": 1.1341508626937866, + "eval_mean_token_accuracy": 0.7310460867499504, + "eval_num_tokens": 9037064.0, + "eval_runtime": 2668.7172, + "eval_samples_per_second": 1.874, + "eval_steps_per_second": 0.937, + "step": 720 + }, + { + "entropy": 1.1895209550857544, + "epoch": 1.2016666666666667, + "grad_norm": 0.3838255703449249, + "learning_rate": 0.0001263157894736842, + "loss": 1.2079, + "mean_token_accuracy": 0.7193019464612007, + "num_tokens": 9049604.0, + "step": 721 + }, + { + "entropy": 1.0349683538079262, + "epoch": 1.2033333333333334, + "grad_norm": 0.4433048665523529, + "learning_rate": 0.00012619883040935673, + "loss": 1.0267, + "mean_token_accuracy": 0.7509428858757019, + "num_tokens": 9062116.0, + "step": 722 + }, + { + "entropy": 1.030206061899662, + "epoch": 1.205, + "grad_norm": 0.34358423948287964, + "learning_rate": 0.00012608187134502925, + "loss": 1.0245, + "mean_token_accuracy": 0.7634887248277664, + "num_tokens": 9074529.0, + "step": 723 + }, + { + "entropy": 1.141561210155487, + "epoch": 1.2066666666666666, + "grad_norm": 0.3249812424182892, + "learning_rate": 0.00012596491228070175, + "loss": 1.1205, + "mean_token_accuracy": 0.7343723922967911, + "num_tokens": 9087182.0, + "step": 724 + }, + { + "entropy": 1.0616471990942955, + "epoch": 1.2083333333333333, + "grad_norm": 0.4340597093105316, + "learning_rate": 0.00012584795321637427, + "loss": 1.0581, + "mean_token_accuracy": 0.7507943511009216, + "num_tokens": 9099877.0, + "step": 725 + }, + { + "entropy": 0.9706320464611053, + "epoch": 1.21, + "grad_norm": 0.3568764328956604, + "learning_rate": 0.0001257309941520468, + "loss": 0.9761, + "mean_token_accuracy": 0.7712465897202492, + "num_tokens": 9112601.0, + "step": 726 + }, + { + "entropy": 0.91465774923563, + "epoch": 1.2116666666666667, + "grad_norm": 0.3260438144207001, + "learning_rate": 0.0001256140350877193, + "loss": 0.9254, + "mean_token_accuracy": 0.7811517640948296, + "num_tokens": 9125033.0, + "step": 727 + }, + { + "entropy": 0.9401550590991974, + "epoch": 1.2133333333333334, + "grad_norm": 0.3260049819946289, + "learning_rate": 0.00012549707602339181, + "loss": 0.9633, + "mean_token_accuracy": 0.7732428461313248, + "num_tokens": 9137669.0, + "step": 728 + }, + { + "entropy": 0.9846240356564522, + "epoch": 1.215, + "grad_norm": 0.5016320943832397, + "learning_rate": 0.00012538011695906434, + "loss": 1.0008, + "mean_token_accuracy": 0.7595839649438858, + "num_tokens": 9150317.0, + "step": 729 + }, + { + "entropy": 0.9101727679371834, + "epoch": 1.2166666666666668, + "grad_norm": 0.3477831184864044, + "learning_rate": 0.00012526315789473683, + "loss": 0.9248, + "mean_token_accuracy": 0.7770381346344948, + "num_tokens": 9162864.0, + "step": 730 + }, + { + "entropy": 1.055095262825489, + "epoch": 1.2183333333333333, + "grad_norm": 0.3225356936454773, + "learning_rate": 0.00012514619883040936, + "loss": 1.0594, + "mean_token_accuracy": 0.7478219419717789, + "num_tokens": 9175636.0, + "step": 731 + }, + { + "entropy": 0.9906914457678795, + "epoch": 1.22, + "grad_norm": 0.36654067039489746, + "learning_rate": 0.00012502923976608188, + "loss": 0.9822, + "mean_token_accuracy": 0.7651053443551064, + "num_tokens": 9188056.0, + "step": 732 + }, + { + "entropy": 1.0227198526263237, + "epoch": 1.2216666666666667, + "grad_norm": 0.42129281163215637, + "learning_rate": 0.0001249122807017544, + "loss": 1.0036, + "mean_token_accuracy": 0.7546610608696938, + "num_tokens": 9200761.0, + "step": 733 + }, + { + "entropy": 1.051247701048851, + "epoch": 1.2233333333333334, + "grad_norm": 0.39238080382347107, + "learning_rate": 0.0001247953216374269, + "loss": 1.0324, + "mean_token_accuracy": 0.7527084350585938, + "num_tokens": 9213241.0, + "step": 734 + }, + { + "entropy": 1.1549249365925789, + "epoch": 1.225, + "grad_norm": 0.3630923926830292, + "learning_rate": 0.00012467836257309942, + "loss": 1.1407, + "mean_token_accuracy": 0.7309977412223816, + "num_tokens": 9225826.0, + "step": 735 + }, + { + "entropy": 0.9571069180965424, + "epoch": 1.2266666666666666, + "grad_norm": 0.4624479115009308, + "learning_rate": 0.00012456140350877194, + "loss": 0.9133, + "mean_token_accuracy": 0.7807331830263138, + "num_tokens": 9238128.0, + "step": 736 + }, + { + "entropy": 1.1259456798434258, + "epoch": 1.2283333333333333, + "grad_norm": 0.41041669249534607, + "learning_rate": 0.00012444444444444444, + "loss": 1.1123, + "mean_token_accuracy": 0.7379928156733513, + "num_tokens": 9250736.0, + "step": 737 + }, + { + "entropy": 1.0745483115315437, + "epoch": 1.23, + "grad_norm": 0.3722630739212036, + "learning_rate": 0.00012432748538011696, + "loss": 1.0743, + "mean_token_accuracy": 0.7459522411227226, + "num_tokens": 9263669.0, + "step": 738 + }, + { + "entropy": 1.017277792096138, + "epoch": 1.2316666666666667, + "grad_norm": 0.5359635949134827, + "learning_rate": 0.00012421052631578949, + "loss": 0.9979, + "mean_token_accuracy": 0.756466805934906, + "num_tokens": 9276278.0, + "step": 739 + }, + { + "entropy": 1.0748402923345566, + "epoch": 1.2333333333333334, + "grad_norm": 0.35720473527908325, + "learning_rate": 0.000124093567251462, + "loss": 1.1062, + "mean_token_accuracy": 0.7410350739955902, + "num_tokens": 9288698.0, + "step": 740 + }, + { + "entropy": 1.130936212837696, + "epoch": 1.2349999999999999, + "grad_norm": 0.37070757150650024, + "learning_rate": 0.0001239766081871345, + "loss": 1.1396, + "mean_token_accuracy": 0.7324651181697845, + "num_tokens": 9301584.0, + "step": 741 + }, + { + "entropy": 1.0489218086004257, + "epoch": 1.2366666666666666, + "grad_norm": 0.3306441605091095, + "learning_rate": 0.00012385964912280703, + "loss": 1.0664, + "mean_token_accuracy": 0.7477298304438591, + "num_tokens": 9314084.0, + "step": 742 + }, + { + "entropy": 0.9938762001693249, + "epoch": 1.2383333333333333, + "grad_norm": 0.31619134545326233, + "learning_rate": 0.00012374269005847955, + "loss": 0.9852, + "mean_token_accuracy": 0.7605732753872871, + "num_tokens": 9326397.0, + "step": 743 + }, + { + "entropy": 1.0824184268712997, + "epoch": 1.24, + "grad_norm": 0.3798287510871887, + "learning_rate": 0.00012362573099415205, + "loss": 1.0686, + "mean_token_accuracy": 0.7485281676054001, + "num_tokens": 9339046.0, + "step": 744 + }, + { + "entropy": 1.069664090871811, + "epoch": 1.2416666666666667, + "grad_norm": 0.306922972202301, + "learning_rate": 0.00012350877192982457, + "loss": 1.0744, + "mean_token_accuracy": 0.7507193833589554, + "num_tokens": 9351671.0, + "step": 745 + }, + { + "entropy": 1.0598416477441788, + "epoch": 1.2433333333333334, + "grad_norm": 0.33934125304222107, + "learning_rate": 0.0001233918128654971, + "loss": 1.0267, + "mean_token_accuracy": 0.7564558759331703, + "num_tokens": 9364530.0, + "step": 746 + }, + { + "entropy": 0.9628029838204384, + "epoch": 1.245, + "grad_norm": 0.41921910643577576, + "learning_rate": 0.0001232748538011696, + "loss": 0.9517, + "mean_token_accuracy": 0.7717362120747566, + "num_tokens": 9376918.0, + "step": 747 + }, + { + "entropy": 1.0656858682632446, + "epoch": 1.2466666666666666, + "grad_norm": 0.38583341240882874, + "learning_rate": 0.0001231578947368421, + "loss": 1.0376, + "mean_token_accuracy": 0.7540598586201668, + "num_tokens": 9389510.0, + "step": 748 + }, + { + "entropy": 1.0888371095061302, + "epoch": 1.2483333333333333, + "grad_norm": 0.34380537271499634, + "learning_rate": 0.00012304093567251463, + "loss": 1.0715, + "mean_token_accuracy": 0.7473909482359886, + "num_tokens": 9402055.0, + "step": 749 + }, + { + "entropy": 1.0373852625489235, + "epoch": 1.25, + "grad_norm": 0.3231388330459595, + "learning_rate": 0.00012292397660818713, + "loss": 1.0305, + "mean_token_accuracy": 0.7523825243115425, + "num_tokens": 9414512.0, + "step": 750 + }, + { + "entropy": 1.106944017112255, + "epoch": 1.2516666666666667, + "grad_norm": 0.36373844742774963, + "learning_rate": 0.00012280701754385965, + "loss": 1.0971, + "mean_token_accuracy": 0.7417397871613503, + "num_tokens": 9426843.0, + "step": 751 + }, + { + "entropy": 0.9788102731108665, + "epoch": 1.2533333333333334, + "grad_norm": 0.31296437978744507, + "learning_rate": 0.00012269005847953218, + "loss": 0.9652, + "mean_token_accuracy": 0.7717032134532928, + "num_tokens": 9439540.0, + "step": 752 + }, + { + "entropy": 0.9851587638258934, + "epoch": 1.255, + "grad_norm": 0.39397764205932617, + "learning_rate": 0.0001225730994152047, + "loss": 0.9935, + "mean_token_accuracy": 0.7582743614912033, + "num_tokens": 9451949.0, + "step": 753 + }, + { + "entropy": 1.1130240112543106, + "epoch": 1.2566666666666666, + "grad_norm": 1.2226158380508423, + "learning_rate": 0.0001224561403508772, + "loss": 1.1278, + "mean_token_accuracy": 0.7374890893697739, + "num_tokens": 9464393.0, + "step": 754 + }, + { + "entropy": 0.9969649165868759, + "epoch": 1.2583333333333333, + "grad_norm": 0.5840109586715698, + "learning_rate": 0.00012233918128654972, + "loss": 1.0028, + "mean_token_accuracy": 0.7622894421219826, + "num_tokens": 9477027.0, + "step": 755 + }, + { + "entropy": 1.0583245903253555, + "epoch": 1.26, + "grad_norm": 0.49765530228614807, + "learning_rate": 0.00012222222222222224, + "loss": 1.056, + "mean_token_accuracy": 0.7438866198062897, + "num_tokens": 9489474.0, + "step": 756 + }, + { + "entropy": 0.9789082854986191, + "epoch": 1.2616666666666667, + "grad_norm": 0.394092321395874, + "learning_rate": 0.00012210526315789474, + "loss": 0.9702, + "mean_token_accuracy": 0.7751469835639, + "num_tokens": 9502055.0, + "step": 757 + }, + { + "entropy": 0.9848510399460793, + "epoch": 1.2633333333333332, + "grad_norm": 0.34688401222229004, + "learning_rate": 0.00012198830409356725, + "loss": 0.9855, + "mean_token_accuracy": 0.7628518790006638, + "num_tokens": 9514580.0, + "step": 758 + }, + { + "entropy": 1.0338635221123695, + "epoch": 1.2650000000000001, + "grad_norm": 0.33692267537117004, + "learning_rate": 0.00012187134502923978, + "loss": 1.0188, + "mean_token_accuracy": 0.7581898495554924, + "num_tokens": 9527194.0, + "step": 759 + }, + { + "entropy": 0.9540547728538513, + "epoch": 1.2666666666666666, + "grad_norm": 0.38330939412117004, + "learning_rate": 0.00012175438596491229, + "loss": 0.9213, + "mean_token_accuracy": 0.7747813165187836, + "num_tokens": 9539878.0, + "step": 760 + }, + { + "entropy": 0.9945897087454796, + "epoch": 1.2683333333333333, + "grad_norm": 0.4628215730190277, + "learning_rate": 0.00012163742690058479, + "loss": 0.9471, + "mean_token_accuracy": 0.7692919299006462, + "num_tokens": 9552618.0, + "step": 761 + }, + { + "entropy": 0.8448145538568497, + "epoch": 1.27, + "grad_norm": 0.545650064945221, + "learning_rate": 0.00012152046783625733, + "loss": 0.8138, + "mean_token_accuracy": 0.8002117648720741, + "num_tokens": 9565245.0, + "step": 762 + }, + { + "entropy": 1.096286728978157, + "epoch": 1.2716666666666667, + "grad_norm": 0.3716736137866974, + "learning_rate": 0.00012140350877192984, + "loss": 1.104, + "mean_token_accuracy": 0.7303317859768867, + "num_tokens": 9577885.0, + "step": 763 + }, + { + "entropy": 1.1562151238322258, + "epoch": 1.2733333333333334, + "grad_norm": 0.5344386100769043, + "learning_rate": 0.00012128654970760233, + "loss": 1.1596, + "mean_token_accuracy": 0.7277534902095795, + "num_tokens": 9590261.0, + "step": 764 + }, + { + "entropy": 0.9411380141973495, + "epoch": 1.275, + "grad_norm": 0.5676725506782532, + "learning_rate": 0.00012116959064327487, + "loss": 0.959, + "mean_token_accuracy": 0.7726512774825096, + "num_tokens": 9602848.0, + "step": 765 + }, + { + "entropy": 0.9246381893754005, + "epoch": 1.2766666666666666, + "grad_norm": 0.6678740382194519, + "learning_rate": 0.00012105263157894738, + "loss": 0.9213, + "mean_token_accuracy": 0.7782176956534386, + "num_tokens": 9615488.0, + "step": 766 + }, + { + "entropy": 0.9777566716074944, + "epoch": 1.2783333333333333, + "grad_norm": 0.46021127700805664, + "learning_rate": 0.00012093567251461989, + "loss": 0.9976, + "mean_token_accuracy": 0.7602128386497498, + "num_tokens": 9628260.0, + "step": 767 + }, + { + "entropy": 1.0313529521226883, + "epoch": 1.28, + "grad_norm": 0.37414881587028503, + "learning_rate": 0.00012081871345029241, + "loss": 1.0473, + "mean_token_accuracy": 0.7518272027373314, + "num_tokens": 9640722.0, + "step": 768 + }, + { + "entropy": 1.0204561650753021, + "epoch": 1.2816666666666667, + "grad_norm": 0.5116375088691711, + "learning_rate": 0.00012070175438596492, + "loss": 1.0463, + "mean_token_accuracy": 0.7471587732434273, + "num_tokens": 9653280.0, + "step": 769 + }, + { + "entropy": 0.9984316229820251, + "epoch": 1.2833333333333332, + "grad_norm": 0.48678651452064514, + "learning_rate": 0.00012058479532163743, + "loss": 1.0045, + "mean_token_accuracy": 0.7568108588457108, + "num_tokens": 9665894.0, + "step": 770 + }, + { + "entropy": 0.8993102163076401, + "epoch": 1.285, + "grad_norm": 0.33669960498809814, + "learning_rate": 0.00012046783625730995, + "loss": 0.8766, + "mean_token_accuracy": 0.7823315188288689, + "num_tokens": 9678454.0, + "step": 771 + }, + { + "entropy": 1.0980085879564285, + "epoch": 1.2866666666666666, + "grad_norm": 0.3783906102180481, + "learning_rate": 0.00012035087719298246, + "loss": 1.0533, + "mean_token_accuracy": 0.7443738207221031, + "num_tokens": 9691210.0, + "step": 772 + }, + { + "entropy": 0.9627801030874252, + "epoch": 1.2883333333333333, + "grad_norm": 0.3682050108909607, + "learning_rate": 0.00012023391812865498, + "loss": 0.9516, + "mean_token_accuracy": 0.7748104557394981, + "num_tokens": 9703463.0, + "step": 773 + }, + { + "entropy": 1.0400335937738419, + "epoch": 1.29, + "grad_norm": 0.3542267680168152, + "learning_rate": 0.0001201169590643275, + "loss": 0.9967, + "mean_token_accuracy": 0.7683819159865379, + "num_tokens": 9716027.0, + "step": 774 + }, + { + "entropy": 0.9925283342599869, + "epoch": 1.2916666666666667, + "grad_norm": 0.3950151801109314, + "learning_rate": 0.00012, + "loss": 0.9498, + "mean_token_accuracy": 0.7683768942952156, + "num_tokens": 9728291.0, + "step": 775 + }, + { + "entropy": 1.1487708538770676, + "epoch": 1.2933333333333334, + "grad_norm": 0.3613208532333374, + "learning_rate": 0.00011988304093567253, + "loss": 1.1255, + "mean_token_accuracy": 0.7375266328454018, + "num_tokens": 9740911.0, + "step": 776 + }, + { + "entropy": 0.9904927983880043, + "epoch": 1.295, + "grad_norm": 0.3638273775577545, + "learning_rate": 0.00011976608187134504, + "loss": 0.9816, + "mean_token_accuracy": 0.7648102417588234, + "num_tokens": 9753671.0, + "step": 777 + }, + { + "entropy": 0.9700521230697632, + "epoch": 1.2966666666666666, + "grad_norm": 0.44246748089790344, + "learning_rate": 0.00011964912280701755, + "loss": 0.9678, + "mean_token_accuracy": 0.7655287235975266, + "num_tokens": 9766264.0, + "step": 778 + }, + { + "entropy": 1.1071206703782082, + "epoch": 1.2983333333333333, + "grad_norm": 0.41459017992019653, + "learning_rate": 0.00011953216374269007, + "loss": 1.1313, + "mean_token_accuracy": 0.7323150411248207, + "num_tokens": 9778506.0, + "step": 779 + }, + { + "entropy": 0.9841584786772728, + "epoch": 1.3, + "grad_norm": 0.4696158468723297, + "learning_rate": 0.00011941520467836258, + "loss": 0.9963, + "mean_token_accuracy": 0.7636822164058685, + "num_tokens": 9791182.0, + "step": 780 + }, + { + "entropy": 0.9327910616993904, + "epoch": 1.3016666666666667, + "grad_norm": 0.7686042785644531, + "learning_rate": 0.00011929824561403509, + "loss": 0.9611, + "mean_token_accuracy": 0.7661004289984703, + "num_tokens": 9803792.0, + "step": 781 + }, + { + "entropy": 1.0880418792366982, + "epoch": 1.3033333333333332, + "grad_norm": 0.36805927753448486, + "learning_rate": 0.00011918128654970761, + "loss": 1.1148, + "mean_token_accuracy": 0.734060674905777, + "num_tokens": 9816277.0, + "step": 782 + }, + { + "entropy": 0.9658422768115997, + "epoch": 1.305, + "grad_norm": 0.6255410313606262, + "learning_rate": 0.00011906432748538012, + "loss": 0.9658, + "mean_token_accuracy": 0.7605624049901962, + "num_tokens": 9828950.0, + "step": 783 + }, + { + "entropy": 0.9103493466973305, + "epoch": 1.3066666666666666, + "grad_norm": 0.5097252130508423, + "learning_rate": 0.00011894736842105263, + "loss": 0.8901, + "mean_token_accuracy": 0.7865965068340302, + "num_tokens": 9841394.0, + "step": 784 + }, + { + "entropy": 0.9112555459141731, + "epoch": 1.3083333333333333, + "grad_norm": 0.4171690046787262, + "learning_rate": 0.00011883040935672515, + "loss": 0.9152, + "mean_token_accuracy": 0.7758986055850983, + "num_tokens": 9854242.0, + "step": 785 + }, + { + "entropy": 0.9862824454903603, + "epoch": 1.31, + "grad_norm": 0.7817356586456299, + "learning_rate": 0.00011871345029239766, + "loss": 0.9753, + "mean_token_accuracy": 0.7611427754163742, + "num_tokens": 9866662.0, + "step": 786 + }, + { + "entropy": 1.1108715310692787, + "epoch": 1.3116666666666665, + "grad_norm": 0.3802628517150879, + "learning_rate": 0.00011859649122807017, + "loss": 1.1302, + "mean_token_accuracy": 0.7318329811096191, + "num_tokens": 9879163.0, + "step": 787 + }, + { + "entropy": 1.0539921298623085, + "epoch": 1.3133333333333335, + "grad_norm": 0.37499651312828064, + "learning_rate": 0.00011847953216374271, + "loss": 1.0575, + "mean_token_accuracy": 0.7487984895706177, + "num_tokens": 9891670.0, + "step": 788 + }, + { + "entropy": 1.0349926948547363, + "epoch": 1.315, + "grad_norm": 0.571121871471405, + "learning_rate": 0.0001183625730994152, + "loss": 1.0738, + "mean_token_accuracy": 0.7484195157885551, + "num_tokens": 9903992.0, + "step": 789 + }, + { + "entropy": 0.9439081102609634, + "epoch": 1.3166666666666667, + "grad_norm": 0.4074806272983551, + "learning_rate": 0.00011824561403508771, + "loss": 0.92, + "mean_token_accuracy": 0.7820922136306763, + "num_tokens": 9916572.0, + "step": 790 + }, + { + "entropy": 0.9308940395712852, + "epoch": 1.3183333333333334, + "grad_norm": 0.5895468592643738, + "learning_rate": 0.00011812865497076025, + "loss": 0.9146, + "mean_token_accuracy": 0.78362637758255, + "num_tokens": 9928971.0, + "step": 791 + }, + { + "entropy": 1.1400123611092567, + "epoch": 1.32, + "grad_norm": 0.4412790536880493, + "learning_rate": 0.00011801169590643275, + "loss": 1.1464, + "mean_token_accuracy": 0.7309730723500252, + "num_tokens": 9941405.0, + "step": 792 + }, + { + "entropy": 0.9348255023360252, + "epoch": 1.3216666666666668, + "grad_norm": 0.3576184809207916, + "learning_rate": 0.00011789473684210525, + "loss": 0.9154, + "mean_token_accuracy": 0.7779194116592407, + "num_tokens": 9953803.0, + "step": 793 + }, + { + "entropy": 0.8785227425396442, + "epoch": 1.3233333333333333, + "grad_norm": 0.42946749925613403, + "learning_rate": 0.00011777777777777779, + "loss": 0.8667, + "mean_token_accuracy": 0.7964329123497009, + "num_tokens": 9966445.0, + "step": 794 + }, + { + "entropy": 1.033266007900238, + "epoch": 1.325, + "grad_norm": 0.758540153503418, + "learning_rate": 0.00011766081871345029, + "loss": 1.0317, + "mean_token_accuracy": 0.7529370561242104, + "num_tokens": 9978921.0, + "step": 795 + }, + { + "entropy": 1.023316115140915, + "epoch": 1.3266666666666667, + "grad_norm": 0.32743772864341736, + "learning_rate": 0.00011754385964912282, + "loss": 1.0038, + "mean_token_accuracy": 0.7565391361713409, + "num_tokens": 9991602.0, + "step": 796 + }, + { + "entropy": 1.0393253713846207, + "epoch": 1.3283333333333334, + "grad_norm": 0.769688606262207, + "learning_rate": 0.00011742690058479533, + "loss": 0.9982, + "mean_token_accuracy": 0.7604466900229454, + "num_tokens": 10004156.0, + "step": 797 + }, + { + "entropy": 1.0976822525262833, + "epoch": 1.33, + "grad_norm": 0.4449421763420105, + "learning_rate": 0.00011730994152046784, + "loss": 1.0685, + "mean_token_accuracy": 0.7425640299916267, + "num_tokens": 10016749.0, + "step": 798 + }, + { + "entropy": 1.133506491780281, + "epoch": 1.3316666666666666, + "grad_norm": 0.33224308490753174, + "learning_rate": 0.00011719298245614037, + "loss": 1.1163, + "mean_token_accuracy": 0.7311032935976982, + "num_tokens": 10029307.0, + "step": 799 + }, + { + "entropy": 0.9819313511252403, + "epoch": 1.3333333333333333, + "grad_norm": 0.4555339217185974, + "learning_rate": 0.00011707602339181288, + "loss": 0.9963, + "mean_token_accuracy": 0.7624331265687943, + "num_tokens": 10041671.0, + "step": 800 + }, + { + "entropy": 0.8490017838776112, + "epoch": 1.335, + "grad_norm": 0.46284276247024536, + "learning_rate": 0.00011695906432748539, + "loss": 0.8662, + "mean_token_accuracy": 0.7898927256464958, + "num_tokens": 10054407.0, + "step": 801 + }, + { + "entropy": 0.9487814530730247, + "epoch": 1.3366666666666667, + "grad_norm": 0.36387118697166443, + "learning_rate": 0.00011684210526315791, + "loss": 0.939, + "mean_token_accuracy": 0.7769873812794685, + "num_tokens": 10067181.0, + "step": 802 + }, + { + "entropy": 1.049319937825203, + "epoch": 1.3383333333333334, + "grad_norm": 0.3920729458332062, + "learning_rate": 0.00011672514619883042, + "loss": 1.0566, + "mean_token_accuracy": 0.7499385103583336, + "num_tokens": 10079648.0, + "step": 803 + }, + { + "entropy": 0.9268646985292435, + "epoch": 1.34, + "grad_norm": 0.37466734647750854, + "learning_rate": 0.00011660818713450293, + "loss": 0.8918, + "mean_token_accuracy": 0.7850276306271553, + "num_tokens": 10092233.0, + "step": 804 + }, + { + "entropy": 1.1615970730781555, + "epoch": 1.3416666666666668, + "grad_norm": 0.3827607333660126, + "learning_rate": 0.00011649122807017545, + "loss": 1.1766, + "mean_token_accuracy": 0.7194992825388908, + "num_tokens": 10105123.0, + "step": 805 + }, + { + "entropy": 1.0954640060663223, + "epoch": 1.3433333333333333, + "grad_norm": 0.3855399787425995, + "learning_rate": 0.00011637426900584796, + "loss": 1.1217, + "mean_token_accuracy": 0.7424816787242889, + "num_tokens": 10117720.0, + "step": 806 + }, + { + "entropy": 1.0013692080974579, + "epoch": 1.345, + "grad_norm": 0.48803019523620605, + "learning_rate": 0.00011625730994152047, + "loss": 1.0108, + "mean_token_accuracy": 0.7596750035881996, + "num_tokens": 10130194.0, + "step": 807 + }, + { + "entropy": 0.9630342796444893, + "epoch": 1.3466666666666667, + "grad_norm": 0.5283374786376953, + "learning_rate": 0.00011614035087719299, + "loss": 0.941, + "mean_token_accuracy": 0.7686321437358856, + "num_tokens": 10142856.0, + "step": 808 + }, + { + "entropy": 1.0880045965313911, + "epoch": 1.3483333333333334, + "grad_norm": 0.49206939339637756, + "learning_rate": 0.0001160233918128655, + "loss": 1.0745, + "mean_token_accuracy": 0.7435120195150375, + "num_tokens": 10155326.0, + "step": 809 + }, + { + "entropy": 1.082480400800705, + "epoch": 1.35, + "grad_norm": 0.41293981671333313, + "learning_rate": 0.00011590643274853801, + "loss": 1.0921, + "mean_token_accuracy": 0.7371596023440361, + "num_tokens": 10167789.0, + "step": 810 + }, + { + "entropy": 1.0294490680098534, + "epoch": 1.3516666666666666, + "grad_norm": 0.4838137626647949, + "learning_rate": 0.00011578947368421053, + "loss": 1.0284, + "mean_token_accuracy": 0.7529405504465103, + "num_tokens": 10180153.0, + "step": 811 + }, + { + "entropy": 1.0858699977397919, + "epoch": 1.3533333333333333, + "grad_norm": 0.43557876348495483, + "learning_rate": 0.00011567251461988304, + "loss": 1.0725, + "mean_token_accuracy": 0.7415995746850967, + "num_tokens": 10192703.0, + "step": 812 + }, + { + "entropy": 0.9729868844151497, + "epoch": 1.355, + "grad_norm": 0.5065525770187378, + "learning_rate": 0.00011555555555555555, + "loss": 0.9715, + "mean_token_accuracy": 0.7719387412071228, + "num_tokens": 10205169.0, + "step": 813 + }, + { + "entropy": 1.004656471312046, + "epoch": 1.3566666666666667, + "grad_norm": 0.4453844726085663, + "learning_rate": 0.00011543859649122808, + "loss": 1.0198, + "mean_token_accuracy": 0.7633600905537605, + "num_tokens": 10217893.0, + "step": 814 + }, + { + "entropy": 1.1781500577926636, + "epoch": 1.3583333333333334, + "grad_norm": 0.4585513770580292, + "learning_rate": 0.00011532163742690059, + "loss": 1.1959, + "mean_token_accuracy": 0.7197646796703339, + "num_tokens": 10230751.0, + "step": 815 + }, + { + "entropy": 1.1357240229845047, + "epoch": 1.3599999999999999, + "grad_norm": 0.4589082598686218, + "learning_rate": 0.00011520467836257311, + "loss": 1.1389, + "mean_token_accuracy": 0.7281661555171013, + "num_tokens": 10242960.0, + "step": 816 + }, + { + "entropy": 0.977837011218071, + "epoch": 1.3616666666666668, + "grad_norm": 0.35289326310157776, + "learning_rate": 0.00011508771929824562, + "loss": 0.993, + "mean_token_accuracy": 0.763344369828701, + "num_tokens": 10255565.0, + "step": 817 + }, + { + "entropy": 0.9367702156305313, + "epoch": 1.3633333333333333, + "grad_norm": 0.3776535093784332, + "learning_rate": 0.00011497076023391813, + "loss": 0.9283, + "mean_token_accuracy": 0.7799378857016563, + "num_tokens": 10267996.0, + "step": 818 + }, + { + "entropy": 1.1042609736323357, + "epoch": 1.365, + "grad_norm": 0.35418426990509033, + "learning_rate": 0.00011485380116959066, + "loss": 1.0843, + "mean_token_accuracy": 0.7454045936465263, + "num_tokens": 10280299.0, + "step": 819 + }, + { + "entropy": 1.0538883432745934, + "epoch": 1.3666666666666667, + "grad_norm": 0.48952603340148926, + "learning_rate": 0.00011473684210526316, + "loss": 1.0108, + "mean_token_accuracy": 0.7581906095147133, + "num_tokens": 10292891.0, + "step": 820 + }, + { + "entropy": 1.0277271196246147, + "epoch": 1.3683333333333334, + "grad_norm": 0.36572200059890747, + "learning_rate": 0.00011461988304093567, + "loss": 0.9814, + "mean_token_accuracy": 0.7698658257722855, + "num_tokens": 10305316.0, + "step": 821 + }, + { + "entropy": 1.2049788609147072, + "epoch": 1.37, + "grad_norm": 0.32481735944747925, + "learning_rate": 0.0001145029239766082, + "loss": 1.1662, + "mean_token_accuracy": 0.7243218049407005, + "num_tokens": 10317552.0, + "step": 822 + }, + { + "entropy": 1.0278822854161263, + "epoch": 1.3716666666666666, + "grad_norm": 0.5740171074867249, + "learning_rate": 0.0001143859649122807, + "loss": 0.9984, + "mean_token_accuracy": 0.7567669078707695, + "num_tokens": 10330519.0, + "step": 823 + }, + { + "entropy": 1.1082218512892723, + "epoch": 1.3733333333333333, + "grad_norm": 0.4737369418144226, + "learning_rate": 0.00011426900584795321, + "loss": 1.136, + "mean_token_accuracy": 0.7322841584682465, + "num_tokens": 10343070.0, + "step": 824 + }, + { + "entropy": 0.9878677502274513, + "epoch": 1.375, + "grad_norm": 0.338487446308136, + "learning_rate": 0.00011415204678362575, + "loss": 1.0096, + "mean_token_accuracy": 0.7636533156037331, + "num_tokens": 10355763.0, + "step": 825 + }, + { + "entropy": 1.0612533316016197, + "epoch": 1.3766666666666667, + "grad_norm": 0.9741193056106567, + "learning_rate": 0.00011403508771929824, + "loss": 1.0522, + "mean_token_accuracy": 0.7476543188095093, + "num_tokens": 10368182.0, + "step": 826 + }, + { + "entropy": 0.9444241896271706, + "epoch": 1.3783333333333334, + "grad_norm": 0.395882248878479, + "learning_rate": 0.00011391812865497075, + "loss": 0.9429, + "mean_token_accuracy": 0.7722252234816551, + "num_tokens": 10380635.0, + "step": 827 + }, + { + "entropy": 0.9475426152348518, + "epoch": 1.38, + "grad_norm": 0.5885340571403503, + "learning_rate": 0.00011380116959064329, + "loss": 0.9643, + "mean_token_accuracy": 0.7732816264033318, + "num_tokens": 10393166.0, + "step": 828 + }, + { + "entropy": 1.027025744318962, + "epoch": 1.3816666666666666, + "grad_norm": 0.4447604715824127, + "learning_rate": 0.0001136842105263158, + "loss": 1.0358, + "mean_token_accuracy": 0.7527819871902466, + "num_tokens": 10405898.0, + "step": 829 + }, + { + "entropy": 1.028792716562748, + "epoch": 1.3833333333333333, + "grad_norm": 4.340502738952637, + "learning_rate": 0.0001135672514619883, + "loss": 1.0166, + "mean_token_accuracy": 0.7590840607881546, + "num_tokens": 10418260.0, + "step": 830 + }, + { + "entropy": 0.9294743090867996, + "epoch": 1.385, + "grad_norm": 0.3686159551143646, + "learning_rate": 0.00011345029239766083, + "loss": 0.9325, + "mean_token_accuracy": 0.7747152373194695, + "num_tokens": 10430739.0, + "step": 831 + }, + { + "entropy": 1.0500045493245125, + "epoch": 1.3866666666666667, + "grad_norm": 0.6629594564437866, + "learning_rate": 0.00011333333333333334, + "loss": 1.054, + "mean_token_accuracy": 0.7504790723323822, + "num_tokens": 10443176.0, + "step": 832 + }, + { + "entropy": 0.943693071603775, + "epoch": 1.3883333333333332, + "grad_norm": 0.6458562016487122, + "learning_rate": 0.00011321637426900584, + "loss": 0.9239, + "mean_token_accuracy": 0.7761659324169159, + "num_tokens": 10456060.0, + "step": 833 + }, + { + "entropy": 0.9273689910769463, + "epoch": 1.3900000000000001, + "grad_norm": 0.5584622621536255, + "learning_rate": 0.00011309941520467837, + "loss": 0.8941, + "mean_token_accuracy": 0.7786312326788902, + "num_tokens": 10468776.0, + "step": 834 + }, + { + "entropy": 1.0498060882091522, + "epoch": 1.3916666666666666, + "grad_norm": 0.8290840983390808, + "learning_rate": 0.00011298245614035088, + "loss": 1.0591, + "mean_token_accuracy": 0.7479442059993744, + "num_tokens": 10481203.0, + "step": 835 + }, + { + "entropy": 0.9626110792160034, + "epoch": 1.3933333333333333, + "grad_norm": 0.47482046484947205, + "learning_rate": 0.0001128654970760234, + "loss": 0.945, + "mean_token_accuracy": 0.7699298560619354, + "num_tokens": 10493832.0, + "step": 836 + }, + { + "entropy": 1.0231076627969742, + "epoch": 1.395, + "grad_norm": 0.3916915953159332, + "learning_rate": 0.00011274853801169592, + "loss": 1.0063, + "mean_token_accuracy": 0.7689137682318687, + "num_tokens": 10506284.0, + "step": 837 + }, + { + "entropy": 1.1151341199874878, + "epoch": 1.3966666666666667, + "grad_norm": 0.5548978447914124, + "learning_rate": 0.00011263157894736843, + "loss": 1.109, + "mean_token_accuracy": 0.7429637610912323, + "num_tokens": 10518567.0, + "step": 838 + }, + { + "entropy": 1.066704586148262, + "epoch": 1.3983333333333334, + "grad_norm": 0.5807605981826782, + "learning_rate": 0.00011251461988304095, + "loss": 1.0531, + "mean_token_accuracy": 0.7495111003518105, + "num_tokens": 10531465.0, + "step": 839 + }, + { + "entropy": 0.9433969557285309, + "epoch": 1.4, + "grad_norm": 0.35977932810783386, + "learning_rate": 0.00011239766081871346, + "loss": 0.9214, + "mean_token_accuracy": 0.7798573076725006, + "num_tokens": 10543955.0, + "step": 840 + }, + { + "entropy": 1.0538764372467995, + "epoch": 1.4016666666666666, + "grad_norm": 0.5120930671691895, + "learning_rate": 0.00011228070175438597, + "loss": 1.0399, + "mean_token_accuracy": 0.7506494671106339, + "num_tokens": 10556527.0, + "step": 841 + }, + { + "entropy": 1.0073631629347801, + "epoch": 1.4033333333333333, + "grad_norm": 0.5792893767356873, + "learning_rate": 0.00011216374269005849, + "loss": 1.0071, + "mean_token_accuracy": 0.7614312022924423, + "num_tokens": 10569305.0, + "step": 842 + }, + { + "entropy": 1.1166088730096817, + "epoch": 1.405, + "grad_norm": 0.3618125319480896, + "learning_rate": 0.000112046783625731, + "loss": 1.0961, + "mean_token_accuracy": 0.7349946275353432, + "num_tokens": 10581842.0, + "step": 843 + }, + { + "entropy": 0.9652634114027023, + "epoch": 1.4066666666666667, + "grad_norm": 0.413026362657547, + "learning_rate": 0.00011192982456140351, + "loss": 0.9539, + "mean_token_accuracy": 0.7748262882232666, + "num_tokens": 10594474.0, + "step": 844 + }, + { + "entropy": 1.0343084707856178, + "epoch": 1.4083333333333332, + "grad_norm": 0.44033581018447876, + "learning_rate": 0.00011181286549707603, + "loss": 1.0534, + "mean_token_accuracy": 0.7520105093717575, + "num_tokens": 10607053.0, + "step": 845 + }, + { + "entropy": 0.9925975799560547, + "epoch": 1.41, + "grad_norm": 0.3442208170890808, + "learning_rate": 0.00011169590643274854, + "loss": 0.9824, + "mean_token_accuracy": 0.767107367515564, + "num_tokens": 10619633.0, + "step": 846 + }, + { + "entropy": 0.9882001951336861, + "epoch": 1.4116666666666666, + "grad_norm": 0.372429221868515, + "learning_rate": 0.00011157894736842105, + "loss": 0.997, + "mean_token_accuracy": 0.754954032599926, + "num_tokens": 10631881.0, + "step": 847 + }, + { + "entropy": 1.216900959610939, + "epoch": 1.4133333333333333, + "grad_norm": 0.32626840472221375, + "learning_rate": 0.00011146198830409357, + "loss": 1.1901, + "mean_token_accuracy": 0.7172373160719872, + "num_tokens": 10644620.0, + "step": 848 + }, + { + "entropy": 1.0781096443533897, + "epoch": 1.415, + "grad_norm": 0.48700082302093506, + "learning_rate": 0.00011134502923976608, + "loss": 1.0506, + "mean_token_accuracy": 0.7515213713049889, + "num_tokens": 10657275.0, + "step": 849 + }, + { + "entropy": 1.006898395717144, + "epoch": 1.4166666666666667, + "grad_norm": 0.4172718822956085, + "learning_rate": 0.0001112280701754386, + "loss": 1.0189, + "mean_token_accuracy": 0.7525632977485657, + "num_tokens": 10669659.0, + "step": 850 + }, + { + "entropy": 0.9942312985658646, + "epoch": 1.4183333333333334, + "grad_norm": 0.3973284959793091, + "learning_rate": 0.00011111111111111112, + "loss": 1.0055, + "mean_token_accuracy": 0.7587251886725426, + "num_tokens": 10682439.0, + "step": 851 + }, + { + "entropy": 1.0647492855787277, + "epoch": 1.42, + "grad_norm": 0.36533141136169434, + "learning_rate": 0.00011099415204678363, + "loss": 1.0481, + "mean_token_accuracy": 0.7488600835204124, + "num_tokens": 10694974.0, + "step": 852 + }, + { + "entropy": 0.965335488319397, + "epoch": 1.4216666666666666, + "grad_norm": 0.3551306426525116, + "learning_rate": 0.00011087719298245614, + "loss": 0.9249, + "mean_token_accuracy": 0.7786455601453781, + "num_tokens": 10707370.0, + "step": 853 + }, + { + "entropy": 0.985395722091198, + "epoch": 1.4233333333333333, + "grad_norm": 0.3487420082092285, + "learning_rate": 0.00011076023391812866, + "loss": 0.9543, + "mean_token_accuracy": 0.7682265117764473, + "num_tokens": 10719952.0, + "step": 854 + }, + { + "entropy": 1.171361893415451, + "epoch": 1.425, + "grad_norm": 0.3417138159275055, + "learning_rate": 0.00011064327485380117, + "loss": 1.1721, + "mean_token_accuracy": 0.7295513302087784, + "num_tokens": 10732226.0, + "step": 855 + }, + { + "entropy": 0.8770536556839943, + "epoch": 1.4266666666666667, + "grad_norm": 0.3753270208835602, + "learning_rate": 0.0001105263157894737, + "loss": 0.8653, + "mean_token_accuracy": 0.7902911081910133, + "num_tokens": 10744771.0, + "step": 856 + }, + { + "entropy": 0.993248276412487, + "epoch": 1.4283333333333332, + "grad_norm": 0.3620983958244324, + "learning_rate": 0.0001104093567251462, + "loss": 1.0105, + "mean_token_accuracy": 0.7606182098388672, + "num_tokens": 10757054.0, + "step": 857 + }, + { + "entropy": 1.0585757717490196, + "epoch": 1.43, + "grad_norm": 0.6057856678962708, + "learning_rate": 0.00011029239766081871, + "loss": 1.0547, + "mean_token_accuracy": 0.748723529279232, + "num_tokens": 10769655.0, + "step": 858 + }, + { + "entropy": 1.044035878032446, + "epoch": 1.4316666666666666, + "grad_norm": 0.48957177996635437, + "learning_rate": 0.00011017543859649125, + "loss": 1.0722, + "mean_token_accuracy": 0.7449115514755249, + "num_tokens": 10782498.0, + "step": 859 + }, + { + "entropy": 1.0420118942856789, + "epoch": 1.4333333333333333, + "grad_norm": 1.1686972379684448, + "learning_rate": 0.00011005847953216376, + "loss": 1.0403, + "mean_token_accuracy": 0.7532968968153, + "num_tokens": 10794988.0, + "step": 860 + }, + { + "entropy": 0.9224732890725136, + "epoch": 1.435, + "grad_norm": 0.554107129573822, + "learning_rate": 0.00010994152046783625, + "loss": 0.8907, + "mean_token_accuracy": 0.7824530005455017, + "num_tokens": 10807684.0, + "step": 861 + }, + { + "entropy": 1.0446735471487045, + "epoch": 1.4366666666666665, + "grad_norm": 0.5582395195960999, + "learning_rate": 0.00010982456140350879, + "loss": 1.0294, + "mean_token_accuracy": 0.7591670379042625, + "num_tokens": 10820456.0, + "step": 862 + }, + { + "entropy": 1.1306973919272423, + "epoch": 1.4383333333333335, + "grad_norm": 0.6018702387809753, + "learning_rate": 0.0001097076023391813, + "loss": 1.0978, + "mean_token_accuracy": 0.7418882921338081, + "num_tokens": 10832758.0, + "step": 863 + }, + { + "entropy": 1.0693649873137474, + "epoch": 1.44, + "grad_norm": 0.5192097425460815, + "learning_rate": 0.0001095906432748538, + "loss": 1.0388, + "mean_token_accuracy": 0.7521316781640053, + "num_tokens": 10845340.0, + "step": 864 + }, + { + "entropy": 1.1522692888975143, + "epoch": 1.4416666666666667, + "grad_norm": 0.6497692465782166, + "learning_rate": 0.00010947368421052633, + "loss": 1.1314, + "mean_token_accuracy": 0.7329999729990959, + "num_tokens": 10857967.0, + "step": 865 + }, + { + "entropy": 1.0177212581038475, + "epoch": 1.4433333333333334, + "grad_norm": 0.5419860482215881, + "learning_rate": 0.00010935672514619884, + "loss": 1.0031, + "mean_token_accuracy": 0.7599886953830719, + "num_tokens": 10870669.0, + "step": 866 + }, + { + "entropy": 1.023183934390545, + "epoch": 1.445, + "grad_norm": 0.5257622003555298, + "learning_rate": 0.00010923976608187134, + "loss": 0.9979, + "mean_token_accuracy": 0.7620133087038994, + "num_tokens": 10883191.0, + "step": 867 + }, + { + "entropy": 1.0132087841629982, + "epoch": 1.4466666666666668, + "grad_norm": 0.5330016016960144, + "learning_rate": 0.00010912280701754387, + "loss": 1.0076, + "mean_token_accuracy": 0.7519190832972527, + "num_tokens": 10895676.0, + "step": 868 + }, + { + "entropy": 0.9974637776613235, + "epoch": 1.4483333333333333, + "grad_norm": 0.36778372526168823, + "learning_rate": 0.00010900584795321638, + "loss": 0.9833, + "mean_token_accuracy": 0.7621881663799286, + "num_tokens": 10908336.0, + "step": 869 + }, + { + "entropy": 0.8968958966434002, + "epoch": 1.45, + "grad_norm": 0.5859230756759644, + "learning_rate": 0.00010888888888888889, + "loss": 0.8937, + "mean_token_accuracy": 0.7845388129353523, + "num_tokens": 10920842.0, + "step": 870 + }, + { + "entropy": 1.1610392034053802, + "epoch": 1.4516666666666667, + "grad_norm": 0.6320472359657288, + "learning_rate": 0.00010877192982456141, + "loss": 1.1792, + "mean_token_accuracy": 0.7180443182587624, + "num_tokens": 10933308.0, + "step": 871 + }, + { + "entropy": 0.9494692981243134, + "epoch": 1.4533333333333334, + "grad_norm": 0.34578004479408264, + "learning_rate": 0.00010865497076023392, + "loss": 0.9557, + "mean_token_accuracy": 0.7738220021128654, + "num_tokens": 10945907.0, + "step": 872 + }, + { + "entropy": 1.1724983602762222, + "epoch": 1.455, + "grad_norm": 0.36840155720710754, + "learning_rate": 0.00010853801169590643, + "loss": 1.2108, + "mean_token_accuracy": 0.7143998891115189, + "num_tokens": 10958532.0, + "step": 873 + }, + { + "entropy": 0.9114683270454407, + "epoch": 1.4566666666666666, + "grad_norm": 0.49961942434310913, + "learning_rate": 0.00010842105263157896, + "loss": 0.9228, + "mean_token_accuracy": 0.7720942944288254, + "num_tokens": 10970796.0, + "step": 874 + }, + { + "entropy": 1.1112416312098503, + "epoch": 1.4583333333333333, + "grad_norm": 0.41927391290664673, + "learning_rate": 0.00010830409356725147, + "loss": 1.1155, + "mean_token_accuracy": 0.7332775890827179, + "num_tokens": 10983480.0, + "step": 875 + }, + { + "entropy": 1.108654335141182, + "epoch": 1.46, + "grad_norm": 0.3839800953865051, + "learning_rate": 0.00010818713450292399, + "loss": 1.1028, + "mean_token_accuracy": 0.744315542280674, + "num_tokens": 10996321.0, + "step": 876 + }, + { + "entropy": 1.0760968998074532, + "epoch": 1.4616666666666667, + "grad_norm": 0.3334137499332428, + "learning_rate": 0.0001080701754385965, + "loss": 1.0539, + "mean_token_accuracy": 0.7508803084492683, + "num_tokens": 11008706.0, + "step": 877 + }, + { + "entropy": 0.9972855970263481, + "epoch": 1.4633333333333334, + "grad_norm": 0.42082127928733826, + "learning_rate": 0.00010795321637426901, + "loss": 0.9675, + "mean_token_accuracy": 0.7756324484944344, + "num_tokens": 11021264.0, + "step": 878 + }, + { + "entropy": 1.0188745334744453, + "epoch": 1.465, + "grad_norm": 0.4535685181617737, + "learning_rate": 0.00010783625730994153, + "loss": 1.0095, + "mean_token_accuracy": 0.761257492005825, + "num_tokens": 11033968.0, + "step": 879 + }, + { + "entropy": 1.1675554513931274, + "epoch": 1.4666666666666668, + "grad_norm": 0.41163545846939087, + "learning_rate": 0.00010771929824561404, + "loss": 1.1454, + "mean_token_accuracy": 0.7286913469433784, + "num_tokens": 11046089.0, + "step": 880 + }, + { + "entropy": 0.9919754564762115, + "epoch": 1.4683333333333333, + "grad_norm": 0.37073564529418945, + "learning_rate": 0.00010760233918128655, + "loss": 0.9886, + "mean_token_accuracy": 0.7659579887986183, + "num_tokens": 11058355.0, + "step": 881 + }, + { + "entropy": 1.1535531505942345, + "epoch": 1.47, + "grad_norm": 0.43245184421539307, + "learning_rate": 0.00010748538011695907, + "loss": 1.1231, + "mean_token_accuracy": 0.7396808043122292, + "num_tokens": 11070883.0, + "step": 882 + }, + { + "entropy": 1.0976762846112251, + "epoch": 1.4716666666666667, + "grad_norm": 0.40902963280677795, + "learning_rate": 0.00010736842105263158, + "loss": 1.0822, + "mean_token_accuracy": 0.7501776218414307, + "num_tokens": 11083431.0, + "step": 883 + }, + { + "entropy": 1.0290912240743637, + "epoch": 1.4733333333333334, + "grad_norm": 0.4858871400356293, + "learning_rate": 0.00010725146198830409, + "loss": 1.0126, + "mean_token_accuracy": 0.7592322379350662, + "num_tokens": 11096095.0, + "step": 884 + }, + { + "entropy": 0.8518183901906013, + "epoch": 1.475, + "grad_norm": 0.3919861614704132, + "learning_rate": 0.00010713450292397661, + "loss": 0.8265, + "mean_token_accuracy": 0.7959297299385071, + "num_tokens": 11108770.0, + "step": 885 + }, + { + "entropy": 1.1525486186146736, + "epoch": 1.4766666666666666, + "grad_norm": 0.4030328691005707, + "learning_rate": 0.00010701754385964912, + "loss": 1.1595, + "mean_token_accuracy": 0.7353719994425774, + "num_tokens": 11121258.0, + "step": 886 + }, + { + "entropy": 1.0715966746211052, + "epoch": 1.4783333333333333, + "grad_norm": 0.41663604974746704, + "learning_rate": 0.00010690058479532163, + "loss": 1.0769, + "mean_token_accuracy": 0.7448949441313744, + "num_tokens": 11133725.0, + "step": 887 + }, + { + "entropy": 0.8812004327774048, + "epoch": 1.48, + "grad_norm": 0.37869131565093994, + "learning_rate": 0.00010678362573099416, + "loss": 0.8715, + "mean_token_accuracy": 0.7830679789185524, + "num_tokens": 11146636.0, + "step": 888 + }, + { + "entropy": 1.017636887729168, + "epoch": 1.4816666666666667, + "grad_norm": 0.36093369126319885, + "learning_rate": 0.00010666666666666667, + "loss": 1.0347, + "mean_token_accuracy": 0.7492915317416191, + "num_tokens": 11159517.0, + "step": 889 + }, + { + "entropy": 1.1328649371862411, + "epoch": 1.4833333333333334, + "grad_norm": 0.3405679762363434, + "learning_rate": 0.00010654970760233918, + "loss": 1.1572, + "mean_token_accuracy": 0.7216387167572975, + "num_tokens": 11172100.0, + "step": 890 + }, + { + "entropy": 1.0226778164505959, + "epoch": 1.4849999999999999, + "grad_norm": 0.37923672795295715, + "learning_rate": 0.00010643274853801171, + "loss": 1.026, + "mean_token_accuracy": 0.7598460465669632, + "num_tokens": 11184561.0, + "step": 891 + }, + { + "entropy": 1.0817178189754486, + "epoch": 1.4866666666666668, + "grad_norm": 0.3810464143753052, + "learning_rate": 0.00010631578947368421, + "loss": 1.0715, + "mean_token_accuracy": 0.746548056602478, + "num_tokens": 11196944.0, + "step": 892 + }, + { + "entropy": 0.9835236445069313, + "epoch": 1.4883333333333333, + "grad_norm": 0.37003716826438904, + "learning_rate": 0.00010619883040935672, + "loss": 0.9567, + "mean_token_accuracy": 0.7729001268744469, + "num_tokens": 11209530.0, + "step": 893 + }, + { + "entropy": 1.1025512740015984, + "epoch": 1.49, + "grad_norm": 0.3993394672870636, + "learning_rate": 0.00010608187134502925, + "loss": 1.0974, + "mean_token_accuracy": 0.7313189208507538, + "num_tokens": 11221987.0, + "step": 894 + }, + { + "entropy": 1.0139773339033127, + "epoch": 1.4916666666666667, + "grad_norm": 0.31166166067123413, + "learning_rate": 0.00010596491228070175, + "loss": 1.0037, + "mean_token_accuracy": 0.7613530680537224, + "num_tokens": 11234942.0, + "step": 895 + }, + { + "entropy": 1.0583973452448845, + "epoch": 1.4933333333333334, + "grad_norm": 0.3589918613433838, + "learning_rate": 0.00010584795321637429, + "loss": 1.0734, + "mean_token_accuracy": 0.7417832165956497, + "num_tokens": 11247788.0, + "step": 896 + }, + { + "entropy": 1.1149739176034927, + "epoch": 1.495, + "grad_norm": 0.4475265145301819, + "learning_rate": 0.0001057309941520468, + "loss": 1.1002, + "mean_token_accuracy": 0.7413612082600594, + "num_tokens": 11260738.0, + "step": 897 + }, + { + "entropy": 1.0767404064536095, + "epoch": 1.4966666666666666, + "grad_norm": 0.43724361062049866, + "learning_rate": 0.00010561403508771929, + "loss": 1.077, + "mean_token_accuracy": 0.7468773946166039, + "num_tokens": 11273395.0, + "step": 898 + }, + { + "entropy": 1.1489032730460167, + "epoch": 1.4983333333333333, + "grad_norm": 0.328046053647995, + "learning_rate": 0.00010549707602339183, + "loss": 1.1427, + "mean_token_accuracy": 0.7237901836633682, + "num_tokens": 11285769.0, + "step": 899 + }, + { + "entropy": 0.9795516058802605, + "epoch": 1.5, + "grad_norm": 0.3441937565803528, + "learning_rate": 0.00010538011695906434, + "loss": 0.9833, + "mean_token_accuracy": 0.7675316259264946, + "num_tokens": 11298603.0, + "step": 900 + }, + { + "entropy": 1.0521475449204445, + "epoch": 1.5016666666666667, + "grad_norm": 0.36500778794288635, + "learning_rate": 0.00010526315789473685, + "loss": 1.0503, + "mean_token_accuracy": 0.7476348206400871, + "num_tokens": 11311395.0, + "step": 901 + }, + { + "entropy": 0.9882413372397423, + "epoch": 1.5033333333333334, + "grad_norm": 0.31784096360206604, + "learning_rate": 0.00010514619883040937, + "loss": 0.9629, + "mean_token_accuracy": 0.7727819085121155, + "num_tokens": 11324223.0, + "step": 902 + }, + { + "entropy": 1.0452478006482124, + "epoch": 1.505, + "grad_norm": 0.3701280355453491, + "learning_rate": 0.00010502923976608188, + "loss": 1.0313, + "mean_token_accuracy": 0.7561671435832977, + "num_tokens": 11336798.0, + "step": 903 + }, + { + "entropy": 1.1844883859157562, + "epoch": 1.5066666666666668, + "grad_norm": 0.3551271855831146, + "learning_rate": 0.00010491228070175439, + "loss": 1.166, + "mean_token_accuracy": 0.7237709909677505, + "num_tokens": 11349352.0, + "step": 904 + }, + { + "entropy": 1.0555674508213997, + "epoch": 1.5083333333333333, + "grad_norm": 0.34529635310173035, + "learning_rate": 0.00010479532163742691, + "loss": 1.0294, + "mean_token_accuracy": 0.7504124864935875, + "num_tokens": 11362065.0, + "step": 905 + }, + { + "entropy": 0.9649133235216141, + "epoch": 1.51, + "grad_norm": 0.43623584508895874, + "learning_rate": 0.00010467836257309942, + "loss": 0.9695, + "mean_token_accuracy": 0.7675874829292297, + "num_tokens": 11374499.0, + "step": 906 + }, + { + "entropy": 1.0951272398233414, + "epoch": 1.5116666666666667, + "grad_norm": 0.4360898733139038, + "learning_rate": 0.00010456140350877193, + "loss": 1.0914, + "mean_token_accuracy": 0.7397733628749847, + "num_tokens": 11387186.0, + "step": 907 + }, + { + "entropy": 0.97959403693676, + "epoch": 1.5133333333333332, + "grad_norm": 0.39979809522628784, + "learning_rate": 0.00010444444444444445, + "loss": 0.9577, + "mean_token_accuracy": 0.7697319462895393, + "num_tokens": 11399647.0, + "step": 908 + }, + { + "entropy": 0.997077964246273, + "epoch": 1.5150000000000001, + "grad_norm": 0.4603921175003052, + "learning_rate": 0.00010432748538011696, + "loss": 0.991, + "mean_token_accuracy": 0.7578662112355232, + "num_tokens": 11412192.0, + "step": 909 + }, + { + "entropy": 0.9444845467805862, + "epoch": 1.5166666666666666, + "grad_norm": 0.4429384768009186, + "learning_rate": 0.00010421052631578947, + "loss": 0.9316, + "mean_token_accuracy": 0.7735821157693863, + "num_tokens": 11424501.0, + "step": 910 + }, + { + "entropy": 1.052556574344635, + "epoch": 1.5183333333333333, + "grad_norm": 0.3348616659641266, + "learning_rate": 0.000104093567251462, + "loss": 1.0526, + "mean_token_accuracy": 0.7529369965195656, + "num_tokens": 11437123.0, + "step": 911 + }, + { + "entropy": 1.0511446967720985, + "epoch": 1.52, + "grad_norm": 0.46372920274734497, + "learning_rate": 0.0001039766081871345, + "loss": 1.0789, + "mean_token_accuracy": 0.7440238147974014, + "num_tokens": 11449811.0, + "step": 912 + }, + { + "entropy": 0.9433198198676109, + "epoch": 1.5216666666666665, + "grad_norm": 0.4527278244495392, + "learning_rate": 0.00010385964912280702, + "loss": 0.9384, + "mean_token_accuracy": 0.7744319513440132, + "num_tokens": 11462288.0, + "step": 913 + }, + { + "entropy": 0.9461892023682594, + "epoch": 1.5233333333333334, + "grad_norm": 0.569244384765625, + "learning_rate": 0.00010374269005847954, + "loss": 0.9473, + "mean_token_accuracy": 0.7712215408682823, + "num_tokens": 11474652.0, + "step": 914 + }, + { + "entropy": 0.9832625687122345, + "epoch": 1.525, + "grad_norm": 0.47432512044906616, + "learning_rate": 0.00010362573099415205, + "loss": 0.9741, + "mean_token_accuracy": 0.7722496688365936, + "num_tokens": 11487038.0, + "step": 915 + }, + { + "entropy": 1.113365113735199, + "epoch": 1.5266666666666666, + "grad_norm": 0.5353997945785522, + "learning_rate": 0.00010350877192982457, + "loss": 1.1232, + "mean_token_accuracy": 0.7369033172726631, + "num_tokens": 11499827.0, + "step": 916 + }, + { + "entropy": 1.045794539153576, + "epoch": 1.5283333333333333, + "grad_norm": 0.4842732548713684, + "learning_rate": 0.00010339181286549708, + "loss": 1.0219, + "mean_token_accuracy": 0.7538414746522903, + "num_tokens": 11512411.0, + "step": 917 + }, + { + "entropy": 1.008032351732254, + "epoch": 1.53, + "grad_norm": 0.4723028242588043, + "learning_rate": 0.00010327485380116959, + "loss": 0.9897, + "mean_token_accuracy": 0.7631094679236412, + "num_tokens": 11525202.0, + "step": 918 + }, + { + "entropy": 1.2124775275588036, + "epoch": 1.5316666666666667, + "grad_norm": 0.6457427740097046, + "learning_rate": 0.00010315789473684211, + "loss": 1.2234, + "mean_token_accuracy": 0.7147088348865509, + "num_tokens": 11537747.0, + "step": 919 + }, + { + "entropy": 1.0353680774569511, + "epoch": 1.5333333333333332, + "grad_norm": 0.5149745345115662, + "learning_rate": 0.00010304093567251462, + "loss": 1.0249, + "mean_token_accuracy": 0.7582436203956604, + "num_tokens": 11550393.0, + "step": 920 + }, + { + "entropy": 1.0725176259875298, + "epoch": 1.5350000000000001, + "grad_norm": 0.3201546370983124, + "learning_rate": 0.00010292397660818713, + "loss": 1.08, + "mean_token_accuracy": 0.7444223612546921, + "num_tokens": 11562681.0, + "step": 921 + }, + { + "entropy": 1.0299292877316475, + "epoch": 1.5366666666666666, + "grad_norm": 0.6729260087013245, + "learning_rate": 0.00010280701754385967, + "loss": 1.0095, + "mean_token_accuracy": 0.7589081674814224, + "num_tokens": 11575287.0, + "step": 922 + }, + { + "entropy": 1.129386618733406, + "epoch": 1.5383333333333333, + "grad_norm": 0.647361159324646, + "learning_rate": 0.00010269005847953216, + "loss": 1.1285, + "mean_token_accuracy": 0.7395085915923119, + "num_tokens": 11587776.0, + "step": 923 + }, + { + "entropy": 0.901118665933609, + "epoch": 1.54, + "grad_norm": 0.4085787832736969, + "learning_rate": 0.00010257309941520467, + "loss": 0.8937, + "mean_token_accuracy": 0.7793251350522041, + "num_tokens": 11600608.0, + "step": 924 + }, + { + "entropy": 1.0647885873913765, + "epoch": 1.5416666666666665, + "grad_norm": 0.38735586404800415, + "learning_rate": 0.00010245614035087721, + "loss": 1.0462, + "mean_token_accuracy": 0.740901917219162, + "num_tokens": 11613369.0, + "step": 925 + }, + { + "entropy": 0.9393143653869629, + "epoch": 1.5433333333333334, + "grad_norm": 0.6960055232048035, + "learning_rate": 0.00010233918128654971, + "loss": 0.9437, + "mean_token_accuracy": 0.7748453468084335, + "num_tokens": 11625871.0, + "step": 926 + }, + { + "entropy": 1.1271546632051468, + "epoch": 1.545, + "grad_norm": 0.5912812948226929, + "learning_rate": 0.00010222222222222222, + "loss": 1.1254, + "mean_token_accuracy": 0.7337081283330917, + "num_tokens": 11638374.0, + "step": 927 + }, + { + "entropy": 0.9807042926549911, + "epoch": 1.5466666666666666, + "grad_norm": 0.3811970055103302, + "learning_rate": 0.00010210526315789475, + "loss": 0.9585, + "mean_token_accuracy": 0.7691030874848366, + "num_tokens": 11651054.0, + "step": 928 + }, + { + "entropy": 1.0841076374053955, + "epoch": 1.5483333333333333, + "grad_norm": 0.5395686626434326, + "learning_rate": 0.00010198830409356725, + "loss": 1.0496, + "mean_token_accuracy": 0.7462358325719833, + "num_tokens": 11663627.0, + "step": 929 + }, + { + "entropy": 1.065460205078125, + "epoch": 1.55, + "grad_norm": 0.763958752155304, + "learning_rate": 0.00010187134502923976, + "loss": 1.0523, + "mean_token_accuracy": 0.7501270100474358, + "num_tokens": 11675897.0, + "step": 930 + }, + { + "entropy": 1.0842286050319672, + "epoch": 1.5516666666666667, + "grad_norm": 0.5680394172668457, + "learning_rate": 0.0001017543859649123, + "loss": 1.0838, + "mean_token_accuracy": 0.7463953346014023, + "num_tokens": 11688459.0, + "step": 931 + }, + { + "entropy": 1.0065980926156044, + "epoch": 1.5533333333333332, + "grad_norm": 0.5923241376876831, + "learning_rate": 0.0001016374269005848, + "loss": 0.9959, + "mean_token_accuracy": 0.7607046961784363, + "num_tokens": 11700966.0, + "step": 932 + }, + { + "entropy": 1.0660686418414116, + "epoch": 1.5550000000000002, + "grad_norm": 0.6443043351173401, + "learning_rate": 0.0001015204678362573, + "loss": 1.0844, + "mean_token_accuracy": 0.7445439100265503, + "num_tokens": 11713700.0, + "step": 933 + }, + { + "entropy": 1.0937560498714447, + "epoch": 1.5566666666666666, + "grad_norm": 0.5176796317100525, + "learning_rate": 0.00010140350877192984, + "loss": 1.1249, + "mean_token_accuracy": 0.7354537099599838, + "num_tokens": 11726035.0, + "step": 934 + }, + { + "entropy": 0.9690373241901398, + "epoch": 1.5583333333333333, + "grad_norm": 0.36154699325561523, + "learning_rate": 0.00010128654970760235, + "loss": 0.9794, + "mean_token_accuracy": 0.7714982256293297, + "num_tokens": 11738341.0, + "step": 935 + }, + { + "entropy": 1.124791868031025, + "epoch": 1.56, + "grad_norm": 0.47166070342063904, + "learning_rate": 0.00010116959064327487, + "loss": 1.1183, + "mean_token_accuracy": 0.7350871786475182, + "num_tokens": 11750883.0, + "step": 936 + }, + { + "entropy": 1.076091207563877, + "epoch": 1.5616666666666665, + "grad_norm": 0.5033275485038757, + "learning_rate": 0.00010105263157894738, + "loss": 1.0704, + "mean_token_accuracy": 0.7419762536883354, + "num_tokens": 11763453.0, + "step": 937 + }, + { + "entropy": 1.044884666800499, + "epoch": 1.5633333333333335, + "grad_norm": 0.381427139043808, + "learning_rate": 0.00010093567251461989, + "loss": 1.0389, + "mean_token_accuracy": 0.7595488056540489, + "num_tokens": 11775807.0, + "step": 938 + }, + { + "entropy": 1.0149907171726227, + "epoch": 1.565, + "grad_norm": 0.3665209114551544, + "learning_rate": 0.00010081871345029241, + "loss": 1.0111, + "mean_token_accuracy": 0.7635852620005608, + "num_tokens": 11788144.0, + "step": 939 + }, + { + "entropy": 0.9833221808075905, + "epoch": 1.5666666666666667, + "grad_norm": 0.3676835894584656, + "learning_rate": 0.00010070175438596492, + "loss": 0.9584, + "mean_token_accuracy": 0.7688944488763809, + "num_tokens": 11800736.0, + "step": 940 + }, + { + "entropy": 1.0265733674168587, + "epoch": 1.5683333333333334, + "grad_norm": 0.356206476688385, + "learning_rate": 0.00010058479532163743, + "loss": 1.0307, + "mean_token_accuracy": 0.7622543349862099, + "num_tokens": 11813252.0, + "step": 941 + }, + { + "entropy": 1.0316286012530327, + "epoch": 1.5699999999999998, + "grad_norm": 0.5001046061515808, + "learning_rate": 0.00010046783625730995, + "loss": 1.0018, + "mean_token_accuracy": 0.762552946805954, + "num_tokens": 11825972.0, + "step": 942 + }, + { + "entropy": 1.077604465186596, + "epoch": 1.5716666666666668, + "grad_norm": 0.4228179454803467, + "learning_rate": 0.00010035087719298246, + "loss": 1.0474, + "mean_token_accuracy": 0.7495718151330948, + "num_tokens": 11838741.0, + "step": 943 + }, + { + "entropy": 1.116953693330288, + "epoch": 1.5733333333333333, + "grad_norm": 0.35452526807785034, + "learning_rate": 0.00010023391812865497, + "loss": 1.109, + "mean_token_accuracy": 0.7296589389443398, + "num_tokens": 11851287.0, + "step": 944 + }, + { + "entropy": 1.0866753309965134, + "epoch": 1.575, + "grad_norm": 0.40109559893608093, + "learning_rate": 0.0001001169590643275, + "loss": 1.0577, + "mean_token_accuracy": 0.7466104477643967, + "num_tokens": 11864035.0, + "step": 945 + }, + { + "entropy": 1.0261227637529373, + "epoch": 1.5766666666666667, + "grad_norm": 0.6341944336891174, + "learning_rate": 0.0001, + "loss": 1.0223, + "mean_token_accuracy": 0.7552044317126274, + "num_tokens": 11876551.0, + "step": 946 + }, + { + "entropy": 1.0396640598773956, + "epoch": 1.5783333333333334, + "grad_norm": 0.387890487909317, + "learning_rate": 9.988304093567253e-05, + "loss": 1.0433, + "mean_token_accuracy": 0.7529490813612938, + "num_tokens": 11889267.0, + "step": 947 + }, + { + "entropy": 0.9809285327792168, + "epoch": 1.58, + "grad_norm": 0.3864838480949402, + "learning_rate": 9.976608187134502e-05, + "loss": 0.9764, + "mean_token_accuracy": 0.758466362953186, + "num_tokens": 11902019.0, + "step": 948 + }, + { + "entropy": 1.022968828678131, + "epoch": 1.5816666666666666, + "grad_norm": 0.4740675687789917, + "learning_rate": 9.964912280701755e-05, + "loss": 1.0368, + "mean_token_accuracy": 0.7580825462937355, + "num_tokens": 11914538.0, + "step": 949 + }, + { + "entropy": 1.0423406288027763, + "epoch": 1.5833333333333335, + "grad_norm": 0.46600988507270813, + "learning_rate": 9.953216374269007e-05, + "loss": 1.0365, + "mean_token_accuracy": 0.7558559775352478, + "num_tokens": 11926973.0, + "step": 950 + }, + { + "entropy": 1.0362046658992767, + "epoch": 1.585, + "grad_norm": 0.3281984329223633, + "learning_rate": 9.941520467836257e-05, + "loss": 1.0365, + "mean_token_accuracy": 0.7519606500864029, + "num_tokens": 11939555.0, + "step": 951 + }, + { + "entropy": 1.0146067589521408, + "epoch": 1.5866666666666667, + "grad_norm": 0.33721277117729187, + "learning_rate": 9.929824561403509e-05, + "loss": 1.0087, + "mean_token_accuracy": 0.7585414201021194, + "num_tokens": 11951842.0, + "step": 952 + }, + { + "entropy": 1.0265194848179817, + "epoch": 1.5883333333333334, + "grad_norm": 0.5035887956619263, + "learning_rate": 9.918128654970761e-05, + "loss": 1.0271, + "mean_token_accuracy": 0.7587100341916084, + "num_tokens": 11964140.0, + "step": 953 + }, + { + "entropy": 1.0434783324599266, + "epoch": 1.5899999999999999, + "grad_norm": 0.35144132375717163, + "learning_rate": 9.906432748538012e-05, + "loss": 1.015, + "mean_token_accuracy": 0.7533115297555923, + "num_tokens": 11976609.0, + "step": 954 + }, + { + "entropy": 0.9975638315081596, + "epoch": 1.5916666666666668, + "grad_norm": 0.35626283288002014, + "learning_rate": 9.894736842105263e-05, + "loss": 0.9785, + "mean_token_accuracy": 0.7682247906923294, + "num_tokens": 11989066.0, + "step": 955 + }, + { + "entropy": 1.0139445587992668, + "epoch": 1.5933333333333333, + "grad_norm": 0.34869149327278137, + "learning_rate": 9.883040935672515e-05, + "loss": 0.9703, + "mean_token_accuracy": 0.7643293812870979, + "num_tokens": 12001706.0, + "step": 956 + }, + { + "entropy": 1.0372228473424911, + "epoch": 1.595, + "grad_norm": 0.3563248813152313, + "learning_rate": 9.871345029239766e-05, + "loss": 1.0117, + "mean_token_accuracy": 0.7578230872750282, + "num_tokens": 12014447.0, + "step": 957 + }, + { + "entropy": 0.8905011862516403, + "epoch": 1.5966666666666667, + "grad_norm": 0.39937323331832886, + "learning_rate": 9.859649122807017e-05, + "loss": 0.8516, + "mean_token_accuracy": 0.7892890945076942, + "num_tokens": 12027001.0, + "step": 958 + }, + { + "entropy": 1.0956081077456474, + "epoch": 1.5983333333333334, + "grad_norm": 0.3379671275615692, + "learning_rate": 9.84795321637427e-05, + "loss": 1.1264, + "mean_token_accuracy": 0.7354054301977158, + "num_tokens": 12039455.0, + "step": 959 + }, + { + "entropy": 1.1044630855321884, + "epoch": 1.6, + "grad_norm": 0.4157054126262665, + "learning_rate": 9.83625730994152e-05, + "loss": 1.1178, + "mean_token_accuracy": 0.7352635785937309, + "num_tokens": 12052156.0, + "step": 960 + }, + { + "entropy": 0.989508643746376, + "epoch": 1.6016666666666666, + "grad_norm": 0.3482710123062134, + "learning_rate": 9.824561403508771e-05, + "loss": 1.0037, + "mean_token_accuracy": 0.7680085748434067, + "num_tokens": 12064784.0, + "step": 961 + }, + { + "entropy": 0.8837302699685097, + "epoch": 1.6033333333333335, + "grad_norm": 0.39792779088020325, + "learning_rate": 9.812865497076024e-05, + "loss": 0.8757, + "mean_token_accuracy": 0.794054351747036, + "num_tokens": 12077182.0, + "step": 962 + }, + { + "entropy": 1.075769916176796, + "epoch": 1.605, + "grad_norm": 0.4095625877380371, + "learning_rate": 9.801169590643276e-05, + "loss": 1.1094, + "mean_token_accuracy": 0.7338564768433571, + "num_tokens": 12089748.0, + "step": 963 + }, + { + "entropy": 1.0048549994826317, + "epoch": 1.6066666666666667, + "grad_norm": 0.5313544273376465, + "learning_rate": 9.789473684210527e-05, + "loss": 1.0222, + "mean_token_accuracy": 0.7637491151690483, + "num_tokens": 12102208.0, + "step": 964 + }, + { + "entropy": 0.9381846338510513, + "epoch": 1.6083333333333334, + "grad_norm": 0.7829982042312622, + "learning_rate": 9.777777777777778e-05, + "loss": 0.9526, + "mean_token_accuracy": 0.7717723697423935, + "num_tokens": 12115050.0, + "step": 965 + }, + { + "entropy": 0.988870695233345, + "epoch": 1.6099999999999999, + "grad_norm": 0.3945852220058441, + "learning_rate": 9.76608187134503e-05, + "loss": 1.0066, + "mean_token_accuracy": 0.7594099268317223, + "num_tokens": 12127483.0, + "step": 966 + }, + { + "entropy": 0.8450613245368004, + "epoch": 1.6116666666666668, + "grad_norm": 0.7763268351554871, + "learning_rate": 9.754385964912281e-05, + "loss": 0.8352, + "mean_token_accuracy": 0.8030087202787399, + "num_tokens": 12140018.0, + "step": 967 + }, + { + "entropy": 0.9965870380401611, + "epoch": 1.6133333333333333, + "grad_norm": 0.3498965799808502, + "learning_rate": 9.742690058479532e-05, + "loss": 0.9648, + "mean_token_accuracy": 0.7675676867365837, + "num_tokens": 12152304.0, + "step": 968 + }, + { + "entropy": 0.8886230438947678, + "epoch": 1.615, + "grad_norm": 0.35514211654663086, + "learning_rate": 9.730994152046784e-05, + "loss": 0.8499, + "mean_token_accuracy": 0.7898500040173531, + "num_tokens": 12165239.0, + "step": 969 + }, + { + "entropy": 1.0475463792681694, + "epoch": 1.6166666666666667, + "grad_norm": 0.32757648825645447, + "learning_rate": 9.719298245614035e-05, + "loss": 1.0377, + "mean_token_accuracy": 0.7567182034254074, + "num_tokens": 12177732.0, + "step": 970 + }, + { + "entropy": 1.077672116458416, + "epoch": 1.6183333333333332, + "grad_norm": 0.3843959867954254, + "learning_rate": 9.707602339181286e-05, + "loss": 1.0523, + "mean_token_accuracy": 0.7472524493932724, + "num_tokens": 12190337.0, + "step": 971 + }, + { + "entropy": 0.96406065300107, + "epoch": 1.62, + "grad_norm": 0.4375675320625305, + "learning_rate": 9.695906432748539e-05, + "loss": 0.9323, + "mean_token_accuracy": 0.7749380320310593, + "num_tokens": 12202681.0, + "step": 972 + }, + { + "entropy": 0.9428460970520973, + "epoch": 1.6216666666666666, + "grad_norm": 0.42265263199806213, + "learning_rate": 9.68421052631579e-05, + "loss": 0.9253, + "mean_token_accuracy": 0.7729097902774811, + "num_tokens": 12215372.0, + "step": 973 + }, + { + "entropy": 0.8881849274039268, + "epoch": 1.6233333333333333, + "grad_norm": 0.3061010241508484, + "learning_rate": 9.672514619883042e-05, + "loss": 0.8821, + "mean_token_accuracy": 0.7929625511169434, + "num_tokens": 12227794.0, + "step": 974 + }, + { + "entropy": 1.2201930955052376, + "epoch": 1.625, + "grad_norm": 0.4130735695362091, + "learning_rate": 9.660818713450293e-05, + "loss": 1.202, + "mean_token_accuracy": 0.7246149331331253, + "num_tokens": 12240350.0, + "step": 975 + }, + { + "entropy": 0.9785461276769638, + "epoch": 1.6266666666666667, + "grad_norm": 0.37417230010032654, + "learning_rate": 9.649122807017544e-05, + "loss": 0.9684, + "mean_token_accuracy": 0.7681845799088478, + "num_tokens": 12252809.0, + "step": 976 + }, + { + "entropy": 0.9370051473379135, + "epoch": 1.6283333333333334, + "grad_norm": 0.4027501344680786, + "learning_rate": 9.637426900584796e-05, + "loss": 0.9468, + "mean_token_accuracy": 0.7657081261277199, + "num_tokens": 12265800.0, + "step": 977 + }, + { + "entropy": 0.8598027527332306, + "epoch": 1.63, + "grad_norm": 0.38356924057006836, + "learning_rate": 9.625730994152047e-05, + "loss": 0.8497, + "mean_token_accuracy": 0.7939843013882637, + "num_tokens": 12278392.0, + "step": 978 + }, + { + "entropy": 0.9169039651751518, + "epoch": 1.6316666666666668, + "grad_norm": 0.3893364369869232, + "learning_rate": 9.614035087719298e-05, + "loss": 0.947, + "mean_token_accuracy": 0.7715698778629303, + "num_tokens": 12290956.0, + "step": 979 + }, + { + "entropy": 1.0241443440318108, + "epoch": 1.6333333333333333, + "grad_norm": 0.3506811559200287, + "learning_rate": 9.60233918128655e-05, + "loss": 1.0449, + "mean_token_accuracy": 0.752354621887207, + "num_tokens": 12303340.0, + "step": 980 + }, + { + "entropy": 1.1178838685154915, + "epoch": 1.635, + "grad_norm": 0.42362692952156067, + "learning_rate": 9.590643274853801e-05, + "loss": 1.1287, + "mean_token_accuracy": 0.7328075543045998, + "num_tokens": 12315539.0, + "step": 981 + }, + { + "entropy": 0.9596805796027184, + "epoch": 1.6366666666666667, + "grad_norm": 0.3150109648704529, + "learning_rate": 9.578947368421052e-05, + "loss": 0.9699, + "mean_token_accuracy": 0.7640129998326302, + "num_tokens": 12328171.0, + "step": 982 + }, + { + "entropy": 0.8891875892877579, + "epoch": 1.6383333333333332, + "grad_norm": 0.38294193148612976, + "learning_rate": 9.567251461988305e-05, + "loss": 0.8698, + "mean_token_accuracy": 0.7974608764052391, + "num_tokens": 12340881.0, + "step": 983 + }, + { + "entropy": 1.0465619787573814, + "epoch": 1.6400000000000001, + "grad_norm": 0.40911293029785156, + "learning_rate": 9.555555555555557e-05, + "loss": 1.0587, + "mean_token_accuracy": 0.7567858397960663, + "num_tokens": 12353378.0, + "step": 984 + }, + { + "entropy": 1.0272936820983887, + "epoch": 1.6416666666666666, + "grad_norm": 0.3643612861633301, + "learning_rate": 9.543859649122808e-05, + "loss": 1.0141, + "mean_token_accuracy": 0.7541831061244011, + "num_tokens": 12365690.0, + "step": 985 + }, + { + "entropy": 1.1167894005775452, + "epoch": 1.6433333333333333, + "grad_norm": 0.4006604850292206, + "learning_rate": 9.532163742690059e-05, + "loss": 1.1211, + "mean_token_accuracy": 0.7414597794413567, + "num_tokens": 12378138.0, + "step": 986 + }, + { + "entropy": 0.9925767779350281, + "epoch": 1.645, + "grad_norm": 0.390142023563385, + "learning_rate": 9.520467836257311e-05, + "loss": 0.9605, + "mean_token_accuracy": 0.768660306930542, + "num_tokens": 12390642.0, + "step": 987 + }, + { + "entropy": 0.9583388492465019, + "epoch": 1.6466666666666665, + "grad_norm": 0.36772555112838745, + "learning_rate": 9.508771929824562e-05, + "loss": 0.9473, + "mean_token_accuracy": 0.7698638662695885, + "num_tokens": 12403040.0, + "step": 988 + }, + { + "entropy": 1.0084424540400505, + "epoch": 1.6483333333333334, + "grad_norm": 0.3564258813858032, + "learning_rate": 9.497076023391813e-05, + "loss": 1.0044, + "mean_token_accuracy": 0.7635295242071152, + "num_tokens": 12415673.0, + "step": 989 + }, + { + "entropy": 0.9867688044905663, + "epoch": 1.65, + "grad_norm": 0.3392479717731476, + "learning_rate": 9.485380116959065e-05, + "loss": 1.0133, + "mean_token_accuracy": 0.7665977254509926, + "num_tokens": 12428197.0, + "step": 990 + }, + { + "entropy": 1.0522988960146904, + "epoch": 1.6516666666666666, + "grad_norm": 0.40604665875434875, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9979, + "mean_token_accuracy": 0.7608874589204788, + "num_tokens": 12440702.0, + "step": 991 + }, + { + "entropy": 1.0473966524004936, + "epoch": 1.6533333333333333, + "grad_norm": 0.3371841311454773, + "learning_rate": 9.461988304093567e-05, + "loss": 1.0415, + "mean_token_accuracy": 0.75101687759161, + "num_tokens": 12453412.0, + "step": 992 + }, + { + "entropy": 0.9661148190498352, + "epoch": 1.655, + "grad_norm": 0.5349663496017456, + "learning_rate": 9.45029239766082e-05, + "loss": 0.9644, + "mean_token_accuracy": 0.7739033624529839, + "num_tokens": 12465842.0, + "step": 993 + }, + { + "entropy": 1.040437489748001, + "epoch": 1.6566666666666667, + "grad_norm": 0.3611598610877991, + "learning_rate": 9.438596491228072e-05, + "loss": 1.0096, + "mean_token_accuracy": 0.7568608149886131, + "num_tokens": 12478560.0, + "step": 994 + }, + { + "entropy": 0.9655180498957634, + "epoch": 1.6583333333333332, + "grad_norm": 0.3291683793067932, + "learning_rate": 9.426900584795321e-05, + "loss": 0.9887, + "mean_token_accuracy": 0.7650616839528084, + "num_tokens": 12490947.0, + "step": 995 + }, + { + "entropy": 0.9577772319316864, + "epoch": 1.6600000000000001, + "grad_norm": 0.463991641998291, + "learning_rate": 9.415204678362574e-05, + "loss": 0.9601, + "mean_token_accuracy": 0.769118033349514, + "num_tokens": 12503752.0, + "step": 996 + }, + { + "entropy": 1.115663342177868, + "epoch": 1.6616666666666666, + "grad_norm": 0.4828460216522217, + "learning_rate": 9.403508771929826e-05, + "loss": 1.1281, + "mean_token_accuracy": 0.7351868525147438, + "num_tokens": 12516063.0, + "step": 997 + }, + { + "entropy": 1.104174591600895, + "epoch": 1.6633333333333333, + "grad_norm": 0.3741133511066437, + "learning_rate": 9.391812865497076e-05, + "loss": 1.114, + "mean_token_accuracy": 0.7381681129336357, + "num_tokens": 12528649.0, + "step": 998 + }, + { + "entropy": 1.0054408684372902, + "epoch": 1.665, + "grad_norm": 0.34373944997787476, + "learning_rate": 9.380116959064328e-05, + "loss": 1.0193, + "mean_token_accuracy": 0.7606060951948166, + "num_tokens": 12541056.0, + "step": 999 + }, + { + "entropy": 1.148328110575676, + "epoch": 1.6666666666666665, + "grad_norm": 0.32720351219177246, + "learning_rate": 9.36842105263158e-05, + "loss": 1.1651, + "mean_token_accuracy": 0.7306461036205292, + "num_tokens": 12553340.0, + "step": 1000 + }, + { + "entropy": 1.0338936150074005, + "epoch": 1.6683333333333334, + "grad_norm": 0.33580687642097473, + "learning_rate": 9.35672514619883e-05, + "loss": 1.0078, + "mean_token_accuracy": 0.7570296004414558, + "num_tokens": 12565983.0, + "step": 1001 + }, + { + "entropy": 0.877336673438549, + "epoch": 1.67, + "grad_norm": 0.36133208870887756, + "learning_rate": 9.345029239766082e-05, + "loss": 0.8854, + "mean_token_accuracy": 0.7905719429254532, + "num_tokens": 12578466.0, + "step": 1002 + }, + { + "entropy": 1.157466672360897, + "epoch": 1.6716666666666666, + "grad_norm": 0.4128221571445465, + "learning_rate": 9.333333333333334e-05, + "loss": 1.1676, + "mean_token_accuracy": 0.726306177675724, + "num_tokens": 12591221.0, + "step": 1003 + }, + { + "entropy": 1.0364705994725227, + "epoch": 1.6733333333333333, + "grad_norm": 0.35205385088920593, + "learning_rate": 9.321637426900585e-05, + "loss": 1.0262, + "mean_token_accuracy": 0.7624763324856758, + "num_tokens": 12603804.0, + "step": 1004 + }, + { + "entropy": 1.1215010583400726, + "epoch": 1.675, + "grad_norm": 0.42232760787010193, + "learning_rate": 9.309941520467836e-05, + "loss": 1.1199, + "mean_token_accuracy": 0.736581914126873, + "num_tokens": 12616411.0, + "step": 1005 + }, + { + "entropy": 0.9427232444286346, + "epoch": 1.6766666666666667, + "grad_norm": 0.34733209013938904, + "learning_rate": 9.298245614035089e-05, + "loss": 0.9315, + "mean_token_accuracy": 0.7790528759360313, + "num_tokens": 12628828.0, + "step": 1006 + }, + { + "entropy": 0.9177378788590431, + "epoch": 1.6783333333333332, + "grad_norm": 0.35805800557136536, + "learning_rate": 9.28654970760234e-05, + "loss": 0.9019, + "mean_token_accuracy": 0.7822014093399048, + "num_tokens": 12641546.0, + "step": 1007 + }, + { + "entropy": 0.9339721202850342, + "epoch": 1.6800000000000002, + "grad_norm": 0.3551904559135437, + "learning_rate": 9.27485380116959e-05, + "loss": 0.9064, + "mean_token_accuracy": 0.7829243242740631, + "num_tokens": 12654056.0, + "step": 1008 + }, + { + "entropy": 1.0404436141252518, + "epoch": 1.6816666666666666, + "grad_norm": 0.3631914556026459, + "learning_rate": 9.263157894736843e-05, + "loss": 0.9982, + "mean_token_accuracy": 0.7624745666980743, + "num_tokens": 12666398.0, + "step": 1009 + }, + { + "entropy": 0.8712766841053963, + "epoch": 1.6833333333333333, + "grad_norm": 0.33432894945144653, + "learning_rate": 9.251461988304094e-05, + "loss": 0.8554, + "mean_token_accuracy": 0.7947898954153061, + "num_tokens": 12679178.0, + "step": 1010 + }, + { + "entropy": 1.0278621464967728, + "epoch": 1.685, + "grad_norm": 0.33881428837776184, + "learning_rate": 9.239766081871345e-05, + "loss": 1.0208, + "mean_token_accuracy": 0.7516693696379662, + "num_tokens": 12691644.0, + "step": 1011 + }, + { + "entropy": 1.0649035423994064, + "epoch": 1.6866666666666665, + "grad_norm": 0.37799784541130066, + "learning_rate": 9.228070175438597e-05, + "loss": 1.0772, + "mean_token_accuracy": 0.7425209805369377, + "num_tokens": 12704296.0, + "step": 1012 + }, + { + "entropy": 0.9589243307709694, + "epoch": 1.6883333333333335, + "grad_norm": 0.3711475431919098, + "learning_rate": 9.216374269005849e-05, + "loss": 0.9257, + "mean_token_accuracy": 0.7744899317622185, + "num_tokens": 12716590.0, + "step": 1013 + }, + { + "entropy": 1.00307896733284, + "epoch": 1.69, + "grad_norm": 0.4023561179637909, + "learning_rate": 9.2046783625731e-05, + "loss": 1.0033, + "mean_token_accuracy": 0.758160911500454, + "num_tokens": 12728910.0, + "step": 1014 + }, + { + "entropy": 0.9863514825701714, + "epoch": 1.6916666666666667, + "grad_norm": 0.3267923891544342, + "learning_rate": 9.192982456140351e-05, + "loss": 0.9921, + "mean_token_accuracy": 0.7703479081392288, + "num_tokens": 12741483.0, + "step": 1015 + }, + { + "entropy": 0.9462117999792099, + "epoch": 1.6933333333333334, + "grad_norm": 0.5054460167884827, + "learning_rate": 9.181286549707603e-05, + "loss": 0.9359, + "mean_token_accuracy": 0.778098352253437, + "num_tokens": 12754055.0, + "step": 1016 + }, + { + "entropy": 1.0157199203968048, + "epoch": 1.6949999999999998, + "grad_norm": 0.4718424379825592, + "learning_rate": 9.169590643274854e-05, + "loss": 1.0238, + "mean_token_accuracy": 0.7569868713617325, + "num_tokens": 12766637.0, + "step": 1017 + }, + { + "entropy": 1.164522334933281, + "epoch": 1.6966666666666668, + "grad_norm": 0.41534754633903503, + "learning_rate": 9.157894736842105e-05, + "loss": 1.1519, + "mean_token_accuracy": 0.7301210761070251, + "num_tokens": 12779298.0, + "step": 1018 + }, + { + "entropy": 0.9943608194589615, + "epoch": 1.6983333333333333, + "grad_norm": 0.3897961676120758, + "learning_rate": 9.146198830409358e-05, + "loss": 0.9711, + "mean_token_accuracy": 0.7657627090811729, + "num_tokens": 12792031.0, + "step": 1019 + }, + { + "entropy": 1.0494454652071, + "epoch": 1.7, + "grad_norm": 0.37525609135627747, + "learning_rate": 9.134502923976609e-05, + "loss": 1.0261, + "mean_token_accuracy": 0.7546698749065399, + "num_tokens": 12804790.0, + "step": 1020 + }, + { + "entropy": 1.0734568759799004, + "epoch": 1.7016666666666667, + "grad_norm": 0.4507957994937897, + "learning_rate": 9.12280701754386e-05, + "loss": 1.0734, + "mean_token_accuracy": 0.7483791783452034, + "num_tokens": 12817417.0, + "step": 1021 + }, + { + "entropy": 1.0368591323494911, + "epoch": 1.7033333333333334, + "grad_norm": 0.36012768745422363, + "learning_rate": 9.111111111111112e-05, + "loss": 1.0285, + "mean_token_accuracy": 0.7521537616848946, + "num_tokens": 12829969.0, + "step": 1022 + }, + { + "entropy": 1.1286500170826912, + "epoch": 1.705, + "grad_norm": 0.3422996401786804, + "learning_rate": 9.099415204678363e-05, + "loss": 1.1183, + "mean_token_accuracy": 0.7346495315432549, + "num_tokens": 12842365.0, + "step": 1023 + }, + { + "entropy": 1.2009264305233955, + "epoch": 1.7066666666666666, + "grad_norm": 0.45700791478157043, + "learning_rate": 9.087719298245615e-05, + "loss": 1.2134, + "mean_token_accuracy": 0.7141278833150864, + "num_tokens": 12855028.0, + "step": 1024 + }, + { + "entropy": 1.1235537379980087, + "epoch": 1.7083333333333335, + "grad_norm": 0.39127787947654724, + "learning_rate": 9.076023391812866e-05, + "loss": 1.0881, + "mean_token_accuracy": 0.7371407151222229, + "num_tokens": 12867632.0, + "step": 1025 + }, + { + "entropy": 1.0213222280144691, + "epoch": 1.71, + "grad_norm": 0.6702572107315063, + "learning_rate": 9.064327485380117e-05, + "loss": 0.9986, + "mean_token_accuracy": 0.7571739181876183, + "num_tokens": 12880333.0, + "step": 1026 + }, + { + "entropy": 1.0406272113323212, + "epoch": 1.7116666666666667, + "grad_norm": 0.5308269262313843, + "learning_rate": 9.052631578947369e-05, + "loss": 1.0334, + "mean_token_accuracy": 0.7501184120774269, + "num_tokens": 12892992.0, + "step": 1027 + }, + { + "entropy": 1.0075775384902954, + "epoch": 1.7133333333333334, + "grad_norm": 0.41957348585128784, + "learning_rate": 9.04093567251462e-05, + "loss": 1.0008, + "mean_token_accuracy": 0.7609769701957703, + "num_tokens": 12905703.0, + "step": 1028 + }, + { + "entropy": 0.9897546097636223, + "epoch": 1.7149999999999999, + "grad_norm": 0.35864391922950745, + "learning_rate": 9.029239766081871e-05, + "loss": 1.0039, + "mean_token_accuracy": 0.7623355314135551, + "num_tokens": 12918300.0, + "step": 1029 + }, + { + "entropy": 0.9119668006896973, + "epoch": 1.7166666666666668, + "grad_norm": 0.5414547324180603, + "learning_rate": 9.017543859649123e-05, + "loss": 0.9076, + "mean_token_accuracy": 0.7755086645483971, + "num_tokens": 12930883.0, + "step": 1030 + }, + { + "entropy": 0.9837629571557045, + "epoch": 1.7183333333333333, + "grad_norm": 0.4744820296764374, + "learning_rate": 9.005847953216374e-05, + "loss": 0.9837, + "mean_token_accuracy": 0.7667191326618195, + "num_tokens": 12943412.0, + "step": 1031 + }, + { + "entropy": 1.1909087374806404, + "epoch": 1.72, + "grad_norm": 0.31805115938186646, + "learning_rate": 8.994152046783625e-05, + "loss": 1.2015, + "mean_token_accuracy": 0.7204370275139809, + "num_tokens": 12955839.0, + "step": 1032 + }, + { + "entropy": 1.009498618543148, + "epoch": 1.7216666666666667, + "grad_norm": 0.47378993034362793, + "learning_rate": 8.982456140350878e-05, + "loss": 1.033, + "mean_token_accuracy": 0.7578397020697594, + "num_tokens": 12968328.0, + "step": 1033 + }, + { + "entropy": 1.0100986510515213, + "epoch": 1.7233333333333334, + "grad_norm": 0.3783584535121918, + "learning_rate": 8.97076023391813e-05, + "loss": 0.978, + "mean_token_accuracy": 0.7699412554502487, + "num_tokens": 12981120.0, + "step": 1034 + }, + { + "entropy": 0.9874220192432404, + "epoch": 1.725, + "grad_norm": 0.3560539484024048, + "learning_rate": 8.959064327485381e-05, + "loss": 0.9914, + "mean_token_accuracy": 0.7623036429286003, + "num_tokens": 12994015.0, + "step": 1035 + }, + { + "entropy": 1.0502874776721, + "epoch": 1.7266666666666666, + "grad_norm": 0.3287025988101959, + "learning_rate": 8.947368421052632e-05, + "loss": 1.0022, + "mean_token_accuracy": 0.7620445415377617, + "num_tokens": 13006422.0, + "step": 1036 + }, + { + "entropy": 1.1608180850744247, + "epoch": 1.7283333333333335, + "grad_norm": 0.39951977133750916, + "learning_rate": 8.935672514619884e-05, + "loss": 1.1191, + "mean_token_accuracy": 0.7336246818304062, + "num_tokens": 13018958.0, + "step": 1037 + }, + { + "entropy": 1.0154462233185768, + "epoch": 1.73, + "grad_norm": 0.3510722517967224, + "learning_rate": 8.923976608187135e-05, + "loss": 0.9876, + "mean_token_accuracy": 0.7671427950263023, + "num_tokens": 13031565.0, + "step": 1038 + }, + { + "entropy": 1.1321008205413818, + "epoch": 1.7316666666666667, + "grad_norm": 0.34480804204940796, + "learning_rate": 8.912280701754386e-05, + "loss": 1.1336, + "mean_token_accuracy": 0.7281079292297363, + "num_tokens": 13043941.0, + "step": 1039 + }, + { + "entropy": 1.0491846576333046, + "epoch": 1.7333333333333334, + "grad_norm": 0.35498106479644775, + "learning_rate": 8.900584795321638e-05, + "loss": 1.0446, + "mean_token_accuracy": 0.752373032271862, + "num_tokens": 13056405.0, + "step": 1040 + }, + { + "entropy": 1.184033825993538, + "epoch": 1.7349999999999999, + "grad_norm": 0.37634584307670593, + "learning_rate": 8.888888888888889e-05, + "loss": 1.1856, + "mean_token_accuracy": 0.7222929745912552, + "num_tokens": 13068870.0, + "step": 1041 + }, + { + "entropy": 0.9711630195379257, + "epoch": 1.7366666666666668, + "grad_norm": 0.40814366936683655, + "learning_rate": 8.87719298245614e-05, + "loss": 0.9579, + "mean_token_accuracy": 0.7746291309595108, + "num_tokens": 13081147.0, + "step": 1042 + }, + { + "entropy": 0.9708935245871544, + "epoch": 1.7383333333333333, + "grad_norm": 0.3281678557395935, + "learning_rate": 8.865497076023393e-05, + "loss": 0.9806, + "mean_token_accuracy": 0.7686115130782127, + "num_tokens": 13093666.0, + "step": 1043 + }, + { + "entropy": 1.0442078933119774, + "epoch": 1.74, + "grad_norm": 0.4168369174003601, + "learning_rate": 8.853801169590645e-05, + "loss": 1.0271, + "mean_token_accuracy": 0.7528911083936691, + "num_tokens": 13106165.0, + "step": 1044 + }, + { + "entropy": 1.1267547607421875, + "epoch": 1.7416666666666667, + "grad_norm": 0.47203320264816284, + "learning_rate": 8.842105263157894e-05, + "loss": 1.1349, + "mean_token_accuracy": 0.7340073138475418, + "num_tokens": 13118860.0, + "step": 1045 + }, + { + "entropy": 0.9274444133043289, + "epoch": 1.7433333333333332, + "grad_norm": 0.5290305614471436, + "learning_rate": 8.830409356725147e-05, + "loss": 0.9147, + "mean_token_accuracy": 0.7824391052126884, + "num_tokens": 13131405.0, + "step": 1046 + }, + { + "entropy": 0.9275342971086502, + "epoch": 1.745, + "grad_norm": 0.5540999174118042, + "learning_rate": 8.818713450292399e-05, + "loss": 0.9224, + "mean_token_accuracy": 0.7760967463254929, + "num_tokens": 13143980.0, + "step": 1047 + }, + { + "entropy": 1.041547805070877, + "epoch": 1.7466666666666666, + "grad_norm": 0.4107725918292999, + "learning_rate": 8.807017543859649e-05, + "loss": 1.0375, + "mean_token_accuracy": 0.7471731752157211, + "num_tokens": 13156625.0, + "step": 1048 + }, + { + "entropy": 1.0120449364185333, + "epoch": 1.7483333333333333, + "grad_norm": 0.3597240149974823, + "learning_rate": 8.795321637426901e-05, + "loss": 1.0087, + "mean_token_accuracy": 0.7546082735061646, + "num_tokens": 13169318.0, + "step": 1049 + }, + { + "entropy": 0.9050641730427742, + "epoch": 1.75, + "grad_norm": 0.36369770765304565, + "learning_rate": 8.783625730994153e-05, + "loss": 0.893, + "mean_token_accuracy": 0.7888905927538872, + "num_tokens": 13181903.0, + "step": 1050 + }, + { + "entropy": 1.096294365823269, + "epoch": 1.7516666666666667, + "grad_norm": 0.3386310040950775, + "learning_rate": 8.771929824561403e-05, + "loss": 1.0779, + "mean_token_accuracy": 0.746938169002533, + "num_tokens": 13194482.0, + "step": 1051 + }, + { + "entropy": 0.9471368938684464, + "epoch": 1.7533333333333334, + "grad_norm": 0.3317272961139679, + "learning_rate": 8.760233918128655e-05, + "loss": 0.9503, + "mean_token_accuracy": 0.7712305337190628, + "num_tokens": 13206825.0, + "step": 1052 + }, + { + "entropy": 0.9653787389397621, + "epoch": 1.755, + "grad_norm": 0.3206188678741455, + "learning_rate": 8.748538011695907e-05, + "loss": 0.9591, + "mean_token_accuracy": 0.776692196726799, + "num_tokens": 13219412.0, + "step": 1053 + }, + { + "entropy": 0.9689168483018875, + "epoch": 1.7566666666666668, + "grad_norm": 0.311023473739624, + "learning_rate": 8.736842105263158e-05, + "loss": 0.9732, + "mean_token_accuracy": 0.7670754492282867, + "num_tokens": 13232192.0, + "step": 1054 + }, + { + "entropy": 1.0213828086853027, + "epoch": 1.7583333333333333, + "grad_norm": 0.3334166705608368, + "learning_rate": 8.72514619883041e-05, + "loss": 1.0207, + "mean_token_accuracy": 0.7589734867215157, + "num_tokens": 13244638.0, + "step": 1055 + }, + { + "entropy": 1.1304619312286377, + "epoch": 1.76, + "grad_norm": 0.3724319636821747, + "learning_rate": 8.713450292397662e-05, + "loss": 1.146, + "mean_token_accuracy": 0.725090965628624, + "num_tokens": 13257030.0, + "step": 1056 + }, + { + "entropy": 1.1443024575710297, + "epoch": 1.7616666666666667, + "grad_norm": 0.3518555760383606, + "learning_rate": 8.701754385964913e-05, + "loss": 1.1252, + "mean_token_accuracy": 0.7382890284061432, + "num_tokens": 13269515.0, + "step": 1057 + }, + { + "entropy": 1.135504774749279, + "epoch": 1.7633333333333332, + "grad_norm": 0.38220056891441345, + "learning_rate": 8.690058479532164e-05, + "loss": 1.1137, + "mean_token_accuracy": 0.7388380318880081, + "num_tokens": 13282187.0, + "step": 1058 + }, + { + "entropy": 1.0930515304207802, + "epoch": 1.7650000000000001, + "grad_norm": 0.42755579948425293, + "learning_rate": 8.678362573099416e-05, + "loss": 1.0558, + "mean_token_accuracy": 0.7507267519831657, + "num_tokens": 13294816.0, + "step": 1059 + }, + { + "entropy": 1.0393217131495476, + "epoch": 1.7666666666666666, + "grad_norm": 0.34985482692718506, + "learning_rate": 8.666666666666667e-05, + "loss": 1.0237, + "mean_token_accuracy": 0.7579164057970047, + "num_tokens": 13307462.0, + "step": 1060 + }, + { + "entropy": 1.183841995894909, + "epoch": 1.7683333333333333, + "grad_norm": 0.3967154324054718, + "learning_rate": 8.654970760233918e-05, + "loss": 1.1676, + "mean_token_accuracy": 0.7235167846083641, + "num_tokens": 13319817.0, + "step": 1061 + }, + { + "entropy": 1.0351843312382698, + "epoch": 1.77, + "grad_norm": 0.4487072825431824, + "learning_rate": 8.64327485380117e-05, + "loss": 1.0107, + "mean_token_accuracy": 0.7641036361455917, + "num_tokens": 13332520.0, + "step": 1062 + }, + { + "entropy": 1.0560117810964584, + "epoch": 1.7716666666666665, + "grad_norm": 0.3373064398765564, + "learning_rate": 8.631578947368421e-05, + "loss": 1.0522, + "mean_token_accuracy": 0.7510386854410172, + "num_tokens": 13345292.0, + "step": 1063 + }, + { + "entropy": 0.826760470867157, + "epoch": 1.7733333333333334, + "grad_norm": 0.33728596568107605, + "learning_rate": 8.619883040935673e-05, + "loss": 0.8244, + "mean_token_accuracy": 0.8044012635946274, + "num_tokens": 13357697.0, + "step": 1064 + }, + { + "entropy": 1.1935306042432785, + "epoch": 1.775, + "grad_norm": 0.519767701625824, + "learning_rate": 8.608187134502924e-05, + "loss": 1.2138, + "mean_token_accuracy": 0.7204710319638252, + "num_tokens": 13370108.0, + "step": 1065 + }, + { + "entropy": 0.9616173505783081, + "epoch": 1.7766666666666666, + "grad_norm": 0.3920440375804901, + "learning_rate": 8.596491228070177e-05, + "loss": 0.9605, + "mean_token_accuracy": 0.7691715583205223, + "num_tokens": 13382572.0, + "step": 1066 + }, + { + "entropy": 0.9955972135066986, + "epoch": 1.7783333333333333, + "grad_norm": 0.31550100445747375, + "learning_rate": 8.584795321637428e-05, + "loss": 0.9734, + "mean_token_accuracy": 0.7609797418117523, + "num_tokens": 13395370.0, + "step": 1067 + }, + { + "entropy": 0.9564253985881805, + "epoch": 1.78, + "grad_norm": 0.49894317984580994, + "learning_rate": 8.573099415204678e-05, + "loss": 0.9603, + "mean_token_accuracy": 0.768037311732769, + "num_tokens": 13407807.0, + "step": 1068 + }, + { + "entropy": 0.9421810433268547, + "epoch": 1.7816666666666667, + "grad_norm": 0.8620859980583191, + "learning_rate": 8.561403508771931e-05, + "loss": 0.9432, + "mean_token_accuracy": 0.7720376253128052, + "num_tokens": 13420682.0, + "step": 1069 + }, + { + "entropy": 0.9941176548600197, + "epoch": 1.7833333333333332, + "grad_norm": 0.32992666959762573, + "learning_rate": 8.549707602339182e-05, + "loss": 0.9752, + "mean_token_accuracy": 0.7662701159715652, + "num_tokens": 13433310.0, + "step": 1070 + }, + { + "entropy": 0.9855979382991791, + "epoch": 1.7850000000000001, + "grad_norm": 0.8007895946502686, + "learning_rate": 8.538011695906433e-05, + "loss": 0.9685, + "mean_token_accuracy": 0.76729516685009, + "num_tokens": 13445953.0, + "step": 1071 + }, + { + "entropy": 1.029094435274601, + "epoch": 1.7866666666666666, + "grad_norm": 0.7585043907165527, + "learning_rate": 8.526315789473685e-05, + "loss": 1.041, + "mean_token_accuracy": 0.7518841326236725, + "num_tokens": 13458298.0, + "step": 1072 + }, + { + "entropy": 1.072975106537342, + "epoch": 1.7883333333333333, + "grad_norm": 0.35254842042922974, + "learning_rate": 8.514619883040936e-05, + "loss": 1.0819, + "mean_token_accuracy": 0.7426500543951988, + "num_tokens": 13470965.0, + "step": 1073 + }, + { + "entropy": 0.9032114669680595, + "epoch": 1.79, + "grad_norm": 0.3355119824409485, + "learning_rate": 8.502923976608188e-05, + "loss": 0.9127, + "mean_token_accuracy": 0.7799766734242439, + "num_tokens": 13483518.0, + "step": 1074 + }, + { + "entropy": 0.9452763050794601, + "epoch": 1.7916666666666665, + "grad_norm": 0.7401320338249207, + "learning_rate": 8.491228070175439e-05, + "loss": 0.9524, + "mean_token_accuracy": 0.773579441010952, + "num_tokens": 13496222.0, + "step": 1075 + }, + { + "entropy": 1.1635581478476524, + "epoch": 1.7933333333333334, + "grad_norm": 0.39469605684280396, + "learning_rate": 8.47953216374269e-05, + "loss": 1.1658, + "mean_token_accuracy": 0.7263970300555229, + "num_tokens": 13508916.0, + "step": 1076 + }, + { + "entropy": 0.9113326743245125, + "epoch": 1.795, + "grad_norm": 0.32929447293281555, + "learning_rate": 8.467836257309942e-05, + "loss": 0.8863, + "mean_token_accuracy": 0.7842982038855553, + "num_tokens": 13521457.0, + "step": 1077 + }, + { + "entropy": 1.1759463623166084, + "epoch": 1.7966666666666666, + "grad_norm": 0.40519341826438904, + "learning_rate": 8.456140350877193e-05, + "loss": 1.1794, + "mean_token_accuracy": 0.7177974060177803, + "num_tokens": 13534057.0, + "step": 1078 + }, + { + "entropy": 1.0619780719280243, + "epoch": 1.7983333333333333, + "grad_norm": 0.4279235601425171, + "learning_rate": 8.444444444444444e-05, + "loss": 1.071, + "mean_token_accuracy": 0.7492412179708481, + "num_tokens": 13546665.0, + "step": 1079 + }, + { + "entropy": 1.0635966658592224, + "epoch": 1.8, + "grad_norm": 0.7339469790458679, + "learning_rate": 8.432748538011697e-05, + "loss": 1.0278, + "mean_token_accuracy": 0.7510287240147591, + "num_tokens": 13559479.0, + "step": 1080 + }, + { + "epoch": 1.8, + "eval_entropy": 1.1268045465048195, + "eval_loss": 1.1247466802597046, + "eval_mean_token_accuracy": 0.7332956134105196, + "eval_num_tokens": 13559479.0, + "eval_runtime": 2667.8789, + "eval_samples_per_second": 1.875, + "eval_steps_per_second": 0.937, + "step": 1080 + }, + { + "entropy": 1.0391376838088036, + "epoch": 1.8016666666666667, + "grad_norm": 0.3413653075695038, + "learning_rate": 8.421052631578948e-05, + "loss": 1.0279, + "mean_token_accuracy": 0.7522547543048859, + "num_tokens": 13571812.0, + "step": 1081 + }, + { + "entropy": 1.039592519402504, + "epoch": 1.8033333333333332, + "grad_norm": 0.7185345888137817, + "learning_rate": 8.409356725146199e-05, + "loss": 1.0245, + "mean_token_accuracy": 0.7574363052845001, + "num_tokens": 13584277.0, + "step": 1082 + }, + { + "entropy": 1.0078017935156822, + "epoch": 1.8050000000000002, + "grad_norm": 0.601192057132721, + "learning_rate": 8.397660818713451e-05, + "loss": 1.0029, + "mean_token_accuracy": 0.7555301338434219, + "num_tokens": 13596887.0, + "step": 1083 + }, + { + "entropy": 1.1202399730682373, + "epoch": 1.8066666666666666, + "grad_norm": 0.36545878648757935, + "learning_rate": 8.385964912280703e-05, + "loss": 1.1177, + "mean_token_accuracy": 0.740936741232872, + "num_tokens": 13609592.0, + "step": 1084 + }, + { + "entropy": 1.0592864975333214, + "epoch": 1.8083333333333333, + "grad_norm": 0.39112910628318787, + "learning_rate": 8.374269005847953e-05, + "loss": 1.0921, + "mean_token_accuracy": 0.7445860356092453, + "num_tokens": 13622268.0, + "step": 1085 + }, + { + "entropy": 1.1477596685290337, + "epoch": 1.81, + "grad_norm": 0.481083482503891, + "learning_rate": 8.362573099415205e-05, + "loss": 1.1132, + "mean_token_accuracy": 0.7333399578928947, + "num_tokens": 13634812.0, + "step": 1086 + }, + { + "entropy": 1.0418332889676094, + "epoch": 1.8116666666666665, + "grad_norm": 0.580384373664856, + "learning_rate": 8.350877192982457e-05, + "loss": 1.0395, + "mean_token_accuracy": 0.7594472095370293, + "num_tokens": 13647258.0, + "step": 1087 + }, + { + "entropy": 0.917961873114109, + "epoch": 1.8133333333333335, + "grad_norm": 0.6606692671775818, + "learning_rate": 8.339181286549708e-05, + "loss": 0.9231, + "mean_token_accuracy": 0.7793416678905487, + "num_tokens": 13659835.0, + "step": 1088 + }, + { + "entropy": 1.1957320272922516, + "epoch": 1.815, + "grad_norm": 0.46521374583244324, + "learning_rate": 8.327485380116959e-05, + "loss": 1.1858, + "mean_token_accuracy": 0.7182819619774818, + "num_tokens": 13672228.0, + "step": 1089 + }, + { + "entropy": 1.0594796538352966, + "epoch": 1.8166666666666667, + "grad_norm": 0.4050081670284271, + "learning_rate": 8.315789473684212e-05, + "loss": 1.0571, + "mean_token_accuracy": 0.7443678379058838, + "num_tokens": 13684887.0, + "step": 1090 + }, + { + "entropy": 0.9371944293379784, + "epoch": 1.8183333333333334, + "grad_norm": 0.9697496891021729, + "learning_rate": 8.304093567251462e-05, + "loss": 0.9083, + "mean_token_accuracy": 0.7763741835951805, + "num_tokens": 13697492.0, + "step": 1091 + }, + { + "entropy": 1.0318915471434593, + "epoch": 1.8199999999999998, + "grad_norm": 0.5709109306335449, + "learning_rate": 8.292397660818713e-05, + "loss": 1.0158, + "mean_token_accuracy": 0.754377044737339, + "num_tokens": 13709877.0, + "step": 1092 + }, + { + "entropy": 1.1231976374983788, + "epoch": 1.8216666666666668, + "grad_norm": 0.3857564330101013, + "learning_rate": 8.280701754385966e-05, + "loss": 1.1226, + "mean_token_accuracy": 0.7332571968436241, + "num_tokens": 13722363.0, + "step": 1093 + }, + { + "entropy": 1.080797977745533, + "epoch": 1.8233333333333333, + "grad_norm": 0.41505011916160583, + "learning_rate": 8.269005847953217e-05, + "loss": 1.0847, + "mean_token_accuracy": 0.7445045188069344, + "num_tokens": 13735045.0, + "step": 1094 + }, + { + "entropy": 1.108239695429802, + "epoch": 1.825, + "grad_norm": 0.5872892737388611, + "learning_rate": 8.257309941520468e-05, + "loss": 1.1088, + "mean_token_accuracy": 0.7337179258465767, + "num_tokens": 13747457.0, + "step": 1095 + }, + { + "entropy": 1.020391158759594, + "epoch": 1.8266666666666667, + "grad_norm": 0.8762550950050354, + "learning_rate": 8.24561403508772e-05, + "loss": 1.0069, + "mean_token_accuracy": 0.7582212015986443, + "num_tokens": 13760067.0, + "step": 1096 + }, + { + "entropy": 0.9287305921316147, + "epoch": 1.8283333333333334, + "grad_norm": 0.4548966884613037, + "learning_rate": 8.233918128654972e-05, + "loss": 0.9154, + "mean_token_accuracy": 0.7810219079256058, + "num_tokens": 13772746.0, + "step": 1097 + }, + { + "entropy": 1.1012649685144424, + "epoch": 1.83, + "grad_norm": 0.3358611464500427, + "learning_rate": 8.222222222222222e-05, + "loss": 1.0686, + "mean_token_accuracy": 0.7442895025014877, + "num_tokens": 13785306.0, + "step": 1098 + }, + { + "entropy": 0.9027578607201576, + "epoch": 1.8316666666666666, + "grad_norm": 0.32184484601020813, + "learning_rate": 8.210526315789474e-05, + "loss": 0.9001, + "mean_token_accuracy": 0.7844014763832092, + "num_tokens": 13798056.0, + "step": 1099 + }, + { + "entropy": 0.9975152164697647, + "epoch": 1.8333333333333335, + "grad_norm": 0.4850866496562958, + "learning_rate": 8.198830409356726e-05, + "loss": 1.0004, + "mean_token_accuracy": 0.763169527053833, + "num_tokens": 13810558.0, + "step": 1100 + }, + { + "entropy": 0.9758845418691635, + "epoch": 1.835, + "grad_norm": 0.32519644498825073, + "learning_rate": 8.187134502923976e-05, + "loss": 0.9507, + "mean_token_accuracy": 0.7746480032801628, + "num_tokens": 13822952.0, + "step": 1101 + }, + { + "entropy": 0.9918239563703537, + "epoch": 1.8366666666666667, + "grad_norm": 0.3663935363292694, + "learning_rate": 8.175438596491228e-05, + "loss": 0.956, + "mean_token_accuracy": 0.7692156657576561, + "num_tokens": 13835592.0, + "step": 1102 + }, + { + "entropy": 1.0694205686450005, + "epoch": 1.8383333333333334, + "grad_norm": 0.360186904668808, + "learning_rate": 8.16374269005848e-05, + "loss": 1.0338, + "mean_token_accuracy": 0.7491528615355492, + "num_tokens": 13848105.0, + "step": 1103 + }, + { + "entropy": 0.9077115952968597, + "epoch": 1.8399999999999999, + "grad_norm": 0.35274380445480347, + "learning_rate": 8.152046783625732e-05, + "loss": 0.9347, + "mean_token_accuracy": 0.778414212167263, + "num_tokens": 13860759.0, + "step": 1104 + }, + { + "entropy": 1.0398012027144432, + "epoch": 1.8416666666666668, + "grad_norm": 0.3670746982097626, + "learning_rate": 8.140350877192983e-05, + "loss": 1.0687, + "mean_token_accuracy": 0.7458862215280533, + "num_tokens": 13873203.0, + "step": 1105 + }, + { + "entropy": 1.0697217732667923, + "epoch": 1.8433333333333333, + "grad_norm": 0.37493640184402466, + "learning_rate": 8.128654970760235e-05, + "loss": 1.0786, + "mean_token_accuracy": 0.7431371510028839, + "num_tokens": 13885591.0, + "step": 1106 + }, + { + "entropy": 1.0747253149747849, + "epoch": 1.845, + "grad_norm": 0.3651712238788605, + "learning_rate": 8.116959064327486e-05, + "loss": 1.0785, + "mean_token_accuracy": 0.7437972277402878, + "num_tokens": 13898295.0, + "step": 1107 + }, + { + "entropy": 1.0024217069149017, + "epoch": 1.8466666666666667, + "grad_norm": 0.3548741936683655, + "learning_rate": 8.105263157894737e-05, + "loss": 0.987, + "mean_token_accuracy": 0.7618882954120636, + "num_tokens": 13911232.0, + "step": 1108 + }, + { + "entropy": 0.9793405681848526, + "epoch": 1.8483333333333334, + "grad_norm": 0.30992835760116577, + "learning_rate": 8.093567251461989e-05, + "loss": 0.9603, + "mean_token_accuracy": 0.764976479113102, + "num_tokens": 13923772.0, + "step": 1109 + }, + { + "entropy": 1.1670287922024727, + "epoch": 1.85, + "grad_norm": 0.36640802025794983, + "learning_rate": 8.08187134502924e-05, + "loss": 1.1756, + "mean_token_accuracy": 0.7263910621404648, + "num_tokens": 13936060.0, + "step": 1110 + }, + { + "entropy": 1.0384756848216057, + "epoch": 1.8516666666666666, + "grad_norm": 0.41028112173080444, + "learning_rate": 8.070175438596491e-05, + "loss": 1.0252, + "mean_token_accuracy": 0.7612837627530098, + "num_tokens": 13948618.0, + "step": 1111 + }, + { + "entropy": 1.0971432998776436, + "epoch": 1.8533333333333335, + "grad_norm": 0.3250121474266052, + "learning_rate": 8.058479532163743e-05, + "loss": 1.1201, + "mean_token_accuracy": 0.7379398792982101, + "num_tokens": 13961234.0, + "step": 1112 + }, + { + "entropy": 1.0641928985714912, + "epoch": 1.855, + "grad_norm": 0.41859084367752075, + "learning_rate": 8.046783625730994e-05, + "loss": 1.0217, + "mean_token_accuracy": 0.7521350830793381, + "num_tokens": 13973879.0, + "step": 1113 + }, + { + "entropy": 1.1233059540390968, + "epoch": 1.8566666666666667, + "grad_norm": 0.37480679154396057, + "learning_rate": 8.035087719298246e-05, + "loss": 1.0989, + "mean_token_accuracy": 0.7355039641261101, + "num_tokens": 13986299.0, + "step": 1114 + }, + { + "entropy": 0.9380587711930275, + "epoch": 1.8583333333333334, + "grad_norm": 1.1096789836883545, + "learning_rate": 8.023391812865497e-05, + "loss": 0.8959, + "mean_token_accuracy": 0.7882434278726578, + "num_tokens": 13998985.0, + "step": 1115 + }, + { + "entropy": 0.9945440739393234, + "epoch": 1.8599999999999999, + "grad_norm": 0.31984928250312805, + "learning_rate": 8.01169590643275e-05, + "loss": 0.989, + "mean_token_accuracy": 0.7666028961539268, + "num_tokens": 14011688.0, + "step": 1116 + }, + { + "entropy": 1.0660679265856743, + "epoch": 1.8616666666666668, + "grad_norm": 0.5125501751899719, + "learning_rate": 8e-05, + "loss": 1.0447, + "mean_token_accuracy": 0.7504593953490257, + "num_tokens": 14024075.0, + "step": 1117 + }, + { + "entropy": 1.1582975387573242, + "epoch": 1.8633333333333333, + "grad_norm": 0.3501436114311218, + "learning_rate": 7.988304093567252e-05, + "loss": 1.1421, + "mean_token_accuracy": 0.7337515875697136, + "num_tokens": 14036125.0, + "step": 1118 + }, + { + "entropy": 1.1479779705405235, + "epoch": 1.865, + "grad_norm": 0.3567291796207428, + "learning_rate": 7.976608187134504e-05, + "loss": 1.1642, + "mean_token_accuracy": 0.7288005948066711, + "num_tokens": 14048797.0, + "step": 1119 + }, + { + "entropy": 0.9859679490327835, + "epoch": 1.8666666666666667, + "grad_norm": 0.46824610233306885, + "learning_rate": 7.964912280701755e-05, + "loss": 0.9647, + "mean_token_accuracy": 0.7705734744668007, + "num_tokens": 14061198.0, + "step": 1120 + }, + { + "entropy": 1.1900914385914803, + "epoch": 1.8683333333333332, + "grad_norm": 0.3760721981525421, + "learning_rate": 7.953216374269006e-05, + "loss": 1.1866, + "mean_token_accuracy": 0.7222587689757347, + "num_tokens": 14073488.0, + "step": 1121 + }, + { + "entropy": 1.1246634796261787, + "epoch": 1.87, + "grad_norm": 0.3690677583217621, + "learning_rate": 7.941520467836258e-05, + "loss": 1.0971, + "mean_token_accuracy": 0.7379028648138046, + "num_tokens": 14085954.0, + "step": 1122 + }, + { + "entropy": 0.9077712148427963, + "epoch": 1.8716666666666666, + "grad_norm": 0.3517340123653412, + "learning_rate": 7.929824561403509e-05, + "loss": 0.9047, + "mean_token_accuracy": 0.7790766954421997, + "num_tokens": 14098451.0, + "step": 1123 + }, + { + "entropy": 1.0539271980524063, + "epoch": 1.8733333333333333, + "grad_norm": 0.48315373063087463, + "learning_rate": 7.91812865497076e-05, + "loss": 1.0756, + "mean_token_accuracy": 0.7477589771151543, + "num_tokens": 14110964.0, + "step": 1124 + }, + { + "entropy": 1.1417809948325157, + "epoch": 1.875, + "grad_norm": 0.3321865200996399, + "learning_rate": 7.906432748538012e-05, + "loss": 1.1316, + "mean_token_accuracy": 0.7366370558738708, + "num_tokens": 14123427.0, + "step": 1125 + }, + { + "entropy": 0.9822080656886101, + "epoch": 1.8766666666666667, + "grad_norm": 0.5568637251853943, + "learning_rate": 7.894736842105263e-05, + "loss": 0.9742, + "mean_token_accuracy": 0.7670710012316704, + "num_tokens": 14135842.0, + "step": 1126 + }, + { + "entropy": 1.0455272644758224, + "epoch": 1.8783333333333334, + "grad_norm": 0.5140118598937988, + "learning_rate": 7.883040935672516e-05, + "loss": 1.0249, + "mean_token_accuracy": 0.7504927515983582, + "num_tokens": 14148308.0, + "step": 1127 + }, + { + "entropy": 1.0229843482375145, + "epoch": 1.88, + "grad_norm": 0.5705822110176086, + "learning_rate": 7.871345029239767e-05, + "loss": 0.9991, + "mean_token_accuracy": 0.7572809234261513, + "num_tokens": 14160786.0, + "step": 1128 + }, + { + "entropy": 0.9899871572852135, + "epoch": 1.8816666666666668, + "grad_norm": 0.3809033930301666, + "learning_rate": 7.859649122807017e-05, + "loss": 0.9683, + "mean_token_accuracy": 0.7672120854258537, + "num_tokens": 14173341.0, + "step": 1129 + }, + { + "entropy": 1.1007420122623444, + "epoch": 1.8833333333333333, + "grad_norm": 0.35236722230911255, + "learning_rate": 7.84795321637427e-05, + "loss": 1.0798, + "mean_token_accuracy": 0.7496308162808418, + "num_tokens": 14186069.0, + "step": 1130 + }, + { + "entropy": 0.976139709353447, + "epoch": 1.885, + "grad_norm": 0.47841936349868774, + "learning_rate": 7.836257309941521e-05, + "loss": 0.9598, + "mean_token_accuracy": 0.7672455906867981, + "num_tokens": 14198471.0, + "step": 1131 + }, + { + "entropy": 0.991685077548027, + "epoch": 1.8866666666666667, + "grad_norm": 0.34977567195892334, + "learning_rate": 7.824561403508772e-05, + "loss": 0.9799, + "mean_token_accuracy": 0.763420507311821, + "num_tokens": 14211100.0, + "step": 1132 + }, + { + "entropy": 1.1330292448401451, + "epoch": 1.8883333333333332, + "grad_norm": 0.4268051087856293, + "learning_rate": 7.812865497076024e-05, + "loss": 1.1339, + "mean_token_accuracy": 0.7320972010493279, + "num_tokens": 14223613.0, + "step": 1133 + }, + { + "entropy": 1.100053831934929, + "epoch": 1.8900000000000001, + "grad_norm": 0.3878801465034485, + "learning_rate": 7.801169590643275e-05, + "loss": 1.0923, + "mean_token_accuracy": 0.7437557876110077, + "num_tokens": 14235887.0, + "step": 1134 + }, + { + "entropy": 1.080342672765255, + "epoch": 1.8916666666666666, + "grad_norm": 0.4604344964027405, + "learning_rate": 7.789473684210526e-05, + "loss": 1.0718, + "mean_token_accuracy": 0.7443812191486359, + "num_tokens": 14248763.0, + "step": 1135 + }, + { + "entropy": 0.9030890613794327, + "epoch": 1.8933333333333333, + "grad_norm": 0.5369459390640259, + "learning_rate": 7.777777777777778e-05, + "loss": 0.8915, + "mean_token_accuracy": 0.7843483313918114, + "num_tokens": 14261278.0, + "step": 1136 + }, + { + "entropy": 0.9645693078637123, + "epoch": 1.895, + "grad_norm": 0.8396739959716797, + "learning_rate": 7.76608187134503e-05, + "loss": 0.9532, + "mean_token_accuracy": 0.7656803950667381, + "num_tokens": 14273802.0, + "step": 1137 + }, + { + "entropy": 1.0688777342438698, + "epoch": 1.8966666666666665, + "grad_norm": 0.34922143816947937, + "learning_rate": 7.754385964912281e-05, + "loss": 1.0548, + "mean_token_accuracy": 0.7512115687131882, + "num_tokens": 14286137.0, + "step": 1138 + }, + { + "entropy": 1.133958362042904, + "epoch": 1.8983333333333334, + "grad_norm": 0.9323358535766602, + "learning_rate": 7.742690058479532e-05, + "loss": 1.1478, + "mean_token_accuracy": 0.7285942807793617, + "num_tokens": 14298729.0, + "step": 1139 + }, + { + "entropy": 1.082243151962757, + "epoch": 1.9, + "grad_norm": 0.6908741593360901, + "learning_rate": 7.730994152046785e-05, + "loss": 1.0916, + "mean_token_accuracy": 0.7418855577707291, + "num_tokens": 14311029.0, + "step": 1140 + }, + { + "entropy": 1.1843367964029312, + "epoch": 1.9016666666666666, + "grad_norm": 0.3764801323413849, + "learning_rate": 7.719298245614036e-05, + "loss": 1.208, + "mean_token_accuracy": 0.7169347703456879, + "num_tokens": 14323313.0, + "step": 1141 + }, + { + "entropy": 1.1391602233052254, + "epoch": 1.9033333333333333, + "grad_norm": 0.5154283046722412, + "learning_rate": 7.707602339181287e-05, + "loss": 1.1108, + "mean_token_accuracy": 0.7377238571643829, + "num_tokens": 14335801.0, + "step": 1142 + }, + { + "entropy": 1.0269312635064125, + "epoch": 1.905, + "grad_norm": 0.3648120164871216, + "learning_rate": 7.695906432748539e-05, + "loss": 1.0329, + "mean_token_accuracy": 0.7542443946003914, + "num_tokens": 14347942.0, + "step": 1143 + }, + { + "entropy": 1.0888805985450745, + "epoch": 1.9066666666666667, + "grad_norm": 0.4464230537414551, + "learning_rate": 7.68421052631579e-05, + "loss": 1.0974, + "mean_token_accuracy": 0.7424792423844337, + "num_tokens": 14360623.0, + "step": 1144 + }, + { + "entropy": 1.0239420160651207, + "epoch": 1.9083333333333332, + "grad_norm": 0.5623224973678589, + "learning_rate": 7.672514619883041e-05, + "loss": 1.0, + "mean_token_accuracy": 0.7568425610661507, + "num_tokens": 14373395.0, + "step": 1145 + }, + { + "entropy": 0.9860764443874359, + "epoch": 1.9100000000000001, + "grad_norm": 0.4440317451953888, + "learning_rate": 7.660818713450293e-05, + "loss": 0.9882, + "mean_token_accuracy": 0.7602974772453308, + "num_tokens": 14385773.0, + "step": 1146 + }, + { + "entropy": 1.0706650838255882, + "epoch": 1.9116666666666666, + "grad_norm": 0.6666054725646973, + "learning_rate": 7.649122807017545e-05, + "loss": 1.0785, + "mean_token_accuracy": 0.7401752322912216, + "num_tokens": 14398206.0, + "step": 1147 + }, + { + "entropy": 1.0299118682742119, + "epoch": 1.9133333333333333, + "grad_norm": 0.43782472610473633, + "learning_rate": 7.637426900584795e-05, + "loss": 1.0285, + "mean_token_accuracy": 0.7530301362276077, + "num_tokens": 14411008.0, + "step": 1148 + }, + { + "entropy": 0.9420201480388641, + "epoch": 1.915, + "grad_norm": 0.3290242850780487, + "learning_rate": 7.625730994152047e-05, + "loss": 0.9101, + "mean_token_accuracy": 0.7839518040418625, + "num_tokens": 14423537.0, + "step": 1149 + }, + { + "entropy": 1.0861295089125633, + "epoch": 1.9166666666666665, + "grad_norm": 0.37817490100860596, + "learning_rate": 7.6140350877193e-05, + "loss": 1.0851, + "mean_token_accuracy": 0.7391221076250076, + "num_tokens": 14435786.0, + "step": 1150 + }, + { + "entropy": 1.0331864580512047, + "epoch": 1.9183333333333334, + "grad_norm": 0.5804871320724487, + "learning_rate": 7.602339181286549e-05, + "loss": 1.0234, + "mean_token_accuracy": 0.7551736012101173, + "num_tokens": 14448139.0, + "step": 1151 + }, + { + "entropy": 1.1816378012299538, + "epoch": 1.92, + "grad_norm": 0.6650798916816711, + "learning_rate": 7.590643274853801e-05, + "loss": 1.1801, + "mean_token_accuracy": 0.7287933006882668, + "num_tokens": 14460512.0, + "step": 1152 + }, + { + "entropy": 0.9338883832097054, + "epoch": 1.9216666666666666, + "grad_norm": 0.43626463413238525, + "learning_rate": 7.578947368421054e-05, + "loss": 0.8948, + "mean_token_accuracy": 0.7890411987900734, + "num_tokens": 14473289.0, + "step": 1153 + }, + { + "entropy": 1.0590982139110565, + "epoch": 1.9233333333333333, + "grad_norm": 0.8146944642066956, + "learning_rate": 7.567251461988303e-05, + "loss": 1.0317, + "mean_token_accuracy": 0.755891315639019, + "num_tokens": 14485761.0, + "step": 1154 + }, + { + "entropy": 1.0805394351482391, + "epoch": 1.925, + "grad_norm": 0.5242136716842651, + "learning_rate": 7.555555555555556e-05, + "loss": 1.0615, + "mean_token_accuracy": 0.7452685311436653, + "num_tokens": 14498365.0, + "step": 1155 + }, + { + "entropy": 0.9772091507911682, + "epoch": 1.9266666666666667, + "grad_norm": 0.3373044431209564, + "learning_rate": 7.543859649122808e-05, + "loss": 0.9666, + "mean_token_accuracy": 0.7663690894842148, + "num_tokens": 14511073.0, + "step": 1156 + }, + { + "entropy": 0.932278111577034, + "epoch": 1.9283333333333332, + "grad_norm": 0.39301741123199463, + "learning_rate": 7.532163742690059e-05, + "loss": 0.9284, + "mean_token_accuracy": 0.7763963863253593, + "num_tokens": 14523707.0, + "step": 1157 + }, + { + "entropy": 1.015767127275467, + "epoch": 1.9300000000000002, + "grad_norm": 0.3373499810695648, + "learning_rate": 7.52046783625731e-05, + "loss": 1.0106, + "mean_token_accuracy": 0.7596909403800964, + "num_tokens": 14536042.0, + "step": 1158 + }, + { + "entropy": 1.030866727232933, + "epoch": 1.9316666666666666, + "grad_norm": 0.41007867455482483, + "learning_rate": 7.508771929824562e-05, + "loss": 1.0411, + "mean_token_accuracy": 0.7504064664244652, + "num_tokens": 14548393.0, + "step": 1159 + }, + { + "entropy": 1.0437910482287407, + "epoch": 1.9333333333333333, + "grad_norm": 0.3775924742221832, + "learning_rate": 7.497076023391813e-05, + "loss": 1.0591, + "mean_token_accuracy": 0.7515989542007446, + "num_tokens": 14561064.0, + "step": 1160 + }, + { + "entropy": 0.9633251279592514, + "epoch": 1.935, + "grad_norm": 0.424560546875, + "learning_rate": 7.485380116959064e-05, + "loss": 0.9581, + "mean_token_accuracy": 0.7683878242969513, + "num_tokens": 14573528.0, + "step": 1161 + }, + { + "entropy": 1.0720863342285156, + "epoch": 1.9366666666666665, + "grad_norm": 0.3431107699871063, + "learning_rate": 7.473684210526316e-05, + "loss": 1.0604, + "mean_token_accuracy": 0.7436831146478653, + "num_tokens": 14586019.0, + "step": 1162 + }, + { + "entropy": 1.201435960829258, + "epoch": 1.9383333333333335, + "grad_norm": 0.4237931966781616, + "learning_rate": 7.461988304093567e-05, + "loss": 1.2042, + "mean_token_accuracy": 0.7172101438045502, + "num_tokens": 14598571.0, + "step": 1163 + }, + { + "entropy": 1.0133287012577057, + "epoch": 1.94, + "grad_norm": 0.3479676842689514, + "learning_rate": 7.450292397660818e-05, + "loss": 1.0352, + "mean_token_accuracy": 0.7594067007303238, + "num_tokens": 14611466.0, + "step": 1164 + }, + { + "entropy": 0.9939203038811684, + "epoch": 1.9416666666666667, + "grad_norm": 0.5237852334976196, + "learning_rate": 7.43859649122807e-05, + "loss": 0.9818, + "mean_token_accuracy": 0.7681661173701286, + "num_tokens": 14624098.0, + "step": 1165 + }, + { + "entropy": 1.0392756760120392, + "epoch": 1.9433333333333334, + "grad_norm": 0.38556504249572754, + "learning_rate": 7.426900584795321e-05, + "loss": 1.0402, + "mean_token_accuracy": 0.7585587650537491, + "num_tokens": 14636562.0, + "step": 1166 + }, + { + "entropy": 0.9937330782413483, + "epoch": 1.9449999999999998, + "grad_norm": 0.3307511806488037, + "learning_rate": 7.415204678362574e-05, + "loss": 0.9701, + "mean_token_accuracy": 0.7711986675858498, + "num_tokens": 14649136.0, + "step": 1167 + }, + { + "entropy": 0.9863338023424149, + "epoch": 1.9466666666666668, + "grad_norm": 0.32653582096099854, + "learning_rate": 7.403508771929825e-05, + "loss": 0.9534, + "mean_token_accuracy": 0.7664777636528015, + "num_tokens": 14661775.0, + "step": 1168 + }, + { + "entropy": 0.8837543055415154, + "epoch": 1.9483333333333333, + "grad_norm": 0.4141775369644165, + "learning_rate": 7.391812865497077e-05, + "loss": 0.8535, + "mean_token_accuracy": 0.78807432949543, + "num_tokens": 14674301.0, + "step": 1169 + }, + { + "entropy": 1.0114361122250557, + "epoch": 1.95, + "grad_norm": 0.33612117171287537, + "learning_rate": 7.380116959064328e-05, + "loss": 1.0057, + "mean_token_accuracy": 0.7618621662259102, + "num_tokens": 14686839.0, + "step": 1170 + }, + { + "entropy": 1.046335145831108, + "epoch": 1.9516666666666667, + "grad_norm": 0.43380284309387207, + "learning_rate": 7.368421052631579e-05, + "loss": 1.0345, + "mean_token_accuracy": 0.7503594979643822, + "num_tokens": 14699255.0, + "step": 1171 + }, + { + "entropy": 1.0053035020828247, + "epoch": 1.9533333333333334, + "grad_norm": 0.32732105255126953, + "learning_rate": 7.356725146198831e-05, + "loss": 0.998, + "mean_token_accuracy": 0.7619671300053596, + "num_tokens": 14711917.0, + "step": 1172 + }, + { + "entropy": 1.042715035378933, + "epoch": 1.955, + "grad_norm": 0.3852812647819519, + "learning_rate": 7.345029239766082e-05, + "loss": 1.0446, + "mean_token_accuracy": 0.7507964074611664, + "num_tokens": 14724367.0, + "step": 1173 + }, + { + "entropy": 1.0430034920573235, + "epoch": 1.9566666666666666, + "grad_norm": 0.4369751513004303, + "learning_rate": 7.333333333333333e-05, + "loss": 1.0245, + "mean_token_accuracy": 0.7552479654550552, + "num_tokens": 14736839.0, + "step": 1174 + }, + { + "entropy": 1.0451406240463257, + "epoch": 1.9583333333333335, + "grad_norm": 0.5805354118347168, + "learning_rate": 7.321637426900585e-05, + "loss": 1.0104, + "mean_token_accuracy": 0.7555409520864487, + "num_tokens": 14749344.0, + "step": 1175 + }, + { + "entropy": 0.9118838533759117, + "epoch": 1.96, + "grad_norm": 0.33266305923461914, + "learning_rate": 7.309941520467836e-05, + "loss": 0.9036, + "mean_token_accuracy": 0.7849425300955772, + "num_tokens": 14761751.0, + "step": 1176 + }, + { + "entropy": 0.9509773701429367, + "epoch": 1.9616666666666667, + "grad_norm": 0.3681040406227112, + "learning_rate": 7.298245614035089e-05, + "loss": 0.9523, + "mean_token_accuracy": 0.7692973092198372, + "num_tokens": 14774154.0, + "step": 1177 + }, + { + "entropy": 0.9553389064967632, + "epoch": 1.9633333333333334, + "grad_norm": 0.4086499512195587, + "learning_rate": 7.28654970760234e-05, + "loss": 0.9604, + "mean_token_accuracy": 0.7629097029566765, + "num_tokens": 14786843.0, + "step": 1178 + }, + { + "entropy": 0.8695802688598633, + "epoch": 1.9649999999999999, + "grad_norm": 0.32611146569252014, + "learning_rate": 7.27485380116959e-05, + "loss": 0.8522, + "mean_token_accuracy": 0.7970450446009636, + "num_tokens": 14799455.0, + "step": 1179 + }, + { + "entropy": 1.0503259599208832, + "epoch": 1.9666666666666668, + "grad_norm": 0.46494752168655396, + "learning_rate": 7.263157894736843e-05, + "loss": 1.0448, + "mean_token_accuracy": 0.7501124665141106, + "num_tokens": 14811898.0, + "step": 1180 + }, + { + "entropy": 1.0048613548278809, + "epoch": 1.9683333333333333, + "grad_norm": 0.3598049581050873, + "learning_rate": 7.251461988304094e-05, + "loss": 1.02, + "mean_token_accuracy": 0.7541192695498466, + "num_tokens": 14824474.0, + "step": 1181 + }, + { + "entropy": 1.0100720524787903, + "epoch": 1.97, + "grad_norm": 0.39491939544677734, + "learning_rate": 7.239766081871345e-05, + "loss": 0.9959, + "mean_token_accuracy": 0.7641180381178856, + "num_tokens": 14836851.0, + "step": 1182 + }, + { + "entropy": 1.0197961702942848, + "epoch": 1.9716666666666667, + "grad_norm": 0.3616783320903778, + "learning_rate": 7.228070175438597e-05, + "loss": 1.0259, + "mean_token_accuracy": 0.7520253881812096, + "num_tokens": 14849418.0, + "step": 1183 + }, + { + "entropy": 1.0248072817921638, + "epoch": 1.9733333333333334, + "grad_norm": 0.32178795337677, + "learning_rate": 7.216374269005848e-05, + "loss": 1.0013, + "mean_token_accuracy": 0.7647489160299301, + "num_tokens": 14861767.0, + "step": 1184 + }, + { + "entropy": 1.0271701738238335, + "epoch": 1.975, + "grad_norm": 0.3734261989593506, + "learning_rate": 7.204678362573099e-05, + "loss": 1.0044, + "mean_token_accuracy": 0.7597475573420525, + "num_tokens": 14874500.0, + "step": 1185 + }, + { + "entropy": 1.0513724014163017, + "epoch": 1.9766666666666666, + "grad_norm": 0.38311973214149475, + "learning_rate": 7.192982456140351e-05, + "loss": 1.0285, + "mean_token_accuracy": 0.7554675191640854, + "num_tokens": 14886801.0, + "step": 1186 + }, + { + "entropy": 1.0406484082341194, + "epoch": 1.9783333333333335, + "grad_norm": 0.337187796831131, + "learning_rate": 7.181286549707604e-05, + "loss": 1.0296, + "mean_token_accuracy": 0.7530212178826332, + "num_tokens": 14899411.0, + "step": 1187 + }, + { + "entropy": 0.9863916710019112, + "epoch": 1.98, + "grad_norm": 0.37474244832992554, + "learning_rate": 7.169590643274853e-05, + "loss": 0.968, + "mean_token_accuracy": 0.7640804052352905, + "num_tokens": 14912082.0, + "step": 1188 + }, + { + "entropy": 0.9524248540401459, + "epoch": 1.9816666666666667, + "grad_norm": 0.38662028312683105, + "learning_rate": 7.157894736842105e-05, + "loss": 0.9406, + "mean_token_accuracy": 0.7761217206716537, + "num_tokens": 14924385.0, + "step": 1189 + }, + { + "entropy": 1.1712022498250008, + "epoch": 1.9833333333333334, + "grad_norm": 0.34474673867225647, + "learning_rate": 7.146198830409358e-05, + "loss": 1.182, + "mean_token_accuracy": 0.7282759845256805, + "num_tokens": 14937054.0, + "step": 1190 + }, + { + "entropy": 1.09844072163105, + "epoch": 1.9849999999999999, + "grad_norm": 0.3593318462371826, + "learning_rate": 7.134502923976609e-05, + "loss": 1.0974, + "mean_token_accuracy": 0.740160807967186, + "num_tokens": 14949496.0, + "step": 1191 + }, + { + "entropy": 1.1235914453864098, + "epoch": 1.9866666666666668, + "grad_norm": 0.35967862606048584, + "learning_rate": 7.12280701754386e-05, + "loss": 1.1005, + "mean_token_accuracy": 0.7394478842616081, + "num_tokens": 14962172.0, + "step": 1192 + }, + { + "entropy": 0.9706814885139465, + "epoch": 1.9883333333333333, + "grad_norm": 0.32786986231803894, + "learning_rate": 7.111111111111112e-05, + "loss": 0.9485, + "mean_token_accuracy": 0.7693494334816933, + "num_tokens": 14975100.0, + "step": 1193 + }, + { + "entropy": 0.9292563125491142, + "epoch": 1.99, + "grad_norm": 0.4656606614589691, + "learning_rate": 7.099415204678363e-05, + "loss": 0.9191, + "mean_token_accuracy": 0.7757812887430191, + "num_tokens": 14987690.0, + "step": 1194 + }, + { + "entropy": 1.0805052816867828, + "epoch": 1.9916666666666667, + "grad_norm": 0.38252657651901245, + "learning_rate": 7.087719298245614e-05, + "loss": 1.0654, + "mean_token_accuracy": 0.744781069457531, + "num_tokens": 15000166.0, + "step": 1195 + }, + { + "entropy": 1.0376411154866219, + "epoch": 1.9933333333333332, + "grad_norm": 0.39776602387428284, + "learning_rate": 7.076023391812866e-05, + "loss": 1.0531, + "mean_token_accuracy": 0.7509570717811584, + "num_tokens": 15012969.0, + "step": 1196 + }, + { + "entropy": 0.91599540412426, + "epoch": 1.995, + "grad_norm": 0.38006147742271423, + "learning_rate": 7.064327485380117e-05, + "loss": 0.9244, + "mean_token_accuracy": 0.7794359549880028, + "num_tokens": 15025400.0, + "step": 1197 + }, + { + "entropy": 1.0757018700242043, + "epoch": 1.9966666666666666, + "grad_norm": 0.31853750348091125, + "learning_rate": 7.052631578947368e-05, + "loss": 1.0729, + "mean_token_accuracy": 0.7420129850506783, + "num_tokens": 15038149.0, + "step": 1198 + }, + { + "entropy": 0.8585388213396072, + "epoch": 1.9983333333333333, + "grad_norm": 0.5563913583755493, + "learning_rate": 7.04093567251462e-05, + "loss": 0.8488, + "mean_token_accuracy": 0.7974219918251038, + "num_tokens": 15050818.0, + "step": 1199 + }, + { + "entropy": 1.1881198361515999, + "epoch": 2.0, + "grad_norm": 0.5018253326416016, + "learning_rate": 7.029239766081873e-05, + "loss": 1.2152, + "mean_token_accuracy": 0.7145092412829399, + "num_tokens": 15063322.0, + "step": 1200 + }, + { + "entropy": 0.9052357375621796, + "epoch": 2.0016666666666665, + "grad_norm": 0.34325864911079407, + "learning_rate": 7.017543859649122e-05, + "loss": 0.8783, + "mean_token_accuracy": 0.7856507301330566, + "num_tokens": 15075793.0, + "step": 1201 + }, + { + "entropy": 0.9189976379275322, + "epoch": 2.0033333333333334, + "grad_norm": 0.3808995485305786, + "learning_rate": 7.005847953216375e-05, + "loss": 0.9259, + "mean_token_accuracy": 0.778274305164814, + "num_tokens": 15088477.0, + "step": 1202 + }, + { + "entropy": 1.061006784439087, + "epoch": 2.005, + "grad_norm": 0.3382485806941986, + "learning_rate": 6.994152046783627e-05, + "loss": 1.0298, + "mean_token_accuracy": 0.7612174600362778, + "num_tokens": 15101097.0, + "step": 1203 + }, + { + "entropy": 0.9360107630491257, + "epoch": 2.006666666666667, + "grad_norm": 0.491824209690094, + "learning_rate": 6.982456140350876e-05, + "loss": 0.9028, + "mean_token_accuracy": 0.78416408598423, + "num_tokens": 15113789.0, + "step": 1204 + }, + { + "entropy": 1.0699936002492905, + "epoch": 2.0083333333333333, + "grad_norm": 0.533436119556427, + "learning_rate": 6.970760233918129e-05, + "loss": 1.0144, + "mean_token_accuracy": 0.758854866027832, + "num_tokens": 15126605.0, + "step": 1205 + }, + { + "entropy": 1.0428190529346466, + "epoch": 2.01, + "grad_norm": 0.3870173692703247, + "learning_rate": 6.959064327485381e-05, + "loss": 1.0027, + "mean_token_accuracy": 0.7668027579784393, + "num_tokens": 15138961.0, + "step": 1206 + }, + { + "entropy": 0.986046314239502, + "epoch": 2.0116666666666667, + "grad_norm": 1.019547939300537, + "learning_rate": 6.947368421052632e-05, + "loss": 0.9609, + "mean_token_accuracy": 0.7719600424170494, + "num_tokens": 15151398.0, + "step": 1207 + }, + { + "entropy": 1.0389724001288414, + "epoch": 2.013333333333333, + "grad_norm": 2.1938655376434326, + "learning_rate": 6.935672514619883e-05, + "loss": 1.0173, + "mean_token_accuracy": 0.7511957213282585, + "num_tokens": 15163822.0, + "step": 1208 + }, + { + "entropy": 0.9506646022200584, + "epoch": 2.015, + "grad_norm": 0.4420545995235443, + "learning_rate": 6.923976608187135e-05, + "loss": 0.9235, + "mean_token_accuracy": 0.781024269759655, + "num_tokens": 15176488.0, + "step": 1209 + }, + { + "entropy": 1.1012853309512138, + "epoch": 2.0166666666666666, + "grad_norm": 0.5005508661270142, + "learning_rate": 6.912280701754386e-05, + "loss": 1.0818, + "mean_token_accuracy": 0.7453935891389847, + "num_tokens": 15189123.0, + "step": 1210 + }, + { + "entropy": 0.7870658077299595, + "epoch": 2.0183333333333335, + "grad_norm": 0.3204546570777893, + "learning_rate": 6.900584795321637e-05, + "loss": 0.7564, + "mean_token_accuracy": 0.8177636787295341, + "num_tokens": 15201559.0, + "step": 1211 + }, + { + "entropy": 1.002734825015068, + "epoch": 2.02, + "grad_norm": 0.38311612606048584, + "learning_rate": 6.88888888888889e-05, + "loss": 0.9952, + "mean_token_accuracy": 0.7644277438521385, + "num_tokens": 15214065.0, + "step": 1212 + }, + { + "entropy": 1.0941368639469147, + "epoch": 2.0216666666666665, + "grad_norm": 0.4404506981372833, + "learning_rate": 6.87719298245614e-05, + "loss": 1.0926, + "mean_token_accuracy": 0.7410142794251442, + "num_tokens": 15226499.0, + "step": 1213 + }, + { + "entropy": 1.0271618217229843, + "epoch": 2.0233333333333334, + "grad_norm": 0.41902676224708557, + "learning_rate": 6.865497076023391e-05, + "loss": 0.9877, + "mean_token_accuracy": 0.7617835104465485, + "num_tokens": 15238929.0, + "step": 1214 + }, + { + "entropy": 0.9966288581490517, + "epoch": 2.025, + "grad_norm": 0.3291116952896118, + "learning_rate": 6.853801169590644e-05, + "loss": 1.0114, + "mean_token_accuracy": 0.7586018741130829, + "num_tokens": 15251462.0, + "step": 1215 + }, + { + "entropy": 0.9194224253296852, + "epoch": 2.026666666666667, + "grad_norm": 0.3572547435760498, + "learning_rate": 6.842105263157895e-05, + "loss": 0.908, + "mean_token_accuracy": 0.7783378884196281, + "num_tokens": 15263834.0, + "step": 1216 + }, + { + "entropy": 0.9324749112129211, + "epoch": 2.0283333333333333, + "grad_norm": 0.3369165062904358, + "learning_rate": 6.830409356725147e-05, + "loss": 0.9252, + "mean_token_accuracy": 0.7835753262042999, + "num_tokens": 15276425.0, + "step": 1217 + }, + { + "entropy": 0.8321675360202789, + "epoch": 2.03, + "grad_norm": 1.595625877380371, + "learning_rate": 6.818713450292398e-05, + "loss": 0.8153, + "mean_token_accuracy": 0.8065165877342224, + "num_tokens": 15288972.0, + "step": 1218 + }, + { + "entropy": 0.9044404104351997, + "epoch": 2.0316666666666667, + "grad_norm": 0.3817458152770996, + "learning_rate": 6.80701754385965e-05, + "loss": 0.8901, + "mean_token_accuracy": 0.7881944924592972, + "num_tokens": 15301669.0, + "step": 1219 + }, + { + "entropy": 1.095058612525463, + "epoch": 2.033333333333333, + "grad_norm": 0.46860161423683167, + "learning_rate": 6.795321637426901e-05, + "loss": 1.0851, + "mean_token_accuracy": 0.7476026341319084, + "num_tokens": 15314091.0, + "step": 1220 + }, + { + "entropy": 0.97823616117239, + "epoch": 2.035, + "grad_norm": 0.3651435971260071, + "learning_rate": 6.783625730994152e-05, + "loss": 0.9952, + "mean_token_accuracy": 0.7599917650222778, + "num_tokens": 15326653.0, + "step": 1221 + }, + { + "entropy": 1.0307016968727112, + "epoch": 2.0366666666666666, + "grad_norm": 0.32165053486824036, + "learning_rate": 6.771929824561404e-05, + "loss": 1.0204, + "mean_token_accuracy": 0.7560223415493965, + "num_tokens": 15339053.0, + "step": 1222 + }, + { + "entropy": 0.8370644301176071, + "epoch": 2.038333333333333, + "grad_norm": 0.409912645816803, + "learning_rate": 6.760233918128655e-05, + "loss": 0.8277, + "mean_token_accuracy": 0.7972834259271622, + "num_tokens": 15351541.0, + "step": 1223 + }, + { + "entropy": 1.030876912176609, + "epoch": 2.04, + "grad_norm": 0.44045114517211914, + "learning_rate": 6.748538011695906e-05, + "loss": 1.0133, + "mean_token_accuracy": 0.7592709213495255, + "num_tokens": 15364113.0, + "step": 1224 + }, + { + "entropy": 0.9634627774357796, + "epoch": 2.0416666666666665, + "grad_norm": 0.6017156839370728, + "learning_rate": 6.736842105263159e-05, + "loss": 1.0051, + "mean_token_accuracy": 0.7669636905193329, + "num_tokens": 15376653.0, + "step": 1225 + }, + { + "entropy": 0.8200527541339397, + "epoch": 2.0433333333333334, + "grad_norm": 0.34684279561042786, + "learning_rate": 6.72514619883041e-05, + "loss": 0.825, + "mean_token_accuracy": 0.805654875934124, + "num_tokens": 15389301.0, + "step": 1226 + }, + { + "entropy": 0.8842723518610001, + "epoch": 2.045, + "grad_norm": 0.5951091647148132, + "learning_rate": 6.713450292397662e-05, + "loss": 0.8755, + "mean_token_accuracy": 0.7912396788597107, + "num_tokens": 15401961.0, + "step": 1227 + }, + { + "entropy": 0.8875353932380676, + "epoch": 2.046666666666667, + "grad_norm": 0.37242352962493896, + "learning_rate": 6.701754385964913e-05, + "loss": 0.8651, + "mean_token_accuracy": 0.7901949658989906, + "num_tokens": 15414342.0, + "step": 1228 + }, + { + "entropy": 1.075327143073082, + "epoch": 2.0483333333333333, + "grad_norm": 0.3590560555458069, + "learning_rate": 6.690058479532164e-05, + "loss": 1.0645, + "mean_token_accuracy": 0.7556563019752502, + "num_tokens": 15426845.0, + "step": 1229 + }, + { + "entropy": 0.9320587888360023, + "epoch": 2.05, + "grad_norm": 0.3665508031845093, + "learning_rate": 6.678362573099416e-05, + "loss": 0.9163, + "mean_token_accuracy": 0.780915156006813, + "num_tokens": 15439540.0, + "step": 1230 + }, + { + "entropy": 1.0235765650868416, + "epoch": 2.0516666666666667, + "grad_norm": 0.5307242274284363, + "learning_rate": 6.666666666666667e-05, + "loss": 1.0204, + "mean_token_accuracy": 0.7594335526227951, + "num_tokens": 15452301.0, + "step": 1231 + }, + { + "entropy": 0.9012108668684959, + "epoch": 2.0533333333333332, + "grad_norm": 0.3947784900665283, + "learning_rate": 6.654970760233918e-05, + "loss": 0.8771, + "mean_token_accuracy": 0.7836640998721123, + "num_tokens": 15464933.0, + "step": 1232 + }, + { + "entropy": 1.0133416280150414, + "epoch": 2.055, + "grad_norm": 0.3938189446926117, + "learning_rate": 6.64327485380117e-05, + "loss": 0.9836, + "mean_token_accuracy": 0.7572296559810638, + "num_tokens": 15477426.0, + "step": 1233 + }, + { + "entropy": 1.2194915115833282, + "epoch": 2.0566666666666666, + "grad_norm": 0.34032195806503296, + "learning_rate": 6.631578947368421e-05, + "loss": 1.232, + "mean_token_accuracy": 0.7150535508990288, + "num_tokens": 15490142.0, + "step": 1234 + }, + { + "entropy": 1.0875378251075745, + "epoch": 2.058333333333333, + "grad_norm": 0.4885045289993286, + "learning_rate": 6.619883040935672e-05, + "loss": 1.0742, + "mean_token_accuracy": 0.7478625327348709, + "num_tokens": 15502567.0, + "step": 1235 + }, + { + "entropy": 0.9491895511746407, + "epoch": 2.06, + "grad_norm": 0.3522387444972992, + "learning_rate": 6.608187134502924e-05, + "loss": 0.9396, + "mean_token_accuracy": 0.777589961886406, + "num_tokens": 15514928.0, + "step": 1236 + }, + { + "entropy": 0.9448045641183853, + "epoch": 2.0616666666666665, + "grad_norm": 0.3736669719219208, + "learning_rate": 6.596491228070177e-05, + "loss": 0.9304, + "mean_token_accuracy": 0.7839526385068893, + "num_tokens": 15527334.0, + "step": 1237 + }, + { + "entropy": 1.0716444924473763, + "epoch": 2.0633333333333335, + "grad_norm": 0.33991655707359314, + "learning_rate": 6.584795321637426e-05, + "loss": 1.0556, + "mean_token_accuracy": 0.7570153623819351, + "num_tokens": 15539543.0, + "step": 1238 + }, + { + "entropy": 1.0596841275691986, + "epoch": 2.065, + "grad_norm": 0.2872769832611084, + "learning_rate": 6.573099415204679e-05, + "loss": 1.0411, + "mean_token_accuracy": 0.7550288438796997, + "num_tokens": 15551968.0, + "step": 1239 + }, + { + "entropy": 0.9897029399871826, + "epoch": 2.066666666666667, + "grad_norm": 0.4234226942062378, + "learning_rate": 6.561403508771931e-05, + "loss": 0.9396, + "mean_token_accuracy": 0.7762594521045685, + "num_tokens": 15564570.0, + "step": 1240 + }, + { + "entropy": 1.10260471701622, + "epoch": 2.0683333333333334, + "grad_norm": 1.1800907850265503, + "learning_rate": 6.549707602339182e-05, + "loss": 1.0693, + "mean_token_accuracy": 0.7487555369734764, + "num_tokens": 15577110.0, + "step": 1241 + }, + { + "entropy": 1.089598923921585, + "epoch": 2.07, + "grad_norm": 0.4393610656261444, + "learning_rate": 6.538011695906433e-05, + "loss": 1.0549, + "mean_token_accuracy": 0.7502364441752434, + "num_tokens": 15589418.0, + "step": 1242 + }, + { + "entropy": 1.038057379424572, + "epoch": 2.0716666666666668, + "grad_norm": 0.8369620442390442, + "learning_rate": 6.526315789473685e-05, + "loss": 1.0309, + "mean_token_accuracy": 0.7616065144538879, + "num_tokens": 15601851.0, + "step": 1243 + }, + { + "entropy": 0.9448019489645958, + "epoch": 2.0733333333333333, + "grad_norm": 0.4217173159122467, + "learning_rate": 6.514619883040936e-05, + "loss": 0.9364, + "mean_token_accuracy": 0.7743350118398666, + "num_tokens": 15614606.0, + "step": 1244 + }, + { + "entropy": 0.9873029887676239, + "epoch": 2.075, + "grad_norm": 0.3019360899925232, + "learning_rate": 6.502923976608187e-05, + "loss": 0.9774, + "mean_token_accuracy": 0.7656509503722191, + "num_tokens": 15626973.0, + "step": 1245 + }, + { + "entropy": 0.9539402648806572, + "epoch": 2.0766666666666667, + "grad_norm": 0.4058213233947754, + "learning_rate": 6.49122807017544e-05, + "loss": 0.9458, + "mean_token_accuracy": 0.7742001786828041, + "num_tokens": 15639510.0, + "step": 1246 + }, + { + "entropy": 0.9103057011961937, + "epoch": 2.078333333333333, + "grad_norm": 0.46826452016830444, + "learning_rate": 6.47953216374269e-05, + "loss": 0.8938, + "mean_token_accuracy": 0.7843516543507576, + "num_tokens": 15652041.0, + "step": 1247 + }, + { + "entropy": 0.9239180311560631, + "epoch": 2.08, + "grad_norm": 0.40790650248527527, + "learning_rate": 6.467836257309941e-05, + "loss": 0.9342, + "mean_token_accuracy": 0.7735781818628311, + "num_tokens": 15664742.0, + "step": 1248 + }, + { + "entropy": 0.9277187436819077, + "epoch": 2.0816666666666666, + "grad_norm": 0.33851221203804016, + "learning_rate": 6.456140350877194e-05, + "loss": 0.927, + "mean_token_accuracy": 0.7758015990257263, + "num_tokens": 15677285.0, + "step": 1249 + }, + { + "entropy": 0.944388322532177, + "epoch": 2.0833333333333335, + "grad_norm": 0.3766917288303375, + "learning_rate": 6.444444444444446e-05, + "loss": 0.9479, + "mean_token_accuracy": 0.7702078968286514, + "num_tokens": 15690222.0, + "step": 1250 + }, + { + "entropy": 0.9091977030038834, + "epoch": 2.085, + "grad_norm": 0.40036749839782715, + "learning_rate": 6.432748538011695e-05, + "loss": 0.9128, + "mean_token_accuracy": 0.7758940979838371, + "num_tokens": 15702955.0, + "step": 1251 + }, + { + "entropy": 1.0803281962871552, + "epoch": 2.086666666666667, + "grad_norm": 0.347028523683548, + "learning_rate": 6.421052631578948e-05, + "loss": 1.0897, + "mean_token_accuracy": 0.7477646172046661, + "num_tokens": 15715271.0, + "step": 1252 + }, + { + "entropy": 0.9825423136353493, + "epoch": 2.0883333333333334, + "grad_norm": 0.36182457208633423, + "learning_rate": 6.4093567251462e-05, + "loss": 1.0068, + "mean_token_accuracy": 0.7584322690963745, + "num_tokens": 15727735.0, + "step": 1253 + }, + { + "entropy": 1.1912232413887978, + "epoch": 2.09, + "grad_norm": 0.3485911786556244, + "learning_rate": 6.39766081871345e-05, + "loss": 1.2016, + "mean_token_accuracy": 0.721094161272049, + "num_tokens": 15740384.0, + "step": 1254 + }, + { + "entropy": 0.7906246408820152, + "epoch": 2.091666666666667, + "grad_norm": 0.33932462334632874, + "learning_rate": 6.385964912280702e-05, + "loss": 0.7592, + "mean_token_accuracy": 0.8148343116044998, + "num_tokens": 15752728.0, + "step": 1255 + }, + { + "entropy": 1.0403779968619347, + "epoch": 2.0933333333333333, + "grad_norm": 0.3166411817073822, + "learning_rate": 6.374269005847954e-05, + "loss": 1.0151, + "mean_token_accuracy": 0.7559191957116127, + "num_tokens": 15765183.0, + "step": 1256 + }, + { + "entropy": 0.9112806841731071, + "epoch": 2.095, + "grad_norm": 0.37241312861442566, + "learning_rate": 6.362573099415205e-05, + "loss": 0.8906, + "mean_token_accuracy": 0.7894803658127785, + "num_tokens": 15777864.0, + "step": 1257 + }, + { + "entropy": 0.981794960796833, + "epoch": 2.0966666666666667, + "grad_norm": 0.4328164756298065, + "learning_rate": 6.350877192982456e-05, + "loss": 0.9675, + "mean_token_accuracy": 0.7765341326594353, + "num_tokens": 15790467.0, + "step": 1258 + }, + { + "entropy": 1.0570649579167366, + "epoch": 2.098333333333333, + "grad_norm": 6.728091716766357, + "learning_rate": 6.339181286549708e-05, + "loss": 1.0274, + "mean_token_accuracy": 0.753454253077507, + "num_tokens": 15802919.0, + "step": 1259 + }, + { + "entropy": 1.0974561870098114, + "epoch": 2.1, + "grad_norm": 0.41763633489608765, + "learning_rate": 6.32748538011696e-05, + "loss": 1.0801, + "mean_token_accuracy": 0.7413333430886269, + "num_tokens": 15815614.0, + "step": 1260 + }, + { + "entropy": 0.9489110037684441, + "epoch": 2.1016666666666666, + "grad_norm": 0.37353506684303284, + "learning_rate": 6.31578947368421e-05, + "loss": 0.9261, + "mean_token_accuracy": 0.7756678089499474, + "num_tokens": 15828155.0, + "step": 1261 + }, + { + "entropy": 0.9432752504944801, + "epoch": 2.1033333333333335, + "grad_norm": 0.41370049118995667, + "learning_rate": 6.304093567251463e-05, + "loss": 0.9497, + "mean_token_accuracy": 0.7864074036478996, + "num_tokens": 15840912.0, + "step": 1262 + }, + { + "entropy": 0.970004640519619, + "epoch": 2.105, + "grad_norm": 0.42266350984573364, + "learning_rate": 6.292397660818714e-05, + "loss": 0.9489, + "mean_token_accuracy": 0.7750717252492905, + "num_tokens": 15853612.0, + "step": 1263 + }, + { + "entropy": 0.9371956959366798, + "epoch": 2.1066666666666665, + "grad_norm": 0.5785712599754333, + "learning_rate": 6.280701754385965e-05, + "loss": 0.9071, + "mean_token_accuracy": 0.7850569412112236, + "num_tokens": 15865974.0, + "step": 1264 + }, + { + "entropy": 1.043307363986969, + "epoch": 2.1083333333333334, + "grad_norm": 0.5705166459083557, + "learning_rate": 6.269005847953217e-05, + "loss": 1.0468, + "mean_token_accuracy": 0.7476249039173126, + "num_tokens": 15878868.0, + "step": 1265 + }, + { + "entropy": 1.0852922201156616, + "epoch": 2.11, + "grad_norm": 0.5431942343711853, + "learning_rate": 6.257309941520468e-05, + "loss": 1.0859, + "mean_token_accuracy": 0.7422461211681366, + "num_tokens": 15891104.0, + "step": 1266 + }, + { + "entropy": 0.9250567182898521, + "epoch": 2.111666666666667, + "grad_norm": 0.3531194031238556, + "learning_rate": 6.24561403508772e-05, + "loss": 0.9291, + "mean_token_accuracy": 0.7763449102640152, + "num_tokens": 15903544.0, + "step": 1267 + }, + { + "entropy": 0.7491031736135483, + "epoch": 2.1133333333333333, + "grad_norm": 0.37093281745910645, + "learning_rate": 6.233918128654971e-05, + "loss": 0.7308, + "mean_token_accuracy": 0.8212139829993248, + "num_tokens": 15916166.0, + "step": 1268 + }, + { + "entropy": 1.0547382161021233, + "epoch": 2.115, + "grad_norm": 0.36116114258766174, + "learning_rate": 6.222222222222222e-05, + "loss": 1.0204, + "mean_token_accuracy": 0.7612402960658073, + "num_tokens": 15928425.0, + "step": 1269 + }, + { + "entropy": 1.0326936393976212, + "epoch": 2.1166666666666667, + "grad_norm": 0.3690188229084015, + "learning_rate": 6.210526315789474e-05, + "loss": 1.0187, + "mean_token_accuracy": 0.7550322636961937, + "num_tokens": 15940780.0, + "step": 1270 + }, + { + "entropy": 0.8861713111400604, + "epoch": 2.118333333333333, + "grad_norm": 0.4047095477581024, + "learning_rate": 6.198830409356725e-05, + "loss": 0.8637, + "mean_token_accuracy": 0.7948219925165176, + "num_tokens": 15953138.0, + "step": 1271 + }, + { + "entropy": 0.9570751041173935, + "epoch": 2.12, + "grad_norm": 0.3608614504337311, + "learning_rate": 6.187134502923978e-05, + "loss": 0.9606, + "mean_token_accuracy": 0.7752379328012466, + "num_tokens": 15965859.0, + "step": 1272 + }, + { + "entropy": 0.9991676434874535, + "epoch": 2.1216666666666666, + "grad_norm": 0.3457930088043213, + "learning_rate": 6.175438596491228e-05, + "loss": 0.999, + "mean_token_accuracy": 0.7654828205704689, + "num_tokens": 15978733.0, + "step": 1273 + }, + { + "entropy": 1.0009681209921837, + "epoch": 2.1233333333333335, + "grad_norm": 0.5010899305343628, + "learning_rate": 6.16374269005848e-05, + "loss": 0.9963, + "mean_token_accuracy": 0.7657868787646294, + "num_tokens": 15991322.0, + "step": 1274 + }, + { + "entropy": 1.0141113102436066, + "epoch": 2.125, + "grad_norm": 0.4147863984107971, + "learning_rate": 6.152046783625732e-05, + "loss": 1.0278, + "mean_token_accuracy": 0.7668524906039238, + "num_tokens": 16003959.0, + "step": 1275 + }, + { + "entropy": 0.9721154272556305, + "epoch": 2.1266666666666665, + "grad_norm": 0.42858490347862244, + "learning_rate": 6.140350877192983e-05, + "loss": 0.9421, + "mean_token_accuracy": 0.770805761218071, + "num_tokens": 16016461.0, + "step": 1276 + }, + { + "entropy": 0.9666438475251198, + "epoch": 2.1283333333333334, + "grad_norm": 0.32886090874671936, + "learning_rate": 6.128654970760235e-05, + "loss": 0.9455, + "mean_token_accuracy": 0.7742414399981499, + "num_tokens": 16029093.0, + "step": 1277 + }, + { + "entropy": 0.921160988509655, + "epoch": 2.13, + "grad_norm": 0.4459408223628998, + "learning_rate": 6.116959064327486e-05, + "loss": 0.872, + "mean_token_accuracy": 0.7858338877558708, + "num_tokens": 16041897.0, + "step": 1278 + }, + { + "entropy": 1.0833180248737335, + "epoch": 2.131666666666667, + "grad_norm": 0.42643165588378906, + "learning_rate": 6.105263157894737e-05, + "loss": 1.0701, + "mean_token_accuracy": 0.7469203472137451, + "num_tokens": 16054086.0, + "step": 1279 + }, + { + "entropy": 0.9691863358020782, + "epoch": 2.1333333333333333, + "grad_norm": 0.48922327160835266, + "learning_rate": 6.093567251461989e-05, + "loss": 0.953, + "mean_token_accuracy": 0.7782407402992249, + "num_tokens": 16066312.0, + "step": 1280 + }, + { + "entropy": 1.0332336723804474, + "epoch": 2.135, + "grad_norm": 0.3459659218788147, + "learning_rate": 6.0818713450292395e-05, + "loss": 0.9957, + "mean_token_accuracy": 0.7688183858990669, + "num_tokens": 16079021.0, + "step": 1281 + }, + { + "entropy": 1.0902429074048996, + "epoch": 2.1366666666666667, + "grad_norm": 0.48367759585380554, + "learning_rate": 6.070175438596492e-05, + "loss": 1.0606, + "mean_token_accuracy": 0.7484945505857468, + "num_tokens": 16091561.0, + "step": 1282 + }, + { + "entropy": 0.9734514728188515, + "epoch": 2.138333333333333, + "grad_norm": 0.3331240713596344, + "learning_rate": 6.0584795321637434e-05, + "loss": 0.9409, + "mean_token_accuracy": 0.7843116372823715, + "num_tokens": 16104178.0, + "step": 1283 + }, + { + "entropy": 0.8520051911473274, + "epoch": 2.14, + "grad_norm": 0.4123169183731079, + "learning_rate": 6.046783625730994e-05, + "loss": 0.8289, + "mean_token_accuracy": 0.8049101606011391, + "num_tokens": 16116623.0, + "step": 1284 + }, + { + "entropy": 1.1987340301275253, + "epoch": 2.1416666666666666, + "grad_norm": 0.36131563782691956, + "learning_rate": 6.035087719298246e-05, + "loss": 1.2341, + "mean_token_accuracy": 0.7114728689193726, + "num_tokens": 16129614.0, + "step": 1285 + }, + { + "entropy": 1.063599593937397, + "epoch": 2.1433333333333335, + "grad_norm": 0.403729110956192, + "learning_rate": 6.0233918128654976e-05, + "loss": 1.0633, + "mean_token_accuracy": 0.7531147226691246, + "num_tokens": 16142163.0, + "step": 1286 + }, + { + "entropy": 1.0272391214966774, + "epoch": 2.145, + "grad_norm": 0.7972368001937866, + "learning_rate": 6.011695906432749e-05, + "loss": 1.0565, + "mean_token_accuracy": 0.7503082677721977, + "num_tokens": 16154990.0, + "step": 1287 + }, + { + "entropy": 1.016928717494011, + "epoch": 2.1466666666666665, + "grad_norm": 0.36236366629600525, + "learning_rate": 6e-05, + "loss": 1.0134, + "mean_token_accuracy": 0.7566632479429245, + "num_tokens": 16167762.0, + "step": 1288 + }, + { + "entropy": 1.0799991637468338, + "epoch": 2.1483333333333334, + "grad_norm": 0.45774954557418823, + "learning_rate": 5.988304093567252e-05, + "loss": 1.0938, + "mean_token_accuracy": 0.7485719472169876, + "num_tokens": 16180282.0, + "step": 1289 + }, + { + "entropy": 1.1192268580198288, + "epoch": 2.15, + "grad_norm": 0.5673292875289917, + "learning_rate": 5.9766081871345034e-05, + "loss": 1.1019, + "mean_token_accuracy": 0.7411420792341232, + "num_tokens": 16192847.0, + "step": 1290 + }, + { + "entropy": 1.022900365293026, + "epoch": 2.151666666666667, + "grad_norm": 0.560111403465271, + "learning_rate": 5.9649122807017544e-05, + "loss": 1.0098, + "mean_token_accuracy": 0.7596027627587318, + "num_tokens": 16205469.0, + "step": 1291 + }, + { + "entropy": 1.0922938510775566, + "epoch": 2.1533333333333333, + "grad_norm": 0.4293026924133301, + "learning_rate": 5.953216374269006e-05, + "loss": 1.0881, + "mean_token_accuracy": 0.741927333176136, + "num_tokens": 16218067.0, + "step": 1292 + }, + { + "entropy": 0.9778279587626457, + "epoch": 2.155, + "grad_norm": 0.5148698687553406, + "learning_rate": 5.9415204678362576e-05, + "loss": 0.9647, + "mean_token_accuracy": 0.7713644728064537, + "num_tokens": 16230709.0, + "step": 1293 + }, + { + "entropy": 1.0436527356505394, + "epoch": 2.1566666666666667, + "grad_norm": 0.7006407976150513, + "learning_rate": 5.9298245614035085e-05, + "loss": 1.0154, + "mean_token_accuracy": 0.7672221437096596, + "num_tokens": 16243490.0, + "step": 1294 + }, + { + "entropy": 1.1106083691120148, + "epoch": 2.158333333333333, + "grad_norm": 0.4080680310726166, + "learning_rate": 5.91812865497076e-05, + "loss": 1.1154, + "mean_token_accuracy": 0.7437613978981972, + "num_tokens": 16256281.0, + "step": 1295 + }, + { + "entropy": 1.0642745941877365, + "epoch": 2.16, + "grad_norm": 0.3763604164123535, + "learning_rate": 5.9064327485380125e-05, + "loss": 1.0475, + "mean_token_accuracy": 0.7536506652832031, + "num_tokens": 16268918.0, + "step": 1296 + }, + { + "entropy": 1.1376720741391182, + "epoch": 2.1616666666666666, + "grad_norm": 0.4499747157096863, + "learning_rate": 5.894736842105263e-05, + "loss": 1.1319, + "mean_token_accuracy": 0.730003148317337, + "num_tokens": 16281366.0, + "step": 1297 + }, + { + "entropy": 0.986762024462223, + "epoch": 2.163333333333333, + "grad_norm": 0.3800641596317291, + "learning_rate": 5.8830409356725144e-05, + "loss": 0.9589, + "mean_token_accuracy": 0.7807234078645706, + "num_tokens": 16294061.0, + "step": 1298 + }, + { + "entropy": 1.0639652237296104, + "epoch": 2.165, + "grad_norm": 0.5049359798431396, + "learning_rate": 5.871345029239767e-05, + "loss": 1.0493, + "mean_token_accuracy": 0.7527299225330353, + "num_tokens": 16306510.0, + "step": 1299 + }, + { + "entropy": 1.100369393825531, + "epoch": 2.1666666666666665, + "grad_norm": 0.516254723072052, + "learning_rate": 5.859649122807018e-05, + "loss": 1.0883, + "mean_token_accuracy": 0.7433087825775146, + "num_tokens": 16319067.0, + "step": 1300 + }, + { + "entropy": 1.1864375174045563, + "epoch": 2.1683333333333334, + "grad_norm": 0.47443917393684387, + "learning_rate": 5.847953216374269e-05, + "loss": 1.1709, + "mean_token_accuracy": 0.732798770070076, + "num_tokens": 16331635.0, + "step": 1301 + }, + { + "entropy": 0.9979792460799217, + "epoch": 2.17, + "grad_norm": 0.5144807696342468, + "learning_rate": 5.836257309941521e-05, + "loss": 0.9709, + "mean_token_accuracy": 0.7687290459871292, + "num_tokens": 16344209.0, + "step": 1302 + }, + { + "entropy": 0.9684187322854996, + "epoch": 2.171666666666667, + "grad_norm": 0.4104618430137634, + "learning_rate": 5.8245614035087725e-05, + "loss": 0.9587, + "mean_token_accuracy": 0.7708048149943352, + "num_tokens": 16356870.0, + "step": 1303 + }, + { + "entropy": 0.9549127817153931, + "epoch": 2.1733333333333333, + "grad_norm": 0.48793643712997437, + "learning_rate": 5.8128654970760234e-05, + "loss": 0.9355, + "mean_token_accuracy": 0.7806605771183968, + "num_tokens": 16369450.0, + "step": 1304 + }, + { + "entropy": 0.9907964468002319, + "epoch": 2.175, + "grad_norm": 0.6664928793907166, + "learning_rate": 5.801169590643275e-05, + "loss": 0.9765, + "mean_token_accuracy": 0.7666610702872276, + "num_tokens": 16381818.0, + "step": 1305 + }, + { + "entropy": 1.0534771382808685, + "epoch": 2.1766666666666667, + "grad_norm": 0.4101022481918335, + "learning_rate": 5.789473684210527e-05, + "loss": 1.0621, + "mean_token_accuracy": 0.7422067001461983, + "num_tokens": 16394484.0, + "step": 1306 + }, + { + "entropy": 0.959588311612606, + "epoch": 2.1783333333333332, + "grad_norm": 0.4971601366996765, + "learning_rate": 5.7777777777777776e-05, + "loss": 0.9667, + "mean_token_accuracy": 0.7726762592792511, + "num_tokens": 16406921.0, + "step": 1307 + }, + { + "entropy": 0.94914510846138, + "epoch": 2.18, + "grad_norm": 0.7297646999359131, + "learning_rate": 5.766081871345029e-05, + "loss": 0.9198, + "mean_token_accuracy": 0.7834586650133133, + "num_tokens": 16419330.0, + "step": 1308 + }, + { + "entropy": 0.8451811969280243, + "epoch": 2.1816666666666666, + "grad_norm": 0.8522327542304993, + "learning_rate": 5.754385964912281e-05, + "loss": 0.8151, + "mean_token_accuracy": 0.8042490780353546, + "num_tokens": 16431780.0, + "step": 1309 + }, + { + "entropy": 0.9630127027630806, + "epoch": 2.183333333333333, + "grad_norm": 0.7566533088684082, + "learning_rate": 5.742690058479533e-05, + "loss": 0.9617, + "mean_token_accuracy": 0.7709286585450172, + "num_tokens": 16444175.0, + "step": 1310 + }, + { + "entropy": 1.0044907331466675, + "epoch": 2.185, + "grad_norm": 0.567910373210907, + "learning_rate": 5.7309941520467835e-05, + "loss": 1.0039, + "mean_token_accuracy": 0.7546889409422874, + "num_tokens": 16456675.0, + "step": 1311 + }, + { + "entropy": 0.9066718518733978, + "epoch": 2.1866666666666665, + "grad_norm": 0.5360475778579712, + "learning_rate": 5.719298245614035e-05, + "loss": 0.9041, + "mean_token_accuracy": 0.7795831486582756, + "num_tokens": 16469215.0, + "step": 1312 + }, + { + "entropy": 0.964377224445343, + "epoch": 2.1883333333333335, + "grad_norm": 0.42369315028190613, + "learning_rate": 5.7076023391812874e-05, + "loss": 0.9632, + "mean_token_accuracy": 0.7736216709017754, + "num_tokens": 16481897.0, + "step": 1313 + }, + { + "entropy": 1.036913737654686, + "epoch": 2.19, + "grad_norm": 1.0702619552612305, + "learning_rate": 5.695906432748538e-05, + "loss": 1.0294, + "mean_token_accuracy": 0.7630732581019402, + "num_tokens": 16494498.0, + "step": 1314 + }, + { + "entropy": 0.9326903074979782, + "epoch": 2.191666666666667, + "grad_norm": 0.37404054403305054, + "learning_rate": 5.68421052631579e-05, + "loss": 0.9261, + "mean_token_accuracy": 0.7786694467067719, + "num_tokens": 16507080.0, + "step": 1315 + }, + { + "entropy": 1.1574224308133125, + "epoch": 2.1933333333333334, + "grad_norm": 0.38884639739990234, + "learning_rate": 5.6725146198830416e-05, + "loss": 1.1545, + "mean_token_accuracy": 0.7322019338607788, + "num_tokens": 16519470.0, + "step": 1316 + }, + { + "entropy": 1.067192830145359, + "epoch": 2.195, + "grad_norm": 0.4278869032859802, + "learning_rate": 5.660818713450292e-05, + "loss": 1.0693, + "mean_token_accuracy": 0.7495457902550697, + "num_tokens": 16532119.0, + "step": 1317 + }, + { + "entropy": 1.0568198040127754, + "epoch": 2.1966666666666668, + "grad_norm": 0.3551260232925415, + "learning_rate": 5.649122807017544e-05, + "loss": 1.0402, + "mean_token_accuracy": 0.7526931017637253, + "num_tokens": 16544667.0, + "step": 1318 + }, + { + "entropy": 1.1023252457380295, + "epoch": 2.1983333333333333, + "grad_norm": 0.35593000054359436, + "learning_rate": 5.637426900584796e-05, + "loss": 1.0841, + "mean_token_accuracy": 0.7347672209143639, + "num_tokens": 16557315.0, + "step": 1319 + }, + { + "entropy": 0.9789210185408592, + "epoch": 2.2, + "grad_norm": 0.43837839365005493, + "learning_rate": 5.6257309941520474e-05, + "loss": 0.9507, + "mean_token_accuracy": 0.7715327590703964, + "num_tokens": 16570029.0, + "step": 1320 + }, + { + "entropy": 1.013417411595583, + "epoch": 2.2016666666666667, + "grad_norm": 0.5318921208381653, + "learning_rate": 5.6140350877192984e-05, + "loss": 0.9914, + "mean_token_accuracy": 0.7649900913238525, + "num_tokens": 16582268.0, + "step": 1321 + }, + { + "entropy": 0.9769079238176346, + "epoch": 2.203333333333333, + "grad_norm": 0.3640197515487671, + "learning_rate": 5.60233918128655e-05, + "loss": 0.9435, + "mean_token_accuracy": 0.7728189751505852, + "num_tokens": 16594946.0, + "step": 1322 + }, + { + "entropy": 0.9699849337339401, + "epoch": 2.205, + "grad_norm": 0.40620866417884827, + "learning_rate": 5.5906432748538016e-05, + "loss": 0.9454, + "mean_token_accuracy": 0.7786931991577148, + "num_tokens": 16607792.0, + "step": 1323 + }, + { + "entropy": 1.1079127714037895, + "epoch": 2.2066666666666666, + "grad_norm": 0.6549301743507385, + "learning_rate": 5.5789473684210526e-05, + "loss": 1.1275, + "mean_token_accuracy": 0.7339038550853729, + "num_tokens": 16620299.0, + "step": 1324 + }, + { + "entropy": 0.9230095148086548, + "epoch": 2.2083333333333335, + "grad_norm": 0.33990395069122314, + "learning_rate": 5.567251461988304e-05, + "loss": 0.8899, + "mean_token_accuracy": 0.7884261012077332, + "num_tokens": 16633195.0, + "step": 1325 + }, + { + "entropy": 1.0868623852729797, + "epoch": 2.21, + "grad_norm": 0.3604794442653656, + "learning_rate": 5.555555555555556e-05, + "loss": 1.0579, + "mean_token_accuracy": 0.7498101890087128, + "num_tokens": 16645775.0, + "step": 1326 + }, + { + "entropy": 0.9807609021663666, + "epoch": 2.211666666666667, + "grad_norm": 0.33847159147262573, + "learning_rate": 5.543859649122807e-05, + "loss": 0.9867, + "mean_token_accuracy": 0.7728202939033508, + "num_tokens": 16658175.0, + "step": 1327 + }, + { + "entropy": 0.8667835667729378, + "epoch": 2.2133333333333334, + "grad_norm": 0.5681234002113342, + "learning_rate": 5.5321637426900584e-05, + "loss": 0.8326, + "mean_token_accuracy": 0.7929875701665878, + "num_tokens": 16670551.0, + "step": 1328 + }, + { + "entropy": 1.0152478590607643, + "epoch": 2.215, + "grad_norm": 0.39527827501296997, + "learning_rate": 5.52046783625731e-05, + "loss": 1.0096, + "mean_token_accuracy": 0.7634976357221603, + "num_tokens": 16683038.0, + "step": 1329 + }, + { + "entropy": 0.8312469348311424, + "epoch": 2.216666666666667, + "grad_norm": 0.3481723964214325, + "learning_rate": 5.508771929824562e-05, + "loss": 0.8237, + "mean_token_accuracy": 0.8021807745099068, + "num_tokens": 16695532.0, + "step": 1330 + }, + { + "entropy": 1.11122115701437, + "epoch": 2.2183333333333333, + "grad_norm": 0.4181380867958069, + "learning_rate": 5.4970760233918126e-05, + "loss": 1.1574, + "mean_token_accuracy": 0.7308775633573532, + "num_tokens": 16707895.0, + "step": 1331 + }, + { + "entropy": 0.9292695820331573, + "epoch": 2.22, + "grad_norm": 0.4616110920906067, + "learning_rate": 5.485380116959065e-05, + "loss": 0.9376, + "mean_token_accuracy": 0.7777559012174606, + "num_tokens": 16720572.0, + "step": 1332 + }, + { + "entropy": 1.0448790863156319, + "epoch": 2.2216666666666667, + "grad_norm": 0.5627619028091431, + "learning_rate": 5.4736842105263165e-05, + "loss": 1.0246, + "mean_token_accuracy": 0.7547913640737534, + "num_tokens": 16733334.0, + "step": 1333 + }, + { + "entropy": 1.0009194388985634, + "epoch": 2.223333333333333, + "grad_norm": 0.45245105028152466, + "learning_rate": 5.461988304093567e-05, + "loss": 0.9985, + "mean_token_accuracy": 0.7636258527636528, + "num_tokens": 16746203.0, + "step": 1334 + }, + { + "entropy": 0.892767034471035, + "epoch": 2.225, + "grad_norm": 0.8160334825515747, + "learning_rate": 5.450292397660819e-05, + "loss": 0.899, + "mean_token_accuracy": 0.7808797210454941, + "num_tokens": 16758950.0, + "step": 1335 + }, + { + "entropy": 0.9594156295061111, + "epoch": 2.2266666666666666, + "grad_norm": 1.0954060554504395, + "learning_rate": 5.438596491228071e-05, + "loss": 0.9574, + "mean_token_accuracy": 0.7772248908877373, + "num_tokens": 16771399.0, + "step": 1336 + }, + { + "entropy": 0.9466140791773796, + "epoch": 2.2283333333333335, + "grad_norm": 0.4164392650127411, + "learning_rate": 5.426900584795322e-05, + "loss": 0.9407, + "mean_token_accuracy": 0.7773276045918465, + "num_tokens": 16783852.0, + "step": 1337 + }, + { + "entropy": 1.0618887767195702, + "epoch": 2.23, + "grad_norm": 0.3848114013671875, + "learning_rate": 5.415204678362573e-05, + "loss": 1.0241, + "mean_token_accuracy": 0.7604824677109718, + "num_tokens": 16796449.0, + "step": 1338 + }, + { + "entropy": 1.0883656069636345, + "epoch": 2.2316666666666665, + "grad_norm": 0.4396343529224396, + "learning_rate": 5.403508771929825e-05, + "loss": 1.0648, + "mean_token_accuracy": 0.7537149339914322, + "num_tokens": 16808785.0, + "step": 1339 + }, + { + "entropy": 1.1117828115820885, + "epoch": 2.2333333333333334, + "grad_norm": 0.505547046661377, + "learning_rate": 5.3918128654970765e-05, + "loss": 1.0814, + "mean_token_accuracy": 0.7465233132243156, + "num_tokens": 16821408.0, + "step": 1340 + }, + { + "entropy": 0.9992786198854446, + "epoch": 2.235, + "grad_norm": 0.6752765774726868, + "learning_rate": 5.3801169590643275e-05, + "loss": 0.9719, + "mean_token_accuracy": 0.7632914334535599, + "num_tokens": 16833965.0, + "step": 1341 + }, + { + "entropy": 1.029735304415226, + "epoch": 2.236666666666667, + "grad_norm": 0.3577609062194824, + "learning_rate": 5.368421052631579e-05, + "loss": 1.027, + "mean_token_accuracy": 0.7575557827949524, + "num_tokens": 16846213.0, + "step": 1342 + }, + { + "entropy": 1.02254568785429, + "epoch": 2.2383333333333333, + "grad_norm": 0.5643934607505798, + "learning_rate": 5.356725146198831e-05, + "loss": 0.9779, + "mean_token_accuracy": 0.7670003026723862, + "num_tokens": 16858588.0, + "step": 1343 + }, + { + "entropy": 1.127457857131958, + "epoch": 2.24, + "grad_norm": 0.5274756550788879, + "learning_rate": 5.345029239766082e-05, + "loss": 1.1312, + "mean_token_accuracy": 0.7362267971038818, + "num_tokens": 16870873.0, + "step": 1344 + }, + { + "entropy": 0.872569777071476, + "epoch": 2.2416666666666667, + "grad_norm": 0.47619274258613586, + "learning_rate": 5.333333333333333e-05, + "loss": 0.8457, + "mean_token_accuracy": 0.7962174266576767, + "num_tokens": 16883429.0, + "step": 1345 + }, + { + "entropy": 1.0071209147572517, + "epoch": 2.243333333333333, + "grad_norm": 0.347260057926178, + "learning_rate": 5.3216374269005856e-05, + "loss": 0.9982, + "mean_token_accuracy": 0.7638007178902626, + "num_tokens": 16896021.0, + "step": 1346 + }, + { + "entropy": 0.9757136106491089, + "epoch": 2.245, + "grad_norm": 0.365809828042984, + "learning_rate": 5.309941520467836e-05, + "loss": 0.9584, + "mean_token_accuracy": 0.777867503464222, + "num_tokens": 16908752.0, + "step": 1347 + }, + { + "entropy": 0.9249651879072189, + "epoch": 2.2466666666666666, + "grad_norm": 0.7336277961730957, + "learning_rate": 5.2982456140350875e-05, + "loss": 0.9327, + "mean_token_accuracy": 0.7768701538443565, + "num_tokens": 16921397.0, + "step": 1348 + }, + { + "entropy": 1.1343712359666824, + "epoch": 2.2483333333333335, + "grad_norm": 0.3682149350643158, + "learning_rate": 5.28654970760234e-05, + "loss": 1.1316, + "mean_token_accuracy": 0.7318572327494621, + "num_tokens": 16933902.0, + "step": 1349 + }, + { + "entropy": 1.1242988482117653, + "epoch": 2.25, + "grad_norm": 0.34693223237991333, + "learning_rate": 5.2748538011695914e-05, + "loss": 1.1318, + "mean_token_accuracy": 0.7306938543915749, + "num_tokens": 16946513.0, + "step": 1350 + }, + { + "entropy": 1.0182012990117073, + "epoch": 2.2516666666666665, + "grad_norm": 0.39444735646247864, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.0336, + "mean_token_accuracy": 0.7569727301597595, + "num_tokens": 16958860.0, + "step": 1351 + }, + { + "entropy": 0.9810265675187111, + "epoch": 2.2533333333333334, + "grad_norm": 0.4229176938533783, + "learning_rate": 5.251461988304094e-05, + "loss": 0.9885, + "mean_token_accuracy": 0.7616246268153191, + "num_tokens": 16971442.0, + "step": 1352 + }, + { + "entropy": 0.8432167172431946, + "epoch": 2.255, + "grad_norm": 0.38831403851509094, + "learning_rate": 5.2397660818713456e-05, + "loss": 0.8372, + "mean_token_accuracy": 0.7978884279727936, + "num_tokens": 16983965.0, + "step": 1353 + }, + { + "entropy": 0.8533117473125458, + "epoch": 2.256666666666667, + "grad_norm": 0.6745826005935669, + "learning_rate": 5.2280701754385966e-05, + "loss": 0.8643, + "mean_token_accuracy": 0.7991088628768921, + "num_tokens": 16996438.0, + "step": 1354 + }, + { + "entropy": 0.9709722027182579, + "epoch": 2.2583333333333333, + "grad_norm": 0.3520893156528473, + "learning_rate": 5.216374269005848e-05, + "loss": 0.9832, + "mean_token_accuracy": 0.7662216052412987, + "num_tokens": 17008825.0, + "step": 1355 + }, + { + "entropy": 1.1532756462693214, + "epoch": 2.26, + "grad_norm": 0.4352859556674957, + "learning_rate": 5.2046783625731e-05, + "loss": 1.1474, + "mean_token_accuracy": 0.7404665350914001, + "num_tokens": 17021349.0, + "step": 1356 + }, + { + "entropy": 1.0780869722366333, + "epoch": 2.2616666666666667, + "grad_norm": 0.41774484515190125, + "learning_rate": 5.192982456140351e-05, + "loss": 1.0469, + "mean_token_accuracy": 0.7495746538043022, + "num_tokens": 17034042.0, + "step": 1357 + }, + { + "entropy": 0.8776328600943089, + "epoch": 2.263333333333333, + "grad_norm": 0.5010235905647278, + "learning_rate": 5.1812865497076024e-05, + "loss": 0.8583, + "mean_token_accuracy": 0.7983968332409859, + "num_tokens": 17046895.0, + "step": 1358 + }, + { + "entropy": 0.9787009358406067, + "epoch": 2.265, + "grad_norm": 0.47016122937202454, + "learning_rate": 5.169590643274854e-05, + "loss": 0.9504, + "mean_token_accuracy": 0.7770741358399391, + "num_tokens": 17059591.0, + "step": 1359 + }, + { + "entropy": 1.0662935376167297, + "epoch": 2.2666666666666666, + "grad_norm": 0.4710860848426819, + "learning_rate": 5.157894736842106e-05, + "loss": 1.0395, + "mean_token_accuracy": 0.7559436112642288, + "num_tokens": 17072314.0, + "step": 1360 + }, + { + "entropy": 1.0282399132847786, + "epoch": 2.2683333333333335, + "grad_norm": 0.47713732719421387, + "learning_rate": 5.1461988304093566e-05, + "loss": 1.006, + "mean_token_accuracy": 0.7650522887706757, + "num_tokens": 17084913.0, + "step": 1361 + }, + { + "entropy": 1.043331265449524, + "epoch": 2.27, + "grad_norm": 0.3616006374359131, + "learning_rate": 5.134502923976608e-05, + "loss": 1.0325, + "mean_token_accuracy": 0.7531749531626701, + "num_tokens": 17097574.0, + "step": 1362 + }, + { + "entropy": 0.9881621971726418, + "epoch": 2.2716666666666665, + "grad_norm": 0.37835896015167236, + "learning_rate": 5.1228070175438605e-05, + "loss": 0.9585, + "mean_token_accuracy": 0.7672366797924042, + "num_tokens": 17110078.0, + "step": 1363 + }, + { + "entropy": 1.1365701109170914, + "epoch": 2.2733333333333334, + "grad_norm": 0.8861700296401978, + "learning_rate": 5.111111111111111e-05, + "loss": 1.1198, + "mean_token_accuracy": 0.7398519292473793, + "num_tokens": 17122710.0, + "step": 1364 + }, + { + "entropy": 1.1724487319588661, + "epoch": 2.275, + "grad_norm": 0.341764897108078, + "learning_rate": 5.0994152046783624e-05, + "loss": 1.1484, + "mean_token_accuracy": 0.7318525314331055, + "num_tokens": 17135245.0, + "step": 1365 + }, + { + "entropy": 1.046781301498413, + "epoch": 2.276666666666667, + "grad_norm": 0.5951229929924011, + "learning_rate": 5.087719298245615e-05, + "loss": 1.0135, + "mean_token_accuracy": 0.7535328194499016, + "num_tokens": 17147572.0, + "step": 1366 + }, + { + "entropy": 0.9836950078606606, + "epoch": 2.2783333333333333, + "grad_norm": 0.33295539021492004, + "learning_rate": 5.076023391812865e-05, + "loss": 0.9734, + "mean_token_accuracy": 0.7714741080999374, + "num_tokens": 17160001.0, + "step": 1367 + }, + { + "entropy": 1.0225476697087288, + "epoch": 2.2800000000000002, + "grad_norm": 0.364423006772995, + "learning_rate": 5.064327485380117e-05, + "loss": 1.0192, + "mean_token_accuracy": 0.7587759718298912, + "num_tokens": 17172308.0, + "step": 1368 + }, + { + "entropy": 0.9102307036519051, + "epoch": 2.2816666666666667, + "grad_norm": 0.41601067781448364, + "learning_rate": 5.052631578947369e-05, + "loss": 0.8893, + "mean_token_accuracy": 0.7857940495014191, + "num_tokens": 17184830.0, + "step": 1369 + }, + { + "entropy": 1.0207800790667534, + "epoch": 2.283333333333333, + "grad_norm": 0.4100296199321747, + "learning_rate": 5.0409356725146206e-05, + "loss": 1.0173, + "mean_token_accuracy": 0.7640554904937744, + "num_tokens": 17197396.0, + "step": 1370 + }, + { + "entropy": 0.9694831073284149, + "epoch": 2.285, + "grad_norm": 0.3034035861492157, + "learning_rate": 5.0292397660818715e-05, + "loss": 0.9673, + "mean_token_accuracy": 0.7726395651698112, + "num_tokens": 17209857.0, + "step": 1371 + }, + { + "entropy": 0.9977451115846634, + "epoch": 2.2866666666666666, + "grad_norm": 0.3630425035953522, + "learning_rate": 5.017543859649123e-05, + "loss": 1.0045, + "mean_token_accuracy": 0.7612852081656456, + "num_tokens": 17222566.0, + "step": 1372 + }, + { + "entropy": 0.7503664679825306, + "epoch": 2.288333333333333, + "grad_norm": 0.39784306287765503, + "learning_rate": 5.005847953216375e-05, + "loss": 0.7382, + "mean_token_accuracy": 0.8181203901767731, + "num_tokens": 17235045.0, + "step": 1373 + }, + { + "entropy": 1.0640960857272148, + "epoch": 2.29, + "grad_norm": 0.34543782472610474, + "learning_rate": 4.9941520467836264e-05, + "loss": 1.0438, + "mean_token_accuracy": 0.7567973583936691, + "num_tokens": 17247637.0, + "step": 1374 + }, + { + "entropy": 1.0128931030631065, + "epoch": 2.2916666666666665, + "grad_norm": 0.39248374104499817, + "learning_rate": 4.9824561403508773e-05, + "loss": 1.0173, + "mean_token_accuracy": 0.757995493710041, + "num_tokens": 17260592.0, + "step": 1375 + }, + { + "entropy": 0.9241216853260994, + "epoch": 2.2933333333333334, + "grad_norm": 0.369783878326416, + "learning_rate": 4.970760233918128e-05, + "loss": 0.9299, + "mean_token_accuracy": 0.7820791676640511, + "num_tokens": 17273154.0, + "step": 1376 + }, + { + "entropy": 0.98366928845644, + "epoch": 2.295, + "grad_norm": 0.32785898447036743, + "learning_rate": 4.9590643274853806e-05, + "loss": 0.9758, + "mean_token_accuracy": 0.7671954706311226, + "num_tokens": 17285626.0, + "step": 1377 + }, + { + "entropy": 0.9590506628155708, + "epoch": 2.296666666666667, + "grad_norm": 0.4462047815322876, + "learning_rate": 4.9473684210526315e-05, + "loss": 0.9667, + "mean_token_accuracy": 0.7689107283949852, + "num_tokens": 17298152.0, + "step": 1378 + }, + { + "entropy": 0.8879049271345139, + "epoch": 2.2983333333333333, + "grad_norm": 0.6436523199081421, + "learning_rate": 4.935672514619883e-05, + "loss": 0.8695, + "mean_token_accuracy": 0.7915430590510368, + "num_tokens": 17310616.0, + "step": 1379 + }, + { + "entropy": 0.976331889629364, + "epoch": 2.3, + "grad_norm": 0.38645899295806885, + "learning_rate": 4.923976608187135e-05, + "loss": 0.9414, + "mean_token_accuracy": 0.7720116227865219, + "num_tokens": 17323089.0, + "step": 1380 + }, + { + "entropy": 0.9235109984874725, + "epoch": 2.3016666666666667, + "grad_norm": 0.4779398739337921, + "learning_rate": 4.912280701754386e-05, + "loss": 0.9052, + "mean_token_accuracy": 0.7808584272861481, + "num_tokens": 17335586.0, + "step": 1381 + }, + { + "entropy": 0.9150906801223755, + "epoch": 2.3033333333333332, + "grad_norm": 0.6987245082855225, + "learning_rate": 4.900584795321638e-05, + "loss": 0.8952, + "mean_token_accuracy": 0.7901689186692238, + "num_tokens": 17347701.0, + "step": 1382 + }, + { + "entropy": 1.0283723548054695, + "epoch": 2.305, + "grad_norm": 0.3757184147834778, + "learning_rate": 4.888888888888889e-05, + "loss": 0.9959, + "mean_token_accuracy": 0.7702204138040543, + "num_tokens": 17360287.0, + "step": 1383 + }, + { + "entropy": 1.0523831248283386, + "epoch": 2.3066666666666666, + "grad_norm": 0.3326495885848999, + "learning_rate": 4.8771929824561406e-05, + "loss": 1.0386, + "mean_token_accuracy": 0.7531796246767044, + "num_tokens": 17372638.0, + "step": 1384 + }, + { + "entropy": 0.9708017259836197, + "epoch": 2.3083333333333336, + "grad_norm": 0.3804745674133301, + "learning_rate": 4.865497076023392e-05, + "loss": 0.9709, + "mean_token_accuracy": 0.7721543908119202, + "num_tokens": 17385346.0, + "step": 1385 + }, + { + "entropy": 0.9105298519134521, + "epoch": 2.31, + "grad_norm": 0.3721154034137726, + "learning_rate": 4.853801169590643e-05, + "loss": 0.9014, + "mean_token_accuracy": 0.783396877348423, + "num_tokens": 17397826.0, + "step": 1386 + }, + { + "entropy": 0.9587042778730392, + "epoch": 2.3116666666666665, + "grad_norm": 0.5590223073959351, + "learning_rate": 4.842105263157895e-05, + "loss": 0.9568, + "mean_token_accuracy": 0.7682449370622635, + "num_tokens": 17410077.0, + "step": 1387 + }, + { + "entropy": 0.868468314409256, + "epoch": 2.3133333333333335, + "grad_norm": 0.48390790820121765, + "learning_rate": 4.8304093567251464e-05, + "loss": 0.8674, + "mean_token_accuracy": 0.7915588393807411, + "num_tokens": 17422601.0, + "step": 1388 + }, + { + "entropy": 1.0089046433568, + "epoch": 2.315, + "grad_norm": 0.411978542804718, + "learning_rate": 4.818713450292398e-05, + "loss": 1.0145, + "mean_token_accuracy": 0.7641316577792168, + "num_tokens": 17434932.0, + "step": 1389 + }, + { + "entropy": 1.0269602611660957, + "epoch": 2.3166666666666664, + "grad_norm": 0.7659572958946228, + "learning_rate": 4.807017543859649e-05, + "loss": 1.0268, + "mean_token_accuracy": 0.7537020742893219, + "num_tokens": 17447433.0, + "step": 1390 + }, + { + "entropy": 0.9385881945490837, + "epoch": 2.3183333333333334, + "grad_norm": 0.5323599576950073, + "learning_rate": 4.7953216374269006e-05, + "loss": 0.9092, + "mean_token_accuracy": 0.7828353866934776, + "num_tokens": 17459790.0, + "step": 1391 + }, + { + "entropy": 1.1370372027158737, + "epoch": 2.32, + "grad_norm": 0.38865986466407776, + "learning_rate": 4.783625730994152e-05, + "loss": 1.1373, + "mean_token_accuracy": 0.7357551530003548, + "num_tokens": 17472076.0, + "step": 1392 + }, + { + "entropy": 0.834554873406887, + "epoch": 2.3216666666666668, + "grad_norm": 0.49932676553726196, + "learning_rate": 4.771929824561404e-05, + "loss": 0.7998, + "mean_token_accuracy": 0.8036729022860527, + "num_tokens": 17484323.0, + "step": 1393 + }, + { + "entropy": 0.9042144566774368, + "epoch": 2.3233333333333333, + "grad_norm": 0.37473568320274353, + "learning_rate": 4.7602339181286555e-05, + "loss": 0.8912, + "mean_token_accuracy": 0.7868322804570198, + "num_tokens": 17496793.0, + "step": 1394 + }, + { + "entropy": 1.0557275265455246, + "epoch": 2.325, + "grad_norm": 0.3701728880405426, + "learning_rate": 4.7485380116959065e-05, + "loss": 1.0479, + "mean_token_accuracy": 0.754970870912075, + "num_tokens": 17509332.0, + "step": 1395 + }, + { + "entropy": 1.0963630303740501, + "epoch": 2.3266666666666667, + "grad_norm": 0.3401143550872803, + "learning_rate": 4.736842105263158e-05, + "loss": 1.1029, + "mean_token_accuracy": 0.7424159646034241, + "num_tokens": 17522106.0, + "step": 1396 + }, + { + "entropy": 0.9811793863773346, + "epoch": 2.328333333333333, + "grad_norm": 0.5580219626426697, + "learning_rate": 4.72514619883041e-05, + "loss": 0.9812, + "mean_token_accuracy": 0.765669047832489, + "num_tokens": 17534600.0, + "step": 1397 + }, + { + "entropy": 0.9123764485120773, + "epoch": 2.33, + "grad_norm": 0.46365654468536377, + "learning_rate": 4.7134502923976607e-05, + "loss": 0.8968, + "mean_token_accuracy": 0.7805200591683388, + "num_tokens": 17547071.0, + "step": 1398 + }, + { + "entropy": 1.0711625665426254, + "epoch": 2.3316666666666666, + "grad_norm": 0.37433910369873047, + "learning_rate": 4.701754385964913e-05, + "loss": 1.0559, + "mean_token_accuracy": 0.7528220415115356, + "num_tokens": 17559536.0, + "step": 1399 + }, + { + "entropy": 0.9382849037647247, + "epoch": 2.3333333333333335, + "grad_norm": 0.45902541279792786, + "learning_rate": 4.690058479532164e-05, + "loss": 0.9257, + "mean_token_accuracy": 0.7820262312889099, + "num_tokens": 17572095.0, + "step": 1400 + }, + { + "entropy": 0.9464045315980911, + "epoch": 2.335, + "grad_norm": 0.38684797286987305, + "learning_rate": 4.678362573099415e-05, + "loss": 0.9321, + "mean_token_accuracy": 0.7770356684923172, + "num_tokens": 17584624.0, + "step": 1401 + }, + { + "entropy": 0.9594104662537575, + "epoch": 2.336666666666667, + "grad_norm": 0.42251867055892944, + "learning_rate": 4.666666666666667e-05, + "loss": 0.9388, + "mean_token_accuracy": 0.7785516977310181, + "num_tokens": 17597454.0, + "step": 1402 + }, + { + "entropy": 1.033950388431549, + "epoch": 2.3383333333333334, + "grad_norm": 0.3799275755882263, + "learning_rate": 4.654970760233918e-05, + "loss": 1.0232, + "mean_token_accuracy": 0.7573679387569427, + "num_tokens": 17609960.0, + "step": 1403 + }, + { + "entropy": 1.0632849484682083, + "epoch": 2.34, + "grad_norm": 0.3400690257549286, + "learning_rate": 4.64327485380117e-05, + "loss": 1.0697, + "mean_token_accuracy": 0.747526504099369, + "num_tokens": 17622503.0, + "step": 1404 + }, + { + "entropy": 1.0248740836977959, + "epoch": 2.341666666666667, + "grad_norm": 0.47778773307800293, + "learning_rate": 4.6315789473684214e-05, + "loss": 1.0235, + "mean_token_accuracy": 0.7550725936889648, + "num_tokens": 17635158.0, + "step": 1405 + }, + { + "entropy": 0.9748602956533432, + "epoch": 2.3433333333333333, + "grad_norm": 0.43096184730529785, + "learning_rate": 4.619883040935672e-05, + "loss": 1.0036, + "mean_token_accuracy": 0.7629280313849449, + "num_tokens": 17647742.0, + "step": 1406 + }, + { + "entropy": 1.0009631663560867, + "epoch": 2.3449999999999998, + "grad_norm": 0.38473641872406006, + "learning_rate": 4.6081871345029246e-05, + "loss": 0.9622, + "mean_token_accuracy": 0.7641818448901176, + "num_tokens": 17660258.0, + "step": 1407 + }, + { + "entropy": 1.0116385519504547, + "epoch": 2.3466666666666667, + "grad_norm": 0.3373880982398987, + "learning_rate": 4.5964912280701756e-05, + "loss": 1.0132, + "mean_token_accuracy": 0.7566264197230339, + "num_tokens": 17672849.0, + "step": 1408 + }, + { + "entropy": 0.965719573199749, + "epoch": 2.348333333333333, + "grad_norm": 0.3528232276439667, + "learning_rate": 4.584795321637427e-05, + "loss": 0.9646, + "mean_token_accuracy": 0.769320622086525, + "num_tokens": 17685229.0, + "step": 1409 + }, + { + "entropy": 1.0245723873376846, + "epoch": 2.35, + "grad_norm": 0.3594391345977783, + "learning_rate": 4.573099415204679e-05, + "loss": 1.0213, + "mean_token_accuracy": 0.7601810023188591, + "num_tokens": 17697564.0, + "step": 1410 + }, + { + "entropy": 0.9542577862739563, + "epoch": 2.3516666666666666, + "grad_norm": 0.4028317928314209, + "learning_rate": 4.56140350877193e-05, + "loss": 0.9477, + "mean_token_accuracy": 0.7768222391605377, + "num_tokens": 17710116.0, + "step": 1411 + }, + { + "entropy": 1.0591498240828514, + "epoch": 2.3533333333333335, + "grad_norm": 0.35096943378448486, + "learning_rate": 4.5497076023391814e-05, + "loss": 1.0479, + "mean_token_accuracy": 0.7511708587408066, + "num_tokens": 17722922.0, + "step": 1412 + }, + { + "entropy": 1.122874990105629, + "epoch": 2.355, + "grad_norm": 0.48014557361602783, + "learning_rate": 4.538011695906433e-05, + "loss": 1.1276, + "mean_token_accuracy": 0.7345095053315163, + "num_tokens": 17735375.0, + "step": 1413 + }, + { + "entropy": 0.9548719525337219, + "epoch": 2.3566666666666665, + "grad_norm": 0.36935633420944214, + "learning_rate": 4.5263157894736846e-05, + "loss": 0.9415, + "mean_token_accuracy": 0.7843847125768661, + "num_tokens": 17747973.0, + "step": 1414 + }, + { + "entropy": 1.11712995916605, + "epoch": 2.3583333333333334, + "grad_norm": 0.3613123595714569, + "learning_rate": 4.5146198830409356e-05, + "loss": 1.0685, + "mean_token_accuracy": 0.7474715933203697, + "num_tokens": 17760248.0, + "step": 1415 + }, + { + "entropy": 0.9302668869495392, + "epoch": 2.36, + "grad_norm": 0.45687052607536316, + "learning_rate": 4.502923976608187e-05, + "loss": 0.9326, + "mean_token_accuracy": 0.7832116708159447, + "num_tokens": 17772988.0, + "step": 1416 + }, + { + "entropy": 1.0519694536924362, + "epoch": 2.361666666666667, + "grad_norm": 0.403952032327652, + "learning_rate": 4.491228070175439e-05, + "loss": 1.0079, + "mean_token_accuracy": 0.7550529465079308, + "num_tokens": 17785885.0, + "step": 1417 + }, + { + "entropy": 1.086365208029747, + "epoch": 2.3633333333333333, + "grad_norm": 0.4308124780654907, + "learning_rate": 4.4795321637426905e-05, + "loss": 1.0632, + "mean_token_accuracy": 0.7503360137343407, + "num_tokens": 17798417.0, + "step": 1418 + }, + { + "entropy": 0.8909163251519203, + "epoch": 2.365, + "grad_norm": 0.3536031246185303, + "learning_rate": 4.467836257309942e-05, + "loss": 0.8313, + "mean_token_accuracy": 0.8002294525504112, + "num_tokens": 17810914.0, + "step": 1419 + }, + { + "entropy": 0.9891779273748398, + "epoch": 2.3666666666666667, + "grad_norm": 0.3896813690662384, + "learning_rate": 4.456140350877193e-05, + "loss": 0.9712, + "mean_token_accuracy": 0.7715538889169693, + "num_tokens": 17823565.0, + "step": 1420 + }, + { + "entropy": 1.0380757227540016, + "epoch": 2.368333333333333, + "grad_norm": 0.39828822016716003, + "learning_rate": 4.4444444444444447e-05, + "loss": 1.0271, + "mean_token_accuracy": 0.7572704255580902, + "num_tokens": 17836401.0, + "step": 1421 + }, + { + "entropy": 0.9392967596650124, + "epoch": 2.37, + "grad_norm": 0.3789653778076172, + "learning_rate": 4.432748538011696e-05, + "loss": 0.9073, + "mean_token_accuracy": 0.7890078723430634, + "num_tokens": 17849078.0, + "step": 1422 + }, + { + "entropy": 0.843848530203104, + "epoch": 2.3716666666666666, + "grad_norm": 0.385623037815094, + "learning_rate": 4.421052631578947e-05, + "loss": 0.8303, + "mean_token_accuracy": 0.8040110915899277, + "num_tokens": 17861477.0, + "step": 1423 + }, + { + "entropy": 0.9434859231114388, + "epoch": 2.3733333333333335, + "grad_norm": 0.43605050444602966, + "learning_rate": 4.4093567251461995e-05, + "loss": 0.9544, + "mean_token_accuracy": 0.7695381715893745, + "num_tokens": 17873926.0, + "step": 1424 + }, + { + "entropy": 1.0431689321994781, + "epoch": 2.375, + "grad_norm": 0.38098517060279846, + "learning_rate": 4.3976608187134505e-05, + "loss": 1.0271, + "mean_token_accuracy": 0.7505878806114197, + "num_tokens": 17886406.0, + "step": 1425 + }, + { + "entropy": 0.8668869659304619, + "epoch": 2.3766666666666665, + "grad_norm": 0.4740240275859833, + "learning_rate": 4.3859649122807014e-05, + "loss": 0.8867, + "mean_token_accuracy": 0.7860351949930191, + "num_tokens": 17899108.0, + "step": 1426 + }, + { + "entropy": 0.998226173222065, + "epoch": 2.3783333333333334, + "grad_norm": 0.43642929196357727, + "learning_rate": 4.374269005847954e-05, + "loss": 0.9991, + "mean_token_accuracy": 0.7625894770026207, + "num_tokens": 17911774.0, + "step": 1427 + }, + { + "entropy": 1.1312288716435432, + "epoch": 2.38, + "grad_norm": 0.731705367565155, + "learning_rate": 4.362573099415205e-05, + "loss": 1.1616, + "mean_token_accuracy": 0.7352300584316254, + "num_tokens": 17924260.0, + "step": 1428 + }, + { + "entropy": 0.9560132697224617, + "epoch": 2.381666666666667, + "grad_norm": 0.3722324073314667, + "learning_rate": 4.350877192982456e-05, + "loss": 0.9538, + "mean_token_accuracy": 0.7685009241104126, + "num_tokens": 17936503.0, + "step": 1429 + }, + { + "entropy": 0.925403892993927, + "epoch": 2.3833333333333333, + "grad_norm": 0.3431994616985321, + "learning_rate": 4.339181286549708e-05, + "loss": 0.9155, + "mean_token_accuracy": 0.7818304002285004, + "num_tokens": 17949168.0, + "step": 1430 + }, + { + "entropy": 1.024433709681034, + "epoch": 2.385, + "grad_norm": 0.6439318656921387, + "learning_rate": 4.327485380116959e-05, + "loss": 1.0217, + "mean_token_accuracy": 0.7565485537052155, + "num_tokens": 17961757.0, + "step": 1431 + }, + { + "entropy": 0.9094724208116531, + "epoch": 2.3866666666666667, + "grad_norm": 0.6560204029083252, + "learning_rate": 4.3157894736842105e-05, + "loss": 0.8842, + "mean_token_accuracy": 0.7887041494250298, + "num_tokens": 17974342.0, + "step": 1432 + }, + { + "entropy": 0.8868458643555641, + "epoch": 2.388333333333333, + "grad_norm": 0.46793970465660095, + "learning_rate": 4.304093567251462e-05, + "loss": 0.8846, + "mean_token_accuracy": 0.7890603691339493, + "num_tokens": 17987083.0, + "step": 1433 + }, + { + "entropy": 0.9005289226770401, + "epoch": 2.39, + "grad_norm": 0.38978397846221924, + "learning_rate": 4.292397660818714e-05, + "loss": 0.8938, + "mean_token_accuracy": 0.782791867852211, + "num_tokens": 17999827.0, + "step": 1434 + }, + { + "entropy": 0.9923791363835335, + "epoch": 2.3916666666666666, + "grad_norm": 0.7273948788642883, + "learning_rate": 4.2807017543859654e-05, + "loss": 0.9815, + "mean_token_accuracy": 0.7701498419046402, + "num_tokens": 18012354.0, + "step": 1435 + }, + { + "entropy": 1.0210438519716263, + "epoch": 2.3933333333333335, + "grad_norm": 0.7376374006271362, + "learning_rate": 4.269005847953216e-05, + "loss": 0.9966, + "mean_token_accuracy": 0.7644841596484184, + "num_tokens": 18024833.0, + "step": 1436 + }, + { + "entropy": 0.9516857638955116, + "epoch": 2.395, + "grad_norm": 0.35253602266311646, + "learning_rate": 4.257309941520468e-05, + "loss": 0.9323, + "mean_token_accuracy": 0.7826920300722122, + "num_tokens": 18037262.0, + "step": 1437 + }, + { + "entropy": 1.065951719880104, + "epoch": 2.3966666666666665, + "grad_norm": 0.5512713193893433, + "learning_rate": 4.2456140350877196e-05, + "loss": 1.044, + "mean_token_accuracy": 0.7530983835458755, + "num_tokens": 18049715.0, + "step": 1438 + }, + { + "entropy": 0.9494584500789642, + "epoch": 2.3983333333333334, + "grad_norm": 0.5576030015945435, + "learning_rate": 4.233918128654971e-05, + "loss": 0.9224, + "mean_token_accuracy": 0.7802279368042946, + "num_tokens": 18062197.0, + "step": 1439 + }, + { + "entropy": 1.0058221891522408, + "epoch": 2.4, + "grad_norm": 0.5195385217666626, + "learning_rate": 4.222222222222222e-05, + "loss": 1.0022, + "mean_token_accuracy": 0.7639900669455528, + "num_tokens": 18074789.0, + "step": 1440 + }, + { + "epoch": 2.4, + "eval_entropy": 1.0796787294088865, + "eval_loss": 1.1247867345809937, + "eval_mean_token_accuracy": 0.73417642486186, + "eval_num_tokens": 18074789.0, + "eval_runtime": 2670.5798, + "eval_samples_per_second": 1.873, + "eval_steps_per_second": 0.937, + "step": 1440 + }, + { + "entropy": 0.9775924235582352, + "epoch": 2.401666666666667, + "grad_norm": 0.4396207928657532, + "learning_rate": 4.210526315789474e-05, + "loss": 0.97, + "mean_token_accuracy": 0.7707381621003151, + "num_tokens": 18087359.0, + "step": 1441 + }, + { + "entropy": 0.9310579523444176, + "epoch": 2.4033333333333333, + "grad_norm": 0.4029017984867096, + "learning_rate": 4.1988304093567254e-05, + "loss": 0.9228, + "mean_token_accuracy": 0.7824476584792137, + "num_tokens": 18099966.0, + "step": 1442 + }, + { + "entropy": 0.9692279025912285, + "epoch": 2.4050000000000002, + "grad_norm": 0.6140645146369934, + "learning_rate": 4.1871345029239764e-05, + "loss": 0.9755, + "mean_token_accuracy": 0.7698812261223793, + "num_tokens": 18112463.0, + "step": 1443 + }, + { + "entropy": 0.8655563145875931, + "epoch": 2.4066666666666667, + "grad_norm": 0.5127424597740173, + "learning_rate": 4.1754385964912287e-05, + "loss": 0.8721, + "mean_token_accuracy": 0.7947784885764122, + "num_tokens": 18125084.0, + "step": 1444 + }, + { + "entropy": 0.9460387006402016, + "epoch": 2.408333333333333, + "grad_norm": 0.3317752778530121, + "learning_rate": 4.1637426900584796e-05, + "loss": 0.9538, + "mean_token_accuracy": 0.7768925428390503, + "num_tokens": 18137569.0, + "step": 1445 + }, + { + "entropy": 0.8308089300990105, + "epoch": 2.41, + "grad_norm": 0.5850228071212769, + "learning_rate": 4.152046783625731e-05, + "loss": 0.7986, + "mean_token_accuracy": 0.7998393177986145, + "num_tokens": 18150177.0, + "step": 1446 + }, + { + "entropy": 0.9665387645363808, + "epoch": 2.4116666666666666, + "grad_norm": 0.4210375249385834, + "learning_rate": 4.140350877192983e-05, + "loss": 0.9485, + "mean_token_accuracy": 0.7760468646883965, + "num_tokens": 18162716.0, + "step": 1447 + }, + { + "entropy": 0.9252666160464287, + "epoch": 2.413333333333333, + "grad_norm": 0.5330778360366821, + "learning_rate": 4.128654970760234e-05, + "loss": 0.9011, + "mean_token_accuracy": 0.7870231345295906, + "num_tokens": 18175228.0, + "step": 1448 + }, + { + "entropy": 0.9812035113573074, + "epoch": 2.415, + "grad_norm": 0.34740927815437317, + "learning_rate": 4.116959064327486e-05, + "loss": 0.9782, + "mean_token_accuracy": 0.7687889486551285, + "num_tokens": 18187655.0, + "step": 1449 + }, + { + "entropy": 0.972503311932087, + "epoch": 2.4166666666666665, + "grad_norm": 0.5119990706443787, + "learning_rate": 4.105263157894737e-05, + "loss": 0.9478, + "mean_token_accuracy": 0.7767730876803398, + "num_tokens": 18199938.0, + "step": 1450 + }, + { + "entropy": 1.0573881417512894, + "epoch": 2.4183333333333334, + "grad_norm": 0.41864481568336487, + "learning_rate": 4.093567251461988e-05, + "loss": 1.0521, + "mean_token_accuracy": 0.7550124228000641, + "num_tokens": 18212399.0, + "step": 1451 + }, + { + "entropy": 1.0436188504099846, + "epoch": 2.42, + "grad_norm": 0.4275730550289154, + "learning_rate": 4.08187134502924e-05, + "loss": 1.0737, + "mean_token_accuracy": 0.7530336901545525, + "num_tokens": 18224654.0, + "step": 1452 + }, + { + "entropy": 1.0678341761231422, + "epoch": 2.421666666666667, + "grad_norm": 0.40624552965164185, + "learning_rate": 4.070175438596491e-05, + "loss": 1.0342, + "mean_token_accuracy": 0.7541193515062332, + "num_tokens": 18237061.0, + "step": 1453 + }, + { + "entropy": 1.003325767815113, + "epoch": 2.4233333333333333, + "grad_norm": 0.3441218137741089, + "learning_rate": 4.058479532163743e-05, + "loss": 0.9889, + "mean_token_accuracy": 0.7704547345638275, + "num_tokens": 18249598.0, + "step": 1454 + }, + { + "entropy": 0.9985587671399117, + "epoch": 2.425, + "grad_norm": 0.33472204208374023, + "learning_rate": 4.0467836257309945e-05, + "loss": 0.9894, + "mean_token_accuracy": 0.7677558958530426, + "num_tokens": 18262067.0, + "step": 1455 + }, + { + "entropy": 1.0390379056334496, + "epoch": 2.4266666666666667, + "grad_norm": 0.5532830953598022, + "learning_rate": 4.0350877192982455e-05, + "loss": 1.0407, + "mean_token_accuracy": 0.7571277692914009, + "num_tokens": 18274478.0, + "step": 1456 + }, + { + "entropy": 1.016162134706974, + "epoch": 2.4283333333333332, + "grad_norm": 0.39858290553092957, + "learning_rate": 4.023391812865497e-05, + "loss": 1.0137, + "mean_token_accuracy": 0.7565730661153793, + "num_tokens": 18287142.0, + "step": 1457 + }, + { + "entropy": 1.2140344008803368, + "epoch": 2.43, + "grad_norm": 0.5076939463615417, + "learning_rate": 4.011695906432749e-05, + "loss": 1.1832, + "mean_token_accuracy": 0.7223897650837898, + "num_tokens": 18299762.0, + "step": 1458 + }, + { + "entropy": 0.9129492491483688, + "epoch": 2.4316666666666666, + "grad_norm": 0.36217865347862244, + "learning_rate": 4e-05, + "loss": 0.9022, + "mean_token_accuracy": 0.7806392908096313, + "num_tokens": 18312392.0, + "step": 1459 + }, + { + "entropy": 0.9277146384119987, + "epoch": 2.4333333333333336, + "grad_norm": 0.4581172466278076, + "learning_rate": 3.988304093567252e-05, + "loss": 0.8954, + "mean_token_accuracy": 0.7858082130551338, + "num_tokens": 18325360.0, + "step": 1460 + }, + { + "entropy": 0.9304872676730156, + "epoch": 2.435, + "grad_norm": 0.4575720727443695, + "learning_rate": 3.976608187134503e-05, + "loss": 0.9198, + "mean_token_accuracy": 0.7800796404480934, + "num_tokens": 18337726.0, + "step": 1461 + }, + { + "entropy": 0.9998406544327736, + "epoch": 2.4366666666666665, + "grad_norm": 0.5538321733474731, + "learning_rate": 3.9649122807017545e-05, + "loss": 1.0006, + "mean_token_accuracy": 0.7627718448638916, + "num_tokens": 18350012.0, + "step": 1462 + }, + { + "entropy": 0.8742708638310432, + "epoch": 2.4383333333333335, + "grad_norm": 0.32581356167793274, + "learning_rate": 3.953216374269006e-05, + "loss": 0.8429, + "mean_token_accuracy": 0.7942229881882668, + "num_tokens": 18362671.0, + "step": 1463 + }, + { + "entropy": 0.96209517121315, + "epoch": 2.44, + "grad_norm": 0.4885143041610718, + "learning_rate": 3.941520467836258e-05, + "loss": 0.9596, + "mean_token_accuracy": 0.7725346311926842, + "num_tokens": 18375457.0, + "step": 1464 + }, + { + "entropy": 0.893056645989418, + "epoch": 2.4416666666666664, + "grad_norm": 0.40171849727630615, + "learning_rate": 3.929824561403509e-05, + "loss": 0.8843, + "mean_token_accuracy": 0.7862138077616692, + "num_tokens": 18388113.0, + "step": 1465 + }, + { + "entropy": 1.0815696865320206, + "epoch": 2.4433333333333334, + "grad_norm": 0.39122113585472107, + "learning_rate": 3.9181286549707604e-05, + "loss": 1.0638, + "mean_token_accuracy": 0.7479680553078651, + "num_tokens": 18400524.0, + "step": 1466 + }, + { + "entropy": 0.9530718848109245, + "epoch": 2.445, + "grad_norm": 0.3515426218509674, + "learning_rate": 3.906432748538012e-05, + "loss": 0.9316, + "mean_token_accuracy": 0.7808438464999199, + "num_tokens": 18413183.0, + "step": 1467 + }, + { + "entropy": 0.9159688428044319, + "epoch": 2.4466666666666668, + "grad_norm": 0.38037267327308655, + "learning_rate": 3.894736842105263e-05, + "loss": 0.9163, + "mean_token_accuracy": 0.779375821352005, + "num_tokens": 18425640.0, + "step": 1468 + }, + { + "entropy": 1.123489424586296, + "epoch": 2.4483333333333333, + "grad_norm": 0.4094807505607605, + "learning_rate": 3.883040935672515e-05, + "loss": 1.1264, + "mean_token_accuracy": 0.7327072098851204, + "num_tokens": 18438380.0, + "step": 1469 + }, + { + "entropy": 0.937807597219944, + "epoch": 2.45, + "grad_norm": 0.36226987838745117, + "learning_rate": 3.871345029239766e-05, + "loss": 0.9331, + "mean_token_accuracy": 0.7771824821829796, + "num_tokens": 18450696.0, + "step": 1470 + }, + { + "entropy": 0.9960529804229736, + "epoch": 2.4516666666666667, + "grad_norm": 0.3767246901988983, + "learning_rate": 3.859649122807018e-05, + "loss": 0.9779, + "mean_token_accuracy": 0.7736406475305557, + "num_tokens": 18463295.0, + "step": 1471 + }, + { + "entropy": 1.120502732694149, + "epoch": 2.453333333333333, + "grad_norm": 0.4414403736591339, + "learning_rate": 3.8479532163742694e-05, + "loss": 1.1137, + "mean_token_accuracy": 0.7398768663406372, + "num_tokens": 18476071.0, + "step": 1472 + }, + { + "entropy": 0.9994122721254826, + "epoch": 2.455, + "grad_norm": 0.5228672027587891, + "learning_rate": 3.8362573099415204e-05, + "loss": 0.9758, + "mean_token_accuracy": 0.769538126885891, + "num_tokens": 18488896.0, + "step": 1473 + }, + { + "entropy": 0.996564008295536, + "epoch": 2.4566666666666666, + "grad_norm": 0.37627217173576355, + "learning_rate": 3.824561403508773e-05, + "loss": 0.9812, + "mean_token_accuracy": 0.7634431943297386, + "num_tokens": 18501317.0, + "step": 1474 + }, + { + "entropy": 1.0408613234758377, + "epoch": 2.4583333333333335, + "grad_norm": 0.34794583916664124, + "learning_rate": 3.8128654970760236e-05, + "loss": 1.0409, + "mean_token_accuracy": 0.7593723088502884, + "num_tokens": 18513934.0, + "step": 1475 + }, + { + "entropy": 0.9645885825157166, + "epoch": 2.46, + "grad_norm": 0.35473302006721497, + "learning_rate": 3.8011695906432746e-05, + "loss": 0.9487, + "mean_token_accuracy": 0.7796276733279228, + "num_tokens": 18526481.0, + "step": 1476 + }, + { + "entropy": 0.9993656426668167, + "epoch": 2.461666666666667, + "grad_norm": 0.6172723770141602, + "learning_rate": 3.789473684210527e-05, + "loss": 0.9774, + "mean_token_accuracy": 0.7698904275894165, + "num_tokens": 18538931.0, + "step": 1477 + }, + { + "entropy": 1.036060705780983, + "epoch": 2.4633333333333334, + "grad_norm": 0.4515897035598755, + "learning_rate": 3.777777777777778e-05, + "loss": 1.0131, + "mean_token_accuracy": 0.7584156319499016, + "num_tokens": 18551795.0, + "step": 1478 + }, + { + "entropy": 1.071168415248394, + "epoch": 2.465, + "grad_norm": 0.32726114988327026, + "learning_rate": 3.7660818713450294e-05, + "loss": 1.0727, + "mean_token_accuracy": 0.7522844672203064, + "num_tokens": 18564559.0, + "step": 1479 + }, + { + "entropy": 0.8098116889595985, + "epoch": 2.466666666666667, + "grad_norm": 0.36809012293815613, + "learning_rate": 3.754385964912281e-05, + "loss": 0.7787, + "mean_token_accuracy": 0.8122852146625519, + "num_tokens": 18577068.0, + "step": 1480 + }, + { + "entropy": 1.2138219326734543, + "epoch": 2.4683333333333333, + "grad_norm": 0.5272742509841919, + "learning_rate": 3.742690058479532e-05, + "loss": 1.2165, + "mean_token_accuracy": 0.7226243764162064, + "num_tokens": 18589541.0, + "step": 1481 + }, + { + "entropy": 1.0125328078866005, + "epoch": 2.4699999999999998, + "grad_norm": 0.683719277381897, + "learning_rate": 3.7309941520467836e-05, + "loss": 1.0023, + "mean_token_accuracy": 0.7621996402740479, + "num_tokens": 18601956.0, + "step": 1482 + }, + { + "entropy": 0.9330783635377884, + "epoch": 2.4716666666666667, + "grad_norm": 0.36025023460388184, + "learning_rate": 3.719298245614035e-05, + "loss": 0.9222, + "mean_token_accuracy": 0.7863496914505959, + "num_tokens": 18614520.0, + "step": 1483 + }, + { + "entropy": 0.9152974709868431, + "epoch": 2.473333333333333, + "grad_norm": 0.3496282994747162, + "learning_rate": 3.707602339181287e-05, + "loss": 0.9179, + "mean_token_accuracy": 0.7877078875899315, + "num_tokens": 18626998.0, + "step": 1484 + }, + { + "entropy": 1.0227690413594246, + "epoch": 2.475, + "grad_norm": 0.3869014382362366, + "learning_rate": 3.6959064327485385e-05, + "loss": 1.0343, + "mean_token_accuracy": 0.7536423355340958, + "num_tokens": 18639379.0, + "step": 1485 + }, + { + "entropy": 1.0270971581339836, + "epoch": 2.4766666666666666, + "grad_norm": 0.7031916379928589, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.0099, + "mean_token_accuracy": 0.7610350698232651, + "num_tokens": 18651998.0, + "step": 1486 + }, + { + "entropy": 0.9499993473291397, + "epoch": 2.4783333333333335, + "grad_norm": 0.453957736492157, + "learning_rate": 3.672514619883041e-05, + "loss": 0.9549, + "mean_token_accuracy": 0.7735193893313408, + "num_tokens": 18664471.0, + "step": 1487 + }, + { + "entropy": 1.117090955376625, + "epoch": 2.48, + "grad_norm": 0.36520126461982727, + "learning_rate": 3.660818713450293e-05, + "loss": 1.1023, + "mean_token_accuracy": 0.7412121519446373, + "num_tokens": 18677291.0, + "step": 1488 + }, + { + "entropy": 0.8686870709061623, + "epoch": 2.4816666666666665, + "grad_norm": 0.37018629908561707, + "learning_rate": 3.6491228070175443e-05, + "loss": 0.8513, + "mean_token_accuracy": 0.7947266772389412, + "num_tokens": 18689856.0, + "step": 1489 + }, + { + "entropy": 1.0467691123485565, + "epoch": 2.4833333333333334, + "grad_norm": 0.416146844625473, + "learning_rate": 3.637426900584795e-05, + "loss": 1.0297, + "mean_token_accuracy": 0.7553540989756584, + "num_tokens": 18702231.0, + "step": 1490 + }, + { + "entropy": 0.9876288920640945, + "epoch": 2.485, + "grad_norm": 0.7249135971069336, + "learning_rate": 3.625730994152047e-05, + "loss": 0.9494, + "mean_token_accuracy": 0.7757026925683022, + "num_tokens": 18714634.0, + "step": 1491 + }, + { + "entropy": 1.026538647711277, + "epoch": 2.486666666666667, + "grad_norm": 0.5086227059364319, + "learning_rate": 3.6140350877192985e-05, + "loss": 1.0216, + "mean_token_accuracy": 0.7559056803584099, + "num_tokens": 18727218.0, + "step": 1492 + }, + { + "entropy": 1.1290322616696358, + "epoch": 2.4883333333333333, + "grad_norm": 0.4001719653606415, + "learning_rate": 3.6023391812865495e-05, + "loss": 1.1469, + "mean_token_accuracy": 0.72971972823143, + "num_tokens": 18739692.0, + "step": 1493 + }, + { + "entropy": 1.024274654686451, + "epoch": 2.49, + "grad_norm": 0.40942856669425964, + "learning_rate": 3.590643274853802e-05, + "loss": 1.0228, + "mean_token_accuracy": 0.7576658725738525, + "num_tokens": 18752344.0, + "step": 1494 + }, + { + "entropy": 1.0166387408971786, + "epoch": 2.4916666666666667, + "grad_norm": 0.592272937297821, + "learning_rate": 3.578947368421053e-05, + "loss": 1.0035, + "mean_token_accuracy": 0.7658247873187065, + "num_tokens": 18764740.0, + "step": 1495 + }, + { + "entropy": 0.9203286692500114, + "epoch": 2.493333333333333, + "grad_norm": 0.3312186598777771, + "learning_rate": 3.5672514619883044e-05, + "loss": 0.8966, + "mean_token_accuracy": 0.7830562368035316, + "num_tokens": 18777434.0, + "step": 1496 + }, + { + "entropy": 1.0274470299482346, + "epoch": 2.495, + "grad_norm": 0.47624674439430237, + "learning_rate": 3.555555555555556e-05, + "loss": 1.0194, + "mean_token_accuracy": 0.7651049792766571, + "num_tokens": 18789938.0, + "step": 1497 + }, + { + "entropy": 1.0490001514554024, + "epoch": 2.4966666666666666, + "grad_norm": 0.3425477147102356, + "learning_rate": 3.543859649122807e-05, + "loss": 1.0557, + "mean_token_accuracy": 0.7523152008652687, + "num_tokens": 18802244.0, + "step": 1498 + }, + { + "entropy": 0.9781380966305733, + "epoch": 2.4983333333333335, + "grad_norm": 0.4817301630973816, + "learning_rate": 3.5321637426900586e-05, + "loss": 0.9518, + "mean_token_accuracy": 0.7738873139023781, + "num_tokens": 18814540.0, + "step": 1499 + }, + { + "entropy": 1.0649482309818268, + "epoch": 2.5, + "grad_norm": 0.806278645992279, + "learning_rate": 3.52046783625731e-05, + "loss": 1.0481, + "mean_token_accuracy": 0.7527475655078888, + "num_tokens": 18827149.0, + "step": 1500 + }, + { + "entropy": 1.1663037165999413, + "epoch": 2.501666666666667, + "grad_norm": 0.34465131163597107, + "learning_rate": 3.508771929824561e-05, + "loss": 1.1521, + "mean_token_accuracy": 0.7319402098655701, + "num_tokens": 18839677.0, + "step": 1501 + }, + { + "entropy": 0.9615538790822029, + "epoch": 2.5033333333333334, + "grad_norm": 0.4134405255317688, + "learning_rate": 3.4970760233918134e-05, + "loss": 0.9513, + "mean_token_accuracy": 0.773155614733696, + "num_tokens": 18852368.0, + "step": 1502 + }, + { + "entropy": 1.085058145225048, + "epoch": 2.505, + "grad_norm": 0.4559493362903595, + "learning_rate": 3.4853801169590644e-05, + "loss": 1.0672, + "mean_token_accuracy": 0.7496365085244179, + "num_tokens": 18864685.0, + "step": 1503 + }, + { + "entropy": 1.0483967959880829, + "epoch": 2.506666666666667, + "grad_norm": 0.3874960243701935, + "learning_rate": 3.473684210526316e-05, + "loss": 1.0364, + "mean_token_accuracy": 0.753168523311615, + "num_tokens": 18876956.0, + "step": 1504 + }, + { + "entropy": 0.9937590956687927, + "epoch": 2.5083333333333333, + "grad_norm": 0.5102173089981079, + "learning_rate": 3.4619883040935676e-05, + "loss": 0.9986, + "mean_token_accuracy": 0.7727838605642319, + "num_tokens": 18889389.0, + "step": 1505 + }, + { + "entropy": 0.9617498740553856, + "epoch": 2.51, + "grad_norm": 0.48724833130836487, + "learning_rate": 3.4502923976608186e-05, + "loss": 0.9353, + "mean_token_accuracy": 0.7754090800881386, + "num_tokens": 18901682.0, + "step": 1506 + }, + { + "entropy": 1.0282488837838173, + "epoch": 2.5116666666666667, + "grad_norm": 0.3587839901447296, + "learning_rate": 3.43859649122807e-05, + "loss": 1.0014, + "mean_token_accuracy": 0.7635190635919571, + "num_tokens": 18914078.0, + "step": 1507 + }, + { + "entropy": 0.9259160608053207, + "epoch": 2.513333333333333, + "grad_norm": 0.3565276861190796, + "learning_rate": 3.426900584795322e-05, + "loss": 0.9156, + "mean_token_accuracy": 0.7849203050136566, + "num_tokens": 18926803.0, + "step": 1508 + }, + { + "entropy": 0.9611619710922241, + "epoch": 2.515, + "grad_norm": 0.33669984340667725, + "learning_rate": 3.4152046783625735e-05, + "loss": 0.9654, + "mean_token_accuracy": 0.7725402861833572, + "num_tokens": 18939312.0, + "step": 1509 + }, + { + "entropy": 0.981966145336628, + "epoch": 2.5166666666666666, + "grad_norm": 0.4173159897327423, + "learning_rate": 3.403508771929825e-05, + "loss": 0.983, + "mean_token_accuracy": 0.7673230543732643, + "num_tokens": 18951932.0, + "step": 1510 + }, + { + "entropy": 0.8428291901946068, + "epoch": 2.5183333333333335, + "grad_norm": 0.49428561329841614, + "learning_rate": 3.391812865497076e-05, + "loss": 0.8179, + "mean_token_accuracy": 0.7984678149223328, + "num_tokens": 18964757.0, + "step": 1511 + }, + { + "entropy": 0.904907688498497, + "epoch": 2.52, + "grad_norm": 0.3907698094844818, + "learning_rate": 3.380116959064328e-05, + "loss": 0.8958, + "mean_token_accuracy": 0.7823513001203537, + "num_tokens": 18977378.0, + "step": 1512 + }, + { + "entropy": 0.8524839282035828, + "epoch": 2.5216666666666665, + "grad_norm": 0.3634012043476105, + "learning_rate": 3.368421052631579e-05, + "loss": 0.8424, + "mean_token_accuracy": 0.7951704487204552, + "num_tokens": 18989942.0, + "step": 1513 + }, + { + "entropy": 1.0936120599508286, + "epoch": 2.5233333333333334, + "grad_norm": 0.7154280543327332, + "learning_rate": 3.356725146198831e-05, + "loss": 1.0834, + "mean_token_accuracy": 0.73818039894104, + "num_tokens": 19002527.0, + "step": 1514 + }, + { + "entropy": 1.1705647706985474, + "epoch": 2.525, + "grad_norm": 0.34830713272094727, + "learning_rate": 3.345029239766082e-05, + "loss": 1.1673, + "mean_token_accuracy": 0.7235186025500298, + "num_tokens": 19014994.0, + "step": 1515 + }, + { + "entropy": 1.0501948818564415, + "epoch": 2.5266666666666664, + "grad_norm": 0.43134674429893494, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.0322, + "mean_token_accuracy": 0.7616890296339989, + "num_tokens": 19027643.0, + "step": 1516 + }, + { + "entropy": 0.979412391781807, + "epoch": 2.5283333333333333, + "grad_norm": 0.4833930432796478, + "learning_rate": 3.321637426900585e-05, + "loss": 0.9932, + "mean_token_accuracy": 0.7695396468043327, + "num_tokens": 19040183.0, + "step": 1517 + }, + { + "entropy": 0.8569963127374649, + "epoch": 2.5300000000000002, + "grad_norm": 0.43490472435951233, + "learning_rate": 3.309941520467836e-05, + "loss": 0.847, + "mean_token_accuracy": 0.8029119372367859, + "num_tokens": 19052804.0, + "step": 1518 + }, + { + "entropy": 1.0245169177651405, + "epoch": 2.5316666666666667, + "grad_norm": 0.34944966435432434, + "learning_rate": 3.2982456140350884e-05, + "loss": 1.018, + "mean_token_accuracy": 0.7651989385485649, + "num_tokens": 19065594.0, + "step": 1519 + }, + { + "entropy": 1.061990074813366, + "epoch": 2.533333333333333, + "grad_norm": 0.3310334384441376, + "learning_rate": 3.286549707602339e-05, + "loss": 1.0605, + "mean_token_accuracy": 0.7491501048207283, + "num_tokens": 19078371.0, + "step": 1520 + }, + { + "entropy": 0.9716204106807709, + "epoch": 2.535, + "grad_norm": 0.4097890555858612, + "learning_rate": 3.274853801169591e-05, + "loss": 0.9585, + "mean_token_accuracy": 0.7635811790823936, + "num_tokens": 19090858.0, + "step": 1521 + }, + { + "entropy": 0.9108923450112343, + "epoch": 2.5366666666666666, + "grad_norm": 0.46846216917037964, + "learning_rate": 3.2631578947368426e-05, + "loss": 0.8741, + "mean_token_accuracy": 0.7925085723400116, + "num_tokens": 19103242.0, + "step": 1522 + }, + { + "entropy": 0.9476322382688522, + "epoch": 2.538333333333333, + "grad_norm": 0.33571940660476685, + "learning_rate": 3.2514619883040935e-05, + "loss": 0.9514, + "mean_token_accuracy": 0.7766336873173714, + "num_tokens": 19115868.0, + "step": 1523 + }, + { + "entropy": 1.0243253335356712, + "epoch": 2.54, + "grad_norm": 0.3860994279384613, + "learning_rate": 3.239766081871345e-05, + "loss": 1.0148, + "mean_token_accuracy": 0.7617216110229492, + "num_tokens": 19128686.0, + "step": 1524 + }, + { + "entropy": 1.027320757508278, + "epoch": 2.5416666666666665, + "grad_norm": 0.36819013953208923, + "learning_rate": 3.228070175438597e-05, + "loss": 1.0216, + "mean_token_accuracy": 0.7593289390206337, + "num_tokens": 19141368.0, + "step": 1525 + }, + { + "entropy": 1.040614478290081, + "epoch": 2.5433333333333334, + "grad_norm": 0.4054155945777893, + "learning_rate": 3.216374269005848e-05, + "loss": 1.0307, + "mean_token_accuracy": 0.7575199753046036, + "num_tokens": 19153828.0, + "step": 1526 + }, + { + "entropy": 1.072327844798565, + "epoch": 2.545, + "grad_norm": 0.42650023102760315, + "learning_rate": 3.2046783625731e-05, + "loss": 1.0672, + "mean_token_accuracy": 0.7499737814068794, + "num_tokens": 19166331.0, + "step": 1527 + }, + { + "entropy": 1.0014429613947868, + "epoch": 2.546666666666667, + "grad_norm": 0.40855589509010315, + "learning_rate": 3.192982456140351e-05, + "loss": 0.9656, + "mean_token_accuracy": 0.7691745683550835, + "num_tokens": 19178786.0, + "step": 1528 + }, + { + "entropy": 1.0100915431976318, + "epoch": 2.5483333333333333, + "grad_norm": 0.9716803431510925, + "learning_rate": 3.1812865497076026e-05, + "loss": 1.0088, + "mean_token_accuracy": 0.7613450139760971, + "num_tokens": 19191528.0, + "step": 1529 + }, + { + "entropy": 1.0127575770020485, + "epoch": 2.55, + "grad_norm": 0.5951594710350037, + "learning_rate": 3.169590643274854e-05, + "loss": 0.9975, + "mean_token_accuracy": 0.7631114646792412, + "num_tokens": 19204005.0, + "step": 1530 + }, + { + "entropy": 1.0337045267224312, + "epoch": 2.5516666666666667, + "grad_norm": 0.34800878167152405, + "learning_rate": 3.157894736842105e-05, + "loss": 1.026, + "mean_token_accuracy": 0.755635216832161, + "num_tokens": 19216559.0, + "step": 1531 + }, + { + "entropy": 1.0046528205275536, + "epoch": 2.5533333333333332, + "grad_norm": 1.2115473747253418, + "learning_rate": 3.146198830409357e-05, + "loss": 0.9952, + "mean_token_accuracy": 0.760844275355339, + "num_tokens": 19228952.0, + "step": 1532 + }, + { + "entropy": 0.9489381983876228, + "epoch": 2.555, + "grad_norm": 1.0763698816299438, + "learning_rate": 3.1345029239766084e-05, + "loss": 0.9473, + "mean_token_accuracy": 0.7689626663923264, + "num_tokens": 19241519.0, + "step": 1533 + }, + { + "entropy": 0.919933371245861, + "epoch": 2.5566666666666666, + "grad_norm": 0.3880161941051483, + "learning_rate": 3.12280701754386e-05, + "loss": 0.9007, + "mean_token_accuracy": 0.7861380577087402, + "num_tokens": 19254113.0, + "step": 1534 + }, + { + "entropy": 0.9838115721940994, + "epoch": 2.5583333333333336, + "grad_norm": 0.33529192209243774, + "learning_rate": 3.111111111111111e-05, + "loss": 0.972, + "mean_token_accuracy": 0.7701843157410622, + "num_tokens": 19266569.0, + "step": 1535 + }, + { + "entropy": 1.033842496573925, + "epoch": 2.56, + "grad_norm": 0.6191700100898743, + "learning_rate": 3.0994152046783626e-05, + "loss": 1.0361, + "mean_token_accuracy": 0.7588989809155464, + "num_tokens": 19279141.0, + "step": 1536 + }, + { + "entropy": 1.0123683139681816, + "epoch": 2.5616666666666665, + "grad_norm": 0.6005243062973022, + "learning_rate": 3.087719298245614e-05, + "loss": 0.9822, + "mean_token_accuracy": 0.7644473239779472, + "num_tokens": 19291380.0, + "step": 1537 + }, + { + "entropy": 0.839779444038868, + "epoch": 2.5633333333333335, + "grad_norm": 0.5065546631813049, + "learning_rate": 3.076023391812866e-05, + "loss": 0.8304, + "mean_token_accuracy": 0.8035098612308502, + "num_tokens": 19303957.0, + "step": 1538 + }, + { + "entropy": 1.0079979747533798, + "epoch": 2.565, + "grad_norm": 0.37137481570243835, + "learning_rate": 3.0643274853801175e-05, + "loss": 1.0261, + "mean_token_accuracy": 0.7604669854044914, + "num_tokens": 19316761.0, + "step": 1539 + }, + { + "entropy": 0.963746502995491, + "epoch": 2.5666666666666664, + "grad_norm": 0.3654783070087433, + "learning_rate": 3.0526315789473684e-05, + "loss": 0.9678, + "mean_token_accuracy": 0.774099662899971, + "num_tokens": 19329179.0, + "step": 1540 + }, + { + "entropy": 0.832111582159996, + "epoch": 2.5683333333333334, + "grad_norm": 0.531024694442749, + "learning_rate": 3.0409356725146197e-05, + "loss": 0.8019, + "mean_token_accuracy": 0.8066947385668755, + "num_tokens": 19342224.0, + "step": 1541 + }, + { + "entropy": 1.1391898691654205, + "epoch": 2.57, + "grad_norm": 0.38318124413490295, + "learning_rate": 3.0292397660818717e-05, + "loss": 1.1384, + "mean_token_accuracy": 0.7360123470425606, + "num_tokens": 19354255.0, + "step": 1542 + }, + { + "entropy": 0.939979076385498, + "epoch": 2.5716666666666668, + "grad_norm": 0.38007327914237976, + "learning_rate": 3.017543859649123e-05, + "loss": 0.9443, + "mean_token_accuracy": 0.7805913388729095, + "num_tokens": 19366854.0, + "step": 1543 + }, + { + "entropy": 1.0215481221675873, + "epoch": 2.5733333333333333, + "grad_norm": 0.361587792634964, + "learning_rate": 3.0058479532163746e-05, + "loss": 1.0246, + "mean_token_accuracy": 0.7535988911986351, + "num_tokens": 19379602.0, + "step": 1544 + }, + { + "entropy": 0.9393767863512039, + "epoch": 2.575, + "grad_norm": 0.4748750925064087, + "learning_rate": 2.994152046783626e-05, + "loss": 0.9202, + "mean_token_accuracy": 0.7778749465942383, + "num_tokens": 19392273.0, + "step": 1545 + }, + { + "entropy": 1.0125665217638016, + "epoch": 2.5766666666666667, + "grad_norm": 0.370896577835083, + "learning_rate": 2.9824561403508772e-05, + "loss": 0.9974, + "mean_token_accuracy": 0.7660475447773933, + "num_tokens": 19405015.0, + "step": 1546 + }, + { + "entropy": 0.9444859474897385, + "epoch": 2.578333333333333, + "grad_norm": 0.38414040207862854, + "learning_rate": 2.9707602339181288e-05, + "loss": 0.9165, + "mean_token_accuracy": 0.7838402763009071, + "num_tokens": 19417525.0, + "step": 1547 + }, + { + "entropy": 0.9054437354207039, + "epoch": 2.58, + "grad_norm": 0.3734860122203827, + "learning_rate": 2.95906432748538e-05, + "loss": 0.8771, + "mean_token_accuracy": 0.7857618629932404, + "num_tokens": 19430253.0, + "step": 1548 + }, + { + "entropy": 1.1172316670417786, + "epoch": 2.5816666666666666, + "grad_norm": 0.3424968719482422, + "learning_rate": 2.9473684210526314e-05, + "loss": 1.1002, + "mean_token_accuracy": 0.7426133081316948, + "num_tokens": 19442996.0, + "step": 1549 + }, + { + "entropy": 0.9223441183567047, + "epoch": 2.5833333333333335, + "grad_norm": 0.4439259171485901, + "learning_rate": 2.9356725146198833e-05, + "loss": 0.916, + "mean_token_accuracy": 0.7827012911438942, + "num_tokens": 19455446.0, + "step": 1550 + }, + { + "entropy": 1.0415283143520355, + "epoch": 2.585, + "grad_norm": 0.4243526756763458, + "learning_rate": 2.9239766081871346e-05, + "loss": 1.022, + "mean_token_accuracy": 0.7620616406202316, + "num_tokens": 19468106.0, + "step": 1551 + }, + { + "entropy": 0.9273224174976349, + "epoch": 2.586666666666667, + "grad_norm": 0.47626543045043945, + "learning_rate": 2.9122807017543863e-05, + "loss": 0.9094, + "mean_token_accuracy": 0.7869971543550491, + "num_tokens": 19480673.0, + "step": 1552 + }, + { + "entropy": 1.0693067982792854, + "epoch": 2.5883333333333334, + "grad_norm": 0.37999090552330017, + "learning_rate": 2.9005847953216375e-05, + "loss": 1.0586, + "mean_token_accuracy": 0.7553419768810272, + "num_tokens": 19493300.0, + "step": 1553 + }, + { + "entropy": 0.9354708418250084, + "epoch": 2.59, + "grad_norm": 0.35718834400177, + "learning_rate": 2.8888888888888888e-05, + "loss": 0.9316, + "mean_token_accuracy": 0.7715967372059822, + "num_tokens": 19505982.0, + "step": 1554 + }, + { + "entropy": 1.0914480835199356, + "epoch": 2.591666666666667, + "grad_norm": 0.3676576316356659, + "learning_rate": 2.8771929824561404e-05, + "loss": 1.1033, + "mean_token_accuracy": 0.7432594522833824, + "num_tokens": 19518253.0, + "step": 1555 + }, + { + "entropy": 0.9367355927824974, + "epoch": 2.5933333333333333, + "grad_norm": 0.38452738523483276, + "learning_rate": 2.8654970760233917e-05, + "loss": 0.9363, + "mean_token_accuracy": 0.7810924127697945, + "num_tokens": 19530971.0, + "step": 1556 + }, + { + "entropy": 0.9693558886647224, + "epoch": 2.5949999999999998, + "grad_norm": 0.35174238681793213, + "learning_rate": 2.8538011695906437e-05, + "loss": 0.9631, + "mean_token_accuracy": 0.77150858938694, + "num_tokens": 19543602.0, + "step": 1557 + }, + { + "entropy": 1.2319720908999443, + "epoch": 2.5966666666666667, + "grad_norm": 0.35031840205192566, + "learning_rate": 2.842105263157895e-05, + "loss": 1.2466, + "mean_token_accuracy": 0.7065985575318336, + "num_tokens": 19556123.0, + "step": 1558 + }, + { + "entropy": 0.9793859273195267, + "epoch": 2.5983333333333336, + "grad_norm": 0.5962924957275391, + "learning_rate": 2.830409356725146e-05, + "loss": 0.9649, + "mean_token_accuracy": 0.7668005377054214, + "num_tokens": 19568779.0, + "step": 1559 + }, + { + "entropy": 0.9815565198659897, + "epoch": 2.6, + "grad_norm": 0.38826265931129456, + "learning_rate": 2.818713450292398e-05, + "loss": 0.9607, + "mean_token_accuracy": 0.7701145485043526, + "num_tokens": 19581610.0, + "step": 1560 + }, + { + "entropy": 1.0949571281671524, + "epoch": 2.6016666666666666, + "grad_norm": 0.39656704664230347, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.0889, + "mean_token_accuracy": 0.7468261495232582, + "num_tokens": 19594128.0, + "step": 1561 + }, + { + "entropy": 0.958502359688282, + "epoch": 2.6033333333333335, + "grad_norm": 0.6246112585067749, + "learning_rate": 2.7953216374269008e-05, + "loss": 0.9366, + "mean_token_accuracy": 0.7778401374816895, + "num_tokens": 19606561.0, + "step": 1562 + }, + { + "entropy": 0.9954840168356895, + "epoch": 2.605, + "grad_norm": 0.37746328115463257, + "learning_rate": 2.783625730994152e-05, + "loss": 0.9716, + "mean_token_accuracy": 0.7661651894450188, + "num_tokens": 19619160.0, + "step": 1563 + }, + { + "entropy": 1.0242468938231468, + "epoch": 2.6066666666666665, + "grad_norm": 0.4247598946094513, + "learning_rate": 2.7719298245614034e-05, + "loss": 1.0034, + "mean_token_accuracy": 0.7616144269704819, + "num_tokens": 19631737.0, + "step": 1564 + }, + { + "entropy": 0.859401136636734, + "epoch": 2.6083333333333334, + "grad_norm": 0.460211843252182, + "learning_rate": 2.760233918128655e-05, + "loss": 0.8309, + "mean_token_accuracy": 0.7996894121170044, + "num_tokens": 19644234.0, + "step": 1565 + }, + { + "entropy": 1.0404495373368263, + "epoch": 2.61, + "grad_norm": 0.3696504831314087, + "learning_rate": 2.7485380116959063e-05, + "loss": 1.0253, + "mean_token_accuracy": 0.756307564675808, + "num_tokens": 19656603.0, + "step": 1566 + }, + { + "entropy": 0.9299256652593613, + "epoch": 2.611666666666667, + "grad_norm": 0.35621878504753113, + "learning_rate": 2.7368421052631583e-05, + "loss": 0.9094, + "mean_token_accuracy": 0.7813384011387825, + "num_tokens": 19669347.0, + "step": 1567 + }, + { + "entropy": 0.9908732026815414, + "epoch": 2.6133333333333333, + "grad_norm": 0.4020984172821045, + "learning_rate": 2.7251461988304095e-05, + "loss": 0.9654, + "mean_token_accuracy": 0.7673398107290268, + "num_tokens": 19681739.0, + "step": 1568 + }, + { + "entropy": 1.0475333034992218, + "epoch": 2.615, + "grad_norm": 0.39371800422668457, + "learning_rate": 2.713450292397661e-05, + "loss": 1.0688, + "mean_token_accuracy": 0.755957767367363, + "num_tokens": 19694356.0, + "step": 1569 + }, + { + "entropy": 1.0151420086622238, + "epoch": 2.6166666666666667, + "grad_norm": 0.3678935766220093, + "learning_rate": 2.7017543859649125e-05, + "loss": 0.9956, + "mean_token_accuracy": 0.7647636458277702, + "num_tokens": 19706731.0, + "step": 1570 + }, + { + "entropy": 0.9901127070188522, + "epoch": 2.618333333333333, + "grad_norm": 0.359884649515152, + "learning_rate": 2.6900584795321637e-05, + "loss": 0.9895, + "mean_token_accuracy": 0.7631267011165619, + "num_tokens": 19719189.0, + "step": 1571 + }, + { + "entropy": 0.8601501733064651, + "epoch": 2.62, + "grad_norm": 0.4630512297153473, + "learning_rate": 2.6783625730994154e-05, + "loss": 0.8749, + "mean_token_accuracy": 0.7906171232461929, + "num_tokens": 19731906.0, + "step": 1572 + }, + { + "entropy": 1.0086650848388672, + "epoch": 2.6216666666666666, + "grad_norm": 0.3412810266017914, + "learning_rate": 2.6666666666666667e-05, + "loss": 1.0108, + "mean_token_accuracy": 0.7667879238724709, + "num_tokens": 19744378.0, + "step": 1573 + }, + { + "entropy": 1.0869086980819702, + "epoch": 2.623333333333333, + "grad_norm": 0.36509355902671814, + "learning_rate": 2.654970760233918e-05, + "loss": 1.0904, + "mean_token_accuracy": 0.7425634190440178, + "num_tokens": 19756533.0, + "step": 1574 + }, + { + "entropy": 0.8585296720266342, + "epoch": 2.625, + "grad_norm": 0.3661693036556244, + "learning_rate": 2.64327485380117e-05, + "loss": 0.8514, + "mean_token_accuracy": 0.8021795675158501, + "num_tokens": 19768876.0, + "step": 1575 + }, + { + "entropy": 0.9490972980856895, + "epoch": 2.626666666666667, + "grad_norm": 0.3908548951148987, + "learning_rate": 2.6315789473684212e-05, + "loss": 0.9479, + "mean_token_accuracy": 0.7722587212920189, + "num_tokens": 19781796.0, + "step": 1576 + }, + { + "entropy": 0.901982881128788, + "epoch": 2.6283333333333334, + "grad_norm": 0.4567826986312866, + "learning_rate": 2.6198830409356728e-05, + "loss": 0.9142, + "mean_token_accuracy": 0.7837750613689423, + "num_tokens": 19794439.0, + "step": 1577 + }, + { + "entropy": 0.8791032209992409, + "epoch": 2.63, + "grad_norm": 0.40911853313446045, + "learning_rate": 2.608187134502924e-05, + "loss": 0.8703, + "mean_token_accuracy": 0.7904982343316078, + "num_tokens": 19806955.0, + "step": 1578 + }, + { + "entropy": 1.014504313468933, + "epoch": 2.631666666666667, + "grad_norm": 0.3578345477581024, + "learning_rate": 2.5964912280701754e-05, + "loss": 0.9709, + "mean_token_accuracy": 0.7692977413535118, + "num_tokens": 19819650.0, + "step": 1579 + }, + { + "entropy": 1.0009704530239105, + "epoch": 2.6333333333333333, + "grad_norm": 0.36074739694595337, + "learning_rate": 2.584795321637427e-05, + "loss": 0.9779, + "mean_token_accuracy": 0.7693654671311378, + "num_tokens": 19832134.0, + "step": 1580 + }, + { + "entropy": 1.0298119634389877, + "epoch": 2.635, + "grad_norm": 0.38023653626441956, + "learning_rate": 2.5730994152046783e-05, + "loss": 1.0097, + "mean_token_accuracy": 0.7607345879077911, + "num_tokens": 19844652.0, + "step": 1581 + }, + { + "entropy": 0.8797326683998108, + "epoch": 2.6366666666666667, + "grad_norm": 0.368125855922699, + "learning_rate": 2.5614035087719303e-05, + "loss": 0.8419, + "mean_token_accuracy": 0.7969785332679749, + "num_tokens": 19857270.0, + "step": 1582 + }, + { + "entropy": 0.9885070994496346, + "epoch": 2.638333333333333, + "grad_norm": 0.5501052141189575, + "learning_rate": 2.5497076023391812e-05, + "loss": 0.9906, + "mean_token_accuracy": 0.7612092047929764, + "num_tokens": 19869633.0, + "step": 1583 + }, + { + "entropy": 1.0777137354016304, + "epoch": 2.64, + "grad_norm": 0.4588547945022583, + "learning_rate": 2.5380116959064325e-05, + "loss": 1.0708, + "mean_token_accuracy": 0.7438740506768227, + "num_tokens": 19882053.0, + "step": 1584 + }, + { + "entropy": 0.9231302738189697, + "epoch": 2.6416666666666666, + "grad_norm": 0.49727585911750793, + "learning_rate": 2.5263157894736845e-05, + "loss": 0.8955, + "mean_token_accuracy": 0.7852959036827087, + "num_tokens": 19894354.0, + "step": 1585 + }, + { + "entropy": 1.040400579571724, + "epoch": 2.6433333333333335, + "grad_norm": 0.35327550768852234, + "learning_rate": 2.5146198830409358e-05, + "loss": 1.0321, + "mean_token_accuracy": 0.751515619456768, + "num_tokens": 19906711.0, + "step": 1586 + }, + { + "entropy": 1.0184793323278427, + "epoch": 2.645, + "grad_norm": 0.35986557602882385, + "learning_rate": 2.5029239766081874e-05, + "loss": 0.9956, + "mean_token_accuracy": 0.7662120163440704, + "num_tokens": 19919357.0, + "step": 1587 + }, + { + "entropy": 0.9297609850764275, + "epoch": 2.6466666666666665, + "grad_norm": 0.4078976809978485, + "learning_rate": 2.4912280701754387e-05, + "loss": 0.911, + "mean_token_accuracy": 0.7853410243988037, + "num_tokens": 19932046.0, + "step": 1588 + }, + { + "entropy": 1.078060306608677, + "epoch": 2.6483333333333334, + "grad_norm": 0.445563405752182, + "learning_rate": 2.4795321637426903e-05, + "loss": 1.0591, + "mean_token_accuracy": 0.7507107853889465, + "num_tokens": 19944483.0, + "step": 1589 + }, + { + "entropy": 0.9507294967770576, + "epoch": 2.65, + "grad_norm": 0.5744986534118652, + "learning_rate": 2.4678362573099416e-05, + "loss": 0.9363, + "mean_token_accuracy": 0.7791957780718803, + "num_tokens": 19956903.0, + "step": 1590 + }, + { + "entropy": 1.1206394955515862, + "epoch": 2.6516666666666664, + "grad_norm": 0.36570364236831665, + "learning_rate": 2.456140350877193e-05, + "loss": 1.1156, + "mean_token_accuracy": 0.7432742789387703, + "num_tokens": 19969546.0, + "step": 1591 + }, + { + "entropy": 0.8363280594348907, + "epoch": 2.6533333333333333, + "grad_norm": 0.3354378938674927, + "learning_rate": 2.4444444444444445e-05, + "loss": 0.801, + "mean_token_accuracy": 0.8103508800268173, + "num_tokens": 19982233.0, + "step": 1592 + }, + { + "entropy": 1.1075763031840324, + "epoch": 2.6550000000000002, + "grad_norm": 0.43673840165138245, + "learning_rate": 2.432748538011696e-05, + "loss": 1.1235, + "mean_token_accuracy": 0.7423203513026237, + "num_tokens": 19994437.0, + "step": 1593 + }, + { + "entropy": 1.0388616025447845, + "epoch": 2.6566666666666667, + "grad_norm": 0.36875611543655396, + "learning_rate": 2.4210526315789474e-05, + "loss": 1.0386, + "mean_token_accuracy": 0.759231723845005, + "num_tokens": 20007103.0, + "step": 1594 + }, + { + "entropy": 1.1487151607871056, + "epoch": 2.658333333333333, + "grad_norm": 0.31443503499031067, + "learning_rate": 2.409356725146199e-05, + "loss": 1.1074, + "mean_token_accuracy": 0.7383889853954315, + "num_tokens": 20019737.0, + "step": 1595 + }, + { + "entropy": 0.9868790879845619, + "epoch": 2.66, + "grad_norm": 0.490033358335495, + "learning_rate": 2.3976608187134503e-05, + "loss": 0.9642, + "mean_token_accuracy": 0.7724264338612556, + "num_tokens": 20031998.0, + "step": 1596 + }, + { + "entropy": 1.1250801086425781, + "epoch": 2.6616666666666666, + "grad_norm": 0.43826350569725037, + "learning_rate": 2.385964912280702e-05, + "loss": 1.1149, + "mean_token_accuracy": 0.7414567396044731, + "num_tokens": 20044668.0, + "step": 1597 + }, + { + "entropy": 1.0366590395569801, + "epoch": 2.663333333333333, + "grad_norm": 0.36638733744621277, + "learning_rate": 2.3742690058479532e-05, + "loss": 1.0305, + "mean_token_accuracy": 0.7574252262711525, + "num_tokens": 20056968.0, + "step": 1598 + }, + { + "entropy": 1.0217024236917496, + "epoch": 2.665, + "grad_norm": 0.3933520019054413, + "learning_rate": 2.362573099415205e-05, + "loss": 1.0108, + "mean_token_accuracy": 0.7617498487234116, + "num_tokens": 20069172.0, + "step": 1599 + }, + { + "entropy": 1.2189399525523186, + "epoch": 2.6666666666666665, + "grad_norm": 0.37974849343299866, + "learning_rate": 2.3508771929824565e-05, + "loss": 1.2034, + "mean_token_accuracy": 0.7185921147465706, + "num_tokens": 20081751.0, + "step": 1600 + }, + { + "entropy": 0.8786380961537361, + "epoch": 2.6683333333333334, + "grad_norm": 0.37079235911369324, + "learning_rate": 2.3391812865497074e-05, + "loss": 0.8608, + "mean_token_accuracy": 0.7884936407208443, + "num_tokens": 20094434.0, + "step": 1601 + }, + { + "entropy": 0.9831163436174393, + "epoch": 2.67, + "grad_norm": 0.39722537994384766, + "learning_rate": 2.327485380116959e-05, + "loss": 0.9917, + "mean_token_accuracy": 0.7645980343222618, + "num_tokens": 20106783.0, + "step": 1602 + }, + { + "entropy": 0.9219950735569, + "epoch": 2.671666666666667, + "grad_norm": 0.4101279079914093, + "learning_rate": 2.3157894736842107e-05, + "loss": 0.9249, + "mean_token_accuracy": 0.7848934978246689, + "num_tokens": 20119366.0, + "step": 1603 + }, + { + "entropy": 0.8318170011043549, + "epoch": 2.6733333333333333, + "grad_norm": 0.3974708616733551, + "learning_rate": 2.3040935672514623e-05, + "loss": 0.8116, + "mean_token_accuracy": 0.8100480064749718, + "num_tokens": 20131870.0, + "step": 1604 + }, + { + "entropy": 0.9451627805829048, + "epoch": 2.675, + "grad_norm": 0.3505653738975525, + "learning_rate": 2.2923976608187136e-05, + "loss": 0.9579, + "mean_token_accuracy": 0.7800606712698936, + "num_tokens": 20144220.0, + "step": 1605 + }, + { + "entropy": 1.0744628980755806, + "epoch": 2.6766666666666667, + "grad_norm": 0.4347447454929352, + "learning_rate": 2.280701754385965e-05, + "loss": 1.0547, + "mean_token_accuracy": 0.751056581735611, + "num_tokens": 20156853.0, + "step": 1606 + }, + { + "entropy": 1.0362855270504951, + "epoch": 2.6783333333333332, + "grad_norm": 0.363424688577652, + "learning_rate": 2.2690058479532165e-05, + "loss": 1.0085, + "mean_token_accuracy": 0.7557000145316124, + "num_tokens": 20169596.0, + "step": 1607 + }, + { + "entropy": 1.029419258236885, + "epoch": 2.68, + "grad_norm": 0.40872007608413696, + "learning_rate": 2.2573099415204678e-05, + "loss": 1.0296, + "mean_token_accuracy": 0.7495494335889816, + "num_tokens": 20181865.0, + "step": 1608 + }, + { + "entropy": 1.0220500379800797, + "epoch": 2.6816666666666666, + "grad_norm": 0.355505108833313, + "learning_rate": 2.2456140350877194e-05, + "loss": 0.9934, + "mean_token_accuracy": 0.7668603658676147, + "num_tokens": 20194446.0, + "step": 1609 + }, + { + "entropy": 0.9161782190203667, + "epoch": 2.6833333333333336, + "grad_norm": 0.39546746015548706, + "learning_rate": 2.233918128654971e-05, + "loss": 0.8827, + "mean_token_accuracy": 0.7861305251717567, + "num_tokens": 20206916.0, + "step": 1610 + }, + { + "entropy": 1.1460915431380272, + "epoch": 2.685, + "grad_norm": 0.39181846380233765, + "learning_rate": 2.2222222222222223e-05, + "loss": 1.1242, + "mean_token_accuracy": 0.7357367277145386, + "num_tokens": 20219401.0, + "step": 1611 + }, + { + "entropy": 0.8742747977375984, + "epoch": 2.6866666666666665, + "grad_norm": 0.38525453209877014, + "learning_rate": 2.2105263157894736e-05, + "loss": 0.8574, + "mean_token_accuracy": 0.7935161367058754, + "num_tokens": 20232118.0, + "step": 1612 + }, + { + "entropy": 0.9349016323685646, + "epoch": 2.6883333333333335, + "grad_norm": 0.5899012684822083, + "learning_rate": 2.1988304093567252e-05, + "loss": 0.9085, + "mean_token_accuracy": 0.784353718161583, + "num_tokens": 20244766.0, + "step": 1613 + }, + { + "entropy": 1.0421105921268463, + "epoch": 2.69, + "grad_norm": 0.3671818673610687, + "learning_rate": 2.187134502923977e-05, + "loss": 1.059, + "mean_token_accuracy": 0.7464089542627335, + "num_tokens": 20257355.0, + "step": 1614 + }, + { + "entropy": 0.9193084388971329, + "epoch": 2.6916666666666664, + "grad_norm": 0.3241657018661499, + "learning_rate": 2.175438596491228e-05, + "loss": 0.9117, + "mean_token_accuracy": 0.7878368347883224, + "num_tokens": 20269957.0, + "step": 1615 + }, + { + "entropy": 1.1140388399362564, + "epoch": 2.6933333333333334, + "grad_norm": 0.4192982316017151, + "learning_rate": 2.1637426900584794e-05, + "loss": 1.1401, + "mean_token_accuracy": 0.7369045242667198, + "num_tokens": 20282651.0, + "step": 1616 + }, + { + "entropy": 0.9145372211933136, + "epoch": 2.695, + "grad_norm": 0.44047683477401733, + "learning_rate": 2.152046783625731e-05, + "loss": 0.9044, + "mean_token_accuracy": 0.7820262387394905, + "num_tokens": 20295315.0, + "step": 1617 + }, + { + "entropy": 1.1075597703456879, + "epoch": 2.6966666666666668, + "grad_norm": 0.32873111963272095, + "learning_rate": 2.1403508771929827e-05, + "loss": 1.097, + "mean_token_accuracy": 0.7436584010720253, + "num_tokens": 20307692.0, + "step": 1618 + }, + { + "entropy": 1.0005147382616997, + "epoch": 2.6983333333333333, + "grad_norm": 0.3647487163543701, + "learning_rate": 2.128654970760234e-05, + "loss": 0.9993, + "mean_token_accuracy": 0.7612884119153023, + "num_tokens": 20320325.0, + "step": 1619 + }, + { + "entropy": 0.896811954677105, + "epoch": 2.7, + "grad_norm": 0.3368508219718933, + "learning_rate": 2.1169590643274856e-05, + "loss": 0.8833, + "mean_token_accuracy": 0.789857380092144, + "num_tokens": 20332827.0, + "step": 1620 + }, + { + "entropy": 0.9515457004308701, + "epoch": 2.7016666666666667, + "grad_norm": 0.371025949716568, + "learning_rate": 2.105263157894737e-05, + "loss": 0.9634, + "mean_token_accuracy": 0.7750696912407875, + "num_tokens": 20345345.0, + "step": 1621 + }, + { + "entropy": 1.0203317180275917, + "epoch": 2.703333333333333, + "grad_norm": 0.3610087037086487, + "learning_rate": 2.0935672514619882e-05, + "loss": 1.0279, + "mean_token_accuracy": 0.7598353177309036, + "num_tokens": 20357838.0, + "step": 1622 + }, + { + "entropy": 0.9561098366975784, + "epoch": 2.705, + "grad_norm": 0.40472012758255005, + "learning_rate": 2.0818713450292398e-05, + "loss": 0.9371, + "mean_token_accuracy": 0.7784752324223518, + "num_tokens": 20370168.0, + "step": 1623 + }, + { + "entropy": 0.7523621320724487, + "epoch": 2.7066666666666666, + "grad_norm": 0.7040989398956299, + "learning_rate": 2.0701754385964914e-05, + "loss": 0.7393, + "mean_token_accuracy": 0.8237317577004433, + "num_tokens": 20382578.0, + "step": 1624 + }, + { + "entropy": 0.9483193829655647, + "epoch": 2.7083333333333335, + "grad_norm": 0.3585512936115265, + "learning_rate": 2.058479532163743e-05, + "loss": 0.9162, + "mean_token_accuracy": 0.7878899574279785, + "num_tokens": 20395323.0, + "step": 1625 + }, + { + "entropy": 0.8915515244007111, + "epoch": 2.71, + "grad_norm": 0.4794406592845917, + "learning_rate": 2.046783625730994e-05, + "loss": 0.8639, + "mean_token_accuracy": 0.7915221601724625, + "num_tokens": 20407957.0, + "step": 1626 + }, + { + "entropy": 1.0462695360183716, + "epoch": 2.711666666666667, + "grad_norm": 0.5657960772514343, + "learning_rate": 2.0350877192982456e-05, + "loss": 1.0777, + "mean_token_accuracy": 0.7543677762150764, + "num_tokens": 20420288.0, + "step": 1627 + }, + { + "entropy": 1.0665453746914864, + "epoch": 2.7133333333333334, + "grad_norm": 0.3548641502857208, + "learning_rate": 2.0233918128654973e-05, + "loss": 1.0509, + "mean_token_accuracy": 0.7565915361046791, + "num_tokens": 20432794.0, + "step": 1628 + }, + { + "entropy": 0.9368977323174477, + "epoch": 2.715, + "grad_norm": 0.3973557651042938, + "learning_rate": 2.0116959064327485e-05, + "loss": 0.9184, + "mean_token_accuracy": 0.7830042317509651, + "num_tokens": 20445324.0, + "step": 1629 + }, + { + "entropy": 0.9192683473229408, + "epoch": 2.716666666666667, + "grad_norm": 0.3835406005382538, + "learning_rate": 2e-05, + "loss": 0.9116, + "mean_token_accuracy": 0.7847650721669197, + "num_tokens": 20457787.0, + "step": 1630 + }, + { + "entropy": 0.8217027559876442, + "epoch": 2.7183333333333333, + "grad_norm": 0.4502755105495453, + "learning_rate": 1.9883040935672515e-05, + "loss": 0.7903, + "mean_token_accuracy": 0.8053467124700546, + "num_tokens": 20470409.0, + "step": 1631 + }, + { + "entropy": 0.9506581202149391, + "epoch": 2.7199999999999998, + "grad_norm": 0.3773946166038513, + "learning_rate": 1.976608187134503e-05, + "loss": 0.9309, + "mean_token_accuracy": 0.7757667228579521, + "num_tokens": 20482892.0, + "step": 1632 + }, + { + "entropy": 1.0465561263263226, + "epoch": 2.7216666666666667, + "grad_norm": 0.33543112874031067, + "learning_rate": 1.9649122807017544e-05, + "loss": 1.0301, + "mean_token_accuracy": 0.7597394809126854, + "num_tokens": 20495307.0, + "step": 1633 + }, + { + "entropy": 0.9205638915300369, + "epoch": 2.7233333333333336, + "grad_norm": 0.38839203119277954, + "learning_rate": 1.953216374269006e-05, + "loss": 0.9177, + "mean_token_accuracy": 0.7760036289691925, + "num_tokens": 20507791.0, + "step": 1634 + }, + { + "entropy": 0.987869456410408, + "epoch": 2.725, + "grad_norm": 0.34582483768463135, + "learning_rate": 1.9415204678362576e-05, + "loss": 0.962, + "mean_token_accuracy": 0.7674970328807831, + "num_tokens": 20520382.0, + "step": 1635 + }, + { + "entropy": 0.940305083990097, + "epoch": 2.7266666666666666, + "grad_norm": 0.41462910175323486, + "learning_rate": 1.929824561403509e-05, + "loss": 0.9158, + "mean_token_accuracy": 0.7811434492468834, + "num_tokens": 20533116.0, + "step": 1636 + }, + { + "entropy": 0.8735738471150398, + "epoch": 2.7283333333333335, + "grad_norm": 0.3786480128765106, + "learning_rate": 1.9181286549707602e-05, + "loss": 0.8728, + "mean_token_accuracy": 0.7916983366012573, + "num_tokens": 20545702.0, + "step": 1637 + }, + { + "entropy": 1.056575320661068, + "epoch": 2.73, + "grad_norm": 0.3440716862678528, + "learning_rate": 1.9064327485380118e-05, + "loss": 1.041, + "mean_token_accuracy": 0.750531516969204, + "num_tokens": 20558173.0, + "step": 1638 + }, + { + "entropy": 1.022874854505062, + "epoch": 2.7316666666666665, + "grad_norm": 0.3751567304134369, + "learning_rate": 1.8947368421052634e-05, + "loss": 1.0086, + "mean_token_accuracy": 0.7592471837997437, + "num_tokens": 20570935.0, + "step": 1639 + }, + { + "entropy": 0.8968786224722862, + "epoch": 2.7333333333333334, + "grad_norm": 0.3456690311431885, + "learning_rate": 1.8830409356725147e-05, + "loss": 0.9068, + "mean_token_accuracy": 0.7874610051512718, + "num_tokens": 20583670.0, + "step": 1640 + }, + { + "entropy": 0.9974970147013664, + "epoch": 2.735, + "grad_norm": 0.3717511296272278, + "learning_rate": 1.871345029239766e-05, + "loss": 1.0124, + "mean_token_accuracy": 0.760253295302391, + "num_tokens": 20596341.0, + "step": 1641 + }, + { + "entropy": 0.9726630449295044, + "epoch": 2.736666666666667, + "grad_norm": 0.3645414412021637, + "learning_rate": 1.8596491228070176e-05, + "loss": 0.9653, + "mean_token_accuracy": 0.7680811062455177, + "num_tokens": 20609111.0, + "step": 1642 + }, + { + "entropy": 0.9171030893921852, + "epoch": 2.7383333333333333, + "grad_norm": 0.5028409361839294, + "learning_rate": 1.8479532163742693e-05, + "loss": 0.8993, + "mean_token_accuracy": 0.7851759195327759, + "num_tokens": 20621779.0, + "step": 1643 + }, + { + "entropy": 0.8245140202343464, + "epoch": 2.74, + "grad_norm": 0.3632289469242096, + "learning_rate": 1.8362573099415205e-05, + "loss": 0.8054, + "mean_token_accuracy": 0.8106393218040466, + "num_tokens": 20634535.0, + "step": 1644 + }, + { + "entropy": 1.0589898452162743, + "epoch": 2.7416666666666667, + "grad_norm": 0.3945890963077545, + "learning_rate": 1.8245614035087722e-05, + "loss": 1.0619, + "mean_token_accuracy": 0.7522666826844215, + "num_tokens": 20647252.0, + "step": 1645 + }, + { + "entropy": 0.9454857483506203, + "epoch": 2.743333333333333, + "grad_norm": 0.39829105138778687, + "learning_rate": 1.8128654970760235e-05, + "loss": 0.9165, + "mean_token_accuracy": 0.7723772302269936, + "num_tokens": 20659685.0, + "step": 1646 + }, + { + "entropy": 1.1061757281422615, + "epoch": 2.745, + "grad_norm": 0.41006022691726685, + "learning_rate": 1.8011695906432747e-05, + "loss": 1.1046, + "mean_token_accuracy": 0.7397371530532837, + "num_tokens": 20672158.0, + "step": 1647 + }, + { + "entropy": 0.9142966568470001, + "epoch": 2.7466666666666666, + "grad_norm": 0.3873363435268402, + "learning_rate": 1.7894736842105264e-05, + "loss": 0.8881, + "mean_token_accuracy": 0.7829329073429108, + "num_tokens": 20685010.0, + "step": 1648 + }, + { + "entropy": 0.9453158751130104, + "epoch": 2.748333333333333, + "grad_norm": 0.4290614724159241, + "learning_rate": 1.777777777777778e-05, + "loss": 0.9078, + "mean_token_accuracy": 0.782992847263813, + "num_tokens": 20697468.0, + "step": 1649 + }, + { + "entropy": 1.002040408551693, + "epoch": 2.75, + "grad_norm": 0.360868364572525, + "learning_rate": 1.7660818713450293e-05, + "loss": 1.0053, + "mean_token_accuracy": 0.7629223838448524, + "num_tokens": 20710295.0, + "step": 1650 + }, + { + "entropy": 1.1798802465200424, + "epoch": 2.751666666666667, + "grad_norm": 0.3488300144672394, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.1586, + "mean_token_accuracy": 0.7250996008515358, + "num_tokens": 20723057.0, + "step": 1651 + }, + { + "entropy": 0.966465599834919, + "epoch": 2.7533333333333334, + "grad_norm": 0.37623661756515503, + "learning_rate": 1.7426900584795322e-05, + "loss": 0.9683, + "mean_token_accuracy": 0.7719982042908669, + "num_tokens": 20735443.0, + "step": 1652 + }, + { + "entropy": 1.0982854589819908, + "epoch": 2.755, + "grad_norm": 0.3756445646286011, + "learning_rate": 1.7309941520467838e-05, + "loss": 1.0835, + "mean_token_accuracy": 0.7424386367201805, + "num_tokens": 20747977.0, + "step": 1653 + }, + { + "entropy": 1.0840732529759407, + "epoch": 2.756666666666667, + "grad_norm": 0.45538654923439026, + "learning_rate": 1.719298245614035e-05, + "loss": 1.063, + "mean_token_accuracy": 0.7478004172444344, + "num_tokens": 20760322.0, + "step": 1654 + }, + { + "entropy": 0.9573176577687263, + "epoch": 2.7583333333333333, + "grad_norm": 0.36861681938171387, + "learning_rate": 1.7076023391812867e-05, + "loss": 0.9475, + "mean_token_accuracy": 0.7757124379277229, + "num_tokens": 20772925.0, + "step": 1655 + }, + { + "entropy": 1.1003102585673332, + "epoch": 2.76, + "grad_norm": 0.33055004477500916, + "learning_rate": 1.695906432748538e-05, + "loss": 1.077, + "mean_token_accuracy": 0.748504027724266, + "num_tokens": 20785518.0, + "step": 1656 + }, + { + "entropy": 1.0443431660532951, + "epoch": 2.7616666666666667, + "grad_norm": 0.33600515127182007, + "learning_rate": 1.6842105263157896e-05, + "loss": 1.0309, + "mean_token_accuracy": 0.7604049816727638, + "num_tokens": 20797984.0, + "step": 1657 + }, + { + "entropy": 1.0836257934570312, + "epoch": 2.763333333333333, + "grad_norm": 0.4598836600780487, + "learning_rate": 1.672514619883041e-05, + "loss": 1.0451, + "mean_token_accuracy": 0.7539729624986649, + "num_tokens": 20810342.0, + "step": 1658 + }, + { + "entropy": 1.0073582082986832, + "epoch": 2.765, + "grad_norm": 0.3531425893306732, + "learning_rate": 1.6608187134502926e-05, + "loss": 0.9698, + "mean_token_accuracy": 0.7685017138719559, + "num_tokens": 20822920.0, + "step": 1659 + }, + { + "entropy": 1.0834669694304466, + "epoch": 2.7666666666666666, + "grad_norm": 0.35335052013397217, + "learning_rate": 1.6491228070175442e-05, + "loss": 1.0595, + "mean_token_accuracy": 0.7512029409408569, + "num_tokens": 20835314.0, + "step": 1660 + }, + { + "entropy": 0.9735758602619171, + "epoch": 2.7683333333333335, + "grad_norm": 0.37318694591522217, + "learning_rate": 1.6374269005847955e-05, + "loss": 0.9567, + "mean_token_accuracy": 0.769873820245266, + "num_tokens": 20847736.0, + "step": 1661 + }, + { + "entropy": 1.0253816321492195, + "epoch": 2.77, + "grad_norm": 0.31762170791625977, + "learning_rate": 1.6257309941520468e-05, + "loss": 1.0237, + "mean_token_accuracy": 0.7625403180718422, + "num_tokens": 20860323.0, + "step": 1662 + }, + { + "entropy": 1.1479705274105072, + "epoch": 2.7716666666666665, + "grad_norm": 0.4154747426509857, + "learning_rate": 1.6140350877192984e-05, + "loss": 1.1541, + "mean_token_accuracy": 0.7322883307933807, + "num_tokens": 20873124.0, + "step": 1663 + }, + { + "entropy": 1.0438043177127838, + "epoch": 2.7733333333333334, + "grad_norm": 0.4771679639816284, + "learning_rate": 1.60233918128655e-05, + "loss": 1.025, + "mean_token_accuracy": 0.7605370879173279, + "num_tokens": 20885646.0, + "step": 1664 + }, + { + "entropy": 0.9098687171936035, + "epoch": 2.775, + "grad_norm": 0.3227625787258148, + "learning_rate": 1.5906432748538013e-05, + "loss": 0.8682, + "mean_token_accuracy": 0.7874719724059105, + "num_tokens": 20898395.0, + "step": 1665 + }, + { + "entropy": 1.0298122838139534, + "epoch": 2.7766666666666664, + "grad_norm": 0.3388575315475464, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.0174, + "mean_token_accuracy": 0.7583098858594894, + "num_tokens": 20910524.0, + "step": 1666 + }, + { + "entropy": 1.0449883863329887, + "epoch": 2.7783333333333333, + "grad_norm": 0.4490339457988739, + "learning_rate": 1.5672514619883042e-05, + "loss": 1.0517, + "mean_token_accuracy": 0.7573523223400116, + "num_tokens": 20922701.0, + "step": 1667 + }, + { + "entropy": 0.9152447134256363, + "epoch": 2.7800000000000002, + "grad_norm": 0.43474024534225464, + "learning_rate": 1.5555555555555555e-05, + "loss": 0.9005, + "mean_token_accuracy": 0.785552367568016, + "num_tokens": 20935650.0, + "step": 1668 + }, + { + "entropy": 1.165888786315918, + "epoch": 2.7816666666666667, + "grad_norm": 0.37757226824760437, + "learning_rate": 1.543859649122807e-05, + "loss": 1.1589, + "mean_token_accuracy": 0.7293761596083641, + "num_tokens": 20948285.0, + "step": 1669 + }, + { + "entropy": 0.9996040016412735, + "epoch": 2.783333333333333, + "grad_norm": 0.47588273882865906, + "learning_rate": 1.5321637426900587e-05, + "loss": 0.9902, + "mean_token_accuracy": 0.7650641575455666, + "num_tokens": 20961136.0, + "step": 1670 + }, + { + "entropy": 0.8657255843281746, + "epoch": 2.785, + "grad_norm": 0.3503558933734894, + "learning_rate": 1.5204678362573099e-05, + "loss": 0.8656, + "mean_token_accuracy": 0.7937938943505287, + "num_tokens": 20973764.0, + "step": 1671 + }, + { + "entropy": 0.9660136625170708, + "epoch": 2.7866666666666666, + "grad_norm": 0.4027051627635956, + "learning_rate": 1.5087719298245615e-05, + "loss": 0.9678, + "mean_token_accuracy": 0.7705286145210266, + "num_tokens": 20986510.0, + "step": 1672 + }, + { + "entropy": 0.9605318829417229, + "epoch": 2.788333333333333, + "grad_norm": 0.4430026412010193, + "learning_rate": 1.497076023391813e-05, + "loss": 0.97, + "mean_token_accuracy": 0.7727236077189445, + "num_tokens": 20999299.0, + "step": 1673 + }, + { + "entropy": 1.018745370209217, + "epoch": 2.79, + "grad_norm": 0.457681804895401, + "learning_rate": 1.4853801169590644e-05, + "loss": 1.0367, + "mean_token_accuracy": 0.759983517229557, + "num_tokens": 21011963.0, + "step": 1674 + }, + { + "entropy": 0.9377132654190063, + "epoch": 2.7916666666666665, + "grad_norm": 0.5751153826713562, + "learning_rate": 1.4736842105263157e-05, + "loss": 0.9228, + "mean_token_accuracy": 0.7740181460976601, + "num_tokens": 21024353.0, + "step": 1675 + }, + { + "entropy": 0.9824780151247978, + "epoch": 2.7933333333333334, + "grad_norm": 0.39329952001571655, + "learning_rate": 1.4619883040935673e-05, + "loss": 0.9921, + "mean_token_accuracy": 0.7658392265439034, + "num_tokens": 21036896.0, + "step": 1676 + }, + { + "entropy": 0.9139937981963158, + "epoch": 2.795, + "grad_norm": 0.41152364015579224, + "learning_rate": 1.4502923976608188e-05, + "loss": 0.8983, + "mean_token_accuracy": 0.7790831923484802, + "num_tokens": 21049541.0, + "step": 1677 + }, + { + "entropy": 0.9995200335979462, + "epoch": 2.796666666666667, + "grad_norm": 0.312898725271225, + "learning_rate": 1.4385964912280702e-05, + "loss": 0.9827, + "mean_token_accuracy": 0.7627207487821579, + "num_tokens": 21062074.0, + "step": 1678 + }, + { + "entropy": 1.0503408163785934, + "epoch": 2.7983333333333333, + "grad_norm": 0.422442764043808, + "learning_rate": 1.4269005847953219e-05, + "loss": 1.0603, + "mean_token_accuracy": 0.7457630932331085, + "num_tokens": 21074463.0, + "step": 1679 + }, + { + "entropy": 1.0046812146902084, + "epoch": 2.8, + "grad_norm": 0.35540297627449036, + "learning_rate": 1.415204678362573e-05, + "loss": 1.0123, + "mean_token_accuracy": 0.7617236971855164, + "num_tokens": 21087052.0, + "step": 1680 + }, + { + "entropy": 0.9316699206829071, + "epoch": 2.8016666666666667, + "grad_norm": 0.6039937138557434, + "learning_rate": 1.4035087719298246e-05, + "loss": 0.8812, + "mean_token_accuracy": 0.7913266718387604, + "num_tokens": 21099662.0, + "step": 1681 + }, + { + "entropy": 0.979851484298706, + "epoch": 2.8033333333333332, + "grad_norm": 0.32677847146987915, + "learning_rate": 1.391812865497076e-05, + "loss": 0.9749, + "mean_token_accuracy": 0.7660597264766693, + "num_tokens": 21112113.0, + "step": 1682 + }, + { + "entropy": 0.9364907667040825, + "epoch": 2.805, + "grad_norm": 0.3332626223564148, + "learning_rate": 1.3801169590643275e-05, + "loss": 0.903, + "mean_token_accuracy": 0.790557011961937, + "num_tokens": 21124712.0, + "step": 1683 + }, + { + "entropy": 0.9260425418615341, + "epoch": 2.8066666666666666, + "grad_norm": 0.39053162932395935, + "learning_rate": 1.3684210526315791e-05, + "loss": 0.9073, + "mean_token_accuracy": 0.7805077061057091, + "num_tokens": 21137413.0, + "step": 1684 + }, + { + "entropy": 0.9894504696130753, + "epoch": 2.8083333333333336, + "grad_norm": 0.44214099645614624, + "learning_rate": 1.3567251461988304e-05, + "loss": 0.9808, + "mean_token_accuracy": 0.7690370231866837, + "num_tokens": 21149718.0, + "step": 1685 + }, + { + "entropy": 0.9374509304761887, + "epoch": 2.81, + "grad_norm": 0.39250999689102173, + "learning_rate": 1.3450292397660819e-05, + "loss": 0.9139, + "mean_token_accuracy": 0.7772816047072411, + "num_tokens": 21162320.0, + "step": 1686 + }, + { + "entropy": 0.9401525929570198, + "epoch": 2.8116666666666665, + "grad_norm": 0.3679231107234955, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.9285, + "mean_token_accuracy": 0.7783447355031967, + "num_tokens": 21174882.0, + "step": 1687 + }, + { + "entropy": 0.9473934099078178, + "epoch": 2.8133333333333335, + "grad_norm": 0.3454856276512146, + "learning_rate": 1.321637426900585e-05, + "loss": 0.9476, + "mean_token_accuracy": 0.7781709879636765, + "num_tokens": 21187442.0, + "step": 1688 + }, + { + "entropy": 1.059903234243393, + "epoch": 2.815, + "grad_norm": 0.38406118750572205, + "learning_rate": 1.3099415204678364e-05, + "loss": 1.0406, + "mean_token_accuracy": 0.7552267909049988, + "num_tokens": 21199863.0, + "step": 1689 + }, + { + "entropy": 1.2062927782535553, + "epoch": 2.8166666666666664, + "grad_norm": 0.4775955080986023, + "learning_rate": 1.2982456140350877e-05, + "loss": 1.212, + "mean_token_accuracy": 0.7197834625840187, + "num_tokens": 21212493.0, + "step": 1690 + }, + { + "entropy": 0.9059441983699799, + "epoch": 2.8183333333333334, + "grad_norm": 0.3290518820285797, + "learning_rate": 1.2865497076023392e-05, + "loss": 0.9133, + "mean_token_accuracy": 0.7860268801450729, + "num_tokens": 21225254.0, + "step": 1691 + }, + { + "entropy": 0.8256610706448555, + "epoch": 2.82, + "grad_norm": 0.4111975431442261, + "learning_rate": 1.2748538011695906e-05, + "loss": 0.8004, + "mean_token_accuracy": 0.8044591471552849, + "num_tokens": 21237913.0, + "step": 1692 + }, + { + "entropy": 0.9270822256803513, + "epoch": 2.8216666666666668, + "grad_norm": 0.3687874674797058, + "learning_rate": 1.2631578947368422e-05, + "loss": 0.9244, + "mean_token_accuracy": 0.7831602171063423, + "num_tokens": 21250470.0, + "step": 1693 + }, + { + "entropy": 0.959955707192421, + "epoch": 2.8233333333333333, + "grad_norm": 0.3667733669281006, + "learning_rate": 1.2514619883040937e-05, + "loss": 0.9558, + "mean_token_accuracy": 0.7712240889668465, + "num_tokens": 21262915.0, + "step": 1694 + }, + { + "entropy": 0.9285707548260689, + "epoch": 2.825, + "grad_norm": 0.363221675157547, + "learning_rate": 1.2397660818713451e-05, + "loss": 0.9175, + "mean_token_accuracy": 0.7797513231635094, + "num_tokens": 21275618.0, + "step": 1695 + }, + { + "entropy": 0.901066817343235, + "epoch": 2.8266666666666667, + "grad_norm": 0.48460933566093445, + "learning_rate": 1.2280701754385964e-05, + "loss": 0.8796, + "mean_token_accuracy": 0.7882982492446899, + "num_tokens": 21287941.0, + "step": 1696 + }, + { + "entropy": 1.006144106388092, + "epoch": 2.828333333333333, + "grad_norm": 0.42387738823890686, + "learning_rate": 1.216374269005848e-05, + "loss": 1.0094, + "mean_token_accuracy": 0.7629361301660538, + "num_tokens": 21300496.0, + "step": 1697 + }, + { + "entropy": 1.0734409987926483, + "epoch": 2.83, + "grad_norm": 0.35227811336517334, + "learning_rate": 1.2046783625730995e-05, + "loss": 1.0533, + "mean_token_accuracy": 0.7464311644434929, + "num_tokens": 21312980.0, + "step": 1698 + }, + { + "entropy": 0.8379950523376465, + "epoch": 2.8316666666666666, + "grad_norm": 0.3516140580177307, + "learning_rate": 1.192982456140351e-05, + "loss": 0.8028, + "mean_token_accuracy": 0.8023798167705536, + "num_tokens": 21325576.0, + "step": 1699 + }, + { + "entropy": 1.02951218187809, + "epoch": 2.8333333333333335, + "grad_norm": 0.3923986256122589, + "learning_rate": 1.1812865497076024e-05, + "loss": 1.0423, + "mean_token_accuracy": 0.7478686273097992, + "num_tokens": 21338290.0, + "step": 1700 + }, + { + "entropy": 1.0603998601436615, + "epoch": 2.835, + "grad_norm": 0.3304397165775299, + "learning_rate": 1.1695906432748537e-05, + "loss": 1.0531, + "mean_token_accuracy": 0.7521646395325661, + "num_tokens": 21351033.0, + "step": 1701 + }, + { + "entropy": 0.9814613237977028, + "epoch": 2.836666666666667, + "grad_norm": 0.32319486141204834, + "learning_rate": 1.1578947368421053e-05, + "loss": 0.9568, + "mean_token_accuracy": 0.768329955637455, + "num_tokens": 21363845.0, + "step": 1702 + }, + { + "entropy": 0.9949151948094368, + "epoch": 2.8383333333333334, + "grad_norm": 0.4614749848842621, + "learning_rate": 1.1461988304093568e-05, + "loss": 1.0075, + "mean_token_accuracy": 0.7567261829972267, + "num_tokens": 21376273.0, + "step": 1703 + }, + { + "entropy": 0.9148530811071396, + "epoch": 2.84, + "grad_norm": 0.43842312693595886, + "learning_rate": 1.1345029239766083e-05, + "loss": 0.9119, + "mean_token_accuracy": 0.7857235297560692, + "num_tokens": 21388628.0, + "step": 1704 + }, + { + "entropy": 1.056400939822197, + "epoch": 2.841666666666667, + "grad_norm": 0.4291359484195709, + "learning_rate": 1.1228070175438597e-05, + "loss": 1.0408, + "mean_token_accuracy": 0.7525315657258034, + "num_tokens": 21401361.0, + "step": 1705 + }, + { + "entropy": 0.9079612866044044, + "epoch": 2.8433333333333333, + "grad_norm": 0.3619014620780945, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.8956, + "mean_token_accuracy": 0.7931994721293449, + "num_tokens": 21414172.0, + "step": 1706 + }, + { + "entropy": 0.9774614349007607, + "epoch": 2.8449999999999998, + "grad_norm": 0.33217278122901917, + "learning_rate": 1.0994152046783626e-05, + "loss": 0.9427, + "mean_token_accuracy": 0.774011068046093, + "num_tokens": 21426611.0, + "step": 1707 + }, + { + "entropy": 0.9890346825122833, + "epoch": 2.8466666666666667, + "grad_norm": 0.4149286448955536, + "learning_rate": 1.087719298245614e-05, + "loss": 0.9502, + "mean_token_accuracy": 0.7701681852340698, + "num_tokens": 21439119.0, + "step": 1708 + }, + { + "entropy": 0.9981124773621559, + "epoch": 2.8483333333333336, + "grad_norm": 0.3465225100517273, + "learning_rate": 1.0760233918128655e-05, + "loss": 0.9977, + "mean_token_accuracy": 0.7658281549811363, + "num_tokens": 21451814.0, + "step": 1709 + }, + { + "entropy": 1.093929685652256, + "epoch": 2.85, + "grad_norm": 0.36552703380584717, + "learning_rate": 1.064327485380117e-05, + "loss": 1.0817, + "mean_token_accuracy": 0.7503879442811012, + "num_tokens": 21464218.0, + "step": 1710 + }, + { + "entropy": 1.0611464828252792, + "epoch": 2.8516666666666666, + "grad_norm": 0.39026060700416565, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.0491, + "mean_token_accuracy": 0.7560157850384712, + "num_tokens": 21476767.0, + "step": 1711 + }, + { + "entropy": 0.8186813741922379, + "epoch": 2.8533333333333335, + "grad_norm": 0.34907373785972595, + "learning_rate": 1.0409356725146199e-05, + "loss": 0.7844, + "mean_token_accuracy": 0.8095691204071045, + "num_tokens": 21489525.0, + "step": 1712 + }, + { + "entropy": 0.9509245008230209, + "epoch": 2.855, + "grad_norm": 0.3324735760688782, + "learning_rate": 1.0292397660818715e-05, + "loss": 0.9693, + "mean_token_accuracy": 0.7745850831270218, + "num_tokens": 21502033.0, + "step": 1713 + }, + { + "entropy": 1.0823215022683144, + "epoch": 2.8566666666666665, + "grad_norm": 0.47572627663612366, + "learning_rate": 1.0175438596491228e-05, + "loss": 1.0902, + "mean_token_accuracy": 0.7481512501835823, + "num_tokens": 21514671.0, + "step": 1714 + }, + { + "entropy": 0.9347349405288696, + "epoch": 2.8583333333333334, + "grad_norm": 0.39239612221717834, + "learning_rate": 1.0058479532163743e-05, + "loss": 0.9175, + "mean_token_accuracy": 0.7820291370153427, + "num_tokens": 21526894.0, + "step": 1715 + }, + { + "entropy": 1.099070705473423, + "epoch": 2.86, + "grad_norm": 0.37087810039520264, + "learning_rate": 9.941520467836257e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.7476373165845871, + "num_tokens": 21539412.0, + "step": 1716 + }, + { + "entropy": 0.9596787095069885, + "epoch": 2.861666666666667, + "grad_norm": 0.37303170561790466, + "learning_rate": 9.824561403508772e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7722743526101112, + "num_tokens": 21551977.0, + "step": 1717 + }, + { + "entropy": 1.1655426248908043, + "epoch": 2.8633333333333333, + "grad_norm": 0.7214367985725403, + "learning_rate": 9.707602339181288e-06, + "loss": 1.1615, + "mean_token_accuracy": 0.729925125837326, + "num_tokens": 21564558.0, + "step": 1718 + }, + { + "entropy": 0.8337251618504524, + "epoch": 2.865, + "grad_norm": 0.3646109998226166, + "learning_rate": 9.590643274853801e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.8040366545319557, + "num_tokens": 21576973.0, + "step": 1719 + }, + { + "entropy": 0.911796860396862, + "epoch": 2.8666666666666667, + "grad_norm": 0.411490261554718, + "learning_rate": 9.473684210526317e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7922530472278595, + "num_tokens": 21589415.0, + "step": 1720 + }, + { + "entropy": 1.0511579066514969, + "epoch": 2.868333333333333, + "grad_norm": 0.420224666595459, + "learning_rate": 9.35672514619883e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.7517753392457962, + "num_tokens": 21601940.0, + "step": 1721 + }, + { + "entropy": 1.0496739074587822, + "epoch": 2.87, + "grad_norm": 0.40443339943885803, + "learning_rate": 9.239766081871346e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.7607442736625671, + "num_tokens": 21614537.0, + "step": 1722 + }, + { + "entropy": 1.0743912309408188, + "epoch": 2.8716666666666666, + "grad_norm": 0.3886486291885376, + "learning_rate": 9.122807017543861e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.7566971555352211, + "num_tokens": 21627224.0, + "step": 1723 + }, + { + "entropy": 1.0322471410036087, + "epoch": 2.873333333333333, + "grad_norm": 0.34302622079849243, + "learning_rate": 9.005847953216374e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7573575004935265, + "num_tokens": 21639711.0, + "step": 1724 + }, + { + "entropy": 1.112082228064537, + "epoch": 2.875, + "grad_norm": 0.3288070559501648, + "learning_rate": 8.88888888888889e-06, + "loss": 1.0887, + "mean_token_accuracy": 0.7458484619855881, + "num_tokens": 21652011.0, + "step": 1725 + }, + { + "entropy": 1.0008636564016342, + "epoch": 2.876666666666667, + "grad_norm": 0.5277789831161499, + "learning_rate": 8.771929824561403e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7696216627955437, + "num_tokens": 21664594.0, + "step": 1726 + }, + { + "entropy": 1.0205089896917343, + "epoch": 2.8783333333333334, + "grad_norm": 0.47895386815071106, + "learning_rate": 8.654970760233919e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.7588381469249725, + "num_tokens": 21677212.0, + "step": 1727 + }, + { + "entropy": 0.8266075178980827, + "epoch": 2.88, + "grad_norm": 0.5006054043769836, + "learning_rate": 8.538011695906434e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.8057693690061569, + "num_tokens": 21689646.0, + "step": 1728 + }, + { + "entropy": 1.0616027638316154, + "epoch": 2.881666666666667, + "grad_norm": 0.39488089084625244, + "learning_rate": 8.421052631578948e-06, + "loss": 1.0615, + "mean_token_accuracy": 0.7516462132334709, + "num_tokens": 21702112.0, + "step": 1729 + }, + { + "entropy": 1.0379400290548801, + "epoch": 2.8833333333333333, + "grad_norm": 0.395575612783432, + "learning_rate": 8.304093567251463e-06, + "loss": 1.0613, + "mean_token_accuracy": 0.7509022504091263, + "num_tokens": 21714545.0, + "step": 1730 + }, + { + "entropy": 1.0454155802726746, + "epoch": 2.885, + "grad_norm": 0.3766516149044037, + "learning_rate": 8.187134502923977e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.761749804019928, + "num_tokens": 21727147.0, + "step": 1731 + }, + { + "entropy": 1.0835371166467667, + "epoch": 2.8866666666666667, + "grad_norm": 0.4320039451122284, + "learning_rate": 8.070175438596492e-06, + "loss": 1.0894, + "mean_token_accuracy": 0.7444860264658928, + "num_tokens": 21739638.0, + "step": 1732 + }, + { + "entropy": 0.8791218400001526, + "epoch": 2.888333333333333, + "grad_norm": 0.3652154207229614, + "learning_rate": 7.953216374269006e-06, + "loss": 0.855, + "mean_token_accuracy": 0.8004785180091858, + "num_tokens": 21751939.0, + "step": 1733 + }, + { + "entropy": 0.8801833540201187, + "epoch": 2.89, + "grad_norm": 0.4067878723144531, + "learning_rate": 7.836257309941521e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7940299808979034, + "num_tokens": 21764330.0, + "step": 1734 + }, + { + "entropy": 0.9821511209011078, + "epoch": 2.8916666666666666, + "grad_norm": 0.4845409095287323, + "learning_rate": 7.719298245614036e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7696467861533165, + "num_tokens": 21776790.0, + "step": 1735 + }, + { + "entropy": 0.9873602092266083, + "epoch": 2.8933333333333335, + "grad_norm": 0.49251922965049744, + "learning_rate": 7.602339181286549e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7596195042133331, + "num_tokens": 21789444.0, + "step": 1736 + }, + { + "entropy": 0.9540248364210129, + "epoch": 2.895, + "grad_norm": 0.5118216276168823, + "learning_rate": 7.485380116959065e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7770734652876854, + "num_tokens": 21802080.0, + "step": 1737 + }, + { + "entropy": 1.0105539560317993, + "epoch": 2.8966666666666665, + "grad_norm": 0.4545968770980835, + "learning_rate": 7.3684210526315784e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7711414247751236, + "num_tokens": 21814948.0, + "step": 1738 + }, + { + "entropy": 1.060143068432808, + "epoch": 2.8983333333333334, + "grad_norm": 0.5532039403915405, + "learning_rate": 7.251461988304094e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.751476377248764, + "num_tokens": 21827241.0, + "step": 1739 + }, + { + "entropy": 1.01781490072608, + "epoch": 2.9, + "grad_norm": 0.33422431349754333, + "learning_rate": 7.134502923976609e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7669263780117035, + "num_tokens": 21839975.0, + "step": 1740 + }, + { + "entropy": 0.9719846248626709, + "epoch": 2.9016666666666664, + "grad_norm": 0.5147994756698608, + "learning_rate": 7.017543859649123e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7697862386703491, + "num_tokens": 21852350.0, + "step": 1741 + }, + { + "entropy": 0.8945157006382942, + "epoch": 2.9033333333333333, + "grad_norm": 0.405960351228714, + "learning_rate": 6.9005847953216375e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7873081862926483, + "num_tokens": 21865142.0, + "step": 1742 + }, + { + "entropy": 1.0113761126995087, + "epoch": 2.9050000000000002, + "grad_norm": 0.4485865533351898, + "learning_rate": 6.783625730994152e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7705218642950058, + "num_tokens": 21877826.0, + "step": 1743 + }, + { + "entropy": 0.8957099765539169, + "epoch": 2.9066666666666667, + "grad_norm": 0.9728347063064575, + "learning_rate": 6.666666666666667e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7876846194267273, + "num_tokens": 21890554.0, + "step": 1744 + }, + { + "entropy": 0.9720732048153877, + "epoch": 2.908333333333333, + "grad_norm": 0.5269340872764587, + "learning_rate": 6.549707602339182e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.767320029437542, + "num_tokens": 21902850.0, + "step": 1745 + }, + { + "entropy": 1.110954761505127, + "epoch": 2.91, + "grad_norm": 0.59569251537323, + "learning_rate": 6.432748538011696e-06, + "loss": 1.1039, + "mean_token_accuracy": 0.7425623312592506, + "num_tokens": 21915082.0, + "step": 1746 + }, + { + "entropy": 0.8544746041297913, + "epoch": 2.9116666666666666, + "grad_norm": 0.3668573796749115, + "learning_rate": 6.315789473684211e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.8035476878285408, + "num_tokens": 21927790.0, + "step": 1747 + }, + { + "entropy": 0.9093237146735191, + "epoch": 2.913333333333333, + "grad_norm": 0.40672871470451355, + "learning_rate": 6.198830409356726e-06, + "loss": 0.906, + "mean_token_accuracy": 0.79190793633461, + "num_tokens": 21940335.0, + "step": 1748 + }, + { + "entropy": 1.0111807882785797, + "epoch": 2.915, + "grad_norm": 0.35827288031578064, + "learning_rate": 6.08187134502924e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7483839467167854, + "num_tokens": 21952980.0, + "step": 1749 + }, + { + "entropy": 0.9529319256544113, + "epoch": 2.9166666666666665, + "grad_norm": 0.689719021320343, + "learning_rate": 5.964912280701755e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.771594263613224, + "num_tokens": 21965805.0, + "step": 1750 + }, + { + "entropy": 0.9855370000004768, + "epoch": 2.9183333333333334, + "grad_norm": 0.3926987051963806, + "learning_rate": 5.8479532163742686e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7613004446029663, + "num_tokens": 21978351.0, + "step": 1751 + }, + { + "entropy": 0.9271128326654434, + "epoch": 2.92, + "grad_norm": 0.4803829491138458, + "learning_rate": 5.730994152046784e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7786646783351898, + "num_tokens": 21991136.0, + "step": 1752 + }, + { + "entropy": 0.959517128765583, + "epoch": 2.921666666666667, + "grad_norm": 0.3758235573768616, + "learning_rate": 5.6140350877192985e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7675915956497192, + "num_tokens": 22003596.0, + "step": 1753 + }, + { + "entropy": 0.9961358681321144, + "epoch": 2.9233333333333333, + "grad_norm": 0.5530729293823242, + "learning_rate": 5.497076023391813e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7634861022233963, + "num_tokens": 22016277.0, + "step": 1754 + }, + { + "entropy": 1.0380299985408783, + "epoch": 2.925, + "grad_norm": 0.3746365010738373, + "learning_rate": 5.380116959064328e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.7556693628430367, + "num_tokens": 22028835.0, + "step": 1755 + }, + { + "entropy": 0.8915413618087769, + "epoch": 2.9266666666666667, + "grad_norm": 0.41581663489341736, + "learning_rate": 5.263157894736842e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7895423173904419, + "num_tokens": 22041316.0, + "step": 1756 + }, + { + "entropy": 0.9499136805534363, + "epoch": 2.9283333333333332, + "grad_norm": 0.42703530192375183, + "learning_rate": 5.146198830409358e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7821884900331497, + "num_tokens": 22054133.0, + "step": 1757 + }, + { + "entropy": 1.0416665747761726, + "epoch": 2.93, + "grad_norm": 0.3519282341003418, + "learning_rate": 5.029239766081871e-06, + "loss": 1.033, + "mean_token_accuracy": 0.754830114543438, + "num_tokens": 22066697.0, + "step": 1758 + }, + { + "entropy": 1.0209436565637589, + "epoch": 2.9316666666666666, + "grad_norm": 0.37903350591659546, + "learning_rate": 4.912280701754386e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.7567298635840416, + "num_tokens": 22079466.0, + "step": 1759 + }, + { + "entropy": 0.9729228541254997, + "epoch": 2.9333333333333336, + "grad_norm": 0.36119741201400757, + "learning_rate": 4.7953216374269005e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7855269759893417, + "num_tokens": 22092040.0, + "step": 1760 + }, + { + "entropy": 0.9589105322957039, + "epoch": 2.935, + "grad_norm": 0.34711146354675293, + "learning_rate": 4.678362573099415e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7768553495407104, + "num_tokens": 22104870.0, + "step": 1761 + }, + { + "entropy": 0.9655081853270531, + "epoch": 2.9366666666666665, + "grad_norm": 1.72267484664917, + "learning_rate": 4.5614035087719304e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7714637219905853, + "num_tokens": 22117515.0, + "step": 1762 + }, + { + "entropy": 1.1810454651713371, + "epoch": 2.9383333333333335, + "grad_norm": 0.4067312777042389, + "learning_rate": 4.444444444444445e-06, + "loss": 1.1582, + "mean_token_accuracy": 0.7326506525278091, + "num_tokens": 22130154.0, + "step": 1763 + }, + { + "entropy": 1.1184354051947594, + "epoch": 2.94, + "grad_norm": 4.306568622589111, + "learning_rate": 4.3274853801169596e-06, + "loss": 1.1218, + "mean_token_accuracy": 0.7444816380739212, + "num_tokens": 22142551.0, + "step": 1764 + }, + { + "entropy": 1.0146836042404175, + "epoch": 2.9416666666666664, + "grad_norm": 0.5526940822601318, + "learning_rate": 4.210526315789474e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.7575501501560211, + "num_tokens": 22155272.0, + "step": 1765 + }, + { + "entropy": 1.0421291068196297, + "epoch": 2.9433333333333334, + "grad_norm": 0.3948024809360504, + "learning_rate": 4.093567251461989e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.7535993233323097, + "num_tokens": 22167968.0, + "step": 1766 + }, + { + "entropy": 0.9924807921051979, + "epoch": 2.945, + "grad_norm": 0.4762410819530487, + "learning_rate": 3.976608187134503e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7665065228939056, + "num_tokens": 22180804.0, + "step": 1767 + }, + { + "entropy": 1.0026592165231705, + "epoch": 2.9466666666666668, + "grad_norm": 0.3470718264579773, + "learning_rate": 3.859649122807018e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7664928585290909, + "num_tokens": 22193537.0, + "step": 1768 + }, + { + "entropy": 0.8695595599710941, + "epoch": 2.9483333333333333, + "grad_norm": 0.37034153938293457, + "learning_rate": 3.7426900584795324e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7970539182424545, + "num_tokens": 22206209.0, + "step": 1769 + }, + { + "entropy": 0.9539394453167915, + "epoch": 2.95, + "grad_norm": 0.3855673372745514, + "learning_rate": 3.625730994152047e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.779160238802433, + "num_tokens": 22218799.0, + "step": 1770 + }, + { + "entropy": 0.8880699202418327, + "epoch": 2.9516666666666667, + "grad_norm": 0.48845577239990234, + "learning_rate": 3.5087719298245615e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7958652228116989, + "num_tokens": 22231319.0, + "step": 1771 + }, + { + "entropy": 0.9505726099014282, + "epoch": 2.953333333333333, + "grad_norm": 0.3741341233253479, + "learning_rate": 3.391812865497076e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7803698033094406, + "num_tokens": 22243810.0, + "step": 1772 + }, + { + "entropy": 1.0194538086652756, + "epoch": 2.955, + "grad_norm": 0.3747548758983612, + "learning_rate": 3.274853801169591e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7586158514022827, + "num_tokens": 22256330.0, + "step": 1773 + }, + { + "entropy": 0.8892013356089592, + "epoch": 2.9566666666666666, + "grad_norm": 0.34842562675476074, + "learning_rate": 3.1578947368421056e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7886041849851608, + "num_tokens": 22268971.0, + "step": 1774 + }, + { + "entropy": 1.1123011782765388, + "epoch": 2.9583333333333335, + "grad_norm": 0.38970237970352173, + "learning_rate": 3.04093567251462e-06, + "loss": 1.092, + "mean_token_accuracy": 0.7433041483163834, + "num_tokens": 22281687.0, + "step": 1775 + }, + { + "entropy": 0.8590154871344566, + "epoch": 2.96, + "grad_norm": 0.3537849187850952, + "learning_rate": 2.9239766081871343e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7923452258110046, + "num_tokens": 22294162.0, + "step": 1776 + }, + { + "entropy": 1.1243919283151627, + "epoch": 2.961666666666667, + "grad_norm": 0.40013039112091064, + "learning_rate": 2.8070175438596493e-06, + "loss": 1.0846, + "mean_token_accuracy": 0.7491604089736938, + "num_tokens": 22306855.0, + "step": 1777 + }, + { + "entropy": 1.0627360194921494, + "epoch": 2.9633333333333334, + "grad_norm": 0.3579505681991577, + "learning_rate": 2.690058479532164e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.759134478867054, + "num_tokens": 22319386.0, + "step": 1778 + }, + { + "entropy": 0.9896356016397476, + "epoch": 2.965, + "grad_norm": 0.33257195353507996, + "learning_rate": 2.573099415204679e-06, + "loss": 0.965, + "mean_token_accuracy": 0.769006259739399, + "num_tokens": 22331854.0, + "step": 1779 + }, + { + "entropy": 0.9897688552737236, + "epoch": 2.966666666666667, + "grad_norm": 0.34927383065223694, + "learning_rate": 2.456140350877193e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7724886983633041, + "num_tokens": 22344643.0, + "step": 1780 + }, + { + "entropy": 1.0401649847626686, + "epoch": 2.9683333333333333, + "grad_norm": 0.8142718076705933, + "learning_rate": 2.3391812865497075e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.7575424313545227, + "num_tokens": 22357295.0, + "step": 1781 + }, + { + "entropy": 1.0531227812170982, + "epoch": 2.9699999999999998, + "grad_norm": 0.37667667865753174, + "learning_rate": 2.2222222222222225e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.7554879859089851, + "num_tokens": 22369776.0, + "step": 1782 + }, + { + "entropy": 1.1273983120918274, + "epoch": 2.9716666666666667, + "grad_norm": 0.646806538105011, + "learning_rate": 2.105263157894737e-06, + "loss": 1.1169, + "mean_token_accuracy": 0.7348012179136276, + "num_tokens": 22382252.0, + "step": 1783 + }, + { + "entropy": 1.016762100160122, + "epoch": 2.9733333333333336, + "grad_norm": 0.48839271068573, + "learning_rate": 1.9883040935672516e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.7614038586616516, + "num_tokens": 22394792.0, + "step": 1784 + }, + { + "entropy": 1.057390421628952, + "epoch": 2.975, + "grad_norm": 0.36040061712265015, + "learning_rate": 1.8713450292397662e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.7580665051937103, + "num_tokens": 22407415.0, + "step": 1785 + }, + { + "entropy": 1.0245096608996391, + "epoch": 2.9766666666666666, + "grad_norm": 0.379582017660141, + "learning_rate": 1.7543859649122807e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.757752887904644, + "num_tokens": 22420025.0, + "step": 1786 + }, + { + "entropy": 0.9722128883004189, + "epoch": 2.9783333333333335, + "grad_norm": 0.613247811794281, + "learning_rate": 1.6374269005847955e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7673339918255806, + "num_tokens": 22432660.0, + "step": 1787 + }, + { + "entropy": 0.8799656555056572, + "epoch": 2.98, + "grad_norm": 0.37738871574401855, + "learning_rate": 1.52046783625731e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7907030135393143, + "num_tokens": 22444798.0, + "step": 1788 + }, + { + "entropy": 1.0010998025536537, + "epoch": 2.9816666666666665, + "grad_norm": 0.39368584752082825, + "learning_rate": 1.4035087719298246e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7631276845932007, + "num_tokens": 22457399.0, + "step": 1789 + }, + { + "entropy": 0.8982286229729652, + "epoch": 2.9833333333333334, + "grad_norm": 0.3286541700363159, + "learning_rate": 1.2865497076023394e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7828829064965248, + "num_tokens": 22469975.0, + "step": 1790 + }, + { + "entropy": 0.9310647435486317, + "epoch": 2.985, + "grad_norm": 0.3575252592563629, + "learning_rate": 1.1695906432748538e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7825771421194077, + "num_tokens": 22482418.0, + "step": 1791 + }, + { + "entropy": 1.1049975901842117, + "epoch": 2.986666666666667, + "grad_norm": 0.4538922905921936, + "learning_rate": 1.0526315789473685e-06, + "loss": 1.1066, + "mean_token_accuracy": 0.742551900446415, + "num_tokens": 22494771.0, + "step": 1792 + }, + { + "entropy": 0.9470599517226219, + "epoch": 2.9883333333333333, + "grad_norm": 0.3689374029636383, + "learning_rate": 9.356725146198831e-07, + "loss": 0.9287, + "mean_token_accuracy": 0.7846303805708885, + "num_tokens": 22507216.0, + "step": 1793 + }, + { + "entropy": 1.1192733272910118, + "epoch": 2.99, + "grad_norm": 0.34788239002227783, + "learning_rate": 8.187134502923978e-07, + "loss": 1.1308, + "mean_token_accuracy": 0.7368496730923653, + "num_tokens": 22519744.0, + "step": 1794 + }, + { + "entropy": 0.8728603720664978, + "epoch": 2.9916666666666667, + "grad_norm": 0.3710815906524658, + "learning_rate": 7.017543859649123e-07, + "loss": 0.8574, + "mean_token_accuracy": 0.7881719544529915, + "num_tokens": 22532492.0, + "step": 1795 + }, + { + "entropy": 0.8995281532406807, + "epoch": 2.993333333333333, + "grad_norm": 0.3521256148815155, + "learning_rate": 5.847953216374269e-07, + "loss": 0.8746, + "mean_token_accuracy": 0.7923890128731728, + "num_tokens": 22544874.0, + "step": 1796 + }, + { + "entropy": 0.8939443454146385, + "epoch": 2.995, + "grad_norm": 0.41208329796791077, + "learning_rate": 4.6783625730994155e-07, + "loss": 0.8724, + "mean_token_accuracy": 0.7936901748180389, + "num_tokens": 22557363.0, + "step": 1797 + }, + { + "entropy": 1.1656424701213837, + "epoch": 2.9966666666666666, + "grad_norm": 0.3441072702407837, + "learning_rate": 3.5087719298245616e-07, + "loss": 1.1401, + "mean_token_accuracy": 0.7287291288375854, + "num_tokens": 22569809.0, + "step": 1798 + }, + { + "entropy": 1.1117211356759071, + "epoch": 2.998333333333333, + "grad_norm": 0.5035508275032043, + "learning_rate": 2.3391812865497077e-07, + "loss": 1.089, + "mean_token_accuracy": 0.7472686842083931, + "num_tokens": 22582438.0, + "step": 1799 + }, + { + "entropy": 0.975653849542141, + "epoch": 3.0, + "grad_norm": 0.358730286359787, + "learning_rate": 1.1695906432748539e-07, + "loss": 0.9599, + "mean_token_accuracy": 0.772113561630249, + "num_tokens": 22594983.0, + "step": 1800 + }, + { + "epoch": 3.0, + "eval_entropy": 1.084252776276441, + "eval_loss": 1.1232056617736816, + "eval_mean_token_accuracy": 0.7345037632992343, + "eval_num_tokens": 22594983.0, + "eval_runtime": 2669.672, + "eval_samples_per_second": 1.873, + "eval_steps_per_second": 0.937, + "step": 1800 + } + ], + "logging_steps": 1, + "max_steps": 1800, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 360, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.70165270324799e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}