| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.8006084624314479, | |
| "eval_steps": 500, | |
| "global_step": 5000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0016012169248628958, | |
| "grad_norm": 4.850070476531982, | |
| "learning_rate": 1.8e-05, | |
| "loss": 1.1411, | |
| "mean_token_accuracy": 0.7277230955660343, | |
| "num_tokens": 319675.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0032024338497257917, | |
| "grad_norm": 2.02237606048584, | |
| "learning_rate": 1.9963927855711424e-05, | |
| "loss": 0.7742, | |
| "mean_token_accuracy": 0.7824086248874664, | |
| "num_tokens": 642220.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.004803650774588687, | |
| "grad_norm": 1.9458409547805786, | |
| "learning_rate": 1.9923847695390783e-05, | |
| "loss": 0.7128, | |
| "mean_token_accuracy": 0.7927536629140377, | |
| "num_tokens": 962961.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.006404867699451583, | |
| "grad_norm": 1.8043383359909058, | |
| "learning_rate": 1.988376753507014e-05, | |
| "loss": 0.6919, | |
| "mean_token_accuracy": 0.7971156224608421, | |
| "num_tokens": 1284446.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.008006084624314479, | |
| "grad_norm": 1.7415369749069214, | |
| "learning_rate": 1.98436873747495e-05, | |
| "loss": 0.6973, | |
| "mean_token_accuracy": 0.793434601277113, | |
| "num_tokens": 1602892.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.009607301549177374, | |
| "grad_norm": 1.631671667098999, | |
| "learning_rate": 1.980360721442886e-05, | |
| "loss": 0.6588, | |
| "mean_token_accuracy": 0.8002292089164257, | |
| "num_tokens": 1928108.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.011208518474040271, | |
| "grad_norm": 1.7467247247695923, | |
| "learning_rate": 1.976352705410822e-05, | |
| "loss": 0.6482, | |
| "mean_token_accuracy": 0.8024597182869911, | |
| "num_tokens": 2250524.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.012809735398903167, | |
| "grad_norm": 1.8255817890167236, | |
| "learning_rate": 1.9723446893787578e-05, | |
| "loss": 0.6479, | |
| "mean_token_accuracy": 0.8015000730752945, | |
| "num_tokens": 2570969.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.014410952323766062, | |
| "grad_norm": 1.5774496793746948, | |
| "learning_rate": 1.9683366733466936e-05, | |
| "loss": 0.6347, | |
| "mean_token_accuracy": 0.8069861680269241, | |
| "num_tokens": 2895603.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.016012169248628957, | |
| "grad_norm": 1.557003378868103, | |
| "learning_rate": 1.9643286573146294e-05, | |
| "loss": 0.6223, | |
| "mean_token_accuracy": 0.811630766093731, | |
| "num_tokens": 3218426.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.017613386173491855, | |
| "grad_norm": 1.5707590579986572, | |
| "learning_rate": 1.9603206412825653e-05, | |
| "loss": 0.6418, | |
| "mean_token_accuracy": 0.8049483895301819, | |
| "num_tokens": 3541200.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.019214603098354748, | |
| "grad_norm": 1.6678072214126587, | |
| "learning_rate": 1.956312625250501e-05, | |
| "loss": 0.6534, | |
| "mean_token_accuracy": 0.8036209844052792, | |
| "num_tokens": 3855626.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.020815820023217645, | |
| "grad_norm": 1.638335108757019, | |
| "learning_rate": 1.952304609218437e-05, | |
| "loss": 0.6316, | |
| "mean_token_accuracy": 0.8084485895931721, | |
| "num_tokens": 4181551.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.022417036948080542, | |
| "grad_norm": 1.5564364194869995, | |
| "learning_rate": 1.9482965931863728e-05, | |
| "loss": 0.6246, | |
| "mean_token_accuracy": 0.8101852715015412, | |
| "num_tokens": 4500584.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.024018253872943436, | |
| "grad_norm": 1.6337069272994995, | |
| "learning_rate": 1.9442885771543086e-05, | |
| "loss": 0.6293, | |
| "mean_token_accuracy": 0.8066410474479199, | |
| "num_tokens": 4821016.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.025619470797806333, | |
| "grad_norm": 1.64547598361969, | |
| "learning_rate": 1.9402805611222445e-05, | |
| "loss": 0.6427, | |
| "mean_token_accuracy": 0.8044939957559109, | |
| "num_tokens": 5141076.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.027220687722669227, | |
| "grad_norm": 1.5588223934173584, | |
| "learning_rate": 1.9362725450901806e-05, | |
| "loss": 0.6238, | |
| "mean_token_accuracy": 0.8083601631224155, | |
| "num_tokens": 5463211.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.028821904647532124, | |
| "grad_norm": 1.7133407592773438, | |
| "learning_rate": 1.9322645290581165e-05, | |
| "loss": 0.613, | |
| "mean_token_accuracy": 0.8120619148015976, | |
| "num_tokens": 5784216.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.03042312157239502, | |
| "grad_norm": 1.5363540649414062, | |
| "learning_rate": 1.9282565130260523e-05, | |
| "loss": 0.6377, | |
| "mean_token_accuracy": 0.8058379679918289, | |
| "num_tokens": 6100691.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.032024338497257915, | |
| "grad_norm": 1.5218147039413452, | |
| "learning_rate": 1.924248496993988e-05, | |
| "loss": 0.6168, | |
| "mean_token_accuracy": 0.811554492264986, | |
| "num_tokens": 6418188.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.03362555542212081, | |
| "grad_norm": 1.5204449892044067, | |
| "learning_rate": 1.920240480961924e-05, | |
| "loss": 0.6232, | |
| "mean_token_accuracy": 0.8090743437409401, | |
| "num_tokens": 6739535.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.03522677234698371, | |
| "grad_norm": 1.5194987058639526, | |
| "learning_rate": 1.9162324649298598e-05, | |
| "loss": 0.6063, | |
| "mean_token_accuracy": 0.8127121582627297, | |
| "num_tokens": 7065312.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.036827989271846606, | |
| "grad_norm": 1.5482524633407593, | |
| "learning_rate": 1.9122244488977956e-05, | |
| "loss": 0.6103, | |
| "mean_token_accuracy": 0.8122214756906032, | |
| "num_tokens": 7384331.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.038429206196709496, | |
| "grad_norm": 1.5524567365646362, | |
| "learning_rate": 1.9082164328657315e-05, | |
| "loss": 0.6222, | |
| "mean_token_accuracy": 0.8087183713912964, | |
| "num_tokens": 7707286.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.040030423121572394, | |
| "grad_norm": 1.479811429977417, | |
| "learning_rate": 1.9042084168336673e-05, | |
| "loss": 0.6132, | |
| "mean_token_accuracy": 0.8108192197978497, | |
| "num_tokens": 8028138.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.04163164004643529, | |
| "grad_norm": 1.552553415298462, | |
| "learning_rate": 1.900200400801603e-05, | |
| "loss": 0.6093, | |
| "mean_token_accuracy": 0.811344139277935, | |
| "num_tokens": 8352469.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.04323285697129819, | |
| "grad_norm": 1.608813762664795, | |
| "learning_rate": 1.8961923847695393e-05, | |
| "loss": 0.6096, | |
| "mean_token_accuracy": 0.8128134682774544, | |
| "num_tokens": 8674506.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.044834073896161085, | |
| "grad_norm": 1.488484501838684, | |
| "learning_rate": 1.892184368737475e-05, | |
| "loss": 0.5843, | |
| "mean_token_accuracy": 0.8183152191340923, | |
| "num_tokens": 8999341.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.046435290821023975, | |
| "grad_norm": 1.4090988636016846, | |
| "learning_rate": 1.888176352705411e-05, | |
| "loss": 0.5993, | |
| "mean_token_accuracy": 0.8143721930682659, | |
| "num_tokens": 9319846.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.04803650774588687, | |
| "grad_norm": 1.4540024995803833, | |
| "learning_rate": 1.8841683366733468e-05, | |
| "loss": 0.5945, | |
| "mean_token_accuracy": 0.8154042765498162, | |
| "num_tokens": 9642299.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.04963772467074977, | |
| "grad_norm": 1.4003726243972778, | |
| "learning_rate": 1.8801603206412827e-05, | |
| "loss": 0.597, | |
| "mean_token_accuracy": 0.8151420652866364, | |
| "num_tokens": 9964218.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.051238941595612666, | |
| "grad_norm": 1.5370477437973022, | |
| "learning_rate": 1.8761523046092185e-05, | |
| "loss": 0.6042, | |
| "mean_token_accuracy": 0.8127450421452522, | |
| "num_tokens": 10287745.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.052840158520475564, | |
| "grad_norm": 1.4918196201324463, | |
| "learning_rate": 1.8721442885771543e-05, | |
| "loss": 0.5926, | |
| "mean_token_accuracy": 0.8160783097147941, | |
| "num_tokens": 10608680.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.054441375445338454, | |
| "grad_norm": 1.5067648887634277, | |
| "learning_rate": 1.86813627254509e-05, | |
| "loss": 0.6034, | |
| "mean_token_accuracy": 0.8130478210747242, | |
| "num_tokens": 10929914.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.05604259237020135, | |
| "grad_norm": 1.5446354150772095, | |
| "learning_rate": 1.8641282565130263e-05, | |
| "loss": 0.6077, | |
| "mean_token_accuracy": 0.812052571028471, | |
| "num_tokens": 11253937.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.05764380929506425, | |
| "grad_norm": 1.5637918710708618, | |
| "learning_rate": 1.8601202404809622e-05, | |
| "loss": 0.612, | |
| "mean_token_accuracy": 0.8104033015668393, | |
| "num_tokens": 11577989.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.059245026219927145, | |
| "grad_norm": 1.6339150667190552, | |
| "learning_rate": 1.856112224448898e-05, | |
| "loss": 0.6029, | |
| "mean_token_accuracy": 0.8136964634060859, | |
| "num_tokens": 11901498.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.06084624314479004, | |
| "grad_norm": 1.5778098106384277, | |
| "learning_rate": 1.852104208416834e-05, | |
| "loss": 0.5969, | |
| "mean_token_accuracy": 0.8150609947741032, | |
| "num_tokens": 12222176.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.06244746006965294, | |
| "grad_norm": 1.5342057943344116, | |
| "learning_rate": 1.8480961923847697e-05, | |
| "loss": 0.598, | |
| "mean_token_accuracy": 0.8151043303310871, | |
| "num_tokens": 12541061.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.06404867699451583, | |
| "grad_norm": 1.4940104484558105, | |
| "learning_rate": 1.8440881763527055e-05, | |
| "loss": 0.6005, | |
| "mean_token_accuracy": 0.8139387130737304, | |
| "num_tokens": 12858467.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.06564989391937873, | |
| "grad_norm": 1.482343077659607, | |
| "learning_rate": 1.8400801603206414e-05, | |
| "loss": 0.5975, | |
| "mean_token_accuracy": 0.8135347843170166, | |
| "num_tokens": 13179275.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.06725111084424162, | |
| "grad_norm": 1.6798138618469238, | |
| "learning_rate": 1.8360721442885775e-05, | |
| "loss": 0.5922, | |
| "mean_token_accuracy": 0.8157666385173797, | |
| "num_tokens": 13500869.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.06885232776910452, | |
| "grad_norm": 1.4996591806411743, | |
| "learning_rate": 1.8320641282565134e-05, | |
| "loss": 0.5928, | |
| "mean_token_accuracy": 0.8151577524840832, | |
| "num_tokens": 13822306.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.07045354469396742, | |
| "grad_norm": 1.530585765838623, | |
| "learning_rate": 1.8280561122244492e-05, | |
| "loss": 0.599, | |
| "mean_token_accuracy": 0.8142477743327617, | |
| "num_tokens": 14143469.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.07205476161883032, | |
| "grad_norm": 1.468666672706604, | |
| "learning_rate": 1.824048096192385e-05, | |
| "loss": 0.5986, | |
| "mean_token_accuracy": 0.8139159172773361, | |
| "num_tokens": 14462010.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.07365597854369321, | |
| "grad_norm": 1.4792921543121338, | |
| "learning_rate": 1.820040080160321e-05, | |
| "loss": 0.5822, | |
| "mean_token_accuracy": 0.8174809239804744, | |
| "num_tokens": 14786575.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.07525719546855611, | |
| "grad_norm": 1.5551143884658813, | |
| "learning_rate": 1.8160320641282567e-05, | |
| "loss": 0.5874, | |
| "mean_token_accuracy": 0.8174747616052628, | |
| "num_tokens": 15108744.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.07685841239341899, | |
| "grad_norm": 1.628406047821045, | |
| "learning_rate": 1.8120240480961925e-05, | |
| "loss": 0.5763, | |
| "mean_token_accuracy": 0.8200252979993821, | |
| "num_tokens": 15433006.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.07845962931828189, | |
| "grad_norm": 1.442197322845459, | |
| "learning_rate": 1.8080160320641284e-05, | |
| "loss": 0.5776, | |
| "mean_token_accuracy": 0.8189563080668449, | |
| "num_tokens": 15751357.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.08006084624314479, | |
| "grad_norm": 1.4398698806762695, | |
| "learning_rate": 1.8040080160320642e-05, | |
| "loss": 0.5848, | |
| "mean_token_accuracy": 0.818307051807642, | |
| "num_tokens": 16069637.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.08166206316800768, | |
| "grad_norm": 1.4899983406066895, | |
| "learning_rate": 1.8e-05, | |
| "loss": 0.5949, | |
| "mean_token_accuracy": 0.8153203561902046, | |
| "num_tokens": 16390254.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.08326328009287058, | |
| "grad_norm": 1.4584699869155884, | |
| "learning_rate": 1.7959919839679362e-05, | |
| "loss": 0.5751, | |
| "mean_token_accuracy": 0.8211358264088631, | |
| "num_tokens": 16711945.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.08486449701773348, | |
| "grad_norm": 1.3782806396484375, | |
| "learning_rate": 1.791983967935872e-05, | |
| "loss": 0.5792, | |
| "mean_token_accuracy": 0.8189084567129612, | |
| "num_tokens": 17036263.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.08646571394259638, | |
| "grad_norm": 1.4102047681808472, | |
| "learning_rate": 1.787975951903808e-05, | |
| "loss": 0.5797, | |
| "mean_token_accuracy": 0.8194458931684494, | |
| "num_tokens": 17360014.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.08806693086745927, | |
| "grad_norm": 1.3993043899536133, | |
| "learning_rate": 1.7839679358717437e-05, | |
| "loss": 0.5882, | |
| "mean_token_accuracy": 0.8176226451992988, | |
| "num_tokens": 17678632.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.08966814779232217, | |
| "grad_norm": 1.374983549118042, | |
| "learning_rate": 1.7799599198396796e-05, | |
| "loss": 0.5791, | |
| "mean_token_accuracy": 0.8191694885492324, | |
| "num_tokens": 17997860.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.09126936471718507, | |
| "grad_norm": 1.431760549545288, | |
| "learning_rate": 1.7759519038076154e-05, | |
| "loss": 0.5596, | |
| "mean_token_accuracy": 0.8247168369591236, | |
| "num_tokens": 18319285.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.09287058164204795, | |
| "grad_norm": 1.4059189558029175, | |
| "learning_rate": 1.7719438877755512e-05, | |
| "loss": 0.5745, | |
| "mean_token_accuracy": 0.8204911254346371, | |
| "num_tokens": 18640990.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.09447179856691085, | |
| "grad_norm": 1.376071572303772, | |
| "learning_rate": 1.767935871743487e-05, | |
| "loss": 0.5784, | |
| "mean_token_accuracy": 0.8198997363448143, | |
| "num_tokens": 18962655.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.09607301549177374, | |
| "grad_norm": 1.4976638555526733, | |
| "learning_rate": 1.763927855711423e-05, | |
| "loss": 0.575, | |
| "mean_token_accuracy": 0.8209247954189778, | |
| "num_tokens": 19283609.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.09767423241663664, | |
| "grad_norm": 1.646478533744812, | |
| "learning_rate": 1.7599198396793587e-05, | |
| "loss": 0.5728, | |
| "mean_token_accuracy": 0.8210850782692433, | |
| "num_tokens": 19606331.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.09927544934149954, | |
| "grad_norm": 1.4226250648498535, | |
| "learning_rate": 1.7559118236472946e-05, | |
| "loss": 0.567, | |
| "mean_token_accuracy": 0.8224009595811367, | |
| "num_tokens": 19928113.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.10087666626636244, | |
| "grad_norm": 1.4810740947723389, | |
| "learning_rate": 1.7519038076152307e-05, | |
| "loss": 0.5742, | |
| "mean_token_accuracy": 0.8200457535684109, | |
| "num_tokens": 20250090.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.10247788319122533, | |
| "grad_norm": 1.4819321632385254, | |
| "learning_rate": 1.7478957915831666e-05, | |
| "loss": 0.5841, | |
| "mean_token_accuracy": 0.8184079818427563, | |
| "num_tokens": 20565828.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.10407910011608823, | |
| "grad_norm": 1.5129350423812866, | |
| "learning_rate": 1.7438877755511024e-05, | |
| "loss": 0.5609, | |
| "mean_token_accuracy": 0.8236034691333771, | |
| "num_tokens": 20889355.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.10568031704095113, | |
| "grad_norm": 1.4785889387130737, | |
| "learning_rate": 1.7398797595190383e-05, | |
| "loss": 0.5666, | |
| "mean_token_accuracy": 0.8222376257181168, | |
| "num_tokens": 21209813.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.10728153396581402, | |
| "grad_norm": 1.465562105178833, | |
| "learning_rate": 1.735871743486974e-05, | |
| "loss": 0.5757, | |
| "mean_token_accuracy": 0.8193745911121368, | |
| "num_tokens": 21530352.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.10888275089067691, | |
| "grad_norm": 1.4120159149169922, | |
| "learning_rate": 1.73186372745491e-05, | |
| "loss": 0.5591, | |
| "mean_token_accuracy": 0.8245022237300873, | |
| "num_tokens": 21852065.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.1104839678155398, | |
| "grad_norm": 1.4140994548797607, | |
| "learning_rate": 1.7278557114228458e-05, | |
| "loss": 0.5609, | |
| "mean_token_accuracy": 0.8247852481901645, | |
| "num_tokens": 22174963.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.1120851847404027, | |
| "grad_norm": 1.457542061805725, | |
| "learning_rate": 1.7238476953907816e-05, | |
| "loss": 0.5792, | |
| "mean_token_accuracy": 0.8197700724005699, | |
| "num_tokens": 22493835.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.1136864016652656, | |
| "grad_norm": 1.3921725749969482, | |
| "learning_rate": 1.7198396793587174e-05, | |
| "loss": 0.5733, | |
| "mean_token_accuracy": 0.8208441331982612, | |
| "num_tokens": 22815246.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.1152876185901285, | |
| "grad_norm": 1.4376442432403564, | |
| "learning_rate": 1.7158316633266533e-05, | |
| "loss": 0.5774, | |
| "mean_token_accuracy": 0.8196846872568131, | |
| "num_tokens": 23133502.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.1168888355149914, | |
| "grad_norm": 1.4338574409484863, | |
| "learning_rate": 1.7118236472945894e-05, | |
| "loss": 0.5579, | |
| "mean_token_accuracy": 0.8241358153522015, | |
| "num_tokens": 23454131.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.11849005243985429, | |
| "grad_norm": 1.5646940469741821, | |
| "learning_rate": 1.7078156312625253e-05, | |
| "loss": 0.5766, | |
| "mean_token_accuracy": 0.8180469058454036, | |
| "num_tokens": 23776345.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.12009126936471719, | |
| "grad_norm": 1.4892045259475708, | |
| "learning_rate": 1.703807615230461e-05, | |
| "loss": 0.5776, | |
| "mean_token_accuracy": 0.8196270175278186, | |
| "num_tokens": 24094995.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.12169248628958008, | |
| "grad_norm": 1.4670966863632202, | |
| "learning_rate": 1.699799599198397e-05, | |
| "loss": 0.5562, | |
| "mean_token_accuracy": 0.824824545532465, | |
| "num_tokens": 24419434.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.12329370321444298, | |
| "grad_norm": 1.3859130144119263, | |
| "learning_rate": 1.6957915831663328e-05, | |
| "loss": 0.56, | |
| "mean_token_accuracy": 0.8234731949865818, | |
| "num_tokens": 24740101.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.12489492013930588, | |
| "grad_norm": 1.4733003377914429, | |
| "learning_rate": 1.6917835671342686e-05, | |
| "loss": 0.5623, | |
| "mean_token_accuracy": 0.8231001414358616, | |
| "num_tokens": 25060596.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.12649613706416876, | |
| "grad_norm": 1.440463900566101, | |
| "learning_rate": 1.6877755511022045e-05, | |
| "loss": 0.564, | |
| "mean_token_accuracy": 0.821749759465456, | |
| "num_tokens": 25385513.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.12809735398903166, | |
| "grad_norm": 1.4805524349212646, | |
| "learning_rate": 1.6837675350701403e-05, | |
| "loss": 0.5634, | |
| "mean_token_accuracy": 0.8236978933215141, | |
| "num_tokens": 25708757.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.12969857091389456, | |
| "grad_norm": 1.4496817588806152, | |
| "learning_rate": 1.679759519038076e-05, | |
| "loss": 0.5735, | |
| "mean_token_accuracy": 0.8206959038972854, | |
| "num_tokens": 26026542.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.13129978783875745, | |
| "grad_norm": 1.3556431531906128, | |
| "learning_rate": 1.675751503006012e-05, | |
| "loss": 0.566, | |
| "mean_token_accuracy": 0.82326265797019, | |
| "num_tokens": 26346021.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.13290100476362035, | |
| "grad_norm": 1.3347582817077637, | |
| "learning_rate": 1.6717434869739478e-05, | |
| "loss": 0.5503, | |
| "mean_token_accuracy": 0.8263788960874081, | |
| "num_tokens": 26666638.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.13450222168848325, | |
| "grad_norm": 1.4640405178070068, | |
| "learning_rate": 1.667735470941884e-05, | |
| "loss": 0.5646, | |
| "mean_token_accuracy": 0.8227420128881932, | |
| "num_tokens": 26988771.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.13610343861334614, | |
| "grad_norm": 1.3605999946594238, | |
| "learning_rate": 1.6637274549098198e-05, | |
| "loss": 0.5527, | |
| "mean_token_accuracy": 0.8256938807666302, | |
| "num_tokens": 27312347.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.13770465553820904, | |
| "grad_norm": 1.4248700141906738, | |
| "learning_rate": 1.6597194388777556e-05, | |
| "loss": 0.5549, | |
| "mean_token_accuracy": 0.8253827333450318, | |
| "num_tokens": 27631652.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.13930587246307194, | |
| "grad_norm": 1.3759219646453857, | |
| "learning_rate": 1.6557114228456915e-05, | |
| "loss": 0.5604, | |
| "mean_token_accuracy": 0.8238206699490547, | |
| "num_tokens": 27955242.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.14090708938793484, | |
| "grad_norm": 1.401932716369629, | |
| "learning_rate": 1.6517034068136273e-05, | |
| "loss": 0.5605, | |
| "mean_token_accuracy": 0.8243908196687698, | |
| "num_tokens": 28273016.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.14250830631279773, | |
| "grad_norm": 1.465811014175415, | |
| "learning_rate": 1.647695390781563e-05, | |
| "loss": 0.58, | |
| "mean_token_accuracy": 0.818411073833704, | |
| "num_tokens": 28593907.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.14410952323766063, | |
| "grad_norm": 1.4966421127319336, | |
| "learning_rate": 1.6436873747494993e-05, | |
| "loss": 0.5628, | |
| "mean_token_accuracy": 0.8237097032368184, | |
| "num_tokens": 28912904.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.14571074016252353, | |
| "grad_norm": 1.4070931673049927, | |
| "learning_rate": 1.639679358717435e-05, | |
| "loss": 0.561, | |
| "mean_token_accuracy": 0.823528315126896, | |
| "num_tokens": 29232051.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.14731195708738642, | |
| "grad_norm": 1.431369662284851, | |
| "learning_rate": 1.635671342685371e-05, | |
| "loss": 0.5609, | |
| "mean_token_accuracy": 0.8236170291900635, | |
| "num_tokens": 29549895.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.14891317401224932, | |
| "grad_norm": 1.4001986980438232, | |
| "learning_rate": 1.6316633266533068e-05, | |
| "loss": 0.5252, | |
| "mean_token_accuracy": 0.8331415235996247, | |
| "num_tokens": 29872096.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.15051439093711222, | |
| "grad_norm": 1.4629161357879639, | |
| "learning_rate": 1.6276553106212427e-05, | |
| "loss": 0.5527, | |
| "mean_token_accuracy": 0.8252298124134541, | |
| "num_tokens": 30192367.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.1521156078619751, | |
| "grad_norm": 1.45273756980896, | |
| "learning_rate": 1.6236472945891785e-05, | |
| "loss": 0.5507, | |
| "mean_token_accuracy": 0.8263878554105759, | |
| "num_tokens": 30511572.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.15371682478683799, | |
| "grad_norm": 1.4437830448150635, | |
| "learning_rate": 1.6196392785571143e-05, | |
| "loss": 0.5609, | |
| "mean_token_accuracy": 0.8232449531555176, | |
| "num_tokens": 30833522.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.15531804171170088, | |
| "grad_norm": 1.4527957439422607, | |
| "learning_rate": 1.6156312625250502e-05, | |
| "loss": 0.5458, | |
| "mean_token_accuracy": 0.8275917746126652, | |
| "num_tokens": 31156837.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.15691925863656378, | |
| "grad_norm": 1.429284930229187, | |
| "learning_rate": 1.6116232464929863e-05, | |
| "loss": 0.554, | |
| "mean_token_accuracy": 0.8261115580797196, | |
| "num_tokens": 31475979.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.15852047556142668, | |
| "grad_norm": 1.480430245399475, | |
| "learning_rate": 1.6076152304609222e-05, | |
| "loss": 0.5691, | |
| "mean_token_accuracy": 0.8202250853180886, | |
| "num_tokens": 31796308.0, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.16012169248628957, | |
| "grad_norm": 1.35923171043396, | |
| "learning_rate": 1.603607214428858e-05, | |
| "loss": 0.5409, | |
| "mean_token_accuracy": 0.8294860117137433, | |
| "num_tokens": 32119772.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.16172290941115247, | |
| "grad_norm": 1.3103224039077759, | |
| "learning_rate": 1.599599198396794e-05, | |
| "loss": 0.5465, | |
| "mean_token_accuracy": 0.8272594325244427, | |
| "num_tokens": 32442861.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.16332412633601537, | |
| "grad_norm": 1.3765544891357422, | |
| "learning_rate": 1.5955911823647297e-05, | |
| "loss": 0.5428, | |
| "mean_token_accuracy": 0.8292494244873524, | |
| "num_tokens": 32765550.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.16492534326087827, | |
| "grad_norm": 1.4439506530761719, | |
| "learning_rate": 1.5915831663326655e-05, | |
| "loss": 0.5485, | |
| "mean_token_accuracy": 0.8280377320945262, | |
| "num_tokens": 33082831.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.16652656018574116, | |
| "grad_norm": 1.308767557144165, | |
| "learning_rate": 1.5875751503006014e-05, | |
| "loss": 0.5383, | |
| "mean_token_accuracy": 0.8299390114843845, | |
| "num_tokens": 33406604.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.16812777711060406, | |
| "grad_norm": 1.4223109483718872, | |
| "learning_rate": 1.5835671342685372e-05, | |
| "loss": 0.5501, | |
| "mean_token_accuracy": 0.8263260968029499, | |
| "num_tokens": 33725548.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.16972899403546696, | |
| "grad_norm": 1.4169821739196777, | |
| "learning_rate": 1.579559118236473e-05, | |
| "loss": 0.5642, | |
| "mean_token_accuracy": 0.8226730786263943, | |
| "num_tokens": 34044057.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.17133021096032985, | |
| "grad_norm": 1.363132119178772, | |
| "learning_rate": 1.575551102204409e-05, | |
| "loss": 0.5604, | |
| "mean_token_accuracy": 0.8246619693934918, | |
| "num_tokens": 34363201.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.17293142788519275, | |
| "grad_norm": 1.437795877456665, | |
| "learning_rate": 1.5715430861723447e-05, | |
| "loss": 0.5487, | |
| "mean_token_accuracy": 0.8276727601885796, | |
| "num_tokens": 34687079.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.17453264481005565, | |
| "grad_norm": 1.3839459419250488, | |
| "learning_rate": 1.567535070140281e-05, | |
| "loss": 0.548, | |
| "mean_token_accuracy": 0.8272468335926533, | |
| "num_tokens": 35010402.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.17613386173491855, | |
| "grad_norm": 1.377226710319519, | |
| "learning_rate": 1.5635270541082167e-05, | |
| "loss": 0.5437, | |
| "mean_token_accuracy": 0.8282894738018513, | |
| "num_tokens": 35330447.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.17773507865978144, | |
| "grad_norm": 1.4175010919570923, | |
| "learning_rate": 1.5595190380761525e-05, | |
| "loss": 0.5485, | |
| "mean_token_accuracy": 0.8271911948919296, | |
| "num_tokens": 35651468.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.17933629558464434, | |
| "grad_norm": 1.409951090812683, | |
| "learning_rate": 1.5555110220440884e-05, | |
| "loss": 0.567, | |
| "mean_token_accuracy": 0.8226927921175957, | |
| "num_tokens": 35974507.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.18093751250950724, | |
| "grad_norm": 1.353214144706726, | |
| "learning_rate": 1.5515030060120242e-05, | |
| "loss": 0.5479, | |
| "mean_token_accuracy": 0.8266568630933762, | |
| "num_tokens": 36295172.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.18253872943437013, | |
| "grad_norm": 1.2821177244186401, | |
| "learning_rate": 1.54749498997996e-05, | |
| "loss": 0.5538, | |
| "mean_token_accuracy": 0.8255051828920841, | |
| "num_tokens": 36613435.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.184139946359233, | |
| "grad_norm": 1.3915753364562988, | |
| "learning_rate": 1.543486973947896e-05, | |
| "loss": 0.5441, | |
| "mean_token_accuracy": 0.8281103193759918, | |
| "num_tokens": 36932500.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.1857411632840959, | |
| "grad_norm": 1.3582857847213745, | |
| "learning_rate": 1.5394789579158317e-05, | |
| "loss": 0.5442, | |
| "mean_token_accuracy": 0.8282648637890816, | |
| "num_tokens": 37255598.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.1873423802089588, | |
| "grad_norm": 1.41195809841156, | |
| "learning_rate": 1.5354709418837676e-05, | |
| "loss": 0.5463, | |
| "mean_token_accuracy": 0.8273337736725808, | |
| "num_tokens": 37576559.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.1889435971338217, | |
| "grad_norm": 1.42801034450531, | |
| "learning_rate": 1.5314629258517034e-05, | |
| "loss": 0.5381, | |
| "mean_token_accuracy": 0.8300585649907589, | |
| "num_tokens": 37899029.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.1905448140586846, | |
| "grad_norm": 1.484485387802124, | |
| "learning_rate": 1.5274549098196396e-05, | |
| "loss": 0.5527, | |
| "mean_token_accuracy": 0.826143679022789, | |
| "num_tokens": 38219539.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.1921460309835475, | |
| "grad_norm": 1.3785059452056885, | |
| "learning_rate": 1.5234468937875752e-05, | |
| "loss": 0.5423, | |
| "mean_token_accuracy": 0.8280981115996837, | |
| "num_tokens": 38541774.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.19374724790841039, | |
| "grad_norm": 1.4416697025299072, | |
| "learning_rate": 1.519438877755511e-05, | |
| "loss": 0.5536, | |
| "mean_token_accuracy": 0.8254386700689793, | |
| "num_tokens": 38863007.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.19534846483327328, | |
| "grad_norm": 1.4279732704162598, | |
| "learning_rate": 1.515430861723447e-05, | |
| "loss": 0.5463, | |
| "mean_token_accuracy": 0.8276839859783649, | |
| "num_tokens": 39182212.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.19694968175813618, | |
| "grad_norm": 1.3984514474868774, | |
| "learning_rate": 1.5114228456913829e-05, | |
| "loss": 0.5321, | |
| "mean_token_accuracy": 0.8313628524541855, | |
| "num_tokens": 39503356.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.19855089868299908, | |
| "grad_norm": 1.4602679014205933, | |
| "learning_rate": 1.5074148296593187e-05, | |
| "loss": 0.5472, | |
| "mean_token_accuracy": 0.8276026301085949, | |
| "num_tokens": 39825344.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.20015211560786197, | |
| "grad_norm": 1.4117668867111206, | |
| "learning_rate": 1.5034068136272546e-05, | |
| "loss": 0.5595, | |
| "mean_token_accuracy": 0.8228803716599942, | |
| "num_tokens": 40150098.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.20175333253272487, | |
| "grad_norm": 1.5286558866500854, | |
| "learning_rate": 1.4993987975951904e-05, | |
| "loss": 0.547, | |
| "mean_token_accuracy": 0.8276199653744698, | |
| "num_tokens": 40470403.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.20335454945758777, | |
| "grad_norm": 1.3663097620010376, | |
| "learning_rate": 1.4953907815631264e-05, | |
| "loss": 0.5484, | |
| "mean_token_accuracy": 0.8275168865919114, | |
| "num_tokens": 40791563.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.20495576638245067, | |
| "grad_norm": 1.284912347793579, | |
| "learning_rate": 1.4913827655310623e-05, | |
| "loss": 0.5351, | |
| "mean_token_accuracy": 0.830632296949625, | |
| "num_tokens": 41112978.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.20655698330731356, | |
| "grad_norm": 1.32599675655365, | |
| "learning_rate": 1.4873747494989981e-05, | |
| "loss": 0.5327, | |
| "mean_token_accuracy": 0.8312986418604851, | |
| "num_tokens": 41433487.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.20815820023217646, | |
| "grad_norm": 1.3861145973205566, | |
| "learning_rate": 1.483366733466934e-05, | |
| "loss": 0.5404, | |
| "mean_token_accuracy": 0.8290247596800328, | |
| "num_tokens": 41754957.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.20975941715703936, | |
| "grad_norm": 1.4228520393371582, | |
| "learning_rate": 1.4793587174348698e-05, | |
| "loss": 0.551, | |
| "mean_token_accuracy": 0.8258979551494121, | |
| "num_tokens": 42078353.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.21136063408190225, | |
| "grad_norm": 1.415467619895935, | |
| "learning_rate": 1.4753507014028056e-05, | |
| "loss": 0.5341, | |
| "mean_token_accuracy": 0.8312284901738167, | |
| "num_tokens": 42394346.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.21296185100676515, | |
| "grad_norm": 1.6167529821395874, | |
| "learning_rate": 1.4713426853707416e-05, | |
| "loss": 0.5362, | |
| "mean_token_accuracy": 0.8303378522396088, | |
| "num_tokens": 42713221.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.21456306793162805, | |
| "grad_norm": 1.416754961013794, | |
| "learning_rate": 1.4673346693386774e-05, | |
| "loss": 0.5209, | |
| "mean_token_accuracy": 0.8338037431240082, | |
| "num_tokens": 43034016.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.21616428485649095, | |
| "grad_norm": 1.4791791439056396, | |
| "learning_rate": 1.4633266533066133e-05, | |
| "loss": 0.5239, | |
| "mean_token_accuracy": 0.8336230233311653, | |
| "num_tokens": 43354775.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.21776550178135382, | |
| "grad_norm": 1.3203613758087158, | |
| "learning_rate": 1.4593186372745491e-05, | |
| "loss": 0.5324, | |
| "mean_token_accuracy": 0.8318623468279839, | |
| "num_tokens": 43673374.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.2193667187062167, | |
| "grad_norm": 1.3472514152526855, | |
| "learning_rate": 1.455310621242485e-05, | |
| "loss": 0.516, | |
| "mean_token_accuracy": 0.8363048598170281, | |
| "num_tokens": 43995790.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.2209679356310796, | |
| "grad_norm": 1.3604769706726074, | |
| "learning_rate": 1.451302605210421e-05, | |
| "loss": 0.5469, | |
| "mean_token_accuracy": 0.8270332351326942, | |
| "num_tokens": 44315769.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.2225691525559425, | |
| "grad_norm": 1.3541983366012573, | |
| "learning_rate": 1.4472945891783568e-05, | |
| "loss": 0.5221, | |
| "mean_token_accuracy": 0.83459262996912, | |
| "num_tokens": 44636906.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.2241703694808054, | |
| "grad_norm": 1.3885678052902222, | |
| "learning_rate": 1.4432865731462926e-05, | |
| "loss": 0.5178, | |
| "mean_token_accuracy": 0.8351704962551594, | |
| "num_tokens": 44957937.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.2257715864056683, | |
| "grad_norm": 1.4956326484680176, | |
| "learning_rate": 1.4392785571142285e-05, | |
| "loss": 0.535, | |
| "mean_token_accuracy": 0.8305199623107911, | |
| "num_tokens": 45279740.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.2273728033305312, | |
| "grad_norm": 1.519060730934143, | |
| "learning_rate": 1.4352705410821643e-05, | |
| "loss": 0.527, | |
| "mean_token_accuracy": 0.83277582898736, | |
| "num_tokens": 45603063.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.2289740202553941, | |
| "grad_norm": 1.3711844682693481, | |
| "learning_rate": 1.4312625250501003e-05, | |
| "loss": 0.5355, | |
| "mean_token_accuracy": 0.8295778013765812, | |
| "num_tokens": 45924072.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.230575237180257, | |
| "grad_norm": 1.3725755214691162, | |
| "learning_rate": 1.4272545090180361e-05, | |
| "loss": 0.523, | |
| "mean_token_accuracy": 0.8344107784330845, | |
| "num_tokens": 46247756.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.2321764541051199, | |
| "grad_norm": 1.3968268632888794, | |
| "learning_rate": 1.423246492985972e-05, | |
| "loss": 0.5415, | |
| "mean_token_accuracy": 0.8285624369978905, | |
| "num_tokens": 46568774.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.2337776710299828, | |
| "grad_norm": 1.3580690622329712, | |
| "learning_rate": 1.419238476953908e-05, | |
| "loss": 0.5234, | |
| "mean_token_accuracy": 0.8335274688899517, | |
| "num_tokens": 46892270.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.23537888795484568, | |
| "grad_norm": 1.40677809715271, | |
| "learning_rate": 1.415230460921844e-05, | |
| "loss": 0.5436, | |
| "mean_token_accuracy": 0.8272083044052124, | |
| "num_tokens": 47218605.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.23698010487970858, | |
| "grad_norm": 1.3678380250930786, | |
| "learning_rate": 1.4112224448897798e-05, | |
| "loss": 0.5351, | |
| "mean_token_accuracy": 0.8313795134425164, | |
| "num_tokens": 47540775.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.23858132180457148, | |
| "grad_norm": 1.3293787240982056, | |
| "learning_rate": 1.4072144288577156e-05, | |
| "loss": 0.5337, | |
| "mean_token_accuracy": 0.8307412274181842, | |
| "num_tokens": 47860802.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.24018253872943437, | |
| "grad_norm": 1.3010331392288208, | |
| "learning_rate": 1.4032064128256515e-05, | |
| "loss": 0.5316, | |
| "mean_token_accuracy": 0.8306872852146625, | |
| "num_tokens": 48184368.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.24178375565429727, | |
| "grad_norm": 1.382288932800293, | |
| "learning_rate": 1.3991983967935873e-05, | |
| "loss": 0.5242, | |
| "mean_token_accuracy": 0.8331730879843235, | |
| "num_tokens": 48506165.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.24338497257916017, | |
| "grad_norm": 1.436518907546997, | |
| "learning_rate": 1.3951903807615233e-05, | |
| "loss": 0.5275, | |
| "mean_token_accuracy": 0.8332164831459522, | |
| "num_tokens": 48826226.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.24498618950402307, | |
| "grad_norm": 1.2919634580612183, | |
| "learning_rate": 1.3911823647294592e-05, | |
| "loss": 0.519, | |
| "mean_token_accuracy": 0.8350857742130756, | |
| "num_tokens": 49146883.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.24658740642888596, | |
| "grad_norm": 1.3144079446792603, | |
| "learning_rate": 1.387174348697395e-05, | |
| "loss": 0.5411, | |
| "mean_token_accuracy": 0.8288080364465713, | |
| "num_tokens": 49466324.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.24818862335374886, | |
| "grad_norm": 1.5151828527450562, | |
| "learning_rate": 1.3831663326653308e-05, | |
| "loss": 0.5432, | |
| "mean_token_accuracy": 0.8270205929875374, | |
| "num_tokens": 49786371.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.24978984027861176, | |
| "grad_norm": 1.441520094871521, | |
| "learning_rate": 1.3791583166332667e-05, | |
| "loss": 0.5219, | |
| "mean_token_accuracy": 0.8341112732887268, | |
| "num_tokens": 50107593.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.25139105720347465, | |
| "grad_norm": 1.4237080812454224, | |
| "learning_rate": 1.3751503006012025e-05, | |
| "loss": 0.5435, | |
| "mean_token_accuracy": 0.8286943808197975, | |
| "num_tokens": 50426595.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.2529922741283375, | |
| "grad_norm": 1.2775644063949585, | |
| "learning_rate": 1.3711422845691385e-05, | |
| "loss": 0.5234, | |
| "mean_token_accuracy": 0.83297755792737, | |
| "num_tokens": 50747687.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.25459349105320045, | |
| "grad_norm": 1.3905959129333496, | |
| "learning_rate": 1.3671342685370743e-05, | |
| "loss": 0.531, | |
| "mean_token_accuracy": 0.8305413030087948, | |
| "num_tokens": 51064024.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.2561947079780633, | |
| "grad_norm": 1.4419200420379639, | |
| "learning_rate": 1.3631262525050102e-05, | |
| "loss": 0.5143, | |
| "mean_token_accuracy": 0.8352411143481732, | |
| "num_tokens": 51386046.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.25779592490292624, | |
| "grad_norm": 1.3886038064956665, | |
| "learning_rate": 1.359118236472946e-05, | |
| "loss": 0.5094, | |
| "mean_token_accuracy": 0.8376454688608647, | |
| "num_tokens": 51707022.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.2593971418277891, | |
| "grad_norm": 1.409141182899475, | |
| "learning_rate": 1.3551102204408818e-05, | |
| "loss": 0.5366, | |
| "mean_token_accuracy": 0.8303587555885314, | |
| "num_tokens": 52026853.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.26099835875265204, | |
| "grad_norm": 1.2826589345932007, | |
| "learning_rate": 1.3511022044088178e-05, | |
| "loss": 0.5324, | |
| "mean_token_accuracy": 0.8306440569460392, | |
| "num_tokens": 52347972.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.2625995756775149, | |
| "grad_norm": 1.3578959703445435, | |
| "learning_rate": 1.3470941883767537e-05, | |
| "loss": 0.5103, | |
| "mean_token_accuracy": 0.8366386197507382, | |
| "num_tokens": 52671515.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.26420079260237783, | |
| "grad_norm": 1.3251434564590454, | |
| "learning_rate": 1.3430861723446895e-05, | |
| "loss": 0.5261, | |
| "mean_token_accuracy": 0.8333509922027588, | |
| "num_tokens": 52992164.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.2658020095272407, | |
| "grad_norm": 1.3080499172210693, | |
| "learning_rate": 1.3390781563126254e-05, | |
| "loss": 0.5369, | |
| "mean_token_accuracy": 0.8305408164858818, | |
| "num_tokens": 53310199.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.2674032264521036, | |
| "grad_norm": 1.3889862298965454, | |
| "learning_rate": 1.3350701402805612e-05, | |
| "loss": 0.508, | |
| "mean_token_accuracy": 0.8376062199473381, | |
| "num_tokens": 53630423.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.2690044433769665, | |
| "grad_norm": 1.3830387592315674, | |
| "learning_rate": 1.3310621242484972e-05, | |
| "loss": 0.5105, | |
| "mean_token_accuracy": 0.8370765008032321, | |
| "num_tokens": 53953057.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.27060566030182936, | |
| "grad_norm": 1.2608269453048706, | |
| "learning_rate": 1.327054108216433e-05, | |
| "loss": 0.5104, | |
| "mean_token_accuracy": 0.8367661446332931, | |
| "num_tokens": 54273327.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.2722068772266923, | |
| "grad_norm": 1.3897860050201416, | |
| "learning_rate": 1.3230460921843689e-05, | |
| "loss": 0.5231, | |
| "mean_token_accuracy": 0.8334738679230214, | |
| "num_tokens": 54594530.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.27380809415155516, | |
| "grad_norm": 1.3804519176483154, | |
| "learning_rate": 1.3190380761523047e-05, | |
| "loss": 0.5229, | |
| "mean_token_accuracy": 0.8341789036989212, | |
| "num_tokens": 54915943.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.2754093110764181, | |
| "grad_norm": 1.4036991596221924, | |
| "learning_rate": 1.3150300601202405e-05, | |
| "loss": 0.533, | |
| "mean_token_accuracy": 0.8310645878314972, | |
| "num_tokens": 55236195.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.27701052800128095, | |
| "grad_norm": 1.4407809972763062, | |
| "learning_rate": 1.3110220440881765e-05, | |
| "loss": 0.5344, | |
| "mean_token_accuracy": 0.8298022277653218, | |
| "num_tokens": 55561851.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.2786117449261439, | |
| "grad_norm": 1.4469009637832642, | |
| "learning_rate": 1.3070140280561124e-05, | |
| "loss": 0.5247, | |
| "mean_token_accuracy": 0.8323244817554951, | |
| "num_tokens": 55883311.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.28021296185100675, | |
| "grad_norm": 1.447547197341919, | |
| "learning_rate": 1.3030060120240482e-05, | |
| "loss": 0.5223, | |
| "mean_token_accuracy": 0.8338434509932995, | |
| "num_tokens": 56204768.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.2818141787758697, | |
| "grad_norm": 1.2555780410766602, | |
| "learning_rate": 1.298997995991984e-05, | |
| "loss": 0.5208, | |
| "mean_token_accuracy": 0.83369060754776, | |
| "num_tokens": 56526793.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.28341539570073254, | |
| "grad_norm": 1.4045627117156982, | |
| "learning_rate": 1.2949899799599199e-05, | |
| "loss": 0.5354, | |
| "mean_token_accuracy": 0.8305397607386112, | |
| "num_tokens": 56848330.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.28501661262559547, | |
| "grad_norm": 1.3728803396224976, | |
| "learning_rate": 1.2909819639278557e-05, | |
| "loss": 0.525, | |
| "mean_token_accuracy": 0.832657416164875, | |
| "num_tokens": 57168555.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.28661782955045834, | |
| "grad_norm": 1.3874263763427734, | |
| "learning_rate": 1.2869739478957917e-05, | |
| "loss": 0.5189, | |
| "mean_token_accuracy": 0.8351394325494766, | |
| "num_tokens": 57483351.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.28821904647532126, | |
| "grad_norm": 1.5312138795852661, | |
| "learning_rate": 1.2829659318637276e-05, | |
| "loss": 0.514, | |
| "mean_token_accuracy": 0.8355598233640193, | |
| "num_tokens": 57806839.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.28982026340018413, | |
| "grad_norm": 1.278784155845642, | |
| "learning_rate": 1.2789579158316634e-05, | |
| "loss": 0.5171, | |
| "mean_token_accuracy": 0.8347181245684624, | |
| "num_tokens": 58129029.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.29142148032504706, | |
| "grad_norm": 1.4046473503112793, | |
| "learning_rate": 1.2749498997995992e-05, | |
| "loss": 0.5059, | |
| "mean_token_accuracy": 0.837676589936018, | |
| "num_tokens": 58450663.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.2930226972499099, | |
| "grad_norm": 1.2539747953414917, | |
| "learning_rate": 1.270941883767535e-05, | |
| "loss": 0.5144, | |
| "mean_token_accuracy": 0.8357288807630538, | |
| "num_tokens": 58770676.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.29462391417477285, | |
| "grad_norm": 1.285671353340149, | |
| "learning_rate": 1.266933867735471e-05, | |
| "loss": 0.5344, | |
| "mean_token_accuracy": 0.831085941195488, | |
| "num_tokens": 59090424.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.2962251310996357, | |
| "grad_norm": 1.3148458003997803, | |
| "learning_rate": 1.2629258517034069e-05, | |
| "loss": 0.5146, | |
| "mean_token_accuracy": 0.8359122090041637, | |
| "num_tokens": 59412216.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.29782634802449864, | |
| "grad_norm": 1.2842211723327637, | |
| "learning_rate": 1.2589178356713427e-05, | |
| "loss": 0.5171, | |
| "mean_token_accuracy": 0.835281378030777, | |
| "num_tokens": 59732518.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.2994275649493615, | |
| "grad_norm": 1.3869774341583252, | |
| "learning_rate": 1.2549098196392786e-05, | |
| "loss": 0.5123, | |
| "mean_token_accuracy": 0.8364998243749142, | |
| "num_tokens": 60055451.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.30102878187422444, | |
| "grad_norm": 1.4054564237594604, | |
| "learning_rate": 1.2509018036072144e-05, | |
| "loss": 0.5374, | |
| "mean_token_accuracy": 0.8293967254459857, | |
| "num_tokens": 60373896.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.3026299987990873, | |
| "grad_norm": 1.3444355726242065, | |
| "learning_rate": 1.2468937875751504e-05, | |
| "loss": 0.5183, | |
| "mean_token_accuracy": 0.8353605672717095, | |
| "num_tokens": 60694905.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.3042312157239502, | |
| "grad_norm": 1.397457480430603, | |
| "learning_rate": 1.2428857715430863e-05, | |
| "loss": 0.5127, | |
| "mean_token_accuracy": 0.8363382741808891, | |
| "num_tokens": 61016198.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.3058324326488131, | |
| "grad_norm": 1.3963935375213623, | |
| "learning_rate": 1.2388777555110221e-05, | |
| "loss": 0.5216, | |
| "mean_token_accuracy": 0.8344702877104282, | |
| "num_tokens": 61337171.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.30743364957367597, | |
| "grad_norm": 1.4211982488632202, | |
| "learning_rate": 1.234869739478958e-05, | |
| "loss": 0.5286, | |
| "mean_token_accuracy": 0.8320501960814, | |
| "num_tokens": 61656850.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.3090348664985389, | |
| "grad_norm": 1.3779051303863525, | |
| "learning_rate": 1.2308617234468938e-05, | |
| "loss": 0.5164, | |
| "mean_token_accuracy": 0.83540810495615, | |
| "num_tokens": 61975898.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.31063608342340177, | |
| "grad_norm": 1.3957812786102295, | |
| "learning_rate": 1.2268537074148296e-05, | |
| "loss": 0.5143, | |
| "mean_token_accuracy": 0.8354752957820892, | |
| "num_tokens": 62298700.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.3122373003482647, | |
| "grad_norm": 1.3711864948272705, | |
| "learning_rate": 1.2228456913827656e-05, | |
| "loss": 0.5115, | |
| "mean_token_accuracy": 0.8378391414880753, | |
| "num_tokens": 62620613.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.31383851727312756, | |
| "grad_norm": 1.4381730556488037, | |
| "learning_rate": 1.2188376753507014e-05, | |
| "loss": 0.5217, | |
| "mean_token_accuracy": 0.8346961595118045, | |
| "num_tokens": 62941883.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.3154397341979905, | |
| "grad_norm": 1.3389967679977417, | |
| "learning_rate": 1.2148296593186373e-05, | |
| "loss": 0.5131, | |
| "mean_token_accuracy": 0.8369076319038868, | |
| "num_tokens": 63264632.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.31704095112285335, | |
| "grad_norm": 1.3715007305145264, | |
| "learning_rate": 1.2108216432865731e-05, | |
| "loss": 0.5096, | |
| "mean_token_accuracy": 0.8370655708014965, | |
| "num_tokens": 63587068.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.3186421680477163, | |
| "grad_norm": 1.378260612487793, | |
| "learning_rate": 1.206813627254509e-05, | |
| "loss": 0.5073, | |
| "mean_token_accuracy": 0.8378829523921013, | |
| "num_tokens": 63908106.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.32024338497257915, | |
| "grad_norm": 1.5017547607421875, | |
| "learning_rate": 1.202805611222445e-05, | |
| "loss": 0.5134, | |
| "mean_token_accuracy": 0.835593044012785, | |
| "num_tokens": 64230902.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3218446018974421, | |
| "grad_norm": 1.4279427528381348, | |
| "learning_rate": 1.198797595190381e-05, | |
| "loss": 0.5088, | |
| "mean_token_accuracy": 0.8369226314127445, | |
| "num_tokens": 64553352.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.32344581882230494, | |
| "grad_norm": 1.3920783996582031, | |
| "learning_rate": 1.1947895791583168e-05, | |
| "loss": 0.5082, | |
| "mean_token_accuracy": 0.8379290044307709, | |
| "num_tokens": 64872776.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.32504703574716787, | |
| "grad_norm": 1.5056698322296143, | |
| "learning_rate": 1.1907815631262526e-05, | |
| "loss": 0.5141, | |
| "mean_token_accuracy": 0.8356631733477116, | |
| "num_tokens": 65191309.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.32664825267203074, | |
| "grad_norm": 1.4136664867401123, | |
| "learning_rate": 1.1867735470941886e-05, | |
| "loss": 0.5054, | |
| "mean_token_accuracy": 0.8383793398737908, | |
| "num_tokens": 65512555.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.32824946959689366, | |
| "grad_norm": 1.4516948461532593, | |
| "learning_rate": 1.1827655310621245e-05, | |
| "loss": 0.5071, | |
| "mean_token_accuracy": 0.8382188349962234, | |
| "num_tokens": 65834222.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.32985068652175653, | |
| "grad_norm": 1.3846871852874756, | |
| "learning_rate": 1.1787575150300603e-05, | |
| "loss": 0.5085, | |
| "mean_token_accuracy": 0.8372341521084309, | |
| "num_tokens": 66157553.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.33145190344661946, | |
| "grad_norm": 1.3973990678787231, | |
| "learning_rate": 1.1747494989979961e-05, | |
| "loss": 0.494, | |
| "mean_token_accuracy": 0.8409634441137314, | |
| "num_tokens": 66477710.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.3330531203714823, | |
| "grad_norm": 1.4062806367874146, | |
| "learning_rate": 1.170741482965932e-05, | |
| "loss": 0.5102, | |
| "mean_token_accuracy": 0.8364981457591056, | |
| "num_tokens": 66799644.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.33465433729634525, | |
| "grad_norm": 1.3162293434143066, | |
| "learning_rate": 1.166733466933868e-05, | |
| "loss": 0.5179, | |
| "mean_token_accuracy": 0.8351655416190624, | |
| "num_tokens": 67117711.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.3362555542212081, | |
| "grad_norm": 1.321420669555664, | |
| "learning_rate": 1.1627254509018038e-05, | |
| "loss": 0.5091, | |
| "mean_token_accuracy": 0.8373862035572529, | |
| "num_tokens": 67437650.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.337856771146071, | |
| "grad_norm": 1.342096209526062, | |
| "learning_rate": 1.1587174348697396e-05, | |
| "loss": 0.5169, | |
| "mean_token_accuracy": 0.8356272347271443, | |
| "num_tokens": 67756620.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.3394579880709339, | |
| "grad_norm": 1.4526606798171997, | |
| "learning_rate": 1.1547094188376755e-05, | |
| "loss": 0.5026, | |
| "mean_token_accuracy": 0.8400054812431336, | |
| "num_tokens": 68076803.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.3410592049957968, | |
| "grad_norm": 1.5370142459869385, | |
| "learning_rate": 1.1507014028056113e-05, | |
| "loss": 0.5207, | |
| "mean_token_accuracy": 0.8342337474226952, | |
| "num_tokens": 68392652.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.3426604219206597, | |
| "grad_norm": 1.404117465019226, | |
| "learning_rate": 1.1466933867735473e-05, | |
| "loss": 0.5088, | |
| "mean_token_accuracy": 0.8374922059476375, | |
| "num_tokens": 68712578.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.3442616388455226, | |
| "grad_norm": 1.4577999114990234, | |
| "learning_rate": 1.1426853707414832e-05, | |
| "loss": 0.5012, | |
| "mean_token_accuracy": 0.8386322245001793, | |
| "num_tokens": 69035100.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.3458628557703855, | |
| "grad_norm": 1.3693453073501587, | |
| "learning_rate": 1.138677354709419e-05, | |
| "loss": 0.5095, | |
| "mean_token_accuracy": 0.8373454891145229, | |
| "num_tokens": 69356912.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.34746407269524837, | |
| "grad_norm": 1.340850830078125, | |
| "learning_rate": 1.1346693386773548e-05, | |
| "loss": 0.4995, | |
| "mean_token_accuracy": 0.8392350934445858, | |
| "num_tokens": 69676682.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.3490652896201113, | |
| "grad_norm": 1.417924165725708, | |
| "learning_rate": 1.1306613226452907e-05, | |
| "loss": 0.5102, | |
| "mean_token_accuracy": 0.836965323984623, | |
| "num_tokens": 69996716.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.35066650654497417, | |
| "grad_norm": 1.3659642934799194, | |
| "learning_rate": 1.1266533066132267e-05, | |
| "loss": 0.5214, | |
| "mean_token_accuracy": 0.8338551007211208, | |
| "num_tokens": 70317035.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.3522677234698371, | |
| "grad_norm": 1.3467968702316284, | |
| "learning_rate": 1.1226452905811625e-05, | |
| "loss": 0.4923, | |
| "mean_token_accuracy": 0.8416352264583111, | |
| "num_tokens": 70640405.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.35386894039469996, | |
| "grad_norm": 1.3253226280212402, | |
| "learning_rate": 1.1186372745490983e-05, | |
| "loss": 0.5146, | |
| "mean_token_accuracy": 0.8355057209730148, | |
| "num_tokens": 70960326.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.3554701573195629, | |
| "grad_norm": 1.3283216953277588, | |
| "learning_rate": 1.1146292585170342e-05, | |
| "loss": 0.5055, | |
| "mean_token_accuracy": 0.8376817353069782, | |
| "num_tokens": 71281551.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.35707137424442575, | |
| "grad_norm": 1.3400591611862183, | |
| "learning_rate": 1.11062124248497e-05, | |
| "loss": 0.5117, | |
| "mean_token_accuracy": 0.835385499149561, | |
| "num_tokens": 71597894.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.3586725911692887, | |
| "grad_norm": 1.3082510232925415, | |
| "learning_rate": 1.1066132264529058e-05, | |
| "loss": 0.5095, | |
| "mean_token_accuracy": 0.8371740274131299, | |
| "num_tokens": 71916696.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.36027380809415155, | |
| "grad_norm": 1.3859164714813232, | |
| "learning_rate": 1.1026052104208418e-05, | |
| "loss": 0.4923, | |
| "mean_token_accuracy": 0.8419888988137245, | |
| "num_tokens": 72239617.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.3618750250190145, | |
| "grad_norm": 1.3124301433563232, | |
| "learning_rate": 1.0985971943887777e-05, | |
| "loss": 0.4986, | |
| "mean_token_accuracy": 0.8399450138211251, | |
| "num_tokens": 72559883.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.36347624194387734, | |
| "grad_norm": 1.457430124282837, | |
| "learning_rate": 1.0945891783567135e-05, | |
| "loss": 0.5145, | |
| "mean_token_accuracy": 0.8359267294406891, | |
| "num_tokens": 72880607.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.36507745886874027, | |
| "grad_norm": 1.3115805387496948, | |
| "learning_rate": 1.0905811623246494e-05, | |
| "loss": 0.5135, | |
| "mean_token_accuracy": 0.8364898063242435, | |
| "num_tokens": 73201909.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.36667867579360314, | |
| "grad_norm": 1.3560421466827393, | |
| "learning_rate": 1.0865731462925852e-05, | |
| "loss": 0.5092, | |
| "mean_token_accuracy": 0.8378934688866139, | |
| "num_tokens": 73520717.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.368279892718466, | |
| "grad_norm": 1.383169174194336, | |
| "learning_rate": 1.0825651302605212e-05, | |
| "loss": 0.5055, | |
| "mean_token_accuracy": 0.8381465241312981, | |
| "num_tokens": 73840716.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.36988110964332893, | |
| "grad_norm": 1.3521928787231445, | |
| "learning_rate": 1.078557114228457e-05, | |
| "loss": 0.5153, | |
| "mean_token_accuracy": 0.8346577867865562, | |
| "num_tokens": 74161813.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.3714823265681918, | |
| "grad_norm": 1.4678071737289429, | |
| "learning_rate": 1.0745490981963929e-05, | |
| "loss": 0.5051, | |
| "mean_token_accuracy": 0.8379949778318405, | |
| "num_tokens": 74482446.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.3730835434930547, | |
| "grad_norm": 1.4253208637237549, | |
| "learning_rate": 1.0705410821643287e-05, | |
| "loss": 0.4978, | |
| "mean_token_accuracy": 0.84078124538064, | |
| "num_tokens": 74805794.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.3746847604179176, | |
| "grad_norm": 1.3019065856933594, | |
| "learning_rate": 1.0665330661322645e-05, | |
| "loss": 0.4955, | |
| "mean_token_accuracy": 0.8400741912424564, | |
| "num_tokens": 75126615.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.3762859773427805, | |
| "grad_norm": 1.3907262086868286, | |
| "learning_rate": 1.0625250501002005e-05, | |
| "loss": 0.4983, | |
| "mean_token_accuracy": 0.839722640812397, | |
| "num_tokens": 75449259.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.3778871942676434, | |
| "grad_norm": 1.4004820585250854, | |
| "learning_rate": 1.0585170340681364e-05, | |
| "loss": 0.5007, | |
| "mean_token_accuracy": 0.8395778320729732, | |
| "num_tokens": 75771850.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.3794884111925063, | |
| "grad_norm": 1.3553279638290405, | |
| "learning_rate": 1.0545090180360722e-05, | |
| "loss": 0.4984, | |
| "mean_token_accuracy": 0.8410831809043884, | |
| "num_tokens": 76091494.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.3810896281173692, | |
| "grad_norm": 1.3445892333984375, | |
| "learning_rate": 1.050501002004008e-05, | |
| "loss": 0.4959, | |
| "mean_token_accuracy": 0.8421156950294971, | |
| "num_tokens": 76410965.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.3826908450422321, | |
| "grad_norm": 1.3246954679489136, | |
| "learning_rate": 1.0464929859719439e-05, | |
| "loss": 0.4965, | |
| "mean_token_accuracy": 0.8410131432116031, | |
| "num_tokens": 76732767.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.384292061967095, | |
| "grad_norm": 1.3353439569473267, | |
| "learning_rate": 1.0424849699398797e-05, | |
| "loss": 0.5087, | |
| "mean_token_accuracy": 0.8370159074664116, | |
| "num_tokens": 77053743.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.3858932788919579, | |
| "grad_norm": 1.3861980438232422, | |
| "learning_rate": 1.0384769539078157e-05, | |
| "loss": 0.5224, | |
| "mean_token_accuracy": 0.8334609299898148, | |
| "num_tokens": 77373771.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.38749449581682077, | |
| "grad_norm": 1.3504012823104858, | |
| "learning_rate": 1.0344689378757516e-05, | |
| "loss": 0.4983, | |
| "mean_token_accuracy": 0.8405769415199756, | |
| "num_tokens": 77695514.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.3890957127416837, | |
| "grad_norm": 1.3298859596252441, | |
| "learning_rate": 1.0304609218436874e-05, | |
| "loss": 0.5159, | |
| "mean_token_accuracy": 0.8361081972718238, | |
| "num_tokens": 78013751.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.39069692966654657, | |
| "grad_norm": 1.3871954679489136, | |
| "learning_rate": 1.0264529058116232e-05, | |
| "loss": 0.5088, | |
| "mean_token_accuracy": 0.8377067312598229, | |
| "num_tokens": 78331269.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.3922981465914095, | |
| "grad_norm": 1.3551021814346313, | |
| "learning_rate": 1.022444889779559e-05, | |
| "loss": 0.4959, | |
| "mean_token_accuracy": 0.8411931954324245, | |
| "num_tokens": 78651674.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.39389936351627236, | |
| "grad_norm": 1.3737174272537231, | |
| "learning_rate": 1.018436873747495e-05, | |
| "loss": 0.505, | |
| "mean_token_accuracy": 0.8385034531354905, | |
| "num_tokens": 78969171.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.3955005804411353, | |
| "grad_norm": 1.35822594165802, | |
| "learning_rate": 1.0144288577154309e-05, | |
| "loss": 0.4916, | |
| "mean_token_accuracy": 0.8429812632501126, | |
| "num_tokens": 79286752.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.39710179736599815, | |
| "grad_norm": 1.2987345457077026, | |
| "learning_rate": 1.0104208416833667e-05, | |
| "loss": 0.499, | |
| "mean_token_accuracy": 0.8391521126031876, | |
| "num_tokens": 79610622.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.3987030142908611, | |
| "grad_norm": 1.3836746215820312, | |
| "learning_rate": 1.0064128256513026e-05, | |
| "loss": 0.4993, | |
| "mean_token_accuracy": 0.8398879647254944, | |
| "num_tokens": 79932869.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.40030423121572395, | |
| "grad_norm": 1.3992457389831543, | |
| "learning_rate": 1.0024048096192384e-05, | |
| "loss": 0.5066, | |
| "mean_token_accuracy": 0.8372752889990807, | |
| "num_tokens": 80256060.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.4019054481405868, | |
| "grad_norm": 1.3400214910507202, | |
| "learning_rate": 9.983967935871744e-06, | |
| "loss": 0.4926, | |
| "mean_token_accuracy": 0.8424744494259357, | |
| "num_tokens": 80580299.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.40350666506544974, | |
| "grad_norm": 1.3263059854507446, | |
| "learning_rate": 9.943887775551103e-06, | |
| "loss": 0.5024, | |
| "mean_token_accuracy": 0.8391487777233124, | |
| "num_tokens": 80899422.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.4051078819903126, | |
| "grad_norm": 1.3698290586471558, | |
| "learning_rate": 9.903807615230463e-06, | |
| "loss": 0.5054, | |
| "mean_token_accuracy": 0.8376754596829414, | |
| "num_tokens": 81218682.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.40670909891517554, | |
| "grad_norm": 1.3288636207580566, | |
| "learning_rate": 9.863727454909821e-06, | |
| "loss": 0.4906, | |
| "mean_token_accuracy": 0.8424558945000171, | |
| "num_tokens": 81536321.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.4083103158400384, | |
| "grad_norm": 1.363012433052063, | |
| "learning_rate": 9.82364729458918e-06, | |
| "loss": 0.5048, | |
| "mean_token_accuracy": 0.8382453963160514, | |
| "num_tokens": 81855653.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.40991153276490133, | |
| "grad_norm": 1.3733346462249756, | |
| "learning_rate": 9.783567134268538e-06, | |
| "loss": 0.5051, | |
| "mean_token_accuracy": 0.8381193451583385, | |
| "num_tokens": 82175730.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.4115127496897642, | |
| "grad_norm": 1.415134310722351, | |
| "learning_rate": 9.743486973947896e-06, | |
| "loss": 0.4965, | |
| "mean_token_accuracy": 0.8402998454868793, | |
| "num_tokens": 82497654.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.4131139666146271, | |
| "grad_norm": 1.3909255266189575, | |
| "learning_rate": 9.703406813627256e-06, | |
| "loss": 0.4916, | |
| "mean_token_accuracy": 0.8428246550261974, | |
| "num_tokens": 82818882.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.41471518353949, | |
| "grad_norm": 1.4890861511230469, | |
| "learning_rate": 9.663326653306614e-06, | |
| "loss": 0.4973, | |
| "mean_token_accuracy": 0.8410121746361255, | |
| "num_tokens": 83141262.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.4163164004643529, | |
| "grad_norm": 1.2782690525054932, | |
| "learning_rate": 9.623246492985973e-06, | |
| "loss": 0.497, | |
| "mean_token_accuracy": 0.8404788970947266, | |
| "num_tokens": 83463591.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.4179176173892158, | |
| "grad_norm": 1.4288479089736938, | |
| "learning_rate": 9.583166332665331e-06, | |
| "loss": 0.4985, | |
| "mean_token_accuracy": 0.8398890845477581, | |
| "num_tokens": 83786227.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.4195188343140787, | |
| "grad_norm": 1.3501110076904297, | |
| "learning_rate": 9.54308617234469e-06, | |
| "loss": 0.5178, | |
| "mean_token_accuracy": 0.8343347430229187, | |
| "num_tokens": 84107788.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.4211200512389416, | |
| "grad_norm": 1.4550552368164062, | |
| "learning_rate": 9.503006012024048e-06, | |
| "loss": 0.5067, | |
| "mean_token_accuracy": 0.8379735544323921, | |
| "num_tokens": 84428834.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.4227212681638045, | |
| "grad_norm": 1.4233253002166748, | |
| "learning_rate": 9.462925851703408e-06, | |
| "loss": 0.4972, | |
| "mean_token_accuracy": 0.8409186281263828, | |
| "num_tokens": 84748101.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.4243224850886674, | |
| "grad_norm": 1.352547526359558, | |
| "learning_rate": 9.422845691382766e-06, | |
| "loss": 0.4919, | |
| "mean_token_accuracy": 0.8416687369346618, | |
| "num_tokens": 85070665.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.4259237020135303, | |
| "grad_norm": 1.3560786247253418, | |
| "learning_rate": 9.382765531062125e-06, | |
| "loss": 0.516, | |
| "mean_token_accuracy": 0.835656214505434, | |
| "num_tokens": 85390075.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.4275249189383932, | |
| "grad_norm": 1.345350742340088, | |
| "learning_rate": 9.342685370741483e-06, | |
| "loss": 0.4822, | |
| "mean_token_accuracy": 0.8448456957936287, | |
| "num_tokens": 85711180.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.4291261358632561, | |
| "grad_norm": 1.4362421035766602, | |
| "learning_rate": 9.302605210420841e-06, | |
| "loss": 0.4864, | |
| "mean_token_accuracy": 0.8442413218319416, | |
| "num_tokens": 86033712.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.43072735278811897, | |
| "grad_norm": 1.4138871431350708, | |
| "learning_rate": 9.262525050100201e-06, | |
| "loss": 0.4863, | |
| "mean_token_accuracy": 0.8433097846806049, | |
| "num_tokens": 86357770.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.4323285697129819, | |
| "grad_norm": 1.3361561298370361, | |
| "learning_rate": 9.22244488977956e-06, | |
| "loss": 0.4998, | |
| "mean_token_accuracy": 0.8395190499722958, | |
| "num_tokens": 86677995.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.43392978663784476, | |
| "grad_norm": 1.4338953495025635, | |
| "learning_rate": 9.18236472945892e-06, | |
| "loss": 0.5036, | |
| "mean_token_accuracy": 0.8392317593097687, | |
| "num_tokens": 86998421.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.43553100356270763, | |
| "grad_norm": 1.3419009447097778, | |
| "learning_rate": 9.142284569138278e-06, | |
| "loss": 0.4882, | |
| "mean_token_accuracy": 0.8429043956100941, | |
| "num_tokens": 87320198.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.43713222048757056, | |
| "grad_norm": 1.3865110874176025, | |
| "learning_rate": 9.102204408817636e-06, | |
| "loss": 0.5061, | |
| "mean_token_accuracy": 0.8393984533846378, | |
| "num_tokens": 87637830.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.4387334374124334, | |
| "grad_norm": 1.2503968477249146, | |
| "learning_rate": 9.062124248496995e-06, | |
| "loss": 0.5001, | |
| "mean_token_accuracy": 0.8395638100802898, | |
| "num_tokens": 87958915.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.44033465433729635, | |
| "grad_norm": 1.3062018156051636, | |
| "learning_rate": 9.022044088176353e-06, | |
| "loss": 0.4927, | |
| "mean_token_accuracy": 0.8417042560875416, | |
| "num_tokens": 88278719.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.4419358712621592, | |
| "grad_norm": 1.3273067474365234, | |
| "learning_rate": 8.981963927855713e-06, | |
| "loss": 0.4835, | |
| "mean_token_accuracy": 0.8445591539144516, | |
| "num_tokens": 88601269.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.44353708818702214, | |
| "grad_norm": 1.3917337656021118, | |
| "learning_rate": 8.941883767535072e-06, | |
| "loss": 0.485, | |
| "mean_token_accuracy": 0.8442142225801945, | |
| "num_tokens": 88923914.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.445138305111885, | |
| "grad_norm": 1.4732093811035156, | |
| "learning_rate": 8.90180360721443e-06, | |
| "loss": 0.4954, | |
| "mean_token_accuracy": 0.8403341613709927, | |
| "num_tokens": 89246546.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.44673952203674794, | |
| "grad_norm": 1.3622393608093262, | |
| "learning_rate": 8.861723446893788e-06, | |
| "loss": 0.5039, | |
| "mean_token_accuracy": 0.8388668954372406, | |
| "num_tokens": 89564556.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.4483407389616108, | |
| "grad_norm": 1.2903145551681519, | |
| "learning_rate": 8.821643286573147e-06, | |
| "loss": 0.4841, | |
| "mean_token_accuracy": 0.8444612257182598, | |
| "num_tokens": 89884328.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.44994195588647373, | |
| "grad_norm": 1.3457651138305664, | |
| "learning_rate": 8.781563126252507e-06, | |
| "loss": 0.4876, | |
| "mean_token_accuracy": 0.843576093763113, | |
| "num_tokens": 90201742.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.4515431728113366, | |
| "grad_norm": 1.340334177017212, | |
| "learning_rate": 8.741482965931865e-06, | |
| "loss": 0.4837, | |
| "mean_token_accuracy": 0.8439679995179177, | |
| "num_tokens": 90526771.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.4531443897361995, | |
| "grad_norm": 1.3280917406082153, | |
| "learning_rate": 8.701402805611223e-06, | |
| "loss": 0.4924, | |
| "mean_token_accuracy": 0.8423436298966408, | |
| "num_tokens": 90845097.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.4547456066610624, | |
| "grad_norm": 1.3423433303833008, | |
| "learning_rate": 8.661322645290582e-06, | |
| "loss": 0.4925, | |
| "mean_token_accuracy": 0.8414337895810604, | |
| "num_tokens": 91167887.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.4563468235859253, | |
| "grad_norm": 1.3706343173980713, | |
| "learning_rate": 8.62124248496994e-06, | |
| "loss": 0.4935, | |
| "mean_token_accuracy": 0.8412027418613434, | |
| "num_tokens": 91488957.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.4579480405107882, | |
| "grad_norm": 1.3505706787109375, | |
| "learning_rate": 8.581162324649298e-06, | |
| "loss": 0.4966, | |
| "mean_token_accuracy": 0.8411595188081264, | |
| "num_tokens": 91811009.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.4595492574356511, | |
| "grad_norm": 1.452344298362732, | |
| "learning_rate": 8.541082164328658e-06, | |
| "loss": 0.4985, | |
| "mean_token_accuracy": 0.8402431011199951, | |
| "num_tokens": 92130613.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.461150474360514, | |
| "grad_norm": 1.3361514806747437, | |
| "learning_rate": 8.501002004008017e-06, | |
| "loss": 0.4899, | |
| "mean_token_accuracy": 0.8420615963637829, | |
| "num_tokens": 92449224.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.4627516912853769, | |
| "grad_norm": 1.418734073638916, | |
| "learning_rate": 8.460921843687375e-06, | |
| "loss": 0.4924, | |
| "mean_token_accuracy": 0.8421052910387516, | |
| "num_tokens": 92770044.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.4643529082102398, | |
| "grad_norm": 1.4310579299926758, | |
| "learning_rate": 8.420841683366734e-06, | |
| "loss": 0.469, | |
| "mean_token_accuracy": 0.8483155004680156, | |
| "num_tokens": 93094237.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.4659541251351027, | |
| "grad_norm": 1.3272638320922852, | |
| "learning_rate": 8.380761523046092e-06, | |
| "loss": 0.4888, | |
| "mean_token_accuracy": 0.8436732694506646, | |
| "num_tokens": 93409585.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.4675553420599656, | |
| "grad_norm": 1.3308969736099243, | |
| "learning_rate": 8.340681362725452e-06, | |
| "loss": 0.4957, | |
| "mean_token_accuracy": 0.8402362495660782, | |
| "num_tokens": 93731419.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.46915655898482844, | |
| "grad_norm": 1.3682903051376343, | |
| "learning_rate": 8.30060120240481e-06, | |
| "loss": 0.4902, | |
| "mean_token_accuracy": 0.8418670207262039, | |
| "num_tokens": 94053839.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.47075777590969137, | |
| "grad_norm": 1.428017020225525, | |
| "learning_rate": 8.260521042084169e-06, | |
| "loss": 0.4911, | |
| "mean_token_accuracy": 0.8421666778624057, | |
| "num_tokens": 94376862.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.47235899283455424, | |
| "grad_norm": 1.2561357021331787, | |
| "learning_rate": 8.220440881763527e-06, | |
| "loss": 0.4687, | |
| "mean_token_accuracy": 0.8477826707065106, | |
| "num_tokens": 94698788.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.47396020975941716, | |
| "grad_norm": 1.3731807470321655, | |
| "learning_rate": 8.180360721442885e-06, | |
| "loss": 0.4948, | |
| "mean_token_accuracy": 0.8410520948469639, | |
| "num_tokens": 95018237.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.47556142668428003, | |
| "grad_norm": 1.4014275074005127, | |
| "learning_rate": 8.140280561122245e-06, | |
| "loss": 0.5026, | |
| "mean_token_accuracy": 0.8390306062996388, | |
| "num_tokens": 95340071.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.47716264360914296, | |
| "grad_norm": 1.3577542304992676, | |
| "learning_rate": 8.100200400801604e-06, | |
| "loss": 0.485, | |
| "mean_token_accuracy": 0.8437927052378654, | |
| "num_tokens": 95663929.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.4787638605340058, | |
| "grad_norm": 1.3654958009719849, | |
| "learning_rate": 8.060120240480964e-06, | |
| "loss": 0.4916, | |
| "mean_token_accuracy": 0.842602127045393, | |
| "num_tokens": 95981460.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.48036507745886875, | |
| "grad_norm": 1.3493473529815674, | |
| "learning_rate": 8.020040080160322e-06, | |
| "loss": 0.4958, | |
| "mean_token_accuracy": 0.8409499667584897, | |
| "num_tokens": 96299611.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.4819662943837316, | |
| "grad_norm": 1.2955293655395508, | |
| "learning_rate": 7.97995991983968e-06, | |
| "loss": 0.474, | |
| "mean_token_accuracy": 0.8469664104282856, | |
| "num_tokens": 96619890.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.48356751130859454, | |
| "grad_norm": 1.394558072090149, | |
| "learning_rate": 7.939879759519039e-06, | |
| "loss": 0.4724, | |
| "mean_token_accuracy": 0.8480439126491547, | |
| "num_tokens": 96941934.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.4851687282334574, | |
| "grad_norm": 1.3079112768173218, | |
| "learning_rate": 7.899799599198397e-06, | |
| "loss": 0.4771, | |
| "mean_token_accuracy": 0.8466820225119591, | |
| "num_tokens": 97263833.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.48676994515832034, | |
| "grad_norm": 1.3734931945800781, | |
| "learning_rate": 7.859719438877757e-06, | |
| "loss": 0.4913, | |
| "mean_token_accuracy": 0.8421886332333088, | |
| "num_tokens": 97589768.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.4883711620831832, | |
| "grad_norm": 1.4044214487075806, | |
| "learning_rate": 7.819639278557116e-06, | |
| "loss": 0.4773, | |
| "mean_token_accuracy": 0.8462626121938228, | |
| "num_tokens": 97909366.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.48997237900804613, | |
| "grad_norm": 1.3782063722610474, | |
| "learning_rate": 7.779559118236474e-06, | |
| "loss": 0.5033, | |
| "mean_token_accuracy": 0.8381896771490573, | |
| "num_tokens": 98231390.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.491573595932909, | |
| "grad_norm": 1.3914376497268677, | |
| "learning_rate": 7.739478957915832e-06, | |
| "loss": 0.4725, | |
| "mean_token_accuracy": 0.8477898120880127, | |
| "num_tokens": 98549143.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.4931748128577719, | |
| "grad_norm": 1.3243604898452759, | |
| "learning_rate": 7.69939879759519e-06, | |
| "loss": 0.4824, | |
| "mean_token_accuracy": 0.8446347959339618, | |
| "num_tokens": 98872752.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.4947760297826348, | |
| "grad_norm": 1.2843396663665771, | |
| "learning_rate": 7.659318637274549e-06, | |
| "loss": 0.4847, | |
| "mean_token_accuracy": 0.844062016159296, | |
| "num_tokens": 99191883.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.4963772467074977, | |
| "grad_norm": 1.322749376296997, | |
| "learning_rate": 7.619238476953908e-06, | |
| "loss": 0.4798, | |
| "mean_token_accuracy": 0.8457852609455585, | |
| "num_tokens": 99510490.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.4979784636323606, | |
| "grad_norm": 1.319571614265442, | |
| "learning_rate": 7.5791583166332674e-06, | |
| "loss": 0.4784, | |
| "mean_token_accuracy": 0.8460357464849949, | |
| "num_tokens": 99832436.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.4995796805572235, | |
| "grad_norm": 1.3875261545181274, | |
| "learning_rate": 7.539078156312626e-06, | |
| "loss": 0.4747, | |
| "mean_token_accuracy": 0.8463287480175495, | |
| "num_tokens": 100154761.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.5011808974820864, | |
| "grad_norm": 1.3350956439971924, | |
| "learning_rate": 7.498997995991984e-06, | |
| "loss": 0.4667, | |
| "mean_token_accuracy": 0.8492703415453434, | |
| "num_tokens": 100479640.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.5027821144069493, | |
| "grad_norm": 1.340723991394043, | |
| "learning_rate": 7.458917835671343e-06, | |
| "loss": 0.4789, | |
| "mean_token_accuracy": 0.84622046276927, | |
| "num_tokens": 100801858.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.5043833313318121, | |
| "grad_norm": 1.4275963306427002, | |
| "learning_rate": 7.418837675350702e-06, | |
| "loss": 0.4777, | |
| "mean_token_accuracy": 0.8459878988564015, | |
| "num_tokens": 101122875.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.505984548256675, | |
| "grad_norm": 1.3403825759887695, | |
| "learning_rate": 7.378757515030061e-06, | |
| "loss": 0.4864, | |
| "mean_token_accuracy": 0.8435619659721851, | |
| "num_tokens": 101445138.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.507585765181538, | |
| "grad_norm": 1.3267710208892822, | |
| "learning_rate": 7.338677354709419e-06, | |
| "loss": 0.471, | |
| "mean_token_accuracy": 0.8473985768854618, | |
| "num_tokens": 101769313.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.5091869821064009, | |
| "grad_norm": 1.3425947427749634, | |
| "learning_rate": 7.298597194388778e-06, | |
| "loss": 0.5004, | |
| "mean_token_accuracy": 0.8391834445297718, | |
| "num_tokens": 102089028.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.5107881990312637, | |
| "grad_norm": 1.372733235359192, | |
| "learning_rate": 7.258517034068137e-06, | |
| "loss": 0.4849, | |
| "mean_token_accuracy": 0.8438459776341916, | |
| "num_tokens": 102408536.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.5123894159561266, | |
| "grad_norm": 1.3354692459106445, | |
| "learning_rate": 7.218436873747495e-06, | |
| "loss": 0.4868, | |
| "mean_token_accuracy": 0.8435739390552044, | |
| "num_tokens": 102729129.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.5139906328809896, | |
| "grad_norm": 1.391961693763733, | |
| "learning_rate": 7.1783567134268535e-06, | |
| "loss": 0.4754, | |
| "mean_token_accuracy": 0.845844616740942, | |
| "num_tokens": 103050875.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.5155918498058525, | |
| "grad_norm": 1.372835397720337, | |
| "learning_rate": 7.138276553106213e-06, | |
| "loss": 0.4858, | |
| "mean_token_accuracy": 0.8446020767092705, | |
| "num_tokens": 103370747.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.5171930667307153, | |
| "grad_norm": 1.3345670700073242, | |
| "learning_rate": 7.098196392785571e-06, | |
| "loss": 0.4963, | |
| "mean_token_accuracy": 0.840513052046299, | |
| "num_tokens": 103688117.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.5187942836555782, | |
| "grad_norm": 1.385915756225586, | |
| "learning_rate": 7.05811623246493e-06, | |
| "loss": 0.4825, | |
| "mean_token_accuracy": 0.8452211946249009, | |
| "num_tokens": 104007055.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.5203955005804412, | |
| "grad_norm": 1.367280125617981, | |
| "learning_rate": 7.018036072144289e-06, | |
| "loss": 0.4805, | |
| "mean_token_accuracy": 0.8464450456202031, | |
| "num_tokens": 104327148.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.5219967175053041, | |
| "grad_norm": 1.3158072233200073, | |
| "learning_rate": 6.977955911823649e-06, | |
| "loss": 0.4859, | |
| "mean_token_accuracy": 0.8442385122179985, | |
| "num_tokens": 104647169.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.5235979344301669, | |
| "grad_norm": 1.403935432434082, | |
| "learning_rate": 6.937875751503007e-06, | |
| "loss": 0.4841, | |
| "mean_token_accuracy": 0.844200960546732, | |
| "num_tokens": 104966435.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.5251991513550298, | |
| "grad_norm": 1.4430937767028809, | |
| "learning_rate": 6.897795591182365e-06, | |
| "loss": 0.485, | |
| "mean_token_accuracy": 0.8436707988381386, | |
| "num_tokens": 105288924.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.5268003682798927, | |
| "grad_norm": 1.390043020248413, | |
| "learning_rate": 6.857715430861725e-06, | |
| "loss": 0.4778, | |
| "mean_token_accuracy": 0.8453225284814835, | |
| "num_tokens": 105613010.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.5284015852047557, | |
| "grad_norm": 1.323838710784912, | |
| "learning_rate": 6.817635270541083e-06, | |
| "loss": 0.4904, | |
| "mean_token_accuracy": 0.8430100113153458, | |
| "num_tokens": 105928275.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.5300028021296185, | |
| "grad_norm": 1.3698164224624634, | |
| "learning_rate": 6.777555110220442e-06, | |
| "loss": 0.4838, | |
| "mean_token_accuracy": 0.8435216702520847, | |
| "num_tokens": 106248267.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.5316040190544814, | |
| "grad_norm": 1.3220192193984985, | |
| "learning_rate": 6.7374749498998005e-06, | |
| "loss": 0.4832, | |
| "mean_token_accuracy": 0.844068469107151, | |
| "num_tokens": 106569992.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.5332052359793443, | |
| "grad_norm": 1.404192566871643, | |
| "learning_rate": 6.697394789579159e-06, | |
| "loss": 0.4819, | |
| "mean_token_accuracy": 0.8442453131079674, | |
| "num_tokens": 106888843.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.5348064529042073, | |
| "grad_norm": 1.3688969612121582, | |
| "learning_rate": 6.657314629258518e-06, | |
| "loss": 0.472, | |
| "mean_token_accuracy": 0.8473145790398121, | |
| "num_tokens": 107211805.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.5364076698290701, | |
| "grad_norm": 1.3773363828659058, | |
| "learning_rate": 6.617234468937876e-06, | |
| "loss": 0.4798, | |
| "mean_token_accuracy": 0.8453869082033634, | |
| "num_tokens": 107533886.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.538008886753933, | |
| "grad_norm": 1.3996471166610718, | |
| "learning_rate": 6.577154308617235e-06, | |
| "loss": 0.4584, | |
| "mean_token_accuracy": 0.8509170480072499, | |
| "num_tokens": 107854467.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.5396101036787959, | |
| "grad_norm": 1.318352460861206, | |
| "learning_rate": 6.537074148296594e-06, | |
| "loss": 0.4747, | |
| "mean_token_accuracy": 0.848233550041914, | |
| "num_tokens": 108170069.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.5412113206036587, | |
| "grad_norm": 1.3130826950073242, | |
| "learning_rate": 6.496993987975952e-06, | |
| "loss": 0.4783, | |
| "mean_token_accuracy": 0.8458823367953301, | |
| "num_tokens": 108491213.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.5428125375285217, | |
| "grad_norm": 1.3533992767333984, | |
| "learning_rate": 6.4569138276553115e-06, | |
| "loss": 0.4846, | |
| "mean_token_accuracy": 0.8436866298317909, | |
| "num_tokens": 108810115.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.5444137544533846, | |
| "grad_norm": 1.448379635810852, | |
| "learning_rate": 6.41683366733467e-06, | |
| "loss": 0.4786, | |
| "mean_token_accuracy": 0.8458628259599209, | |
| "num_tokens": 109131087.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5460149713782475, | |
| "grad_norm": 1.3309921026229858, | |
| "learning_rate": 6.376753507014028e-06, | |
| "loss": 0.4744, | |
| "mean_token_accuracy": 0.8471526563167572, | |
| "num_tokens": 109452761.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.5476161883031103, | |
| "grad_norm": 1.3475972414016724, | |
| "learning_rate": 6.3366733466933874e-06, | |
| "loss": 0.4843, | |
| "mean_token_accuracy": 0.843706914037466, | |
| "num_tokens": 109773661.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.5492174052279732, | |
| "grad_norm": 1.3514693975448608, | |
| "learning_rate": 6.296593186372746e-06, | |
| "loss": 0.4738, | |
| "mean_token_accuracy": 0.8470919780433178, | |
| "num_tokens": 110096533.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.5508186221528362, | |
| "grad_norm": 1.2711541652679443, | |
| "learning_rate": 6.256513026052104e-06, | |
| "loss": 0.4855, | |
| "mean_token_accuracy": 0.8440033115446568, | |
| "num_tokens": 110414605.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.5524198390776991, | |
| "grad_norm": 1.3714540004730225, | |
| "learning_rate": 6.216432865731463e-06, | |
| "loss": 0.4786, | |
| "mean_token_accuracy": 0.8459533281624317, | |
| "num_tokens": 110736487.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.5540210560025619, | |
| "grad_norm": 1.3252812623977661, | |
| "learning_rate": 6.176352705410822e-06, | |
| "loss": 0.4631, | |
| "mean_token_accuracy": 0.8502146042883396, | |
| "num_tokens": 111057278.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.5556222729274248, | |
| "grad_norm": 1.338040828704834, | |
| "learning_rate": 6.136272545090181e-06, | |
| "loss": 0.4882, | |
| "mean_token_accuracy": 0.8439413107931614, | |
| "num_tokens": 111376894.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.5572234898522878, | |
| "grad_norm": 1.3579652309417725, | |
| "learning_rate": 6.096192384769539e-06, | |
| "loss": 0.4907, | |
| "mean_token_accuracy": 0.8418215617537499, | |
| "num_tokens": 111699740.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.5588247067771507, | |
| "grad_norm": 1.4137458801269531, | |
| "learning_rate": 6.056112224448898e-06, | |
| "loss": 0.4781, | |
| "mean_token_accuracy": 0.8453948952257633, | |
| "num_tokens": 112020552.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.5604259237020135, | |
| "grad_norm": 1.3462501764297485, | |
| "learning_rate": 6.016032064128257e-06, | |
| "loss": 0.4831, | |
| "mean_token_accuracy": 0.8446070902049542, | |
| "num_tokens": 112339630.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5620271406268764, | |
| "grad_norm": 1.3319779634475708, | |
| "learning_rate": 5.975951903807615e-06, | |
| "loss": 0.4684, | |
| "mean_token_accuracy": 0.8486264780163765, | |
| "num_tokens": 112656097.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.5636283575517393, | |
| "grad_norm": 1.4535154104232788, | |
| "learning_rate": 5.9358717434869735e-06, | |
| "loss": 0.4815, | |
| "mean_token_accuracy": 0.8445090480148792, | |
| "num_tokens": 112977129.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.5652295744766023, | |
| "grad_norm": 1.3629623651504517, | |
| "learning_rate": 5.895791583166333e-06, | |
| "loss": 0.4801, | |
| "mean_token_accuracy": 0.8442439757287502, | |
| "num_tokens": 113300171.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.5668307914014651, | |
| "grad_norm": 1.4707759618759155, | |
| "learning_rate": 5.855711422845693e-06, | |
| "loss": 0.4644, | |
| "mean_token_accuracy": 0.8490770682692528, | |
| "num_tokens": 113622572.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.568432008326328, | |
| "grad_norm": 1.3515493869781494, | |
| "learning_rate": 5.815631262525051e-06, | |
| "loss": 0.4685, | |
| "mean_token_accuracy": 0.8480603471398354, | |
| "num_tokens": 113946434.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.5700332252511909, | |
| "grad_norm": 1.386256456375122, | |
| "learning_rate": 5.7755511022044095e-06, | |
| "loss": 0.4704, | |
| "mean_token_accuracy": 0.8484614215791225, | |
| "num_tokens": 114267728.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.5716344421760537, | |
| "grad_norm": 1.2644587755203247, | |
| "learning_rate": 5.735470941883769e-06, | |
| "loss": 0.4659, | |
| "mean_token_accuracy": 0.8490276664495469, | |
| "num_tokens": 114587361.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.5732356591009167, | |
| "grad_norm": 1.459396481513977, | |
| "learning_rate": 5.695390781563127e-06, | |
| "loss": 0.4858, | |
| "mean_token_accuracy": 0.8442618504166604, | |
| "num_tokens": 114904642.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.5748368760257796, | |
| "grad_norm": 1.4098371267318726, | |
| "learning_rate": 5.655310621242485e-06, | |
| "loss": 0.4747, | |
| "mean_token_accuracy": 0.8466537833213806, | |
| "num_tokens": 115225919.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.5764380929506425, | |
| "grad_norm": 1.3138372898101807, | |
| "learning_rate": 5.615230460921845e-06, | |
| "loss": 0.4772, | |
| "mean_token_accuracy": 0.8464322753250599, | |
| "num_tokens": 115545208.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.5780393098755053, | |
| "grad_norm": 1.3510714769363403, | |
| "learning_rate": 5.575150300601203e-06, | |
| "loss": 0.4723, | |
| "mean_token_accuracy": 0.8474077172577381, | |
| "num_tokens": 115867897.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.5796405268003683, | |
| "grad_norm": 1.3798366785049438, | |
| "learning_rate": 5.535070140280562e-06, | |
| "loss": 0.4731, | |
| "mean_token_accuracy": 0.8457662783563137, | |
| "num_tokens": 116193082.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.5812417437252312, | |
| "grad_norm": 1.3346961736679077, | |
| "learning_rate": 5.4949899799599205e-06, | |
| "loss": 0.4863, | |
| "mean_token_accuracy": 0.8438302218914032, | |
| "num_tokens": 116511638.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.5828429606500941, | |
| "grad_norm": 1.423759937286377, | |
| "learning_rate": 5.454909819639279e-06, | |
| "loss": 0.4832, | |
| "mean_token_accuracy": 0.8442726418375969, | |
| "num_tokens": 116833064.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.5844441775749569, | |
| "grad_norm": 1.3669489622116089, | |
| "learning_rate": 5.414829659318638e-06, | |
| "loss": 0.4738, | |
| "mean_token_accuracy": 0.847341725975275, | |
| "num_tokens": 117151502.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.5860453944998198, | |
| "grad_norm": 1.2966816425323486, | |
| "learning_rate": 5.374749498997996e-06, | |
| "loss": 0.4714, | |
| "mean_token_accuracy": 0.8476379804313183, | |
| "num_tokens": 117472793.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.5876466114246828, | |
| "grad_norm": 1.364230990409851, | |
| "learning_rate": 5.334669338677355e-06, | |
| "loss": 0.4694, | |
| "mean_token_accuracy": 0.8477983787655831, | |
| "num_tokens": 117793998.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.5892478283495457, | |
| "grad_norm": 1.3028515577316284, | |
| "learning_rate": 5.294589178356714e-06, | |
| "loss": 0.4653, | |
| "mean_token_accuracy": 0.848087765276432, | |
| "num_tokens": 118117628.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.5908490452744085, | |
| "grad_norm": 1.3520796298980713, | |
| "learning_rate": 5.254509018036072e-06, | |
| "loss": 0.4585, | |
| "mean_token_accuracy": 0.8520961962640285, | |
| "num_tokens": 118438499.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.5924502621992714, | |
| "grad_norm": 1.3508750200271606, | |
| "learning_rate": 5.2144288577154315e-06, | |
| "loss": 0.4795, | |
| "mean_token_accuracy": 0.8454087562859058, | |
| "num_tokens": 118758788.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.5940514791241344, | |
| "grad_norm": 1.3332635164260864, | |
| "learning_rate": 5.17434869739479e-06, | |
| "loss": 0.4872, | |
| "mean_token_accuracy": 0.8422816544771194, | |
| "num_tokens": 119079229.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.5956526960489973, | |
| "grad_norm": 1.315183401107788, | |
| "learning_rate": 5.134268537074148e-06, | |
| "loss": 0.4778, | |
| "mean_token_accuracy": 0.8453255288302899, | |
| "num_tokens": 119397643.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.5972539129738601, | |
| "grad_norm": 1.4416955709457397, | |
| "learning_rate": 5.0941883767535074e-06, | |
| "loss": 0.4764, | |
| "mean_token_accuracy": 0.8459903568029403, | |
| "num_tokens": 119717263.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.598855129898723, | |
| "grad_norm": 1.422741413116455, | |
| "learning_rate": 5.054108216432866e-06, | |
| "loss": 0.4659, | |
| "mean_token_accuracy": 0.848649474978447, | |
| "num_tokens": 120038795.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.600456346823586, | |
| "grad_norm": 1.3014475107192993, | |
| "learning_rate": 5.014028056112224e-06, | |
| "loss": 0.4848, | |
| "mean_token_accuracy": 0.8440289154648781, | |
| "num_tokens": 120361532.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.6020575637484489, | |
| "grad_norm": 1.462580919265747, | |
| "learning_rate": 4.973947895791584e-06, | |
| "loss": 0.4735, | |
| "mean_token_accuracy": 0.846782649308443, | |
| "num_tokens": 120681185.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.6036587806733117, | |
| "grad_norm": 1.3661870956420898, | |
| "learning_rate": 4.9338677354709425e-06, | |
| "loss": 0.4798, | |
| "mean_token_accuracy": 0.8457231089472771, | |
| "num_tokens": 121001657.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.6052599975981746, | |
| "grad_norm": 1.3351037502288818, | |
| "learning_rate": 4.893787575150301e-06, | |
| "loss": 0.4722, | |
| "mean_token_accuracy": 0.8475423932075501, | |
| "num_tokens": 121322036.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.6068612145230375, | |
| "grad_norm": 1.3064804077148438, | |
| "learning_rate": 4.85370741482966e-06, | |
| "loss": 0.4793, | |
| "mean_token_accuracy": 0.8454695589840412, | |
| "num_tokens": 121645118.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.6084624314479004, | |
| "grad_norm": 1.3094234466552734, | |
| "learning_rate": 4.8136272545090185e-06, | |
| "loss": 0.4609, | |
| "mean_token_accuracy": 0.8507507719099522, | |
| "num_tokens": 121964402.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.6100636483727633, | |
| "grad_norm": 1.3205708265304565, | |
| "learning_rate": 4.773547094188377e-06, | |
| "loss": 0.4788, | |
| "mean_token_accuracy": 0.8460944712162017, | |
| "num_tokens": 122286914.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.6116648652976262, | |
| "grad_norm": 1.279442310333252, | |
| "learning_rate": 4.733466933867736e-06, | |
| "loss": 0.4644, | |
| "mean_token_accuracy": 0.8499685250222683, | |
| "num_tokens": 122608238.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.6132660822224891, | |
| "grad_norm": 1.378182291984558, | |
| "learning_rate": 4.693386773547094e-06, | |
| "loss": 0.4738, | |
| "mean_token_accuracy": 0.8474943287670612, | |
| "num_tokens": 122926752.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.6148672991473519, | |
| "grad_norm": 1.3573033809661865, | |
| "learning_rate": 4.6533066132264536e-06, | |
| "loss": 0.4623, | |
| "mean_token_accuracy": 0.8500754803419113, | |
| "num_tokens": 123250988.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.6164685160722149, | |
| "grad_norm": 1.4006189107894897, | |
| "learning_rate": 4.613226452905812e-06, | |
| "loss": 0.4805, | |
| "mean_token_accuracy": 0.8458505406975746, | |
| "num_tokens": 123570092.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.6180697329970778, | |
| "grad_norm": 1.3687673807144165, | |
| "learning_rate": 4.57314629258517e-06, | |
| "loss": 0.4604, | |
| "mean_token_accuracy": 0.850204499810934, | |
| "num_tokens": 123892090.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.6196709499219407, | |
| "grad_norm": 1.3411681652069092, | |
| "learning_rate": 4.5330661322645295e-06, | |
| "loss": 0.4645, | |
| "mean_token_accuracy": 0.8495909467339515, | |
| "num_tokens": 124210494.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.6212721668468035, | |
| "grad_norm": 1.3550022840499878, | |
| "learning_rate": 4.492985971943889e-06, | |
| "loss": 0.48, | |
| "mean_token_accuracy": 0.8453271247446537, | |
| "num_tokens": 124530687.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.6228733837716665, | |
| "grad_norm": 1.2486540079116821, | |
| "learning_rate": 4.452905811623247e-06, | |
| "loss": 0.4725, | |
| "mean_token_accuracy": 0.8472283579409122, | |
| "num_tokens": 124849128.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.6244746006965294, | |
| "grad_norm": 1.3693174123764038, | |
| "learning_rate": 4.412825651302605e-06, | |
| "loss": 0.4714, | |
| "mean_token_accuracy": 0.8471100583672524, | |
| "num_tokens": 125171470.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.6260758176213923, | |
| "grad_norm": 1.258594036102295, | |
| "learning_rate": 4.372745490981965e-06, | |
| "loss": 0.4707, | |
| "mean_token_accuracy": 0.847071935236454, | |
| "num_tokens": 125491940.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.6276770345462551, | |
| "grad_norm": 1.3418142795562744, | |
| "learning_rate": 4.332665330661323e-06, | |
| "loss": 0.4665, | |
| "mean_token_accuracy": 0.8485884606838227, | |
| "num_tokens": 125813213.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.629278251471118, | |
| "grad_norm": 1.3918439149856567, | |
| "learning_rate": 4.292585170340682e-06, | |
| "loss": 0.4892, | |
| "mean_token_accuracy": 0.8419119253754616, | |
| "num_tokens": 126134673.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.630879468395981, | |
| "grad_norm": 1.367687463760376, | |
| "learning_rate": 4.2525050100200405e-06, | |
| "loss": 0.4764, | |
| "mean_token_accuracy": 0.845719288289547, | |
| "num_tokens": 126456701.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.6324806853208439, | |
| "grad_norm": 1.3708374500274658, | |
| "learning_rate": 4.212424849699399e-06, | |
| "loss": 0.4749, | |
| "mean_token_accuracy": 0.8470516465604305, | |
| "num_tokens": 126775227.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.6340819022457067, | |
| "grad_norm": 1.2801411151885986, | |
| "learning_rate": 4.172344689378758e-06, | |
| "loss": 0.4687, | |
| "mean_token_accuracy": 0.8478149607777595, | |
| "num_tokens": 127095332.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.6356831191705696, | |
| "grad_norm": 1.346408724784851, | |
| "learning_rate": 4.132264529058116e-06, | |
| "loss": 0.4834, | |
| "mean_token_accuracy": 0.8447770491242409, | |
| "num_tokens": 127417747.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.6372843360954326, | |
| "grad_norm": 1.3216502666473389, | |
| "learning_rate": 4.092184368737475e-06, | |
| "loss": 0.4828, | |
| "mean_token_accuracy": 0.8442099191248417, | |
| "num_tokens": 127737975.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.6388855530202954, | |
| "grad_norm": 1.3793998956680298, | |
| "learning_rate": 4.052104208416834e-06, | |
| "loss": 0.4659, | |
| "mean_token_accuracy": 0.8481622524559498, | |
| "num_tokens": 128061762.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.6404867699451583, | |
| "grad_norm": 1.3442909717559814, | |
| "learning_rate": 4.012024048096192e-06, | |
| "loss": 0.4707, | |
| "mean_token_accuracy": 0.8477355271577836, | |
| "num_tokens": 128379163.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6420879868700212, | |
| "grad_norm": 1.3677475452423096, | |
| "learning_rate": 3.9719438877755515e-06, | |
| "loss": 0.4714, | |
| "mean_token_accuracy": 0.8472521089017391, | |
| "num_tokens": 128704307.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.6436892037948841, | |
| "grad_norm": 1.426178216934204, | |
| "learning_rate": 3.931863727454911e-06, | |
| "loss": 0.4757, | |
| "mean_token_accuracy": 0.8470147632062435, | |
| "num_tokens": 129026076.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.645290420719747, | |
| "grad_norm": 1.298842430114746, | |
| "learning_rate": 3.891783567134269e-06, | |
| "loss": 0.4832, | |
| "mean_token_accuracy": 0.843799925595522, | |
| "num_tokens": 129346856.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.6468916376446099, | |
| "grad_norm": 1.304426670074463, | |
| "learning_rate": 3.8517034068136274e-06, | |
| "loss": 0.4675, | |
| "mean_token_accuracy": 0.8482393056154252, | |
| "num_tokens": 129666854.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.6484928545694728, | |
| "grad_norm": 1.5680227279663086, | |
| "learning_rate": 3.8116232464929866e-06, | |
| "loss": 0.4746, | |
| "mean_token_accuracy": 0.8471040144562721, | |
| "num_tokens": 129986712.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.6500940714943357, | |
| "grad_norm": 1.3348615169525146, | |
| "learning_rate": 3.771543086172345e-06, | |
| "loss": 0.4864, | |
| "mean_token_accuracy": 0.8429410822689534, | |
| "num_tokens": 130308424.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.6516952884191985, | |
| "grad_norm": 1.2914706468582153, | |
| "learning_rate": 3.7314629258517038e-06, | |
| "loss": 0.4677, | |
| "mean_token_accuracy": 0.8490622960031032, | |
| "num_tokens": 130631694.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.6532965053440615, | |
| "grad_norm": 1.3071000576019287, | |
| "learning_rate": 3.6913827655310625e-06, | |
| "loss": 0.4636, | |
| "mean_token_accuracy": 0.8498057357966899, | |
| "num_tokens": 130953507.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.6548977222689244, | |
| "grad_norm": 1.2781637907028198, | |
| "learning_rate": 3.6513026052104213e-06, | |
| "loss": 0.4714, | |
| "mean_token_accuracy": 0.8469345659017563, | |
| "num_tokens": 131275817.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.6564989391937873, | |
| "grad_norm": 1.2780126333236694, | |
| "learning_rate": 3.6112224448897797e-06, | |
| "loss": 0.4622, | |
| "mean_token_accuracy": 0.8498845219612121, | |
| "num_tokens": 131598709.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.6581001561186501, | |
| "grad_norm": 1.354146122932434, | |
| "learning_rate": 3.5711422845691384e-06, | |
| "loss": 0.4602, | |
| "mean_token_accuracy": 0.8503227770328522, | |
| "num_tokens": 131921819.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.6597013730435131, | |
| "grad_norm": 1.3504717350006104, | |
| "learning_rate": 3.5310621242484972e-06, | |
| "loss": 0.4734, | |
| "mean_token_accuracy": 0.8470642603933811, | |
| "num_tokens": 132238582.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.661302589968376, | |
| "grad_norm": 1.3072102069854736, | |
| "learning_rate": 3.490981963927856e-06, | |
| "loss": 0.4656, | |
| "mean_token_accuracy": 0.8497247867286205, | |
| "num_tokens": 132562287.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.6629038068932389, | |
| "grad_norm": 1.347090244293213, | |
| "learning_rate": 3.4509018036072144e-06, | |
| "loss": 0.4704, | |
| "mean_token_accuracy": 0.848843315243721, | |
| "num_tokens": 132884120.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.6645050238181017, | |
| "grad_norm": 1.2977161407470703, | |
| "learning_rate": 3.410821643286573e-06, | |
| "loss": 0.4494, | |
| "mean_token_accuracy": 0.8530115015804768, | |
| "num_tokens": 133208577.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.6661062407429647, | |
| "grad_norm": 1.3324532508850098, | |
| "learning_rate": 3.3707414829659323e-06, | |
| "loss": 0.4821, | |
| "mean_token_accuracy": 0.8452388256788254, | |
| "num_tokens": 133526335.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.6677074576678276, | |
| "grad_norm": 1.3802504539489746, | |
| "learning_rate": 3.330661322645291e-06, | |
| "loss": 0.4607, | |
| "mean_token_accuracy": 0.8510694406926632, | |
| "num_tokens": 133849885.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.6693086745926905, | |
| "grad_norm": 1.2233039140701294, | |
| "learning_rate": 3.29058116232465e-06, | |
| "loss": 0.4737, | |
| "mean_token_accuracy": 0.8467421375215054, | |
| "num_tokens": 134172893.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.6709098915175533, | |
| "grad_norm": 1.2868399620056152, | |
| "learning_rate": 3.2505010020040082e-06, | |
| "loss": 0.4653, | |
| "mean_token_accuracy": 0.8496215872466564, | |
| "num_tokens": 134489308.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.6725111084424162, | |
| "grad_norm": 1.3277688026428223, | |
| "learning_rate": 3.210420841683367e-06, | |
| "loss": 0.4745, | |
| "mean_token_accuracy": 0.846123356372118, | |
| "num_tokens": 134811879.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.6741123253672792, | |
| "grad_norm": 1.3243411779403687, | |
| "learning_rate": 3.170340681362726e-06, | |
| "loss": 0.4613, | |
| "mean_token_accuracy": 0.8502600885927677, | |
| "num_tokens": 135131420.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.675713542292142, | |
| "grad_norm": 1.3020135164260864, | |
| "learning_rate": 3.1302605210420846e-06, | |
| "loss": 0.4763, | |
| "mean_token_accuracy": 0.8461723700165749, | |
| "num_tokens": 135447352.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.6773147592170049, | |
| "grad_norm": 1.3202717304229736, | |
| "learning_rate": 3.090180360721443e-06, | |
| "loss": 0.471, | |
| "mean_token_accuracy": 0.8477423399686813, | |
| "num_tokens": 135770840.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.6789159761418678, | |
| "grad_norm": 1.2884771823883057, | |
| "learning_rate": 3.0501002004008017e-06, | |
| "loss": 0.4715, | |
| "mean_token_accuracy": 0.8479295544326305, | |
| "num_tokens": 136091571.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.6805171930667308, | |
| "grad_norm": 1.3056050539016724, | |
| "learning_rate": 3.0100200400801605e-06, | |
| "loss": 0.4596, | |
| "mean_token_accuracy": 0.8502701714634895, | |
| "num_tokens": 136412386.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.6821184099915936, | |
| "grad_norm": 1.4288556575775146, | |
| "learning_rate": 2.9699398797595193e-06, | |
| "loss": 0.4541, | |
| "mean_token_accuracy": 0.8518479906022549, | |
| "num_tokens": 136735397.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.6837196269164565, | |
| "grad_norm": 1.296898603439331, | |
| "learning_rate": 2.9298597194388776e-06, | |
| "loss": 0.4684, | |
| "mean_token_accuracy": 0.8481690533459186, | |
| "num_tokens": 137059014.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.6853208438413194, | |
| "grad_norm": 1.3223106861114502, | |
| "learning_rate": 2.8897795591182364e-06, | |
| "loss": 0.4564, | |
| "mean_token_accuracy": 0.851376112550497, | |
| "num_tokens": 137381739.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.6869220607661823, | |
| "grad_norm": 1.3440994024276733, | |
| "learning_rate": 2.849699398797595e-06, | |
| "loss": 0.4622, | |
| "mean_token_accuracy": 0.8493120886385441, | |
| "num_tokens": 137705581.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.6885232776910452, | |
| "grad_norm": 1.3851518630981445, | |
| "learning_rate": 2.8096192384769544e-06, | |
| "loss": 0.4726, | |
| "mean_token_accuracy": 0.8463930167257786, | |
| "num_tokens": 138027757.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.6901244946159081, | |
| "grad_norm": 1.3805090188980103, | |
| "learning_rate": 2.769539078156313e-06, | |
| "loss": 0.4703, | |
| "mean_token_accuracy": 0.847575119137764, | |
| "num_tokens": 138350808.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.691725711540771, | |
| "grad_norm": 1.27788507938385, | |
| "learning_rate": 2.729458917835672e-06, | |
| "loss": 0.4685, | |
| "mean_token_accuracy": 0.8492364950478077, | |
| "num_tokens": 138671764.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.6933269284656339, | |
| "grad_norm": 1.2978250980377197, | |
| "learning_rate": 2.6893787575150303e-06, | |
| "loss": 0.4685, | |
| "mean_token_accuracy": 0.8482681311666965, | |
| "num_tokens": 138989687.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.6949281453904967, | |
| "grad_norm": 1.3477455377578735, | |
| "learning_rate": 2.649298597194389e-06, | |
| "loss": 0.4819, | |
| "mean_token_accuracy": 0.8435915961861611, | |
| "num_tokens": 139308686.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.6965293623153597, | |
| "grad_norm": 1.3582419157028198, | |
| "learning_rate": 2.609218436873748e-06, | |
| "loss": 0.4759, | |
| "mean_token_accuracy": 0.8469212375581264, | |
| "num_tokens": 139629510.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.6981305792402226, | |
| "grad_norm": 1.2451307773590088, | |
| "learning_rate": 2.5691382765531066e-06, | |
| "loss": 0.4558, | |
| "mean_token_accuracy": 0.8517604671418667, | |
| "num_tokens": 139950548.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.6997317961650855, | |
| "grad_norm": 1.3171881437301636, | |
| "learning_rate": 2.529058116232465e-06, | |
| "loss": 0.4741, | |
| "mean_token_accuracy": 0.8464261189103126, | |
| "num_tokens": 140271293.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.7013330130899483, | |
| "grad_norm": 1.3458523750305176, | |
| "learning_rate": 2.4889779559118238e-06, | |
| "loss": 0.4674, | |
| "mean_token_accuracy": 0.84786431863904, | |
| "num_tokens": 140593505.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.7029342300148113, | |
| "grad_norm": 1.3209023475646973, | |
| "learning_rate": 2.4488977955911825e-06, | |
| "loss": 0.4691, | |
| "mean_token_accuracy": 0.8488998875021935, | |
| "num_tokens": 140914094.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.7045354469396742, | |
| "grad_norm": 1.3460259437561035, | |
| "learning_rate": 2.4088176352705413e-06, | |
| "loss": 0.4521, | |
| "mean_token_accuracy": 0.8530812054872513, | |
| "num_tokens": 141235256.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.706136663864537, | |
| "grad_norm": 1.2102141380310059, | |
| "learning_rate": 2.3687374749499e-06, | |
| "loss": 0.4547, | |
| "mean_token_accuracy": 0.8519871957600117, | |
| "num_tokens": 141557578.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.7077378807893999, | |
| "grad_norm": 1.3173609972000122, | |
| "learning_rate": 2.328657314629259e-06, | |
| "loss": 0.4667, | |
| "mean_token_accuracy": 0.8493893645703793, | |
| "num_tokens": 141874382.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.7093390977142628, | |
| "grad_norm": 1.3135616779327393, | |
| "learning_rate": 2.2885771543086176e-06, | |
| "loss": 0.4626, | |
| "mean_token_accuracy": 0.8501255489885807, | |
| "num_tokens": 142196961.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.7109403146391258, | |
| "grad_norm": 1.2953917980194092, | |
| "learning_rate": 2.248496993987976e-06, | |
| "loss": 0.4565, | |
| "mean_token_accuracy": 0.8517375260591507, | |
| "num_tokens": 142519735.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.7125415315639886, | |
| "grad_norm": 1.3296713829040527, | |
| "learning_rate": 2.2084168336673348e-06, | |
| "loss": 0.4761, | |
| "mean_token_accuracy": 0.8469030007719993, | |
| "num_tokens": 142839542.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.7141427484888515, | |
| "grad_norm": 1.39572274684906, | |
| "learning_rate": 2.1683366733466936e-06, | |
| "loss": 0.4639, | |
| "mean_token_accuracy": 0.8488202162086964, | |
| "num_tokens": 143164098.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.7157439654137144, | |
| "grad_norm": 1.2757151126861572, | |
| "learning_rate": 2.1282565130260523e-06, | |
| "loss": 0.462, | |
| "mean_token_accuracy": 0.8499647453427315, | |
| "num_tokens": 143483543.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.7173451823385774, | |
| "grad_norm": 1.3006616830825806, | |
| "learning_rate": 2.088176352705411e-06, | |
| "loss": 0.4545, | |
| "mean_token_accuracy": 0.8515301026403904, | |
| "num_tokens": 143804301.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.7189463992634402, | |
| "grad_norm": 1.292068600654602, | |
| "learning_rate": 2.04809619238477e-06, | |
| "loss": 0.4789, | |
| "mean_token_accuracy": 0.8460986621677875, | |
| "num_tokens": 144122446.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.7205476161883031, | |
| "grad_norm": 1.253038763999939, | |
| "learning_rate": 2.0080160320641282e-06, | |
| "loss": 0.4605, | |
| "mean_token_accuracy": 0.8504187315702438, | |
| "num_tokens": 144442269.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.722148833113166, | |
| "grad_norm": 1.2668460607528687, | |
| "learning_rate": 1.967935871743487e-06, | |
| "loss": 0.4658, | |
| "mean_token_accuracy": 0.8492423847317696, | |
| "num_tokens": 144762045.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.723750050038029, | |
| "grad_norm": 1.2495707273483276, | |
| "learning_rate": 1.927855711422846e-06, | |
| "loss": 0.4678, | |
| "mean_token_accuracy": 0.8484641022980213, | |
| "num_tokens": 145081166.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.7253512669628918, | |
| "grad_norm": 1.253281593322754, | |
| "learning_rate": 1.8877755511022044e-06, | |
| "loss": 0.4529, | |
| "mean_token_accuracy": 0.8517255567014217, | |
| "num_tokens": 145405369.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.7269524838877547, | |
| "grad_norm": 1.318231463432312, | |
| "learning_rate": 1.8476953907815633e-06, | |
| "loss": 0.4519, | |
| "mean_token_accuracy": 0.8528639182448388, | |
| "num_tokens": 145730104.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.7285537008126176, | |
| "grad_norm": 1.3955156803131104, | |
| "learning_rate": 1.8076152304609221e-06, | |
| "loss": 0.4665, | |
| "mean_token_accuracy": 0.8482579290866852, | |
| "num_tokens": 146049418.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.7301549177374805, | |
| "grad_norm": 1.318996787071228, | |
| "learning_rate": 1.7675350701402807e-06, | |
| "loss": 0.4636, | |
| "mean_token_accuracy": 0.849512092024088, | |
| "num_tokens": 146368693.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.7317561346623433, | |
| "grad_norm": 1.3554900884628296, | |
| "learning_rate": 1.7274549098196395e-06, | |
| "loss": 0.4625, | |
| "mean_token_accuracy": 0.849649652838707, | |
| "num_tokens": 146691164.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.7333573515872063, | |
| "grad_norm": 1.261730670928955, | |
| "learning_rate": 1.687374749498998e-06, | |
| "loss": 0.4675, | |
| "mean_token_accuracy": 0.8492170706391334, | |
| "num_tokens": 147011805.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.7349585685120692, | |
| "grad_norm": 1.3335061073303223, | |
| "learning_rate": 1.6472945891783568e-06, | |
| "loss": 0.4612, | |
| "mean_token_accuracy": 0.8497836641967297, | |
| "num_tokens": 147333247.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.736559785436932, | |
| "grad_norm": 1.3388036489486694, | |
| "learning_rate": 1.6072144288577154e-06, | |
| "loss": 0.4689, | |
| "mean_token_accuracy": 0.8484977997839451, | |
| "num_tokens": 147653432.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.7381610023617949, | |
| "grad_norm": 1.2885268926620483, | |
| "learning_rate": 1.5671342685370744e-06, | |
| "loss": 0.4646, | |
| "mean_token_accuracy": 0.8488264963030815, | |
| "num_tokens": 147975821.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.7397622192866579, | |
| "grad_norm": 1.4199271202087402, | |
| "learning_rate": 1.5270541082164331e-06, | |
| "loss": 0.4704, | |
| "mean_token_accuracy": 0.8481570340692997, | |
| "num_tokens": 148299345.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.7413634362115208, | |
| "grad_norm": 1.3368284702301025, | |
| "learning_rate": 1.4869739478957917e-06, | |
| "loss": 0.4735, | |
| "mean_token_accuracy": 0.8470888569951057, | |
| "num_tokens": 148620981.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.7429646531363836, | |
| "grad_norm": 1.2820199728012085, | |
| "learning_rate": 1.4468937875751505e-06, | |
| "loss": 0.4642, | |
| "mean_token_accuracy": 0.8500242799520492, | |
| "num_tokens": 148944464.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.7445658700612465, | |
| "grad_norm": 1.2548437118530273, | |
| "learning_rate": 1.406813627254509e-06, | |
| "loss": 0.4657, | |
| "mean_token_accuracy": 0.8486466005444526, | |
| "num_tokens": 149268167.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.7461670869861095, | |
| "grad_norm": 1.452405571937561, | |
| "learning_rate": 1.3667334669338678e-06, | |
| "loss": 0.4425, | |
| "mean_token_accuracy": 0.8560060441493988, | |
| "num_tokens": 149589241.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.7477683039109724, | |
| "grad_norm": 1.3026350736618042, | |
| "learning_rate": 1.3266533066132264e-06, | |
| "loss": 0.4646, | |
| "mean_token_accuracy": 0.8496662922203541, | |
| "num_tokens": 149909375.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.7493695208358352, | |
| "grad_norm": 1.2289704084396362, | |
| "learning_rate": 1.2865731462925854e-06, | |
| "loss": 0.4666, | |
| "mean_token_accuracy": 0.8491358175873757, | |
| "num_tokens": 150231321.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.7509707377606981, | |
| "grad_norm": 1.343451976776123, | |
| "learning_rate": 1.246492985971944e-06, | |
| "loss": 0.4571, | |
| "mean_token_accuracy": 0.8527932472527027, | |
| "num_tokens": 150549837.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.752571954685561, | |
| "grad_norm": 1.2429101467132568, | |
| "learning_rate": 1.2064128256513027e-06, | |
| "loss": 0.4546, | |
| "mean_token_accuracy": 0.852372557669878, | |
| "num_tokens": 150871442.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.754173171610424, | |
| "grad_norm": 1.2928380966186523, | |
| "learning_rate": 1.1663326653306615e-06, | |
| "loss": 0.459, | |
| "mean_token_accuracy": 0.8513782821595669, | |
| "num_tokens": 151191666.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.7557743885352868, | |
| "grad_norm": 1.3096081018447876, | |
| "learning_rate": 1.12625250501002e-06, | |
| "loss": 0.4496, | |
| "mean_token_accuracy": 0.8535910114645958, | |
| "num_tokens": 151513743.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.7573756054601497, | |
| "grad_norm": 1.4618325233459473, | |
| "learning_rate": 1.0861723446893789e-06, | |
| "loss": 0.4577, | |
| "mean_token_accuracy": 0.8506775170564651, | |
| "num_tokens": 151834026.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.7589768223850126, | |
| "grad_norm": 1.243790864944458, | |
| "learning_rate": 1.0460921843687376e-06, | |
| "loss": 0.4643, | |
| "mean_token_accuracy": 0.8498982474207878, | |
| "num_tokens": 152153646.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.7605780393098756, | |
| "grad_norm": 1.281795620918274, | |
| "learning_rate": 1.0060120240480962e-06, | |
| "loss": 0.4657, | |
| "mean_token_accuracy": 0.8492770433425904, | |
| "num_tokens": 152472580.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.7621792562347384, | |
| "grad_norm": 1.250246286392212, | |
| "learning_rate": 9.65931863727455e-07, | |
| "loss": 0.4556, | |
| "mean_token_accuracy": 0.8518551647663116, | |
| "num_tokens": 152792991.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.7637804731596013, | |
| "grad_norm": 1.3070975542068481, | |
| "learning_rate": 9.258517034068138e-07, | |
| "loss": 0.4657, | |
| "mean_token_accuracy": 0.8486363351345062, | |
| "num_tokens": 153113976.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.7653816900844642, | |
| "grad_norm": 1.2252980470657349, | |
| "learning_rate": 8.857715430861724e-07, | |
| "loss": 0.4578, | |
| "mean_token_accuracy": 0.8504931330680847, | |
| "num_tokens": 153434405.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.7669829070093271, | |
| "grad_norm": 1.2938100099563599, | |
| "learning_rate": 8.456913827655311e-07, | |
| "loss": 0.4626, | |
| "mean_token_accuracy": 0.8495556160807609, | |
| "num_tokens": 153754192.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.76858412393419, | |
| "grad_norm": 1.2442668676376343, | |
| "learning_rate": 8.056112224448899e-07, | |
| "loss": 0.4602, | |
| "mean_token_accuracy": 0.8500640645623208, | |
| "num_tokens": 154073655.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.7701853408590529, | |
| "grad_norm": 1.2852963209152222, | |
| "learning_rate": 7.655310621242486e-07, | |
| "loss": 0.455, | |
| "mean_token_accuracy": 0.8520859353244304, | |
| "num_tokens": 154393306.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.7717865577839158, | |
| "grad_norm": 1.2161128520965576, | |
| "learning_rate": 7.254509018036072e-07, | |
| "loss": 0.4653, | |
| "mean_token_accuracy": 0.8493512712419033, | |
| "num_tokens": 154713527.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.7733877747087786, | |
| "grad_norm": 1.3726937770843506, | |
| "learning_rate": 6.853707414829659e-07, | |
| "loss": 0.4731, | |
| "mean_token_accuracy": 0.8470825746655464, | |
| "num_tokens": 155031568.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.7749889916336415, | |
| "grad_norm": 1.2655962705612183, | |
| "learning_rate": 6.452905811623248e-07, | |
| "loss": 0.4589, | |
| "mean_token_accuracy": 0.8517993964254856, | |
| "num_tokens": 155348756.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.7765902085585045, | |
| "grad_norm": 1.3064429759979248, | |
| "learning_rate": 6.052104208416835e-07, | |
| "loss": 0.4634, | |
| "mean_token_accuracy": 0.8494015254080296, | |
| "num_tokens": 155670420.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.7781914254833674, | |
| "grad_norm": 1.2700303792953491, | |
| "learning_rate": 5.651302605210421e-07, | |
| "loss": 0.4592, | |
| "mean_token_accuracy": 0.8505661249160766, | |
| "num_tokens": 155989794.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.7797926424082302, | |
| "grad_norm": 1.322021722793579, | |
| "learning_rate": 5.250501002004008e-07, | |
| "loss": 0.4709, | |
| "mean_token_accuracy": 0.8476430311799049, | |
| "num_tokens": 156308053.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.7813938593330931, | |
| "grad_norm": 1.2155754566192627, | |
| "learning_rate": 4.849699398797596e-07, | |
| "loss": 0.4445, | |
| "mean_token_accuracy": 0.8554054662585259, | |
| "num_tokens": 156632693.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.7829950762579561, | |
| "grad_norm": 1.22914457321167, | |
| "learning_rate": 4.4488977955911824e-07, | |
| "loss": 0.4622, | |
| "mean_token_accuracy": 0.8507880836725235, | |
| "num_tokens": 156950553.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.784596293182819, | |
| "grad_norm": 1.2851382493972778, | |
| "learning_rate": 4.0480961923847697e-07, | |
| "loss": 0.4569, | |
| "mean_token_accuracy": 0.8511246681213379, | |
| "num_tokens": 157271840.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.7861975101076818, | |
| "grad_norm": 1.255151629447937, | |
| "learning_rate": 3.6472945891783575e-07, | |
| "loss": 0.4508, | |
| "mean_token_accuracy": 0.8532536290585995, | |
| "num_tokens": 157590910.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.7877987270325447, | |
| "grad_norm": 1.3058985471725464, | |
| "learning_rate": 3.246492985971944e-07, | |
| "loss": 0.4809, | |
| "mean_token_accuracy": 0.8454479977488518, | |
| "num_tokens": 157910380.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.7893999439574076, | |
| "grad_norm": 1.2654027938842773, | |
| "learning_rate": 2.845691382765531e-07, | |
| "loss": 0.4676, | |
| "mean_token_accuracy": 0.8477957636117935, | |
| "num_tokens": 158231334.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.7910011608822706, | |
| "grad_norm": 1.2774121761322021, | |
| "learning_rate": 2.4448897795591187e-07, | |
| "loss": 0.4558, | |
| "mean_token_accuracy": 0.8519059099256993, | |
| "num_tokens": 158552075.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.7926023778071334, | |
| "grad_norm": 1.2725189924240112, | |
| "learning_rate": 2.0440881763527057e-07, | |
| "loss": 0.4469, | |
| "mean_token_accuracy": 0.8547384902834892, | |
| "num_tokens": 158874136.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.7942035947319963, | |
| "grad_norm": 1.3281700611114502, | |
| "learning_rate": 1.6432865731462927e-07, | |
| "loss": 0.4578, | |
| "mean_token_accuracy": 0.851103599369526, | |
| "num_tokens": 159193599.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.7958048116568592, | |
| "grad_norm": 1.279972791671753, | |
| "learning_rate": 1.24248496993988e-07, | |
| "loss": 0.4457, | |
| "mean_token_accuracy": 0.8538298286497593, | |
| "num_tokens": 159517847.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.7974060285817222, | |
| "grad_norm": 1.2874363660812378, | |
| "learning_rate": 8.416833667334669e-08, | |
| "loss": 0.4557, | |
| "mean_token_accuracy": 0.8525973983108998, | |
| "num_tokens": 159838652.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.799007245506585, | |
| "grad_norm": 1.343727469444275, | |
| "learning_rate": 4.408817635270541e-08, | |
| "loss": 0.4567, | |
| "mean_token_accuracy": 0.8513975150883197, | |
| "num_tokens": 160160586.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.8006084624314479, | |
| "grad_norm": 1.2488781213760376, | |
| "learning_rate": 4.008016032064128e-09, | |
| "loss": 0.4672, | |
| "mean_token_accuracy": 0.8483051300048828, | |
| "num_tokens": 160483113.0, | |
| "step": 5000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.3299712794624e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |