{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.0,
  "eval_steps": 500,
  "global_step": 1413,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 2.05683453977108,
      "epoch": 0.064,
      "grad_norm": 1.5962693691253662,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 2.056,
      "mean_token_accuracy": 0.527055786550045,
      "num_tokens": 78959.0,
      "step": 10
    },
    {
      "entropy": 2.2151891469955443,
      "epoch": 0.128,
      "grad_norm": 1.022516131401062,
      "learning_rate": 7.916666666666667e-06,
      "loss": 2.0766,
      "mean_token_accuracy": 0.5204883277416229,
      "num_tokens": 158094.0,
      "step": 20
    },
    {
      "entropy": 2.376429131627083,
      "epoch": 0.192,
      "grad_norm": 0.8220515251159668,
      "learning_rate": 1.2083333333333333e-05,
      "loss": 1.8902,
      "mean_token_accuracy": 0.5391427092254162,
      "num_tokens": 239163.0,
      "step": 30
    },
    {
      "entropy": 2.285146689414978,
      "epoch": 0.256,
      "grad_norm": 0.7022648453712463,
      "learning_rate": 1.6250000000000002e-05,
      "loss": 1.7478,
      "mean_token_accuracy": 0.56041978597641,
      "num_tokens": 317818.0,
      "step": 40
    },
    {
      "entropy": 2.3282038152217863,
      "epoch": 0.32,
      "grad_norm": 0.5584391951560974,
      "learning_rate": 1.9999978697023387e-05,
      "loss": 1.7687,
      "mean_token_accuracy": 0.5601607479155064,
      "num_tokens": 396146.0,
      "step": 50
    },
    {
      "entropy": 2.2709642231464384,
      "epoch": 0.384,
      "grad_norm": 0.5373395085334778,
      "learning_rate": 1.999742244965125e-05,
      "loss": 1.6913,
      "mean_token_accuracy": 0.5693033933639526,
      "num_tokens": 474291.0,
      "step": 60
    },
    {
      "entropy": 2.2445768117904663,
      "epoch": 0.448,
      "grad_norm": 0.4558122754096985,
      "learning_rate": 1.9990606854864625e-05,
      "loss": 1.679,
      "mean_token_accuracy": 0.5720810443162918,
      "num_tokens": 554739.0,
      "step": 70
    },
    {
      "entropy": 2.2270330280065536,
      "epoch": 0.512,
      "grad_norm": 0.5535369515419006,
      "learning_rate": 1.997953481641056e-05,
      "loss": 1.6522,
      "mean_token_accuracy": 0.574026207625866,
      "num_tokens": 633658.0,
      "step": 80
    },
    {
      "entropy": 2.2367560386657717,
      "epoch": 0.576,
      "grad_norm": 0.5366299152374268,
      "learning_rate": 1.9964211051470778e-05,
      "loss": 1.6955,
      "mean_token_accuracy": 0.5699351653456688,
      "num_tokens": 712400.0,
      "step": 90
    },
    {
      "entropy": 2.21894571185112,
      "epoch": 0.64,
      "grad_norm": 0.4690150022506714,
      "learning_rate": 1.994464208865191e-05,
      "loss": 1.7048,
      "mean_token_accuracy": 0.5701304003596306,
      "num_tokens": 792630.0,
      "step": 100
    },
    {
      "entropy": 2.235249537229538,
      "epoch": 0.704,
      "grad_norm": 0.5834165811538696,
      "learning_rate": 1.9920836265204047e-05,
      "loss": 1.7032,
      "mean_token_accuracy": 0.5705543920397759,
      "num_tokens": 872045.0,
      "step": 110
    },
    {
      "entropy": 2.2257163137197495,
      "epoch": 0.768,
      "grad_norm": 0.5584805011749268,
      "learning_rate": 1.989280372346868e-05,
      "loss": 1.666,
      "mean_token_accuracy": 0.5684764981269836,
      "num_tokens": 952057.0,
      "step": 120
    },
    {
      "entropy": 2.2563431203365325,
      "epoch": 0.832,
      "grad_norm": 0.5170231461524963,
      "learning_rate": 1.986055640655763e-05,
      "loss": 1.7134,
      "mean_token_accuracy": 0.570289532840252,
      "num_tokens": 1029200.0,
      "step": 130
    },
    {
      "entropy": 2.2378907680511473,
      "epoch": 0.896,
      "grad_norm": 0.5027748942375183,
      "learning_rate": 1.9824108053264726e-05,
      "loss": 1.6719,
      "mean_token_accuracy": 0.5730531394481659,
      "num_tokens": 1105844.0,
      "step": 140
    },
    {
      "entropy": 2.1966699600219726,
      "epoch": 0.96,
      "grad_norm": 0.5884814262390137,
      "learning_rate": 1.9783474192212484e-05,
      "loss": 1.6327,
      "mean_token_accuracy": 0.5813805550336838,
      "num_tokens": 1182935.0,
      "step": 150
    },
    {
      "entropy": 2.20564815804765,
      "epoch": 1.0192,
      "grad_norm": 0.570175290107727,
      "learning_rate": 1.9738672135236218e-05,
      "loss": 1.6118,
      "mean_token_accuracy": 0.582583570802534,
      "num_tokens": 1254363.0,
      "step": 160
    },
    {
      "entropy": 2.1847074955701826,
      "epoch": 1.0832,
      "grad_norm": 0.5836730003356934,
      "learning_rate": 1.968972097000843e-05,
      "loss": 1.6172,
      "mean_token_accuracy": 0.5812226444482803,
      "num_tokens": 1330281.0,
      "step": 170
    },
    {
      "entropy": 2.1814055383205413,
      "epoch": 1.1472,
      "grad_norm": 0.5746439695358276,
      "learning_rate": 1.96366415519066e-05,
      "loss": 1.6192,
      "mean_token_accuracy": 0.5789176046848297,
      "num_tokens": 1409407.0,
      "step": 180
    },
    {
      "entropy": 2.2038993716239927,
      "epoch": 1.2112,
      "grad_norm": 0.5652104616165161,
      "learning_rate": 1.957945649512788e-05,
      "loss": 1.6166,
      "mean_token_accuracy": 0.5809548154473305,
      "num_tokens": 1489034.0,
      "step": 190
    },
    {
      "entropy": 2.173789343237877,
      "epoch": 1.2752,
      "grad_norm": 0.6653291583061218,
      "learning_rate": 1.951819016305442e-05,
      "loss": 1.62,
      "mean_token_accuracy": 0.5827470317482948,
      "num_tokens": 1568549.0,
      "step": 200
    },
    {
      "entropy": 2.1907752990722655,
      "epoch": 1.3392,
      "grad_norm": 0.7024573087692261,
      "learning_rate": 1.9452868657873513e-05,
      "loss": 1.6397,
      "mean_token_accuracy": 0.5796025812625885,
      "num_tokens": 1647404.0,
      "step": 210
    },
    {
      "entropy": 2.189376249909401,
      "epoch": 1.4032,
      "grad_norm": 0.5727422833442688,
      "learning_rate": 1.9383519809456862e-05,
      "loss": 1.6349,
      "mean_token_accuracy": 0.5815459445118905,
      "num_tokens": 1728421.0,
      "step": 220
    },
    {
      "entropy": 2.209022229909897,
      "epoch": 1.4672,
      "grad_norm": 0.6421232223510742,
      "learning_rate": 1.931017316350384e-05,
      "loss": 1.6425,
      "mean_token_accuracy": 0.5790404245257378,
      "num_tokens": 1806891.0,
      "step": 230
    },
    {
      "entropy": 2.2337595343589784,
      "epoch": 1.5312000000000001,
      "grad_norm": 0.6296209692955017,
      "learning_rate": 1.9232859968953702e-05,
      "loss": 1.624,
      "mean_token_accuracy": 0.5814317353069782,
      "num_tokens": 1883100.0,
      "step": 240
    },
    {
      "entropy": 2.205833575129509,
      "epoch": 1.5952,
      "grad_norm": 0.6371021866798401,
      "learning_rate": 1.9151613164672136e-05,
      "loss": 1.6284,
      "mean_token_accuracy": 0.5819905593991279,
      "num_tokens": 1961317.0,
      "step": 250
    },
    {
      "entropy": 2.205822005867958,
      "epoch": 1.6592,
      "grad_norm": 0.6950616836547852,
      "learning_rate": 1.9066467365417844e-05,
      "loss": 1.6374,
      "mean_token_accuracy": 0.5760326236486435,
      "num_tokens": 2042881.0,
      "step": 260
    },
    {
      "entropy": 2.2163637399673464,
      "epoch": 1.7231999999999998,
      "grad_norm": 0.7801616191864014,
      "learning_rate": 1.8977458847095117e-05,
      "loss": 1.663,
      "mean_token_accuracy": 0.5744953289628029,
      "num_tokens": 2121403.0,
      "step": 270
    },
    {
      "entropy": 2.199243775010109,
      "epoch": 1.7872,
      "grad_norm": 0.6671239733695984,
      "learning_rate": 1.888462553129867e-05,
      "loss": 1.6456,
      "mean_token_accuracy": 0.579181258380413,
      "num_tokens": 2200908.0,
      "step": 280
    },
    {
      "entropy": 2.214826595783234,
      "epoch": 1.8512,
      "grad_norm": 0.7415009140968323,
      "learning_rate": 1.878800696915737e-05,
      "loss": 1.6113,
      "mean_token_accuracy": 0.5840038731694221,
      "num_tokens": 2278414.0,
      "step": 290
    },
    {
      "entropy": 2.187604659795761,
      "epoch": 1.9152,
      "grad_norm": 0.662319540977478,
      "learning_rate": 1.868764432448369e-05,
      "loss": 1.6182,
      "mean_token_accuracy": 0.580166706442833,
      "num_tokens": 2355826.0,
      "step": 300
    },
    {
      "entropy": 2.2184703826904295,
      "epoch": 1.9792,
      "grad_norm": 0.7123025059700012,
      "learning_rate": 1.8583580356236065e-05,
      "loss": 1.655,
      "mean_token_accuracy": 0.5762834578752518,
      "num_tokens": 2434933.0,
      "step": 310
    },
    {
      "entropy": 2.1887036239778674,
      "epoch": 2.0384,
      "grad_norm": 0.6846157312393188,
      "learning_rate": 1.8475859400301708e-05,
      "loss": 1.5935,
      "mean_token_accuracy": 0.5881956976813238,
      "num_tokens": 2507166.0,
      "step": 320
    },
    {
      "entropy": 2.102977079153061,
      "epoch": 2.1024,
      "grad_norm": 0.7967628240585327,
      "learning_rate": 1.8364527350607527e-05,
      "loss": 1.5405,
      "mean_token_accuracy": 0.5946892097592353,
      "num_tokens": 2584298.0,
      "step": 330
    },
    {
      "entropy": 2.118516767024994,
      "epoch": 2.1664,
      "grad_norm": 0.7417224645614624,
      "learning_rate": 1.824963163956726e-05,
      "loss": 1.5727,
      "mean_token_accuracy": 0.5870080485939979,
      "num_tokens": 2663601.0,
      "step": 340
    },
    {
      "entropy": 2.104418155550957,
      "epoch": 2.2304,
      "grad_norm": 0.7956721782684326,
      "learning_rate": 1.8131221217873175e-05,
      "loss": 1.5575,
      "mean_token_accuracy": 0.5936456203460694,
      "num_tokens": 2744783.0,
      "step": 350
    },
    {
      "entropy": 2.129578319191933,
      "epoch": 2.2944,
      "grad_norm": 0.769292950630188,
      "learning_rate": 1.8009346533640877e-05,
      "loss": 1.5878,
      "mean_token_accuracy": 0.5841517195105552,
      "num_tokens": 2823023.0,
      "step": 360
    },
    {
      "entropy": 2.097687366604805,
      "epoch": 2.3584,
      "grad_norm": 0.9341740608215332,
      "learning_rate": 1.7884059510916167e-05,
      "loss": 1.5346,
      "mean_token_accuracy": 0.599460557103157,
      "num_tokens": 2899598.0,
      "step": 370
    },
    {
      "entropy": 2.151599031686783,
      "epoch": 2.4224,
      "grad_norm": 0.8752340078353882,
      "learning_rate": 1.7755413527553087e-05,
      "loss": 1.5984,
      "mean_token_accuracy": 0.585393351316452,
      "num_tokens": 2978519.0,
      "step": 380
    },
    {
      "entropy": 2.1223404884338377,
      "epoch": 2.4864,
      "grad_norm": 1.0296390056610107,
      "learning_rate": 1.7623463392472574e-05,
      "loss": 1.5232,
      "mean_token_accuracy": 0.595654422044754,
      "num_tokens": 3055327.0,
      "step": 390
    },
    {
      "entropy": 2.16276493370533,
      "epoch": 2.5504,
      "grad_norm": 0.9905762672424316,
      "learning_rate": 1.748826532231142e-05,
      "loss": 1.6049,
      "mean_token_accuracy": 0.5822189599275589,
      "num_tokens": 3135348.0,
      "step": 400
    },
    {
      "entropy": 2.127479985356331,
      "epoch": 2.6144,
      "grad_norm": 0.851375162601471,
      "learning_rate": 1.7349876917471474e-05,
      "loss": 1.5842,
      "mean_token_accuracy": 0.5855211839079857,
      "num_tokens": 3213122.0,
      "step": 410
    },
    {
      "entropy": 2.167752879858017,
      "epoch": 2.6784,
      "grad_norm": 0.975143313407898,
      "learning_rate": 1.7208357137579318e-05,
      "loss": 1.5918,
      "mean_token_accuracy": 0.5839722648262977,
      "num_tokens": 3289583.0,
      "step": 420
    },
    {
      "entropy": 2.127084198594093,
      "epoch": 2.7424,
      "grad_norm": 0.8077936768531799,
      "learning_rate": 1.7063766276366814e-05,
      "loss": 1.5916,
      "mean_token_accuracy": 0.5900941833853721,
      "num_tokens": 3369740.0,
      "step": 430
    },
    {
      "entropy": 2.1315969794988634,
      "epoch": 2.8064,
      "grad_norm": 0.9403624534606934,
      "learning_rate": 1.6916165935983323e-05,
      "loss": 1.5713,
      "mean_token_accuracy": 0.5892721861600876,
      "num_tokens": 3448328.0,
      "step": 440
    },
    {
      "entropy": 2.130605939030647,
      "epoch": 2.8704,
      "grad_norm": 0.8252040147781372,
      "learning_rate": 1.676561900075041e-05,
      "loss": 1.6003,
      "mean_token_accuracy": 0.5845118075609207,
      "num_tokens": 3529853.0,
      "step": 450
    },
    {
      "entropy": 2.112012493610382,
      "epoch": 2.9344,
      "grad_norm": 0.9267668724060059,
      "learning_rate": 1.6612189610370336e-05,
      "loss": 1.5796,
      "mean_token_accuracy": 0.5887707889080047,
      "num_tokens": 3610922.0,
      "step": 460
    },
    {
      "entropy": 2.100590059161186,
      "epoch": 2.9984,
      "grad_norm": 0.8996879458427429,
      "learning_rate": 1.6455943132599698e-05,
      "loss": 1.5483,
      "mean_token_accuracy": 0.5934251204133034,
      "num_tokens": 3688391.0,
      "step": 470
    },
    {
      "entropy": 2.1115864160898568,
      "epoch": 3.0576,
      "grad_norm": 1.097270131111145,
      "learning_rate": 1.6296946135399835e-05,
      "loss": 1.5506,
      "mean_token_accuracy": 0.592829834770512,
      "num_tokens": 3758747.0,
      "step": 480
    },
    {
      "entropy": 2.0610430628061294,
      "epoch": 3.1216,
      "grad_norm": 1.176645278930664,
      "learning_rate": 1.613526635857591e-05,
      "loss": 1.4461,
      "mean_token_accuracy": 0.6111307457089424,
      "num_tokens": 3834689.0,
      "step": 490
    },
    {
      "entropy": 2.0154007196426393,
      "epoch": 3.1856,
      "grad_norm": 1.1834276914596558,
      "learning_rate": 1.5970972684916754e-05,
      "loss": 1.4852,
      "mean_token_accuracy": 0.6026980608701706,
      "num_tokens": 3916450.0,
      "step": 500
    },
    {
      "entropy": 2.0441433399915696,
      "epoch": 3.2496,
      "grad_norm": 1.159286379814148,
      "learning_rate": 1.5804135110847708e-05,
      "loss": 1.4978,
      "mean_token_accuracy": 0.6042912915349007,
      "num_tokens": 3998511.0,
      "step": 510
    },
    {
      "entropy": 2.0493109285831452,
      "epoch": 3.3136,
      "grad_norm": 1.2141708135604858,
      "learning_rate": 1.5634824716609037e-05,
      "loss": 1.5018,
      "mean_token_accuracy": 0.5995921581983567,
      "num_tokens": 4077676.0,
      "step": 520
    },
    {
      "entropy": 2.0533218771219253,
      "epoch": 3.3776,
      "grad_norm": 1.1630637645721436,
      "learning_rate": 1.5463113635972577e-05,
      "loss": 1.499,
      "mean_token_accuracy": 0.6046154126524925,
      "num_tokens": 4155264.0,
      "step": 530
    },
    {
      "entropy": 2.0600034058094026,
      "epoch": 3.4416,
      "grad_norm": 1.2523504495620728,
      "learning_rate": 1.528907502550954e-05,
      "loss": 1.521,
      "mean_token_accuracy": 0.6000443026423454,
      "num_tokens": 4233655.0,
      "step": 540
    },
    {
      "entropy": 2.0414596855640412,
      "epoch": 3.5056000000000003,
      "grad_norm": 1.3990252017974854,
      "learning_rate": 1.5112783033422547e-05,
      "loss": 1.4899,
      "mean_token_accuracy": 0.6026965886354446,
      "num_tokens": 4311644.0,
      "step": 550
    },
    {
      "entropy": 2.061043033003807,
      "epoch": 3.5696,
      "grad_norm": 1.1884260177612305,
      "learning_rate": 1.4934312767955193e-05,
      "loss": 1.5143,
      "mean_token_accuracy": 0.5981319859623909,
      "num_tokens": 4390933.0,
      "step": 560
    },
    {
      "entropy": 2.034099668264389,
      "epoch": 3.6336,
      "grad_norm": 1.2996599674224854,
      "learning_rate": 1.4753740265392595e-05,
      "loss": 1.4953,
      "mean_token_accuracy": 0.6029247522354126,
      "num_tokens": 4470462.0,
      "step": 570
    },
    {
      "entropy": 2.0379767954349517,
      "epoch": 3.6976,
      "grad_norm": 1.2936193943023682,
      "learning_rate": 1.4571142457666536e-05,
      "loss": 1.4965,
      "mean_token_accuracy": 0.6041712030768395,
      "num_tokens": 4549236.0,
      "step": 580
    },
    {
      "entropy": 2.040063351392746,
      "epoch": 3.7616,
      "grad_norm": 1.5094560384750366,
      "learning_rate": 1.4386597139579041e-05,
      "loss": 1.4979,
      "mean_token_accuracy": 0.6051288455724716,
      "num_tokens": 4628758.0,
      "step": 590
    },
    {
      "entropy": 1.9998936265707017,
      "epoch": 3.8256,
      "grad_norm": 1.3166426420211792,
      "learning_rate": 1.4200182935658327e-05,
      "loss": 1.459,
      "mean_token_accuracy": 0.6084850415587425,
      "num_tokens": 4708526.0,
      "step": 600
    },
    {
      "entropy": 2.0041965901851655,
      "epoch": 3.8895999999999997,
      "grad_norm": 1.2710400819778442,
      "learning_rate": 1.4011979266661235e-05,
      "loss": 1.4831,
      "mean_token_accuracy": 0.6057328775525093,
      "num_tokens": 4788733.0,
      "step": 610
    },
    {
      "entropy": 2.0265558779239656,
      "epoch": 3.9536,
      "grad_norm": 1.4318969249725342,
      "learning_rate": 1.3822066315736477e-05,
      "loss": 1.4966,
      "mean_token_accuracy": 0.5994595810770988,
      "num_tokens": 4866451.0,
      "step": 620
    },
    {
      "entropy": 2.0692459924800977,
      "epoch": 4.0128,
      "grad_norm": 1.2546013593673706,
      "learning_rate": 1.363052499426302e-05,
      "loss": 1.503,
      "mean_token_accuracy": 0.6039850309088424,
      "num_tokens": 4936715.0,
      "step": 630
    },
    {
      "entropy": 1.9788923293352128,
      "epoch": 4.0768,
      "grad_norm": 1.416927456855774,
      "learning_rate": 1.3437436907378225e-05,
      "loss": 1.4248,
      "mean_token_accuracy": 0.6142558038234711,
      "num_tokens": 5016713.0,
      "step": 640
    },
    {
      "entropy": 1.9646029412746429,
      "epoch": 4.1408,
      "grad_norm": 1.5146726369857788,
      "learning_rate": 1.3242884319210463e-05,
      "loss": 1.3875,
      "mean_token_accuracy": 0.624424883723259,
      "num_tokens": 5096513.0,
      "step": 650
    },
    {
      "entropy": 1.93471617102623,
      "epoch": 4.2048,
      "grad_norm": 1.5090768337249756,
      "learning_rate": 1.3046950117830888e-05,
      "loss": 1.3884,
      "mean_token_accuracy": 0.6222448632121086,
      "num_tokens": 5177075.0,
      "step": 660
    },
    {
      "entropy": 2.002266028523445,
      "epoch": 4.2688,
      "grad_norm": 1.74358332157135,
      "learning_rate": 1.2849717779939439e-05,
      "loss": 1.4062,
      "mean_token_accuracy": 0.6180147424340248,
      "num_tokens": 5252902.0,
      "step": 670
    },
    {
      "entropy": 1.9397415190935134,
      "epoch": 4.3328,
      "grad_norm": 1.774728775024414,
      "learning_rate": 1.2651271335300063e-05,
      "loss": 1.3933,
      "mean_token_accuracy": 0.626343595981598,
      "num_tokens": 5331448.0,
      "step": 680
    },
    {
      "entropy": 1.9571841150522231,
      "epoch": 4.3968,
      "grad_norm": 1.80965256690979,
      "learning_rate": 1.2451695330940268e-05,
      "loss": 1.4205,
      "mean_token_accuracy": 0.6187710732221603,
      "num_tokens": 5410857.0,
      "step": 690
    },
    {
      "entropy": 1.9691186994314194,
      "epoch": 4.4608,
      "grad_norm": 1.5400609970092773,
      "learning_rate": 1.2251074795130339e-05,
      "loss": 1.4123,
      "mean_token_accuracy": 0.614769059419632,
      "num_tokens": 5488867.0,
      "step": 700
    },
    {
      "entropy": 1.9635825514793397,
      "epoch": 4.5248,
      "grad_norm": 1.467608094215393,
      "learning_rate": 1.2049495201157489e-05,
      "loss": 1.4228,
      "mean_token_accuracy": 0.6202724784612655,
      "num_tokens": 5567515.0,
      "step": 710
    },
    {
      "entropy": 1.9384470194578172,
      "epoch": 4.5888,
      "grad_norm": 1.652387022972107,
      "learning_rate": 1.1847042430910451e-05,
      "loss": 1.4273,
      "mean_token_accuracy": 0.6190450325608253,
      "num_tokens": 5648858.0,
      "step": 720
    },
    {
      "entropy": 1.9911590039730072,
      "epoch": 4.6528,
      "grad_norm": 1.7492380142211914,
      "learning_rate": 1.1643802738289955e-05,
      "loss": 1.4776,
      "mean_token_accuracy": 0.6073927089571953,
      "num_tokens": 5725459.0,
      "step": 730
    },
    {
      "entropy": 1.9724233269691467,
      "epoch": 4.7168,
      "grad_norm": 1.709669828414917,
      "learning_rate": 1.1439862712460721e-05,
      "loss": 1.4217,
      "mean_token_accuracy": 0.6184087961912155,
      "num_tokens": 5801601.0,
      "step": 740
    },
    {
      "entropy": 1.9725236982107162,
      "epoch": 4.7808,
      "grad_norm": 1.7469470500946045,
      "learning_rate": 1.1235309240960621e-05,
      "loss": 1.405,
      "mean_token_accuracy": 0.6196158319711685,
      "num_tokens": 5881107.0,
      "step": 750
    },
    {
      "entropy": 1.9484833419322967,
      "epoch": 4.8448,
      "grad_norm": 1.532373309135437,
      "learning_rate": 1.1030229472682719e-05,
      "loss": 1.4155,
      "mean_token_accuracy": 0.611663281917572,
      "num_tokens": 5960375.0,
      "step": 760
    },
    {
      "entropy": 1.9964754343032838,
      "epoch": 4.9088,
      "grad_norm": 1.7157669067382812,
      "learning_rate": 1.0824710780745954e-05,
      "loss": 1.4295,
      "mean_token_accuracy": 0.6131752103567123,
      "num_tokens": 6038267.0,
      "step": 770
    },
    {
      "entropy": 1.9598666340112687,
      "epoch": 4.9728,
      "grad_norm": 1.9844586849212646,
      "learning_rate": 1.06188407252703e-05,
      "loss": 1.397,
      "mean_token_accuracy": 0.6226776748895645,
      "num_tokens": 6114749.0,
      "step": 780
    },
    {
      "entropy": 1.9227982276194804,
      "epoch": 5.032,
      "grad_norm": 1.8960447311401367,
      "learning_rate": 1.0412707016072254e-05,
      "loss": 1.3649,
      "mean_token_accuracy": 0.6269845414806057,
      "num_tokens": 6190567.0,
      "step": 790
    },
    {
      "entropy": 1.9008578658103943,
      "epoch": 5.096,
      "grad_norm": 2.1205599308013916,
      "learning_rate": 1.0206397475296548e-05,
      "loss": 1.3582,
      "mean_token_accuracy": 0.6292989999055862,
      "num_tokens": 6269285.0,
      "step": 800
    },
    {
      "entropy": 1.9224162876605988,
      "epoch": 5.16,
      "grad_norm": 2.0454013347625732,
      "learning_rate": 1e-05,
      "loss": 1.3349,
      "mean_token_accuracy": 0.6315066292881966,
      "num_tokens": 6345352.0,
      "step": 810
    },
    {
      "entropy": 1.9340467154979706,
      "epoch": 5.224,
      "grad_norm": 2.2607693672180176,
      "learning_rate": 9.793602524703456e-06,
      "loss": 1.359,
      "mean_token_accuracy": 0.6322078078985214,
      "num_tokens": 6422524.0,
      "step": 820
    },
    {
      "entropy": 1.9296668291091919,
      "epoch": 5.288,
      "grad_norm": 2.1245901584625244,
      "learning_rate": 9.58729298392775e-06,
      "loss": 1.3672,
      "mean_token_accuracy": 0.6282135233283043,
      "num_tokens": 6500128.0,
      "step": 830
    },
    {
      "entropy": 1.9272812247276305,
      "epoch": 5.352,
      "grad_norm": 1.965820550918579,
      "learning_rate": 9.381159274729704e-06,
      "loss": 1.3786,
      "mean_token_accuracy": 0.6249860525131226,
      "num_tokens": 6578766.0,
      "step": 840
    },
    {
      "entropy": 1.904970219731331,
      "epoch": 5.416,
      "grad_norm": 1.9188759326934814,
      "learning_rate": 9.175289219254051e-06,
      "loss": 1.3418,
      "mean_token_accuracy": 0.6325456693768501,
      "num_tokens": 6658732.0,
      "step": 850
    },
    {
      "entropy": 1.8833305448293687,
      "epoch": 5.48,
      "grad_norm": 1.9675428867340088,
      "learning_rate": 8.969770527317283e-06,
      "loss": 1.3274,
      "mean_token_accuracy": 0.6377805054187775,
      "num_tokens": 6738683.0,
      "step": 860
    },
    {
      "entropy": 1.8806802958250046,
      "epoch": 5.5440000000000005,
      "grad_norm": 1.8849304914474487,
      "learning_rate": 8.764690759039382e-06,
      "loss": 1.3109,
      "mean_token_accuracy": 0.636364534497261,
      "num_tokens": 6817786.0,
      "step": 870
    },
    {
      "entropy": 1.8846195042133331,
      "epoch": 5.608,
      "grad_norm": 2.050208330154419,
      "learning_rate": 8.56013728753928e-06,
      "loss": 1.3449,
      "mean_token_accuracy": 0.6316975012421608,
      "num_tokens": 6896222.0,
      "step": 880
    },
    {
      "entropy": 1.88524529337883,
      "epoch": 5.672,
      "grad_norm": 2.1371288299560547,
      "learning_rate": 8.356197261710048e-06,
      "loss": 1.346,
      "mean_token_accuracy": 0.633928644657135,
      "num_tokens": 6976885.0,
      "step": 890
    },
    {
      "entropy": 1.9162244260311128,
      "epoch": 5.736,
      "grad_norm": 1.9879032373428345,
      "learning_rate": 8.152957569089552e-06,
      "loss": 1.3486,
      "mean_token_accuracy": 0.6311523199081421,
      "num_tokens": 7053473.0,
      "step": 900
    },
    {
      "entropy": 1.89161317050457,
      "epoch": 5.8,
      "grad_norm": 2.2934179306030273,
      "learning_rate": 7.950504798842513e-06,
      "loss": 1.3699,
      "mean_token_accuracy": 0.6269390240311623,
      "num_tokens": 7133137.0,
      "step": 910
    },
    {
      "entropy": 1.888116827607155,
      "epoch": 5.864,
      "grad_norm": 1.769087791442871,
      "learning_rate": 7.748925204869667e-06,
      "loss": 1.3756,
      "mean_token_accuracy": 0.6285945609211921,
      "num_tokens": 7213693.0,
      "step": 920
    },
    {
      "entropy": 1.89390210211277,
      "epoch": 5.928,
      "grad_norm": 2.2577364444732666,
      "learning_rate": 7.548304669059735e-06,
      "loss": 1.3396,
      "mean_token_accuracy": 0.6290415957570076,
      "num_tokens": 7291999.0,
      "step": 930
    },
    {
      "entropy": 1.8755547761917115,
      "epoch": 5.992,
      "grad_norm": 2.080371618270874,
      "learning_rate": 7.348728664699939e-06,
      "loss": 1.3305,
      "mean_token_accuracy": 0.6322756335139275,
      "num_tokens": 7370138.0,
      "step": 940
    },
    {
      "entropy": 1.8396991845723745,
      "epoch": 6.0512,
      "grad_norm": 2.222177028656006,
      "learning_rate": 7.150282220060564e-06,
      "loss": 1.2782,
      "mean_token_accuracy": 0.6437820018948736,
      "num_tokens": 7444764.0,
      "step": 950
    },
    {
      "entropy": 1.864711531996727,
      "epoch": 6.1152,
      "grad_norm": 2.236663579940796,
      "learning_rate": 6.9530498821691165e-06,
      "loss": 1.342,
      "mean_token_accuracy": 0.6400286257266998,
      "num_tokens": 7523012.0,
      "step": 960
    },
    {
      "entropy": 1.8462383985519408,
      "epoch": 6.1792,
      "grad_norm": 2.438649892807007,
      "learning_rate": 6.757115680789539e-06,
      "loss": 1.2769,
      "mean_token_accuracy": 0.6437345445156097,
      "num_tokens": 7602451.0,
      "step": 970
    },
    {
      "entropy": 1.8425735771656035,
      "epoch": 6.2432,
      "grad_norm": 2.306880235671997,
      "learning_rate": 6.562563092621776e-06,
      "loss": 1.309,
      "mean_token_accuracy": 0.6463457986712455,
      "num_tokens": 7681972.0,
      "step": 980
    },
    {
      "entropy": 1.8139673799276352,
      "epoch": 6.3072,
      "grad_norm": 2.286114454269409,
      "learning_rate": 6.369475005736984e-06,
      "loss": 1.2748,
      "mean_token_accuracy": 0.6487143859267235,
      "num_tokens": 7762845.0,
      "step": 990
    },
    {
      "entropy": 1.8660429507493972,
      "epoch": 6.3712,
      "grad_norm": 2.421706199645996,
      "learning_rate": 6.177933684263524e-06,
      "loss": 1.2964,
      "mean_token_accuracy": 0.6455973491072655,
      "num_tokens": 7839552.0,
      "step": 1000
    },
    {
      "entropy": 1.8517659038305283,
      "epoch": 6.4352,
      "grad_norm": 2.3891334533691406,
      "learning_rate": 5.988020733338767e-06,
      "loss": 1.2893,
      "mean_token_accuracy": 0.6442387655377388,
      "num_tokens": 7915996.0,
      "step": 1010
    },
    {
      "entropy": 1.8547363132238388,
      "epoch": 6.4992,
      "grad_norm": 2.26686429977417,
      "learning_rate": 5.7998170643416795e-06,
      "loss": 1.2973,
      "mean_token_accuracy": 0.6435917019844055,
      "num_tokens": 7995119.0,
      "step": 1020
    },
    {
      "entropy": 1.8365773737430573,
      "epoch": 6.5632,
      "grad_norm": 2.1454896926879883,
      "learning_rate": 5.613402860420962e-06,
      "loss": 1.2744,
      "mean_token_accuracy": 0.6410152271389962,
      "num_tokens": 8075306.0,
      "step": 1030
    },
    {
      "entropy": 1.8936803489923477,
      "epoch": 6.6272,
      "grad_norm": 2.5226423740386963,
      "learning_rate": 5.428857542333465e-06,
      "loss": 1.3225,
      "mean_token_accuracy": 0.6396260514855385,
      "num_tokens": 8152449.0,
      "step": 1040
    },
    {
      "entropy": 1.8555004209280015,
      "epoch": 6.6912,
      "grad_norm": 2.216014862060547,
      "learning_rate": 5.246259734607411e-06,
      "loss": 1.299,
      "mean_token_accuracy": 0.641279113292694,
      "num_tokens": 8231904.0,
      "step": 1050
    },
    {
      "entropy": 1.8588679373264312,
      "epoch": 6.7552,
      "grad_norm": 2.4265236854553223,
      "learning_rate": 5.065687232044811e-06,
      "loss": 1.3026,
      "mean_token_accuracy": 0.6363563358783721,
      "num_tokens": 8310755.0,
      "step": 1060
    },
    {
      "entropy": 1.8318012267351151,
      "epoch": 6.8192,
      "grad_norm": 2.2089412212371826,
      "learning_rate": 4.887216966577458e-06,
      "loss": 1.2583,
      "mean_token_accuracy": 0.6502064153552055,
      "num_tokens": 8390161.0,
      "step": 1070
    },
    {
      "entropy": 1.8765722244977951,
      "epoch": 6.8832,
      "grad_norm": 2.3233554363250732,
      "learning_rate": 4.710924974490463e-06,
      "loss": 1.3223,
      "mean_token_accuracy": 0.6393219083547592,
      "num_tokens": 8469078.0,
      "step": 1080
    },
    {
      "entropy": 1.8413788318634032,
      "epoch": 6.9472000000000005,
      "grad_norm": 2.321904420852661,
      "learning_rate": 4.536886364027428e-06,
      "loss": 1.272,
      "mean_token_accuracy": 0.647525629401207,
      "num_tokens": 8547873.0,
      "step": 1090
    },
    {
      "entropy": 1.8666607818088017,
      "epoch": 7.0064,
      "grad_norm": 2.1004791259765625,
      "learning_rate": 4.365175283390968e-06,
      "loss": 1.2721,
      "mean_token_accuracy": 0.6479364424138456,
      "num_tokens": 8619109.0,
      "step": 1100
    },
    {
      "entropy": 1.8333647519350051,
      "epoch": 7.0704,
      "grad_norm": 2.9210190773010254,
      "learning_rate": 4.195864889152295e-06,
      "loss": 1.1833,
      "mean_token_accuracy": 0.6699477419257164,
      "num_tokens": 8692475.0,
      "step": 1110
    },
    {
      "entropy": 1.8425445258617401,
      "epoch": 7.1344,
      "grad_norm": 2.3149521350860596,
      "learning_rate": 4.029027315083251e-06,
      "loss": 1.2707,
      "mean_token_accuracy": 0.650185227394104,
      "num_tokens": 8770456.0,
      "step": 1120
    },
    {
      "entropy": 1.8121359765529632,
      "epoch": 7.1984,
      "grad_norm": 2.6502795219421387,
      "learning_rate": 3.864733641424093e-06,
      "loss": 1.2383,
      "mean_token_accuracy": 0.6547705471515656,
      "num_tokens": 8851214.0,
      "step": 1130
    },
    {
      "entropy": 1.802490884065628,
      "epoch": 7.2624,
      "grad_norm": 2.227534770965576,
      "learning_rate": 3.703053864600169e-06,
      "loss": 1.2603,
      "mean_token_accuracy": 0.6489648431539535,
      "num_tokens": 8932363.0,
      "step": 1140
    },
    {
      "entropy": 1.8214709132909774,
      "epoch": 7.3264,
      "grad_norm": 2.5923874378204346,
      "learning_rate": 3.544056867400306e-06,
      "loss": 1.248,
      "mean_token_accuracy": 0.651621387898922,
      "num_tokens": 9011734.0,
      "step": 1150
    },
    {
      "entropy": 1.826240959763527,
      "epoch": 7.3904,
      "grad_norm": 2.67551589012146,
      "learning_rate": 3.3878103896296677e-06,
      "loss": 1.2488,
      "mean_token_accuracy": 0.6530374586582184,
      "num_tokens": 9090277.0,
      "step": 1160
    },
    {
      "entropy": 1.837952870130539,
      "epoch": 7.4544,
      "grad_norm": 2.2191765308380127,
      "learning_rate": 3.2343809992495945e-06,
      "loss": 1.2704,
      "mean_token_accuracy": 0.6503957703709602,
      "num_tokens": 9168093.0,
      "step": 1170
    },
    {
      "entropy": 1.8135560542345046,
      "epoch": 7.5184,
      "grad_norm": 2.5211071968078613,
      "learning_rate": 3.083834064016682e-06,
      "loss": 1.2212,
      "mean_token_accuracy": 0.6587097644805908,
      "num_tokens": 9247777.0,
      "step": 1180
    },
    {
      "entropy": 1.8237973660230637,
      "epoch": 7.5824,
      "grad_norm": 2.6236841678619385,
      "learning_rate": 2.9362337236331884e-06,
      "loss": 1.2604,
      "mean_token_accuracy": 0.6501624628901481,
      "num_tokens": 9325367.0,
      "step": 1190
    },
    {
      "entropy": 1.836614164710045,
      "epoch": 7.6464,
      "grad_norm": 2.726731777191162,
      "learning_rate": 2.791642862420686e-06,
      "loss": 1.2554,
      "mean_token_accuracy": 0.6520631939172745,
      "num_tokens": 9403641.0,
      "step": 1200
    },
    {
      "entropy": 1.8044064462184906,
      "epoch": 7.7104,
      "grad_norm": 2.4943737983703613,
      "learning_rate": 2.6501230825285294e-06,
      "loss": 1.2519,
      "mean_token_accuracy": 0.6524736672639847,
      "num_tokens": 9484075.0,
      "step": 1210
    },
    {
      "entropy": 1.8258908241987228,
      "epoch": 7.7744,
      "grad_norm": 2.4426612854003906,
      "learning_rate": 2.5117346776885843e-06,
      "loss": 1.251,
      "mean_token_accuracy": 0.6484281331300735,
      "num_tokens": 9561148.0,
      "step": 1220
    },
    {
      "entropy": 1.8062447488307953,
      "epoch": 7.8384,
      "grad_norm": 2.465646266937256,
      "learning_rate": 2.3765366075274287e-06,
      "loss": 1.2662,
      "mean_token_accuracy": 0.6492940753698349,
      "num_tokens": 9642108.0,
      "step": 1230
    },
    {
      "entropy": 1.8293108910322189,
      "epoch": 7.9024,
      "grad_norm": 2.4230668544769287,
      "learning_rate": 2.2445864724469146e-06,
      "loss": 1.2625,
      "mean_token_accuracy": 0.6592240884900094,
      "num_tokens": 9719660.0,
      "step": 1240
    },
    {
      "entropy": 1.837513843178749,
      "epoch": 7.9664,
      "grad_norm": 2.7502171993255615,
      "learning_rate": 2.1159404890838365e-06,
      "loss": 1.2677,
      "mean_token_accuracy": 0.6493206784129143,
      "num_tokens": 9797593.0,
      "step": 1250
    },
    {
      "entropy": 1.8162316245001715,
      "epoch": 8.0256,
      "grad_norm": 2.5199058055877686,
      "learning_rate": 1.990653466359125e-06,
      "loss": 1.2293,
      "mean_token_accuracy": 0.656300467413825,
      "num_tokens": 9871177.0,
      "step": 1260
    },
    {
      "entropy": 1.780314788222313,
      "epoch": 8.0896,
      "grad_norm": 2.5237162113189697,
      "learning_rate": 1.8687787821268255e-06,
      "loss": 1.1791,
      "mean_token_accuracy": 0.6675050809979439,
      "num_tokens": 9949391.0,
      "step": 1270
    },
    {
      "entropy": 1.779639583826065,
      "epoch": 8.1536,
      "grad_norm": 2.4559428691864014,
      "learning_rate": 1.7503683604327426e-06,
      "loss": 1.2177,
      "mean_token_accuracy": 0.6600575730204582,
      "num_tokens": 10030182.0,
      "step": 1280
    },
    {
      "entropy": 1.7865025967359542,
      "epoch": 8.2176,
      "grad_norm": 2.9508230686187744,
      "learning_rate": 1.6354726493924745e-06,
      "loss": 1.1937,
      "mean_token_accuracy": 0.6630557537078857,
      "num_tokens": 10107960.0,
      "step": 1290
    },
    {
      "entropy": 1.8122670024633407,
      "epoch": 8.2816,
      "grad_norm": 2.6917898654937744,
      "learning_rate": 1.5241405996982928e-06,
      "loss": 1.2319,
      "mean_token_accuracy": 0.6598842918872834,
      "num_tokens": 10185524.0,
      "step": 1300
    },
    {
      "entropy": 1.806730917096138,
      "epoch": 8.3456,
      "grad_norm": 2.7887086868286133,
      "learning_rate": 1.4164196437639355e-06,
      "loss": 1.25,
      "mean_token_accuracy": 0.6578737393021583,
      "num_tokens": 10265123.0,
      "step": 1310
    },
    {
      "entropy": 1.8156007081270218,
      "epoch": 8.4096,
      "grad_norm": 2.9965310096740723,
      "learning_rate": 1.3123556755163114e-06,
      "loss": 1.234,
      "mean_token_accuracy": 0.6579165816307068,
      "num_tokens": 10342205.0,
      "step": 1320
    },
    {
      "entropy": 1.8044028550386428,
      "epoch": 8.4736,
      "grad_norm": 2.9466843605041504,
      "learning_rate": 1.2119930308426264e-06,
      "loss": 1.2423,
      "mean_token_accuracy": 0.6527451828122139,
      "num_tokens": 10420603.0,
      "step": 1330
    },
    {
      "entropy": 1.8250535994768142,
      "epoch": 8.5376,
      "grad_norm": 2.9452784061431885,
      "learning_rate": 1.1153744687013313e-06,
      "loss": 1.258,
      "mean_token_accuracy": 0.6589037463068962,
      "num_tokens": 10499049.0,
      "step": 1340
    },
    {
      "entropy": 1.7990799486637115,
      "epoch": 8.6016,
      "grad_norm": 2.6469309329986572,
      "learning_rate": 1.0225411529048857e-06,
      "loss": 1.2415,
      "mean_token_accuracy": 0.6555879130959511,
      "num_tokens": 10578178.0,
      "step": 1350
    },
    {
      "entropy": 1.7638877242803574,
      "epoch": 8.6656,
      "grad_norm": 2.77990460395813,
      "learning_rate": 9.33532634582156e-07,
      "loss": 1.2143,
      "mean_token_accuracy": 0.6589834168553352,
      "num_tokens": 10659018.0,
      "step": 1360
    },
    {
      "entropy": 1.8123771637678145,
      "epoch": 8.7296,
      "grad_norm": 3.1158993244171143,
      "learning_rate": 8.483868353278657e-07,
      "loss": 1.2358,
      "mean_token_accuracy": 0.6561313390731811,
      "num_tokens": 10736582.0,
      "step": 1370
    },
    {
      "entropy": 1.8054670304059983,
      "epoch": 8.7936,
      "grad_norm": 2.915422201156616,
      "learning_rate": 7.671400310462984e-07,
      "loss": 1.2089,
      "mean_token_accuracy": 0.6610309720039368,
      "num_tokens": 10814534.0,
      "step": 1380
    },
    {
      "entropy": 1.800497230887413,
      "epoch": 8.8576,
      "grad_norm": 2.7816338539123535,
      "learning_rate": 6.898268364961591e-07,
      "loss": 1.2227,
      "mean_token_accuracy": 0.6584793984889984,
      "num_tokens": 10893484.0,
      "step": 1390
    },
    {
      "entropy": 1.7850348353385925,
      "epoch": 8.9216,
      "grad_norm": 2.569054126739502,
      "learning_rate": 6.164801905431394e-07,
      "loss": 1.2242,
      "mean_token_accuracy": 0.6574000924825668,
      "num_tokens": 10973818.0,
      "step": 1400
    },
    {
      "entropy": 1.795585972070694,
      "epoch": 8.9856,
      "grad_norm": 2.6529977321624756,
      "learning_rate": 5.471313421264879e-07,
      "loss": 1.2127,
      "mean_token_accuracy": 0.6600923746824264,
      "num_tokens": 11051396.0,
      "step": 1410
    }
  ],
  "logging_steps": 10,
  "max_steps": 1570,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.066034365626581e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}