{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 1570,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 2.05683453977108,
      "epoch": 0.064,
      "grad_norm": 1.5962693691253662,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 2.056,
      "mean_token_accuracy": 0.527055786550045,
      "num_tokens": 78959.0,
      "step": 10
    },
    {
      "entropy": 2.2151891469955443,
      "epoch": 0.128,
      "grad_norm": 1.022516131401062,
      "learning_rate": 7.916666666666667e-06,
      "loss": 2.0766,
      "mean_token_accuracy": 0.5204883277416229,
      "num_tokens": 158094.0,
      "step": 20
    },
    {
      "entropy": 2.376429131627083,
      "epoch": 0.192,
      "grad_norm": 0.8220515251159668,
      "learning_rate": 1.2083333333333333e-05,
      "loss": 1.8902,
      "mean_token_accuracy": 0.5391427092254162,
      "num_tokens": 239163.0,
      "step": 30
    },
    {
      "entropy": 2.285146689414978,
      "epoch": 0.256,
      "grad_norm": 0.7022648453712463,
      "learning_rate": 1.6250000000000002e-05,
      "loss": 1.7478,
      "mean_token_accuracy": 0.56041978597641,
      "num_tokens": 317818.0,
      "step": 40
    },
    {
      "entropy": 2.3282038152217863,
      "epoch": 0.32,
      "grad_norm": 0.5584391951560974,
      "learning_rate": 1.9999978697023387e-05,
      "loss": 1.7687,
      "mean_token_accuracy": 0.5601607479155064,
      "num_tokens": 396146.0,
      "step": 50
    },
    {
      "entropy": 2.2709642231464384,
      "epoch": 0.384,
      "grad_norm": 0.5373395085334778,
      "learning_rate": 1.999742244965125e-05,
      "loss": 1.6913,
      "mean_token_accuracy": 0.5693033933639526,
      "num_tokens": 474291.0,
      "step": 60
    },
    {
      "entropy": 2.2445768117904663,
      "epoch": 0.448,
      "grad_norm": 0.4558122754096985,
      "learning_rate": 1.9990606854864625e-05,
      "loss": 1.679,
      "mean_token_accuracy": 0.5720810443162918,
      "num_tokens": 554739.0,
      "step": 70
    },
    {
      "entropy": 2.2270330280065536,
      "epoch": 0.512,
      "grad_norm": 0.5535369515419006,
      "learning_rate": 1.997953481641056e-05,
      "loss": 1.6522,
      "mean_token_accuracy": 0.574026207625866,
      "num_tokens": 633658.0,
      "step": 80
    },
    {
      "entropy": 2.2367560386657717,
      "epoch": 0.576,
      "grad_norm": 0.5366299152374268,
      "learning_rate": 1.9964211051470778e-05,
      "loss": 1.6955,
      "mean_token_accuracy": 0.5699351653456688,
      "num_tokens": 712400.0,
      "step": 90
    },
    {
      "entropy": 2.21894571185112,
      "epoch": 0.64,
      "grad_norm": 0.4690150022506714,
      "learning_rate": 1.994464208865191e-05,
      "loss": 1.7048,
      "mean_token_accuracy": 0.5701304003596306,
      "num_tokens": 792630.0,
      "step": 100
    },
    {
      "entropy": 2.235249537229538,
      "epoch": 0.704,
      "grad_norm": 0.5834165811538696,
      "learning_rate": 1.9920836265204047e-05,
      "loss": 1.7032,
      "mean_token_accuracy": 0.5705543920397759,
      "num_tokens": 872045.0,
      "step": 110
    },
    {
      "entropy": 2.2257163137197495,
      "epoch": 0.768,
      "grad_norm": 0.5584805011749268,
      "learning_rate": 1.989280372346868e-05,
      "loss": 1.666,
      "mean_token_accuracy": 0.5684764981269836,
      "num_tokens": 952057.0,
      "step": 120
    },
    {
      "entropy": 2.2563431203365325,
      "epoch": 0.832,
      "grad_norm": 0.5170231461524963,
      "learning_rate": 1.986055640655763e-05,
      "loss": 1.7134,
      "mean_token_accuracy": 0.570289532840252,
      "num_tokens": 1029200.0,
      "step": 130
    },
    {
      "entropy": 2.2378907680511473,
      "epoch": 0.896,
      "grad_norm": 0.5027748942375183,
      "learning_rate": 1.9824108053264726e-05,
      "loss": 1.6719,
      "mean_token_accuracy": 0.5730531394481659,
      "num_tokens": 1105844.0,
      "step": 140
    },
    {
      "entropy": 2.1966699600219726,
      "epoch": 0.96,
      "grad_norm": 0.5884814262390137,
      "learning_rate": 1.9783474192212484e-05,
      "loss": 1.6327,
      "mean_token_accuracy": 0.5813805550336838,
      "num_tokens": 1182935.0,
      "step": 150
    },
    {
      "entropy": 2.20564815804765,
      "epoch": 1.0192,
      "grad_norm": 0.570175290107727,
      "learning_rate": 1.9738672135236218e-05,
      "loss": 1.6118,
      "mean_token_accuracy": 0.582583570802534,
      "num_tokens": 1254363.0,
      "step": 160
    },
    {
      "entropy": 2.1847074955701826,
      "epoch": 1.0832,
      "grad_norm": 0.5836730003356934,
      "learning_rate": 1.968972097000843e-05,
      "loss": 1.6172,
      "mean_token_accuracy": 0.5812226444482803,
      "num_tokens": 1330281.0,
      "step": 170
    },
    {
      "entropy": 2.1814055383205413,
      "epoch": 1.1472,
      "grad_norm": 0.5746439695358276,
      "learning_rate": 1.96366415519066e-05,
      "loss": 1.6192,
      "mean_token_accuracy": 0.5789176046848297,
      "num_tokens": 1409407.0,
      "step": 180
    },
    {
      "entropy": 2.2038993716239927,
      "epoch": 1.2112,
      "grad_norm": 0.5652104616165161,
      "learning_rate": 1.957945649512788e-05,
      "loss": 1.6166,
      "mean_token_accuracy": 0.5809548154473305,
      "num_tokens": 1489034.0,
      "step": 190
    },
    {
      "entropy": 2.173789343237877,
      "epoch": 1.2752,
      "grad_norm": 0.6653291583061218,
      "learning_rate": 1.951819016305442e-05,
      "loss": 1.62,
      "mean_token_accuracy": 0.5827470317482948,
      "num_tokens": 1568549.0,
      "step": 200
    },
    {
      "entropy": 2.1907752990722655,
      "epoch": 1.3392,
      "grad_norm": 0.7024573087692261,
      "learning_rate": 1.9452868657873513e-05,
      "loss": 1.6397,
      "mean_token_accuracy": 0.5796025812625885,
      "num_tokens": 1647404.0,
      "step": 210
    },
    {
      "entropy": 2.189376249909401,
      "epoch": 1.4032,
      "grad_norm": 0.5727422833442688,
      "learning_rate": 1.9383519809456862e-05,
      "loss": 1.6349,
      "mean_token_accuracy": 0.5815459445118905,
      "num_tokens": 1728421.0,
      "step": 220
    },
    {
      "entropy": 2.209022229909897,
      "epoch": 1.4672,
      "grad_norm": 0.6421232223510742,
      "learning_rate": 1.931017316350384e-05,
      "loss": 1.6425,
      "mean_token_accuracy": 0.5790404245257378,
      "num_tokens": 1806891.0,
      "step": 230
    },
    {
      "entropy": 2.2337595343589784,
      "epoch": 1.5312000000000001,
      "grad_norm": 0.6296209692955017,
      "learning_rate": 1.9232859968953702e-05,
      "loss": 1.624,
      "mean_token_accuracy": 0.5814317353069782,
      "num_tokens": 1883100.0,
      "step": 240
    },
    {
      "entropy": 2.205833575129509,
      "epoch": 1.5952,
      "grad_norm": 0.6371021866798401,
      "learning_rate": 1.9151613164672136e-05,
      "loss": 1.6284,
      "mean_token_accuracy": 0.5819905593991279,
      "num_tokens": 1961317.0,
      "step": 250
    },
    {
      "entropy": 2.205822005867958,
      "epoch": 1.6592,
      "grad_norm": 0.6950616836547852,
      "learning_rate": 1.9066467365417844e-05,
      "loss": 1.6374,
      "mean_token_accuracy": 0.5760326236486435,
      "num_tokens": 2042881.0,
      "step": 260
    },
    {
      "entropy": 2.2163637399673464,
      "epoch": 1.7231999999999998,
      "grad_norm": 0.7801616191864014,
      "learning_rate": 1.8977458847095117e-05,
      "loss": 1.663,
      "mean_token_accuracy": 0.5744953289628029,
      "num_tokens": 2121403.0,
      "step": 270
    },
    {
      "entropy": 2.199243775010109,
      "epoch": 1.7872,
      "grad_norm": 0.6671239733695984,
      "learning_rate": 1.888462553129867e-05,
      "loss": 1.6456,
      "mean_token_accuracy": 0.579181258380413,
      "num_tokens": 2200908.0,
      "step": 280
    },
    {
      "entropy": 2.214826595783234,
      "epoch": 1.8512,
      "grad_norm": 0.7415009140968323,
      "learning_rate": 1.878800696915737e-05,
      "loss": 1.6113,
      "mean_token_accuracy": 0.5840038731694221,
      "num_tokens": 2278414.0,
      "step": 290
    },
    {
      "entropy": 2.187604659795761,
      "epoch": 1.9152,
      "grad_norm": 0.662319540977478,
      "learning_rate": 1.868764432448369e-05,
      "loss": 1.6182,
      "mean_token_accuracy": 0.580166706442833,
      "num_tokens": 2355826.0,
      "step": 300
    },
    {
      "entropy": 2.2184703826904295,
      "epoch": 1.9792,
      "grad_norm": 0.7123025059700012,
      "learning_rate": 1.8583580356236065e-05,
      "loss": 1.655,
      "mean_token_accuracy": 0.5762834578752518,
      "num_tokens": 2434933.0,
      "step": 310
    },
    {
      "entropy": 2.1887036239778674,
      "epoch": 2.0384,
      "grad_norm": 0.6846157312393188,
      "learning_rate": 1.8475859400301708e-05,
      "loss": 1.5935,
      "mean_token_accuracy": 0.5881956976813238,
      "num_tokens": 2507166.0,
      "step": 320
    },
    {
      "entropy": 2.102977079153061,
      "epoch": 2.1024,
      "grad_norm": 0.7967628240585327,
      "learning_rate": 1.8364527350607527e-05,
      "loss": 1.5405,
      "mean_token_accuracy": 0.5946892097592353,
      "num_tokens": 2584298.0,
      "step": 330
    },
    {
      "entropy": 2.118516767024994,
      "epoch": 2.1664,
      "grad_norm": 0.7417224645614624,
      "learning_rate": 1.824963163956726e-05,
      "loss": 1.5727,
      "mean_token_accuracy": 0.5870080485939979,
      "num_tokens": 2663601.0,
      "step": 340
    },
    {
      "entropy": 2.104418155550957,
      "epoch": 2.2304,
      "grad_norm": 0.7956721782684326,
      "learning_rate": 1.8131221217873175e-05,
      "loss": 1.5575,
      "mean_token_accuracy": 0.5936456203460694,
      "num_tokens": 2744783.0,
      "step": 350
    },
    {
      "entropy": 2.129578319191933,
      "epoch": 2.2944,
      "grad_norm": 0.769292950630188,
      "learning_rate": 1.8009346533640877e-05,
      "loss": 1.5878,
      "mean_token_accuracy": 0.5841517195105552,
      "num_tokens": 2823023.0,
      "step": 360
    },
    {
      "entropy": 2.097687366604805,
      "epoch": 2.3584,
      "grad_norm": 0.9341740608215332,
      "learning_rate": 1.7884059510916167e-05,
      "loss": 1.5346,
      "mean_token_accuracy": 0.599460557103157,
      "num_tokens": 2899598.0,
      "step": 370
    },
    {
      "entropy": 2.151599031686783,
      "epoch": 2.4224,
      "grad_norm": 0.8752340078353882,
      "learning_rate": 1.7755413527553087e-05,
      "loss": 1.5984,
      "mean_token_accuracy": 0.585393351316452,
      "num_tokens": 2978519.0,
      "step": 380
    },
    {
      "entropy": 2.1223404884338377,
      "epoch": 2.4864,
      "grad_norm": 1.0296390056610107,
      "learning_rate": 1.7623463392472574e-05,
      "loss": 1.5232,
      "mean_token_accuracy": 0.595654422044754,
      "num_tokens": 3055327.0,
      "step": 390
    },
    {
      "entropy": 2.16276493370533,
      "epoch": 2.5504,
      "grad_norm": 0.9905762672424316,
      "learning_rate": 1.748826532231142e-05,
      "loss": 1.6049,
      "mean_token_accuracy": 0.5822189599275589,
      "num_tokens": 3135348.0,
      "step": 400
    },
    {
      "entropy": 2.127479985356331,
      "epoch": 2.6144,
      "grad_norm": 0.851375162601471,
      "learning_rate": 1.7349876917471474e-05,
      "loss": 1.5842,
      "mean_token_accuracy": 0.5855211839079857,
      "num_tokens": 3213122.0,
      "step": 410
    },
    {
      "entropy": 2.167752879858017,
      "epoch": 2.6784,
      "grad_norm": 0.975143313407898,
      "learning_rate": 1.7208357137579318e-05,
      "loss": 1.5918,
      "mean_token_accuracy": 0.5839722648262977,
      "num_tokens": 3289583.0,
      "step": 420
    },
    {
      "entropy": 2.127084198594093,
      "epoch": 2.7424,
      "grad_norm": 0.8077936768531799,
      "learning_rate": 1.7063766276366814e-05,
      "loss": 1.5916,
      "mean_token_accuracy": 0.5900941833853721,
      "num_tokens": 3369740.0,
      "step": 430
    },
    {
      "entropy": 2.1315969794988634,
      "epoch": 2.8064,
      "grad_norm": 0.9403624534606934,
      "learning_rate": 1.6916165935983323e-05,
      "loss": 1.5713,
      "mean_token_accuracy": 0.5892721861600876,
      "num_tokens": 3448328.0,
      "step": 440
    },
    {
      "entropy": 2.130605939030647,
      "epoch": 2.8704,
      "grad_norm": 0.8252040147781372,
      "learning_rate": 1.676561900075041e-05,
      "loss": 1.6003,
      "mean_token_accuracy": 0.5845118075609207,
      "num_tokens": 3529853.0,
      "step": 450
    },
    {
      "entropy": 2.112012493610382,
      "epoch": 2.9344,
      "grad_norm": 0.9267668724060059,
      "learning_rate": 1.6612189610370336e-05,
      "loss": 1.5796,
      "mean_token_accuracy": 0.5887707889080047,
      "num_tokens": 3610922.0,
      "step": 460
    },
    {
      "entropy": 2.100590059161186,
      "epoch": 2.9984,
      "grad_norm": 0.8996879458427429,
      "learning_rate": 1.6455943132599698e-05,
      "loss": 1.5483,
      "mean_token_accuracy": 0.5934251204133034,
      "num_tokens": 3688391.0,
      "step": 470
    },
    {
      "entropy": 2.1115864160898568,
      "epoch": 3.0576,
      "grad_norm": 1.097270131111145,
      "learning_rate": 1.6296946135399835e-05,
      "loss": 1.5506,
      "mean_token_accuracy": 0.592829834770512,
      "num_tokens": 3758747.0,
      "step": 480
    },
    {
      "entropy": 2.0610430628061294,
      "epoch": 3.1216,
      "grad_norm": 1.176645278930664,
      "learning_rate": 1.613526635857591e-05,
      "loss": 1.4461,
      "mean_token_accuracy": 0.6111307457089424,
      "num_tokens": 3834689.0,
      "step": 490
    },
    {
      "entropy": 2.0154007196426393,
      "epoch": 3.1856,
      "grad_norm": 1.1834276914596558,
      "learning_rate": 1.5970972684916754e-05,
      "loss": 1.4852,
      "mean_token_accuracy": 0.6026980608701706,
      "num_tokens": 3916450.0,
      "step": 500
    },
    {
      "entropy": 2.0441433399915696,
      "epoch": 3.2496,
      "grad_norm": 1.159286379814148,
      "learning_rate": 1.5804135110847708e-05,
      "loss": 1.4978,
      "mean_token_accuracy": 0.6042912915349007,
      "num_tokens": 3998511.0,
      "step": 510
    },
    {
      "entropy": 2.0493109285831452,
      "epoch": 3.3136,
      "grad_norm": 1.2141708135604858,
      "learning_rate": 1.5634824716609037e-05,
      "loss": 1.5018,
      "mean_token_accuracy": 0.5995921581983567,
      "num_tokens": 4077676.0,
      "step": 520
    },
    {
      "entropy": 2.0533218771219253,
      "epoch": 3.3776,
      "grad_norm": 1.1630637645721436,
      "learning_rate": 1.5463113635972577e-05,
      "loss": 1.499,
      "mean_token_accuracy": 0.6046154126524925,
      "num_tokens": 4155264.0,
      "step": 530
    },
    {
      "entropy": 2.0600034058094026,
      "epoch": 3.4416,
      "grad_norm": 1.2523504495620728,
      "learning_rate": 1.528907502550954e-05,
      "loss": 1.521,
      "mean_token_accuracy": 0.6000443026423454,
      "num_tokens": 4233655.0,
      "step": 540
    },
    {
      "entropy": 2.0414596855640412,
      "epoch": 3.5056000000000003,
      "grad_norm": 1.3990252017974854,
      "learning_rate": 1.5112783033422547e-05,
      "loss": 1.4899,
      "mean_token_accuracy": 0.6026965886354446,
      "num_tokens": 4311644.0,
      "step": 550
    },
    {
      "entropy": 2.061043033003807,
      "epoch": 3.5696,
      "grad_norm": 1.1884260177612305,
      "learning_rate": 1.4934312767955193e-05,
      "loss": 1.5143,
      "mean_token_accuracy": 0.5981319859623909,
      "num_tokens": 4390933.0,
      "step": 560
    },
    {
      "entropy": 2.034099668264389,
      "epoch": 3.6336,
      "grad_norm": 1.2996599674224854,
      "learning_rate": 1.4753740265392595e-05,
      "loss": 1.4953,
      "mean_token_accuracy": 0.6029247522354126,
      "num_tokens": 4470462.0,
      "step": 570
    },
    {
      "entropy": 2.0379767954349517,
      "epoch": 3.6976,
      "grad_norm": 1.2936193943023682,
      "learning_rate": 1.4571142457666536e-05,
      "loss": 1.4965,
      "mean_token_accuracy": 0.6041712030768395,
      "num_tokens": 4549236.0,
      "step": 580
    },
    {
      "entropy": 2.040063351392746,
      "epoch": 3.7616,
      "grad_norm": 1.5094560384750366,
      "learning_rate": 1.4386597139579041e-05,
      "loss": 1.4979,
      "mean_token_accuracy": 0.6051288455724716,
      "num_tokens": 4628758.0,
      "step": 590
    },
    {
      "entropy": 1.9998936265707017,
      "epoch": 3.8256,
      "grad_norm": 1.3166426420211792,
      "learning_rate": 1.4200182935658327e-05,
      "loss": 1.459,
      "mean_token_accuracy": 0.6084850415587425,
      "num_tokens": 4708526.0,
      "step": 600
    },
    {
      "entropy": 2.0041965901851655,
      "epoch": 3.8895999999999997,
      "grad_norm": 1.2710400819778442,
      "learning_rate": 1.4011979266661235e-05,
      "loss": 1.4831,
      "mean_token_accuracy": 0.6057328775525093,
      "num_tokens": 4788733.0,
      "step": 610
    },
    {
      "entropy": 2.0265558779239656,
      "epoch": 3.9536,
      "grad_norm": 1.4318969249725342,
      "learning_rate": 1.3822066315736477e-05,
      "loss": 1.4966,
      "mean_token_accuracy": 0.5994595810770988,
      "num_tokens": 4866451.0,
      "step": 620
    },
    {
      "entropy": 2.0692459924800977,
      "epoch": 4.0128,
      "grad_norm": 1.2546013593673706,
      "learning_rate": 1.363052499426302e-05,
      "loss": 1.503,
      "mean_token_accuracy": 0.6039850309088424,
      "num_tokens": 4936715.0,
      "step": 630
    },
    {
      "entropy": 1.9788923293352128,
      "epoch": 4.0768,
      "grad_norm": 1.416927456855774,
      "learning_rate": 1.3437436907378225e-05,
      "loss": 1.4248,
      "mean_token_accuracy": 0.6142558038234711,
      "num_tokens": 5016713.0,
      "step": 640
    },
    {
      "entropy": 1.9646029412746429,
      "epoch": 4.1408,
      "grad_norm": 1.5146726369857788,
      "learning_rate": 1.3242884319210463e-05,
      "loss": 1.3875,
      "mean_token_accuracy": 0.624424883723259,
      "num_tokens": 5096513.0,
      "step": 650
    },
    {
      "entropy": 1.93471617102623,
      "epoch": 4.2048,
      "grad_norm": 1.5090768337249756,
      "learning_rate": 1.3046950117830888e-05,
      "loss": 1.3884,
      "mean_token_accuracy": 0.6222448632121086,
      "num_tokens": 5177075.0,
      "step": 660
    },
    {
      "entropy": 2.002266028523445,
      "epoch": 4.2688,
      "grad_norm": 1.74358332157135,
      "learning_rate": 1.2849717779939439e-05,
      "loss": 1.4062,
      "mean_token_accuracy": 0.6180147424340248,
      "num_tokens": 5252902.0,
      "step": 670
    },
    {
      "entropy": 1.9397415190935134,
      "epoch": 4.3328,
      "grad_norm": 1.774728775024414,
      "learning_rate": 1.2651271335300063e-05,
      "loss": 1.3933,
      "mean_token_accuracy": 0.626343595981598,
      "num_tokens": 5331448.0,
      "step": 680
    },
    {
      "entropy": 1.9571841150522231,
      "epoch": 4.3968,
      "grad_norm": 1.80965256690979,
      "learning_rate": 1.2451695330940268e-05,
      "loss": 1.4205,
      "mean_token_accuracy": 0.6187710732221603,
      "num_tokens": 5410857.0,
      "step": 690
    },
    {
      "entropy": 1.9691186994314194,
      "epoch": 4.4608,
      "grad_norm": 1.5400609970092773,
      "learning_rate": 1.2251074795130339e-05,
      "loss": 1.4123,
      "mean_token_accuracy": 0.614769059419632,
      "num_tokens": 5488867.0,
      "step": 700
    },
    {
      "entropy": 1.9635825514793397,
      "epoch": 4.5248,
      "grad_norm": 1.467608094215393,
      "learning_rate": 1.2049495201157489e-05,
      "loss": 1.4228,
      "mean_token_accuracy": 0.6202724784612655,
      "num_tokens": 5567515.0,
      "step": 710
    },
    {
      "entropy": 1.9384470194578172,
      "epoch": 4.5888,
      "grad_norm": 1.652387022972107,
      "learning_rate": 1.1847042430910451e-05,
      "loss": 1.4273,
      "mean_token_accuracy": 0.6190450325608253,
      "num_tokens": 5648858.0,
      "step": 720
    },
    {
      "entropy": 1.9911590039730072,
      "epoch": 4.6528,
      "grad_norm": 1.7492380142211914,
      "learning_rate": 1.1643802738289955e-05,
      "loss": 1.4776,
      "mean_token_accuracy": 0.6073927089571953,
      "num_tokens": 5725459.0,
      "step": 730
    },
    {
      "entropy": 1.9724233269691467,
      "epoch": 4.7168,
      "grad_norm": 1.709669828414917,
      "learning_rate": 1.1439862712460721e-05,
      "loss": 1.4217,
      "mean_token_accuracy": 0.6184087961912155,
      "num_tokens": 5801601.0,
      "step": 740
    },
    {
      "entropy": 1.9725236982107162,
      "epoch": 4.7808,
      "grad_norm": 1.7469470500946045,
      "learning_rate": 1.1235309240960621e-05,
      "loss": 1.405,
      "mean_token_accuracy": 0.6196158319711685,
      "num_tokens": 5881107.0,
      "step": 750
    },
    {
      "entropy": 1.9484833419322967,
      "epoch": 4.8448,
      "grad_norm": 1.532373309135437,
      "learning_rate": 1.1030229472682719e-05,
      "loss": 1.4155,
      "mean_token_accuracy": 0.611663281917572,
      "num_tokens": 5960375.0,
      "step": 760
    },
    {
      "entropy": 1.9964754343032838,
      "epoch": 4.9088,
      "grad_norm": 1.7157669067382812,
      "learning_rate": 1.0824710780745954e-05,
      "loss": 1.4295,
      "mean_token_accuracy": 0.6131752103567123,
      "num_tokens": 6038267.0,
      "step": 770
    },
    {
      "entropy": 1.9598666340112687,
      "epoch": 4.9728,
      "grad_norm": 1.9844586849212646,
      "learning_rate": 1.06188407252703e-05,
      "loss": 1.397,
      "mean_token_accuracy": 0.6226776748895645,
      "num_tokens": 6114749.0,
      "step": 780
    },
    {
      "entropy": 1.9227982276194804,
      "epoch": 5.032,
      "grad_norm": 1.8960447311401367,
      "learning_rate": 1.0412707016072254e-05,
      "loss": 1.3649,
      "mean_token_accuracy": 0.6269845414806057,
      "num_tokens": 6190567.0,
      "step": 790
    },
    {
      "entropy": 1.9008578658103943,
      "epoch": 5.096,
      "grad_norm": 2.1205599308013916,
      "learning_rate": 1.0206397475296548e-05,
      "loss": 1.3582,
      "mean_token_accuracy": 0.6292989999055862,
      "num_tokens": 6269285.0,
      "step": 800
    },
    {
      "entropy": 1.9224162876605988,
      "epoch": 5.16,
      "grad_norm": 2.0454013347625732,
      "learning_rate": 1e-05,
      "loss": 1.3349,
      "mean_token_accuracy": 0.6315066292881966,
      "num_tokens": 6345352.0,
      "step": 810
    },
    {
      "entropy": 1.9340467154979706,
      "epoch": 5.224,
      "grad_norm": 2.2607693672180176,
      "learning_rate": 9.793602524703456e-06,
      "loss": 1.359,
      "mean_token_accuracy": 0.6322078078985214,
      "num_tokens": 6422524.0,
      "step": 820
    },
    {
      "entropy": 1.9296668291091919,
      "epoch": 5.288,
      "grad_norm": 2.1245901584625244,
      "learning_rate": 9.58729298392775e-06,
      "loss": 1.3672,
      "mean_token_accuracy": 0.6282135233283043,
      "num_tokens": 6500128.0,
      "step": 830
    },
    {
      "entropy": 1.9272812247276305,
      "epoch": 5.352,
      "grad_norm": 1.965820550918579,
      "learning_rate": 9.381159274729704e-06,
      "loss": 1.3786,
      "mean_token_accuracy": 0.6249860525131226,
      "num_tokens": 6578766.0,
      "step": 840
    },
    {
      "entropy": 1.904970219731331,
      "epoch": 5.416,
      "grad_norm": 1.9188759326934814,
      "learning_rate": 9.175289219254051e-06,
      "loss": 1.3418,
      "mean_token_accuracy": 0.6325456693768501,
      "num_tokens": 6658732.0,
      "step": 850
    },
    {
      "entropy": 1.8833305448293687,
      "epoch": 5.48,
      "grad_norm": 1.9675428867340088,
      "learning_rate": 8.969770527317283e-06,
      "loss": 1.3274,
      "mean_token_accuracy": 0.6377805054187775,
      "num_tokens": 6738683.0,
      "step": 860
    },
    {
      "entropy": 1.8806802958250046,
      "epoch": 5.5440000000000005,
      "grad_norm": 1.8849304914474487,
      "learning_rate": 8.764690759039382e-06,
      "loss": 1.3109,
      "mean_token_accuracy": 0.636364534497261,
      "num_tokens": 6817786.0,
      "step": 870
    },
    {
      "entropy": 1.8846195042133331,
      "epoch": 5.608,
      "grad_norm": 2.050208330154419,
      "learning_rate": 8.56013728753928e-06,
      "loss": 1.3449,
      "mean_token_accuracy": 0.6316975012421608,
      "num_tokens": 6896222.0,
      "step": 880
    },
    {
      "entropy": 1.88524529337883,
      "epoch": 5.672,
      "grad_norm": 2.1371288299560547,
      "learning_rate": 8.356197261710048e-06,
      "loss": 1.346,
      "mean_token_accuracy": 0.633928644657135,
      "num_tokens": 6976885.0,
      "step": 890
    },
    {
      "entropy": 1.9162244260311128,
      "epoch": 5.736,
      "grad_norm": 1.9879032373428345,
      "learning_rate": 8.152957569089552e-06,
      "loss": 1.3486,
      "mean_token_accuracy": 0.6311523199081421,
      "num_tokens": 7053473.0,
      "step": 900
    },
    {
      "entropy": 1.89161317050457,
      "epoch": 5.8,
      "grad_norm": 2.2934179306030273,
      "learning_rate": 7.950504798842513e-06,
      "loss": 1.3699,
      "mean_token_accuracy": 0.6269390240311623,
      "num_tokens": 7133137.0,
      "step": 910
    },
    {
      "entropy": 1.888116827607155,
      "epoch": 5.864,
      "grad_norm": 1.769087791442871,
      "learning_rate": 7.748925204869667e-06,
      "loss": 1.3756,
      "mean_token_accuracy": 0.6285945609211921,
      "num_tokens": 7213693.0,
      "step": 920
    },
    {
      "entropy": 1.89390210211277,
      "epoch": 5.928,
      "grad_norm": 2.2577364444732666,
      "learning_rate": 7.548304669059735e-06,
      "loss": 1.3396,
      "mean_token_accuracy": 0.6290415957570076,
      "num_tokens": 7291999.0,
      "step": 930
    },
    {
      "entropy": 1.8755547761917115,
      "epoch": 5.992,
      "grad_norm": 2.080371618270874,
      "learning_rate": 7.348728664699939e-06,
      "loss": 1.3305,
      "mean_token_accuracy": 0.6322756335139275,
      "num_tokens": 7370138.0,
      "step": 940
    },
    {
      "entropy": 1.8396991845723745,
      "epoch": 6.0512,
      "grad_norm": 2.222177028656006,
      "learning_rate": 7.150282220060564e-06,
      "loss": 1.2782,
      "mean_token_accuracy": 0.6437820018948736,
      "num_tokens": 7444764.0,
      "step": 950
    },
    {
      "entropy": 1.864711531996727,
      "epoch": 6.1152,
      "grad_norm": 2.236663579940796,
      "learning_rate": 6.9530498821691165e-06,
      "loss": 1.342,
      "mean_token_accuracy": 0.6400286257266998,
      "num_tokens": 7523012.0,
      "step": 960
    },
    {
      "entropy": 1.8462383985519408,
      "epoch": 6.1792,
      "grad_norm": 2.438649892807007,
      "learning_rate": 6.757115680789539e-06,
      "loss": 1.2769,
      "mean_token_accuracy": 0.6437345445156097,
      "num_tokens": 7602451.0,
      "step": 970
    },
    {
      "entropy": 1.8425735771656035,
      "epoch": 6.2432,
      "grad_norm": 2.306880235671997,
      "learning_rate": 6.562563092621776e-06,
      "loss": 1.309,
      "mean_token_accuracy": 0.6463457986712455,
      "num_tokens": 7681972.0,
      "step": 980
    },
    {
      "entropy": 1.8139673799276352,
      "epoch": 6.3072,
      "grad_norm": 2.286114454269409,
      "learning_rate": 6.369475005736984e-06,
      "loss": 1.2748,
      "mean_token_accuracy": 0.6487143859267235,
      "num_tokens": 7762845.0,
      "step": 990
    },
    {
      "entropy": 1.8660429507493972,
      "epoch": 6.3712,
      "grad_norm": 2.421706199645996,
      "learning_rate": 6.177933684263524e-06,
      "loss": 1.2964,
      "mean_token_accuracy": 0.6455973491072655,
      "num_tokens": 7839552.0,
      "step": 1000
    },
    {
      "entropy": 1.8517659038305283,
      "epoch": 6.4352,
      "grad_norm": 2.3891334533691406,
      "learning_rate": 5.988020733338767e-06,
      "loss": 1.2893,
      "mean_token_accuracy": 0.6442387655377388,
      "num_tokens": 7915996.0,
      "step": 1010
    },
    {
      "entropy": 1.8547363132238388,
      "epoch": 6.4992,
      "grad_norm": 2.26686429977417,
      "learning_rate": 5.7998170643416795e-06,
      "loss": 1.2973,
      "mean_token_accuracy": 0.6435917019844055,
      "num_tokens": 7995119.0,
      "step": 1020
    },
    {
      "entropy": 1.8365773737430573,
      "epoch": 6.5632,
      "grad_norm": 2.1454896926879883,
      "learning_rate": 5.613402860420962e-06,
      "loss": 1.2744,
      "mean_token_accuracy": 0.6410152271389962,
      "num_tokens": 8075306.0,
      "step": 1030
    },
    {
      "entropy": 1.8936803489923477,
      "epoch": 6.6272,
      "grad_norm": 2.5226423740386963,
      "learning_rate": 5.428857542333465e-06,
      "loss": 1.3225,
      "mean_token_accuracy": 0.6396260514855385,
      "num_tokens": 8152449.0,
      "step": 1040
    },
    {
      "entropy": 1.8555004209280015,
      "epoch": 6.6912,
      "grad_norm": 2.216014862060547,
      "learning_rate": 5.246259734607411e-06,
      "loss": 1.299,
      "mean_token_accuracy": 0.641279113292694,
      "num_tokens": 8231904.0,
      "step": 1050
    },
    {
      "entropy": 1.8588679373264312,
      "epoch": 6.7552,
      "grad_norm": 2.4265236854553223,
      "learning_rate": 5.065687232044811e-06,
      "loss": 1.3026,
      "mean_token_accuracy": 0.6363563358783721,
      "num_tokens": 8310755.0,
      "step": 1060
    },
    {
      "entropy": 1.8318012267351151,
      "epoch": 6.8192,
      "grad_norm": 2.2089412212371826,
      "learning_rate": 4.887216966577458e-06,
      "loss": 1.2583,
      "mean_token_accuracy": 0.6502064153552055,
      "num_tokens": 8390161.0,
      "step": 1070
    },
    {
      "entropy": 1.8765722244977951,
      "epoch": 6.8832,
      "grad_norm": 2.3233554363250732,
      "learning_rate": 4.710924974490463e-06,
      "loss": 1.3223,
      "mean_token_accuracy": 0.6393219083547592,
      "num_tokens": 8469078.0,
      "step": 1080
    },
    {
      "entropy": 1.8413788318634032,
      "epoch": 6.9472000000000005,
      "grad_norm": 2.321904420852661,
      "learning_rate": 4.536886364027428e-06,
      "loss": 1.272,
      "mean_token_accuracy": 0.647525629401207,
      "num_tokens": 8547873.0,
      "step": 1090
    },
    {
      "entropy": 1.8666607818088017,
      "epoch": 7.0064,
      "grad_norm": 2.1004791259765625,
      "learning_rate": 4.365175283390968e-06,
      "loss": 1.2721,
      "mean_token_accuracy": 0.6479364424138456,
      "num_tokens": 8619109.0,
      "step": 1100
    },
    {
      "entropy": 1.8333647519350051,
      "epoch": 7.0704,
      "grad_norm": 2.9210190773010254,
      "learning_rate": 4.195864889152295e-06,
      "loss": 1.1833,
      "mean_token_accuracy": 0.6699477419257164,
      "num_tokens": 8692475.0,
      "step": 1110
    },
    {
      "entropy": 1.8425445258617401,
      "epoch": 7.1344,
      "grad_norm": 2.3149521350860596,
      "learning_rate": 4.029027315083251e-06,
      "loss": 1.2707,
      "mean_token_accuracy": 0.650185227394104,
      "num_tokens": 8770456.0,
      "step": 1120
    },
    {
      "entropy": 1.8121359765529632,
      "epoch": 7.1984,
      "grad_norm": 2.6502795219421387,
      "learning_rate": 3.864733641424093e-06,
      "loss": 1.2383,
      "mean_token_accuracy": 0.6547705471515656,
      "num_tokens": 8851214.0,
      "step": 1130
    },
    {
      "entropy": 1.802490884065628,
      "epoch": 7.2624,
      "grad_norm": 2.227534770965576,
      "learning_rate": 3.703053864600169e-06,
      "loss": 1.2603,
      "mean_token_accuracy": 0.6489648431539535,
      "num_tokens": 8932363.0,
      "step": 1140
    },
    {
      "entropy": 1.8214709132909774,
      "epoch": 7.3264,
      "grad_norm": 2.5923874378204346,
      "learning_rate": 3.544056867400306e-06,
      "loss": 1.248,
      "mean_token_accuracy": 0.651621387898922,
      "num_tokens": 9011734.0,
      "step": 1150
    },
    {
      "entropy": 1.826240959763527,
      "epoch": 7.3904,
      "grad_norm": 2.67551589012146,
      "learning_rate": 3.3878103896296677e-06,
      "loss": 1.2488,
      "mean_token_accuracy": 0.6530374586582184,
      "num_tokens": 9090277.0,
      "step": 1160
    },
    {
      "entropy": 1.837952870130539,
      "epoch": 7.4544,
      "grad_norm": 2.2191765308380127,
      "learning_rate": 3.2343809992495945e-06,
      "loss": 1.2704,
      "mean_token_accuracy": 0.6503957703709602,
      "num_tokens": 9168093.0,
      "step": 1170
    },
    {
      "entropy": 1.8135560542345046,
      "epoch": 7.5184,
      "grad_norm": 2.5211071968078613,
      "learning_rate": 3.083834064016682e-06,
      "loss": 1.2212,
      "mean_token_accuracy": 0.6587097644805908,
      "num_tokens": 9247777.0,
      "step": 1180
    },
    {
      "entropy": 1.8237973660230637,
      "epoch": 7.5824,
      "grad_norm": 2.6236841678619385,
      "learning_rate": 2.9362337236331884e-06,
      "loss": 1.2604,
      "mean_token_accuracy": 0.6501624628901481,
      "num_tokens": 9325367.0,
      "step": 1190
    },
    {
      "entropy": 1.836614164710045,
      "epoch": 7.6464,
      "grad_norm": 2.726731777191162,
      "learning_rate": 2.791642862420686e-06,
      "loss": 1.2554,
      "mean_token_accuracy": 0.6520631939172745,
      "num_tokens": 9403641.0,
      "step": 1200
    },
    {
      "entropy": 1.8044064462184906,
      "epoch": 7.7104,
      "grad_norm": 2.4943737983703613,
      "learning_rate": 2.6501230825285294e-06,
      "loss": 1.2519,
      "mean_token_accuracy": 0.6524736672639847,
      "num_tokens": 9484075.0,
      "step": 1210
    },
    {
      "entropy": 1.8258908241987228,
      "epoch": 7.7744,
      "grad_norm": 2.4426612854003906,
      "learning_rate": 2.5117346776885843e-06,
      "loss": 1.251,
      "mean_token_accuracy": 0.6484281331300735,
      "num_tokens": 9561148.0,
      "step": 1220
    },
    {
      "entropy": 1.8062447488307953,
      "epoch": 7.8384,
      "grad_norm": 2.465646266937256,
      "learning_rate": 2.3765366075274287e-06,
      "loss": 1.2662,
      "mean_token_accuracy": 0.6492940753698349,
      "num_tokens": 9642108.0,
      "step": 1230
    },
    {
      "entropy": 1.8293108910322189,
      "epoch": 7.9024,
      "grad_norm": 2.4230668544769287,
      "learning_rate": 2.2445864724469146e-06,
      "loss": 1.2625,
      "mean_token_accuracy": 0.6592240884900094,
      "num_tokens": 9719660.0,
      "step": 1240
    },
    {
      "entropy": 1.837513843178749,
      "epoch": 7.9664,
      "grad_norm": 2.7502171993255615,
      "learning_rate": 2.1159404890838365e-06,
      "loss": 1.2677,
      "mean_token_accuracy": 0.6493206784129143,
      "num_tokens": 9797593.0,
      "step": 1250
    },
    {
      "entropy": 1.8162316245001715,
      "epoch": 8.0256,
      "grad_norm": 2.5199058055877686,
      "learning_rate": 1.990653466359125e-06,
      "loss": 1.2293,
      "mean_token_accuracy": 0.656300467413825,
      "num_tokens": 9871177.0,
      "step": 1260
    },
    {
      "entropy": 1.780314788222313,
      "epoch": 8.0896,
      "grad_norm": 2.5237162113189697,
      "learning_rate": 1.8687787821268255e-06,
      "loss": 1.1791,
      "mean_token_accuracy": 0.6675050809979439,
      "num_tokens": 9949391.0,
      "step": 1270
    },
    {
      "entropy": 1.779639583826065,
      "epoch": 8.1536,
      "grad_norm": 2.4559428691864014,
      "learning_rate": 1.7503683604327426e-06,
      "loss": 1.2177,
      "mean_token_accuracy": 0.6600575730204582,
      "num_tokens": 10030182.0,
      "step": 1280
    },
    {
      "entropy": 1.7865025967359542,
      "epoch": 8.2176,
      "grad_norm": 2.9508230686187744,
      "learning_rate": 1.6354726493924745e-06,
      "loss": 1.1937,
      "mean_token_accuracy": 0.6630557537078857,
      "num_tokens": 10107960.0,
      "step": 1290
    },
    {
      "entropy": 1.8122670024633407,
      "epoch": 8.2816,
      "grad_norm": 2.6917898654937744,
      "learning_rate": 1.5241405996982928e-06,
      "loss": 1.2319,
      "mean_token_accuracy": 0.6598842918872834,
      "num_tokens": 10185524.0,
      "step": 1300
    },
    {
      "entropy": 1.806730917096138,
      "epoch": 8.3456,
      "grad_norm": 2.7887086868286133,
      "learning_rate": 1.4164196437639355e-06,
      "loss": 1.25,
      "mean_token_accuracy": 0.6578737393021583,
      "num_tokens": 10265123.0,
      "step": 1310
    },
    {
      "entropy": 1.8156007081270218,
      "epoch": 8.4096,
      "grad_norm": 2.9965310096740723,
      "learning_rate": 1.3123556755163114e-06,
      "loss": 1.234,
      "mean_token_accuracy": 0.6579165816307068,
      "num_tokens": 10342205.0,
      "step": 1320
    },
    {
      "entropy": 1.8044028550386428,
      "epoch": 8.4736,
      "grad_norm": 2.9466843605041504,
      "learning_rate": 1.2119930308426264e-06,
      "loss": 1.2423,
      "mean_token_accuracy": 0.6527451828122139,
      "num_tokens": 10420603.0,
      "step": 1330
    },
    {
      "entropy": 1.8250535994768142,
      "epoch": 8.5376,
      "grad_norm": 2.9452784061431885,
      "learning_rate": 1.1153744687013313e-06,
      "loss": 1.258,
      "mean_token_accuracy": 0.6589037463068962,
      "num_tokens": 10499049.0,
      "step": 1340
    },
    {
      "entropy": 1.7990799486637115,
      "epoch": 8.6016,
      "grad_norm": 2.6469309329986572,
      "learning_rate": 1.0225411529048857e-06,
      "loss": 1.2415,
      "mean_token_accuracy": 0.6555879130959511,
      "num_tokens": 10578178.0,
      "step": 1350
    },
    {
      "entropy": 1.7638877242803574,
      "epoch": 8.6656,
      "grad_norm": 2.77990460395813,
      "learning_rate": 9.33532634582156e-07,
      "loss": 1.2143,
      "mean_token_accuracy": 0.6589834168553352,
      "num_tokens": 10659018.0,
      "step": 1360
    },
    {
      "entropy": 1.8123771637678145,
      "epoch": 8.7296,
      "grad_norm": 3.1158993244171143,
      "learning_rate": 8.483868353278657e-07,
      "loss": 1.2358,
      "mean_token_accuracy": 0.6561313390731811,
      "num_tokens": 10736582.0,
      "step": 1370
    },
    {
      "entropy": 1.8054670304059983,
      "epoch": 8.7936,
      "grad_norm": 2.915422201156616,
      "learning_rate": 7.671400310462984e-07,
      "loss": 1.2089,
      "mean_token_accuracy": 0.6610309720039368,
      "num_tokens": 10814534.0,
      "step": 1380
    },
    {
      "entropy": 1.800497230887413,
      "epoch": 8.8576,
      "grad_norm": 2.7816338539123535,
      "learning_rate": 6.898268364961591e-07,
      "loss": 1.2227,
      "mean_token_accuracy": 0.6584793984889984,
      "num_tokens": 10893484.0,
      "step": 1390
    },
    {
      "entropy": 1.7850348353385925,
      "epoch": 8.9216,
      "grad_norm": 2.569054126739502,
      "learning_rate": 6.164801905431394e-07,
      "loss": 1.2242,
      "mean_token_accuracy": 0.6574000924825668,
      "num_tokens": 10973818.0,
      "step": 1400
    },
    {
      "entropy": 1.795585972070694,
      "epoch": 8.9856,
      "grad_norm": 2.6529977321624756,
      "learning_rate": 5.471313421264879e-07,
      "loss": 1.2127,
      "mean_token_accuracy": 0.6600923746824264,
      "num_tokens": 11051396.0,
      "step": 1410
    },
    {
      "entropy": 1.8172799606580992,
      "epoch": 9.0448,
      "grad_norm": 2.719399929046631,
      "learning_rate": 4.818098369455793e-07,
      "loss": 1.2758,
      "mean_token_accuracy": 0.6563853702029666,
      "num_tokens": 11124338.0,
      "step": 1420
    },
    {
      "entropy": 1.7982712090015411,
      "epoch": 9.1088,
      "grad_norm": 2.516369581222534,
      "learning_rate": 4.20543504872124e-07,
      "loss": 1.2054,
      "mean_token_accuracy": 0.6637166649103164,
      "num_tokens": 11202233.0,
      "step": 1430
    },
    {
      "entropy": 1.795827680826187,
      "epoch": 9.1728,
      "grad_norm": 3.1275811195373535,
      "learning_rate": 3.633584480934016e-07,
      "loss": 1.1907,
      "mean_token_accuracy": 0.6667785882949829,
      "num_tokens": 11279587.0,
      "step": 1440
    },
    {
      "entropy": 1.7738620430231093,
      "epoch": 9.2368,
      "grad_norm": 2.6204917430877686,
      "learning_rate": 3.1027902999157146e-07,
      "loss": 1.2156,
      "mean_token_accuracy": 0.6609065368771553,
      "num_tokens": 11360243.0,
      "step": 1450
    },
    {
      "entropy": 1.7866268098354339,
      "epoch": 9.3008,
      "grad_norm": 2.763274908065796,
      "learning_rate": 2.61327864763784e-07,
      "loss": 1.2109,
      "mean_token_accuracy": 0.6604277700185776,
      "num_tokens": 11440689.0,
      "step": 1460
    },
    {
      "entropy": 1.7874858051538467,
      "epoch": 9.3648,
      "grad_norm": 2.725693464279175,
      "learning_rate": 2.1652580778751875e-07,
      "loss": 1.2379,
      "mean_token_accuracy": 0.6640482068061828,
      "num_tokens": 11520425.0,
      "step": 1470
    },
    {
      "entropy": 1.7999387830495834,
      "epoch": 9.4288,
      "grad_norm": 2.849959135055542,
      "learning_rate": 1.758919467352771e-07,
      "loss": 1.2453,
      "mean_token_accuracy": 0.652675162255764,
      "num_tokens": 11600907.0,
      "step": 1480
    },
    {
      "entropy": 1.7698762983083725,
      "epoch": 9.4928,
      "grad_norm": 2.384965419769287,
      "learning_rate": 1.3944359344237214e-07,
      "loss": 1.2038,
      "mean_token_accuracy": 0.6633729308843612,
      "num_tokens": 11680986.0,
      "step": 1490
    },
    {
      "entropy": 1.7817810475826263,
      "epoch": 9.556799999999999,
      "grad_norm": 2.6171748638153076,
      "learning_rate": 1.0719627653131948e-07,
      "loss": 1.2052,
      "mean_token_accuracy": 0.662623830139637,
      "num_tokens": 11759645.0,
      "step": 1500
    },
    {
      "entropy": 1.7850700795650483,
      "epoch": 9.6208,
      "grad_norm": 2.759584426879883,
      "learning_rate": 7.916373479595507e-08,
      "loss": 1.2011,
      "mean_token_accuracy": 0.6652053311467171,
      "num_tokens": 11837753.0,
      "step": 1510
    },
    {
      "entropy": 1.8089916795492171,
      "epoch": 9.6848,
      "grad_norm": 2.8050880432128906,
      "learning_rate": 5.535791134809176e-08,
      "loss": 1.218,
      "mean_token_accuracy": 0.6629775419831276,
      "num_tokens": 11915480.0,
      "step": 1520
    },
    {
      "entropy": 1.7926248282194137,
      "epoch": 9.7488,
      "grad_norm": 2.947237491607666,
      "learning_rate": 3.57889485292251e-08,
      "loss": 1.2402,
      "mean_token_accuracy": 0.6560651332139968,
      "num_tokens": 11994204.0,
      "step": 1530
    },
    {
      "entropy": 1.788163235783577,
      "epoch": 9.8128,
      "grad_norm": 3.0515170097351074,
      "learning_rate": 2.046518358944094e-08,
      "loss": 1.2018,
      "mean_token_accuracy": 0.66530032902956,
      "num_tokens": 12070332.0,
      "step": 1540
    },
    {
      "entropy": 1.798910641670227,
      "epoch": 9.8768,
      "grad_norm": 2.6602368354797363,
      "learning_rate": 9.393145135377924e-09,
      "loss": 1.2375,
      "mean_token_accuracy": 0.6554615125060081,
      "num_tokens": 12149645.0,
      "step": 1550
    },
    {
      "entropy": 1.794683536887169,
      "epoch": 9.9408,
      "grad_norm": 2.509953022003174,
      "learning_rate": 2.5775503487501795e-09,
      "loss": 1.2262,
      "mean_token_accuracy": 0.6570453852415085,
      "num_tokens": 12229123.0,
      "step": 1560
    },
    {
      "entropy": 1.7768772969374786,
      "epoch": 10.0,
      "grad_norm": 6.54733419418335,
      "learning_rate": 2.1302976616066616e-11,
      "loss": 1.1615,
      "mean_token_accuracy": 0.6717051477045626,
      "num_tokens": 12300470.0,
      "step": 1570
    }
  ],
  "logging_steps": 10,
  "max_steps": 1570,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1844416691327468e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}