|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.012497188132670149, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 2.4994376265340298e-05, |
|
"grad_norm": 22.25, |
|
"learning_rate": 0.0008, |
|
"loss": 12.4316, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00012497188132670149, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 0.0007999840035991902, |
|
"loss": 13.3525, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00024994376265340297, |
|
"grad_norm": 4.5, |
|
"learning_rate": 0.000799964008098178, |
|
"loss": 14.8491, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00037491564398010446, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 0.0007999440125971656, |
|
"loss": 8.8491, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0004998875253068059, |
|
"grad_norm": 3.75, |
|
"learning_rate": 0.0007999240170961534, |
|
"loss": 7.9633, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0006248594066335074, |
|
"grad_norm": 4.75, |
|
"learning_rate": 0.0007999040215951412, |
|
"loss": 7.787, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0007498312879602089, |
|
"grad_norm": 15.3125, |
|
"learning_rate": 0.0007998840260941289, |
|
"loss": 7.6196, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0008748031692869104, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 0.0007998640305931166, |
|
"loss": 7.453, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0009997750506136119, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 0.0007998440350921043, |
|
"loss": 7.3384, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0011247469319403134, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 0.000799824039591092, |
|
"loss": 7.2074, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0012497188132670149, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 0.0007998040440900798, |
|
"loss": 7.1685, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0013746906945937163, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 0.0007997840485890674, |
|
"loss": 7.2311, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0014996625759204178, |
|
"grad_norm": 7.28125, |
|
"learning_rate": 0.0007997640530880552, |
|
"loss": 7.1694, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0016246344572471193, |
|
"grad_norm": 3.0, |
|
"learning_rate": 0.0007997440575870429, |
|
"loss": 7.1179, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0017496063385738208, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 0.0007997240620860307, |
|
"loss": 7.1143, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0018745782199005223, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 0.0007997040665850185, |
|
"loss": 7.0387, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0019995501012272238, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 0.0007996840710840061, |
|
"loss": 6.825, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0021245219825539252, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 0.0007996640755829939, |
|
"loss": 6.7811, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.0022494938638806267, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 0.0007996440800819816, |
|
"loss": 6.821, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0023744657452073282, |
|
"grad_norm": 3.0, |
|
"learning_rate": 0.0007996240845809693, |
|
"loss": 6.7552, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.0024994376265340297, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 0.000799604089079957, |
|
"loss": 6.7139, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.002624409507860731, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 0.0007995840935789447, |
|
"loss": 6.7687, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0027493813891874327, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 0.0007995640980779325, |
|
"loss": 6.6156, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.002874353270514134, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 0.0007995441025769203, |
|
"loss": 6.5183, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0029993251518408356, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 0.000799524107075908, |
|
"loss": 6.5023, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.003124297033167537, |
|
"grad_norm": 3.0, |
|
"learning_rate": 0.0007995041115748957, |
|
"loss": 6.3342, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0032492689144942386, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 0.0007994841160738834, |
|
"loss": 6.416, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.00337424079582094, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 0.0007994641205728712, |
|
"loss": 6.3348, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.0034992126771476416, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 0.0007994441250718589, |
|
"loss": 6.3827, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.003624184558474343, |
|
"grad_norm": 2.5, |
|
"learning_rate": 0.0007994241295708465, |
|
"loss": 6.4047, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.0037491564398010446, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 0.0007994041340698343, |
|
"loss": 6.1781, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.003874128321127746, |
|
"grad_norm": 3.375, |
|
"learning_rate": 0.000799384138568822, |
|
"loss": 6.2532, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.0039991002024544475, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 0.0007993641430678099, |
|
"loss": 6.2372, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.004124072083781149, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 0.0007993441475667975, |
|
"loss": 6.0455, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0042490439651078505, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 0.0007993241520657852, |
|
"loss": 6.1694, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.004374015846434552, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 0.000799304156564773, |
|
"loss": 6.045, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.0044989877277612535, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 0.0007992841610637607, |
|
"loss": 6.0104, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.004623959609087955, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 0.0007992641655627485, |
|
"loss": 5.9327, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.0047489314904146564, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 0.0007992441700617361, |
|
"loss": 5.9164, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.004873903371741358, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 0.0007992241745607238, |
|
"loss": 5.9775, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.004998875253068059, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 0.0007992041790597116, |
|
"loss": 5.6958, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.005123847134394761, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 0.0007991841835586994, |
|
"loss": 5.8286, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.005248819015721462, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 0.000799164188057687, |
|
"loss": 5.7053, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.005373790897048164, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 0.0007991441925566748, |
|
"loss": 5.725, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.005498762778374865, |
|
"grad_norm": 1.875, |
|
"learning_rate": 0.0007991241970556625, |
|
"loss": 5.6576, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.005623734659701567, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 0.0007991042015546503, |
|
"loss": 5.7389, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.005748706541028268, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 0.000799084206053638, |
|
"loss": 5.4624, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.00587367842235497, |
|
"grad_norm": 2.625, |
|
"learning_rate": 0.0007990642105526257, |
|
"loss": 5.6441, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.005998650303681671, |
|
"grad_norm": 2.5, |
|
"learning_rate": 0.0007990442150516134, |
|
"loss": 5.552, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.006123622185008373, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 0.0007990242195506011, |
|
"loss": 5.3983, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.006248594066335074, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 0.000799004224049589, |
|
"loss": 5.3757, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.006373565947661776, |
|
"grad_norm": 2.25, |
|
"learning_rate": 0.0007989842285485766, |
|
"loss": 5.4723, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.006498537828988477, |
|
"grad_norm": 2.25, |
|
"learning_rate": 0.0007989642330475643, |
|
"loss": 5.301, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.006623509710315179, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 0.0007989442375465521, |
|
"loss": 5.4499, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.00674848159164188, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 0.0007989242420455398, |
|
"loss": 5.5597, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.006873453472968582, |
|
"grad_norm": 2.25, |
|
"learning_rate": 0.0007989042465445275, |
|
"loss": 5.2123, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.006998425354295283, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 0.0007988842510435152, |
|
"loss": 5.3393, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.007123397235621985, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 0.000798864255542503, |
|
"loss": 5.2593, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.007248369116948686, |
|
"grad_norm": 2.125, |
|
"learning_rate": 0.0007988442600414907, |
|
"loss": 5.2776, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.007373340998275388, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 0.0007988242645404785, |
|
"loss": 5.2529, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.007498312879602089, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 0.0007988042690394662, |
|
"loss": 5.0664, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.007623284760928791, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.0007987842735384539, |
|
"loss": 5.3028, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.007748256642255492, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 0.0007987642780374416, |
|
"loss": 5.2152, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.007873228523582194, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 0.0007987442825364294, |
|
"loss": 4.97, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.007998200404908895, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 0.000798724287035417, |
|
"loss": 5.1328, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.008123172286235597, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 0.0007987042915344048, |
|
"loss": 5.0475, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.008248144167562298, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 0.0007986842960333925, |
|
"loss": 5.009, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.008373116048889, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 0.0007986643005323803, |
|
"loss": 5.0827, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.008498087930215701, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.000798644305031368, |
|
"loss": 5.0896, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.008623059811542403, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 0.0007986243095303557, |
|
"loss": 4.9185, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.008748031692869104, |
|
"grad_norm": 2.25, |
|
"learning_rate": 0.0007986043140293435, |
|
"loss": 5.0279, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.008873003574195806, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 0.0007985843185283312, |
|
"loss": 4.8811, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.008997975455522507, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 0.0007985643230273188, |
|
"loss": 5.0377, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.00912294733684921, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 0.0007985443275263066, |
|
"loss": 4.9903, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.00924791921817591, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 0.0007985243320252943, |
|
"loss": 4.8109, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.009372891099502612, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 0.0007985043365242821, |
|
"loss": 4.8445, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.009497862980829313, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 0.0007984843410232698, |
|
"loss": 4.887, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.009622834862156015, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 0.0007984643455222575, |
|
"loss": 4.8373, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.009747806743482716, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 0.0007984443500212453, |
|
"loss": 4.8948, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.009872778624809418, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 0.000798424354520233, |
|
"loss": 4.9098, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.009997750506136119, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.0007984043590192208, |
|
"loss": 4.6948, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.010122722387462821, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 0.0007983843635182084, |
|
"loss": 4.7561, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.010247694268789522, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 0.0007983643680171961, |
|
"loss": 4.6634, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.010372666150116224, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 0.0007983443725161839, |
|
"loss": 4.6339, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.010497638031442925, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 0.0007983243770151716, |
|
"loss": 4.7292, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.010622609912769627, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.0007983043815141594, |
|
"loss": 4.6211, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.010747581794096328, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.0007982843860131471, |
|
"loss": 4.6857, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.01087255367542303, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 0.0007982643905121348, |
|
"loss": 4.6335, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.01099752555674973, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.0007982443950111226, |
|
"loss": 4.7285, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.011122497438076433, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.0007982243995101103, |
|
"loss": 4.7707, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.011247469319403134, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 0.000798204404009098, |
|
"loss": 4.6769, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.011372441200729836, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.0007981844085080857, |
|
"loss": 4.7311, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.011497413082056537, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 0.0007981644130070734, |
|
"loss": 4.6677, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.011622384963383239, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.0007981444175060612, |
|
"loss": 4.6862, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.01174735684470994, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 0.0007981244220050488, |
|
"loss": 4.5674, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.011872328726036642, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 0.0007981044265040366, |
|
"loss": 4.5283, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.011997300607363343, |
|
"grad_norm": 1.625, |
|
"learning_rate": 0.0007980844310030244, |
|
"loss": 4.5966, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.012122272488690045, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.0007980644355020121, |
|
"loss": 4.4785, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.012247244370016746, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 0.0007980444400009999, |
|
"loss": 4.5966, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.012372216251343448, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 0.0007980244444999875, |
|
"loss": 4.6428, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.012497188132670149, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.0007980044489989753, |
|
"loss": 4.5225, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 200045, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.2266079346688e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|