{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 610,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01639344262295082,
      "grad_norm": 1.9867063760757446,
      "learning_rate": 1.6393442622950818e-05,
      "loss": 0.3745,
      "step": 10
    },
    {
      "epoch": 0.03278688524590164,
      "grad_norm": 1.508779525756836,
      "learning_rate": 3.2786885245901635e-05,
      "loss": 0.2514,
      "step": 20
    },
    {
      "epoch": 0.04918032786885246,
      "grad_norm": 0.43040570616722107,
      "learning_rate": 4.918032786885246e-05,
      "loss": 0.1555,
      "step": 30
    },
    {
      "epoch": 0.06557377049180328,
      "grad_norm": 1.1166633367538452,
      "learning_rate": 6.557377049180327e-05,
      "loss": 0.1932,
      "step": 40
    },
    {
      "epoch": 0.08196721311475409,
      "grad_norm": 0.4990024268627167,
      "learning_rate": 8.19672131147541e-05,
      "loss": 0.1536,
      "step": 50
    },
    {
      "epoch": 0.09836065573770492,
      "grad_norm": 1.1272306442260742,
      "learning_rate": 9.836065573770493e-05,
      "loss": 0.1524,
      "step": 60
    },
    {
      "epoch": 0.11475409836065574,
      "grad_norm": 0.4084867238998413,
      "learning_rate": 9.993370449424153e-05,
      "loss": 0.1391,
      "step": 70
    },
    {
      "epoch": 0.13114754098360656,
      "grad_norm": 0.6679890751838684,
      "learning_rate": 9.970476054107763e-05,
      "loss": 0.1274,
      "step": 80
    },
    {
      "epoch": 0.14754098360655737,
      "grad_norm": 1.0677056312561035,
      "learning_rate": 9.931309898856423e-05,
      "loss": 0.1283,
      "step": 90
    },
    {
      "epoch": 0.16393442622950818,
      "grad_norm": 1.0797268152236938,
      "learning_rate": 9.876000201222912e-05,
      "loss": 0.1317,
      "step": 100
    },
    {
      "epoch": 0.16393442622950818,
      "eval_loss": 0.16001197695732117,
      "eval_runtime": 62.2223,
      "eval_samples_per_second": 1.093,
      "eval_steps_per_second": 1.093,
      "step": 100
    },
    {
      "epoch": 0.18032786885245902,
      "grad_norm": 0.9521363973617554,
      "learning_rate": 9.804728027590449e-05,
      "loss": 0.1882,
      "step": 110
    },
    {
      "epoch": 0.19672131147540983,
      "grad_norm": 0.835111141204834,
      "learning_rate": 9.717726700418842e-05,
      "loss": 0.1557,
      "step": 120
    },
    {
      "epoch": 0.21311475409836064,
      "grad_norm": 0.6795278787612915,
      "learning_rate": 9.61528103442088e-05,
      "loss": 0.161,
      "step": 130
    },
    {
      "epoch": 0.22950819672131148,
      "grad_norm": 0.9582076072692871,
      "learning_rate": 9.497726404169412e-05,
      "loss": 0.1615,
      "step": 140
    },
    {
      "epoch": 0.2459016393442623,
      "grad_norm": 0.9718400239944458,
      "learning_rate": 9.365447646187509e-05,
      "loss": 0.1473,
      "step": 150
    },
    {
      "epoch": 0.26229508196721313,
      "grad_norm": 0.881258487701416,
      "learning_rate": 9.218877799115928e-05,
      "loss": 0.125,
      "step": 160
    },
    {
      "epoch": 0.2786885245901639,
      "grad_norm": 0.9122475385665894,
      "learning_rate": 9.058496686082132e-05,
      "loss": 0.1401,
      "step": 170
    },
    {
      "epoch": 0.29508196721311475,
      "grad_norm": 1.2337970733642578,
      "learning_rate": 8.884829343911762e-05,
      "loss": 0.145,
      "step": 180
    },
    {
      "epoch": 0.3114754098360656,
      "grad_norm": 0.3640283942222595,
      "learning_rate": 8.698444304324835e-05,
      "loss": 0.1411,
      "step": 190
    },
    {
      "epoch": 0.32786885245901637,
      "grad_norm": 0.9699270129203796,
      "learning_rate": 8.499951732743457e-05,
      "loss": 0.1605,
      "step": 200
    },
    {
      "epoch": 0.32786885245901637,
      "eval_loss": 0.15475818514823914,
      "eval_runtime": 62.2552,
      "eval_samples_per_second": 1.092,
      "eval_steps_per_second": 1.092,
      "step": 200
    },
    {
      "epoch": 0.3442622950819672,
      "grad_norm": 1.2340621948242188,
      "learning_rate": 8.290001430804025e-05,
      "loss": 0.1302,
      "step": 210
    },
    {
      "epoch": 0.36065573770491804,
      "grad_norm": 0.7683020234107971,
      "learning_rate": 8.06928070911306e-05,
      "loss": 0.1771,
      "step": 220
    },
    {
      "epoch": 0.3770491803278688,
      "grad_norm": 1.2670581340789795,
      "learning_rate": 7.838512137210565e-05,
      "loss": 0.1442,
      "step": 230
    },
    {
      "epoch": 0.39344262295081966,
      "grad_norm": 0.6768158078193665,
      "learning_rate": 7.598451178106857e-05,
      "loss": 0.1389,
      "step": 240
    },
    {
      "epoch": 0.4098360655737705,
      "grad_norm": 1.129391074180603,
      "learning_rate": 7.3498837151366e-05,
      "loss": 0.1385,
      "step": 250
    },
    {
      "epoch": 0.4262295081967213,
      "grad_norm": 1.101136326789856,
      "learning_rate": 7.093623479226385e-05,
      "loss": 0.1564,
      "step": 260
    },
    {
      "epoch": 0.4426229508196721,
      "grad_norm": 0.2261950969696045,
      "learning_rate": 6.830509384998114e-05,
      "loss": 0.1431,
      "step": 270
    },
    {
      "epoch": 0.45901639344262296,
      "grad_norm": 0.3458766043186188,
      "learning_rate": 6.561402784428974e-05,
      "loss": 0.1206,
      "step": 280
    },
    {
      "epoch": 0.47540983606557374,
      "grad_norm": 1.1199126243591309,
      "learning_rate": 6.287184647058648e-05,
      "loss": 0.11,
      "step": 290
    },
    {
      "epoch": 0.4918032786885246,
      "grad_norm": 0.45597025752067566,
      "learning_rate": 6.0087526759748304e-05,
      "loss": 0.1191,
      "step": 300
    },
    {
      "epoch": 0.4918032786885246,
      "eval_loss": 0.1460057497024536,
      "eval_runtime": 62.2673,
      "eval_samples_per_second": 1.092,
      "eval_steps_per_second": 1.092,
      "step": 300
    },
    {
      "epoch": 0.5081967213114754,
      "grad_norm": 0.734933614730835,
      "learning_rate": 5.7270183690184495e-05,
      "loss": 0.1585,
      "step": 310
    },
    {
      "epoch": 0.5245901639344263,
      "grad_norm": 0.8497341275215149,
      "learning_rate": 5.4429040348292256e-05,
      "loss": 0.1406,
      "step": 320
    },
    {
      "epoch": 0.5409836065573771,
      "grad_norm": 0.8217471241950989,
      "learning_rate": 5.157339773500125e-05,
      "loss": 0.1388,
      "step": 330
    },
    {
      "epoch": 0.5573770491803278,
      "grad_norm": 0.3617191016674042,
      "learning_rate": 4.8712604317250576e-05,
      "loss": 0.1493,
      "step": 340
    },
    {
      "epoch": 0.5737704918032787,
      "grad_norm": 0.9715378880500793,
      "learning_rate": 4.585602542407722e-05,
      "loss": 0.1446,
      "step": 350
    },
    {
      "epoch": 0.5901639344262295,
      "grad_norm": 0.8967556953430176,
      "learning_rate": 4.3013012587503254e-05,
      "loss": 0.1134,
      "step": 360
    },
    {
      "epoch": 0.6065573770491803,
      "grad_norm": 1.1487077474594116,
      "learning_rate": 4.019287292859016e-05,
      "loss": 0.1419,
      "step": 370
    },
    {
      "epoch": 0.6229508196721312,
      "grad_norm": 0.8955410718917847,
      "learning_rate": 3.7404838688880446e-05,
      "loss": 0.1339,
      "step": 380
    },
    {
      "epoch": 0.639344262295082,
      "grad_norm": 0.7369993925094604,
      "learning_rate": 3.465803700697114e-05,
      "loss": 0.1435,
      "step": 390
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 0.4338698089122772,
      "learning_rate": 3.196146003916084e-05,
      "loss": 0.1423,
      "step": 400
    },
    {
      "epoch": 0.6557377049180327,
      "eval_loss": 0.13493812084197998,
      "eval_runtime": 62.3222,
      "eval_samples_per_second": 1.091,
      "eval_steps_per_second": 1.091,
      "step": 400
    },
    {
      "epoch": 0.6721311475409836,
      "grad_norm": 1.00863516330719,
      "learning_rate": 2.932393552198597e-05,
      "loss": 0.1575,
      "step": 410
    },
    {
      "epoch": 0.6885245901639344,
      "grad_norm": 1.1648294925689697,
      "learning_rate": 2.6754097873015148e-05,
      "loss": 0.1372,
      "step": 420
    },
    {
      "epoch": 0.7049180327868853,
      "grad_norm": 0.5712903141975403,
      "learning_rate": 2.426035992450848e-05,
      "loss": 0.1293,
      "step": 430
    },
    {
      "epoch": 0.7213114754098361,
      "grad_norm": 0.6494749188423157,
      "learning_rate": 2.1850885382476562e-05,
      "loss": 0.1347,
      "step": 440
    },
    {
      "epoch": 0.7377049180327869,
      "grad_norm": 0.7962918281555176,
      "learning_rate": 1.9533562101300097e-05,
      "loss": 0.1157,
      "step": 450
    },
    {
      "epoch": 0.7540983606557377,
      "grad_norm": 0.7322007417678833,
      "learning_rate": 1.7315976261399696e-05,
      "loss": 0.1339,
      "step": 460
    },
    {
      "epoch": 0.7704918032786885,
      "grad_norm": 1.0486736297607422,
      "learning_rate": 1.5205387534490806e-05,
      "loss": 0.1327,
      "step": 470
    },
    {
      "epoch": 0.7868852459016393,
      "grad_norm": 0.28778010606765747,
      "learning_rate": 1.3208705317724006e-05,
      "loss": 0.0872,
      "step": 480
    },
    {
      "epoch": 0.8032786885245902,
      "grad_norm": 1.064375638961792,
      "learning_rate": 1.1332466114513512e-05,
      "loss": 0.1348,
      "step": 490
    },
    {
      "epoch": 0.819672131147541,
      "grad_norm": 0.7255303263664246,
      "learning_rate": 9.582812136100783e-06,
      "loss": 0.1065,
      "step": 500
    },
    {
      "epoch": 0.819672131147541,
      "eval_loss": 0.1266276091337204,
      "eval_runtime": 62.2443,
      "eval_samples_per_second": 1.092,
      "eval_steps_per_second": 1.092,
      "step": 500
    },
    {
      "epoch": 0.8360655737704918,
      "grad_norm": 0.8413804173469543,
      "learning_rate": 7.965471193905954e-06,
      "loss": 0.114,
      "step": 510
    },
    {
      "epoch": 0.8524590163934426,
      "grad_norm": 0.9656268954277039,
      "learning_rate": 6.4857379484922375e-06,
      "loss": 0.1114,
      "step": 520
    },
    {
      "epoch": 0.8688524590163934,
      "grad_norm": 0.6842993497848511,
      "learning_rate": 5.148456576529081e-06,
      "loss": 0.1265,
      "step": 530
    },
    {
      "epoch": 0.8852459016393442,
      "grad_norm": 1.1006067991256714,
      "learning_rate": 3.958004912496127e-06,
      "loss": 0.1418,
      "step": 540
    },
    {
      "epoch": 0.9016393442622951,
      "grad_norm": 0.8723937273025513,
      "learning_rate": 2.918280117043709e-06,
      "loss": 0.1555,
      "step": 550
    },
    {
      "epoch": 0.9180327868852459,
      "grad_norm": 0.810757577419281,
      "learning_rate": 2.032685918926508e-06,
      "loss": 0.1195,
      "step": 560
    },
    {
      "epoch": 0.9344262295081968,
      "grad_norm": 1.3418116569519043,
      "learning_rate": 1.3041214722768035e-06,
      "loss": 0.1395,
      "step": 570
    },
    {
      "epoch": 0.9508196721311475,
      "grad_norm": 0.7165635228157043,
      "learning_rate": 7.349718656945504e-07,
      "loss": 0.1296,
      "step": 580
    },
    {
      "epoch": 0.9672131147540983,
      "grad_norm": 0.7757364511489868,
      "learning_rate": 3.271003142248652e-07,
      "loss": 0.1411,
      "step": 590
    },
    {
      "epoch": 0.9836065573770492,
      "grad_norm": 0.8983824253082275,
      "learning_rate": 8.184205978370996e-08,
      "loss": 0.1174,
      "step": 600
    },
    {
      "epoch": 0.9836065573770492,
      "eval_loss": 0.12527307868003845,
      "eval_runtime": 62.1952,
      "eval_samples_per_second": 1.093,
      "eval_steps_per_second": 1.093,
      "step": 600
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.8321591019630432,
      "learning_rate": 0.0,
      "loss": 0.1184,
      "step": 610
    },
    {
      "epoch": 1.0,
      "step": 610,
      "total_flos": 1.134915283869696e+16,
      "train_loss": 0.04400509732668517,
      "train_runtime": 2473.793,
      "train_samples_per_second": 0.986,
      "train_steps_per_second": 0.247
    }
  ],
  "logging_steps": 10,
  "max_steps": 610,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.134915283869696e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}