|
{ |
|
"best_global_step": 2000, |
|
"best_metric": 1.2713440656661987, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 1000, |
|
"global_step": 2906, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.017205781142463867, |
|
"grad_norm": 0.8632408976554871, |
|
"learning_rate": 2.784090909090909e-05, |
|
"loss": 1.268, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.034411562284927734, |
|
"grad_norm": 0.8244401812553406, |
|
"learning_rate": 4.9804826117814056e-05, |
|
"loss": 1.2135, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.051617343427391604, |
|
"grad_norm": 1.049148678779602, |
|
"learning_rate": 4.891767210787793e-05, |
|
"loss": 1.1783, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06882312456985547, |
|
"grad_norm": 0.8507347106933594, |
|
"learning_rate": 4.8030518097941806e-05, |
|
"loss": 1.1688, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08602890571231935, |
|
"grad_norm": 0.8333495259284973, |
|
"learning_rate": 4.714336408800568e-05, |
|
"loss": 1.1632, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.10323468685478321, |
|
"grad_norm": 0.8183310627937317, |
|
"learning_rate": 4.6256210078069556e-05, |
|
"loss": 1.1241, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.12044046799724707, |
|
"grad_norm": 0.8449575901031494, |
|
"learning_rate": 4.536905606813343e-05, |
|
"loss": 1.1488, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.13764624913971094, |
|
"grad_norm": 0.8357868194580078, |
|
"learning_rate": 4.448190205819731e-05, |
|
"loss": 1.1488, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1548520302821748, |
|
"grad_norm": 0.922451913356781, |
|
"learning_rate": 4.359474804826118e-05, |
|
"loss": 1.1242, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1720578114246387, |
|
"grad_norm": 0.9548965096473694, |
|
"learning_rate": 4.270759403832506e-05, |
|
"loss": 1.1569, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.18926359256710254, |
|
"grad_norm": 0.8528347611427307, |
|
"learning_rate": 4.182044002838893e-05, |
|
"loss": 1.1645, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.20646937370956642, |
|
"grad_norm": 0.9261412620544434, |
|
"learning_rate": 4.093328601845281e-05, |
|
"loss": 1.1587, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2236751548520303, |
|
"grad_norm": 0.8746649622917175, |
|
"learning_rate": 4.0046132008516676e-05, |
|
"loss": 1.1441, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.24088093599449414, |
|
"grad_norm": 0.8994395732879639, |
|
"learning_rate": 3.915897799858055e-05, |
|
"loss": 1.1248, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.258086717136958, |
|
"grad_norm": 0.8658247590065002, |
|
"learning_rate": 3.8271823988644426e-05, |
|
"loss": 1.1494, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.27529249827942187, |
|
"grad_norm": 0.8776836395263672, |
|
"learning_rate": 3.73846699787083e-05, |
|
"loss": 1.1178, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2924982794218858, |
|
"grad_norm": 0.8656069040298462, |
|
"learning_rate": 3.649751596877218e-05, |
|
"loss": 1.1265, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.3097040605643496, |
|
"grad_norm": 0.9538426995277405, |
|
"learning_rate": 3.561036195883606e-05, |
|
"loss": 1.1138, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3269098417068135, |
|
"grad_norm": 0.7584338188171387, |
|
"learning_rate": 3.4723207948899934e-05, |
|
"loss": 1.1484, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.3441156228492774, |
|
"grad_norm": 0.9330400228500366, |
|
"learning_rate": 3.383605393896381e-05, |
|
"loss": 1.1758, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3441156228492774, |
|
"eval_loss": 1.2882763147354126, |
|
"eval_runtime": 96.0412, |
|
"eval_samples_per_second": 7.643, |
|
"eval_steps_per_second": 1.531, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.36132140399174123, |
|
"grad_norm": 0.76217120885849, |
|
"learning_rate": 3.2948899929027684e-05, |
|
"loss": 1.1237, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3785271851342051, |
|
"grad_norm": 0.9417053461074829, |
|
"learning_rate": 3.206174591909156e-05, |
|
"loss": 1.1834, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.395732966276669, |
|
"grad_norm": 0.7688853144645691, |
|
"learning_rate": 3.1174591909155435e-05, |
|
"loss": 1.2116, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.41293874741913283, |
|
"grad_norm": 0.7310340404510498, |
|
"learning_rate": 3.0287437899219306e-05, |
|
"loss": 1.1458, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.4301445285615967, |
|
"grad_norm": 0.8664773106575012, |
|
"learning_rate": 2.940028388928318e-05, |
|
"loss": 1.1651, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.4473503097040606, |
|
"grad_norm": 0.8525294661521912, |
|
"learning_rate": 2.8513129879347057e-05, |
|
"loss": 1.1609, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.46455609084652444, |
|
"grad_norm": 0.9533822536468506, |
|
"learning_rate": 2.7625975869410932e-05, |
|
"loss": 1.1114, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.4817618719889883, |
|
"grad_norm": 0.9051440358161926, |
|
"learning_rate": 2.6738821859474804e-05, |
|
"loss": 1.1417, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.4989676531314522, |
|
"grad_norm": 0.8630899786949158, |
|
"learning_rate": 2.585166784953868e-05, |
|
"loss": 1.1121, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.516173434273916, |
|
"grad_norm": 0.7944740653038025, |
|
"learning_rate": 2.4964513839602558e-05, |
|
"loss": 1.1243, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5333792154163799, |
|
"grad_norm": 0.8027036190032959, |
|
"learning_rate": 2.4077359829666433e-05, |
|
"loss": 1.1151, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.5505849965588437, |
|
"grad_norm": 0.9314497709274292, |
|
"learning_rate": 2.3190205819730308e-05, |
|
"loss": 1.1382, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.5677907777013076, |
|
"grad_norm": 0.8070854544639587, |
|
"learning_rate": 2.2303051809794183e-05, |
|
"loss": 1.1339, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.5849965588437716, |
|
"grad_norm": 0.9217064380645752, |
|
"learning_rate": 2.1415897799858055e-05, |
|
"loss": 1.1411, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.6022023399862354, |
|
"grad_norm": 0.8798492550849915, |
|
"learning_rate": 2.052874378992193e-05, |
|
"loss": 1.1843, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.6194081211286993, |
|
"grad_norm": 0.7490786910057068, |
|
"learning_rate": 1.9641589779985805e-05, |
|
"loss": 1.1571, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6366139022711631, |
|
"grad_norm": 0.8831539154052734, |
|
"learning_rate": 1.875443577004968e-05, |
|
"loss": 1.1452, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.653819683413627, |
|
"grad_norm": 0.7337464690208435, |
|
"learning_rate": 1.786728176011356e-05, |
|
"loss": 1.1066, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.6710254645560908, |
|
"grad_norm": 0.8665643334388733, |
|
"learning_rate": 1.698012775017743e-05, |
|
"loss": 1.1761, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.6882312456985548, |
|
"grad_norm": 0.9193384051322937, |
|
"learning_rate": 1.6092973740241306e-05, |
|
"loss": 1.1344, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.6882312456985548, |
|
"eval_loss": 1.2713440656661987, |
|
"eval_runtime": 91.1794, |
|
"eval_samples_per_second": 8.05, |
|
"eval_steps_per_second": 1.612, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7054370268410186, |
|
"grad_norm": 0.924618661403656, |
|
"learning_rate": 1.5205819730305181e-05, |
|
"loss": 1.1481, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.7226428079834825, |
|
"grad_norm": 0.9339317679405212, |
|
"learning_rate": 1.4318665720369056e-05, |
|
"loss": 1.1114, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.7398485891259463, |
|
"grad_norm": 0.7737475037574768, |
|
"learning_rate": 1.3431511710432932e-05, |
|
"loss": 1.1446, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.7570543702684102, |
|
"grad_norm": 0.8811545372009277, |
|
"learning_rate": 1.2544357700496807e-05, |
|
"loss": 1.1311, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.774260151410874, |
|
"grad_norm": 0.8200384378433228, |
|
"learning_rate": 1.1657203690560682e-05, |
|
"loss": 1.159, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.791465932553338, |
|
"grad_norm": 0.9544340968132019, |
|
"learning_rate": 1.0770049680624557e-05, |
|
"loss": 1.1244, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.8086717136958018, |
|
"grad_norm": 0.8595368266105652, |
|
"learning_rate": 9.88289567068843e-06, |
|
"loss": 1.1819, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.8258774948382657, |
|
"grad_norm": 0.7589201331138611, |
|
"learning_rate": 8.995741660752308e-06, |
|
"loss": 1.0796, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.8430832759807295, |
|
"grad_norm": 0.7756196856498718, |
|
"learning_rate": 8.108587650816183e-06, |
|
"loss": 1.1882, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.8602890571231934, |
|
"grad_norm": 0.9547609686851501, |
|
"learning_rate": 7.221433640880057e-06, |
|
"loss": 1.1521, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.8774948382656572, |
|
"grad_norm": 1.0288267135620117, |
|
"learning_rate": 6.3342796309439315e-06, |
|
"loss": 1.1987, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.8947006194081212, |
|
"grad_norm": 1.0321507453918457, |
|
"learning_rate": 5.4471256210078075e-06, |
|
"loss": 1.1331, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.911906400550585, |
|
"grad_norm": 0.8565710783004761, |
|
"learning_rate": 4.559971611071683e-06, |
|
"loss": 1.1583, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.9291121816930489, |
|
"grad_norm": 0.7842207551002502, |
|
"learning_rate": 3.672817601135557e-06, |
|
"loss": 1.1586, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.9463179628355127, |
|
"grad_norm": 0.9376333355903625, |
|
"learning_rate": 2.7856635911994322e-06, |
|
"loss": 1.1702, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.9635237439779766, |
|
"grad_norm": 0.7735033631324768, |
|
"learning_rate": 1.8985095812633074e-06, |
|
"loss": 1.1598, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.9807295251204404, |
|
"grad_norm": 0.8405628204345703, |
|
"learning_rate": 1.0113555713271824e-06, |
|
"loss": 1.1977, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.9979353062629044, |
|
"grad_norm": 0.9018322825431824, |
|
"learning_rate": 1.242015613910575e-07, |
|
"loss": 1.1157, |
|
"step": 2900 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 2906, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 3000, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 8, |
|
"early_stopping_threshold": 0.01 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.617784650549304e+18, |
|
"train_batch_size": 5, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|