{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2797651464850307,
  "eval_steps": 500,
  "global_step": 600000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004662752441417178,
      "grad_norm": 1.3671598434448242,
      "learning_rate": 4.976690900545356e-05,
      "loss": 4.8243,
      "step": 10000
    },
    {
      "epoch": 0.009325504882834356,
      "grad_norm": 1.3568960428237915,
      "learning_rate": 4.953384132466932e-05,
      "loss": 3.7236,
      "step": 20000
    },
    {
      "epoch": 0.013988257324251536,
      "grad_norm": 1.4119852781295776,
      "learning_rate": 4.930079695764729e-05,
      "loss": 3.4332,
      "step": 30000
    },
    {
      "epoch": 0.018651009765668712,
      "grad_norm": 1.1216883659362793,
      "learning_rate": 4.906772927686305e-05,
      "loss": 3.2825,
      "step": 40000
    },
    {
      "epoch": 0.023313762207085892,
      "grad_norm": 1.2323102951049805,
      "learning_rate": 4.883468490984102e-05,
      "loss": 3.1866,
      "step": 50000
    },
    {
      "epoch": 0.02797651464850307,
      "grad_norm": 0.9565121531486511,
      "learning_rate": 4.860161722905678e-05,
      "loss": 3.1126,
      "step": 60000
    },
    {
      "epoch": 0.03263926708992025,
      "grad_norm": 1.1173287630081177,
      "learning_rate": 4.836861948955917e-05,
      "loss": 3.0577,
      "step": 70000
    },
    {
      "epoch": 0.037302019531337424,
      "grad_norm": 1.4626446962356567,
      "learning_rate": 4.813555180877493e-05,
      "loss": 3.0145,
      "step": 80000
    },
    {
      "epoch": 0.041964771972754604,
      "grad_norm": 1.0016125440597534,
      "learning_rate": 4.790246081422848e-05,
      "loss": 2.973,
      "step": 90000
    },
    {
      "epoch": 0.046627524414171784,
      "grad_norm": 1.3417049646377563,
      "learning_rate": 4.766941644720645e-05,
      "loss": 2.9423,
      "step": 100000
    },
    {
      "epoch": 0.051290276855588963,
      "grad_norm": 1.2270045280456543,
      "learning_rate": 4.7436372080184424e-05,
      "loss": 2.9127,
      "step": 110000
    },
    {
      "epoch": 0.05595302929700614,
      "grad_norm": 1.0300427675247192,
      "learning_rate": 4.7203327713162395e-05,
      "loss": 2.8823,
      "step": 120000
    },
    {
      "epoch": 0.060615781738423316,
      "grad_norm": 0.8841068148612976,
      "learning_rate": 4.6970283346140366e-05,
      "loss": 2.8588,
      "step": 130000
    },
    {
      "epoch": 0.0652785341798405,
      "grad_norm": 1.0524730682373047,
      "learning_rate": 4.673721566535613e-05,
      "loss": 2.8425,
      "step": 140000
    },
    {
      "epoch": 0.06994128662125768,
      "grad_norm": 0.9874018430709839,
      "learning_rate": 4.650417129833409e-05,
      "loss": 2.8257,
      "step": 150000
    },
    {
      "epoch": 0.07460403906267485,
      "grad_norm": 0.9634119272232056,
      "learning_rate": 4.6271126931312064e-05,
      "loss": 2.8114,
      "step": 160000
    },
    {
      "epoch": 0.07926679150409204,
      "grad_norm": 0.885671854019165,
      "learning_rate": 4.603808256429003e-05,
      "loss": 2.7911,
      "step": 170000
    },
    {
      "epoch": 0.08392954394550921,
      "grad_norm": 1.0135940313339233,
      "learning_rate": 4.58050148835058e-05,
      "loss": 2.7786,
      "step": 180000
    },
    {
      "epoch": 0.0885922963869264,
      "grad_norm": 1.011932611465454,
      "learning_rate": 4.557194720272156e-05,
      "loss": 2.7636,
      "step": 190000
    },
    {
      "epoch": 0.09325504882834357,
      "grad_norm": 0.7796096205711365,
      "learning_rate": 4.533892614946173e-05,
      "loss": 2.753,
      "step": 200000
    },
    {
      "epoch": 0.09791780126976074,
      "grad_norm": 1.1194034814834595,
      "learning_rate": 4.5105858468677495e-05,
      "loss": 2.7371,
      "step": 210000
    },
    {
      "epoch": 0.10258055371117793,
      "grad_norm": 1.1135520935058594,
      "learning_rate": 4.4872814101655467e-05,
      "loss": 2.7285,
      "step": 220000
    },
    {
      "epoch": 0.1072433061525951,
      "grad_norm": 0.7772097587585449,
      "learning_rate": 4.463976973463344e-05,
      "loss": 2.7173,
      "step": 230000
    },
    {
      "epoch": 0.11190605859401229,
      "grad_norm": 1.096358299255371,
      "learning_rate": 4.44067020538492e-05,
      "loss": 2.7141,
      "step": 240000
    },
    {
      "epoch": 0.11656881103542946,
      "grad_norm": 0.8112640380859375,
      "learning_rate": 4.417363437306496e-05,
      "loss": 2.7073,
      "step": 250000
    },
    {
      "epoch": 0.12123156347684663,
      "grad_norm": 1.0931545495986938,
      "learning_rate": 4.394059000604293e-05,
      "loss": 2.6931,
      "step": 260000
    },
    {
      "epoch": 0.1258943159182638,
      "grad_norm": 1.1369918584823608,
      "learning_rate": 4.37075456390209e-05,
      "loss": 2.6824,
      "step": 270000
    },
    {
      "epoch": 0.130557068359681,
      "grad_norm": 1.258300542831421,
      "learning_rate": 4.347450127199887e-05,
      "loss": 2.6741,
      "step": 280000
    },
    {
      "epoch": 0.13521982080109818,
      "grad_norm": 0.9752686023712158,
      "learning_rate": 4.324143359121463e-05,
      "loss": 2.6645,
      "step": 290000
    },
    {
      "epoch": 0.13988257324251535,
      "grad_norm": 1.0001367330551147,
      "learning_rate": 4.3008412537954805e-05,
      "loss": 2.6592,
      "step": 300000
    },
    {
      "epoch": 0.14454532568393252,
      "grad_norm": 1.0314422845840454,
      "learning_rate": 4.277534485717057e-05,
      "loss": 2.6523,
      "step": 310000
    },
    {
      "epoch": 0.1492080781253497,
      "grad_norm": 0.9287506937980652,
      "learning_rate": 4.254230049014854e-05,
      "loss": 2.6503,
      "step": 320000
    },
    {
      "epoch": 0.1538708305667669,
      "grad_norm": 0.8209073543548584,
      "learning_rate": 4.23092328093643e-05,
      "loss": 2.6379,
      "step": 330000
    },
    {
      "epoch": 0.15853358300818407,
      "grad_norm": 0.8727386593818665,
      "learning_rate": 4.207618844234227e-05,
      "loss": 2.6332,
      "step": 340000
    },
    {
      "epoch": 0.16319633544960124,
      "grad_norm": 0.9841961860656738,
      "learning_rate": 4.184314407532024e-05,
      "loss": 2.6279,
      "step": 350000
    },
    {
      "epoch": 0.16785908789101842,
      "grad_norm": 0.7831237316131592,
      "learning_rate": 4.1610099708298214e-05,
      "loss": 2.6237,
      "step": 360000
    },
    {
      "epoch": 0.1725218403324356,
      "grad_norm": 0.9184048175811768,
      "learning_rate": 4.137707865503839e-05,
      "loss": 2.6167,
      "step": 370000
    },
    {
      "epoch": 0.1771845927738528,
      "grad_norm": 0.9598727822303772,
      "learning_rate": 4.114401097425415e-05,
      "loss": 2.6082,
      "step": 380000
    },
    {
      "epoch": 0.18184734521526996,
      "grad_norm": 0.8814136981964111,
      "learning_rate": 4.0910966607232115e-05,
      "loss": 2.6027,
      "step": 390000
    },
    {
      "epoch": 0.18651009765668713,
      "grad_norm": 0.9080318212509155,
      "learning_rate": 4.067789892644788e-05,
      "loss": 2.6014,
      "step": 400000
    },
    {
      "epoch": 0.1911728500981043,
      "grad_norm": 0.8321977257728577,
      "learning_rate": 4.044485455942585e-05,
      "loss": 2.5964,
      "step": 410000
    },
    {
      "epoch": 0.19583560253952148,
      "grad_norm": 1.3375693559646606,
      "learning_rate": 4.021178687864161e-05,
      "loss": 2.5906,
      "step": 420000
    },
    {
      "epoch": 0.20049835498093868,
      "grad_norm": 0.8211286067962646,
      "learning_rate": 3.997874251161958e-05,
      "loss": 2.5866,
      "step": 430000
    },
    {
      "epoch": 0.20516110742235585,
      "grad_norm": 0.7890422940254211,
      "learning_rate": 3.9745698144597546e-05,
      "loss": 2.5817,
      "step": 440000
    },
    {
      "epoch": 0.20982385986377303,
      "grad_norm": 1.0580294132232666,
      "learning_rate": 3.951263046381331e-05,
      "loss": 2.5776,
      "step": 450000
    },
    {
      "epoch": 0.2144866123051902,
      "grad_norm": 1.0666168928146362,
      "learning_rate": 3.927956278302907e-05,
      "loss": 2.5729,
      "step": 460000
    },
    {
      "epoch": 0.21914936474660737,
      "grad_norm": 1.0440067052841187,
      "learning_rate": 3.904651841600704e-05,
      "loss": 2.5748,
      "step": 470000
    },
    {
      "epoch": 0.22381211718802457,
      "grad_norm": 0.8746099472045898,
      "learning_rate": 3.88134507352228e-05,
      "loss": 2.5704,
      "step": 480000
    },
    {
      "epoch": 0.22847486962944175,
      "grad_norm": 0.882897675037384,
      "learning_rate": 3.858040636820078e-05,
      "loss": 2.5623,
      "step": 490000
    },
    {
      "epoch": 0.23313762207085892,
      "grad_norm": 0.8458369970321655,
      "learning_rate": 3.834740862870316e-05,
      "loss": 2.5612,
      "step": 500000
    },
    {
      "epoch": 0.2378003745122761,
      "grad_norm": 0.9579658508300781,
      "learning_rate": 3.811434094791892e-05,
      "loss": 2.5551,
      "step": 510000
    },
    {
      "epoch": 0.24246312695369326,
      "grad_norm": 1.0498722791671753,
      "learning_rate": 3.78813198946591e-05,
      "loss": 2.5502,
      "step": 520000
    },
    {
      "epoch": 0.24712587939511046,
      "grad_norm": 1.032334804534912,
      "learning_rate": 3.764825221387486e-05,
      "loss": 2.5534,
      "step": 530000
    },
    {
      "epoch": 0.2517886318365276,
      "grad_norm": 0.9145790934562683,
      "learning_rate": 3.7415184533090624e-05,
      "loss": 2.547,
      "step": 540000
    },
    {
      "epoch": 0.2564513842779448,
      "grad_norm": 1.0633904933929443,
      "learning_rate": 3.71821634798308e-05,
      "loss": 2.543,
      "step": 550000
    },
    {
      "epoch": 0.261114136719362,
      "grad_norm": 0.9828123450279236,
      "learning_rate": 3.694911911280877e-05,
      "loss": 2.5398,
      "step": 560000
    },
    {
      "epoch": 0.2657768891607792,
      "grad_norm": 0.8735861778259277,
      "learning_rate": 3.671607474578674e-05,
      "loss": 2.5345,
      "step": 570000
    },
    {
      "epoch": 0.27043964160219636,
      "grad_norm": 1.1347264051437378,
      "learning_rate": 3.6483053692526915e-05,
      "loss": 2.5312,
      "step": 580000
    },
    {
      "epoch": 0.27510239404361353,
      "grad_norm": 0.8275557160377502,
      "learning_rate": 3.625000932550489e-05,
      "loss": 2.5299,
      "step": 590000
    },
    {
      "epoch": 0.2797651464850307,
      "grad_norm": 0.8891148567199707,
      "learning_rate": 3.601696495848285e-05,
      "loss": 2.5301,
      "step": 600000
    }
  ],
  "logging_steps": 10000,
  "max_steps": 2144656,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9223372036854775807,
  "save_steps": 100000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.4794966500590223e+19,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}