nileshmalpeddi's picture
End of training
7d507b3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 35802,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013965700240210044,
"grad_norm": 4.268923282623291,
"learning_rate": 0.0004930730126808558,
"loss": 2.5844,
"mean_token_accuracy": 0.5059588371738791,
"step": 500
},
{
"epoch": 0.027931400480420088,
"grad_norm": 0.6783562302589417,
"learning_rate": 0.0004860901625607508,
"loss": 2.153,
"mean_token_accuracy": 0.5498105692565441,
"step": 1000
},
{
"epoch": 0.04189710072063013,
"grad_norm": 0.49954575300216675,
"learning_rate": 0.0004791073124406458,
"loss": 2.0782,
"mean_token_accuracy": 0.5626471206247806,
"step": 1500
},
{
"epoch": 0.055862800960840175,
"grad_norm": 0.501465916633606,
"learning_rate": 0.0004721244623205407,
"loss": 2.0761,
"mean_token_accuracy": 0.5623862668275833,
"step": 2000
},
{
"epoch": 0.06982850120105022,
"grad_norm": 0.52457594871521,
"learning_rate": 0.0004651416122004357,
"loss": 2.0543,
"mean_token_accuracy": 0.5653762078285217,
"step": 2500
},
{
"epoch": 0.08379420144126026,
"grad_norm": 0.366854727268219,
"learning_rate": 0.0004581587620803307,
"loss": 2.0699,
"mean_token_accuracy": 0.5624443043172359,
"step": 3000
},
{
"epoch": 0.09775990168147031,
"grad_norm": 0.33193984627723694,
"learning_rate": 0.00045118987766046596,
"loss": 2.0706,
"mean_token_accuracy": 0.5632118558287621,
"step": 3500
},
{
"epoch": 0.11172560192168035,
"grad_norm": 0.26177772879600525,
"learning_rate": 0.0004442070275403609,
"loss": 2.0744,
"mean_token_accuracy": 0.564106923341751,
"step": 4000
},
{
"epoch": 0.1256913021618904,
"grad_norm": 0.31001174449920654,
"learning_rate": 0.0004372521088207363,
"loss": 2.0789,
"mean_token_accuracy": 0.5612107511162758,
"step": 4500
},
{
"epoch": 0.13965700240210044,
"grad_norm": 0.26565852761268616,
"learning_rate": 0.0004302692587006313,
"loss": 2.0766,
"mean_token_accuracy": 0.5622789396047592,
"step": 5000
},
{
"epoch": 0.15362270264231048,
"grad_norm": 0.2479698359966278,
"learning_rate": 0.00042331433998100666,
"loss": 2.0658,
"mean_token_accuracy": 0.5642604413330555,
"step": 5500
},
{
"epoch": 0.16758840288252053,
"grad_norm": 0.2320714145898819,
"learning_rate": 0.00041633148986090166,
"loss": 2.0537,
"mean_token_accuracy": 0.5658319992125034,
"step": 6000
},
{
"epoch": 0.18155410312273057,
"grad_norm": 0.20908017456531525,
"learning_rate": 0.00040937657114127705,
"loss": 2.068,
"mean_token_accuracy": 0.5633928556740284,
"step": 6500
},
{
"epoch": 0.19551980336294061,
"grad_norm": 0.21336887776851654,
"learning_rate": 0.000402393721021172,
"loss": 2.056,
"mean_token_accuracy": 0.5659000330269337,
"step": 7000
},
{
"epoch": 0.20948550360315066,
"grad_norm": 0.18956658244132996,
"learning_rate": 0.00039543880230154737,
"loss": 2.0611,
"mean_token_accuracy": 0.5643967908620834,
"step": 7500
},
{
"epoch": 0.2234512038433607,
"grad_norm": 0.19688238203525543,
"learning_rate": 0.00038845595218144237,
"loss": 2.0571,
"mean_token_accuracy": 0.5656557891070843,
"step": 8000
},
{
"epoch": 0.23741690408357075,
"grad_norm": 0.17653051018714905,
"learning_rate": 0.0003815010334618178,
"loss": 2.0642,
"mean_token_accuracy": 0.5636518878340722,
"step": 8500
},
{
"epoch": 0.2513826043237808,
"grad_norm": 0.163355752825737,
"learning_rate": 0.0003745181833417128,
"loss": 2.0687,
"mean_token_accuracy": 0.5622124362289905,
"step": 9000
},
{
"epoch": 0.26534830456399083,
"grad_norm": 0.1576842963695526,
"learning_rate": 0.0003675632646220882,
"loss": 2.0537,
"mean_token_accuracy": 0.5655075218081475,
"step": 9500
},
{
"epoch": 0.2793140048042009,
"grad_norm": 0.1434595286846161,
"learning_rate": 0.00036058041450198313,
"loss": 2.0531,
"mean_token_accuracy": 0.566190888762474,
"step": 10000
},
{
"epoch": 0.2932797050444109,
"grad_norm": 0.15508811175823212,
"learning_rate": 0.0003536254957823585,
"loss": 2.0609,
"mean_token_accuracy": 0.5647544565498829,
"step": 10500
},
{
"epoch": 0.30724540528462096,
"grad_norm": 0.1619788408279419,
"learning_rate": 0.0003466426456622535,
"loss": 2.0572,
"mean_token_accuracy": 0.5651431261301041,
"step": 11000
},
{
"epoch": 0.321211105524831,
"grad_norm": 0.15186958014965057,
"learning_rate": 0.0003396877269426289,
"loss": 2.052,
"mean_token_accuracy": 0.5652778272330761,
"step": 11500
},
{
"epoch": 0.33517680576504105,
"grad_norm": 0.1533740907907486,
"learning_rate": 0.0003327048768225239,
"loss": 2.0507,
"mean_token_accuracy": 0.5665202861726284,
"step": 12000
},
{
"epoch": 0.3491425060052511,
"grad_norm": 0.15380869805812836,
"learning_rate": 0.00032574995810289933,
"loss": 2.0556,
"mean_token_accuracy": 0.5652170441150666,
"step": 12500
},
{
"epoch": 0.36310820624546114,
"grad_norm": 0.1550268828868866,
"learning_rate": 0.0003187671079827943,
"loss": 2.0509,
"mean_token_accuracy": 0.565740670889616,
"step": 13000
},
{
"epoch": 0.3770739064856712,
"grad_norm": 0.15138842165470123,
"learning_rate": 0.00031181218926316966,
"loss": 2.0576,
"mean_token_accuracy": 0.5651396196782589,
"step": 13500
},
{
"epoch": 0.39103960672588123,
"grad_norm": 0.14943508803844452,
"learning_rate": 0.00030482933914306466,
"loss": 2.0459,
"mean_token_accuracy": 0.5667617973387241,
"step": 14000
},
{
"epoch": 0.40500530696609127,
"grad_norm": 0.15654833614826202,
"learning_rate": 0.00029787442042344004,
"loss": 2.0682,
"mean_token_accuracy": 0.5644841706454754,
"step": 14500
},
{
"epoch": 0.4189710072063013,
"grad_norm": 0.1552565097808838,
"learning_rate": 0.00029089157030333504,
"loss": 2.0619,
"mean_token_accuracy": 0.5638643040955067,
"step": 15000
},
{
"epoch": 0.43293670744651136,
"grad_norm": 0.15082110464572906,
"learning_rate": 0.0002839366515837104,
"loss": 2.0546,
"mean_token_accuracy": 0.565163453668356,
"step": 15500
},
{
"epoch": 0.4469024076867214,
"grad_norm": 0.16357985138893127,
"learning_rate": 0.00027695380146360536,
"loss": 2.052,
"mean_token_accuracy": 0.5660725966691971,
"step": 16000
},
{
"epoch": 0.46086810792693145,
"grad_norm": 0.1474219560623169,
"learning_rate": 0.00026999888274398075,
"loss": 2.0514,
"mean_token_accuracy": 0.5660765036344528,
"step": 16500
},
{
"epoch": 0.4748338081671415,
"grad_norm": 0.15185120701789856,
"learning_rate": 0.00026301603262387575,
"loss": 2.0445,
"mean_token_accuracy": 0.5662275057137013,
"step": 17000
},
{
"epoch": 0.48879950840735154,
"grad_norm": 0.14625252783298492,
"learning_rate": 0.0002560611139042512,
"loss": 2.0506,
"mean_token_accuracy": 0.567681826710701,
"step": 17500
},
{
"epoch": 0.5027652086475616,
"grad_norm": 0.1625310778617859,
"learning_rate": 0.0002490782637841461,
"loss": 2.0475,
"mean_token_accuracy": 0.565769705593586,
"step": 18000
},
{
"epoch": 0.5167309088877716,
"grad_norm": 0.1645444631576538,
"learning_rate": 0.00024212334506452154,
"loss": 2.0541,
"mean_token_accuracy": 0.5652133010923862,
"step": 18500
},
{
"epoch": 0.5306966091279817,
"grad_norm": 0.15332843363285065,
"learning_rate": 0.00023514049494441654,
"loss": 2.0476,
"mean_token_accuracy": 0.5676583781838417,
"step": 19000
},
{
"epoch": 0.5446623093681917,
"grad_norm": 0.1857738345861435,
"learning_rate": 0.00022818557622479192,
"loss": 2.046,
"mean_token_accuracy": 0.5666445840001106,
"step": 19500
},
{
"epoch": 0.5586280096084018,
"grad_norm": 0.1527378261089325,
"learning_rate": 0.0002212027261046869,
"loss": 2.0469,
"mean_token_accuracy": 0.5673894871473313,
"step": 20000
},
{
"epoch": 0.5725937098486118,
"grad_norm": 0.15622954070568085,
"learning_rate": 0.0002142478073850623,
"loss": 2.0451,
"mean_token_accuracy": 0.5668408484160901,
"step": 20500
},
{
"epoch": 0.5865594100888218,
"grad_norm": 0.15915007889270782,
"learning_rate": 0.00020726495726495727,
"loss": 2.0423,
"mean_token_accuracy": 0.5684662259817124,
"step": 21000
},
{
"epoch": 0.6005251103290319,
"grad_norm": 0.17898404598236084,
"learning_rate": 0.00020031003854533265,
"loss": 2.0485,
"mean_token_accuracy": 0.5655563974380493,
"step": 21500
},
{
"epoch": 0.6144908105692419,
"grad_norm": 0.18496471643447876,
"learning_rate": 0.00019332718842522765,
"loss": 2.0495,
"mean_token_accuracy": 0.5659684043526649,
"step": 22000
},
{
"epoch": 0.628456510809452,
"grad_norm": 0.15914028882980347,
"learning_rate": 0.00018637226970560304,
"loss": 2.0587,
"mean_token_accuracy": 0.5645939151346684,
"step": 22500
},
{
"epoch": 0.642422211049662,
"grad_norm": 0.1719997376203537,
"learning_rate": 0.000179389419585498,
"loss": 2.0522,
"mean_token_accuracy": 0.5654012953042984,
"step": 23000
},
{
"epoch": 0.6563879112898721,
"grad_norm": 0.1447959542274475,
"learning_rate": 0.00017243450086587342,
"loss": 2.0376,
"mean_token_accuracy": 0.5688524658381939,
"step": 23500
},
{
"epoch": 0.6703536115300821,
"grad_norm": 0.1650005280971527,
"learning_rate": 0.00016545165074576841,
"loss": 2.0554,
"mean_token_accuracy": 0.5649112530052662,
"step": 24000
},
{
"epoch": 0.6843193117702921,
"grad_norm": 0.1643020361661911,
"learning_rate": 0.0001584967320261438,
"loss": 2.0369,
"mean_token_accuracy": 0.5688672766685486,
"step": 24500
},
{
"epoch": 0.6982850120105022,
"grad_norm": 0.17811213433742523,
"learning_rate": 0.00015151388190603877,
"loss": 2.0551,
"mean_token_accuracy": 0.5658198243677616,
"step": 25000
},
{
"epoch": 0.7122507122507122,
"grad_norm": 0.18822260200977325,
"learning_rate": 0.00014455896318641418,
"loss": 2.0391,
"mean_token_accuracy": 0.5678052612841129,
"step": 25500
},
{
"epoch": 0.7262164124909223,
"grad_norm": 0.1553669422864914,
"learning_rate": 0.00013757611306630915,
"loss": 2.0481,
"mean_token_accuracy": 0.5658735463917255,
"step": 26000
},
{
"epoch": 0.7401821127311323,
"grad_norm": 0.1644522100687027,
"learning_rate": 0.00013062119434668453,
"loss": 2.0443,
"mean_token_accuracy": 0.5666062987446785,
"step": 26500
},
{
"epoch": 0.7541478129713424,
"grad_norm": 0.18426066637039185,
"learning_rate": 0.00012363834422657953,
"loss": 2.0446,
"mean_token_accuracy": 0.5659443159997464,
"step": 27000
},
{
"epoch": 0.7681135132115524,
"grad_norm": 0.19535085558891296,
"learning_rate": 0.00011668342550695491,
"loss": 2.0498,
"mean_token_accuracy": 0.5656883924603462,
"step": 27500
},
{
"epoch": 0.7820792134517625,
"grad_norm": 0.1869782954454422,
"learning_rate": 0.0001097005753868499,
"loss": 2.0352,
"mean_token_accuracy": 0.5688391443789005,
"step": 28000
},
{
"epoch": 0.7960449136919725,
"grad_norm": 0.17099939286708832,
"learning_rate": 0.0001027456566672253,
"loss": 2.0505,
"mean_token_accuracy": 0.5657015817165375,
"step": 28500
},
{
"epoch": 0.8100106139321825,
"grad_norm": 0.17975910007953644,
"learning_rate": 9.576280654712028e-05,
"loss": 2.0318,
"mean_token_accuracy": 0.5694990956783295,
"step": 29000
},
{
"epoch": 0.8239763141723926,
"grad_norm": 0.17870086431503296,
"learning_rate": 8.880788782749568e-05,
"loss": 2.0447,
"mean_token_accuracy": 0.5674924266338348,
"step": 29500
},
{
"epoch": 0.8379420144126026,
"grad_norm": 0.1745506078004837,
"learning_rate": 8.182503770739065e-05,
"loss": 2.041,
"mean_token_accuracy": 0.5678074882626534,
"step": 30000
},
{
"epoch": 0.8519077146528127,
"grad_norm": 0.18852059543132782,
"learning_rate": 7.487011898776604e-05,
"loss": 2.0407,
"mean_token_accuracy": 0.5682654000520706,
"step": 30500
},
{
"epoch": 0.8658734148930227,
"grad_norm": 0.15847346186637878,
"learning_rate": 6.788726886766103e-05,
"loss": 2.0414,
"mean_token_accuracy": 0.5677537434101104,
"step": 31000
},
{
"epoch": 0.8798391151332328,
"grad_norm": 0.1863761991262436,
"learning_rate": 6.093235014803642e-05,
"loss": 2.0484,
"mean_token_accuracy": 0.5661664333343506,
"step": 31500
},
{
"epoch": 0.8938048153734428,
"grad_norm": 0.18861085176467896,
"learning_rate": 5.39495000279314e-05,
"loss": 2.0404,
"mean_token_accuracy": 0.5677967172861099,
"step": 32000
},
{
"epoch": 0.9077705156136529,
"grad_norm": 0.17940880358219147,
"learning_rate": 4.69945813083068e-05,
"loss": 2.0334,
"mean_token_accuracy": 0.5684446477293968,
"step": 32500
},
{
"epoch": 0.9217362158538629,
"grad_norm": 0.17555660009384155,
"learning_rate": 4.0011731188201776e-05,
"loss": 2.0472,
"mean_token_accuracy": 0.5666785769164562,
"step": 33000
},
{
"epoch": 0.9357019160940729,
"grad_norm": 0.18865884840488434,
"learning_rate": 3.3056812468577174e-05,
"loss": 2.044,
"mean_token_accuracy": 0.5670678113102913,
"step": 33500
},
{
"epoch": 0.949667616334283,
"grad_norm": 0.20321083068847656,
"learning_rate": 2.6073962348472154e-05,
"loss": 2.0505,
"mean_token_accuracy": 0.5650117390453816,
"step": 34000
},
{
"epoch": 0.963633316574493,
"grad_norm": 0.17523406445980072,
"learning_rate": 1.911904362884755e-05,
"loss": 2.0555,
"mean_token_accuracy": 0.5649900923371315,
"step": 34500
},
{
"epoch": 0.9775990168147031,
"grad_norm": 0.1853921115398407,
"learning_rate": 1.213619350874253e-05,
"loss": 2.0464,
"mean_token_accuracy": 0.5669381264448166,
"step": 35000
},
{
"epoch": 0.9915647170549131,
"grad_norm": 0.20597019791603088,
"learning_rate": 5.181274789117926e-06,
"loss": 2.0481,
"mean_token_accuracy": 0.5671657482087612,
"step": 35500
},
{
"epoch": 1.0,
"mean_token_accuracy": 0.5673895427920171,
"step": 35802,
"total_flos": 6.223165951274123e+18,
"train_loss": 2.0618421919924823,
"train_runtime": 64377.845,
"train_samples_per_second": 17.796,
"train_steps_per_second": 0.556
}
],
"logging_steps": 500,
"max_steps": 35802,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.223165951274123e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}