{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 35802,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013965700240210044,
      "grad_norm": 4.268923282623291,
      "learning_rate": 0.0004930730126808558,
      "loss": 2.5844,
      "mean_token_accuracy": 0.5059588371738791,
      "step": 500
    },
    {
      "epoch": 0.027931400480420088,
      "grad_norm": 0.6783562302589417,
      "learning_rate": 0.0004860901625607508,
      "loss": 2.153,
      "mean_token_accuracy": 0.5498105692565441,
      "step": 1000
    },
    {
      "epoch": 0.04189710072063013,
      "grad_norm": 0.49954575300216675,
      "learning_rate": 0.0004791073124406458,
      "loss": 2.0782,
      "mean_token_accuracy": 0.5626471206247806,
      "step": 1500
    },
    {
      "epoch": 0.055862800960840175,
      "grad_norm": 0.501465916633606,
      "learning_rate": 0.0004721244623205407,
      "loss": 2.0761,
      "mean_token_accuracy": 0.5623862668275833,
      "step": 2000
    },
    {
      "epoch": 0.06982850120105022,
      "grad_norm": 0.52457594871521,
      "learning_rate": 0.0004651416122004357,
      "loss": 2.0543,
      "mean_token_accuracy": 0.5653762078285217,
      "step": 2500
    },
    {
      "epoch": 0.08379420144126026,
      "grad_norm": 0.366854727268219,
      "learning_rate": 0.0004581587620803307,
      "loss": 2.0699,
      "mean_token_accuracy": 0.5624443043172359,
      "step": 3000
    },
    {
      "epoch": 0.09775990168147031,
      "grad_norm": 0.33193984627723694,
      "learning_rate": 0.00045118987766046596,
      "loss": 2.0706,
      "mean_token_accuracy": 0.5632118558287621,
      "step": 3500
    },
    {
      "epoch": 0.11172560192168035,
      "grad_norm": 0.26177772879600525,
      "learning_rate": 0.0004442070275403609,
      "loss": 2.0744,
      "mean_token_accuracy": 0.564106923341751,
      "step": 4000
    },
    {
      "epoch": 0.1256913021618904,
      "grad_norm": 0.31001174449920654,
      "learning_rate": 0.0004372521088207363,
      "loss": 2.0789,
      "mean_token_accuracy": 0.5612107511162758,
      "step": 4500
    },
    {
      "epoch": 0.13965700240210044,
      "grad_norm": 0.26565852761268616,
      "learning_rate": 0.0004302692587006313,
      "loss": 2.0766,
      "mean_token_accuracy": 0.5622789396047592,
      "step": 5000
    },
    {
      "epoch": 0.15362270264231048,
      "grad_norm": 0.2479698359966278,
      "learning_rate": 0.00042331433998100666,
      "loss": 2.0658,
      "mean_token_accuracy": 0.5642604413330555,
      "step": 5500
    },
    {
      "epoch": 0.16758840288252053,
      "grad_norm": 0.2320714145898819,
      "learning_rate": 0.00041633148986090166,
      "loss": 2.0537,
      "mean_token_accuracy": 0.5658319992125034,
      "step": 6000
    },
    {
      "epoch": 0.18155410312273057,
      "grad_norm": 0.20908017456531525,
      "learning_rate": 0.00040937657114127705,
      "loss": 2.068,
      "mean_token_accuracy": 0.5633928556740284,
      "step": 6500
    },
    {
      "epoch": 0.19551980336294061,
      "grad_norm": 0.21336887776851654,
      "learning_rate": 0.000402393721021172,
      "loss": 2.056,
      "mean_token_accuracy": 0.5659000330269337,
      "step": 7000
    },
    {
      "epoch": 0.20948550360315066,
      "grad_norm": 0.18956658244132996,
      "learning_rate": 0.00039543880230154737,
      "loss": 2.0611,
      "mean_token_accuracy": 0.5643967908620834,
      "step": 7500
    },
    {
      "epoch": 0.2234512038433607,
      "grad_norm": 0.19688238203525543,
      "learning_rate": 0.00038845595218144237,
      "loss": 2.0571,
      "mean_token_accuracy": 0.5656557891070843,
      "step": 8000
    },
    {
      "epoch": 0.23741690408357075,
      "grad_norm": 0.17653051018714905,
      "learning_rate": 0.0003815010334618178,
      "loss": 2.0642,
      "mean_token_accuracy": 0.5636518878340722,
      "step": 8500
    },
    {
      "epoch": 0.2513826043237808,
      "grad_norm": 0.163355752825737,
      "learning_rate": 0.0003745181833417128,
      "loss": 2.0687,
      "mean_token_accuracy": 0.5622124362289905,
      "step": 9000
    },
    {
      "epoch": 0.26534830456399083,
      "grad_norm": 0.1576842963695526,
      "learning_rate": 0.0003675632646220882,
      "loss": 2.0537,
      "mean_token_accuracy": 0.5655075218081475,
      "step": 9500
    },
    {
      "epoch": 0.2793140048042009,
      "grad_norm": 0.1434595286846161,
      "learning_rate": 0.00036058041450198313,
      "loss": 2.0531,
      "mean_token_accuracy": 0.566190888762474,
      "step": 10000
    },
    {
      "epoch": 0.2932797050444109,
      "grad_norm": 0.15508811175823212,
      "learning_rate": 0.0003536254957823585,
      "loss": 2.0609,
      "mean_token_accuracy": 0.5647544565498829,
      "step": 10500
    },
    {
      "epoch": 0.30724540528462096,
      "grad_norm": 0.1619788408279419,
      "learning_rate": 0.0003466426456622535,
      "loss": 2.0572,
      "mean_token_accuracy": 0.5651431261301041,
      "step": 11000
    },
    {
      "epoch": 0.321211105524831,
      "grad_norm": 0.15186958014965057,
      "learning_rate": 0.0003396877269426289,
      "loss": 2.052,
      "mean_token_accuracy": 0.5652778272330761,
      "step": 11500
    },
    {
      "epoch": 0.33517680576504105,
      "grad_norm": 0.1533740907907486,
      "learning_rate": 0.0003327048768225239,
      "loss": 2.0507,
      "mean_token_accuracy": 0.5665202861726284,
      "step": 12000
    },
    {
      "epoch": 0.3491425060052511,
      "grad_norm": 0.15380869805812836,
      "learning_rate": 0.00032574995810289933,
      "loss": 2.0556,
      "mean_token_accuracy": 0.5652170441150666,
      "step": 12500
    },
    {
      "epoch": 0.36310820624546114,
      "grad_norm": 0.1550268828868866,
      "learning_rate": 0.0003187671079827943,
      "loss": 2.0509,
      "mean_token_accuracy": 0.565740670889616,
      "step": 13000
    },
    {
      "epoch": 0.3770739064856712,
      "grad_norm": 0.15138842165470123,
      "learning_rate": 0.00031181218926316966,
      "loss": 2.0576,
      "mean_token_accuracy": 0.5651396196782589,
      "step": 13500
    },
    {
      "epoch": 0.39103960672588123,
      "grad_norm": 0.14943508803844452,
      "learning_rate": 0.00030482933914306466,
      "loss": 2.0459,
      "mean_token_accuracy": 0.5667617973387241,
      "step": 14000
    },
    {
      "epoch": 0.40500530696609127,
      "grad_norm": 0.15654833614826202,
      "learning_rate": 0.00029787442042344004,
      "loss": 2.0682,
      "mean_token_accuracy": 0.5644841706454754,
      "step": 14500
    },
    {
      "epoch": 0.4189710072063013,
      "grad_norm": 0.1552565097808838,
      "learning_rate": 0.00029089157030333504,
      "loss": 2.0619,
      "mean_token_accuracy": 0.5638643040955067,
      "step": 15000
    },
    {
      "epoch": 0.43293670744651136,
      "grad_norm": 0.15082110464572906,
      "learning_rate": 0.0002839366515837104,
      "loss": 2.0546,
      "mean_token_accuracy": 0.565163453668356,
      "step": 15500
    },
    {
      "epoch": 0.4469024076867214,
      "grad_norm": 0.16357985138893127,
      "learning_rate": 0.00027695380146360536,
      "loss": 2.052,
      "mean_token_accuracy": 0.5660725966691971,
      "step": 16000
    },
    {
      "epoch": 0.46086810792693145,
      "grad_norm": 0.1474219560623169,
      "learning_rate": 0.00026999888274398075,
      "loss": 2.0514,
      "mean_token_accuracy": 0.5660765036344528,
      "step": 16500
    },
    {
      "epoch": 0.4748338081671415,
      "grad_norm": 0.15185120701789856,
      "learning_rate": 0.00026301603262387575,
      "loss": 2.0445,
      "mean_token_accuracy": 0.5662275057137013,
      "step": 17000
    },
    {
      "epoch": 0.48879950840735154,
      "grad_norm": 0.14625252783298492,
      "learning_rate": 0.0002560611139042512,
      "loss": 2.0506,
      "mean_token_accuracy": 0.567681826710701,
      "step": 17500
    },
    {
      "epoch": 0.5027652086475616,
      "grad_norm": 0.1625310778617859,
      "learning_rate": 0.0002490782637841461,
      "loss": 2.0475,
      "mean_token_accuracy": 0.565769705593586,
      "step": 18000
    },
    {
      "epoch": 0.5167309088877716,
      "grad_norm": 0.1645444631576538,
      "learning_rate": 0.00024212334506452154,
      "loss": 2.0541,
      "mean_token_accuracy": 0.5652133010923862,
      "step": 18500
    },
    {
      "epoch": 0.5306966091279817,
      "grad_norm": 0.15332843363285065,
      "learning_rate": 0.00023514049494441654,
      "loss": 2.0476,
      "mean_token_accuracy": 0.5676583781838417,
      "step": 19000
    },
    {
      "epoch": 0.5446623093681917,
      "grad_norm": 0.1857738345861435,
      "learning_rate": 0.00022818557622479192,
      "loss": 2.046,
      "mean_token_accuracy": 0.5666445840001106,
      "step": 19500
    },
    {
      "epoch": 0.5586280096084018,
      "grad_norm": 0.1527378261089325,
      "learning_rate": 0.0002212027261046869,
      "loss": 2.0469,
      "mean_token_accuracy": 0.5673894871473313,
      "step": 20000
    },
    {
      "epoch": 0.5725937098486118,
      "grad_norm": 0.15622954070568085,
      "learning_rate": 0.0002142478073850623,
      "loss": 2.0451,
      "mean_token_accuracy": 0.5668408484160901,
      "step": 20500
    },
    {
      "epoch": 0.5865594100888218,
      "grad_norm": 0.15915007889270782,
      "learning_rate": 0.00020726495726495727,
      "loss": 2.0423,
      "mean_token_accuracy": 0.5684662259817124,
      "step": 21000
    },
    {
      "epoch": 0.6005251103290319,
      "grad_norm": 0.17898404598236084,
      "learning_rate": 0.00020031003854533265,
      "loss": 2.0485,
      "mean_token_accuracy": 0.5655563974380493,
      "step": 21500
    },
    {
      "epoch": 0.6144908105692419,
      "grad_norm": 0.18496471643447876,
      "learning_rate": 0.00019332718842522765,
      "loss": 2.0495,
      "mean_token_accuracy": 0.5659684043526649,
      "step": 22000
    },
    {
      "epoch": 0.628456510809452,
      "grad_norm": 0.15914028882980347,
      "learning_rate": 0.00018637226970560304,
      "loss": 2.0587,
      "mean_token_accuracy": 0.5645939151346684,
      "step": 22500
    },
    {
      "epoch": 0.642422211049662,
      "grad_norm": 0.1719997376203537,
      "learning_rate": 0.000179389419585498,
      "loss": 2.0522,
      "mean_token_accuracy": 0.5654012953042984,
      "step": 23000
    },
    {
      "epoch": 0.6563879112898721,
      "grad_norm": 0.1447959542274475,
      "learning_rate": 0.00017243450086587342,
      "loss": 2.0376,
      "mean_token_accuracy": 0.5688524658381939,
      "step": 23500
    },
    {
      "epoch": 0.6703536115300821,
      "grad_norm": 0.1650005280971527,
      "learning_rate": 0.00016545165074576841,
      "loss": 2.0554,
      "mean_token_accuracy": 0.5649112530052662,
      "step": 24000
    },
    {
      "epoch": 0.6843193117702921,
      "grad_norm": 0.1643020361661911,
      "learning_rate": 0.0001584967320261438,
      "loss": 2.0369,
      "mean_token_accuracy": 0.5688672766685486,
      "step": 24500
    },
    {
      "epoch": 0.6982850120105022,
      "grad_norm": 0.17811213433742523,
      "learning_rate": 0.00015151388190603877,
      "loss": 2.0551,
      "mean_token_accuracy": 0.5658198243677616,
      "step": 25000
    },
    {
      "epoch": 0.7122507122507122,
      "grad_norm": 0.18822260200977325,
      "learning_rate": 0.00014455896318641418,
      "loss": 2.0391,
      "mean_token_accuracy": 0.5678052612841129,
      "step": 25500
    },
    {
      "epoch": 0.7262164124909223,
      "grad_norm": 0.1553669422864914,
      "learning_rate": 0.00013757611306630915,
      "loss": 2.0481,
      "mean_token_accuracy": 0.5658735463917255,
      "step": 26000
    },
    {
      "epoch": 0.7401821127311323,
      "grad_norm": 0.1644522100687027,
      "learning_rate": 0.00013062119434668453,
      "loss": 2.0443,
      "mean_token_accuracy": 0.5666062987446785,
      "step": 26500
    },
    {
      "epoch": 0.7541478129713424,
      "grad_norm": 0.18426066637039185,
      "learning_rate": 0.00012363834422657953,
      "loss": 2.0446,
      "mean_token_accuracy": 0.5659443159997464,
      "step": 27000
    },
    {
      "epoch": 0.7681135132115524,
      "grad_norm": 0.19535085558891296,
      "learning_rate": 0.00011668342550695491,
      "loss": 2.0498,
      "mean_token_accuracy": 0.5656883924603462,
      "step": 27500
    },
    {
      "epoch": 0.7820792134517625,
      "grad_norm": 0.1869782954454422,
      "learning_rate": 0.0001097005753868499,
      "loss": 2.0352,
      "mean_token_accuracy": 0.5688391443789005,
      "step": 28000
    },
    {
      "epoch": 0.7960449136919725,
      "grad_norm": 0.17099939286708832,
      "learning_rate": 0.0001027456566672253,
      "loss": 2.0505,
      "mean_token_accuracy": 0.5657015817165375,
      "step": 28500
    },
    {
      "epoch": 0.8100106139321825,
      "grad_norm": 0.17975910007953644,
      "learning_rate": 9.576280654712028e-05,
      "loss": 2.0318,
      "mean_token_accuracy": 0.5694990956783295,
      "step": 29000
    },
    {
      "epoch": 0.8239763141723926,
      "grad_norm": 0.17870086431503296,
      "learning_rate": 8.880788782749568e-05,
      "loss": 2.0447,
      "mean_token_accuracy": 0.5674924266338348,
      "step": 29500
    },
    {
      "epoch": 0.8379420144126026,
      "grad_norm": 0.1745506078004837,
      "learning_rate": 8.182503770739065e-05,
      "loss": 2.041,
      "mean_token_accuracy": 0.5678074882626534,
      "step": 30000
    },
    {
      "epoch": 0.8519077146528127,
      "grad_norm": 0.18852059543132782,
      "learning_rate": 7.487011898776604e-05,
      "loss": 2.0407,
      "mean_token_accuracy": 0.5682654000520706,
      "step": 30500
    },
    {
      "epoch": 0.8658734148930227,
      "grad_norm": 0.15847346186637878,
      "learning_rate": 6.788726886766103e-05,
      "loss": 2.0414,
      "mean_token_accuracy": 0.5677537434101104,
      "step": 31000
    },
    {
      "epoch": 0.8798391151332328,
      "grad_norm": 0.1863761991262436,
      "learning_rate": 6.093235014803642e-05,
      "loss": 2.0484,
      "mean_token_accuracy": 0.5661664333343506,
      "step": 31500
    },
    {
      "epoch": 0.8938048153734428,
      "grad_norm": 0.18861085176467896,
      "learning_rate": 5.39495000279314e-05,
      "loss": 2.0404,
      "mean_token_accuracy": 0.5677967172861099,
      "step": 32000
    },
    {
      "epoch": 0.9077705156136529,
      "grad_norm": 0.17940880358219147,
      "learning_rate": 4.69945813083068e-05,
      "loss": 2.0334,
      "mean_token_accuracy": 0.5684446477293968,
      "step": 32500
    },
    {
      "epoch": 0.9217362158538629,
      "grad_norm": 0.17555660009384155,
      "learning_rate": 4.0011731188201776e-05,
      "loss": 2.0472,
      "mean_token_accuracy": 0.5666785769164562,
      "step": 33000
    },
    {
      "epoch": 0.9357019160940729,
      "grad_norm": 0.18865884840488434,
      "learning_rate": 3.3056812468577174e-05,
      "loss": 2.044,
      "mean_token_accuracy": 0.5670678113102913,
      "step": 33500
    },
    {
      "epoch": 0.949667616334283,
      "grad_norm": 0.20321083068847656,
      "learning_rate": 2.6073962348472154e-05,
      "loss": 2.0505,
      "mean_token_accuracy": 0.5650117390453816,
      "step": 34000
    },
    {
      "epoch": 0.963633316574493,
      "grad_norm": 0.17523406445980072,
      "learning_rate": 1.911904362884755e-05,
      "loss": 2.0555,
      "mean_token_accuracy": 0.5649900923371315,
      "step": 34500
    },
    {
      "epoch": 0.9775990168147031,
      "grad_norm": 0.1853921115398407,
      "learning_rate": 1.213619350874253e-05,
      "loss": 2.0464,
      "mean_token_accuracy": 0.5669381264448166,
      "step": 35000
    },
    {
      "epoch": 0.9915647170549131,
      "grad_norm": 0.20597019791603088,
      "learning_rate": 5.181274789117926e-06,
      "loss": 2.0481,
      "mean_token_accuracy": 0.5671657482087612,
      "step": 35500
    },
    {
      "epoch": 1.0,
      "mean_token_accuracy": 0.5673895427920171,
      "step": 35802,
      "total_flos": 6.223165951274123e+18,
      "train_loss": 2.0618421919924823,
      "train_runtime": 64377.845,
      "train_samples_per_second": 17.796,
      "train_steps_per_second": 0.556
    }
  ],
  "logging_steps": 500,
  "max_steps": 35802,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.223165951274123e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}