|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.089829781147189, |
|
"eval_steps": 400, |
|
"global_step": 10800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.014304105278214848, |
|
"grad_norm": 1.6012784676437102, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 4.4038, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.028608210556429696, |
|
"grad_norm": 0.81484251583879, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 3.6036, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04291231583464454, |
|
"grad_norm": 3.9762696099154904, |
|
"learning_rate": 5e-06, |
|
"loss": 3.0207, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05721642111285939, |
|
"grad_norm": 3.057952660211588, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 2.4324, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07152052639107424, |
|
"grad_norm": 2.092719622855296, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 2.222, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.08582463166928908, |
|
"grad_norm": 6.08825143706115, |
|
"learning_rate": 1e-05, |
|
"loss": 2.1021, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.10012873694750393, |
|
"grad_norm": 2.493878945601314, |
|
"learning_rate": 9.999953760295448e-06, |
|
"loss": 1.9831, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.11443284222571878, |
|
"grad_norm": 4.462960292469778, |
|
"learning_rate": 9.999815042132062e-06, |
|
"loss": 1.917, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11443284222571878, |
|
"eval_loss": 1.808639645576477, |
|
"eval_runtime": 14.2096, |
|
"eval_samples_per_second": 70.375, |
|
"eval_steps_per_second": 2.252, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.12873694750393364, |
|
"grad_norm": 2.038795534490349, |
|
"learning_rate": 9.999583848360633e-06, |
|
"loss": 1.8614, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1430410527821485, |
|
"grad_norm": 2.259377386606669, |
|
"learning_rate": 9.999260183732424e-06, |
|
"loss": 1.8105, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1573451580603633, |
|
"grad_norm": 1.6457423711505388, |
|
"learning_rate": 9.998844054899058e-06, |
|
"loss": 1.7759, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.17164926333857816, |
|
"grad_norm": 2.6198123977173555, |
|
"learning_rate": 9.998335470412393e-06, |
|
"loss": 1.7508, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.185953368616793, |
|
"grad_norm": 1.6377415784196128, |
|
"learning_rate": 9.997734440724333e-06, |
|
"loss": 1.7156, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.20025747389500786, |
|
"grad_norm": 3.5293148754159285, |
|
"learning_rate": 9.997040978186633e-06, |
|
"loss": 1.7015, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.21456157917322272, |
|
"grad_norm": 2.3013282525925263, |
|
"learning_rate": 9.996255097050624e-06, |
|
"loss": 1.6782, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.22886568445143757, |
|
"grad_norm": 2.428974082500653, |
|
"learning_rate": 9.995376813466934e-06, |
|
"loss": 1.66, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.22886568445143757, |
|
"eval_loss": 1.5992412567138672, |
|
"eval_runtime": 14.0538, |
|
"eval_samples_per_second": 71.155, |
|
"eval_steps_per_second": 2.277, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.24316978972965242, |
|
"grad_norm": 2.9094373795416506, |
|
"learning_rate": 9.994406145485151e-06, |
|
"loss": 1.6399, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.25747389500786727, |
|
"grad_norm": 1.5133813561921106, |
|
"learning_rate": 9.993343113053454e-06, |
|
"loss": 1.626, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2717780002860821, |
|
"grad_norm": 1.3663105185649191, |
|
"learning_rate": 9.992187738018203e-06, |
|
"loss": 1.6099, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.286082105564297, |
|
"grad_norm": 1.3144291853877879, |
|
"learning_rate": 9.99094004412348e-06, |
|
"loss": 1.5968, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3003862108425118, |
|
"grad_norm": 1.8770146895064077, |
|
"learning_rate": 9.989600057010625e-06, |
|
"loss": 1.5754, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3146903161207266, |
|
"grad_norm": 1.8478210167954083, |
|
"learning_rate": 9.988167804217682e-06, |
|
"loss": 1.5711, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3289944213989415, |
|
"grad_norm": 1.5949372088951037, |
|
"learning_rate": 9.986643315178848e-06, |
|
"loss": 1.5557, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.3432985266771563, |
|
"grad_norm": 1.8431659408457755, |
|
"learning_rate": 9.98502662122387e-06, |
|
"loss": 1.5572, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3432985266771563, |
|
"eval_loss": 1.50032639503479, |
|
"eval_runtime": 14.0776, |
|
"eval_samples_per_second": 71.035, |
|
"eval_steps_per_second": 2.273, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3576026319553712, |
|
"grad_norm": 1.3869607567913713, |
|
"learning_rate": 9.983317755577392e-06, |
|
"loss": 1.5363, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.371906737233586, |
|
"grad_norm": 1.4514189742887267, |
|
"learning_rate": 9.981516753358274e-06, |
|
"loss": 1.5358, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3862108425118009, |
|
"grad_norm": 1.4333267526235296, |
|
"learning_rate": 9.979623651578881e-06, |
|
"loss": 1.5141, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.40051494779001573, |
|
"grad_norm": 0.8580367772458624, |
|
"learning_rate": 9.977638489144308e-06, |
|
"loss": 1.523, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.4148190530682306, |
|
"grad_norm": 0.9460440332154582, |
|
"learning_rate": 9.975561306851585e-06, |
|
"loss": 1.5175, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.42912315834644543, |
|
"grad_norm": 1.376203229447874, |
|
"learning_rate": 9.973392147388847e-06, |
|
"loss": 1.5126, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4434272636246603, |
|
"grad_norm": 1.5041770784794857, |
|
"learning_rate": 9.971131055334445e-06, |
|
"loss": 1.4977, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.45773136890287514, |
|
"grad_norm": 1.095703863839786, |
|
"learning_rate": 9.968778077156035e-06, |
|
"loss": 1.4877, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.45773136890287514, |
|
"eval_loss": 1.4638383388519287, |
|
"eval_runtime": 14.0468, |
|
"eval_samples_per_second": 71.191, |
|
"eval_steps_per_second": 2.278, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.47203547418109, |
|
"grad_norm": 1.1967770971190828, |
|
"learning_rate": 9.966333261209625e-06, |
|
"loss": 1.4941, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.48633957945930484, |
|
"grad_norm": 0.7764934991914475, |
|
"learning_rate": 9.96379665773858e-06, |
|
"loss": 1.4943, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5006436847375196, |
|
"grad_norm": 1.1957507140564159, |
|
"learning_rate": 9.961168318872583e-06, |
|
"loss": 1.4834, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.5149477900157345, |
|
"grad_norm": 0.891291786132535, |
|
"learning_rate": 9.958448298626576e-06, |
|
"loss": 1.4766, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5292518952939493, |
|
"grad_norm": 0.9430107046686556, |
|
"learning_rate": 9.95563665289964e-06, |
|
"loss": 1.4659, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.5435560005721642, |
|
"grad_norm": 1.3583446842191815, |
|
"learning_rate": 9.952733439473847e-06, |
|
"loss": 1.4681, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.557860105850379, |
|
"grad_norm": 1.010261006024344, |
|
"learning_rate": 9.94973871801308e-06, |
|
"loss": 1.4667, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.572164211128594, |
|
"grad_norm": 0.8494941104833196, |
|
"learning_rate": 9.946652550061798e-06, |
|
"loss": 1.4453, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.572164211128594, |
|
"eval_loss": 1.4287511110305786, |
|
"eval_runtime": 14.0255, |
|
"eval_samples_per_second": 71.299, |
|
"eval_steps_per_second": 2.282, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5864683164068087, |
|
"grad_norm": 0.7812469708103134, |
|
"learning_rate": 9.943474999043775e-06, |
|
"loss": 1.4496, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.6007724216850236, |
|
"grad_norm": 0.7254104161544093, |
|
"learning_rate": 9.9402061302608e-06, |
|
"loss": 1.4462, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6150765269632384, |
|
"grad_norm": 1.1402597738223317, |
|
"learning_rate": 9.93684601089133e-06, |
|
"loss": 1.4402, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.6293806322414532, |
|
"grad_norm": 1.0636750138637265, |
|
"learning_rate": 9.933394709989109e-06, |
|
"loss": 1.4514, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6436847375196681, |
|
"grad_norm": 0.6340325583537392, |
|
"learning_rate": 9.92985229848175e-06, |
|
"loss": 1.4376, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.657988842797883, |
|
"grad_norm": 1.3226650510062645, |
|
"learning_rate": 9.926218849169284e-06, |
|
"loss": 1.4404, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6722929480760979, |
|
"grad_norm": 0.9023729708460776, |
|
"learning_rate": 9.922494436722653e-06, |
|
"loss": 1.435, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.6865970533543126, |
|
"grad_norm": 1.1170660045757717, |
|
"learning_rate": 9.91867913768218e-06, |
|
"loss": 1.4275, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6865970533543126, |
|
"eval_loss": 1.4157905578613281, |
|
"eval_runtime": 14.0561, |
|
"eval_samples_per_second": 71.143, |
|
"eval_steps_per_second": 2.277, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.7009011586325276, |
|
"grad_norm": 1.164925228192199, |
|
"learning_rate": 9.914773030456001e-06, |
|
"loss": 1.4238, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.7152052639107424, |
|
"grad_norm": 0.8519530167823217, |
|
"learning_rate": 9.910776195318448e-06, |
|
"loss": 1.4347, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7295093691889573, |
|
"grad_norm": 0.7139589978182425, |
|
"learning_rate": 9.906688714408396e-06, |
|
"loss": 1.4306, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.743813474467172, |
|
"grad_norm": 0.8653282057170465, |
|
"learning_rate": 9.902510671727583e-06, |
|
"loss": 1.4229, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.758117579745387, |
|
"grad_norm": 0.8247347491114752, |
|
"learning_rate": 9.898242153138882e-06, |
|
"loss": 1.4118, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.7724216850236018, |
|
"grad_norm": 1.0924147996236788, |
|
"learning_rate": 9.89388324636453e-06, |
|
"loss": 1.4322, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7867257903018167, |
|
"grad_norm": 0.842516122122594, |
|
"learning_rate": 9.889434040984333e-06, |
|
"loss": 1.4101, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.8010298955800315, |
|
"grad_norm": 0.8063486362804477, |
|
"learning_rate": 9.88489462843382e-06, |
|
"loss": 1.4191, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8010298955800315, |
|
"eval_loss": 1.4116355180740356, |
|
"eval_runtime": 13.9779, |
|
"eval_samples_per_second": 71.542, |
|
"eval_steps_per_second": 2.289, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8153340008582464, |
|
"grad_norm": 0.6258848452847008, |
|
"learning_rate": 9.880265102002369e-06, |
|
"loss": 1.4001, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.8296381061364612, |
|
"grad_norm": 0.726517642303323, |
|
"learning_rate": 9.875545556831283e-06, |
|
"loss": 1.4086, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.843942211414676, |
|
"grad_norm": 0.6713970013254277, |
|
"learning_rate": 9.870736089911836e-06, |
|
"loss": 1.4073, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.8582463166928909, |
|
"grad_norm": 0.6148598667666052, |
|
"learning_rate": 9.865836800083291e-06, |
|
"loss": 1.4093, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8725504219711057, |
|
"grad_norm": 0.5359562950631023, |
|
"learning_rate": 9.860847788030852e-06, |
|
"loss": 1.4017, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.8868545272493206, |
|
"grad_norm": 0.6194549549607876, |
|
"learning_rate": 9.855769156283604e-06, |
|
"loss": 1.4196, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.9011586325275354, |
|
"grad_norm": 0.7870838887793197, |
|
"learning_rate": 9.850601009212408e-06, |
|
"loss": 1.4039, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.9154627378057503, |
|
"grad_norm": 0.8348797495331252, |
|
"learning_rate": 9.845343453027747e-06, |
|
"loss": 1.4092, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9154627378057503, |
|
"eval_loss": 1.3961894512176514, |
|
"eval_runtime": 14.0237, |
|
"eval_samples_per_second": 71.308, |
|
"eval_steps_per_second": 2.282, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9297668430839651, |
|
"grad_norm": 0.8890086654120082, |
|
"learning_rate": 9.839996595777552e-06, |
|
"loss": 1.3991, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.94407094836218, |
|
"grad_norm": 0.8338244522175184, |
|
"learning_rate": 9.83456054734498e-06, |
|
"loss": 1.3939, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9583750536403948, |
|
"grad_norm": 0.667534745389414, |
|
"learning_rate": 9.829035419446156e-06, |
|
"loss": 1.4052, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.9726791589186097, |
|
"grad_norm": 0.830996338803645, |
|
"learning_rate": 9.823421325627865e-06, |
|
"loss": 1.408, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.9869832641968245, |
|
"grad_norm": 0.752895350030203, |
|
"learning_rate": 9.81771838126524e-06, |
|
"loss": 1.3927, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.0012873694750393, |
|
"grad_norm": 0.6022807633216317, |
|
"learning_rate": 9.811926703559374e-06, |
|
"loss": 1.3947, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.0155914747532542, |
|
"grad_norm": 0.7757999852306153, |
|
"learning_rate": 9.806046411534916e-06, |
|
"loss": 1.3613, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 1.029895580031469, |
|
"grad_norm": 0.6991186658573486, |
|
"learning_rate": 9.800077626037633e-06, |
|
"loss": 1.3805, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.029895580031469, |
|
"eval_loss": 1.386795163154602, |
|
"eval_runtime": 13.9668, |
|
"eval_samples_per_second": 71.598, |
|
"eval_steps_per_second": 2.291, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.044199685309684, |
|
"grad_norm": 0.6304272914508194, |
|
"learning_rate": 9.794020469731915e-06, |
|
"loss": 1.3772, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.0585037905878987, |
|
"grad_norm": 0.6127596406721845, |
|
"learning_rate": 9.787875067098257e-06, |
|
"loss": 1.3695, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.0728078958661136, |
|
"grad_norm": 0.5752396229133312, |
|
"learning_rate": 9.781641544430703e-06, |
|
"loss": 1.3737, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.0871120011443285, |
|
"grad_norm": 0.8167932197181069, |
|
"learning_rate": 9.775320029834255e-06, |
|
"loss": 1.3679, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.1014161064225432, |
|
"grad_norm": 0.7493986062078165, |
|
"learning_rate": 9.76891065322223e-06, |
|
"loss": 1.3686, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.115720211700758, |
|
"grad_norm": 0.6896574555563986, |
|
"learning_rate": 9.762413546313597e-06, |
|
"loss": 1.3688, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.130024316978973, |
|
"grad_norm": 0.54479225381951, |
|
"learning_rate": 9.755828842630269e-06, |
|
"loss": 1.3577, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.144328422257188, |
|
"grad_norm": 0.8631407967474234, |
|
"learning_rate": 9.749156677494357e-06, |
|
"loss": 1.3791, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.144328422257188, |
|
"eval_loss": 1.3818904161453247, |
|
"eval_runtime": 14.0228, |
|
"eval_samples_per_second": 71.312, |
|
"eval_steps_per_second": 2.282, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.1586325275354026, |
|
"grad_norm": 0.6269351505110898, |
|
"learning_rate": 9.742397188025394e-06, |
|
"loss": 1.3672, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.1729366328136175, |
|
"grad_norm": 0.5964977170501943, |
|
"learning_rate": 9.735550513137513e-06, |
|
"loss": 1.3579, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.1872407380918324, |
|
"grad_norm": 0.6696040499572795, |
|
"learning_rate": 9.728616793536588e-06, |
|
"loss": 1.3704, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.2015448433700473, |
|
"grad_norm": 0.7153959218092929, |
|
"learning_rate": 9.721596171717352e-06, |
|
"loss": 1.3631, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.215848948648262, |
|
"grad_norm": 0.8228253318299735, |
|
"learning_rate": 9.714488791960463e-06, |
|
"loss": 1.3643, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.230153053926477, |
|
"grad_norm": 0.6427955816989828, |
|
"learning_rate": 9.707294800329536e-06, |
|
"loss": 1.3608, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.2444571592046918, |
|
"grad_norm": 0.6438118616712295, |
|
"learning_rate": 9.700014344668152e-06, |
|
"loss": 1.3564, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.2587612644829065, |
|
"grad_norm": 0.5732058961632965, |
|
"learning_rate": 9.692647574596803e-06, |
|
"loss": 1.3623, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.2587612644829065, |
|
"eval_loss": 1.3667371273040771, |
|
"eval_runtime": 14.0711, |
|
"eval_samples_per_second": 71.068, |
|
"eval_steps_per_second": 2.274, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.2732084108139037, |
|
"grad_norm": 0.5434874117890776, |
|
"learning_rate": 9.685194641509837e-06, |
|
"loss": 1.3592, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 1.2875125160921184, |
|
"grad_norm": 0.8067573948854371, |
|
"learning_rate": 9.677655698572326e-06, |
|
"loss": 1.3571, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.3018166213703333, |
|
"grad_norm": 0.6211280738341731, |
|
"learning_rate": 9.670030900716941e-06, |
|
"loss": 1.3577, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 1.3161207266485482, |
|
"grad_norm": 0.47127980997402974, |
|
"learning_rate": 9.662320404640743e-06, |
|
"loss": 1.3497, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.3304248319267629, |
|
"grad_norm": 0.6437090365289073, |
|
"learning_rate": 9.654524368801982e-06, |
|
"loss": 1.3611, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 1.3447289372049778, |
|
"grad_norm": 0.4706214878937702, |
|
"learning_rate": 9.646642953416835e-06, |
|
"loss": 1.3596, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.3590330424831927, |
|
"grad_norm": 0.4433218616654087, |
|
"learning_rate": 9.638676320456109e-06, |
|
"loss": 1.3612, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 1.3733371477614076, |
|
"grad_norm": 0.6227834199361844, |
|
"learning_rate": 9.630624633641918e-06, |
|
"loss": 1.3487, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.3733371477614076, |
|
"eval_loss": 1.3724805116653442, |
|
"eval_runtime": 13.958, |
|
"eval_samples_per_second": 71.643, |
|
"eval_steps_per_second": 2.293, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.3876412530396225, |
|
"grad_norm": 0.5615209752207829, |
|
"learning_rate": 9.622488058444313e-06, |
|
"loss": 1.3416, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 1.4019453583178372, |
|
"grad_norm": 0.4593448830072353, |
|
"learning_rate": 9.614266762077891e-06, |
|
"loss": 1.3509, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.416249463596052, |
|
"grad_norm": 0.5260361200473717, |
|
"learning_rate": 9.605960913498342e-06, |
|
"loss": 1.3504, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 1.430553568874267, |
|
"grad_norm": 0.4949775762320425, |
|
"learning_rate": 9.597570683398996e-06, |
|
"loss": 1.3608, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.4448576741524817, |
|
"grad_norm": 0.7134992551375888, |
|
"learning_rate": 9.5890962442073e-06, |
|
"loss": 1.3456, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 1.4591617794306966, |
|
"grad_norm": 0.749997828555375, |
|
"learning_rate": 9.580537770081285e-06, |
|
"loss": 1.3413, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.4734658847089115, |
|
"grad_norm": 0.5312330906616294, |
|
"learning_rate": 9.57189543690598e-06, |
|
"loss": 1.3507, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.4877699899871262, |
|
"grad_norm": 0.5913338284525619, |
|
"learning_rate": 9.563169422289798e-06, |
|
"loss": 1.3386, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.4877699899871262, |
|
"eval_loss": 1.359579086303711, |
|
"eval_runtime": 14.046, |
|
"eval_samples_per_second": 71.195, |
|
"eval_steps_per_second": 2.278, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.5020740952653413, |
|
"grad_norm": 0.63516444597305, |
|
"learning_rate": 9.554359905560887e-06, |
|
"loss": 1.3412, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.516378200543556, |
|
"grad_norm": 0.4411581484928778, |
|
"learning_rate": 9.54546706776345e-06, |
|
"loss": 1.3505, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.530682305821771, |
|
"grad_norm": 0.403266190389094, |
|
"learning_rate": 9.536491091654018e-06, |
|
"loss": 1.3418, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.5449864110999858, |
|
"grad_norm": 0.4887790997121695, |
|
"learning_rate": 9.527432161697696e-06, |
|
"loss": 1.352, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.5592905163782005, |
|
"grad_norm": 0.43803734390526294, |
|
"learning_rate": 9.518290464064365e-06, |
|
"loss": 1.3374, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.5735946216564154, |
|
"grad_norm": 0.4477296911829739, |
|
"learning_rate": 9.509066186624872e-06, |
|
"loss": 1.3362, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.5878987269346303, |
|
"grad_norm": 0.4849220779673394, |
|
"learning_rate": 9.499759518947156e-06, |
|
"loss": 1.3463, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.602202832212845, |
|
"grad_norm": 0.43453154893881496, |
|
"learning_rate": 9.490370652292357e-06, |
|
"loss": 1.3342, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.602202832212845, |
|
"eval_loss": 1.3611611127853394, |
|
"eval_runtime": 13.9617, |
|
"eval_samples_per_second": 71.625, |
|
"eval_steps_per_second": 2.292, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.61650693749106, |
|
"grad_norm": 0.4973975633500145, |
|
"learning_rate": 9.480899779610883e-06, |
|
"loss": 1.3557, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 1.6308110427692748, |
|
"grad_norm": 0.8646218397904073, |
|
"learning_rate": 9.471347095538448e-06, |
|
"loss": 1.332, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.6451151480474895, |
|
"grad_norm": 0.4766662524894494, |
|
"learning_rate": 9.461712796392067e-06, |
|
"loss": 1.3425, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 1.6594192533257046, |
|
"grad_norm": 0.43492118267166, |
|
"learning_rate": 9.45199708016603e-06, |
|
"loss": 1.3366, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.6737233586039193, |
|
"grad_norm": 0.7281191349195701, |
|
"learning_rate": 9.442200146527824e-06, |
|
"loss": 1.3405, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 1.6880274638821342, |
|
"grad_norm": 0.5059870049803485, |
|
"learning_rate": 9.432322196814032e-06, |
|
"loss": 1.336, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.7023315691603491, |
|
"grad_norm": 0.48815713123329457, |
|
"learning_rate": 9.422363434026205e-06, |
|
"loss": 1.3331, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 1.7166356744385638, |
|
"grad_norm": 0.4825656212310282, |
|
"learning_rate": 9.41232406282667e-06, |
|
"loss": 1.3382, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.7166356744385638, |
|
"eval_loss": 1.356214165687561, |
|
"eval_runtime": 13.9939, |
|
"eval_samples_per_second": 71.46, |
|
"eval_steps_per_second": 2.287, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.7309397797167787, |
|
"grad_norm": 0.7522246864779827, |
|
"learning_rate": 9.402204289534344e-06, |
|
"loss": 1.3239, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 1.7452438849949936, |
|
"grad_norm": 0.48984350066891824, |
|
"learning_rate": 9.392004322120484e-06, |
|
"loss": 1.3237, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.7595479902732083, |
|
"grad_norm": 0.544930574118496, |
|
"learning_rate": 9.381724370204414e-06, |
|
"loss": 1.3241, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 1.7738520955514234, |
|
"grad_norm": 0.5482222598847393, |
|
"learning_rate": 9.371364645049216e-06, |
|
"loss": 1.3313, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.788156200829638, |
|
"grad_norm": 0.46339705172698076, |
|
"learning_rate": 9.360925359557397e-06, |
|
"loss": 1.3256, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 1.8024603061078528, |
|
"grad_norm": 0.5277875338001611, |
|
"learning_rate": 9.3504067282665e-06, |
|
"loss": 1.3503, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.816764411386068, |
|
"grad_norm": 0.5539059109504075, |
|
"learning_rate": 9.339808967344701e-06, |
|
"loss": 1.3368, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 1.8310685166642826, |
|
"grad_norm": 0.5119187022621997, |
|
"learning_rate": 9.329132294586374e-06, |
|
"loss": 1.3257, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.8310685166642826, |
|
"eval_loss": 1.348954200744629, |
|
"eval_runtime": 14.1165, |
|
"eval_samples_per_second": 70.839, |
|
"eval_steps_per_second": 2.267, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.8453726219424975, |
|
"grad_norm": 0.4572643729622861, |
|
"learning_rate": 9.318376929407606e-06, |
|
"loss": 1.3296, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 1.8596767272207124, |
|
"grad_norm": 0.41441721606603, |
|
"learning_rate": 9.307543092841688e-06, |
|
"loss": 1.3306, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.873980832498927, |
|
"grad_norm": 0.4437842388580668, |
|
"learning_rate": 9.296631007534576e-06, |
|
"loss": 1.3219, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 1.888284937777142, |
|
"grad_norm": 0.668469538481535, |
|
"learning_rate": 9.285640897740316e-06, |
|
"loss": 1.3201, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.902589043055357, |
|
"grad_norm": 0.4476992280694945, |
|
"learning_rate": 9.27457298931643e-06, |
|
"loss": 1.3279, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 1.9168931483335716, |
|
"grad_norm": 0.8609307931818154, |
|
"learning_rate": 9.263427509719287e-06, |
|
"loss": 1.3248, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.9311972536117867, |
|
"grad_norm": 0.48764755574202223, |
|
"learning_rate": 9.252204687999401e-06, |
|
"loss": 1.3293, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 1.9455013588900014, |
|
"grad_norm": 0.7588730534632143, |
|
"learning_rate": 9.240904754796767e-06, |
|
"loss": 1.3338, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.9455013588900014, |
|
"eval_loss": 1.3457790613174438, |
|
"eval_runtime": 14.0391, |
|
"eval_samples_per_second": 71.229, |
|
"eval_steps_per_second": 2.279, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.9598054641682163, |
|
"grad_norm": 0.47728013357161364, |
|
"learning_rate": 9.22952794233608e-06, |
|
"loss": 1.328, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 1.9741095694464312, |
|
"grad_norm": 0.4865065014657903, |
|
"learning_rate": 9.218074484421977e-06, |
|
"loss": 1.3329, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.988413674724646, |
|
"grad_norm": 0.46233352981690246, |
|
"learning_rate": 9.206544616434249e-06, |
|
"loss": 1.3193, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 2.0027177800028606, |
|
"grad_norm": 0.4748345037256569, |
|
"learning_rate": 9.194938575322973e-06, |
|
"loss": 1.3137, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.0170218852810757, |
|
"grad_norm": 0.3961349395717629, |
|
"learning_rate": 9.183256599603672e-06, |
|
"loss": 1.2981, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 2.0313259905592904, |
|
"grad_norm": 0.6284979836068443, |
|
"learning_rate": 9.171498929352388e-06, |
|
"loss": 1.2961, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.0456300958375055, |
|
"grad_norm": 0.6558610249594138, |
|
"learning_rate": 9.159665806200766e-06, |
|
"loss": 1.2913, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 2.0599342011157202, |
|
"grad_norm": 0.45514976033924853, |
|
"learning_rate": 9.147757473331082e-06, |
|
"loss": 1.2906, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.0599342011157202, |
|
"eval_loss": 1.3430439233779907, |
|
"eval_runtime": 14.0262, |
|
"eval_samples_per_second": 71.295, |
|
"eval_steps_per_second": 2.281, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.074238306393935, |
|
"grad_norm": 0.5426461545993814, |
|
"learning_rate": 9.135774175471244e-06, |
|
"loss": 1.3004, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 2.08854241167215, |
|
"grad_norm": 0.6005516516830625, |
|
"learning_rate": 9.123716158889765e-06, |
|
"loss": 1.292, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.1028465169503647, |
|
"grad_norm": 0.9639752009743953, |
|
"learning_rate": 9.111583671390697e-06, |
|
"loss": 1.2862, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 2.1171506222285794, |
|
"grad_norm": 0.4488649957289315, |
|
"learning_rate": 9.09937696230855e-06, |
|
"loss": 1.3036, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 2.1314547275067945, |
|
"grad_norm": 0.7721978784000721, |
|
"learning_rate": 9.087096282503152e-06, |
|
"loss": 1.2901, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 2.1457588327850092, |
|
"grad_norm": 0.4782857255612778, |
|
"learning_rate": 9.074741884354507e-06, |
|
"loss": 1.2946, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.1600629380632244, |
|
"grad_norm": 0.43220427000612477, |
|
"learning_rate": 9.062314021757603e-06, |
|
"loss": 1.2921, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 2.174367043341439, |
|
"grad_norm": 0.5795623059587878, |
|
"learning_rate": 9.049812950117191e-06, |
|
"loss": 1.279, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.174367043341439, |
|
"eval_loss": 1.3394057750701904, |
|
"eval_runtime": 14.0446, |
|
"eval_samples_per_second": 71.202, |
|
"eval_steps_per_second": 2.278, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.1886711486196537, |
|
"grad_norm": 0.5713295331254999, |
|
"learning_rate": 9.037238926342544e-06, |
|
"loss": 1.2909, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 2.202975253897869, |
|
"grad_norm": 0.45758770778160607, |
|
"learning_rate": 9.02459220884217e-06, |
|
"loss": 1.3009, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 2.2172793591760835, |
|
"grad_norm": 0.4138476142224768, |
|
"learning_rate": 9.011873057518503e-06, |
|
"loss": 1.2901, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 2.2315834644542982, |
|
"grad_norm": 0.5401623167342202, |
|
"learning_rate": 8.999081733762568e-06, |
|
"loss": 1.2883, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.2458875697325134, |
|
"grad_norm": 0.4225832679092138, |
|
"learning_rate": 8.986218500448598e-06, |
|
"loss": 1.2986, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 2.260191675010728, |
|
"grad_norm": 0.578769239923742, |
|
"learning_rate": 8.973283621928644e-06, |
|
"loss": 1.2932, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 2.2744957802889427, |
|
"grad_norm": 0.42471537710995716, |
|
"learning_rate": 8.96027736402713e-06, |
|
"loss": 1.2911, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 2.288799885567158, |
|
"grad_norm": 0.45640421971129197, |
|
"learning_rate": 8.947199994035402e-06, |
|
"loss": 1.2795, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.288799885567158, |
|
"eval_loss": 1.3331786394119263, |
|
"eval_runtime": 13.9979, |
|
"eval_samples_per_second": 71.439, |
|
"eval_steps_per_second": 2.286, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.3031039908453725, |
|
"grad_norm": 0.5262528524865082, |
|
"learning_rate": 8.934051780706226e-06, |
|
"loss": 1.2847, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 2.3174080961235877, |
|
"grad_norm": 0.4308615143171633, |
|
"learning_rate": 8.920832994248268e-06, |
|
"loss": 1.2942, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.3317122014018024, |
|
"grad_norm": 0.46124798716185816, |
|
"learning_rate": 8.907543906320542e-06, |
|
"loss": 1.297, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 2.346016306680017, |
|
"grad_norm": 0.4538526984132291, |
|
"learning_rate": 8.894184790026823e-06, |
|
"loss": 1.2832, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.360320411958232, |
|
"grad_norm": 0.4645888620271419, |
|
"learning_rate": 8.880755919910048e-06, |
|
"loss": 1.2891, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 2.374624517236447, |
|
"grad_norm": 0.5676282155239492, |
|
"learning_rate": 8.867257571946646e-06, |
|
"loss": 1.295, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.3889286225146615, |
|
"grad_norm": 0.429927163826217, |
|
"learning_rate": 8.853690023540898e-06, |
|
"loss": 1.2917, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 2.4032327277928767, |
|
"grad_norm": 0.4224712416764881, |
|
"learning_rate": 8.840053553519216e-06, |
|
"loss": 1.2793, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.4032327277928767, |
|
"eval_loss": 1.3279030323028564, |
|
"eval_runtime": 14.0803, |
|
"eval_samples_per_second": 71.021, |
|
"eval_steps_per_second": 2.273, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.4175368330710914, |
|
"grad_norm": 0.3947030765297477, |
|
"learning_rate": 8.82634844212442e-06, |
|
"loss": 1.288, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 2.431840938349306, |
|
"grad_norm": 0.4497937878369028, |
|
"learning_rate": 8.81257497100998e-06, |
|
"loss": 1.2949, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.446145043627521, |
|
"grad_norm": 0.4948619624780139, |
|
"learning_rate": 8.79873342323422e-06, |
|
"loss": 1.2879, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 2.460449148905736, |
|
"grad_norm": 0.8841779211631144, |
|
"learning_rate": 8.78482408325451e-06, |
|
"loss": 1.2842, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.474753254183951, |
|
"grad_norm": 0.44783586114307045, |
|
"learning_rate": 8.770847236921412e-06, |
|
"loss": 1.2868, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 2.4890573594621657, |
|
"grad_norm": 0.6387382536339177, |
|
"learning_rate": 8.756803171472817e-06, |
|
"loss": 1.2821, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.5033614647403803, |
|
"grad_norm": 0.4704200568795867, |
|
"learning_rate": 8.742692175528027e-06, |
|
"loss": 1.2854, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 2.5176655700185955, |
|
"grad_norm": 0.4776364379876357, |
|
"learning_rate": 8.728514539081837e-06, |
|
"loss": 1.2814, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.5176655700185955, |
|
"eval_loss": 1.3429194688796997, |
|
"eval_runtime": 13.9117, |
|
"eval_samples_per_second": 71.882, |
|
"eval_steps_per_second": 2.3, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.53196967529681, |
|
"grad_norm": 0.7082070517295844, |
|
"learning_rate": 8.714270553498567e-06, |
|
"loss": 1.2851, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 2.5462737805750253, |
|
"grad_norm": 0.4514295526886292, |
|
"learning_rate": 8.699960511506077e-06, |
|
"loss": 1.2809, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 2.56057788585324, |
|
"grad_norm": 0.6853925555348788, |
|
"learning_rate": 8.685584707189749e-06, |
|
"loss": 1.2961, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 2.5748819911314547, |
|
"grad_norm": 0.4538248869842651, |
|
"learning_rate": 8.671143435986447e-06, |
|
"loss": 1.2893, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.5891860964096693, |
|
"grad_norm": 0.45631276178983216, |
|
"learning_rate": 8.656636994678447e-06, |
|
"loss": 1.2921, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 2.6034902016878845, |
|
"grad_norm": 0.4181402292311998, |
|
"learning_rate": 8.642065681387329e-06, |
|
"loss": 1.2849, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 2.617794306966099, |
|
"grad_norm": 0.4679963507707488, |
|
"learning_rate": 8.627429795567858e-06, |
|
"loss": 1.2789, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 2.6320984122443143, |
|
"grad_norm": 0.4065327115468989, |
|
"learning_rate": 8.61272963800183e-06, |
|
"loss": 1.2805, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 2.6320984122443143, |
|
"eval_loss": 1.3250114917755127, |
|
"eval_runtime": 14.1045, |
|
"eval_samples_per_second": 70.899, |
|
"eval_steps_per_second": 2.269, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 2.646402517522529, |
|
"grad_norm": 0.442868220510357, |
|
"learning_rate": 8.597965510791883e-06, |
|
"loss": 1.2878, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 2.6607066228007437, |
|
"grad_norm": 0.4167482981358102, |
|
"learning_rate": 8.5831377173553e-06, |
|
"loss": 1.2812, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 2.675010728078959, |
|
"grad_norm": 0.4090769340560565, |
|
"learning_rate": 8.568246562417762e-06, |
|
"loss": 1.2933, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 2.6893148333571735, |
|
"grad_norm": 0.42518490969522255, |
|
"learning_rate": 8.553292352007096e-06, |
|
"loss": 1.2864, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 2.7036189386353886, |
|
"grad_norm": 0.4463014716471431, |
|
"learning_rate": 8.538275393446976e-06, |
|
"loss": 1.2857, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 2.7179230439136033, |
|
"grad_norm": 0.45596948523932324, |
|
"learning_rate": 8.523195995350613e-06, |
|
"loss": 1.2835, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.732227149191818, |
|
"grad_norm": 0.4205155827535561, |
|
"learning_rate": 8.508054467614417e-06, |
|
"loss": 1.2849, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 2.7465312544700327, |
|
"grad_norm": 0.48430008888282355, |
|
"learning_rate": 8.492851121411614e-06, |
|
"loss": 1.2789, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 2.7465312544700327, |
|
"eval_loss": 1.3283616304397583, |
|
"eval_runtime": 14.0066, |
|
"eval_samples_per_second": 71.395, |
|
"eval_steps_per_second": 2.285, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 2.760835359748248, |
|
"grad_norm": 0.5759994995680412, |
|
"learning_rate": 8.477586269185868e-06, |
|
"loss": 1.2807, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 2.7751394650264625, |
|
"grad_norm": 0.4062177321040095, |
|
"learning_rate": 8.462260224644848e-06, |
|
"loss": 1.2786, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 2.7894435703046776, |
|
"grad_norm": 0.40744982615324904, |
|
"learning_rate": 8.446873302753783e-06, |
|
"loss": 1.288, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 2.8037476755828923, |
|
"grad_norm": 0.4351554021842912, |
|
"learning_rate": 8.431425819728998e-06, |
|
"loss": 1.2809, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 2.818051780861107, |
|
"grad_norm": 0.4565206220601423, |
|
"learning_rate": 8.415918093031403e-06, |
|
"loss": 1.2761, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 2.832355886139322, |
|
"grad_norm": 0.4286148896345825, |
|
"learning_rate": 8.400350441359976e-06, |
|
"loss": 1.2738, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 2.846659991417537, |
|
"grad_norm": 0.4091019318117471, |
|
"learning_rate": 8.384723184645211e-06, |
|
"loss": 1.2756, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 2.860964096695752, |
|
"grad_norm": 0.5366072380832926, |
|
"learning_rate": 8.369036644042546e-06, |
|
"loss": 1.264, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.860964096695752, |
|
"eval_loss": 1.319417953491211, |
|
"eval_runtime": 14.0197, |
|
"eval_samples_per_second": 71.328, |
|
"eval_steps_per_second": 2.283, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.8752682019739666, |
|
"grad_norm": 0.39891877892139094, |
|
"learning_rate": 8.353291141925763e-06, |
|
"loss": 1.2714, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 2.8895723072521813, |
|
"grad_norm": 0.43116855479870975, |
|
"learning_rate": 8.337487001880353e-06, |
|
"loss": 1.276, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 2.903876412530396, |
|
"grad_norm": 0.43311934645181527, |
|
"learning_rate": 8.32162454869688e-06, |
|
"loss": 1.2733, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 2.918180517808611, |
|
"grad_norm": 0.4236540903742665, |
|
"learning_rate": 8.305704108364301e-06, |
|
"loss": 1.2758, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 2.932484623086826, |
|
"grad_norm": 0.4815023613318688, |
|
"learning_rate": 8.289726008063265e-06, |
|
"loss": 1.275, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 2.946788728365041, |
|
"grad_norm": 0.43681054268020525, |
|
"learning_rate": 8.273690576159383e-06, |
|
"loss": 1.2789, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 2.9610928336432556, |
|
"grad_norm": 0.4370480894359291, |
|
"learning_rate": 8.257598142196496e-06, |
|
"loss": 1.267, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 2.9753969389214703, |
|
"grad_norm": 0.4461842695375769, |
|
"learning_rate": 8.241449036889892e-06, |
|
"loss": 1.2734, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 2.9753969389214703, |
|
"eval_loss": 1.3316634893417358, |
|
"eval_runtime": 13.9113, |
|
"eval_samples_per_second": 71.884, |
|
"eval_steps_per_second": 2.3, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 2.9897010441996854, |
|
"grad_norm": 0.44034804073477984, |
|
"learning_rate": 8.225243592119501e-06, |
|
"loss": 1.2736, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 3.0040051494779, |
|
"grad_norm": 0.4720256474307512, |
|
"learning_rate": 8.208982140923095e-06, |
|
"loss": 1.2694, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.0183092547561152, |
|
"grad_norm": 0.6347562232882346, |
|
"learning_rate": 8.192665017489431e-06, |
|
"loss": 1.2336, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 3.03261336003433, |
|
"grad_norm": 0.37981139577002, |
|
"learning_rate": 8.17629255715138e-06, |
|
"loss": 1.2494, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 3.0469174653125446, |
|
"grad_norm": 0.7105885207992017, |
|
"learning_rate": 8.159865096379046e-06, |
|
"loss": 1.2397, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 3.0612215705907597, |
|
"grad_norm": 0.43006752774126733, |
|
"learning_rate": 8.14338297277284e-06, |
|
"loss": 1.2384, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 3.0755256758689744, |
|
"grad_norm": 0.4261194480956777, |
|
"learning_rate": 8.126846525056555e-06, |
|
"loss": 1.2436, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 3.089829781147189, |
|
"grad_norm": 0.45249834468920586, |
|
"learning_rate": 8.110256093070393e-06, |
|
"loss": 1.252, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 3.089829781147189, |
|
"eval_loss": 1.324701189994812, |
|
"eval_runtime": 14.0291, |
|
"eval_samples_per_second": 71.28, |
|
"eval_steps_per_second": 2.281, |
|
"step": 10800 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 34950, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 400, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0328768862224384e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|