{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.089829781147189,
"eval_steps": 400,
"global_step": 10800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014304105278214848,
"grad_norm": 1.6012784676437102,
"learning_rate": 1.6666666666666667e-06,
"loss": 4.4038,
"step": 50
},
{
"epoch": 0.028608210556429696,
"grad_norm": 0.81484251583879,
"learning_rate": 3.3333333333333333e-06,
"loss": 3.6036,
"step": 100
},
{
"epoch": 0.04291231583464454,
"grad_norm": 3.9762696099154904,
"learning_rate": 5e-06,
"loss": 3.0207,
"step": 150
},
{
"epoch": 0.05721642111285939,
"grad_norm": 3.057952660211588,
"learning_rate": 6.666666666666667e-06,
"loss": 2.4324,
"step": 200
},
{
"epoch": 0.07152052639107424,
"grad_norm": 2.092719622855296,
"learning_rate": 8.333333333333334e-06,
"loss": 2.222,
"step": 250
},
{
"epoch": 0.08582463166928908,
"grad_norm": 6.08825143706115,
"learning_rate": 1e-05,
"loss": 2.1021,
"step": 300
},
{
"epoch": 0.10012873694750393,
"grad_norm": 2.493878945601314,
"learning_rate": 9.999953760295448e-06,
"loss": 1.9831,
"step": 350
},
{
"epoch": 0.11443284222571878,
"grad_norm": 4.462960292469778,
"learning_rate": 9.999815042132062e-06,
"loss": 1.917,
"step": 400
},
{
"epoch": 0.11443284222571878,
"eval_loss": 1.808639645576477,
"eval_runtime": 14.2096,
"eval_samples_per_second": 70.375,
"eval_steps_per_second": 2.252,
"step": 400
},
{
"epoch": 0.12873694750393364,
"grad_norm": 2.038795534490349,
"learning_rate": 9.999583848360633e-06,
"loss": 1.8614,
"step": 450
},
{
"epoch": 0.1430410527821485,
"grad_norm": 2.259377386606669,
"learning_rate": 9.999260183732424e-06,
"loss": 1.8105,
"step": 500
},
{
"epoch": 0.1573451580603633,
"grad_norm": 1.6457423711505388,
"learning_rate": 9.998844054899058e-06,
"loss": 1.7759,
"step": 550
},
{
"epoch": 0.17164926333857816,
"grad_norm": 2.6198123977173555,
"learning_rate": 9.998335470412393e-06,
"loss": 1.7508,
"step": 600
},
{
"epoch": 0.185953368616793,
"grad_norm": 1.6377415784196128,
"learning_rate": 9.997734440724333e-06,
"loss": 1.7156,
"step": 650
},
{
"epoch": 0.20025747389500786,
"grad_norm": 3.5293148754159285,
"learning_rate": 9.997040978186633e-06,
"loss": 1.7015,
"step": 700
},
{
"epoch": 0.21456157917322272,
"grad_norm": 2.3013282525925263,
"learning_rate": 9.996255097050624e-06,
"loss": 1.6782,
"step": 750
},
{
"epoch": 0.22886568445143757,
"grad_norm": 2.428974082500653,
"learning_rate": 9.995376813466934e-06,
"loss": 1.66,
"step": 800
},
{
"epoch": 0.22886568445143757,
"eval_loss": 1.5992412567138672,
"eval_runtime": 14.0538,
"eval_samples_per_second": 71.155,
"eval_steps_per_second": 2.277,
"step": 800
},
{
"epoch": 0.24316978972965242,
"grad_norm": 2.9094373795416506,
"learning_rate": 9.994406145485151e-06,
"loss": 1.6399,
"step": 850
},
{
"epoch": 0.25747389500786727,
"grad_norm": 1.5133813561921106,
"learning_rate": 9.993343113053454e-06,
"loss": 1.626,
"step": 900
},
{
"epoch": 0.2717780002860821,
"grad_norm": 1.3663105185649191,
"learning_rate": 9.992187738018203e-06,
"loss": 1.6099,
"step": 950
},
{
"epoch": 0.286082105564297,
"grad_norm": 1.3144291853877879,
"learning_rate": 9.99094004412348e-06,
"loss": 1.5968,
"step": 1000
},
{
"epoch": 0.3003862108425118,
"grad_norm": 1.8770146895064077,
"learning_rate": 9.989600057010625e-06,
"loss": 1.5754,
"step": 1050
},
{
"epoch": 0.3146903161207266,
"grad_norm": 1.8478210167954083,
"learning_rate": 9.988167804217682e-06,
"loss": 1.5711,
"step": 1100
},
{
"epoch": 0.3289944213989415,
"grad_norm": 1.5949372088951037,
"learning_rate": 9.986643315178848e-06,
"loss": 1.5557,
"step": 1150
},
{
"epoch": 0.3432985266771563,
"grad_norm": 1.8431659408457755,
"learning_rate": 9.98502662122387e-06,
"loss": 1.5572,
"step": 1200
},
{
"epoch": 0.3432985266771563,
"eval_loss": 1.50032639503479,
"eval_runtime": 14.0776,
"eval_samples_per_second": 71.035,
"eval_steps_per_second": 2.273,
"step": 1200
},
{
"epoch": 0.3576026319553712,
"grad_norm": 1.3869607567913713,
"learning_rate": 9.983317755577392e-06,
"loss": 1.5363,
"step": 1250
},
{
"epoch": 0.371906737233586,
"grad_norm": 1.4514189742887267,
"learning_rate": 9.981516753358274e-06,
"loss": 1.5358,
"step": 1300
},
{
"epoch": 0.3862108425118009,
"grad_norm": 1.4333267526235296,
"learning_rate": 9.979623651578881e-06,
"loss": 1.5141,
"step": 1350
},
{
"epoch": 0.40051494779001573,
"grad_norm": 0.8580367772458624,
"learning_rate": 9.977638489144308e-06,
"loss": 1.523,
"step": 1400
},
{
"epoch": 0.4148190530682306,
"grad_norm": 0.9460440332154582,
"learning_rate": 9.975561306851585e-06,
"loss": 1.5175,
"step": 1450
},
{
"epoch": 0.42912315834644543,
"grad_norm": 1.376203229447874,
"learning_rate": 9.973392147388847e-06,
"loss": 1.5126,
"step": 1500
},
{
"epoch": 0.4434272636246603,
"grad_norm": 1.5041770784794857,
"learning_rate": 9.971131055334445e-06,
"loss": 1.4977,
"step": 1550
},
{
"epoch": 0.45773136890287514,
"grad_norm": 1.095703863839786,
"learning_rate": 9.968778077156035e-06,
"loss": 1.4877,
"step": 1600
},
{
"epoch": 0.45773136890287514,
"eval_loss": 1.4638383388519287,
"eval_runtime": 14.0468,
"eval_samples_per_second": 71.191,
"eval_steps_per_second": 2.278,
"step": 1600
},
{
"epoch": 0.47203547418109,
"grad_norm": 1.1967770971190828,
"learning_rate": 9.966333261209625e-06,
"loss": 1.4941,
"step": 1650
},
{
"epoch": 0.48633957945930484,
"grad_norm": 0.7764934991914475,
"learning_rate": 9.96379665773858e-06,
"loss": 1.4943,
"step": 1700
},
{
"epoch": 0.5006436847375196,
"grad_norm": 1.1957507140564159,
"learning_rate": 9.961168318872583e-06,
"loss": 1.4834,
"step": 1750
},
{
"epoch": 0.5149477900157345,
"grad_norm": 0.891291786132535,
"learning_rate": 9.958448298626576e-06,
"loss": 1.4766,
"step": 1800
},
{
"epoch": 0.5292518952939493,
"grad_norm": 0.9430107046686556,
"learning_rate": 9.95563665289964e-06,
"loss": 1.4659,
"step": 1850
},
{
"epoch": 0.5435560005721642,
"grad_norm": 1.3583446842191815,
"learning_rate": 9.952733439473847e-06,
"loss": 1.4681,
"step": 1900
},
{
"epoch": 0.557860105850379,
"grad_norm": 1.010261006024344,
"learning_rate": 9.94973871801308e-06,
"loss": 1.4667,
"step": 1950
},
{
"epoch": 0.572164211128594,
"grad_norm": 0.8494941104833196,
"learning_rate": 9.946652550061798e-06,
"loss": 1.4453,
"step": 2000
},
{
"epoch": 0.572164211128594,
"eval_loss": 1.4287511110305786,
"eval_runtime": 14.0255,
"eval_samples_per_second": 71.299,
"eval_steps_per_second": 2.282,
"step": 2000
},
{
"epoch": 0.5864683164068087,
"grad_norm": 0.7812469708103134,
"learning_rate": 9.943474999043775e-06,
"loss": 1.4496,
"step": 2050
},
{
"epoch": 0.6007724216850236,
"grad_norm": 0.7254104161544093,
"learning_rate": 9.9402061302608e-06,
"loss": 1.4462,
"step": 2100
},
{
"epoch": 0.6150765269632384,
"grad_norm": 1.1402597738223317,
"learning_rate": 9.93684601089133e-06,
"loss": 1.4402,
"step": 2150
},
{
"epoch": 0.6293806322414532,
"grad_norm": 1.0636750138637265,
"learning_rate": 9.933394709989109e-06,
"loss": 1.4514,
"step": 2200
},
{
"epoch": 0.6436847375196681,
"grad_norm": 0.6340325583537392,
"learning_rate": 9.92985229848175e-06,
"loss": 1.4376,
"step": 2250
},
{
"epoch": 0.657988842797883,
"grad_norm": 1.3226650510062645,
"learning_rate": 9.926218849169284e-06,
"loss": 1.4404,
"step": 2300
},
{
"epoch": 0.6722929480760979,
"grad_norm": 0.9023729708460776,
"learning_rate": 9.922494436722653e-06,
"loss": 1.435,
"step": 2350
},
{
"epoch": 0.6865970533543126,
"grad_norm": 1.1170660045757717,
"learning_rate": 9.91867913768218e-06,
"loss": 1.4275,
"step": 2400
},
{
"epoch": 0.6865970533543126,
"eval_loss": 1.4157905578613281,
"eval_runtime": 14.0561,
"eval_samples_per_second": 71.143,
"eval_steps_per_second": 2.277,
"step": 2400
},
{
"epoch": 0.7009011586325276,
"grad_norm": 1.164925228192199,
"learning_rate": 9.914773030456001e-06,
"loss": 1.4238,
"step": 2450
},
{
"epoch": 0.7152052639107424,
"grad_norm": 0.8519530167823217,
"learning_rate": 9.910776195318448e-06,
"loss": 1.4347,
"step": 2500
},
{
"epoch": 0.7295093691889573,
"grad_norm": 0.7139589978182425,
"learning_rate": 9.906688714408396e-06,
"loss": 1.4306,
"step": 2550
},
{
"epoch": 0.743813474467172,
"grad_norm": 0.8653282057170465,
"learning_rate": 9.902510671727583e-06,
"loss": 1.4229,
"step": 2600
},
{
"epoch": 0.758117579745387,
"grad_norm": 0.8247347491114752,
"learning_rate": 9.898242153138882e-06,
"loss": 1.4118,
"step": 2650
},
{
"epoch": 0.7724216850236018,
"grad_norm": 1.0924147996236788,
"learning_rate": 9.89388324636453e-06,
"loss": 1.4322,
"step": 2700
},
{
"epoch": 0.7867257903018167,
"grad_norm": 0.842516122122594,
"learning_rate": 9.889434040984333e-06,
"loss": 1.4101,
"step": 2750
},
{
"epoch": 0.8010298955800315,
"grad_norm": 0.8063486362804477,
"learning_rate": 9.88489462843382e-06,
"loss": 1.4191,
"step": 2800
},
{
"epoch": 0.8010298955800315,
"eval_loss": 1.4116355180740356,
"eval_runtime": 13.9779,
"eval_samples_per_second": 71.542,
"eval_steps_per_second": 2.289,
"step": 2800
},
{
"epoch": 0.8153340008582464,
"grad_norm": 0.6258848452847008,
"learning_rate": 9.880265102002369e-06,
"loss": 1.4001,
"step": 2850
},
{
"epoch": 0.8296381061364612,
"grad_norm": 0.726517642303323,
"learning_rate": 9.875545556831283e-06,
"loss": 1.4086,
"step": 2900
},
{
"epoch": 0.843942211414676,
"grad_norm": 0.6713970013254277,
"learning_rate": 9.870736089911836e-06,
"loss": 1.4073,
"step": 2950
},
{
"epoch": 0.8582463166928909,
"grad_norm": 0.6148598667666052,
"learning_rate": 9.865836800083291e-06,
"loss": 1.4093,
"step": 3000
},
{
"epoch": 0.8725504219711057,
"grad_norm": 0.5359562950631023,
"learning_rate": 9.860847788030852e-06,
"loss": 1.4017,
"step": 3050
},
{
"epoch": 0.8868545272493206,
"grad_norm": 0.6194549549607876,
"learning_rate": 9.855769156283604e-06,
"loss": 1.4196,
"step": 3100
},
{
"epoch": 0.9011586325275354,
"grad_norm": 0.7870838887793197,
"learning_rate": 9.850601009212408e-06,
"loss": 1.4039,
"step": 3150
},
{
"epoch": 0.9154627378057503,
"grad_norm": 0.8348797495331252,
"learning_rate": 9.845343453027747e-06,
"loss": 1.4092,
"step": 3200
},
{
"epoch": 0.9154627378057503,
"eval_loss": 1.3961894512176514,
"eval_runtime": 14.0237,
"eval_samples_per_second": 71.308,
"eval_steps_per_second": 2.282,
"step": 3200
},
{
"epoch": 0.9297668430839651,
"grad_norm": 0.8890086654120082,
"learning_rate": 9.839996595777552e-06,
"loss": 1.3991,
"step": 3250
},
{
"epoch": 0.94407094836218,
"grad_norm": 0.8338244522175184,
"learning_rate": 9.83456054734498e-06,
"loss": 1.3939,
"step": 3300
},
{
"epoch": 0.9583750536403948,
"grad_norm": 0.667534745389414,
"learning_rate": 9.829035419446156e-06,
"loss": 1.4052,
"step": 3350
},
{
"epoch": 0.9726791589186097,
"grad_norm": 0.830996338803645,
"learning_rate": 9.823421325627865e-06,
"loss": 1.408,
"step": 3400
},
{
"epoch": 0.9869832641968245,
"grad_norm": 0.752895350030203,
"learning_rate": 9.81771838126524e-06,
"loss": 1.3927,
"step": 3450
},
{
"epoch": 1.0012873694750393,
"grad_norm": 0.6022807633216317,
"learning_rate": 9.811926703559374e-06,
"loss": 1.3947,
"step": 3500
},
{
"epoch": 1.0155914747532542,
"grad_norm": 0.7757999852306153,
"learning_rate": 9.806046411534916e-06,
"loss": 1.3613,
"step": 3550
},
{
"epoch": 1.029895580031469,
"grad_norm": 0.6991186658573486,
"learning_rate": 9.800077626037633e-06,
"loss": 1.3805,
"step": 3600
},
{
"epoch": 1.029895580031469,
"eval_loss": 1.386795163154602,
"eval_runtime": 13.9668,
"eval_samples_per_second": 71.598,
"eval_steps_per_second": 2.291,
"step": 3600
},
{
"epoch": 1.044199685309684,
"grad_norm": 0.6304272914508194,
"learning_rate": 9.794020469731915e-06,
"loss": 1.3772,
"step": 3650
},
{
"epoch": 1.0585037905878987,
"grad_norm": 0.6127596406721845,
"learning_rate": 9.787875067098257e-06,
"loss": 1.3695,
"step": 3700
},
{
"epoch": 1.0728078958661136,
"grad_norm": 0.5752396229133312,
"learning_rate": 9.781641544430703e-06,
"loss": 1.3737,
"step": 3750
},
{
"epoch": 1.0871120011443285,
"grad_norm": 0.8167932197181069,
"learning_rate": 9.775320029834255e-06,
"loss": 1.3679,
"step": 3800
},
{
"epoch": 1.1014161064225432,
"grad_norm": 0.7493986062078165,
"learning_rate": 9.76891065322223e-06,
"loss": 1.3686,
"step": 3850
},
{
"epoch": 1.115720211700758,
"grad_norm": 0.6896574555563986,
"learning_rate": 9.762413546313597e-06,
"loss": 1.3688,
"step": 3900
},
{
"epoch": 1.130024316978973,
"grad_norm": 0.54479225381951,
"learning_rate": 9.755828842630269e-06,
"loss": 1.3577,
"step": 3950
},
{
"epoch": 1.144328422257188,
"grad_norm": 0.8631407967474234,
"learning_rate": 9.749156677494357e-06,
"loss": 1.3791,
"step": 4000
},
{
"epoch": 1.144328422257188,
"eval_loss": 1.3818904161453247,
"eval_runtime": 14.0228,
"eval_samples_per_second": 71.312,
"eval_steps_per_second": 2.282,
"step": 4000
},
{
"epoch": 1.1586325275354026,
"grad_norm": 0.6269351505110898,
"learning_rate": 9.742397188025394e-06,
"loss": 1.3672,
"step": 4050
},
{
"epoch": 1.1729366328136175,
"grad_norm": 0.5964977170501943,
"learning_rate": 9.735550513137513e-06,
"loss": 1.3579,
"step": 4100
},
{
"epoch": 1.1872407380918324,
"grad_norm": 0.6696040499572795,
"learning_rate": 9.728616793536588e-06,
"loss": 1.3704,
"step": 4150
},
{
"epoch": 1.2015448433700473,
"grad_norm": 0.7153959218092929,
"learning_rate": 9.721596171717352e-06,
"loss": 1.3631,
"step": 4200
},
{
"epoch": 1.215848948648262,
"grad_norm": 0.8228253318299735,
"learning_rate": 9.714488791960463e-06,
"loss": 1.3643,
"step": 4250
},
{
"epoch": 1.230153053926477,
"grad_norm": 0.6427955816989828,
"learning_rate": 9.707294800329536e-06,
"loss": 1.3608,
"step": 4300
},
{
"epoch": 1.2444571592046918,
"grad_norm": 0.6438118616712295,
"learning_rate": 9.700014344668152e-06,
"loss": 1.3564,
"step": 4350
},
{
"epoch": 1.2587612644829065,
"grad_norm": 0.5732058961632965,
"learning_rate": 9.692647574596803e-06,
"loss": 1.3623,
"step": 4400
},
{
"epoch": 1.2587612644829065,
"eval_loss": 1.3667371273040771,
"eval_runtime": 14.0711,
"eval_samples_per_second": 71.068,
"eval_steps_per_second": 2.274,
"step": 4400
},
{
"epoch": 1.2732084108139037,
"grad_norm": 0.5434874117890776,
"learning_rate": 9.685194641509837e-06,
"loss": 1.3592,
"step": 4450
},
{
"epoch": 1.2875125160921184,
"grad_norm": 0.8067573948854371,
"learning_rate": 9.677655698572326e-06,
"loss": 1.3571,
"step": 4500
},
{
"epoch": 1.3018166213703333,
"grad_norm": 0.6211280738341731,
"learning_rate": 9.670030900716941e-06,
"loss": 1.3577,
"step": 4550
},
{
"epoch": 1.3161207266485482,
"grad_norm": 0.47127980997402974,
"learning_rate": 9.662320404640743e-06,
"loss": 1.3497,
"step": 4600
},
{
"epoch": 1.3304248319267629,
"grad_norm": 0.6437090365289073,
"learning_rate": 9.654524368801982e-06,
"loss": 1.3611,
"step": 4650
},
{
"epoch": 1.3447289372049778,
"grad_norm": 0.4706214878937702,
"learning_rate": 9.646642953416835e-06,
"loss": 1.3596,
"step": 4700
},
{
"epoch": 1.3590330424831927,
"grad_norm": 0.4433218616654087,
"learning_rate": 9.638676320456109e-06,
"loss": 1.3612,
"step": 4750
},
{
"epoch": 1.3733371477614076,
"grad_norm": 0.6227834199361844,
"learning_rate": 9.630624633641918e-06,
"loss": 1.3487,
"step": 4800
},
{
"epoch": 1.3733371477614076,
"eval_loss": 1.3724805116653442,
"eval_runtime": 13.958,
"eval_samples_per_second": 71.643,
"eval_steps_per_second": 2.293,
"step": 4800
},
{
"epoch": 1.3876412530396225,
"grad_norm": 0.5615209752207829,
"learning_rate": 9.622488058444313e-06,
"loss": 1.3416,
"step": 4850
},
{
"epoch": 1.4019453583178372,
"grad_norm": 0.4593448830072353,
"learning_rate": 9.614266762077891e-06,
"loss": 1.3509,
"step": 4900
},
{
"epoch": 1.416249463596052,
"grad_norm": 0.5260361200473717,
"learning_rate": 9.605960913498342e-06,
"loss": 1.3504,
"step": 4950
},
{
"epoch": 1.430553568874267,
"grad_norm": 0.4949775762320425,
"learning_rate": 9.597570683398996e-06,
"loss": 1.3608,
"step": 5000
},
{
"epoch": 1.4448576741524817,
"grad_norm": 0.7134992551375888,
"learning_rate": 9.5890962442073e-06,
"loss": 1.3456,
"step": 5050
},
{
"epoch": 1.4591617794306966,
"grad_norm": 0.749997828555375,
"learning_rate": 9.580537770081285e-06,
"loss": 1.3413,
"step": 5100
},
{
"epoch": 1.4734658847089115,
"grad_norm": 0.5312330906616294,
"learning_rate": 9.57189543690598e-06,
"loss": 1.3507,
"step": 5150
},
{
"epoch": 1.4877699899871262,
"grad_norm": 0.5913338284525619,
"learning_rate": 9.563169422289798e-06,
"loss": 1.3386,
"step": 5200
},
{
"epoch": 1.4877699899871262,
"eval_loss": 1.359579086303711,
"eval_runtime": 14.046,
"eval_samples_per_second": 71.195,
"eval_steps_per_second": 2.278,
"step": 5200
},
{
"epoch": 1.5020740952653413,
"grad_norm": 0.63516444597305,
"learning_rate": 9.554359905560887e-06,
"loss": 1.3412,
"step": 5250
},
{
"epoch": 1.516378200543556,
"grad_norm": 0.4411581484928778,
"learning_rate": 9.54546706776345e-06,
"loss": 1.3505,
"step": 5300
},
{
"epoch": 1.530682305821771,
"grad_norm": 0.403266190389094,
"learning_rate": 9.536491091654018e-06,
"loss": 1.3418,
"step": 5350
},
{
"epoch": 1.5449864110999858,
"grad_norm": 0.4887790997121695,
"learning_rate": 9.527432161697696e-06,
"loss": 1.352,
"step": 5400
},
{
"epoch": 1.5592905163782005,
"grad_norm": 0.43803734390526294,
"learning_rate": 9.518290464064365e-06,
"loss": 1.3374,
"step": 5450
},
{
"epoch": 1.5735946216564154,
"grad_norm": 0.4477296911829739,
"learning_rate": 9.509066186624872e-06,
"loss": 1.3362,
"step": 5500
},
{
"epoch": 1.5878987269346303,
"grad_norm": 0.4849220779673394,
"learning_rate": 9.499759518947156e-06,
"loss": 1.3463,
"step": 5550
},
{
"epoch": 1.602202832212845,
"grad_norm": 0.43453154893881496,
"learning_rate": 9.490370652292357e-06,
"loss": 1.3342,
"step": 5600
},
{
"epoch": 1.602202832212845,
"eval_loss": 1.3611611127853394,
"eval_runtime": 13.9617,
"eval_samples_per_second": 71.625,
"eval_steps_per_second": 2.292,
"step": 5600
},
{
"epoch": 1.61650693749106,
"grad_norm": 0.4973975633500145,
"learning_rate": 9.480899779610883e-06,
"loss": 1.3557,
"step": 5650
},
{
"epoch": 1.6308110427692748,
"grad_norm": 0.8646218397904073,
"learning_rate": 9.471347095538448e-06,
"loss": 1.332,
"step": 5700
},
{
"epoch": 1.6451151480474895,
"grad_norm": 0.4766662524894494,
"learning_rate": 9.461712796392067e-06,
"loss": 1.3425,
"step": 5750
},
{
"epoch": 1.6594192533257046,
"grad_norm": 0.43492118267166,
"learning_rate": 9.45199708016603e-06,
"loss": 1.3366,
"step": 5800
},
{
"epoch": 1.6737233586039193,
"grad_norm": 0.7281191349195701,
"learning_rate": 9.442200146527824e-06,
"loss": 1.3405,
"step": 5850
},
{
"epoch": 1.6880274638821342,
"grad_norm": 0.5059870049803485,
"learning_rate": 9.432322196814032e-06,
"loss": 1.336,
"step": 5900
},
{
"epoch": 1.7023315691603491,
"grad_norm": 0.48815713123329457,
"learning_rate": 9.422363434026205e-06,
"loss": 1.3331,
"step": 5950
},
{
"epoch": 1.7166356744385638,
"grad_norm": 0.4825656212310282,
"learning_rate": 9.41232406282667e-06,
"loss": 1.3382,
"step": 6000
},
{
"epoch": 1.7166356744385638,
"eval_loss": 1.356214165687561,
"eval_runtime": 13.9939,
"eval_samples_per_second": 71.46,
"eval_steps_per_second": 2.287,
"step": 6000
},
{
"epoch": 1.7309397797167787,
"grad_norm": 0.7522246864779827,
"learning_rate": 9.402204289534344e-06,
"loss": 1.3239,
"step": 6050
},
{
"epoch": 1.7452438849949936,
"grad_norm": 0.48984350066891824,
"learning_rate": 9.392004322120484e-06,
"loss": 1.3237,
"step": 6100
},
{
"epoch": 1.7595479902732083,
"grad_norm": 0.544930574118496,
"learning_rate": 9.381724370204414e-06,
"loss": 1.3241,
"step": 6150
},
{
"epoch": 1.7738520955514234,
"grad_norm": 0.5482222598847393,
"learning_rate": 9.371364645049216e-06,
"loss": 1.3313,
"step": 6200
},
{
"epoch": 1.788156200829638,
"grad_norm": 0.46339705172698076,
"learning_rate": 9.360925359557397e-06,
"loss": 1.3256,
"step": 6250
},
{
"epoch": 1.8024603061078528,
"grad_norm": 0.5277875338001611,
"learning_rate": 9.3504067282665e-06,
"loss": 1.3503,
"step": 6300
},
{
"epoch": 1.816764411386068,
"grad_norm": 0.5539059109504075,
"learning_rate": 9.339808967344701e-06,
"loss": 1.3368,
"step": 6350
},
{
"epoch": 1.8310685166642826,
"grad_norm": 0.5119187022621997,
"learning_rate": 9.329132294586374e-06,
"loss": 1.3257,
"step": 6400
},
{
"epoch": 1.8310685166642826,
"eval_loss": 1.348954200744629,
"eval_runtime": 14.1165,
"eval_samples_per_second": 70.839,
"eval_steps_per_second": 2.267,
"step": 6400
},
{
"epoch": 1.8453726219424975,
"grad_norm": 0.4572643729622861,
"learning_rate": 9.318376929407606e-06,
"loss": 1.3296,
"step": 6450
},
{
"epoch": 1.8596767272207124,
"grad_norm": 0.41441721606603,
"learning_rate": 9.307543092841688e-06,
"loss": 1.3306,
"step": 6500
},
{
"epoch": 1.873980832498927,
"grad_norm": 0.4437842388580668,
"learning_rate": 9.296631007534576e-06,
"loss": 1.3219,
"step": 6550
},
{
"epoch": 1.888284937777142,
"grad_norm": 0.668469538481535,
"learning_rate": 9.285640897740316e-06,
"loss": 1.3201,
"step": 6600
},
{
"epoch": 1.902589043055357,
"grad_norm": 0.4476992280694945,
"learning_rate": 9.27457298931643e-06,
"loss": 1.3279,
"step": 6650
},
{
"epoch": 1.9168931483335716,
"grad_norm": 0.8609307931818154,
"learning_rate": 9.263427509719287e-06,
"loss": 1.3248,
"step": 6700
},
{
"epoch": 1.9311972536117867,
"grad_norm": 0.48764755574202223,
"learning_rate": 9.252204687999401e-06,
"loss": 1.3293,
"step": 6750
},
{
"epoch": 1.9455013588900014,
"grad_norm": 0.7588730534632143,
"learning_rate": 9.240904754796767e-06,
"loss": 1.3338,
"step": 6800
},
{
"epoch": 1.9455013588900014,
"eval_loss": 1.3457790613174438,
"eval_runtime": 14.0391,
"eval_samples_per_second": 71.229,
"eval_steps_per_second": 2.279,
"step": 6800
},
{
"epoch": 1.9598054641682163,
"grad_norm": 0.47728013357161364,
"learning_rate": 9.22952794233608e-06,
"loss": 1.328,
"step": 6850
},
{
"epoch": 1.9741095694464312,
"grad_norm": 0.4865065014657903,
"learning_rate": 9.218074484421977e-06,
"loss": 1.3329,
"step": 6900
},
{
"epoch": 1.988413674724646,
"grad_norm": 0.46233352981690246,
"learning_rate": 9.206544616434249e-06,
"loss": 1.3193,
"step": 6950
},
{
"epoch": 2.0027177800028606,
"grad_norm": 0.4748345037256569,
"learning_rate": 9.194938575322973e-06,
"loss": 1.3137,
"step": 7000
},
{
"epoch": 2.0170218852810757,
"grad_norm": 0.3961349395717629,
"learning_rate": 9.183256599603672e-06,
"loss": 1.2981,
"step": 7050
},
{
"epoch": 2.0313259905592904,
"grad_norm": 0.6284979836068443,
"learning_rate": 9.171498929352388e-06,
"loss": 1.2961,
"step": 7100
},
{
"epoch": 2.0456300958375055,
"grad_norm": 0.6558610249594138,
"learning_rate": 9.159665806200766e-06,
"loss": 1.2913,
"step": 7150
},
{
"epoch": 2.0599342011157202,
"grad_norm": 0.45514976033924853,
"learning_rate": 9.147757473331082e-06,
"loss": 1.2906,
"step": 7200
},
{
"epoch": 2.0599342011157202,
"eval_loss": 1.3430439233779907,
"eval_runtime": 14.0262,
"eval_samples_per_second": 71.295,
"eval_steps_per_second": 2.281,
"step": 7200
},
{
"epoch": 2.074238306393935,
"grad_norm": 0.5426461545993814,
"learning_rate": 9.135774175471244e-06,
"loss": 1.3004,
"step": 7250
},
{
"epoch": 2.08854241167215,
"grad_norm": 0.6005516516830625,
"learning_rate": 9.123716158889765e-06,
"loss": 1.292,
"step": 7300
},
{
"epoch": 2.1028465169503647,
"grad_norm": 0.9639752009743953,
"learning_rate": 9.111583671390697e-06,
"loss": 1.2862,
"step": 7350
},
{
"epoch": 2.1171506222285794,
"grad_norm": 0.4488649957289315,
"learning_rate": 9.09937696230855e-06,
"loss": 1.3036,
"step": 7400
},
{
"epoch": 2.1314547275067945,
"grad_norm": 0.7721978784000721,
"learning_rate": 9.087096282503152e-06,
"loss": 1.2901,
"step": 7450
},
{
"epoch": 2.1457588327850092,
"grad_norm": 0.4782857255612778,
"learning_rate": 9.074741884354507e-06,
"loss": 1.2946,
"step": 7500
},
{
"epoch": 2.1600629380632244,
"grad_norm": 0.43220427000612477,
"learning_rate": 9.062314021757603e-06,
"loss": 1.2921,
"step": 7550
},
{
"epoch": 2.174367043341439,
"grad_norm": 0.5795623059587878,
"learning_rate": 9.049812950117191e-06,
"loss": 1.279,
"step": 7600
},
{
"epoch": 2.174367043341439,
"eval_loss": 1.3394057750701904,
"eval_runtime": 14.0446,
"eval_samples_per_second": 71.202,
"eval_steps_per_second": 2.278,
"step": 7600
},
{
"epoch": 2.1886711486196537,
"grad_norm": 0.5713295331254999,
"learning_rate": 9.037238926342544e-06,
"loss": 1.2909,
"step": 7650
},
{
"epoch": 2.202975253897869,
"grad_norm": 0.45758770778160607,
"learning_rate": 9.02459220884217e-06,
"loss": 1.3009,
"step": 7700
},
{
"epoch": 2.2172793591760835,
"grad_norm": 0.4138476142224768,
"learning_rate": 9.011873057518503e-06,
"loss": 1.2901,
"step": 7750
},
{
"epoch": 2.2315834644542982,
"grad_norm": 0.5401623167342202,
"learning_rate": 8.999081733762568e-06,
"loss": 1.2883,
"step": 7800
},
{
"epoch": 2.2458875697325134,
"grad_norm": 0.4225832679092138,
"learning_rate": 8.986218500448598e-06,
"loss": 1.2986,
"step": 7850
},
{
"epoch": 2.260191675010728,
"grad_norm": 0.578769239923742,
"learning_rate": 8.973283621928644e-06,
"loss": 1.2932,
"step": 7900
},
{
"epoch": 2.2744957802889427,
"grad_norm": 0.42471537710995716,
"learning_rate": 8.96027736402713e-06,
"loss": 1.2911,
"step": 7950
},
{
"epoch": 2.288799885567158,
"grad_norm": 0.45640421971129197,
"learning_rate": 8.947199994035402e-06,
"loss": 1.2795,
"step": 8000
},
{
"epoch": 2.288799885567158,
"eval_loss": 1.3331786394119263,
"eval_runtime": 13.9979,
"eval_samples_per_second": 71.439,
"eval_steps_per_second": 2.286,
"step": 8000
},
{
"epoch": 2.3031039908453725,
"grad_norm": 0.5262528524865082,
"learning_rate": 8.934051780706226e-06,
"loss": 1.2847,
"step": 8050
},
{
"epoch": 2.3174080961235877,
"grad_norm": 0.4308615143171633,
"learning_rate": 8.920832994248268e-06,
"loss": 1.2942,
"step": 8100
},
{
"epoch": 2.3317122014018024,
"grad_norm": 0.46124798716185816,
"learning_rate": 8.907543906320542e-06,
"loss": 1.297,
"step": 8150
},
{
"epoch": 2.346016306680017,
"grad_norm": 0.4538526984132291,
"learning_rate": 8.894184790026823e-06,
"loss": 1.2832,
"step": 8200
},
{
"epoch": 2.360320411958232,
"grad_norm": 0.4645888620271419,
"learning_rate": 8.880755919910048e-06,
"loss": 1.2891,
"step": 8250
},
{
"epoch": 2.374624517236447,
"grad_norm": 0.5676282155239492,
"learning_rate": 8.867257571946646e-06,
"loss": 1.295,
"step": 8300
},
{
"epoch": 2.3889286225146615,
"grad_norm": 0.429927163826217,
"learning_rate": 8.853690023540898e-06,
"loss": 1.2917,
"step": 8350
},
{
"epoch": 2.4032327277928767,
"grad_norm": 0.4224712416764881,
"learning_rate": 8.840053553519216e-06,
"loss": 1.2793,
"step": 8400
},
{
"epoch": 2.4032327277928767,
"eval_loss": 1.3279030323028564,
"eval_runtime": 14.0803,
"eval_samples_per_second": 71.021,
"eval_steps_per_second": 2.273,
"step": 8400
},
{
"epoch": 2.4175368330710914,
"grad_norm": 0.3947030765297477,
"learning_rate": 8.82634844212442e-06,
"loss": 1.288,
"step": 8450
},
{
"epoch": 2.431840938349306,
"grad_norm": 0.4497937878369028,
"learning_rate": 8.81257497100998e-06,
"loss": 1.2949,
"step": 8500
},
{
"epoch": 2.446145043627521,
"grad_norm": 0.4948619624780139,
"learning_rate": 8.79873342323422e-06,
"loss": 1.2879,
"step": 8550
},
{
"epoch": 2.460449148905736,
"grad_norm": 0.8841779211631144,
"learning_rate": 8.78482408325451e-06,
"loss": 1.2842,
"step": 8600
},
{
"epoch": 2.474753254183951,
"grad_norm": 0.44783586114307045,
"learning_rate": 8.770847236921412e-06,
"loss": 1.2868,
"step": 8650
},
{
"epoch": 2.4890573594621657,
"grad_norm": 0.6387382536339177,
"learning_rate": 8.756803171472817e-06,
"loss": 1.2821,
"step": 8700
},
{
"epoch": 2.5033614647403803,
"grad_norm": 0.4704200568795867,
"learning_rate": 8.742692175528027e-06,
"loss": 1.2854,
"step": 8750
},
{
"epoch": 2.5176655700185955,
"grad_norm": 0.4776364379876357,
"learning_rate": 8.728514539081837e-06,
"loss": 1.2814,
"step": 8800
},
{
"epoch": 2.5176655700185955,
"eval_loss": 1.3429194688796997,
"eval_runtime": 13.9117,
"eval_samples_per_second": 71.882,
"eval_steps_per_second": 2.3,
"step": 8800
},
{
"epoch": 2.53196967529681,
"grad_norm": 0.7082070517295844,
"learning_rate": 8.714270553498567e-06,
"loss": 1.2851,
"step": 8850
},
{
"epoch": 2.5462737805750253,
"grad_norm": 0.4514295526886292,
"learning_rate": 8.699960511506077e-06,
"loss": 1.2809,
"step": 8900
},
{
"epoch": 2.56057788585324,
"grad_norm": 0.6853925555348788,
"learning_rate": 8.685584707189749e-06,
"loss": 1.2961,
"step": 8950
},
{
"epoch": 2.5748819911314547,
"grad_norm": 0.4538248869842651,
"learning_rate": 8.671143435986447e-06,
"loss": 1.2893,
"step": 9000
},
{
"epoch": 2.5891860964096693,
"grad_norm": 0.45631276178983216,
"learning_rate": 8.656636994678447e-06,
"loss": 1.2921,
"step": 9050
},
{
"epoch": 2.6034902016878845,
"grad_norm": 0.4181402292311998,
"learning_rate": 8.642065681387329e-06,
"loss": 1.2849,
"step": 9100
},
{
"epoch": 2.617794306966099,
"grad_norm": 0.4679963507707488,
"learning_rate": 8.627429795567858e-06,
"loss": 1.2789,
"step": 9150
},
{
"epoch": 2.6320984122443143,
"grad_norm": 0.4065327115468989,
"learning_rate": 8.61272963800183e-06,
"loss": 1.2805,
"step": 9200
},
{
"epoch": 2.6320984122443143,
"eval_loss": 1.3250114917755127,
"eval_runtime": 14.1045,
"eval_samples_per_second": 70.899,
"eval_steps_per_second": 2.269,
"step": 9200
},
{
"epoch": 2.646402517522529,
"grad_norm": 0.442868220510357,
"learning_rate": 8.597965510791883e-06,
"loss": 1.2878,
"step": 9250
},
{
"epoch": 2.6607066228007437,
"grad_norm": 0.4167482981358102,
"learning_rate": 8.5831377173553e-06,
"loss": 1.2812,
"step": 9300
},
{
"epoch": 2.675010728078959,
"grad_norm": 0.4090769340560565,
"learning_rate": 8.568246562417762e-06,
"loss": 1.2933,
"step": 9350
},
{
"epoch": 2.6893148333571735,
"grad_norm": 0.42518490969522255,
"learning_rate": 8.553292352007096e-06,
"loss": 1.2864,
"step": 9400
},
{
"epoch": 2.7036189386353886,
"grad_norm": 0.4463014716471431,
"learning_rate": 8.538275393446976e-06,
"loss": 1.2857,
"step": 9450
},
{
"epoch": 2.7179230439136033,
"grad_norm": 0.45596948523932324,
"learning_rate": 8.523195995350613e-06,
"loss": 1.2835,
"step": 9500
},
{
"epoch": 2.732227149191818,
"grad_norm": 0.4205155827535561,
"learning_rate": 8.508054467614417e-06,
"loss": 1.2849,
"step": 9550
},
{
"epoch": 2.7465312544700327,
"grad_norm": 0.48430008888282355,
"learning_rate": 8.492851121411614e-06,
"loss": 1.2789,
"step": 9600
},
{
"epoch": 2.7465312544700327,
"eval_loss": 1.3283616304397583,
"eval_runtime": 14.0066,
"eval_samples_per_second": 71.395,
"eval_steps_per_second": 2.285,
"step": 9600
},
{
"epoch": 2.760835359748248,
"grad_norm": 0.5759994995680412,
"learning_rate": 8.477586269185868e-06,
"loss": 1.2807,
"step": 9650
},
{
"epoch": 2.7751394650264625,
"grad_norm": 0.4062177321040095,
"learning_rate": 8.462260224644848e-06,
"loss": 1.2786,
"step": 9700
},
{
"epoch": 2.7894435703046776,
"grad_norm": 0.40744982615324904,
"learning_rate": 8.446873302753783e-06,
"loss": 1.288,
"step": 9750
},
{
"epoch": 2.8037476755828923,
"grad_norm": 0.4351554021842912,
"learning_rate": 8.431425819728998e-06,
"loss": 1.2809,
"step": 9800
},
{
"epoch": 2.818051780861107,
"grad_norm": 0.4565206220601423,
"learning_rate": 8.415918093031403e-06,
"loss": 1.2761,
"step": 9850
},
{
"epoch": 2.832355886139322,
"grad_norm": 0.4286148896345825,
"learning_rate": 8.400350441359976e-06,
"loss": 1.2738,
"step": 9900
},
{
"epoch": 2.846659991417537,
"grad_norm": 0.4091019318117471,
"learning_rate": 8.384723184645211e-06,
"loss": 1.2756,
"step": 9950
},
{
"epoch": 2.860964096695752,
"grad_norm": 0.5366072380832926,
"learning_rate": 8.369036644042546e-06,
"loss": 1.264,
"step": 10000
},
{
"epoch": 2.860964096695752,
"eval_loss": 1.319417953491211,
"eval_runtime": 14.0197,
"eval_samples_per_second": 71.328,
"eval_steps_per_second": 2.283,
"step": 10000
},
{
"epoch": 2.8752682019739666,
"grad_norm": 0.39891877892139094,
"learning_rate": 8.353291141925763e-06,
"loss": 1.2714,
"step": 10050
},
{
"epoch": 2.8895723072521813,
"grad_norm": 0.43116855479870975,
"learning_rate": 8.337487001880353e-06,
"loss": 1.276,
"step": 10100
},
{
"epoch": 2.903876412530396,
"grad_norm": 0.43311934645181527,
"learning_rate": 8.32162454869688e-06,
"loss": 1.2733,
"step": 10150
},
{
"epoch": 2.918180517808611,
"grad_norm": 0.4236540903742665,
"learning_rate": 8.305704108364301e-06,
"loss": 1.2758,
"step": 10200
},
{
"epoch": 2.932484623086826,
"grad_norm": 0.4815023613318688,
"learning_rate": 8.289726008063265e-06,
"loss": 1.275,
"step": 10250
},
{
"epoch": 2.946788728365041,
"grad_norm": 0.43681054268020525,
"learning_rate": 8.273690576159383e-06,
"loss": 1.2789,
"step": 10300
},
{
"epoch": 2.9610928336432556,
"grad_norm": 0.4370480894359291,
"learning_rate": 8.257598142196496e-06,
"loss": 1.267,
"step": 10350
},
{
"epoch": 2.9753969389214703,
"grad_norm": 0.4461842695375769,
"learning_rate": 8.241449036889892e-06,
"loss": 1.2734,
"step": 10400
},
{
"epoch": 2.9753969389214703,
"eval_loss": 1.3316634893417358,
"eval_runtime": 13.9113,
"eval_samples_per_second": 71.884,
"eval_steps_per_second": 2.3,
"step": 10400
},
{
"epoch": 2.9897010441996854,
"grad_norm": 0.44034804073477984,
"learning_rate": 8.225243592119501e-06,
"loss": 1.2736,
"step": 10450
},
{
"epoch": 3.0040051494779,
"grad_norm": 0.4720256474307512,
"learning_rate": 8.208982140923095e-06,
"loss": 1.2694,
"step": 10500
},
{
"epoch": 3.0183092547561152,
"grad_norm": 0.6347562232882346,
"learning_rate": 8.192665017489431e-06,
"loss": 1.2336,
"step": 10550
},
{
"epoch": 3.03261336003433,
"grad_norm": 0.37981139577002,
"learning_rate": 8.17629255715138e-06,
"loss": 1.2494,
"step": 10600
},
{
"epoch": 3.0469174653125446,
"grad_norm": 0.7105885207992017,
"learning_rate": 8.159865096379046e-06,
"loss": 1.2397,
"step": 10650
},
{
"epoch": 3.0612215705907597,
"grad_norm": 0.43006752774126733,
"learning_rate": 8.14338297277284e-06,
"loss": 1.2384,
"step": 10700
},
{
"epoch": 3.0755256758689744,
"grad_norm": 0.4261194480956777,
"learning_rate": 8.126846525056555e-06,
"loss": 1.2436,
"step": 10750
},
{
"epoch": 3.089829781147189,
"grad_norm": 0.45249834468920586,
"learning_rate": 8.110256093070393e-06,
"loss": 1.252,
"step": 10800
},
{
"epoch": 3.089829781147189,
"eval_loss": 1.324701189994812,
"eval_runtime": 14.0291,
"eval_samples_per_second": 71.28,
"eval_steps_per_second": 2.281,
"step": 10800
}
],
"logging_steps": 50,
"max_steps": 34950,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0328768862224384e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}