|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9956108266276518, |
|
"eval_steps": 500, |
|
"global_step": 1023, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.029261155815654718, |
|
"grad_norm": 8.98854305134706, |
|
"learning_rate": 9.615384615384617e-07, |
|
"loss": 0.886, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.058522311631309436, |
|
"grad_norm": 2.172535983028044, |
|
"learning_rate": 1.9230769230769234e-06, |
|
"loss": 0.8326, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08778346744696415, |
|
"grad_norm": 173647.7881486052, |
|
"learning_rate": 2.8846153846153845e-06, |
|
"loss": 1.1121, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11704462326261887, |
|
"grad_norm": 1.2197795451638063, |
|
"learning_rate": 3.846153846153847e-06, |
|
"loss": 0.7357, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.14630577907827358, |
|
"grad_norm": 1.2235462186510278, |
|
"learning_rate": 4.807692307692308e-06, |
|
"loss": 0.717, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1755669348939283, |
|
"grad_norm": 1.1500264823307949, |
|
"learning_rate": 4.999246350291281e-06, |
|
"loss": 0.7008, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.20482809070958302, |
|
"grad_norm": 1.0809398356246653, |
|
"learning_rate": 4.996185513623117e-06, |
|
"loss": 0.6883, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.23408924652523774, |
|
"grad_norm": 0.8102596570656748, |
|
"learning_rate": 4.9907735882096915e-06, |
|
"loss": 0.6773, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.26335040234089246, |
|
"grad_norm": 0.6041123421413714, |
|
"learning_rate": 4.983016238728676e-06, |
|
"loss": 0.6693, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.29261155815654716, |
|
"grad_norm": 0.4679255906368303, |
|
"learning_rate": 4.9729215848197685e-06, |
|
"loss": 0.6593, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3218727139722019, |
|
"grad_norm": 0.379578037990065, |
|
"learning_rate": 4.960500192585831e-06, |
|
"loss": 0.6672, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3511338697878566, |
|
"grad_norm": 0.3628760977145694, |
|
"learning_rate": 4.945765063533333e-06, |
|
"loss": 0.6715, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.38039502560351135, |
|
"grad_norm": 0.3970159780765436, |
|
"learning_rate": 4.928731620963628e-06, |
|
"loss": 0.655, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.40965618141916604, |
|
"grad_norm": 0.3716740167179648, |
|
"learning_rate": 4.909417693829346e-06, |
|
"loss": 0.6553, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4389173372348208, |
|
"grad_norm": 0.34641025541861964, |
|
"learning_rate": 4.887843498072774e-06, |
|
"loss": 0.6505, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4681784930504755, |
|
"grad_norm": 0.3923741109320672, |
|
"learning_rate": 4.864031615465776e-06, |
|
"loss": 0.6522, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.49743964886613024, |
|
"grad_norm": 0.36292753209283274, |
|
"learning_rate": 4.838006969973388e-06, |
|
"loss": 0.6571, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5267008046817849, |
|
"grad_norm": 0.3376680362849002, |
|
"learning_rate": 4.809796801665825e-06, |
|
"loss": 0.6566, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5559619604974396, |
|
"grad_norm": 0.36203567203579706, |
|
"learning_rate": 4.779430638206237e-06, |
|
"loss": 0.645, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5852231163130943, |
|
"grad_norm": 0.3408562846700468, |
|
"learning_rate": 4.746940263944006e-06, |
|
"loss": 0.6461, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6144842721287491, |
|
"grad_norm": 0.33889751456302075, |
|
"learning_rate": 4.712359686645986e-06, |
|
"loss": 0.6526, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6437454279444038, |
|
"grad_norm": 0.35041172201449083, |
|
"learning_rate": 4.675725101900474e-06, |
|
"loss": 0.6404, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6730065837600585, |
|
"grad_norm": 0.3526449799646731, |
|
"learning_rate": 4.637074855231186e-06, |
|
"loss": 0.6403, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7022677395757132, |
|
"grad_norm": 0.32110885822251356, |
|
"learning_rate": 4.596449401960888e-06, |
|
"loss": 0.6471, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.731528895391368, |
|
"grad_norm": 0.3431192477294489, |
|
"learning_rate": 4.553891264866696e-06, |
|
"loss": 0.6454, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7607900512070227, |
|
"grad_norm": 0.34368046757987986, |
|
"learning_rate": 4.509444989671361e-06, |
|
"loss": 0.6481, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7900512070226774, |
|
"grad_norm": 0.37897239974426505, |
|
"learning_rate": 4.463157098417141e-06, |
|
"loss": 0.6503, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8193123628383321, |
|
"grad_norm": 0.35185141428324446, |
|
"learning_rate": 4.415076040771041e-06, |
|
"loss": 0.6491, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8485735186539868, |
|
"grad_norm": 0.333333492720985, |
|
"learning_rate": 4.365252143312415e-06, |
|
"loss": 0.6412, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8778346744696416, |
|
"grad_norm": 0.34730525842530574, |
|
"learning_rate": 4.313737556855987e-06, |
|
"loss": 0.6331, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9070958302852963, |
|
"grad_norm": 0.35101537112120373, |
|
"learning_rate": 4.260586201865451e-06, |
|
"loss": 0.6461, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.936356986100951, |
|
"grad_norm": 0.3265112519469777, |
|
"learning_rate": 4.205853712014766e-06, |
|
"loss": 0.638, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9656181419166057, |
|
"grad_norm": 0.37750068013169596, |
|
"learning_rate": 4.149597375956228e-06, |
|
"loss": 0.6413, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9948792977322605, |
|
"grad_norm": 0.3525928158653123, |
|
"learning_rate": 4.0918760773562825e-06, |
|
"loss": 0.6373, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9978054133138259, |
|
"eval_loss": 0.6418758034706116, |
|
"eval_runtime": 344.0656, |
|
"eval_samples_per_second": 26.765, |
|
"eval_steps_per_second": 0.419, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.025237746891002, |
|
"grad_norm": 0.36230468002056376, |
|
"learning_rate": 4.03275023326182e-06, |
|
"loss": 0.655, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.054498902706657, |
|
"grad_norm": 0.3782311966663319, |
|
"learning_rate": 3.972281730861482e-06, |
|
"loss": 0.6096, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0837600585223117, |
|
"grad_norm": 0.3642553214435278, |
|
"learning_rate": 3.910533862708164e-06, |
|
"loss": 0.613, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.1130212143379663, |
|
"grad_norm": 0.31097337637296735, |
|
"learning_rate": 3.847571260470523e-06, |
|
"loss": 0.6101, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.142282370153621, |
|
"grad_norm": 0.38535301593017185, |
|
"learning_rate": 3.783459827282829e-06, |
|
"loss": 0.6155, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.1715435259692757, |
|
"grad_norm": 0.34644899266963136, |
|
"learning_rate": 3.718266668763958e-06, |
|
"loss": 0.617, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2008046817849305, |
|
"grad_norm": 0.3594372904896393, |
|
"learning_rate": 3.652060022777762e-06, |
|
"loss": 0.6166, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2300658376005853, |
|
"grad_norm": 0.34202267838177935, |
|
"learning_rate": 3.5849091880082976e-06, |
|
"loss": 0.6046, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.2593269934162399, |
|
"grad_norm": 0.3209790424989277, |
|
"learning_rate": 3.516884451424696e-06, |
|
"loss": 0.6148, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.2885881492318947, |
|
"grad_norm": 0.39145882313544417, |
|
"learning_rate": 3.448057014711598e-06, |
|
"loss": 0.6072, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.3178493050475493, |
|
"grad_norm": 0.33748028752444753, |
|
"learning_rate": 3.3784989197421414e-06, |
|
"loss": 0.6111, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.347110460863204, |
|
"grad_norm": 0.32484250673480136, |
|
"learning_rate": 3.3082829731715353e-06, |
|
"loss": 0.6086, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.3763716166788589, |
|
"grad_norm": 0.31957432344924297, |
|
"learning_rate": 3.237482670230125e-06, |
|
"loss": 0.6059, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.4056327724945135, |
|
"grad_norm": 0.33076862820101466, |
|
"learning_rate": 3.1661721177957238e-06, |
|
"loss": 0.6065, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.4348939283101683, |
|
"grad_norm": 0.3542135461787459, |
|
"learning_rate": 3.0944259568257374e-06, |
|
"loss": 0.6128, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.464155084125823, |
|
"grad_norm": 0.3358725180392758, |
|
"learning_rate": 3.022319284230261e-06, |
|
"loss": 0.6013, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.4934162399414777, |
|
"grad_norm": 0.33010342462196535, |
|
"learning_rate": 2.9499275742679303e-06, |
|
"loss": 0.6093, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.5226773957571325, |
|
"grad_norm": 0.3349440152012161, |
|
"learning_rate": 2.8773265995467975e-06, |
|
"loss": 0.6116, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.5519385515727873, |
|
"grad_norm": 0.3347770675456349, |
|
"learning_rate": 2.804592351712937e-06, |
|
"loss": 0.6102, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.5811997073884418, |
|
"grad_norm": 0.32807277985651556, |
|
"learning_rate": 2.731800961909764e-06, |
|
"loss": 0.6168, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.6104608632040964, |
|
"grad_norm": 0.34394938763944105, |
|
"learning_rate": 2.6590286210913557e-06, |
|
"loss": 0.6076, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.6397220190197512, |
|
"grad_norm": 0.3338905788072409, |
|
"learning_rate": 2.5863515002731645e-06, |
|
"loss": 0.6094, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.668983174835406, |
|
"grad_norm": 0.3264863178278298, |
|
"learning_rate": 2.5138456708035952e-06, |
|
"loss": 0.5999, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.6982443306510606, |
|
"grad_norm": 0.33312679764478187, |
|
"learning_rate": 2.441587024739917e-06, |
|
"loss": 0.6095, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.7275054864667154, |
|
"grad_norm": 0.32189993245032417, |
|
"learning_rate": 2.3696511954118236e-06, |
|
"loss": 0.6092, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.7567666422823702, |
|
"grad_norm": 0.3406006209905999, |
|
"learning_rate": 2.2981134782558192e-06, |
|
"loss": 0.6063, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.7860277980980248, |
|
"grad_norm": 0.3523904881790879, |
|
"learning_rate": 2.2270487520032656e-06, |
|
"loss": 0.6087, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.8152889539136796, |
|
"grad_norm": 0.31402762046992583, |
|
"learning_rate": 2.1565314003046083e-06, |
|
"loss": 0.609, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.8445501097293344, |
|
"grad_norm": 0.3205352263710775, |
|
"learning_rate": 2.0866352338717924e-06, |
|
"loss": 0.6082, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.873811265544989, |
|
"grad_norm": 0.31888506747610823, |
|
"learning_rate": 2.0174334132203865e-06, |
|
"loss": 0.603, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.9030724213606436, |
|
"grad_norm": 0.3364346855436591, |
|
"learning_rate": 1.9489983720922596e-06, |
|
"loss": 0.6087, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.9323335771762986, |
|
"grad_norm": 0.3188456233134923, |
|
"learning_rate": 1.8814017416389813e-06, |
|
"loss": 0.6093, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.9615947329919532, |
|
"grad_norm": 0.32907133034135566, |
|
"learning_rate": 1.8147142754452888e-06, |
|
"loss": 0.598, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.9908558888076078, |
|
"grad_norm": 0.3137569353732775, |
|
"learning_rate": 1.7490057754711138e-06, |
|
"loss": 0.6126, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.9967081199707388, |
|
"eval_loss": 0.634891152381897, |
|
"eval_runtime": 343.5767, |
|
"eval_samples_per_second": 26.803, |
|
"eval_steps_per_second": 0.419, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 2.0212143379663496, |
|
"grad_norm": 0.3203598916756129, |
|
"learning_rate": 1.6843450189896675e-06, |
|
"loss": 0.6316, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.050475493782004, |
|
"grad_norm": 0.3121584618138378, |
|
"learning_rate": 1.620799686598081e-06, |
|
"loss": 0.5826, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.0797366495976592, |
|
"grad_norm": 0.3077301379603627, |
|
"learning_rate": 1.558436291375923e-06, |
|
"loss": 0.5847, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.108997805413314, |
|
"grad_norm": 0.32533505697453335, |
|
"learning_rate": 1.497320109265781e-06, |
|
"loss": 0.5857, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.1382589612289684, |
|
"grad_norm": 0.3195086645547528, |
|
"learning_rate": 1.4375151107487388e-06, |
|
"loss": 0.5873, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.1675201170446234, |
|
"grad_norm": 0.3330446987320431, |
|
"learning_rate": 1.3790838938862973e-06, |
|
"loss": 0.5922, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.196781272860278, |
|
"grad_norm": 0.3299155176826055, |
|
"learning_rate": 1.3220876187988046e-06, |
|
"loss": 0.5909, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.2260424286759326, |
|
"grad_norm": 0.3434531818196139, |
|
"learning_rate": 1.2665859436489852e-06, |
|
"loss": 0.5879, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.255303584491587, |
|
"grad_norm": 0.3055793504536637, |
|
"learning_rate": 1.2126369621975703e-06, |
|
"loss": 0.5863, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.284564740307242, |
|
"grad_norm": 0.32739465865049094, |
|
"learning_rate": 1.1602971429963966e-06, |
|
"loss": 0.5856, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.313825896122897, |
|
"grad_norm": 0.30600023197231446, |
|
"learning_rate": 1.1096212702826087e-06, |
|
"loss": 0.5837, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.3430870519385514, |
|
"grad_norm": 0.3167798036799912, |
|
"learning_rate": 1.0606623866358496e-06, |
|
"loss": 0.5913, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.3723482077542064, |
|
"grad_norm": 0.30861506318915694, |
|
"learning_rate": 1.0134717374584333e-06, |
|
"loss": 0.5863, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.401609363569861, |
|
"grad_norm": 0.3282720702061679, |
|
"learning_rate": 9.680987173366452e-07, |
|
"loss": 0.5925, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.4308705193855156, |
|
"grad_norm": 0.295462519102209, |
|
"learning_rate": 9.245908183392827e-07, |
|
"loss": 0.5849, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.4601316752011706, |
|
"grad_norm": 0.3027364063325311, |
|
"learning_rate": 8.829935803075765e-07, |
|
"loss": 0.5851, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.489392831016825, |
|
"grad_norm": 0.3129238717661994, |
|
"learning_rate": 8.433505431885053e-07, |
|
"loss": 0.5869, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.5186539868324798, |
|
"grad_norm": 0.31616510805578135, |
|
"learning_rate": 8.057032014614151e-07, |
|
"loss": 0.58, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.547915142648135, |
|
"grad_norm": 0.3063728049938137, |
|
"learning_rate": 7.700909607056239e-07, |
|
"loss": 0.5824, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.5771762984637894, |
|
"grad_norm": 0.29740345091723525, |
|
"learning_rate": 7.365510963544958e-07, |
|
"loss": 0.5864, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.606437454279444, |
|
"grad_norm": 0.30981329357585746, |
|
"learning_rate": 7.05118714679136e-07, |
|
"loss": 0.5858, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.6356986100950985, |
|
"grad_norm": 0.30780806957740076, |
|
"learning_rate": 6.758267160425574e-07, |
|
"loss": 0.5904, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.6649597659107536, |
|
"grad_norm": 0.31109647888764025, |
|
"learning_rate": 6.487057604627791e-07, |
|
"loss": 0.5906, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.694220921726408, |
|
"grad_norm": 0.32083524501934124, |
|
"learning_rate": 6.237842355208961e-07, |
|
"loss": 0.5846, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.723482077542063, |
|
"grad_norm": 0.31324844821827436, |
|
"learning_rate": 6.010882266477213e-07, |
|
"loss": 0.5881, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.7527432333577178, |
|
"grad_norm": 0.3175561726099663, |
|
"learning_rate": 5.806414898200865e-07, |
|
"loss": 0.5871, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.7820043891733723, |
|
"grad_norm": 0.2984100866178075, |
|
"learning_rate": 5.62465426695398e-07, |
|
"loss": 0.5857, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.811265544989027, |
|
"grad_norm": 0.304486800782011, |
|
"learning_rate": 5.465790622104596e-07, |
|
"loss": 0.5896, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.840526700804682, |
|
"grad_norm": 0.29859411604012787, |
|
"learning_rate": 5.329990246680166e-07, |
|
"loss": 0.5853, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.8697878566203365, |
|
"grad_norm": 0.3044740888118332, |
|
"learning_rate": 5.21739528331866e-07, |
|
"loss": 0.5851, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.899049012435991, |
|
"grad_norm": 0.3038848632087947, |
|
"learning_rate": 5.128123585487441e-07, |
|
"loss": 0.5779, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.928310168251646, |
|
"grad_norm": 0.30622813167771706, |
|
"learning_rate": 5.062268594125722e-07, |
|
"loss": 0.5858, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.9575713240673007, |
|
"grad_norm": 0.3068908017681571, |
|
"learning_rate": 5.01989923983967e-07, |
|
"loss": 0.5771, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.9868324798829553, |
|
"grad_norm": 0.3045684042515058, |
|
"learning_rate": 5.00105987075254e-07, |
|
"loss": 0.5891, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.9956108266276518, |
|
"eval_loss": 0.6350145936012268, |
|
"eval_runtime": 341.4521, |
|
"eval_samples_per_second": 26.97, |
|
"eval_steps_per_second": 0.422, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 2.9956108266276518, |
|
"step": 1023, |
|
"total_flos": 2144987064041472.0, |
|
"train_loss": 0.6271805009301223, |
|
"train_runtime": 55047.6195, |
|
"train_samples_per_second": 9.535, |
|
"train_steps_per_second": 0.019 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1023, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2144987064041472.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|