|
{ |
|
"best_metric": 0.30429786443710327, |
|
"best_model_checkpoint": "/mnt/storage1/ziya/VQA/M3D/LaMed/output/Llama3.2_3B_Slake_BiomedCLIP_KG2/checkpoint-49175", |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 49175, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.030503304524656837, |
|
"grad_norm": 219.84915161132812, |
|
"learning_rate": 5.083884087442806e-07, |
|
"loss": 6.7276, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.061006609049313675, |
|
"grad_norm": 198.07276916503906, |
|
"learning_rate": 1.0167768174885612e-06, |
|
"loss": 4.6638, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.09150991357397051, |
|
"grad_norm": 206.43557739257812, |
|
"learning_rate": 1.525165226232842e-06, |
|
"loss": 2.7355, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.12201321809862735, |
|
"grad_norm": 197.00241088867188, |
|
"learning_rate": 2.0335536349771224e-06, |
|
"loss": 1.7593, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.1525165226232842, |
|
"grad_norm": 130.2694091796875, |
|
"learning_rate": 2.541942043721403e-06, |
|
"loss": 1.3763, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.18301982714794102, |
|
"grad_norm": 125.28648376464844, |
|
"learning_rate": 3.050330452465684e-06, |
|
"loss": 1.2219, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.21352313167259787, |
|
"grad_norm": 34.316184997558594, |
|
"learning_rate": 3.558718861209965e-06, |
|
"loss": 1.0971, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.2440264361972547, |
|
"grad_norm": 98.0787582397461, |
|
"learning_rate": 4.067107269954245e-06, |
|
"loss": 1.1094, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.27452974072191155, |
|
"grad_norm": 84.89276123046875, |
|
"learning_rate": 4.575495678698526e-06, |
|
"loss": 0.9373, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.3050330452465684, |
|
"grad_norm": 59.95817565917969, |
|
"learning_rate": 5.083884087442806e-06, |
|
"loss": 1.0445, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3355363497712252, |
|
"grad_norm": 6.108051776885986, |
|
"learning_rate": 5.5922724961870875e-06, |
|
"loss": 0.9383, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.36603965429588203, |
|
"grad_norm": 0.15768121182918549, |
|
"learning_rate": 6.100660904931368e-06, |
|
"loss": 0.8041, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.3965429588205389, |
|
"grad_norm": 61.489315032958984, |
|
"learning_rate": 6.609049313675649e-06, |
|
"loss": 0.8024, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.42704626334519574, |
|
"grad_norm": 59.69186782836914, |
|
"learning_rate": 7.11743772241993e-06, |
|
"loss": 0.8968, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.45754956786985257, |
|
"grad_norm": 33.70270538330078, |
|
"learning_rate": 7.62582613116421e-06, |
|
"loss": 0.8929, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.4880528723945094, |
|
"grad_norm": 0.0485120490193367, |
|
"learning_rate": 8.13421453990849e-06, |
|
"loss": 0.854, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.5185561769191662, |
|
"grad_norm": 22.23134994506836, |
|
"learning_rate": 8.642602948652772e-06, |
|
"loss": 0.7011, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.5490594814438231, |
|
"grad_norm": 31.907052993774414, |
|
"learning_rate": 9.150991357397053e-06, |
|
"loss": 0.7926, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.5795627859684799, |
|
"grad_norm": 0.41279709339141846, |
|
"learning_rate": 9.659379766141333e-06, |
|
"loss": 0.6784, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.6100660904931368, |
|
"grad_norm": 5.743080139160156, |
|
"learning_rate": 9.9999933570978e-06, |
|
"loss": 0.6052, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6405693950177936, |
|
"grad_norm": 0.039274271577596664, |
|
"learning_rate": 9.999892097428444e-06, |
|
"loss": 0.6623, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.6710726995424504, |
|
"grad_norm": 7.821774959564209, |
|
"learning_rate": 9.999668840572521e-06, |
|
"loss": 0.7683, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.7015760040671073, |
|
"grad_norm": 45.965476989746094, |
|
"learning_rate": 9.99932359197749e-06, |
|
"loss": 0.6642, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.7320793085917641, |
|
"grad_norm": 11.506255149841309, |
|
"learning_rate": 9.998856360067404e-06, |
|
"loss": 0.5841, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.762582613116421, |
|
"grad_norm": 4.3167243003845215, |
|
"learning_rate": 9.998267156242705e-06, |
|
"loss": 0.5843, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.7930859176410778, |
|
"grad_norm": 0.1081758663058281, |
|
"learning_rate": 9.997555994879945e-06, |
|
"loss": 0.6163, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.8235892221657346, |
|
"grad_norm": 8.694897651672363, |
|
"learning_rate": 9.996722893331437e-06, |
|
"loss": 0.5964, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.8540925266903915, |
|
"grad_norm": 1.5994528532028198, |
|
"learning_rate": 9.995767871924826e-06, |
|
"loss": 0.5306, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.8845958312150483, |
|
"grad_norm": 32.59883499145508, |
|
"learning_rate": 9.994690953962602e-06, |
|
"loss": 0.5799, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.9150991357397051, |
|
"grad_norm": 18.219099044799805, |
|
"learning_rate": 9.993492165721523e-06, |
|
"loss": 0.537, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.945602440264362, |
|
"grad_norm": 38.14984893798828, |
|
"learning_rate": 9.99217153645198e-06, |
|
"loss": 0.5993, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.9761057447890188, |
|
"grad_norm": 26.25863265991211, |
|
"learning_rate": 9.990729098377278e-06, |
|
"loss": 0.5018, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.6256594061851501, |
|
"eval_runtime": 103.5099, |
|
"eval_samples_per_second": 20.278, |
|
"eval_steps_per_second": 1.353, |
|
"step": 9835 |
|
}, |
|
{ |
|
"epoch": 1.0066090493136757, |
|
"grad_norm": 0.06917725503444672, |
|
"learning_rate": 9.989164886692855e-06, |
|
"loss": 0.5179, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 1.0371123538383324, |
|
"grad_norm": 0.5146661996841431, |
|
"learning_rate": 9.987478939565419e-06, |
|
"loss": 0.5132, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 1.0676156583629894, |
|
"grad_norm": 0.00010405463399365544, |
|
"learning_rate": 9.985671298132018e-06, |
|
"loss": 0.553, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.0981189628876462, |
|
"grad_norm": 3.304703712463379, |
|
"learning_rate": 9.98374200649904e-06, |
|
"loss": 0.4167, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 1.128622267412303, |
|
"grad_norm": 0.6084465384483337, |
|
"learning_rate": 9.981691111741132e-06, |
|
"loss": 0.4076, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 1.1591255719369598, |
|
"grad_norm": 0.05212326720356941, |
|
"learning_rate": 9.979518663900047e-06, |
|
"loss": 0.5073, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 1.1896288764616167, |
|
"grad_norm": 21.804567337036133, |
|
"learning_rate": 9.977224715983439e-06, |
|
"loss": 0.4453, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 1.2201321809862735, |
|
"grad_norm": 32.33522415161133, |
|
"learning_rate": 9.974809323963552e-06, |
|
"loss": 0.5258, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.2506354855109303, |
|
"grad_norm": 0.14623811841011047, |
|
"learning_rate": 9.972272546775862e-06, |
|
"loss": 0.446, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 1.281138790035587, |
|
"grad_norm": 0.28084316849708557, |
|
"learning_rate": 9.969614446317645e-06, |
|
"loss": 0.4267, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 1.311642094560244, |
|
"grad_norm": 2.6160593032836914, |
|
"learning_rate": 9.966835087446457e-06, |
|
"loss": 0.5175, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 1.3421453990849008, |
|
"grad_norm": 0.21234674751758575, |
|
"learning_rate": 9.963934537978547e-06, |
|
"loss": 0.4142, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 1.3726487036095576, |
|
"grad_norm": 0.09984096884727478, |
|
"learning_rate": 9.960912868687222e-06, |
|
"loss": 0.3752, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.4031520081342146, |
|
"grad_norm": 0.003108405042439699, |
|
"learning_rate": 9.957770153301104e-06, |
|
"loss": 0.4626, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 1.4336553126588714, |
|
"grad_norm": 7.253620424307883e-05, |
|
"learning_rate": 9.954506468502335e-06, |
|
"loss": 0.3995, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 1.4641586171835281, |
|
"grad_norm": 7.312223434448242, |
|
"learning_rate": 9.951121893924704e-06, |
|
"loss": 0.5808, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 1.4946619217081851, |
|
"grad_norm": 10.09123420715332, |
|
"learning_rate": 9.94761651215171e-06, |
|
"loss": 0.4186, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 1.525165226232842, |
|
"grad_norm": 6.39027214050293, |
|
"learning_rate": 9.943990408714542e-06, |
|
"loss": 0.4374, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.5556685307574987, |
|
"grad_norm": 0.5071086287498474, |
|
"learning_rate": 9.94024367208999e-06, |
|
"loss": 0.3927, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 1.5861718352821557, |
|
"grad_norm": 0.00023602554574608803, |
|
"learning_rate": 9.936376393698296e-06, |
|
"loss": 0.453, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 1.6166751398068124, |
|
"grad_norm": 4.159149646759033, |
|
"learning_rate": 9.93238866790091e-06, |
|
"loss": 0.4594, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 1.6471784443314692, |
|
"grad_norm": 0.02954951673746109, |
|
"learning_rate": 9.928280591998201e-06, |
|
"loss": 0.4708, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.6776817488561262, |
|
"grad_norm": 0.036519214510917664, |
|
"learning_rate": 9.924052266227069e-06, |
|
"loss": 0.348, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.708185053380783, |
|
"grad_norm": 14.24988079071045, |
|
"learning_rate": 9.919703793758511e-06, |
|
"loss": 0.4279, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.7386883579054397, |
|
"grad_norm": 17.464754104614258, |
|
"learning_rate": 9.9152352806951e-06, |
|
"loss": 0.4133, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 1.7691916624300967, |
|
"grad_norm": 0.008350013755261898, |
|
"learning_rate": 9.91064683606839e-06, |
|
"loss": 0.3868, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.7996949669547533, |
|
"grad_norm": 0.14084838330745697, |
|
"learning_rate": 9.90593857183627e-06, |
|
"loss": 0.41, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.8301982714794103, |
|
"grad_norm": 0.011409441009163857, |
|
"learning_rate": 9.901110602880211e-06, |
|
"loss": 0.3632, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.8607015760040673, |
|
"grad_norm": 0.2167450338602066, |
|
"learning_rate": 9.896163047002486e-06, |
|
"loss": 0.4203, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 1.8912048805287238, |
|
"grad_norm": 0.12340361624956131, |
|
"learning_rate": 9.891096024923274e-06, |
|
"loss": 0.3836, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 1.9217081850533808, |
|
"grad_norm": 0.011780009604990482, |
|
"learning_rate": 9.885909660277735e-06, |
|
"loss": 0.3638, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 1.9522114895780376, |
|
"grad_norm": 1.290787696838379, |
|
"learning_rate": 9.880604079612977e-06, |
|
"loss": 0.3716, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.9827147941026944, |
|
"grad_norm": 1.6361336747650057e-05, |
|
"learning_rate": 9.87517941238498e-06, |
|
"loss": 0.3499, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.41577816009521484, |
|
"eval_runtime": 103.2846, |
|
"eval_samples_per_second": 20.322, |
|
"eval_steps_per_second": 1.355, |
|
"step": 19670 |
|
}, |
|
{ |
|
"epoch": 2.0132180986273513, |
|
"grad_norm": 69.40552520751953, |
|
"learning_rate": 9.869635790955423e-06, |
|
"loss": 0.3971, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 2.0437214031520083, |
|
"grad_norm": 0.0739259347319603, |
|
"learning_rate": 9.863973350588472e-06, |
|
"loss": 0.3106, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 2.074224707676665, |
|
"grad_norm": 6.068602085113525, |
|
"learning_rate": 9.85819222944747e-06, |
|
"loss": 0.3609, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 2.104728012201322, |
|
"grad_norm": 15.744518280029297, |
|
"learning_rate": 9.852292568591557e-06, |
|
"loss": 0.355, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 2.135231316725979, |
|
"grad_norm": 12.378608703613281, |
|
"learning_rate": 9.846274511972251e-06, |
|
"loss": 0.306, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.1657346212506354, |
|
"grad_norm": 10.521739959716797, |
|
"learning_rate": 9.840138206429911e-06, |
|
"loss": 0.3215, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 2.1962379257752924, |
|
"grad_norm": 8.210835456848145, |
|
"learning_rate": 9.833883801690179e-06, |
|
"loss": 0.2643, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 2.226741230299949, |
|
"grad_norm": 0.28642842173576355, |
|
"learning_rate": 9.827511450360295e-06, |
|
"loss": 0.3466, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 2.257244534824606, |
|
"grad_norm": 0.010792361572384834, |
|
"learning_rate": 9.821021307925406e-06, |
|
"loss": 0.3405, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 2.287747839349263, |
|
"grad_norm": 0.46060484647750854, |
|
"learning_rate": 9.814413532744753e-06, |
|
"loss": 0.3429, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.3182511438739195, |
|
"grad_norm": 0.04725227132439613, |
|
"learning_rate": 9.80768828604781e-06, |
|
"loss": 0.3277, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 2.3487544483985765, |
|
"grad_norm": 4.012476921081543, |
|
"learning_rate": 9.800845731930356e-06, |
|
"loss": 0.3004, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 2.3792577529232335, |
|
"grad_norm": 12.063570976257324, |
|
"learning_rate": 9.793886037350461e-06, |
|
"loss": 0.351, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 2.40976105744789, |
|
"grad_norm": 0.0161123126745224, |
|
"learning_rate": 9.786809372124425e-06, |
|
"loss": 0.3215, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 2.440264361972547, |
|
"grad_norm": 0.012391666881740093, |
|
"learning_rate": 9.779615908922622e-06, |
|
"loss": 0.3581, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.470767666497204, |
|
"grad_norm": 3.5457940101623535, |
|
"learning_rate": 9.772305823265294e-06, |
|
"loss": 0.339, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 2.5012709710218606, |
|
"grad_norm": 0.04232852905988693, |
|
"learning_rate": 9.764879293518266e-06, |
|
"loss": 0.3523, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 2.5317742755465176, |
|
"grad_norm": 0.2725585997104645, |
|
"learning_rate": 9.757336500888599e-06, |
|
"loss": 0.2807, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 2.562277580071174, |
|
"grad_norm": 15.362088203430176, |
|
"learning_rate": 9.749677629420157e-06, |
|
"loss": 0.3227, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 2.592780884595831, |
|
"grad_norm": 0.04092194885015488, |
|
"learning_rate": 9.74190286598913e-06, |
|
"loss": 0.2848, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.623284189120488, |
|
"grad_norm": 18.27284049987793, |
|
"learning_rate": 9.734012400299463e-06, |
|
"loss": 0.3182, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 2.6537874936451447, |
|
"grad_norm": 10.792008399963379, |
|
"learning_rate": 9.726006424878234e-06, |
|
"loss": 0.3344, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 2.6842907981698017, |
|
"grad_norm": 0.0012637190520763397, |
|
"learning_rate": 9.717885135070957e-06, |
|
"loss": 0.3022, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 2.7147941026944586, |
|
"grad_norm": 0.0459415428340435, |
|
"learning_rate": 9.709648729036805e-06, |
|
"loss": 0.3292, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 2.745297407219115, |
|
"grad_norm": 0.0021137718576937914, |
|
"learning_rate": 9.70129740774379e-06, |
|
"loss": 0.34, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.775800711743772, |
|
"grad_norm": 0.020368747413158417, |
|
"learning_rate": 9.69283137496385e-06, |
|
"loss": 0.2781, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 2.806304016268429, |
|
"grad_norm": 0.01668807491660118, |
|
"learning_rate": 9.68425083726788e-06, |
|
"loss": 0.3146, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 2.8368073207930857, |
|
"grad_norm": 18.376174926757812, |
|
"learning_rate": 9.67555600402069e-06, |
|
"loss": 0.3176, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 2.8673106253177427, |
|
"grad_norm": 0.012932281009852886, |
|
"learning_rate": 9.666747087375894e-06, |
|
"loss": 0.3183, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 2.8978139298423997, |
|
"grad_norm": 24.642900466918945, |
|
"learning_rate": 9.657824302270743e-06, |
|
"loss": 0.3418, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 2.9283172343670563, |
|
"grad_norm": 0.013992193154990673, |
|
"learning_rate": 9.64878786642087e-06, |
|
"loss": 0.3196, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 2.9588205388917133, |
|
"grad_norm": 0.001322018215432763, |
|
"learning_rate": 9.639638000314983e-06, |
|
"loss": 0.2828, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 2.9893238434163703, |
|
"grad_norm": 0.010299603454768658, |
|
"learning_rate": 9.630374927209485e-06, |
|
"loss": 0.3031, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.384840726852417, |
|
"eval_runtime": 103.5367, |
|
"eval_samples_per_second": 20.273, |
|
"eval_steps_per_second": 1.352, |
|
"step": 29505 |
|
}, |
|
{ |
|
"epoch": 3.019827147941027, |
|
"grad_norm": 0.03328376263380051, |
|
"learning_rate": 9.620998873123027e-06, |
|
"loss": 0.2759, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 3.050330452465684, |
|
"grad_norm": 13.360301971435547, |
|
"learning_rate": 9.611510066830984e-06, |
|
"loss": 0.1929, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.080833756990341, |
|
"grad_norm": 0.0005070280749350786, |
|
"learning_rate": 9.601908739859895e-06, |
|
"loss": 0.2731, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 3.1113370615149973, |
|
"grad_norm": 7.794924385962076e-06, |
|
"learning_rate": 9.592195126481786e-06, |
|
"loss": 0.2898, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 3.1418403660396543, |
|
"grad_norm": 0.000699843221809715, |
|
"learning_rate": 9.582369463708473e-06, |
|
"loss": 0.2485, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 3.1723436705643113, |
|
"grad_norm": 0.24882352352142334, |
|
"learning_rate": 9.572431991285775e-06, |
|
"loss": 0.2353, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 3.202846975088968, |
|
"grad_norm": 0.004962280858308077, |
|
"learning_rate": 9.562382951687658e-06, |
|
"loss": 0.2352, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 3.233350279613625, |
|
"grad_norm": 4.832521915435791, |
|
"learning_rate": 9.552222590110324e-06, |
|
"loss": 0.2363, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 3.263853584138282, |
|
"grad_norm": 0.019647739827632904, |
|
"learning_rate": 9.541951154466233e-06, |
|
"loss": 0.2931, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 3.2943568886629384, |
|
"grad_norm": 0.04419803246855736, |
|
"learning_rate": 9.53156889537804e-06, |
|
"loss": 0.2634, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 3.3248601931875954, |
|
"grad_norm": 0.00563009362667799, |
|
"learning_rate": 9.521076066172493e-06, |
|
"loss": 0.3196, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 3.3553634977122524, |
|
"grad_norm": 35.982086181640625, |
|
"learning_rate": 9.510472922874246e-06, |
|
"loss": 0.2312, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.385866802236909, |
|
"grad_norm": 0.023021390661597252, |
|
"learning_rate": 9.49975972419961e-06, |
|
"loss": 0.248, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 3.416370106761566, |
|
"grad_norm": 12.021758079528809, |
|
"learning_rate": 9.488936731550247e-06, |
|
"loss": 0.2552, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 3.4468734112862225, |
|
"grad_norm": 0.08628563582897186, |
|
"learning_rate": 9.47800420900679e-06, |
|
"loss": 0.2553, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 3.4773767158108795, |
|
"grad_norm": 1.5440913438796997, |
|
"learning_rate": 9.466962423322388e-06, |
|
"loss": 0.3116, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 3.5078800203355365, |
|
"grad_norm": 1.9582719687605277e-05, |
|
"learning_rate": 9.455811643916217e-06, |
|
"loss": 0.2573, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 3.538383324860193, |
|
"grad_norm": 0.0327552855014801, |
|
"learning_rate": 9.444552142866892e-06, |
|
"loss": 0.2714, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 3.56888662938485, |
|
"grad_norm": 23.4067440032959, |
|
"learning_rate": 9.433184194905831e-06, |
|
"loss": 0.2582, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 3.599389933909507, |
|
"grad_norm": 0.510364830493927, |
|
"learning_rate": 9.421708077410551e-06, |
|
"loss": 0.2953, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 3.6298932384341636, |
|
"grad_norm": 0.008372425101697445, |
|
"learning_rate": 9.410124070397908e-06, |
|
"loss": 0.3149, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 3.6603965429588206, |
|
"grad_norm": 0.005560519173741341, |
|
"learning_rate": 9.398432456517254e-06, |
|
"loss": 0.2407, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 3.690899847483477, |
|
"grad_norm": 4.856593608856201, |
|
"learning_rate": 9.386633521043545e-06, |
|
"loss": 0.2772, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 3.721403152008134, |
|
"grad_norm": 0.00459680799394846, |
|
"learning_rate": 9.374727551870377e-06, |
|
"loss": 0.2901, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 3.751906456532791, |
|
"grad_norm": 0.00823771022260189, |
|
"learning_rate": 9.362714839502973e-06, |
|
"loss": 0.3216, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 3.7824097610574476, |
|
"grad_norm": 0.011493176221847534, |
|
"learning_rate": 9.35059567705108e-06, |
|
"loss": 0.2682, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 3.8129130655821046, |
|
"grad_norm": 0.057664211839437485, |
|
"learning_rate": 9.33837036022182e-06, |
|
"loss": 0.2823, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 3.8434163701067616, |
|
"grad_norm": 11.62787914276123, |
|
"learning_rate": 9.326039187312485e-06, |
|
"loss": 0.1798, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 3.873919674631418, |
|
"grad_norm": 0.02458140067756176, |
|
"learning_rate": 9.313602459203248e-06, |
|
"loss": 0.2337, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 3.904422979156075, |
|
"grad_norm": 0.019079158082604408, |
|
"learning_rate": 9.301060479349826e-06, |
|
"loss": 0.2539, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 3.934926283680732, |
|
"grad_norm": 9.756794929504395, |
|
"learning_rate": 9.288413553776076e-06, |
|
"loss": 0.279, |
|
"step": 38700 |
|
}, |
|
{ |
|
"epoch": 3.9654295882053887, |
|
"grad_norm": 3.1425323486328125, |
|
"learning_rate": 9.275661991066522e-06, |
|
"loss": 0.2767, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 3.9959328927300457, |
|
"grad_norm": 0.006602356676012278, |
|
"learning_rate": 9.262806102358834e-06, |
|
"loss": 0.2558, |
|
"step": 39300 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.3359658420085907, |
|
"eval_runtime": 110.2827, |
|
"eval_samples_per_second": 19.033, |
|
"eval_steps_per_second": 1.269, |
|
"step": 39340 |
|
}, |
|
{ |
|
"epoch": 4.026436197254703, |
|
"grad_norm": 0.00417200056836009, |
|
"learning_rate": 9.249846201336235e-06, |
|
"loss": 0.2559, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 4.056939501779359, |
|
"grad_norm": 0.008380061946809292, |
|
"learning_rate": 9.236782604219838e-06, |
|
"loss": 0.2173, |
|
"step": 39900 |
|
}, |
|
{ |
|
"epoch": 4.087442806304017, |
|
"grad_norm": 0.06265965104103088, |
|
"learning_rate": 9.223615629760943e-06, |
|
"loss": 0.2338, |
|
"step": 40200 |
|
}, |
|
{ |
|
"epoch": 4.117946110828673, |
|
"grad_norm": 2.496182241884526e-05, |
|
"learning_rate": 9.21034559923325e-06, |
|
"loss": 0.2761, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 4.14844941535333, |
|
"grad_norm": 40.386531829833984, |
|
"learning_rate": 9.196972836425027e-06, |
|
"loss": 0.1951, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 4.178952719877987, |
|
"grad_norm": 0.024761863052845, |
|
"learning_rate": 9.183497667631199e-06, |
|
"loss": 0.1979, |
|
"step": 41100 |
|
}, |
|
{ |
|
"epoch": 4.209456024402644, |
|
"grad_norm": 0.0945596694946289, |
|
"learning_rate": 9.169920421645401e-06, |
|
"loss": 0.1914, |
|
"step": 41400 |
|
}, |
|
{ |
|
"epoch": 4.2399593289273, |
|
"grad_norm": 0.6701067686080933, |
|
"learning_rate": 9.156241429751947e-06, |
|
"loss": 0.2216, |
|
"step": 41700 |
|
}, |
|
{ |
|
"epoch": 4.270462633451958, |
|
"grad_norm": 240.65187072753906, |
|
"learning_rate": 9.142461025717739e-06, |
|
"loss": 0.2277, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 4.300965937976614, |
|
"grad_norm": 0.0036695213057100773, |
|
"learning_rate": 9.128579545784142e-06, |
|
"loss": 0.2476, |
|
"step": 42300 |
|
}, |
|
{ |
|
"epoch": 4.331469242501271, |
|
"grad_norm": 33.61179733276367, |
|
"learning_rate": 9.114597328658763e-06, |
|
"loss": 0.1958, |
|
"step": 42600 |
|
}, |
|
{ |
|
"epoch": 4.361972547025927, |
|
"grad_norm": 52.01089859008789, |
|
"learning_rate": 9.100514715507196e-06, |
|
"loss": 0.217, |
|
"step": 42900 |
|
}, |
|
{ |
|
"epoch": 4.392475851550585, |
|
"grad_norm": 0.0013590186135843396, |
|
"learning_rate": 9.086332049944692e-06, |
|
"loss": 0.2393, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 4.422979156075241, |
|
"grad_norm": 0.12093915045261383, |
|
"learning_rate": 9.072049678027778e-06, |
|
"loss": 0.2175, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 4.453482460599898, |
|
"grad_norm": 0.008124121464788914, |
|
"learning_rate": 9.057667948245816e-06, |
|
"loss": 0.262, |
|
"step": 43800 |
|
}, |
|
{ |
|
"epoch": 4.483985765124555, |
|
"grad_norm": 0.7974226474761963, |
|
"learning_rate": 9.043187211512492e-06, |
|
"loss": 0.2742, |
|
"step": 44100 |
|
}, |
|
{ |
|
"epoch": 4.514489069649212, |
|
"grad_norm": 13.317780494689941, |
|
"learning_rate": 9.028607821157256e-06, |
|
"loss": 0.2471, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 4.5449923741738685, |
|
"grad_norm": 0.0005867113941349089, |
|
"learning_rate": 9.013930132916709e-06, |
|
"loss": 0.252, |
|
"step": 44700 |
|
}, |
|
{ |
|
"epoch": 4.575495678698526, |
|
"grad_norm": 0.053022801876068115, |
|
"learning_rate": 8.999154504925914e-06, |
|
"loss": 0.2319, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 4.6059989832231825, |
|
"grad_norm": 0.0034278552047908306, |
|
"learning_rate": 8.984281297709658e-06, |
|
"loss": 0.2185, |
|
"step": 45300 |
|
}, |
|
{ |
|
"epoch": 4.636502287747839, |
|
"grad_norm": 0.008740383200347424, |
|
"learning_rate": 8.969310874173663e-06, |
|
"loss": 0.2231, |
|
"step": 45600 |
|
}, |
|
{ |
|
"epoch": 4.6670055922724965, |
|
"grad_norm": 0.06879168748855591, |
|
"learning_rate": 8.95424359959572e-06, |
|
"loss": 0.1777, |
|
"step": 45900 |
|
}, |
|
{ |
|
"epoch": 4.697508896797153, |
|
"grad_norm": 0.005040327087044716, |
|
"learning_rate": 8.939079841616785e-06, |
|
"loss": 0.2596, |
|
"step": 46200 |
|
}, |
|
{ |
|
"epoch": 4.7280122013218095, |
|
"grad_norm": 0.0021866620518267155, |
|
"learning_rate": 8.923819970232003e-06, |
|
"loss": 0.2109, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 4.758515505846467, |
|
"grad_norm": 0.023091251030564308, |
|
"learning_rate": 8.90846435778169e-06, |
|
"loss": 0.1908, |
|
"step": 46800 |
|
}, |
|
{ |
|
"epoch": 4.7890188103711235, |
|
"grad_norm": 0.0027208509854972363, |
|
"learning_rate": 8.893013378942232e-06, |
|
"loss": 0.2422, |
|
"step": 47100 |
|
}, |
|
{ |
|
"epoch": 4.81952211489578, |
|
"grad_norm": 0.0003605252131819725, |
|
"learning_rate": 8.877467410716951e-06, |
|
"loss": 0.1941, |
|
"step": 47400 |
|
}, |
|
{ |
|
"epoch": 4.8500254194204375, |
|
"grad_norm": 4.229369640350342, |
|
"learning_rate": 8.861826832426916e-06, |
|
"loss": 0.2363, |
|
"step": 47700 |
|
}, |
|
{ |
|
"epoch": 4.880528723945094, |
|
"grad_norm": 0.01985393464565277, |
|
"learning_rate": 8.84609202570167e-06, |
|
"loss": 0.215, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 4.911032028469751, |
|
"grad_norm": 3.8484694957733154, |
|
"learning_rate": 8.830263374469927e-06, |
|
"loss": 0.2126, |
|
"step": 48300 |
|
}, |
|
{ |
|
"epoch": 4.941535332994408, |
|
"grad_norm": 4.421438279678114e-05, |
|
"learning_rate": 8.814341264950207e-06, |
|
"loss": 0.2451, |
|
"step": 48600 |
|
}, |
|
{ |
|
"epoch": 4.972038637519065, |
|
"grad_norm": 3.2253410816192627, |
|
"learning_rate": 8.798326085641407e-06, |
|
"loss": 0.261, |
|
"step": 48900 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.30429786443710327, |
|
"eval_runtime": 103.3201, |
|
"eval_samples_per_second": 20.316, |
|
"eval_steps_per_second": 1.355, |
|
"step": 49175 |
|
} |
|
], |
|
"logging_steps": 300, |
|
"max_steps": 196700, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.410768182018816e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|