Llama3.2_3B_Slake_BiomedCLIP_KG / trainer_state.json
{
"best_metric": 0.30429786443710327,
"best_model_checkpoint": "/mnt/storage1/ziya/VQA/M3D/LaMed/output/Llama3.2_3B_Slake_BiomedCLIP_KG2/checkpoint-49175",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 49175,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.030503304524656837,
"grad_norm": 219.84915161132812,
"learning_rate": 5.083884087442806e-07,
"loss": 6.7276,
"step": 300
},
{
"epoch": 0.061006609049313675,
"grad_norm": 198.07276916503906,
"learning_rate": 1.0167768174885612e-06,
"loss": 4.6638,
"step": 600
},
{
"epoch": 0.09150991357397051,
"grad_norm": 206.43557739257812,
"learning_rate": 1.525165226232842e-06,
"loss": 2.7355,
"step": 900
},
{
"epoch": 0.12201321809862735,
"grad_norm": 197.00241088867188,
"learning_rate": 2.0335536349771224e-06,
"loss": 1.7593,
"step": 1200
},
{
"epoch": 0.1525165226232842,
"grad_norm": 130.2694091796875,
"learning_rate": 2.541942043721403e-06,
"loss": 1.3763,
"step": 1500
},
{
"epoch": 0.18301982714794102,
"grad_norm": 125.28648376464844,
"learning_rate": 3.050330452465684e-06,
"loss": 1.2219,
"step": 1800
},
{
"epoch": 0.21352313167259787,
"grad_norm": 34.316184997558594,
"learning_rate": 3.558718861209965e-06,
"loss": 1.0971,
"step": 2100
},
{
"epoch": 0.2440264361972547,
"grad_norm": 98.0787582397461,
"learning_rate": 4.067107269954245e-06,
"loss": 1.1094,
"step": 2400
},
{
"epoch": 0.27452974072191155,
"grad_norm": 84.89276123046875,
"learning_rate": 4.575495678698526e-06,
"loss": 0.9373,
"step": 2700
},
{
"epoch": 0.3050330452465684,
"grad_norm": 59.95817565917969,
"learning_rate": 5.083884087442806e-06,
"loss": 1.0445,
"step": 3000
},
{
"epoch": 0.3355363497712252,
"grad_norm": 6.108051776885986,
"learning_rate": 5.5922724961870875e-06,
"loss": 0.9383,
"step": 3300
},
{
"epoch": 0.36603965429588203,
"grad_norm": 0.15768121182918549,
"learning_rate": 6.100660904931368e-06,
"loss": 0.8041,
"step": 3600
},
{
"epoch": 0.3965429588205389,
"grad_norm": 61.489315032958984,
"learning_rate": 6.609049313675649e-06,
"loss": 0.8024,
"step": 3900
},
{
"epoch": 0.42704626334519574,
"grad_norm": 59.69186782836914,
"learning_rate": 7.11743772241993e-06,
"loss": 0.8968,
"step": 4200
},
{
"epoch": 0.45754956786985257,
"grad_norm": 33.70270538330078,
"learning_rate": 7.62582613116421e-06,
"loss": 0.8929,
"step": 4500
},
{
"epoch": 0.4880528723945094,
"grad_norm": 0.0485120490193367,
"learning_rate": 8.13421453990849e-06,
"loss": 0.854,
"step": 4800
},
{
"epoch": 0.5185561769191662,
"grad_norm": 22.23134994506836,
"learning_rate": 8.642602948652772e-06,
"loss": 0.7011,
"step": 5100
},
{
"epoch": 0.5490594814438231,
"grad_norm": 31.907052993774414,
"learning_rate": 9.150991357397053e-06,
"loss": 0.7926,
"step": 5400
},
{
"epoch": 0.5795627859684799,
"grad_norm": 0.41279709339141846,
"learning_rate": 9.659379766141333e-06,
"loss": 0.6784,
"step": 5700
},
{
"epoch": 0.6100660904931368,
"grad_norm": 5.743080139160156,
"learning_rate": 9.9999933570978e-06,
"loss": 0.6052,
"step": 6000
},
{
"epoch": 0.6405693950177936,
"grad_norm": 0.039274271577596664,
"learning_rate": 9.999892097428444e-06,
"loss": 0.6623,
"step": 6300
},
{
"epoch": 0.6710726995424504,
"grad_norm": 7.821774959564209,
"learning_rate": 9.999668840572521e-06,
"loss": 0.7683,
"step": 6600
},
{
"epoch": 0.7015760040671073,
"grad_norm": 45.965476989746094,
"learning_rate": 9.99932359197749e-06,
"loss": 0.6642,
"step": 6900
},
{
"epoch": 0.7320793085917641,
"grad_norm": 11.506255149841309,
"learning_rate": 9.998856360067404e-06,
"loss": 0.5841,
"step": 7200
},
{
"epoch": 0.762582613116421,
"grad_norm": 4.3167243003845215,
"learning_rate": 9.998267156242705e-06,
"loss": 0.5843,
"step": 7500
},
{
"epoch": 0.7930859176410778,
"grad_norm": 0.1081758663058281,
"learning_rate": 9.997555994879945e-06,
"loss": 0.6163,
"step": 7800
},
{
"epoch": 0.8235892221657346,
"grad_norm": 8.694897651672363,
"learning_rate": 9.996722893331437e-06,
"loss": 0.5964,
"step": 8100
},
{
"epoch": 0.8540925266903915,
"grad_norm": 1.5994528532028198,
"learning_rate": 9.995767871924826e-06,
"loss": 0.5306,
"step": 8400
},
{
"epoch": 0.8845958312150483,
"grad_norm": 32.59883499145508,
"learning_rate": 9.994690953962602e-06,
"loss": 0.5799,
"step": 8700
},
{
"epoch": 0.9150991357397051,
"grad_norm": 18.219099044799805,
"learning_rate": 9.993492165721523e-06,
"loss": 0.537,
"step": 9000
},
{
"epoch": 0.945602440264362,
"grad_norm": 38.14984893798828,
"learning_rate": 9.99217153645198e-06,
"loss": 0.5993,
"step": 9300
},
{
"epoch": 0.9761057447890188,
"grad_norm": 26.25863265991211,
"learning_rate": 9.990729098377278e-06,
"loss": 0.5018,
"step": 9600
},
{
"epoch": 1.0,
"eval_loss": 0.6256594061851501,
"eval_runtime": 103.5099,
"eval_samples_per_second": 20.278,
"eval_steps_per_second": 1.353,
"step": 9835
},
{
"epoch": 1.0066090493136757,
"grad_norm": 0.06917725503444672,
"learning_rate": 9.989164886692855e-06,
"loss": 0.5179,
"step": 9900
},
{
"epoch": 1.0371123538383324,
"grad_norm": 0.5146661996841431,
"learning_rate": 9.987478939565419e-06,
"loss": 0.5132,
"step": 10200
},
{
"epoch": 1.0676156583629894,
"grad_norm": 0.00010405463399365544,
"learning_rate": 9.985671298132018e-06,
"loss": 0.553,
"step": 10500
},
{
"epoch": 1.0981189628876462,
"grad_norm": 3.304703712463379,
"learning_rate": 9.98374200649904e-06,
"loss": 0.4167,
"step": 10800
},
{
"epoch": 1.128622267412303,
"grad_norm": 0.6084465384483337,
"learning_rate": 9.981691111741132e-06,
"loss": 0.4076,
"step": 11100
},
{
"epoch": 1.1591255719369598,
"grad_norm": 0.05212326720356941,
"learning_rate": 9.979518663900047e-06,
"loss": 0.5073,
"step": 11400
},
{
"epoch": 1.1896288764616167,
"grad_norm": 21.804567337036133,
"learning_rate": 9.977224715983439e-06,
"loss": 0.4453,
"step": 11700
},
{
"epoch": 1.2201321809862735,
"grad_norm": 32.33522415161133,
"learning_rate": 9.974809323963552e-06,
"loss": 0.5258,
"step": 12000
},
{
"epoch": 1.2506354855109303,
"grad_norm": 0.14623811841011047,
"learning_rate": 9.972272546775862e-06,
"loss": 0.446,
"step": 12300
},
{
"epoch": 1.281138790035587,
"grad_norm": 0.28084316849708557,
"learning_rate": 9.969614446317645e-06,
"loss": 0.4267,
"step": 12600
},
{
"epoch": 1.311642094560244,
"grad_norm": 2.6160593032836914,
"learning_rate": 9.966835087446457e-06,
"loss": 0.5175,
"step": 12900
},
{
"epoch": 1.3421453990849008,
"grad_norm": 0.21234674751758575,
"learning_rate": 9.963934537978547e-06,
"loss": 0.4142,
"step": 13200
},
{
"epoch": 1.3726487036095576,
"grad_norm": 0.09984096884727478,
"learning_rate": 9.960912868687222e-06,
"loss": 0.3752,
"step": 13500
},
{
"epoch": 1.4031520081342146,
"grad_norm": 0.003108405042439699,
"learning_rate": 9.957770153301104e-06,
"loss": 0.4626,
"step": 13800
},
{
"epoch": 1.4336553126588714,
"grad_norm": 7.253620424307883e-05,
"learning_rate": 9.954506468502335e-06,
"loss": 0.3995,
"step": 14100
},
{
"epoch": 1.4641586171835281,
"grad_norm": 7.312223434448242,
"learning_rate": 9.951121893924704e-06,
"loss": 0.5808,
"step": 14400
},
{
"epoch": 1.4946619217081851,
"grad_norm": 10.09123420715332,
"learning_rate": 9.94761651215171e-06,
"loss": 0.4186,
"step": 14700
},
{
"epoch": 1.525165226232842,
"grad_norm": 6.39027214050293,
"learning_rate": 9.943990408714542e-06,
"loss": 0.4374,
"step": 15000
},
{
"epoch": 1.5556685307574987,
"grad_norm": 0.5071086287498474,
"learning_rate": 9.94024367208999e-06,
"loss": 0.3927,
"step": 15300
},
{
"epoch": 1.5861718352821557,
"grad_norm": 0.00023602554574608803,
"learning_rate": 9.936376393698296e-06,
"loss": 0.453,
"step": 15600
},
{
"epoch": 1.6166751398068124,
"grad_norm": 4.159149646759033,
"learning_rate": 9.93238866790091e-06,
"loss": 0.4594,
"step": 15900
},
{
"epoch": 1.6471784443314692,
"grad_norm": 0.02954951673746109,
"learning_rate": 9.928280591998201e-06,
"loss": 0.4708,
"step": 16200
},
{
"epoch": 1.6776817488561262,
"grad_norm": 0.036519214510917664,
"learning_rate": 9.924052266227069e-06,
"loss": 0.348,
"step": 16500
},
{
"epoch": 1.708185053380783,
"grad_norm": 14.24988079071045,
"learning_rate": 9.919703793758511e-06,
"loss": 0.4279,
"step": 16800
},
{
"epoch": 1.7386883579054397,
"grad_norm": 17.464754104614258,
"learning_rate": 9.9152352806951e-06,
"loss": 0.4133,
"step": 17100
},
{
"epoch": 1.7691916624300967,
"grad_norm": 0.008350013755261898,
"learning_rate": 9.91064683606839e-06,
"loss": 0.3868,
"step": 17400
},
{
"epoch": 1.7996949669547533,
"grad_norm": 0.14084838330745697,
"learning_rate": 9.90593857183627e-06,
"loss": 0.41,
"step": 17700
},
{
"epoch": 1.8301982714794103,
"grad_norm": 0.011409441009163857,
"learning_rate": 9.901110602880211e-06,
"loss": 0.3632,
"step": 18000
},
{
"epoch": 1.8607015760040673,
"grad_norm": 0.2167450338602066,
"learning_rate": 9.896163047002486e-06,
"loss": 0.4203,
"step": 18300
},
{
"epoch": 1.8912048805287238,
"grad_norm": 0.12340361624956131,
"learning_rate": 9.891096024923274e-06,
"loss": 0.3836,
"step": 18600
},
{
"epoch": 1.9217081850533808,
"grad_norm": 0.011780009604990482,
"learning_rate": 9.885909660277735e-06,
"loss": 0.3638,
"step": 18900
},
{
"epoch": 1.9522114895780376,
"grad_norm": 1.290787696838379,
"learning_rate": 9.880604079612977e-06,
"loss": 0.3716,
"step": 19200
},
{
"epoch": 1.9827147941026944,
"grad_norm": 1.6361336747650057e-05,
"learning_rate": 9.87517941238498e-06,
"loss": 0.3499,
"step": 19500
},
{
"epoch": 2.0,
"eval_loss": 0.41577816009521484,
"eval_runtime": 103.2846,
"eval_samples_per_second": 20.322,
"eval_steps_per_second": 1.355,
"step": 19670
},
{
"epoch": 2.0132180986273513,
"grad_norm": 69.40552520751953,
"learning_rate": 9.869635790955423e-06,
"loss": 0.3971,
"step": 19800
},
{
"epoch": 2.0437214031520083,
"grad_norm": 0.0739259347319603,
"learning_rate": 9.863973350588472e-06,
"loss": 0.3106,
"step": 20100
},
{
"epoch": 2.074224707676665,
"grad_norm": 6.068602085113525,
"learning_rate": 9.85819222944747e-06,
"loss": 0.3609,
"step": 20400
},
{
"epoch": 2.104728012201322,
"grad_norm": 15.744518280029297,
"learning_rate": 9.852292568591557e-06,
"loss": 0.355,
"step": 20700
},
{
"epoch": 2.135231316725979,
"grad_norm": 12.378608703613281,
"learning_rate": 9.846274511972251e-06,
"loss": 0.306,
"step": 21000
},
{
"epoch": 2.1657346212506354,
"grad_norm": 10.521739959716797,
"learning_rate": 9.840138206429911e-06,
"loss": 0.3215,
"step": 21300
},
{
"epoch": 2.1962379257752924,
"grad_norm": 8.210835456848145,
"learning_rate": 9.833883801690179e-06,
"loss": 0.2643,
"step": 21600
},
{
"epoch": 2.226741230299949,
"grad_norm": 0.28642842173576355,
"learning_rate": 9.827511450360295e-06,
"loss": 0.3466,
"step": 21900
},
{
"epoch": 2.257244534824606,
"grad_norm": 0.010792361572384834,
"learning_rate": 9.821021307925406e-06,
"loss": 0.3405,
"step": 22200
},
{
"epoch": 2.287747839349263,
"grad_norm": 0.46060484647750854,
"learning_rate": 9.814413532744753e-06,
"loss": 0.3429,
"step": 22500
},
{
"epoch": 2.3182511438739195,
"grad_norm": 0.04725227132439613,
"learning_rate": 9.80768828604781e-06,
"loss": 0.3277,
"step": 22800
},
{
"epoch": 2.3487544483985765,
"grad_norm": 4.012476921081543,
"learning_rate": 9.800845731930356e-06,
"loss": 0.3004,
"step": 23100
},
{
"epoch": 2.3792577529232335,
"grad_norm": 12.063570976257324,
"learning_rate": 9.793886037350461e-06,
"loss": 0.351,
"step": 23400
},
{
"epoch": 2.40976105744789,
"grad_norm": 0.0161123126745224,
"learning_rate": 9.786809372124425e-06,
"loss": 0.3215,
"step": 23700
},
{
"epoch": 2.440264361972547,
"grad_norm": 0.012391666881740093,
"learning_rate": 9.779615908922622e-06,
"loss": 0.3581,
"step": 24000
},
{
"epoch": 2.470767666497204,
"grad_norm": 3.5457940101623535,
"learning_rate": 9.772305823265294e-06,
"loss": 0.339,
"step": 24300
},
{
"epoch": 2.5012709710218606,
"grad_norm": 0.04232852905988693,
"learning_rate": 9.764879293518266e-06,
"loss": 0.3523,
"step": 24600
},
{
"epoch": 2.5317742755465176,
"grad_norm": 0.2725585997104645,
"learning_rate": 9.757336500888599e-06,
"loss": 0.2807,
"step": 24900
},
{
"epoch": 2.562277580071174,
"grad_norm": 15.362088203430176,
"learning_rate": 9.749677629420157e-06,
"loss": 0.3227,
"step": 25200
},
{
"epoch": 2.592780884595831,
"grad_norm": 0.04092194885015488,
"learning_rate": 9.74190286598913e-06,
"loss": 0.2848,
"step": 25500
},
{
"epoch": 2.623284189120488,
"grad_norm": 18.27284049987793,
"learning_rate": 9.734012400299463e-06,
"loss": 0.3182,
"step": 25800
},
{
"epoch": 2.6537874936451447,
"grad_norm": 10.792008399963379,
"learning_rate": 9.726006424878234e-06,
"loss": 0.3344,
"step": 26100
},
{
"epoch": 2.6842907981698017,
"grad_norm": 0.0012637190520763397,
"learning_rate": 9.717885135070957e-06,
"loss": 0.3022,
"step": 26400
},
{
"epoch": 2.7147941026944586,
"grad_norm": 0.0459415428340435,
"learning_rate": 9.709648729036805e-06,
"loss": 0.3292,
"step": 26700
},
{
"epoch": 2.745297407219115,
"grad_norm": 0.0021137718576937914,
"learning_rate": 9.70129740774379e-06,
"loss": 0.34,
"step": 27000
},
{
"epoch": 2.775800711743772,
"grad_norm": 0.020368747413158417,
"learning_rate": 9.69283137496385e-06,
"loss": 0.2781,
"step": 27300
},
{
"epoch": 2.806304016268429,
"grad_norm": 0.01668807491660118,
"learning_rate": 9.68425083726788e-06,
"loss": 0.3146,
"step": 27600
},
{
"epoch": 2.8368073207930857,
"grad_norm": 18.376174926757812,
"learning_rate": 9.67555600402069e-06,
"loss": 0.3176,
"step": 27900
},
{
"epoch": 2.8673106253177427,
"grad_norm": 0.012932281009852886,
"learning_rate": 9.666747087375894e-06,
"loss": 0.3183,
"step": 28200
},
{
"epoch": 2.8978139298423997,
"grad_norm": 24.642900466918945,
"learning_rate": 9.657824302270743e-06,
"loss": 0.3418,
"step": 28500
},
{
"epoch": 2.9283172343670563,
"grad_norm": 0.013992193154990673,
"learning_rate": 9.64878786642087e-06,
"loss": 0.3196,
"step": 28800
},
{
"epoch": 2.9588205388917133,
"grad_norm": 0.001322018215432763,
"learning_rate": 9.639638000314983e-06,
"loss": 0.2828,
"step": 29100
},
{
"epoch": 2.9893238434163703,
"grad_norm": 0.010299603454768658,
"learning_rate": 9.630374927209485e-06,
"loss": 0.3031,
"step": 29400
},
{
"epoch": 3.0,
"eval_loss": 0.384840726852417,
"eval_runtime": 103.5367,
"eval_samples_per_second": 20.273,
"eval_steps_per_second": 1.352,
"step": 29505
},
{
"epoch": 3.019827147941027,
"grad_norm": 0.03328376263380051,
"learning_rate": 9.620998873123027e-06,
"loss": 0.2759,
"step": 29700
},
{
"epoch": 3.050330452465684,
"grad_norm": 13.360301971435547,
"learning_rate": 9.611510066830984e-06,
"loss": 0.1929,
"step": 30000
},
{
"epoch": 3.080833756990341,
"grad_norm": 0.0005070280749350786,
"learning_rate": 9.601908739859895e-06,
"loss": 0.2731,
"step": 30300
},
{
"epoch": 3.1113370615149973,
"grad_norm": 7.794924385962076e-06,
"learning_rate": 9.592195126481786e-06,
"loss": 0.2898,
"step": 30600
},
{
"epoch": 3.1418403660396543,
"grad_norm": 0.000699843221809715,
"learning_rate": 9.582369463708473e-06,
"loss": 0.2485,
"step": 30900
},
{
"epoch": 3.1723436705643113,
"grad_norm": 0.24882352352142334,
"learning_rate": 9.572431991285775e-06,
"loss": 0.2353,
"step": 31200
},
{
"epoch": 3.202846975088968,
"grad_norm": 0.004962280858308077,
"learning_rate": 9.562382951687658e-06,
"loss": 0.2352,
"step": 31500
},
{
"epoch": 3.233350279613625,
"grad_norm": 4.832521915435791,
"learning_rate": 9.552222590110324e-06,
"loss": 0.2363,
"step": 31800
},
{
"epoch": 3.263853584138282,
"grad_norm": 0.019647739827632904,
"learning_rate": 9.541951154466233e-06,
"loss": 0.2931,
"step": 32100
},
{
"epoch": 3.2943568886629384,
"grad_norm": 0.04419803246855736,
"learning_rate": 9.53156889537804e-06,
"loss": 0.2634,
"step": 32400
},
{
"epoch": 3.3248601931875954,
"grad_norm": 0.00563009362667799,
"learning_rate": 9.521076066172493e-06,
"loss": 0.3196,
"step": 32700
},
{
"epoch": 3.3553634977122524,
"grad_norm": 35.982086181640625,
"learning_rate": 9.510472922874246e-06,
"loss": 0.2312,
"step": 33000
},
{
"epoch": 3.385866802236909,
"grad_norm": 0.023021390661597252,
"learning_rate": 9.49975972419961e-06,
"loss": 0.248,
"step": 33300
},
{
"epoch": 3.416370106761566,
"grad_norm": 12.021758079528809,
"learning_rate": 9.488936731550247e-06,
"loss": 0.2552,
"step": 33600
},
{
"epoch": 3.4468734112862225,
"grad_norm": 0.08628563582897186,
"learning_rate": 9.47800420900679e-06,
"loss": 0.2553,
"step": 33900
},
{
"epoch": 3.4773767158108795,
"grad_norm": 1.5440913438796997,
"learning_rate": 9.466962423322388e-06,
"loss": 0.3116,
"step": 34200
},
{
"epoch": 3.5078800203355365,
"grad_norm": 1.9582719687605277e-05,
"learning_rate": 9.455811643916217e-06,
"loss": 0.2573,
"step": 34500
},
{
"epoch": 3.538383324860193,
"grad_norm": 0.0327552855014801,
"learning_rate": 9.444552142866892e-06,
"loss": 0.2714,
"step": 34800
},
{
"epoch": 3.56888662938485,
"grad_norm": 23.4067440032959,
"learning_rate": 9.433184194905831e-06,
"loss": 0.2582,
"step": 35100
},
{
"epoch": 3.599389933909507,
"grad_norm": 0.510364830493927,
"learning_rate": 9.421708077410551e-06,
"loss": 0.2953,
"step": 35400
},
{
"epoch": 3.6298932384341636,
"grad_norm": 0.008372425101697445,
"learning_rate": 9.410124070397908e-06,
"loss": 0.3149,
"step": 35700
},
{
"epoch": 3.6603965429588206,
"grad_norm": 0.005560519173741341,
"learning_rate": 9.398432456517254e-06,
"loss": 0.2407,
"step": 36000
},
{
"epoch": 3.690899847483477,
"grad_norm": 4.856593608856201,
"learning_rate": 9.386633521043545e-06,
"loss": 0.2772,
"step": 36300
},
{
"epoch": 3.721403152008134,
"grad_norm": 0.00459680799394846,
"learning_rate": 9.374727551870377e-06,
"loss": 0.2901,
"step": 36600
},
{
"epoch": 3.751906456532791,
"grad_norm": 0.00823771022260189,
"learning_rate": 9.362714839502973e-06,
"loss": 0.3216,
"step": 36900
},
{
"epoch": 3.7824097610574476,
"grad_norm": 0.011493176221847534,
"learning_rate": 9.35059567705108e-06,
"loss": 0.2682,
"step": 37200
},
{
"epoch": 3.8129130655821046,
"grad_norm": 0.057664211839437485,
"learning_rate": 9.33837036022182e-06,
"loss": 0.2823,
"step": 37500
},
{
"epoch": 3.8434163701067616,
"grad_norm": 11.62787914276123,
"learning_rate": 9.326039187312485e-06,
"loss": 0.1798,
"step": 37800
},
{
"epoch": 3.873919674631418,
"grad_norm": 0.02458140067756176,
"learning_rate": 9.313602459203248e-06,
"loss": 0.2337,
"step": 38100
},
{
"epoch": 3.904422979156075,
"grad_norm": 0.019079158082604408,
"learning_rate": 9.301060479349826e-06,
"loss": 0.2539,
"step": 38400
},
{
"epoch": 3.934926283680732,
"grad_norm": 9.756794929504395,
"learning_rate": 9.288413553776076e-06,
"loss": 0.279,
"step": 38700
},
{
"epoch": 3.9654295882053887,
"grad_norm": 3.1425323486328125,
"learning_rate": 9.275661991066522e-06,
"loss": 0.2767,
"step": 39000
},
{
"epoch": 3.9959328927300457,
"grad_norm": 0.006602356676012278,
"learning_rate": 9.262806102358834e-06,
"loss": 0.2558,
"step": 39300
},
{
"epoch": 4.0,
"eval_loss": 0.3359658420085907,
"eval_runtime": 110.2827,
"eval_samples_per_second": 19.033,
"eval_steps_per_second": 1.269,
"step": 39340
},
{
"epoch": 4.026436197254703,
"grad_norm": 0.00417200056836009,
"learning_rate": 9.249846201336235e-06,
"loss": 0.2559,
"step": 39600
},
{
"epoch": 4.056939501779359,
"grad_norm": 0.008380061946809292,
"learning_rate": 9.236782604219838e-06,
"loss": 0.2173,
"step": 39900
},
{
"epoch": 4.087442806304017,
"grad_norm": 0.06265965104103088,
"learning_rate": 9.223615629760943e-06,
"loss": 0.2338,
"step": 40200
},
{
"epoch": 4.117946110828673,
"grad_norm": 2.496182241884526e-05,
"learning_rate": 9.21034559923325e-06,
"loss": 0.2761,
"step": 40500
},
{
"epoch": 4.14844941535333,
"grad_norm": 40.386531829833984,
"learning_rate": 9.196972836425027e-06,
"loss": 0.1951,
"step": 40800
},
{
"epoch": 4.178952719877987,
"grad_norm": 0.024761863052845,
"learning_rate": 9.183497667631199e-06,
"loss": 0.1979,
"step": 41100
},
{
"epoch": 4.209456024402644,
"grad_norm": 0.0945596694946289,
"learning_rate": 9.169920421645401e-06,
"loss": 0.1914,
"step": 41400
},
{
"epoch": 4.2399593289273,
"grad_norm": 0.6701067686080933,
"learning_rate": 9.156241429751947e-06,
"loss": 0.2216,
"step": 41700
},
{
"epoch": 4.270462633451958,
"grad_norm": 240.65187072753906,
"learning_rate": 9.142461025717739e-06,
"loss": 0.2277,
"step": 42000
},
{
"epoch": 4.300965937976614,
"grad_norm": 0.0036695213057100773,
"learning_rate": 9.128579545784142e-06,
"loss": 0.2476,
"step": 42300
},
{
"epoch": 4.331469242501271,
"grad_norm": 33.61179733276367,
"learning_rate": 9.114597328658763e-06,
"loss": 0.1958,
"step": 42600
},
{
"epoch": 4.361972547025927,
"grad_norm": 52.01089859008789,
"learning_rate": 9.100514715507196e-06,
"loss": 0.217,
"step": 42900
},
{
"epoch": 4.392475851550585,
"grad_norm": 0.0013590186135843396,
"learning_rate": 9.086332049944692e-06,
"loss": 0.2393,
"step": 43200
},
{
"epoch": 4.422979156075241,
"grad_norm": 0.12093915045261383,
"learning_rate": 9.072049678027778e-06,
"loss": 0.2175,
"step": 43500
},
{
"epoch": 4.453482460599898,
"grad_norm": 0.008124121464788914,
"learning_rate": 9.057667948245816e-06,
"loss": 0.262,
"step": 43800
},
{
"epoch": 4.483985765124555,
"grad_norm": 0.7974226474761963,
"learning_rate": 9.043187211512492e-06,
"loss": 0.2742,
"step": 44100
},
{
"epoch": 4.514489069649212,
"grad_norm": 13.317780494689941,
"learning_rate": 9.028607821157256e-06,
"loss": 0.2471,
"step": 44400
},
{
"epoch": 4.5449923741738685,
"grad_norm": 0.0005867113941349089,
"learning_rate": 9.013930132916709e-06,
"loss": 0.252,
"step": 44700
},
{
"epoch": 4.575495678698526,
"grad_norm": 0.053022801876068115,
"learning_rate": 8.999154504925914e-06,
"loss": 0.2319,
"step": 45000
},
{
"epoch": 4.6059989832231825,
"grad_norm": 0.0034278552047908306,
"learning_rate": 8.984281297709658e-06,
"loss": 0.2185,
"step": 45300
},
{
"epoch": 4.636502287747839,
"grad_norm": 0.008740383200347424,
"learning_rate": 8.969310874173663e-06,
"loss": 0.2231,
"step": 45600
},
{
"epoch": 4.6670055922724965,
"grad_norm": 0.06879168748855591,
"learning_rate": 8.95424359959572e-06,
"loss": 0.1777,
"step": 45900
},
{
"epoch": 4.697508896797153,
"grad_norm": 0.005040327087044716,
"learning_rate": 8.939079841616785e-06,
"loss": 0.2596,
"step": 46200
},
{
"epoch": 4.7280122013218095,
"grad_norm": 0.0021866620518267155,
"learning_rate": 8.923819970232003e-06,
"loss": 0.2109,
"step": 46500
},
{
"epoch": 4.758515505846467,
"grad_norm": 0.023091251030564308,
"learning_rate": 8.90846435778169e-06,
"loss": 0.1908,
"step": 46800
},
{
"epoch": 4.7890188103711235,
"grad_norm": 0.0027208509854972363,
"learning_rate": 8.893013378942232e-06,
"loss": 0.2422,
"step": 47100
},
{
"epoch": 4.81952211489578,
"grad_norm": 0.0003605252131819725,
"learning_rate": 8.877467410716951e-06,
"loss": 0.1941,
"step": 47400
},
{
"epoch": 4.8500254194204375,
"grad_norm": 4.229369640350342,
"learning_rate": 8.861826832426916e-06,
"loss": 0.2363,
"step": 47700
},
{
"epoch": 4.880528723945094,
"grad_norm": 0.01985393464565277,
"learning_rate": 8.84609202570167e-06,
"loss": 0.215,
"step": 48000
},
{
"epoch": 4.911032028469751,
"grad_norm": 3.8484694957733154,
"learning_rate": 8.830263374469927e-06,
"loss": 0.2126,
"step": 48300
},
{
"epoch": 4.941535332994408,
"grad_norm": 4.421438279678114e-05,
"learning_rate": 8.814341264950207e-06,
"loss": 0.2451,
"step": 48600
},
{
"epoch": 4.972038637519065,
"grad_norm": 3.2253410816192627,
"learning_rate": 8.798326085641407e-06,
"loss": 0.261,
"step": 48900
},
{
"epoch": 5.0,
"eval_loss": 0.30429786443710327,
"eval_runtime": 103.3201,
"eval_samples_per_second": 20.316,
"eval_steps_per_second": 1.355,
"step": 49175
}
],
"logging_steps": 300,
"max_steps": 196700,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.410768182018816e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}