Qwen2.5-1.5B-Open-R1-Distill / trainer_state.json
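The log_history below records the training loss, learning rate, and gradient norm every 5 optimizer steps across 5 epochs (3,665 global steps in total). As a minimal sketch of how this file can be inspected, assuming it has been downloaded locally as trainer_state.json (the path and plot labels are illustrative, not part of the checkpoint), the loss curve can be read out and plotted like this:

import json

import matplotlib.pyplot as plt

# Load the trainer state written by the Hugging Face Trainer.
# The filename is an assumption: point it at wherever the file was saved.
with open("trainer_state.json") as f:
    state = json.load(f)

# Each logged entry carries "step", "loss", "learning_rate", "grad_norm", "epoch".
logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title("Qwen2.5-1.5B-Open-R1-Distill training loss")
plt.show()

The raw file contents follow.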
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 3665,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0068212824010914054,
"grad_norm": 2.718381452173294,
"learning_rate": 1.3586956521739131e-06,
"loss": 0.8554,
"step": 5
},
{
"epoch": 0.013642564802182811,
"grad_norm": 1.8320824052298514,
"learning_rate": 2.7173913043478263e-06,
"loss": 0.8525,
"step": 10
},
{
"epoch": 0.020463847203274217,
"grad_norm": 1.4237490955410819,
"learning_rate": 4.07608695652174e-06,
"loss": 0.8086,
"step": 15
},
{
"epoch": 0.027285129604365622,
"grad_norm": 1.0670318844733424,
"learning_rate": 5.4347826086956525e-06,
"loss": 0.7639,
"step": 20
},
{
"epoch": 0.034106412005457026,
"grad_norm": 0.8294384112181443,
"learning_rate": 6.7934782608695655e-06,
"loss": 0.7246,
"step": 25
},
{
"epoch": 0.040927694406548434,
"grad_norm": 0.6498540769323535,
"learning_rate": 8.15217391304348e-06,
"loss": 0.6903,
"step": 30
},
{
"epoch": 0.047748976807639835,
"grad_norm": 0.44534977628267997,
"learning_rate": 9.510869565217392e-06,
"loss": 0.6795,
"step": 35
},
{
"epoch": 0.054570259208731244,
"grad_norm": 0.3942695256318386,
"learning_rate": 1.0869565217391305e-05,
"loss": 0.6436,
"step": 40
},
{
"epoch": 0.061391541609822645,
"grad_norm": 0.33678552623662655,
"learning_rate": 1.2228260869565218e-05,
"loss": 0.6429,
"step": 45
},
{
"epoch": 0.06821282401091405,
"grad_norm": 0.3279277406182261,
"learning_rate": 1.3586956521739131e-05,
"loss": 0.6403,
"step": 50
},
{
"epoch": 0.07503410641200546,
"grad_norm": 0.3110307712217628,
"learning_rate": 1.4945652173913044e-05,
"loss": 0.6374,
"step": 55
},
{
"epoch": 0.08185538881309687,
"grad_norm": 0.3155916087900725,
"learning_rate": 1.630434782608696e-05,
"loss": 0.6218,
"step": 60
},
{
"epoch": 0.08867667121418826,
"grad_norm": 0.3402617489170636,
"learning_rate": 1.766304347826087e-05,
"loss": 0.6237,
"step": 65
},
{
"epoch": 0.09549795361527967,
"grad_norm": 0.3613035423063458,
"learning_rate": 1.9021739130434784e-05,
"loss": 0.611,
"step": 70
},
{
"epoch": 0.10231923601637108,
"grad_norm": 0.33685873327255084,
"learning_rate": 2.0380434782608694e-05,
"loss": 0.6119,
"step": 75
},
{
"epoch": 0.10914051841746249,
"grad_norm": 0.31895725640857175,
"learning_rate": 2.173913043478261e-05,
"loss": 0.5929,
"step": 80
},
{
"epoch": 0.11596180081855388,
"grad_norm": 0.367814337182358,
"learning_rate": 2.3097826086956523e-05,
"loss": 0.5847,
"step": 85
},
{
"epoch": 0.12278308321964529,
"grad_norm": 0.33537099137809573,
"learning_rate": 2.4456521739130436e-05,
"loss": 0.6021,
"step": 90
},
{
"epoch": 0.1296043656207367,
"grad_norm": 0.3515481694311979,
"learning_rate": 2.5815217391304346e-05,
"loss": 0.5971,
"step": 95
},
{
"epoch": 0.1364256480218281,
"grad_norm": 0.48781555539445853,
"learning_rate": 2.7173913043478262e-05,
"loss": 0.5828,
"step": 100
},
{
"epoch": 0.1432469304229195,
"grad_norm": 0.3695141232563311,
"learning_rate": 2.8532608695652175e-05,
"loss": 0.5839,
"step": 105
},
{
"epoch": 0.15006821282401092,
"grad_norm": 0.3608533158391854,
"learning_rate": 2.9891304347826088e-05,
"loss": 0.587,
"step": 110
},
{
"epoch": 0.15688949522510232,
"grad_norm": 0.42150696344097405,
"learning_rate": 3.125e-05,
"loss": 0.5848,
"step": 115
},
{
"epoch": 0.16371077762619374,
"grad_norm": 0.4645864912756431,
"learning_rate": 3.260869565217392e-05,
"loss": 0.5733,
"step": 120
},
{
"epoch": 0.17053206002728513,
"grad_norm": 0.48252326725187134,
"learning_rate": 3.3967391304347826e-05,
"loss": 0.5799,
"step": 125
},
{
"epoch": 0.17735334242837653,
"grad_norm": 0.439342079352817,
"learning_rate": 3.532608695652174e-05,
"loss": 0.5754,
"step": 130
},
{
"epoch": 0.18417462482946795,
"grad_norm": 0.5005585625374034,
"learning_rate": 3.668478260869566e-05,
"loss": 0.5744,
"step": 135
},
{
"epoch": 0.19099590723055934,
"grad_norm": 0.5876076245892312,
"learning_rate": 3.804347826086957e-05,
"loss": 0.5871,
"step": 140
},
{
"epoch": 0.19781718963165076,
"grad_norm": 0.5038018525075361,
"learning_rate": 3.940217391304348e-05,
"loss": 0.559,
"step": 145
},
{
"epoch": 0.20463847203274216,
"grad_norm": 0.5317683287124427,
"learning_rate": 4.076086956521739e-05,
"loss": 0.5743,
"step": 150
},
{
"epoch": 0.21145975443383355,
"grad_norm": 0.47414673284908293,
"learning_rate": 4.2119565217391304e-05,
"loss": 0.5582,
"step": 155
},
{
"epoch": 0.21828103683492497,
"grad_norm": 0.5016540873040667,
"learning_rate": 4.347826086956522e-05,
"loss": 0.5667,
"step": 160
},
{
"epoch": 0.22510231923601637,
"grad_norm": 0.4794691683623029,
"learning_rate": 4.483695652173913e-05,
"loss": 0.5696,
"step": 165
},
{
"epoch": 0.23192360163710776,
"grad_norm": 0.6036530565440041,
"learning_rate": 4.6195652173913046e-05,
"loss": 0.5629,
"step": 170
},
{
"epoch": 0.23874488403819918,
"grad_norm": 0.5121018480218417,
"learning_rate": 4.7554347826086956e-05,
"loss": 0.5774,
"step": 175
},
{
"epoch": 0.24556616643929058,
"grad_norm": 0.5943036033467437,
"learning_rate": 4.891304347826087e-05,
"loss": 0.5574,
"step": 180
},
{
"epoch": 0.252387448840382,
"grad_norm": 0.5932480324295518,
"learning_rate": 4.999999083686275e-05,
"loss": 0.5531,
"step": 185
},
{
"epoch": 0.2592087312414734,
"grad_norm": 0.535232775932377,
"learning_rate": 4.999967012784259e-05,
"loss": 0.5638,
"step": 190
},
{
"epoch": 0.2660300136425648,
"grad_norm": 0.4932300563298028,
"learning_rate": 4.999889126942317e-05,
"loss": 0.551,
"step": 195
},
{
"epoch": 0.2728512960436562,
"grad_norm": 0.3724078808223032,
"learning_rate": 4.999765427746401e-05,
"loss": 0.5601,
"step": 200
},
{
"epoch": 0.27967257844474763,
"grad_norm": 0.527609939821496,
"learning_rate": 4.9995959177153344e-05,
"loss": 0.5533,
"step": 205
},
{
"epoch": 0.286493860845839,
"grad_norm": 0.48098555041424135,
"learning_rate": 4.999380600300766e-05,
"loss": 0.5543,
"step": 210
},
{
"epoch": 0.2933151432469304,
"grad_norm": 0.3977697801311094,
"learning_rate": 4.999119479887092e-05,
"loss": 0.5572,
"step": 215
},
{
"epoch": 0.30013642564802184,
"grad_norm": 0.4438001480487939,
"learning_rate": 4.9988125617913766e-05,
"loss": 0.5362,
"step": 220
},
{
"epoch": 0.3069577080491132,
"grad_norm": 0.40138360501438736,
"learning_rate": 4.998459852263239e-05,
"loss": 0.5488,
"step": 225
},
{
"epoch": 0.31377899045020463,
"grad_norm": 0.3935147485176112,
"learning_rate": 4.9980613584847244e-05,
"loss": 0.5443,
"step": 230
},
{
"epoch": 0.32060027285129605,
"grad_norm": 0.47675495570547083,
"learning_rate": 4.9976170885701596e-05,
"loss": 0.5363,
"step": 235
},
{
"epoch": 0.3274215552523875,
"grad_norm": 0.37900784050363756,
"learning_rate": 4.9971270515659874e-05,
"loss": 0.5489,
"step": 240
},
{
"epoch": 0.33424283765347884,
"grad_norm": 0.43306874676755636,
"learning_rate": 4.996591257450584e-05,
"loss": 0.5575,
"step": 245
},
{
"epoch": 0.34106412005457026,
"grad_norm": 0.45525589941001887,
"learning_rate": 4.996009717134054e-05,
"loss": 0.5394,
"step": 250
},
{
"epoch": 0.3478854024556617,
"grad_norm": 0.36973267988621533,
"learning_rate": 4.995382442458009e-05,
"loss": 0.5404,
"step": 255
},
{
"epoch": 0.35470668485675305,
"grad_norm": 0.3888515869412804,
"learning_rate": 4.9947094461953255e-05,
"loss": 0.5291,
"step": 260
},
{
"epoch": 0.3615279672578445,
"grad_norm": 0.35113901228576483,
"learning_rate": 4.993990742049886e-05,
"loss": 0.5485,
"step": 265
},
{
"epoch": 0.3683492496589359,
"grad_norm": 0.41740322629825416,
"learning_rate": 4.9932263446562995e-05,
"loss": 0.5374,
"step": 270
},
{
"epoch": 0.37517053206002726,
"grad_norm": 0.4157253561819312,
"learning_rate": 4.9924162695796016e-05,
"loss": 0.5332,
"step": 275
},
{
"epoch": 0.3819918144611187,
"grad_norm": 0.35820892208440913,
"learning_rate": 4.991560533314944e-05,
"loss": 0.539,
"step": 280
},
{
"epoch": 0.3888130968622101,
"grad_norm": 0.4088803576788514,
"learning_rate": 4.9906591532872496e-05,
"loss": 0.5433,
"step": 285
},
{
"epoch": 0.3956343792633015,
"grad_norm": 0.371678787656567,
"learning_rate": 4.989712147850865e-05,
"loss": 0.5389,
"step": 290
},
{
"epoch": 0.4024556616643929,
"grad_norm": 0.582724495752912,
"learning_rate": 4.988719536289182e-05,
"loss": 0.5303,
"step": 295
},
{
"epoch": 0.4092769440654843,
"grad_norm": 0.5039803164193325,
"learning_rate": 4.9876813388142466e-05,
"loss": 0.5299,
"step": 300
},
{
"epoch": 0.41609822646657574,
"grad_norm": 0.45140868023849334,
"learning_rate": 4.986597576566351e-05,
"loss": 0.545,
"step": 305
},
{
"epoch": 0.4229195088676671,
"grad_norm": 0.4153549634571551,
"learning_rate": 4.9854682716135965e-05,
"loss": 0.5421,
"step": 310
},
{
"epoch": 0.4297407912687585,
"grad_norm": 0.39423844986474943,
"learning_rate": 4.984293446951447e-05,
"loss": 0.5363,
"step": 315
},
{
"epoch": 0.43656207366984995,
"grad_norm": 0.38143699656601243,
"learning_rate": 4.983073126502266e-05,
"loss": 0.5366,
"step": 320
},
{
"epoch": 0.4433833560709413,
"grad_norm": 0.40244690165803765,
"learning_rate": 4.9818073351148184e-05,
"loss": 0.5429,
"step": 325
},
{
"epoch": 0.45020463847203274,
"grad_norm": 0.4415944055471395,
"learning_rate": 4.9804960985637745e-05,
"loss": 0.5304,
"step": 330
},
{
"epoch": 0.45702592087312416,
"grad_norm": 0.3394607588175486,
"learning_rate": 4.9791394435491815e-05,
"loss": 0.5367,
"step": 335
},
{
"epoch": 0.4638472032742155,
"grad_norm": 0.44036319608717406,
"learning_rate": 4.977737397695919e-05,
"loss": 0.5296,
"step": 340
},
{
"epoch": 0.47066848567530695,
"grad_norm": 0.3612121871031905,
"learning_rate": 4.9762899895531365e-05,
"loss": 0.5226,
"step": 345
},
{
"epoch": 0.47748976807639837,
"grad_norm": 0.4084070856561623,
"learning_rate": 4.9747972485936746e-05,
"loss": 0.5312,
"step": 350
},
{
"epoch": 0.4843110504774898,
"grad_norm": 0.4187475634442714,
"learning_rate": 4.973259205213461e-05,
"loss": 0.5405,
"step": 355
},
{
"epoch": 0.49113233287858116,
"grad_norm": 0.43222798441609056,
"learning_rate": 4.971675890730897e-05,
"loss": 0.5442,
"step": 360
},
{
"epoch": 0.4979536152796726,
"grad_norm": 0.43270299261628004,
"learning_rate": 4.9700473373862124e-05,
"loss": 0.5219,
"step": 365
},
{
"epoch": 0.504774897680764,
"grad_norm": 0.3734426909759304,
"learning_rate": 4.9683735783408165e-05,
"loss": 0.5238,
"step": 370
},
{
"epoch": 0.5115961800818554,
"grad_norm": 0.41628380265626064,
"learning_rate": 4.9666546476766164e-05,
"loss": 0.5321,
"step": 375
},
{
"epoch": 0.5184174624829468,
"grad_norm": 0.3923952948213395,
"learning_rate": 4.9648905803953284e-05,
"loss": 0.5182,
"step": 380
},
{
"epoch": 0.5252387448840382,
"grad_norm": 0.31124417935955667,
"learning_rate": 4.963081412417762e-05,
"loss": 0.5277,
"step": 385
},
{
"epoch": 0.5320600272851296,
"grad_norm": 0.359338085196968,
"learning_rate": 4.961227180583089e-05,
"loss": 0.5304,
"step": 390
},
{
"epoch": 0.538881309686221,
"grad_norm": 0.333204967843598,
"learning_rate": 4.9593279226480944e-05,
"loss": 0.5139,
"step": 395
},
{
"epoch": 0.5457025920873124,
"grad_norm": 0.4730935475590952,
"learning_rate": 4.9573836772864074e-05,
"loss": 0.5361,
"step": 400
},
{
"epoch": 0.5525238744884038,
"grad_norm": 0.44098956286299457,
"learning_rate": 4.955394484087711e-05,
"loss": 0.5343,
"step": 405
},
{
"epoch": 0.5593451568894953,
"grad_norm": 0.42042925676970927,
"learning_rate": 4.953360383556941e-05,
"loss": 0.5193,
"step": 410
},
{
"epoch": 0.5661664392905866,
"grad_norm": 0.42110351706341315,
"learning_rate": 4.951281417113457e-05,
"loss": 0.5245,
"step": 415
},
{
"epoch": 0.572987721691678,
"grad_norm": 0.3695438489534778,
"learning_rate": 4.9491576270902e-05,
"loss": 0.5241,
"step": 420
},
{
"epoch": 0.5798090040927695,
"grad_norm": 0.41403931778963793,
"learning_rate": 4.946989056732833e-05,
"loss": 0.5192,
"step": 425
},
{
"epoch": 0.5866302864938608,
"grad_norm": 0.3982973651072619,
"learning_rate": 4.944775750198858e-05,
"loss": 0.524,
"step": 430
},
{
"epoch": 0.5934515688949522,
"grad_norm": 0.42595084178546927,
"learning_rate": 4.942517752556714e-05,
"loss": 0.5134,
"step": 435
},
{
"epoch": 0.6002728512960437,
"grad_norm": 0.35556353177455896,
"learning_rate": 4.940215109784865e-05,
"loss": 0.5109,
"step": 440
},
{
"epoch": 0.607094133697135,
"grad_norm": 0.3420291981554025,
"learning_rate": 4.937867868770862e-05,
"loss": 0.5238,
"step": 445
},
{
"epoch": 0.6139154160982264,
"grad_norm": 0.4013278641302897,
"learning_rate": 4.9354760773103845e-05,
"loss": 0.5225,
"step": 450
},
{
"epoch": 0.6207366984993179,
"grad_norm": 0.3606171148062443,
"learning_rate": 4.933039784106272e-05,
"loss": 0.5259,
"step": 455
},
{
"epoch": 0.6275579809004093,
"grad_norm": 0.3381144814134007,
"learning_rate": 4.930559038767532e-05,
"loss": 0.5194,
"step": 460
},
{
"epoch": 0.6343792633015006,
"grad_norm": 0.391843344180771,
"learning_rate": 4.9280338918083264e-05,
"loss": 0.5139,
"step": 465
},
{
"epoch": 0.6412005457025921,
"grad_norm": 0.36206934317906436,
"learning_rate": 4.925464394646944e-05,
"loss": 0.5325,
"step": 470
},
{
"epoch": 0.6480218281036835,
"grad_norm": 0.3433338943637673,
"learning_rate": 4.922850599604756e-05,
"loss": 0.5085,
"step": 475
},
{
"epoch": 0.654843110504775,
"grad_norm": 0.3299742405744191,
"learning_rate": 4.920192559905149e-05,
"loss": 0.5108,
"step": 480
},
{
"epoch": 0.6616643929058663,
"grad_norm": 0.4134988825523861,
"learning_rate": 4.9174903296724394e-05,
"loss": 0.5249,
"step": 485
},
{
"epoch": 0.6684856753069577,
"grad_norm": 0.37990282467605163,
"learning_rate": 4.914743963930775e-05,
"loss": 0.5102,
"step": 490
},
{
"epoch": 0.6753069577080492,
"grad_norm": 0.399202229699515,
"learning_rate": 4.911953518603012e-05,
"loss": 0.5031,
"step": 495
},
{
"epoch": 0.6821282401091405,
"grad_norm": 0.4466818320853147,
"learning_rate": 4.909119050509576e-05,
"loss": 0.5077,
"step": 500
},
{
"epoch": 0.6889495225102319,
"grad_norm": 0.3878038281442584,
"learning_rate": 4.906240617367308e-05,
"loss": 0.5163,
"step": 505
},
{
"epoch": 0.6957708049113234,
"grad_norm": 0.46953615879792254,
"learning_rate": 4.9033182777882845e-05,
"loss": 0.5136,
"step": 510
},
{
"epoch": 0.7025920873124147,
"grad_norm": 0.43602888466155715,
"learning_rate": 4.9003520912786286e-05,
"loss": 0.518,
"step": 515
},
{
"epoch": 0.7094133697135061,
"grad_norm": 0.39296876089094146,
"learning_rate": 4.8973421182372955e-05,
"loss": 0.5298,
"step": 520
},
{
"epoch": 0.7162346521145976,
"grad_norm": 0.4385411631704965,
"learning_rate": 4.8942884199548424e-05,
"loss": 0.5142,
"step": 525
},
{
"epoch": 0.723055934515689,
"grad_norm": 0.3690695281664004,
"learning_rate": 4.891191058612184e-05,
"loss": 0.5078,
"step": 530
},
{
"epoch": 0.7298772169167803,
"grad_norm": 0.38913526291124273,
"learning_rate": 4.8880500972793204e-05,
"loss": 0.5134,
"step": 535
},
{
"epoch": 0.7366984993178718,
"grad_norm": 0.3579203031993548,
"learning_rate": 4.88486559991406e-05,
"loss": 0.5185,
"step": 540
},
{
"epoch": 0.7435197817189632,
"grad_norm": 0.3437909123481529,
"learning_rate": 4.8816376313607095e-05,
"loss": 0.5218,
"step": 545
},
{
"epoch": 0.7503410641200545,
"grad_norm": 0.34425300375670553,
"learning_rate": 4.878366257348761e-05,
"loss": 0.5063,
"step": 550
},
{
"epoch": 0.757162346521146,
"grad_norm": 0.37603307112189627,
"learning_rate": 4.8750515444915475e-05,
"loss": 0.5067,
"step": 555
},
{
"epoch": 0.7639836289222374,
"grad_norm": 0.42138974150775454,
"learning_rate": 4.8716935602848904e-05,
"loss": 0.5025,
"step": 560
},
{
"epoch": 0.7708049113233287,
"grad_norm": 0.46990974803356833,
"learning_rate": 4.868292373105722e-05,
"loss": 0.5156,
"step": 565
},
{
"epoch": 0.7776261937244202,
"grad_norm": 0.4146567827090065,
"learning_rate": 4.8648480522106974e-05,
"loss": 0.5064,
"step": 570
},
{
"epoch": 0.7844474761255116,
"grad_norm": 0.37240757378463407,
"learning_rate": 4.8613606677347794e-05,
"loss": 0.506,
"step": 575
},
{
"epoch": 0.791268758526603,
"grad_norm": 0.4262453048825825,
"learning_rate": 4.857830290689814e-05,
"loss": 0.5209,
"step": 580
},
{
"epoch": 0.7980900409276944,
"grad_norm": 0.4373383547064571,
"learning_rate": 4.8542569929630844e-05,
"loss": 0.5081,
"step": 585
},
{
"epoch": 0.8049113233287858,
"grad_norm": 0.44110437145532355,
"learning_rate": 4.8506408473158414e-05,
"loss": 0.522,
"step": 590
},
{
"epoch": 0.8117326057298773,
"grad_norm": 0.4173217160585339,
"learning_rate": 4.8469819273818315e-05,
"loss": 0.5074,
"step": 595
},
{
"epoch": 0.8185538881309686,
"grad_norm": 0.44658170498127164,
"learning_rate": 4.843280307665788e-05,
"loss": 0.5052,
"step": 600
},
{
"epoch": 0.82537517053206,
"grad_norm": 0.3857834259517951,
"learning_rate": 4.8395360635419226e-05,
"loss": 0.5109,
"step": 605
},
{
"epoch": 0.8321964529331515,
"grad_norm": 0.3505595634781613,
"learning_rate": 4.835749271252383e-05,
"loss": 0.5048,
"step": 610
},
{
"epoch": 0.8390177353342428,
"grad_norm": 0.4486071997949835,
"learning_rate": 4.8319200079057044e-05,
"loss": 0.502,
"step": 615
},
{
"epoch": 0.8458390177353342,
"grad_norm": 0.36540477393499693,
"learning_rate": 4.828048351475239e-05,
"loss": 0.4994,
"step": 620
},
{
"epoch": 0.8526603001364257,
"grad_norm": 0.3535891851423872,
"learning_rate": 4.824134380797568e-05,
"loss": 0.5156,
"step": 625
},
{
"epoch": 0.859481582537517,
"grad_norm": 0.3564989100834705,
"learning_rate": 4.820178175570897e-05,
"loss": 0.5145,
"step": 630
},
{
"epoch": 0.8663028649386084,
"grad_norm": 0.34049206381140384,
"learning_rate": 4.81617981635343e-05,
"loss": 0.5124,
"step": 635
},
{
"epoch": 0.8731241473396999,
"grad_norm": 0.322576632173292,
"learning_rate": 4.8121393845617336e-05,
"loss": 0.4972,
"step": 640
},
{
"epoch": 0.8799454297407913,
"grad_norm": 0.35074887766341006,
"learning_rate": 4.808056962469076e-05,
"loss": 0.5005,
"step": 645
},
{
"epoch": 0.8867667121418826,
"grad_norm": 0.37244533491828524,
"learning_rate": 4.803932633203753e-05,
"loss": 0.508,
"step": 650
},
{
"epoch": 0.8935879945429741,
"grad_norm": 0.3439172116866752,
"learning_rate": 4.799766480747394e-05,
"loss": 0.5027,
"step": 655
},
{
"epoch": 0.9004092769440655,
"grad_norm": 0.4355482554835195,
"learning_rate": 4.795558589933254e-05,
"loss": 0.5067,
"step": 660
},
{
"epoch": 0.9072305593451568,
"grad_norm": 0.4870363512314333,
"learning_rate": 4.791309046444485e-05,
"loss": 0.5029,
"step": 665
},
{
"epoch": 0.9140518417462483,
"grad_norm": 0.4120257961084212,
"learning_rate": 4.787017936812391e-05,
"loss": 0.5097,
"step": 670
},
{
"epoch": 0.9208731241473397,
"grad_norm": 0.37936056411557967,
"learning_rate": 4.782685348414666e-05,
"loss": 0.5002,
"step": 675
},
{
"epoch": 0.927694406548431,
"grad_norm": 0.4164234019796205,
"learning_rate": 4.7783113694736155e-05,
"loss": 0.5095,
"step": 680
},
{
"epoch": 0.9345156889495225,
"grad_norm": 0.41384348002400084,
"learning_rate": 4.77389608905436e-05,
"loss": 0.4951,
"step": 685
},
{
"epoch": 0.9413369713506139,
"grad_norm": 0.4758845307523666,
"learning_rate": 4.769439597063021e-05,
"loss": 0.5085,
"step": 690
},
{
"epoch": 0.9481582537517054,
"grad_norm": 0.3574461311161845,
"learning_rate": 4.7649419842448897e-05,
"loss": 0.5059,
"step": 695
},
{
"epoch": 0.9549795361527967,
"grad_norm": 0.3495938838243895,
"learning_rate": 4.76040334218258e-05,
"loss": 0.5081,
"step": 700
},
{
"epoch": 0.9618008185538881,
"grad_norm": 0.4378709381568982,
"learning_rate": 4.755823763294165e-05,
"loss": 0.4918,
"step": 705
},
{
"epoch": 0.9686221009549796,
"grad_norm": 0.29336875522442324,
"learning_rate": 4.751203340831293e-05,
"loss": 0.5024,
"step": 710
},
{
"epoch": 0.975443383356071,
"grad_norm": 0.36977219152146296,
"learning_rate": 4.746542168877286e-05,
"loss": 0.5134,
"step": 715
},
{
"epoch": 0.9822646657571623,
"grad_norm": 0.3291235462155399,
"learning_rate": 4.741840342345234e-05,
"loss": 0.4992,
"step": 720
},
{
"epoch": 0.9890859481582538,
"grad_norm": 0.3371844589918567,
"learning_rate": 4.7370979569760487e-05,
"loss": 0.4955,
"step": 725
},
{
"epoch": 0.9959072305593452,
"grad_norm": 0.38097927189872294,
"learning_rate": 4.732315109336526e-05,
"loss": 0.5028,
"step": 730
},
{
"epoch": 1.0027285129604366,
"grad_norm": 0.3827764202163699,
"learning_rate": 4.7274918968173715e-05,
"loss": 0.4903,
"step": 735
},
{
"epoch": 1.009549795361528,
"grad_norm": 0.3823197870028838,
"learning_rate": 4.722628417631222e-05,
"loss": 0.4698,
"step": 740
},
{
"epoch": 1.0163710777626194,
"grad_norm": 0.49050241814536016,
"learning_rate": 4.717724770810644e-05,
"loss": 0.4784,
"step": 745
},
{
"epoch": 1.0231923601637107,
"grad_norm": 0.42959343836399067,
"learning_rate": 4.712781056206115e-05,
"loss": 0.4784,
"step": 750
},
{
"epoch": 1.030013642564802,
"grad_norm": 0.33192677798145853,
"learning_rate": 4.707797374483995e-05,
"loss": 0.4688,
"step": 755
},
{
"epoch": 1.0368349249658937,
"grad_norm": 0.33677259823116745,
"learning_rate": 4.7027738271244745e-05,
"loss": 0.4709,
"step": 760
},
{
"epoch": 1.043656207366985,
"grad_norm": 0.3587520916498671,
"learning_rate": 4.697710516419506e-05,
"loss": 0.4732,
"step": 765
},
{
"epoch": 1.0504774897680764,
"grad_norm": 0.3052981614611353,
"learning_rate": 4.692607545470724e-05,
"loss": 0.4819,
"step": 770
},
{
"epoch": 1.0572987721691678,
"grad_norm": 0.3082531398269355,
"learning_rate": 4.6874650181873434e-05,
"loss": 0.4621,
"step": 775
},
{
"epoch": 1.0641200545702592,
"grad_norm": 0.3293876714456996,
"learning_rate": 4.6822830392840454e-05,
"loss": 0.4692,
"step": 780
},
{
"epoch": 1.0709413369713505,
"grad_norm": 0.2874139325705941,
"learning_rate": 4.677061714278845e-05,
"loss": 0.4739,
"step": 785
},
{
"epoch": 1.077762619372442,
"grad_norm": 0.3172277958685006,
"learning_rate": 4.671801149490942e-05,
"loss": 0.476,
"step": 790
},
{
"epoch": 1.0845839017735335,
"grad_norm": 0.3085088466565494,
"learning_rate": 4.666501452038555e-05,
"loss": 0.4688,
"step": 795
},
{
"epoch": 1.0914051841746248,
"grad_norm": 0.3314327355813381,
"learning_rate": 4.661162729836742e-05,
"loss": 0.4714,
"step": 800
},
{
"epoch": 1.0982264665757162,
"grad_norm": 0.4003761744144885,
"learning_rate": 4.655785091595203e-05,
"loss": 0.4696,
"step": 805
},
{
"epoch": 1.1050477489768076,
"grad_norm": 0.351393239914098,
"learning_rate": 4.650368646816063e-05,
"loss": 0.4677,
"step": 810
},
{
"epoch": 1.111869031377899,
"grad_norm": 0.35762582610575827,
"learning_rate": 4.644913505791648e-05,
"loss": 0.4659,
"step": 815
},
{
"epoch": 1.1186903137789905,
"grad_norm": 0.315908724272643,
"learning_rate": 4.639419779602234e-05,
"loss": 0.4786,
"step": 820
},
{
"epoch": 1.125511596180082,
"grad_norm": 0.3323051669415947,
"learning_rate": 4.633887580113788e-05,
"loss": 0.4673,
"step": 825
},
{
"epoch": 1.1323328785811733,
"grad_norm": 0.3416812819537007,
"learning_rate": 4.62831701997569e-05,
"loss": 0.4626,
"step": 830
},
{
"epoch": 1.1391541609822646,
"grad_norm": 0.3346367428692832,
"learning_rate": 4.622708212618436e-05,
"loss": 0.4707,
"step": 835
},
{
"epoch": 1.145975443383356,
"grad_norm": 0.38356789979052214,
"learning_rate": 4.617061272251334e-05,
"loss": 0.4705,
"step": 840
},
{
"epoch": 1.1527967257844476,
"grad_norm": 0.3189849965788692,
"learning_rate": 4.6113763138601733e-05,
"loss": 0.4756,
"step": 845
},
{
"epoch": 1.159618008185539,
"grad_norm": 0.4417765040372437,
"learning_rate": 4.605653453204885e-05,
"loss": 0.4686,
"step": 850
},
{
"epoch": 1.1664392905866303,
"grad_norm": 0.37245869543006016,
"learning_rate": 4.5998928068171855e-05,
"loss": 0.4784,
"step": 855
},
{
"epoch": 1.1732605729877217,
"grad_norm": 0.4203605877476329,
"learning_rate": 4.594094491998202e-05,
"loss": 0.4737,
"step": 860
},
{
"epoch": 1.180081855388813,
"grad_norm": 0.36494183086977755,
"learning_rate": 4.588258626816087e-05,
"loss": 0.4689,
"step": 865
},
{
"epoch": 1.1869031377899044,
"grad_norm": 0.40333895829909894,
"learning_rate": 4.582385330103609e-05,
"loss": 0.4771,
"step": 870
},
{
"epoch": 1.1937244201909958,
"grad_norm": 0.3901597877953876,
"learning_rate": 4.576474721455738e-05,
"loss": 0.4751,
"step": 875
},
{
"epoch": 1.2005457025920874,
"grad_norm": 0.34277800089770916,
"learning_rate": 4.570526921227208e-05,
"loss": 0.4666,
"step": 880
},
{
"epoch": 1.2073669849931787,
"grad_norm": 0.39138074126943645,
"learning_rate": 4.564542050530065e-05,
"loss": 0.465,
"step": 885
},
{
"epoch": 1.21418826739427,
"grad_norm": 0.29677610423800593,
"learning_rate": 4.558520231231203e-05,
"loss": 0.4809,
"step": 890
},
{
"epoch": 1.2210095497953615,
"grad_norm": 1.986257352752189,
"learning_rate": 4.552461585949882e-05,
"loss": 0.4729,
"step": 895
},
{
"epoch": 1.2278308321964528,
"grad_norm": 0.31433424596600806,
"learning_rate": 4.5463662380552305e-05,
"loss": 0.4691,
"step": 900
},
{
"epoch": 1.2346521145975444,
"grad_norm": 0.33462141961486536,
"learning_rate": 4.540234311663733e-05,
"loss": 0.479,
"step": 905
},
{
"epoch": 1.2414733969986358,
"grad_norm": 0.3618725063175746,
"learning_rate": 4.5340659316367076e-05,
"loss": 0.4704,
"step": 910
},
{
"epoch": 1.2482946793997272,
"grad_norm": 0.3153025248788844,
"learning_rate": 4.5278612235777506e-05,
"loss": 0.4576,
"step": 915
},
{
"epoch": 1.2551159618008185,
"grad_norm": 0.4061352898494509,
"learning_rate": 4.5216203138301965e-05,
"loss": 0.4522,
"step": 920
},
{
"epoch": 1.26193724420191,
"grad_norm": 0.3607694811238664,
"learning_rate": 4.515343329474533e-05,
"loss": 0.4706,
"step": 925
},
{
"epoch": 1.2687585266030013,
"grad_norm": 0.3267827363161734,
"learning_rate": 4.5090303983258145e-05,
"loss": 0.468,
"step": 930
},
{
"epoch": 1.2755798090040928,
"grad_norm": 0.27843919573690984,
"learning_rate": 4.5026816489310663e-05,
"loss": 0.4661,
"step": 935
},
{
"epoch": 1.2824010914051842,
"grad_norm": 0.34060835289474956,
"learning_rate": 4.4962972105666594e-05,
"loss": 0.4629,
"step": 940
},
{
"epoch": 1.2892223738062756,
"grad_norm": 0.37337450694564905,
"learning_rate": 4.4898772132356814e-05,
"loss": 0.4659,
"step": 945
},
{
"epoch": 1.296043656207367,
"grad_norm": 0.3889912662850152,
"learning_rate": 4.48342178766529e-05,
"loss": 0.4754,
"step": 950
},
{
"epoch": 1.3028649386084583,
"grad_norm": 0.31158287570845944,
"learning_rate": 4.476931065304051e-05,
"loss": 0.4807,
"step": 955
},
{
"epoch": 1.30968622100955,
"grad_norm": 0.2766567733359623,
"learning_rate": 4.4704051783192586e-05,
"loss": 0.464,
"step": 960
},
{
"epoch": 1.3165075034106413,
"grad_norm": 0.3433197368032259,
"learning_rate": 4.463844259594248e-05,
"loss": 0.4752,
"step": 965
},
{
"epoch": 1.3233287858117326,
"grad_norm": 0.318333371191306,
"learning_rate": 4.457248442725689e-05,
"loss": 0.475,
"step": 970
},
{
"epoch": 1.330150068212824,
"grad_norm": 0.31343360813925925,
"learning_rate": 4.450617862020863e-05,
"loss": 0.4672,
"step": 975
},
{
"epoch": 1.3369713506139154,
"grad_norm": 0.39934458201682166,
"learning_rate": 4.4439526524949284e-05,
"loss": 0.4667,
"step": 980
},
{
"epoch": 1.3437926330150067,
"grad_norm": 0.35542044713329446,
"learning_rate": 4.4372529498681766e-05,
"loss": 0.4714,
"step": 985
},
{
"epoch": 1.350613915416098,
"grad_norm": 0.35992889764639413,
"learning_rate": 4.430518890563261e-05,
"loss": 0.471,
"step": 990
},
{
"epoch": 1.3574351978171897,
"grad_norm": 0.3560037441904099,
"learning_rate": 4.423750611702426e-05,
"loss": 0.4623,
"step": 995
},
{
"epoch": 1.364256480218281,
"grad_norm": 0.33206985127339195,
"learning_rate": 4.416948251104707e-05,
"loss": 0.4682,
"step": 1000
},
{
"epoch": 1.3710777626193724,
"grad_norm": 0.31074855188770295,
"learning_rate": 4.4101119472831344e-05,
"loss": 0.4678,
"step": 1005
},
{
"epoch": 1.3778990450204638,
"grad_norm": 0.31053961787527984,
"learning_rate": 4.403241839441901e-05,
"loss": 0.4688,
"step": 1010
},
{
"epoch": 1.3847203274215554,
"grad_norm": 0.3152896675317222,
"learning_rate": 4.39633806747354e-05,
"loss": 0.4754,
"step": 1015
},
{
"epoch": 1.3915416098226467,
"grad_norm": 0.30982743776045496,
"learning_rate": 4.389400771956065e-05,
"loss": 0.4628,
"step": 1020
},
{
"epoch": 1.398362892223738,
"grad_norm": 0.28953578544241515,
"learning_rate": 4.382430094150115e-05,
"loss": 0.4649,
"step": 1025
},
{
"epoch": 1.4051841746248295,
"grad_norm": 0.31445930491885443,
"learning_rate": 4.3754261759960754e-05,
"loss": 0.4667,
"step": 1030
},
{
"epoch": 1.4120054570259208,
"grad_norm": 0.35215307464128237,
"learning_rate": 4.3683891601111885e-05,
"loss": 0.4727,
"step": 1035
},
{
"epoch": 1.4188267394270122,
"grad_norm": 0.32253563108413785,
"learning_rate": 4.3613191897866484e-05,
"loss": 0.4672,
"step": 1040
},
{
"epoch": 1.4256480218281036,
"grad_norm": 0.3679199412491207,
"learning_rate": 4.354216408984683e-05,
"loss": 0.4671,
"step": 1045
},
{
"epoch": 1.4324693042291952,
"grad_norm": 0.33159929778862235,
"learning_rate": 4.3470809623356254e-05,
"loss": 0.4574,
"step": 1050
},
{
"epoch": 1.4392905866302865,
"grad_norm": 0.32071774142054066,
"learning_rate": 4.3399129951349644e-05,
"loss": 0.4679,
"step": 1055
},
{
"epoch": 1.446111869031378,
"grad_norm": 0.33875938116054005,
"learning_rate": 4.3327126533403906e-05,
"loss": 0.4667,
"step": 1060
},
{
"epoch": 1.4529331514324693,
"grad_norm": 0.32046481852129444,
"learning_rate": 4.3254800835688206e-05,
"loss": 0.4664,
"step": 1065
},
{
"epoch": 1.4597544338335606,
"grad_norm": 0.3671443933349803,
"learning_rate": 4.318215433093412e-05,
"loss": 0.4636,
"step": 1070
},
{
"epoch": 1.4665757162346522,
"grad_norm": 0.3877193573168454,
"learning_rate": 4.310918849840568e-05,
"loss": 0.4636,
"step": 1075
},
{
"epoch": 1.4733969986357436,
"grad_norm": 0.37738393252018904,
"learning_rate": 4.3035904823869236e-05,
"loss": 0.4616,
"step": 1080
},
{
"epoch": 1.480218281036835,
"grad_norm": 0.3911245540247714,
"learning_rate": 4.2962304799563145e-05,
"loss": 0.4676,
"step": 1085
},
{
"epoch": 1.4870395634379263,
"grad_norm": 0.3700102178404319,
"learning_rate": 4.2888389924167485e-05,
"loss": 0.4657,
"step": 1090
},
{
"epoch": 1.4938608458390177,
"grad_norm": 0.33699884554276954,
"learning_rate": 4.2814161702773445e-05,
"loss": 0.4765,
"step": 1095
},
{
"epoch": 1.500682128240109,
"grad_norm": 0.3241464736592151,
"learning_rate": 4.273962164685277e-05,
"loss": 0.4707,
"step": 1100
},
{
"epoch": 1.5075034106412004,
"grad_norm": 0.36575583201384765,
"learning_rate": 4.266477127422689e-05,
"loss": 0.461,
"step": 1105
},
{
"epoch": 1.514324693042292,
"grad_norm": 0.3186365429573977,
"learning_rate": 4.258961210903607e-05,
"loss": 0.4669,
"step": 1110
},
{
"epoch": 1.5211459754433834,
"grad_norm": 0.4112679503825286,
"learning_rate": 4.251414568170837e-05,
"loss": 0.4662,
"step": 1115
},
{
"epoch": 1.5279672578444747,
"grad_norm": 0.4022859807157274,
"learning_rate": 4.243837352892847e-05,
"loss": 0.4657,
"step": 1120
},
{
"epoch": 1.5347885402455663,
"grad_norm": 0.3204521643007053,
"learning_rate": 4.236229719360637e-05,
"loss": 0.4742,
"step": 1125
},
{
"epoch": 1.5416098226466577,
"grad_norm": 0.48450770190539705,
"learning_rate": 4.2285918224846004e-05,
"loss": 0.4648,
"step": 1130
},
{
"epoch": 1.548431105047749,
"grad_norm": 0.38815781737547445,
"learning_rate": 4.220923817791368e-05,
"loss": 0.4695,
"step": 1135
},
{
"epoch": 1.5552523874488404,
"grad_norm": 0.35417533551611025,
"learning_rate": 4.213225861420638e-05,
"loss": 0.4788,
"step": 1140
},
{
"epoch": 1.5620736698499318,
"grad_norm": 0.33621340683762574,
"learning_rate": 4.205498110122001e-05,
"loss": 0.468,
"step": 1145
},
{
"epoch": 1.5688949522510232,
"grad_norm": 0.3051392387726156,
"learning_rate": 4.1977407212517485e-05,
"loss": 0.4672,
"step": 1150
},
{
"epoch": 1.5757162346521145,
"grad_norm": 0.30403627574032777,
"learning_rate": 4.1899538527696645e-05,
"loss": 0.4709,
"step": 1155
},
{
"epoch": 1.5825375170532059,
"grad_norm": 0.31580261281637434,
"learning_rate": 4.1821376632358125e-05,
"loss": 0.4875,
"step": 1160
},
{
"epoch": 1.5893587994542973,
"grad_norm": 0.3395866286207179,
"learning_rate": 4.174292311807305e-05,
"loss": 0.4585,
"step": 1165
},
{
"epoch": 1.5961800818553888,
"grad_norm": 0.31931572920688767,
"learning_rate": 4.166417958235064e-05,
"loss": 0.4627,
"step": 1170
},
{
"epoch": 1.6030013642564802,
"grad_norm": 0.2678192239884723,
"learning_rate": 4.158514762860567e-05,
"loss": 0.4636,
"step": 1175
},
{
"epoch": 1.6098226466575716,
"grad_norm": 0.30945877917388176,
"learning_rate": 4.150582886612583e-05,
"loss": 0.4675,
"step": 1180
},
{
"epoch": 1.6166439290586632,
"grad_norm": 0.3018968259724802,
"learning_rate": 4.142622491003895e-05,
"loss": 0.4654,
"step": 1185
},
{
"epoch": 1.6234652114597545,
"grad_norm": 0.34852848169490774,
"learning_rate": 4.134633738128011e-05,
"loss": 0.4693,
"step": 1190
},
{
"epoch": 1.630286493860846,
"grad_norm": 0.32787903172740046,
"learning_rate": 4.1266167906558666e-05,
"loss": 0.4626,
"step": 1195
},
{
"epoch": 1.6371077762619373,
"grad_norm": 0.31265931725066487,
"learning_rate": 4.118571811832503e-05,
"loss": 0.4654,
"step": 1200
},
{
"epoch": 1.6439290586630286,
"grad_norm": 0.2993073436351214,
"learning_rate": 4.110498965473755e-05,
"loss": 0.4554,
"step": 1205
},
{
"epoch": 1.65075034106412,
"grad_norm": 0.36553912187903026,
"learning_rate": 4.10239841596291e-05,
"loss": 0.4675,
"step": 1210
},
{
"epoch": 1.6575716234652114,
"grad_norm": 0.31918331042181586,
"learning_rate": 4.094270328247358e-05,
"loss": 0.4602,
"step": 1215
},
{
"epoch": 1.6643929058663027,
"grad_norm": 0.28081409466017554,
"learning_rate": 4.0861148678352365e-05,
"loss": 0.4527,
"step": 1220
},
{
"epoch": 1.6712141882673943,
"grad_norm": 0.27849601812181274,
"learning_rate": 4.07793220079206e-05,
"loss": 0.4569,
"step": 1225
},
{
"epoch": 1.6780354706684857,
"grad_norm": 0.33295439455092557,
"learning_rate": 4.0697224937373395e-05,
"loss": 0.46,
"step": 1230
},
{
"epoch": 1.684856753069577,
"grad_norm": 0.3505769876040763,
"learning_rate": 4.0614859138411835e-05,
"loss": 0.468,
"step": 1235
},
{
"epoch": 1.6916780354706686,
"grad_norm": 0.39217466352691527,
"learning_rate": 4.053222628820902e-05,
"loss": 0.4578,
"step": 1240
},
{
"epoch": 1.69849931787176,
"grad_norm": 0.30143581574850914,
"learning_rate": 4.044932806937587e-05,
"loss": 0.4575,
"step": 1245
},
{
"epoch": 1.7053206002728514,
"grad_norm": 0.31505555216878045,
"learning_rate": 4.036616616992688e-05,
"loss": 0.4644,
"step": 1250
},
{
"epoch": 1.7121418826739427,
"grad_norm": 0.2883792300853787,
"learning_rate": 4.0282742283245725e-05,
"loss": 0.4586,
"step": 1255
},
{
"epoch": 1.718963165075034,
"grad_norm": 0.2908471999120709,
"learning_rate": 4.0199058108050793e-05,
"loss": 0.4552,
"step": 1260
},
{
"epoch": 1.7257844474761255,
"grad_norm": 0.32328729724020944,
"learning_rate": 4.0115115348360635e-05,
"loss": 0.4606,
"step": 1265
},
{
"epoch": 1.7326057298772168,
"grad_norm": 0.3751122636763178,
"learning_rate": 4.003091571345917e-05,
"loss": 0.4446,
"step": 1270
},
{
"epoch": 1.7394270122783082,
"grad_norm": 0.27616502281371696,
"learning_rate": 3.994646091786097e-05,
"loss": 0.4715,
"step": 1275
},
{
"epoch": 1.7462482946793996,
"grad_norm": 0.30232993983843404,
"learning_rate": 3.9861752681276305e-05,
"loss": 0.464,
"step": 1280
},
{
"epoch": 1.7530695770804912,
"grad_norm": 0.2960787022634833,
"learning_rate": 3.977679272857615e-05,
"loss": 0.4588,
"step": 1285
},
{
"epoch": 1.7598908594815825,
"grad_norm": 0.3108279513492761,
"learning_rate": 3.969158278975703e-05,
"loss": 0.4698,
"step": 1290
},
{
"epoch": 1.766712141882674,
"grad_norm": 0.2987708195925191,
"learning_rate": 3.9606124599905805e-05,
"loss": 0.4629,
"step": 1295
},
{
"epoch": 1.7735334242837655,
"grad_norm": 0.3488308740525556,
"learning_rate": 3.9520419899164383e-05,
"loss": 0.4544,
"step": 1300
},
{
"epoch": 1.7803547066848568,
"grad_norm": 0.32248512138093716,
"learning_rate": 3.9434470432694206e-05,
"loss": 0.4568,
"step": 1305
},
{
"epoch": 1.7871759890859482,
"grad_norm": 0.2979029518282203,
"learning_rate": 3.9348277950640785e-05,
"loss": 0.4543,
"step": 1310
},
{
"epoch": 1.7939972714870396,
"grad_norm": 0.2927500842790821,
"learning_rate": 3.926184420809801e-05,
"loss": 0.4675,
"step": 1315
},
{
"epoch": 1.800818553888131,
"grad_norm": 0.2944797090124265,
"learning_rate": 3.917517096507245e-05,
"loss": 0.4604,
"step": 1320
},
{
"epoch": 1.8076398362892223,
"grad_norm": 0.2956062921328968,
"learning_rate": 3.908825998644753e-05,
"loss": 0.4657,
"step": 1325
},
{
"epoch": 1.8144611186903137,
"grad_norm": 0.31291911806828143,
"learning_rate": 3.90011130419475e-05,
"loss": 0.457,
"step": 1330
},
{
"epoch": 1.821282401091405,
"grad_norm": 0.28055698375775834,
"learning_rate": 3.891373190610151e-05,
"loss": 0.4596,
"step": 1335
},
{
"epoch": 1.8281036834924966,
"grad_norm": 0.2675001092514469,
"learning_rate": 3.882611835820743e-05,
"loss": 0.4502,
"step": 1340
},
{
"epoch": 1.834924965893588,
"grad_norm": 0.2771395030669777,
"learning_rate": 3.87382741822956e-05,
"loss": 0.4694,
"step": 1345
},
{
"epoch": 1.8417462482946794,
"grad_norm": 0.29869108659521465,
"learning_rate": 3.865020116709253e-05,
"loss": 0.461,
"step": 1350
},
{
"epoch": 1.848567530695771,
"grad_norm": 0.3494382288274053,
"learning_rate": 3.856190110598446e-05,
"loss": 0.4622,
"step": 1355
},
{
"epoch": 1.8553888130968623,
"grad_norm": 0.352991866875619,
"learning_rate": 3.8473375796980884e-05,
"loss": 0.471,
"step": 1360
},
{
"epoch": 1.8622100954979537,
"grad_norm": 0.2910222507971248,
"learning_rate": 3.8384627042677856e-05,
"loss": 0.4596,
"step": 1365
},
{
"epoch": 1.869031377899045,
"grad_norm": 0.2800350442292248,
"learning_rate": 3.8295656650221365e-05,
"loss": 0.4653,
"step": 1370
},
{
"epoch": 1.8758526603001364,
"grad_norm": 0.29156706940537624,
"learning_rate": 3.8206466431270506e-05,
"loss": 0.4605,
"step": 1375
},
{
"epoch": 1.8826739427012278,
"grad_norm": 0.2933723081470944,
"learning_rate": 3.811705820196057e-05,
"loss": 0.4554,
"step": 1380
},
{
"epoch": 1.8894952251023192,
"grad_norm": 0.29832500170699905,
"learning_rate": 3.8027433782866113e-05,
"loss": 0.455,
"step": 1385
},
{
"epoch": 1.8963165075034105,
"grad_norm": 0.25799082440570675,
"learning_rate": 3.793759499896382e-05,
"loss": 0.4486,
"step": 1390
},
{
"epoch": 1.9031377899045019,
"grad_norm": 0.3235586101259049,
"learning_rate": 3.78475436795954e-05,
"loss": 0.4667,
"step": 1395
},
{
"epoch": 1.9099590723055935,
"grad_norm": 0.3074786143159734,
"learning_rate": 3.775728165843031e-05,
"loss": 0.4576,
"step": 1400
},
{
"epoch": 1.9167803547066848,
"grad_norm": 0.3603064254550076,
"learning_rate": 3.7666810773428404e-05,
"loss": 0.4569,
"step": 1405
},
{
"epoch": 1.9236016371077762,
"grad_norm": 0.3081793409916903,
"learning_rate": 3.757613286680256e-05,
"loss": 0.4632,
"step": 1410
},
{
"epoch": 1.9304229195088678,
"grad_norm": 0.3089787023285962,
"learning_rate": 3.748524978498111e-05,
"loss": 0.4531,
"step": 1415
},
{
"epoch": 1.9372442019099592,
"grad_norm": 0.28472538824854116,
"learning_rate": 3.739416337857026e-05,
"loss": 0.4683,
"step": 1420
},
{
"epoch": 1.9440654843110505,
"grad_norm": 0.33132010335669787,
"learning_rate": 3.730287550231643e-05,
"loss": 0.461,
"step": 1425
},
{
"epoch": 1.950886766712142,
"grad_norm": 0.30425617531682264,
"learning_rate": 3.721138801506844e-05,
"loss": 0.4596,
"step": 1430
},
{
"epoch": 1.9577080491132333,
"grad_norm": 0.33217187574645907,
"learning_rate": 3.7119702779739725e-05,
"loss": 0.4653,
"step": 1435
},
{
"epoch": 1.9645293315143246,
"grad_norm": 0.31928783726821597,
"learning_rate": 3.702782166327033e-05,
"loss": 0.4534,
"step": 1440
},
{
"epoch": 1.971350613915416,
"grad_norm": 0.34901830956753394,
"learning_rate": 3.693574653658894e-05,
"loss": 0.4541,
"step": 1445
},
{
"epoch": 1.9781718963165074,
"grad_norm": 0.31094667952595445,
"learning_rate": 3.6843479274574786e-05,
"loss": 0.4493,
"step": 1450
},
{
"epoch": 1.984993178717599,
"grad_norm": 0.2979687660697197,
"learning_rate": 3.6751021756019445e-05,
"loss": 0.4641,
"step": 1455
},
{
"epoch": 1.9918144611186903,
"grad_norm": 0.3084947114843808,
"learning_rate": 3.665837586358858e-05,
"loss": 0.4611,
"step": 1460
},
{
"epoch": 1.9986357435197817,
"grad_norm": 0.3029857564961375,
"learning_rate": 3.6565543483783625e-05,
"loss": 0.4509,
"step": 1465
},
{
"epoch": 2.0054570259208733,
"grad_norm": 0.3135726662910769,
"learning_rate": 3.647252650690337e-05,
"loss": 0.4236,
"step": 1470
},
{
"epoch": 2.0122783083219646,
"grad_norm": 0.31311249123809054,
"learning_rate": 3.6379326827005446e-05,
"loss": 0.4171,
"step": 1475
},
{
"epoch": 2.019099590723056,
"grad_norm": 0.3129396844691106,
"learning_rate": 3.628594634186778e-05,
"loss": 0.4254,
"step": 1480
},
{
"epoch": 2.0259208731241474,
"grad_norm": 0.32116328115030246,
"learning_rate": 3.6192386952949956e-05,
"loss": 0.42,
"step": 1485
},
{
"epoch": 2.0327421555252387,
"grad_norm": 0.3069414900056045,
"learning_rate": 3.609865056535446e-05,
"loss": 0.4331,
"step": 1490
},
{
"epoch": 2.03956343792633,
"grad_norm": 0.2632126018742137,
"learning_rate": 3.600473908778795e-05,
"loss": 0.4225,
"step": 1495
},
{
"epoch": 2.0463847203274215,
"grad_norm": 0.32874921996960255,
"learning_rate": 3.5910654432522307e-05,
"loss": 0.41,
"step": 1500
},
{
"epoch": 2.053206002728513,
"grad_norm": 0.3105130163069014,
"learning_rate": 3.5816398515355756e-05,
"loss": 0.4292,
"step": 1505
},
{
"epoch": 2.060027285129604,
"grad_norm": 0.32641874471348564,
"learning_rate": 3.572197325557389e-05,
"loss": 0.4187,
"step": 1510
},
{
"epoch": 2.0668485675306956,
"grad_norm": 0.2941991953873116,
"learning_rate": 3.5627380575910477e-05,
"loss": 0.4239,
"step": 1515
},
{
"epoch": 2.0736698499317874,
"grad_norm": 0.3182323852409404,
"learning_rate": 3.5532622402508375e-05,
"loss": 0.4268,
"step": 1520
},
{
"epoch": 2.0804911323328787,
"grad_norm": 0.31245460362116023,
"learning_rate": 3.5437700664880356e-05,
"loss": 0.4263,
"step": 1525
},
{
"epoch": 2.08731241473397,
"grad_norm": 0.34871682717496116,
"learning_rate": 3.534261729586974e-05,
"loss": 0.4193,
"step": 1530
},
{
"epoch": 2.0941336971350615,
"grad_norm": 0.37090856535041666,
"learning_rate": 3.5247374231611035e-05,
"loss": 0.4246,
"step": 1535
},
{
"epoch": 2.100954979536153,
"grad_norm": 0.30717318325796616,
"learning_rate": 3.515197341149059e-05,
"loss": 0.4317,
"step": 1540
},
{
"epoch": 2.107776261937244,
"grad_norm": 0.31921982872180504,
"learning_rate": 3.5056416778107046e-05,
"loss": 0.4207,
"step": 1545
},
{
"epoch": 2.1145975443383356,
"grad_norm": 0.2991247847589312,
"learning_rate": 3.496070627723176e-05,
"loss": 0.4299,
"step": 1550
},
{
"epoch": 2.121418826739427,
"grad_norm": 0.3816219601230735,
"learning_rate": 3.486484385776925e-05,
"loss": 0.4359,
"step": 1555
},
{
"epoch": 2.1282401091405183,
"grad_norm": 0.30922612207452377,
"learning_rate": 3.476883147171746e-05,
"loss": 0.4148,
"step": 1560
},
{
"epoch": 2.1350613915416097,
"grad_norm": 0.35605310989928785,
"learning_rate": 3.467267107412804e-05,
"loss": 0.4216,
"step": 1565
},
{
"epoch": 2.141882673942701,
"grad_norm": 0.31876379646066433,
"learning_rate": 3.457636462306649e-05,
"loss": 0.4181,
"step": 1570
},
{
"epoch": 2.148703956343793,
"grad_norm": 0.3348502402747506,
"learning_rate": 3.447991407957238e-05,
"loss": 0.4258,
"step": 1575
},
{
"epoch": 2.155525238744884,
"grad_norm": 0.3103190282044908,
"learning_rate": 3.43833214076193e-05,
"loss": 0.4298,
"step": 1580
},
{
"epoch": 2.1623465211459756,
"grad_norm": 0.26280179921818947,
"learning_rate": 3.428658857407498e-05,
"loss": 0.42,
"step": 1585
},
{
"epoch": 2.169167803547067,
"grad_norm": 0.27387018972309857,
"learning_rate": 3.4189717548661155e-05,
"loss": 0.4265,
"step": 1590
},
{
"epoch": 2.1759890859481583,
"grad_norm": 0.27965256772921515,
"learning_rate": 3.40927103039135e-05,
"loss": 0.4306,
"step": 1595
},
{
"epoch": 2.1828103683492497,
"grad_norm": 0.354633813195912,
"learning_rate": 3.3995568815141475e-05,
"loss": 0.4319,
"step": 1600
},
{
"epoch": 2.189631650750341,
"grad_norm": 0.30928248025299715,
"learning_rate": 3.389829506038806e-05,
"loss": 0.409,
"step": 1605
},
{
"epoch": 2.1964529331514324,
"grad_norm": 0.271879368659074,
"learning_rate": 3.38008910203895e-05,
"loss": 0.4242,
"step": 1610
},
{
"epoch": 2.203274215552524,
"grad_norm": 0.28795120711169647,
"learning_rate": 3.3703358678535e-05,
"loss": 0.428,
"step": 1615
},
{
"epoch": 2.210095497953615,
"grad_norm": 0.27920888733605737,
"learning_rate": 3.360570002082627e-05,
"loss": 0.4272,
"step": 1620
},
{
"epoch": 2.2169167803547065,
"grad_norm": 0.3041918063287464,
"learning_rate": 3.3507917035837156e-05,
"loss": 0.4244,
"step": 1625
},
{
"epoch": 2.223738062755798,
"grad_norm": 0.2833032542430106,
"learning_rate": 3.3410011714673116e-05,
"loss": 0.4264,
"step": 1630
},
{
"epoch": 2.2305593451568897,
"grad_norm": 0.30023594828150446,
"learning_rate": 3.331198605093066e-05,
"loss": 0.4346,
"step": 1635
},
{
"epoch": 2.237380627557981,
"grad_norm": 0.27055036739880434,
"learning_rate": 3.321384204065679e-05,
"loss": 0.4231,
"step": 1640
},
{
"epoch": 2.2442019099590724,
"grad_norm": 0.2956240648237309,
"learning_rate": 3.311558168230833e-05,
"loss": 0.4264,
"step": 1645
},
{
"epoch": 2.251023192360164,
"grad_norm": 0.27941390605638816,
"learning_rate": 3.3017206976711234e-05,
"loss": 0.4299,
"step": 1650
},
{
"epoch": 2.257844474761255,
"grad_norm": 0.2944715963544699,
"learning_rate": 3.2918719927019874e-05,
"loss": 0.4253,
"step": 1655
},
{
"epoch": 2.2646657571623465,
"grad_norm": 0.26529633059650365,
"learning_rate": 3.28201225386762e-05,
"loss": 0.4229,
"step": 1660
},
{
"epoch": 2.271487039563438,
"grad_norm": 0.2595003331375696,
"learning_rate": 3.272141681936896e-05,
"loss": 0.4127,
"step": 1665
},
{
"epoch": 2.2783083219645293,
"grad_norm": 0.282591282398735,
"learning_rate": 3.262260477899277e-05,
"loss": 0.4219,
"step": 1670
},
{
"epoch": 2.2851296043656206,
"grad_norm": 0.2750339518399228,
"learning_rate": 3.252368842960722e-05,
"loss": 0.4292,
"step": 1675
},
{
"epoch": 2.291950886766712,
"grad_norm": 0.2618182153631443,
"learning_rate": 3.242466978539588e-05,
"loss": 0.4241,
"step": 1680
},
{
"epoch": 2.2987721691678034,
"grad_norm": 0.2552841576254521,
"learning_rate": 3.23255508626253e-05,
"loss": 0.4222,
"step": 1685
},
{
"epoch": 2.305593451568895,
"grad_norm": 0.28582055607185974,
"learning_rate": 3.222633367960396e-05,
"loss": 0.428,
"step": 1690
},
{
"epoch": 2.3124147339699865,
"grad_norm": 0.28026916468523033,
"learning_rate": 3.212702025664117e-05,
"loss": 0.4207,
"step": 1695
},
{
"epoch": 2.319236016371078,
"grad_norm": 0.2676116279510356,
"learning_rate": 3.2027612616005894e-05,
"loss": 0.415,
"step": 1700
},
{
"epoch": 2.3260572987721693,
"grad_norm": 0.2696917737491691,
"learning_rate": 3.192811278188565e-05,
"loss": 0.4301,
"step": 1705
},
{
"epoch": 2.3328785811732606,
"grad_norm": 0.327138021087197,
"learning_rate": 3.182852278034519e-05,
"loss": 0.4307,
"step": 1710
},
{
"epoch": 2.339699863574352,
"grad_norm": 0.2907867883983245,
"learning_rate": 3.172884463928536e-05,
"loss": 0.4176,
"step": 1715
},
{
"epoch": 2.3465211459754434,
"grad_norm": 0.2615366115260166,
"learning_rate": 3.162908038840168e-05,
"loss": 0.4193,
"step": 1720
},
{
"epoch": 2.3533424283765347,
"grad_norm": 0.25427967252321887,
"learning_rate": 3.152923205914315e-05,
"loss": 0.4192,
"step": 1725
},
{
"epoch": 2.360163710777626,
"grad_norm": 0.2839198481754373,
"learning_rate": 3.142930168467076e-05,
"loss": 0.4193,
"step": 1730
},
{
"epoch": 2.3669849931787175,
"grad_norm": 0.27753752053645026,
"learning_rate": 3.132929129981616e-05,
"loss": 0.4235,
"step": 1735
},
{
"epoch": 2.373806275579809,
"grad_norm": 0.28750808475579537,
"learning_rate": 3.1229202941040236e-05,
"loss": 0.4125,
"step": 1740
},
{
"epoch": 2.3806275579809,
"grad_norm": 0.27914304459801476,
"learning_rate": 3.112903864639159e-05,
"loss": 0.416,
"step": 1745
},
{
"epoch": 2.3874488403819916,
"grad_norm": 0.26521947027144227,
"learning_rate": 3.1028800455465076e-05,
"loss": 0.4221,
"step": 1750
},
{
"epoch": 2.3942701227830834,
"grad_norm": 0.3124062643534349,
"learning_rate": 3.092849040936026e-05,
"loss": 0.4245,
"step": 1755
},
{
"epoch": 2.4010914051841747,
"grad_norm": 0.2552430115684927,
"learning_rate": 3.082811055063987e-05,
"loss": 0.4341,
"step": 1760
},
{
"epoch": 2.407912687585266,
"grad_norm": 0.26492843264370497,
"learning_rate": 3.072766292328816e-05,
"loss": 0.4327,
"step": 1765
},
{
"epoch": 2.4147339699863575,
"grad_norm": 0.2712316410768401,
"learning_rate": 3.062714957266937e-05,
"loss": 0.424,
"step": 1770
},
{
"epoch": 2.421555252387449,
"grad_norm": 0.27122320185449333,
"learning_rate": 3.0526572545485996e-05,
"loss": 0.4261,
"step": 1775
},
{
"epoch": 2.42837653478854,
"grad_norm": 0.31474914331449816,
"learning_rate": 3.0425933889737146e-05,
"loss": 0.4297,
"step": 1780
},
{
"epoch": 2.4351978171896316,
"grad_norm": 0.2919691217077298,
"learning_rate": 3.032523565467686e-05,
"loss": 0.4205,
"step": 1785
},
{
"epoch": 2.442019099590723,
"grad_norm": 0.27221107044977205,
"learning_rate": 3.022447989077235e-05,
"loss": 0.4287,
"step": 1790
},
{
"epoch": 2.4488403819918143,
"grad_norm": 0.25942778646026976,
"learning_rate": 3.012366864966225e-05,
"loss": 0.4222,
"step": 1795
},
{
"epoch": 2.4556616643929057,
"grad_norm": 0.3071419336788923,
"learning_rate": 3.0022803984114874e-05,
"loss": 0.4257,
"step": 1800
},
{
"epoch": 2.4624829467939975,
"grad_norm": 0.27582616943958793,
"learning_rate": 2.9921887947986366e-05,
"loss": 0.418,
"step": 1805
},
{
"epoch": 2.469304229195089,
"grad_norm": 0.2671863620869304,
"learning_rate": 2.9820922596178913e-05,
"loss": 0.4255,
"step": 1810
},
{
"epoch": 2.47612551159618,
"grad_norm": 0.24860100398833726,
"learning_rate": 2.971990998459889e-05,
"loss": 0.4156,
"step": 1815
},
{
"epoch": 2.4829467939972716,
"grad_norm": 0.284594081004446,
"learning_rate": 2.961885217011499e-05,
"loss": 0.4223,
"step": 1820
},
{
"epoch": 2.489768076398363,
"grad_norm": 0.2579836114359165,
"learning_rate": 2.951775121051638e-05,
"loss": 0.4216,
"step": 1825
},
{
"epoch": 2.4965893587994543,
"grad_norm": 0.31508298112234534,
"learning_rate": 2.9416609164470742e-05,
"loss": 0.4175,
"step": 1830
},
{
"epoch": 2.5034106412005457,
"grad_norm": 0.292088251441105,
"learning_rate": 2.9315428091482378e-05,
"loss": 0.4231,
"step": 1835
},
{
"epoch": 2.510231923601637,
"grad_norm": 0.2412081187085273,
"learning_rate": 2.921421005185028e-05,
"loss": 0.4294,
"step": 1840
},
{
"epoch": 2.5170532060027284,
"grad_norm": 0.2636673784819988,
"learning_rate": 2.9112957106626215e-05,
"loss": 0.42,
"step": 1845
},
{
"epoch": 2.52387448840382,
"grad_norm": 0.26621944180879165,
"learning_rate": 2.901167131757264e-05,
"loss": 0.4286,
"step": 1850
},
{
"epoch": 2.530695770804911,
"grad_norm": 0.3122994061435118,
"learning_rate": 2.8910354747120838e-05,
"loss": 0.4294,
"step": 1855
},
{
"epoch": 2.5375170532060025,
"grad_norm": 0.28921100624351126,
"learning_rate": 2.88090094583289e-05,
"loss": 0.4188,
"step": 1860
},
{
"epoch": 2.544338335607094,
"grad_norm": 0.2976929836495886,
"learning_rate": 2.8707637514839636e-05,
"loss": 0.4276,
"step": 1865
},
{
"epoch": 2.5511596180081857,
"grad_norm": 0.28653245810178596,
"learning_rate": 2.860624098083865e-05,
"loss": 0.4205,
"step": 1870
},
{
"epoch": 2.557980900409277,
"grad_norm": 0.25772469564725126,
"learning_rate": 2.850482192101227e-05,
"loss": 0.4169,
"step": 1875
},
{
"epoch": 2.5648021828103684,
"grad_norm": 0.2755373164575972,
"learning_rate": 2.8403382400505503e-05,
"loss": 0.4224,
"step": 1880
},
{
"epoch": 2.57162346521146,
"grad_norm": 0.26169030436203317,
"learning_rate": 2.8301924484879965e-05,
"loss": 0.428,
"step": 1885
},
{
"epoch": 2.578444747612551,
"grad_norm": 0.27151017935464744,
"learning_rate": 2.820045024007188e-05,
"loss": 0.4203,
"step": 1890
},
{
"epoch": 2.5852660300136425,
"grad_norm": 0.28065488272204875,
"learning_rate": 2.8098961732349938e-05,
"loss": 0.4255,
"step": 1895
},
{
"epoch": 2.592087312414734,
"grad_norm": 0.24312684897514486,
"learning_rate": 2.799746102827328e-05,
"loss": 0.4297,
"step": 1900
},
{
"epoch": 2.5989085948158253,
"grad_norm": 0.2793981195277466,
"learning_rate": 2.7895950194649396e-05,
"loss": 0.428,
"step": 1905
},
{
"epoch": 2.6057298772169166,
"grad_norm": 0.299603171553896,
"learning_rate": 2.779443129849202e-05,
"loss": 0.4248,
"step": 1910
},
{
"epoch": 2.6125511596180084,
"grad_norm": 0.25257119356748714,
"learning_rate": 2.769290640697908e-05,
"loss": 0.4136,
"step": 1915
},
{
"epoch": 2.6193724420191,
"grad_norm": 0.26676727856011184,
"learning_rate": 2.759137758741058e-05,
"loss": 0.4177,
"step": 1920
},
{
"epoch": 2.626193724420191,
"grad_norm": 0.2653022309306384,
"learning_rate": 2.74898469071665e-05,
"loss": 0.4214,
"step": 1925
},
{
"epoch": 2.6330150068212825,
"grad_norm": 0.2732195694982371,
"learning_rate": 2.73883164336647e-05,
"loss": 0.4165,
"step": 1930
},
{
"epoch": 2.639836289222374,
"grad_norm": 0.25665300942208025,
"learning_rate": 2.7286788234318873e-05,
"loss": 0.4205,
"step": 1935
},
{
"epoch": 2.6466575716234653,
"grad_norm": 0.2744996916924785,
"learning_rate": 2.7185264376496343e-05,
"loss": 0.4335,
"step": 1940
},
{
"epoch": 2.6534788540245566,
"grad_norm": 0.2685342174813685,
"learning_rate": 2.708374692747609e-05,
"loss": 0.4261,
"step": 1945
},
{
"epoch": 2.660300136425648,
"grad_norm": 0.25868701017122736,
"learning_rate": 2.698223795440655e-05,
"loss": 0.4126,
"step": 1950
},
{
"epoch": 2.6671214188267394,
"grad_norm": 0.24670171402401134,
"learning_rate": 2.6880739524263577e-05,
"loss": 0.427,
"step": 1955
},
{
"epoch": 2.6739427012278307,
"grad_norm": 0.2661683816758132,
"learning_rate": 2.6779253703808354e-05,
"loss": 0.4122,
"step": 1960
},
{
"epoch": 2.680763983628922,
"grad_norm": 0.24338499870433408,
"learning_rate": 2.6677782559545318e-05,
"loss": 0.4276,
"step": 1965
},
{
"epoch": 2.6875852660300135,
"grad_norm": 0.2890820143591707,
"learning_rate": 2.657632815768002e-05,
"loss": 0.4243,
"step": 1970
},
{
"epoch": 2.694406548431105,
"grad_norm": 0.2704451519178081,
"learning_rate": 2.647489256407712e-05,
"loss": 0.4172,
"step": 1975
},
{
"epoch": 2.701227830832196,
"grad_norm": 0.2607870631295583,
"learning_rate": 2.6373477844218292e-05,
"loss": 0.4186,
"step": 1980
},
{
"epoch": 2.708049113233288,
"grad_norm": 0.2679123075950831,
"learning_rate": 2.6272086063160174e-05,
"loss": 0.4246,
"step": 1985
},
{
"epoch": 2.7148703956343794,
"grad_norm": 0.2510936329742177,
"learning_rate": 2.6170719285492284e-05,
"loss": 0.4176,
"step": 1990
},
{
"epoch": 2.7216916780354707,
"grad_norm": 0.2489113174603498,
"learning_rate": 2.606937957529505e-05,
"loss": 0.4251,
"step": 1995
},
{
"epoch": 2.728512960436562,
"grad_norm": 0.2766865540879013,
"learning_rate": 2.5968068996097704e-05,
"loss": 0.4201,
"step": 2000
},
{
"epoch": 2.7353342428376535,
"grad_norm": 0.2619653936916896,
"learning_rate": 2.5866789610836317e-05,
"loss": 0.4319,
"step": 2005
},
{
"epoch": 2.742155525238745,
"grad_norm": 0.25836513726725047,
"learning_rate": 2.576554348181178e-05,
"loss": 0.4225,
"step": 2010
},
{
"epoch": 2.748976807639836,
"grad_norm": 0.2812033276566002,
"learning_rate": 2.5664332670647784e-05,
"loss": 0.4105,
"step": 2015
},
{
"epoch": 2.7557980900409276,
"grad_norm": 0.26844846621087337,
"learning_rate": 2.5563159238248878e-05,
"loss": 0.4309,
"step": 2020
},
{
"epoch": 2.762619372442019,
"grad_norm": 0.27154792946950407,
"learning_rate": 2.5462025244758464e-05,
"loss": 0.4226,
"step": 2025
},
{
"epoch": 2.7694406548431107,
"grad_norm": 0.23587959371760756,
"learning_rate": 2.536093274951689e-05,
"loss": 0.4214,
"step": 2030
},
{
"epoch": 2.776261937244202,
"grad_norm": 0.24958599275970533,
"learning_rate": 2.5259883811019487e-05,
"loss": 0.426,
"step": 2035
},
{
"epoch": 2.7830832196452935,
"grad_norm": 0.256069387441564,
"learning_rate": 2.515888048687467e-05,
"loss": 0.4119,
"step": 2040
},
{
"epoch": 2.789904502046385,
"grad_norm": 0.2743425060857528,
"learning_rate": 2.5057924833762026e-05,
"loss": 0.4235,
"step": 2045
},
{
"epoch": 2.796725784447476,
"grad_norm": 0.24792986290013677,
"learning_rate": 2.495701890739044e-05,
"loss": 0.4286,
"step": 2050
},
{
"epoch": 2.8035470668485676,
"grad_norm": 0.2794659297554359,
"learning_rate": 2.4856164762456242e-05,
"loss": 0.4335,
"step": 2055
},
{
"epoch": 2.810368349249659,
"grad_norm": 0.24524097260162075,
"learning_rate": 2.4755364452601344e-05,
"loss": 0.416,
"step": 2060
},
{
"epoch": 2.8171896316507503,
"grad_norm": 0.26642107588698416,
"learning_rate": 2.4654620030371468e-05,
"loss": 0.4217,
"step": 2065
},
{
"epoch": 2.8240109140518417,
"grad_norm": 0.26001767290569017,
"learning_rate": 2.455393354717431e-05,
"loss": 0.4257,
"step": 2070
},
{
"epoch": 2.830832196452933,
"grad_norm": 0.2468783202942451,
"learning_rate": 2.4453307053237794e-05,
"loss": 0.4134,
"step": 2075
},
{
"epoch": 2.8376534788540244,
"grad_norm": 0.25085358697281507,
"learning_rate": 2.435274259756829e-05,
"loss": 0.4114,
"step": 2080
},
{
"epoch": 2.844474761255116,
"grad_norm": 0.2747138078999621,
"learning_rate": 2.425224222790894e-05,
"loss": 0.427,
"step": 2085
},
{
"epoch": 2.851296043656207,
"grad_norm": 0.28121509135866035,
"learning_rate": 2.4151807990697918e-05,
"loss": 0.4191,
"step": 2090
},
{
"epoch": 2.8581173260572985,
"grad_norm": 0.23999527320050015,
"learning_rate": 2.4051441931026798e-05,
"loss": 0.4224,
"step": 2095
},
{
"epoch": 2.8649386084583903,
"grad_norm": 0.29784575613259334,
"learning_rate": 2.395114609259885e-05,
"loss": 0.4267,
"step": 2100
},
{
"epoch": 2.8717598908594817,
"grad_norm": 0.26954147028119524,
"learning_rate": 2.3850922517687492e-05,
"loss": 0.4303,
"step": 2105
},
{
"epoch": 2.878581173260573,
"grad_norm": 0.25255242413846607,
"learning_rate": 2.3750773247094682e-05,
"loss": 0.426,
"step": 2110
},
{
"epoch": 2.8854024556616644,
"grad_norm": 0.26345645011178265,
"learning_rate": 2.3650700320109343e-05,
"loss": 0.4159,
"step": 2115
},
{
"epoch": 2.892223738062756,
"grad_norm": 0.2450797647252615,
"learning_rate": 2.3550705774465858e-05,
"loss": 0.4144,
"step": 2120
},
{
"epoch": 2.899045020463847,
"grad_norm": 0.2960852062725862,
"learning_rate": 2.3450791646302572e-05,
"loss": 0.428,
"step": 2125
},
{
"epoch": 2.9058663028649385,
"grad_norm": 0.23784327150753126,
"learning_rate": 2.3350959970120318e-05,
"loss": 0.4245,
"step": 2130
},
{
"epoch": 2.91268758526603,
"grad_norm": 0.2634044450477537,
"learning_rate": 2.3251212778741012e-05,
"loss": 0.4194,
"step": 2135
},
{
"epoch": 2.9195088676671213,
"grad_norm": 0.27382767150964016,
"learning_rate": 2.3151552103266234e-05,
"loss": 0.4234,
"step": 2140
},
{
"epoch": 2.926330150068213,
"grad_norm": 0.2882575767927483,
"learning_rate": 2.3051979973035913e-05,
"loss": 0.4161,
"step": 2145
},
{
"epoch": 2.9331514324693044,
"grad_norm": 0.29844342531485935,
"learning_rate": 2.295249841558696e-05,
"loss": 0.4232,
"step": 2150
},
{
"epoch": 2.939972714870396,
"grad_norm": 0.26905163387508313,
"learning_rate": 2.2853109456611987e-05,
"loss": 0.4164,
"step": 2155
},
{
"epoch": 2.946793997271487,
"grad_norm": 0.2646371089556455,
"learning_rate": 2.2753815119918076e-05,
"loss": 0.4153,
"step": 2160
},
{
"epoch": 2.9536152796725785,
"grad_norm": 0.25665046095413097,
"learning_rate": 2.2654617427385583e-05,
"loss": 0.4222,
"step": 2165
},
{
"epoch": 2.96043656207367,
"grad_norm": 0.24014304321760452,
"learning_rate": 2.2555518398926928e-05,
"loss": 0.4153,
"step": 2170
},
{
"epoch": 2.9672578444747613,
"grad_norm": 0.2783657011518547,
"learning_rate": 2.2456520052445484e-05,
"loss": 0.4236,
"step": 2175
},
{
"epoch": 2.9740791268758526,
"grad_norm": 0.2362458119689319,
"learning_rate": 2.2357624403794497e-05,
"loss": 0.4181,
"step": 2180
},
{
"epoch": 2.980900409276944,
"grad_norm": 0.2692604610177288,
"learning_rate": 2.2258833466736016e-05,
"loss": 0.4229,
"step": 2185
},
{
"epoch": 2.9877216916780354,
"grad_norm": 0.2570622690964156,
"learning_rate": 2.2160149252899913e-05,
"loss": 0.4189,
"step": 2190
},
{
"epoch": 2.9945429740791267,
"grad_norm": 0.23880413998979652,
"learning_rate": 2.206157377174292e-05,
"loss": 0.4215,
"step": 2195
},
{
"epoch": 3.001364256480218,
"grad_norm": 0.28400466218366754,
"learning_rate": 2.196310903050767e-05,
"loss": 0.4086,
"step": 2200
},
{
"epoch": 3.00818553888131,
"grad_norm": 0.26762839162616747,
"learning_rate": 2.1864757034181883e-05,
"loss": 0.3902,
"step": 2205
},
{
"epoch": 3.0150068212824013,
"grad_norm": 0.25290575577881014,
"learning_rate": 2.176651978545749e-05,
"loss": 0.39,
"step": 2210
},
{
"epoch": 3.0218281036834926,
"grad_norm": 0.24735456853320856,
"learning_rate": 2.166839928468988e-05,
"loss": 0.384,
"step": 2215
},
{
"epoch": 3.028649386084584,
"grad_norm": 0.2608451564395861,
"learning_rate": 2.1570397529857172e-05,
"loss": 0.3879,
"step": 2220
},
{
"epoch": 3.0354706684856754,
"grad_norm": 0.26650939101645865,
"learning_rate": 2.1472516516519524e-05,
"loss": 0.3868,
"step": 2225
},
{
"epoch": 3.0422919508867667,
"grad_norm": 0.2589156089671275,
"learning_rate": 2.1374758237778485e-05,
"loss": 0.387,
"step": 2230
},
{
"epoch": 3.049113233287858,
"grad_norm": 0.2553292117208548,
"learning_rate": 2.1277124684236416e-05,
"loss": 0.3869,
"step": 2235
},
{
"epoch": 3.0559345156889495,
"grad_norm": 0.26167405276695926,
"learning_rate": 2.117961784395599e-05,
"loss": 0.3938,
"step": 2240
},
{
"epoch": 3.062755798090041,
"grad_norm": 0.2723361097370056,
"learning_rate": 2.108223970241964e-05,
"loss": 0.39,
"step": 2245
},
{
"epoch": 3.069577080491132,
"grad_norm": 0.23691303745295014,
"learning_rate": 2.09849922424892e-05,
"loss": 0.398,
"step": 2250
},
{
"epoch": 3.0763983628922236,
"grad_norm": 0.26384966366753837,
"learning_rate": 2.0887877444365506e-05,
"loss": 0.386,
"step": 2255
},
{
"epoch": 3.083219645293315,
"grad_norm": 0.28652063774948267,
"learning_rate": 2.0790897285548044e-05,
"loss": 0.3979,
"step": 2260
},
{
"epoch": 3.0900409276944067,
"grad_norm": 0.24726341399079166,
"learning_rate": 2.0694053740794728e-05,
"loss": 0.3877,
"step": 2265
},
{
"epoch": 3.096862210095498,
"grad_norm": 0.3147075219833397,
"learning_rate": 2.0597348782081666e-05,
"loss": 0.3926,
"step": 2270
},
{
"epoch": 3.1036834924965895,
"grad_norm": 0.27699488401023803,
"learning_rate": 2.0500784378562997e-05,
"loss": 0.3859,
"step": 2275
},
{
"epoch": 3.110504774897681,
"grad_norm": 0.2424654454311902,
"learning_rate": 2.0404362496530832e-05,
"loss": 0.3791,
"step": 2280
},
{
"epoch": 3.117326057298772,
"grad_norm": 0.24533073896669516,
"learning_rate": 2.030808509937514e-05,
"loss": 0.384,
"step": 2285
},
{
"epoch": 3.1241473396998636,
"grad_norm": 0.2710469516457036,
"learning_rate": 2.0211954147543873e-05,
"loss": 0.3841,
"step": 2290
},
{
"epoch": 3.130968622100955,
"grad_norm": 0.25722065555572754,
"learning_rate": 2.0115971598502946e-05,
"loss": 0.391,
"step": 2295
},
{
"epoch": 3.1377899045020463,
"grad_norm": 0.23265895912306767,
"learning_rate": 2.002013940669647e-05,
"loss": 0.3898,
"step": 2300
},
{
"epoch": 3.1446111869031377,
"grad_norm": 0.25910259700805677,
"learning_rate": 1.992445952350686e-05,
"loss": 0.3801,
"step": 2305
},
{
"epoch": 3.151432469304229,
"grad_norm": 0.252711830390526,
"learning_rate": 1.9828933897215173e-05,
"loss": 0.3869,
"step": 2310
},
{
"epoch": 3.1582537517053204,
"grad_norm": 0.2527698741289496,
"learning_rate": 1.9733564472961424e-05,
"loss": 0.3907,
"step": 2315
},
{
"epoch": 3.1650750341064118,
"grad_norm": 0.2302111532873343,
"learning_rate": 1.9638353192704918e-05,
"loss": 0.393,
"step": 2320
},
{
"epoch": 3.1718963165075036,
"grad_norm": 0.25259862180285164,
"learning_rate": 1.9543301995184803e-05,
"loss": 0.3904,
"step": 2325
},
{
"epoch": 3.178717598908595,
"grad_norm": 0.25095781768714637,
"learning_rate": 1.9448412815880517e-05,
"loss": 0.3953,
"step": 2330
},
{
"epoch": 3.1855388813096863,
"grad_norm": 0.24601622008423973,
"learning_rate": 1.9353687586972408e-05,
"loss": 0.3913,
"step": 2335
},
{
"epoch": 3.1923601637107777,
"grad_norm": 0.25295128919530757,
"learning_rate": 1.9259128237302392e-05,
"loss": 0.3898,
"step": 2340
},
{
"epoch": 3.199181446111869,
"grad_norm": 0.25707867349735203,
"learning_rate": 1.9164736692334663e-05,
"loss": 0.3986,
"step": 2345
},
{
"epoch": 3.2060027285129604,
"grad_norm": 0.2706545466675487,
"learning_rate": 1.9070514874116492e-05,
"loss": 0.3876,
"step": 2350
},
{
"epoch": 3.212824010914052,
"grad_norm": 0.2315776907522079,
"learning_rate": 1.89764647012391e-05,
"loss": 0.3839,
"step": 2355
},
{
"epoch": 3.219645293315143,
"grad_norm": 0.23690592753543196,
"learning_rate": 1.8882588088798565e-05,
"loss": 0.386,
"step": 2360
},
{
"epoch": 3.2264665757162345,
"grad_norm": 0.256789498241867,
"learning_rate": 1.878888694835685e-05,
"loss": 0.3867,
"step": 2365
},
{
"epoch": 3.233287858117326,
"grad_norm": 0.2667167457952798,
"learning_rate": 1.8695363187902864e-05,
"loss": 0.3777,
"step": 2370
},
{
"epoch": 3.2401091405184177,
"grad_norm": 0.2717939410865978,
"learning_rate": 1.860201871181364e-05,
"loss": 0.386,
"step": 2375
},
{
"epoch": 3.246930422919509,
"grad_norm": 0.24664438475145947,
"learning_rate": 1.8508855420815508e-05,
"loss": 0.3877,
"step": 2380
},
{
"epoch": 3.2537517053206004,
"grad_norm": 0.2402057833979888,
"learning_rate": 1.8415875211945434e-05,
"loss": 0.3917,
"step": 2385
},
{
"epoch": 3.260572987721692,
"grad_norm": 0.2370683883511882,
"learning_rate": 1.832307997851236e-05,
"loss": 0.3939,
"step": 2390
},
{
"epoch": 3.267394270122783,
"grad_norm": 0.23764072931125804,
"learning_rate": 1.8230471610058673e-05,
"loss": 0.3878,
"step": 2395
},
{
"epoch": 3.2742155525238745,
"grad_norm": 0.25455084354221097,
"learning_rate": 1.813805199232173e-05,
"loss": 0.3935,
"step": 2400
},
{
"epoch": 3.281036834924966,
"grad_norm": 0.2592711310403459,
"learning_rate": 1.8045823007195456e-05,
"loss": 0.383,
"step": 2405
},
{
"epoch": 3.2878581173260573,
"grad_norm": 0.22929056385950908,
"learning_rate": 1.7953786532691996e-05,
"loss": 0.3975,
"step": 2410
},
{
"epoch": 3.2946793997271486,
"grad_norm": 0.2463955192715735,
"learning_rate": 1.7861944442903523e-05,
"loss": 0.3881,
"step": 2415
},
{
"epoch": 3.30150068212824,
"grad_norm": 0.25423450189038377,
"learning_rate": 1.777029860796406e-05,
"loss": 0.3935,
"step": 2420
},
{
"epoch": 3.3083219645293314,
"grad_norm": 0.2510864654388527,
"learning_rate": 1.767885089401135e-05,
"loss": 0.3866,
"step": 2425
},
{
"epoch": 3.3151432469304227,
"grad_norm": 0.2386139088635014,
"learning_rate": 1.7587603163148936e-05,
"loss": 0.3812,
"step": 2430
},
{
"epoch": 3.321964529331514,
"grad_norm": 0.24126241032577334,
"learning_rate": 1.749655727340819e-05,
"loss": 0.3797,
"step": 2435
},
{
"epoch": 3.328785811732606,
"grad_norm": 0.23457981332958217,
"learning_rate": 1.740571507871052e-05,
"loss": 0.4019,
"step": 2440
},
{
"epoch": 3.3356070941336973,
"grad_norm": 0.2455529177316154,
"learning_rate": 1.731507842882955e-05,
"loss": 0.3834,
"step": 2445
},
{
"epoch": 3.3424283765347886,
"grad_norm": 0.25498588860883886,
"learning_rate": 1.7224649169353547e-05,
"loss": 0.3872,
"step": 2450
},
{
"epoch": 3.34924965893588,
"grad_norm": 0.24993640876977888,
"learning_rate": 1.7134429141647747e-05,
"loss": 0.3896,
"step": 2455
},
{
"epoch": 3.3560709413369714,
"grad_norm": 0.2724169506194102,
"learning_rate": 1.704442018281694e-05,
"loss": 0.3939,
"step": 2460
},
{
"epoch": 3.3628922237380627,
"grad_norm": 0.24835590557722398,
"learning_rate": 1.695462412566802e-05,
"loss": 0.3918,
"step": 2465
},
{
"epoch": 3.369713506139154,
"grad_norm": 0.23729760409009898,
"learning_rate": 1.686504279867267e-05,
"loss": 0.3872,
"step": 2470
},
{
"epoch": 3.3765347885402455,
"grad_norm": 0.23242634563284187,
"learning_rate": 1.6775678025930107e-05,
"loss": 0.3894,
"step": 2475
},
{
"epoch": 3.383356070941337,
"grad_norm": 0.23937238350849568,
"learning_rate": 1.6686531627130013e-05,
"loss": 0.39,
"step": 2480
},
{
"epoch": 3.390177353342428,
"grad_norm": 0.2402399773084803,
"learning_rate": 1.6597605417515376e-05,
"loss": 0.3908,
"step": 2485
},
{
"epoch": 3.39699863574352,
"grad_norm": 0.2411687520633103,
"learning_rate": 1.6508901207845622e-05,
"loss": 0.3933,
"step": 2490
},
{
"epoch": 3.4038199181446114,
"grad_norm": 0.2399360785924478,
"learning_rate": 1.6420420804359703e-05,
"loss": 0.3815,
"step": 2495
},
{
"epoch": 3.4106412005457027,
"grad_norm": 0.23575057738592686,
"learning_rate": 1.6332166008739303e-05,
"loss": 0.3809,
"step": 2500
},
{
"epoch": 3.417462482946794,
"grad_norm": 0.2410072988517726,
"learning_rate": 1.6244138618072162e-05,
"loss": 0.3921,
"step": 2505
},
{
"epoch": 3.4242837653478855,
"grad_norm": 0.2627471930720635,
"learning_rate": 1.6156340424815516e-05,
"loss": 0.3887,
"step": 2510
},
{
"epoch": 3.431105047748977,
"grad_norm": 0.27721676597358763,
"learning_rate": 1.6068773216759543e-05,
"loss": 0.3861,
"step": 2515
},
{
"epoch": 3.437926330150068,
"grad_norm": 0.22524657033432005,
"learning_rate": 1.5981438776990993e-05,
"loss": 0.3915,
"step": 2520
},
{
"epoch": 3.4447476125511596,
"grad_norm": 0.25782439333115764,
"learning_rate": 1.589433888385689e-05,
"loss": 0.3812,
"step": 2525
},
{
"epoch": 3.451568894952251,
"grad_norm": 0.2534837223232656,
"learning_rate": 1.5807475310928277e-05,
"loss": 0.3819,
"step": 2530
},
{
"epoch": 3.4583901773533423,
"grad_norm": 0.2411924331188938,
"learning_rate": 1.572084982696415e-05,
"loss": 0.3875,
"step": 2535
},
{
"epoch": 3.4652114597544337,
"grad_norm": 0.23815388889239475,
"learning_rate": 1.5634464195875416e-05,
"loss": 0.3762,
"step": 2540
},
{
"epoch": 3.472032742155525,
"grad_norm": 0.2512767672555059,
"learning_rate": 1.5548320176688965e-05,
"loss": 0.3846,
"step": 2545
},
{
"epoch": 3.4788540245566164,
"grad_norm": 0.22813572411154703,
"learning_rate": 1.5462419523511872e-05,
"loss": 0.3891,
"step": 2550
},
{
"epoch": 3.485675306957708,
"grad_norm": 0.22750656208279468,
"learning_rate": 1.5376763985495692e-05,
"loss": 0.3791,
"step": 2555
},
{
"epoch": 3.4924965893587996,
"grad_norm": 0.22941848560650482,
"learning_rate": 1.529135530680079e-05,
"loss": 0.3855,
"step": 2560
},
{
"epoch": 3.499317871759891,
"grad_norm": 0.24186200739975727,
"learning_rate": 1.5206195226560888e-05,
"loss": 0.382,
"step": 2565
},
{
"epoch": 3.5061391541609823,
"grad_norm": 0.247880498233121,
"learning_rate": 1.5121285478847625e-05,
"loss": 0.3912,
"step": 2570
},
{
"epoch": 3.5129604365620737,
"grad_norm": 0.22121916115206966,
"learning_rate": 1.5036627792635219e-05,
"loss": 0.3851,
"step": 2575
},
{
"epoch": 3.519781718963165,
"grad_norm": 0.2541273486285163,
"learning_rate": 1.49522238917653e-05,
"loss": 0.3919,
"step": 2580
},
{
"epoch": 3.5266030013642564,
"grad_norm": 0.2219378122067416,
"learning_rate": 1.4868075494911813e-05,
"loss": 0.389,
"step": 2585
},
{
"epoch": 3.533424283765348,
"grad_norm": 0.23505397012761625,
"learning_rate": 1.4784184315545968e-05,
"loss": 0.3925,
"step": 2590
},
{
"epoch": 3.540245566166439,
"grad_norm": 0.243871234675225,
"learning_rate": 1.4700552061901423e-05,
"loss": 0.3941,
"step": 2595
},
{
"epoch": 3.547066848567531,
"grad_norm": 0.2399559284041621,
"learning_rate": 1.4617180436939442e-05,
"loss": 0.3864,
"step": 2600
},
{
"epoch": 3.5538881309686223,
"grad_norm": 0.21831642013948907,
"learning_rate": 1.453407113831424e-05,
"loss": 0.3839,
"step": 2605
},
{
"epoch": 3.5607094133697137,
"grad_norm": 0.23659655907063717,
"learning_rate": 1.4451225858338425e-05,
"loss": 0.3858,
"step": 2610
},
{
"epoch": 3.567530695770805,
"grad_norm": 0.23969385857421838,
"learning_rate": 1.4368646283948506e-05,
"loss": 0.3853,
"step": 2615
},
{
"epoch": 3.5743519781718964,
"grad_norm": 0.2449645484046614,
"learning_rate": 1.4286334096670575e-05,
"loss": 0.3805,
"step": 2620
},
{
"epoch": 3.581173260572988,
"grad_norm": 0.24260601540494856,
"learning_rate": 1.4204290972586062e-05,
"loss": 0.3945,
"step": 2625
},
{
"epoch": 3.587994542974079,
"grad_norm": 0.23850743724344997,
"learning_rate": 1.41225185822976e-05,
"loss": 0.3902,
"step": 2630
},
{
"epoch": 3.5948158253751705,
"grad_norm": 0.2200681993067038,
"learning_rate": 1.404101859089499e-05,
"loss": 0.396,
"step": 2635
},
{
"epoch": 3.601637107776262,
"grad_norm": 0.22389866141232462,
"learning_rate": 1.3959792657921322e-05,
"loss": 0.398,
"step": 2640
},
{
"epoch": 3.6084583901773533,
"grad_norm": 0.22920576973220413,
"learning_rate": 1.3878842437339184e-05,
"loss": 0.3951,
"step": 2645
},
{
"epoch": 3.6152796725784446,
"grad_norm": 0.2342034408316575,
"learning_rate": 1.3798169577496956e-05,
"loss": 0.3871,
"step": 2650
},
{
"epoch": 3.622100954979536,
"grad_norm": 0.23776945264796728,
"learning_rate": 1.3717775721095261e-05,
"loss": 0.3893,
"step": 2655
},
{
"epoch": 3.6289222373806274,
"grad_norm": 0.2287623836213206,
"learning_rate": 1.363766250515353e-05,
"loss": 0.3926,
"step": 2660
},
{
"epoch": 3.6357435197817187,
"grad_norm": 0.2290706247481116,
"learning_rate": 1.3557831560976642e-05,
"loss": 0.3902,
"step": 2665
},
{
"epoch": 3.64256480218281,
"grad_norm": 0.23897064324266662,
"learning_rate": 1.3478284514121717e-05,
"loss": 0.3865,
"step": 2670
},
{
"epoch": 3.649386084583902,
"grad_norm": 0.22158513812624758,
"learning_rate": 1.3399022984365042e-05,
"loss": 0.3779,
"step": 2675
},
{
"epoch": 3.6562073669849933,
"grad_norm": 0.260366723196768,
"learning_rate": 1.3320048585669028e-05,
"loss": 0.3912,
"step": 2680
},
{
"epoch": 3.6630286493860846,
"grad_norm": 0.2327592062400347,
"learning_rate": 1.3241362926149414e-05,
"loss": 0.3788,
"step": 2685
},
{
"epoch": 3.669849931787176,
"grad_norm": 0.274275386301667,
"learning_rate": 1.3162967608042468e-05,
"loss": 0.3834,
"step": 2690
},
{
"epoch": 3.6766712141882674,
"grad_norm": 0.2544946567880361,
"learning_rate": 1.3084864227672377e-05,
"loss": 0.3929,
"step": 2695
},
{
"epoch": 3.6834924965893587,
"grad_norm": 0.23821594624055176,
"learning_rate": 1.300705437541877e-05,
"loss": 0.3773,
"step": 2700
},
{
"epoch": 3.69031377899045,
"grad_norm": 0.22340565386504316,
"learning_rate": 1.2929539635684309e-05,
"loss": 0.3951,
"step": 2705
},
{
"epoch": 3.6971350613915415,
"grad_norm": 0.22585370418113979,
"learning_rate": 1.2852321586862407e-05,
"loss": 0.3864,
"step": 2710
},
{
"epoch": 3.7039563437926333,
"grad_norm": 0.223306643538749,
"learning_rate": 1.277540180130513e-05,
"loss": 0.3896,
"step": 2715
},
{
"epoch": 3.7107776261937246,
"grad_norm": 0.24095852480087845,
"learning_rate": 1.2698781845291164e-05,
"loss": 0.3986,
"step": 2720
},
{
"epoch": 3.717598908594816,
"grad_norm": 0.23864727514551046,
"learning_rate": 1.262246327899389e-05,
"loss": 0.3845,
"step": 2725
},
{
"epoch": 3.7244201909959074,
"grad_norm": 0.22563526124956518,
"learning_rate": 1.2546447656449668e-05,
"loss": 0.38,
"step": 2730
},
{
"epoch": 3.7312414733969987,
"grad_norm": 0.2129828812682242,
"learning_rate": 1.2470736525526169e-05,
"loss": 0.3925,
"step": 2735
},
{
"epoch": 3.73806275579809,
"grad_norm": 0.23462714598689244,
"learning_rate": 1.2395331427890827e-05,
"loss": 0.3917,
"step": 2740
},
{
"epoch": 3.7448840381991815,
"grad_norm": 0.23481258502381297,
"learning_rate": 1.2320233898979512e-05,
"loss": 0.381,
"step": 2745
},
{
"epoch": 3.751705320600273,
"grad_norm": 0.23006399815335687,
"learning_rate": 1.2245445467965208e-05,
"loss": 0.388,
"step": 2750
},
{
"epoch": 3.758526603001364,
"grad_norm": 0.2292582900286725,
"learning_rate": 1.2170967657726885e-05,
"loss": 0.3863,
"step": 2755
},
{
"epoch": 3.7653478854024556,
"grad_norm": 0.20911004549219808,
"learning_rate": 1.2096801984818528e-05,
"loss": 0.3927,
"step": 2760
},
{
"epoch": 3.772169167803547,
"grad_norm": 0.23285590431049188,
"learning_rate": 1.2022949959438203e-05,
"loss": 0.3934,
"step": 2765
},
{
"epoch": 3.7789904502046383,
"grad_norm": 0.22371180041699631,
"learning_rate": 1.1949413085397328e-05,
"loss": 0.3854,
"step": 2770
},
{
"epoch": 3.7858117326057297,
"grad_norm": 0.2296288443937746,
"learning_rate": 1.1876192860090073e-05,
"loss": 0.3971,
"step": 2775
},
{
"epoch": 3.792633015006821,
"grad_norm": 0.22350404721125006,
"learning_rate": 1.1803290774462848e-05,
"loss": 0.3896,
"step": 2780
},
{
"epoch": 3.799454297407913,
"grad_norm": 0.21611095868943725,
"learning_rate": 1.1730708312983925e-05,
"loss": 0.3845,
"step": 2785
},
{
"epoch": 3.806275579809004,
"grad_norm": 0.22943694058753963,
"learning_rate": 1.1658446953613246e-05,
"loss": 0.3844,
"step": 2790
},
{
"epoch": 3.8130968622100956,
"grad_norm": 0.23106087721823299,
"learning_rate": 1.1586508167772334e-05,
"loss": 0.389,
"step": 2795
},
{
"epoch": 3.819918144611187,
"grad_norm": 0.22389625388226372,
"learning_rate": 1.1514893420314252e-05,
"loss": 0.3871,
"step": 2800
},
{
"epoch": 3.8267394270122783,
"grad_norm": 0.2336575786987886,
"learning_rate": 1.1443604169493887e-05,
"loss": 0.3855,
"step": 2805
},
{
"epoch": 3.8335607094133697,
"grad_norm": 0.24695715433752222,
"learning_rate": 1.1372641866938197e-05,
"loss": 0.3834,
"step": 2810
},
{
"epoch": 3.840381991814461,
"grad_norm": 0.23095097247032337,
"learning_rate": 1.1302007957616626e-05,
"loss": 0.3868,
"step": 2815
},
{
"epoch": 3.8472032742155524,
"grad_norm": 0.2309262382228596,
"learning_rate": 1.123170387981174e-05,
"loss": 0.3842,
"step": 2820
},
{
"epoch": 3.854024556616644,
"grad_norm": 0.22324542558506838,
"learning_rate": 1.116173106508991e-05,
"loss": 0.3874,
"step": 2825
},
{
"epoch": 3.8608458390177356,
"grad_norm": 0.2336403968417466,
"learning_rate": 1.1092090938272154e-05,
"loss": 0.3856,
"step": 2830
},
{
"epoch": 3.867667121418827,
"grad_norm": 0.22474941288116115,
"learning_rate": 1.1022784917405146e-05,
"loss": 0.3931,
"step": 2835
},
{
"epoch": 3.8744884038199183,
"grad_norm": 0.23021154838376737,
"learning_rate": 1.0953814413732325e-05,
"loss": 0.3913,
"step": 2840
},
{
"epoch": 3.8813096862210097,
"grad_norm": 0.23687546810217178,
"learning_rate": 1.0885180831665148e-05,
"loss": 0.3921,
"step": 2845
},
{
"epoch": 3.888130968622101,
"grad_norm": 0.22277814774085422,
"learning_rate": 1.0816885568754533e-05,
"loss": 0.3883,
"step": 2850
},
{
"epoch": 3.8949522510231924,
"grad_norm": 0.2287671628320653,
"learning_rate": 1.074893001566237e-05,
"loss": 0.3859,
"step": 2855
},
{
"epoch": 3.901773533424284,
"grad_norm": 0.22176934372528126,
"learning_rate": 1.0681315556133193e-05,
"loss": 0.3848,
"step": 2860
},
{
"epoch": 3.908594815825375,
"grad_norm": 0.24129010351058253,
"learning_rate": 1.0614043566966036e-05,
"loss": 0.3827,
"step": 2865
},
{
"epoch": 3.9154160982264665,
"grad_norm": 0.2242584022267207,
"learning_rate": 1.0547115417986394e-05,
"loss": 0.3933,
"step": 2870
},
{
"epoch": 3.922237380627558,
"grad_norm": 0.23752891552617475,
"learning_rate": 1.0480532472018278e-05,
"loss": 0.3909,
"step": 2875
},
{
"epoch": 3.9290586630286493,
"grad_norm": 0.21420522853705198,
"learning_rate": 1.041429608485654e-05,
"loss": 0.3884,
"step": 2880
},
{
"epoch": 3.9358799454297406,
"grad_norm": 0.22604709228379252,
"learning_rate": 1.0348407605239225e-05,
"loss": 0.3826,
"step": 2885
},
{
"epoch": 3.942701227830832,
"grad_norm": 0.23521675479439724,
"learning_rate": 1.02828683748201e-05,
"loss": 0.3981,
"step": 2890
},
{
"epoch": 3.9495225102319234,
"grad_norm": 0.23258006147829116,
"learning_rate": 1.0217679728141358e-05,
"loss": 0.3889,
"step": 2895
},
{
"epoch": 3.956343792633015,
"grad_norm": 0.23352568917122027,
"learning_rate": 1.0152842992606434e-05,
"loss": 0.3791,
"step": 2900
},
{
"epoch": 3.9631650750341065,
"grad_norm": 0.23650015254821877,
"learning_rate": 1.0088359488452965e-05,
"loss": 0.385,
"step": 2905
},
{
"epoch": 3.969986357435198,
"grad_norm": 0.21467911019524638,
"learning_rate": 1.0024230528725923e-05,
"loss": 0.3841,
"step": 2910
},
{
"epoch": 3.9768076398362893,
"grad_norm": 0.2118786144254659,
"learning_rate": 9.960457419250868e-06,
"loss": 0.3815,
"step": 2915
},
{
"epoch": 3.9836289222373806,
"grad_norm": 0.24661055007603747,
"learning_rate": 9.897041458607355e-06,
"loss": 0.384,
"step": 2920
},
{
"epoch": 3.990450204638472,
"grad_norm": 0.23600245340062118,
"learning_rate": 9.833983938102517e-06,
"loss": 0.3898,
"step": 2925
},
{
"epoch": 3.9972714870395634,
"grad_norm": 0.2462434555318692,
"learning_rate": 9.77128614174474e-06,
"loss": 0.3878,
"step": 2930
},
{
"epoch": 4.004092769440655,
"grad_norm": 0.22737326101365074,
"learning_rate": 9.708949346217524e-06,
"loss": 0.3721,
"step": 2935
},
{
"epoch": 4.0109140518417465,
"grad_norm": 0.2716741438208754,
"learning_rate": 9.6469748208535e-06,
"loss": 0.3653,
"step": 2940
},
{
"epoch": 4.017735334242838,
"grad_norm": 0.23957849713614088,
"learning_rate": 9.58536382760858e-06,
"loss": 0.3584,
"step": 2945
},
{
"epoch": 4.024556616643929,
"grad_norm": 0.24534646396528859,
"learning_rate": 9.52411762103623e-06,
"loss": 0.3641,
"step": 2950
},
{
"epoch": 4.031377899045021,
"grad_norm": 0.23453648067754057,
"learning_rate": 9.463237448261978e-06,
"loss": 0.3563,
"step": 2955
},
{
"epoch": 4.038199181446112,
"grad_norm": 0.2458309245916263,
"learning_rate": 9.402724548957984e-06,
"loss": 0.3525,
"step": 2960
},
{
"epoch": 4.045020463847203,
"grad_norm": 0.21575333823040618,
"learning_rate": 9.34258015531779e-06,
"loss": 0.3601,
"step": 2965
},
{
"epoch": 4.051841746248295,
"grad_norm": 0.22050939826229063,
"learning_rate": 9.282805492031263e-06,
"loss": 0.3559,
"step": 2970
},
{
"epoch": 4.058663028649386,
"grad_norm": 0.2273826757325257,
"learning_rate": 9.22340177625963e-06,
"loss": 0.36,
"step": 2975
},
{
"epoch": 4.0654843110504775,
"grad_norm": 0.22114160384151124,
"learning_rate": 9.164370217610695e-06,
"loss": 0.3605,
"step": 2980
},
{
"epoch": 4.072305593451569,
"grad_norm": 0.22068975081869985,
"learning_rate": 9.105712018114216e-06,
"loss": 0.3677,
"step": 2985
},
{
"epoch": 4.07912687585266,
"grad_norm": 0.21739973803629506,
"learning_rate": 9.047428372197445e-06,
"loss": 0.3604,
"step": 2990
},
{
"epoch": 4.085948158253752,
"grad_norm": 0.21792476209759648,
"learning_rate": 8.989520466660758e-06,
"loss": 0.3574,
"step": 2995
},
{
"epoch": 4.092769440654843,
"grad_norm": 0.22329035125949306,
"learning_rate": 8.931989480653549e-06,
"loss": 0.3528,
"step": 3000
},
{
"epoch": 4.099590723055934,
"grad_norm": 0.22327717337480657,
"learning_rate": 8.874836585650183e-06,
"loss": 0.3588,
"step": 3005
},
{
"epoch": 4.106412005457026,
"grad_norm": 0.2539624450068349,
"learning_rate": 8.81806294542613e-06,
"loss": 0.3658,
"step": 3010
},
{
"epoch": 4.113233287858117,
"grad_norm": 0.2279275522447994,
"learning_rate": 8.761669716034316e-06,
"loss": 0.3657,
"step": 3015
},
{
"epoch": 4.120054570259208,
"grad_norm": 0.21624233022403727,
"learning_rate": 8.705658045781535e-06,
"loss": 0.3652,
"step": 3020
},
{
"epoch": 4.1268758526603,
"grad_norm": 0.22338819767620166,
"learning_rate": 8.65002907520508e-06,
"loss": 0.3554,
"step": 3025
},
{
"epoch": 4.133697135061391,
"grad_norm": 0.21933927707300271,
"learning_rate": 8.594783937049542e-06,
"loss": 0.3646,
"step": 3030
},
{
"epoch": 4.1405184174624825,
"grad_norm": 0.21745048274348827,
"learning_rate": 8.539923756243726e-06,
"loss": 0.3612,
"step": 3035
},
{
"epoch": 4.147339699863575,
"grad_norm": 0.22215248408199215,
"learning_rate": 8.485449649877719e-06,
"loss": 0.3617,
"step": 3040
},
{
"epoch": 4.154160982264666,
"grad_norm": 0.21830220010908438,
"learning_rate": 8.431362727180202e-06,
"loss": 0.3653,
"step": 3045
},
{
"epoch": 4.1609822646657575,
"grad_norm": 0.2172278500786131,
"learning_rate": 8.377664089495818e-06,
"loss": 0.3586,
"step": 3050
},
{
"epoch": 4.167803547066849,
"grad_norm": 0.22322562699694812,
"learning_rate": 8.32435483026275e-06,
"loss": 0.366,
"step": 3055
},
{
"epoch": 4.17462482946794,
"grad_norm": 0.22127876018606277,
"learning_rate": 8.271436034990476e-06,
"loss": 0.3552,
"step": 3060
},
{
"epoch": 4.181446111869032,
"grad_norm": 0.2285771675307399,
"learning_rate": 8.21890878123765e-06,
"loss": 0.3601,
"step": 3065
},
{
"epoch": 4.188267394270123,
"grad_norm": 0.2245833117692198,
"learning_rate": 8.16677413859016e-06,
"loss": 0.3547,
"step": 3070
},
{
"epoch": 4.195088676671214,
"grad_norm": 0.2274333858726854,
"learning_rate": 8.115033168639362e-06,
"loss": 0.3668,
"step": 3075
},
{
"epoch": 4.201909959072306,
"grad_norm": 0.2277027755491278,
"learning_rate": 8.063686924960451e-06,
"loss": 0.3656,
"step": 3080
},
{
"epoch": 4.208731241473397,
"grad_norm": 0.2217198628514234,
"learning_rate": 8.012736453091002e-06,
"loss": 0.3638,
"step": 3085
},
{
"epoch": 4.215552523874488,
"grad_norm": 0.224080006759543,
"learning_rate": 7.962182790509706e-06,
"loss": 0.3638,
"step": 3090
},
{
"epoch": 4.22237380627558,
"grad_norm": 0.21533038392555934,
"learning_rate": 7.912026966615206e-06,
"loss": 0.367,
"step": 3095
},
{
"epoch": 4.229195088676671,
"grad_norm": 0.21835647709359632,
"learning_rate": 7.862270002705168e-06,
"loss": 0.3592,
"step": 3100
},
{
"epoch": 4.2360163710777625,
"grad_norm": 0.22759755664959258,
"learning_rate": 7.81291291195548e-06,
"loss": 0.3582,
"step": 3105
},
{
"epoch": 4.242837653478854,
"grad_norm": 0.21341514951715657,
"learning_rate": 7.763956699399613e-06,
"loss": 0.369,
"step": 3110
},
{
"epoch": 4.249658935879945,
"grad_norm": 0.21830086158415568,
"learning_rate": 7.71540236190814e-06,
"loss": 0.3703,
"step": 3115
},
{
"epoch": 4.256480218281037,
"grad_norm": 0.22265158567097876,
"learning_rate": 7.667250888168484e-06,
"loss": 0.3569,
"step": 3120
},
{
"epoch": 4.263301500682128,
"grad_norm": 0.22227322272741737,
"learning_rate": 7.619503258664734e-06,
"loss": 0.3579,
"step": 3125
},
{
"epoch": 4.270122783083219,
"grad_norm": 0.2210961648085674,
"learning_rate": 7.5721604456577165e-06,
"loss": 0.3549,
"step": 3130
},
{
"epoch": 4.276944065484311,
"grad_norm": 0.23040654515127004,
"learning_rate": 7.525223413165174e-06,
"loss": 0.3585,
"step": 3135
},
{
"epoch": 4.283765347885402,
"grad_norm": 0.22824816791771613,
"learning_rate": 7.478693116942159e-06,
"loss": 0.361,
"step": 3140
},
{
"epoch": 4.2905866302864935,
"grad_norm": 0.2169829907264743,
"learning_rate": 7.432570504461546e-06,
"loss": 0.3669,
"step": 3145
},
{
"epoch": 4.297407912687586,
"grad_norm": 0.21829402625134559,
"learning_rate": 7.386856514894759e-06,
"loss": 0.3635,
"step": 3150
},
{
"epoch": 4.304229195088677,
"grad_norm": 0.21652462215311938,
"learning_rate": 7.341552079092644e-06,
"loss": 0.3625,
"step": 3155
},
{
"epoch": 4.311050477489768,
"grad_norm": 0.21101625882615488,
"learning_rate": 7.296658119566495e-06,
"loss": 0.3588,
"step": 3160
},
{
"epoch": 4.31787175989086,
"grad_norm": 0.21735045950549228,
"learning_rate": 7.252175550469309e-06,
"loss": 0.3686,
"step": 3165
},
{
"epoch": 4.324693042291951,
"grad_norm": 0.21565787043013268,
"learning_rate": 7.20810527757713e-06,
"loss": 0.3671,
"step": 3170
},
{
"epoch": 4.3315143246930425,
"grad_norm": 0.22421139524596934,
"learning_rate": 7.164448198270618e-06,
"loss": 0.3526,
"step": 3175
},
{
"epoch": 4.338335607094134,
"grad_norm": 0.22134931294096435,
"learning_rate": 7.121205201516804e-06,
"loss": 0.3567,
"step": 3180
},
{
"epoch": 4.345156889495225,
"grad_norm": 0.22666795543953605,
"learning_rate": 7.0783771678509485e-06,
"loss": 0.3726,
"step": 3185
},
{
"epoch": 4.351978171896317,
"grad_norm": 0.21253153167501385,
"learning_rate": 7.035964969358627e-06,
"loss": 0.3613,
"step": 3190
},
{
"epoch": 4.358799454297408,
"grad_norm": 0.21642915334480572,
"learning_rate": 6.993969469657991e-06,
"loss": 0.3621,
"step": 3195
},
{
"epoch": 4.365620736698499,
"grad_norm": 0.21985376534741252,
"learning_rate": 6.952391523882136e-06,
"loss": 0.3644,
"step": 3200
},
{
"epoch": 4.372442019099591,
"grad_norm": 0.20818031870682635,
"learning_rate": 6.911231978661756e-06,
"loss": 0.3577,
"step": 3205
},
{
"epoch": 4.379263301500682,
"grad_norm": 0.21797401592355672,
"learning_rate": 6.870491672107829e-06,
"loss": 0.3606,
"step": 3210
},
{
"epoch": 4.3860845839017735,
"grad_norm": 0.2223959557046167,
"learning_rate": 6.830171433794615e-06,
"loss": 0.3614,
"step": 3215
},
{
"epoch": 4.392905866302865,
"grad_norm": 0.21662605179762412,
"learning_rate": 6.79027208474272e-06,
"loss": 0.3619,
"step": 3220
},
{
"epoch": 4.399727148703956,
"grad_norm": 0.21827160013518693,
"learning_rate": 6.750794437402409e-06,
"loss": 0.3643,
"step": 3225
},
{
"epoch": 4.406548431105048,
"grad_norm": 0.2094602127422858,
"learning_rate": 6.711739295637037e-06,
"loss": 0.3665,
"step": 3230
},
{
"epoch": 4.413369713506139,
"grad_norm": 0.21671970848566585,
"learning_rate": 6.673107454706698e-06,
"loss": 0.3556,
"step": 3235
},
{
"epoch": 4.42019099590723,
"grad_norm": 0.21946450945142332,
"learning_rate": 6.634899701252023e-06,
"loss": 0.3584,
"step": 3240
},
{
"epoch": 4.427012278308322,
"grad_norm": 0.21520753526661168,
"learning_rate": 6.597116813278165e-06,
"loss": 0.3587,
"step": 3245
},
{
"epoch": 4.433833560709413,
"grad_norm": 0.21879280857490085,
"learning_rate": 6.559759560138951e-06,
"loss": 0.3738,
"step": 3250
},
{
"epoch": 4.440654843110504,
"grad_norm": 0.21443348635512918,
"learning_rate": 6.522828702521229e-06,
"loss": 0.3681,
"step": 3255
},
{
"epoch": 4.447476125511596,
"grad_norm": 0.21531181663268836,
"learning_rate": 6.486324992429374e-06,
"loss": 0.3586,
"step": 3260
},
{
"epoch": 4.454297407912687,
"grad_norm": 0.22730777162459695,
"learning_rate": 6.450249173169957e-06,
"loss": 0.3647,
"step": 3265
},
{
"epoch": 4.461118690313779,
"grad_norm": 0.2186572367289963,
"learning_rate": 6.414601979336641e-06,
"loss": 0.3663,
"step": 3270
},
{
"epoch": 4.467939972714871,
"grad_norm": 0.205548359460269,
"learning_rate": 6.379384136795187e-06,
"loss": 0.3652,
"step": 3275
},
{
"epoch": 4.474761255115962,
"grad_norm": 0.21275260150841085,
"learning_rate": 6.344596362668717e-06,
"loss": 0.3567,
"step": 3280
},
{
"epoch": 4.4815825375170535,
"grad_norm": 0.22394364468614514,
"learning_rate": 6.310239365323067e-06,
"loss": 0.3568,
"step": 3285
},
{
"epoch": 4.488403819918145,
"grad_norm": 0.21614224486754255,
"learning_rate": 6.276313844352398e-06,
"loss": 0.3674,
"step": 3290
},
{
"epoch": 4.495225102319236,
"grad_norm": 0.22717780466240847,
"learning_rate": 6.242820490564919e-06,
"loss": 0.3579,
"step": 3295
},
{
"epoch": 4.502046384720328,
"grad_norm": 0.2194581741821536,
"learning_rate": 6.209759985968859e-06,
"loss": 0.3586,
"step": 3300
},
{
"epoch": 4.508867667121419,
"grad_norm": 0.21205274286854864,
"learning_rate": 6.177133003758534e-06,
"loss": 0.3639,
"step": 3305
},
{
"epoch": 4.51568894952251,
"grad_norm": 0.20169862390766227,
"learning_rate": 6.144940208300686e-06,
"loss": 0.3645,
"step": 3310
},
{
"epoch": 4.522510231923602,
"grad_norm": 0.2307839103232325,
"learning_rate": 6.113182255120918e-06,
"loss": 0.3612,
"step": 3315
},
{
"epoch": 4.529331514324693,
"grad_norm": 0.24589136143206128,
"learning_rate": 6.081859790890362e-06,
"loss": 0.3637,
"step": 3320
},
{
"epoch": 4.536152796725784,
"grad_norm": 0.21286529915712976,
"learning_rate": 6.050973453412505e-06,
"loss": 0.3662,
"step": 3325
},
{
"epoch": 4.542974079126876,
"grad_norm": 0.21707616385137538,
"learning_rate": 6.02052387161022e-06,
"loss": 0.3593,
"step": 3330
},
{
"epoch": 4.549795361527967,
"grad_norm": 0.21842536440351526,
"learning_rate": 5.990511665512928e-06,
"loss": 0.3721,
"step": 3335
},
{
"epoch": 4.5566166439290585,
"grad_norm": 0.21294660571272483,
"learning_rate": 5.9609374462439985e-06,
"loss": 0.3676,
"step": 3340
},
{
"epoch": 4.56343792633015,
"grad_norm": 0.22337040238839628,
"learning_rate": 5.931801816008301e-06,
"loss": 0.3684,
"step": 3345
},
{
"epoch": 4.570259208731241,
"grad_norm": 0.2161036036245841,
"learning_rate": 5.903105368079925e-06,
"loss": 0.3758,
"step": 3350
},
{
"epoch": 4.577080491132333,
"grad_norm": 0.21231866353419215,
"learning_rate": 5.874848686790128e-06,
"loss": 0.3589,
"step": 3355
},
{
"epoch": 4.583901773533424,
"grad_norm": 0.21210490903696586,
"learning_rate": 5.84703234751541e-06,
"loss": 0.3627,
"step": 3360
},
{
"epoch": 4.590723055934515,
"grad_norm": 0.23088940972859365,
"learning_rate": 5.819656916665815e-06,
"loss": 0.3683,
"step": 3365
},
{
"epoch": 4.597544338335607,
"grad_norm": 0.22016010914864348,
"learning_rate": 5.792722951673392e-06,
"loss": 0.3685,
"step": 3370
},
{
"epoch": 4.604365620736699,
"grad_norm": 0.214327414487605,
"learning_rate": 5.766231000980844e-06,
"loss": 0.3656,
"step": 3375
},
{
"epoch": 4.61118690313779,
"grad_norm": 0.22682397661714535,
"learning_rate": 5.740181604030356e-06,
"loss": 0.3673,
"step": 3380
},
{
"epoch": 4.618008185538882,
"grad_norm": 0.20115490800552044,
"learning_rate": 5.7145752912526205e-06,
"loss": 0.3509,
"step": 3385
},
{
"epoch": 4.624829467939973,
"grad_norm": 0.2147637290224296,
"learning_rate": 5.689412584056033e-06,
"loss": 0.365,
"step": 3390
},
{
"epoch": 4.631650750341064,
"grad_norm": 0.22835409215980654,
"learning_rate": 5.664693994816064e-06,
"loss": 0.3636,
"step": 3395
},
{
"epoch": 4.638472032742156,
"grad_norm": 0.23181824569276913,
"learning_rate": 5.640420026864841e-06,
"loss": 0.3577,
"step": 3400
},
{
"epoch": 4.645293315143247,
"grad_norm": 0.21273142507245463,
"learning_rate": 5.616591174480892e-06,
"loss": 0.3754,
"step": 3405
},
{
"epoch": 4.6521145975443385,
"grad_norm": 0.23412233563353524,
"learning_rate": 5.593207922879085e-06,
"loss": 0.3635,
"step": 3410
},
{
"epoch": 4.65893587994543,
"grad_norm": 0.23805419714394244,
"learning_rate": 5.5702707482007375e-06,
"loss": 0.3602,
"step": 3415
},
{
"epoch": 4.665757162346521,
"grad_norm": 0.22195919424198418,
"learning_rate": 5.547780117503936e-06,
"loss": 0.3615,
"step": 3420
},
{
"epoch": 4.672578444747613,
"grad_norm": 0.21560156330796026,
"learning_rate": 5.525736488754013e-06,
"loss": 0.3632,
"step": 3425
},
{
"epoch": 4.679399727148704,
"grad_norm": 0.2128851675688275,
"learning_rate": 5.504140310814227e-06,
"loss": 0.3712,
"step": 3430
},
{
"epoch": 4.686221009549795,
"grad_norm": 0.21823521503347237,
"learning_rate": 5.482992023436628e-06,
"loss": 0.3626,
"step": 3435
},
{
"epoch": 4.693042291950887,
"grad_norm": 0.21111278578924456,
"learning_rate": 5.462292057253084e-06,
"loss": 0.3687,
"step": 3440
},
{
"epoch": 4.699863574351978,
"grad_norm": 0.2277150652115561,
"learning_rate": 5.442040833766537e-06,
"loss": 0.3646,
"step": 3445
},
{
"epoch": 4.7066848567530695,
"grad_norm": 0.22645677225238678,
"learning_rate": 5.422238765342407e-06,
"loss": 0.3683,
"step": 3450
},
{
"epoch": 4.713506139154161,
"grad_norm": 0.2351028025182847,
"learning_rate": 5.402886255200191e-06,
"loss": 0.3666,
"step": 3455
},
{
"epoch": 4.720327421555252,
"grad_norm": 0.2274769392892358,
"learning_rate": 5.383983697405264e-06,
"loss": 0.4029,
"step": 3460
},
{
"epoch": 4.727148703956344,
"grad_norm": 0.22256597899239655,
"learning_rate": 5.36553147686085e-06,
"loss": 0.3585,
"step": 3465
},
{
"epoch": 4.733969986357435,
"grad_norm": 0.22712154690395114,
"learning_rate": 5.3475299693001705e-06,
"loss": 0.3637,
"step": 3470
},
{
"epoch": 4.740791268758526,
"grad_norm": 0.2341092963683226,
"learning_rate": 5.329979541278825e-06,
"loss": 0.3593,
"step": 3475
},
{
"epoch": 4.747612551159618,
"grad_norm": 0.21942571969743654,
"learning_rate": 5.312880550167298e-06,
"loss": 0.3702,
"step": 3480
},
{
"epoch": 4.754433833560709,
"grad_norm": 0.21824512110145416,
"learning_rate": 5.296233344143691e-06,
"loss": 0.3652,
"step": 3485
},
{
"epoch": 4.7612551159618,
"grad_norm": 0.2228367756186903,
"learning_rate": 5.28003826218664e-06,
"loss": 0.3576,
"step": 3490
},
{
"epoch": 4.768076398362892,
"grad_norm": 0.206323937884375,
"learning_rate": 5.264295634068407e-06,
"loss": 0.3764,
"step": 3495
},
{
"epoch": 4.774897680763983,
"grad_norm": 0.219043701650007,
"learning_rate": 5.249005780348163e-06,
"loss": 0.3629,
"step": 3500
},
{
"epoch": 4.781718963165075,
"grad_norm": 0.2065583178371043,
"learning_rate": 5.234169012365458e-06,
"loss": 0.3674,
"step": 3505
},
{
"epoch": 4.788540245566167,
"grad_norm": 0.21803424570263244,
"learning_rate": 5.2197856322339e-06,
"loss": 0.3706,
"step": 3510
},
{
"epoch": 4.795361527967258,
"grad_norm": 0.19896410177188684,
"learning_rate": 5.205855932834974e-06,
"loss": 0.3563,
"step": 3515
},
{
"epoch": 4.8021828103683495,
"grad_norm": 0.21096568003167546,
"learning_rate": 5.192380197812105e-06,
"loss": 0.3646,
"step": 3520
},
{
"epoch": 4.809004092769441,
"grad_norm": 0.20895896960294863,
"learning_rate": 5.1793587015648676e-06,
"loss": 0.3668,
"step": 3525
},
{
"epoch": 4.815825375170532,
"grad_norm": 0.21356363495231379,
"learning_rate": 5.1667917092434e-06,
"loss": 0.3606,
"step": 3530
},
{
"epoch": 4.822646657571624,
"grad_norm": 0.2280566078765369,
"learning_rate": 5.154679476743011e-06,
"loss": 0.3729,
"step": 3535
},
{
"epoch": 4.829467939972715,
"grad_norm": 0.22121541945426695,
"learning_rate": 5.143022250698964e-06,
"loss": 0.3587,
"step": 3540
},
{
"epoch": 4.836289222373806,
"grad_norm": 0.2170468593712326,
"learning_rate": 5.1318202684814476e-06,
"loss": 0.3609,
"step": 3545
},
{
"epoch": 4.843110504774898,
"grad_norm": 0.21645996535263878,
"learning_rate": 5.121073758190766e-06,
"loss": 0.3753,
"step": 3550
},
{
"epoch": 4.849931787175989,
"grad_norm": 0.20281049437817825,
"learning_rate": 5.110782938652669e-06,
"loss": 0.3675,
"step": 3555
},
{
"epoch": 4.85675306957708,
"grad_norm": 0.21795270522334995,
"learning_rate": 5.100948019413905e-06,
"loss": 0.3629,
"step": 3560
},
{
"epoch": 4.863574351978172,
"grad_norm": 0.2332969083911167,
"learning_rate": 5.091569200737963e-06,
"loss": 0.3662,
"step": 3565
},
{
"epoch": 4.870395634379263,
"grad_norm": 0.22343458114543863,
"learning_rate": 5.082646673600981e-06,
"loss": 0.361,
"step": 3570
},
{
"epoch": 4.8772169167803545,
"grad_norm": 0.23104806486326432,
"learning_rate": 5.074180619687862e-06,
"loss": 0.3683,
"step": 3575
},
{
"epoch": 4.884038199181446,
"grad_norm": 0.2103538496331855,
"learning_rate": 5.066171211388582e-06,
"loss": 0.3587,
"step": 3580
},
{
"epoch": 4.890859481582537,
"grad_norm": 0.2260978460150931,
"learning_rate": 5.05861861179467e-06,
"loss": 0.3717,
"step": 3585
},
{
"epoch": 4.897680763983629,
"grad_norm": 0.21658341303568335,
"learning_rate": 5.051522974695889e-06,
"loss": 0.3663,
"step": 3590
},
{
"epoch": 4.90450204638472,
"grad_norm": 0.2160133005935788,
"learning_rate": 5.044884444577105e-06,
"loss": 0.3701,
"step": 3595
},
{
"epoch": 4.911323328785811,
"grad_norm": 0.22687277607733605,
"learning_rate": 5.038703156615354e-06,
"loss": 0.3685,
"step": 3600
},
{
"epoch": 4.918144611186904,
"grad_norm": 0.22565159385590897,
"learning_rate": 5.0329792366770686e-06,
"loss": 0.3682,
"step": 3605
},
{
"epoch": 4.924965893587995,
"grad_norm": 0.21363842038305725,
"learning_rate": 5.0277128013155404e-06,
"loss": 0.3647,
"step": 3610
},
{
"epoch": 4.931787175989086,
"grad_norm": 0.21645228300911193,
"learning_rate": 5.022903957768524e-06,
"loss": 0.3583,
"step": 3615
},
{
"epoch": 4.938608458390178,
"grad_norm": 0.2093479992601062,
"learning_rate": 5.0185528039560695e-06,
"loss": 0.3641,
"step": 3620
},
{
"epoch": 4.945429740791269,
"grad_norm": 0.21505943439715722,
"learning_rate": 5.01465942847852e-06,
"loss": 0.3662,
"step": 3625
},
{
"epoch": 4.95225102319236,
"grad_norm": 0.21629968585480175,
"learning_rate": 5.01122391061471e-06,
"loss": 0.3666,
"step": 3630
},
{
"epoch": 4.959072305593452,
"grad_norm": 0.21294059754853326,
"learning_rate": 5.008246320320353e-06,
"loss": 0.3631,
"step": 3635
},
{
"epoch": 4.965893587994543,
"grad_norm": 0.20559301512458342,
"learning_rate": 5.005726718226612e-06,
"loss": 0.3567,
"step": 3640
},
{
"epoch": 4.9727148703956345,
"grad_norm": 0.21200057872237174,
"learning_rate": 5.003665155638871e-06,
"loss": 0.3567,
"step": 3645
},
{
"epoch": 4.979536152796726,
"grad_norm": 0.21168073451110256,
"learning_rate": 5.002061674535687e-06,
"loss": 0.3642,
"step": 3650
},
{
"epoch": 4.986357435197817,
"grad_norm": 0.22189263802123968,
"learning_rate": 5.00091630756793e-06,
"loss": 0.3646,
"step": 3655
},
{
"epoch": 4.993178717598909,
"grad_norm": 0.21534576409298245,
"learning_rate": 5.0002290780581325e-06,
"loss": 0.3683,
"step": 3660
},
{
"epoch": 5.0,
"grad_norm": 0.21118277872368893,
"learning_rate": 5e-06,
"loss": 0.3618,
"step": 3665
},
{
"epoch": 5.0,
"step": 3665,
"total_flos": 5731176614461440.0,
"train_loss": 0.4375836861247418,
"train_runtime": 77494.5958,
"train_samples_per_second": 6.048,
"train_steps_per_second": 0.047
}
],
"logging_steps": 5,
"max_steps": 3665,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5731176614461440.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}