diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12054 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.999534233814625, + "eval_steps": 500, + "global_step": 8584, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004657661853749418, + "grad_norm": 5.635344410943957, + "learning_rate": 2.910360884749709e-07, + "loss": 1.3166, + "step": 5 + }, + { + "epoch": 0.009315323707498836, + "grad_norm": 5.5648851943522954, + "learning_rate": 5.820721769499418e-07, + "loss": 1.302, + "step": 10 + }, + { + "epoch": 0.013972985561248253, + "grad_norm": 5.3123150312900895, + "learning_rate": 8.731082654249127e-07, + "loss": 1.2882, + "step": 15 + }, + { + "epoch": 0.018630647414997672, + "grad_norm": 4.237789240817698, + "learning_rate": 1.1641443538998836e-06, + "loss": 1.2574, + "step": 20 + }, + { + "epoch": 0.02328830926874709, + "grad_norm": 2.4660698848230447, + "learning_rate": 1.4551804423748545e-06, + "loss": 1.2017, + "step": 25 + }, + { + "epoch": 0.027945971122496506, + "grad_norm": 2.056462779510106, + "learning_rate": 1.7462165308498253e-06, + "loss": 1.1262, + "step": 30 + }, + { + "epoch": 0.032603632976245925, + "grad_norm": 1.4327847209433977, + "learning_rate": 2.037252619324796e-06, + "loss": 1.078, + "step": 35 + }, + { + "epoch": 0.037261294829995344, + "grad_norm": 1.3173231724943057, + "learning_rate": 2.3282887077997673e-06, + "loss": 1.0082, + "step": 40 + }, + { + "epoch": 0.04191895668374476, + "grad_norm": 0.8876525406538337, + "learning_rate": 2.6193247962747383e-06, + "loss": 0.9316, + "step": 45 + }, + { + "epoch": 0.04657661853749418, + "grad_norm": 0.7198347171125989, + "learning_rate": 2.910360884749709e-06, + "loss": 0.8925, + "step": 50 + }, + { + "epoch": 0.05123428039124359, + "grad_norm": 0.7051557927465876, + "learning_rate": 3.2013969732246805e-06, + "loss": 0.8458, + "step": 55 + }, + { + "epoch": 0.05589194224499301, + "grad_norm": 0.5078638639252016, + "learning_rate": 3.4924330616996507e-06, + "loss": 0.8224, + "step": 60 + }, + { + "epoch": 0.06054960409874243, + "grad_norm": 0.5201115702404338, + "learning_rate": 3.7834691501746217e-06, + "loss": 0.7945, + "step": 65 + }, + { + "epoch": 0.06520726595249185, + "grad_norm": 0.4026038703421235, + "learning_rate": 4.074505238649592e-06, + "loss": 0.7554, + "step": 70 + }, + { + "epoch": 0.06986492780624126, + "grad_norm": 0.35147895058479645, + "learning_rate": 4.3655413271245635e-06, + "loss": 0.7448, + "step": 75 + }, + { + "epoch": 0.07452258965999069, + "grad_norm": 0.3792473243721313, + "learning_rate": 4.6565774155995345e-06, + "loss": 0.7502, + "step": 80 + }, + { + "epoch": 0.0791802515137401, + "grad_norm": 0.33218038449745835, + "learning_rate": 4.947613504074506e-06, + "loss": 0.7138, + "step": 85 + }, + { + "epoch": 0.08383791336748952, + "grad_norm": 0.375352695478229, + "learning_rate": 5.238649592549477e-06, + "loss": 0.7143, + "step": 90 + }, + { + "epoch": 0.08849557522123894, + "grad_norm": 0.3853692458917314, + "learning_rate": 5.529685681024447e-06, + "loss": 0.7169, + "step": 95 + }, + { + "epoch": 0.09315323707498836, + "grad_norm": 0.33731584102558443, + "learning_rate": 5.820721769499418e-06, + "loss": 0.6978, + "step": 100 + }, + { + "epoch": 0.09781089892873777, + "grad_norm": 0.33360385089561806, + "learning_rate": 6.111757857974389e-06, + "loss": 0.6988, + "step": 105 + }, + { + "epoch": 0.10246856078248719, + "grad_norm": 0.3322529338881453, + "learning_rate": 6.402793946449361e-06, + "loss": 0.6786, + "step": 110 + }, + { + "epoch": 0.10712622263623661, + "grad_norm": 0.33345730335216767, + "learning_rate": 6.693830034924331e-06, + "loss": 0.6742, + "step": 115 + }, + { + "epoch": 0.11178388448998602, + "grad_norm": 0.3480904112152002, + "learning_rate": 6.984866123399301e-06, + "loss": 0.6843, + "step": 120 + }, + { + "epoch": 0.11644154634373545, + "grad_norm": 0.35538266924148043, + "learning_rate": 7.275902211874273e-06, + "loss": 0.6553, + "step": 125 + }, + { + "epoch": 0.12109920819748486, + "grad_norm": 0.36029823100280123, + "learning_rate": 7.5669383003492435e-06, + "loss": 0.6565, + "step": 130 + }, + { + "epoch": 0.1257568700512343, + "grad_norm": 0.3882208414109668, + "learning_rate": 7.857974388824214e-06, + "loss": 0.6634, + "step": 135 + }, + { + "epoch": 0.1304145319049837, + "grad_norm": 0.37180571772425014, + "learning_rate": 8.149010477299185e-06, + "loss": 0.6519, + "step": 140 + }, + { + "epoch": 0.1350721937587331, + "grad_norm": 0.3776764291908277, + "learning_rate": 8.440046565774158e-06, + "loss": 0.663, + "step": 145 + }, + { + "epoch": 0.13972985561248252, + "grad_norm": 0.37473220243738176, + "learning_rate": 8.731082654249127e-06, + "loss": 0.6526, + "step": 150 + }, + { + "epoch": 0.14438751746623196, + "grad_norm": 0.4110351084409799, + "learning_rate": 9.022118742724098e-06, + "loss": 0.6283, + "step": 155 + }, + { + "epoch": 0.14904517931998137, + "grad_norm": 0.4199064132815328, + "learning_rate": 9.313154831199069e-06, + "loss": 0.6344, + "step": 160 + }, + { + "epoch": 0.1537028411737308, + "grad_norm": 0.36807908599975947, + "learning_rate": 9.60419091967404e-06, + "loss": 0.6343, + "step": 165 + }, + { + "epoch": 0.1583605030274802, + "grad_norm": 0.3840459636079555, + "learning_rate": 9.895227008149011e-06, + "loss": 0.6303, + "step": 170 + }, + { + "epoch": 0.1630181648812296, + "grad_norm": 0.4201103965332282, + "learning_rate": 1.0186263096623982e-05, + "loss": 0.6418, + "step": 175 + }, + { + "epoch": 0.16767582673497905, + "grad_norm": 0.4114690700055479, + "learning_rate": 1.0477299185098953e-05, + "loss": 0.6335, + "step": 180 + }, + { + "epoch": 0.17233348858872846, + "grad_norm": 0.4136408854558498, + "learning_rate": 1.0768335273573923e-05, + "loss": 0.6299, + "step": 185 + }, + { + "epoch": 0.17699115044247787, + "grad_norm": 0.42259582362571485, + "learning_rate": 1.1059371362048894e-05, + "loss": 0.6224, + "step": 190 + }, + { + "epoch": 0.18164881229622729, + "grad_norm": 0.4423031089666256, + "learning_rate": 1.1350407450523866e-05, + "loss": 0.6274, + "step": 195 + }, + { + "epoch": 0.18630647414997673, + "grad_norm": 0.403271923196446, + "learning_rate": 1.1641443538998836e-05, + "loss": 0.594, + "step": 200 + }, + { + "epoch": 0.19096413600372614, + "grad_norm": 0.436282244225205, + "learning_rate": 1.1932479627473807e-05, + "loss": 0.6187, + "step": 205 + }, + { + "epoch": 0.19562179785747555, + "grad_norm": 0.39792760577510866, + "learning_rate": 1.2223515715948778e-05, + "loss": 0.6059, + "step": 210 + }, + { + "epoch": 0.20027945971122496, + "grad_norm": 0.39812469275850443, + "learning_rate": 1.2514551804423749e-05, + "loss": 0.6181, + "step": 215 + }, + { + "epoch": 0.20493712156497437, + "grad_norm": 0.39376887358317875, + "learning_rate": 1.2805587892898722e-05, + "loss": 0.6108, + "step": 220 + }, + { + "epoch": 0.2095947834187238, + "grad_norm": 0.464652404653219, + "learning_rate": 1.309662398137369e-05, + "loss": 0.614, + "step": 225 + }, + { + "epoch": 0.21425244527247322, + "grad_norm": 0.4023604229121632, + "learning_rate": 1.3387660069848662e-05, + "loss": 0.6108, + "step": 230 + }, + { + "epoch": 0.21891010712622264, + "grad_norm": 0.4360445909537957, + "learning_rate": 1.3678696158323633e-05, + "loss": 0.6049, + "step": 235 + }, + { + "epoch": 0.22356776897997205, + "grad_norm": 0.4373905365842894, + "learning_rate": 1.3969732246798603e-05, + "loss": 0.603, + "step": 240 + }, + { + "epoch": 0.22822543083372146, + "grad_norm": 0.39179031021650224, + "learning_rate": 1.4260768335273575e-05, + "loss": 0.59, + "step": 245 + }, + { + "epoch": 0.2328830926874709, + "grad_norm": 0.4196746075080395, + "learning_rate": 1.4551804423748547e-05, + "loss": 0.5865, + "step": 250 + }, + { + "epoch": 0.2375407545412203, + "grad_norm": 0.4303813452229446, + "learning_rate": 1.4842840512223516e-05, + "loss": 0.5984, + "step": 255 + }, + { + "epoch": 0.24219841639496972, + "grad_norm": 0.47523169063383375, + "learning_rate": 1.5133876600698487e-05, + "loss": 0.6036, + "step": 260 + }, + { + "epoch": 0.24685607824871914, + "grad_norm": 0.4335910841568507, + "learning_rate": 1.5424912689173458e-05, + "loss": 0.5947, + "step": 265 + }, + { + "epoch": 0.2515137401024686, + "grad_norm": 0.41341980426138514, + "learning_rate": 1.5715948777648427e-05, + "loss": 0.5813, + "step": 270 + }, + { + "epoch": 0.25617140195621796, + "grad_norm": 0.5160767565953956, + "learning_rate": 1.60069848661234e-05, + "loss": 0.6011, + "step": 275 + }, + { + "epoch": 0.2608290638099674, + "grad_norm": 0.5529678248494052, + "learning_rate": 1.629802095459837e-05, + "loss": 0.5876, + "step": 280 + }, + { + "epoch": 0.26548672566371684, + "grad_norm": 0.47547593485478673, + "learning_rate": 1.6589057043073342e-05, + "loss": 0.584, + "step": 285 + }, + { + "epoch": 0.2701443875174662, + "grad_norm": 0.45222675935849044, + "learning_rate": 1.6880093131548315e-05, + "loss": 0.5823, + "step": 290 + }, + { + "epoch": 0.27480204937121566, + "grad_norm": 0.4510297094925413, + "learning_rate": 1.717112922002328e-05, + "loss": 0.5742, + "step": 295 + }, + { + "epoch": 0.27945971122496505, + "grad_norm": 0.5521894478552463, + "learning_rate": 1.7462165308498254e-05, + "loss": 0.5866, + "step": 300 + }, + { + "epoch": 0.2841173730787145, + "grad_norm": 0.44020499832083604, + "learning_rate": 1.7753201396973227e-05, + "loss": 0.5941, + "step": 305 + }, + { + "epoch": 0.2887750349324639, + "grad_norm": 0.4401227459843441, + "learning_rate": 1.8044237485448196e-05, + "loss": 0.5884, + "step": 310 + }, + { + "epoch": 0.2934326967862133, + "grad_norm": 0.42288303099486596, + "learning_rate": 1.833527357392317e-05, + "loss": 0.5751, + "step": 315 + }, + { + "epoch": 0.29809035863996275, + "grad_norm": 0.44248419559906405, + "learning_rate": 1.8626309662398138e-05, + "loss": 0.5843, + "step": 320 + }, + { + "epoch": 0.30274802049371213, + "grad_norm": 0.4192435308637777, + "learning_rate": 1.8917345750873107e-05, + "loss": 0.5868, + "step": 325 + }, + { + "epoch": 0.3074056823474616, + "grad_norm": 0.4499232353603837, + "learning_rate": 1.920838183934808e-05, + "loss": 0.5847, + "step": 330 + }, + { + "epoch": 0.312063344201211, + "grad_norm": 0.4256491052871394, + "learning_rate": 1.9499417927823053e-05, + "loss": 0.5645, + "step": 335 + }, + { + "epoch": 0.3167210060549604, + "grad_norm": 0.4862344891338101, + "learning_rate": 1.9790454016298022e-05, + "loss": 0.5799, + "step": 340 + }, + { + "epoch": 0.32137866790870984, + "grad_norm": 0.44409885524054266, + "learning_rate": 2.0081490104772992e-05, + "loss": 0.5676, + "step": 345 + }, + { + "epoch": 0.3260363297624592, + "grad_norm": 0.5273940404660536, + "learning_rate": 2.0372526193247964e-05, + "loss": 0.5705, + "step": 350 + }, + { + "epoch": 0.33069399161620866, + "grad_norm": 0.4506295177797624, + "learning_rate": 2.0663562281722934e-05, + "loss": 0.5663, + "step": 355 + }, + { + "epoch": 0.3353516534699581, + "grad_norm": 0.4429860512388572, + "learning_rate": 2.0954598370197907e-05, + "loss": 0.5777, + "step": 360 + }, + { + "epoch": 0.3400093153237075, + "grad_norm": 0.47551309129603747, + "learning_rate": 2.124563445867288e-05, + "loss": 0.5771, + "step": 365 + }, + { + "epoch": 0.3446669771774569, + "grad_norm": 0.5109560446100108, + "learning_rate": 2.1536670547147845e-05, + "loss": 0.588, + "step": 370 + }, + { + "epoch": 0.3493246390312063, + "grad_norm": 0.5013967031612557, + "learning_rate": 2.1827706635622818e-05, + "loss": 0.5805, + "step": 375 + }, + { + "epoch": 0.35398230088495575, + "grad_norm": 0.5029584087507263, + "learning_rate": 2.2118742724097787e-05, + "loss": 0.5655, + "step": 380 + }, + { + "epoch": 0.3586399627387052, + "grad_norm": 0.392267985885871, + "learning_rate": 2.240977881257276e-05, + "loss": 0.5588, + "step": 385 + }, + { + "epoch": 0.36329762459245457, + "grad_norm": 0.49346124052819235, + "learning_rate": 2.2700814901047733e-05, + "loss": 0.5654, + "step": 390 + }, + { + "epoch": 0.367955286446204, + "grad_norm": 0.5440269465150551, + "learning_rate": 2.2991850989522702e-05, + "loss": 0.5576, + "step": 395 + }, + { + "epoch": 0.37261294829995345, + "grad_norm": 0.47389647089010667, + "learning_rate": 2.3282887077997672e-05, + "loss": 0.5544, + "step": 400 + }, + { + "epoch": 0.37727061015370283, + "grad_norm": 0.5060217377835531, + "learning_rate": 2.3573923166472644e-05, + "loss": 0.5581, + "step": 405 + }, + { + "epoch": 0.3819282720074523, + "grad_norm": 0.4280491572297177, + "learning_rate": 2.3864959254947614e-05, + "loss": 0.5648, + "step": 410 + }, + { + "epoch": 0.38658593386120166, + "grad_norm": 0.4546099384450869, + "learning_rate": 2.4155995343422587e-05, + "loss": 0.5648, + "step": 415 + }, + { + "epoch": 0.3912435957149511, + "grad_norm": 0.44032956083221475, + "learning_rate": 2.4447031431897556e-05, + "loss": 0.5523, + "step": 420 + }, + { + "epoch": 0.39590125756870054, + "grad_norm": 0.4418506911683883, + "learning_rate": 2.4738067520372525e-05, + "loss": 0.5711, + "step": 425 + }, + { + "epoch": 0.4005589194224499, + "grad_norm": 0.521278363776306, + "learning_rate": 2.5029103608847498e-05, + "loss": 0.5643, + "step": 430 + }, + { + "epoch": 0.40521658127619936, + "grad_norm": 0.532595160062641, + "learning_rate": 2.532013969732247e-05, + "loss": 0.5665, + "step": 435 + }, + { + "epoch": 0.40987424312994875, + "grad_norm": 0.41035785696363714, + "learning_rate": 2.5611175785797444e-05, + "loss": 0.5776, + "step": 440 + }, + { + "epoch": 0.4145319049836982, + "grad_norm": 0.4116095623614095, + "learning_rate": 2.590221187427241e-05, + "loss": 0.5502, + "step": 445 + }, + { + "epoch": 0.4191895668374476, + "grad_norm": 0.5062712817242586, + "learning_rate": 2.619324796274738e-05, + "loss": 0.5552, + "step": 450 + }, + { + "epoch": 0.423847228691197, + "grad_norm": 0.5033640845621997, + "learning_rate": 2.6484284051222352e-05, + "loss": 0.5762, + "step": 455 + }, + { + "epoch": 0.42850489054494645, + "grad_norm": 0.5035654454052618, + "learning_rate": 2.6775320139697325e-05, + "loss": 0.5546, + "step": 460 + }, + { + "epoch": 0.43316255239869583, + "grad_norm": 0.4336291085369849, + "learning_rate": 2.7066356228172297e-05, + "loss": 0.5641, + "step": 465 + }, + { + "epoch": 0.43782021425244527, + "grad_norm": 0.4699748065337065, + "learning_rate": 2.7357392316647267e-05, + "loss": 0.5551, + "step": 470 + }, + { + "epoch": 0.4424778761061947, + "grad_norm": 0.5259808485283921, + "learning_rate": 2.7648428405122233e-05, + "loss": 0.5655, + "step": 475 + }, + { + "epoch": 0.4471355379599441, + "grad_norm": 0.4170440552225455, + "learning_rate": 2.7939464493597205e-05, + "loss": 0.5495, + "step": 480 + }, + { + "epoch": 0.45179319981369354, + "grad_norm": 0.4402168017776964, + "learning_rate": 2.8230500582072178e-05, + "loss": 0.5522, + "step": 485 + }, + { + "epoch": 0.4564508616674429, + "grad_norm": 0.4569052068490754, + "learning_rate": 2.852153667054715e-05, + "loss": 0.5565, + "step": 490 + }, + { + "epoch": 0.46110852352119236, + "grad_norm": 0.4978898390597469, + "learning_rate": 2.881257275902212e-05, + "loss": 0.5466, + "step": 495 + }, + { + "epoch": 0.4657661853749418, + "grad_norm": 0.4229733715419873, + "learning_rate": 2.9103608847497093e-05, + "loss": 0.5554, + "step": 500 + }, + { + "epoch": 0.4704238472286912, + "grad_norm": 0.43529337252388356, + "learning_rate": 2.939464493597206e-05, + "loss": 0.5482, + "step": 505 + }, + { + "epoch": 0.4750815090824406, + "grad_norm": 0.5160534559430204, + "learning_rate": 2.9685681024447032e-05, + "loss": 0.5444, + "step": 510 + }, + { + "epoch": 0.47973917093619, + "grad_norm": 0.443484452067975, + "learning_rate": 2.9976717112922005e-05, + "loss": 0.5572, + "step": 515 + }, + { + "epoch": 0.48439683278993945, + "grad_norm": 0.5944864965501068, + "learning_rate": 3.0267753201396974e-05, + "loss": 0.5524, + "step": 520 + }, + { + "epoch": 0.4890544946436889, + "grad_norm": 0.5071449662346628, + "learning_rate": 3.055878928987195e-05, + "loss": 0.5638, + "step": 525 + }, + { + "epoch": 0.49371215649743827, + "grad_norm": 0.4939766509456816, + "learning_rate": 3.0849825378346916e-05, + "loss": 0.5466, + "step": 530 + }, + { + "epoch": 0.4983698183511877, + "grad_norm": 0.454732349457497, + "learning_rate": 3.1140861466821885e-05, + "loss": 0.5533, + "step": 535 + }, + { + "epoch": 0.5030274802049371, + "grad_norm": 0.47844922111423144, + "learning_rate": 3.1431897555296855e-05, + "loss": 0.5575, + "step": 540 + }, + { + "epoch": 0.5076851420586865, + "grad_norm": 0.48361551844718786, + "learning_rate": 3.172293364377183e-05, + "loss": 0.5452, + "step": 545 + }, + { + "epoch": 0.5123428039124359, + "grad_norm": 0.5181559294500859, + "learning_rate": 3.20139697322468e-05, + "loss": 0.5618, + "step": 550 + }, + { + "epoch": 0.5170004657661854, + "grad_norm": 0.4899936584071396, + "learning_rate": 3.2305005820721776e-05, + "loss": 0.5481, + "step": 555 + }, + { + "epoch": 0.5216581276199348, + "grad_norm": 0.5506585569549096, + "learning_rate": 3.259604190919674e-05, + "loss": 0.5538, + "step": 560 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.6754186581156059, + "learning_rate": 3.288707799767171e-05, + "loss": 0.5657, + "step": 565 + }, + { + "epoch": 0.5309734513274337, + "grad_norm": 0.6458262406019906, + "learning_rate": 3.3178114086146685e-05, + "loss": 0.5804, + "step": 570 + }, + { + "epoch": 0.5356311131811831, + "grad_norm": 0.5677532060663508, + "learning_rate": 3.3469150174621654e-05, + "loss": 0.5509, + "step": 575 + }, + { + "epoch": 0.5402887750349324, + "grad_norm": 0.5140073417715375, + "learning_rate": 3.376018626309663e-05, + "loss": 0.5498, + "step": 580 + }, + { + "epoch": 0.5449464368886818, + "grad_norm": 0.47973716619213574, + "learning_rate": 3.40512223515716e-05, + "loss": 0.5437, + "step": 585 + }, + { + "epoch": 0.5496040987424313, + "grad_norm": 2.850114498375805, + "learning_rate": 3.434225844004656e-05, + "loss": 0.5621, + "step": 590 + }, + { + "epoch": 0.5542617605961807, + "grad_norm": 0.7132559839205367, + "learning_rate": 3.463329452852154e-05, + "loss": 0.6687, + "step": 595 + }, + { + "epoch": 0.5589194224499301, + "grad_norm": 0.6423899235862081, + "learning_rate": 3.492433061699651e-05, + "loss": 0.5595, + "step": 600 + }, + { + "epoch": 0.5635770843036796, + "grad_norm": 0.698995384460383, + "learning_rate": 3.5215366705471484e-05, + "loss": 0.5625, + "step": 605 + }, + { + "epoch": 0.568234746157429, + "grad_norm": 0.4903608237478288, + "learning_rate": 3.550640279394645e-05, + "loss": 0.5516, + "step": 610 + }, + { + "epoch": 0.5728924080111784, + "grad_norm": 4.379274402774887, + "learning_rate": 3.579743888242142e-05, + "loss": 0.5441, + "step": 615 + }, + { + "epoch": 0.5775500698649279, + "grad_norm": 0.512657850098662, + "learning_rate": 3.608847497089639e-05, + "loss": 0.5458, + "step": 620 + }, + { + "epoch": 0.5822077317186772, + "grad_norm": 0.43346421191572665, + "learning_rate": 3.637951105937136e-05, + "loss": 0.5569, + "step": 625 + }, + { + "epoch": 0.5868653935724266, + "grad_norm": 0.531831351876456, + "learning_rate": 3.667054714784634e-05, + "loss": 0.5499, + "step": 630 + }, + { + "epoch": 0.5915230554261761, + "grad_norm": 0.6622562456603708, + "learning_rate": 3.696158323632131e-05, + "loss": 0.5594, + "step": 635 + }, + { + "epoch": 0.5961807172799255, + "grad_norm": 0.44507715538737447, + "learning_rate": 3.7252619324796276e-05, + "loss": 0.5487, + "step": 640 + }, + { + "epoch": 0.6008383791336749, + "grad_norm": 5.752742943024781, + "learning_rate": 3.7543655413271246e-05, + "loss": 0.5868, + "step": 645 + }, + { + "epoch": 0.6054960409874243, + "grad_norm": 0.5592925415997116, + "learning_rate": 3.7834691501746215e-05, + "loss": 0.545, + "step": 650 + }, + { + "epoch": 0.6101537028411738, + "grad_norm": 0.3957341710601061, + "learning_rate": 3.812572759022119e-05, + "loss": 0.5493, + "step": 655 + }, + { + "epoch": 0.6148113646949231, + "grad_norm": 0.45043284107748643, + "learning_rate": 3.841676367869616e-05, + "loss": 0.5519, + "step": 660 + }, + { + "epoch": 0.6194690265486725, + "grad_norm": 0.4597678841607699, + "learning_rate": 3.870779976717113e-05, + "loss": 0.5451, + "step": 665 + }, + { + "epoch": 0.624126688402422, + "grad_norm": 0.4401269476751836, + "learning_rate": 3.8998835855646106e-05, + "loss": 0.5478, + "step": 670 + }, + { + "epoch": 0.6287843502561714, + "grad_norm": 0.4517164819813474, + "learning_rate": 3.928987194412107e-05, + "loss": 0.5377, + "step": 675 + }, + { + "epoch": 0.6334420121099208, + "grad_norm": 0.42856901369174305, + "learning_rate": 3.9580908032596045e-05, + "loss": 0.5419, + "step": 680 + }, + { + "epoch": 0.6380996739636703, + "grad_norm": 0.4778106030676651, + "learning_rate": 3.9871944121071014e-05, + "loss": 0.5557, + "step": 685 + }, + { + "epoch": 0.6427573358174197, + "grad_norm": 0.4810252200454141, + "learning_rate": 4.0162980209545983e-05, + "loss": 0.5424, + "step": 690 + }, + { + "epoch": 0.6474149976711691, + "grad_norm": 0.42113883038821803, + "learning_rate": 4.045401629802096e-05, + "loss": 0.5421, + "step": 695 + }, + { + "epoch": 0.6520726595249184, + "grad_norm": 0.48920856808484403, + "learning_rate": 4.074505238649593e-05, + "loss": 0.5391, + "step": 700 + }, + { + "epoch": 0.6567303213786679, + "grad_norm": 0.48208747773248684, + "learning_rate": 4.10360884749709e-05, + "loss": 0.5354, + "step": 705 + }, + { + "epoch": 0.6613879832324173, + "grad_norm": 0.4687178891717093, + "learning_rate": 4.132712456344587e-05, + "loss": 0.5485, + "step": 710 + }, + { + "epoch": 0.6660456450861667, + "grad_norm": 0.45466895427802073, + "learning_rate": 4.161816065192084e-05, + "loss": 0.5429, + "step": 715 + }, + { + "epoch": 0.6707033069399162, + "grad_norm": 0.40939740583021633, + "learning_rate": 4.190919674039581e-05, + "loss": 0.5311, + "step": 720 + }, + { + "epoch": 0.6753609687936656, + "grad_norm": 0.4026942565114344, + "learning_rate": 4.220023282887078e-05, + "loss": 0.5365, + "step": 725 + }, + { + "epoch": 0.680018630647415, + "grad_norm": 0.44458418781623743, + "learning_rate": 4.249126891734576e-05, + "loss": 0.5445, + "step": 730 + }, + { + "epoch": 0.6846762925011645, + "grad_norm": 0.45908128679549104, + "learning_rate": 4.278230500582072e-05, + "loss": 0.5318, + "step": 735 + }, + { + "epoch": 0.6893339543549138, + "grad_norm": 0.4336786017781467, + "learning_rate": 4.307334109429569e-05, + "loss": 0.5392, + "step": 740 + }, + { + "epoch": 0.6939916162086632, + "grad_norm": 0.6430149710986586, + "learning_rate": 4.336437718277067e-05, + "loss": 0.5396, + "step": 745 + }, + { + "epoch": 0.6986492780624126, + "grad_norm": 0.5060406772414919, + "learning_rate": 4.3655413271245636e-05, + "loss": 0.542, + "step": 750 + }, + { + "epoch": 0.7033069399161621, + "grad_norm": 0.43575745452775133, + "learning_rate": 4.394644935972061e-05, + "loss": 0.5287, + "step": 755 + }, + { + "epoch": 0.7079646017699115, + "grad_norm": 0.4509480021266233, + "learning_rate": 4.4237485448195575e-05, + "loss": 0.5312, + "step": 760 + }, + { + "epoch": 0.7126222636236609, + "grad_norm": 0.39247296762258177, + "learning_rate": 4.452852153667055e-05, + "loss": 0.5428, + "step": 765 + }, + { + "epoch": 0.7172799254774104, + "grad_norm": 0.43065576347919365, + "learning_rate": 4.481955762514552e-05, + "loss": 0.5408, + "step": 770 + }, + { + "epoch": 0.7219375873311598, + "grad_norm": 0.4968073164264031, + "learning_rate": 4.511059371362049e-05, + "loss": 0.5277, + "step": 775 + }, + { + "epoch": 0.7265952491849091, + "grad_norm": 0.4752250835263456, + "learning_rate": 4.5401629802095466e-05, + "loss": 0.5461, + "step": 780 + }, + { + "epoch": 0.7312529110386586, + "grad_norm": 0.3917023142794256, + "learning_rate": 4.5692665890570435e-05, + "loss": 0.539, + "step": 785 + }, + { + "epoch": 0.735910572892408, + "grad_norm": 0.42438394966515364, + "learning_rate": 4.5983701979045405e-05, + "loss": 0.5303, + "step": 790 + }, + { + "epoch": 0.7405682347461574, + "grad_norm": 0.39706636448503807, + "learning_rate": 4.6274738067520374e-05, + "loss": 0.5371, + "step": 795 + }, + { + "epoch": 0.7452258965999069, + "grad_norm": 0.4034247813868715, + "learning_rate": 4.6565774155995343e-05, + "loss": 0.5334, + "step": 800 + }, + { + "epoch": 0.7498835584536563, + "grad_norm": 0.38931923519454176, + "learning_rate": 4.685681024447032e-05, + "loss": 0.5494, + "step": 805 + }, + { + "epoch": 0.7545412203074057, + "grad_norm": 0.4126988813279989, + "learning_rate": 4.714784633294529e-05, + "loss": 0.5329, + "step": 810 + }, + { + "epoch": 0.759198882161155, + "grad_norm": 0.41847891492257616, + "learning_rate": 4.743888242142026e-05, + "loss": 0.5368, + "step": 815 + }, + { + "epoch": 0.7638565440149045, + "grad_norm": 0.37832989246330034, + "learning_rate": 4.772991850989523e-05, + "loss": 0.5368, + "step": 820 + }, + { + "epoch": 0.7685142058686539, + "grad_norm": 0.4264254827761458, + "learning_rate": 4.80209545983702e-05, + "loss": 0.5222, + "step": 825 + }, + { + "epoch": 0.7731718677224033, + "grad_norm": 0.4215105843173847, + "learning_rate": 4.831199068684517e-05, + "loss": 0.5321, + "step": 830 + }, + { + "epoch": 0.7778295295761528, + "grad_norm": 0.40779532756054626, + "learning_rate": 4.860302677532014e-05, + "loss": 0.5386, + "step": 835 + }, + { + "epoch": 0.7824871914299022, + "grad_norm": 0.43656255019856294, + "learning_rate": 4.889406286379511e-05, + "loss": 0.5229, + "step": 840 + }, + { + "epoch": 0.7871448532836516, + "grad_norm": 0.46094480601401433, + "learning_rate": 4.918509895227008e-05, + "loss": 0.5301, + "step": 845 + }, + { + "epoch": 0.7918025151374011, + "grad_norm": 0.4382225357931327, + "learning_rate": 4.947613504074505e-05, + "loss": 0.5324, + "step": 850 + }, + { + "epoch": 0.7964601769911505, + "grad_norm": 0.37445428395677366, + "learning_rate": 4.976717112922003e-05, + "loss": 0.5215, + "step": 855 + }, + { + "epoch": 0.8011178388448998, + "grad_norm": 0.4172807893879857, + "learning_rate": 4.9993527508090615e-05, + "loss": 0.5264, + "step": 860 + }, + { + "epoch": 0.8057755006986492, + "grad_norm": 0.42493387898618923, + "learning_rate": 4.996116504854369e-05, + "loss": 0.5289, + "step": 865 + }, + { + "epoch": 0.8104331625523987, + "grad_norm": 0.4793713302760421, + "learning_rate": 4.992880258899677e-05, + "loss": 0.522, + "step": 870 + }, + { + "epoch": 0.8150908244061481, + "grad_norm": 0.42581510639579395, + "learning_rate": 4.989644012944984e-05, + "loss": 0.5353, + "step": 875 + }, + { + "epoch": 0.8197484862598975, + "grad_norm": 0.39346573629830267, + "learning_rate": 4.9864077669902914e-05, + "loss": 0.5143, + "step": 880 + }, + { + "epoch": 0.824406148113647, + "grad_norm": 0.46479332953603936, + "learning_rate": 4.983171521035599e-05, + "loss": 0.5343, + "step": 885 + }, + { + "epoch": 0.8290638099673964, + "grad_norm": 0.38234965919360414, + "learning_rate": 4.979935275080906e-05, + "loss": 0.5323, + "step": 890 + }, + { + "epoch": 0.8337214718211458, + "grad_norm": 0.38452753250908844, + "learning_rate": 4.976699029126214e-05, + "loss": 0.5323, + "step": 895 + }, + { + "epoch": 0.8383791336748952, + "grad_norm": 0.3558793576496668, + "learning_rate": 4.9734627831715214e-05, + "loss": 0.5258, + "step": 900 + }, + { + "epoch": 0.8430367955286446, + "grad_norm": 0.40705429734477666, + "learning_rate": 4.970226537216829e-05, + "loss": 0.5293, + "step": 905 + }, + { + "epoch": 0.847694457382394, + "grad_norm": 0.3681084398657872, + "learning_rate": 4.966990291262136e-05, + "loss": 0.5377, + "step": 910 + }, + { + "epoch": 0.8523521192361434, + "grad_norm": 0.4451189523688905, + "learning_rate": 4.963754045307444e-05, + "loss": 0.526, + "step": 915 + }, + { + "epoch": 0.8570097810898929, + "grad_norm": 0.4098553913425108, + "learning_rate": 4.9605177993527513e-05, + "loss": 0.5358, + "step": 920 + }, + { + "epoch": 0.8616674429436423, + "grad_norm": 0.4210630886877289, + "learning_rate": 4.957281553398058e-05, + "loss": 0.5285, + "step": 925 + }, + { + "epoch": 0.8663251047973917, + "grad_norm": 0.5682054260649863, + "learning_rate": 4.954045307443366e-05, + "loss": 0.531, + "step": 930 + }, + { + "epoch": 0.8709827666511412, + "grad_norm": 0.4003926309716626, + "learning_rate": 4.950809061488673e-05, + "loss": 0.516, + "step": 935 + }, + { + "epoch": 0.8756404285048905, + "grad_norm": 0.36583126445264214, + "learning_rate": 4.9475728155339806e-05, + "loss": 0.5147, + "step": 940 + }, + { + "epoch": 0.8802980903586399, + "grad_norm": 0.34710755584775155, + "learning_rate": 4.944336569579288e-05, + "loss": 0.521, + "step": 945 + }, + { + "epoch": 0.8849557522123894, + "grad_norm": 0.33969459451512213, + "learning_rate": 4.941100323624596e-05, + "loss": 0.508, + "step": 950 + }, + { + "epoch": 0.8896134140661388, + "grad_norm": 0.3667209891990349, + "learning_rate": 4.937864077669903e-05, + "loss": 0.5247, + "step": 955 + }, + { + "epoch": 0.8942710759198882, + "grad_norm": 0.3628895818072265, + "learning_rate": 4.9346278317152106e-05, + "loss": 0.5301, + "step": 960 + }, + { + "epoch": 0.8989287377736377, + "grad_norm": 0.3743557642518737, + "learning_rate": 4.931391585760518e-05, + "loss": 0.5236, + "step": 965 + }, + { + "epoch": 0.9035863996273871, + "grad_norm": 0.3571762409530922, + "learning_rate": 4.928155339805826e-05, + "loss": 0.527, + "step": 970 + }, + { + "epoch": 0.9082440614811365, + "grad_norm": 0.43902683870985737, + "learning_rate": 4.924919093851133e-05, + "loss": 0.5212, + "step": 975 + }, + { + "epoch": 0.9129017233348858, + "grad_norm": 0.39354640530264057, + "learning_rate": 4.9216828478964405e-05, + "loss": 0.5244, + "step": 980 + }, + { + "epoch": 0.9175593851886353, + "grad_norm": 0.3496362068420844, + "learning_rate": 4.918446601941748e-05, + "loss": 0.5295, + "step": 985 + }, + { + "epoch": 0.9222170470423847, + "grad_norm": 0.4353930009984086, + "learning_rate": 4.915210355987055e-05, + "loss": 0.4976, + "step": 990 + }, + { + "epoch": 0.9268747088961341, + "grad_norm": 0.3566344310918772, + "learning_rate": 4.911974110032363e-05, + "loss": 0.5152, + "step": 995 + }, + { + "epoch": 0.9315323707498836, + "grad_norm": 0.38805577658270657, + "learning_rate": 4.90873786407767e-05, + "loss": 0.519, + "step": 1000 + }, + { + "epoch": 0.936190032603633, + "grad_norm": 0.3883213881506984, + "learning_rate": 4.9055016181229774e-05, + "loss": 0.5304, + "step": 1005 + }, + { + "epoch": 0.9408476944573824, + "grad_norm": 0.3729784939764993, + "learning_rate": 4.902265372168285e-05, + "loss": 0.5162, + "step": 1010 + }, + { + "epoch": 0.9455053563111319, + "grad_norm": 0.39356886816717457, + "learning_rate": 4.899029126213592e-05, + "loss": 0.5194, + "step": 1015 + }, + { + "epoch": 0.9501630181648812, + "grad_norm": 0.37144096012401695, + "learning_rate": 4.8957928802589e-05, + "loss": 0.5292, + "step": 1020 + }, + { + "epoch": 0.9548206800186306, + "grad_norm": 0.3960035947632593, + "learning_rate": 4.8925566343042074e-05, + "loss": 0.5193, + "step": 1025 + }, + { + "epoch": 0.95947834187238, + "grad_norm": 0.3625631338673291, + "learning_rate": 4.889320388349515e-05, + "loss": 0.5099, + "step": 1030 + }, + { + "epoch": 0.9641360037261295, + "grad_norm": 0.37924346646168744, + "learning_rate": 4.886084142394822e-05, + "loss": 0.5146, + "step": 1035 + }, + { + "epoch": 0.9687936655798789, + "grad_norm": 0.35148813999705747, + "learning_rate": 4.88284789644013e-05, + "loss": 0.5117, + "step": 1040 + }, + { + "epoch": 0.9734513274336283, + "grad_norm": 0.33290102128014737, + "learning_rate": 4.879611650485437e-05, + "loss": 0.514, + "step": 1045 + }, + { + "epoch": 0.9781089892873778, + "grad_norm": 0.36029983492128725, + "learning_rate": 4.876375404530745e-05, + "loss": 0.5128, + "step": 1050 + }, + { + "epoch": 0.9827666511411272, + "grad_norm": 0.39114010829978446, + "learning_rate": 4.873139158576052e-05, + "loss": 0.5155, + "step": 1055 + }, + { + "epoch": 0.9874243129948765, + "grad_norm": 0.39656745624556927, + "learning_rate": 4.8699029126213596e-05, + "loss": 0.5187, + "step": 1060 + }, + { + "epoch": 0.992081974848626, + "grad_norm": 0.4294193162930166, + "learning_rate": 4.866666666666667e-05, + "loss": 0.5163, + "step": 1065 + }, + { + "epoch": 0.9967396367023754, + "grad_norm": 0.3982263305944793, + "learning_rate": 4.863430420711974e-05, + "loss": 0.5061, + "step": 1070 + }, + { + "epoch": 1.00093153237075, + "grad_norm": 0.5928038015870168, + "learning_rate": 4.860194174757281e-05, + "loss": 0.5253, + "step": 1075 + }, + { + "epoch": 1.0055891942244992, + "grad_norm": 0.45617357290049604, + "learning_rate": 4.856957928802589e-05, + "loss": 0.4908, + "step": 1080 + }, + { + "epoch": 1.0102468560782487, + "grad_norm": 0.39682435507090447, + "learning_rate": 4.8537216828478965e-05, + "loss": 0.4762, + "step": 1085 + }, + { + "epoch": 1.0149045179319982, + "grad_norm": 0.3918327414704858, + "learning_rate": 4.850485436893204e-05, + "loss": 0.4958, + "step": 1090 + }, + { + "epoch": 1.0195621797857475, + "grad_norm": 0.3836278471157271, + "learning_rate": 4.847249190938511e-05, + "loss": 0.4921, + "step": 1095 + }, + { + "epoch": 1.024219841639497, + "grad_norm": 0.3563021906682752, + "learning_rate": 4.844012944983819e-05, + "loss": 0.4944, + "step": 1100 + }, + { + "epoch": 1.0288775034932465, + "grad_norm": 0.35054296656861605, + "learning_rate": 4.8407766990291265e-05, + "loss": 0.4916, + "step": 1105 + }, + { + "epoch": 1.0335351653469957, + "grad_norm": 0.4068508503935885, + "learning_rate": 4.837540453074434e-05, + "loss": 0.4887, + "step": 1110 + }, + { + "epoch": 1.0381928272007452, + "grad_norm": 0.38370359471299786, + "learning_rate": 4.834304207119741e-05, + "loss": 0.4801, + "step": 1115 + }, + { + "epoch": 1.0428504890544947, + "grad_norm": 0.434600802675546, + "learning_rate": 4.831067961165049e-05, + "loss": 0.4904, + "step": 1120 + }, + { + "epoch": 1.047508150908244, + "grad_norm": 0.34054843848982447, + "learning_rate": 4.8278317152103564e-05, + "loss": 0.4967, + "step": 1125 + }, + { + "epoch": 1.0521658127619935, + "grad_norm": 0.3811863603092705, + "learning_rate": 4.824595469255664e-05, + "loss": 0.4838, + "step": 1130 + }, + { + "epoch": 1.056823474615743, + "grad_norm": 0.36035159110419396, + "learning_rate": 4.821359223300971e-05, + "loss": 0.4906, + "step": 1135 + }, + { + "epoch": 1.0614811364694923, + "grad_norm": 0.36681536116027774, + "learning_rate": 4.818122977346279e-05, + "loss": 0.4906, + "step": 1140 + }, + { + "epoch": 1.0661387983232418, + "grad_norm": 0.38080197685374506, + "learning_rate": 4.814886731391586e-05, + "loss": 0.4898, + "step": 1145 + }, + { + "epoch": 1.0707964601769913, + "grad_norm": 0.3583265769522901, + "learning_rate": 4.8116504854368934e-05, + "loss": 0.4886, + "step": 1150 + }, + { + "epoch": 1.0754541220307405, + "grad_norm": 0.4361753476463697, + "learning_rate": 4.808414239482201e-05, + "loss": 0.4902, + "step": 1155 + }, + { + "epoch": 1.08011178388449, + "grad_norm": 0.4099438655688987, + "learning_rate": 4.805177993527508e-05, + "loss": 0.4985, + "step": 1160 + }, + { + "epoch": 1.0847694457382393, + "grad_norm": 0.3625829222877311, + "learning_rate": 4.801941747572816e-05, + "loss": 0.4962, + "step": 1165 + }, + { + "epoch": 1.0894271075919888, + "grad_norm": 0.35932190699829, + "learning_rate": 4.798705501618123e-05, + "loss": 0.4922, + "step": 1170 + }, + { + "epoch": 1.0940847694457383, + "grad_norm": 0.38128309499882657, + "learning_rate": 4.795469255663431e-05, + "loss": 0.4944, + "step": 1175 + }, + { + "epoch": 1.0987424312994876, + "grad_norm": 0.3991505478965423, + "learning_rate": 4.792233009708738e-05, + "loss": 0.4856, + "step": 1180 + }, + { + "epoch": 1.103400093153237, + "grad_norm": 0.37327909814375987, + "learning_rate": 4.7889967637540456e-05, + "loss": 0.4835, + "step": 1185 + }, + { + "epoch": 1.1080577550069866, + "grad_norm": 0.33891412197653836, + "learning_rate": 4.785760517799353e-05, + "loss": 0.4658, + "step": 1190 + }, + { + "epoch": 1.1127154168607358, + "grad_norm": 0.3474094654759008, + "learning_rate": 4.78252427184466e-05, + "loss": 0.4768, + "step": 1195 + }, + { + "epoch": 1.1173730787144853, + "grad_norm": 0.36244721849020517, + "learning_rate": 4.779288025889968e-05, + "loss": 0.4836, + "step": 1200 + }, + { + "epoch": 1.1220307405682348, + "grad_norm": 0.4033010179698364, + "learning_rate": 4.7760517799352756e-05, + "loss": 0.497, + "step": 1205 + }, + { + "epoch": 1.126688402421984, + "grad_norm": 0.37809180198013964, + "learning_rate": 4.772815533980583e-05, + "loss": 0.4916, + "step": 1210 + }, + { + "epoch": 1.1313460642757336, + "grad_norm": 0.36933723525949036, + "learning_rate": 4.76957928802589e-05, + "loss": 0.4858, + "step": 1215 + }, + { + "epoch": 1.136003726129483, + "grad_norm": 0.36142771353078024, + "learning_rate": 4.766343042071197e-05, + "loss": 0.4938, + "step": 1220 + }, + { + "epoch": 1.1406613879832324, + "grad_norm": 0.39343722932343206, + "learning_rate": 4.763106796116505e-05, + "loss": 0.4846, + "step": 1225 + }, + { + "epoch": 1.1453190498369819, + "grad_norm": 0.38536600679449245, + "learning_rate": 4.7598705501618125e-05, + "loss": 0.4826, + "step": 1230 + }, + { + "epoch": 1.1499767116907313, + "grad_norm": 0.3208835229721257, + "learning_rate": 4.75663430420712e-05, + "loss": 0.4875, + "step": 1235 + }, + { + "epoch": 1.1546343735444806, + "grad_norm": 0.3045647033657388, + "learning_rate": 4.753398058252427e-05, + "loss": 0.4965, + "step": 1240 + }, + { + "epoch": 1.1592920353982301, + "grad_norm": 0.3330554785430036, + "learning_rate": 4.750161812297735e-05, + "loss": 0.4906, + "step": 1245 + }, + { + "epoch": 1.1639496972519794, + "grad_norm": 0.3751461447400107, + "learning_rate": 4.7469255663430424e-05, + "loss": 0.5014, + "step": 1250 + }, + { + "epoch": 1.1686073591057289, + "grad_norm": 0.34075871643973377, + "learning_rate": 4.74368932038835e-05, + "loss": 0.4878, + "step": 1255 + }, + { + "epoch": 1.1732650209594784, + "grad_norm": 0.3513756614443728, + "learning_rate": 4.740453074433657e-05, + "loss": 0.4967, + "step": 1260 + }, + { + "epoch": 1.1779226828132279, + "grad_norm": 0.3364889150024333, + "learning_rate": 4.737216828478965e-05, + "loss": 0.4827, + "step": 1265 + }, + { + "epoch": 1.1825803446669771, + "grad_norm": 0.32836418576161563, + "learning_rate": 4.7339805825242724e-05, + "loss": 0.475, + "step": 1270 + }, + { + "epoch": 1.1872380065207266, + "grad_norm": 0.34996683855701755, + "learning_rate": 4.73074433656958e-05, + "loss": 0.4812, + "step": 1275 + }, + { + "epoch": 1.191895668374476, + "grad_norm": 0.4025013401969635, + "learning_rate": 4.727508090614887e-05, + "loss": 0.4918, + "step": 1280 + }, + { + "epoch": 1.1965533302282254, + "grad_norm": 0.3401225034506895, + "learning_rate": 4.724271844660194e-05, + "loss": 0.4926, + "step": 1285 + }, + { + "epoch": 1.201210992081975, + "grad_norm": 0.31003980724430014, + "learning_rate": 4.7210355987055017e-05, + "loss": 0.4791, + "step": 1290 + }, + { + "epoch": 1.2058686539357242, + "grad_norm": 0.35347481534080316, + "learning_rate": 4.717799352750809e-05, + "loss": 0.485, + "step": 1295 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.35989687260611175, + "learning_rate": 4.714563106796116e-05, + "loss": 0.4804, + "step": 1300 + }, + { + "epoch": 1.2151839776432232, + "grad_norm": 0.35019515606373103, + "learning_rate": 4.711326860841424e-05, + "loss": 0.4915, + "step": 1305 + }, + { + "epoch": 1.2198416394969724, + "grad_norm": 0.3586166735739774, + "learning_rate": 4.7080906148867316e-05, + "loss": 0.4943, + "step": 1310 + }, + { + "epoch": 1.224499301350722, + "grad_norm": 0.30463953467892224, + "learning_rate": 4.704854368932039e-05, + "loss": 0.4836, + "step": 1315 + }, + { + "epoch": 1.2291569632044714, + "grad_norm": 0.3451363142622922, + "learning_rate": 4.701618122977346e-05, + "loss": 0.4955, + "step": 1320 + }, + { + "epoch": 1.2338146250582207, + "grad_norm": 0.34909714884580134, + "learning_rate": 4.698381877022654e-05, + "loss": 0.4803, + "step": 1325 + }, + { + "epoch": 1.2384722869119702, + "grad_norm": 0.36522864866273713, + "learning_rate": 4.6951456310679615e-05, + "loss": 0.4923, + "step": 1330 + }, + { + "epoch": 1.2431299487657197, + "grad_norm": 0.39221862966053167, + "learning_rate": 4.691909385113269e-05, + "loss": 0.4857, + "step": 1335 + }, + { + "epoch": 1.247787610619469, + "grad_norm": 0.39668592667455693, + "learning_rate": 4.688673139158576e-05, + "loss": 0.4836, + "step": 1340 + }, + { + "epoch": 1.2524452724732185, + "grad_norm": 0.3189900074415766, + "learning_rate": 4.685436893203884e-05, + "loss": 0.4733, + "step": 1345 + }, + { + "epoch": 1.257102934326968, + "grad_norm": 0.4752552185352753, + "learning_rate": 4.6822006472491915e-05, + "loss": 0.4837, + "step": 1350 + }, + { + "epoch": 1.2617605961807172, + "grad_norm": 0.34317843753923166, + "learning_rate": 4.678964401294499e-05, + "loss": 0.4762, + "step": 1355 + }, + { + "epoch": 1.2664182580344667, + "grad_norm": 0.331883711021671, + "learning_rate": 4.675728155339806e-05, + "loss": 0.4694, + "step": 1360 + }, + { + "epoch": 1.271075919888216, + "grad_norm": 0.31579302865236475, + "learning_rate": 4.672491909385113e-05, + "loss": 0.4874, + "step": 1365 + }, + { + "epoch": 1.2757335817419655, + "grad_norm": 0.33947923252160384, + "learning_rate": 4.669255663430421e-05, + "loss": 0.4833, + "step": 1370 + }, + { + "epoch": 1.280391243595715, + "grad_norm": 0.3561249490177733, + "learning_rate": 4.6660194174757284e-05, + "loss": 0.4831, + "step": 1375 + }, + { + "epoch": 1.2850489054494645, + "grad_norm": 0.3220762092182873, + "learning_rate": 4.6627831715210354e-05, + "loss": 0.4827, + "step": 1380 + }, + { + "epoch": 1.2897065673032138, + "grad_norm": 0.3251770474209212, + "learning_rate": 4.659546925566343e-05, + "loss": 0.484, + "step": 1385 + }, + { + "epoch": 1.2943642291569633, + "grad_norm": 0.36465163745930795, + "learning_rate": 4.656310679611651e-05, + "loss": 0.4922, + "step": 1390 + }, + { + "epoch": 1.2990218910107125, + "grad_norm": 0.32665155125372775, + "learning_rate": 4.6530744336569584e-05, + "loss": 0.476, + "step": 1395 + }, + { + "epoch": 1.303679552864462, + "grad_norm": 0.33996752375205314, + "learning_rate": 4.6498381877022653e-05, + "loss": 0.4903, + "step": 1400 + }, + { + "epoch": 1.3083372147182115, + "grad_norm": 0.31472307161627344, + "learning_rate": 4.646601941747573e-05, + "loss": 0.4793, + "step": 1405 + }, + { + "epoch": 1.312994876571961, + "grad_norm": 0.3373270351504413, + "learning_rate": 4.6433656957928807e-05, + "loss": 0.4762, + "step": 1410 + }, + { + "epoch": 1.3176525384257103, + "grad_norm": 0.33123321376978476, + "learning_rate": 4.640129449838188e-05, + "loss": 0.4967, + "step": 1415 + }, + { + "epoch": 1.3223102002794598, + "grad_norm": 0.31722496467412775, + "learning_rate": 4.636893203883495e-05, + "loss": 0.4839, + "step": 1420 + }, + { + "epoch": 1.326967862133209, + "grad_norm": 0.34349051465183933, + "learning_rate": 4.633656957928803e-05, + "loss": 0.4782, + "step": 1425 + }, + { + "epoch": 1.3316255239869585, + "grad_norm": 0.3587305409881593, + "learning_rate": 4.63042071197411e-05, + "loss": 0.4771, + "step": 1430 + }, + { + "epoch": 1.336283185840708, + "grad_norm": 0.3445600613600092, + "learning_rate": 4.6271844660194176e-05, + "loss": 0.482, + "step": 1435 + }, + { + "epoch": 1.3409408476944573, + "grad_norm": 0.3153330747521536, + "learning_rate": 4.623948220064725e-05, + "loss": 0.485, + "step": 1440 + }, + { + "epoch": 1.3455985095482068, + "grad_norm": 0.33287271332785706, + "learning_rate": 4.620711974110032e-05, + "loss": 0.4933, + "step": 1445 + }, + { + "epoch": 1.350256171401956, + "grad_norm": 0.34406012611925313, + "learning_rate": 4.61747572815534e-05, + "loss": 0.481, + "step": 1450 + }, + { + "epoch": 1.3549138332557056, + "grad_norm": 0.3575650526851001, + "learning_rate": 4.6142394822006475e-05, + "loss": 0.4914, + "step": 1455 + }, + { + "epoch": 1.359571495109455, + "grad_norm": 0.3197678788993809, + "learning_rate": 4.611003236245955e-05, + "loss": 0.4716, + "step": 1460 + }, + { + "epoch": 1.3642291569632046, + "grad_norm": 0.35048652815717235, + "learning_rate": 4.607766990291262e-05, + "loss": 0.4786, + "step": 1465 + }, + { + "epoch": 1.3688868188169538, + "grad_norm": 0.3299342177568497, + "learning_rate": 4.60453074433657e-05, + "loss": 0.4831, + "step": 1470 + }, + { + "epoch": 1.3735444806707033, + "grad_norm": 0.35020584888995054, + "learning_rate": 4.6012944983818775e-05, + "loss": 0.4751, + "step": 1475 + }, + { + "epoch": 1.3782021425244526, + "grad_norm": 0.34466146202234915, + "learning_rate": 4.5980582524271845e-05, + "loss": 0.4754, + "step": 1480 + }, + { + "epoch": 1.382859804378202, + "grad_norm": 0.3326105229693496, + "learning_rate": 4.594822006472492e-05, + "loss": 0.4814, + "step": 1485 + }, + { + "epoch": 1.3875174662319516, + "grad_norm": 0.3863251103968962, + "learning_rate": 4.5915857605178e-05, + "loss": 0.4856, + "step": 1490 + }, + { + "epoch": 1.392175128085701, + "grad_norm": 0.39869131402062374, + "learning_rate": 4.5883495145631074e-05, + "loss": 0.4872, + "step": 1495 + }, + { + "epoch": 1.3968327899394504, + "grad_norm": 0.3236670357940102, + "learning_rate": 4.5851132686084144e-05, + "loss": 0.4744, + "step": 1500 + }, + { + "epoch": 1.4014904517931999, + "grad_norm": 0.33671866535084927, + "learning_rate": 4.5818770226537214e-05, + "loss": 0.4782, + "step": 1505 + }, + { + "epoch": 1.4061481136469491, + "grad_norm": 0.3964201377988781, + "learning_rate": 4.578640776699029e-05, + "loss": 0.4782, + "step": 1510 + }, + { + "epoch": 1.4108057755006986, + "grad_norm": 0.3160900921197823, + "learning_rate": 4.575404530744337e-05, + "loss": 0.4895, + "step": 1515 + }, + { + "epoch": 1.4154634373544481, + "grad_norm": 0.32777499194109583, + "learning_rate": 4.5721682847896444e-05, + "loss": 0.4787, + "step": 1520 + }, + { + "epoch": 1.4201210992081974, + "grad_norm": 0.3903507215961828, + "learning_rate": 4.568932038834951e-05, + "loss": 0.4778, + "step": 1525 + }, + { + "epoch": 1.424778761061947, + "grad_norm": 0.35785839178718776, + "learning_rate": 4.565695792880259e-05, + "loss": 0.4712, + "step": 1530 + }, + { + "epoch": 1.4294364229156964, + "grad_norm": 0.3117173347583603, + "learning_rate": 4.5624595469255666e-05, + "loss": 0.4855, + "step": 1535 + }, + { + "epoch": 1.4340940847694457, + "grad_norm": 0.3068803654580654, + "learning_rate": 4.559223300970874e-05, + "loss": 0.4755, + "step": 1540 + }, + { + "epoch": 1.4387517466231952, + "grad_norm": 0.3429070075799174, + "learning_rate": 4.555987055016181e-05, + "loss": 0.4783, + "step": 1545 + }, + { + "epoch": 1.4434094084769447, + "grad_norm": 0.34774860374178423, + "learning_rate": 4.552750809061489e-05, + "loss": 0.4721, + "step": 1550 + }, + { + "epoch": 1.448067070330694, + "grad_norm": 0.3133469650282993, + "learning_rate": 4.5495145631067966e-05, + "loss": 0.468, + "step": 1555 + }, + { + "epoch": 1.4527247321844434, + "grad_norm": 0.32300725004307296, + "learning_rate": 4.546278317152104e-05, + "loss": 0.4754, + "step": 1560 + }, + { + "epoch": 1.4573823940381927, + "grad_norm": 0.3654645603530482, + "learning_rate": 4.543042071197411e-05, + "loss": 0.4856, + "step": 1565 + }, + { + "epoch": 1.4620400558919422, + "grad_norm": 0.3144382775868269, + "learning_rate": 4.539805825242719e-05, + "loss": 0.5011, + "step": 1570 + }, + { + "epoch": 1.4666977177456917, + "grad_norm": 0.32738808814400805, + "learning_rate": 4.536569579288026e-05, + "loss": 0.4796, + "step": 1575 + }, + { + "epoch": 1.4713553795994412, + "grad_norm": 0.33065024289653616, + "learning_rate": 4.5333333333333335e-05, + "loss": 0.467, + "step": 1580 + }, + { + "epoch": 1.4760130414531905, + "grad_norm": 0.31408876095328725, + "learning_rate": 4.5300970873786405e-05, + "loss": 0.4766, + "step": 1585 + }, + { + "epoch": 1.48067070330694, + "grad_norm": 0.33132227281018756, + "learning_rate": 4.526860841423948e-05, + "loss": 0.4774, + "step": 1590 + }, + { + "epoch": 1.4853283651606892, + "grad_norm": 0.3267028107171943, + "learning_rate": 4.523624595469256e-05, + "loss": 0.4721, + "step": 1595 + }, + { + "epoch": 1.4899860270144387, + "grad_norm": 0.34183969397437214, + "learning_rate": 4.5203883495145635e-05, + "loss": 0.4808, + "step": 1600 + }, + { + "epoch": 1.4946436888681882, + "grad_norm": 0.3341079510588814, + "learning_rate": 4.5171521035598705e-05, + "loss": 0.4712, + "step": 1605 + }, + { + "epoch": 1.4993013507219377, + "grad_norm": 0.3637929050334216, + "learning_rate": 4.513915857605178e-05, + "loss": 0.4769, + "step": 1610 + }, + { + "epoch": 1.503959012575687, + "grad_norm": 0.3614116465559769, + "learning_rate": 4.510679611650486e-05, + "loss": 0.4793, + "step": 1615 + }, + { + "epoch": 1.5086166744294365, + "grad_norm": 0.3282234536811554, + "learning_rate": 4.5074433656957934e-05, + "loss": 0.4756, + "step": 1620 + }, + { + "epoch": 1.5132743362831858, + "grad_norm": 0.3174096727244024, + "learning_rate": 4.5042071197411004e-05, + "loss": 0.4821, + "step": 1625 + }, + { + "epoch": 1.5179319981369352, + "grad_norm": 0.33054476333996435, + "learning_rate": 4.500970873786408e-05, + "loss": 0.4787, + "step": 1630 + }, + { + "epoch": 1.5225896599906847, + "grad_norm": 0.3143046040017771, + "learning_rate": 4.497734627831716e-05, + "loss": 0.4903, + "step": 1635 + }, + { + "epoch": 1.5272473218444342, + "grad_norm": 0.36640286783015863, + "learning_rate": 4.4944983818770234e-05, + "loss": 0.4833, + "step": 1640 + }, + { + "epoch": 1.5319049836981835, + "grad_norm": 0.3012687202528479, + "learning_rate": 4.4912621359223303e-05, + "loss": 0.4893, + "step": 1645 + }, + { + "epoch": 1.5365626455519328, + "grad_norm": 0.37938595625196225, + "learning_rate": 4.488025889967637e-05, + "loss": 0.4711, + "step": 1650 + }, + { + "epoch": 1.5412203074056823, + "grad_norm": 0.33746086958512767, + "learning_rate": 4.484789644012945e-05, + "loss": 0.469, + "step": 1655 + }, + { + "epoch": 1.5458779692594318, + "grad_norm": 0.3195215446211632, + "learning_rate": 4.4815533980582526e-05, + "loss": 0.4669, + "step": 1660 + }, + { + "epoch": 1.5505356311131813, + "grad_norm": 0.31017867155169293, + "learning_rate": 4.4783171521035596e-05, + "loss": 0.4813, + "step": 1665 + }, + { + "epoch": 1.5551932929669308, + "grad_norm": 0.35887769881626985, + "learning_rate": 4.475080906148867e-05, + "loss": 0.4752, + "step": 1670 + }, + { + "epoch": 1.55985095482068, + "grad_norm": 0.31528761907295816, + "learning_rate": 4.471844660194175e-05, + "loss": 0.4722, + "step": 1675 + }, + { + "epoch": 1.5645086166744293, + "grad_norm": 0.29713048688592464, + "learning_rate": 4.4686084142394826e-05, + "loss": 0.4816, + "step": 1680 + }, + { + "epoch": 1.5691662785281788, + "grad_norm": 0.34501437275635716, + "learning_rate": 4.4653721682847896e-05, + "loss": 0.4711, + "step": 1685 + }, + { + "epoch": 1.5738239403819283, + "grad_norm": 0.3193644835414442, + "learning_rate": 4.462135922330097e-05, + "loss": 0.4772, + "step": 1690 + }, + { + "epoch": 1.5784816022356778, + "grad_norm": 0.3525186050167732, + "learning_rate": 4.458899676375405e-05, + "loss": 0.4752, + "step": 1695 + }, + { + "epoch": 1.583139264089427, + "grad_norm": 0.3390490212094124, + "learning_rate": 4.4556634304207125e-05, + "loss": 0.4801, + "step": 1700 + }, + { + "epoch": 1.5877969259431766, + "grad_norm": 0.27199321463712367, + "learning_rate": 4.4524271844660195e-05, + "loss": 0.4732, + "step": 1705 + }, + { + "epoch": 1.5924545877969258, + "grad_norm": 0.33136111443589444, + "learning_rate": 4.449190938511327e-05, + "loss": 0.4794, + "step": 1710 + }, + { + "epoch": 1.5971122496506753, + "grad_norm": 0.33788165112244023, + "learning_rate": 4.445954692556635e-05, + "loss": 0.4869, + "step": 1715 + }, + { + "epoch": 1.6017699115044248, + "grad_norm": 0.2771764238057386, + "learning_rate": 4.442718446601942e-05, + "loss": 0.4701, + "step": 1720 + }, + { + "epoch": 1.6064275733581743, + "grad_norm": 0.3085685435375194, + "learning_rate": 4.4394822006472495e-05, + "loss": 0.4722, + "step": 1725 + }, + { + "epoch": 1.6110852352119236, + "grad_norm": 0.3213685919603364, + "learning_rate": 4.4362459546925564e-05, + "loss": 0.4854, + "step": 1730 + }, + { + "epoch": 1.6157428970656729, + "grad_norm": 0.3449226743664926, + "learning_rate": 4.433009708737864e-05, + "loss": 0.4831, + "step": 1735 + }, + { + "epoch": 1.6204005589194224, + "grad_norm": 0.2819191908463559, + "learning_rate": 4.429773462783172e-05, + "loss": 0.4626, + "step": 1740 + }, + { + "epoch": 1.6250582207731719, + "grad_norm": 0.2803367610061562, + "learning_rate": 4.4265372168284794e-05, + "loss": 0.4983, + "step": 1745 + }, + { + "epoch": 1.6297158826269214, + "grad_norm": 0.3032620461460716, + "learning_rate": 4.4233009708737864e-05, + "loss": 0.4688, + "step": 1750 + }, + { + "epoch": 1.6343735444806708, + "grad_norm": 0.31143778436372727, + "learning_rate": 4.420064724919094e-05, + "loss": 0.4878, + "step": 1755 + }, + { + "epoch": 1.6390312063344201, + "grad_norm": 0.29059732424768564, + "learning_rate": 4.416828478964402e-05, + "loss": 0.4662, + "step": 1760 + }, + { + "epoch": 1.6436888681881694, + "grad_norm": 0.30902263729331736, + "learning_rate": 4.4135922330097094e-05, + "loss": 0.4735, + "step": 1765 + }, + { + "epoch": 1.648346530041919, + "grad_norm": 0.3067477355060161, + "learning_rate": 4.410355987055016e-05, + "loss": 0.4659, + "step": 1770 + }, + { + "epoch": 1.6530041918956684, + "grad_norm": 0.30766225324156005, + "learning_rate": 4.407119741100324e-05, + "loss": 0.4839, + "step": 1775 + }, + { + "epoch": 1.6576618537494179, + "grad_norm": 0.3060356213831821, + "learning_rate": 4.4038834951456316e-05, + "loss": 0.4683, + "step": 1780 + }, + { + "epoch": 1.6623195156031674, + "grad_norm": 0.27993399458596613, + "learning_rate": 4.4006472491909386e-05, + "loss": 0.4787, + "step": 1785 + }, + { + "epoch": 1.6669771774569166, + "grad_norm": 0.2878538280302826, + "learning_rate": 4.397411003236246e-05, + "loss": 0.469, + "step": 1790 + }, + { + "epoch": 1.671634839310666, + "grad_norm": 0.3190507344099696, + "learning_rate": 4.394174757281553e-05, + "loss": 0.467, + "step": 1795 + }, + { + "epoch": 1.6762925011644154, + "grad_norm": 0.2740825398174676, + "learning_rate": 4.390938511326861e-05, + "loss": 0.4663, + "step": 1800 + }, + { + "epoch": 1.680950163018165, + "grad_norm": 0.3029113523627245, + "learning_rate": 4.3877022653721686e-05, + "loss": 0.473, + "step": 1805 + }, + { + "epoch": 1.6856078248719144, + "grad_norm": 0.30275582143151997, + "learning_rate": 4.3844660194174756e-05, + "loss": 0.4714, + "step": 1810 + }, + { + "epoch": 1.6902654867256637, + "grad_norm": 0.29018400666960237, + "learning_rate": 4.381229773462783e-05, + "loss": 0.4731, + "step": 1815 + }, + { + "epoch": 1.6949231485794132, + "grad_norm": 0.31944630955568515, + "learning_rate": 4.377993527508091e-05, + "loss": 0.4569, + "step": 1820 + }, + { + "epoch": 1.6995808104331624, + "grad_norm": 0.33676422137045936, + "learning_rate": 4.3747572815533985e-05, + "loss": 0.4633, + "step": 1825 + }, + { + "epoch": 1.704238472286912, + "grad_norm": 0.3396901223033333, + "learning_rate": 4.3715210355987055e-05, + "loss": 0.4643, + "step": 1830 + }, + { + "epoch": 1.7088961341406614, + "grad_norm": 0.3366830543419861, + "learning_rate": 4.368284789644013e-05, + "loss": 0.474, + "step": 1835 + }, + { + "epoch": 1.713553795994411, + "grad_norm": 0.3398454823387078, + "learning_rate": 4.365048543689321e-05, + "loss": 0.4716, + "step": 1840 + }, + { + "epoch": 1.7182114578481602, + "grad_norm": 0.3093101686513045, + "learning_rate": 4.3618122977346285e-05, + "loss": 0.4734, + "step": 1845 + }, + { + "epoch": 1.7228691197019095, + "grad_norm": 0.28865990425667754, + "learning_rate": 4.3585760517799354e-05, + "loss": 0.4843, + "step": 1850 + }, + { + "epoch": 1.727526781555659, + "grad_norm": 0.30938960750962463, + "learning_rate": 4.355339805825243e-05, + "loss": 0.4608, + "step": 1855 + }, + { + "epoch": 1.7321844434094085, + "grad_norm": 0.303570178840196, + "learning_rate": 4.352103559870551e-05, + "loss": 0.4938, + "step": 1860 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.320398560744746, + "learning_rate": 4.348867313915858e-05, + "loss": 0.4687, + "step": 1865 + }, + { + "epoch": 1.7414997671169075, + "grad_norm": 0.28934725217636076, + "learning_rate": 4.345631067961165e-05, + "loss": 0.4767, + "step": 1870 + }, + { + "epoch": 1.7461574289706567, + "grad_norm": 0.3082083156804532, + "learning_rate": 4.3423948220064724e-05, + "loss": 0.4917, + "step": 1875 + }, + { + "epoch": 1.750815090824406, + "grad_norm": 0.33073903630019985, + "learning_rate": 4.33915857605178e-05, + "loss": 0.48, + "step": 1880 + }, + { + "epoch": 1.7554727526781555, + "grad_norm": 0.3795705747571329, + "learning_rate": 4.335922330097088e-05, + "loss": 0.4818, + "step": 1885 + }, + { + "epoch": 1.760130414531905, + "grad_norm": 0.33196205439252985, + "learning_rate": 4.332686084142395e-05, + "loss": 0.4833, + "step": 1890 + }, + { + "epoch": 1.7647880763856545, + "grad_norm": 0.3755261858357244, + "learning_rate": 4.329449838187702e-05, + "loss": 0.4882, + "step": 1895 + }, + { + "epoch": 1.7694457382394038, + "grad_norm": 0.3036362546197639, + "learning_rate": 4.32621359223301e-05, + "loss": 0.4758, + "step": 1900 + }, + { + "epoch": 1.7741034000931533, + "grad_norm": 0.30502050664737707, + "learning_rate": 4.3229773462783176e-05, + "loss": 0.475, + "step": 1905 + }, + { + "epoch": 1.7787610619469025, + "grad_norm": 0.3157357177209518, + "learning_rate": 4.3197411003236246e-05, + "loss": 0.466, + "step": 1910 + }, + { + "epoch": 1.783418723800652, + "grad_norm": 0.3040131152683334, + "learning_rate": 4.316504854368932e-05, + "loss": 0.4577, + "step": 1915 + }, + { + "epoch": 1.7880763856544015, + "grad_norm": 0.3187185944812107, + "learning_rate": 4.31326860841424e-05, + "loss": 0.4669, + "step": 1920 + }, + { + "epoch": 1.792734047508151, + "grad_norm": 0.33931947982120053, + "learning_rate": 4.3100323624595476e-05, + "loss": 0.4735, + "step": 1925 + }, + { + "epoch": 1.7973917093619003, + "grad_norm": 0.32734347976657, + "learning_rate": 4.3067961165048546e-05, + "loss": 0.4672, + "step": 1930 + }, + { + "epoch": 1.8020493712156498, + "grad_norm": 0.4692029460469582, + "learning_rate": 4.3035598705501615e-05, + "loss": 0.4613, + "step": 1935 + }, + { + "epoch": 1.806707033069399, + "grad_norm": 0.32483802088569746, + "learning_rate": 4.300323624595469e-05, + "loss": 0.4636, + "step": 1940 + }, + { + "epoch": 1.8113646949231486, + "grad_norm": 0.302012305602097, + "learning_rate": 4.297087378640777e-05, + "loss": 0.4772, + "step": 1945 + }, + { + "epoch": 1.816022356776898, + "grad_norm": 0.31280787882848426, + "learning_rate": 4.2938511326860845e-05, + "loss": 0.4667, + "step": 1950 + }, + { + "epoch": 1.8206800186306475, + "grad_norm": 0.2833157297650169, + "learning_rate": 4.2906148867313915e-05, + "loss": 0.4685, + "step": 1955 + }, + { + "epoch": 1.8253376804843968, + "grad_norm": 0.3057640134824448, + "learning_rate": 4.287378640776699e-05, + "loss": 0.4713, + "step": 1960 + }, + { + "epoch": 1.829995342338146, + "grad_norm": 0.28708015146494686, + "learning_rate": 4.284142394822007e-05, + "loss": 0.4739, + "step": 1965 + }, + { + "epoch": 1.8346530041918956, + "grad_norm": 0.30123275415917106, + "learning_rate": 4.280906148867314e-05, + "loss": 0.4695, + "step": 1970 + }, + { + "epoch": 1.839310666045645, + "grad_norm": 0.3099654672262839, + "learning_rate": 4.2776699029126214e-05, + "loss": 0.4749, + "step": 1975 + }, + { + "epoch": 1.8439683278993946, + "grad_norm": 0.3152945452256832, + "learning_rate": 4.274433656957929e-05, + "loss": 0.4671, + "step": 1980 + }, + { + "epoch": 1.848625989753144, + "grad_norm": 0.31539137515997484, + "learning_rate": 4.271197411003237e-05, + "loss": 0.4843, + "step": 1985 + }, + { + "epoch": 1.8532836516068933, + "grad_norm": 0.33743023774825115, + "learning_rate": 4.267961165048544e-05, + "loss": 0.4742, + "step": 1990 + }, + { + "epoch": 1.8579413134606426, + "grad_norm": 0.278496500840638, + "learning_rate": 4.2647249190938514e-05, + "loss": 0.4571, + "step": 1995 + }, + { + "epoch": 1.8625989753143921, + "grad_norm": 0.2998795993570895, + "learning_rate": 4.261488673139159e-05, + "loss": 0.471, + "step": 2000 + }, + { + "epoch": 1.8672566371681416, + "grad_norm": 0.30700849405318176, + "learning_rate": 4.258252427184467e-05, + "loss": 0.4817, + "step": 2005 + }, + { + "epoch": 1.871914299021891, + "grad_norm": 0.27629264600348125, + "learning_rate": 4.255016181229774e-05, + "loss": 0.4632, + "step": 2010 + }, + { + "epoch": 1.8765719608756404, + "grad_norm": 0.28518434221399247, + "learning_rate": 4.2517799352750807e-05, + "loss": 0.4827, + "step": 2015 + }, + { + "epoch": 1.8812296227293899, + "grad_norm": 0.3021920950459163, + "learning_rate": 4.248543689320388e-05, + "loss": 0.4642, + "step": 2020 + }, + { + "epoch": 1.8858872845831391, + "grad_norm": 0.28974303173801413, + "learning_rate": 4.245307443365696e-05, + "loss": 0.4707, + "step": 2025 + }, + { + "epoch": 1.8905449464368886, + "grad_norm": 0.2944523033092523, + "learning_rate": 4.2420711974110036e-05, + "loss": 0.4664, + "step": 2030 + }, + { + "epoch": 1.8952026082906381, + "grad_norm": 1.2596267443116471, + "learning_rate": 4.2388349514563106e-05, + "loss": 0.463, + "step": 2035 + }, + { + "epoch": 1.8998602701443876, + "grad_norm": 0.3251364871994849, + "learning_rate": 4.235598705501618e-05, + "loss": 0.4769, + "step": 2040 + }, + { + "epoch": 1.904517931998137, + "grad_norm": 0.28708893882914116, + "learning_rate": 4.232362459546926e-05, + "loss": 0.4813, + "step": 2045 + }, + { + "epoch": 1.9091755938518864, + "grad_norm": 0.300276772974375, + "learning_rate": 4.2291262135922336e-05, + "loss": 0.4686, + "step": 2050 + }, + { + "epoch": 1.9138332557056357, + "grad_norm": 0.27252504469429434, + "learning_rate": 4.2258899676375405e-05, + "loss": 0.4681, + "step": 2055 + }, + { + "epoch": 1.9184909175593852, + "grad_norm": 0.2770826992578108, + "learning_rate": 4.222653721682848e-05, + "loss": 0.4632, + "step": 2060 + }, + { + "epoch": 1.9231485794131347, + "grad_norm": 0.2866673869336603, + "learning_rate": 4.219417475728156e-05, + "loss": 0.4678, + "step": 2065 + }, + { + "epoch": 1.9278062412668842, + "grad_norm": 0.3148912097597905, + "learning_rate": 4.2161812297734635e-05, + "loss": 0.4707, + "step": 2070 + }, + { + "epoch": 1.9324639031206334, + "grad_norm": 0.2856113904672565, + "learning_rate": 4.2129449838187705e-05, + "loss": 0.4682, + "step": 2075 + }, + { + "epoch": 1.9371215649743827, + "grad_norm": 0.28939502907971115, + "learning_rate": 4.2097087378640775e-05, + "loss": 0.4603, + "step": 2080 + }, + { + "epoch": 1.9417792268281322, + "grad_norm": 0.2833963329099756, + "learning_rate": 4.206472491909385e-05, + "loss": 0.4753, + "step": 2085 + }, + { + "epoch": 1.9464368886818817, + "grad_norm": 0.2792282266609358, + "learning_rate": 4.203236245954693e-05, + "loss": 0.4526, + "step": 2090 + }, + { + "epoch": 1.9510945505356312, + "grad_norm": 0.27854417338326687, + "learning_rate": 4.2e-05, + "loss": 0.4653, + "step": 2095 + }, + { + "epoch": 1.9557522123893807, + "grad_norm": 0.31580009741056303, + "learning_rate": 4.1967637540453074e-05, + "loss": 0.4634, + "step": 2100 + }, + { + "epoch": 1.96040987424313, + "grad_norm": 0.2919023246129842, + "learning_rate": 4.193527508090615e-05, + "loss": 0.4739, + "step": 2105 + }, + { + "epoch": 1.9650675360968792, + "grad_norm": 0.30597531862356964, + "learning_rate": 4.190291262135923e-05, + "loss": 0.4558, + "step": 2110 + }, + { + "epoch": 1.9697251979506287, + "grad_norm": 0.29143179368567523, + "learning_rate": 4.18705501618123e-05, + "loss": 0.4691, + "step": 2115 + }, + { + "epoch": 1.9743828598043782, + "grad_norm": 0.3179819476003419, + "learning_rate": 4.1838187702265374e-05, + "loss": 0.4723, + "step": 2120 + }, + { + "epoch": 1.9790405216581277, + "grad_norm": 0.31806911713554653, + "learning_rate": 4.180582524271845e-05, + "loss": 0.4618, + "step": 2125 + }, + { + "epoch": 1.983698183511877, + "grad_norm": 0.2831599208032219, + "learning_rate": 4.177346278317153e-05, + "loss": 0.465, + "step": 2130 + }, + { + "epoch": 1.9883558453656265, + "grad_norm": 0.30209993683958286, + "learning_rate": 4.17411003236246e-05, + "loss": 0.4723, + "step": 2135 + }, + { + "epoch": 1.9930135072193758, + "grad_norm": 0.2789050959016751, + "learning_rate": 4.170873786407767e-05, + "loss": 0.4709, + "step": 2140 + }, + { + "epoch": 1.9976711690731253, + "grad_norm": 0.3214563107858177, + "learning_rate": 4.167637540453075e-05, + "loss": 0.4705, + "step": 2145 + }, + { + "epoch": 2.0018630647415, + "grad_norm": 0.34324871696573384, + "learning_rate": 4.1644012944983826e-05, + "loss": 0.4626, + "step": 2150 + }, + { + "epoch": 2.0065207265952494, + "grad_norm": 0.33538344509233275, + "learning_rate": 4.161165048543689e-05, + "loss": 0.4303, + "step": 2155 + }, + { + "epoch": 2.0111783884489984, + "grad_norm": 0.3454953096884954, + "learning_rate": 4.1579288025889966e-05, + "loss": 0.4225, + "step": 2160 + }, + { + "epoch": 2.015836050302748, + "grad_norm": 0.32907002129649626, + "learning_rate": 4.154692556634304e-05, + "loss": 0.429, + "step": 2165 + }, + { + "epoch": 2.0204937121564974, + "grad_norm": 0.30918096504052195, + "learning_rate": 4.151456310679612e-05, + "loss": 0.4152, + "step": 2170 + }, + { + "epoch": 2.025151374010247, + "grad_norm": 0.29841535792838386, + "learning_rate": 4.148220064724919e-05, + "loss": 0.4362, + "step": 2175 + }, + { + "epoch": 2.0298090358639964, + "grad_norm": 0.31922764691509886, + "learning_rate": 4.1449838187702265e-05, + "loss": 0.4226, + "step": 2180 + }, + { + "epoch": 2.034466697717746, + "grad_norm": 0.33711068057941024, + "learning_rate": 4.141747572815534e-05, + "loss": 0.4322, + "step": 2185 + }, + { + "epoch": 2.039124359571495, + "grad_norm": 0.3615982081697286, + "learning_rate": 4.138511326860842e-05, + "loss": 0.429, + "step": 2190 + }, + { + "epoch": 2.0437820214252445, + "grad_norm": 0.2894804248909392, + "learning_rate": 4.135275080906149e-05, + "loss": 0.4268, + "step": 2195 + }, + { + "epoch": 2.048439683278994, + "grad_norm": 0.3094882269270303, + "learning_rate": 4.1320388349514565e-05, + "loss": 0.4262, + "step": 2200 + }, + { + "epoch": 2.0530973451327434, + "grad_norm": 0.3228500916846504, + "learning_rate": 4.128802588996764e-05, + "loss": 0.4323, + "step": 2205 + }, + { + "epoch": 2.057755006986493, + "grad_norm": 0.32065541360901384, + "learning_rate": 4.125566343042072e-05, + "loss": 0.4179, + "step": 2210 + }, + { + "epoch": 2.062412668840242, + "grad_norm": 0.37823517394569717, + "learning_rate": 4.122330097087379e-05, + "loss": 0.4251, + "step": 2215 + }, + { + "epoch": 2.0670703306939915, + "grad_norm": 0.29478585632572873, + "learning_rate": 4.1190938511326864e-05, + "loss": 0.4291, + "step": 2220 + }, + { + "epoch": 2.071727992547741, + "grad_norm": 0.2991406315394289, + "learning_rate": 4.1158576051779934e-05, + "loss": 0.4259, + "step": 2225 + }, + { + "epoch": 2.0763856544014905, + "grad_norm": 0.28207541248218293, + "learning_rate": 4.112621359223301e-05, + "loss": 0.4272, + "step": 2230 + }, + { + "epoch": 2.08104331625524, + "grad_norm": 0.3039635186974918, + "learning_rate": 4.109385113268609e-05, + "loss": 0.421, + "step": 2235 + }, + { + "epoch": 2.0857009781089895, + "grad_norm": 0.29727358441688373, + "learning_rate": 4.106148867313916e-05, + "loss": 0.4321, + "step": 2240 + }, + { + "epoch": 2.0903586399627385, + "grad_norm": 0.2999892387740095, + "learning_rate": 4.1029126213592234e-05, + "loss": 0.4319, + "step": 2245 + }, + { + "epoch": 2.095016301816488, + "grad_norm": 0.3098725249210994, + "learning_rate": 4.099676375404531e-05, + "loss": 0.424, + "step": 2250 + }, + { + "epoch": 2.0996739636702375, + "grad_norm": 0.33377558074464014, + "learning_rate": 4.096440129449839e-05, + "loss": 0.436, + "step": 2255 + }, + { + "epoch": 2.104331625523987, + "grad_norm": 0.3364161596111139, + "learning_rate": 4.0932038834951457e-05, + "loss": 0.4232, + "step": 2260 + }, + { + "epoch": 2.1089892873777365, + "grad_norm": 0.3131140562229925, + "learning_rate": 4.089967637540453e-05, + "loss": 0.4381, + "step": 2265 + }, + { + "epoch": 2.113646949231486, + "grad_norm": 0.32109915934435745, + "learning_rate": 4.086731391585761e-05, + "loss": 0.4265, + "step": 2270 + }, + { + "epoch": 2.118304611085235, + "grad_norm": 0.29268954120346086, + "learning_rate": 4.083495145631068e-05, + "loss": 0.4279, + "step": 2275 + }, + { + "epoch": 2.1229622729389845, + "grad_norm": 0.3249364007907747, + "learning_rate": 4.0802588996763756e-05, + "loss": 0.428, + "step": 2280 + }, + { + "epoch": 2.127619934792734, + "grad_norm": 0.31460519638919976, + "learning_rate": 4.077022653721683e-05, + "loss": 0.4329, + "step": 2285 + }, + { + "epoch": 2.1322775966464835, + "grad_norm": 0.32332636152884364, + "learning_rate": 4.073786407766991e-05, + "loss": 0.4301, + "step": 2290 + }, + { + "epoch": 2.136935258500233, + "grad_norm": 0.28748649337095494, + "learning_rate": 4.070550161812298e-05, + "loss": 0.4387, + "step": 2295 + }, + { + "epoch": 2.1415929203539825, + "grad_norm": 0.29107742775138384, + "learning_rate": 4.067313915857605e-05, + "loss": 0.4387, + "step": 2300 + }, + { + "epoch": 2.1462505822077316, + "grad_norm": 0.2844782439437948, + "learning_rate": 4.0640776699029125e-05, + "loss": 0.4295, + "step": 2305 + }, + { + "epoch": 2.150908244061481, + "grad_norm": 0.27362329963575716, + "learning_rate": 4.06084142394822e-05, + "loss": 0.4137, + "step": 2310 + }, + { + "epoch": 2.1555659059152306, + "grad_norm": 0.28922030486408484, + "learning_rate": 4.057605177993528e-05, + "loss": 0.4211, + "step": 2315 + }, + { + "epoch": 2.16022356776898, + "grad_norm": 0.2783657206027948, + "learning_rate": 4.054368932038835e-05, + "loss": 0.4248, + "step": 2320 + }, + { + "epoch": 2.1648812296227296, + "grad_norm": 0.30625672965034256, + "learning_rate": 4.0511326860841425e-05, + "loss": 0.4344, + "step": 2325 + }, + { + "epoch": 2.1695388914764786, + "grad_norm": 0.29237291366395135, + "learning_rate": 4.04789644012945e-05, + "loss": 0.4367, + "step": 2330 + }, + { + "epoch": 2.174196553330228, + "grad_norm": 0.30061144676071067, + "learning_rate": 4.044660194174758e-05, + "loss": 0.4193, + "step": 2335 + }, + { + "epoch": 2.1788542151839776, + "grad_norm": 0.3034112135905207, + "learning_rate": 4.041423948220065e-05, + "loss": 0.433, + "step": 2340 + }, + { + "epoch": 2.183511877037727, + "grad_norm": 0.2784621691367184, + "learning_rate": 4.0381877022653724e-05, + "loss": 0.4318, + "step": 2345 + }, + { + "epoch": 2.1881695388914766, + "grad_norm": 0.28920312172259255, + "learning_rate": 4.03495145631068e-05, + "loss": 0.4303, + "step": 2350 + }, + { + "epoch": 2.192827200745226, + "grad_norm": 0.2884542372811835, + "learning_rate": 4.031715210355988e-05, + "loss": 0.4237, + "step": 2355 + }, + { + "epoch": 2.197484862598975, + "grad_norm": 0.29695407946896535, + "learning_rate": 4.028478964401295e-05, + "loss": 0.4339, + "step": 2360 + }, + { + "epoch": 2.2021425244527246, + "grad_norm": 0.30202143737915377, + "learning_rate": 4.0252427184466024e-05, + "loss": 0.4301, + "step": 2365 + }, + { + "epoch": 2.206800186306474, + "grad_norm": 0.2935803126492083, + "learning_rate": 4.0220064724919093e-05, + "loss": 0.4353, + "step": 2370 + }, + { + "epoch": 2.2114578481602236, + "grad_norm": 0.2911223567144398, + "learning_rate": 4.018770226537217e-05, + "loss": 0.436, + "step": 2375 + }, + { + "epoch": 2.216115510013973, + "grad_norm": 0.31911706724850875, + "learning_rate": 4.015533980582524e-05, + "loss": 0.4265, + "step": 2380 + }, + { + "epoch": 2.2207731718677226, + "grad_norm": 0.28529775833448395, + "learning_rate": 4.0122977346278316e-05, + "loss": 0.4409, + "step": 2385 + }, + { + "epoch": 2.2254308337214717, + "grad_norm": 0.25695809085703514, + "learning_rate": 4.009061488673139e-05, + "loss": 0.4304, + "step": 2390 + }, + { + "epoch": 2.230088495575221, + "grad_norm": 0.29657749009103374, + "learning_rate": 4.005825242718447e-05, + "loss": 0.4296, + "step": 2395 + }, + { + "epoch": 2.2347461574289706, + "grad_norm": 0.29804875797050473, + "learning_rate": 4.002588996763754e-05, + "loss": 0.4209, + "step": 2400 + }, + { + "epoch": 2.23940381928272, + "grad_norm": 0.29220612134590873, + "learning_rate": 3.9993527508090616e-05, + "loss": 0.4311, + "step": 2405 + }, + { + "epoch": 2.2440614811364696, + "grad_norm": 0.2838098626320004, + "learning_rate": 3.996116504854369e-05, + "loss": 0.4299, + "step": 2410 + }, + { + "epoch": 2.248719142990219, + "grad_norm": 0.2717759340416869, + "learning_rate": 3.992880258899677e-05, + "loss": 0.4261, + "step": 2415 + }, + { + "epoch": 2.253376804843968, + "grad_norm": 0.2729701123231704, + "learning_rate": 3.989644012944984e-05, + "loss": 0.4138, + "step": 2420 + }, + { + "epoch": 2.2580344666977177, + "grad_norm": 0.2955858122750388, + "learning_rate": 3.9864077669902915e-05, + "loss": 0.4392, + "step": 2425 + }, + { + "epoch": 2.262692128551467, + "grad_norm": 0.2870127555384206, + "learning_rate": 3.983171521035599e-05, + "loss": 0.4217, + "step": 2430 + }, + { + "epoch": 2.2673497904052167, + "grad_norm": 0.29714494202478464, + "learning_rate": 3.979935275080907e-05, + "loss": 0.4307, + "step": 2435 + }, + { + "epoch": 2.272007452258966, + "grad_norm": 0.31323215155035516, + "learning_rate": 3.976699029126214e-05, + "loss": 0.4284, + "step": 2440 + }, + { + "epoch": 2.276665114112715, + "grad_norm": 0.30731519196804635, + "learning_rate": 3.973462783171521e-05, + "loss": 0.4269, + "step": 2445 + }, + { + "epoch": 2.2813227759664647, + "grad_norm": 0.2976558685973644, + "learning_rate": 3.9702265372168285e-05, + "loss": 0.4275, + "step": 2450 + }, + { + "epoch": 2.285980437820214, + "grad_norm": 0.31150191739658856, + "learning_rate": 3.966990291262136e-05, + "loss": 0.4232, + "step": 2455 + }, + { + "epoch": 2.2906380996739637, + "grad_norm": 0.3068690355886579, + "learning_rate": 3.963754045307443e-05, + "loss": 0.4194, + "step": 2460 + }, + { + "epoch": 2.295295761527713, + "grad_norm": 0.30564701433441577, + "learning_rate": 3.960517799352751e-05, + "loss": 0.4415, + "step": 2465 + }, + { + "epoch": 2.2999534233814627, + "grad_norm": 0.2832601518769638, + "learning_rate": 3.9572815533980584e-05, + "loss": 0.4325, + "step": 2470 + }, + { + "epoch": 2.3046110852352117, + "grad_norm": 0.26612149177883143, + "learning_rate": 3.954045307443366e-05, + "loss": 0.4348, + "step": 2475 + }, + { + "epoch": 2.3092687470889612, + "grad_norm": 0.3016054945452254, + "learning_rate": 3.950809061488673e-05, + "loss": 0.428, + "step": 2480 + }, + { + "epoch": 2.3139264089427107, + "grad_norm": 0.2934348638470389, + "learning_rate": 3.947572815533981e-05, + "loss": 0.44, + "step": 2485 + }, + { + "epoch": 2.3185840707964602, + "grad_norm": 0.2820481413068723, + "learning_rate": 3.9443365695792884e-05, + "loss": 0.4385, + "step": 2490 + }, + { + "epoch": 2.3232417326502097, + "grad_norm": 0.3118026197566741, + "learning_rate": 3.941100323624596e-05, + "loss": 0.4292, + "step": 2495 + }, + { + "epoch": 2.3278993945039588, + "grad_norm": 0.3056236723096069, + "learning_rate": 3.937864077669903e-05, + "loss": 0.4166, + "step": 2500 + }, + { + "epoch": 2.3325570563577083, + "grad_norm": 0.3024595378867525, + "learning_rate": 3.9346278317152106e-05, + "loss": 0.428, + "step": 2505 + }, + { + "epoch": 2.3372147182114578, + "grad_norm": 0.3016160148497247, + "learning_rate": 3.931391585760518e-05, + "loss": 0.4291, + "step": 2510 + }, + { + "epoch": 2.3418723800652073, + "grad_norm": 0.2766409436577404, + "learning_rate": 3.928155339805825e-05, + "loss": 0.4304, + "step": 2515 + }, + { + "epoch": 2.3465300419189568, + "grad_norm": 0.26733233943446133, + "learning_rate": 3.924919093851133e-05, + "loss": 0.4349, + "step": 2520 + }, + { + "epoch": 2.3511877037727063, + "grad_norm": 0.2690198456721221, + "learning_rate": 3.92168284789644e-05, + "loss": 0.4305, + "step": 2525 + }, + { + "epoch": 2.3558453656264557, + "grad_norm": 0.2945860386231397, + "learning_rate": 3.9184466019417476e-05, + "loss": 0.4313, + "step": 2530 + }, + { + "epoch": 2.360503027480205, + "grad_norm": 0.2887479801089616, + "learning_rate": 3.915210355987055e-05, + "loss": 0.4185, + "step": 2535 + }, + { + "epoch": 2.3651606893339543, + "grad_norm": 0.2657832811996791, + "learning_rate": 3.911974110032363e-05, + "loss": 0.4169, + "step": 2540 + }, + { + "epoch": 2.369818351187704, + "grad_norm": 0.28653108542984546, + "learning_rate": 3.90873786407767e-05, + "loss": 0.4324, + "step": 2545 + }, + { + "epoch": 2.3744760130414533, + "grad_norm": 0.28649792966496657, + "learning_rate": 3.9055016181229775e-05, + "loss": 0.4329, + "step": 2550 + }, + { + "epoch": 2.3791336748952028, + "grad_norm": 0.31285273419976656, + "learning_rate": 3.902265372168285e-05, + "loss": 0.4239, + "step": 2555 + }, + { + "epoch": 2.383791336748952, + "grad_norm": 0.29394295890338806, + "learning_rate": 3.899029126213593e-05, + "loss": 0.427, + "step": 2560 + }, + { + "epoch": 2.3884489986027013, + "grad_norm": 0.2751190301995722, + "learning_rate": 3.8957928802589e-05, + "loss": 0.4304, + "step": 2565 + }, + { + "epoch": 2.393106660456451, + "grad_norm": 0.28394635868081375, + "learning_rate": 3.8925566343042075e-05, + "loss": 0.4283, + "step": 2570 + }, + { + "epoch": 2.3977643223102003, + "grad_norm": 0.26088780818246793, + "learning_rate": 3.889320388349515e-05, + "loss": 0.4313, + "step": 2575 + }, + { + "epoch": 2.40242198416395, + "grad_norm": 0.27408334765565273, + "learning_rate": 3.886084142394822e-05, + "loss": 0.4211, + "step": 2580 + }, + { + "epoch": 2.4070796460176993, + "grad_norm": 0.30743853711271846, + "learning_rate": 3.882847896440129e-05, + "loss": 0.4228, + "step": 2585 + }, + { + "epoch": 2.4117373078714484, + "grad_norm": 0.28436879894690836, + "learning_rate": 3.879611650485437e-05, + "loss": 0.4173, + "step": 2590 + }, + { + "epoch": 2.416394969725198, + "grad_norm": 0.31784449370435436, + "learning_rate": 3.8763754045307444e-05, + "loss": 0.4366, + "step": 2595 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.2784369425125365, + "learning_rate": 3.873139158576052e-05, + "loss": 0.4325, + "step": 2600 + }, + { + "epoch": 2.425710293432697, + "grad_norm": 0.30370530331971196, + "learning_rate": 3.869902912621359e-05, + "loss": 0.4232, + "step": 2605 + }, + { + "epoch": 2.4303679552864463, + "grad_norm": 0.3064281445791429, + "learning_rate": 3.866666666666667e-05, + "loss": 0.419, + "step": 2610 + }, + { + "epoch": 2.4350256171401954, + "grad_norm": 0.33279912691579955, + "learning_rate": 3.8634304207119743e-05, + "loss": 0.4228, + "step": 2615 + }, + { + "epoch": 2.439683278993945, + "grad_norm": 0.30559746775800656, + "learning_rate": 3.860194174757282e-05, + "loss": 0.4279, + "step": 2620 + }, + { + "epoch": 2.4443409408476944, + "grad_norm": 0.2821489347743237, + "learning_rate": 3.856957928802589e-05, + "loss": 0.4273, + "step": 2625 + }, + { + "epoch": 2.448998602701444, + "grad_norm": 0.30707651894613214, + "learning_rate": 3.8537216828478966e-05, + "loss": 0.421, + "step": 2630 + }, + { + "epoch": 2.4536562645551934, + "grad_norm": 0.2961175138557341, + "learning_rate": 3.850485436893204e-05, + "loss": 0.433, + "step": 2635 + }, + { + "epoch": 2.458313926408943, + "grad_norm": 0.30038582825826193, + "learning_rate": 3.847249190938512e-05, + "loss": 0.4313, + "step": 2640 + }, + { + "epoch": 2.4629715882626924, + "grad_norm": 0.28346270038788274, + "learning_rate": 3.844012944983819e-05, + "loss": 0.4309, + "step": 2645 + }, + { + "epoch": 2.4676292501164414, + "grad_norm": 0.3057343243492832, + "learning_rate": 3.8407766990291266e-05, + "loss": 0.4228, + "step": 2650 + }, + { + "epoch": 2.472286911970191, + "grad_norm": 0.2838210234509721, + "learning_rate": 3.837540453074434e-05, + "loss": 0.4312, + "step": 2655 + }, + { + "epoch": 2.4769445738239404, + "grad_norm": 0.27759231451015043, + "learning_rate": 3.834304207119741e-05, + "loss": 0.4239, + "step": 2660 + }, + { + "epoch": 2.48160223567769, + "grad_norm": 0.30874594722339815, + "learning_rate": 3.831067961165048e-05, + "loss": 0.4239, + "step": 2665 + }, + { + "epoch": 2.4862598975314394, + "grad_norm": 0.28049362889434204, + "learning_rate": 3.827831715210356e-05, + "loss": 0.4313, + "step": 2670 + }, + { + "epoch": 2.4909175593851884, + "grad_norm": 0.3088921111059484, + "learning_rate": 3.8245954692556635e-05, + "loss": 0.4216, + "step": 2675 + }, + { + "epoch": 2.495575221238938, + "grad_norm": 0.3149293091667282, + "learning_rate": 3.821359223300971e-05, + "loss": 0.4235, + "step": 2680 + }, + { + "epoch": 2.5002328830926874, + "grad_norm": 0.28238147922335444, + "learning_rate": 3.818122977346278e-05, + "loss": 0.4271, + "step": 2685 + }, + { + "epoch": 2.504890544946437, + "grad_norm": 0.2956984739353203, + "learning_rate": 3.814886731391586e-05, + "loss": 0.4144, + "step": 2690 + }, + { + "epoch": 2.5095482068001864, + "grad_norm": 0.3037219452671658, + "learning_rate": 3.8116504854368935e-05, + "loss": 0.4256, + "step": 2695 + }, + { + "epoch": 2.514205868653936, + "grad_norm": 0.2808798231051215, + "learning_rate": 3.808414239482201e-05, + "loss": 0.4207, + "step": 2700 + }, + { + "epoch": 2.5188635305076854, + "grad_norm": 0.3439874362565095, + "learning_rate": 3.805177993527508e-05, + "loss": 0.4328, + "step": 2705 + }, + { + "epoch": 2.5235211923614345, + "grad_norm": 0.28499473571752737, + "learning_rate": 3.801941747572816e-05, + "loss": 0.4245, + "step": 2710 + }, + { + "epoch": 2.528178854215184, + "grad_norm": 0.2792589165257693, + "learning_rate": 3.7987055016181234e-05, + "loss": 0.4339, + "step": 2715 + }, + { + "epoch": 2.5328365160689335, + "grad_norm": 0.2768222961452844, + "learning_rate": 3.795469255663431e-05, + "loss": 0.4436, + "step": 2720 + }, + { + "epoch": 2.537494177922683, + "grad_norm": 0.2866344604425688, + "learning_rate": 3.792233009708738e-05, + "loss": 0.4263, + "step": 2725 + }, + { + "epoch": 2.542151839776432, + "grad_norm": 0.3126347316719504, + "learning_rate": 3.788996763754045e-05, + "loss": 0.4362, + "step": 2730 + }, + { + "epoch": 2.5468095016301815, + "grad_norm": 0.28054583912793213, + "learning_rate": 3.785760517799353e-05, + "loss": 0.4252, + "step": 2735 + }, + { + "epoch": 2.551467163483931, + "grad_norm": 0.3105960286486963, + "learning_rate": 3.78252427184466e-05, + "loss": 0.4235, + "step": 2740 + }, + { + "epoch": 2.5561248253376805, + "grad_norm": 0.2824370855630529, + "learning_rate": 3.779288025889968e-05, + "loss": 0.4224, + "step": 2745 + }, + { + "epoch": 2.56078248719143, + "grad_norm": 0.2694991578625555, + "learning_rate": 3.776051779935275e-05, + "loss": 0.4226, + "step": 2750 + }, + { + "epoch": 2.5654401490451795, + "grad_norm": 0.30479670220041877, + "learning_rate": 3.7728155339805826e-05, + "loss": 0.4176, + "step": 2755 + }, + { + "epoch": 2.570097810898929, + "grad_norm": 0.2838119776784065, + "learning_rate": 3.76957928802589e-05, + "loss": 0.431, + "step": 2760 + }, + { + "epoch": 2.574755472752678, + "grad_norm": 0.2902762884026888, + "learning_rate": 3.766343042071197e-05, + "loss": 0.422, + "step": 2765 + }, + { + "epoch": 2.5794131346064275, + "grad_norm": 0.2819819286550231, + "learning_rate": 3.763106796116505e-05, + "loss": 0.4247, + "step": 2770 + }, + { + "epoch": 2.584070796460177, + "grad_norm": 0.30369792665580253, + "learning_rate": 3.7598705501618126e-05, + "loss": 0.4225, + "step": 2775 + }, + { + "epoch": 2.5887284583139265, + "grad_norm": 0.29196706684716095, + "learning_rate": 3.75663430420712e-05, + "loss": 0.4299, + "step": 2780 + }, + { + "epoch": 2.5933861201676756, + "grad_norm": 0.2788017675818027, + "learning_rate": 3.753398058252427e-05, + "loss": 0.4346, + "step": 2785 + }, + { + "epoch": 2.598043782021425, + "grad_norm": 0.2916211716964687, + "learning_rate": 3.750161812297735e-05, + "loss": 0.435, + "step": 2790 + }, + { + "epoch": 2.6027014438751745, + "grad_norm": 0.278499551317208, + "learning_rate": 3.7469255663430425e-05, + "loss": 0.4244, + "step": 2795 + }, + { + "epoch": 2.607359105728924, + "grad_norm": 0.2711130362986734, + "learning_rate": 3.74368932038835e-05, + "loss": 0.4253, + "step": 2800 + }, + { + "epoch": 2.6120167675826735, + "grad_norm": 0.27338890570112695, + "learning_rate": 3.740453074433657e-05, + "loss": 0.4214, + "step": 2805 + }, + { + "epoch": 2.616674429436423, + "grad_norm": 0.2894722582711034, + "learning_rate": 3.737216828478964e-05, + "loss": 0.4288, + "step": 2810 + }, + { + "epoch": 2.6213320912901725, + "grad_norm": 0.24846741147246218, + "learning_rate": 3.733980582524272e-05, + "loss": 0.422, + "step": 2815 + }, + { + "epoch": 2.625989753143922, + "grad_norm": 0.2875890641223257, + "learning_rate": 3.7307443365695794e-05, + "loss": 0.4336, + "step": 2820 + }, + { + "epoch": 2.630647414997671, + "grad_norm": 0.2799915068598867, + "learning_rate": 3.727508090614887e-05, + "loss": 0.4331, + "step": 2825 + }, + { + "epoch": 2.6353050768514206, + "grad_norm": 0.3201306836255748, + "learning_rate": 3.724271844660194e-05, + "loss": 0.4227, + "step": 2830 + }, + { + "epoch": 2.63996273870517, + "grad_norm": 0.28169259531778595, + "learning_rate": 3.721035598705502e-05, + "loss": 0.4338, + "step": 2835 + }, + { + "epoch": 2.6446204005589196, + "grad_norm": 0.2587167611906602, + "learning_rate": 3.7177993527508094e-05, + "loss": 0.4366, + "step": 2840 + }, + { + "epoch": 2.6492780624126686, + "grad_norm": 0.27229247764700637, + "learning_rate": 3.714563106796117e-05, + "loss": 0.4313, + "step": 2845 + }, + { + "epoch": 2.653935724266418, + "grad_norm": 0.28467939462359715, + "learning_rate": 3.711326860841424e-05, + "loss": 0.4325, + "step": 2850 + }, + { + "epoch": 2.6585933861201676, + "grad_norm": 0.26758982806172527, + "learning_rate": 3.708090614886732e-05, + "loss": 0.426, + "step": 2855 + }, + { + "epoch": 2.663251047973917, + "grad_norm": 0.2743846459839584, + "learning_rate": 3.7048543689320393e-05, + "loss": 0.4322, + "step": 2860 + }, + { + "epoch": 2.6679087098276666, + "grad_norm": 0.284576576366453, + "learning_rate": 3.701618122977347e-05, + "loss": 0.4311, + "step": 2865 + }, + { + "epoch": 2.672566371681416, + "grad_norm": 0.2810259475558676, + "learning_rate": 3.698381877022654e-05, + "loss": 0.4226, + "step": 2870 + }, + { + "epoch": 2.6772240335351656, + "grad_norm": 0.2704861868580724, + "learning_rate": 3.695145631067961e-05, + "loss": 0.428, + "step": 2875 + }, + { + "epoch": 2.6818816953889146, + "grad_norm": 0.26025855572697787, + "learning_rate": 3.6919093851132686e-05, + "loss": 0.4198, + "step": 2880 + }, + { + "epoch": 2.686539357242664, + "grad_norm": 0.2783715023932399, + "learning_rate": 3.688673139158576e-05, + "loss": 0.4372, + "step": 2885 + }, + { + "epoch": 2.6911970190964136, + "grad_norm": 0.2519929327579714, + "learning_rate": 3.685436893203883e-05, + "loss": 0.4234, + "step": 2890 + }, + { + "epoch": 2.695854680950163, + "grad_norm": 0.31313597074446853, + "learning_rate": 3.682200647249191e-05, + "loss": 0.4312, + "step": 2895 + }, + { + "epoch": 2.700512342803912, + "grad_norm": 0.29198060410841264, + "learning_rate": 3.6789644012944986e-05, + "loss": 0.4256, + "step": 2900 + }, + { + "epoch": 2.7051700046576617, + "grad_norm": 0.2834423205145289, + "learning_rate": 3.675728155339806e-05, + "loss": 0.42, + "step": 2905 + }, + { + "epoch": 2.709827666511411, + "grad_norm": 0.2685259132702088, + "learning_rate": 3.672491909385113e-05, + "loss": 0.4233, + "step": 2910 + }, + { + "epoch": 2.7144853283651607, + "grad_norm": 0.2852612186401845, + "learning_rate": 3.669255663430421e-05, + "loss": 0.433, + "step": 2915 + }, + { + "epoch": 2.71914299021891, + "grad_norm": 0.2927922523300444, + "learning_rate": 3.6660194174757285e-05, + "loss": 0.4241, + "step": 2920 + }, + { + "epoch": 2.7238006520726596, + "grad_norm": 0.29404001370657046, + "learning_rate": 3.662783171521036e-05, + "loss": 0.4369, + "step": 2925 + }, + { + "epoch": 2.728458313926409, + "grad_norm": 0.2625429798487282, + "learning_rate": 3.659546925566343e-05, + "loss": 0.4356, + "step": 2930 + }, + { + "epoch": 2.7331159757801586, + "grad_norm": 0.26283427925119807, + "learning_rate": 3.656310679611651e-05, + "loss": 0.4241, + "step": 2935 + }, + { + "epoch": 2.7377736376339077, + "grad_norm": 0.2704873739007011, + "learning_rate": 3.6530744336569585e-05, + "loss": 0.424, + "step": 2940 + }, + { + "epoch": 2.742431299487657, + "grad_norm": 0.2656977810974544, + "learning_rate": 3.649838187702266e-05, + "loss": 0.4335, + "step": 2945 + }, + { + "epoch": 2.7470889613414067, + "grad_norm": 0.3103876140772922, + "learning_rate": 3.6466019417475724e-05, + "loss": 0.4253, + "step": 2950 + }, + { + "epoch": 2.751746623195156, + "grad_norm": 0.2829566677570478, + "learning_rate": 3.64336569579288e-05, + "loss": 0.4215, + "step": 2955 + }, + { + "epoch": 2.7564042850489052, + "grad_norm": 0.2841068708096097, + "learning_rate": 3.640129449838188e-05, + "loss": 0.4276, + "step": 2960 + }, + { + "epoch": 2.7610619469026547, + "grad_norm": 0.26485793653651674, + "learning_rate": 3.6368932038834954e-05, + "loss": 0.4263, + "step": 2965 + }, + { + "epoch": 2.765719608756404, + "grad_norm": 0.28162603988775287, + "learning_rate": 3.6336569579288024e-05, + "loss": 0.4283, + "step": 2970 + }, + { + "epoch": 2.7703772706101537, + "grad_norm": 0.2822946670919207, + "learning_rate": 3.63042071197411e-05, + "loss": 0.4207, + "step": 2975 + }, + { + "epoch": 2.775034932463903, + "grad_norm": 0.29086642862827244, + "learning_rate": 3.627184466019418e-05, + "loss": 0.4217, + "step": 2980 + }, + { + "epoch": 2.7796925943176527, + "grad_norm": 0.3044051581811522, + "learning_rate": 3.623948220064725e-05, + "loss": 0.432, + "step": 2985 + }, + { + "epoch": 2.784350256171402, + "grad_norm": 0.28576952931281374, + "learning_rate": 3.620711974110032e-05, + "loss": 0.4248, + "step": 2990 + }, + { + "epoch": 2.7890079180251512, + "grad_norm": 0.28252026904913935, + "learning_rate": 3.61747572815534e-05, + "loss": 0.4224, + "step": 2995 + }, + { + "epoch": 2.7936655798789007, + "grad_norm": 0.2863225210504502, + "learning_rate": 3.6142394822006476e-05, + "loss": 0.4342, + "step": 3000 + }, + { + "epoch": 2.7983232417326502, + "grad_norm": 0.25690143326129644, + "learning_rate": 3.611003236245955e-05, + "loss": 0.4201, + "step": 3005 + }, + { + "epoch": 2.8029809035863997, + "grad_norm": 0.2609803133859486, + "learning_rate": 3.607766990291262e-05, + "loss": 0.4354, + "step": 3010 + }, + { + "epoch": 2.807638565440149, + "grad_norm": 0.2763251649760853, + "learning_rate": 3.60453074433657e-05, + "loss": 0.4123, + "step": 3015 + }, + { + "epoch": 2.8122962272938983, + "grad_norm": 0.2552305839808659, + "learning_rate": 3.601294498381877e-05, + "loss": 0.4304, + "step": 3020 + }, + { + "epoch": 2.8169538891476478, + "grad_norm": 0.2615547694390696, + "learning_rate": 3.5980582524271845e-05, + "loss": 0.4268, + "step": 3025 + }, + { + "epoch": 2.8216115510013973, + "grad_norm": 0.2797157438024088, + "learning_rate": 3.594822006472492e-05, + "loss": 0.4281, + "step": 3030 + }, + { + "epoch": 2.8262692128551468, + "grad_norm": 0.27294965373088204, + "learning_rate": 3.591585760517799e-05, + "loss": 0.4292, + "step": 3035 + }, + { + "epoch": 2.8309268747088963, + "grad_norm": 0.3033447910940389, + "learning_rate": 3.588349514563107e-05, + "loss": 0.439, + "step": 3040 + }, + { + "epoch": 2.8355845365626458, + "grad_norm": 0.27535370122112035, + "learning_rate": 3.5851132686084145e-05, + "loss": 0.4254, + "step": 3045 + }, + { + "epoch": 2.840242198416395, + "grad_norm": 0.2872687513105721, + "learning_rate": 3.581877022653722e-05, + "loss": 0.4302, + "step": 3050 + }, + { + "epoch": 2.8448998602701443, + "grad_norm": 0.282487449246132, + "learning_rate": 3.578640776699029e-05, + "loss": 0.4315, + "step": 3055 + }, + { + "epoch": 2.849557522123894, + "grad_norm": 0.27478843406031495, + "learning_rate": 3.575404530744337e-05, + "loss": 0.4335, + "step": 3060 + }, + { + "epoch": 2.8542151839776433, + "grad_norm": 0.26905353589793823, + "learning_rate": 3.5721682847896444e-05, + "loss": 0.4275, + "step": 3065 + }, + { + "epoch": 2.858872845831393, + "grad_norm": 0.26383496267923806, + "learning_rate": 3.5689320388349514e-05, + "loss": 0.4179, + "step": 3070 + }, + { + "epoch": 2.863530507685142, + "grad_norm": 0.28002246896820876, + "learning_rate": 3.565695792880259e-05, + "loss": 0.4289, + "step": 3075 + }, + { + "epoch": 2.8681881695388913, + "grad_norm": 0.27936359297466795, + "learning_rate": 3.562459546925567e-05, + "loss": 0.4174, + "step": 3080 + }, + { + "epoch": 2.872845831392641, + "grad_norm": 0.2781403896702408, + "learning_rate": 3.5592233009708744e-05, + "loss": 0.4292, + "step": 3085 + }, + { + "epoch": 2.8775034932463903, + "grad_norm": 0.26905538248570515, + "learning_rate": 3.5559870550161814e-05, + "loss": 0.4332, + "step": 3090 + }, + { + "epoch": 2.88216115510014, + "grad_norm": 0.28429896396917936, + "learning_rate": 3.5527508090614884e-05, + "loss": 0.4196, + "step": 3095 + }, + { + "epoch": 2.8868188169538893, + "grad_norm": 0.2841084121770271, + "learning_rate": 3.549514563106796e-05, + "loss": 0.4273, + "step": 3100 + }, + { + "epoch": 2.891476478807639, + "grad_norm": 0.2670969391614412, + "learning_rate": 3.546278317152104e-05, + "loss": 0.4333, + "step": 3105 + }, + { + "epoch": 2.896134140661388, + "grad_norm": 0.2781049518759708, + "learning_rate": 3.543042071197411e-05, + "loss": 0.4213, + "step": 3110 + }, + { + "epoch": 2.9007918025151374, + "grad_norm": 0.2907894011753527, + "learning_rate": 3.539805825242718e-05, + "loss": 0.417, + "step": 3115 + }, + { + "epoch": 2.905449464368887, + "grad_norm": 0.2775449067053298, + "learning_rate": 3.536569579288026e-05, + "loss": 0.4256, + "step": 3120 + }, + { + "epoch": 2.9101071262226363, + "grad_norm": 0.26484076532516715, + "learning_rate": 3.5333333333333336e-05, + "loss": 0.4311, + "step": 3125 + }, + { + "epoch": 2.9147647880763854, + "grad_norm": 0.24962832810099225, + "learning_rate": 3.530097087378641e-05, + "loss": 0.414, + "step": 3130 + }, + { + "epoch": 2.919422449930135, + "grad_norm": 0.2728547334939587, + "learning_rate": 3.526860841423948e-05, + "loss": 0.4198, + "step": 3135 + }, + { + "epoch": 2.9240801117838844, + "grad_norm": 0.26480567082735235, + "learning_rate": 3.523624595469256e-05, + "loss": 0.4324, + "step": 3140 + }, + { + "epoch": 2.928737773637634, + "grad_norm": 0.277450964142194, + "learning_rate": 3.5203883495145636e-05, + "loss": 0.4347, + "step": 3145 + }, + { + "epoch": 2.9333954354913834, + "grad_norm": 0.30278907294138124, + "learning_rate": 3.517152103559871e-05, + "loss": 0.4323, + "step": 3150 + }, + { + "epoch": 2.938053097345133, + "grad_norm": 0.32235080879387107, + "learning_rate": 3.513915857605178e-05, + "loss": 0.4167, + "step": 3155 + }, + { + "epoch": 2.9427107591988824, + "grad_norm": 0.29246494531502704, + "learning_rate": 3.510679611650486e-05, + "loss": 0.4289, + "step": 3160 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.28676116742888197, + "learning_rate": 3.507443365695793e-05, + "loss": 0.4298, + "step": 3165 + }, + { + "epoch": 2.952026082906381, + "grad_norm": 0.2642095649143648, + "learning_rate": 3.5042071197411005e-05, + "loss": 0.4262, + "step": 3170 + }, + { + "epoch": 2.9566837447601304, + "grad_norm": 0.2699739868562258, + "learning_rate": 3.5009708737864075e-05, + "loss": 0.4313, + "step": 3175 + }, + { + "epoch": 2.96134140661388, + "grad_norm": 0.2660895898372268, + "learning_rate": 3.497734627831715e-05, + "loss": 0.4225, + "step": 3180 + }, + { + "epoch": 2.9659990684676294, + "grad_norm": 0.263658290507509, + "learning_rate": 3.494498381877023e-05, + "loss": 0.4277, + "step": 3185 + }, + { + "epoch": 2.9706567303213784, + "grad_norm": 0.2737982059810963, + "learning_rate": 3.4912621359223304e-05, + "loss": 0.4166, + "step": 3190 + }, + { + "epoch": 2.975314392175128, + "grad_norm": 0.2639166192385112, + "learning_rate": 3.4880258899676374e-05, + "loss": 0.4337, + "step": 3195 + }, + { + "epoch": 2.9799720540288774, + "grad_norm": 0.25426316889153205, + "learning_rate": 3.484789644012945e-05, + "loss": 0.4306, + "step": 3200 + }, + { + "epoch": 2.984629715882627, + "grad_norm": 0.30651398901580795, + "learning_rate": 3.481553398058253e-05, + "loss": 0.4377, + "step": 3205 + }, + { + "epoch": 2.9892873777363764, + "grad_norm": 0.2578203086881995, + "learning_rate": 3.4783171521035604e-05, + "loss": 0.4102, + "step": 3210 + }, + { + "epoch": 2.993945039590126, + "grad_norm": 0.25777786368813266, + "learning_rate": 3.4750809061488674e-05, + "loss": 0.4109, + "step": 3215 + }, + { + "epoch": 2.9986027014438754, + "grad_norm": 0.2943705743126533, + "learning_rate": 3.471844660194175e-05, + "loss": 0.4308, + "step": 3220 + }, + { + "epoch": 3.0027945971122496, + "grad_norm": 0.3189228181467205, + "learning_rate": 3.468608414239483e-05, + "loss": 0.3913, + "step": 3225 + }, + { + "epoch": 3.007452258965999, + "grad_norm": 0.32071113650327926, + "learning_rate": 3.46537216828479e-05, + "loss": 0.3748, + "step": 3230 + }, + { + "epoch": 3.0121099208197486, + "grad_norm": 0.296343837965841, + "learning_rate": 3.462135922330097e-05, + "loss": 0.3798, + "step": 3235 + }, + { + "epoch": 3.016767582673498, + "grad_norm": 0.2691270837455306, + "learning_rate": 3.458899676375404e-05, + "loss": 0.3692, + "step": 3240 + }, + { + "epoch": 3.021425244527247, + "grad_norm": 0.2881057731000792, + "learning_rate": 3.455663430420712e-05, + "loss": 0.3795, + "step": 3245 + }, + { + "epoch": 3.0260829063809966, + "grad_norm": 0.28863008786419747, + "learning_rate": 3.4524271844660196e-05, + "loss": 0.3805, + "step": 3250 + }, + { + "epoch": 3.030740568234746, + "grad_norm": 0.27177752021649265, + "learning_rate": 3.4491909385113266e-05, + "loss": 0.3819, + "step": 3255 + }, + { + "epoch": 3.0353982300884956, + "grad_norm": 0.2679652027957279, + "learning_rate": 3.445954692556634e-05, + "loss": 0.3828, + "step": 3260 + }, + { + "epoch": 3.040055891942245, + "grad_norm": 0.2868363828990796, + "learning_rate": 3.442718446601942e-05, + "loss": 0.3853, + "step": 3265 + }, + { + "epoch": 3.0447135537959946, + "grad_norm": 0.2890105371189473, + "learning_rate": 3.4394822006472495e-05, + "loss": 0.3732, + "step": 3270 + }, + { + "epoch": 3.0493712156497437, + "grad_norm": 0.27203805412745935, + "learning_rate": 3.4362459546925565e-05, + "loss": 0.3788, + "step": 3275 + }, + { + "epoch": 3.054028877503493, + "grad_norm": 0.2602642985237527, + "learning_rate": 3.433009708737864e-05, + "loss": 0.373, + "step": 3280 + }, + { + "epoch": 3.0586865393572427, + "grad_norm": 0.27094020224990417, + "learning_rate": 3.429773462783172e-05, + "loss": 0.3918, + "step": 3285 + }, + { + "epoch": 3.063344201210992, + "grad_norm": 0.26783933249771047, + "learning_rate": 3.4265372168284795e-05, + "loss": 0.3777, + "step": 3290 + }, + { + "epoch": 3.0680018630647417, + "grad_norm": 0.2756467871454317, + "learning_rate": 3.4233009708737865e-05, + "loss": 0.3823, + "step": 3295 + }, + { + "epoch": 3.0726595249184907, + "grad_norm": 0.31533387411110464, + "learning_rate": 3.420064724919094e-05, + "loss": 0.3748, + "step": 3300 + }, + { + "epoch": 3.07731718677224, + "grad_norm": 0.3032337273006253, + "learning_rate": 3.416828478964402e-05, + "loss": 0.3722, + "step": 3305 + }, + { + "epoch": 3.0819748486259897, + "grad_norm": 0.29797556470822706, + "learning_rate": 3.413592233009709e-05, + "loss": 0.3774, + "step": 3310 + }, + { + "epoch": 3.086632510479739, + "grad_norm": 0.2907185676087362, + "learning_rate": 3.4103559870550164e-05, + "loss": 0.3744, + "step": 3315 + }, + { + "epoch": 3.0912901723334887, + "grad_norm": 0.2797688048331569, + "learning_rate": 3.4071197411003234e-05, + "loss": 0.368, + "step": 3320 + }, + { + "epoch": 3.095947834187238, + "grad_norm": 0.280107318388062, + "learning_rate": 3.403883495145631e-05, + "loss": 0.3854, + "step": 3325 + }, + { + "epoch": 3.1006054960409872, + "grad_norm": 0.31397208430684337, + "learning_rate": 3.400647249190939e-05, + "loss": 0.3802, + "step": 3330 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 0.310995000296094, + "learning_rate": 3.3974110032362464e-05, + "loss": 0.3753, + "step": 3335 + }, + { + "epoch": 3.109920819748486, + "grad_norm": 0.28716773816954755, + "learning_rate": 3.3941747572815533e-05, + "loss": 0.3733, + "step": 3340 + }, + { + "epoch": 3.1145784816022357, + "grad_norm": 0.2959267581584993, + "learning_rate": 3.390938511326861e-05, + "loss": 0.3795, + "step": 3345 + }, + { + "epoch": 3.119236143455985, + "grad_norm": 0.27780045387843744, + "learning_rate": 3.3877022653721687e-05, + "loss": 0.3937, + "step": 3350 + }, + { + "epoch": 3.1238938053097347, + "grad_norm": 0.29258888948818523, + "learning_rate": 3.384466019417476e-05, + "loss": 0.3799, + "step": 3355 + }, + { + "epoch": 3.1285514671634838, + "grad_norm": 0.286240193461029, + "learning_rate": 3.381229773462783e-05, + "loss": 0.3798, + "step": 3360 + }, + { + "epoch": 3.1332091290172333, + "grad_norm": 0.29269590854130606, + "learning_rate": 3.377993527508091e-05, + "loss": 0.3873, + "step": 3365 + }, + { + "epoch": 3.1378667908709827, + "grad_norm": 0.2675117207246662, + "learning_rate": 3.3747572815533986e-05, + "loss": 0.3814, + "step": 3370 + }, + { + "epoch": 3.1425244527247322, + "grad_norm": 0.2655466234635772, + "learning_rate": 3.3715210355987056e-05, + "loss": 0.3814, + "step": 3375 + }, + { + "epoch": 3.1471821145784817, + "grad_norm": 0.2993931745308231, + "learning_rate": 3.3682847896440126e-05, + "loss": 0.3806, + "step": 3380 + }, + { + "epoch": 3.1518397764322312, + "grad_norm": 0.2737577998173692, + "learning_rate": 3.36504854368932e-05, + "loss": 0.3784, + "step": 3385 + }, + { + "epoch": 3.1564974382859803, + "grad_norm": 0.2852074345495303, + "learning_rate": 3.361812297734628e-05, + "loss": 0.3837, + "step": 3390 + }, + { + "epoch": 3.16115510013973, + "grad_norm": 0.27338540600457273, + "learning_rate": 3.3585760517799355e-05, + "loss": 0.3747, + "step": 3395 + }, + { + "epoch": 3.1658127619934793, + "grad_norm": 0.2768239874769818, + "learning_rate": 3.3553398058252425e-05, + "loss": 0.3839, + "step": 3400 + }, + { + "epoch": 3.1704704238472288, + "grad_norm": 0.25522005482265103, + "learning_rate": 3.35210355987055e-05, + "loss": 0.387, + "step": 3405 + }, + { + "epoch": 3.1751280857009783, + "grad_norm": 0.27748340384064507, + "learning_rate": 3.348867313915858e-05, + "loss": 0.3919, + "step": 3410 + }, + { + "epoch": 3.1797857475547273, + "grad_norm": 0.2730075292402206, + "learning_rate": 3.3456310679611655e-05, + "loss": 0.3769, + "step": 3415 + }, + { + "epoch": 3.184443409408477, + "grad_norm": 0.33155555731665076, + "learning_rate": 3.3423948220064725e-05, + "loss": 0.3917, + "step": 3420 + }, + { + "epoch": 3.1891010712622263, + "grad_norm": 0.3224328620915219, + "learning_rate": 3.33915857605178e-05, + "loss": 0.3815, + "step": 3425 + }, + { + "epoch": 3.193758733115976, + "grad_norm": 0.32625083415496575, + "learning_rate": 3.335922330097088e-05, + "loss": 0.3921, + "step": 3430 + }, + { + "epoch": 3.1984163949697253, + "grad_norm": 0.287751067070024, + "learning_rate": 3.3326860841423954e-05, + "loss": 0.3894, + "step": 3435 + }, + { + "epoch": 3.203074056823475, + "grad_norm": 0.3025426180601656, + "learning_rate": 3.3294498381877024e-05, + "loss": 0.3875, + "step": 3440 + }, + { + "epoch": 3.207731718677224, + "grad_norm": 0.2820344404437468, + "learning_rate": 3.32621359223301e-05, + "loss": 0.3748, + "step": 3445 + }, + { + "epoch": 3.2123893805309733, + "grad_norm": 0.27489434938772717, + "learning_rate": 3.322977346278318e-05, + "loss": 0.3837, + "step": 3450 + }, + { + "epoch": 3.217047042384723, + "grad_norm": 0.2697069074678676, + "learning_rate": 3.319741100323625e-05, + "loss": 0.3869, + "step": 3455 + }, + { + "epoch": 3.2217047042384723, + "grad_norm": 0.2711007301779973, + "learning_rate": 3.316504854368932e-05, + "loss": 0.3854, + "step": 3460 + }, + { + "epoch": 3.226362366092222, + "grad_norm": 0.30944819540145513, + "learning_rate": 3.313268608414239e-05, + "loss": 0.3802, + "step": 3465 + }, + { + "epoch": 3.2310200279459713, + "grad_norm": 0.29374337119158195, + "learning_rate": 3.310032362459547e-05, + "loss": 0.3875, + "step": 3470 + }, + { + "epoch": 3.2356776897997204, + "grad_norm": 0.26681487298679607, + "learning_rate": 3.3067961165048546e-05, + "loss": 0.38, + "step": 3475 + }, + { + "epoch": 3.24033535165347, + "grad_norm": 0.2630307816642084, + "learning_rate": 3.3035598705501616e-05, + "loss": 0.3854, + "step": 3480 + }, + { + "epoch": 3.2449930135072194, + "grad_norm": 0.28504157556514736, + "learning_rate": 3.300323624595469e-05, + "loss": 0.3939, + "step": 3485 + }, + { + "epoch": 3.249650675360969, + "grad_norm": 0.29762962713937785, + "learning_rate": 3.297087378640777e-05, + "loss": 0.3979, + "step": 3490 + }, + { + "epoch": 3.2543083372147183, + "grad_norm": 0.2967087683421554, + "learning_rate": 3.2938511326860846e-05, + "loss": 0.3958, + "step": 3495 + }, + { + "epoch": 3.258965999068468, + "grad_norm": 0.30491724136449017, + "learning_rate": 3.2906148867313916e-05, + "loss": 0.3944, + "step": 3500 + }, + { + "epoch": 3.263623660922217, + "grad_norm": 0.29395653686814566, + "learning_rate": 3.287378640776699e-05, + "loss": 0.3969, + "step": 3505 + }, + { + "epoch": 3.2682813227759664, + "grad_norm": 0.30303390983399636, + "learning_rate": 3.284142394822007e-05, + "loss": 0.3967, + "step": 3510 + }, + { + "epoch": 3.272938984629716, + "grad_norm": 0.27927982759731645, + "learning_rate": 3.2809061488673145e-05, + "loss": 0.3891, + "step": 3515 + }, + { + "epoch": 3.2775966464834654, + "grad_norm": 0.2678751340501679, + "learning_rate": 3.2776699029126215e-05, + "loss": 0.3874, + "step": 3520 + }, + { + "epoch": 3.282254308337215, + "grad_norm": 0.2900994579568571, + "learning_rate": 3.2744336569579285e-05, + "loss": 0.4021, + "step": 3525 + }, + { + "epoch": 3.286911970190964, + "grad_norm": 0.27271565228674416, + "learning_rate": 3.271197411003236e-05, + "loss": 0.3786, + "step": 3530 + }, + { + "epoch": 3.2915696320447134, + "grad_norm": 0.28652170030917257, + "learning_rate": 3.267961165048544e-05, + "loss": 0.3764, + "step": 3535 + }, + { + "epoch": 3.296227293898463, + "grad_norm": 0.2942415884741508, + "learning_rate": 3.2647249190938515e-05, + "loss": 0.3804, + "step": 3540 + }, + { + "epoch": 3.3008849557522124, + "grad_norm": 0.28215749117316613, + "learning_rate": 3.2614886731391584e-05, + "loss": 0.3899, + "step": 3545 + }, + { + "epoch": 3.305542617605962, + "grad_norm": 0.28326309598504706, + "learning_rate": 3.258252427184466e-05, + "loss": 0.3839, + "step": 3550 + }, + { + "epoch": 3.3102002794597114, + "grad_norm": 0.28041612783208264, + "learning_rate": 3.255016181229774e-05, + "loss": 0.3751, + "step": 3555 + }, + { + "epoch": 3.3148579413134605, + "grad_norm": 0.28751641561182656, + "learning_rate": 3.251779935275081e-05, + "loss": 0.3845, + "step": 3560 + }, + { + "epoch": 3.31951560316721, + "grad_norm": 0.2916078642342049, + "learning_rate": 3.2485436893203884e-05, + "loss": 0.3858, + "step": 3565 + }, + { + "epoch": 3.3241732650209594, + "grad_norm": 0.2746505539943781, + "learning_rate": 3.245307443365696e-05, + "loss": 0.3816, + "step": 3570 + }, + { + "epoch": 3.328830926874709, + "grad_norm": 0.266878501429854, + "learning_rate": 3.242071197411004e-05, + "loss": 0.3923, + "step": 3575 + }, + { + "epoch": 3.3334885887284584, + "grad_norm": 0.29417958059476884, + "learning_rate": 3.238834951456311e-05, + "loss": 0.3874, + "step": 3580 + }, + { + "epoch": 3.3381462505822075, + "grad_norm": 0.30459617589281895, + "learning_rate": 3.2355987055016183e-05, + "loss": 0.3897, + "step": 3585 + }, + { + "epoch": 3.342803912435957, + "grad_norm": 0.2619547631377024, + "learning_rate": 3.232362459546926e-05, + "loss": 0.382, + "step": 3590 + }, + { + "epoch": 3.3474615742897065, + "grad_norm": 0.2709779198156142, + "learning_rate": 3.2291262135922337e-05, + "loss": 0.3814, + "step": 3595 + }, + { + "epoch": 3.352119236143456, + "grad_norm": 0.29945087804261317, + "learning_rate": 3.2258899676375406e-05, + "loss": 0.3822, + "step": 3600 + }, + { + "epoch": 3.3567768979972055, + "grad_norm": 0.293632639764198, + "learning_rate": 3.2226537216828476e-05, + "loss": 0.3748, + "step": 3605 + }, + { + "epoch": 3.361434559850955, + "grad_norm": 0.2860156609777422, + "learning_rate": 3.219417475728155e-05, + "loss": 0.386, + "step": 3610 + }, + { + "epoch": 3.3660922217047045, + "grad_norm": 0.2911454668206605, + "learning_rate": 3.216181229773463e-05, + "loss": 0.3826, + "step": 3615 + }, + { + "epoch": 3.3707498835584535, + "grad_norm": 0.30221815752117726, + "learning_rate": 3.2129449838187706e-05, + "loss": 0.3822, + "step": 3620 + }, + { + "epoch": 3.375407545412203, + "grad_norm": 0.2794516738068307, + "learning_rate": 3.2097087378640776e-05, + "loss": 0.3919, + "step": 3625 + }, + { + "epoch": 3.3800652072659525, + "grad_norm": 0.3011158384207955, + "learning_rate": 3.206472491909385e-05, + "loss": 0.3847, + "step": 3630 + }, + { + "epoch": 3.384722869119702, + "grad_norm": 0.2739006993400138, + "learning_rate": 3.203236245954693e-05, + "loss": 0.3949, + "step": 3635 + }, + { + "epoch": 3.3893805309734515, + "grad_norm": 0.25395578448798356, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.3723, + "step": 3640 + }, + { + "epoch": 3.3940381928272005, + "grad_norm": 0.29246657684261074, + "learning_rate": 3.1967637540453075e-05, + "loss": 0.3889, + "step": 3645 + }, + { + "epoch": 3.39869585468095, + "grad_norm": 0.2853839589454004, + "learning_rate": 3.193527508090615e-05, + "loss": 0.3795, + "step": 3650 + }, + { + "epoch": 3.4033535165346995, + "grad_norm": 0.27538025167907765, + "learning_rate": 3.190291262135923e-05, + "loss": 0.3964, + "step": 3655 + }, + { + "epoch": 3.408011178388449, + "grad_norm": 0.2793994213733302, + "learning_rate": 3.1870550161812305e-05, + "loss": 0.3813, + "step": 3660 + }, + { + "epoch": 3.4126688402421985, + "grad_norm": 0.27325487831809636, + "learning_rate": 3.1838187702265375e-05, + "loss": 0.3817, + "step": 3665 + }, + { + "epoch": 3.417326502095948, + "grad_norm": 0.27544264466801227, + "learning_rate": 3.1805825242718444e-05, + "loss": 0.3959, + "step": 3670 + }, + { + "epoch": 3.421984163949697, + "grad_norm": 0.29269746788784207, + "learning_rate": 3.177346278317152e-05, + "loss": 0.3827, + "step": 3675 + }, + { + "epoch": 3.4266418258034466, + "grad_norm": 0.2716112401157802, + "learning_rate": 3.17411003236246e-05, + "loss": 0.3757, + "step": 3680 + }, + { + "epoch": 3.431299487657196, + "grad_norm": 0.2722022576763241, + "learning_rate": 3.170873786407767e-05, + "loss": 0.3903, + "step": 3685 + }, + { + "epoch": 3.4359571495109456, + "grad_norm": 0.27203797852537565, + "learning_rate": 3.1676375404530744e-05, + "loss": 0.3815, + "step": 3690 + }, + { + "epoch": 3.440614811364695, + "grad_norm": 0.2977608399442475, + "learning_rate": 3.164401294498382e-05, + "loss": 0.384, + "step": 3695 + }, + { + "epoch": 3.445272473218444, + "grad_norm": 0.2744028942156667, + "learning_rate": 3.16116504854369e-05, + "loss": 0.3928, + "step": 3700 + }, + { + "epoch": 3.4499301350721936, + "grad_norm": 0.2530992346697563, + "learning_rate": 3.157928802588997e-05, + "loss": 0.3945, + "step": 3705 + }, + { + "epoch": 3.454587796925943, + "grad_norm": 0.2668517004773288, + "learning_rate": 3.154692556634304e-05, + "loss": 0.39, + "step": 3710 + }, + { + "epoch": 3.4592454587796926, + "grad_norm": 0.2850822875751711, + "learning_rate": 3.151456310679612e-05, + "loss": 0.3803, + "step": 3715 + }, + { + "epoch": 3.463903120633442, + "grad_norm": 0.27417386624832685, + "learning_rate": 3.1482200647249196e-05, + "loss": 0.3831, + "step": 3720 + }, + { + "epoch": 3.4685607824871916, + "grad_norm": 0.27493865538797385, + "learning_rate": 3.1449838187702266e-05, + "loss": 0.3931, + "step": 3725 + }, + { + "epoch": 3.473218444340941, + "grad_norm": 0.28177535307624046, + "learning_rate": 3.141747572815534e-05, + "loss": 0.3879, + "step": 3730 + }, + { + "epoch": 3.47787610619469, + "grad_norm": 0.2829667377578548, + "learning_rate": 3.138511326860842e-05, + "loss": 0.3923, + "step": 3735 + }, + { + "epoch": 3.4825337680484396, + "grad_norm": 0.2674923275403164, + "learning_rate": 3.135275080906149e-05, + "loss": 0.3828, + "step": 3740 + }, + { + "epoch": 3.487191429902189, + "grad_norm": 0.27103683627144687, + "learning_rate": 3.132038834951456e-05, + "loss": 0.3879, + "step": 3745 + }, + { + "epoch": 3.4918490917559386, + "grad_norm": 0.28765532743751443, + "learning_rate": 3.1288025889967636e-05, + "loss": 0.392, + "step": 3750 + }, + { + "epoch": 3.496506753609688, + "grad_norm": 0.25654156325072325, + "learning_rate": 3.125566343042071e-05, + "loss": 0.3876, + "step": 3755 + }, + { + "epoch": 3.501164415463437, + "grad_norm": 0.26945747468313935, + "learning_rate": 3.122330097087379e-05, + "loss": 0.3858, + "step": 3760 + }, + { + "epoch": 3.5058220773171866, + "grad_norm": 0.2820454488625458, + "learning_rate": 3.119093851132686e-05, + "loss": 0.3787, + "step": 3765 + }, + { + "epoch": 3.510479739170936, + "grad_norm": 0.29939288089971683, + "learning_rate": 3.1158576051779935e-05, + "loss": 0.3925, + "step": 3770 + }, + { + "epoch": 3.5151374010246856, + "grad_norm": 0.2794718494657326, + "learning_rate": 3.112621359223301e-05, + "loss": 0.3813, + "step": 3775 + }, + { + "epoch": 3.519795062878435, + "grad_norm": 0.2552939514511198, + "learning_rate": 3.109385113268609e-05, + "loss": 0.3704, + "step": 3780 + }, + { + "epoch": 3.5244527247321846, + "grad_norm": 0.26867450601995413, + "learning_rate": 3.106148867313916e-05, + "loss": 0.3902, + "step": 3785 + }, + { + "epoch": 3.529110386585934, + "grad_norm": 0.26301103443758594, + "learning_rate": 3.1029126213592234e-05, + "loss": 0.3847, + "step": 3790 + }, + { + "epoch": 3.533768048439683, + "grad_norm": 0.26969144672133993, + "learning_rate": 3.099676375404531e-05, + "loss": 0.3814, + "step": 3795 + }, + { + "epoch": 3.5384257102934327, + "grad_norm": 0.25876446057457136, + "learning_rate": 3.096440129449839e-05, + "loss": 0.3933, + "step": 3800 + }, + { + "epoch": 3.543083372147182, + "grad_norm": 0.26528598864564545, + "learning_rate": 3.093203883495146e-05, + "loss": 0.3999, + "step": 3805 + }, + { + "epoch": 3.5477410340009317, + "grad_norm": 0.25395748015440367, + "learning_rate": 3.0899676375404534e-05, + "loss": 0.3922, + "step": 3810 + }, + { + "epoch": 3.5523986958546807, + "grad_norm": 0.2539446461729014, + "learning_rate": 3.0867313915857604e-05, + "loss": 0.3901, + "step": 3815 + }, + { + "epoch": 3.55705635770843, + "grad_norm": 0.25587630611112755, + "learning_rate": 3.083495145631068e-05, + "loss": 0.3809, + "step": 3820 + }, + { + "epoch": 3.5617140195621797, + "grad_norm": 0.2646247097511274, + "learning_rate": 3.080258899676376e-05, + "loss": 0.3952, + "step": 3825 + }, + { + "epoch": 3.566371681415929, + "grad_norm": 0.2657954284334948, + "learning_rate": 3.077022653721683e-05, + "loss": 0.3889, + "step": 3830 + }, + { + "epoch": 3.5710293432696787, + "grad_norm": 0.2536065507085372, + "learning_rate": 3.07378640776699e-05, + "loss": 0.3922, + "step": 3835 + }, + { + "epoch": 3.575687005123428, + "grad_norm": 0.28212563334640933, + "learning_rate": 3.070550161812298e-05, + "loss": 0.3853, + "step": 3840 + }, + { + "epoch": 3.5803446669771777, + "grad_norm": 0.2891735614020263, + "learning_rate": 3.0673139158576056e-05, + "loss": 0.3806, + "step": 3845 + }, + { + "epoch": 3.5850023288309267, + "grad_norm": 0.27718664019958394, + "learning_rate": 3.0640776699029126e-05, + "loss": 0.3865, + "step": 3850 + }, + { + "epoch": 3.5896599906846762, + "grad_norm": 0.2603433525061592, + "learning_rate": 3.06084142394822e-05, + "loss": 0.3957, + "step": 3855 + }, + { + "epoch": 3.5943176525384257, + "grad_norm": 0.2875738160319884, + "learning_rate": 3.057605177993528e-05, + "loss": 0.3889, + "step": 3860 + }, + { + "epoch": 3.598975314392175, + "grad_norm": 0.29818723262331726, + "learning_rate": 3.054368932038835e-05, + "loss": 0.3936, + "step": 3865 + }, + { + "epoch": 3.6036329762459243, + "grad_norm": 0.25405865808063205, + "learning_rate": 3.0511326860841426e-05, + "loss": 0.389, + "step": 3870 + }, + { + "epoch": 3.6082906380996738, + "grad_norm": 0.26055927208792357, + "learning_rate": 3.0478964401294502e-05, + "loss": 0.3889, + "step": 3875 + }, + { + "epoch": 3.6129482999534233, + "grad_norm": 0.2700398384860422, + "learning_rate": 3.0446601941747575e-05, + "loss": 0.3914, + "step": 3880 + }, + { + "epoch": 3.6176059618071728, + "grad_norm": 0.270324050559537, + "learning_rate": 3.0414239482200645e-05, + "loss": 0.3862, + "step": 3885 + }, + { + "epoch": 3.6222636236609222, + "grad_norm": 0.2601990175502833, + "learning_rate": 3.0381877022653722e-05, + "loss": 0.3861, + "step": 3890 + }, + { + "epoch": 3.6269212855146717, + "grad_norm": 0.26161758347757474, + "learning_rate": 3.0349514563106795e-05, + "loss": 0.3892, + "step": 3895 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 0.2489041481801808, + "learning_rate": 3.031715210355987e-05, + "loss": 0.3762, + "step": 3900 + }, + { + "epoch": 3.6362366092221707, + "grad_norm": 0.26774224401033997, + "learning_rate": 3.0284789644012945e-05, + "loss": 0.3959, + "step": 3905 + }, + { + "epoch": 3.64089427107592, + "grad_norm": 0.26793933237265394, + "learning_rate": 3.025242718446602e-05, + "loss": 0.3864, + "step": 3910 + }, + { + "epoch": 3.6455519329296693, + "grad_norm": 0.2585379407540758, + "learning_rate": 3.0220064724919094e-05, + "loss": 0.3905, + "step": 3915 + }, + { + "epoch": 3.6502095947834188, + "grad_norm": 0.26491393957203, + "learning_rate": 3.018770226537217e-05, + "loss": 0.3812, + "step": 3920 + }, + { + "epoch": 3.6548672566371683, + "grad_norm": 0.271400161850077, + "learning_rate": 3.0155339805825244e-05, + "loss": 0.3689, + "step": 3925 + }, + { + "epoch": 3.6595249184909173, + "grad_norm": 0.25988470519759704, + "learning_rate": 3.012297734627832e-05, + "loss": 0.3873, + "step": 3930 + }, + { + "epoch": 3.664182580344667, + "grad_norm": 0.2687443667800867, + "learning_rate": 3.0090614886731394e-05, + "loss": 0.3852, + "step": 3935 + }, + { + "epoch": 3.6688402421984163, + "grad_norm": 0.24634214735412785, + "learning_rate": 3.005825242718447e-05, + "loss": 0.3884, + "step": 3940 + }, + { + "epoch": 3.673497904052166, + "grad_norm": 0.2798971204953813, + "learning_rate": 3.0025889967637544e-05, + "loss": 0.3843, + "step": 3945 + }, + { + "epoch": 3.6781555659059153, + "grad_norm": 0.2606601001038783, + "learning_rate": 2.999352750809062e-05, + "loss": 0.3773, + "step": 3950 + }, + { + "epoch": 3.682813227759665, + "grad_norm": 0.25113689388450894, + "learning_rate": 2.9961165048543693e-05, + "loss": 0.3911, + "step": 3955 + }, + { + "epoch": 3.6874708896134143, + "grad_norm": 0.31146062762708887, + "learning_rate": 2.9928802588996763e-05, + "loss": 0.3932, + "step": 3960 + }, + { + "epoch": 3.6921285514671633, + "grad_norm": 0.2667929766530086, + "learning_rate": 2.9896440129449836e-05, + "loss": 0.3862, + "step": 3965 + }, + { + "epoch": 3.696786213320913, + "grad_norm": 0.2659300669598008, + "learning_rate": 2.9864077669902913e-05, + "loss": 0.3951, + "step": 3970 + }, + { + "epoch": 3.7014438751746623, + "grad_norm": 0.2505557400077424, + "learning_rate": 2.9831715210355986e-05, + "loss": 0.3762, + "step": 3975 + }, + { + "epoch": 3.706101537028412, + "grad_norm": 0.26179678001104023, + "learning_rate": 2.9799352750809063e-05, + "loss": 0.3819, + "step": 3980 + }, + { + "epoch": 3.710759198882161, + "grad_norm": 0.2655073543872324, + "learning_rate": 2.9766990291262136e-05, + "loss": 0.4004, + "step": 3985 + }, + { + "epoch": 3.7154168607359104, + "grad_norm": 0.27257211675153703, + "learning_rate": 2.9734627831715212e-05, + "loss": 0.3887, + "step": 3990 + }, + { + "epoch": 3.72007452258966, + "grad_norm": 0.2530649928660773, + "learning_rate": 2.9702265372168285e-05, + "loss": 0.3826, + "step": 3995 + }, + { + "epoch": 3.7247321844434094, + "grad_norm": 0.25498988538656236, + "learning_rate": 2.9669902912621362e-05, + "loss": 0.3865, + "step": 4000 + }, + { + "epoch": 3.729389846297159, + "grad_norm": 0.27534519248019096, + "learning_rate": 2.9637540453074435e-05, + "loss": 0.3861, + "step": 4005 + }, + { + "epoch": 3.7340475081509084, + "grad_norm": 0.30351479908172263, + "learning_rate": 2.9605177993527512e-05, + "loss": 0.3936, + "step": 4010 + }, + { + "epoch": 3.738705170004658, + "grad_norm": 0.26297033246002766, + "learning_rate": 2.9572815533980585e-05, + "loss": 0.3794, + "step": 4015 + }, + { + "epoch": 3.7433628318584073, + "grad_norm": 0.2570828812364618, + "learning_rate": 2.954045307443366e-05, + "loss": 0.3935, + "step": 4020 + }, + { + "epoch": 3.7480204937121564, + "grad_norm": 0.26413532586678856, + "learning_rate": 2.9508090614886735e-05, + "loss": 0.3902, + "step": 4025 + }, + { + "epoch": 3.752678155565906, + "grad_norm": 0.2469097728757831, + "learning_rate": 2.9475728155339804e-05, + "loss": 0.3871, + "step": 4030 + }, + { + "epoch": 3.7573358174196554, + "grad_norm": 0.25017081158294974, + "learning_rate": 2.944336569579288e-05, + "loss": 0.3844, + "step": 4035 + }, + { + "epoch": 3.761993479273405, + "grad_norm": 0.2618028697384336, + "learning_rate": 2.9411003236245954e-05, + "loss": 0.3816, + "step": 4040 + }, + { + "epoch": 3.766651141127154, + "grad_norm": 0.2589025120578359, + "learning_rate": 2.9378640776699027e-05, + "loss": 0.3987, + "step": 4045 + }, + { + "epoch": 3.7713088029809034, + "grad_norm": 0.25799806537651776, + "learning_rate": 2.9346278317152104e-05, + "loss": 0.3922, + "step": 4050 + }, + { + "epoch": 3.775966464834653, + "grad_norm": 0.2663926898446476, + "learning_rate": 2.9313915857605177e-05, + "loss": 0.3962, + "step": 4055 + }, + { + "epoch": 3.7806241266884024, + "grad_norm": 0.2603178754321631, + "learning_rate": 2.9281553398058254e-05, + "loss": 0.3857, + "step": 4060 + }, + { + "epoch": 3.785281788542152, + "grad_norm": 0.26731069796356594, + "learning_rate": 2.9249190938511327e-05, + "loss": 0.381, + "step": 4065 + }, + { + "epoch": 3.7899394503959014, + "grad_norm": 0.26696106303225803, + "learning_rate": 2.9216828478964403e-05, + "loss": 0.3862, + "step": 4070 + }, + { + "epoch": 3.794597112249651, + "grad_norm": 0.2815662554612125, + "learning_rate": 2.9184466019417477e-05, + "loss": 0.3826, + "step": 4075 + }, + { + "epoch": 3.7992547741034, + "grad_norm": 0.2827948401073868, + "learning_rate": 2.9152103559870553e-05, + "loss": 0.3891, + "step": 4080 + }, + { + "epoch": 3.8039124359571495, + "grad_norm": 0.2581943852613212, + "learning_rate": 2.9119741100323626e-05, + "loss": 0.3883, + "step": 4085 + }, + { + "epoch": 3.808570097810899, + "grad_norm": 0.2392873592198476, + "learning_rate": 2.9087378640776703e-05, + "loss": 0.3863, + "step": 4090 + }, + { + "epoch": 3.8132277596646484, + "grad_norm": 0.2584092344259934, + "learning_rate": 2.9055016181229776e-05, + "loss": 0.3811, + "step": 4095 + }, + { + "epoch": 3.8178854215183975, + "grad_norm": 0.27433228887528766, + "learning_rate": 2.9022653721682853e-05, + "loss": 0.3832, + "step": 4100 + }, + { + "epoch": 3.822543083372147, + "grad_norm": 0.2555586480404662, + "learning_rate": 2.8990291262135922e-05, + "loss": 0.3823, + "step": 4105 + }, + { + "epoch": 3.8272007452258965, + "grad_norm": 0.2562741954663834, + "learning_rate": 2.8957928802588996e-05, + "loss": 0.385, + "step": 4110 + }, + { + "epoch": 3.831858407079646, + "grad_norm": 0.2747688007417003, + "learning_rate": 2.8925566343042072e-05, + "loss": 0.3929, + "step": 4115 + }, + { + "epoch": 3.8365160689333955, + "grad_norm": 0.2751272213075399, + "learning_rate": 2.8893203883495145e-05, + "loss": 0.3802, + "step": 4120 + }, + { + "epoch": 3.841173730787145, + "grad_norm": 0.2866223815197106, + "learning_rate": 2.8860841423948222e-05, + "loss": 0.3918, + "step": 4125 + }, + { + "epoch": 3.8458313926408945, + "grad_norm": 0.28003334091875887, + "learning_rate": 2.8828478964401295e-05, + "loss": 0.3947, + "step": 4130 + }, + { + "epoch": 3.850489054494644, + "grad_norm": 0.27526826453198927, + "learning_rate": 2.879611650485437e-05, + "loss": 0.3922, + "step": 4135 + }, + { + "epoch": 3.855146716348393, + "grad_norm": 0.2693495968482176, + "learning_rate": 2.8763754045307445e-05, + "loss": 0.3895, + "step": 4140 + }, + { + "epoch": 3.8598043782021425, + "grad_norm": 0.26923804782011435, + "learning_rate": 2.873139158576052e-05, + "loss": 0.3921, + "step": 4145 + }, + { + "epoch": 3.864462040055892, + "grad_norm": 0.2569074358143511, + "learning_rate": 2.8699029126213595e-05, + "loss": 0.3805, + "step": 4150 + }, + { + "epoch": 3.8691197019096415, + "grad_norm": 0.26120815562925404, + "learning_rate": 2.8666666666666668e-05, + "loss": 0.3841, + "step": 4155 + }, + { + "epoch": 3.8737773637633905, + "grad_norm": 0.2656997451883316, + "learning_rate": 2.8634304207119744e-05, + "loss": 0.4023, + "step": 4160 + }, + { + "epoch": 3.87843502561714, + "grad_norm": 0.27576343279594656, + "learning_rate": 2.8601941747572818e-05, + "loss": 0.3942, + "step": 4165 + }, + { + "epoch": 3.8830926874708895, + "grad_norm": 0.2708624364712976, + "learning_rate": 2.8569579288025894e-05, + "loss": 0.3849, + "step": 4170 + }, + { + "epoch": 3.887750349324639, + "grad_norm": 0.2692406892963796, + "learning_rate": 2.8537216828478964e-05, + "loss": 0.393, + "step": 4175 + }, + { + "epoch": 3.8924080111783885, + "grad_norm": 0.25240465246199195, + "learning_rate": 2.8504854368932037e-05, + "loss": 0.3879, + "step": 4180 + }, + { + "epoch": 3.897065673032138, + "grad_norm": 0.2612808657935316, + "learning_rate": 2.8472491909385114e-05, + "loss": 0.3895, + "step": 4185 + }, + { + "epoch": 3.9017233348858875, + "grad_norm": 0.24624516475912692, + "learning_rate": 2.8440129449838187e-05, + "loss": 0.388, + "step": 4190 + }, + { + "epoch": 3.9063809967396366, + "grad_norm": 0.25821877656832976, + "learning_rate": 2.8407766990291263e-05, + "loss": 0.3802, + "step": 4195 + }, + { + "epoch": 3.911038658593386, + "grad_norm": 0.2706771888411189, + "learning_rate": 2.8375404530744337e-05, + "loss": 0.3835, + "step": 4200 + }, + { + "epoch": 3.9156963204471356, + "grad_norm": 0.26183511758896194, + "learning_rate": 2.8343042071197413e-05, + "loss": 0.3885, + "step": 4205 + }, + { + "epoch": 3.920353982300885, + "grad_norm": 0.25796419196414927, + "learning_rate": 2.8310679611650486e-05, + "loss": 0.3837, + "step": 4210 + }, + { + "epoch": 3.925011644154634, + "grad_norm": 0.25902566823988277, + "learning_rate": 2.8278317152103563e-05, + "loss": 0.3817, + "step": 4215 + }, + { + "epoch": 3.9296693060083836, + "grad_norm": 0.2616577356628845, + "learning_rate": 2.8245954692556636e-05, + "loss": 0.3853, + "step": 4220 + }, + { + "epoch": 3.934326967862133, + "grad_norm": 0.27134376397369114, + "learning_rate": 2.8213592233009713e-05, + "loss": 0.383, + "step": 4225 + }, + { + "epoch": 3.9389846297158826, + "grad_norm": 0.26252907445849977, + "learning_rate": 2.8181229773462786e-05, + "loss": 0.3864, + "step": 4230 + }, + { + "epoch": 3.943642291569632, + "grad_norm": 0.2727230567833132, + "learning_rate": 2.8148867313915862e-05, + "loss": 0.3848, + "step": 4235 + }, + { + "epoch": 3.9482999534233816, + "grad_norm": 0.2523302047492875, + "learning_rate": 2.8116504854368935e-05, + "loss": 0.3816, + "step": 4240 + }, + { + "epoch": 3.952957615277131, + "grad_norm": 0.2528613621966959, + "learning_rate": 2.8084142394822012e-05, + "loss": 0.3819, + "step": 4245 + }, + { + "epoch": 3.9576152771308806, + "grad_norm": 0.2550015777299334, + "learning_rate": 2.805177993527508e-05, + "loss": 0.3868, + "step": 4250 + }, + { + "epoch": 3.9622729389846296, + "grad_norm": 0.2606325297186323, + "learning_rate": 2.8019417475728155e-05, + "loss": 0.3871, + "step": 4255 + }, + { + "epoch": 3.966930600838379, + "grad_norm": 0.2641775467222413, + "learning_rate": 2.7987055016181228e-05, + "loss": 0.3938, + "step": 4260 + }, + { + "epoch": 3.9715882626921286, + "grad_norm": 0.26248135354387697, + "learning_rate": 2.7954692556634305e-05, + "loss": 0.3933, + "step": 4265 + }, + { + "epoch": 3.976245924545878, + "grad_norm": 0.2489884177697842, + "learning_rate": 2.7922330097087378e-05, + "loss": 0.3879, + "step": 4270 + }, + { + "epoch": 3.980903586399627, + "grad_norm": 0.26477408983487055, + "learning_rate": 2.7889967637540454e-05, + "loss": 0.3948, + "step": 4275 + }, + { + "epoch": 3.9855612482533767, + "grad_norm": 0.2996748671144824, + "learning_rate": 2.7857605177993528e-05, + "loss": 0.392, + "step": 4280 + }, + { + "epoch": 3.990218910107126, + "grad_norm": 0.2577741645300303, + "learning_rate": 2.7825242718446604e-05, + "loss": 0.3825, + "step": 4285 + }, + { + "epoch": 3.9948765719608756, + "grad_norm": 0.2604730744906666, + "learning_rate": 2.7792880258899677e-05, + "loss": 0.389, + "step": 4290 + }, + { + "epoch": 3.999534233814625, + "grad_norm": 0.2554837869509496, + "learning_rate": 2.7760517799352754e-05, + "loss": 0.39, + "step": 4295 + }, + { + "epoch": 4.003726129483, + "grad_norm": 0.30922681334743707, + "learning_rate": 2.7728155339805827e-05, + "loss": 0.3427, + "step": 4300 + }, + { + "epoch": 4.008383791336749, + "grad_norm": 0.301227570163748, + "learning_rate": 2.7695792880258904e-05, + "loss": 0.3331, + "step": 4305 + }, + { + "epoch": 4.013041453190499, + "grad_norm": 0.2991715513747825, + "learning_rate": 2.7663430420711977e-05, + "loss": 0.3375, + "step": 4310 + }, + { + "epoch": 4.017699115044247, + "grad_norm": 0.28312685959454287, + "learning_rate": 2.7631067961165053e-05, + "loss": 0.3322, + "step": 4315 + }, + { + "epoch": 4.022356776897997, + "grad_norm": 0.29405199530826526, + "learning_rate": 2.7598705501618123e-05, + "loss": 0.3345, + "step": 4320 + }, + { + "epoch": 4.027014438751746, + "grad_norm": 0.29429720520135993, + "learning_rate": 2.7566343042071196e-05, + "loss": 0.3259, + "step": 4325 + }, + { + "epoch": 4.031672100605496, + "grad_norm": 0.2637447275165894, + "learning_rate": 2.7533980582524273e-05, + "loss": 0.3359, + "step": 4330 + }, + { + "epoch": 4.036329762459245, + "grad_norm": 0.27150321164414054, + "learning_rate": 2.7501618122977346e-05, + "loss": 0.3387, + "step": 4335 + }, + { + "epoch": 4.040987424312995, + "grad_norm": 0.2884600945776732, + "learning_rate": 2.7469255663430423e-05, + "loss": 0.337, + "step": 4340 + }, + { + "epoch": 4.045645086166744, + "grad_norm": 0.27462833663202424, + "learning_rate": 2.7436893203883496e-05, + "loss": 0.3332, + "step": 4345 + }, + { + "epoch": 4.050302748020494, + "grad_norm": 0.2770260272624138, + "learning_rate": 2.740453074433657e-05, + "loss": 0.34, + "step": 4350 + }, + { + "epoch": 4.054960409874243, + "grad_norm": 0.2719819262086612, + "learning_rate": 2.7372168284789646e-05, + "loss": 0.3448, + "step": 4355 + }, + { + "epoch": 4.059618071727993, + "grad_norm": 0.2937422211184152, + "learning_rate": 2.733980582524272e-05, + "loss": 0.3453, + "step": 4360 + }, + { + "epoch": 4.064275733581742, + "grad_norm": 0.2883040671392013, + "learning_rate": 2.7307443365695795e-05, + "loss": 0.3419, + "step": 4365 + }, + { + "epoch": 4.068933395435492, + "grad_norm": 0.28028997899065444, + "learning_rate": 2.727508090614887e-05, + "loss": 0.3416, + "step": 4370 + }, + { + "epoch": 4.07359105728924, + "grad_norm": 0.2747243694429081, + "learning_rate": 2.7242718446601945e-05, + "loss": 0.3367, + "step": 4375 + }, + { + "epoch": 4.07824871914299, + "grad_norm": 0.28036218802618706, + "learning_rate": 2.7210355987055018e-05, + "loss": 0.336, + "step": 4380 + }, + { + "epoch": 4.082906380996739, + "grad_norm": 0.288073842340157, + "learning_rate": 2.7177993527508095e-05, + "loss": 0.335, + "step": 4385 + }, + { + "epoch": 4.087564042850489, + "grad_norm": 0.2840079345646035, + "learning_rate": 2.7145631067961165e-05, + "loss": 0.3428, + "step": 4390 + }, + { + "epoch": 4.092221704704238, + "grad_norm": 0.274581937492412, + "learning_rate": 2.7113268608414238e-05, + "loss": 0.3394, + "step": 4395 + }, + { + "epoch": 4.096879366557988, + "grad_norm": 0.2772673030668756, + "learning_rate": 2.7080906148867314e-05, + "loss": 0.3394, + "step": 4400 + }, + { + "epoch": 4.101537028411737, + "grad_norm": 0.27849278411701783, + "learning_rate": 2.7048543689320388e-05, + "loss": 0.3386, + "step": 4405 + }, + { + "epoch": 4.106194690265487, + "grad_norm": 0.2796044183413613, + "learning_rate": 2.7016181229773464e-05, + "loss": 0.3415, + "step": 4410 + }, + { + "epoch": 4.110852352119236, + "grad_norm": 0.275451109703783, + "learning_rate": 2.6983818770226537e-05, + "loss": 0.3387, + "step": 4415 + }, + { + "epoch": 4.115510013972986, + "grad_norm": 0.2961787378625091, + "learning_rate": 2.6951456310679614e-05, + "loss": 0.3472, + "step": 4420 + }, + { + "epoch": 4.120167675826735, + "grad_norm": 0.2834404893264586, + "learning_rate": 2.6919093851132687e-05, + "loss": 0.3419, + "step": 4425 + }, + { + "epoch": 4.124825337680484, + "grad_norm": 0.2770535443435555, + "learning_rate": 2.6886731391585764e-05, + "loss": 0.3459, + "step": 4430 + }, + { + "epoch": 4.1294829995342335, + "grad_norm": 0.276070086645229, + "learning_rate": 2.6854368932038837e-05, + "loss": 0.3323, + "step": 4435 + }, + { + "epoch": 4.134140661387983, + "grad_norm": 0.26597945758759256, + "learning_rate": 2.6822006472491913e-05, + "loss": 0.3415, + "step": 4440 + }, + { + "epoch": 4.1387983232417325, + "grad_norm": 0.28650783158027116, + "learning_rate": 2.6789644012944986e-05, + "loss": 0.3422, + "step": 4445 + }, + { + "epoch": 4.143455985095482, + "grad_norm": 0.2731057495828754, + "learning_rate": 2.6757281553398063e-05, + "loss": 0.3429, + "step": 4450 + }, + { + "epoch": 4.1481136469492315, + "grad_norm": 0.2785744835419355, + "learning_rate": 2.6724919093851136e-05, + "loss": 0.3412, + "step": 4455 + }, + { + "epoch": 4.152771308802981, + "grad_norm": 0.27582090334567066, + "learning_rate": 2.669255663430421e-05, + "loss": 0.3403, + "step": 4460 + }, + { + "epoch": 4.1574289706567304, + "grad_norm": 0.2761543634938907, + "learning_rate": 2.666019417475728e-05, + "loss": 0.35, + "step": 4465 + }, + { + "epoch": 4.16208663251048, + "grad_norm": 0.2720827997393211, + "learning_rate": 2.6627831715210356e-05, + "loss": 0.3482, + "step": 4470 + }, + { + "epoch": 4.166744294364229, + "grad_norm": 0.2887979203991473, + "learning_rate": 2.659546925566343e-05, + "loss": 0.3406, + "step": 4475 + }, + { + "epoch": 4.171401956217979, + "grad_norm": 0.2827047506321513, + "learning_rate": 2.6563106796116505e-05, + "loss": 0.3402, + "step": 4480 + }, + { + "epoch": 4.1760596180717275, + "grad_norm": 0.2772822053172354, + "learning_rate": 2.653074433656958e-05, + "loss": 0.3299, + "step": 4485 + }, + { + "epoch": 4.180717279925477, + "grad_norm": 0.29272264893784194, + "learning_rate": 2.6498381877022655e-05, + "loss": 0.3354, + "step": 4490 + }, + { + "epoch": 4.1853749417792265, + "grad_norm": 0.29375422060955253, + "learning_rate": 2.646601941747573e-05, + "loss": 0.3453, + "step": 4495 + }, + { + "epoch": 4.190032603632976, + "grad_norm": 0.29150751325996316, + "learning_rate": 2.6433656957928805e-05, + "loss": 0.3484, + "step": 4500 + }, + { + "epoch": 4.1946902654867255, + "grad_norm": 0.2817529454463103, + "learning_rate": 2.6401294498381878e-05, + "loss": 0.3371, + "step": 4505 + }, + { + "epoch": 4.199347927340475, + "grad_norm": 0.28966074414240217, + "learning_rate": 2.6368932038834955e-05, + "loss": 0.3451, + "step": 4510 + }, + { + "epoch": 4.2040055891942245, + "grad_norm": 0.27391509686820326, + "learning_rate": 2.6336569579288028e-05, + "loss": 0.3416, + "step": 4515 + }, + { + "epoch": 4.208663251047974, + "grad_norm": 0.29148215450693565, + "learning_rate": 2.6304207119741104e-05, + "loss": 0.3414, + "step": 4520 + }, + { + "epoch": 4.2133209129017235, + "grad_norm": 0.2689398936859596, + "learning_rate": 2.6271844660194178e-05, + "loss": 0.3385, + "step": 4525 + }, + { + "epoch": 4.217978574755473, + "grad_norm": 0.296520115240737, + "learning_rate": 2.6239482200647254e-05, + "loss": 0.34, + "step": 4530 + }, + { + "epoch": 4.2226362366092225, + "grad_norm": 0.2819120898954846, + "learning_rate": 2.620711974110032e-05, + "loss": 0.3413, + "step": 4535 + }, + { + "epoch": 4.227293898462972, + "grad_norm": 0.3030595014424858, + "learning_rate": 2.6174757281553397e-05, + "loss": 0.3446, + "step": 4540 + }, + { + "epoch": 4.231951560316721, + "grad_norm": 0.28596944705353855, + "learning_rate": 2.614239482200647e-05, + "loss": 0.3447, + "step": 4545 + }, + { + "epoch": 4.23660922217047, + "grad_norm": 0.2820542054589958, + "learning_rate": 2.6110032362459547e-05, + "loss": 0.3424, + "step": 4550 + }, + { + "epoch": 4.24126688402422, + "grad_norm": 0.3143861183376282, + "learning_rate": 2.607766990291262e-05, + "loss": 0.3464, + "step": 4555 + }, + { + "epoch": 4.245924545877969, + "grad_norm": 0.28997006041292267, + "learning_rate": 2.6045307443365697e-05, + "loss": 0.3503, + "step": 4560 + }, + { + "epoch": 4.250582207731719, + "grad_norm": 0.29817368907759895, + "learning_rate": 2.601294498381877e-05, + "loss": 0.3433, + "step": 4565 + }, + { + "epoch": 4.255239869585468, + "grad_norm": 0.28644175397133737, + "learning_rate": 2.5980582524271846e-05, + "loss": 0.3504, + "step": 4570 + }, + { + "epoch": 4.259897531439218, + "grad_norm": 0.3080648006556646, + "learning_rate": 2.594822006472492e-05, + "loss": 0.35, + "step": 4575 + }, + { + "epoch": 4.264555193292967, + "grad_norm": 0.28181312998638813, + "learning_rate": 2.5915857605177996e-05, + "loss": 0.347, + "step": 4580 + }, + { + "epoch": 4.269212855146717, + "grad_norm": 0.26855612943955576, + "learning_rate": 2.588349514563107e-05, + "loss": 0.3456, + "step": 4585 + }, + { + "epoch": 4.273870517000466, + "grad_norm": 0.2813470462397564, + "learning_rate": 2.5851132686084146e-05, + "loss": 0.3424, + "step": 4590 + }, + { + "epoch": 4.2785281788542155, + "grad_norm": 0.27916875991725926, + "learning_rate": 2.581877022653722e-05, + "loss": 0.3466, + "step": 4595 + }, + { + "epoch": 4.283185840707965, + "grad_norm": 0.2698085291333574, + "learning_rate": 2.5786407766990296e-05, + "loss": 0.3474, + "step": 4600 + }, + { + "epoch": 4.287843502561714, + "grad_norm": 0.265844277649668, + "learning_rate": 2.575404530744337e-05, + "loss": 0.3524, + "step": 4605 + }, + { + "epoch": 4.292501164415463, + "grad_norm": 0.2697893276218071, + "learning_rate": 2.572168284789644e-05, + "loss": 0.3432, + "step": 4610 + }, + { + "epoch": 4.297158826269213, + "grad_norm": 0.3027818789291518, + "learning_rate": 2.5689320388349515e-05, + "loss": 0.3435, + "step": 4615 + }, + { + "epoch": 4.301816488122962, + "grad_norm": 0.3097305617204279, + "learning_rate": 2.5656957928802588e-05, + "loss": 0.3409, + "step": 4620 + }, + { + "epoch": 4.306474149976712, + "grad_norm": 0.28377648125088384, + "learning_rate": 2.5624595469255665e-05, + "loss": 0.3456, + "step": 4625 + }, + { + "epoch": 4.311131811830461, + "grad_norm": 0.30292279319148513, + "learning_rate": 2.5592233009708738e-05, + "loss": 0.3485, + "step": 4630 + }, + { + "epoch": 4.315789473684211, + "grad_norm": 0.2783600029192179, + "learning_rate": 2.5559870550161815e-05, + "loss": 0.3439, + "step": 4635 + }, + { + "epoch": 4.32044713553796, + "grad_norm": 0.25988299657809694, + "learning_rate": 2.5527508090614888e-05, + "loss": 0.3428, + "step": 4640 + }, + { + "epoch": 4.32510479739171, + "grad_norm": 0.281776274609003, + "learning_rate": 2.549514563106796e-05, + "loss": 0.3423, + "step": 4645 + }, + { + "epoch": 4.329762459245459, + "grad_norm": 0.2848401214493112, + "learning_rate": 2.5462783171521038e-05, + "loss": 0.343, + "step": 4650 + }, + { + "epoch": 4.334420121099209, + "grad_norm": 0.26808676440877494, + "learning_rate": 2.543042071197411e-05, + "loss": 0.3423, + "step": 4655 + }, + { + "epoch": 4.339077782952957, + "grad_norm": 0.26625329707544965, + "learning_rate": 2.5398058252427187e-05, + "loss": 0.3468, + "step": 4660 + }, + { + "epoch": 4.343735444806707, + "grad_norm": 0.2760378023774553, + "learning_rate": 2.536569579288026e-05, + "loss": 0.3526, + "step": 4665 + }, + { + "epoch": 4.348393106660456, + "grad_norm": 0.27430108921888785, + "learning_rate": 2.5333333333333337e-05, + "loss": 0.3481, + "step": 4670 + }, + { + "epoch": 4.353050768514206, + "grad_norm": 0.2890731689439146, + "learning_rate": 2.530097087378641e-05, + "loss": 0.3384, + "step": 4675 + }, + { + "epoch": 4.357708430367955, + "grad_norm": 0.2850035961825501, + "learning_rate": 2.526860841423948e-05, + "loss": 0.3511, + "step": 4680 + }, + { + "epoch": 4.362366092221705, + "grad_norm": 0.28371174995581067, + "learning_rate": 2.5236245954692557e-05, + "loss": 0.341, + "step": 4685 + }, + { + "epoch": 4.367023754075454, + "grad_norm": 0.2798334297146016, + "learning_rate": 2.520388349514563e-05, + "loss": 0.3457, + "step": 4690 + }, + { + "epoch": 4.371681415929204, + "grad_norm": 0.2792780079485981, + "learning_rate": 2.5171521035598706e-05, + "loss": 0.3421, + "step": 4695 + }, + { + "epoch": 4.376339077782953, + "grad_norm": 0.2789377083550942, + "learning_rate": 2.513915857605178e-05, + "loss": 0.3526, + "step": 4700 + }, + { + "epoch": 4.380996739636703, + "grad_norm": 0.27147206127013335, + "learning_rate": 2.5106796116504856e-05, + "loss": 0.3441, + "step": 4705 + }, + { + "epoch": 4.385654401490452, + "grad_norm": 0.2902924692605414, + "learning_rate": 2.507443365695793e-05, + "loss": 0.3529, + "step": 4710 + }, + { + "epoch": 4.390312063344201, + "grad_norm": 0.27929557612231476, + "learning_rate": 2.5042071197411006e-05, + "loss": 0.3378, + "step": 4715 + }, + { + "epoch": 4.39496972519795, + "grad_norm": 0.2701750459913034, + "learning_rate": 2.500970873786408e-05, + "loss": 0.3412, + "step": 4720 + }, + { + "epoch": 4.3996273870517, + "grad_norm": 0.2799269464041573, + "learning_rate": 2.4977346278317155e-05, + "loss": 0.3375, + "step": 4725 + }, + { + "epoch": 4.404285048905449, + "grad_norm": 0.28078679287279756, + "learning_rate": 2.494498381877023e-05, + "loss": 0.3526, + "step": 4730 + }, + { + "epoch": 4.408942710759199, + "grad_norm": 0.28802952806823595, + "learning_rate": 2.4912621359223302e-05, + "loss": 0.3514, + "step": 4735 + }, + { + "epoch": 4.413600372612948, + "grad_norm": 0.28721454128911067, + "learning_rate": 2.4880258899676375e-05, + "loss": 0.3351, + "step": 4740 + }, + { + "epoch": 4.418258034466698, + "grad_norm": 0.2906670016825979, + "learning_rate": 2.484789644012945e-05, + "loss": 0.347, + "step": 4745 + }, + { + "epoch": 4.422915696320447, + "grad_norm": 0.2780805909530155, + "learning_rate": 2.4815533980582525e-05, + "loss": 0.3412, + "step": 4750 + }, + { + "epoch": 4.427573358174197, + "grad_norm": 0.3024089657369089, + "learning_rate": 2.47831715210356e-05, + "loss": 0.3547, + "step": 4755 + }, + { + "epoch": 4.432231020027946, + "grad_norm": 0.2807347536015423, + "learning_rate": 2.4750809061488674e-05, + "loss": 0.3461, + "step": 4760 + }, + { + "epoch": 4.436888681881696, + "grad_norm": 0.27572550054354583, + "learning_rate": 2.471844660194175e-05, + "loss": 0.3462, + "step": 4765 + }, + { + "epoch": 4.441546343735445, + "grad_norm": 0.2845970279662639, + "learning_rate": 2.468608414239482e-05, + "loss": 0.3372, + "step": 4770 + }, + { + "epoch": 4.446204005589194, + "grad_norm": 0.2798998146127669, + "learning_rate": 2.4653721682847897e-05, + "loss": 0.3453, + "step": 4775 + }, + { + "epoch": 4.450861667442943, + "grad_norm": 0.2951093743961065, + "learning_rate": 2.462135922330097e-05, + "loss": 0.3476, + "step": 4780 + }, + { + "epoch": 4.455519329296693, + "grad_norm": 0.2587475261460553, + "learning_rate": 2.4588996763754047e-05, + "loss": 0.3525, + "step": 4785 + }, + { + "epoch": 4.460176991150442, + "grad_norm": 0.2772934314044936, + "learning_rate": 2.455663430420712e-05, + "loss": 0.3441, + "step": 4790 + }, + { + "epoch": 4.464834653004192, + "grad_norm": 0.28261470030353486, + "learning_rate": 2.4524271844660197e-05, + "loss": 0.3473, + "step": 4795 + }, + { + "epoch": 4.469492314857941, + "grad_norm": 0.26471592322331733, + "learning_rate": 2.449190938511327e-05, + "loss": 0.3505, + "step": 4800 + }, + { + "epoch": 4.474149976711691, + "grad_norm": 0.28724363547599, + "learning_rate": 2.4459546925566343e-05, + "loss": 0.3487, + "step": 4805 + }, + { + "epoch": 4.47880763856544, + "grad_norm": 0.2959670533347613, + "learning_rate": 2.4427184466019416e-05, + "loss": 0.3451, + "step": 4810 + }, + { + "epoch": 4.48346530041919, + "grad_norm": 0.27338912373615454, + "learning_rate": 2.4394822006472493e-05, + "loss": 0.3406, + "step": 4815 + }, + { + "epoch": 4.488122962272939, + "grad_norm": 0.27552301008710245, + "learning_rate": 2.4362459546925566e-05, + "loss": 0.352, + "step": 4820 + }, + { + "epoch": 4.492780624126689, + "grad_norm": 0.5518125177948608, + "learning_rate": 2.4330097087378643e-05, + "loss": 0.3506, + "step": 4825 + }, + { + "epoch": 4.497438285980438, + "grad_norm": 0.27618297840980366, + "learning_rate": 2.4297734627831716e-05, + "loss": 0.3528, + "step": 4830 + }, + { + "epoch": 4.502095947834187, + "grad_norm": 0.2725652864541843, + "learning_rate": 2.4265372168284792e-05, + "loss": 0.3477, + "step": 4835 + }, + { + "epoch": 4.506753609687936, + "grad_norm": 0.2688666248203594, + "learning_rate": 2.4233009708737866e-05, + "loss": 0.3492, + "step": 4840 + }, + { + "epoch": 4.511411271541686, + "grad_norm": 0.2675264289953943, + "learning_rate": 2.420064724919094e-05, + "loss": 0.3463, + "step": 4845 + }, + { + "epoch": 4.516068933395435, + "grad_norm": 0.28151463623190387, + "learning_rate": 2.4168284789644012e-05, + "loss": 0.3398, + "step": 4850 + }, + { + "epoch": 4.520726595249185, + "grad_norm": 0.2566560814830423, + "learning_rate": 2.413592233009709e-05, + "loss": 0.3482, + "step": 4855 + }, + { + "epoch": 4.525384257102934, + "grad_norm": 0.2898263552733961, + "learning_rate": 2.4103559870550162e-05, + "loss": 0.3418, + "step": 4860 + }, + { + "epoch": 4.530041918956684, + "grad_norm": 0.2588298954480233, + "learning_rate": 2.4071197411003238e-05, + "loss": 0.347, + "step": 4865 + }, + { + "epoch": 4.534699580810433, + "grad_norm": 0.28811816872383217, + "learning_rate": 2.403883495145631e-05, + "loss": 0.3489, + "step": 4870 + }, + { + "epoch": 4.539357242664183, + "grad_norm": 0.27020662162198467, + "learning_rate": 2.4006472491909388e-05, + "loss": 0.3498, + "step": 4875 + }, + { + "epoch": 4.544014904517932, + "grad_norm": 0.2736851785899911, + "learning_rate": 2.397411003236246e-05, + "loss": 0.3505, + "step": 4880 + }, + { + "epoch": 4.548672566371682, + "grad_norm": 0.270114303022711, + "learning_rate": 2.3941747572815534e-05, + "loss": 0.3485, + "step": 4885 + }, + { + "epoch": 4.55333022822543, + "grad_norm": 0.274061760735231, + "learning_rate": 2.3909385113268608e-05, + "loss": 0.3454, + "step": 4890 + }, + { + "epoch": 4.55798789007918, + "grad_norm": 0.27592510610491877, + "learning_rate": 2.3877022653721684e-05, + "loss": 0.3435, + "step": 4895 + }, + { + "epoch": 4.562645551932929, + "grad_norm": 0.26265886365244917, + "learning_rate": 2.3844660194174757e-05, + "loss": 0.3381, + "step": 4900 + }, + { + "epoch": 4.567303213786679, + "grad_norm": 0.2675318112084943, + "learning_rate": 2.3812297734627834e-05, + "loss": 0.3479, + "step": 4905 + }, + { + "epoch": 4.571960875640428, + "grad_norm": 0.2756756004486905, + "learning_rate": 2.3779935275080907e-05, + "loss": 0.3454, + "step": 4910 + }, + { + "epoch": 4.576618537494178, + "grad_norm": 0.2744517804738508, + "learning_rate": 2.374757281553398e-05, + "loss": 0.3422, + "step": 4915 + }, + { + "epoch": 4.581276199347927, + "grad_norm": 0.2956500056572687, + "learning_rate": 2.3715210355987057e-05, + "loss": 0.3431, + "step": 4920 + }, + { + "epoch": 4.585933861201677, + "grad_norm": 0.2790678584215101, + "learning_rate": 2.368284789644013e-05, + "loss": 0.3496, + "step": 4925 + }, + { + "epoch": 4.590591523055426, + "grad_norm": 0.2765275041694904, + "learning_rate": 2.3650485436893206e-05, + "loss": 0.3423, + "step": 4930 + }, + { + "epoch": 4.595249184909176, + "grad_norm": 0.24998145054628612, + "learning_rate": 2.361812297734628e-05, + "loss": 0.3504, + "step": 4935 + }, + { + "epoch": 4.599906846762925, + "grad_norm": 0.2687141625914439, + "learning_rate": 2.3585760517799356e-05, + "loss": 0.3413, + "step": 4940 + }, + { + "epoch": 4.604564508616674, + "grad_norm": 0.25987178416925466, + "learning_rate": 2.355339805825243e-05, + "loss": 0.3569, + "step": 4945 + }, + { + "epoch": 4.6092221704704235, + "grad_norm": 0.2784445234026133, + "learning_rate": 2.3521035598705503e-05, + "loss": 0.3437, + "step": 4950 + }, + { + "epoch": 4.613879832324173, + "grad_norm": 0.2639561735354289, + "learning_rate": 2.3488673139158576e-05, + "loss": 0.3507, + "step": 4955 + }, + { + "epoch": 4.6185374941779225, + "grad_norm": 0.26122970993507066, + "learning_rate": 2.3456310679611652e-05, + "loss": 0.3426, + "step": 4960 + }, + { + "epoch": 4.623195156031672, + "grad_norm": 0.2692150538527661, + "learning_rate": 2.3423948220064725e-05, + "loss": 0.3546, + "step": 4965 + }, + { + "epoch": 4.6278528178854215, + "grad_norm": 0.2733463787678732, + "learning_rate": 2.3391585760517802e-05, + "loss": 0.3525, + "step": 4970 + }, + { + "epoch": 4.632510479739171, + "grad_norm": 0.27159880615014076, + "learning_rate": 2.3359223300970875e-05, + "loss": 0.3391, + "step": 4975 + }, + { + "epoch": 4.6371681415929205, + "grad_norm": 0.2727071919572588, + "learning_rate": 2.3326860841423952e-05, + "loss": 0.3407, + "step": 4980 + }, + { + "epoch": 4.64182580344667, + "grad_norm": 0.2700456284616228, + "learning_rate": 2.3294498381877025e-05, + "loss": 0.355, + "step": 4985 + }, + { + "epoch": 4.6464834653004194, + "grad_norm": 0.2676835694946083, + "learning_rate": 2.3262135922330098e-05, + "loss": 0.3413, + "step": 4990 + }, + { + "epoch": 4.651141127154169, + "grad_norm": 0.27720368166900317, + "learning_rate": 2.322977346278317e-05, + "loss": 0.3489, + "step": 4995 + }, + { + "epoch": 4.6557987890079175, + "grad_norm": 0.28112682498727826, + "learning_rate": 2.3197411003236248e-05, + "loss": 0.3498, + "step": 5000 + }, + { + "epoch": 4.660456450861668, + "grad_norm": 0.27696629138280643, + "learning_rate": 2.316504854368932e-05, + "loss": 0.3482, + "step": 5005 + }, + { + "epoch": 4.6651141127154165, + "grad_norm": 0.26509317898455204, + "learning_rate": 2.3132686084142398e-05, + "loss": 0.3526, + "step": 5010 + }, + { + "epoch": 4.669771774569166, + "grad_norm": 0.2661649794027691, + "learning_rate": 2.310032362459547e-05, + "loss": 0.3471, + "step": 5015 + }, + { + "epoch": 4.6744294364229155, + "grad_norm": 0.2742085688046022, + "learning_rate": 2.3067961165048547e-05, + "loss": 0.348, + "step": 5020 + }, + { + "epoch": 4.679087098276665, + "grad_norm": 0.2631833779800896, + "learning_rate": 2.3035598705501617e-05, + "loss": 0.3491, + "step": 5025 + }, + { + "epoch": 4.6837447601304145, + "grad_norm": 0.27825788482078895, + "learning_rate": 2.3003236245954694e-05, + "loss": 0.3453, + "step": 5030 + }, + { + "epoch": 4.688402421984164, + "grad_norm": 0.28050281118191694, + "learning_rate": 2.2970873786407767e-05, + "loss": 0.3462, + "step": 5035 + }, + { + "epoch": 4.6930600838379135, + "grad_norm": 0.2577816537435826, + "learning_rate": 2.2938511326860843e-05, + "loss": 0.3447, + "step": 5040 + }, + { + "epoch": 4.697717745691663, + "grad_norm": 0.2682148115169009, + "learning_rate": 2.2906148867313917e-05, + "loss": 0.3409, + "step": 5045 + }, + { + "epoch": 4.7023754075454125, + "grad_norm": 0.2716317439276806, + "learning_rate": 2.2873786407766993e-05, + "loss": 0.3431, + "step": 5050 + }, + { + "epoch": 4.707033069399162, + "grad_norm": 0.26484905556040395, + "learning_rate": 2.2841423948220066e-05, + "loss": 0.344, + "step": 5055 + }, + { + "epoch": 4.7116907312529115, + "grad_norm": 0.2582513351019725, + "learning_rate": 2.280906148867314e-05, + "loss": 0.3438, + "step": 5060 + }, + { + "epoch": 4.71634839310666, + "grad_norm": 0.27263840516335347, + "learning_rate": 2.2776699029126213e-05, + "loss": 0.3405, + "step": 5065 + }, + { + "epoch": 4.72100605496041, + "grad_norm": 0.2636007854508017, + "learning_rate": 2.274433656957929e-05, + "loss": 0.3444, + "step": 5070 + }, + { + "epoch": 4.725663716814159, + "grad_norm": 0.28450549420320026, + "learning_rate": 2.2711974110032362e-05, + "loss": 0.3495, + "step": 5075 + }, + { + "epoch": 4.730321378667909, + "grad_norm": 0.29049335289255, + "learning_rate": 2.267961165048544e-05, + "loss": 0.3501, + "step": 5080 + }, + { + "epoch": 4.734979040521658, + "grad_norm": 0.2617782030129591, + "learning_rate": 2.2647249190938512e-05, + "loss": 0.3412, + "step": 5085 + }, + { + "epoch": 4.739636702375408, + "grad_norm": 0.26619791820681776, + "learning_rate": 2.261488673139159e-05, + "loss": 0.3465, + "step": 5090 + }, + { + "epoch": 4.744294364229157, + "grad_norm": 0.258803310255694, + "learning_rate": 2.258252427184466e-05, + "loss": 0.3401, + "step": 5095 + }, + { + "epoch": 4.748952026082907, + "grad_norm": 0.26462008992087993, + "learning_rate": 2.2550161812297735e-05, + "loss": 0.3485, + "step": 5100 + }, + { + "epoch": 4.753609687936656, + "grad_norm": 0.28270089001402976, + "learning_rate": 2.2517799352750808e-05, + "loss": 0.3501, + "step": 5105 + }, + { + "epoch": 4.7582673497904056, + "grad_norm": 0.2768800118594251, + "learning_rate": 2.2485436893203885e-05, + "loss": 0.3483, + "step": 5110 + }, + { + "epoch": 4.762925011644155, + "grad_norm": 0.2827237369746407, + "learning_rate": 2.2453074433656958e-05, + "loss": 0.3455, + "step": 5115 + }, + { + "epoch": 4.767582673497904, + "grad_norm": 0.2586423898934621, + "learning_rate": 2.2420711974110035e-05, + "loss": 0.3404, + "step": 5120 + }, + { + "epoch": 4.772240335351653, + "grad_norm": 0.2695792090511496, + "learning_rate": 2.2388349514563108e-05, + "loss": 0.3469, + "step": 5125 + }, + { + "epoch": 4.776897997205403, + "grad_norm": 0.2728769912631749, + "learning_rate": 2.235598705501618e-05, + "loss": 0.3497, + "step": 5130 + }, + { + "epoch": 4.781555659059152, + "grad_norm": 0.26886469462732937, + "learning_rate": 2.2323624595469254e-05, + "loss": 0.3584, + "step": 5135 + }, + { + "epoch": 4.786213320912902, + "grad_norm": 0.2569094746861235, + "learning_rate": 2.229126213592233e-05, + "loss": 0.341, + "step": 5140 + }, + { + "epoch": 4.790870982766651, + "grad_norm": 0.27323842192783715, + "learning_rate": 2.2258899676375404e-05, + "loss": 0.3473, + "step": 5145 + }, + { + "epoch": 4.795528644620401, + "grad_norm": 0.2779329948321792, + "learning_rate": 2.222653721682848e-05, + "loss": 0.3341, + "step": 5150 + }, + { + "epoch": 4.80018630647415, + "grad_norm": 0.27433246496954844, + "learning_rate": 2.2194174757281554e-05, + "loss": 0.3397, + "step": 5155 + }, + { + "epoch": 4.8048439683279, + "grad_norm": 0.2751671836627462, + "learning_rate": 2.216181229773463e-05, + "loss": 0.34, + "step": 5160 + }, + { + "epoch": 4.809501630181649, + "grad_norm": 0.2581618089575914, + "learning_rate": 2.2129449838187703e-05, + "loss": 0.351, + "step": 5165 + }, + { + "epoch": 4.814159292035399, + "grad_norm": 0.28479693685585583, + "learning_rate": 2.2097087378640777e-05, + "loss": 0.3525, + "step": 5170 + }, + { + "epoch": 4.818816953889147, + "grad_norm": 0.30270734662749793, + "learning_rate": 2.2064724919093853e-05, + "loss": 0.3553, + "step": 5175 + }, + { + "epoch": 4.823474615742897, + "grad_norm": 0.2592937709832869, + "learning_rate": 2.2032362459546926e-05, + "loss": 0.3429, + "step": 5180 + }, + { + "epoch": 4.828132277596646, + "grad_norm": 0.263554513112591, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3427, + "step": 5185 + }, + { + "epoch": 4.832789939450396, + "grad_norm": 0.27555447618027107, + "learning_rate": 2.1967637540453076e-05, + "loss": 0.3568, + "step": 5190 + }, + { + "epoch": 4.837447601304145, + "grad_norm": 0.27340251424360784, + "learning_rate": 2.193527508090615e-05, + "loss": 0.346, + "step": 5195 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 0.25997046231793325, + "learning_rate": 2.1902912621359226e-05, + "loss": 0.3418, + "step": 5200 + }, + { + "epoch": 4.846762925011644, + "grad_norm": 0.2610912590644731, + "learning_rate": 2.18705501618123e-05, + "loss": 0.3515, + "step": 5205 + }, + { + "epoch": 4.851420586865394, + "grad_norm": 0.27044344081098914, + "learning_rate": 2.1838187702265372e-05, + "loss": 0.356, + "step": 5210 + }, + { + "epoch": 4.856078248719143, + "grad_norm": 0.2823486349622019, + "learning_rate": 2.180582524271845e-05, + "loss": 0.3483, + "step": 5215 + }, + { + "epoch": 4.860735910572893, + "grad_norm": 0.26661920969071545, + "learning_rate": 2.1773462783171522e-05, + "loss": 0.3495, + "step": 5220 + }, + { + "epoch": 4.865393572426642, + "grad_norm": 0.2601414397083011, + "learning_rate": 2.17411003236246e-05, + "loss": 0.3409, + "step": 5225 + }, + { + "epoch": 4.870051234280391, + "grad_norm": 0.25772456138046534, + "learning_rate": 2.170873786407767e-05, + "loss": 0.3476, + "step": 5230 + }, + { + "epoch": 4.874708896134141, + "grad_norm": 0.2801575006987134, + "learning_rate": 2.1676375404530748e-05, + "loss": 0.3535, + "step": 5235 + }, + { + "epoch": 4.87936655798789, + "grad_norm": 0.2713153646610175, + "learning_rate": 2.1644012944983818e-05, + "loss": 0.3455, + "step": 5240 + }, + { + "epoch": 4.884024219841639, + "grad_norm": 0.2756939128532273, + "learning_rate": 2.1611650485436894e-05, + "loss": 0.3581, + "step": 5245 + }, + { + "epoch": 4.888681881695389, + "grad_norm": 0.27593933048729546, + "learning_rate": 2.1579288025889968e-05, + "loss": 0.3559, + "step": 5250 + }, + { + "epoch": 4.893339543549138, + "grad_norm": 0.28570776066307496, + "learning_rate": 2.1546925566343044e-05, + "loss": 0.3478, + "step": 5255 + }, + { + "epoch": 4.897997205402888, + "grad_norm": 0.27757994536033176, + "learning_rate": 2.1514563106796117e-05, + "loss": 0.3413, + "step": 5260 + }, + { + "epoch": 4.902654867256637, + "grad_norm": 0.2656187787344544, + "learning_rate": 2.1482200647249194e-05, + "loss": 0.3592, + "step": 5265 + }, + { + "epoch": 4.907312529110387, + "grad_norm": 0.2658878897171109, + "learning_rate": 2.1449838187702267e-05, + "loss": 0.3506, + "step": 5270 + }, + { + "epoch": 4.911970190964136, + "grad_norm": 0.2689864272819296, + "learning_rate": 2.141747572815534e-05, + "loss": 0.3482, + "step": 5275 + }, + { + "epoch": 4.916627852817886, + "grad_norm": 0.25500517516010174, + "learning_rate": 2.1385113268608413e-05, + "loss": 0.3499, + "step": 5280 + }, + { + "epoch": 4.921285514671635, + "grad_norm": 0.26771225102965857, + "learning_rate": 2.135275080906149e-05, + "loss": 0.3453, + "step": 5285 + }, + { + "epoch": 4.925943176525385, + "grad_norm": 0.27129980615123656, + "learning_rate": 2.1320388349514563e-05, + "loss": 0.3539, + "step": 5290 + }, + { + "epoch": 4.930600838379133, + "grad_norm": 0.2696112163961816, + "learning_rate": 2.128802588996764e-05, + "loss": 0.3435, + "step": 5295 + }, + { + "epoch": 4.935258500232883, + "grad_norm": 0.2538508151162818, + "learning_rate": 2.1255663430420713e-05, + "loss": 0.348, + "step": 5300 + }, + { + "epoch": 4.939916162086632, + "grad_norm": 0.2758886557259427, + "learning_rate": 2.122330097087379e-05, + "loss": 0.3517, + "step": 5305 + }, + { + "epoch": 4.944573823940382, + "grad_norm": 0.27270125908117077, + "learning_rate": 2.119093851132686e-05, + "loss": 0.3467, + "step": 5310 + }, + { + "epoch": 4.949231485794131, + "grad_norm": 0.26641381930592944, + "learning_rate": 2.1158576051779936e-05, + "loss": 0.3395, + "step": 5315 + }, + { + "epoch": 4.953889147647881, + "grad_norm": 0.2671391952368209, + "learning_rate": 2.112621359223301e-05, + "loss": 0.348, + "step": 5320 + }, + { + "epoch": 4.95854680950163, + "grad_norm": 0.27534994898249343, + "learning_rate": 2.1093851132686086e-05, + "loss": 0.3505, + "step": 5325 + }, + { + "epoch": 4.96320447135538, + "grad_norm": 0.26284212092487064, + "learning_rate": 2.106148867313916e-05, + "loss": 0.3497, + "step": 5330 + }, + { + "epoch": 4.967862133209129, + "grad_norm": 0.2528206388631256, + "learning_rate": 2.1029126213592235e-05, + "loss": 0.3379, + "step": 5335 + }, + { + "epoch": 4.972519795062879, + "grad_norm": 0.25393533764045556, + "learning_rate": 2.099676375404531e-05, + "loss": 0.3485, + "step": 5340 + }, + { + "epoch": 4.977177456916628, + "grad_norm": 0.26312186320913344, + "learning_rate": 2.0964401294498385e-05, + "loss": 0.3434, + "step": 5345 + }, + { + "epoch": 4.981835118770377, + "grad_norm": 0.2675083313597985, + "learning_rate": 2.0932038834951455e-05, + "loss": 0.3388, + "step": 5350 + }, + { + "epoch": 4.986492780624126, + "grad_norm": 0.2656481802559307, + "learning_rate": 2.089967637540453e-05, + "loss": 0.3492, + "step": 5355 + }, + { + "epoch": 4.991150442477876, + "grad_norm": 0.2715437365012533, + "learning_rate": 2.0867313915857605e-05, + "loss": 0.3347, + "step": 5360 + }, + { + "epoch": 4.995808104331625, + "grad_norm": 0.27608544405178936, + "learning_rate": 2.083495145631068e-05, + "loss": 0.3497, + "step": 5365 + }, + { + "epoch": 5.0, + "grad_norm": 0.26313815985857575, + "learning_rate": 2.0802588996763754e-05, + "loss": 0.3378, + "step": 5370 + }, + { + "epoch": 5.0046576618537495, + "grad_norm": 0.35923261940748996, + "learning_rate": 2.077022653721683e-05, + "loss": 0.3066, + "step": 5375 + }, + { + "epoch": 5.009315323707499, + "grad_norm": 0.32640043938876945, + "learning_rate": 2.0737864077669904e-05, + "loss": 0.3022, + "step": 5380 + }, + { + "epoch": 5.0139729855612485, + "grad_norm": 0.32506467004826806, + "learning_rate": 2.0705501618122977e-05, + "loss": 0.299, + "step": 5385 + }, + { + "epoch": 5.018630647414998, + "grad_norm": 0.3047003724926295, + "learning_rate": 2.067313915857605e-05, + "loss": 0.3079, + "step": 5390 + }, + { + "epoch": 5.0232883092687475, + "grad_norm": 0.2890942126979149, + "learning_rate": 2.0640776699029127e-05, + "loss": 0.2949, + "step": 5395 + }, + { + "epoch": 5.027945971122496, + "grad_norm": 0.28379140694538696, + "learning_rate": 2.06084142394822e-05, + "loss": 0.2933, + "step": 5400 + }, + { + "epoch": 5.032603632976246, + "grad_norm": 0.29498361686431485, + "learning_rate": 2.0576051779935277e-05, + "loss": 0.2944, + "step": 5405 + }, + { + "epoch": 5.037261294829995, + "grad_norm": 0.2857039981729688, + "learning_rate": 2.054368932038835e-05, + "loss": 0.296, + "step": 5410 + }, + { + "epoch": 5.041918956683745, + "grad_norm": 0.3213640227256369, + "learning_rate": 2.0511326860841426e-05, + "loss": 0.2968, + "step": 5415 + }, + { + "epoch": 5.046576618537494, + "grad_norm": 0.2935458031818663, + "learning_rate": 2.04789644012945e-05, + "loss": 0.292, + "step": 5420 + }, + { + "epoch": 5.051234280391244, + "grad_norm": 0.278771134528813, + "learning_rate": 2.0446601941747573e-05, + "loss": 0.2963, + "step": 5425 + }, + { + "epoch": 5.055891942244993, + "grad_norm": 0.28885954698934563, + "learning_rate": 2.041423948220065e-05, + "loss": 0.3027, + "step": 5430 + }, + { + "epoch": 5.0605496040987425, + "grad_norm": 0.2985162846167875, + "learning_rate": 2.0381877022653723e-05, + "loss": 0.2855, + "step": 5435 + }, + { + "epoch": 5.065207265952492, + "grad_norm": 0.2779933359640956, + "learning_rate": 2.0349514563106796e-05, + "loss": 0.2899, + "step": 5440 + }, + { + "epoch": 5.0698649278062415, + "grad_norm": 0.2794291563396326, + "learning_rate": 2.0317152103559872e-05, + "loss": 0.2948, + "step": 5445 + }, + { + "epoch": 5.074522589659991, + "grad_norm": 0.2782833031018338, + "learning_rate": 2.0284789644012945e-05, + "loss": 0.3055, + "step": 5450 + }, + { + "epoch": 5.0791802515137405, + "grad_norm": 0.3084450159275583, + "learning_rate": 2.025242718446602e-05, + "loss": 0.3048, + "step": 5455 + }, + { + "epoch": 5.083837913367489, + "grad_norm": 0.28616079154167284, + "learning_rate": 2.0220064724919095e-05, + "loss": 0.2873, + "step": 5460 + }, + { + "epoch": 5.088495575221239, + "grad_norm": 0.2933049647365567, + "learning_rate": 2.018770226537217e-05, + "loss": 0.2995, + "step": 5465 + }, + { + "epoch": 5.093153237074988, + "grad_norm": 0.2985619669338097, + "learning_rate": 2.0155339805825245e-05, + "loss": 0.303, + "step": 5470 + }, + { + "epoch": 5.097810898928738, + "grad_norm": 0.287924432538155, + "learning_rate": 2.0122977346278318e-05, + "loss": 0.304, + "step": 5475 + }, + { + "epoch": 5.102468560782487, + "grad_norm": 0.2822730331503238, + "learning_rate": 2.0090614886731395e-05, + "loss": 0.3096, + "step": 5480 + }, + { + "epoch": 5.107126222636237, + "grad_norm": 0.2881739389332346, + "learning_rate": 2.0058252427184468e-05, + "loss": 0.2994, + "step": 5485 + }, + { + "epoch": 5.111783884489986, + "grad_norm": 0.27816726175124123, + "learning_rate": 2.0025889967637544e-05, + "loss": 0.2958, + "step": 5490 + }, + { + "epoch": 5.116441546343736, + "grad_norm": 0.3030122639921386, + "learning_rate": 1.9993527508090614e-05, + "loss": 0.3091, + "step": 5495 + }, + { + "epoch": 5.121099208197485, + "grad_norm": 0.2966032003252686, + "learning_rate": 1.996116504854369e-05, + "loss": 0.3026, + "step": 5500 + }, + { + "epoch": 5.125756870051235, + "grad_norm": 0.28524766975263094, + "learning_rate": 1.9928802588996764e-05, + "loss": 0.301, + "step": 5505 + }, + { + "epoch": 5.130414531904984, + "grad_norm": 0.28364105344562807, + "learning_rate": 1.989644012944984e-05, + "loss": 0.3103, + "step": 5510 + }, + { + "epoch": 5.135072193758733, + "grad_norm": 0.28685104135946954, + "learning_rate": 1.9864077669902914e-05, + "loss": 0.2932, + "step": 5515 + }, + { + "epoch": 5.139729855612482, + "grad_norm": 0.28023219847238934, + "learning_rate": 1.983171521035599e-05, + "loss": 0.3046, + "step": 5520 + }, + { + "epoch": 5.144387517466232, + "grad_norm": 0.2983844446151215, + "learning_rate": 1.9799352750809063e-05, + "loss": 0.3006, + "step": 5525 + }, + { + "epoch": 5.149045179319981, + "grad_norm": 0.2888586797330622, + "learning_rate": 1.9766990291262137e-05, + "loss": 0.2979, + "step": 5530 + }, + { + "epoch": 5.153702841173731, + "grad_norm": 0.2991770317047864, + "learning_rate": 1.973462783171521e-05, + "loss": 0.3026, + "step": 5535 + }, + { + "epoch": 5.15836050302748, + "grad_norm": 0.29147195431068673, + "learning_rate": 1.9702265372168286e-05, + "loss": 0.2988, + "step": 5540 + }, + { + "epoch": 5.16301816488123, + "grad_norm": 0.29769143133613796, + "learning_rate": 1.966990291262136e-05, + "loss": 0.3001, + "step": 5545 + }, + { + "epoch": 5.167675826734979, + "grad_norm": 0.28832867719266875, + "learning_rate": 1.9637540453074436e-05, + "loss": 0.3064, + "step": 5550 + }, + { + "epoch": 5.172333488588729, + "grad_norm": 0.2969967378852645, + "learning_rate": 1.960517799352751e-05, + "loss": 0.3038, + "step": 5555 + }, + { + "epoch": 5.176991150442478, + "grad_norm": 0.2887367922059911, + "learning_rate": 1.9572815533980586e-05, + "loss": 0.3145, + "step": 5560 + }, + { + "epoch": 5.181648812296228, + "grad_norm": 0.2738537643409762, + "learning_rate": 1.9540453074433656e-05, + "loss": 0.2996, + "step": 5565 + }, + { + "epoch": 5.186306474149977, + "grad_norm": 0.2823866205348465, + "learning_rate": 1.9508090614886732e-05, + "loss": 0.3011, + "step": 5570 + }, + { + "epoch": 5.190964136003726, + "grad_norm": 0.26814278929933427, + "learning_rate": 1.9475728155339805e-05, + "loss": 0.303, + "step": 5575 + }, + { + "epoch": 5.195621797857475, + "grad_norm": 0.2878790608953289, + "learning_rate": 1.9443365695792882e-05, + "loss": 0.2982, + "step": 5580 + }, + { + "epoch": 5.200279459711225, + "grad_norm": 0.28478623464513214, + "learning_rate": 1.9411003236245955e-05, + "loss": 0.3002, + "step": 5585 + }, + { + "epoch": 5.204937121564974, + "grad_norm": 0.29267564895392156, + "learning_rate": 1.937864077669903e-05, + "loss": 0.3043, + "step": 5590 + }, + { + "epoch": 5.209594783418724, + "grad_norm": 0.30782221674287835, + "learning_rate": 1.9346278317152105e-05, + "loss": 0.3035, + "step": 5595 + }, + { + "epoch": 5.214252445272473, + "grad_norm": 0.2862863533195408, + "learning_rate": 1.9313915857605178e-05, + "loss": 0.307, + "step": 5600 + }, + { + "epoch": 5.218910107126223, + "grad_norm": 0.2849550623168926, + "learning_rate": 1.928155339805825e-05, + "loss": 0.3069, + "step": 5605 + }, + { + "epoch": 5.223567768979972, + "grad_norm": 0.29920473766249195, + "learning_rate": 1.9249190938511328e-05, + "loss": 0.3099, + "step": 5610 + }, + { + "epoch": 5.228225430833722, + "grad_norm": 0.2899994453541254, + "learning_rate": 1.92168284789644e-05, + "loss": 0.2988, + "step": 5615 + }, + { + "epoch": 5.232883092687471, + "grad_norm": 0.2745311500475131, + "learning_rate": 1.9184466019417478e-05, + "loss": 0.3013, + "step": 5620 + }, + { + "epoch": 5.237540754541221, + "grad_norm": 0.2905887239671658, + "learning_rate": 1.915210355987055e-05, + "loss": 0.2939, + "step": 5625 + }, + { + "epoch": 5.242198416394969, + "grad_norm": 0.2756889111772824, + "learning_rate": 1.9119741100323627e-05, + "loss": 0.2998, + "step": 5630 + }, + { + "epoch": 5.246856078248719, + "grad_norm": 0.29560085688678456, + "learning_rate": 1.9087378640776697e-05, + "loss": 0.3051, + "step": 5635 + }, + { + "epoch": 5.251513740102468, + "grad_norm": 0.28723648162439525, + "learning_rate": 1.9055016181229774e-05, + "loss": 0.2974, + "step": 5640 + }, + { + "epoch": 5.256171401956218, + "grad_norm": 0.275091484400794, + "learning_rate": 1.9022653721682847e-05, + "loss": 0.3014, + "step": 5645 + }, + { + "epoch": 5.260829063809967, + "grad_norm": 0.2924049738193532, + "learning_rate": 1.8990291262135923e-05, + "loss": 0.3022, + "step": 5650 + }, + { + "epoch": 5.265486725663717, + "grad_norm": 0.29754127995135604, + "learning_rate": 1.8957928802588997e-05, + "loss": 0.2999, + "step": 5655 + }, + { + "epoch": 5.270144387517466, + "grad_norm": 0.2798089481799948, + "learning_rate": 1.8925566343042073e-05, + "loss": 0.2991, + "step": 5660 + }, + { + "epoch": 5.274802049371216, + "grad_norm": 0.2749452867239249, + "learning_rate": 1.8893203883495146e-05, + "loss": 0.3046, + "step": 5665 + }, + { + "epoch": 5.279459711224965, + "grad_norm": 0.29331315085638715, + "learning_rate": 1.8860841423948223e-05, + "loss": 0.3, + "step": 5670 + }, + { + "epoch": 5.284117373078715, + "grad_norm": 0.2928312452903107, + "learning_rate": 1.8828478964401296e-05, + "loss": 0.2964, + "step": 5675 + }, + { + "epoch": 5.288775034932464, + "grad_norm": 0.3008544248483986, + "learning_rate": 1.879611650485437e-05, + "loss": 0.3098, + "step": 5680 + }, + { + "epoch": 5.293432696786214, + "grad_norm": 0.28553735790247403, + "learning_rate": 1.8763754045307442e-05, + "loss": 0.3047, + "step": 5685 + }, + { + "epoch": 5.298090358639962, + "grad_norm": 0.2887305220130655, + "learning_rate": 1.873139158576052e-05, + "loss": 0.3044, + "step": 5690 + }, + { + "epoch": 5.302748020493712, + "grad_norm": 0.27448891372378204, + "learning_rate": 1.8699029126213592e-05, + "loss": 0.3035, + "step": 5695 + }, + { + "epoch": 5.307405682347461, + "grad_norm": 0.28295005491441444, + "learning_rate": 1.866666666666667e-05, + "loss": 0.3005, + "step": 5700 + }, + { + "epoch": 5.312063344201211, + "grad_norm": 0.2725195115220492, + "learning_rate": 1.8634304207119742e-05, + "loss": 0.3001, + "step": 5705 + }, + { + "epoch": 5.31672100605496, + "grad_norm": 0.27847794549500177, + "learning_rate": 1.8601941747572815e-05, + "loss": 0.305, + "step": 5710 + }, + { + "epoch": 5.32137866790871, + "grad_norm": 0.2839018387739063, + "learning_rate": 1.856957928802589e-05, + "loss": 0.3049, + "step": 5715 + }, + { + "epoch": 5.326036329762459, + "grad_norm": 0.3021144133018839, + "learning_rate": 1.8537216828478965e-05, + "loss": 0.3037, + "step": 5720 + }, + { + "epoch": 5.330693991616209, + "grad_norm": 0.2972052723877232, + "learning_rate": 1.850485436893204e-05, + "loss": 0.3066, + "step": 5725 + }, + { + "epoch": 5.335351653469958, + "grad_norm": 0.2856206514179065, + "learning_rate": 1.8472491909385114e-05, + "loss": 0.3065, + "step": 5730 + }, + { + "epoch": 5.340009315323708, + "grad_norm": 0.28392937584839906, + "learning_rate": 1.844012944983819e-05, + "loss": 0.3058, + "step": 5735 + }, + { + "epoch": 5.344666977177457, + "grad_norm": 0.2842067899622778, + "learning_rate": 1.8407766990291264e-05, + "loss": 0.3063, + "step": 5740 + }, + { + "epoch": 5.349324639031206, + "grad_norm": 0.2824119276233377, + "learning_rate": 1.8375404530744337e-05, + "loss": 0.3024, + "step": 5745 + }, + { + "epoch": 5.353982300884955, + "grad_norm": 0.2930347933934265, + "learning_rate": 1.834304207119741e-05, + "loss": 0.3005, + "step": 5750 + }, + { + "epoch": 5.358639962738705, + "grad_norm": 0.29347353585391656, + "learning_rate": 1.8310679611650487e-05, + "loss": 0.3017, + "step": 5755 + }, + { + "epoch": 5.363297624592454, + "grad_norm": 0.2926875315209124, + "learning_rate": 1.827831715210356e-05, + "loss": 0.3028, + "step": 5760 + }, + { + "epoch": 5.367955286446204, + "grad_norm": 0.2901551394692628, + "learning_rate": 1.8245954692556637e-05, + "loss": 0.3014, + "step": 5765 + }, + { + "epoch": 5.372612948299953, + "grad_norm": 0.28209903392114477, + "learning_rate": 1.821359223300971e-05, + "loss": 0.3046, + "step": 5770 + }, + { + "epoch": 5.377270610153703, + "grad_norm": 0.2749568426111605, + "learning_rate": 1.8181229773462787e-05, + "loss": 0.3032, + "step": 5775 + }, + { + "epoch": 5.381928272007452, + "grad_norm": 0.3002171746802142, + "learning_rate": 1.8148867313915856e-05, + "loss": 0.3027, + "step": 5780 + }, + { + "epoch": 5.386585933861202, + "grad_norm": 0.3008333108433299, + "learning_rate": 1.8116504854368933e-05, + "loss": 0.3042, + "step": 5785 + }, + { + "epoch": 5.391243595714951, + "grad_norm": 0.2897571641211582, + "learning_rate": 1.8084142394822006e-05, + "loss": 0.3103, + "step": 5790 + }, + { + "epoch": 5.395901257568701, + "grad_norm": 0.2850028644112056, + "learning_rate": 1.8051779935275083e-05, + "loss": 0.3093, + "step": 5795 + }, + { + "epoch": 5.4005589194224495, + "grad_norm": 0.29898701097962505, + "learning_rate": 1.8019417475728156e-05, + "loss": 0.3027, + "step": 5800 + }, + { + "epoch": 5.405216581276199, + "grad_norm": 0.28638137759427745, + "learning_rate": 1.7987055016181232e-05, + "loss": 0.3187, + "step": 5805 + }, + { + "epoch": 5.4098742431299485, + "grad_norm": 0.2767947983890502, + "learning_rate": 1.7954692556634306e-05, + "loss": 0.301, + "step": 5810 + }, + { + "epoch": 5.414531904983698, + "grad_norm": 0.293008143502894, + "learning_rate": 1.7922330097087382e-05, + "loss": 0.3012, + "step": 5815 + }, + { + "epoch": 5.4191895668374475, + "grad_norm": 0.29107156446017896, + "learning_rate": 1.7889967637540452e-05, + "loss": 0.3018, + "step": 5820 + }, + { + "epoch": 5.423847228691197, + "grad_norm": 0.2836867406762743, + "learning_rate": 1.785760517799353e-05, + "loss": 0.3056, + "step": 5825 + }, + { + "epoch": 5.4285048905449464, + "grad_norm": 0.3141053505194596, + "learning_rate": 1.7825242718446602e-05, + "loss": 0.3012, + "step": 5830 + }, + { + "epoch": 5.433162552398696, + "grad_norm": 0.2880531038635791, + "learning_rate": 1.7792880258899678e-05, + "loss": 0.3032, + "step": 5835 + }, + { + "epoch": 5.437820214252445, + "grad_norm": 0.2833450738832913, + "learning_rate": 1.776051779935275e-05, + "loss": 0.3148, + "step": 5840 + }, + { + "epoch": 5.442477876106195, + "grad_norm": 0.2888814119260653, + "learning_rate": 1.7728155339805828e-05, + "loss": 0.301, + "step": 5845 + }, + { + "epoch": 5.447135537959944, + "grad_norm": 0.28688991478590337, + "learning_rate": 1.76957928802589e-05, + "loss": 0.309, + "step": 5850 + }, + { + "epoch": 5.451793199813694, + "grad_norm": 0.290900394259756, + "learning_rate": 1.7663430420711974e-05, + "loss": 0.3071, + "step": 5855 + }, + { + "epoch": 5.4564508616674425, + "grad_norm": 0.2760376265018021, + "learning_rate": 1.7631067961165048e-05, + "loss": 0.3064, + "step": 5860 + }, + { + "epoch": 5.461108523521192, + "grad_norm": 0.2961192697428574, + "learning_rate": 1.7598705501618124e-05, + "loss": 0.3147, + "step": 5865 + }, + { + "epoch": 5.4657661853749415, + "grad_norm": 0.30113261224880655, + "learning_rate": 1.7566343042071197e-05, + "loss": 0.305, + "step": 5870 + }, + { + "epoch": 5.470423847228691, + "grad_norm": 0.30236627699114277, + "learning_rate": 1.7533980582524274e-05, + "loss": 0.2918, + "step": 5875 + }, + { + "epoch": 5.4750815090824405, + "grad_norm": 0.28786989286640924, + "learning_rate": 1.7501618122977347e-05, + "loss": 0.3029, + "step": 5880 + }, + { + "epoch": 5.47973917093619, + "grad_norm": 0.285207802940617, + "learning_rate": 1.7469255663430424e-05, + "loss": 0.3045, + "step": 5885 + }, + { + "epoch": 5.4843968327899395, + "grad_norm": 0.2704341023495142, + "learning_rate": 1.7436893203883493e-05, + "loss": 0.3085, + "step": 5890 + }, + { + "epoch": 5.489054494643689, + "grad_norm": 0.2717516312446038, + "learning_rate": 1.740453074433657e-05, + "loss": 0.3099, + "step": 5895 + }, + { + "epoch": 5.4937121564974385, + "grad_norm": 0.2753605154349603, + "learning_rate": 1.7372168284789643e-05, + "loss": 0.3023, + "step": 5900 + }, + { + "epoch": 5.498369818351188, + "grad_norm": 0.29454768766505524, + "learning_rate": 1.733980582524272e-05, + "loss": 0.315, + "step": 5905 + }, + { + "epoch": 5.5030274802049375, + "grad_norm": 0.2780881770733559, + "learning_rate": 1.7307443365695793e-05, + "loss": 0.2996, + "step": 5910 + }, + { + "epoch": 5.507685142058687, + "grad_norm": 0.29014626764984414, + "learning_rate": 1.727508090614887e-05, + "loss": 0.3051, + "step": 5915 + }, + { + "epoch": 5.512342803912436, + "grad_norm": 0.2837262516203258, + "learning_rate": 1.7242718446601943e-05, + "loss": 0.3008, + "step": 5920 + }, + { + "epoch": 5.517000465766185, + "grad_norm": 0.264402190574167, + "learning_rate": 1.7210355987055016e-05, + "loss": 0.3074, + "step": 5925 + }, + { + "epoch": 5.521658127619935, + "grad_norm": 0.2738935632023862, + "learning_rate": 1.717799352750809e-05, + "loss": 0.303, + "step": 5930 + }, + { + "epoch": 5.526315789473684, + "grad_norm": 0.27687565798806185, + "learning_rate": 1.7145631067961165e-05, + "loss": 0.3028, + "step": 5935 + }, + { + "epoch": 5.530973451327434, + "grad_norm": 0.28930865751975293, + "learning_rate": 1.711326860841424e-05, + "loss": 0.3041, + "step": 5940 + }, + { + "epoch": 5.535631113181183, + "grad_norm": 0.28596312771077054, + "learning_rate": 1.7080906148867315e-05, + "loss": 0.3065, + "step": 5945 + }, + { + "epoch": 5.5402887750349326, + "grad_norm": 0.2820486218168636, + "learning_rate": 1.704854368932039e-05, + "loss": 0.3158, + "step": 5950 + }, + { + "epoch": 5.544946436888682, + "grad_norm": 0.29890310082210714, + "learning_rate": 1.7016181229773465e-05, + "loss": 0.2946, + "step": 5955 + }, + { + "epoch": 5.5496040987424315, + "grad_norm": 0.2786619683844314, + "learning_rate": 1.6983818770226538e-05, + "loss": 0.3048, + "step": 5960 + }, + { + "epoch": 5.554261760596181, + "grad_norm": 0.27659802253619753, + "learning_rate": 1.695145631067961e-05, + "loss": 0.3039, + "step": 5965 + }, + { + "epoch": 5.5589194224499305, + "grad_norm": 0.2829305638289971, + "learning_rate": 1.6919093851132688e-05, + "loss": 0.3018, + "step": 5970 + }, + { + "epoch": 5.563577084303679, + "grad_norm": 0.2706599135302209, + "learning_rate": 1.688673139158576e-05, + "loss": 0.3068, + "step": 5975 + }, + { + "epoch": 5.568234746157429, + "grad_norm": 0.28211649730594046, + "learning_rate": 1.6854368932038838e-05, + "loss": 0.3063, + "step": 5980 + }, + { + "epoch": 5.572892408011178, + "grad_norm": 0.2951191578671079, + "learning_rate": 1.682200647249191e-05, + "loss": 0.3057, + "step": 5985 + }, + { + "epoch": 5.577550069864928, + "grad_norm": 0.2748692573835518, + "learning_rate": 1.6789644012944984e-05, + "loss": 0.3026, + "step": 5990 + }, + { + "epoch": 5.582207731718677, + "grad_norm": 0.2939313528747144, + "learning_rate": 1.675728155339806e-05, + "loss": 0.3021, + "step": 5995 + }, + { + "epoch": 5.586865393572427, + "grad_norm": 0.3031816982790779, + "learning_rate": 1.6724919093851134e-05, + "loss": 0.3119, + "step": 6000 + }, + { + "epoch": 5.591523055426176, + "grad_norm": 0.2752378797516827, + "learning_rate": 1.6692556634304207e-05, + "loss": 0.3096, + "step": 6005 + }, + { + "epoch": 5.596180717279926, + "grad_norm": 0.2903316590367377, + "learning_rate": 1.6660194174757283e-05, + "loss": 0.3076, + "step": 6010 + }, + { + "epoch": 5.600838379133675, + "grad_norm": 0.28716777283864764, + "learning_rate": 1.6627831715210357e-05, + "loss": 0.3094, + "step": 6015 + }, + { + "epoch": 5.605496040987425, + "grad_norm": 0.2891692730334035, + "learning_rate": 1.6595469255663433e-05, + "loss": 0.3038, + "step": 6020 + }, + { + "epoch": 5.610153702841174, + "grad_norm": 0.2982118804535516, + "learning_rate": 1.6563106796116506e-05, + "loss": 0.2994, + "step": 6025 + }, + { + "epoch": 5.614811364694923, + "grad_norm": 0.29024791500115066, + "learning_rate": 1.6530744336569583e-05, + "loss": 0.3035, + "step": 6030 + }, + { + "epoch": 5.619469026548672, + "grad_norm": 0.2780711941813935, + "learning_rate": 1.6498381877022653e-05, + "loss": 0.3004, + "step": 6035 + }, + { + "epoch": 5.624126688402422, + "grad_norm": 0.27659165989238843, + "learning_rate": 1.646601941747573e-05, + "loss": 0.3019, + "step": 6040 + }, + { + "epoch": 5.628784350256171, + "grad_norm": 0.28984546199561667, + "learning_rate": 1.6433656957928802e-05, + "loss": 0.306, + "step": 6045 + }, + { + "epoch": 5.633442012109921, + "grad_norm": 0.29164650641720846, + "learning_rate": 1.640129449838188e-05, + "loss": 0.3135, + "step": 6050 + }, + { + "epoch": 5.63809967396367, + "grad_norm": 0.26750630815804854, + "learning_rate": 1.6368932038834952e-05, + "loss": 0.3031, + "step": 6055 + }, + { + "epoch": 5.64275733581742, + "grad_norm": 0.2946599458080474, + "learning_rate": 1.633656957928803e-05, + "loss": 0.3049, + "step": 6060 + }, + { + "epoch": 5.647414997671169, + "grad_norm": 0.28010853242725814, + "learning_rate": 1.6304207119741102e-05, + "loss": 0.303, + "step": 6065 + }, + { + "epoch": 5.652072659524919, + "grad_norm": 0.2742657873539159, + "learning_rate": 1.6271844660194175e-05, + "loss": 0.3044, + "step": 6070 + }, + { + "epoch": 5.656730321378668, + "grad_norm": 0.2753030283239919, + "learning_rate": 1.6239482200647248e-05, + "loss": 0.3076, + "step": 6075 + }, + { + "epoch": 5.661387983232418, + "grad_norm": 0.2771477193319902, + "learning_rate": 1.6207119741100325e-05, + "loss": 0.3076, + "step": 6080 + }, + { + "epoch": 5.666045645086166, + "grad_norm": 0.2873816399993061, + "learning_rate": 1.6174757281553398e-05, + "loss": 0.3114, + "step": 6085 + }, + { + "epoch": 5.670703306939917, + "grad_norm": 0.2715615426840634, + "learning_rate": 1.6142394822006475e-05, + "loss": 0.2998, + "step": 6090 + }, + { + "epoch": 5.675360968793665, + "grad_norm": 0.2725590859049447, + "learning_rate": 1.6110032362459548e-05, + "loss": 0.3003, + "step": 6095 + }, + { + "epoch": 5.680018630647415, + "grad_norm": 0.27208932930615526, + "learning_rate": 1.6077669902912624e-05, + "loss": 0.3055, + "step": 6100 + }, + { + "epoch": 5.684676292501164, + "grad_norm": 0.28401672646025355, + "learning_rate": 1.6045307443365694e-05, + "loss": 0.3101, + "step": 6105 + }, + { + "epoch": 5.689333954354914, + "grad_norm": 0.28584083494492163, + "learning_rate": 1.601294498381877e-05, + "loss": 0.3061, + "step": 6110 + }, + { + "epoch": 5.693991616208663, + "grad_norm": 0.2866753336676237, + "learning_rate": 1.5980582524271844e-05, + "loss": 0.3025, + "step": 6115 + }, + { + "epoch": 5.698649278062413, + "grad_norm": 0.2771789772174729, + "learning_rate": 1.594822006472492e-05, + "loss": 0.3093, + "step": 6120 + }, + { + "epoch": 5.703306939916162, + "grad_norm": 0.3001232150493061, + "learning_rate": 1.5915857605177994e-05, + "loss": 0.3084, + "step": 6125 + }, + { + "epoch": 5.707964601769912, + "grad_norm": 0.26727045280123496, + "learning_rate": 1.588349514563107e-05, + "loss": 0.3015, + "step": 6130 + }, + { + "epoch": 5.712622263623661, + "grad_norm": 0.2913315766767915, + "learning_rate": 1.5851132686084143e-05, + "loss": 0.3073, + "step": 6135 + }, + { + "epoch": 5.717279925477411, + "grad_norm": 0.2850823152288255, + "learning_rate": 1.581877022653722e-05, + "loss": 0.3084, + "step": 6140 + }, + { + "epoch": 5.72193758733116, + "grad_norm": 0.2864258330057128, + "learning_rate": 1.578640776699029e-05, + "loss": 0.3017, + "step": 6145 + }, + { + "epoch": 5.726595249184909, + "grad_norm": 0.28175280652917506, + "learning_rate": 1.5754045307443366e-05, + "loss": 0.3084, + "step": 6150 + }, + { + "epoch": 5.731252911038658, + "grad_norm": 0.2807399096732578, + "learning_rate": 1.572168284789644e-05, + "loss": 0.3133, + "step": 6155 + }, + { + "epoch": 5.735910572892408, + "grad_norm": 0.2724600629396077, + "learning_rate": 1.5689320388349516e-05, + "loss": 0.3107, + "step": 6160 + }, + { + "epoch": 5.740568234746157, + "grad_norm": 0.2700028287114075, + "learning_rate": 1.565695792880259e-05, + "loss": 0.312, + "step": 6165 + }, + { + "epoch": 5.745225896599907, + "grad_norm": 0.27130060678321727, + "learning_rate": 1.5624595469255666e-05, + "loss": 0.3012, + "step": 6170 + }, + { + "epoch": 5.749883558453656, + "grad_norm": 0.27951703064101047, + "learning_rate": 1.559223300970874e-05, + "loss": 0.3082, + "step": 6175 + }, + { + "epoch": 5.754541220307406, + "grad_norm": 0.27592930561792794, + "learning_rate": 1.5559870550161812e-05, + "loss": 0.3072, + "step": 6180 + }, + { + "epoch": 5.759198882161155, + "grad_norm": 0.3465026871622147, + "learning_rate": 1.5527508090614885e-05, + "loss": 0.3059, + "step": 6185 + }, + { + "epoch": 5.763856544014905, + "grad_norm": 0.2746005126989584, + "learning_rate": 1.5495145631067962e-05, + "loss": 0.3016, + "step": 6190 + }, + { + "epoch": 5.768514205868654, + "grad_norm": 0.2772282824990714, + "learning_rate": 1.5462783171521035e-05, + "loss": 0.312, + "step": 6195 + }, + { + "epoch": 5.773171867722404, + "grad_norm": 0.2743844708127403, + "learning_rate": 1.543042071197411e-05, + "loss": 0.309, + "step": 6200 + }, + { + "epoch": 5.777829529576152, + "grad_norm": 0.2877371452033368, + "learning_rate": 1.5398058252427185e-05, + "loss": 0.3133, + "step": 6205 + }, + { + "epoch": 5.782487191429902, + "grad_norm": 0.2622623411875907, + "learning_rate": 1.536569579288026e-05, + "loss": 0.3028, + "step": 6210 + }, + { + "epoch": 5.787144853283651, + "grad_norm": 0.2710226518065163, + "learning_rate": 1.5333333333333334e-05, + "loss": 0.3031, + "step": 6215 + }, + { + "epoch": 5.791802515137401, + "grad_norm": 0.27172885126665913, + "learning_rate": 1.5300970873786408e-05, + "loss": 0.3032, + "step": 6220 + }, + { + "epoch": 5.79646017699115, + "grad_norm": 0.26208199703980944, + "learning_rate": 1.526860841423948e-05, + "loss": 0.3084, + "step": 6225 + }, + { + "epoch": 5.8011178388449, + "grad_norm": 0.2721236700379943, + "learning_rate": 1.5236245954692557e-05, + "loss": 0.3149, + "step": 6230 + }, + { + "epoch": 5.805775500698649, + "grad_norm": 0.27870200535732587, + "learning_rate": 1.5203883495145632e-05, + "loss": 0.3111, + "step": 6235 + }, + { + "epoch": 5.810433162552399, + "grad_norm": 0.28270581614522816, + "learning_rate": 1.5171521035598707e-05, + "loss": 0.3077, + "step": 6240 + }, + { + "epoch": 5.815090824406148, + "grad_norm": 0.2735990234754489, + "learning_rate": 1.5139158576051782e-05, + "loss": 0.3062, + "step": 6245 + }, + { + "epoch": 5.819748486259898, + "grad_norm": 0.27679598567312363, + "learning_rate": 1.5106796116504853e-05, + "loss": 0.3079, + "step": 6250 + }, + { + "epoch": 5.824406148113647, + "grad_norm": 0.28298106354491503, + "learning_rate": 1.5074433656957928e-05, + "loss": 0.3153, + "step": 6255 + }, + { + "epoch": 5.829063809967396, + "grad_norm": 0.2846618481258775, + "learning_rate": 1.5042071197411003e-05, + "loss": 0.302, + "step": 6260 + }, + { + "epoch": 5.833721471821145, + "grad_norm": 0.2851511747212997, + "learning_rate": 1.5009708737864078e-05, + "loss": 0.3073, + "step": 6265 + }, + { + "epoch": 5.838379133674895, + "grad_norm": 0.28636420923696226, + "learning_rate": 1.4977346278317153e-05, + "loss": 0.3143, + "step": 6270 + }, + { + "epoch": 5.843036795528644, + "grad_norm": 0.2746846190129639, + "learning_rate": 1.4944983818770228e-05, + "loss": 0.3072, + "step": 6275 + }, + { + "epoch": 5.847694457382394, + "grad_norm": 0.28020487777908365, + "learning_rate": 1.4912621359223303e-05, + "loss": 0.3073, + "step": 6280 + }, + { + "epoch": 5.852352119236143, + "grad_norm": 0.2792427758363, + "learning_rate": 1.4880258899676374e-05, + "loss": 0.3087, + "step": 6285 + }, + { + "epoch": 5.857009781089893, + "grad_norm": 0.2782388073052526, + "learning_rate": 1.4847896440129449e-05, + "loss": 0.3032, + "step": 6290 + }, + { + "epoch": 5.861667442943642, + "grad_norm": 0.2579244666091734, + "learning_rate": 1.4815533980582524e-05, + "loss": 0.3012, + "step": 6295 + }, + { + "epoch": 5.866325104797392, + "grad_norm": 0.2656839646498183, + "learning_rate": 1.4783171521035599e-05, + "loss": 0.3042, + "step": 6300 + }, + { + "epoch": 5.870982766651141, + "grad_norm": 0.2808985029331758, + "learning_rate": 1.4750809061488674e-05, + "loss": 0.2979, + "step": 6305 + }, + { + "epoch": 5.875640428504891, + "grad_norm": 0.2806107094803382, + "learning_rate": 1.4718446601941749e-05, + "loss": 0.3125, + "step": 6310 + }, + { + "epoch": 5.8802980903586395, + "grad_norm": 0.2795338031099193, + "learning_rate": 1.4686084142394823e-05, + "loss": 0.2961, + "step": 6315 + }, + { + "epoch": 5.88495575221239, + "grad_norm": 0.2760213410981478, + "learning_rate": 1.4653721682847898e-05, + "loss": 0.3046, + "step": 6320 + }, + { + "epoch": 5.8896134140661385, + "grad_norm": 0.2932730180826483, + "learning_rate": 1.462135922330097e-05, + "loss": 0.3049, + "step": 6325 + }, + { + "epoch": 5.894271075919888, + "grad_norm": 0.28127802161670795, + "learning_rate": 1.4588996763754045e-05, + "loss": 0.3146, + "step": 6330 + }, + { + "epoch": 5.8989287377736375, + "grad_norm": 0.2792155247730604, + "learning_rate": 1.455663430420712e-05, + "loss": 0.3021, + "step": 6335 + }, + { + "epoch": 5.903586399627387, + "grad_norm": 0.27856019558478023, + "learning_rate": 1.4524271844660194e-05, + "loss": 0.3015, + "step": 6340 + }, + { + "epoch": 5.9082440614811365, + "grad_norm": 0.28321062046344186, + "learning_rate": 1.449190938511327e-05, + "loss": 0.2967, + "step": 6345 + }, + { + "epoch": 5.912901723334886, + "grad_norm": 0.27330655251511077, + "learning_rate": 1.4459546925566344e-05, + "loss": 0.3061, + "step": 6350 + }, + { + "epoch": 5.9175593851886354, + "grad_norm": 0.27751395380672944, + "learning_rate": 1.4427184466019419e-05, + "loss": 0.306, + "step": 6355 + }, + { + "epoch": 5.922217047042385, + "grad_norm": 0.27478548080495335, + "learning_rate": 1.4394822006472492e-05, + "loss": 0.2994, + "step": 6360 + }, + { + "epoch": 5.926874708896134, + "grad_norm": 0.2823478583161036, + "learning_rate": 1.4362459546925567e-05, + "loss": 0.3118, + "step": 6365 + }, + { + "epoch": 5.931532370749884, + "grad_norm": 0.28287174374847096, + "learning_rate": 1.4330097087378642e-05, + "loss": 0.299, + "step": 6370 + }, + { + "epoch": 5.936190032603633, + "grad_norm": 0.28444922456277477, + "learning_rate": 1.4297734627831717e-05, + "loss": 0.3157, + "step": 6375 + }, + { + "epoch": 5.940847694457382, + "grad_norm": 0.28803878164195124, + "learning_rate": 1.426537216828479e-05, + "loss": 0.3044, + "step": 6380 + }, + { + "epoch": 5.9455053563111315, + "grad_norm": 0.2891142857695731, + "learning_rate": 1.4233009708737865e-05, + "loss": 0.3087, + "step": 6385 + }, + { + "epoch": 5.950163018164881, + "grad_norm": 0.2867316511263794, + "learning_rate": 1.420064724919094e-05, + "loss": 0.3166, + "step": 6390 + }, + { + "epoch": 5.9548206800186305, + "grad_norm": 0.28433359030956057, + "learning_rate": 1.4168284789644013e-05, + "loss": 0.3113, + "step": 6395 + }, + { + "epoch": 5.95947834187238, + "grad_norm": 0.27053434327134906, + "learning_rate": 1.4135922330097088e-05, + "loss": 0.3088, + "step": 6400 + }, + { + "epoch": 5.9641360037261295, + "grad_norm": 0.2645493590042334, + "learning_rate": 1.4103559870550163e-05, + "loss": 0.3027, + "step": 6405 + }, + { + "epoch": 5.968793665579879, + "grad_norm": 0.27412814807463415, + "learning_rate": 1.4071197411003237e-05, + "loss": 0.3054, + "step": 6410 + }, + { + "epoch": 5.9734513274336285, + "grad_norm": 0.2714124141236671, + "learning_rate": 1.4038834951456312e-05, + "loss": 0.296, + "step": 6415 + }, + { + "epoch": 5.978108989287378, + "grad_norm": 0.27945140158886544, + "learning_rate": 1.4006472491909387e-05, + "loss": 0.299, + "step": 6420 + }, + { + "epoch": 5.9827666511411275, + "grad_norm": 0.2713442676698441, + "learning_rate": 1.3974110032362462e-05, + "loss": 0.3057, + "step": 6425 + }, + { + "epoch": 5.987424312994877, + "grad_norm": 0.2827835607802788, + "learning_rate": 1.3941747572815534e-05, + "loss": 0.3135, + "step": 6430 + }, + { + "epoch": 5.992081974848626, + "grad_norm": 0.2716682297183809, + "learning_rate": 1.3909385113268608e-05, + "loss": 0.3083, + "step": 6435 + }, + { + "epoch": 5.996739636702375, + "grad_norm": 0.27225571037293367, + "learning_rate": 1.3877022653721683e-05, + "loss": 0.3047, + "step": 6440 + }, + { + "epoch": 6.00093153237075, + "grad_norm": 0.35495868274581827, + "learning_rate": 1.3844660194174758e-05, + "loss": 0.2923, + "step": 6445 + }, + { + "epoch": 6.005589194224499, + "grad_norm": 0.3297878781680354, + "learning_rate": 1.3812297734627833e-05, + "loss": 0.2578, + "step": 6450 + }, + { + "epoch": 6.010246856078249, + "grad_norm": 0.3049393990291519, + "learning_rate": 1.3779935275080908e-05, + "loss": 0.2647, + "step": 6455 + }, + { + "epoch": 6.014904517931998, + "grad_norm": 0.3045770601314326, + "learning_rate": 1.3747572815533983e-05, + "loss": 0.2593, + "step": 6460 + }, + { + "epoch": 6.019562179785748, + "grad_norm": 0.29522401789966235, + "learning_rate": 1.3715210355987058e-05, + "loss": 0.2614, + "step": 6465 + }, + { + "epoch": 6.024219841639497, + "grad_norm": 0.2992223453383009, + "learning_rate": 1.3682847896440129e-05, + "loss": 0.2572, + "step": 6470 + }, + { + "epoch": 6.028877503493247, + "grad_norm": 0.3104405294123856, + "learning_rate": 1.3650485436893204e-05, + "loss": 0.2639, + "step": 6475 + }, + { + "epoch": 6.033535165346996, + "grad_norm": 0.29912511415470894, + "learning_rate": 1.3618122977346279e-05, + "loss": 0.2592, + "step": 6480 + }, + { + "epoch": 6.038192827200745, + "grad_norm": 0.3021541526670289, + "learning_rate": 1.3585760517799354e-05, + "loss": 0.2628, + "step": 6485 + }, + { + "epoch": 6.042850489054494, + "grad_norm": 0.3133637204965224, + "learning_rate": 1.3553398058252429e-05, + "loss": 0.2609, + "step": 6490 + }, + { + "epoch": 6.047508150908244, + "grad_norm": 0.31960446564212824, + "learning_rate": 1.3521035598705503e-05, + "loss": 0.2629, + "step": 6495 + }, + { + "epoch": 6.052165812761993, + "grad_norm": 0.30991779756370824, + "learning_rate": 1.3488673139158578e-05, + "loss": 0.2641, + "step": 6500 + }, + { + "epoch": 6.056823474615743, + "grad_norm": 0.3264487591304554, + "learning_rate": 1.345631067961165e-05, + "loss": 0.2642, + "step": 6505 + }, + { + "epoch": 6.061481136469492, + "grad_norm": 0.28858785783185686, + "learning_rate": 1.3423948220064725e-05, + "loss": 0.2617, + "step": 6510 + }, + { + "epoch": 6.066138798323242, + "grad_norm": 0.3105777896361813, + "learning_rate": 1.33915857605178e-05, + "loss": 0.2622, + "step": 6515 + }, + { + "epoch": 6.070796460176991, + "grad_norm": 0.29554505620954064, + "learning_rate": 1.3359223300970874e-05, + "loss": 0.2511, + "step": 6520 + }, + { + "epoch": 6.075454122030741, + "grad_norm": 0.2953799747572498, + "learning_rate": 1.332686084142395e-05, + "loss": 0.2624, + "step": 6525 + }, + { + "epoch": 6.08011178388449, + "grad_norm": 0.29749608780623976, + "learning_rate": 1.3294498381877024e-05, + "loss": 0.2654, + "step": 6530 + }, + { + "epoch": 6.08476944573824, + "grad_norm": 0.3045485523189274, + "learning_rate": 1.3262135922330099e-05, + "loss": 0.2635, + "step": 6535 + }, + { + "epoch": 6.089427107591989, + "grad_norm": 0.30079742188595193, + "learning_rate": 1.322977346278317e-05, + "loss": 0.2627, + "step": 6540 + }, + { + "epoch": 6.094084769445738, + "grad_norm": 0.2997340622747308, + "learning_rate": 1.3197411003236245e-05, + "loss": 0.2595, + "step": 6545 + }, + { + "epoch": 6.098742431299487, + "grad_norm": 0.31052606162390783, + "learning_rate": 1.316504854368932e-05, + "loss": 0.2644, + "step": 6550 + }, + { + "epoch": 6.103400093153237, + "grad_norm": 0.29891365265398806, + "learning_rate": 1.3132686084142395e-05, + "loss": 0.258, + "step": 6555 + }, + { + "epoch": 6.108057755006986, + "grad_norm": 0.2977288901620322, + "learning_rate": 1.310032362459547e-05, + "loss": 0.2687, + "step": 6560 + }, + { + "epoch": 6.112715416860736, + "grad_norm": 0.3290928356537451, + "learning_rate": 1.3067961165048545e-05, + "loss": 0.2646, + "step": 6565 + }, + { + "epoch": 6.117373078714485, + "grad_norm": 0.2888676419652737, + "learning_rate": 1.303559870550162e-05, + "loss": 0.26, + "step": 6570 + }, + { + "epoch": 6.122030740568235, + "grad_norm": 0.28503442473557816, + "learning_rate": 1.3003236245954691e-05, + "loss": 0.2599, + "step": 6575 + }, + { + "epoch": 6.126688402421984, + "grad_norm": 0.2905977574492153, + "learning_rate": 1.2970873786407766e-05, + "loss": 0.26, + "step": 6580 + }, + { + "epoch": 6.131346064275734, + "grad_norm": 0.3062366689782617, + "learning_rate": 1.2938511326860841e-05, + "loss": 0.2633, + "step": 6585 + }, + { + "epoch": 6.136003726129483, + "grad_norm": 0.29240949963563057, + "learning_rate": 1.2906148867313916e-05, + "loss": 0.2575, + "step": 6590 + }, + { + "epoch": 6.140661387983233, + "grad_norm": 0.3023468143758687, + "learning_rate": 1.287378640776699e-05, + "loss": 0.2635, + "step": 6595 + }, + { + "epoch": 6.145319049836981, + "grad_norm": 0.30685404285585544, + "learning_rate": 1.2841423948220066e-05, + "loss": 0.2671, + "step": 6600 + }, + { + "epoch": 6.149976711690731, + "grad_norm": 0.3013693735118242, + "learning_rate": 1.280906148867314e-05, + "loss": 0.268, + "step": 6605 + }, + { + "epoch": 6.15463437354448, + "grad_norm": 0.2924855300662006, + "learning_rate": 1.2776699029126214e-05, + "loss": 0.2627, + "step": 6610 + }, + { + "epoch": 6.15929203539823, + "grad_norm": 0.3079077929755618, + "learning_rate": 1.2744336569579288e-05, + "loss": 0.2642, + "step": 6615 + }, + { + "epoch": 6.163949697251979, + "grad_norm": 0.2996259816826425, + "learning_rate": 1.2711974110032363e-05, + "loss": 0.2619, + "step": 6620 + }, + { + "epoch": 6.168607359105729, + "grad_norm": 0.32401337024916305, + "learning_rate": 1.2679611650485437e-05, + "loss": 0.2671, + "step": 6625 + }, + { + "epoch": 6.173265020959478, + "grad_norm": 0.29220015779851977, + "learning_rate": 1.2647249190938511e-05, + "loss": 0.2608, + "step": 6630 + }, + { + "epoch": 6.177922682813228, + "grad_norm": 0.3098691399332797, + "learning_rate": 1.2614886731391586e-05, + "loss": 0.2702, + "step": 6635 + }, + { + "epoch": 6.182580344666977, + "grad_norm": 0.30089891579442063, + "learning_rate": 1.2582524271844661e-05, + "loss": 0.2631, + "step": 6640 + }, + { + "epoch": 6.187238006520727, + "grad_norm": 0.31490153801082493, + "learning_rate": 1.2550161812297736e-05, + "loss": 0.2666, + "step": 6645 + }, + { + "epoch": 6.191895668374476, + "grad_norm": 0.30465227236311726, + "learning_rate": 1.251779935275081e-05, + "loss": 0.2607, + "step": 6650 + }, + { + "epoch": 6.196553330228226, + "grad_norm": 0.293389503277096, + "learning_rate": 1.2485436893203884e-05, + "loss": 0.2574, + "step": 6655 + }, + { + "epoch": 6.2012109920819745, + "grad_norm": 0.29459904393256614, + "learning_rate": 1.2453074433656959e-05, + "loss": 0.2602, + "step": 6660 + }, + { + "epoch": 6.205868653935724, + "grad_norm": 0.2952166883304406, + "learning_rate": 1.2420711974110034e-05, + "loss": 0.2605, + "step": 6665 + }, + { + "epoch": 6.2105263157894735, + "grad_norm": 0.3011999710805045, + "learning_rate": 1.2388349514563109e-05, + "loss": 0.2601, + "step": 6670 + }, + { + "epoch": 6.215183977643223, + "grad_norm": 0.3041059442688845, + "learning_rate": 1.2355987055016182e-05, + "loss": 0.2621, + "step": 6675 + }, + { + "epoch": 6.219841639496972, + "grad_norm": 0.2994745867739228, + "learning_rate": 1.2323624595469257e-05, + "loss": 0.262, + "step": 6680 + }, + { + "epoch": 6.224499301350722, + "grad_norm": 0.3029753006698125, + "learning_rate": 1.2291262135922332e-05, + "loss": 0.2609, + "step": 6685 + }, + { + "epoch": 6.229156963204471, + "grad_norm": 0.2973074409724744, + "learning_rate": 1.2258899676375406e-05, + "loss": 0.2575, + "step": 6690 + }, + { + "epoch": 6.233814625058221, + "grad_norm": 0.2919733617641279, + "learning_rate": 1.222653721682848e-05, + "loss": 0.2627, + "step": 6695 + }, + { + "epoch": 6.23847228691197, + "grad_norm": 0.29864477756602675, + "learning_rate": 1.2194174757281554e-05, + "loss": 0.2638, + "step": 6700 + }, + { + "epoch": 6.24312994876572, + "grad_norm": 0.2986274093655223, + "learning_rate": 1.216181229773463e-05, + "loss": 0.2618, + "step": 6705 + }, + { + "epoch": 6.247787610619469, + "grad_norm": 0.28958751927297255, + "learning_rate": 1.2129449838187703e-05, + "loss": 0.2621, + "step": 6710 + }, + { + "epoch": 6.252445272473219, + "grad_norm": 0.3055074090931315, + "learning_rate": 1.2097087378640777e-05, + "loss": 0.2611, + "step": 6715 + }, + { + "epoch": 6.2571029343269675, + "grad_norm": 0.29902667150406925, + "learning_rate": 1.2064724919093852e-05, + "loss": 0.2654, + "step": 6720 + }, + { + "epoch": 6.261760596180717, + "grad_norm": 0.29407942984693936, + "learning_rate": 1.2032362459546927e-05, + "loss": 0.2632, + "step": 6725 + }, + { + "epoch": 6.2664182580344665, + "grad_norm": 0.30488411423530387, + "learning_rate": 1.2e-05, + "loss": 0.2643, + "step": 6730 + }, + { + "epoch": 6.271075919888216, + "grad_norm": 0.30663631113750467, + "learning_rate": 1.1967637540453075e-05, + "loss": 0.2715, + "step": 6735 + }, + { + "epoch": 6.2757335817419655, + "grad_norm": 0.2937405769189997, + "learning_rate": 1.193527508090615e-05, + "loss": 0.2639, + "step": 6740 + }, + { + "epoch": 6.280391243595715, + "grad_norm": 0.30589147682629475, + "learning_rate": 1.1902912621359223e-05, + "loss": 0.2645, + "step": 6745 + }, + { + "epoch": 6.2850489054494645, + "grad_norm": 0.297237299068367, + "learning_rate": 1.1870550161812298e-05, + "loss": 0.2636, + "step": 6750 + }, + { + "epoch": 6.289706567303214, + "grad_norm": 0.3004296527812095, + "learning_rate": 1.1838187702265373e-05, + "loss": 0.2634, + "step": 6755 + }, + { + "epoch": 6.2943642291569635, + "grad_norm": 0.3000745498565585, + "learning_rate": 1.1805825242718448e-05, + "loss": 0.2639, + "step": 6760 + }, + { + "epoch": 6.299021891010713, + "grad_norm": 0.2936673332032889, + "learning_rate": 1.1773462783171521e-05, + "loss": 0.2628, + "step": 6765 + }, + { + "epoch": 6.3036795528644625, + "grad_norm": 0.2956144303291978, + "learning_rate": 1.1741100323624596e-05, + "loss": 0.2623, + "step": 6770 + }, + { + "epoch": 6.308337214718211, + "grad_norm": 0.3037688284992403, + "learning_rate": 1.170873786407767e-05, + "loss": 0.2724, + "step": 6775 + }, + { + "epoch": 6.312994876571961, + "grad_norm": 0.2967836672965417, + "learning_rate": 1.1676375404530746e-05, + "loss": 0.267, + "step": 6780 + }, + { + "epoch": 6.31765253842571, + "grad_norm": 0.2938130888285839, + "learning_rate": 1.1644012944983819e-05, + "loss": 0.2594, + "step": 6785 + }, + { + "epoch": 6.32231020027946, + "grad_norm": 0.31339946077072456, + "learning_rate": 1.1611650485436894e-05, + "loss": 0.269, + "step": 6790 + }, + { + "epoch": 6.326967862133209, + "grad_norm": 0.31099128875742715, + "learning_rate": 1.1579288025889969e-05, + "loss": 0.2626, + "step": 6795 + }, + { + "epoch": 6.3316255239869585, + "grad_norm": 0.30323748956184154, + "learning_rate": 1.1546925566343042e-05, + "loss": 0.2633, + "step": 6800 + }, + { + "epoch": 6.336283185840708, + "grad_norm": 0.3081294010202703, + "learning_rate": 1.1514563106796117e-05, + "loss": 0.2689, + "step": 6805 + }, + { + "epoch": 6.3409408476944575, + "grad_norm": 0.30399877874074915, + "learning_rate": 1.1482200647249191e-05, + "loss": 0.2662, + "step": 6810 + }, + { + "epoch": 6.345598509548207, + "grad_norm": 0.29208443874938306, + "learning_rate": 1.1449838187702266e-05, + "loss": 0.2667, + "step": 6815 + }, + { + "epoch": 6.3502561714019565, + "grad_norm": 0.2977432429689944, + "learning_rate": 1.141747572815534e-05, + "loss": 0.2615, + "step": 6820 + }, + { + "epoch": 6.354913833255706, + "grad_norm": 0.3091236926197169, + "learning_rate": 1.1385113268608414e-05, + "loss": 0.2664, + "step": 6825 + }, + { + "epoch": 6.359571495109455, + "grad_norm": 0.29377145812895084, + "learning_rate": 1.135275080906149e-05, + "loss": 0.2722, + "step": 6830 + }, + { + "epoch": 6.364229156963204, + "grad_norm": 0.3060481901896544, + "learning_rate": 1.1320388349514564e-05, + "loss": 0.2676, + "step": 6835 + }, + { + "epoch": 6.368886818816954, + "grad_norm": 0.3044897451068506, + "learning_rate": 1.1288025889967637e-05, + "loss": 0.2655, + "step": 6840 + }, + { + "epoch": 6.373544480670703, + "grad_norm": 0.31723769869483387, + "learning_rate": 1.1255663430420712e-05, + "loss": 0.2739, + "step": 6845 + }, + { + "epoch": 6.378202142524453, + "grad_norm": 0.3003761183208017, + "learning_rate": 1.1223300970873787e-05, + "loss": 0.2676, + "step": 6850 + }, + { + "epoch": 6.382859804378202, + "grad_norm": 0.28548123147448823, + "learning_rate": 1.119093851132686e-05, + "loss": 0.2666, + "step": 6855 + }, + { + "epoch": 6.387517466231952, + "grad_norm": 0.30757258907864266, + "learning_rate": 1.1158576051779935e-05, + "loss": 0.2715, + "step": 6860 + }, + { + "epoch": 6.392175128085701, + "grad_norm": 0.2824807846329121, + "learning_rate": 1.112621359223301e-05, + "loss": 0.2641, + "step": 6865 + }, + { + "epoch": 6.396832789939451, + "grad_norm": 0.2987736875062339, + "learning_rate": 1.1093851132686085e-05, + "loss": 0.2648, + "step": 6870 + }, + { + "epoch": 6.4014904517932, + "grad_norm": 0.29263393638706314, + "learning_rate": 1.1061488673139158e-05, + "loss": 0.2571, + "step": 6875 + }, + { + "epoch": 6.40614811364695, + "grad_norm": 0.30484628646229844, + "learning_rate": 1.1029126213592233e-05, + "loss": 0.2621, + "step": 6880 + }, + { + "epoch": 6.410805775500698, + "grad_norm": 0.29353850152552546, + "learning_rate": 1.0996763754045308e-05, + "loss": 0.2656, + "step": 6885 + }, + { + "epoch": 6.415463437354448, + "grad_norm": 0.292195194398706, + "learning_rate": 1.0964401294498383e-05, + "loss": 0.2666, + "step": 6890 + }, + { + "epoch": 6.420121099208197, + "grad_norm": 0.29631019439162676, + "learning_rate": 1.0932038834951456e-05, + "loss": 0.264, + "step": 6895 + }, + { + "epoch": 6.424778761061947, + "grad_norm": 0.3018934528570828, + "learning_rate": 1.089967637540453e-05, + "loss": 0.2659, + "step": 6900 + }, + { + "epoch": 6.429436422915696, + "grad_norm": 0.2982611313693515, + "learning_rate": 1.0867313915857605e-05, + "loss": 0.2616, + "step": 6905 + }, + { + "epoch": 6.434094084769446, + "grad_norm": 0.31311872362980864, + "learning_rate": 1.083495145631068e-05, + "loss": 0.2723, + "step": 6910 + }, + { + "epoch": 6.438751746623195, + "grad_norm": 0.29715420694631617, + "learning_rate": 1.0802588996763755e-05, + "loss": 0.2686, + "step": 6915 + }, + { + "epoch": 6.443409408476945, + "grad_norm": 0.2973471857168713, + "learning_rate": 1.077022653721683e-05, + "loss": 0.2586, + "step": 6920 + }, + { + "epoch": 6.448067070330694, + "grad_norm": 0.29641440938775837, + "learning_rate": 1.0737864077669903e-05, + "loss": 0.265, + "step": 6925 + }, + { + "epoch": 6.452724732184444, + "grad_norm": 0.3233739194102652, + "learning_rate": 1.0705501618122978e-05, + "loss": 0.265, + "step": 6930 + }, + { + "epoch": 6.457382394038193, + "grad_norm": 0.29505157833257495, + "learning_rate": 1.0673139158576053e-05, + "loss": 0.2633, + "step": 6935 + }, + { + "epoch": 6.462040055891943, + "grad_norm": 0.2896116728878053, + "learning_rate": 1.0640776699029128e-05, + "loss": 0.262, + "step": 6940 + }, + { + "epoch": 6.466697717745692, + "grad_norm": 0.3063186852222229, + "learning_rate": 1.0608414239482201e-05, + "loss": 0.2666, + "step": 6945 + }, + { + "epoch": 6.471355379599441, + "grad_norm": 0.30838975092012333, + "learning_rate": 1.0576051779935276e-05, + "loss": 0.2756, + "step": 6950 + }, + { + "epoch": 6.47601304145319, + "grad_norm": 0.30807125812888864, + "learning_rate": 1.054368932038835e-05, + "loss": 0.2752, + "step": 6955 + }, + { + "epoch": 6.48067070330694, + "grad_norm": 0.306453526616044, + "learning_rate": 1.0511326860841426e-05, + "loss": 0.2583, + "step": 6960 + }, + { + "epoch": 6.485328365160689, + "grad_norm": 0.3015846241843987, + "learning_rate": 1.0478964401294499e-05, + "loss": 0.2645, + "step": 6965 + }, + { + "epoch": 6.489986027014439, + "grad_norm": 0.32405432934544387, + "learning_rate": 1.0446601941747574e-05, + "loss": 0.2663, + "step": 6970 + }, + { + "epoch": 6.494643688868188, + "grad_norm": 0.2887204349916844, + "learning_rate": 1.0414239482200649e-05, + "loss": 0.2681, + "step": 6975 + }, + { + "epoch": 6.499301350721938, + "grad_norm": 0.294867047881976, + "learning_rate": 1.0381877022653722e-05, + "loss": 0.2669, + "step": 6980 + }, + { + "epoch": 6.503959012575687, + "grad_norm": 0.2973771704685833, + "learning_rate": 1.0349514563106797e-05, + "loss": 0.2613, + "step": 6985 + }, + { + "epoch": 6.508616674429437, + "grad_norm": 0.29580255284489865, + "learning_rate": 1.0317152103559872e-05, + "loss": 0.2701, + "step": 6990 + }, + { + "epoch": 6.513274336283186, + "grad_norm": 0.28965926055244845, + "learning_rate": 1.0284789644012946e-05, + "loss": 0.2696, + "step": 6995 + }, + { + "epoch": 6.517931998136936, + "grad_norm": 0.2953950153666152, + "learning_rate": 1.025242718446602e-05, + "loss": 0.2726, + "step": 7000 + }, + { + "epoch": 6.522589659990684, + "grad_norm": 0.28930047453778795, + "learning_rate": 1.0220064724919094e-05, + "loss": 0.2654, + "step": 7005 + }, + { + "epoch": 6.527247321844434, + "grad_norm": 0.2972447245728253, + "learning_rate": 1.018770226537217e-05, + "loss": 0.2692, + "step": 7010 + }, + { + "epoch": 6.531904983698183, + "grad_norm": 0.3084742358881919, + "learning_rate": 1.0155339805825244e-05, + "loss": 0.2678, + "step": 7015 + }, + { + "epoch": 6.536562645551933, + "grad_norm": 0.29204430034154616, + "learning_rate": 1.0122977346278317e-05, + "loss": 0.2685, + "step": 7020 + }, + { + "epoch": 6.541220307405682, + "grad_norm": 0.29192158072861035, + "learning_rate": 1.0090614886731392e-05, + "loss": 0.265, + "step": 7025 + }, + { + "epoch": 6.545877969259432, + "grad_norm": 0.30029405364577455, + "learning_rate": 1.0058252427184467e-05, + "loss": 0.2717, + "step": 7030 + }, + { + "epoch": 6.550535631113181, + "grad_norm": 0.3015878432849847, + "learning_rate": 1.002588996763754e-05, + "loss": 0.2679, + "step": 7035 + }, + { + "epoch": 6.555193292966931, + "grad_norm": 0.2896341645771092, + "learning_rate": 9.993527508090615e-06, + "loss": 0.2603, + "step": 7040 + }, + { + "epoch": 6.55985095482068, + "grad_norm": 0.3056973985571281, + "learning_rate": 9.96116504854369e-06, + "loss": 0.2724, + "step": 7045 + }, + { + "epoch": 6.56450861667443, + "grad_norm": 0.2921427997727339, + "learning_rate": 9.928802588996765e-06, + "loss": 0.2629, + "step": 7050 + }, + { + "epoch": 6.569166278528179, + "grad_norm": 0.2884651901506424, + "learning_rate": 9.896440129449838e-06, + "loss": 0.2602, + "step": 7055 + }, + { + "epoch": 6.573823940381928, + "grad_norm": 0.30672936523261934, + "learning_rate": 9.864077669902913e-06, + "loss": 0.2682, + "step": 7060 + }, + { + "epoch": 6.578481602235677, + "grad_norm": 0.2994506278425497, + "learning_rate": 9.831715210355988e-06, + "loss": 0.269, + "step": 7065 + }, + { + "epoch": 6.583139264089427, + "grad_norm": 0.312676659203426, + "learning_rate": 9.799352750809061e-06, + "loss": 0.2658, + "step": 7070 + }, + { + "epoch": 6.587796925943176, + "grad_norm": 0.2861834121660456, + "learning_rate": 9.766990291262136e-06, + "loss": 0.2591, + "step": 7075 + }, + { + "epoch": 6.592454587796926, + "grad_norm": 0.3159432948596571, + "learning_rate": 9.73462783171521e-06, + "loss": 0.2665, + "step": 7080 + }, + { + "epoch": 6.597112249650675, + "grad_norm": 0.3106042391433404, + "learning_rate": 9.702265372168286e-06, + "loss": 0.2693, + "step": 7085 + }, + { + "epoch": 6.601769911504425, + "grad_norm": 0.2874804804924082, + "learning_rate": 9.669902912621359e-06, + "loss": 0.2648, + "step": 7090 + }, + { + "epoch": 6.606427573358174, + "grad_norm": 0.30044410174964736, + "learning_rate": 9.637540453074434e-06, + "loss": 0.2627, + "step": 7095 + }, + { + "epoch": 6.611085235211924, + "grad_norm": 0.29779385187282104, + "learning_rate": 9.605177993527508e-06, + "loss": 0.2646, + "step": 7100 + }, + { + "epoch": 6.615742897065673, + "grad_norm": 0.28021042242020305, + "learning_rate": 9.572815533980583e-06, + "loss": 0.2627, + "step": 7105 + }, + { + "epoch": 6.620400558919423, + "grad_norm": 0.2865437965680647, + "learning_rate": 9.540453074433657e-06, + "loss": 0.2644, + "step": 7110 + }, + { + "epoch": 6.625058220773171, + "grad_norm": 0.28847036183764996, + "learning_rate": 9.508090614886731e-06, + "loss": 0.2704, + "step": 7115 + }, + { + "epoch": 6.629715882626921, + "grad_norm": 0.29923828687211645, + "learning_rate": 9.475728155339806e-06, + "loss": 0.2669, + "step": 7120 + }, + { + "epoch": 6.63437354448067, + "grad_norm": 0.28995732729217844, + "learning_rate": 9.44336569579288e-06, + "loss": 0.2655, + "step": 7125 + }, + { + "epoch": 6.63903120633442, + "grad_norm": 0.29606046217236426, + "learning_rate": 9.411003236245954e-06, + "loss": 0.2673, + "step": 7130 + }, + { + "epoch": 6.643688868188169, + "grad_norm": 0.29502049733762253, + "learning_rate": 9.37864077669903e-06, + "loss": 0.2694, + "step": 7135 + }, + { + "epoch": 6.648346530041919, + "grad_norm": 0.30156900738632925, + "learning_rate": 9.346278317152104e-06, + "loss": 0.2643, + "step": 7140 + }, + { + "epoch": 6.653004191895668, + "grad_norm": 0.2865140989183644, + "learning_rate": 9.313915857605177e-06, + "loss": 0.2676, + "step": 7145 + }, + { + "epoch": 6.657661853749418, + "grad_norm": 0.2864300410942112, + "learning_rate": 9.281553398058252e-06, + "loss": 0.2567, + "step": 7150 + }, + { + "epoch": 6.662319515603167, + "grad_norm": 0.2963896335540961, + "learning_rate": 9.249190938511327e-06, + "loss": 0.2629, + "step": 7155 + }, + { + "epoch": 6.666977177456917, + "grad_norm": 0.2929631491412471, + "learning_rate": 9.216828478964402e-06, + "loss": 0.263, + "step": 7160 + }, + { + "epoch": 6.671634839310666, + "grad_norm": 0.2960008906124059, + "learning_rate": 9.184466019417477e-06, + "loss": 0.2665, + "step": 7165 + }, + { + "epoch": 6.676292501164415, + "grad_norm": 0.2957610946094985, + "learning_rate": 9.15210355987055e-06, + "loss": 0.2704, + "step": 7170 + }, + { + "epoch": 6.680950163018165, + "grad_norm": 0.28705408532948146, + "learning_rate": 9.119741100323625e-06, + "loss": 0.2594, + "step": 7175 + }, + { + "epoch": 6.685607824871914, + "grad_norm": 0.29626334225006756, + "learning_rate": 9.0873786407767e-06, + "loss": 0.2639, + "step": 7180 + }, + { + "epoch": 6.6902654867256635, + "grad_norm": 0.2826872422417214, + "learning_rate": 9.055016181229774e-06, + "loss": 0.2637, + "step": 7185 + }, + { + "epoch": 6.694923148579413, + "grad_norm": 0.29968315068623025, + "learning_rate": 9.02265372168285e-06, + "loss": 0.2678, + "step": 7190 + }, + { + "epoch": 6.6995808104331624, + "grad_norm": 0.29773778109848104, + "learning_rate": 8.990291262135924e-06, + "loss": 0.2712, + "step": 7195 + }, + { + "epoch": 6.704238472286912, + "grad_norm": 0.3120568855084784, + "learning_rate": 8.957928802588997e-06, + "loss": 0.2636, + "step": 7200 + }, + { + "epoch": 6.708896134140661, + "grad_norm": 0.2953788516265102, + "learning_rate": 8.925566343042072e-06, + "loss": 0.2626, + "step": 7205 + }, + { + "epoch": 6.713553795994411, + "grad_norm": 0.2946094057534323, + "learning_rate": 8.893203883495147e-06, + "loss": 0.2653, + "step": 7210 + }, + { + "epoch": 6.71821145784816, + "grad_norm": 0.28172021875277514, + "learning_rate": 8.86084142394822e-06, + "loss": 0.2586, + "step": 7215 + }, + { + "epoch": 6.72286911970191, + "grad_norm": 0.2837044630178436, + "learning_rate": 8.828478964401295e-06, + "loss": 0.2638, + "step": 7220 + }, + { + "epoch": 6.727526781555659, + "grad_norm": 0.2931079722970222, + "learning_rate": 8.79611650485437e-06, + "loss": 0.2702, + "step": 7225 + }, + { + "epoch": 6.732184443409409, + "grad_norm": 0.29377975665519773, + "learning_rate": 8.763754045307445e-06, + "loss": 0.2682, + "step": 7230 + }, + { + "epoch": 6.7368421052631575, + "grad_norm": 0.2957932794325564, + "learning_rate": 8.731391585760518e-06, + "loss": 0.2647, + "step": 7235 + }, + { + "epoch": 6.741499767116907, + "grad_norm": 0.28703326172482113, + "learning_rate": 8.699029126213593e-06, + "loss": 0.2646, + "step": 7240 + }, + { + "epoch": 6.7461574289706565, + "grad_norm": 0.29889415938281116, + "learning_rate": 8.666666666666668e-06, + "loss": 0.2692, + "step": 7245 + }, + { + "epoch": 6.750815090824406, + "grad_norm": 0.2924197702146725, + "learning_rate": 8.634304207119743e-06, + "loss": 0.2717, + "step": 7250 + }, + { + "epoch": 6.7554727526781555, + "grad_norm": 0.2950840807083608, + "learning_rate": 8.601941747572816e-06, + "loss": 0.2712, + "step": 7255 + }, + { + "epoch": 6.760130414531905, + "grad_norm": 0.2792814764684901, + "learning_rate": 8.56957928802589e-06, + "loss": 0.2657, + "step": 7260 + }, + { + "epoch": 6.7647880763856545, + "grad_norm": 0.30399814729641883, + "learning_rate": 8.537216828478966e-06, + "loss": 0.2697, + "step": 7265 + }, + { + "epoch": 6.769445738239404, + "grad_norm": 0.3003646990654973, + "learning_rate": 8.504854368932039e-06, + "loss": 0.272, + "step": 7270 + }, + { + "epoch": 6.7741034000931535, + "grad_norm": 0.29113757128782447, + "learning_rate": 8.472491909385114e-06, + "loss": 0.2714, + "step": 7275 + }, + { + "epoch": 6.778761061946903, + "grad_norm": 0.28732756484781796, + "learning_rate": 8.440129449838189e-06, + "loss": 0.2584, + "step": 7280 + }, + { + "epoch": 6.7834187238006525, + "grad_norm": 0.2864611200669503, + "learning_rate": 8.407766990291263e-06, + "loss": 0.2703, + "step": 7285 + }, + { + "epoch": 6.788076385654401, + "grad_norm": 0.2955687762426276, + "learning_rate": 8.375404530744337e-06, + "loss": 0.2599, + "step": 7290 + }, + { + "epoch": 6.792734047508151, + "grad_norm": 0.2951699494413524, + "learning_rate": 8.343042071197411e-06, + "loss": 0.2665, + "step": 7295 + }, + { + "epoch": 6.7973917093619, + "grad_norm": 0.28605632622518984, + "learning_rate": 8.310679611650486e-06, + "loss": 0.2574, + "step": 7300 + }, + { + "epoch": 6.80204937121565, + "grad_norm": 0.2962348449926179, + "learning_rate": 8.27831715210356e-06, + "loss": 0.2651, + "step": 7305 + }, + { + "epoch": 6.806707033069399, + "grad_norm": 0.29452805097334306, + "learning_rate": 8.245954692556634e-06, + "loss": 0.2733, + "step": 7310 + }, + { + "epoch": 6.8113646949231486, + "grad_norm": 0.3012791508232329, + "learning_rate": 8.21359223300971e-06, + "loss": 0.2683, + "step": 7315 + }, + { + "epoch": 6.816022356776898, + "grad_norm": 0.3128355013767348, + "learning_rate": 8.181229773462784e-06, + "loss": 0.2664, + "step": 7320 + }, + { + "epoch": 6.8206800186306475, + "grad_norm": 0.3024946632235964, + "learning_rate": 8.148867313915857e-06, + "loss": 0.2706, + "step": 7325 + }, + { + "epoch": 6.825337680484397, + "grad_norm": 0.30155698607280634, + "learning_rate": 8.116504854368932e-06, + "loss": 0.2649, + "step": 7330 + }, + { + "epoch": 6.8299953423381465, + "grad_norm": 0.283215740652698, + "learning_rate": 8.084142394822007e-06, + "loss": 0.2661, + "step": 7335 + }, + { + "epoch": 6.834653004191896, + "grad_norm": 0.2967537224876354, + "learning_rate": 8.051779935275082e-06, + "loss": 0.2702, + "step": 7340 + }, + { + "epoch": 6.839310666045645, + "grad_norm": 0.28931249674766857, + "learning_rate": 8.019417475728155e-06, + "loss": 0.2665, + "step": 7345 + }, + { + "epoch": 6.843968327899394, + "grad_norm": 0.2755925519692027, + "learning_rate": 7.98705501618123e-06, + "loss": 0.266, + "step": 7350 + }, + { + "epoch": 6.848625989753144, + "grad_norm": 0.3054564057994233, + "learning_rate": 7.954692556634305e-06, + "loss": 0.2689, + "step": 7355 + }, + { + "epoch": 6.853283651606893, + "grad_norm": 0.297366586058889, + "learning_rate": 7.922330097087378e-06, + "loss": 0.2609, + "step": 7360 + }, + { + "epoch": 6.857941313460643, + "grad_norm": 0.30458237552842815, + "learning_rate": 7.889967637540453e-06, + "loss": 0.2685, + "step": 7365 + }, + { + "epoch": 6.862598975314392, + "grad_norm": 0.2933080510933296, + "learning_rate": 7.857605177993528e-06, + "loss": 0.2691, + "step": 7370 + }, + { + "epoch": 6.867256637168142, + "grad_norm": 0.2885045348475768, + "learning_rate": 7.825242718446603e-06, + "loss": 0.262, + "step": 7375 + }, + { + "epoch": 6.871914299021891, + "grad_norm": 0.31863232410466397, + "learning_rate": 7.792880258899676e-06, + "loss": 0.2669, + "step": 7380 + }, + { + "epoch": 6.876571960875641, + "grad_norm": 0.28217303875134564, + "learning_rate": 7.76051779935275e-06, + "loss": 0.2687, + "step": 7385 + }, + { + "epoch": 6.88122962272939, + "grad_norm": 0.2692090947552198, + "learning_rate": 7.728155339805825e-06, + "loss": 0.2636, + "step": 7390 + }, + { + "epoch": 6.88588728458314, + "grad_norm": 0.30114752200919587, + "learning_rate": 7.695792880258899e-06, + "loss": 0.2687, + "step": 7395 + }, + { + "epoch": 6.890544946436888, + "grad_norm": 0.28975468868704063, + "learning_rate": 7.663430420711974e-06, + "loss": 0.2649, + "step": 7400 + }, + { + "epoch": 6.895202608290639, + "grad_norm": 0.2868613444787753, + "learning_rate": 7.631067961165048e-06, + "loss": 0.266, + "step": 7405 + }, + { + "epoch": 6.899860270144387, + "grad_norm": 0.29949397350159185, + "learning_rate": 7.598705501618124e-06, + "loss": 0.2738, + "step": 7410 + }, + { + "epoch": 6.904517931998137, + "grad_norm": 0.29353444896116215, + "learning_rate": 7.566343042071197e-06, + "loss": 0.2635, + "step": 7415 + }, + { + "epoch": 6.909175593851886, + "grad_norm": 0.2936811288174701, + "learning_rate": 7.533980582524272e-06, + "loss": 0.2629, + "step": 7420 + }, + { + "epoch": 6.913833255705636, + "grad_norm": 0.28723294135836375, + "learning_rate": 7.501618122977347e-06, + "loss": 0.2639, + "step": 7425 + }, + { + "epoch": 6.918490917559385, + "grad_norm": 0.28236578729616, + "learning_rate": 7.469255663430422e-06, + "loss": 0.2653, + "step": 7430 + }, + { + "epoch": 6.923148579413135, + "grad_norm": 0.28907354168684685, + "learning_rate": 7.436893203883495e-06, + "loss": 0.2688, + "step": 7435 + }, + { + "epoch": 6.927806241266884, + "grad_norm": 0.28564285514552595, + "learning_rate": 7.40453074433657e-06, + "loss": 0.2649, + "step": 7440 + }, + { + "epoch": 6.932463903120634, + "grad_norm": 0.2837275318483068, + "learning_rate": 7.372168284789645e-06, + "loss": 0.2637, + "step": 7445 + }, + { + "epoch": 6.937121564974383, + "grad_norm": 0.2938890945273722, + "learning_rate": 7.339805825242718e-06, + "loss": 0.2639, + "step": 7450 + }, + { + "epoch": 6.941779226828133, + "grad_norm": 0.2898250518862583, + "learning_rate": 7.307443365695793e-06, + "loss": 0.2676, + "step": 7455 + }, + { + "epoch": 6.946436888681882, + "grad_norm": 0.2840903723540283, + "learning_rate": 7.275080906148868e-06, + "loss": 0.2574, + "step": 7460 + }, + { + "epoch": 6.951094550535631, + "grad_norm": 0.29011946794308335, + "learning_rate": 7.242718446601943e-06, + "loss": 0.2641, + "step": 7465 + }, + { + "epoch": 6.95575221238938, + "grad_norm": 0.27449848997937937, + "learning_rate": 7.210355987055016e-06, + "loss": 0.2629, + "step": 7470 + }, + { + "epoch": 6.96040987424313, + "grad_norm": 0.284340271468117, + "learning_rate": 7.177993527508091e-06, + "loss": 0.2605, + "step": 7475 + }, + { + "epoch": 6.965067536096879, + "grad_norm": 0.2889739086826554, + "learning_rate": 7.1456310679611655e-06, + "loss": 0.2745, + "step": 7480 + }, + { + "epoch": 6.969725197950629, + "grad_norm": 0.28879334801466866, + "learning_rate": 7.1132686084142395e-06, + "loss": 0.2678, + "step": 7485 + }, + { + "epoch": 6.974382859804378, + "grad_norm": 0.28876981261647255, + "learning_rate": 7.0809061488673136e-06, + "loss": 0.2639, + "step": 7490 + }, + { + "epoch": 6.979040521658128, + "grad_norm": 0.2756354004750388, + "learning_rate": 7.0485436893203884e-06, + "loss": 0.2715, + "step": 7495 + }, + { + "epoch": 6.983698183511877, + "grad_norm": 0.2819723345873216, + "learning_rate": 7.016181229773463e-06, + "loss": 0.272, + "step": 7500 + }, + { + "epoch": 6.988355845365627, + "grad_norm": 0.2861866310764198, + "learning_rate": 6.983818770226537e-06, + "loss": 0.2666, + "step": 7505 + }, + { + "epoch": 6.993013507219376, + "grad_norm": 0.2958946961429871, + "learning_rate": 6.951456310679612e-06, + "loss": 0.2666, + "step": 7510 + }, + { + "epoch": 6.997671169073126, + "grad_norm": 0.2889127846949159, + "learning_rate": 6.919093851132687e-06, + "loss": 0.2719, + "step": 7515 + }, + { + "epoch": 7.008383791336749, + "grad_norm": 0.32096371180847455, + "learning_rate": 6.886731391585761e-06, + "loss": 0.2247, + "step": 7520 + }, + { + "epoch": 7.013041453190499, + "grad_norm": 0.3488405021055525, + "learning_rate": 6.854368932038835e-06, + "loss": 0.2307, + "step": 7525 + }, + { + "epoch": 7.017699115044247, + "grad_norm": 0.3327638817581441, + "learning_rate": 6.82200647249191e-06, + "loss": 0.233, + "step": 7530 + }, + { + "epoch": 7.022356776897997, + "grad_norm": 0.317729462852438, + "learning_rate": 6.789644012944985e-06, + "loss": 0.2325, + "step": 7535 + }, + { + "epoch": 7.027014438751746, + "grad_norm": 0.32110341569284023, + "learning_rate": 6.757281553398058e-06, + "loss": 0.2293, + "step": 7540 + }, + { + "epoch": 7.031672100605496, + "grad_norm": 0.304755330404921, + "learning_rate": 6.724919093851133e-06, + "loss": 0.2268, + "step": 7545 + }, + { + "epoch": 7.036329762459245, + "grad_norm": 0.3050925925720897, + "learning_rate": 6.692556634304208e-06, + "loss": 0.2291, + "step": 7550 + }, + { + "epoch": 7.040987424312995, + "grad_norm": 0.3031758245942809, + "learning_rate": 6.660194174757283e-06, + "loss": 0.2329, + "step": 7555 + }, + { + "epoch": 7.045645086166744, + "grad_norm": 0.3000571027142737, + "learning_rate": 6.627831715210356e-06, + "loss": 0.2377, + "step": 7560 + }, + { + "epoch": 7.050302748020494, + "grad_norm": 0.29397010498824716, + "learning_rate": 6.595469255663431e-06, + "loss": 0.2312, + "step": 7565 + }, + { + "epoch": 7.054960409874243, + "grad_norm": 0.3145370761461229, + "learning_rate": 6.5631067961165056e-06, + "loss": 0.2341, + "step": 7570 + }, + { + "epoch": 7.059618071727993, + "grad_norm": 0.3036224922667628, + "learning_rate": 6.53074433656958e-06, + "loss": 0.2268, + "step": 7575 + }, + { + "epoch": 7.064275733581742, + "grad_norm": 0.3059794559599267, + "learning_rate": 6.498381877022654e-06, + "loss": 0.2294, + "step": 7580 + }, + { + "epoch": 7.068933395435492, + "grad_norm": 0.31289234807793126, + "learning_rate": 6.4660194174757285e-06, + "loss": 0.2264, + "step": 7585 + }, + { + "epoch": 7.07359105728924, + "grad_norm": 0.29989915293078834, + "learning_rate": 6.433656957928803e-06, + "loss": 0.2354, + "step": 7590 + }, + { + "epoch": 7.07824871914299, + "grad_norm": 0.3075677478265152, + "learning_rate": 6.4012944983818765e-06, + "loss": 0.2369, + "step": 7595 + }, + { + "epoch": 7.082906380996739, + "grad_norm": 0.3048232778200601, + "learning_rate": 6.368932038834951e-06, + "loss": 0.2364, + "step": 7600 + }, + { + "epoch": 7.087564042850489, + "grad_norm": 0.3190828574703254, + "learning_rate": 6.336569579288026e-06, + "loss": 0.2296, + "step": 7605 + }, + { + "epoch": 7.092221704704238, + "grad_norm": 0.31184952505319163, + "learning_rate": 6.304207119741101e-06, + "loss": 0.2343, + "step": 7610 + }, + { + "epoch": 7.096879366557988, + "grad_norm": 0.30613800937594887, + "learning_rate": 6.271844660194174e-06, + "loss": 0.2303, + "step": 7615 + }, + { + "epoch": 7.101537028411737, + "grad_norm": 0.30498712611830453, + "learning_rate": 6.239482200647249e-06, + "loss": 0.2259, + "step": 7620 + }, + { + "epoch": 7.106194690265487, + "grad_norm": 0.3100049730029901, + "learning_rate": 6.207119741100323e-06, + "loss": 0.2251, + "step": 7625 + }, + { + "epoch": 7.110852352119236, + "grad_norm": 0.31023444487558577, + "learning_rate": 6.174757281553398e-06, + "loss": 0.2308, + "step": 7630 + }, + { + "epoch": 7.115510013972986, + "grad_norm": 0.29584613934244036, + "learning_rate": 6.142394822006473e-06, + "loss": 0.2304, + "step": 7635 + }, + { + "epoch": 7.120167675826735, + "grad_norm": 0.29448593108961485, + "learning_rate": 6.110032362459547e-06, + "loss": 0.2371, + "step": 7640 + }, + { + "epoch": 7.124825337680484, + "grad_norm": 0.29164372951556267, + "learning_rate": 6.077669902912622e-06, + "loss": 0.2342, + "step": 7645 + }, + { + "epoch": 7.1294829995342335, + "grad_norm": 0.29109888029207276, + "learning_rate": 6.045307443365697e-06, + "loss": 0.229, + "step": 7650 + }, + { + "epoch": 7.134140661387983, + "grad_norm": 0.30246909213948103, + "learning_rate": 6.012944983818771e-06, + "loss": 0.2303, + "step": 7655 + }, + { + "epoch": 7.1387983232417325, + "grad_norm": 0.30563015303434904, + "learning_rate": 5.980582524271846e-06, + "loss": 0.233, + "step": 7660 + }, + { + "epoch": 7.143455985095482, + "grad_norm": 0.2930342631619546, + "learning_rate": 5.94822006472492e-06, + "loss": 0.2298, + "step": 7665 + }, + { + "epoch": 7.1481136469492315, + "grad_norm": 0.2969918259211002, + "learning_rate": 5.915857605177994e-06, + "loss": 0.2348, + "step": 7670 + }, + { + "epoch": 7.152771308802981, + "grad_norm": 0.2938360434343845, + "learning_rate": 5.8834951456310685e-06, + "loss": 0.2251, + "step": 7675 + }, + { + "epoch": 7.1574289706567304, + "grad_norm": 0.30467062065402506, + "learning_rate": 5.8511326860841425e-06, + "loss": 0.2333, + "step": 7680 + }, + { + "epoch": 7.16208663251048, + "grad_norm": 0.3070298308597322, + "learning_rate": 5.818770226537217e-06, + "loss": 0.2317, + "step": 7685 + }, + { + "epoch": 7.166744294364229, + "grad_norm": 0.3199230292905365, + "learning_rate": 5.786407766990291e-06, + "loss": 0.234, + "step": 7690 + }, + { + "epoch": 7.171401956217979, + "grad_norm": 0.319527302015154, + "learning_rate": 5.754045307443366e-06, + "loss": 0.2344, + "step": 7695 + }, + { + "epoch": 7.1760596180717275, + "grad_norm": 0.2955541120587258, + "learning_rate": 5.72168284789644e-06, + "loss": 0.2272, + "step": 7700 + }, + { + "epoch": 7.180717279925477, + "grad_norm": 0.3013211631850419, + "learning_rate": 5.689320388349515e-06, + "loss": 0.2318, + "step": 7705 + }, + { + "epoch": 7.1853749417792265, + "grad_norm": 0.3003735465957072, + "learning_rate": 5.656957928802589e-06, + "loss": 0.2281, + "step": 7710 + }, + { + "epoch": 7.190032603632976, + "grad_norm": 0.29421673650976043, + "learning_rate": 5.624595469255663e-06, + "loss": 0.2268, + "step": 7715 + }, + { + "epoch": 7.1946902654867255, + "grad_norm": 0.3161682831115032, + "learning_rate": 5.592233009708738e-06, + "loss": 0.2282, + "step": 7720 + }, + { + "epoch": 7.199347927340475, + "grad_norm": 0.31838340427441747, + "learning_rate": 5.559870550161812e-06, + "loss": 0.233, + "step": 7725 + }, + { + "epoch": 7.2040055891942245, + "grad_norm": 0.3126739004760499, + "learning_rate": 5.527508090614887e-06, + "loss": 0.2335, + "step": 7730 + }, + { + "epoch": 7.208663251047974, + "grad_norm": 0.30809587611518674, + "learning_rate": 5.495145631067961e-06, + "loss": 0.2297, + "step": 7735 + }, + { + "epoch": 7.2133209129017235, + "grad_norm": 0.3178394441862703, + "learning_rate": 5.462783171521036e-06, + "loss": 0.2327, + "step": 7740 + }, + { + "epoch": 7.217978574755473, + "grad_norm": 0.29434533557144726, + "learning_rate": 5.43042071197411e-06, + "loss": 0.2333, + "step": 7745 + }, + { + "epoch": 7.2226362366092225, + "grad_norm": 0.30090154673070385, + "learning_rate": 5.398058252427185e-06, + "loss": 0.2333, + "step": 7750 + }, + { + "epoch": 7.227293898462972, + "grad_norm": 0.3002754882767805, + "learning_rate": 5.365695792880259e-06, + "loss": 0.2286, + "step": 7755 + }, + { + "epoch": 7.231951560316721, + "grad_norm": 0.29763020403980267, + "learning_rate": 5.333333333333334e-06, + "loss": 0.2305, + "step": 7760 + }, + { + "epoch": 7.23660922217047, + "grad_norm": 0.3044463012655862, + "learning_rate": 5.300970873786408e-06, + "loss": 0.2323, + "step": 7765 + }, + { + "epoch": 7.24126688402422, + "grad_norm": 0.33893341317409675, + "learning_rate": 5.2686084142394825e-06, + "loss": 0.23, + "step": 7770 + }, + { + "epoch": 7.245924545877969, + "grad_norm": 0.29788870867671846, + "learning_rate": 5.2362459546925566e-06, + "loss": 0.2271, + "step": 7775 + }, + { + "epoch": 7.250582207731719, + "grad_norm": 0.3001345810140399, + "learning_rate": 5.2038834951456314e-06, + "loss": 0.2356, + "step": 7780 + }, + { + "epoch": 7.255239869585468, + "grad_norm": 0.29624174126127084, + "learning_rate": 5.171521035598706e-06, + "loss": 0.2258, + "step": 7785 + }, + { + "epoch": 7.259897531439218, + "grad_norm": 0.2998103795335706, + "learning_rate": 5.13915857605178e-06, + "loss": 0.2261, + "step": 7790 + }, + { + "epoch": 7.264555193292967, + "grad_norm": 0.30170442123639113, + "learning_rate": 5.106796116504855e-06, + "loss": 0.2335, + "step": 7795 + }, + { + "epoch": 7.269212855146717, + "grad_norm": 0.3048136215688031, + "learning_rate": 5.074433656957929e-06, + "loss": 0.2291, + "step": 7800 + }, + { + "epoch": 7.273870517000466, + "grad_norm": 0.308336424611371, + "learning_rate": 5.042071197411004e-06, + "loss": 0.2351, + "step": 7805 + }, + { + "epoch": 7.2785281788542155, + "grad_norm": 0.2981761242984121, + "learning_rate": 5.009708737864078e-06, + "loss": 0.2296, + "step": 7810 + }, + { + "epoch": 7.283185840707965, + "grad_norm": 0.3125107994925421, + "learning_rate": 4.977346278317152e-06, + "loss": 0.2327, + "step": 7815 + }, + { + "epoch": 7.287843502561714, + "grad_norm": 0.29783242600642124, + "learning_rate": 4.944983818770227e-06, + "loss": 0.2339, + "step": 7820 + }, + { + "epoch": 7.292501164415463, + "grad_norm": 0.3070161169497212, + "learning_rate": 4.912621359223301e-06, + "loss": 0.224, + "step": 7825 + }, + { + "epoch": 7.297158826269213, + "grad_norm": 0.3066786806886577, + "learning_rate": 4.880258899676376e-06, + "loss": 0.2364, + "step": 7830 + }, + { + "epoch": 7.301816488122962, + "grad_norm": 0.3138937968754756, + "learning_rate": 4.84789644012945e-06, + "loss": 0.2366, + "step": 7835 + }, + { + "epoch": 7.306474149976712, + "grad_norm": 0.29513622504520076, + "learning_rate": 4.815533980582525e-06, + "loss": 0.2268, + "step": 7840 + }, + { + "epoch": 7.311131811830461, + "grad_norm": 0.3005379349455707, + "learning_rate": 4.783171521035599e-06, + "loss": 0.2315, + "step": 7845 + }, + { + "epoch": 7.315789473684211, + "grad_norm": 0.311541396432155, + "learning_rate": 4.750809061488674e-06, + "loss": 0.2353, + "step": 7850 + }, + { + "epoch": 7.32044713553796, + "grad_norm": 0.2946986117882064, + "learning_rate": 4.718446601941748e-06, + "loss": 0.228, + "step": 7855 + }, + { + "epoch": 7.32510479739171, + "grad_norm": 0.2863751901804945, + "learning_rate": 4.686084142394822e-06, + "loss": 0.2334, + "step": 7860 + }, + { + "epoch": 7.329762459245459, + "grad_norm": 0.2863240982722415, + "learning_rate": 4.653721682847897e-06, + "loss": 0.2276, + "step": 7865 + }, + { + "epoch": 7.334420121099209, + "grad_norm": 0.3045987014232288, + "learning_rate": 4.621359223300971e-06, + "loss": 0.233, + "step": 7870 + }, + { + "epoch": 7.339077782952957, + "grad_norm": 0.30961495259781646, + "learning_rate": 4.5889967637540455e-06, + "loss": 0.2325, + "step": 7875 + }, + { + "epoch": 7.343735444806707, + "grad_norm": 0.3004648632404717, + "learning_rate": 4.5566343042071195e-06, + "loss": 0.2365, + "step": 7880 + }, + { + "epoch": 7.348393106660456, + "grad_norm": 0.29199875078520454, + "learning_rate": 4.524271844660194e-06, + "loss": 0.2266, + "step": 7885 + }, + { + "epoch": 7.353050768514206, + "grad_norm": 0.30603013293302106, + "learning_rate": 4.491909385113268e-06, + "loss": 0.2333, + "step": 7890 + }, + { + "epoch": 7.357708430367955, + "grad_norm": 0.30955009017341273, + "learning_rate": 4.459546925566343e-06, + "loss": 0.2362, + "step": 7895 + }, + { + "epoch": 7.362366092221705, + "grad_norm": 0.29175598178699413, + "learning_rate": 4.427184466019417e-06, + "loss": 0.2356, + "step": 7900 + }, + { + "epoch": 7.367023754075454, + "grad_norm": 0.2924815398673306, + "learning_rate": 4.394822006472492e-06, + "loss": 0.2359, + "step": 7905 + }, + { + "epoch": 7.371681415929204, + "grad_norm": 0.3036714855580005, + "learning_rate": 4.362459546925567e-06, + "loss": 0.2278, + "step": 7910 + }, + { + "epoch": 7.376339077782953, + "grad_norm": 0.2957981028535776, + "learning_rate": 4.330097087378641e-06, + "loss": 0.2325, + "step": 7915 + }, + { + "epoch": 7.380996739636703, + "grad_norm": 0.29468279953281146, + "learning_rate": 4.297734627831716e-06, + "loss": 0.2331, + "step": 7920 + }, + { + "epoch": 7.385654401490452, + "grad_norm": 0.3007582685414487, + "learning_rate": 4.26537216828479e-06, + "loss": 0.2381, + "step": 7925 + }, + { + "epoch": 7.390312063344201, + "grad_norm": 0.299592184905258, + "learning_rate": 4.233009708737865e-06, + "loss": 0.2381, + "step": 7930 + }, + { + "epoch": 7.39496972519795, + "grad_norm": 0.30220768863760616, + "learning_rate": 4.200647249190939e-06, + "loss": 0.2362, + "step": 7935 + }, + { + "epoch": 7.3996273870517, + "grad_norm": 0.3125457994305197, + "learning_rate": 4.168284789644014e-06, + "loss": 0.233, + "step": 7940 + }, + { + "epoch": 7.404285048905449, + "grad_norm": 0.300485101651472, + "learning_rate": 4.135922330097088e-06, + "loss": 0.2258, + "step": 7945 + }, + { + "epoch": 7.408942710759199, + "grad_norm": 0.3002736163154933, + "learning_rate": 4.103559870550162e-06, + "loss": 0.2311, + "step": 7950 + }, + { + "epoch": 7.413600372612948, + "grad_norm": 0.2954509859677457, + "learning_rate": 4.071197411003237e-06, + "loss": 0.2389, + "step": 7955 + }, + { + "epoch": 7.418258034466698, + "grad_norm": 0.2968556612469226, + "learning_rate": 4.038834951456311e-06, + "loss": 0.2345, + "step": 7960 + }, + { + "epoch": 7.422915696320447, + "grad_norm": 0.3256328206048908, + "learning_rate": 4.0064724919093855e-06, + "loss": 0.2315, + "step": 7965 + }, + { + "epoch": 7.427573358174197, + "grad_norm": 0.30621774169996147, + "learning_rate": 3.9741100323624595e-06, + "loss": 0.2331, + "step": 7970 + }, + { + "epoch": 7.432231020027946, + "grad_norm": 0.2890683583664343, + "learning_rate": 3.941747572815534e-06, + "loss": 0.2346, + "step": 7975 + }, + { + "epoch": 7.436888681881696, + "grad_norm": 0.29389431766416, + "learning_rate": 3.9093851132686084e-06, + "loss": 0.2286, + "step": 7980 + }, + { + "epoch": 7.441546343735445, + "grad_norm": 0.2972520095660748, + "learning_rate": 3.877022653721683e-06, + "loss": 0.2355, + "step": 7985 + }, + { + "epoch": 7.446204005589194, + "grad_norm": 0.29555186694090657, + "learning_rate": 3.844660194174757e-06, + "loss": 0.2358, + "step": 7990 + }, + { + "epoch": 7.450861667442943, + "grad_norm": 0.3018674902048298, + "learning_rate": 3.8122977346278318e-06, + "loss": 0.2301, + "step": 7995 + }, + { + "epoch": 7.455519329296693, + "grad_norm": 0.2893659186658692, + "learning_rate": 3.7799352750809062e-06, + "loss": 0.2282, + "step": 8000 + }, + { + "epoch": 7.460176991150442, + "grad_norm": 0.29704410993243424, + "learning_rate": 3.7475728155339807e-06, + "loss": 0.2295, + "step": 8005 + }, + { + "epoch": 7.464834653004192, + "grad_norm": 0.299885140331387, + "learning_rate": 3.7152103559870555e-06, + "loss": 0.2309, + "step": 8010 + }, + { + "epoch": 7.469492314857941, + "grad_norm": 0.28340958647352826, + "learning_rate": 3.6828478964401296e-06, + "loss": 0.2257, + "step": 8015 + }, + { + "epoch": 7.474149976711691, + "grad_norm": 0.29171765309912573, + "learning_rate": 3.6504854368932044e-06, + "loss": 0.2292, + "step": 8020 + }, + { + "epoch": 7.47880763856544, + "grad_norm": 0.2949674377093234, + "learning_rate": 3.6181229773462785e-06, + "loss": 0.2402, + "step": 8025 + }, + { + "epoch": 7.48346530041919, + "grad_norm": 0.2959510821426171, + "learning_rate": 3.5857605177993533e-06, + "loss": 0.232, + "step": 8030 + }, + { + "epoch": 7.488122962272939, + "grad_norm": 0.30774189794162643, + "learning_rate": 3.5533980582524273e-06, + "loss": 0.2346, + "step": 8035 + }, + { + "epoch": 7.492780624126689, + "grad_norm": 0.2898875205533011, + "learning_rate": 3.5210355987055014e-06, + "loss": 0.2271, + "step": 8040 + }, + { + "epoch": 7.497438285980438, + "grad_norm": 0.2937295756525654, + "learning_rate": 3.4886731391585762e-06, + "loss": 0.2368, + "step": 8045 + }, + { + "epoch": 7.502095947834187, + "grad_norm": 0.29222859784505506, + "learning_rate": 3.4563106796116503e-06, + "loss": 0.2245, + "step": 8050 + }, + { + "epoch": 7.506753609687936, + "grad_norm": 0.3088248045967586, + "learning_rate": 3.423948220064725e-06, + "loss": 0.2356, + "step": 8055 + }, + { + "epoch": 7.511411271541686, + "grad_norm": 0.29795075855304565, + "learning_rate": 3.391585760517799e-06, + "loss": 0.234, + "step": 8060 + }, + { + "epoch": 7.516068933395435, + "grad_norm": 0.29499070200838384, + "learning_rate": 3.359223300970874e-06, + "loss": 0.2337, + "step": 8065 + }, + { + "epoch": 7.520726595249185, + "grad_norm": 0.29851001641469693, + "learning_rate": 3.3268608414239485e-06, + "loss": 0.2318, + "step": 8070 + }, + { + "epoch": 7.525384257102934, + "grad_norm": 0.31823647472711214, + "learning_rate": 3.294498381877023e-06, + "loss": 0.2379, + "step": 8075 + }, + { + "epoch": 7.530041918956684, + "grad_norm": 0.2949888501288617, + "learning_rate": 3.2621359223300974e-06, + "loss": 0.2382, + "step": 8080 + }, + { + "epoch": 7.534699580810433, + "grad_norm": 0.3079856554407537, + "learning_rate": 3.2297734627831714e-06, + "loss": 0.2328, + "step": 8085 + }, + { + "epoch": 7.539357242664183, + "grad_norm": 0.30063866398956485, + "learning_rate": 3.1974110032362463e-06, + "loss": 0.2296, + "step": 8090 + }, + { + "epoch": 7.544014904517932, + "grad_norm": 0.3079909250110497, + "learning_rate": 3.1650485436893203e-06, + "loss": 0.23, + "step": 8095 + }, + { + "epoch": 7.548672566371682, + "grad_norm": 0.2958436428090986, + "learning_rate": 3.132686084142395e-06, + "loss": 0.2333, + "step": 8100 + }, + { + "epoch": 7.55333022822543, + "grad_norm": 0.3053616138559035, + "learning_rate": 3.1003236245954696e-06, + "loss": 0.2317, + "step": 8105 + }, + { + "epoch": 7.55798789007918, + "grad_norm": 0.30532557136329946, + "learning_rate": 3.0679611650485436e-06, + "loss": 0.2326, + "step": 8110 + }, + { + "epoch": 7.562645551932929, + "grad_norm": 0.2979271334069836, + "learning_rate": 3.035598705501618e-06, + "loss": 0.2307, + "step": 8115 + }, + { + "epoch": 7.567303213786679, + "grad_norm": 0.2960491281556295, + "learning_rate": 3.0032362459546925e-06, + "loss": 0.2357, + "step": 8120 + }, + { + "epoch": 7.571960875640428, + "grad_norm": 0.28863064035013297, + "learning_rate": 2.970873786407767e-06, + "loss": 0.2252, + "step": 8125 + }, + { + "epoch": 7.576618537494178, + "grad_norm": 0.29019940510217623, + "learning_rate": 2.9385113268608414e-06, + "loss": 0.2306, + "step": 8130 + }, + { + "epoch": 7.581276199347927, + "grad_norm": 0.28983014326869905, + "learning_rate": 2.906148867313916e-06, + "loss": 0.232, + "step": 8135 + }, + { + "epoch": 7.585933861201677, + "grad_norm": 0.3016603416714441, + "learning_rate": 2.8737864077669903e-06, + "loss": 0.2397, + "step": 8140 + }, + { + "epoch": 7.590591523055426, + "grad_norm": 0.29625751997817373, + "learning_rate": 2.841423948220065e-06, + "loss": 0.2316, + "step": 8145 + }, + { + "epoch": 7.595249184909176, + "grad_norm": 0.29904315648276253, + "learning_rate": 2.8090614886731396e-06, + "loss": 0.2351, + "step": 8150 + }, + { + "epoch": 7.599906846762925, + "grad_norm": 0.29640256634448936, + "learning_rate": 2.7766990291262136e-06, + "loss": 0.2255, + "step": 8155 + }, + { + "epoch": 7.604564508616674, + "grad_norm": 0.3005793461785394, + "learning_rate": 2.744336569579288e-06, + "loss": 0.2341, + "step": 8160 + }, + { + "epoch": 7.6092221704704235, + "grad_norm": 0.28604597838088214, + "learning_rate": 2.7119741100323625e-06, + "loss": 0.2331, + "step": 8165 + }, + { + "epoch": 7.613879832324173, + "grad_norm": 0.30494875057396265, + "learning_rate": 2.679611650485437e-06, + "loss": 0.2302, + "step": 8170 + }, + { + "epoch": 7.6185374941779225, + "grad_norm": 0.29665569172774714, + "learning_rate": 2.6472491909385114e-06, + "loss": 0.2268, + "step": 8175 + }, + { + "epoch": 7.623195156031672, + "grad_norm": 0.2903936085950602, + "learning_rate": 2.614886731391586e-06, + "loss": 0.2305, + "step": 8180 + }, + { + "epoch": 7.6278528178854215, + "grad_norm": 0.2840449153885873, + "learning_rate": 2.5825242718446603e-06, + "loss": 0.2279, + "step": 8185 + }, + { + "epoch": 7.632510479739171, + "grad_norm": 0.29311363279163966, + "learning_rate": 2.5501618122977347e-06, + "loss": 0.232, + "step": 8190 + }, + { + "epoch": 7.6371681415929205, + "grad_norm": 0.3024133237930259, + "learning_rate": 2.517799352750809e-06, + "loss": 0.234, + "step": 8195 + }, + { + "epoch": 7.64182580344667, + "grad_norm": 0.29842457764289687, + "learning_rate": 2.4854368932038836e-06, + "loss": 0.2283, + "step": 8200 + }, + { + "epoch": 7.6464834653004194, + "grad_norm": 0.29404576936950766, + "learning_rate": 2.453074433656958e-06, + "loss": 0.2308, + "step": 8205 + }, + { + "epoch": 7.651141127154169, + "grad_norm": 0.30149197235212855, + "learning_rate": 2.4207119741100325e-06, + "loss": 0.2364, + "step": 8210 + }, + { + "epoch": 7.6557987890079175, + "grad_norm": 0.30022001517778296, + "learning_rate": 2.388349514563107e-06, + "loss": 0.2296, + "step": 8215 + }, + { + "epoch": 7.660456450861668, + "grad_norm": 0.2886569031218321, + "learning_rate": 2.3559870550161814e-06, + "loss": 0.2293, + "step": 8220 + }, + { + "epoch": 7.6651141127154165, + "grad_norm": 0.29162327843663, + "learning_rate": 2.323624595469256e-06, + "loss": 0.2314, + "step": 8225 + }, + { + "epoch": 7.669771774569166, + "grad_norm": 0.304313023599954, + "learning_rate": 2.2912621359223303e-06, + "loss": 0.2324, + "step": 8230 + }, + { + "epoch": 7.6744294364229155, + "grad_norm": 0.3155740911282473, + "learning_rate": 2.2588996763754048e-06, + "loss": 0.2368, + "step": 8235 + }, + { + "epoch": 7.679087098276665, + "grad_norm": 0.30076835045192435, + "learning_rate": 2.226537216828479e-06, + "loss": 0.2337, + "step": 8240 + }, + { + "epoch": 7.6837447601304145, + "grad_norm": 0.2967465249604805, + "learning_rate": 2.1941747572815537e-06, + "loss": 0.2291, + "step": 8245 + }, + { + "epoch": 7.688402421984164, + "grad_norm": 0.3040736391467545, + "learning_rate": 2.1618122977346277e-06, + "loss": 0.2303, + "step": 8250 + }, + { + "epoch": 7.6930600838379135, + "grad_norm": 0.30152847877289063, + "learning_rate": 2.129449838187702e-06, + "loss": 0.2305, + "step": 8255 + }, + { + "epoch": 7.697717745691663, + "grad_norm": 0.28538787401129073, + "learning_rate": 2.0970873786407766e-06, + "loss": 0.2291, + "step": 8260 + }, + { + "epoch": 7.7023754075454125, + "grad_norm": 0.2998642191190617, + "learning_rate": 2.064724919093851e-06, + "loss": 0.2383, + "step": 8265 + }, + { + "epoch": 7.707033069399162, + "grad_norm": 0.2917780958653254, + "learning_rate": 2.0323624595469255e-06, + "loss": 0.2332, + "step": 8270 + }, + { + "epoch": 7.7116907312529115, + "grad_norm": 0.298805259047907, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.2303, + "step": 8275 + }, + { + "epoch": 7.71634839310666, + "grad_norm": 0.2890465234614115, + "learning_rate": 1.9676375404530748e-06, + "loss": 0.2275, + "step": 8280 + }, + { + "epoch": 7.72100605496041, + "grad_norm": 0.29389457909621547, + "learning_rate": 1.9352750809061492e-06, + "loss": 0.2284, + "step": 8285 + }, + { + "epoch": 7.725663716814159, + "grad_norm": 0.2959644171394708, + "learning_rate": 1.9029126213592235e-06, + "loss": 0.2345, + "step": 8290 + }, + { + "epoch": 7.730321378667909, + "grad_norm": 0.29421810879944266, + "learning_rate": 1.8705501618122977e-06, + "loss": 0.2302, + "step": 8295 + }, + { + "epoch": 7.734979040521658, + "grad_norm": 0.2915200400387438, + "learning_rate": 1.8381877022653721e-06, + "loss": 0.2274, + "step": 8300 + }, + { + "epoch": 7.739636702375408, + "grad_norm": 0.30041840963474026, + "learning_rate": 1.8058252427184466e-06, + "loss": 0.2298, + "step": 8305 + }, + { + "epoch": 7.744294364229157, + "grad_norm": 0.2811313882467788, + "learning_rate": 1.773462783171521e-06, + "loss": 0.2266, + "step": 8310 + }, + { + "epoch": 7.748952026082907, + "grad_norm": 0.28925553908674034, + "learning_rate": 1.7411003236245955e-06, + "loss": 0.2319, + "step": 8315 + }, + { + "epoch": 7.753609687936656, + "grad_norm": 0.2906004808859835, + "learning_rate": 1.70873786407767e-06, + "loss": 0.2332, + "step": 8320 + }, + { + "epoch": 7.7582673497904056, + "grad_norm": 0.28951439849956767, + "learning_rate": 1.6763754045307446e-06, + "loss": 0.2324, + "step": 8325 + }, + { + "epoch": 7.762925011644155, + "grad_norm": 0.29773446769676193, + "learning_rate": 1.644012944983819e-06, + "loss": 0.2324, + "step": 8330 + }, + { + "epoch": 7.767582673497904, + "grad_norm": 0.3004545400619002, + "learning_rate": 1.6116504854368935e-06, + "loss": 0.2331, + "step": 8335 + }, + { + "epoch": 7.772240335351653, + "grad_norm": 0.2852230726291482, + "learning_rate": 1.5792880258899675e-06, + "loss": 0.2285, + "step": 8340 + }, + { + "epoch": 7.776897997205403, + "grad_norm": 0.3054806079257286, + "learning_rate": 1.5469255663430422e-06, + "loss": 0.2333, + "step": 8345 + }, + { + "epoch": 7.781555659059152, + "grad_norm": 0.2821067570432199, + "learning_rate": 1.5145631067961166e-06, + "loss": 0.231, + "step": 8350 + }, + { + "epoch": 7.786213320912902, + "grad_norm": 0.2734352593394025, + "learning_rate": 1.482200647249191e-06, + "loss": 0.2284, + "step": 8355 + }, + { + "epoch": 7.790870982766651, + "grad_norm": 0.2744502239977389, + "learning_rate": 1.4498381877022655e-06, + "loss": 0.2303, + "step": 8360 + }, + { + "epoch": 7.795528644620401, + "grad_norm": 0.2906796981483134, + "learning_rate": 1.41747572815534e-06, + "loss": 0.2329, + "step": 8365 + }, + { + "epoch": 7.80018630647415, + "grad_norm": 0.285778377684361, + "learning_rate": 1.3851132686084144e-06, + "loss": 0.2303, + "step": 8370 + }, + { + "epoch": 7.8048439683279, + "grad_norm": 0.2807787673889005, + "learning_rate": 1.3527508090614886e-06, + "loss": 0.2349, + "step": 8375 + }, + { + "epoch": 7.809501630181649, + "grad_norm": 0.30122596269049373, + "learning_rate": 1.320388349514563e-06, + "loss": 0.2365, + "step": 8380 + }, + { + "epoch": 7.814159292035399, + "grad_norm": 0.29920390405740077, + "learning_rate": 1.2880258899676375e-06, + "loss": 0.2324, + "step": 8385 + }, + { + "epoch": 7.818816953889147, + "grad_norm": 0.29268110303894623, + "learning_rate": 1.2556634304207122e-06, + "loss": 0.2306, + "step": 8390 + }, + { + "epoch": 7.823474615742897, + "grad_norm": 0.2891680650027946, + "learning_rate": 1.2233009708737866e-06, + "loss": 0.2308, + "step": 8395 + }, + { + "epoch": 7.828132277596646, + "grad_norm": 0.2965934137032846, + "learning_rate": 1.1909385113268608e-06, + "loss": 0.2376, + "step": 8400 + }, + { + "epoch": 7.832789939450396, + "grad_norm": 0.2932565825711091, + "learning_rate": 1.1585760517799353e-06, + "loss": 0.2337, + "step": 8405 + }, + { + "epoch": 7.837447601304145, + "grad_norm": 0.29773472915142823, + "learning_rate": 1.1262135922330097e-06, + "loss": 0.2293, + "step": 8410 + }, + { + "epoch": 7.842105263157895, + "grad_norm": 0.2934365641617045, + "learning_rate": 1.0938511326860842e-06, + "loss": 0.2339, + "step": 8415 + }, + { + "epoch": 7.846762925011644, + "grad_norm": 0.2960715244509101, + "learning_rate": 1.0614886731391586e-06, + "loss": 0.2331, + "step": 8420 + }, + { + "epoch": 7.851420586865394, + "grad_norm": 0.2954502286031332, + "learning_rate": 1.029126213592233e-06, + "loss": 0.2254, + "step": 8425 + }, + { + "epoch": 7.856078248719143, + "grad_norm": 0.3018314394803379, + "learning_rate": 9.967637540453075e-07, + "loss": 0.2329, + "step": 8430 + }, + { + "epoch": 7.860735910572893, + "grad_norm": 0.2880968499417839, + "learning_rate": 9.64401294498382e-07, + "loss": 0.2357, + "step": 8435 + }, + { + "epoch": 7.865393572426642, + "grad_norm": 0.294434898554631, + "learning_rate": 9.320388349514564e-07, + "loss": 0.2312, + "step": 8440 + }, + { + "epoch": 7.870051234280391, + "grad_norm": 0.2932235274867024, + "learning_rate": 8.996763754045308e-07, + "loss": 0.2342, + "step": 8445 + }, + { + "epoch": 7.874708896134141, + "grad_norm": 0.2884539693871812, + "learning_rate": 8.673139158576052e-07, + "loss": 0.2335, + "step": 8450 + }, + { + "epoch": 7.87936655798789, + "grad_norm": 0.2936378535586453, + "learning_rate": 8.349514563106797e-07, + "loss": 0.2341, + "step": 8455 + }, + { + "epoch": 7.884024219841639, + "grad_norm": 0.289241138931296, + "learning_rate": 8.025889967637541e-07, + "loss": 0.2259, + "step": 8460 + }, + { + "epoch": 7.888681881695389, + "grad_norm": 0.29803961484355557, + "learning_rate": 7.702265372168285e-07, + "loss": 0.2371, + "step": 8465 + }, + { + "epoch": 7.893339543549138, + "grad_norm": 0.28305757547326993, + "learning_rate": 7.37864077669903e-07, + "loss": 0.2333, + "step": 8470 + }, + { + "epoch": 7.897997205402888, + "grad_norm": 0.2891859316380593, + "learning_rate": 7.055016181229773e-07, + "loss": 0.2311, + "step": 8475 + }, + { + "epoch": 7.902654867256637, + "grad_norm": 0.29477394307231747, + "learning_rate": 6.731391585760519e-07, + "loss": 0.235, + "step": 8480 + }, + { + "epoch": 7.907312529110387, + "grad_norm": 0.28070352814466104, + "learning_rate": 6.407766990291262e-07, + "loss": 0.2242, + "step": 8485 + }, + { + "epoch": 7.911970190964136, + "grad_norm": 0.2929726293630473, + "learning_rate": 6.084142394822007e-07, + "loss": 0.2367, + "step": 8490 + }, + { + "epoch": 7.916627852817886, + "grad_norm": 0.30314735503554396, + "learning_rate": 5.760517799352751e-07, + "loss": 0.2249, + "step": 8495 + }, + { + "epoch": 7.921285514671635, + "grad_norm": 0.29427314766744256, + "learning_rate": 5.436893203883496e-07, + "loss": 0.2307, + "step": 8500 + }, + { + "epoch": 7.925943176525385, + "grad_norm": 0.2912139356277789, + "learning_rate": 5.11326860841424e-07, + "loss": 0.2319, + "step": 8505 + }, + { + "epoch": 7.930600838379133, + "grad_norm": 0.293854156431429, + "learning_rate": 4.789644012944983e-07, + "loss": 0.2365, + "step": 8510 + }, + { + "epoch": 7.935258500232883, + "grad_norm": 0.2918833196305414, + "learning_rate": 4.4660194174757285e-07, + "loss": 0.2346, + "step": 8515 + }, + { + "epoch": 7.939916162086632, + "grad_norm": 0.2923200670274681, + "learning_rate": 4.1423948220064724e-07, + "loss": 0.2291, + "step": 8520 + }, + { + "epoch": 7.944573823940382, + "grad_norm": 0.2924664374700889, + "learning_rate": 3.818770226537217e-07, + "loss": 0.235, + "step": 8525 + }, + { + "epoch": 7.949231485794131, + "grad_norm": 0.30212915417954544, + "learning_rate": 3.4951456310679613e-07, + "loss": 0.2322, + "step": 8530 + }, + { + "epoch": 7.953889147647881, + "grad_norm": 0.30164147260075297, + "learning_rate": 3.171521035598706e-07, + "loss": 0.2266, + "step": 8535 + }, + { + "epoch": 7.95854680950163, + "grad_norm": 0.2939683525337563, + "learning_rate": 2.8478964401294497e-07, + "loss": 0.2305, + "step": 8540 + }, + { + "epoch": 7.96320447135538, + "grad_norm": 0.2755637464638358, + "learning_rate": 2.524271844660194e-07, + "loss": 0.2358, + "step": 8545 + }, + { + "epoch": 7.967862133209129, + "grad_norm": 0.2901318305536487, + "learning_rate": 2.2006472491909384e-07, + "loss": 0.2309, + "step": 8550 + }, + { + "epoch": 7.972519795062879, + "grad_norm": 0.2944501952740433, + "learning_rate": 1.8770226537216828e-07, + "loss": 0.2308, + "step": 8555 + }, + { + "epoch": 7.977177456916628, + "grad_norm": 0.2849006022937681, + "learning_rate": 1.5533980582524273e-07, + "loss": 0.2329, + "step": 8560 + }, + { + "epoch": 7.981835118770377, + "grad_norm": 0.289904412851554, + "learning_rate": 1.2297734627831717e-07, + "loss": 0.2334, + "step": 8565 + }, + { + "epoch": 7.986492780624126, + "grad_norm": 0.2926599702016367, + "learning_rate": 9.061488673139159e-08, + "loss": 0.2296, + "step": 8570 + }, + { + "epoch": 7.991150442477876, + "grad_norm": 0.30510864446562197, + "learning_rate": 5.8252427184466026e-08, + "loss": 0.234, + "step": 8575 + }, + { + "epoch": 7.995808104331625, + "grad_norm": 0.30231794718044347, + "learning_rate": 2.5889967637540452e-08, + "loss": 0.2294, + "step": 8580 + }, + { + "epoch": 7.999534233814625, + "step": 8584, + "total_flos": 6105625239486464.0, + "train_loss": 0.028776329486367866, + "train_runtime": 7678.0521, + "train_samples_per_second": 17.891, + "train_steps_per_second": 1.118 + } + ], + "logging_steps": 5, + "max_steps": 8584, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6105625239486464.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}