{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.698944636348022, "eval_steps": 500, "global_step": 27468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007888191250469157, "grad_norm": 19.37966537475586, "learning_rate": 1.0157273918741808e-06, "loss": 8.7613, "step": 31 }, { "epoch": 0.0015776382500938314, "grad_norm": 13.922501564025879, "learning_rate": 2.0314547837483616e-06, "loss": 7.7444, "step": 62 }, { "epoch": 0.002366457375140747, "grad_norm": 11.746194839477539, "learning_rate": 3.0471821756225426e-06, "loss": 6.5314, "step": 93 }, { "epoch": 0.003155276500187663, "grad_norm": 16.49347686767578, "learning_rate": 4.062909567496723e-06, "loss": 5.3965, "step": 124 }, { "epoch": 0.003944095625234578, "grad_norm": 14.946576118469238, "learning_rate": 5.078636959370905e-06, "loss": 4.8936, "step": 155 }, { "epoch": 0.004732914750281494, "grad_norm": 16.985593795776367, "learning_rate": 6.094364351245085e-06, "loss": 4.5739, "step": 186 }, { "epoch": 0.00552173387532841, "grad_norm": 17.912887573242188, "learning_rate": 7.110091743119267e-06, "loss": 4.3466, "step": 217 }, { "epoch": 0.006310553000375326, "grad_norm": 15.746530532836914, "learning_rate": 8.125819134993446e-06, "loss": 4.1902, "step": 248 }, { "epoch": 0.007099372125422241, "grad_norm": 24.640979766845703, "learning_rate": 9.141546526867629e-06, "loss": 4.0289, "step": 279 }, { "epoch": 0.007888191250469157, "grad_norm": 14.921712875366211, "learning_rate": 1.015727391874181e-05, "loss": 3.8831, "step": 310 }, { "epoch": 0.008677010375516072, "grad_norm": 13.711713790893555, "learning_rate": 1.117300131061599e-05, "loss": 3.7542, "step": 341 }, { "epoch": 0.009465829500562987, "grad_norm": 12.796462059020996, "learning_rate": 1.218872870249017e-05, "loss": 3.6361, "step": 372 }, { "epoch": 0.010254648625609904, "grad_norm": 13.015061378479004, "learning_rate": 1.3204456094364351e-05, "loss": 3.5447, "step": 403 }, { "epoch": 0.01104346775065682, "grad_norm": 11.99329662322998, "learning_rate": 1.4220183486238533e-05, "loss": 3.4755, "step": 434 }, { "epoch": 0.011832286875703734, "grad_norm": 14.673354148864746, "learning_rate": 1.5235910878112714e-05, "loss": 3.384, "step": 465 }, { "epoch": 0.012621106000750651, "grad_norm": 14.919001579284668, "learning_rate": 1.6251638269986893e-05, "loss": 3.3096, "step": 496 }, { "epoch": 0.013409925125797566, "grad_norm": 9.896846771240234, "learning_rate": 1.7267365661861077e-05, "loss": 3.2666, "step": 527 }, { "epoch": 0.014198744250844482, "grad_norm": 10.563855171203613, "learning_rate": 1.8283093053735257e-05, "loss": 3.1977, "step": 558 }, { "epoch": 0.014987563375891397, "grad_norm": 8.470562934875488, "learning_rate": 1.9298820445609438e-05, "loss": 3.1561, "step": 589 }, { "epoch": 0.015776382500938314, "grad_norm": 7.783871173858643, "learning_rate": 2.031454783748362e-05, "loss": 3.1059, "step": 620 }, { "epoch": 0.016565201625985227, "grad_norm": 7.2594194412231445, "learning_rate": 2.13302752293578e-05, "loss": 3.0697, "step": 651 }, { "epoch": 0.017354020751032144, "grad_norm": 6.209031581878662, "learning_rate": 2.234600262123198e-05, "loss": 3.0443, "step": 682 }, { "epoch": 0.01814283987607906, "grad_norm": 8.110218048095703, "learning_rate": 2.336173001310616e-05, "loss": 2.9893, "step": 713 }, { "epoch": 0.018931659001125974, "grad_norm": 6.372434616088867, "learning_rate": 2.437745740498034e-05, "loss": 2.9522, "step": 744 }, { "epoch": 0.01972047812617289, "grad_norm": 6.396059989929199, "learning_rate": 2.5393184796854525e-05, "loss": 2.9286, "step": 775 }, { "epoch": 0.020509297251219808, "grad_norm": 5.576442718505859, "learning_rate": 2.6408912188728702e-05, "loss": 2.8986, "step": 806 }, { "epoch": 0.02129811637626672, "grad_norm": 7.455265045166016, "learning_rate": 2.7424639580602886e-05, "loss": 2.8346, "step": 837 }, { "epoch": 0.02208693550131364, "grad_norm": 5.682501792907715, "learning_rate": 2.8440366972477066e-05, "loss": 2.8033, "step": 868 }, { "epoch": 0.022875754626360555, "grad_norm": 6.197375297546387, "learning_rate": 2.9456094364351244e-05, "loss": 2.8146, "step": 899 }, { "epoch": 0.02366457375140747, "grad_norm": 6.124543190002441, "learning_rate": 3.0471821756225428e-05, "loss": 2.7633, "step": 930 }, { "epoch": 0.024453392876454386, "grad_norm": 5.52219820022583, "learning_rate": 3.148754914809961e-05, "loss": 2.7328, "step": 961 }, { "epoch": 0.025242212001501303, "grad_norm": 5.2266950607299805, "learning_rate": 3.2503276539973785e-05, "loss": 2.7086, "step": 992 }, { "epoch": 0.026031031126548216, "grad_norm": 4.860825538635254, "learning_rate": 3.351900393184797e-05, "loss": 2.6655, "step": 1023 }, { "epoch": 0.026819850251595133, "grad_norm": 4.891534805297852, "learning_rate": 3.453473132372215e-05, "loss": 2.6643, "step": 1054 }, { "epoch": 0.027608669376642046, "grad_norm": 4.9125566482543945, "learning_rate": 3.555045871559633e-05, "loss": 2.6084, "step": 1085 }, { "epoch": 0.028397488501688963, "grad_norm": 4.432997703552246, "learning_rate": 3.6566186107470514e-05, "loss": 2.5912, "step": 1116 }, { "epoch": 0.02918630762673588, "grad_norm": 4.265169143676758, "learning_rate": 3.7581913499344695e-05, "loss": 2.574, "step": 1147 }, { "epoch": 0.029975126751782794, "grad_norm": 4.731688022613525, "learning_rate": 3.8597640891218876e-05, "loss": 2.548, "step": 1178 }, { "epoch": 0.03076394587682971, "grad_norm": 4.037458419799805, "learning_rate": 3.9613368283093056e-05, "loss": 2.5411, "step": 1209 }, { "epoch": 0.03155276500187663, "grad_norm": 4.526784896850586, "learning_rate": 4.062909567496724e-05, "loss": 2.5111, "step": 1240 }, { "epoch": 0.03234158412692354, "grad_norm": 4.06175422668457, "learning_rate": 4.164482306684142e-05, "loss": 2.519, "step": 1271 }, { "epoch": 0.033130403251970454, "grad_norm": 4.599717617034912, "learning_rate": 4.26605504587156e-05, "loss": 2.4858, "step": 1302 }, { "epoch": 0.033919222377017375, "grad_norm": 3.97731876373291, "learning_rate": 4.367627785058978e-05, "loss": 2.4902, "step": 1333 }, { "epoch": 0.03470804150206429, "grad_norm": 3.732372522354126, "learning_rate": 4.469200524246396e-05, "loss": 2.4718, "step": 1364 }, { "epoch": 0.0354968606271112, "grad_norm": 3.583721160888672, "learning_rate": 4.570773263433814e-05, "loss": 2.423, "step": 1395 }, { "epoch": 0.03628567975215812, "grad_norm": 3.5558745861053467, "learning_rate": 4.672346002621232e-05, "loss": 2.4137, "step": 1426 }, { "epoch": 0.037074498877205035, "grad_norm": 3.757810115814209, "learning_rate": 4.77391874180865e-05, "loss": 2.4368, "step": 1457 }, { "epoch": 0.03786331800225195, "grad_norm": 3.6044111251831055, "learning_rate": 4.875491480996068e-05, "loss": 2.4129, "step": 1488 }, { "epoch": 0.03865213712729887, "grad_norm": 3.366729736328125, "learning_rate": 4.977064220183487e-05, "loss": 2.3889, "step": 1519 }, { "epoch": 0.03944095625234578, "grad_norm": 3.2331082820892334, "learning_rate": 4.9999915451558777e-05, "loss": 2.3745, "step": 1550 }, { "epoch": 0.040229775377392696, "grad_norm": 3.150716781616211, "learning_rate": 4.999955597496219e-05, "loss": 2.3588, "step": 1581 }, { "epoch": 0.041018594502439616, "grad_norm": 3.4237565994262695, "learning_rate": 4.9998914381774255e-05, "loss": 2.3639, "step": 1612 }, { "epoch": 0.04180741362748653, "grad_norm": 2.9677655696868896, "learning_rate": 4.999799067923527e-05, "loss": 2.3587, "step": 1643 }, { "epoch": 0.04259623275253344, "grad_norm": 3.1596009731292725, "learning_rate": 4.999678487776908e-05, "loss": 2.3365, "step": 1674 }, { "epoch": 0.04338505187758036, "grad_norm": 3.4485344886779785, "learning_rate": 4.9995296990983006e-05, "loss": 2.3353, "step": 1705 }, { "epoch": 0.04417387100262728, "grad_norm": 3.016425848007202, "learning_rate": 4.999352703566763e-05, "loss": 2.3172, "step": 1736 }, { "epoch": 0.04496269012767419, "grad_norm": 2.8494279384613037, "learning_rate": 4.999147503179668e-05, "loss": 2.3241, "step": 1767 }, { "epoch": 0.04575150925272111, "grad_norm": 2.7547197341918945, "learning_rate": 4.998914100252672e-05, "loss": 2.301, "step": 1798 }, { "epoch": 0.046540328377768024, "grad_norm": 2.7102956771850586, "learning_rate": 4.998652497419696e-05, "loss": 2.3046, "step": 1829 }, { "epoch": 0.04732914750281494, "grad_norm": 2.5211455821990967, "learning_rate": 4.9983626976328927e-05, "loss": 2.2794, "step": 1860 }, { "epoch": 0.04811796662786186, "grad_norm": 2.7271173000335693, "learning_rate": 4.998044704162613e-05, "loss": 2.2554, "step": 1891 }, { "epoch": 0.04890678575290877, "grad_norm": 2.850342273712158, "learning_rate": 4.9976985205973705e-05, "loss": 2.2722, "step": 1922 }, { "epoch": 0.049695604877955685, "grad_norm": 2.6712722778320312, "learning_rate": 4.997324150843799e-05, "loss": 2.2412, "step": 1953 }, { "epoch": 0.050484424003002605, "grad_norm": 2.6086056232452393, "learning_rate": 4.99692159912661e-05, "loss": 2.2724, "step": 1984 }, { "epoch": 0.05127324312804952, "grad_norm": 2.495509147644043, "learning_rate": 4.996490869988546e-05, "loss": 2.2588, "step": 2015 }, { "epoch": 0.05206206225309643, "grad_norm": 2.671813488006592, "learning_rate": 4.996031968290326e-05, "loss": 2.2334, "step": 2046 }, { "epoch": 0.052850881378143345, "grad_norm": 2.6381373405456543, "learning_rate": 4.995544899210594e-05, "loss": 2.2361, "step": 2077 }, { "epoch": 0.053639700503190266, "grad_norm": 2.54028058052063, "learning_rate": 4.9950296682458583e-05, "loss": 2.2216, "step": 2108 }, { "epoch": 0.05442851962823718, "grad_norm": 2.67765212059021, "learning_rate": 4.994486281210429e-05, "loss": 2.2064, "step": 2139 }, { "epoch": 0.05521733875328409, "grad_norm": 2.570333957672119, "learning_rate": 4.9939147442363566e-05, "loss": 2.2334, "step": 2170 }, { "epoch": 0.05600615787833101, "grad_norm": 2.4661099910736084, "learning_rate": 4.9933150637733574e-05, "loss": 2.1975, "step": 2201 }, { "epoch": 0.056794977003377926, "grad_norm": 2.5485525131225586, "learning_rate": 4.992687246588743e-05, "loss": 2.1883, "step": 2232 }, { "epoch": 0.05758379612842484, "grad_norm": 2.471689462661743, "learning_rate": 4.992031299767347e-05, "loss": 2.1976, "step": 2263 }, { "epoch": 0.05837261525347176, "grad_norm": 2.4743897914886475, "learning_rate": 4.9913472307114386e-05, "loss": 2.175, "step": 2294 }, { "epoch": 0.059161434378518674, "grad_norm": 2.445451259613037, "learning_rate": 4.9906350471406446e-05, "loss": 2.1683, "step": 2325 }, { "epoch": 0.05995025350356559, "grad_norm": 2.406235456466675, "learning_rate": 4.989894757091861e-05, "loss": 2.1685, "step": 2356 }, { "epoch": 0.06073907262861251, "grad_norm": 2.2473762035369873, "learning_rate": 4.989126368919158e-05, "loss": 2.191, "step": 2387 }, { "epoch": 0.06152789175365942, "grad_norm": 2.3035123348236084, "learning_rate": 4.988329891293693e-05, "loss": 2.1702, "step": 2418 }, { "epoch": 0.062316710878706334, "grad_norm": 2.3511545658111572, "learning_rate": 4.987505333203608e-05, "loss": 2.1565, "step": 2449 }, { "epoch": 0.06310553000375325, "grad_norm": 2.2300775051116943, "learning_rate": 4.9866527039539276e-05, "loss": 2.1558, "step": 2480 }, { "epoch": 0.06389434912880017, "grad_norm": 2.3471133708953857, "learning_rate": 4.9857720131664594e-05, "loss": 2.1516, "step": 2511 }, { "epoch": 0.06468316825384708, "grad_norm": 2.3770556449890137, "learning_rate": 4.9848632707796773e-05, "loss": 2.1384, "step": 2542 }, { "epoch": 0.065471987378894, "grad_norm": 2.235302209854126, "learning_rate": 4.9839264870486155e-05, "loss": 2.1287, "step": 2573 }, { "epoch": 0.06626080650394091, "grad_norm": 2.3938326835632324, "learning_rate": 4.9829616725447526e-05, "loss": 2.1387, "step": 2604 }, { "epoch": 0.06704962562898784, "grad_norm": 2.177912712097168, "learning_rate": 4.981968838155888e-05, "loss": 2.1126, "step": 2635 }, { "epoch": 0.06783844475403475, "grad_norm": 2.2535035610198975, "learning_rate": 4.980947995086024e-05, "loss": 2.126, "step": 2666 }, { "epoch": 0.06862726387908166, "grad_norm": 2.14459228515625, "learning_rate": 4.979899154855234e-05, "loss": 2.1334, "step": 2697 }, { "epoch": 0.06941608300412858, "grad_norm": 2.156930446624756, "learning_rate": 4.9788223292995386e-05, "loss": 2.1121, "step": 2728 }, { "epoch": 0.07020490212917549, "grad_norm": 2.344521999359131, "learning_rate": 4.977717530570768e-05, "loss": 2.123, "step": 2759 }, { "epoch": 0.0709937212542224, "grad_norm": 2.1548428535461426, "learning_rate": 4.976584771136425e-05, "loss": 2.113, "step": 2790 }, { "epoch": 0.07178254037926933, "grad_norm": 2.2467427253723145, "learning_rate": 4.975424063779547e-05, "loss": 2.1048, "step": 2821 }, { "epoch": 0.07257135950431624, "grad_norm": 2.601376533508301, "learning_rate": 4.974235421598557e-05, "loss": 2.1144, "step": 2852 }, { "epoch": 0.07336017862936316, "grad_norm": 2.143150568008423, "learning_rate": 4.973018858007122e-05, "loss": 2.0793, "step": 2883 }, { "epoch": 0.07414899775441007, "grad_norm": 2.1503472328186035, "learning_rate": 4.9717743867339963e-05, "loss": 2.0996, "step": 2914 }, { "epoch": 0.07493781687945698, "grad_norm": 2.1850991249084473, "learning_rate": 4.9705020218228695e-05, "loss": 2.0912, "step": 2945 }, { "epoch": 0.0757266360045039, "grad_norm": 2.207058906555176, "learning_rate": 4.969201777632205e-05, "loss": 2.104, "step": 2976 }, { "epoch": 0.07651545512955082, "grad_norm": 2.1132214069366455, "learning_rate": 4.9678736688350846e-05, "loss": 2.0857, "step": 3007 }, { "epoch": 0.07730427425459774, "grad_norm": 4.404266834259033, "learning_rate": 4.966517710419033e-05, "loss": 2.0888, "step": 3038 }, { "epoch": 0.07809309337964465, "grad_norm": 2.0443875789642334, "learning_rate": 4.965133917685858e-05, "loss": 2.0688, "step": 3069 }, { "epoch": 0.07888191250469156, "grad_norm": 1.9510438442230225, "learning_rate": 4.9637223062514714e-05, "loss": 2.0891, "step": 3100 }, { "epoch": 0.07967073162973848, "grad_norm": 2.089725971221924, "learning_rate": 4.962282892045718e-05, "loss": 2.0949, "step": 3131 }, { "epoch": 0.08045955075478539, "grad_norm": 2.1167550086975098, "learning_rate": 4.9608156913121904e-05, "loss": 2.0911, "step": 3162 }, { "epoch": 0.08124836987983232, "grad_norm": 2.2896945476531982, "learning_rate": 4.959320720608049e-05, "loss": 2.0709, "step": 3193 }, { "epoch": 0.08203718900487923, "grad_norm": 2.0464134216308594, "learning_rate": 4.9577979968038354e-05, "loss": 2.0743, "step": 3224 }, { "epoch": 0.08282600812992615, "grad_norm": 2.0745983123779297, "learning_rate": 4.956247537083282e-05, "loss": 2.0644, "step": 3255 }, { "epoch": 0.08361482725497306, "grad_norm": 2.1493799686431885, "learning_rate": 4.9546693589431145e-05, "loss": 2.0651, "step": 3286 }, { "epoch": 0.08440364638001997, "grad_norm": 2.07473087310791, "learning_rate": 4.9530634801928595e-05, "loss": 2.0354, "step": 3317 }, { "epoch": 0.08519246550506689, "grad_norm": 1.962320327758789, "learning_rate": 4.9514299189546395e-05, "loss": 2.0664, "step": 3348 }, { "epoch": 0.0859812846301138, "grad_norm": 1.9907017946243286, "learning_rate": 4.949768693662973e-05, "loss": 2.0772, "step": 3379 }, { "epoch": 0.08677010375516073, "grad_norm": 2.2022948265075684, "learning_rate": 4.948079823064559e-05, "loss": 2.0629, "step": 3410 }, { "epoch": 0.08755892288020764, "grad_norm": 1.9094113111495972, "learning_rate": 4.946363326218074e-05, "loss": 2.0516, "step": 3441 }, { "epoch": 0.08834774200525455, "grad_norm": 2.16923189163208, "learning_rate": 4.9446192224939525e-05, "loss": 2.0712, "step": 3472 }, { "epoch": 0.08913656113030147, "grad_norm": 2.2181923389434814, "learning_rate": 4.942847531574167e-05, "loss": 2.0476, "step": 3503 }, { "epoch": 0.08992538025534838, "grad_norm": 2.0701868534088135, "learning_rate": 4.941048273452008e-05, "loss": 2.0662, "step": 3534 }, { "epoch": 0.0907141993803953, "grad_norm": 2.1713969707489014, "learning_rate": 4.9392214684318605e-05, "loss": 2.0316, "step": 3565 }, { "epoch": 0.09150301850544222, "grad_norm": 1.8713674545288086, "learning_rate": 4.93736713712897e-05, "loss": 2.0276, "step": 3596 }, { "epoch": 0.09229183763048913, "grad_norm": 1.957153081893921, "learning_rate": 4.9354853004692124e-05, "loss": 2.0423, "step": 3627 }, { "epoch": 0.09308065675553605, "grad_norm": 2.025723457336426, "learning_rate": 4.93357597968886e-05, "loss": 2.0361, "step": 3658 }, { "epoch": 0.09386947588058296, "grad_norm": 1.9253807067871094, "learning_rate": 4.931639196334338e-05, "loss": 2.0254, "step": 3689 }, { "epoch": 0.09465829500562987, "grad_norm": 2.1453866958618164, "learning_rate": 4.9296749722619826e-05, "loss": 2.0434, "step": 3720 }, { "epoch": 0.09544711413067679, "grad_norm": 2.116746187210083, "learning_rate": 4.9276833296377966e-05, "loss": 2.0535, "step": 3751 }, { "epoch": 0.09623593325572372, "grad_norm": 2.060053825378418, "learning_rate": 4.925664290937196e-05, "loss": 2.0162, "step": 3782 }, { "epoch": 0.09702475238077063, "grad_norm": 2.033005952835083, "learning_rate": 4.9236178789447576e-05, "loss": 2.0377, "step": 3813 }, { "epoch": 0.09781357150581754, "grad_norm": 1.930992841720581, "learning_rate": 4.921544116753962e-05, "loss": 2.0091, "step": 3844 }, { "epoch": 0.09860239063086446, "grad_norm": 1.987481951713562, "learning_rate": 4.919443027766935e-05, "loss": 2.0191, "step": 3875 }, { "epoch": 0.09939120975591137, "grad_norm": 2.1110246181488037, "learning_rate": 4.91731463569418e-05, "loss": 2.015, "step": 3906 }, { "epoch": 0.10018002888095828, "grad_norm": 2.009760618209839, "learning_rate": 4.915158964554312e-05, "loss": 2.0196, "step": 3937 }, { "epoch": 0.10096884800600521, "grad_norm": 1.9978469610214233, "learning_rate": 4.912976038673786e-05, "loss": 2.0095, "step": 3968 }, { "epoch": 0.10175766713105212, "grad_norm": 1.9885358810424805, "learning_rate": 4.9107658826866254e-05, "loss": 2.0148, "step": 3999 }, { "epoch": 0.10254648625609904, "grad_norm": 1.9853264093399048, "learning_rate": 4.908528521534139e-05, "loss": 2.014, "step": 4030 }, { "epoch": 0.10333530538114595, "grad_norm": 1.9636098146438599, "learning_rate": 4.906263980464644e-05, "loss": 2.0197, "step": 4061 }, { "epoch": 0.10412412450619286, "grad_norm": 1.9954477548599243, "learning_rate": 4.903972285033178e-05, "loss": 1.9974, "step": 4092 }, { "epoch": 0.10491294363123978, "grad_norm": 1.9644993543624878, "learning_rate": 4.901653461101213e-05, "loss": 2.0035, "step": 4123 }, { "epoch": 0.10570176275628669, "grad_norm": 1.9775594472885132, "learning_rate": 4.8993075348363626e-05, "loss": 2.0108, "step": 4154 }, { "epoch": 0.10649058188133362, "grad_norm": 1.9478306770324707, "learning_rate": 4.896934532712084e-05, "loss": 1.992, "step": 4185 }, { "epoch": 0.10727940100638053, "grad_norm": 2.2822272777557373, "learning_rate": 4.8945344815073846e-05, "loss": 2.0038, "step": 4216 }, { "epoch": 0.10806822013142744, "grad_norm": 1.9325686693191528, "learning_rate": 4.892107408306516e-05, "loss": 1.9914, "step": 4247 }, { "epoch": 0.10885703925647436, "grad_norm": 1.9367748498916626, "learning_rate": 4.889653340498669e-05, "loss": 2.0012, "step": 4278 }, { "epoch": 0.10964585838152127, "grad_norm": 1.9141064882278442, "learning_rate": 4.8871723057776664e-05, "loss": 2.0192, "step": 4309 }, { "epoch": 0.11043467750656819, "grad_norm": 2.1826133728027344, "learning_rate": 4.8846643321416476e-05, "loss": 2.0014, "step": 4340 }, { "epoch": 0.11122349663161511, "grad_norm": 1.9631683826446533, "learning_rate": 4.882129447892753e-05, "loss": 1.9848, "step": 4371 }, { "epoch": 0.11201231575666203, "grad_norm": 1.969335913658142, "learning_rate": 4.8795676816368076e-05, "loss": 1.9832, "step": 4402 }, { "epoch": 0.11280113488170894, "grad_norm": 1.8996473550796509, "learning_rate": 4.876979062282995e-05, "loss": 1.9866, "step": 4433 }, { "epoch": 0.11358995400675585, "grad_norm": 1.9162741899490356, "learning_rate": 4.8743636190435325e-05, "loss": 1.9985, "step": 4464 }, { "epoch": 0.11437877313180277, "grad_norm": 1.9941459894180298, "learning_rate": 4.871721381433344e-05, "loss": 1.9831, "step": 4495 }, { "epoch": 0.11516759225684968, "grad_norm": 1.7893972396850586, "learning_rate": 4.869052379269719e-05, "loss": 1.9965, "step": 4526 }, { "epoch": 0.1159564113818966, "grad_norm": 1.8150454759597778, "learning_rate": 4.866356642671985e-05, "loss": 1.9814, "step": 4557 }, { "epoch": 0.11674523050694352, "grad_norm": 1.9319466352462769, "learning_rate": 4.8636342020611634e-05, "loss": 1.9761, "step": 4588 }, { "epoch": 0.11753404963199043, "grad_norm": 1.7858814001083374, "learning_rate": 4.860885088159626e-05, "loss": 1.9912, "step": 4619 }, { "epoch": 0.11832286875703735, "grad_norm": 1.8554847240447998, "learning_rate": 4.858109331990751e-05, "loss": 1.9709, "step": 4650 }, { "epoch": 0.11911168788208426, "grad_norm": 1.8804956674575806, "learning_rate": 4.855306964878567e-05, "loss": 2.0065, "step": 4681 }, { "epoch": 0.11990050700713117, "grad_norm": 1.8334413766860962, "learning_rate": 4.8524780184474084e-05, "loss": 1.9883, "step": 4712 }, { "epoch": 0.1206893261321781, "grad_norm": 2.0563790798187256, "learning_rate": 4.8496225246215496e-05, "loss": 1.9805, "step": 4743 }, { "epoch": 0.12147814525722501, "grad_norm": 1.7843685150146484, "learning_rate": 4.8467405156248505e-05, "loss": 1.9732, "step": 4774 }, { "epoch": 0.12226696438227193, "grad_norm": 1.8490878343582153, "learning_rate": 4.843832023980392e-05, "loss": 1.9955, "step": 4805 }, { "epoch": 0.12305578350731884, "grad_norm": 1.6674724817276, "learning_rate": 4.840897082510106e-05, "loss": 1.9837, "step": 4836 }, { "epoch": 0.12384460263236576, "grad_norm": 1.7055600881576538, "learning_rate": 4.8379357243344084e-05, "loss": 1.9782, "step": 4867 }, { "epoch": 0.12463342175741267, "grad_norm": 1.7836189270019531, "learning_rate": 4.8349479828718236e-05, "loss": 1.9696, "step": 4898 }, { "epoch": 0.1254222408824596, "grad_norm": 1.8213683366775513, "learning_rate": 4.8319338918386075e-05, "loss": 1.9689, "step": 4929 }, { "epoch": 0.1262110600075065, "grad_norm": 1.7552939653396606, "learning_rate": 4.828893485248369e-05, "loss": 2.0059, "step": 4960 }, { "epoch": 0.12699987913255342, "grad_norm": 1.7021842002868652, "learning_rate": 4.825826797411682e-05, "loss": 1.966, "step": 4991 }, { "epoch": 0.12778869825760034, "grad_norm": 2.696760416030884, "learning_rate": 4.822733862935702e-05, "loss": 1.9909, "step": 5022 }, { "epoch": 0.12857751738264725, "grad_norm": 1.8321341276168823, "learning_rate": 4.819614716723775e-05, "loss": 1.967, "step": 5053 }, { "epoch": 0.12936633650769416, "grad_norm": 2.1158053874969482, "learning_rate": 4.8164693939750425e-05, "loss": 1.9692, "step": 5084 }, { "epoch": 0.13015515563274108, "grad_norm": 1.9149221181869507, "learning_rate": 4.813297930184042e-05, "loss": 1.9676, "step": 5115 }, { "epoch": 0.130943974757788, "grad_norm": 1.7447597980499268, "learning_rate": 4.810100361140314e-05, "loss": 1.9538, "step": 5146 }, { "epoch": 0.1317327938828349, "grad_norm": 1.7692095041275024, "learning_rate": 4.8068767229279885e-05, "loss": 1.9612, "step": 5177 }, { "epoch": 0.13252161300788182, "grad_norm": 1.8180122375488281, "learning_rate": 4.8036270519253854e-05, "loss": 1.9679, "step": 5208 }, { "epoch": 0.13331043213292876, "grad_norm": 1.7397531270980835, "learning_rate": 4.8003513848046e-05, "loss": 1.9483, "step": 5239 }, { "epoch": 0.13409925125797567, "grad_norm": 1.773038387298584, "learning_rate": 4.79704975853109e-05, "loss": 1.9706, "step": 5270 }, { "epoch": 0.13488807038302258, "grad_norm": 1.846821904182434, "learning_rate": 4.793722210363262e-05, "loss": 1.9557, "step": 5301 }, { "epoch": 0.1356768895080695, "grad_norm": 1.8578094244003296, "learning_rate": 4.7903687778520414e-05, "loss": 1.9613, "step": 5332 }, { "epoch": 0.1364657086331164, "grad_norm": 1.6778779029846191, "learning_rate": 4.7869894988404593e-05, "loss": 1.9481, "step": 5363 }, { "epoch": 0.13725452775816332, "grad_norm": 1.8821789026260376, "learning_rate": 4.783584411463221e-05, "loss": 1.9475, "step": 5394 }, { "epoch": 0.13804334688321024, "grad_norm": 1.7182306051254272, "learning_rate": 4.780153554146274e-05, "loss": 1.9541, "step": 5425 }, { "epoch": 0.13883216600825715, "grad_norm": 1.9063465595245361, "learning_rate": 4.7766969656063766e-05, "loss": 1.9475, "step": 5456 }, { "epoch": 0.13962098513330407, "grad_norm": 1.8275973796844482, "learning_rate": 4.773214684850662e-05, "loss": 1.9672, "step": 5487 }, { "epoch": 0.14040980425835098, "grad_norm": 1.798607349395752, "learning_rate": 4.769706751176193e-05, "loss": 1.9603, "step": 5518 }, { "epoch": 0.1411986233833979, "grad_norm": 1.9662116765975952, "learning_rate": 4.7661732041695264e-05, "loss": 1.959, "step": 5549 }, { "epoch": 0.1419874425084448, "grad_norm": 1.9336684942245483, "learning_rate": 4.762614083706258e-05, "loss": 1.9427, "step": 5580 }, { "epoch": 0.14277626163349175, "grad_norm": 1.68619704246521, "learning_rate": 4.759029429950581e-05, "loss": 1.9459, "step": 5611 }, { "epoch": 0.14356508075853866, "grad_norm": 1.9060170650482178, "learning_rate": 4.7554192833548235e-05, "loss": 1.9522, "step": 5642 }, { "epoch": 0.14435389988358557, "grad_norm": 1.7184319496154785, "learning_rate": 4.751783684659e-05, "loss": 1.9253, "step": 5673 }, { "epoch": 0.1451427190086325, "grad_norm": 1.7833524942398071, "learning_rate": 4.748122674890348e-05, "loss": 1.9484, "step": 5704 }, { "epoch": 0.1459315381336794, "grad_norm": 1.8405711650848389, "learning_rate": 4.7444362953628654e-05, "loss": 1.9576, "step": 5735 }, { "epoch": 0.1467203572587263, "grad_norm": 1.7746304273605347, "learning_rate": 4.7407245876768424e-05, "loss": 1.9549, "step": 5766 }, { "epoch": 0.14750917638377323, "grad_norm": 1.6882883310317993, "learning_rate": 4.736987593718397e-05, "loss": 1.9407, "step": 5797 }, { "epoch": 0.14829799550882014, "grad_norm": 1.6760326623916626, "learning_rate": 4.733225355658999e-05, "loss": 1.9358, "step": 5828 }, { "epoch": 0.14908681463386705, "grad_norm": 1.6274425983428955, "learning_rate": 4.7294379159549926e-05, "loss": 1.929, "step": 5859 }, { "epoch": 0.14987563375891397, "grad_norm": 1.8570448160171509, "learning_rate": 4.725625317347119e-05, "loss": 1.926, "step": 5890 }, { "epoch": 0.15066445288396088, "grad_norm": 1.754744052886963, "learning_rate": 4.7217876028600374e-05, "loss": 1.948, "step": 5921 }, { "epoch": 0.1514532720090078, "grad_norm": 1.7100951671600342, "learning_rate": 4.717924815801832e-05, "loss": 1.9527, "step": 5952 }, { "epoch": 0.1522420911340547, "grad_norm": 1.731489658355713, "learning_rate": 4.714036999763532e-05, "loss": 1.933, "step": 5983 }, { "epoch": 0.15303091025910165, "grad_norm": 1.7440745830535889, "learning_rate": 4.7101241986186116e-05, "loss": 1.9214, "step": 6014 }, { "epoch": 0.15381972938414856, "grad_norm": 1.7062227725982666, "learning_rate": 4.7061864565225e-05, "loss": 1.9381, "step": 6045 }, { "epoch": 0.15460854850919548, "grad_norm": 1.6533842086791992, "learning_rate": 4.702223817912081e-05, "loss": 1.9097, "step": 6076 }, { "epoch": 0.1553973676342424, "grad_norm": 1.8380335569381714, "learning_rate": 4.698236327505195e-05, "loss": 1.9357, "step": 6107 }, { "epoch": 0.1561861867592893, "grad_norm": 1.7769367694854736, "learning_rate": 4.694224030300127e-05, "loss": 1.9176, "step": 6138 }, { "epoch": 0.15697500588433622, "grad_norm": 1.697293758392334, "learning_rate": 4.690186971575107e-05, "loss": 1.9242, "step": 6169 }, { "epoch": 0.15776382500938313, "grad_norm": 1.6948884725570679, "learning_rate": 4.6861251968877916e-05, "loss": 1.9277, "step": 6200 }, { "epoch": 0.15855264413443004, "grad_norm": 1.8506945371627808, "learning_rate": 4.68203875207476e-05, "loss": 1.9413, "step": 6231 }, { "epoch": 0.15934146325947696, "grad_norm": 1.6183403730392456, "learning_rate": 4.677927683250983e-05, "loss": 1.929, "step": 6262 }, { "epoch": 0.16013028238452387, "grad_norm": 1.7411370277404785, "learning_rate": 4.6737920368093156e-05, "loss": 1.9272, "step": 6293 }, { "epoch": 0.16091910150957078, "grad_norm": 1.6963975429534912, "learning_rate": 4.669631859419965e-05, "loss": 1.9183, "step": 6324 }, { "epoch": 0.1617079206346177, "grad_norm": 1.664981484413147, "learning_rate": 4.6654471980299676e-05, "loss": 1.9358, "step": 6355 }, { "epoch": 0.16249673975966464, "grad_norm": 1.8351246118545532, "learning_rate": 4.661238099862658e-05, "loss": 1.9126, "step": 6386 }, { "epoch": 0.16328555888471155, "grad_norm": 1.8973945379257202, "learning_rate": 4.657004612417138e-05, "loss": 1.9171, "step": 6417 }, { "epoch": 0.16407437800975846, "grad_norm": 1.7961071729660034, "learning_rate": 4.6527467834677374e-05, "loss": 1.9095, "step": 6448 }, { "epoch": 0.16486319713480538, "grad_norm": 1.73550283908844, "learning_rate": 4.648464661063478e-05, "loss": 1.924, "step": 6479 }, { "epoch": 0.1656520162598523, "grad_norm": 1.77008056640625, "learning_rate": 4.6441582935275264e-05, "loss": 1.9338, "step": 6510 }, { "epoch": 0.1664408353848992, "grad_norm": 1.7121846675872803, "learning_rate": 4.6398277294566586e-05, "loss": 1.9094, "step": 6541 }, { "epoch": 0.16722965450994612, "grad_norm": 1.6560605764389038, "learning_rate": 4.6354730177207e-05, "loss": 1.9195, "step": 6572 }, { "epoch": 0.16801847363499303, "grad_norm": 1.7044239044189453, "learning_rate": 4.6310942074619787e-05, "loss": 1.9197, "step": 6603 }, { "epoch": 0.16880729276003995, "grad_norm": 1.6876695156097412, "learning_rate": 4.626691348094777e-05, "loss": 1.9078, "step": 6634 }, { "epoch": 0.16959611188508686, "grad_norm": 1.7991163730621338, "learning_rate": 4.622264489304762e-05, "loss": 1.9147, "step": 6665 }, { "epoch": 0.17038493101013377, "grad_norm": 1.662376046180725, "learning_rate": 4.617813681048434e-05, "loss": 1.9263, "step": 6696 }, { "epoch": 0.17117375013518069, "grad_norm": 1.6112306118011475, "learning_rate": 4.61333897355256e-05, "loss": 1.9102, "step": 6727 }, { "epoch": 0.1719625692602276, "grad_norm": 1.757332444190979, "learning_rate": 4.608840417313604e-05, "loss": 1.921, "step": 6758 }, { "epoch": 0.17275138838527454, "grad_norm": 1.6604098081588745, "learning_rate": 4.6043180630971646e-05, "loss": 1.905, "step": 6789 }, { "epoch": 0.17354020751032145, "grad_norm": 1.6792216300964355, "learning_rate": 4.599771961937391e-05, "loss": 1.906, "step": 6820 }, { "epoch": 0.17432902663536837, "grad_norm": 1.635213851928711, "learning_rate": 4.5952021651364204e-05, "loss": 1.9038, "step": 6851 }, { "epoch": 0.17511784576041528, "grad_norm": 1.6034295558929443, "learning_rate": 4.590608724263786e-05, "loss": 1.9214, "step": 6882 }, { "epoch": 0.1759066648854622, "grad_norm": 1.730025053024292, "learning_rate": 4.585991691155845e-05, "loss": 1.9023, "step": 6913 }, { "epoch": 0.1766954840105091, "grad_norm": 1.6403334140777588, "learning_rate": 4.581351117915188e-05, "loss": 1.8944, "step": 6944 }, { "epoch": 0.17748430313555602, "grad_norm": 1.7030408382415771, "learning_rate": 4.5766870569100534e-05, "loss": 1.9082, "step": 6975 }, { "epoch": 0.17827312226060293, "grad_norm": 1.9924838542938232, "learning_rate": 4.571999560773736e-05, "loss": 1.9033, "step": 7006 }, { "epoch": 0.17906194138564985, "grad_norm": 1.688637137413025, "learning_rate": 4.5672886824039915e-05, "loss": 1.9087, "step": 7037 }, { "epoch": 0.17985076051069676, "grad_norm": 1.7744380235671997, "learning_rate": 4.5625544749624435e-05, "loss": 1.8911, "step": 7068 }, { "epoch": 0.18063957963574367, "grad_norm": 1.6221119165420532, "learning_rate": 4.5577969918739794e-05, "loss": 1.9018, "step": 7099 }, { "epoch": 0.1814283987607906, "grad_norm": 1.6945770978927612, "learning_rate": 4.5530162868261486e-05, "loss": 1.9042, "step": 7130 }, { "epoch": 0.18221721788583753, "grad_norm": 1.6812230348587036, "learning_rate": 4.548212413768558e-05, "loss": 1.8992, "step": 7161 }, { "epoch": 0.18300603701088444, "grad_norm": 1.5889744758605957, "learning_rate": 4.543385426912261e-05, "loss": 1.9132, "step": 7192 }, { "epoch": 0.18379485613593136, "grad_norm": 1.9001593589782715, "learning_rate": 4.53853538072915e-05, "loss": 1.9053, "step": 7223 }, { "epoch": 0.18458367526097827, "grad_norm": 1.647740125656128, "learning_rate": 4.533662329951336e-05, "loss": 1.9191, "step": 7254 }, { "epoch": 0.18537249438602518, "grad_norm": 1.6723840236663818, "learning_rate": 4.528766329570536e-05, "loss": 1.8782, "step": 7285 }, { "epoch": 0.1861613135110721, "grad_norm": 1.668317437171936, "learning_rate": 4.523847434837447e-05, "loss": 1.9026, "step": 7316 }, { "epoch": 0.186950132636119, "grad_norm": 1.640189528465271, "learning_rate": 4.518905701261128e-05, "loss": 1.9087, "step": 7347 }, { "epoch": 0.18773895176116592, "grad_norm": 1.5775387287139893, "learning_rate": 4.5139411846083715e-05, "loss": 1.9129, "step": 7378 }, { "epoch": 0.18852777088621284, "grad_norm": 1.5601975917816162, "learning_rate": 4.508953940903073e-05, "loss": 1.9088, "step": 7409 }, { "epoch": 0.18931659001125975, "grad_norm": 1.648223876953125, "learning_rate": 4.5039440264255994e-05, "loss": 1.8977, "step": 7440 }, { "epoch": 0.19010540913630666, "grad_norm": 1.6973642110824585, "learning_rate": 4.498911497712155e-05, "loss": 1.8849, "step": 7471 }, { "epoch": 0.19089422826135358, "grad_norm": 1.6180808544158936, "learning_rate": 4.493856411554142e-05, "loss": 1.8859, "step": 7502 }, { "epoch": 0.1916830473864005, "grad_norm": 1.6768759489059448, "learning_rate": 4.4887788249975206e-05, "loss": 1.9005, "step": 7533 }, { "epoch": 0.19247186651144743, "grad_norm": 1.6389315128326416, "learning_rate": 4.4836787953421656e-05, "loss": 1.9025, "step": 7564 }, { "epoch": 0.19326068563649434, "grad_norm": 1.6695667505264282, "learning_rate": 4.478556380141218e-05, "loss": 1.9015, "step": 7595 }, { "epoch": 0.19404950476154126, "grad_norm": 1.634464979171753, "learning_rate": 4.4734116372004375e-05, "loss": 1.8848, "step": 7626 }, { "epoch": 0.19483832388658817, "grad_norm": 1.6077677011489868, "learning_rate": 4.4682446245775477e-05, "loss": 1.8944, "step": 7657 }, { "epoch": 0.19562714301163509, "grad_norm": 1.7401186227798462, "learning_rate": 4.463055400581586e-05, "loss": 1.8841, "step": 7688 }, { "epoch": 0.196415962136682, "grad_norm": 1.588080644607544, "learning_rate": 4.4578440237722374e-05, "loss": 1.8988, "step": 7719 }, { "epoch": 0.1972047812617289, "grad_norm": 1.5799649953842163, "learning_rate": 4.452610552959183e-05, "loss": 1.8828, "step": 7750 }, { "epoch": 0.19799360038677583, "grad_norm": 1.6081531047821045, "learning_rate": 4.447355047201428e-05, "loss": 1.8967, "step": 7781 }, { "epoch": 0.19878241951182274, "grad_norm": 1.6133239269256592, "learning_rate": 4.4420775658066414e-05, "loss": 1.8898, "step": 7812 }, { "epoch": 0.19957123863686965, "grad_norm": 1.612506628036499, "learning_rate": 4.436778168330484e-05, "loss": 1.8962, "step": 7843 }, { "epoch": 0.20036005776191657, "grad_norm": 1.6066638231277466, "learning_rate": 4.4314569145759353e-05, "loss": 1.8885, "step": 7874 }, { "epoch": 0.20114887688696348, "grad_norm": 1.6667569875717163, "learning_rate": 4.42611386459262e-05, "loss": 1.8836, "step": 7905 }, { "epoch": 0.20193769601201042, "grad_norm": 1.7658061981201172, "learning_rate": 4.420749078676133e-05, "loss": 1.8627, "step": 7936 }, { "epoch": 0.20272651513705733, "grad_norm": 1.5527122020721436, "learning_rate": 4.4153626173673516e-05, "loss": 1.8682, "step": 7967 }, { "epoch": 0.20351533426210425, "grad_norm": 1.6022471189498901, "learning_rate": 4.409954541451762e-05, "loss": 1.8892, "step": 7998 }, { "epoch": 0.20430415338715116, "grad_norm": 1.6246200799942017, "learning_rate": 4.404524911958764e-05, "loss": 1.8703, "step": 8029 }, { "epoch": 0.20509297251219807, "grad_norm": 1.5030767917633057, "learning_rate": 4.399073790160989e-05, "loss": 1.8897, "step": 8060 }, { "epoch": 0.205881791637245, "grad_norm": 1.603633165359497, "learning_rate": 4.393601237573607e-05, "loss": 1.8655, "step": 8091 }, { "epoch": 0.2066706107622919, "grad_norm": 1.5709718465805054, "learning_rate": 4.388107315953628e-05, "loss": 1.8715, "step": 8122 }, { "epoch": 0.20745942988733881, "grad_norm": 1.529145359992981, "learning_rate": 4.382592087299212e-05, "loss": 1.8628, "step": 8153 }, { "epoch": 0.20824824901238573, "grad_norm": 1.6858514547348022, "learning_rate": 4.377055613848964e-05, "loss": 1.8871, "step": 8184 }, { "epoch": 0.20903706813743264, "grad_norm": 1.7125133275985718, "learning_rate": 4.3714979580812355e-05, "loss": 1.8768, "step": 8215 }, { "epoch": 0.20982588726247955, "grad_norm": 1.5397855043411255, "learning_rate": 4.365919182713416e-05, "loss": 1.8807, "step": 8246 }, { "epoch": 0.21061470638752647, "grad_norm": 1.6475356817245483, "learning_rate": 4.360319350701226e-05, "loss": 1.885, "step": 8277 }, { "epoch": 0.21140352551257338, "grad_norm": 1.5485234260559082, "learning_rate": 4.3546985252380115e-05, "loss": 1.8777, "step": 8308 }, { "epoch": 0.21219234463762032, "grad_norm": 1.5416793823242188, "learning_rate": 4.349056769754021e-05, "loss": 1.8577, "step": 8339 }, { "epoch": 0.21298116376266724, "grad_norm": 1.6852402687072754, "learning_rate": 4.3433941479156994e-05, "loss": 1.8807, "step": 8370 }, { "epoch": 0.21376998288771415, "grad_norm": 1.5214046239852905, "learning_rate": 4.3377107236249647e-05, "loss": 1.8631, "step": 8401 }, { "epoch": 0.21455880201276106, "grad_norm": 1.6051750183105469, "learning_rate": 4.332006561018488e-05, "loss": 1.8684, "step": 8432 }, { "epoch": 0.21534762113780798, "grad_norm": 1.5419507026672363, "learning_rate": 4.3262817244669683e-05, "loss": 1.9002, "step": 8463 }, { "epoch": 0.2161364402628549, "grad_norm": 1.6802319288253784, "learning_rate": 4.3205362785744083e-05, "loss": 1.8577, "step": 8494 }, { "epoch": 0.2169252593879018, "grad_norm": 1.5763416290283203, "learning_rate": 4.314770288177384e-05, "loss": 1.865, "step": 8525 }, { "epoch": 0.21771407851294872, "grad_norm": 1.5372315645217896, "learning_rate": 4.308983818344313e-05, "loss": 1.8634, "step": 8556 }, { "epoch": 0.21850289763799563, "grad_norm": 1.6838102340698242, "learning_rate": 4.3031769343747206e-05, "loss": 1.8685, "step": 8587 }, { "epoch": 0.21929171676304254, "grad_norm": 1.6022460460662842, "learning_rate": 4.297349701798505e-05, "loss": 1.8593, "step": 8618 }, { "epoch": 0.22008053588808946, "grad_norm": 1.6018834114074707, "learning_rate": 4.2915021863751916e-05, "loss": 1.8852, "step": 8649 }, { "epoch": 0.22086935501313637, "grad_norm": 1.5209150314331055, "learning_rate": 4.285634454093198e-05, "loss": 1.8522, "step": 8680 }, { "epoch": 0.2216581741381833, "grad_norm": 1.6187845468521118, "learning_rate": 4.279746571169086e-05, "loss": 1.8497, "step": 8711 }, { "epoch": 0.22244699326323022, "grad_norm": 1.6452182531356812, "learning_rate": 4.2738386040468136e-05, "loss": 1.8551, "step": 8742 }, { "epoch": 0.22323581238827714, "grad_norm": 1.613244652748108, "learning_rate": 4.2679106193969866e-05, "loss": 1.8629, "step": 8773 }, { "epoch": 0.22402463151332405, "grad_norm": 1.5586017370224, "learning_rate": 4.261962684116106e-05, "loss": 1.8674, "step": 8804 }, { "epoch": 0.22481345063837097, "grad_norm": 1.5552334785461426, "learning_rate": 4.2559948653258145e-05, "loss": 1.8923, "step": 8835 }, { "epoch": 0.22560226976341788, "grad_norm": 1.609717607498169, "learning_rate": 4.250007230372134e-05, "loss": 1.8854, "step": 8866 }, { "epoch": 0.2263910888884648, "grad_norm": 1.617981195449829, "learning_rate": 4.2439998468247126e-05, "loss": 1.8794, "step": 8897 }, { "epoch": 0.2271799080135117, "grad_norm": 1.5466769933700562, "learning_rate": 4.2379727824760566e-05, "loss": 1.8594, "step": 8928 }, { "epoch": 0.22796872713855862, "grad_norm": 1.5078591108322144, "learning_rate": 4.231926105340768e-05, "loss": 1.8543, "step": 8959 }, { "epoch": 0.22875754626360553, "grad_norm": 1.5065971612930298, "learning_rate": 4.225859883654776e-05, "loss": 1.8506, "step": 8990 }, { "epoch": 0.22954636538865245, "grad_norm": 1.583526849746704, "learning_rate": 4.219774185874569e-05, "loss": 1.8741, "step": 9021 }, { "epoch": 0.23033518451369936, "grad_norm": 1.5880491733551025, "learning_rate": 4.213669080676418e-05, "loss": 1.8531, "step": 9052 }, { "epoch": 0.2311240036387463, "grad_norm": 1.6649373769760132, "learning_rate": 4.2075446369556056e-05, "loss": 1.8524, "step": 9083 }, { "epoch": 0.2319128227637932, "grad_norm": 1.556809425354004, "learning_rate": 4.201400923825648e-05, "loss": 1.8581, "step": 9114 }, { "epoch": 0.23270164188884013, "grad_norm": 1.5572203397750854, "learning_rate": 4.195238010617511e-05, "loss": 1.8546, "step": 9145 }, { "epoch": 0.23349046101388704, "grad_norm": 1.5201469659805298, "learning_rate": 4.1890559668788344e-05, "loss": 1.8644, "step": 9176 }, { "epoch": 0.23427928013893395, "grad_norm": 1.5816295146942139, "learning_rate": 4.1828548623731405e-05, "loss": 1.8556, "step": 9207 }, { "epoch": 0.23506809926398087, "grad_norm": 1.5696643590927124, "learning_rate": 4.1766347670790506e-05, "loss": 1.8491, "step": 9238 }, { "epoch": 0.23585691838902778, "grad_norm": 1.5091686248779297, "learning_rate": 4.170395751189495e-05, "loss": 1.8493, "step": 9269 }, { "epoch": 0.2366457375140747, "grad_norm": 1.5627410411834717, "learning_rate": 4.164137885110921e-05, "loss": 1.8502, "step": 9300 }, { "epoch": 0.2374345566391216, "grad_norm": 1.5759685039520264, "learning_rate": 4.157861239462495e-05, "loss": 1.8427, "step": 9331 }, { "epoch": 0.23822337576416852, "grad_norm": 1.533496379852295, "learning_rate": 4.1515658850753114e-05, "loss": 1.8533, "step": 9362 }, { "epoch": 0.23901219488921543, "grad_norm": 1.5628371238708496, "learning_rate": 4.145251892991588e-05, "loss": 1.8439, "step": 9393 }, { "epoch": 0.23980101401426235, "grad_norm": 1.568591594696045, "learning_rate": 4.138919334463868e-05, "loss": 1.8533, "step": 9424 }, { "epoch": 0.24058983313930926, "grad_norm": 1.5815969705581665, "learning_rate": 4.1325682809542124e-05, "loss": 1.8511, "step": 9455 }, { "epoch": 0.2413786522643562, "grad_norm": 1.5089821815490723, "learning_rate": 4.126198804133398e-05, "loss": 1.8609, "step": 9486 }, { "epoch": 0.24216747138940312, "grad_norm": 1.5676932334899902, "learning_rate": 4.1198109758801055e-05, "loss": 1.8565, "step": 9517 }, { "epoch": 0.24295629051445003, "grad_norm": 1.5041563510894775, "learning_rate": 4.113404868280107e-05, "loss": 1.8536, "step": 9548 }, { "epoch": 0.24374510963949694, "grad_norm": 1.5710649490356445, "learning_rate": 4.106980553625457e-05, "loss": 1.8547, "step": 9579 }, { "epoch": 0.24453392876454386, "grad_norm": 1.5798616409301758, "learning_rate": 4.100538104413674e-05, "loss": 1.8572, "step": 9610 }, { "epoch": 0.24532274788959077, "grad_norm": 2.4222729206085205, "learning_rate": 4.09407759334692e-05, "loss": 1.8645, "step": 9641 }, { "epoch": 0.24611156701463768, "grad_norm": 1.4646201133728027, "learning_rate": 4.087599093331186e-05, "loss": 1.8496, "step": 9672 }, { "epoch": 0.2469003861396846, "grad_norm": 1.606465458869934, "learning_rate": 4.081102677475462e-05, "loss": 1.8459, "step": 9703 }, { "epoch": 0.2476892052647315, "grad_norm": 1.5241799354553223, "learning_rate": 4.0745884190909194e-05, "loss": 1.8511, "step": 9734 }, { "epoch": 0.24847802438977842, "grad_norm": 1.6257604360580444, "learning_rate": 4.0680563916900796e-05, "loss": 1.8416, "step": 9765 }, { "epoch": 0.24926684351482534, "grad_norm": 1.5233662128448486, "learning_rate": 4.0615066689859815e-05, "loss": 1.8465, "step": 9796 }, { "epoch": 0.2500556626398723, "grad_norm": 1.4877644777297974, "learning_rate": 4.0549393248913584e-05, "loss": 1.8467, "step": 9827 }, { "epoch": 0.2508444817649192, "grad_norm": 1.530216932296753, "learning_rate": 4.048354433517794e-05, "loss": 1.8304, "step": 9858 }, { "epoch": 0.2516333008899661, "grad_norm": 1.4812994003295898, "learning_rate": 4.0417520691748916e-05, "loss": 1.8324, "step": 9889 }, { "epoch": 0.252422120015013, "grad_norm": 1.5022152662277222, "learning_rate": 4.035132306369438e-05, "loss": 1.8521, "step": 9920 }, { "epoch": 0.25321093914005993, "grad_norm": 1.5792648792266846, "learning_rate": 4.028495219804555e-05, "loss": 1.8275, "step": 9951 }, { "epoch": 0.25399975826510685, "grad_norm": 1.4738425016403198, "learning_rate": 4.021840884378864e-05, "loss": 1.8484, "step": 9982 }, { "epoch": 0.25478857739015376, "grad_norm": 1.4558053016662598, "learning_rate": 4.015169375185633e-05, "loss": 1.842, "step": 10013 }, { "epoch": 0.25557739651520067, "grad_norm": 1.6721614599227905, "learning_rate": 4.0084807675119396e-05, "loss": 1.8582, "step": 10044 }, { "epoch": 0.2563662156402476, "grad_norm": 1.643314003944397, "learning_rate": 4.0017751368378106e-05, "loss": 1.8467, "step": 10075 }, { "epoch": 0.2571550347652945, "grad_norm": 1.600417137145996, "learning_rate": 3.995052558835377e-05, "loss": 1.8466, "step": 10106 }, { "epoch": 0.2579438538903414, "grad_norm": 1.5018486976623535, "learning_rate": 3.988313109368017e-05, "loss": 1.8299, "step": 10137 }, { "epoch": 0.2587326730153883, "grad_norm": 1.4656468629837036, "learning_rate": 3.981556864489504e-05, "loss": 1.8254, "step": 10168 }, { "epoch": 0.25952149214043524, "grad_norm": 1.573634386062622, "learning_rate": 3.974783900443142e-05, "loss": 1.8387, "step": 10199 }, { "epoch": 0.26031031126548215, "grad_norm": 1.4818580150604248, "learning_rate": 3.9679942936609095e-05, "loss": 1.8364, "step": 10230 }, { "epoch": 0.26109913039052907, "grad_norm": 1.560797095298767, "learning_rate": 3.961188120762596e-05, "loss": 1.8504, "step": 10261 }, { "epoch": 0.261887949515576, "grad_norm": 1.535651683807373, "learning_rate": 3.954365458554938e-05, "loss": 1.8408, "step": 10292 }, { "epoch": 0.2626767686406229, "grad_norm": 1.5009502172470093, "learning_rate": 3.947526384030751e-05, "loss": 1.8612, "step": 10323 }, { "epoch": 0.2634655877656698, "grad_norm": 1.5589120388031006, "learning_rate": 3.9406709743680624e-05, "loss": 1.843, "step": 10354 }, { "epoch": 0.2642544068907167, "grad_norm": 1.6979308128356934, "learning_rate": 3.9337993069292366e-05, "loss": 1.8445, "step": 10385 }, { "epoch": 0.26504322601576363, "grad_norm": 1.4456043243408203, "learning_rate": 3.926911459260109e-05, "loss": 1.8455, "step": 10416 }, { "epoch": 0.2658320451408106, "grad_norm": 1.5096760988235474, "learning_rate": 3.920007509089102e-05, "loss": 1.8243, "step": 10447 }, { "epoch": 0.2666208642658575, "grad_norm": 1.534406065940857, "learning_rate": 3.913087534326357e-05, "loss": 1.828, "step": 10478 }, { "epoch": 0.26740968339090443, "grad_norm": 1.7499176263809204, "learning_rate": 3.9061516130628475e-05, "loss": 1.8372, "step": 10509 }, { "epoch": 0.26819850251595134, "grad_norm": 1.5080372095108032, "learning_rate": 3.8991998235695025e-05, "loss": 1.8237, "step": 10540 }, { "epoch": 0.26898732164099826, "grad_norm": 1.495322346687317, "learning_rate": 3.8922322442963224e-05, "loss": 1.856, "step": 10571 }, { "epoch": 0.26977614076604517, "grad_norm": 1.548775315284729, "learning_rate": 3.885248953871491e-05, "loss": 1.8305, "step": 10602 }, { "epoch": 0.2705649598910921, "grad_norm": 1.655852198600769, "learning_rate": 3.8782500311004915e-05, "loss": 1.826, "step": 10633 }, { "epoch": 0.271353779016139, "grad_norm": 1.4688091278076172, "learning_rate": 3.871235554965218e-05, "loss": 1.8277, "step": 10664 }, { "epoch": 0.2721425981411859, "grad_norm": 1.4753823280334473, "learning_rate": 3.864205604623078e-05, "loss": 1.8343, "step": 10695 }, { "epoch": 0.2729314172662328, "grad_norm": 1.5295604467391968, "learning_rate": 3.857160259406107e-05, "loss": 1.8212, "step": 10726 }, { "epoch": 0.27372023639127974, "grad_norm": 1.5936615467071533, "learning_rate": 3.8500995988200674e-05, "loss": 1.8369, "step": 10757 }, { "epoch": 0.27450905551632665, "grad_norm": 1.61776864528656, "learning_rate": 3.843023702543556e-05, "loss": 1.8075, "step": 10788 }, { "epoch": 0.27529787464137356, "grad_norm": 1.5151159763336182, "learning_rate": 3.8359326504270984e-05, "loss": 1.824, "step": 10819 }, { "epoch": 0.2760866937664205, "grad_norm": 1.6150144338607788, "learning_rate": 3.828826522492255e-05, "loss": 1.8392, "step": 10850 }, { "epoch": 0.2768755128914674, "grad_norm": 1.4708991050720215, "learning_rate": 3.821705398930713e-05, "loss": 1.8428, "step": 10881 }, { "epoch": 0.2776643320165143, "grad_norm": 1.4772177934646606, "learning_rate": 3.814569360103385e-05, "loss": 1.816, "step": 10912 }, { "epoch": 0.2784531511415612, "grad_norm": 1.5404503345489502, "learning_rate": 3.807418486539499e-05, "loss": 1.8148, "step": 10943 }, { "epoch": 0.27924197026660813, "grad_norm": 1.5612235069274902, "learning_rate": 3.80025285893569e-05, "loss": 1.8345, "step": 10974 }, { "epoch": 0.28003078939165504, "grad_norm": 1.5301347970962524, "learning_rate": 3.793072558155093e-05, "loss": 1.8248, "step": 11005 }, { "epoch": 0.28081960851670196, "grad_norm": 1.4737629890441895, "learning_rate": 3.785877665226426e-05, "loss": 1.8198, "step": 11036 }, { "epoch": 0.28160842764174887, "grad_norm": 1.4503047466278076, "learning_rate": 3.778668261343079e-05, "loss": 1.8371, "step": 11067 }, { "epoch": 0.2823972467667958, "grad_norm": 1.5823520421981812, "learning_rate": 3.771444427862192e-05, "loss": 1.8325, "step": 11098 }, { "epoch": 0.2831860658918427, "grad_norm": 1.6144663095474243, "learning_rate": 3.7642062463037465e-05, "loss": 1.8209, "step": 11129 }, { "epoch": 0.2839748850168896, "grad_norm": 1.6444216966629028, "learning_rate": 3.7569537983496373e-05, "loss": 1.825, "step": 11160 }, { "epoch": 0.2847637041419365, "grad_norm": 1.503098487854004, "learning_rate": 3.749687165842753e-05, "loss": 1.8392, "step": 11191 }, { "epoch": 0.2855525232669835, "grad_norm": 1.83393394947052, "learning_rate": 3.7424064307860536e-05, "loss": 1.8252, "step": 11222 }, { "epoch": 0.2863413423920304, "grad_norm": 1.4748836755752563, "learning_rate": 3.735111675341645e-05, "loss": 1.806, "step": 11253 }, { "epoch": 0.2871301615170773, "grad_norm": 1.912840485572815, "learning_rate": 3.7278029818298524e-05, "loss": 1.8095, "step": 11284 }, { "epoch": 0.28791898064212423, "grad_norm": 1.5209791660308838, "learning_rate": 3.720480432728287e-05, "loss": 1.8162, "step": 11315 }, { "epoch": 0.28870779976717115, "grad_norm": 1.5430744886398315, "learning_rate": 3.71314411067092e-05, "loss": 1.833, "step": 11346 }, { "epoch": 0.28949661889221806, "grad_norm": 1.592753529548645, "learning_rate": 3.70579409844715e-05, "loss": 1.8402, "step": 11377 }, { "epoch": 0.290285438017265, "grad_norm": 1.5640573501586914, "learning_rate": 3.698430479000865e-05, "loss": 1.8275, "step": 11408 }, { "epoch": 0.2910742571423119, "grad_norm": 1.4649739265441895, "learning_rate": 3.691053335429509e-05, "loss": 1.8092, "step": 11439 }, { "epoch": 0.2918630762673588, "grad_norm": 1.5399479866027832, "learning_rate": 3.683662750983147e-05, "loss": 1.8131, "step": 11470 }, { "epoch": 0.2926518953924057, "grad_norm": 1.5420011281967163, "learning_rate": 3.676258809063518e-05, "loss": 1.8287, "step": 11501 }, { "epoch": 0.2934407145174526, "grad_norm": 1.4778993129730225, "learning_rate": 3.6688415932231004e-05, "loss": 1.8202, "step": 11532 }, { "epoch": 0.29422953364249954, "grad_norm": 1.5598480701446533, "learning_rate": 3.661411187164166e-05, "loss": 1.8425, "step": 11563 }, { "epoch": 0.29501835276754645, "grad_norm": 1.6161003112792969, "learning_rate": 3.65396767473784e-05, "loss": 1.8402, "step": 11594 }, { "epoch": 0.29580717189259337, "grad_norm": 1.4399648904800415, "learning_rate": 3.6465111399431465e-05, "loss": 1.8083, "step": 11625 }, { "epoch": 0.2965959910176403, "grad_norm": 1.5157605409622192, "learning_rate": 3.6390416669260674e-05, "loss": 1.8041, "step": 11656 }, { "epoch": 0.2973848101426872, "grad_norm": 1.5234498977661133, "learning_rate": 3.63155933997859e-05, "loss": 1.8104, "step": 11687 }, { "epoch": 0.2981736292677341, "grad_norm": 1.4988269805908203, "learning_rate": 3.624064243537758e-05, "loss": 1.8192, "step": 11718 }, { "epoch": 0.298962448392781, "grad_norm": 1.5283297300338745, "learning_rate": 3.616556462184716e-05, "loss": 1.8123, "step": 11749 }, { "epoch": 0.29975126751782794, "grad_norm": 1.523959994316101, "learning_rate": 3.609036080643755e-05, "loss": 1.8391, "step": 11780 }, { "epoch": 0.30054008664287485, "grad_norm": 1.4363136291503906, "learning_rate": 3.60150318378136e-05, "loss": 1.8176, "step": 11811 }, { "epoch": 0.30132890576792176, "grad_norm": 1.521101474761963, "learning_rate": 3.5939578566052465e-05, "loss": 1.8164, "step": 11842 }, { "epoch": 0.3021177248929687, "grad_norm": 1.4613672494888306, "learning_rate": 3.586400184263408e-05, "loss": 1.8204, "step": 11873 }, { "epoch": 0.3029065440180156, "grad_norm": 1.4992350339889526, "learning_rate": 3.578830252043148e-05, "loss": 1.8122, "step": 11904 }, { "epoch": 0.3036953631430625, "grad_norm": 1.659454584121704, "learning_rate": 3.571248145370125e-05, "loss": 1.8243, "step": 11935 }, { "epoch": 0.3044841822681094, "grad_norm": 1.429630160331726, "learning_rate": 3.5636539498073794e-05, "loss": 1.8079, "step": 11966 }, { "epoch": 0.3052730013931564, "grad_norm": 1.4403787851333618, "learning_rate": 3.556047751054378e-05, "loss": 1.8079, "step": 11997 }, { "epoch": 0.3060618205182033, "grad_norm": 1.4795056581497192, "learning_rate": 3.548429634946039e-05, "loss": 1.7949, "step": 12028 }, { "epoch": 0.3068506396432502, "grad_norm": 1.5466026067733765, "learning_rate": 3.540799687451768e-05, "loss": 1.8039, "step": 12059 }, { "epoch": 0.3076394587682971, "grad_norm": 1.5872678756713867, "learning_rate": 3.533157994674485e-05, "loss": 1.8174, "step": 12090 }, { "epoch": 0.30842827789334404, "grad_norm": 1.4539406299591064, "learning_rate": 3.5255046428496546e-05, "loss": 1.8212, "step": 12121 }, { "epoch": 0.30921709701839095, "grad_norm": 1.443831443786621, "learning_rate": 3.517839718344311e-05, "loss": 1.8387, "step": 12152 }, { "epoch": 0.31000591614343787, "grad_norm": 1.4756397008895874, "learning_rate": 3.510163307656086e-05, "loss": 1.827, "step": 12183 }, { "epoch": 0.3107947352684848, "grad_norm": 1.5675030946731567, "learning_rate": 3.5024754974122324e-05, "loss": 1.8198, "step": 12214 }, { "epoch": 0.3115835543935317, "grad_norm": 1.4247853755950928, "learning_rate": 3.494776374368643e-05, "loss": 1.8072, "step": 12245 }, { "epoch": 0.3123723735185786, "grad_norm": 1.567158579826355, "learning_rate": 3.4870660254088724e-05, "loss": 1.8158, "step": 12276 }, { "epoch": 0.3131611926436255, "grad_norm": 1.4549590349197388, "learning_rate": 3.479344537543164e-05, "loss": 1.8044, "step": 12307 }, { "epoch": 0.31395001176867243, "grad_norm": 1.4478166103363037, "learning_rate": 3.4716119979074565e-05, "loss": 1.8099, "step": 12338 }, { "epoch": 0.31473883089371935, "grad_norm": 1.4160298109054565, "learning_rate": 3.463868493762412e-05, "loss": 1.8179, "step": 12369 }, { "epoch": 0.31552765001876626, "grad_norm": 1.5618009567260742, "learning_rate": 3.456114112492418e-05, "loss": 1.801, "step": 12400 }, { "epoch": 0.3163164691438132, "grad_norm": 1.3777692317962646, "learning_rate": 3.4483489416046164e-05, "loss": 1.8117, "step": 12431 }, { "epoch": 0.3171052882688601, "grad_norm": 1.5304317474365234, "learning_rate": 3.440573068727905e-05, "loss": 1.8085, "step": 12462 }, { "epoch": 0.317894107393907, "grad_norm": 1.5578155517578125, "learning_rate": 3.4327865816119495e-05, "loss": 1.8106, "step": 12493 }, { "epoch": 0.3186829265189539, "grad_norm": 1.51682710647583, "learning_rate": 3.4249895681262025e-05, "loss": 1.8199, "step": 12524 }, { "epoch": 0.3194717456440008, "grad_norm": 1.4429659843444824, "learning_rate": 3.417182116258899e-05, "loss": 1.7975, "step": 12555 }, { "epoch": 0.32026056476904774, "grad_norm": 1.4837095737457275, "learning_rate": 3.409364314116074e-05, "loss": 1.8039, "step": 12586 }, { "epoch": 0.32104938389409465, "grad_norm": 1.4250850677490234, "learning_rate": 3.401536249920559e-05, "loss": 1.7886, "step": 12617 }, { "epoch": 0.32183820301914157, "grad_norm": 1.429291009902954, "learning_rate": 3.393698012010998e-05, "loss": 1.7938, "step": 12648 }, { "epoch": 0.3226270221441885, "grad_norm": 1.4839123487472534, "learning_rate": 3.385849688840839e-05, "loss": 1.811, "step": 12679 }, { "epoch": 0.3234158412692354, "grad_norm": 1.5528687238693237, "learning_rate": 3.3779913689773414e-05, "loss": 1.8004, "step": 12710 }, { "epoch": 0.3242046603942823, "grad_norm": 1.4324339628219604, "learning_rate": 3.370123141100578e-05, "loss": 1.8048, "step": 12741 }, { "epoch": 0.3249934795193293, "grad_norm": 1.5204551219940186, "learning_rate": 3.3622450940024305e-05, "loss": 1.7944, "step": 12772 }, { "epoch": 0.3257822986443762, "grad_norm": 1.3667051792144775, "learning_rate": 3.35435731658559e-05, "loss": 1.8086, "step": 12803 }, { "epoch": 0.3265711177694231, "grad_norm": 1.4720345735549927, "learning_rate": 3.346459897862552e-05, "loss": 1.7982, "step": 12834 }, { "epoch": 0.32735993689447, "grad_norm": 1.4867823123931885, "learning_rate": 3.338552926954613e-05, "loss": 1.7855, "step": 12865 }, { "epoch": 0.32814875601951693, "grad_norm": 1.468201994895935, "learning_rate": 3.330636493090868e-05, "loss": 1.7909, "step": 12896 }, { "epoch": 0.32893757514456384, "grad_norm": 1.4950802326202393, "learning_rate": 3.322710685607193e-05, "loss": 1.78, "step": 12927 }, { "epoch": 0.32972639426961076, "grad_norm": 1.6039625406265259, "learning_rate": 3.314775593945251e-05, "loss": 1.8094, "step": 12958 }, { "epoch": 0.33051521339465767, "grad_norm": 1.5084600448608398, "learning_rate": 3.3068313076514714e-05, "loss": 1.7908, "step": 12989 }, { "epoch": 0.3313040325197046, "grad_norm": 1.4338363409042358, "learning_rate": 3.298877916376047e-05, "loss": 1.8092, "step": 13020 }, { "epoch": 0.3320928516447515, "grad_norm": 1.4125412702560425, "learning_rate": 3.290915509871915e-05, "loss": 1.7756, "step": 13051 }, { "epoch": 0.3328816707697984, "grad_norm": 1.4515947103500366, "learning_rate": 3.282944177993753e-05, "loss": 1.7845, "step": 13082 }, { "epoch": 0.3336704898948453, "grad_norm": 1.586795449256897, "learning_rate": 3.274964010696957e-05, "loss": 1.8047, "step": 13113 }, { "epoch": 0.33445930901989224, "grad_norm": 1.486333966255188, "learning_rate": 3.266975098036629e-05, "loss": 1.7913, "step": 13144 }, { "epoch": 0.33524812814493915, "grad_norm": 1.4172276258468628, "learning_rate": 3.258977530166562e-05, "loss": 1.7962, "step": 13175 }, { "epoch": 0.33603694726998606, "grad_norm": 1.4856761693954468, "learning_rate": 3.250971397338227e-05, "loss": 1.8232, "step": 13206 }, { "epoch": 0.336825766395033, "grad_norm": 1.496001124382019, "learning_rate": 3.2429567898997404e-05, "loss": 1.7968, "step": 13237 }, { "epoch": 0.3376145855200799, "grad_norm": 1.5100946426391602, "learning_rate": 3.234933798294859e-05, "loss": 1.806, "step": 13268 }, { "epoch": 0.3384034046451268, "grad_norm": 1.4353514909744263, "learning_rate": 3.2269025130619535e-05, "loss": 1.8088, "step": 13299 }, { "epoch": 0.3391922237701737, "grad_norm": 1.393201231956482, "learning_rate": 3.218863024832985e-05, "loss": 1.778, "step": 13330 }, { "epoch": 0.33998104289522063, "grad_norm": 1.4197418689727783, "learning_rate": 3.2108154243324864e-05, "loss": 1.7986, "step": 13361 }, { "epoch": 0.34076986202026754, "grad_norm": 1.5117108821868896, "learning_rate": 3.2027598023765345e-05, "loss": 1.812, "step": 13392 }, { "epoch": 0.34155868114531446, "grad_norm": 1.4815988540649414, "learning_rate": 3.194696249871729e-05, "loss": 1.7971, "step": 13423 }, { "epoch": 0.34234750027036137, "grad_norm": 1.4436742067337036, "learning_rate": 3.186624857814164e-05, "loss": 1.7946, "step": 13454 }, { "epoch": 0.3431363193954083, "grad_norm": 1.6074799299240112, "learning_rate": 3.178545717288401e-05, "loss": 1.8018, "step": 13485 }, { "epoch": 0.3439251385204552, "grad_norm": 1.5564550161361694, "learning_rate": 3.170458919466444e-05, "loss": 1.7816, "step": 13516 }, { "epoch": 0.34471395764550217, "grad_norm": 1.4918630123138428, "learning_rate": 3.1623645556067063e-05, "loss": 1.7759, "step": 13547 }, { "epoch": 0.3455027767705491, "grad_norm": 1.537247896194458, "learning_rate": 3.154262717052985e-05, "loss": 1.7959, "step": 13578 }, { "epoch": 0.346291595895596, "grad_norm": 1.4561980962753296, "learning_rate": 3.146153495233426e-05, "loss": 1.7923, "step": 13609 }, { "epoch": 0.3470804150206429, "grad_norm": 1.4964359998703003, "learning_rate": 3.1380369816594944e-05, "loss": 1.8033, "step": 13640 }, { "epoch": 0.3478692341456898, "grad_norm": 1.463797688484192, "learning_rate": 3.129913267924946e-05, "loss": 1.7794, "step": 13671 }, { "epoch": 0.34865805327073673, "grad_norm": 1.4784711599349976, "learning_rate": 3.121782445704782e-05, "loss": 1.8002, "step": 13702 }, { "epoch": 0.34944687239578365, "grad_norm": 1.4917761087417603, "learning_rate": 3.11364460675423e-05, "loss": 1.7964, "step": 13733 }, { "epoch": 0.35023569152083056, "grad_norm": 1.4284688234329224, "learning_rate": 3.1054998429076934e-05, "loss": 1.7981, "step": 13764 }, { "epoch": 0.3510245106458775, "grad_norm": 1.4910475015640259, "learning_rate": 3.097348246077728e-05, "loss": 1.7952, "step": 13795 }, { "epoch": 0.3518133297709244, "grad_norm": 1.4870178699493408, "learning_rate": 3.0891899082539924e-05, "loss": 1.7876, "step": 13826 }, { "epoch": 0.3526021488959713, "grad_norm": 1.5134365558624268, "learning_rate": 3.0810249215022233e-05, "loss": 1.7961, "step": 13857 }, { "epoch": 0.3533909680210182, "grad_norm": 1.595760464668274, "learning_rate": 3.0728533779631865e-05, "loss": 1.8069, "step": 13888 }, { "epoch": 0.35417978714606513, "grad_norm": 1.5597907304763794, "learning_rate": 3.064675369851637e-05, "loss": 1.787, "step": 13919 }, { "epoch": 0.35496860627111204, "grad_norm": 1.4652432203292847, "learning_rate": 3.056490989455289e-05, "loss": 1.7936, "step": 13950 }, { "epoch": 0.35575742539615895, "grad_norm": 1.5195232629776, "learning_rate": 3.0483003291337596e-05, "loss": 1.7988, "step": 13981 }, { "epoch": 0.35654624452120587, "grad_norm": 1.5883373022079468, "learning_rate": 3.040103481317539e-05, "loss": 1.7752, "step": 14012 }, { "epoch": 0.3573350636462528, "grad_norm": 1.4016722440719604, "learning_rate": 3.03190053850694e-05, "loss": 1.7793, "step": 14043 }, { "epoch": 0.3581238827712997, "grad_norm": 1.4025518894195557, "learning_rate": 3.0236915932710573e-05, "loss": 1.7788, "step": 14074 }, { "epoch": 0.3589127018963466, "grad_norm": 1.3901499509811401, "learning_rate": 3.0154767382467232e-05, "loss": 1.7766, "step": 14105 }, { "epoch": 0.3597015210213935, "grad_norm": 1.4077001810073853, "learning_rate": 3.0072560661374582e-05, "loss": 1.7903, "step": 14136 }, { "epoch": 0.36049034014644044, "grad_norm": 1.4476062059402466, "learning_rate": 2.999029669712431e-05, "loss": 1.7851, "step": 14167 }, { "epoch": 0.36127915927148735, "grad_norm": 1.4461426734924316, "learning_rate": 2.990797641805408e-05, "loss": 1.7759, "step": 14198 }, { "epoch": 0.36206797839653426, "grad_norm": 1.452197790145874, "learning_rate": 2.982560075313704e-05, "loss": 1.7877, "step": 14229 }, { "epoch": 0.3628567975215812, "grad_norm": 1.4651702642440796, "learning_rate": 2.9743170631971368e-05, "loss": 1.7609, "step": 14260 }, { "epoch": 0.3636456166466281, "grad_norm": 1.4937199354171753, "learning_rate": 2.9660686984769792e-05, "loss": 1.7794, "step": 14291 }, { "epoch": 0.36443443577167506, "grad_norm": 1.4670535326004028, "learning_rate": 2.9578150742349047e-05, "loss": 1.7891, "step": 14322 }, { "epoch": 0.36522325489672197, "grad_norm": 1.5605027675628662, "learning_rate": 2.949556283611942e-05, "loss": 1.7941, "step": 14353 }, { "epoch": 0.3660120740217689, "grad_norm": 1.4724050760269165, "learning_rate": 2.9412924198074206e-05, "loss": 1.7944, "step": 14384 }, { "epoch": 0.3668008931468158, "grad_norm": 1.4741649627685547, "learning_rate": 2.9330235760779208e-05, "loss": 1.7881, "step": 14415 }, { "epoch": 0.3675897122718627, "grad_norm": 1.9677305221557617, "learning_rate": 2.9247498457362188e-05, "loss": 1.7911, "step": 14446 }, { "epoch": 0.3683785313969096, "grad_norm": 1.5587711334228516, "learning_rate": 2.9164713221502373e-05, "loss": 1.7955, "step": 14477 }, { "epoch": 0.36916735052195654, "grad_norm": 1.4210267066955566, "learning_rate": 2.9081880987419912e-05, "loss": 1.7656, "step": 14508 }, { "epoch": 0.36995616964700345, "grad_norm": 1.5188663005828857, "learning_rate": 2.8999002689865296e-05, "loss": 1.7779, "step": 14539 }, { "epoch": 0.37074498877205037, "grad_norm": 1.460797905921936, "learning_rate": 2.8916079264108852e-05, "loss": 1.7929, "step": 14570 }, { "epoch": 0.3715338078970973, "grad_norm": 1.464895486831665, "learning_rate": 2.883311164593017e-05, "loss": 1.777, "step": 14601 }, { "epoch": 0.3723226270221442, "grad_norm": 1.5290796756744385, "learning_rate": 2.875010077160754e-05, "loss": 1.7837, "step": 14632 }, { "epoch": 0.3731114461471911, "grad_norm": 1.4018083810806274, "learning_rate": 2.866704757790741e-05, "loss": 1.7685, "step": 14663 }, { "epoch": 0.373900265272238, "grad_norm": 1.3743735551834106, "learning_rate": 2.858395300207376e-05, "loss": 1.7752, "step": 14694 }, { "epoch": 0.37468908439728493, "grad_norm": 1.4071861505508423, "learning_rate": 2.8500817981817607e-05, "loss": 1.7894, "step": 14725 }, { "epoch": 0.37547790352233185, "grad_norm": 1.533968448638916, "learning_rate": 2.8417643455306336e-05, "loss": 1.7815, "step": 14756 }, { "epoch": 0.37626672264737876, "grad_norm": 1.4698586463928223, "learning_rate": 2.8334430361153185e-05, "loss": 1.7693, "step": 14787 }, { "epoch": 0.3770555417724257, "grad_norm": 1.4075795412063599, "learning_rate": 2.8251179638406612e-05, "loss": 1.7701, "step": 14818 }, { "epoch": 0.3778443608974726, "grad_norm": 1.4033679962158203, "learning_rate": 2.8167892226539704e-05, "loss": 1.7728, "step": 14849 }, { "epoch": 0.3786331800225195, "grad_norm": 1.4572257995605469, "learning_rate": 2.8084569065439588e-05, "loss": 1.8019, "step": 14880 }, { "epoch": 0.3794219991475664, "grad_norm": 1.5333317518234253, "learning_rate": 2.8001211095396807e-05, "loss": 1.7979, "step": 14911 }, { "epoch": 0.3802108182726133, "grad_norm": 1.4421522617340088, "learning_rate": 2.791781925709473e-05, "loss": 1.7768, "step": 14942 }, { "epoch": 0.38099963739766024, "grad_norm": 1.5021952390670776, "learning_rate": 2.7834394491598908e-05, "loss": 1.7758, "step": 14973 }, { "epoch": 0.38178845652270715, "grad_norm": 1.462990641593933, "learning_rate": 2.7750937740346485e-05, "loss": 1.757, "step": 15004 }, { "epoch": 0.38257727564775407, "grad_norm": 1.4034866094589233, "learning_rate": 2.7667449945135564e-05, "loss": 1.7658, "step": 15035 }, { "epoch": 0.383366094772801, "grad_norm": 1.5529896020889282, "learning_rate": 2.7583932048114557e-05, "loss": 1.787, "step": 15066 }, { "epoch": 0.38415491389784795, "grad_norm": 1.3766757249832153, "learning_rate": 2.7500384991771587e-05, "loss": 1.7857, "step": 15097 }, { "epoch": 0.38494373302289486, "grad_norm": 1.3775665760040283, "learning_rate": 2.7416809718923825e-05, "loss": 1.7961, "step": 15128 }, { "epoch": 0.3857325521479418, "grad_norm": 1.4085272550582886, "learning_rate": 2.7333207172706864e-05, "loss": 1.7818, "step": 15159 }, { "epoch": 0.3865213712729887, "grad_norm": 1.441758394241333, "learning_rate": 2.7249578296564088e-05, "loss": 1.7746, "step": 15190 }, { "epoch": 0.3873101903980356, "grad_norm": 1.4011828899383545, "learning_rate": 2.7165924034235973e-05, "loss": 1.7704, "step": 15221 }, { "epoch": 0.3880990095230825, "grad_norm": 1.4673304557800293, "learning_rate": 2.708224532974953e-05, "loss": 1.7863, "step": 15252 }, { "epoch": 0.38888782864812943, "grad_norm": 1.4282735586166382, "learning_rate": 2.6998543127407538e-05, "loss": 1.76, "step": 15283 }, { "epoch": 0.38967664777317634, "grad_norm": 1.3983831405639648, "learning_rate": 2.6914818371777988e-05, "loss": 1.7803, "step": 15314 }, { "epoch": 0.39046546689822326, "grad_norm": 1.5473729372024536, "learning_rate": 2.6831072007683373e-05, "loss": 1.7787, "step": 15345 }, { "epoch": 0.39125428602327017, "grad_norm": 1.565489649772644, "learning_rate": 2.6747304980190018e-05, "loss": 1.755, "step": 15376 }, { "epoch": 0.3920431051483171, "grad_norm": 1.4918326139450073, "learning_rate": 2.6663518234597453e-05, "loss": 1.8007, "step": 15407 }, { "epoch": 0.392831924273364, "grad_norm": 1.5468804836273193, "learning_rate": 2.6579712716427696e-05, "loss": 1.7574, "step": 15438 }, { "epoch": 0.3936207433984109, "grad_norm": 1.4871866703033447, "learning_rate": 2.6495889371414652e-05, "loss": 1.7757, "step": 15469 }, { "epoch": 0.3944095625234578, "grad_norm": 1.5485950708389282, "learning_rate": 2.6412049145493367e-05, "loss": 1.79, "step": 15500 }, { "epoch": 0.39519838164850474, "grad_norm": 1.5302681922912598, "learning_rate": 2.632819298478939e-05, "loss": 1.779, "step": 15531 }, { "epoch": 0.39598720077355165, "grad_norm": 1.5713484287261963, "learning_rate": 2.6244321835608105e-05, "loss": 1.7526, "step": 15562 }, { "epoch": 0.39677601989859856, "grad_norm": 1.4450056552886963, "learning_rate": 2.6160436644424024e-05, "loss": 1.7896, "step": 15593 }, { "epoch": 0.3975648390236455, "grad_norm": 1.5404566526412964, "learning_rate": 2.6076538357870133e-05, "loss": 1.7612, "step": 15624 }, { "epoch": 0.3983536581486924, "grad_norm": 1.5850070714950562, "learning_rate": 2.5992627922727196e-05, "loss": 1.7588, "step": 15655 }, { "epoch": 0.3991424772737393, "grad_norm": 1.4891109466552734, "learning_rate": 2.5908706285913066e-05, "loss": 1.768, "step": 15686 }, { "epoch": 0.3999312963987862, "grad_norm": 1.4907901287078857, "learning_rate": 2.5824774394472008e-05, "loss": 1.7672, "step": 15717 }, { "epoch": 0.40072011552383313, "grad_norm": 1.418935775756836, "learning_rate": 2.5740833195563996e-05, "loss": 1.7812, "step": 15748 }, { "epoch": 0.40150893464888004, "grad_norm": 1.4996947050094604, "learning_rate": 2.5656883636454067e-05, "loss": 1.7833, "step": 15779 }, { "epoch": 0.40229775377392696, "grad_norm": 1.5073673725128174, "learning_rate": 2.557292666450159e-05, "loss": 1.7768, "step": 15810 }, { "epoch": 0.40308657289897387, "grad_norm": 1.4004729986190796, "learning_rate": 2.5488963227149566e-05, "loss": 1.7688, "step": 15841 }, { "epoch": 0.40387539202402084, "grad_norm": 1.4226566553115845, "learning_rate": 2.5404994271913983e-05, "loss": 1.7758, "step": 15872 }, { "epoch": 0.40466421114906775, "grad_norm": 1.3709113597869873, "learning_rate": 2.5321020746373085e-05, "loss": 1.7664, "step": 15903 }, { "epoch": 0.40545303027411467, "grad_norm": 1.3796721696853638, "learning_rate": 2.52370435981567e-05, "loss": 1.7584, "step": 15934 }, { "epoch": 0.4062418493991616, "grad_norm": 1.455452561378479, "learning_rate": 2.5153063774935533e-05, "loss": 1.7745, "step": 15965 }, { "epoch": 0.4070306685242085, "grad_norm": 1.509347677230835, "learning_rate": 2.506908222441045e-05, "loss": 1.7763, "step": 15996 }, { "epoch": 0.4078194876492554, "grad_norm": 1.3093947172164917, "learning_rate": 2.498509989430187e-05, "loss": 1.7565, "step": 16027 }, { "epoch": 0.4086083067743023, "grad_norm": 1.4819965362548828, "learning_rate": 2.4901117732338958e-05, "loss": 1.7678, "step": 16058 }, { "epoch": 0.40939712589934923, "grad_norm": 1.4977960586547852, "learning_rate": 2.481713668624899e-05, "loss": 1.7673, "step": 16089 }, { "epoch": 0.41018594502439615, "grad_norm": 1.4152425527572632, "learning_rate": 2.4733157703746663e-05, "loss": 1.759, "step": 16120 }, { "epoch": 0.41097476414944306, "grad_norm": 1.3504704236984253, "learning_rate": 2.4649181732523392e-05, "loss": 1.773, "step": 16151 }, { "epoch": 0.41176358327449, "grad_norm": 1.3932607173919678, "learning_rate": 2.4565209720236582e-05, "loss": 1.7724, "step": 16182 }, { "epoch": 0.4125524023995369, "grad_norm": 1.423255443572998, "learning_rate": 2.4481242614498975e-05, "loss": 1.7504, "step": 16213 }, { "epoch": 0.4133412215245838, "grad_norm": 1.5146458148956299, "learning_rate": 2.439728136286796e-05, "loss": 1.7572, "step": 16244 }, { "epoch": 0.4141300406496307, "grad_norm": 1.4159959554672241, "learning_rate": 2.4313326912834852e-05, "loss": 1.7495, "step": 16275 }, { "epoch": 0.41491885977467763, "grad_norm": 1.4505484104156494, "learning_rate": 2.4229380211814206e-05, "loss": 1.7748, "step": 16306 }, { "epoch": 0.41570767889972454, "grad_norm": 1.4519730806350708, "learning_rate": 2.4145442207133124e-05, "loss": 1.7635, "step": 16337 }, { "epoch": 0.41649649802477146, "grad_norm": 1.452431082725525, "learning_rate": 2.406151384602059e-05, "loss": 1.7624, "step": 16368 }, { "epoch": 0.41728531714981837, "grad_norm": 1.4315119981765747, "learning_rate": 2.3977596075596747e-05, "loss": 1.7765, "step": 16399 }, { "epoch": 0.4180741362748653, "grad_norm": 1.4047067165374756, "learning_rate": 2.3893689842862223e-05, "loss": 1.755, "step": 16430 }, { "epoch": 0.4188629553999122, "grad_norm": 1.426621913909912, "learning_rate": 2.3809796094687475e-05, "loss": 1.7598, "step": 16461 }, { "epoch": 0.4196517745249591, "grad_norm": 1.4108635187149048, "learning_rate": 2.372591577780202e-05, "loss": 1.7652, "step": 16492 }, { "epoch": 0.420440593650006, "grad_norm": 1.4988287687301636, "learning_rate": 2.3642049838783838e-05, "loss": 1.7763, "step": 16523 }, { "epoch": 0.42122941277505294, "grad_norm": 1.4525630474090576, "learning_rate": 2.3558199224048666e-05, "loss": 1.7607, "step": 16554 }, { "epoch": 0.42201823190009985, "grad_norm": 1.512402892112732, "learning_rate": 2.347436487983929e-05, "loss": 1.7625, "step": 16585 }, { "epoch": 0.42280705102514676, "grad_norm": 1.4328192472457886, "learning_rate": 2.3390547752214888e-05, "loss": 1.7598, "step": 16616 }, { "epoch": 0.42359587015019373, "grad_norm": 1.4110822677612305, "learning_rate": 2.330674878704035e-05, "loss": 1.757, "step": 16647 }, { "epoch": 0.42438468927524065, "grad_norm": 1.4538228511810303, "learning_rate": 2.322296892997561e-05, "loss": 1.7503, "step": 16678 }, { "epoch": 0.42517350840028756, "grad_norm": 1.4495991468429565, "learning_rate": 2.313920912646497e-05, "loss": 1.7593, "step": 16709 }, { "epoch": 0.42596232752533447, "grad_norm": 1.5201659202575684, "learning_rate": 2.305547032172643e-05, "loss": 1.7512, "step": 16740 }, { "epoch": 0.4267511466503814, "grad_norm": 1.4683400392532349, "learning_rate": 2.2971753460741014e-05, "loss": 1.7792, "step": 16771 }, { "epoch": 0.4275399657754283, "grad_norm": 1.4335435628890991, "learning_rate": 2.288805948824212e-05, "loss": 1.7495, "step": 16802 }, { "epoch": 0.4283287849004752, "grad_norm": 1.494997501373291, "learning_rate": 2.2804389348704858e-05, "loss": 1.7806, "step": 16833 }, { "epoch": 0.4291176040255221, "grad_norm": 1.5237140655517578, "learning_rate": 2.2720743986335374e-05, "loss": 1.7348, "step": 16864 }, { "epoch": 0.42990642315056904, "grad_norm": 1.4462862014770508, "learning_rate": 2.2637124345060233e-05, "loss": 1.7663, "step": 16895 }, { "epoch": 0.43069524227561595, "grad_norm": 1.4371618032455444, "learning_rate": 2.2553531368515695e-05, "loss": 1.7699, "step": 16926 }, { "epoch": 0.43148406140066287, "grad_norm": 1.4182896614074707, "learning_rate": 2.2469966000037144e-05, "loss": 1.7651, "step": 16957 }, { "epoch": 0.4322728805257098, "grad_norm": 1.3784195184707642, "learning_rate": 2.2386429182648417e-05, "loss": 1.7596, "step": 16988 }, { "epoch": 0.4330616996507567, "grad_norm": 1.398327350616455, "learning_rate": 2.230292185905114e-05, "loss": 1.7588, "step": 17019 }, { "epoch": 0.4338505187758036, "grad_norm": 1.4239211082458496, "learning_rate": 2.2219444971614116e-05, "loss": 1.7656, "step": 17050 }, { "epoch": 0.4346393379008505, "grad_norm": 1.4027754068374634, "learning_rate": 2.2135999462362655e-05, "loss": 1.7485, "step": 17081 }, { "epoch": 0.43542815702589743, "grad_norm": 1.442612648010254, "learning_rate": 2.2052586272968003e-05, "loss": 1.7682, "step": 17112 }, { "epoch": 0.43621697615094435, "grad_norm": 1.3537038564682007, "learning_rate": 2.196920634473666e-05, "loss": 1.7511, "step": 17143 }, { "epoch": 0.43700579527599126, "grad_norm": 1.3696125745773315, "learning_rate": 2.1885860618599787e-05, "loss": 1.767, "step": 17174 }, { "epoch": 0.4377946144010382, "grad_norm": 1.5365840196609497, "learning_rate": 2.1802550035102577e-05, "loss": 1.7527, "step": 17205 }, { "epoch": 0.4385834335260851, "grad_norm": 1.4375520944595337, "learning_rate": 2.171927553439363e-05, "loss": 1.7577, "step": 17236 }, { "epoch": 0.439372252651132, "grad_norm": 1.4054752588272095, "learning_rate": 2.1636038056214376e-05, "loss": 1.7479, "step": 17267 }, { "epoch": 0.4401610717761789, "grad_norm": 1.4836634397506714, "learning_rate": 2.155283853988844e-05, "loss": 1.7463, "step": 17298 }, { "epoch": 0.4409498909012258, "grad_norm": 1.4966789484024048, "learning_rate": 2.146967792431106e-05, "loss": 1.7539, "step": 17329 }, { "epoch": 0.44173871002627274, "grad_norm": 1.3743985891342163, "learning_rate": 2.138655714793849e-05, "loss": 1.7501, "step": 17360 }, { "epoch": 0.44252752915131965, "grad_norm": 1.4786440134048462, "learning_rate": 2.1303477148777367e-05, "loss": 1.7438, "step": 17391 }, { "epoch": 0.4433163482763666, "grad_norm": 1.3931723833084106, "learning_rate": 2.122043886437421e-05, "loss": 1.7569, "step": 17422 }, { "epoch": 0.44410516740141354, "grad_norm": 1.457221508026123, "learning_rate": 2.1137443231804765e-05, "loss": 1.7459, "step": 17453 }, { "epoch": 0.44489398652646045, "grad_norm": 1.4700186252593994, "learning_rate": 2.105449118766347e-05, "loss": 1.749, "step": 17484 }, { "epoch": 0.44568280565150736, "grad_norm": 1.4787609577178955, "learning_rate": 2.097158366805287e-05, "loss": 1.7433, "step": 17515 }, { "epoch": 0.4464716247765543, "grad_norm": 1.435116171836853, "learning_rate": 2.0888721608573047e-05, "loss": 1.7492, "step": 17546 }, { "epoch": 0.4472604439016012, "grad_norm": 1.3931212425231934, "learning_rate": 2.0805905944311087e-05, "loss": 1.7698, "step": 17577 }, { "epoch": 0.4480492630266481, "grad_norm": 1.4713780879974365, "learning_rate": 2.0723137609830497e-05, "loss": 1.7599, "step": 17608 }, { "epoch": 0.448838082151695, "grad_norm": 1.3709975481033325, "learning_rate": 2.0640417539160686e-05, "loss": 1.7615, "step": 17639 }, { "epoch": 0.44962690127674193, "grad_norm": 1.4991896152496338, "learning_rate": 2.0557746665786427e-05, "loss": 1.7541, "step": 17670 }, { "epoch": 0.45041572040178884, "grad_norm": 1.4068297147750854, "learning_rate": 2.0475125922637256e-05, "loss": 1.7383, "step": 17701 }, { "epoch": 0.45120453952683576, "grad_norm": 1.4071435928344727, "learning_rate": 2.0392556242077047e-05, "loss": 1.754, "step": 17732 }, { "epoch": 0.45199335865188267, "grad_norm": 1.5196751356124878, "learning_rate": 2.031003855589343e-05, "loss": 1.7406, "step": 17763 }, { "epoch": 0.4527821777769296, "grad_norm": 1.4674859046936035, "learning_rate": 2.022757379528727e-05, "loss": 1.7496, "step": 17794 }, { "epoch": 0.4535709969019765, "grad_norm": 1.377008318901062, "learning_rate": 2.0145162890862184e-05, "loss": 1.7573, "step": 17825 }, { "epoch": 0.4543598160270234, "grad_norm": 1.3753769397735596, "learning_rate": 2.0062806772614022e-05, "loss": 1.7312, "step": 17856 }, { "epoch": 0.4551486351520703, "grad_norm": 1.3770841360092163, "learning_rate": 1.9980506369920392e-05, "loss": 1.7378, "step": 17887 }, { "epoch": 0.45593745427711724, "grad_norm": 1.644900918006897, "learning_rate": 1.989826261153015e-05, "loss": 1.7423, "step": 17918 }, { "epoch": 0.45672627340216415, "grad_norm": 1.423464059829712, "learning_rate": 1.9816076425552923e-05, "loss": 1.745, "step": 17949 }, { "epoch": 0.45751509252721106, "grad_norm": 1.4013458490371704, "learning_rate": 1.9733948739448676e-05, "loss": 1.7444, "step": 17980 }, { "epoch": 0.458303911652258, "grad_norm": 1.4134126901626587, "learning_rate": 1.9651880480017155e-05, "loss": 1.7318, "step": 18011 }, { "epoch": 0.4590927307773049, "grad_norm": 1.389404535293579, "learning_rate": 1.9569872573387516e-05, "loss": 1.7531, "step": 18042 }, { "epoch": 0.4598815499023518, "grad_norm": 1.4825111627578735, "learning_rate": 1.9487925945007854e-05, "loss": 1.7178, "step": 18073 }, { "epoch": 0.4606703690273987, "grad_norm": 1.335856556892395, "learning_rate": 1.9406041519634726e-05, "loss": 1.7569, "step": 18104 }, { "epoch": 0.46145918815244563, "grad_norm": 1.5451414585113525, "learning_rate": 1.932422022132275e-05, "loss": 1.7608, "step": 18135 }, { "epoch": 0.4622480072774926, "grad_norm": 1.458856225013733, "learning_rate": 1.924246297341414e-05, "loss": 1.7381, "step": 18166 }, { "epoch": 0.4630368264025395, "grad_norm": 1.5138990879058838, "learning_rate": 1.9160770698528338e-05, "loss": 1.7505, "step": 18197 }, { "epoch": 0.4638256455275864, "grad_norm": 1.433817744255066, "learning_rate": 1.907914431855156e-05, "loss": 1.7406, "step": 18228 }, { "epoch": 0.46461446465263334, "grad_norm": 1.3636925220489502, "learning_rate": 1.8997584754626412e-05, "loss": 1.7533, "step": 18259 }, { "epoch": 0.46540328377768025, "grad_norm": 1.4643160104751587, "learning_rate": 1.8916092927141486e-05, "loss": 1.7329, "step": 18290 }, { "epoch": 0.46619210290272717, "grad_norm": 1.3883280754089355, "learning_rate": 1.883466975572098e-05, "loss": 1.7386, "step": 18321 }, { "epoch": 0.4669809220277741, "grad_norm": 1.4294878244400024, "learning_rate": 1.8753316159214312e-05, "loss": 1.7553, "step": 18352 }, { "epoch": 0.467769741152821, "grad_norm": 1.4422011375427246, "learning_rate": 1.8672033055685766e-05, "loss": 1.7333, "step": 18383 }, { "epoch": 0.4685585602778679, "grad_norm": 1.4707412719726562, "learning_rate": 1.8590821362404116e-05, "loss": 1.7463, "step": 18414 }, { "epoch": 0.4693473794029148, "grad_norm": 1.4446028470993042, "learning_rate": 1.8509681995832294e-05, "loss": 1.7262, "step": 18445 }, { "epoch": 0.47013619852796174, "grad_norm": 1.3790693283081055, "learning_rate": 1.8428615871617004e-05, "loss": 1.7442, "step": 18476 }, { "epoch": 0.47092501765300865, "grad_norm": 1.4067668914794922, "learning_rate": 1.8347623904578448e-05, "loss": 1.731, "step": 18507 }, { "epoch": 0.47171383677805556, "grad_norm": 1.496756672859192, "learning_rate": 1.8266707008699975e-05, "loss": 1.7546, "step": 18538 }, { "epoch": 0.4725026559031025, "grad_norm": 1.4508312940597534, "learning_rate": 1.818586609711774e-05, "loss": 1.748, "step": 18569 }, { "epoch": 0.4732914750281494, "grad_norm": 1.4680043458938599, "learning_rate": 1.8105102082110462e-05, "loss": 1.7334, "step": 18600 }, { "epoch": 0.4740802941531963, "grad_norm": 1.512750267982483, "learning_rate": 1.8024415875089058e-05, "loss": 1.7437, "step": 18631 }, { "epoch": 0.4748691132782432, "grad_norm": 1.4424457550048828, "learning_rate": 1.7943808386586407e-05, "loss": 1.7454, "step": 18662 }, { "epoch": 0.47565793240329013, "grad_norm": 1.47055983543396, "learning_rate": 1.7863280526247073e-05, "loss": 1.7132, "step": 18693 }, { "epoch": 0.47644675152833704, "grad_norm": 1.3706848621368408, "learning_rate": 1.7782833202817003e-05, "loss": 1.7329, "step": 18724 }, { "epoch": 0.47723557065338396, "grad_norm": 1.3911017179489136, "learning_rate": 1.7702467324133327e-05, "loss": 1.7322, "step": 18755 }, { "epoch": 0.47802438977843087, "grad_norm": 1.3935508728027344, "learning_rate": 1.7622183797114042e-05, "loss": 1.7463, "step": 18786 }, { "epoch": 0.4788132089034778, "grad_norm": 1.3628978729248047, "learning_rate": 1.7541983527747838e-05, "loss": 1.7529, "step": 18817 }, { "epoch": 0.4796020280285247, "grad_norm": 1.3993347883224487, "learning_rate": 1.746186742108387e-05, "loss": 1.7517, "step": 18848 }, { "epoch": 0.4803908471535716, "grad_norm": 1.4427608251571655, "learning_rate": 1.73818363812215e-05, "loss": 1.7499, "step": 18879 }, { "epoch": 0.4811796662786185, "grad_norm": 1.4312776327133179, "learning_rate": 1.7301891311300153e-05, "loss": 1.7471, "step": 18910 }, { "epoch": 0.4819684854036655, "grad_norm": 1.373559832572937, "learning_rate": 1.7222033113489055e-05, "loss": 1.7555, "step": 18941 }, { "epoch": 0.4827573045287124, "grad_norm": 1.383086085319519, "learning_rate": 1.7142262688977127e-05, "loss": 1.7446, "step": 18972 }, { "epoch": 0.4835461236537593, "grad_norm": 1.459486722946167, "learning_rate": 1.7062580937962764e-05, "loss": 1.7523, "step": 19003 }, { "epoch": 0.48433494277880623, "grad_norm": 1.5249037742614746, "learning_rate": 1.698298875964369e-05, "loss": 1.7249, "step": 19034 }, { "epoch": 0.48512376190385315, "grad_norm": 1.431281566619873, "learning_rate": 1.690348705220684e-05, "loss": 1.7133, "step": 19065 }, { "epoch": 0.48591258102890006, "grad_norm": 1.4861342906951904, "learning_rate": 1.6824076712818156e-05, "loss": 1.7377, "step": 19096 }, { "epoch": 0.486701400153947, "grad_norm": 1.3854913711547852, "learning_rate": 1.6744758637612533e-05, "loss": 1.7292, "step": 19127 }, { "epoch": 0.4874902192789939, "grad_norm": 1.482332468032837, "learning_rate": 1.6665533721683664e-05, "loss": 1.7505, "step": 19158 }, { "epoch": 0.4882790384040408, "grad_norm": 1.3565430641174316, "learning_rate": 1.6586402859073974e-05, "loss": 1.742, "step": 19189 }, { "epoch": 0.4890678575290877, "grad_norm": 1.445395588874817, "learning_rate": 1.6507366942764463e-05, "loss": 1.7387, "step": 19220 }, { "epoch": 0.4898566766541346, "grad_norm": 1.4928544759750366, "learning_rate": 1.6428426864664732e-05, "loss": 1.762, "step": 19251 }, { "epoch": 0.49064549577918154, "grad_norm": 1.38858962059021, "learning_rate": 1.6349583515602816e-05, "loss": 1.7486, "step": 19282 }, { "epoch": 0.49143431490422845, "grad_norm": 1.3937194347381592, "learning_rate": 1.6270837785315208e-05, "loss": 1.7542, "step": 19313 }, { "epoch": 0.49222313402927537, "grad_norm": 1.501042127609253, "learning_rate": 1.619219056243676e-05, "loss": 1.7274, "step": 19344 }, { "epoch": 0.4930119531543223, "grad_norm": 1.3143610954284668, "learning_rate": 1.6113642734490698e-05, "loss": 1.7137, "step": 19375 }, { "epoch": 0.4938007722793692, "grad_norm": 1.4225116968154907, "learning_rate": 1.6035195187878577e-05, "loss": 1.7414, "step": 19406 }, { "epoch": 0.4945895914044161, "grad_norm": 1.4575517177581787, "learning_rate": 1.5956848807870305e-05, "loss": 1.7044, "step": 19437 }, { "epoch": 0.495378410529463, "grad_norm": 1.4163532257080078, "learning_rate": 1.587860447859413e-05, "loss": 1.7365, "step": 19468 }, { "epoch": 0.49616722965450993, "grad_norm": 1.504955530166626, "learning_rate": 1.5800463083026686e-05, "loss": 1.759, "step": 19499 }, { "epoch": 0.49695604877955685, "grad_norm": 1.4385664463043213, "learning_rate": 1.572242550298298e-05, "loss": 1.7193, "step": 19530 }, { "epoch": 0.49774486790460376, "grad_norm": 1.384371280670166, "learning_rate": 1.56444926191065e-05, "loss": 1.7224, "step": 19561 }, { "epoch": 0.4985336870296507, "grad_norm": 1.392520785331726, "learning_rate": 1.5566665310859257e-05, "loss": 1.7246, "step": 19592 }, { "epoch": 0.4993225061546976, "grad_norm": 1.421629786491394, "learning_rate": 1.5488944456511846e-05, "loss": 1.7314, "step": 19623 }, { "epoch": 0.5001113252797446, "grad_norm": 1.405013918876648, "learning_rate": 1.5411330933133546e-05, "loss": 1.7329, "step": 19654 }, { "epoch": 0.5009001444047915, "grad_norm": 1.372490406036377, "learning_rate": 1.533382561658241e-05, "loss": 1.7253, "step": 19685 }, { "epoch": 0.5016889635298384, "grad_norm": 1.4861042499542236, "learning_rate": 1.525642938149541e-05, "loss": 1.7317, "step": 19716 }, { "epoch": 0.5024777826548853, "grad_norm": 1.4108079671859741, "learning_rate": 1.5179143101278536e-05, "loss": 1.7391, "step": 19747 }, { "epoch": 0.5032666017799322, "grad_norm": 1.4616518020629883, "learning_rate": 1.5101967648096955e-05, "loss": 1.7129, "step": 19778 }, { "epoch": 0.5040554209049791, "grad_norm": 1.3660775423049927, "learning_rate": 1.5024903892865172e-05, "loss": 1.7149, "step": 19809 }, { "epoch": 0.504844240030026, "grad_norm": 1.4286696910858154, "learning_rate": 1.4947952705237184e-05, "loss": 1.6982, "step": 19840 }, { "epoch": 0.505633059155073, "grad_norm": 1.4002240896224976, "learning_rate": 1.4871114953596682e-05, "loss": 1.7319, "step": 19871 }, { "epoch": 0.5064218782801199, "grad_norm": 1.4784040451049805, "learning_rate": 1.4794391505047256e-05, "loss": 1.7193, "step": 19902 }, { "epoch": 0.5072106974051668, "grad_norm": 1.4451029300689697, "learning_rate": 1.4717783225402596e-05, "loss": 1.7225, "step": 19933 }, { "epoch": 0.5079995165302137, "grad_norm": 1.438502550125122, "learning_rate": 1.4641290979176735e-05, "loss": 1.7186, "step": 19964 }, { "epoch": 0.5087883356552606, "grad_norm": 1.4443246126174927, "learning_rate": 1.4564915629574246e-05, "loss": 1.7242, "step": 19995 }, { "epoch": 0.5095771547803075, "grad_norm": 1.5206542015075684, "learning_rate": 1.4488658038480601e-05, "loss": 1.7406, "step": 20026 }, { "epoch": 0.5103659739053544, "grad_norm": 1.4452012777328491, "learning_rate": 1.4412519066452323e-05, "loss": 1.7218, "step": 20057 }, { "epoch": 0.5111547930304013, "grad_norm": 1.4169068336486816, "learning_rate": 1.4336499572707373e-05, "loss": 1.7365, "step": 20088 }, { "epoch": 0.5119436121554483, "grad_norm": 1.475844383239746, "learning_rate": 1.4260600415115433e-05, "loss": 1.7264, "step": 20119 }, { "epoch": 0.5127324312804952, "grad_norm": 1.4148321151733398, "learning_rate": 1.4184822450188137e-05, "loss": 1.7348, "step": 20150 }, { "epoch": 0.5135212504055421, "grad_norm": 1.4532842636108398, "learning_rate": 1.410916653306954e-05, "loss": 1.7021, "step": 20181 }, { "epoch": 0.514310069530589, "grad_norm": 1.4390312433242798, "learning_rate": 1.403363351752639e-05, "loss": 1.7205, "step": 20212 }, { "epoch": 0.5150988886556359, "grad_norm": 1.4490697383880615, "learning_rate": 1.3958224255938485e-05, "loss": 1.7235, "step": 20243 }, { "epoch": 0.5158877077806828, "grad_norm": 1.4487396478652954, "learning_rate": 1.388293959928911e-05, "loss": 1.7325, "step": 20274 }, { "epoch": 0.5166765269057297, "grad_norm": 1.3987274169921875, "learning_rate": 1.3807780397155379e-05, "loss": 1.707, "step": 20305 }, { "epoch": 0.5174653460307767, "grad_norm": 1.4041749238967896, "learning_rate": 1.3732747497698655e-05, "loss": 1.7179, "step": 20336 }, { "epoch": 0.5182541651558236, "grad_norm": 1.442674994468689, "learning_rate": 1.3657841747655038e-05, "loss": 1.7382, "step": 20367 }, { "epoch": 0.5190429842808705, "grad_norm": 1.4303447008132935, "learning_rate": 1.3583063992325706e-05, "loss": 1.7375, "step": 20398 }, { "epoch": 0.5198318034059174, "grad_norm": 1.4175372123718262, "learning_rate": 1.3508415075567496e-05, "loss": 1.7126, "step": 20429 }, { "epoch": 0.5206206225309643, "grad_norm": 1.4764102697372437, "learning_rate": 1.343389583978327e-05, "loss": 1.7372, "step": 20460 }, { "epoch": 0.5214094416560112, "grad_norm": 1.4597842693328857, "learning_rate": 1.3359507125912468e-05, "loss": 1.7247, "step": 20491 }, { "epoch": 0.5221982607810581, "grad_norm": 1.4972703456878662, "learning_rate": 1.3285249773421627e-05, "loss": 1.7158, "step": 20522 }, { "epoch": 0.522987079906105, "grad_norm": 1.4102123975753784, "learning_rate": 1.3211124620294884e-05, "loss": 1.74, "step": 20553 }, { "epoch": 0.523775899031152, "grad_norm": 1.3836309909820557, "learning_rate": 1.313713250302451e-05, "loss": 1.733, "step": 20584 }, { "epoch": 0.5245647181561989, "grad_norm": 1.4065951108932495, "learning_rate": 1.3063274256601479e-05, "loss": 1.7291, "step": 20615 }, { "epoch": 0.5253535372812458, "grad_norm": 1.4294134378433228, "learning_rate": 1.2989550714506086e-05, "loss": 1.7196, "step": 20646 }, { "epoch": 0.5261423564062927, "grad_norm": 1.438848853111267, "learning_rate": 1.291596270869846e-05, "loss": 1.7294, "step": 20677 }, { "epoch": 0.5269311755313396, "grad_norm": 1.3648425340652466, "learning_rate": 1.284251106960927e-05, "loss": 1.725, "step": 20708 }, { "epoch": 0.5277199946563865, "grad_norm": 1.4666975736618042, "learning_rate": 1.2769196626130263e-05, "loss": 1.7272, "step": 20739 }, { "epoch": 0.5285088137814334, "grad_norm": 1.4472864866256714, "learning_rate": 1.2696020205604969e-05, "loss": 1.7216, "step": 20770 }, { "epoch": 0.5292976329064804, "grad_norm": 1.4326000213623047, "learning_rate": 1.2622982633819359e-05, "loss": 1.7263, "step": 20801 }, { "epoch": 0.5300864520315273, "grad_norm": 1.468807578086853, "learning_rate": 1.2550084734992484e-05, "loss": 1.7366, "step": 20832 }, { "epoch": 0.5308752711565742, "grad_norm": 1.3874242305755615, "learning_rate": 1.247732733176724e-05, "loss": 1.7235, "step": 20863 }, { "epoch": 0.5316640902816212, "grad_norm": 1.4644588232040405, "learning_rate": 1.2404711245201044e-05, "loss": 1.7363, "step": 20894 }, { "epoch": 0.5324529094066681, "grad_norm": 1.4458835124969482, "learning_rate": 1.2332237294756535e-05, "loss": 1.7062, "step": 20925 }, { "epoch": 0.533241728531715, "grad_norm": 1.4956963062286377, "learning_rate": 1.225990629829241e-05, "loss": 1.7244, "step": 20956 }, { "epoch": 0.534030547656762, "grad_norm": 1.4594619274139404, "learning_rate": 1.2187719072054136e-05, "loss": 1.7074, "step": 20987 }, { "epoch": 0.5348193667818089, "grad_norm": 1.4499660730361938, "learning_rate": 1.2115676430664735e-05, "loss": 1.7154, "step": 21018 }, { "epoch": 0.5356081859068558, "grad_norm": 1.5303255319595337, "learning_rate": 1.2043779187115647e-05, "loss": 1.7284, "step": 21049 }, { "epoch": 0.5363970050319027, "grad_norm": 1.3913129568099976, "learning_rate": 1.1972028152757476e-05, "loss": 1.7328, "step": 21080 }, { "epoch": 0.5371858241569496, "grad_norm": 1.4718728065490723, "learning_rate": 1.1900424137290889e-05, "loss": 1.7208, "step": 21111 }, { "epoch": 0.5379746432819965, "grad_norm": 1.3919767141342163, "learning_rate": 1.1828967948757482e-05, "loss": 1.7143, "step": 21142 }, { "epoch": 0.5387634624070434, "grad_norm": 1.4659541845321655, "learning_rate": 1.175766039353062e-05, "loss": 1.7111, "step": 21173 }, { "epoch": 0.5395522815320903, "grad_norm": 1.4828646183013916, "learning_rate": 1.1686502276306382e-05, "loss": 1.7113, "step": 21204 }, { "epoch": 0.5403411006571373, "grad_norm": 1.458970308303833, "learning_rate": 1.1615494400094445e-05, "loss": 1.7199, "step": 21235 }, { "epoch": 0.5411299197821842, "grad_norm": 1.4522119760513306, "learning_rate": 1.1544637566209029e-05, "loss": 1.7052, "step": 21266 }, { "epoch": 0.5419187389072311, "grad_norm": 1.4456357955932617, "learning_rate": 1.1473932574259886e-05, "loss": 1.7201, "step": 21297 }, { "epoch": 0.542707558032278, "grad_norm": 1.4089595079421997, "learning_rate": 1.1403380222143247e-05, "loss": 1.705, "step": 21328 }, { "epoch": 0.5434963771573249, "grad_norm": 1.4137688875198364, "learning_rate": 1.1332981306032808e-05, "loss": 1.7135, "step": 21359 }, { "epoch": 0.5442851962823718, "grad_norm": 1.4155645370483398, "learning_rate": 1.1262736620370762e-05, "loss": 1.7159, "step": 21390 }, { "epoch": 0.5450740154074187, "grad_norm": 1.5558688640594482, "learning_rate": 1.1192646957858854e-05, "loss": 1.7283, "step": 21421 }, { "epoch": 0.5458628345324656, "grad_norm": 1.5027565956115723, "learning_rate": 1.1122713109449381e-05, "loss": 1.7135, "step": 21452 }, { "epoch": 0.5466516536575126, "grad_norm": 1.499029517173767, "learning_rate": 1.105293586433634e-05, "loss": 1.7208, "step": 21483 }, { "epoch": 0.5474404727825595, "grad_norm": 1.4107885360717773, "learning_rate": 1.0983316009946446e-05, "loss": 1.7199, "step": 21514 }, { "epoch": 0.5482292919076064, "grad_norm": 1.3750280141830444, "learning_rate": 1.0913854331930282e-05, "loss": 1.6973, "step": 21545 }, { "epoch": 0.5490181110326533, "grad_norm": 1.4192049503326416, "learning_rate": 1.0844551614153456e-05, "loss": 1.7034, "step": 21576 }, { "epoch": 0.5498069301577002, "grad_norm": 1.422545075416565, "learning_rate": 1.0775408638687725e-05, "loss": 1.7168, "step": 21607 }, { "epoch": 0.5505957492827471, "grad_norm": 1.4749089479446411, "learning_rate": 1.0706426185802165e-05, "loss": 1.7169, "step": 21638 }, { "epoch": 0.551384568407794, "grad_norm": 1.4102238416671753, "learning_rate": 1.0637605033954371e-05, "loss": 1.7195, "step": 21669 }, { "epoch": 0.552173387532841, "grad_norm": 1.4288344383239746, "learning_rate": 1.05689459597817e-05, "loss": 1.704, "step": 21700 }, { "epoch": 0.5529622066578879, "grad_norm": 1.4801214933395386, "learning_rate": 1.050044973809246e-05, "loss": 1.7011, "step": 21731 }, { "epoch": 0.5537510257829348, "grad_norm": 1.4600056409835815, "learning_rate": 1.043211714185722e-05, "loss": 1.714, "step": 21762 }, { "epoch": 0.5545398449079817, "grad_norm": 1.405286192893982, "learning_rate": 1.036394894220003e-05, "loss": 1.7098, "step": 21793 }, { "epoch": 0.5553286640330286, "grad_norm": 1.4454749822616577, "learning_rate": 1.0295945908389751e-05, "loss": 1.7152, "step": 21824 }, { "epoch": 0.5561174831580755, "grad_norm": 1.5381968021392822, "learning_rate": 1.0228108807831393e-05, "loss": 1.7199, "step": 21855 }, { "epoch": 0.5569063022831224, "grad_norm": 1.426140546798706, "learning_rate": 1.01604384060574e-05, "loss": 1.7317, "step": 21886 }, { "epoch": 0.5576951214081693, "grad_norm": 1.6093019247055054, "learning_rate": 1.009293546671907e-05, "loss": 1.6993, "step": 21917 }, { "epoch": 0.5584839405332163, "grad_norm": 1.344679355621338, "learning_rate": 1.002560075157791e-05, "loss": 1.7258, "step": 21948 }, { "epoch": 0.5592727596582632, "grad_norm": 1.3664970397949219, "learning_rate": 9.958435020496995e-06, "loss": 1.71, "step": 21979 }, { "epoch": 0.5600615787833101, "grad_norm": 1.457160234451294, "learning_rate": 9.89143903143249e-06, "loss": 1.7173, "step": 22010 }, { "epoch": 0.560850397908357, "grad_norm": 1.3795216083526611, "learning_rate": 9.824613540425038e-06, "loss": 1.6924, "step": 22041 }, { "epoch": 0.5616392170334039, "grad_norm": 1.3805229663848877, "learning_rate": 9.757959301591197e-06, "loss": 1.7149, "step": 22072 }, { "epoch": 0.5624280361584508, "grad_norm": 1.4958610534667969, "learning_rate": 9.691477067115017e-06, "loss": 1.7161, "step": 22103 }, { "epoch": 0.5632168552834977, "grad_norm": 1.4804275035858154, "learning_rate": 9.625167587239467e-06, "loss": 1.7023, "step": 22134 }, { "epoch": 0.5640056744085447, "grad_norm": 1.3880009651184082, "learning_rate": 9.559031610258007e-06, "loss": 1.7312, "step": 22165 }, { "epoch": 0.5647944935335916, "grad_norm": 1.429051160812378, "learning_rate": 9.493069882506164e-06, "loss": 1.7102, "step": 22196 }, { "epoch": 0.5655833126586385, "grad_norm": 1.4571672677993774, "learning_rate": 9.427283148353056e-06, "loss": 1.7234, "step": 22227 }, { "epoch": 0.5663721317836854, "grad_norm": 1.4143497943878174, "learning_rate": 9.361672150193052e-06, "loss": 1.7129, "step": 22258 }, { "epoch": 0.5671609509087323, "grad_norm": 1.4296061992645264, "learning_rate": 9.29623762843734e-06, "loss": 1.7291, "step": 22289 }, { "epoch": 0.5679497700337792, "grad_norm": 1.4027940034866333, "learning_rate": 9.230980321505594e-06, "loss": 1.7199, "step": 22320 }, { "epoch": 0.5687385891588261, "grad_norm": 1.4574463367462158, "learning_rate": 9.165900965817668e-06, "loss": 1.713, "step": 22351 }, { "epoch": 0.569527408283873, "grad_norm": 1.4593865871429443, "learning_rate": 9.101000295785245e-06, "loss": 1.7153, "step": 22382 }, { "epoch": 0.57031622740892, "grad_norm": 1.4154292345046997, "learning_rate": 9.036279043803565e-06, "loss": 1.7046, "step": 22413 }, { "epoch": 0.571105046533967, "grad_norm": 1.4303706884384155, "learning_rate": 8.971737940243147e-06, "loss": 1.6939, "step": 22444 }, { "epoch": 0.5718938656590139, "grad_norm": 1.4045100212097168, "learning_rate": 8.907377713441592e-06, "loss": 1.7022, "step": 22475 }, { "epoch": 0.5726826847840608, "grad_norm": 1.4179104566574097, "learning_rate": 8.843199089695293e-06, "loss": 1.6987, "step": 22506 }, { "epoch": 0.5734715039091077, "grad_norm": 1.4246447086334229, "learning_rate": 8.779202793251311e-06, "loss": 1.7187, "step": 22537 }, { "epoch": 0.5742603230341546, "grad_norm": 1.3932377099990845, "learning_rate": 8.715389546299149e-06, "loss": 1.7074, "step": 22568 }, { "epoch": 0.5750491421592016, "grad_norm": 1.3958845138549805, "learning_rate": 8.651760068962617e-06, "loss": 1.6959, "step": 22599 }, { "epoch": 0.5758379612842485, "grad_norm": 1.4608432054519653, "learning_rate": 8.588315079291733e-06, "loss": 1.6956, "step": 22630 }, { "epoch": 0.5766267804092954, "grad_norm": 1.4236079454421997, "learning_rate": 8.52505529325457e-06, "loss": 1.6964, "step": 22661 }, { "epoch": 0.5774155995343423, "grad_norm": 1.4737471342086792, "learning_rate": 8.461981424729216e-06, "loss": 1.7316, "step": 22692 }, { "epoch": 0.5782044186593892, "grad_norm": 1.4301661252975464, "learning_rate": 8.399094185495725e-06, "loss": 1.6969, "step": 22723 }, { "epoch": 0.5789932377844361, "grad_norm": 1.5357433557510376, "learning_rate": 8.336394285228017e-06, "loss": 1.7211, "step": 22754 }, { "epoch": 0.579782056909483, "grad_norm": 1.4420846700668335, "learning_rate": 8.273882431485952e-06, "loss": 1.7177, "step": 22785 }, { "epoch": 0.58057087603453, "grad_norm": 1.402849793434143, "learning_rate": 8.211559329707316e-06, "loss": 1.7008, "step": 22816 }, { "epoch": 0.5813596951595769, "grad_norm": 1.4084275960922241, "learning_rate": 8.149425683199823e-06, "loss": 1.7052, "step": 22847 }, { "epoch": 0.5821485142846238, "grad_norm": 1.406717300415039, "learning_rate": 8.08748219313325e-06, "loss": 1.7201, "step": 22878 }, { "epoch": 0.5829373334096707, "grad_norm": 1.9726225137710571, "learning_rate": 8.025729558531453e-06, "loss": 1.7197, "step": 22909 }, { "epoch": 0.5837261525347176, "grad_norm": 1.4474542140960693, "learning_rate": 7.964168476264508e-06, "loss": 1.7314, "step": 22940 }, { "epoch": 0.5845149716597645, "grad_norm": 1.5046030282974243, "learning_rate": 7.902799641040884e-06, "loss": 1.7128, "step": 22971 }, { "epoch": 0.5853037907848114, "grad_norm": 1.4233800172805786, "learning_rate": 7.841623745399523e-06, "loss": 1.7026, "step": 23002 }, { "epoch": 0.5860926099098583, "grad_norm": 1.4411020278930664, "learning_rate": 7.780641479702114e-06, "loss": 1.7039, "step": 23033 }, { "epoch": 0.5868814290349053, "grad_norm": 1.3648072481155396, "learning_rate": 7.719853532125227e-06, "loss": 1.6997, "step": 23064 }, { "epoch": 0.5876702481599522, "grad_norm": 1.3941482305526733, "learning_rate": 7.65926058865258e-06, "loss": 1.6947, "step": 23095 }, { "epoch": 0.5884590672849991, "grad_norm": 1.4287201166152954, "learning_rate": 7.598863333067313e-06, "loss": 1.7081, "step": 23126 }, { "epoch": 0.589247886410046, "grad_norm": 1.4891555309295654, "learning_rate": 7.538662446944253e-06, "loss": 1.6999, "step": 23157 }, { "epoch": 0.5900367055350929, "grad_norm": 1.4390950202941895, "learning_rate": 7.478658609642211e-06, "loss": 1.71, "step": 23188 }, { "epoch": 0.5908255246601398, "grad_norm": 1.4771630764007568, "learning_rate": 7.418852498296327e-06, "loss": 1.6975, "step": 23219 }, { "epoch": 0.5916143437851867, "grad_norm": 1.4118000268936157, "learning_rate": 7.359244787810457e-06, "loss": 1.7028, "step": 23250 }, { "epoch": 0.5924031629102336, "grad_norm": 1.4303267002105713, "learning_rate": 7.299836150849493e-06, "loss": 1.7052, "step": 23281 }, { "epoch": 0.5931919820352806, "grad_norm": 1.3951334953308105, "learning_rate": 7.240627257831847e-06, "loss": 1.711, "step": 23312 }, { "epoch": 0.5939808011603275, "grad_norm": 1.4434106349945068, "learning_rate": 7.1816187769218195e-06, "loss": 1.71, "step": 23343 }, { "epoch": 0.5947696202853744, "grad_norm": 1.4348808526992798, "learning_rate": 7.1228113740220895e-06, "loss": 1.7104, "step": 23374 }, { "epoch": 0.5955584394104213, "grad_norm": 1.4280933141708374, "learning_rate": 7.064205712766226e-06, "loss": 1.6948, "step": 23405 }, { "epoch": 0.5963472585354682, "grad_norm": 1.4204617738723755, "learning_rate": 7.005802454511129e-06, "loss": 1.7016, "step": 23436 }, { "epoch": 0.5971360776605151, "grad_norm": 1.3821487426757812, "learning_rate": 6.947602258329639e-06, "loss": 1.6919, "step": 23467 }, { "epoch": 0.597924896785562, "grad_norm": 1.4799888134002686, "learning_rate": 6.889605781003078e-06, "loss": 1.7245, "step": 23498 }, { "epoch": 0.598713715910609, "grad_norm": 1.4447741508483887, "learning_rate": 6.831813677013776e-06, "loss": 1.7352, "step": 23529 }, { "epoch": 0.5995025350356559, "grad_norm": 1.5367285013198853, "learning_rate": 6.774226598537792e-06, "loss": 1.7047, "step": 23560 }, { "epoch": 0.6002913541607028, "grad_norm": 1.4005663394927979, "learning_rate": 6.716845195437482e-06, "loss": 1.7021, "step": 23591 }, { "epoch": 0.6010801732857497, "grad_norm": 1.4289170503616333, "learning_rate": 6.659670115254168e-06, "loss": 1.7093, "step": 23622 }, { "epoch": 0.6018689924107966, "grad_norm": 1.5853567123413086, "learning_rate": 6.602702003200872e-06, "loss": 1.7075, "step": 23653 }, { "epoch": 0.6026578115358435, "grad_norm": 1.580708622932434, "learning_rate": 6.545941502154992e-06, "loss": 1.7041, "step": 23684 }, { "epoch": 0.6034466306608904, "grad_norm": 1.477163553237915, "learning_rate": 6.489389252651057e-06, "loss": 1.7145, "step": 23715 }, { "epoch": 0.6042354497859374, "grad_norm": 1.428688883781433, "learning_rate": 6.4330458928735325e-06, "loss": 1.6906, "step": 23746 }, { "epoch": 0.6050242689109843, "grad_norm": 1.4114421606063843, "learning_rate": 6.376912058649559e-06, "loss": 1.7116, "step": 23777 }, { "epoch": 0.6058130880360312, "grad_norm": 1.472838044166565, "learning_rate": 6.320988383441845e-06, "loss": 1.6997, "step": 23808 }, { "epoch": 0.6066019071610781, "grad_norm": 1.4391415119171143, "learning_rate": 6.265275498341452e-06, "loss": 1.7061, "step": 23839 }, { "epoch": 0.607390726286125, "grad_norm": 1.4201316833496094, "learning_rate": 6.209774032060714e-06, "loss": 1.6964, "step": 23870 }, { "epoch": 0.6081795454111719, "grad_norm": 1.3875731229782104, "learning_rate": 6.1544846109261365e-06, "loss": 1.7004, "step": 23901 }, { "epoch": 0.6089683645362188, "grad_norm": 1.4405038356781006, "learning_rate": 6.099407858871342e-06, "loss": 1.7059, "step": 23932 }, { "epoch": 0.6097571836612657, "grad_norm": 1.4719831943511963, "learning_rate": 6.044544397429958e-06, "loss": 1.7063, "step": 23963 }, { "epoch": 0.6105460027863128, "grad_norm": 1.4867260456085205, "learning_rate": 5.989894845728708e-06, "loss": 1.7054, "step": 23994 }, { "epoch": 0.6113348219113597, "grad_norm": 1.4767833948135376, "learning_rate": 5.9354598204803605e-06, "loss": 1.6951, "step": 24025 }, { "epoch": 0.6121236410364066, "grad_norm": 1.4202378988265991, "learning_rate": 5.881239935976762e-06, "loss": 1.7042, "step": 24056 }, { "epoch": 0.6129124601614535, "grad_norm": 1.4658666849136353, "learning_rate": 5.827235804081954e-06, "loss": 1.7011, "step": 24087 }, { "epoch": 0.6137012792865004, "grad_norm": 1.437771201133728, "learning_rate": 5.773448034225221e-06, "loss": 1.7033, "step": 24118 }, { "epoch": 0.6144900984115473, "grad_norm": 1.4407992362976074, "learning_rate": 5.719877233394228e-06, "loss": 1.6841, "step": 24149 }, { "epoch": 0.6152789175365942, "grad_norm": 1.434173822402954, "learning_rate": 5.666524006128191e-06, "loss": 1.6893, "step": 24180 }, { "epoch": 0.6160677366616412, "grad_norm": 1.5241893529891968, "learning_rate": 5.613388954511015e-06, "loss": 1.707, "step": 24211 }, { "epoch": 0.6168565557866881, "grad_norm": 1.4565976858139038, "learning_rate": 5.560472678164552e-06, "loss": 1.695, "step": 24242 }, { "epoch": 0.617645374911735, "grad_norm": 1.458123803138733, "learning_rate": 5.507775774241775e-06, "loss": 1.6988, "step": 24273 }, { "epoch": 0.6184341940367819, "grad_norm": 1.4085556268692017, "learning_rate": 5.4552988374200945e-06, "loss": 1.6986, "step": 24304 }, { "epoch": 0.6192230131618288, "grad_norm": 1.444799542427063, "learning_rate": 5.403042459894597e-06, "loss": 1.7036, "step": 24335 }, { "epoch": 0.6200118322868757, "grad_norm": 1.417597770690918, "learning_rate": 5.3510072313714135e-06, "loss": 1.7016, "step": 24366 }, { "epoch": 0.6208006514119226, "grad_norm": 1.4727599620819092, "learning_rate": 5.2991937390610205e-06, "loss": 1.7198, "step": 24397 }, { "epoch": 0.6215894705369696, "grad_norm": 1.407718300819397, "learning_rate": 5.247602567671625e-06, "loss": 1.6932, "step": 24428 }, { "epoch": 0.6223782896620165, "grad_norm": 1.424126148223877, "learning_rate": 5.196234299402603e-06, "loss": 1.6927, "step": 24459 }, { "epoch": 0.6231671087870634, "grad_norm": 1.5233465433120728, "learning_rate": 5.145089513937865e-06, "loss": 1.7072, "step": 24490 }, { "epoch": 0.6239559279121103, "grad_norm": 1.427517056465149, "learning_rate": 5.094168788439369e-06, "loss": 1.6981, "step": 24521 }, { "epoch": 0.6247447470371572, "grad_norm": 1.4485392570495605, "learning_rate": 5.043472697540594e-06, "loss": 1.6855, "step": 24552 }, { "epoch": 0.6255335661622041, "grad_norm": 1.4278972148895264, "learning_rate": 4.993001813340012e-06, "loss": 1.6945, "step": 24583 }, { "epoch": 0.626322385287251, "grad_norm": 1.392105221748352, "learning_rate": 4.942756705394702e-06, "loss": 1.6946, "step": 24614 }, { "epoch": 0.627111204412298, "grad_norm": 1.4224188327789307, "learning_rate": 4.892737940713884e-06, "loss": 1.7071, "step": 24645 }, { "epoch": 0.6279000235373449, "grad_norm": 1.4652680158615112, "learning_rate": 4.842946083752511e-06, "loss": 1.6967, "step": 24676 }, { "epoch": 0.6286888426623918, "grad_norm": 1.490435004234314, "learning_rate": 4.79338169640493e-06, "loss": 1.6873, "step": 24707 }, { "epoch": 0.6294776617874387, "grad_norm": 1.54020357131958, "learning_rate": 4.74404533799851e-06, "loss": 1.7026, "step": 24738 }, { "epoch": 0.6302664809124856, "grad_norm": 1.3947267532348633, "learning_rate": 4.694937565287344e-06, "loss": 1.6959, "step": 24769 }, { "epoch": 0.6310553000375325, "grad_norm": 1.4161572456359863, "learning_rate": 4.646058932445985e-06, "loss": 1.6909, "step": 24800 }, { "epoch": 0.6318441191625794, "grad_norm": 1.4541959762573242, "learning_rate": 4.597409991063148e-06, "loss": 1.6961, "step": 24831 }, { "epoch": 0.6326329382876263, "grad_norm": 1.410683035850525, "learning_rate": 4.5489912901355375e-06, "loss": 1.6846, "step": 24862 }, { "epoch": 0.6334217574126733, "grad_norm": 1.4031442403793335, "learning_rate": 4.500803376061608e-06, "loss": 1.6855, "step": 24893 }, { "epoch": 0.6342105765377202, "grad_norm": 1.3770359754562378, "learning_rate": 4.45284679263541e-06, "loss": 1.6989, "step": 24924 }, { "epoch": 0.6349993956627671, "grad_norm": 1.4767192602157593, "learning_rate": 4.4051220810404775e-06, "loss": 1.6911, "step": 24955 }, { "epoch": 0.635788214787814, "grad_norm": 1.4399274587631226, "learning_rate": 4.3576297798437025e-06, "loss": 1.7003, "step": 24986 }, { "epoch": 0.6365770339128609, "grad_norm": 1.3938783407211304, "learning_rate": 4.3103704249892436e-06, "loss": 1.7098, "step": 25017 }, { "epoch": 0.6373658530379078, "grad_norm": 1.4374542236328125, "learning_rate": 4.263344549792487e-06, "loss": 1.6949, "step": 25048 }, { "epoch": 0.6381546721629547, "grad_norm": 1.443415641784668, "learning_rate": 4.216552684934056e-06, "loss": 1.7002, "step": 25079 }, { "epoch": 0.6389434912880017, "grad_norm": 1.455540418624878, "learning_rate": 4.169995358453777e-06, "loss": 1.7018, "step": 25110 }, { "epoch": 0.6397323104130486, "grad_norm": 1.4947654008865356, "learning_rate": 4.123673095744757e-06, "loss": 1.681, "step": 25141 }, { "epoch": 0.6405211295380955, "grad_norm": 1.4933280944824219, "learning_rate": 4.077586419547435e-06, "loss": 1.703, "step": 25172 }, { "epoch": 0.6413099486631424, "grad_norm": 1.4724138975143433, "learning_rate": 4.03173584994368e-06, "loss": 1.6987, "step": 25203 }, { "epoch": 0.6420987677881893, "grad_norm": 1.370006799697876, "learning_rate": 3.986121904350948e-06, "loss": 1.6881, "step": 25234 }, { "epoch": 0.6428875869132362, "grad_norm": 1.5258022546768188, "learning_rate": 3.940745097516407e-06, "loss": 1.6856, "step": 25265 }, { "epoch": 0.6436764060382831, "grad_norm": 1.3982164859771729, "learning_rate": 3.89560594151116e-06, "loss": 1.6956, "step": 25296 }, { "epoch": 0.64446522516333, "grad_norm": 1.457051396369934, "learning_rate": 3.850704945724456e-06, "loss": 1.7038, "step": 25327 }, { "epoch": 0.645254044288377, "grad_norm": 1.4047811031341553, "learning_rate": 3.8060426168579077e-06, "loss": 1.6984, "step": 25358 }, { "epoch": 0.6460428634134239, "grad_norm": 1.3755521774291992, "learning_rate": 3.7616194589198407e-06, "loss": 1.7016, "step": 25389 }, { "epoch": 0.6468316825384708, "grad_norm": 1.4575284719467163, "learning_rate": 3.7174359732195574e-06, "loss": 1.6907, "step": 25420 }, { "epoch": 0.6476205016635177, "grad_norm": 1.563887357711792, "learning_rate": 3.673492658361677e-06, "loss": 1.7141, "step": 25451 }, { "epoch": 0.6484093207885646, "grad_norm": 1.4307068586349487, "learning_rate": 3.6297900102405467e-06, "loss": 1.7018, "step": 25482 }, { "epoch": 0.6491981399136116, "grad_norm": 1.4639099836349487, "learning_rate": 3.586328522034607e-06, "loss": 1.7162, "step": 25513 }, { "epoch": 0.6499869590386586, "grad_norm": 1.4759560823440552, "learning_rate": 3.543108684200838e-06, "loss": 1.6893, "step": 25544 }, { "epoch": 0.6507757781637055, "grad_norm": 1.4981391429901123, "learning_rate": 3.5001309844692464e-06, "loss": 1.7037, "step": 25575 }, { "epoch": 0.6515645972887524, "grad_norm": 1.4637056589126587, "learning_rate": 3.4573959078373215e-06, "loss": 1.683, "step": 25606 }, { "epoch": 0.6523534164137993, "grad_norm": 1.5560393333435059, "learning_rate": 3.4149039365646063e-06, "loss": 1.6843, "step": 25637 }, { "epoch": 0.6531422355388462, "grad_norm": 1.4658019542694092, "learning_rate": 3.3726555501672143e-06, "loss": 1.6883, "step": 25668 }, { "epoch": 0.6539310546638931, "grad_norm": 1.397363543510437, "learning_rate": 3.33065122541244e-06, "loss": 1.7005, "step": 25699 }, { "epoch": 0.65471987378894, "grad_norm": 1.439571738243103, "learning_rate": 3.288891436313385e-06, "loss": 1.7144, "step": 25730 }, { "epoch": 0.655508692913987, "grad_norm": 1.4690093994140625, "learning_rate": 3.2473766541235963e-06, "loss": 1.6918, "step": 25761 }, { "epoch": 0.6562975120390339, "grad_norm": 1.4217287302017212, "learning_rate": 3.2061073473317466e-06, "loss": 1.6902, "step": 25792 }, { "epoch": 0.6570863311640808, "grad_norm": 1.3401572704315186, "learning_rate": 3.1650839816563444e-06, "loss": 1.6949, "step": 25823 }, { "epoch": 0.6578751502891277, "grad_norm": 1.4836351871490479, "learning_rate": 3.1243070200405093e-06, "loss": 1.6989, "step": 25854 }, { "epoch": 0.6586639694141746, "grad_norm": 1.3935353755950928, "learning_rate": 3.0837769226467e-06, "loss": 1.6926, "step": 25885 }, { "epoch": 0.6594527885392215, "grad_norm": 1.4989404678344727, "learning_rate": 3.0434941468515666e-06, "loss": 1.6857, "step": 25916 }, { "epoch": 0.6602416076642684, "grad_norm": 1.4674372673034668, "learning_rate": 3.003459147240753e-06, "loss": 1.6912, "step": 25947 }, { "epoch": 0.6610304267893153, "grad_norm": 1.5865478515625, "learning_rate": 2.9636723756037875e-06, "loss": 1.6904, "step": 25978 }, { "epoch": 0.6618192459143623, "grad_norm": 1.440338134765625, "learning_rate": 2.9241342809289833e-06, "loss": 1.6981, "step": 26009 }, { "epoch": 0.6626080650394092, "grad_norm": 1.3906199932098389, "learning_rate": 2.8848453093983594e-06, "loss": 1.6854, "step": 26040 }, { "epoch": 0.6633968841644561, "grad_norm": 1.475035309791565, "learning_rate": 2.8458059043826257e-06, "loss": 1.704, "step": 26071 }, { "epoch": 0.664185703289503, "grad_norm": 1.4185906648635864, "learning_rate": 2.807016506436172e-06, "loss": 1.6873, "step": 26102 }, { "epoch": 0.6649745224145499, "grad_norm": 1.5231366157531738, "learning_rate": 2.7684775532920566e-06, "loss": 1.7009, "step": 26133 }, { "epoch": 0.6657633415395968, "grad_norm": 1.427589774131775, "learning_rate": 2.7301894798571425e-06, "loss": 1.7065, "step": 26164 }, { "epoch": 0.6665521606646437, "grad_norm": 1.4298368692398071, "learning_rate": 2.6921527182071386e-06, "loss": 1.6944, "step": 26195 }, { "epoch": 0.6673409797896906, "grad_norm": 1.498779058456421, "learning_rate": 2.654367697581725e-06, "loss": 1.6845, "step": 26226 }, { "epoch": 0.6681297989147376, "grad_norm": 1.4032225608825684, "learning_rate": 2.6168348443797175e-06, "loss": 1.6936, "step": 26257 }, { "epoch": 0.6689186180397845, "grad_norm": 1.4571456909179688, "learning_rate": 2.5795545821542757e-06, "loss": 1.7055, "step": 26288 }, { "epoch": 0.6697074371648314, "grad_norm": 1.4156779050827026, "learning_rate": 2.54252733160808e-06, "loss": 1.6861, "step": 26319 }, { "epoch": 0.6704962562898783, "grad_norm": 1.5022954940795898, "learning_rate": 2.5057535105886294e-06, "loss": 1.6834, "step": 26350 }, { "epoch": 0.6712850754149252, "grad_norm": 1.4164525270462036, "learning_rate": 2.4692335340834953e-06, "loss": 1.699, "step": 26381 }, { "epoch": 0.6720738945399721, "grad_norm": 1.459639072418213, "learning_rate": 2.432967814215639e-06, "loss": 1.6987, "step": 26412 }, { "epoch": 0.672862713665019, "grad_norm": 1.4693511724472046, "learning_rate": 2.396956760238794e-06, "loss": 1.701, "step": 26443 }, { "epoch": 0.673651532790066, "grad_norm": 1.3714548349380493, "learning_rate": 2.361200778532796e-06, "loss": 1.6754, "step": 26474 }, { "epoch": 0.6744403519151129, "grad_norm": 1.4285922050476074, "learning_rate": 2.325700272599049e-06, "loss": 1.6907, "step": 26505 }, { "epoch": 0.6752291710401598, "grad_norm": 1.5615297555923462, "learning_rate": 2.2904556430559415e-06, "loss": 1.6932, "step": 26536 }, { "epoch": 0.6760179901652067, "grad_norm": 1.3956187963485718, "learning_rate": 2.2554672876343106e-06, "loss": 1.7064, "step": 26567 }, { "epoch": 0.6768068092902536, "grad_norm": 1.4564794301986694, "learning_rate": 2.220735601173002e-06, "loss": 1.6922, "step": 26598 }, { "epoch": 0.6775956284153005, "grad_norm": 1.4553749561309814, "learning_rate": 2.186260975614382e-06, "loss": 1.7055, "step": 26629 }, { "epoch": 0.6783844475403474, "grad_norm": 1.4266986846923828, "learning_rate": 2.1520437999999034e-06, "loss": 1.7145, "step": 26660 }, { "epoch": 0.6791732666653943, "grad_norm": 1.4530359506607056, "learning_rate": 2.1180844604657526e-06, "loss": 1.6916, "step": 26691 }, { "epoch": 0.6799620857904413, "grad_norm": 1.4178498983383179, "learning_rate": 2.084383340238455e-06, "loss": 1.6766, "step": 26722 }, { "epoch": 0.6807509049154882, "grad_norm": 1.393988847732544, "learning_rate": 2.0509408196305704e-06, "loss": 1.6873, "step": 26753 }, { "epoch": 0.6815397240405351, "grad_norm": 1.3752752542495728, "learning_rate": 2.017757276036403e-06, "loss": 1.6984, "step": 26784 }, { "epoch": 0.682328543165582, "grad_norm": 1.394559383392334, "learning_rate": 1.984833083927726e-06, "loss": 1.7032, "step": 26815 }, { "epoch": 0.6831173622906289, "grad_norm": 1.4148964881896973, "learning_rate": 1.952168614849581e-06, "loss": 1.6844, "step": 26846 }, { "epoch": 0.6839061814156758, "grad_norm": 1.4353492259979248, "learning_rate": 1.919764237416058e-06, "loss": 1.7102, "step": 26877 }, { "epoch": 0.6846950005407227, "grad_norm": 1.3867477178573608, "learning_rate": 1.8876203173061463e-06, "loss": 1.6931, "step": 26908 }, { "epoch": 0.6854838196657697, "grad_norm": 1.3678532838821411, "learning_rate": 1.8557372172596206e-06, "loss": 1.7009, "step": 26939 }, { "epoch": 0.6862726387908166, "grad_norm": 1.485137939453125, "learning_rate": 1.8241152970729341e-06, "loss": 1.7016, "step": 26970 }, { "epoch": 0.6870614579158635, "grad_norm": 1.4369994401931763, "learning_rate": 1.7927549135951572e-06, "loss": 1.6963, "step": 27001 }, { "epoch": 0.6878502770409104, "grad_norm": 1.4508947134017944, "learning_rate": 1.7616564207239477e-06, "loss": 1.6831, "step": 27032 }, { "epoch": 0.6886390961659574, "grad_norm": 1.3917666673660278, "learning_rate": 1.730820169401584e-06, "loss": 1.6708, "step": 27063 }, { "epoch": 0.6894279152910043, "grad_norm": 1.4687188863754272, "learning_rate": 1.7002465076109558e-06, "loss": 1.6764, "step": 27094 }, { "epoch": 0.6902167344160512, "grad_norm": 1.4053486585617065, "learning_rate": 1.6699357803716898e-06, "loss": 1.6931, "step": 27125 }, { "epoch": 0.6910055535410982, "grad_norm": 1.4645016193389893, "learning_rate": 1.6398883297362305e-06, "loss": 1.7035, "step": 27156 }, { "epoch": 0.6917943726661451, "grad_norm": 1.434180498123169, "learning_rate": 1.6101044947859606e-06, "loss": 1.6693, "step": 27187 }, { "epoch": 0.692583191791192, "grad_norm": 1.451497197151184, "learning_rate": 1.5805846116274114e-06, "loss": 1.6776, "step": 27218 }, { "epoch": 0.6933720109162389, "grad_norm": 1.4085174798965454, "learning_rate": 1.5513290133884611e-06, "loss": 1.684, "step": 27249 }, { "epoch": 0.6941608300412858, "grad_norm": 1.3959869146347046, "learning_rate": 1.5223380302145512e-06, "loss": 1.6798, "step": 27280 }, { "epoch": 0.6949496491663327, "grad_norm": 1.377614974975586, "learning_rate": 1.4936119892649925e-06, "loss": 1.6889, "step": 27311 }, { "epoch": 0.6957384682913796, "grad_norm": 1.4654227495193481, "learning_rate": 1.4651512147092482e-06, "loss": 1.7027, "step": 27342 }, { "epoch": 0.6965272874164266, "grad_norm": 1.336857795715332, "learning_rate": 1.4369560277232908e-06, "loss": 1.6756, "step": 27373 }, { "epoch": 0.6973161065414735, "grad_norm": 1.4347259998321533, "learning_rate": 1.409026746485978e-06, "loss": 1.6831, "step": 27404 }, { "epoch": 0.6981049256665204, "grad_norm": 1.5176235437393188, "learning_rate": 1.3813636861754464e-06, "loss": 1.6864, "step": 27435 }, { "epoch": 0.6988937447915673, "grad_norm": 1.4501276016235352, "learning_rate": 1.3539671589655773e-06, "loss": 1.6941, "step": 27466 } ], "logging_steps": 31, "max_steps": 30517, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 3052, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.037550548620044e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }