diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6 +1,6 @@ { - "best_metric": 0.04101279005408287, - "best_model_checkpoint": "./test_default_model/checkpoint-3962", + "best_metric": 0.038467586040496826, + "best_model_checkpoint": "./test_microsoft_dit/checkpoint-7924", "epoch": 5.0, "eval_steps": 500, "global_step": 9905, @@ -10,6987 +10,6987 @@ "log_history": [ { "epoch": 0.005047955577990914, - "grad_norm": 1.0162577629089355, + "grad_norm": 0.8398004174232483, "learning_rate": 2.9969712266532054e-05, - "loss": 0.2788, + "loss": 0.3087, "step": 10 }, { "epoch": 0.010095911155981827, - "grad_norm": 1.099901556968689, + "grad_norm": 1.147126317024231, "learning_rate": 2.993942453306411e-05, - "loss": 0.1588, + "loss": 0.202, "step": 20 }, { "epoch": 0.01514386673397274, - "grad_norm": 0.8216469883918762, + "grad_norm": 1.1376692056655884, "learning_rate": 2.9909136799596164e-05, - "loss": 0.1325, + "loss": 0.1375, "step": 30 }, { "epoch": 0.020191822311963654, - "grad_norm": 1.0275465250015259, + "grad_norm": 3.0222654342651367, "learning_rate": 2.987884906612822e-05, - "loss": 0.1362, + "loss": 0.1254, "step": 40 }, { "epoch": 0.02523977788995457, - "grad_norm": 0.49435922503471375, + "grad_norm": 1.3963178396224976, "learning_rate": 2.9848561332660275e-05, - "loss": 0.1235, + "loss": 0.1105, "step": 50 }, { "epoch": 0.03028773346794548, - "grad_norm": 0.6702704429626465, + "grad_norm": 0.741131067276001, "learning_rate": 2.9818273599192328e-05, - "loss": 0.1141, + "loss": 0.1022, "step": 60 }, { "epoch": 0.0353356890459364, - "grad_norm": 0.5657111406326294, + "grad_norm": 1.0705397129058838, "learning_rate": 2.978798586572438e-05, - "loss": 0.1125, + "loss": 0.1027, "step": 70 }, { "epoch": 0.04038364462392731, - "grad_norm": 1.2878154516220093, + "grad_norm": 1.127729892730713, "learning_rate": 2.9757698132256435e-05, - "loss": 0.1172, + "loss": 0.0979, "step": 80 }, { "epoch": 0.04543160020191822, - "grad_norm": 1.3100908994674683, + "grad_norm": 0.888960063457489, "learning_rate": 2.9727410398788492e-05, - "loss": 0.1222, + "loss": 0.1024, "step": 90 }, { "epoch": 0.05047955577990914, - "grad_norm": 0.923688530921936, + "grad_norm": 0.9185839295387268, "learning_rate": 2.9697122665320545e-05, - "loss": 0.1295, + "loss": 0.1142, "step": 100 }, { "epoch": 0.05552751135790005, - "grad_norm": 0.7309263944625854, + "grad_norm": 0.737047016620636, "learning_rate": 2.96668349318526e-05, - "loss": 0.1163, + "loss": 0.0956, "step": 110 }, { "epoch": 0.06057546693589096, - "grad_norm": 0.6152750849723816, + "grad_norm": 0.7749747037887573, "learning_rate": 2.9636547198384656e-05, - "loss": 0.1134, + "loss": 0.0978, "step": 120 }, { "epoch": 0.06562342251388188, - "grad_norm": 0.4956571161746979, + "grad_norm": 1.079695224761963, "learning_rate": 2.960625946491671e-05, - "loss": 0.1107, + "loss": 0.092, "step": 130 }, { "epoch": 0.0706713780918728, - "grad_norm": 0.7335121035575867, + "grad_norm": 0.8315634727478027, "learning_rate": 2.9575971731448766e-05, - "loss": 0.1209, + "loss": 0.0975, "step": 140 }, { "epoch": 0.0757193336698637, - "grad_norm": 0.873475193977356, + "grad_norm": 0.7270865440368652, "learning_rate": 2.954568399798082e-05, - "loss": 0.1192, + "loss": 0.098, "step": 150 }, { "epoch": 0.08076728924785462, - "grad_norm": 0.539779543876648, + "grad_norm": 0.5786823630332947, "learning_rate": 2.9515396264512873e-05, - "loss": 0.0961, + "loss": 0.0846, "step": 160 }, { "epoch": 0.08581524482584553, - "grad_norm": 0.8240886926651001, + "grad_norm": 0.7117003798484802, "learning_rate": 2.948510853104493e-05, - "loss": 0.1111, + "loss": 0.0905, "step": 170 }, { "epoch": 0.09086320040383644, - "grad_norm": 0.8032135367393494, + "grad_norm": 0.6765159368515015, "learning_rate": 2.9454820797576983e-05, - "loss": 0.0917, + "loss": 0.0764, "step": 180 }, { "epoch": 0.09591115598182735, - "grad_norm": 1.6522753238677979, + "grad_norm": 1.1397738456726074, "learning_rate": 2.9424533064109037e-05, - "loss": 0.1093, + "loss": 0.0882, "step": 190 }, { "epoch": 0.10095911155981828, - "grad_norm": 0.4631141126155853, + "grad_norm": 0.6545870900154114, "learning_rate": 2.939424533064109e-05, - "loss": 0.1168, + "loss": 0.0991, "step": 200 }, { "epoch": 0.10600706713780919, - "grad_norm": 0.7879306077957153, + "grad_norm": 0.8882391452789307, "learning_rate": 2.9363957597173144e-05, - "loss": 0.1071, + "loss": 0.0902, "step": 210 }, { "epoch": 0.1110550227158001, - "grad_norm": 0.6251317262649536, + "grad_norm": 0.5973140001296997, "learning_rate": 2.93336698637052e-05, - "loss": 0.1183, + "loss": 0.0968, "step": 220 }, { "epoch": 0.11610297829379101, - "grad_norm": 1.182413935661316, + "grad_norm": 1.3215384483337402, "learning_rate": 2.9303382130237254e-05, - "loss": 0.1046, + "loss": 0.0901, "step": 230 }, { "epoch": 0.12115093387178193, - "grad_norm": 0.5547954440116882, + "grad_norm": 0.6139042973518372, "learning_rate": 2.9273094396769307e-05, - "loss": 0.0862, + "loss": 0.0739, "step": 240 }, { "epoch": 0.12619888944977284, - "grad_norm": 0.9633322954177856, + "grad_norm": 0.9095037579536438, "learning_rate": 2.9242806663301364e-05, - "loss": 0.1027, + "loss": 0.0907, "step": 250 }, { "epoch": 0.13124684502776376, - "grad_norm": 0.3100033402442932, + "grad_norm": 1.0266954898834229, "learning_rate": 2.9212518929833418e-05, - "loss": 0.0826, + "loss": 0.0726, "step": 260 }, { "epoch": 0.13629480060575466, - "grad_norm": 0.6877946257591248, + "grad_norm": 0.734716534614563, "learning_rate": 2.9182231196365474e-05, - "loss": 0.1009, + "loss": 0.0891, "step": 270 }, { "epoch": 0.1413427561837456, - "grad_norm": 0.6235649585723877, + "grad_norm": 0.7633081674575806, "learning_rate": 2.9151943462897528e-05, - "loss": 0.0923, + "loss": 0.0747, "step": 280 }, { "epoch": 0.1463907117617365, - "grad_norm": 0.4079645276069641, + "grad_norm": 0.8185615539550781, "learning_rate": 2.912165572942958e-05, - "loss": 0.087, + "loss": 0.0815, "step": 290 }, { "epoch": 0.1514386673397274, - "grad_norm": 0.4664750397205353, + "grad_norm": 1.2503191232681274, "learning_rate": 2.9091367995961638e-05, - "loss": 0.0972, + "loss": 0.0844, "step": 300 }, { "epoch": 0.15648662291771834, - "grad_norm": 0.49487921595573425, + "grad_norm": 0.52531898021698, "learning_rate": 2.906108026249369e-05, - "loss": 0.101, + "loss": 0.0863, "step": 310 }, { "epoch": 0.16153457849570924, - "grad_norm": 0.4716992974281311, + "grad_norm": 0.8883135914802551, "learning_rate": 2.9030792529025745e-05, - "loss": 0.0976, + "loss": 0.0833, "step": 320 }, { "epoch": 0.16658253407370016, - "grad_norm": 0.5814321637153625, + "grad_norm": 0.5173369646072388, "learning_rate": 2.90005047955578e-05, - "loss": 0.1011, + "loss": 0.0882, "step": 330 }, { "epoch": 0.17163048965169106, - "grad_norm": 0.48927783966064453, + "grad_norm": 0.5770648717880249, "learning_rate": 2.8970217062089852e-05, - "loss": 0.0954, + "loss": 0.0814, "step": 340 }, { "epoch": 0.17667844522968199, - "grad_norm": 0.46283578872680664, + "grad_norm": 0.8828192949295044, "learning_rate": 2.893992932862191e-05, - "loss": 0.0893, + "loss": 0.0776, "step": 350 }, { "epoch": 0.18172640080767288, - "grad_norm": 0.39830565452575684, + "grad_norm": 0.756236732006073, "learning_rate": 2.8909641595153962e-05, - "loss": 0.0906, + "loss": 0.0736, "step": 360 }, { "epoch": 0.1867743563856638, - "grad_norm": 0.39141398668289185, + "grad_norm": 0.47730007767677307, "learning_rate": 2.887935386168602e-05, - "loss": 0.0934, + "loss": 0.0856, "step": 370 }, { "epoch": 0.1918223119636547, - "grad_norm": 0.5772321224212646, + "grad_norm": 2.5338025093078613, "learning_rate": 2.8849066128218072e-05, - "loss": 0.096, + "loss": 0.0879, "step": 380 }, { "epoch": 0.19687026754164563, - "grad_norm": 0.5649458765983582, + "grad_norm": 0.6218165159225464, "learning_rate": 2.8818778394750126e-05, - "loss": 0.0869, + "loss": 0.0724, "step": 390 }, { "epoch": 0.20191822311963656, - "grad_norm": 0.9187427163124084, + "grad_norm": 1.1621041297912598, "learning_rate": 2.8788490661282183e-05, - "loss": 0.0895, + "loss": 0.0742, "step": 400 }, { "epoch": 0.20696617869762746, - "grad_norm": 0.555701494216919, + "grad_norm": 0.8511998653411865, "learning_rate": 2.8758202927814236e-05, - "loss": 0.0932, + "loss": 0.0798, "step": 410 }, { "epoch": 0.21201413427561838, - "grad_norm": 0.4509018361568451, + "grad_norm": 0.5848472118377686, "learning_rate": 2.8727915194346293e-05, - "loss": 0.094, + "loss": 0.0834, "step": 420 }, { "epoch": 0.21706208985360928, - "grad_norm": 0.47534915804862976, + "grad_norm": 0.5747645497322083, "learning_rate": 2.8697627460878346e-05, - "loss": 0.0887, + "loss": 0.0745, "step": 430 }, { "epoch": 0.2221100454316002, - "grad_norm": 1.2933378219604492, + "grad_norm": 1.058206558227539, "learning_rate": 2.86673397274104e-05, - "loss": 0.0931, + "loss": 0.0767, "step": 440 }, { "epoch": 0.2271580010095911, - "grad_norm": 0.8813143968582153, + "grad_norm": 0.8267918825149536, "learning_rate": 2.8637051993942453e-05, - "loss": 0.1047, + "loss": 0.0893, "step": 450 }, { "epoch": 0.23220595658758203, - "grad_norm": 0.6796084642410278, + "grad_norm": 1.1392240524291992, "learning_rate": 2.8606764260474507e-05, - "loss": 0.0979, + "loss": 0.0833, "step": 460 }, { "epoch": 0.23725391216557296, - "grad_norm": 0.8377231359481812, + "grad_norm": 0.9474436044692993, "learning_rate": 2.8576476527006564e-05, - "loss": 0.0994, + "loss": 0.0896, "step": 470 }, { "epoch": 0.24230186774356385, - "grad_norm": 0.45192497968673706, + "grad_norm": 1.2880048751831055, "learning_rate": 2.8546188793538617e-05, - "loss": 0.1016, + "loss": 0.0924, "step": 480 }, { "epoch": 0.24734982332155478, - "grad_norm": 0.6596019268035889, + "grad_norm": 0.6342403888702393, "learning_rate": 2.851590106007067e-05, - "loss": 0.0929, + "loss": 0.0799, "step": 490 }, { "epoch": 0.2523977788995457, - "grad_norm": 0.33004361391067505, + "grad_norm": 0.5780256986618042, "learning_rate": 2.8485613326602727e-05, - "loss": 0.0893, + "loss": 0.0798, "step": 500 }, { "epoch": 0.2574457344775366, - "grad_norm": 0.5023931264877319, + "grad_norm": 0.7743504643440247, "learning_rate": 2.845532559313478e-05, - "loss": 0.0766, + "loss": 0.0681, "step": 510 }, { "epoch": 0.26249369005552753, - "grad_norm": 0.4122072458267212, + "grad_norm": 0.5771861672401428, "learning_rate": 2.8425037859666834e-05, - "loss": 0.0808, + "loss": 0.0753, "step": 520 }, { "epoch": 0.2675416456335184, - "grad_norm": 0.43148109316825867, + "grad_norm": 0.6735575199127197, "learning_rate": 2.839475012619889e-05, - "loss": 0.0852, + "loss": 0.0773, "step": 530 }, { "epoch": 0.2725896012115093, - "grad_norm": 0.4459318518638611, + "grad_norm": 0.7692667841911316, "learning_rate": 2.8364462392730945e-05, - "loss": 0.0857, + "loss": 0.0732, "step": 540 }, { "epoch": 0.27763755678950025, - "grad_norm": 0.47170397639274597, + "grad_norm": 0.5109196901321411, "learning_rate": 2.8334174659263e-05, - "loss": 0.1001, + "loss": 0.0859, "step": 550 }, { "epoch": 0.2826855123674912, - "grad_norm": 0.4913211464881897, + "grad_norm": 0.726249098777771, "learning_rate": 2.8303886925795055e-05, - "loss": 0.0927, + "loss": 0.0801, "step": 560 }, { "epoch": 0.2877334679454821, - "grad_norm": 0.5056201219558716, + "grad_norm": 0.8817322254180908, "learning_rate": 2.8273599192327108e-05, - "loss": 0.0853, + "loss": 0.0739, "step": 570 }, { "epoch": 0.292781423523473, - "grad_norm": 0.9730571508407593, + "grad_norm": 0.5081413984298706, "learning_rate": 2.8243311458859162e-05, - "loss": 0.0885, + "loss": 0.0727, "step": 580 }, { "epoch": 0.2978293791014639, - "grad_norm": 0.4681917130947113, + "grad_norm": 0.9367203712463379, "learning_rate": 2.8213023725391215e-05, - "loss": 0.0846, + "loss": 0.0751, "step": 590 }, { "epoch": 0.3028773346794548, - "grad_norm": 0.43745434284210205, + "grad_norm": 0.5382592678070068, "learning_rate": 2.8182735991923272e-05, - "loss": 0.0849, + "loss": 0.0756, "step": 600 }, { "epoch": 0.30792529025744575, - "grad_norm": 0.5624836683273315, + "grad_norm": 0.40977007150650024, "learning_rate": 2.8152448258455325e-05, - "loss": 0.0833, + "loss": 0.0714, "step": 610 }, { "epoch": 0.3129732458354367, - "grad_norm": 0.7955760359764099, + "grad_norm": 0.6829769015312195, "learning_rate": 2.812216052498738e-05, - "loss": 0.0917, + "loss": 0.0809, "step": 620 }, { "epoch": 0.31802120141342755, - "grad_norm": 0.3756354749202728, + "grad_norm": 0.4805002212524414, "learning_rate": 2.8091872791519436e-05, - "loss": 0.096, + "loss": 0.0789, "step": 630 }, { "epoch": 0.32306915699141847, - "grad_norm": 0.511903703212738, + "grad_norm": 0.6755364537239075, "learning_rate": 2.806158505805149e-05, - "loss": 0.0921, + "loss": 0.0819, "step": 640 }, { "epoch": 0.3281171125694094, - "grad_norm": 0.41848480701446533, + "grad_norm": 1.3035857677459717, "learning_rate": 2.8031297324583546e-05, - "loss": 0.091, + "loss": 0.0861, "step": 650 }, { "epoch": 0.3331650681474003, - "grad_norm": 0.5067800283432007, + "grad_norm": 0.7905831933021545, "learning_rate": 2.80010095911156e-05, - "loss": 0.0843, + "loss": 0.0739, "step": 660 }, { "epoch": 0.3382130237253912, - "grad_norm": 0.4545990228652954, + "grad_norm": 0.8810652494430542, "learning_rate": 2.7970721857647653e-05, - "loss": 0.0796, + "loss": 0.0678, "step": 670 }, { "epoch": 0.3432609793033821, - "grad_norm": 0.5365005135536194, + "grad_norm": 1.1220252513885498, "learning_rate": 2.794043412417971e-05, - "loss": 0.0814, + "loss": 0.07, "step": 680 }, { "epoch": 0.34830893488137304, - "grad_norm": 0.5178242921829224, + "grad_norm": 0.8519473075866699, "learning_rate": 2.7910146390711763e-05, - "loss": 0.0854, + "loss": 0.076, "step": 690 }, { "epoch": 0.35335689045936397, - "grad_norm": 0.3831228017807007, + "grad_norm": 0.49878937005996704, "learning_rate": 2.787985865724382e-05, - "loss": 0.0867, + "loss": 0.0787, "step": 700 }, { "epoch": 0.3584048460373549, - "grad_norm": 0.785432755947113, + "grad_norm": 1.4854084253311157, "learning_rate": 2.784957092377587e-05, - "loss": 0.0959, + "loss": 0.0872, "step": 710 }, { "epoch": 0.36345280161534577, - "grad_norm": 0.4992307126522064, + "grad_norm": 0.787535548210144, "learning_rate": 2.7819283190307924e-05, - "loss": 0.0857, + "loss": 0.0805, "step": 720 }, { "epoch": 0.3685007571933367, - "grad_norm": 0.337698757648468, + "grad_norm": 0.8322392106056213, "learning_rate": 2.778899545683998e-05, - "loss": 0.0833, + "loss": 0.0726, "step": 730 }, { "epoch": 0.3735487127713276, - "grad_norm": 0.4655255675315857, + "grad_norm": 0.48470157384872437, "learning_rate": 2.7758707723372034e-05, - "loss": 0.0773, + "loss": 0.0673, "step": 740 }, { "epoch": 0.37859666834931854, - "grad_norm": 0.3810546398162842, + "grad_norm": 0.8375622034072876, "learning_rate": 2.772841998990409e-05, - "loss": 0.0858, + "loss": 0.0767, "step": 750 }, { "epoch": 0.3836446239273094, - "grad_norm": 0.4854646325111389, + "grad_norm": 0.5212222337722778, "learning_rate": 2.7698132256436144e-05, - "loss": 0.0901, + "loss": 0.0737, "step": 760 }, { "epoch": 0.38869257950530034, - "grad_norm": 0.5742236375808716, + "grad_norm": 0.503209114074707, "learning_rate": 2.7667844522968198e-05, - "loss": 0.0775, + "loss": 0.0657, "step": 770 }, { "epoch": 0.39374053508329127, - "grad_norm": 0.40320712327957153, + "grad_norm": 0.4290629029273987, "learning_rate": 2.7637556789500254e-05, - "loss": 0.0837, + "loss": 0.0745, "step": 780 }, { "epoch": 0.3987884906612822, - "grad_norm": 0.8492166996002197, + "grad_norm": 0.7535534501075745, "learning_rate": 2.7607269056032308e-05, - "loss": 0.0743, + "loss": 0.0702, "step": 790 }, { "epoch": 0.4038364462392731, - "grad_norm": 0.3958197236061096, + "grad_norm": 0.67135089635849, "learning_rate": 2.757698132256436e-05, - "loss": 0.0849, + "loss": 0.0754, "step": 800 }, { "epoch": 0.408884401817264, - "grad_norm": 0.4942672848701477, + "grad_norm": 0.5307912230491638, "learning_rate": 2.7546693589096418e-05, - "loss": 0.0843, + "loss": 0.0717, "step": 810 }, { "epoch": 0.4139323573952549, - "grad_norm": 0.4589976966381073, + "grad_norm": 0.46130767464637756, "learning_rate": 2.751640585562847e-05, - "loss": 0.0737, + "loss": 0.065, "step": 820 }, { "epoch": 0.41898031297324584, - "grad_norm": 0.7180899381637573, + "grad_norm": 1.2904905080795288, "learning_rate": 2.748611812216053e-05, - "loss": 0.0871, + "loss": 0.0818, "step": 830 }, { "epoch": 0.42402826855123676, - "grad_norm": 0.9610480070114136, + "grad_norm": 2.0480494499206543, "learning_rate": 2.745583038869258e-05, - "loss": 0.0939, + "loss": 0.085, "step": 840 }, { "epoch": 0.4290762241292277, - "grad_norm": 0.5229182243347168, + "grad_norm": 0.5108308792114258, "learning_rate": 2.7425542655224632e-05, - "loss": 0.0856, + "loss": 0.0729, "step": 850 }, { "epoch": 0.43412417970721856, - "grad_norm": 0.4386545717716217, + "grad_norm": 0.6915296912193298, "learning_rate": 2.739525492175669e-05, - "loss": 0.0827, + "loss": 0.071, "step": 860 }, { "epoch": 0.4391721352852095, - "grad_norm": 0.40693387389183044, + "grad_norm": 0.8100910782814026, "learning_rate": 2.7364967188288742e-05, - "loss": 0.0756, + "loss": 0.0667, "step": 870 }, { "epoch": 0.4442200908632004, - "grad_norm": 0.5173991918563843, + "grad_norm": 0.626818835735321, "learning_rate": 2.73346794548208e-05, - "loss": 0.0783, + "loss": 0.0695, "step": 880 }, { "epoch": 0.44926804644119134, - "grad_norm": 0.3451812267303467, + "grad_norm": 0.673156201839447, "learning_rate": 2.7304391721352853e-05, - "loss": 0.0911, + "loss": 0.0793, "step": 890 }, { "epoch": 0.4543160020191822, - "grad_norm": 0.5798471570014954, + "grad_norm": 0.5740798711776733, "learning_rate": 2.7274103987884906e-05, - "loss": 0.084, + "loss": 0.0731, "step": 900 }, { "epoch": 0.45936395759717313, - "grad_norm": 0.3773733675479889, + "grad_norm": 0.744429349899292, "learning_rate": 2.7243816254416963e-05, - "loss": 0.0838, + "loss": 0.0743, "step": 910 }, { "epoch": 0.46441191317516406, - "grad_norm": 0.4767201840877533, + "grad_norm": 0.5837222933769226, "learning_rate": 2.7213528520949016e-05, - "loss": 0.0809, + "loss": 0.0747, "step": 920 }, { "epoch": 0.469459868753155, - "grad_norm": 0.43917012214660645, + "grad_norm": 0.500978410243988, "learning_rate": 2.7183240787481073e-05, - "loss": 0.082, + "loss": 0.0753, "step": 930 }, { "epoch": 0.4745078243311459, - "grad_norm": 0.7668654322624207, + "grad_norm": 1.0817604064941406, "learning_rate": 2.7152953054013127e-05, - "loss": 0.0791, + "loss": 0.0748, "step": 940 }, { "epoch": 0.4795557799091368, - "grad_norm": 0.5831783413887024, + "grad_norm": 0.5821205377578735, "learning_rate": 2.712266532054518e-05, - "loss": 0.0903, + "loss": 0.0766, "step": 950 }, { "epoch": 0.4846037354871277, - "grad_norm": 0.4111500084400177, + "grad_norm": 0.6120801568031311, "learning_rate": 2.7092377587077233e-05, - "loss": 0.0962, + "loss": 0.0827, "step": 960 }, { "epoch": 0.48965169106511863, - "grad_norm": 0.48939448595046997, + "grad_norm": 0.4379239082336426, "learning_rate": 2.7062089853609287e-05, - "loss": 0.0798, + "loss": 0.0664, "step": 970 }, { "epoch": 0.49469964664310956, - "grad_norm": 0.6999348998069763, + "grad_norm": 0.5472243428230286, "learning_rate": 2.7031802120141344e-05, - "loss": 0.0873, + "loss": 0.0767, "step": 980 }, { "epoch": 0.49974760222110043, - "grad_norm": 0.4761042594909668, + "grad_norm": 1.0190905332565308, "learning_rate": 2.7001514386673397e-05, - "loss": 0.0902, + "loss": 0.0739, "step": 990 }, { "epoch": 0.5047955577990914, - "grad_norm": 0.6692693829536438, + "grad_norm": 0.7046610713005066, "learning_rate": 2.697122665320545e-05, - "loss": 0.0768, + "loss": 0.0685, "step": 1000 }, { "epoch": 0.5098435133770823, - "grad_norm": 0.3619178831577301, + "grad_norm": 0.5559498071670532, "learning_rate": 2.6940938919737507e-05, - "loss": 0.0767, + "loss": 0.0715, "step": 1010 }, { "epoch": 0.5148914689550732, - "grad_norm": 0.4190191328525543, + "grad_norm": 0.6298381686210632, "learning_rate": 2.691065118626956e-05, - "loss": 0.0917, + "loss": 0.0828, "step": 1020 }, { "epoch": 0.5199394245330641, - "grad_norm": 0.3305515646934509, + "grad_norm": 0.7023555636405945, "learning_rate": 2.6880363452801618e-05, - "loss": 0.0905, + "loss": 0.0809, "step": 1030 }, { "epoch": 0.5249873801110551, - "grad_norm": 0.46522971987724304, + "grad_norm": 0.6804683804512024, "learning_rate": 2.685007571933367e-05, - "loss": 0.0797, + "loss": 0.0739, "step": 1040 }, { "epoch": 0.5300353356890459, - "grad_norm": 0.39177805185317993, + "grad_norm": 0.7743015885353088, "learning_rate": 2.6819787985865725e-05, - "loss": 0.0725, + "loss": 0.0658, "step": 1050 }, { "epoch": 0.5350832912670368, - "grad_norm": 0.6978829503059387, + "grad_norm": 1.36810302734375, "learning_rate": 2.678950025239778e-05, - "loss": 0.0834, + "loss": 0.0747, "step": 1060 }, { "epoch": 0.5401312468450278, - "grad_norm": 0.40147507190704346, + "grad_norm": 0.47373896837234497, "learning_rate": 2.6759212518929835e-05, - "loss": 0.0824, + "loss": 0.0751, "step": 1070 }, { "epoch": 0.5451792024230186, - "grad_norm": 0.6341513395309448, + "grad_norm": 0.6654021143913269, "learning_rate": 2.6728924785461892e-05, - "loss": 0.0831, + "loss": 0.0683, "step": 1080 }, { "epoch": 0.5502271580010096, - "grad_norm": 0.3328685164451599, + "grad_norm": 1.0054854154586792, "learning_rate": 2.6698637051993942e-05, - "loss": 0.0746, + "loss": 0.0676, "step": 1090 }, { "epoch": 0.5552751135790005, - "grad_norm": 0.5470515489578247, + "grad_norm": 0.5544041395187378, "learning_rate": 2.6668349318525995e-05, - "loss": 0.0784, + "loss": 0.075, "step": 1100 }, { "epoch": 0.5603230691569914, - "grad_norm": 0.8354987502098083, + "grad_norm": 0.6919006109237671, "learning_rate": 2.6638061585058052e-05, - "loss": 0.0778, + "loss": 0.0709, "step": 1110 }, { "epoch": 0.5653710247349824, - "grad_norm": 0.45674967765808105, + "grad_norm": 0.5584747791290283, "learning_rate": 2.6607773851590106e-05, - "loss": 0.0739, + "loss": 0.0623, "step": 1120 }, { "epoch": 0.5704189803129732, - "grad_norm": 0.3991139829158783, + "grad_norm": 0.47064319252967834, "learning_rate": 2.657748611812216e-05, - "loss": 0.0877, + "loss": 0.0744, "step": 1130 }, { "epoch": 0.5754669358909642, - "grad_norm": 0.500252902507782, + "grad_norm": 0.5119986534118652, "learning_rate": 2.6547198384654216e-05, - "loss": 0.0866, + "loss": 0.0795, "step": 1140 }, { "epoch": 0.5805148914689551, - "grad_norm": 0.42237555980682373, + "grad_norm": 0.9572923183441162, "learning_rate": 2.651691065118627e-05, - "loss": 0.0793, + "loss": 0.073, "step": 1150 }, { "epoch": 0.585562847046946, - "grad_norm": 0.3488081097602844, + "grad_norm": 0.5633489489555359, "learning_rate": 2.6486622917718326e-05, - "loss": 0.0739, + "loss": 0.0637, "step": 1160 }, { "epoch": 0.5906108026249369, - "grad_norm": 0.8973365426063538, + "grad_norm": 1.1218105554580688, "learning_rate": 2.645633518425038e-05, - "loss": 0.0784, + "loss": 0.0695, "step": 1170 }, { "epoch": 0.5956587582029278, - "grad_norm": 0.459522008895874, + "grad_norm": 0.6655285954475403, "learning_rate": 2.6426047450782433e-05, - "loss": 0.0865, + "loss": 0.0774, "step": 1180 }, { "epoch": 0.6007067137809188, - "grad_norm": 0.7989380955696106, + "grad_norm": 1.3088024854660034, "learning_rate": 2.639575971731449e-05, - "loss": 0.0794, + "loss": 0.0748, "step": 1190 }, { "epoch": 0.6057546693589096, - "grad_norm": 0.40716055035591125, + "grad_norm": 0.9868513941764832, "learning_rate": 2.6365471983846543e-05, - "loss": 0.085, + "loss": 0.0695, "step": 1200 }, { "epoch": 0.6108026249369005, - "grad_norm": 0.3626324534416199, + "grad_norm": 0.5922626852989197, "learning_rate": 2.63351842503786e-05, - "loss": 0.0737, + "loss": 0.0678, "step": 1210 }, { "epoch": 0.6158505805148915, - "grad_norm": 0.4917464852333069, + "grad_norm": 0.6839954257011414, "learning_rate": 2.630489651691065e-05, - "loss": 0.0807, + "loss": 0.0693, "step": 1220 }, { "epoch": 0.6208985360928824, - "grad_norm": 0.41341814398765564, + "grad_norm": 0.6755519509315491, "learning_rate": 2.6274608783442704e-05, - "loss": 0.0806, + "loss": 0.0742, "step": 1230 }, { "epoch": 0.6259464916708734, - "grad_norm": 0.3172214925289154, + "grad_norm": 0.4968509078025818, "learning_rate": 2.624432104997476e-05, - "loss": 0.0779, + "loss": 0.0615, "step": 1240 }, { "epoch": 0.6309944472488642, - "grad_norm": 0.7099377512931824, + "grad_norm": 1.1036404371261597, "learning_rate": 2.6214033316506814e-05, - "loss": 0.0815, + "loss": 0.0727, "step": 1250 }, { "epoch": 0.6360424028268551, - "grad_norm": 0.5447896122932434, + "grad_norm": 0.810405969619751, "learning_rate": 2.618374558303887e-05, - "loss": 0.0844, + "loss": 0.072, "step": 1260 }, { "epoch": 0.6410903584048461, - "grad_norm": 0.4068484604358673, + "grad_norm": 0.730140209197998, "learning_rate": 2.6153457849570924e-05, - "loss": 0.0828, + "loss": 0.0652, "step": 1270 }, { "epoch": 0.6461383139828369, - "grad_norm": 0.576968789100647, + "grad_norm": 1.1645480394363403, "learning_rate": 2.6123170116102978e-05, - "loss": 0.0825, + "loss": 0.0716, "step": 1280 }, { "epoch": 0.6511862695608278, - "grad_norm": 0.4223102033138275, + "grad_norm": 0.8481037020683289, "learning_rate": 2.6092882382635034e-05, - "loss": 0.0828, + "loss": 0.0737, "step": 1290 }, { "epoch": 0.6562342251388188, - "grad_norm": 0.4649188816547394, + "grad_norm": 0.5972946882247925, "learning_rate": 2.6062594649167088e-05, - "loss": 0.0752, + "loss": 0.0704, "step": 1300 }, { "epoch": 0.6612821807168097, - "grad_norm": 0.3611209988594055, + "grad_norm": 0.6405556201934814, "learning_rate": 2.6032306915699145e-05, - "loss": 0.0692, + "loss": 0.0628, "step": 1310 }, { "epoch": 0.6663301362948006, - "grad_norm": 0.5452816486358643, + "grad_norm": 0.8645715117454529, "learning_rate": 2.6002019182231198e-05, - "loss": 0.0765, + "loss": 0.0742, "step": 1320 }, { "epoch": 0.6713780918727915, - "grad_norm": 0.608113169670105, + "grad_norm": 1.4211089611053467, "learning_rate": 2.597173144876325e-05, - "loss": 0.0799, + "loss": 0.0731, "step": 1330 }, { "epoch": 0.6764260474507824, - "grad_norm": 0.37480154633522034, + "grad_norm": 0.8079481720924377, "learning_rate": 2.594144371529531e-05, - "loss": 0.0808, + "loss": 0.0732, "step": 1340 }, { "epoch": 0.6814740030287734, - "grad_norm": 0.37567126750946045, + "grad_norm": 0.6517273783683777, "learning_rate": 2.591115598182736e-05, - "loss": 0.0738, + "loss": 0.0688, "step": 1350 }, { "epoch": 0.6865219586067642, - "grad_norm": 0.570625364780426, + "grad_norm": 1.2093323469161987, "learning_rate": 2.5880868248359415e-05, - "loss": 0.0868, + "loss": 0.0729, "step": 1360 }, { "epoch": 0.6915699141847552, - "grad_norm": 0.41150689125061035, + "grad_norm": 0.6432307362556458, "learning_rate": 2.585058051489147e-05, - "loss": 0.0857, + "loss": 0.076, "step": 1370 }, { "epoch": 0.6966178697627461, - "grad_norm": 0.35640430450439453, + "grad_norm": 0.5220794677734375, "learning_rate": 2.5820292781423522e-05, - "loss": 0.0832, + "loss": 0.0702, "step": 1380 }, { "epoch": 0.701665825340737, - "grad_norm": 0.6833171844482422, + "grad_norm": 1.0983613729476929, "learning_rate": 2.579000504795558e-05, - "loss": 0.079, + "loss": 0.0676, "step": 1390 }, { "epoch": 0.7067137809187279, - "grad_norm": 0.4165969491004944, + "grad_norm": 0.859348475933075, "learning_rate": 2.5759717314487633e-05, - "loss": 0.0721, + "loss": 0.0615, "step": 1400 }, { "epoch": 0.7117617364967188, - "grad_norm": 0.40253061056137085, + "grad_norm": 0.7912864685058594, "learning_rate": 2.572942958101969e-05, - "loss": 0.0735, + "loss": 0.0681, "step": 1410 }, { "epoch": 0.7168096920747098, - "grad_norm": 0.42983728647232056, + "grad_norm": 0.6189167499542236, "learning_rate": 2.5699141847551743e-05, - "loss": 0.0798, + "loss": 0.0682, "step": 1420 }, { "epoch": 0.7218576476527007, - "grad_norm": 0.3952350914478302, + "grad_norm": 0.5456287860870361, "learning_rate": 2.5668854114083796e-05, - "loss": 0.0694, + "loss": 0.0591, "step": 1430 }, { "epoch": 0.7269056032306915, - "grad_norm": 0.5461121201515198, + "grad_norm": 0.485055148601532, "learning_rate": 2.5638566380615853e-05, - "loss": 0.0843, + "loss": 0.0729, "step": 1440 }, { "epoch": 0.7319535588086825, - "grad_norm": 0.62554931640625, + "grad_norm": 0.46423906087875366, "learning_rate": 2.5608278647147907e-05, - "loss": 0.0765, + "loss": 0.0646, "step": 1450 }, { "epoch": 0.7370015143866734, - "grad_norm": 0.5430169105529785, + "grad_norm": 0.5944865345954895, "learning_rate": 2.557799091367996e-05, - "loss": 0.0855, + "loss": 0.0696, "step": 1460 }, { "epoch": 0.7420494699646644, - "grad_norm": 0.5172569751739502, + "grad_norm": 0.794015645980835, "learning_rate": 2.5547703180212014e-05, - "loss": 0.0771, + "loss": 0.0671, "step": 1470 }, { "epoch": 0.7470974255426552, - "grad_norm": 0.4924006164073944, + "grad_norm": 0.6759900450706482, "learning_rate": 2.5517415446744067e-05, - "loss": 0.0829, + "loss": 0.074, "step": 1480 }, { "epoch": 0.7521453811206461, - "grad_norm": 0.29295244812965393, + "grad_norm": 0.6719480156898499, "learning_rate": 2.5487127713276124e-05, - "loss": 0.0831, + "loss": 0.0708, "step": 1490 }, { "epoch": 0.7571933366986371, - "grad_norm": 0.6032044887542725, + "grad_norm": 0.7934426665306091, "learning_rate": 2.5456839979808177e-05, - "loss": 0.0751, + "loss": 0.0664, "step": 1500 }, { "epoch": 0.762241292276628, - "grad_norm": 0.5197745561599731, + "grad_norm": 1.4169378280639648, "learning_rate": 2.542655224634023e-05, - "loss": 0.0764, + "loss": 0.0726, "step": 1510 }, { "epoch": 0.7672892478546188, - "grad_norm": 0.3689173758029938, + "grad_norm": 0.5849716067314148, "learning_rate": 2.5396264512872288e-05, - "loss": 0.0758, + "loss": 0.0709, "step": 1520 }, { "epoch": 0.7723372034326098, - "grad_norm": 0.5350760817527771, + "grad_norm": 0.8471559286117554, "learning_rate": 2.536597677940434e-05, - "loss": 0.0863, + "loss": 0.0764, "step": 1530 }, { "epoch": 0.7773851590106007, - "grad_norm": 0.37884262204170227, + "grad_norm": 0.7494149804115295, "learning_rate": 2.5335689045936398e-05, - "loss": 0.0703, + "loss": 0.0629, "step": 1540 }, { "epoch": 0.7824331145885917, - "grad_norm": 0.3809797167778015, + "grad_norm": 0.7659397721290588, "learning_rate": 2.530540131246845e-05, - "loss": 0.0761, + "loss": 0.061, "step": 1550 }, { "epoch": 0.7874810701665825, - "grad_norm": 0.5026581287384033, + "grad_norm": 0.8505954146385193, "learning_rate": 2.5275113579000505e-05, - "loss": 0.0737, + "loss": 0.0693, "step": 1560 }, { "epoch": 0.7925290257445734, - "grad_norm": 0.46075060963630676, + "grad_norm": 0.8126624226570129, "learning_rate": 2.524482584553256e-05, - "loss": 0.0798, + "loss": 0.0738, "step": 1570 }, { "epoch": 0.7975769813225644, - "grad_norm": 0.4620317220687866, + "grad_norm": 0.9350792765617371, "learning_rate": 2.5214538112064615e-05, - "loss": 0.089, + "loss": 0.0821, "step": 1580 }, { "epoch": 0.8026249369005553, - "grad_norm": 0.46049225330352783, + "grad_norm": 1.075035810470581, "learning_rate": 2.5184250378596672e-05, - "loss": 0.0842, + "loss": 0.0758, "step": 1590 }, { "epoch": 0.8076728924785462, - "grad_norm": 0.3389497995376587, + "grad_norm": 0.6885321736335754, "learning_rate": 2.5153962645128722e-05, - "loss": 0.0759, + "loss": 0.0641, "step": 1600 }, { "epoch": 0.8127208480565371, - "grad_norm": 0.34683436155319214, + "grad_norm": 0.7702226042747498, "learning_rate": 2.5123674911660775e-05, - "loss": 0.0731, + "loss": 0.0642, "step": 1610 }, { "epoch": 0.817768803634528, - "grad_norm": 0.3016813397407532, + "grad_norm": 0.9809953570365906, "learning_rate": 2.5093387178192832e-05, - "loss": 0.0828, + "loss": 0.0759, "step": 1620 }, { "epoch": 0.822816759212519, - "grad_norm": 0.563191294670105, + "grad_norm": 0.5996444225311279, "learning_rate": 2.5063099444724886e-05, - "loss": 0.0791, + "loss": 0.0686, "step": 1630 }, { "epoch": 0.8278647147905098, - "grad_norm": 0.33876487612724304, + "grad_norm": 0.5003983378410339, "learning_rate": 2.5032811711256942e-05, - "loss": 0.0776, + "loss": 0.0697, "step": 1640 }, { "epoch": 0.8329126703685008, - "grad_norm": 0.4185733497142792, + "grad_norm": 0.7024896740913391, "learning_rate": 2.5002523977788996e-05, - "loss": 0.0758, + "loss": 0.0699, "step": 1650 }, { "epoch": 0.8379606259464917, - "grad_norm": 0.3273310959339142, + "grad_norm": 0.5384397506713867, "learning_rate": 2.497223624432105e-05, - "loss": 0.0775, + "loss": 0.0684, "step": 1660 }, { "epoch": 0.8430085815244825, - "grad_norm": 0.5738667845726013, + "grad_norm": 1.176849126815796, "learning_rate": 2.4941948510853106e-05, - "loss": 0.0723, + "loss": 0.065, "step": 1670 }, { "epoch": 0.8480565371024735, - "grad_norm": 0.35539621114730835, + "grad_norm": 0.7623859643936157, "learning_rate": 2.491166077738516e-05, - "loss": 0.0725, + "loss": 0.0676, "step": 1680 }, { "epoch": 0.8531044926804644, - "grad_norm": 0.45273271203041077, + "grad_norm": 0.8817411065101624, "learning_rate": 2.4881373043917216e-05, - "loss": 0.0803, + "loss": 0.0712, "step": 1690 }, { "epoch": 0.8581524482584554, - "grad_norm": 0.48917362093925476, + "grad_norm": 0.7471240162849426, "learning_rate": 2.485108531044927e-05, - "loss": 0.0778, + "loss": 0.0719, "step": 1700 }, { "epoch": 0.8632004038364463, - "grad_norm": 0.44357436895370483, + "grad_norm": 0.9217013120651245, "learning_rate": 2.4820797576981323e-05, - "loss": 0.0885, + "loss": 0.0758, "step": 1710 }, { "epoch": 0.8682483594144371, - "grad_norm": 0.2906216084957123, + "grad_norm": 0.4985320568084717, "learning_rate": 2.479050984351338e-05, - "loss": 0.0817, + "loss": 0.075, "step": 1720 }, { "epoch": 0.8732963149924281, - "grad_norm": 0.4553854763507843, + "grad_norm": 0.47823965549468994, "learning_rate": 2.476022211004543e-05, - "loss": 0.0672, + "loss": 0.0576, "step": 1730 }, { "epoch": 0.878344270570419, - "grad_norm": 0.35258758068084717, + "grad_norm": 0.5073914527893066, "learning_rate": 2.4729934376577487e-05, - "loss": 0.067, + "loss": 0.0619, "step": 1740 }, { "epoch": 0.8833922261484098, - "grad_norm": 0.38019898533821106, + "grad_norm": 0.6744971871376038, "learning_rate": 2.469964664310954e-05, - "loss": 0.0781, + "loss": 0.0674, "step": 1750 }, { "epoch": 0.8884401817264008, - "grad_norm": 0.3867049813270569, + "grad_norm": 0.7287705540657043, "learning_rate": 2.4669358909641594e-05, - "loss": 0.0808, + "loss": 0.0705, "step": 1760 }, { "epoch": 0.8934881373043917, - "grad_norm": 0.5274444222450256, + "grad_norm": 0.6387834548950195, "learning_rate": 2.463907117617365e-05, - "loss": 0.0852, + "loss": 0.0736, "step": 1770 }, { "epoch": 0.8985360928823827, - "grad_norm": 0.34507790207862854, + "grad_norm": 0.8428398370742798, "learning_rate": 2.4608783442705704e-05, - "loss": 0.0832, + "loss": 0.0741, "step": 1780 }, { "epoch": 0.9035840484603735, - "grad_norm": 0.41448697447776794, + "grad_norm": 0.6455987691879272, "learning_rate": 2.4578495709237758e-05, - "loss": 0.0744, + "loss": 0.0639, "step": 1790 }, { "epoch": 0.9086320040383644, - "grad_norm": 0.4750345051288605, + "grad_norm": 0.6735292673110962, "learning_rate": 2.4548207975769815e-05, - "loss": 0.0787, + "loss": 0.0795, "step": 1800 }, { "epoch": 0.9136799596163554, - "grad_norm": 0.35344141721725464, + "grad_norm": 0.6157563924789429, "learning_rate": 2.4517920242301868e-05, - "loss": 0.0743, + "loss": 0.0699, "step": 1810 }, { "epoch": 0.9187279151943463, - "grad_norm": 0.5075356960296631, + "grad_norm": 0.7483514547348022, "learning_rate": 2.4487632508833925e-05, - "loss": 0.0737, + "loss": 0.0681, "step": 1820 }, { "epoch": 0.9237758707723372, - "grad_norm": 0.4184614419937134, + "grad_norm": 0.5686767101287842, "learning_rate": 2.4457344775365978e-05, - "loss": 0.0809, + "loss": 0.0713, "step": 1830 }, { "epoch": 0.9288238263503281, - "grad_norm": 0.3176049590110779, + "grad_norm": 0.352909654378891, "learning_rate": 2.4427057041898032e-05, - "loss": 0.077, + "loss": 0.0641, "step": 1840 }, { "epoch": 0.933871781928319, - "grad_norm": 0.5197444558143616, + "grad_norm": 0.6095912456512451, "learning_rate": 2.439676930843009e-05, - "loss": 0.0809, + "loss": 0.0794, "step": 1850 }, { "epoch": 0.93891973750631, - "grad_norm": 0.3748112916946411, + "grad_norm": 0.3929665684700012, "learning_rate": 2.436648157496214e-05, - "loss": 0.0764, + "loss": 0.0672, "step": 1860 }, { "epoch": 0.9439676930843008, - "grad_norm": 0.2999976575374603, + "grad_norm": 0.22026501595973969, "learning_rate": 2.4336193841494195e-05, - "loss": 0.0783, + "loss": 0.0699, "step": 1870 }, { "epoch": 0.9490156486622918, - "grad_norm": 0.4079499840736389, + "grad_norm": 0.5952547788619995, "learning_rate": 2.430590610802625e-05, - "loss": 0.0807, + "loss": 0.0733, "step": 1880 }, { "epoch": 0.9540636042402827, - "grad_norm": 0.2642356753349304, + "grad_norm": 0.7297592163085938, "learning_rate": 2.4275618374558302e-05, - "loss": 0.0762, + "loss": 0.0725, "step": 1890 }, { "epoch": 0.9591115598182736, - "grad_norm": 0.31227466464042664, + "grad_norm": 0.35177797079086304, "learning_rate": 2.424533064109036e-05, - "loss": 0.0709, + "loss": 0.0651, "step": 1900 }, { "epoch": 0.9641595153962645, - "grad_norm": 0.386422723531723, + "grad_norm": 0.6706666350364685, "learning_rate": 2.4215042907622413e-05, - "loss": 0.0786, + "loss": 0.0737, "step": 1910 }, { "epoch": 0.9692074709742554, - "grad_norm": 0.7000331878662109, + "grad_norm": 0.7155650854110718, "learning_rate": 2.418475517415447e-05, - "loss": 0.0849, + "loss": 0.074, "step": 1920 }, { "epoch": 0.9742554265522464, - "grad_norm": 0.44338202476501465, + "grad_norm": 0.5200046300888062, "learning_rate": 2.4154467440686523e-05, - "loss": 0.0811, + "loss": 0.0706, "step": 1930 }, { "epoch": 0.9793033821302373, - "grad_norm": 0.5613553524017334, + "grad_norm": 0.46796679496765137, "learning_rate": 2.4124179707218576e-05, - "loss": 0.069, + "loss": 0.0592, "step": 1940 }, { "epoch": 0.9843513377082281, - "grad_norm": 0.2940104007720947, + "grad_norm": 0.5713896751403809, "learning_rate": 2.4093891973750633e-05, - "loss": 0.0654, + "loss": 0.0586, "step": 1950 }, { "epoch": 0.9893992932862191, - "grad_norm": 0.37430045008659363, + "grad_norm": 0.9147453308105469, "learning_rate": 2.4063604240282687e-05, - "loss": 0.094, + "loss": 0.0848, "step": 1960 }, { "epoch": 0.99444724886421, - "grad_norm": 0.4766289293766022, + "grad_norm": 1.1067036390304565, "learning_rate": 2.4033316506814744e-05, - "loss": 0.0784, + "loss": 0.07, "step": 1970 }, { "epoch": 0.9994952044422009, - "grad_norm": 0.35420897603034973, + "grad_norm": 0.5658775568008423, "learning_rate": 2.4003028773346797e-05, - "loss": 0.0663, + "loss": 0.0594, "step": 1980 }, { "epoch": 1.0, "eval_f1": 0.9705180789481339, - "eval_loss": 0.05940837040543556, - "eval_runtime": 553.5581, - "eval_samples_per_second": 372.611, - "eval_steps_per_second": 2.912, + "eval_loss": 0.04397369921207428, + "eval_runtime": 594.1594, + "eval_samples_per_second": 347.149, + "eval_steps_per_second": 2.713, "step": 1981 }, { "epoch": 1.0045431600201917, - "grad_norm": 0.2541043162345886, + "grad_norm": 0.6783074736595154, "learning_rate": 2.3972741039878847e-05, - "loss": 0.0824, + "loss": 0.0783, "step": 1990 }, { "epoch": 1.0095911155981827, - "grad_norm": 0.4705805480480194, + "grad_norm": 0.5741100311279297, "learning_rate": 2.3942453306410904e-05, - "loss": 0.0738, + "loss": 0.0612, "step": 2000 }, { "epoch": 1.0146390711761737, - "grad_norm": 0.44369685649871826, + "grad_norm": 0.8516017198562622, "learning_rate": 2.3912165572942957e-05, - "loss": 0.0797, + "loss": 0.0654, "step": 2010 }, { "epoch": 1.0196870267541647, - "grad_norm": 0.4401172697544098, + "grad_norm": 0.48648303747177124, "learning_rate": 2.3881877839475014e-05, - "loss": 0.0699, + "loss": 0.0659, "step": 2020 }, { "epoch": 1.0247349823321554, - "grad_norm": 0.5683963298797607, + "grad_norm": 0.48170068860054016, "learning_rate": 2.3851590106007068e-05, - "loss": 0.0779, + "loss": 0.0687, "step": 2030 }, { "epoch": 1.0297829379101464, - "grad_norm": 0.7009720206260681, + "grad_norm": 0.8060422539710999, "learning_rate": 2.382130237253912e-05, - "loss": 0.081, + "loss": 0.0741, "step": 2040 }, { "epoch": 1.0348308934881374, - "grad_norm": 0.3499268889427185, + "grad_norm": 0.3721982538700104, "learning_rate": 2.3791014639071178e-05, - "loss": 0.0733, + "loss": 0.0643, "step": 2050 }, { "epoch": 1.0398788490661282, - "grad_norm": 0.25898194313049316, + "grad_norm": 0.9289938807487488, "learning_rate": 2.376072690560323e-05, - "loss": 0.0786, + "loss": 0.0678, "step": 2060 }, { "epoch": 1.0449268046441191, - "grad_norm": 0.4099780023097992, + "grad_norm": 0.7339480519294739, "learning_rate": 2.3730439172135288e-05, - "loss": 0.0745, + "loss": 0.065, "step": 2070 }, { "epoch": 1.0499747602221101, - "grad_norm": 0.5677788853645325, + "grad_norm": 0.5676091313362122, "learning_rate": 2.370015143866734e-05, - "loss": 0.0776, + "loss": 0.0665, "step": 2080 }, { "epoch": 1.0550227158001009, - "grad_norm": 0.724709689617157, + "grad_norm": 1.0972354412078857, "learning_rate": 2.3669863705199395e-05, - "loss": 0.0738, + "loss": 0.0664, "step": 2090 }, { "epoch": 1.0600706713780919, - "grad_norm": 0.7656406164169312, + "grad_norm": 1.11980402469635, "learning_rate": 2.3639575971731452e-05, - "loss": 0.0877, + "loss": 0.0742, "step": 2100 }, { "epoch": 1.0651186269560828, - "grad_norm": 0.40501999855041504, + "grad_norm": 0.6586318016052246, "learning_rate": 2.3609288238263502e-05, - "loss": 0.0805, + "loss": 0.0755, "step": 2110 }, { "epoch": 1.0701665825340738, - "grad_norm": 0.4794836640357971, + "grad_norm": 0.6912874579429626, "learning_rate": 2.3579000504795555e-05, - "loss": 0.0839, + "loss": 0.0722, "step": 2120 }, { "epoch": 1.0752145381120646, - "grad_norm": 0.33964771032333374, + "grad_norm": 0.5603944659233093, "learning_rate": 2.3548712771327612e-05, - "loss": 0.0744, + "loss": 0.0636, "step": 2130 }, { "epoch": 1.0802624936900556, - "grad_norm": 0.4785172641277313, + "grad_norm": 0.7324510216712952, "learning_rate": 2.3518425037859666e-05, - "loss": 0.0779, + "loss": 0.0697, "step": 2140 }, { "epoch": 1.0853104492680465, - "grad_norm": 0.4255255162715912, + "grad_norm": 0.6833095550537109, "learning_rate": 2.3488137304391723e-05, - "loss": 0.0723, + "loss": 0.0678, "step": 2150 }, { "epoch": 1.0903584048460373, - "grad_norm": 0.4259156584739685, + "grad_norm": 0.49107661843299866, "learning_rate": 2.3457849570923776e-05, - "loss": 0.0713, + "loss": 0.0608, "step": 2160 }, { "epoch": 1.0954063604240283, - "grad_norm": 0.400991290807724, + "grad_norm": 0.541980504989624, "learning_rate": 2.342756183745583e-05, - "loss": 0.075, + "loss": 0.0645, "step": 2170 }, { "epoch": 1.1004543160020193, - "grad_norm": 0.4522845447063446, + "grad_norm": 0.487343966960907, "learning_rate": 2.3397274103987886e-05, - "loss": 0.0672, + "loss": 0.0573, "step": 2180 }, { "epoch": 1.10550227158001, - "grad_norm": 0.33158665895462036, + "grad_norm": 0.3503382205963135, "learning_rate": 2.336698637051994e-05, - "loss": 0.083, + "loss": 0.0753, "step": 2190 }, { "epoch": 1.110550227158001, - "grad_norm": 0.4556925296783447, + "grad_norm": 0.750566840171814, "learning_rate": 2.3336698637051997e-05, - "loss": 0.0759, + "loss": 0.0703, "step": 2200 }, { "epoch": 1.115598182735992, - "grad_norm": 0.7028746008872986, + "grad_norm": 1.1437385082244873, "learning_rate": 2.330641090358405e-05, - "loss": 0.0742, + "loss": 0.0706, "step": 2210 }, { "epoch": 1.1206461383139827, - "grad_norm": 0.4525831639766693, + "grad_norm": 0.4508492648601532, "learning_rate": 2.3276123170116103e-05, - "loss": 0.0663, + "loss": 0.064, "step": 2220 }, { "epoch": 1.1256940938919737, - "grad_norm": 0.35331177711486816, + "grad_norm": 1.0053447484970093, "learning_rate": 2.324583543664816e-05, - "loss": 0.0667, + "loss": 0.0595, "step": 2230 }, { "epoch": 1.1307420494699647, - "grad_norm": 0.3286212682723999, + "grad_norm": 0.5974487662315369, "learning_rate": 2.321554770318021e-05, - "loss": 0.0665, + "loss": 0.0613, "step": 2240 }, { "epoch": 1.1357900050479555, - "grad_norm": 0.3492475152015686, + "grad_norm": 0.48302361369132996, "learning_rate": 2.3185259969712267e-05, - "loss": 0.0592, + "loss": 0.0553, "step": 2250 }, { "epoch": 1.1408379606259464, - "grad_norm": 0.4186830520629883, + "grad_norm": 0.7124462127685547, "learning_rate": 2.315497223624432e-05, - "loss": 0.0768, + "loss": 0.0628, "step": 2260 }, { "epoch": 1.1458859162039374, - "grad_norm": 0.43187859654426575, + "grad_norm": 0.8712441921234131, "learning_rate": 2.3124684502776374e-05, - "loss": 0.0747, + "loss": 0.066, "step": 2270 }, { "epoch": 1.1509338717819284, - "grad_norm": 0.35978901386260986, + "grad_norm": 0.7473580241203308, "learning_rate": 2.309439676930843e-05, - "loss": 0.0797, + "loss": 0.0687, "step": 2280 }, { "epoch": 1.1559818273599192, - "grad_norm": 0.6061636805534363, + "grad_norm": 0.8231186866760254, "learning_rate": 2.3064109035840484e-05, - "loss": 0.0757, + "loss": 0.0686, "step": 2290 }, { "epoch": 1.1610297829379101, - "grad_norm": 0.4342908561229706, + "grad_norm": 0.5205137729644775, "learning_rate": 2.303382130237254e-05, - "loss": 0.0747, + "loss": 0.0668, "step": 2300 }, { "epoch": 1.1660777385159011, - "grad_norm": 0.30125463008880615, + "grad_norm": 0.5173012614250183, "learning_rate": 2.3003533568904595e-05, - "loss": 0.078, + "loss": 0.0664, "step": 2310 }, { "epoch": 1.171125694093892, - "grad_norm": 0.4021187424659729, + "grad_norm": 0.6976504325866699, "learning_rate": 2.2973245835436648e-05, - "loss": 0.0714, + "loss": 0.067, "step": 2320 }, { "epoch": 1.1761736496718829, - "grad_norm": 0.2937578856945038, + "grad_norm": 0.7795687317848206, "learning_rate": 2.2942958101968705e-05, - "loss": 0.0684, + "loss": 0.0591, "step": 2330 }, { "epoch": 1.1812216052498739, - "grad_norm": 0.3535318970680237, + "grad_norm": 0.35292479395866394, "learning_rate": 2.291267036850076e-05, - "loss": 0.0761, + "loss": 0.0721, "step": 2340 }, { "epoch": 1.1862695608278648, - "grad_norm": 0.7090115547180176, + "grad_norm": 1.548770546913147, "learning_rate": 2.2882382635032815e-05, - "loss": 0.0677, + "loss": 0.0608, "step": 2350 }, { "epoch": 1.1913175164058556, - "grad_norm": 0.40100908279418945, + "grad_norm": 0.521295964717865, "learning_rate": 2.285209490156487e-05, - "loss": 0.0795, + "loss": 0.0735, "step": 2360 }, { "epoch": 1.1963654719838466, - "grad_norm": 0.5058602690696716, + "grad_norm": 0.6001691818237305, "learning_rate": 2.282180716809692e-05, - "loss": 0.0722, + "loss": 0.0646, "step": 2370 }, { "epoch": 1.2014134275618376, - "grad_norm": 0.5241690874099731, + "grad_norm": 0.9061608910560608, "learning_rate": 2.2791519434628976e-05, - "loss": 0.0621, + "loss": 0.0598, "step": 2380 }, { "epoch": 1.2064613831398283, - "grad_norm": 0.4490416646003723, + "grad_norm": 0.6509453654289246, "learning_rate": 2.276123170116103e-05, - "loss": 0.0669, + "loss": 0.0591, "step": 2390 }, { "epoch": 1.2115093387178193, - "grad_norm": 0.3629598021507263, + "grad_norm": 0.4685826301574707, "learning_rate": 2.2730943967693086e-05, - "loss": 0.0742, + "loss": 0.0675, "step": 2400 }, { "epoch": 1.2165572942958103, - "grad_norm": 0.3595810532569885, + "grad_norm": 0.4527621865272522, "learning_rate": 2.270065623422514e-05, - "loss": 0.0714, + "loss": 0.0635, "step": 2410 }, { "epoch": 1.221605249873801, - "grad_norm": 0.3362235128879547, + "grad_norm": 0.46990010142326355, "learning_rate": 2.2670368500757193e-05, - "loss": 0.0695, + "loss": 0.0609, "step": 2420 }, { "epoch": 1.226653205451792, - "grad_norm": 0.574418306350708, + "grad_norm": 0.7978981137275696, "learning_rate": 2.264008076728925e-05, - "loss": 0.0739, + "loss": 0.0682, "step": 2430 }, { "epoch": 1.231701161029783, - "grad_norm": 0.31744587421417236, + "grad_norm": 0.5001055598258972, "learning_rate": 2.2609793033821303e-05, - "loss": 0.0741, + "loss": 0.0657, "step": 2440 }, { "epoch": 1.2367491166077738, - "grad_norm": 0.4100383520126343, + "grad_norm": 0.7271714806556702, "learning_rate": 2.2579505300353356e-05, - "loss": 0.0744, + "loss": 0.0627, "step": 2450 }, { "epoch": 1.2417970721857647, - "grad_norm": 0.5664629936218262, + "grad_norm": 0.3601450026035309, "learning_rate": 2.2549217566885413e-05, - "loss": 0.0775, + "loss": 0.0649, "step": 2460 }, { "epoch": 1.2468450277637557, - "grad_norm": 0.3939385414123535, + "grad_norm": 0.6351629495620728, "learning_rate": 2.2518929833417467e-05, - "loss": 0.0694, + "loss": 0.0619, "step": 2470 }, { "epoch": 1.2518929833417465, - "grad_norm": 0.6710524559020996, + "grad_norm": 0.8523517847061157, "learning_rate": 2.2488642099949524e-05, - "loss": 0.0892, + "loss": 0.078, "step": 2480 }, { "epoch": 1.2569409389197375, - "grad_norm": 0.5967342853546143, + "grad_norm": 1.0878459215164185, "learning_rate": 2.2458354366481577e-05, - "loss": 0.0775, + "loss": 0.0636, "step": 2490 }, { "epoch": 1.2619888944977284, - "grad_norm": 0.4590104818344116, + "grad_norm": 0.6811727285385132, "learning_rate": 2.2428066633013627e-05, - "loss": 0.0797, + "loss": 0.0703, "step": 2500 }, { "epoch": 1.2670368500757192, - "grad_norm": 0.4371030330657959, + "grad_norm": 0.6043427586555481, "learning_rate": 2.2397778899545684e-05, - "loss": 0.0632, + "loss": 0.0587, "step": 2510 }, { "epoch": 1.2720848056537102, - "grad_norm": 0.4119449257850647, + "grad_norm": 0.6673144102096558, "learning_rate": 2.2367491166077737e-05, - "loss": 0.0734, + "loss": 0.0675, "step": 2520 }, { "epoch": 1.2771327612317012, - "grad_norm": 0.2977501153945923, + "grad_norm": 0.3510701358318329, "learning_rate": 2.2337203432609794e-05, - "loss": 0.0715, + "loss": 0.069, "step": 2530 }, { "epoch": 1.2821807168096921, - "grad_norm": 0.3859623968601227, + "grad_norm": 0.302438884973526, "learning_rate": 2.2306915699141848e-05, - "loss": 0.0661, + "loss": 0.0609, "step": 2540 }, { "epoch": 1.2872286723876831, - "grad_norm": 0.645246148109436, + "grad_norm": 0.8073706030845642, "learning_rate": 2.22766279656739e-05, - "loss": 0.0855, + "loss": 0.076, "step": 2550 }, { "epoch": 1.2922766279656739, - "grad_norm": 0.3807261288166046, + "grad_norm": 0.7314086556434631, "learning_rate": 2.2246340232205958e-05, - "loss": 0.0774, + "loss": 0.0676, "step": 2560 }, { "epoch": 1.2973245835436649, - "grad_norm": 0.48953214287757874, + "grad_norm": 0.6998431086540222, "learning_rate": 2.221605249873801e-05, - "loss": 0.0679, + "loss": 0.0594, "step": 2570 }, { "epoch": 1.3023725391216558, - "grad_norm": 0.511131763458252, + "grad_norm": 0.9340649843215942, "learning_rate": 2.2185764765270068e-05, - "loss": 0.0692, + "loss": 0.0601, "step": 2580 }, { "epoch": 1.3074204946996466, - "grad_norm": 0.2933480143547058, + "grad_norm": 0.5486651062965393, "learning_rate": 2.215547703180212e-05, - "loss": 0.0899, + "loss": 0.0752, "step": 2590 }, { "epoch": 1.3124684502776376, - "grad_norm": 0.38106125593185425, + "grad_norm": 0.3997117280960083, "learning_rate": 2.2125189298334175e-05, - "loss": 0.0742, + "loss": 0.0669, "step": 2600 }, { "epoch": 1.3175164058556286, - "grad_norm": 0.400388240814209, + "grad_norm": 0.6159607172012329, "learning_rate": 2.2094901564866232e-05, - "loss": 0.0714, + "loss": 0.0646, "step": 2610 }, { "epoch": 1.3225643614336193, - "grad_norm": 0.5102821588516235, + "grad_norm": 1.0720511674880981, "learning_rate": 2.2064613831398285e-05, - "loss": 0.0822, + "loss": 0.0697, "step": 2620 }, { "epoch": 1.3276123170116103, - "grad_norm": 0.4120141565799713, + "grad_norm": 0.6496064066886902, "learning_rate": 2.203432609793034e-05, - "loss": 0.0718, + "loss": 0.0642, "step": 2630 }, { "epoch": 1.3326602725896013, - "grad_norm": 0.2506933808326721, + "grad_norm": 0.5649464726448059, "learning_rate": 2.2004038364462392e-05, - "loss": 0.0703, + "loss": 0.0596, "step": 2640 }, { "epoch": 1.337708228167592, - "grad_norm": 0.4566921293735504, + "grad_norm": 0.5532758235931396, "learning_rate": 2.1973750630994446e-05, - "loss": 0.0724, + "loss": 0.0651, "step": 2650 }, { "epoch": 1.342756183745583, - "grad_norm": 0.41095855832099915, + "grad_norm": 0.4955766797065735, "learning_rate": 2.1943462897526503e-05, - "loss": 0.0752, + "loss": 0.0661, "step": 2660 }, { "epoch": 1.347804139323574, - "grad_norm": 0.4002370536327362, + "grad_norm": 0.5403378009796143, "learning_rate": 2.1913175164058556e-05, - "loss": 0.0719, + "loss": 0.068, "step": 2670 }, { "epoch": 1.3528520949015648, - "grad_norm": 0.28318819403648376, + "grad_norm": 0.8987810015678406, "learning_rate": 2.1882887430590613e-05, - "loss": 0.0609, + "loss": 0.0551, "step": 2680 }, { "epoch": 1.3579000504795558, - "grad_norm": 0.24140208959579468, + "grad_norm": 0.5531570911407471, "learning_rate": 2.1852599697122666e-05, - "loss": 0.0612, + "loss": 0.0554, "step": 2690 }, { "epoch": 1.3629480060575467, - "grad_norm": 0.39612990617752075, + "grad_norm": 0.8810332417488098, "learning_rate": 2.182231196365472e-05, - "loss": 0.0711, + "loss": 0.0683, "step": 2700 }, { "epoch": 1.3679959616355375, - "grad_norm": 0.48765823245048523, + "grad_norm": 0.8977289199829102, "learning_rate": 2.1792024230186777e-05, - "loss": 0.074, + "loss": 0.0682, "step": 2710 }, { "epoch": 1.3730439172135285, - "grad_norm": 0.44596147537231445, + "grad_norm": 0.6664491295814514, "learning_rate": 2.176173649671883e-05, - "loss": 0.0722, + "loss": 0.0652, "step": 2720 }, { "epoch": 1.3780918727915195, - "grad_norm": 0.3737035393714905, + "grad_norm": 0.7725427150726318, "learning_rate": 2.1731448763250883e-05, - "loss": 0.0811, + "loss": 0.0693, "step": 2730 }, { "epoch": 1.3831398283695102, - "grad_norm": 0.7131165266036987, + "grad_norm": 1.149824857711792, "learning_rate": 2.170116102978294e-05, - "loss": 0.0729, + "loss": 0.0697, "step": 2740 }, { "epoch": 1.3881877839475012, - "grad_norm": 0.4601830244064331, + "grad_norm": 0.8231659531593323, "learning_rate": 2.167087329631499e-05, - "loss": 0.0668, + "loss": 0.0586, "step": 2750 }, { "epoch": 1.3932357395254922, - "grad_norm": 0.4313521385192871, + "grad_norm": 0.5706813335418701, "learning_rate": 2.1640585562847047e-05, - "loss": 0.0767, + "loss": 0.0648, "step": 2760 }, { "epoch": 1.3982836951034832, - "grad_norm": 0.2787948548793793, + "grad_norm": 0.4602285623550415, "learning_rate": 2.16102978293791e-05, - "loss": 0.0703, + "loss": 0.0642, "step": 2770 }, { "epoch": 1.4033316506814741, - "grad_norm": 0.490631639957428, + "grad_norm": 0.5022104978561401, "learning_rate": 2.1580010095911154e-05, - "loss": 0.0662, + "loss": 0.0582, "step": 2780 }, { "epoch": 1.408379606259465, - "grad_norm": 0.2579457759857178, + "grad_norm": 0.3675612211227417, "learning_rate": 2.154972236244321e-05, - "loss": 0.0806, + "loss": 0.0685, "step": 2790 }, { "epoch": 1.4134275618374559, - "grad_norm": 0.6154518127441406, + "grad_norm": 0.5692434906959534, "learning_rate": 2.1519434628975264e-05, - "loss": 0.0715, + "loss": 0.0625, "step": 2800 }, { "epoch": 1.4184755174154469, - "grad_norm": 0.3302385210990906, + "grad_norm": 0.44433364272117615, "learning_rate": 2.148914689550732e-05, - "loss": 0.0716, + "loss": 0.0683, "step": 2810 }, { "epoch": 1.4235234729934376, - "grad_norm": 0.39062386751174927, + "grad_norm": 0.5225184559822083, "learning_rate": 2.1458859162039375e-05, - "loss": 0.0733, + "loss": 0.0676, "step": 2820 }, { "epoch": 1.4285714285714286, - "grad_norm": 0.42694535851478577, + "grad_norm": 1.125475287437439, "learning_rate": 2.1428571428571428e-05, - "loss": 0.066, + "loss": 0.0641, "step": 2830 }, { "epoch": 1.4336193841494196, - "grad_norm": 0.3877299129962921, + "grad_norm": 0.6783428192138672, "learning_rate": 2.1398283695103485e-05, - "loss": 0.0798, + "loss": 0.0735, "step": 2840 }, { "epoch": 1.4386673397274103, - "grad_norm": 0.45881980657577515, + "grad_norm": 0.6056823134422302, "learning_rate": 2.136799596163554e-05, - "loss": 0.0672, + "loss": 0.0607, "step": 2850 }, { "epoch": 1.4437152953054013, - "grad_norm": 0.5283980965614319, + "grad_norm": 0.7588714361190796, "learning_rate": 2.1337708228167595e-05, - "loss": 0.0694, + "loss": 0.0638, "step": 2860 }, { "epoch": 1.4487632508833923, - "grad_norm": 0.5722761750221252, + "grad_norm": 0.5353738069534302, "learning_rate": 2.130742049469965e-05, - "loss": 0.0673, + "loss": 0.0628, "step": 2870 }, { "epoch": 1.453811206461383, - "grad_norm": 0.3672831654548645, + "grad_norm": 0.3690322935581207, "learning_rate": 2.12771327612317e-05, - "loss": 0.0654, + "loss": 0.055, "step": 2880 }, { "epoch": 1.458859162039374, - "grad_norm": 0.3739102780818939, + "grad_norm": 0.5556847453117371, "learning_rate": 2.1246845027763756e-05, - "loss": 0.069, + "loss": 0.0672, "step": 2890 }, { "epoch": 1.463907117617365, - "grad_norm": 0.28704676032066345, + "grad_norm": 0.5658410787582397, "learning_rate": 2.121655729429581e-05, - "loss": 0.0706, + "loss": 0.0634, "step": 2900 }, { "epoch": 1.4689550731953558, - "grad_norm": 0.6112382411956787, + "grad_norm": 1.1000596284866333, "learning_rate": 2.1186269560827866e-05, - "loss": 0.0727, + "loss": 0.0648, "step": 2910 }, { "epoch": 1.4740030287733468, - "grad_norm": 0.28976988792419434, + "grad_norm": 0.5739458799362183, "learning_rate": 2.115598182735992e-05, - "loss": 0.0661, + "loss": 0.0622, "step": 2920 }, { "epoch": 1.4790509843513377, - "grad_norm": 0.3798251152038574, + "grad_norm": 0.9371837377548218, "learning_rate": 2.1125694093891973e-05, - "loss": 0.0721, + "loss": 0.067, "step": 2930 }, { "epoch": 1.4840989399293285, - "grad_norm": 0.495906800031662, + "grad_norm": 0.5997252464294434, "learning_rate": 2.109540636042403e-05, - "loss": 0.0732, + "loss": 0.0665, "step": 2940 }, { "epoch": 1.4891468955073195, - "grad_norm": 0.5157324075698853, + "grad_norm": 0.6729413866996765, "learning_rate": 2.1065118626956083e-05, - "loss": 0.0681, + "loss": 0.0576, "step": 2950 }, { "epoch": 1.4941948510853105, - "grad_norm": 0.40662431716918945, + "grad_norm": 0.796592652797699, "learning_rate": 2.103483089348814e-05, - "loss": 0.0714, + "loss": 0.0671, "step": 2960 }, { "epoch": 1.4992428066633012, - "grad_norm": 0.4008966386318207, + "grad_norm": 0.7947612404823303, "learning_rate": 2.1004543160020193e-05, - "loss": 0.077, + "loss": 0.0701, "step": 2970 }, { "epoch": 1.5042907622412924, - "grad_norm": 0.48692312836647034, + "grad_norm": 0.7790849208831787, "learning_rate": 2.0974255426552247e-05, - "loss": 0.073, + "loss": 0.065, "step": 2980 }, { "epoch": 1.5093387178192832, - "grad_norm": 0.3787757456302643, + "grad_norm": 0.5330706238746643, "learning_rate": 2.0943967693084304e-05, - "loss": 0.0659, + "loss": 0.0587, "step": 2990 }, { "epoch": 1.514386673397274, - "grad_norm": 0.5147730112075806, + "grad_norm": 1.0482598543167114, "learning_rate": 2.0913679959616357e-05, - "loss": 0.075, + "loss": 0.0696, "step": 3000 }, { "epoch": 1.5194346289752652, - "grad_norm": 0.24803757667541504, + "grad_norm": 0.46928080916404724, "learning_rate": 2.088339222614841e-05, - "loss": 0.0721, + "loss": 0.0668, "step": 3010 }, { "epoch": 1.524482584553256, - "grad_norm": 0.5188020467758179, + "grad_norm": 1.0525529384613037, "learning_rate": 2.0853104492680464e-05, - "loss": 0.0767, + "loss": 0.0664, "step": 3020 }, { "epoch": 1.529530540131247, - "grad_norm": 0.305984228849411, + "grad_norm": 0.43941500782966614, "learning_rate": 2.0822816759212517e-05, - "loss": 0.076, + "loss": 0.0642, "step": 3030 }, { "epoch": 1.5345784957092379, - "grad_norm": 0.47039300203323364, + "grad_norm": 0.6985353231430054, "learning_rate": 2.0792529025744574e-05, - "loss": 0.0779, + "loss": 0.068, "step": 3040 }, { "epoch": 1.5396264512872286, - "grad_norm": 0.28816476464271545, + "grad_norm": 0.6110888123512268, "learning_rate": 2.0762241292276628e-05, - "loss": 0.0704, + "loss": 0.0639, "step": 3050 }, { "epoch": 1.5446744068652196, - "grad_norm": 0.47483137249946594, + "grad_norm": 0.8250141739845276, "learning_rate": 2.073195355880868e-05, - "loss": 0.0677, + "loss": 0.0614, "step": 3060 }, { "epoch": 1.5497223624432106, - "grad_norm": 0.41244029998779297, + "grad_norm": 0.4882888197898865, "learning_rate": 2.0701665825340738e-05, - "loss": 0.0758, + "loss": 0.066, "step": 3070 }, { "epoch": 1.5547703180212014, - "grad_norm": 0.34873828291893005, + "grad_norm": 0.38679155707359314, "learning_rate": 2.067137809187279e-05, - "loss": 0.0724, + "loss": 0.0684, "step": 3080 }, { "epoch": 1.5598182735991923, - "grad_norm": 0.5220038294792175, + "grad_norm": 0.6574121117591858, "learning_rate": 2.0641090358404848e-05, - "loss": 0.0698, + "loss": 0.0666, "step": 3090 }, { "epoch": 1.5648662291771833, - "grad_norm": 0.4924815595149994, + "grad_norm": 0.48571038246154785, "learning_rate": 2.0610802624936902e-05, - "loss": 0.0665, + "loss": 0.0646, "step": 3100 }, { "epoch": 1.569914184755174, - "grad_norm": 0.4955058991909027, + "grad_norm": 0.8285214304924011, "learning_rate": 2.0580514891468955e-05, - "loss": 0.0682, + "loss": 0.0634, "step": 3110 }, { "epoch": 1.574962140333165, - "grad_norm": 0.3984096050262451, + "grad_norm": 0.5619475245475769, "learning_rate": 2.0550227158001012e-05, - "loss": 0.0714, + "loss": 0.0665, "step": 3120 }, { "epoch": 1.580010095911156, - "grad_norm": 0.7246518731117249, + "grad_norm": 0.47569337487220764, "learning_rate": 2.0519939424533065e-05, - "loss": 0.0763, + "loss": 0.0661, "step": 3130 }, { "epoch": 1.5850580514891468, - "grad_norm": 0.3734409809112549, + "grad_norm": 0.8858407139778137, "learning_rate": 2.048965169106512e-05, - "loss": 0.0762, + "loss": 0.0696, "step": 3140 }, { "epoch": 1.5901060070671378, - "grad_norm": 0.3476959466934204, + "grad_norm": 0.5578007698059082, "learning_rate": 2.0459363957597172e-05, - "loss": 0.06, + "loss": 0.0547, "step": 3150 }, { "epoch": 1.5951539626451288, - "grad_norm": 0.37709012627601624, + "grad_norm": 0.6875492334365845, "learning_rate": 2.0429076224129226e-05, - "loss": 0.0686, + "loss": 0.0608, "step": 3160 }, { "epoch": 1.6002019182231195, - "grad_norm": 0.4265778958797455, + "grad_norm": 0.5009766221046448, "learning_rate": 2.0398788490661283e-05, - "loss": 0.0739, + "loss": 0.0684, "step": 3170 }, { "epoch": 1.6052498738011105, - "grad_norm": 0.42841967940330505, + "grad_norm": 0.7467596530914307, "learning_rate": 2.0368500757193336e-05, - "loss": 0.071, + "loss": 0.0654, "step": 3180 }, { "epoch": 1.6102978293791015, - "grad_norm": 0.41701772809028625, + "grad_norm": 0.5688017010688782, "learning_rate": 2.0338213023725393e-05, - "loss": 0.0666, + "loss": 0.0594, "step": 3190 }, { "epoch": 1.6153457849570922, - "grad_norm": 0.38340792059898376, + "grad_norm": 0.9353786110877991, "learning_rate": 2.0307925290257446e-05, - "loss": 0.0705, + "loss": 0.0685, "step": 3200 }, { "epoch": 1.6203937405350834, - "grad_norm": 0.28962600231170654, + "grad_norm": 0.5310063362121582, "learning_rate": 2.02776375567895e-05, - "loss": 0.0652, + "loss": 0.0597, "step": 3210 }, { "epoch": 1.6254416961130742, - "grad_norm": 0.4337672293186188, + "grad_norm": 1.107693076133728, "learning_rate": 2.0247349823321557e-05, - "loss": 0.0767, + "loss": 0.0722, "step": 3220 }, { "epoch": 1.630489651691065, - "grad_norm": 0.2966071367263794, + "grad_norm": 0.688391923904419, "learning_rate": 2.021706208985361e-05, - "loss": 0.0761, + "loss": 0.0719, "step": 3230 }, { "epoch": 1.6355376072690562, - "grad_norm": 0.3643532693386078, + "grad_norm": 0.4255257546901703, "learning_rate": 2.0186774356385667e-05, - "loss": 0.074, + "loss": 0.0638, "step": 3240 }, { "epoch": 1.640585562847047, - "grad_norm": 0.4204406142234802, + "grad_norm": 0.6049216389656067, "learning_rate": 2.015648662291772e-05, - "loss": 0.0649, + "loss": 0.0555, "step": 3250 }, { "epoch": 1.645633518425038, - "grad_norm": 0.3872784376144409, + "grad_norm": 0.6898351311683655, "learning_rate": 2.012619888944977e-05, - "loss": 0.072, + "loss": 0.0599, "step": 3260 }, { "epoch": 1.650681474003029, - "grad_norm": 0.5608325600624084, + "grad_norm": 0.6150475144386292, "learning_rate": 2.0095911155981827e-05, - "loss": 0.073, + "loss": 0.0664, "step": 3270 }, { "epoch": 1.6557294295810197, - "grad_norm": 0.40342044830322266, + "grad_norm": 0.5084889531135559, "learning_rate": 2.006562342251388e-05, - "loss": 0.067, + "loss": 0.0574, "step": 3280 }, { "epoch": 1.6607773851590106, - "grad_norm": 0.4224311411380768, + "grad_norm": 0.9478010535240173, "learning_rate": 2.0035335689045938e-05, - "loss": 0.0704, + "loss": 0.0619, "step": 3290 }, { "epoch": 1.6658253407370016, - "grad_norm": 0.4088759422302246, + "grad_norm": 1.1725986003875732, "learning_rate": 2.000504795557799e-05, - "loss": 0.0734, + "loss": 0.0672, "step": 3300 }, { "epoch": 1.6708732963149924, - "grad_norm": 0.5260732769966125, + "grad_norm": 0.8932427763938904, "learning_rate": 1.9974760222110044e-05, - "loss": 0.0654, + "loss": 0.0604, "step": 3310 }, { "epoch": 1.6759212518929834, - "grad_norm": 0.2915021777153015, + "grad_norm": 0.4670265316963196, "learning_rate": 1.99444724886421e-05, - "loss": 0.0755, + "loss": 0.0658, "step": 3320 }, { "epoch": 1.6809692074709743, - "grad_norm": 0.43440404534339905, + "grad_norm": 0.518844485282898, "learning_rate": 1.9914184755174155e-05, - "loss": 0.0757, + "loss": 0.068, "step": 3330 }, { "epoch": 1.686017163048965, - "grad_norm": 0.4600958526134491, + "grad_norm": 0.7717642784118652, "learning_rate": 1.988389702170621e-05, - "loss": 0.0655, + "loss": 0.0594, "step": 3340 }, { "epoch": 1.691065118626956, - "grad_norm": 0.5585376620292664, + "grad_norm": 0.9715004563331604, "learning_rate": 1.9853609288238265e-05, - "loss": 0.0693, + "loss": 0.0651, "step": 3350 }, { "epoch": 1.696113074204947, - "grad_norm": 0.5592395663261414, + "grad_norm": 0.7362111210823059, "learning_rate": 1.982332155477032e-05, - "loss": 0.0767, + "loss": 0.0664, "step": 3360 }, { "epoch": 1.7011610297829378, - "grad_norm": 0.4244596064090729, + "grad_norm": 0.480751633644104, "learning_rate": 1.9793033821302375e-05, - "loss": 0.0669, + "loss": 0.0609, "step": 3370 }, { "epoch": 1.7062089853609288, - "grad_norm": 0.31476616859436035, + "grad_norm": 0.31802135705947876, "learning_rate": 1.976274608783443e-05, - "loss": 0.0688, + "loss": 0.0658, "step": 3380 }, { "epoch": 1.7112569409389198, - "grad_norm": 0.4726528525352478, + "grad_norm": 0.5285906195640564, "learning_rate": 1.973245835436648e-05, - "loss": 0.0668, + "loss": 0.0606, "step": 3390 }, { "epoch": 1.7163048965169105, - "grad_norm": 0.4156901240348816, + "grad_norm": 0.7230745553970337, "learning_rate": 1.9702170620898536e-05, - "loss": 0.0728, + "loss": 0.0618, "step": 3400 }, { "epoch": 1.7213528520949015, - "grad_norm": 0.4030071794986725, + "grad_norm": 0.566842257976532, "learning_rate": 1.967188288743059e-05, - "loss": 0.0684, + "loss": 0.0623, "step": 3410 }, { "epoch": 1.7264008076728925, - "grad_norm": 0.6460726857185364, + "grad_norm": 0.9110565781593323, "learning_rate": 1.9641595153962646e-05, - "loss": 0.0776, + "loss": 0.0712, "step": 3420 }, { "epoch": 1.7314487632508833, - "grad_norm": 0.3956555426120758, + "grad_norm": 0.5621252059936523, "learning_rate": 1.96113074204947e-05, - "loss": 0.0726, + "loss": 0.0624, "step": 3430 }, { "epoch": 1.7364967188288745, - "grad_norm": 0.39375460147857666, + "grad_norm": 0.6153441667556763, "learning_rate": 1.9581019687026753e-05, - "loss": 0.0686, + "loss": 0.0679, "step": 3440 }, { "epoch": 1.7415446744068652, - "grad_norm": 0.46966952085494995, + "grad_norm": 0.7521117925643921, "learning_rate": 1.955073195355881e-05, - "loss": 0.0735, + "loss": 0.073, "step": 3450 }, { "epoch": 1.746592629984856, - "grad_norm": 0.39831027388572693, + "grad_norm": 0.7781336307525635, "learning_rate": 1.9520444220090863e-05, - "loss": 0.0605, + "loss": 0.0576, "step": 3460 }, { "epoch": 1.7516405855628472, - "grad_norm": 0.5071054697036743, + "grad_norm": 0.5981038808822632, "learning_rate": 1.949015648662292e-05, - "loss": 0.0666, + "loss": 0.0558, "step": 3470 }, { "epoch": 1.756688541140838, - "grad_norm": 0.3473348617553711, + "grad_norm": 0.5716273188591003, "learning_rate": 1.9459868753154973e-05, - "loss": 0.068, + "loss": 0.0615, "step": 3480 }, { "epoch": 1.761736496718829, - "grad_norm": 0.47857144474983215, + "grad_norm": 1.0969016551971436, "learning_rate": 1.9429581019687027e-05, - "loss": 0.0747, + "loss": 0.0695, "step": 3490 }, { "epoch": 1.76678445229682, - "grad_norm": 0.4897679090499878, + "grad_norm": 0.4081050157546997, "learning_rate": 1.9399293286219084e-05, - "loss": 0.0634, + "loss": 0.0569, "step": 3500 }, { "epoch": 1.7718324078748107, - "grad_norm": 0.546002209186554, + "grad_norm": 0.6996564269065857, "learning_rate": 1.9369005552751137e-05, - "loss": 0.0669, + "loss": 0.0615, "step": 3510 }, { "epoch": 1.7768803634528016, - "grad_norm": 0.3376496732234955, + "grad_norm": 0.7040839791297913, "learning_rate": 1.933871781928319e-05, - "loss": 0.064, + "loss": 0.0609, "step": 3520 }, { "epoch": 1.7819283190307926, - "grad_norm": 0.5528877377510071, + "grad_norm": 0.6955099105834961, "learning_rate": 1.9308430085815244e-05, - "loss": 0.07, + "loss": 0.0596, "step": 3530 }, { "epoch": 1.7869762746087834, - "grad_norm": 0.5076607465744019, + "grad_norm": 0.49400514364242554, "learning_rate": 1.9278142352347298e-05, - "loss": 0.0603, + "loss": 0.0531, "step": 3540 }, { "epoch": 1.7920242301867744, - "grad_norm": 0.41241809725761414, + "grad_norm": 0.6069557666778564, "learning_rate": 1.9247854618879354e-05, - "loss": 0.0745, + "loss": 0.0663, "step": 3550 }, { "epoch": 1.7970721857647654, - "grad_norm": 0.38720259070396423, + "grad_norm": 0.859195351600647, "learning_rate": 1.9217566885411408e-05, - "loss": 0.0604, + "loss": 0.0539, "step": 3560 }, { "epoch": 1.802120141342756, - "grad_norm": 0.5174373388290405, + "grad_norm": 0.8939780592918396, "learning_rate": 1.9187279151943465e-05, - "loss": 0.0716, + "loss": 0.0668, "step": 3570 }, { "epoch": 1.807168096920747, - "grad_norm": 0.41615715622901917, + "grad_norm": 0.7258803248405457, "learning_rate": 1.9156991418475518e-05, - "loss": 0.0623, + "loss": 0.0585, "step": 3580 }, { "epoch": 1.812216052498738, - "grad_norm": 0.31420108675956726, + "grad_norm": 0.38900288939476013, "learning_rate": 1.912670368500757e-05, - "loss": 0.072, + "loss": 0.0686, "step": 3590 }, { "epoch": 1.8172640080767288, - "grad_norm": 0.44855940341949463, + "grad_norm": 0.38506415486335754, "learning_rate": 1.909641595153963e-05, - "loss": 0.0711, + "loss": 0.0625, "step": 3600 }, { "epoch": 1.8223119636547198, - "grad_norm": 0.4198800325393677, + "grad_norm": 0.5235381722450256, "learning_rate": 1.9066128218071682e-05, - "loss": 0.0669, + "loss": 0.0597, "step": 3610 }, { "epoch": 1.8273599192327108, - "grad_norm": 0.3376767933368683, + "grad_norm": 0.4835253357887268, "learning_rate": 1.903584048460374e-05, - "loss": 0.0729, + "loss": 0.0667, "step": 3620 }, { "epoch": 1.8324078748107016, - "grad_norm": 0.3325952887535095, + "grad_norm": 0.6338971257209778, "learning_rate": 1.9005552751135792e-05, - "loss": 0.0742, + "loss": 0.0635, "step": 3630 }, { "epoch": 1.8374558303886925, - "grad_norm": 0.4255514442920685, + "grad_norm": 1.0663739442825317, "learning_rate": 1.8975265017667846e-05, - "loss": 0.0767, + "loss": 0.0744, "step": 3640 }, { "epoch": 1.8425037859666835, - "grad_norm": 0.49874627590179443, + "grad_norm": 0.6655123829841614, "learning_rate": 1.89449772841999e-05, - "loss": 0.0705, + "loss": 0.0654, "step": 3650 }, { "epoch": 1.8475517415446743, - "grad_norm": 0.44393444061279297, + "grad_norm": 0.582611083984375, "learning_rate": 1.8914689550731952e-05, - "loss": 0.0683, + "loss": 0.0661, "step": 3660 }, { "epoch": 1.8525996971226655, - "grad_norm": 0.33301976323127747, + "grad_norm": 0.6533240079879761, "learning_rate": 1.888440181726401e-05, - "loss": 0.0702, + "loss": 0.0613, "step": 3670 }, { "epoch": 1.8576476527006562, - "grad_norm": 0.3944764733314514, + "grad_norm": 0.4978090524673462, "learning_rate": 1.8854114083796063e-05, - "loss": 0.0678, + "loss": 0.0627, "step": 3680 }, { "epoch": 1.862695608278647, - "grad_norm": 0.6915509700775146, + "grad_norm": 0.7043678164482117, "learning_rate": 1.8823826350328116e-05, - "loss": 0.0669, + "loss": 0.0578, "step": 3690 }, { "epoch": 1.8677435638566382, - "grad_norm": 0.42169591784477234, + "grad_norm": 0.7941015362739563, "learning_rate": 1.8793538616860173e-05, - "loss": 0.0651, + "loss": 0.0622, "step": 3700 }, { "epoch": 1.872791519434629, - "grad_norm": 0.34414538741111755, + "grad_norm": 0.4428146183490753, "learning_rate": 1.8763250883392226e-05, - "loss": 0.0677, + "loss": 0.0613, "step": 3710 }, { "epoch": 1.87783947501262, - "grad_norm": 0.5671964883804321, + "grad_norm": 0.6554248929023743, "learning_rate": 1.873296314992428e-05, - "loss": 0.0677, + "loss": 0.0643, "step": 3720 }, { "epoch": 1.882887430590611, - "grad_norm": 0.3801959455013275, + "grad_norm": 0.48168087005615234, "learning_rate": 1.8702675416456337e-05, - "loss": 0.0635, + "loss": 0.055, "step": 3730 }, { "epoch": 1.8879353861686017, - "grad_norm": 0.35626405477523804, + "grad_norm": 0.509777307510376, "learning_rate": 1.867238768298839e-05, - "loss": 0.0631, + "loss": 0.058, "step": 3740 }, { "epoch": 1.8929833417465927, - "grad_norm": 0.33528760075569153, + "grad_norm": 0.5132505893707275, "learning_rate": 1.8642099949520447e-05, - "loss": 0.0663, + "loss": 0.0623, "step": 3750 }, { "epoch": 1.8980312973245836, - "grad_norm": 0.36265772581100464, + "grad_norm": 0.7474920749664307, "learning_rate": 1.86118122160525e-05, - "loss": 0.0547, + "loss": 0.0489, "step": 3760 }, { "epoch": 1.9030792529025744, - "grad_norm": 0.5244046449661255, + "grad_norm": 1.0404279232025146, "learning_rate": 1.8581524482584554e-05, - "loss": 0.0795, + "loss": 0.0687, "step": 3770 }, { "epoch": 1.9081272084805654, - "grad_norm": 0.4042259156703949, + "grad_norm": 0.6796401143074036, "learning_rate": 1.8551236749116607e-05, - "loss": 0.0689, + "loss": 0.0679, "step": 3780 }, { "epoch": 1.9131751640585564, - "grad_norm": 0.48803991079330444, + "grad_norm": 0.9071604609489441, "learning_rate": 1.852094901564866e-05, - "loss": 0.0795, + "loss": 0.0725, "step": 3790 }, { "epoch": 1.9182231196365471, - "grad_norm": 0.37799277901649475, + "grad_norm": 0.7023878693580627, "learning_rate": 1.8490661282180718e-05, - "loss": 0.0767, + "loss": 0.0702, "step": 3800 }, { "epoch": 1.923271075214538, - "grad_norm": 0.3289439082145691, + "grad_norm": 0.7312602996826172, "learning_rate": 1.846037354871277e-05, - "loss": 0.0618, + "loss": 0.0532, "step": 3810 }, { "epoch": 1.928319030792529, - "grad_norm": 0.3983497619628906, + "grad_norm": 0.6224806904792786, "learning_rate": 1.8430085815244825e-05, - "loss": 0.0673, + "loss": 0.0638, "step": 3820 }, { "epoch": 1.9333669863705198, - "grad_norm": 0.5559443831443787, + "grad_norm": 0.7255429029464722, "learning_rate": 1.839979808177688e-05, - "loss": 0.0705, + "loss": 0.0641, "step": 3830 }, { "epoch": 1.9384149419485108, - "grad_norm": 0.4850088059902191, + "grad_norm": 0.584086000919342, "learning_rate": 1.8369510348308935e-05, - "loss": 0.0766, + "loss": 0.0692, "step": 3840 }, { "epoch": 1.9434628975265018, - "grad_norm": 0.3563697338104248, + "grad_norm": 0.4826408326625824, "learning_rate": 1.833922261484099e-05, - "loss": 0.0699, + "loss": 0.0627, "step": 3850 }, { "epoch": 1.9485108531044926, - "grad_norm": 0.3636411428451538, + "grad_norm": 0.5803766846656799, "learning_rate": 1.8308934881373045e-05, - "loss": 0.0701, + "loss": 0.0635, "step": 3860 }, { "epoch": 1.9535588086824835, - "grad_norm": 0.47166189551353455, + "grad_norm": 0.7855948209762573, "learning_rate": 1.82786471479051e-05, - "loss": 0.0743, + "loss": 0.0659, "step": 3870 }, { "epoch": 1.9586067642604745, - "grad_norm": 0.4918811619281769, + "grad_norm": 0.5980962514877319, "learning_rate": 1.8248359414437155e-05, - "loss": 0.0749, + "loss": 0.0651, "step": 3880 }, { "epoch": 1.9636547198384653, - "grad_norm": 0.4318666458129883, + "grad_norm": 0.6440220475196838, "learning_rate": 1.821807168096921e-05, - "loss": 0.0668, + "loss": 0.0639, "step": 3890 }, { "epoch": 1.9687026754164565, - "grad_norm": 0.4516458809375763, + "grad_norm": 0.7104585766792297, "learning_rate": 1.8187783947501262e-05, - "loss": 0.0666, + "loss": 0.056, "step": 3900 }, { "epoch": 1.9737506309944473, - "grad_norm": 0.31433552503585815, + "grad_norm": 0.7219833731651306, "learning_rate": 1.8157496214033316e-05, - "loss": 0.0694, + "loss": 0.0574, "step": 3910 }, { "epoch": 1.978798586572438, - "grad_norm": 0.3614991009235382, + "grad_norm": 0.5478711724281311, "learning_rate": 1.812720848056537e-05, - "loss": 0.0721, + "loss": 0.0657, "step": 3920 }, { "epoch": 1.9838465421504292, - "grad_norm": 0.3617209792137146, + "grad_norm": 0.6501402854919434, "learning_rate": 1.8096920747097426e-05, - "loss": 0.0719, + "loss": 0.0641, "step": 3930 }, { "epoch": 1.98889449772842, - "grad_norm": 0.6318678855895996, + "grad_norm": 0.7231020331382751, "learning_rate": 1.806663301362948e-05, - "loss": 0.0722, + "loss": 0.0692, "step": 3940 }, { "epoch": 1.993942453306411, - "grad_norm": 0.4043138027191162, + "grad_norm": 0.6480854749679565, "learning_rate": 1.8036345280161536e-05, - "loss": 0.0654, + "loss": 0.0632, "step": 3950 }, { "epoch": 1.998990408884402, - "grad_norm": 0.5654613375663757, + "grad_norm": 0.4803590774536133, "learning_rate": 1.800605754669359e-05, - "loss": 0.0747, + "loss": 0.0678, "step": 3960 }, { "epoch": 2.0, "eval_f1": 0.9705180789481339, - "eval_loss": 0.04101279005408287, - "eval_runtime": 739.7396, - "eval_samples_per_second": 278.831, - "eval_steps_per_second": 2.179, + "eval_loss": 0.0446692518889904, + "eval_runtime": 584.4017, + "eval_samples_per_second": 352.946, + "eval_steps_per_second": 2.758, "step": 3962 }, { "epoch": 2.0040383644623927, - "grad_norm": 0.44467952847480774, + "grad_norm": 0.680855393409729, "learning_rate": 1.7975769813225643e-05, - "loss": 0.0641, + "loss": 0.0567, "step": 3970 }, { "epoch": 2.0090863200403835, - "grad_norm": 0.5186964869499207, + "grad_norm": 0.47991836071014404, "learning_rate": 1.79454820797577e-05, - "loss": 0.0603, + "loss": 0.0562, "step": 3980 }, { "epoch": 2.0141342756183747, - "grad_norm": 0.37641072273254395, + "grad_norm": 0.8615912199020386, "learning_rate": 1.7915194346289753e-05, - "loss": 0.0705, + "loss": 0.0679, "step": 3990 }, { "epoch": 2.0191822311963654, - "grad_norm": 0.3755345642566681, + "grad_norm": 0.5970327258110046, "learning_rate": 1.7884906612821807e-05, - "loss": 0.0611, + "loss": 0.053, "step": 4000 }, { "epoch": 2.024230186774356, - "grad_norm": 0.5084393620491028, + "grad_norm": 0.5402255654335022, "learning_rate": 1.7854618879353864e-05, - "loss": 0.0673, + "loss": 0.0574, "step": 4010 }, { "epoch": 2.0292781423523474, - "grad_norm": 0.3902832865715027, + "grad_norm": 0.5014840364456177, "learning_rate": 1.7824331145885917e-05, - "loss": 0.0709, + "loss": 0.0649, "step": 4020 }, { "epoch": 2.034326097930338, - "grad_norm": 0.3876974582672119, + "grad_norm": 0.7147154808044434, "learning_rate": 1.779404341241797e-05, - "loss": 0.073, + "loss": 0.0687, "step": 4030 }, { "epoch": 2.0393740535083293, - "grad_norm": 0.3503962755203247, + "grad_norm": 0.5346552729606628, "learning_rate": 1.7763755678950024e-05, - "loss": 0.0686, + "loss": 0.0638, "step": 4040 }, { "epoch": 2.04442200908632, - "grad_norm": 0.4520651698112488, + "grad_norm": 0.5596599578857422, "learning_rate": 1.7733467945482078e-05, - "loss": 0.0756, + "loss": 0.0669, "step": 4050 }, { "epoch": 2.049469964664311, - "grad_norm": 0.4055824875831604, + "grad_norm": 0.40591198205947876, "learning_rate": 1.7703180212014134e-05, - "loss": 0.0663, + "loss": 0.0564, "step": 4060 }, { "epoch": 2.054517920242302, - "grad_norm": 0.4180123805999756, + "grad_norm": 0.609337568283081, "learning_rate": 1.7672892478546188e-05, - "loss": 0.0624, + "loss": 0.0576, "step": 4070 }, { "epoch": 2.059565875820293, - "grad_norm": 0.4090680181980133, + "grad_norm": 0.5424002408981323, "learning_rate": 1.7642604745078245e-05, - "loss": 0.0641, + "loss": 0.0585, "step": 4080 }, { "epoch": 2.0646138313982836, - "grad_norm": 0.47140470147132874, + "grad_norm": 0.9868631362915039, "learning_rate": 1.7612317011610298e-05, - "loss": 0.0707, + "loss": 0.0684, "step": 4090 }, { "epoch": 2.069661786976275, - "grad_norm": 0.39671817421913147, + "grad_norm": 0.6492929458618164, "learning_rate": 1.758202927814235e-05, - "loss": 0.0706, + "loss": 0.0638, "step": 4100 }, { "epoch": 2.0747097425542655, - "grad_norm": 0.2987823486328125, + "grad_norm": 0.7837685346603394, "learning_rate": 1.755174154467441e-05, - "loss": 0.071, + "loss": 0.0675, "step": 4110 }, { "epoch": 2.0797576981322563, - "grad_norm": 0.325086385011673, + "grad_norm": 0.5961639881134033, "learning_rate": 1.7521453811206462e-05, - "loss": 0.0643, + "loss": 0.0575, "step": 4120 }, { "epoch": 2.0848056537102475, - "grad_norm": 0.4358964264392853, + "grad_norm": 0.4114825427532196, "learning_rate": 1.749116607773852e-05, - "loss": 0.0714, + "loss": 0.0659, "step": 4130 }, { "epoch": 2.0898536092882383, - "grad_norm": 0.28630056977272034, + "grad_norm": 0.4567316174507141, "learning_rate": 1.7460878344270572e-05, - "loss": 0.0756, + "loss": 0.0661, "step": 4140 }, { "epoch": 2.094901564866229, - "grad_norm": 0.3596285581588745, + "grad_norm": 0.6321776509284973, "learning_rate": 1.7430590610802626e-05, - "loss": 0.0755, + "loss": 0.066, "step": 4150 }, { "epoch": 2.0999495204442202, - "grad_norm": 0.5699533224105835, + "grad_norm": 0.8911116719245911, "learning_rate": 1.740030287733468e-05, - "loss": 0.069, + "loss": 0.0585, "step": 4160 }, { "epoch": 2.104997476022211, - "grad_norm": 0.584018886089325, + "grad_norm": 0.4896914064884186, "learning_rate": 1.7370015143866733e-05, - "loss": 0.0693, + "loss": 0.0612, "step": 4170 }, { "epoch": 2.1100454316002017, - "grad_norm": 0.4305306375026703, + "grad_norm": 0.7571251392364502, "learning_rate": 1.733972741039879e-05, - "loss": 0.0671, + "loss": 0.0563, "step": 4180 }, { "epoch": 2.115093387178193, - "grad_norm": 0.49273547530174255, + "grad_norm": 0.9115099310874939, "learning_rate": 1.7309439676930843e-05, - "loss": 0.0747, + "loss": 0.0698, "step": 4190 }, { "epoch": 2.1201413427561837, - "grad_norm": 0.32480379939079285, + "grad_norm": 0.5267325639724731, "learning_rate": 1.7279151943462896e-05, - "loss": 0.0677, + "loss": 0.0604, "step": 4200 }, { "epoch": 2.1251892983341745, - "grad_norm": 0.45263049006462097, + "grad_norm": 0.6659255623817444, "learning_rate": 1.7248864209994953e-05, - "loss": 0.0681, + "loss": 0.0627, "step": 4210 }, { "epoch": 2.1302372539121657, - "grad_norm": 0.528186559677124, + "grad_norm": 0.89178466796875, "learning_rate": 1.7218576476527007e-05, - "loss": 0.0622, + "loss": 0.0552, "step": 4220 }, { "epoch": 2.1352852094901564, - "grad_norm": 0.35689589381217957, + "grad_norm": 0.4615127742290497, "learning_rate": 1.7188288743059063e-05, - "loss": 0.0615, + "loss": 0.0557, "step": 4230 }, { "epoch": 2.1403331650681476, - "grad_norm": 0.37193727493286133, + "grad_norm": 0.6602596044540405, "learning_rate": 1.7158001009591117e-05, - "loss": 0.0587, + "loss": 0.0548, "step": 4240 }, { "epoch": 2.1453811206461384, - "grad_norm": 0.4723437428474426, + "grad_norm": 0.7081389427185059, "learning_rate": 1.712771327612317e-05, - "loss": 0.0687, + "loss": 0.0606, "step": 4250 }, { "epoch": 2.150429076224129, - "grad_norm": 0.5692839622497559, + "grad_norm": 0.5817338824272156, "learning_rate": 1.7097425542655227e-05, - "loss": 0.0739, + "loss": 0.0606, "step": 4260 }, { "epoch": 2.1554770318021204, - "grad_norm": 0.35744959115982056, + "grad_norm": 0.4401390254497528, "learning_rate": 1.706713780918728e-05, - "loss": 0.0699, + "loss": 0.0607, "step": 4270 }, { "epoch": 2.160524987380111, - "grad_norm": 0.4813705384731293, + "grad_norm": 1.0127087831497192, "learning_rate": 1.7036850075719337e-05, - "loss": 0.0667, + "loss": 0.0615, "step": 4280 }, { "epoch": 2.165572942958102, - "grad_norm": 0.37739092111587524, + "grad_norm": 0.5774319171905518, "learning_rate": 1.7006562342251387e-05, - "loss": 0.06, + "loss": 0.0525, "step": 4290 }, { "epoch": 2.170620898536093, - "grad_norm": 0.28146716952323914, + "grad_norm": 0.47623270750045776, "learning_rate": 1.697627460878344e-05, - "loss": 0.0595, + "loss": 0.0591, "step": 4300 }, { "epoch": 2.175668854114084, - "grad_norm": 0.49551817774772644, + "grad_norm": 0.7083358764648438, "learning_rate": 1.6945986875315498e-05, - "loss": 0.07, + "loss": 0.0631, "step": 4310 }, { "epoch": 2.1807168096920746, - "grad_norm": 0.4708079993724823, + "grad_norm": 0.6057601571083069, "learning_rate": 1.691569914184755e-05, - "loss": 0.07, + "loss": 0.0595, "step": 4320 }, { "epoch": 2.185764765270066, - "grad_norm": 0.4068326950073242, + "grad_norm": 0.8947880864143372, "learning_rate": 1.6885411408379605e-05, - "loss": 0.0713, + "loss": 0.0666, "step": 4330 }, { "epoch": 2.1908127208480566, - "grad_norm": 0.31832921504974365, + "grad_norm": 0.6460204720497131, "learning_rate": 1.685512367491166e-05, - "loss": 0.0716, + "loss": 0.0669, "step": 4340 }, { "epoch": 2.1958606764260473, - "grad_norm": 0.5986499786376953, + "grad_norm": 0.9029686450958252, "learning_rate": 1.6824835941443715e-05, - "loss": 0.0606, + "loss": 0.0607, "step": 4350 }, { "epoch": 2.2009086320040385, - "grad_norm": 0.4528530240058899, + "grad_norm": 0.5201438665390015, "learning_rate": 1.6794548207975772e-05, - "loss": 0.0611, + "loss": 0.0514, "step": 4360 }, { "epoch": 2.2059565875820293, - "grad_norm": 0.30561351776123047, + "grad_norm": 0.39414748549461365, "learning_rate": 1.6764260474507825e-05, - "loss": 0.0657, + "loss": 0.0581, "step": 4370 }, { "epoch": 2.21100454316002, - "grad_norm": 0.2965313792228699, + "grad_norm": 0.642257034778595, "learning_rate": 1.673397274103988e-05, - "loss": 0.0714, + "loss": 0.0611, "step": 4380 }, { "epoch": 2.2160524987380112, - "grad_norm": 0.5420868992805481, + "grad_norm": 0.7225739359855652, "learning_rate": 1.6703685007571935e-05, - "loss": 0.0648, + "loss": 0.0569, "step": 4390 }, { "epoch": 2.221100454316002, - "grad_norm": 0.5012361407279968, + "grad_norm": 0.6948502659797668, "learning_rate": 1.667339727410399e-05, - "loss": 0.0709, + "loss": 0.0652, "step": 4400 }, { "epoch": 2.2261484098939928, - "grad_norm": 0.29113131761550903, + "grad_norm": 0.5755937695503235, "learning_rate": 1.6643109540636042e-05, - "loss": 0.0591, + "loss": 0.0566, "step": 4410 }, { "epoch": 2.231196365471984, - "grad_norm": 0.37094447016716003, + "grad_norm": 0.4249815046787262, "learning_rate": 1.6612821807168096e-05, - "loss": 0.0708, + "loss": 0.0642, "step": 4420 }, { "epoch": 2.2362443210499747, - "grad_norm": 0.3133958876132965, + "grad_norm": 0.5442089438438416, "learning_rate": 1.658253407370015e-05, - "loss": 0.0797, + "loss": 0.0685, "step": 4430 }, { "epoch": 2.2412922766279655, - "grad_norm": 0.41249901056289673, + "grad_norm": 0.8074495792388916, "learning_rate": 1.6552246340232206e-05, - "loss": 0.0615, + "loss": 0.0558, "step": 4440 }, { "epoch": 2.2463402322059567, - "grad_norm": 0.5728883743286133, + "grad_norm": 0.8810071349143982, "learning_rate": 1.652195860676426e-05, - "loss": 0.0742, + "loss": 0.0685, "step": 4450 }, { "epoch": 2.2513881877839474, - "grad_norm": 0.29768499732017517, + "grad_norm": 0.5399377942085266, "learning_rate": 1.6491670873296316e-05, - "loss": 0.0675, + "loss": 0.0607, "step": 4460 }, { "epoch": 2.256436143361938, - "grad_norm": 0.4960065484046936, + "grad_norm": 0.7178535461425781, "learning_rate": 1.646138313982837e-05, - "loss": 0.0571, + "loss": 0.0504, "step": 4470 }, { "epoch": 2.2614840989399294, - "grad_norm": 0.3015303313732147, + "grad_norm": 0.4272046983242035, "learning_rate": 1.6431095406360423e-05, - "loss": 0.0616, + "loss": 0.0583, "step": 4480 }, { "epoch": 2.26653205451792, - "grad_norm": 0.4689812958240509, + "grad_norm": 0.6807524561882019, "learning_rate": 1.640080767289248e-05, - "loss": 0.0722, + "loss": 0.0639, "step": 4490 }, { "epoch": 2.271580010095911, - "grad_norm": 0.42969536781311035, + "grad_norm": 0.5895000100135803, "learning_rate": 1.6370519939424534e-05, - "loss": 0.07, + "loss": 0.0675, "step": 4500 }, { "epoch": 2.276627965673902, - "grad_norm": 0.5754514336585999, + "grad_norm": 0.6640876531600952, "learning_rate": 1.634023220595659e-05, - "loss": 0.0668, + "loss": 0.0603, "step": 4510 }, { "epoch": 2.281675921251893, - "grad_norm": 0.4731753170490265, + "grad_norm": 0.4367890954017639, "learning_rate": 1.6309944472488644e-05, - "loss": 0.0599, + "loss": 0.0517, "step": 4520 }, { "epoch": 2.2867238768298837, - "grad_norm": 0.4508737027645111, + "grad_norm": 1.082713007926941, "learning_rate": 1.6279656739020697e-05, - "loss": 0.0632, + "loss": 0.0524, "step": 4530 }, { "epoch": 2.291771832407875, - "grad_norm": 0.3547825813293457, + "grad_norm": 0.5186300277709961, "learning_rate": 1.624936900555275e-05, - "loss": 0.066, + "loss": 0.0566, "step": 4540 }, { "epoch": 2.2968197879858656, - "grad_norm": 0.6295393109321594, + "grad_norm": 1.2778280973434448, "learning_rate": 1.6219081272084804e-05, - "loss": 0.0595, + "loss": 0.0531, "step": 4550 }, { "epoch": 2.301867743563857, - "grad_norm": 0.44660690426826477, + "grad_norm": 0.46757417917251587, "learning_rate": 1.618879353861686e-05, - "loss": 0.0651, + "loss": 0.0637, "step": 4560 }, { "epoch": 2.3069156991418476, - "grad_norm": 0.4014616310596466, + "grad_norm": 0.6333388686180115, "learning_rate": 1.6158505805148914e-05, - "loss": 0.0652, + "loss": 0.0557, "step": 4570 }, { "epoch": 2.3119636547198383, - "grad_norm": 0.45291078090667725, + "grad_norm": 0.4005846381187439, "learning_rate": 1.6128218071680968e-05, - "loss": 0.0556, + "loss": 0.0512, "step": 4580 }, { "epoch": 2.3170116102978295, - "grad_norm": 0.41079530119895935, + "grad_norm": 1.0479962825775146, "learning_rate": 1.6097930338213025e-05, - "loss": 0.0683, + "loss": 0.0639, "step": 4590 }, { "epoch": 2.3220595658758203, - "grad_norm": 0.48419833183288574, + "grad_norm": 1.1324669122695923, "learning_rate": 1.6067642604745078e-05, - "loss": 0.0732, + "loss": 0.0642, "step": 4600 }, { "epoch": 2.327107521453811, - "grad_norm": 0.46579691767692566, + "grad_norm": 0.827215313911438, "learning_rate": 1.6037354871277135e-05, - "loss": 0.0717, + "loss": 0.0654, "step": 4610 }, { "epoch": 2.3321554770318023, - "grad_norm": 0.44318434596061707, + "grad_norm": 0.8228656649589539, "learning_rate": 1.600706713780919e-05, - "loss": 0.07, + "loss": 0.0648, "step": 4620 }, { "epoch": 2.337203432609793, - "grad_norm": 0.3508608937263489, + "grad_norm": 0.5897762775421143, "learning_rate": 1.5976779404341242e-05, - "loss": 0.0608, + "loss": 0.0546, "step": 4630 }, { "epoch": 2.342251388187784, - "grad_norm": 0.3729645311832428, + "grad_norm": 0.6223641633987427, "learning_rate": 1.59464916708733e-05, - "loss": 0.0764, + "loss": 0.0712, "step": 4640 }, { "epoch": 2.347299343765775, - "grad_norm": 0.3809719681739807, + "grad_norm": 0.5593187808990479, "learning_rate": 1.5916203937405352e-05, - "loss": 0.0795, + "loss": 0.0707, "step": 4650 }, { "epoch": 2.3523472993437657, - "grad_norm": 0.42823466658592224, + "grad_norm": 0.9349427223205566, "learning_rate": 1.5885916203937406e-05, - "loss": 0.0629, + "loss": 0.0581, "step": 4660 }, { "epoch": 2.3573952549217565, - "grad_norm": 0.45598268508911133, + "grad_norm": 0.47101134061813354, "learning_rate": 1.585562847046946e-05, - "loss": 0.0747, + "loss": 0.0688, "step": 4670 }, { "epoch": 2.3624432104997477, - "grad_norm": 0.3352445662021637, + "grad_norm": 0.5073738098144531, "learning_rate": 1.5825340737001513e-05, - "loss": 0.0724, + "loss": 0.0678, "step": 4680 }, { "epoch": 2.3674911660777385, - "grad_norm": 0.44051024317741394, + "grad_norm": 0.5324171781539917, "learning_rate": 1.579505300353357e-05, - "loss": 0.0668, + "loss": 0.0614, "step": 4690 }, { "epoch": 2.3725391216557297, - "grad_norm": 0.3988921344280243, + "grad_norm": 0.662965714931488, "learning_rate": 1.5764765270065623e-05, - "loss": 0.0616, + "loss": 0.0507, "step": 4700 }, { "epoch": 2.3775870772337204, - "grad_norm": 0.42814871668815613, + "grad_norm": 0.6482782959938049, "learning_rate": 1.5734477536597676e-05, - "loss": 0.0574, + "loss": 0.0537, "step": 4710 }, { "epoch": 2.382635032811711, - "grad_norm": 0.43511560559272766, + "grad_norm": 1.0039052963256836, "learning_rate": 1.5704189803129733e-05, - "loss": 0.0647, + "loss": 0.059, "step": 4720 }, { "epoch": 2.3876829883897024, - "grad_norm": 0.4476068317890167, + "grad_norm": 0.8546132445335388, "learning_rate": 1.5673902069661787e-05, - "loss": 0.0739, + "loss": 0.0691, "step": 4730 }, { "epoch": 2.392730943967693, - "grad_norm": 0.289435476064682, + "grad_norm": 0.4903261363506317, "learning_rate": 1.5643614336193843e-05, - "loss": 0.0572, + "loss": 0.0535, "step": 4740 }, { "epoch": 2.397778899545684, - "grad_norm": 0.3826657235622406, + "grad_norm": 0.8538033962249756, "learning_rate": 1.5613326602725897e-05, - "loss": 0.0694, + "loss": 0.0616, "step": 4750 }, { "epoch": 2.402826855123675, - "grad_norm": 0.4220544397830963, + "grad_norm": 0.7978336215019226, "learning_rate": 1.558303886925795e-05, - "loss": 0.0638, + "loss": 0.0613, "step": 4760 }, { "epoch": 2.407874810701666, - "grad_norm": 0.3492651581764221, + "grad_norm": 0.6981778740882874, "learning_rate": 1.5552751135790007e-05, - "loss": 0.0679, + "loss": 0.0646, "step": 4770 }, { "epoch": 2.4129227662796566, - "grad_norm": 0.5811386704444885, + "grad_norm": 0.8517895936965942, "learning_rate": 1.552246340232206e-05, - "loss": 0.0787, + "loss": 0.0705, "step": 4780 }, { "epoch": 2.417970721857648, - "grad_norm": 0.2967993915081024, + "grad_norm": 0.4087599813938141, "learning_rate": 1.5492175668854117e-05, - "loss": 0.0711, + "loss": 0.0638, "step": 4790 }, { "epoch": 2.4230186774356386, - "grad_norm": 0.33070528507232666, + "grad_norm": 0.3779948651790619, "learning_rate": 1.5461887935386168e-05, - "loss": 0.0594, + "loss": 0.0524, "step": 4800 }, { "epoch": 2.4280666330136293, - "grad_norm": 0.42890259623527527, + "grad_norm": 0.42263171076774597, "learning_rate": 1.543160020191822e-05, - "loss": 0.0627, + "loss": 0.0623, "step": 4810 }, { "epoch": 2.4331145885916206, - "grad_norm": 0.39528656005859375, + "grad_norm": 0.5812351107597351, "learning_rate": 1.5401312468450278e-05, - "loss": 0.0621, + "loss": 0.0573, "step": 4820 }, { "epoch": 2.4381625441696113, - "grad_norm": 0.3624139130115509, + "grad_norm": 0.6073315143585205, "learning_rate": 1.537102473498233e-05, - "loss": 0.0652, + "loss": 0.057, "step": 4830 }, { "epoch": 2.443210499747602, - "grad_norm": 0.4606042206287384, + "grad_norm": 0.8706870079040527, "learning_rate": 1.5340737001514388e-05, - "loss": 0.0636, + "loss": 0.0606, "step": 4840 }, { "epoch": 2.4482584553255933, - "grad_norm": 0.5022151470184326, + "grad_norm": 0.9355966448783875, "learning_rate": 1.531044926804644e-05, - "loss": 0.0643, + "loss": 0.0563, "step": 4850 }, { "epoch": 2.453306410903584, - "grad_norm": 0.41519105434417725, + "grad_norm": 0.6352431774139404, "learning_rate": 1.5280161534578495e-05, - "loss": 0.0601, + "loss": 0.0537, "step": 4860 }, { "epoch": 2.458354366481575, - "grad_norm": 0.3175613284111023, + "grad_norm": 0.5970965623855591, "learning_rate": 1.524987380111055e-05, - "loss": 0.0681, + "loss": 0.0663, "step": 4870 }, { "epoch": 2.463402322059566, - "grad_norm": 0.4150908887386322, + "grad_norm": 0.40907353162765503, "learning_rate": 1.5219586067642605e-05, - "loss": 0.0577, + "loss": 0.0502, "step": 4880 }, { "epoch": 2.4684502776375568, - "grad_norm": 0.27239662408828735, + "grad_norm": 0.5130166411399841, "learning_rate": 1.518929833417466e-05, - "loss": 0.0611, + "loss": 0.0538, "step": 4890 }, { "epoch": 2.4734982332155475, - "grad_norm": 0.6037119030952454, + "grad_norm": 0.9824861288070679, "learning_rate": 1.5159010600706716e-05, - "loss": 0.0618, + "loss": 0.0518, "step": 4900 }, { "epoch": 2.4785461887935387, - "grad_norm": 0.28230902552604675, + "grad_norm": 0.6424157023429871, "learning_rate": 1.512872286723877e-05, - "loss": 0.0661, + "loss": 0.0599, "step": 4910 }, { "epoch": 2.4835941443715295, - "grad_norm": 0.42984738945961, + "grad_norm": 0.8797338008880615, "learning_rate": 1.5098435133770824e-05, - "loss": 0.0621, + "loss": 0.0534, "step": 4920 }, { "epoch": 2.4886420999495202, - "grad_norm": 0.5079028010368347, + "grad_norm": 1.0275185108184814, "learning_rate": 1.5068147400302876e-05, - "loss": 0.0669, + "loss": 0.063, "step": 4930 }, { "epoch": 2.4936900555275114, - "grad_norm": 0.3618210554122925, + "grad_norm": 0.6370276808738708, "learning_rate": 1.5037859666834931e-05, - "loss": 0.0664, + "loss": 0.0584, "step": 4940 }, { "epoch": 2.498738011105502, - "grad_norm": 0.278143972158432, + "grad_norm": 0.5083595514297485, "learning_rate": 1.5007571933366986e-05, - "loss": 0.0669, + "loss": 0.0635, "step": 4950 }, { "epoch": 2.503785966683493, - "grad_norm": 0.40146970748901367, + "grad_norm": 0.8423396348953247, "learning_rate": 1.4977284199899041e-05, - "loss": 0.0644, + "loss": 0.0593, "step": 4960 }, { "epoch": 2.508833922261484, - "grad_norm": 0.40683749318122864, + "grad_norm": 0.6133778691291809, "learning_rate": 1.4946996466431095e-05, - "loss": 0.068, + "loss": 0.0652, "step": 4970 }, { "epoch": 2.513881877839475, - "grad_norm": 0.4395790994167328, + "grad_norm": 0.5626839995384216, "learning_rate": 1.491670873296315e-05, - "loss": 0.0692, + "loss": 0.061, "step": 4980 }, { "epoch": 2.5189298334174657, - "grad_norm": 0.2304450422525406, + "grad_norm": 0.6379786729812622, "learning_rate": 1.4886420999495205e-05, - "loss": 0.0627, + "loss": 0.0583, "step": 4990 }, { "epoch": 2.523977788995457, - "grad_norm": 0.3872699737548828, + "grad_norm": 0.39859360456466675, "learning_rate": 1.485613326602726e-05, - "loss": 0.0631, + "loss": 0.057, "step": 5000 }, { "epoch": 2.5290257445734476, - "grad_norm": 0.3607660233974457, + "grad_norm": 0.4674101173877716, "learning_rate": 1.4825845532559315e-05, - "loss": 0.0615, + "loss": 0.0584, "step": 5010 }, { "epoch": 2.5340737001514384, - "grad_norm": 0.4235934913158417, + "grad_norm": 0.6018111705780029, "learning_rate": 1.4795557799091367e-05, - "loss": 0.0676, + "loss": 0.0606, "step": 5020 }, { "epoch": 2.5391216557294296, - "grad_norm": 0.4623524248600006, + "grad_norm": 0.4932622015476227, "learning_rate": 1.4765270065623422e-05, - "loss": 0.062, + "loss": 0.0551, "step": 5030 }, { "epoch": 2.5441696113074204, - "grad_norm": 0.4494507610797882, + "grad_norm": 0.5576731562614441, "learning_rate": 1.4734982332155477e-05, - "loss": 0.0638, + "loss": 0.0562, "step": 5040 }, { "epoch": 2.5492175668854116, - "grad_norm": 0.3828752636909485, + "grad_norm": 0.5910426378250122, "learning_rate": 1.4704694598687533e-05, - "loss": 0.0637, + "loss": 0.0632, "step": 5050 }, { "epoch": 2.5542655224634023, - "grad_norm": 0.35024183988571167, + "grad_norm": 0.42830216884613037, "learning_rate": 1.4674406865219586e-05, - "loss": 0.0604, + "loss": 0.0589, "step": 5060 }, { "epoch": 2.559313478041393, - "grad_norm": 0.3416309356689453, + "grad_norm": 0.657305896282196, "learning_rate": 1.4644119131751641e-05, - "loss": 0.0721, + "loss": 0.0666, "step": 5070 }, { "epoch": 2.5643614336193843, - "grad_norm": 0.4081193804740906, + "grad_norm": 0.5498583912849426, "learning_rate": 1.4613831398283696e-05, - "loss": 0.0701, + "loss": 0.0677, "step": 5080 }, { "epoch": 2.569409389197375, - "grad_norm": 0.9290459156036377, + "grad_norm": 1.5641086101531982, "learning_rate": 1.458354366481575e-05, - "loss": 0.067, + "loss": 0.0618, "step": 5090 }, { "epoch": 2.5744573447753663, - "grad_norm": 0.3432193398475647, + "grad_norm": 0.576878011226654, "learning_rate": 1.4553255931347805e-05, - "loss": 0.063, + "loss": 0.0596, "step": 5100 }, { "epoch": 2.579505300353357, - "grad_norm": 0.45165976881980896, + "grad_norm": 0.6855084896087646, "learning_rate": 1.4522968197879858e-05, - "loss": 0.0725, + "loss": 0.0684, "step": 5110 }, { "epoch": 2.5845532559313478, - "grad_norm": 0.37954217195510864, + "grad_norm": 0.46760818362236023, "learning_rate": 1.4492680464411913e-05, - "loss": 0.0673, + "loss": 0.0628, "step": 5120 }, { "epoch": 2.589601211509339, - "grad_norm": 0.3993614614009857, + "grad_norm": 0.4708857834339142, "learning_rate": 1.4462392730943969e-05, - "loss": 0.0691, + "loss": 0.0656, "step": 5130 }, { "epoch": 2.5946491670873297, - "grad_norm": 0.48905062675476074, + "grad_norm": 0.957336962223053, "learning_rate": 1.4432104997476024e-05, - "loss": 0.0567, + "loss": 0.0527, "step": 5140 }, { "epoch": 2.5996971226653205, - "grad_norm": 0.4493992328643799, + "grad_norm": 0.6079381704330444, "learning_rate": 1.4401817264008077e-05, - "loss": 0.0603, + "loss": 0.0499, "step": 5150 }, { "epoch": 2.6047450782433117, - "grad_norm": 0.34471943974494934, + "grad_norm": 0.644965410232544, "learning_rate": 1.437152953054013e-05, - "loss": 0.0607, + "loss": 0.0567, "step": 5160 }, { "epoch": 2.6097930338213025, - "grad_norm": 0.3988341689109802, + "grad_norm": 0.9058682322502136, "learning_rate": 1.4341241797072186e-05, - "loss": 0.0614, + "loss": 0.059, "step": 5170 }, { "epoch": 2.614840989399293, - "grad_norm": 0.4356638193130493, + "grad_norm": 0.6784061789512634, "learning_rate": 1.4310954063604241e-05, - "loss": 0.0589, + "loss": 0.0577, "step": 5180 }, { "epoch": 2.6198889449772844, - "grad_norm": 0.3544706106185913, + "grad_norm": 0.7699759602546692, "learning_rate": 1.4280666330136296e-05, - "loss": 0.0596, + "loss": 0.056, "step": 5190 }, { "epoch": 2.624936900555275, - "grad_norm": 0.4624828100204468, + "grad_norm": 1.0204094648361206, "learning_rate": 1.425037859666835e-05, - "loss": 0.0712, + "loss": 0.0595, "step": 5200 }, { "epoch": 2.629984856133266, - "grad_norm": 0.3146689236164093, + "grad_norm": 0.3317660987377167, "learning_rate": 1.4220090863200403e-05, - "loss": 0.0636, + "loss": 0.0579, "step": 5210 }, { "epoch": 2.635032811711257, - "grad_norm": 0.5862840414047241, + "grad_norm": 0.7586853504180908, "learning_rate": 1.4189803129732458e-05, - "loss": 0.0658, + "loss": 0.0612, "step": 5220 }, { "epoch": 2.640080767289248, - "grad_norm": 0.3758508265018463, + "grad_norm": 0.43295013904571533, "learning_rate": 1.4159515396264513e-05, - "loss": 0.063, + "loss": 0.0584, "step": 5230 }, { "epoch": 2.6451287228672387, - "grad_norm": 0.3946121335029602, + "grad_norm": 0.9083705544471741, "learning_rate": 1.4129227662796568e-05, - "loss": 0.0772, + "loss": 0.0698, "step": 5240 }, { "epoch": 2.65017667844523, - "grad_norm": 0.4428150951862335, + "grad_norm": 0.6299885511398315, "learning_rate": 1.4098939929328622e-05, - "loss": 0.064, + "loss": 0.0602, "step": 5250 }, { "epoch": 2.6552246340232206, - "grad_norm": 0.3693462312221527, + "grad_norm": 0.538589358329773, "learning_rate": 1.4068652195860677e-05, - "loss": 0.0691, + "loss": 0.0634, "step": 5260 }, { "epoch": 2.6602725896012114, - "grad_norm": 0.604390025138855, + "grad_norm": 0.5712538361549377, "learning_rate": 1.4038364462392732e-05, - "loss": 0.067, + "loss": 0.0625, "step": 5270 }, { "epoch": 2.6653205451792026, - "grad_norm": 0.32199588418006897, + "grad_norm": 0.5739433765411377, "learning_rate": 1.4008076728924786e-05, - "loss": 0.069, + "loss": 0.0647, "step": 5280 }, { "epoch": 2.6703685007571933, - "grad_norm": 0.40118536353111267, + "grad_norm": 0.5050386786460876, "learning_rate": 1.397778899545684e-05, - "loss": 0.0598, + "loss": 0.0592, "step": 5290 }, { "epoch": 2.675416456335184, - "grad_norm": 0.4204835295677185, + "grad_norm": 0.41851407289505005, "learning_rate": 1.3947501261988894e-05, - "loss": 0.0649, + "loss": 0.0581, "step": 5300 }, { "epoch": 2.6804644119131753, - "grad_norm": 0.45677709579467773, + "grad_norm": 0.5866436958312988, "learning_rate": 1.391721352852095e-05, - "loss": 0.0666, + "loss": 0.0656, "step": 5310 }, { "epoch": 2.685512367491166, - "grad_norm": 0.3687781095504761, + "grad_norm": 0.47498345375061035, "learning_rate": 1.3886925795053004e-05, - "loss": 0.0716, + "loss": 0.0657, "step": 5320 }, { "epoch": 2.690560323069157, - "grad_norm": 0.5170356631278992, + "grad_norm": 0.5748500227928162, "learning_rate": 1.385663806158506e-05, - "loss": 0.0638, + "loss": 0.0588, "step": 5330 }, { "epoch": 2.695608278647148, - "grad_norm": 0.4643763303756714, + "grad_norm": 0.685787558555603, "learning_rate": 1.3826350328117113e-05, - "loss": 0.0696, + "loss": 0.0621, "step": 5340 }, { "epoch": 2.700656234225139, - "grad_norm": 0.3444504141807556, + "grad_norm": 0.5321753025054932, "learning_rate": 1.3796062594649166e-05, - "loss": 0.0739, + "loss": 0.0665, "step": 5350 }, { "epoch": 2.7057041898031295, - "grad_norm": 0.4813980758190155, + "grad_norm": 0.4687628746032715, "learning_rate": 1.3765774861181222e-05, - "loss": 0.0647, + "loss": 0.0622, "step": 5360 }, { "epoch": 2.7107521453811207, - "grad_norm": 0.3534546494483948, + "grad_norm": 0.6931032538414001, "learning_rate": 1.3735487127713277e-05, - "loss": 0.0575, + "loss": 0.0542, "step": 5370 }, { "epoch": 2.7158001009591115, - "grad_norm": 0.41960790753364563, + "grad_norm": 0.6347541213035583, "learning_rate": 1.3705199394245332e-05, - "loss": 0.0709, + "loss": 0.0618, "step": 5380 }, { "epoch": 2.7208480565371023, - "grad_norm": 0.38305145502090454, + "grad_norm": 0.5090097188949585, "learning_rate": 1.3674911660777385e-05, - "loss": 0.0606, + "loss": 0.0577, "step": 5390 }, { "epoch": 2.7258960121150935, - "grad_norm": 0.5087040662765503, + "grad_norm": 0.557161808013916, "learning_rate": 1.3644623927309439e-05, - "loss": 0.0607, + "loss": 0.0485, "step": 5400 }, { "epoch": 2.7309439676930842, - "grad_norm": 0.37414073944091797, + "grad_norm": 0.7229135036468506, "learning_rate": 1.3614336193841494e-05, - "loss": 0.0682, + "loss": 0.0642, "step": 5410 }, { "epoch": 2.735991923271075, - "grad_norm": 0.39554670453071594, + "grad_norm": 0.7802084684371948, "learning_rate": 1.3584048460373549e-05, - "loss": 0.079, + "loss": 0.0721, "step": 5420 }, { "epoch": 2.741039878849066, - "grad_norm": 0.357322633266449, + "grad_norm": 0.8350520730018616, "learning_rate": 1.3553760726905604e-05, - "loss": 0.0529, + "loss": 0.05, "step": 5430 }, { "epoch": 2.746087834427057, - "grad_norm": 0.3612682819366455, + "grad_norm": 0.24809196591377258, "learning_rate": 1.3523472993437658e-05, - "loss": 0.0678, + "loss": 0.0577, "step": 5440 }, { "epoch": 2.7511357900050477, - "grad_norm": 0.49319979548454285, + "grad_norm": 0.5501554608345032, "learning_rate": 1.3493185259969713e-05, - "loss": 0.0654, + "loss": 0.0613, "step": 5450 }, { "epoch": 2.756183745583039, - "grad_norm": 0.3630322515964508, + "grad_norm": 0.6459994912147522, "learning_rate": 1.3462897526501768e-05, - "loss": 0.0614, + "loss": 0.0545, "step": 5460 }, { "epoch": 2.7612317011610297, - "grad_norm": 0.6561079025268555, + "grad_norm": 1.0892735719680786, "learning_rate": 1.3432609793033821e-05, - "loss": 0.0609, + "loss": 0.0517, "step": 5470 }, { "epoch": 2.7662796567390204, - "grad_norm": 0.49902087450027466, + "grad_norm": 0.8553361296653748, "learning_rate": 1.3402322059565877e-05, - "loss": 0.0604, + "loss": 0.055, "step": 5480 }, { "epoch": 2.7713276123170116, - "grad_norm": 0.4306737184524536, + "grad_norm": 0.5909534692764282, "learning_rate": 1.337203432609793e-05, - "loss": 0.0642, + "loss": 0.0583, "step": 5490 }, { "epoch": 2.7763755678950024, - "grad_norm": 0.2556377351284027, + "grad_norm": 0.3620651662349701, "learning_rate": 1.3341746592629985e-05, - "loss": 0.0581, + "loss": 0.053, "step": 5500 }, { "epoch": 2.7814235234729936, - "grad_norm": 0.37852397561073303, + "grad_norm": 0.6525430083274841, "learning_rate": 1.331145885916204e-05, - "loss": 0.0655, + "loss": 0.0667, "step": 5510 }, { "epoch": 2.7864714790509844, - "grad_norm": 0.4397842288017273, + "grad_norm": 0.6129066944122314, "learning_rate": 1.3281171125694095e-05, - "loss": 0.0683, + "loss": 0.0578, "step": 5520 }, { "epoch": 2.791519434628975, - "grad_norm": 0.3145972788333893, + "grad_norm": 0.6374188661575317, "learning_rate": 1.3250883392226147e-05, - "loss": 0.065, + "loss": 0.0598, "step": 5530 }, { "epoch": 2.7965673902069663, - "grad_norm": 0.4314529597759247, + "grad_norm": 0.6404274702072144, "learning_rate": 1.3220595658758202e-05, - "loss": 0.0729, + "loss": 0.064, "step": 5540 }, { "epoch": 2.801615345784957, - "grad_norm": 0.43847423791885376, + "grad_norm": 0.3882500231266022, "learning_rate": 1.3190307925290257e-05, - "loss": 0.0603, + "loss": 0.0556, "step": 5550 }, { "epoch": 2.8066633013629483, - "grad_norm": 0.7666720151901245, + "grad_norm": 0.827498197555542, "learning_rate": 1.3160020191822313e-05, - "loss": 0.0638, + "loss": 0.056, "step": 5560 }, { "epoch": 2.811711256940939, - "grad_norm": 0.3244626224040985, + "grad_norm": 0.5474889874458313, "learning_rate": 1.3129732458354368e-05, - "loss": 0.0612, + "loss": 0.0559, "step": 5570 }, { "epoch": 2.81675921251893, - "grad_norm": 0.4250195324420929, + "grad_norm": 0.7505003809928894, "learning_rate": 1.3099444724886421e-05, - "loss": 0.0655, + "loss": 0.0562, "step": 5580 }, { "epoch": 2.821807168096921, - "grad_norm": 0.49263009428977966, + "grad_norm": 0.7723977565765381, "learning_rate": 1.3069156991418476e-05, - "loss": 0.0761, + "loss": 0.0711, "step": 5590 }, { "epoch": 2.8268551236749118, - "grad_norm": 0.5163371562957764, + "grad_norm": 0.5930567979812622, "learning_rate": 1.303886925795053e-05, - "loss": 0.071, + "loss": 0.0666, "step": 5600 }, { "epoch": 2.8319030792529025, - "grad_norm": 0.4262700378894806, + "grad_norm": 0.9205801486968994, "learning_rate": 1.3008581524482585e-05, - "loss": 0.0674, + "loss": 0.0635, "step": 5610 }, { "epoch": 2.8369510348308937, - "grad_norm": 0.3641040027141571, + "grad_norm": 0.6520891189575195, "learning_rate": 1.297829379101464e-05, - "loss": 0.061, + "loss": 0.0503, "step": 5620 }, { "epoch": 2.8419989904088845, - "grad_norm": 0.38265225291252136, + "grad_norm": 0.697742760181427, "learning_rate": 1.2948006057546693e-05, - "loss": 0.0621, + "loss": 0.0527, "step": 5630 }, { "epoch": 2.8470469459868752, - "grad_norm": 0.33575159311294556, + "grad_norm": 0.5600337386131287, "learning_rate": 1.2917718324078749e-05, - "loss": 0.0701, + "loss": 0.0658, "step": 5640 }, { "epoch": 2.8520949015648664, - "grad_norm": 0.4343346357345581, + "grad_norm": 0.7648780941963196, "learning_rate": 1.2887430590610804e-05, - "loss": 0.0528, + "loss": 0.0503, "step": 5650 }, { "epoch": 2.857142857142857, - "grad_norm": 0.3838566839694977, + "grad_norm": 0.44580090045928955, "learning_rate": 1.2857142857142857e-05, - "loss": 0.0595, + "loss": 0.0569, "step": 5660 }, { "epoch": 2.862190812720848, - "grad_norm": 0.5549935698509216, + "grad_norm": 0.6274628043174744, "learning_rate": 1.2826855123674912e-05, - "loss": 0.0619, + "loss": 0.0544, "step": 5670 }, { "epoch": 2.867238768298839, - "grad_norm": 0.7633477449417114, + "grad_norm": 0.5967713594436646, "learning_rate": 1.2796567390206966e-05, - "loss": 0.055, + "loss": 0.049, "step": 5680 }, { "epoch": 2.87228672387683, - "grad_norm": 0.4539719223976135, + "grad_norm": 0.49563518166542053, "learning_rate": 1.2766279656739021e-05, - "loss": 0.0663, + "loss": 0.0637, "step": 5690 }, { "epoch": 2.8773346794548207, - "grad_norm": 0.3440587818622589, + "grad_norm": 0.5065841674804688, "learning_rate": 1.2735991923271076e-05, - "loss": 0.0678, + "loss": 0.0635, "step": 5700 }, { "epoch": 2.882382635032812, - "grad_norm": 0.36671680212020874, + "grad_norm": 0.4228837490081787, "learning_rate": 1.2705704189803131e-05, - "loss": 0.0641, + "loss": 0.0561, "step": 5710 }, { "epoch": 2.8874305906108026, - "grad_norm": 0.2760653793811798, + "grad_norm": 0.36254429817199707, "learning_rate": 1.2675416456335183e-05, - "loss": 0.0602, + "loss": 0.0564, "step": 5720 }, { "epoch": 2.8924785461887934, - "grad_norm": 0.3290037214756012, + "grad_norm": 0.6964749097824097, "learning_rate": 1.2645128722867238e-05, - "loss": 0.0584, + "loss": 0.0566, "step": 5730 }, { "epoch": 2.8975265017667846, - "grad_norm": 0.6113518476486206, + "grad_norm": 1.2399131059646606, "learning_rate": 1.2614840989399293e-05, - "loss": 0.0583, + "loss": 0.0528, "step": 5740 }, { "epoch": 2.9025744573447754, - "grad_norm": 0.3404606580734253, + "grad_norm": 0.45011046528816223, "learning_rate": 1.2584553255931348e-05, - "loss": 0.0663, + "loss": 0.0605, "step": 5750 }, { "epoch": 2.907622412922766, - "grad_norm": 0.3430802822113037, + "grad_norm": 0.6450422406196594, "learning_rate": 1.2554265522463404e-05, - "loss": 0.0633, + "loss": 0.0579, "step": 5760 }, { "epoch": 2.9126703685007573, - "grad_norm": 0.3817721903324127, + "grad_norm": 0.6685008406639099, "learning_rate": 1.2523977788995457e-05, - "loss": 0.0667, + "loss": 0.0596, "step": 5770 }, { "epoch": 2.917718324078748, - "grad_norm": 0.49006789922714233, + "grad_norm": 0.7710725665092468, "learning_rate": 1.2493690055527512e-05, - "loss": 0.0662, + "loss": 0.063, "step": 5780 }, { "epoch": 2.922766279656739, - "grad_norm": 0.41557011008262634, + "grad_norm": 0.6229269504547119, "learning_rate": 1.2463402322059566e-05, - "loss": 0.0588, + "loss": 0.0542, "step": 5790 }, { "epoch": 2.92781423523473, - "grad_norm": 0.28697067499160767, + "grad_norm": 0.41364407539367676, "learning_rate": 1.243311458859162e-05, - "loss": 0.0599, + "loss": 0.0588, "step": 5800 }, { "epoch": 2.932862190812721, - "grad_norm": 0.39947792887687683, + "grad_norm": 0.5546961426734924, "learning_rate": 1.2402826855123676e-05, - "loss": 0.068, + "loss": 0.0607, "step": 5810 }, { "epoch": 2.9379101463907116, - "grad_norm": 0.31385132670402527, + "grad_norm": 0.6814476251602173, "learning_rate": 1.237253912165573e-05, - "loss": 0.0676, + "loss": 0.0587, "step": 5820 }, { "epoch": 2.9429581019687028, - "grad_norm": 0.482799768447876, + "grad_norm": 0.7745892405509949, "learning_rate": 1.2342251388187784e-05, - "loss": 0.0536, + "loss": 0.0484, "step": 5830 }, { "epoch": 2.9480060575466935, - "grad_norm": 0.556859016418457, + "grad_norm": 0.9947149157524109, "learning_rate": 1.231196365471984e-05, - "loss": 0.0644, + "loss": 0.056, "step": 5840 }, { "epoch": 2.9530540131246843, - "grad_norm": 0.2774258255958557, + "grad_norm": 0.599892258644104, "learning_rate": 1.2281675921251893e-05, - "loss": 0.0633, + "loss": 0.0603, "step": 5850 }, { "epoch": 2.9581019687026755, - "grad_norm": 0.3613818883895874, + "grad_norm": 0.4991750121116638, "learning_rate": 1.2251388187783947e-05, - "loss": 0.0647, + "loss": 0.0603, "step": 5860 }, { "epoch": 2.9631499242806663, - "grad_norm": 0.3277703821659088, + "grad_norm": 0.44697603583335876, "learning_rate": 1.2221100454316002e-05, - "loss": 0.0648, + "loss": 0.0614, "step": 5870 }, { "epoch": 2.968197879858657, - "grad_norm": 0.331059068441391, + "grad_norm": 0.34608447551727295, "learning_rate": 1.2190812720848057e-05, - "loss": 0.0715, + "loss": 0.0633, "step": 5880 }, { "epoch": 2.973245835436648, - "grad_norm": 0.4360307455062866, + "grad_norm": 0.6991161108016968, "learning_rate": 1.2160524987380112e-05, - "loss": 0.0771, + "loss": 0.0713, "step": 5890 }, { "epoch": 2.978293791014639, - "grad_norm": 0.5486271977424622, + "grad_norm": 0.7053156495094299, "learning_rate": 1.2130237253912167e-05, - "loss": 0.067, + "loss": 0.0642, "step": 5900 }, { "epoch": 2.9833417465926297, - "grad_norm": 0.316173255443573, + "grad_norm": 0.4541454315185547, "learning_rate": 1.209994952044422e-05, - "loss": 0.0658, + "loss": 0.0583, "step": 5910 }, { "epoch": 2.988389702170621, - "grad_norm": 0.38328826427459717, + "grad_norm": 0.5963706970214844, "learning_rate": 1.2069661786976274e-05, - "loss": 0.0591, + "loss": 0.0551, "step": 5920 }, { "epoch": 2.9934376577486117, - "grad_norm": 0.32088446617126465, + "grad_norm": 0.37611526250839233, "learning_rate": 1.2039374053508329e-05, "loss": 0.0551, "step": 5930 }, { "epoch": 2.9984856133266025, - "grad_norm": 0.2678850591182709, + "grad_norm": 0.5949448943138123, "learning_rate": 1.2009086320040384e-05, - "loss": 0.0676, + "loss": 0.0615, "step": 5940 }, { "epoch": 3.0, "eval_f1": 0.9705180789481339, - "eval_loss": 0.04634944349527359, - "eval_runtime": 705.0876, - "eval_samples_per_second": 292.534, - "eval_steps_per_second": 2.286, + "eval_loss": 0.04155249148607254, + "eval_runtime": 582.0561, + "eval_samples_per_second": 354.368, + "eval_steps_per_second": 2.769, "step": 5943 }, { "epoch": 3.0035335689045937, - "grad_norm": 0.4443244934082031, + "grad_norm": 0.732612133026123, "learning_rate": 1.197879858657244e-05, - "loss": 0.0483, + "loss": 0.0473, "step": 5950 }, { "epoch": 3.0085815244825844, - "grad_norm": 0.4683389663696289, + "grad_norm": 0.8803137540817261, "learning_rate": 1.1948510853104493e-05, - "loss": 0.0571, + "loss": 0.0513, "step": 5960 }, { "epoch": 3.0136294800605756, - "grad_norm": 0.4674948751926422, + "grad_norm": 0.5578094720840454, "learning_rate": 1.1918223119636548e-05, - "loss": 0.0639, + "loss": 0.0603, "step": 5970 }, { "epoch": 3.0186774356385664, - "grad_norm": 0.5270655155181885, + "grad_norm": 0.9948665499687195, "learning_rate": 1.1887935386168601e-05, - "loss": 0.0658, + "loss": 0.0592, "step": 5980 }, { "epoch": 3.023725391216557, - "grad_norm": 0.343179851770401, + "grad_norm": 0.6967259049415588, "learning_rate": 1.1857647652700657e-05, - "loss": 0.0795, + "loss": 0.0741, "step": 5990 }, { "epoch": 3.0287733467945483, - "grad_norm": 0.3782111704349518, + "grad_norm": 0.48011064529418945, "learning_rate": 1.182735991923271e-05, - "loss": 0.0586, + "loss": 0.055, "step": 6000 }, { "epoch": 3.033821302372539, - "grad_norm": 0.5034363865852356, + "grad_norm": 0.663847804069519, "learning_rate": 1.1797072185764765e-05, - "loss": 0.0628, + "loss": 0.0591, "step": 6010 }, { "epoch": 3.03886925795053, - "grad_norm": 0.3796568512916565, + "grad_norm": 0.589154839515686, "learning_rate": 1.176678445229682e-05, - "loss": 0.0548, + "loss": 0.0508, "step": 6020 }, { "epoch": 3.043917213528521, - "grad_norm": 0.381005197763443, + "grad_norm": 0.7075181007385254, "learning_rate": 1.1736496718828875e-05, - "loss": 0.0535, + "loss": 0.0493, "step": 6030 }, { "epoch": 3.048965169106512, - "grad_norm": 0.5388962626457214, + "grad_norm": 0.6230030655860901, "learning_rate": 1.1706208985360929e-05, - "loss": 0.0621, + "loss": 0.0589, "step": 6040 }, { "epoch": 3.0540131246845026, - "grad_norm": 0.4249768555164337, + "grad_norm": 0.6204888820648193, "learning_rate": 1.1675921251892982e-05, - "loss": 0.0632, + "loss": 0.0602, "step": 6050 }, { "epoch": 3.059061080262494, - "grad_norm": 0.3373398780822754, + "grad_norm": 0.456939160823822, "learning_rate": 1.1645633518425038e-05, - "loss": 0.0641, + "loss": 0.059, "step": 6060 }, { "epoch": 3.0641090358404846, - "grad_norm": 0.4852452278137207, + "grad_norm": 0.7607660889625549, "learning_rate": 1.1615345784957093e-05, - "loss": 0.0569, + "loss": 0.0488, "step": 6070 }, { "epoch": 3.0691569914184753, - "grad_norm": 0.5530717968940735, + "grad_norm": 1.2064040899276733, "learning_rate": 1.1585058051489148e-05, - "loss": 0.0765, + "loss": 0.0695, "step": 6080 }, { "epoch": 3.0742049469964665, - "grad_norm": 0.3554864823818207, + "grad_norm": 0.5143324732780457, "learning_rate": 1.1554770318021203e-05, - "loss": 0.0677, + "loss": 0.0606, "step": 6090 }, { "epoch": 3.0792529025744573, - "grad_norm": 0.45646339654922485, + "grad_norm": 0.6567758917808533, "learning_rate": 1.1524482584553256e-05, - "loss": 0.0595, + "loss": 0.0581, "step": 6100 }, { "epoch": 3.0843008581524485, - "grad_norm": 0.44173210859298706, + "grad_norm": 0.7469787001609802, "learning_rate": 1.149419485108531e-05, - "loss": 0.0561, + "loss": 0.0535, "step": 6110 }, { "epoch": 3.0893488137304392, - "grad_norm": 0.31832337379455566, + "grad_norm": 0.40161028504371643, "learning_rate": 1.1463907117617365e-05, - "loss": 0.0592, + "loss": 0.056, "step": 6120 }, { "epoch": 3.09439676930843, - "grad_norm": 0.4026302993297577, + "grad_norm": 0.7404605150222778, "learning_rate": 1.143361938414942e-05, - "loss": 0.0546, + "loss": 0.0471, "step": 6130 }, { "epoch": 3.099444724886421, - "grad_norm": 0.4788239300251007, + "grad_norm": 0.8587531447410583, "learning_rate": 1.1403331650681475e-05, - "loss": 0.0641, + "loss": 0.0558, "step": 6140 }, { "epoch": 3.104492680464412, - "grad_norm": 0.4949135184288025, + "grad_norm": 0.424450159072876, "learning_rate": 1.1373043917213529e-05, - "loss": 0.0612, + "loss": 0.0558, "step": 6150 }, { "epoch": 3.1095406360424027, - "grad_norm": 0.5024904012680054, + "grad_norm": 0.9383788704872131, "learning_rate": 1.1342756183745584e-05, - "loss": 0.0592, + "loss": 0.0517, "step": 6160 }, { "epoch": 3.114588591620394, - "grad_norm": 0.42776796221733093, + "grad_norm": 0.8069589734077454, "learning_rate": 1.1312468450277637e-05, - "loss": 0.0598, + "loss": 0.0588, "step": 6170 }, { "epoch": 3.1196365471983847, - "grad_norm": 0.551500678062439, + "grad_norm": 0.8677689433097839, "learning_rate": 1.1282180716809692e-05, - "loss": 0.071, + "loss": 0.0611, "step": 6180 }, { "epoch": 3.1246845027763754, - "grad_norm": 0.3293100893497467, + "grad_norm": 0.7949932813644409, "learning_rate": 1.1251892983341746e-05, - "loss": 0.064, + "loss": 0.0553, "step": 6190 }, { "epoch": 3.1297324583543666, - "grad_norm": 0.4054960310459137, + "grad_norm": 0.6563514471054077, "learning_rate": 1.1221605249873801e-05, - "loss": 0.0596, + "loss": 0.0549, "step": 6200 }, { "epoch": 3.1347804139323574, - "grad_norm": 0.38681086897850037, + "grad_norm": 0.5856168866157532, "learning_rate": 1.1191317516405856e-05, - "loss": 0.0635, + "loss": 0.0585, "step": 6210 }, { "epoch": 3.139828369510348, - "grad_norm": 0.40157806873321533, + "grad_norm": 0.6840217709541321, "learning_rate": 1.1161029782937911e-05, - "loss": 0.0716, + "loss": 0.0683, "step": 6220 }, { "epoch": 3.1448763250883394, - "grad_norm": 0.6303773522377014, + "grad_norm": 1.310652494430542, "learning_rate": 1.1130742049469966e-05, - "loss": 0.0629, + "loss": 0.057, "step": 6230 }, { "epoch": 3.14992428066633, - "grad_norm": 0.43884503841400146, + "grad_norm": 0.6700050830841064, "learning_rate": 1.1100454316002018e-05, - "loss": 0.0617, + "loss": 0.0562, "step": 6240 }, { "epoch": 3.154972236244321, - "grad_norm": 0.3345347046852112, + "grad_norm": 0.5210493803024292, "learning_rate": 1.1070166582534073e-05, - "loss": 0.0558, + "loss": 0.0545, "step": 6250 }, { "epoch": 3.160020191822312, - "grad_norm": 0.42731714248657227, + "grad_norm": 0.44693487882614136, "learning_rate": 1.1039878849066128e-05, - "loss": 0.0666, + "loss": 0.0614, "step": 6260 }, { "epoch": 3.165068147400303, - "grad_norm": 0.5786126255989075, + "grad_norm": 0.8827401995658875, "learning_rate": 1.1009591115598184e-05, - "loss": 0.0616, + "loss": 0.06, "step": 6270 }, { "epoch": 3.1701161029782936, - "grad_norm": 0.3354320228099823, + "grad_norm": 0.29074421525001526, "learning_rate": 1.0979303382130239e-05, - "loss": 0.0627, + "loss": 0.059, "step": 6280 }, { "epoch": 3.175164058556285, - "grad_norm": 0.4890894293785095, + "grad_norm": 0.8659618496894836, "learning_rate": 1.0949015648662292e-05, - "loss": 0.0648, + "loss": 0.0541, "step": 6290 }, { "epoch": 3.1802120141342756, - "grad_norm": 0.42990556359291077, + "grad_norm": 0.8624622821807861, "learning_rate": 1.0918727915194346e-05, - "loss": 0.0677, + "loss": 0.0661, "step": 6300 }, { "epoch": 3.1852599697122663, - "grad_norm": 0.4617575705051422, + "grad_norm": 0.6411763429641724, "learning_rate": 1.08884401817264e-05, - "loss": 0.071, + "loss": 0.0642, "step": 6310 }, { "epoch": 3.1903079252902575, - "grad_norm": 0.33714014291763306, + "grad_norm": 0.5271298289299011, "learning_rate": 1.0858152448258456e-05, - "loss": 0.06, + "loss": 0.0552, "step": 6320 }, { "epoch": 3.1953558808682483, - "grad_norm": 0.5312590003013611, + "grad_norm": 0.9701720476150513, "learning_rate": 1.082786471479051e-05, - "loss": 0.0618, + "loss": 0.0586, "step": 6330 }, { "epoch": 3.200403836446239, - "grad_norm": 0.40886396169662476, + "grad_norm": 0.5633390545845032, "learning_rate": 1.0797576981322565e-05, - "loss": 0.0606, + "loss": 0.0554, "step": 6340 }, { "epoch": 3.2054517920242303, - "grad_norm": 0.5101807117462158, + "grad_norm": 0.45846840739250183, "learning_rate": 1.076728924785462e-05, - "loss": 0.0669, + "loss": 0.0582, "step": 6350 }, { "epoch": 3.210499747602221, - "grad_norm": 0.37605538964271545, + "grad_norm": 0.43338650465011597, "learning_rate": 1.0737001514386673e-05, - "loss": 0.0682, + "loss": 0.0588, "step": 6360 }, { "epoch": 3.215547703180212, - "grad_norm": 0.3650659918785095, + "grad_norm": 0.8287716507911682, "learning_rate": 1.0706713780918728e-05, - "loss": 0.0605, + "loss": 0.053, "step": 6370 }, { "epoch": 3.220595658758203, - "grad_norm": 0.49304908514022827, + "grad_norm": 0.5174350142478943, "learning_rate": 1.0676426047450782e-05, - "loss": 0.0633, + "loss": 0.0587, "step": 6380 }, { "epoch": 3.2256436143361937, - "grad_norm": 0.45995691418647766, + "grad_norm": 0.47460228204727173, "learning_rate": 1.0646138313982837e-05, - "loss": 0.0648, + "loss": 0.0598, "step": 6390 }, { "epoch": 3.230691569914185, - "grad_norm": 0.3334788382053375, + "grad_norm": 0.49122539162635803, "learning_rate": 1.0615850580514892e-05, - "loss": 0.0579, + "loss": 0.0535, "step": 6400 }, { "epoch": 3.2357395254921757, - "grad_norm": 0.2866950035095215, + "grad_norm": 0.5462148189544678, "learning_rate": 1.0585562847046947e-05, - "loss": 0.0569, + "loss": 0.0518, "step": 6410 }, { "epoch": 3.2407874810701665, - "grad_norm": 0.532154381275177, + "grad_norm": 0.7671846747398376, "learning_rate": 1.0555275113579002e-05, - "loss": 0.0636, + "loss": 0.0611, "step": 6420 }, { "epoch": 3.2458354366481577, - "grad_norm": 0.4379573464393616, + "grad_norm": 0.6748913526535034, "learning_rate": 1.0524987380111054e-05, - "loss": 0.0677, + "loss": 0.0561, "step": 6430 }, { "epoch": 3.2508833922261484, - "grad_norm": 0.4286845326423645, + "grad_norm": 0.5004613399505615, "learning_rate": 1.049469964664311e-05, - "loss": 0.0577, + "loss": 0.0534, "step": 6440 }, { "epoch": 3.255931347804139, - "grad_norm": 0.3957328498363495, + "grad_norm": 0.4895551800727844, "learning_rate": 1.0464411913175164e-05, - "loss": 0.0566, + "loss": 0.0459, "step": 6450 }, { "epoch": 3.2609793033821304, - "grad_norm": 0.3571922183036804, + "grad_norm": 0.47480469942092896, "learning_rate": 1.043412417970722e-05, - "loss": 0.0651, + "loss": 0.0601, "step": 6460 }, { "epoch": 3.266027258960121, - "grad_norm": 0.32685035467147827, + "grad_norm": 0.4885694086551666, "learning_rate": 1.0403836446239273e-05, - "loss": 0.0623, + "loss": 0.0598, "step": 6470 }, { "epoch": 3.271075214538112, - "grad_norm": 0.3616839647293091, + "grad_norm": 0.6375486254692078, "learning_rate": 1.0373548712771328e-05, - "loss": 0.0655, + "loss": 0.0602, "step": 6480 }, { "epoch": 3.276123170116103, - "grad_norm": 0.40262675285339355, + "grad_norm": 0.7264606356620789, "learning_rate": 1.0343260979303382e-05, - "loss": 0.0651, + "loss": 0.0579, "step": 6490 }, { "epoch": 3.281171125694094, - "grad_norm": 0.5313582420349121, + "grad_norm": 0.5704456567764282, "learning_rate": 1.0312973245835437e-05, - "loss": 0.0628, + "loss": 0.056, "step": 6500 }, { "epoch": 3.2862190812720846, - "grad_norm": 0.3204849064350128, + "grad_norm": 0.6324512362480164, "learning_rate": 1.0282685512367492e-05, - "loss": 0.0573, + "loss": 0.0515, "step": 6510 }, { "epoch": 3.291267036850076, - "grad_norm": 0.3533790409564972, + "grad_norm": 0.5736483931541443, "learning_rate": 1.0252397778899545e-05, - "loss": 0.06, + "loss": 0.0538, "step": 6520 }, { "epoch": 3.2963149924280666, - "grad_norm": 0.36665427684783936, + "grad_norm": 0.48032522201538086, "learning_rate": 1.02221100454316e-05, - "loss": 0.0602, + "loss": 0.0568, "step": 6530 }, { "epoch": 3.301362948006058, - "grad_norm": 0.561029314994812, + "grad_norm": 0.6696997880935669, "learning_rate": 1.0191822311963656e-05, - "loss": 0.0607, + "loss": 0.0537, "step": 6540 }, { "epoch": 3.3064109035840485, - "grad_norm": 0.43554937839508057, + "grad_norm": 0.44333356618881226, "learning_rate": 1.016153457849571e-05, - "loss": 0.0578, + "loss": 0.0514, "step": 6550 }, { "epoch": 3.3114588591620393, - "grad_norm": 0.36535871028900146, + "grad_norm": 0.6224443912506104, "learning_rate": 1.0131246845027764e-05, - "loss": 0.0656, + "loss": 0.0607, "step": 6560 }, { "epoch": 3.3165068147400305, - "grad_norm": 0.3634582757949829, + "grad_norm": 0.7066437602043152, "learning_rate": 1.0100959111559818e-05, - "loss": 0.0643, + "loss": 0.0563, "step": 6570 }, { "epoch": 3.3215547703180213, - "grad_norm": 0.39043471217155457, + "grad_norm": 0.6406083106994629, "learning_rate": 1.0070671378091873e-05, - "loss": 0.0653, + "loss": 0.0573, "step": 6580 }, { "epoch": 3.326602725896012, - "grad_norm": 0.3135824203491211, + "grad_norm": 0.44534462690353394, "learning_rate": 1.0040383644623928e-05, - "loss": 0.063, + "loss": 0.059, "step": 6590 }, { "epoch": 3.3316506814740032, - "grad_norm": 0.47060030698776245, + "grad_norm": 0.7137624025344849, "learning_rate": 1.0010095911155983e-05, - "loss": 0.0675, + "loss": 0.0568, "step": 6600 }, { "epoch": 3.336698637051994, - "grad_norm": 0.4136187732219696, + "grad_norm": 0.6909269690513611, "learning_rate": 9.979808177688038e-06, - "loss": 0.0523, + "loss": 0.0493, "step": 6610 }, { "epoch": 3.3417465926299847, - "grad_norm": 0.4783284664154053, + "grad_norm": 0.6987153887748718, "learning_rate": 9.94952044422009e-06, - "loss": 0.0621, + "loss": 0.059, "step": 6620 }, { "epoch": 3.346794548207976, - "grad_norm": 0.41832417249679565, + "grad_norm": 0.538732647895813, "learning_rate": 9.919232710752145e-06, - "loss": 0.0644, + "loss": 0.0582, "step": 6630 }, { "epoch": 3.3518425037859667, - "grad_norm": 0.3421390950679779, + "grad_norm": 0.6330693960189819, "learning_rate": 9.8889449772842e-06, - "loss": 0.054, + "loss": 0.0506, "step": 6640 }, { "epoch": 3.3568904593639575, - "grad_norm": 0.5004147291183472, + "grad_norm": 0.5216783881187439, "learning_rate": 9.858657243816255e-06, - "loss": 0.0614, + "loss": 0.0544, "step": 6650 }, { "epoch": 3.3619384149419487, - "grad_norm": 0.3952717185020447, + "grad_norm": 0.7052462697029114, "learning_rate": 9.828369510348309e-06, - "loss": 0.06, + "loss": 0.0553, "step": 6660 }, { "epoch": 3.3669863705199394, - "grad_norm": 0.3463038206100464, + "grad_norm": 0.7679615616798401, "learning_rate": 9.798081776880364e-06, - "loss": 0.0671, + "loss": 0.061, "step": 6670 }, { "epoch": 3.37203432609793, - "grad_norm": 0.3278227746486664, + "grad_norm": 0.530564546585083, "learning_rate": 9.767794043412417e-06, - "loss": 0.0563, + "loss": 0.0567, "step": 6680 }, { "epoch": 3.3770822816759214, - "grad_norm": 0.516260027885437, + "grad_norm": 0.6907301545143127, "learning_rate": 9.737506309944473e-06, - "loss": 0.0619, + "loss": 0.0561, "step": 6690 }, { "epoch": 3.382130237253912, - "grad_norm": 0.42303574085235596, + "grad_norm": 0.7837420105934143, "learning_rate": 9.707218576476528e-06, - "loss": 0.0616, + "loss": 0.0618, "step": 6700 }, { "epoch": 3.387178192831903, - "grad_norm": 0.3528966009616852, + "grad_norm": 0.6361984014511108, "learning_rate": 9.676930843008581e-06, - "loss": 0.0615, + "loss": 0.0533, "step": 6710 }, { "epoch": 3.392226148409894, - "grad_norm": 0.28841379284858704, + "grad_norm": 0.6775834560394287, "learning_rate": 9.646643109540636e-06, - "loss": 0.0602, + "loss": 0.0571, "step": 6720 }, { "epoch": 3.397274103987885, - "grad_norm": 0.3085114061832428, + "grad_norm": 0.4820801615715027, "learning_rate": 9.616355376072691e-06, - "loss": 0.066, + "loss": 0.063, "step": 6730 }, { "epoch": 3.4023220595658756, - "grad_norm": 0.3895336985588074, + "grad_norm": 0.511091411113739, "learning_rate": 9.586067642604747e-06, - "loss": 0.0654, + "loss": 0.0621, "step": 6740 }, { "epoch": 3.407370015143867, - "grad_norm": 0.36049631237983704, + "grad_norm": 0.5163900852203369, "learning_rate": 9.5557799091368e-06, - "loss": 0.0681, + "loss": 0.0606, "step": 6750 }, { "epoch": 3.4124179707218576, - "grad_norm": 0.37995630502700806, + "grad_norm": 0.4652441740036011, "learning_rate": 9.525492175668853e-06, - "loss": 0.0532, + "loss": 0.0539, "step": 6760 }, { "epoch": 3.4174659262998484, - "grad_norm": 0.369208961725235, + "grad_norm": 0.5968872904777527, "learning_rate": 9.495204442200909e-06, - "loss": 0.0621, + "loss": 0.0599, "step": 6770 }, { "epoch": 3.4225138818778396, - "grad_norm": 0.4210832118988037, + "grad_norm": 0.4634818732738495, "learning_rate": 9.464916708732964e-06, - "loss": 0.0605, + "loss": 0.0518, "step": 6780 }, { "epoch": 3.4275618374558303, - "grad_norm": 0.5589237213134766, + "grad_norm": 0.34169018268585205, "learning_rate": 9.434628975265019e-06, - "loss": 0.0602, + "loss": 0.0588, "step": 6790 }, { "epoch": 3.432609793033821, - "grad_norm": 0.44891810417175293, + "grad_norm": 0.719494640827179, "learning_rate": 9.404341241797072e-06, - "loss": 0.0582, + "loss": 0.0538, "step": 6800 }, { "epoch": 3.4376577486118123, - "grad_norm": 0.40576326847076416, + "grad_norm": 0.4465346336364746, "learning_rate": 9.374053508329126e-06, - "loss": 0.0595, + "loss": 0.0577, "step": 6810 }, { "epoch": 3.442705704189803, - "grad_norm": 0.4331837594509125, + "grad_norm": 0.6223052740097046, "learning_rate": 9.343765774861181e-06, - "loss": 0.0653, + "loss": 0.0598, "step": 6820 }, { "epoch": 3.447753659767794, - "grad_norm": 0.3736474812030792, + "grad_norm": 0.6854692697525024, "learning_rate": 9.313478041393236e-06, - "loss": 0.057, + "loss": 0.0544, "step": 6830 }, { "epoch": 3.452801615345785, - "grad_norm": 0.5507432818412781, + "grad_norm": 1.0640225410461426, "learning_rate": 9.283190307925291e-06, - "loss": 0.0609, + "loss": 0.0569, "step": 6840 }, { "epoch": 3.4578495709237758, - "grad_norm": 0.25957268476486206, + "grad_norm": 0.5437650680541992, "learning_rate": 9.252902574457345e-06, - "loss": 0.0662, + "loss": 0.0612, "step": 6850 }, { "epoch": 3.462897526501767, - "grad_norm": 0.3138289153575897, + "grad_norm": 0.5767130255699158, "learning_rate": 9.2226148409894e-06, - "loss": 0.0634, + "loss": 0.0618, "step": 6860 }, { "epoch": 3.4679454820797577, - "grad_norm": 0.47947317361831665, + "grad_norm": 0.5814956426620483, "learning_rate": 9.192327107521453e-06, - "loss": 0.0598, + "loss": 0.0571, "step": 6870 }, { "epoch": 3.4729934376577485, - "grad_norm": 0.33063846826553345, + "grad_norm": 0.31469887495040894, "learning_rate": 9.162039374053508e-06, - "loss": 0.0671, + "loss": 0.0573, "step": 6880 }, { "epoch": 3.4780413932357397, - "grad_norm": 0.6448005437850952, + "grad_norm": 0.3987484872341156, "learning_rate": 9.131751640585563e-06, - "loss": 0.0656, + "loss": 0.0534, "step": 6890 }, { "epoch": 3.4830893488137304, - "grad_norm": 0.3326038420200348, + "grad_norm": 0.47312065958976746, "learning_rate": 9.101463907117617e-06, - "loss": 0.0649, + "loss": 0.0608, "step": 6900 }, { "epoch": 3.488137304391721, - "grad_norm": 0.49056732654571533, + "grad_norm": 0.4635220170021057, "learning_rate": 9.071176173649672e-06, - "loss": 0.0571, + "loss": 0.05, "step": 6910 }, { "epoch": 3.4931852599697124, - "grad_norm": 0.7147297859191895, + "grad_norm": 1.146721363067627, "learning_rate": 9.040888440181727e-06, - "loss": 0.0589, + "loss": 0.0548, "step": 6920 }, { "epoch": 3.498233215547703, - "grad_norm": 0.6848371028900146, + "grad_norm": 0.42057961225509644, "learning_rate": 9.010600706713782e-06, - "loss": 0.0556, + "loss": 0.0463, "step": 6930 }, { "epoch": 3.5032811711256944, - "grad_norm": 0.7190125584602356, + "grad_norm": 0.7835047841072083, "learning_rate": 8.980312973245836e-06, - "loss": 0.0565, + "loss": 0.0507, "step": 6940 }, { "epoch": 3.508329126703685, - "grad_norm": 0.46388497948646545, + "grad_norm": 0.6441161036491394, "learning_rate": 8.95002523977789e-06, - "loss": 0.0593, + "loss": 0.0571, "step": 6950 }, { "epoch": 3.513377082281676, - "grad_norm": 0.4307047426700592, + "grad_norm": 0.6828143000602722, "learning_rate": 8.919737506309944e-06, - "loss": 0.0568, + "loss": 0.0525, "step": 6960 }, { "epoch": 3.518425037859667, - "grad_norm": 0.4674361050128937, + "grad_norm": 0.8285954594612122, "learning_rate": 8.889449772842e-06, - "loss": 0.064, + "loss": 0.0621, "step": 6970 }, { "epoch": 3.523472993437658, - "grad_norm": 0.474177747964859, + "grad_norm": 0.4954177439212799, "learning_rate": 8.859162039374055e-06, - "loss": 0.0717, + "loss": 0.0625, "step": 6980 }, { "epoch": 3.5285209490156486, - "grad_norm": 0.4488455653190613, + "grad_norm": 0.7900820374488831, "learning_rate": 8.828874305906108e-06, - "loss": 0.0618, + "loss": 0.0603, "step": 6990 }, { "epoch": 3.53356890459364, - "grad_norm": 0.3444407284259796, + "grad_norm": 0.6767242550849915, "learning_rate": 8.798586572438162e-06, - "loss": 0.0613, + "loss": 0.0586, "step": 7000 }, { "epoch": 3.5386168601716306, - "grad_norm": 0.3389851152896881, + "grad_norm": 0.5408624410629272, "learning_rate": 8.768298838970217e-06, - "loss": 0.0596, + "loss": 0.0561, "step": 7010 }, { "epoch": 3.5436648157496213, - "grad_norm": 0.4028567373752594, + "grad_norm": 0.4577973484992981, "learning_rate": 8.738011105502272e-06, - "loss": 0.0584, + "loss": 0.057, "step": 7020 }, { "epoch": 3.5487127713276125, - "grad_norm": 0.5268592238426208, + "grad_norm": 0.7334242463111877, "learning_rate": 8.707723372034327e-06, - "loss": 0.0585, + "loss": 0.0602, "step": 7030 }, { "epoch": 3.5537607269056033, - "grad_norm": 0.39129918813705444, + "grad_norm": 0.5569146275520325, "learning_rate": 8.67743563856638e-06, - "loss": 0.055, + "loss": 0.0564, "step": 7040 }, { "epoch": 3.558808682483594, - "grad_norm": 0.5022369623184204, + "grad_norm": 0.5739743709564209, "learning_rate": 8.647147905098436e-06, - "loss": 0.0644, + "loss": 0.0605, "step": 7050 }, { "epoch": 3.5638566380615853, - "grad_norm": 0.524813711643219, + "grad_norm": 0.5553867816925049, "learning_rate": 8.61686017163049e-06, - "loss": 0.0622, + "loss": 0.0573, "step": 7060 }, { "epoch": 3.568904593639576, - "grad_norm": 0.34827151894569397, + "grad_norm": 0.7109550833702087, "learning_rate": 8.586572438162544e-06, - "loss": 0.0608, + "loss": 0.0634, "step": 7070 }, { "epoch": 3.5739525492175668, - "grad_norm": 0.43751201033592224, + "grad_norm": 0.46534502506256104, "learning_rate": 8.5562847046946e-06, - "loss": 0.0552, + "loss": 0.0494, "step": 7080 }, { "epoch": 3.579000504795558, - "grad_norm": 0.40720903873443604, + "grad_norm": 0.47850191593170166, "learning_rate": 8.525996971226653e-06, - "loss": 0.0648, + "loss": 0.0613, "step": 7090 }, { "epoch": 3.5840484603735487, - "grad_norm": 0.25798696279525757, + "grad_norm": 0.3749614953994751, "learning_rate": 8.495709237758708e-06, - "loss": 0.0613, + "loss": 0.0574, "step": 7100 }, { "epoch": 3.5890964159515395, - "grad_norm": 0.3780571520328522, + "grad_norm": 0.5852258801460266, "learning_rate": 8.465421504290763e-06, - "loss": 0.0684, + "loss": 0.064, "step": 7110 }, { "epoch": 3.5941443715295307, - "grad_norm": 0.5011832118034363, + "grad_norm": 0.3820860981941223, "learning_rate": 8.435133770822818e-06, - "loss": 0.0637, + "loss": 0.0559, "step": 7120 }, { "epoch": 3.5991923271075215, - "grad_norm": 0.4237106144428253, + "grad_norm": 0.5200080275535583, "learning_rate": 8.40484603735487e-06, - "loss": 0.0649, + "loss": 0.0556, "step": 7130 }, { "epoch": 3.604240282685512, - "grad_norm": 0.36436161398887634, + "grad_norm": 0.6472256183624268, "learning_rate": 8.374558303886925e-06, - "loss": 0.065, + "loss": 0.0596, "step": 7140 }, { "epoch": 3.6092882382635034, - "grad_norm": 0.3161546289920807, + "grad_norm": 0.43182119727134705, "learning_rate": 8.34427057041898e-06, - "loss": 0.0552, + "loss": 0.0478, "step": 7150 }, { "epoch": 3.614336193841494, - "grad_norm": 0.3535007834434509, + "grad_norm": 0.6659020781517029, "learning_rate": 8.313982836951035e-06, - "loss": 0.0545, + "loss": 0.054, "step": 7160 }, { "epoch": 3.619384149419485, - "grad_norm": 0.5373191833496094, + "grad_norm": 0.6561934947967529, "learning_rate": 8.28369510348309e-06, - "loss": 0.0633, + "loss": 0.0583, "step": 7170 }, { "epoch": 3.624432104997476, - "grad_norm": 0.4402375817298889, + "grad_norm": 0.7083423733711243, "learning_rate": 8.253407370015144e-06, - "loss": 0.0667, + "loss": 0.0598, "step": 7180 }, { "epoch": 3.629480060575467, - "grad_norm": 0.3555195927619934, + "grad_norm": 0.6030146479606628, "learning_rate": 8.223119636547197e-06, - "loss": 0.0604, + "loss": 0.0569, "step": 7190 }, { "epoch": 3.6345280161534577, - "grad_norm": 0.3920760452747345, + "grad_norm": 0.4650856554508209, "learning_rate": 8.192831903079253e-06, - "loss": 0.0681, + "loss": 0.0593, "step": 7200 }, { "epoch": 3.639575971731449, - "grad_norm": 0.40458381175994873, + "grad_norm": 0.5656235814094543, "learning_rate": 8.162544169611308e-06, - "loss": 0.0642, + "loss": 0.058, "step": 7210 }, { "epoch": 3.6446239273094396, - "grad_norm": 0.431784063577652, + "grad_norm": 0.5745735764503479, "learning_rate": 8.132256436143363e-06, - "loss": 0.0636, + "loss": 0.0582, "step": 7220 }, { "epoch": 3.6496718828874304, - "grad_norm": 0.3183976709842682, + "grad_norm": 0.7879515886306763, "learning_rate": 8.101968702675416e-06, - "loss": 0.0596, + "loss": 0.0593, "step": 7230 }, { "epoch": 3.6547198384654216, - "grad_norm": 0.6023189425468445, + "grad_norm": 0.7000477313995361, "learning_rate": 8.071680969207471e-06, - "loss": 0.0563, + "loss": 0.0517, "step": 7240 }, { "epoch": 3.6597677940434123, - "grad_norm": 0.3629746735095978, + "grad_norm": 0.44397464394569397, "learning_rate": 8.041393235739527e-06, - "loss": 0.0584, + "loss": 0.0569, "step": 7250 }, { "epoch": 3.664815749621403, - "grad_norm": 0.41280779242515564, + "grad_norm": 0.55961674451828, "learning_rate": 8.01110550227158e-06, - "loss": 0.0572, + "loss": 0.0529, "step": 7260 }, { "epoch": 3.6698637051993943, - "grad_norm": 0.45728689432144165, + "grad_norm": 0.5441805720329285, "learning_rate": 7.980817768803635e-06, - "loss": 0.0533, + "loss": 0.0537, "step": 7270 }, { "epoch": 3.674911660777385, - "grad_norm": 0.37236565351486206, + "grad_norm": 0.5779780149459839, "learning_rate": 7.950530035335689e-06, - "loss": 0.0621, + "loss": 0.0549, "step": 7280 }, { "epoch": 3.679959616355376, - "grad_norm": 0.38259199261665344, + "grad_norm": 0.4491129517555237, "learning_rate": 7.920242301867744e-06, - "loss": 0.0537, + "loss": 0.0527, "step": 7290 }, { "epoch": 3.685007571933367, - "grad_norm": 0.36142826080322266, + "grad_norm": 0.6601787209510803, "learning_rate": 7.889954568399799e-06, - "loss": 0.0596, + "loss": 0.0545, "step": 7300 }, { "epoch": 3.690055527511358, - "grad_norm": 0.4781215190887451, + "grad_norm": 0.7920609712600708, "learning_rate": 7.859666834931854e-06, - "loss": 0.0673, + "loss": 0.0607, "step": 7310 }, { "epoch": 3.6951034830893486, - "grad_norm": 0.35468199849128723, + "grad_norm": 0.6220458149909973, "learning_rate": 7.829379101463906e-06, - "loss": 0.0594, + "loss": 0.0574, "step": 7320 }, { "epoch": 3.7001514386673398, - "grad_norm": 0.3899647891521454, + "grad_norm": 0.6900739669799805, "learning_rate": 7.799091367995961e-06, - "loss": 0.0594, + "loss": 0.0549, "step": 7330 }, { "epoch": 3.7051993942453305, - "grad_norm": 0.36970004439353943, + "grad_norm": 1.071191430091858, "learning_rate": 7.768803634528016e-06, - "loss": 0.0662, + "loss": 0.0644, "step": 7340 }, { "epoch": 3.7102473498233217, - "grad_norm": 0.2706756293773651, + "grad_norm": 0.5342854261398315, "learning_rate": 7.738515901060071e-06, - "loss": 0.0667, + "loss": 0.0639, "step": 7350 }, { "epoch": 3.7152953054013125, - "grad_norm": 0.5611262321472168, + "grad_norm": 0.49695709347724915, "learning_rate": 7.708228167592126e-06, - "loss": 0.0565, + "loss": 0.0525, "step": 7360 }, { "epoch": 3.7203432609793032, - "grad_norm": 0.4006311595439911, + "grad_norm": 0.6041547060012817, "learning_rate": 7.67794043412418e-06, - "loss": 0.0657, + "loss": 0.0596, "step": 7370 }, { "epoch": 3.7253912165572944, - "grad_norm": 0.34327423572540283, + "grad_norm": 0.6425964832305908, "learning_rate": 7.647652700656235e-06, - "loss": 0.067, + "loss": 0.0626, "step": 7380 }, { "epoch": 3.730439172135285, - "grad_norm": 0.38259416818618774, + "grad_norm": 0.5185597538948059, "learning_rate": 7.617364967188288e-06, - "loss": 0.0649, + "loss": 0.063, "step": 7390 }, { "epoch": 3.7354871277132764, - "grad_norm": 0.3452150225639343, + "grad_norm": 0.48031681776046753, "learning_rate": 7.587077233720343e-06, - "loss": 0.0714, + "loss": 0.0633, "step": 7400 }, { "epoch": 3.740535083291267, - "grad_norm": 0.32733386754989624, + "grad_norm": 0.46377626061439514, "learning_rate": 7.556789500252398e-06, - "loss": 0.0639, + "loss": 0.0581, "step": 7410 }, { "epoch": 3.745583038869258, - "grad_norm": 0.4879290759563446, + "grad_norm": 0.7336452007293701, "learning_rate": 7.526501766784453e-06, - "loss": 0.061, + "loss": 0.0572, "step": 7420 }, { "epoch": 3.750630994447249, - "grad_norm": 0.5546141266822815, + "grad_norm": 0.8720684051513672, "learning_rate": 7.4962140333165064e-06, - "loss": 0.065, + "loss": 0.0558, "step": 7430 }, { "epoch": 3.75567895002524, - "grad_norm": 0.3634326159954071, + "grad_norm": 0.372592031955719, "learning_rate": 7.465926299848562e-06, - "loss": 0.0614, + "loss": 0.0613, "step": 7440 }, { "epoch": 3.7607269056032306, - "grad_norm": 0.35349273681640625, + "grad_norm": 0.5049020648002625, "learning_rate": 7.435638566380616e-06, - "loss": 0.0565, + "loss": 0.058, "step": 7450 }, { "epoch": 3.765774861181222, - "grad_norm": 0.3440592288970947, + "grad_norm": 0.5402325391769409, "learning_rate": 7.405350832912671e-06, - "loss": 0.0508, + "loss": 0.0484, "step": 7460 }, { "epoch": 3.7708228167592126, - "grad_norm": 0.46401387453079224, + "grad_norm": 0.5662652850151062, "learning_rate": 7.375063099444725e-06, - "loss": 0.0651, + "loss": 0.0613, "step": 7470 }, { "epoch": 3.7758707723372034, - "grad_norm": 0.2772069275379181, + "grad_norm": 0.6431825160980225, "learning_rate": 7.34477536597678e-06, - "loss": 0.0586, + "loss": 0.0522, "step": 7480 }, { "epoch": 3.7809187279151946, - "grad_norm": 0.5362659692764282, + "grad_norm": 0.9309275150299072, "learning_rate": 7.314487632508835e-06, - "loss": 0.0659, + "loss": 0.0602, "step": 7490 }, { "epoch": 3.7859666834931853, - "grad_norm": 0.3963538110256195, + "grad_norm": 0.801145076751709, "learning_rate": 7.284199899040888e-06, - "loss": 0.0573, + "loss": 0.0581, "step": 7500 }, { "epoch": 3.791014639071176, - "grad_norm": 0.32734355330467224, + "grad_norm": 0.5122712850570679, "learning_rate": 7.253912165572943e-06, - "loss": 0.0563, + "loss": 0.0552, "step": 7510 }, { "epoch": 3.7960625946491673, - "grad_norm": 0.4177393317222595, + "grad_norm": 0.39402052760124207, "learning_rate": 7.223624432104998e-06, - "loss": 0.0619, + "loss": 0.0552, "step": 7520 }, { "epoch": 3.801110550227158, - "grad_norm": 0.2514793276786804, + "grad_norm": 0.5302004814147949, "learning_rate": 7.193336698637052e-06, - "loss": 0.0627, + "loss": 0.0626, "step": 7530 }, { "epoch": 3.806158505805149, - "grad_norm": 0.4437217116355896, + "grad_norm": 0.4123098850250244, "learning_rate": 7.163048965169107e-06, - "loss": 0.061, + "loss": 0.0569, "step": 7540 }, { "epoch": 3.81120646138314, - "grad_norm": 0.5421292781829834, + "grad_norm": 0.8736279010772705, "learning_rate": 7.132761231701161e-06, - "loss": 0.0569, + "loss": 0.0537, "step": 7550 }, { "epoch": 3.8162544169611308, - "grad_norm": 0.2918401062488556, + "grad_norm": 0.4374080002307892, "learning_rate": 7.102473498233216e-06, - "loss": 0.0597, + "loss": 0.057, "step": 7560 }, { "epoch": 3.8213023725391215, - "grad_norm": 0.5811281204223633, + "grad_norm": 0.863776445388794, "learning_rate": 7.07218576476527e-06, - "loss": 0.0582, + "loss": 0.049, "step": 7570 }, { "epoch": 3.8263503281171127, - "grad_norm": 0.3603706955909729, + "grad_norm": 0.5356324315071106, "learning_rate": 7.041898031297325e-06, - "loss": 0.0606, + "loss": 0.0578, "step": 7580 }, { "epoch": 3.8313982836951035, - "grad_norm": 0.3899904191493988, + "grad_norm": 0.5422727465629578, "learning_rate": 7.0116102978293786e-06, - "loss": 0.0565, + "loss": 0.0577, "step": 7590 }, { "epoch": 3.8364462392730942, - "grad_norm": 0.5667692422866821, + "grad_norm": 0.6234108805656433, "learning_rate": 6.981322564361434e-06, - "loss": 0.061, + "loss": 0.0573, "step": 7600 }, { "epoch": 3.8414941948510855, - "grad_norm": 0.5629199147224426, + "grad_norm": 0.9067860841751099, "learning_rate": 6.951034830893489e-06, - "loss": 0.053, + "loss": 0.0471, "step": 7610 }, { "epoch": 3.846542150429076, - "grad_norm": 0.32999759912490845, + "grad_norm": 0.5522469878196716, "learning_rate": 6.920747097425543e-06, - "loss": 0.0619, + "loss": 0.053, "step": 7620 }, { "epoch": 3.851590106007067, - "grad_norm": 0.4874314069747925, + "grad_norm": 0.7358270287513733, "learning_rate": 6.8904593639575974e-06, - "loss": 0.0608, + "loss": 0.0561, "step": 7630 }, { "epoch": 3.856638061585058, - "grad_norm": 0.3464485704898834, + "grad_norm": 0.5285794138908386, "learning_rate": 6.860171630489652e-06, - "loss": 0.0646, + "loss": 0.0618, "step": 7640 }, { "epoch": 3.861686017163049, - "grad_norm": 0.6928033232688904, + "grad_norm": 0.6937068700790405, "learning_rate": 6.829883897021707e-06, - "loss": 0.0621, + "loss": 0.059, "step": 7650 }, { "epoch": 3.8667339727410397, - "grad_norm": 0.45158517360687256, + "grad_norm": 0.6941738724708557, "learning_rate": 6.79959616355376e-06, - "loss": 0.0536, + "loss": 0.0515, "step": 7660 }, { "epoch": 3.871781928319031, - "grad_norm": 0.4070644974708557, + "grad_norm": 0.8964054584503174, "learning_rate": 6.7693084300858155e-06, - "loss": 0.0516, + "loss": 0.0526, "step": 7670 }, { "epoch": 3.8768298838970217, - "grad_norm": 0.3872455954551697, + "grad_norm": 0.5919986367225647, "learning_rate": 6.739020696617871e-06, - "loss": 0.0659, + "loss": 0.0577, "step": 7680 }, { "epoch": 3.8818778394750124, - "grad_norm": 0.3202153742313385, + "grad_norm": 0.4616561532020569, "learning_rate": 6.708732963149924e-06, - "loss": 0.0588, + "loss": 0.0509, "step": 7690 }, { "epoch": 3.8869257950530036, - "grad_norm": 0.5554611682891846, + "grad_norm": 0.6349731087684631, "learning_rate": 6.678445229681979e-06, - "loss": 0.0557, + "loss": 0.0535, "step": 7700 }, { "epoch": 3.8919737506309944, - "grad_norm": 0.326063871383667, + "grad_norm": 0.6474828720092773, "learning_rate": 6.6481574962140335e-06, - "loss": 0.0575, + "loss": 0.0552, "step": 7710 }, { "epoch": 3.897021706208985, - "grad_norm": 0.4410304129123688, + "grad_norm": 0.5433930158615112, "learning_rate": 6.617869762746088e-06, - "loss": 0.0647, + "loss": 0.062, "step": 7720 }, { "epoch": 3.9020696617869763, - "grad_norm": 0.48591405153274536, + "grad_norm": 0.6113614439964294, "learning_rate": 6.587582029278142e-06, - "loss": 0.0673, + "loss": 0.06, "step": 7730 }, { "epoch": 3.907117617364967, - "grad_norm": 0.43069708347320557, + "grad_norm": 0.8800488114356995, "learning_rate": 6.557294295810197e-06, - "loss": 0.057, + "loss": 0.0578, "step": 7740 }, { "epoch": 3.912165572942958, - "grad_norm": 0.37136363983154297, + "grad_norm": 0.5158660411834717, "learning_rate": 6.5270065623422515e-06, - "loss": 0.0551, + "loss": 0.0524, "step": 7750 }, { "epoch": 3.917213528520949, - "grad_norm": 0.5326444506645203, + "grad_norm": 0.5676606297492981, "learning_rate": 6.496718828874306e-06, - "loss": 0.0512, + "loss": 0.0474, "step": 7760 }, { "epoch": 3.92226148409894, - "grad_norm": 0.488471657037735, + "grad_norm": 0.6438203454017639, "learning_rate": 6.466431095406361e-06, - "loss": 0.0645, + "loss": 0.0587, "step": 7770 }, { "epoch": 3.9273094396769306, - "grad_norm": 0.5709624290466309, + "grad_norm": 0.6570119857788086, "learning_rate": 6.436143361938415e-06, - "loss": 0.0529, + "loss": 0.0489, "step": 7780 }, { "epoch": 3.932357395254922, - "grad_norm": 0.5405532717704773, + "grad_norm": 0.5620145201683044, "learning_rate": 6.4058556284704695e-06, - "loss": 0.0629, + "loss": 0.0559, "step": 7790 }, { "epoch": 3.9374053508329125, - "grad_norm": 0.4417840242385864, + "grad_norm": 0.6886317729949951, "learning_rate": 6.375567895002524e-06, - "loss": 0.0647, + "loss": 0.0581, "step": 7800 }, { "epoch": 3.9424533064109037, - "grad_norm": 0.4238974153995514, + "grad_norm": 0.7463077306747437, "learning_rate": 6.345280161534579e-06, - "loss": 0.055, + "loss": 0.0477, "step": 7810 }, { "epoch": 3.9475012619888945, - "grad_norm": 0.5029925107955933, + "grad_norm": 0.5246394276618958, "learning_rate": 6.314992428066633e-06, - "loss": 0.0538, + "loss": 0.0501, "step": 7820 }, { "epoch": 3.9525492175668853, - "grad_norm": 0.36458006501197815, + "grad_norm": 0.5147930979728699, "learning_rate": 6.2847046945986876e-06, - "loss": 0.063, + "loss": 0.0603, "step": 7830 }, { "epoch": 3.9575971731448765, - "grad_norm": 0.2648584246635437, + "grad_norm": 0.3963003158569336, "learning_rate": 6.254416961130743e-06, - "loss": 0.0647, + "loss": 0.059, "step": 7840 }, { "epoch": 3.9626451287228672, - "grad_norm": 0.36690616607666016, + "grad_norm": 0.7148598432540894, "learning_rate": 6.224129227662796e-06, - "loss": 0.0563, + "loss": 0.0524, "step": 7850 }, { "epoch": 3.967693084300858, - "grad_norm": 0.4327741861343384, + "grad_norm": 0.5985211133956909, "learning_rate": 6.193841494194851e-06, - "loss": 0.0605, + "loss": 0.0609, "step": 7860 }, { "epoch": 3.972741039878849, - "grad_norm": 0.5171884298324585, + "grad_norm": 0.6152123808860779, "learning_rate": 6.163553760726906e-06, - "loss": 0.0642, + "loss": 0.0622, "step": 7870 }, { "epoch": 3.97778899545684, - "grad_norm": 0.40773433446884155, + "grad_norm": 0.49580270051956177, "learning_rate": 6.13326602725896e-06, - "loss": 0.0621, + "loss": 0.056, "step": 7880 }, { "epoch": 3.982836951034831, - "grad_norm": 0.5063067078590393, + "grad_norm": 0.8874292373657227, "learning_rate": 6.102978293791015e-06, - "loss": 0.0654, + "loss": 0.0599, "step": 7890 }, { "epoch": 3.987884906612822, - "grad_norm": 0.37013697624206543, + "grad_norm": 0.6198350787162781, "learning_rate": 6.072690560323069e-06, - "loss": 0.0586, + "loss": 0.0546, "step": 7900 }, { "epoch": 3.9929328621908127, - "grad_norm": 0.3777279555797577, + "grad_norm": 0.39257192611694336, "learning_rate": 6.042402826855124e-06, - "loss": 0.0544, + "loss": 0.0523, "step": 7910 }, { "epoch": 3.997980817768804, - "grad_norm": 0.4045654535293579, + "grad_norm": 0.4612904191017151, "learning_rate": 6.012115093387178e-06, - "loss": 0.0706, + "loss": 0.0685, "step": 7920 }, { "epoch": 4.0, "eval_f1": 0.9705180789481339, - "eval_loss": 0.044486090540885925, - "eval_runtime": 801.1309, - "eval_samples_per_second": 257.464, - "eval_steps_per_second": 2.012, + "eval_loss": 0.038467586040496826, + "eval_runtime": 578.9562, + "eval_samples_per_second": 356.265, + "eval_steps_per_second": 2.784, "step": 7924 }, { "epoch": 4.003028773346794, - "grad_norm": 0.34277331829071045, + "grad_norm": 0.7146291732788086, "learning_rate": 5.981827359919233e-06, - "loss": 0.0532, + "loss": 0.0575, "step": 7930 }, { "epoch": 4.008076728924785, - "grad_norm": 0.40653395652770996, + "grad_norm": 0.6313480138778687, "learning_rate": 5.951539626451287e-06, - "loss": 0.0601, + "loss": 0.0581, "step": 7940 }, { "epoch": 4.013124684502777, - "grad_norm": 0.39089271426200867, + "grad_norm": 0.4977870583534241, "learning_rate": 5.921251892983342e-06, - "loss": 0.0585, + "loss": 0.0582, "step": 7950 }, { "epoch": 4.018172640080767, - "grad_norm": 0.3117099404335022, + "grad_norm": 0.4447147250175476, "learning_rate": 5.890964159515397e-06, - "loss": 0.0536, + "loss": 0.0544, "step": 7960 }, { "epoch": 4.023220595658758, - "grad_norm": 0.4908514618873596, + "grad_norm": 0.6496310234069824, "learning_rate": 5.860676426047451e-06, - "loss": 0.0618, + "loss": 0.0595, "step": 7970 }, { "epoch": 4.028268551236749, - "grad_norm": 0.35001102089881897, + "grad_norm": 0.4380001127719879, "learning_rate": 5.830388692579505e-06, - "loss": 0.0595, + "loss": 0.0549, "step": 7980 }, { "epoch": 4.03331650681474, - "grad_norm": 0.39042168855667114, + "grad_norm": 0.5718368887901306, "learning_rate": 5.80010095911156e-06, - "loss": 0.0639, + "loss": 0.0559, "step": 7990 }, { "epoch": 4.038364462392731, - "grad_norm": 0.48590323328971863, + "grad_norm": 0.5859358906745911, "learning_rate": 5.769813225643615e-06, - "loss": 0.0606, + "loss": 0.0572, "step": 8000 }, { "epoch": 4.043412417970722, - "grad_norm": 0.3951605558395386, + "grad_norm": 0.49378788471221924, "learning_rate": 5.739525492175669e-06, - "loss": 0.0585, + "loss": 0.054, "step": 8010 }, { "epoch": 4.048460373548712, - "grad_norm": 0.4090045690536499, + "grad_norm": 0.6780097484588623, "learning_rate": 5.709237758707723e-06, - "loss": 0.064, + "loss": 0.0568, "step": 8020 }, { "epoch": 4.053508329126704, - "grad_norm": 0.5321690440177917, + "grad_norm": 0.8048389554023743, "learning_rate": 5.6789500252397786e-06, - "loss": 0.0581, + "loss": 0.0527, "step": 8030 }, { "epoch": 4.058556284704695, - "grad_norm": 0.4750302731990814, + "grad_norm": 0.4513346254825592, "learning_rate": 5.648662291771832e-06, - "loss": 0.066, + "loss": 0.0597, "step": 8040 }, { "epoch": 4.063604240282685, - "grad_norm": 0.36469149589538574, + "grad_norm": 0.6877405643463135, "learning_rate": 5.618374558303887e-06, - "loss": 0.0604, + "loss": 0.0594, "step": 8050 }, { "epoch": 4.068652195860676, - "grad_norm": 0.35261520743370056, + "grad_norm": 0.41468387842178345, "learning_rate": 5.5880868248359414e-06, - "loss": 0.061, + "loss": 0.0563, "step": 8060 }, { "epoch": 4.0737001514386675, - "grad_norm": 0.32109716534614563, + "grad_norm": 0.5062978267669678, "learning_rate": 5.557799091367996e-06, - "loss": 0.0613, + "loss": 0.0598, "step": 8070 }, { "epoch": 4.078748107016659, - "grad_norm": 0.41034355759620667, + "grad_norm": 0.6427041888237, "learning_rate": 5.527511357900051e-06, - "loss": 0.0567, + "loss": 0.057, "step": 8080 }, { "epoch": 4.083796062594649, - "grad_norm": 0.4242144823074341, + "grad_norm": 0.5508936643600464, "learning_rate": 5.497223624432105e-06, - "loss": 0.0539, + "loss": 0.0472, "step": 8090 }, { "epoch": 4.08884401817264, - "grad_norm": 0.32515600323677063, + "grad_norm": 0.39490872621536255, "learning_rate": 5.4669358909641595e-06, - "loss": 0.0581, + "loss": 0.0589, "step": 8100 }, { "epoch": 4.093891973750631, - "grad_norm": 0.6698907017707825, + "grad_norm": 0.5776220560073853, "learning_rate": 5.436648157496214e-06, - "loss": 0.0686, + "loss": 0.0602, "step": 8110 }, { "epoch": 4.098939929328622, - "grad_norm": 0.2780954837799072, + "grad_norm": 0.36714500188827515, "learning_rate": 5.406360424028269e-06, - "loss": 0.0518, + "loss": 0.0474, "step": 8120 }, { "epoch": 4.103987884906613, - "grad_norm": 0.3639545440673828, + "grad_norm": 0.7429747581481934, "learning_rate": 5.376072690560323e-06, - "loss": 0.0569, + "loss": 0.0516, "step": 8130 }, { "epoch": 4.109035840484604, - "grad_norm": 0.4723798930644989, + "grad_norm": 0.7167190909385681, "learning_rate": 5.3457849570923775e-06, - "loss": 0.0596, + "loss": 0.0559, "step": 8140 }, { "epoch": 4.1140837960625944, - "grad_norm": 0.30923640727996826, + "grad_norm": 0.5668296217918396, "learning_rate": 5.315497223624433e-06, - "loss": 0.0564, + "loss": 0.0558, "step": 8150 }, { "epoch": 4.119131751640586, - "grad_norm": 0.3050035238265991, + "grad_norm": 0.5577311515808105, "learning_rate": 5.285209490156487e-06, - "loss": 0.0653, + "loss": 0.0589, "step": 8160 }, { "epoch": 4.124179707218577, - "grad_norm": 0.5005570650100708, + "grad_norm": 0.611304759979248, "learning_rate": 5.254921756688541e-06, - "loss": 0.0623, + "loss": 0.0546, "step": 8170 }, { "epoch": 4.129227662796567, - "grad_norm": 0.5100895762443542, + "grad_norm": 0.5540894865989685, "learning_rate": 5.2246340232205955e-06, - "loss": 0.0622, + "loss": 0.0611, "step": 8180 }, { "epoch": 4.134275618374558, - "grad_norm": 0.33904436230659485, + "grad_norm": 0.5128312706947327, "learning_rate": 5.194346289752651e-06, - "loss": 0.0575, + "loss": 0.0552, "step": 8190 }, { "epoch": 4.13932357395255, - "grad_norm": 0.3320677876472473, + "grad_norm": 0.6017599105834961, "learning_rate": 5.164058556284704e-06, - "loss": 0.0565, + "loss": 0.0494, "step": 8200 }, { "epoch": 4.14437152953054, - "grad_norm": 0.3176303803920746, + "grad_norm": 0.42843466997146606, "learning_rate": 5.133770822816759e-06, - "loss": 0.0597, + "loss": 0.0534, "step": 8210 }, { "epoch": 4.149419485108531, - "grad_norm": 0.33052679896354675, + "grad_norm": 0.6050401926040649, "learning_rate": 5.103483089348814e-06, - "loss": 0.0553, + "loss": 0.0524, "step": 8220 }, { "epoch": 4.154467440686522, - "grad_norm": 0.3024562895298004, + "grad_norm": 0.512793242931366, "learning_rate": 5.073195355880868e-06, - "loss": 0.0595, + "loss": 0.0562, "step": 8230 }, { "epoch": 4.159515396264513, - "grad_norm": 0.380520224571228, + "grad_norm": 0.5130860209465027, "learning_rate": 5.042907622412923e-06, - "loss": 0.048, + "loss": 0.0413, "step": 8240 }, { "epoch": 4.164563351842504, - "grad_norm": 0.47053784132003784, + "grad_norm": 0.6443082690238953, "learning_rate": 5.012619888944977e-06, - "loss": 0.0616, + "loss": 0.0593, "step": 8250 }, { "epoch": 4.169611307420495, - "grad_norm": 0.5295135378837585, + "grad_norm": 0.6051344871520996, "learning_rate": 4.982332155477032e-06, - "loss": 0.0579, + "loss": 0.0542, "step": 8260 }, { "epoch": 4.174659262998485, - "grad_norm": 0.3950503468513489, + "grad_norm": 0.5795598030090332, "learning_rate": 4.952044422009086e-06, - "loss": 0.0594, + "loss": 0.0569, "step": 8270 }, { "epoch": 4.1797072185764765, - "grad_norm": 0.40204277634620667, + "grad_norm": 0.6054142117500305, "learning_rate": 4.921756688541141e-06, - "loss": 0.0568, + "loss": 0.0575, "step": 8280 }, { "epoch": 4.184755174154468, - "grad_norm": 0.4756285548210144, + "grad_norm": 0.6954050660133362, "learning_rate": 4.891468955073196e-06, - "loss": 0.0684, + "loss": 0.0609, "step": 8290 }, { "epoch": 4.189803129732458, - "grad_norm": 0.42255735397338867, + "grad_norm": 0.7217870354652405, "learning_rate": 4.86118122160525e-06, - "loss": 0.0551, + "loss": 0.0559, "step": 8300 }, { "epoch": 4.194851085310449, - "grad_norm": 0.35746055841445923, + "grad_norm": 0.49758586287498474, "learning_rate": 4.830893488137305e-06, - "loss": 0.0536, + "loss": 0.0506, "step": 8310 }, { "epoch": 4.1998990408884405, - "grad_norm": 0.2798272371292114, + "grad_norm": 0.4497081935405731, "learning_rate": 4.800605754669359e-06, - "loss": 0.0654, + "loss": 0.0581, "step": 8320 }, { "epoch": 4.204946996466431, - "grad_norm": 0.4099213778972626, + "grad_norm": 0.6054022312164307, "learning_rate": 4.770318021201413e-06, - "loss": 0.0695, + "loss": 0.0596, "step": 8330 }, { "epoch": 4.209994952044422, - "grad_norm": 0.31809088587760925, + "grad_norm": 0.7262012958526611, "learning_rate": 4.7400302877334685e-06, - "loss": 0.0567, + "loss": 0.0489, "step": 8340 }, { "epoch": 4.215042907622413, - "grad_norm": 0.3884822726249695, + "grad_norm": 0.6226342916488647, "learning_rate": 4.709742554265523e-06, - "loss": 0.0621, + "loss": 0.0596, "step": 8350 }, { "epoch": 4.2200908632004035, - "grad_norm": 0.4989534020423889, + "grad_norm": 0.8234953284263611, "learning_rate": 4.679454820797577e-06, - "loss": 0.0591, + "loss": 0.057, "step": 8360 }, { "epoch": 4.225138818778395, - "grad_norm": 0.5055777430534363, + "grad_norm": 0.8438859581947327, "learning_rate": 4.649167087329631e-06, - "loss": 0.0552, + "loss": 0.0516, "step": 8370 }, { "epoch": 4.230186774356386, - "grad_norm": 0.4415469765663147, + "grad_norm": 0.5095875263214111, "learning_rate": 4.6188793538616865e-06, - "loss": 0.0726, + "loss": 0.0646, "step": 8380 }, { "epoch": 4.235234729934376, - "grad_norm": 0.24666030704975128, + "grad_norm": 0.5543855428695679, "learning_rate": 4.58859162039374e-06, - "loss": 0.0526, + "loss": 0.0482, "step": 8390 }, { "epoch": 4.240282685512367, - "grad_norm": 0.49552977085113525, + "grad_norm": 0.7510880827903748, "learning_rate": 4.558303886925795e-06, - "loss": 0.0607, + "loss": 0.0595, "step": 8400 }, { "epoch": 4.245330641090359, - "grad_norm": 0.3048471510410309, + "grad_norm": 0.5140940546989441, "learning_rate": 4.52801615345785e-06, - "loss": 0.0628, + "loss": 0.0568, "step": 8410 }, { "epoch": 4.250378596668349, - "grad_norm": 0.3662854731082916, + "grad_norm": 0.43089789152145386, "learning_rate": 4.497728419989904e-06, - "loss": 0.062, + "loss": 0.058, "step": 8420 }, { "epoch": 4.25542655224634, - "grad_norm": 0.3893071711063385, + "grad_norm": 0.6229716539382935, "learning_rate": 4.467440686521959e-06, - "loss": 0.0542, + "loss": 0.0538, "step": 8430 }, { "epoch": 4.260474507824331, - "grad_norm": 0.40179580450057983, + "grad_norm": 0.6465341448783875, "learning_rate": 4.437152953054013e-06, - "loss": 0.0524, + "loss": 0.0544, "step": 8440 }, { "epoch": 4.265522463402322, - "grad_norm": 0.35265469551086426, + "grad_norm": 0.42706695199012756, "learning_rate": 4.406865219586068e-06, - "loss": 0.0616, + "loss": 0.0562, "step": 8450 }, { "epoch": 4.270570418980313, - "grad_norm": 0.2585351765155792, + "grad_norm": 0.5305337309837341, "learning_rate": 4.376577486118122e-06, - "loss": 0.058, + "loss": 0.0567, "step": 8460 }, { "epoch": 4.275618374558304, - "grad_norm": 0.4452759325504303, + "grad_norm": 0.7307097315788269, "learning_rate": 4.346289752650177e-06, - "loss": 0.0533, + "loss": 0.0486, "step": 8470 }, { "epoch": 4.280666330136295, - "grad_norm": 0.40577125549316406, + "grad_norm": 0.5940870046615601, "learning_rate": 4.316002019182232e-06, - "loss": 0.055, + "loss": 0.0514, "step": 8480 }, { "epoch": 4.285714285714286, - "grad_norm": 0.2692396938800812, + "grad_norm": 0.4446733593940735, "learning_rate": 4.2857142857142855e-06, - "loss": 0.0616, + "loss": 0.0545, "step": 8490 }, { "epoch": 4.290762241292277, - "grad_norm": 0.47697675228118896, + "grad_norm": 0.9121294617652893, "learning_rate": 4.255426552246341e-06, - "loss": 0.0596, + "loss": 0.0557, "step": 8500 }, { "epoch": 4.295810196870267, - "grad_norm": 0.4272094964981079, + "grad_norm": 0.568056583404541, "learning_rate": 4.225138818778395e-06, - "loss": 0.0571, + "loss": 0.0522, "step": 8510 }, { "epoch": 4.300858152448258, - "grad_norm": 0.5147340297698975, + "grad_norm": 0.8788109421730042, "learning_rate": 4.194851085310449e-06, - "loss": 0.0432, + "loss": 0.0433, "step": 8520 }, { "epoch": 4.3059061080262495, - "grad_norm": 0.37690308690071106, + "grad_norm": 0.7445030808448792, "learning_rate": 4.1645633518425035e-06, - "loss": 0.054, + "loss": 0.05, "step": 8530 }, { "epoch": 4.310954063604241, - "grad_norm": 0.5072263479232788, + "grad_norm": 0.8348667621612549, "learning_rate": 4.134275618374559e-06, - "loss": 0.0575, + "loss": 0.0584, "step": 8540 }, { "epoch": 4.316002019182231, - "grad_norm": 0.3782062232494354, + "grad_norm": 0.462342232465744, "learning_rate": 4.103987884906613e-06, - "loss": 0.0558, + "loss": 0.0555, "step": 8550 }, { "epoch": 4.321049974760222, - "grad_norm": 0.27360981702804565, + "grad_norm": 0.42785176634788513, "learning_rate": 4.073700151438667e-06, - "loss": 0.0645, + "loss": 0.0607, "step": 8560 }, { "epoch": 4.326097930338213, - "grad_norm": 0.5791490077972412, + "grad_norm": 0.7172122597694397, "learning_rate": 4.043412417970722e-06, - "loss": 0.0751, + "loss": 0.0675, "step": 8570 }, { "epoch": 4.331145885916204, - "grad_norm": 0.2799968421459198, + "grad_norm": 0.4495554566383362, "learning_rate": 4.013124684502776e-06, - "loss": 0.0542, + "loss": 0.0546, "step": 8580 }, { "epoch": 4.336193841494195, - "grad_norm": 0.4403197467327118, + "grad_norm": 0.5083460807800293, "learning_rate": 3.982836951034831e-06, - "loss": 0.0647, + "loss": 0.06, "step": 8590 }, { "epoch": 4.341241797072186, - "grad_norm": 0.3798120319843292, + "grad_norm": 0.4353145360946655, "learning_rate": 3.952549217566885e-06, - "loss": 0.0545, + "loss": 0.0535, "step": 8600 }, { "epoch": 4.3462897526501765, - "grad_norm": 0.40195682644844055, + "grad_norm": 0.6741386651992798, "learning_rate": 3.92226148409894e-06, - "loss": 0.058, + "loss": 0.0581, "step": 8610 }, { "epoch": 4.351337708228168, - "grad_norm": 0.30205094814300537, + "grad_norm": 0.47798269987106323, "learning_rate": 3.891973750630995e-06, - "loss": 0.0585, + "loss": 0.0541, "step": 8620 }, { "epoch": 4.356385663806159, - "grad_norm": 0.3941998779773712, + "grad_norm": 0.49109166860580444, "learning_rate": 3.861686017163049e-06, - "loss": 0.0628, + "loss": 0.0608, "step": 8630 }, { "epoch": 4.361433619384149, - "grad_norm": 0.4298538267612457, + "grad_norm": 0.8310505747795105, "learning_rate": 3.831398283695104e-06, - "loss": 0.0519, + "loss": 0.0514, "step": 8640 }, { "epoch": 4.36648157496214, - "grad_norm": 0.45147988200187683, + "grad_norm": 0.4586045742034912, "learning_rate": 3.801110550227158e-06, - "loss": 0.0555, + "loss": 0.0538, "step": 8650 }, { "epoch": 4.371529530540132, - "grad_norm": 0.3213054835796356, + "grad_norm": 0.4350300133228302, "learning_rate": 3.7708228167592127e-06, - "loss": 0.0573, + "loss": 0.0526, "step": 8660 }, { "epoch": 4.376577486118122, - "grad_norm": 0.3924931287765503, + "grad_norm": 0.6310685276985168, "learning_rate": 3.740535083291267e-06, - "loss": 0.0609, + "loss": 0.0597, "step": 8670 }, { "epoch": 4.381625441696113, - "grad_norm": 0.3347417116165161, + "grad_norm": 0.6845548152923584, "learning_rate": 3.7102473498233217e-06, - "loss": 0.0573, + "loss": 0.0542, "step": 8680 }, { "epoch": 4.386673397274104, - "grad_norm": 0.5916124582290649, + "grad_norm": 1.085631012916565, "learning_rate": 3.679959616355376e-06, - "loss": 0.0631, + "loss": 0.0601, "step": 8690 }, { "epoch": 4.391721352852095, - "grad_norm": 0.4623749852180481, + "grad_norm": 0.6232538223266602, "learning_rate": 3.6496718828874303e-06, - "loss": 0.0603, + "loss": 0.0557, "step": 8700 }, { "epoch": 4.396769308430086, - "grad_norm": 0.3337404727935791, + "grad_norm": 0.4568091630935669, "learning_rate": 3.6193841494194855e-06, - "loss": 0.0559, + "loss": 0.0494, "step": 8710 }, { "epoch": 4.401817264008077, - "grad_norm": 0.4419994652271271, + "grad_norm": 0.7550612092018127, "learning_rate": 3.5890964159515398e-06, - "loss": 0.0574, + "loss": 0.0562, "step": 8720 }, { "epoch": 4.406865219586067, - "grad_norm": 0.47578585147857666, + "grad_norm": 0.5380585789680481, "learning_rate": 3.5588086824835945e-06, - "loss": 0.0554, + "loss": 0.0521, "step": 8730 }, { "epoch": 4.411913175164059, - "grad_norm": 0.3991304337978363, + "grad_norm": 0.42225027084350586, "learning_rate": 3.5285209490156488e-06, - "loss": 0.0522, + "loss": 0.0515, "step": 8740 }, { "epoch": 4.41696113074205, - "grad_norm": 0.2646455764770508, + "grad_norm": 0.5831999778747559, "learning_rate": 3.498233215547703e-06, - "loss": 0.053, + "loss": 0.0465, "step": 8750 }, { "epoch": 4.42200908632004, - "grad_norm": 0.38998502492904663, + "grad_norm": 0.7943524718284607, "learning_rate": 3.4679454820797578e-06, - "loss": 0.0697, + "loss": 0.062, "step": 8760 }, { "epoch": 4.427057041898031, - "grad_norm": 0.39025184512138367, + "grad_norm": 0.634747326374054, "learning_rate": 3.437657748611812e-06, - "loss": 0.0564, + "loss": 0.0496, "step": 8770 }, { "epoch": 4.4321049974760225, - "grad_norm": 0.36179178953170776, + "grad_norm": 0.5734288692474365, "learning_rate": 3.407370015143867e-06, - "loss": 0.0695, + "loss": 0.0612, "step": 8780 }, { "epoch": 4.437152953054013, - "grad_norm": 0.47754356265068054, + "grad_norm": 0.7079018354415894, "learning_rate": 3.3770822816759215e-06, - "loss": 0.0599, + "loss": 0.0578, "step": 8790 }, { "epoch": 4.442200908632004, - "grad_norm": 0.3687341511249542, + "grad_norm": 0.44444698095321655, "learning_rate": 3.346794548207976e-06, - "loss": 0.0577, + "loss": 0.0559, "step": 8800 }, { "epoch": 4.447248864209995, - "grad_norm": 0.4395473003387451, + "grad_norm": 0.7473122477531433, "learning_rate": 3.3165068147400305e-06, - "loss": 0.0559, + "loss": 0.0544, "step": 8810 }, { "epoch": 4.4522968197879855, - "grad_norm": 0.3659065365791321, + "grad_norm": 0.6658338308334351, "learning_rate": 3.286219081272085e-06, - "loss": 0.0591, + "loss": 0.0552, "step": 8820 }, { "epoch": 4.457344775365977, - "grad_norm": 0.47786960005760193, + "grad_norm": 0.48870500922203064, "learning_rate": 3.255931347804139e-06, - "loss": 0.0591, + "loss": 0.0566, "step": 8830 }, { "epoch": 4.462392730943968, - "grad_norm": 0.44323790073394775, + "grad_norm": 0.6261917948722839, "learning_rate": 3.2256436143361943e-06, - "loss": 0.0508, + "loss": 0.0487, "step": 8840 }, { "epoch": 4.467440686521958, - "grad_norm": 0.3510769307613373, + "grad_norm": 0.6060011982917786, "learning_rate": 3.1953558808682486e-06, - "loss": 0.0554, + "loss": 0.0514, "step": 8850 }, { "epoch": 4.4724886420999495, - "grad_norm": 0.45277318358421326, + "grad_norm": 0.4858971834182739, "learning_rate": 3.165068147400303e-06, - "loss": 0.0532, + "loss": 0.05, "step": 8860 }, { "epoch": 4.477536597677941, - "grad_norm": 0.5000207424163818, + "grad_norm": 0.6394979357719421, "learning_rate": 3.1347804139323576e-06, - "loss": 0.0654, + "loss": 0.0604, "step": 8870 }, { "epoch": 4.482584553255931, - "grad_norm": 0.37949642539024353, + "grad_norm": 0.6840482950210571, "learning_rate": 3.104492680464412e-06, - "loss": 0.0549, + "loss": 0.0514, "step": 8880 }, { "epoch": 4.487632508833922, - "grad_norm": 0.3000931143760681, + "grad_norm": 0.388715535402298, "learning_rate": 3.0742049469964666e-06, - "loss": 0.0544, + "loss": 0.0479, "step": 8890 }, { "epoch": 4.492680464411913, - "grad_norm": 0.512484610080719, + "grad_norm": 0.6516565084457397, "learning_rate": 3.043917213528521e-06, - "loss": 0.0651, + "loss": 0.0608, "step": 8900 }, { "epoch": 4.497728419989904, - "grad_norm": 0.4052237570285797, + "grad_norm": 0.76282799243927, "learning_rate": 3.0136294800605756e-06, - "loss": 0.0601, + "loss": 0.0572, "step": 8910 }, { "epoch": 4.502776375567895, - "grad_norm": 0.3805348873138428, + "grad_norm": 0.49448370933532715, "learning_rate": 2.9833417465926303e-06, - "loss": 0.0553, + "loss": 0.0575, "step": 8920 }, { "epoch": 4.507824331145886, - "grad_norm": 0.4143049120903015, + "grad_norm": 0.5593730807304382, "learning_rate": 2.9530540131246846e-06, - "loss": 0.0488, + "loss": 0.0486, "step": 8930 }, { "epoch": 4.512872286723876, - "grad_norm": 0.4691813290119171, + "grad_norm": 0.5773325562477112, "learning_rate": 2.922766279656739e-06, - "loss": 0.0544, + "loss": 0.0541, "step": 8940 }, { "epoch": 4.517920242301868, - "grad_norm": 0.40783849358558655, + "grad_norm": 0.34630000591278076, "learning_rate": 2.8924785461887936e-06, - "loss": 0.0678, + "loss": 0.0606, "step": 8950 }, { "epoch": 4.522968197879859, - "grad_norm": 0.36696454882621765, + "grad_norm": 0.5409483313560486, "learning_rate": 2.862190812720848e-06, - "loss": 0.0591, + "loss": 0.0589, "step": 8960 }, { "epoch": 4.52801615345785, - "grad_norm": 0.43989595770835876, + "grad_norm": 0.5004202127456665, "learning_rate": 2.8319030792529026e-06, - "loss": 0.0604, + "loss": 0.0621, "step": 8970 }, { "epoch": 4.53306410903584, - "grad_norm": 0.38078877329826355, + "grad_norm": 0.4979722797870636, "learning_rate": 2.8016153457849574e-06, - "loss": 0.0578, + "loss": 0.0537, "step": 8980 }, { "epoch": 4.5381120646138315, - "grad_norm": 0.3941843807697296, + "grad_norm": 0.6733251214027405, "learning_rate": 2.7713276123170117e-06, - "loss": 0.0694, + "loss": 0.069, "step": 8990 }, { "epoch": 4.543160020191822, - "grad_norm": 0.3795044422149658, + "grad_norm": 0.4152880609035492, "learning_rate": 2.7410398788490664e-06, - "loss": 0.0588, + "loss": 0.0565, "step": 9000 }, { "epoch": 4.548207975769813, - "grad_norm": 0.3949735462665558, + "grad_norm": 0.6170037984848022, "learning_rate": 2.7107521453811207e-06, - "loss": 0.0623, + "loss": 0.0589, "step": 9010 }, { "epoch": 4.553255931347804, - "grad_norm": 0.5588275194168091, + "grad_norm": 0.5258937478065491, "learning_rate": 2.680464411913175e-06, - "loss": 0.0588, + "loss": 0.0548, "step": 9020 }, { "epoch": 4.5583038869257955, - "grad_norm": 0.29749733209609985, + "grad_norm": 0.534015417098999, "learning_rate": 2.6501766784452297e-06, - "loss": 0.0445, + "loss": 0.0447, "step": 9030 }, { "epoch": 4.563351842503786, - "grad_norm": 0.4993056654930115, + "grad_norm": 0.86041259765625, "learning_rate": 2.6198889449772844e-06, - "loss": 0.0595, + "loss": 0.0578, "step": 9040 }, { "epoch": 4.568399798081777, - "grad_norm": 0.5257248878479004, + "grad_norm": 0.8807480335235596, "learning_rate": 2.589601211509339e-06, - "loss": 0.0469, + "loss": 0.0479, "step": 9050 }, { "epoch": 4.573447753659767, - "grad_norm": 0.35071873664855957, + "grad_norm": 0.6071127653121948, "learning_rate": 2.5593134780413934e-06, - "loss": 0.056, + "loss": 0.0521, "step": 9060 }, { "epoch": 4.5784957092377585, - "grad_norm": 0.49088719487190247, + "grad_norm": 0.9106950759887695, "learning_rate": 2.5290257445734477e-06, - "loss": 0.0619, + "loss": 0.056, "step": 9070 }, { "epoch": 4.58354366481575, - "grad_norm": 0.5432353019714355, + "grad_norm": 0.6179044246673584, "learning_rate": 2.4987380111055024e-06, - "loss": 0.0583, + "loss": 0.0548, "step": 9080 }, { "epoch": 4.588591620393741, - "grad_norm": 0.5358169674873352, + "grad_norm": 0.9295970797538757, "learning_rate": 2.4684502776375567e-06, - "loss": 0.0618, + "loss": 0.0626, "step": 9090 }, { "epoch": 4.593639575971731, - "grad_norm": 0.299734890460968, + "grad_norm": 0.4483726918697357, "learning_rate": 2.438162544169611e-06, - "loss": 0.0587, + "loss": 0.0531, "step": 9100 }, { "epoch": 4.598687531549722, - "grad_norm": 0.28594735264778137, + "grad_norm": 0.38749760389328003, "learning_rate": 2.407874810701666e-06, - "loss": 0.0552, + "loss": 0.0514, "step": 9110 }, { "epoch": 4.603735487127714, - "grad_norm": 0.440019428730011, + "grad_norm": 0.7203320860862732, "learning_rate": 2.3775870772337205e-06, - "loss": 0.0616, + "loss": 0.0603, "step": 9120 }, { "epoch": 4.608783442705704, - "grad_norm": 0.3852064311504364, + "grad_norm": 0.8010473251342773, "learning_rate": 2.347299343765775e-06, - "loss": 0.0544, + "loss": 0.053, "step": 9130 }, { "epoch": 4.613831398283695, - "grad_norm": 0.47597625851631165, + "grad_norm": 0.7866964936256409, "learning_rate": 2.3170116102978295e-06, - "loss": 0.0626, + "loss": 0.0544, "step": 9140 }, { "epoch": 4.618879353861686, - "grad_norm": 0.4893425703048706, + "grad_norm": 0.9333378076553345, "learning_rate": 2.2867238768298838e-06, - "loss": 0.0493, + "loss": 0.0472, "step": 9150 }, { "epoch": 4.623927309439677, - "grad_norm": 0.4313579201698303, + "grad_norm": 0.5904621481895447, "learning_rate": 2.2564361433619385e-06, - "loss": 0.0533, + "loss": 0.0515, "step": 9160 }, { "epoch": 4.628975265017668, - "grad_norm": 0.31476062536239624, + "grad_norm": 0.6837446093559265, "learning_rate": 2.2261484098939928e-06, - "loss": 0.0586, + "loss": 0.0566, "step": 9170 }, { "epoch": 4.634023220595659, - "grad_norm": 0.4846239686012268, + "grad_norm": 0.5726220607757568, "learning_rate": 2.1958606764260475e-06, - "loss": 0.0541, + "loss": 0.0521, "step": 9180 }, { "epoch": 4.639071176173649, - "grad_norm": 0.4027024805545807, + "grad_norm": 0.5920945405960083, "learning_rate": 2.1655729429581022e-06, - "loss": 0.0532, + "loss": 0.0527, "step": 9190 }, { "epoch": 4.644119131751641, - "grad_norm": 0.43335291743278503, + "grad_norm": 0.5921088457107544, "learning_rate": 2.1352852094901565e-06, - "loss": 0.0664, + "loss": 0.0594, "step": 9200 }, { "epoch": 4.649167087329632, - "grad_norm": 0.47337576746940613, + "grad_norm": 0.8026402592658997, "learning_rate": 2.1049974760222112e-06, - "loss": 0.0592, + "loss": 0.058, "step": 9210 }, { "epoch": 4.654215042907622, - "grad_norm": 0.44911569356918335, + "grad_norm": 0.9913181066513062, "learning_rate": 2.0747097425542655e-06, - "loss": 0.0642, + "loss": 0.0591, "step": 9220 }, { "epoch": 4.659262998485613, - "grad_norm": 0.47989997267723083, + "grad_norm": 0.675123393535614, "learning_rate": 2.04442200908632e-06, - "loss": 0.0558, + "loss": 0.0561, "step": 9230 }, { "epoch": 4.6643109540636045, - "grad_norm": 0.3837885856628418, + "grad_norm": 0.5947641730308533, "learning_rate": 2.014134275618375e-06, - "loss": 0.0534, + "loss": 0.0486, "step": 9240 }, { "epoch": 4.669358909641595, - "grad_norm": 0.33468201756477356, + "grad_norm": 0.5389765501022339, "learning_rate": 1.9838465421504293e-06, - "loss": 0.0638, + "loss": 0.0586, "step": 9250 }, { "epoch": 4.674406865219586, - "grad_norm": 0.3218873143196106, + "grad_norm": 0.5905711054801941, "learning_rate": 1.9535588086824836e-06, - "loss": 0.0562, + "loss": 0.0523, "step": 9260 }, { "epoch": 4.679454820797577, - "grad_norm": 0.4538477659225464, + "grad_norm": 0.36754655838012695, "learning_rate": 1.9232710752145383e-06, - "loss": 0.0562, + "loss": 0.0518, "step": 9270 }, { "epoch": 4.684502776375568, - "grad_norm": 0.42905497550964355, + "grad_norm": 0.5583412647247314, "learning_rate": 1.8929833417465926e-06, - "loss": 0.0581, + "loss": 0.0536, "step": 9280 }, { "epoch": 4.689550731953559, - "grad_norm": 0.3783353567123413, + "grad_norm": 0.4586925506591797, "learning_rate": 1.8626956082786473e-06, - "loss": 0.0486, + "loss": 0.0482, "step": 9290 }, { "epoch": 4.69459868753155, - "grad_norm": 0.42233869433403015, + "grad_norm": 0.4932919442653656, "learning_rate": 1.8324078748107018e-06, - "loss": 0.0534, + "loss": 0.0484, "step": 9300 }, { "epoch": 4.69964664310954, - "grad_norm": 0.2925800383090973, + "grad_norm": 0.3211473524570465, "learning_rate": 1.802120141342756e-06, - "loss": 0.0557, + "loss": 0.0522, "step": 9310 }, { "epoch": 4.7046945986875315, - "grad_norm": 0.4508257210254669, + "grad_norm": 0.8603491187095642, "learning_rate": 1.7718324078748106e-06, - "loss": 0.0615, + "loss": 0.0585, "step": 9320 }, { "epoch": 4.709742554265523, - "grad_norm": 0.5092118382453918, + "grad_norm": 0.7181740999221802, "learning_rate": 1.7415446744068653e-06, - "loss": 0.0577, + "loss": 0.0522, "step": 9330 }, { "epoch": 4.714790509843513, - "grad_norm": 0.3694470524787903, + "grad_norm": 0.49415314197540283, "learning_rate": 1.7112569409389198e-06, - "loss": 0.0485, + "loss": 0.0417, "step": 9340 }, { "epoch": 4.719838465421504, - "grad_norm": 0.4794639050960541, + "grad_norm": 0.758638322353363, "learning_rate": 1.6809692074709741e-06, - "loss": 0.0699, + "loss": 0.0608, "step": 9350 }, { "epoch": 4.724886420999495, - "grad_norm": 0.4152567982673645, + "grad_norm": 0.6659887433052063, "learning_rate": 1.6506814740030288e-06, - "loss": 0.0521, + "loss": 0.0468, "step": 9360 }, { "epoch": 4.729934376577486, - "grad_norm": 0.48920056223869324, + "grad_norm": 0.3270837962627411, "learning_rate": 1.6203937405350833e-06, - "loss": 0.0677, + "loss": 0.0602, "step": 9370 }, { "epoch": 4.734982332155477, - "grad_norm": 0.37886640429496765, + "grad_norm": 0.6695159077644348, "learning_rate": 1.5901060070671379e-06, - "loss": 0.0575, + "loss": 0.0515, "step": 9380 }, { "epoch": 4.740030287733468, - "grad_norm": 0.5271609425544739, + "grad_norm": 0.8143603205680847, "learning_rate": 1.5598182735991924e-06, - "loss": 0.0618, + "loss": 0.0613, "step": 9390 }, { "epoch": 4.745078243311459, - "grad_norm": 0.376953125, + "grad_norm": 0.6727936863899231, "learning_rate": 1.5295305401312469e-06, - "loss": 0.0558, + "loss": 0.0505, "step": 9400 }, { "epoch": 4.75012619888945, - "grad_norm": 0.4146003723144531, + "grad_norm": 0.5365564823150635, "learning_rate": 1.4992428066633014e-06, - "loss": 0.0567, + "loss": 0.0512, "step": 9410 }, { "epoch": 4.755174154467441, - "grad_norm": 0.5335793495178223, + "grad_norm": 0.5240725874900818, "learning_rate": 1.4689550731953559e-06, - "loss": 0.0527, + "loss": 0.0526, "step": 9420 }, { "epoch": 4.760222110045431, - "grad_norm": 0.4028931260108948, + "grad_norm": 0.6975441575050354, "learning_rate": 1.4386673397274104e-06, - "loss": 0.0546, + "loss": 0.0592, "step": 9430 }, { "epoch": 4.765270065623422, - "grad_norm": 0.4504133462905884, + "grad_norm": 0.44649407267570496, "learning_rate": 1.408379606259465e-06, - "loss": 0.0608, + "loss": 0.0597, "step": 9440 }, { "epoch": 4.770318021201414, - "grad_norm": 0.4923204183578491, + "grad_norm": 0.598850429058075, "learning_rate": 1.3780918727915194e-06, - "loss": 0.0621, + "loss": 0.0606, "step": 9450 }, { "epoch": 4.775365976779405, - "grad_norm": 0.29700249433517456, + "grad_norm": 0.57352614402771, "learning_rate": 1.3478041393235741e-06, - "loss": 0.055, + "loss": 0.0502, "step": 9460 }, { "epoch": 4.780413932357395, - "grad_norm": 0.4809055030345917, + "grad_norm": 0.7437055706977844, "learning_rate": 1.3175164058556284e-06, - "loss": 0.0546, + "loss": 0.0521, "step": 9470 }, { "epoch": 4.785461887935386, - "grad_norm": 0.5369795560836792, + "grad_norm": 0.6993494629859924, "learning_rate": 1.287228672387683e-06, - "loss": 0.059, + "loss": 0.0565, "step": 9480 }, { "epoch": 4.790509843513377, - "grad_norm": 0.4439578652381897, + "grad_norm": 0.8067084550857544, "learning_rate": 1.2569409389197376e-06, - "loss": 0.0615, + "loss": 0.0575, "step": 9490 }, { "epoch": 4.795557799091368, - "grad_norm": 0.39975985884666443, + "grad_norm": 0.5363942384719849, "learning_rate": 1.2266532054517921e-06, - "loss": 0.0587, + "loss": 0.058, "step": 9500 }, { "epoch": 4.800605754669359, - "grad_norm": 0.34285855293273926, + "grad_norm": 0.8145700693130493, "learning_rate": 1.1963654719838464e-06, - "loss": 0.0497, + "loss": 0.0488, "step": 9510 }, { "epoch": 4.80565371024735, - "grad_norm": 0.3402077257633209, + "grad_norm": 0.7701184153556824, "learning_rate": 1.166077738515901e-06, - "loss": 0.0579, + "loss": 0.0577, "step": 9520 }, { "epoch": 4.8107016658253405, - "grad_norm": 0.3736449182033539, + "grad_norm": 0.5177111625671387, "learning_rate": 1.1357900050479557e-06, - "loss": 0.063, + "loss": 0.0605, "step": 9530 }, { "epoch": 4.815749621403332, - "grad_norm": 0.3561767637729645, + "grad_norm": 0.44751742482185364, "learning_rate": 1.1055022715800102e-06, - "loss": 0.0633, + "loss": 0.0565, "step": 9540 }, { "epoch": 4.820797576981323, - "grad_norm": 0.447592556476593, + "grad_norm": 0.37919309735298157, "learning_rate": 1.0752145381120645e-06, - "loss": 0.0484, + "loss": 0.0454, "step": 9550 }, { "epoch": 4.825845532559313, - "grad_norm": 0.3960745930671692, + "grad_norm": 0.6037785410881042, "learning_rate": 1.0449268046441192e-06, - "loss": 0.0631, + "loss": 0.0606, "step": 9560 }, { "epoch": 4.8308934881373045, - "grad_norm": 0.2932693064212799, + "grad_norm": 0.3584793508052826, "learning_rate": 1.0146390711761737e-06, - "loss": 0.0562, + "loss": 0.0503, "step": 9570 }, { "epoch": 4.835941443715296, - "grad_norm": 0.37769854068756104, + "grad_norm": 0.49841853976249695, "learning_rate": 9.843513377082282e-07, - "loss": 0.0482, + "loss": 0.0434, "step": 9580 }, { "epoch": 4.840989399293286, - "grad_norm": 0.3415481150150299, + "grad_norm": 0.5114769339561462, "learning_rate": 9.540636042402827e-07, - "loss": 0.055, + "loss": 0.0535, "step": 9590 }, { "epoch": 4.846037354871277, - "grad_norm": 0.38010311126708984, + "grad_norm": 0.5932824611663818, "learning_rate": 9.237758707723372e-07, - "loss": 0.0599, + "loss": 0.0547, "step": 9600 }, { "epoch": 4.851085310449268, - "grad_norm": 0.3991403579711914, + "grad_norm": 0.6020333766937256, "learning_rate": 8.934881373043917e-07, - "loss": 0.0637, + "loss": 0.0597, "step": 9610 }, { "epoch": 4.856133266027259, - "grad_norm": 0.5155503153800964, + "grad_norm": 0.721193790435791, "learning_rate": 8.632004038364462e-07, - "loss": 0.0671, + "loss": 0.0614, "step": 9620 }, { "epoch": 4.86118122160525, - "grad_norm": 0.42242443561553955, + "grad_norm": 0.4858354926109314, "learning_rate": 8.329126703685008e-07, - "loss": 0.0565, + "loss": 0.0555, "step": 9630 }, { "epoch": 4.866229177183241, - "grad_norm": 0.4904538691043854, + "grad_norm": 0.7863103747367859, "learning_rate": 8.026249369005552e-07, - "loss": 0.0568, + "loss": 0.0554, "step": 9640 }, { "epoch": 4.871277132761231, - "grad_norm": 0.5523189902305603, + "grad_norm": 0.8363025784492493, "learning_rate": 7.723372034326099e-07, - "loss": 0.0559, + "loss": 0.0565, "step": 9650 }, { "epoch": 4.876325088339223, - "grad_norm": 0.4754299819469452, + "grad_norm": 0.6137521266937256, "learning_rate": 7.420494699646643e-07, - "loss": 0.0653, + "loss": 0.0575, "step": 9660 }, { "epoch": 4.881373043917214, - "grad_norm": 0.3697846531867981, + "grad_norm": 0.4781091511249542, "learning_rate": 7.117617364967189e-07, - "loss": 0.0539, + "loss": 0.0478, "step": 9670 }, { "epoch": 4.886420999495204, - "grad_norm": 0.46191075444221497, + "grad_norm": 0.8294112086296082, "learning_rate": 6.814740030287734e-07, - "loss": 0.0676, + "loss": 0.0593, "step": 9680 }, { "epoch": 4.891468955073195, - "grad_norm": 0.3706737756729126, + "grad_norm": 0.5780894160270691, "learning_rate": 6.511862695608279e-07, - "loss": 0.0576, + "loss": 0.0518, "step": 9690 }, { "epoch": 4.8965169106511865, - "grad_norm": 0.34824711084365845, + "grad_norm": 0.4407060146331787, "learning_rate": 6.208985360928824e-07, - "loss": 0.0607, + "loss": 0.0522, "step": 9700 }, { "epoch": 4.901564866229177, - "grad_norm": 0.33516255021095276, + "grad_norm": 0.4369337558746338, "learning_rate": 5.906108026249369e-07, - "loss": 0.0532, + "loss": 0.0522, "step": 9710 }, { "epoch": 4.906612821807168, - "grad_norm": 0.4216098189353943, + "grad_norm": 0.8428089022636414, "learning_rate": 5.603230691569914e-07, - "loss": 0.0506, + "loss": 0.0468, "step": 9720 }, { "epoch": 4.911660777385159, - "grad_norm": 0.39393237233161926, + "grad_norm": 0.6303294897079468, "learning_rate": 5.30035335689046e-07, - "loss": 0.0622, + "loss": 0.0577, "step": 9730 }, { "epoch": 4.91670873296315, - "grad_norm": 0.37353748083114624, + "grad_norm": 0.4869242012500763, "learning_rate": 4.997476022211004e-07, - "loss": 0.0508, + "loss": 0.0472, "step": 9740 }, { "epoch": 4.921756688541141, - "grad_norm": 0.32179582118988037, + "grad_norm": 0.5907611846923828, "learning_rate": 4.69459868753155e-07, - "loss": 0.0461, + "loss": 0.0455, "step": 9750 }, { "epoch": 4.926804644119132, - "grad_norm": 0.34863799810409546, + "grad_norm": 0.6162139177322388, "learning_rate": 4.3917213528520954e-07, - "loss": 0.0513, + "loss": 0.0475, "step": 9760 }, { "epoch": 4.931852599697122, - "grad_norm": 0.4207555651664734, + "grad_norm": 0.5222154259681702, "learning_rate": 4.0888440181726405e-07, - "loss": 0.0516, + "loss": 0.0513, "step": 9770 }, { "epoch": 4.9369005552751135, - "grad_norm": 0.372896283864975, + "grad_norm": 0.5132977366447449, "learning_rate": 3.7859666834931856e-07, - "loss": 0.0476, + "loss": 0.043, "step": 9780 }, { "epoch": 4.941948510853105, - "grad_norm": 0.5434166789054871, + "grad_norm": 0.6620015501976013, "learning_rate": 3.4830893488137306e-07, - "loss": 0.0646, + "loss": 0.0598, "step": 9790 }, { "epoch": 4.946996466431095, - "grad_norm": 0.5460948348045349, + "grad_norm": 0.7160341143608093, "learning_rate": 3.1802120141342757e-07, - "loss": 0.0562, + "loss": 0.0539, "step": 9800 }, { "epoch": 4.952044422009086, - "grad_norm": 0.4554930329322815, + "grad_norm": 0.5954631567001343, "learning_rate": 2.8773346794548213e-07, - "loss": 0.0664, + "loss": 0.0581, "step": 9810 }, { "epoch": 4.957092377587077, - "grad_norm": 0.5326105356216431, + "grad_norm": 1.0010461807250977, "learning_rate": 2.5744573447753664e-07, - "loss": 0.0536, + "loss": 0.0499, "step": 9820 }, { "epoch": 4.962140333165069, - "grad_norm": 0.3335418999195099, + "grad_norm": 0.5768128633499146, "learning_rate": 2.2715800100959112e-07, - "loss": 0.0611, + "loss": 0.0562, "step": 9830 }, { "epoch": 4.967188288743059, - "grad_norm": 0.408489465713501, + "grad_norm": 0.6427052617073059, "learning_rate": 1.9687026754164563e-07, - "loss": 0.056, + "loss": 0.0545, "step": 9840 }, { "epoch": 4.97223624432105, - "grad_norm": 0.49370092153549194, + "grad_norm": 0.6932212114334106, "learning_rate": 1.6658253407370016e-07, - "loss": 0.0615, + "loss": 0.0575, "step": 9850 }, { "epoch": 4.9772841998990405, - "grad_norm": 0.47176486253738403, + "grad_norm": 0.4219547510147095, "learning_rate": 1.3629480060575467e-07, - "loss": 0.0534, + "loss": 0.0491, "step": 9860 }, { "epoch": 4.982332155477032, - "grad_norm": 0.3332078158855438, + "grad_norm": 0.5215485692024231, "learning_rate": 1.0600706713780919e-07, - "loss": 0.0484, + "loss": 0.0438, "step": 9870 }, { "epoch": 4.987380111055023, - "grad_norm": 0.4342339038848877, + "grad_norm": 0.36851760745048523, "learning_rate": 7.57193336698637e-08, - "loss": 0.0557, + "loss": 0.052, "step": 9880 }, { "epoch": 4.992428066633014, - "grad_norm": 0.3356720805168152, + "grad_norm": 0.5213483572006226, "learning_rate": 4.5431600201918226e-08, - "loss": 0.0534, + "loss": 0.0472, "step": 9890 }, { "epoch": 4.997476022211004, - "grad_norm": 0.6361636519432068, + "grad_norm": 0.710657000541687, "learning_rate": 1.514386673397274e-08, - "loss": 0.0595, + "loss": 0.0582, "step": 9900 }, { "epoch": 5.0, - "eval_f1": 0.9429269569770486, - "eval_loss": 0.0460049994289875, - "eval_runtime": 555.0466, - "eval_samples_per_second": 371.612, - "eval_steps_per_second": 2.904, + "eval_f1": 0.9705180789481339, + "eval_loss": 0.03909851238131523, + "eval_runtime": 579.4034, + "eval_samples_per_second": 355.99, + "eval_steps_per_second": 2.782, "step": 9905 }, { "epoch": 5.0, "step": 9905, - "total_flos": 9.820471825285631e+19, - "train_loss": 0.011631221487809769, - "train_runtime": 2940.6816, - "train_samples_per_second": 430.955, - "train_steps_per_second": 3.368 + "total_flos": 9.82152667464321e+19, + "train_loss": 0.0, + "train_runtime": 0.074, + "train_samples_per_second": 17128131.172, + "train_steps_per_second": 133870.543 } ], "logging_steps": 10, @@ -7019,7 +7019,7 @@ "attributes": {} } }, - "total_flos": 9.820471825285631e+19, + "total_flos": 9.82152667464321e+19, "train_batch_size": 128, "trial_name": null, "trial_params": null