{ "best_metric": 0.04101279005408287, "best_model_checkpoint": "./test_default_model/checkpoint-3962", "epoch": 5.0, "eval_steps": 500, "global_step": 9905, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005047955577990914, "grad_norm": 1.0162577629089355, "learning_rate": 2.9969712266532054e-05, "loss": 0.2788, "step": 10 }, { "epoch": 0.010095911155981827, "grad_norm": 1.099901556968689, "learning_rate": 2.993942453306411e-05, "loss": 0.1588, "step": 20 }, { "epoch": 0.01514386673397274, "grad_norm": 0.8216469883918762, "learning_rate": 2.9909136799596164e-05, "loss": 0.1325, "step": 30 }, { "epoch": 0.020191822311963654, "grad_norm": 1.0275465250015259, "learning_rate": 2.987884906612822e-05, "loss": 0.1362, "step": 40 }, { "epoch": 0.02523977788995457, "grad_norm": 0.49435922503471375, "learning_rate": 2.9848561332660275e-05, "loss": 0.1235, "step": 50 }, { "epoch": 0.03028773346794548, "grad_norm": 0.6702704429626465, "learning_rate": 2.9818273599192328e-05, "loss": 0.1141, "step": 60 }, { "epoch": 0.0353356890459364, "grad_norm": 0.5657111406326294, "learning_rate": 2.978798586572438e-05, "loss": 0.1125, "step": 70 }, { "epoch": 0.04038364462392731, "grad_norm": 1.2878154516220093, "learning_rate": 2.9757698132256435e-05, "loss": 0.1172, "step": 80 }, { "epoch": 0.04543160020191822, "grad_norm": 1.3100908994674683, "learning_rate": 2.9727410398788492e-05, "loss": 0.1222, "step": 90 }, { "epoch": 0.05047955577990914, "grad_norm": 0.923688530921936, "learning_rate": 2.9697122665320545e-05, "loss": 0.1295, "step": 100 }, { "epoch": 0.05552751135790005, "grad_norm": 0.7309263944625854, "learning_rate": 2.96668349318526e-05, "loss": 0.1163, "step": 110 }, { "epoch": 0.06057546693589096, "grad_norm": 0.6152750849723816, "learning_rate": 2.9636547198384656e-05, "loss": 0.1134, "step": 120 }, { "epoch": 0.06562342251388188, "grad_norm": 0.4956571161746979, "learning_rate": 2.960625946491671e-05, "loss": 0.1107, "step": 130 }, { "epoch": 0.0706713780918728, "grad_norm": 0.7335121035575867, "learning_rate": 2.9575971731448766e-05, "loss": 0.1209, "step": 140 }, { "epoch": 0.0757193336698637, "grad_norm": 0.873475193977356, "learning_rate": 2.954568399798082e-05, "loss": 0.1192, "step": 150 }, { "epoch": 0.08076728924785462, "grad_norm": 0.539779543876648, "learning_rate": 2.9515396264512873e-05, "loss": 0.0961, "step": 160 }, { "epoch": 0.08581524482584553, "grad_norm": 0.8240886926651001, "learning_rate": 2.948510853104493e-05, "loss": 0.1111, "step": 170 }, { "epoch": 0.09086320040383644, "grad_norm": 0.8032135367393494, "learning_rate": 2.9454820797576983e-05, "loss": 0.0917, "step": 180 }, { "epoch": 0.09591115598182735, "grad_norm": 1.6522753238677979, "learning_rate": 2.9424533064109037e-05, "loss": 0.1093, "step": 190 }, { "epoch": 0.10095911155981828, "grad_norm": 0.4631141126155853, "learning_rate": 2.939424533064109e-05, "loss": 0.1168, "step": 200 }, { "epoch": 0.10600706713780919, "grad_norm": 0.7879306077957153, "learning_rate": 2.9363957597173144e-05, "loss": 0.1071, "step": 210 }, { "epoch": 0.1110550227158001, "grad_norm": 0.6251317262649536, "learning_rate": 2.93336698637052e-05, "loss": 0.1183, "step": 220 }, { "epoch": 0.11610297829379101, "grad_norm": 1.182413935661316, "learning_rate": 2.9303382130237254e-05, "loss": 0.1046, "step": 230 }, { "epoch": 0.12115093387178193, "grad_norm": 0.5547954440116882, "learning_rate": 2.9273094396769307e-05, "loss": 0.0862, "step": 240 }, { "epoch": 0.12619888944977284, "grad_norm": 0.9633322954177856, "learning_rate": 2.9242806663301364e-05, "loss": 0.1027, "step": 250 }, { "epoch": 0.13124684502776376, "grad_norm": 0.3100033402442932, "learning_rate": 2.9212518929833418e-05, "loss": 0.0826, "step": 260 }, { "epoch": 0.13629480060575466, "grad_norm": 0.6877946257591248, "learning_rate": 2.9182231196365474e-05, "loss": 0.1009, "step": 270 }, { "epoch": 0.1413427561837456, "grad_norm": 0.6235649585723877, "learning_rate": 2.9151943462897528e-05, "loss": 0.0923, "step": 280 }, { "epoch": 0.1463907117617365, "grad_norm": 0.4079645276069641, "learning_rate": 2.912165572942958e-05, "loss": 0.087, "step": 290 }, { "epoch": 0.1514386673397274, "grad_norm": 0.4664750397205353, "learning_rate": 2.9091367995961638e-05, "loss": 0.0972, "step": 300 }, { "epoch": 0.15648662291771834, "grad_norm": 0.49487921595573425, "learning_rate": 2.906108026249369e-05, "loss": 0.101, "step": 310 }, { "epoch": 0.16153457849570924, "grad_norm": 0.4716992974281311, "learning_rate": 2.9030792529025745e-05, "loss": 0.0976, "step": 320 }, { "epoch": 0.16658253407370016, "grad_norm": 0.5814321637153625, "learning_rate": 2.90005047955578e-05, "loss": 0.1011, "step": 330 }, { "epoch": 0.17163048965169106, "grad_norm": 0.48927783966064453, "learning_rate": 2.8970217062089852e-05, "loss": 0.0954, "step": 340 }, { "epoch": 0.17667844522968199, "grad_norm": 0.46283578872680664, "learning_rate": 2.893992932862191e-05, "loss": 0.0893, "step": 350 }, { "epoch": 0.18172640080767288, "grad_norm": 0.39830565452575684, "learning_rate": 2.8909641595153962e-05, "loss": 0.0906, "step": 360 }, { "epoch": 0.1867743563856638, "grad_norm": 0.39141398668289185, "learning_rate": 2.887935386168602e-05, "loss": 0.0934, "step": 370 }, { "epoch": 0.1918223119636547, "grad_norm": 0.5772321224212646, "learning_rate": 2.8849066128218072e-05, "loss": 0.096, "step": 380 }, { "epoch": 0.19687026754164563, "grad_norm": 0.5649458765983582, "learning_rate": 2.8818778394750126e-05, "loss": 0.0869, "step": 390 }, { "epoch": 0.20191822311963656, "grad_norm": 0.9187427163124084, "learning_rate": 2.8788490661282183e-05, "loss": 0.0895, "step": 400 }, { "epoch": 0.20696617869762746, "grad_norm": 0.555701494216919, "learning_rate": 2.8758202927814236e-05, "loss": 0.0932, "step": 410 }, { "epoch": 0.21201413427561838, "grad_norm": 0.4509018361568451, "learning_rate": 2.8727915194346293e-05, "loss": 0.094, "step": 420 }, { "epoch": 0.21706208985360928, "grad_norm": 0.47534915804862976, "learning_rate": 2.8697627460878346e-05, "loss": 0.0887, "step": 430 }, { "epoch": 0.2221100454316002, "grad_norm": 1.2933378219604492, "learning_rate": 2.86673397274104e-05, "loss": 0.0931, "step": 440 }, { "epoch": 0.2271580010095911, "grad_norm": 0.8813143968582153, "learning_rate": 2.8637051993942453e-05, "loss": 0.1047, "step": 450 }, { "epoch": 0.23220595658758203, "grad_norm": 0.6796084642410278, "learning_rate": 2.8606764260474507e-05, "loss": 0.0979, "step": 460 }, { "epoch": 0.23725391216557296, "grad_norm": 0.8377231359481812, "learning_rate": 2.8576476527006564e-05, "loss": 0.0994, "step": 470 }, { "epoch": 0.24230186774356385, "grad_norm": 0.45192497968673706, "learning_rate": 2.8546188793538617e-05, "loss": 0.1016, "step": 480 }, { "epoch": 0.24734982332155478, "grad_norm": 0.6596019268035889, "learning_rate": 2.851590106007067e-05, "loss": 0.0929, "step": 490 }, { "epoch": 0.2523977788995457, "grad_norm": 0.33004361391067505, "learning_rate": 2.8485613326602727e-05, "loss": 0.0893, "step": 500 }, { "epoch": 0.2574457344775366, "grad_norm": 0.5023931264877319, "learning_rate": 2.845532559313478e-05, "loss": 0.0766, "step": 510 }, { "epoch": 0.26249369005552753, "grad_norm": 0.4122072458267212, "learning_rate": 2.8425037859666834e-05, "loss": 0.0808, "step": 520 }, { "epoch": 0.2675416456335184, "grad_norm": 0.43148109316825867, "learning_rate": 2.839475012619889e-05, "loss": 0.0852, "step": 530 }, { "epoch": 0.2725896012115093, "grad_norm": 0.4459318518638611, "learning_rate": 2.8364462392730945e-05, "loss": 0.0857, "step": 540 }, { "epoch": 0.27763755678950025, "grad_norm": 0.47170397639274597, "learning_rate": 2.8334174659263e-05, "loss": 0.1001, "step": 550 }, { "epoch": 0.2826855123674912, "grad_norm": 0.4913211464881897, "learning_rate": 2.8303886925795055e-05, "loss": 0.0927, "step": 560 }, { "epoch": 0.2877334679454821, "grad_norm": 0.5056201219558716, "learning_rate": 2.8273599192327108e-05, "loss": 0.0853, "step": 570 }, { "epoch": 0.292781423523473, "grad_norm": 0.9730571508407593, "learning_rate": 2.8243311458859162e-05, "loss": 0.0885, "step": 580 }, { "epoch": 0.2978293791014639, "grad_norm": 0.4681917130947113, "learning_rate": 2.8213023725391215e-05, "loss": 0.0846, "step": 590 }, { "epoch": 0.3028773346794548, "grad_norm": 0.43745434284210205, "learning_rate": 2.8182735991923272e-05, "loss": 0.0849, "step": 600 }, { "epoch": 0.30792529025744575, "grad_norm": 0.5624836683273315, "learning_rate": 2.8152448258455325e-05, "loss": 0.0833, "step": 610 }, { "epoch": 0.3129732458354367, "grad_norm": 0.7955760359764099, "learning_rate": 2.812216052498738e-05, "loss": 0.0917, "step": 620 }, { "epoch": 0.31802120141342755, "grad_norm": 0.3756354749202728, "learning_rate": 2.8091872791519436e-05, "loss": 0.096, "step": 630 }, { "epoch": 0.32306915699141847, "grad_norm": 0.511903703212738, "learning_rate": 2.806158505805149e-05, "loss": 0.0921, "step": 640 }, { "epoch": 0.3281171125694094, "grad_norm": 0.41848480701446533, "learning_rate": 2.8031297324583546e-05, "loss": 0.091, "step": 650 }, { "epoch": 0.3331650681474003, "grad_norm": 0.5067800283432007, "learning_rate": 2.80010095911156e-05, "loss": 0.0843, "step": 660 }, { "epoch": 0.3382130237253912, "grad_norm": 0.4545990228652954, "learning_rate": 2.7970721857647653e-05, "loss": 0.0796, "step": 670 }, { "epoch": 0.3432609793033821, "grad_norm": 0.5365005135536194, "learning_rate": 2.794043412417971e-05, "loss": 0.0814, "step": 680 }, { "epoch": 0.34830893488137304, "grad_norm": 0.5178242921829224, "learning_rate": 2.7910146390711763e-05, "loss": 0.0854, "step": 690 }, { "epoch": 0.35335689045936397, "grad_norm": 0.3831228017807007, "learning_rate": 2.787985865724382e-05, "loss": 0.0867, "step": 700 }, { "epoch": 0.3584048460373549, "grad_norm": 0.785432755947113, "learning_rate": 2.784957092377587e-05, "loss": 0.0959, "step": 710 }, { "epoch": 0.36345280161534577, "grad_norm": 0.4992307126522064, "learning_rate": 2.7819283190307924e-05, "loss": 0.0857, "step": 720 }, { "epoch": 0.3685007571933367, "grad_norm": 0.337698757648468, "learning_rate": 2.778899545683998e-05, "loss": 0.0833, "step": 730 }, { "epoch": 0.3735487127713276, "grad_norm": 0.4655255675315857, "learning_rate": 2.7758707723372034e-05, "loss": 0.0773, "step": 740 }, { "epoch": 0.37859666834931854, "grad_norm": 0.3810546398162842, "learning_rate": 2.772841998990409e-05, "loss": 0.0858, "step": 750 }, { "epoch": 0.3836446239273094, "grad_norm": 0.4854646325111389, "learning_rate": 2.7698132256436144e-05, "loss": 0.0901, "step": 760 }, { "epoch": 0.38869257950530034, "grad_norm": 0.5742236375808716, "learning_rate": 2.7667844522968198e-05, "loss": 0.0775, "step": 770 }, { "epoch": 0.39374053508329127, "grad_norm": 0.40320712327957153, "learning_rate": 2.7637556789500254e-05, "loss": 0.0837, "step": 780 }, { "epoch": 0.3987884906612822, "grad_norm": 0.8492166996002197, "learning_rate": 2.7607269056032308e-05, "loss": 0.0743, "step": 790 }, { "epoch": 0.4038364462392731, "grad_norm": 0.3958197236061096, "learning_rate": 2.757698132256436e-05, "loss": 0.0849, "step": 800 }, { "epoch": 0.408884401817264, "grad_norm": 0.4942672848701477, "learning_rate": 2.7546693589096418e-05, "loss": 0.0843, "step": 810 }, { "epoch": 0.4139323573952549, "grad_norm": 0.4589976966381073, "learning_rate": 2.751640585562847e-05, "loss": 0.0737, "step": 820 }, { "epoch": 0.41898031297324584, "grad_norm": 0.7180899381637573, "learning_rate": 2.748611812216053e-05, "loss": 0.0871, "step": 830 }, { "epoch": 0.42402826855123676, "grad_norm": 0.9610480070114136, "learning_rate": 2.745583038869258e-05, "loss": 0.0939, "step": 840 }, { "epoch": 0.4290762241292277, "grad_norm": 0.5229182243347168, "learning_rate": 2.7425542655224632e-05, "loss": 0.0856, "step": 850 }, { "epoch": 0.43412417970721856, "grad_norm": 0.4386545717716217, "learning_rate": 2.739525492175669e-05, "loss": 0.0827, "step": 860 }, { "epoch": 0.4391721352852095, "grad_norm": 0.40693387389183044, "learning_rate": 2.7364967188288742e-05, "loss": 0.0756, "step": 870 }, { "epoch": 0.4442200908632004, "grad_norm": 0.5173991918563843, "learning_rate": 2.73346794548208e-05, "loss": 0.0783, "step": 880 }, { "epoch": 0.44926804644119134, "grad_norm": 0.3451812267303467, "learning_rate": 2.7304391721352853e-05, "loss": 0.0911, "step": 890 }, { "epoch": 0.4543160020191822, "grad_norm": 0.5798471570014954, "learning_rate": 2.7274103987884906e-05, "loss": 0.084, "step": 900 }, { "epoch": 0.45936395759717313, "grad_norm": 0.3773733675479889, "learning_rate": 2.7243816254416963e-05, "loss": 0.0838, "step": 910 }, { "epoch": 0.46441191317516406, "grad_norm": 0.4767201840877533, "learning_rate": 2.7213528520949016e-05, "loss": 0.0809, "step": 920 }, { "epoch": 0.469459868753155, "grad_norm": 0.43917012214660645, "learning_rate": 2.7183240787481073e-05, "loss": 0.082, "step": 930 }, { "epoch": 0.4745078243311459, "grad_norm": 0.7668654322624207, "learning_rate": 2.7152953054013127e-05, "loss": 0.0791, "step": 940 }, { "epoch": 0.4795557799091368, "grad_norm": 0.5831783413887024, "learning_rate": 2.712266532054518e-05, "loss": 0.0903, "step": 950 }, { "epoch": 0.4846037354871277, "grad_norm": 0.4111500084400177, "learning_rate": 2.7092377587077233e-05, "loss": 0.0962, "step": 960 }, { "epoch": 0.48965169106511863, "grad_norm": 0.48939448595046997, "learning_rate": 2.7062089853609287e-05, "loss": 0.0798, "step": 970 }, { "epoch": 0.49469964664310956, "grad_norm": 0.6999348998069763, "learning_rate": 2.7031802120141344e-05, "loss": 0.0873, "step": 980 }, { "epoch": 0.49974760222110043, "grad_norm": 0.4761042594909668, "learning_rate": 2.7001514386673397e-05, "loss": 0.0902, "step": 990 }, { "epoch": 0.5047955577990914, "grad_norm": 0.6692693829536438, "learning_rate": 2.697122665320545e-05, "loss": 0.0768, "step": 1000 }, { "epoch": 0.5098435133770823, "grad_norm": 0.3619178831577301, "learning_rate": 2.6940938919737507e-05, "loss": 0.0767, "step": 1010 }, { "epoch": 0.5148914689550732, "grad_norm": 0.4190191328525543, "learning_rate": 2.691065118626956e-05, "loss": 0.0917, "step": 1020 }, { "epoch": 0.5199394245330641, "grad_norm": 0.3305515646934509, "learning_rate": 2.6880363452801618e-05, "loss": 0.0905, "step": 1030 }, { "epoch": 0.5249873801110551, "grad_norm": 0.46522971987724304, "learning_rate": 2.685007571933367e-05, "loss": 0.0797, "step": 1040 }, { "epoch": 0.5300353356890459, "grad_norm": 0.39177805185317993, "learning_rate": 2.6819787985865725e-05, "loss": 0.0725, "step": 1050 }, { "epoch": 0.5350832912670368, "grad_norm": 0.6978829503059387, "learning_rate": 2.678950025239778e-05, "loss": 0.0834, "step": 1060 }, { "epoch": 0.5401312468450278, "grad_norm": 0.40147507190704346, "learning_rate": 2.6759212518929835e-05, "loss": 0.0824, "step": 1070 }, { "epoch": 0.5451792024230186, "grad_norm": 0.6341513395309448, "learning_rate": 2.6728924785461892e-05, "loss": 0.0831, "step": 1080 }, { "epoch": 0.5502271580010096, "grad_norm": 0.3328685164451599, "learning_rate": 2.6698637051993942e-05, "loss": 0.0746, "step": 1090 }, { "epoch": 0.5552751135790005, "grad_norm": 0.5470515489578247, "learning_rate": 2.6668349318525995e-05, "loss": 0.0784, "step": 1100 }, { "epoch": 0.5603230691569914, "grad_norm": 0.8354987502098083, "learning_rate": 2.6638061585058052e-05, "loss": 0.0778, "step": 1110 }, { "epoch": 0.5653710247349824, "grad_norm": 0.45674967765808105, "learning_rate": 2.6607773851590106e-05, "loss": 0.0739, "step": 1120 }, { "epoch": 0.5704189803129732, "grad_norm": 0.3991139829158783, "learning_rate": 2.657748611812216e-05, "loss": 0.0877, "step": 1130 }, { "epoch": 0.5754669358909642, "grad_norm": 0.500252902507782, "learning_rate": 2.6547198384654216e-05, "loss": 0.0866, "step": 1140 }, { "epoch": 0.5805148914689551, "grad_norm": 0.42237555980682373, "learning_rate": 2.651691065118627e-05, "loss": 0.0793, "step": 1150 }, { "epoch": 0.585562847046946, "grad_norm": 0.3488081097602844, "learning_rate": 2.6486622917718326e-05, "loss": 0.0739, "step": 1160 }, { "epoch": 0.5906108026249369, "grad_norm": 0.8973365426063538, "learning_rate": 2.645633518425038e-05, "loss": 0.0784, "step": 1170 }, { "epoch": 0.5956587582029278, "grad_norm": 0.459522008895874, "learning_rate": 2.6426047450782433e-05, "loss": 0.0865, "step": 1180 }, { "epoch": 0.6007067137809188, "grad_norm": 0.7989380955696106, "learning_rate": 2.639575971731449e-05, "loss": 0.0794, "step": 1190 }, { "epoch": 0.6057546693589096, "grad_norm": 0.40716055035591125, "learning_rate": 2.6365471983846543e-05, "loss": 0.085, "step": 1200 }, { "epoch": 0.6108026249369005, "grad_norm": 0.3626324534416199, "learning_rate": 2.63351842503786e-05, "loss": 0.0737, "step": 1210 }, { "epoch": 0.6158505805148915, "grad_norm": 0.4917464852333069, "learning_rate": 2.630489651691065e-05, "loss": 0.0807, "step": 1220 }, { "epoch": 0.6208985360928824, "grad_norm": 0.41341814398765564, "learning_rate": 2.6274608783442704e-05, "loss": 0.0806, "step": 1230 }, { "epoch": 0.6259464916708734, "grad_norm": 0.3172214925289154, "learning_rate": 2.624432104997476e-05, "loss": 0.0779, "step": 1240 }, { "epoch": 0.6309944472488642, "grad_norm": 0.7099377512931824, "learning_rate": 2.6214033316506814e-05, "loss": 0.0815, "step": 1250 }, { "epoch": 0.6360424028268551, "grad_norm": 0.5447896122932434, "learning_rate": 2.618374558303887e-05, "loss": 0.0844, "step": 1260 }, { "epoch": 0.6410903584048461, "grad_norm": 0.4068484604358673, "learning_rate": 2.6153457849570924e-05, "loss": 0.0828, "step": 1270 }, { "epoch": 0.6461383139828369, "grad_norm": 0.576968789100647, "learning_rate": 2.6123170116102978e-05, "loss": 0.0825, "step": 1280 }, { "epoch": 0.6511862695608278, "grad_norm": 0.4223102033138275, "learning_rate": 2.6092882382635034e-05, "loss": 0.0828, "step": 1290 }, { "epoch": 0.6562342251388188, "grad_norm": 0.4649188816547394, "learning_rate": 2.6062594649167088e-05, "loss": 0.0752, "step": 1300 }, { "epoch": 0.6612821807168097, "grad_norm": 0.3611209988594055, "learning_rate": 2.6032306915699145e-05, "loss": 0.0692, "step": 1310 }, { "epoch": 0.6663301362948006, "grad_norm": 0.5452816486358643, "learning_rate": 2.6002019182231198e-05, "loss": 0.0765, "step": 1320 }, { "epoch": 0.6713780918727915, "grad_norm": 0.608113169670105, "learning_rate": 2.597173144876325e-05, "loss": 0.0799, "step": 1330 }, { "epoch": 0.6764260474507824, "grad_norm": 0.37480154633522034, "learning_rate": 2.594144371529531e-05, "loss": 0.0808, "step": 1340 }, { "epoch": 0.6814740030287734, "grad_norm": 0.37567126750946045, "learning_rate": 2.591115598182736e-05, "loss": 0.0738, "step": 1350 }, { "epoch": 0.6865219586067642, "grad_norm": 0.570625364780426, "learning_rate": 2.5880868248359415e-05, "loss": 0.0868, "step": 1360 }, { "epoch": 0.6915699141847552, "grad_norm": 0.41150689125061035, "learning_rate": 2.585058051489147e-05, "loss": 0.0857, "step": 1370 }, { "epoch": 0.6966178697627461, "grad_norm": 0.35640430450439453, "learning_rate": 2.5820292781423522e-05, "loss": 0.0832, "step": 1380 }, { "epoch": 0.701665825340737, "grad_norm": 0.6833171844482422, "learning_rate": 2.579000504795558e-05, "loss": 0.079, "step": 1390 }, { "epoch": 0.7067137809187279, "grad_norm": 0.4165969491004944, "learning_rate": 2.5759717314487633e-05, "loss": 0.0721, "step": 1400 }, { "epoch": 0.7117617364967188, "grad_norm": 0.40253061056137085, "learning_rate": 2.572942958101969e-05, "loss": 0.0735, "step": 1410 }, { "epoch": 0.7168096920747098, "grad_norm": 0.42983728647232056, "learning_rate": 2.5699141847551743e-05, "loss": 0.0798, "step": 1420 }, { "epoch": 0.7218576476527007, "grad_norm": 0.3952350914478302, "learning_rate": 2.5668854114083796e-05, "loss": 0.0694, "step": 1430 }, { "epoch": 0.7269056032306915, "grad_norm": 0.5461121201515198, "learning_rate": 2.5638566380615853e-05, "loss": 0.0843, "step": 1440 }, { "epoch": 0.7319535588086825, "grad_norm": 0.62554931640625, "learning_rate": 2.5608278647147907e-05, "loss": 0.0765, "step": 1450 }, { "epoch": 0.7370015143866734, "grad_norm": 0.5430169105529785, "learning_rate": 2.557799091367996e-05, "loss": 0.0855, "step": 1460 }, { "epoch": 0.7420494699646644, "grad_norm": 0.5172569751739502, "learning_rate": 2.5547703180212014e-05, "loss": 0.0771, "step": 1470 }, { "epoch": 0.7470974255426552, "grad_norm": 0.4924006164073944, "learning_rate": 2.5517415446744067e-05, "loss": 0.0829, "step": 1480 }, { "epoch": 0.7521453811206461, "grad_norm": 0.29295244812965393, "learning_rate": 2.5487127713276124e-05, "loss": 0.0831, "step": 1490 }, { "epoch": 0.7571933366986371, "grad_norm": 0.6032044887542725, "learning_rate": 2.5456839979808177e-05, "loss": 0.0751, "step": 1500 }, { "epoch": 0.762241292276628, "grad_norm": 0.5197745561599731, "learning_rate": 2.542655224634023e-05, "loss": 0.0764, "step": 1510 }, { "epoch": 0.7672892478546188, "grad_norm": 0.3689173758029938, "learning_rate": 2.5396264512872288e-05, "loss": 0.0758, "step": 1520 }, { "epoch": 0.7723372034326098, "grad_norm": 0.5350760817527771, "learning_rate": 2.536597677940434e-05, "loss": 0.0863, "step": 1530 }, { "epoch": 0.7773851590106007, "grad_norm": 0.37884262204170227, "learning_rate": 2.5335689045936398e-05, "loss": 0.0703, "step": 1540 }, { "epoch": 0.7824331145885917, "grad_norm": 0.3809797167778015, "learning_rate": 2.530540131246845e-05, "loss": 0.0761, "step": 1550 }, { "epoch": 0.7874810701665825, "grad_norm": 0.5026581287384033, "learning_rate": 2.5275113579000505e-05, "loss": 0.0737, "step": 1560 }, { "epoch": 0.7925290257445734, "grad_norm": 0.46075060963630676, "learning_rate": 2.524482584553256e-05, "loss": 0.0798, "step": 1570 }, { "epoch": 0.7975769813225644, "grad_norm": 0.4620317220687866, "learning_rate": 2.5214538112064615e-05, "loss": 0.089, "step": 1580 }, { "epoch": 0.8026249369005553, "grad_norm": 0.46049225330352783, "learning_rate": 2.5184250378596672e-05, "loss": 0.0842, "step": 1590 }, { "epoch": 0.8076728924785462, "grad_norm": 0.3389497995376587, "learning_rate": 2.5153962645128722e-05, "loss": 0.0759, "step": 1600 }, { "epoch": 0.8127208480565371, "grad_norm": 0.34683436155319214, "learning_rate": 2.5123674911660775e-05, "loss": 0.0731, "step": 1610 }, { "epoch": 0.817768803634528, "grad_norm": 0.3016813397407532, "learning_rate": 2.5093387178192832e-05, "loss": 0.0828, "step": 1620 }, { "epoch": 0.822816759212519, "grad_norm": 0.563191294670105, "learning_rate": 2.5063099444724886e-05, "loss": 0.0791, "step": 1630 }, { "epoch": 0.8278647147905098, "grad_norm": 0.33876487612724304, "learning_rate": 2.5032811711256942e-05, "loss": 0.0776, "step": 1640 }, { "epoch": 0.8329126703685008, "grad_norm": 0.4185733497142792, "learning_rate": 2.5002523977788996e-05, "loss": 0.0758, "step": 1650 }, { "epoch": 0.8379606259464917, "grad_norm": 0.3273310959339142, "learning_rate": 2.497223624432105e-05, "loss": 0.0775, "step": 1660 }, { "epoch": 0.8430085815244825, "grad_norm": 0.5738667845726013, "learning_rate": 2.4941948510853106e-05, "loss": 0.0723, "step": 1670 }, { "epoch": 0.8480565371024735, "grad_norm": 0.35539621114730835, "learning_rate": 2.491166077738516e-05, "loss": 0.0725, "step": 1680 }, { "epoch": 0.8531044926804644, "grad_norm": 0.45273271203041077, "learning_rate": 2.4881373043917216e-05, "loss": 0.0803, "step": 1690 }, { "epoch": 0.8581524482584554, "grad_norm": 0.48917362093925476, "learning_rate": 2.485108531044927e-05, "loss": 0.0778, "step": 1700 }, { "epoch": 0.8632004038364463, "grad_norm": 0.44357436895370483, "learning_rate": 2.4820797576981323e-05, "loss": 0.0885, "step": 1710 }, { "epoch": 0.8682483594144371, "grad_norm": 0.2906216084957123, "learning_rate": 2.479050984351338e-05, "loss": 0.0817, "step": 1720 }, { "epoch": 0.8732963149924281, "grad_norm": 0.4553854763507843, "learning_rate": 2.476022211004543e-05, "loss": 0.0672, "step": 1730 }, { "epoch": 0.878344270570419, "grad_norm": 0.35258758068084717, "learning_rate": 2.4729934376577487e-05, "loss": 0.067, "step": 1740 }, { "epoch": 0.8833922261484098, "grad_norm": 0.38019898533821106, "learning_rate": 2.469964664310954e-05, "loss": 0.0781, "step": 1750 }, { "epoch": 0.8884401817264008, "grad_norm": 0.3867049813270569, "learning_rate": 2.4669358909641594e-05, "loss": 0.0808, "step": 1760 }, { "epoch": 0.8934881373043917, "grad_norm": 0.5274444222450256, "learning_rate": 2.463907117617365e-05, "loss": 0.0852, "step": 1770 }, { "epoch": 0.8985360928823827, "grad_norm": 0.34507790207862854, "learning_rate": 2.4608783442705704e-05, "loss": 0.0832, "step": 1780 }, { "epoch": 0.9035840484603735, "grad_norm": 0.41448697447776794, "learning_rate": 2.4578495709237758e-05, "loss": 0.0744, "step": 1790 }, { "epoch": 0.9086320040383644, "grad_norm": 0.4750345051288605, "learning_rate": 2.4548207975769815e-05, "loss": 0.0787, "step": 1800 }, { "epoch": 0.9136799596163554, "grad_norm": 0.35344141721725464, "learning_rate": 2.4517920242301868e-05, "loss": 0.0743, "step": 1810 }, { "epoch": 0.9187279151943463, "grad_norm": 0.5075356960296631, "learning_rate": 2.4487632508833925e-05, "loss": 0.0737, "step": 1820 }, { "epoch": 0.9237758707723372, "grad_norm": 0.4184614419937134, "learning_rate": 2.4457344775365978e-05, "loss": 0.0809, "step": 1830 }, { "epoch": 0.9288238263503281, "grad_norm": 0.3176049590110779, "learning_rate": 2.4427057041898032e-05, "loss": 0.077, "step": 1840 }, { "epoch": 0.933871781928319, "grad_norm": 0.5197444558143616, "learning_rate": 2.439676930843009e-05, "loss": 0.0809, "step": 1850 }, { "epoch": 0.93891973750631, "grad_norm": 0.3748112916946411, "learning_rate": 2.436648157496214e-05, "loss": 0.0764, "step": 1860 }, { "epoch": 0.9439676930843008, "grad_norm": 0.2999976575374603, "learning_rate": 2.4336193841494195e-05, "loss": 0.0783, "step": 1870 }, { "epoch": 0.9490156486622918, "grad_norm": 0.4079499840736389, "learning_rate": 2.430590610802625e-05, "loss": 0.0807, "step": 1880 }, { "epoch": 0.9540636042402827, "grad_norm": 0.2642356753349304, "learning_rate": 2.4275618374558302e-05, "loss": 0.0762, "step": 1890 }, { "epoch": 0.9591115598182736, "grad_norm": 0.31227466464042664, "learning_rate": 2.424533064109036e-05, "loss": 0.0709, "step": 1900 }, { "epoch": 0.9641595153962645, "grad_norm": 0.386422723531723, "learning_rate": 2.4215042907622413e-05, "loss": 0.0786, "step": 1910 }, { "epoch": 0.9692074709742554, "grad_norm": 0.7000331878662109, "learning_rate": 2.418475517415447e-05, "loss": 0.0849, "step": 1920 }, { "epoch": 0.9742554265522464, "grad_norm": 0.44338202476501465, "learning_rate": 2.4154467440686523e-05, "loss": 0.0811, "step": 1930 }, { "epoch": 0.9793033821302373, "grad_norm": 0.5613553524017334, "learning_rate": 2.4124179707218576e-05, "loss": 0.069, "step": 1940 }, { "epoch": 0.9843513377082281, "grad_norm": 0.2940104007720947, "learning_rate": 2.4093891973750633e-05, "loss": 0.0654, "step": 1950 }, { "epoch": 0.9893992932862191, "grad_norm": 0.37430045008659363, "learning_rate": 2.4063604240282687e-05, "loss": 0.094, "step": 1960 }, { "epoch": 0.99444724886421, "grad_norm": 0.4766289293766022, "learning_rate": 2.4033316506814744e-05, "loss": 0.0784, "step": 1970 }, { "epoch": 0.9994952044422009, "grad_norm": 0.35420897603034973, "learning_rate": 2.4003028773346797e-05, "loss": 0.0663, "step": 1980 }, { "epoch": 1.0, "eval_f1": 0.9705180789481339, "eval_loss": 0.05940837040543556, "eval_runtime": 553.5581, "eval_samples_per_second": 372.611, "eval_steps_per_second": 2.912, "step": 1981 }, { "epoch": 1.0045431600201917, "grad_norm": 0.2541043162345886, "learning_rate": 2.3972741039878847e-05, "loss": 0.0824, "step": 1990 }, { "epoch": 1.0095911155981827, "grad_norm": 0.4705805480480194, "learning_rate": 2.3942453306410904e-05, "loss": 0.0738, "step": 2000 }, { "epoch": 1.0146390711761737, "grad_norm": 0.44369685649871826, "learning_rate": 2.3912165572942957e-05, "loss": 0.0797, "step": 2010 }, { "epoch": 1.0196870267541647, "grad_norm": 0.4401172697544098, "learning_rate": 2.3881877839475014e-05, "loss": 0.0699, "step": 2020 }, { "epoch": 1.0247349823321554, "grad_norm": 0.5683963298797607, "learning_rate": 2.3851590106007068e-05, "loss": 0.0779, "step": 2030 }, { "epoch": 1.0297829379101464, "grad_norm": 0.7009720206260681, "learning_rate": 2.382130237253912e-05, "loss": 0.081, "step": 2040 }, { "epoch": 1.0348308934881374, "grad_norm": 0.3499268889427185, "learning_rate": 2.3791014639071178e-05, "loss": 0.0733, "step": 2050 }, { "epoch": 1.0398788490661282, "grad_norm": 0.25898194313049316, "learning_rate": 2.376072690560323e-05, "loss": 0.0786, "step": 2060 }, { "epoch": 1.0449268046441191, "grad_norm": 0.4099780023097992, "learning_rate": 2.3730439172135288e-05, "loss": 0.0745, "step": 2070 }, { "epoch": 1.0499747602221101, "grad_norm": 0.5677788853645325, "learning_rate": 2.370015143866734e-05, "loss": 0.0776, "step": 2080 }, { "epoch": 1.0550227158001009, "grad_norm": 0.724709689617157, "learning_rate": 2.3669863705199395e-05, "loss": 0.0738, "step": 2090 }, { "epoch": 1.0600706713780919, "grad_norm": 0.7656406164169312, "learning_rate": 2.3639575971731452e-05, "loss": 0.0877, "step": 2100 }, { "epoch": 1.0651186269560828, "grad_norm": 0.40501999855041504, "learning_rate": 2.3609288238263502e-05, "loss": 0.0805, "step": 2110 }, { "epoch": 1.0701665825340738, "grad_norm": 0.4794836640357971, "learning_rate": 2.3579000504795555e-05, "loss": 0.0839, "step": 2120 }, { "epoch": 1.0752145381120646, "grad_norm": 0.33964771032333374, "learning_rate": 2.3548712771327612e-05, "loss": 0.0744, "step": 2130 }, { "epoch": 1.0802624936900556, "grad_norm": 0.4785172641277313, "learning_rate": 2.3518425037859666e-05, "loss": 0.0779, "step": 2140 }, { "epoch": 1.0853104492680465, "grad_norm": 0.4255255162715912, "learning_rate": 2.3488137304391723e-05, "loss": 0.0723, "step": 2150 }, { "epoch": 1.0903584048460373, "grad_norm": 0.4259156584739685, "learning_rate": 2.3457849570923776e-05, "loss": 0.0713, "step": 2160 }, { "epoch": 1.0954063604240283, "grad_norm": 0.400991290807724, "learning_rate": 2.342756183745583e-05, "loss": 0.075, "step": 2170 }, { "epoch": 1.1004543160020193, "grad_norm": 0.4522845447063446, "learning_rate": 2.3397274103987886e-05, "loss": 0.0672, "step": 2180 }, { "epoch": 1.10550227158001, "grad_norm": 0.33158665895462036, "learning_rate": 2.336698637051994e-05, "loss": 0.083, "step": 2190 }, { "epoch": 1.110550227158001, "grad_norm": 0.4556925296783447, "learning_rate": 2.3336698637051997e-05, "loss": 0.0759, "step": 2200 }, { "epoch": 1.115598182735992, "grad_norm": 0.7028746008872986, "learning_rate": 2.330641090358405e-05, "loss": 0.0742, "step": 2210 }, { "epoch": 1.1206461383139827, "grad_norm": 0.4525831639766693, "learning_rate": 2.3276123170116103e-05, "loss": 0.0663, "step": 2220 }, { "epoch": 1.1256940938919737, "grad_norm": 0.35331177711486816, "learning_rate": 2.324583543664816e-05, "loss": 0.0667, "step": 2230 }, { "epoch": 1.1307420494699647, "grad_norm": 0.3286212682723999, "learning_rate": 2.321554770318021e-05, "loss": 0.0665, "step": 2240 }, { "epoch": 1.1357900050479555, "grad_norm": 0.3492475152015686, "learning_rate": 2.3185259969712267e-05, "loss": 0.0592, "step": 2250 }, { "epoch": 1.1408379606259464, "grad_norm": 0.4186830520629883, "learning_rate": 2.315497223624432e-05, "loss": 0.0768, "step": 2260 }, { "epoch": 1.1458859162039374, "grad_norm": 0.43187859654426575, "learning_rate": 2.3124684502776374e-05, "loss": 0.0747, "step": 2270 }, { "epoch": 1.1509338717819284, "grad_norm": 0.35978901386260986, "learning_rate": 2.309439676930843e-05, "loss": 0.0797, "step": 2280 }, { "epoch": 1.1559818273599192, "grad_norm": 0.6061636805534363, "learning_rate": 2.3064109035840484e-05, "loss": 0.0757, "step": 2290 }, { "epoch": 1.1610297829379101, "grad_norm": 0.4342908561229706, "learning_rate": 2.303382130237254e-05, "loss": 0.0747, "step": 2300 }, { "epoch": 1.1660777385159011, "grad_norm": 0.30125463008880615, "learning_rate": 2.3003533568904595e-05, "loss": 0.078, "step": 2310 }, { "epoch": 1.171125694093892, "grad_norm": 0.4021187424659729, "learning_rate": 2.2973245835436648e-05, "loss": 0.0714, "step": 2320 }, { "epoch": 1.1761736496718829, "grad_norm": 0.2937578856945038, "learning_rate": 2.2942958101968705e-05, "loss": 0.0684, "step": 2330 }, { "epoch": 1.1812216052498739, "grad_norm": 0.3535318970680237, "learning_rate": 2.291267036850076e-05, "loss": 0.0761, "step": 2340 }, { "epoch": 1.1862695608278648, "grad_norm": 0.7090115547180176, "learning_rate": 2.2882382635032815e-05, "loss": 0.0677, "step": 2350 }, { "epoch": 1.1913175164058556, "grad_norm": 0.40100908279418945, "learning_rate": 2.285209490156487e-05, "loss": 0.0795, "step": 2360 }, { "epoch": 1.1963654719838466, "grad_norm": 0.5058602690696716, "learning_rate": 2.282180716809692e-05, "loss": 0.0722, "step": 2370 }, { "epoch": 1.2014134275618376, "grad_norm": 0.5241690874099731, "learning_rate": 2.2791519434628976e-05, "loss": 0.0621, "step": 2380 }, { "epoch": 1.2064613831398283, "grad_norm": 0.4490416646003723, "learning_rate": 2.276123170116103e-05, "loss": 0.0669, "step": 2390 }, { "epoch": 1.2115093387178193, "grad_norm": 0.3629598021507263, "learning_rate": 2.2730943967693086e-05, "loss": 0.0742, "step": 2400 }, { "epoch": 1.2165572942958103, "grad_norm": 0.3595810532569885, "learning_rate": 2.270065623422514e-05, "loss": 0.0714, "step": 2410 }, { "epoch": 1.221605249873801, "grad_norm": 0.3362235128879547, "learning_rate": 2.2670368500757193e-05, "loss": 0.0695, "step": 2420 }, { "epoch": 1.226653205451792, "grad_norm": 0.574418306350708, "learning_rate": 2.264008076728925e-05, "loss": 0.0739, "step": 2430 }, { "epoch": 1.231701161029783, "grad_norm": 0.31744587421417236, "learning_rate": 2.2609793033821303e-05, "loss": 0.0741, "step": 2440 }, { "epoch": 1.2367491166077738, "grad_norm": 0.4100383520126343, "learning_rate": 2.2579505300353356e-05, "loss": 0.0744, "step": 2450 }, { "epoch": 1.2417970721857647, "grad_norm": 0.5664629936218262, "learning_rate": 2.2549217566885413e-05, "loss": 0.0775, "step": 2460 }, { "epoch": 1.2468450277637557, "grad_norm": 0.3939385414123535, "learning_rate": 2.2518929833417467e-05, "loss": 0.0694, "step": 2470 }, { "epoch": 1.2518929833417465, "grad_norm": 0.6710524559020996, "learning_rate": 2.2488642099949524e-05, "loss": 0.0892, "step": 2480 }, { "epoch": 1.2569409389197375, "grad_norm": 0.5967342853546143, "learning_rate": 2.2458354366481577e-05, "loss": 0.0775, "step": 2490 }, { "epoch": 1.2619888944977284, "grad_norm": 0.4590104818344116, "learning_rate": 2.2428066633013627e-05, "loss": 0.0797, "step": 2500 }, { "epoch": 1.2670368500757192, "grad_norm": 0.4371030330657959, "learning_rate": 2.2397778899545684e-05, "loss": 0.0632, "step": 2510 }, { "epoch": 1.2720848056537102, "grad_norm": 0.4119449257850647, "learning_rate": 2.2367491166077737e-05, "loss": 0.0734, "step": 2520 }, { "epoch": 1.2771327612317012, "grad_norm": 0.2977501153945923, "learning_rate": 2.2337203432609794e-05, "loss": 0.0715, "step": 2530 }, { "epoch": 1.2821807168096921, "grad_norm": 0.3859623968601227, "learning_rate": 2.2306915699141848e-05, "loss": 0.0661, "step": 2540 }, { "epoch": 1.2872286723876831, "grad_norm": 0.645246148109436, "learning_rate": 2.22766279656739e-05, "loss": 0.0855, "step": 2550 }, { "epoch": 1.2922766279656739, "grad_norm": 0.3807261288166046, "learning_rate": 2.2246340232205958e-05, "loss": 0.0774, "step": 2560 }, { "epoch": 1.2973245835436649, "grad_norm": 0.48953214287757874, "learning_rate": 2.221605249873801e-05, "loss": 0.0679, "step": 2570 }, { "epoch": 1.3023725391216558, "grad_norm": 0.511131763458252, "learning_rate": 2.2185764765270068e-05, "loss": 0.0692, "step": 2580 }, { "epoch": 1.3074204946996466, "grad_norm": 0.2933480143547058, "learning_rate": 2.215547703180212e-05, "loss": 0.0899, "step": 2590 }, { "epoch": 1.3124684502776376, "grad_norm": 0.38106125593185425, "learning_rate": 2.2125189298334175e-05, "loss": 0.0742, "step": 2600 }, { "epoch": 1.3175164058556286, "grad_norm": 0.400388240814209, "learning_rate": 2.2094901564866232e-05, "loss": 0.0714, "step": 2610 }, { "epoch": 1.3225643614336193, "grad_norm": 0.5102821588516235, "learning_rate": 2.2064613831398285e-05, "loss": 0.0822, "step": 2620 }, { "epoch": 1.3276123170116103, "grad_norm": 0.4120141565799713, "learning_rate": 2.203432609793034e-05, "loss": 0.0718, "step": 2630 }, { "epoch": 1.3326602725896013, "grad_norm": 0.2506933808326721, "learning_rate": 2.2004038364462392e-05, "loss": 0.0703, "step": 2640 }, { "epoch": 1.337708228167592, "grad_norm": 0.4566921293735504, "learning_rate": 2.1973750630994446e-05, "loss": 0.0724, "step": 2650 }, { "epoch": 1.342756183745583, "grad_norm": 0.41095855832099915, "learning_rate": 2.1943462897526503e-05, "loss": 0.0752, "step": 2660 }, { "epoch": 1.347804139323574, "grad_norm": 0.4002370536327362, "learning_rate": 2.1913175164058556e-05, "loss": 0.0719, "step": 2670 }, { "epoch": 1.3528520949015648, "grad_norm": 0.28318819403648376, "learning_rate": 2.1882887430590613e-05, "loss": 0.0609, "step": 2680 }, { "epoch": 1.3579000504795558, "grad_norm": 0.24140208959579468, "learning_rate": 2.1852599697122666e-05, "loss": 0.0612, "step": 2690 }, { "epoch": 1.3629480060575467, "grad_norm": 0.39612990617752075, "learning_rate": 2.182231196365472e-05, "loss": 0.0711, "step": 2700 }, { "epoch": 1.3679959616355375, "grad_norm": 0.48765823245048523, "learning_rate": 2.1792024230186777e-05, "loss": 0.074, "step": 2710 }, { "epoch": 1.3730439172135285, "grad_norm": 0.44596147537231445, "learning_rate": 2.176173649671883e-05, "loss": 0.0722, "step": 2720 }, { "epoch": 1.3780918727915195, "grad_norm": 0.3737035393714905, "learning_rate": 2.1731448763250883e-05, "loss": 0.0811, "step": 2730 }, { "epoch": 1.3831398283695102, "grad_norm": 0.7131165266036987, "learning_rate": 2.170116102978294e-05, "loss": 0.0729, "step": 2740 }, { "epoch": 1.3881877839475012, "grad_norm": 0.4601830244064331, "learning_rate": 2.167087329631499e-05, "loss": 0.0668, "step": 2750 }, { "epoch": 1.3932357395254922, "grad_norm": 0.4313521385192871, "learning_rate": 2.1640585562847047e-05, "loss": 0.0767, "step": 2760 }, { "epoch": 1.3982836951034832, "grad_norm": 0.2787948548793793, "learning_rate": 2.16102978293791e-05, "loss": 0.0703, "step": 2770 }, { "epoch": 1.4033316506814741, "grad_norm": 0.490631639957428, "learning_rate": 2.1580010095911154e-05, "loss": 0.0662, "step": 2780 }, { "epoch": 1.408379606259465, "grad_norm": 0.2579457759857178, "learning_rate": 2.154972236244321e-05, "loss": 0.0806, "step": 2790 }, { "epoch": 1.4134275618374559, "grad_norm": 0.6154518127441406, "learning_rate": 2.1519434628975264e-05, "loss": 0.0715, "step": 2800 }, { "epoch": 1.4184755174154469, "grad_norm": 0.3302385210990906, "learning_rate": 2.148914689550732e-05, "loss": 0.0716, "step": 2810 }, { "epoch": 1.4235234729934376, "grad_norm": 0.39062386751174927, "learning_rate": 2.1458859162039375e-05, "loss": 0.0733, "step": 2820 }, { "epoch": 1.4285714285714286, "grad_norm": 0.42694535851478577, "learning_rate": 2.1428571428571428e-05, "loss": 0.066, "step": 2830 }, { "epoch": 1.4336193841494196, "grad_norm": 0.3877299129962921, "learning_rate": 2.1398283695103485e-05, "loss": 0.0798, "step": 2840 }, { "epoch": 1.4386673397274103, "grad_norm": 0.45881980657577515, "learning_rate": 2.136799596163554e-05, "loss": 0.0672, "step": 2850 }, { "epoch": 1.4437152953054013, "grad_norm": 0.5283980965614319, "learning_rate": 2.1337708228167595e-05, "loss": 0.0694, "step": 2860 }, { "epoch": 1.4487632508833923, "grad_norm": 0.5722761750221252, "learning_rate": 2.130742049469965e-05, "loss": 0.0673, "step": 2870 }, { "epoch": 1.453811206461383, "grad_norm": 0.3672831654548645, "learning_rate": 2.12771327612317e-05, "loss": 0.0654, "step": 2880 }, { "epoch": 1.458859162039374, "grad_norm": 0.3739102780818939, "learning_rate": 2.1246845027763756e-05, "loss": 0.069, "step": 2890 }, { "epoch": 1.463907117617365, "grad_norm": 0.28704676032066345, "learning_rate": 2.121655729429581e-05, "loss": 0.0706, "step": 2900 }, { "epoch": 1.4689550731953558, "grad_norm": 0.6112382411956787, "learning_rate": 2.1186269560827866e-05, "loss": 0.0727, "step": 2910 }, { "epoch": 1.4740030287733468, "grad_norm": 0.28976988792419434, "learning_rate": 2.115598182735992e-05, "loss": 0.0661, "step": 2920 }, { "epoch": 1.4790509843513377, "grad_norm": 0.3798251152038574, "learning_rate": 2.1125694093891973e-05, "loss": 0.0721, "step": 2930 }, { "epoch": 1.4840989399293285, "grad_norm": 0.495906800031662, "learning_rate": 2.109540636042403e-05, "loss": 0.0732, "step": 2940 }, { "epoch": 1.4891468955073195, "grad_norm": 0.5157324075698853, "learning_rate": 2.1065118626956083e-05, "loss": 0.0681, "step": 2950 }, { "epoch": 1.4941948510853105, "grad_norm": 0.40662431716918945, "learning_rate": 2.103483089348814e-05, "loss": 0.0714, "step": 2960 }, { "epoch": 1.4992428066633012, "grad_norm": 0.4008966386318207, "learning_rate": 2.1004543160020193e-05, "loss": 0.077, "step": 2970 }, { "epoch": 1.5042907622412924, "grad_norm": 0.48692312836647034, "learning_rate": 2.0974255426552247e-05, "loss": 0.073, "step": 2980 }, { "epoch": 1.5093387178192832, "grad_norm": 0.3787757456302643, "learning_rate": 2.0943967693084304e-05, "loss": 0.0659, "step": 2990 }, { "epoch": 1.514386673397274, "grad_norm": 0.5147730112075806, "learning_rate": 2.0913679959616357e-05, "loss": 0.075, "step": 3000 }, { "epoch": 1.5194346289752652, "grad_norm": 0.24803757667541504, "learning_rate": 2.088339222614841e-05, "loss": 0.0721, "step": 3010 }, { "epoch": 1.524482584553256, "grad_norm": 0.5188020467758179, "learning_rate": 2.0853104492680464e-05, "loss": 0.0767, "step": 3020 }, { "epoch": 1.529530540131247, "grad_norm": 0.305984228849411, "learning_rate": 2.0822816759212517e-05, "loss": 0.076, "step": 3030 }, { "epoch": 1.5345784957092379, "grad_norm": 0.47039300203323364, "learning_rate": 2.0792529025744574e-05, "loss": 0.0779, "step": 3040 }, { "epoch": 1.5396264512872286, "grad_norm": 0.28816476464271545, "learning_rate": 2.0762241292276628e-05, "loss": 0.0704, "step": 3050 }, { "epoch": 1.5446744068652196, "grad_norm": 0.47483137249946594, "learning_rate": 2.073195355880868e-05, "loss": 0.0677, "step": 3060 }, { "epoch": 1.5497223624432106, "grad_norm": 0.41244029998779297, "learning_rate": 2.0701665825340738e-05, "loss": 0.0758, "step": 3070 }, { "epoch": 1.5547703180212014, "grad_norm": 0.34873828291893005, "learning_rate": 2.067137809187279e-05, "loss": 0.0724, "step": 3080 }, { "epoch": 1.5598182735991923, "grad_norm": 0.5220038294792175, "learning_rate": 2.0641090358404848e-05, "loss": 0.0698, "step": 3090 }, { "epoch": 1.5648662291771833, "grad_norm": 0.4924815595149994, "learning_rate": 2.0610802624936902e-05, "loss": 0.0665, "step": 3100 }, { "epoch": 1.569914184755174, "grad_norm": 0.4955058991909027, "learning_rate": 2.0580514891468955e-05, "loss": 0.0682, "step": 3110 }, { "epoch": 1.574962140333165, "grad_norm": 0.3984096050262451, "learning_rate": 2.0550227158001012e-05, "loss": 0.0714, "step": 3120 }, { "epoch": 1.580010095911156, "grad_norm": 0.7246518731117249, "learning_rate": 2.0519939424533065e-05, "loss": 0.0763, "step": 3130 }, { "epoch": 1.5850580514891468, "grad_norm": 0.3734409809112549, "learning_rate": 2.048965169106512e-05, "loss": 0.0762, "step": 3140 }, { "epoch": 1.5901060070671378, "grad_norm": 0.3476959466934204, "learning_rate": 2.0459363957597172e-05, "loss": 0.06, "step": 3150 }, { "epoch": 1.5951539626451288, "grad_norm": 0.37709012627601624, "learning_rate": 2.0429076224129226e-05, "loss": 0.0686, "step": 3160 }, { "epoch": 1.6002019182231195, "grad_norm": 0.4265778958797455, "learning_rate": 2.0398788490661283e-05, "loss": 0.0739, "step": 3170 }, { "epoch": 1.6052498738011105, "grad_norm": 0.42841967940330505, "learning_rate": 2.0368500757193336e-05, "loss": 0.071, "step": 3180 }, { "epoch": 1.6102978293791015, "grad_norm": 0.41701772809028625, "learning_rate": 2.0338213023725393e-05, "loss": 0.0666, "step": 3190 }, { "epoch": 1.6153457849570922, "grad_norm": 0.38340792059898376, "learning_rate": 2.0307925290257446e-05, "loss": 0.0705, "step": 3200 }, { "epoch": 1.6203937405350834, "grad_norm": 0.28962600231170654, "learning_rate": 2.02776375567895e-05, "loss": 0.0652, "step": 3210 }, { "epoch": 1.6254416961130742, "grad_norm": 0.4337672293186188, "learning_rate": 2.0247349823321557e-05, "loss": 0.0767, "step": 3220 }, { "epoch": 1.630489651691065, "grad_norm": 0.2966071367263794, "learning_rate": 2.021706208985361e-05, "loss": 0.0761, "step": 3230 }, { "epoch": 1.6355376072690562, "grad_norm": 0.3643532693386078, "learning_rate": 2.0186774356385667e-05, "loss": 0.074, "step": 3240 }, { "epoch": 1.640585562847047, "grad_norm": 0.4204406142234802, "learning_rate": 2.015648662291772e-05, "loss": 0.0649, "step": 3250 }, { "epoch": 1.645633518425038, "grad_norm": 0.3872784376144409, "learning_rate": 2.012619888944977e-05, "loss": 0.072, "step": 3260 }, { "epoch": 1.650681474003029, "grad_norm": 0.5608325600624084, "learning_rate": 2.0095911155981827e-05, "loss": 0.073, "step": 3270 }, { "epoch": 1.6557294295810197, "grad_norm": 0.40342044830322266, "learning_rate": 2.006562342251388e-05, "loss": 0.067, "step": 3280 }, { "epoch": 1.6607773851590106, "grad_norm": 0.4224311411380768, "learning_rate": 2.0035335689045938e-05, "loss": 0.0704, "step": 3290 }, { "epoch": 1.6658253407370016, "grad_norm": 0.4088759422302246, "learning_rate": 2.000504795557799e-05, "loss": 0.0734, "step": 3300 }, { "epoch": 1.6708732963149924, "grad_norm": 0.5260732769966125, "learning_rate": 1.9974760222110044e-05, "loss": 0.0654, "step": 3310 }, { "epoch": 1.6759212518929834, "grad_norm": 0.2915021777153015, "learning_rate": 1.99444724886421e-05, "loss": 0.0755, "step": 3320 }, { "epoch": 1.6809692074709743, "grad_norm": 0.43440404534339905, "learning_rate": 1.9914184755174155e-05, "loss": 0.0757, "step": 3330 }, { "epoch": 1.686017163048965, "grad_norm": 0.4600958526134491, "learning_rate": 1.988389702170621e-05, "loss": 0.0655, "step": 3340 }, { "epoch": 1.691065118626956, "grad_norm": 0.5585376620292664, "learning_rate": 1.9853609288238265e-05, "loss": 0.0693, "step": 3350 }, { "epoch": 1.696113074204947, "grad_norm": 0.5592395663261414, "learning_rate": 1.982332155477032e-05, "loss": 0.0767, "step": 3360 }, { "epoch": 1.7011610297829378, "grad_norm": 0.4244596064090729, "learning_rate": 1.9793033821302375e-05, "loss": 0.0669, "step": 3370 }, { "epoch": 1.7062089853609288, "grad_norm": 0.31476616859436035, "learning_rate": 1.976274608783443e-05, "loss": 0.0688, "step": 3380 }, { "epoch": 1.7112569409389198, "grad_norm": 0.4726528525352478, "learning_rate": 1.973245835436648e-05, "loss": 0.0668, "step": 3390 }, { "epoch": 1.7163048965169105, "grad_norm": 0.4156901240348816, "learning_rate": 1.9702170620898536e-05, "loss": 0.0728, "step": 3400 }, { "epoch": 1.7213528520949015, "grad_norm": 0.4030071794986725, "learning_rate": 1.967188288743059e-05, "loss": 0.0684, "step": 3410 }, { "epoch": 1.7264008076728925, "grad_norm": 0.6460726857185364, "learning_rate": 1.9641595153962646e-05, "loss": 0.0776, "step": 3420 }, { "epoch": 1.7314487632508833, "grad_norm": 0.3956555426120758, "learning_rate": 1.96113074204947e-05, "loss": 0.0726, "step": 3430 }, { "epoch": 1.7364967188288745, "grad_norm": 0.39375460147857666, "learning_rate": 1.9581019687026753e-05, "loss": 0.0686, "step": 3440 }, { "epoch": 1.7415446744068652, "grad_norm": 0.46966952085494995, "learning_rate": 1.955073195355881e-05, "loss": 0.0735, "step": 3450 }, { "epoch": 1.746592629984856, "grad_norm": 0.39831027388572693, "learning_rate": 1.9520444220090863e-05, "loss": 0.0605, "step": 3460 }, { "epoch": 1.7516405855628472, "grad_norm": 0.5071054697036743, "learning_rate": 1.949015648662292e-05, "loss": 0.0666, "step": 3470 }, { "epoch": 1.756688541140838, "grad_norm": 0.3473348617553711, "learning_rate": 1.9459868753154973e-05, "loss": 0.068, "step": 3480 }, { "epoch": 1.761736496718829, "grad_norm": 0.47857144474983215, "learning_rate": 1.9429581019687027e-05, "loss": 0.0747, "step": 3490 }, { "epoch": 1.76678445229682, "grad_norm": 0.4897679090499878, "learning_rate": 1.9399293286219084e-05, "loss": 0.0634, "step": 3500 }, { "epoch": 1.7718324078748107, "grad_norm": 0.546002209186554, "learning_rate": 1.9369005552751137e-05, "loss": 0.0669, "step": 3510 }, { "epoch": 1.7768803634528016, "grad_norm": 0.3376496732234955, "learning_rate": 1.933871781928319e-05, "loss": 0.064, "step": 3520 }, { "epoch": 1.7819283190307926, "grad_norm": 0.5528877377510071, "learning_rate": 1.9308430085815244e-05, "loss": 0.07, "step": 3530 }, { "epoch": 1.7869762746087834, "grad_norm": 0.5076607465744019, "learning_rate": 1.9278142352347298e-05, "loss": 0.0603, "step": 3540 }, { "epoch": 1.7920242301867744, "grad_norm": 0.41241809725761414, "learning_rate": 1.9247854618879354e-05, "loss": 0.0745, "step": 3550 }, { "epoch": 1.7970721857647654, "grad_norm": 0.38720259070396423, "learning_rate": 1.9217566885411408e-05, "loss": 0.0604, "step": 3560 }, { "epoch": 1.802120141342756, "grad_norm": 0.5174373388290405, "learning_rate": 1.9187279151943465e-05, "loss": 0.0716, "step": 3570 }, { "epoch": 1.807168096920747, "grad_norm": 0.41615715622901917, "learning_rate": 1.9156991418475518e-05, "loss": 0.0623, "step": 3580 }, { "epoch": 1.812216052498738, "grad_norm": 0.31420108675956726, "learning_rate": 1.912670368500757e-05, "loss": 0.072, "step": 3590 }, { "epoch": 1.8172640080767288, "grad_norm": 0.44855940341949463, "learning_rate": 1.909641595153963e-05, "loss": 0.0711, "step": 3600 }, { "epoch": 1.8223119636547198, "grad_norm": 0.4198800325393677, "learning_rate": 1.9066128218071682e-05, "loss": 0.0669, "step": 3610 }, { "epoch": 1.8273599192327108, "grad_norm": 0.3376767933368683, "learning_rate": 1.903584048460374e-05, "loss": 0.0729, "step": 3620 }, { "epoch": 1.8324078748107016, "grad_norm": 0.3325952887535095, "learning_rate": 1.9005552751135792e-05, "loss": 0.0742, "step": 3630 }, { "epoch": 1.8374558303886925, "grad_norm": 0.4255514442920685, "learning_rate": 1.8975265017667846e-05, "loss": 0.0767, "step": 3640 }, { "epoch": 1.8425037859666835, "grad_norm": 0.49874627590179443, "learning_rate": 1.89449772841999e-05, "loss": 0.0705, "step": 3650 }, { "epoch": 1.8475517415446743, "grad_norm": 0.44393444061279297, "learning_rate": 1.8914689550731952e-05, "loss": 0.0683, "step": 3660 }, { "epoch": 1.8525996971226655, "grad_norm": 0.33301976323127747, "learning_rate": 1.888440181726401e-05, "loss": 0.0702, "step": 3670 }, { "epoch": 1.8576476527006562, "grad_norm": 0.3944764733314514, "learning_rate": 1.8854114083796063e-05, "loss": 0.0678, "step": 3680 }, { "epoch": 1.862695608278647, "grad_norm": 0.6915509700775146, "learning_rate": 1.8823826350328116e-05, "loss": 0.0669, "step": 3690 }, { "epoch": 1.8677435638566382, "grad_norm": 0.42169591784477234, "learning_rate": 1.8793538616860173e-05, "loss": 0.0651, "step": 3700 }, { "epoch": 1.872791519434629, "grad_norm": 0.34414538741111755, "learning_rate": 1.8763250883392226e-05, "loss": 0.0677, "step": 3710 }, { "epoch": 1.87783947501262, "grad_norm": 0.5671964883804321, "learning_rate": 1.873296314992428e-05, "loss": 0.0677, "step": 3720 }, { "epoch": 1.882887430590611, "grad_norm": 0.3801959455013275, "learning_rate": 1.8702675416456337e-05, "loss": 0.0635, "step": 3730 }, { "epoch": 1.8879353861686017, "grad_norm": 0.35626405477523804, "learning_rate": 1.867238768298839e-05, "loss": 0.0631, "step": 3740 }, { "epoch": 1.8929833417465927, "grad_norm": 0.33528760075569153, "learning_rate": 1.8642099949520447e-05, "loss": 0.0663, "step": 3750 }, { "epoch": 1.8980312973245836, "grad_norm": 0.36265772581100464, "learning_rate": 1.86118122160525e-05, "loss": 0.0547, "step": 3760 }, { "epoch": 1.9030792529025744, "grad_norm": 0.5244046449661255, "learning_rate": 1.8581524482584554e-05, "loss": 0.0795, "step": 3770 }, { "epoch": 1.9081272084805654, "grad_norm": 0.4042259156703949, "learning_rate": 1.8551236749116607e-05, "loss": 0.0689, "step": 3780 }, { "epoch": 1.9131751640585564, "grad_norm": 0.48803991079330444, "learning_rate": 1.852094901564866e-05, "loss": 0.0795, "step": 3790 }, { "epoch": 1.9182231196365471, "grad_norm": 0.37799277901649475, "learning_rate": 1.8490661282180718e-05, "loss": 0.0767, "step": 3800 }, { "epoch": 1.923271075214538, "grad_norm": 0.3289439082145691, "learning_rate": 1.846037354871277e-05, "loss": 0.0618, "step": 3810 }, { "epoch": 1.928319030792529, "grad_norm": 0.3983497619628906, "learning_rate": 1.8430085815244825e-05, "loss": 0.0673, "step": 3820 }, { "epoch": 1.9333669863705198, "grad_norm": 0.5559443831443787, "learning_rate": 1.839979808177688e-05, "loss": 0.0705, "step": 3830 }, { "epoch": 1.9384149419485108, "grad_norm": 0.4850088059902191, "learning_rate": 1.8369510348308935e-05, "loss": 0.0766, "step": 3840 }, { "epoch": 1.9434628975265018, "grad_norm": 0.3563697338104248, "learning_rate": 1.833922261484099e-05, "loss": 0.0699, "step": 3850 }, { "epoch": 1.9485108531044926, "grad_norm": 0.3636411428451538, "learning_rate": 1.8308934881373045e-05, "loss": 0.0701, "step": 3860 }, { "epoch": 1.9535588086824835, "grad_norm": 0.47166189551353455, "learning_rate": 1.82786471479051e-05, "loss": 0.0743, "step": 3870 }, { "epoch": 1.9586067642604745, "grad_norm": 0.4918811619281769, "learning_rate": 1.8248359414437155e-05, "loss": 0.0749, "step": 3880 }, { "epoch": 1.9636547198384653, "grad_norm": 0.4318666458129883, "learning_rate": 1.821807168096921e-05, "loss": 0.0668, "step": 3890 }, { "epoch": 1.9687026754164565, "grad_norm": 0.4516458809375763, "learning_rate": 1.8187783947501262e-05, "loss": 0.0666, "step": 3900 }, { "epoch": 1.9737506309944473, "grad_norm": 0.31433552503585815, "learning_rate": 1.8157496214033316e-05, "loss": 0.0694, "step": 3910 }, { "epoch": 1.978798586572438, "grad_norm": 0.3614991009235382, "learning_rate": 1.812720848056537e-05, "loss": 0.0721, "step": 3920 }, { "epoch": 1.9838465421504292, "grad_norm": 0.3617209792137146, "learning_rate": 1.8096920747097426e-05, "loss": 0.0719, "step": 3930 }, { "epoch": 1.98889449772842, "grad_norm": 0.6318678855895996, "learning_rate": 1.806663301362948e-05, "loss": 0.0722, "step": 3940 }, { "epoch": 1.993942453306411, "grad_norm": 0.4043138027191162, "learning_rate": 1.8036345280161536e-05, "loss": 0.0654, "step": 3950 }, { "epoch": 1.998990408884402, "grad_norm": 0.5654613375663757, "learning_rate": 1.800605754669359e-05, "loss": 0.0747, "step": 3960 }, { "epoch": 2.0, "eval_f1": 0.9705180789481339, "eval_loss": 0.04101279005408287, "eval_runtime": 739.7396, "eval_samples_per_second": 278.831, "eval_steps_per_second": 2.179, "step": 3962 }, { "epoch": 2.0040383644623927, "grad_norm": 0.44467952847480774, "learning_rate": 1.7975769813225643e-05, "loss": 0.0641, "step": 3970 }, { "epoch": 2.0090863200403835, "grad_norm": 0.5186964869499207, "learning_rate": 1.79454820797577e-05, "loss": 0.0603, "step": 3980 }, { "epoch": 2.0141342756183747, "grad_norm": 0.37641072273254395, "learning_rate": 1.7915194346289753e-05, "loss": 0.0705, "step": 3990 }, { "epoch": 2.0191822311963654, "grad_norm": 0.3755345642566681, "learning_rate": 1.7884906612821807e-05, "loss": 0.0611, "step": 4000 }, { "epoch": 2.024230186774356, "grad_norm": 0.5084393620491028, "learning_rate": 1.7854618879353864e-05, "loss": 0.0673, "step": 4010 }, { "epoch": 2.0292781423523474, "grad_norm": 0.3902832865715027, "learning_rate": 1.7824331145885917e-05, "loss": 0.0709, "step": 4020 }, { "epoch": 2.034326097930338, "grad_norm": 0.3876974582672119, "learning_rate": 1.779404341241797e-05, "loss": 0.073, "step": 4030 }, { "epoch": 2.0393740535083293, "grad_norm": 0.3503962755203247, "learning_rate": 1.7763755678950024e-05, "loss": 0.0686, "step": 4040 }, { "epoch": 2.04442200908632, "grad_norm": 0.4520651698112488, "learning_rate": 1.7733467945482078e-05, "loss": 0.0756, "step": 4050 }, { "epoch": 2.049469964664311, "grad_norm": 0.4055824875831604, "learning_rate": 1.7703180212014134e-05, "loss": 0.0663, "step": 4060 }, { "epoch": 2.054517920242302, "grad_norm": 0.4180123805999756, "learning_rate": 1.7672892478546188e-05, "loss": 0.0624, "step": 4070 }, { "epoch": 2.059565875820293, "grad_norm": 0.4090680181980133, "learning_rate": 1.7642604745078245e-05, "loss": 0.0641, "step": 4080 }, { "epoch": 2.0646138313982836, "grad_norm": 0.47140470147132874, "learning_rate": 1.7612317011610298e-05, "loss": 0.0707, "step": 4090 }, { "epoch": 2.069661786976275, "grad_norm": 0.39671817421913147, "learning_rate": 1.758202927814235e-05, "loss": 0.0706, "step": 4100 }, { "epoch": 2.0747097425542655, "grad_norm": 0.2987823486328125, "learning_rate": 1.755174154467441e-05, "loss": 0.071, "step": 4110 }, { "epoch": 2.0797576981322563, "grad_norm": 0.325086385011673, "learning_rate": 1.7521453811206462e-05, "loss": 0.0643, "step": 4120 }, { "epoch": 2.0848056537102475, "grad_norm": 0.4358964264392853, "learning_rate": 1.749116607773852e-05, "loss": 0.0714, "step": 4130 }, { "epoch": 2.0898536092882383, "grad_norm": 0.28630056977272034, "learning_rate": 1.7460878344270572e-05, "loss": 0.0756, "step": 4140 }, { "epoch": 2.094901564866229, "grad_norm": 0.3596285581588745, "learning_rate": 1.7430590610802626e-05, "loss": 0.0755, "step": 4150 }, { "epoch": 2.0999495204442202, "grad_norm": 0.5699533224105835, "learning_rate": 1.740030287733468e-05, "loss": 0.069, "step": 4160 }, { "epoch": 2.104997476022211, "grad_norm": 0.584018886089325, "learning_rate": 1.7370015143866733e-05, "loss": 0.0693, "step": 4170 }, { "epoch": 2.1100454316002017, "grad_norm": 0.4305306375026703, "learning_rate": 1.733972741039879e-05, "loss": 0.0671, "step": 4180 }, { "epoch": 2.115093387178193, "grad_norm": 0.49273547530174255, "learning_rate": 1.7309439676930843e-05, "loss": 0.0747, "step": 4190 }, { "epoch": 2.1201413427561837, "grad_norm": 0.32480379939079285, "learning_rate": 1.7279151943462896e-05, "loss": 0.0677, "step": 4200 }, { "epoch": 2.1251892983341745, "grad_norm": 0.45263049006462097, "learning_rate": 1.7248864209994953e-05, "loss": 0.0681, "step": 4210 }, { "epoch": 2.1302372539121657, "grad_norm": 0.528186559677124, "learning_rate": 1.7218576476527007e-05, "loss": 0.0622, "step": 4220 }, { "epoch": 2.1352852094901564, "grad_norm": 0.35689589381217957, "learning_rate": 1.7188288743059063e-05, "loss": 0.0615, "step": 4230 }, { "epoch": 2.1403331650681476, "grad_norm": 0.37193727493286133, "learning_rate": 1.7158001009591117e-05, "loss": 0.0587, "step": 4240 }, { "epoch": 2.1453811206461384, "grad_norm": 0.4723437428474426, "learning_rate": 1.712771327612317e-05, "loss": 0.0687, "step": 4250 }, { "epoch": 2.150429076224129, "grad_norm": 0.5692839622497559, "learning_rate": 1.7097425542655227e-05, "loss": 0.0739, "step": 4260 }, { "epoch": 2.1554770318021204, "grad_norm": 0.35744959115982056, "learning_rate": 1.706713780918728e-05, "loss": 0.0699, "step": 4270 }, { "epoch": 2.160524987380111, "grad_norm": 0.4813705384731293, "learning_rate": 1.7036850075719337e-05, "loss": 0.0667, "step": 4280 }, { "epoch": 2.165572942958102, "grad_norm": 0.37739092111587524, "learning_rate": 1.7006562342251387e-05, "loss": 0.06, "step": 4290 }, { "epoch": 2.170620898536093, "grad_norm": 0.28146716952323914, "learning_rate": 1.697627460878344e-05, "loss": 0.0595, "step": 4300 }, { "epoch": 2.175668854114084, "grad_norm": 0.49551817774772644, "learning_rate": 1.6945986875315498e-05, "loss": 0.07, "step": 4310 }, { "epoch": 2.1807168096920746, "grad_norm": 0.4708079993724823, "learning_rate": 1.691569914184755e-05, "loss": 0.07, "step": 4320 }, { "epoch": 2.185764765270066, "grad_norm": 0.4068326950073242, "learning_rate": 1.6885411408379605e-05, "loss": 0.0713, "step": 4330 }, { "epoch": 2.1908127208480566, "grad_norm": 0.31832921504974365, "learning_rate": 1.685512367491166e-05, "loss": 0.0716, "step": 4340 }, { "epoch": 2.1958606764260473, "grad_norm": 0.5986499786376953, "learning_rate": 1.6824835941443715e-05, "loss": 0.0606, "step": 4350 }, { "epoch": 2.2009086320040385, "grad_norm": 0.4528530240058899, "learning_rate": 1.6794548207975772e-05, "loss": 0.0611, "step": 4360 }, { "epoch": 2.2059565875820293, "grad_norm": 0.30561351776123047, "learning_rate": 1.6764260474507825e-05, "loss": 0.0657, "step": 4370 }, { "epoch": 2.21100454316002, "grad_norm": 0.2965313792228699, "learning_rate": 1.673397274103988e-05, "loss": 0.0714, "step": 4380 }, { "epoch": 2.2160524987380112, "grad_norm": 0.5420868992805481, "learning_rate": 1.6703685007571935e-05, "loss": 0.0648, "step": 4390 }, { "epoch": 2.221100454316002, "grad_norm": 0.5012361407279968, "learning_rate": 1.667339727410399e-05, "loss": 0.0709, "step": 4400 }, { "epoch": 2.2261484098939928, "grad_norm": 0.29113131761550903, "learning_rate": 1.6643109540636042e-05, "loss": 0.0591, "step": 4410 }, { "epoch": 2.231196365471984, "grad_norm": 0.37094447016716003, "learning_rate": 1.6612821807168096e-05, "loss": 0.0708, "step": 4420 }, { "epoch": 2.2362443210499747, "grad_norm": 0.3133958876132965, "learning_rate": 1.658253407370015e-05, "loss": 0.0797, "step": 4430 }, { "epoch": 2.2412922766279655, "grad_norm": 0.41249901056289673, "learning_rate": 1.6552246340232206e-05, "loss": 0.0615, "step": 4440 }, { "epoch": 2.2463402322059567, "grad_norm": 0.5728883743286133, "learning_rate": 1.652195860676426e-05, "loss": 0.0742, "step": 4450 }, { "epoch": 2.2513881877839474, "grad_norm": 0.29768499732017517, "learning_rate": 1.6491670873296316e-05, "loss": 0.0675, "step": 4460 }, { "epoch": 2.256436143361938, "grad_norm": 0.4960065484046936, "learning_rate": 1.646138313982837e-05, "loss": 0.0571, "step": 4470 }, { "epoch": 2.2614840989399294, "grad_norm": 0.3015303313732147, "learning_rate": 1.6431095406360423e-05, "loss": 0.0616, "step": 4480 }, { "epoch": 2.26653205451792, "grad_norm": 0.4689812958240509, "learning_rate": 1.640080767289248e-05, "loss": 0.0722, "step": 4490 }, { "epoch": 2.271580010095911, "grad_norm": 0.42969536781311035, "learning_rate": 1.6370519939424534e-05, "loss": 0.07, "step": 4500 }, { "epoch": 2.276627965673902, "grad_norm": 0.5754514336585999, "learning_rate": 1.634023220595659e-05, "loss": 0.0668, "step": 4510 }, { "epoch": 2.281675921251893, "grad_norm": 0.4731753170490265, "learning_rate": 1.6309944472488644e-05, "loss": 0.0599, "step": 4520 }, { "epoch": 2.2867238768298837, "grad_norm": 0.4508737027645111, "learning_rate": 1.6279656739020697e-05, "loss": 0.0632, "step": 4530 }, { "epoch": 2.291771832407875, "grad_norm": 0.3547825813293457, "learning_rate": 1.624936900555275e-05, "loss": 0.066, "step": 4540 }, { "epoch": 2.2968197879858656, "grad_norm": 0.6295393109321594, "learning_rate": 1.6219081272084804e-05, "loss": 0.0595, "step": 4550 }, { "epoch": 2.301867743563857, "grad_norm": 0.44660690426826477, "learning_rate": 1.618879353861686e-05, "loss": 0.0651, "step": 4560 }, { "epoch": 2.3069156991418476, "grad_norm": 0.4014616310596466, "learning_rate": 1.6158505805148914e-05, "loss": 0.0652, "step": 4570 }, { "epoch": 2.3119636547198383, "grad_norm": 0.45291078090667725, "learning_rate": 1.6128218071680968e-05, "loss": 0.0556, "step": 4580 }, { "epoch": 2.3170116102978295, "grad_norm": 0.41079530119895935, "learning_rate": 1.6097930338213025e-05, "loss": 0.0683, "step": 4590 }, { "epoch": 2.3220595658758203, "grad_norm": 0.48419833183288574, "learning_rate": 1.6067642604745078e-05, "loss": 0.0732, "step": 4600 }, { "epoch": 2.327107521453811, "grad_norm": 0.46579691767692566, "learning_rate": 1.6037354871277135e-05, "loss": 0.0717, "step": 4610 }, { "epoch": 2.3321554770318023, "grad_norm": 0.44318434596061707, "learning_rate": 1.600706713780919e-05, "loss": 0.07, "step": 4620 }, { "epoch": 2.337203432609793, "grad_norm": 0.3508608937263489, "learning_rate": 1.5976779404341242e-05, "loss": 0.0608, "step": 4630 }, { "epoch": 2.342251388187784, "grad_norm": 0.3729645311832428, "learning_rate": 1.59464916708733e-05, "loss": 0.0764, "step": 4640 }, { "epoch": 2.347299343765775, "grad_norm": 0.3809719681739807, "learning_rate": 1.5916203937405352e-05, "loss": 0.0795, "step": 4650 }, { "epoch": 2.3523472993437657, "grad_norm": 0.42823466658592224, "learning_rate": 1.5885916203937406e-05, "loss": 0.0629, "step": 4660 }, { "epoch": 2.3573952549217565, "grad_norm": 0.45598268508911133, "learning_rate": 1.585562847046946e-05, "loss": 0.0747, "step": 4670 }, { "epoch": 2.3624432104997477, "grad_norm": 0.3352445662021637, "learning_rate": 1.5825340737001513e-05, "loss": 0.0724, "step": 4680 }, { "epoch": 2.3674911660777385, "grad_norm": 0.44051024317741394, "learning_rate": 1.579505300353357e-05, "loss": 0.0668, "step": 4690 }, { "epoch": 2.3725391216557297, "grad_norm": 0.3988921344280243, "learning_rate": 1.5764765270065623e-05, "loss": 0.0616, "step": 4700 }, { "epoch": 2.3775870772337204, "grad_norm": 0.42814871668815613, "learning_rate": 1.5734477536597676e-05, "loss": 0.0574, "step": 4710 }, { "epoch": 2.382635032811711, "grad_norm": 0.43511560559272766, "learning_rate": 1.5704189803129733e-05, "loss": 0.0647, "step": 4720 }, { "epoch": 2.3876829883897024, "grad_norm": 0.4476068317890167, "learning_rate": 1.5673902069661787e-05, "loss": 0.0739, "step": 4730 }, { "epoch": 2.392730943967693, "grad_norm": 0.289435476064682, "learning_rate": 1.5643614336193843e-05, "loss": 0.0572, "step": 4740 }, { "epoch": 2.397778899545684, "grad_norm": 0.3826657235622406, "learning_rate": 1.5613326602725897e-05, "loss": 0.0694, "step": 4750 }, { "epoch": 2.402826855123675, "grad_norm": 0.4220544397830963, "learning_rate": 1.558303886925795e-05, "loss": 0.0638, "step": 4760 }, { "epoch": 2.407874810701666, "grad_norm": 0.3492651581764221, "learning_rate": 1.5552751135790007e-05, "loss": 0.0679, "step": 4770 }, { "epoch": 2.4129227662796566, "grad_norm": 0.5811386704444885, "learning_rate": 1.552246340232206e-05, "loss": 0.0787, "step": 4780 }, { "epoch": 2.417970721857648, "grad_norm": 0.2967993915081024, "learning_rate": 1.5492175668854117e-05, "loss": 0.0711, "step": 4790 }, { "epoch": 2.4230186774356386, "grad_norm": 0.33070528507232666, "learning_rate": 1.5461887935386168e-05, "loss": 0.0594, "step": 4800 }, { "epoch": 2.4280666330136293, "grad_norm": 0.42890259623527527, "learning_rate": 1.543160020191822e-05, "loss": 0.0627, "step": 4810 }, { "epoch": 2.4331145885916206, "grad_norm": 0.39528656005859375, "learning_rate": 1.5401312468450278e-05, "loss": 0.0621, "step": 4820 }, { "epoch": 2.4381625441696113, "grad_norm": 0.3624139130115509, "learning_rate": 1.537102473498233e-05, "loss": 0.0652, "step": 4830 }, { "epoch": 2.443210499747602, "grad_norm": 0.4606042206287384, "learning_rate": 1.5340737001514388e-05, "loss": 0.0636, "step": 4840 }, { "epoch": 2.4482584553255933, "grad_norm": 0.5022151470184326, "learning_rate": 1.531044926804644e-05, "loss": 0.0643, "step": 4850 }, { "epoch": 2.453306410903584, "grad_norm": 0.41519105434417725, "learning_rate": 1.5280161534578495e-05, "loss": 0.0601, "step": 4860 }, { "epoch": 2.458354366481575, "grad_norm": 0.3175613284111023, "learning_rate": 1.524987380111055e-05, "loss": 0.0681, "step": 4870 }, { "epoch": 2.463402322059566, "grad_norm": 0.4150908887386322, "learning_rate": 1.5219586067642605e-05, "loss": 0.0577, "step": 4880 }, { "epoch": 2.4684502776375568, "grad_norm": 0.27239662408828735, "learning_rate": 1.518929833417466e-05, "loss": 0.0611, "step": 4890 }, { "epoch": 2.4734982332155475, "grad_norm": 0.6037119030952454, "learning_rate": 1.5159010600706716e-05, "loss": 0.0618, "step": 4900 }, { "epoch": 2.4785461887935387, "grad_norm": 0.28230902552604675, "learning_rate": 1.512872286723877e-05, "loss": 0.0661, "step": 4910 }, { "epoch": 2.4835941443715295, "grad_norm": 0.42984738945961, "learning_rate": 1.5098435133770824e-05, "loss": 0.0621, "step": 4920 }, { "epoch": 2.4886420999495202, "grad_norm": 0.5079028010368347, "learning_rate": 1.5068147400302876e-05, "loss": 0.0669, "step": 4930 }, { "epoch": 2.4936900555275114, "grad_norm": 0.3618210554122925, "learning_rate": 1.5037859666834931e-05, "loss": 0.0664, "step": 4940 }, { "epoch": 2.498738011105502, "grad_norm": 0.278143972158432, "learning_rate": 1.5007571933366986e-05, "loss": 0.0669, "step": 4950 }, { "epoch": 2.503785966683493, "grad_norm": 0.40146970748901367, "learning_rate": 1.4977284199899041e-05, "loss": 0.0644, "step": 4960 }, { "epoch": 2.508833922261484, "grad_norm": 0.40683749318122864, "learning_rate": 1.4946996466431095e-05, "loss": 0.068, "step": 4970 }, { "epoch": 2.513881877839475, "grad_norm": 0.4395790994167328, "learning_rate": 1.491670873296315e-05, "loss": 0.0692, "step": 4980 }, { "epoch": 2.5189298334174657, "grad_norm": 0.2304450422525406, "learning_rate": 1.4886420999495205e-05, "loss": 0.0627, "step": 4990 }, { "epoch": 2.523977788995457, "grad_norm": 0.3872699737548828, "learning_rate": 1.485613326602726e-05, "loss": 0.0631, "step": 5000 }, { "epoch": 2.5290257445734476, "grad_norm": 0.3607660233974457, "learning_rate": 1.4825845532559315e-05, "loss": 0.0615, "step": 5010 }, { "epoch": 2.5340737001514384, "grad_norm": 0.4235934913158417, "learning_rate": 1.4795557799091367e-05, "loss": 0.0676, "step": 5020 }, { "epoch": 2.5391216557294296, "grad_norm": 0.4623524248600006, "learning_rate": 1.4765270065623422e-05, "loss": 0.062, "step": 5030 }, { "epoch": 2.5441696113074204, "grad_norm": 0.4494507610797882, "learning_rate": 1.4734982332155477e-05, "loss": 0.0638, "step": 5040 }, { "epoch": 2.5492175668854116, "grad_norm": 0.3828752636909485, "learning_rate": 1.4704694598687533e-05, "loss": 0.0637, "step": 5050 }, { "epoch": 2.5542655224634023, "grad_norm": 0.35024183988571167, "learning_rate": 1.4674406865219586e-05, "loss": 0.0604, "step": 5060 }, { "epoch": 2.559313478041393, "grad_norm": 0.3416309356689453, "learning_rate": 1.4644119131751641e-05, "loss": 0.0721, "step": 5070 }, { "epoch": 2.5643614336193843, "grad_norm": 0.4081193804740906, "learning_rate": 1.4613831398283696e-05, "loss": 0.0701, "step": 5080 }, { "epoch": 2.569409389197375, "grad_norm": 0.9290459156036377, "learning_rate": 1.458354366481575e-05, "loss": 0.067, "step": 5090 }, { "epoch": 2.5744573447753663, "grad_norm": 0.3432193398475647, "learning_rate": 1.4553255931347805e-05, "loss": 0.063, "step": 5100 }, { "epoch": 2.579505300353357, "grad_norm": 0.45165976881980896, "learning_rate": 1.4522968197879858e-05, "loss": 0.0725, "step": 5110 }, { "epoch": 2.5845532559313478, "grad_norm": 0.37954217195510864, "learning_rate": 1.4492680464411913e-05, "loss": 0.0673, "step": 5120 }, { "epoch": 2.589601211509339, "grad_norm": 0.3993614614009857, "learning_rate": 1.4462392730943969e-05, "loss": 0.0691, "step": 5130 }, { "epoch": 2.5946491670873297, "grad_norm": 0.48905062675476074, "learning_rate": 1.4432104997476024e-05, "loss": 0.0567, "step": 5140 }, { "epoch": 2.5996971226653205, "grad_norm": 0.4493992328643799, "learning_rate": 1.4401817264008077e-05, "loss": 0.0603, "step": 5150 }, { "epoch": 2.6047450782433117, "grad_norm": 0.34471943974494934, "learning_rate": 1.437152953054013e-05, "loss": 0.0607, "step": 5160 }, { "epoch": 2.6097930338213025, "grad_norm": 0.3988341689109802, "learning_rate": 1.4341241797072186e-05, "loss": 0.0614, "step": 5170 }, { "epoch": 2.614840989399293, "grad_norm": 0.4356638193130493, "learning_rate": 1.4310954063604241e-05, "loss": 0.0589, "step": 5180 }, { "epoch": 2.6198889449772844, "grad_norm": 0.3544706106185913, "learning_rate": 1.4280666330136296e-05, "loss": 0.0596, "step": 5190 }, { "epoch": 2.624936900555275, "grad_norm": 0.4624828100204468, "learning_rate": 1.425037859666835e-05, "loss": 0.0712, "step": 5200 }, { "epoch": 2.629984856133266, "grad_norm": 0.3146689236164093, "learning_rate": 1.4220090863200403e-05, "loss": 0.0636, "step": 5210 }, { "epoch": 2.635032811711257, "grad_norm": 0.5862840414047241, "learning_rate": 1.4189803129732458e-05, "loss": 0.0658, "step": 5220 }, { "epoch": 2.640080767289248, "grad_norm": 0.3758508265018463, "learning_rate": 1.4159515396264513e-05, "loss": 0.063, "step": 5230 }, { "epoch": 2.6451287228672387, "grad_norm": 0.3946121335029602, "learning_rate": 1.4129227662796568e-05, "loss": 0.0772, "step": 5240 }, { "epoch": 2.65017667844523, "grad_norm": 0.4428150951862335, "learning_rate": 1.4098939929328622e-05, "loss": 0.064, "step": 5250 }, { "epoch": 2.6552246340232206, "grad_norm": 0.3693462312221527, "learning_rate": 1.4068652195860677e-05, "loss": 0.0691, "step": 5260 }, { "epoch": 2.6602725896012114, "grad_norm": 0.604390025138855, "learning_rate": 1.4038364462392732e-05, "loss": 0.067, "step": 5270 }, { "epoch": 2.6653205451792026, "grad_norm": 0.32199588418006897, "learning_rate": 1.4008076728924786e-05, "loss": 0.069, "step": 5280 }, { "epoch": 2.6703685007571933, "grad_norm": 0.40118536353111267, "learning_rate": 1.397778899545684e-05, "loss": 0.0598, "step": 5290 }, { "epoch": 2.675416456335184, "grad_norm": 0.4204835295677185, "learning_rate": 1.3947501261988894e-05, "loss": 0.0649, "step": 5300 }, { "epoch": 2.6804644119131753, "grad_norm": 0.45677709579467773, "learning_rate": 1.391721352852095e-05, "loss": 0.0666, "step": 5310 }, { "epoch": 2.685512367491166, "grad_norm": 0.3687781095504761, "learning_rate": 1.3886925795053004e-05, "loss": 0.0716, "step": 5320 }, { "epoch": 2.690560323069157, "grad_norm": 0.5170356631278992, "learning_rate": 1.385663806158506e-05, "loss": 0.0638, "step": 5330 }, { "epoch": 2.695608278647148, "grad_norm": 0.4643763303756714, "learning_rate": 1.3826350328117113e-05, "loss": 0.0696, "step": 5340 }, { "epoch": 2.700656234225139, "grad_norm": 0.3444504141807556, "learning_rate": 1.3796062594649166e-05, "loss": 0.0739, "step": 5350 }, { "epoch": 2.7057041898031295, "grad_norm": 0.4813980758190155, "learning_rate": 1.3765774861181222e-05, "loss": 0.0647, "step": 5360 }, { "epoch": 2.7107521453811207, "grad_norm": 0.3534546494483948, "learning_rate": 1.3735487127713277e-05, "loss": 0.0575, "step": 5370 }, { "epoch": 2.7158001009591115, "grad_norm": 0.41960790753364563, "learning_rate": 1.3705199394245332e-05, "loss": 0.0709, "step": 5380 }, { "epoch": 2.7208480565371023, "grad_norm": 0.38305145502090454, "learning_rate": 1.3674911660777385e-05, "loss": 0.0606, "step": 5390 }, { "epoch": 2.7258960121150935, "grad_norm": 0.5087040662765503, "learning_rate": 1.3644623927309439e-05, "loss": 0.0607, "step": 5400 }, { "epoch": 2.7309439676930842, "grad_norm": 0.37414073944091797, "learning_rate": 1.3614336193841494e-05, "loss": 0.0682, "step": 5410 }, { "epoch": 2.735991923271075, "grad_norm": 0.39554670453071594, "learning_rate": 1.3584048460373549e-05, "loss": 0.079, "step": 5420 }, { "epoch": 2.741039878849066, "grad_norm": 0.357322633266449, "learning_rate": 1.3553760726905604e-05, "loss": 0.0529, "step": 5430 }, { "epoch": 2.746087834427057, "grad_norm": 0.3612682819366455, "learning_rate": 1.3523472993437658e-05, "loss": 0.0678, "step": 5440 }, { "epoch": 2.7511357900050477, "grad_norm": 0.49319979548454285, "learning_rate": 1.3493185259969713e-05, "loss": 0.0654, "step": 5450 }, { "epoch": 2.756183745583039, "grad_norm": 0.3630322515964508, "learning_rate": 1.3462897526501768e-05, "loss": 0.0614, "step": 5460 }, { "epoch": 2.7612317011610297, "grad_norm": 0.6561079025268555, "learning_rate": 1.3432609793033821e-05, "loss": 0.0609, "step": 5470 }, { "epoch": 2.7662796567390204, "grad_norm": 0.49902087450027466, "learning_rate": 1.3402322059565877e-05, "loss": 0.0604, "step": 5480 }, { "epoch": 2.7713276123170116, "grad_norm": 0.4306737184524536, "learning_rate": 1.337203432609793e-05, "loss": 0.0642, "step": 5490 }, { "epoch": 2.7763755678950024, "grad_norm": 0.2556377351284027, "learning_rate": 1.3341746592629985e-05, "loss": 0.0581, "step": 5500 }, { "epoch": 2.7814235234729936, "grad_norm": 0.37852397561073303, "learning_rate": 1.331145885916204e-05, "loss": 0.0655, "step": 5510 }, { "epoch": 2.7864714790509844, "grad_norm": 0.4397842288017273, "learning_rate": 1.3281171125694095e-05, "loss": 0.0683, "step": 5520 }, { "epoch": 2.791519434628975, "grad_norm": 0.3145972788333893, "learning_rate": 1.3250883392226147e-05, "loss": 0.065, "step": 5530 }, { "epoch": 2.7965673902069663, "grad_norm": 0.4314529597759247, "learning_rate": 1.3220595658758202e-05, "loss": 0.0729, "step": 5540 }, { "epoch": 2.801615345784957, "grad_norm": 0.43847423791885376, "learning_rate": 1.3190307925290257e-05, "loss": 0.0603, "step": 5550 }, { "epoch": 2.8066633013629483, "grad_norm": 0.7666720151901245, "learning_rate": 1.3160020191822313e-05, "loss": 0.0638, "step": 5560 }, { "epoch": 2.811711256940939, "grad_norm": 0.3244626224040985, "learning_rate": 1.3129732458354368e-05, "loss": 0.0612, "step": 5570 }, { "epoch": 2.81675921251893, "grad_norm": 0.4250195324420929, "learning_rate": 1.3099444724886421e-05, "loss": 0.0655, "step": 5580 }, { "epoch": 2.821807168096921, "grad_norm": 0.49263009428977966, "learning_rate": 1.3069156991418476e-05, "loss": 0.0761, "step": 5590 }, { "epoch": 2.8268551236749118, "grad_norm": 0.5163371562957764, "learning_rate": 1.303886925795053e-05, "loss": 0.071, "step": 5600 }, { "epoch": 2.8319030792529025, "grad_norm": 0.4262700378894806, "learning_rate": 1.3008581524482585e-05, "loss": 0.0674, "step": 5610 }, { "epoch": 2.8369510348308937, "grad_norm": 0.3641040027141571, "learning_rate": 1.297829379101464e-05, "loss": 0.061, "step": 5620 }, { "epoch": 2.8419989904088845, "grad_norm": 0.38265225291252136, "learning_rate": 1.2948006057546693e-05, "loss": 0.0621, "step": 5630 }, { "epoch": 2.8470469459868752, "grad_norm": 0.33575159311294556, "learning_rate": 1.2917718324078749e-05, "loss": 0.0701, "step": 5640 }, { "epoch": 2.8520949015648664, "grad_norm": 0.4343346357345581, "learning_rate": 1.2887430590610804e-05, "loss": 0.0528, "step": 5650 }, { "epoch": 2.857142857142857, "grad_norm": 0.3838566839694977, "learning_rate": 1.2857142857142857e-05, "loss": 0.0595, "step": 5660 }, { "epoch": 2.862190812720848, "grad_norm": 0.5549935698509216, "learning_rate": 1.2826855123674912e-05, "loss": 0.0619, "step": 5670 }, { "epoch": 2.867238768298839, "grad_norm": 0.7633477449417114, "learning_rate": 1.2796567390206966e-05, "loss": 0.055, "step": 5680 }, { "epoch": 2.87228672387683, "grad_norm": 0.4539719223976135, "learning_rate": 1.2766279656739021e-05, "loss": 0.0663, "step": 5690 }, { "epoch": 2.8773346794548207, "grad_norm": 0.3440587818622589, "learning_rate": 1.2735991923271076e-05, "loss": 0.0678, "step": 5700 }, { "epoch": 2.882382635032812, "grad_norm": 0.36671680212020874, "learning_rate": 1.2705704189803131e-05, "loss": 0.0641, "step": 5710 }, { "epoch": 2.8874305906108026, "grad_norm": 0.2760653793811798, "learning_rate": 1.2675416456335183e-05, "loss": 0.0602, "step": 5720 }, { "epoch": 2.8924785461887934, "grad_norm": 0.3290037214756012, "learning_rate": 1.2645128722867238e-05, "loss": 0.0584, "step": 5730 }, { "epoch": 2.8975265017667846, "grad_norm": 0.6113518476486206, "learning_rate": 1.2614840989399293e-05, "loss": 0.0583, "step": 5740 }, { "epoch": 2.9025744573447754, "grad_norm": 0.3404606580734253, "learning_rate": 1.2584553255931348e-05, "loss": 0.0663, "step": 5750 }, { "epoch": 2.907622412922766, "grad_norm": 0.3430802822113037, "learning_rate": 1.2554265522463404e-05, "loss": 0.0633, "step": 5760 }, { "epoch": 2.9126703685007573, "grad_norm": 0.3817721903324127, "learning_rate": 1.2523977788995457e-05, "loss": 0.0667, "step": 5770 }, { "epoch": 2.917718324078748, "grad_norm": 0.49006789922714233, "learning_rate": 1.2493690055527512e-05, "loss": 0.0662, "step": 5780 }, { "epoch": 2.922766279656739, "grad_norm": 0.41557011008262634, "learning_rate": 1.2463402322059566e-05, "loss": 0.0588, "step": 5790 }, { "epoch": 2.92781423523473, "grad_norm": 0.28697067499160767, "learning_rate": 1.243311458859162e-05, "loss": 0.0599, "step": 5800 }, { "epoch": 2.932862190812721, "grad_norm": 0.39947792887687683, "learning_rate": 1.2402826855123676e-05, "loss": 0.068, "step": 5810 }, { "epoch": 2.9379101463907116, "grad_norm": 0.31385132670402527, "learning_rate": 1.237253912165573e-05, "loss": 0.0676, "step": 5820 }, { "epoch": 2.9429581019687028, "grad_norm": 0.482799768447876, "learning_rate": 1.2342251388187784e-05, "loss": 0.0536, "step": 5830 }, { "epoch": 2.9480060575466935, "grad_norm": 0.556859016418457, "learning_rate": 1.231196365471984e-05, "loss": 0.0644, "step": 5840 }, { "epoch": 2.9530540131246843, "grad_norm": 0.2774258255958557, "learning_rate": 1.2281675921251893e-05, "loss": 0.0633, "step": 5850 }, { "epoch": 2.9581019687026755, "grad_norm": 0.3613818883895874, "learning_rate": 1.2251388187783947e-05, "loss": 0.0647, "step": 5860 }, { "epoch": 2.9631499242806663, "grad_norm": 0.3277703821659088, "learning_rate": 1.2221100454316002e-05, "loss": 0.0648, "step": 5870 }, { "epoch": 2.968197879858657, "grad_norm": 0.331059068441391, "learning_rate": 1.2190812720848057e-05, "loss": 0.0715, "step": 5880 }, { "epoch": 2.973245835436648, "grad_norm": 0.4360307455062866, "learning_rate": 1.2160524987380112e-05, "loss": 0.0771, "step": 5890 }, { "epoch": 2.978293791014639, "grad_norm": 0.5486271977424622, "learning_rate": 1.2130237253912167e-05, "loss": 0.067, "step": 5900 }, { "epoch": 2.9833417465926297, "grad_norm": 0.316173255443573, "learning_rate": 1.209994952044422e-05, "loss": 0.0658, "step": 5910 }, { "epoch": 2.988389702170621, "grad_norm": 0.38328826427459717, "learning_rate": 1.2069661786976274e-05, "loss": 0.0591, "step": 5920 }, { "epoch": 2.9934376577486117, "grad_norm": 0.32088446617126465, "learning_rate": 1.2039374053508329e-05, "loss": 0.0551, "step": 5930 }, { "epoch": 2.9984856133266025, "grad_norm": 0.2678850591182709, "learning_rate": 1.2009086320040384e-05, "loss": 0.0676, "step": 5940 }, { "epoch": 3.0, "eval_f1": 0.9705180789481339, "eval_loss": 0.04634944349527359, "eval_runtime": 705.0876, "eval_samples_per_second": 292.534, "eval_steps_per_second": 2.286, "step": 5943 }, { "epoch": 3.0035335689045937, "grad_norm": 0.4443244934082031, "learning_rate": 1.197879858657244e-05, "loss": 0.0483, "step": 5950 }, { "epoch": 3.0085815244825844, "grad_norm": 0.4683389663696289, "learning_rate": 1.1948510853104493e-05, "loss": 0.0571, "step": 5960 }, { "epoch": 3.0136294800605756, "grad_norm": 0.4674948751926422, "learning_rate": 1.1918223119636548e-05, "loss": 0.0639, "step": 5970 }, { "epoch": 3.0186774356385664, "grad_norm": 0.5270655155181885, "learning_rate": 1.1887935386168601e-05, "loss": 0.0658, "step": 5980 }, { "epoch": 3.023725391216557, "grad_norm": 0.343179851770401, "learning_rate": 1.1857647652700657e-05, "loss": 0.0795, "step": 5990 }, { "epoch": 3.0287733467945483, "grad_norm": 0.3782111704349518, "learning_rate": 1.182735991923271e-05, "loss": 0.0586, "step": 6000 }, { "epoch": 3.033821302372539, "grad_norm": 0.5034363865852356, "learning_rate": 1.1797072185764765e-05, "loss": 0.0628, "step": 6010 }, { "epoch": 3.03886925795053, "grad_norm": 0.3796568512916565, "learning_rate": 1.176678445229682e-05, "loss": 0.0548, "step": 6020 }, { "epoch": 3.043917213528521, "grad_norm": 0.381005197763443, "learning_rate": 1.1736496718828875e-05, "loss": 0.0535, "step": 6030 }, { "epoch": 3.048965169106512, "grad_norm": 0.5388962626457214, "learning_rate": 1.1706208985360929e-05, "loss": 0.0621, "step": 6040 }, { "epoch": 3.0540131246845026, "grad_norm": 0.4249768555164337, "learning_rate": 1.1675921251892982e-05, "loss": 0.0632, "step": 6050 }, { "epoch": 3.059061080262494, "grad_norm": 0.3373398780822754, "learning_rate": 1.1645633518425038e-05, "loss": 0.0641, "step": 6060 }, { "epoch": 3.0641090358404846, "grad_norm": 0.4852452278137207, "learning_rate": 1.1615345784957093e-05, "loss": 0.0569, "step": 6070 }, { "epoch": 3.0691569914184753, "grad_norm": 0.5530717968940735, "learning_rate": 1.1585058051489148e-05, "loss": 0.0765, "step": 6080 }, { "epoch": 3.0742049469964665, "grad_norm": 0.3554864823818207, "learning_rate": 1.1554770318021203e-05, "loss": 0.0677, "step": 6090 }, { "epoch": 3.0792529025744573, "grad_norm": 0.45646339654922485, "learning_rate": 1.1524482584553256e-05, "loss": 0.0595, "step": 6100 }, { "epoch": 3.0843008581524485, "grad_norm": 0.44173210859298706, "learning_rate": 1.149419485108531e-05, "loss": 0.0561, "step": 6110 }, { "epoch": 3.0893488137304392, "grad_norm": 0.31832337379455566, "learning_rate": 1.1463907117617365e-05, "loss": 0.0592, "step": 6120 }, { "epoch": 3.09439676930843, "grad_norm": 0.4026302993297577, "learning_rate": 1.143361938414942e-05, "loss": 0.0546, "step": 6130 }, { "epoch": 3.099444724886421, "grad_norm": 0.4788239300251007, "learning_rate": 1.1403331650681475e-05, "loss": 0.0641, "step": 6140 }, { "epoch": 3.104492680464412, "grad_norm": 0.4949135184288025, "learning_rate": 1.1373043917213529e-05, "loss": 0.0612, "step": 6150 }, { "epoch": 3.1095406360424027, "grad_norm": 0.5024904012680054, "learning_rate": 1.1342756183745584e-05, "loss": 0.0592, "step": 6160 }, { "epoch": 3.114588591620394, "grad_norm": 0.42776796221733093, "learning_rate": 1.1312468450277637e-05, "loss": 0.0598, "step": 6170 }, { "epoch": 3.1196365471983847, "grad_norm": 0.551500678062439, "learning_rate": 1.1282180716809692e-05, "loss": 0.071, "step": 6180 }, { "epoch": 3.1246845027763754, "grad_norm": 0.3293100893497467, "learning_rate": 1.1251892983341746e-05, "loss": 0.064, "step": 6190 }, { "epoch": 3.1297324583543666, "grad_norm": 0.4054960310459137, "learning_rate": 1.1221605249873801e-05, "loss": 0.0596, "step": 6200 }, { "epoch": 3.1347804139323574, "grad_norm": 0.38681086897850037, "learning_rate": 1.1191317516405856e-05, "loss": 0.0635, "step": 6210 }, { "epoch": 3.139828369510348, "grad_norm": 0.40157806873321533, "learning_rate": 1.1161029782937911e-05, "loss": 0.0716, "step": 6220 }, { "epoch": 3.1448763250883394, "grad_norm": 0.6303773522377014, "learning_rate": 1.1130742049469966e-05, "loss": 0.0629, "step": 6230 }, { "epoch": 3.14992428066633, "grad_norm": 0.43884503841400146, "learning_rate": 1.1100454316002018e-05, "loss": 0.0617, "step": 6240 }, { "epoch": 3.154972236244321, "grad_norm": 0.3345347046852112, "learning_rate": 1.1070166582534073e-05, "loss": 0.0558, "step": 6250 }, { "epoch": 3.160020191822312, "grad_norm": 0.42731714248657227, "learning_rate": 1.1039878849066128e-05, "loss": 0.0666, "step": 6260 }, { "epoch": 3.165068147400303, "grad_norm": 0.5786126255989075, "learning_rate": 1.1009591115598184e-05, "loss": 0.0616, "step": 6270 }, { "epoch": 3.1701161029782936, "grad_norm": 0.3354320228099823, "learning_rate": 1.0979303382130239e-05, "loss": 0.0627, "step": 6280 }, { "epoch": 3.175164058556285, "grad_norm": 0.4890894293785095, "learning_rate": 1.0949015648662292e-05, "loss": 0.0648, "step": 6290 }, { "epoch": 3.1802120141342756, "grad_norm": 0.42990556359291077, "learning_rate": 1.0918727915194346e-05, "loss": 0.0677, "step": 6300 }, { "epoch": 3.1852599697122663, "grad_norm": 0.4617575705051422, "learning_rate": 1.08884401817264e-05, "loss": 0.071, "step": 6310 }, { "epoch": 3.1903079252902575, "grad_norm": 0.33714014291763306, "learning_rate": 1.0858152448258456e-05, "loss": 0.06, "step": 6320 }, { "epoch": 3.1953558808682483, "grad_norm": 0.5312590003013611, "learning_rate": 1.082786471479051e-05, "loss": 0.0618, "step": 6330 }, { "epoch": 3.200403836446239, "grad_norm": 0.40886396169662476, "learning_rate": 1.0797576981322565e-05, "loss": 0.0606, "step": 6340 }, { "epoch": 3.2054517920242303, "grad_norm": 0.5101807117462158, "learning_rate": 1.076728924785462e-05, "loss": 0.0669, "step": 6350 }, { "epoch": 3.210499747602221, "grad_norm": 0.37605538964271545, "learning_rate": 1.0737001514386673e-05, "loss": 0.0682, "step": 6360 }, { "epoch": 3.215547703180212, "grad_norm": 0.3650659918785095, "learning_rate": 1.0706713780918728e-05, "loss": 0.0605, "step": 6370 }, { "epoch": 3.220595658758203, "grad_norm": 0.49304908514022827, "learning_rate": 1.0676426047450782e-05, "loss": 0.0633, "step": 6380 }, { "epoch": 3.2256436143361937, "grad_norm": 0.45995691418647766, "learning_rate": 1.0646138313982837e-05, "loss": 0.0648, "step": 6390 }, { "epoch": 3.230691569914185, "grad_norm": 0.3334788382053375, "learning_rate": 1.0615850580514892e-05, "loss": 0.0579, "step": 6400 }, { "epoch": 3.2357395254921757, "grad_norm": 0.2866950035095215, "learning_rate": 1.0585562847046947e-05, "loss": 0.0569, "step": 6410 }, { "epoch": 3.2407874810701665, "grad_norm": 0.532154381275177, "learning_rate": 1.0555275113579002e-05, "loss": 0.0636, "step": 6420 }, { "epoch": 3.2458354366481577, "grad_norm": 0.4379573464393616, "learning_rate": 1.0524987380111054e-05, "loss": 0.0677, "step": 6430 }, { "epoch": 3.2508833922261484, "grad_norm": 0.4286845326423645, "learning_rate": 1.049469964664311e-05, "loss": 0.0577, "step": 6440 }, { "epoch": 3.255931347804139, "grad_norm": 0.3957328498363495, "learning_rate": 1.0464411913175164e-05, "loss": 0.0566, "step": 6450 }, { "epoch": 3.2609793033821304, "grad_norm": 0.3571922183036804, "learning_rate": 1.043412417970722e-05, "loss": 0.0651, "step": 6460 }, { "epoch": 3.266027258960121, "grad_norm": 0.32685035467147827, "learning_rate": 1.0403836446239273e-05, "loss": 0.0623, "step": 6470 }, { "epoch": 3.271075214538112, "grad_norm": 0.3616839647293091, "learning_rate": 1.0373548712771328e-05, "loss": 0.0655, "step": 6480 }, { "epoch": 3.276123170116103, "grad_norm": 0.40262675285339355, "learning_rate": 1.0343260979303382e-05, "loss": 0.0651, "step": 6490 }, { "epoch": 3.281171125694094, "grad_norm": 0.5313582420349121, "learning_rate": 1.0312973245835437e-05, "loss": 0.0628, "step": 6500 }, { "epoch": 3.2862190812720846, "grad_norm": 0.3204849064350128, "learning_rate": 1.0282685512367492e-05, "loss": 0.0573, "step": 6510 }, { "epoch": 3.291267036850076, "grad_norm": 0.3533790409564972, "learning_rate": 1.0252397778899545e-05, "loss": 0.06, "step": 6520 }, { "epoch": 3.2963149924280666, "grad_norm": 0.36665427684783936, "learning_rate": 1.02221100454316e-05, "loss": 0.0602, "step": 6530 }, { "epoch": 3.301362948006058, "grad_norm": 0.561029314994812, "learning_rate": 1.0191822311963656e-05, "loss": 0.0607, "step": 6540 }, { "epoch": 3.3064109035840485, "grad_norm": 0.43554937839508057, "learning_rate": 1.016153457849571e-05, "loss": 0.0578, "step": 6550 }, { "epoch": 3.3114588591620393, "grad_norm": 0.36535871028900146, "learning_rate": 1.0131246845027764e-05, "loss": 0.0656, "step": 6560 }, { "epoch": 3.3165068147400305, "grad_norm": 0.3634582757949829, "learning_rate": 1.0100959111559818e-05, "loss": 0.0643, "step": 6570 }, { "epoch": 3.3215547703180213, "grad_norm": 0.39043471217155457, "learning_rate": 1.0070671378091873e-05, "loss": 0.0653, "step": 6580 }, { "epoch": 3.326602725896012, "grad_norm": 0.3135824203491211, "learning_rate": 1.0040383644623928e-05, "loss": 0.063, "step": 6590 }, { "epoch": 3.3316506814740032, "grad_norm": 0.47060030698776245, "learning_rate": 1.0010095911155983e-05, "loss": 0.0675, "step": 6600 }, { "epoch": 3.336698637051994, "grad_norm": 0.4136187732219696, "learning_rate": 9.979808177688038e-06, "loss": 0.0523, "step": 6610 }, { "epoch": 3.3417465926299847, "grad_norm": 0.4783284664154053, "learning_rate": 9.94952044422009e-06, "loss": 0.0621, "step": 6620 }, { "epoch": 3.346794548207976, "grad_norm": 0.41832417249679565, "learning_rate": 9.919232710752145e-06, "loss": 0.0644, "step": 6630 }, { "epoch": 3.3518425037859667, "grad_norm": 0.3421390950679779, "learning_rate": 9.8889449772842e-06, "loss": 0.054, "step": 6640 }, { "epoch": 3.3568904593639575, "grad_norm": 0.5004147291183472, "learning_rate": 9.858657243816255e-06, "loss": 0.0614, "step": 6650 }, { "epoch": 3.3619384149419487, "grad_norm": 0.3952717185020447, "learning_rate": 9.828369510348309e-06, "loss": 0.06, "step": 6660 }, { "epoch": 3.3669863705199394, "grad_norm": 0.3463038206100464, "learning_rate": 9.798081776880364e-06, "loss": 0.0671, "step": 6670 }, { "epoch": 3.37203432609793, "grad_norm": 0.3278227746486664, "learning_rate": 9.767794043412417e-06, "loss": 0.0563, "step": 6680 }, { "epoch": 3.3770822816759214, "grad_norm": 0.516260027885437, "learning_rate": 9.737506309944473e-06, "loss": 0.0619, "step": 6690 }, { "epoch": 3.382130237253912, "grad_norm": 0.42303574085235596, "learning_rate": 9.707218576476528e-06, "loss": 0.0616, "step": 6700 }, { "epoch": 3.387178192831903, "grad_norm": 0.3528966009616852, "learning_rate": 9.676930843008581e-06, "loss": 0.0615, "step": 6710 }, { "epoch": 3.392226148409894, "grad_norm": 0.28841379284858704, "learning_rate": 9.646643109540636e-06, "loss": 0.0602, "step": 6720 }, { "epoch": 3.397274103987885, "grad_norm": 0.3085114061832428, "learning_rate": 9.616355376072691e-06, "loss": 0.066, "step": 6730 }, { "epoch": 3.4023220595658756, "grad_norm": 0.3895336985588074, "learning_rate": 9.586067642604747e-06, "loss": 0.0654, "step": 6740 }, { "epoch": 3.407370015143867, "grad_norm": 0.36049631237983704, "learning_rate": 9.5557799091368e-06, "loss": 0.0681, "step": 6750 }, { "epoch": 3.4124179707218576, "grad_norm": 0.37995630502700806, "learning_rate": 9.525492175668853e-06, "loss": 0.0532, "step": 6760 }, { "epoch": 3.4174659262998484, "grad_norm": 0.369208961725235, "learning_rate": 9.495204442200909e-06, "loss": 0.0621, "step": 6770 }, { "epoch": 3.4225138818778396, "grad_norm": 0.4210832118988037, "learning_rate": 9.464916708732964e-06, "loss": 0.0605, "step": 6780 }, { "epoch": 3.4275618374558303, "grad_norm": 0.5589237213134766, "learning_rate": 9.434628975265019e-06, "loss": 0.0602, "step": 6790 }, { "epoch": 3.432609793033821, "grad_norm": 0.44891810417175293, "learning_rate": 9.404341241797072e-06, "loss": 0.0582, "step": 6800 }, { "epoch": 3.4376577486118123, "grad_norm": 0.40576326847076416, "learning_rate": 9.374053508329126e-06, "loss": 0.0595, "step": 6810 }, { "epoch": 3.442705704189803, "grad_norm": 0.4331837594509125, "learning_rate": 9.343765774861181e-06, "loss": 0.0653, "step": 6820 }, { "epoch": 3.447753659767794, "grad_norm": 0.3736474812030792, "learning_rate": 9.313478041393236e-06, "loss": 0.057, "step": 6830 }, { "epoch": 3.452801615345785, "grad_norm": 0.5507432818412781, "learning_rate": 9.283190307925291e-06, "loss": 0.0609, "step": 6840 }, { "epoch": 3.4578495709237758, "grad_norm": 0.25957268476486206, "learning_rate": 9.252902574457345e-06, "loss": 0.0662, "step": 6850 }, { "epoch": 3.462897526501767, "grad_norm": 0.3138289153575897, "learning_rate": 9.2226148409894e-06, "loss": 0.0634, "step": 6860 }, { "epoch": 3.4679454820797577, "grad_norm": 0.47947317361831665, "learning_rate": 9.192327107521453e-06, "loss": 0.0598, "step": 6870 }, { "epoch": 3.4729934376577485, "grad_norm": 0.33063846826553345, "learning_rate": 9.162039374053508e-06, "loss": 0.0671, "step": 6880 }, { "epoch": 3.4780413932357397, "grad_norm": 0.6448005437850952, "learning_rate": 9.131751640585563e-06, "loss": 0.0656, "step": 6890 }, { "epoch": 3.4830893488137304, "grad_norm": 0.3326038420200348, "learning_rate": 9.101463907117617e-06, "loss": 0.0649, "step": 6900 }, { "epoch": 3.488137304391721, "grad_norm": 0.49056732654571533, "learning_rate": 9.071176173649672e-06, "loss": 0.0571, "step": 6910 }, { "epoch": 3.4931852599697124, "grad_norm": 0.7147297859191895, "learning_rate": 9.040888440181727e-06, "loss": 0.0589, "step": 6920 }, { "epoch": 3.498233215547703, "grad_norm": 0.6848371028900146, "learning_rate": 9.010600706713782e-06, "loss": 0.0556, "step": 6930 }, { "epoch": 3.5032811711256944, "grad_norm": 0.7190125584602356, "learning_rate": 8.980312973245836e-06, "loss": 0.0565, "step": 6940 }, { "epoch": 3.508329126703685, "grad_norm": 0.46388497948646545, "learning_rate": 8.95002523977789e-06, "loss": 0.0593, "step": 6950 }, { "epoch": 3.513377082281676, "grad_norm": 0.4307047426700592, "learning_rate": 8.919737506309944e-06, "loss": 0.0568, "step": 6960 }, { "epoch": 3.518425037859667, "grad_norm": 0.4674361050128937, "learning_rate": 8.889449772842e-06, "loss": 0.064, "step": 6970 }, { "epoch": 3.523472993437658, "grad_norm": 0.474177747964859, "learning_rate": 8.859162039374055e-06, "loss": 0.0717, "step": 6980 }, { "epoch": 3.5285209490156486, "grad_norm": 0.4488455653190613, "learning_rate": 8.828874305906108e-06, "loss": 0.0618, "step": 6990 }, { "epoch": 3.53356890459364, "grad_norm": 0.3444407284259796, "learning_rate": 8.798586572438162e-06, "loss": 0.0613, "step": 7000 }, { "epoch": 3.5386168601716306, "grad_norm": 0.3389851152896881, "learning_rate": 8.768298838970217e-06, "loss": 0.0596, "step": 7010 }, { "epoch": 3.5436648157496213, "grad_norm": 0.4028567373752594, "learning_rate": 8.738011105502272e-06, "loss": 0.0584, "step": 7020 }, { "epoch": 3.5487127713276125, "grad_norm": 0.5268592238426208, "learning_rate": 8.707723372034327e-06, "loss": 0.0585, "step": 7030 }, { "epoch": 3.5537607269056033, "grad_norm": 0.39129918813705444, "learning_rate": 8.67743563856638e-06, "loss": 0.055, "step": 7040 }, { "epoch": 3.558808682483594, "grad_norm": 0.5022369623184204, "learning_rate": 8.647147905098436e-06, "loss": 0.0644, "step": 7050 }, { "epoch": 3.5638566380615853, "grad_norm": 0.524813711643219, "learning_rate": 8.61686017163049e-06, "loss": 0.0622, "step": 7060 }, { "epoch": 3.568904593639576, "grad_norm": 0.34827151894569397, "learning_rate": 8.586572438162544e-06, "loss": 0.0608, "step": 7070 }, { "epoch": 3.5739525492175668, "grad_norm": 0.43751201033592224, "learning_rate": 8.5562847046946e-06, "loss": 0.0552, "step": 7080 }, { "epoch": 3.579000504795558, "grad_norm": 0.40720903873443604, "learning_rate": 8.525996971226653e-06, "loss": 0.0648, "step": 7090 }, { "epoch": 3.5840484603735487, "grad_norm": 0.25798696279525757, "learning_rate": 8.495709237758708e-06, "loss": 0.0613, "step": 7100 }, { "epoch": 3.5890964159515395, "grad_norm": 0.3780571520328522, "learning_rate": 8.465421504290763e-06, "loss": 0.0684, "step": 7110 }, { "epoch": 3.5941443715295307, "grad_norm": 0.5011832118034363, "learning_rate": 8.435133770822818e-06, "loss": 0.0637, "step": 7120 }, { "epoch": 3.5991923271075215, "grad_norm": 0.4237106144428253, "learning_rate": 8.40484603735487e-06, "loss": 0.0649, "step": 7130 }, { "epoch": 3.604240282685512, "grad_norm": 0.36436161398887634, "learning_rate": 8.374558303886925e-06, "loss": 0.065, "step": 7140 }, { "epoch": 3.6092882382635034, "grad_norm": 0.3161546289920807, "learning_rate": 8.34427057041898e-06, "loss": 0.0552, "step": 7150 }, { "epoch": 3.614336193841494, "grad_norm": 0.3535007834434509, "learning_rate": 8.313982836951035e-06, "loss": 0.0545, "step": 7160 }, { "epoch": 3.619384149419485, "grad_norm": 0.5373191833496094, "learning_rate": 8.28369510348309e-06, "loss": 0.0633, "step": 7170 }, { "epoch": 3.624432104997476, "grad_norm": 0.4402375817298889, "learning_rate": 8.253407370015144e-06, "loss": 0.0667, "step": 7180 }, { "epoch": 3.629480060575467, "grad_norm": 0.3555195927619934, "learning_rate": 8.223119636547197e-06, "loss": 0.0604, "step": 7190 }, { "epoch": 3.6345280161534577, "grad_norm": 0.3920760452747345, "learning_rate": 8.192831903079253e-06, "loss": 0.0681, "step": 7200 }, { "epoch": 3.639575971731449, "grad_norm": 0.40458381175994873, "learning_rate": 8.162544169611308e-06, "loss": 0.0642, "step": 7210 }, { "epoch": 3.6446239273094396, "grad_norm": 0.431784063577652, "learning_rate": 8.132256436143363e-06, "loss": 0.0636, "step": 7220 }, { "epoch": 3.6496718828874304, "grad_norm": 0.3183976709842682, "learning_rate": 8.101968702675416e-06, "loss": 0.0596, "step": 7230 }, { "epoch": 3.6547198384654216, "grad_norm": 0.6023189425468445, "learning_rate": 8.071680969207471e-06, "loss": 0.0563, "step": 7240 }, { "epoch": 3.6597677940434123, "grad_norm": 0.3629746735095978, "learning_rate": 8.041393235739527e-06, "loss": 0.0584, "step": 7250 }, { "epoch": 3.664815749621403, "grad_norm": 0.41280779242515564, "learning_rate": 8.01110550227158e-06, "loss": 0.0572, "step": 7260 }, { "epoch": 3.6698637051993943, "grad_norm": 0.45728689432144165, "learning_rate": 7.980817768803635e-06, "loss": 0.0533, "step": 7270 }, { "epoch": 3.674911660777385, "grad_norm": 0.37236565351486206, "learning_rate": 7.950530035335689e-06, "loss": 0.0621, "step": 7280 }, { "epoch": 3.679959616355376, "grad_norm": 0.38259199261665344, "learning_rate": 7.920242301867744e-06, "loss": 0.0537, "step": 7290 }, { "epoch": 3.685007571933367, "grad_norm": 0.36142826080322266, "learning_rate": 7.889954568399799e-06, "loss": 0.0596, "step": 7300 }, { "epoch": 3.690055527511358, "grad_norm": 0.4781215190887451, "learning_rate": 7.859666834931854e-06, "loss": 0.0673, "step": 7310 }, { "epoch": 3.6951034830893486, "grad_norm": 0.35468199849128723, "learning_rate": 7.829379101463906e-06, "loss": 0.0594, "step": 7320 }, { "epoch": 3.7001514386673398, "grad_norm": 0.3899647891521454, "learning_rate": 7.799091367995961e-06, "loss": 0.0594, "step": 7330 }, { "epoch": 3.7051993942453305, "grad_norm": 0.36970004439353943, "learning_rate": 7.768803634528016e-06, "loss": 0.0662, "step": 7340 }, { "epoch": 3.7102473498233217, "grad_norm": 0.2706756293773651, "learning_rate": 7.738515901060071e-06, "loss": 0.0667, "step": 7350 }, { "epoch": 3.7152953054013125, "grad_norm": 0.5611262321472168, "learning_rate": 7.708228167592126e-06, "loss": 0.0565, "step": 7360 }, { "epoch": 3.7203432609793032, "grad_norm": 0.4006311595439911, "learning_rate": 7.67794043412418e-06, "loss": 0.0657, "step": 7370 }, { "epoch": 3.7253912165572944, "grad_norm": 0.34327423572540283, "learning_rate": 7.647652700656235e-06, "loss": 0.067, "step": 7380 }, { "epoch": 3.730439172135285, "grad_norm": 0.38259416818618774, "learning_rate": 7.617364967188288e-06, "loss": 0.0649, "step": 7390 }, { "epoch": 3.7354871277132764, "grad_norm": 0.3452150225639343, "learning_rate": 7.587077233720343e-06, "loss": 0.0714, "step": 7400 }, { "epoch": 3.740535083291267, "grad_norm": 0.32733386754989624, "learning_rate": 7.556789500252398e-06, "loss": 0.0639, "step": 7410 }, { "epoch": 3.745583038869258, "grad_norm": 0.4879290759563446, "learning_rate": 7.526501766784453e-06, "loss": 0.061, "step": 7420 }, { "epoch": 3.750630994447249, "grad_norm": 0.5546141266822815, "learning_rate": 7.4962140333165064e-06, "loss": 0.065, "step": 7430 }, { "epoch": 3.75567895002524, "grad_norm": 0.3634326159954071, "learning_rate": 7.465926299848562e-06, "loss": 0.0614, "step": 7440 }, { "epoch": 3.7607269056032306, "grad_norm": 0.35349273681640625, "learning_rate": 7.435638566380616e-06, "loss": 0.0565, "step": 7450 }, { "epoch": 3.765774861181222, "grad_norm": 0.3440592288970947, "learning_rate": 7.405350832912671e-06, "loss": 0.0508, "step": 7460 }, { "epoch": 3.7708228167592126, "grad_norm": 0.46401387453079224, "learning_rate": 7.375063099444725e-06, "loss": 0.0651, "step": 7470 }, { "epoch": 3.7758707723372034, "grad_norm": 0.2772069275379181, "learning_rate": 7.34477536597678e-06, "loss": 0.0586, "step": 7480 }, { "epoch": 3.7809187279151946, "grad_norm": 0.5362659692764282, "learning_rate": 7.314487632508835e-06, "loss": 0.0659, "step": 7490 }, { "epoch": 3.7859666834931853, "grad_norm": 0.3963538110256195, "learning_rate": 7.284199899040888e-06, "loss": 0.0573, "step": 7500 }, { "epoch": 3.791014639071176, "grad_norm": 0.32734355330467224, "learning_rate": 7.253912165572943e-06, "loss": 0.0563, "step": 7510 }, { "epoch": 3.7960625946491673, "grad_norm": 0.4177393317222595, "learning_rate": 7.223624432104998e-06, "loss": 0.0619, "step": 7520 }, { "epoch": 3.801110550227158, "grad_norm": 0.2514793276786804, "learning_rate": 7.193336698637052e-06, "loss": 0.0627, "step": 7530 }, { "epoch": 3.806158505805149, "grad_norm": 0.4437217116355896, "learning_rate": 7.163048965169107e-06, "loss": 0.061, "step": 7540 }, { "epoch": 3.81120646138314, "grad_norm": 0.5421292781829834, "learning_rate": 7.132761231701161e-06, "loss": 0.0569, "step": 7550 }, { "epoch": 3.8162544169611308, "grad_norm": 0.2918401062488556, "learning_rate": 7.102473498233216e-06, "loss": 0.0597, "step": 7560 }, { "epoch": 3.8213023725391215, "grad_norm": 0.5811281204223633, "learning_rate": 7.07218576476527e-06, "loss": 0.0582, "step": 7570 }, { "epoch": 3.8263503281171127, "grad_norm": 0.3603706955909729, "learning_rate": 7.041898031297325e-06, "loss": 0.0606, "step": 7580 }, { "epoch": 3.8313982836951035, "grad_norm": 0.3899904191493988, "learning_rate": 7.0116102978293786e-06, "loss": 0.0565, "step": 7590 }, { "epoch": 3.8364462392730942, "grad_norm": 0.5667692422866821, "learning_rate": 6.981322564361434e-06, "loss": 0.061, "step": 7600 }, { "epoch": 3.8414941948510855, "grad_norm": 0.5629199147224426, "learning_rate": 6.951034830893489e-06, "loss": 0.053, "step": 7610 }, { "epoch": 3.846542150429076, "grad_norm": 0.32999759912490845, "learning_rate": 6.920747097425543e-06, "loss": 0.0619, "step": 7620 }, { "epoch": 3.851590106007067, "grad_norm": 0.4874314069747925, "learning_rate": 6.8904593639575974e-06, "loss": 0.0608, "step": 7630 }, { "epoch": 3.856638061585058, "grad_norm": 0.3464485704898834, "learning_rate": 6.860171630489652e-06, "loss": 0.0646, "step": 7640 }, { "epoch": 3.861686017163049, "grad_norm": 0.6928033232688904, "learning_rate": 6.829883897021707e-06, "loss": 0.0621, "step": 7650 }, { "epoch": 3.8667339727410397, "grad_norm": 0.45158517360687256, "learning_rate": 6.79959616355376e-06, "loss": 0.0536, "step": 7660 }, { "epoch": 3.871781928319031, "grad_norm": 0.4070644974708557, "learning_rate": 6.7693084300858155e-06, "loss": 0.0516, "step": 7670 }, { "epoch": 3.8768298838970217, "grad_norm": 0.3872455954551697, "learning_rate": 6.739020696617871e-06, "loss": 0.0659, "step": 7680 }, { "epoch": 3.8818778394750124, "grad_norm": 0.3202153742313385, "learning_rate": 6.708732963149924e-06, "loss": 0.0588, "step": 7690 }, { "epoch": 3.8869257950530036, "grad_norm": 0.5554611682891846, "learning_rate": 6.678445229681979e-06, "loss": 0.0557, "step": 7700 }, { "epoch": 3.8919737506309944, "grad_norm": 0.326063871383667, "learning_rate": 6.6481574962140335e-06, "loss": 0.0575, "step": 7710 }, { "epoch": 3.897021706208985, "grad_norm": 0.4410304129123688, "learning_rate": 6.617869762746088e-06, "loss": 0.0647, "step": 7720 }, { "epoch": 3.9020696617869763, "grad_norm": 0.48591405153274536, "learning_rate": 6.587582029278142e-06, "loss": 0.0673, "step": 7730 }, { "epoch": 3.907117617364967, "grad_norm": 0.43069708347320557, "learning_rate": 6.557294295810197e-06, "loss": 0.057, "step": 7740 }, { "epoch": 3.912165572942958, "grad_norm": 0.37136363983154297, "learning_rate": 6.5270065623422515e-06, "loss": 0.0551, "step": 7750 }, { "epoch": 3.917213528520949, "grad_norm": 0.5326444506645203, "learning_rate": 6.496718828874306e-06, "loss": 0.0512, "step": 7760 }, { "epoch": 3.92226148409894, "grad_norm": 0.488471657037735, "learning_rate": 6.466431095406361e-06, "loss": 0.0645, "step": 7770 }, { "epoch": 3.9273094396769306, "grad_norm": 0.5709624290466309, "learning_rate": 6.436143361938415e-06, "loss": 0.0529, "step": 7780 }, { "epoch": 3.932357395254922, "grad_norm": 0.5405532717704773, "learning_rate": 6.4058556284704695e-06, "loss": 0.0629, "step": 7790 }, { "epoch": 3.9374053508329125, "grad_norm": 0.4417840242385864, "learning_rate": 6.375567895002524e-06, "loss": 0.0647, "step": 7800 }, { "epoch": 3.9424533064109037, "grad_norm": 0.4238974153995514, "learning_rate": 6.345280161534579e-06, "loss": 0.055, "step": 7810 }, { "epoch": 3.9475012619888945, "grad_norm": 0.5029925107955933, "learning_rate": 6.314992428066633e-06, "loss": 0.0538, "step": 7820 }, { "epoch": 3.9525492175668853, "grad_norm": 0.36458006501197815, "learning_rate": 6.2847046945986876e-06, "loss": 0.063, "step": 7830 }, { "epoch": 3.9575971731448765, "grad_norm": 0.2648584246635437, "learning_rate": 6.254416961130743e-06, "loss": 0.0647, "step": 7840 }, { "epoch": 3.9626451287228672, "grad_norm": 0.36690616607666016, "learning_rate": 6.224129227662796e-06, "loss": 0.0563, "step": 7850 }, { "epoch": 3.967693084300858, "grad_norm": 0.4327741861343384, "learning_rate": 6.193841494194851e-06, "loss": 0.0605, "step": 7860 }, { "epoch": 3.972741039878849, "grad_norm": 0.5171884298324585, "learning_rate": 6.163553760726906e-06, "loss": 0.0642, "step": 7870 }, { "epoch": 3.97778899545684, "grad_norm": 0.40773433446884155, "learning_rate": 6.13326602725896e-06, "loss": 0.0621, "step": 7880 }, { "epoch": 3.982836951034831, "grad_norm": 0.5063067078590393, "learning_rate": 6.102978293791015e-06, "loss": 0.0654, "step": 7890 }, { "epoch": 3.987884906612822, "grad_norm": 0.37013697624206543, "learning_rate": 6.072690560323069e-06, "loss": 0.0586, "step": 7900 }, { "epoch": 3.9929328621908127, "grad_norm": 0.3777279555797577, "learning_rate": 6.042402826855124e-06, "loss": 0.0544, "step": 7910 }, { "epoch": 3.997980817768804, "grad_norm": 0.4045654535293579, "learning_rate": 6.012115093387178e-06, "loss": 0.0706, "step": 7920 }, { "epoch": 4.0, "eval_f1": 0.9705180789481339, "eval_loss": 0.044486090540885925, "eval_runtime": 801.1309, "eval_samples_per_second": 257.464, "eval_steps_per_second": 2.012, "step": 7924 }, { "epoch": 4.003028773346794, "grad_norm": 0.34277331829071045, "learning_rate": 5.981827359919233e-06, "loss": 0.0532, "step": 7930 }, { "epoch": 4.008076728924785, "grad_norm": 0.40653395652770996, "learning_rate": 5.951539626451287e-06, "loss": 0.0601, "step": 7940 }, { "epoch": 4.013124684502777, "grad_norm": 0.39089271426200867, "learning_rate": 5.921251892983342e-06, "loss": 0.0585, "step": 7950 }, { "epoch": 4.018172640080767, "grad_norm": 0.3117099404335022, "learning_rate": 5.890964159515397e-06, "loss": 0.0536, "step": 7960 }, { "epoch": 4.023220595658758, "grad_norm": 0.4908514618873596, "learning_rate": 5.860676426047451e-06, "loss": 0.0618, "step": 7970 }, { "epoch": 4.028268551236749, "grad_norm": 0.35001102089881897, "learning_rate": 5.830388692579505e-06, "loss": 0.0595, "step": 7980 }, { "epoch": 4.03331650681474, "grad_norm": 0.39042168855667114, "learning_rate": 5.80010095911156e-06, "loss": 0.0639, "step": 7990 }, { "epoch": 4.038364462392731, "grad_norm": 0.48590323328971863, "learning_rate": 5.769813225643615e-06, "loss": 0.0606, "step": 8000 }, { "epoch": 4.043412417970722, "grad_norm": 0.3951605558395386, "learning_rate": 5.739525492175669e-06, "loss": 0.0585, "step": 8010 }, { "epoch": 4.048460373548712, "grad_norm": 0.4090045690536499, "learning_rate": 5.709237758707723e-06, "loss": 0.064, "step": 8020 }, { "epoch": 4.053508329126704, "grad_norm": 0.5321690440177917, "learning_rate": 5.6789500252397786e-06, "loss": 0.0581, "step": 8030 }, { "epoch": 4.058556284704695, "grad_norm": 0.4750302731990814, "learning_rate": 5.648662291771832e-06, "loss": 0.066, "step": 8040 }, { "epoch": 4.063604240282685, "grad_norm": 0.36469149589538574, "learning_rate": 5.618374558303887e-06, "loss": 0.0604, "step": 8050 }, { "epoch": 4.068652195860676, "grad_norm": 0.35261520743370056, "learning_rate": 5.5880868248359414e-06, "loss": 0.061, "step": 8060 }, { "epoch": 4.0737001514386675, "grad_norm": 0.32109716534614563, "learning_rate": 5.557799091367996e-06, "loss": 0.0613, "step": 8070 }, { "epoch": 4.078748107016659, "grad_norm": 0.41034355759620667, "learning_rate": 5.527511357900051e-06, "loss": 0.0567, "step": 8080 }, { "epoch": 4.083796062594649, "grad_norm": 0.4242144823074341, "learning_rate": 5.497223624432105e-06, "loss": 0.0539, "step": 8090 }, { "epoch": 4.08884401817264, "grad_norm": 0.32515600323677063, "learning_rate": 5.4669358909641595e-06, "loss": 0.0581, "step": 8100 }, { "epoch": 4.093891973750631, "grad_norm": 0.6698907017707825, "learning_rate": 5.436648157496214e-06, "loss": 0.0686, "step": 8110 }, { "epoch": 4.098939929328622, "grad_norm": 0.2780954837799072, "learning_rate": 5.406360424028269e-06, "loss": 0.0518, "step": 8120 }, { "epoch": 4.103987884906613, "grad_norm": 0.3639545440673828, "learning_rate": 5.376072690560323e-06, "loss": 0.0569, "step": 8130 }, { "epoch": 4.109035840484604, "grad_norm": 0.4723798930644989, "learning_rate": 5.3457849570923775e-06, "loss": 0.0596, "step": 8140 }, { "epoch": 4.1140837960625944, "grad_norm": 0.30923640727996826, "learning_rate": 5.315497223624433e-06, "loss": 0.0564, "step": 8150 }, { "epoch": 4.119131751640586, "grad_norm": 0.3050035238265991, "learning_rate": 5.285209490156487e-06, "loss": 0.0653, "step": 8160 }, { "epoch": 4.124179707218577, "grad_norm": 0.5005570650100708, "learning_rate": 5.254921756688541e-06, "loss": 0.0623, "step": 8170 }, { "epoch": 4.129227662796567, "grad_norm": 0.5100895762443542, "learning_rate": 5.2246340232205955e-06, "loss": 0.0622, "step": 8180 }, { "epoch": 4.134275618374558, "grad_norm": 0.33904436230659485, "learning_rate": 5.194346289752651e-06, "loss": 0.0575, "step": 8190 }, { "epoch": 4.13932357395255, "grad_norm": 0.3320677876472473, "learning_rate": 5.164058556284704e-06, "loss": 0.0565, "step": 8200 }, { "epoch": 4.14437152953054, "grad_norm": 0.3176303803920746, "learning_rate": 5.133770822816759e-06, "loss": 0.0597, "step": 8210 }, { "epoch": 4.149419485108531, "grad_norm": 0.33052679896354675, "learning_rate": 5.103483089348814e-06, "loss": 0.0553, "step": 8220 }, { "epoch": 4.154467440686522, "grad_norm": 0.3024562895298004, "learning_rate": 5.073195355880868e-06, "loss": 0.0595, "step": 8230 }, { "epoch": 4.159515396264513, "grad_norm": 0.380520224571228, "learning_rate": 5.042907622412923e-06, "loss": 0.048, "step": 8240 }, { "epoch": 4.164563351842504, "grad_norm": 0.47053784132003784, "learning_rate": 5.012619888944977e-06, "loss": 0.0616, "step": 8250 }, { "epoch": 4.169611307420495, "grad_norm": 0.5295135378837585, "learning_rate": 4.982332155477032e-06, "loss": 0.0579, "step": 8260 }, { "epoch": 4.174659262998485, "grad_norm": 0.3950503468513489, "learning_rate": 4.952044422009086e-06, "loss": 0.0594, "step": 8270 }, { "epoch": 4.1797072185764765, "grad_norm": 0.40204277634620667, "learning_rate": 4.921756688541141e-06, "loss": 0.0568, "step": 8280 }, { "epoch": 4.184755174154468, "grad_norm": 0.4756285548210144, "learning_rate": 4.891468955073196e-06, "loss": 0.0684, "step": 8290 }, { "epoch": 4.189803129732458, "grad_norm": 0.42255735397338867, "learning_rate": 4.86118122160525e-06, "loss": 0.0551, "step": 8300 }, { "epoch": 4.194851085310449, "grad_norm": 0.35746055841445923, "learning_rate": 4.830893488137305e-06, "loss": 0.0536, "step": 8310 }, { "epoch": 4.1998990408884405, "grad_norm": 0.2798272371292114, "learning_rate": 4.800605754669359e-06, "loss": 0.0654, "step": 8320 }, { "epoch": 4.204946996466431, "grad_norm": 0.4099213778972626, "learning_rate": 4.770318021201413e-06, "loss": 0.0695, "step": 8330 }, { "epoch": 4.209994952044422, "grad_norm": 0.31809088587760925, "learning_rate": 4.7400302877334685e-06, "loss": 0.0567, "step": 8340 }, { "epoch": 4.215042907622413, "grad_norm": 0.3884822726249695, "learning_rate": 4.709742554265523e-06, "loss": 0.0621, "step": 8350 }, { "epoch": 4.2200908632004035, "grad_norm": 0.4989534020423889, "learning_rate": 4.679454820797577e-06, "loss": 0.0591, "step": 8360 }, { "epoch": 4.225138818778395, "grad_norm": 0.5055777430534363, "learning_rate": 4.649167087329631e-06, "loss": 0.0552, "step": 8370 }, { "epoch": 4.230186774356386, "grad_norm": 0.4415469765663147, "learning_rate": 4.6188793538616865e-06, "loss": 0.0726, "step": 8380 }, { "epoch": 4.235234729934376, "grad_norm": 0.24666030704975128, "learning_rate": 4.58859162039374e-06, "loss": 0.0526, "step": 8390 }, { "epoch": 4.240282685512367, "grad_norm": 0.49552977085113525, "learning_rate": 4.558303886925795e-06, "loss": 0.0607, "step": 8400 }, { "epoch": 4.245330641090359, "grad_norm": 0.3048471510410309, "learning_rate": 4.52801615345785e-06, "loss": 0.0628, "step": 8410 }, { "epoch": 4.250378596668349, "grad_norm": 0.3662854731082916, "learning_rate": 4.497728419989904e-06, "loss": 0.062, "step": 8420 }, { "epoch": 4.25542655224634, "grad_norm": 0.3893071711063385, "learning_rate": 4.467440686521959e-06, "loss": 0.0542, "step": 8430 }, { "epoch": 4.260474507824331, "grad_norm": 0.40179580450057983, "learning_rate": 4.437152953054013e-06, "loss": 0.0524, "step": 8440 }, { "epoch": 4.265522463402322, "grad_norm": 0.35265469551086426, "learning_rate": 4.406865219586068e-06, "loss": 0.0616, "step": 8450 }, { "epoch": 4.270570418980313, "grad_norm": 0.2585351765155792, "learning_rate": 4.376577486118122e-06, "loss": 0.058, "step": 8460 }, { "epoch": 4.275618374558304, "grad_norm": 0.4452759325504303, "learning_rate": 4.346289752650177e-06, "loss": 0.0533, "step": 8470 }, { "epoch": 4.280666330136295, "grad_norm": 0.40577125549316406, "learning_rate": 4.316002019182232e-06, "loss": 0.055, "step": 8480 }, { "epoch": 4.285714285714286, "grad_norm": 0.2692396938800812, "learning_rate": 4.2857142857142855e-06, "loss": 0.0616, "step": 8490 }, { "epoch": 4.290762241292277, "grad_norm": 0.47697675228118896, "learning_rate": 4.255426552246341e-06, "loss": 0.0596, "step": 8500 }, { "epoch": 4.295810196870267, "grad_norm": 0.4272094964981079, "learning_rate": 4.225138818778395e-06, "loss": 0.0571, "step": 8510 }, { "epoch": 4.300858152448258, "grad_norm": 0.5147340297698975, "learning_rate": 4.194851085310449e-06, "loss": 0.0432, "step": 8520 }, { "epoch": 4.3059061080262495, "grad_norm": 0.37690308690071106, "learning_rate": 4.1645633518425035e-06, "loss": 0.054, "step": 8530 }, { "epoch": 4.310954063604241, "grad_norm": 0.5072263479232788, "learning_rate": 4.134275618374559e-06, "loss": 0.0575, "step": 8540 }, { "epoch": 4.316002019182231, "grad_norm": 0.3782062232494354, "learning_rate": 4.103987884906613e-06, "loss": 0.0558, "step": 8550 }, { "epoch": 4.321049974760222, "grad_norm": 0.27360981702804565, "learning_rate": 4.073700151438667e-06, "loss": 0.0645, "step": 8560 }, { "epoch": 4.326097930338213, "grad_norm": 0.5791490077972412, "learning_rate": 4.043412417970722e-06, "loss": 0.0751, "step": 8570 }, { "epoch": 4.331145885916204, "grad_norm": 0.2799968421459198, "learning_rate": 4.013124684502776e-06, "loss": 0.0542, "step": 8580 }, { "epoch": 4.336193841494195, "grad_norm": 0.4403197467327118, "learning_rate": 3.982836951034831e-06, "loss": 0.0647, "step": 8590 }, { "epoch": 4.341241797072186, "grad_norm": 0.3798120319843292, "learning_rate": 3.952549217566885e-06, "loss": 0.0545, "step": 8600 }, { "epoch": 4.3462897526501765, "grad_norm": 0.40195682644844055, "learning_rate": 3.92226148409894e-06, "loss": 0.058, "step": 8610 }, { "epoch": 4.351337708228168, "grad_norm": 0.30205094814300537, "learning_rate": 3.891973750630995e-06, "loss": 0.0585, "step": 8620 }, { "epoch": 4.356385663806159, "grad_norm": 0.3941998779773712, "learning_rate": 3.861686017163049e-06, "loss": 0.0628, "step": 8630 }, { "epoch": 4.361433619384149, "grad_norm": 0.4298538267612457, "learning_rate": 3.831398283695104e-06, "loss": 0.0519, "step": 8640 }, { "epoch": 4.36648157496214, "grad_norm": 0.45147988200187683, "learning_rate": 3.801110550227158e-06, "loss": 0.0555, "step": 8650 }, { "epoch": 4.371529530540132, "grad_norm": 0.3213054835796356, "learning_rate": 3.7708228167592127e-06, "loss": 0.0573, "step": 8660 }, { "epoch": 4.376577486118122, "grad_norm": 0.3924931287765503, "learning_rate": 3.740535083291267e-06, "loss": 0.0609, "step": 8670 }, { "epoch": 4.381625441696113, "grad_norm": 0.3347417116165161, "learning_rate": 3.7102473498233217e-06, "loss": 0.0573, "step": 8680 }, { "epoch": 4.386673397274104, "grad_norm": 0.5916124582290649, "learning_rate": 3.679959616355376e-06, "loss": 0.0631, "step": 8690 }, { "epoch": 4.391721352852095, "grad_norm": 0.4623749852180481, "learning_rate": 3.6496718828874303e-06, "loss": 0.0603, "step": 8700 }, { "epoch": 4.396769308430086, "grad_norm": 0.3337404727935791, "learning_rate": 3.6193841494194855e-06, "loss": 0.0559, "step": 8710 }, { "epoch": 4.401817264008077, "grad_norm": 0.4419994652271271, "learning_rate": 3.5890964159515398e-06, "loss": 0.0574, "step": 8720 }, { "epoch": 4.406865219586067, "grad_norm": 0.47578585147857666, "learning_rate": 3.5588086824835945e-06, "loss": 0.0554, "step": 8730 }, { "epoch": 4.411913175164059, "grad_norm": 0.3991304337978363, "learning_rate": 3.5285209490156488e-06, "loss": 0.0522, "step": 8740 }, { "epoch": 4.41696113074205, "grad_norm": 0.2646455764770508, "learning_rate": 3.498233215547703e-06, "loss": 0.053, "step": 8750 }, { "epoch": 4.42200908632004, "grad_norm": 0.38998502492904663, "learning_rate": 3.4679454820797578e-06, "loss": 0.0697, "step": 8760 }, { "epoch": 4.427057041898031, "grad_norm": 0.39025184512138367, "learning_rate": 3.437657748611812e-06, "loss": 0.0564, "step": 8770 }, { "epoch": 4.4321049974760225, "grad_norm": 0.36179178953170776, "learning_rate": 3.407370015143867e-06, "loss": 0.0695, "step": 8780 }, { "epoch": 4.437152953054013, "grad_norm": 0.47754356265068054, "learning_rate": 3.3770822816759215e-06, "loss": 0.0599, "step": 8790 }, { "epoch": 4.442200908632004, "grad_norm": 0.3687341511249542, "learning_rate": 3.346794548207976e-06, "loss": 0.0577, "step": 8800 }, { "epoch": 4.447248864209995, "grad_norm": 0.4395473003387451, "learning_rate": 3.3165068147400305e-06, "loss": 0.0559, "step": 8810 }, { "epoch": 4.4522968197879855, "grad_norm": 0.3659065365791321, "learning_rate": 3.286219081272085e-06, "loss": 0.0591, "step": 8820 }, { "epoch": 4.457344775365977, "grad_norm": 0.47786960005760193, "learning_rate": 3.255931347804139e-06, "loss": 0.0591, "step": 8830 }, { "epoch": 4.462392730943968, "grad_norm": 0.44323790073394775, "learning_rate": 3.2256436143361943e-06, "loss": 0.0508, "step": 8840 }, { "epoch": 4.467440686521958, "grad_norm": 0.3510769307613373, "learning_rate": 3.1953558808682486e-06, "loss": 0.0554, "step": 8850 }, { "epoch": 4.4724886420999495, "grad_norm": 0.45277318358421326, "learning_rate": 3.165068147400303e-06, "loss": 0.0532, "step": 8860 }, { "epoch": 4.477536597677941, "grad_norm": 0.5000207424163818, "learning_rate": 3.1347804139323576e-06, "loss": 0.0654, "step": 8870 }, { "epoch": 4.482584553255931, "grad_norm": 0.37949642539024353, "learning_rate": 3.104492680464412e-06, "loss": 0.0549, "step": 8880 }, { "epoch": 4.487632508833922, "grad_norm": 0.3000931143760681, "learning_rate": 3.0742049469964666e-06, "loss": 0.0544, "step": 8890 }, { "epoch": 4.492680464411913, "grad_norm": 0.512484610080719, "learning_rate": 3.043917213528521e-06, "loss": 0.0651, "step": 8900 }, { "epoch": 4.497728419989904, "grad_norm": 0.4052237570285797, "learning_rate": 3.0136294800605756e-06, "loss": 0.0601, "step": 8910 }, { "epoch": 4.502776375567895, "grad_norm": 0.3805348873138428, "learning_rate": 2.9833417465926303e-06, "loss": 0.0553, "step": 8920 }, { "epoch": 4.507824331145886, "grad_norm": 0.4143049120903015, "learning_rate": 2.9530540131246846e-06, "loss": 0.0488, "step": 8930 }, { "epoch": 4.512872286723876, "grad_norm": 0.4691813290119171, "learning_rate": 2.922766279656739e-06, "loss": 0.0544, "step": 8940 }, { "epoch": 4.517920242301868, "grad_norm": 0.40783849358558655, "learning_rate": 2.8924785461887936e-06, "loss": 0.0678, "step": 8950 }, { "epoch": 4.522968197879859, "grad_norm": 0.36696454882621765, "learning_rate": 2.862190812720848e-06, "loss": 0.0591, "step": 8960 }, { "epoch": 4.52801615345785, "grad_norm": 0.43989595770835876, "learning_rate": 2.8319030792529026e-06, "loss": 0.0604, "step": 8970 }, { "epoch": 4.53306410903584, "grad_norm": 0.38078877329826355, "learning_rate": 2.8016153457849574e-06, "loss": 0.0578, "step": 8980 }, { "epoch": 4.5381120646138315, "grad_norm": 0.3941843807697296, "learning_rate": 2.7713276123170117e-06, "loss": 0.0694, "step": 8990 }, { "epoch": 4.543160020191822, "grad_norm": 0.3795044422149658, "learning_rate": 2.7410398788490664e-06, "loss": 0.0588, "step": 9000 }, { "epoch": 4.548207975769813, "grad_norm": 0.3949735462665558, "learning_rate": 2.7107521453811207e-06, "loss": 0.0623, "step": 9010 }, { "epoch": 4.553255931347804, "grad_norm": 0.5588275194168091, "learning_rate": 2.680464411913175e-06, "loss": 0.0588, "step": 9020 }, { "epoch": 4.5583038869257955, "grad_norm": 0.29749733209609985, "learning_rate": 2.6501766784452297e-06, "loss": 0.0445, "step": 9030 }, { "epoch": 4.563351842503786, "grad_norm": 0.4993056654930115, "learning_rate": 2.6198889449772844e-06, "loss": 0.0595, "step": 9040 }, { "epoch": 4.568399798081777, "grad_norm": 0.5257248878479004, "learning_rate": 2.589601211509339e-06, "loss": 0.0469, "step": 9050 }, { "epoch": 4.573447753659767, "grad_norm": 0.35071873664855957, "learning_rate": 2.5593134780413934e-06, "loss": 0.056, "step": 9060 }, { "epoch": 4.5784957092377585, "grad_norm": 0.49088719487190247, "learning_rate": 2.5290257445734477e-06, "loss": 0.0619, "step": 9070 }, { "epoch": 4.58354366481575, "grad_norm": 0.5432353019714355, "learning_rate": 2.4987380111055024e-06, "loss": 0.0583, "step": 9080 }, { "epoch": 4.588591620393741, "grad_norm": 0.5358169674873352, "learning_rate": 2.4684502776375567e-06, "loss": 0.0618, "step": 9090 }, { "epoch": 4.593639575971731, "grad_norm": 0.299734890460968, "learning_rate": 2.438162544169611e-06, "loss": 0.0587, "step": 9100 }, { "epoch": 4.598687531549722, "grad_norm": 0.28594735264778137, "learning_rate": 2.407874810701666e-06, "loss": 0.0552, "step": 9110 }, { "epoch": 4.603735487127714, "grad_norm": 0.440019428730011, "learning_rate": 2.3775870772337205e-06, "loss": 0.0616, "step": 9120 }, { "epoch": 4.608783442705704, "grad_norm": 0.3852064311504364, "learning_rate": 2.347299343765775e-06, "loss": 0.0544, "step": 9130 }, { "epoch": 4.613831398283695, "grad_norm": 0.47597625851631165, "learning_rate": 2.3170116102978295e-06, "loss": 0.0626, "step": 9140 }, { "epoch": 4.618879353861686, "grad_norm": 0.4893425703048706, "learning_rate": 2.2867238768298838e-06, "loss": 0.0493, "step": 9150 }, { "epoch": 4.623927309439677, "grad_norm": 0.4313579201698303, "learning_rate": 2.2564361433619385e-06, "loss": 0.0533, "step": 9160 }, { "epoch": 4.628975265017668, "grad_norm": 0.31476062536239624, "learning_rate": 2.2261484098939928e-06, "loss": 0.0586, "step": 9170 }, { "epoch": 4.634023220595659, "grad_norm": 0.4846239686012268, "learning_rate": 2.1958606764260475e-06, "loss": 0.0541, "step": 9180 }, { "epoch": 4.639071176173649, "grad_norm": 0.4027024805545807, "learning_rate": 2.1655729429581022e-06, "loss": 0.0532, "step": 9190 }, { "epoch": 4.644119131751641, "grad_norm": 0.43335291743278503, "learning_rate": 2.1352852094901565e-06, "loss": 0.0664, "step": 9200 }, { "epoch": 4.649167087329632, "grad_norm": 0.47337576746940613, "learning_rate": 2.1049974760222112e-06, "loss": 0.0592, "step": 9210 }, { "epoch": 4.654215042907622, "grad_norm": 0.44911569356918335, "learning_rate": 2.0747097425542655e-06, "loss": 0.0642, "step": 9220 }, { "epoch": 4.659262998485613, "grad_norm": 0.47989997267723083, "learning_rate": 2.04442200908632e-06, "loss": 0.0558, "step": 9230 }, { "epoch": 4.6643109540636045, "grad_norm": 0.3837885856628418, "learning_rate": 2.014134275618375e-06, "loss": 0.0534, "step": 9240 }, { "epoch": 4.669358909641595, "grad_norm": 0.33468201756477356, "learning_rate": 1.9838465421504293e-06, "loss": 0.0638, "step": 9250 }, { "epoch": 4.674406865219586, "grad_norm": 0.3218873143196106, "learning_rate": 1.9535588086824836e-06, "loss": 0.0562, "step": 9260 }, { "epoch": 4.679454820797577, "grad_norm": 0.4538477659225464, "learning_rate": 1.9232710752145383e-06, "loss": 0.0562, "step": 9270 }, { "epoch": 4.684502776375568, "grad_norm": 0.42905497550964355, "learning_rate": 1.8929833417465926e-06, "loss": 0.0581, "step": 9280 }, { "epoch": 4.689550731953559, "grad_norm": 0.3783353567123413, "learning_rate": 1.8626956082786473e-06, "loss": 0.0486, "step": 9290 }, { "epoch": 4.69459868753155, "grad_norm": 0.42233869433403015, "learning_rate": 1.8324078748107018e-06, "loss": 0.0534, "step": 9300 }, { "epoch": 4.69964664310954, "grad_norm": 0.2925800383090973, "learning_rate": 1.802120141342756e-06, "loss": 0.0557, "step": 9310 }, { "epoch": 4.7046945986875315, "grad_norm": 0.4508257210254669, "learning_rate": 1.7718324078748106e-06, "loss": 0.0615, "step": 9320 }, { "epoch": 4.709742554265523, "grad_norm": 0.5092118382453918, "learning_rate": 1.7415446744068653e-06, "loss": 0.0577, "step": 9330 }, { "epoch": 4.714790509843513, "grad_norm": 0.3694470524787903, "learning_rate": 1.7112569409389198e-06, "loss": 0.0485, "step": 9340 }, { "epoch": 4.719838465421504, "grad_norm": 0.4794639050960541, "learning_rate": 1.6809692074709741e-06, "loss": 0.0699, "step": 9350 }, { "epoch": 4.724886420999495, "grad_norm": 0.4152567982673645, "learning_rate": 1.6506814740030288e-06, "loss": 0.0521, "step": 9360 }, { "epoch": 4.729934376577486, "grad_norm": 0.48920056223869324, "learning_rate": 1.6203937405350833e-06, "loss": 0.0677, "step": 9370 }, { "epoch": 4.734982332155477, "grad_norm": 0.37886640429496765, "learning_rate": 1.5901060070671379e-06, "loss": 0.0575, "step": 9380 }, { "epoch": 4.740030287733468, "grad_norm": 0.5271609425544739, "learning_rate": 1.5598182735991924e-06, "loss": 0.0618, "step": 9390 }, { "epoch": 4.745078243311459, "grad_norm": 0.376953125, "learning_rate": 1.5295305401312469e-06, "loss": 0.0558, "step": 9400 }, { "epoch": 4.75012619888945, "grad_norm": 0.4146003723144531, "learning_rate": 1.4992428066633014e-06, "loss": 0.0567, "step": 9410 }, { "epoch": 4.755174154467441, "grad_norm": 0.5335793495178223, "learning_rate": 1.4689550731953559e-06, "loss": 0.0527, "step": 9420 }, { "epoch": 4.760222110045431, "grad_norm": 0.4028931260108948, "learning_rate": 1.4386673397274104e-06, "loss": 0.0546, "step": 9430 }, { "epoch": 4.765270065623422, "grad_norm": 0.4504133462905884, "learning_rate": 1.408379606259465e-06, "loss": 0.0608, "step": 9440 }, { "epoch": 4.770318021201414, "grad_norm": 0.4923204183578491, "learning_rate": 1.3780918727915194e-06, "loss": 0.0621, "step": 9450 }, { "epoch": 4.775365976779405, "grad_norm": 0.29700249433517456, "learning_rate": 1.3478041393235741e-06, "loss": 0.055, "step": 9460 }, { "epoch": 4.780413932357395, "grad_norm": 0.4809055030345917, "learning_rate": 1.3175164058556284e-06, "loss": 0.0546, "step": 9470 }, { "epoch": 4.785461887935386, "grad_norm": 0.5369795560836792, "learning_rate": 1.287228672387683e-06, "loss": 0.059, "step": 9480 }, { "epoch": 4.790509843513377, "grad_norm": 0.4439578652381897, "learning_rate": 1.2569409389197376e-06, "loss": 0.0615, "step": 9490 }, { "epoch": 4.795557799091368, "grad_norm": 0.39975985884666443, "learning_rate": 1.2266532054517921e-06, "loss": 0.0587, "step": 9500 }, { "epoch": 4.800605754669359, "grad_norm": 0.34285855293273926, "learning_rate": 1.1963654719838464e-06, "loss": 0.0497, "step": 9510 }, { "epoch": 4.80565371024735, "grad_norm": 0.3402077257633209, "learning_rate": 1.166077738515901e-06, "loss": 0.0579, "step": 9520 }, { "epoch": 4.8107016658253405, "grad_norm": 0.3736449182033539, "learning_rate": 1.1357900050479557e-06, "loss": 0.063, "step": 9530 }, { "epoch": 4.815749621403332, "grad_norm": 0.3561767637729645, "learning_rate": 1.1055022715800102e-06, "loss": 0.0633, "step": 9540 }, { "epoch": 4.820797576981323, "grad_norm": 0.447592556476593, "learning_rate": 1.0752145381120645e-06, "loss": 0.0484, "step": 9550 }, { "epoch": 4.825845532559313, "grad_norm": 0.3960745930671692, "learning_rate": 1.0449268046441192e-06, "loss": 0.0631, "step": 9560 }, { "epoch": 4.8308934881373045, "grad_norm": 0.2932693064212799, "learning_rate": 1.0146390711761737e-06, "loss": 0.0562, "step": 9570 }, { "epoch": 4.835941443715296, "grad_norm": 0.37769854068756104, "learning_rate": 9.843513377082282e-07, "loss": 0.0482, "step": 9580 }, { "epoch": 4.840989399293286, "grad_norm": 0.3415481150150299, "learning_rate": 9.540636042402827e-07, "loss": 0.055, "step": 9590 }, { "epoch": 4.846037354871277, "grad_norm": 0.38010311126708984, "learning_rate": 9.237758707723372e-07, "loss": 0.0599, "step": 9600 }, { "epoch": 4.851085310449268, "grad_norm": 0.3991403579711914, "learning_rate": 8.934881373043917e-07, "loss": 0.0637, "step": 9610 }, { "epoch": 4.856133266027259, "grad_norm": 0.5155503153800964, "learning_rate": 8.632004038364462e-07, "loss": 0.0671, "step": 9620 }, { "epoch": 4.86118122160525, "grad_norm": 0.42242443561553955, "learning_rate": 8.329126703685008e-07, "loss": 0.0565, "step": 9630 }, { "epoch": 4.866229177183241, "grad_norm": 0.4904538691043854, "learning_rate": 8.026249369005552e-07, "loss": 0.0568, "step": 9640 }, { "epoch": 4.871277132761231, "grad_norm": 0.5523189902305603, "learning_rate": 7.723372034326099e-07, "loss": 0.0559, "step": 9650 }, { "epoch": 4.876325088339223, "grad_norm": 0.4754299819469452, "learning_rate": 7.420494699646643e-07, "loss": 0.0653, "step": 9660 }, { "epoch": 4.881373043917214, "grad_norm": 0.3697846531867981, "learning_rate": 7.117617364967189e-07, "loss": 0.0539, "step": 9670 }, { "epoch": 4.886420999495204, "grad_norm": 0.46191075444221497, "learning_rate": 6.814740030287734e-07, "loss": 0.0676, "step": 9680 }, { "epoch": 4.891468955073195, "grad_norm": 0.3706737756729126, "learning_rate": 6.511862695608279e-07, "loss": 0.0576, "step": 9690 }, { "epoch": 4.8965169106511865, "grad_norm": 0.34824711084365845, "learning_rate": 6.208985360928824e-07, "loss": 0.0607, "step": 9700 }, { "epoch": 4.901564866229177, "grad_norm": 0.33516255021095276, "learning_rate": 5.906108026249369e-07, "loss": 0.0532, "step": 9710 }, { "epoch": 4.906612821807168, "grad_norm": 0.4216098189353943, "learning_rate": 5.603230691569914e-07, "loss": 0.0506, "step": 9720 }, { "epoch": 4.911660777385159, "grad_norm": 0.39393237233161926, "learning_rate": 5.30035335689046e-07, "loss": 0.0622, "step": 9730 }, { "epoch": 4.91670873296315, "grad_norm": 0.37353748083114624, "learning_rate": 4.997476022211004e-07, "loss": 0.0508, "step": 9740 }, { "epoch": 4.921756688541141, "grad_norm": 0.32179582118988037, "learning_rate": 4.69459868753155e-07, "loss": 0.0461, "step": 9750 }, { "epoch": 4.926804644119132, "grad_norm": 0.34863799810409546, "learning_rate": 4.3917213528520954e-07, "loss": 0.0513, "step": 9760 }, { "epoch": 4.931852599697122, "grad_norm": 0.4207555651664734, "learning_rate": 4.0888440181726405e-07, "loss": 0.0516, "step": 9770 }, { "epoch": 4.9369005552751135, "grad_norm": 0.372896283864975, "learning_rate": 3.7859666834931856e-07, "loss": 0.0476, "step": 9780 }, { "epoch": 4.941948510853105, "grad_norm": 0.5434166789054871, "learning_rate": 3.4830893488137306e-07, "loss": 0.0646, "step": 9790 }, { "epoch": 4.946996466431095, "grad_norm": 0.5460948348045349, "learning_rate": 3.1802120141342757e-07, "loss": 0.0562, "step": 9800 }, { "epoch": 4.952044422009086, "grad_norm": 0.4554930329322815, "learning_rate": 2.8773346794548213e-07, "loss": 0.0664, "step": 9810 }, { "epoch": 4.957092377587077, "grad_norm": 0.5326105356216431, "learning_rate": 2.5744573447753664e-07, "loss": 0.0536, "step": 9820 }, { "epoch": 4.962140333165069, "grad_norm": 0.3335418999195099, "learning_rate": 2.2715800100959112e-07, "loss": 0.0611, "step": 9830 }, { "epoch": 4.967188288743059, "grad_norm": 0.408489465713501, "learning_rate": 1.9687026754164563e-07, "loss": 0.056, "step": 9840 }, { "epoch": 4.97223624432105, "grad_norm": 0.49370092153549194, "learning_rate": 1.6658253407370016e-07, "loss": 0.0615, "step": 9850 }, { "epoch": 4.9772841998990405, "grad_norm": 0.47176486253738403, "learning_rate": 1.3629480060575467e-07, "loss": 0.0534, "step": 9860 }, { "epoch": 4.982332155477032, "grad_norm": 0.3332078158855438, "learning_rate": 1.0600706713780919e-07, "loss": 0.0484, "step": 9870 }, { "epoch": 4.987380111055023, "grad_norm": 0.4342339038848877, "learning_rate": 7.57193336698637e-08, "loss": 0.0557, "step": 9880 }, { "epoch": 4.992428066633014, "grad_norm": 0.3356720805168152, "learning_rate": 4.5431600201918226e-08, "loss": 0.0534, "step": 9890 }, { "epoch": 4.997476022211004, "grad_norm": 0.6361636519432068, "learning_rate": 1.514386673397274e-08, "loss": 0.0595, "step": 9900 }, { "epoch": 5.0, "eval_f1": 0.9429269569770486, "eval_loss": 0.0460049994289875, "eval_runtime": 555.0466, "eval_samples_per_second": 371.612, "eval_steps_per_second": 2.904, "step": 9905 }, { "epoch": 5.0, "step": 9905, "total_flos": 9.820471825285631e+19, "train_loss": 0.011631221487809769, "train_runtime": 2940.6816, "train_samples_per_second": 430.955, "train_steps_per_second": 3.368 } ], "logging_steps": 10, "max_steps": 9905, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.820471825285631e+19, "train_batch_size": 128, "trial_name": null, "trial_params": null }