{ "best_global_step": 2000, "best_metric": 1.3367302417755127, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 1000, "global_step": 5812, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017205781142463867, "grad_norm": 0.9484581351280212, "learning_rate": 1.4000000000000001e-05, "loss": 1.1583, "step": 50 }, { "epoch": 0.034411562284927734, "grad_norm": 1.0050328969955444, "learning_rate": 2.8285714285714287e-05, "loss": 0.9511, "step": 100 }, { "epoch": 0.051617343427391604, "grad_norm": 1.2752665281295776, "learning_rate": 4.257142857142857e-05, "loss": 0.8969, "step": 150 }, { "epoch": 0.06882312456985547, "grad_norm": 1.1626160144805908, "learning_rate": 4.978712080894093e-05, "loss": 0.8897, "step": 200 }, { "epoch": 0.08602890571231935, "grad_norm": 1.154097080230713, "learning_rate": 4.934362249423453e-05, "loss": 0.909, "step": 250 }, { "epoch": 0.10323468685478321, "grad_norm": 1.112544298171997, "learning_rate": 4.890012417952812e-05, "loss": 0.8691, "step": 300 }, { "epoch": 0.12044046799724707, "grad_norm": 1.2551391124725342, "learning_rate": 4.845662586482171e-05, "loss": 0.8942, "step": 350 }, { "epoch": 0.13764624913971094, "grad_norm": 1.0854212045669556, "learning_rate": 4.801312755011531e-05, "loss": 0.8984, "step": 400 }, { "epoch": 0.1548520302821748, "grad_norm": 1.198875904083252, "learning_rate": 4.7569629235408906e-05, "loss": 0.8765, "step": 450 }, { "epoch": 0.1720578114246387, "grad_norm": 1.3993616104125977, "learning_rate": 4.7126130920702504e-05, "loss": 0.9302, "step": 500 }, { "epoch": 0.18926359256710254, "grad_norm": 1.1944853067398071, "learning_rate": 4.66826326059961e-05, "loss": 0.9117, "step": 550 }, { "epoch": 0.20646937370956642, "grad_norm": 1.3922518491744995, "learning_rate": 4.623913429128969e-05, "loss": 0.9291, "step": 600 }, { "epoch": 0.2236751548520303, "grad_norm": 1.1352382898330688, "learning_rate": 4.579563597658329e-05, "loss": 0.9108, "step": 650 }, { "epoch": 0.24088093599449414, "grad_norm": 1.2245187759399414, "learning_rate": 4.535213766187689e-05, "loss": 0.9009, "step": 700 }, { "epoch": 0.258086717136958, "grad_norm": 1.303236961364746, "learning_rate": 4.4908639347170486e-05, "loss": 0.9326, "step": 750 }, { "epoch": 0.27529249827942187, "grad_norm": 1.1667600870132446, "learning_rate": 4.4465141032464084e-05, "loss": 0.8955, "step": 800 }, { "epoch": 0.2924982794218858, "grad_norm": 1.166150689125061, "learning_rate": 4.4021642717757675e-05, "loss": 0.918, "step": 850 }, { "epoch": 0.3097040605643496, "grad_norm": 1.330957055091858, "learning_rate": 4.3578144403051266e-05, "loss": 0.9135, "step": 900 }, { "epoch": 0.3269098417068135, "grad_norm": 1.064183235168457, "learning_rate": 4.313464608834486e-05, "loss": 0.946, "step": 950 }, { "epoch": 0.3441156228492774, "grad_norm": 1.189034342765808, "learning_rate": 4.269114777363846e-05, "loss": 0.9749, "step": 1000 }, { "epoch": 0.3441156228492774, "eval_loss": 1.4126514196395874, "eval_runtime": 93.1504, "eval_samples_per_second": 7.88, "eval_steps_per_second": 1.578, "step": 1000 }, { "epoch": 0.36132140399174123, "grad_norm": 1.0131909847259521, "learning_rate": 4.224764945893206e-05, "loss": 0.9431, "step": 1050 }, { "epoch": 0.3785271851342051, "grad_norm": 1.244815707206726, "learning_rate": 4.1804151144225656e-05, "loss": 1.0062, "step": 1100 }, { "epoch": 0.395732966276669, "grad_norm": 0.9802664518356323, "learning_rate": 4.136065282951925e-05, "loss": 1.0486, "step": 1150 }, { "epoch": 0.41293874741913283, "grad_norm": 0.9956826567649841, "learning_rate": 4.0917154514812845e-05, "loss": 0.9844, "step": 1200 }, { "epoch": 0.4301445285615967, "grad_norm": 1.0612707138061523, "learning_rate": 4.047365620010644e-05, "loss": 1.0091, "step": 1250 }, { "epoch": 0.4473503097040606, "grad_norm": 1.0374691486358643, "learning_rate": 4.003015788540004e-05, "loss": 1.0007, "step": 1300 }, { "epoch": 0.46455609084652444, "grad_norm": 1.2885727882385254, "learning_rate": 3.958665957069364e-05, "loss": 0.9555, "step": 1350 }, { "epoch": 0.4817618719889883, "grad_norm": 1.1126559972763062, "learning_rate": 3.914316125598723e-05, "loss": 1.0084, "step": 1400 }, { "epoch": 0.4989676531314522, "grad_norm": 1.1720352172851562, "learning_rate": 3.869966294128082e-05, "loss": 0.9789, "step": 1450 }, { "epoch": 0.516173434273916, "grad_norm": 1.0672069787979126, "learning_rate": 3.825616462657442e-05, "loss": 1.0046, "step": 1500 }, { "epoch": 0.5333792154163799, "grad_norm": 1.0774304866790771, "learning_rate": 3.7812666311868016e-05, "loss": 0.9957, "step": 1550 }, { "epoch": 0.5505849965588437, "grad_norm": 1.1217703819274902, "learning_rate": 3.736916799716161e-05, "loss": 1.0262, "step": 1600 }, { "epoch": 0.5677907777013076, "grad_norm": 1.037427544593811, "learning_rate": 3.692566968245521e-05, "loss": 1.0392, "step": 1650 }, { "epoch": 0.5849965588437716, "grad_norm": 1.1435151100158691, "learning_rate": 3.64821713677488e-05, "loss": 1.0341, "step": 1700 }, { "epoch": 0.6022023399862354, "grad_norm": 1.0763335227966309, "learning_rate": 3.60386730530424e-05, "loss": 1.0967, "step": 1750 }, { "epoch": 0.6194081211286993, "grad_norm": 0.9355671405792236, "learning_rate": 3.5595174738336e-05, "loss": 1.0745, "step": 1800 }, { "epoch": 0.6366139022711631, "grad_norm": 1.1394997835159302, "learning_rate": 3.5151676423629595e-05, "loss": 1.0604, "step": 1850 }, { "epoch": 0.653819683413627, "grad_norm": 0.8916899561882019, "learning_rate": 3.4708178108923186e-05, "loss": 1.0343, "step": 1900 }, { "epoch": 0.6710254645560908, "grad_norm": 1.0358752012252808, "learning_rate": 3.4264679794216784e-05, "loss": 1.1059, "step": 1950 }, { "epoch": 0.6882312456985548, "grad_norm": 1.090041160583496, "learning_rate": 3.3821181479510375e-05, "loss": 1.0659, "step": 2000 }, { "epoch": 0.6882312456985548, "eval_loss": 1.3367302417755127, "eval_runtime": 88.2266, "eval_samples_per_second": 8.319, "eval_steps_per_second": 1.666, "step": 2000 }, { "epoch": 0.7054370268410186, "grad_norm": 1.0861823558807373, "learning_rate": 3.337768316480397e-05, "loss": 1.0877, "step": 2050 }, { "epoch": 0.7226428079834825, "grad_norm": 1.1247456073760986, "learning_rate": 3.293418485009757e-05, "loss": 1.0547, "step": 2100 }, { "epoch": 0.7398485891259463, "grad_norm": 0.9415215253829956, "learning_rate": 3.249068653539117e-05, "loss": 1.1066, "step": 2150 }, { "epoch": 0.7570543702684102, "grad_norm": 1.0296528339385986, "learning_rate": 3.2047188220684766e-05, "loss": 1.1037, "step": 2200 }, { "epoch": 0.774260151410874, "grad_norm": 1.0104950666427612, "learning_rate": 3.1603689905978357e-05, "loss": 1.1236, "step": 2250 }, { "epoch": 0.791465932553338, "grad_norm": 1.1287598609924316, "learning_rate": 3.1160191591271954e-05, "loss": 1.0993, "step": 2300 }, { "epoch": 0.8086717136958018, "grad_norm": 0.9809098839759827, "learning_rate": 3.071669327656555e-05, "loss": 1.1639, "step": 2350 }, { "epoch": 0.8258774948382657, "grad_norm": 0.9065077304840088, "learning_rate": 3.0273194961859146e-05, "loss": 1.0698, "step": 2400 }, { "epoch": 0.8430832759807295, "grad_norm": 0.8680762648582458, "learning_rate": 2.9829696647152744e-05, "loss": 1.1864, "step": 2450 }, { "epoch": 0.8602890571231934, "grad_norm": 1.0787030458450317, "learning_rate": 2.9386198332446342e-05, "loss": 1.1552, "step": 2500 }, { "epoch": 0.8774948382656572, "grad_norm": 1.146647572517395, "learning_rate": 2.8942700017739933e-05, "loss": 1.2046, "step": 2550 }, { "epoch": 0.8947006194081212, "grad_norm": 1.0998808145523071, "learning_rate": 2.849920170303353e-05, "loss": 1.1504, "step": 2600 }, { "epoch": 0.911906400550585, "grad_norm": 0.9578316807746887, "learning_rate": 2.8055703388327125e-05, "loss": 1.1814, "step": 2650 }, { "epoch": 0.9291121816930489, "grad_norm": 0.8427146077156067, "learning_rate": 2.7612205073620722e-05, "loss": 1.1914, "step": 2700 }, { "epoch": 0.9463179628355127, "grad_norm": 1.0207325220108032, "learning_rate": 2.716870675891432e-05, "loss": 1.1949, "step": 2750 }, { "epoch": 0.9635237439779766, "grad_norm": 0.8344902992248535, "learning_rate": 2.672520844420791e-05, "loss": 1.1986, "step": 2800 }, { "epoch": 0.9807295251204404, "grad_norm": 1.0717881917953491, "learning_rate": 2.628171012950151e-05, "loss": 1.2419, "step": 2850 }, { "epoch": 0.9979353062629044, "grad_norm": 1.0195544958114624, "learning_rate": 2.5838211814795103e-05, "loss": 1.1612, "step": 2900 }, { "epoch": 1.0151410874053681, "grad_norm": 0.899029552936554, "learning_rate": 2.53947135000887e-05, "loss": 0.9398, "step": 2950 }, { "epoch": 1.032346868547832, "grad_norm": 1.1363555192947388, "learning_rate": 2.4951215185382295e-05, "loss": 0.8889, "step": 3000 }, { "epoch": 1.032346868547832, "eval_loss": 1.3666342496871948, "eval_runtime": 88.3327, "eval_samples_per_second": 8.309, "eval_steps_per_second": 1.664, "step": 3000 }, { "epoch": 1.049552649690296, "grad_norm": 1.237770438194275, "learning_rate": 2.4507716870675893e-05, "loss": 0.8746, "step": 3050 }, { "epoch": 1.0667584308327598, "grad_norm": 1.1103781461715698, "learning_rate": 2.406421855596949e-05, "loss": 0.8944, "step": 3100 }, { "epoch": 1.0839642119752237, "grad_norm": 1.2465671300888062, "learning_rate": 2.3620720241263085e-05, "loss": 0.9278, "step": 3150 }, { "epoch": 1.1011699931176875, "grad_norm": 1.304677963256836, "learning_rate": 2.317722192655668e-05, "loss": 0.8646, "step": 3200 }, { "epoch": 1.1183757742601514, "grad_norm": 1.1867053508758545, "learning_rate": 2.2733723611850277e-05, "loss": 0.8941, "step": 3250 }, { "epoch": 1.1355815554026152, "grad_norm": 1.0053008794784546, "learning_rate": 2.229022529714387e-05, "loss": 0.9093, "step": 3300 }, { "epoch": 1.1527873365450791, "grad_norm": 1.0691931247711182, "learning_rate": 2.184672698243747e-05, "loss": 0.8739, "step": 3350 }, { "epoch": 1.169993117687543, "grad_norm": 1.2377207279205322, "learning_rate": 2.1403228667731063e-05, "loss": 0.9417, "step": 3400 }, { "epoch": 1.1871988988300068, "grad_norm": 1.21890127658844, "learning_rate": 2.0959730353024658e-05, "loss": 0.8924, "step": 3450 }, { "epoch": 1.2044046799724708, "grad_norm": 1.099365472793579, "learning_rate": 2.0516232038318255e-05, "loss": 0.9046, "step": 3500 }, { "epoch": 1.2216104611149345, "grad_norm": 1.188915729522705, "learning_rate": 2.007273372361185e-05, "loss": 0.9054, "step": 3550 }, { "epoch": 1.2388162422573985, "grad_norm": 1.258689522743225, "learning_rate": 1.9629235408905447e-05, "loss": 0.9272, "step": 3600 }, { "epoch": 1.2560220233998622, "grad_norm": 1.1899304389953613, "learning_rate": 1.9185737094199045e-05, "loss": 0.9213, "step": 3650 }, { "epoch": 1.2732278045423262, "grad_norm": 1.2941292524337769, "learning_rate": 1.874223877949264e-05, "loss": 0.9083, "step": 3700 }, { "epoch": 1.2904335856847902, "grad_norm": 1.148829460144043, "learning_rate": 1.8298740464786234e-05, "loss": 0.9208, "step": 3750 }, { "epoch": 1.307639366827254, "grad_norm": 1.3640356063842773, "learning_rate": 1.785524215007983e-05, "loss": 0.9632, "step": 3800 }, { "epoch": 1.3248451479697179, "grad_norm": 1.049970030784607, "learning_rate": 1.7411743835373426e-05, "loss": 0.9223, "step": 3850 }, { "epoch": 1.3420509291121818, "grad_norm": 1.2103326320648193, "learning_rate": 1.6968245520667024e-05, "loss": 0.9937, "step": 3900 }, { "epoch": 1.3592567102546456, "grad_norm": 1.09535551071167, "learning_rate": 1.6524747205960618e-05, "loss": 0.9386, "step": 3950 }, { "epoch": 1.3764624913971093, "grad_norm": 1.1930817365646362, "learning_rate": 1.6081248891254212e-05, "loss": 0.9157, "step": 4000 }, { "epoch": 1.3764624913971093, "eval_loss": 1.3664780855178833, "eval_runtime": 88.454, "eval_samples_per_second": 8.298, "eval_steps_per_second": 1.662, "step": 4000 }, { "epoch": 1.3936682725395733, "grad_norm": 1.2864257097244263, "learning_rate": 1.563775057654781e-05, "loss": 0.946, "step": 4050 }, { "epoch": 1.4108740536820372, "grad_norm": 1.079478144645691, "learning_rate": 1.5194252261841404e-05, "loss": 0.8982, "step": 4100 }, { "epoch": 1.428079834824501, "grad_norm": 1.1406025886535645, "learning_rate": 1.4750753947135002e-05, "loss": 0.9076, "step": 4150 }, { "epoch": 1.445285615966965, "grad_norm": 1.044668436050415, "learning_rate": 1.4307255632428598e-05, "loss": 0.9332, "step": 4200 }, { "epoch": 1.4624913971094289, "grad_norm": 1.1297789812088013, "learning_rate": 1.3863757317722192e-05, "loss": 0.9203, "step": 4250 }, { "epoch": 1.4796971782518926, "grad_norm": 1.0997716188430786, "learning_rate": 1.342025900301579e-05, "loss": 0.9333, "step": 4300 }, { "epoch": 1.4969029593943566, "grad_norm": 1.374189853668213, "learning_rate": 1.2976760688309386e-05, "loss": 0.9235, "step": 4350 }, { "epoch": 1.5141087405368203, "grad_norm": 1.198410153388977, "learning_rate": 1.253326237360298e-05, "loss": 0.8935, "step": 4400 }, { "epoch": 1.5313145216792843, "grad_norm": 1.1307648420333862, "learning_rate": 1.2089764058896576e-05, "loss": 0.941, "step": 4450 }, { "epoch": 1.548520302821748, "grad_norm": 1.1348686218261719, "learning_rate": 1.1646265744190172e-05, "loss": 0.998, "step": 4500 }, { "epoch": 1.565726083964212, "grad_norm": 1.3584885597229004, "learning_rate": 1.1202767429483768e-05, "loss": 0.9083, "step": 4550 }, { "epoch": 1.582931865106676, "grad_norm": 1.2940605878829956, "learning_rate": 1.0759269114777365e-05, "loss": 0.9553, "step": 4600 }, { "epoch": 1.6001376462491397, "grad_norm": 1.2229249477386475, "learning_rate": 1.031577080007096e-05, "loss": 0.939, "step": 4650 }, { "epoch": 1.6173434273916034, "grad_norm": 1.356358528137207, "learning_rate": 9.872272485364557e-06, "loss": 0.9297, "step": 4700 }, { "epoch": 1.6345492085340676, "grad_norm": 0.916768491268158, "learning_rate": 9.428774170658151e-06, "loss": 0.9648, "step": 4750 }, { "epoch": 1.6517549896765313, "grad_norm": 1.5708458423614502, "learning_rate": 8.985275855951749e-06, "loss": 0.9427, "step": 4800 }, { "epoch": 1.668960770818995, "grad_norm": 0.9711500406265259, "learning_rate": 8.541777541245345e-06, "loss": 0.9696, "step": 4850 }, { "epoch": 1.686166551961459, "grad_norm": 1.191889762878418, "learning_rate": 8.098279226538939e-06, "loss": 0.932, "step": 4900 }, { "epoch": 1.703372333103923, "grad_norm": 1.3063703775405884, "learning_rate": 7.654780911832535e-06, "loss": 0.933, "step": 4950 }, { "epoch": 1.7205781142463867, "grad_norm": 1.094841718673706, "learning_rate": 7.211282597126132e-06, "loss": 0.9449, "step": 5000 }, { "epoch": 1.7205781142463867, "eval_loss": 1.3542050123214722, "eval_runtime": 88.544, "eval_samples_per_second": 8.29, "eval_steps_per_second": 1.66, "step": 5000 }, { "epoch": 1.7377838953888507, "grad_norm": 1.335047960281372, "learning_rate": 6.767784282419727e-06, "loss": 0.9855, "step": 5050 }, { "epoch": 1.7549896765313147, "grad_norm": 1.2741748094558716, "learning_rate": 6.324285967713322e-06, "loss": 0.9373, "step": 5100 }, { "epoch": 1.7721954576737784, "grad_norm": 1.4646645784378052, "learning_rate": 5.880787653006919e-06, "loss": 0.9481, "step": 5150 }, { "epoch": 1.7894012388162421, "grad_norm": 1.2502957582473755, "learning_rate": 5.437289338300515e-06, "loss": 0.9703, "step": 5200 }, { "epoch": 1.806607019958706, "grad_norm": 1.335482120513916, "learning_rate": 4.99379102359411e-06, "loss": 0.9671, "step": 5250 }, { "epoch": 1.82381280110117, "grad_norm": 1.1999086141586304, "learning_rate": 4.550292708887706e-06, "loss": 0.9621, "step": 5300 }, { "epoch": 1.8410185822436338, "grad_norm": 1.1440843343734741, "learning_rate": 4.106794394181302e-06, "loss": 0.9651, "step": 5350 }, { "epoch": 1.8582243633860978, "grad_norm": 1.1589412689208984, "learning_rate": 3.6632960794748983e-06, "loss": 0.9231, "step": 5400 }, { "epoch": 1.8754301445285617, "grad_norm": 1.1917223930358887, "learning_rate": 3.219797764768494e-06, "loss": 1.0207, "step": 5450 }, { "epoch": 1.8926359256710255, "grad_norm": 1.1760448217391968, "learning_rate": 2.77629945006209e-06, "loss": 0.9558, "step": 5500 }, { "epoch": 1.9098417068134892, "grad_norm": 1.25649893283844, "learning_rate": 2.3328011353556856e-06, "loss": 0.9408, "step": 5550 }, { "epoch": 1.9270474879559532, "grad_norm": 1.1310392618179321, "learning_rate": 1.8893028206492816e-06, "loss": 0.9771, "step": 5600 }, { "epoch": 1.9442532690984171, "grad_norm": 0.9325273036956787, "learning_rate": 1.4458045059428774e-06, "loss": 0.9757, "step": 5650 }, { "epoch": 1.9614590502408809, "grad_norm": 1.2769732475280762, "learning_rate": 1.0023061912364732e-06, "loss": 0.9719, "step": 5700 }, { "epoch": 1.9786648313833448, "grad_norm": 1.5237048864364624, "learning_rate": 5.588078765300692e-07, "loss": 0.9612, "step": 5750 }, { "epoch": 1.9958706125258088, "grad_norm": 1.3760077953338623, "learning_rate": 1.1530956182366508e-07, "loss": 0.962, "step": 5800 } ], "logging_steps": 50, "max_steps": 5812, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 3000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 8, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.144678634728463e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }