|
{ |
|
"best_global_step": 2000, |
|
"best_metric": 1.3367302417755127, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 1000, |
|
"global_step": 5812, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.017205781142463867, |
|
"grad_norm": 0.9484581351280212, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 1.1583, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.034411562284927734, |
|
"grad_norm": 1.0050328969955444, |
|
"learning_rate": 2.8285714285714287e-05, |
|
"loss": 0.9511, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.051617343427391604, |
|
"grad_norm": 1.2752665281295776, |
|
"learning_rate": 4.257142857142857e-05, |
|
"loss": 0.8969, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06882312456985547, |
|
"grad_norm": 1.1626160144805908, |
|
"learning_rate": 4.978712080894093e-05, |
|
"loss": 0.8897, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08602890571231935, |
|
"grad_norm": 1.154097080230713, |
|
"learning_rate": 4.934362249423453e-05, |
|
"loss": 0.909, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.10323468685478321, |
|
"grad_norm": 1.112544298171997, |
|
"learning_rate": 4.890012417952812e-05, |
|
"loss": 0.8691, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.12044046799724707, |
|
"grad_norm": 1.2551391124725342, |
|
"learning_rate": 4.845662586482171e-05, |
|
"loss": 0.8942, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.13764624913971094, |
|
"grad_norm": 1.0854212045669556, |
|
"learning_rate": 4.801312755011531e-05, |
|
"loss": 0.8984, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1548520302821748, |
|
"grad_norm": 1.198875904083252, |
|
"learning_rate": 4.7569629235408906e-05, |
|
"loss": 0.8765, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1720578114246387, |
|
"grad_norm": 1.3993616104125977, |
|
"learning_rate": 4.7126130920702504e-05, |
|
"loss": 0.9302, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.18926359256710254, |
|
"grad_norm": 1.1944853067398071, |
|
"learning_rate": 4.66826326059961e-05, |
|
"loss": 0.9117, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.20646937370956642, |
|
"grad_norm": 1.3922518491744995, |
|
"learning_rate": 4.623913429128969e-05, |
|
"loss": 0.9291, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2236751548520303, |
|
"grad_norm": 1.1352382898330688, |
|
"learning_rate": 4.579563597658329e-05, |
|
"loss": 0.9108, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.24088093599449414, |
|
"grad_norm": 1.2245187759399414, |
|
"learning_rate": 4.535213766187689e-05, |
|
"loss": 0.9009, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.258086717136958, |
|
"grad_norm": 1.303236961364746, |
|
"learning_rate": 4.4908639347170486e-05, |
|
"loss": 0.9326, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.27529249827942187, |
|
"grad_norm": 1.1667600870132446, |
|
"learning_rate": 4.4465141032464084e-05, |
|
"loss": 0.8955, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2924982794218858, |
|
"grad_norm": 1.166150689125061, |
|
"learning_rate": 4.4021642717757675e-05, |
|
"loss": 0.918, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.3097040605643496, |
|
"grad_norm": 1.330957055091858, |
|
"learning_rate": 4.3578144403051266e-05, |
|
"loss": 0.9135, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3269098417068135, |
|
"grad_norm": 1.064183235168457, |
|
"learning_rate": 4.313464608834486e-05, |
|
"loss": 0.946, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.3441156228492774, |
|
"grad_norm": 1.189034342765808, |
|
"learning_rate": 4.269114777363846e-05, |
|
"loss": 0.9749, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3441156228492774, |
|
"eval_loss": 1.4126514196395874, |
|
"eval_runtime": 93.1504, |
|
"eval_samples_per_second": 7.88, |
|
"eval_steps_per_second": 1.578, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.36132140399174123, |
|
"grad_norm": 1.0131909847259521, |
|
"learning_rate": 4.224764945893206e-05, |
|
"loss": 0.9431, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3785271851342051, |
|
"grad_norm": 1.244815707206726, |
|
"learning_rate": 4.1804151144225656e-05, |
|
"loss": 1.0062, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.395732966276669, |
|
"grad_norm": 0.9802664518356323, |
|
"learning_rate": 4.136065282951925e-05, |
|
"loss": 1.0486, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.41293874741913283, |
|
"grad_norm": 0.9956826567649841, |
|
"learning_rate": 4.0917154514812845e-05, |
|
"loss": 0.9844, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.4301445285615967, |
|
"grad_norm": 1.0612707138061523, |
|
"learning_rate": 4.047365620010644e-05, |
|
"loss": 1.0091, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.4473503097040606, |
|
"grad_norm": 1.0374691486358643, |
|
"learning_rate": 4.003015788540004e-05, |
|
"loss": 1.0007, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.46455609084652444, |
|
"grad_norm": 1.2885727882385254, |
|
"learning_rate": 3.958665957069364e-05, |
|
"loss": 0.9555, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.4817618719889883, |
|
"grad_norm": 1.1126559972763062, |
|
"learning_rate": 3.914316125598723e-05, |
|
"loss": 1.0084, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.4989676531314522, |
|
"grad_norm": 1.1720352172851562, |
|
"learning_rate": 3.869966294128082e-05, |
|
"loss": 0.9789, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.516173434273916, |
|
"grad_norm": 1.0672069787979126, |
|
"learning_rate": 3.825616462657442e-05, |
|
"loss": 1.0046, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5333792154163799, |
|
"grad_norm": 1.0774304866790771, |
|
"learning_rate": 3.7812666311868016e-05, |
|
"loss": 0.9957, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.5505849965588437, |
|
"grad_norm": 1.1217703819274902, |
|
"learning_rate": 3.736916799716161e-05, |
|
"loss": 1.0262, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.5677907777013076, |
|
"grad_norm": 1.037427544593811, |
|
"learning_rate": 3.692566968245521e-05, |
|
"loss": 1.0392, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.5849965588437716, |
|
"grad_norm": 1.1435151100158691, |
|
"learning_rate": 3.64821713677488e-05, |
|
"loss": 1.0341, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.6022023399862354, |
|
"grad_norm": 1.0763335227966309, |
|
"learning_rate": 3.60386730530424e-05, |
|
"loss": 1.0967, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.6194081211286993, |
|
"grad_norm": 0.9355671405792236, |
|
"learning_rate": 3.5595174738336e-05, |
|
"loss": 1.0745, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6366139022711631, |
|
"grad_norm": 1.1394997835159302, |
|
"learning_rate": 3.5151676423629595e-05, |
|
"loss": 1.0604, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.653819683413627, |
|
"grad_norm": 0.8916899561882019, |
|
"learning_rate": 3.4708178108923186e-05, |
|
"loss": 1.0343, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.6710254645560908, |
|
"grad_norm": 1.0358752012252808, |
|
"learning_rate": 3.4264679794216784e-05, |
|
"loss": 1.1059, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.6882312456985548, |
|
"grad_norm": 1.090041160583496, |
|
"learning_rate": 3.3821181479510375e-05, |
|
"loss": 1.0659, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.6882312456985548, |
|
"eval_loss": 1.3367302417755127, |
|
"eval_runtime": 88.2266, |
|
"eval_samples_per_second": 8.319, |
|
"eval_steps_per_second": 1.666, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7054370268410186, |
|
"grad_norm": 1.0861823558807373, |
|
"learning_rate": 3.337768316480397e-05, |
|
"loss": 1.0877, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.7226428079834825, |
|
"grad_norm": 1.1247456073760986, |
|
"learning_rate": 3.293418485009757e-05, |
|
"loss": 1.0547, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.7398485891259463, |
|
"grad_norm": 0.9415215253829956, |
|
"learning_rate": 3.249068653539117e-05, |
|
"loss": 1.1066, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.7570543702684102, |
|
"grad_norm": 1.0296528339385986, |
|
"learning_rate": 3.2047188220684766e-05, |
|
"loss": 1.1037, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.774260151410874, |
|
"grad_norm": 1.0104950666427612, |
|
"learning_rate": 3.1603689905978357e-05, |
|
"loss": 1.1236, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.791465932553338, |
|
"grad_norm": 1.1287598609924316, |
|
"learning_rate": 3.1160191591271954e-05, |
|
"loss": 1.0993, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.8086717136958018, |
|
"grad_norm": 0.9809098839759827, |
|
"learning_rate": 3.071669327656555e-05, |
|
"loss": 1.1639, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.8258774948382657, |
|
"grad_norm": 0.9065077304840088, |
|
"learning_rate": 3.0273194961859146e-05, |
|
"loss": 1.0698, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.8430832759807295, |
|
"grad_norm": 0.8680762648582458, |
|
"learning_rate": 2.9829696647152744e-05, |
|
"loss": 1.1864, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.8602890571231934, |
|
"grad_norm": 1.0787030458450317, |
|
"learning_rate": 2.9386198332446342e-05, |
|
"loss": 1.1552, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.8774948382656572, |
|
"grad_norm": 1.146647572517395, |
|
"learning_rate": 2.8942700017739933e-05, |
|
"loss": 1.2046, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.8947006194081212, |
|
"grad_norm": 1.0998808145523071, |
|
"learning_rate": 2.849920170303353e-05, |
|
"loss": 1.1504, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.911906400550585, |
|
"grad_norm": 0.9578316807746887, |
|
"learning_rate": 2.8055703388327125e-05, |
|
"loss": 1.1814, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.9291121816930489, |
|
"grad_norm": 0.8427146077156067, |
|
"learning_rate": 2.7612205073620722e-05, |
|
"loss": 1.1914, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.9463179628355127, |
|
"grad_norm": 1.0207325220108032, |
|
"learning_rate": 2.716870675891432e-05, |
|
"loss": 1.1949, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.9635237439779766, |
|
"grad_norm": 0.8344902992248535, |
|
"learning_rate": 2.672520844420791e-05, |
|
"loss": 1.1986, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.9807295251204404, |
|
"grad_norm": 1.0717881917953491, |
|
"learning_rate": 2.628171012950151e-05, |
|
"loss": 1.2419, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.9979353062629044, |
|
"grad_norm": 1.0195544958114624, |
|
"learning_rate": 2.5838211814795103e-05, |
|
"loss": 1.1612, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.0151410874053681, |
|
"grad_norm": 0.899029552936554, |
|
"learning_rate": 2.53947135000887e-05, |
|
"loss": 0.9398, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.032346868547832, |
|
"grad_norm": 1.1363555192947388, |
|
"learning_rate": 2.4951215185382295e-05, |
|
"loss": 0.8889, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.032346868547832, |
|
"eval_loss": 1.3666342496871948, |
|
"eval_runtime": 88.3327, |
|
"eval_samples_per_second": 8.309, |
|
"eval_steps_per_second": 1.664, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.049552649690296, |
|
"grad_norm": 1.237770438194275, |
|
"learning_rate": 2.4507716870675893e-05, |
|
"loss": 0.8746, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.0667584308327598, |
|
"grad_norm": 1.1103781461715698, |
|
"learning_rate": 2.406421855596949e-05, |
|
"loss": 0.8944, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.0839642119752237, |
|
"grad_norm": 1.2465671300888062, |
|
"learning_rate": 2.3620720241263085e-05, |
|
"loss": 0.9278, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.1011699931176875, |
|
"grad_norm": 1.304677963256836, |
|
"learning_rate": 2.317722192655668e-05, |
|
"loss": 0.8646, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.1183757742601514, |
|
"grad_norm": 1.1867053508758545, |
|
"learning_rate": 2.2733723611850277e-05, |
|
"loss": 0.8941, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.1355815554026152, |
|
"grad_norm": 1.0053008794784546, |
|
"learning_rate": 2.229022529714387e-05, |
|
"loss": 0.9093, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.1527873365450791, |
|
"grad_norm": 1.0691931247711182, |
|
"learning_rate": 2.184672698243747e-05, |
|
"loss": 0.8739, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.169993117687543, |
|
"grad_norm": 1.2377207279205322, |
|
"learning_rate": 2.1403228667731063e-05, |
|
"loss": 0.9417, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.1871988988300068, |
|
"grad_norm": 1.21890127658844, |
|
"learning_rate": 2.0959730353024658e-05, |
|
"loss": 0.8924, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.2044046799724708, |
|
"grad_norm": 1.099365472793579, |
|
"learning_rate": 2.0516232038318255e-05, |
|
"loss": 0.9046, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.2216104611149345, |
|
"grad_norm": 1.188915729522705, |
|
"learning_rate": 2.007273372361185e-05, |
|
"loss": 0.9054, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 1.2388162422573985, |
|
"grad_norm": 1.258689522743225, |
|
"learning_rate": 1.9629235408905447e-05, |
|
"loss": 0.9272, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.2560220233998622, |
|
"grad_norm": 1.1899304389953613, |
|
"learning_rate": 1.9185737094199045e-05, |
|
"loss": 0.9213, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.2732278045423262, |
|
"grad_norm": 1.2941292524337769, |
|
"learning_rate": 1.874223877949264e-05, |
|
"loss": 0.9083, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.2904335856847902, |
|
"grad_norm": 1.148829460144043, |
|
"learning_rate": 1.8298740464786234e-05, |
|
"loss": 0.9208, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.307639366827254, |
|
"grad_norm": 1.3640356063842773, |
|
"learning_rate": 1.785524215007983e-05, |
|
"loss": 0.9632, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.3248451479697179, |
|
"grad_norm": 1.049970030784607, |
|
"learning_rate": 1.7411743835373426e-05, |
|
"loss": 0.9223, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.3420509291121818, |
|
"grad_norm": 1.2103326320648193, |
|
"learning_rate": 1.6968245520667024e-05, |
|
"loss": 0.9937, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.3592567102546456, |
|
"grad_norm": 1.09535551071167, |
|
"learning_rate": 1.6524747205960618e-05, |
|
"loss": 0.9386, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.3764624913971093, |
|
"grad_norm": 1.1930817365646362, |
|
"learning_rate": 1.6081248891254212e-05, |
|
"loss": 0.9157, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.3764624913971093, |
|
"eval_loss": 1.3664780855178833, |
|
"eval_runtime": 88.454, |
|
"eval_samples_per_second": 8.298, |
|
"eval_steps_per_second": 1.662, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.3936682725395733, |
|
"grad_norm": 1.2864257097244263, |
|
"learning_rate": 1.563775057654781e-05, |
|
"loss": 0.946, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.4108740536820372, |
|
"grad_norm": 1.079478144645691, |
|
"learning_rate": 1.5194252261841404e-05, |
|
"loss": 0.8982, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.428079834824501, |
|
"grad_norm": 1.1406025886535645, |
|
"learning_rate": 1.4750753947135002e-05, |
|
"loss": 0.9076, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.445285615966965, |
|
"grad_norm": 1.044668436050415, |
|
"learning_rate": 1.4307255632428598e-05, |
|
"loss": 0.9332, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.4624913971094289, |
|
"grad_norm": 1.1297789812088013, |
|
"learning_rate": 1.3863757317722192e-05, |
|
"loss": 0.9203, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.4796971782518926, |
|
"grad_norm": 1.0997716188430786, |
|
"learning_rate": 1.342025900301579e-05, |
|
"loss": 0.9333, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.4969029593943566, |
|
"grad_norm": 1.374189853668213, |
|
"learning_rate": 1.2976760688309386e-05, |
|
"loss": 0.9235, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.5141087405368203, |
|
"grad_norm": 1.198410153388977, |
|
"learning_rate": 1.253326237360298e-05, |
|
"loss": 0.8935, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.5313145216792843, |
|
"grad_norm": 1.1307648420333862, |
|
"learning_rate": 1.2089764058896576e-05, |
|
"loss": 0.941, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 1.548520302821748, |
|
"grad_norm": 1.1348686218261719, |
|
"learning_rate": 1.1646265744190172e-05, |
|
"loss": 0.998, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.565726083964212, |
|
"grad_norm": 1.3584885597229004, |
|
"learning_rate": 1.1202767429483768e-05, |
|
"loss": 0.9083, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 1.582931865106676, |
|
"grad_norm": 1.2940605878829956, |
|
"learning_rate": 1.0759269114777365e-05, |
|
"loss": 0.9553, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.6001376462491397, |
|
"grad_norm": 1.2229249477386475, |
|
"learning_rate": 1.031577080007096e-05, |
|
"loss": 0.939, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 1.6173434273916034, |
|
"grad_norm": 1.356358528137207, |
|
"learning_rate": 9.872272485364557e-06, |
|
"loss": 0.9297, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.6345492085340676, |
|
"grad_norm": 0.916768491268158, |
|
"learning_rate": 9.428774170658151e-06, |
|
"loss": 0.9648, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 1.6517549896765313, |
|
"grad_norm": 1.5708458423614502, |
|
"learning_rate": 8.985275855951749e-06, |
|
"loss": 0.9427, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.668960770818995, |
|
"grad_norm": 0.9711500406265259, |
|
"learning_rate": 8.541777541245345e-06, |
|
"loss": 0.9696, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 1.686166551961459, |
|
"grad_norm": 1.191889762878418, |
|
"learning_rate": 8.098279226538939e-06, |
|
"loss": 0.932, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.703372333103923, |
|
"grad_norm": 1.3063703775405884, |
|
"learning_rate": 7.654780911832535e-06, |
|
"loss": 0.933, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 1.7205781142463867, |
|
"grad_norm": 1.094841718673706, |
|
"learning_rate": 7.211282597126132e-06, |
|
"loss": 0.9449, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.7205781142463867, |
|
"eval_loss": 1.3542050123214722, |
|
"eval_runtime": 88.544, |
|
"eval_samples_per_second": 8.29, |
|
"eval_steps_per_second": 1.66, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.7377838953888507, |
|
"grad_norm": 1.335047960281372, |
|
"learning_rate": 6.767784282419727e-06, |
|
"loss": 0.9855, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 1.7549896765313147, |
|
"grad_norm": 1.2741748094558716, |
|
"learning_rate": 6.324285967713322e-06, |
|
"loss": 0.9373, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.7721954576737784, |
|
"grad_norm": 1.4646645784378052, |
|
"learning_rate": 5.880787653006919e-06, |
|
"loss": 0.9481, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.7894012388162421, |
|
"grad_norm": 1.2502957582473755, |
|
"learning_rate": 5.437289338300515e-06, |
|
"loss": 0.9703, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.806607019958706, |
|
"grad_norm": 1.335482120513916, |
|
"learning_rate": 4.99379102359411e-06, |
|
"loss": 0.9671, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.82381280110117, |
|
"grad_norm": 1.1999086141586304, |
|
"learning_rate": 4.550292708887706e-06, |
|
"loss": 0.9621, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.8410185822436338, |
|
"grad_norm": 1.1440843343734741, |
|
"learning_rate": 4.106794394181302e-06, |
|
"loss": 0.9651, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.8582243633860978, |
|
"grad_norm": 1.1589412689208984, |
|
"learning_rate": 3.6632960794748983e-06, |
|
"loss": 0.9231, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.8754301445285617, |
|
"grad_norm": 1.1917223930358887, |
|
"learning_rate": 3.219797764768494e-06, |
|
"loss": 1.0207, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.8926359256710255, |
|
"grad_norm": 1.1760448217391968, |
|
"learning_rate": 2.77629945006209e-06, |
|
"loss": 0.9558, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.9098417068134892, |
|
"grad_norm": 1.25649893283844, |
|
"learning_rate": 2.3328011353556856e-06, |
|
"loss": 0.9408, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.9270474879559532, |
|
"grad_norm": 1.1310392618179321, |
|
"learning_rate": 1.8893028206492816e-06, |
|
"loss": 0.9771, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.9442532690984171, |
|
"grad_norm": 0.9325273036956787, |
|
"learning_rate": 1.4458045059428774e-06, |
|
"loss": 0.9757, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 1.9614590502408809, |
|
"grad_norm": 1.2769732475280762, |
|
"learning_rate": 1.0023061912364732e-06, |
|
"loss": 0.9719, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.9786648313833448, |
|
"grad_norm": 1.5237048864364624, |
|
"learning_rate": 5.588078765300692e-07, |
|
"loss": 0.9612, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 1.9958706125258088, |
|
"grad_norm": 1.3760077953338623, |
|
"learning_rate": 1.1530956182366508e-07, |
|
"loss": 0.962, |
|
"step": 5800 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 5812, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 3000, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 8, |
|
"early_stopping_threshold": 0.01 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 3 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.144678634728463e+18, |
|
"train_batch_size": 5, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|