{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.4652504589562025, "eval_steps": 3001, "global_step": 14100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017484045808200017, "grad_norm": 22.684425354003906, "learning_rate": 3.2558139534883724e-06, "loss": 3.8911, "step": 10 }, { "epoch": 0.0034968091616400035, "grad_norm": 12.553377151489258, "learning_rate": 7.906976744186048e-06, "loss": 2.9776, "step": 20 }, { "epoch": 0.005245213742460005, "grad_norm": 12.756866455078125, "learning_rate": 1.2558139534883723e-05, "loss": 2.543, "step": 30 }, { "epoch": 0.006993618323280007, "grad_norm": 11.82550048828125, "learning_rate": 1.7209302325581396e-05, "loss": 2.1148, "step": 40 }, { "epoch": 0.008742022904100009, "grad_norm": 10.381027221679688, "learning_rate": 2.186046511627907e-05, "loss": 1.924, "step": 50 }, { "epoch": 0.01049042748492001, "grad_norm": 13.786709785461426, "learning_rate": 2.6511627906976743e-05, "loss": 1.7809, "step": 60 }, { "epoch": 0.012238832065740012, "grad_norm": 9.527298927307129, "learning_rate": 3.116279069767442e-05, "loss": 1.5543, "step": 70 }, { "epoch": 0.013987236646560014, "grad_norm": 9.207590103149414, "learning_rate": 3.58139534883721e-05, "loss": 1.4604, "step": 80 }, { "epoch": 0.015735641227380016, "grad_norm": 7.972263336181641, "learning_rate": 3.999765684494172e-05, "loss": 1.3258, "step": 90 }, { "epoch": 0.017484045808200017, "grad_norm": 6.906246662139893, "learning_rate": 3.997422529435886e-05, "loss": 1.3031, "step": 100 }, { "epoch": 0.01923245038902002, "grad_norm": 8.441957473754883, "learning_rate": 3.9950793743776e-05, "loss": 1.2696, "step": 110 }, { "epoch": 0.02098085496984002, "grad_norm": 8.918852806091309, "learning_rate": 3.992736219319314e-05, "loss": 1.2792, "step": 120 }, { "epoch": 0.022729259550660023, "grad_norm": 8.715033531188965, "learning_rate": 3.990393064261028e-05, "loss": 1.1793, "step": 130 }, { "epoch": 0.024477664131480024, "grad_norm": 9.690086364746094, "learning_rate": 3.9880499092027415e-05, "loss": 1.2385, "step": 140 }, { "epoch": 0.026226068712300026, "grad_norm": 6.637733459472656, "learning_rate": 3.985706754144456e-05, "loss": 1.1732, "step": 150 }, { "epoch": 0.027974473293120028, "grad_norm": 11.725113868713379, "learning_rate": 3.9833635990861696e-05, "loss": 1.1724, "step": 160 }, { "epoch": 0.02972287787394003, "grad_norm": 10.116538047790527, "learning_rate": 3.9810204440278834e-05, "loss": 1.0775, "step": 170 }, { "epoch": 0.03147128245476003, "grad_norm": 7.076519966125488, "learning_rate": 3.978677288969598e-05, "loss": 1.0104, "step": 180 }, { "epoch": 0.03321968703558004, "grad_norm": 8.93696117401123, "learning_rate": 3.9763341339113115e-05, "loss": 1.1093, "step": 190 }, { "epoch": 0.034968091616400035, "grad_norm": 7.459012985229492, "learning_rate": 3.973990978853026e-05, "loss": 0.9626, "step": 200 }, { "epoch": 0.03671649619722004, "grad_norm": 7.383749961853027, "learning_rate": 3.9716478237947396e-05, "loss": 0.9835, "step": 210 }, { "epoch": 0.03846490077804004, "grad_norm": 6.915167808532715, "learning_rate": 3.969304668736454e-05, "loss": 0.9152, "step": 220 }, { "epoch": 0.040213305358860044, "grad_norm": 6.553030014038086, "learning_rate": 3.966961513678168e-05, "loss": 0.9545, "step": 230 }, { "epoch": 0.04196170993968004, "grad_norm": 7.936176776885986, "learning_rate": 3.964618358619882e-05, "loss": 0.9771, "step": 240 }, { "epoch": 0.04371011452050005, "grad_norm": 8.945891380310059, "learning_rate": 3.962275203561596e-05, "loss": 0.9664, "step": 250 }, { "epoch": 0.045458519101320045, "grad_norm": 7.30666971206665, "learning_rate": 3.95993204850331e-05, "loss": 0.8576, "step": 260 }, { "epoch": 0.04720692368214005, "grad_norm": 6.691559791564941, "learning_rate": 3.957588893445024e-05, "loss": 0.8771, "step": 270 }, { "epoch": 0.04895532826296005, "grad_norm": 6.786176681518555, "learning_rate": 3.9552457383867377e-05, "loss": 0.9918, "step": 280 }, { "epoch": 0.050703732843780054, "grad_norm": 8.400111198425293, "learning_rate": 3.952902583328452e-05, "loss": 0.9256, "step": 290 }, { "epoch": 0.05245213742460005, "grad_norm": 6.029471397399902, "learning_rate": 3.950559428270166e-05, "loss": 0.8877, "step": 300 }, { "epoch": 0.05420054200542006, "grad_norm": 5.6731276512146, "learning_rate": 3.94821627321188e-05, "loss": 0.7843, "step": 310 }, { "epoch": 0.055948946586240056, "grad_norm": 5.914205074310303, "learning_rate": 3.945873118153594e-05, "loss": 0.8157, "step": 320 }, { "epoch": 0.05769735116706006, "grad_norm": 5.39005708694458, "learning_rate": 3.943529963095308e-05, "loss": 0.8165, "step": 330 }, { "epoch": 0.05944575574788006, "grad_norm": 8.598004341125488, "learning_rate": 3.941186808037022e-05, "loss": 0.8669, "step": 340 }, { "epoch": 0.061194160328700065, "grad_norm": 6.7799763679504395, "learning_rate": 3.9388436529787364e-05, "loss": 0.8062, "step": 350 }, { "epoch": 0.06294256490952006, "grad_norm": 6.09038782119751, "learning_rate": 3.93650049792045e-05, "loss": 0.7334, "step": 360 }, { "epoch": 0.06469096949034006, "grad_norm": 6.056455612182617, "learning_rate": 3.9341573428621645e-05, "loss": 0.8423, "step": 370 }, { "epoch": 0.06643937407116007, "grad_norm": 7.564798355102539, "learning_rate": 3.931814187803878e-05, "loss": 0.7508, "step": 380 }, { "epoch": 0.06818777865198007, "grad_norm": 6.5860490798950195, "learning_rate": 3.929471032745592e-05, "loss": 0.7571, "step": 390 }, { "epoch": 0.06993618323280007, "grad_norm": 5.904583930969238, "learning_rate": 3.9271278776873064e-05, "loss": 0.8628, "step": 400 }, { "epoch": 0.07168458781362007, "grad_norm": 9.70653247833252, "learning_rate": 3.92478472262902e-05, "loss": 0.8791, "step": 410 }, { "epoch": 0.07343299239444008, "grad_norm": 6.332978248596191, "learning_rate": 3.9224415675707345e-05, "loss": 0.7527, "step": 420 }, { "epoch": 0.07518139697526008, "grad_norm": 5.800631523132324, "learning_rate": 3.920098412512448e-05, "loss": 0.7262, "step": 430 }, { "epoch": 0.07692980155608008, "grad_norm": 6.9981489181518555, "learning_rate": 3.9177552574541626e-05, "loss": 0.8282, "step": 440 }, { "epoch": 0.07867820613690008, "grad_norm": 7.195876121520996, "learning_rate": 3.915412102395876e-05, "loss": 0.8676, "step": 450 }, { "epoch": 0.08042661071772009, "grad_norm": 4.58298397064209, "learning_rate": 3.913068947337591e-05, "loss": 0.6943, "step": 460 }, { "epoch": 0.08217501529854009, "grad_norm": 7.165435314178467, "learning_rate": 3.9107257922793044e-05, "loss": 0.6899, "step": 470 }, { "epoch": 0.08392341987936008, "grad_norm": 5.472495079040527, "learning_rate": 3.908382637221019e-05, "loss": 0.7777, "step": 480 }, { "epoch": 0.08567182446018008, "grad_norm": 5.845532417297363, "learning_rate": 3.9060394821627325e-05, "loss": 0.7208, "step": 490 }, { "epoch": 0.0874202290410001, "grad_norm": 6.358067989349365, "learning_rate": 3.903696327104446e-05, "loss": 0.7331, "step": 500 }, { "epoch": 0.08916863362182009, "grad_norm": 7.398125648498535, "learning_rate": 3.901353172046161e-05, "loss": 0.7571, "step": 510 }, { "epoch": 0.09091703820264009, "grad_norm": 8.365299224853516, "learning_rate": 3.8990100169878744e-05, "loss": 0.7833, "step": 520 }, { "epoch": 0.09266544278346009, "grad_norm": 6.759946823120117, "learning_rate": 3.896666861929589e-05, "loss": 0.713, "step": 530 }, { "epoch": 0.0944138473642801, "grad_norm": 5.2622270584106445, "learning_rate": 3.8943237068713025e-05, "loss": 0.69, "step": 540 }, { "epoch": 0.0961622519451001, "grad_norm": 5.143499851226807, "learning_rate": 3.891980551813017e-05, "loss": 0.6665, "step": 550 }, { "epoch": 0.0979106565259201, "grad_norm": 6.446269989013672, "learning_rate": 3.8896373967547306e-05, "loss": 0.6641, "step": 560 }, { "epoch": 0.0996590611067401, "grad_norm": 5.429083347320557, "learning_rate": 3.887294241696445e-05, "loss": 0.6616, "step": 570 }, { "epoch": 0.10140746568756011, "grad_norm": 6.426352500915527, "learning_rate": 3.884951086638159e-05, "loss": 0.7137, "step": 580 }, { "epoch": 0.1031558702683801, "grad_norm": 6.015476226806641, "learning_rate": 3.882607931579873e-05, "loss": 0.7473, "step": 590 }, { "epoch": 0.1049042748492001, "grad_norm": 5.670246601104736, "learning_rate": 3.880264776521587e-05, "loss": 0.6374, "step": 600 }, { "epoch": 0.1066526794300201, "grad_norm": 6.190732479095459, "learning_rate": 3.8779216214633006e-05, "loss": 0.6213, "step": 610 }, { "epoch": 0.10840108401084012, "grad_norm": 5.9032793045043945, "learning_rate": 3.875578466405014e-05, "loss": 0.6196, "step": 620 }, { "epoch": 0.11014948859166011, "grad_norm": 5.00473690032959, "learning_rate": 3.873235311346729e-05, "loss": 0.6422, "step": 630 }, { "epoch": 0.11189789317248011, "grad_norm": 4.598703384399414, "learning_rate": 3.8708921562884424e-05, "loss": 0.6302, "step": 640 }, { "epoch": 0.11364629775330011, "grad_norm": 5.970333099365234, "learning_rate": 3.868549001230157e-05, "loss": 0.6547, "step": 650 }, { "epoch": 0.11539470233412012, "grad_norm": 5.355820655822754, "learning_rate": 3.8662058461718705e-05, "loss": 0.6128, "step": 660 }, { "epoch": 0.11714310691494012, "grad_norm": 5.05305290222168, "learning_rate": 3.863862691113584e-05, "loss": 0.6536, "step": 670 }, { "epoch": 0.11889151149576012, "grad_norm": 4.703378200531006, "learning_rate": 3.8615195360552986e-05, "loss": 0.576, "step": 680 }, { "epoch": 0.12063991607658012, "grad_norm": 5.541085720062256, "learning_rate": 3.8591763809970124e-05, "loss": 0.6727, "step": 690 }, { "epoch": 0.12238832065740013, "grad_norm": 4.993374824523926, "learning_rate": 3.856833225938727e-05, "loss": 0.6802, "step": 700 }, { "epoch": 0.12413672523822013, "grad_norm": 4.485571384429932, "learning_rate": 3.8544900708804405e-05, "loss": 0.6061, "step": 710 }, { "epoch": 0.12588512981904013, "grad_norm": 6.4693756103515625, "learning_rate": 3.852146915822155e-05, "loss": 0.6032, "step": 720 }, { "epoch": 0.12763353439986014, "grad_norm": 5.537546634674072, "learning_rate": 3.8498037607638686e-05, "loss": 0.6899, "step": 730 }, { "epoch": 0.12938193898068012, "grad_norm": 5.683461666107178, "learning_rate": 3.847460605705583e-05, "loss": 0.509, "step": 740 }, { "epoch": 0.13113034356150013, "grad_norm": 6.413394451141357, "learning_rate": 3.845117450647297e-05, "loss": 0.5851, "step": 750 }, { "epoch": 0.13287874814232015, "grad_norm": 6.22868013381958, "learning_rate": 3.842774295589011e-05, "loss": 0.6095, "step": 760 }, { "epoch": 0.13462715272314013, "grad_norm": 4.91422700881958, "learning_rate": 3.840431140530725e-05, "loss": 0.6649, "step": 770 }, { "epoch": 0.13637555730396014, "grad_norm": 6.396027088165283, "learning_rate": 3.8380879854724386e-05, "loss": 0.6328, "step": 780 }, { "epoch": 0.13812396188478013, "grad_norm": 5.467519283294678, "learning_rate": 3.835744830414153e-05, "loss": 0.5906, "step": 790 }, { "epoch": 0.13987236646560014, "grad_norm": 6.788895130157471, "learning_rate": 3.833401675355867e-05, "loss": 0.6719, "step": 800 }, { "epoch": 0.14162077104642015, "grad_norm": 8.55156135559082, "learning_rate": 3.831058520297581e-05, "loss": 0.6192, "step": 810 }, { "epoch": 0.14336917562724014, "grad_norm": 4.527801513671875, "learning_rate": 3.828715365239295e-05, "loss": 0.5647, "step": 820 }, { "epoch": 0.14511758020806015, "grad_norm": 7.3711042404174805, "learning_rate": 3.826372210181009e-05, "loss": 0.6221, "step": 830 }, { "epoch": 0.14686598478888016, "grad_norm": 3.927372932434082, "learning_rate": 3.824029055122723e-05, "loss": 0.5524, "step": 840 }, { "epoch": 0.14861438936970015, "grad_norm": 6.472529411315918, "learning_rate": 3.821685900064437e-05, "loss": 0.5478, "step": 850 }, { "epoch": 0.15036279395052016, "grad_norm": 3.832937240600586, "learning_rate": 3.819342745006151e-05, "loss": 0.5765, "step": 860 }, { "epoch": 0.15211119853134014, "grad_norm": 6.222163677215576, "learning_rate": 3.8169995899478654e-05, "loss": 0.6352, "step": 870 }, { "epoch": 0.15385960311216015, "grad_norm": 4.043605804443359, "learning_rate": 3.814656434889579e-05, "loss": 0.5951, "step": 880 }, { "epoch": 0.15560800769298017, "grad_norm": 6.224709510803223, "learning_rate": 3.812313279831293e-05, "loss": 0.6274, "step": 890 }, { "epoch": 0.15735641227380015, "grad_norm": 4.874531269073486, "learning_rate": 3.809970124773007e-05, "loss": 0.5995, "step": 900 }, { "epoch": 0.15910481685462016, "grad_norm": 4.65131950378418, "learning_rate": 3.807626969714721e-05, "loss": 0.5395, "step": 910 }, { "epoch": 0.16085322143544017, "grad_norm": 6.024080753326416, "learning_rate": 3.8052838146564354e-05, "loss": 0.6002, "step": 920 }, { "epoch": 0.16260162601626016, "grad_norm": 5.143209457397461, "learning_rate": 3.802940659598149e-05, "loss": 0.5729, "step": 930 }, { "epoch": 0.16435003059708017, "grad_norm": 4.049091815948486, "learning_rate": 3.8005975045398635e-05, "loss": 0.5778, "step": 940 }, { "epoch": 0.16609843517790016, "grad_norm": 7.80893611907959, "learning_rate": 3.798254349481577e-05, "loss": 0.6218, "step": 950 }, { "epoch": 0.16784683975872017, "grad_norm": 4.759846210479736, "learning_rate": 3.7959111944232916e-05, "loss": 0.5375, "step": 960 }, { "epoch": 0.16959524433954018, "grad_norm": 4.4919023513793945, "learning_rate": 3.793568039365005e-05, "loss": 0.5064, "step": 970 }, { "epoch": 0.17134364892036016, "grad_norm": 6.188364028930664, "learning_rate": 3.79122488430672e-05, "loss": 0.5616, "step": 980 }, { "epoch": 0.17309205350118018, "grad_norm": 4.2530436515808105, "learning_rate": 3.7888817292484334e-05, "loss": 0.5562, "step": 990 }, { "epoch": 0.1748404580820002, "grad_norm": 4.9472737312316895, "learning_rate": 3.786538574190147e-05, "loss": 0.5224, "step": 1000 }, { "epoch": 0.17658886266282017, "grad_norm": 4.524152755737305, "learning_rate": 3.7841954191318616e-05, "loss": 0.5238, "step": 1010 }, { "epoch": 0.17833726724364019, "grad_norm": 4.708081245422363, "learning_rate": 3.781852264073575e-05, "loss": 0.6241, "step": 1020 }, { "epoch": 0.18008567182446017, "grad_norm": 3.5008468627929688, "learning_rate": 3.77950910901529e-05, "loss": 0.4815, "step": 1030 }, { "epoch": 0.18183407640528018, "grad_norm": 5.174265384674072, "learning_rate": 3.7771659539570034e-05, "loss": 0.5692, "step": 1040 }, { "epoch": 0.1835824809861002, "grad_norm": 5.7122883796691895, "learning_rate": 3.774822798898718e-05, "loss": 0.5172, "step": 1050 }, { "epoch": 0.18533088556692018, "grad_norm": 6.649969100952148, "learning_rate": 3.7724796438404315e-05, "loss": 0.5664, "step": 1060 }, { "epoch": 0.1870792901477402, "grad_norm": 4.619507312774658, "learning_rate": 3.770136488782146e-05, "loss": 0.5372, "step": 1070 }, { "epoch": 0.1888276947285602, "grad_norm": 5.630303382873535, "learning_rate": 3.7677933337238596e-05, "loss": 0.513, "step": 1080 }, { "epoch": 0.1905760993093802, "grad_norm": 7.488351821899414, "learning_rate": 3.765450178665574e-05, "loss": 0.5913, "step": 1090 }, { "epoch": 0.1923245038902002, "grad_norm": 4.957793235778809, "learning_rate": 3.763107023607287e-05, "loss": 0.5634, "step": 1100 }, { "epoch": 0.19407290847102018, "grad_norm": 7.087141990661621, "learning_rate": 3.7607638685490015e-05, "loss": 0.567, "step": 1110 }, { "epoch": 0.1958213130518402, "grad_norm": 3.770094871520996, "learning_rate": 3.758420713490715e-05, "loss": 0.5791, "step": 1120 }, { "epoch": 0.1975697176326602, "grad_norm": 12.021100044250488, "learning_rate": 3.7560775584324296e-05, "loss": 0.5493, "step": 1130 }, { "epoch": 0.1993181222134802, "grad_norm": 4.023717880249023, "learning_rate": 3.753734403374143e-05, "loss": 0.4724, "step": 1140 }, { "epoch": 0.2010665267943002, "grad_norm": 4.10474157333374, "learning_rate": 3.751391248315858e-05, "loss": 0.5078, "step": 1150 }, { "epoch": 0.20281493137512022, "grad_norm": 4.947973728179932, "learning_rate": 3.7490480932575714e-05, "loss": 0.5133, "step": 1160 }, { "epoch": 0.2045633359559402, "grad_norm": 4.226052761077881, "learning_rate": 3.746704938199286e-05, "loss": 0.5145, "step": 1170 }, { "epoch": 0.2063117405367602, "grad_norm": 3.9101569652557373, "learning_rate": 3.7443617831409995e-05, "loss": 0.4885, "step": 1180 }, { "epoch": 0.2080601451175802, "grad_norm": 5.738667964935303, "learning_rate": 3.742018628082713e-05, "loss": 0.5903, "step": 1190 }, { "epoch": 0.2098085496984002, "grad_norm": 5.365860462188721, "learning_rate": 3.7396754730244277e-05, "loss": 0.5117, "step": 1200 }, { "epoch": 0.21155695427922022, "grad_norm": 4.273809909820557, "learning_rate": 3.7373323179661414e-05, "loss": 0.507, "step": 1210 }, { "epoch": 0.2133053588600402, "grad_norm": 4.795403480529785, "learning_rate": 3.734989162907856e-05, "loss": 0.4717, "step": 1220 }, { "epoch": 0.21505376344086022, "grad_norm": 4.199695110321045, "learning_rate": 3.7326460078495695e-05, "loss": 0.4503, "step": 1230 }, { "epoch": 0.21680216802168023, "grad_norm": 4.74060583114624, "learning_rate": 3.730302852791284e-05, "loss": 0.5411, "step": 1240 }, { "epoch": 0.21855057260250021, "grad_norm": 3.1553878784179688, "learning_rate": 3.7279596977329976e-05, "loss": 0.4929, "step": 1250 }, { "epoch": 0.22029897718332023, "grad_norm": 6.157911777496338, "learning_rate": 3.725616542674712e-05, "loss": 0.4856, "step": 1260 }, { "epoch": 0.2220473817641402, "grad_norm": 3.480355739593506, "learning_rate": 3.723273387616426e-05, "loss": 0.4727, "step": 1270 }, { "epoch": 0.22379578634496022, "grad_norm": 4.776696681976318, "learning_rate": 3.72093023255814e-05, "loss": 0.5281, "step": 1280 }, { "epoch": 0.22554419092578024, "grad_norm": 4.238858222961426, "learning_rate": 3.718587077499854e-05, "loss": 0.4846, "step": 1290 }, { "epoch": 0.22729259550660022, "grad_norm": 6.8187994956970215, "learning_rate": 3.7162439224415676e-05, "loss": 0.5232, "step": 1300 }, { "epoch": 0.22904100008742023, "grad_norm": 2.982185125350952, "learning_rate": 3.713900767383282e-05, "loss": 0.5057, "step": 1310 }, { "epoch": 0.23078940466824024, "grad_norm": 5.940972805023193, "learning_rate": 3.711557612324996e-05, "loss": 0.5363, "step": 1320 }, { "epoch": 0.23253780924906023, "grad_norm": 4.530322074890137, "learning_rate": 3.70921445726671e-05, "loss": 0.5113, "step": 1330 }, { "epoch": 0.23428621382988024, "grad_norm": 7.602742671966553, "learning_rate": 3.706871302208424e-05, "loss": 0.4607, "step": 1340 }, { "epoch": 0.23603461841070023, "grad_norm": 4.332957744598389, "learning_rate": 3.704528147150138e-05, "loss": 0.5148, "step": 1350 }, { "epoch": 0.23778302299152024, "grad_norm": 4.353420734405518, "learning_rate": 3.702184992091852e-05, "loss": 0.5027, "step": 1360 }, { "epoch": 0.23953142757234025, "grad_norm": 3.4939002990722656, "learning_rate": 3.699841837033566e-05, "loss": 0.462, "step": 1370 }, { "epoch": 0.24127983215316023, "grad_norm": 5.113827705383301, "learning_rate": 3.69749868197528e-05, "loss": 0.4807, "step": 1380 }, { "epoch": 0.24302823673398025, "grad_norm": 2.979421377182007, "learning_rate": 3.6953898424228225e-05, "loss": 0.4931, "step": 1390 }, { "epoch": 0.24477664131480026, "grad_norm": 4.109902858734131, "learning_rate": 3.693046687364536e-05, "loss": 0.4817, "step": 1400 }, { "epoch": 0.24652504589562024, "grad_norm": 3.92783522605896, "learning_rate": 3.6907035323062506e-05, "loss": 0.4318, "step": 1410 }, { "epoch": 0.24827345047644025, "grad_norm": 3.3686742782592773, "learning_rate": 3.6883603772479644e-05, "loss": 0.5278, "step": 1420 }, { "epoch": 0.25002185505726027, "grad_norm": 3.851233720779419, "learning_rate": 3.686017222189679e-05, "loss": 0.4368, "step": 1430 }, { "epoch": 0.25177025963808025, "grad_norm": 4.5934062004089355, "learning_rate": 3.6836740671313925e-05, "loss": 0.5209, "step": 1440 }, { "epoch": 0.25351866421890024, "grad_norm": 5.395561695098877, "learning_rate": 3.681330912073107e-05, "loss": 0.449, "step": 1450 }, { "epoch": 0.2552670687997203, "grad_norm": 3.0194127559661865, "learning_rate": 3.6789877570148206e-05, "loss": 0.4584, "step": 1460 }, { "epoch": 0.25701547338054026, "grad_norm": 6.502100944519043, "learning_rate": 3.676644601956535e-05, "loss": 0.539, "step": 1470 }, { "epoch": 0.25876387796136024, "grad_norm": 4.323697090148926, "learning_rate": 3.674301446898249e-05, "loss": 0.4571, "step": 1480 }, { "epoch": 0.2605122825421803, "grad_norm": 3.714613914489746, "learning_rate": 3.671958291839963e-05, "loss": 0.4525, "step": 1490 }, { "epoch": 0.26226068712300027, "grad_norm": 3.388582706451416, "learning_rate": 3.669615136781677e-05, "loss": 0.4457, "step": 1500 }, { "epoch": 0.26400909170382025, "grad_norm": 4.142477989196777, "learning_rate": 3.6672719817233905e-05, "loss": 0.4379, "step": 1510 }, { "epoch": 0.2657574962846403, "grad_norm": 5.404989242553711, "learning_rate": 3.664928826665105e-05, "loss": 0.4761, "step": 1520 }, { "epoch": 0.2675059008654603, "grad_norm": 2.9752023220062256, "learning_rate": 3.6625856716068187e-05, "loss": 0.4431, "step": 1530 }, { "epoch": 0.26925430544628026, "grad_norm": 5.292706489562988, "learning_rate": 3.660242516548533e-05, "loss": 0.4439, "step": 1540 }, { "epoch": 0.27100271002710025, "grad_norm": 4.105823516845703, "learning_rate": 3.657899361490247e-05, "loss": 0.4925, "step": 1550 }, { "epoch": 0.2727511146079203, "grad_norm": 4.246946334838867, "learning_rate": 3.655556206431961e-05, "loss": 0.46, "step": 1560 }, { "epoch": 0.27449951918874027, "grad_norm": 4.617825508117676, "learning_rate": 3.653213051373675e-05, "loss": 0.4587, "step": 1570 }, { "epoch": 0.27624792376956026, "grad_norm": 7.036331653594971, "learning_rate": 3.650869896315389e-05, "loss": 0.4417, "step": 1580 }, { "epoch": 0.2779963283503803, "grad_norm": 3.7988290786743164, "learning_rate": 3.648526741257103e-05, "loss": 0.4307, "step": 1590 }, { "epoch": 0.2797447329312003, "grad_norm": 4.456466197967529, "learning_rate": 3.6461835861988174e-05, "loss": 0.5143, "step": 1600 }, { "epoch": 0.28149313751202026, "grad_norm": 3.4820919036865234, "learning_rate": 3.643840431140531e-05, "loss": 0.4492, "step": 1610 }, { "epoch": 0.2832415420928403, "grad_norm": 3.1583054065704346, "learning_rate": 3.641497276082245e-05, "loss": 0.4656, "step": 1620 }, { "epoch": 0.2849899466736603, "grad_norm": 3.003847360610962, "learning_rate": 3.639154121023959e-05, "loss": 0.4477, "step": 1630 }, { "epoch": 0.2867383512544803, "grad_norm": 3.8377931118011475, "learning_rate": 3.636810965965673e-05, "loss": 0.437, "step": 1640 }, { "epoch": 0.2884867558353003, "grad_norm": 4.43681526184082, "learning_rate": 3.6344678109073874e-05, "loss": 0.4315, "step": 1650 }, { "epoch": 0.2902351604161203, "grad_norm": 3.5097219944000244, "learning_rate": 3.632124655849101e-05, "loss": 0.4466, "step": 1660 }, { "epoch": 0.2919835649969403, "grad_norm": 4.560410976409912, "learning_rate": 3.6297815007908155e-05, "loss": 0.4282, "step": 1670 }, { "epoch": 0.2937319695777603, "grad_norm": 4.94926643371582, "learning_rate": 3.627438345732529e-05, "loss": 0.4744, "step": 1680 }, { "epoch": 0.2954803741585803, "grad_norm": 3.3490183353424072, "learning_rate": 3.6250951906742436e-05, "loss": 0.4927, "step": 1690 }, { "epoch": 0.2972287787394003, "grad_norm": 3.6036620140075684, "learning_rate": 3.622752035615957e-05, "loss": 0.483, "step": 1700 }, { "epoch": 0.2989771833202203, "grad_norm": 3.800067663192749, "learning_rate": 3.620408880557671e-05, "loss": 0.4069, "step": 1710 }, { "epoch": 0.3007255879010403, "grad_norm": 3.0274336338043213, "learning_rate": 3.6180657254993854e-05, "loss": 0.4235, "step": 1720 }, { "epoch": 0.3024739924818603, "grad_norm": 6.128991603851318, "learning_rate": 3.615722570441099e-05, "loss": 0.4782, "step": 1730 }, { "epoch": 0.3042223970626803, "grad_norm": 3.7391269207000732, "learning_rate": 3.6133794153828135e-05, "loss": 0.4981, "step": 1740 }, { "epoch": 0.3059708016435003, "grad_norm": 4.853888034820557, "learning_rate": 3.611036260324527e-05, "loss": 0.4605, "step": 1750 }, { "epoch": 0.3077192062243203, "grad_norm": 4.367269515991211, "learning_rate": 3.608693105266242e-05, "loss": 0.4695, "step": 1760 }, { "epoch": 0.3094676108051403, "grad_norm": 3.594266414642334, "learning_rate": 3.6063499502079554e-05, "loss": 0.4399, "step": 1770 }, { "epoch": 0.31121601538596033, "grad_norm": 2.8320610523223877, "learning_rate": 3.604006795149669e-05, "loss": 0.4281, "step": 1780 }, { "epoch": 0.3129644199667803, "grad_norm": 2.9507384300231934, "learning_rate": 3.601663640091383e-05, "loss": 0.425, "step": 1790 }, { "epoch": 0.3147128245476003, "grad_norm": 5.069509506225586, "learning_rate": 3.599320485033097e-05, "loss": 0.4376, "step": 1800 }, { "epoch": 0.31646122912842034, "grad_norm": 4.728787899017334, "learning_rate": 3.596977329974811e-05, "loss": 0.487, "step": 1810 }, { "epoch": 0.3182096337092403, "grad_norm": 4.627148151397705, "learning_rate": 3.594634174916525e-05, "loss": 0.3894, "step": 1820 }, { "epoch": 0.3199580382900603, "grad_norm": 9.96601676940918, "learning_rate": 3.592291019858239e-05, "loss": 0.4409, "step": 1830 }, { "epoch": 0.32170644287088035, "grad_norm": 5.776546478271484, "learning_rate": 3.5899478647999535e-05, "loss": 0.4583, "step": 1840 }, { "epoch": 0.32345484745170033, "grad_norm": 3.2901666164398193, "learning_rate": 3.587604709741667e-05, "loss": 0.4464, "step": 1850 }, { "epoch": 0.3252032520325203, "grad_norm": 5.241537094116211, "learning_rate": 3.5852615546833816e-05, "loss": 0.4677, "step": 1860 }, { "epoch": 0.3269516566133403, "grad_norm": 4.418180465698242, "learning_rate": 3.582918399625095e-05, "loss": 0.4884, "step": 1870 }, { "epoch": 0.32870006119416034, "grad_norm": 3.4376697540283203, "learning_rate": 3.58057524456681e-05, "loss": 0.4303, "step": 1880 }, { "epoch": 0.3304484657749803, "grad_norm": 4.297024726867676, "learning_rate": 3.5782320895085234e-05, "loss": 0.5045, "step": 1890 }, { "epoch": 0.3321968703558003, "grad_norm": 7.0916948318481445, "learning_rate": 3.575888934450237e-05, "loss": 0.4702, "step": 1900 }, { "epoch": 0.33394527493662035, "grad_norm": 8.566019058227539, "learning_rate": 3.5735457793919515e-05, "loss": 0.4032, "step": 1910 }, { "epoch": 0.33569367951744034, "grad_norm": 2.6082425117492676, "learning_rate": 3.571202624333665e-05, "loss": 0.4511, "step": 1920 }, { "epoch": 0.3374420840982603, "grad_norm": 4.204542636871338, "learning_rate": 3.5688594692753796e-05, "loss": 0.3781, "step": 1930 }, { "epoch": 0.33919048867908036, "grad_norm": 3.8807003498077393, "learning_rate": 3.5665163142170934e-05, "loss": 0.4176, "step": 1940 }, { "epoch": 0.34093889325990034, "grad_norm": 2.7027812004089355, "learning_rate": 3.564173159158808e-05, "loss": 0.3927, "step": 1950 }, { "epoch": 0.34268729784072033, "grad_norm": 3.784552574157715, "learning_rate": 3.5618300041005215e-05, "loss": 0.4709, "step": 1960 }, { "epoch": 0.34443570242154037, "grad_norm": 3.2418737411499023, "learning_rate": 3.559486849042236e-05, "loss": 0.4252, "step": 1970 }, { "epoch": 0.34618410700236035, "grad_norm": 5.2287397384643555, "learning_rate": 3.5571436939839496e-05, "loss": 0.4115, "step": 1980 }, { "epoch": 0.34793251158318034, "grad_norm": 4.299645900726318, "learning_rate": 3.554800538925664e-05, "loss": 0.4234, "step": 1990 }, { "epoch": 0.3496809161640004, "grad_norm": 2.8772058486938477, "learning_rate": 3.552457383867378e-05, "loss": 0.4709, "step": 2000 }, { "epoch": 0.35142932074482036, "grad_norm": 3.6467995643615723, "learning_rate": 3.5501142288090914e-05, "loss": 0.3678, "step": 2010 }, { "epoch": 0.35317772532564035, "grad_norm": 4.426238059997559, "learning_rate": 3.547771073750806e-05, "loss": 0.3868, "step": 2020 }, { "epoch": 0.35492612990646033, "grad_norm": 6.251084327697754, "learning_rate": 3.5454279186925195e-05, "loss": 0.4048, "step": 2030 }, { "epoch": 0.35667453448728037, "grad_norm": 3.1436846256256104, "learning_rate": 3.543084763634234e-05, "loss": 0.369, "step": 2040 }, { "epoch": 0.35842293906810035, "grad_norm": 5.551470756530762, "learning_rate": 3.540741608575948e-05, "loss": 0.4341, "step": 2050 }, { "epoch": 0.36017134364892034, "grad_norm": 3.3913917541503906, "learning_rate": 3.538398453517662e-05, "loss": 0.3886, "step": 2060 }, { "epoch": 0.3619197482297404, "grad_norm": 2.8911020755767822, "learning_rate": 3.536055298459376e-05, "loss": 0.3767, "step": 2070 }, { "epoch": 0.36366815281056036, "grad_norm": 3.6292026042938232, "learning_rate": 3.53371214340109e-05, "loss": 0.4193, "step": 2080 }, { "epoch": 0.36541655739138035, "grad_norm": 3.489974021911621, "learning_rate": 3.531368988342804e-05, "loss": 0.4379, "step": 2090 }, { "epoch": 0.3671649619722004, "grad_norm": 3.882077217102051, "learning_rate": 3.529025833284518e-05, "loss": 0.3757, "step": 2100 }, { "epoch": 0.36891336655302037, "grad_norm": 4.579954147338867, "learning_rate": 3.526682678226232e-05, "loss": 0.3671, "step": 2110 }, { "epoch": 0.37066177113384036, "grad_norm": 3.082444906234741, "learning_rate": 3.524339523167946e-05, "loss": 0.3885, "step": 2120 }, { "epoch": 0.3724101757146604, "grad_norm": 4.321898460388184, "learning_rate": 3.52199636810966e-05, "loss": 0.4543, "step": 2130 }, { "epoch": 0.3741585802954804, "grad_norm": 5.577615737915039, "learning_rate": 3.519653213051374e-05, "loss": 0.3491, "step": 2140 }, { "epoch": 0.37590698487630037, "grad_norm": 4.5239057540893555, "learning_rate": 3.517310057993088e-05, "loss": 0.4436, "step": 2150 }, { "epoch": 0.3776553894571204, "grad_norm": 4.200013637542725, "learning_rate": 3.514966902934802e-05, "loss": 0.4066, "step": 2160 }, { "epoch": 0.3794037940379404, "grad_norm": 2.5660548210144043, "learning_rate": 3.5126237478765164e-05, "loss": 0.4033, "step": 2170 }, { "epoch": 0.3811521986187604, "grad_norm": 3.0077526569366455, "learning_rate": 3.51028059281823e-05, "loss": 0.4044, "step": 2180 }, { "epoch": 0.38290060319958036, "grad_norm": 3.9082486629486084, "learning_rate": 3.5079374377599445e-05, "loss": 0.4027, "step": 2190 }, { "epoch": 0.3846490077804004, "grad_norm": 3.365020751953125, "learning_rate": 3.505594282701658e-05, "loss": 0.4271, "step": 2200 }, { "epoch": 0.3863974123612204, "grad_norm": 5.796195983886719, "learning_rate": 3.5032511276433726e-05, "loss": 0.4496, "step": 2210 }, { "epoch": 0.38814581694204037, "grad_norm": 4.143885612487793, "learning_rate": 3.500907972585086e-05, "loss": 0.3982, "step": 2220 }, { "epoch": 0.3898942215228604, "grad_norm": 3.6130030155181885, "learning_rate": 3.4985648175268e-05, "loss": 0.432, "step": 2230 }, { "epoch": 0.3916426261036804, "grad_norm": 4.119210243225098, "learning_rate": 3.4962216624685144e-05, "loss": 0.4078, "step": 2240 }, { "epoch": 0.3933910306845004, "grad_norm": 3.4416446685791016, "learning_rate": 3.493878507410228e-05, "loss": 0.4556, "step": 2250 }, { "epoch": 0.3951394352653204, "grad_norm": 3.9759068489074707, "learning_rate": 3.491535352351942e-05, "loss": 0.4438, "step": 2260 }, { "epoch": 0.3968878398461404, "grad_norm": 5.250323295593262, "learning_rate": 3.489192197293656e-05, "loss": 0.3963, "step": 2270 }, { "epoch": 0.3986362444269604, "grad_norm": 2.926793098449707, "learning_rate": 3.48684904223537e-05, "loss": 0.3948, "step": 2280 }, { "epoch": 0.4003846490077804, "grad_norm": 3.849525213241577, "learning_rate": 3.484505887177084e-05, "loss": 0.4359, "step": 2290 }, { "epoch": 0.4021330535886004, "grad_norm": 5.853473663330078, "learning_rate": 3.482162732118798e-05, "loss": 0.4449, "step": 2300 }, { "epoch": 0.4038814581694204, "grad_norm": 2.683713674545288, "learning_rate": 3.479819577060512e-05, "loss": 0.4052, "step": 2310 }, { "epoch": 0.40562986275024043, "grad_norm": 4.078883647918701, "learning_rate": 3.477476422002226e-05, "loss": 0.4448, "step": 2320 }, { "epoch": 0.4073782673310604, "grad_norm": 2.502694606781006, "learning_rate": 3.47513326694394e-05, "loss": 0.3751, "step": 2330 }, { "epoch": 0.4091266719118804, "grad_norm": 3.9655332565307617, "learning_rate": 3.4727901118856543e-05, "loss": 0.394, "step": 2340 }, { "epoch": 0.4108750764927004, "grad_norm": 3.2672157287597656, "learning_rate": 3.470446956827368e-05, "loss": 0.3714, "step": 2350 }, { "epoch": 0.4126234810735204, "grad_norm": 2.6867640018463135, "learning_rate": 3.4681038017690825e-05, "loss": 0.3843, "step": 2360 }, { "epoch": 0.4143718856543404, "grad_norm": 3.3197810649871826, "learning_rate": 3.465760646710796e-05, "loss": 0.3949, "step": 2370 }, { "epoch": 0.4161202902351604, "grad_norm": 3.062208652496338, "learning_rate": 3.4634174916525106e-05, "loss": 0.3798, "step": 2380 }, { "epoch": 0.41786869481598043, "grad_norm": 3.712489366531372, "learning_rate": 3.461074336594224e-05, "loss": 0.3991, "step": 2390 }, { "epoch": 0.4196170993968004, "grad_norm": 4.0003437995910645, "learning_rate": 3.458731181535938e-05, "loss": 0.4353, "step": 2400 }, { "epoch": 0.4213655039776204, "grad_norm": 4.571620464324951, "learning_rate": 3.4563880264776524e-05, "loss": 0.3888, "step": 2410 }, { "epoch": 0.42311390855844044, "grad_norm": 9.544726371765137, "learning_rate": 3.454044871419366e-05, "loss": 0.4344, "step": 2420 }, { "epoch": 0.42486231313926043, "grad_norm": 5.037539958953857, "learning_rate": 3.4517017163610805e-05, "loss": 0.3653, "step": 2430 }, { "epoch": 0.4266107177200804, "grad_norm": 3.384692907333374, "learning_rate": 3.449358561302794e-05, "loss": 0.3715, "step": 2440 }, { "epoch": 0.42835912230090045, "grad_norm": 3.169987916946411, "learning_rate": 3.4470154062445087e-05, "loss": 0.3899, "step": 2450 }, { "epoch": 0.43010752688172044, "grad_norm": 3.1669843196868896, "learning_rate": 3.4446722511862224e-05, "loss": 0.4537, "step": 2460 }, { "epoch": 0.4318559314625404, "grad_norm": 3.975206136703491, "learning_rate": 3.442329096127937e-05, "loss": 0.4296, "step": 2470 }, { "epoch": 0.43360433604336046, "grad_norm": 3.153317928314209, "learning_rate": 3.4399859410696505e-05, "loss": 0.3524, "step": 2480 }, { "epoch": 0.43535274062418045, "grad_norm": 3.307684898376465, "learning_rate": 3.437642786011365e-05, "loss": 0.3707, "step": 2490 }, { "epoch": 0.43710114520500043, "grad_norm": 3.744170904159546, "learning_rate": 3.4352996309530786e-05, "loss": 0.4054, "step": 2500 }, { "epoch": 0.4388495497858204, "grad_norm": 5.2920427322387695, "learning_rate": 3.432956475894792e-05, "loss": 0.3621, "step": 2510 }, { "epoch": 0.44059795436664045, "grad_norm": 4.31833553314209, "learning_rate": 3.430613320836507e-05, "loss": 0.4324, "step": 2520 }, { "epoch": 0.44234635894746044, "grad_norm": 6.0344929695129395, "learning_rate": 3.4282701657782204e-05, "loss": 0.4201, "step": 2530 }, { "epoch": 0.4440947635282804, "grad_norm": 6.097479820251465, "learning_rate": 3.425927010719935e-05, "loss": 0.4248, "step": 2540 }, { "epoch": 0.44584316810910046, "grad_norm": 3.0370521545410156, "learning_rate": 3.4235838556616486e-05, "loss": 0.3989, "step": 2550 }, { "epoch": 0.44759157268992045, "grad_norm": 3.0904951095581055, "learning_rate": 3.421240700603363e-05, "loss": 0.4062, "step": 2560 }, { "epoch": 0.44933997727074043, "grad_norm": 4.901310920715332, "learning_rate": 3.418897545545077e-05, "loss": 0.401, "step": 2570 }, { "epoch": 0.45108838185156047, "grad_norm": 5.094497203826904, "learning_rate": 3.416554390486791e-05, "loss": 0.3905, "step": 2580 }, { "epoch": 0.45283678643238046, "grad_norm": 3.1318740844726562, "learning_rate": 3.414211235428505e-05, "loss": 0.3795, "step": 2590 }, { "epoch": 0.45458519101320044, "grad_norm": 3.8154094219207764, "learning_rate": 3.411868080370219e-05, "loss": 0.389, "step": 2600 }, { "epoch": 0.4563335955940205, "grad_norm": 3.527348756790161, "learning_rate": 3.409524925311933e-05, "loss": 0.4058, "step": 2610 }, { "epoch": 0.45808200017484046, "grad_norm": 6.264882564544678, "learning_rate": 3.4071817702536466e-05, "loss": 0.3955, "step": 2620 }, { "epoch": 0.45983040475566045, "grad_norm": 2.324648857116699, "learning_rate": 3.404838615195361e-05, "loss": 0.3582, "step": 2630 }, { "epoch": 0.4615788093364805, "grad_norm": 4.422379016876221, "learning_rate": 3.402495460137075e-05, "loss": 0.3479, "step": 2640 }, { "epoch": 0.4633272139173005, "grad_norm": 3.0725393295288086, "learning_rate": 3.400152305078789e-05, "loss": 0.3714, "step": 2650 }, { "epoch": 0.46507561849812046, "grad_norm": 4.242288112640381, "learning_rate": 3.397809150020503e-05, "loss": 0.3762, "step": 2660 }, { "epoch": 0.46682402307894044, "grad_norm": 4.661035537719727, "learning_rate": 3.395465994962217e-05, "loss": 0.3732, "step": 2670 }, { "epoch": 0.4685724276597605, "grad_norm": 2.518129825592041, "learning_rate": 3.393122839903931e-05, "loss": 0.4087, "step": 2680 }, { "epoch": 0.47032083224058047, "grad_norm": 4.576558589935303, "learning_rate": 3.3907796848456454e-05, "loss": 0.3828, "step": 2690 }, { "epoch": 0.47206923682140045, "grad_norm": 2.8037259578704834, "learning_rate": 3.388436529787359e-05, "loss": 0.3603, "step": 2700 }, { "epoch": 0.4738176414022205, "grad_norm": 4.486359596252441, "learning_rate": 3.3860933747290735e-05, "loss": 0.366, "step": 2710 }, { "epoch": 0.4755660459830405, "grad_norm": 3.6302101612091064, "learning_rate": 3.383750219670787e-05, "loss": 0.3775, "step": 2720 }, { "epoch": 0.47731445056386046, "grad_norm": 4.8168768882751465, "learning_rate": 3.381407064612501e-05, "loss": 0.3573, "step": 2730 }, { "epoch": 0.4790628551446805, "grad_norm": 4.675252437591553, "learning_rate": 3.3790639095542147e-05, "loss": 0.3809, "step": 2740 }, { "epoch": 0.4808112597255005, "grad_norm": 2.982111930847168, "learning_rate": 3.376720754495929e-05, "loss": 0.3664, "step": 2750 }, { "epoch": 0.48255966430632047, "grad_norm": 3.7463014125823975, "learning_rate": 3.374377599437643e-05, "loss": 0.3629, "step": 2760 }, { "epoch": 0.4843080688871405, "grad_norm": 2.5750458240509033, "learning_rate": 3.372268759885186e-05, "loss": 0.3459, "step": 2770 }, { "epoch": 0.4860564734679605, "grad_norm": 4.039637565612793, "learning_rate": 3.3699256048268997e-05, "loss": 0.3836, "step": 2780 }, { "epoch": 0.4878048780487805, "grad_norm": 4.1245832443237305, "learning_rate": 3.367582449768614e-05, "loss": 0.3801, "step": 2790 }, { "epoch": 0.4895532826296005, "grad_norm": 3.3629612922668457, "learning_rate": 3.365239294710328e-05, "loss": 0.3474, "step": 2800 }, { "epoch": 0.4913016872104205, "grad_norm": 3.7446513175964355, "learning_rate": 3.362896139652042e-05, "loss": 0.4071, "step": 2810 }, { "epoch": 0.4930500917912405, "grad_norm": 2.827909231185913, "learning_rate": 3.360552984593756e-05, "loss": 0.3862, "step": 2820 }, { "epoch": 0.49479849637206047, "grad_norm": 6.202451229095459, "learning_rate": 3.3582098295354696e-05, "loss": 0.3261, "step": 2830 }, { "epoch": 0.4965469009528805, "grad_norm": 3.2312405109405518, "learning_rate": 3.355866674477184e-05, "loss": 0.3866, "step": 2840 }, { "epoch": 0.4982953055337005, "grad_norm": 4.0673699378967285, "learning_rate": 3.353523519418898e-05, "loss": 0.3258, "step": 2850 }, { "epoch": 0.5000437101145205, "grad_norm": 2.6515884399414062, "learning_rate": 3.351180364360612e-05, "loss": 0.3814, "step": 2860 }, { "epoch": 0.5017921146953405, "grad_norm": 3.774637460708618, "learning_rate": 3.348837209302326e-05, "loss": 0.4082, "step": 2870 }, { "epoch": 0.5035405192761605, "grad_norm": 3.6968934535980225, "learning_rate": 3.34649405424404e-05, "loss": 0.395, "step": 2880 }, { "epoch": 0.5052889238569805, "grad_norm": 4.876258850097656, "learning_rate": 3.344150899185754e-05, "loss": 0.3328, "step": 2890 }, { "epoch": 0.5070373284378005, "grad_norm": 3.079639196395874, "learning_rate": 3.3418077441274684e-05, "loss": 0.3278, "step": 2900 }, { "epoch": 0.5087857330186205, "grad_norm": 3.560870409011841, "learning_rate": 3.339464589069182e-05, "loss": 0.3698, "step": 2910 }, { "epoch": 0.5105341375994406, "grad_norm": 3.489295482635498, "learning_rate": 3.3371214340108965e-05, "loss": 0.3407, "step": 2920 }, { "epoch": 0.5122825421802605, "grad_norm": 3.805683135986328, "learning_rate": 3.3347782789526095e-05, "loss": 0.3602, "step": 2930 }, { "epoch": 0.5140309467610805, "grad_norm": 8.44050407409668, "learning_rate": 3.332435123894324e-05, "loss": 0.3246, "step": 2940 }, { "epoch": 0.5157793513419006, "grad_norm": 3.361529588699341, "learning_rate": 3.3300919688360376e-05, "loss": 0.2978, "step": 2950 }, { "epoch": 0.5175277559227205, "grad_norm": 4.2473039627075195, "learning_rate": 3.327748813777752e-05, "loss": 0.3766, "step": 2960 }, { "epoch": 0.5192761605035405, "grad_norm": 4.070904731750488, "learning_rate": 3.325405658719466e-05, "loss": 0.393, "step": 2970 }, { "epoch": 0.5210245650843606, "grad_norm": 3.869349956512451, "learning_rate": 3.32306250366118e-05, "loss": 0.3478, "step": 2980 }, { "epoch": 0.5227729696651805, "grad_norm": 2.8064253330230713, "learning_rate": 3.320719348602894e-05, "loss": 0.3633, "step": 2990 }, { "epoch": 0.5245213742460005, "grad_norm": 3.720731019973755, "learning_rate": 3.318376193544608e-05, "loss": 0.3835, "step": 3000 }, { "epoch": 0.5246962147040826, "eval_loss": 0.5097190737724304, "eval_runtime": 1744.787, "eval_samples_per_second": 8.3, "eval_steps_per_second": 1.038, "step": 3001 }, { "epoch": 0.5262697788268206, "grad_norm": 4.681828498840332, "learning_rate": 3.316033038486322e-05, "loss": 0.333, "step": 3010 }, { "epoch": 0.5280181834076405, "grad_norm": 2.902214527130127, "learning_rate": 3.313689883428036e-05, "loss": 0.3833, "step": 3020 }, { "epoch": 0.5297665879884605, "grad_norm": 2.4706242084503174, "learning_rate": 3.31134672836975e-05, "loss": 0.3533, "step": 3030 }, { "epoch": 0.5315149925692806, "grad_norm": 3.9379889965057373, "learning_rate": 3.309003573311464e-05, "loss": 0.3583, "step": 3040 }, { "epoch": 0.5332633971501005, "grad_norm": 3.412458896636963, "learning_rate": 3.306660418253178e-05, "loss": 0.3696, "step": 3050 }, { "epoch": 0.5350118017309206, "grad_norm": 6.194937705993652, "learning_rate": 3.304317263194892e-05, "loss": 0.3401, "step": 3060 }, { "epoch": 0.5367602063117405, "grad_norm": 2.9540324211120605, "learning_rate": 3.301974108136606e-05, "loss": 0.3873, "step": 3070 }, { "epoch": 0.5385086108925605, "grad_norm": 2.695261240005493, "learning_rate": 3.29963095307832e-05, "loss": 0.3498, "step": 3080 }, { "epoch": 0.5402570154733806, "grad_norm": 3.2079668045043945, "learning_rate": 3.2972877980200344e-05, "loss": 0.3478, "step": 3090 }, { "epoch": 0.5420054200542005, "grad_norm": 2.720630407333374, "learning_rate": 3.294944642961748e-05, "loss": 0.3858, "step": 3100 }, { "epoch": 0.5437538246350205, "grad_norm": 2.9935555458068848, "learning_rate": 3.2926014879034626e-05, "loss": 0.3728, "step": 3110 }, { "epoch": 0.5455022292158406, "grad_norm": 3.6952109336853027, "learning_rate": 3.290258332845176e-05, "loss": 0.4133, "step": 3120 }, { "epoch": 0.5472506337966605, "grad_norm": 3.6369428634643555, "learning_rate": 3.28791517778689e-05, "loss": 0.3798, "step": 3130 }, { "epoch": 0.5489990383774805, "grad_norm": 4.478968620300293, "learning_rate": 3.2855720227286044e-05, "loss": 0.3333, "step": 3140 }, { "epoch": 0.5507474429583006, "grad_norm": 3.080244779586792, "learning_rate": 3.283228867670318e-05, "loss": 0.3818, "step": 3150 }, { "epoch": 0.5524958475391205, "grad_norm": 3.321441888809204, "learning_rate": 3.2808857126120325e-05, "loss": 0.3889, "step": 3160 }, { "epoch": 0.5542442521199406, "grad_norm": 4.040501117706299, "learning_rate": 3.278542557553746e-05, "loss": 0.3487, "step": 3170 }, { "epoch": 0.5559926567007606, "grad_norm": 2.8912744522094727, "learning_rate": 3.2761994024954606e-05, "loss": 0.331, "step": 3180 }, { "epoch": 0.5577410612815805, "grad_norm": 3.6850969791412354, "learning_rate": 3.2738562474371744e-05, "loss": 0.3921, "step": 3190 }, { "epoch": 0.5594894658624006, "grad_norm": 2.3551599979400635, "learning_rate": 3.271513092378889e-05, "loss": 0.3469, "step": 3200 }, { "epoch": 0.5612378704432206, "grad_norm": 3.7826590538024902, "learning_rate": 3.2691699373206025e-05, "loss": 0.3607, "step": 3210 }, { "epoch": 0.5629862750240405, "grad_norm": 4.876736164093018, "learning_rate": 3.266826782262316e-05, "loss": 0.4509, "step": 3220 }, { "epoch": 0.5647346796048606, "grad_norm": 4.1988959312438965, "learning_rate": 3.2644836272040306e-05, "loss": 0.3974, "step": 3230 }, { "epoch": 0.5664830841856806, "grad_norm": 5.847795009613037, "learning_rate": 3.262140472145744e-05, "loss": 0.3719, "step": 3240 }, { "epoch": 0.5682314887665005, "grad_norm": 1.9008731842041016, "learning_rate": 3.259797317087459e-05, "loss": 0.3163, "step": 3250 }, { "epoch": 0.5699798933473206, "grad_norm": 5.935329437255859, "learning_rate": 3.2574541620291724e-05, "loss": 0.383, "step": 3260 }, { "epoch": 0.5717282979281406, "grad_norm": 4.4414448738098145, "learning_rate": 3.255111006970887e-05, "loss": 0.3307, "step": 3270 }, { "epoch": 0.5734767025089605, "grad_norm": 3.289113998413086, "learning_rate": 3.2527678519126005e-05, "loss": 0.3523, "step": 3280 }, { "epoch": 0.5752251070897806, "grad_norm": 3.165818214416504, "learning_rate": 3.250424696854315e-05, "loss": 0.36, "step": 3290 }, { "epoch": 0.5769735116706006, "grad_norm": 2.0032644271850586, "learning_rate": 3.248081541796029e-05, "loss": 0.3958, "step": 3300 }, { "epoch": 0.5787219162514206, "grad_norm": 2.1944901943206787, "learning_rate": 3.245738386737743e-05, "loss": 0.3303, "step": 3310 }, { "epoch": 0.5804703208322406, "grad_norm": 2.2373247146606445, "learning_rate": 3.243395231679457e-05, "loss": 0.3071, "step": 3320 }, { "epoch": 0.5822187254130606, "grad_norm": 3.046159029006958, "learning_rate": 3.2410520766211705e-05, "loss": 0.3334, "step": 3330 }, { "epoch": 0.5839671299938806, "grad_norm": 4.304010391235352, "learning_rate": 3.238708921562885e-05, "loss": 0.323, "step": 3340 }, { "epoch": 0.5857155345747006, "grad_norm": 4.681334972381592, "learning_rate": 3.2363657665045986e-05, "loss": 0.3632, "step": 3350 }, { "epoch": 0.5874639391555206, "grad_norm": 2.211153984069824, "learning_rate": 3.234022611446313e-05, "loss": 0.3263, "step": 3360 }, { "epoch": 0.5892123437363406, "grad_norm": 3.0405044555664062, "learning_rate": 3.231679456388027e-05, "loss": 0.337, "step": 3370 }, { "epoch": 0.5909607483171606, "grad_norm": 3.744624614715576, "learning_rate": 3.229336301329741e-05, "loss": 0.3644, "step": 3380 }, { "epoch": 0.5927091528979805, "grad_norm": 2.4798245429992676, "learning_rate": 3.226993146271455e-05, "loss": 0.3328, "step": 3390 }, { "epoch": 0.5944575574788006, "grad_norm": 3.567376136779785, "learning_rate": 3.2246499912131686e-05, "loss": 0.3273, "step": 3400 }, { "epoch": 0.5962059620596206, "grad_norm": 3.397534132003784, "learning_rate": 3.222306836154882e-05, "loss": 0.3503, "step": 3410 }, { "epoch": 0.5979543666404405, "grad_norm": 5.0991692543029785, "learning_rate": 3.219963681096597e-05, "loss": 0.3703, "step": 3420 }, { "epoch": 0.5997027712212606, "grad_norm": 3.5852749347686768, "learning_rate": 3.2176205260383104e-05, "loss": 0.3693, "step": 3430 }, { "epoch": 0.6014511758020806, "grad_norm": 2.5035903453826904, "learning_rate": 3.215277370980025e-05, "loss": 0.316, "step": 3440 }, { "epoch": 0.6031995803829006, "grad_norm": 3.4612536430358887, "learning_rate": 3.2129342159217385e-05, "loss": 0.3087, "step": 3450 }, { "epoch": 0.6049479849637206, "grad_norm": 6.134129524230957, "learning_rate": 3.210591060863453e-05, "loss": 0.3416, "step": 3460 }, { "epoch": 0.6066963895445406, "grad_norm": 2.857895612716675, "learning_rate": 3.2082479058051666e-05, "loss": 0.3846, "step": 3470 }, { "epoch": 0.6084447941253606, "grad_norm": 4.056922912597656, "learning_rate": 3.205904750746881e-05, "loss": 0.3527, "step": 3480 }, { "epoch": 0.6101931987061806, "grad_norm": 3.3907055854797363, "learning_rate": 3.203561595688595e-05, "loss": 0.3492, "step": 3490 }, { "epoch": 0.6119416032870006, "grad_norm": 2.8946733474731445, "learning_rate": 3.201218440630309e-05, "loss": 0.3297, "step": 3500 }, { "epoch": 0.6136900078678206, "grad_norm": 2.1298012733459473, "learning_rate": 3.198875285572023e-05, "loss": 0.356, "step": 3510 }, { "epoch": 0.6154384124486406, "grad_norm": 3.428750514984131, "learning_rate": 3.1965321305137366e-05, "loss": 0.3316, "step": 3520 }, { "epoch": 0.6171868170294607, "grad_norm": 2.575652599334717, "learning_rate": 3.194188975455451e-05, "loss": 0.3288, "step": 3530 }, { "epoch": 0.6189352216102806, "grad_norm": 3.2835533618927, "learning_rate": 3.191845820397165e-05, "loss": 0.372, "step": 3540 }, { "epoch": 0.6206836261911006, "grad_norm": 3.7413949966430664, "learning_rate": 3.189502665338879e-05, "loss": 0.3372, "step": 3550 }, { "epoch": 0.6224320307719207, "grad_norm": 3.207977533340454, "learning_rate": 3.187159510280593e-05, "loss": 0.3327, "step": 3560 }, { "epoch": 0.6241804353527406, "grad_norm": 2.6386313438415527, "learning_rate": 3.184816355222307e-05, "loss": 0.3639, "step": 3570 }, { "epoch": 0.6259288399335606, "grad_norm": 3.423943519592285, "learning_rate": 3.182473200164021e-05, "loss": 0.3337, "step": 3580 }, { "epoch": 0.6276772445143807, "grad_norm": 2.5779225826263428, "learning_rate": 3.1801300451057353e-05, "loss": 0.3813, "step": 3590 }, { "epoch": 0.6294256490952006, "grad_norm": 3.204908847808838, "learning_rate": 3.177786890047449e-05, "loss": 0.2671, "step": 3600 }, { "epoch": 0.6311740536760206, "grad_norm": 2.594581365585327, "learning_rate": 3.1754437349891635e-05, "loss": 0.3638, "step": 3610 }, { "epoch": 0.6329224582568407, "grad_norm": 3.348681688308716, "learning_rate": 3.173100579930877e-05, "loss": 0.3639, "step": 3620 }, { "epoch": 0.6346708628376606, "grad_norm": 3.975121259689331, "learning_rate": 3.170757424872591e-05, "loss": 0.3239, "step": 3630 }, { "epoch": 0.6364192674184806, "grad_norm": 3.4465627670288086, "learning_rate": 3.168414269814305e-05, "loss": 0.3359, "step": 3640 }, { "epoch": 0.6381676719993007, "grad_norm": 3.444370985031128, "learning_rate": 3.166071114756019e-05, "loss": 0.3387, "step": 3650 }, { "epoch": 0.6399160765801206, "grad_norm": 2.481161117553711, "learning_rate": 3.1637279596977334e-05, "loss": 0.3292, "step": 3660 }, { "epoch": 0.6416644811609407, "grad_norm": 3.121288537979126, "learning_rate": 3.161384804639447e-05, "loss": 0.3552, "step": 3670 }, { "epoch": 0.6434128857417607, "grad_norm": 4.9208221435546875, "learning_rate": 3.1590416495811615e-05, "loss": 0.3309, "step": 3680 }, { "epoch": 0.6451612903225806, "grad_norm": 2.5919346809387207, "learning_rate": 3.156698494522875e-05, "loss": 0.3432, "step": 3690 }, { "epoch": 0.6469096949034007, "grad_norm": 3.173069953918457, "learning_rate": 3.1543553394645896e-05, "loss": 0.3804, "step": 3700 }, { "epoch": 0.6486580994842206, "grad_norm": 5.001594066619873, "learning_rate": 3.1520121844063034e-05, "loss": 0.3335, "step": 3710 }, { "epoch": 0.6504065040650406, "grad_norm": 2.092214822769165, "learning_rate": 3.149669029348018e-05, "loss": 0.3362, "step": 3720 }, { "epoch": 0.6521549086458607, "grad_norm": 4.916845798492432, "learning_rate": 3.1473258742897315e-05, "loss": 0.3568, "step": 3730 }, { "epoch": 0.6539033132266806, "grad_norm": 2.18415904045105, "learning_rate": 3.144982719231445e-05, "loss": 0.3235, "step": 3740 }, { "epoch": 0.6556517178075006, "grad_norm": 2.239564895629883, "learning_rate": 3.1426395641731596e-05, "loss": 0.3591, "step": 3750 }, { "epoch": 0.6574001223883207, "grad_norm": 2.547616481781006, "learning_rate": 3.140296409114873e-05, "loss": 0.3603, "step": 3760 }, { "epoch": 0.6591485269691406, "grad_norm": 2.3699333667755127, "learning_rate": 3.137953254056588e-05, "loss": 0.3346, "step": 3770 }, { "epoch": 0.6608969315499607, "grad_norm": 2.7866456508636475, "learning_rate": 3.1356100989983014e-05, "loss": 0.3621, "step": 3780 }, { "epoch": 0.6626453361307807, "grad_norm": 2.5254106521606445, "learning_rate": 3.133266943940016e-05, "loss": 0.3184, "step": 3790 }, { "epoch": 0.6643937407116006, "grad_norm": 4.498997211456299, "learning_rate": 3.1309237888817296e-05, "loss": 0.301, "step": 3800 }, { "epoch": 0.6661421452924207, "grad_norm": 3.224710702896118, "learning_rate": 3.128580633823444e-05, "loss": 0.3203, "step": 3810 }, { "epoch": 0.6678905498732407, "grad_norm": 2.7599270343780518, "learning_rate": 3.126237478765158e-05, "loss": 0.3336, "step": 3820 }, { "epoch": 0.6696389544540606, "grad_norm": 4.111194133758545, "learning_rate": 3.123894323706872e-05, "loss": 0.3643, "step": 3830 }, { "epoch": 0.6713873590348807, "grad_norm": 9.280810356140137, "learning_rate": 3.121551168648586e-05, "loss": 0.3294, "step": 3840 }, { "epoch": 0.6731357636157007, "grad_norm": 3.527141809463501, "learning_rate": 3.1192080135902995e-05, "loss": 0.3035, "step": 3850 }, { "epoch": 0.6748841681965206, "grad_norm": 2.8929083347320557, "learning_rate": 3.116864858532014e-05, "loss": 0.3047, "step": 3860 }, { "epoch": 0.6766325727773407, "grad_norm": 4.317626476287842, "learning_rate": 3.1145217034737276e-05, "loss": 0.352, "step": 3870 }, { "epoch": 0.6783809773581607, "grad_norm": 3.2065703868865967, "learning_rate": 3.1121785484154413e-05, "loss": 0.352, "step": 3880 }, { "epoch": 0.6801293819389806, "grad_norm": 2.805239677429199, "learning_rate": 3.109835393357156e-05, "loss": 0.3539, "step": 3890 }, { "epoch": 0.6818777865198007, "grad_norm": 3.5220377445220947, "learning_rate": 3.1074922382988695e-05, "loss": 0.3313, "step": 3900 }, { "epoch": 0.6836261911006207, "grad_norm": 3.755730628967285, "learning_rate": 3.105149083240583e-05, "loss": 0.321, "step": 3910 }, { "epoch": 0.6853745956814407, "grad_norm": 3.296947956085205, "learning_rate": 3.1028059281822976e-05, "loss": 0.3512, "step": 3920 }, { "epoch": 0.6871230002622607, "grad_norm": 3.954050064086914, "learning_rate": 3.100462773124011e-05, "loss": 0.3563, "step": 3930 }, { "epoch": 0.6888714048430807, "grad_norm": 3.8162853717803955, "learning_rate": 3.098119618065726e-05, "loss": 0.3559, "step": 3940 }, { "epoch": 0.6906198094239007, "grad_norm": 3.416830062866211, "learning_rate": 3.0957764630074394e-05, "loss": 0.3166, "step": 3950 }, { "epoch": 0.6923682140047207, "grad_norm": 3.054938554763794, "learning_rate": 3.0936676234549826e-05, "loss": 0.3129, "step": 3960 }, { "epoch": 0.6941166185855407, "grad_norm": 3.042231798171997, "learning_rate": 3.091324468396696e-05, "loss": 0.3109, "step": 3970 }, { "epoch": 0.6958650231663607, "grad_norm": 3.516209125518799, "learning_rate": 3.088981313338411e-05, "loss": 0.321, "step": 3980 }, { "epoch": 0.6976134277471807, "grad_norm": 3.2602617740631104, "learning_rate": 3.0866381582801244e-05, "loss": 0.3351, "step": 3990 }, { "epoch": 0.6993618323280008, "grad_norm": 2.974976062774658, "learning_rate": 3.084295003221839e-05, "loss": 0.3029, "step": 4000 }, { "epoch": 0.7011102369088207, "grad_norm": 4.38007926940918, "learning_rate": 3.0819518481635525e-05, "loss": 0.3487, "step": 4010 }, { "epoch": 0.7028586414896407, "grad_norm": 3.0209977626800537, "learning_rate": 3.079608693105267e-05, "loss": 0.3643, "step": 4020 }, { "epoch": 0.7046070460704607, "grad_norm": 3.7745320796966553, "learning_rate": 3.0772655380469806e-05, "loss": 0.3621, "step": 4030 }, { "epoch": 0.7063554506512807, "grad_norm": 2.3382375240325928, "learning_rate": 3.074922382988695e-05, "loss": 0.3187, "step": 4040 }, { "epoch": 0.7081038552321007, "grad_norm": 4.3452558517456055, "learning_rate": 3.072579227930409e-05, "loss": 0.3314, "step": 4050 }, { "epoch": 0.7098522598129207, "grad_norm": 4.7545695304870605, "learning_rate": 3.0702360728721225e-05, "loss": 0.3112, "step": 4060 }, { "epoch": 0.7116006643937407, "grad_norm": 1.974089503288269, "learning_rate": 3.067892917813837e-05, "loss": 0.3323, "step": 4070 }, { "epoch": 0.7133490689745607, "grad_norm": 9.842713356018066, "learning_rate": 3.0655497627555506e-05, "loss": 0.3103, "step": 4080 }, { "epoch": 0.7150974735553807, "grad_norm": 2.8412296772003174, "learning_rate": 3.063206607697264e-05, "loss": 0.3463, "step": 4090 }, { "epoch": 0.7168458781362007, "grad_norm": 2.391716480255127, "learning_rate": 3.060863452638979e-05, "loss": 0.3147, "step": 4100 }, { "epoch": 0.7185942827170207, "grad_norm": 3.2871251106262207, "learning_rate": 3.0585202975806924e-05, "loss": 0.329, "step": 4110 }, { "epoch": 0.7203426872978407, "grad_norm": 3.405353307723999, "learning_rate": 3.056177142522406e-05, "loss": 0.3005, "step": 4120 }, { "epoch": 0.7220910918786607, "grad_norm": 3.316866636276245, "learning_rate": 3.0538339874641206e-05, "loss": 0.3368, "step": 4130 }, { "epoch": 0.7238394964594808, "grad_norm": 2.363496780395508, "learning_rate": 3.0517251479116634e-05, "loss": 0.3416, "step": 4140 }, { "epoch": 0.7255879010403007, "grad_norm": 4.142341136932373, "learning_rate": 3.0493819928533774e-05, "loss": 0.3067, "step": 4150 }, { "epoch": 0.7273363056211207, "grad_norm": 2.67199444770813, "learning_rate": 3.0470388377950915e-05, "loss": 0.3286, "step": 4160 }, { "epoch": 0.7290847102019408, "grad_norm": 1.8342429399490356, "learning_rate": 3.0446956827368056e-05, "loss": 0.344, "step": 4170 }, { "epoch": 0.7308331147827607, "grad_norm": 3.3271865844726562, "learning_rate": 3.0423525276785196e-05, "loss": 0.3116, "step": 4180 }, { "epoch": 0.7325815193635807, "grad_norm": 3.249887466430664, "learning_rate": 3.0400093726202337e-05, "loss": 0.3075, "step": 4190 }, { "epoch": 0.7343299239444008, "grad_norm": 3.5678811073303223, "learning_rate": 3.0376662175619477e-05, "loss": 0.3138, "step": 4200 }, { "epoch": 0.7360783285252207, "grad_norm": 3.5020499229431152, "learning_rate": 3.03555737800949e-05, "loss": 0.3042, "step": 4210 }, { "epoch": 0.7378267331060407, "grad_norm": 4.059022903442383, "learning_rate": 3.033214222951204e-05, "loss": 0.3776, "step": 4220 }, { "epoch": 0.7395751376868608, "grad_norm": 2.5982818603515625, "learning_rate": 3.0311053833987464e-05, "loss": 0.3309, "step": 4230 }, { "epoch": 0.7413235422676807, "grad_norm": 2.723339557647705, "learning_rate": 3.0287622283404605e-05, "loss": 0.3199, "step": 4240 }, { "epoch": 0.7430719468485008, "grad_norm": 2.9116008281707764, "learning_rate": 3.0264190732821745e-05, "loss": 0.3283, "step": 4250 }, { "epoch": 0.7448203514293208, "grad_norm": 2.722931385040283, "learning_rate": 3.0240759182238886e-05, "loss": 0.3321, "step": 4260 }, { "epoch": 0.7465687560101407, "grad_norm": 2.779557228088379, "learning_rate": 3.0217327631656026e-05, "loss": 0.3497, "step": 4270 }, { "epoch": 0.7483171605909608, "grad_norm": 3.886526346206665, "learning_rate": 3.0193896081073167e-05, "loss": 0.3274, "step": 4280 }, { "epoch": 0.7500655651717808, "grad_norm": 2.644566059112549, "learning_rate": 3.0170464530490308e-05, "loss": 0.28, "step": 4290 }, { "epoch": 0.7518139697526007, "grad_norm": 3.2041871547698975, "learning_rate": 3.0147032979907448e-05, "loss": 0.3403, "step": 4300 }, { "epoch": 0.7535623743334208, "grad_norm": 3.1360421180725098, "learning_rate": 3.0123601429324585e-05, "loss": 0.3318, "step": 4310 }, { "epoch": 0.7553107789142408, "grad_norm": 3.71610164642334, "learning_rate": 3.0100169878741726e-05, "loss": 0.3425, "step": 4320 }, { "epoch": 0.7570591834950607, "grad_norm": 3.1907827854156494, "learning_rate": 3.0076738328158867e-05, "loss": 0.325, "step": 4330 }, { "epoch": 0.7588075880758808, "grad_norm": 3.3181164264678955, "learning_rate": 3.0053306777576007e-05, "loss": 0.3117, "step": 4340 }, { "epoch": 0.7605559926567007, "grad_norm": 2.3667750358581543, "learning_rate": 3.0029875226993148e-05, "loss": 0.2832, "step": 4350 }, { "epoch": 0.7623043972375207, "grad_norm": 3.1936376094818115, "learning_rate": 3.000644367641029e-05, "loss": 0.3561, "step": 4360 }, { "epoch": 0.7640528018183408, "grad_norm": 2.5650253295898438, "learning_rate": 2.998301212582743e-05, "loss": 0.3113, "step": 4370 }, { "epoch": 0.7658012063991607, "grad_norm": 4.8962082862854, "learning_rate": 2.995958057524457e-05, "loss": 0.3126, "step": 4380 }, { "epoch": 0.7675496109799808, "grad_norm": 2.574734926223755, "learning_rate": 2.993614902466171e-05, "loss": 0.3395, "step": 4390 }, { "epoch": 0.7692980155608008, "grad_norm": 2.9846832752227783, "learning_rate": 2.991271747407885e-05, "loss": 0.2877, "step": 4400 }, { "epoch": 0.7710464201416207, "grad_norm": 2.8485524654388428, "learning_rate": 2.988928592349599e-05, "loss": 0.3293, "step": 4410 }, { "epoch": 0.7727948247224408, "grad_norm": 3.241642475128174, "learning_rate": 2.986585437291313e-05, "loss": 0.3082, "step": 4420 }, { "epoch": 0.7745432293032608, "grad_norm": 3.4064993858337402, "learning_rate": 2.984242282233027e-05, "loss": 0.3417, "step": 4430 }, { "epoch": 0.7762916338840807, "grad_norm": 2.7267072200775146, "learning_rate": 2.981899127174741e-05, "loss": 0.2924, "step": 4440 }, { "epoch": 0.7780400384649008, "grad_norm": 4.6471266746521, "learning_rate": 2.979555972116455e-05, "loss": 0.3543, "step": 4450 }, { "epoch": 0.7797884430457208, "grad_norm": 3.8437321186065674, "learning_rate": 2.977212817058169e-05, "loss": 0.2831, "step": 4460 }, { "epoch": 0.7815368476265407, "grad_norm": 2.720120429992676, "learning_rate": 2.974869661999883e-05, "loss": 0.3661, "step": 4470 }, { "epoch": 0.7832852522073608, "grad_norm": 7.060765743255615, "learning_rate": 2.9725265069415972e-05, "loss": 0.3187, "step": 4480 }, { "epoch": 0.7850336567881808, "grad_norm": 3.308164358139038, "learning_rate": 2.9701833518833113e-05, "loss": 0.2999, "step": 4490 }, { "epoch": 0.7867820613690008, "grad_norm": 2.223618745803833, "learning_rate": 2.9678401968250253e-05, "loss": 0.2943, "step": 4500 }, { "epoch": 0.7885304659498208, "grad_norm": 2.492687463760376, "learning_rate": 2.9654970417667394e-05, "loss": 0.2852, "step": 4510 }, { "epoch": 0.7902788705306408, "grad_norm": 2.5673835277557373, "learning_rate": 2.9631538867084534e-05, "loss": 0.3437, "step": 4520 }, { "epoch": 0.7920272751114608, "grad_norm": 2.192340850830078, "learning_rate": 2.960810731650167e-05, "loss": 0.3324, "step": 4530 }, { "epoch": 0.7937756796922808, "grad_norm": 4.07915735244751, "learning_rate": 2.9584675765918812e-05, "loss": 0.2924, "step": 4540 }, { "epoch": 0.7955240842731008, "grad_norm": 2.894836187362671, "learning_rate": 2.9561244215335953e-05, "loss": 0.2772, "step": 4550 }, { "epoch": 0.7972724888539208, "grad_norm": 5.969550132751465, "learning_rate": 2.9537812664753093e-05, "loss": 0.2964, "step": 4560 }, { "epoch": 0.7990208934347408, "grad_norm": 3.9820101261138916, "learning_rate": 2.9514381114170234e-05, "loss": 0.3054, "step": 4570 }, { "epoch": 0.8007692980155608, "grad_norm": 3.4480111598968506, "learning_rate": 2.9490949563587374e-05, "loss": 0.3456, "step": 4580 }, { "epoch": 0.8025177025963808, "grad_norm": 2.6892831325531006, "learning_rate": 2.9467518013004515e-05, "loss": 0.2845, "step": 4590 }, { "epoch": 0.8042661071772008, "grad_norm": 3.488251209259033, "learning_rate": 2.9444086462421656e-05, "loss": 0.311, "step": 4600 }, { "epoch": 0.8060145117580209, "grad_norm": 2.8509933948516846, "learning_rate": 2.9420654911838796e-05, "loss": 0.3282, "step": 4610 }, { "epoch": 0.8077629163388408, "grad_norm": 2.416158676147461, "learning_rate": 2.9397223361255937e-05, "loss": 0.3229, "step": 4620 }, { "epoch": 0.8095113209196608, "grad_norm": 3.2727701663970947, "learning_rate": 2.9373791810673077e-05, "loss": 0.2934, "step": 4630 }, { "epoch": 0.8112597255004809, "grad_norm": 3.619755268096924, "learning_rate": 2.9350360260090215e-05, "loss": 0.3367, "step": 4640 }, { "epoch": 0.8130081300813008, "grad_norm": 2.848443031311035, "learning_rate": 2.9326928709507355e-05, "loss": 0.3027, "step": 4650 }, { "epoch": 0.8147565346621208, "grad_norm": 2.9893646240234375, "learning_rate": 2.9303497158924492e-05, "loss": 0.2942, "step": 4660 }, { "epoch": 0.8165049392429408, "grad_norm": 2.021780014038086, "learning_rate": 2.9280065608341633e-05, "loss": 0.3451, "step": 4670 }, { "epoch": 0.8182533438237608, "grad_norm": 2.3608715534210205, "learning_rate": 2.9256634057758774e-05, "loss": 0.2945, "step": 4680 }, { "epoch": 0.8200017484045808, "grad_norm": 2.520504951477051, "learning_rate": 2.9233202507175914e-05, "loss": 0.3133, "step": 4690 }, { "epoch": 0.8217501529854008, "grad_norm": 2.2169888019561768, "learning_rate": 2.9209770956593055e-05, "loss": 0.2961, "step": 4700 }, { "epoch": 0.8234985575662208, "grad_norm": 2.2153303623199463, "learning_rate": 2.9186339406010192e-05, "loss": 0.3015, "step": 4710 }, { "epoch": 0.8252469621470409, "grad_norm": 3.4472455978393555, "learning_rate": 2.9162907855427332e-05, "loss": 0.2829, "step": 4720 }, { "epoch": 0.8269953667278608, "grad_norm": 2.900556802749634, "learning_rate": 2.9139476304844473e-05, "loss": 0.3393, "step": 4730 }, { "epoch": 0.8287437713086808, "grad_norm": 5.161322593688965, "learning_rate": 2.9116044754261614e-05, "loss": 0.2681, "step": 4740 }, { "epoch": 0.8304921758895009, "grad_norm": 2.231976270675659, "learning_rate": 2.9092613203678754e-05, "loss": 0.3025, "step": 4750 }, { "epoch": 0.8322405804703208, "grad_norm": 3.7248172760009766, "learning_rate": 2.9069181653095895e-05, "loss": 0.325, "step": 4760 }, { "epoch": 0.8339889850511408, "grad_norm": 2.6625943183898926, "learning_rate": 2.9045750102513035e-05, "loss": 0.3331, "step": 4770 }, { "epoch": 0.8357373896319609, "grad_norm": 4.35618782043457, "learning_rate": 2.9022318551930176e-05, "loss": 0.3164, "step": 4780 }, { "epoch": 0.8374857942127808, "grad_norm": 3.5341124534606934, "learning_rate": 2.8998887001347317e-05, "loss": 0.3171, "step": 4790 }, { "epoch": 0.8392341987936008, "grad_norm": 2.554603099822998, "learning_rate": 2.8975455450764457e-05, "loss": 0.2849, "step": 4800 }, { "epoch": 0.8409826033744209, "grad_norm": 4.446366310119629, "learning_rate": 2.8952023900181598e-05, "loss": 0.3221, "step": 4810 }, { "epoch": 0.8427310079552408, "grad_norm": 2.116299629211426, "learning_rate": 2.8928592349598735e-05, "loss": 0.2904, "step": 4820 }, { "epoch": 0.8444794125360608, "grad_norm": 2.078610897064209, "learning_rate": 2.8905160799015876e-05, "loss": 0.3047, "step": 4830 }, { "epoch": 0.8462278171168809, "grad_norm": 2.4735875129699707, "learning_rate": 2.8881729248433016e-05, "loss": 0.29, "step": 4840 }, { "epoch": 0.8479762216977008, "grad_norm": 2.858583688735962, "learning_rate": 2.8858297697850157e-05, "loss": 0.3244, "step": 4850 }, { "epoch": 0.8497246262785209, "grad_norm": 2.644150733947754, "learning_rate": 2.8834866147267297e-05, "loss": 0.3133, "step": 4860 }, { "epoch": 0.8514730308593409, "grad_norm": 3.146573305130005, "learning_rate": 2.8811434596684438e-05, "loss": 0.2714, "step": 4870 }, { "epoch": 0.8532214354401608, "grad_norm": 3.7888834476470947, "learning_rate": 2.878800304610158e-05, "loss": 0.2532, "step": 4880 }, { "epoch": 0.8549698400209809, "grad_norm": 2.7797985076904297, "learning_rate": 2.876457149551872e-05, "loss": 0.3136, "step": 4890 }, { "epoch": 0.8567182446018009, "grad_norm": 2.3448262214660645, "learning_rate": 2.874113994493586e-05, "loss": 0.2965, "step": 4900 }, { "epoch": 0.8584666491826208, "grad_norm": 3.9640755653381348, "learning_rate": 2.8717708394353e-05, "loss": 0.2687, "step": 4910 }, { "epoch": 0.8602150537634409, "grad_norm": 2.2898406982421875, "learning_rate": 2.8694276843770137e-05, "loss": 0.3079, "step": 4920 }, { "epoch": 0.8619634583442609, "grad_norm": 2.158766508102417, "learning_rate": 2.8670845293187278e-05, "loss": 0.3238, "step": 4930 }, { "epoch": 0.8637118629250808, "grad_norm": 2.283958911895752, "learning_rate": 2.864741374260442e-05, "loss": 0.2864, "step": 4940 }, { "epoch": 0.8654602675059009, "grad_norm": 2.1821975708007812, "learning_rate": 2.862398219202156e-05, "loss": 0.2885, "step": 4950 }, { "epoch": 0.8672086720867209, "grad_norm": 3.1854681968688965, "learning_rate": 2.86005506414387e-05, "loss": 0.2939, "step": 4960 }, { "epoch": 0.8689570766675409, "grad_norm": 3.484039306640625, "learning_rate": 2.857711909085584e-05, "loss": 0.302, "step": 4970 }, { "epoch": 0.8707054812483609, "grad_norm": 2.4981396198272705, "learning_rate": 2.855368754027298e-05, "loss": 0.2504, "step": 4980 }, { "epoch": 0.8724538858291808, "grad_norm": 2.401259422302246, "learning_rate": 2.853025598969012e-05, "loss": 0.3036, "step": 4990 }, { "epoch": 0.8742022904100009, "grad_norm": 2.8834569454193115, "learning_rate": 2.8506824439107262e-05, "loss": 0.2515, "step": 5000 }, { "epoch": 0.8759506949908209, "grad_norm": 2.483109474182129, "learning_rate": 2.8483392888524403e-05, "loss": 0.2575, "step": 5010 }, { "epoch": 0.8776990995716408, "grad_norm": 4.260477542877197, "learning_rate": 2.8459961337941543e-05, "loss": 0.322, "step": 5020 }, { "epoch": 0.8794475041524609, "grad_norm": 2.232416868209839, "learning_rate": 2.843652978735868e-05, "loss": 0.3196, "step": 5030 }, { "epoch": 0.8811959087332809, "grad_norm": 4.053016185760498, "learning_rate": 2.841309823677582e-05, "loss": 0.2948, "step": 5040 }, { "epoch": 0.8829443133141008, "grad_norm": 3.5699994564056396, "learning_rate": 2.838966668619296e-05, "loss": 0.2459, "step": 5050 }, { "epoch": 0.8846927178949209, "grad_norm": 2.5316355228424072, "learning_rate": 2.8366235135610102e-05, "loss": 0.2907, "step": 5060 }, { "epoch": 0.8864411224757409, "grad_norm": 3.3079192638397217, "learning_rate": 2.8342803585027243e-05, "loss": 0.335, "step": 5070 }, { "epoch": 0.8881895270565608, "grad_norm": 3.4846723079681396, "learning_rate": 2.8319372034444383e-05, "loss": 0.3312, "step": 5080 }, { "epoch": 0.8899379316373809, "grad_norm": 2.5676932334899902, "learning_rate": 2.8295940483861524e-05, "loss": 0.2999, "step": 5090 }, { "epoch": 0.8916863362182009, "grad_norm": 3.0182511806488037, "learning_rate": 2.8272508933278665e-05, "loss": 0.291, "step": 5100 }, { "epoch": 0.8934347407990209, "grad_norm": 2.7049145698547363, "learning_rate": 2.8249077382695805e-05, "loss": 0.2789, "step": 5110 }, { "epoch": 0.8951831453798409, "grad_norm": 4.516125202178955, "learning_rate": 2.8225645832112946e-05, "loss": 0.3439, "step": 5120 }, { "epoch": 0.8969315499606609, "grad_norm": 2.7721059322357178, "learning_rate": 2.820221428153008e-05, "loss": 0.2949, "step": 5130 }, { "epoch": 0.8986799545414809, "grad_norm": 3.285956859588623, "learning_rate": 2.817878273094722e-05, "loss": 0.2931, "step": 5140 }, { "epoch": 0.9004283591223009, "grad_norm": 2.932749032974243, "learning_rate": 2.815535118036436e-05, "loss": 0.223, "step": 5150 }, { "epoch": 0.9021767637031209, "grad_norm": 2.5357871055603027, "learning_rate": 2.81319196297815e-05, "loss": 0.2869, "step": 5160 }, { "epoch": 0.9039251682839409, "grad_norm": 4.862621784210205, "learning_rate": 2.8108488079198642e-05, "loss": 0.3179, "step": 5170 }, { "epoch": 0.9056735728647609, "grad_norm": 3.4447200298309326, "learning_rate": 2.8085056528615782e-05, "loss": 0.3325, "step": 5180 }, { "epoch": 0.907421977445581, "grad_norm": 4.216275691986084, "learning_rate": 2.8061624978032923e-05, "loss": 0.3122, "step": 5190 }, { "epoch": 0.9091703820264009, "grad_norm": 2.99465012550354, "learning_rate": 2.8038193427450064e-05, "loss": 0.335, "step": 5200 }, { "epoch": 0.9109187866072209, "grad_norm": 2.5309603214263916, "learning_rate": 2.80147618768672e-05, "loss": 0.261, "step": 5210 }, { "epoch": 0.912667191188041, "grad_norm": 3.232043504714966, "learning_rate": 2.799133032628434e-05, "loss": 0.3165, "step": 5220 }, { "epoch": 0.9144155957688609, "grad_norm": 2.638690948486328, "learning_rate": 2.7967898775701482e-05, "loss": 0.303, "step": 5230 }, { "epoch": 0.9161640003496809, "grad_norm": 2.458595037460327, "learning_rate": 2.7944467225118623e-05, "loss": 0.263, "step": 5240 }, { "epoch": 0.917912404930501, "grad_norm": 2.526832342147827, "learning_rate": 2.7921035674535763e-05, "loss": 0.2814, "step": 5250 }, { "epoch": 0.9196608095113209, "grad_norm": 3.0694055557250977, "learning_rate": 2.7897604123952904e-05, "loss": 0.2664, "step": 5260 }, { "epoch": 0.9214092140921409, "grad_norm": 3.2216978073120117, "learning_rate": 2.7874172573370044e-05, "loss": 0.2943, "step": 5270 }, { "epoch": 0.923157618672961, "grad_norm": 2.958218574523926, "learning_rate": 2.7850741022787185e-05, "loss": 0.2932, "step": 5280 }, { "epoch": 0.9249060232537809, "grad_norm": 3.8881359100341797, "learning_rate": 2.7827309472204326e-05, "loss": 0.2968, "step": 5290 }, { "epoch": 0.926654427834601, "grad_norm": 2.983222723007202, "learning_rate": 2.7803877921621466e-05, "loss": 0.2567, "step": 5300 }, { "epoch": 0.9284028324154209, "grad_norm": 2.482820749282837, "learning_rate": 2.7780446371038607e-05, "loss": 0.3172, "step": 5310 }, { "epoch": 0.9301512369962409, "grad_norm": 2.0078659057617188, "learning_rate": 2.7757014820455744e-05, "loss": 0.2927, "step": 5320 }, { "epoch": 0.931899641577061, "grad_norm": 3.8458902835845947, "learning_rate": 2.7733583269872884e-05, "loss": 0.2618, "step": 5330 }, { "epoch": 0.9336480461578809, "grad_norm": 4.100974082946777, "learning_rate": 2.7710151719290025e-05, "loss": 0.2704, "step": 5340 }, { "epoch": 0.9353964507387009, "grad_norm": 4.017348766326904, "learning_rate": 2.7686720168707166e-05, "loss": 0.2909, "step": 5350 }, { "epoch": 0.937144855319521, "grad_norm": 2.4890189170837402, "learning_rate": 2.7663288618124306e-05, "loss": 0.3533, "step": 5360 }, { "epoch": 0.9388932599003409, "grad_norm": 2.674192190170288, "learning_rate": 2.7639857067541447e-05, "loss": 0.2778, "step": 5370 }, { "epoch": 0.9406416644811609, "grad_norm": 2.096602439880371, "learning_rate": 2.7616425516958587e-05, "loss": 0.2865, "step": 5380 }, { "epoch": 0.942390069061981, "grad_norm": 3.5576303005218506, "learning_rate": 2.7592993966375728e-05, "loss": 0.2894, "step": 5390 }, { "epoch": 0.9441384736428009, "grad_norm": 1.878792643547058, "learning_rate": 2.756956241579287e-05, "loss": 0.2778, "step": 5400 }, { "epoch": 0.9458868782236209, "grad_norm": 3.251866579055786, "learning_rate": 2.754613086521001e-05, "loss": 0.2895, "step": 5410 }, { "epoch": 0.947635282804441, "grad_norm": 3.257899522781372, "learning_rate": 2.752269931462715e-05, "loss": 0.3125, "step": 5420 }, { "epoch": 0.9493836873852609, "grad_norm": 2.863107442855835, "learning_rate": 2.7499267764044287e-05, "loss": 0.2414, "step": 5430 }, { "epoch": 0.951132091966081, "grad_norm": 3.4100799560546875, "learning_rate": 2.7475836213461428e-05, "loss": 0.2866, "step": 5440 }, { "epoch": 0.952880496546901, "grad_norm": 2.9259746074676514, "learning_rate": 2.7452404662878568e-05, "loss": 0.2912, "step": 5450 }, { "epoch": 0.9546289011277209, "grad_norm": 3.5859031677246094, "learning_rate": 2.742897311229571e-05, "loss": 0.2945, "step": 5460 }, { "epoch": 0.956377305708541, "grad_norm": 2.341395378112793, "learning_rate": 2.740554156171285e-05, "loss": 0.29, "step": 5470 }, { "epoch": 0.958125710289361, "grad_norm": 4.96193265914917, "learning_rate": 2.738211001112999e-05, "loss": 0.2655, "step": 5480 }, { "epoch": 0.9598741148701809, "grad_norm": 2.6068100929260254, "learning_rate": 2.735867846054713e-05, "loss": 0.2954, "step": 5490 }, { "epoch": 0.961622519451001, "grad_norm": 2.946169137954712, "learning_rate": 2.733524690996427e-05, "loss": 0.2622, "step": 5500 }, { "epoch": 0.963370924031821, "grad_norm": 2.533318042755127, "learning_rate": 2.731181535938141e-05, "loss": 0.2773, "step": 5510 }, { "epoch": 0.9651193286126409, "grad_norm": 3.2419304847717285, "learning_rate": 2.7288383808798552e-05, "loss": 0.2755, "step": 5520 }, { "epoch": 0.966867733193461, "grad_norm": 6.12715482711792, "learning_rate": 2.7264952258215693e-05, "loss": 0.2713, "step": 5530 }, { "epoch": 0.968616137774281, "grad_norm": 6.266939640045166, "learning_rate": 2.724152070763283e-05, "loss": 0.2615, "step": 5540 }, { "epoch": 0.970364542355101, "grad_norm": 2.3439266681671143, "learning_rate": 2.721808915704997e-05, "loss": 0.2522, "step": 5550 }, { "epoch": 0.972112946935921, "grad_norm": 3.9679603576660156, "learning_rate": 2.719465760646711e-05, "loss": 0.2881, "step": 5560 }, { "epoch": 0.973861351516741, "grad_norm": 4.07214879989624, "learning_rate": 2.7171226055884252e-05, "loss": 0.31, "step": 5570 }, { "epoch": 0.975609756097561, "grad_norm": 1.8089581727981567, "learning_rate": 2.7147794505301392e-05, "loss": 0.2693, "step": 5580 }, { "epoch": 0.977358160678381, "grad_norm": 2.350628137588501, "learning_rate": 2.7124362954718533e-05, "loss": 0.2936, "step": 5590 }, { "epoch": 0.979106565259201, "grad_norm": 2.955479383468628, "learning_rate": 2.7100931404135674e-05, "loss": 0.2859, "step": 5600 }, { "epoch": 0.980854969840021, "grad_norm": 2.559128522872925, "learning_rate": 2.7077499853552807e-05, "loss": 0.2941, "step": 5610 }, { "epoch": 0.982603374420841, "grad_norm": 4.118138313293457, "learning_rate": 2.7054068302969948e-05, "loss": 0.2653, "step": 5620 }, { "epoch": 0.9843517790016609, "grad_norm": 2.770746946334839, "learning_rate": 2.703063675238709e-05, "loss": 0.2511, "step": 5630 }, { "epoch": 0.986100183582481, "grad_norm": 2.902510404586792, "learning_rate": 2.700720520180423e-05, "loss": 0.2458, "step": 5640 }, { "epoch": 0.987848588163301, "grad_norm": 2.2082626819610596, "learning_rate": 2.698377365122137e-05, "loss": 0.2704, "step": 5650 }, { "epoch": 0.9895969927441209, "grad_norm": 3.753960371017456, "learning_rate": 2.696034210063851e-05, "loss": 0.2993, "step": 5660 }, { "epoch": 0.991345397324941, "grad_norm": 2.009828567504883, "learning_rate": 2.693691055005565e-05, "loss": 0.2383, "step": 5670 }, { "epoch": 0.993093801905761, "grad_norm": 2.961215019226074, "learning_rate": 2.691347899947279e-05, "loss": 0.2761, "step": 5680 }, { "epoch": 0.994842206486581, "grad_norm": 4.399358749389648, "learning_rate": 2.6890047448889932e-05, "loss": 0.2597, "step": 5690 }, { "epoch": 0.996590611067401, "grad_norm": 2.652677059173584, "learning_rate": 2.6866615898307073e-05, "loss": 0.2983, "step": 5700 }, { "epoch": 0.998339015648221, "grad_norm": 2.0003232955932617, "learning_rate": 2.6843184347724213e-05, "loss": 0.2542, "step": 5710 }, { "epoch": 1.000087420229041, "grad_norm": 2.8039956092834473, "learning_rate": 2.681975279714135e-05, "loss": 0.264, "step": 5720 }, { "epoch": 1.001835824809861, "grad_norm": 2.23201322555542, "learning_rate": 2.679632124655849e-05, "loss": 0.2583, "step": 5730 }, { "epoch": 1.003584229390681, "grad_norm": 3.3724849224090576, "learning_rate": 2.677288969597563e-05, "loss": 0.2173, "step": 5740 }, { "epoch": 1.005332633971501, "grad_norm": 2.6445441246032715, "learning_rate": 2.6749458145392772e-05, "loss": 0.2508, "step": 5750 }, { "epoch": 1.007081038552321, "grad_norm": 2.4510304927825928, "learning_rate": 2.6726026594809913e-05, "loss": 0.2326, "step": 5760 }, { "epoch": 1.008829443133141, "grad_norm": 2.9268946647644043, "learning_rate": 2.6702595044227053e-05, "loss": 0.2106, "step": 5770 }, { "epoch": 1.010577847713961, "grad_norm": 2.347891330718994, "learning_rate": 2.6679163493644194e-05, "loss": 0.2641, "step": 5780 }, { "epoch": 1.012326252294781, "grad_norm": 3.352431535720825, "learning_rate": 2.6655731943061334e-05, "loss": 0.2473, "step": 5790 }, { "epoch": 1.014074656875601, "grad_norm": 1.9598578214645386, "learning_rate": 2.6632300392478475e-05, "loss": 0.2164, "step": 5800 }, { "epoch": 1.015823061456421, "grad_norm": 1.9211621284484863, "learning_rate": 2.6608868841895616e-05, "loss": 0.211, "step": 5810 }, { "epoch": 1.017571466037241, "grad_norm": 2.0851757526397705, "learning_rate": 2.6585437291312756e-05, "loss": 0.2116, "step": 5820 }, { "epoch": 1.019319870618061, "grad_norm": 2.829580783843994, "learning_rate": 2.6562005740729893e-05, "loss": 0.2404, "step": 5830 }, { "epoch": 1.021068275198881, "grad_norm": 3.872819185256958, "learning_rate": 2.6538574190147034e-05, "loss": 0.2386, "step": 5840 }, { "epoch": 1.022816679779701, "grad_norm": 2.6188647747039795, "learning_rate": 2.6515142639564175e-05, "loss": 0.2521, "step": 5850 }, { "epoch": 1.024565084360521, "grad_norm": 2.390606164932251, "learning_rate": 2.6491711088981315e-05, "loss": 0.2205, "step": 5860 }, { "epoch": 1.026313488941341, "grad_norm": 3.985508918762207, "learning_rate": 2.6468279538398456e-05, "loss": 0.2025, "step": 5870 }, { "epoch": 1.028061893522161, "grad_norm": 2.431910753250122, "learning_rate": 2.6444847987815596e-05, "loss": 0.2647, "step": 5880 }, { "epoch": 1.029810298102981, "grad_norm": 2.83016300201416, "learning_rate": 2.6421416437232737e-05, "loss": 0.2238, "step": 5890 }, { "epoch": 1.0315587026838011, "grad_norm": 2.0961086750030518, "learning_rate": 2.6397984886649878e-05, "loss": 0.232, "step": 5900 }, { "epoch": 1.033307107264621, "grad_norm": 2.8335044384002686, "learning_rate": 2.6374553336067018e-05, "loss": 0.237, "step": 5910 }, { "epoch": 1.035055511845441, "grad_norm": 3.199272871017456, "learning_rate": 2.635112178548416e-05, "loss": 0.1832, "step": 5920 }, { "epoch": 1.036803916426261, "grad_norm": 3.307910919189453, "learning_rate": 2.6327690234901296e-05, "loss": 0.2298, "step": 5930 }, { "epoch": 1.038552321007081, "grad_norm": 5.532860279083252, "learning_rate": 2.6304258684318436e-05, "loss": 0.2429, "step": 5940 }, { "epoch": 1.040300725587901, "grad_norm": 2.590127468109131, "learning_rate": 2.6280827133735577e-05, "loss": 0.2157, "step": 5950 }, { "epoch": 1.0420491301687211, "grad_norm": 2.8453683853149414, "learning_rate": 2.6257395583152718e-05, "loss": 0.2553, "step": 5960 }, { "epoch": 1.043797534749541, "grad_norm": 1.8200639486312866, "learning_rate": 2.6233964032569858e-05, "loss": 0.2335, "step": 5970 }, { "epoch": 1.045545939330361, "grad_norm": 2.9046294689178467, "learning_rate": 2.6210532481987e-05, "loss": 0.247, "step": 5980 }, { "epoch": 1.047294343911181, "grad_norm": 5.429454326629639, "learning_rate": 2.618710093140414e-05, "loss": 0.2496, "step": 5990 }, { "epoch": 1.049042748492001, "grad_norm": 5.531388282775879, "learning_rate": 2.616366938082128e-05, "loss": 0.2569, "step": 6000 }, { "epoch": 1.0493924294081651, "eval_loss": 0.41312137246131897, "eval_runtime": 1800.1269, "eval_samples_per_second": 8.045, "eval_steps_per_second": 1.006, "step": 6002 }, { "epoch": 1.0507911530728211, "grad_norm": 1.860955834388733, "learning_rate": 2.614023783023842e-05, "loss": 0.2275, "step": 6010 }, { "epoch": 1.0525395576536412, "grad_norm": 2.9169716835021973, "learning_rate": 2.611680627965556e-05, "loss": 0.2302, "step": 6020 }, { "epoch": 1.054287962234461, "grad_norm": 2.0836966037750244, "learning_rate": 2.6093374729072702e-05, "loss": 0.2396, "step": 6030 }, { "epoch": 1.056036366815281, "grad_norm": 1.6626900434494019, "learning_rate": 2.606994317848984e-05, "loss": 0.2381, "step": 6040 }, { "epoch": 1.057784771396101, "grad_norm": 2.771097421646118, "learning_rate": 2.604651162790698e-05, "loss": 0.2185, "step": 6050 }, { "epoch": 1.059533175976921, "grad_norm": 3.489532232284546, "learning_rate": 2.602308007732412e-05, "loss": 0.2591, "step": 6060 }, { "epoch": 1.0612815805577411, "grad_norm": 2.6725568771362305, "learning_rate": 2.599964852674126e-05, "loss": 0.2444, "step": 6070 }, { "epoch": 1.0630299851385612, "grad_norm": 1.9345289468765259, "learning_rate": 2.59762169761584e-05, "loss": 0.2628, "step": 6080 }, { "epoch": 1.064778389719381, "grad_norm": 2.020622491836548, "learning_rate": 2.595278542557554e-05, "loss": 0.2586, "step": 6090 }, { "epoch": 1.066526794300201, "grad_norm": 2.4979703426361084, "learning_rate": 2.592935387499268e-05, "loss": 0.2441, "step": 6100 }, { "epoch": 1.068275198881021, "grad_norm": 2.5634591579437256, "learning_rate": 2.5905922324409816e-05, "loss": 0.2361, "step": 6110 }, { "epoch": 1.070023603461841, "grad_norm": 2.3151049613952637, "learning_rate": 2.5882490773826957e-05, "loss": 0.2465, "step": 6120 }, { "epoch": 1.0717720080426612, "grad_norm": 1.7265106439590454, "learning_rate": 2.5859059223244097e-05, "loss": 0.2395, "step": 6130 }, { "epoch": 1.073520412623481, "grad_norm": 2.878922462463379, "learning_rate": 2.5835627672661238e-05, "loss": 0.2371, "step": 6140 }, { "epoch": 1.075268817204301, "grad_norm": 3.5647659301757812, "learning_rate": 2.581219612207838e-05, "loss": 0.218, "step": 6150 }, { "epoch": 1.077017221785121, "grad_norm": 3.4102213382720947, "learning_rate": 2.578876457149552e-05, "loss": 0.2137, "step": 6160 }, { "epoch": 1.078765626365941, "grad_norm": 4.656369209289551, "learning_rate": 2.576533302091266e-05, "loss": 0.2058, "step": 6170 }, { "epoch": 1.0805140309467611, "grad_norm": 2.1041653156280518, "learning_rate": 2.57419014703298e-05, "loss": 0.2067, "step": 6180 }, { "epoch": 1.0822624355275812, "grad_norm": 2.1586110591888428, "learning_rate": 2.571846991974694e-05, "loss": 0.2056, "step": 6190 }, { "epoch": 1.084010840108401, "grad_norm": 1.9681655168533325, "learning_rate": 2.569503836916408e-05, "loss": 0.184, "step": 6200 }, { "epoch": 1.085759244689221, "grad_norm": 2.8586220741271973, "learning_rate": 2.5671606818581222e-05, "loss": 0.2226, "step": 6210 }, { "epoch": 1.087507649270041, "grad_norm": 7.736782073974609, "learning_rate": 2.564817526799836e-05, "loss": 0.1919, "step": 6220 }, { "epoch": 1.089256053850861, "grad_norm": 3.2476119995117188, "learning_rate": 2.56247437174155e-05, "loss": 0.2489, "step": 6230 }, { "epoch": 1.0910044584316811, "grad_norm": 3.7844748497009277, "learning_rate": 2.560131216683264e-05, "loss": 0.2415, "step": 6240 }, { "epoch": 1.0927528630125012, "grad_norm": 1.4511767625808716, "learning_rate": 2.557788061624978e-05, "loss": 0.2556, "step": 6250 }, { "epoch": 1.094501267593321, "grad_norm": 4.27903938293457, "learning_rate": 2.555444906566692e-05, "loss": 0.2551, "step": 6260 }, { "epoch": 1.096249672174141, "grad_norm": 3.3497371673583984, "learning_rate": 2.5531017515084062e-05, "loss": 0.2377, "step": 6270 }, { "epoch": 1.097998076754961, "grad_norm": 2.4137842655181885, "learning_rate": 2.5507585964501203e-05, "loss": 0.2338, "step": 6280 }, { "epoch": 1.0997464813357811, "grad_norm": 2.213383913040161, "learning_rate": 2.5484154413918343e-05, "loss": 0.2352, "step": 6290 }, { "epoch": 1.1014948859166012, "grad_norm": 2.463801622390747, "learning_rate": 2.5460722863335484e-05, "loss": 0.2433, "step": 6300 }, { "epoch": 1.1032432904974212, "grad_norm": 2.349886178970337, "learning_rate": 2.5437291312752625e-05, "loss": 0.2307, "step": 6310 }, { "epoch": 1.104991695078241, "grad_norm": 4.470160484313965, "learning_rate": 2.5413859762169765e-05, "loss": 0.2147, "step": 6320 }, { "epoch": 1.106740099659061, "grad_norm": 1.9070550203323364, "learning_rate": 2.5390428211586902e-05, "loss": 0.1952, "step": 6330 }, { "epoch": 1.108488504239881, "grad_norm": 3.6984667778015137, "learning_rate": 2.5366996661004043e-05, "loss": 0.2451, "step": 6340 }, { "epoch": 1.1102369088207011, "grad_norm": 2.5296738147735596, "learning_rate": 2.5343565110421184e-05, "loss": 0.2623, "step": 6350 }, { "epoch": 1.1119853134015212, "grad_norm": 1.6782501935958862, "learning_rate": 2.5320133559838324e-05, "loss": 0.2121, "step": 6360 }, { "epoch": 1.113733717982341, "grad_norm": 2.8727078437805176, "learning_rate": 2.5296702009255465e-05, "loss": 0.2381, "step": 6370 }, { "epoch": 1.115482122563161, "grad_norm": 2.176513671875, "learning_rate": 2.5273270458672605e-05, "loss": 0.1952, "step": 6380 }, { "epoch": 1.117230527143981, "grad_norm": 2.2744338512420654, "learning_rate": 2.5249838908089746e-05, "loss": 0.2417, "step": 6390 }, { "epoch": 1.1189789317248011, "grad_norm": 3.2771434783935547, "learning_rate": 2.5226407357506886e-05, "loss": 0.2233, "step": 6400 }, { "epoch": 1.1207273363056212, "grad_norm": 2.574244499206543, "learning_rate": 2.5202975806924027e-05, "loss": 0.2245, "step": 6410 }, { "epoch": 1.1224757408864412, "grad_norm": 2.5185132026672363, "learning_rate": 2.5181887411399452e-05, "loss": 0.2339, "step": 6420 }, { "epoch": 1.1242241454672612, "grad_norm": 1.5455431938171387, "learning_rate": 2.5158455860816592e-05, "loss": 0.221, "step": 6430 }, { "epoch": 1.125972550048081, "grad_norm": 3.230663299560547, "learning_rate": 2.513502431023373e-05, "loss": 0.2431, "step": 6440 }, { "epoch": 1.127720954628901, "grad_norm": 1.7547463178634644, "learning_rate": 2.511159275965087e-05, "loss": 0.258, "step": 6450 }, { "epoch": 1.1294693592097211, "grad_norm": 2.806102752685547, "learning_rate": 2.508816120906801e-05, "loss": 0.2637, "step": 6460 }, { "epoch": 1.1312177637905412, "grad_norm": 2.752462863922119, "learning_rate": 2.506472965848515e-05, "loss": 0.2201, "step": 6470 }, { "epoch": 1.1329661683713612, "grad_norm": 2.401191473007202, "learning_rate": 2.5041298107902292e-05, "loss": 0.2458, "step": 6480 }, { "epoch": 1.134714572952181, "grad_norm": 2.487614154815674, "learning_rate": 2.5017866557319433e-05, "loss": 0.2311, "step": 6490 }, { "epoch": 1.136462977533001, "grad_norm": 2.6839981079101562, "learning_rate": 2.4994435006736573e-05, "loss": 0.2184, "step": 6500 }, { "epoch": 1.1382113821138211, "grad_norm": 2.5567777156829834, "learning_rate": 2.4971003456153714e-05, "loss": 0.2579, "step": 6510 }, { "epoch": 1.1399597866946412, "grad_norm": 1.7943795919418335, "learning_rate": 2.4947571905570854e-05, "loss": 0.2226, "step": 6520 }, { "epoch": 1.1417081912754612, "grad_norm": 2.5085887908935547, "learning_rate": 2.4924140354987995e-05, "loss": 0.2171, "step": 6530 }, { "epoch": 1.1434565958562812, "grad_norm": 3.6210641860961914, "learning_rate": 2.4900708804405132e-05, "loss": 0.2351, "step": 6540 }, { "epoch": 1.145205000437101, "grad_norm": 2.5945966243743896, "learning_rate": 2.4877277253822273e-05, "loss": 0.2187, "step": 6550 }, { "epoch": 1.146953405017921, "grad_norm": 1.9475432634353638, "learning_rate": 2.4853845703239413e-05, "loss": 0.1945, "step": 6560 }, { "epoch": 1.1487018095987411, "grad_norm": 3.4020893573760986, "learning_rate": 2.4830414152656554e-05, "loss": 0.2151, "step": 6570 }, { "epoch": 1.1504502141795612, "grad_norm": 3.2796547412872314, "learning_rate": 2.4806982602073694e-05, "loss": 0.2289, "step": 6580 }, { "epoch": 1.1521986187603812, "grad_norm": 2.947808265686035, "learning_rate": 2.4783551051490835e-05, "loss": 0.2379, "step": 6590 }, { "epoch": 1.1539470233412013, "grad_norm": 1.4934639930725098, "learning_rate": 2.4760119500907976e-05, "loss": 0.2332, "step": 6600 }, { "epoch": 1.155695427922021, "grad_norm": 3.527163505554199, "learning_rate": 2.4736687950325116e-05, "loss": 0.2276, "step": 6610 }, { "epoch": 1.157443832502841, "grad_norm": 4.331171989440918, "learning_rate": 2.4713256399742257e-05, "loss": 0.2408, "step": 6620 }, { "epoch": 1.1591922370836611, "grad_norm": 1.2162501811981201, "learning_rate": 2.4689824849159397e-05, "loss": 0.2246, "step": 6630 }, { "epoch": 1.1609406416644812, "grad_norm": 2.465019702911377, "learning_rate": 2.4666393298576538e-05, "loss": 0.1932, "step": 6640 }, { "epoch": 1.1626890462453012, "grad_norm": 2.7230145931243896, "learning_rate": 2.4642961747993675e-05, "loss": 0.258, "step": 6650 }, { "epoch": 1.1644374508261213, "grad_norm": 2.6885485649108887, "learning_rate": 2.4619530197410816e-05, "loss": 0.2287, "step": 6660 }, { "epoch": 1.166185855406941, "grad_norm": 3.2494916915893555, "learning_rate": 2.4596098646827956e-05, "loss": 0.2647, "step": 6670 }, { "epoch": 1.1679342599877611, "grad_norm": 1.7186158895492554, "learning_rate": 2.4572667096245097e-05, "loss": 0.2115, "step": 6680 }, { "epoch": 1.1696826645685812, "grad_norm": 2.3007144927978516, "learning_rate": 2.4549235545662237e-05, "loss": 0.2497, "step": 6690 }, { "epoch": 1.1714310691494012, "grad_norm": 2.522245168685913, "learning_rate": 2.4525803995079378e-05, "loss": 0.2215, "step": 6700 }, { "epoch": 1.1731794737302212, "grad_norm": 2.192690134048462, "learning_rate": 2.450237244449652e-05, "loss": 0.2056, "step": 6710 }, { "epoch": 1.174927878311041, "grad_norm": 3.8989531993865967, "learning_rate": 2.447894089391366e-05, "loss": 0.2096, "step": 6720 }, { "epoch": 1.176676282891861, "grad_norm": 3.636918544769287, "learning_rate": 2.44555093433308e-05, "loss": 0.22, "step": 6730 }, { "epoch": 1.1784246874726811, "grad_norm": 2.5938773155212402, "learning_rate": 2.443207779274794e-05, "loss": 0.253, "step": 6740 }, { "epoch": 1.1801730920535012, "grad_norm": 2.396374464035034, "learning_rate": 2.440864624216508e-05, "loss": 0.2843, "step": 6750 }, { "epoch": 1.1819214966343212, "grad_norm": 2.2090964317321777, "learning_rate": 2.4385214691582218e-05, "loss": 0.2352, "step": 6760 }, { "epoch": 1.1836699012151413, "grad_norm": 2.504795551300049, "learning_rate": 2.4361783140999355e-05, "loss": 0.2247, "step": 6770 }, { "epoch": 1.1854183057959613, "grad_norm": 3.191880702972412, "learning_rate": 2.4338351590416496e-05, "loss": 0.2311, "step": 6780 }, { "epoch": 1.1871667103767811, "grad_norm": 2.5257225036621094, "learning_rate": 2.4314920039833637e-05, "loss": 0.2227, "step": 6790 }, { "epoch": 1.1889151149576012, "grad_norm": 1.7567265033721924, "learning_rate": 2.4291488489250777e-05, "loss": 0.206, "step": 6800 }, { "epoch": 1.1906635195384212, "grad_norm": 2.0397517681121826, "learning_rate": 2.4268056938667918e-05, "loss": 0.2359, "step": 6810 }, { "epoch": 1.1924119241192412, "grad_norm": 2.164275884628296, "learning_rate": 2.424462538808506e-05, "loss": 0.2466, "step": 6820 }, { "epoch": 1.1941603287000613, "grad_norm": 3.402735710144043, "learning_rate": 2.4221193837502196e-05, "loss": 0.2304, "step": 6830 }, { "epoch": 1.195908733280881, "grad_norm": 6.231383800506592, "learning_rate": 2.4200105441977627e-05, "loss": 0.223, "step": 6840 }, { "epoch": 1.1976571378617011, "grad_norm": 2.991027355194092, "learning_rate": 2.4176673891394768e-05, "loss": 0.207, "step": 6850 }, { "epoch": 1.1994055424425212, "grad_norm": 3.3107314109802246, "learning_rate": 2.4153242340811905e-05, "loss": 0.221, "step": 6860 }, { "epoch": 1.2011539470233412, "grad_norm": 3.051894426345825, "learning_rate": 2.4129810790229045e-05, "loss": 0.236, "step": 6870 }, { "epoch": 1.2029023516041613, "grad_norm": 2.3835701942443848, "learning_rate": 2.4106379239646186e-05, "loss": 0.2195, "step": 6880 }, { "epoch": 1.2046507561849813, "grad_norm": 3.9972636699676514, "learning_rate": 2.4082947689063327e-05, "loss": 0.2198, "step": 6890 }, { "epoch": 1.2063991607658011, "grad_norm": 2.0902743339538574, "learning_rate": 2.4059516138480467e-05, "loss": 0.2563, "step": 6900 }, { "epoch": 1.2081475653466212, "grad_norm": 2.5505049228668213, "learning_rate": 2.4036084587897608e-05, "loss": 0.2093, "step": 6910 }, { "epoch": 1.2098959699274412, "grad_norm": 3.174210548400879, "learning_rate": 2.401265303731475e-05, "loss": 0.221, "step": 6920 }, { "epoch": 1.2116443745082612, "grad_norm": 2.3565521240234375, "learning_rate": 2.398922148673189e-05, "loss": 0.224, "step": 6930 }, { "epoch": 1.2133927790890813, "grad_norm": 6.279238224029541, "learning_rate": 2.396578993614903e-05, "loss": 0.2333, "step": 6940 }, { "epoch": 1.215141183669901, "grad_norm": 4.914646625518799, "learning_rate": 2.394235838556617e-05, "loss": 0.2521, "step": 6950 }, { "epoch": 1.2168895882507211, "grad_norm": 3.3840725421905518, "learning_rate": 2.3918926834983304e-05, "loss": 0.2065, "step": 6960 }, { "epoch": 1.2186379928315412, "grad_norm": 2.5615413188934326, "learning_rate": 2.3895495284400445e-05, "loss": 0.2363, "step": 6970 }, { "epoch": 1.2203863974123612, "grad_norm": 3.362717628479004, "learning_rate": 2.3872063733817585e-05, "loss": 0.2325, "step": 6980 }, { "epoch": 1.2221348019931813, "grad_norm": 2.461860179901123, "learning_rate": 2.3848632183234726e-05, "loss": 0.2228, "step": 6990 }, { "epoch": 1.2238832065740013, "grad_norm": 2.791576385498047, "learning_rate": 2.3825200632651866e-05, "loss": 0.1829, "step": 7000 }, { "epoch": 1.2256316111548213, "grad_norm": 2.1985511779785156, "learning_rate": 2.3801769082069007e-05, "loss": 0.1964, "step": 7010 }, { "epoch": 1.2273800157356411, "grad_norm": 2.528165578842163, "learning_rate": 2.3778337531486147e-05, "loss": 0.2327, "step": 7020 }, { "epoch": 1.2291284203164612, "grad_norm": 2.3017685413360596, "learning_rate": 2.3754905980903288e-05, "loss": 0.2299, "step": 7030 }, { "epoch": 1.2308768248972812, "grad_norm": 1.9561365842819214, "learning_rate": 2.3731474430320425e-05, "loss": 0.2468, "step": 7040 }, { "epoch": 1.2326252294781013, "grad_norm": 3.533801555633545, "learning_rate": 2.3708042879737566e-05, "loss": 0.237, "step": 7050 }, { "epoch": 1.2343736340589213, "grad_norm": 1.6506298780441284, "learning_rate": 2.3684611329154706e-05, "loss": 0.2262, "step": 7060 }, { "epoch": 1.2361220386397411, "grad_norm": 2.2800042629241943, "learning_rate": 2.3661179778571847e-05, "loss": 0.2079, "step": 7070 }, { "epoch": 1.2378704432205612, "grad_norm": 3.2282028198242188, "learning_rate": 2.3637748227988988e-05, "loss": 0.2497, "step": 7080 }, { "epoch": 1.2396188478013812, "grad_norm": 9.39627742767334, "learning_rate": 2.3614316677406128e-05, "loss": 0.2292, "step": 7090 }, { "epoch": 1.2413672523822012, "grad_norm": 2.449101448059082, "learning_rate": 2.359088512682327e-05, "loss": 0.2477, "step": 7100 }, { "epoch": 1.2431156569630213, "grad_norm": 3.1302859783172607, "learning_rate": 2.356745357624041e-05, "loss": 0.2285, "step": 7110 }, { "epoch": 1.2448640615438413, "grad_norm": 1.8327531814575195, "learning_rate": 2.354402202565755e-05, "loss": 0.2005, "step": 7120 }, { "epoch": 1.2466124661246614, "grad_norm": 4.233156681060791, "learning_rate": 2.352059047507469e-05, "loss": 0.236, "step": 7130 }, { "epoch": 1.2483608707054812, "grad_norm": 2.3583102226257324, "learning_rate": 2.349715892449183e-05, "loss": 0.2273, "step": 7140 }, { "epoch": 1.2501092752863012, "grad_norm": 2.376291275024414, "learning_rate": 2.347372737390897e-05, "loss": 0.1867, "step": 7150 }, { "epoch": 1.2518576798671213, "grad_norm": 3.159830093383789, "learning_rate": 2.345029582332611e-05, "loss": 0.2266, "step": 7160 }, { "epoch": 1.2536060844479413, "grad_norm": 2.7414627075195312, "learning_rate": 2.342686427274325e-05, "loss": 0.2598, "step": 7170 }, { "epoch": 1.2553544890287611, "grad_norm": 3.401259183883667, "learning_rate": 2.340343272216039e-05, "loss": 0.2103, "step": 7180 }, { "epoch": 1.2571028936095812, "grad_norm": 2.1210904121398926, "learning_rate": 2.338000117157753e-05, "loss": 0.1867, "step": 7190 }, { "epoch": 1.2588512981904012, "grad_norm": 4.165828704833984, "learning_rate": 2.335656962099467e-05, "loss": 0.2408, "step": 7200 }, { "epoch": 1.2605997027712212, "grad_norm": 3.0600874423980713, "learning_rate": 2.3333138070411812e-05, "loss": 0.2235, "step": 7210 }, { "epoch": 1.2623481073520413, "grad_norm": 2.238833427429199, "learning_rate": 2.3309706519828952e-05, "loss": 0.2295, "step": 7220 }, { "epoch": 1.2640965119328613, "grad_norm": 7.00640344619751, "learning_rate": 2.3286274969246093e-05, "loss": 0.224, "step": 7230 }, { "epoch": 1.2658449165136814, "grad_norm": 3.161783218383789, "learning_rate": 2.3262843418663234e-05, "loss": 0.2077, "step": 7240 }, { "epoch": 1.2675933210945014, "grad_norm": 6.526487350463867, "learning_rate": 2.3239411868080374e-05, "loss": 0.2106, "step": 7250 }, { "epoch": 1.2693417256753212, "grad_norm": 1.9831335544586182, "learning_rate": 2.321598031749751e-05, "loss": 0.231, "step": 7260 }, { "epoch": 1.2710901302561413, "grad_norm": 2.8936715126037598, "learning_rate": 2.3192548766914652e-05, "loss": 0.2364, "step": 7270 }, { "epoch": 1.2728385348369613, "grad_norm": 2.9134674072265625, "learning_rate": 2.3169117216331793e-05, "loss": 0.2153, "step": 7280 }, { "epoch": 1.2745869394177813, "grad_norm": 1.7994840145111084, "learning_rate": 2.3145685665748933e-05, "loss": 0.2049, "step": 7290 }, { "epoch": 1.2763353439986012, "grad_norm": 1.5330135822296143, "learning_rate": 2.3122254115166074e-05, "loss": 0.202, "step": 7300 }, { "epoch": 1.2780837485794212, "grad_norm": 2.563875436782837, "learning_rate": 2.3098822564583214e-05, "loss": 0.2086, "step": 7310 }, { "epoch": 1.2798321531602412, "grad_norm": 3.0079505443573, "learning_rate": 2.3075391014000355e-05, "loss": 0.1833, "step": 7320 }, { "epoch": 1.2815805577410613, "grad_norm": 1.683423638343811, "learning_rate": 2.3051959463417495e-05, "loss": 0.2167, "step": 7330 }, { "epoch": 1.2833289623218813, "grad_norm": 2.0884647369384766, "learning_rate": 2.3028527912834636e-05, "loss": 0.2138, "step": 7340 }, { "epoch": 1.2850773669027014, "grad_norm": 3.214635133743286, "learning_rate": 2.3005096362251777e-05, "loss": 0.2095, "step": 7350 }, { "epoch": 1.2868257714835214, "grad_norm": 2.2779738903045654, "learning_rate": 2.2981664811668917e-05, "loss": 0.2373, "step": 7360 }, { "epoch": 1.2885741760643412, "grad_norm": 3.165019989013672, "learning_rate": 2.2958233261086054e-05, "loss": 0.2027, "step": 7370 }, { "epoch": 1.2903225806451613, "grad_norm": 1.9054561853408813, "learning_rate": 2.2934801710503195e-05, "loss": 0.2307, "step": 7380 }, { "epoch": 1.2920709852259813, "grad_norm": 2.0474791526794434, "learning_rate": 2.2911370159920336e-05, "loss": 0.2045, "step": 7390 }, { "epoch": 1.2938193898068013, "grad_norm": 13.95895004272461, "learning_rate": 2.2887938609337476e-05, "loss": 0.2003, "step": 7400 }, { "epoch": 1.2955677943876214, "grad_norm": 3.3436994552612305, "learning_rate": 2.2864507058754617e-05, "loss": 0.2024, "step": 7410 }, { "epoch": 1.2973161989684412, "grad_norm": 2.3914167881011963, "learning_rate": 2.2841075508171757e-05, "loss": 0.2, "step": 7420 }, { "epoch": 1.2990646035492612, "grad_norm": 2.7382149696350098, "learning_rate": 2.2817643957588898e-05, "loss": 0.2361, "step": 7430 }, { "epoch": 1.3008130081300813, "grad_norm": 1.886077642440796, "learning_rate": 2.2794212407006032e-05, "loss": 0.1993, "step": 7440 }, { "epoch": 1.3025614127109013, "grad_norm": 7.261506080627441, "learning_rate": 2.2770780856423172e-05, "loss": 0.1806, "step": 7450 }, { "epoch": 1.3043098172917214, "grad_norm": 2.7932260036468506, "learning_rate": 2.2747349305840313e-05, "loss": 0.2451, "step": 7460 }, { "epoch": 1.3060582218725414, "grad_norm": 2.1137731075286865, "learning_rate": 2.2723917755257453e-05, "loss": 0.2437, "step": 7470 }, { "epoch": 1.3078066264533614, "grad_norm": 2.070944309234619, "learning_rate": 2.2700486204674594e-05, "loss": 0.201, "step": 7480 }, { "epoch": 1.3095550310341812, "grad_norm": 2.4869847297668457, "learning_rate": 2.2677054654091735e-05, "loss": 0.2218, "step": 7490 }, { "epoch": 1.3113034356150013, "grad_norm": 2.1322641372680664, "learning_rate": 2.2653623103508875e-05, "loss": 0.2069, "step": 7500 }, { "epoch": 1.3130518401958213, "grad_norm": 3.1480398178100586, "learning_rate": 2.2630191552926016e-05, "loss": 0.2562, "step": 7510 }, { "epoch": 1.3148002447766414, "grad_norm": 2.788144111633301, "learning_rate": 2.2606760002343156e-05, "loss": 0.2215, "step": 7520 }, { "epoch": 1.3165486493574612, "grad_norm": 2.7730488777160645, "learning_rate": 2.2583328451760297e-05, "loss": 0.1914, "step": 7530 }, { "epoch": 1.3182970539382812, "grad_norm": 2.38083815574646, "learning_rate": 2.2559896901177438e-05, "loss": 0.2192, "step": 7540 }, { "epoch": 1.3200454585191013, "grad_norm": 1.9907407760620117, "learning_rate": 2.2536465350594575e-05, "loss": 0.1665, "step": 7550 }, { "epoch": 1.3217938630999213, "grad_norm": 2.5785951614379883, "learning_rate": 2.2513033800011715e-05, "loss": 0.23, "step": 7560 }, { "epoch": 1.3235422676807413, "grad_norm": 2.255279064178467, "learning_rate": 2.2489602249428856e-05, "loss": 0.2671, "step": 7570 }, { "epoch": 1.3252906722615614, "grad_norm": 2.6091926097869873, "learning_rate": 2.2466170698845997e-05, "loss": 0.1948, "step": 7580 }, { "epoch": 1.3270390768423814, "grad_norm": 1.9192357063293457, "learning_rate": 2.2442739148263137e-05, "loss": 0.2092, "step": 7590 }, { "epoch": 1.3287874814232015, "grad_norm": 2.7502996921539307, "learning_rate": 2.2419307597680278e-05, "loss": 0.2184, "step": 7600 }, { "epoch": 1.3305358860040213, "grad_norm": 1.731701374053955, "learning_rate": 2.2395876047097418e-05, "loss": 0.226, "step": 7610 }, { "epoch": 1.3322842905848413, "grad_norm": 2.618088960647583, "learning_rate": 2.237244449651456e-05, "loss": 0.1989, "step": 7620 }, { "epoch": 1.3340326951656614, "grad_norm": 2.9609458446502686, "learning_rate": 2.23490129459317e-05, "loss": 0.2568, "step": 7630 }, { "epoch": 1.3357810997464814, "grad_norm": 2.470890760421753, "learning_rate": 2.232558139534884e-05, "loss": 0.2545, "step": 7640 }, { "epoch": 1.3375295043273012, "grad_norm": 4.0039215087890625, "learning_rate": 2.230214984476598e-05, "loss": 0.2507, "step": 7650 }, { "epoch": 1.3392779089081213, "grad_norm": 1.8677549362182617, "learning_rate": 2.2278718294183118e-05, "loss": 0.2364, "step": 7660 }, { "epoch": 1.3410263134889413, "grad_norm": 2.4219982624053955, "learning_rate": 2.225528674360026e-05, "loss": 0.2278, "step": 7670 }, { "epoch": 1.3427747180697613, "grad_norm": 2.556628465652466, "learning_rate": 2.22318551930174e-05, "loss": 0.2008, "step": 7680 }, { "epoch": 1.3445231226505814, "grad_norm": 3.106130361557007, "learning_rate": 2.220842364243454e-05, "loss": 0.241, "step": 7690 }, { "epoch": 1.3462715272314014, "grad_norm": 5.037795066833496, "learning_rate": 2.218499209185168e-05, "loss": 0.1823, "step": 7700 }, { "epoch": 1.3480199318122215, "grad_norm": 2.826275110244751, "learning_rate": 2.216156054126882e-05, "loss": 0.2108, "step": 7710 }, { "epoch": 1.3497683363930413, "grad_norm": 2.2898755073547363, "learning_rate": 2.213812899068596e-05, "loss": 0.2649, "step": 7720 }, { "epoch": 1.3515167409738613, "grad_norm": 1.984183669090271, "learning_rate": 2.2114697440103102e-05, "loss": 0.22, "step": 7730 }, { "epoch": 1.3532651455546814, "grad_norm": 2.6796443462371826, "learning_rate": 2.2091265889520243e-05, "loss": 0.2003, "step": 7740 }, { "epoch": 1.3550135501355014, "grad_norm": 2.9810585975646973, "learning_rate": 2.2067834338937383e-05, "loss": 0.2152, "step": 7750 }, { "epoch": 1.3567619547163214, "grad_norm": 2.0471839904785156, "learning_rate": 2.204440278835452e-05, "loss": 0.1844, "step": 7760 }, { "epoch": 1.3585103592971413, "grad_norm": 2.1709094047546387, "learning_rate": 2.202097123777166e-05, "loss": 0.207, "step": 7770 }, { "epoch": 1.3602587638779613, "grad_norm": 3.221278667449951, "learning_rate": 2.19975396871888e-05, "loss": 0.2077, "step": 7780 }, { "epoch": 1.3620071684587813, "grad_norm": 2.5238027572631836, "learning_rate": 2.1974108136605942e-05, "loss": 0.23, "step": 7790 }, { "epoch": 1.3637555730396014, "grad_norm": 1.5731216669082642, "learning_rate": 2.1950676586023083e-05, "loss": 0.2321, "step": 7800 }, { "epoch": 1.3655039776204214, "grad_norm": 4.94499397277832, "learning_rate": 2.1927245035440223e-05, "loss": 0.2136, "step": 7810 }, { "epoch": 1.3672523822012415, "grad_norm": 1.9199833869934082, "learning_rate": 2.1903813484857364e-05, "loss": 0.2235, "step": 7820 }, { "epoch": 1.3690007867820615, "grad_norm": 2.569610357284546, "learning_rate": 2.1880381934274504e-05, "loss": 0.2219, "step": 7830 }, { "epoch": 1.3707491913628813, "grad_norm": 2.2798616886138916, "learning_rate": 2.1856950383691645e-05, "loss": 0.1895, "step": 7840 }, { "epoch": 1.3724975959437014, "grad_norm": 2.7358908653259277, "learning_rate": 2.183586198816707e-05, "loss": 0.2481, "step": 7850 }, { "epoch": 1.3742460005245214, "grad_norm": 2.1300289630889893, "learning_rate": 2.181243043758421e-05, "loss": 0.2229, "step": 7860 }, { "epoch": 1.3759944051053414, "grad_norm": 2.7066380977630615, "learning_rate": 2.1788998887001348e-05, "loss": 0.1998, "step": 7870 }, { "epoch": 1.3777428096861613, "grad_norm": 2.4064714908599854, "learning_rate": 2.1765567336418488e-05, "loss": 0.2018, "step": 7880 }, { "epoch": 1.3794912142669813, "grad_norm": 2.445901870727539, "learning_rate": 2.174213578583563e-05, "loss": 0.2298, "step": 7890 }, { "epoch": 1.3812396188478013, "grad_norm": 2.6251111030578613, "learning_rate": 2.171870423525277e-05, "loss": 0.2056, "step": 7900 }, { "epoch": 1.3829880234286214, "grad_norm": 2.2267794609069824, "learning_rate": 2.169527268466991e-05, "loss": 0.2266, "step": 7910 }, { "epoch": 1.3847364280094414, "grad_norm": 2.0632987022399902, "learning_rate": 2.167184113408705e-05, "loss": 0.2078, "step": 7920 }, { "epoch": 1.3864848325902615, "grad_norm": 1.6934055089950562, "learning_rate": 2.164840958350419e-05, "loss": 0.1987, "step": 7930 }, { "epoch": 1.3882332371710815, "grad_norm": 2.452653646469116, "learning_rate": 2.1624978032921332e-05, "loss": 0.2715, "step": 7940 }, { "epoch": 1.3899816417519013, "grad_norm": 1.3962805271148682, "learning_rate": 2.1601546482338472e-05, "loss": 0.2062, "step": 7950 }, { "epoch": 1.3917300463327213, "grad_norm": 2.5406017303466797, "learning_rate": 2.1578114931755613e-05, "loss": 0.2262, "step": 7960 }, { "epoch": 1.3934784509135414, "grad_norm": 1.7772146463394165, "learning_rate": 2.1554683381172753e-05, "loss": 0.2201, "step": 7970 }, { "epoch": 1.3952268554943614, "grad_norm": 2.1140542030334473, "learning_rate": 2.153125183058989e-05, "loss": 0.2508, "step": 7980 }, { "epoch": 1.3969752600751815, "grad_norm": 2.517038345336914, "learning_rate": 2.150782028000703e-05, "loss": 0.2068, "step": 7990 }, { "epoch": 1.3987236646560013, "grad_norm": 2.4555583000183105, "learning_rate": 2.1484388729424172e-05, "loss": 0.2419, "step": 8000 }, { "epoch": 1.4004720692368213, "grad_norm": 1.9649275541305542, "learning_rate": 2.1460957178841312e-05, "loss": 0.2094, "step": 8010 }, { "epoch": 1.4022204738176414, "grad_norm": 1.9330495595932007, "learning_rate": 2.1437525628258453e-05, "loss": 0.222, "step": 8020 }, { "epoch": 1.4039688783984614, "grad_norm": 2.596536636352539, "learning_rate": 2.1414094077675594e-05, "loss": 0.2442, "step": 8030 }, { "epoch": 1.4057172829792814, "grad_norm": 2.4878602027893066, "learning_rate": 2.1390662527092734e-05, "loss": 0.2341, "step": 8040 }, { "epoch": 1.4074656875601015, "grad_norm": 1.9203938245773315, "learning_rate": 2.1367230976509875e-05, "loss": 0.2145, "step": 8050 }, { "epoch": 1.4092140921409215, "grad_norm": 3.3724257946014404, "learning_rate": 2.1343799425927015e-05, "loss": 0.2151, "step": 8060 }, { "epoch": 1.4109624967217413, "grad_norm": 3.4099626541137695, "learning_rate": 2.1320367875344156e-05, "loss": 0.2314, "step": 8070 }, { "epoch": 1.4127109013025614, "grad_norm": 2.2942490577697754, "learning_rate": 2.1296936324761293e-05, "loss": 0.1972, "step": 8080 }, { "epoch": 1.4144593058833814, "grad_norm": 1.607845425605774, "learning_rate": 2.1273504774178434e-05, "loss": 0.2223, "step": 8090 }, { "epoch": 1.4162077104642015, "grad_norm": 2.1875293254852295, "learning_rate": 2.1250073223595574e-05, "loss": 0.2209, "step": 8100 }, { "epoch": 1.4179561150450213, "grad_norm": 2.5811476707458496, "learning_rate": 2.122664167301271e-05, "loss": 0.2046, "step": 8110 }, { "epoch": 1.4197045196258413, "grad_norm": 3.045577049255371, "learning_rate": 2.1203210122429852e-05, "loss": 0.2177, "step": 8120 }, { "epoch": 1.4214529242066614, "grad_norm": 3.3778419494628906, "learning_rate": 2.1179778571846993e-05, "loss": 0.2511, "step": 8130 }, { "epoch": 1.4232013287874814, "grad_norm": 4.097287654876709, "learning_rate": 2.1156347021264133e-05, "loss": 0.2004, "step": 8140 }, { "epoch": 1.4249497333683014, "grad_norm": 3.0460093021392822, "learning_rate": 2.1132915470681274e-05, "loss": 0.2308, "step": 8150 }, { "epoch": 1.4266981379491215, "grad_norm": 2.531940221786499, "learning_rate": 2.110948392009841e-05, "loss": 0.2029, "step": 8160 }, { "epoch": 1.4284465425299415, "grad_norm": 2.160956382751465, "learning_rate": 2.108605236951555e-05, "loss": 0.1643, "step": 8170 }, { "epoch": 1.4301949471107616, "grad_norm": 2.251553535461426, "learning_rate": 2.1062620818932692e-05, "loss": 0.2043, "step": 8180 }, { "epoch": 1.4319433516915814, "grad_norm": 1.8432042598724365, "learning_rate": 2.1039189268349833e-05, "loss": 0.2204, "step": 8190 }, { "epoch": 1.4336917562724014, "grad_norm": 2.3578741550445557, "learning_rate": 2.1015757717766973e-05, "loss": 0.216, "step": 8200 }, { "epoch": 1.4354401608532215, "grad_norm": 2.0462770462036133, "learning_rate": 2.0992326167184114e-05, "loss": 0.2136, "step": 8210 }, { "epoch": 1.4371885654340415, "grad_norm": 3.6537725925445557, "learning_rate": 2.0968894616601255e-05, "loss": 0.2025, "step": 8220 }, { "epoch": 1.4389369700148613, "grad_norm": 3.5801661014556885, "learning_rate": 2.0945463066018395e-05, "loss": 0.2113, "step": 8230 }, { "epoch": 1.4406853745956814, "grad_norm": 1.7767003774642944, "learning_rate": 2.0922031515435536e-05, "loss": 0.2428, "step": 8240 }, { "epoch": 1.4424337791765014, "grad_norm": 3.095386028289795, "learning_rate": 2.0898599964852676e-05, "loss": 0.1953, "step": 8250 }, { "epoch": 1.4441821837573214, "grad_norm": 2.714571237564087, "learning_rate": 2.0875168414269817e-05, "loss": 0.1861, "step": 8260 }, { "epoch": 1.4459305883381415, "grad_norm": 2.4387760162353516, "learning_rate": 2.0851736863686954e-05, "loss": 0.2076, "step": 8270 }, { "epoch": 1.4476789929189615, "grad_norm": 2.1680490970611572, "learning_rate": 2.0828305313104095e-05, "loss": 0.1952, "step": 8280 }, { "epoch": 1.4494273974997816, "grad_norm": 1.8361495733261108, "learning_rate": 2.0804873762521235e-05, "loss": 0.2027, "step": 8290 }, { "epoch": 1.4511758020806014, "grad_norm": 2.529107093811035, "learning_rate": 2.0781442211938376e-05, "loss": 0.1926, "step": 8300 }, { "epoch": 1.4529242066614214, "grad_norm": 1.8087151050567627, "learning_rate": 2.0758010661355516e-05, "loss": 0.226, "step": 8310 }, { "epoch": 1.4546726112422415, "grad_norm": 1.4918991327285767, "learning_rate": 2.0734579110772657e-05, "loss": 0.1934, "step": 8320 }, { "epoch": 1.4564210158230615, "grad_norm": 2.2762157917022705, "learning_rate": 2.0711147560189798e-05, "loss": 0.2199, "step": 8330 }, { "epoch": 1.4581694204038815, "grad_norm": 2.103135585784912, "learning_rate": 2.0687716009606938e-05, "loss": 0.1884, "step": 8340 }, { "epoch": 1.4599178249847014, "grad_norm": 2.3067610263824463, "learning_rate": 2.066428445902408e-05, "loss": 0.2075, "step": 8350 }, { "epoch": 1.4616662295655214, "grad_norm": 1.7336812019348145, "learning_rate": 2.064085290844122e-05, "loss": 0.176, "step": 8360 }, { "epoch": 1.4634146341463414, "grad_norm": 2.6093456745147705, "learning_rate": 2.0617421357858357e-05, "loss": 0.2267, "step": 8370 }, { "epoch": 1.4651630387271615, "grad_norm": 2.137324571609497, "learning_rate": 2.0593989807275497e-05, "loss": 0.2467, "step": 8380 }, { "epoch": 1.4669114433079815, "grad_norm": 2.212411880493164, "learning_rate": 2.0570558256692638e-05, "loss": 0.2465, "step": 8390 }, { "epoch": 1.4686598478888016, "grad_norm": 1.9635968208312988, "learning_rate": 2.0547126706109778e-05, "loss": 0.1947, "step": 8400 }, { "epoch": 1.4704082524696216, "grad_norm": 1.5587635040283203, "learning_rate": 2.052369515552692e-05, "loss": 0.1783, "step": 8410 }, { "epoch": 1.4721566570504414, "grad_norm": 1.5327953100204468, "learning_rate": 2.050026360494406e-05, "loss": 0.203, "step": 8420 }, { "epoch": 1.4739050616312614, "grad_norm": 1.5852612257003784, "learning_rate": 2.04768320543612e-05, "loss": 0.1994, "step": 8430 }, { "epoch": 1.4756534662120815, "grad_norm": 2.138629913330078, "learning_rate": 2.045340050377834e-05, "loss": 0.1943, "step": 8440 }, { "epoch": 1.4774018707929015, "grad_norm": 2.41829252243042, "learning_rate": 2.042996895319548e-05, "loss": 0.1848, "step": 8450 }, { "epoch": 1.4791502753737213, "grad_norm": 2.3004097938537598, "learning_rate": 2.0406537402612622e-05, "loss": 0.2026, "step": 8460 }, { "epoch": 1.4808986799545414, "grad_norm": 3.455299139022827, "learning_rate": 2.0383105852029762e-05, "loss": 0.2194, "step": 8470 }, { "epoch": 1.4826470845353614, "grad_norm": 2.1128129959106445, "learning_rate": 2.03596743014469e-05, "loss": 0.2093, "step": 8480 }, { "epoch": 1.4843954891161815, "grad_norm": 3.1031060218811035, "learning_rate": 2.033624275086404e-05, "loss": 0.2165, "step": 8490 }, { "epoch": 1.4861438936970015, "grad_norm": 3.0639071464538574, "learning_rate": 2.031281120028118e-05, "loss": 0.2182, "step": 8500 }, { "epoch": 1.4878922982778215, "grad_norm": 1.974797010421753, "learning_rate": 2.028937964969832e-05, "loss": 0.2176, "step": 8510 }, { "epoch": 1.4896407028586416, "grad_norm": 2.2451913356781006, "learning_rate": 2.0265948099115462e-05, "loss": 0.1985, "step": 8520 }, { "epoch": 1.4913891074394616, "grad_norm": 3.2870914936065674, "learning_rate": 2.0242516548532603e-05, "loss": 0.2182, "step": 8530 }, { "epoch": 1.4931375120202814, "grad_norm": 1.9064737558364868, "learning_rate": 2.0219084997949743e-05, "loss": 0.1819, "step": 8540 }, { "epoch": 1.4948859166011015, "grad_norm": 7.4060564041137695, "learning_rate": 2.0195653447366884e-05, "loss": 0.1877, "step": 8550 }, { "epoch": 1.4966343211819215, "grad_norm": 2.1721577644348145, "learning_rate": 2.0172221896784024e-05, "loss": 0.2139, "step": 8560 }, { "epoch": 1.4983827257627416, "grad_norm": 3.478092908859253, "learning_rate": 2.0148790346201165e-05, "loss": 0.2526, "step": 8570 }, { "epoch": 1.5001311303435614, "grad_norm": 2.2158713340759277, "learning_rate": 2.0125358795618305e-05, "loss": 0.1914, "step": 8580 }, { "epoch": 1.5018795349243814, "grad_norm": 1.9559482336044312, "learning_rate": 2.010192724503544e-05, "loss": 0.2016, "step": 8590 }, { "epoch": 1.5036279395052015, "grad_norm": 2.525531768798828, "learning_rate": 2.007849569445258e-05, "loss": 0.175, "step": 8600 }, { "epoch": 1.5053763440860215, "grad_norm": 2.0414490699768066, "learning_rate": 2.005506414386972e-05, "loss": 0.1678, "step": 8610 }, { "epoch": 1.5071247486668415, "grad_norm": 2.622178316116333, "learning_rate": 2.003163259328686e-05, "loss": 0.1964, "step": 8620 }, { "epoch": 1.5088731532476616, "grad_norm": 2.0801520347595215, "learning_rate": 2.0008201042704e-05, "loss": 0.2044, "step": 8630 }, { "epoch": 1.5106215578284816, "grad_norm": 2.0462727546691895, "learning_rate": 1.9984769492121142e-05, "loss": 0.1786, "step": 8640 }, { "epoch": 1.5123699624093017, "grad_norm": 2.199009895324707, "learning_rate": 1.9961337941538283e-05, "loss": 0.2076, "step": 8650 }, { "epoch": 1.5141183669901215, "grad_norm": 2.0829432010650635, "learning_rate": 1.9937906390955423e-05, "loss": 0.217, "step": 8660 }, { "epoch": 1.5158667715709415, "grad_norm": 1.6419503688812256, "learning_rate": 1.9914474840372564e-05, "loss": 0.2066, "step": 8670 }, { "epoch": 1.5176151761517616, "grad_norm": 2.3850531578063965, "learning_rate": 1.9891043289789705e-05, "loss": 0.2279, "step": 8680 }, { "epoch": 1.5193635807325814, "grad_norm": 2.983680486679077, "learning_rate": 1.9867611739206845e-05, "loss": 0.2123, "step": 8690 }, { "epoch": 1.5211119853134014, "grad_norm": 2.2864255905151367, "learning_rate": 1.9844180188623986e-05, "loss": 0.2574, "step": 8700 }, { "epoch": 1.5228603898942215, "grad_norm": 2.297508955001831, "learning_rate": 1.9820748638041123e-05, "loss": 0.182, "step": 8710 }, { "epoch": 1.5246087944750415, "grad_norm": 1.8566784858703613, "learning_rate": 1.9797317087458263e-05, "loss": 0.2081, "step": 8720 }, { "epoch": 1.5263571990558615, "grad_norm": 2.14294695854187, "learning_rate": 1.9773885536875404e-05, "loss": 0.2094, "step": 8730 }, { "epoch": 1.5281056036366816, "grad_norm": 3.8622517585754395, "learning_rate": 1.9750453986292545e-05, "loss": 0.1816, "step": 8740 }, { "epoch": 1.5298540082175016, "grad_norm": 3.9352099895477295, "learning_rate": 1.9727022435709685e-05, "loss": 0.1895, "step": 8750 }, { "epoch": 1.5316024127983217, "grad_norm": 2.0230891704559326, "learning_rate": 1.9703590885126826e-05, "loss": 0.2077, "step": 8760 }, { "epoch": 1.5333508173791417, "grad_norm": 3.11130952835083, "learning_rate": 1.9680159334543963e-05, "loss": 0.1958, "step": 8770 }, { "epoch": 1.5350992219599615, "grad_norm": 2.240939140319824, "learning_rate": 1.9656727783961104e-05, "loss": 0.1949, "step": 8780 }, { "epoch": 1.5368476265407816, "grad_norm": 3.9747726917266846, "learning_rate": 1.9633296233378244e-05, "loss": 0.2115, "step": 8790 }, { "epoch": 1.5385960311216016, "grad_norm": 2.0578691959381104, "learning_rate": 1.9609864682795385e-05, "loss": 0.2361, "step": 8800 }, { "epoch": 1.5403444357024214, "grad_norm": 1.8161098957061768, "learning_rate": 1.9586433132212525e-05, "loss": 0.1738, "step": 8810 }, { "epoch": 1.5420928402832415, "grad_norm": 2.508936643600464, "learning_rate": 1.9563001581629666e-05, "loss": 0.2129, "step": 8820 }, { "epoch": 1.5438412448640615, "grad_norm": 1.8289694786071777, "learning_rate": 1.9539570031046807e-05, "loss": 0.1573, "step": 8830 }, { "epoch": 1.5455896494448815, "grad_norm": 2.67386531829834, "learning_rate": 1.9516138480463947e-05, "loss": 0.228, "step": 8840 }, { "epoch": 1.5473380540257016, "grad_norm": 2.565060615539551, "learning_rate": 1.9492706929881088e-05, "loss": 0.234, "step": 8850 }, { "epoch": 1.5490864586065216, "grad_norm": 2.6208226680755615, "learning_rate": 1.9469275379298228e-05, "loss": 0.2135, "step": 8860 }, { "epoch": 1.5508348631873417, "grad_norm": 2.1162941455841064, "learning_rate": 1.944584382871537e-05, "loss": 0.2138, "step": 8870 }, { "epoch": 1.5525832677681617, "grad_norm": 3.716646194458008, "learning_rate": 1.9422412278132506e-05, "loss": 0.2058, "step": 8880 }, { "epoch": 1.5543316723489815, "grad_norm": 2.363117218017578, "learning_rate": 1.9398980727549647e-05, "loss": 0.2185, "step": 8890 }, { "epoch": 1.5560800769298015, "grad_norm": 4.495354175567627, "learning_rate": 1.9375549176966787e-05, "loss": 0.2137, "step": 8900 }, { "epoch": 1.5578284815106216, "grad_norm": 1.6843703985214233, "learning_rate": 1.9352117626383928e-05, "loss": 0.2047, "step": 8910 }, { "epoch": 1.5595768860914414, "grad_norm": 3.8748202323913574, "learning_rate": 1.932868607580107e-05, "loss": 0.2283, "step": 8920 }, { "epoch": 1.5613252906722614, "grad_norm": 2.3094186782836914, "learning_rate": 1.930525452521821e-05, "loss": 0.1873, "step": 8930 }, { "epoch": 1.5630736952530815, "grad_norm": 1.842756986618042, "learning_rate": 1.928182297463535e-05, "loss": 0.19, "step": 8940 }, { "epoch": 1.5648220998339015, "grad_norm": 2.115356206893921, "learning_rate": 1.9258391424052487e-05, "loss": 0.2043, "step": 8950 }, { "epoch": 1.5665705044147216, "grad_norm": 3.2382071018218994, "learning_rate": 1.9234959873469627e-05, "loss": 0.2055, "step": 8960 }, { "epoch": 1.5683189089955416, "grad_norm": 2.4569027423858643, "learning_rate": 1.9211528322886768e-05, "loss": 0.1677, "step": 8970 }, { "epoch": 1.5700673135763616, "grad_norm": 2.180326461791992, "learning_rate": 1.918809677230391e-05, "loss": 0.198, "step": 8980 }, { "epoch": 1.5718157181571817, "grad_norm": 8.215828895568848, "learning_rate": 1.916466522172105e-05, "loss": 0.2735, "step": 8990 }, { "epoch": 1.5735641227380017, "grad_norm": 2.064840316772461, "learning_rate": 1.9143576826196474e-05, "loss": 0.1926, "step": 9000 }, { "epoch": 1.5740886441122477, "eval_loss": 0.36656469106674194, "eval_runtime": 1792.8447, "eval_samples_per_second": 8.078, "eval_steps_per_second": 1.01, "step": 9003 }, { "epoch": 1.5753125273188215, "grad_norm": 1.91609787940979, "learning_rate": 1.9120145275613615e-05, "loss": 0.2178, "step": 9010 }, { "epoch": 1.5770609318996416, "grad_norm": 4.243214130401611, "learning_rate": 1.9096713725030755e-05, "loss": 0.2039, "step": 9020 }, { "epoch": 1.5788093364804616, "grad_norm": 2.9057223796844482, "learning_rate": 1.9073282174447896e-05, "loss": 0.177, "step": 9030 }, { "epoch": 1.5805577410612814, "grad_norm": 3.835923433303833, "learning_rate": 1.9049850623865036e-05, "loss": 0.19, "step": 9040 }, { "epoch": 1.5823061456421015, "grad_norm": 2.18546986579895, "learning_rate": 1.9026419073282177e-05, "loss": 0.1639, "step": 9050 }, { "epoch": 1.5840545502229215, "grad_norm": 2.5857343673706055, "learning_rate": 1.9002987522699317e-05, "loss": 0.2156, "step": 9060 }, { "epoch": 1.5858029548037416, "grad_norm": 5.416774272918701, "learning_rate": 1.8979555972116458e-05, "loss": 0.2085, "step": 9070 }, { "epoch": 1.5875513593845616, "grad_norm": 2.449079751968384, "learning_rate": 1.89561244215336e-05, "loss": 0.191, "step": 9080 }, { "epoch": 1.5892997639653816, "grad_norm": 2.1129159927368164, "learning_rate": 1.8932692870950736e-05, "loss": 0.2079, "step": 9090 }, { "epoch": 1.5910481685462017, "grad_norm": 2.000885009765625, "learning_rate": 1.8909261320367876e-05, "loss": 0.2055, "step": 9100 }, { "epoch": 1.5927965731270217, "grad_norm": 2.0597760677337646, "learning_rate": 1.8885829769785017e-05, "loss": 0.1894, "step": 9110 }, { "epoch": 1.5945449777078418, "grad_norm": 2.180694818496704, "learning_rate": 1.8862398219202158e-05, "loss": 0.1801, "step": 9120 }, { "epoch": 1.5962933822886616, "grad_norm": 2.4223179817199707, "learning_rate": 1.8838966668619298e-05, "loss": 0.1895, "step": 9130 }, { "epoch": 1.5980417868694816, "grad_norm": 1.957263708114624, "learning_rate": 1.8815535118036435e-05, "loss": 0.2013, "step": 9140 }, { "epoch": 1.5997901914503014, "grad_norm": 1.6094838380813599, "learning_rate": 1.8792103567453576e-05, "loss": 0.2229, "step": 9150 }, { "epoch": 1.6015385960311215, "grad_norm": 1.6934531927108765, "learning_rate": 1.8768672016870717e-05, "loss": 0.1792, "step": 9160 }, { "epoch": 1.6032870006119415, "grad_norm": 1.3896024227142334, "learning_rate": 1.8745240466287857e-05, "loss": 0.2096, "step": 9170 }, { "epoch": 1.6050354051927616, "grad_norm": 1.9864113330841064, "learning_rate": 1.8724152070763285e-05, "loss": 0.2035, "step": 9180 }, { "epoch": 1.6067838097735816, "grad_norm": 1.592942714691162, "learning_rate": 1.8700720520180426e-05, "loss": 0.2076, "step": 9190 }, { "epoch": 1.6085322143544016, "grad_norm": 1.9681413173675537, "learning_rate": 1.8677288969597563e-05, "loss": 0.1514, "step": 9200 }, { "epoch": 1.6102806189352217, "grad_norm": 1.6271953582763672, "learning_rate": 1.8653857419014704e-05, "loss": 0.2308, "step": 9210 }, { "epoch": 1.6120290235160417, "grad_norm": 2.731745481491089, "learning_rate": 1.8630425868431844e-05, "loss": 0.1761, "step": 9220 }, { "epoch": 1.6137774280968618, "grad_norm": 1.8629202842712402, "learning_rate": 1.8606994317848985e-05, "loss": 0.1986, "step": 9230 }, { "epoch": 1.6155258326776816, "grad_norm": 2.563565731048584, "learning_rate": 1.8583562767266125e-05, "loss": 0.2051, "step": 9240 }, { "epoch": 1.6172742372585016, "grad_norm": 3.7784006595611572, "learning_rate": 1.8560131216683266e-05, "loss": 0.161, "step": 9250 }, { "epoch": 1.6190226418393217, "grad_norm": 2.4034857749938965, "learning_rate": 1.8536699666100407e-05, "loss": 0.1901, "step": 9260 }, { "epoch": 1.6207710464201415, "grad_norm": 1.440220594406128, "learning_rate": 1.8513268115517547e-05, "loss": 0.1724, "step": 9270 }, { "epoch": 1.6225194510009615, "grad_norm": 3.3751020431518555, "learning_rate": 1.8489836564934688e-05, "loss": 0.2411, "step": 9280 }, { "epoch": 1.6242678555817816, "grad_norm": 1.5174119472503662, "learning_rate": 1.846640501435183e-05, "loss": 0.2148, "step": 9290 }, { "epoch": 1.6260162601626016, "grad_norm": 1.9150112867355347, "learning_rate": 1.8442973463768966e-05, "loss": 0.1907, "step": 9300 }, { "epoch": 1.6277646647434216, "grad_norm": 2.028003692626953, "learning_rate": 1.8419541913186106e-05, "loss": 0.2156, "step": 9310 }, { "epoch": 1.6295130693242417, "grad_norm": 1.975215196609497, "learning_rate": 1.8396110362603247e-05, "loss": 0.2151, "step": 9320 }, { "epoch": 1.6312614739050617, "grad_norm": 3.184420347213745, "learning_rate": 1.8372678812020387e-05, "loss": 0.1918, "step": 9330 }, { "epoch": 1.6330098784858817, "grad_norm": 2.1042797565460205, "learning_rate": 1.8349247261437525e-05, "loss": 0.1825, "step": 9340 }, { "epoch": 1.6347582830667018, "grad_norm": 2.3438453674316406, "learning_rate": 1.8325815710854665e-05, "loss": 0.2106, "step": 9350 }, { "epoch": 1.6365066876475216, "grad_norm": 2.2266077995300293, "learning_rate": 1.8302384160271806e-05, "loss": 0.2422, "step": 9360 }, { "epoch": 1.6382550922283416, "grad_norm": 2.7527801990509033, "learning_rate": 1.8278952609688946e-05, "loss": 0.2066, "step": 9370 }, { "epoch": 1.6400034968091617, "grad_norm": 1.2006065845489502, "learning_rate": 1.8255521059106087e-05, "loss": 0.1604, "step": 9380 }, { "epoch": 1.6417519013899815, "grad_norm": 2.272947311401367, "learning_rate": 1.8232089508523227e-05, "loss": 0.1771, "step": 9390 }, { "epoch": 1.6435003059708015, "grad_norm": 3.1900084018707275, "learning_rate": 1.8208657957940368e-05, "loss": 0.2074, "step": 9400 }, { "epoch": 1.6452487105516216, "grad_norm": 2.1115543842315674, "learning_rate": 1.818522640735751e-05, "loss": 0.2119, "step": 9410 }, { "epoch": 1.6469971151324416, "grad_norm": 2.303034543991089, "learning_rate": 1.816179485677465e-05, "loss": 0.2049, "step": 9420 }, { "epoch": 1.6487455197132617, "grad_norm": 2.002100944519043, "learning_rate": 1.813836330619179e-05, "loss": 0.1819, "step": 9430 }, { "epoch": 1.6504939242940817, "grad_norm": 2.697618246078491, "learning_rate": 1.811493175560893e-05, "loss": 0.2178, "step": 9440 }, { "epoch": 1.6522423288749017, "grad_norm": 1.8766717910766602, "learning_rate": 1.8091500205026068e-05, "loss": 0.1941, "step": 9450 }, { "epoch": 1.6539907334557218, "grad_norm": 1.7633792161941528, "learning_rate": 1.8068068654443208e-05, "loss": 0.1462, "step": 9460 }, { "epoch": 1.6557391380365416, "grad_norm": 2.1197335720062256, "learning_rate": 1.804463710386035e-05, "loss": 0.1916, "step": 9470 }, { "epoch": 1.6574875426173616, "grad_norm": 2.715771436691284, "learning_rate": 1.802120555327749e-05, "loss": 0.2045, "step": 9480 }, { "epoch": 1.6592359471981817, "grad_norm": 2.6755380630493164, "learning_rate": 1.799777400269463e-05, "loss": 0.1736, "step": 9490 }, { "epoch": 1.6609843517790015, "grad_norm": 2.555866003036499, "learning_rate": 1.797434245211177e-05, "loss": 0.1701, "step": 9500 }, { "epoch": 1.6627327563598215, "grad_norm": 1.945939540863037, "learning_rate": 1.79532540565872e-05, "loss": 0.2216, "step": 9510 }, { "epoch": 1.6644811609406416, "grad_norm": 1.77175772190094, "learning_rate": 1.7929822506004336e-05, "loss": 0.2054, "step": 9520 }, { "epoch": 1.6662295655214616, "grad_norm": 2.1837494373321533, "learning_rate": 1.7906390955421477e-05, "loss": 0.1917, "step": 9530 }, { "epoch": 1.6679779701022817, "grad_norm": 2.8827102184295654, "learning_rate": 1.7882959404838617e-05, "loss": 0.1919, "step": 9540 }, { "epoch": 1.6697263746831017, "grad_norm": 1.9127072095870972, "learning_rate": 1.7859527854255754e-05, "loss": 0.1998, "step": 9550 }, { "epoch": 1.6714747792639217, "grad_norm": 3.5764472484588623, "learning_rate": 1.7836096303672895e-05, "loss": 0.1751, "step": 9560 }, { "epoch": 1.6732231838447418, "grad_norm": 3.749941110610962, "learning_rate": 1.7812664753090035e-05, "loss": 0.2057, "step": 9570 }, { "epoch": 1.6749715884255618, "grad_norm": 2.783621072769165, "learning_rate": 1.7789233202507176e-05, "loss": 0.1788, "step": 9580 }, { "epoch": 1.6767199930063816, "grad_norm": 1.676371455192566, "learning_rate": 1.7765801651924317e-05, "loss": 0.1926, "step": 9590 }, { "epoch": 1.6784683975872017, "grad_norm": 2.354701519012451, "learning_rate": 1.7742370101341457e-05, "loss": 0.2128, "step": 9600 }, { "epoch": 1.6802168021680217, "grad_norm": 1.633273720741272, "learning_rate": 1.7718938550758598e-05, "loss": 0.2343, "step": 9610 }, { "epoch": 1.6819652067488415, "grad_norm": 1.3156176805496216, "learning_rate": 1.769550700017574e-05, "loss": 0.1748, "step": 9620 }, { "epoch": 1.6837136113296616, "grad_norm": 1.5171828269958496, "learning_rate": 1.767207544959288e-05, "loss": 0.1877, "step": 9630 }, { "epoch": 1.6854620159104816, "grad_norm": 2.5515353679656982, "learning_rate": 1.764864389901002e-05, "loss": 0.1922, "step": 9640 }, { "epoch": 1.6872104204913017, "grad_norm": 2.472705364227295, "learning_rate": 1.762521234842716e-05, "loss": 0.1552, "step": 9650 }, { "epoch": 1.6889588250721217, "grad_norm": 2.7057673931121826, "learning_rate": 1.7601780797844297e-05, "loss": 0.1945, "step": 9660 }, { "epoch": 1.6907072296529417, "grad_norm": 2.4016506671905518, "learning_rate": 1.7578349247261438e-05, "loss": 0.2261, "step": 9670 }, { "epoch": 1.6924556342337618, "grad_norm": 2.49354887008667, "learning_rate": 1.755491769667858e-05, "loss": 0.1836, "step": 9680 }, { "epoch": 1.6942040388145818, "grad_norm": 3.3163530826568604, "learning_rate": 1.753148614609572e-05, "loss": 0.2011, "step": 9690 }, { "epoch": 1.6959524433954019, "grad_norm": 1.8435873985290527, "learning_rate": 1.750805459551286e-05, "loss": 0.1976, "step": 9700 }, { "epoch": 1.6977008479762217, "grad_norm": 2.1045944690704346, "learning_rate": 1.748462304493e-05, "loss": 0.2117, "step": 9710 }, { "epoch": 1.6994492525570417, "grad_norm": 2.2373058795928955, "learning_rate": 1.746119149434714e-05, "loss": 0.2231, "step": 9720 }, { "epoch": 1.7011976571378618, "grad_norm": 4.231749534606934, "learning_rate": 1.743775994376428e-05, "loss": 0.2033, "step": 9730 }, { "epoch": 1.7029460617186816, "grad_norm": 1.860250473022461, "learning_rate": 1.7414328393181422e-05, "loss": 0.1666, "step": 9740 }, { "epoch": 1.7046944662995016, "grad_norm": 1.631990909576416, "learning_rate": 1.7390896842598563e-05, "loss": 0.1946, "step": 9750 }, { "epoch": 1.7064428708803216, "grad_norm": 2.109370470046997, "learning_rate": 1.7367465292015703e-05, "loss": 0.2146, "step": 9760 }, { "epoch": 1.7081912754611417, "grad_norm": 1.7522424459457397, "learning_rate": 1.734403374143284e-05, "loss": 0.2189, "step": 9770 }, { "epoch": 1.7099396800419617, "grad_norm": 2.2851176261901855, "learning_rate": 1.732060219084998e-05, "loss": 0.1709, "step": 9780 }, { "epoch": 1.7116880846227818, "grad_norm": 1.994432806968689, "learning_rate": 1.729717064026712e-05, "loss": 0.1814, "step": 9790 }, { "epoch": 1.7134364892036018, "grad_norm": 1.5442957878112793, "learning_rate": 1.7273739089684262e-05, "loss": 0.1878, "step": 9800 }, { "epoch": 1.7151848937844218, "grad_norm": 1.8015555143356323, "learning_rate": 1.72503075391014e-05, "loss": 0.2224, "step": 9810 }, { "epoch": 1.7169332983652417, "grad_norm": 2.210533380508423, "learning_rate": 1.722687598851854e-05, "loss": 0.209, "step": 9820 }, { "epoch": 1.7186817029460617, "grad_norm": 1.9179598093032837, "learning_rate": 1.720344443793568e-05, "loss": 0.1857, "step": 9830 }, { "epoch": 1.7204301075268817, "grad_norm": 3.421584367752075, "learning_rate": 1.718001288735282e-05, "loss": 0.219, "step": 9840 }, { "epoch": 1.7221785121077016, "grad_norm": 2.5188374519348145, "learning_rate": 1.715658133676996e-05, "loss": 0.1896, "step": 9850 }, { "epoch": 1.7239269166885216, "grad_norm": 2.2160191535949707, "learning_rate": 1.7133149786187102e-05, "loss": 0.1987, "step": 9860 }, { "epoch": 1.7256753212693416, "grad_norm": 2.5851011276245117, "learning_rate": 1.7109718235604243e-05, "loss": 0.1861, "step": 9870 }, { "epoch": 1.7274237258501617, "grad_norm": 1.6431199312210083, "learning_rate": 1.7086286685021383e-05, "loss": 0.167, "step": 9880 }, { "epoch": 1.7291721304309817, "grad_norm": 2.0865867137908936, "learning_rate": 1.7062855134438524e-05, "loss": 0.1806, "step": 9890 }, { "epoch": 1.7309205350118018, "grad_norm": 1.9474307298660278, "learning_rate": 1.7039423583855665e-05, "loss": 0.1907, "step": 9900 }, { "epoch": 1.7326689395926218, "grad_norm": 2.301378011703491, "learning_rate": 1.7015992033272802e-05, "loss": 0.2028, "step": 9910 }, { "epoch": 1.7344173441734418, "grad_norm": 1.9113527536392212, "learning_rate": 1.6992560482689942e-05, "loss": 0.2172, "step": 9920 }, { "epoch": 1.7361657487542619, "grad_norm": 2.14692759513855, "learning_rate": 1.6969128932107083e-05, "loss": 0.1846, "step": 9930 }, { "epoch": 1.7379141533350817, "grad_norm": 2.062518358230591, "learning_rate": 1.6945697381524224e-05, "loss": 0.1902, "step": 9940 }, { "epoch": 1.7396625579159017, "grad_norm": 3.0750021934509277, "learning_rate": 1.6922265830941364e-05, "loss": 0.1944, "step": 9950 }, { "epoch": 1.7414109624967218, "grad_norm": 1.9308764934539795, "learning_rate": 1.6898834280358505e-05, "loss": 0.2198, "step": 9960 }, { "epoch": 1.7431593670775416, "grad_norm": 2.22654128074646, "learning_rate": 1.6875402729775645e-05, "loss": 0.1913, "step": 9970 }, { "epoch": 1.7449077716583616, "grad_norm": 2.7233572006225586, "learning_rate": 1.6851971179192786e-05, "loss": 0.198, "step": 9980 }, { "epoch": 1.7466561762391817, "grad_norm": 2.2208783626556396, "learning_rate": 1.6828539628609926e-05, "loss": 0.2084, "step": 9990 }, { "epoch": 1.7484045808200017, "grad_norm": 1.8110119104385376, "learning_rate": 1.6805108078027067e-05, "loss": 0.1946, "step": 10000 }, { "epoch": 1.7501529854008218, "grad_norm": 2.9725208282470703, "learning_rate": 1.6781676527444204e-05, "loss": 0.179, "step": 10010 }, { "epoch": 1.7519013899816418, "grad_norm": 5.847568511962891, "learning_rate": 1.6758244976861345e-05, "loss": 0.2022, "step": 10020 }, { "epoch": 1.7536497945624618, "grad_norm": 2.0036888122558594, "learning_rate": 1.6734813426278485e-05, "loss": 0.1727, "step": 10030 }, { "epoch": 1.7553981991432819, "grad_norm": 2.1402735710144043, "learning_rate": 1.6711381875695626e-05, "loss": 0.1903, "step": 10040 }, { "epoch": 1.757146603724102, "grad_norm": 4.128479480743408, "learning_rate": 1.6687950325112767e-05, "loss": 0.2195, "step": 10050 }, { "epoch": 1.7588950083049217, "grad_norm": 1.9900801181793213, "learning_rate": 1.6664518774529904e-05, "loss": 0.1946, "step": 10060 }, { "epoch": 1.7606434128857418, "grad_norm": 1.4300425052642822, "learning_rate": 1.6641087223947044e-05, "loss": 0.1835, "step": 10070 }, { "epoch": 1.7623918174665618, "grad_norm": 2.1098504066467285, "learning_rate": 1.6617655673364185e-05, "loss": 0.2234, "step": 10080 }, { "epoch": 1.7641402220473816, "grad_norm": 4.656988143920898, "learning_rate": 1.6594224122781326e-05, "loss": 0.217, "step": 10090 }, { "epoch": 1.7658886266282017, "grad_norm": 1.80104660987854, "learning_rate": 1.6570792572198466e-05, "loss": 0.1836, "step": 10100 }, { "epoch": 1.7676370312090217, "grad_norm": 2.10807466506958, "learning_rate": 1.6547361021615607e-05, "loss": 0.2034, "step": 10110 }, { "epoch": 1.7693854357898418, "grad_norm": 2.7385706901550293, "learning_rate": 1.6523929471032747e-05, "loss": 0.216, "step": 10120 }, { "epoch": 1.7711338403706618, "grad_norm": 2.4053244590759277, "learning_rate": 1.6500497920449888e-05, "loss": 0.1993, "step": 10130 }, { "epoch": 1.7728822449514818, "grad_norm": 2.8163459300994873, "learning_rate": 1.647706636986703e-05, "loss": 0.1719, "step": 10140 }, { "epoch": 1.7746306495323019, "grad_norm": 1.9868711233139038, "learning_rate": 1.645363481928417e-05, "loss": 0.2597, "step": 10150 }, { "epoch": 1.776379054113122, "grad_norm": 2.1938767433166504, "learning_rate": 1.643020326870131e-05, "loss": 0.1863, "step": 10160 }, { "epoch": 1.7781274586939417, "grad_norm": 2.0124146938323975, "learning_rate": 1.6406771718118447e-05, "loss": 0.1958, "step": 10170 }, { "epoch": 1.7798758632747618, "grad_norm": 1.6832975149154663, "learning_rate": 1.6383340167535587e-05, "loss": 0.206, "step": 10180 }, { "epoch": 1.7816242678555818, "grad_norm": 2.3199076652526855, "learning_rate": 1.6359908616952728e-05, "loss": 0.2055, "step": 10190 }, { "epoch": 1.7833726724364016, "grad_norm": 3.9184508323669434, "learning_rate": 1.633647706636987e-05, "loss": 0.207, "step": 10200 }, { "epoch": 1.7851210770172217, "grad_norm": 2.7728350162506104, "learning_rate": 1.631304551578701e-05, "loss": 0.1892, "step": 10210 }, { "epoch": 1.7868694815980417, "grad_norm": 2.6902334690093994, "learning_rate": 1.628961396520415e-05, "loss": 0.1797, "step": 10220 }, { "epoch": 1.7886178861788617, "grad_norm": 1.4976388216018677, "learning_rate": 1.626618241462129e-05, "loss": 0.181, "step": 10230 }, { "epoch": 1.7903662907596818, "grad_norm": 2.592813730239868, "learning_rate": 1.624275086403843e-05, "loss": 0.192, "step": 10240 }, { "epoch": 1.7921146953405018, "grad_norm": 1.3043900728225708, "learning_rate": 1.6219319313455568e-05, "loss": 0.177, "step": 10250 }, { "epoch": 1.7938630999213219, "grad_norm": 3.4276037216186523, "learning_rate": 1.619588776287271e-05, "loss": 0.1738, "step": 10260 }, { "epoch": 1.795611504502142, "grad_norm": 3.072364568710327, "learning_rate": 1.617245621228985e-05, "loss": 0.211, "step": 10270 }, { "epoch": 1.797359909082962, "grad_norm": 1.6340441703796387, "learning_rate": 1.614902466170699e-05, "loss": 0.1852, "step": 10280 }, { "epoch": 1.7991083136637818, "grad_norm": 3.1025967597961426, "learning_rate": 1.612559311112413e-05, "loss": 0.1863, "step": 10290 }, { "epoch": 1.8008567182446018, "grad_norm": 2.3155341148376465, "learning_rate": 1.610216156054127e-05, "loss": 0.1998, "step": 10300 }, { "epoch": 1.8026051228254218, "grad_norm": 1.6753582954406738, "learning_rate": 1.6078730009958408e-05, "loss": 0.1839, "step": 10310 }, { "epoch": 1.8043535274062417, "grad_norm": 1.5715868473052979, "learning_rate": 1.605529845937555e-05, "loss": 0.1732, "step": 10320 }, { "epoch": 1.8061019319870617, "grad_norm": 2.3362326622009277, "learning_rate": 1.603186690879269e-05, "loss": 0.2014, "step": 10330 }, { "epoch": 1.8078503365678817, "grad_norm": 2.038578510284424, "learning_rate": 1.600843535820983e-05, "loss": 0.1737, "step": 10340 }, { "epoch": 1.8095987411487018, "grad_norm": 3.527510643005371, "learning_rate": 1.598500380762697e-05, "loss": 0.2329, "step": 10350 }, { "epoch": 1.8113471457295218, "grad_norm": 1.9759219884872437, "learning_rate": 1.596157225704411e-05, "loss": 0.1867, "step": 10360 }, { "epoch": 1.8130955503103419, "grad_norm": 1.822554588317871, "learning_rate": 1.5938140706461252e-05, "loss": 0.1904, "step": 10370 }, { "epoch": 1.814843954891162, "grad_norm": 2.0749101638793945, "learning_rate": 1.5914709155878392e-05, "loss": 0.1875, "step": 10380 }, { "epoch": 1.816592359471982, "grad_norm": 2.470715045928955, "learning_rate": 1.5891277605295533e-05, "loss": 0.1864, "step": 10390 }, { "epoch": 1.818340764052802, "grad_norm": 1.7627729177474976, "learning_rate": 1.5867846054712674e-05, "loss": 0.1887, "step": 10400 }, { "epoch": 1.8200891686336218, "grad_norm": 1.9679255485534668, "learning_rate": 1.5844414504129814e-05, "loss": 0.2045, "step": 10410 }, { "epoch": 1.8218375732144418, "grad_norm": 2.222324848175049, "learning_rate": 1.582098295354695e-05, "loss": 0.1988, "step": 10420 }, { "epoch": 1.8235859777952617, "grad_norm": 2.272304058074951, "learning_rate": 1.5797551402964092e-05, "loss": 0.1825, "step": 10430 }, { "epoch": 1.8253343823760817, "grad_norm": 2.1010255813598633, "learning_rate": 1.5774119852381232e-05, "loss": 0.1801, "step": 10440 }, { "epoch": 1.8270827869569017, "grad_norm": 2.3894827365875244, "learning_rate": 1.5750688301798373e-05, "loss": 0.1912, "step": 10450 }, { "epoch": 1.8288311915377218, "grad_norm": 2.351548910140991, "learning_rate": 1.5727256751215514e-05, "loss": 0.1814, "step": 10460 }, { "epoch": 1.8305795961185418, "grad_norm": 1.5725765228271484, "learning_rate": 1.5703825200632654e-05, "loss": 0.2019, "step": 10470 }, { "epoch": 1.8323280006993619, "grad_norm": 2.2197859287261963, "learning_rate": 1.5680393650049795e-05, "loss": 0.2174, "step": 10480 }, { "epoch": 1.834076405280182, "grad_norm": 2.1431429386138916, "learning_rate": 1.5656962099466932e-05, "loss": 0.1944, "step": 10490 }, { "epoch": 1.835824809861002, "grad_norm": 2.159376382827759, "learning_rate": 1.5633530548884073e-05, "loss": 0.1816, "step": 10500 }, { "epoch": 1.837573214441822, "grad_norm": 2.3646135330200195, "learning_rate": 1.5610098998301213e-05, "loss": 0.1694, "step": 10510 }, { "epoch": 1.8393216190226418, "grad_norm": 1.8579986095428467, "learning_rate": 1.5586667447718354e-05, "loss": 0.1852, "step": 10520 }, { "epoch": 1.8410700236034618, "grad_norm": 1.8673964738845825, "learning_rate": 1.5563235897135494e-05, "loss": 0.215, "step": 10530 }, { "epoch": 1.8428184281842819, "grad_norm": 1.8977408409118652, "learning_rate": 1.5539804346552635e-05, "loss": 0.1745, "step": 10540 }, { "epoch": 1.8445668327651017, "grad_norm": 2.263277530670166, "learning_rate": 1.5516372795969776e-05, "loss": 0.1934, "step": 10550 }, { "epoch": 1.8463152373459217, "grad_norm": 3.5230796337127686, "learning_rate": 1.5492941245386913e-05, "loss": 0.2121, "step": 10560 }, { "epoch": 1.8480636419267418, "grad_norm": 2.2471237182617188, "learning_rate": 1.5469509694804053e-05, "loss": 0.1665, "step": 10570 }, { "epoch": 1.8498120465075618, "grad_norm": 1.7764135599136353, "learning_rate": 1.5446078144221194e-05, "loss": 0.1789, "step": 10580 }, { "epoch": 1.8515604510883819, "grad_norm": 2.5531044006347656, "learning_rate": 1.5422646593638334e-05, "loss": 0.198, "step": 10590 }, { "epoch": 1.853308855669202, "grad_norm": 1.9833929538726807, "learning_rate": 1.5399215043055475e-05, "loss": 0.1773, "step": 10600 }, { "epoch": 1.855057260250022, "grad_norm": 3.2580177783966064, "learning_rate": 1.5375783492472616e-05, "loss": 0.1999, "step": 10610 }, { "epoch": 1.856805664830842, "grad_norm": 1.653817057609558, "learning_rate": 1.5352351941889756e-05, "loss": 0.1903, "step": 10620 }, { "epoch": 1.858554069411662, "grad_norm": 2.687511920928955, "learning_rate": 1.5328920391306897e-05, "loss": 0.1955, "step": 10630 }, { "epoch": 1.8603024739924818, "grad_norm": 2.3848729133605957, "learning_rate": 1.5305488840724037e-05, "loss": 0.1644, "step": 10640 }, { "epoch": 1.8620508785733019, "grad_norm": 3.581122636795044, "learning_rate": 1.5282057290141178e-05, "loss": 0.1731, "step": 10650 }, { "epoch": 1.863799283154122, "grad_norm": 2.3530807495117188, "learning_rate": 1.5258625739558317e-05, "loss": 0.1719, "step": 10660 }, { "epoch": 1.8655476877349417, "grad_norm": 3.611687421798706, "learning_rate": 1.5235194188975457e-05, "loss": 0.2315, "step": 10670 }, { "epoch": 1.8672960923157618, "grad_norm": 2.133084774017334, "learning_rate": 1.5211762638392598e-05, "loss": 0.222, "step": 10680 }, { "epoch": 1.8690444968965818, "grad_norm": 2.017202854156494, "learning_rate": 1.5188331087809739e-05, "loss": 0.1537, "step": 10690 }, { "epoch": 1.8707929014774018, "grad_norm": 4.252660751342773, "learning_rate": 1.5164899537226878e-05, "loss": 0.2105, "step": 10700 }, { "epoch": 1.8725413060582219, "grad_norm": 1.902491569519043, "learning_rate": 1.5141467986644018e-05, "loss": 0.1876, "step": 10710 }, { "epoch": 1.874289710639042, "grad_norm": 5.36565637588501, "learning_rate": 1.5118036436061159e-05, "loss": 0.1734, "step": 10720 }, { "epoch": 1.876038115219862, "grad_norm": 1.6138919591903687, "learning_rate": 1.5094604885478296e-05, "loss": 0.1812, "step": 10730 }, { "epoch": 1.877786519800682, "grad_norm": 3.4811031818389893, "learning_rate": 1.5071173334895436e-05, "loss": 0.1883, "step": 10740 }, { "epoch": 1.8795349243815018, "grad_norm": 2.3118202686309814, "learning_rate": 1.5047741784312577e-05, "loss": 0.1933, "step": 10750 }, { "epoch": 1.8812833289623219, "grad_norm": 2.5845682621002197, "learning_rate": 1.5024310233729718e-05, "loss": 0.2122, "step": 10760 }, { "epoch": 1.883031733543142, "grad_norm": 1.4849445819854736, "learning_rate": 1.5000878683146858e-05, "loss": 0.1763, "step": 10770 }, { "epoch": 1.8847801381239617, "grad_norm": 1.851722240447998, "learning_rate": 1.4977447132563999e-05, "loss": 0.1837, "step": 10780 }, { "epoch": 1.8865285427047818, "grad_norm": 1.793611764907837, "learning_rate": 1.4954015581981138e-05, "loss": 0.1894, "step": 10790 }, { "epoch": 1.8882769472856018, "grad_norm": 1.6560570001602173, "learning_rate": 1.4930584031398278e-05, "loss": 0.2056, "step": 10800 }, { "epoch": 1.8900253518664218, "grad_norm": 2.940868377685547, "learning_rate": 1.4907152480815419e-05, "loss": 0.2131, "step": 10810 }, { "epoch": 1.8917737564472419, "grad_norm": 1.374992847442627, "learning_rate": 1.488372093023256e-05, "loss": 0.2025, "step": 10820 }, { "epoch": 1.893522161028062, "grad_norm": 4.640373229980469, "learning_rate": 1.48602893796497e-05, "loss": 0.2027, "step": 10830 }, { "epoch": 1.895270565608882, "grad_norm": 1.665847659111023, "learning_rate": 1.4836857829066839e-05, "loss": 0.1858, "step": 10840 }, { "epoch": 1.897018970189702, "grad_norm": 2.2026145458221436, "learning_rate": 1.481342627848398e-05, "loss": 0.1924, "step": 10850 }, { "epoch": 1.898767374770522, "grad_norm": 2.361902952194214, "learning_rate": 1.478999472790112e-05, "loss": 0.2083, "step": 10860 }, { "epoch": 1.9005157793513419, "grad_norm": 2.2269675731658936, "learning_rate": 1.476656317731826e-05, "loss": 0.1747, "step": 10870 }, { "epoch": 1.902264183932162, "grad_norm": 2.43367338180542, "learning_rate": 1.4743131626735401e-05, "loss": 0.1902, "step": 10880 }, { "epoch": 1.904012588512982, "grad_norm": 1.6025736331939697, "learning_rate": 1.4719700076152542e-05, "loss": 0.1636, "step": 10890 }, { "epoch": 1.9057609930938018, "grad_norm": 2.627732038497925, "learning_rate": 1.469626852556968e-05, "loss": 0.1993, "step": 10900 }, { "epoch": 1.9075093976746218, "grad_norm": 2.630786180496216, "learning_rate": 1.4672836974986821e-05, "loss": 0.1601, "step": 10910 }, { "epoch": 1.9092578022554418, "grad_norm": 2.772336006164551, "learning_rate": 1.4649405424403962e-05, "loss": 0.2306, "step": 10920 }, { "epoch": 1.9110062068362619, "grad_norm": 2.6443400382995605, "learning_rate": 1.4625973873821103e-05, "loss": 0.1774, "step": 10930 }, { "epoch": 1.912754611417082, "grad_norm": 2.066016435623169, "learning_rate": 1.4602542323238243e-05, "loss": 0.1781, "step": 10940 }, { "epoch": 1.914503015997902, "grad_norm": 2.2029716968536377, "learning_rate": 1.4579110772655382e-05, "loss": 0.1656, "step": 10950 }, { "epoch": 1.916251420578722, "grad_norm": 2.0414412021636963, "learning_rate": 1.4555679222072523e-05, "loss": 0.1927, "step": 10960 }, { "epoch": 1.917999825159542, "grad_norm": 2.9510364532470703, "learning_rate": 1.4532247671489661e-05, "loss": 0.1812, "step": 10970 }, { "epoch": 1.919748229740362, "grad_norm": 4.375843524932861, "learning_rate": 1.4508816120906802e-05, "loss": 0.2294, "step": 10980 }, { "epoch": 1.921496634321182, "grad_norm": 1.7535252571105957, "learning_rate": 1.4485384570323941e-05, "loss": 0.1728, "step": 10990 }, { "epoch": 1.923245038902002, "grad_norm": 2.152392864227295, "learning_rate": 1.4464296174799367e-05, "loss": 0.1896, "step": 11000 }, { "epoch": 1.924993443482822, "grad_norm": 1.4847190380096436, "learning_rate": 1.4440864624216508e-05, "loss": 0.2039, "step": 11010 }, { "epoch": 1.9267418480636418, "grad_norm": 2.027345895767212, "learning_rate": 1.4417433073633649e-05, "loss": 0.2019, "step": 11020 }, { "epoch": 1.9284902526444618, "grad_norm": 2.9172937870025635, "learning_rate": 1.439400152305079e-05, "loss": 0.1894, "step": 11030 }, { "epoch": 1.9302386572252819, "grad_norm": 2.5305731296539307, "learning_rate": 1.437056997246793e-05, "loss": 0.2269, "step": 11040 }, { "epoch": 1.931987061806102, "grad_norm": 1.8299009799957275, "learning_rate": 1.4347138421885069e-05, "loss": 0.1715, "step": 11050 }, { "epoch": 1.933735466386922, "grad_norm": 2.175562620162964, "learning_rate": 1.432370687130221e-05, "loss": 0.1625, "step": 11060 }, { "epoch": 1.935483870967742, "grad_norm": 1.5124961137771606, "learning_rate": 1.430027532071935e-05, "loss": 0.1808, "step": 11070 }, { "epoch": 1.937232275548562, "grad_norm": 2.2984073162078857, "learning_rate": 1.427684377013649e-05, "loss": 0.1946, "step": 11080 }, { "epoch": 1.938980680129382, "grad_norm": 1.8547557592391968, "learning_rate": 1.4253412219553631e-05, "loss": 0.1866, "step": 11090 }, { "epoch": 1.940729084710202, "grad_norm": 2.1592087745666504, "learning_rate": 1.4229980668970772e-05, "loss": 0.1768, "step": 11100 }, { "epoch": 1.942477489291022, "grad_norm": 2.334019660949707, "learning_rate": 1.420654911838791e-05, "loss": 0.1688, "step": 11110 }, { "epoch": 1.944225893871842, "grad_norm": 1.6526786088943481, "learning_rate": 1.4183117567805051e-05, "loss": 0.1959, "step": 11120 }, { "epoch": 1.9459742984526618, "grad_norm": 1.7332444190979004, "learning_rate": 1.4159686017222192e-05, "loss": 0.1918, "step": 11130 }, { "epoch": 1.9477227030334818, "grad_norm": 1.4149800539016724, "learning_rate": 1.4136254466639332e-05, "loss": 0.1459, "step": 11140 }, { "epoch": 1.9494711076143019, "grad_norm": 1.6035906076431274, "learning_rate": 1.4112822916056473e-05, "loss": 0.1686, "step": 11150 }, { "epoch": 1.951219512195122, "grad_norm": 2.4894981384277344, "learning_rate": 1.408939136547361e-05, "loss": 0.1983, "step": 11160 }, { "epoch": 1.952967916775942, "grad_norm": 1.827898621559143, "learning_rate": 1.406595981489075e-05, "loss": 0.1856, "step": 11170 }, { "epoch": 1.954716321356762, "grad_norm": 1.7103666067123413, "learning_rate": 1.4042528264307891e-05, "loss": 0.1987, "step": 11180 }, { "epoch": 1.956464725937582, "grad_norm": 1.692668080329895, "learning_rate": 1.4019096713725032e-05, "loss": 0.1761, "step": 11190 }, { "epoch": 1.958213130518402, "grad_norm": 2.089463710784912, "learning_rate": 1.399566516314217e-05, "loss": 0.1899, "step": 11200 }, { "epoch": 1.959961535099222, "grad_norm": 1.9206026792526245, "learning_rate": 1.3972233612559311e-05, "loss": 0.184, "step": 11210 }, { "epoch": 1.961709939680042, "grad_norm": 1.8816583156585693, "learning_rate": 1.3948802061976452e-05, "loss": 0.1976, "step": 11220 }, { "epoch": 1.963458344260862, "grad_norm": 4.568369388580322, "learning_rate": 1.3925370511393592e-05, "loss": 0.1745, "step": 11230 }, { "epoch": 1.965206748841682, "grad_norm": 1.8514816761016846, "learning_rate": 1.3901938960810733e-05, "loss": 0.1673, "step": 11240 }, { "epoch": 1.9669551534225018, "grad_norm": 4.430703163146973, "learning_rate": 1.3878507410227872e-05, "loss": 0.1754, "step": 11250 }, { "epoch": 1.9687035580033219, "grad_norm": 1.8397603034973145, "learning_rate": 1.3855075859645013e-05, "loss": 0.254, "step": 11260 }, { "epoch": 1.970451962584142, "grad_norm": 1.7431299686431885, "learning_rate": 1.3831644309062153e-05, "loss": 0.1611, "step": 11270 }, { "epoch": 1.972200367164962, "grad_norm": 2.219205141067505, "learning_rate": 1.3808212758479294e-05, "loss": 0.1885, "step": 11280 }, { "epoch": 1.973948771745782, "grad_norm": 2.130847930908203, "learning_rate": 1.3784781207896434e-05, "loss": 0.1949, "step": 11290 }, { "epoch": 1.975697176326602, "grad_norm": 3.2657315731048584, "learning_rate": 1.3761349657313575e-05, "loss": 0.1534, "step": 11300 }, { "epoch": 1.977445580907422, "grad_norm": 3.336939811706543, "learning_rate": 1.3737918106730714e-05, "loss": 0.1989, "step": 11310 }, { "epoch": 1.979193985488242, "grad_norm": 1.9921077489852905, "learning_rate": 1.3714486556147854e-05, "loss": 0.1959, "step": 11320 }, { "epoch": 1.9809423900690621, "grad_norm": 1.6148415803909302, "learning_rate": 1.3691055005564995e-05, "loss": 0.1564, "step": 11330 }, { "epoch": 1.982690794649882, "grad_norm": 1.0459388494491577, "learning_rate": 1.3667623454982136e-05, "loss": 0.1766, "step": 11340 }, { "epoch": 1.984439199230702, "grad_norm": 3.012587308883667, "learning_rate": 1.3644191904399276e-05, "loss": 0.1731, "step": 11350 }, { "epoch": 1.986187603811522, "grad_norm": 1.4650547504425049, "learning_rate": 1.3620760353816415e-05, "loss": 0.1637, "step": 11360 }, { "epoch": 1.9879360083923419, "grad_norm": 1.4707012176513672, "learning_rate": 1.3597328803233556e-05, "loss": 0.1917, "step": 11370 }, { "epoch": 1.989684412973162, "grad_norm": 1.6757373809814453, "learning_rate": 1.3573897252650696e-05, "loss": 0.1974, "step": 11380 }, { "epoch": 1.991432817553982, "grad_norm": 2.444448471069336, "learning_rate": 1.3550465702067837e-05, "loss": 0.1702, "step": 11390 }, { "epoch": 1.993181222134802, "grad_norm": 2.135993003845215, "learning_rate": 1.3527034151484974e-05, "loss": 0.1989, "step": 11400 }, { "epoch": 1.994929626715622, "grad_norm": 1.804882526397705, "learning_rate": 1.3503602600902115e-05, "loss": 0.1994, "step": 11410 }, { "epoch": 1.996678031296442, "grad_norm": 1.5056926012039185, "learning_rate": 1.3480171050319255e-05, "loss": 0.1721, "step": 11420 }, { "epoch": 1.998426435877262, "grad_norm": 1.6799705028533936, "learning_rate": 1.3456739499736396e-05, "loss": 0.1924, "step": 11430 }, { "epoch": 2.000174840458082, "grad_norm": 1.5842691659927368, "learning_rate": 1.3433307949153536e-05, "loss": 0.2088, "step": 11440 }, { "epoch": 2.001923245038902, "grad_norm": 1.5139780044555664, "learning_rate": 1.3409876398570675e-05, "loss": 0.1146, "step": 11450 }, { "epoch": 2.003671649619722, "grad_norm": 1.719301462173462, "learning_rate": 1.3386444847987816e-05, "loss": 0.1513, "step": 11460 }, { "epoch": 2.005420054200542, "grad_norm": 2.1831586360931396, "learning_rate": 1.3363013297404956e-05, "loss": 0.1308, "step": 11470 }, { "epoch": 2.007168458781362, "grad_norm": 2.1777830123901367, "learning_rate": 1.3339581746822097e-05, "loss": 0.1188, "step": 11480 }, { "epoch": 2.008916863362182, "grad_norm": 1.8375096321105957, "learning_rate": 1.3316150196239238e-05, "loss": 0.1419, "step": 11490 }, { "epoch": 2.010665267943002, "grad_norm": 2.3195183277130127, "learning_rate": 1.3292718645656378e-05, "loss": 0.125, "step": 11500 }, { "epoch": 2.012413672523822, "grad_norm": 1.6732579469680786, "learning_rate": 1.3269287095073517e-05, "loss": 0.158, "step": 11510 }, { "epoch": 2.014162077104642, "grad_norm": 1.6776469945907593, "learning_rate": 1.3245855544490658e-05, "loss": 0.1204, "step": 11520 }, { "epoch": 2.015910481685462, "grad_norm": 1.415498971939087, "learning_rate": 1.3222423993907798e-05, "loss": 0.1534, "step": 11530 }, { "epoch": 2.017658886266282, "grad_norm": 1.47972571849823, "learning_rate": 1.3198992443324939e-05, "loss": 0.1335, "step": 11540 }, { "epoch": 2.019407290847102, "grad_norm": 3.020838499069214, "learning_rate": 1.317556089274208e-05, "loss": 0.1177, "step": 11550 }, { "epoch": 2.021155695427922, "grad_norm": 2.071444034576416, "learning_rate": 1.3152129342159218e-05, "loss": 0.1442, "step": 11560 }, { "epoch": 2.022904100008742, "grad_norm": 4.56058931350708, "learning_rate": 1.3128697791576359e-05, "loss": 0.1271, "step": 11570 }, { "epoch": 2.024652504589562, "grad_norm": 2.9579317569732666, "learning_rate": 1.31052662409935e-05, "loss": 0.1311, "step": 11580 }, { "epoch": 2.026400909170382, "grad_norm": 1.6421356201171875, "learning_rate": 1.308183469041064e-05, "loss": 0.1333, "step": 11590 }, { "epoch": 2.028149313751202, "grad_norm": 3.032768726348877, "learning_rate": 1.305840313982778e-05, "loss": 0.1532, "step": 11600 }, { "epoch": 2.029897718332022, "grad_norm": 3.688626766204834, "learning_rate": 1.303497158924492e-05, "loss": 0.1257, "step": 11610 }, { "epoch": 2.031646122912842, "grad_norm": 1.906921148300171, "learning_rate": 1.301154003866206e-05, "loss": 0.1316, "step": 11620 }, { "epoch": 2.033394527493662, "grad_norm": 1.6427615880966187, "learning_rate": 1.29881084880792e-05, "loss": 0.1173, "step": 11630 }, { "epoch": 2.035142932074482, "grad_norm": 2.043480157852173, "learning_rate": 1.296467693749634e-05, "loss": 0.1348, "step": 11640 }, { "epoch": 2.036891336655302, "grad_norm": 2.482868194580078, "learning_rate": 1.2941245386913478e-05, "loss": 0.1229, "step": 11650 }, { "epoch": 2.038639741236122, "grad_norm": 3.356874465942383, "learning_rate": 1.2917813836330619e-05, "loss": 0.1163, "step": 11660 }, { "epoch": 2.040388145816942, "grad_norm": 1.3664606809616089, "learning_rate": 1.289438228574776e-05, "loss": 0.1256, "step": 11670 }, { "epoch": 2.042136550397762, "grad_norm": 2.038381338119507, "learning_rate": 1.28709507351649e-05, "loss": 0.1458, "step": 11680 }, { "epoch": 2.0438849549785822, "grad_norm": 2.1345996856689453, "learning_rate": 1.284751918458204e-05, "loss": 0.1234, "step": 11690 }, { "epoch": 2.045633359559402, "grad_norm": 1.422678828239441, "learning_rate": 1.282408763399918e-05, "loss": 0.117, "step": 11700 }, { "epoch": 2.047381764140222, "grad_norm": 1.686991810798645, "learning_rate": 1.280065608341632e-05, "loss": 0.1389, "step": 11710 }, { "epoch": 2.049130168721042, "grad_norm": 1.8135253190994263, "learning_rate": 1.277722453283346e-05, "loss": 0.1121, "step": 11720 }, { "epoch": 2.050878573301862, "grad_norm": 1.7057915925979614, "learning_rate": 1.2753792982250601e-05, "loss": 0.1496, "step": 11730 }, { "epoch": 2.052626977882682, "grad_norm": 2.205564022064209, "learning_rate": 1.2730361431667742e-05, "loss": 0.1321, "step": 11740 }, { "epoch": 2.054375382463502, "grad_norm": 2.5153441429138184, "learning_rate": 1.2706929881084883e-05, "loss": 0.1138, "step": 11750 }, { "epoch": 2.056123787044322, "grad_norm": 1.5551224946975708, "learning_rate": 1.2683498330502021e-05, "loss": 0.1317, "step": 11760 }, { "epoch": 2.057872191625142, "grad_norm": 1.755265712738037, "learning_rate": 1.2660066779919162e-05, "loss": 0.1465, "step": 11770 }, { "epoch": 2.059620596205962, "grad_norm": 1.6050928831100464, "learning_rate": 1.2636635229336303e-05, "loss": 0.1178, "step": 11780 }, { "epoch": 2.061369000786782, "grad_norm": 0.8174687623977661, "learning_rate": 1.2613203678753443e-05, "loss": 0.1355, "step": 11790 }, { "epoch": 2.0631174053676022, "grad_norm": 2.0146546363830566, "learning_rate": 1.2589772128170584e-05, "loss": 0.1395, "step": 11800 }, { "epoch": 2.0648658099484223, "grad_norm": 3.2247419357299805, "learning_rate": 1.2566340577587723e-05, "loss": 0.1496, "step": 11810 }, { "epoch": 2.066614214529242, "grad_norm": 2.9578254222869873, "learning_rate": 1.2542909027004863e-05, "loss": 0.1384, "step": 11820 }, { "epoch": 2.068362619110062, "grad_norm": 1.7490684986114502, "learning_rate": 1.2519477476422004e-05, "loss": 0.1363, "step": 11830 }, { "epoch": 2.070111023690882, "grad_norm": 1.931545376777649, "learning_rate": 1.2496045925839144e-05, "loss": 0.1652, "step": 11840 }, { "epoch": 2.071859428271702, "grad_norm": 1.7428362369537354, "learning_rate": 1.2472614375256285e-05, "loss": 0.1309, "step": 11850 }, { "epoch": 2.073607832852522, "grad_norm": 3.5689780712127686, "learning_rate": 1.2449182824673424e-05, "loss": 0.121, "step": 11860 }, { "epoch": 2.075356237433342, "grad_norm": 1.8899126052856445, "learning_rate": 1.2425751274090565e-05, "loss": 0.1452, "step": 11870 }, { "epoch": 2.077104642014162, "grad_norm": 2.014786720275879, "learning_rate": 1.2402319723507703e-05, "loss": 0.1327, "step": 11880 }, { "epoch": 2.078853046594982, "grad_norm": 2.1480844020843506, "learning_rate": 1.2378888172924844e-05, "loss": 0.1378, "step": 11890 }, { "epoch": 2.080601451175802, "grad_norm": 2.5206010341644287, "learning_rate": 1.2355456622341983e-05, "loss": 0.1684, "step": 11900 }, { "epoch": 2.0823498557566222, "grad_norm": 2.49927020072937, "learning_rate": 1.2332025071759123e-05, "loss": 0.1402, "step": 11910 }, { "epoch": 2.0840982603374423, "grad_norm": 1.4586842060089111, "learning_rate": 1.2308593521176264e-05, "loss": 0.1285, "step": 11920 }, { "epoch": 2.085846664918262, "grad_norm": 1.743302583694458, "learning_rate": 1.2285161970593405e-05, "loss": 0.1117, "step": 11930 }, { "epoch": 2.087595069499082, "grad_norm": 1.992079257965088, "learning_rate": 1.2261730420010545e-05, "loss": 0.1409, "step": 11940 }, { "epoch": 2.089343474079902, "grad_norm": 1.6316864490509033, "learning_rate": 1.2238298869427686e-05, "loss": 0.1229, "step": 11950 }, { "epoch": 2.091091878660722, "grad_norm": 1.8108314275741577, "learning_rate": 1.2214867318844825e-05, "loss": 0.1306, "step": 11960 }, { "epoch": 2.092840283241542, "grad_norm": 1.7207168340682983, "learning_rate": 1.2191435768261965e-05, "loss": 0.1316, "step": 11970 }, { "epoch": 2.094588687822362, "grad_norm": 2.498771905899048, "learning_rate": 1.2168004217679106e-05, "loss": 0.1368, "step": 11980 }, { "epoch": 2.096337092403182, "grad_norm": 1.751407504081726, "learning_rate": 1.2144572667096246e-05, "loss": 0.1416, "step": 11990 }, { "epoch": 2.098085496984002, "grad_norm": 1.6396225690841675, "learning_rate": 1.2121141116513387e-05, "loss": 0.1482, "step": 12000 }, { "epoch": 2.0987848588163303, "eval_loss": 0.3506932854652405, "eval_runtime": 1791.7022, "eval_samples_per_second": 8.083, "eval_steps_per_second": 1.011, "step": 12004 }, { "epoch": 2.099833901564822, "grad_norm": 2.5797410011291504, "learning_rate": 1.2097709565930526e-05, "loss": 0.1339, "step": 12010 }, { "epoch": 2.1015823061456422, "grad_norm": 3.2276177406311035, "learning_rate": 1.2074278015347667e-05, "loss": 0.132, "step": 12020 }, { "epoch": 2.1033307107264623, "grad_norm": 1.973792314529419, "learning_rate": 1.2050846464764807e-05, "loss": 0.1232, "step": 12030 }, { "epoch": 2.1050791153072823, "grad_norm": 1.5933607816696167, "learning_rate": 1.2027414914181948e-05, "loss": 0.1249, "step": 12040 }, { "epoch": 2.106827519888102, "grad_norm": 2.8495397567749023, "learning_rate": 1.2003983363599088e-05, "loss": 0.1822, "step": 12050 }, { "epoch": 2.108575924468922, "grad_norm": 2.044532060623169, "learning_rate": 1.1980551813016227e-05, "loss": 0.1298, "step": 12060 }, { "epoch": 2.110324329049742, "grad_norm": 2.275475263595581, "learning_rate": 1.1957120262433368e-05, "loss": 0.1242, "step": 12070 }, { "epoch": 2.112072733630562, "grad_norm": 2.919373035430908, "learning_rate": 1.1933688711850508e-05, "loss": 0.1314, "step": 12080 }, { "epoch": 2.113821138211382, "grad_norm": 2.3667123317718506, "learning_rate": 1.1910257161267649e-05, "loss": 0.1338, "step": 12090 }, { "epoch": 2.115569542792202, "grad_norm": 2.1862826347351074, "learning_rate": 1.188682561068479e-05, "loss": 0.1493, "step": 12100 }, { "epoch": 2.117317947373022, "grad_norm": 2.557068347930908, "learning_rate": 1.186339406010193e-05, "loss": 0.1367, "step": 12110 }, { "epoch": 2.119066351953842, "grad_norm": 2.359553813934326, "learning_rate": 1.1839962509519067e-05, "loss": 0.1549, "step": 12120 }, { "epoch": 2.1208147565346622, "grad_norm": 1.9232834577560425, "learning_rate": 1.1816530958936208e-05, "loss": 0.132, "step": 12130 }, { "epoch": 2.1225631611154823, "grad_norm": 1.867737889289856, "learning_rate": 1.1793099408353348e-05, "loss": 0.1318, "step": 12140 }, { "epoch": 2.1243115656963023, "grad_norm": 2.331395149230957, "learning_rate": 1.1769667857770487e-05, "loss": 0.1428, "step": 12150 }, { "epoch": 2.1260599702771223, "grad_norm": 4.814284324645996, "learning_rate": 1.1746236307187628e-05, "loss": 0.1303, "step": 12160 }, { "epoch": 2.127808374857942, "grad_norm": 1.9960688352584839, "learning_rate": 1.1722804756604769e-05, "loss": 0.1218, "step": 12170 }, { "epoch": 2.129556779438762, "grad_norm": 1.694406270980835, "learning_rate": 1.1699373206021909e-05, "loss": 0.1435, "step": 12180 }, { "epoch": 2.131305184019582, "grad_norm": 2.0222890377044678, "learning_rate": 1.167594165543905e-05, "loss": 0.1449, "step": 12190 }, { "epoch": 2.133053588600402, "grad_norm": 1.690718650817871, "learning_rate": 1.165251010485619e-05, "loss": 0.1452, "step": 12200 }, { "epoch": 2.134801993181222, "grad_norm": 2.7942721843719482, "learning_rate": 1.162907855427333e-05, "loss": 0.1302, "step": 12210 }, { "epoch": 2.136550397762042, "grad_norm": 2.274010181427002, "learning_rate": 1.1607990158748756e-05, "loss": 0.1433, "step": 12220 }, { "epoch": 2.138298802342862, "grad_norm": 2.7134246826171875, "learning_rate": 1.1584558608165896e-05, "loss": 0.1297, "step": 12230 }, { "epoch": 2.140047206923682, "grad_norm": 2.353001832962036, "learning_rate": 1.1561127057583037e-05, "loss": 0.1512, "step": 12240 }, { "epoch": 2.1417956115045023, "grad_norm": 1.4769039154052734, "learning_rate": 1.1537695507000177e-05, "loss": 0.134, "step": 12250 }, { "epoch": 2.1435440160853223, "grad_norm": 2.0705323219299316, "learning_rate": 1.1514263956417318e-05, "loss": 0.1114, "step": 12260 }, { "epoch": 2.1452924206661423, "grad_norm": 2.1008477210998535, "learning_rate": 1.1490832405834459e-05, "loss": 0.1321, "step": 12270 }, { "epoch": 2.147040825246962, "grad_norm": 1.299492597579956, "learning_rate": 1.1467400855251598e-05, "loss": 0.1174, "step": 12280 }, { "epoch": 2.148789229827782, "grad_norm": 1.4347295761108398, "learning_rate": 1.1443969304668738e-05, "loss": 0.1049, "step": 12290 }, { "epoch": 2.150537634408602, "grad_norm": 1.9035148620605469, "learning_rate": 1.1420537754085879e-05, "loss": 0.1322, "step": 12300 }, { "epoch": 2.152286038989422, "grad_norm": 1.6976128816604614, "learning_rate": 1.1397106203503016e-05, "loss": 0.1165, "step": 12310 }, { "epoch": 2.154034443570242, "grad_norm": 4.1831955909729, "learning_rate": 1.1373674652920156e-05, "loss": 0.1407, "step": 12320 }, { "epoch": 2.155782848151062, "grad_norm": 1.985929012298584, "learning_rate": 1.1350243102337297e-05, "loss": 0.1475, "step": 12330 }, { "epoch": 2.157531252731882, "grad_norm": 1.6526029109954834, "learning_rate": 1.1326811551754438e-05, "loss": 0.1476, "step": 12340 }, { "epoch": 2.159279657312702, "grad_norm": 2.873518228530884, "learning_rate": 1.1303380001171578e-05, "loss": 0.1206, "step": 12350 }, { "epoch": 2.1610280618935223, "grad_norm": 3.4296302795410156, "learning_rate": 1.1279948450588719e-05, "loss": 0.1226, "step": 12360 }, { "epoch": 2.1627764664743423, "grad_norm": 4.733137607574463, "learning_rate": 1.1256516900005858e-05, "loss": 0.1748, "step": 12370 }, { "epoch": 2.1645248710551623, "grad_norm": 1.851542353630066, "learning_rate": 1.1233085349422998e-05, "loss": 0.1483, "step": 12380 }, { "epoch": 2.166273275635982, "grad_norm": 1.5884191989898682, "learning_rate": 1.1209653798840139e-05, "loss": 0.1383, "step": 12390 }, { "epoch": 2.168021680216802, "grad_norm": 2.071790933609009, "learning_rate": 1.118622224825728e-05, "loss": 0.1534, "step": 12400 }, { "epoch": 2.169770084797622, "grad_norm": 2.419951915740967, "learning_rate": 1.116279069767442e-05, "loss": 0.1177, "step": 12410 }, { "epoch": 2.171518489378442, "grad_norm": 1.8410372734069824, "learning_rate": 1.1139359147091559e-05, "loss": 0.1211, "step": 12420 }, { "epoch": 2.173266893959262, "grad_norm": 2.593384265899658, "learning_rate": 1.11159275965087e-05, "loss": 0.1481, "step": 12430 }, { "epoch": 2.175015298540082, "grad_norm": 1.5354266166687012, "learning_rate": 1.109249604592584e-05, "loss": 0.1076, "step": 12440 }, { "epoch": 2.176763703120902, "grad_norm": 2.548050880432129, "learning_rate": 1.106906449534298e-05, "loss": 0.1375, "step": 12450 }, { "epoch": 2.178512107701722, "grad_norm": 2.1876955032348633, "learning_rate": 1.1047976099818407e-05, "loss": 0.1366, "step": 12460 }, { "epoch": 2.1802605122825423, "grad_norm": 2.163553237915039, "learning_rate": 1.1024544549235548e-05, "loss": 0.1371, "step": 12470 }, { "epoch": 2.1820089168633623, "grad_norm": 2.0126430988311768, "learning_rate": 1.1001112998652688e-05, "loss": 0.1324, "step": 12480 }, { "epoch": 2.1837573214441823, "grad_norm": 3.8415536880493164, "learning_rate": 1.0977681448069827e-05, "loss": 0.1586, "step": 12490 }, { "epoch": 2.1855057260250024, "grad_norm": 1.8899825811386108, "learning_rate": 1.0954249897486966e-05, "loss": 0.1255, "step": 12500 }, { "epoch": 2.1872541306058224, "grad_norm": 1.9570482969284058, "learning_rate": 1.0930818346904107e-05, "loss": 0.1243, "step": 12510 }, { "epoch": 2.189002535186642, "grad_norm": 0.8766506910324097, "learning_rate": 1.0907386796321247e-05, "loss": 0.1176, "step": 12520 }, { "epoch": 2.190750939767462, "grad_norm": 0.7959820628166199, "learning_rate": 1.0883955245738386e-05, "loss": 0.1356, "step": 12530 }, { "epoch": 2.192499344348282, "grad_norm": 2.3093817234039307, "learning_rate": 1.0860523695155527e-05, "loss": 0.1562, "step": 12540 }, { "epoch": 2.194247748929102, "grad_norm": 2.737586259841919, "learning_rate": 1.0837092144572667e-05, "loss": 0.1354, "step": 12550 }, { "epoch": 2.195996153509922, "grad_norm": 1.244848370552063, "learning_rate": 1.0813660593989808e-05, "loss": 0.1086, "step": 12560 }, { "epoch": 2.197744558090742, "grad_norm": 1.8399499654769897, "learning_rate": 1.0790229043406949e-05, "loss": 0.1102, "step": 12570 }, { "epoch": 2.1994929626715622, "grad_norm": 1.1957368850708008, "learning_rate": 1.0766797492824087e-05, "loss": 0.1208, "step": 12580 }, { "epoch": 2.2012413672523823, "grad_norm": 3.7022900581359863, "learning_rate": 1.0743365942241228e-05, "loss": 0.141, "step": 12590 }, { "epoch": 2.2029897718332023, "grad_norm": 2.1159870624542236, "learning_rate": 1.0719934391658369e-05, "loss": 0.1192, "step": 12600 }, { "epoch": 2.2047381764140224, "grad_norm": 2.370440721511841, "learning_rate": 1.069650284107551e-05, "loss": 0.1346, "step": 12610 }, { "epoch": 2.2064865809948424, "grad_norm": 1.9298361539840698, "learning_rate": 1.067307129049265e-05, "loss": 0.1471, "step": 12620 }, { "epoch": 2.208234985575662, "grad_norm": 1.9388532638549805, "learning_rate": 1.0649639739909789e-05, "loss": 0.1305, "step": 12630 }, { "epoch": 2.209983390156482, "grad_norm": 1.9433221817016602, "learning_rate": 1.062620818932693e-05, "loss": 0.126, "step": 12640 }, { "epoch": 2.211731794737302, "grad_norm": 2.007972478866577, "learning_rate": 1.060277663874407e-05, "loss": 0.1147, "step": 12650 }, { "epoch": 2.213480199318122, "grad_norm": 1.1948915719985962, "learning_rate": 1.057934508816121e-05, "loss": 0.1036, "step": 12660 }, { "epoch": 2.215228603898942, "grad_norm": 1.7573200464248657, "learning_rate": 1.0555913537578351e-05, "loss": 0.1361, "step": 12670 }, { "epoch": 2.216977008479762, "grad_norm": 1.6941572427749634, "learning_rate": 1.0532481986995492e-05, "loss": 0.1513, "step": 12680 }, { "epoch": 2.2187254130605822, "grad_norm": 2.2214457988739014, "learning_rate": 1.050905043641263e-05, "loss": 0.1638, "step": 12690 }, { "epoch": 2.2204738176414023, "grad_norm": 2.1484594345092773, "learning_rate": 1.0485618885829771e-05, "loss": 0.1191, "step": 12700 }, { "epoch": 2.2222222222222223, "grad_norm": 1.3132258653640747, "learning_rate": 1.0462187335246912e-05, "loss": 0.153, "step": 12710 }, { "epoch": 2.2239706268030424, "grad_norm": 1.7616349458694458, "learning_rate": 1.0438755784664052e-05, "loss": 0.1358, "step": 12720 }, { "epoch": 2.2257190313838624, "grad_norm": 1.7969423532485962, "learning_rate": 1.0415324234081193e-05, "loss": 0.096, "step": 12730 }, { "epoch": 2.227467435964682, "grad_norm": 1.4737247228622437, "learning_rate": 1.039189268349833e-05, "loss": 0.1194, "step": 12740 }, { "epoch": 2.229215840545502, "grad_norm": 1.7292555570602417, "learning_rate": 1.036846113291547e-05, "loss": 0.1314, "step": 12750 }, { "epoch": 2.230964245126322, "grad_norm": 2.5975306034088135, "learning_rate": 1.0345029582332611e-05, "loss": 0.1552, "step": 12760 }, { "epoch": 2.232712649707142, "grad_norm": 1.841124176979065, "learning_rate": 1.0321598031749752e-05, "loss": 0.15, "step": 12770 }, { "epoch": 2.234461054287962, "grad_norm": 2.365156888961792, "learning_rate": 1.029816648116689e-05, "loss": 0.125, "step": 12780 }, { "epoch": 2.236209458868782, "grad_norm": 2.7648537158966064, "learning_rate": 1.0274734930584031e-05, "loss": 0.1316, "step": 12790 }, { "epoch": 2.2379578634496022, "grad_norm": 2.3384718894958496, "learning_rate": 1.0251303380001172e-05, "loss": 0.1273, "step": 12800 }, { "epoch": 2.2397062680304223, "grad_norm": 1.648476004600525, "learning_rate": 1.0227871829418312e-05, "loss": 0.1088, "step": 12810 }, { "epoch": 2.2414546726112423, "grad_norm": 1.8477935791015625, "learning_rate": 1.0204440278835453e-05, "loss": 0.1511, "step": 12820 }, { "epoch": 2.2432030771920624, "grad_norm": 1.7724605798721313, "learning_rate": 1.0181008728252592e-05, "loss": 0.1376, "step": 12830 }, { "epoch": 2.2449514817728824, "grad_norm": 3.3595921993255615, "learning_rate": 1.0157577177669733e-05, "loss": 0.1095, "step": 12840 }, { "epoch": 2.2466998863537024, "grad_norm": 2.554070472717285, "learning_rate": 1.0134145627086873e-05, "loss": 0.1462, "step": 12850 }, { "epoch": 2.2484482909345225, "grad_norm": 2.068704605102539, "learning_rate": 1.0110714076504014e-05, "loss": 0.1277, "step": 12860 }, { "epoch": 2.250196695515342, "grad_norm": 1.4557998180389404, "learning_rate": 1.0087282525921154e-05, "loss": 0.1457, "step": 12870 }, { "epoch": 2.251945100096162, "grad_norm": 1.990447998046875, "learning_rate": 1.0063850975338295e-05, "loss": 0.1192, "step": 12880 }, { "epoch": 2.253693504676982, "grad_norm": 1.5465627908706665, "learning_rate": 1.0040419424755434e-05, "loss": 0.1253, "step": 12890 }, { "epoch": 2.255441909257802, "grad_norm": 2.6916444301605225, "learning_rate": 1.0016987874172574e-05, "loss": 0.1491, "step": 12900 }, { "epoch": 2.2571903138386222, "grad_norm": 1.3198771476745605, "learning_rate": 9.993556323589715e-06, "loss": 0.1174, "step": 12910 }, { "epoch": 2.2589387184194423, "grad_norm": 4.6871256828308105, "learning_rate": 9.970124773006854e-06, "loss": 0.1412, "step": 12920 }, { "epoch": 2.2606871230002623, "grad_norm": 1.9112443923950195, "learning_rate": 9.946693222423994e-06, "loss": 0.1339, "step": 12930 }, { "epoch": 2.2624355275810824, "grad_norm": 2.208272933959961, "learning_rate": 9.923261671841135e-06, "loss": 0.1121, "step": 12940 }, { "epoch": 2.2641839321619024, "grad_norm": 1.507631540298462, "learning_rate": 9.899830121258276e-06, "loss": 0.1426, "step": 12950 }, { "epoch": 2.2659323367427224, "grad_norm": 1.6952465772628784, "learning_rate": 9.876398570675414e-06, "loss": 0.1428, "step": 12960 }, { "epoch": 2.2676807413235425, "grad_norm": 1.4942928552627563, "learning_rate": 9.852967020092555e-06, "loss": 0.1296, "step": 12970 }, { "epoch": 2.269429145904362, "grad_norm": 2.4580042362213135, "learning_rate": 9.829535469509696e-06, "loss": 0.1206, "step": 12980 }, { "epoch": 2.271177550485182, "grad_norm": 2.046311855316162, "learning_rate": 9.806103918926836e-06, "loss": 0.1252, "step": 12990 }, { "epoch": 2.272925955066002, "grad_norm": 1.9215744733810425, "learning_rate": 9.782672368343977e-06, "loss": 0.1239, "step": 13000 }, { "epoch": 2.274674359646822, "grad_norm": 1.7655632495880127, "learning_rate": 9.759240817761116e-06, "loss": 0.1512, "step": 13010 }, { "epoch": 2.2764227642276422, "grad_norm": 1.9617197513580322, "learning_rate": 9.735809267178256e-06, "loss": 0.0958, "step": 13020 }, { "epoch": 2.2781711688084623, "grad_norm": 2.193418502807617, "learning_rate": 9.712377716595397e-06, "loss": 0.1101, "step": 13030 }, { "epoch": 2.2799195733892823, "grad_norm": 1.809605598449707, "learning_rate": 9.688946166012536e-06, "loss": 0.1691, "step": 13040 }, { "epoch": 2.2816679779701023, "grad_norm": 1.4769902229309082, "learning_rate": 9.665514615429676e-06, "loss": 0.1594, "step": 13050 }, { "epoch": 2.2834163825509224, "grad_norm": 3.269402027130127, "learning_rate": 9.642083064846817e-06, "loss": 0.1122, "step": 13060 }, { "epoch": 2.2851647871317424, "grad_norm": 1.951167345046997, "learning_rate": 9.618651514263957e-06, "loss": 0.1327, "step": 13070 }, { "epoch": 2.2869131917125625, "grad_norm": 1.3698724508285522, "learning_rate": 9.595219963681096e-06, "loss": 0.1447, "step": 13080 }, { "epoch": 2.288661596293382, "grad_norm": 2.255122423171997, "learning_rate": 9.571788413098237e-06, "loss": 0.1374, "step": 13090 }, { "epoch": 2.290410000874202, "grad_norm": 2.1527059078216553, "learning_rate": 9.548356862515378e-06, "loss": 0.153, "step": 13100 }, { "epoch": 2.292158405455022, "grad_norm": 4.404884338378906, "learning_rate": 9.524925311932518e-06, "loss": 0.1195, "step": 13110 }, { "epoch": 2.293906810035842, "grad_norm": 1.2485246658325195, "learning_rate": 9.501493761349659e-06, "loss": 0.1142, "step": 13120 }, { "epoch": 2.295655214616662, "grad_norm": 1.492285966873169, "learning_rate": 9.4780622107668e-06, "loss": 0.1213, "step": 13130 }, { "epoch": 2.2974036191974823, "grad_norm": 2.0897228717803955, "learning_rate": 9.454630660183938e-06, "loss": 0.1439, "step": 13140 }, { "epoch": 2.2991520237783023, "grad_norm": 1.5162708759307861, "learning_rate": 9.431199109601079e-06, "loss": 0.1107, "step": 13150 }, { "epoch": 2.3009004283591223, "grad_norm": 2.7167162895202637, "learning_rate": 9.407767559018218e-06, "loss": 0.1276, "step": 13160 }, { "epoch": 2.3026488329399424, "grad_norm": 1.5149487257003784, "learning_rate": 9.384336008435358e-06, "loss": 0.1313, "step": 13170 }, { "epoch": 2.3043972375207624, "grad_norm": 1.6607292890548706, "learning_rate": 9.360904457852499e-06, "loss": 0.1323, "step": 13180 }, { "epoch": 2.3061456421015825, "grad_norm": 2.0336623191833496, "learning_rate": 9.33747290726964e-06, "loss": 0.1125, "step": 13190 }, { "epoch": 2.3078940466824025, "grad_norm": 2.8644120693206787, "learning_rate": 9.31404135668678e-06, "loss": 0.1332, "step": 13200 }, { "epoch": 2.3096424512632225, "grad_norm": 1.8403265476226807, "learning_rate": 9.290609806103919e-06, "loss": 0.1344, "step": 13210 }, { "epoch": 2.311390855844042, "grad_norm": 2.001629114151001, "learning_rate": 9.26717825552106e-06, "loss": 0.1537, "step": 13220 }, { "epoch": 2.313139260424862, "grad_norm": 1.4118083715438843, "learning_rate": 9.2437467049382e-06, "loss": 0.1025, "step": 13230 }, { "epoch": 2.314887665005682, "grad_norm": 1.8369883298873901, "learning_rate": 9.22031515435534e-06, "loss": 0.1232, "step": 13240 }, { "epoch": 2.3166360695865023, "grad_norm": 1.7884759902954102, "learning_rate": 9.196883603772481e-06, "loss": 0.1231, "step": 13250 }, { "epoch": 2.3183844741673223, "grad_norm": 2.710341453552246, "learning_rate": 9.17345205318962e-06, "loss": 0.1242, "step": 13260 }, { "epoch": 2.3201328787481423, "grad_norm": 2.240281105041504, "learning_rate": 9.15002050260676e-06, "loss": 0.1504, "step": 13270 }, { "epoch": 2.3218812833289624, "grad_norm": 1.7557798624038696, "learning_rate": 9.1265889520239e-06, "loss": 0.1403, "step": 13280 }, { "epoch": 2.3236296879097824, "grad_norm": 1.6217379570007324, "learning_rate": 9.10315740144104e-06, "loss": 0.1245, "step": 13290 }, { "epoch": 2.3253780924906025, "grad_norm": 1.895262360572815, "learning_rate": 9.07972585085818e-06, "loss": 0.1342, "step": 13300 }, { "epoch": 2.3271264970714225, "grad_norm": 1.5638673305511475, "learning_rate": 9.056294300275321e-06, "loss": 0.1352, "step": 13310 }, { "epoch": 2.3288749016522425, "grad_norm": 2.0554001331329346, "learning_rate": 9.032862749692462e-06, "loss": 0.1365, "step": 13320 }, { "epoch": 2.330623306233062, "grad_norm": 6.10771369934082, "learning_rate": 9.009431199109603e-06, "loss": 0.1243, "step": 13330 }, { "epoch": 2.332371710813882, "grad_norm": 2.100159168243408, "learning_rate": 8.985999648526741e-06, "loss": 0.1182, "step": 13340 }, { "epoch": 2.334120115394702, "grad_norm": 3.7410552501678467, "learning_rate": 8.962568097943882e-06, "loss": 0.1278, "step": 13350 }, { "epoch": 2.3358685199755223, "grad_norm": 1.9370155334472656, "learning_rate": 8.939136547361023e-06, "loss": 0.1128, "step": 13360 }, { "epoch": 2.3376169245563423, "grad_norm": 1.367945671081543, "learning_rate": 8.915704996778163e-06, "loss": 0.1242, "step": 13370 }, { "epoch": 2.3393653291371623, "grad_norm": 2.3016417026519775, "learning_rate": 8.892273446195304e-06, "loss": 0.1357, "step": 13380 }, { "epoch": 2.3411137337179824, "grad_norm": 1.5979362726211548, "learning_rate": 8.868841895612443e-06, "loss": 0.1251, "step": 13390 }, { "epoch": 2.3428621382988024, "grad_norm": 1.9017846584320068, "learning_rate": 8.845410345029583e-06, "loss": 0.1266, "step": 13400 }, { "epoch": 2.3446105428796225, "grad_norm": 2.5822110176086426, "learning_rate": 8.821978794446722e-06, "loss": 0.1234, "step": 13410 }, { "epoch": 2.3463589474604425, "grad_norm": 2.033761501312256, "learning_rate": 8.798547243863863e-06, "loss": 0.126, "step": 13420 }, { "epoch": 2.3481073520412625, "grad_norm": 1.3017164468765259, "learning_rate": 8.775115693281003e-06, "loss": 0.1296, "step": 13430 }, { "epoch": 2.349855756622082, "grad_norm": 4.223972320556641, "learning_rate": 8.751684142698144e-06, "loss": 0.1409, "step": 13440 }, { "epoch": 2.351604161202902, "grad_norm": 1.4217913150787354, "learning_rate": 8.728252592115284e-06, "loss": 0.1412, "step": 13450 }, { "epoch": 2.353352565783722, "grad_norm": 2.801734685897827, "learning_rate": 8.704821041532423e-06, "loss": 0.1182, "step": 13460 }, { "epoch": 2.3551009703645422, "grad_norm": 2.7878241539001465, "learning_rate": 8.681389490949564e-06, "loss": 0.1563, "step": 13470 }, { "epoch": 2.3568493749453623, "grad_norm": 1.362971544265747, "learning_rate": 8.657957940366705e-06, "loss": 0.1282, "step": 13480 }, { "epoch": 2.3585977795261823, "grad_norm": 1.0237337350845337, "learning_rate": 8.634526389783845e-06, "loss": 0.1142, "step": 13490 }, { "epoch": 2.3603461841070024, "grad_norm": 1.7820348739624023, "learning_rate": 8.611094839200986e-06, "loss": 0.1226, "step": 13500 }, { "epoch": 2.3620945886878224, "grad_norm": 4.237933158874512, "learning_rate": 8.587663288618126e-06, "loss": 0.1334, "step": 13510 }, { "epoch": 2.3638429932686424, "grad_norm": 2.8852193355560303, "learning_rate": 8.564231738035265e-06, "loss": 0.1396, "step": 13520 }, { "epoch": 2.3655913978494625, "grad_norm": 1.597548484802246, "learning_rate": 8.540800187452404e-06, "loss": 0.1273, "step": 13530 }, { "epoch": 2.3673398024302825, "grad_norm": 2.1600868701934814, "learning_rate": 8.517368636869545e-06, "loss": 0.1395, "step": 13540 }, { "epoch": 2.369088207011102, "grad_norm": 2.460857391357422, "learning_rate": 8.493937086286685e-06, "loss": 0.1441, "step": 13550 }, { "epoch": 2.3708366115919226, "grad_norm": 1.3445568084716797, "learning_rate": 8.470505535703826e-06, "loss": 0.1064, "step": 13560 }, { "epoch": 2.372585016172742, "grad_norm": 2.0153238773345947, "learning_rate": 8.447073985120966e-06, "loss": 0.1093, "step": 13570 }, { "epoch": 2.3743334207535622, "grad_norm": 3.338841438293457, "learning_rate": 8.423642434538107e-06, "loss": 0.1395, "step": 13580 }, { "epoch": 2.3760818253343823, "grad_norm": 1.538512110710144, "learning_rate": 8.400210883955246e-06, "loss": 0.1251, "step": 13590 }, { "epoch": 2.3778302299152023, "grad_norm": 1.4861085414886475, "learning_rate": 8.376779333372386e-06, "loss": 0.1409, "step": 13600 }, { "epoch": 2.3795786344960224, "grad_norm": 2.402609348297119, "learning_rate": 8.353347782789527e-06, "loss": 0.1561, "step": 13610 }, { "epoch": 2.3813270390768424, "grad_norm": 2.3510336875915527, "learning_rate": 8.329916232206668e-06, "loss": 0.1369, "step": 13620 }, { "epoch": 2.3830754436576624, "grad_norm": 2.3919291496276855, "learning_rate": 8.306484681623808e-06, "loss": 0.149, "step": 13630 }, { "epoch": 2.3848238482384825, "grad_norm": 2.825187921524048, "learning_rate": 8.283053131040947e-06, "loss": 0.1312, "step": 13640 }, { "epoch": 2.3865722528193025, "grad_norm": 1.6704410314559937, "learning_rate": 8.259621580458088e-06, "loss": 0.146, "step": 13650 }, { "epoch": 2.3883206574001226, "grad_norm": 1.7998132705688477, "learning_rate": 8.236190029875227e-06, "loss": 0.1159, "step": 13660 }, { "epoch": 2.3900690619809426, "grad_norm": 1.5917441844940186, "learning_rate": 8.212758479292367e-06, "loss": 0.1266, "step": 13670 }, { "epoch": 2.391817466561762, "grad_norm": 1.5092450380325317, "learning_rate": 8.189326928709508e-06, "loss": 0.1542, "step": 13680 }, { "epoch": 2.3935658711425822, "grad_norm": 3.4303741455078125, "learning_rate": 8.165895378126648e-06, "loss": 0.1276, "step": 13690 }, { "epoch": 2.3953142757234023, "grad_norm": 1.8061717748641968, "learning_rate": 8.142463827543789e-06, "loss": 0.1133, "step": 13700 }, { "epoch": 2.3970626803042223, "grad_norm": 2.527982711791992, "learning_rate": 8.119032276960928e-06, "loss": 0.1493, "step": 13710 }, { "epoch": 2.3988110848850424, "grad_norm": 1.5481889247894287, "learning_rate": 8.095600726378068e-06, "loss": 0.1252, "step": 13720 }, { "epoch": 2.4005594894658624, "grad_norm": 4.928106784820557, "learning_rate": 8.072169175795209e-06, "loss": 0.1466, "step": 13730 }, { "epoch": 2.4023078940466824, "grad_norm": 1.5288153886795044, "learning_rate": 8.04873762521235e-06, "loss": 0.1263, "step": 13740 }, { "epoch": 2.4040562986275025, "grad_norm": 1.9449552297592163, "learning_rate": 8.02530607462949e-06, "loss": 0.102, "step": 13750 }, { "epoch": 2.4058047032083225, "grad_norm": 2.1797351837158203, "learning_rate": 8.001874524046629e-06, "loss": 0.1107, "step": 13760 }, { "epoch": 2.4075531077891426, "grad_norm": 1.4594932794570923, "learning_rate": 7.97844297346377e-06, "loss": 0.1392, "step": 13770 }, { "epoch": 2.4093015123699626, "grad_norm": 2.8186988830566406, "learning_rate": 7.95501142288091e-06, "loss": 0.1256, "step": 13780 }, { "epoch": 2.411049916950782, "grad_norm": 2.1541221141815186, "learning_rate": 7.931579872298049e-06, "loss": 0.1344, "step": 13790 }, { "epoch": 2.4127983215316022, "grad_norm": 1.6721254587173462, "learning_rate": 7.90814832171519e-06, "loss": 0.1142, "step": 13800 }, { "epoch": 2.4145467261124223, "grad_norm": 1.4992693662643433, "learning_rate": 7.88471677113233e-06, "loss": 0.1053, "step": 13810 }, { "epoch": 2.4162951306932423, "grad_norm": 2.316558361053467, "learning_rate": 7.861285220549471e-06, "loss": 0.1284, "step": 13820 }, { "epoch": 2.4180435352740624, "grad_norm": 1.6623950004577637, "learning_rate": 7.837853669966611e-06, "loss": 0.1349, "step": 13830 }, { "epoch": 2.4197919398548824, "grad_norm": 1.9431278705596924, "learning_rate": 7.81442211938375e-06, "loss": 0.1378, "step": 13840 }, { "epoch": 2.4215403444357024, "grad_norm": 2.124650478363037, "learning_rate": 7.790990568800891e-06, "loss": 0.1406, "step": 13850 }, { "epoch": 2.4232887490165225, "grad_norm": 2.7442266941070557, "learning_rate": 7.767559018218032e-06, "loss": 0.1208, "step": 13860 }, { "epoch": 2.4250371535973425, "grad_norm": 1.9426761865615845, "learning_rate": 7.744127467635172e-06, "loss": 0.1504, "step": 13870 }, { "epoch": 2.4267855581781625, "grad_norm": 1.6392885446548462, "learning_rate": 7.720695917052311e-06, "loss": 0.1389, "step": 13880 }, { "epoch": 2.4285339627589826, "grad_norm": 2.3085715770721436, "learning_rate": 7.697264366469452e-06, "loss": 0.1455, "step": 13890 }, { "epoch": 2.430282367339802, "grad_norm": 1.8429359197616577, "learning_rate": 7.673832815886592e-06, "loss": 0.1173, "step": 13900 }, { "epoch": 2.4320307719206227, "grad_norm": 1.861633062362671, "learning_rate": 7.650401265303731e-06, "loss": 0.1499, "step": 13910 }, { "epoch": 2.4337791765014423, "grad_norm": 1.6089733839035034, "learning_rate": 7.6269697147208725e-06, "loss": 0.1182, "step": 13920 }, { "epoch": 2.4355275810822623, "grad_norm": 3.4693145751953125, "learning_rate": 7.603538164138012e-06, "loss": 0.1621, "step": 13930 }, { "epoch": 2.4372759856630823, "grad_norm": 2.4453048706054688, "learning_rate": 7.580106613555153e-06, "loss": 0.1203, "step": 13940 }, { "epoch": 2.4390243902439024, "grad_norm": 2.6296722888946533, "learning_rate": 7.5566750629722926e-06, "loss": 0.1384, "step": 13950 }, { "epoch": 2.4407727948247224, "grad_norm": 2.1992902755737305, "learning_rate": 7.533243512389433e-06, "loss": 0.1251, "step": 13960 }, { "epoch": 2.4425211994055425, "grad_norm": 2.368910551071167, "learning_rate": 7.509811961806574e-06, "loss": 0.1267, "step": 13970 }, { "epoch": 2.4442696039863625, "grad_norm": 2.3806991577148438, "learning_rate": 7.4863804112237135e-06, "loss": 0.1169, "step": 13980 }, { "epoch": 2.4460180085671825, "grad_norm": 0.9917481541633606, "learning_rate": 7.462948860640854e-06, "loss": 0.1358, "step": 13990 }, { "epoch": 2.4477664131480026, "grad_norm": 1.9190022945404053, "learning_rate": 7.439517310057993e-06, "loss": 0.1495, "step": 14000 }, { "epoch": 2.4495148177288226, "grad_norm": 1.8634378910064697, "learning_rate": 7.4160857594751335e-06, "loss": 0.1033, "step": 14010 }, { "epoch": 2.4512632223096427, "grad_norm": 2.452369451522827, "learning_rate": 7.392654208892274e-06, "loss": 0.1424, "step": 14020 }, { "epoch": 2.4530116268904623, "grad_norm": 1.8152307271957397, "learning_rate": 7.369222658309414e-06, "loss": 0.1383, "step": 14030 }, { "epoch": 2.4547600314712823, "grad_norm": 2.709925651550293, "learning_rate": 7.3457911077265544e-06, "loss": 0.1052, "step": 14040 }, { "epoch": 2.4565084360521023, "grad_norm": 1.8516377210617065, "learning_rate": 7.322359557143694e-06, "loss": 0.0941, "step": 14050 }, { "epoch": 2.4582568406329224, "grad_norm": 1.1404094696044922, "learning_rate": 7.298928006560835e-06, "loss": 0.1253, "step": 14060 }, { "epoch": 2.4600052452137424, "grad_norm": 1.4594693183898926, "learning_rate": 7.275496455977975e-06, "loss": 0.1384, "step": 14070 }, { "epoch": 2.4617536497945625, "grad_norm": 2.0537307262420654, "learning_rate": 7.252064905395115e-06, "loss": 0.12, "step": 14080 }, { "epoch": 2.4635020543753825, "grad_norm": 1.586864709854126, "learning_rate": 7.228633354812256e-06, "loss": 0.135, "step": 14090 }, { "epoch": 2.4652504589562025, "grad_norm": 1.7124476432800293, "learning_rate": 7.205201804229396e-06, "loss": 0.1189, "step": 14100 } ], "logging_steps": 10, "max_steps": 17157, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.839868867932848e+18, "train_batch_size": 12, "trial_name": null, "trial_params": null }