diff --git "a/checkpoint-5000/trainer_state.json" "b/checkpoint-5000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-5000/trainer_state.json" @@ -0,0 +1,35033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.11661569588958826, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.332313917791765e-05, + "grad_norm": 1.6235620975494385, + "learning_rate": 5.182689816014512e-09, + "loss": 1.9275, + "step": 1 + }, + { + "epoch": 4.66462783558353e-05, + "grad_norm": 1.5710082054138184, + "learning_rate": 1.0365379632029025e-08, + "loss": 1.5593, + "step": 2 + }, + { + "epoch": 6.996941753375295e-05, + "grad_norm": 2.3231985569000244, + "learning_rate": 1.5548069448043534e-08, + "loss": 2.0021, + "step": 3 + }, + { + "epoch": 9.32925567116706e-05, + "grad_norm": 1.8349288702011108, + "learning_rate": 2.073075926405805e-08, + "loss": 2.1141, + "step": 4 + }, + { + "epoch": 0.00011661569588958826, + "grad_norm": 2.039928436279297, + "learning_rate": 2.5913449080072562e-08, + "loss": 1.9361, + "step": 5 + }, + { + "epoch": 0.0001399388350675059, + "grad_norm": 1.8988783359527588, + "learning_rate": 3.109613889608707e-08, + "loss": 2.2441, + "step": 6 + }, + { + "epoch": 0.00016326197424542356, + "grad_norm": 1.4865813255310059, + "learning_rate": 3.6278828712101586e-08, + "loss": 1.8118, + "step": 7 + }, + { + "epoch": 0.0001865851134233412, + "grad_norm": 1.4033368825912476, + "learning_rate": 4.14615185281161e-08, + "loss": 1.8838, + "step": 8 + }, + { + "epoch": 0.00020990825260125886, + "grad_norm": 1.876894235610962, + "learning_rate": 4.6644208344130604e-08, + "loss": 1.9916, + "step": 9 + }, + { + "epoch": 0.00023323139177917651, + "grad_norm": 2.4104366302490234, + "learning_rate": 5.1826898160145123e-08, + "loss": 1.8618, + "step": 10 + }, + { + "epoch": 0.0002565545309570942, + "grad_norm": 1.8457229137420654, + "learning_rate": 5.700958797615963e-08, + "loss": 1.7303, + "step": 11 + }, + { + "epoch": 0.0002798776701350118, + "grad_norm": 1.940317988395691, + "learning_rate": 6.219227779217413e-08, + "loss": 2.2692, + "step": 12 + }, + { + "epoch": 0.0003032008093129295, + "grad_norm": 2.455432891845703, + "learning_rate": 6.737496760818865e-08, + "loss": 2.3401, + "step": 13 + }, + { + "epoch": 0.0003265239484908471, + "grad_norm": 1.5163850784301758, + "learning_rate": 7.255765742420317e-08, + "loss": 2.1687, + "step": 14 + }, + { + "epoch": 0.0003498470876687648, + "grad_norm": 1.3012642860412598, + "learning_rate": 7.774034724021768e-08, + "loss": 1.8693, + "step": 15 + }, + { + "epoch": 0.0003731702268466824, + "grad_norm": 2.0896522998809814, + "learning_rate": 8.29230370562322e-08, + "loss": 1.7031, + "step": 16 + }, + { + "epoch": 0.0003964933660246001, + "grad_norm": 1.7818728685379028, + "learning_rate": 8.810572687224672e-08, + "loss": 2.0829, + "step": 17 + }, + { + "epoch": 0.0004198165052025177, + "grad_norm": 2.569828510284424, + "learning_rate": 9.328841668826121e-08, + "loss": 1.8998, + "step": 18 + }, + { + "epoch": 0.0004431396443804354, + "grad_norm": 1.4619100093841553, + "learning_rate": 9.847110650427573e-08, + "loss": 1.5964, + "step": 19 + }, + { + "epoch": 0.00046646278355835303, + "grad_norm": 1.9832793474197388, + "learning_rate": 1.0365379632029025e-07, + "loss": 1.9292, + "step": 20 + }, + { + "epoch": 0.0004897859227362707, + "grad_norm": 2.0182175636291504, + "learning_rate": 1.0883648613630475e-07, + "loss": 2.0115, + "step": 21 + }, + { + "epoch": 0.0005131090619141884, + "grad_norm": 1.4642307758331299, + "learning_rate": 1.1401917595231926e-07, + "loss": 2.0291, + "step": 22 + }, + { + "epoch": 0.000536432201092106, + "grad_norm": 2.887909173965454, + "learning_rate": 1.1920186576833378e-07, + "loss": 2.1946, + "step": 23 + }, + { + "epoch": 0.0005597553402700236, + "grad_norm": 1.595544457435608, + "learning_rate": 1.2438455558434827e-07, + "loss": 2.0246, + "step": 24 + }, + { + "epoch": 0.0005830784794479413, + "grad_norm": 1.5648566484451294, + "learning_rate": 1.295672454003628e-07, + "loss": 2.1832, + "step": 25 + }, + { + "epoch": 0.000606401618625859, + "grad_norm": 1.4702372550964355, + "learning_rate": 1.347499352163773e-07, + "loss": 1.6395, + "step": 26 + }, + { + "epoch": 0.0006297247578037766, + "grad_norm": 1.7178195714950562, + "learning_rate": 1.399326250323918e-07, + "loss": 1.6264, + "step": 27 + }, + { + "epoch": 0.0006530478969816942, + "grad_norm": 2.1751515865325928, + "learning_rate": 1.4511531484840635e-07, + "loss": 2.511, + "step": 28 + }, + { + "epoch": 0.0006763710361596119, + "grad_norm": 2.9443299770355225, + "learning_rate": 1.5029800466442085e-07, + "loss": 2.229, + "step": 29 + }, + { + "epoch": 0.0006996941753375296, + "grad_norm": 1.8316481113433838, + "learning_rate": 1.5548069448043536e-07, + "loss": 1.8414, + "step": 30 + }, + { + "epoch": 0.0007230173145154472, + "grad_norm": 1.9659239053726196, + "learning_rate": 1.6066338429644986e-07, + "loss": 2.0109, + "step": 31 + }, + { + "epoch": 0.0007463404536933648, + "grad_norm": 2.1653449535369873, + "learning_rate": 1.658460741124644e-07, + "loss": 2.0155, + "step": 32 + }, + { + "epoch": 0.0007696635928712825, + "grad_norm": 1.8755710124969482, + "learning_rate": 1.710287639284789e-07, + "loss": 2.1105, + "step": 33 + }, + { + "epoch": 0.0007929867320492002, + "grad_norm": 1.5989196300506592, + "learning_rate": 1.7621145374449343e-07, + "loss": 2.1583, + "step": 34 + }, + { + "epoch": 0.0008163098712271178, + "grad_norm": 1.865307331085205, + "learning_rate": 1.813941435605079e-07, + "loss": 2.001, + "step": 35 + }, + { + "epoch": 0.0008396330104050355, + "grad_norm": 1.4584789276123047, + "learning_rate": 1.8657683337652242e-07, + "loss": 1.8854, + "step": 36 + }, + { + "epoch": 0.0008629561495829531, + "grad_norm": 2.6818912029266357, + "learning_rate": 1.9175952319253695e-07, + "loss": 2.1888, + "step": 37 + }, + { + "epoch": 0.0008862792887608708, + "grad_norm": 2.17561674118042, + "learning_rate": 1.9694221300855146e-07, + "loss": 1.9616, + "step": 38 + }, + { + "epoch": 0.0009096024279387884, + "grad_norm": 1.252475619316101, + "learning_rate": 2.02124902824566e-07, + "loss": 1.9585, + "step": 39 + }, + { + "epoch": 0.0009329255671167061, + "grad_norm": 1.884366750717163, + "learning_rate": 2.073075926405805e-07, + "loss": 2.2436, + "step": 40 + }, + { + "epoch": 0.0009562487062946237, + "grad_norm": 1.4951350688934326, + "learning_rate": 2.1249028245659497e-07, + "loss": 1.7149, + "step": 41 + }, + { + "epoch": 0.0009795718454725414, + "grad_norm": 1.891728162765503, + "learning_rate": 2.176729722726095e-07, + "loss": 2.0472, + "step": 42 + }, + { + "epoch": 0.001002894984650459, + "grad_norm": 1.8992432355880737, + "learning_rate": 2.22855662088624e-07, + "loss": 2.1471, + "step": 43 + }, + { + "epoch": 0.0010262181238283768, + "grad_norm": 1.3931283950805664, + "learning_rate": 2.2803835190463852e-07, + "loss": 1.5292, + "step": 44 + }, + { + "epoch": 0.0010495412630062942, + "grad_norm": 1.8894548416137695, + "learning_rate": 2.3322104172065305e-07, + "loss": 1.7759, + "step": 45 + }, + { + "epoch": 0.001072864402184212, + "grad_norm": 1.592050552368164, + "learning_rate": 2.3840373153666755e-07, + "loss": 2.2498, + "step": 46 + }, + { + "epoch": 0.0010961875413621296, + "grad_norm": 1.3746178150177002, + "learning_rate": 2.4358642135268203e-07, + "loss": 1.8503, + "step": 47 + }, + { + "epoch": 0.0011195106805400473, + "grad_norm": 2.0268595218658447, + "learning_rate": 2.4876911116869654e-07, + "loss": 1.9358, + "step": 48 + }, + { + "epoch": 0.001142833819717965, + "grad_norm": 1.7836228609085083, + "learning_rate": 2.539518009847111e-07, + "loss": 1.9855, + "step": 49 + }, + { + "epoch": 0.0011661569588958826, + "grad_norm": 1.829447627067566, + "learning_rate": 2.591344908007256e-07, + "loss": 2.2802, + "step": 50 + }, + { + "epoch": 0.0011894800980738003, + "grad_norm": 2.2813496589660645, + "learning_rate": 2.643171806167401e-07, + "loss": 2.1593, + "step": 51 + }, + { + "epoch": 0.001212803237251718, + "grad_norm": 3.019044876098633, + "learning_rate": 2.694998704327546e-07, + "loss": 1.9534, + "step": 52 + }, + { + "epoch": 0.0012361263764296354, + "grad_norm": 2.011425256729126, + "learning_rate": 2.746825602487691e-07, + "loss": 2.1284, + "step": 53 + }, + { + "epoch": 0.0012594495156075531, + "grad_norm": 2.207106590270996, + "learning_rate": 2.798652500647836e-07, + "loss": 2.2427, + "step": 54 + }, + { + "epoch": 0.0012827726547854708, + "grad_norm": 1.3172473907470703, + "learning_rate": 2.8504793988079813e-07, + "loss": 1.9782, + "step": 55 + }, + { + "epoch": 0.0013060957939633885, + "grad_norm": 1.522895097732544, + "learning_rate": 2.902306296968127e-07, + "loss": 1.9455, + "step": 56 + }, + { + "epoch": 0.0013294189331413062, + "grad_norm": 2.657248020172119, + "learning_rate": 2.954133195128272e-07, + "loss": 1.959, + "step": 57 + }, + { + "epoch": 0.0013527420723192238, + "grad_norm": 1.9738789796829224, + "learning_rate": 3.005960093288417e-07, + "loss": 1.7878, + "step": 58 + }, + { + "epoch": 0.0013760652114971415, + "grad_norm": 1.5549254417419434, + "learning_rate": 3.057786991448562e-07, + "loss": 1.9405, + "step": 59 + }, + { + "epoch": 0.0013993883506750592, + "grad_norm": 2.9688899517059326, + "learning_rate": 3.109613889608707e-07, + "loss": 1.9969, + "step": 60 + }, + { + "epoch": 0.0014227114898529767, + "grad_norm": 1.4602586030960083, + "learning_rate": 3.1614407877688527e-07, + "loss": 1.9339, + "step": 61 + }, + { + "epoch": 0.0014460346290308943, + "grad_norm": 2.4017045497894287, + "learning_rate": 3.213267685928997e-07, + "loss": 2.0842, + "step": 62 + }, + { + "epoch": 0.001469357768208812, + "grad_norm": 1.7433497905731201, + "learning_rate": 3.2650945840891423e-07, + "loss": 2.0223, + "step": 63 + }, + { + "epoch": 0.0014926809073867297, + "grad_norm": 1.7395591735839844, + "learning_rate": 3.316921482249288e-07, + "loss": 1.9257, + "step": 64 + }, + { + "epoch": 0.0015160040465646474, + "grad_norm": 1.8336257934570312, + "learning_rate": 3.3687483804094324e-07, + "loss": 1.948, + "step": 65 + }, + { + "epoch": 0.001539327185742565, + "grad_norm": 1.6493985652923584, + "learning_rate": 3.420575278569578e-07, + "loss": 1.8672, + "step": 66 + }, + { + "epoch": 0.0015626503249204827, + "grad_norm": 1.5789337158203125, + "learning_rate": 3.472402176729723e-07, + "loss": 1.9446, + "step": 67 + }, + { + "epoch": 0.0015859734640984004, + "grad_norm": 1.3755509853363037, + "learning_rate": 3.5242290748898686e-07, + "loss": 2.1796, + "step": 68 + }, + { + "epoch": 0.001609296603276318, + "grad_norm": 1.7978087663650513, + "learning_rate": 3.576055973050013e-07, + "loss": 1.8974, + "step": 69 + }, + { + "epoch": 0.0016326197424542355, + "grad_norm": 1.8888216018676758, + "learning_rate": 3.627882871210158e-07, + "loss": 1.915, + "step": 70 + }, + { + "epoch": 0.0016559428816321532, + "grad_norm": 2.6150593757629395, + "learning_rate": 3.679709769370304e-07, + "loss": 2.2133, + "step": 71 + }, + { + "epoch": 0.001679266020810071, + "grad_norm": 1.7009005546569824, + "learning_rate": 3.7315366675304483e-07, + "loss": 2.1024, + "step": 72 + }, + { + "epoch": 0.0017025891599879886, + "grad_norm": 1.741734266281128, + "learning_rate": 3.783363565690594e-07, + "loss": 2.1839, + "step": 73 + }, + { + "epoch": 0.0017259122991659063, + "grad_norm": 2.7715041637420654, + "learning_rate": 3.835190463850739e-07, + "loss": 2.0734, + "step": 74 + }, + { + "epoch": 0.001749235438343824, + "grad_norm": 1.9710502624511719, + "learning_rate": 3.8870173620108835e-07, + "loss": 2.18, + "step": 75 + }, + { + "epoch": 0.0017725585775217416, + "grad_norm": 2.077986478805542, + "learning_rate": 3.938844260171029e-07, + "loss": 2.1482, + "step": 76 + }, + { + "epoch": 0.0017958817166996593, + "grad_norm": 2.583721160888672, + "learning_rate": 3.990671158331174e-07, + "loss": 2.5364, + "step": 77 + }, + { + "epoch": 0.0018192048558775768, + "grad_norm": 1.3425930738449097, + "learning_rate": 4.04249805649132e-07, + "loss": 1.8194, + "step": 78 + }, + { + "epoch": 0.0018425279950554944, + "grad_norm": 2.1111888885498047, + "learning_rate": 4.0943249546514643e-07, + "loss": 1.7878, + "step": 79 + }, + { + "epoch": 0.0018658511342334121, + "grad_norm": 2.0795626640319824, + "learning_rate": 4.14615185281161e-07, + "loss": 2.3006, + "step": 80 + }, + { + "epoch": 0.0018891742734113298, + "grad_norm": 1.273370623588562, + "learning_rate": 4.197978750971755e-07, + "loss": 1.7599, + "step": 81 + }, + { + "epoch": 0.0019124974125892475, + "grad_norm": 1.6202706098556519, + "learning_rate": 4.2498056491318994e-07, + "loss": 2.1727, + "step": 82 + }, + { + "epoch": 0.0019358205517671651, + "grad_norm": 2.4593732357025146, + "learning_rate": 4.301632547292045e-07, + "loss": 2.4588, + "step": 83 + }, + { + "epoch": 0.001959143690945083, + "grad_norm": 1.2617835998535156, + "learning_rate": 4.35345944545219e-07, + "loss": 1.9078, + "step": 84 + }, + { + "epoch": 0.0019824668301230003, + "grad_norm": 2.2640504837036133, + "learning_rate": 4.405286343612335e-07, + "loss": 1.8983, + "step": 85 + }, + { + "epoch": 0.002005789969300918, + "grad_norm": 1.6804454326629639, + "learning_rate": 4.45711324177248e-07, + "loss": 2.1049, + "step": 86 + }, + { + "epoch": 0.0020291131084788356, + "grad_norm": 2.060009717941284, + "learning_rate": 4.5089401399326253e-07, + "loss": 2.0153, + "step": 87 + }, + { + "epoch": 0.0020524362476567535, + "grad_norm": 1.7166160345077515, + "learning_rate": 4.5607670380927703e-07, + "loss": 2.1093, + "step": 88 + }, + { + "epoch": 0.002075759386834671, + "grad_norm": 1.6695979833602905, + "learning_rate": 4.6125939362529154e-07, + "loss": 1.8607, + "step": 89 + }, + { + "epoch": 0.0020990825260125885, + "grad_norm": 1.4339056015014648, + "learning_rate": 4.664420834413061e-07, + "loss": 2.2632, + "step": 90 + }, + { + "epoch": 0.0021224056651905064, + "grad_norm": 1.5228222608566284, + "learning_rate": 4.7162477325732055e-07, + "loss": 2.0851, + "step": 91 + }, + { + "epoch": 0.002145728804368424, + "grad_norm": 1.540848731994629, + "learning_rate": 4.768074630733351e-07, + "loss": 2.1446, + "step": 92 + }, + { + "epoch": 0.0021690519435463417, + "grad_norm": 1.480702519416809, + "learning_rate": 4.819901528893496e-07, + "loss": 2.0718, + "step": 93 + }, + { + "epoch": 0.002192375082724259, + "grad_norm": 2.23518705368042, + "learning_rate": 4.871728427053641e-07, + "loss": 1.6198, + "step": 94 + }, + { + "epoch": 0.002215698221902177, + "grad_norm": 1.6477755308151245, + "learning_rate": 4.923555325213786e-07, + "loss": 2.1136, + "step": 95 + }, + { + "epoch": 0.0022390213610800945, + "grad_norm": 1.9548614025115967, + "learning_rate": 4.975382223373931e-07, + "loss": 1.9143, + "step": 96 + }, + { + "epoch": 0.0022623445002580124, + "grad_norm": 1.3557407855987549, + "learning_rate": 5.027209121534076e-07, + "loss": 2.0044, + "step": 97 + }, + { + "epoch": 0.00228566763943593, + "grad_norm": 2.2781455516815186, + "learning_rate": 5.079036019694222e-07, + "loss": 1.7761, + "step": 98 + }, + { + "epoch": 0.0023089907786138474, + "grad_norm": 2.1195600032806396, + "learning_rate": 5.130862917854368e-07, + "loss": 1.8174, + "step": 99 + }, + { + "epoch": 0.0023323139177917653, + "grad_norm": 2.0798068046569824, + "learning_rate": 5.182689816014512e-07, + "loss": 2.1431, + "step": 100 + }, + { + "epoch": 0.0023556370569696827, + "grad_norm": 1.8773006200790405, + "learning_rate": 5.234516714174657e-07, + "loss": 1.5221, + "step": 101 + }, + { + "epoch": 0.0023789601961476006, + "grad_norm": 1.7917876243591309, + "learning_rate": 5.286343612334802e-07, + "loss": 1.9383, + "step": 102 + }, + { + "epoch": 0.002402283335325518, + "grad_norm": 1.4980329275131226, + "learning_rate": 5.338170510494947e-07, + "loss": 1.846, + "step": 103 + }, + { + "epoch": 0.002425606474503436, + "grad_norm": 2.0081095695495605, + "learning_rate": 5.389997408655092e-07, + "loss": 1.8777, + "step": 104 + }, + { + "epoch": 0.0024489296136813534, + "grad_norm": 1.525317907333374, + "learning_rate": 5.441824306815238e-07, + "loss": 1.971, + "step": 105 + }, + { + "epoch": 0.002472252752859271, + "grad_norm": 1.4131786823272705, + "learning_rate": 5.493651204975382e-07, + "loss": 2.2224, + "step": 106 + }, + { + "epoch": 0.002495575892037189, + "grad_norm": 1.164492130279541, + "learning_rate": 5.545478103135528e-07, + "loss": 1.8909, + "step": 107 + }, + { + "epoch": 0.0025188990312151062, + "grad_norm": 1.9998016357421875, + "learning_rate": 5.597305001295673e-07, + "loss": 2.1197, + "step": 108 + }, + { + "epoch": 0.002542222170393024, + "grad_norm": 1.6218236684799194, + "learning_rate": 5.649131899455818e-07, + "loss": 1.7799, + "step": 109 + }, + { + "epoch": 0.0025655453095709416, + "grad_norm": 1.535388708114624, + "learning_rate": 5.700958797615963e-07, + "loss": 1.7878, + "step": 110 + }, + { + "epoch": 0.0025888684487488595, + "grad_norm": 1.4929994344711304, + "learning_rate": 5.752785695776108e-07, + "loss": 2.0802, + "step": 111 + }, + { + "epoch": 0.002612191587926777, + "grad_norm": 2.183293104171753, + "learning_rate": 5.804612593936254e-07, + "loss": 2.0506, + "step": 112 + }, + { + "epoch": 0.002635514727104695, + "grad_norm": 1.6339191198349, + "learning_rate": 5.856439492096398e-07, + "loss": 1.7152, + "step": 113 + }, + { + "epoch": 0.0026588378662826123, + "grad_norm": 1.4886974096298218, + "learning_rate": 5.908266390256544e-07, + "loss": 1.8327, + "step": 114 + }, + { + "epoch": 0.0026821610054605298, + "grad_norm": 1.4198302030563354, + "learning_rate": 5.960093288416688e-07, + "loss": 1.8342, + "step": 115 + }, + { + "epoch": 0.0027054841446384477, + "grad_norm": 2.041900157928467, + "learning_rate": 6.011920186576834e-07, + "loss": 1.9101, + "step": 116 + }, + { + "epoch": 0.002728807283816365, + "grad_norm": 1.7576725482940674, + "learning_rate": 6.063747084736979e-07, + "loss": 2.3793, + "step": 117 + }, + { + "epoch": 0.002752130422994283, + "grad_norm": 1.620440125465393, + "learning_rate": 6.115573982897124e-07, + "loss": 1.7363, + "step": 118 + }, + { + "epoch": 0.0027754535621722005, + "grad_norm": 1.972102403640747, + "learning_rate": 6.16740088105727e-07, + "loss": 2.0338, + "step": 119 + }, + { + "epoch": 0.0027987767013501184, + "grad_norm": 1.5385342836380005, + "learning_rate": 6.219227779217414e-07, + "loss": 1.829, + "step": 120 + }, + { + "epoch": 0.002822099840528036, + "grad_norm": 1.4439769983291626, + "learning_rate": 6.27105467737756e-07, + "loss": 1.9893, + "step": 121 + }, + { + "epoch": 0.0028454229797059533, + "grad_norm": 1.5146026611328125, + "learning_rate": 6.322881575537705e-07, + "loss": 1.6563, + "step": 122 + }, + { + "epoch": 0.002868746118883871, + "grad_norm": 1.7177401781082153, + "learning_rate": 6.374708473697849e-07, + "loss": 1.9483, + "step": 123 + }, + { + "epoch": 0.0028920692580617887, + "grad_norm": 2.484865188598633, + "learning_rate": 6.426535371857994e-07, + "loss": 2.0949, + "step": 124 + }, + { + "epoch": 0.0029153923972397066, + "grad_norm": 1.5320651531219482, + "learning_rate": 6.47836227001814e-07, + "loss": 1.8557, + "step": 125 + }, + { + "epoch": 0.002938715536417624, + "grad_norm": 1.3804417848587036, + "learning_rate": 6.530189168178285e-07, + "loss": 1.8733, + "step": 126 + }, + { + "epoch": 0.002962038675595542, + "grad_norm": 2.0832831859588623, + "learning_rate": 6.58201606633843e-07, + "loss": 1.8556, + "step": 127 + }, + { + "epoch": 0.0029853618147734594, + "grad_norm": 1.2582931518554688, + "learning_rate": 6.633842964498576e-07, + "loss": 2.1239, + "step": 128 + }, + { + "epoch": 0.0030086849539513773, + "grad_norm": 1.6449629068374634, + "learning_rate": 6.685669862658721e-07, + "loss": 2.1635, + "step": 129 + }, + { + "epoch": 0.0030320080931292947, + "grad_norm": 1.3350502252578735, + "learning_rate": 6.737496760818865e-07, + "loss": 1.801, + "step": 130 + }, + { + "epoch": 0.003055331232307212, + "grad_norm": 1.7689651250839233, + "learning_rate": 6.78932365897901e-07, + "loss": 1.7541, + "step": 131 + }, + { + "epoch": 0.00307865437148513, + "grad_norm": 1.4711276292800903, + "learning_rate": 6.841150557139156e-07, + "loss": 2.3916, + "step": 132 + }, + { + "epoch": 0.0031019775106630476, + "grad_norm": 1.2806516885757446, + "learning_rate": 6.892977455299301e-07, + "loss": 1.8609, + "step": 133 + }, + { + "epoch": 0.0031253006498409655, + "grad_norm": 1.5531939268112183, + "learning_rate": 6.944804353459446e-07, + "loss": 1.7721, + "step": 134 + }, + { + "epoch": 0.003148623789018883, + "grad_norm": 1.6541032791137695, + "learning_rate": 6.996631251619592e-07, + "loss": 2.1091, + "step": 135 + }, + { + "epoch": 0.003171946928196801, + "grad_norm": 2.050734281539917, + "learning_rate": 7.048458149779737e-07, + "loss": 1.8932, + "step": 136 + }, + { + "epoch": 0.0031952700673747183, + "grad_norm": 1.2903157472610474, + "learning_rate": 7.100285047939881e-07, + "loss": 2.0833, + "step": 137 + }, + { + "epoch": 0.003218593206552636, + "grad_norm": 1.3316091299057007, + "learning_rate": 7.152111946100026e-07, + "loss": 1.9307, + "step": 138 + }, + { + "epoch": 0.0032419163457305536, + "grad_norm": 1.441341519355774, + "learning_rate": 7.203938844260172e-07, + "loss": 2.2529, + "step": 139 + }, + { + "epoch": 0.003265239484908471, + "grad_norm": 2.159276008605957, + "learning_rate": 7.255765742420316e-07, + "loss": 1.847, + "step": 140 + }, + { + "epoch": 0.003288562624086389, + "grad_norm": 1.8410853147506714, + "learning_rate": 7.307592640580462e-07, + "loss": 2.2465, + "step": 141 + }, + { + "epoch": 0.0033118857632643064, + "grad_norm": 1.8678739070892334, + "learning_rate": 7.359419538740608e-07, + "loss": 1.9261, + "step": 142 + }, + { + "epoch": 0.0033352089024422243, + "grad_norm": 1.2097922563552856, + "learning_rate": 7.411246436900751e-07, + "loss": 2.0205, + "step": 143 + }, + { + "epoch": 0.003358532041620142, + "grad_norm": 1.733077883720398, + "learning_rate": 7.463073335060897e-07, + "loss": 1.8389, + "step": 144 + }, + { + "epoch": 0.0033818551807980597, + "grad_norm": 1.7118474245071411, + "learning_rate": 7.514900233221042e-07, + "loss": 1.9511, + "step": 145 + }, + { + "epoch": 0.003405178319975977, + "grad_norm": 1.6960872411727905, + "learning_rate": 7.566727131381188e-07, + "loss": 1.8828, + "step": 146 + }, + { + "epoch": 0.0034285014591538946, + "grad_norm": 1.2409390211105347, + "learning_rate": 7.618554029541332e-07, + "loss": 1.6878, + "step": 147 + }, + { + "epoch": 0.0034518245983318125, + "grad_norm": 1.3440965414047241, + "learning_rate": 7.670380927701478e-07, + "loss": 1.64, + "step": 148 + }, + { + "epoch": 0.00347514773750973, + "grad_norm": 1.539393663406372, + "learning_rate": 7.722207825861624e-07, + "loss": 1.6754, + "step": 149 + }, + { + "epoch": 0.003498470876687648, + "grad_norm": 1.5395653247833252, + "learning_rate": 7.774034724021767e-07, + "loss": 1.9761, + "step": 150 + }, + { + "epoch": 0.0035217940158655653, + "grad_norm": 2.0169472694396973, + "learning_rate": 7.825861622181913e-07, + "loss": 1.6927, + "step": 151 + }, + { + "epoch": 0.0035451171550434832, + "grad_norm": 1.8776079416275024, + "learning_rate": 7.877688520342058e-07, + "loss": 1.9273, + "step": 152 + }, + { + "epoch": 0.0035684402942214007, + "grad_norm": 2.078824043273926, + "learning_rate": 7.929515418502204e-07, + "loss": 1.6756, + "step": 153 + }, + { + "epoch": 0.0035917634333993186, + "grad_norm": 1.407560110092163, + "learning_rate": 7.981342316662348e-07, + "loss": 1.6038, + "step": 154 + }, + { + "epoch": 0.003615086572577236, + "grad_norm": 1.1770573854446411, + "learning_rate": 8.033169214822494e-07, + "loss": 1.6679, + "step": 155 + }, + { + "epoch": 0.0036384097117551535, + "grad_norm": 1.2057602405548096, + "learning_rate": 8.08499611298264e-07, + "loss": 1.7916, + "step": 156 + }, + { + "epoch": 0.0036617328509330714, + "grad_norm": 1.117970585823059, + "learning_rate": 8.136823011142783e-07, + "loss": 1.7974, + "step": 157 + }, + { + "epoch": 0.003685055990110989, + "grad_norm": 1.5996465682983398, + "learning_rate": 8.188649909302929e-07, + "loss": 1.6053, + "step": 158 + }, + { + "epoch": 0.0037083791292889068, + "grad_norm": 1.4170929193496704, + "learning_rate": 8.240476807463074e-07, + "loss": 1.7155, + "step": 159 + }, + { + "epoch": 0.0037317022684668242, + "grad_norm": 1.8114391565322876, + "learning_rate": 8.29230370562322e-07, + "loss": 1.9192, + "step": 160 + }, + { + "epoch": 0.003755025407644742, + "grad_norm": 1.3462793827056885, + "learning_rate": 8.344130603783364e-07, + "loss": 1.4624, + "step": 161 + }, + { + "epoch": 0.0037783485468226596, + "grad_norm": 1.6305956840515137, + "learning_rate": 8.39595750194351e-07, + "loss": 1.8017, + "step": 162 + }, + { + "epoch": 0.003801671686000577, + "grad_norm": 1.662576675415039, + "learning_rate": 8.447784400103655e-07, + "loss": 1.733, + "step": 163 + }, + { + "epoch": 0.003824994825178495, + "grad_norm": 1.556788682937622, + "learning_rate": 8.499611298263799e-07, + "loss": 1.9586, + "step": 164 + }, + { + "epoch": 0.0038483179643564124, + "grad_norm": 1.5282272100448608, + "learning_rate": 8.551438196423944e-07, + "loss": 1.8254, + "step": 165 + }, + { + "epoch": 0.0038716411035343303, + "grad_norm": 1.6790592670440674, + "learning_rate": 8.60326509458409e-07, + "loss": 2.1866, + "step": 166 + }, + { + "epoch": 0.0038949642427122478, + "grad_norm": 1.5164263248443604, + "learning_rate": 8.655091992744236e-07, + "loss": 1.6651, + "step": 167 + }, + { + "epoch": 0.003918287381890166, + "grad_norm": 1.5002336502075195, + "learning_rate": 8.70691889090438e-07, + "loss": 1.9295, + "step": 168 + }, + { + "epoch": 0.0039416105210680836, + "grad_norm": 1.2122441530227661, + "learning_rate": 8.758745789064526e-07, + "loss": 1.761, + "step": 169 + }, + { + "epoch": 0.003964933660246001, + "grad_norm": 1.637898564338684, + "learning_rate": 8.81057268722467e-07, + "loss": 1.8697, + "step": 170 + }, + { + "epoch": 0.0039882567994239185, + "grad_norm": 0.988777220249176, + "learning_rate": 8.862399585384815e-07, + "loss": 2.1249, + "step": 171 + }, + { + "epoch": 0.004011579938601836, + "grad_norm": 1.8833587169647217, + "learning_rate": 8.91422648354496e-07, + "loss": 1.6915, + "step": 172 + }, + { + "epoch": 0.004034903077779753, + "grad_norm": 1.8418108224868774, + "learning_rate": 8.966053381705106e-07, + "loss": 2.0019, + "step": 173 + }, + { + "epoch": 0.004058226216957671, + "grad_norm": 1.6375901699066162, + "learning_rate": 9.017880279865251e-07, + "loss": 1.7625, + "step": 174 + }, + { + "epoch": 0.004081549356135589, + "grad_norm": 1.8701720237731934, + "learning_rate": 9.069707178025396e-07, + "loss": 1.801, + "step": 175 + }, + { + "epoch": 0.004104872495313507, + "grad_norm": 1.4488773345947266, + "learning_rate": 9.121534076185541e-07, + "loss": 1.9971, + "step": 176 + }, + { + "epoch": 0.004128195634491424, + "grad_norm": 0.9587986469268799, + "learning_rate": 9.173360974345686e-07, + "loss": 1.6253, + "step": 177 + }, + { + "epoch": 0.004151518773669342, + "grad_norm": 2.6533186435699463, + "learning_rate": 9.225187872505831e-07, + "loss": 1.572, + "step": 178 + }, + { + "epoch": 0.00417484191284726, + "grad_norm": 2.4528841972351074, + "learning_rate": 9.277014770665976e-07, + "loss": 1.7586, + "step": 179 + }, + { + "epoch": 0.004198165052025177, + "grad_norm": 1.1871824264526367, + "learning_rate": 9.328841668826122e-07, + "loss": 1.6765, + "step": 180 + }, + { + "epoch": 0.004221488191203095, + "grad_norm": 1.1292660236358643, + "learning_rate": 9.380668566986266e-07, + "loss": 2.0673, + "step": 181 + }, + { + "epoch": 0.004244811330381013, + "grad_norm": 1.3055285215377808, + "learning_rate": 9.432495465146411e-07, + "loss": 1.8103, + "step": 182 + }, + { + "epoch": 0.004268134469558931, + "grad_norm": 1.5225868225097656, + "learning_rate": 9.484322363306557e-07, + "loss": 2.0813, + "step": 183 + }, + { + "epoch": 0.004291457608736848, + "grad_norm": 1.2439767122268677, + "learning_rate": 9.536149261466702e-07, + "loss": 1.6919, + "step": 184 + }, + { + "epoch": 0.0043147807479147655, + "grad_norm": 1.2424002885818481, + "learning_rate": 9.587976159626847e-07, + "loss": 1.9506, + "step": 185 + }, + { + "epoch": 0.0043381038870926834, + "grad_norm": 0.9796323776245117, + "learning_rate": 9.639803057786992e-07, + "loss": 1.7342, + "step": 186 + }, + { + "epoch": 0.0043614270262706005, + "grad_norm": 1.2240192890167236, + "learning_rate": 9.691629955947138e-07, + "loss": 2.0646, + "step": 187 + }, + { + "epoch": 0.004384750165448518, + "grad_norm": 0.8779449462890625, + "learning_rate": 9.743456854107281e-07, + "loss": 1.4535, + "step": 188 + }, + { + "epoch": 0.004408073304626436, + "grad_norm": 1.3131407499313354, + "learning_rate": 9.795283752267427e-07, + "loss": 1.9817, + "step": 189 + }, + { + "epoch": 0.004431396443804354, + "grad_norm": 1.3259912729263306, + "learning_rate": 9.847110650427573e-07, + "loss": 1.709, + "step": 190 + }, + { + "epoch": 0.004454719582982271, + "grad_norm": 1.4236465692520142, + "learning_rate": 9.898937548587718e-07, + "loss": 1.7059, + "step": 191 + }, + { + "epoch": 0.004478042722160189, + "grad_norm": 1.2791959047317505, + "learning_rate": 9.950764446747862e-07, + "loss": 1.9633, + "step": 192 + }, + { + "epoch": 0.004501365861338107, + "grad_norm": 0.9857053160667419, + "learning_rate": 1.0002591344908007e-06, + "loss": 1.807, + "step": 193 + }, + { + "epoch": 0.004524689000516025, + "grad_norm": 1.264302372932434, + "learning_rate": 1.0054418243068153e-06, + "loss": 1.5389, + "step": 194 + }, + { + "epoch": 0.004548012139693942, + "grad_norm": 1.2205390930175781, + "learning_rate": 1.0106245141228298e-06, + "loss": 1.4549, + "step": 195 + }, + { + "epoch": 0.00457133527887186, + "grad_norm": 1.055471420288086, + "learning_rate": 1.0158072039388444e-06, + "loss": 1.6931, + "step": 196 + }, + { + "epoch": 0.004594658418049778, + "grad_norm": 1.0585546493530273, + "learning_rate": 1.020989893754859e-06, + "loss": 1.8054, + "step": 197 + }, + { + "epoch": 0.004617981557227695, + "grad_norm": 2.16025972366333, + "learning_rate": 1.0261725835708735e-06, + "loss": 2.0077, + "step": 198 + }, + { + "epoch": 0.004641304696405613, + "grad_norm": 2.125786781311035, + "learning_rate": 1.0313552733868879e-06, + "loss": 1.9117, + "step": 199 + }, + { + "epoch": 0.0046646278355835305, + "grad_norm": 1.3560391664505005, + "learning_rate": 1.0365379632029024e-06, + "loss": 1.9871, + "step": 200 + }, + { + "epoch": 0.004687950974761448, + "grad_norm": 1.3505181074142456, + "learning_rate": 1.041720653018917e-06, + "loss": 1.714, + "step": 201 + }, + { + "epoch": 0.004711274113939365, + "grad_norm": 1.1724427938461304, + "learning_rate": 1.0469033428349313e-06, + "loss": 1.7611, + "step": 202 + }, + { + "epoch": 0.004734597253117283, + "grad_norm": 1.1746799945831299, + "learning_rate": 1.0520860326509459e-06, + "loss": 1.867, + "step": 203 + }, + { + "epoch": 0.004757920392295201, + "grad_norm": 1.0976382493972778, + "learning_rate": 1.0572687224669604e-06, + "loss": 1.808, + "step": 204 + }, + { + "epoch": 0.004781243531473118, + "grad_norm": 1.3842298984527588, + "learning_rate": 1.062451412282975e-06, + "loss": 1.7973, + "step": 205 + }, + { + "epoch": 0.004804566670651036, + "grad_norm": 1.6715288162231445, + "learning_rate": 1.0676341020989893e-06, + "loss": 1.9817, + "step": 206 + }, + { + "epoch": 0.004827889809828954, + "grad_norm": 1.0734590291976929, + "learning_rate": 1.072816791915004e-06, + "loss": 1.4297, + "step": 207 + }, + { + "epoch": 0.004851212949006872, + "grad_norm": 1.0182546377182007, + "learning_rate": 1.0779994817310185e-06, + "loss": 1.713, + "step": 208 + }, + { + "epoch": 0.004874536088184789, + "grad_norm": 1.1884313821792603, + "learning_rate": 1.083182171547033e-06, + "loss": 1.5234, + "step": 209 + }, + { + "epoch": 0.004897859227362707, + "grad_norm": 1.520266056060791, + "learning_rate": 1.0883648613630476e-06, + "loss": 2.0598, + "step": 210 + }, + { + "epoch": 0.004921182366540625, + "grad_norm": 1.1709904670715332, + "learning_rate": 1.0935475511790621e-06, + "loss": 2.1461, + "step": 211 + }, + { + "epoch": 0.004944505505718542, + "grad_norm": 1.2634027004241943, + "learning_rate": 1.0987302409950765e-06, + "loss": 1.5076, + "step": 212 + }, + { + "epoch": 0.00496782864489646, + "grad_norm": 1.490717887878418, + "learning_rate": 1.103912930811091e-06, + "loss": 1.8628, + "step": 213 + }, + { + "epoch": 0.004991151784074378, + "grad_norm": 2.077373743057251, + "learning_rate": 1.1090956206271056e-06, + "loss": 1.9295, + "step": 214 + }, + { + "epoch": 0.0050144749232522955, + "grad_norm": 1.647877812385559, + "learning_rate": 1.1142783104431202e-06, + "loss": 1.7929, + "step": 215 + }, + { + "epoch": 0.0050377980624302125, + "grad_norm": 1.1937353610992432, + "learning_rate": 1.1194610002591345e-06, + "loss": 1.6509, + "step": 216 + }, + { + "epoch": 0.00506112120160813, + "grad_norm": 1.0805108547210693, + "learning_rate": 1.124643690075149e-06, + "loss": 1.6447, + "step": 217 + }, + { + "epoch": 0.005084444340786048, + "grad_norm": 1.1077872514724731, + "learning_rate": 1.1298263798911636e-06, + "loss": 1.7675, + "step": 218 + }, + { + "epoch": 0.005107767479963966, + "grad_norm": 0.8648241758346558, + "learning_rate": 1.135009069707178e-06, + "loss": 1.6687, + "step": 219 + }, + { + "epoch": 0.005131090619141883, + "grad_norm": 1.0522700548171997, + "learning_rate": 1.1401917595231925e-06, + "loss": 1.2878, + "step": 220 + }, + { + "epoch": 0.005154413758319801, + "grad_norm": 1.3021256923675537, + "learning_rate": 1.145374449339207e-06, + "loss": 1.8535, + "step": 221 + }, + { + "epoch": 0.005177736897497719, + "grad_norm": 1.2912962436676025, + "learning_rate": 1.1505571391552216e-06, + "loss": 1.865, + "step": 222 + }, + { + "epoch": 0.005201060036675636, + "grad_norm": 1.6733994483947754, + "learning_rate": 1.1557398289712362e-06, + "loss": 1.5748, + "step": 223 + }, + { + "epoch": 0.005224383175853554, + "grad_norm": 1.0865724086761475, + "learning_rate": 1.1609225187872508e-06, + "loss": 1.8159, + "step": 224 + }, + { + "epoch": 0.005247706315031472, + "grad_norm": 1.1498301029205322, + "learning_rate": 1.1661052086032653e-06, + "loss": 1.8579, + "step": 225 + }, + { + "epoch": 0.00527102945420939, + "grad_norm": 1.9360573291778564, + "learning_rate": 1.1712878984192797e-06, + "loss": 1.7366, + "step": 226 + }, + { + "epoch": 0.005294352593387307, + "grad_norm": 1.0133939981460571, + "learning_rate": 1.1764705882352942e-06, + "loss": 1.4571, + "step": 227 + }, + { + "epoch": 0.005317675732565225, + "grad_norm": 1.6443811655044556, + "learning_rate": 1.1816532780513088e-06, + "loss": 1.5312, + "step": 228 + }, + { + "epoch": 0.0053409988717431425, + "grad_norm": 1.1923338174819946, + "learning_rate": 1.1868359678673233e-06, + "loss": 1.6993, + "step": 229 + }, + { + "epoch": 0.0053643220109210596, + "grad_norm": 1.0345349311828613, + "learning_rate": 1.1920186576833377e-06, + "loss": 1.5739, + "step": 230 + }, + { + "epoch": 0.0053876451500989775, + "grad_norm": 0.9833806753158569, + "learning_rate": 1.1972013474993522e-06, + "loss": 1.819, + "step": 231 + }, + { + "epoch": 0.005410968289276895, + "grad_norm": 1.3315545320510864, + "learning_rate": 1.2023840373153668e-06, + "loss": 1.9472, + "step": 232 + }, + { + "epoch": 0.005434291428454813, + "grad_norm": 1.0042314529418945, + "learning_rate": 1.2075667271313812e-06, + "loss": 1.993, + "step": 233 + }, + { + "epoch": 0.00545761456763273, + "grad_norm": 1.2731118202209473, + "learning_rate": 1.2127494169473957e-06, + "loss": 1.6763, + "step": 234 + }, + { + "epoch": 0.005480937706810648, + "grad_norm": 0.9664155840873718, + "learning_rate": 1.2179321067634103e-06, + "loss": 1.3091, + "step": 235 + }, + { + "epoch": 0.005504260845988566, + "grad_norm": 1.6930897235870361, + "learning_rate": 1.2231147965794248e-06, + "loss": 1.6111, + "step": 236 + }, + { + "epoch": 0.005527583985166483, + "grad_norm": 0.9807016253471375, + "learning_rate": 1.2282974863954394e-06, + "loss": 1.6131, + "step": 237 + }, + { + "epoch": 0.005550907124344401, + "grad_norm": 1.321951150894165, + "learning_rate": 1.233480176211454e-06, + "loss": 1.242, + "step": 238 + }, + { + "epoch": 0.005574230263522319, + "grad_norm": 1.1465637683868408, + "learning_rate": 1.2386628660274685e-06, + "loss": 1.7035, + "step": 239 + }, + { + "epoch": 0.005597553402700237, + "grad_norm": 2.4264347553253174, + "learning_rate": 1.2438455558434829e-06, + "loss": 1.9859, + "step": 240 + }, + { + "epoch": 0.005620876541878154, + "grad_norm": 1.429149866104126, + "learning_rate": 1.2490282456594974e-06, + "loss": 1.8249, + "step": 241 + }, + { + "epoch": 0.005644199681056072, + "grad_norm": 1.1119049787521362, + "learning_rate": 1.254210935475512e-06, + "loss": 1.8005, + "step": 242 + }, + { + "epoch": 0.00566752282023399, + "grad_norm": 1.9002227783203125, + "learning_rate": 1.2593936252915265e-06, + "loss": 1.6951, + "step": 243 + }, + { + "epoch": 0.005690845959411907, + "grad_norm": 1.067659854888916, + "learning_rate": 1.264576315107541e-06, + "loss": 1.799, + "step": 244 + }, + { + "epoch": 0.0057141690985898245, + "grad_norm": 1.2947990894317627, + "learning_rate": 1.2697590049235552e-06, + "loss": 1.7837, + "step": 245 + }, + { + "epoch": 0.005737492237767742, + "grad_norm": 1.0790272951126099, + "learning_rate": 1.2749416947395698e-06, + "loss": 1.67, + "step": 246 + }, + { + "epoch": 0.00576081537694566, + "grad_norm": 1.3589330911636353, + "learning_rate": 1.2801243845555843e-06, + "loss": 1.9282, + "step": 247 + }, + { + "epoch": 0.005784138516123577, + "grad_norm": 1.4140998125076294, + "learning_rate": 1.285307074371599e-06, + "loss": 1.6708, + "step": 248 + }, + { + "epoch": 0.005807461655301495, + "grad_norm": 1.000994086265564, + "learning_rate": 1.2904897641876135e-06, + "loss": 1.4077, + "step": 249 + }, + { + "epoch": 0.005830784794479413, + "grad_norm": 1.3655062913894653, + "learning_rate": 1.295672454003628e-06, + "loss": 1.8862, + "step": 250 + }, + { + "epoch": 0.005854107933657331, + "grad_norm": 1.1164065599441528, + "learning_rate": 1.3008551438196426e-06, + "loss": 1.528, + "step": 251 + }, + { + "epoch": 0.005877431072835248, + "grad_norm": 1.1792149543762207, + "learning_rate": 1.306037833635657e-06, + "loss": 1.2879, + "step": 252 + }, + { + "epoch": 0.005900754212013166, + "grad_norm": 2.236320734024048, + "learning_rate": 1.3112205234516715e-06, + "loss": 1.4929, + "step": 253 + }, + { + "epoch": 0.005924077351191084, + "grad_norm": 1.8795088529586792, + "learning_rate": 1.316403213267686e-06, + "loss": 1.2468, + "step": 254 + }, + { + "epoch": 0.005947400490369001, + "grad_norm": 1.2248806953430176, + "learning_rate": 1.3215859030837006e-06, + "loss": 1.769, + "step": 255 + }, + { + "epoch": 0.005970723629546919, + "grad_norm": 1.252236008644104, + "learning_rate": 1.3267685928997152e-06, + "loss": 1.9014, + "step": 256 + }, + { + "epoch": 0.005994046768724837, + "grad_norm": 1.3926386833190918, + "learning_rate": 1.3319512827157297e-06, + "loss": 1.9599, + "step": 257 + }, + { + "epoch": 0.0060173699079027546, + "grad_norm": 1.5681990385055542, + "learning_rate": 1.3371339725317443e-06, + "loss": 1.8109, + "step": 258 + }, + { + "epoch": 0.006040693047080672, + "grad_norm": 1.6841275691986084, + "learning_rate": 1.3423166623477584e-06, + "loss": 1.4601, + "step": 259 + }, + { + "epoch": 0.0060640161862585895, + "grad_norm": 1.5262291431427002, + "learning_rate": 1.347499352163773e-06, + "loss": 1.6493, + "step": 260 + }, + { + "epoch": 0.006087339325436507, + "grad_norm": 1.0905576944351196, + "learning_rate": 1.3526820419797875e-06, + "loss": 2.0847, + "step": 261 + }, + { + "epoch": 0.006110662464614424, + "grad_norm": 1.4682683944702148, + "learning_rate": 1.357864731795802e-06, + "loss": 1.6889, + "step": 262 + }, + { + "epoch": 0.006133985603792342, + "grad_norm": 1.1054515838623047, + "learning_rate": 1.3630474216118166e-06, + "loss": 1.55, + "step": 263 + }, + { + "epoch": 0.00615730874297026, + "grad_norm": 1.3931388854980469, + "learning_rate": 1.3682301114278312e-06, + "loss": 1.655, + "step": 264 + }, + { + "epoch": 0.006180631882148178, + "grad_norm": 1.1766420602798462, + "learning_rate": 1.3734128012438458e-06, + "loss": 1.9555, + "step": 265 + }, + { + "epoch": 0.006203955021326095, + "grad_norm": 1.1652954816818237, + "learning_rate": 1.3785954910598601e-06, + "loss": 1.8446, + "step": 266 + }, + { + "epoch": 0.006227278160504013, + "grad_norm": 1.378980278968811, + "learning_rate": 1.3837781808758747e-06, + "loss": 1.4449, + "step": 267 + }, + { + "epoch": 0.006250601299681931, + "grad_norm": 1.2017453908920288, + "learning_rate": 1.3889608706918892e-06, + "loss": 1.6272, + "step": 268 + }, + { + "epoch": 0.006273924438859848, + "grad_norm": 1.2221115827560425, + "learning_rate": 1.3941435605079038e-06, + "loss": 1.7299, + "step": 269 + }, + { + "epoch": 0.006297247578037766, + "grad_norm": 1.189775824546814, + "learning_rate": 1.3993262503239183e-06, + "loss": 1.1664, + "step": 270 + }, + { + "epoch": 0.006320570717215684, + "grad_norm": 1.0103381872177124, + "learning_rate": 1.404508940139933e-06, + "loss": 1.3519, + "step": 271 + }, + { + "epoch": 0.006343893856393602, + "grad_norm": 1.1243481636047363, + "learning_rate": 1.4096916299559475e-06, + "loss": 1.6704, + "step": 272 + }, + { + "epoch": 0.006367216995571519, + "grad_norm": 1.8137811422348022, + "learning_rate": 1.4148743197719616e-06, + "loss": 1.279, + "step": 273 + }, + { + "epoch": 0.0063905401347494365, + "grad_norm": 1.0875202417373657, + "learning_rate": 1.4200570095879762e-06, + "loss": 1.1564, + "step": 274 + }, + { + "epoch": 0.0064138632739273544, + "grad_norm": 1.0839550495147705, + "learning_rate": 1.4252396994039907e-06, + "loss": 1.7263, + "step": 275 + }, + { + "epoch": 0.006437186413105272, + "grad_norm": 1.7203173637390137, + "learning_rate": 1.4304223892200053e-06, + "loss": 1.9309, + "step": 276 + }, + { + "epoch": 0.006460509552283189, + "grad_norm": 1.3320658206939697, + "learning_rate": 1.4356050790360198e-06, + "loss": 1.8276, + "step": 277 + }, + { + "epoch": 0.006483832691461107, + "grad_norm": 1.5260910987854004, + "learning_rate": 1.4407877688520344e-06, + "loss": 1.413, + "step": 278 + }, + { + "epoch": 0.006507155830639025, + "grad_norm": 1.2401058673858643, + "learning_rate": 1.445970458668049e-06, + "loss": 1.4087, + "step": 279 + }, + { + "epoch": 0.006530478969816942, + "grad_norm": 1.2722922563552856, + "learning_rate": 1.4511531484840633e-06, + "loss": 1.6216, + "step": 280 + }, + { + "epoch": 0.00655380210899486, + "grad_norm": 1.2668229341506958, + "learning_rate": 1.4563358383000779e-06, + "loss": 1.6252, + "step": 281 + }, + { + "epoch": 0.006577125248172778, + "grad_norm": 1.4556583166122437, + "learning_rate": 1.4615185281160924e-06, + "loss": 2.3276, + "step": 282 + }, + { + "epoch": 0.006600448387350696, + "grad_norm": 1.537610411643982, + "learning_rate": 1.466701217932107e-06, + "loss": 1.4319, + "step": 283 + }, + { + "epoch": 0.006623771526528613, + "grad_norm": 1.3130170106887817, + "learning_rate": 1.4718839077481215e-06, + "loss": 1.4978, + "step": 284 + }, + { + "epoch": 0.006647094665706531, + "grad_norm": 1.5020934343338013, + "learning_rate": 1.477066597564136e-06, + "loss": 1.8697, + "step": 285 + }, + { + "epoch": 0.006670417804884449, + "grad_norm": 1.6949779987335205, + "learning_rate": 1.4822492873801502e-06, + "loss": 1.7433, + "step": 286 + }, + { + "epoch": 0.006693740944062366, + "grad_norm": 1.5566325187683105, + "learning_rate": 1.4874319771961648e-06, + "loss": 1.5674, + "step": 287 + }, + { + "epoch": 0.006717064083240284, + "grad_norm": 1.015093445777893, + "learning_rate": 1.4926146670121793e-06, + "loss": 1.9903, + "step": 288 + }, + { + "epoch": 0.0067403872224182015, + "grad_norm": 2.229853868484497, + "learning_rate": 1.497797356828194e-06, + "loss": 1.1905, + "step": 289 + }, + { + "epoch": 0.006763710361596119, + "grad_norm": 1.5241860151290894, + "learning_rate": 1.5029800466442085e-06, + "loss": 1.958, + "step": 290 + }, + { + "epoch": 0.006787033500774036, + "grad_norm": 0.8666454553604126, + "learning_rate": 1.508162736460223e-06, + "loss": 1.7141, + "step": 291 + }, + { + "epoch": 0.006810356639951954, + "grad_norm": 1.4594520330429077, + "learning_rate": 1.5133454262762376e-06, + "loss": 1.7235, + "step": 292 + }, + { + "epoch": 0.006833679779129872, + "grad_norm": 1.3267074823379517, + "learning_rate": 1.518528116092252e-06, + "loss": 1.6172, + "step": 293 + }, + { + "epoch": 0.006857002918307789, + "grad_norm": 1.5386312007904053, + "learning_rate": 1.5237108059082665e-06, + "loss": 1.4843, + "step": 294 + }, + { + "epoch": 0.006880326057485707, + "grad_norm": 1.3275539875030518, + "learning_rate": 1.528893495724281e-06, + "loss": 1.5444, + "step": 295 + }, + { + "epoch": 0.006903649196663625, + "grad_norm": 1.1002707481384277, + "learning_rate": 1.5340761855402956e-06, + "loss": 1.717, + "step": 296 + }, + { + "epoch": 0.006926972335841543, + "grad_norm": 1.172974944114685, + "learning_rate": 1.5392588753563102e-06, + "loss": 1.6963, + "step": 297 + }, + { + "epoch": 0.00695029547501946, + "grad_norm": 1.0728440284729004, + "learning_rate": 1.5444415651723247e-06, + "loss": 1.6228, + "step": 298 + }, + { + "epoch": 0.006973618614197378, + "grad_norm": 1.274348258972168, + "learning_rate": 1.5496242549883393e-06, + "loss": 1.2559, + "step": 299 + }, + { + "epoch": 0.006996941753375296, + "grad_norm": 1.2520028352737427, + "learning_rate": 1.5548069448043534e-06, + "loss": 1.6118, + "step": 300 + }, + { + "epoch": 0.007020264892553213, + "grad_norm": 1.5844305753707886, + "learning_rate": 1.559989634620368e-06, + "loss": 1.5645, + "step": 301 + }, + { + "epoch": 0.007043588031731131, + "grad_norm": 2.285438299179077, + "learning_rate": 1.5651723244363825e-06, + "loss": 1.4541, + "step": 302 + }, + { + "epoch": 0.007066911170909049, + "grad_norm": 1.2873152494430542, + "learning_rate": 1.570355014252397e-06, + "loss": 1.4835, + "step": 303 + }, + { + "epoch": 0.0070902343100869665, + "grad_norm": 1.1332640647888184, + "learning_rate": 1.5755377040684116e-06, + "loss": 1.8279, + "step": 304 + }, + { + "epoch": 0.0071135574492648835, + "grad_norm": 1.6483525037765503, + "learning_rate": 1.5807203938844262e-06, + "loss": 1.2509, + "step": 305 + }, + { + "epoch": 0.007136880588442801, + "grad_norm": 1.0219485759735107, + "learning_rate": 1.5859030837004408e-06, + "loss": 1.8421, + "step": 306 + }, + { + "epoch": 0.007160203727620719, + "grad_norm": 1.2478340864181519, + "learning_rate": 1.5910857735164551e-06, + "loss": 1.9144, + "step": 307 + }, + { + "epoch": 0.007183526866798637, + "grad_norm": 1.4016437530517578, + "learning_rate": 1.5962684633324697e-06, + "loss": 1.5146, + "step": 308 + }, + { + "epoch": 0.007206850005976554, + "grad_norm": 1.1399790048599243, + "learning_rate": 1.6014511531484842e-06, + "loss": 1.6714, + "step": 309 + }, + { + "epoch": 0.007230173145154472, + "grad_norm": 2.047961473464966, + "learning_rate": 1.6066338429644988e-06, + "loss": 1.1777, + "step": 310 + }, + { + "epoch": 0.00725349628433239, + "grad_norm": 1.1410201787948608, + "learning_rate": 1.6118165327805133e-06, + "loss": 1.6783, + "step": 311 + }, + { + "epoch": 0.007276819423510307, + "grad_norm": 1.2840640544891357, + "learning_rate": 1.616999222596528e-06, + "loss": 1.9351, + "step": 312 + }, + { + "epoch": 0.007300142562688225, + "grad_norm": 0.9116181135177612, + "learning_rate": 1.6221819124125425e-06, + "loss": 1.7705, + "step": 313 + }, + { + "epoch": 0.007323465701866143, + "grad_norm": 1.3190463781356812, + "learning_rate": 1.6273646022285566e-06, + "loss": 1.4484, + "step": 314 + }, + { + "epoch": 0.007346788841044061, + "grad_norm": 0.9988270401954651, + "learning_rate": 1.6325472920445712e-06, + "loss": 1.5159, + "step": 315 + }, + { + "epoch": 0.007370111980221978, + "grad_norm": 0.8620725870132446, + "learning_rate": 1.6377299818605857e-06, + "loss": 1.5605, + "step": 316 + }, + { + "epoch": 0.007393435119399896, + "grad_norm": 1.284604549407959, + "learning_rate": 1.6429126716766003e-06, + "loss": 1.4822, + "step": 317 + }, + { + "epoch": 0.0074167582585778135, + "grad_norm": 1.2546097040176392, + "learning_rate": 1.6480953614926148e-06, + "loss": 1.436, + "step": 318 + }, + { + "epoch": 0.0074400813977557306, + "grad_norm": 0.9116978645324707, + "learning_rate": 1.6532780513086294e-06, + "loss": 1.2708, + "step": 319 + }, + { + "epoch": 0.0074634045369336485, + "grad_norm": 0.9910548329353333, + "learning_rate": 1.658460741124644e-06, + "loss": 1.8144, + "step": 320 + }, + { + "epoch": 0.007486727676111566, + "grad_norm": 1.9879093170166016, + "learning_rate": 1.6636434309406583e-06, + "loss": 1.4826, + "step": 321 + }, + { + "epoch": 0.007510050815289484, + "grad_norm": 1.0845030546188354, + "learning_rate": 1.6688261207566729e-06, + "loss": 1.3364, + "step": 322 + }, + { + "epoch": 0.007533373954467401, + "grad_norm": 1.342966079711914, + "learning_rate": 1.6740088105726874e-06, + "loss": 1.6453, + "step": 323 + }, + { + "epoch": 0.007556697093645319, + "grad_norm": 0.9570252895355225, + "learning_rate": 1.679191500388702e-06, + "loss": 1.5384, + "step": 324 + }, + { + "epoch": 0.007580020232823237, + "grad_norm": 1.531516671180725, + "learning_rate": 1.6843741902047165e-06, + "loss": 1.5775, + "step": 325 + }, + { + "epoch": 0.007603343372001154, + "grad_norm": 1.4623240232467651, + "learning_rate": 1.689556880020731e-06, + "loss": 1.7159, + "step": 326 + }, + { + "epoch": 0.007626666511179072, + "grad_norm": 1.109586238861084, + "learning_rate": 1.6947395698367454e-06, + "loss": 1.7403, + "step": 327 + }, + { + "epoch": 0.00764998965035699, + "grad_norm": 1.3199604749679565, + "learning_rate": 1.6999222596527598e-06, + "loss": 1.7208, + "step": 328 + }, + { + "epoch": 0.007673312789534908, + "grad_norm": 1.0979784727096558, + "learning_rate": 1.7051049494687743e-06, + "loss": 1.6097, + "step": 329 + }, + { + "epoch": 0.007696635928712825, + "grad_norm": 1.0952926874160767, + "learning_rate": 1.710287639284789e-06, + "loss": 1.8262, + "step": 330 + }, + { + "epoch": 0.007719959067890743, + "grad_norm": 1.1149373054504395, + "learning_rate": 1.7154703291008035e-06, + "loss": 1.5762, + "step": 331 + }, + { + "epoch": 0.007743282207068661, + "grad_norm": 1.2090753316879272, + "learning_rate": 1.720653018916818e-06, + "loss": 1.6161, + "step": 332 + }, + { + "epoch": 0.007766605346246578, + "grad_norm": 1.3476163148880005, + "learning_rate": 1.7258357087328326e-06, + "loss": 1.6854, + "step": 333 + }, + { + "epoch": 0.0077899284854244955, + "grad_norm": 1.3222614526748657, + "learning_rate": 1.7310183985488471e-06, + "loss": 1.5996, + "step": 334 + }, + { + "epoch": 0.007813251624602413, + "grad_norm": 1.2350871562957764, + "learning_rate": 1.7362010883648615e-06, + "loss": 1.5052, + "step": 335 + }, + { + "epoch": 0.007836574763780331, + "grad_norm": 1.4628745317459106, + "learning_rate": 1.741383778180876e-06, + "loss": 1.6268, + "step": 336 + }, + { + "epoch": 0.00785989790295825, + "grad_norm": 1.3481048345565796, + "learning_rate": 1.7465664679968906e-06, + "loss": 1.4308, + "step": 337 + }, + { + "epoch": 0.007883221042136167, + "grad_norm": 1.0008901357650757, + "learning_rate": 1.7517491578129052e-06, + "loss": 1.6487, + "step": 338 + }, + { + "epoch": 0.007906544181314083, + "grad_norm": 2.4258437156677246, + "learning_rate": 1.7569318476289195e-06, + "loss": 1.5327, + "step": 339 + }, + { + "epoch": 0.007929867320492001, + "grad_norm": 1.3444914817810059, + "learning_rate": 1.762114537444934e-06, + "loss": 1.5257, + "step": 340 + }, + { + "epoch": 0.007953190459669919, + "grad_norm": 2.297591209411621, + "learning_rate": 1.7672972272609486e-06, + "loss": 1.9581, + "step": 341 + }, + { + "epoch": 0.007976513598847837, + "grad_norm": 1.107711672782898, + "learning_rate": 1.772479917076963e-06, + "loss": 1.3486, + "step": 342 + }, + { + "epoch": 0.007999836738025755, + "grad_norm": 1.4064106941223145, + "learning_rate": 1.7776626068929775e-06, + "loss": 1.3169, + "step": 343 + }, + { + "epoch": 0.008023159877203673, + "grad_norm": 1.1236720085144043, + "learning_rate": 1.782845296708992e-06, + "loss": 2.0225, + "step": 344 + }, + { + "epoch": 0.00804648301638159, + "grad_norm": 1.9214081764221191, + "learning_rate": 1.7880279865250066e-06, + "loss": 1.7269, + "step": 345 + }, + { + "epoch": 0.008069806155559507, + "grad_norm": 1.1544204950332642, + "learning_rate": 1.7932106763410212e-06, + "loss": 1.8407, + "step": 346 + }, + { + "epoch": 0.008093129294737425, + "grad_norm": 1.3266545534133911, + "learning_rate": 1.7983933661570358e-06, + "loss": 1.3316, + "step": 347 + }, + { + "epoch": 0.008116452433915343, + "grad_norm": 1.4208300113677979, + "learning_rate": 1.8035760559730501e-06, + "loss": 1.7712, + "step": 348 + }, + { + "epoch": 0.00813977557309326, + "grad_norm": 1.1849939823150635, + "learning_rate": 1.8087587457890647e-06, + "loss": 1.3843, + "step": 349 + }, + { + "epoch": 0.008163098712271178, + "grad_norm": 0.9147690534591675, + "learning_rate": 1.8139414356050792e-06, + "loss": 1.703, + "step": 350 + }, + { + "epoch": 0.008186421851449096, + "grad_norm": 1.2026822566986084, + "learning_rate": 1.8191241254210938e-06, + "loss": 1.642, + "step": 351 + }, + { + "epoch": 0.008209744990627014, + "grad_norm": 1.6620279550552368, + "learning_rate": 1.8243068152371081e-06, + "loss": 1.2861, + "step": 352 + }, + { + "epoch": 0.00823306812980493, + "grad_norm": 1.20318603515625, + "learning_rate": 1.8294895050531227e-06, + "loss": 1.7781, + "step": 353 + }, + { + "epoch": 0.008256391268982848, + "grad_norm": 1.117148756980896, + "learning_rate": 1.8346721948691372e-06, + "loss": 1.7056, + "step": 354 + }, + { + "epoch": 0.008279714408160766, + "grad_norm": 1.3435394763946533, + "learning_rate": 1.8398548846851516e-06, + "loss": 1.7352, + "step": 355 + }, + { + "epoch": 0.008303037547338684, + "grad_norm": 1.6550534963607788, + "learning_rate": 1.8450375745011662e-06, + "loss": 1.4283, + "step": 356 + }, + { + "epoch": 0.008326360686516602, + "grad_norm": 1.0326530933380127, + "learning_rate": 1.8502202643171807e-06, + "loss": 1.8726, + "step": 357 + }, + { + "epoch": 0.00834968382569452, + "grad_norm": 1.1237214803695679, + "learning_rate": 1.8554029541331953e-06, + "loss": 1.7547, + "step": 358 + }, + { + "epoch": 0.008373006964872438, + "grad_norm": 1.3457711935043335, + "learning_rate": 1.8605856439492098e-06, + "loss": 1.5047, + "step": 359 + }, + { + "epoch": 0.008396330104050354, + "grad_norm": 1.3615081310272217, + "learning_rate": 1.8657683337652244e-06, + "loss": 1.3476, + "step": 360 + }, + { + "epoch": 0.008419653243228272, + "grad_norm": 1.4443084001541138, + "learning_rate": 1.870951023581239e-06, + "loss": 1.4259, + "step": 361 + }, + { + "epoch": 0.00844297638240619, + "grad_norm": 0.9154095649719238, + "learning_rate": 1.8761337133972533e-06, + "loss": 1.6089, + "step": 362 + }, + { + "epoch": 0.008466299521584108, + "grad_norm": 1.1972756385803223, + "learning_rate": 1.8813164032132679e-06, + "loss": 1.5704, + "step": 363 + }, + { + "epoch": 0.008489622660762025, + "grad_norm": 1.1325738430023193, + "learning_rate": 1.8864990930292822e-06, + "loss": 1.7252, + "step": 364 + }, + { + "epoch": 0.008512945799939943, + "grad_norm": 1.2257301807403564, + "learning_rate": 1.8916817828452968e-06, + "loss": 1.5124, + "step": 365 + }, + { + "epoch": 0.008536268939117861, + "grad_norm": 1.7714002132415771, + "learning_rate": 1.8968644726613113e-06, + "loss": 1.5799, + "step": 366 + }, + { + "epoch": 0.008559592078295777, + "grad_norm": 1.1215579509735107, + "learning_rate": 1.9020471624773259e-06, + "loss": 1.7692, + "step": 367 + }, + { + "epoch": 0.008582915217473695, + "grad_norm": 1.3264069557189941, + "learning_rate": 1.9072298522933404e-06, + "loss": 1.7848, + "step": 368 + }, + { + "epoch": 0.008606238356651613, + "grad_norm": 0.9898104667663574, + "learning_rate": 1.912412542109355e-06, + "loss": 1.945, + "step": 369 + }, + { + "epoch": 0.008629561495829531, + "grad_norm": 0.9507944583892822, + "learning_rate": 1.9175952319253693e-06, + "loss": 1.6469, + "step": 370 + }, + { + "epoch": 0.008652884635007449, + "grad_norm": 1.1940997838974, + "learning_rate": 1.9227779217413837e-06, + "loss": 1.5144, + "step": 371 + }, + { + "epoch": 0.008676207774185367, + "grad_norm": 1.2926305532455444, + "learning_rate": 1.9279606115573985e-06, + "loss": 1.6527, + "step": 372 + }, + { + "epoch": 0.008699530913363285, + "grad_norm": 0.9909786581993103, + "learning_rate": 1.933143301373413e-06, + "loss": 1.8003, + "step": 373 + }, + { + "epoch": 0.008722854052541201, + "grad_norm": 1.3900662660598755, + "learning_rate": 1.9383259911894276e-06, + "loss": 1.7743, + "step": 374 + }, + { + "epoch": 0.008746177191719119, + "grad_norm": 0.9942039251327515, + "learning_rate": 1.943508681005442e-06, + "loss": 1.5635, + "step": 375 + }, + { + "epoch": 0.008769500330897037, + "grad_norm": 1.3887672424316406, + "learning_rate": 1.9486913708214563e-06, + "loss": 1.744, + "step": 376 + }, + { + "epoch": 0.008792823470074955, + "grad_norm": 1.2873059511184692, + "learning_rate": 1.953874060637471e-06, + "loss": 1.64, + "step": 377 + }, + { + "epoch": 0.008816146609252873, + "grad_norm": 1.2259247303009033, + "learning_rate": 1.9590567504534854e-06, + "loss": 1.6418, + "step": 378 + }, + { + "epoch": 0.00883946974843079, + "grad_norm": 1.5709097385406494, + "learning_rate": 1.9642394402695e-06, + "loss": 1.4343, + "step": 379 + }, + { + "epoch": 0.008862792887608708, + "grad_norm": 1.016625165939331, + "learning_rate": 1.9694221300855145e-06, + "loss": 1.5838, + "step": 380 + }, + { + "epoch": 0.008886116026786626, + "grad_norm": 1.5763674974441528, + "learning_rate": 1.9746048199015293e-06, + "loss": 1.3391, + "step": 381 + }, + { + "epoch": 0.008909439165964542, + "grad_norm": 1.014722466468811, + "learning_rate": 1.9797875097175436e-06, + "loss": 1.7185, + "step": 382 + }, + { + "epoch": 0.00893276230514246, + "grad_norm": 1.5255705118179321, + "learning_rate": 1.984970199533558e-06, + "loss": 1.5749, + "step": 383 + }, + { + "epoch": 0.008956085444320378, + "grad_norm": 1.4036648273468018, + "learning_rate": 1.9901528893495723e-06, + "loss": 1.4134, + "step": 384 + }, + { + "epoch": 0.008979408583498296, + "grad_norm": 1.327813982963562, + "learning_rate": 1.995335579165587e-06, + "loss": 1.8475, + "step": 385 + }, + { + "epoch": 0.009002731722676214, + "grad_norm": 1.357269287109375, + "learning_rate": 2.0005182689816014e-06, + "loss": 1.4145, + "step": 386 + }, + { + "epoch": 0.009026054861854132, + "grad_norm": 1.4663738012313843, + "learning_rate": 2.005700958797616e-06, + "loss": 1.5207, + "step": 387 + }, + { + "epoch": 0.00904937800103205, + "grad_norm": 0.9792691469192505, + "learning_rate": 2.0108836486136305e-06, + "loss": 1.7392, + "step": 388 + }, + { + "epoch": 0.009072701140209966, + "grad_norm": 1.9074856042861938, + "learning_rate": 2.0160663384296453e-06, + "loss": 1.5931, + "step": 389 + }, + { + "epoch": 0.009096024279387884, + "grad_norm": 1.562455654144287, + "learning_rate": 2.0212490282456597e-06, + "loss": 1.3503, + "step": 390 + }, + { + "epoch": 0.009119347418565802, + "grad_norm": 1.6827714443206787, + "learning_rate": 2.026431718061674e-06, + "loss": 1.8409, + "step": 391 + }, + { + "epoch": 0.00914267055774372, + "grad_norm": 0.969691276550293, + "learning_rate": 2.0316144078776888e-06, + "loss": 1.5167, + "step": 392 + }, + { + "epoch": 0.009165993696921637, + "grad_norm": 1.1107996702194214, + "learning_rate": 2.036797097693703e-06, + "loss": 1.5723, + "step": 393 + }, + { + "epoch": 0.009189316836099555, + "grad_norm": 0.9862359762191772, + "learning_rate": 2.041979787509718e-06, + "loss": 1.1188, + "step": 394 + }, + { + "epoch": 0.009212639975277473, + "grad_norm": 1.4997074604034424, + "learning_rate": 2.0471624773257322e-06, + "loss": 1.6742, + "step": 395 + }, + { + "epoch": 0.00923596311445539, + "grad_norm": 1.1336885690689087, + "learning_rate": 2.052345167141747e-06, + "loss": 1.5602, + "step": 396 + }, + { + "epoch": 0.009259286253633307, + "grad_norm": 1.4929397106170654, + "learning_rate": 2.057527856957761e-06, + "loss": 1.4891, + "step": 397 + }, + { + "epoch": 0.009282609392811225, + "grad_norm": 1.3118637800216675, + "learning_rate": 2.0627105467737757e-06, + "loss": 1.5758, + "step": 398 + }, + { + "epoch": 0.009305932531989143, + "grad_norm": 1.1043623685836792, + "learning_rate": 2.06789323658979e-06, + "loss": 1.9455, + "step": 399 + }, + { + "epoch": 0.009329255671167061, + "grad_norm": 1.3472813367843628, + "learning_rate": 2.073075926405805e-06, + "loss": 1.4657, + "step": 400 + }, + { + "epoch": 0.009352578810344979, + "grad_norm": 1.5614628791809082, + "learning_rate": 2.078258616221819e-06, + "loss": 1.3351, + "step": 401 + }, + { + "epoch": 0.009375901949522897, + "grad_norm": 1.393477439880371, + "learning_rate": 2.083441306037834e-06, + "loss": 1.8887, + "step": 402 + }, + { + "epoch": 0.009399225088700813, + "grad_norm": 1.0576095581054688, + "learning_rate": 2.0886239958538483e-06, + "loss": 1.7814, + "step": 403 + }, + { + "epoch": 0.00942254822787873, + "grad_norm": 1.5161347389221191, + "learning_rate": 2.0938066856698626e-06, + "loss": 1.2316, + "step": 404 + }, + { + "epoch": 0.009445871367056649, + "grad_norm": 1.05890691280365, + "learning_rate": 2.0989893754858774e-06, + "loss": 1.5303, + "step": 405 + }, + { + "epoch": 0.009469194506234567, + "grad_norm": 0.801816463470459, + "learning_rate": 2.1041720653018918e-06, + "loss": 1.5165, + "step": 406 + }, + { + "epoch": 0.009492517645412485, + "grad_norm": 1.2811832427978516, + "learning_rate": 2.1093547551179065e-06, + "loss": 1.8638, + "step": 407 + }, + { + "epoch": 0.009515840784590402, + "grad_norm": 1.2984956502914429, + "learning_rate": 2.114537444933921e-06, + "loss": 1.4195, + "step": 408 + }, + { + "epoch": 0.00953916392376832, + "grad_norm": 2.3772926330566406, + "learning_rate": 2.1197201347499356e-06, + "loss": 1.2616, + "step": 409 + }, + { + "epoch": 0.009562487062946236, + "grad_norm": 1.102181315422058, + "learning_rate": 2.12490282456595e-06, + "loss": 1.6683, + "step": 410 + }, + { + "epoch": 0.009585810202124154, + "grad_norm": 1.4473963975906372, + "learning_rate": 2.1300855143819643e-06, + "loss": 1.6474, + "step": 411 + }, + { + "epoch": 0.009609133341302072, + "grad_norm": 2.3995816707611084, + "learning_rate": 2.1352682041979787e-06, + "loss": 1.6203, + "step": 412 + }, + { + "epoch": 0.00963245648047999, + "grad_norm": 0.9490773677825928, + "learning_rate": 2.1404508940139935e-06, + "loss": 1.8082, + "step": 413 + }, + { + "epoch": 0.009655779619657908, + "grad_norm": 0.9358771443367004, + "learning_rate": 2.145633583830008e-06, + "loss": 1.5929, + "step": 414 + }, + { + "epoch": 0.009679102758835826, + "grad_norm": 0.9875616431236267, + "learning_rate": 2.1508162736460226e-06, + "loss": 1.4312, + "step": 415 + }, + { + "epoch": 0.009702425898013744, + "grad_norm": 1.197416067123413, + "learning_rate": 2.155998963462037e-06, + "loss": 1.3165, + "step": 416 + }, + { + "epoch": 0.00972574903719166, + "grad_norm": 2.0210750102996826, + "learning_rate": 2.1611816532780513e-06, + "loss": 1.4962, + "step": 417 + }, + { + "epoch": 0.009749072176369578, + "grad_norm": 1.2700085639953613, + "learning_rate": 2.166364343094066e-06, + "loss": 1.6101, + "step": 418 + }, + { + "epoch": 0.009772395315547496, + "grad_norm": 1.124679684638977, + "learning_rate": 2.1715470329100804e-06, + "loss": 1.7477, + "step": 419 + }, + { + "epoch": 0.009795718454725414, + "grad_norm": 1.178290843963623, + "learning_rate": 2.176729722726095e-06, + "loss": 1.4108, + "step": 420 + }, + { + "epoch": 0.009819041593903332, + "grad_norm": 1.792117953300476, + "learning_rate": 2.1819124125421095e-06, + "loss": 1.5568, + "step": 421 + }, + { + "epoch": 0.00984236473308125, + "grad_norm": 1.7381610870361328, + "learning_rate": 2.1870951023581243e-06, + "loss": 1.3229, + "step": 422 + }, + { + "epoch": 0.009865687872259167, + "grad_norm": 1.023553490638733, + "learning_rate": 2.1922777921741386e-06, + "loss": 1.1633, + "step": 423 + }, + { + "epoch": 0.009889011011437084, + "grad_norm": 1.5537900924682617, + "learning_rate": 2.197460481990153e-06, + "loss": 1.291, + "step": 424 + }, + { + "epoch": 0.009912334150615001, + "grad_norm": 1.722598671913147, + "learning_rate": 2.2026431718061673e-06, + "loss": 1.5201, + "step": 425 + }, + { + "epoch": 0.00993565728979292, + "grad_norm": 1.546295166015625, + "learning_rate": 2.207825861622182e-06, + "loss": 1.3554, + "step": 426 + }, + { + "epoch": 0.009958980428970837, + "grad_norm": 1.4075593948364258, + "learning_rate": 2.2130085514381964e-06, + "loss": 1.3831, + "step": 427 + }, + { + "epoch": 0.009982303568148755, + "grad_norm": 1.441125512123108, + "learning_rate": 2.218191241254211e-06, + "loss": 1.4806, + "step": 428 + }, + { + "epoch": 0.010005626707326673, + "grad_norm": 1.4198213815689087, + "learning_rate": 2.2233739310702255e-06, + "loss": 1.6962, + "step": 429 + }, + { + "epoch": 0.010028949846504591, + "grad_norm": 1.1716971397399902, + "learning_rate": 2.2285566208862403e-06, + "loss": 1.0423, + "step": 430 + }, + { + "epoch": 0.010052272985682507, + "grad_norm": 1.1271895170211792, + "learning_rate": 2.2337393107022547e-06, + "loss": 1.4246, + "step": 431 + }, + { + "epoch": 0.010075596124860425, + "grad_norm": 1.2987208366394043, + "learning_rate": 2.238922000518269e-06, + "loss": 1.5946, + "step": 432 + }, + { + "epoch": 0.010098919264038343, + "grad_norm": 1.7283997535705566, + "learning_rate": 2.2441046903342838e-06, + "loss": 1.5761, + "step": 433 + }, + { + "epoch": 0.01012224240321626, + "grad_norm": 1.635098934173584, + "learning_rate": 2.249287380150298e-06, + "loss": 1.6912, + "step": 434 + }, + { + "epoch": 0.010145565542394179, + "grad_norm": 2.1896469593048096, + "learning_rate": 2.254470069966313e-06, + "loss": 1.2961, + "step": 435 + }, + { + "epoch": 0.010168888681572097, + "grad_norm": 1.1874053478240967, + "learning_rate": 2.2596527597823272e-06, + "loss": 1.4999, + "step": 436 + }, + { + "epoch": 0.010192211820750014, + "grad_norm": 1.2898855209350586, + "learning_rate": 2.264835449598342e-06, + "loss": 1.7152, + "step": 437 + }, + { + "epoch": 0.010215534959927932, + "grad_norm": 0.792107105255127, + "learning_rate": 2.270018139414356e-06, + "loss": 1.4129, + "step": 438 + }, + { + "epoch": 0.010238858099105849, + "grad_norm": 1.2092666625976562, + "learning_rate": 2.2752008292303707e-06, + "loss": 1.4687, + "step": 439 + }, + { + "epoch": 0.010262181238283766, + "grad_norm": 1.2261115312576294, + "learning_rate": 2.280383519046385e-06, + "loss": 1.5548, + "step": 440 + }, + { + "epoch": 0.010285504377461684, + "grad_norm": 2.0835094451904297, + "learning_rate": 2.2855662088624e-06, + "loss": 1.5925, + "step": 441 + }, + { + "epoch": 0.010308827516639602, + "grad_norm": 1.075907826423645, + "learning_rate": 2.290748898678414e-06, + "loss": 1.4967, + "step": 442 + }, + { + "epoch": 0.01033215065581752, + "grad_norm": 0.9633646011352539, + "learning_rate": 2.295931588494429e-06, + "loss": 1.6798, + "step": 443 + }, + { + "epoch": 0.010355473794995438, + "grad_norm": 1.6833699941635132, + "learning_rate": 2.3011142783104433e-06, + "loss": 1.3053, + "step": 444 + }, + { + "epoch": 0.010378796934173356, + "grad_norm": 1.1333974599838257, + "learning_rate": 2.3062969681264576e-06, + "loss": 1.3658, + "step": 445 + }, + { + "epoch": 0.010402120073351272, + "grad_norm": 1.3382309675216675, + "learning_rate": 2.3114796579424724e-06, + "loss": 1.6492, + "step": 446 + }, + { + "epoch": 0.01042544321252919, + "grad_norm": 0.7148923873901367, + "learning_rate": 2.3166623477584868e-06, + "loss": 1.6269, + "step": 447 + }, + { + "epoch": 0.010448766351707108, + "grad_norm": 1.084245204925537, + "learning_rate": 2.3218450375745015e-06, + "loss": 2.0708, + "step": 448 + }, + { + "epoch": 0.010472089490885026, + "grad_norm": 1.1463004350662231, + "learning_rate": 2.327027727390516e-06, + "loss": 2.0115, + "step": 449 + }, + { + "epoch": 0.010495412630062944, + "grad_norm": 1.5500133037567139, + "learning_rate": 2.3322104172065306e-06, + "loss": 1.5454, + "step": 450 + }, + { + "epoch": 0.010518735769240862, + "grad_norm": 1.2993839979171753, + "learning_rate": 2.337393107022545e-06, + "loss": 1.5475, + "step": 451 + }, + { + "epoch": 0.01054205890841878, + "grad_norm": 1.295839786529541, + "learning_rate": 2.3425757968385593e-06, + "loss": 1.2895, + "step": 452 + }, + { + "epoch": 0.010565382047596696, + "grad_norm": 1.045040488243103, + "learning_rate": 2.3477584866545737e-06, + "loss": 1.7306, + "step": 453 + }, + { + "epoch": 0.010588705186774613, + "grad_norm": 1.4592766761779785, + "learning_rate": 2.3529411764705885e-06, + "loss": 1.7795, + "step": 454 + }, + { + "epoch": 0.010612028325952531, + "grad_norm": 0.9432761073112488, + "learning_rate": 2.358123866286603e-06, + "loss": 1.6963, + "step": 455 + }, + { + "epoch": 0.01063535146513045, + "grad_norm": 1.3770086765289307, + "learning_rate": 2.3633065561026176e-06, + "loss": 1.2003, + "step": 456 + }, + { + "epoch": 0.010658674604308367, + "grad_norm": 1.1453793048858643, + "learning_rate": 2.368489245918632e-06, + "loss": 1.9012, + "step": 457 + }, + { + "epoch": 0.010681997743486285, + "grad_norm": 1.2836976051330566, + "learning_rate": 2.3736719357346467e-06, + "loss": 1.4324, + "step": 458 + }, + { + "epoch": 0.010705320882664203, + "grad_norm": 1.6498123407363892, + "learning_rate": 2.378854625550661e-06, + "loss": 1.6212, + "step": 459 + }, + { + "epoch": 0.010728644021842119, + "grad_norm": 1.3681795597076416, + "learning_rate": 2.3840373153666754e-06, + "loss": 1.6047, + "step": 460 + }, + { + "epoch": 0.010751967161020037, + "grad_norm": 1.4474722146987915, + "learning_rate": 2.38922000518269e-06, + "loss": 1.5279, + "step": 461 + }, + { + "epoch": 0.010775290300197955, + "grad_norm": 1.4832510948181152, + "learning_rate": 2.3944026949987045e-06, + "loss": 1.7073, + "step": 462 + }, + { + "epoch": 0.010798613439375873, + "grad_norm": 1.343935251235962, + "learning_rate": 2.3995853848147193e-06, + "loss": 1.4637, + "step": 463 + }, + { + "epoch": 0.01082193657855379, + "grad_norm": 1.8285539150238037, + "learning_rate": 2.4047680746307336e-06, + "loss": 1.3944, + "step": 464 + }, + { + "epoch": 0.010845259717731709, + "grad_norm": 1.4653230905532837, + "learning_rate": 2.4099507644467484e-06, + "loss": 1.8847, + "step": 465 + }, + { + "epoch": 0.010868582856909626, + "grad_norm": 1.4410351514816284, + "learning_rate": 2.4151334542627623e-06, + "loss": 1.7298, + "step": 466 + }, + { + "epoch": 0.010891905996087543, + "grad_norm": 1.3057256937026978, + "learning_rate": 2.420316144078777e-06, + "loss": 1.6188, + "step": 467 + }, + { + "epoch": 0.01091522913526546, + "grad_norm": 1.574479103088379, + "learning_rate": 2.4254988338947914e-06, + "loss": 1.585, + "step": 468 + }, + { + "epoch": 0.010938552274443378, + "grad_norm": 1.4391696453094482, + "learning_rate": 2.430681523710806e-06, + "loss": 1.7272, + "step": 469 + }, + { + "epoch": 0.010961875413621296, + "grad_norm": 2.304706335067749, + "learning_rate": 2.4358642135268205e-06, + "loss": 1.7127, + "step": 470 + }, + { + "epoch": 0.010985198552799214, + "grad_norm": 1.2380545139312744, + "learning_rate": 2.4410469033428353e-06, + "loss": 1.5428, + "step": 471 + }, + { + "epoch": 0.011008521691977132, + "grad_norm": 1.303446888923645, + "learning_rate": 2.4462295931588497e-06, + "loss": 1.609, + "step": 472 + }, + { + "epoch": 0.01103184483115505, + "grad_norm": 1.3888837099075317, + "learning_rate": 2.451412282974864e-06, + "loss": 1.7134, + "step": 473 + }, + { + "epoch": 0.011055167970332966, + "grad_norm": 0.9802701473236084, + "learning_rate": 2.4565949727908788e-06, + "loss": 1.4401, + "step": 474 + }, + { + "epoch": 0.011078491109510884, + "grad_norm": 1.5808403491973877, + "learning_rate": 2.461777662606893e-06, + "loss": 1.7415, + "step": 475 + }, + { + "epoch": 0.011101814248688802, + "grad_norm": 1.299912691116333, + "learning_rate": 2.466960352422908e-06, + "loss": 1.361, + "step": 476 + }, + { + "epoch": 0.01112513738786672, + "grad_norm": 0.9326110482215881, + "learning_rate": 2.4721430422389222e-06, + "loss": 1.222, + "step": 477 + }, + { + "epoch": 0.011148460527044638, + "grad_norm": 1.0385396480560303, + "learning_rate": 2.477325732054937e-06, + "loss": 1.4813, + "step": 478 + }, + { + "epoch": 0.011171783666222556, + "grad_norm": 1.1004397869110107, + "learning_rate": 2.482508421870951e-06, + "loss": 1.5064, + "step": 479 + }, + { + "epoch": 0.011195106805400474, + "grad_norm": 1.274898886680603, + "learning_rate": 2.4876911116869657e-06, + "loss": 1.3046, + "step": 480 + }, + { + "epoch": 0.01121842994457839, + "grad_norm": 1.0818660259246826, + "learning_rate": 2.49287380150298e-06, + "loss": 1.878, + "step": 481 + }, + { + "epoch": 0.011241753083756308, + "grad_norm": 1.2744652032852173, + "learning_rate": 2.498056491318995e-06, + "loss": 1.6394, + "step": 482 + }, + { + "epoch": 0.011265076222934226, + "grad_norm": 1.0467538833618164, + "learning_rate": 2.503239181135009e-06, + "loss": 1.8949, + "step": 483 + }, + { + "epoch": 0.011288399362112143, + "grad_norm": 1.2507177591323853, + "learning_rate": 2.508421870951024e-06, + "loss": 1.5386, + "step": 484 + }, + { + "epoch": 0.011311722501290061, + "grad_norm": 2.0707380771636963, + "learning_rate": 2.5136045607670383e-06, + "loss": 1.3359, + "step": 485 + }, + { + "epoch": 0.01133504564046798, + "grad_norm": 1.0060955286026, + "learning_rate": 2.518787250583053e-06, + "loss": 1.5551, + "step": 486 + }, + { + "epoch": 0.011358368779645897, + "grad_norm": 2.1019294261932373, + "learning_rate": 2.5239699403990674e-06, + "loss": 1.4009, + "step": 487 + }, + { + "epoch": 0.011381691918823813, + "grad_norm": 1.2085974216461182, + "learning_rate": 2.529152630215082e-06, + "loss": 1.1264, + "step": 488 + }, + { + "epoch": 0.011405015058001731, + "grad_norm": 1.2670215368270874, + "learning_rate": 2.5343353200310965e-06, + "loss": 1.4005, + "step": 489 + }, + { + "epoch": 0.011428338197179649, + "grad_norm": 0.976809024810791, + "learning_rate": 2.5395180098471104e-06, + "loss": 1.6539, + "step": 490 + }, + { + "epoch": 0.011451661336357567, + "grad_norm": 1.8012447357177734, + "learning_rate": 2.5447006996631252e-06, + "loss": 1.5083, + "step": 491 + }, + { + "epoch": 0.011474984475535485, + "grad_norm": 2.0657784938812256, + "learning_rate": 2.5498833894791396e-06, + "loss": 1.4127, + "step": 492 + }, + { + "epoch": 0.011498307614713403, + "grad_norm": 1.4070103168487549, + "learning_rate": 2.5550660792951543e-06, + "loss": 1.4707, + "step": 493 + }, + { + "epoch": 0.01152163075389132, + "grad_norm": 0.859045147895813, + "learning_rate": 2.5602487691111687e-06, + "loss": 1.6301, + "step": 494 + }, + { + "epoch": 0.011544953893069239, + "grad_norm": 1.5209952592849731, + "learning_rate": 2.5654314589271835e-06, + "loss": 1.8438, + "step": 495 + }, + { + "epoch": 0.011568277032247155, + "grad_norm": 1.1508231163024902, + "learning_rate": 2.570614148743198e-06, + "loss": 1.2495, + "step": 496 + }, + { + "epoch": 0.011591600171425073, + "grad_norm": 0.9130313396453857, + "learning_rate": 2.5757968385592126e-06, + "loss": 1.1848, + "step": 497 + }, + { + "epoch": 0.01161492331060299, + "grad_norm": 1.5925562381744385, + "learning_rate": 2.580979528375227e-06, + "loss": 1.4745, + "step": 498 + }, + { + "epoch": 0.011638246449780908, + "grad_norm": 2.5118539333343506, + "learning_rate": 2.5861622181912417e-06, + "loss": 1.6218, + "step": 499 + }, + { + "epoch": 0.011661569588958826, + "grad_norm": 1.272691249847412, + "learning_rate": 2.591344908007256e-06, + "loss": 1.2147, + "step": 500 + }, + { + "epoch": 0.011684892728136744, + "grad_norm": 1.1436160802841187, + "learning_rate": 2.596527597823271e-06, + "loss": 1.5556, + "step": 501 + }, + { + "epoch": 0.011708215867314662, + "grad_norm": 1.0195647478103638, + "learning_rate": 2.601710287639285e-06, + "loss": 1.3303, + "step": 502 + }, + { + "epoch": 0.011731539006492578, + "grad_norm": 1.4576568603515625, + "learning_rate": 2.6068929774553e-06, + "loss": 1.6531, + "step": 503 + }, + { + "epoch": 0.011754862145670496, + "grad_norm": 1.360716462135315, + "learning_rate": 2.612075667271314e-06, + "loss": 1.1761, + "step": 504 + }, + { + "epoch": 0.011778185284848414, + "grad_norm": 2.7770462036132812, + "learning_rate": 2.617258357087328e-06, + "loss": 1.247, + "step": 505 + }, + { + "epoch": 0.011801508424026332, + "grad_norm": 1.3706661462783813, + "learning_rate": 2.622441046903343e-06, + "loss": 1.5103, + "step": 506 + }, + { + "epoch": 0.01182483156320425, + "grad_norm": 1.5405017137527466, + "learning_rate": 2.6276237367193573e-06, + "loss": 1.6827, + "step": 507 + }, + { + "epoch": 0.011848154702382168, + "grad_norm": 1.1809494495391846, + "learning_rate": 2.632806426535372e-06, + "loss": 1.7162, + "step": 508 + }, + { + "epoch": 0.011871477841560086, + "grad_norm": 1.085557222366333, + "learning_rate": 2.6379891163513864e-06, + "loss": 1.514, + "step": 509 + }, + { + "epoch": 0.011894800980738002, + "grad_norm": 1.2155910730361938, + "learning_rate": 2.643171806167401e-06, + "loss": 1.4029, + "step": 510 + }, + { + "epoch": 0.01191812411991592, + "grad_norm": 1.240242600440979, + "learning_rate": 2.6483544959834155e-06, + "loss": 1.4336, + "step": 511 + }, + { + "epoch": 0.011941447259093838, + "grad_norm": 1.649802327156067, + "learning_rate": 2.6535371857994303e-06, + "loss": 1.9082, + "step": 512 + }, + { + "epoch": 0.011964770398271755, + "grad_norm": 1.3479831218719482, + "learning_rate": 2.6587198756154447e-06, + "loss": 1.5424, + "step": 513 + }, + { + "epoch": 0.011988093537449673, + "grad_norm": 1.2537102699279785, + "learning_rate": 2.6639025654314594e-06, + "loss": 1.6061, + "step": 514 + }, + { + "epoch": 0.012011416676627591, + "grad_norm": 1.1049939393997192, + "learning_rate": 2.6690852552474738e-06, + "loss": 1.8361, + "step": 515 + }, + { + "epoch": 0.012034739815805509, + "grad_norm": 2.9946062564849854, + "learning_rate": 2.6742679450634885e-06, + "loss": 1.4471, + "step": 516 + }, + { + "epoch": 0.012058062954983425, + "grad_norm": 0.9455610513687134, + "learning_rate": 2.6794506348795025e-06, + "loss": 1.6831, + "step": 517 + }, + { + "epoch": 0.012081386094161343, + "grad_norm": 1.4750438928604126, + "learning_rate": 2.684633324695517e-06, + "loss": 1.3143, + "step": 518 + }, + { + "epoch": 0.012104709233339261, + "grad_norm": 1.1056557893753052, + "learning_rate": 2.6898160145115316e-06, + "loss": 1.5054, + "step": 519 + }, + { + "epoch": 0.012128032372517179, + "grad_norm": 0.9718064069747925, + "learning_rate": 2.694998704327546e-06, + "loss": 1.3134, + "step": 520 + }, + { + "epoch": 0.012151355511695097, + "grad_norm": 2.2384724617004395, + "learning_rate": 2.7001813941435607e-06, + "loss": 1.4851, + "step": 521 + }, + { + "epoch": 0.012174678650873015, + "grad_norm": 1.2468239068984985, + "learning_rate": 2.705364083959575e-06, + "loss": 1.4873, + "step": 522 + }, + { + "epoch": 0.012198001790050933, + "grad_norm": 1.4248602390289307, + "learning_rate": 2.71054677377559e-06, + "loss": 1.7643, + "step": 523 + }, + { + "epoch": 0.012221324929228849, + "grad_norm": 1.3377385139465332, + "learning_rate": 2.715729463591604e-06, + "loss": 1.7064, + "step": 524 + }, + { + "epoch": 0.012244648068406767, + "grad_norm": 0.9933966994285583, + "learning_rate": 2.720912153407619e-06, + "loss": 1.7187, + "step": 525 + }, + { + "epoch": 0.012267971207584685, + "grad_norm": 1.018750548362732, + "learning_rate": 2.7260948432236333e-06, + "loss": 1.5915, + "step": 526 + }, + { + "epoch": 0.012291294346762602, + "grad_norm": 1.356325387954712, + "learning_rate": 2.731277533039648e-06, + "loss": 1.7193, + "step": 527 + }, + { + "epoch": 0.01231461748594052, + "grad_norm": 1.2781217098236084, + "learning_rate": 2.7364602228556624e-06, + "loss": 1.5494, + "step": 528 + }, + { + "epoch": 0.012337940625118438, + "grad_norm": 1.561498761177063, + "learning_rate": 2.741642912671677e-06, + "loss": 1.6972, + "step": 529 + }, + { + "epoch": 0.012361263764296356, + "grad_norm": 1.1695748567581177, + "learning_rate": 2.7468256024876915e-06, + "loss": 2.1633, + "step": 530 + }, + { + "epoch": 0.012384586903474272, + "grad_norm": 1.4304964542388916, + "learning_rate": 2.7520082923037054e-06, + "loss": 1.6321, + "step": 531 + }, + { + "epoch": 0.01240791004265219, + "grad_norm": 1.0513828992843628, + "learning_rate": 2.7571909821197202e-06, + "loss": 1.2897, + "step": 532 + }, + { + "epoch": 0.012431233181830108, + "grad_norm": 1.0206960439682007, + "learning_rate": 2.7623736719357346e-06, + "loss": 1.7842, + "step": 533 + }, + { + "epoch": 0.012454556321008026, + "grad_norm": 1.1440876722335815, + "learning_rate": 2.7675563617517493e-06, + "loss": 1.4399, + "step": 534 + }, + { + "epoch": 0.012477879460185944, + "grad_norm": 1.0837441682815552, + "learning_rate": 2.7727390515677637e-06, + "loss": 1.5155, + "step": 535 + }, + { + "epoch": 0.012501202599363862, + "grad_norm": 1.071378231048584, + "learning_rate": 2.7779217413837785e-06, + "loss": 1.6459, + "step": 536 + }, + { + "epoch": 0.01252452573854178, + "grad_norm": 1.6966552734375, + "learning_rate": 2.783104431199793e-06, + "loss": 1.6015, + "step": 537 + }, + { + "epoch": 0.012547848877719696, + "grad_norm": 1.2789183855056763, + "learning_rate": 2.7882871210158076e-06, + "loss": 1.2423, + "step": 538 + }, + { + "epoch": 0.012571172016897614, + "grad_norm": 1.2072651386260986, + "learning_rate": 2.793469810831822e-06, + "loss": 1.69, + "step": 539 + }, + { + "epoch": 0.012594495156075532, + "grad_norm": 1.5257117748260498, + "learning_rate": 2.7986525006478367e-06, + "loss": 1.7608, + "step": 540 + }, + { + "epoch": 0.01261781829525345, + "grad_norm": 1.0233759880065918, + "learning_rate": 2.803835190463851e-06, + "loss": 1.1299, + "step": 541 + }, + { + "epoch": 0.012641141434431367, + "grad_norm": 1.8280616998672485, + "learning_rate": 2.809017880279866e-06, + "loss": 1.3338, + "step": 542 + }, + { + "epoch": 0.012664464573609285, + "grad_norm": 1.6891363859176636, + "learning_rate": 2.81420057009588e-06, + "loss": 1.5505, + "step": 543 + }, + { + "epoch": 0.012687787712787203, + "grad_norm": 1.1501421928405762, + "learning_rate": 2.819383259911895e-06, + "loss": 1.6788, + "step": 544 + }, + { + "epoch": 0.01271111085196512, + "grad_norm": 1.107029914855957, + "learning_rate": 2.824565949727909e-06, + "loss": 1.3782, + "step": 545 + }, + { + "epoch": 0.012734433991143037, + "grad_norm": 0.9627429246902466, + "learning_rate": 2.829748639543923e-06, + "loss": 1.3155, + "step": 546 + }, + { + "epoch": 0.012757757130320955, + "grad_norm": 2.330007791519165, + "learning_rate": 2.834931329359938e-06, + "loss": 1.425, + "step": 547 + }, + { + "epoch": 0.012781080269498873, + "grad_norm": 1.4026503562927246, + "learning_rate": 2.8401140191759523e-06, + "loss": 1.5578, + "step": 548 + }, + { + "epoch": 0.012804403408676791, + "grad_norm": 0.9430487155914307, + "learning_rate": 2.845296708991967e-06, + "loss": 1.6075, + "step": 549 + }, + { + "epoch": 0.012827726547854709, + "grad_norm": 1.0779294967651367, + "learning_rate": 2.8504793988079814e-06, + "loss": 1.5169, + "step": 550 + }, + { + "epoch": 0.012851049687032627, + "grad_norm": 1.130324125289917, + "learning_rate": 2.855662088623996e-06, + "loss": 1.5016, + "step": 551 + }, + { + "epoch": 0.012874372826210545, + "grad_norm": 1.0127092599868774, + "learning_rate": 2.8608447784400105e-06, + "loss": 1.8715, + "step": 552 + }, + { + "epoch": 0.01289769596538846, + "grad_norm": 1.1831302642822266, + "learning_rate": 2.8660274682560253e-06, + "loss": 1.678, + "step": 553 + }, + { + "epoch": 0.012921019104566379, + "grad_norm": 1.3394455909729004, + "learning_rate": 2.8712101580720397e-06, + "loss": 1.4129, + "step": 554 + }, + { + "epoch": 0.012944342243744297, + "grad_norm": 1.2189030647277832, + "learning_rate": 2.8763928478880544e-06, + "loss": 1.7364, + "step": 555 + }, + { + "epoch": 0.012967665382922215, + "grad_norm": 1.2808138132095337, + "learning_rate": 2.8815755377040688e-06, + "loss": 1.6274, + "step": 556 + }, + { + "epoch": 0.012990988522100132, + "grad_norm": 1.0384689569473267, + "learning_rate": 2.8867582275200835e-06, + "loss": 1.5942, + "step": 557 + }, + { + "epoch": 0.01301431166127805, + "grad_norm": 1.8520807027816772, + "learning_rate": 2.891940917336098e-06, + "loss": 1.3067, + "step": 558 + }, + { + "epoch": 0.013037634800455968, + "grad_norm": 1.1817374229431152, + "learning_rate": 2.897123607152112e-06, + "loss": 1.6405, + "step": 559 + }, + { + "epoch": 0.013060957939633884, + "grad_norm": 1.1010823249816895, + "learning_rate": 2.9023062969681266e-06, + "loss": 1.4339, + "step": 560 + }, + { + "epoch": 0.013084281078811802, + "grad_norm": 1.2461942434310913, + "learning_rate": 2.907488986784141e-06, + "loss": 1.9866, + "step": 561 + }, + { + "epoch": 0.01310760421798972, + "grad_norm": 1.1503125429153442, + "learning_rate": 2.9126716766001557e-06, + "loss": 1.585, + "step": 562 + }, + { + "epoch": 0.013130927357167638, + "grad_norm": 1.542434573173523, + "learning_rate": 2.91785436641617e-06, + "loss": 1.4524, + "step": 563 + }, + { + "epoch": 0.013154250496345556, + "grad_norm": 1.0469673871994019, + "learning_rate": 2.923037056232185e-06, + "loss": 1.6884, + "step": 564 + }, + { + "epoch": 0.013177573635523474, + "grad_norm": 1.5137437582015991, + "learning_rate": 2.928219746048199e-06, + "loss": 1.5377, + "step": 565 + }, + { + "epoch": 0.013200896774701392, + "grad_norm": 1.1454534530639648, + "learning_rate": 2.933402435864214e-06, + "loss": 1.8508, + "step": 566 + }, + { + "epoch": 0.013224219913879308, + "grad_norm": 1.310381531715393, + "learning_rate": 2.9385851256802283e-06, + "loss": 1.5774, + "step": 567 + }, + { + "epoch": 0.013247543053057226, + "grad_norm": 1.1223838329315186, + "learning_rate": 2.943767815496243e-06, + "loss": 1.4496, + "step": 568 + }, + { + "epoch": 0.013270866192235144, + "grad_norm": 1.4537910223007202, + "learning_rate": 2.9489505053122574e-06, + "loss": 1.4423, + "step": 569 + }, + { + "epoch": 0.013294189331413062, + "grad_norm": 1.1783167123794556, + "learning_rate": 2.954133195128272e-06, + "loss": 1.9314, + "step": 570 + }, + { + "epoch": 0.01331751247059098, + "grad_norm": 1.211719274520874, + "learning_rate": 2.9593158849442865e-06, + "loss": 1.5366, + "step": 571 + }, + { + "epoch": 0.013340835609768897, + "grad_norm": 2.9552671909332275, + "learning_rate": 2.9644985747603004e-06, + "loss": 1.3431, + "step": 572 + }, + { + "epoch": 0.013364158748946815, + "grad_norm": 1.2814795970916748, + "learning_rate": 2.9696812645763152e-06, + "loss": 1.3879, + "step": 573 + }, + { + "epoch": 0.013387481888124731, + "grad_norm": 1.2598010301589966, + "learning_rate": 2.9748639543923296e-06, + "loss": 1.4775, + "step": 574 + }, + { + "epoch": 0.01341080502730265, + "grad_norm": 1.3874925374984741, + "learning_rate": 2.9800466442083443e-06, + "loss": 1.4012, + "step": 575 + }, + { + "epoch": 0.013434128166480567, + "grad_norm": 1.1846306324005127, + "learning_rate": 2.9852293340243587e-06, + "loss": 1.4491, + "step": 576 + }, + { + "epoch": 0.013457451305658485, + "grad_norm": 1.388150691986084, + "learning_rate": 2.9904120238403734e-06, + "loss": 1.6913, + "step": 577 + }, + { + "epoch": 0.013480774444836403, + "grad_norm": 1.8026880025863647, + "learning_rate": 2.995594713656388e-06, + "loss": 1.1754, + "step": 578 + }, + { + "epoch": 0.013504097584014321, + "grad_norm": 1.9366620779037476, + "learning_rate": 3.0007774034724026e-06, + "loss": 1.4406, + "step": 579 + }, + { + "epoch": 0.013527420723192239, + "grad_norm": 1.039657473564148, + "learning_rate": 3.005960093288417e-06, + "loss": 1.4823, + "step": 580 + }, + { + "epoch": 0.013550743862370155, + "grad_norm": 1.0928449630737305, + "learning_rate": 3.0111427831044317e-06, + "loss": 1.4502, + "step": 581 + }, + { + "epoch": 0.013574067001548073, + "grad_norm": 2.408292531967163, + "learning_rate": 3.016325472920446e-06, + "loss": 1.4778, + "step": 582 + }, + { + "epoch": 0.01359739014072599, + "grad_norm": 1.2284953594207764, + "learning_rate": 3.021508162736461e-06, + "loss": 1.5887, + "step": 583 + }, + { + "epoch": 0.013620713279903909, + "grad_norm": 1.3841763734817505, + "learning_rate": 3.026690852552475e-06, + "loss": 1.3778, + "step": 584 + }, + { + "epoch": 0.013644036419081827, + "grad_norm": 1.305172324180603, + "learning_rate": 3.03187354236849e-06, + "loss": 1.2837, + "step": 585 + }, + { + "epoch": 0.013667359558259744, + "grad_norm": 1.087904691696167, + "learning_rate": 3.037056232184504e-06, + "loss": 1.4361, + "step": 586 + }, + { + "epoch": 0.013690682697437662, + "grad_norm": 1.1818716526031494, + "learning_rate": 3.042238922000518e-06, + "loss": 1.4903, + "step": 587 + }, + { + "epoch": 0.013714005836615578, + "grad_norm": 0.9969412088394165, + "learning_rate": 3.047421611816533e-06, + "loss": 1.6923, + "step": 588 + }, + { + "epoch": 0.013737328975793496, + "grad_norm": 1.3729232549667358, + "learning_rate": 3.0526043016325473e-06, + "loss": 1.4219, + "step": 589 + }, + { + "epoch": 0.013760652114971414, + "grad_norm": 1.091769814491272, + "learning_rate": 3.057786991448562e-06, + "loss": 1.6978, + "step": 590 + }, + { + "epoch": 0.013783975254149332, + "grad_norm": 1.1668254137039185, + "learning_rate": 3.0629696812645764e-06, + "loss": 1.4609, + "step": 591 + }, + { + "epoch": 0.01380729839332725, + "grad_norm": 1.3739502429962158, + "learning_rate": 3.068152371080591e-06, + "loss": 1.7247, + "step": 592 + }, + { + "epoch": 0.013830621532505168, + "grad_norm": 1.480758547782898, + "learning_rate": 3.0733350608966055e-06, + "loss": 1.6142, + "step": 593 + }, + { + "epoch": 0.013853944671683086, + "grad_norm": 0.853581964969635, + "learning_rate": 3.0785177507126203e-06, + "loss": 1.5563, + "step": 594 + }, + { + "epoch": 0.013877267810861002, + "grad_norm": 1.144692063331604, + "learning_rate": 3.0837004405286347e-06, + "loss": 1.6145, + "step": 595 + }, + { + "epoch": 0.01390059095003892, + "grad_norm": 1.2413440942764282, + "learning_rate": 3.0888831303446494e-06, + "loss": 1.5762, + "step": 596 + }, + { + "epoch": 0.013923914089216838, + "grad_norm": 1.147834062576294, + "learning_rate": 3.0940658201606638e-06, + "loss": 1.4478, + "step": 597 + }, + { + "epoch": 0.013947237228394756, + "grad_norm": 1.0349398851394653, + "learning_rate": 3.0992485099766785e-06, + "loss": 1.612, + "step": 598 + }, + { + "epoch": 0.013970560367572674, + "grad_norm": 1.4780391454696655, + "learning_rate": 3.104431199792693e-06, + "loss": 1.5179, + "step": 599 + }, + { + "epoch": 0.013993883506750592, + "grad_norm": 1.1395933628082275, + "learning_rate": 3.109613889608707e-06, + "loss": 1.4845, + "step": 600 + }, + { + "epoch": 0.01401720664592851, + "grad_norm": 1.37168550491333, + "learning_rate": 3.1147965794247216e-06, + "loss": 1.581, + "step": 601 + }, + { + "epoch": 0.014040529785106426, + "grad_norm": 1.8260347843170166, + "learning_rate": 3.119979269240736e-06, + "loss": 1.1221, + "step": 602 + }, + { + "epoch": 0.014063852924284343, + "grad_norm": 2.5528669357299805, + "learning_rate": 3.1251619590567507e-06, + "loss": 1.255, + "step": 603 + }, + { + "epoch": 0.014087176063462261, + "grad_norm": 1.3272032737731934, + "learning_rate": 3.130344648872765e-06, + "loss": 1.2713, + "step": 604 + }, + { + "epoch": 0.01411049920264018, + "grad_norm": 1.147449254989624, + "learning_rate": 3.13552733868878e-06, + "loss": 1.3694, + "step": 605 + }, + { + "epoch": 0.014133822341818097, + "grad_norm": 1.173793077468872, + "learning_rate": 3.140710028504794e-06, + "loss": 1.5818, + "step": 606 + }, + { + "epoch": 0.014157145480996015, + "grad_norm": 1.2347713708877563, + "learning_rate": 3.145892718320809e-06, + "loss": 1.501, + "step": 607 + }, + { + "epoch": 0.014180468620173933, + "grad_norm": 1.3945446014404297, + "learning_rate": 3.1510754081368233e-06, + "loss": 1.8674, + "step": 608 + }, + { + "epoch": 0.01420379175935185, + "grad_norm": 1.239762544631958, + "learning_rate": 3.156258097952838e-06, + "loss": 1.2516, + "step": 609 + }, + { + "epoch": 0.014227114898529767, + "grad_norm": 1.552531361579895, + "learning_rate": 3.1614407877688524e-06, + "loss": 1.5358, + "step": 610 + }, + { + "epoch": 0.014250438037707685, + "grad_norm": 1.576997995376587, + "learning_rate": 3.166623477584867e-06, + "loss": 1.7601, + "step": 611 + }, + { + "epoch": 0.014273761176885603, + "grad_norm": 1.3251402378082275, + "learning_rate": 3.1718061674008815e-06, + "loss": 1.2758, + "step": 612 + }, + { + "epoch": 0.01429708431606352, + "grad_norm": 1.2837574481964111, + "learning_rate": 3.1769888572168963e-06, + "loss": 1.528, + "step": 613 + }, + { + "epoch": 0.014320407455241439, + "grad_norm": 0.9697505831718445, + "learning_rate": 3.1821715470329102e-06, + "loss": 1.6359, + "step": 614 + }, + { + "epoch": 0.014343730594419356, + "grad_norm": 1.2682685852050781, + "learning_rate": 3.1873542368489246e-06, + "loss": 1.4759, + "step": 615 + }, + { + "epoch": 0.014367053733597274, + "grad_norm": 0.9607746005058289, + "learning_rate": 3.1925369266649393e-06, + "loss": 1.7474, + "step": 616 + }, + { + "epoch": 0.01439037687277519, + "grad_norm": 1.056736946105957, + "learning_rate": 3.1977196164809537e-06, + "loss": 1.8812, + "step": 617 + }, + { + "epoch": 0.014413700011953108, + "grad_norm": 1.1990852355957031, + "learning_rate": 3.2029023062969684e-06, + "loss": 1.6217, + "step": 618 + }, + { + "epoch": 0.014437023151131026, + "grad_norm": 1.1339764595031738, + "learning_rate": 3.208084996112983e-06, + "loss": 1.3557, + "step": 619 + }, + { + "epoch": 0.014460346290308944, + "grad_norm": 1.0672523975372314, + "learning_rate": 3.2132676859289976e-06, + "loss": 1.8239, + "step": 620 + }, + { + "epoch": 0.014483669429486862, + "grad_norm": 1.4371954202651978, + "learning_rate": 3.218450375745012e-06, + "loss": 1.4571, + "step": 621 + }, + { + "epoch": 0.01450699256866478, + "grad_norm": 1.9893105030059814, + "learning_rate": 3.2236330655610267e-06, + "loss": 1.3716, + "step": 622 + }, + { + "epoch": 0.014530315707842698, + "grad_norm": 1.7084318399429321, + "learning_rate": 3.228815755377041e-06, + "loss": 1.5201, + "step": 623 + }, + { + "epoch": 0.014553638847020614, + "grad_norm": 1.308225154876709, + "learning_rate": 3.233998445193056e-06, + "loss": 1.9173, + "step": 624 + }, + { + "epoch": 0.014576961986198532, + "grad_norm": 0.9914215803146362, + "learning_rate": 3.23918113500907e-06, + "loss": 1.7351, + "step": 625 + }, + { + "epoch": 0.01460028512537645, + "grad_norm": 1.0292766094207764, + "learning_rate": 3.244363824825085e-06, + "loss": 1.4073, + "step": 626 + }, + { + "epoch": 0.014623608264554368, + "grad_norm": 1.0998982191085815, + "learning_rate": 3.2495465146410993e-06, + "loss": 1.5979, + "step": 627 + }, + { + "epoch": 0.014646931403732286, + "grad_norm": 1.1409685611724854, + "learning_rate": 3.254729204457113e-06, + "loss": 1.3442, + "step": 628 + }, + { + "epoch": 0.014670254542910204, + "grad_norm": 1.7685736417770386, + "learning_rate": 3.259911894273128e-06, + "loss": 1.251, + "step": 629 + }, + { + "epoch": 0.014693577682088121, + "grad_norm": 1.6536918878555298, + "learning_rate": 3.2650945840891423e-06, + "loss": 1.4698, + "step": 630 + }, + { + "epoch": 0.014716900821266038, + "grad_norm": 2.046391248703003, + "learning_rate": 3.270277273905157e-06, + "loss": 1.5142, + "step": 631 + }, + { + "epoch": 0.014740223960443955, + "grad_norm": 1.3458948135375977, + "learning_rate": 3.2754599637211714e-06, + "loss": 1.3999, + "step": 632 + }, + { + "epoch": 0.014763547099621873, + "grad_norm": 1.7265046834945679, + "learning_rate": 3.280642653537186e-06, + "loss": 1.2212, + "step": 633 + }, + { + "epoch": 0.014786870238799791, + "grad_norm": 1.3191124200820923, + "learning_rate": 3.2858253433532005e-06, + "loss": 1.4354, + "step": 634 + }, + { + "epoch": 0.01481019337797771, + "grad_norm": 1.2317379713058472, + "learning_rate": 3.2910080331692153e-06, + "loss": 1.5661, + "step": 635 + }, + { + "epoch": 0.014833516517155627, + "grad_norm": 1.400969386100769, + "learning_rate": 3.2961907229852297e-06, + "loss": 1.462, + "step": 636 + }, + { + "epoch": 0.014856839656333545, + "grad_norm": 2.060718059539795, + "learning_rate": 3.3013734128012444e-06, + "loss": 1.7522, + "step": 637 + }, + { + "epoch": 0.014880162795511461, + "grad_norm": 1.138715386390686, + "learning_rate": 3.3065561026172588e-06, + "loss": 1.4923, + "step": 638 + }, + { + "epoch": 0.014903485934689379, + "grad_norm": 1.1973599195480347, + "learning_rate": 3.3117387924332735e-06, + "loss": 1.4462, + "step": 639 + }, + { + "epoch": 0.014926809073867297, + "grad_norm": 1.266867756843567, + "learning_rate": 3.316921482249288e-06, + "loss": 1.3159, + "step": 640 + }, + { + "epoch": 0.014950132213045215, + "grad_norm": 3.4681708812713623, + "learning_rate": 3.322104172065302e-06, + "loss": 1.3566, + "step": 641 + }, + { + "epoch": 0.014973455352223133, + "grad_norm": 1.248502492904663, + "learning_rate": 3.3272868618813166e-06, + "loss": 1.6299, + "step": 642 + }, + { + "epoch": 0.01499677849140105, + "grad_norm": 1.561563491821289, + "learning_rate": 3.332469551697331e-06, + "loss": 1.3246, + "step": 643 + }, + { + "epoch": 0.015020101630578968, + "grad_norm": 1.1922053098678589, + "learning_rate": 3.3376522415133457e-06, + "loss": 1.6847, + "step": 644 + }, + { + "epoch": 0.015043424769756885, + "grad_norm": 1.0779014825820923, + "learning_rate": 3.34283493132936e-06, + "loss": 1.8025, + "step": 645 + }, + { + "epoch": 0.015066747908934803, + "grad_norm": 1.5236597061157227, + "learning_rate": 3.348017621145375e-06, + "loss": 1.3894, + "step": 646 + }, + { + "epoch": 0.01509007104811272, + "grad_norm": 1.2087934017181396, + "learning_rate": 3.353200310961389e-06, + "loss": 1.9119, + "step": 647 + }, + { + "epoch": 0.015113394187290638, + "grad_norm": 1.435085654258728, + "learning_rate": 3.358383000777404e-06, + "loss": 1.4334, + "step": 648 + }, + { + "epoch": 0.015136717326468556, + "grad_norm": 1.3662467002868652, + "learning_rate": 3.3635656905934183e-06, + "loss": 1.6717, + "step": 649 + }, + { + "epoch": 0.015160040465646474, + "grad_norm": 1.379262924194336, + "learning_rate": 3.368748380409433e-06, + "loss": 1.0914, + "step": 650 + }, + { + "epoch": 0.015183363604824392, + "grad_norm": 1.436503529548645, + "learning_rate": 3.3739310702254474e-06, + "loss": 1.296, + "step": 651 + }, + { + "epoch": 0.015206686744002308, + "grad_norm": 1.0189919471740723, + "learning_rate": 3.379113760041462e-06, + "loss": 1.5578, + "step": 652 + }, + { + "epoch": 0.015230009883180226, + "grad_norm": 1.3371915817260742, + "learning_rate": 3.3842964498574765e-06, + "loss": 1.3883, + "step": 653 + }, + { + "epoch": 0.015253333022358144, + "grad_norm": 1.152949333190918, + "learning_rate": 3.389479139673491e-06, + "loss": 1.3408, + "step": 654 + }, + { + "epoch": 0.015276656161536062, + "grad_norm": 0.865856945514679, + "learning_rate": 3.3946618294895052e-06, + "loss": 1.8154, + "step": 655 + }, + { + "epoch": 0.01529997930071398, + "grad_norm": 1.3607538938522339, + "learning_rate": 3.3998445193055196e-06, + "loss": 1.5139, + "step": 656 + }, + { + "epoch": 0.015323302439891898, + "grad_norm": 1.0469399690628052, + "learning_rate": 3.4050272091215343e-06, + "loss": 1.4246, + "step": 657 + }, + { + "epoch": 0.015346625579069816, + "grad_norm": 1.2417982816696167, + "learning_rate": 3.4102098989375487e-06, + "loss": 1.4392, + "step": 658 + }, + { + "epoch": 0.015369948718247732, + "grad_norm": 2.018418073654175, + "learning_rate": 3.4153925887535634e-06, + "loss": 1.5175, + "step": 659 + }, + { + "epoch": 0.01539327185742565, + "grad_norm": 1.2593055963516235, + "learning_rate": 3.420575278569578e-06, + "loss": 1.6338, + "step": 660 + }, + { + "epoch": 0.015416594996603568, + "grad_norm": 1.0297298431396484, + "learning_rate": 3.4257579683855926e-06, + "loss": 1.6309, + "step": 661 + }, + { + "epoch": 0.015439918135781485, + "grad_norm": 1.2963732481002808, + "learning_rate": 3.430940658201607e-06, + "loss": 1.3099, + "step": 662 + }, + { + "epoch": 0.015463241274959403, + "grad_norm": 1.0868266820907593, + "learning_rate": 3.4361233480176217e-06, + "loss": 1.4949, + "step": 663 + }, + { + "epoch": 0.015486564414137321, + "grad_norm": 1.156296968460083, + "learning_rate": 3.441306037833636e-06, + "loss": 1.7845, + "step": 664 + }, + { + "epoch": 0.015509887553315239, + "grad_norm": 1.412965178489685, + "learning_rate": 3.446488727649651e-06, + "loss": 1.19, + "step": 665 + }, + { + "epoch": 0.015533210692493155, + "grad_norm": 1.0419931411743164, + "learning_rate": 3.451671417465665e-06, + "loss": 1.7125, + "step": 666 + }, + { + "epoch": 0.015556533831671073, + "grad_norm": 1.035372018814087, + "learning_rate": 3.4568541072816795e-06, + "loss": 1.7003, + "step": 667 + }, + { + "epoch": 0.015579856970848991, + "grad_norm": 1.1559805870056152, + "learning_rate": 3.4620367970976943e-06, + "loss": 1.981, + "step": 668 + }, + { + "epoch": 0.015603180110026909, + "grad_norm": 0.8634515404701233, + "learning_rate": 3.467219486913708e-06, + "loss": 1.2609, + "step": 669 + }, + { + "epoch": 0.015626503249204827, + "grad_norm": 1.1953692436218262, + "learning_rate": 3.472402176729723e-06, + "loss": 1.3956, + "step": 670 + }, + { + "epoch": 0.015649826388382745, + "grad_norm": 0.9668301939964294, + "learning_rate": 3.4775848665457373e-06, + "loss": 1.0568, + "step": 671 + }, + { + "epoch": 0.015673149527560663, + "grad_norm": 2.4868035316467285, + "learning_rate": 3.482767556361752e-06, + "loss": 1.364, + "step": 672 + }, + { + "epoch": 0.01569647266673858, + "grad_norm": 1.4255839586257935, + "learning_rate": 3.4879502461777664e-06, + "loss": 1.5207, + "step": 673 + }, + { + "epoch": 0.0157197958059165, + "grad_norm": 1.2752389907836914, + "learning_rate": 3.493132935993781e-06, + "loss": 1.5141, + "step": 674 + }, + { + "epoch": 0.015743118945094416, + "grad_norm": 1.2186245918273926, + "learning_rate": 3.4983156258097955e-06, + "loss": 1.3655, + "step": 675 + }, + { + "epoch": 0.015766442084272334, + "grad_norm": 1.3544304370880127, + "learning_rate": 3.5034983156258103e-06, + "loss": 1.7428, + "step": 676 + }, + { + "epoch": 0.01578976522345025, + "grad_norm": 1.0968130826950073, + "learning_rate": 3.5086810054418247e-06, + "loss": 1.3491, + "step": 677 + }, + { + "epoch": 0.015813088362628167, + "grad_norm": 1.1593806743621826, + "learning_rate": 3.513863695257839e-06, + "loss": 1.6708, + "step": 678 + }, + { + "epoch": 0.015836411501806084, + "grad_norm": 1.0408954620361328, + "learning_rate": 3.5190463850738538e-06, + "loss": 1.6977, + "step": 679 + }, + { + "epoch": 0.015859734640984002, + "grad_norm": 1.196632742881775, + "learning_rate": 3.524229074889868e-06, + "loss": 1.2019, + "step": 680 + }, + { + "epoch": 0.01588305778016192, + "grad_norm": 1.2698166370391846, + "learning_rate": 3.529411764705883e-06, + "loss": 1.8457, + "step": 681 + }, + { + "epoch": 0.015906380919339838, + "grad_norm": 0.9075011014938354, + "learning_rate": 3.5345944545218972e-06, + "loss": 1.2717, + "step": 682 + }, + { + "epoch": 0.015929704058517756, + "grad_norm": 1.0426501035690308, + "learning_rate": 3.5397771443379116e-06, + "loss": 1.6601, + "step": 683 + }, + { + "epoch": 0.015953027197695674, + "grad_norm": 1.4904205799102783, + "learning_rate": 3.544959834153926e-06, + "loss": 1.6324, + "step": 684 + }, + { + "epoch": 0.015976350336873592, + "grad_norm": 1.0664643049240112, + "learning_rate": 3.5501425239699407e-06, + "loss": 1.4896, + "step": 685 + }, + { + "epoch": 0.01599967347605151, + "grad_norm": 1.3758978843688965, + "learning_rate": 3.555325213785955e-06, + "loss": 1.5457, + "step": 686 + }, + { + "epoch": 0.016022996615229428, + "grad_norm": 1.4759879112243652, + "learning_rate": 3.56050790360197e-06, + "loss": 1.3865, + "step": 687 + }, + { + "epoch": 0.016046319754407345, + "grad_norm": 1.4678733348846436, + "learning_rate": 3.565690593417984e-06, + "loss": 1.223, + "step": 688 + }, + { + "epoch": 0.016069642893585263, + "grad_norm": 1.2057251930236816, + "learning_rate": 3.570873283233999e-06, + "loss": 1.4864, + "step": 689 + }, + { + "epoch": 0.01609296603276318, + "grad_norm": 1.3976320028305054, + "learning_rate": 3.5760559730500133e-06, + "loss": 1.3371, + "step": 690 + }, + { + "epoch": 0.016116289171941096, + "grad_norm": 1.0588197708129883, + "learning_rate": 3.5812386628660276e-06, + "loss": 1.264, + "step": 691 + }, + { + "epoch": 0.016139612311119014, + "grad_norm": 0.891678512096405, + "learning_rate": 3.5864213526820424e-06, + "loss": 1.6566, + "step": 692 + }, + { + "epoch": 0.01616293545029693, + "grad_norm": 1.1149228811264038, + "learning_rate": 3.5916040424980567e-06, + "loss": 1.6862, + "step": 693 + }, + { + "epoch": 0.01618625858947485, + "grad_norm": 1.463218331336975, + "learning_rate": 3.5967867323140715e-06, + "loss": 1.5771, + "step": 694 + }, + { + "epoch": 0.016209581728652767, + "grad_norm": 1.291648030281067, + "learning_rate": 3.601969422130086e-06, + "loss": 1.443, + "step": 695 + }, + { + "epoch": 0.016232904867830685, + "grad_norm": 1.1534149646759033, + "learning_rate": 3.6071521119461002e-06, + "loss": 1.76, + "step": 696 + }, + { + "epoch": 0.016256228007008603, + "grad_norm": 1.3349847793579102, + "learning_rate": 3.6123348017621146e-06, + "loss": 2.0584, + "step": 697 + }, + { + "epoch": 0.01627955114618652, + "grad_norm": 1.665682315826416, + "learning_rate": 3.6175174915781293e-06, + "loss": 1.5989, + "step": 698 + }, + { + "epoch": 0.01630287428536444, + "grad_norm": 1.6486263275146484, + "learning_rate": 3.6227001813941437e-06, + "loss": 1.7698, + "step": 699 + }, + { + "epoch": 0.016326197424542357, + "grad_norm": 1.5153722763061523, + "learning_rate": 3.6278828712101584e-06, + "loss": 1.3312, + "step": 700 + }, + { + "epoch": 0.016349520563720275, + "grad_norm": 1.3090248107910156, + "learning_rate": 3.633065561026173e-06, + "loss": 1.0735, + "step": 701 + }, + { + "epoch": 0.016372843702898193, + "grad_norm": 1.5462753772735596, + "learning_rate": 3.6382482508421876e-06, + "loss": 1.5408, + "step": 702 + }, + { + "epoch": 0.01639616684207611, + "grad_norm": 1.3447730541229248, + "learning_rate": 3.643430940658202e-06, + "loss": 1.5295, + "step": 703 + }, + { + "epoch": 0.01641948998125403, + "grad_norm": 1.232865571975708, + "learning_rate": 3.6486136304742163e-06, + "loss": 1.8686, + "step": 704 + }, + { + "epoch": 0.016442813120431946, + "grad_norm": 0.9742329120635986, + "learning_rate": 3.653796320290231e-06, + "loss": 1.5951, + "step": 705 + }, + { + "epoch": 0.01646613625960986, + "grad_norm": 1.1572047472000122, + "learning_rate": 3.6589790101062454e-06, + "loss": 1.5068, + "step": 706 + }, + { + "epoch": 0.01648945939878778, + "grad_norm": 1.2024304866790771, + "learning_rate": 3.66416169992226e-06, + "loss": 1.3933, + "step": 707 + }, + { + "epoch": 0.016512782537965696, + "grad_norm": 2.442342758178711, + "learning_rate": 3.6693443897382745e-06, + "loss": 1.0126, + "step": 708 + }, + { + "epoch": 0.016536105677143614, + "grad_norm": 1.2786589860916138, + "learning_rate": 3.6745270795542893e-06, + "loss": 1.6902, + "step": 709 + }, + { + "epoch": 0.016559428816321532, + "grad_norm": 0.9200882315635681, + "learning_rate": 3.679709769370303e-06, + "loss": 1.3918, + "step": 710 + }, + { + "epoch": 0.01658275195549945, + "grad_norm": 1.3768819570541382, + "learning_rate": 3.684892459186318e-06, + "loss": 1.6518, + "step": 711 + }, + { + "epoch": 0.016606075094677368, + "grad_norm": 1.274484395980835, + "learning_rate": 3.6900751490023323e-06, + "loss": 1.3728, + "step": 712 + }, + { + "epoch": 0.016629398233855286, + "grad_norm": 1.1752501726150513, + "learning_rate": 3.695257838818347e-06, + "loss": 1.4234, + "step": 713 + }, + { + "epoch": 0.016652721373033204, + "grad_norm": 1.4458903074264526, + "learning_rate": 3.7004405286343614e-06, + "loss": 1.5695, + "step": 714 + }, + { + "epoch": 0.01667604451221112, + "grad_norm": 1.2630547285079956, + "learning_rate": 3.705623218450376e-06, + "loss": 1.5334, + "step": 715 + }, + { + "epoch": 0.01669936765138904, + "grad_norm": 1.3754082918167114, + "learning_rate": 3.7108059082663905e-06, + "loss": 1.4807, + "step": 716 + }, + { + "epoch": 0.016722690790566958, + "grad_norm": 1.4704689979553223, + "learning_rate": 3.715988598082405e-06, + "loss": 1.5409, + "step": 717 + }, + { + "epoch": 0.016746013929744875, + "grad_norm": 1.4692633152008057, + "learning_rate": 3.7211712878984197e-06, + "loss": 1.5922, + "step": 718 + }, + { + "epoch": 0.016769337068922793, + "grad_norm": 1.2148405313491821, + "learning_rate": 3.726353977714434e-06, + "loss": 1.8115, + "step": 719 + }, + { + "epoch": 0.016792660208100708, + "grad_norm": 1.5564905405044556, + "learning_rate": 3.7315366675304488e-06, + "loss": 1.4189, + "step": 720 + }, + { + "epoch": 0.016815983347278626, + "grad_norm": 1.130292296409607, + "learning_rate": 3.736719357346463e-06, + "loss": 1.4455, + "step": 721 + }, + { + "epoch": 0.016839306486456544, + "grad_norm": 2.0609545707702637, + "learning_rate": 3.741902047162478e-06, + "loss": 1.6052, + "step": 722 + }, + { + "epoch": 0.01686262962563446, + "grad_norm": 1.0422543287277222, + "learning_rate": 3.7470847369784922e-06, + "loss": 1.5889, + "step": 723 + }, + { + "epoch": 0.01688595276481238, + "grad_norm": 1.7926782369613647, + "learning_rate": 3.7522674267945066e-06, + "loss": 1.2304, + "step": 724 + }, + { + "epoch": 0.016909275903990297, + "grad_norm": 1.2486250400543213, + "learning_rate": 3.757450116610521e-06, + "loss": 1.7512, + "step": 725 + }, + { + "epoch": 0.016932599043168215, + "grad_norm": 1.6907048225402832, + "learning_rate": 3.7626328064265357e-06, + "loss": 1.2031, + "step": 726 + }, + { + "epoch": 0.016955922182346133, + "grad_norm": 1.2899296283721924, + "learning_rate": 3.76781549624255e-06, + "loss": 1.3111, + "step": 727 + }, + { + "epoch": 0.01697924532152405, + "grad_norm": 2.320288896560669, + "learning_rate": 3.7729981860585644e-06, + "loss": 1.2764, + "step": 728 + }, + { + "epoch": 0.01700256846070197, + "grad_norm": 1.4165383577346802, + "learning_rate": 3.778180875874579e-06, + "loss": 1.2847, + "step": 729 + }, + { + "epoch": 0.017025891599879887, + "grad_norm": 1.1537601947784424, + "learning_rate": 3.7833635656905935e-06, + "loss": 1.6002, + "step": 730 + }, + { + "epoch": 0.017049214739057805, + "grad_norm": 1.3128899335861206, + "learning_rate": 3.7885462555066083e-06, + "loss": 1.4159, + "step": 731 + }, + { + "epoch": 0.017072537878235722, + "grad_norm": 0.9494642615318298, + "learning_rate": 3.7937289453226226e-06, + "loss": 1.5425, + "step": 732 + }, + { + "epoch": 0.01709586101741364, + "grad_norm": 1.8949923515319824, + "learning_rate": 3.7989116351386374e-06, + "loss": 1.109, + "step": 733 + }, + { + "epoch": 0.017119184156591555, + "grad_norm": 1.3136776685714722, + "learning_rate": 3.8040943249546517e-06, + "loss": 1.4208, + "step": 734 + }, + { + "epoch": 0.017142507295769473, + "grad_norm": 1.0108048915863037, + "learning_rate": 3.8092770147706665e-06, + "loss": 1.3101, + "step": 735 + }, + { + "epoch": 0.01716583043494739, + "grad_norm": 1.1397989988327026, + "learning_rate": 3.814459704586681e-06, + "loss": 1.6643, + "step": 736 + }, + { + "epoch": 0.01718915357412531, + "grad_norm": 0.9662717580795288, + "learning_rate": 3.819642394402696e-06, + "loss": 1.5524, + "step": 737 + }, + { + "epoch": 0.017212476713303226, + "grad_norm": 1.5264514684677124, + "learning_rate": 3.82482508421871e-06, + "loss": 1.6702, + "step": 738 + }, + { + "epoch": 0.017235799852481144, + "grad_norm": 1.1797709465026855, + "learning_rate": 3.830007774034724e-06, + "loss": 1.5751, + "step": 739 + }, + { + "epoch": 0.017259122991659062, + "grad_norm": 1.3964486122131348, + "learning_rate": 3.835190463850739e-06, + "loss": 1.3497, + "step": 740 + }, + { + "epoch": 0.01728244613083698, + "grad_norm": 1.0540798902511597, + "learning_rate": 3.840373153666753e-06, + "loss": 1.623, + "step": 741 + }, + { + "epoch": 0.017305769270014898, + "grad_norm": 1.8619107007980347, + "learning_rate": 3.845555843482767e-06, + "loss": 1.836, + "step": 742 + }, + { + "epoch": 0.017329092409192816, + "grad_norm": 1.190048098564148, + "learning_rate": 3.8507385332987826e-06, + "loss": 1.6031, + "step": 743 + }, + { + "epoch": 0.017352415548370734, + "grad_norm": 1.32784903049469, + "learning_rate": 3.855921223114797e-06, + "loss": 1.6144, + "step": 744 + }, + { + "epoch": 0.01737573868754865, + "grad_norm": 1.7393810749053955, + "learning_rate": 3.861103912930811e-06, + "loss": 1.4898, + "step": 745 + }, + { + "epoch": 0.01739906182672657, + "grad_norm": 1.008122444152832, + "learning_rate": 3.866286602746826e-06, + "loss": 1.6506, + "step": 746 + }, + { + "epoch": 0.017422384965904487, + "grad_norm": 1.3282239437103271, + "learning_rate": 3.871469292562841e-06, + "loss": 1.5178, + "step": 747 + }, + { + "epoch": 0.017445708105082402, + "grad_norm": 1.4479358196258545, + "learning_rate": 3.876651982378855e-06, + "loss": 1.5896, + "step": 748 + }, + { + "epoch": 0.01746903124426032, + "grad_norm": 1.9100661277770996, + "learning_rate": 3.8818346721948695e-06, + "loss": 1.2946, + "step": 749 + }, + { + "epoch": 0.017492354383438238, + "grad_norm": 1.269235610961914, + "learning_rate": 3.887017362010884e-06, + "loss": 1.5707, + "step": 750 + }, + { + "epoch": 0.017515677522616156, + "grad_norm": 1.3187369108200073, + "learning_rate": 3.892200051826899e-06, + "loss": 1.8153, + "step": 751 + }, + { + "epoch": 0.017539000661794073, + "grad_norm": 1.3091131448745728, + "learning_rate": 3.8973827416429125e-06, + "loss": 1.5973, + "step": 752 + }, + { + "epoch": 0.01756232380097199, + "grad_norm": 1.4826890230178833, + "learning_rate": 3.902565431458927e-06, + "loss": 1.3277, + "step": 753 + }, + { + "epoch": 0.01758564694014991, + "grad_norm": 1.2626949548721313, + "learning_rate": 3.907748121274942e-06, + "loss": 1.5531, + "step": 754 + }, + { + "epoch": 0.017608970079327827, + "grad_norm": 1.1990412473678589, + "learning_rate": 3.912930811090956e-06, + "loss": 1.349, + "step": 755 + }, + { + "epoch": 0.017632293218505745, + "grad_norm": 1.3036906719207764, + "learning_rate": 3.918113500906971e-06, + "loss": 1.5648, + "step": 756 + }, + { + "epoch": 0.017655616357683663, + "grad_norm": 1.3129525184631348, + "learning_rate": 3.923296190722985e-06, + "loss": 1.7147, + "step": 757 + }, + { + "epoch": 0.01767893949686158, + "grad_norm": 1.4686280488967896, + "learning_rate": 3.928478880539e-06, + "loss": 1.6136, + "step": 758 + }, + { + "epoch": 0.0177022626360395, + "grad_norm": 1.6845604181289673, + "learning_rate": 3.933661570355015e-06, + "loss": 1.763, + "step": 759 + }, + { + "epoch": 0.017725585775217417, + "grad_norm": 2.019049644470215, + "learning_rate": 3.938844260171029e-06, + "loss": 1.2543, + "step": 760 + }, + { + "epoch": 0.017748908914395334, + "grad_norm": 1.4184072017669678, + "learning_rate": 3.944026949987043e-06, + "loss": 1.596, + "step": 761 + }, + { + "epoch": 0.017772232053573252, + "grad_norm": 1.127982497215271, + "learning_rate": 3.9492096398030585e-06, + "loss": 1.5485, + "step": 762 + }, + { + "epoch": 0.017795555192751167, + "grad_norm": 1.5097321271896362, + "learning_rate": 3.954392329619073e-06, + "loss": 1.5452, + "step": 763 + }, + { + "epoch": 0.017818878331929085, + "grad_norm": 1.3832807540893555, + "learning_rate": 3.959575019435087e-06, + "loss": 1.3865, + "step": 764 + }, + { + "epoch": 0.017842201471107003, + "grad_norm": 1.065623164176941, + "learning_rate": 3.964757709251102e-06, + "loss": 1.2218, + "step": 765 + }, + { + "epoch": 0.01786552461028492, + "grad_norm": 1.2190065383911133, + "learning_rate": 3.969940399067116e-06, + "loss": 1.2169, + "step": 766 + }, + { + "epoch": 0.01788884774946284, + "grad_norm": 1.741749882698059, + "learning_rate": 3.97512308888313e-06, + "loss": 1.7316, + "step": 767 + }, + { + "epoch": 0.017912170888640756, + "grad_norm": 1.2072060108184814, + "learning_rate": 3.980305778699145e-06, + "loss": 1.815, + "step": 768 + }, + { + "epoch": 0.017935494027818674, + "grad_norm": 1.4645625352859497, + "learning_rate": 3.98548846851516e-06, + "loss": 1.2218, + "step": 769 + }, + { + "epoch": 0.017958817166996592, + "grad_norm": 1.4466350078582764, + "learning_rate": 3.990671158331174e-06, + "loss": 1.7291, + "step": 770 + }, + { + "epoch": 0.01798214030617451, + "grad_norm": 1.364358901977539, + "learning_rate": 3.9958538481471885e-06, + "loss": 1.6527, + "step": 771 + }, + { + "epoch": 0.018005463445352428, + "grad_norm": 1.2262394428253174, + "learning_rate": 4.001036537963203e-06, + "loss": 1.5522, + "step": 772 + }, + { + "epoch": 0.018028786584530346, + "grad_norm": 1.694001317024231, + "learning_rate": 4.006219227779218e-06, + "loss": 1.5791, + "step": 773 + }, + { + "epoch": 0.018052109723708264, + "grad_norm": 0.7941157817840576, + "learning_rate": 4.011401917595232e-06, + "loss": 1.23, + "step": 774 + }, + { + "epoch": 0.01807543286288618, + "grad_norm": 1.1942747831344604, + "learning_rate": 4.016584607411247e-06, + "loss": 1.4316, + "step": 775 + }, + { + "epoch": 0.0180987560020641, + "grad_norm": 1.5809072256088257, + "learning_rate": 4.021767297227261e-06, + "loss": 1.7361, + "step": 776 + }, + { + "epoch": 0.018122079141242014, + "grad_norm": 1.2918401956558228, + "learning_rate": 4.026949987043276e-06, + "loss": 1.3285, + "step": 777 + }, + { + "epoch": 0.018145402280419932, + "grad_norm": 1.966123342514038, + "learning_rate": 4.032132676859291e-06, + "loss": 1.2037, + "step": 778 + }, + { + "epoch": 0.01816872541959785, + "grad_norm": 1.3362590074539185, + "learning_rate": 4.037315366675304e-06, + "loss": 1.3811, + "step": 779 + }, + { + "epoch": 0.018192048558775768, + "grad_norm": 1.0375605821609497, + "learning_rate": 4.042498056491319e-06, + "loss": 1.481, + "step": 780 + }, + { + "epoch": 0.018215371697953685, + "grad_norm": 2.414684295654297, + "learning_rate": 4.047680746307334e-06, + "loss": 1.773, + "step": 781 + }, + { + "epoch": 0.018238694837131603, + "grad_norm": 1.2252676486968994, + "learning_rate": 4.052863436123348e-06, + "loss": 1.514, + "step": 782 + }, + { + "epoch": 0.01826201797630952, + "grad_norm": 1.517791748046875, + "learning_rate": 4.058046125939362e-06, + "loss": 1.3442, + "step": 783 + }, + { + "epoch": 0.01828534111548744, + "grad_norm": 1.0303611755371094, + "learning_rate": 4.0632288157553776e-06, + "loss": 1.5593, + "step": 784 + }, + { + "epoch": 0.018308664254665357, + "grad_norm": 1.3615033626556396, + "learning_rate": 4.068411505571392e-06, + "loss": 1.6971, + "step": 785 + }, + { + "epoch": 0.018331987393843275, + "grad_norm": 1.1224147081375122, + "learning_rate": 4.073594195387406e-06, + "loss": 1.2134, + "step": 786 + }, + { + "epoch": 0.018355310533021193, + "grad_norm": 1.3592679500579834, + "learning_rate": 4.078776885203421e-06, + "loss": 1.7391, + "step": 787 + }, + { + "epoch": 0.01837863367219911, + "grad_norm": 1.6286187171936035, + "learning_rate": 4.083959575019436e-06, + "loss": 1.7279, + "step": 788 + }, + { + "epoch": 0.01840195681137703, + "grad_norm": 1.2597742080688477, + "learning_rate": 4.08914226483545e-06, + "loss": 1.5227, + "step": 789 + }, + { + "epoch": 0.018425279950554947, + "grad_norm": 1.2776849269866943, + "learning_rate": 4.0943249546514645e-06, + "loss": 1.3575, + "step": 790 + }, + { + "epoch": 0.01844860308973286, + "grad_norm": 1.2529163360595703, + "learning_rate": 4.099507644467479e-06, + "loss": 1.6356, + "step": 791 + }, + { + "epoch": 0.01847192622891078, + "grad_norm": 1.184187650680542, + "learning_rate": 4.104690334283494e-06, + "loss": 1.734, + "step": 792 + }, + { + "epoch": 0.018495249368088697, + "grad_norm": 1.176222562789917, + "learning_rate": 4.1098730240995075e-06, + "loss": 1.5206, + "step": 793 + }, + { + "epoch": 0.018518572507266615, + "grad_norm": 1.0694701671600342, + "learning_rate": 4.115055713915522e-06, + "loss": 1.1824, + "step": 794 + }, + { + "epoch": 0.018541895646444533, + "grad_norm": 1.5169551372528076, + "learning_rate": 4.120238403731537e-06, + "loss": 1.3817, + "step": 795 + }, + { + "epoch": 0.01856521878562245, + "grad_norm": 1.0996246337890625, + "learning_rate": 4.125421093547551e-06, + "loss": 1.0921, + "step": 796 + }, + { + "epoch": 0.01858854192480037, + "grad_norm": 1.0202140808105469, + "learning_rate": 4.130603783363566e-06, + "loss": 1.2687, + "step": 797 + }, + { + "epoch": 0.018611865063978286, + "grad_norm": 2.089864730834961, + "learning_rate": 4.13578647317958e-06, + "loss": 1.5417, + "step": 798 + }, + { + "epoch": 0.018635188203156204, + "grad_norm": 1.1465847492218018, + "learning_rate": 4.140969162995595e-06, + "loss": 1.3415, + "step": 799 + }, + { + "epoch": 0.018658511342334122, + "grad_norm": 1.1085565090179443, + "learning_rate": 4.14615185281161e-06, + "loss": 1.4662, + "step": 800 + }, + { + "epoch": 0.01868183448151204, + "grad_norm": 1.2206768989562988, + "learning_rate": 4.151334542627624e-06, + "loss": 1.4954, + "step": 801 + }, + { + "epoch": 0.018705157620689958, + "grad_norm": 1.1540756225585938, + "learning_rate": 4.156517232443638e-06, + "loss": 1.4953, + "step": 802 + }, + { + "epoch": 0.018728480759867876, + "grad_norm": 1.9667025804519653, + "learning_rate": 4.1616999222596535e-06, + "loss": 1.1834, + "step": 803 + }, + { + "epoch": 0.018751803899045794, + "grad_norm": 1.2202988862991333, + "learning_rate": 4.166882612075668e-06, + "loss": 1.7045, + "step": 804 + }, + { + "epoch": 0.018775127038223708, + "grad_norm": 1.2399123907089233, + "learning_rate": 4.172065301891682e-06, + "loss": 1.4937, + "step": 805 + }, + { + "epoch": 0.018798450177401626, + "grad_norm": 1.5780203342437744, + "learning_rate": 4.177247991707697e-06, + "loss": 1.6386, + "step": 806 + }, + { + "epoch": 0.018821773316579544, + "grad_norm": 1.524564266204834, + "learning_rate": 4.182430681523711e-06, + "loss": 1.4951, + "step": 807 + }, + { + "epoch": 0.01884509645575746, + "grad_norm": 1.342991590499878, + "learning_rate": 4.187613371339725e-06, + "loss": 1.3007, + "step": 808 + }, + { + "epoch": 0.01886841959493538, + "grad_norm": 1.320813775062561, + "learning_rate": 4.19279606115574e-06, + "loss": 1.2112, + "step": 809 + }, + { + "epoch": 0.018891742734113297, + "grad_norm": 1.2329927682876587, + "learning_rate": 4.197978750971755e-06, + "loss": 1.333, + "step": 810 + }, + { + "epoch": 0.018915065873291215, + "grad_norm": 1.3429094552993774, + "learning_rate": 4.203161440787769e-06, + "loss": 1.4805, + "step": 811 + }, + { + "epoch": 0.018938389012469133, + "grad_norm": 1.643641710281372, + "learning_rate": 4.2083441306037835e-06, + "loss": 1.5665, + "step": 812 + }, + { + "epoch": 0.01896171215164705, + "grad_norm": 1.111887812614441, + "learning_rate": 4.213526820419798e-06, + "loss": 1.6087, + "step": 813 + }, + { + "epoch": 0.01898503529082497, + "grad_norm": 1.3594610691070557, + "learning_rate": 4.218709510235813e-06, + "loss": 1.7666, + "step": 814 + }, + { + "epoch": 0.019008358430002887, + "grad_norm": 1.2298046350479126, + "learning_rate": 4.223892200051827e-06, + "loss": 1.5032, + "step": 815 + }, + { + "epoch": 0.019031681569180805, + "grad_norm": 1.2679171562194824, + "learning_rate": 4.229074889867842e-06, + "loss": 1.4375, + "step": 816 + }, + { + "epoch": 0.019055004708358723, + "grad_norm": 1.0543935298919678, + "learning_rate": 4.234257579683856e-06, + "loss": 1.6645, + "step": 817 + }, + { + "epoch": 0.01907832784753664, + "grad_norm": 1.2821168899536133, + "learning_rate": 4.239440269499871e-06, + "loss": 1.1945, + "step": 818 + }, + { + "epoch": 0.01910165098671456, + "grad_norm": 1.5575084686279297, + "learning_rate": 4.244622959315886e-06, + "loss": 1.3262, + "step": 819 + }, + { + "epoch": 0.019124974125892473, + "grad_norm": 1.2359989881515503, + "learning_rate": 4.2498056491319e-06, + "loss": 1.4127, + "step": 820 + }, + { + "epoch": 0.01914829726507039, + "grad_norm": 1.0559273958206177, + "learning_rate": 4.254988338947914e-06, + "loss": 1.4455, + "step": 821 + }, + { + "epoch": 0.01917162040424831, + "grad_norm": 1.3651732206344604, + "learning_rate": 4.260171028763929e-06, + "loss": 1.245, + "step": 822 + }, + { + "epoch": 0.019194943543426227, + "grad_norm": 1.0067932605743408, + "learning_rate": 4.265353718579943e-06, + "loss": 1.4954, + "step": 823 + }, + { + "epoch": 0.019218266682604145, + "grad_norm": 1.7477822303771973, + "learning_rate": 4.270536408395957e-06, + "loss": 1.8164, + "step": 824 + }, + { + "epoch": 0.019241589821782062, + "grad_norm": 1.1976604461669922, + "learning_rate": 4.2757190982119726e-06, + "loss": 1.4552, + "step": 825 + }, + { + "epoch": 0.01926491296095998, + "grad_norm": 1.306269884109497, + "learning_rate": 4.280901788027987e-06, + "loss": 1.6348, + "step": 826 + }, + { + "epoch": 0.019288236100137898, + "grad_norm": 1.5786314010620117, + "learning_rate": 4.286084477844001e-06, + "loss": 1.4592, + "step": 827 + }, + { + "epoch": 0.019311559239315816, + "grad_norm": 1.4481762647628784, + "learning_rate": 4.291267167660016e-06, + "loss": 1.3409, + "step": 828 + }, + { + "epoch": 0.019334882378493734, + "grad_norm": 1.1410714387893677, + "learning_rate": 4.296449857476031e-06, + "loss": 1.5746, + "step": 829 + }, + { + "epoch": 0.019358205517671652, + "grad_norm": 1.363434076309204, + "learning_rate": 4.301632547292045e-06, + "loss": 1.0836, + "step": 830 + }, + { + "epoch": 0.01938152865684957, + "grad_norm": 1.1413646936416626, + "learning_rate": 4.3068152371080595e-06, + "loss": 1.8687, + "step": 831 + }, + { + "epoch": 0.019404851796027488, + "grad_norm": 1.9734309911727905, + "learning_rate": 4.311997926924074e-06, + "loss": 1.3295, + "step": 832 + }, + { + "epoch": 0.019428174935205406, + "grad_norm": 1.5119333267211914, + "learning_rate": 4.317180616740089e-06, + "loss": 1.6817, + "step": 833 + }, + { + "epoch": 0.01945149807438332, + "grad_norm": 1.3933395147323608, + "learning_rate": 4.3223633065561025e-06, + "loss": 1.5288, + "step": 834 + }, + { + "epoch": 0.019474821213561238, + "grad_norm": 1.3713746070861816, + "learning_rate": 4.327545996372117e-06, + "loss": 1.6361, + "step": 835 + }, + { + "epoch": 0.019498144352739156, + "grad_norm": 1.1849229335784912, + "learning_rate": 4.332728686188132e-06, + "loss": 1.6611, + "step": 836 + }, + { + "epoch": 0.019521467491917074, + "grad_norm": 2.122307777404785, + "learning_rate": 4.337911376004146e-06, + "loss": 1.6258, + "step": 837 + }, + { + "epoch": 0.01954479063109499, + "grad_norm": 1.221781611442566, + "learning_rate": 4.343094065820161e-06, + "loss": 1.9081, + "step": 838 + }, + { + "epoch": 0.01956811377027291, + "grad_norm": 1.2895511388778687, + "learning_rate": 4.348276755636175e-06, + "loss": 1.2742, + "step": 839 + }, + { + "epoch": 0.019591436909450827, + "grad_norm": 1.1531336307525635, + "learning_rate": 4.35345944545219e-06, + "loss": 1.587, + "step": 840 + }, + { + "epoch": 0.019614760048628745, + "grad_norm": 1.3979135751724243, + "learning_rate": 4.358642135268205e-06, + "loss": 1.5208, + "step": 841 + }, + { + "epoch": 0.019638083187806663, + "grad_norm": 1.3758100271224976, + "learning_rate": 4.363824825084219e-06, + "loss": 1.246, + "step": 842 + }, + { + "epoch": 0.01966140632698458, + "grad_norm": 1.3759677410125732, + "learning_rate": 4.369007514900233e-06, + "loss": 1.7344, + "step": 843 + }, + { + "epoch": 0.0196847294661625, + "grad_norm": 1.5575461387634277, + "learning_rate": 4.3741902047162485e-06, + "loss": 1.5554, + "step": 844 + }, + { + "epoch": 0.019708052605340417, + "grad_norm": 1.5018088817596436, + "learning_rate": 4.379372894532263e-06, + "loss": 1.3433, + "step": 845 + }, + { + "epoch": 0.019731375744518335, + "grad_norm": 1.4393954277038574, + "learning_rate": 4.384555584348277e-06, + "loss": 1.7277, + "step": 846 + }, + { + "epoch": 0.019754698883696253, + "grad_norm": 1.0249360799789429, + "learning_rate": 4.389738274164292e-06, + "loss": 1.6538, + "step": 847 + }, + { + "epoch": 0.019778022022874167, + "grad_norm": 1.128587007522583, + "learning_rate": 4.394920963980306e-06, + "loss": 1.2935, + "step": 848 + }, + { + "epoch": 0.019801345162052085, + "grad_norm": 1.301287293434143, + "learning_rate": 4.40010365379632e-06, + "loss": 1.4193, + "step": 849 + }, + { + "epoch": 0.019824668301230003, + "grad_norm": 1.5180747509002686, + "learning_rate": 4.405286343612335e-06, + "loss": 1.2061, + "step": 850 + }, + { + "epoch": 0.01984799144040792, + "grad_norm": 0.9110321402549744, + "learning_rate": 4.41046903342835e-06, + "loss": 1.2803, + "step": 851 + }, + { + "epoch": 0.01987131457958584, + "grad_norm": 1.68843674659729, + "learning_rate": 4.415651723244364e-06, + "loss": 1.2037, + "step": 852 + }, + { + "epoch": 0.019894637718763757, + "grad_norm": 1.2198610305786133, + "learning_rate": 4.4208344130603785e-06, + "loss": 1.6652, + "step": 853 + }, + { + "epoch": 0.019917960857941674, + "grad_norm": 1.579087257385254, + "learning_rate": 4.426017102876393e-06, + "loss": 1.5859, + "step": 854 + }, + { + "epoch": 0.019941283997119592, + "grad_norm": 1.7198874950408936, + "learning_rate": 4.431199792692408e-06, + "loss": 1.4662, + "step": 855 + }, + { + "epoch": 0.01996460713629751, + "grad_norm": 2.817178726196289, + "learning_rate": 4.436382482508422e-06, + "loss": 1.3427, + "step": 856 + }, + { + "epoch": 0.019987930275475428, + "grad_norm": 1.4508287906646729, + "learning_rate": 4.441565172324437e-06, + "loss": 1.2893, + "step": 857 + }, + { + "epoch": 0.020011253414653346, + "grad_norm": 1.29767644405365, + "learning_rate": 4.446747862140451e-06, + "loss": 1.5759, + "step": 858 + }, + { + "epoch": 0.020034576553831264, + "grad_norm": 1.84248685836792, + "learning_rate": 4.451930551956466e-06, + "loss": 2.1373, + "step": 859 + }, + { + "epoch": 0.020057899693009182, + "grad_norm": 1.6153839826583862, + "learning_rate": 4.457113241772481e-06, + "loss": 1.3915, + "step": 860 + }, + { + "epoch": 0.0200812228321871, + "grad_norm": 1.3203104734420776, + "learning_rate": 4.462295931588495e-06, + "loss": 1.569, + "step": 861 + }, + { + "epoch": 0.020104545971365014, + "grad_norm": 1.6475995779037476, + "learning_rate": 4.467478621404509e-06, + "loss": 1.6446, + "step": 862 + }, + { + "epoch": 0.020127869110542932, + "grad_norm": 1.165834665298462, + "learning_rate": 4.472661311220524e-06, + "loss": 1.7323, + "step": 863 + }, + { + "epoch": 0.02015119224972085, + "grad_norm": 1.3182172775268555, + "learning_rate": 4.477844001036538e-06, + "loss": 1.6265, + "step": 864 + }, + { + "epoch": 0.020174515388898768, + "grad_norm": 1.1236745119094849, + "learning_rate": 4.483026690852552e-06, + "loss": 1.2358, + "step": 865 + }, + { + "epoch": 0.020197838528076686, + "grad_norm": 1.2104893922805786, + "learning_rate": 4.4882093806685676e-06, + "loss": 1.4677, + "step": 866 + }, + { + "epoch": 0.020221161667254604, + "grad_norm": 1.6824678182601929, + "learning_rate": 4.493392070484582e-06, + "loss": 1.5802, + "step": 867 + }, + { + "epoch": 0.02024448480643252, + "grad_norm": 1.0679930448532104, + "learning_rate": 4.498574760300596e-06, + "loss": 1.4105, + "step": 868 + }, + { + "epoch": 0.02026780794561044, + "grad_norm": 1.3705253601074219, + "learning_rate": 4.503757450116611e-06, + "loss": 1.5095, + "step": 869 + }, + { + "epoch": 0.020291131084788357, + "grad_norm": 1.307491660118103, + "learning_rate": 4.508940139932626e-06, + "loss": 1.3987, + "step": 870 + }, + { + "epoch": 0.020314454223966275, + "grad_norm": 1.4814496040344238, + "learning_rate": 4.51412282974864e-06, + "loss": 1.635, + "step": 871 + }, + { + "epoch": 0.020337777363144193, + "grad_norm": 0.935867190361023, + "learning_rate": 4.5193055195646545e-06, + "loss": 1.6734, + "step": 872 + }, + { + "epoch": 0.02036110050232211, + "grad_norm": 1.3890215158462524, + "learning_rate": 4.524488209380669e-06, + "loss": 1.4458, + "step": 873 + }, + { + "epoch": 0.02038442364150003, + "grad_norm": 1.628081202507019, + "learning_rate": 4.529670899196684e-06, + "loss": 1.4814, + "step": 874 + }, + { + "epoch": 0.020407746780677947, + "grad_norm": 1.5255577564239502, + "learning_rate": 4.534853589012698e-06, + "loss": 1.3884, + "step": 875 + }, + { + "epoch": 0.020431069919855865, + "grad_norm": 2.09283185005188, + "learning_rate": 4.540036278828712e-06, + "loss": 1.7396, + "step": 876 + }, + { + "epoch": 0.02045439305903378, + "grad_norm": 0.9901561737060547, + "learning_rate": 4.545218968644727e-06, + "loss": 1.4941, + "step": 877 + }, + { + "epoch": 0.020477716198211697, + "grad_norm": 1.8444923162460327, + "learning_rate": 4.550401658460741e-06, + "loss": 1.2724, + "step": 878 + }, + { + "epoch": 0.020501039337389615, + "grad_norm": 1.414305567741394, + "learning_rate": 4.555584348276756e-06, + "loss": 1.5781, + "step": 879 + }, + { + "epoch": 0.020524362476567533, + "grad_norm": 1.1960091590881348, + "learning_rate": 4.56076703809277e-06, + "loss": 1.536, + "step": 880 + }, + { + "epoch": 0.02054768561574545, + "grad_norm": 2.241649627685547, + "learning_rate": 4.565949727908785e-06, + "loss": 1.6636, + "step": 881 + }, + { + "epoch": 0.02057100875492337, + "grad_norm": 1.0672343969345093, + "learning_rate": 4.5711324177248e-06, + "loss": 1.6369, + "step": 882 + }, + { + "epoch": 0.020594331894101287, + "grad_norm": 1.6761622428894043, + "learning_rate": 4.576315107540814e-06, + "loss": 1.2554, + "step": 883 + }, + { + "epoch": 0.020617655033279204, + "grad_norm": 1.1365658044815063, + "learning_rate": 4.581497797356828e-06, + "loss": 1.6271, + "step": 884 + }, + { + "epoch": 0.020640978172457122, + "grad_norm": 1.0631389617919922, + "learning_rate": 4.5866804871728435e-06, + "loss": 1.6393, + "step": 885 + }, + { + "epoch": 0.02066430131163504, + "grad_norm": 3.27304744720459, + "learning_rate": 4.591863176988858e-06, + "loss": 1.3521, + "step": 886 + }, + { + "epoch": 0.020687624450812958, + "grad_norm": 1.3354477882385254, + "learning_rate": 4.597045866804872e-06, + "loss": 1.5137, + "step": 887 + }, + { + "epoch": 0.020710947589990876, + "grad_norm": 2.192812919616699, + "learning_rate": 4.602228556620887e-06, + "loss": 1.7294, + "step": 888 + }, + { + "epoch": 0.020734270729168794, + "grad_norm": 0.9716669321060181, + "learning_rate": 4.607411246436901e-06, + "loss": 1.4244, + "step": 889 + }, + { + "epoch": 0.020757593868346712, + "grad_norm": 1.0377227067947388, + "learning_rate": 4.612593936252915e-06, + "loss": 1.3041, + "step": 890 + }, + { + "epoch": 0.020780917007524626, + "grad_norm": 1.971074104309082, + "learning_rate": 4.61777662606893e-06, + "loss": 1.4917, + "step": 891 + }, + { + "epoch": 0.020804240146702544, + "grad_norm": 1.3108222484588623, + "learning_rate": 4.622959315884945e-06, + "loss": 1.5923, + "step": 892 + }, + { + "epoch": 0.020827563285880462, + "grad_norm": 1.4194189310073853, + "learning_rate": 4.628142005700959e-06, + "loss": 1.2378, + "step": 893 + }, + { + "epoch": 0.02085088642505838, + "grad_norm": 1.5872682332992554, + "learning_rate": 4.6333246955169735e-06, + "loss": 1.3573, + "step": 894 + }, + { + "epoch": 0.020874209564236298, + "grad_norm": 1.351704716682434, + "learning_rate": 4.638507385332988e-06, + "loss": 1.8374, + "step": 895 + }, + { + "epoch": 0.020897532703414216, + "grad_norm": 1.15986168384552, + "learning_rate": 4.643690075149003e-06, + "loss": 1.4303, + "step": 896 + }, + { + "epoch": 0.020920855842592134, + "grad_norm": 1.912819743156433, + "learning_rate": 4.648872764965017e-06, + "loss": 1.7733, + "step": 897 + }, + { + "epoch": 0.02094417898177005, + "grad_norm": 1.6582539081573486, + "learning_rate": 4.654055454781032e-06, + "loss": 1.4696, + "step": 898 + }, + { + "epoch": 0.02096750212094797, + "grad_norm": 1.147661805152893, + "learning_rate": 4.659238144597046e-06, + "loss": 1.5037, + "step": 899 + }, + { + "epoch": 0.020990825260125887, + "grad_norm": 1.1773402690887451, + "learning_rate": 4.664420834413061e-06, + "loss": 1.604, + "step": 900 + }, + { + "epoch": 0.021014148399303805, + "grad_norm": 1.9128248691558838, + "learning_rate": 4.669603524229076e-06, + "loss": 1.3081, + "step": 901 + }, + { + "epoch": 0.021037471538481723, + "grad_norm": 1.0742683410644531, + "learning_rate": 4.67478621404509e-06, + "loss": 1.5619, + "step": 902 + }, + { + "epoch": 0.02106079467765964, + "grad_norm": 1.19862699508667, + "learning_rate": 4.679968903861104e-06, + "loss": 1.6896, + "step": 903 + }, + { + "epoch": 0.02108411781683756, + "grad_norm": 1.276283860206604, + "learning_rate": 4.685151593677119e-06, + "loss": 1.65, + "step": 904 + }, + { + "epoch": 0.021107440956015473, + "grad_norm": 1.3582435846328735, + "learning_rate": 4.690334283493133e-06, + "loss": 1.2686, + "step": 905 + }, + { + "epoch": 0.02113076409519339, + "grad_norm": 1.2145341634750366, + "learning_rate": 4.695516973309147e-06, + "loss": 1.8032, + "step": 906 + }, + { + "epoch": 0.02115408723437131, + "grad_norm": 1.1219233274459839, + "learning_rate": 4.7006996631251626e-06, + "loss": 1.7681, + "step": 907 + }, + { + "epoch": 0.021177410373549227, + "grad_norm": 1.0474015474319458, + "learning_rate": 4.705882352941177e-06, + "loss": 1.4555, + "step": 908 + }, + { + "epoch": 0.021200733512727145, + "grad_norm": 1.6325182914733887, + "learning_rate": 4.711065042757191e-06, + "loss": 1.432, + "step": 909 + }, + { + "epoch": 0.021224056651905063, + "grad_norm": 1.5804178714752197, + "learning_rate": 4.716247732573206e-06, + "loss": 1.7409, + "step": 910 + }, + { + "epoch": 0.02124737979108298, + "grad_norm": 1.226804256439209, + "learning_rate": 4.721430422389221e-06, + "loss": 1.8077, + "step": 911 + }, + { + "epoch": 0.0212707029302609, + "grad_norm": 1.0747625827789307, + "learning_rate": 4.726613112205235e-06, + "loss": 1.411, + "step": 912 + }, + { + "epoch": 0.021294026069438816, + "grad_norm": 1.2126623392105103, + "learning_rate": 4.7317958020212495e-06, + "loss": 1.6464, + "step": 913 + }, + { + "epoch": 0.021317349208616734, + "grad_norm": 1.196486473083496, + "learning_rate": 4.736978491837264e-06, + "loss": 1.4365, + "step": 914 + }, + { + "epoch": 0.021340672347794652, + "grad_norm": 1.4727115631103516, + "learning_rate": 4.742161181653279e-06, + "loss": 1.5059, + "step": 915 + }, + { + "epoch": 0.02136399548697257, + "grad_norm": 1.293938159942627, + "learning_rate": 4.747343871469293e-06, + "loss": 1.5508, + "step": 916 + }, + { + "epoch": 0.021387318626150488, + "grad_norm": 1.3074458837509155, + "learning_rate": 4.752526561285307e-06, + "loss": 1.364, + "step": 917 + }, + { + "epoch": 0.021410641765328406, + "grad_norm": 1.708522081375122, + "learning_rate": 4.757709251101322e-06, + "loss": 1.2891, + "step": 918 + }, + { + "epoch": 0.02143396490450632, + "grad_norm": 1.2926160097122192, + "learning_rate": 4.762891940917336e-06, + "loss": 1.1779, + "step": 919 + }, + { + "epoch": 0.021457288043684238, + "grad_norm": 1.7751168012619019, + "learning_rate": 4.768074630733351e-06, + "loss": 1.3136, + "step": 920 + }, + { + "epoch": 0.021480611182862156, + "grad_norm": 1.3698194026947021, + "learning_rate": 4.773257320549365e-06, + "loss": 1.5203, + "step": 921 + }, + { + "epoch": 0.021503934322040074, + "grad_norm": 1.4710402488708496, + "learning_rate": 4.77844001036538e-06, + "loss": 2.0632, + "step": 922 + }, + { + "epoch": 0.021527257461217992, + "grad_norm": 1.3340466022491455, + "learning_rate": 4.783622700181395e-06, + "loss": 0.9449, + "step": 923 + }, + { + "epoch": 0.02155058060039591, + "grad_norm": 1.990078330039978, + "learning_rate": 4.788805389997409e-06, + "loss": 1.4095, + "step": 924 + }, + { + "epoch": 0.021573903739573828, + "grad_norm": 2.6495463848114014, + "learning_rate": 4.793988079813423e-06, + "loss": 1.5914, + "step": 925 + }, + { + "epoch": 0.021597226878751746, + "grad_norm": 1.368868350982666, + "learning_rate": 4.7991707696294385e-06, + "loss": 1.8007, + "step": 926 + }, + { + "epoch": 0.021620550017929663, + "grad_norm": 1.3946820497512817, + "learning_rate": 4.804353459445453e-06, + "loss": 1.3846, + "step": 927 + }, + { + "epoch": 0.02164387315710758, + "grad_norm": 1.6035547256469727, + "learning_rate": 4.809536149261467e-06, + "loss": 1.6677, + "step": 928 + }, + { + "epoch": 0.0216671962962855, + "grad_norm": 1.29734468460083, + "learning_rate": 4.814718839077482e-06, + "loss": 1.3697, + "step": 929 + }, + { + "epoch": 0.021690519435463417, + "grad_norm": 1.1746439933776855, + "learning_rate": 4.819901528893497e-06, + "loss": 1.6134, + "step": 930 + }, + { + "epoch": 0.021713842574641335, + "grad_norm": 1.255861759185791, + "learning_rate": 4.82508421870951e-06, + "loss": 1.6253, + "step": 931 + }, + { + "epoch": 0.021737165713819253, + "grad_norm": 1.5499615669250488, + "learning_rate": 4.830266908525525e-06, + "loss": 1.2794, + "step": 932 + }, + { + "epoch": 0.02176048885299717, + "grad_norm": 1.6138273477554321, + "learning_rate": 4.83544959834154e-06, + "loss": 1.6365, + "step": 933 + }, + { + "epoch": 0.021783811992175085, + "grad_norm": 1.7135401964187622, + "learning_rate": 4.840632288157554e-06, + "loss": 1.509, + "step": 934 + }, + { + "epoch": 0.021807135131353003, + "grad_norm": 1.4290528297424316, + "learning_rate": 4.8458149779735685e-06, + "loss": 1.3415, + "step": 935 + }, + { + "epoch": 0.02183045827053092, + "grad_norm": 2.034870147705078, + "learning_rate": 4.850997667789583e-06, + "loss": 1.6834, + "step": 936 + }, + { + "epoch": 0.02185378140970884, + "grad_norm": 1.6626250743865967, + "learning_rate": 4.856180357605598e-06, + "loss": 1.3573, + "step": 937 + }, + { + "epoch": 0.021877104548886757, + "grad_norm": 1.2256288528442383, + "learning_rate": 4.861363047421612e-06, + "loss": 1.5497, + "step": 938 + }, + { + "epoch": 0.021900427688064675, + "grad_norm": 1.218955397605896, + "learning_rate": 4.866545737237627e-06, + "loss": 1.6823, + "step": 939 + }, + { + "epoch": 0.021923750827242593, + "grad_norm": 1.0629289150238037, + "learning_rate": 4.871728427053641e-06, + "loss": 1.3894, + "step": 940 + }, + { + "epoch": 0.02194707396642051, + "grad_norm": 2.6169822216033936, + "learning_rate": 4.876911116869656e-06, + "loss": 1.4063, + "step": 941 + }, + { + "epoch": 0.02197039710559843, + "grad_norm": 1.1517153978347778, + "learning_rate": 4.882093806685671e-06, + "loss": 1.3838, + "step": 942 + }, + { + "epoch": 0.021993720244776346, + "grad_norm": 1.6320403814315796, + "learning_rate": 4.887276496501685e-06, + "loss": 1.5752, + "step": 943 + }, + { + "epoch": 0.022017043383954264, + "grad_norm": 1.7344862222671509, + "learning_rate": 4.892459186317699e-06, + "loss": 1.3182, + "step": 944 + }, + { + "epoch": 0.022040366523132182, + "grad_norm": 1.2497214078903198, + "learning_rate": 4.897641876133714e-06, + "loss": 1.2266, + "step": 945 + }, + { + "epoch": 0.0220636896623101, + "grad_norm": 1.996893048286438, + "learning_rate": 4.902824565949728e-06, + "loss": 1.2708, + "step": 946 + }, + { + "epoch": 0.022087012801488018, + "grad_norm": 1.1130571365356445, + "learning_rate": 4.908007255765742e-06, + "loss": 1.4791, + "step": 947 + }, + { + "epoch": 0.022110335940665932, + "grad_norm": 1.2698702812194824, + "learning_rate": 4.9131899455817576e-06, + "loss": 1.3711, + "step": 948 + }, + { + "epoch": 0.02213365907984385, + "grad_norm": 1.0363445281982422, + "learning_rate": 4.918372635397772e-06, + "loss": 1.4153, + "step": 949 + }, + { + "epoch": 0.022156982219021768, + "grad_norm": 1.1418310403823853, + "learning_rate": 4.923555325213786e-06, + "loss": 1.3377, + "step": 950 + }, + { + "epoch": 0.022180305358199686, + "grad_norm": 1.3740698099136353, + "learning_rate": 4.928738015029801e-06, + "loss": 1.375, + "step": 951 + }, + { + "epoch": 0.022203628497377604, + "grad_norm": 1.5656532049179077, + "learning_rate": 4.933920704845816e-06, + "loss": 1.651, + "step": 952 + }, + { + "epoch": 0.022226951636555522, + "grad_norm": 1.209380865097046, + "learning_rate": 4.93910339466183e-06, + "loss": 1.6956, + "step": 953 + }, + { + "epoch": 0.02225027477573344, + "grad_norm": 1.9917747974395752, + "learning_rate": 4.9442860844778445e-06, + "loss": 1.2802, + "step": 954 + }, + { + "epoch": 0.022273597914911358, + "grad_norm": 2.168260097503662, + "learning_rate": 4.949468774293859e-06, + "loss": 1.9773, + "step": 955 + }, + { + "epoch": 0.022296921054089276, + "grad_norm": 1.113978624343872, + "learning_rate": 4.954651464109874e-06, + "loss": 1.8121, + "step": 956 + }, + { + "epoch": 0.022320244193267193, + "grad_norm": 1.4833635091781616, + "learning_rate": 4.959834153925888e-06, + "loss": 1.694, + "step": 957 + }, + { + "epoch": 0.02234356733244511, + "grad_norm": 1.3287935256958008, + "learning_rate": 4.965016843741902e-06, + "loss": 1.4865, + "step": 958 + }, + { + "epoch": 0.02236689047162303, + "grad_norm": 1.5515238046646118, + "learning_rate": 4.970199533557917e-06, + "loss": 1.6035, + "step": 959 + }, + { + "epoch": 0.022390213610800947, + "grad_norm": 1.2824245691299438, + "learning_rate": 4.975382223373931e-06, + "loss": 1.5124, + "step": 960 + }, + { + "epoch": 0.022413536749978865, + "grad_norm": 1.2062418460845947, + "learning_rate": 4.980564913189946e-06, + "loss": 1.5982, + "step": 961 + }, + { + "epoch": 0.02243685988915678, + "grad_norm": 1.2790741920471191, + "learning_rate": 4.98574760300596e-06, + "loss": 1.586, + "step": 962 + }, + { + "epoch": 0.022460183028334697, + "grad_norm": 1.202909231185913, + "learning_rate": 4.990930292821975e-06, + "loss": 1.7387, + "step": 963 + }, + { + "epoch": 0.022483506167512615, + "grad_norm": 1.328963041305542, + "learning_rate": 4.99611298263799e-06, + "loss": 1.5611, + "step": 964 + }, + { + "epoch": 0.022506829306690533, + "grad_norm": 1.3728841543197632, + "learning_rate": 5.001295672454004e-06, + "loss": 1.6887, + "step": 965 + }, + { + "epoch": 0.02253015244586845, + "grad_norm": 1.2474596500396729, + "learning_rate": 5.006478362270018e-06, + "loss": 1.7337, + "step": 966 + }, + { + "epoch": 0.02255347558504637, + "grad_norm": 1.4526808261871338, + "learning_rate": 5.0116610520860335e-06, + "loss": 1.4009, + "step": 967 + }, + { + "epoch": 0.022576798724224287, + "grad_norm": 1.74959397315979, + "learning_rate": 5.016843741902048e-06, + "loss": 1.4153, + "step": 968 + }, + { + "epoch": 0.022600121863402205, + "grad_norm": 1.7886738777160645, + "learning_rate": 5.022026431718062e-06, + "loss": 1.3897, + "step": 969 + }, + { + "epoch": 0.022623445002580123, + "grad_norm": 1.3122284412384033, + "learning_rate": 5.027209121534077e-06, + "loss": 1.6551, + "step": 970 + }, + { + "epoch": 0.02264676814175804, + "grad_norm": 1.5374927520751953, + "learning_rate": 5.032391811350092e-06, + "loss": 1.6396, + "step": 971 + }, + { + "epoch": 0.02267009128093596, + "grad_norm": 1.6476905345916748, + "learning_rate": 5.037574501166106e-06, + "loss": 1.733, + "step": 972 + }, + { + "epoch": 0.022693414420113876, + "grad_norm": 1.3407307863235474, + "learning_rate": 5.0427571909821205e-06, + "loss": 1.4984, + "step": 973 + }, + { + "epoch": 0.022716737559291794, + "grad_norm": 1.5565712451934814, + "learning_rate": 5.047939880798135e-06, + "loss": 1.6524, + "step": 974 + }, + { + "epoch": 0.022740060698469712, + "grad_norm": 1.381903052330017, + "learning_rate": 5.053122570614149e-06, + "loss": 1.5325, + "step": 975 + }, + { + "epoch": 0.022763383837647626, + "grad_norm": 1.916326880455017, + "learning_rate": 5.058305260430164e-06, + "loss": 1.2326, + "step": 976 + }, + { + "epoch": 0.022786706976825544, + "grad_norm": 1.1621575355529785, + "learning_rate": 5.063487950246179e-06, + "loss": 1.2568, + "step": 977 + }, + { + "epoch": 0.022810030116003462, + "grad_norm": 1.3575561046600342, + "learning_rate": 5.068670640062193e-06, + "loss": 1.3755, + "step": 978 + }, + { + "epoch": 0.02283335325518138, + "grad_norm": 1.482701063156128, + "learning_rate": 5.0738533298782065e-06, + "loss": 1.598, + "step": 979 + }, + { + "epoch": 0.022856676394359298, + "grad_norm": 1.2530887126922607, + "learning_rate": 5.079036019694221e-06, + "loss": 1.66, + "step": 980 + }, + { + "epoch": 0.022879999533537216, + "grad_norm": 1.4960439205169678, + "learning_rate": 5.084218709510236e-06, + "loss": 1.5341, + "step": 981 + }, + { + "epoch": 0.022903322672715134, + "grad_norm": 1.507735252380371, + "learning_rate": 5.0894013993262504e-06, + "loss": 1.3987, + "step": 982 + }, + { + "epoch": 0.022926645811893052, + "grad_norm": 2.0131475925445557, + "learning_rate": 5.094584089142265e-06, + "loss": 1.3134, + "step": 983 + }, + { + "epoch": 0.02294996895107097, + "grad_norm": 1.8096015453338623, + "learning_rate": 5.099766778958279e-06, + "loss": 1.3707, + "step": 984 + }, + { + "epoch": 0.022973292090248888, + "grad_norm": 1.0444198846817017, + "learning_rate": 5.104949468774294e-06, + "loss": 1.4119, + "step": 985 + }, + { + "epoch": 0.022996615229426805, + "grad_norm": 1.3110159635543823, + "learning_rate": 5.110132158590309e-06, + "loss": 1.2187, + "step": 986 + }, + { + "epoch": 0.023019938368604723, + "grad_norm": 1.3191614151000977, + "learning_rate": 5.115314848406323e-06, + "loss": 1.3691, + "step": 987 + }, + { + "epoch": 0.02304326150778264, + "grad_norm": 1.3888386487960815, + "learning_rate": 5.120497538222337e-06, + "loss": 1.1934, + "step": 988 + }, + { + "epoch": 0.02306658464696056, + "grad_norm": 1.2101585865020752, + "learning_rate": 5.1256802280383526e-06, + "loss": 1.4962, + "step": 989 + }, + { + "epoch": 0.023089907786138477, + "grad_norm": 1.2938464879989624, + "learning_rate": 5.130862917854367e-06, + "loss": 1.4601, + "step": 990 + }, + { + "epoch": 0.02311323092531639, + "grad_norm": 2.072444200515747, + "learning_rate": 5.136045607670381e-06, + "loss": 1.7241, + "step": 991 + }, + { + "epoch": 0.02313655406449431, + "grad_norm": 1.7139407396316528, + "learning_rate": 5.141228297486396e-06, + "loss": 1.394, + "step": 992 + }, + { + "epoch": 0.023159877203672227, + "grad_norm": 1.5825177431106567, + "learning_rate": 5.146410987302411e-06, + "loss": 1.4218, + "step": 993 + }, + { + "epoch": 0.023183200342850145, + "grad_norm": 1.2233787775039673, + "learning_rate": 5.151593677118425e-06, + "loss": 1.2882, + "step": 994 + }, + { + "epoch": 0.023206523482028063, + "grad_norm": 1.6474647521972656, + "learning_rate": 5.1567763669344395e-06, + "loss": 1.6499, + "step": 995 + }, + { + "epoch": 0.02322984662120598, + "grad_norm": 1.669651985168457, + "learning_rate": 5.161959056750454e-06, + "loss": 1.1727, + "step": 996 + }, + { + "epoch": 0.0232531697603839, + "grad_norm": 1.4976879358291626, + "learning_rate": 5.167141746566469e-06, + "loss": 1.2149, + "step": 997 + }, + { + "epoch": 0.023276492899561817, + "grad_norm": 1.4033470153808594, + "learning_rate": 5.172324436382483e-06, + "loss": 1.3004, + "step": 998 + }, + { + "epoch": 0.023299816038739735, + "grad_norm": 1.3042150735855103, + "learning_rate": 5.177507126198498e-06, + "loss": 1.3803, + "step": 999 + }, + { + "epoch": 0.023323139177917653, + "grad_norm": 1.4327346086502075, + "learning_rate": 5.182689816014512e-06, + "loss": 1.7267, + "step": 1000 + }, + { + "epoch": 0.02334646231709557, + "grad_norm": 1.4823616743087769, + "learning_rate": 5.187872505830526e-06, + "loss": 1.6386, + "step": 1001 + }, + { + "epoch": 0.02336978545627349, + "grad_norm": 1.7083938121795654, + "learning_rate": 5.193055195646542e-06, + "loss": 1.3112, + "step": 1002 + }, + { + "epoch": 0.023393108595451406, + "grad_norm": 1.51584792137146, + "learning_rate": 5.198237885462556e-06, + "loss": 1.6169, + "step": 1003 + }, + { + "epoch": 0.023416431734629324, + "grad_norm": 1.0864455699920654, + "learning_rate": 5.20342057527857e-06, + "loss": 1.3013, + "step": 1004 + }, + { + "epoch": 0.02343975487380724, + "grad_norm": 1.9760619401931763, + "learning_rate": 5.208603265094585e-06, + "loss": 1.7865, + "step": 1005 + }, + { + "epoch": 0.023463078012985156, + "grad_norm": 2.5747292041778564, + "learning_rate": 5.2137859549106e-06, + "loss": 1.3345, + "step": 1006 + }, + { + "epoch": 0.023486401152163074, + "grad_norm": 1.689779281616211, + "learning_rate": 5.218968644726613e-06, + "loss": 1.7856, + "step": 1007 + }, + { + "epoch": 0.023509724291340992, + "grad_norm": 1.9847980737686157, + "learning_rate": 5.224151334542628e-06, + "loss": 1.8401, + "step": 1008 + }, + { + "epoch": 0.02353304743051891, + "grad_norm": 1.3654876947402954, + "learning_rate": 5.229334024358642e-06, + "loss": 1.7705, + "step": 1009 + }, + { + "epoch": 0.023556370569696828, + "grad_norm": 1.7249932289123535, + "learning_rate": 5.234516714174656e-06, + "loss": 1.1657, + "step": 1010 + }, + { + "epoch": 0.023579693708874746, + "grad_norm": 1.0710606575012207, + "learning_rate": 5.2396994039906716e-06, + "loss": 1.1676, + "step": 1011 + }, + { + "epoch": 0.023603016848052664, + "grad_norm": 1.213040828704834, + "learning_rate": 5.244882093806686e-06, + "loss": 1.4183, + "step": 1012 + }, + { + "epoch": 0.02362633998723058, + "grad_norm": 1.6341387033462524, + "learning_rate": 5.2500647836227e-06, + "loss": 1.6092, + "step": 1013 + }, + { + "epoch": 0.0236496631264085, + "grad_norm": 1.6445837020874023, + "learning_rate": 5.255247473438715e-06, + "loss": 1.6693, + "step": 1014 + }, + { + "epoch": 0.023672986265586417, + "grad_norm": 1.2804230451583862, + "learning_rate": 5.26043016325473e-06, + "loss": 1.5687, + "step": 1015 + }, + { + "epoch": 0.023696309404764335, + "grad_norm": 1.8683735132217407, + "learning_rate": 5.265612853070744e-06, + "loss": 1.3944, + "step": 1016 + }, + { + "epoch": 0.023719632543942253, + "grad_norm": 1.6504722833633423, + "learning_rate": 5.2707955428867585e-06, + "loss": 1.3018, + "step": 1017 + }, + { + "epoch": 0.02374295568312017, + "grad_norm": 1.71793532371521, + "learning_rate": 5.275978232702773e-06, + "loss": 1.4581, + "step": 1018 + }, + { + "epoch": 0.023766278822298086, + "grad_norm": 1.1414326429367065, + "learning_rate": 5.281160922518788e-06, + "loss": 1.4924, + "step": 1019 + }, + { + "epoch": 0.023789601961476003, + "grad_norm": 1.6553568840026855, + "learning_rate": 5.286343612334802e-06, + "loss": 1.6926, + "step": 1020 + }, + { + "epoch": 0.02381292510065392, + "grad_norm": 1.4217321872711182, + "learning_rate": 5.291526302150817e-06, + "loss": 1.4806, + "step": 1021 + }, + { + "epoch": 0.02383624823983184, + "grad_norm": 1.4322501420974731, + "learning_rate": 5.296708991966831e-06, + "loss": 1.5978, + "step": 1022 + }, + { + "epoch": 0.023859571379009757, + "grad_norm": 1.9824562072753906, + "learning_rate": 5.3018916817828454e-06, + "loss": 1.493, + "step": 1023 + }, + { + "epoch": 0.023882894518187675, + "grad_norm": 1.3815537691116333, + "learning_rate": 5.307074371598861e-06, + "loss": 1.3702, + "step": 1024 + }, + { + "epoch": 0.023906217657365593, + "grad_norm": 1.101647138595581, + "learning_rate": 5.312257061414875e-06, + "loss": 1.1745, + "step": 1025 + }, + { + "epoch": 0.02392954079654351, + "grad_norm": 1.2983593940734863, + "learning_rate": 5.317439751230889e-06, + "loss": 1.7473, + "step": 1026 + }, + { + "epoch": 0.02395286393572143, + "grad_norm": 1.2676076889038086, + "learning_rate": 5.322622441046904e-06, + "loss": 1.6349, + "step": 1027 + }, + { + "epoch": 0.023976187074899347, + "grad_norm": 1.2923870086669922, + "learning_rate": 5.327805130862919e-06, + "loss": 1.619, + "step": 1028 + }, + { + "epoch": 0.023999510214077265, + "grad_norm": 1.4195587635040283, + "learning_rate": 5.332987820678933e-06, + "loss": 1.4933, + "step": 1029 + }, + { + "epoch": 0.024022833353255182, + "grad_norm": 1.3498200178146362, + "learning_rate": 5.3381705104949476e-06, + "loss": 1.489, + "step": 1030 + }, + { + "epoch": 0.0240461564924331, + "grad_norm": 1.473960280418396, + "learning_rate": 5.343353200310962e-06, + "loss": 1.5181, + "step": 1031 + }, + { + "epoch": 0.024069479631611018, + "grad_norm": 1.2730071544647217, + "learning_rate": 5.348535890126977e-06, + "loss": 1.5796, + "step": 1032 + }, + { + "epoch": 0.024092802770788933, + "grad_norm": 1.2243895530700684, + "learning_rate": 5.3537185799429914e-06, + "loss": 1.4051, + "step": 1033 + }, + { + "epoch": 0.02411612590996685, + "grad_norm": 2.1219441890716553, + "learning_rate": 5.358901269759005e-06, + "loss": 1.4317, + "step": 1034 + }, + { + "epoch": 0.02413944904914477, + "grad_norm": 1.0719225406646729, + "learning_rate": 5.364083959575019e-06, + "loss": 1.3937, + "step": 1035 + }, + { + "epoch": 0.024162772188322686, + "grad_norm": 1.6711935997009277, + "learning_rate": 5.369266649391034e-06, + "loss": 1.5832, + "step": 1036 + }, + { + "epoch": 0.024186095327500604, + "grad_norm": 1.33745276927948, + "learning_rate": 5.374449339207049e-06, + "loss": 1.4582, + "step": 1037 + }, + { + "epoch": 0.024209418466678522, + "grad_norm": 1.4278967380523682, + "learning_rate": 5.379632029023063e-06, + "loss": 1.6069, + "step": 1038 + }, + { + "epoch": 0.02423274160585644, + "grad_norm": 1.2003988027572632, + "learning_rate": 5.3848147188390775e-06, + "loss": 1.4942, + "step": 1039 + }, + { + "epoch": 0.024256064745034358, + "grad_norm": 1.7350938320159912, + "learning_rate": 5.389997408655092e-06, + "loss": 1.637, + "step": 1040 + }, + { + "epoch": 0.024279387884212276, + "grad_norm": 1.6094862222671509, + "learning_rate": 5.395180098471107e-06, + "loss": 1.6944, + "step": 1041 + }, + { + "epoch": 0.024302711023390194, + "grad_norm": 1.369091510772705, + "learning_rate": 5.400362788287121e-06, + "loss": 1.6905, + "step": 1042 + }, + { + "epoch": 0.02432603416256811, + "grad_norm": 1.275787353515625, + "learning_rate": 5.405545478103136e-06, + "loss": 1.6749, + "step": 1043 + }, + { + "epoch": 0.02434935730174603, + "grad_norm": 1.24448823928833, + "learning_rate": 5.41072816791915e-06, + "loss": 1.4275, + "step": 1044 + }, + { + "epoch": 0.024372680440923947, + "grad_norm": 1.7868009805679321, + "learning_rate": 5.415910857735165e-06, + "loss": 1.5942, + "step": 1045 + }, + { + "epoch": 0.024396003580101865, + "grad_norm": 1.5386407375335693, + "learning_rate": 5.42109354755118e-06, + "loss": 1.6505, + "step": 1046 + }, + { + "epoch": 0.024419326719279783, + "grad_norm": 1.9666537046432495, + "learning_rate": 5.426276237367194e-06, + "loss": 1.7035, + "step": 1047 + }, + { + "epoch": 0.024442649858457698, + "grad_norm": 1.7937966585159302, + "learning_rate": 5.431458927183208e-06, + "loss": 1.7956, + "step": 1048 + }, + { + "epoch": 0.024465972997635616, + "grad_norm": 1.1397721767425537, + "learning_rate": 5.436641616999223e-06, + "loss": 1.3459, + "step": 1049 + }, + { + "epoch": 0.024489296136813533, + "grad_norm": 1.28958261013031, + "learning_rate": 5.441824306815238e-06, + "loss": 1.0963, + "step": 1050 + }, + { + "epoch": 0.02451261927599145, + "grad_norm": 1.3734923601150513, + "learning_rate": 5.447006996631252e-06, + "loss": 1.3196, + "step": 1051 + }, + { + "epoch": 0.02453594241516937, + "grad_norm": 1.8763736486434937, + "learning_rate": 5.4521896864472666e-06, + "loss": 1.7322, + "step": 1052 + }, + { + "epoch": 0.024559265554347287, + "grad_norm": 1.5179871320724487, + "learning_rate": 5.457372376263281e-06, + "loss": 1.2844, + "step": 1053 + }, + { + "epoch": 0.024582588693525205, + "grad_norm": 1.4944384098052979, + "learning_rate": 5.462555066079296e-06, + "loss": 1.442, + "step": 1054 + }, + { + "epoch": 0.024605911832703123, + "grad_norm": 1.499028205871582, + "learning_rate": 5.4677377558953105e-06, + "loss": 1.394, + "step": 1055 + }, + { + "epoch": 0.02462923497188104, + "grad_norm": 1.1869397163391113, + "learning_rate": 5.472920445711325e-06, + "loss": 1.2928, + "step": 1056 + }, + { + "epoch": 0.02465255811105896, + "grad_norm": 1.3456541299819946, + "learning_rate": 5.478103135527339e-06, + "loss": 1.5983, + "step": 1057 + }, + { + "epoch": 0.024675881250236877, + "grad_norm": 1.5931065082550049, + "learning_rate": 5.483285825343354e-06, + "loss": 1.4794, + "step": 1058 + }, + { + "epoch": 0.024699204389414794, + "grad_norm": 1.4096170663833618, + "learning_rate": 5.488468515159369e-06, + "loss": 1.471, + "step": 1059 + }, + { + "epoch": 0.024722527528592712, + "grad_norm": 1.5033949613571167, + "learning_rate": 5.493651204975383e-06, + "loss": 1.2857, + "step": 1060 + }, + { + "epoch": 0.02474585066777063, + "grad_norm": 1.632089614868164, + "learning_rate": 5.498833894791397e-06, + "loss": 1.5157, + "step": 1061 + }, + { + "epoch": 0.024769173806948545, + "grad_norm": 1.563462495803833, + "learning_rate": 5.504016584607411e-06, + "loss": 1.5072, + "step": 1062 + }, + { + "epoch": 0.024792496946126463, + "grad_norm": 1.4055378437042236, + "learning_rate": 5.509199274423426e-06, + "loss": 1.1545, + "step": 1063 + }, + { + "epoch": 0.02481582008530438, + "grad_norm": 1.3467985391616821, + "learning_rate": 5.5143819642394404e-06, + "loss": 1.4615, + "step": 1064 + }, + { + "epoch": 0.0248391432244823, + "grad_norm": 1.6450691223144531, + "learning_rate": 5.519564654055455e-06, + "loss": 1.8051, + "step": 1065 + }, + { + "epoch": 0.024862466363660216, + "grad_norm": 1.247313141822815, + "learning_rate": 5.524747343871469e-06, + "loss": 1.5971, + "step": 1066 + }, + { + "epoch": 0.024885789502838134, + "grad_norm": 1.7429383993148804, + "learning_rate": 5.529930033687484e-06, + "loss": 1.5401, + "step": 1067 + }, + { + "epoch": 0.024909112642016052, + "grad_norm": 1.7351207733154297, + "learning_rate": 5.535112723503499e-06, + "loss": 1.4898, + "step": 1068 + }, + { + "epoch": 0.02493243578119397, + "grad_norm": 1.5003080368041992, + "learning_rate": 5.540295413319513e-06, + "loss": 1.773, + "step": 1069 + }, + { + "epoch": 0.024955758920371888, + "grad_norm": 1.370918869972229, + "learning_rate": 5.545478103135527e-06, + "loss": 1.6648, + "step": 1070 + }, + { + "epoch": 0.024979082059549806, + "grad_norm": 1.125687837600708, + "learning_rate": 5.5506607929515426e-06, + "loss": 1.5297, + "step": 1071 + }, + { + "epoch": 0.025002405198727724, + "grad_norm": 1.984605073928833, + "learning_rate": 5.555843482767557e-06, + "loss": 1.4637, + "step": 1072 + }, + { + "epoch": 0.02502572833790564, + "grad_norm": 1.6429048776626587, + "learning_rate": 5.561026172583571e-06, + "loss": 1.2794, + "step": 1073 + }, + { + "epoch": 0.02504905147708356, + "grad_norm": 1.8730500936508179, + "learning_rate": 5.566208862399586e-06, + "loss": 1.4462, + "step": 1074 + }, + { + "epoch": 0.025072374616261477, + "grad_norm": 1.536036729812622, + "learning_rate": 5.5713915522156e-06, + "loss": 1.2484, + "step": 1075 + }, + { + "epoch": 0.025095697755439392, + "grad_norm": 1.2056294679641724, + "learning_rate": 5.576574242031615e-06, + "loss": 1.7819, + "step": 1076 + }, + { + "epoch": 0.02511902089461731, + "grad_norm": 1.4317046403884888, + "learning_rate": 5.5817569318476295e-06, + "loss": 1.5005, + "step": 1077 + }, + { + "epoch": 0.025142344033795228, + "grad_norm": 1.5313549041748047, + "learning_rate": 5.586939621663644e-06, + "loss": 1.6916, + "step": 1078 + }, + { + "epoch": 0.025165667172973145, + "grad_norm": 1.2438437938690186, + "learning_rate": 5.592122311479658e-06, + "loss": 1.4453, + "step": 1079 + }, + { + "epoch": 0.025188990312151063, + "grad_norm": 1.665187954902649, + "learning_rate": 5.597305001295673e-06, + "loss": 1.1324, + "step": 1080 + }, + { + "epoch": 0.02521231345132898, + "grad_norm": 1.910433053970337, + "learning_rate": 5.602487691111688e-06, + "loss": 2.003, + "step": 1081 + }, + { + "epoch": 0.0252356365905069, + "grad_norm": 1.6894274950027466, + "learning_rate": 5.607670380927702e-06, + "loss": 1.5041, + "step": 1082 + }, + { + "epoch": 0.025258959729684817, + "grad_norm": 1.246095061302185, + "learning_rate": 5.612853070743716e-06, + "loss": 1.7421, + "step": 1083 + }, + { + "epoch": 0.025282282868862735, + "grad_norm": 1.7268954515457153, + "learning_rate": 5.618035760559732e-06, + "loss": 1.4601, + "step": 1084 + }, + { + "epoch": 0.025305606008040653, + "grad_norm": 1.2897146940231323, + "learning_rate": 5.623218450375746e-06, + "loss": 1.4538, + "step": 1085 + }, + { + "epoch": 0.02532892914721857, + "grad_norm": 1.329236388206482, + "learning_rate": 5.62840114019176e-06, + "loss": 1.6763, + "step": 1086 + }, + { + "epoch": 0.02535225228639649, + "grad_norm": 1.4001597166061401, + "learning_rate": 5.633583830007775e-06, + "loss": 1.4887, + "step": 1087 + }, + { + "epoch": 0.025375575425574406, + "grad_norm": 2.036400079727173, + "learning_rate": 5.63876651982379e-06, + "loss": 1.4996, + "step": 1088 + }, + { + "epoch": 0.025398898564752324, + "grad_norm": 1.4963785409927368, + "learning_rate": 5.643949209639803e-06, + "loss": 1.6515, + "step": 1089 + }, + { + "epoch": 0.02542222170393024, + "grad_norm": 1.4221199750900269, + "learning_rate": 5.649131899455818e-06, + "loss": 1.814, + "step": 1090 + }, + { + "epoch": 0.025445544843108157, + "grad_norm": 1.7034932374954224, + "learning_rate": 5.654314589271832e-06, + "loss": 1.478, + "step": 1091 + }, + { + "epoch": 0.025468867982286075, + "grad_norm": 1.5419113636016846, + "learning_rate": 5.659497279087846e-06, + "loss": 1.8225, + "step": 1092 + }, + { + "epoch": 0.025492191121463992, + "grad_norm": 1.8337044715881348, + "learning_rate": 5.6646799689038616e-06, + "loss": 1.5037, + "step": 1093 + }, + { + "epoch": 0.02551551426064191, + "grad_norm": 1.3712172508239746, + "learning_rate": 5.669862658719876e-06, + "loss": 1.4449, + "step": 1094 + }, + { + "epoch": 0.02553883739981983, + "grad_norm": 1.312258005142212, + "learning_rate": 5.67504534853589e-06, + "loss": 1.5159, + "step": 1095 + }, + { + "epoch": 0.025562160538997746, + "grad_norm": 1.5284754037857056, + "learning_rate": 5.680228038351905e-06, + "loss": 1.4479, + "step": 1096 + }, + { + "epoch": 0.025585483678175664, + "grad_norm": 1.1178314685821533, + "learning_rate": 5.68541072816792e-06, + "loss": 1.4729, + "step": 1097 + }, + { + "epoch": 0.025608806817353582, + "grad_norm": 1.2439149618148804, + "learning_rate": 5.690593417983934e-06, + "loss": 1.436, + "step": 1098 + }, + { + "epoch": 0.0256321299565315, + "grad_norm": 1.580632209777832, + "learning_rate": 5.6957761077999485e-06, + "loss": 1.2718, + "step": 1099 + }, + { + "epoch": 0.025655453095709418, + "grad_norm": 1.6244875192642212, + "learning_rate": 5.700958797615963e-06, + "loss": 1.6024, + "step": 1100 + }, + { + "epoch": 0.025678776234887336, + "grad_norm": 1.2542647123336792, + "learning_rate": 5.706141487431977e-06, + "loss": 1.4344, + "step": 1101 + }, + { + "epoch": 0.025702099374065254, + "grad_norm": 1.227737307548523, + "learning_rate": 5.711324177247992e-06, + "loss": 1.2912, + "step": 1102 + }, + { + "epoch": 0.02572542251324317, + "grad_norm": 1.705132007598877, + "learning_rate": 5.716506867064007e-06, + "loss": 1.7786, + "step": 1103 + }, + { + "epoch": 0.02574874565242109, + "grad_norm": 1.4411309957504272, + "learning_rate": 5.721689556880021e-06, + "loss": 1.6456, + "step": 1104 + }, + { + "epoch": 0.025772068791599004, + "grad_norm": 1.5248507261276245, + "learning_rate": 5.7268722466960354e-06, + "loss": 1.308, + "step": 1105 + }, + { + "epoch": 0.02579539193077692, + "grad_norm": 1.3953535556793213, + "learning_rate": 5.732054936512051e-06, + "loss": 1.7294, + "step": 1106 + }, + { + "epoch": 0.02581871506995484, + "grad_norm": 2.0566859245300293, + "learning_rate": 5.737237626328065e-06, + "loss": 1.4392, + "step": 1107 + }, + { + "epoch": 0.025842038209132757, + "grad_norm": 1.4723169803619385, + "learning_rate": 5.742420316144079e-06, + "loss": 1.4799, + "step": 1108 + }, + { + "epoch": 0.025865361348310675, + "grad_norm": 1.4092565774917603, + "learning_rate": 5.747603005960094e-06, + "loss": 1.199, + "step": 1109 + }, + { + "epoch": 0.025888684487488593, + "grad_norm": 1.277365803718567, + "learning_rate": 5.752785695776109e-06, + "loss": 1.6108, + "step": 1110 + }, + { + "epoch": 0.02591200762666651, + "grad_norm": 2.465951919555664, + "learning_rate": 5.757968385592123e-06, + "loss": 1.6563, + "step": 1111 + }, + { + "epoch": 0.02593533076584443, + "grad_norm": 1.8686498403549194, + "learning_rate": 5.7631510754081376e-06, + "loss": 1.4241, + "step": 1112 + }, + { + "epoch": 0.025958653905022347, + "grad_norm": 1.6791915893554688, + "learning_rate": 5.768333765224152e-06, + "loss": 1.5922, + "step": 1113 + }, + { + "epoch": 0.025981977044200265, + "grad_norm": 1.7679352760314941, + "learning_rate": 5.773516455040167e-06, + "loss": 1.3589, + "step": 1114 + }, + { + "epoch": 0.026005300183378183, + "grad_norm": 1.535530686378479, + "learning_rate": 5.7786991448561814e-06, + "loss": 1.1027, + "step": 1115 + }, + { + "epoch": 0.0260286233225561, + "grad_norm": 1.5171246528625488, + "learning_rate": 5.783881834672196e-06, + "loss": 1.5711, + "step": 1116 + }, + { + "epoch": 0.02605194646173402, + "grad_norm": 1.101453185081482, + "learning_rate": 5.789064524488209e-06, + "loss": 1.2025, + "step": 1117 + }, + { + "epoch": 0.026075269600911936, + "grad_norm": 1.4143930673599243, + "learning_rate": 5.794247214304224e-06, + "loss": 1.4293, + "step": 1118 + }, + { + "epoch": 0.02609859274008985, + "grad_norm": 1.4917521476745605, + "learning_rate": 5.799429904120239e-06, + "loss": 1.5479, + "step": 1119 + }, + { + "epoch": 0.02612191587926777, + "grad_norm": 1.4023706912994385, + "learning_rate": 5.804612593936253e-06, + "loss": 1.7088, + "step": 1120 + }, + { + "epoch": 0.026145239018445687, + "grad_norm": 1.4056384563446045, + "learning_rate": 5.8097952837522675e-06, + "loss": 1.3657, + "step": 1121 + }, + { + "epoch": 0.026168562157623605, + "grad_norm": 1.3393616676330566, + "learning_rate": 5.814977973568282e-06, + "loss": 1.1497, + "step": 1122 + }, + { + "epoch": 0.026191885296801522, + "grad_norm": 1.6090584993362427, + "learning_rate": 5.820160663384296e-06, + "loss": 1.391, + "step": 1123 + }, + { + "epoch": 0.02621520843597944, + "grad_norm": 1.4391287565231323, + "learning_rate": 5.825343353200311e-06, + "loss": 1.4316, + "step": 1124 + }, + { + "epoch": 0.026238531575157358, + "grad_norm": 1.0588252544403076, + "learning_rate": 5.830526043016326e-06, + "loss": 1.3495, + "step": 1125 + }, + { + "epoch": 0.026261854714335276, + "grad_norm": 1.2646477222442627, + "learning_rate": 5.83570873283234e-06, + "loss": 1.9107, + "step": 1126 + }, + { + "epoch": 0.026285177853513194, + "grad_norm": 1.2594728469848633, + "learning_rate": 5.8408914226483545e-06, + "loss": 1.3878, + "step": 1127 + }, + { + "epoch": 0.026308500992691112, + "grad_norm": 2.413245677947998, + "learning_rate": 5.84607411246437e-06, + "loss": 1.2988, + "step": 1128 + }, + { + "epoch": 0.02633182413186903, + "grad_norm": 1.8143887519836426, + "learning_rate": 5.851256802280384e-06, + "loss": 1.8778, + "step": 1129 + }, + { + "epoch": 0.026355147271046948, + "grad_norm": 1.4549977779388428, + "learning_rate": 5.856439492096398e-06, + "loss": 1.7828, + "step": 1130 + }, + { + "epoch": 0.026378470410224866, + "grad_norm": 1.370773196220398, + "learning_rate": 5.861622181912413e-06, + "loss": 1.6647, + "step": 1131 + }, + { + "epoch": 0.026401793549402783, + "grad_norm": 1.7972664833068848, + "learning_rate": 5.866804871728428e-06, + "loss": 1.8871, + "step": 1132 + }, + { + "epoch": 0.026425116688580698, + "grad_norm": 1.6887913942337036, + "learning_rate": 5.871987561544442e-06, + "loss": 1.4938, + "step": 1133 + }, + { + "epoch": 0.026448439827758616, + "grad_norm": 1.4011859893798828, + "learning_rate": 5.8771702513604566e-06, + "loss": 1.2893, + "step": 1134 + }, + { + "epoch": 0.026471762966936534, + "grad_norm": 1.2820593118667603, + "learning_rate": 5.882352941176471e-06, + "loss": 1.8028, + "step": 1135 + }, + { + "epoch": 0.02649508610611445, + "grad_norm": 1.5501364469528198, + "learning_rate": 5.887535630992486e-06, + "loss": 1.5666, + "step": 1136 + }, + { + "epoch": 0.02651840924529237, + "grad_norm": 1.635021686553955, + "learning_rate": 5.8927183208085005e-06, + "loss": 1.4217, + "step": 1137 + }, + { + "epoch": 0.026541732384470287, + "grad_norm": 1.780432105064392, + "learning_rate": 5.897901010624515e-06, + "loss": 1.5926, + "step": 1138 + }, + { + "epoch": 0.026565055523648205, + "grad_norm": 1.747233271598816, + "learning_rate": 5.903083700440529e-06, + "loss": 1.7011, + "step": 1139 + }, + { + "epoch": 0.026588378662826123, + "grad_norm": 1.6612962484359741, + "learning_rate": 5.908266390256544e-06, + "loss": 1.1466, + "step": 1140 + }, + { + "epoch": 0.02661170180200404, + "grad_norm": 1.906965732574463, + "learning_rate": 5.913449080072559e-06, + "loss": 1.2679, + "step": 1141 + }, + { + "epoch": 0.02663502494118196, + "grad_norm": 1.3008593320846558, + "learning_rate": 5.918631769888573e-06, + "loss": 1.1242, + "step": 1142 + }, + { + "epoch": 0.026658348080359877, + "grad_norm": 1.2631815671920776, + "learning_rate": 5.923814459704587e-06, + "loss": 1.6476, + "step": 1143 + }, + { + "epoch": 0.026681671219537795, + "grad_norm": 1.3338450193405151, + "learning_rate": 5.928997149520601e-06, + "loss": 1.6404, + "step": 1144 + }, + { + "epoch": 0.026704994358715713, + "grad_norm": 1.4749959707260132, + "learning_rate": 5.934179839336616e-06, + "loss": 1.4754, + "step": 1145 + }, + { + "epoch": 0.02672831749789363, + "grad_norm": 1.399997353553772, + "learning_rate": 5.9393625291526304e-06, + "loss": 1.776, + "step": 1146 + }, + { + "epoch": 0.026751640637071545, + "grad_norm": 1.6688719987869263, + "learning_rate": 5.944545218968645e-06, + "loss": 1.4341, + "step": 1147 + }, + { + "epoch": 0.026774963776249463, + "grad_norm": 1.2055866718292236, + "learning_rate": 5.949727908784659e-06, + "loss": 1.366, + "step": 1148 + }, + { + "epoch": 0.02679828691542738, + "grad_norm": 1.834375262260437, + "learning_rate": 5.9549105986006735e-06, + "loss": 1.7205, + "step": 1149 + }, + { + "epoch": 0.0268216100546053, + "grad_norm": 1.6463091373443604, + "learning_rate": 5.960093288416689e-06, + "loss": 1.2175, + "step": 1150 + }, + { + "epoch": 0.026844933193783217, + "grad_norm": 1.2439314126968384, + "learning_rate": 5.965275978232703e-06, + "loss": 1.1599, + "step": 1151 + }, + { + "epoch": 0.026868256332961134, + "grad_norm": 1.428876519203186, + "learning_rate": 5.970458668048717e-06, + "loss": 1.7428, + "step": 1152 + }, + { + "epoch": 0.026891579472139052, + "grad_norm": 1.3530622720718384, + "learning_rate": 5.975641357864732e-06, + "loss": 1.4968, + "step": 1153 + }, + { + "epoch": 0.02691490261131697, + "grad_norm": 2.7352559566497803, + "learning_rate": 5.980824047680747e-06, + "loss": 1.5478, + "step": 1154 + }, + { + "epoch": 0.026938225750494888, + "grad_norm": 1.8357428312301636, + "learning_rate": 5.986006737496761e-06, + "loss": 1.5217, + "step": 1155 + }, + { + "epoch": 0.026961548889672806, + "grad_norm": 1.3974493741989136, + "learning_rate": 5.991189427312776e-06, + "loss": 1.6203, + "step": 1156 + }, + { + "epoch": 0.026984872028850724, + "grad_norm": 1.3089922666549683, + "learning_rate": 5.99637211712879e-06, + "loss": 1.7992, + "step": 1157 + }, + { + "epoch": 0.027008195168028642, + "grad_norm": 1.8275575637817383, + "learning_rate": 6.001554806944805e-06, + "loss": 1.4841, + "step": 1158 + }, + { + "epoch": 0.02703151830720656, + "grad_norm": 2.55710506439209, + "learning_rate": 6.0067374967608195e-06, + "loss": 1.3043, + "step": 1159 + }, + { + "epoch": 0.027054841446384478, + "grad_norm": 2.4591903686523438, + "learning_rate": 6.011920186576834e-06, + "loss": 1.3368, + "step": 1160 + }, + { + "epoch": 0.027078164585562395, + "grad_norm": 1.9370126724243164, + "learning_rate": 6.017102876392848e-06, + "loss": 1.4075, + "step": 1161 + }, + { + "epoch": 0.02710148772474031, + "grad_norm": 1.4310760498046875, + "learning_rate": 6.022285566208863e-06, + "loss": 1.5424, + "step": 1162 + }, + { + "epoch": 0.027124810863918228, + "grad_norm": 1.3892368078231812, + "learning_rate": 6.027468256024878e-06, + "loss": 1.6432, + "step": 1163 + }, + { + "epoch": 0.027148134003096146, + "grad_norm": 1.4820071458816528, + "learning_rate": 6.032650945840892e-06, + "loss": 1.409, + "step": 1164 + }, + { + "epoch": 0.027171457142274064, + "grad_norm": 1.1135878562927246, + "learning_rate": 6.037833635656906e-06, + "loss": 1.5977, + "step": 1165 + }, + { + "epoch": 0.02719478028145198, + "grad_norm": 1.6016969680786133, + "learning_rate": 6.043016325472922e-06, + "loss": 1.6486, + "step": 1166 + }, + { + "epoch": 0.0272181034206299, + "grad_norm": 1.5183762311935425, + "learning_rate": 6.048199015288936e-06, + "loss": 1.4068, + "step": 1167 + }, + { + "epoch": 0.027241426559807817, + "grad_norm": 1.4730808734893799, + "learning_rate": 6.05338170510495e-06, + "loss": 1.6202, + "step": 1168 + }, + { + "epoch": 0.027264749698985735, + "grad_norm": 1.4382350444793701, + "learning_rate": 6.058564394920965e-06, + "loss": 1.7055, + "step": 1169 + }, + { + "epoch": 0.027288072838163653, + "grad_norm": 0.9570834040641785, + "learning_rate": 6.06374708473698e-06, + "loss": 0.8602, + "step": 1170 + }, + { + "epoch": 0.02731139597734157, + "grad_norm": 1.2127379179000854, + "learning_rate": 6.068929774552994e-06, + "loss": 1.5333, + "step": 1171 + }, + { + "epoch": 0.02733471911651949, + "grad_norm": 1.5822348594665527, + "learning_rate": 6.074112464369008e-06, + "loss": 0.9605, + "step": 1172 + }, + { + "epoch": 0.027358042255697407, + "grad_norm": 1.3108526468276978, + "learning_rate": 6.079295154185022e-06, + "loss": 1.1987, + "step": 1173 + }, + { + "epoch": 0.027381365394875325, + "grad_norm": 2.005154848098755, + "learning_rate": 6.084477844001036e-06, + "loss": 1.7214, + "step": 1174 + }, + { + "epoch": 0.027404688534053243, + "grad_norm": 2.299222707748413, + "learning_rate": 6.089660533817051e-06, + "loss": 1.5244, + "step": 1175 + }, + { + "epoch": 0.027428011673231157, + "grad_norm": 1.2665340900421143, + "learning_rate": 6.094843223633066e-06, + "loss": 1.1735, + "step": 1176 + }, + { + "epoch": 0.027451334812409075, + "grad_norm": 1.418123483657837, + "learning_rate": 6.10002591344908e-06, + "loss": 1.6755, + "step": 1177 + }, + { + "epoch": 0.027474657951586993, + "grad_norm": 1.4280682802200317, + "learning_rate": 6.105208603265095e-06, + "loss": 1.6664, + "step": 1178 + }, + { + "epoch": 0.02749798109076491, + "grad_norm": 2.0804097652435303, + "learning_rate": 6.110391293081109e-06, + "loss": 1.4688, + "step": 1179 + }, + { + "epoch": 0.02752130422994283, + "grad_norm": 1.7536234855651855, + "learning_rate": 6.115573982897124e-06, + "loss": 1.5823, + "step": 1180 + }, + { + "epoch": 0.027544627369120746, + "grad_norm": 1.1604044437408447, + "learning_rate": 6.1207566727131385e-06, + "loss": 1.4818, + "step": 1181 + }, + { + "epoch": 0.027567950508298664, + "grad_norm": 1.3865594863891602, + "learning_rate": 6.125939362529153e-06, + "loss": 1.5467, + "step": 1182 + }, + { + "epoch": 0.027591273647476582, + "grad_norm": 1.526190996170044, + "learning_rate": 6.131122052345167e-06, + "loss": 1.3397, + "step": 1183 + }, + { + "epoch": 0.0276145967866545, + "grad_norm": 1.6010215282440186, + "learning_rate": 6.136304742161182e-06, + "loss": 1.5507, + "step": 1184 + }, + { + "epoch": 0.027637919925832418, + "grad_norm": 1.4297575950622559, + "learning_rate": 6.141487431977197e-06, + "loss": 1.397, + "step": 1185 + }, + { + "epoch": 0.027661243065010336, + "grad_norm": 1.380254864692688, + "learning_rate": 6.146670121793211e-06, + "loss": 1.251, + "step": 1186 + }, + { + "epoch": 0.027684566204188254, + "grad_norm": 1.5398340225219727, + "learning_rate": 6.1518528116092254e-06, + "loss": 1.7319, + "step": 1187 + }, + { + "epoch": 0.027707889343366172, + "grad_norm": 1.8836907148361206, + "learning_rate": 6.157035501425241e-06, + "loss": 1.1504, + "step": 1188 + }, + { + "epoch": 0.02773121248254409, + "grad_norm": 1.200628399848938, + "learning_rate": 6.162218191241255e-06, + "loss": 1.5138, + "step": 1189 + }, + { + "epoch": 0.027754535621722004, + "grad_norm": 1.7400058507919312, + "learning_rate": 6.167400881057269e-06, + "loss": 1.5398, + "step": 1190 + }, + { + "epoch": 0.027777858760899922, + "grad_norm": 1.2723171710968018, + "learning_rate": 6.172583570873284e-06, + "loss": 1.1157, + "step": 1191 + }, + { + "epoch": 0.02780118190007784, + "grad_norm": 1.4392553567886353, + "learning_rate": 6.177766260689299e-06, + "loss": 1.7444, + "step": 1192 + }, + { + "epoch": 0.027824505039255758, + "grad_norm": 1.533337950706482, + "learning_rate": 6.182948950505313e-06, + "loss": 1.4784, + "step": 1193 + }, + { + "epoch": 0.027847828178433676, + "grad_norm": 1.5458931922912598, + "learning_rate": 6.1881316403213276e-06, + "loss": 1.8139, + "step": 1194 + }, + { + "epoch": 0.027871151317611594, + "grad_norm": 1.133946180343628, + "learning_rate": 6.193314330137342e-06, + "loss": 1.5137, + "step": 1195 + }, + { + "epoch": 0.02789447445678951, + "grad_norm": 1.458628535270691, + "learning_rate": 6.198497019953357e-06, + "loss": 1.3172, + "step": 1196 + }, + { + "epoch": 0.02791779759596743, + "grad_norm": 2.2303454875946045, + "learning_rate": 6.2036797097693714e-06, + "loss": 1.2295, + "step": 1197 + }, + { + "epoch": 0.027941120735145347, + "grad_norm": 1.2555915117263794, + "learning_rate": 6.208862399585386e-06, + "loss": 1.5021, + "step": 1198 + }, + { + "epoch": 0.027964443874323265, + "grad_norm": 1.7872976064682007, + "learning_rate": 6.2140450894014e-06, + "loss": 0.9375, + "step": 1199 + }, + { + "epoch": 0.027987767013501183, + "grad_norm": 1.5110255479812622, + "learning_rate": 6.219227779217414e-06, + "loss": 1.871, + "step": 1200 + }, + { + "epoch": 0.0280110901526791, + "grad_norm": 1.5963770151138306, + "learning_rate": 6.224410469033428e-06, + "loss": 1.6184, + "step": 1201 + }, + { + "epoch": 0.02803441329185702, + "grad_norm": 1.7600239515304565, + "learning_rate": 6.229593158849443e-06, + "loss": 1.5337, + "step": 1202 + }, + { + "epoch": 0.028057736431034937, + "grad_norm": 1.3252232074737549, + "learning_rate": 6.2347758486654575e-06, + "loss": 1.4088, + "step": 1203 + }, + { + "epoch": 0.02808105957021285, + "grad_norm": 1.3839343786239624, + "learning_rate": 6.239958538481472e-06, + "loss": 1.305, + "step": 1204 + }, + { + "epoch": 0.02810438270939077, + "grad_norm": 1.6570122241973877, + "learning_rate": 6.245141228297486e-06, + "loss": 1.5596, + "step": 1205 + }, + { + "epoch": 0.028127705848568687, + "grad_norm": 1.4685866832733154, + "learning_rate": 6.250323918113501e-06, + "loss": 1.4931, + "step": 1206 + }, + { + "epoch": 0.028151028987746605, + "grad_norm": 1.263984203338623, + "learning_rate": 6.255506607929516e-06, + "loss": 1.5393, + "step": 1207 + }, + { + "epoch": 0.028174352126924523, + "grad_norm": 1.8634412288665771, + "learning_rate": 6.26068929774553e-06, + "loss": 1.2369, + "step": 1208 + }, + { + "epoch": 0.02819767526610244, + "grad_norm": 1.676034927368164, + "learning_rate": 6.2658719875615444e-06, + "loss": 1.5886, + "step": 1209 + }, + { + "epoch": 0.02822099840528036, + "grad_norm": 1.7271007299423218, + "learning_rate": 6.27105467737756e-06, + "loss": 1.2692, + "step": 1210 + }, + { + "epoch": 0.028244321544458276, + "grad_norm": 1.4238859415054321, + "learning_rate": 6.276237367193574e-06, + "loss": 1.6261, + "step": 1211 + }, + { + "epoch": 0.028267644683636194, + "grad_norm": 2.13999080657959, + "learning_rate": 6.281420057009588e-06, + "loss": 1.7009, + "step": 1212 + }, + { + "epoch": 0.028290967822814112, + "grad_norm": 2.1164069175720215, + "learning_rate": 6.286602746825603e-06, + "loss": 1.4856, + "step": 1213 + }, + { + "epoch": 0.02831429096199203, + "grad_norm": 1.6996465921401978, + "learning_rate": 6.291785436641618e-06, + "loss": 1.4621, + "step": 1214 + }, + { + "epoch": 0.028337614101169948, + "grad_norm": 1.466536045074463, + "learning_rate": 6.296968126457632e-06, + "loss": 1.5882, + "step": 1215 + }, + { + "epoch": 0.028360937240347866, + "grad_norm": 1.7248129844665527, + "learning_rate": 6.3021508162736466e-06, + "loss": 1.658, + "step": 1216 + }, + { + "epoch": 0.028384260379525784, + "grad_norm": 1.7973899841308594, + "learning_rate": 6.307333506089661e-06, + "loss": 1.4981, + "step": 1217 + }, + { + "epoch": 0.0284075835187037, + "grad_norm": 1.4502708911895752, + "learning_rate": 6.312516195905676e-06, + "loss": 1.8872, + "step": 1218 + }, + { + "epoch": 0.028430906657881616, + "grad_norm": 1.592411756515503, + "learning_rate": 6.3176988857216905e-06, + "loss": 1.4145, + "step": 1219 + }, + { + "epoch": 0.028454229797059534, + "grad_norm": 1.931400179862976, + "learning_rate": 6.322881575537705e-06, + "loss": 1.6221, + "step": 1220 + }, + { + "epoch": 0.028477552936237452, + "grad_norm": 1.5922832489013672, + "learning_rate": 6.328064265353719e-06, + "loss": 1.3897, + "step": 1221 + }, + { + "epoch": 0.02850087607541537, + "grad_norm": 1.4899603128433228, + "learning_rate": 6.333246955169734e-06, + "loss": 1.66, + "step": 1222 + }, + { + "epoch": 0.028524199214593288, + "grad_norm": 1.3820170164108276, + "learning_rate": 6.338429644985749e-06, + "loss": 1.8425, + "step": 1223 + }, + { + "epoch": 0.028547522353771206, + "grad_norm": 1.6127132177352905, + "learning_rate": 6.343612334801763e-06, + "loss": 1.3965, + "step": 1224 + }, + { + "epoch": 0.028570845492949123, + "grad_norm": 1.927259922027588, + "learning_rate": 6.348795024617777e-06, + "loss": 1.486, + "step": 1225 + }, + { + "epoch": 0.02859416863212704, + "grad_norm": 1.5987411737442017, + "learning_rate": 6.353977714433793e-06, + "loss": 1.4371, + "step": 1226 + }, + { + "epoch": 0.02861749177130496, + "grad_norm": 1.7805335521697998, + "learning_rate": 6.359160404249805e-06, + "loss": 1.56, + "step": 1227 + }, + { + "epoch": 0.028640814910482877, + "grad_norm": 1.7960704565048218, + "learning_rate": 6.3643430940658204e-06, + "loss": 1.5536, + "step": 1228 + }, + { + "epoch": 0.028664138049660795, + "grad_norm": 1.4014300107955933, + "learning_rate": 6.369525783881835e-06, + "loss": 1.4391, + "step": 1229 + }, + { + "epoch": 0.028687461188838713, + "grad_norm": 1.7049264907836914, + "learning_rate": 6.374708473697849e-06, + "loss": 1.9225, + "step": 1230 + }, + { + "epoch": 0.02871078432801663, + "grad_norm": 1.9948570728302002, + "learning_rate": 6.3798911635138635e-06, + "loss": 1.6279, + "step": 1231 + }, + { + "epoch": 0.02873410746719455, + "grad_norm": 2.101736068725586, + "learning_rate": 6.385073853329879e-06, + "loss": 1.5433, + "step": 1232 + }, + { + "epoch": 0.028757430606372463, + "grad_norm": 1.342325210571289, + "learning_rate": 6.390256543145893e-06, + "loss": 1.3606, + "step": 1233 + }, + { + "epoch": 0.02878075374555038, + "grad_norm": 1.5539692640304565, + "learning_rate": 6.395439232961907e-06, + "loss": 1.4339, + "step": 1234 + }, + { + "epoch": 0.0288040768847283, + "grad_norm": 1.6053344011306763, + "learning_rate": 6.400621922777922e-06, + "loss": 1.5735, + "step": 1235 + }, + { + "epoch": 0.028827400023906217, + "grad_norm": 1.1527775526046753, + "learning_rate": 6.405804612593937e-06, + "loss": 1.3265, + "step": 1236 + }, + { + "epoch": 0.028850723163084135, + "grad_norm": 2.401747465133667, + "learning_rate": 6.410987302409951e-06, + "loss": 1.3331, + "step": 1237 + }, + { + "epoch": 0.028874046302262053, + "grad_norm": 1.372536301612854, + "learning_rate": 6.416169992225966e-06, + "loss": 1.6371, + "step": 1238 + }, + { + "epoch": 0.02889736944143997, + "grad_norm": 1.528669834136963, + "learning_rate": 6.42135268204198e-06, + "loss": 1.4658, + "step": 1239 + }, + { + "epoch": 0.02892069258061789, + "grad_norm": 1.7370809316635132, + "learning_rate": 6.426535371857995e-06, + "loss": 1.4893, + "step": 1240 + }, + { + "epoch": 0.028944015719795806, + "grad_norm": 1.5757806301116943, + "learning_rate": 6.4317180616740095e-06, + "loss": 1.2563, + "step": 1241 + }, + { + "epoch": 0.028967338858973724, + "grad_norm": 1.2458890676498413, + "learning_rate": 6.436900751490024e-06, + "loss": 1.6522, + "step": 1242 + }, + { + "epoch": 0.028990661998151642, + "grad_norm": 1.743046760559082, + "learning_rate": 6.442083441306038e-06, + "loss": 1.6444, + "step": 1243 + }, + { + "epoch": 0.02901398513732956, + "grad_norm": 1.5543162822723389, + "learning_rate": 6.447266131122053e-06, + "loss": 1.6381, + "step": 1244 + }, + { + "epoch": 0.029037308276507478, + "grad_norm": 1.3490428924560547, + "learning_rate": 6.452448820938068e-06, + "loss": 1.4615, + "step": 1245 + }, + { + "epoch": 0.029060631415685396, + "grad_norm": 1.3732086420059204, + "learning_rate": 6.457631510754082e-06, + "loss": 1.4085, + "step": 1246 + }, + { + "epoch": 0.02908395455486331, + "grad_norm": 2.9364993572235107, + "learning_rate": 6.462814200570096e-06, + "loss": 1.4811, + "step": 1247 + }, + { + "epoch": 0.029107277694041228, + "grad_norm": 1.2069623470306396, + "learning_rate": 6.467996890386112e-06, + "loss": 1.3635, + "step": 1248 + }, + { + "epoch": 0.029130600833219146, + "grad_norm": 1.2883137464523315, + "learning_rate": 6.473179580202126e-06, + "loss": 1.4202, + "step": 1249 + }, + { + "epoch": 0.029153923972397064, + "grad_norm": 1.592976689338684, + "learning_rate": 6.47836227001814e-06, + "loss": 2.1116, + "step": 1250 + }, + { + "epoch": 0.029177247111574982, + "grad_norm": 1.394774079322815, + "learning_rate": 6.483544959834155e-06, + "loss": 1.5042, + "step": 1251 + }, + { + "epoch": 0.0292005702507529, + "grad_norm": 1.2127888202667236, + "learning_rate": 6.48872764965017e-06, + "loss": 1.3806, + "step": 1252 + }, + { + "epoch": 0.029223893389930818, + "grad_norm": 1.5445924997329712, + "learning_rate": 6.493910339466184e-06, + "loss": 1.5067, + "step": 1253 + }, + { + "epoch": 0.029247216529108735, + "grad_norm": 2.4520442485809326, + "learning_rate": 6.4990930292821985e-06, + "loss": 1.3649, + "step": 1254 + }, + { + "epoch": 0.029270539668286653, + "grad_norm": 2.032709836959839, + "learning_rate": 6.504275719098212e-06, + "loss": 1.2058, + "step": 1255 + }, + { + "epoch": 0.02929386280746457, + "grad_norm": 1.3742554187774658, + "learning_rate": 6.509458408914226e-06, + "loss": 1.4328, + "step": 1256 + }, + { + "epoch": 0.02931718594664249, + "grad_norm": 1.4859979152679443, + "learning_rate": 6.514641098730241e-06, + "loss": 1.6409, + "step": 1257 + }, + { + "epoch": 0.029340509085820407, + "grad_norm": 1.6881428956985474, + "learning_rate": 6.519823788546256e-06, + "loss": 1.6298, + "step": 1258 + }, + { + "epoch": 0.029363832224998325, + "grad_norm": 1.892412543296814, + "learning_rate": 6.52500647836227e-06, + "loss": 1.6898, + "step": 1259 + }, + { + "epoch": 0.029387155364176243, + "grad_norm": 1.4890961647033691, + "learning_rate": 6.530189168178285e-06, + "loss": 1.6164, + "step": 1260 + }, + { + "epoch": 0.029410478503354157, + "grad_norm": 1.530034065246582, + "learning_rate": 6.535371857994299e-06, + "loss": 1.4036, + "step": 1261 + }, + { + "epoch": 0.029433801642532075, + "grad_norm": 1.4801392555236816, + "learning_rate": 6.540554547810314e-06, + "loss": 1.5928, + "step": 1262 + }, + { + "epoch": 0.029457124781709993, + "grad_norm": 1.4419362545013428, + "learning_rate": 6.5457372376263285e-06, + "loss": 1.7833, + "step": 1263 + }, + { + "epoch": 0.02948044792088791, + "grad_norm": 1.6963889598846436, + "learning_rate": 6.550919927442343e-06, + "loss": 1.7366, + "step": 1264 + }, + { + "epoch": 0.02950377106006583, + "grad_norm": 1.4853816032409668, + "learning_rate": 6.556102617258357e-06, + "loss": 1.2297, + "step": 1265 + }, + { + "epoch": 0.029527094199243747, + "grad_norm": 1.6151559352874756, + "learning_rate": 6.561285307074372e-06, + "loss": 2.0062, + "step": 1266 + }, + { + "epoch": 0.029550417338421665, + "grad_norm": 1.3132925033569336, + "learning_rate": 6.566467996890387e-06, + "loss": 1.7708, + "step": 1267 + }, + { + "epoch": 0.029573740477599583, + "grad_norm": 1.4057172536849976, + "learning_rate": 6.571650686706401e-06, + "loss": 1.5811, + "step": 1268 + }, + { + "epoch": 0.0295970636167775, + "grad_norm": 1.5369668006896973, + "learning_rate": 6.5768333765224154e-06, + "loss": 1.5121, + "step": 1269 + }, + { + "epoch": 0.02962038675595542, + "grad_norm": 1.6567087173461914, + "learning_rate": 6.582016066338431e-06, + "loss": 1.2413, + "step": 1270 + }, + { + "epoch": 0.029643709895133336, + "grad_norm": 1.3374396562576294, + "learning_rate": 6.587198756154445e-06, + "loss": 1.5594, + "step": 1271 + }, + { + "epoch": 0.029667033034311254, + "grad_norm": 1.4892241954803467, + "learning_rate": 6.592381445970459e-06, + "loss": 1.6287, + "step": 1272 + }, + { + "epoch": 0.029690356173489172, + "grad_norm": 2.012141466140747, + "learning_rate": 6.597564135786474e-06, + "loss": 1.7356, + "step": 1273 + }, + { + "epoch": 0.02971367931266709, + "grad_norm": 2.2330586910247803, + "learning_rate": 6.602746825602489e-06, + "loss": 1.1928, + "step": 1274 + }, + { + "epoch": 0.029737002451845004, + "grad_norm": 1.7101742029190063, + "learning_rate": 6.607929515418503e-06, + "loss": 1.497, + "step": 1275 + }, + { + "epoch": 0.029760325591022922, + "grad_norm": 1.4773057699203491, + "learning_rate": 6.6131122052345175e-06, + "loss": 1.4135, + "step": 1276 + }, + { + "epoch": 0.02978364873020084, + "grad_norm": 1.4007784128189087, + "learning_rate": 6.618294895050532e-06, + "loss": 1.3921, + "step": 1277 + }, + { + "epoch": 0.029806971869378758, + "grad_norm": 1.7430599927902222, + "learning_rate": 6.623477584866547e-06, + "loss": 1.5352, + "step": 1278 + }, + { + "epoch": 0.029830295008556676, + "grad_norm": 2.562096118927002, + "learning_rate": 6.6286602746825614e-06, + "loss": 1.5325, + "step": 1279 + }, + { + "epoch": 0.029853618147734594, + "grad_norm": 1.192498803138733, + "learning_rate": 6.633842964498576e-06, + "loss": 1.1816, + "step": 1280 + }, + { + "epoch": 0.02987694128691251, + "grad_norm": 2.39277982711792, + "learning_rate": 6.63902565431459e-06, + "loss": 1.3732, + "step": 1281 + }, + { + "epoch": 0.02990026442609043, + "grad_norm": 1.3731800317764282, + "learning_rate": 6.644208344130604e-06, + "loss": 1.4175, + "step": 1282 + }, + { + "epoch": 0.029923587565268348, + "grad_norm": 2.297088146209717, + "learning_rate": 6.649391033946618e-06, + "loss": 1.5919, + "step": 1283 + }, + { + "epoch": 0.029946910704446265, + "grad_norm": 1.1062113046646118, + "learning_rate": 6.654573723762633e-06, + "loss": 1.3707, + "step": 1284 + }, + { + "epoch": 0.029970233843624183, + "grad_norm": 2.175673246383667, + "learning_rate": 6.6597564135786475e-06, + "loss": 1.4268, + "step": 1285 + }, + { + "epoch": 0.0299935569828021, + "grad_norm": 1.57578444480896, + "learning_rate": 6.664939103394662e-06, + "loss": 1.6065, + "step": 1286 + }, + { + "epoch": 0.03001688012198002, + "grad_norm": 1.757105827331543, + "learning_rate": 6.670121793210676e-06, + "loss": 1.5827, + "step": 1287 + }, + { + "epoch": 0.030040203261157937, + "grad_norm": 1.6778910160064697, + "learning_rate": 6.675304483026691e-06, + "loss": 1.4697, + "step": 1288 + }, + { + "epoch": 0.030063526400335855, + "grad_norm": 1.4940367937088013, + "learning_rate": 6.680487172842706e-06, + "loss": 1.2309, + "step": 1289 + }, + { + "epoch": 0.03008684953951377, + "grad_norm": 2.175011157989502, + "learning_rate": 6.68566986265872e-06, + "loss": 0.9675, + "step": 1290 + }, + { + "epoch": 0.030110172678691687, + "grad_norm": 2.0137412548065186, + "learning_rate": 6.6908525524747344e-06, + "loss": 1.6618, + "step": 1291 + }, + { + "epoch": 0.030133495817869605, + "grad_norm": 1.3541489839553833, + "learning_rate": 6.69603524229075e-06, + "loss": 1.2989, + "step": 1292 + }, + { + "epoch": 0.030156818957047523, + "grad_norm": 1.9265953302383423, + "learning_rate": 6.701217932106764e-06, + "loss": 1.3859, + "step": 1293 + }, + { + "epoch": 0.03018014209622544, + "grad_norm": 1.899145483970642, + "learning_rate": 6.706400621922778e-06, + "loss": 1.2468, + "step": 1294 + }, + { + "epoch": 0.03020346523540336, + "grad_norm": 1.6764010190963745, + "learning_rate": 6.711583311738793e-06, + "loss": 1.4796, + "step": 1295 + }, + { + "epoch": 0.030226788374581277, + "grad_norm": 1.502276062965393, + "learning_rate": 6.716766001554808e-06, + "loss": 1.6102, + "step": 1296 + }, + { + "epoch": 0.030250111513759195, + "grad_norm": 1.742180347442627, + "learning_rate": 6.721948691370822e-06, + "loss": 1.4743, + "step": 1297 + }, + { + "epoch": 0.030273434652937112, + "grad_norm": 1.503127098083496, + "learning_rate": 6.7271313811868366e-06, + "loss": 1.7023, + "step": 1298 + }, + { + "epoch": 0.03029675779211503, + "grad_norm": 1.4494696855545044, + "learning_rate": 6.732314071002851e-06, + "loss": 1.6774, + "step": 1299 + }, + { + "epoch": 0.030320080931292948, + "grad_norm": 1.3726390600204468, + "learning_rate": 6.737496760818866e-06, + "loss": 1.6272, + "step": 1300 + }, + { + "epoch": 0.030343404070470866, + "grad_norm": 1.6922540664672852, + "learning_rate": 6.7426794506348805e-06, + "loss": 1.6249, + "step": 1301 + }, + { + "epoch": 0.030366727209648784, + "grad_norm": 1.3822194337844849, + "learning_rate": 6.747862140450895e-06, + "loss": 1.779, + "step": 1302 + }, + { + "epoch": 0.030390050348826702, + "grad_norm": 1.2841784954071045, + "learning_rate": 6.753044830266909e-06, + "loss": 1.2516, + "step": 1303 + }, + { + "epoch": 0.030413373488004616, + "grad_norm": 2.045302152633667, + "learning_rate": 6.758227520082924e-06, + "loss": 1.4461, + "step": 1304 + }, + { + "epoch": 0.030436696627182534, + "grad_norm": 1.6968058347702026, + "learning_rate": 6.763410209898939e-06, + "loss": 1.545, + "step": 1305 + }, + { + "epoch": 0.030460019766360452, + "grad_norm": 1.6409857273101807, + "learning_rate": 6.768592899714953e-06, + "loss": 1.7205, + "step": 1306 + }, + { + "epoch": 0.03048334290553837, + "grad_norm": 1.2925307750701904, + "learning_rate": 6.773775589530967e-06, + "loss": 1.5889, + "step": 1307 + }, + { + "epoch": 0.030506666044716288, + "grad_norm": 1.4610506296157837, + "learning_rate": 6.778958279346982e-06, + "loss": 1.49, + "step": 1308 + }, + { + "epoch": 0.030529989183894206, + "grad_norm": 1.5941089391708374, + "learning_rate": 6.784140969162997e-06, + "loss": 1.8275, + "step": 1309 + }, + { + "epoch": 0.030553312323072124, + "grad_norm": 1.2063391208648682, + "learning_rate": 6.7893236589790104e-06, + "loss": 1.2659, + "step": 1310 + }, + { + "epoch": 0.03057663546225004, + "grad_norm": 1.512366771697998, + "learning_rate": 6.794506348795025e-06, + "loss": 1.502, + "step": 1311 + }, + { + "epoch": 0.03059995860142796, + "grad_norm": 2.0490636825561523, + "learning_rate": 6.799689038611039e-06, + "loss": 1.4567, + "step": 1312 + }, + { + "epoch": 0.030623281740605877, + "grad_norm": 2.196171522140503, + "learning_rate": 6.8048717284270535e-06, + "loss": 1.7189, + "step": 1313 + }, + { + "epoch": 0.030646604879783795, + "grad_norm": 1.434403419494629, + "learning_rate": 6.810054418243069e-06, + "loss": 1.4947, + "step": 1314 + }, + { + "epoch": 0.030669928018961713, + "grad_norm": 1.3586199283599854, + "learning_rate": 6.815237108059083e-06, + "loss": 1.5511, + "step": 1315 + }, + { + "epoch": 0.03069325115813963, + "grad_norm": 1.7212327718734741, + "learning_rate": 6.820419797875097e-06, + "loss": 1.625, + "step": 1316 + }, + { + "epoch": 0.03071657429731755, + "grad_norm": 1.7246372699737549, + "learning_rate": 6.825602487691112e-06, + "loss": 1.6043, + "step": 1317 + }, + { + "epoch": 0.030739897436495463, + "grad_norm": 1.401949405670166, + "learning_rate": 6.830785177507127e-06, + "loss": 0.9642, + "step": 1318 + }, + { + "epoch": 0.03076322057567338, + "grad_norm": 1.6501095294952393, + "learning_rate": 6.835967867323141e-06, + "loss": 1.4776, + "step": 1319 + }, + { + "epoch": 0.0307865437148513, + "grad_norm": 1.266641616821289, + "learning_rate": 6.841150557139156e-06, + "loss": 1.1332, + "step": 1320 + }, + { + "epoch": 0.030809866854029217, + "grad_norm": 1.0934447050094604, + "learning_rate": 6.84633324695517e-06, + "loss": 1.6201, + "step": 1321 + }, + { + "epoch": 0.030833189993207135, + "grad_norm": 1.4711166620254517, + "learning_rate": 6.851515936771185e-06, + "loss": 1.401, + "step": 1322 + }, + { + "epoch": 0.030856513132385053, + "grad_norm": 1.609348177909851, + "learning_rate": 6.8566986265871995e-06, + "loss": 1.5497, + "step": 1323 + }, + { + "epoch": 0.03087983627156297, + "grad_norm": 1.277185082435608, + "learning_rate": 6.861881316403214e-06, + "loss": 1.5056, + "step": 1324 + }, + { + "epoch": 0.03090315941074089, + "grad_norm": 1.4644626379013062, + "learning_rate": 6.867064006219228e-06, + "loss": 1.3443, + "step": 1325 + }, + { + "epoch": 0.030926482549918807, + "grad_norm": 1.4824533462524414, + "learning_rate": 6.872246696035243e-06, + "loss": 1.5054, + "step": 1326 + }, + { + "epoch": 0.030949805689096724, + "grad_norm": 1.4885330200195312, + "learning_rate": 6.877429385851258e-06, + "loss": 1.4403, + "step": 1327 + }, + { + "epoch": 0.030973128828274642, + "grad_norm": 1.639889121055603, + "learning_rate": 6.882612075667272e-06, + "loss": 1.7286, + "step": 1328 + }, + { + "epoch": 0.03099645196745256, + "grad_norm": 1.2644333839416504, + "learning_rate": 6.887794765483286e-06, + "loss": 1.4472, + "step": 1329 + }, + { + "epoch": 0.031019775106630478, + "grad_norm": 1.4533531665802002, + "learning_rate": 6.892977455299302e-06, + "loss": 1.6504, + "step": 1330 + }, + { + "epoch": 0.031043098245808396, + "grad_norm": 1.5860834121704102, + "learning_rate": 6.898160145115316e-06, + "loss": 1.3219, + "step": 1331 + }, + { + "epoch": 0.03106642138498631, + "grad_norm": 1.4244756698608398, + "learning_rate": 6.90334283493133e-06, + "loss": 1.2863, + "step": 1332 + }, + { + "epoch": 0.03108974452416423, + "grad_norm": 1.7279314994812012, + "learning_rate": 6.908525524747345e-06, + "loss": 1.5325, + "step": 1333 + }, + { + "epoch": 0.031113067663342146, + "grad_norm": 1.3759844303131104, + "learning_rate": 6.913708214563359e-06, + "loss": 1.7333, + "step": 1334 + }, + { + "epoch": 0.031136390802520064, + "grad_norm": 1.3596171140670776, + "learning_rate": 6.918890904379374e-06, + "loss": 1.4572, + "step": 1335 + }, + { + "epoch": 0.031159713941697982, + "grad_norm": 1.4598828554153442, + "learning_rate": 6.9240735941953885e-06, + "loss": 1.5375, + "step": 1336 + }, + { + "epoch": 0.0311830370808759, + "grad_norm": 1.7578270435333252, + "learning_rate": 6.929256284011402e-06, + "loss": 1.7456, + "step": 1337 + }, + { + "epoch": 0.031206360220053818, + "grad_norm": 1.8432106971740723, + "learning_rate": 6.934438973827416e-06, + "loss": 1.3632, + "step": 1338 + }, + { + "epoch": 0.031229683359231736, + "grad_norm": 1.3926173448562622, + "learning_rate": 6.939621663643431e-06, + "loss": 1.5246, + "step": 1339 + }, + { + "epoch": 0.031253006498409654, + "grad_norm": 1.639283299446106, + "learning_rate": 6.944804353459446e-06, + "loss": 1.4081, + "step": 1340 + }, + { + "epoch": 0.03127632963758757, + "grad_norm": 1.818247675895691, + "learning_rate": 6.94998704327546e-06, + "loss": 1.4222, + "step": 1341 + }, + { + "epoch": 0.03129965277676549, + "grad_norm": 1.7598317861557007, + "learning_rate": 6.955169733091475e-06, + "loss": 1.5457, + "step": 1342 + }, + { + "epoch": 0.03132297591594341, + "grad_norm": 1.9077101945877075, + "learning_rate": 6.960352422907489e-06, + "loss": 1.2585, + "step": 1343 + }, + { + "epoch": 0.031346299055121325, + "grad_norm": 1.7100765705108643, + "learning_rate": 6.965535112723504e-06, + "loss": 1.5487, + "step": 1344 + }, + { + "epoch": 0.03136962219429924, + "grad_norm": 1.4282541275024414, + "learning_rate": 6.9707178025395185e-06, + "loss": 1.7457, + "step": 1345 + }, + { + "epoch": 0.03139294533347716, + "grad_norm": 1.5989662408828735, + "learning_rate": 6.975900492355533e-06, + "loss": 1.7449, + "step": 1346 + }, + { + "epoch": 0.03141626847265508, + "grad_norm": 1.2489700317382812, + "learning_rate": 6.981083182171547e-06, + "loss": 1.4873, + "step": 1347 + }, + { + "epoch": 0.031439591611833, + "grad_norm": 1.60476815700531, + "learning_rate": 6.986265871987562e-06, + "loss": 1.4751, + "step": 1348 + }, + { + "epoch": 0.031462914751010915, + "grad_norm": 1.5303354263305664, + "learning_rate": 6.991448561803577e-06, + "loss": 1.5709, + "step": 1349 + }, + { + "epoch": 0.03148623789018883, + "grad_norm": 1.462499737739563, + "learning_rate": 6.996631251619591e-06, + "loss": 1.7366, + "step": 1350 + }, + { + "epoch": 0.03150956102936675, + "grad_norm": 1.4246290922164917, + "learning_rate": 7.0018139414356054e-06, + "loss": 1.1592, + "step": 1351 + }, + { + "epoch": 0.03153288416854467, + "grad_norm": 1.8897913694381714, + "learning_rate": 7.006996631251621e-06, + "loss": 1.1699, + "step": 1352 + }, + { + "epoch": 0.031556207307722586, + "grad_norm": 1.6516541242599487, + "learning_rate": 7.012179321067635e-06, + "loss": 1.4705, + "step": 1353 + }, + { + "epoch": 0.0315795304469005, + "grad_norm": 1.816272258758545, + "learning_rate": 7.017362010883649e-06, + "loss": 1.3166, + "step": 1354 + }, + { + "epoch": 0.031602853586078415, + "grad_norm": 1.631224274635315, + "learning_rate": 7.022544700699664e-06, + "loss": 1.9471, + "step": 1355 + }, + { + "epoch": 0.03162617672525633, + "grad_norm": 1.7657747268676758, + "learning_rate": 7.027727390515678e-06, + "loss": 1.6623, + "step": 1356 + }, + { + "epoch": 0.03164949986443425, + "grad_norm": 1.5499768257141113, + "learning_rate": 7.032910080331693e-06, + "loss": 1.328, + "step": 1357 + }, + { + "epoch": 0.03167282300361217, + "grad_norm": 1.5339092016220093, + "learning_rate": 7.0380927701477075e-06, + "loss": 1.79, + "step": 1358 + }, + { + "epoch": 0.03169614614279009, + "grad_norm": 2.1172358989715576, + "learning_rate": 7.043275459963722e-06, + "loss": 1.719, + "step": 1359 + }, + { + "epoch": 0.031719469281968005, + "grad_norm": 1.5365610122680664, + "learning_rate": 7.048458149779736e-06, + "loss": 1.2236, + "step": 1360 + }, + { + "epoch": 0.03174279242114592, + "grad_norm": 1.7277380228042603, + "learning_rate": 7.0536408395957514e-06, + "loss": 1.768, + "step": 1361 + }, + { + "epoch": 0.03176611556032384, + "grad_norm": 3.0157341957092285, + "learning_rate": 7.058823529411766e-06, + "loss": 1.023, + "step": 1362 + }, + { + "epoch": 0.03178943869950176, + "grad_norm": 1.682496190071106, + "learning_rate": 7.06400621922778e-06, + "loss": 1.5555, + "step": 1363 + }, + { + "epoch": 0.031812761838679676, + "grad_norm": 1.6679117679595947, + "learning_rate": 7.0691889090437945e-06, + "loss": 1.762, + "step": 1364 + }, + { + "epoch": 0.031836084977857594, + "grad_norm": 1.5026060342788696, + "learning_rate": 7.074371598859808e-06, + "loss": 1.2893, + "step": 1365 + }, + { + "epoch": 0.03185940811703551, + "grad_norm": 1.8401672840118408, + "learning_rate": 7.079554288675823e-06, + "loss": 1.4318, + "step": 1366 + }, + { + "epoch": 0.03188273125621343, + "grad_norm": 1.6953387260437012, + "learning_rate": 7.0847369784918375e-06, + "loss": 1.5304, + "step": 1367 + }, + { + "epoch": 0.03190605439539135, + "grad_norm": 1.7483880519866943, + "learning_rate": 7.089919668307852e-06, + "loss": 1.763, + "step": 1368 + }, + { + "epoch": 0.031929377534569266, + "grad_norm": 1.6970646381378174, + "learning_rate": 7.095102358123866e-06, + "loss": 1.4232, + "step": 1369 + }, + { + "epoch": 0.031952700673747184, + "grad_norm": 1.4489586353302002, + "learning_rate": 7.100285047939881e-06, + "loss": 1.2495, + "step": 1370 + }, + { + "epoch": 0.0319760238129251, + "grad_norm": 1.8368195295333862, + "learning_rate": 7.105467737755896e-06, + "loss": 1.3631, + "step": 1371 + }, + { + "epoch": 0.03199934695210302, + "grad_norm": 2.073723077774048, + "learning_rate": 7.11065042757191e-06, + "loss": 1.4958, + "step": 1372 + }, + { + "epoch": 0.03202267009128094, + "grad_norm": 1.7000291347503662, + "learning_rate": 7.1158331173879244e-06, + "loss": 1.5018, + "step": 1373 + }, + { + "epoch": 0.032045993230458855, + "grad_norm": 1.896183729171753, + "learning_rate": 7.12101580720394e-06, + "loss": 1.4754, + "step": 1374 + }, + { + "epoch": 0.03206931636963677, + "grad_norm": 1.4250632524490356, + "learning_rate": 7.126198497019954e-06, + "loss": 1.2758, + "step": 1375 + }, + { + "epoch": 0.03209263950881469, + "grad_norm": 1.968647837638855, + "learning_rate": 7.131381186835968e-06, + "loss": 1.5062, + "step": 1376 + }, + { + "epoch": 0.03211596264799261, + "grad_norm": 1.5044890642166138, + "learning_rate": 7.136563876651983e-06, + "loss": 1.7057, + "step": 1377 + }, + { + "epoch": 0.03213928578717053, + "grad_norm": 1.5252755880355835, + "learning_rate": 7.141746566467998e-06, + "loss": 1.4311, + "step": 1378 + }, + { + "epoch": 0.032162608926348445, + "grad_norm": 1.7001562118530273, + "learning_rate": 7.146929256284012e-06, + "loss": 1.5573, + "step": 1379 + }, + { + "epoch": 0.03218593206552636, + "grad_norm": 2.1587064266204834, + "learning_rate": 7.1521119461000266e-06, + "loss": 1.1552, + "step": 1380 + }, + { + "epoch": 0.03220925520470428, + "grad_norm": 1.5938003063201904, + "learning_rate": 7.157294635916041e-06, + "loss": 1.3843, + "step": 1381 + }, + { + "epoch": 0.03223257834388219, + "grad_norm": 1.5198419094085693, + "learning_rate": 7.162477325732055e-06, + "loss": 1.4412, + "step": 1382 + }, + { + "epoch": 0.03225590148306011, + "grad_norm": 1.8579787015914917, + "learning_rate": 7.1676600155480705e-06, + "loss": 1.2986, + "step": 1383 + }, + { + "epoch": 0.03227922462223803, + "grad_norm": 1.5341622829437256, + "learning_rate": 7.172842705364085e-06, + "loss": 1.2032, + "step": 1384 + }, + { + "epoch": 0.032302547761415945, + "grad_norm": 2.0681440830230713, + "learning_rate": 7.178025395180099e-06, + "loss": 1.7171, + "step": 1385 + }, + { + "epoch": 0.03232587090059386, + "grad_norm": 1.7611883878707886, + "learning_rate": 7.1832080849961135e-06, + "loss": 1.3376, + "step": 1386 + }, + { + "epoch": 0.03234919403977178, + "grad_norm": 1.6917016506195068, + "learning_rate": 7.188390774812129e-06, + "loss": 1.3909, + "step": 1387 + }, + { + "epoch": 0.0323725171789497, + "grad_norm": 1.1238902807235718, + "learning_rate": 7.193573464628143e-06, + "loss": 1.1826, + "step": 1388 + }, + { + "epoch": 0.03239584031812762, + "grad_norm": 1.5484822988510132, + "learning_rate": 7.198756154444157e-06, + "loss": 1.4476, + "step": 1389 + }, + { + "epoch": 0.032419163457305535, + "grad_norm": 1.703244686126709, + "learning_rate": 7.203938844260172e-06, + "loss": 1.5256, + "step": 1390 + }, + { + "epoch": 0.03244248659648345, + "grad_norm": 2.350940465927124, + "learning_rate": 7.209121534076187e-06, + "loss": 1.4486, + "step": 1391 + }, + { + "epoch": 0.03246580973566137, + "grad_norm": 1.2115894556045532, + "learning_rate": 7.2143042238922004e-06, + "loss": 1.2387, + "step": 1392 + }, + { + "epoch": 0.03248913287483929, + "grad_norm": 1.4883688688278198, + "learning_rate": 7.219486913708215e-06, + "loss": 1.4499, + "step": 1393 + }, + { + "epoch": 0.032512456014017206, + "grad_norm": 1.2324401140213013, + "learning_rate": 7.224669603524229e-06, + "loss": 1.3548, + "step": 1394 + }, + { + "epoch": 0.032535779153195124, + "grad_norm": 2.054262638092041, + "learning_rate": 7.2298522933402435e-06, + "loss": 1.4986, + "step": 1395 + }, + { + "epoch": 0.03255910229237304, + "grad_norm": 1.7639497518539429, + "learning_rate": 7.235034983156259e-06, + "loss": 1.4023, + "step": 1396 + }, + { + "epoch": 0.03258242543155096, + "grad_norm": 1.3556314706802368, + "learning_rate": 7.240217672972273e-06, + "loss": 1.4122, + "step": 1397 + }, + { + "epoch": 0.03260574857072888, + "grad_norm": 1.8941506147384644, + "learning_rate": 7.245400362788287e-06, + "loss": 1.1754, + "step": 1398 + }, + { + "epoch": 0.032629071709906796, + "grad_norm": 1.7958110570907593, + "learning_rate": 7.250583052604302e-06, + "loss": 1.9056, + "step": 1399 + }, + { + "epoch": 0.032652394849084714, + "grad_norm": 1.3702186346054077, + "learning_rate": 7.255765742420317e-06, + "loss": 1.5533, + "step": 1400 + }, + { + "epoch": 0.03267571798826263, + "grad_norm": 1.4540181159973145, + "learning_rate": 7.260948432236331e-06, + "loss": 1.4704, + "step": 1401 + }, + { + "epoch": 0.03269904112744055, + "grad_norm": 1.6024681329727173, + "learning_rate": 7.266131122052346e-06, + "loss": 1.4394, + "step": 1402 + }, + { + "epoch": 0.03272236426661847, + "grad_norm": 1.5546940565109253, + "learning_rate": 7.27131381186836e-06, + "loss": 1.54, + "step": 1403 + }, + { + "epoch": 0.032745687405796385, + "grad_norm": 1.5781769752502441, + "learning_rate": 7.276496501684375e-06, + "loss": 1.3658, + "step": 1404 + }, + { + "epoch": 0.0327690105449743, + "grad_norm": 1.4951281547546387, + "learning_rate": 7.2816791915003895e-06, + "loss": 1.3768, + "step": 1405 + }, + { + "epoch": 0.03279233368415222, + "grad_norm": 1.9413893222808838, + "learning_rate": 7.286861881316404e-06, + "loss": 1.3878, + "step": 1406 + }, + { + "epoch": 0.03281565682333014, + "grad_norm": 1.6263363361358643, + "learning_rate": 7.292044571132418e-06, + "loss": 1.4236, + "step": 1407 + }, + { + "epoch": 0.03283897996250806, + "grad_norm": 2.2151589393615723, + "learning_rate": 7.2972272609484325e-06, + "loss": 1.7296, + "step": 1408 + }, + { + "epoch": 0.032862303101685975, + "grad_norm": 1.3772640228271484, + "learning_rate": 7.302409950764448e-06, + "loss": 1.292, + "step": 1409 + }, + { + "epoch": 0.03288562624086389, + "grad_norm": 1.7607418298721313, + "learning_rate": 7.307592640580462e-06, + "loss": 1.6019, + "step": 1410 + }, + { + "epoch": 0.0329089493800418, + "grad_norm": 1.9470393657684326, + "learning_rate": 7.312775330396476e-06, + "loss": 1.3396, + "step": 1411 + }, + { + "epoch": 0.03293227251921972, + "grad_norm": 2.021190881729126, + "learning_rate": 7.317958020212491e-06, + "loss": 1.6207, + "step": 1412 + }, + { + "epoch": 0.03295559565839764, + "grad_norm": 1.7311667203903198, + "learning_rate": 7.323140710028506e-06, + "loss": 1.6409, + "step": 1413 + }, + { + "epoch": 0.03297891879757556, + "grad_norm": 1.6784627437591553, + "learning_rate": 7.32832339984452e-06, + "loss": 1.5595, + "step": 1414 + }, + { + "epoch": 0.033002241936753475, + "grad_norm": 1.517193078994751, + "learning_rate": 7.333506089660535e-06, + "loss": 1.574, + "step": 1415 + }, + { + "epoch": 0.03302556507593139, + "grad_norm": 1.4831286668777466, + "learning_rate": 7.338688779476549e-06, + "loss": 0.8727, + "step": 1416 + }, + { + "epoch": 0.03304888821510931, + "grad_norm": 1.6477752923965454, + "learning_rate": 7.343871469292564e-06, + "loss": 1.559, + "step": 1417 + }, + { + "epoch": 0.03307221135428723, + "grad_norm": 1.853326678276062, + "learning_rate": 7.3490541591085785e-06, + "loss": 1.8523, + "step": 1418 + }, + { + "epoch": 0.03309553449346515, + "grad_norm": 1.6894885301589966, + "learning_rate": 7.354236848924593e-06, + "loss": 1.5844, + "step": 1419 + }, + { + "epoch": 0.033118857632643064, + "grad_norm": 1.6442736387252808, + "learning_rate": 7.359419538740606e-06, + "loss": 1.986, + "step": 1420 + }, + { + "epoch": 0.03314218077182098, + "grad_norm": 1.787266731262207, + "learning_rate": 7.364602228556621e-06, + "loss": 1.4822, + "step": 1421 + }, + { + "epoch": 0.0331655039109989, + "grad_norm": 2.073798418045044, + "learning_rate": 7.369784918372636e-06, + "loss": 1.637, + "step": 1422 + }, + { + "epoch": 0.03318882705017682, + "grad_norm": 1.3428417444229126, + "learning_rate": 7.37496760818865e-06, + "loss": 1.5598, + "step": 1423 + }, + { + "epoch": 0.033212150189354736, + "grad_norm": 1.5737829208374023, + "learning_rate": 7.380150298004665e-06, + "loss": 1.2274, + "step": 1424 + }, + { + "epoch": 0.033235473328532654, + "grad_norm": 2.1165404319763184, + "learning_rate": 7.385332987820679e-06, + "loss": 1.4134, + "step": 1425 + }, + { + "epoch": 0.03325879646771057, + "grad_norm": 1.5476047992706299, + "learning_rate": 7.390515677636694e-06, + "loss": 1.6364, + "step": 1426 + }, + { + "epoch": 0.03328211960688849, + "grad_norm": 1.6927748918533325, + "learning_rate": 7.3956983674527085e-06, + "loss": 1.6977, + "step": 1427 + }, + { + "epoch": 0.03330544274606641, + "grad_norm": 1.4677228927612305, + "learning_rate": 7.400881057268723e-06, + "loss": 1.4168, + "step": 1428 + }, + { + "epoch": 0.033328765885244326, + "grad_norm": 1.5205353498458862, + "learning_rate": 7.406063747084737e-06, + "loss": 1.2843, + "step": 1429 + }, + { + "epoch": 0.03335208902442224, + "grad_norm": 1.5447300672531128, + "learning_rate": 7.411246436900752e-06, + "loss": 1.5689, + "step": 1430 + }, + { + "epoch": 0.03337541216360016, + "grad_norm": 1.63996160030365, + "learning_rate": 7.416429126716767e-06, + "loss": 1.5884, + "step": 1431 + }, + { + "epoch": 0.03339873530277808, + "grad_norm": 1.452081322669983, + "learning_rate": 7.421611816532781e-06, + "loss": 1.3101, + "step": 1432 + }, + { + "epoch": 0.033422058441956, + "grad_norm": 1.7910422086715698, + "learning_rate": 7.426794506348795e-06, + "loss": 1.4715, + "step": 1433 + }, + { + "epoch": 0.033445381581133915, + "grad_norm": 1.983233094215393, + "learning_rate": 7.43197719616481e-06, + "loss": 1.5482, + "step": 1434 + }, + { + "epoch": 0.03346870472031183, + "grad_norm": 1.767785906791687, + "learning_rate": 7.437159885980825e-06, + "loss": 1.6462, + "step": 1435 + }, + { + "epoch": 0.03349202785948975, + "grad_norm": 1.6161593198776245, + "learning_rate": 7.442342575796839e-06, + "loss": 1.279, + "step": 1436 + }, + { + "epoch": 0.03351535099866767, + "grad_norm": 1.4756333827972412, + "learning_rate": 7.447525265612854e-06, + "loss": 1.5475, + "step": 1437 + }, + { + "epoch": 0.03353867413784559, + "grad_norm": 1.8089308738708496, + "learning_rate": 7.452707955428868e-06, + "loss": 1.8059, + "step": 1438 + }, + { + "epoch": 0.0335619972770235, + "grad_norm": 1.6815400123596191, + "learning_rate": 7.457890645244883e-06, + "loss": 1.7294, + "step": 1439 + }, + { + "epoch": 0.033585320416201415, + "grad_norm": 2.2101638317108154, + "learning_rate": 7.4630733350608975e-06, + "loss": 1.1257, + "step": 1440 + }, + { + "epoch": 0.03360864355537933, + "grad_norm": 1.4447871446609497, + "learning_rate": 7.468256024876912e-06, + "loss": 1.5934, + "step": 1441 + }, + { + "epoch": 0.03363196669455725, + "grad_norm": 1.8209795951843262, + "learning_rate": 7.473438714692926e-06, + "loss": 1.4218, + "step": 1442 + }, + { + "epoch": 0.03365528983373517, + "grad_norm": 1.4553669691085815, + "learning_rate": 7.4786214045089414e-06, + "loss": 1.2059, + "step": 1443 + }, + { + "epoch": 0.03367861297291309, + "grad_norm": 1.7106033563613892, + "learning_rate": 7.483804094324956e-06, + "loss": 1.1671, + "step": 1444 + }, + { + "epoch": 0.033701936112091005, + "grad_norm": 1.3894087076187134, + "learning_rate": 7.48898678414097e-06, + "loss": 1.4522, + "step": 1445 + }, + { + "epoch": 0.03372525925126892, + "grad_norm": 1.1842654943466187, + "learning_rate": 7.4941694739569845e-06, + "loss": 1.4706, + "step": 1446 + }, + { + "epoch": 0.03374858239044684, + "grad_norm": 2.5644612312316895, + "learning_rate": 7.499352163773e-06, + "loss": 1.6062, + "step": 1447 + }, + { + "epoch": 0.03377190552962476, + "grad_norm": 1.5129215717315674, + "learning_rate": 7.504534853589013e-06, + "loss": 1.2178, + "step": 1448 + }, + { + "epoch": 0.033795228668802677, + "grad_norm": 1.7350616455078125, + "learning_rate": 7.5097175434050275e-06, + "loss": 1.7579, + "step": 1449 + }, + { + "epoch": 0.033818551807980594, + "grad_norm": 2.163621187210083, + "learning_rate": 7.514900233221042e-06, + "loss": 1.6504, + "step": 1450 + }, + { + "epoch": 0.03384187494715851, + "grad_norm": 1.946423888206482, + "learning_rate": 7.520082923037056e-06, + "loss": 1.6524, + "step": 1451 + }, + { + "epoch": 0.03386519808633643, + "grad_norm": 1.766641616821289, + "learning_rate": 7.525265612853071e-06, + "loss": 1.1683, + "step": 1452 + }, + { + "epoch": 0.03388852122551435, + "grad_norm": 1.928938627243042, + "learning_rate": 7.530448302669086e-06, + "loss": 1.4919, + "step": 1453 + }, + { + "epoch": 0.033911844364692266, + "grad_norm": 1.5574640035629272, + "learning_rate": 7.5356309924851e-06, + "loss": 1.3775, + "step": 1454 + }, + { + "epoch": 0.033935167503870184, + "grad_norm": 1.6000114679336548, + "learning_rate": 7.5408136823011144e-06, + "loss": 1.8033, + "step": 1455 + }, + { + "epoch": 0.0339584906430481, + "grad_norm": 1.4576321840286255, + "learning_rate": 7.545996372117129e-06, + "loss": 1.6291, + "step": 1456 + }, + { + "epoch": 0.03398181378222602, + "grad_norm": 1.67397940158844, + "learning_rate": 7.551179061933144e-06, + "loss": 1.501, + "step": 1457 + }, + { + "epoch": 0.03400513692140394, + "grad_norm": 1.6351300477981567, + "learning_rate": 7.556361751749158e-06, + "loss": 1.4177, + "step": 1458 + }, + { + "epoch": 0.034028460060581855, + "grad_norm": 1.806840181350708, + "learning_rate": 7.561544441565173e-06, + "loss": 1.2173, + "step": 1459 + }, + { + "epoch": 0.03405178319975977, + "grad_norm": 2.1059956550598145, + "learning_rate": 7.566727131381187e-06, + "loss": 1.2487, + "step": 1460 + }, + { + "epoch": 0.03407510633893769, + "grad_norm": 1.5448449850082397, + "learning_rate": 7.571909821197202e-06, + "loss": 1.4264, + "step": 1461 + }, + { + "epoch": 0.03409842947811561, + "grad_norm": 2.8610997200012207, + "learning_rate": 7.5770925110132166e-06, + "loss": 1.3305, + "step": 1462 + }, + { + "epoch": 0.03412175261729353, + "grad_norm": 1.7565038204193115, + "learning_rate": 7.582275200829231e-06, + "loss": 1.6971, + "step": 1463 + }, + { + "epoch": 0.034145075756471445, + "grad_norm": 1.5691516399383545, + "learning_rate": 7.587457890645245e-06, + "loss": 1.6759, + "step": 1464 + }, + { + "epoch": 0.03416839889564936, + "grad_norm": 1.4603890180587769, + "learning_rate": 7.5926405804612605e-06, + "loss": 1.6264, + "step": 1465 + }, + { + "epoch": 0.03419172203482728, + "grad_norm": 1.5885038375854492, + "learning_rate": 7.597823270277275e-06, + "loss": 1.124, + "step": 1466 + }, + { + "epoch": 0.0342150451740052, + "grad_norm": 1.4058237075805664, + "learning_rate": 7.603005960093289e-06, + "loss": 1.4257, + "step": 1467 + }, + { + "epoch": 0.03423836831318311, + "grad_norm": 1.552217721939087, + "learning_rate": 7.6081886499093035e-06, + "loss": 1.2563, + "step": 1468 + }, + { + "epoch": 0.03426169145236103, + "grad_norm": 2.235629081726074, + "learning_rate": 7.613371339725319e-06, + "loss": 1.8083, + "step": 1469 + }, + { + "epoch": 0.034285014591538945, + "grad_norm": 1.8639624118804932, + "learning_rate": 7.618554029541333e-06, + "loss": 1.5186, + "step": 1470 + }, + { + "epoch": 0.03430833773071686, + "grad_norm": 2.1537373065948486, + "learning_rate": 7.623736719357347e-06, + "loss": 1.3531, + "step": 1471 + }, + { + "epoch": 0.03433166086989478, + "grad_norm": 1.9041272401809692, + "learning_rate": 7.628919409173362e-06, + "loss": 1.4382, + "step": 1472 + }, + { + "epoch": 0.0343549840090727, + "grad_norm": 1.5207409858703613, + "learning_rate": 7.634102098989377e-06, + "loss": 1.349, + "step": 1473 + }, + { + "epoch": 0.03437830714825062, + "grad_norm": 1.446553349494934, + "learning_rate": 7.639284788805391e-06, + "loss": 1.1513, + "step": 1474 + }, + { + "epoch": 0.034401630287428535, + "grad_norm": 1.5411823987960815, + "learning_rate": 7.644467478621404e-06, + "loss": 1.2673, + "step": 1475 + }, + { + "epoch": 0.03442495342660645, + "grad_norm": 1.588210105895996, + "learning_rate": 7.64965016843742e-06, + "loss": 1.3414, + "step": 1476 + }, + { + "epoch": 0.03444827656578437, + "grad_norm": 1.4371896982192993, + "learning_rate": 7.654832858253434e-06, + "loss": 1.3386, + "step": 1477 + }, + { + "epoch": 0.03447159970496229, + "grad_norm": 1.2713488340377808, + "learning_rate": 7.660015548069449e-06, + "loss": 1.3674, + "step": 1478 + }, + { + "epoch": 0.034494922844140206, + "grad_norm": 1.9180690050125122, + "learning_rate": 7.665198237885463e-06, + "loss": 1.3761, + "step": 1479 + }, + { + "epoch": 0.034518245983318124, + "grad_norm": 1.7977988719940186, + "learning_rate": 7.670380927701477e-06, + "loss": 1.5416, + "step": 1480 + }, + { + "epoch": 0.03454156912249604, + "grad_norm": 1.6764715909957886, + "learning_rate": 7.675563617517492e-06, + "loss": 1.9225, + "step": 1481 + }, + { + "epoch": 0.03456489226167396, + "grad_norm": 1.8952007293701172, + "learning_rate": 7.680746307333506e-06, + "loss": 1.545, + "step": 1482 + }, + { + "epoch": 0.03458821540085188, + "grad_norm": 1.2648754119873047, + "learning_rate": 7.68592899714952e-06, + "loss": 1.3556, + "step": 1483 + }, + { + "epoch": 0.034611538540029796, + "grad_norm": 1.5882269144058228, + "learning_rate": 7.691111686965535e-06, + "loss": 1.4676, + "step": 1484 + }, + { + "epoch": 0.034634861679207714, + "grad_norm": 1.4746918678283691, + "learning_rate": 7.69629437678155e-06, + "loss": 1.5869, + "step": 1485 + }, + { + "epoch": 0.03465818481838563, + "grad_norm": 1.6212809085845947, + "learning_rate": 7.701477066597565e-06, + "loss": 1.7056, + "step": 1486 + }, + { + "epoch": 0.03468150795756355, + "grad_norm": 2.3746814727783203, + "learning_rate": 7.70665975641358e-06, + "loss": 1.4726, + "step": 1487 + }, + { + "epoch": 0.03470483109674147, + "grad_norm": 1.5706418752670288, + "learning_rate": 7.711842446229594e-06, + "loss": 1.3319, + "step": 1488 + }, + { + "epoch": 0.034728154235919385, + "grad_norm": 1.6811712980270386, + "learning_rate": 7.717025136045608e-06, + "loss": 1.4744, + "step": 1489 + }, + { + "epoch": 0.0347514773750973, + "grad_norm": 1.839852213859558, + "learning_rate": 7.722207825861623e-06, + "loss": 1.3563, + "step": 1490 + }, + { + "epoch": 0.03477480051427522, + "grad_norm": 1.2929447889328003, + "learning_rate": 7.727390515677637e-06, + "loss": 1.7944, + "step": 1491 + }, + { + "epoch": 0.03479812365345314, + "grad_norm": 1.7659885883331299, + "learning_rate": 7.732573205493651e-06, + "loss": 1.6265, + "step": 1492 + }, + { + "epoch": 0.03482144679263106, + "grad_norm": 1.7670022249221802, + "learning_rate": 7.737755895309667e-06, + "loss": 1.6311, + "step": 1493 + }, + { + "epoch": 0.034844769931808975, + "grad_norm": 1.7348347902297974, + "learning_rate": 7.742938585125682e-06, + "loss": 1.4573, + "step": 1494 + }, + { + "epoch": 0.03486809307098689, + "grad_norm": 1.5826637744903564, + "learning_rate": 7.748121274941696e-06, + "loss": 1.8183, + "step": 1495 + }, + { + "epoch": 0.034891416210164804, + "grad_norm": 1.6276066303253174, + "learning_rate": 7.75330396475771e-06, + "loss": 1.5105, + "step": 1496 + }, + { + "epoch": 0.03491473934934272, + "grad_norm": 1.4175602197647095, + "learning_rate": 7.758486654573725e-06, + "loss": 1.4397, + "step": 1497 + }, + { + "epoch": 0.03493806248852064, + "grad_norm": 1.2575039863586426, + "learning_rate": 7.763669344389739e-06, + "loss": 1.3638, + "step": 1498 + }, + { + "epoch": 0.03496138562769856, + "grad_norm": 1.591441035270691, + "learning_rate": 7.768852034205753e-06, + "loss": 1.2515, + "step": 1499 + }, + { + "epoch": 0.034984708766876475, + "grad_norm": 1.8170280456542969, + "learning_rate": 7.774034724021768e-06, + "loss": 1.6124, + "step": 1500 + }, + { + "epoch": 0.03500803190605439, + "grad_norm": 1.825690507888794, + "learning_rate": 7.779217413837784e-06, + "loss": 1.5076, + "step": 1501 + }, + { + "epoch": 0.03503135504523231, + "grad_norm": 1.61045241355896, + "learning_rate": 7.784400103653798e-06, + "loss": 1.5944, + "step": 1502 + }, + { + "epoch": 0.03505467818441023, + "grad_norm": 2.1213035583496094, + "learning_rate": 7.78958279346981e-06, + "loss": 1.561, + "step": 1503 + }, + { + "epoch": 0.03507800132358815, + "grad_norm": 1.5680464506149292, + "learning_rate": 7.794765483285825e-06, + "loss": 1.1515, + "step": 1504 + }, + { + "epoch": 0.035101324462766065, + "grad_norm": 1.7792956829071045, + "learning_rate": 7.79994817310184e-06, + "loss": 1.7459, + "step": 1505 + }, + { + "epoch": 0.03512464760194398, + "grad_norm": 1.5262699127197266, + "learning_rate": 7.805130862917854e-06, + "loss": 1.4087, + "step": 1506 + }, + { + "epoch": 0.0351479707411219, + "grad_norm": 1.9013603925704956, + "learning_rate": 7.81031355273387e-06, + "loss": 1.745, + "step": 1507 + }, + { + "epoch": 0.03517129388029982, + "grad_norm": 2.1864850521087646, + "learning_rate": 7.815496242549884e-06, + "loss": 1.6892, + "step": 1508 + }, + { + "epoch": 0.035194617019477736, + "grad_norm": 1.6094999313354492, + "learning_rate": 7.820678932365898e-06, + "loss": 1.1677, + "step": 1509 + }, + { + "epoch": 0.035217940158655654, + "grad_norm": 1.6659038066864014, + "learning_rate": 7.825861622181913e-06, + "loss": 1.3676, + "step": 1510 + }, + { + "epoch": 0.03524126329783357, + "grad_norm": 1.5591635704040527, + "learning_rate": 7.831044311997927e-06, + "loss": 1.3353, + "step": 1511 + }, + { + "epoch": 0.03526458643701149, + "grad_norm": 1.6324151754379272, + "learning_rate": 7.836227001813942e-06, + "loss": 1.5816, + "step": 1512 + }, + { + "epoch": 0.03528790957618941, + "grad_norm": 1.8007915019989014, + "learning_rate": 7.841409691629956e-06, + "loss": 1.9207, + "step": 1513 + }, + { + "epoch": 0.035311232715367326, + "grad_norm": 1.6061041355133057, + "learning_rate": 7.84659238144597e-06, + "loss": 1.6949, + "step": 1514 + }, + { + "epoch": 0.035334555854545244, + "grad_norm": 1.5150330066680908, + "learning_rate": 7.851775071261986e-06, + "loss": 1.7975, + "step": 1515 + }, + { + "epoch": 0.03535787899372316, + "grad_norm": 1.7966561317443848, + "learning_rate": 7.856957761078e-06, + "loss": 1.3558, + "step": 1516 + }, + { + "epoch": 0.03538120213290108, + "grad_norm": 1.6751410961151123, + "learning_rate": 7.862140450894015e-06, + "loss": 1.3387, + "step": 1517 + }, + { + "epoch": 0.035404525272079, + "grad_norm": 1.7746779918670654, + "learning_rate": 7.86732314071003e-06, + "loss": 1.8068, + "step": 1518 + }, + { + "epoch": 0.035427848411256915, + "grad_norm": 1.4943839311599731, + "learning_rate": 7.872505830526044e-06, + "loss": 1.4922, + "step": 1519 + }, + { + "epoch": 0.03545117155043483, + "grad_norm": 1.3683398962020874, + "learning_rate": 7.877688520342058e-06, + "loss": 1.382, + "step": 1520 + }, + { + "epoch": 0.03547449468961275, + "grad_norm": 1.6939599514007568, + "learning_rate": 7.882871210158072e-06, + "loss": 1.6179, + "step": 1521 + }, + { + "epoch": 0.03549781782879067, + "grad_norm": 1.4292916059494019, + "learning_rate": 7.888053899974087e-06, + "loss": 1.4422, + "step": 1522 + }, + { + "epoch": 0.03552114096796859, + "grad_norm": 1.96234929561615, + "learning_rate": 7.893236589790103e-06, + "loss": 1.1728, + "step": 1523 + }, + { + "epoch": 0.035544464107146505, + "grad_norm": 1.8289707899093628, + "learning_rate": 7.898419279606117e-06, + "loss": 1.4281, + "step": 1524 + }, + { + "epoch": 0.035567787246324416, + "grad_norm": 1.563638687133789, + "learning_rate": 7.903601969422131e-06, + "loss": 1.441, + "step": 1525 + }, + { + "epoch": 0.035591110385502334, + "grad_norm": 1.7753417491912842, + "learning_rate": 7.908784659238146e-06, + "loss": 1.5371, + "step": 1526 + }, + { + "epoch": 0.03561443352468025, + "grad_norm": 1.4442288875579834, + "learning_rate": 7.91396734905416e-06, + "loss": 1.21, + "step": 1527 + }, + { + "epoch": 0.03563775666385817, + "grad_norm": 1.5175955295562744, + "learning_rate": 7.919150038870174e-06, + "loss": 1.2075, + "step": 1528 + }, + { + "epoch": 0.03566107980303609, + "grad_norm": 1.6752229928970337, + "learning_rate": 7.924332728686189e-06, + "loss": 1.2919, + "step": 1529 + }, + { + "epoch": 0.035684402942214005, + "grad_norm": 1.7506253719329834, + "learning_rate": 7.929515418502203e-06, + "loss": 1.5369, + "step": 1530 + }, + { + "epoch": 0.03570772608139192, + "grad_norm": 1.9442663192749023, + "learning_rate": 7.934698108318218e-06, + "loss": 1.6756, + "step": 1531 + }, + { + "epoch": 0.03573104922056984, + "grad_norm": 1.658495545387268, + "learning_rate": 7.939880798134232e-06, + "loss": 1.2362, + "step": 1532 + }, + { + "epoch": 0.03575437235974776, + "grad_norm": 1.2289533615112305, + "learning_rate": 7.945063487950246e-06, + "loss": 1.5051, + "step": 1533 + }, + { + "epoch": 0.03577769549892568, + "grad_norm": 1.5502135753631592, + "learning_rate": 7.95024617776626e-06, + "loss": 1.3702, + "step": 1534 + }, + { + "epoch": 0.035801018638103595, + "grad_norm": 1.8727954626083374, + "learning_rate": 7.955428867582275e-06, + "loss": 1.417, + "step": 1535 + }, + { + "epoch": 0.03582434177728151, + "grad_norm": 1.1890602111816406, + "learning_rate": 7.96061155739829e-06, + "loss": 1.1737, + "step": 1536 + }, + { + "epoch": 0.03584766491645943, + "grad_norm": 5.72725772857666, + "learning_rate": 7.965794247214305e-06, + "loss": 1.3097, + "step": 1537 + }, + { + "epoch": 0.03587098805563735, + "grad_norm": 1.2847952842712402, + "learning_rate": 7.97097693703032e-06, + "loss": 1.456, + "step": 1538 + }, + { + "epoch": 0.035894311194815266, + "grad_norm": 2.3652467727661133, + "learning_rate": 7.976159626846334e-06, + "loss": 1.7498, + "step": 1539 + }, + { + "epoch": 0.035917634333993184, + "grad_norm": 2.2748360633850098, + "learning_rate": 7.981342316662348e-06, + "loss": 1.4181, + "step": 1540 + }, + { + "epoch": 0.0359409574731711, + "grad_norm": 1.9288114309310913, + "learning_rate": 7.986525006478363e-06, + "loss": 1.4505, + "step": 1541 + }, + { + "epoch": 0.03596428061234902, + "grad_norm": 1.9735311269760132, + "learning_rate": 7.991707696294377e-06, + "loss": 1.5196, + "step": 1542 + }, + { + "epoch": 0.03598760375152694, + "grad_norm": 1.5026898384094238, + "learning_rate": 7.996890386110391e-06, + "loss": 1.2868, + "step": 1543 + }, + { + "epoch": 0.036010926890704856, + "grad_norm": 1.4773675203323364, + "learning_rate": 8.002073075926406e-06, + "loss": 1.3777, + "step": 1544 + }, + { + "epoch": 0.036034250029882774, + "grad_norm": 1.7095143795013428, + "learning_rate": 8.007255765742422e-06, + "loss": 1.2692, + "step": 1545 + }, + { + "epoch": 0.03605757316906069, + "grad_norm": 1.7218233346939087, + "learning_rate": 8.012438455558436e-06, + "loss": 1.4015, + "step": 1546 + }, + { + "epoch": 0.03608089630823861, + "grad_norm": 1.5240681171417236, + "learning_rate": 8.01762114537445e-06, + "loss": 1.5267, + "step": 1547 + }, + { + "epoch": 0.03610421944741653, + "grad_norm": 1.9092682600021362, + "learning_rate": 8.022803835190465e-06, + "loss": 1.2564, + "step": 1548 + }, + { + "epoch": 0.036127542586594445, + "grad_norm": 1.844650149345398, + "learning_rate": 8.027986525006479e-06, + "loss": 1.5158, + "step": 1549 + }, + { + "epoch": 0.03615086572577236, + "grad_norm": 1.5689501762390137, + "learning_rate": 8.033169214822493e-06, + "loss": 1.5708, + "step": 1550 + }, + { + "epoch": 0.03617418886495028, + "grad_norm": 2.210259437561035, + "learning_rate": 8.038351904638508e-06, + "loss": 1.5915, + "step": 1551 + }, + { + "epoch": 0.0361975120041282, + "grad_norm": 1.4000816345214844, + "learning_rate": 8.043534594454522e-06, + "loss": 1.2189, + "step": 1552 + }, + { + "epoch": 0.03622083514330611, + "grad_norm": 1.4790806770324707, + "learning_rate": 8.048717284270538e-06, + "loss": 1.3637, + "step": 1553 + }, + { + "epoch": 0.03624415828248403, + "grad_norm": 1.9432685375213623, + "learning_rate": 8.053899974086553e-06, + "loss": 1.4459, + "step": 1554 + }, + { + "epoch": 0.036267481421661946, + "grad_norm": 1.9427974224090576, + "learning_rate": 8.059082663902567e-06, + "loss": 1.8405, + "step": 1555 + }, + { + "epoch": 0.036290804560839864, + "grad_norm": 1.6169490814208984, + "learning_rate": 8.064265353718581e-06, + "loss": 1.5894, + "step": 1556 + }, + { + "epoch": 0.03631412770001778, + "grad_norm": 2.189110517501831, + "learning_rate": 8.069448043534596e-06, + "loss": 1.4458, + "step": 1557 + }, + { + "epoch": 0.0363374508391957, + "grad_norm": 1.6950788497924805, + "learning_rate": 8.074630733350608e-06, + "loss": 1.2485, + "step": 1558 + }, + { + "epoch": 0.03636077397837362, + "grad_norm": 1.5580222606658936, + "learning_rate": 8.079813423166624e-06, + "loss": 1.3971, + "step": 1559 + }, + { + "epoch": 0.036384097117551535, + "grad_norm": 1.68899405002594, + "learning_rate": 8.084996112982639e-06, + "loss": 1.5722, + "step": 1560 + }, + { + "epoch": 0.03640742025672945, + "grad_norm": 1.6812056303024292, + "learning_rate": 8.090178802798653e-06, + "loss": 1.8336, + "step": 1561 + }, + { + "epoch": 0.03643074339590737, + "grad_norm": 2.962195634841919, + "learning_rate": 8.095361492614667e-06, + "loss": 1.7488, + "step": 1562 + }, + { + "epoch": 0.03645406653508529, + "grad_norm": 1.6132487058639526, + "learning_rate": 8.100544182430682e-06, + "loss": 1.9727, + "step": 1563 + }, + { + "epoch": 0.03647738967426321, + "grad_norm": 1.6288578510284424, + "learning_rate": 8.105726872246696e-06, + "loss": 1.6962, + "step": 1564 + }, + { + "epoch": 0.036500712813441125, + "grad_norm": 1.5894676446914673, + "learning_rate": 8.11090956206271e-06, + "loss": 1.7313, + "step": 1565 + }, + { + "epoch": 0.03652403595261904, + "grad_norm": 1.702314019203186, + "learning_rate": 8.116092251878725e-06, + "loss": 1.5682, + "step": 1566 + }, + { + "epoch": 0.03654735909179696, + "grad_norm": 2.3464395999908447, + "learning_rate": 8.12127494169474e-06, + "loss": 1.1367, + "step": 1567 + }, + { + "epoch": 0.03657068223097488, + "grad_norm": 1.3930420875549316, + "learning_rate": 8.126457631510755e-06, + "loss": 1.2127, + "step": 1568 + }, + { + "epoch": 0.036594005370152796, + "grad_norm": 1.964519739151001, + "learning_rate": 8.13164032132677e-06, + "loss": 1.5458, + "step": 1569 + }, + { + "epoch": 0.036617328509330714, + "grad_norm": 1.7511687278747559, + "learning_rate": 8.136823011142784e-06, + "loss": 1.4957, + "step": 1570 + }, + { + "epoch": 0.03664065164850863, + "grad_norm": 1.403041958808899, + "learning_rate": 8.142005700958798e-06, + "loss": 1.5422, + "step": 1571 + }, + { + "epoch": 0.03666397478768655, + "grad_norm": 2.368617534637451, + "learning_rate": 8.147188390774813e-06, + "loss": 1.2203, + "step": 1572 + }, + { + "epoch": 0.03668729792686447, + "grad_norm": 1.7351584434509277, + "learning_rate": 8.152371080590827e-06, + "loss": 1.5534, + "step": 1573 + }, + { + "epoch": 0.036710621066042386, + "grad_norm": 1.7059663534164429, + "learning_rate": 8.157553770406841e-06, + "loss": 1.5554, + "step": 1574 + }, + { + "epoch": 0.036733944205220304, + "grad_norm": 1.9748015403747559, + "learning_rate": 8.162736460222857e-06, + "loss": 1.3837, + "step": 1575 + }, + { + "epoch": 0.03675726734439822, + "grad_norm": 1.7517926692962646, + "learning_rate": 8.167919150038872e-06, + "loss": 1.5008, + "step": 1576 + }, + { + "epoch": 0.03678059048357614, + "grad_norm": 2.074340343475342, + "learning_rate": 8.173101839854886e-06, + "loss": 1.1944, + "step": 1577 + }, + { + "epoch": 0.03680391362275406, + "grad_norm": 1.7943975925445557, + "learning_rate": 8.1782845296709e-06, + "loss": 1.5017, + "step": 1578 + }, + { + "epoch": 0.036827236761931975, + "grad_norm": 1.7202725410461426, + "learning_rate": 8.183467219486915e-06, + "loss": 1.3468, + "step": 1579 + }, + { + "epoch": 0.03685055990110989, + "grad_norm": 2.03446364402771, + "learning_rate": 8.188649909302929e-06, + "loss": 1.8081, + "step": 1580 + }, + { + "epoch": 0.03687388304028781, + "grad_norm": 1.8767874240875244, + "learning_rate": 8.193832599118943e-06, + "loss": 1.3877, + "step": 1581 + }, + { + "epoch": 0.03689720617946572, + "grad_norm": 1.4143779277801514, + "learning_rate": 8.199015288934958e-06, + "loss": 1.5551, + "step": 1582 + }, + { + "epoch": 0.03692052931864364, + "grad_norm": 1.4130569696426392, + "learning_rate": 8.204197978750974e-06, + "loss": 1.3058, + "step": 1583 + }, + { + "epoch": 0.03694385245782156, + "grad_norm": 1.4558956623077393, + "learning_rate": 8.209380668566988e-06, + "loss": 1.4228, + "step": 1584 + }, + { + "epoch": 0.036967175596999476, + "grad_norm": 2.6582729816436768, + "learning_rate": 8.214563358383e-06, + "loss": 1.5081, + "step": 1585 + }, + { + "epoch": 0.036990498736177393, + "grad_norm": 1.4754345417022705, + "learning_rate": 8.219746048199015e-06, + "loss": 1.5688, + "step": 1586 + }, + { + "epoch": 0.03701382187535531, + "grad_norm": 1.5351654291152954, + "learning_rate": 8.22492873801503e-06, + "loss": 1.8144, + "step": 1587 + }, + { + "epoch": 0.03703714501453323, + "grad_norm": 1.6197818517684937, + "learning_rate": 8.230111427831044e-06, + "loss": 1.5846, + "step": 1588 + }, + { + "epoch": 0.03706046815371115, + "grad_norm": 1.8108611106872559, + "learning_rate": 8.23529411764706e-06, + "loss": 1.3416, + "step": 1589 + }, + { + "epoch": 0.037083791292889065, + "grad_norm": 1.6245759725570679, + "learning_rate": 8.240476807463074e-06, + "loss": 1.4319, + "step": 1590 + }, + { + "epoch": 0.03710711443206698, + "grad_norm": 1.9573677778244019, + "learning_rate": 8.245659497279088e-06, + "loss": 1.4856, + "step": 1591 + }, + { + "epoch": 0.0371304375712449, + "grad_norm": 2.195033550262451, + "learning_rate": 8.250842187095103e-06, + "loss": 1.4553, + "step": 1592 + }, + { + "epoch": 0.03715376071042282, + "grad_norm": 1.7342851161956787, + "learning_rate": 8.256024876911117e-06, + "loss": 1.4633, + "step": 1593 + }, + { + "epoch": 0.03717708384960074, + "grad_norm": 1.499495506286621, + "learning_rate": 8.261207566727132e-06, + "loss": 1.6055, + "step": 1594 + }, + { + "epoch": 0.037200406988778655, + "grad_norm": 1.4192696809768677, + "learning_rate": 8.266390256543146e-06, + "loss": 1.3659, + "step": 1595 + }, + { + "epoch": 0.03722373012795657, + "grad_norm": 1.8910040855407715, + "learning_rate": 8.27157294635916e-06, + "loss": 1.703, + "step": 1596 + }, + { + "epoch": 0.03724705326713449, + "grad_norm": 1.4962915182113647, + "learning_rate": 8.276755636175176e-06, + "loss": 1.3212, + "step": 1597 + }, + { + "epoch": 0.03727037640631241, + "grad_norm": 2.1940252780914307, + "learning_rate": 8.28193832599119e-06, + "loss": 1.8816, + "step": 1598 + }, + { + "epoch": 0.037293699545490326, + "grad_norm": 1.415831208229065, + "learning_rate": 8.287121015807205e-06, + "loss": 1.229, + "step": 1599 + }, + { + "epoch": 0.037317022684668244, + "grad_norm": 1.5565876960754395, + "learning_rate": 8.29230370562322e-06, + "loss": 1.7503, + "step": 1600 + }, + { + "epoch": 0.03734034582384616, + "grad_norm": 2.6450204849243164, + "learning_rate": 8.297486395439234e-06, + "loss": 1.3618, + "step": 1601 + }, + { + "epoch": 0.03736366896302408, + "grad_norm": 1.5824869871139526, + "learning_rate": 8.302669085255248e-06, + "loss": 1.3587, + "step": 1602 + }, + { + "epoch": 0.037386992102202, + "grad_norm": 1.6635199785232544, + "learning_rate": 8.307851775071262e-06, + "loss": 1.7323, + "step": 1603 + }, + { + "epoch": 0.037410315241379916, + "grad_norm": 1.5391467809677124, + "learning_rate": 8.313034464887277e-06, + "loss": 1.7489, + "step": 1604 + }, + { + "epoch": 0.037433638380557833, + "grad_norm": 2.136975049972534, + "learning_rate": 8.318217154703293e-06, + "loss": 1.4696, + "step": 1605 + }, + { + "epoch": 0.03745696151973575, + "grad_norm": 1.4561282396316528, + "learning_rate": 8.323399844519307e-06, + "loss": 1.746, + "step": 1606 + }, + { + "epoch": 0.03748028465891367, + "grad_norm": 1.323926329612732, + "learning_rate": 8.328582534335321e-06, + "loss": 1.3654, + "step": 1607 + }, + { + "epoch": 0.03750360779809159, + "grad_norm": 1.6495275497436523, + "learning_rate": 8.333765224151336e-06, + "loss": 1.6208, + "step": 1608 + }, + { + "epoch": 0.037526930937269505, + "grad_norm": 1.4379764795303345, + "learning_rate": 8.33894791396735e-06, + "loss": 1.4988, + "step": 1609 + }, + { + "epoch": 0.037550254076447416, + "grad_norm": 1.676405668258667, + "learning_rate": 8.344130603783364e-06, + "loss": 1.5563, + "step": 1610 + }, + { + "epoch": 0.037573577215625334, + "grad_norm": 1.0886626243591309, + "learning_rate": 8.349313293599379e-06, + "loss": 1.2886, + "step": 1611 + }, + { + "epoch": 0.03759690035480325, + "grad_norm": 1.5499573945999146, + "learning_rate": 8.354495983415393e-06, + "loss": 1.6758, + "step": 1612 + }, + { + "epoch": 0.03762022349398117, + "grad_norm": 1.2256261110305786, + "learning_rate": 8.359678673231408e-06, + "loss": 1.1831, + "step": 1613 + }, + { + "epoch": 0.03764354663315909, + "grad_norm": 2.0278513431549072, + "learning_rate": 8.364861363047422e-06, + "loss": 1.5379, + "step": 1614 + }, + { + "epoch": 0.037666869772337006, + "grad_norm": 1.6582108736038208, + "learning_rate": 8.370044052863436e-06, + "loss": 1.4499, + "step": 1615 + }, + { + "epoch": 0.03769019291151492, + "grad_norm": 2.517474412918091, + "learning_rate": 8.37522674267945e-06, + "loss": 1.3365, + "step": 1616 + }, + { + "epoch": 0.03771351605069284, + "grad_norm": 1.651391863822937, + "learning_rate": 8.380409432495465e-06, + "loss": 1.5475, + "step": 1617 + }, + { + "epoch": 0.03773683918987076, + "grad_norm": 1.9716179370880127, + "learning_rate": 8.38559212231148e-06, + "loss": 1.4809, + "step": 1618 + }, + { + "epoch": 0.03776016232904868, + "grad_norm": 2.0555307865142822, + "learning_rate": 8.390774812127495e-06, + "loss": 1.7496, + "step": 1619 + }, + { + "epoch": 0.037783485468226595, + "grad_norm": 1.5695487260818481, + "learning_rate": 8.39595750194351e-06, + "loss": 1.3654, + "step": 1620 + }, + { + "epoch": 0.03780680860740451, + "grad_norm": 1.4122220277786255, + "learning_rate": 8.401140191759524e-06, + "loss": 1.7411, + "step": 1621 + }, + { + "epoch": 0.03783013174658243, + "grad_norm": 1.4024474620819092, + "learning_rate": 8.406322881575538e-06, + "loss": 1.3807, + "step": 1622 + }, + { + "epoch": 0.03785345488576035, + "grad_norm": 1.932897686958313, + "learning_rate": 8.411505571391553e-06, + "loss": 1.6327, + "step": 1623 + }, + { + "epoch": 0.03787677802493827, + "grad_norm": 1.3100526332855225, + "learning_rate": 8.416688261207567e-06, + "loss": 1.0531, + "step": 1624 + }, + { + "epoch": 0.037900101164116184, + "grad_norm": 1.5780110359191895, + "learning_rate": 8.421870951023581e-06, + "loss": 1.3187, + "step": 1625 + }, + { + "epoch": 0.0379234243032941, + "grad_norm": 1.905220866203308, + "learning_rate": 8.427053640839596e-06, + "loss": 1.5322, + "step": 1626 + }, + { + "epoch": 0.03794674744247202, + "grad_norm": 1.4416756629943848, + "learning_rate": 8.432236330655612e-06, + "loss": 1.3633, + "step": 1627 + }, + { + "epoch": 0.03797007058164994, + "grad_norm": 1.720937728881836, + "learning_rate": 8.437419020471626e-06, + "loss": 1.4178, + "step": 1628 + }, + { + "epoch": 0.037993393720827856, + "grad_norm": 1.891658902168274, + "learning_rate": 8.44260171028764e-06, + "loss": 1.5383, + "step": 1629 + }, + { + "epoch": 0.038016716860005774, + "grad_norm": 2.179572105407715, + "learning_rate": 8.447784400103655e-06, + "loss": 1.3399, + "step": 1630 + }, + { + "epoch": 0.03804003999918369, + "grad_norm": 1.4478271007537842, + "learning_rate": 8.452967089919669e-06, + "loss": 1.4122, + "step": 1631 + }, + { + "epoch": 0.03806336313836161, + "grad_norm": 1.4049443006515503, + "learning_rate": 8.458149779735683e-06, + "loss": 1.5164, + "step": 1632 + }, + { + "epoch": 0.03808668627753953, + "grad_norm": 2.1146810054779053, + "learning_rate": 8.463332469551698e-06, + "loss": 1.5005, + "step": 1633 + }, + { + "epoch": 0.038110009416717446, + "grad_norm": 1.8528714179992676, + "learning_rate": 8.468515159367712e-06, + "loss": 1.4598, + "step": 1634 + }, + { + "epoch": 0.03813333255589536, + "grad_norm": 2.274590492248535, + "learning_rate": 8.473697849183728e-06, + "loss": 1.4343, + "step": 1635 + }, + { + "epoch": 0.03815665569507328, + "grad_norm": 1.837266445159912, + "learning_rate": 8.478880538999743e-06, + "loss": 1.6039, + "step": 1636 + }, + { + "epoch": 0.0381799788342512, + "grad_norm": 1.735687494277954, + "learning_rate": 8.484063228815757e-06, + "loss": 1.3623, + "step": 1637 + }, + { + "epoch": 0.03820330197342912, + "grad_norm": 1.8133695125579834, + "learning_rate": 8.489245918631771e-06, + "loss": 1.2941, + "step": 1638 + }, + { + "epoch": 0.03822662511260703, + "grad_norm": 1.9450503587722778, + "learning_rate": 8.494428608447786e-06, + "loss": 1.4706, + "step": 1639 + }, + { + "epoch": 0.038249948251784946, + "grad_norm": 1.6004278659820557, + "learning_rate": 8.4996112982638e-06, + "loss": 1.4726, + "step": 1640 + }, + { + "epoch": 0.038273271390962864, + "grad_norm": 1.7052674293518066, + "learning_rate": 8.504793988079814e-06, + "loss": 1.5521, + "step": 1641 + }, + { + "epoch": 0.03829659453014078, + "grad_norm": 1.3694720268249512, + "learning_rate": 8.509976677895829e-06, + "loss": 1.1333, + "step": 1642 + }, + { + "epoch": 0.0383199176693187, + "grad_norm": 1.7958831787109375, + "learning_rate": 8.515159367711843e-06, + "loss": 1.6234, + "step": 1643 + }, + { + "epoch": 0.03834324080849662, + "grad_norm": 1.7349238395690918, + "learning_rate": 8.520342057527857e-06, + "loss": 1.3697, + "step": 1644 + }, + { + "epoch": 0.038366563947674535, + "grad_norm": 1.5960413217544556, + "learning_rate": 8.525524747343872e-06, + "loss": 1.4687, + "step": 1645 + }, + { + "epoch": 0.03838988708685245, + "grad_norm": 2.78328800201416, + "learning_rate": 8.530707437159886e-06, + "loss": 1.462, + "step": 1646 + }, + { + "epoch": 0.03841321022603037, + "grad_norm": 1.310705304145813, + "learning_rate": 8.5358901269759e-06, + "loss": 1.347, + "step": 1647 + }, + { + "epoch": 0.03843653336520829, + "grad_norm": 1.554968237876892, + "learning_rate": 8.541072816791915e-06, + "loss": 1.41, + "step": 1648 + }, + { + "epoch": 0.03845985650438621, + "grad_norm": 2.0181522369384766, + "learning_rate": 8.54625550660793e-06, + "loss": 1.3945, + "step": 1649 + }, + { + "epoch": 0.038483179643564125, + "grad_norm": 1.816375494003296, + "learning_rate": 8.551438196423945e-06, + "loss": 1.6109, + "step": 1650 + }, + { + "epoch": 0.03850650278274204, + "grad_norm": 2.1661388874053955, + "learning_rate": 8.55662088623996e-06, + "loss": 1.8344, + "step": 1651 + }, + { + "epoch": 0.03852982592191996, + "grad_norm": 1.9306049346923828, + "learning_rate": 8.561803576055974e-06, + "loss": 1.2227, + "step": 1652 + }, + { + "epoch": 0.03855314906109788, + "grad_norm": 1.3145751953125, + "learning_rate": 8.566986265871988e-06, + "loss": 1.3881, + "step": 1653 + }, + { + "epoch": 0.038576472200275796, + "grad_norm": 1.6416202783584595, + "learning_rate": 8.572168955688003e-06, + "loss": 1.2782, + "step": 1654 + }, + { + "epoch": 0.038599795339453714, + "grad_norm": 1.3195691108703613, + "learning_rate": 8.577351645504017e-06, + "loss": 1.5958, + "step": 1655 + }, + { + "epoch": 0.03862311847863163, + "grad_norm": 1.786651372909546, + "learning_rate": 8.582534335320031e-06, + "loss": 1.6379, + "step": 1656 + }, + { + "epoch": 0.03864644161780955, + "grad_norm": 1.685196876525879, + "learning_rate": 8.587717025136047e-06, + "loss": 1.2548, + "step": 1657 + }, + { + "epoch": 0.03866976475698747, + "grad_norm": 2.0508875846862793, + "learning_rate": 8.592899714952062e-06, + "loss": 1.635, + "step": 1658 + }, + { + "epoch": 0.038693087896165386, + "grad_norm": 1.7226320505142212, + "learning_rate": 8.598082404768076e-06, + "loss": 1.4694, + "step": 1659 + }, + { + "epoch": 0.038716411035343304, + "grad_norm": 1.5333112478256226, + "learning_rate": 8.60326509458409e-06, + "loss": 1.4825, + "step": 1660 + }, + { + "epoch": 0.03873973417452122, + "grad_norm": 1.4121674299240112, + "learning_rate": 8.608447784400105e-06, + "loss": 1.2056, + "step": 1661 + }, + { + "epoch": 0.03876305731369914, + "grad_norm": 1.6394184827804565, + "learning_rate": 8.613630474216119e-06, + "loss": 1.5131, + "step": 1662 + }, + { + "epoch": 0.03878638045287706, + "grad_norm": 2.2525839805603027, + "learning_rate": 8.618813164032133e-06, + "loss": 1.4413, + "step": 1663 + }, + { + "epoch": 0.038809703592054975, + "grad_norm": 1.6599324941635132, + "learning_rate": 8.623995853848148e-06, + "loss": 1.1568, + "step": 1664 + }, + { + "epoch": 0.03883302673123289, + "grad_norm": 1.930284857749939, + "learning_rate": 8.629178543664162e-06, + "loss": 1.2182, + "step": 1665 + }, + { + "epoch": 0.03885634987041081, + "grad_norm": 1.366219401359558, + "learning_rate": 8.634361233480178e-06, + "loss": 1.6951, + "step": 1666 + }, + { + "epoch": 0.03887967300958872, + "grad_norm": 1.8555302619934082, + "learning_rate": 8.639543923296192e-06, + "loss": 1.4508, + "step": 1667 + }, + { + "epoch": 0.03890299614876664, + "grad_norm": 2.110704183578491, + "learning_rate": 8.644726613112205e-06, + "loss": 1.5057, + "step": 1668 + }, + { + "epoch": 0.03892631928794456, + "grad_norm": 1.4422646760940552, + "learning_rate": 8.64990930292822e-06, + "loss": 1.5628, + "step": 1669 + }, + { + "epoch": 0.038949642427122476, + "grad_norm": 1.8097025156021118, + "learning_rate": 8.655091992744234e-06, + "loss": 1.5336, + "step": 1670 + }, + { + "epoch": 0.038972965566300394, + "grad_norm": 1.5321156978607178, + "learning_rate": 8.66027468256025e-06, + "loss": 1.4985, + "step": 1671 + }, + { + "epoch": 0.03899628870547831, + "grad_norm": 1.715100884437561, + "learning_rate": 8.665457372376264e-06, + "loss": 1.5365, + "step": 1672 + }, + { + "epoch": 0.03901961184465623, + "grad_norm": 1.7432835102081299, + "learning_rate": 8.670640062192278e-06, + "loss": 1.5822, + "step": 1673 + }, + { + "epoch": 0.03904293498383415, + "grad_norm": 1.7451759576797485, + "learning_rate": 8.675822752008293e-06, + "loss": 1.6363, + "step": 1674 + }, + { + "epoch": 0.039066258123012065, + "grad_norm": 1.6405068635940552, + "learning_rate": 8.681005441824307e-06, + "loss": 1.6819, + "step": 1675 + }, + { + "epoch": 0.03908958126218998, + "grad_norm": 1.7980347871780396, + "learning_rate": 8.686188131640322e-06, + "loss": 1.5362, + "step": 1676 + }, + { + "epoch": 0.0391129044013679, + "grad_norm": 1.6365665197372437, + "learning_rate": 8.691370821456336e-06, + "loss": 2.0277, + "step": 1677 + }, + { + "epoch": 0.03913622754054582, + "grad_norm": 1.9490535259246826, + "learning_rate": 8.69655351127235e-06, + "loss": 1.5076, + "step": 1678 + }, + { + "epoch": 0.03915955067972374, + "grad_norm": 1.4164410829544067, + "learning_rate": 8.701736201088366e-06, + "loss": 1.8005, + "step": 1679 + }, + { + "epoch": 0.039182873818901655, + "grad_norm": 1.4707103967666626, + "learning_rate": 8.70691889090438e-06, + "loss": 1.353, + "step": 1680 + }, + { + "epoch": 0.03920619695807957, + "grad_norm": 1.7562110424041748, + "learning_rate": 8.712101580720395e-06, + "loss": 1.5621, + "step": 1681 + }, + { + "epoch": 0.03922952009725749, + "grad_norm": 2.0748794078826904, + "learning_rate": 8.71728427053641e-06, + "loss": 1.4923, + "step": 1682 + }, + { + "epoch": 0.03925284323643541, + "grad_norm": 2.031003475189209, + "learning_rate": 8.722466960352424e-06, + "loss": 1.1706, + "step": 1683 + }, + { + "epoch": 0.039276166375613326, + "grad_norm": 2.4340038299560547, + "learning_rate": 8.727649650168438e-06, + "loss": 1.3371, + "step": 1684 + }, + { + "epoch": 0.039299489514791244, + "grad_norm": 2.129331111907959, + "learning_rate": 8.732832339984452e-06, + "loss": 1.558, + "step": 1685 + }, + { + "epoch": 0.03932281265396916, + "grad_norm": 1.907139778137207, + "learning_rate": 8.738015029800467e-06, + "loss": 1.6016, + "step": 1686 + }, + { + "epoch": 0.03934613579314708, + "grad_norm": 1.8079878091812134, + "learning_rate": 8.743197719616483e-06, + "loss": 1.4186, + "step": 1687 + }, + { + "epoch": 0.039369458932325, + "grad_norm": 1.9196524620056152, + "learning_rate": 8.748380409432497e-06, + "loss": 1.6435, + "step": 1688 + }, + { + "epoch": 0.039392782071502916, + "grad_norm": 1.5702369213104248, + "learning_rate": 8.753563099248511e-06, + "loss": 1.6279, + "step": 1689 + }, + { + "epoch": 0.039416105210680834, + "grad_norm": 1.8079639673233032, + "learning_rate": 8.758745789064526e-06, + "loss": 1.4299, + "step": 1690 + }, + { + "epoch": 0.03943942834985875, + "grad_norm": 1.5084450244903564, + "learning_rate": 8.76392847888054e-06, + "loss": 1.6051, + "step": 1691 + }, + { + "epoch": 0.03946275148903667, + "grad_norm": 1.8773257732391357, + "learning_rate": 8.769111168696554e-06, + "loss": 1.2258, + "step": 1692 + }, + { + "epoch": 0.03948607462821459, + "grad_norm": 1.662649154663086, + "learning_rate": 8.774293858512569e-06, + "loss": 1.5057, + "step": 1693 + }, + { + "epoch": 0.039509397767392505, + "grad_norm": 1.7742561101913452, + "learning_rate": 8.779476548328583e-06, + "loss": 1.5083, + "step": 1694 + }, + { + "epoch": 0.03953272090657042, + "grad_norm": 1.6094675064086914, + "learning_rate": 8.784659238144598e-06, + "loss": 1.4416, + "step": 1695 + }, + { + "epoch": 0.039556044045748334, + "grad_norm": 1.7892067432403564, + "learning_rate": 8.789841927960612e-06, + "loss": 1.6939, + "step": 1696 + }, + { + "epoch": 0.03957936718492625, + "grad_norm": 1.4669241905212402, + "learning_rate": 8.795024617776626e-06, + "loss": 1.3218, + "step": 1697 + }, + { + "epoch": 0.03960269032410417, + "grad_norm": 1.6289660930633545, + "learning_rate": 8.80020730759264e-06, + "loss": 1.4026, + "step": 1698 + }, + { + "epoch": 0.03962601346328209, + "grad_norm": 1.4103940725326538, + "learning_rate": 8.805389997408655e-06, + "loss": 1.5594, + "step": 1699 + }, + { + "epoch": 0.039649336602460006, + "grad_norm": 1.8094227313995361, + "learning_rate": 8.81057268722467e-06, + "loss": 1.4749, + "step": 1700 + }, + { + "epoch": 0.039672659741637924, + "grad_norm": 1.9171851873397827, + "learning_rate": 8.815755377040685e-06, + "loss": 1.5853, + "step": 1701 + }, + { + "epoch": 0.03969598288081584, + "grad_norm": 1.7482846975326538, + "learning_rate": 8.8209380668567e-06, + "loss": 1.8572, + "step": 1702 + }, + { + "epoch": 0.03971930601999376, + "grad_norm": 1.494166374206543, + "learning_rate": 8.826120756672714e-06, + "loss": 1.4618, + "step": 1703 + }, + { + "epoch": 0.03974262915917168, + "grad_norm": 1.8293770551681519, + "learning_rate": 8.831303446488728e-06, + "loss": 1.278, + "step": 1704 + }, + { + "epoch": 0.039765952298349595, + "grad_norm": 1.7367064952850342, + "learning_rate": 8.836486136304743e-06, + "loss": 1.65, + "step": 1705 + }, + { + "epoch": 0.03978927543752751, + "grad_norm": 1.783642292022705, + "learning_rate": 8.841668826120757e-06, + "loss": 1.5325, + "step": 1706 + }, + { + "epoch": 0.03981259857670543, + "grad_norm": 1.5297502279281616, + "learning_rate": 8.846851515936771e-06, + "loss": 1.4132, + "step": 1707 + }, + { + "epoch": 0.03983592171588335, + "grad_norm": 1.9751566648483276, + "learning_rate": 8.852034205752786e-06, + "loss": 1.4317, + "step": 1708 + }, + { + "epoch": 0.03985924485506127, + "grad_norm": 2.1414785385131836, + "learning_rate": 8.857216895568802e-06, + "loss": 1.6401, + "step": 1709 + }, + { + "epoch": 0.039882567994239185, + "grad_norm": 1.4582406282424927, + "learning_rate": 8.862399585384816e-06, + "loss": 1.4949, + "step": 1710 + }, + { + "epoch": 0.0399058911334171, + "grad_norm": 1.3729748725891113, + "learning_rate": 8.86758227520083e-06, + "loss": 0.8325, + "step": 1711 + }, + { + "epoch": 0.03992921427259502, + "grad_norm": 1.5666522979736328, + "learning_rate": 8.872764965016845e-06, + "loss": 1.6165, + "step": 1712 + }, + { + "epoch": 0.03995253741177294, + "grad_norm": 1.8730623722076416, + "learning_rate": 8.877947654832859e-06, + "loss": 1.5912, + "step": 1713 + }, + { + "epoch": 0.039975860550950856, + "grad_norm": 1.3995941877365112, + "learning_rate": 8.883130344648873e-06, + "loss": 1.4624, + "step": 1714 + }, + { + "epoch": 0.039999183690128774, + "grad_norm": 1.6787446737289429, + "learning_rate": 8.888313034464888e-06, + "loss": 1.7264, + "step": 1715 + }, + { + "epoch": 0.04002250682930669, + "grad_norm": 1.6797045469284058, + "learning_rate": 8.893495724280902e-06, + "loss": 1.458, + "step": 1716 + }, + { + "epoch": 0.04004582996848461, + "grad_norm": 1.4562252759933472, + "learning_rate": 8.898678414096917e-06, + "loss": 1.4469, + "step": 1717 + }, + { + "epoch": 0.04006915310766253, + "grad_norm": 1.8270559310913086, + "learning_rate": 8.903861103912933e-06, + "loss": 1.5524, + "step": 1718 + }, + { + "epoch": 0.040092476246840446, + "grad_norm": 2.2723021507263184, + "learning_rate": 8.909043793728947e-06, + "loss": 1.5524, + "step": 1719 + }, + { + "epoch": 0.040115799386018364, + "grad_norm": 1.6696120500564575, + "learning_rate": 8.914226483544961e-06, + "loss": 1.6466, + "step": 1720 + }, + { + "epoch": 0.04013912252519628, + "grad_norm": 1.8067409992218018, + "learning_rate": 8.919409173360976e-06, + "loss": 1.4901, + "step": 1721 + }, + { + "epoch": 0.0401624456643742, + "grad_norm": 1.6212742328643799, + "learning_rate": 8.92459186317699e-06, + "loss": 1.3791, + "step": 1722 + }, + { + "epoch": 0.04018576880355212, + "grad_norm": 1.5557783842086792, + "learning_rate": 8.929774552993004e-06, + "loss": 1.4122, + "step": 1723 + }, + { + "epoch": 0.04020909194273003, + "grad_norm": 2.65142822265625, + "learning_rate": 8.934957242809019e-06, + "loss": 1.3679, + "step": 1724 + }, + { + "epoch": 0.040232415081907946, + "grad_norm": 1.9991352558135986, + "learning_rate": 8.940139932625033e-06, + "loss": 1.4746, + "step": 1725 + }, + { + "epoch": 0.040255738221085864, + "grad_norm": 2.054579257965088, + "learning_rate": 8.945322622441047e-06, + "loss": 1.5759, + "step": 1726 + }, + { + "epoch": 0.04027906136026378, + "grad_norm": 1.62351393699646, + "learning_rate": 8.950505312257062e-06, + "loss": 1.4139, + "step": 1727 + }, + { + "epoch": 0.0403023844994417, + "grad_norm": 1.754712462425232, + "learning_rate": 8.955688002073076e-06, + "loss": 1.2871, + "step": 1728 + }, + { + "epoch": 0.04032570763861962, + "grad_norm": 1.744728922843933, + "learning_rate": 8.96087069188909e-06, + "loss": 1.5207, + "step": 1729 + }, + { + "epoch": 0.040349030777797536, + "grad_norm": 1.9871348142623901, + "learning_rate": 8.966053381705105e-06, + "loss": 1.7927, + "step": 1730 + }, + { + "epoch": 0.040372353916975454, + "grad_norm": 1.898793339729309, + "learning_rate": 8.97123607152112e-06, + "loss": 1.5487, + "step": 1731 + }, + { + "epoch": 0.04039567705615337, + "grad_norm": 1.6234720945358276, + "learning_rate": 8.976418761337135e-06, + "loss": 1.3666, + "step": 1732 + }, + { + "epoch": 0.04041900019533129, + "grad_norm": 1.7883436679840088, + "learning_rate": 8.98160145115315e-06, + "loss": 1.59, + "step": 1733 + }, + { + "epoch": 0.04044232333450921, + "grad_norm": 2.030747890472412, + "learning_rate": 8.986784140969164e-06, + "loss": 1.5484, + "step": 1734 + }, + { + "epoch": 0.040465646473687125, + "grad_norm": 1.5323489904403687, + "learning_rate": 8.991966830785178e-06, + "loss": 1.4076, + "step": 1735 + }, + { + "epoch": 0.04048896961286504, + "grad_norm": 1.545076847076416, + "learning_rate": 8.997149520601193e-06, + "loss": 1.9024, + "step": 1736 + }, + { + "epoch": 0.04051229275204296, + "grad_norm": 1.775343656539917, + "learning_rate": 9.002332210417207e-06, + "loss": 1.6269, + "step": 1737 + }, + { + "epoch": 0.04053561589122088, + "grad_norm": 1.5936089754104614, + "learning_rate": 9.007514900233221e-06, + "loss": 1.388, + "step": 1738 + }, + { + "epoch": 0.0405589390303988, + "grad_norm": 2.0282087326049805, + "learning_rate": 9.012697590049236e-06, + "loss": 1.5258, + "step": 1739 + }, + { + "epoch": 0.040582262169576715, + "grad_norm": 1.769651174545288, + "learning_rate": 9.017880279865252e-06, + "loss": 1.6468, + "step": 1740 + }, + { + "epoch": 0.04060558530875463, + "grad_norm": 1.671475887298584, + "learning_rate": 9.023062969681266e-06, + "loss": 1.457, + "step": 1741 + }, + { + "epoch": 0.04062890844793255, + "grad_norm": 1.5717363357543945, + "learning_rate": 9.02824565949728e-06, + "loss": 1.0661, + "step": 1742 + }, + { + "epoch": 0.04065223158711047, + "grad_norm": 2.1011769771575928, + "learning_rate": 9.033428349313295e-06, + "loss": 1.8212, + "step": 1743 + }, + { + "epoch": 0.040675554726288386, + "grad_norm": 1.8593213558197021, + "learning_rate": 9.038611039129309e-06, + "loss": 1.2838, + "step": 1744 + }, + { + "epoch": 0.040698877865466304, + "grad_norm": 3.45039963722229, + "learning_rate": 9.043793728945323e-06, + "loss": 1.2977, + "step": 1745 + }, + { + "epoch": 0.04072220100464422, + "grad_norm": 1.5961792469024658, + "learning_rate": 9.048976418761338e-06, + "loss": 1.598, + "step": 1746 + }, + { + "epoch": 0.04074552414382214, + "grad_norm": 1.7901935577392578, + "learning_rate": 9.054159108577352e-06, + "loss": 1.197, + "step": 1747 + }, + { + "epoch": 0.04076884728300006, + "grad_norm": 1.7534990310668945, + "learning_rate": 9.059341798393368e-06, + "loss": 1.5957, + "step": 1748 + }, + { + "epoch": 0.040792170422177976, + "grad_norm": 2.0215656757354736, + "learning_rate": 9.064524488209382e-06, + "loss": 1.4019, + "step": 1749 + }, + { + "epoch": 0.040815493561355894, + "grad_norm": 1.7355159521102905, + "learning_rate": 9.069707178025397e-06, + "loss": 1.6056, + "step": 1750 + }, + { + "epoch": 0.04083881670053381, + "grad_norm": 2.3358545303344727, + "learning_rate": 9.07488986784141e-06, + "loss": 1.3946, + "step": 1751 + }, + { + "epoch": 0.04086213983971173, + "grad_norm": 2.4582395553588867, + "learning_rate": 9.080072557657424e-06, + "loss": 1.3848, + "step": 1752 + }, + { + "epoch": 0.04088546297888964, + "grad_norm": 1.8667892217636108, + "learning_rate": 9.08525524747344e-06, + "loss": 1.5908, + "step": 1753 + }, + { + "epoch": 0.04090878611806756, + "grad_norm": 2.2128000259399414, + "learning_rate": 9.090437937289454e-06, + "loss": 1.4584, + "step": 1754 + }, + { + "epoch": 0.040932109257245476, + "grad_norm": 1.714179277420044, + "learning_rate": 9.095620627105468e-06, + "loss": 1.3882, + "step": 1755 + }, + { + "epoch": 0.040955432396423394, + "grad_norm": 1.7891523838043213, + "learning_rate": 9.100803316921483e-06, + "loss": 1.6921, + "step": 1756 + }, + { + "epoch": 0.04097875553560131, + "grad_norm": 2.0620603561401367, + "learning_rate": 9.105986006737497e-06, + "loss": 1.4833, + "step": 1757 + }, + { + "epoch": 0.04100207867477923, + "grad_norm": 1.4664239883422852, + "learning_rate": 9.111168696553512e-06, + "loss": 1.5, + "step": 1758 + }, + { + "epoch": 0.04102540181395715, + "grad_norm": 2.151362180709839, + "learning_rate": 9.116351386369526e-06, + "loss": 1.4189, + "step": 1759 + }, + { + "epoch": 0.041048724953135066, + "grad_norm": 2.1404523849487305, + "learning_rate": 9.12153407618554e-06, + "loss": 1.512, + "step": 1760 + }, + { + "epoch": 0.041072048092312984, + "grad_norm": 1.5175687074661255, + "learning_rate": 9.126716766001556e-06, + "loss": 1.3527, + "step": 1761 + }, + { + "epoch": 0.0410953712314909, + "grad_norm": 1.6199604272842407, + "learning_rate": 9.13189945581757e-06, + "loss": 1.1717, + "step": 1762 + }, + { + "epoch": 0.04111869437066882, + "grad_norm": 1.655900001525879, + "learning_rate": 9.137082145633585e-06, + "loss": 1.4903, + "step": 1763 + }, + { + "epoch": 0.04114201750984674, + "grad_norm": 1.6075772047042847, + "learning_rate": 9.1422648354496e-06, + "loss": 1.3745, + "step": 1764 + }, + { + "epoch": 0.041165340649024655, + "grad_norm": 1.5534958839416504, + "learning_rate": 9.147447525265614e-06, + "loss": 1.4659, + "step": 1765 + }, + { + "epoch": 0.04118866378820257, + "grad_norm": 2.197490930557251, + "learning_rate": 9.152630215081628e-06, + "loss": 1.5412, + "step": 1766 + }, + { + "epoch": 0.04121198692738049, + "grad_norm": 2.1121668815612793, + "learning_rate": 9.157812904897642e-06, + "loss": 1.7137, + "step": 1767 + }, + { + "epoch": 0.04123531006655841, + "grad_norm": 2.2003660202026367, + "learning_rate": 9.162995594713657e-06, + "loss": 1.6095, + "step": 1768 + }, + { + "epoch": 0.04125863320573633, + "grad_norm": 1.617874264717102, + "learning_rate": 9.168178284529671e-06, + "loss": 1.4913, + "step": 1769 + }, + { + "epoch": 0.041281956344914245, + "grad_norm": 1.6809815168380737, + "learning_rate": 9.173360974345687e-06, + "loss": 1.6014, + "step": 1770 + }, + { + "epoch": 0.04130527948409216, + "grad_norm": 1.8234214782714844, + "learning_rate": 9.178543664161701e-06, + "loss": 1.4921, + "step": 1771 + }, + { + "epoch": 0.04132860262327008, + "grad_norm": 1.605371117591858, + "learning_rate": 9.183726353977716e-06, + "loss": 1.526, + "step": 1772 + }, + { + "epoch": 0.041351925762448, + "grad_norm": 1.7158360481262207, + "learning_rate": 9.18890904379373e-06, + "loss": 1.6063, + "step": 1773 + }, + { + "epoch": 0.041375248901625916, + "grad_norm": 1.8888566493988037, + "learning_rate": 9.194091733609744e-06, + "loss": 1.3013, + "step": 1774 + }, + { + "epoch": 0.041398572040803834, + "grad_norm": 1.8596553802490234, + "learning_rate": 9.199274423425759e-06, + "loss": 1.3611, + "step": 1775 + }, + { + "epoch": 0.04142189517998175, + "grad_norm": 1.770941972732544, + "learning_rate": 9.204457113241773e-06, + "loss": 1.553, + "step": 1776 + }, + { + "epoch": 0.04144521831915967, + "grad_norm": 1.4563987255096436, + "learning_rate": 9.209639803057788e-06, + "loss": 1.3261, + "step": 1777 + }, + { + "epoch": 0.04146854145833759, + "grad_norm": 1.5590494871139526, + "learning_rate": 9.214822492873802e-06, + "loss": 1.6303, + "step": 1778 + }, + { + "epoch": 0.041491864597515506, + "grad_norm": 1.6040290594100952, + "learning_rate": 9.220005182689816e-06, + "loss": 1.6656, + "step": 1779 + }, + { + "epoch": 0.041515187736693424, + "grad_norm": 1.6253089904785156, + "learning_rate": 9.22518787250583e-06, + "loss": 1.3086, + "step": 1780 + }, + { + "epoch": 0.041538510875871335, + "grad_norm": 2.282277822494507, + "learning_rate": 9.230370562321845e-06, + "loss": 1.3154, + "step": 1781 + }, + { + "epoch": 0.04156183401504925, + "grad_norm": 1.6955877542495728, + "learning_rate": 9.23555325213786e-06, + "loss": 1.4742, + "step": 1782 + }, + { + "epoch": 0.04158515715422717, + "grad_norm": 2.6918323040008545, + "learning_rate": 9.240735941953875e-06, + "loss": 1.4942, + "step": 1783 + }, + { + "epoch": 0.04160848029340509, + "grad_norm": 2.111135244369507, + "learning_rate": 9.24591863176989e-06, + "loss": 1.4501, + "step": 1784 + }, + { + "epoch": 0.041631803432583006, + "grad_norm": 1.6524665355682373, + "learning_rate": 9.251101321585904e-06, + "loss": 1.2801, + "step": 1785 + }, + { + "epoch": 0.041655126571760924, + "grad_norm": 1.812553882598877, + "learning_rate": 9.256284011401918e-06, + "loss": 1.2928, + "step": 1786 + }, + { + "epoch": 0.04167844971093884, + "grad_norm": 1.7474865913391113, + "learning_rate": 9.261466701217933e-06, + "loss": 1.5489, + "step": 1787 + }, + { + "epoch": 0.04170177285011676, + "grad_norm": 1.91874098777771, + "learning_rate": 9.266649391033947e-06, + "loss": 1.5997, + "step": 1788 + }, + { + "epoch": 0.04172509598929468, + "grad_norm": 1.4715979099273682, + "learning_rate": 9.271832080849961e-06, + "loss": 0.921, + "step": 1789 + }, + { + "epoch": 0.041748419128472596, + "grad_norm": 1.599254846572876, + "learning_rate": 9.277014770665976e-06, + "loss": 1.5168, + "step": 1790 + }, + { + "epoch": 0.04177174226765051, + "grad_norm": 1.8970310688018799, + "learning_rate": 9.28219746048199e-06, + "loss": 1.4821, + "step": 1791 + }, + { + "epoch": 0.04179506540682843, + "grad_norm": 1.5975875854492188, + "learning_rate": 9.287380150298006e-06, + "loss": 1.4889, + "step": 1792 + }, + { + "epoch": 0.04181838854600635, + "grad_norm": 1.7852643728256226, + "learning_rate": 9.29256284011402e-06, + "loss": 1.4124, + "step": 1793 + }, + { + "epoch": 0.04184171168518427, + "grad_norm": 1.8535397052764893, + "learning_rate": 9.297745529930035e-06, + "loss": 1.5964, + "step": 1794 + }, + { + "epoch": 0.041865034824362185, + "grad_norm": 1.532125473022461, + "learning_rate": 9.302928219746049e-06, + "loss": 1.2431, + "step": 1795 + }, + { + "epoch": 0.0418883579635401, + "grad_norm": 1.542386531829834, + "learning_rate": 9.308110909562063e-06, + "loss": 1.6327, + "step": 1796 + }, + { + "epoch": 0.04191168110271802, + "grad_norm": 1.8671448230743408, + "learning_rate": 9.313293599378078e-06, + "loss": 1.7695, + "step": 1797 + }, + { + "epoch": 0.04193500424189594, + "grad_norm": 1.6148124933242798, + "learning_rate": 9.318476289194092e-06, + "loss": 1.7227, + "step": 1798 + }, + { + "epoch": 0.04195832738107386, + "grad_norm": 1.4859371185302734, + "learning_rate": 9.323658979010107e-06, + "loss": 1.2807, + "step": 1799 + }, + { + "epoch": 0.041981650520251775, + "grad_norm": 3.0297629833221436, + "learning_rate": 9.328841668826123e-06, + "loss": 1.3824, + "step": 1800 + }, + { + "epoch": 0.04200497365942969, + "grad_norm": 1.6791976690292358, + "learning_rate": 9.334024358642137e-06, + "loss": 1.7325, + "step": 1801 + }, + { + "epoch": 0.04202829679860761, + "grad_norm": 1.4695453643798828, + "learning_rate": 9.339207048458151e-06, + "loss": 1.2062, + "step": 1802 + }, + { + "epoch": 0.04205161993778553, + "grad_norm": 1.5592173337936401, + "learning_rate": 9.344389738274166e-06, + "loss": 1.1919, + "step": 1803 + }, + { + "epoch": 0.042074943076963446, + "grad_norm": 1.4761253595352173, + "learning_rate": 9.34957242809018e-06, + "loss": 1.2845, + "step": 1804 + }, + { + "epoch": 0.042098266216141364, + "grad_norm": 1.3584182262420654, + "learning_rate": 9.354755117906194e-06, + "loss": 1.6216, + "step": 1805 + }, + { + "epoch": 0.04212158935531928, + "grad_norm": 2.0344326496124268, + "learning_rate": 9.359937807722209e-06, + "loss": 1.2301, + "step": 1806 + }, + { + "epoch": 0.0421449124944972, + "grad_norm": 1.549643874168396, + "learning_rate": 9.365120497538223e-06, + "loss": 1.446, + "step": 1807 + }, + { + "epoch": 0.04216823563367512, + "grad_norm": 1.6695293188095093, + "learning_rate": 9.370303187354237e-06, + "loss": 1.7588, + "step": 1808 + }, + { + "epoch": 0.042191558772853036, + "grad_norm": 1.817617416381836, + "learning_rate": 9.375485877170252e-06, + "loss": 1.5394, + "step": 1809 + }, + { + "epoch": 0.04221488191203095, + "grad_norm": 1.917152762413025, + "learning_rate": 9.380668566986266e-06, + "loss": 1.6437, + "step": 1810 + }, + { + "epoch": 0.042238205051208864, + "grad_norm": 0.9892622828483582, + "learning_rate": 9.38585125680228e-06, + "loss": 1.1554, + "step": 1811 + }, + { + "epoch": 0.04226152819038678, + "grad_norm": 1.577576994895935, + "learning_rate": 9.391033946618295e-06, + "loss": 1.4737, + "step": 1812 + }, + { + "epoch": 0.0422848513295647, + "grad_norm": 1.739229679107666, + "learning_rate": 9.39621663643431e-06, + "loss": 1.4077, + "step": 1813 + }, + { + "epoch": 0.04230817446874262, + "grad_norm": 1.6817034482955933, + "learning_rate": 9.401399326250325e-06, + "loss": 0.9329, + "step": 1814 + }, + { + "epoch": 0.042331497607920536, + "grad_norm": 1.6616978645324707, + "learning_rate": 9.40658201606634e-06, + "loss": 1.6185, + "step": 1815 + }, + { + "epoch": 0.042354820747098454, + "grad_norm": 1.379654049873352, + "learning_rate": 9.411764705882354e-06, + "loss": 1.6863, + "step": 1816 + }, + { + "epoch": 0.04237814388627637, + "grad_norm": 2.3998191356658936, + "learning_rate": 9.416947395698368e-06, + "loss": 1.4281, + "step": 1817 + }, + { + "epoch": 0.04240146702545429, + "grad_norm": 2.078322410583496, + "learning_rate": 9.422130085514383e-06, + "loss": 1.2324, + "step": 1818 + }, + { + "epoch": 0.04242479016463221, + "grad_norm": 1.8474605083465576, + "learning_rate": 9.427312775330397e-06, + "loss": 1.3242, + "step": 1819 + }, + { + "epoch": 0.042448113303810125, + "grad_norm": 1.4538230895996094, + "learning_rate": 9.432495465146411e-06, + "loss": 1.3117, + "step": 1820 + }, + { + "epoch": 0.04247143644298804, + "grad_norm": 2.528913974761963, + "learning_rate": 9.437678154962426e-06, + "loss": 1.3846, + "step": 1821 + }, + { + "epoch": 0.04249475958216596, + "grad_norm": 1.5370780229568481, + "learning_rate": 9.442860844778442e-06, + "loss": 1.4712, + "step": 1822 + }, + { + "epoch": 0.04251808272134388, + "grad_norm": 1.7554328441619873, + "learning_rate": 9.448043534594456e-06, + "loss": 1.5354, + "step": 1823 + }, + { + "epoch": 0.0425414058605218, + "grad_norm": 1.490560531616211, + "learning_rate": 9.45322622441047e-06, + "loss": 1.1725, + "step": 1824 + }, + { + "epoch": 0.042564728999699715, + "grad_norm": 1.55622136592865, + "learning_rate": 9.458408914226485e-06, + "loss": 1.5401, + "step": 1825 + }, + { + "epoch": 0.04258805213887763, + "grad_norm": 1.6288939714431763, + "learning_rate": 9.463591604042499e-06, + "loss": 1.404, + "step": 1826 + }, + { + "epoch": 0.04261137527805555, + "grad_norm": 1.9815454483032227, + "learning_rate": 9.468774293858513e-06, + "loss": 1.538, + "step": 1827 + }, + { + "epoch": 0.04263469841723347, + "grad_norm": 1.8967722654342651, + "learning_rate": 9.473956983674528e-06, + "loss": 1.4561, + "step": 1828 + }, + { + "epoch": 0.04265802155641139, + "grad_norm": 2.010972023010254, + "learning_rate": 9.479139673490542e-06, + "loss": 1.692, + "step": 1829 + }, + { + "epoch": 0.042681344695589304, + "grad_norm": 1.82353937625885, + "learning_rate": 9.484322363306558e-06, + "loss": 1.6394, + "step": 1830 + }, + { + "epoch": 0.04270466783476722, + "grad_norm": 1.6288769245147705, + "learning_rate": 9.489505053122572e-06, + "loss": 1.7251, + "step": 1831 + }, + { + "epoch": 0.04272799097394514, + "grad_norm": 2.7632317543029785, + "learning_rate": 9.494687742938587e-06, + "loss": 1.5771, + "step": 1832 + }, + { + "epoch": 0.04275131411312306, + "grad_norm": 1.7157068252563477, + "learning_rate": 9.499870432754601e-06, + "loss": 1.9245, + "step": 1833 + }, + { + "epoch": 0.042774637252300976, + "grad_norm": 1.6728345155715942, + "learning_rate": 9.505053122570614e-06, + "loss": 1.5874, + "step": 1834 + }, + { + "epoch": 0.042797960391478894, + "grad_norm": 1.6265268325805664, + "learning_rate": 9.51023581238663e-06, + "loss": 1.6633, + "step": 1835 + }, + { + "epoch": 0.04282128353065681, + "grad_norm": 1.8013489246368408, + "learning_rate": 9.515418502202644e-06, + "loss": 1.3856, + "step": 1836 + }, + { + "epoch": 0.04284460666983473, + "grad_norm": 1.85427987575531, + "learning_rate": 9.520601192018658e-06, + "loss": 1.2233, + "step": 1837 + }, + { + "epoch": 0.04286792980901264, + "grad_norm": 1.6943988800048828, + "learning_rate": 9.525783881834673e-06, + "loss": 1.3198, + "step": 1838 + }, + { + "epoch": 0.04289125294819056, + "grad_norm": 1.7103756666183472, + "learning_rate": 9.530966571650687e-06, + "loss": 1.4118, + "step": 1839 + }, + { + "epoch": 0.042914576087368476, + "grad_norm": 2.0107672214508057, + "learning_rate": 9.536149261466702e-06, + "loss": 1.3456, + "step": 1840 + }, + { + "epoch": 0.042937899226546394, + "grad_norm": 1.505422830581665, + "learning_rate": 9.541331951282716e-06, + "loss": 1.2676, + "step": 1841 + }, + { + "epoch": 0.04296122236572431, + "grad_norm": 2.090595245361328, + "learning_rate": 9.54651464109873e-06, + "loss": 1.2113, + "step": 1842 + }, + { + "epoch": 0.04298454550490223, + "grad_norm": 1.7776191234588623, + "learning_rate": 9.551697330914745e-06, + "loss": 1.5694, + "step": 1843 + }, + { + "epoch": 0.04300786864408015, + "grad_norm": 3.0254878997802734, + "learning_rate": 9.55688002073076e-06, + "loss": 1.244, + "step": 1844 + }, + { + "epoch": 0.043031191783258066, + "grad_norm": 1.8657838106155396, + "learning_rate": 9.562062710546775e-06, + "loss": 1.9444, + "step": 1845 + }, + { + "epoch": 0.043054514922435984, + "grad_norm": 2.1006710529327393, + "learning_rate": 9.56724540036279e-06, + "loss": 1.3202, + "step": 1846 + }, + { + "epoch": 0.0430778380616139, + "grad_norm": 1.2389309406280518, + "learning_rate": 9.572428090178804e-06, + "loss": 1.1992, + "step": 1847 + }, + { + "epoch": 0.04310116120079182, + "grad_norm": 2.162818193435669, + "learning_rate": 9.577610779994818e-06, + "loss": 1.5446, + "step": 1848 + }, + { + "epoch": 0.04312448433996974, + "grad_norm": 2.476367950439453, + "learning_rate": 9.582793469810832e-06, + "loss": 1.6178, + "step": 1849 + }, + { + "epoch": 0.043147807479147655, + "grad_norm": 2.1805801391601562, + "learning_rate": 9.587976159626847e-06, + "loss": 1.5745, + "step": 1850 + }, + { + "epoch": 0.04317113061832557, + "grad_norm": 1.7875632047653198, + "learning_rate": 9.593158849442861e-06, + "loss": 1.6798, + "step": 1851 + }, + { + "epoch": 0.04319445375750349, + "grad_norm": 2.506103515625, + "learning_rate": 9.598341539258877e-06, + "loss": 1.2824, + "step": 1852 + }, + { + "epoch": 0.04321777689668141, + "grad_norm": 2.027400016784668, + "learning_rate": 9.603524229074891e-06, + "loss": 1.7745, + "step": 1853 + }, + { + "epoch": 0.04324110003585933, + "grad_norm": 1.5254895687103271, + "learning_rate": 9.608706918890906e-06, + "loss": 1.6716, + "step": 1854 + }, + { + "epoch": 0.043264423175037245, + "grad_norm": 1.9832854270935059, + "learning_rate": 9.61388960870692e-06, + "loss": 1.2432, + "step": 1855 + }, + { + "epoch": 0.04328774631421516, + "grad_norm": 1.3785820007324219, + "learning_rate": 9.619072298522934e-06, + "loss": 1.3452, + "step": 1856 + }, + { + "epoch": 0.04331106945339308, + "grad_norm": 2.0536274909973145, + "learning_rate": 9.624254988338949e-06, + "loss": 1.9594, + "step": 1857 + }, + { + "epoch": 0.043334392592571, + "grad_norm": 1.8014826774597168, + "learning_rate": 9.629437678154963e-06, + "loss": 1.4811, + "step": 1858 + }, + { + "epoch": 0.043357715731748916, + "grad_norm": 1.5722678899765015, + "learning_rate": 9.634620367970978e-06, + "loss": 1.2694, + "step": 1859 + }, + { + "epoch": 0.043381038870926834, + "grad_norm": 1.849761724472046, + "learning_rate": 9.639803057786994e-06, + "loss": 1.4856, + "step": 1860 + }, + { + "epoch": 0.04340436201010475, + "grad_norm": 1.412558913230896, + "learning_rate": 9.644985747603006e-06, + "loss": 1.4672, + "step": 1861 + }, + { + "epoch": 0.04342768514928267, + "grad_norm": 2.028230667114258, + "learning_rate": 9.65016843741902e-06, + "loss": 1.5573, + "step": 1862 + }, + { + "epoch": 0.04345100828846059, + "grad_norm": 2.5457494258880615, + "learning_rate": 9.655351127235035e-06, + "loss": 1.3734, + "step": 1863 + }, + { + "epoch": 0.043474331427638506, + "grad_norm": 1.6199779510498047, + "learning_rate": 9.66053381705105e-06, + "loss": 1.6676, + "step": 1864 + }, + { + "epoch": 0.043497654566816424, + "grad_norm": 1.4922274351119995, + "learning_rate": 9.665716506867064e-06, + "loss": 1.5909, + "step": 1865 + }, + { + "epoch": 0.04352097770599434, + "grad_norm": 1.545914649963379, + "learning_rate": 9.67089919668308e-06, + "loss": 1.0683, + "step": 1866 + }, + { + "epoch": 0.04354430084517225, + "grad_norm": 1.4928728342056274, + "learning_rate": 9.676081886499094e-06, + "loss": 1.1975, + "step": 1867 + }, + { + "epoch": 0.04356762398435017, + "grad_norm": 2.042757272720337, + "learning_rate": 9.681264576315108e-06, + "loss": 1.9166, + "step": 1868 + }, + { + "epoch": 0.04359094712352809, + "grad_norm": 1.9415842294692993, + "learning_rate": 9.686447266131123e-06, + "loss": 1.5207, + "step": 1869 + }, + { + "epoch": 0.043614270262706006, + "grad_norm": 1.6906239986419678, + "learning_rate": 9.691629955947137e-06, + "loss": 1.4171, + "step": 1870 + }, + { + "epoch": 0.043637593401883924, + "grad_norm": 1.5644055604934692, + "learning_rate": 9.696812645763151e-06, + "loss": 1.4997, + "step": 1871 + }, + { + "epoch": 0.04366091654106184, + "grad_norm": 1.7778024673461914, + "learning_rate": 9.701995335579166e-06, + "loss": 1.3872, + "step": 1872 + }, + { + "epoch": 0.04368423968023976, + "grad_norm": 1.9999544620513916, + "learning_rate": 9.70717802539518e-06, + "loss": 1.6039, + "step": 1873 + }, + { + "epoch": 0.04370756281941768, + "grad_norm": 2.1065220832824707, + "learning_rate": 9.712360715211196e-06, + "loss": 1.4525, + "step": 1874 + }, + { + "epoch": 0.043730885958595596, + "grad_norm": 1.785739541053772, + "learning_rate": 9.71754340502721e-06, + "loss": 1.7723, + "step": 1875 + }, + { + "epoch": 0.043754209097773514, + "grad_norm": 1.7912609577178955, + "learning_rate": 9.722726094843225e-06, + "loss": 1.8857, + "step": 1876 + }, + { + "epoch": 0.04377753223695143, + "grad_norm": 2.2229981422424316, + "learning_rate": 9.727908784659239e-06, + "loss": 1.7359, + "step": 1877 + }, + { + "epoch": 0.04380085537612935, + "grad_norm": 1.7545627355575562, + "learning_rate": 9.733091474475253e-06, + "loss": 1.3878, + "step": 1878 + }, + { + "epoch": 0.04382417851530727, + "grad_norm": 1.6687484979629517, + "learning_rate": 9.738274164291268e-06, + "loss": 1.3148, + "step": 1879 + }, + { + "epoch": 0.043847501654485185, + "grad_norm": 1.661619782447815, + "learning_rate": 9.743456854107282e-06, + "loss": 1.5319, + "step": 1880 + }, + { + "epoch": 0.0438708247936631, + "grad_norm": 1.6879695653915405, + "learning_rate": 9.748639543923297e-06, + "loss": 1.2871, + "step": 1881 + }, + { + "epoch": 0.04389414793284102, + "grad_norm": 1.614043116569519, + "learning_rate": 9.753822233739313e-06, + "loss": 1.0429, + "step": 1882 + }, + { + "epoch": 0.04391747107201894, + "grad_norm": 1.310645341873169, + "learning_rate": 9.759004923555327e-06, + "loss": 1.5535, + "step": 1883 + }, + { + "epoch": 0.04394079421119686, + "grad_norm": 1.677807092666626, + "learning_rate": 9.764187613371341e-06, + "loss": 1.5612, + "step": 1884 + }, + { + "epoch": 0.043964117350374775, + "grad_norm": 2.004786252975464, + "learning_rate": 9.769370303187356e-06, + "loss": 1.1547, + "step": 1885 + }, + { + "epoch": 0.04398744048955269, + "grad_norm": 2.4537112712860107, + "learning_rate": 9.77455299300337e-06, + "loss": 1.6863, + "step": 1886 + }, + { + "epoch": 0.04401076362873061, + "grad_norm": 1.8132030963897705, + "learning_rate": 9.779735682819384e-06, + "loss": 1.2049, + "step": 1887 + }, + { + "epoch": 0.04403408676790853, + "grad_norm": 1.954026699066162, + "learning_rate": 9.784918372635399e-06, + "loss": 1.3946, + "step": 1888 + }, + { + "epoch": 0.044057409907086446, + "grad_norm": 1.742790699005127, + "learning_rate": 9.790101062451413e-06, + "loss": 1.3851, + "step": 1889 + }, + { + "epoch": 0.044080733046264364, + "grad_norm": 2.010481357574463, + "learning_rate": 9.795283752267427e-06, + "loss": 1.4181, + "step": 1890 + }, + { + "epoch": 0.04410405618544228, + "grad_norm": 1.6661536693572998, + "learning_rate": 9.800466442083442e-06, + "loss": 1.1611, + "step": 1891 + }, + { + "epoch": 0.0441273793246202, + "grad_norm": 1.6758571863174438, + "learning_rate": 9.805649131899456e-06, + "loss": 1.0906, + "step": 1892 + }, + { + "epoch": 0.04415070246379812, + "grad_norm": 1.7925001382827759, + "learning_rate": 9.81083182171547e-06, + "loss": 1.4299, + "step": 1893 + }, + { + "epoch": 0.044174025602976036, + "grad_norm": 1.9415634870529175, + "learning_rate": 9.816014511531485e-06, + "loss": 1.5619, + "step": 1894 + }, + { + "epoch": 0.04419734874215395, + "grad_norm": 1.3546884059906006, + "learning_rate": 9.821197201347499e-06, + "loss": 1.4994, + "step": 1895 + }, + { + "epoch": 0.044220671881331865, + "grad_norm": 2.0756897926330566, + "learning_rate": 9.826379891163515e-06, + "loss": 1.7483, + "step": 1896 + }, + { + "epoch": 0.04424399502050978, + "grad_norm": 1.7983125448226929, + "learning_rate": 9.83156258097953e-06, + "loss": 1.586, + "step": 1897 + }, + { + "epoch": 0.0442673181596877, + "grad_norm": 1.5559202432632446, + "learning_rate": 9.836745270795544e-06, + "loss": 1.5093, + "step": 1898 + }, + { + "epoch": 0.04429064129886562, + "grad_norm": 1.772439956665039, + "learning_rate": 9.841927960611558e-06, + "loss": 1.3449, + "step": 1899 + }, + { + "epoch": 0.044313964438043536, + "grad_norm": 1.9158481359481812, + "learning_rate": 9.847110650427573e-06, + "loss": 1.3239, + "step": 1900 + }, + { + "epoch": 0.044337287577221454, + "grad_norm": 1.801500916481018, + "learning_rate": 9.852293340243587e-06, + "loss": 1.1534, + "step": 1901 + }, + { + "epoch": 0.04436061071639937, + "grad_norm": 1.5766456127166748, + "learning_rate": 9.857476030059601e-06, + "loss": 1.7678, + "step": 1902 + }, + { + "epoch": 0.04438393385557729, + "grad_norm": 1.852655053138733, + "learning_rate": 9.862658719875616e-06, + "loss": 1.7286, + "step": 1903 + }, + { + "epoch": 0.04440725699475521, + "grad_norm": 1.9849982261657715, + "learning_rate": 9.867841409691632e-06, + "loss": 1.3984, + "step": 1904 + }, + { + "epoch": 0.044430580133933126, + "grad_norm": 1.7213250398635864, + "learning_rate": 9.873024099507646e-06, + "loss": 1.6215, + "step": 1905 + }, + { + "epoch": 0.044453903273111044, + "grad_norm": 1.9416676759719849, + "learning_rate": 9.87820678932366e-06, + "loss": 1.6314, + "step": 1906 + }, + { + "epoch": 0.04447722641228896, + "grad_norm": 1.8408985137939453, + "learning_rate": 9.883389479139675e-06, + "loss": 1.6611, + "step": 1907 + }, + { + "epoch": 0.04450054955146688, + "grad_norm": 1.528350591659546, + "learning_rate": 9.888572168955689e-06, + "loss": 1.7559, + "step": 1908 + }, + { + "epoch": 0.0445238726906448, + "grad_norm": 1.6557738780975342, + "learning_rate": 9.893754858771703e-06, + "loss": 1.5072, + "step": 1909 + }, + { + "epoch": 0.044547195829822715, + "grad_norm": 2.0431089401245117, + "learning_rate": 9.898937548587718e-06, + "loss": 1.2895, + "step": 1910 + }, + { + "epoch": 0.04457051896900063, + "grad_norm": 1.8927110433578491, + "learning_rate": 9.904120238403732e-06, + "loss": 1.4221, + "step": 1911 + }, + { + "epoch": 0.04459384210817855, + "grad_norm": 1.547044038772583, + "learning_rate": 9.909302928219748e-06, + "loss": 1.2597, + "step": 1912 + }, + { + "epoch": 0.04461716524735647, + "grad_norm": 1.81504487991333, + "learning_rate": 9.914485618035762e-06, + "loss": 1.4845, + "step": 1913 + }, + { + "epoch": 0.04464048838653439, + "grad_norm": 3.442282199859619, + "learning_rate": 9.919668307851777e-06, + "loss": 1.0979, + "step": 1914 + }, + { + "epoch": 0.044663811525712305, + "grad_norm": 1.8255623579025269, + "learning_rate": 9.924850997667791e-06, + "loss": 1.6663, + "step": 1915 + }, + { + "epoch": 0.04468713466489022, + "grad_norm": 1.7657500505447388, + "learning_rate": 9.930033687483804e-06, + "loss": 1.765, + "step": 1916 + }, + { + "epoch": 0.04471045780406814, + "grad_norm": 1.6761666536331177, + "learning_rate": 9.935216377299818e-06, + "loss": 1.3525, + "step": 1917 + }, + { + "epoch": 0.04473378094324606, + "grad_norm": 2.3319602012634277, + "learning_rate": 9.940399067115834e-06, + "loss": 1.1265, + "step": 1918 + }, + { + "epoch": 0.044757104082423976, + "grad_norm": 1.6062688827514648, + "learning_rate": 9.945581756931848e-06, + "loss": 1.6085, + "step": 1919 + }, + { + "epoch": 0.044780427221601894, + "grad_norm": 1.4931232929229736, + "learning_rate": 9.950764446747863e-06, + "loss": 1.6418, + "step": 1920 + }, + { + "epoch": 0.04480375036077981, + "grad_norm": 2.0092151165008545, + "learning_rate": 9.955947136563877e-06, + "loss": 1.2352, + "step": 1921 + }, + { + "epoch": 0.04482707349995773, + "grad_norm": 2.2695815563201904, + "learning_rate": 9.961129826379892e-06, + "loss": 1.3626, + "step": 1922 + }, + { + "epoch": 0.04485039663913565, + "grad_norm": 1.6969548463821411, + "learning_rate": 9.966312516195906e-06, + "loss": 1.6971, + "step": 1923 + }, + { + "epoch": 0.04487371977831356, + "grad_norm": 1.8436291217803955, + "learning_rate": 9.97149520601192e-06, + "loss": 1.7701, + "step": 1924 + }, + { + "epoch": 0.04489704291749148, + "grad_norm": 1.7749122381210327, + "learning_rate": 9.976677895827935e-06, + "loss": 1.3771, + "step": 1925 + }, + { + "epoch": 0.044920366056669395, + "grad_norm": 1.9239168167114258, + "learning_rate": 9.98186058564395e-06, + "loss": 1.7554, + "step": 1926 + }, + { + "epoch": 0.04494368919584731, + "grad_norm": 1.5236059427261353, + "learning_rate": 9.987043275459965e-06, + "loss": 1.337, + "step": 1927 + }, + { + "epoch": 0.04496701233502523, + "grad_norm": 2.0506536960601807, + "learning_rate": 9.99222596527598e-06, + "loss": 1.7227, + "step": 1928 + }, + { + "epoch": 0.04499033547420315, + "grad_norm": 1.4491156339645386, + "learning_rate": 9.997408655091994e-06, + "loss": 1.4032, + "step": 1929 + }, + { + "epoch": 0.045013658613381066, + "grad_norm": 2.175860643386841, + "learning_rate": 1.0002591344908008e-05, + "loss": 1.5994, + "step": 1930 + }, + { + "epoch": 0.045036981752558984, + "grad_norm": 1.9326441287994385, + "learning_rate": 1.0007774034724022e-05, + "loss": 1.1194, + "step": 1931 + }, + { + "epoch": 0.0450603048917369, + "grad_norm": 1.8562779426574707, + "learning_rate": 1.0012956724540037e-05, + "loss": 1.7551, + "step": 1932 + }, + { + "epoch": 0.04508362803091482, + "grad_norm": 1.7570141553878784, + "learning_rate": 1.0018139414356051e-05, + "loss": 0.97, + "step": 1933 + }, + { + "epoch": 0.04510695117009274, + "grad_norm": 1.2578299045562744, + "learning_rate": 1.0023322104172067e-05, + "loss": 1.3259, + "step": 1934 + }, + { + "epoch": 0.045130274309270656, + "grad_norm": 2.211773633956909, + "learning_rate": 1.0028504793988081e-05, + "loss": 1.6072, + "step": 1935 + }, + { + "epoch": 0.045153597448448574, + "grad_norm": 1.7696832418441772, + "learning_rate": 1.0033687483804096e-05, + "loss": 1.4227, + "step": 1936 + }, + { + "epoch": 0.04517692058762649, + "grad_norm": 1.940531611442566, + "learning_rate": 1.003887017362011e-05, + "loss": 1.8458, + "step": 1937 + }, + { + "epoch": 0.04520024372680441, + "grad_norm": 2.282905101776123, + "learning_rate": 1.0044052863436124e-05, + "loss": 1.2556, + "step": 1938 + }, + { + "epoch": 0.04522356686598233, + "grad_norm": 1.643122673034668, + "learning_rate": 1.0049235553252139e-05, + "loss": 1.5571, + "step": 1939 + }, + { + "epoch": 0.045246890005160245, + "grad_norm": 1.6886086463928223, + "learning_rate": 1.0054418243068153e-05, + "loss": 1.481, + "step": 1940 + }, + { + "epoch": 0.04527021314433816, + "grad_norm": 2.349867105484009, + "learning_rate": 1.0059600932884168e-05, + "loss": 1.8651, + "step": 1941 + }, + { + "epoch": 0.04529353628351608, + "grad_norm": 2.0965826511383057, + "learning_rate": 1.0064783622700184e-05, + "loss": 1.5702, + "step": 1942 + }, + { + "epoch": 0.045316859422694, + "grad_norm": 1.4684425592422485, + "learning_rate": 1.0069966312516198e-05, + "loss": 1.4283, + "step": 1943 + }, + { + "epoch": 0.04534018256187192, + "grad_norm": 3.0096945762634277, + "learning_rate": 1.0075149002332212e-05, + "loss": 1.4832, + "step": 1944 + }, + { + "epoch": 0.045363505701049835, + "grad_norm": 2.2389118671417236, + "learning_rate": 1.0080331692148227e-05, + "loss": 1.6346, + "step": 1945 + }, + { + "epoch": 0.04538682884022775, + "grad_norm": 1.7624162435531616, + "learning_rate": 1.0085514381964241e-05, + "loss": 1.7017, + "step": 1946 + }, + { + "epoch": 0.04541015197940567, + "grad_norm": 1.8136117458343506, + "learning_rate": 1.0090697071780255e-05, + "loss": 1.5987, + "step": 1947 + }, + { + "epoch": 0.04543347511858359, + "grad_norm": 1.678236484527588, + "learning_rate": 1.009587976159627e-05, + "loss": 1.3684, + "step": 1948 + }, + { + "epoch": 0.045456798257761506, + "grad_norm": 1.7862106561660767, + "learning_rate": 1.0101062451412284e-05, + "loss": 1.7998, + "step": 1949 + }, + { + "epoch": 0.045480121396939424, + "grad_norm": 2.0441555976867676, + "learning_rate": 1.0106245141228298e-05, + "loss": 1.2902, + "step": 1950 + }, + { + "epoch": 0.04550344453611734, + "grad_norm": 1.5820708274841309, + "learning_rate": 1.0111427831044314e-05, + "loss": 1.2032, + "step": 1951 + }, + { + "epoch": 0.04552676767529525, + "grad_norm": 1.4560632705688477, + "learning_rate": 1.0116610520860329e-05, + "loss": 1.5599, + "step": 1952 + }, + { + "epoch": 0.04555009081447317, + "grad_norm": 2.3671185970306396, + "learning_rate": 1.0121793210676343e-05, + "loss": 1.6144, + "step": 1953 + }, + { + "epoch": 0.04557341395365109, + "grad_norm": 1.7525554895401, + "learning_rate": 1.0126975900492357e-05, + "loss": 1.913, + "step": 1954 + }, + { + "epoch": 0.04559673709282901, + "grad_norm": 1.2725483179092407, + "learning_rate": 1.0132158590308372e-05, + "loss": 1.2048, + "step": 1955 + }, + { + "epoch": 0.045620060232006925, + "grad_norm": 1.8041915893554688, + "learning_rate": 1.0137341280124386e-05, + "loss": 1.1796, + "step": 1956 + }, + { + "epoch": 0.04564338337118484, + "grad_norm": 2.3629374504089355, + "learning_rate": 1.01425239699404e-05, + "loss": 1.8434, + "step": 1957 + }, + { + "epoch": 0.04566670651036276, + "grad_norm": 1.3975788354873657, + "learning_rate": 1.0147706659756413e-05, + "loss": 1.5474, + "step": 1958 + }, + { + "epoch": 0.04569002964954068, + "grad_norm": 1.4148329496383667, + "learning_rate": 1.0152889349572427e-05, + "loss": 1.4695, + "step": 1959 + }, + { + "epoch": 0.045713352788718596, + "grad_norm": 3.3544209003448486, + "learning_rate": 1.0158072039388442e-05, + "loss": 1.6851, + "step": 1960 + }, + { + "epoch": 0.045736675927896514, + "grad_norm": 1.795784592628479, + "learning_rate": 1.0163254729204458e-05, + "loss": 1.2823, + "step": 1961 + }, + { + "epoch": 0.04575999906707443, + "grad_norm": 2.3135123252868652, + "learning_rate": 1.0168437419020472e-05, + "loss": 1.7222, + "step": 1962 + }, + { + "epoch": 0.04578332220625235, + "grad_norm": 1.62346351146698, + "learning_rate": 1.0173620108836487e-05, + "loss": 1.3822, + "step": 1963 + }, + { + "epoch": 0.04580664534543027, + "grad_norm": 1.9713786840438843, + "learning_rate": 1.0178802798652501e-05, + "loss": 1.1212, + "step": 1964 + }, + { + "epoch": 0.045829968484608186, + "grad_norm": 1.5502241849899292, + "learning_rate": 1.0183985488468515e-05, + "loss": 1.0937, + "step": 1965 + }, + { + "epoch": 0.045853291623786104, + "grad_norm": 1.893622875213623, + "learning_rate": 1.018916817828453e-05, + "loss": 1.7849, + "step": 1966 + }, + { + "epoch": 0.04587661476296402, + "grad_norm": 1.7515870332717896, + "learning_rate": 1.0194350868100544e-05, + "loss": 1.4724, + "step": 1967 + }, + { + "epoch": 0.04589993790214194, + "grad_norm": 1.7589161396026611, + "learning_rate": 1.0199533557916558e-05, + "loss": 1.4281, + "step": 1968 + }, + { + "epoch": 0.04592326104131986, + "grad_norm": 2.377809762954712, + "learning_rate": 1.0204716247732573e-05, + "loss": 1.0402, + "step": 1969 + }, + { + "epoch": 0.045946584180497775, + "grad_norm": 1.6169410943984985, + "learning_rate": 1.0209898937548589e-05, + "loss": 1.2902, + "step": 1970 + }, + { + "epoch": 0.04596990731967569, + "grad_norm": 1.7550357580184937, + "learning_rate": 1.0215081627364603e-05, + "loss": 1.274, + "step": 1971 + }, + { + "epoch": 0.04599323045885361, + "grad_norm": 1.846411943435669, + "learning_rate": 1.0220264317180617e-05, + "loss": 1.2554, + "step": 1972 + }, + { + "epoch": 0.04601655359803153, + "grad_norm": 1.880225419998169, + "learning_rate": 1.0225447006996632e-05, + "loss": 1.3451, + "step": 1973 + }, + { + "epoch": 0.04603987673720945, + "grad_norm": 1.6644784212112427, + "learning_rate": 1.0230629696812646e-05, + "loss": 1.5651, + "step": 1974 + }, + { + "epoch": 0.046063199876387365, + "grad_norm": 1.2287671566009521, + "learning_rate": 1.023581238662866e-05, + "loss": 1.2272, + "step": 1975 + }, + { + "epoch": 0.04608652301556528, + "grad_norm": 5.595534801483154, + "learning_rate": 1.0240995076444675e-05, + "loss": 1.2381, + "step": 1976 + }, + { + "epoch": 0.0461098461547432, + "grad_norm": 1.6219606399536133, + "learning_rate": 1.0246177766260689e-05, + "loss": 1.1357, + "step": 1977 + }, + { + "epoch": 0.04613316929392112, + "grad_norm": 1.7713710069656372, + "learning_rate": 1.0251360456076705e-05, + "loss": 1.3458, + "step": 1978 + }, + { + "epoch": 0.046156492433099036, + "grad_norm": 1.6285533905029297, + "learning_rate": 1.025654314589272e-05, + "loss": 1.6516, + "step": 1979 + }, + { + "epoch": 0.046179815572276954, + "grad_norm": 1.479745864868164, + "learning_rate": 1.0261725835708734e-05, + "loss": 1.2629, + "step": 1980 + }, + { + "epoch": 0.046203138711454865, + "grad_norm": 1.6205228567123413, + "learning_rate": 1.0266908525524748e-05, + "loss": 1.6772, + "step": 1981 + }, + { + "epoch": 0.04622646185063278, + "grad_norm": 1.845969319343567, + "learning_rate": 1.0272091215340763e-05, + "loss": 1.7172, + "step": 1982 + }, + { + "epoch": 0.0462497849898107, + "grad_norm": 1.71135413646698, + "learning_rate": 1.0277273905156777e-05, + "loss": 1.3776, + "step": 1983 + }, + { + "epoch": 0.04627310812898862, + "grad_norm": 1.5999668836593628, + "learning_rate": 1.0282456594972791e-05, + "loss": 1.3148, + "step": 1984 + }, + { + "epoch": 0.04629643126816654, + "grad_norm": 2.372850179672241, + "learning_rate": 1.0287639284788806e-05, + "loss": 1.5203, + "step": 1985 + }, + { + "epoch": 0.046319754407344454, + "grad_norm": 1.9471055269241333, + "learning_rate": 1.0292821974604822e-05, + "loss": 1.1852, + "step": 1986 + }, + { + "epoch": 0.04634307754652237, + "grad_norm": 2.035149574279785, + "learning_rate": 1.0298004664420836e-05, + "loss": 1.5986, + "step": 1987 + }, + { + "epoch": 0.04636640068570029, + "grad_norm": 1.9274436235427856, + "learning_rate": 1.030318735423685e-05, + "loss": 1.3578, + "step": 1988 + }, + { + "epoch": 0.04638972382487821, + "grad_norm": 1.8304780721664429, + "learning_rate": 1.0308370044052865e-05, + "loss": 1.2624, + "step": 1989 + }, + { + "epoch": 0.046413046964056126, + "grad_norm": 2.2276337146759033, + "learning_rate": 1.0313552733868879e-05, + "loss": 1.5508, + "step": 1990 + }, + { + "epoch": 0.046436370103234044, + "grad_norm": 1.7837759256362915, + "learning_rate": 1.0318735423684893e-05, + "loss": 1.4839, + "step": 1991 + }, + { + "epoch": 0.04645969324241196, + "grad_norm": 1.766287088394165, + "learning_rate": 1.0323918113500908e-05, + "loss": 1.7001, + "step": 1992 + }, + { + "epoch": 0.04648301638158988, + "grad_norm": 1.6771559715270996, + "learning_rate": 1.0329100803316922e-05, + "loss": 1.6349, + "step": 1993 + }, + { + "epoch": 0.0465063395207678, + "grad_norm": 1.7568877935409546, + "learning_rate": 1.0334283493132938e-05, + "loss": 1.4524, + "step": 1994 + }, + { + "epoch": 0.046529662659945716, + "grad_norm": 2.070405960083008, + "learning_rate": 1.0339466182948952e-05, + "loss": 1.3437, + "step": 1995 + }, + { + "epoch": 0.04655298579912363, + "grad_norm": 2.852936267852783, + "learning_rate": 1.0344648872764967e-05, + "loss": 1.2623, + "step": 1996 + }, + { + "epoch": 0.04657630893830155, + "grad_norm": 1.3660649061203003, + "learning_rate": 1.0349831562580981e-05, + "loss": 1.3146, + "step": 1997 + }, + { + "epoch": 0.04659963207747947, + "grad_norm": 1.672303318977356, + "learning_rate": 1.0355014252396995e-05, + "loss": 1.3361, + "step": 1998 + }, + { + "epoch": 0.04662295521665739, + "grad_norm": 1.6566362380981445, + "learning_rate": 1.036019694221301e-05, + "loss": 1.4374, + "step": 1999 + }, + { + "epoch": 0.046646278355835305, + "grad_norm": 1.6957907676696777, + "learning_rate": 1.0365379632029024e-05, + "loss": 1.4639, + "step": 2000 + }, + { + "epoch": 0.04666960149501322, + "grad_norm": 1.7481802701950073, + "learning_rate": 1.0370562321845038e-05, + "loss": 1.3724, + "step": 2001 + }, + { + "epoch": 0.04669292463419114, + "grad_norm": 1.7186965942382812, + "learning_rate": 1.0375745011661053e-05, + "loss": 1.4501, + "step": 2002 + }, + { + "epoch": 0.04671624777336906, + "grad_norm": 2.016763925552368, + "learning_rate": 1.0380927701477069e-05, + "loss": 1.598, + "step": 2003 + }, + { + "epoch": 0.04673957091254698, + "grad_norm": 2.0749828815460205, + "learning_rate": 1.0386110391293083e-05, + "loss": 1.6236, + "step": 2004 + }, + { + "epoch": 0.046762894051724894, + "grad_norm": 1.5764251947402954, + "learning_rate": 1.0391293081109098e-05, + "loss": 1.2068, + "step": 2005 + }, + { + "epoch": 0.04678621719090281, + "grad_norm": 1.7553889751434326, + "learning_rate": 1.0396475770925112e-05, + "loss": 1.5647, + "step": 2006 + }, + { + "epoch": 0.04680954033008073, + "grad_norm": 1.603076457977295, + "learning_rate": 1.0401658460741126e-05, + "loss": 1.32, + "step": 2007 + }, + { + "epoch": 0.04683286346925865, + "grad_norm": 1.6535013914108276, + "learning_rate": 1.040684115055714e-05, + "loss": 1.5769, + "step": 2008 + }, + { + "epoch": 0.04685618660843656, + "grad_norm": 2.1688969135284424, + "learning_rate": 1.0412023840373155e-05, + "loss": 1.3436, + "step": 2009 + }, + { + "epoch": 0.04687950974761448, + "grad_norm": 2.3349978923797607, + "learning_rate": 1.041720653018917e-05, + "loss": 1.5229, + "step": 2010 + }, + { + "epoch": 0.046902832886792395, + "grad_norm": 1.5256779193878174, + "learning_rate": 1.0422389220005185e-05, + "loss": 1.5068, + "step": 2011 + }, + { + "epoch": 0.04692615602597031, + "grad_norm": 1.6567631959915161, + "learning_rate": 1.04275719098212e-05, + "loss": 1.4358, + "step": 2012 + }, + { + "epoch": 0.04694947916514823, + "grad_norm": 1.9560909271240234, + "learning_rate": 1.043275459963721e-05, + "loss": 1.4106, + "step": 2013 + }, + { + "epoch": 0.04697280230432615, + "grad_norm": 1.7204327583312988, + "learning_rate": 1.0437937289453227e-05, + "loss": 1.6955, + "step": 2014 + }, + { + "epoch": 0.046996125443504067, + "grad_norm": 1.4477440118789673, + "learning_rate": 1.0443119979269241e-05, + "loss": 1.3587, + "step": 2015 + }, + { + "epoch": 0.047019448582681984, + "grad_norm": 1.5402978658676147, + "learning_rate": 1.0448302669085255e-05, + "loss": 1.4493, + "step": 2016 + }, + { + "epoch": 0.0470427717218599, + "grad_norm": 2.143346071243286, + "learning_rate": 1.045348535890127e-05, + "loss": 1.4861, + "step": 2017 + }, + { + "epoch": 0.04706609486103782, + "grad_norm": 1.7556196451187134, + "learning_rate": 1.0458668048717284e-05, + "loss": 1.3845, + "step": 2018 + }, + { + "epoch": 0.04708941800021574, + "grad_norm": 1.7759301662445068, + "learning_rate": 1.0463850738533298e-05, + "loss": 1.7286, + "step": 2019 + }, + { + "epoch": 0.047112741139393656, + "grad_norm": 1.6534310579299927, + "learning_rate": 1.0469033428349313e-05, + "loss": 1.5995, + "step": 2020 + }, + { + "epoch": 0.047136064278571574, + "grad_norm": 1.8520358800888062, + "learning_rate": 1.0474216118165327e-05, + "loss": 1.7078, + "step": 2021 + }, + { + "epoch": 0.04715938741774949, + "grad_norm": 1.762181043624878, + "learning_rate": 1.0479398807981343e-05, + "loss": 1.4365, + "step": 2022 + }, + { + "epoch": 0.04718271055692741, + "grad_norm": 2.2282192707061768, + "learning_rate": 1.0484581497797357e-05, + "loss": 1.0928, + "step": 2023 + }, + { + "epoch": 0.04720603369610533, + "grad_norm": 1.602739691734314, + "learning_rate": 1.0489764187613372e-05, + "loss": 1.8262, + "step": 2024 + }, + { + "epoch": 0.047229356835283245, + "grad_norm": 1.9949517250061035, + "learning_rate": 1.0494946877429386e-05, + "loss": 1.4207, + "step": 2025 + }, + { + "epoch": 0.04725267997446116, + "grad_norm": 1.738860011100769, + "learning_rate": 1.05001295672454e-05, + "loss": 1.3735, + "step": 2026 + }, + { + "epoch": 0.04727600311363908, + "grad_norm": 1.5074880123138428, + "learning_rate": 1.0505312257061415e-05, + "loss": 1.5181, + "step": 2027 + }, + { + "epoch": 0.047299326252817, + "grad_norm": 2.0777339935302734, + "learning_rate": 1.051049494687743e-05, + "loss": 1.1551, + "step": 2028 + }, + { + "epoch": 0.04732264939199492, + "grad_norm": 1.905203104019165, + "learning_rate": 1.0515677636693444e-05, + "loss": 1.4136, + "step": 2029 + }, + { + "epoch": 0.047345972531172835, + "grad_norm": 1.9061765670776367, + "learning_rate": 1.052086032650946e-05, + "loss": 1.4066, + "step": 2030 + }, + { + "epoch": 0.04736929567035075, + "grad_norm": 2.3726284503936768, + "learning_rate": 1.0526043016325474e-05, + "loss": 1.5807, + "step": 2031 + }, + { + "epoch": 0.04739261880952867, + "grad_norm": 1.7928005456924438, + "learning_rate": 1.0531225706141488e-05, + "loss": 1.4065, + "step": 2032 + }, + { + "epoch": 0.04741594194870659, + "grad_norm": 2.108304977416992, + "learning_rate": 1.0536408395957503e-05, + "loss": 1.4989, + "step": 2033 + }, + { + "epoch": 0.047439265087884507, + "grad_norm": 1.770139455795288, + "learning_rate": 1.0541591085773517e-05, + "loss": 1.5682, + "step": 2034 + }, + { + "epoch": 0.047462588227062424, + "grad_norm": 1.5590190887451172, + "learning_rate": 1.0546773775589531e-05, + "loss": 1.5977, + "step": 2035 + }, + { + "epoch": 0.04748591136624034, + "grad_norm": 1.8117510080337524, + "learning_rate": 1.0551956465405546e-05, + "loss": 1.366, + "step": 2036 + }, + { + "epoch": 0.04750923450541826, + "grad_norm": 1.4459258317947388, + "learning_rate": 1.055713915522156e-05, + "loss": 1.3128, + "step": 2037 + }, + { + "epoch": 0.04753255764459617, + "grad_norm": 1.661622166633606, + "learning_rate": 1.0562321845037576e-05, + "loss": 1.195, + "step": 2038 + }, + { + "epoch": 0.04755588078377409, + "grad_norm": 3.0413358211517334, + "learning_rate": 1.056750453485359e-05, + "loss": 1.2938, + "step": 2039 + }, + { + "epoch": 0.04757920392295201, + "grad_norm": 2.0208404064178467, + "learning_rate": 1.0572687224669605e-05, + "loss": 1.4222, + "step": 2040 + }, + { + "epoch": 0.047602527062129925, + "grad_norm": 1.5804784297943115, + "learning_rate": 1.0577869914485619e-05, + "loss": 1.4815, + "step": 2041 + }, + { + "epoch": 0.04762585020130784, + "grad_norm": 1.550583839416504, + "learning_rate": 1.0583052604301633e-05, + "loss": 1.2965, + "step": 2042 + }, + { + "epoch": 0.04764917334048576, + "grad_norm": 1.808358907699585, + "learning_rate": 1.0588235294117648e-05, + "loss": 1.7293, + "step": 2043 + }, + { + "epoch": 0.04767249647966368, + "grad_norm": 1.5591018199920654, + "learning_rate": 1.0593417983933662e-05, + "loss": 1.411, + "step": 2044 + }, + { + "epoch": 0.047695819618841596, + "grad_norm": 1.9220789670944214, + "learning_rate": 1.0598600673749677e-05, + "loss": 1.6004, + "step": 2045 + }, + { + "epoch": 0.047719142758019514, + "grad_norm": 1.454370141029358, + "learning_rate": 1.0603783363565691e-05, + "loss": 1.237, + "step": 2046 + }, + { + "epoch": 0.04774246589719743, + "grad_norm": 2.064704179763794, + "learning_rate": 1.0608966053381707e-05, + "loss": 1.5761, + "step": 2047 + }, + { + "epoch": 0.04776578903637535, + "grad_norm": 2.0700747966766357, + "learning_rate": 1.0614148743197721e-05, + "loss": 1.5053, + "step": 2048 + }, + { + "epoch": 0.04778911217555327, + "grad_norm": 2.590925931930542, + "learning_rate": 1.0619331433013736e-05, + "loss": 1.6171, + "step": 2049 + }, + { + "epoch": 0.047812435314731186, + "grad_norm": 1.7378954887390137, + "learning_rate": 1.062451412282975e-05, + "loss": 1.7035, + "step": 2050 + }, + { + "epoch": 0.047835758453909104, + "grad_norm": 1.74659264087677, + "learning_rate": 1.0629696812645764e-05, + "loss": 1.4361, + "step": 2051 + }, + { + "epoch": 0.04785908159308702, + "grad_norm": 1.1299657821655273, + "learning_rate": 1.0634879502461779e-05, + "loss": 1.1613, + "step": 2052 + }, + { + "epoch": 0.04788240473226494, + "grad_norm": 1.803884506225586, + "learning_rate": 1.0640062192277793e-05, + "loss": 1.8098, + "step": 2053 + }, + { + "epoch": 0.04790572787144286, + "grad_norm": 1.7978615760803223, + "learning_rate": 1.0645244882093807e-05, + "loss": 1.3783, + "step": 2054 + }, + { + "epoch": 0.047929051010620775, + "grad_norm": 2.5186221599578857, + "learning_rate": 1.0650427571909823e-05, + "loss": 1.3875, + "step": 2055 + }, + { + "epoch": 0.04795237414979869, + "grad_norm": 2.0152792930603027, + "learning_rate": 1.0655610261725838e-05, + "loss": 1.3065, + "step": 2056 + }, + { + "epoch": 0.04797569728897661, + "grad_norm": 1.6235994100570679, + "learning_rate": 1.0660792951541852e-05, + "loss": 1.5058, + "step": 2057 + }, + { + "epoch": 0.04799902042815453, + "grad_norm": 1.4507194757461548, + "learning_rate": 1.0665975641357866e-05, + "loss": 1.0629, + "step": 2058 + }, + { + "epoch": 0.04802234356733245, + "grad_norm": 1.8985979557037354, + "learning_rate": 1.067115833117388e-05, + "loss": 1.9146, + "step": 2059 + }, + { + "epoch": 0.048045666706510365, + "grad_norm": 2.050368309020996, + "learning_rate": 1.0676341020989895e-05, + "loss": 1.4296, + "step": 2060 + }, + { + "epoch": 0.04806898984568828, + "grad_norm": 1.9865542650222778, + "learning_rate": 1.068152371080591e-05, + "loss": 1.3698, + "step": 2061 + }, + { + "epoch": 0.0480923129848662, + "grad_norm": 1.6175458431243896, + "learning_rate": 1.0686706400621924e-05, + "loss": 1.3281, + "step": 2062 + }, + { + "epoch": 0.04811563612404412, + "grad_norm": 2.048891067504883, + "learning_rate": 1.069188909043794e-05, + "loss": 1.4731, + "step": 2063 + }, + { + "epoch": 0.048138959263222036, + "grad_norm": 1.917079210281372, + "learning_rate": 1.0697071780253954e-05, + "loss": 1.3246, + "step": 2064 + }, + { + "epoch": 0.048162282402399954, + "grad_norm": 1.8361105918884277, + "learning_rate": 1.0702254470069969e-05, + "loss": 1.5121, + "step": 2065 + }, + { + "epoch": 0.048185605541577865, + "grad_norm": 2.233614921569824, + "learning_rate": 1.0707437159885983e-05, + "loss": 1.6197, + "step": 2066 + }, + { + "epoch": 0.04820892868075578, + "grad_norm": 2.2280359268188477, + "learning_rate": 1.0712619849701997e-05, + "loss": 1.8298, + "step": 2067 + }, + { + "epoch": 0.0482322518199337, + "grad_norm": 1.7553162574768066, + "learning_rate": 1.071780253951801e-05, + "loss": 1.3408, + "step": 2068 + }, + { + "epoch": 0.04825557495911162, + "grad_norm": 1.6910414695739746, + "learning_rate": 1.0722985229334024e-05, + "loss": 1.473, + "step": 2069 + }, + { + "epoch": 0.04827889809828954, + "grad_norm": 2.053236961364746, + "learning_rate": 1.0728167919150039e-05, + "loss": 1.6323, + "step": 2070 + }, + { + "epoch": 0.048302221237467455, + "grad_norm": 1.5760650634765625, + "learning_rate": 1.0733350608966053e-05, + "loss": 1.4442, + "step": 2071 + }, + { + "epoch": 0.04832554437664537, + "grad_norm": 1.9398868083953857, + "learning_rate": 1.0738533298782067e-05, + "loss": 1.342, + "step": 2072 + }, + { + "epoch": 0.04834886751582329, + "grad_norm": 1.6530135869979858, + "learning_rate": 1.0743715988598082e-05, + "loss": 1.47, + "step": 2073 + }, + { + "epoch": 0.04837219065500121, + "grad_norm": 1.8386073112487793, + "learning_rate": 1.0748898678414098e-05, + "loss": 1.5633, + "step": 2074 + }, + { + "epoch": 0.048395513794179126, + "grad_norm": 2.6090152263641357, + "learning_rate": 1.0754081368230112e-05, + "loss": 1.5872, + "step": 2075 + }, + { + "epoch": 0.048418836933357044, + "grad_norm": 1.598777174949646, + "learning_rate": 1.0759264058046126e-05, + "loss": 1.3968, + "step": 2076 + }, + { + "epoch": 0.04844216007253496, + "grad_norm": 1.8660935163497925, + "learning_rate": 1.076444674786214e-05, + "loss": 1.3865, + "step": 2077 + }, + { + "epoch": 0.04846548321171288, + "grad_norm": 1.7573018074035645, + "learning_rate": 1.0769629437678155e-05, + "loss": 1.5532, + "step": 2078 + }, + { + "epoch": 0.0484888063508908, + "grad_norm": 1.8254483938217163, + "learning_rate": 1.077481212749417e-05, + "loss": 1.6758, + "step": 2079 + }, + { + "epoch": 0.048512129490068716, + "grad_norm": 2.102221727371216, + "learning_rate": 1.0779994817310184e-05, + "loss": 1.6375, + "step": 2080 + }, + { + "epoch": 0.048535452629246634, + "grad_norm": 1.6811659336090088, + "learning_rate": 1.0785177507126198e-05, + "loss": 1.4086, + "step": 2081 + }, + { + "epoch": 0.04855877576842455, + "grad_norm": 1.7381610870361328, + "learning_rate": 1.0790360196942214e-05, + "loss": 1.4361, + "step": 2082 + }, + { + "epoch": 0.04858209890760247, + "grad_norm": 1.346579670906067, + "learning_rate": 1.0795542886758228e-05, + "loss": 1.5014, + "step": 2083 + }, + { + "epoch": 0.04860542204678039, + "grad_norm": 1.4739258289337158, + "learning_rate": 1.0800725576574243e-05, + "loss": 0.922, + "step": 2084 + }, + { + "epoch": 0.048628745185958305, + "grad_norm": 1.5859721899032593, + "learning_rate": 1.0805908266390257e-05, + "loss": 1.4007, + "step": 2085 + }, + { + "epoch": 0.04865206832513622, + "grad_norm": 2.35955548286438, + "learning_rate": 1.0811090956206272e-05, + "loss": 1.3838, + "step": 2086 + }, + { + "epoch": 0.04867539146431414, + "grad_norm": 2.1148109436035156, + "learning_rate": 1.0816273646022286e-05, + "loss": 1.3424, + "step": 2087 + }, + { + "epoch": 0.04869871460349206, + "grad_norm": 1.6170052289962769, + "learning_rate": 1.08214563358383e-05, + "loss": 1.3171, + "step": 2088 + }, + { + "epoch": 0.04872203774266998, + "grad_norm": 1.8172898292541504, + "learning_rate": 1.0826639025654315e-05, + "loss": 1.8264, + "step": 2089 + }, + { + "epoch": 0.048745360881847895, + "grad_norm": 1.5396174192428589, + "learning_rate": 1.083182171547033e-05, + "loss": 1.2572, + "step": 2090 + }, + { + "epoch": 0.04876868402102581, + "grad_norm": 1.6363898515701294, + "learning_rate": 1.0837004405286345e-05, + "loss": 1.4644, + "step": 2091 + }, + { + "epoch": 0.04879200716020373, + "grad_norm": 2.310607433319092, + "learning_rate": 1.084218709510236e-05, + "loss": 1.773, + "step": 2092 + }, + { + "epoch": 0.04881533029938165, + "grad_norm": 1.9963150024414062, + "learning_rate": 1.0847369784918374e-05, + "loss": 1.7195, + "step": 2093 + }, + { + "epoch": 0.048838653438559566, + "grad_norm": 2.051004409790039, + "learning_rate": 1.0852552474734388e-05, + "loss": 1.4198, + "step": 2094 + }, + { + "epoch": 0.04886197657773748, + "grad_norm": 1.7132155895233154, + "learning_rate": 1.0857735164550402e-05, + "loss": 1.3747, + "step": 2095 + }, + { + "epoch": 0.048885299716915395, + "grad_norm": 1.9538960456848145, + "learning_rate": 1.0862917854366417e-05, + "loss": 1.7172, + "step": 2096 + }, + { + "epoch": 0.04890862285609331, + "grad_norm": 2.2270054817199707, + "learning_rate": 1.0868100544182431e-05, + "loss": 1.5241, + "step": 2097 + }, + { + "epoch": 0.04893194599527123, + "grad_norm": 1.859562873840332, + "learning_rate": 1.0873283233998445e-05, + "loss": 1.5629, + "step": 2098 + }, + { + "epoch": 0.04895526913444915, + "grad_norm": 1.9491593837738037, + "learning_rate": 1.0878465923814461e-05, + "loss": 1.4231, + "step": 2099 + }, + { + "epoch": 0.04897859227362707, + "grad_norm": 1.566665768623352, + "learning_rate": 1.0883648613630476e-05, + "loss": 1.6466, + "step": 2100 + }, + { + "epoch": 0.049001915412804985, + "grad_norm": 1.6445392370224, + "learning_rate": 1.088883130344649e-05, + "loss": 1.3313, + "step": 2101 + }, + { + "epoch": 0.0490252385519829, + "grad_norm": 2.30572772026062, + "learning_rate": 1.0894013993262504e-05, + "loss": 1.9519, + "step": 2102 + }, + { + "epoch": 0.04904856169116082, + "grad_norm": 1.9258235692977905, + "learning_rate": 1.0899196683078519e-05, + "loss": 1.6955, + "step": 2103 + }, + { + "epoch": 0.04907188483033874, + "grad_norm": 2.236351490020752, + "learning_rate": 1.0904379372894533e-05, + "loss": 1.576, + "step": 2104 + }, + { + "epoch": 0.049095207969516656, + "grad_norm": 1.592620849609375, + "learning_rate": 1.0909562062710547e-05, + "loss": 1.4411, + "step": 2105 + }, + { + "epoch": 0.049118531108694574, + "grad_norm": 1.689764380455017, + "learning_rate": 1.0914744752526562e-05, + "loss": 1.554, + "step": 2106 + }, + { + "epoch": 0.04914185424787249, + "grad_norm": 1.9864122867584229, + "learning_rate": 1.0919927442342578e-05, + "loss": 1.5606, + "step": 2107 + }, + { + "epoch": 0.04916517738705041, + "grad_norm": 1.8440738916397095, + "learning_rate": 1.0925110132158592e-05, + "loss": 1.3705, + "step": 2108 + }, + { + "epoch": 0.04918850052622833, + "grad_norm": 1.4290404319763184, + "learning_rate": 1.0930292821974607e-05, + "loss": 1.3095, + "step": 2109 + }, + { + "epoch": 0.049211823665406246, + "grad_norm": 2.0556204319000244, + "learning_rate": 1.0935475511790621e-05, + "loss": 1.2491, + "step": 2110 + }, + { + "epoch": 0.049235146804584164, + "grad_norm": 1.4986300468444824, + "learning_rate": 1.0940658201606635e-05, + "loss": 1.48, + "step": 2111 + }, + { + "epoch": 0.04925846994376208, + "grad_norm": 1.9182485342025757, + "learning_rate": 1.094584089142265e-05, + "loss": 1.7362, + "step": 2112 + }, + { + "epoch": 0.04928179308294, + "grad_norm": 1.789497971534729, + "learning_rate": 1.0951023581238664e-05, + "loss": 1.5304, + "step": 2113 + }, + { + "epoch": 0.04930511622211792, + "grad_norm": 1.468636155128479, + "learning_rate": 1.0956206271054678e-05, + "loss": 1.1878, + "step": 2114 + }, + { + "epoch": 0.049328439361295835, + "grad_norm": 1.8166897296905518, + "learning_rate": 1.0961388960870694e-05, + "loss": 1.4757, + "step": 2115 + }, + { + "epoch": 0.04935176250047375, + "grad_norm": 1.592802882194519, + "learning_rate": 1.0966571650686709e-05, + "loss": 1.0244, + "step": 2116 + }, + { + "epoch": 0.04937508563965167, + "grad_norm": 1.8792015314102173, + "learning_rate": 1.0971754340502723e-05, + "loss": 1.5129, + "step": 2117 + }, + { + "epoch": 0.04939840877882959, + "grad_norm": 1.6895866394042969, + "learning_rate": 1.0976937030318737e-05, + "loss": 1.1866, + "step": 2118 + }, + { + "epoch": 0.04942173191800751, + "grad_norm": 2.1797773838043213, + "learning_rate": 1.0982119720134752e-05, + "loss": 1.7618, + "step": 2119 + }, + { + "epoch": 0.049445055057185425, + "grad_norm": 1.7822822332382202, + "learning_rate": 1.0987302409950766e-05, + "loss": 1.1358, + "step": 2120 + }, + { + "epoch": 0.04946837819636334, + "grad_norm": 2.3683722019195557, + "learning_rate": 1.099248509976678e-05, + "loss": 1.5685, + "step": 2121 + }, + { + "epoch": 0.04949170133554126, + "grad_norm": 1.9489614963531494, + "learning_rate": 1.0997667789582795e-05, + "loss": 1.7013, + "step": 2122 + }, + { + "epoch": 0.04951502447471917, + "grad_norm": 2.0891358852386475, + "learning_rate": 1.1002850479398807e-05, + "loss": 1.4574, + "step": 2123 + }, + { + "epoch": 0.04953834761389709, + "grad_norm": 2.1940853595733643, + "learning_rate": 1.1008033169214822e-05, + "loss": 1.0502, + "step": 2124 + }, + { + "epoch": 0.04956167075307501, + "grad_norm": 1.7693442106246948, + "learning_rate": 1.1013215859030836e-05, + "loss": 1.3404, + "step": 2125 + }, + { + "epoch": 0.049584993892252925, + "grad_norm": 1.9649901390075684, + "learning_rate": 1.1018398548846852e-05, + "loss": 1.6006, + "step": 2126 + }, + { + "epoch": 0.04960831703143084, + "grad_norm": 1.1453593969345093, + "learning_rate": 1.1023581238662867e-05, + "loss": 1.3859, + "step": 2127 + }, + { + "epoch": 0.04963164017060876, + "grad_norm": 1.6021969318389893, + "learning_rate": 1.1028763928478881e-05, + "loss": 1.2068, + "step": 2128 + }, + { + "epoch": 0.04965496330978668, + "grad_norm": 1.8170448541641235, + "learning_rate": 1.1033946618294895e-05, + "loss": 1.7754, + "step": 2129 + }, + { + "epoch": 0.0496782864489646, + "grad_norm": 1.5090433359146118, + "learning_rate": 1.103912930811091e-05, + "loss": 1.2575, + "step": 2130 + }, + { + "epoch": 0.049701609588142515, + "grad_norm": 2.0015881061553955, + "learning_rate": 1.1044311997926924e-05, + "loss": 1.4633, + "step": 2131 + }, + { + "epoch": 0.04972493272732043, + "grad_norm": 1.5064808130264282, + "learning_rate": 1.1049494687742938e-05, + "loss": 1.1692, + "step": 2132 + }, + { + "epoch": 0.04974825586649835, + "grad_norm": 1.6928976774215698, + "learning_rate": 1.1054677377558953e-05, + "loss": 1.3624, + "step": 2133 + }, + { + "epoch": 0.04977157900567627, + "grad_norm": 1.6377867460250854, + "learning_rate": 1.1059860067374969e-05, + "loss": 1.6004, + "step": 2134 + }, + { + "epoch": 0.049794902144854186, + "grad_norm": 2.1778640747070312, + "learning_rate": 1.1065042757190983e-05, + "loss": 1.7913, + "step": 2135 + }, + { + "epoch": 0.049818225284032104, + "grad_norm": 1.695723295211792, + "learning_rate": 1.1070225447006997e-05, + "loss": 1.2567, + "step": 2136 + }, + { + "epoch": 0.04984154842321002, + "grad_norm": 1.6048933267593384, + "learning_rate": 1.1075408136823012e-05, + "loss": 1.5821, + "step": 2137 + }, + { + "epoch": 0.04986487156238794, + "grad_norm": 1.6514248847961426, + "learning_rate": 1.1080590826639026e-05, + "loss": 1.2986, + "step": 2138 + }, + { + "epoch": 0.04988819470156586, + "grad_norm": 3.6867258548736572, + "learning_rate": 1.108577351645504e-05, + "loss": 1.2123, + "step": 2139 + }, + { + "epoch": 0.049911517840743776, + "grad_norm": 2.162198543548584, + "learning_rate": 1.1090956206271055e-05, + "loss": 1.41, + "step": 2140 + }, + { + "epoch": 0.049934840979921694, + "grad_norm": 1.446838617324829, + "learning_rate": 1.1096138896087069e-05, + "loss": 1.4192, + "step": 2141 + }, + { + "epoch": 0.04995816411909961, + "grad_norm": 1.6596550941467285, + "learning_rate": 1.1101321585903085e-05, + "loss": 1.4474, + "step": 2142 + }, + { + "epoch": 0.04998148725827753, + "grad_norm": 1.4073383808135986, + "learning_rate": 1.11065042757191e-05, + "loss": 1.1819, + "step": 2143 + }, + { + "epoch": 0.05000481039745545, + "grad_norm": 1.4993380308151245, + "learning_rate": 1.1111686965535114e-05, + "loss": 1.5477, + "step": 2144 + }, + { + "epoch": 0.050028133536633365, + "grad_norm": 2.566545248031616, + "learning_rate": 1.1116869655351128e-05, + "loss": 0.9798, + "step": 2145 + }, + { + "epoch": 0.05005145667581128, + "grad_norm": 1.9660520553588867, + "learning_rate": 1.1122052345167142e-05, + "loss": 1.4595, + "step": 2146 + }, + { + "epoch": 0.0500747798149892, + "grad_norm": 1.6653189659118652, + "learning_rate": 1.1127235034983157e-05, + "loss": 1.3836, + "step": 2147 + }, + { + "epoch": 0.05009810295416712, + "grad_norm": 1.651076078414917, + "learning_rate": 1.1132417724799171e-05, + "loss": 1.2737, + "step": 2148 + }, + { + "epoch": 0.05012142609334504, + "grad_norm": 1.9957411289215088, + "learning_rate": 1.1137600414615186e-05, + "loss": 1.236, + "step": 2149 + }, + { + "epoch": 0.050144749232522955, + "grad_norm": 1.9232287406921387, + "learning_rate": 1.11427831044312e-05, + "loss": 1.3815, + "step": 2150 + }, + { + "epoch": 0.05016807237170087, + "grad_norm": 2.0008256435394287, + "learning_rate": 1.1147965794247216e-05, + "loss": 1.5746, + "step": 2151 + }, + { + "epoch": 0.050191395510878783, + "grad_norm": 1.3993918895721436, + "learning_rate": 1.115314848406323e-05, + "loss": 1.4401, + "step": 2152 + }, + { + "epoch": 0.0502147186500567, + "grad_norm": 1.95811927318573, + "learning_rate": 1.1158331173879245e-05, + "loss": 1.5971, + "step": 2153 + }, + { + "epoch": 0.05023804178923462, + "grad_norm": 1.6759461164474487, + "learning_rate": 1.1163513863695259e-05, + "loss": 1.5051, + "step": 2154 + }, + { + "epoch": 0.05026136492841254, + "grad_norm": 1.6531785726547241, + "learning_rate": 1.1168696553511273e-05, + "loss": 1.6752, + "step": 2155 + }, + { + "epoch": 0.050284688067590455, + "grad_norm": 1.9572185277938843, + "learning_rate": 1.1173879243327288e-05, + "loss": 1.5851, + "step": 2156 + }, + { + "epoch": 0.05030801120676837, + "grad_norm": 1.6343263387680054, + "learning_rate": 1.1179061933143302e-05, + "loss": 1.3371, + "step": 2157 + }, + { + "epoch": 0.05033133434594629, + "grad_norm": 1.6465532779693604, + "learning_rate": 1.1184244622959316e-05, + "loss": 1.2993, + "step": 2158 + }, + { + "epoch": 0.05035465748512421, + "grad_norm": 2.2306525707244873, + "learning_rate": 1.1189427312775332e-05, + "loss": 1.3089, + "step": 2159 + }, + { + "epoch": 0.05037798062430213, + "grad_norm": 1.6966320276260376, + "learning_rate": 1.1194610002591347e-05, + "loss": 1.2082, + "step": 2160 + }, + { + "epoch": 0.050401303763480045, + "grad_norm": 1.576805830001831, + "learning_rate": 1.1199792692407361e-05, + "loss": 1.6838, + "step": 2161 + }, + { + "epoch": 0.05042462690265796, + "grad_norm": 1.8179023265838623, + "learning_rate": 1.1204975382223375e-05, + "loss": 1.2606, + "step": 2162 + }, + { + "epoch": 0.05044795004183588, + "grad_norm": 1.7981466054916382, + "learning_rate": 1.121015807203939e-05, + "loss": 1.2698, + "step": 2163 + }, + { + "epoch": 0.0504712731810138, + "grad_norm": 2.0942420959472656, + "learning_rate": 1.1215340761855404e-05, + "loss": 1.4324, + "step": 2164 + }, + { + "epoch": 0.050494596320191716, + "grad_norm": 3.04243803024292, + "learning_rate": 1.1220523451671418e-05, + "loss": 1.725, + "step": 2165 + }, + { + "epoch": 0.050517919459369634, + "grad_norm": 1.5606476068496704, + "learning_rate": 1.1225706141487433e-05, + "loss": 1.2715, + "step": 2166 + }, + { + "epoch": 0.05054124259854755, + "grad_norm": 1.8899224996566772, + "learning_rate": 1.1230888831303449e-05, + "loss": 1.5831, + "step": 2167 + }, + { + "epoch": 0.05056456573772547, + "grad_norm": 2.1356899738311768, + "learning_rate": 1.1236071521119463e-05, + "loss": 1.3111, + "step": 2168 + }, + { + "epoch": 0.05058788887690339, + "grad_norm": 2.962841033935547, + "learning_rate": 1.1241254210935478e-05, + "loss": 1.2757, + "step": 2169 + }, + { + "epoch": 0.050611212016081306, + "grad_norm": 1.5891367197036743, + "learning_rate": 1.1246436900751492e-05, + "loss": 1.5705, + "step": 2170 + }, + { + "epoch": 0.050634535155259223, + "grad_norm": 1.9614683389663696, + "learning_rate": 1.1251619590567506e-05, + "loss": 1.273, + "step": 2171 + }, + { + "epoch": 0.05065785829443714, + "grad_norm": 1.9047058820724487, + "learning_rate": 1.125680228038352e-05, + "loss": 1.2976, + "step": 2172 + }, + { + "epoch": 0.05068118143361506, + "grad_norm": 1.6429585218429565, + "learning_rate": 1.1261984970199535e-05, + "loss": 1.1797, + "step": 2173 + }, + { + "epoch": 0.05070450457279298, + "grad_norm": 1.5334621667861938, + "learning_rate": 1.126716766001555e-05, + "loss": 1.3446, + "step": 2174 + }, + { + "epoch": 0.050727827711970895, + "grad_norm": 2.300473690032959, + "learning_rate": 1.1272350349831565e-05, + "loss": 1.471, + "step": 2175 + }, + { + "epoch": 0.05075115085114881, + "grad_norm": 2.3872854709625244, + "learning_rate": 1.127753303964758e-05, + "loss": 1.4409, + "step": 2176 + }, + { + "epoch": 0.05077447399032673, + "grad_norm": 1.5523571968078613, + "learning_rate": 1.1282715729463594e-05, + "loss": 1.2701, + "step": 2177 + }, + { + "epoch": 0.05079779712950465, + "grad_norm": 2.0226919651031494, + "learning_rate": 1.1287898419279607e-05, + "loss": 1.702, + "step": 2178 + }, + { + "epoch": 0.05082112026868257, + "grad_norm": 2.056716203689575, + "learning_rate": 1.1293081109095621e-05, + "loss": 1.3019, + "step": 2179 + }, + { + "epoch": 0.05084444340786048, + "grad_norm": 3.767144203186035, + "learning_rate": 1.1298263798911635e-05, + "loss": 1.6133, + "step": 2180 + }, + { + "epoch": 0.050867766547038396, + "grad_norm": 1.7097781896591187, + "learning_rate": 1.130344648872765e-05, + "loss": 1.119, + "step": 2181 + }, + { + "epoch": 0.05089108968621631, + "grad_norm": 2.574557304382324, + "learning_rate": 1.1308629178543664e-05, + "loss": 1.4325, + "step": 2182 + }, + { + "epoch": 0.05091441282539423, + "grad_norm": 1.7624635696411133, + "learning_rate": 1.1313811868359678e-05, + "loss": 1.3827, + "step": 2183 + }, + { + "epoch": 0.05093773596457215, + "grad_norm": 1.9757330417633057, + "learning_rate": 1.1318994558175693e-05, + "loss": 1.4845, + "step": 2184 + }, + { + "epoch": 0.05096105910375007, + "grad_norm": 1.4644198417663574, + "learning_rate": 1.1324177247991707e-05, + "loss": 1.1953, + "step": 2185 + }, + { + "epoch": 0.050984382242927985, + "grad_norm": 1.6254751682281494, + "learning_rate": 1.1329359937807723e-05, + "loss": 1.4711, + "step": 2186 + }, + { + "epoch": 0.0510077053821059, + "grad_norm": 1.4148911237716675, + "learning_rate": 1.1334542627623737e-05, + "loss": 1.1752, + "step": 2187 + }, + { + "epoch": 0.05103102852128382, + "grad_norm": 1.9235097169876099, + "learning_rate": 1.1339725317439752e-05, + "loss": 1.5117, + "step": 2188 + }, + { + "epoch": 0.05105435166046174, + "grad_norm": 2.0073230266571045, + "learning_rate": 1.1344908007255766e-05, + "loss": 1.7692, + "step": 2189 + }, + { + "epoch": 0.05107767479963966, + "grad_norm": 1.6105692386627197, + "learning_rate": 1.135009069707178e-05, + "loss": 1.2348, + "step": 2190 + }, + { + "epoch": 0.051100997938817574, + "grad_norm": 1.764140009880066, + "learning_rate": 1.1355273386887795e-05, + "loss": 1.5298, + "step": 2191 + }, + { + "epoch": 0.05112432107799549, + "grad_norm": 1.5853471755981445, + "learning_rate": 1.136045607670381e-05, + "loss": 1.1606, + "step": 2192 + }, + { + "epoch": 0.05114764421717341, + "grad_norm": 2.409334659576416, + "learning_rate": 1.1365638766519824e-05, + "loss": 1.7928, + "step": 2193 + }, + { + "epoch": 0.05117096735635133, + "grad_norm": 1.611031174659729, + "learning_rate": 1.137082145633584e-05, + "loss": 1.3917, + "step": 2194 + }, + { + "epoch": 0.051194290495529246, + "grad_norm": 1.5654594898223877, + "learning_rate": 1.1376004146151854e-05, + "loss": 1.5245, + "step": 2195 + }, + { + "epoch": 0.051217613634707164, + "grad_norm": 1.8865010738372803, + "learning_rate": 1.1381186835967868e-05, + "loss": 1.7595, + "step": 2196 + }, + { + "epoch": 0.05124093677388508, + "grad_norm": 1.7687900066375732, + "learning_rate": 1.1386369525783883e-05, + "loss": 1.7168, + "step": 2197 + }, + { + "epoch": 0.051264259913063, + "grad_norm": 2.0165562629699707, + "learning_rate": 1.1391552215599897e-05, + "loss": 1.2842, + "step": 2198 + }, + { + "epoch": 0.05128758305224092, + "grad_norm": 1.9831840991973877, + "learning_rate": 1.1396734905415911e-05, + "loss": 1.7535, + "step": 2199 + }, + { + "epoch": 0.051310906191418836, + "grad_norm": 1.753906488418579, + "learning_rate": 1.1401917595231926e-05, + "loss": 1.439, + "step": 2200 + }, + { + "epoch": 0.05133422933059675, + "grad_norm": 1.7206554412841797, + "learning_rate": 1.140710028504794e-05, + "loss": 1.5502, + "step": 2201 + }, + { + "epoch": 0.05135755246977467, + "grad_norm": 1.8260259628295898, + "learning_rate": 1.1412282974863954e-05, + "loss": 1.6683, + "step": 2202 + }, + { + "epoch": 0.05138087560895259, + "grad_norm": 1.3575698137283325, + "learning_rate": 1.141746566467997e-05, + "loss": 1.3584, + "step": 2203 + }, + { + "epoch": 0.05140419874813051, + "grad_norm": 1.5947502851486206, + "learning_rate": 1.1422648354495985e-05, + "loss": 1.2867, + "step": 2204 + }, + { + "epoch": 0.051427521887308425, + "grad_norm": 2.2568745613098145, + "learning_rate": 1.1427831044311999e-05, + "loss": 1.5639, + "step": 2205 + }, + { + "epoch": 0.05145084502648634, + "grad_norm": 2.484616756439209, + "learning_rate": 1.1433013734128013e-05, + "loss": 1.4994, + "step": 2206 + }, + { + "epoch": 0.05147416816566426, + "grad_norm": 2.137057065963745, + "learning_rate": 1.1438196423944028e-05, + "loss": 1.907, + "step": 2207 + }, + { + "epoch": 0.05149749130484218, + "grad_norm": 1.7077836990356445, + "learning_rate": 1.1443379113760042e-05, + "loss": 1.5018, + "step": 2208 + }, + { + "epoch": 0.05152081444402009, + "grad_norm": 1.6885474920272827, + "learning_rate": 1.1448561803576057e-05, + "loss": 1.7111, + "step": 2209 + }, + { + "epoch": 0.05154413758319801, + "grad_norm": 1.669134497642517, + "learning_rate": 1.1453744493392071e-05, + "loss": 1.5672, + "step": 2210 + }, + { + "epoch": 0.051567460722375925, + "grad_norm": 1.5086790323257446, + "learning_rate": 1.1458927183208087e-05, + "loss": 1.522, + "step": 2211 + }, + { + "epoch": 0.05159078386155384, + "grad_norm": 1.5217182636260986, + "learning_rate": 1.1464109873024101e-05, + "loss": 1.3006, + "step": 2212 + }, + { + "epoch": 0.05161410700073176, + "grad_norm": 1.4646259546279907, + "learning_rate": 1.1469292562840116e-05, + "loss": 1.2538, + "step": 2213 + }, + { + "epoch": 0.05163743013990968, + "grad_norm": 2.1693215370178223, + "learning_rate": 1.147447525265613e-05, + "loss": 1.4208, + "step": 2214 + }, + { + "epoch": 0.0516607532790876, + "grad_norm": 1.264846682548523, + "learning_rate": 1.1479657942472144e-05, + "loss": 0.9305, + "step": 2215 + }, + { + "epoch": 0.051684076418265515, + "grad_norm": 1.9992789030075073, + "learning_rate": 1.1484840632288159e-05, + "loss": 1.3709, + "step": 2216 + }, + { + "epoch": 0.05170739955744343, + "grad_norm": 1.6985483169555664, + "learning_rate": 1.1490023322104173e-05, + "loss": 1.7966, + "step": 2217 + }, + { + "epoch": 0.05173072269662135, + "grad_norm": 1.4030596017837524, + "learning_rate": 1.1495206011920187e-05, + "loss": 1.3329, + "step": 2218 + }, + { + "epoch": 0.05175404583579927, + "grad_norm": 1.765177607536316, + "learning_rate": 1.1500388701736203e-05, + "loss": 1.3921, + "step": 2219 + }, + { + "epoch": 0.051777368974977186, + "grad_norm": 2.0211474895477295, + "learning_rate": 1.1505571391552218e-05, + "loss": 1.9322, + "step": 2220 + }, + { + "epoch": 0.051800692114155104, + "grad_norm": 2.1338424682617188, + "learning_rate": 1.1510754081368232e-05, + "loss": 1.9375, + "step": 2221 + }, + { + "epoch": 0.05182401525333302, + "grad_norm": 2.1635642051696777, + "learning_rate": 1.1515936771184246e-05, + "loss": 1.4048, + "step": 2222 + }, + { + "epoch": 0.05184733839251094, + "grad_norm": 1.5882627964019775, + "learning_rate": 1.152111946100026e-05, + "loss": 1.4047, + "step": 2223 + }, + { + "epoch": 0.05187066153168886, + "grad_norm": 1.6761504411697388, + "learning_rate": 1.1526302150816275e-05, + "loss": 1.506, + "step": 2224 + }, + { + "epoch": 0.051893984670866776, + "grad_norm": 2.401423454284668, + "learning_rate": 1.153148484063229e-05, + "loss": 1.132, + "step": 2225 + }, + { + "epoch": 0.051917307810044694, + "grad_norm": 2.2021756172180176, + "learning_rate": 1.1536667530448304e-05, + "loss": 1.8485, + "step": 2226 + }, + { + "epoch": 0.05194063094922261, + "grad_norm": 1.436631441116333, + "learning_rate": 1.154185022026432e-05, + "loss": 1.3811, + "step": 2227 + }, + { + "epoch": 0.05196395408840053, + "grad_norm": 1.6710056066513062, + "learning_rate": 1.1547032910080334e-05, + "loss": 1.5939, + "step": 2228 + }, + { + "epoch": 0.05198727722757845, + "grad_norm": 1.295320749282837, + "learning_rate": 1.1552215599896349e-05, + "loss": 1.1543, + "step": 2229 + }, + { + "epoch": 0.052010600366756365, + "grad_norm": 1.9142168760299683, + "learning_rate": 1.1557398289712363e-05, + "loss": 1.3995, + "step": 2230 + }, + { + "epoch": 0.05203392350593428, + "grad_norm": 1.6744799613952637, + "learning_rate": 1.1562580979528377e-05, + "loss": 1.547, + "step": 2231 + }, + { + "epoch": 0.0520572466451122, + "grad_norm": 2.2781121730804443, + "learning_rate": 1.1567763669344392e-05, + "loss": 1.5105, + "step": 2232 + }, + { + "epoch": 0.05208056978429012, + "grad_norm": 1.9038276672363281, + "learning_rate": 1.1572946359160404e-05, + "loss": 1.265, + "step": 2233 + }, + { + "epoch": 0.05210389292346804, + "grad_norm": 1.9407936334609985, + "learning_rate": 1.1578129048976419e-05, + "loss": 1.4646, + "step": 2234 + }, + { + "epoch": 0.052127216062645955, + "grad_norm": 2.0184576511383057, + "learning_rate": 1.1583311738792433e-05, + "loss": 1.3433, + "step": 2235 + }, + { + "epoch": 0.05215053920182387, + "grad_norm": 2.160403251647949, + "learning_rate": 1.1588494428608447e-05, + "loss": 1.3387, + "step": 2236 + }, + { + "epoch": 0.052173862341001784, + "grad_norm": 1.5148425102233887, + "learning_rate": 1.1593677118424462e-05, + "loss": 1.2889, + "step": 2237 + }, + { + "epoch": 0.0521971854801797, + "grad_norm": 2.4264883995056152, + "learning_rate": 1.1598859808240478e-05, + "loss": 1.3715, + "step": 2238 + }, + { + "epoch": 0.05222050861935762, + "grad_norm": 1.9291166067123413, + "learning_rate": 1.1604042498056492e-05, + "loss": 1.9834, + "step": 2239 + }, + { + "epoch": 0.05224383175853554, + "grad_norm": 1.8168792724609375, + "learning_rate": 1.1609225187872506e-05, + "loss": 1.4135, + "step": 2240 + }, + { + "epoch": 0.052267154897713455, + "grad_norm": 1.6432805061340332, + "learning_rate": 1.161440787768852e-05, + "loss": 1.5285, + "step": 2241 + }, + { + "epoch": 0.05229047803689137, + "grad_norm": 1.6379437446594238, + "learning_rate": 1.1619590567504535e-05, + "loss": 1.3672, + "step": 2242 + }, + { + "epoch": 0.05231380117606929, + "grad_norm": 1.8223236799240112, + "learning_rate": 1.162477325732055e-05, + "loss": 1.6033, + "step": 2243 + }, + { + "epoch": 0.05233712431524721, + "grad_norm": 1.4181009531021118, + "learning_rate": 1.1629955947136564e-05, + "loss": 1.6317, + "step": 2244 + }, + { + "epoch": 0.05236044745442513, + "grad_norm": 2.042304039001465, + "learning_rate": 1.1635138636952578e-05, + "loss": 1.4046, + "step": 2245 + }, + { + "epoch": 0.052383770593603045, + "grad_norm": 1.937261700630188, + "learning_rate": 1.1640321326768592e-05, + "loss": 1.086, + "step": 2246 + }, + { + "epoch": 0.05240709373278096, + "grad_norm": 1.686892032623291, + "learning_rate": 1.1645504016584608e-05, + "loss": 1.6014, + "step": 2247 + }, + { + "epoch": 0.05243041687195888, + "grad_norm": 2.37595272064209, + "learning_rate": 1.1650686706400623e-05, + "loss": 1.464, + "step": 2248 + }, + { + "epoch": 0.0524537400111368, + "grad_norm": 1.8354530334472656, + "learning_rate": 1.1655869396216637e-05, + "loss": 1.3907, + "step": 2249 + }, + { + "epoch": 0.052477063150314716, + "grad_norm": 1.997286319732666, + "learning_rate": 1.1661052086032652e-05, + "loss": 1.1932, + "step": 2250 + }, + { + "epoch": 0.052500386289492634, + "grad_norm": 1.6411911249160767, + "learning_rate": 1.1666234775848666e-05, + "loss": 1.5849, + "step": 2251 + }, + { + "epoch": 0.05252370942867055, + "grad_norm": 1.7450475692749023, + "learning_rate": 1.167141746566468e-05, + "loss": 1.5098, + "step": 2252 + }, + { + "epoch": 0.05254703256784847, + "grad_norm": 2.1054482460021973, + "learning_rate": 1.1676600155480695e-05, + "loss": 1.7326, + "step": 2253 + }, + { + "epoch": 0.05257035570702639, + "grad_norm": 2.1427578926086426, + "learning_rate": 1.1681782845296709e-05, + "loss": 1.2461, + "step": 2254 + }, + { + "epoch": 0.052593678846204306, + "grad_norm": 1.820042371749878, + "learning_rate": 1.1686965535112725e-05, + "loss": 1.8495, + "step": 2255 + }, + { + "epoch": 0.052617001985382224, + "grad_norm": 1.908542275428772, + "learning_rate": 1.169214822492874e-05, + "loss": 1.2177, + "step": 2256 + }, + { + "epoch": 0.05264032512456014, + "grad_norm": 2.222275495529175, + "learning_rate": 1.1697330914744754e-05, + "loss": 1.8614, + "step": 2257 + }, + { + "epoch": 0.05266364826373806, + "grad_norm": 1.8124415874481201, + "learning_rate": 1.1702513604560768e-05, + "loss": 1.162, + "step": 2258 + }, + { + "epoch": 0.05268697140291598, + "grad_norm": 1.7822272777557373, + "learning_rate": 1.1707696294376782e-05, + "loss": 1.6498, + "step": 2259 + }, + { + "epoch": 0.052710294542093895, + "grad_norm": 2.066291570663452, + "learning_rate": 1.1712878984192797e-05, + "loss": 1.5203, + "step": 2260 + }, + { + "epoch": 0.05273361768127181, + "grad_norm": 1.8691362142562866, + "learning_rate": 1.1718061674008811e-05, + "loss": 1.6722, + "step": 2261 + }, + { + "epoch": 0.05275694082044973, + "grad_norm": 1.8264986276626587, + "learning_rate": 1.1723244363824825e-05, + "loss": 1.787, + "step": 2262 + }, + { + "epoch": 0.05278026395962765, + "grad_norm": 1.7741049528121948, + "learning_rate": 1.1728427053640841e-05, + "loss": 1.5503, + "step": 2263 + }, + { + "epoch": 0.05280358709880557, + "grad_norm": 1.8473697900772095, + "learning_rate": 1.1733609743456856e-05, + "loss": 1.337, + "step": 2264 + }, + { + "epoch": 0.052826910237983485, + "grad_norm": 2.0709192752838135, + "learning_rate": 1.173879243327287e-05, + "loss": 1.6331, + "step": 2265 + }, + { + "epoch": 0.052850233377161396, + "grad_norm": 2.1398303508758545, + "learning_rate": 1.1743975123088884e-05, + "loss": 0.9651, + "step": 2266 + }, + { + "epoch": 0.052873556516339314, + "grad_norm": 1.9882025718688965, + "learning_rate": 1.1749157812904899e-05, + "loss": 1.8047, + "step": 2267 + }, + { + "epoch": 0.05289687965551723, + "grad_norm": 1.8728996515274048, + "learning_rate": 1.1754340502720913e-05, + "loss": 1.4712, + "step": 2268 + }, + { + "epoch": 0.05292020279469515, + "grad_norm": 1.6529200077056885, + "learning_rate": 1.1759523192536927e-05, + "loss": 1.5364, + "step": 2269 + }, + { + "epoch": 0.05294352593387307, + "grad_norm": 2.0231099128723145, + "learning_rate": 1.1764705882352942e-05, + "loss": 1.46, + "step": 2270 + }, + { + "epoch": 0.052966849073050985, + "grad_norm": 1.8899494409561157, + "learning_rate": 1.1769888572168958e-05, + "loss": 1.6656, + "step": 2271 + }, + { + "epoch": 0.0529901722122289, + "grad_norm": 2.0990450382232666, + "learning_rate": 1.1775071261984972e-05, + "loss": 1.4712, + "step": 2272 + }, + { + "epoch": 0.05301349535140682, + "grad_norm": 2.2885234355926514, + "learning_rate": 1.1780253951800987e-05, + "loss": 1.3969, + "step": 2273 + }, + { + "epoch": 0.05303681849058474, + "grad_norm": 2.202568531036377, + "learning_rate": 1.1785436641617001e-05, + "loss": 1.0861, + "step": 2274 + }, + { + "epoch": 0.05306014162976266, + "grad_norm": 3.381636381149292, + "learning_rate": 1.1790619331433015e-05, + "loss": 1.2669, + "step": 2275 + }, + { + "epoch": 0.053083464768940575, + "grad_norm": 1.6885939836502075, + "learning_rate": 1.179580202124903e-05, + "loss": 1.6913, + "step": 2276 + }, + { + "epoch": 0.05310678790811849, + "grad_norm": 1.9537405967712402, + "learning_rate": 1.1800984711065044e-05, + "loss": 1.0473, + "step": 2277 + }, + { + "epoch": 0.05313011104729641, + "grad_norm": 1.9460904598236084, + "learning_rate": 1.1806167400881058e-05, + "loss": 1.131, + "step": 2278 + }, + { + "epoch": 0.05315343418647433, + "grad_norm": 2.025679111480713, + "learning_rate": 1.1811350090697073e-05, + "loss": 1.1503, + "step": 2279 + }, + { + "epoch": 0.053176757325652246, + "grad_norm": 1.8040107488632202, + "learning_rate": 1.1816532780513089e-05, + "loss": 1.6038, + "step": 2280 + }, + { + "epoch": 0.053200080464830164, + "grad_norm": 2.133852958679199, + "learning_rate": 1.1821715470329103e-05, + "loss": 1.6102, + "step": 2281 + }, + { + "epoch": 0.05322340360400808, + "grad_norm": 1.8510808944702148, + "learning_rate": 1.1826898160145117e-05, + "loss": 1.5358, + "step": 2282 + }, + { + "epoch": 0.053246726743186, + "grad_norm": 1.9468835592269897, + "learning_rate": 1.1832080849961132e-05, + "loss": 1.4974, + "step": 2283 + }, + { + "epoch": 0.05327004988236392, + "grad_norm": 1.580652117729187, + "learning_rate": 1.1837263539777146e-05, + "loss": 1.6189, + "step": 2284 + }, + { + "epoch": 0.053293373021541836, + "grad_norm": 1.6107277870178223, + "learning_rate": 1.184244622959316e-05, + "loss": 1.3356, + "step": 2285 + }, + { + "epoch": 0.053316696160719754, + "grad_norm": 1.7563567161560059, + "learning_rate": 1.1847628919409175e-05, + "loss": 1.4425, + "step": 2286 + }, + { + "epoch": 0.05334001929989767, + "grad_norm": 1.6008427143096924, + "learning_rate": 1.1852811609225189e-05, + "loss": 1.5226, + "step": 2287 + }, + { + "epoch": 0.05336334243907559, + "grad_norm": 1.8432244062423706, + "learning_rate": 1.1857994299041202e-05, + "loss": 1.4107, + "step": 2288 + }, + { + "epoch": 0.05338666557825351, + "grad_norm": 1.6150907278060913, + "learning_rate": 1.1863176988857216e-05, + "loss": 1.4161, + "step": 2289 + }, + { + "epoch": 0.053409988717431425, + "grad_norm": 1.8245857954025269, + "learning_rate": 1.1868359678673232e-05, + "loss": 1.417, + "step": 2290 + }, + { + "epoch": 0.05343331185660934, + "grad_norm": 1.7012311220169067, + "learning_rate": 1.1873542368489247e-05, + "loss": 1.2383, + "step": 2291 + }, + { + "epoch": 0.05345663499578726, + "grad_norm": 1.9361399412155151, + "learning_rate": 1.1878725058305261e-05, + "loss": 1.5359, + "step": 2292 + }, + { + "epoch": 0.05347995813496518, + "grad_norm": 1.9297102689743042, + "learning_rate": 1.1883907748121275e-05, + "loss": 1.3487, + "step": 2293 + }, + { + "epoch": 0.05350328127414309, + "grad_norm": 2.098510503768921, + "learning_rate": 1.188909043793729e-05, + "loss": 0.9242, + "step": 2294 + }, + { + "epoch": 0.05352660441332101, + "grad_norm": 1.745192050933838, + "learning_rate": 1.1894273127753304e-05, + "loss": 1.4062, + "step": 2295 + }, + { + "epoch": 0.053549927552498926, + "grad_norm": 2.441746473312378, + "learning_rate": 1.1899455817569318e-05, + "loss": 1.7816, + "step": 2296 + }, + { + "epoch": 0.053573250691676844, + "grad_norm": 2.045830488204956, + "learning_rate": 1.1904638507385333e-05, + "loss": 1.7797, + "step": 2297 + }, + { + "epoch": 0.05359657383085476, + "grad_norm": 1.6408205032348633, + "learning_rate": 1.1909821197201347e-05, + "loss": 1.2563, + "step": 2298 + }, + { + "epoch": 0.05361989697003268, + "grad_norm": 2.0717265605926514, + "learning_rate": 1.1915003887017363e-05, + "loss": 1.5343, + "step": 2299 + }, + { + "epoch": 0.0536432201092106, + "grad_norm": 1.457613468170166, + "learning_rate": 1.1920186576833377e-05, + "loss": 1.7042, + "step": 2300 + }, + { + "epoch": 0.053666543248388515, + "grad_norm": 2.1533775329589844, + "learning_rate": 1.1925369266649392e-05, + "loss": 1.3517, + "step": 2301 + }, + { + "epoch": 0.05368986638756643, + "grad_norm": 1.5491520166397095, + "learning_rate": 1.1930551956465406e-05, + "loss": 1.4162, + "step": 2302 + }, + { + "epoch": 0.05371318952674435, + "grad_norm": 1.7081563472747803, + "learning_rate": 1.193573464628142e-05, + "loss": 1.6408, + "step": 2303 + }, + { + "epoch": 0.05373651266592227, + "grad_norm": 1.8476041555404663, + "learning_rate": 1.1940917336097435e-05, + "loss": 1.2826, + "step": 2304 + }, + { + "epoch": 0.05375983580510019, + "grad_norm": 2.2153637409210205, + "learning_rate": 1.1946100025913449e-05, + "loss": 1.723, + "step": 2305 + }, + { + "epoch": 0.053783158944278105, + "grad_norm": 6.036977767944336, + "learning_rate": 1.1951282715729463e-05, + "loss": 1.4402, + "step": 2306 + }, + { + "epoch": 0.05380648208345602, + "grad_norm": 1.876021385192871, + "learning_rate": 1.195646540554548e-05, + "loss": 1.1415, + "step": 2307 + }, + { + "epoch": 0.05382980522263394, + "grad_norm": 2.157916307449341, + "learning_rate": 1.1961648095361494e-05, + "loss": 1.6102, + "step": 2308 + }, + { + "epoch": 0.05385312836181186, + "grad_norm": 1.8045432567596436, + "learning_rate": 1.1966830785177508e-05, + "loss": 1.5312, + "step": 2309 + }, + { + "epoch": 0.053876451500989776, + "grad_norm": 2.102548837661743, + "learning_rate": 1.1972013474993522e-05, + "loss": 1.3526, + "step": 2310 + }, + { + "epoch": 0.053899774640167694, + "grad_norm": 1.9561266899108887, + "learning_rate": 1.1977196164809537e-05, + "loss": 1.5143, + "step": 2311 + }, + { + "epoch": 0.05392309777934561, + "grad_norm": 1.5144243240356445, + "learning_rate": 1.1982378854625551e-05, + "loss": 1.5124, + "step": 2312 + }, + { + "epoch": 0.05394642091852353, + "grad_norm": 1.7776747941970825, + "learning_rate": 1.1987561544441566e-05, + "loss": 1.2017, + "step": 2313 + }, + { + "epoch": 0.05396974405770145, + "grad_norm": 1.5333037376403809, + "learning_rate": 1.199274423425758e-05, + "loss": 1.2877, + "step": 2314 + }, + { + "epoch": 0.053993067196879366, + "grad_norm": 1.7108148336410522, + "learning_rate": 1.1997926924073596e-05, + "loss": 1.5089, + "step": 2315 + }, + { + "epoch": 0.054016390336057284, + "grad_norm": 1.963119626045227, + "learning_rate": 1.200310961388961e-05, + "loss": 1.1509, + "step": 2316 + }, + { + "epoch": 0.0540397134752352, + "grad_norm": 1.6362985372543335, + "learning_rate": 1.2008292303705625e-05, + "loss": 1.4243, + "step": 2317 + }, + { + "epoch": 0.05406303661441312, + "grad_norm": 1.8509998321533203, + "learning_rate": 1.2013474993521639e-05, + "loss": 1.2237, + "step": 2318 + }, + { + "epoch": 0.05408635975359104, + "grad_norm": 1.904492735862732, + "learning_rate": 1.2018657683337653e-05, + "loss": 1.1659, + "step": 2319 + }, + { + "epoch": 0.054109682892768955, + "grad_norm": 1.8940964937210083, + "learning_rate": 1.2023840373153668e-05, + "loss": 1.6011, + "step": 2320 + }, + { + "epoch": 0.05413300603194687, + "grad_norm": 1.7062005996704102, + "learning_rate": 1.2029023062969682e-05, + "loss": 1.3888, + "step": 2321 + }, + { + "epoch": 0.05415632917112479, + "grad_norm": 2.0538320541381836, + "learning_rate": 1.2034205752785696e-05, + "loss": 1.4461, + "step": 2322 + }, + { + "epoch": 0.0541796523103027, + "grad_norm": 2.1501717567443848, + "learning_rate": 1.2039388442601712e-05, + "loss": 1.3813, + "step": 2323 + }, + { + "epoch": 0.05420297544948062, + "grad_norm": 1.9368064403533936, + "learning_rate": 1.2044571132417727e-05, + "loss": 1.466, + "step": 2324 + }, + { + "epoch": 0.05422629858865854, + "grad_norm": 1.8357282876968384, + "learning_rate": 1.2049753822233741e-05, + "loss": 1.3254, + "step": 2325 + }, + { + "epoch": 0.054249621727836456, + "grad_norm": 1.5460752248764038, + "learning_rate": 1.2054936512049755e-05, + "loss": 1.2867, + "step": 2326 + }, + { + "epoch": 0.054272944867014374, + "grad_norm": 1.5794566869735718, + "learning_rate": 1.206011920186577e-05, + "loss": 1.4563, + "step": 2327 + }, + { + "epoch": 0.05429626800619229, + "grad_norm": 1.7742433547973633, + "learning_rate": 1.2065301891681784e-05, + "loss": 1.9052, + "step": 2328 + }, + { + "epoch": 0.05431959114537021, + "grad_norm": 1.3682368993759155, + "learning_rate": 1.2070484581497798e-05, + "loss": 1.123, + "step": 2329 + }, + { + "epoch": 0.05434291428454813, + "grad_norm": 1.7420629262924194, + "learning_rate": 1.2075667271313813e-05, + "loss": 1.3116, + "step": 2330 + }, + { + "epoch": 0.054366237423726045, + "grad_norm": 1.6572750806808472, + "learning_rate": 1.2080849961129827e-05, + "loss": 1.2575, + "step": 2331 + }, + { + "epoch": 0.05438956056290396, + "grad_norm": 2.035689353942871, + "learning_rate": 1.2086032650945843e-05, + "loss": 1.5586, + "step": 2332 + }, + { + "epoch": 0.05441288370208188, + "grad_norm": 1.4352741241455078, + "learning_rate": 1.2091215340761858e-05, + "loss": 1.2754, + "step": 2333 + }, + { + "epoch": 0.0544362068412598, + "grad_norm": 1.4306999444961548, + "learning_rate": 1.2096398030577872e-05, + "loss": 1.2013, + "step": 2334 + }, + { + "epoch": 0.05445952998043772, + "grad_norm": 1.8308879137039185, + "learning_rate": 1.2101580720393886e-05, + "loss": 1.4611, + "step": 2335 + }, + { + "epoch": 0.054482853119615635, + "grad_norm": 1.5827016830444336, + "learning_rate": 1.21067634102099e-05, + "loss": 1.6502, + "step": 2336 + }, + { + "epoch": 0.05450617625879355, + "grad_norm": 2.0508148670196533, + "learning_rate": 1.2111946100025915e-05, + "loss": 1.4868, + "step": 2337 + }, + { + "epoch": 0.05452949939797147, + "grad_norm": 1.8490535020828247, + "learning_rate": 1.211712878984193e-05, + "loss": 1.4469, + "step": 2338 + }, + { + "epoch": 0.05455282253714939, + "grad_norm": 2.151078701019287, + "learning_rate": 1.2122311479657944e-05, + "loss": 1.5714, + "step": 2339 + }, + { + "epoch": 0.054576145676327306, + "grad_norm": 1.6393033266067505, + "learning_rate": 1.212749416947396e-05, + "loss": 1.5442, + "step": 2340 + }, + { + "epoch": 0.054599468815505224, + "grad_norm": 1.6875606775283813, + "learning_rate": 1.2132676859289974e-05, + "loss": 1.6064, + "step": 2341 + }, + { + "epoch": 0.05462279195468314, + "grad_norm": 1.5508856773376465, + "learning_rate": 1.2137859549105988e-05, + "loss": 1.4948, + "step": 2342 + }, + { + "epoch": 0.05464611509386106, + "grad_norm": 1.6648694276809692, + "learning_rate": 1.2143042238922001e-05, + "loss": 1.4836, + "step": 2343 + }, + { + "epoch": 0.05466943823303898, + "grad_norm": 1.7704250812530518, + "learning_rate": 1.2148224928738015e-05, + "loss": 1.5293, + "step": 2344 + }, + { + "epoch": 0.054692761372216896, + "grad_norm": 1.8191028833389282, + "learning_rate": 1.215340761855403e-05, + "loss": 1.6612, + "step": 2345 + }, + { + "epoch": 0.054716084511394814, + "grad_norm": 2.0659608840942383, + "learning_rate": 1.2158590308370044e-05, + "loss": 1.3825, + "step": 2346 + }, + { + "epoch": 0.05473940765057273, + "grad_norm": 2.3801848888397217, + "learning_rate": 1.2163772998186058e-05, + "loss": 1.6521, + "step": 2347 + }, + { + "epoch": 0.05476273078975065, + "grad_norm": 1.5148749351501465, + "learning_rate": 1.2168955688002073e-05, + "loss": 1.3766, + "step": 2348 + }, + { + "epoch": 0.05478605392892857, + "grad_norm": 2.612302780151367, + "learning_rate": 1.2174138377818087e-05, + "loss": 1.5648, + "step": 2349 + }, + { + "epoch": 0.054809377068106485, + "grad_norm": 1.318306803703308, + "learning_rate": 1.2179321067634101e-05, + "loss": 1.1209, + "step": 2350 + }, + { + "epoch": 0.054832700207284396, + "grad_norm": 1.9958271980285645, + "learning_rate": 1.2184503757450117e-05, + "loss": 1.4942, + "step": 2351 + }, + { + "epoch": 0.054856023346462314, + "grad_norm": 3.188138008117676, + "learning_rate": 1.2189686447266132e-05, + "loss": 1.4955, + "step": 2352 + }, + { + "epoch": 0.05487934648564023, + "grad_norm": 1.9520068168640137, + "learning_rate": 1.2194869137082146e-05, + "loss": 1.4372, + "step": 2353 + }, + { + "epoch": 0.05490266962481815, + "grad_norm": 1.7835674285888672, + "learning_rate": 1.220005182689816e-05, + "loss": 1.6209, + "step": 2354 + }, + { + "epoch": 0.05492599276399607, + "grad_norm": 1.5796537399291992, + "learning_rate": 1.2205234516714175e-05, + "loss": 1.5106, + "step": 2355 + }, + { + "epoch": 0.054949315903173986, + "grad_norm": 1.6579697132110596, + "learning_rate": 1.221041720653019e-05, + "loss": 1.6087, + "step": 2356 + }, + { + "epoch": 0.0549726390423519, + "grad_norm": 1.838502287864685, + "learning_rate": 1.2215599896346204e-05, + "loss": 1.5969, + "step": 2357 + }, + { + "epoch": 0.05499596218152982, + "grad_norm": 2.256510019302368, + "learning_rate": 1.2220782586162218e-05, + "loss": 1.3455, + "step": 2358 + }, + { + "epoch": 0.05501928532070774, + "grad_norm": 1.9825702905654907, + "learning_rate": 1.2225965275978234e-05, + "loss": 1.5903, + "step": 2359 + }, + { + "epoch": 0.05504260845988566, + "grad_norm": 1.8769842386245728, + "learning_rate": 1.2231147965794248e-05, + "loss": 1.3899, + "step": 2360 + }, + { + "epoch": 0.055065931599063575, + "grad_norm": 1.608070969581604, + "learning_rate": 1.2236330655610263e-05, + "loss": 1.1432, + "step": 2361 + }, + { + "epoch": 0.05508925473824149, + "grad_norm": 1.7243940830230713, + "learning_rate": 1.2241513345426277e-05, + "loss": 1.425, + "step": 2362 + }, + { + "epoch": 0.05511257787741941, + "grad_norm": 1.8134387731552124, + "learning_rate": 1.2246696035242291e-05, + "loss": 1.4092, + "step": 2363 + }, + { + "epoch": 0.05513590101659733, + "grad_norm": 1.6564340591430664, + "learning_rate": 1.2251878725058306e-05, + "loss": 1.4602, + "step": 2364 + }, + { + "epoch": 0.05515922415577525, + "grad_norm": 2.2256054878234863, + "learning_rate": 1.225706141487432e-05, + "loss": 1.6256, + "step": 2365 + }, + { + "epoch": 0.055182547294953165, + "grad_norm": 2.0529842376708984, + "learning_rate": 1.2262244104690334e-05, + "loss": 1.4628, + "step": 2366 + }, + { + "epoch": 0.05520587043413108, + "grad_norm": 1.3641233444213867, + "learning_rate": 1.226742679450635e-05, + "loss": 1.3189, + "step": 2367 + }, + { + "epoch": 0.055229193573309, + "grad_norm": 2.196507692337036, + "learning_rate": 1.2272609484322365e-05, + "loss": 1.7297, + "step": 2368 + }, + { + "epoch": 0.05525251671248692, + "grad_norm": 1.719078540802002, + "learning_rate": 1.2277792174138379e-05, + "loss": 1.6974, + "step": 2369 + }, + { + "epoch": 0.055275839851664836, + "grad_norm": 1.8548617362976074, + "learning_rate": 1.2282974863954393e-05, + "loss": 1.7735, + "step": 2370 + }, + { + "epoch": 0.055299162990842754, + "grad_norm": 2.1644322872161865, + "learning_rate": 1.2288157553770408e-05, + "loss": 1.2362, + "step": 2371 + }, + { + "epoch": 0.05532248613002067, + "grad_norm": 1.642645239830017, + "learning_rate": 1.2293340243586422e-05, + "loss": 1.3763, + "step": 2372 + }, + { + "epoch": 0.05534580926919859, + "grad_norm": 1.4598188400268555, + "learning_rate": 1.2298522933402437e-05, + "loss": 1.019, + "step": 2373 + }, + { + "epoch": 0.05536913240837651, + "grad_norm": 1.7192937135696411, + "learning_rate": 1.2303705623218451e-05, + "loss": 1.1431, + "step": 2374 + }, + { + "epoch": 0.055392455547554426, + "grad_norm": 1.5887870788574219, + "learning_rate": 1.2308888313034467e-05, + "loss": 1.3876, + "step": 2375 + }, + { + "epoch": 0.055415778686732343, + "grad_norm": 3.285121440887451, + "learning_rate": 1.2314071002850481e-05, + "loss": 1.6588, + "step": 2376 + }, + { + "epoch": 0.05543910182591026, + "grad_norm": 1.7930113077163696, + "learning_rate": 1.2319253692666496e-05, + "loss": 1.3764, + "step": 2377 + }, + { + "epoch": 0.05546242496508818, + "grad_norm": 1.4698500633239746, + "learning_rate": 1.232443638248251e-05, + "loss": 1.1216, + "step": 2378 + }, + { + "epoch": 0.0554857481042661, + "grad_norm": 2.0502030849456787, + "learning_rate": 1.2329619072298524e-05, + "loss": 1.3501, + "step": 2379 + }, + { + "epoch": 0.05550907124344401, + "grad_norm": 2.3289663791656494, + "learning_rate": 1.2334801762114539e-05, + "loss": 1.7261, + "step": 2380 + }, + { + "epoch": 0.055532394382621926, + "grad_norm": 1.5533864498138428, + "learning_rate": 1.2339984451930553e-05, + "loss": 1.1996, + "step": 2381 + }, + { + "epoch": 0.055555717521799844, + "grad_norm": 1.894768476486206, + "learning_rate": 1.2345167141746567e-05, + "loss": 1.1702, + "step": 2382 + }, + { + "epoch": 0.05557904066097776, + "grad_norm": 2.2866978645324707, + "learning_rate": 1.2350349831562582e-05, + "loss": 1.692, + "step": 2383 + }, + { + "epoch": 0.05560236380015568, + "grad_norm": 1.9464361667633057, + "learning_rate": 1.2355532521378598e-05, + "loss": 1.5588, + "step": 2384 + }, + { + "epoch": 0.0556256869393336, + "grad_norm": 1.5696362257003784, + "learning_rate": 1.2360715211194612e-05, + "loss": 1.5714, + "step": 2385 + }, + { + "epoch": 0.055649010078511515, + "grad_norm": 1.9288362264633179, + "learning_rate": 1.2365897901010626e-05, + "loss": 1.4401, + "step": 2386 + }, + { + "epoch": 0.05567233321768943, + "grad_norm": 1.8174824714660645, + "learning_rate": 1.237108059082664e-05, + "loss": 1.5862, + "step": 2387 + }, + { + "epoch": 0.05569565635686735, + "grad_norm": 1.8247565031051636, + "learning_rate": 1.2376263280642655e-05, + "loss": 1.4851, + "step": 2388 + }, + { + "epoch": 0.05571897949604527, + "grad_norm": 1.764724612236023, + "learning_rate": 1.238144597045867e-05, + "loss": 1.533, + "step": 2389 + }, + { + "epoch": 0.05574230263522319, + "grad_norm": 1.6865999698638916, + "learning_rate": 1.2386628660274684e-05, + "loss": 1.6418, + "step": 2390 + }, + { + "epoch": 0.055765625774401105, + "grad_norm": 1.3049399852752686, + "learning_rate": 1.2391811350090698e-05, + "loss": 1.2936, + "step": 2391 + }, + { + "epoch": 0.05578894891357902, + "grad_norm": 1.7230569124221802, + "learning_rate": 1.2396994039906714e-05, + "loss": 1.3043, + "step": 2392 + }, + { + "epoch": 0.05581227205275694, + "grad_norm": 1.7113614082336426, + "learning_rate": 1.2402176729722729e-05, + "loss": 1.0848, + "step": 2393 + }, + { + "epoch": 0.05583559519193486, + "grad_norm": 1.8730138540267944, + "learning_rate": 1.2407359419538743e-05, + "loss": 1.5867, + "step": 2394 + }, + { + "epoch": 0.05585891833111278, + "grad_norm": 1.5495893955230713, + "learning_rate": 1.2412542109354757e-05, + "loss": 1.1484, + "step": 2395 + }, + { + "epoch": 0.055882241470290694, + "grad_norm": 2.3786067962646484, + "learning_rate": 1.2417724799170772e-05, + "loss": 1.523, + "step": 2396 + }, + { + "epoch": 0.05590556460946861, + "grad_norm": 2.259895086288452, + "learning_rate": 1.2422907488986786e-05, + "loss": 1.7245, + "step": 2397 + }, + { + "epoch": 0.05592888774864653, + "grad_norm": 2.1496381759643555, + "learning_rate": 1.24280901788028e-05, + "loss": 1.2052, + "step": 2398 + }, + { + "epoch": 0.05595221088782445, + "grad_norm": 1.9179364442825317, + "learning_rate": 1.2433272868618813e-05, + "loss": 1.3639, + "step": 2399 + }, + { + "epoch": 0.055975534027002366, + "grad_norm": 1.6580770015716553, + "learning_rate": 1.2438455558434827e-05, + "loss": 1.3285, + "step": 2400 + }, + { + "epoch": 0.055998857166180284, + "grad_norm": 1.7400684356689453, + "learning_rate": 1.2443638248250842e-05, + "loss": 1.3427, + "step": 2401 + }, + { + "epoch": 0.0560221803053582, + "grad_norm": 1.6764858961105347, + "learning_rate": 1.2448820938066856e-05, + "loss": 1.276, + "step": 2402 + }, + { + "epoch": 0.05604550344453612, + "grad_norm": 2.2973010540008545, + "learning_rate": 1.2454003627882872e-05, + "loss": 1.4305, + "step": 2403 + }, + { + "epoch": 0.05606882658371404, + "grad_norm": 2.2047767639160156, + "learning_rate": 1.2459186317698886e-05, + "loss": 1.4465, + "step": 2404 + }, + { + "epoch": 0.056092149722891955, + "grad_norm": 2.275881052017212, + "learning_rate": 1.24643690075149e-05, + "loss": 1.337, + "step": 2405 + }, + { + "epoch": 0.05611547286206987, + "grad_norm": 1.8678287267684937, + "learning_rate": 1.2469551697330915e-05, + "loss": 1.49, + "step": 2406 + }, + { + "epoch": 0.05613879600124779, + "grad_norm": 1.6373817920684814, + "learning_rate": 1.247473438714693e-05, + "loss": 1.6221, + "step": 2407 + }, + { + "epoch": 0.0561621191404257, + "grad_norm": 1.689551591873169, + "learning_rate": 1.2479917076962944e-05, + "loss": 1.6474, + "step": 2408 + }, + { + "epoch": 0.05618544227960362, + "grad_norm": 1.5831496715545654, + "learning_rate": 1.2485099766778958e-05, + "loss": 1.5889, + "step": 2409 + }, + { + "epoch": 0.05620876541878154, + "grad_norm": 1.933913230895996, + "learning_rate": 1.2490282456594972e-05, + "loss": 1.5632, + "step": 2410 + }, + { + "epoch": 0.056232088557959456, + "grad_norm": 1.9560495615005493, + "learning_rate": 1.2495465146410988e-05, + "loss": 1.8688, + "step": 2411 + }, + { + "epoch": 0.056255411697137374, + "grad_norm": 1.767649531364441, + "learning_rate": 1.2500647836227003e-05, + "loss": 1.1946, + "step": 2412 + }, + { + "epoch": 0.05627873483631529, + "grad_norm": 1.344802975654602, + "learning_rate": 1.2505830526043017e-05, + "loss": 1.4424, + "step": 2413 + }, + { + "epoch": 0.05630205797549321, + "grad_norm": 1.9637680053710938, + "learning_rate": 1.2511013215859032e-05, + "loss": 1.3989, + "step": 2414 + }, + { + "epoch": 0.05632538111467113, + "grad_norm": 1.3612053394317627, + "learning_rate": 1.2516195905675046e-05, + "loss": 1.2406, + "step": 2415 + }, + { + "epoch": 0.056348704253849045, + "grad_norm": 1.7085059881210327, + "learning_rate": 1.252137859549106e-05, + "loss": 1.2047, + "step": 2416 + }, + { + "epoch": 0.05637202739302696, + "grad_norm": 2.3130269050598145, + "learning_rate": 1.2526561285307075e-05, + "loss": 1.441, + "step": 2417 + }, + { + "epoch": 0.05639535053220488, + "grad_norm": 1.6563998460769653, + "learning_rate": 1.2531743975123089e-05, + "loss": 1.1357, + "step": 2418 + }, + { + "epoch": 0.0564186736713828, + "grad_norm": 2.186823606491089, + "learning_rate": 1.2536926664939105e-05, + "loss": 1.541, + "step": 2419 + }, + { + "epoch": 0.05644199681056072, + "grad_norm": 1.8775302171707153, + "learning_rate": 1.254210935475512e-05, + "loss": 1.3811, + "step": 2420 + }, + { + "epoch": 0.056465319949738635, + "grad_norm": 1.6848359107971191, + "learning_rate": 1.2547292044571134e-05, + "loss": 1.2492, + "step": 2421 + }, + { + "epoch": 0.05648864308891655, + "grad_norm": 2.209897994995117, + "learning_rate": 1.2552474734387148e-05, + "loss": 1.4139, + "step": 2422 + }, + { + "epoch": 0.05651196622809447, + "grad_norm": 2.023489475250244, + "learning_rate": 1.2557657424203162e-05, + "loss": 1.7047, + "step": 2423 + }, + { + "epoch": 0.05653528936727239, + "grad_norm": 2.25374436378479, + "learning_rate": 1.2562840114019177e-05, + "loss": 1.6429, + "step": 2424 + }, + { + "epoch": 0.056558612506450306, + "grad_norm": 2.11035418510437, + "learning_rate": 1.2568022803835191e-05, + "loss": 1.7968, + "step": 2425 + }, + { + "epoch": 0.056581935645628224, + "grad_norm": 1.6524832248687744, + "learning_rate": 1.2573205493651205e-05, + "loss": 1.3035, + "step": 2426 + }, + { + "epoch": 0.05660525878480614, + "grad_norm": 1.9546128511428833, + "learning_rate": 1.2578388183467221e-05, + "loss": 1.6473, + "step": 2427 + }, + { + "epoch": 0.05662858192398406, + "grad_norm": 1.8195034265518188, + "learning_rate": 1.2583570873283236e-05, + "loss": 1.2349, + "step": 2428 + }, + { + "epoch": 0.05665190506316198, + "grad_norm": 2.0104966163635254, + "learning_rate": 1.258875356309925e-05, + "loss": 1.564, + "step": 2429 + }, + { + "epoch": 0.056675228202339896, + "grad_norm": 1.6539239883422852, + "learning_rate": 1.2593936252915264e-05, + "loss": 1.6515, + "step": 2430 + }, + { + "epoch": 0.056698551341517814, + "grad_norm": 1.92090904712677, + "learning_rate": 1.2599118942731279e-05, + "loss": 1.6177, + "step": 2431 + }, + { + "epoch": 0.05672187448069573, + "grad_norm": 1.899638056755066, + "learning_rate": 1.2604301632547293e-05, + "loss": 1.5904, + "step": 2432 + }, + { + "epoch": 0.05674519761987365, + "grad_norm": 1.7024785280227661, + "learning_rate": 1.2609484322363307e-05, + "loss": 1.7716, + "step": 2433 + }, + { + "epoch": 0.05676852075905157, + "grad_norm": 1.4970227479934692, + "learning_rate": 1.2614667012179322e-05, + "loss": 1.4859, + "step": 2434 + }, + { + "epoch": 0.056791843898229485, + "grad_norm": 1.5742204189300537, + "learning_rate": 1.2619849701995336e-05, + "loss": 1.3328, + "step": 2435 + }, + { + "epoch": 0.0568151670374074, + "grad_norm": 1.95245361328125, + "learning_rate": 1.2625032391811352e-05, + "loss": 1.1161, + "step": 2436 + }, + { + "epoch": 0.056838490176585314, + "grad_norm": 1.610113501548767, + "learning_rate": 1.2630215081627367e-05, + "loss": 1.447, + "step": 2437 + }, + { + "epoch": 0.05686181331576323, + "grad_norm": 1.517272710800171, + "learning_rate": 1.2635397771443381e-05, + "loss": 1.4569, + "step": 2438 + }, + { + "epoch": 0.05688513645494115, + "grad_norm": 1.9070580005645752, + "learning_rate": 1.2640580461259395e-05, + "loss": 1.5182, + "step": 2439 + }, + { + "epoch": 0.05690845959411907, + "grad_norm": 2.516972303390503, + "learning_rate": 1.264576315107541e-05, + "loss": 1.4891, + "step": 2440 + }, + { + "epoch": 0.056931782733296986, + "grad_norm": 1.7125346660614014, + "learning_rate": 1.2650945840891424e-05, + "loss": 1.2783, + "step": 2441 + }, + { + "epoch": 0.056955105872474904, + "grad_norm": 2.2393455505371094, + "learning_rate": 1.2656128530707438e-05, + "loss": 1.6549, + "step": 2442 + }, + { + "epoch": 0.05697842901165282, + "grad_norm": 1.6830679178237915, + "learning_rate": 1.2661311220523453e-05, + "loss": 1.6243, + "step": 2443 + }, + { + "epoch": 0.05700175215083074, + "grad_norm": 1.647987961769104, + "learning_rate": 1.2666493910339469e-05, + "loss": 0.9855, + "step": 2444 + }, + { + "epoch": 0.05702507529000866, + "grad_norm": 2.4067366123199463, + "learning_rate": 1.2671676600155483e-05, + "loss": 1.5937, + "step": 2445 + }, + { + "epoch": 0.057048398429186575, + "grad_norm": 1.5815311670303345, + "learning_rate": 1.2676859289971497e-05, + "loss": 1.4023, + "step": 2446 + }, + { + "epoch": 0.05707172156836449, + "grad_norm": 1.9707512855529785, + "learning_rate": 1.2682041979787512e-05, + "loss": 1.6465, + "step": 2447 + }, + { + "epoch": 0.05709504470754241, + "grad_norm": 1.5195176601409912, + "learning_rate": 1.2687224669603526e-05, + "loss": 1.2986, + "step": 2448 + }, + { + "epoch": 0.05711836784672033, + "grad_norm": 2.393983840942383, + "learning_rate": 1.269240735941954e-05, + "loss": 1.3601, + "step": 2449 + }, + { + "epoch": 0.05714169098589825, + "grad_norm": 1.6375079154968262, + "learning_rate": 1.2697590049235555e-05, + "loss": 1.2357, + "step": 2450 + }, + { + "epoch": 0.057165014125076165, + "grad_norm": 1.8043434619903564, + "learning_rate": 1.2702772739051569e-05, + "loss": 1.2337, + "step": 2451 + }, + { + "epoch": 0.05718833726425408, + "grad_norm": 1.841853141784668, + "learning_rate": 1.2707955428867585e-05, + "loss": 1.4255, + "step": 2452 + }, + { + "epoch": 0.057211660403432, + "grad_norm": 1.9364678859710693, + "learning_rate": 1.27131381186836e-05, + "loss": 1.7609, + "step": 2453 + }, + { + "epoch": 0.05723498354260992, + "grad_norm": 2.492034912109375, + "learning_rate": 1.271832080849961e-05, + "loss": 1.397, + "step": 2454 + }, + { + "epoch": 0.057258306681787836, + "grad_norm": 2.044177532196045, + "learning_rate": 1.2723503498315627e-05, + "loss": 1.1781, + "step": 2455 + }, + { + "epoch": 0.057281629820965754, + "grad_norm": 2.0078577995300293, + "learning_rate": 1.2728686188131641e-05, + "loss": 1.3808, + "step": 2456 + }, + { + "epoch": 0.05730495296014367, + "grad_norm": 2.386645793914795, + "learning_rate": 1.2733868877947655e-05, + "loss": 1.634, + "step": 2457 + }, + { + "epoch": 0.05732827609932159, + "grad_norm": 1.5789023637771606, + "learning_rate": 1.273905156776367e-05, + "loss": 1.0976, + "step": 2458 + }, + { + "epoch": 0.05735159923849951, + "grad_norm": 1.582072138786316, + "learning_rate": 1.2744234257579684e-05, + "loss": 1.5047, + "step": 2459 + }, + { + "epoch": 0.057374922377677426, + "grad_norm": 1.7116421461105347, + "learning_rate": 1.2749416947395698e-05, + "loss": 1.4517, + "step": 2460 + }, + { + "epoch": 0.057398245516855344, + "grad_norm": 1.992920994758606, + "learning_rate": 1.2754599637211713e-05, + "loss": 1.167, + "step": 2461 + }, + { + "epoch": 0.05742156865603326, + "grad_norm": 1.7484151124954224, + "learning_rate": 1.2759782327027727e-05, + "loss": 1.5353, + "step": 2462 + }, + { + "epoch": 0.05744489179521118, + "grad_norm": 1.54849374294281, + "learning_rate": 1.2764965016843743e-05, + "loss": 1.6425, + "step": 2463 + }, + { + "epoch": 0.0574682149343891, + "grad_norm": 1.6086442470550537, + "learning_rate": 1.2770147706659757e-05, + "loss": 1.4122, + "step": 2464 + }, + { + "epoch": 0.05749153807356701, + "grad_norm": 1.4398033618927002, + "learning_rate": 1.2775330396475772e-05, + "loss": 1.2049, + "step": 2465 + }, + { + "epoch": 0.057514861212744926, + "grad_norm": 1.8544633388519287, + "learning_rate": 1.2780513086291786e-05, + "loss": 1.4098, + "step": 2466 + }, + { + "epoch": 0.057538184351922844, + "grad_norm": 2.2655575275421143, + "learning_rate": 1.27856957761078e-05, + "loss": 1.8327, + "step": 2467 + }, + { + "epoch": 0.05756150749110076, + "grad_norm": 1.9104077816009521, + "learning_rate": 1.2790878465923815e-05, + "loss": 1.6752, + "step": 2468 + }, + { + "epoch": 0.05758483063027868, + "grad_norm": 1.6727238893508911, + "learning_rate": 1.2796061155739829e-05, + "loss": 1.4592, + "step": 2469 + }, + { + "epoch": 0.0576081537694566, + "grad_norm": 1.9189153909683228, + "learning_rate": 1.2801243845555843e-05, + "loss": 1.4464, + "step": 2470 + }, + { + "epoch": 0.057631476908634516, + "grad_norm": 2.133301258087158, + "learning_rate": 1.280642653537186e-05, + "loss": 1.3992, + "step": 2471 + }, + { + "epoch": 0.057654800047812434, + "grad_norm": 2.135209560394287, + "learning_rate": 1.2811609225187874e-05, + "loss": 1.5513, + "step": 2472 + }, + { + "epoch": 0.05767812318699035, + "grad_norm": 1.6449888944625854, + "learning_rate": 1.2816791915003888e-05, + "loss": 1.6559, + "step": 2473 + }, + { + "epoch": 0.05770144632616827, + "grad_norm": 1.9080917835235596, + "learning_rate": 1.2821974604819902e-05, + "loss": 1.4798, + "step": 2474 + }, + { + "epoch": 0.05772476946534619, + "grad_norm": 1.9846621751785278, + "learning_rate": 1.2827157294635917e-05, + "loss": 1.6511, + "step": 2475 + }, + { + "epoch": 0.057748092604524105, + "grad_norm": 1.755775809288025, + "learning_rate": 1.2832339984451931e-05, + "loss": 1.5076, + "step": 2476 + }, + { + "epoch": 0.05777141574370202, + "grad_norm": 2.24576735496521, + "learning_rate": 1.2837522674267946e-05, + "loss": 1.173, + "step": 2477 + }, + { + "epoch": 0.05779473888287994, + "grad_norm": 2.244088649749756, + "learning_rate": 1.284270536408396e-05, + "loss": 1.871, + "step": 2478 + }, + { + "epoch": 0.05781806202205786, + "grad_norm": 2.0059969425201416, + "learning_rate": 1.2847888053899974e-05, + "loss": 1.5696, + "step": 2479 + }, + { + "epoch": 0.05784138516123578, + "grad_norm": 1.586342453956604, + "learning_rate": 1.285307074371599e-05, + "loss": 1.7007, + "step": 2480 + }, + { + "epoch": 0.057864708300413695, + "grad_norm": 2.4792561531066895, + "learning_rate": 1.2858253433532005e-05, + "loss": 1.3885, + "step": 2481 + }, + { + "epoch": 0.05788803143959161, + "grad_norm": 1.8159295320510864, + "learning_rate": 1.2863436123348019e-05, + "loss": 1.4572, + "step": 2482 + }, + { + "epoch": 0.05791135457876953, + "grad_norm": 1.6010699272155762, + "learning_rate": 1.2868618813164033e-05, + "loss": 1.3239, + "step": 2483 + }, + { + "epoch": 0.05793467771794745, + "grad_norm": 2.294011116027832, + "learning_rate": 1.2873801502980048e-05, + "loss": 1.4858, + "step": 2484 + }, + { + "epoch": 0.057958000857125366, + "grad_norm": 1.7822222709655762, + "learning_rate": 1.2878984192796062e-05, + "loss": 1.6677, + "step": 2485 + }, + { + "epoch": 0.057981323996303284, + "grad_norm": 1.7381556034088135, + "learning_rate": 1.2884166882612076e-05, + "loss": 1.5102, + "step": 2486 + }, + { + "epoch": 0.0580046471354812, + "grad_norm": 1.5182676315307617, + "learning_rate": 1.288934957242809e-05, + "loss": 1.4902, + "step": 2487 + }, + { + "epoch": 0.05802797027465912, + "grad_norm": 2.3234503269195557, + "learning_rate": 1.2894532262244107e-05, + "loss": 1.7547, + "step": 2488 + }, + { + "epoch": 0.05805129341383704, + "grad_norm": 1.8449368476867676, + "learning_rate": 1.2899714952060121e-05, + "loss": 1.4566, + "step": 2489 + }, + { + "epoch": 0.058074616553014956, + "grad_norm": 1.6624127626419067, + "learning_rate": 1.2904897641876135e-05, + "loss": 1.3047, + "step": 2490 + }, + { + "epoch": 0.058097939692192874, + "grad_norm": 1.881435513496399, + "learning_rate": 1.291008033169215e-05, + "loss": 1.4678, + "step": 2491 + }, + { + "epoch": 0.05812126283137079, + "grad_norm": 1.8103615045547485, + "learning_rate": 1.2915263021508164e-05, + "loss": 1.4687, + "step": 2492 + }, + { + "epoch": 0.0581445859705487, + "grad_norm": 2.051447868347168, + "learning_rate": 1.2920445711324178e-05, + "loss": 1.45, + "step": 2493 + }, + { + "epoch": 0.05816790910972662, + "grad_norm": 1.8762485980987549, + "learning_rate": 1.2925628401140193e-05, + "loss": 1.2467, + "step": 2494 + }, + { + "epoch": 0.05819123224890454, + "grad_norm": 1.860500454902649, + "learning_rate": 1.2930811090956207e-05, + "loss": 1.4693, + "step": 2495 + }, + { + "epoch": 0.058214555388082456, + "grad_norm": 1.8621612787246704, + "learning_rate": 1.2935993780772223e-05, + "loss": 1.7739, + "step": 2496 + }, + { + "epoch": 0.058237878527260374, + "grad_norm": 2.166384220123291, + "learning_rate": 1.2941176470588238e-05, + "loss": 1.3166, + "step": 2497 + }, + { + "epoch": 0.05826120166643829, + "grad_norm": 1.7007169723510742, + "learning_rate": 1.2946359160404252e-05, + "loss": 1.2054, + "step": 2498 + }, + { + "epoch": 0.05828452480561621, + "grad_norm": 1.8515853881835938, + "learning_rate": 1.2951541850220266e-05, + "loss": 1.5207, + "step": 2499 + }, + { + "epoch": 0.05830784794479413, + "grad_norm": 1.6332519054412842, + "learning_rate": 1.295672454003628e-05, + "loss": 1.5376, + "step": 2500 + }, + { + "epoch": 0.058331171083972046, + "grad_norm": 1.839971899986267, + "learning_rate": 1.2961907229852295e-05, + "loss": 1.4077, + "step": 2501 + }, + { + "epoch": 0.058354494223149964, + "grad_norm": 1.689015507698059, + "learning_rate": 1.296708991966831e-05, + "loss": 1.5571, + "step": 2502 + }, + { + "epoch": 0.05837781736232788, + "grad_norm": 1.962945580482483, + "learning_rate": 1.2972272609484324e-05, + "loss": 1.5348, + "step": 2503 + }, + { + "epoch": 0.0584011405015058, + "grad_norm": 1.7358533143997192, + "learning_rate": 1.297745529930034e-05, + "loss": 1.4625, + "step": 2504 + }, + { + "epoch": 0.05842446364068372, + "grad_norm": 1.5950040817260742, + "learning_rate": 1.2982637989116354e-05, + "loss": 1.184, + "step": 2505 + }, + { + "epoch": 0.058447786779861635, + "grad_norm": 1.8383833169937134, + "learning_rate": 1.2987820678932368e-05, + "loss": 1.6773, + "step": 2506 + }, + { + "epoch": 0.05847110991903955, + "grad_norm": 1.7583168745040894, + "learning_rate": 1.2993003368748383e-05, + "loss": 1.564, + "step": 2507 + }, + { + "epoch": 0.05849443305821747, + "grad_norm": 1.9862589836120605, + "learning_rate": 1.2998186058564397e-05, + "loss": 1.411, + "step": 2508 + }, + { + "epoch": 0.05851775619739539, + "grad_norm": 1.8201732635498047, + "learning_rate": 1.300336874838041e-05, + "loss": 1.3807, + "step": 2509 + }, + { + "epoch": 0.05854107933657331, + "grad_norm": 1.6528314352035522, + "learning_rate": 1.3008551438196424e-05, + "loss": 1.5299, + "step": 2510 + }, + { + "epoch": 0.058564402475751225, + "grad_norm": 2.0610220432281494, + "learning_rate": 1.3013734128012438e-05, + "loss": 1.6129, + "step": 2511 + }, + { + "epoch": 0.05858772561492914, + "grad_norm": 1.6855478286743164, + "learning_rate": 1.3018916817828453e-05, + "loss": 1.0253, + "step": 2512 + }, + { + "epoch": 0.05861104875410706, + "grad_norm": 1.9679440259933472, + "learning_rate": 1.3024099507644467e-05, + "loss": 1.4495, + "step": 2513 + }, + { + "epoch": 0.05863437189328498, + "grad_norm": 1.7133574485778809, + "learning_rate": 1.3029282197460481e-05, + "loss": 1.4932, + "step": 2514 + }, + { + "epoch": 0.058657695032462896, + "grad_norm": 1.9857035875320435, + "learning_rate": 1.3034464887276497e-05, + "loss": 1.429, + "step": 2515 + }, + { + "epoch": 0.058681018171640814, + "grad_norm": 1.3343905210494995, + "learning_rate": 1.3039647577092512e-05, + "loss": 1.4605, + "step": 2516 + }, + { + "epoch": 0.05870434131081873, + "grad_norm": 1.6987974643707275, + "learning_rate": 1.3044830266908526e-05, + "loss": 1.0714, + "step": 2517 + }, + { + "epoch": 0.05872766444999665, + "grad_norm": 1.5600383281707764, + "learning_rate": 1.305001295672454e-05, + "loss": 1.3758, + "step": 2518 + }, + { + "epoch": 0.05875098758917457, + "grad_norm": 1.6523587703704834, + "learning_rate": 1.3055195646540555e-05, + "loss": 1.3751, + "step": 2519 + }, + { + "epoch": 0.058774310728352486, + "grad_norm": 2.1189181804656982, + "learning_rate": 1.306037833635657e-05, + "loss": 1.5012, + "step": 2520 + }, + { + "epoch": 0.058797633867530404, + "grad_norm": 1.5232523679733276, + "learning_rate": 1.3065561026172584e-05, + "loss": 1.697, + "step": 2521 + }, + { + "epoch": 0.058820957006708315, + "grad_norm": 1.6514025926589966, + "learning_rate": 1.3070743715988598e-05, + "loss": 1.2868, + "step": 2522 + }, + { + "epoch": 0.05884428014588623, + "grad_norm": 1.9371200799942017, + "learning_rate": 1.3075926405804614e-05, + "loss": 1.5558, + "step": 2523 + }, + { + "epoch": 0.05886760328506415, + "grad_norm": 2.4826416969299316, + "learning_rate": 1.3081109095620628e-05, + "loss": 1.4871, + "step": 2524 + }, + { + "epoch": 0.05889092642424207, + "grad_norm": 1.9945695400238037, + "learning_rate": 1.3086291785436643e-05, + "loss": 1.4965, + "step": 2525 + }, + { + "epoch": 0.058914249563419986, + "grad_norm": 2.7831015586853027, + "learning_rate": 1.3091474475252657e-05, + "loss": 1.42, + "step": 2526 + }, + { + "epoch": 0.058937572702597904, + "grad_norm": 1.4849002361297607, + "learning_rate": 1.3096657165068671e-05, + "loss": 1.2279, + "step": 2527 + }, + { + "epoch": 0.05896089584177582, + "grad_norm": 1.8503111600875854, + "learning_rate": 1.3101839854884686e-05, + "loss": 1.2645, + "step": 2528 + }, + { + "epoch": 0.05898421898095374, + "grad_norm": 1.8192991018295288, + "learning_rate": 1.31070225447007e-05, + "loss": 1.5411, + "step": 2529 + }, + { + "epoch": 0.05900754212013166, + "grad_norm": 1.799962043762207, + "learning_rate": 1.3112205234516714e-05, + "loss": 1.4876, + "step": 2530 + }, + { + "epoch": 0.059030865259309576, + "grad_norm": 1.8138644695281982, + "learning_rate": 1.3117387924332729e-05, + "loss": 1.0486, + "step": 2531 + }, + { + "epoch": 0.059054188398487494, + "grad_norm": 1.7154499292373657, + "learning_rate": 1.3122570614148745e-05, + "loss": 1.4033, + "step": 2532 + }, + { + "epoch": 0.05907751153766541, + "grad_norm": 2.8421483039855957, + "learning_rate": 1.3127753303964759e-05, + "loss": 1.3163, + "step": 2533 + }, + { + "epoch": 0.05910083467684333, + "grad_norm": 1.5384453535079956, + "learning_rate": 1.3132935993780773e-05, + "loss": 1.2216, + "step": 2534 + }, + { + "epoch": 0.05912415781602125, + "grad_norm": 2.0954272747039795, + "learning_rate": 1.3138118683596788e-05, + "loss": 1.5509, + "step": 2535 + }, + { + "epoch": 0.059147480955199165, + "grad_norm": 1.9540061950683594, + "learning_rate": 1.3143301373412802e-05, + "loss": 1.4651, + "step": 2536 + }, + { + "epoch": 0.05917080409437708, + "grad_norm": 2.361682176589966, + "learning_rate": 1.3148484063228817e-05, + "loss": 1.6031, + "step": 2537 + }, + { + "epoch": 0.059194127233555, + "grad_norm": 2.1207122802734375, + "learning_rate": 1.3153666753044831e-05, + "loss": 1.6092, + "step": 2538 + }, + { + "epoch": 0.05921745037273292, + "grad_norm": 1.5452579259872437, + "learning_rate": 1.3158849442860845e-05, + "loss": 1.5587, + "step": 2539 + }, + { + "epoch": 0.05924077351191084, + "grad_norm": 1.748274803161621, + "learning_rate": 1.3164032132676861e-05, + "loss": 1.1223, + "step": 2540 + }, + { + "epoch": 0.059264096651088755, + "grad_norm": 2.0728864669799805, + "learning_rate": 1.3169214822492876e-05, + "loss": 1.3435, + "step": 2541 + }, + { + "epoch": 0.05928741979026667, + "grad_norm": 1.6231045722961426, + "learning_rate": 1.317439751230889e-05, + "loss": 1.3008, + "step": 2542 + }, + { + "epoch": 0.05931074292944459, + "grad_norm": 1.9777765274047852, + "learning_rate": 1.3179580202124904e-05, + "loss": 1.3855, + "step": 2543 + }, + { + "epoch": 0.05933406606862251, + "grad_norm": 1.9784355163574219, + "learning_rate": 1.3184762891940919e-05, + "loss": 1.2233, + "step": 2544 + }, + { + "epoch": 0.059357389207800426, + "grad_norm": 1.5931442975997925, + "learning_rate": 1.3189945581756933e-05, + "loss": 1.277, + "step": 2545 + }, + { + "epoch": 0.059380712346978344, + "grad_norm": 1.490401268005371, + "learning_rate": 1.3195128271572947e-05, + "loss": 1.4339, + "step": 2546 + }, + { + "epoch": 0.05940403548615626, + "grad_norm": 1.848841905593872, + "learning_rate": 1.3200310961388962e-05, + "loss": 1.4852, + "step": 2547 + }, + { + "epoch": 0.05942735862533418, + "grad_norm": 2.0940616130828857, + "learning_rate": 1.3205493651204978e-05, + "loss": 1.4573, + "step": 2548 + }, + { + "epoch": 0.0594506817645121, + "grad_norm": 2.3672728538513184, + "learning_rate": 1.3210676341020992e-05, + "loss": 1.7065, + "step": 2549 + }, + { + "epoch": 0.05947400490369001, + "grad_norm": 4.1710028648376465, + "learning_rate": 1.3215859030837006e-05, + "loss": 1.4652, + "step": 2550 + }, + { + "epoch": 0.05949732804286793, + "grad_norm": 1.6404699087142944, + "learning_rate": 1.322104172065302e-05, + "loss": 1.4241, + "step": 2551 + }, + { + "epoch": 0.059520651182045844, + "grad_norm": 2.475877523422241, + "learning_rate": 1.3226224410469035e-05, + "loss": 1.2603, + "step": 2552 + }, + { + "epoch": 0.05954397432122376, + "grad_norm": 1.8253686428070068, + "learning_rate": 1.323140710028505e-05, + "loss": 1.4414, + "step": 2553 + }, + { + "epoch": 0.05956729746040168, + "grad_norm": 1.9098048210144043, + "learning_rate": 1.3236589790101064e-05, + "loss": 1.4837, + "step": 2554 + }, + { + "epoch": 0.0595906205995796, + "grad_norm": 1.7729190587997437, + "learning_rate": 1.3241772479917078e-05, + "loss": 1.4985, + "step": 2555 + }, + { + "epoch": 0.059613943738757516, + "grad_norm": 1.6407618522644043, + "learning_rate": 1.3246955169733094e-05, + "loss": 1.323, + "step": 2556 + }, + { + "epoch": 0.059637266877935434, + "grad_norm": 1.8097273111343384, + "learning_rate": 1.3252137859549109e-05, + "loss": 1.4201, + "step": 2557 + }, + { + "epoch": 0.05966059001711335, + "grad_norm": 1.6986953020095825, + "learning_rate": 1.3257320549365123e-05, + "loss": 1.2728, + "step": 2558 + }, + { + "epoch": 0.05968391315629127, + "grad_norm": 2.6232433319091797, + "learning_rate": 1.3262503239181137e-05, + "loss": 1.3089, + "step": 2559 + }, + { + "epoch": 0.05970723629546919, + "grad_norm": 1.9292854070663452, + "learning_rate": 1.3267685928997152e-05, + "loss": 1.7235, + "step": 2560 + }, + { + "epoch": 0.059730559434647106, + "grad_norm": 2.367253065109253, + "learning_rate": 1.3272868618813166e-05, + "loss": 1.5625, + "step": 2561 + }, + { + "epoch": 0.05975388257382502, + "grad_norm": 1.9255868196487427, + "learning_rate": 1.327805130862918e-05, + "loss": 1.4485, + "step": 2562 + }, + { + "epoch": 0.05977720571300294, + "grad_norm": 1.65224289894104, + "learning_rate": 1.3283233998445195e-05, + "loss": 1.058, + "step": 2563 + }, + { + "epoch": 0.05980052885218086, + "grad_norm": 1.5006394386291504, + "learning_rate": 1.3288416688261207e-05, + "loss": 1.5081, + "step": 2564 + }, + { + "epoch": 0.05982385199135878, + "grad_norm": 1.7269837856292725, + "learning_rate": 1.3293599378077222e-05, + "loss": 1.4554, + "step": 2565 + }, + { + "epoch": 0.059847175130536695, + "grad_norm": 1.8299132585525513, + "learning_rate": 1.3298782067893236e-05, + "loss": 1.3201, + "step": 2566 + }, + { + "epoch": 0.05987049826971461, + "grad_norm": 1.5136997699737549, + "learning_rate": 1.3303964757709252e-05, + "loss": 1.6177, + "step": 2567 + }, + { + "epoch": 0.05989382140889253, + "grad_norm": 1.5479817390441895, + "learning_rate": 1.3309147447525266e-05, + "loss": 1.3188, + "step": 2568 + }, + { + "epoch": 0.05991714454807045, + "grad_norm": 1.8136457204818726, + "learning_rate": 1.331433013734128e-05, + "loss": 1.0279, + "step": 2569 + }, + { + "epoch": 0.05994046768724837, + "grad_norm": 1.8352434635162354, + "learning_rate": 1.3319512827157295e-05, + "loss": 1.5215, + "step": 2570 + }, + { + "epoch": 0.059963790826426284, + "grad_norm": 2.270509719848633, + "learning_rate": 1.332469551697331e-05, + "loss": 1.6787, + "step": 2571 + }, + { + "epoch": 0.0599871139656042, + "grad_norm": 1.903822898864746, + "learning_rate": 1.3329878206789324e-05, + "loss": 1.5921, + "step": 2572 + }, + { + "epoch": 0.06001043710478212, + "grad_norm": 2.159048318862915, + "learning_rate": 1.3335060896605338e-05, + "loss": 1.6827, + "step": 2573 + }, + { + "epoch": 0.06003376024396004, + "grad_norm": 1.7216435670852661, + "learning_rate": 1.3340243586421352e-05, + "loss": 1.6214, + "step": 2574 + }, + { + "epoch": 0.060057083383137956, + "grad_norm": 1.6836422681808472, + "learning_rate": 1.3345426276237368e-05, + "loss": 1.336, + "step": 2575 + }, + { + "epoch": 0.060080406522315874, + "grad_norm": 2.070140838623047, + "learning_rate": 1.3350608966053383e-05, + "loss": 1.3277, + "step": 2576 + }, + { + "epoch": 0.06010372966149379, + "grad_norm": 1.3669464588165283, + "learning_rate": 1.3355791655869397e-05, + "loss": 1.3032, + "step": 2577 + }, + { + "epoch": 0.06012705280067171, + "grad_norm": 1.7620892524719238, + "learning_rate": 1.3360974345685412e-05, + "loss": 1.6768, + "step": 2578 + }, + { + "epoch": 0.06015037593984962, + "grad_norm": 2.349331855773926, + "learning_rate": 1.3366157035501426e-05, + "loss": 1.3301, + "step": 2579 + }, + { + "epoch": 0.06017369907902754, + "grad_norm": 1.7582626342773438, + "learning_rate": 1.337133972531744e-05, + "loss": 1.78, + "step": 2580 + }, + { + "epoch": 0.060197022218205457, + "grad_norm": 1.4992632865905762, + "learning_rate": 1.3376522415133455e-05, + "loss": 1.3913, + "step": 2581 + }, + { + "epoch": 0.060220345357383374, + "grad_norm": 2.119130849838257, + "learning_rate": 1.3381705104949469e-05, + "loss": 1.5424, + "step": 2582 + }, + { + "epoch": 0.06024366849656129, + "grad_norm": 2.205332040786743, + "learning_rate": 1.3386887794765483e-05, + "loss": 1.2993, + "step": 2583 + }, + { + "epoch": 0.06026699163573921, + "grad_norm": 1.7924771308898926, + "learning_rate": 1.33920704845815e-05, + "loss": 1.6227, + "step": 2584 + }, + { + "epoch": 0.06029031477491713, + "grad_norm": 2.209590435028076, + "learning_rate": 1.3397253174397514e-05, + "loss": 1.4697, + "step": 2585 + }, + { + "epoch": 0.060313637914095046, + "grad_norm": 1.8160085678100586, + "learning_rate": 1.3402435864213528e-05, + "loss": 1.6885, + "step": 2586 + }, + { + "epoch": 0.060336961053272964, + "grad_norm": 1.7656259536743164, + "learning_rate": 1.3407618554029542e-05, + "loss": 1.7152, + "step": 2587 + }, + { + "epoch": 0.06036028419245088, + "grad_norm": 1.9651085138320923, + "learning_rate": 1.3412801243845557e-05, + "loss": 1.3886, + "step": 2588 + }, + { + "epoch": 0.0603836073316288, + "grad_norm": 1.6476036310195923, + "learning_rate": 1.3417983933661571e-05, + "loss": 1.4977, + "step": 2589 + }, + { + "epoch": 0.06040693047080672, + "grad_norm": 2.9965789318084717, + "learning_rate": 1.3423166623477585e-05, + "loss": 1.6411, + "step": 2590 + }, + { + "epoch": 0.060430253609984635, + "grad_norm": 2.1225550174713135, + "learning_rate": 1.34283493132936e-05, + "loss": 1.7122, + "step": 2591 + }, + { + "epoch": 0.06045357674916255, + "grad_norm": 1.2732594013214111, + "learning_rate": 1.3433532003109616e-05, + "loss": 1.329, + "step": 2592 + }, + { + "epoch": 0.06047689988834047, + "grad_norm": 1.5653432607650757, + "learning_rate": 1.343871469292563e-05, + "loss": 1.2586, + "step": 2593 + }, + { + "epoch": 0.06050022302751839, + "grad_norm": 2.590881824493408, + "learning_rate": 1.3443897382741644e-05, + "loss": 1.3558, + "step": 2594 + }, + { + "epoch": 0.06052354616669631, + "grad_norm": 1.7460978031158447, + "learning_rate": 1.3449080072557659e-05, + "loss": 1.116, + "step": 2595 + }, + { + "epoch": 0.060546869305874225, + "grad_norm": 1.6606675386428833, + "learning_rate": 1.3454262762373673e-05, + "loss": 1.2989, + "step": 2596 + }, + { + "epoch": 0.06057019244505214, + "grad_norm": 1.8426719903945923, + "learning_rate": 1.3459445452189687e-05, + "loss": 1.322, + "step": 2597 + }, + { + "epoch": 0.06059351558423006, + "grad_norm": 1.3999927043914795, + "learning_rate": 1.3464628142005702e-05, + "loss": 1.3036, + "step": 2598 + }, + { + "epoch": 0.06061683872340798, + "grad_norm": 1.3142712116241455, + "learning_rate": 1.3469810831821716e-05, + "loss": 1.4356, + "step": 2599 + }, + { + "epoch": 0.060640161862585897, + "grad_norm": 2.233654737472534, + "learning_rate": 1.3474993521637732e-05, + "loss": 1.4797, + "step": 2600 + }, + { + "epoch": 0.060663485001763814, + "grad_norm": 2.094975471496582, + "learning_rate": 1.3480176211453747e-05, + "loss": 1.2695, + "step": 2601 + }, + { + "epoch": 0.06068680814094173, + "grad_norm": 1.5429056882858276, + "learning_rate": 1.3485358901269761e-05, + "loss": 1.5335, + "step": 2602 + }, + { + "epoch": 0.06071013128011965, + "grad_norm": 1.8557305335998535, + "learning_rate": 1.3490541591085775e-05, + "loss": 1.5586, + "step": 2603 + }, + { + "epoch": 0.06073345441929757, + "grad_norm": 1.8338919878005981, + "learning_rate": 1.349572428090179e-05, + "loss": 1.5284, + "step": 2604 + }, + { + "epoch": 0.060756777558475486, + "grad_norm": 2.071093797683716, + "learning_rate": 1.3500906970717804e-05, + "loss": 1.3638, + "step": 2605 + }, + { + "epoch": 0.060780100697653404, + "grad_norm": 1.9140174388885498, + "learning_rate": 1.3506089660533818e-05, + "loss": 1.4051, + "step": 2606 + }, + { + "epoch": 0.060803423836831315, + "grad_norm": 1.5910030603408813, + "learning_rate": 1.3511272350349833e-05, + "loss": 1.241, + "step": 2607 + }, + { + "epoch": 0.06082674697600923, + "grad_norm": 1.788118839263916, + "learning_rate": 1.3516455040165849e-05, + "loss": 1.6792, + "step": 2608 + }, + { + "epoch": 0.06085007011518715, + "grad_norm": 1.9293417930603027, + "learning_rate": 1.3521637729981863e-05, + "loss": 1.424, + "step": 2609 + }, + { + "epoch": 0.06087339325436507, + "grad_norm": 2.037588596343994, + "learning_rate": 1.3526820419797877e-05, + "loss": 1.3211, + "step": 2610 + }, + { + "epoch": 0.060896716393542986, + "grad_norm": 1.746131420135498, + "learning_rate": 1.3532003109613892e-05, + "loss": 1.4224, + "step": 2611 + }, + { + "epoch": 0.060920039532720904, + "grad_norm": 1.830556035041809, + "learning_rate": 1.3537185799429906e-05, + "loss": 1.3317, + "step": 2612 + }, + { + "epoch": 0.06094336267189882, + "grad_norm": 1.5884668827056885, + "learning_rate": 1.354236848924592e-05, + "loss": 1.3695, + "step": 2613 + }, + { + "epoch": 0.06096668581107674, + "grad_norm": 1.6232162714004517, + "learning_rate": 1.3547551179061935e-05, + "loss": 1.314, + "step": 2614 + }, + { + "epoch": 0.06099000895025466, + "grad_norm": 1.992008090019226, + "learning_rate": 1.3552733868877949e-05, + "loss": 1.8423, + "step": 2615 + }, + { + "epoch": 0.061013332089432576, + "grad_norm": 1.4518450498580933, + "learning_rate": 1.3557916558693963e-05, + "loss": 1.5026, + "step": 2616 + }, + { + "epoch": 0.061036655228610494, + "grad_norm": 2.025433301925659, + "learning_rate": 1.356309924850998e-05, + "loss": 1.5297, + "step": 2617 + }, + { + "epoch": 0.06105997836778841, + "grad_norm": 3.8088228702545166, + "learning_rate": 1.3568281938325994e-05, + "loss": 1.4006, + "step": 2618 + }, + { + "epoch": 0.06108330150696633, + "grad_norm": 1.7329661846160889, + "learning_rate": 1.3573464628142007e-05, + "loss": 1.5897, + "step": 2619 + }, + { + "epoch": 0.06110662464614425, + "grad_norm": 1.9908490180969238, + "learning_rate": 1.3578647317958021e-05, + "loss": 1.9205, + "step": 2620 + }, + { + "epoch": 0.061129947785322165, + "grad_norm": 1.6847058534622192, + "learning_rate": 1.3583830007774035e-05, + "loss": 1.4672, + "step": 2621 + }, + { + "epoch": 0.06115327092450008, + "grad_norm": 1.6240817308425903, + "learning_rate": 1.358901269759005e-05, + "loss": 1.6396, + "step": 2622 + }, + { + "epoch": 0.061176594063678, + "grad_norm": 1.4848893880844116, + "learning_rate": 1.3594195387406064e-05, + "loss": 1.077, + "step": 2623 + }, + { + "epoch": 0.06119991720285592, + "grad_norm": 1.7938750982284546, + "learning_rate": 1.3599378077222078e-05, + "loss": 1.4143, + "step": 2624 + }, + { + "epoch": 0.06122324034203384, + "grad_norm": 1.827236294746399, + "learning_rate": 1.3604560767038093e-05, + "loss": 1.4336, + "step": 2625 + }, + { + "epoch": 0.061246563481211755, + "grad_norm": 1.7886489629745483, + "learning_rate": 1.3609743456854107e-05, + "loss": 1.6841, + "step": 2626 + }, + { + "epoch": 0.06126988662038967, + "grad_norm": 1.934395432472229, + "learning_rate": 1.3614926146670123e-05, + "loss": 1.3356, + "step": 2627 + }, + { + "epoch": 0.06129320975956759, + "grad_norm": 2.428112745285034, + "learning_rate": 1.3620108836486137e-05, + "loss": 1.6694, + "step": 2628 + }, + { + "epoch": 0.06131653289874551, + "grad_norm": 2.5163893699645996, + "learning_rate": 1.3625291526302152e-05, + "loss": 1.412, + "step": 2629 + }, + { + "epoch": 0.061339856037923426, + "grad_norm": 1.8245412111282349, + "learning_rate": 1.3630474216118166e-05, + "loss": 1.3696, + "step": 2630 + }, + { + "epoch": 0.061363179177101344, + "grad_norm": 2.0314536094665527, + "learning_rate": 1.363565690593418e-05, + "loss": 1.5721, + "step": 2631 + }, + { + "epoch": 0.06138650231627926, + "grad_norm": 1.6534019708633423, + "learning_rate": 1.3640839595750195e-05, + "loss": 1.7137, + "step": 2632 + }, + { + "epoch": 0.06140982545545718, + "grad_norm": 1.8295602798461914, + "learning_rate": 1.3646022285566209e-05, + "loss": 1.7957, + "step": 2633 + }, + { + "epoch": 0.0614331485946351, + "grad_norm": 1.9154319763183594, + "learning_rate": 1.3651204975382223e-05, + "loss": 1.6792, + "step": 2634 + }, + { + "epoch": 0.061456471733813016, + "grad_norm": 1.9036335945129395, + "learning_rate": 1.3656387665198238e-05, + "loss": 1.5156, + "step": 2635 + }, + { + "epoch": 0.06147979487299093, + "grad_norm": 1.7407076358795166, + "learning_rate": 1.3661570355014254e-05, + "loss": 1.6499, + "step": 2636 + }, + { + "epoch": 0.061503118012168845, + "grad_norm": 1.8979015350341797, + "learning_rate": 1.3666753044830268e-05, + "loss": 1.6375, + "step": 2637 + }, + { + "epoch": 0.06152644115134676, + "grad_norm": 1.8352705240249634, + "learning_rate": 1.3671935734646282e-05, + "loss": 1.4799, + "step": 2638 + }, + { + "epoch": 0.06154976429052468, + "grad_norm": 2.0247464179992676, + "learning_rate": 1.3677118424462297e-05, + "loss": 1.3662, + "step": 2639 + }, + { + "epoch": 0.0615730874297026, + "grad_norm": 1.6887078285217285, + "learning_rate": 1.3682301114278311e-05, + "loss": 1.7703, + "step": 2640 + }, + { + "epoch": 0.061596410568880516, + "grad_norm": 1.784588098526001, + "learning_rate": 1.3687483804094326e-05, + "loss": 1.571, + "step": 2641 + }, + { + "epoch": 0.061619733708058434, + "grad_norm": 2.69565749168396, + "learning_rate": 1.369266649391034e-05, + "loss": 1.5766, + "step": 2642 + }, + { + "epoch": 0.06164305684723635, + "grad_norm": 1.9466243982315063, + "learning_rate": 1.3697849183726354e-05, + "loss": 1.6967, + "step": 2643 + }, + { + "epoch": 0.06166637998641427, + "grad_norm": 2.077322006225586, + "learning_rate": 1.370303187354237e-05, + "loss": 1.4202, + "step": 2644 + }, + { + "epoch": 0.06168970312559219, + "grad_norm": 1.9221806526184082, + "learning_rate": 1.3708214563358385e-05, + "loss": 1.6579, + "step": 2645 + }, + { + "epoch": 0.061713026264770106, + "grad_norm": 1.5117321014404297, + "learning_rate": 1.3713397253174399e-05, + "loss": 1.3543, + "step": 2646 + }, + { + "epoch": 0.061736349403948024, + "grad_norm": 1.7436460256576538, + "learning_rate": 1.3718579942990413e-05, + "loss": 1.6648, + "step": 2647 + }, + { + "epoch": 0.06175967254312594, + "grad_norm": 1.8213878870010376, + "learning_rate": 1.3723762632806428e-05, + "loss": 1.3991, + "step": 2648 + }, + { + "epoch": 0.06178299568230386, + "grad_norm": 2.2070603370666504, + "learning_rate": 1.3728945322622442e-05, + "loss": 1.531, + "step": 2649 + }, + { + "epoch": 0.06180631882148178, + "grad_norm": 2.077101945877075, + "learning_rate": 1.3734128012438456e-05, + "loss": 1.3124, + "step": 2650 + }, + { + "epoch": 0.061829641960659695, + "grad_norm": 2.2365119457244873, + "learning_rate": 1.373931070225447e-05, + "loss": 1.4602, + "step": 2651 + }, + { + "epoch": 0.06185296509983761, + "grad_norm": 1.6677820682525635, + "learning_rate": 1.3744493392070487e-05, + "loss": 1.3884, + "step": 2652 + }, + { + "epoch": 0.06187628823901553, + "grad_norm": 1.9608267545700073, + "learning_rate": 1.3749676081886501e-05, + "loss": 1.5826, + "step": 2653 + }, + { + "epoch": 0.06189961137819345, + "grad_norm": 2.236156463623047, + "learning_rate": 1.3754858771702515e-05, + "loss": 1.4431, + "step": 2654 + }, + { + "epoch": 0.06192293451737137, + "grad_norm": 2.1494522094726562, + "learning_rate": 1.376004146151853e-05, + "loss": 1.2584, + "step": 2655 + }, + { + "epoch": 0.061946257656549285, + "grad_norm": 1.94198477268219, + "learning_rate": 1.3765224151334544e-05, + "loss": 1.3086, + "step": 2656 + }, + { + "epoch": 0.0619695807957272, + "grad_norm": 1.835691213607788, + "learning_rate": 1.3770406841150558e-05, + "loss": 1.7029, + "step": 2657 + }, + { + "epoch": 0.06199290393490512, + "grad_norm": 1.8903217315673828, + "learning_rate": 1.3775589530966573e-05, + "loss": 1.652, + "step": 2658 + }, + { + "epoch": 0.06201622707408304, + "grad_norm": 1.707434892654419, + "learning_rate": 1.3780772220782587e-05, + "loss": 1.2228, + "step": 2659 + }, + { + "epoch": 0.062039550213260956, + "grad_norm": 1.802922248840332, + "learning_rate": 1.3785954910598603e-05, + "loss": 1.2687, + "step": 2660 + }, + { + "epoch": 0.062062873352438874, + "grad_norm": 2.5531184673309326, + "learning_rate": 1.3791137600414618e-05, + "loss": 1.4097, + "step": 2661 + }, + { + "epoch": 0.06208619649161679, + "grad_norm": 1.693434715270996, + "learning_rate": 1.3796320290230632e-05, + "loss": 1.4266, + "step": 2662 + }, + { + "epoch": 0.06210951963079471, + "grad_norm": 1.6222015619277954, + "learning_rate": 1.3801502980046646e-05, + "loss": 1.3221, + "step": 2663 + }, + { + "epoch": 0.06213284276997262, + "grad_norm": 1.7141613960266113, + "learning_rate": 1.380668566986266e-05, + "loss": 1.4362, + "step": 2664 + }, + { + "epoch": 0.06215616590915054, + "grad_norm": 2.7775673866271973, + "learning_rate": 1.3811868359678675e-05, + "loss": 1.6965, + "step": 2665 + }, + { + "epoch": 0.06217948904832846, + "grad_norm": 1.24868905544281, + "learning_rate": 1.381705104949469e-05, + "loss": 1.2357, + "step": 2666 + }, + { + "epoch": 0.062202812187506375, + "grad_norm": 1.7877827882766724, + "learning_rate": 1.3822233739310704e-05, + "loss": 1.6933, + "step": 2667 + }, + { + "epoch": 0.06222613532668429, + "grad_norm": 1.5741866827011108, + "learning_rate": 1.3827416429126718e-05, + "loss": 1.5295, + "step": 2668 + }, + { + "epoch": 0.06224945846586221, + "grad_norm": 1.7893970012664795, + "learning_rate": 1.3832599118942734e-05, + "loss": 1.3697, + "step": 2669 + }, + { + "epoch": 0.06227278160504013, + "grad_norm": 1.8457837104797363, + "learning_rate": 1.3837781808758748e-05, + "loss": 1.3283, + "step": 2670 + }, + { + "epoch": 0.062296104744218046, + "grad_norm": 1.6998080015182495, + "learning_rate": 1.3842964498574763e-05, + "loss": 1.4704, + "step": 2671 + }, + { + "epoch": 0.062319427883395964, + "grad_norm": 1.6351855993270874, + "learning_rate": 1.3848147188390777e-05, + "loss": 1.5497, + "step": 2672 + }, + { + "epoch": 0.06234275102257388, + "grad_norm": 2.2880098819732666, + "learning_rate": 1.3853329878206791e-05, + "loss": 1.279, + "step": 2673 + }, + { + "epoch": 0.0623660741617518, + "grad_norm": 1.5100282430648804, + "learning_rate": 1.3858512568022804e-05, + "loss": 1.389, + "step": 2674 + }, + { + "epoch": 0.06238939730092972, + "grad_norm": 2.57076358795166, + "learning_rate": 1.3863695257838818e-05, + "loss": 1.4024, + "step": 2675 + }, + { + "epoch": 0.062412720440107636, + "grad_norm": 1.4813395738601685, + "learning_rate": 1.3868877947654833e-05, + "loss": 1.2078, + "step": 2676 + }, + { + "epoch": 0.062436043579285554, + "grad_norm": 2.9532761573791504, + "learning_rate": 1.3874060637470847e-05, + "loss": 1.3444, + "step": 2677 + }, + { + "epoch": 0.06245936671846347, + "grad_norm": 2.0162930488586426, + "learning_rate": 1.3879243327286861e-05, + "loss": 1.646, + "step": 2678 + }, + { + "epoch": 0.06248268985764139, + "grad_norm": 2.062361717224121, + "learning_rate": 1.3884426017102876e-05, + "loss": 1.485, + "step": 2679 + }, + { + "epoch": 0.06250601299681931, + "grad_norm": 2.0320801734924316, + "learning_rate": 1.3889608706918892e-05, + "loss": 1.6613, + "step": 2680 + }, + { + "epoch": 0.06252933613599722, + "grad_norm": 1.9245129823684692, + "learning_rate": 1.3894791396734906e-05, + "loss": 1.6381, + "step": 2681 + }, + { + "epoch": 0.06255265927517514, + "grad_norm": 2.0062341690063477, + "learning_rate": 1.389997408655092e-05, + "loss": 1.5528, + "step": 2682 + }, + { + "epoch": 0.06257598241435305, + "grad_norm": 2.170292377471924, + "learning_rate": 1.3905156776366935e-05, + "loss": 1.4878, + "step": 2683 + }, + { + "epoch": 0.06259930555353098, + "grad_norm": 1.6300157308578491, + "learning_rate": 1.391033946618295e-05, + "loss": 1.195, + "step": 2684 + }, + { + "epoch": 0.06262262869270889, + "grad_norm": 2.239786386489868, + "learning_rate": 1.3915522155998964e-05, + "loss": 1.6731, + "step": 2685 + }, + { + "epoch": 0.06264595183188681, + "grad_norm": 1.7368957996368408, + "learning_rate": 1.3920704845814978e-05, + "loss": 1.1561, + "step": 2686 + }, + { + "epoch": 0.06266927497106473, + "grad_norm": 1.863064169883728, + "learning_rate": 1.3925887535630992e-05, + "loss": 1.3789, + "step": 2687 + }, + { + "epoch": 0.06269259811024265, + "grad_norm": 1.7001705169677734, + "learning_rate": 1.3931070225447008e-05, + "loss": 1.8909, + "step": 2688 + }, + { + "epoch": 0.06271592124942056, + "grad_norm": 2.388777256011963, + "learning_rate": 1.3936252915263023e-05, + "loss": 1.4946, + "step": 2689 + }, + { + "epoch": 0.06273924438859849, + "grad_norm": 1.803241491317749, + "learning_rate": 1.3941435605079037e-05, + "loss": 1.781, + "step": 2690 + }, + { + "epoch": 0.0627625675277764, + "grad_norm": 1.699387550354004, + "learning_rate": 1.3946618294895051e-05, + "loss": 1.2251, + "step": 2691 + }, + { + "epoch": 0.06278589066695432, + "grad_norm": 2.1750314235687256, + "learning_rate": 1.3951800984711066e-05, + "loss": 1.4449, + "step": 2692 + }, + { + "epoch": 0.06280921380613223, + "grad_norm": 1.7841440439224243, + "learning_rate": 1.395698367452708e-05, + "loss": 1.5959, + "step": 2693 + }, + { + "epoch": 0.06283253694531016, + "grad_norm": 1.9455244541168213, + "learning_rate": 1.3962166364343094e-05, + "loss": 1.3483, + "step": 2694 + }, + { + "epoch": 0.06285586008448807, + "grad_norm": 2.1182174682617188, + "learning_rate": 1.3967349054159109e-05, + "loss": 1.4448, + "step": 2695 + }, + { + "epoch": 0.062879183223666, + "grad_norm": 1.6244593858718872, + "learning_rate": 1.3972531743975125e-05, + "loss": 1.4739, + "step": 2696 + }, + { + "epoch": 0.0629025063628439, + "grad_norm": 1.751879096031189, + "learning_rate": 1.3977714433791139e-05, + "loss": 1.4888, + "step": 2697 + }, + { + "epoch": 0.06292582950202183, + "grad_norm": 2.2495086193084717, + "learning_rate": 1.3982897123607153e-05, + "loss": 1.6448, + "step": 2698 + }, + { + "epoch": 0.06294915264119974, + "grad_norm": 1.6507688760757446, + "learning_rate": 1.3988079813423168e-05, + "loss": 1.735, + "step": 2699 + }, + { + "epoch": 0.06297247578037767, + "grad_norm": 2.61356520652771, + "learning_rate": 1.3993262503239182e-05, + "loss": 1.7015, + "step": 2700 + }, + { + "epoch": 0.06299579891955558, + "grad_norm": 1.9515256881713867, + "learning_rate": 1.3998445193055197e-05, + "loss": 1.7929, + "step": 2701 + }, + { + "epoch": 0.0630191220587335, + "grad_norm": 1.7551541328430176, + "learning_rate": 1.4003627882871211e-05, + "loss": 1.578, + "step": 2702 + }, + { + "epoch": 0.06304244519791141, + "grad_norm": 1.6188023090362549, + "learning_rate": 1.4008810572687225e-05, + "loss": 1.5236, + "step": 2703 + }, + { + "epoch": 0.06306576833708934, + "grad_norm": 1.9739527702331543, + "learning_rate": 1.4013993262503241e-05, + "loss": 1.3597, + "step": 2704 + }, + { + "epoch": 0.06308909147626725, + "grad_norm": 6.611608505249023, + "learning_rate": 1.4019175952319256e-05, + "loss": 1.6558, + "step": 2705 + }, + { + "epoch": 0.06311241461544517, + "grad_norm": 1.7356557846069336, + "learning_rate": 1.402435864213527e-05, + "loss": 1.3899, + "step": 2706 + }, + { + "epoch": 0.06313573775462308, + "grad_norm": 1.846679449081421, + "learning_rate": 1.4029541331951284e-05, + "loss": 1.3879, + "step": 2707 + }, + { + "epoch": 0.063159060893801, + "grad_norm": 2.0728909969329834, + "learning_rate": 1.4034724021767299e-05, + "loss": 1.554, + "step": 2708 + }, + { + "epoch": 0.06318238403297892, + "grad_norm": 2.1357173919677734, + "learning_rate": 1.4039906711583313e-05, + "loss": 1.6166, + "step": 2709 + }, + { + "epoch": 0.06320570717215683, + "grad_norm": 2.6331140995025635, + "learning_rate": 1.4045089401399327e-05, + "loss": 1.6178, + "step": 2710 + }, + { + "epoch": 0.06322903031133476, + "grad_norm": 1.7224211692810059, + "learning_rate": 1.4050272091215342e-05, + "loss": 1.5338, + "step": 2711 + }, + { + "epoch": 0.06325235345051267, + "grad_norm": 1.7378405332565308, + "learning_rate": 1.4055454781031356e-05, + "loss": 1.5187, + "step": 2712 + }, + { + "epoch": 0.06327567658969059, + "grad_norm": 2.088463544845581, + "learning_rate": 1.4060637470847372e-05, + "loss": 1.8847, + "step": 2713 + }, + { + "epoch": 0.0632989997288685, + "grad_norm": 1.7667561769485474, + "learning_rate": 1.4065820160663386e-05, + "loss": 1.4182, + "step": 2714 + }, + { + "epoch": 0.06332232286804643, + "grad_norm": 2.654961109161377, + "learning_rate": 1.40710028504794e-05, + "loss": 1.6021, + "step": 2715 + }, + { + "epoch": 0.06334564600722434, + "grad_norm": 1.7463783025741577, + "learning_rate": 1.4076185540295415e-05, + "loss": 1.6907, + "step": 2716 + }, + { + "epoch": 0.06336896914640226, + "grad_norm": 1.81992769241333, + "learning_rate": 1.408136823011143e-05, + "loss": 1.1185, + "step": 2717 + }, + { + "epoch": 0.06339229228558017, + "grad_norm": 1.9757153987884521, + "learning_rate": 1.4086550919927444e-05, + "loss": 1.5705, + "step": 2718 + }, + { + "epoch": 0.0634156154247581, + "grad_norm": 1.7173175811767578, + "learning_rate": 1.4091733609743458e-05, + "loss": 1.3057, + "step": 2719 + }, + { + "epoch": 0.06343893856393601, + "grad_norm": 1.5778136253356934, + "learning_rate": 1.4096916299559472e-05, + "loss": 1.3756, + "step": 2720 + }, + { + "epoch": 0.06346226170311393, + "grad_norm": 1.7649437189102173, + "learning_rate": 1.4102098989375489e-05, + "loss": 1.3827, + "step": 2721 + }, + { + "epoch": 0.06348558484229185, + "grad_norm": 1.6660621166229248, + "learning_rate": 1.4107281679191503e-05, + "loss": 1.2465, + "step": 2722 + }, + { + "epoch": 0.06350890798146977, + "grad_norm": 1.7358027696609497, + "learning_rate": 1.4112464369007517e-05, + "loss": 1.6967, + "step": 2723 + }, + { + "epoch": 0.06353223112064768, + "grad_norm": 2.084941864013672, + "learning_rate": 1.4117647058823532e-05, + "loss": 1.445, + "step": 2724 + }, + { + "epoch": 0.0635555542598256, + "grad_norm": 2.192439556121826, + "learning_rate": 1.4122829748639546e-05, + "loss": 1.5963, + "step": 2725 + }, + { + "epoch": 0.06357887739900352, + "grad_norm": 2.5304958820343018, + "learning_rate": 1.412801243845556e-05, + "loss": 1.8453, + "step": 2726 + }, + { + "epoch": 0.06360220053818144, + "grad_norm": 1.7766294479370117, + "learning_rate": 1.4133195128271575e-05, + "loss": 1.5245, + "step": 2727 + }, + { + "epoch": 0.06362552367735935, + "grad_norm": 1.8651812076568604, + "learning_rate": 1.4138377818087589e-05, + "loss": 1.5013, + "step": 2728 + }, + { + "epoch": 0.06364884681653728, + "grad_norm": 2.4332542419433594, + "learning_rate": 1.4143560507903602e-05, + "loss": 1.4505, + "step": 2729 + }, + { + "epoch": 0.06367216995571519, + "grad_norm": 1.2664331197738647, + "learning_rate": 1.4148743197719616e-05, + "loss": 1.1377, + "step": 2730 + }, + { + "epoch": 0.06369549309489311, + "grad_norm": 2.3579812049865723, + "learning_rate": 1.415392588753563e-05, + "loss": 1.2938, + "step": 2731 + }, + { + "epoch": 0.06371881623407102, + "grad_norm": 1.6869444847106934, + "learning_rate": 1.4159108577351646e-05, + "loss": 1.6204, + "step": 2732 + }, + { + "epoch": 0.06374213937324895, + "grad_norm": 1.6905202865600586, + "learning_rate": 1.416429126716766e-05, + "loss": 1.4774, + "step": 2733 + }, + { + "epoch": 0.06376546251242686, + "grad_norm": 1.5543984174728394, + "learning_rate": 1.4169473956983675e-05, + "loss": 1.3287, + "step": 2734 + }, + { + "epoch": 0.06378878565160478, + "grad_norm": 2.0648207664489746, + "learning_rate": 1.417465664679969e-05, + "loss": 1.6893, + "step": 2735 + }, + { + "epoch": 0.0638121087907827, + "grad_norm": 2.0521440505981445, + "learning_rate": 1.4179839336615704e-05, + "loss": 1.5036, + "step": 2736 + }, + { + "epoch": 0.0638354319299606, + "grad_norm": 1.6368522644042969, + "learning_rate": 1.4185022026431718e-05, + "loss": 1.5613, + "step": 2737 + }, + { + "epoch": 0.06385875506913853, + "grad_norm": 1.7415629625320435, + "learning_rate": 1.4190204716247732e-05, + "loss": 1.3272, + "step": 2738 + }, + { + "epoch": 0.06388207820831644, + "grad_norm": 2.0426433086395264, + "learning_rate": 1.4195387406063747e-05, + "loss": 1.6206, + "step": 2739 + }, + { + "epoch": 0.06390540134749437, + "grad_norm": 1.856251835823059, + "learning_rate": 1.4200570095879763e-05, + "loss": 1.6922, + "step": 2740 + }, + { + "epoch": 0.06392872448667228, + "grad_norm": 1.6982864141464233, + "learning_rate": 1.4205752785695777e-05, + "loss": 1.5107, + "step": 2741 + }, + { + "epoch": 0.0639520476258502, + "grad_norm": 2.2496535778045654, + "learning_rate": 1.4210935475511792e-05, + "loss": 1.5137, + "step": 2742 + }, + { + "epoch": 0.06397537076502811, + "grad_norm": 1.5447698831558228, + "learning_rate": 1.4216118165327806e-05, + "loss": 1.3271, + "step": 2743 + }, + { + "epoch": 0.06399869390420604, + "grad_norm": 1.8603590726852417, + "learning_rate": 1.422130085514382e-05, + "loss": 1.4773, + "step": 2744 + }, + { + "epoch": 0.06402201704338395, + "grad_norm": 2.3592123985290527, + "learning_rate": 1.4226483544959835e-05, + "loss": 0.9334, + "step": 2745 + }, + { + "epoch": 0.06404534018256187, + "grad_norm": 2.216006278991699, + "learning_rate": 1.4231666234775849e-05, + "loss": 1.3389, + "step": 2746 + }, + { + "epoch": 0.06406866332173979, + "grad_norm": 1.652770757675171, + "learning_rate": 1.4236848924591863e-05, + "loss": 1.3359, + "step": 2747 + }, + { + "epoch": 0.06409198646091771, + "grad_norm": 1.717504620552063, + "learning_rate": 1.424203161440788e-05, + "loss": 1.4533, + "step": 2748 + }, + { + "epoch": 0.06411530960009562, + "grad_norm": 1.7929201126098633, + "learning_rate": 1.4247214304223894e-05, + "loss": 1.3263, + "step": 2749 + }, + { + "epoch": 0.06413863273927355, + "grad_norm": 1.9233717918395996, + "learning_rate": 1.4252396994039908e-05, + "loss": 1.4278, + "step": 2750 + }, + { + "epoch": 0.06416195587845146, + "grad_norm": 2.1122560501098633, + "learning_rate": 1.4257579683855922e-05, + "loss": 1.3754, + "step": 2751 + }, + { + "epoch": 0.06418527901762938, + "grad_norm": 2.260162353515625, + "learning_rate": 1.4262762373671937e-05, + "loss": 1.5928, + "step": 2752 + }, + { + "epoch": 0.06420860215680729, + "grad_norm": 2.246264696121216, + "learning_rate": 1.4267945063487951e-05, + "loss": 1.3268, + "step": 2753 + }, + { + "epoch": 0.06423192529598522, + "grad_norm": 1.7550286054611206, + "learning_rate": 1.4273127753303965e-05, + "loss": 1.393, + "step": 2754 + }, + { + "epoch": 0.06425524843516313, + "grad_norm": 2.4047062397003174, + "learning_rate": 1.427831044311998e-05, + "loss": 1.581, + "step": 2755 + }, + { + "epoch": 0.06427857157434105, + "grad_norm": 3.7945029735565186, + "learning_rate": 1.4283493132935996e-05, + "loss": 1.4191, + "step": 2756 + }, + { + "epoch": 0.06430189471351896, + "grad_norm": 1.6757818460464478, + "learning_rate": 1.428867582275201e-05, + "loss": 1.3387, + "step": 2757 + }, + { + "epoch": 0.06432521785269689, + "grad_norm": 2.4777257442474365, + "learning_rate": 1.4293858512568024e-05, + "loss": 1.7051, + "step": 2758 + }, + { + "epoch": 0.0643485409918748, + "grad_norm": 1.803855299949646, + "learning_rate": 1.4299041202384039e-05, + "loss": 1.3828, + "step": 2759 + }, + { + "epoch": 0.06437186413105273, + "grad_norm": 1.8653554916381836, + "learning_rate": 1.4304223892200053e-05, + "loss": 1.2711, + "step": 2760 + }, + { + "epoch": 0.06439518727023064, + "grad_norm": 1.588797688484192, + "learning_rate": 1.4309406582016067e-05, + "loss": 1.2614, + "step": 2761 + }, + { + "epoch": 0.06441851040940856, + "grad_norm": 2.4432761669158936, + "learning_rate": 1.4314589271832082e-05, + "loss": 1.1186, + "step": 2762 + }, + { + "epoch": 0.06444183354858647, + "grad_norm": 1.8135852813720703, + "learning_rate": 1.4319771961648096e-05, + "loss": 1.4257, + "step": 2763 + }, + { + "epoch": 0.06446515668776438, + "grad_norm": 1.841215968132019, + "learning_rate": 1.432495465146411e-05, + "loss": 1.3623, + "step": 2764 + }, + { + "epoch": 0.06448847982694231, + "grad_norm": 2.0197958946228027, + "learning_rate": 1.4330137341280127e-05, + "loss": 1.468, + "step": 2765 + }, + { + "epoch": 0.06451180296612022, + "grad_norm": 2.380474090576172, + "learning_rate": 1.4335320031096141e-05, + "loss": 1.2547, + "step": 2766 + }, + { + "epoch": 0.06453512610529814, + "grad_norm": 2.137549638748169, + "learning_rate": 1.4340502720912155e-05, + "loss": 1.3884, + "step": 2767 + }, + { + "epoch": 0.06455844924447605, + "grad_norm": 1.8818745613098145, + "learning_rate": 1.434568541072817e-05, + "loss": 1.305, + "step": 2768 + }, + { + "epoch": 0.06458177238365398, + "grad_norm": 1.7254643440246582, + "learning_rate": 1.4350868100544184e-05, + "loss": 1.1492, + "step": 2769 + }, + { + "epoch": 0.06460509552283189, + "grad_norm": 1.8451322317123413, + "learning_rate": 1.4356050790360198e-05, + "loss": 1.5132, + "step": 2770 + }, + { + "epoch": 0.06462841866200981, + "grad_norm": 2.049947738647461, + "learning_rate": 1.4361233480176213e-05, + "loss": 1.4172, + "step": 2771 + }, + { + "epoch": 0.06465174180118773, + "grad_norm": 2.0844335556030273, + "learning_rate": 1.4366416169992227e-05, + "loss": 1.3341, + "step": 2772 + }, + { + "epoch": 0.06467506494036565, + "grad_norm": 2.167858362197876, + "learning_rate": 1.4371598859808243e-05, + "loss": 1.4029, + "step": 2773 + }, + { + "epoch": 0.06469838807954356, + "grad_norm": 2.2055740356445312, + "learning_rate": 1.4376781549624257e-05, + "loss": 1.4169, + "step": 2774 + }, + { + "epoch": 0.06472171121872149, + "grad_norm": 1.6567565202713013, + "learning_rate": 1.4381964239440272e-05, + "loss": 1.4323, + "step": 2775 + }, + { + "epoch": 0.0647450343578994, + "grad_norm": 1.3381197452545166, + "learning_rate": 1.4387146929256286e-05, + "loss": 1.1591, + "step": 2776 + }, + { + "epoch": 0.06476835749707732, + "grad_norm": 1.877096176147461, + "learning_rate": 1.43923296190723e-05, + "loss": 1.4447, + "step": 2777 + }, + { + "epoch": 0.06479168063625523, + "grad_norm": 2.3243725299835205, + "learning_rate": 1.4397512308888315e-05, + "loss": 1.2583, + "step": 2778 + }, + { + "epoch": 0.06481500377543316, + "grad_norm": 2.023531198501587, + "learning_rate": 1.4402694998704329e-05, + "loss": 1.0388, + "step": 2779 + }, + { + "epoch": 0.06483832691461107, + "grad_norm": 1.5490742921829224, + "learning_rate": 1.4407877688520343e-05, + "loss": 1.3603, + "step": 2780 + }, + { + "epoch": 0.064861650053789, + "grad_norm": 1.8510228395462036, + "learning_rate": 1.441306037833636e-05, + "loss": 1.5173, + "step": 2781 + }, + { + "epoch": 0.0648849731929669, + "grad_norm": 1.8563857078552246, + "learning_rate": 1.4418243068152374e-05, + "loss": 1.6119, + "step": 2782 + }, + { + "epoch": 0.06490829633214483, + "grad_norm": 2.1504569053649902, + "learning_rate": 1.4423425757968388e-05, + "loss": 1.2287, + "step": 2783 + }, + { + "epoch": 0.06493161947132274, + "grad_norm": 1.629302978515625, + "learning_rate": 1.4428608447784401e-05, + "loss": 1.4008, + "step": 2784 + }, + { + "epoch": 0.06495494261050067, + "grad_norm": 1.6605809926986694, + "learning_rate": 1.4433791137600415e-05, + "loss": 1.5132, + "step": 2785 + }, + { + "epoch": 0.06497826574967858, + "grad_norm": 1.8998322486877441, + "learning_rate": 1.443897382741643e-05, + "loss": 1.4024, + "step": 2786 + }, + { + "epoch": 0.0650015888888565, + "grad_norm": 1.89012610912323, + "learning_rate": 1.4444156517232444e-05, + "loss": 1.5893, + "step": 2787 + }, + { + "epoch": 0.06502491202803441, + "grad_norm": 2.7149722576141357, + "learning_rate": 1.4449339207048458e-05, + "loss": 1.4361, + "step": 2788 + }, + { + "epoch": 0.06504823516721234, + "grad_norm": 1.397595763206482, + "learning_rate": 1.4454521896864473e-05, + "loss": 1.1094, + "step": 2789 + }, + { + "epoch": 0.06507155830639025, + "grad_norm": 1.8745239973068237, + "learning_rate": 1.4459704586680487e-05, + "loss": 1.6704, + "step": 2790 + }, + { + "epoch": 0.06509488144556817, + "grad_norm": 1.8384937047958374, + "learning_rate": 1.4464887276496501e-05, + "loss": 1.7918, + "step": 2791 + }, + { + "epoch": 0.06511820458474608, + "grad_norm": 1.9192211627960205, + "learning_rate": 1.4470069966312517e-05, + "loss": 1.459, + "step": 2792 + }, + { + "epoch": 0.065141527723924, + "grad_norm": 1.795638918876648, + "learning_rate": 1.4475252656128532e-05, + "loss": 1.3766, + "step": 2793 + }, + { + "epoch": 0.06516485086310192, + "grad_norm": 1.8964101076126099, + "learning_rate": 1.4480435345944546e-05, + "loss": 1.5834, + "step": 2794 + }, + { + "epoch": 0.06518817400227983, + "grad_norm": 1.563743233680725, + "learning_rate": 1.448561803576056e-05, + "loss": 1.1312, + "step": 2795 + }, + { + "epoch": 0.06521149714145776, + "grad_norm": 1.5765119791030884, + "learning_rate": 1.4490800725576575e-05, + "loss": 1.4893, + "step": 2796 + }, + { + "epoch": 0.06523482028063567, + "grad_norm": 1.7887187004089355, + "learning_rate": 1.4495983415392589e-05, + "loss": 1.2458, + "step": 2797 + }, + { + "epoch": 0.06525814341981359, + "grad_norm": 1.4799649715423584, + "learning_rate": 1.4501166105208603e-05, + "loss": 1.358, + "step": 2798 + }, + { + "epoch": 0.0652814665589915, + "grad_norm": 2.0007877349853516, + "learning_rate": 1.4506348795024618e-05, + "loss": 0.915, + "step": 2799 + }, + { + "epoch": 0.06530478969816943, + "grad_norm": 2.1305413246154785, + "learning_rate": 1.4511531484840634e-05, + "loss": 1.7505, + "step": 2800 + }, + { + "epoch": 0.06532811283734734, + "grad_norm": 3.3206400871276855, + "learning_rate": 1.4516714174656648e-05, + "loss": 1.8972, + "step": 2801 + }, + { + "epoch": 0.06535143597652526, + "grad_norm": 1.7682409286499023, + "learning_rate": 1.4521896864472662e-05, + "loss": 1.4471, + "step": 2802 + }, + { + "epoch": 0.06537475911570317, + "grad_norm": 1.81817626953125, + "learning_rate": 1.4527079554288677e-05, + "loss": 1.1318, + "step": 2803 + }, + { + "epoch": 0.0653980822548811, + "grad_norm": 1.710696816444397, + "learning_rate": 1.4532262244104691e-05, + "loss": 1.4271, + "step": 2804 + }, + { + "epoch": 0.06542140539405901, + "grad_norm": 2.3982298374176025, + "learning_rate": 1.4537444933920706e-05, + "loss": 1.5585, + "step": 2805 + }, + { + "epoch": 0.06544472853323693, + "grad_norm": 1.7883695363998413, + "learning_rate": 1.454262762373672e-05, + "loss": 1.3224, + "step": 2806 + }, + { + "epoch": 0.06546805167241485, + "grad_norm": 1.9640473127365112, + "learning_rate": 1.4547810313552734e-05, + "loss": 1.8388, + "step": 2807 + }, + { + "epoch": 0.06549137481159277, + "grad_norm": 2.2285878658294678, + "learning_rate": 1.455299300336875e-05, + "loss": 1.2853, + "step": 2808 + }, + { + "epoch": 0.06551469795077068, + "grad_norm": 1.8806909322738647, + "learning_rate": 1.4558175693184765e-05, + "loss": 1.2943, + "step": 2809 + }, + { + "epoch": 0.0655380210899486, + "grad_norm": 1.5270092487335205, + "learning_rate": 1.4563358383000779e-05, + "loss": 1.2686, + "step": 2810 + }, + { + "epoch": 0.06556134422912652, + "grad_norm": 2.655914783477783, + "learning_rate": 1.4568541072816793e-05, + "loss": 1.5861, + "step": 2811 + }, + { + "epoch": 0.06558466736830444, + "grad_norm": 1.6872950792312622, + "learning_rate": 1.4573723762632808e-05, + "loss": 1.4328, + "step": 2812 + }, + { + "epoch": 0.06560799050748235, + "grad_norm": 2.5100176334381104, + "learning_rate": 1.4578906452448822e-05, + "loss": 1.4183, + "step": 2813 + }, + { + "epoch": 0.06563131364666028, + "grad_norm": 2.044571876525879, + "learning_rate": 1.4584089142264836e-05, + "loss": 1.4343, + "step": 2814 + }, + { + "epoch": 0.06565463678583819, + "grad_norm": 1.8628580570220947, + "learning_rate": 1.458927183208085e-05, + "loss": 1.4403, + "step": 2815 + }, + { + "epoch": 0.06567795992501611, + "grad_norm": 1.4153575897216797, + "learning_rate": 1.4594454521896865e-05, + "loss": 1.3196, + "step": 2816 + }, + { + "epoch": 0.06570128306419402, + "grad_norm": 1.8334929943084717, + "learning_rate": 1.4599637211712881e-05, + "loss": 1.5043, + "step": 2817 + }, + { + "epoch": 0.06572460620337195, + "grad_norm": 1.9358466863632202, + "learning_rate": 1.4604819901528895e-05, + "loss": 1.4786, + "step": 2818 + }, + { + "epoch": 0.06574792934254986, + "grad_norm": 2.730907678604126, + "learning_rate": 1.461000259134491e-05, + "loss": 1.7105, + "step": 2819 + }, + { + "epoch": 0.06577125248172778, + "grad_norm": 1.9616193771362305, + "learning_rate": 1.4615185281160924e-05, + "loss": 1.2128, + "step": 2820 + }, + { + "epoch": 0.0657945756209057, + "grad_norm": 2.0155746936798096, + "learning_rate": 1.4620367970976938e-05, + "loss": 1.5347, + "step": 2821 + }, + { + "epoch": 0.0658178987600836, + "grad_norm": 2.0486910343170166, + "learning_rate": 1.4625550660792953e-05, + "loss": 1.2862, + "step": 2822 + }, + { + "epoch": 0.06584122189926153, + "grad_norm": 1.7724339962005615, + "learning_rate": 1.4630733350608967e-05, + "loss": 1.193, + "step": 2823 + }, + { + "epoch": 0.06586454503843944, + "grad_norm": 1.8260146379470825, + "learning_rate": 1.4635916040424981e-05, + "loss": 1.6844, + "step": 2824 + }, + { + "epoch": 0.06588786817761737, + "grad_norm": 1.7288804054260254, + "learning_rate": 1.4641098730240998e-05, + "loss": 1.3953, + "step": 2825 + }, + { + "epoch": 0.06591119131679528, + "grad_norm": 1.7598590850830078, + "learning_rate": 1.4646281420057012e-05, + "loss": 1.4005, + "step": 2826 + }, + { + "epoch": 0.0659345144559732, + "grad_norm": 1.7817363739013672, + "learning_rate": 1.4651464109873026e-05, + "loss": 1.2379, + "step": 2827 + }, + { + "epoch": 0.06595783759515111, + "grad_norm": 1.7752867937088013, + "learning_rate": 1.465664679968904e-05, + "loss": 1.5554, + "step": 2828 + }, + { + "epoch": 0.06598116073432904, + "grad_norm": 1.7931783199310303, + "learning_rate": 1.4661829489505055e-05, + "loss": 1.5309, + "step": 2829 + }, + { + "epoch": 0.06600448387350695, + "grad_norm": 1.4036725759506226, + "learning_rate": 1.466701217932107e-05, + "loss": 1.2221, + "step": 2830 + }, + { + "epoch": 0.06602780701268487, + "grad_norm": 2.4719862937927246, + "learning_rate": 1.4672194869137084e-05, + "loss": 1.5346, + "step": 2831 + }, + { + "epoch": 0.06605113015186279, + "grad_norm": 1.5505207777023315, + "learning_rate": 1.4677377558953098e-05, + "loss": 1.2664, + "step": 2832 + }, + { + "epoch": 0.06607445329104071, + "grad_norm": 1.8633780479431152, + "learning_rate": 1.4682560248769114e-05, + "loss": 1.4508, + "step": 2833 + }, + { + "epoch": 0.06609777643021862, + "grad_norm": 2.0683753490448, + "learning_rate": 1.4687742938585128e-05, + "loss": 1.5232, + "step": 2834 + }, + { + "epoch": 0.06612109956939655, + "grad_norm": 1.9980714321136475, + "learning_rate": 1.4692925628401143e-05, + "loss": 1.3419, + "step": 2835 + }, + { + "epoch": 0.06614442270857446, + "grad_norm": 1.9079272747039795, + "learning_rate": 1.4698108318217157e-05, + "loss": 1.3417, + "step": 2836 + }, + { + "epoch": 0.06616774584775238, + "grad_norm": 2.0348222255706787, + "learning_rate": 1.4703291008033171e-05, + "loss": 1.1548, + "step": 2837 + }, + { + "epoch": 0.0661910689869303, + "grad_norm": 1.6220494508743286, + "learning_rate": 1.4708473697849186e-05, + "loss": 1.3376, + "step": 2838 + }, + { + "epoch": 0.06621439212610822, + "grad_norm": 2.169706344604492, + "learning_rate": 1.47136563876652e-05, + "loss": 1.4463, + "step": 2839 + }, + { + "epoch": 0.06623771526528613, + "grad_norm": 1.7391022443771362, + "learning_rate": 1.4718839077481213e-05, + "loss": 1.4331, + "step": 2840 + }, + { + "epoch": 0.06626103840446405, + "grad_norm": 1.9789973497390747, + "learning_rate": 1.4724021767297227e-05, + "loss": 1.4105, + "step": 2841 + }, + { + "epoch": 0.06628436154364196, + "grad_norm": 2.1749653816223145, + "learning_rate": 1.4729204457113241e-05, + "loss": 1.7178, + "step": 2842 + }, + { + "epoch": 0.06630768468281989, + "grad_norm": 1.8664934635162354, + "learning_rate": 1.4734387146929256e-05, + "loss": 1.6212, + "step": 2843 + }, + { + "epoch": 0.0663310078219978, + "grad_norm": 1.8881118297576904, + "learning_rate": 1.4739569836745272e-05, + "loss": 1.22, + "step": 2844 + }, + { + "epoch": 0.06635433096117573, + "grad_norm": 1.743923544883728, + "learning_rate": 1.4744752526561286e-05, + "loss": 1.7732, + "step": 2845 + }, + { + "epoch": 0.06637765410035364, + "grad_norm": 1.611082673072815, + "learning_rate": 1.47499352163773e-05, + "loss": 1.6571, + "step": 2846 + }, + { + "epoch": 0.06640097723953156, + "grad_norm": 1.955450177192688, + "learning_rate": 1.4755117906193315e-05, + "loss": 1.4484, + "step": 2847 + }, + { + "epoch": 0.06642430037870947, + "grad_norm": 1.7298091650009155, + "learning_rate": 1.476030059600933e-05, + "loss": 1.561, + "step": 2848 + }, + { + "epoch": 0.0664476235178874, + "grad_norm": 1.5733643770217896, + "learning_rate": 1.4765483285825344e-05, + "loss": 1.3661, + "step": 2849 + }, + { + "epoch": 0.06647094665706531, + "grad_norm": 1.6763901710510254, + "learning_rate": 1.4770665975641358e-05, + "loss": 1.5303, + "step": 2850 + }, + { + "epoch": 0.06649426979624322, + "grad_norm": 1.6987820863723755, + "learning_rate": 1.4775848665457372e-05, + "loss": 1.3741, + "step": 2851 + }, + { + "epoch": 0.06651759293542114, + "grad_norm": 1.3298968076705933, + "learning_rate": 1.4781031355273388e-05, + "loss": 1.129, + "step": 2852 + }, + { + "epoch": 0.06654091607459905, + "grad_norm": 1.903433084487915, + "learning_rate": 1.4786214045089403e-05, + "loss": 1.3274, + "step": 2853 + }, + { + "epoch": 0.06656423921377698, + "grad_norm": 1.4711178541183472, + "learning_rate": 1.4791396734905417e-05, + "loss": 1.3581, + "step": 2854 + }, + { + "epoch": 0.06658756235295489, + "grad_norm": 1.758985996246338, + "learning_rate": 1.4796579424721431e-05, + "loss": 1.7431, + "step": 2855 + }, + { + "epoch": 0.06661088549213282, + "grad_norm": 2.4290409088134766, + "learning_rate": 1.4801762114537446e-05, + "loss": 1.6673, + "step": 2856 + }, + { + "epoch": 0.06663420863131073, + "grad_norm": 2.1742918491363525, + "learning_rate": 1.480694480435346e-05, + "loss": 1.3935, + "step": 2857 + }, + { + "epoch": 0.06665753177048865, + "grad_norm": 2.2304227352142334, + "learning_rate": 1.4812127494169474e-05, + "loss": 0.9519, + "step": 2858 + }, + { + "epoch": 0.06668085490966656, + "grad_norm": 2.019341468811035, + "learning_rate": 1.4817310183985489e-05, + "loss": 1.2995, + "step": 2859 + }, + { + "epoch": 0.06670417804884449, + "grad_norm": 1.9606066942214966, + "learning_rate": 1.4822492873801505e-05, + "loss": 1.3055, + "step": 2860 + }, + { + "epoch": 0.0667275011880224, + "grad_norm": 1.6898326873779297, + "learning_rate": 1.4827675563617519e-05, + "loss": 1.5552, + "step": 2861 + }, + { + "epoch": 0.06675082432720032, + "grad_norm": 1.844104290008545, + "learning_rate": 1.4832858253433533e-05, + "loss": 1.2242, + "step": 2862 + }, + { + "epoch": 0.06677414746637823, + "grad_norm": 1.9563182592391968, + "learning_rate": 1.4838040943249548e-05, + "loss": 0.9554, + "step": 2863 + }, + { + "epoch": 0.06679747060555616, + "grad_norm": 1.9188711643218994, + "learning_rate": 1.4843223633065562e-05, + "loss": 1.4186, + "step": 2864 + }, + { + "epoch": 0.06682079374473407, + "grad_norm": 2.0907742977142334, + "learning_rate": 1.4848406322881576e-05, + "loss": 1.4292, + "step": 2865 + }, + { + "epoch": 0.066844116883912, + "grad_norm": 1.5654215812683105, + "learning_rate": 1.485358901269759e-05, + "loss": 1.3211, + "step": 2866 + }, + { + "epoch": 0.0668674400230899, + "grad_norm": 1.6763267517089844, + "learning_rate": 1.4858771702513605e-05, + "loss": 1.4544, + "step": 2867 + }, + { + "epoch": 0.06689076316226783, + "grad_norm": 1.7230890989303589, + "learning_rate": 1.486395439232962e-05, + "loss": 1.269, + "step": 2868 + }, + { + "epoch": 0.06691408630144574, + "grad_norm": 1.7296384572982788, + "learning_rate": 1.4869137082145636e-05, + "loss": 1.3582, + "step": 2869 + }, + { + "epoch": 0.06693740944062367, + "grad_norm": 2.278798818588257, + "learning_rate": 1.487431977196165e-05, + "loss": 1.7924, + "step": 2870 + }, + { + "epoch": 0.06696073257980158, + "grad_norm": 1.5812768936157227, + "learning_rate": 1.4879502461777664e-05, + "loss": 1.5316, + "step": 2871 + }, + { + "epoch": 0.0669840557189795, + "grad_norm": 1.7185741662979126, + "learning_rate": 1.4884685151593679e-05, + "loss": 1.3088, + "step": 2872 + }, + { + "epoch": 0.06700737885815741, + "grad_norm": 1.9661529064178467, + "learning_rate": 1.4889867841409693e-05, + "loss": 1.898, + "step": 2873 + }, + { + "epoch": 0.06703070199733534, + "grad_norm": 1.6297186613082886, + "learning_rate": 1.4895050531225707e-05, + "loss": 1.3343, + "step": 2874 + }, + { + "epoch": 0.06705402513651325, + "grad_norm": 1.3750114440917969, + "learning_rate": 1.4900233221041722e-05, + "loss": 1.1608, + "step": 2875 + }, + { + "epoch": 0.06707734827569117, + "grad_norm": 1.9823325872421265, + "learning_rate": 1.4905415910857736e-05, + "loss": 1.575, + "step": 2876 + }, + { + "epoch": 0.06710067141486908, + "grad_norm": 2.4376564025878906, + "learning_rate": 1.4910598600673752e-05, + "loss": 1.47, + "step": 2877 + }, + { + "epoch": 0.067123994554047, + "grad_norm": 1.6154961585998535, + "learning_rate": 1.4915781290489766e-05, + "loss": 1.2653, + "step": 2878 + }, + { + "epoch": 0.06714731769322492, + "grad_norm": 1.5177198648452759, + "learning_rate": 1.492096398030578e-05, + "loss": 1.2946, + "step": 2879 + }, + { + "epoch": 0.06717064083240283, + "grad_norm": 1.9357798099517822, + "learning_rate": 1.4926146670121795e-05, + "loss": 1.2401, + "step": 2880 + }, + { + "epoch": 0.06719396397158076, + "grad_norm": 1.568678617477417, + "learning_rate": 1.493132935993781e-05, + "loss": 1.3365, + "step": 2881 + }, + { + "epoch": 0.06721728711075867, + "grad_norm": 1.8375293016433716, + "learning_rate": 1.4936512049753824e-05, + "loss": 1.3164, + "step": 2882 + }, + { + "epoch": 0.06724061024993659, + "grad_norm": 1.971502661705017, + "learning_rate": 1.4941694739569838e-05, + "loss": 1.2666, + "step": 2883 + }, + { + "epoch": 0.0672639333891145, + "grad_norm": 1.5636154413223267, + "learning_rate": 1.4946877429385852e-05, + "loss": 1.3742, + "step": 2884 + }, + { + "epoch": 0.06728725652829243, + "grad_norm": 1.8986397981643677, + "learning_rate": 1.4952060119201869e-05, + "loss": 1.3876, + "step": 2885 + }, + { + "epoch": 0.06731057966747034, + "grad_norm": 2.2729148864746094, + "learning_rate": 1.4957242809017883e-05, + "loss": 1.5104, + "step": 2886 + }, + { + "epoch": 0.06733390280664826, + "grad_norm": 1.8707177639007568, + "learning_rate": 1.4962425498833897e-05, + "loss": 1.5612, + "step": 2887 + }, + { + "epoch": 0.06735722594582617, + "grad_norm": 2.0465052127838135, + "learning_rate": 1.4967608188649912e-05, + "loss": 1.9165, + "step": 2888 + }, + { + "epoch": 0.0673805490850041, + "grad_norm": 2.444533348083496, + "learning_rate": 1.4972790878465926e-05, + "loss": 1.6854, + "step": 2889 + }, + { + "epoch": 0.06740387222418201, + "grad_norm": 2.5257554054260254, + "learning_rate": 1.497797356828194e-05, + "loss": 1.4987, + "step": 2890 + }, + { + "epoch": 0.06742719536335993, + "grad_norm": 2.1791629791259766, + "learning_rate": 1.4983156258097955e-05, + "loss": 1.2276, + "step": 2891 + }, + { + "epoch": 0.06745051850253785, + "grad_norm": 1.8654357194900513, + "learning_rate": 1.4988338947913969e-05, + "loss": 1.7785, + "step": 2892 + }, + { + "epoch": 0.06747384164171577, + "grad_norm": 2.5087015628814697, + "learning_rate": 1.4993521637729985e-05, + "loss": 1.4029, + "step": 2893 + }, + { + "epoch": 0.06749716478089368, + "grad_norm": 1.6082079410552979, + "learning_rate": 1.4998704327546e-05, + "loss": 1.0552, + "step": 2894 + }, + { + "epoch": 0.0675204879200716, + "grad_norm": 1.8922289609909058, + "learning_rate": 1.500388701736201e-05, + "loss": 1.2605, + "step": 2895 + }, + { + "epoch": 0.06754381105924952, + "grad_norm": 1.8346471786499023, + "learning_rate": 1.5009069707178026e-05, + "loss": 1.4355, + "step": 2896 + }, + { + "epoch": 0.06756713419842744, + "grad_norm": 1.8859299421310425, + "learning_rate": 1.501425239699404e-05, + "loss": 1.2775, + "step": 2897 + }, + { + "epoch": 0.06759045733760535, + "grad_norm": 1.3990947008132935, + "learning_rate": 1.5019435086810055e-05, + "loss": 0.9683, + "step": 2898 + }, + { + "epoch": 0.06761378047678328, + "grad_norm": 1.876051664352417, + "learning_rate": 1.502461777662607e-05, + "loss": 1.7135, + "step": 2899 + }, + { + "epoch": 0.06763710361596119, + "grad_norm": 1.5841714143753052, + "learning_rate": 1.5029800466442084e-05, + "loss": 1.5269, + "step": 2900 + }, + { + "epoch": 0.06766042675513911, + "grad_norm": 1.8639863729476929, + "learning_rate": 1.5034983156258098e-05, + "loss": 1.4331, + "step": 2901 + }, + { + "epoch": 0.06768374989431702, + "grad_norm": 1.6989810466766357, + "learning_rate": 1.5040165846074112e-05, + "loss": 1.6676, + "step": 2902 + }, + { + "epoch": 0.06770707303349495, + "grad_norm": 1.7870492935180664, + "learning_rate": 1.5045348535890127e-05, + "loss": 1.5192, + "step": 2903 + }, + { + "epoch": 0.06773039617267286, + "grad_norm": 2.058709144592285, + "learning_rate": 1.5050531225706143e-05, + "loss": 1.69, + "step": 2904 + }, + { + "epoch": 0.06775371931185079, + "grad_norm": 1.7233374118804932, + "learning_rate": 1.5055713915522157e-05, + "loss": 1.4893, + "step": 2905 + }, + { + "epoch": 0.0677770424510287, + "grad_norm": 1.888060212135315, + "learning_rate": 1.5060896605338171e-05, + "loss": 1.7769, + "step": 2906 + }, + { + "epoch": 0.06780036559020661, + "grad_norm": 2.2901411056518555, + "learning_rate": 1.5066079295154186e-05, + "loss": 1.6078, + "step": 2907 + }, + { + "epoch": 0.06782368872938453, + "grad_norm": 1.6352299451828003, + "learning_rate": 1.50712619849702e-05, + "loss": 1.7949, + "step": 2908 + }, + { + "epoch": 0.06784701186856244, + "grad_norm": 1.6780446767807007, + "learning_rate": 1.5076444674786215e-05, + "loss": 1.1521, + "step": 2909 + }, + { + "epoch": 0.06787033500774037, + "grad_norm": 1.8701889514923096, + "learning_rate": 1.5081627364602229e-05, + "loss": 1.4567, + "step": 2910 + }, + { + "epoch": 0.06789365814691828, + "grad_norm": 1.741023302078247, + "learning_rate": 1.5086810054418243e-05, + "loss": 1.1188, + "step": 2911 + }, + { + "epoch": 0.0679169812860962, + "grad_norm": 1.6472866535186768, + "learning_rate": 1.5091992744234258e-05, + "loss": 1.6849, + "step": 2912 + }, + { + "epoch": 0.06794030442527411, + "grad_norm": 1.4536561965942383, + "learning_rate": 1.5097175434050274e-05, + "loss": 1.4544, + "step": 2913 + }, + { + "epoch": 0.06796362756445204, + "grad_norm": 2.170592784881592, + "learning_rate": 1.5102358123866288e-05, + "loss": 1.5684, + "step": 2914 + }, + { + "epoch": 0.06798695070362995, + "grad_norm": 2.3381991386413574, + "learning_rate": 1.5107540813682302e-05, + "loss": 1.6957, + "step": 2915 + }, + { + "epoch": 0.06801027384280788, + "grad_norm": 1.36526620388031, + "learning_rate": 1.5112723503498317e-05, + "loss": 1.1307, + "step": 2916 + }, + { + "epoch": 0.06803359698198579, + "grad_norm": 2.0184521675109863, + "learning_rate": 1.5117906193314331e-05, + "loss": 1.5328, + "step": 2917 + }, + { + "epoch": 0.06805692012116371, + "grad_norm": 1.6322370767593384, + "learning_rate": 1.5123088883130345e-05, + "loss": 1.3401, + "step": 2918 + }, + { + "epoch": 0.06808024326034162, + "grad_norm": 2.306300401687622, + "learning_rate": 1.512827157294636e-05, + "loss": 1.6381, + "step": 2919 + }, + { + "epoch": 0.06810356639951955, + "grad_norm": 1.7915669679641724, + "learning_rate": 1.5133454262762374e-05, + "loss": 1.4884, + "step": 2920 + }, + { + "epoch": 0.06812688953869746, + "grad_norm": 1.6436505317687988, + "learning_rate": 1.513863695257839e-05, + "loss": 1.0716, + "step": 2921 + }, + { + "epoch": 0.06815021267787538, + "grad_norm": 2.4779422283172607, + "learning_rate": 1.5143819642394404e-05, + "loss": 1.5326, + "step": 2922 + }, + { + "epoch": 0.0681735358170533, + "grad_norm": 1.9814671277999878, + "learning_rate": 1.5149002332210419e-05, + "loss": 1.5269, + "step": 2923 + }, + { + "epoch": 0.06819685895623122, + "grad_norm": 1.9176750183105469, + "learning_rate": 1.5154185022026433e-05, + "loss": 1.4689, + "step": 2924 + }, + { + "epoch": 0.06822018209540913, + "grad_norm": 1.3773646354675293, + "learning_rate": 1.5159367711842447e-05, + "loss": 1.2972, + "step": 2925 + }, + { + "epoch": 0.06824350523458705, + "grad_norm": 2.0976767539978027, + "learning_rate": 1.5164550401658462e-05, + "loss": 1.3239, + "step": 2926 + }, + { + "epoch": 0.06826682837376497, + "grad_norm": 1.9581456184387207, + "learning_rate": 1.5169733091474476e-05, + "loss": 1.9213, + "step": 2927 + }, + { + "epoch": 0.06829015151294289, + "grad_norm": 1.74188232421875, + "learning_rate": 1.517491578129049e-05, + "loss": 1.5739, + "step": 2928 + }, + { + "epoch": 0.0683134746521208, + "grad_norm": 1.7404353618621826, + "learning_rate": 1.5180098471106507e-05, + "loss": 1.6213, + "step": 2929 + }, + { + "epoch": 0.06833679779129873, + "grad_norm": 1.6050968170166016, + "learning_rate": 1.5185281160922521e-05, + "loss": 1.6145, + "step": 2930 + }, + { + "epoch": 0.06836012093047664, + "grad_norm": 2.024853229522705, + "learning_rate": 1.5190463850738535e-05, + "loss": 1.344, + "step": 2931 + }, + { + "epoch": 0.06838344406965456, + "grad_norm": 1.593658685684204, + "learning_rate": 1.519564654055455e-05, + "loss": 1.2873, + "step": 2932 + }, + { + "epoch": 0.06840676720883247, + "grad_norm": 2.121755838394165, + "learning_rate": 1.5200829230370564e-05, + "loss": 1.6169, + "step": 2933 + }, + { + "epoch": 0.0684300903480104, + "grad_norm": 1.858398199081421, + "learning_rate": 1.5206011920186578e-05, + "loss": 1.5265, + "step": 2934 + }, + { + "epoch": 0.06845341348718831, + "grad_norm": 1.6854084730148315, + "learning_rate": 1.5211194610002593e-05, + "loss": 1.3703, + "step": 2935 + }, + { + "epoch": 0.06847673662636622, + "grad_norm": 2.447187900543213, + "learning_rate": 1.5216377299818607e-05, + "loss": 1.5762, + "step": 2936 + }, + { + "epoch": 0.06850005976554414, + "grad_norm": 1.7991342544555664, + "learning_rate": 1.5221559989634623e-05, + "loss": 1.4965, + "step": 2937 + }, + { + "epoch": 0.06852338290472205, + "grad_norm": 1.6656452417373657, + "learning_rate": 1.5226742679450637e-05, + "loss": 1.5852, + "step": 2938 + }, + { + "epoch": 0.06854670604389998, + "grad_norm": 2.030794143676758, + "learning_rate": 1.5231925369266652e-05, + "loss": 1.479, + "step": 2939 + }, + { + "epoch": 0.06857002918307789, + "grad_norm": 1.8245702981948853, + "learning_rate": 1.5237108059082666e-05, + "loss": 1.6709, + "step": 2940 + }, + { + "epoch": 0.06859335232225582, + "grad_norm": 1.472434163093567, + "learning_rate": 1.524229074889868e-05, + "loss": 1.3113, + "step": 2941 + }, + { + "epoch": 0.06861667546143373, + "grad_norm": 2.1557259559631348, + "learning_rate": 1.5247473438714695e-05, + "loss": 1.8385, + "step": 2942 + }, + { + "epoch": 0.06863999860061165, + "grad_norm": 1.5355175733566284, + "learning_rate": 1.5252656128530709e-05, + "loss": 1.2249, + "step": 2943 + }, + { + "epoch": 0.06866332173978956, + "grad_norm": 1.8443330526351929, + "learning_rate": 1.5257838818346723e-05, + "loss": 1.4584, + "step": 2944 + }, + { + "epoch": 0.06868664487896749, + "grad_norm": 1.898336410522461, + "learning_rate": 1.526302150816274e-05, + "loss": 1.4696, + "step": 2945 + }, + { + "epoch": 0.0687099680181454, + "grad_norm": 1.7229390144348145, + "learning_rate": 1.5268204197978754e-05, + "loss": 1.4675, + "step": 2946 + }, + { + "epoch": 0.06873329115732332, + "grad_norm": 1.6735163927078247, + "learning_rate": 1.5273386887794768e-05, + "loss": 1.8393, + "step": 2947 + }, + { + "epoch": 0.06875661429650123, + "grad_norm": 1.5430946350097656, + "learning_rate": 1.5278569577610783e-05, + "loss": 1.252, + "step": 2948 + }, + { + "epoch": 0.06877993743567916, + "grad_norm": 2.2710747718811035, + "learning_rate": 1.5283752267426797e-05, + "loss": 1.2602, + "step": 2949 + }, + { + "epoch": 0.06880326057485707, + "grad_norm": 2.163055658340454, + "learning_rate": 1.5288934957242808e-05, + "loss": 1.5387, + "step": 2950 + }, + { + "epoch": 0.068826583714035, + "grad_norm": 1.905474066734314, + "learning_rate": 1.5294117647058822e-05, + "loss": 1.4113, + "step": 2951 + }, + { + "epoch": 0.0688499068532129, + "grad_norm": 1.6778980493545532, + "learning_rate": 1.529930033687484e-05, + "loss": 1.4245, + "step": 2952 + }, + { + "epoch": 0.06887322999239083, + "grad_norm": 1.684782862663269, + "learning_rate": 1.5304483026690854e-05, + "loss": 1.6875, + "step": 2953 + }, + { + "epoch": 0.06889655313156874, + "grad_norm": 1.8047994375228882, + "learning_rate": 1.530966571650687e-05, + "loss": 1.6221, + "step": 2954 + }, + { + "epoch": 0.06891987627074667, + "grad_norm": 1.7538729906082153, + "learning_rate": 1.5314848406322883e-05, + "loss": 1.5855, + "step": 2955 + }, + { + "epoch": 0.06894319940992458, + "grad_norm": 2.5780930519104004, + "learning_rate": 1.5320031096138897e-05, + "loss": 1.4451, + "step": 2956 + }, + { + "epoch": 0.0689665225491025, + "grad_norm": 2.0026376247406006, + "learning_rate": 1.532521378595491e-05, + "loss": 1.372, + "step": 2957 + }, + { + "epoch": 0.06898984568828041, + "grad_norm": 2.1854658126831055, + "learning_rate": 1.5330396475770926e-05, + "loss": 1.3999, + "step": 2958 + }, + { + "epoch": 0.06901316882745834, + "grad_norm": 1.9142454862594604, + "learning_rate": 1.533557916558694e-05, + "loss": 1.3172, + "step": 2959 + }, + { + "epoch": 0.06903649196663625, + "grad_norm": 1.423509120941162, + "learning_rate": 1.5340761855402955e-05, + "loss": 1.1681, + "step": 2960 + }, + { + "epoch": 0.06905981510581417, + "grad_norm": 1.7319942712783813, + "learning_rate": 1.534594454521897e-05, + "loss": 1.2578, + "step": 2961 + }, + { + "epoch": 0.06908313824499208, + "grad_norm": 1.640005111694336, + "learning_rate": 1.5351127235034983e-05, + "loss": 1.1881, + "step": 2962 + }, + { + "epoch": 0.06910646138417001, + "grad_norm": 1.9526257514953613, + "learning_rate": 1.5356309924850998e-05, + "loss": 1.7211, + "step": 2963 + }, + { + "epoch": 0.06912978452334792, + "grad_norm": 1.7995927333831787, + "learning_rate": 1.5361492614667012e-05, + "loss": 1.5302, + "step": 2964 + }, + { + "epoch": 0.06915310766252583, + "grad_norm": 1.7013497352600098, + "learning_rate": 1.5366675304483026e-05, + "loss": 1.4952, + "step": 2965 + }, + { + "epoch": 0.06917643080170376, + "grad_norm": 1.8493554592132568, + "learning_rate": 1.537185799429904e-05, + "loss": 1.5287, + "step": 2966 + }, + { + "epoch": 0.06919975394088167, + "grad_norm": 2.2868854999542236, + "learning_rate": 1.5377040684115055e-05, + "loss": 1.1454, + "step": 2967 + }, + { + "epoch": 0.06922307708005959, + "grad_norm": 2.045095443725586, + "learning_rate": 1.538222337393107e-05, + "loss": 1.3912, + "step": 2968 + }, + { + "epoch": 0.0692464002192375, + "grad_norm": 1.8001420497894287, + "learning_rate": 1.5387406063747087e-05, + "loss": 1.4114, + "step": 2969 + }, + { + "epoch": 0.06926972335841543, + "grad_norm": 2.4776105880737305, + "learning_rate": 1.53925887535631e-05, + "loss": 1.4288, + "step": 2970 + }, + { + "epoch": 0.06929304649759334, + "grad_norm": 2.2437918186187744, + "learning_rate": 1.5397771443379116e-05, + "loss": 1.2949, + "step": 2971 + }, + { + "epoch": 0.06931636963677126, + "grad_norm": 1.7763323783874512, + "learning_rate": 1.540295413319513e-05, + "loss": 1.0882, + "step": 2972 + }, + { + "epoch": 0.06933969277594917, + "grad_norm": 2.110168933868408, + "learning_rate": 1.5408136823011145e-05, + "loss": 1.1464, + "step": 2973 + }, + { + "epoch": 0.0693630159151271, + "grad_norm": 1.5015451908111572, + "learning_rate": 1.541331951282716e-05, + "loss": 1.4931, + "step": 2974 + }, + { + "epoch": 0.06938633905430501, + "grad_norm": 2.220963716506958, + "learning_rate": 1.5418502202643173e-05, + "loss": 1.3172, + "step": 2975 + }, + { + "epoch": 0.06940966219348293, + "grad_norm": 1.5186398029327393, + "learning_rate": 1.5423684892459188e-05, + "loss": 1.2238, + "step": 2976 + }, + { + "epoch": 0.06943298533266085, + "grad_norm": 1.7325942516326904, + "learning_rate": 1.5428867582275202e-05, + "loss": 1.7801, + "step": 2977 + }, + { + "epoch": 0.06945630847183877, + "grad_norm": 2.1025593280792236, + "learning_rate": 1.5434050272091216e-05, + "loss": 1.5691, + "step": 2978 + }, + { + "epoch": 0.06947963161101668, + "grad_norm": 1.9171198606491089, + "learning_rate": 1.543923296190723e-05, + "loss": 1.3018, + "step": 2979 + }, + { + "epoch": 0.0695029547501946, + "grad_norm": 1.9191886186599731, + "learning_rate": 1.5444415651723245e-05, + "loss": 1.1185, + "step": 2980 + }, + { + "epoch": 0.06952627788937252, + "grad_norm": 1.8381496667861938, + "learning_rate": 1.544959834153926e-05, + "loss": 1.6724, + "step": 2981 + }, + { + "epoch": 0.06954960102855044, + "grad_norm": 2.372688055038452, + "learning_rate": 1.5454781031355274e-05, + "loss": 1.4596, + "step": 2982 + }, + { + "epoch": 0.06957292416772835, + "grad_norm": 2.1257786750793457, + "learning_rate": 1.5459963721171288e-05, + "loss": 1.7045, + "step": 2983 + }, + { + "epoch": 0.06959624730690628, + "grad_norm": 1.7516330480575562, + "learning_rate": 1.5465146410987302e-05, + "loss": 1.5536, + "step": 2984 + }, + { + "epoch": 0.06961957044608419, + "grad_norm": 2.6538538932800293, + "learning_rate": 1.547032910080332e-05, + "loss": 1.3659, + "step": 2985 + }, + { + "epoch": 0.06964289358526211, + "grad_norm": 1.7529492378234863, + "learning_rate": 1.5475511790619334e-05, + "loss": 1.6208, + "step": 2986 + }, + { + "epoch": 0.06966621672444002, + "grad_norm": 2.090695858001709, + "learning_rate": 1.548069448043535e-05, + "loss": 1.348, + "step": 2987 + }, + { + "epoch": 0.06968953986361795, + "grad_norm": 2.129758834838867, + "learning_rate": 1.5485877170251363e-05, + "loss": 1.5219, + "step": 2988 + }, + { + "epoch": 0.06971286300279586, + "grad_norm": 1.8315856456756592, + "learning_rate": 1.5491059860067378e-05, + "loss": 1.2473, + "step": 2989 + }, + { + "epoch": 0.06973618614197379, + "grad_norm": 1.7378078699111938, + "learning_rate": 1.5496242549883392e-05, + "loss": 1.6124, + "step": 2990 + }, + { + "epoch": 0.0697595092811517, + "grad_norm": 2.115633487701416, + "learning_rate": 1.5501425239699406e-05, + "loss": 1.2983, + "step": 2991 + }, + { + "epoch": 0.06978283242032961, + "grad_norm": 1.7724013328552246, + "learning_rate": 1.550660792951542e-05, + "loss": 1.4654, + "step": 2992 + }, + { + "epoch": 0.06980615555950753, + "grad_norm": 2.1570277214050293, + "learning_rate": 1.5511790619331435e-05, + "loss": 1.5257, + "step": 2993 + }, + { + "epoch": 0.06982947869868544, + "grad_norm": 1.644012689590454, + "learning_rate": 1.551697330914745e-05, + "loss": 1.277, + "step": 2994 + }, + { + "epoch": 0.06985280183786337, + "grad_norm": 1.7980034351348877, + "learning_rate": 1.5522155998963464e-05, + "loss": 1.5572, + "step": 2995 + }, + { + "epoch": 0.06987612497704128, + "grad_norm": 1.6826823949813843, + "learning_rate": 1.5527338688779478e-05, + "loss": 1.246, + "step": 2996 + }, + { + "epoch": 0.0698994481162192, + "grad_norm": 1.8310699462890625, + "learning_rate": 1.5532521378595492e-05, + "loss": 1.3459, + "step": 2997 + }, + { + "epoch": 0.06992277125539711, + "grad_norm": 1.914998173713684, + "learning_rate": 1.5537704068411507e-05, + "loss": 1.4237, + "step": 2998 + }, + { + "epoch": 0.06994609439457504, + "grad_norm": 1.4848195314407349, + "learning_rate": 1.554288675822752e-05, + "loss": 1.2903, + "step": 2999 + }, + { + "epoch": 0.06996941753375295, + "grad_norm": 2.3208260536193848, + "learning_rate": 1.5548069448043535e-05, + "loss": 1.972, + "step": 3000 + }, + { + "epoch": 0.06999274067293088, + "grad_norm": 2.3595495223999023, + "learning_rate": 1.555325213785955e-05, + "loss": 1.4813, + "step": 3001 + }, + { + "epoch": 0.07001606381210879, + "grad_norm": 2.143214464187622, + "learning_rate": 1.5558434827675567e-05, + "loss": 1.2124, + "step": 3002 + }, + { + "epoch": 0.07003938695128671, + "grad_norm": 1.6811202764511108, + "learning_rate": 1.5563617517491582e-05, + "loss": 1.3735, + "step": 3003 + }, + { + "epoch": 0.07006271009046462, + "grad_norm": 1.5104790925979614, + "learning_rate": 1.5568800207307596e-05, + "loss": 1.5616, + "step": 3004 + }, + { + "epoch": 0.07008603322964255, + "grad_norm": 2.121225357055664, + "learning_rate": 1.5573982897123607e-05, + "loss": 1.857, + "step": 3005 + }, + { + "epoch": 0.07010935636882046, + "grad_norm": 2.3359665870666504, + "learning_rate": 1.557916558693962e-05, + "loss": 1.426, + "step": 3006 + }, + { + "epoch": 0.07013267950799838, + "grad_norm": 1.9051471948623657, + "learning_rate": 1.5584348276755636e-05, + "loss": 1.5802, + "step": 3007 + }, + { + "epoch": 0.0701560026471763, + "grad_norm": 1.8195501565933228, + "learning_rate": 1.558953096657165e-05, + "loss": 1.5288, + "step": 3008 + }, + { + "epoch": 0.07017932578635422, + "grad_norm": 1.8926808834075928, + "learning_rate": 1.5594713656387664e-05, + "loss": 1.415, + "step": 3009 + }, + { + "epoch": 0.07020264892553213, + "grad_norm": 1.7610915899276733, + "learning_rate": 1.559989634620368e-05, + "loss": 1.3034, + "step": 3010 + }, + { + "epoch": 0.07022597206471005, + "grad_norm": 1.8621258735656738, + "learning_rate": 1.5605079036019693e-05, + "loss": 1.3214, + "step": 3011 + }, + { + "epoch": 0.07024929520388797, + "grad_norm": 2.0164079666137695, + "learning_rate": 1.5610261725835708e-05, + "loss": 1.3016, + "step": 3012 + }, + { + "epoch": 0.07027261834306589, + "grad_norm": 1.7668334245681763, + "learning_rate": 1.5615444415651725e-05, + "loss": 1.3767, + "step": 3013 + }, + { + "epoch": 0.0702959414822438, + "grad_norm": 2.2490322589874268, + "learning_rate": 1.562062710546774e-05, + "loss": 1.6974, + "step": 3014 + }, + { + "epoch": 0.07031926462142173, + "grad_norm": 2.4273457527160645, + "learning_rate": 1.5625809795283754e-05, + "loss": 1.5084, + "step": 3015 + }, + { + "epoch": 0.07034258776059964, + "grad_norm": 1.5331447124481201, + "learning_rate": 1.5630992485099768e-05, + "loss": 1.3963, + "step": 3016 + }, + { + "epoch": 0.07036591089977756, + "grad_norm": 1.5221631526947021, + "learning_rate": 1.5636175174915783e-05, + "loss": 1.3032, + "step": 3017 + }, + { + "epoch": 0.07038923403895547, + "grad_norm": 1.9873692989349365, + "learning_rate": 1.5641357864731797e-05, + "loss": 1.8065, + "step": 3018 + }, + { + "epoch": 0.0704125571781334, + "grad_norm": 1.7702462673187256, + "learning_rate": 1.564654055454781e-05, + "loss": 1.2632, + "step": 3019 + }, + { + "epoch": 0.07043588031731131, + "grad_norm": 1.7505241632461548, + "learning_rate": 1.5651723244363826e-05, + "loss": 1.2402, + "step": 3020 + }, + { + "epoch": 0.07045920345648922, + "grad_norm": 1.8434094190597534, + "learning_rate": 1.565690593417984e-05, + "loss": 1.4673, + "step": 3021 + }, + { + "epoch": 0.07048252659566714, + "grad_norm": 1.4408884048461914, + "learning_rate": 1.5662088623995854e-05, + "loss": 1.182, + "step": 3022 + }, + { + "epoch": 0.07050584973484506, + "grad_norm": 1.3842504024505615, + "learning_rate": 1.566727131381187e-05, + "loss": 1.4564, + "step": 3023 + }, + { + "epoch": 0.07052917287402298, + "grad_norm": 1.9442074298858643, + "learning_rate": 1.5672454003627883e-05, + "loss": 1.5722, + "step": 3024 + }, + { + "epoch": 0.07055249601320089, + "grad_norm": 1.6745545864105225, + "learning_rate": 1.5677636693443897e-05, + "loss": 1.3703, + "step": 3025 + }, + { + "epoch": 0.07057581915237882, + "grad_norm": 1.7542271614074707, + "learning_rate": 1.5682819383259912e-05, + "loss": 1.1784, + "step": 3026 + }, + { + "epoch": 0.07059914229155673, + "grad_norm": 1.6971882581710815, + "learning_rate": 1.5688002073075926e-05, + "loss": 1.1932, + "step": 3027 + }, + { + "epoch": 0.07062246543073465, + "grad_norm": 1.9626144170761108, + "learning_rate": 1.569318476289194e-05, + "loss": 1.7452, + "step": 3028 + }, + { + "epoch": 0.07064578856991256, + "grad_norm": 1.5161895751953125, + "learning_rate": 1.5698367452707958e-05, + "loss": 1.4136, + "step": 3029 + }, + { + "epoch": 0.07066911170909049, + "grad_norm": 2.5730278491973877, + "learning_rate": 1.5703550142523973e-05, + "loss": 1.5446, + "step": 3030 + }, + { + "epoch": 0.0706924348482684, + "grad_norm": 1.8499232530593872, + "learning_rate": 1.5708732832339987e-05, + "loss": 1.5568, + "step": 3031 + }, + { + "epoch": 0.07071575798744632, + "grad_norm": 1.6006439924240112, + "learning_rate": 1.5713915522156e-05, + "loss": 1.596, + "step": 3032 + }, + { + "epoch": 0.07073908112662423, + "grad_norm": 2.0837395191192627, + "learning_rate": 1.5719098211972016e-05, + "loss": 1.443, + "step": 3033 + }, + { + "epoch": 0.07076240426580216, + "grad_norm": 1.77737557888031, + "learning_rate": 1.572428090178803e-05, + "loss": 1.7192, + "step": 3034 + }, + { + "epoch": 0.07078572740498007, + "grad_norm": 1.7115414142608643, + "learning_rate": 1.5729463591604044e-05, + "loss": 1.2101, + "step": 3035 + }, + { + "epoch": 0.070809050544158, + "grad_norm": 1.6676453351974487, + "learning_rate": 1.573464628142006e-05, + "loss": 1.4841, + "step": 3036 + }, + { + "epoch": 0.0708323736833359, + "grad_norm": 2.0481326580047607, + "learning_rate": 1.5739828971236073e-05, + "loss": 1.6479, + "step": 3037 + }, + { + "epoch": 0.07085569682251383, + "grad_norm": 1.7933276891708374, + "learning_rate": 1.5745011661052087e-05, + "loss": 1.0619, + "step": 3038 + }, + { + "epoch": 0.07087901996169174, + "grad_norm": 2.1594574451446533, + "learning_rate": 1.57501943508681e-05, + "loss": 1.5329, + "step": 3039 + }, + { + "epoch": 0.07090234310086967, + "grad_norm": 1.6190122365951538, + "learning_rate": 1.5755377040684116e-05, + "loss": 1.1178, + "step": 3040 + }, + { + "epoch": 0.07092566624004758, + "grad_norm": 1.773872971534729, + "learning_rate": 1.576055973050013e-05, + "loss": 1.6123, + "step": 3041 + }, + { + "epoch": 0.0709489893792255, + "grad_norm": 1.6447725296020508, + "learning_rate": 1.5765742420316145e-05, + "loss": 1.2291, + "step": 3042 + }, + { + "epoch": 0.07097231251840341, + "grad_norm": 2.039640426635742, + "learning_rate": 1.577092511013216e-05, + "loss": 1.7433, + "step": 3043 + }, + { + "epoch": 0.07099563565758134, + "grad_norm": 1.6734812259674072, + "learning_rate": 1.5776107799948173e-05, + "loss": 1.6518, + "step": 3044 + }, + { + "epoch": 0.07101895879675925, + "grad_norm": 1.7559072971343994, + "learning_rate": 1.5781290489764188e-05, + "loss": 1.142, + "step": 3045 + }, + { + "epoch": 0.07104228193593717, + "grad_norm": 1.6769473552703857, + "learning_rate": 1.5786473179580205e-05, + "loss": 1.5003, + "step": 3046 + }, + { + "epoch": 0.07106560507511508, + "grad_norm": 1.8988209962844849, + "learning_rate": 1.579165586939622e-05, + "loss": 1.6845, + "step": 3047 + }, + { + "epoch": 0.07108892821429301, + "grad_norm": 1.9083596467971802, + "learning_rate": 1.5796838559212234e-05, + "loss": 1.144, + "step": 3048 + }, + { + "epoch": 0.07111225135347092, + "grad_norm": 2.1103906631469727, + "learning_rate": 1.580202124902825e-05, + "loss": 1.1897, + "step": 3049 + }, + { + "epoch": 0.07113557449264883, + "grad_norm": 1.7527072429656982, + "learning_rate": 1.5807203938844263e-05, + "loss": 1.4615, + "step": 3050 + }, + { + "epoch": 0.07115889763182676, + "grad_norm": 1.745250940322876, + "learning_rate": 1.5812386628660277e-05, + "loss": 1.6192, + "step": 3051 + }, + { + "epoch": 0.07118222077100467, + "grad_norm": 2.378971815109253, + "learning_rate": 1.581756931847629e-05, + "loss": 1.5468, + "step": 3052 + }, + { + "epoch": 0.07120554391018259, + "grad_norm": 1.9410420656204224, + "learning_rate": 1.5822752008292306e-05, + "loss": 1.1459, + "step": 3053 + }, + { + "epoch": 0.0712288670493605, + "grad_norm": 1.7086267471313477, + "learning_rate": 1.582793469810832e-05, + "loss": 1.2372, + "step": 3054 + }, + { + "epoch": 0.07125219018853843, + "grad_norm": 1.9025890827178955, + "learning_rate": 1.5833117387924335e-05, + "loss": 1.561, + "step": 3055 + }, + { + "epoch": 0.07127551332771634, + "grad_norm": 1.5841965675354004, + "learning_rate": 1.583830007774035e-05, + "loss": 1.4006, + "step": 3056 + }, + { + "epoch": 0.07129883646689426, + "grad_norm": 2.0455946922302246, + "learning_rate": 1.5843482767556363e-05, + "loss": 1.2966, + "step": 3057 + }, + { + "epoch": 0.07132215960607217, + "grad_norm": 1.6825358867645264, + "learning_rate": 1.5848665457372378e-05, + "loss": 1.4364, + "step": 3058 + }, + { + "epoch": 0.0713454827452501, + "grad_norm": 1.7199450731277466, + "learning_rate": 1.5853848147188392e-05, + "loss": 1.1987, + "step": 3059 + }, + { + "epoch": 0.07136880588442801, + "grad_norm": 1.9703516960144043, + "learning_rate": 1.5859030837004406e-05, + "loss": 1.609, + "step": 3060 + }, + { + "epoch": 0.07139212902360594, + "grad_norm": 1.8907127380371094, + "learning_rate": 1.586421352682042e-05, + "loss": 1.5658, + "step": 3061 + }, + { + "epoch": 0.07141545216278385, + "grad_norm": 1.7374207973480225, + "learning_rate": 1.5869396216636435e-05, + "loss": 1.4888, + "step": 3062 + }, + { + "epoch": 0.07143877530196177, + "grad_norm": 1.549346685409546, + "learning_rate": 1.587457890645245e-05, + "loss": 1.2308, + "step": 3063 + }, + { + "epoch": 0.07146209844113968, + "grad_norm": 1.2408068180084229, + "learning_rate": 1.5879761596268464e-05, + "loss": 1.2161, + "step": 3064 + }, + { + "epoch": 0.0714854215803176, + "grad_norm": 1.6506532430648804, + "learning_rate": 1.5884944286084478e-05, + "loss": 1.2904, + "step": 3065 + }, + { + "epoch": 0.07150874471949552, + "grad_norm": 2.1740195751190186, + "learning_rate": 1.5890126975900492e-05, + "loss": 1.5094, + "step": 3066 + }, + { + "epoch": 0.07153206785867344, + "grad_norm": 1.4976023435592651, + "learning_rate": 1.5895309665716507e-05, + "loss": 1.2849, + "step": 3067 + }, + { + "epoch": 0.07155539099785135, + "grad_norm": 1.8397139310836792, + "learning_rate": 1.590049235553252e-05, + "loss": 1.6023, + "step": 3068 + }, + { + "epoch": 0.07157871413702928, + "grad_norm": 1.8328680992126465, + "learning_rate": 1.5905675045348535e-05, + "loss": 1.3928, + "step": 3069 + }, + { + "epoch": 0.07160203727620719, + "grad_norm": 1.487030029296875, + "learning_rate": 1.591085773516455e-05, + "loss": 1.2776, + "step": 3070 + }, + { + "epoch": 0.07162536041538511, + "grad_norm": 2.1938352584838867, + "learning_rate": 1.5916040424980564e-05, + "loss": 1.6936, + "step": 3071 + }, + { + "epoch": 0.07164868355456303, + "grad_norm": 1.7803940773010254, + "learning_rate": 1.592122311479658e-05, + "loss": 1.4459, + "step": 3072 + }, + { + "epoch": 0.07167200669374095, + "grad_norm": 3.9215104579925537, + "learning_rate": 1.5926405804612596e-05, + "loss": 1.3872, + "step": 3073 + }, + { + "epoch": 0.07169532983291886, + "grad_norm": 2.229813575744629, + "learning_rate": 1.593158849442861e-05, + "loss": 1.5246, + "step": 3074 + }, + { + "epoch": 0.07171865297209679, + "grad_norm": 1.808138132095337, + "learning_rate": 1.5936771184244625e-05, + "loss": 1.313, + "step": 3075 + }, + { + "epoch": 0.0717419761112747, + "grad_norm": 1.741746187210083, + "learning_rate": 1.594195387406064e-05, + "loss": 1.7373, + "step": 3076 + }, + { + "epoch": 0.07176529925045262, + "grad_norm": 1.9369068145751953, + "learning_rate": 1.5947136563876654e-05, + "loss": 1.4971, + "step": 3077 + }, + { + "epoch": 0.07178862238963053, + "grad_norm": 1.7145378589630127, + "learning_rate": 1.5952319253692668e-05, + "loss": 1.2739, + "step": 3078 + }, + { + "epoch": 0.07181194552880844, + "grad_norm": 1.8517347574234009, + "learning_rate": 1.5957501943508682e-05, + "loss": 1.5945, + "step": 3079 + }, + { + "epoch": 0.07183526866798637, + "grad_norm": 2.492448091506958, + "learning_rate": 1.5962684633324697e-05, + "loss": 1.4941, + "step": 3080 + }, + { + "epoch": 0.07185859180716428, + "grad_norm": 2.4276201725006104, + "learning_rate": 1.596786732314071e-05, + "loss": 1.3294, + "step": 3081 + }, + { + "epoch": 0.0718819149463422, + "grad_norm": 1.579856276512146, + "learning_rate": 1.5973050012956725e-05, + "loss": 1.317, + "step": 3082 + }, + { + "epoch": 0.07190523808552012, + "grad_norm": 2.4683403968811035, + "learning_rate": 1.597823270277274e-05, + "loss": 1.6417, + "step": 3083 + }, + { + "epoch": 0.07192856122469804, + "grad_norm": 1.8689765930175781, + "learning_rate": 1.5983415392588754e-05, + "loss": 1.6266, + "step": 3084 + }, + { + "epoch": 0.07195188436387595, + "grad_norm": 1.729116678237915, + "learning_rate": 1.598859808240477e-05, + "loss": 1.2618, + "step": 3085 + }, + { + "epoch": 0.07197520750305388, + "grad_norm": 1.9072259664535522, + "learning_rate": 1.5993780772220783e-05, + "loss": 1.1818, + "step": 3086 + }, + { + "epoch": 0.07199853064223179, + "grad_norm": 2.139037847518921, + "learning_rate": 1.5998963462036797e-05, + "loss": 1.2447, + "step": 3087 + }, + { + "epoch": 0.07202185378140971, + "grad_norm": 1.903786540031433, + "learning_rate": 1.600414615185281e-05, + "loss": 1.2269, + "step": 3088 + }, + { + "epoch": 0.07204517692058762, + "grad_norm": 1.8438259363174438, + "learning_rate": 1.600932884166883e-05, + "loss": 1.3741, + "step": 3089 + }, + { + "epoch": 0.07206850005976555, + "grad_norm": 1.5920443534851074, + "learning_rate": 1.6014511531484844e-05, + "loss": 1.5759, + "step": 3090 + }, + { + "epoch": 0.07209182319894346, + "grad_norm": 1.9949947595596313, + "learning_rate": 1.6019694221300858e-05, + "loss": 1.5934, + "step": 3091 + }, + { + "epoch": 0.07211514633812138, + "grad_norm": 1.9100804328918457, + "learning_rate": 1.6024876911116872e-05, + "loss": 1.5361, + "step": 3092 + }, + { + "epoch": 0.0721384694772993, + "grad_norm": 1.7444504499435425, + "learning_rate": 1.6030059600932887e-05, + "loss": 1.4275, + "step": 3093 + }, + { + "epoch": 0.07216179261647722, + "grad_norm": 1.7154817581176758, + "learning_rate": 1.60352422907489e-05, + "loss": 1.4801, + "step": 3094 + }, + { + "epoch": 0.07218511575565513, + "grad_norm": 1.7063403129577637, + "learning_rate": 1.6040424980564915e-05, + "loss": 1.2088, + "step": 3095 + }, + { + "epoch": 0.07220843889483305, + "grad_norm": 1.652329444885254, + "learning_rate": 1.604560767038093e-05, + "loss": 1.5211, + "step": 3096 + }, + { + "epoch": 0.07223176203401097, + "grad_norm": 1.7522650957107544, + "learning_rate": 1.6050790360196944e-05, + "loss": 1.3027, + "step": 3097 + }, + { + "epoch": 0.07225508517318889, + "grad_norm": 1.740250825881958, + "learning_rate": 1.6055973050012958e-05, + "loss": 1.8394, + "step": 3098 + }, + { + "epoch": 0.0722784083123668, + "grad_norm": 1.6055843830108643, + "learning_rate": 1.6061155739828973e-05, + "loss": 1.3905, + "step": 3099 + }, + { + "epoch": 0.07230173145154473, + "grad_norm": 1.9458154439926147, + "learning_rate": 1.6066338429644987e-05, + "loss": 1.212, + "step": 3100 + }, + { + "epoch": 0.07232505459072264, + "grad_norm": 1.5730454921722412, + "learning_rate": 1.6071521119461e-05, + "loss": 1.672, + "step": 3101 + }, + { + "epoch": 0.07234837772990056, + "grad_norm": 2.145181179046631, + "learning_rate": 1.6076703809277016e-05, + "loss": 1.4367, + "step": 3102 + }, + { + "epoch": 0.07237170086907847, + "grad_norm": 1.999090313911438, + "learning_rate": 1.608188649909303e-05, + "loss": 1.2467, + "step": 3103 + }, + { + "epoch": 0.0723950240082564, + "grad_norm": 1.9247711896896362, + "learning_rate": 1.6087069188909044e-05, + "loss": 1.4135, + "step": 3104 + }, + { + "epoch": 0.07241834714743431, + "grad_norm": 1.3318367004394531, + "learning_rate": 1.609225187872506e-05, + "loss": 1.2565, + "step": 3105 + }, + { + "epoch": 0.07244167028661222, + "grad_norm": 1.9068901538848877, + "learning_rate": 1.6097434568541076e-05, + "loss": 1.4268, + "step": 3106 + }, + { + "epoch": 0.07246499342579014, + "grad_norm": 1.8222874402999878, + "learning_rate": 1.610261725835709e-05, + "loss": 1.3323, + "step": 3107 + }, + { + "epoch": 0.07248831656496806, + "grad_norm": 1.9893977642059326, + "learning_rate": 1.6107799948173105e-05, + "loss": 1.5658, + "step": 3108 + }, + { + "epoch": 0.07251163970414598, + "grad_norm": 1.3330308198928833, + "learning_rate": 1.611298263798912e-05, + "loss": 1.3951, + "step": 3109 + }, + { + "epoch": 0.07253496284332389, + "grad_norm": 1.514310359954834, + "learning_rate": 1.6118165327805134e-05, + "loss": 1.2224, + "step": 3110 + }, + { + "epoch": 0.07255828598250182, + "grad_norm": 1.8405115604400635, + "learning_rate": 1.6123348017621148e-05, + "loss": 1.7513, + "step": 3111 + }, + { + "epoch": 0.07258160912167973, + "grad_norm": 1.7965067625045776, + "learning_rate": 1.6128530707437163e-05, + "loss": 1.4956, + "step": 3112 + }, + { + "epoch": 0.07260493226085765, + "grad_norm": 1.817044734954834, + "learning_rate": 1.6133713397253177e-05, + "loss": 1.5514, + "step": 3113 + }, + { + "epoch": 0.07262825540003556, + "grad_norm": 1.7619121074676514, + "learning_rate": 1.613889608706919e-05, + "loss": 1.2955, + "step": 3114 + }, + { + "epoch": 0.07265157853921349, + "grad_norm": 2.3285348415374756, + "learning_rate": 1.6144078776885202e-05, + "loss": 1.5876, + "step": 3115 + }, + { + "epoch": 0.0726749016783914, + "grad_norm": 1.8489972352981567, + "learning_rate": 1.6149261466701217e-05, + "loss": 1.8576, + "step": 3116 + }, + { + "epoch": 0.07269822481756932, + "grad_norm": 2.0354526042938232, + "learning_rate": 1.6154444156517234e-05, + "loss": 1.3596, + "step": 3117 + }, + { + "epoch": 0.07272154795674723, + "grad_norm": 1.593621015548706, + "learning_rate": 1.615962684633325e-05, + "loss": 1.7181, + "step": 3118 + }, + { + "epoch": 0.07274487109592516, + "grad_norm": 1.8863354921340942, + "learning_rate": 1.6164809536149263e-05, + "loss": 1.1748, + "step": 3119 + }, + { + "epoch": 0.07276819423510307, + "grad_norm": 1.617163896560669, + "learning_rate": 1.6169992225965277e-05, + "loss": 1.5123, + "step": 3120 + }, + { + "epoch": 0.072791517374281, + "grad_norm": 1.991641640663147, + "learning_rate": 1.617517491578129e-05, + "loss": 1.5395, + "step": 3121 + }, + { + "epoch": 0.0728148405134589, + "grad_norm": 2.255173444747925, + "learning_rate": 1.6180357605597306e-05, + "loss": 1.3539, + "step": 3122 + }, + { + "epoch": 0.07283816365263683, + "grad_norm": 1.4541120529174805, + "learning_rate": 1.618554029541332e-05, + "loss": 1.4153, + "step": 3123 + }, + { + "epoch": 0.07286148679181474, + "grad_norm": 1.4140597581863403, + "learning_rate": 1.6190722985229335e-05, + "loss": 1.1464, + "step": 3124 + }, + { + "epoch": 0.07288480993099267, + "grad_norm": 1.547636866569519, + "learning_rate": 1.619590567504535e-05, + "loss": 1.4475, + "step": 3125 + }, + { + "epoch": 0.07290813307017058, + "grad_norm": 1.637631893157959, + "learning_rate": 1.6201088364861363e-05, + "loss": 1.474, + "step": 3126 + }, + { + "epoch": 0.0729314562093485, + "grad_norm": 1.7381575107574463, + "learning_rate": 1.6206271054677378e-05, + "loss": 1.3445, + "step": 3127 + }, + { + "epoch": 0.07295477934852641, + "grad_norm": 1.9380948543548584, + "learning_rate": 1.6211453744493392e-05, + "loss": 1.4927, + "step": 3128 + }, + { + "epoch": 0.07297810248770434, + "grad_norm": 2.575976848602295, + "learning_rate": 1.6216636434309406e-05, + "loss": 1.2629, + "step": 3129 + }, + { + "epoch": 0.07300142562688225, + "grad_norm": 1.896838665008545, + "learning_rate": 1.622181912412542e-05, + "loss": 1.3337, + "step": 3130 + }, + { + "epoch": 0.07302474876606017, + "grad_norm": 2.829563617706299, + "learning_rate": 1.6227001813941435e-05, + "loss": 1.7271, + "step": 3131 + }, + { + "epoch": 0.07304807190523809, + "grad_norm": 1.8833341598510742, + "learning_rate": 1.623218450375745e-05, + "loss": 1.5144, + "step": 3132 + }, + { + "epoch": 0.07307139504441601, + "grad_norm": 1.7675219774246216, + "learning_rate": 1.6237367193573467e-05, + "loss": 1.476, + "step": 3133 + }, + { + "epoch": 0.07309471818359392, + "grad_norm": 1.708404779434204, + "learning_rate": 1.624254988338948e-05, + "loss": 1.5413, + "step": 3134 + }, + { + "epoch": 0.07311804132277183, + "grad_norm": 1.4991116523742676, + "learning_rate": 1.6247732573205496e-05, + "loss": 1.4898, + "step": 3135 + }, + { + "epoch": 0.07314136446194976, + "grad_norm": 1.7512024641036987, + "learning_rate": 1.625291526302151e-05, + "loss": 1.3796, + "step": 3136 + }, + { + "epoch": 0.07316468760112767, + "grad_norm": 1.567833423614502, + "learning_rate": 1.6258097952837525e-05, + "loss": 1.4894, + "step": 3137 + }, + { + "epoch": 0.07318801074030559, + "grad_norm": 1.9110795259475708, + "learning_rate": 1.626328064265354e-05, + "loss": 1.5233, + "step": 3138 + }, + { + "epoch": 0.0732113338794835, + "grad_norm": 1.618751049041748, + "learning_rate": 1.6268463332469553e-05, + "loss": 1.5909, + "step": 3139 + }, + { + "epoch": 0.07323465701866143, + "grad_norm": 1.643726110458374, + "learning_rate": 1.6273646022285568e-05, + "loss": 1.5929, + "step": 3140 + }, + { + "epoch": 0.07325798015783934, + "grad_norm": 1.7553377151489258, + "learning_rate": 1.6278828712101582e-05, + "loss": 1.2939, + "step": 3141 + }, + { + "epoch": 0.07328130329701726, + "grad_norm": 2.0515267848968506, + "learning_rate": 1.6284011401917596e-05, + "loss": 1.0752, + "step": 3142 + }, + { + "epoch": 0.07330462643619517, + "grad_norm": 2.997544288635254, + "learning_rate": 1.628919409173361e-05, + "loss": 1.0736, + "step": 3143 + }, + { + "epoch": 0.0733279495753731, + "grad_norm": 1.3428021669387817, + "learning_rate": 1.6294376781549625e-05, + "loss": 1.2843, + "step": 3144 + }, + { + "epoch": 0.07335127271455101, + "grad_norm": 2.3941478729248047, + "learning_rate": 1.629955947136564e-05, + "loss": 1.3924, + "step": 3145 + }, + { + "epoch": 0.07337459585372894, + "grad_norm": 2.03312087059021, + "learning_rate": 1.6304742161181654e-05, + "loss": 1.4825, + "step": 3146 + }, + { + "epoch": 0.07339791899290685, + "grad_norm": 1.8753705024719238, + "learning_rate": 1.6309924850997668e-05, + "loss": 1.2441, + "step": 3147 + }, + { + "epoch": 0.07342124213208477, + "grad_norm": 1.5599260330200195, + "learning_rate": 1.6315107540813682e-05, + "loss": 1.6125, + "step": 3148 + }, + { + "epoch": 0.07344456527126268, + "grad_norm": 1.8200551271438599, + "learning_rate": 1.6320290230629697e-05, + "loss": 1.214, + "step": 3149 + }, + { + "epoch": 0.07346788841044061, + "grad_norm": 1.768222451210022, + "learning_rate": 1.6325472920445714e-05, + "loss": 1.2032, + "step": 3150 + }, + { + "epoch": 0.07349121154961852, + "grad_norm": 1.6056883335113525, + "learning_rate": 1.633065561026173e-05, + "loss": 1.1922, + "step": 3151 + }, + { + "epoch": 0.07351453468879644, + "grad_norm": 1.8109151124954224, + "learning_rate": 1.6335838300077743e-05, + "loss": 1.6903, + "step": 3152 + }, + { + "epoch": 0.07353785782797435, + "grad_norm": 2.121112108230591, + "learning_rate": 1.6341020989893758e-05, + "loss": 1.4178, + "step": 3153 + }, + { + "epoch": 0.07356118096715228, + "grad_norm": 1.5514973402023315, + "learning_rate": 1.6346203679709772e-05, + "loss": 1.3702, + "step": 3154 + }, + { + "epoch": 0.07358450410633019, + "grad_norm": 1.8449887037277222, + "learning_rate": 1.6351386369525786e-05, + "loss": 1.5122, + "step": 3155 + }, + { + "epoch": 0.07360782724550811, + "grad_norm": 1.642410397529602, + "learning_rate": 1.63565690593418e-05, + "loss": 1.5452, + "step": 3156 + }, + { + "epoch": 0.07363115038468603, + "grad_norm": 1.4537593126296997, + "learning_rate": 1.6361751749157815e-05, + "loss": 1.4577, + "step": 3157 + }, + { + "epoch": 0.07365447352386395, + "grad_norm": 1.6113066673278809, + "learning_rate": 1.636693443897383e-05, + "loss": 1.3522, + "step": 3158 + }, + { + "epoch": 0.07367779666304186, + "grad_norm": 2.0721449851989746, + "learning_rate": 1.6372117128789844e-05, + "loss": 1.6864, + "step": 3159 + }, + { + "epoch": 0.07370111980221979, + "grad_norm": 1.5149180889129639, + "learning_rate": 1.6377299818605858e-05, + "loss": 1.4672, + "step": 3160 + }, + { + "epoch": 0.0737244429413977, + "grad_norm": 1.5581163167953491, + "learning_rate": 1.6382482508421872e-05, + "loss": 1.5262, + "step": 3161 + }, + { + "epoch": 0.07374776608057562, + "grad_norm": 1.804286003112793, + "learning_rate": 1.6387665198237887e-05, + "loss": 1.3961, + "step": 3162 + }, + { + "epoch": 0.07377108921975353, + "grad_norm": 1.7984623908996582, + "learning_rate": 1.63928478880539e-05, + "loss": 1.4955, + "step": 3163 + }, + { + "epoch": 0.07379441235893144, + "grad_norm": 2.099043607711792, + "learning_rate": 1.6398030577869915e-05, + "loss": 1.4355, + "step": 3164 + }, + { + "epoch": 0.07381773549810937, + "grad_norm": 1.6302787065505981, + "learning_rate": 1.640321326768593e-05, + "loss": 1.5327, + "step": 3165 + }, + { + "epoch": 0.07384105863728728, + "grad_norm": 2.0226998329162598, + "learning_rate": 1.6408395957501947e-05, + "loss": 1.3417, + "step": 3166 + }, + { + "epoch": 0.0738643817764652, + "grad_norm": 1.6209826469421387, + "learning_rate": 1.6413578647317962e-05, + "loss": 1.2699, + "step": 3167 + }, + { + "epoch": 0.07388770491564312, + "grad_norm": 1.7179981470108032, + "learning_rate": 1.6418761337133976e-05, + "loss": 1.5453, + "step": 3168 + }, + { + "epoch": 0.07391102805482104, + "grad_norm": 4.081711769104004, + "learning_rate": 1.642394402694999e-05, + "loss": 1.5203, + "step": 3169 + }, + { + "epoch": 0.07393435119399895, + "grad_norm": 2.2758281230926514, + "learning_rate": 1.6429126716766e-05, + "loss": 1.5944, + "step": 3170 + }, + { + "epoch": 0.07395767433317688, + "grad_norm": 1.8237699270248413, + "learning_rate": 1.6434309406582016e-05, + "loss": 1.2495, + "step": 3171 + }, + { + "epoch": 0.07398099747235479, + "grad_norm": 2.163329601287842, + "learning_rate": 1.643949209639803e-05, + "loss": 1.8061, + "step": 3172 + }, + { + "epoch": 0.07400432061153271, + "grad_norm": 1.429808497428894, + "learning_rate": 1.6444674786214044e-05, + "loss": 1.0702, + "step": 3173 + }, + { + "epoch": 0.07402764375071062, + "grad_norm": 1.5716477632522583, + "learning_rate": 1.644985747603006e-05, + "loss": 1.4544, + "step": 3174 + }, + { + "epoch": 0.07405096688988855, + "grad_norm": 1.6473677158355713, + "learning_rate": 1.6455040165846073e-05, + "loss": 1.2386, + "step": 3175 + }, + { + "epoch": 0.07407429002906646, + "grad_norm": 1.6965765953063965, + "learning_rate": 1.6460222855662088e-05, + "loss": 1.4224, + "step": 3176 + }, + { + "epoch": 0.07409761316824438, + "grad_norm": 1.925316333770752, + "learning_rate": 1.6465405545478105e-05, + "loss": 1.4812, + "step": 3177 + }, + { + "epoch": 0.0741209363074223, + "grad_norm": 1.7018409967422485, + "learning_rate": 1.647058823529412e-05, + "loss": 1.1986, + "step": 3178 + }, + { + "epoch": 0.07414425944660022, + "grad_norm": 1.725853443145752, + "learning_rate": 1.6475770925110134e-05, + "loss": 1.3903, + "step": 3179 + }, + { + "epoch": 0.07416758258577813, + "grad_norm": 1.9766136407852173, + "learning_rate": 1.6480953614926148e-05, + "loss": 1.3297, + "step": 3180 + }, + { + "epoch": 0.07419090572495605, + "grad_norm": 1.682305932044983, + "learning_rate": 1.6486136304742163e-05, + "loss": 1.3133, + "step": 3181 + }, + { + "epoch": 0.07421422886413397, + "grad_norm": 2.0981059074401855, + "learning_rate": 1.6491318994558177e-05, + "loss": 1.5268, + "step": 3182 + }, + { + "epoch": 0.07423755200331189, + "grad_norm": 1.6008816957473755, + "learning_rate": 1.649650168437419e-05, + "loss": 1.6714, + "step": 3183 + }, + { + "epoch": 0.0742608751424898, + "grad_norm": 1.780184268951416, + "learning_rate": 1.6501684374190206e-05, + "loss": 1.4079, + "step": 3184 + }, + { + "epoch": 0.07428419828166773, + "grad_norm": 1.654382586479187, + "learning_rate": 1.650686706400622e-05, + "loss": 1.4573, + "step": 3185 + }, + { + "epoch": 0.07430752142084564, + "grad_norm": 1.8154877424240112, + "learning_rate": 1.6512049753822234e-05, + "loss": 1.5714, + "step": 3186 + }, + { + "epoch": 0.07433084456002356, + "grad_norm": 1.7593024969100952, + "learning_rate": 1.651723244363825e-05, + "loss": 1.6577, + "step": 3187 + }, + { + "epoch": 0.07435416769920147, + "grad_norm": 2.0384089946746826, + "learning_rate": 1.6522415133454263e-05, + "loss": 1.2007, + "step": 3188 + }, + { + "epoch": 0.0743774908383794, + "grad_norm": 2.002246141433716, + "learning_rate": 1.6527597823270277e-05, + "loss": 1.4625, + "step": 3189 + }, + { + "epoch": 0.07440081397755731, + "grad_norm": 1.3846033811569214, + "learning_rate": 1.6532780513086292e-05, + "loss": 1.3212, + "step": 3190 + }, + { + "epoch": 0.07442413711673523, + "grad_norm": 2.119779348373413, + "learning_rate": 1.6537963202902306e-05, + "loss": 1.4693, + "step": 3191 + }, + { + "epoch": 0.07444746025591314, + "grad_norm": 1.6722887754440308, + "learning_rate": 1.654314589271832e-05, + "loss": 1.7212, + "step": 3192 + }, + { + "epoch": 0.07447078339509106, + "grad_norm": 1.9082591533660889, + "learning_rate": 1.6548328582534338e-05, + "loss": 1.5593, + "step": 3193 + }, + { + "epoch": 0.07449410653426898, + "grad_norm": 1.6746292114257812, + "learning_rate": 1.6553511272350353e-05, + "loss": 1.2147, + "step": 3194 + }, + { + "epoch": 0.07451742967344689, + "grad_norm": 2.499532461166382, + "learning_rate": 1.6558693962166367e-05, + "loss": 1.3359, + "step": 3195 + }, + { + "epoch": 0.07454075281262482, + "grad_norm": 1.7034624814987183, + "learning_rate": 1.656387665198238e-05, + "loss": 1.4738, + "step": 3196 + }, + { + "epoch": 0.07456407595180273, + "grad_norm": 1.6847600936889648, + "learning_rate": 1.6569059341798396e-05, + "loss": 1.2525, + "step": 3197 + }, + { + "epoch": 0.07458739909098065, + "grad_norm": 1.8061450719833374, + "learning_rate": 1.657424203161441e-05, + "loss": 1.4098, + "step": 3198 + }, + { + "epoch": 0.07461072223015856, + "grad_norm": 2.442981004714966, + "learning_rate": 1.6579424721430424e-05, + "loss": 1.3678, + "step": 3199 + }, + { + "epoch": 0.07463404536933649, + "grad_norm": 1.8011338710784912, + "learning_rate": 1.658460741124644e-05, + "loss": 1.7033, + "step": 3200 + }, + { + "epoch": 0.0746573685085144, + "grad_norm": 1.6890759468078613, + "learning_rate": 1.6589790101062453e-05, + "loss": 1.5371, + "step": 3201 + }, + { + "epoch": 0.07468069164769232, + "grad_norm": 1.3462575674057007, + "learning_rate": 1.6594972790878467e-05, + "loss": 1.1441, + "step": 3202 + }, + { + "epoch": 0.07470401478687023, + "grad_norm": 1.9787263870239258, + "learning_rate": 1.660015548069448e-05, + "loss": 1.9626, + "step": 3203 + }, + { + "epoch": 0.07472733792604816, + "grad_norm": 1.8819395303726196, + "learning_rate": 1.6605338170510496e-05, + "loss": 1.6671, + "step": 3204 + }, + { + "epoch": 0.07475066106522607, + "grad_norm": 2.3060483932495117, + "learning_rate": 1.661052086032651e-05, + "loss": 1.6444, + "step": 3205 + }, + { + "epoch": 0.074773984204404, + "grad_norm": 2.166884660720825, + "learning_rate": 1.6615703550142525e-05, + "loss": 1.0422, + "step": 3206 + }, + { + "epoch": 0.0747973073435819, + "grad_norm": 1.8128820657730103, + "learning_rate": 1.662088623995854e-05, + "loss": 1.4259, + "step": 3207 + }, + { + "epoch": 0.07482063048275983, + "grad_norm": 2.623307943344116, + "learning_rate": 1.6626068929774553e-05, + "loss": 1.3053, + "step": 3208 + }, + { + "epoch": 0.07484395362193774, + "grad_norm": 1.608069658279419, + "learning_rate": 1.6631251619590568e-05, + "loss": 1.2981, + "step": 3209 + }, + { + "epoch": 0.07486727676111567, + "grad_norm": 3.4346506595611572, + "learning_rate": 1.6636434309406585e-05, + "loss": 1.5589, + "step": 3210 + }, + { + "epoch": 0.07489059990029358, + "grad_norm": 2.037238597869873, + "learning_rate": 1.66416169992226e-05, + "loss": 1.6254, + "step": 3211 + }, + { + "epoch": 0.0749139230394715, + "grad_norm": 2.3247506618499756, + "learning_rate": 1.6646799689038614e-05, + "loss": 1.4934, + "step": 3212 + }, + { + "epoch": 0.07493724617864941, + "grad_norm": 2.582305908203125, + "learning_rate": 1.665198237885463e-05, + "loss": 1.7699, + "step": 3213 + }, + { + "epoch": 0.07496056931782734, + "grad_norm": 2.17980694770813, + "learning_rate": 1.6657165068670643e-05, + "loss": 1.3112, + "step": 3214 + }, + { + "epoch": 0.07498389245700525, + "grad_norm": 1.8005729913711548, + "learning_rate": 1.6662347758486657e-05, + "loss": 1.7, + "step": 3215 + }, + { + "epoch": 0.07500721559618317, + "grad_norm": 1.8963232040405273, + "learning_rate": 1.666753044830267e-05, + "loss": 1.2914, + "step": 3216 + }, + { + "epoch": 0.07503053873536109, + "grad_norm": 1.8256468772888184, + "learning_rate": 1.6672713138118686e-05, + "loss": 2.131, + "step": 3217 + }, + { + "epoch": 0.07505386187453901, + "grad_norm": 1.7902743816375732, + "learning_rate": 1.66778958279347e-05, + "loss": 1.7294, + "step": 3218 + }, + { + "epoch": 0.07507718501371692, + "grad_norm": 1.7782840728759766, + "learning_rate": 1.6683078517750715e-05, + "loss": 1.7079, + "step": 3219 + }, + { + "epoch": 0.07510050815289483, + "grad_norm": 1.5772191286087036, + "learning_rate": 1.668826120756673e-05, + "loss": 1.4248, + "step": 3220 + }, + { + "epoch": 0.07512383129207276, + "grad_norm": 1.633786916732788, + "learning_rate": 1.6693443897382743e-05, + "loss": 1.6423, + "step": 3221 + }, + { + "epoch": 0.07514715443125067, + "grad_norm": 1.9768214225769043, + "learning_rate": 1.6698626587198758e-05, + "loss": 1.1813, + "step": 3222 + }, + { + "epoch": 0.07517047757042859, + "grad_norm": 1.732069969177246, + "learning_rate": 1.6703809277014772e-05, + "loss": 1.448, + "step": 3223 + }, + { + "epoch": 0.0751938007096065, + "grad_norm": 1.6145514249801636, + "learning_rate": 1.6708991966830786e-05, + "loss": 1.2867, + "step": 3224 + }, + { + "epoch": 0.07521712384878443, + "grad_norm": 2.4257102012634277, + "learning_rate": 1.67141746566468e-05, + "loss": 1.5449, + "step": 3225 + }, + { + "epoch": 0.07524044698796234, + "grad_norm": 1.795608639717102, + "learning_rate": 1.6719357346462815e-05, + "loss": 1.5071, + "step": 3226 + }, + { + "epoch": 0.07526377012714026, + "grad_norm": 2.01072096824646, + "learning_rate": 1.672454003627883e-05, + "loss": 1.375, + "step": 3227 + }, + { + "epoch": 0.07528709326631818, + "grad_norm": 2.260044574737549, + "learning_rate": 1.6729722726094844e-05, + "loss": 1.6497, + "step": 3228 + }, + { + "epoch": 0.0753104164054961, + "grad_norm": 2.038987398147583, + "learning_rate": 1.6734905415910858e-05, + "loss": 1.1102, + "step": 3229 + }, + { + "epoch": 0.07533373954467401, + "grad_norm": 1.4211676120758057, + "learning_rate": 1.6740088105726872e-05, + "loss": 1.1084, + "step": 3230 + }, + { + "epoch": 0.07535706268385194, + "grad_norm": 1.623024582862854, + "learning_rate": 1.6745270795542887e-05, + "loss": 1.2943, + "step": 3231 + }, + { + "epoch": 0.07538038582302985, + "grad_norm": 1.5547914505004883, + "learning_rate": 1.67504534853589e-05, + "loss": 1.2063, + "step": 3232 + }, + { + "epoch": 0.07540370896220777, + "grad_norm": 30.15711784362793, + "learning_rate": 1.6755636175174915e-05, + "loss": 1.5127, + "step": 3233 + }, + { + "epoch": 0.07542703210138568, + "grad_norm": 2.2802798748016357, + "learning_rate": 1.676081886499093e-05, + "loss": 1.5358, + "step": 3234 + }, + { + "epoch": 0.07545035524056361, + "grad_norm": 2.0905349254608154, + "learning_rate": 1.6766001554806944e-05, + "loss": 1.4242, + "step": 3235 + }, + { + "epoch": 0.07547367837974152, + "grad_norm": 1.9069223403930664, + "learning_rate": 1.677118424462296e-05, + "loss": 1.4346, + "step": 3236 + }, + { + "epoch": 0.07549700151891944, + "grad_norm": 1.7567555904388428, + "learning_rate": 1.6776366934438976e-05, + "loss": 1.5666, + "step": 3237 + }, + { + "epoch": 0.07552032465809735, + "grad_norm": 1.5660747289657593, + "learning_rate": 1.678154962425499e-05, + "loss": 1.5741, + "step": 3238 + }, + { + "epoch": 0.07554364779727528, + "grad_norm": 2.2778217792510986, + "learning_rate": 1.6786732314071005e-05, + "loss": 1.7733, + "step": 3239 + }, + { + "epoch": 0.07556697093645319, + "grad_norm": 1.894871473312378, + "learning_rate": 1.679191500388702e-05, + "loss": 1.4862, + "step": 3240 + }, + { + "epoch": 0.07559029407563111, + "grad_norm": 1.7588225603103638, + "learning_rate": 1.6797097693703034e-05, + "loss": 1.5237, + "step": 3241 + }, + { + "epoch": 0.07561361721480903, + "grad_norm": 1.6358667612075806, + "learning_rate": 1.6802280383519048e-05, + "loss": 1.6416, + "step": 3242 + }, + { + "epoch": 0.07563694035398695, + "grad_norm": 1.745793104171753, + "learning_rate": 1.6807463073335062e-05, + "loss": 1.5638, + "step": 3243 + }, + { + "epoch": 0.07566026349316486, + "grad_norm": 1.5481939315795898, + "learning_rate": 1.6812645763151077e-05, + "loss": 1.5219, + "step": 3244 + }, + { + "epoch": 0.07568358663234279, + "grad_norm": 1.800512433052063, + "learning_rate": 1.681782845296709e-05, + "loss": 1.4218, + "step": 3245 + }, + { + "epoch": 0.0757069097715207, + "grad_norm": 1.7766098976135254, + "learning_rate": 1.6823011142783105e-05, + "loss": 1.5556, + "step": 3246 + }, + { + "epoch": 0.07573023291069862, + "grad_norm": 1.8715254068374634, + "learning_rate": 1.682819383259912e-05, + "loss": 0.9959, + "step": 3247 + }, + { + "epoch": 0.07575355604987653, + "grad_norm": 1.6228104829788208, + "learning_rate": 1.6833376522415134e-05, + "loss": 1.3913, + "step": 3248 + }, + { + "epoch": 0.07577687918905444, + "grad_norm": 2.1543338298797607, + "learning_rate": 1.683855921223115e-05, + "loss": 1.6023, + "step": 3249 + }, + { + "epoch": 0.07580020232823237, + "grad_norm": 2.092386484146118, + "learning_rate": 1.6843741902047163e-05, + "loss": 1.3446, + "step": 3250 + }, + { + "epoch": 0.07582352546741028, + "grad_norm": 2.0272581577301025, + "learning_rate": 1.6848924591863177e-05, + "loss": 1.4331, + "step": 3251 + }, + { + "epoch": 0.0758468486065882, + "grad_norm": 2.3082563877105713, + "learning_rate": 1.685410728167919e-05, + "loss": 1.3317, + "step": 3252 + }, + { + "epoch": 0.07587017174576612, + "grad_norm": 1.919071912765503, + "learning_rate": 1.6859289971495206e-05, + "loss": 1.4169, + "step": 3253 + }, + { + "epoch": 0.07589349488494404, + "grad_norm": 2.4359421730041504, + "learning_rate": 1.6864472661311224e-05, + "loss": 1.4297, + "step": 3254 + }, + { + "epoch": 0.07591681802412195, + "grad_norm": 1.7315095663070679, + "learning_rate": 1.6869655351127238e-05, + "loss": 1.603, + "step": 3255 + }, + { + "epoch": 0.07594014116329988, + "grad_norm": 2.1361732482910156, + "learning_rate": 1.6874838040943252e-05, + "loss": 1.3271, + "step": 3256 + }, + { + "epoch": 0.07596346430247779, + "grad_norm": 1.3706244230270386, + "learning_rate": 1.6880020730759267e-05, + "loss": 1.2237, + "step": 3257 + }, + { + "epoch": 0.07598678744165571, + "grad_norm": 1.6399321556091309, + "learning_rate": 1.688520342057528e-05, + "loss": 1.3485, + "step": 3258 + }, + { + "epoch": 0.07601011058083362, + "grad_norm": 2.2018356323242188, + "learning_rate": 1.6890386110391295e-05, + "loss": 1.2108, + "step": 3259 + }, + { + "epoch": 0.07603343372001155, + "grad_norm": 2.161050796508789, + "learning_rate": 1.689556880020731e-05, + "loss": 1.1575, + "step": 3260 + }, + { + "epoch": 0.07605675685918946, + "grad_norm": 2.0206055641174316, + "learning_rate": 1.6900751490023324e-05, + "loss": 1.4697, + "step": 3261 + }, + { + "epoch": 0.07608007999836738, + "grad_norm": 1.917266845703125, + "learning_rate": 1.6905934179839338e-05, + "loss": 1.2532, + "step": 3262 + }, + { + "epoch": 0.0761034031375453, + "grad_norm": 1.5717905759811401, + "learning_rate": 1.6911116869655353e-05, + "loss": 1.4017, + "step": 3263 + }, + { + "epoch": 0.07612672627672322, + "grad_norm": 2.3592898845672607, + "learning_rate": 1.6916299559471367e-05, + "loss": 1.4454, + "step": 3264 + }, + { + "epoch": 0.07615004941590113, + "grad_norm": 1.5780028104782104, + "learning_rate": 1.692148224928738e-05, + "loss": 1.1047, + "step": 3265 + }, + { + "epoch": 0.07617337255507906, + "grad_norm": 1.644834280014038, + "learning_rate": 1.6926664939103396e-05, + "loss": 1.2353, + "step": 3266 + }, + { + "epoch": 0.07619669569425697, + "grad_norm": 2.0033533573150635, + "learning_rate": 1.693184762891941e-05, + "loss": 1.5045, + "step": 3267 + }, + { + "epoch": 0.07622001883343489, + "grad_norm": 2.138641357421875, + "learning_rate": 1.6937030318735424e-05, + "loss": 1.0741, + "step": 3268 + }, + { + "epoch": 0.0762433419726128, + "grad_norm": 2.192629814147949, + "learning_rate": 1.694221300855144e-05, + "loss": 1.3045, + "step": 3269 + }, + { + "epoch": 0.07626666511179073, + "grad_norm": 2.714752435684204, + "learning_rate": 1.6947395698367456e-05, + "loss": 1.7175, + "step": 3270 + }, + { + "epoch": 0.07628998825096864, + "grad_norm": 1.7902692556381226, + "learning_rate": 1.695257838818347e-05, + "loss": 1.3403, + "step": 3271 + }, + { + "epoch": 0.07631331139014656, + "grad_norm": 2.086465835571289, + "learning_rate": 1.6957761077999485e-05, + "loss": 1.2808, + "step": 3272 + }, + { + "epoch": 0.07633663452932447, + "grad_norm": 1.8977822065353394, + "learning_rate": 1.69629437678155e-05, + "loss": 1.4753, + "step": 3273 + }, + { + "epoch": 0.0763599576685024, + "grad_norm": 1.6644923686981201, + "learning_rate": 1.6968126457631514e-05, + "loss": 1.5319, + "step": 3274 + }, + { + "epoch": 0.07638328080768031, + "grad_norm": 1.8487331867218018, + "learning_rate": 1.6973309147447528e-05, + "loss": 1.8381, + "step": 3275 + }, + { + "epoch": 0.07640660394685823, + "grad_norm": 2.051358699798584, + "learning_rate": 1.6978491837263543e-05, + "loss": 1.6011, + "step": 3276 + }, + { + "epoch": 0.07642992708603615, + "grad_norm": 1.8193670511245728, + "learning_rate": 1.6983674527079557e-05, + "loss": 1.8248, + "step": 3277 + }, + { + "epoch": 0.07645325022521406, + "grad_norm": 1.618811011314392, + "learning_rate": 1.698885721689557e-05, + "loss": 1.3692, + "step": 3278 + }, + { + "epoch": 0.07647657336439198, + "grad_norm": 1.6162301301956177, + "learning_rate": 1.6994039906711586e-05, + "loss": 1.2837, + "step": 3279 + }, + { + "epoch": 0.07649989650356989, + "grad_norm": 1.7301234006881714, + "learning_rate": 1.69992225965276e-05, + "loss": 1.3661, + "step": 3280 + }, + { + "epoch": 0.07652321964274782, + "grad_norm": 2.038130760192871, + "learning_rate": 1.7004405286343614e-05, + "loss": 1.1453, + "step": 3281 + }, + { + "epoch": 0.07654654278192573, + "grad_norm": 2.0911741256713867, + "learning_rate": 1.700958797615963e-05, + "loss": 1.4767, + "step": 3282 + }, + { + "epoch": 0.07656986592110365, + "grad_norm": 1.517926573753357, + "learning_rate": 1.7014770665975643e-05, + "loss": 1.2982, + "step": 3283 + }, + { + "epoch": 0.07659318906028156, + "grad_norm": 1.6235179901123047, + "learning_rate": 1.7019953355791657e-05, + "loss": 1.5828, + "step": 3284 + }, + { + "epoch": 0.07661651219945949, + "grad_norm": 2.2295985221862793, + "learning_rate": 1.702513604560767e-05, + "loss": 1.407, + "step": 3285 + }, + { + "epoch": 0.0766398353386374, + "grad_norm": 1.3985662460327148, + "learning_rate": 1.7030318735423686e-05, + "loss": 1.7479, + "step": 3286 + }, + { + "epoch": 0.07666315847781532, + "grad_norm": 1.876427412033081, + "learning_rate": 1.70355014252397e-05, + "loss": 1.4209, + "step": 3287 + }, + { + "epoch": 0.07668648161699324, + "grad_norm": 1.9090443849563599, + "learning_rate": 1.7040684115055715e-05, + "loss": 1.7284, + "step": 3288 + }, + { + "epoch": 0.07670980475617116, + "grad_norm": 1.4606293439865112, + "learning_rate": 1.704586680487173e-05, + "loss": 1.3944, + "step": 3289 + }, + { + "epoch": 0.07673312789534907, + "grad_norm": 1.9733707904815674, + "learning_rate": 1.7051049494687743e-05, + "loss": 0.9537, + "step": 3290 + }, + { + "epoch": 0.076756451034527, + "grad_norm": 2.197521209716797, + "learning_rate": 1.7056232184503758e-05, + "loss": 1.5473, + "step": 3291 + }, + { + "epoch": 0.0767797741737049, + "grad_norm": 1.5921342372894287, + "learning_rate": 1.7061414874319772e-05, + "loss": 1.2972, + "step": 3292 + }, + { + "epoch": 0.07680309731288283, + "grad_norm": 1.6160104274749756, + "learning_rate": 1.7066597564135786e-05, + "loss": 1.2229, + "step": 3293 + }, + { + "epoch": 0.07682642045206074, + "grad_norm": 1.9350335597991943, + "learning_rate": 1.70717802539518e-05, + "loss": 1.7013, + "step": 3294 + }, + { + "epoch": 0.07684974359123867, + "grad_norm": 1.429025411605835, + "learning_rate": 1.7076962943767815e-05, + "loss": 1.2247, + "step": 3295 + }, + { + "epoch": 0.07687306673041658, + "grad_norm": 1.7935131788253784, + "learning_rate": 1.708214563358383e-05, + "loss": 1.53, + "step": 3296 + }, + { + "epoch": 0.0768963898695945, + "grad_norm": 2.194880247116089, + "learning_rate": 1.7087328323399844e-05, + "loss": 1.1259, + "step": 3297 + }, + { + "epoch": 0.07691971300877241, + "grad_norm": 2.297866106033325, + "learning_rate": 1.709251101321586e-05, + "loss": 1.5837, + "step": 3298 + }, + { + "epoch": 0.07694303614795034, + "grad_norm": 2.34386944770813, + "learning_rate": 1.7097693703031876e-05, + "loss": 1.6849, + "step": 3299 + }, + { + "epoch": 0.07696635928712825, + "grad_norm": 1.9773756265640259, + "learning_rate": 1.710287639284789e-05, + "loss": 1.5308, + "step": 3300 + }, + { + "epoch": 0.07698968242630617, + "grad_norm": 2.028454065322876, + "learning_rate": 1.7108059082663905e-05, + "loss": 1.4155, + "step": 3301 + }, + { + "epoch": 0.07701300556548409, + "grad_norm": 1.9162853956222534, + "learning_rate": 1.711324177247992e-05, + "loss": 1.5933, + "step": 3302 + }, + { + "epoch": 0.07703632870466201, + "grad_norm": 1.8800437450408936, + "learning_rate": 1.7118424462295933e-05, + "loss": 1.2576, + "step": 3303 + }, + { + "epoch": 0.07705965184383992, + "grad_norm": 1.9667344093322754, + "learning_rate": 1.7123607152111948e-05, + "loss": 1.7147, + "step": 3304 + }, + { + "epoch": 0.07708297498301783, + "grad_norm": 1.681564450263977, + "learning_rate": 1.7128789841927962e-05, + "loss": 1.6175, + "step": 3305 + }, + { + "epoch": 0.07710629812219576, + "grad_norm": 1.9889628887176514, + "learning_rate": 1.7133972531743976e-05, + "loss": 1.5713, + "step": 3306 + }, + { + "epoch": 0.07712962126137367, + "grad_norm": 1.895285725593567, + "learning_rate": 1.713915522155999e-05, + "loss": 1.5917, + "step": 3307 + }, + { + "epoch": 0.07715294440055159, + "grad_norm": 2.204185724258423, + "learning_rate": 1.7144337911376005e-05, + "loss": 1.4123, + "step": 3308 + }, + { + "epoch": 0.0771762675397295, + "grad_norm": 1.6684226989746094, + "learning_rate": 1.714952060119202e-05, + "loss": 1.4985, + "step": 3309 + }, + { + "epoch": 0.07719959067890743, + "grad_norm": 1.4817101955413818, + "learning_rate": 1.7154703291008034e-05, + "loss": 1.4513, + "step": 3310 + }, + { + "epoch": 0.07722291381808534, + "grad_norm": 1.655494213104248, + "learning_rate": 1.7159885980824048e-05, + "loss": 1.3627, + "step": 3311 + }, + { + "epoch": 0.07724623695726326, + "grad_norm": 1.5405232906341553, + "learning_rate": 1.7165068670640062e-05, + "loss": 1.6758, + "step": 3312 + }, + { + "epoch": 0.07726956009644118, + "grad_norm": 2.788048267364502, + "learning_rate": 1.7170251360456077e-05, + "loss": 1.2079, + "step": 3313 + }, + { + "epoch": 0.0772928832356191, + "grad_norm": 1.8536003828048706, + "learning_rate": 1.7175434050272094e-05, + "loss": 1.6091, + "step": 3314 + }, + { + "epoch": 0.07731620637479701, + "grad_norm": 1.742221713066101, + "learning_rate": 1.718061674008811e-05, + "loss": 1.441, + "step": 3315 + }, + { + "epoch": 0.07733952951397494, + "grad_norm": 1.849967122077942, + "learning_rate": 1.7185799429904123e-05, + "loss": 1.5461, + "step": 3316 + }, + { + "epoch": 0.07736285265315285, + "grad_norm": 2.1449637413024902, + "learning_rate": 1.7190982119720138e-05, + "loss": 1.1739, + "step": 3317 + }, + { + "epoch": 0.07738617579233077, + "grad_norm": 1.9928157329559326, + "learning_rate": 1.7196164809536152e-05, + "loss": 1.467, + "step": 3318 + }, + { + "epoch": 0.07740949893150868, + "grad_norm": 1.9001449346542358, + "learning_rate": 1.7201347499352166e-05, + "loss": 1.6135, + "step": 3319 + }, + { + "epoch": 0.07743282207068661, + "grad_norm": 1.5023950338363647, + "learning_rate": 1.720653018916818e-05, + "loss": 1.3905, + "step": 3320 + }, + { + "epoch": 0.07745614520986452, + "grad_norm": 2.290374755859375, + "learning_rate": 1.7211712878984195e-05, + "loss": 1.4503, + "step": 3321 + }, + { + "epoch": 0.07747946834904244, + "grad_norm": 1.5706013441085815, + "learning_rate": 1.721689556880021e-05, + "loss": 1.4569, + "step": 3322 + }, + { + "epoch": 0.07750279148822035, + "grad_norm": 2.7582640647888184, + "learning_rate": 1.7222078258616224e-05, + "loss": 1.5863, + "step": 3323 + }, + { + "epoch": 0.07752611462739828, + "grad_norm": 2.0270185470581055, + "learning_rate": 1.7227260948432238e-05, + "loss": 1.6337, + "step": 3324 + }, + { + "epoch": 0.07754943776657619, + "grad_norm": 1.835780143737793, + "learning_rate": 1.7232443638248252e-05, + "loss": 1.5795, + "step": 3325 + }, + { + "epoch": 0.07757276090575412, + "grad_norm": 1.7932450771331787, + "learning_rate": 1.7237626328064267e-05, + "loss": 1.5944, + "step": 3326 + }, + { + "epoch": 0.07759608404493203, + "grad_norm": 1.7863045930862427, + "learning_rate": 1.724280901788028e-05, + "loss": 1.579, + "step": 3327 + }, + { + "epoch": 0.07761940718410995, + "grad_norm": 1.7453893423080444, + "learning_rate": 1.7247991707696295e-05, + "loss": 1.4776, + "step": 3328 + }, + { + "epoch": 0.07764273032328786, + "grad_norm": 1.7224361896514893, + "learning_rate": 1.725317439751231e-05, + "loss": 1.1699, + "step": 3329 + }, + { + "epoch": 0.07766605346246579, + "grad_norm": 2.057270050048828, + "learning_rate": 1.7258357087328324e-05, + "loss": 1.4944, + "step": 3330 + }, + { + "epoch": 0.0776893766016437, + "grad_norm": 1.8803025484085083, + "learning_rate": 1.7263539777144342e-05, + "loss": 1.7737, + "step": 3331 + }, + { + "epoch": 0.07771269974082162, + "grad_norm": 1.7562426328659058, + "learning_rate": 1.7268722466960356e-05, + "loss": 1.6312, + "step": 3332 + }, + { + "epoch": 0.07773602287999953, + "grad_norm": 2.3190042972564697, + "learning_rate": 1.727390515677637e-05, + "loss": 1.7718, + "step": 3333 + }, + { + "epoch": 0.07775934601917744, + "grad_norm": 1.585045337677002, + "learning_rate": 1.7279087846592385e-05, + "loss": 1.1472, + "step": 3334 + }, + { + "epoch": 0.07778266915835537, + "grad_norm": 1.8282815217971802, + "learning_rate": 1.72842705364084e-05, + "loss": 1.8519, + "step": 3335 + }, + { + "epoch": 0.07780599229753328, + "grad_norm": 1.8750232458114624, + "learning_rate": 1.728945322622441e-05, + "loss": 1.7706, + "step": 3336 + }, + { + "epoch": 0.0778293154367112, + "grad_norm": 1.8886610269546509, + "learning_rate": 1.7294635916040424e-05, + "loss": 1.7408, + "step": 3337 + }, + { + "epoch": 0.07785263857588912, + "grad_norm": 1.4651943445205688, + "learning_rate": 1.729981860585644e-05, + "loss": 1.3464, + "step": 3338 + }, + { + "epoch": 0.07787596171506704, + "grad_norm": 3.5322890281677246, + "learning_rate": 1.7305001295672453e-05, + "loss": 1.4605, + "step": 3339 + }, + { + "epoch": 0.07789928485424495, + "grad_norm": 2.985595226287842, + "learning_rate": 1.7310183985488468e-05, + "loss": 1.1655, + "step": 3340 + }, + { + "epoch": 0.07792260799342288, + "grad_norm": 1.8383092880249023, + "learning_rate": 1.7315366675304485e-05, + "loss": 1.604, + "step": 3341 + }, + { + "epoch": 0.07794593113260079, + "grad_norm": 1.765061378479004, + "learning_rate": 1.73205493651205e-05, + "loss": 1.2413, + "step": 3342 + }, + { + "epoch": 0.07796925427177871, + "grad_norm": 2.067577362060547, + "learning_rate": 1.7325732054936514e-05, + "loss": 1.5542, + "step": 3343 + }, + { + "epoch": 0.07799257741095662, + "grad_norm": 1.8837525844573975, + "learning_rate": 1.7330914744752528e-05, + "loss": 1.5958, + "step": 3344 + }, + { + "epoch": 0.07801590055013455, + "grad_norm": 2.2309885025024414, + "learning_rate": 1.7336097434568543e-05, + "loss": 1.328, + "step": 3345 + }, + { + "epoch": 0.07803922368931246, + "grad_norm": 2.1510369777679443, + "learning_rate": 1.7341280124384557e-05, + "loss": 1.6049, + "step": 3346 + }, + { + "epoch": 0.07806254682849038, + "grad_norm": 2.2758655548095703, + "learning_rate": 1.734646281420057e-05, + "loss": 1.6463, + "step": 3347 + }, + { + "epoch": 0.0780858699676683, + "grad_norm": 1.5804762840270996, + "learning_rate": 1.7351645504016586e-05, + "loss": 1.3067, + "step": 3348 + }, + { + "epoch": 0.07810919310684622, + "grad_norm": 1.7285902500152588, + "learning_rate": 1.73568281938326e-05, + "loss": 1.1706, + "step": 3349 + }, + { + "epoch": 0.07813251624602413, + "grad_norm": 1.9170994758605957, + "learning_rate": 1.7362010883648614e-05, + "loss": 1.6914, + "step": 3350 + }, + { + "epoch": 0.07815583938520206, + "grad_norm": 1.8732631206512451, + "learning_rate": 1.736719357346463e-05, + "loss": 1.8266, + "step": 3351 + }, + { + "epoch": 0.07817916252437997, + "grad_norm": 2.3439199924468994, + "learning_rate": 1.7372376263280643e-05, + "loss": 1.5885, + "step": 3352 + }, + { + "epoch": 0.07820248566355789, + "grad_norm": 2.0440049171447754, + "learning_rate": 1.7377558953096657e-05, + "loss": 1.2152, + "step": 3353 + }, + { + "epoch": 0.0782258088027358, + "grad_norm": 1.5103175640106201, + "learning_rate": 1.7382741642912672e-05, + "loss": 1.2679, + "step": 3354 + }, + { + "epoch": 0.07824913194191373, + "grad_norm": 1.7911981344223022, + "learning_rate": 1.7387924332728686e-05, + "loss": 1.8322, + "step": 3355 + }, + { + "epoch": 0.07827245508109164, + "grad_norm": 1.7144136428833008, + "learning_rate": 1.73931070225447e-05, + "loss": 1.5417, + "step": 3356 + }, + { + "epoch": 0.07829577822026956, + "grad_norm": 1.7464350461959839, + "learning_rate": 1.7398289712360715e-05, + "loss": 1.6126, + "step": 3357 + }, + { + "epoch": 0.07831910135944747, + "grad_norm": 2.226100444793701, + "learning_rate": 1.7403472402176733e-05, + "loss": 1.7248, + "step": 3358 + }, + { + "epoch": 0.0783424244986254, + "grad_norm": 1.8423552513122559, + "learning_rate": 1.7408655091992747e-05, + "loss": 1.399, + "step": 3359 + }, + { + "epoch": 0.07836574763780331, + "grad_norm": 2.0295534133911133, + "learning_rate": 1.741383778180876e-05, + "loss": 1.4527, + "step": 3360 + }, + { + "epoch": 0.07838907077698123, + "grad_norm": 2.043365240097046, + "learning_rate": 1.7419020471624776e-05, + "loss": 1.2569, + "step": 3361 + }, + { + "epoch": 0.07841239391615915, + "grad_norm": 2.070237874984741, + "learning_rate": 1.742420316144079e-05, + "loss": 1.6359, + "step": 3362 + }, + { + "epoch": 0.07843571705533706, + "grad_norm": 1.8740978240966797, + "learning_rate": 1.7429385851256804e-05, + "loss": 1.4223, + "step": 3363 + }, + { + "epoch": 0.07845904019451498, + "grad_norm": 1.8014066219329834, + "learning_rate": 1.743456854107282e-05, + "loss": 1.6767, + "step": 3364 + }, + { + "epoch": 0.07848236333369289, + "grad_norm": 1.7680472135543823, + "learning_rate": 1.7439751230888833e-05, + "loss": 1.227, + "step": 3365 + }, + { + "epoch": 0.07850568647287082, + "grad_norm": 2.5365848541259766, + "learning_rate": 1.7444933920704847e-05, + "loss": 1.336, + "step": 3366 + }, + { + "epoch": 0.07852900961204873, + "grad_norm": 1.6996607780456543, + "learning_rate": 1.745011661052086e-05, + "loss": 1.3024, + "step": 3367 + }, + { + "epoch": 0.07855233275122665, + "grad_norm": 2.24592924118042, + "learning_rate": 1.7455299300336876e-05, + "loss": 1.565, + "step": 3368 + }, + { + "epoch": 0.07857565589040456, + "grad_norm": 1.5899155139923096, + "learning_rate": 1.746048199015289e-05, + "loss": 1.3918, + "step": 3369 + }, + { + "epoch": 0.07859897902958249, + "grad_norm": 1.5738404989242554, + "learning_rate": 1.7465664679968905e-05, + "loss": 1.5093, + "step": 3370 + }, + { + "epoch": 0.0786223021687604, + "grad_norm": 1.885108470916748, + "learning_rate": 1.747084736978492e-05, + "loss": 1.6744, + "step": 3371 + }, + { + "epoch": 0.07864562530793832, + "grad_norm": 1.7116379737854004, + "learning_rate": 1.7476030059600933e-05, + "loss": 1.329, + "step": 3372 + }, + { + "epoch": 0.07866894844711624, + "grad_norm": 2.3172607421875, + "learning_rate": 1.7481212749416948e-05, + "loss": 1.538, + "step": 3373 + }, + { + "epoch": 0.07869227158629416, + "grad_norm": 1.7801569700241089, + "learning_rate": 1.7486395439232965e-05, + "loss": 1.1738, + "step": 3374 + }, + { + "epoch": 0.07871559472547207, + "grad_norm": 1.6906585693359375, + "learning_rate": 1.749157812904898e-05, + "loss": 1.575, + "step": 3375 + }, + { + "epoch": 0.07873891786465, + "grad_norm": 2.6695775985717773, + "learning_rate": 1.7496760818864994e-05, + "loss": 1.5512, + "step": 3376 + }, + { + "epoch": 0.0787622410038279, + "grad_norm": 2.102708578109741, + "learning_rate": 1.750194350868101e-05, + "loss": 1.5507, + "step": 3377 + }, + { + "epoch": 0.07878556414300583, + "grad_norm": 1.8281930685043335, + "learning_rate": 1.7507126198497023e-05, + "loss": 1.538, + "step": 3378 + }, + { + "epoch": 0.07880888728218374, + "grad_norm": 1.6384347677230835, + "learning_rate": 1.7512308888313037e-05, + "loss": 1.2935, + "step": 3379 + }, + { + "epoch": 0.07883221042136167, + "grad_norm": 1.8894944190979004, + "learning_rate": 1.751749157812905e-05, + "loss": 1.2876, + "step": 3380 + }, + { + "epoch": 0.07885553356053958, + "grad_norm": 1.902371883392334, + "learning_rate": 1.7522674267945066e-05, + "loss": 1.476, + "step": 3381 + }, + { + "epoch": 0.0788788566997175, + "grad_norm": 1.6878814697265625, + "learning_rate": 1.752785695776108e-05, + "loss": 1.7224, + "step": 3382 + }, + { + "epoch": 0.07890217983889541, + "grad_norm": 1.608279824256897, + "learning_rate": 1.7533039647577095e-05, + "loss": 1.2821, + "step": 3383 + }, + { + "epoch": 0.07892550297807334, + "grad_norm": 1.9098149538040161, + "learning_rate": 1.753822233739311e-05, + "loss": 1.5575, + "step": 3384 + }, + { + "epoch": 0.07894882611725125, + "grad_norm": 2.8241662979125977, + "learning_rate": 1.7543405027209123e-05, + "loss": 1.3103, + "step": 3385 + }, + { + "epoch": 0.07897214925642917, + "grad_norm": 1.626420021057129, + "learning_rate": 1.7548587717025138e-05, + "loss": 1.4841, + "step": 3386 + }, + { + "epoch": 0.07899547239560709, + "grad_norm": 1.8856135606765747, + "learning_rate": 1.7553770406841152e-05, + "loss": 1.4818, + "step": 3387 + }, + { + "epoch": 0.07901879553478501, + "grad_norm": 1.5790698528289795, + "learning_rate": 1.7558953096657166e-05, + "loss": 1.4033, + "step": 3388 + }, + { + "epoch": 0.07904211867396292, + "grad_norm": 1.8913196325302124, + "learning_rate": 1.756413578647318e-05, + "loss": 1.6889, + "step": 3389 + }, + { + "epoch": 0.07906544181314085, + "grad_norm": 2.2722718715667725, + "learning_rate": 1.7569318476289195e-05, + "loss": 1.5644, + "step": 3390 + }, + { + "epoch": 0.07908876495231876, + "grad_norm": 2.0640485286712646, + "learning_rate": 1.757450116610521e-05, + "loss": 1.805, + "step": 3391 + }, + { + "epoch": 0.07911208809149667, + "grad_norm": 1.9282963275909424, + "learning_rate": 1.7579683855921224e-05, + "loss": 1.525, + "step": 3392 + }, + { + "epoch": 0.0791354112306746, + "grad_norm": 1.8700670003890991, + "learning_rate": 1.7584866545737238e-05, + "loss": 1.6906, + "step": 3393 + }, + { + "epoch": 0.0791587343698525, + "grad_norm": 1.6949808597564697, + "learning_rate": 1.7590049235553252e-05, + "loss": 1.4373, + "step": 3394 + }, + { + "epoch": 0.07918205750903043, + "grad_norm": 1.5579369068145752, + "learning_rate": 1.7595231925369267e-05, + "loss": 1.2127, + "step": 3395 + }, + { + "epoch": 0.07920538064820834, + "grad_norm": 1.8959749937057495, + "learning_rate": 1.760041461518528e-05, + "loss": 1.7395, + "step": 3396 + }, + { + "epoch": 0.07922870378738626, + "grad_norm": 1.2566922903060913, + "learning_rate": 1.7605597305001295e-05, + "loss": 1.2768, + "step": 3397 + }, + { + "epoch": 0.07925202692656418, + "grad_norm": 1.7211780548095703, + "learning_rate": 1.761077999481731e-05, + "loss": 1.6815, + "step": 3398 + }, + { + "epoch": 0.0792753500657421, + "grad_norm": 1.7850645780563354, + "learning_rate": 1.7615962684633324e-05, + "loss": 1.4944, + "step": 3399 + }, + { + "epoch": 0.07929867320492001, + "grad_norm": 1.666786789894104, + "learning_rate": 1.762114537444934e-05, + "loss": 1.5897, + "step": 3400 + }, + { + "epoch": 0.07932199634409794, + "grad_norm": 1.5484236478805542, + "learning_rate": 1.7626328064265353e-05, + "loss": 1.3968, + "step": 3401 + }, + { + "epoch": 0.07934531948327585, + "grad_norm": 1.8221678733825684, + "learning_rate": 1.763151075408137e-05, + "loss": 1.2536, + "step": 3402 + }, + { + "epoch": 0.07936864262245377, + "grad_norm": 1.67190420627594, + "learning_rate": 1.7636693443897385e-05, + "loss": 1.2301, + "step": 3403 + }, + { + "epoch": 0.07939196576163168, + "grad_norm": 1.5914485454559326, + "learning_rate": 1.76418761337134e-05, + "loss": 1.3554, + "step": 3404 + }, + { + "epoch": 0.07941528890080961, + "grad_norm": 1.7257155179977417, + "learning_rate": 1.7647058823529414e-05, + "loss": 1.3212, + "step": 3405 + }, + { + "epoch": 0.07943861203998752, + "grad_norm": 1.5732362270355225, + "learning_rate": 1.7652241513345428e-05, + "loss": 1.1798, + "step": 3406 + }, + { + "epoch": 0.07946193517916544, + "grad_norm": 1.9243180751800537, + "learning_rate": 1.7657424203161442e-05, + "loss": 1.3676, + "step": 3407 + }, + { + "epoch": 0.07948525831834335, + "grad_norm": 1.6544080972671509, + "learning_rate": 1.7662606892977457e-05, + "loss": 1.561, + "step": 3408 + }, + { + "epoch": 0.07950858145752128, + "grad_norm": 2.302537202835083, + "learning_rate": 1.766778958279347e-05, + "loss": 1.5584, + "step": 3409 + }, + { + "epoch": 0.07953190459669919, + "grad_norm": 1.5818437337875366, + "learning_rate": 1.7672972272609485e-05, + "loss": 1.4021, + "step": 3410 + }, + { + "epoch": 0.07955522773587712, + "grad_norm": 1.276625156402588, + "learning_rate": 1.76781549624255e-05, + "loss": 1.2437, + "step": 3411 + }, + { + "epoch": 0.07957855087505503, + "grad_norm": 1.9232205152511597, + "learning_rate": 1.7683337652241514e-05, + "loss": 1.3494, + "step": 3412 + }, + { + "epoch": 0.07960187401423295, + "grad_norm": 1.8550798892974854, + "learning_rate": 1.768852034205753e-05, + "loss": 1.6728, + "step": 3413 + }, + { + "epoch": 0.07962519715341086, + "grad_norm": 1.8926242589950562, + "learning_rate": 1.7693703031873543e-05, + "loss": 1.3786, + "step": 3414 + }, + { + "epoch": 0.07964852029258879, + "grad_norm": 1.7592382431030273, + "learning_rate": 1.7698885721689557e-05, + "loss": 1.3268, + "step": 3415 + }, + { + "epoch": 0.0796718434317667, + "grad_norm": 2.5954487323760986, + "learning_rate": 1.770406841150557e-05, + "loss": 1.791, + "step": 3416 + }, + { + "epoch": 0.07969516657094462, + "grad_norm": 2.1821093559265137, + "learning_rate": 1.7709251101321586e-05, + "loss": 1.7194, + "step": 3417 + }, + { + "epoch": 0.07971848971012253, + "grad_norm": 1.3791011571884155, + "learning_rate": 1.7714433791137604e-05, + "loss": 1.6048, + "step": 3418 + }, + { + "epoch": 0.07974181284930044, + "grad_norm": 1.2134246826171875, + "learning_rate": 1.7719616480953618e-05, + "loss": 1.0106, + "step": 3419 + }, + { + "epoch": 0.07976513598847837, + "grad_norm": 2.4315738677978516, + "learning_rate": 1.7724799170769632e-05, + "loss": 1.4947, + "step": 3420 + }, + { + "epoch": 0.07978845912765628, + "grad_norm": 1.6762149333953857, + "learning_rate": 1.7729981860585647e-05, + "loss": 1.29, + "step": 3421 + }, + { + "epoch": 0.0798117822668342, + "grad_norm": 2.2112157344818115, + "learning_rate": 1.773516455040166e-05, + "loss": 1.5833, + "step": 3422 + }, + { + "epoch": 0.07983510540601212, + "grad_norm": 2.0395634174346924, + "learning_rate": 1.7740347240217675e-05, + "loss": 1.4179, + "step": 3423 + }, + { + "epoch": 0.07985842854519004, + "grad_norm": 4.181065082550049, + "learning_rate": 1.774552993003369e-05, + "loss": 1.3738, + "step": 3424 + }, + { + "epoch": 0.07988175168436795, + "grad_norm": 1.4794611930847168, + "learning_rate": 1.7750712619849704e-05, + "loss": 1.5617, + "step": 3425 + }, + { + "epoch": 0.07990507482354588, + "grad_norm": 1.5973138809204102, + "learning_rate": 1.7755895309665718e-05, + "loss": 1.4281, + "step": 3426 + }, + { + "epoch": 0.07992839796272379, + "grad_norm": 2.451594591140747, + "learning_rate": 1.7761077999481733e-05, + "loss": 1.361, + "step": 3427 + }, + { + "epoch": 0.07995172110190171, + "grad_norm": 1.8390647172927856, + "learning_rate": 1.7766260689297747e-05, + "loss": 1.6535, + "step": 3428 + }, + { + "epoch": 0.07997504424107962, + "grad_norm": 2.335458755493164, + "learning_rate": 1.777144337911376e-05, + "loss": 1.5528, + "step": 3429 + }, + { + "epoch": 0.07999836738025755, + "grad_norm": 1.8207273483276367, + "learning_rate": 1.7776626068929776e-05, + "loss": 1.4269, + "step": 3430 + }, + { + "epoch": 0.08002169051943546, + "grad_norm": 1.643661379814148, + "learning_rate": 1.778180875874579e-05, + "loss": 1.4368, + "step": 3431 + }, + { + "epoch": 0.08004501365861338, + "grad_norm": 1.9254742860794067, + "learning_rate": 1.7786991448561804e-05, + "loss": 1.207, + "step": 3432 + }, + { + "epoch": 0.0800683367977913, + "grad_norm": 1.9777443408966064, + "learning_rate": 1.779217413837782e-05, + "loss": 1.5559, + "step": 3433 + }, + { + "epoch": 0.08009165993696922, + "grad_norm": 1.8416067361831665, + "learning_rate": 1.7797356828193833e-05, + "loss": 1.1761, + "step": 3434 + }, + { + "epoch": 0.08011498307614713, + "grad_norm": 1.6335779428482056, + "learning_rate": 1.780253951800985e-05, + "loss": 1.5841, + "step": 3435 + }, + { + "epoch": 0.08013830621532506, + "grad_norm": 1.7504732608795166, + "learning_rate": 1.7807722207825865e-05, + "loss": 1.0417, + "step": 3436 + }, + { + "epoch": 0.08016162935450297, + "grad_norm": 1.9436529874801636, + "learning_rate": 1.781290489764188e-05, + "loss": 1.6339, + "step": 3437 + }, + { + "epoch": 0.08018495249368089, + "grad_norm": 1.320713996887207, + "learning_rate": 1.7818087587457894e-05, + "loss": 1.1599, + "step": 3438 + }, + { + "epoch": 0.0802082756328588, + "grad_norm": 1.8651103973388672, + "learning_rate": 1.7823270277273908e-05, + "loss": 1.5779, + "step": 3439 + }, + { + "epoch": 0.08023159877203673, + "grad_norm": 2.5607051849365234, + "learning_rate": 1.7828452967089923e-05, + "loss": 1.0184, + "step": 3440 + }, + { + "epoch": 0.08025492191121464, + "grad_norm": 2.009211778640747, + "learning_rate": 1.7833635656905937e-05, + "loss": 1.2315, + "step": 3441 + }, + { + "epoch": 0.08027824505039256, + "grad_norm": 1.6827813386917114, + "learning_rate": 1.783881834672195e-05, + "loss": 1.5273, + "step": 3442 + }, + { + "epoch": 0.08030156818957047, + "grad_norm": 1.7755669355392456, + "learning_rate": 1.7844001036537966e-05, + "loss": 1.2431, + "step": 3443 + }, + { + "epoch": 0.0803248913287484, + "grad_norm": 1.8076794147491455, + "learning_rate": 1.784918372635398e-05, + "loss": 1.3305, + "step": 3444 + }, + { + "epoch": 0.08034821446792631, + "grad_norm": 1.6127822399139404, + "learning_rate": 1.7854366416169994e-05, + "loss": 1.6228, + "step": 3445 + }, + { + "epoch": 0.08037153760710423, + "grad_norm": 1.5132166147232056, + "learning_rate": 1.785954910598601e-05, + "loss": 1.1506, + "step": 3446 + }, + { + "epoch": 0.08039486074628215, + "grad_norm": 1.5347700119018555, + "learning_rate": 1.7864731795802023e-05, + "loss": 1.2827, + "step": 3447 + }, + { + "epoch": 0.08041818388546006, + "grad_norm": 1.5111736059188843, + "learning_rate": 1.7869914485618037e-05, + "loss": 1.3593, + "step": 3448 + }, + { + "epoch": 0.08044150702463798, + "grad_norm": 1.5753391981124878, + "learning_rate": 1.787509717543405e-05, + "loss": 1.4866, + "step": 3449 + }, + { + "epoch": 0.08046483016381589, + "grad_norm": 1.7266024351119995, + "learning_rate": 1.7880279865250066e-05, + "loss": 1.4463, + "step": 3450 + }, + { + "epoch": 0.08048815330299382, + "grad_norm": 2.0583856105804443, + "learning_rate": 1.788546255506608e-05, + "loss": 1.4841, + "step": 3451 + }, + { + "epoch": 0.08051147644217173, + "grad_norm": 1.7603833675384521, + "learning_rate": 1.7890645244882095e-05, + "loss": 1.6678, + "step": 3452 + }, + { + "epoch": 0.08053479958134965, + "grad_norm": 1.6743794679641724, + "learning_rate": 1.789582793469811e-05, + "loss": 1.0625, + "step": 3453 + }, + { + "epoch": 0.08055812272052756, + "grad_norm": 2.879643201828003, + "learning_rate": 1.7901010624514123e-05, + "loss": 1.4258, + "step": 3454 + }, + { + "epoch": 0.08058144585970549, + "grad_norm": 2.4089555740356445, + "learning_rate": 1.7906193314330138e-05, + "loss": 1.3974, + "step": 3455 + }, + { + "epoch": 0.0806047689988834, + "grad_norm": 2.077892780303955, + "learning_rate": 1.7911376004146152e-05, + "loss": 1.3107, + "step": 3456 + }, + { + "epoch": 0.08062809213806132, + "grad_norm": 2.7976572513580322, + "learning_rate": 1.7916558693962166e-05, + "loss": 1.1065, + "step": 3457 + }, + { + "epoch": 0.08065141527723924, + "grad_norm": 1.862610101699829, + "learning_rate": 1.792174138377818e-05, + "loss": 1.5416, + "step": 3458 + }, + { + "epoch": 0.08067473841641716, + "grad_norm": 1.8210248947143555, + "learning_rate": 1.7926924073594195e-05, + "loss": 1.7809, + "step": 3459 + }, + { + "epoch": 0.08069806155559507, + "grad_norm": 1.653671145439148, + "learning_rate": 1.793210676341021e-05, + "loss": 1.3246, + "step": 3460 + }, + { + "epoch": 0.080721384694773, + "grad_norm": 1.676255464553833, + "learning_rate": 1.7937289453226224e-05, + "loss": 1.3022, + "step": 3461 + }, + { + "epoch": 0.08074470783395091, + "grad_norm": 1.759718418121338, + "learning_rate": 1.794247214304224e-05, + "loss": 1.1314, + "step": 3462 + }, + { + "epoch": 0.08076803097312883, + "grad_norm": 1.7641233205795288, + "learning_rate": 1.7947654832858256e-05, + "loss": 1.4204, + "step": 3463 + }, + { + "epoch": 0.08079135411230674, + "grad_norm": 1.5273410081863403, + "learning_rate": 1.795283752267427e-05, + "loss": 1.2823, + "step": 3464 + }, + { + "epoch": 0.08081467725148467, + "grad_norm": 1.91709303855896, + "learning_rate": 1.7958020212490285e-05, + "loss": 1.692, + "step": 3465 + }, + { + "epoch": 0.08083800039066258, + "grad_norm": 1.7322471141815186, + "learning_rate": 1.79632029023063e-05, + "loss": 1.4564, + "step": 3466 + }, + { + "epoch": 0.0808613235298405, + "grad_norm": 1.8525621891021729, + "learning_rate": 1.7968385592122313e-05, + "loss": 1.3558, + "step": 3467 + }, + { + "epoch": 0.08088464666901841, + "grad_norm": 1.6870750188827515, + "learning_rate": 1.7973568281938328e-05, + "loss": 1.2403, + "step": 3468 + }, + { + "epoch": 0.08090796980819634, + "grad_norm": 1.80242919921875, + "learning_rate": 1.7978750971754342e-05, + "loss": 1.4841, + "step": 3469 + }, + { + "epoch": 0.08093129294737425, + "grad_norm": 1.543397307395935, + "learning_rate": 1.7983933661570356e-05, + "loss": 1.474, + "step": 3470 + }, + { + "epoch": 0.08095461608655218, + "grad_norm": 1.908703327178955, + "learning_rate": 1.798911635138637e-05, + "loss": 1.3324, + "step": 3471 + }, + { + "epoch": 0.08097793922573009, + "grad_norm": 1.846138596534729, + "learning_rate": 1.7994299041202385e-05, + "loss": 1.4899, + "step": 3472 + }, + { + "epoch": 0.08100126236490801, + "grad_norm": 1.8396351337432861, + "learning_rate": 1.79994817310184e-05, + "loss": 1.4697, + "step": 3473 + }, + { + "epoch": 0.08102458550408592, + "grad_norm": 1.653918981552124, + "learning_rate": 1.8004664420834414e-05, + "loss": 1.6469, + "step": 3474 + }, + { + "epoch": 0.08104790864326385, + "grad_norm": 2.1722609996795654, + "learning_rate": 1.8009847110650428e-05, + "loss": 1.4355, + "step": 3475 + }, + { + "epoch": 0.08107123178244176, + "grad_norm": 2.0860023498535156, + "learning_rate": 1.8015029800466442e-05, + "loss": 1.7017, + "step": 3476 + }, + { + "epoch": 0.08109455492161967, + "grad_norm": 1.9186869859695435, + "learning_rate": 1.8020212490282457e-05, + "loss": 1.289, + "step": 3477 + }, + { + "epoch": 0.0811178780607976, + "grad_norm": 1.6853832006454468, + "learning_rate": 1.802539518009847e-05, + "loss": 1.4602, + "step": 3478 + }, + { + "epoch": 0.0811412011999755, + "grad_norm": 2.2143218517303467, + "learning_rate": 1.803057786991449e-05, + "loss": 1.3986, + "step": 3479 + }, + { + "epoch": 0.08116452433915343, + "grad_norm": 1.786441683769226, + "learning_rate": 1.8035760559730503e-05, + "loss": 1.5675, + "step": 3480 + }, + { + "epoch": 0.08118784747833134, + "grad_norm": 1.891284704208374, + "learning_rate": 1.8040943249546518e-05, + "loss": 1.537, + "step": 3481 + }, + { + "epoch": 0.08121117061750927, + "grad_norm": 1.7284735441207886, + "learning_rate": 1.8046125939362532e-05, + "loss": 1.6959, + "step": 3482 + }, + { + "epoch": 0.08123449375668718, + "grad_norm": 2.38594388961792, + "learning_rate": 1.8051308629178546e-05, + "loss": 1.2144, + "step": 3483 + }, + { + "epoch": 0.0812578168958651, + "grad_norm": 1.965506911277771, + "learning_rate": 1.805649131899456e-05, + "loss": 1.6692, + "step": 3484 + }, + { + "epoch": 0.08128114003504301, + "grad_norm": 2.0930988788604736, + "learning_rate": 1.8061674008810575e-05, + "loss": 1.0515, + "step": 3485 + }, + { + "epoch": 0.08130446317422094, + "grad_norm": 1.7666972875595093, + "learning_rate": 1.806685669862659e-05, + "loss": 1.2833, + "step": 3486 + }, + { + "epoch": 0.08132778631339885, + "grad_norm": 1.7547857761383057, + "learning_rate": 1.8072039388442604e-05, + "loss": 1.5865, + "step": 3487 + }, + { + "epoch": 0.08135110945257677, + "grad_norm": 1.726447343826294, + "learning_rate": 1.8077222078258618e-05, + "loss": 1.1413, + "step": 3488 + }, + { + "epoch": 0.08137443259175468, + "grad_norm": 1.6850913763046265, + "learning_rate": 1.8082404768074632e-05, + "loss": 1.6147, + "step": 3489 + }, + { + "epoch": 0.08139775573093261, + "grad_norm": 1.7971402406692505, + "learning_rate": 1.8087587457890647e-05, + "loss": 1.1584, + "step": 3490 + }, + { + "epoch": 0.08142107887011052, + "grad_norm": 1.5679081678390503, + "learning_rate": 1.809277014770666e-05, + "loss": 1.0253, + "step": 3491 + }, + { + "epoch": 0.08144440200928844, + "grad_norm": 1.7024198770523071, + "learning_rate": 1.8097952837522675e-05, + "loss": 1.367, + "step": 3492 + }, + { + "epoch": 0.08146772514846636, + "grad_norm": 1.712080955505371, + "learning_rate": 1.810313552733869e-05, + "loss": 1.5415, + "step": 3493 + }, + { + "epoch": 0.08149104828764428, + "grad_norm": 1.7190017700195312, + "learning_rate": 1.8108318217154704e-05, + "loss": 1.6035, + "step": 3494 + }, + { + "epoch": 0.08151437142682219, + "grad_norm": 1.6702404022216797, + "learning_rate": 1.8113500906970722e-05, + "loss": 1.4485, + "step": 3495 + }, + { + "epoch": 0.08153769456600012, + "grad_norm": 1.7290270328521729, + "learning_rate": 1.8118683596786736e-05, + "loss": 1.3473, + "step": 3496 + }, + { + "epoch": 0.08156101770517803, + "grad_norm": 1.8638476133346558, + "learning_rate": 1.812386628660275e-05, + "loss": 1.1051, + "step": 3497 + }, + { + "epoch": 0.08158434084435595, + "grad_norm": 1.6890590190887451, + "learning_rate": 1.8129048976418765e-05, + "loss": 1.207, + "step": 3498 + }, + { + "epoch": 0.08160766398353386, + "grad_norm": 2.0224850177764893, + "learning_rate": 1.813423166623478e-05, + "loss": 1.7992, + "step": 3499 + }, + { + "epoch": 0.08163098712271179, + "grad_norm": 2.054816246032715, + "learning_rate": 1.8139414356050794e-05, + "loss": 1.4456, + "step": 3500 + }, + { + "epoch": 0.0816543102618897, + "grad_norm": 1.79837965965271, + "learning_rate": 1.8144597045866804e-05, + "loss": 1.6059, + "step": 3501 + }, + { + "epoch": 0.08167763340106762, + "grad_norm": 1.9790756702423096, + "learning_rate": 1.814977973568282e-05, + "loss": 1.3804, + "step": 3502 + }, + { + "epoch": 0.08170095654024553, + "grad_norm": 1.8916029930114746, + "learning_rate": 1.8154962425498833e-05, + "loss": 1.4934, + "step": 3503 + }, + { + "epoch": 0.08172427967942346, + "grad_norm": 2.5700888633728027, + "learning_rate": 1.8160145115314848e-05, + "loss": 1.14, + "step": 3504 + }, + { + "epoch": 0.08174760281860137, + "grad_norm": 1.945611834526062, + "learning_rate": 1.8165327805130862e-05, + "loss": 1.4387, + "step": 3505 + }, + { + "epoch": 0.08177092595777928, + "grad_norm": 1.7046513557434082, + "learning_rate": 1.817051049494688e-05, + "loss": 1.1947, + "step": 3506 + }, + { + "epoch": 0.0817942490969572, + "grad_norm": 1.9995052814483643, + "learning_rate": 1.8175693184762894e-05, + "loss": 1.2849, + "step": 3507 + }, + { + "epoch": 0.08181757223613512, + "grad_norm": 1.865929126739502, + "learning_rate": 1.8180875874578908e-05, + "loss": 1.6354, + "step": 3508 + }, + { + "epoch": 0.08184089537531304, + "grad_norm": 2.0877327919006348, + "learning_rate": 1.8186058564394923e-05, + "loss": 1.5864, + "step": 3509 + }, + { + "epoch": 0.08186421851449095, + "grad_norm": 1.8026949167251587, + "learning_rate": 1.8191241254210937e-05, + "loss": 1.3427, + "step": 3510 + }, + { + "epoch": 0.08188754165366888, + "grad_norm": 1.9686529636383057, + "learning_rate": 1.819642394402695e-05, + "loss": 0.8916, + "step": 3511 + }, + { + "epoch": 0.08191086479284679, + "grad_norm": 1.882365345954895, + "learning_rate": 1.8201606633842966e-05, + "loss": 1.6041, + "step": 3512 + }, + { + "epoch": 0.08193418793202471, + "grad_norm": 2.731876850128174, + "learning_rate": 1.820678932365898e-05, + "loss": 1.9984, + "step": 3513 + }, + { + "epoch": 0.08195751107120262, + "grad_norm": 2.3877131938934326, + "learning_rate": 1.8211972013474994e-05, + "loss": 1.5732, + "step": 3514 + }, + { + "epoch": 0.08198083421038055, + "grad_norm": 1.7990912199020386, + "learning_rate": 1.821715470329101e-05, + "loss": 1.2355, + "step": 3515 + }, + { + "epoch": 0.08200415734955846, + "grad_norm": 1.7821038961410522, + "learning_rate": 1.8222337393107023e-05, + "loss": 1.5518, + "step": 3516 + }, + { + "epoch": 0.08202748048873638, + "grad_norm": 1.5398446321487427, + "learning_rate": 1.8227520082923037e-05, + "loss": 1.536, + "step": 3517 + }, + { + "epoch": 0.0820508036279143, + "grad_norm": 1.8921490907669067, + "learning_rate": 1.8232702772739052e-05, + "loss": 1.2674, + "step": 3518 + }, + { + "epoch": 0.08207412676709222, + "grad_norm": 2.987426519393921, + "learning_rate": 1.8237885462555066e-05, + "loss": 1.358, + "step": 3519 + }, + { + "epoch": 0.08209744990627013, + "grad_norm": 2.238374710083008, + "learning_rate": 1.824306815237108e-05, + "loss": 1.5369, + "step": 3520 + }, + { + "epoch": 0.08212077304544806, + "grad_norm": 3.6904711723327637, + "learning_rate": 1.8248250842187095e-05, + "loss": 1.4271, + "step": 3521 + }, + { + "epoch": 0.08214409618462597, + "grad_norm": 1.9176346063613892, + "learning_rate": 1.8253433532003113e-05, + "loss": 1.3368, + "step": 3522 + }, + { + "epoch": 0.08216741932380389, + "grad_norm": 2.0872647762298584, + "learning_rate": 1.8258616221819127e-05, + "loss": 1.4909, + "step": 3523 + }, + { + "epoch": 0.0821907424629818, + "grad_norm": 1.2928736209869385, + "learning_rate": 1.826379891163514e-05, + "loss": 1.1043, + "step": 3524 + }, + { + "epoch": 0.08221406560215973, + "grad_norm": 1.8889038562774658, + "learning_rate": 1.8268981601451156e-05, + "loss": 1.4896, + "step": 3525 + }, + { + "epoch": 0.08223738874133764, + "grad_norm": 1.6184550523757935, + "learning_rate": 1.827416429126717e-05, + "loss": 1.1775, + "step": 3526 + }, + { + "epoch": 0.08226071188051556, + "grad_norm": 1.6747300624847412, + "learning_rate": 1.8279346981083184e-05, + "loss": 1.5762, + "step": 3527 + }, + { + "epoch": 0.08228403501969347, + "grad_norm": 1.707043170928955, + "learning_rate": 1.82845296708992e-05, + "loss": 1.5885, + "step": 3528 + }, + { + "epoch": 0.0823073581588714, + "grad_norm": 1.6010284423828125, + "learning_rate": 1.8289712360715213e-05, + "loss": 1.5545, + "step": 3529 + }, + { + "epoch": 0.08233068129804931, + "grad_norm": 1.565683364868164, + "learning_rate": 1.8294895050531227e-05, + "loss": 1.5154, + "step": 3530 + }, + { + "epoch": 0.08235400443722724, + "grad_norm": 1.707773208618164, + "learning_rate": 1.830007774034724e-05, + "loss": 1.1917, + "step": 3531 + }, + { + "epoch": 0.08237732757640515, + "grad_norm": 1.8261590003967285, + "learning_rate": 1.8305260430163256e-05, + "loss": 1.4657, + "step": 3532 + }, + { + "epoch": 0.08240065071558306, + "grad_norm": 1.7365014553070068, + "learning_rate": 1.831044311997927e-05, + "loss": 1.4251, + "step": 3533 + }, + { + "epoch": 0.08242397385476098, + "grad_norm": 2.4612667560577393, + "learning_rate": 1.8315625809795285e-05, + "loss": 1.2089, + "step": 3534 + }, + { + "epoch": 0.08244729699393889, + "grad_norm": 1.5900131464004517, + "learning_rate": 1.83208084996113e-05, + "loss": 1.6323, + "step": 3535 + }, + { + "epoch": 0.08247062013311682, + "grad_norm": 2.6279211044311523, + "learning_rate": 1.8325991189427313e-05, + "loss": 2.0529, + "step": 3536 + }, + { + "epoch": 0.08249394327229473, + "grad_norm": 1.79181969165802, + "learning_rate": 1.8331173879243328e-05, + "loss": 1.4461, + "step": 3537 + }, + { + "epoch": 0.08251726641147265, + "grad_norm": 1.5957787036895752, + "learning_rate": 1.8336356569059342e-05, + "loss": 1.2476, + "step": 3538 + }, + { + "epoch": 0.08254058955065056, + "grad_norm": 1.5493981838226318, + "learning_rate": 1.834153925887536e-05, + "loss": 1.4666, + "step": 3539 + }, + { + "epoch": 0.08256391268982849, + "grad_norm": 1.5983459949493408, + "learning_rate": 1.8346721948691374e-05, + "loss": 1.1337, + "step": 3540 + }, + { + "epoch": 0.0825872358290064, + "grad_norm": 1.900614857673645, + "learning_rate": 1.835190463850739e-05, + "loss": 1.2832, + "step": 3541 + }, + { + "epoch": 0.08261055896818432, + "grad_norm": 1.7099312543869019, + "learning_rate": 1.8357087328323403e-05, + "loss": 1.5274, + "step": 3542 + }, + { + "epoch": 0.08263388210736224, + "grad_norm": 1.7623786926269531, + "learning_rate": 1.8362270018139417e-05, + "loss": 1.5002, + "step": 3543 + }, + { + "epoch": 0.08265720524654016, + "grad_norm": 1.6291403770446777, + "learning_rate": 1.836745270795543e-05, + "loss": 1.4322, + "step": 3544 + }, + { + "epoch": 0.08268052838571807, + "grad_norm": 1.7882250547409058, + "learning_rate": 1.8372635397771446e-05, + "loss": 1.5406, + "step": 3545 + }, + { + "epoch": 0.082703851524896, + "grad_norm": 1.33836829662323, + "learning_rate": 1.837781808758746e-05, + "loss": 1.0274, + "step": 3546 + }, + { + "epoch": 0.08272717466407391, + "grad_norm": 2.4266624450683594, + "learning_rate": 1.8383000777403475e-05, + "loss": 1.4865, + "step": 3547 + }, + { + "epoch": 0.08275049780325183, + "grad_norm": 1.9168795347213745, + "learning_rate": 1.838818346721949e-05, + "loss": 1.3113, + "step": 3548 + }, + { + "epoch": 0.08277382094242974, + "grad_norm": 1.602070689201355, + "learning_rate": 1.8393366157035503e-05, + "loss": 1.3721, + "step": 3549 + }, + { + "epoch": 0.08279714408160767, + "grad_norm": 2.380582571029663, + "learning_rate": 1.8398548846851518e-05, + "loss": 1.3196, + "step": 3550 + }, + { + "epoch": 0.08282046722078558, + "grad_norm": 1.694897174835205, + "learning_rate": 1.8403731536667532e-05, + "loss": 1.3993, + "step": 3551 + }, + { + "epoch": 0.0828437903599635, + "grad_norm": 2.2976338863372803, + "learning_rate": 1.8408914226483546e-05, + "loss": 1.5756, + "step": 3552 + }, + { + "epoch": 0.08286711349914141, + "grad_norm": 2.3146839141845703, + "learning_rate": 1.841409691629956e-05, + "loss": 1.2754, + "step": 3553 + }, + { + "epoch": 0.08289043663831934, + "grad_norm": 1.9801183938980103, + "learning_rate": 1.8419279606115575e-05, + "loss": 1.2338, + "step": 3554 + }, + { + "epoch": 0.08291375977749725, + "grad_norm": 2.159083843231201, + "learning_rate": 1.8424462295931593e-05, + "loss": 1.623, + "step": 3555 + }, + { + "epoch": 0.08293708291667518, + "grad_norm": 1.8101762533187866, + "learning_rate": 1.8429644985747604e-05, + "loss": 1.7546, + "step": 3556 + }, + { + "epoch": 0.08296040605585309, + "grad_norm": 2.1166656017303467, + "learning_rate": 1.8434827675563618e-05, + "loss": 1.472, + "step": 3557 + }, + { + "epoch": 0.08298372919503101, + "grad_norm": 2.1947038173675537, + "learning_rate": 1.8440010365379632e-05, + "loss": 1.3757, + "step": 3558 + }, + { + "epoch": 0.08300705233420892, + "grad_norm": 1.8868519067764282, + "learning_rate": 1.8445193055195647e-05, + "loss": 1.317, + "step": 3559 + }, + { + "epoch": 0.08303037547338685, + "grad_norm": 2.1068966388702393, + "learning_rate": 1.845037574501166e-05, + "loss": 1.6346, + "step": 3560 + }, + { + "epoch": 0.08305369861256476, + "grad_norm": 1.756743311882019, + "learning_rate": 1.8455558434827675e-05, + "loss": 1.3374, + "step": 3561 + }, + { + "epoch": 0.08307702175174267, + "grad_norm": 1.6982694864273071, + "learning_rate": 1.846074112464369e-05, + "loss": 1.2253, + "step": 3562 + }, + { + "epoch": 0.0831003448909206, + "grad_norm": 1.6374770402908325, + "learning_rate": 1.8465923814459704e-05, + "loss": 1.8107, + "step": 3563 + }, + { + "epoch": 0.0831236680300985, + "grad_norm": 1.6520603895187378, + "learning_rate": 1.847110650427572e-05, + "loss": 1.3199, + "step": 3564 + }, + { + "epoch": 0.08314699116927643, + "grad_norm": 1.5398011207580566, + "learning_rate": 1.8476289194091733e-05, + "loss": 1.5716, + "step": 3565 + }, + { + "epoch": 0.08317031430845434, + "grad_norm": 1.6022202968597412, + "learning_rate": 1.848147188390775e-05, + "loss": 1.4777, + "step": 3566 + }, + { + "epoch": 0.08319363744763227, + "grad_norm": 2.1725592613220215, + "learning_rate": 1.8486654573723765e-05, + "loss": 1.4681, + "step": 3567 + }, + { + "epoch": 0.08321696058681018, + "grad_norm": 1.59706449508667, + "learning_rate": 1.849183726353978e-05, + "loss": 1.2141, + "step": 3568 + }, + { + "epoch": 0.0832402837259881, + "grad_norm": 1.6120070219039917, + "learning_rate": 1.8497019953355794e-05, + "loss": 1.4908, + "step": 3569 + }, + { + "epoch": 0.08326360686516601, + "grad_norm": 1.4892857074737549, + "learning_rate": 1.8502202643171808e-05, + "loss": 1.6314, + "step": 3570 + }, + { + "epoch": 0.08328693000434394, + "grad_norm": 2.104597806930542, + "learning_rate": 1.8507385332987822e-05, + "loss": 1.4717, + "step": 3571 + }, + { + "epoch": 0.08331025314352185, + "grad_norm": 1.6070716381072998, + "learning_rate": 1.8512568022803837e-05, + "loss": 1.5624, + "step": 3572 + }, + { + "epoch": 0.08333357628269977, + "grad_norm": 1.9566452503204346, + "learning_rate": 1.851775071261985e-05, + "loss": 1.5337, + "step": 3573 + }, + { + "epoch": 0.08335689942187768, + "grad_norm": 1.8620479106903076, + "learning_rate": 1.8522933402435865e-05, + "loss": 1.6287, + "step": 3574 + }, + { + "epoch": 0.08338022256105561, + "grad_norm": 3.037355661392212, + "learning_rate": 1.852811609225188e-05, + "loss": 1.0238, + "step": 3575 + }, + { + "epoch": 0.08340354570023352, + "grad_norm": 1.7178518772125244, + "learning_rate": 1.8533298782067894e-05, + "loss": 1.5673, + "step": 3576 + }, + { + "epoch": 0.08342686883941144, + "grad_norm": 1.6301993131637573, + "learning_rate": 1.853848147188391e-05, + "loss": 1.458, + "step": 3577 + }, + { + "epoch": 0.08345019197858936, + "grad_norm": 1.7615132331848145, + "learning_rate": 1.8543664161699923e-05, + "loss": 1.7595, + "step": 3578 + }, + { + "epoch": 0.08347351511776728, + "grad_norm": 4.415511131286621, + "learning_rate": 1.8548846851515937e-05, + "loss": 1.3477, + "step": 3579 + }, + { + "epoch": 0.08349683825694519, + "grad_norm": 1.829351782798767, + "learning_rate": 1.855402954133195e-05, + "loss": 1.6647, + "step": 3580 + }, + { + "epoch": 0.08352016139612312, + "grad_norm": 1.7743425369262695, + "learning_rate": 1.8559212231147966e-05, + "loss": 1.8199, + "step": 3581 + }, + { + "epoch": 0.08354348453530103, + "grad_norm": 1.833815097808838, + "learning_rate": 1.856439492096398e-05, + "loss": 1.3673, + "step": 3582 + }, + { + "epoch": 0.08356680767447895, + "grad_norm": 1.5730009078979492, + "learning_rate": 1.8569577610779998e-05, + "loss": 1.2953, + "step": 3583 + }, + { + "epoch": 0.08359013081365686, + "grad_norm": 1.7415244579315186, + "learning_rate": 1.8574760300596012e-05, + "loss": 1.4384, + "step": 3584 + }, + { + "epoch": 0.08361345395283479, + "grad_norm": 1.6725150346755981, + "learning_rate": 1.8579942990412027e-05, + "loss": 1.2592, + "step": 3585 + }, + { + "epoch": 0.0836367770920127, + "grad_norm": 1.7817398309707642, + "learning_rate": 1.858512568022804e-05, + "loss": 1.3653, + "step": 3586 + }, + { + "epoch": 0.08366010023119062, + "grad_norm": 1.7642518281936646, + "learning_rate": 1.8590308370044055e-05, + "loss": 1.2734, + "step": 3587 + }, + { + "epoch": 0.08368342337036853, + "grad_norm": 1.9906455278396606, + "learning_rate": 1.859549105986007e-05, + "loss": 1.4007, + "step": 3588 + }, + { + "epoch": 0.08370674650954646, + "grad_norm": 1.8507903814315796, + "learning_rate": 1.8600673749676084e-05, + "loss": 1.2213, + "step": 3589 + }, + { + "epoch": 0.08373006964872437, + "grad_norm": 1.7837014198303223, + "learning_rate": 1.8605856439492098e-05, + "loss": 1.5054, + "step": 3590 + }, + { + "epoch": 0.08375339278790228, + "grad_norm": 1.8760827779769897, + "learning_rate": 1.8611039129308113e-05, + "loss": 1.4422, + "step": 3591 + }, + { + "epoch": 0.0837767159270802, + "grad_norm": 2.278310775756836, + "learning_rate": 1.8616221819124127e-05, + "loss": 1.4177, + "step": 3592 + }, + { + "epoch": 0.08380003906625812, + "grad_norm": 1.8723113536834717, + "learning_rate": 1.862140450894014e-05, + "loss": 1.3213, + "step": 3593 + }, + { + "epoch": 0.08382336220543604, + "grad_norm": 1.8388547897338867, + "learning_rate": 1.8626587198756156e-05, + "loss": 1.4389, + "step": 3594 + }, + { + "epoch": 0.08384668534461395, + "grad_norm": 2.0076687335968018, + "learning_rate": 1.863176988857217e-05, + "loss": 1.7453, + "step": 3595 + }, + { + "epoch": 0.08387000848379188, + "grad_norm": 1.573243498802185, + "learning_rate": 1.8636952578388184e-05, + "loss": 1.3174, + "step": 3596 + }, + { + "epoch": 0.08389333162296979, + "grad_norm": 1.5967497825622559, + "learning_rate": 1.86421352682042e-05, + "loss": 1.2045, + "step": 3597 + }, + { + "epoch": 0.08391665476214771, + "grad_norm": 2.108859062194824, + "learning_rate": 1.8647317958020213e-05, + "loss": 1.3171, + "step": 3598 + }, + { + "epoch": 0.08393997790132562, + "grad_norm": 1.9356852769851685, + "learning_rate": 1.865250064783623e-05, + "loss": 1.3646, + "step": 3599 + }, + { + "epoch": 0.08396330104050355, + "grad_norm": 2.345694065093994, + "learning_rate": 1.8657683337652245e-05, + "loss": 1.3063, + "step": 3600 + }, + { + "epoch": 0.08398662417968146, + "grad_norm": 1.3503670692443848, + "learning_rate": 1.866286602746826e-05, + "loss": 1.2187, + "step": 3601 + }, + { + "epoch": 0.08400994731885938, + "grad_norm": 1.4740415811538696, + "learning_rate": 1.8668048717284274e-05, + "loss": 1.4042, + "step": 3602 + }, + { + "epoch": 0.0840332704580373, + "grad_norm": 1.6068682670593262, + "learning_rate": 1.8673231407100288e-05, + "loss": 1.6119, + "step": 3603 + }, + { + "epoch": 0.08405659359721522, + "grad_norm": 1.9428056478500366, + "learning_rate": 1.8678414096916303e-05, + "loss": 1.2609, + "step": 3604 + }, + { + "epoch": 0.08407991673639313, + "grad_norm": 1.8283860683441162, + "learning_rate": 1.8683596786732317e-05, + "loss": 1.5166, + "step": 3605 + }, + { + "epoch": 0.08410323987557106, + "grad_norm": 1.4029210805892944, + "learning_rate": 1.868877947654833e-05, + "loss": 1.2865, + "step": 3606 + }, + { + "epoch": 0.08412656301474897, + "grad_norm": 1.2441627979278564, + "learning_rate": 1.8693962166364346e-05, + "loss": 1.1379, + "step": 3607 + }, + { + "epoch": 0.08414988615392689, + "grad_norm": 1.568545937538147, + "learning_rate": 1.869914485618036e-05, + "loss": 1.3603, + "step": 3608 + }, + { + "epoch": 0.0841732092931048, + "grad_norm": 1.4704279899597168, + "learning_rate": 1.8704327545996374e-05, + "loss": 1.3639, + "step": 3609 + }, + { + "epoch": 0.08419653243228273, + "grad_norm": 1.745092511177063, + "learning_rate": 1.870951023581239e-05, + "loss": 1.0921, + "step": 3610 + }, + { + "epoch": 0.08421985557146064, + "grad_norm": 1.942427396774292, + "learning_rate": 1.8714692925628403e-05, + "loss": 1.5507, + "step": 3611 + }, + { + "epoch": 0.08424317871063856, + "grad_norm": 1.6115508079528809, + "learning_rate": 1.8719875615444417e-05, + "loss": 1.4687, + "step": 3612 + }, + { + "epoch": 0.08426650184981647, + "grad_norm": 2.0204665660858154, + "learning_rate": 1.872505830526043e-05, + "loss": 1.5153, + "step": 3613 + }, + { + "epoch": 0.0842898249889944, + "grad_norm": 2.009833574295044, + "learning_rate": 1.8730240995076446e-05, + "loss": 1.4983, + "step": 3614 + }, + { + "epoch": 0.08431314812817231, + "grad_norm": 1.68868088722229, + "learning_rate": 1.873542368489246e-05, + "loss": 1.3068, + "step": 3615 + }, + { + "epoch": 0.08433647126735024, + "grad_norm": 1.6193383932113647, + "learning_rate": 1.8740606374708475e-05, + "loss": 1.2426, + "step": 3616 + }, + { + "epoch": 0.08435979440652815, + "grad_norm": 2.242953062057495, + "learning_rate": 1.874578906452449e-05, + "loss": 1.4347, + "step": 3617 + }, + { + "epoch": 0.08438311754570607, + "grad_norm": 1.6163315773010254, + "learning_rate": 1.8750971754340503e-05, + "loss": 1.4287, + "step": 3618 + }, + { + "epoch": 0.08440644068488398, + "grad_norm": 1.638716697692871, + "learning_rate": 1.8756154444156518e-05, + "loss": 1.3548, + "step": 3619 + }, + { + "epoch": 0.0844297638240619, + "grad_norm": 1.4887486696243286, + "learning_rate": 1.8761337133972532e-05, + "loss": 1.4593, + "step": 3620 + }, + { + "epoch": 0.08445308696323982, + "grad_norm": 1.63435959815979, + "learning_rate": 1.8766519823788546e-05, + "loss": 1.4153, + "step": 3621 + }, + { + "epoch": 0.08447641010241773, + "grad_norm": 1.5852644443511963, + "learning_rate": 1.877170251360456e-05, + "loss": 1.4955, + "step": 3622 + }, + { + "epoch": 0.08449973324159565, + "grad_norm": 1.7639789581298828, + "learning_rate": 1.8776885203420575e-05, + "loss": 1.6683, + "step": 3623 + }, + { + "epoch": 0.08452305638077356, + "grad_norm": 2.21766996383667, + "learning_rate": 1.878206789323659e-05, + "loss": 1.1225, + "step": 3624 + }, + { + "epoch": 0.08454637951995149, + "grad_norm": 1.663895606994629, + "learning_rate": 1.8787250583052604e-05, + "loss": 1.3233, + "step": 3625 + }, + { + "epoch": 0.0845697026591294, + "grad_norm": 1.4655165672302246, + "learning_rate": 1.879243327286862e-05, + "loss": 0.8843, + "step": 3626 + }, + { + "epoch": 0.08459302579830733, + "grad_norm": 1.8070948123931885, + "learning_rate": 1.8797615962684636e-05, + "loss": 1.3909, + "step": 3627 + }, + { + "epoch": 0.08461634893748524, + "grad_norm": 2.0682473182678223, + "learning_rate": 1.880279865250065e-05, + "loss": 1.3427, + "step": 3628 + }, + { + "epoch": 0.08463967207666316, + "grad_norm": 1.361215591430664, + "learning_rate": 1.8807981342316665e-05, + "loss": 1.1998, + "step": 3629 + }, + { + "epoch": 0.08466299521584107, + "grad_norm": 2.013563394546509, + "learning_rate": 1.881316403213268e-05, + "loss": 1.4059, + "step": 3630 + }, + { + "epoch": 0.084686318355019, + "grad_norm": 1.9156532287597656, + "learning_rate": 1.8818346721948693e-05, + "loss": 1.4498, + "step": 3631 + }, + { + "epoch": 0.08470964149419691, + "grad_norm": 1.635302186012268, + "learning_rate": 1.8823529411764708e-05, + "loss": 1.5367, + "step": 3632 + }, + { + "epoch": 0.08473296463337483, + "grad_norm": 1.9218707084655762, + "learning_rate": 1.8828712101580722e-05, + "loss": 1.4478, + "step": 3633 + }, + { + "epoch": 0.08475628777255274, + "grad_norm": 3.0237009525299072, + "learning_rate": 1.8833894791396736e-05, + "loss": 1.6139, + "step": 3634 + }, + { + "epoch": 0.08477961091173067, + "grad_norm": 2.358811140060425, + "learning_rate": 1.883907748121275e-05, + "loss": 1.5581, + "step": 3635 + }, + { + "epoch": 0.08480293405090858, + "grad_norm": 2.286472797393799, + "learning_rate": 1.8844260171028765e-05, + "loss": 1.5672, + "step": 3636 + }, + { + "epoch": 0.0848262571900865, + "grad_norm": 1.866599440574646, + "learning_rate": 1.884944286084478e-05, + "loss": 1.5127, + "step": 3637 + }, + { + "epoch": 0.08484958032926442, + "grad_norm": 2.1738510131835938, + "learning_rate": 1.8854625550660794e-05, + "loss": 1.6779, + "step": 3638 + }, + { + "epoch": 0.08487290346844234, + "grad_norm": 1.97782564163208, + "learning_rate": 1.8859808240476808e-05, + "loss": 1.4563, + "step": 3639 + }, + { + "epoch": 0.08489622660762025, + "grad_norm": 2.0570616722106934, + "learning_rate": 1.8864990930292822e-05, + "loss": 1.6798, + "step": 3640 + }, + { + "epoch": 0.08491954974679818, + "grad_norm": 1.8737928867340088, + "learning_rate": 1.8870173620108837e-05, + "loss": 1.2149, + "step": 3641 + }, + { + "epoch": 0.08494287288597609, + "grad_norm": 1.6535226106643677, + "learning_rate": 1.887535630992485e-05, + "loss": 1.2412, + "step": 3642 + }, + { + "epoch": 0.08496619602515401, + "grad_norm": 1.6835589408874512, + "learning_rate": 1.888053899974087e-05, + "loss": 1.4337, + "step": 3643 + }, + { + "epoch": 0.08498951916433192, + "grad_norm": 2.3011474609375, + "learning_rate": 1.8885721689556883e-05, + "loss": 1.1645, + "step": 3644 + }, + { + "epoch": 0.08501284230350985, + "grad_norm": 2.1055448055267334, + "learning_rate": 1.8890904379372898e-05, + "loss": 1.8118, + "step": 3645 + }, + { + "epoch": 0.08503616544268776, + "grad_norm": 1.537703514099121, + "learning_rate": 1.8896087069188912e-05, + "loss": 1.3199, + "step": 3646 + }, + { + "epoch": 0.08505948858186567, + "grad_norm": 1.3928148746490479, + "learning_rate": 1.8901269759004926e-05, + "loss": 0.9513, + "step": 3647 + }, + { + "epoch": 0.0850828117210436, + "grad_norm": 1.7757236957550049, + "learning_rate": 1.890645244882094e-05, + "loss": 1.4446, + "step": 3648 + }, + { + "epoch": 0.0851061348602215, + "grad_norm": 1.6766371726989746, + "learning_rate": 1.8911635138636955e-05, + "loss": 1.2737, + "step": 3649 + }, + { + "epoch": 0.08512945799939943, + "grad_norm": 1.717544674873352, + "learning_rate": 1.891681782845297e-05, + "loss": 1.3209, + "step": 3650 + }, + { + "epoch": 0.08515278113857734, + "grad_norm": 1.769384741783142, + "learning_rate": 1.8922000518268984e-05, + "loss": 1.3914, + "step": 3651 + }, + { + "epoch": 0.08517610427775527, + "grad_norm": 2.045825481414795, + "learning_rate": 1.8927183208084998e-05, + "loss": 1.5438, + "step": 3652 + }, + { + "epoch": 0.08519942741693318, + "grad_norm": 1.7861770391464233, + "learning_rate": 1.8932365897901012e-05, + "loss": 1.6111, + "step": 3653 + }, + { + "epoch": 0.0852227505561111, + "grad_norm": 2.143268585205078, + "learning_rate": 1.8937548587717027e-05, + "loss": 1.4671, + "step": 3654 + }, + { + "epoch": 0.08524607369528901, + "grad_norm": 1.7686920166015625, + "learning_rate": 1.894273127753304e-05, + "loss": 1.5865, + "step": 3655 + }, + { + "epoch": 0.08526939683446694, + "grad_norm": 1.485282063484192, + "learning_rate": 1.8947913967349055e-05, + "loss": 1.5447, + "step": 3656 + }, + { + "epoch": 0.08529271997364485, + "grad_norm": 1.9272282123565674, + "learning_rate": 1.895309665716507e-05, + "loss": 1.465, + "step": 3657 + }, + { + "epoch": 0.08531604311282277, + "grad_norm": 2.2373533248901367, + "learning_rate": 1.8958279346981084e-05, + "loss": 1.5705, + "step": 3658 + }, + { + "epoch": 0.08533936625200068, + "grad_norm": 2.107943296432495, + "learning_rate": 1.8963462036797102e-05, + "loss": 1.8744, + "step": 3659 + }, + { + "epoch": 0.08536268939117861, + "grad_norm": 1.6185489892959595, + "learning_rate": 1.8968644726613116e-05, + "loss": 1.3097, + "step": 3660 + }, + { + "epoch": 0.08538601253035652, + "grad_norm": 1.7128583192825317, + "learning_rate": 1.897382741642913e-05, + "loss": 1.6936, + "step": 3661 + }, + { + "epoch": 0.08540933566953444, + "grad_norm": 1.5283836126327515, + "learning_rate": 1.8979010106245145e-05, + "loss": 1.2054, + "step": 3662 + }, + { + "epoch": 0.08543265880871236, + "grad_norm": 1.9866423606872559, + "learning_rate": 1.898419279606116e-05, + "loss": 1.6309, + "step": 3663 + }, + { + "epoch": 0.08545598194789028, + "grad_norm": 1.8106653690338135, + "learning_rate": 1.8989375485877173e-05, + "loss": 1.4622, + "step": 3664 + }, + { + "epoch": 0.08547930508706819, + "grad_norm": 1.4780253171920776, + "learning_rate": 1.8994558175693188e-05, + "loss": 1.4476, + "step": 3665 + }, + { + "epoch": 0.08550262822624612, + "grad_norm": 2.043191432952881, + "learning_rate": 1.8999740865509202e-05, + "loss": 1.4721, + "step": 3666 + }, + { + "epoch": 0.08552595136542403, + "grad_norm": 1.4947582483291626, + "learning_rate": 1.9004923555325213e-05, + "loss": 1.1956, + "step": 3667 + }, + { + "epoch": 0.08554927450460195, + "grad_norm": 1.7940278053283691, + "learning_rate": 1.9010106245141227e-05, + "loss": 1.0641, + "step": 3668 + }, + { + "epoch": 0.08557259764377986, + "grad_norm": 1.7259446382522583, + "learning_rate": 1.9015288934957242e-05, + "loss": 1.3405, + "step": 3669 + }, + { + "epoch": 0.08559592078295779, + "grad_norm": 1.8191657066345215, + "learning_rate": 1.902047162477326e-05, + "loss": 1.3307, + "step": 3670 + }, + { + "epoch": 0.0856192439221357, + "grad_norm": 1.7000370025634766, + "learning_rate": 1.9025654314589274e-05, + "loss": 1.1367, + "step": 3671 + }, + { + "epoch": 0.08564256706131362, + "grad_norm": 1.889972448348999, + "learning_rate": 1.9030837004405288e-05, + "loss": 1.415, + "step": 3672 + }, + { + "epoch": 0.08566589020049153, + "grad_norm": 2.0806078910827637, + "learning_rate": 1.9036019694221303e-05, + "loss": 1.3672, + "step": 3673 + }, + { + "epoch": 0.08568921333966946, + "grad_norm": 2.796506881713867, + "learning_rate": 1.9041202384037317e-05, + "loss": 1.7955, + "step": 3674 + }, + { + "epoch": 0.08571253647884737, + "grad_norm": 2.1324260234832764, + "learning_rate": 1.904638507385333e-05, + "loss": 1.3036, + "step": 3675 + }, + { + "epoch": 0.08573585961802528, + "grad_norm": 2.0976030826568604, + "learning_rate": 1.9051567763669346e-05, + "loss": 1.5918, + "step": 3676 + }, + { + "epoch": 0.0857591827572032, + "grad_norm": 1.7113218307495117, + "learning_rate": 1.905675045348536e-05, + "loss": 1.4825, + "step": 3677 + }, + { + "epoch": 0.08578250589638112, + "grad_norm": 1.7739856243133545, + "learning_rate": 1.9061933143301374e-05, + "loss": 1.5581, + "step": 3678 + }, + { + "epoch": 0.08580582903555904, + "grad_norm": 1.7396571636199951, + "learning_rate": 1.906711583311739e-05, + "loss": 1.1139, + "step": 3679 + }, + { + "epoch": 0.08582915217473695, + "grad_norm": 1.5428905487060547, + "learning_rate": 1.9072298522933403e-05, + "loss": 1.4518, + "step": 3680 + }, + { + "epoch": 0.08585247531391488, + "grad_norm": 1.8691036701202393, + "learning_rate": 1.9077481212749417e-05, + "loss": 1.4603, + "step": 3681 + }, + { + "epoch": 0.08587579845309279, + "grad_norm": 1.3939930200576782, + "learning_rate": 1.9082663902565432e-05, + "loss": 1.5307, + "step": 3682 + }, + { + "epoch": 0.08589912159227071, + "grad_norm": 1.632413625717163, + "learning_rate": 1.9087846592381446e-05, + "loss": 1.3989, + "step": 3683 + }, + { + "epoch": 0.08592244473144862, + "grad_norm": 2.055506706237793, + "learning_rate": 1.909302928219746e-05, + "loss": 1.9212, + "step": 3684 + }, + { + "epoch": 0.08594576787062655, + "grad_norm": 1.6543344259262085, + "learning_rate": 1.9098211972013475e-05, + "loss": 1.3353, + "step": 3685 + }, + { + "epoch": 0.08596909100980446, + "grad_norm": 1.8599748611450195, + "learning_rate": 1.910339466182949e-05, + "loss": 1.3546, + "step": 3686 + }, + { + "epoch": 0.08599241414898239, + "grad_norm": 1.7803127765655518, + "learning_rate": 1.9108577351645507e-05, + "loss": 1.5504, + "step": 3687 + }, + { + "epoch": 0.0860157372881603, + "grad_norm": 1.5186231136322021, + "learning_rate": 1.911376004146152e-05, + "loss": 1.0052, + "step": 3688 + }, + { + "epoch": 0.08603906042733822, + "grad_norm": 1.7595012187957764, + "learning_rate": 1.9118942731277536e-05, + "loss": 1.2889, + "step": 3689 + }, + { + "epoch": 0.08606238356651613, + "grad_norm": 1.9913487434387207, + "learning_rate": 1.912412542109355e-05, + "loss": 1.4898, + "step": 3690 + }, + { + "epoch": 0.08608570670569406, + "grad_norm": 1.6132420301437378, + "learning_rate": 1.9129308110909564e-05, + "loss": 1.3643, + "step": 3691 + }, + { + "epoch": 0.08610902984487197, + "grad_norm": 1.6206722259521484, + "learning_rate": 1.913449080072558e-05, + "loss": 1.4306, + "step": 3692 + }, + { + "epoch": 0.08613235298404989, + "grad_norm": 1.7904775142669678, + "learning_rate": 1.9139673490541593e-05, + "loss": 1.2245, + "step": 3693 + }, + { + "epoch": 0.0861556761232278, + "grad_norm": 1.7304673194885254, + "learning_rate": 1.9144856180357607e-05, + "loss": 1.5856, + "step": 3694 + }, + { + "epoch": 0.08617899926240573, + "grad_norm": 1.8599059581756592, + "learning_rate": 1.915003887017362e-05, + "loss": 1.6862, + "step": 3695 + }, + { + "epoch": 0.08620232240158364, + "grad_norm": 1.7575186491012573, + "learning_rate": 1.9155221559989636e-05, + "loss": 1.2792, + "step": 3696 + }, + { + "epoch": 0.08622564554076156, + "grad_norm": 2.756476640701294, + "learning_rate": 1.916040424980565e-05, + "loss": 1.6286, + "step": 3697 + }, + { + "epoch": 0.08624896867993948, + "grad_norm": 1.9986953735351562, + "learning_rate": 1.9165586939621665e-05, + "loss": 1.8472, + "step": 3698 + }, + { + "epoch": 0.0862722918191174, + "grad_norm": 1.4760515689849854, + "learning_rate": 1.917076962943768e-05, + "loss": 1.3095, + "step": 3699 + }, + { + "epoch": 0.08629561495829531, + "grad_norm": 1.581083059310913, + "learning_rate": 1.9175952319253693e-05, + "loss": 1.6524, + "step": 3700 + }, + { + "epoch": 0.08631893809747324, + "grad_norm": 2.1014351844787598, + "learning_rate": 1.9181135009069708e-05, + "loss": 1.4966, + "step": 3701 + }, + { + "epoch": 0.08634226123665115, + "grad_norm": 1.7185317277908325, + "learning_rate": 1.9186317698885722e-05, + "loss": 1.5509, + "step": 3702 + }, + { + "epoch": 0.08636558437582907, + "grad_norm": 1.9920899868011475, + "learning_rate": 1.919150038870174e-05, + "loss": 1.6712, + "step": 3703 + }, + { + "epoch": 0.08638890751500698, + "grad_norm": 1.8467237949371338, + "learning_rate": 1.9196683078517754e-05, + "loss": 1.2693, + "step": 3704 + }, + { + "epoch": 0.0864122306541849, + "grad_norm": 1.5148426294326782, + "learning_rate": 1.920186576833377e-05, + "loss": 1.5189, + "step": 3705 + }, + { + "epoch": 0.08643555379336282, + "grad_norm": 1.7767192125320435, + "learning_rate": 1.9207048458149783e-05, + "loss": 1.2703, + "step": 3706 + }, + { + "epoch": 0.08645887693254073, + "grad_norm": 1.9807710647583008, + "learning_rate": 1.9212231147965797e-05, + "loss": 1.5145, + "step": 3707 + }, + { + "epoch": 0.08648220007171865, + "grad_norm": 1.7413864135742188, + "learning_rate": 1.921741383778181e-05, + "loss": 1.6929, + "step": 3708 + }, + { + "epoch": 0.08650552321089656, + "grad_norm": 1.85059654712677, + "learning_rate": 1.9222596527597826e-05, + "loss": 1.3701, + "step": 3709 + }, + { + "epoch": 0.08652884635007449, + "grad_norm": 1.7285747528076172, + "learning_rate": 1.922777921741384e-05, + "loss": 1.6005, + "step": 3710 + }, + { + "epoch": 0.0865521694892524, + "grad_norm": 2.677480697631836, + "learning_rate": 1.9232961907229855e-05, + "loss": 1.4945, + "step": 3711 + }, + { + "epoch": 0.08657549262843033, + "grad_norm": 2.22979998588562, + "learning_rate": 1.923814459704587e-05, + "loss": 1.6705, + "step": 3712 + }, + { + "epoch": 0.08659881576760824, + "grad_norm": 1.4900656938552856, + "learning_rate": 1.9243327286861883e-05, + "loss": 1.2837, + "step": 3713 + }, + { + "epoch": 0.08662213890678616, + "grad_norm": 1.2594544887542725, + "learning_rate": 1.9248509976677898e-05, + "loss": 1.1585, + "step": 3714 + }, + { + "epoch": 0.08664546204596407, + "grad_norm": 2.0319337844848633, + "learning_rate": 1.9253692666493912e-05, + "loss": 1.6878, + "step": 3715 + }, + { + "epoch": 0.086668785185142, + "grad_norm": 2.0499188899993896, + "learning_rate": 1.9258875356309926e-05, + "loss": 1.4759, + "step": 3716 + }, + { + "epoch": 0.08669210832431991, + "grad_norm": 2.2834079265594482, + "learning_rate": 1.926405804612594e-05, + "loss": 1.5455, + "step": 3717 + }, + { + "epoch": 0.08671543146349783, + "grad_norm": 1.8450220823287964, + "learning_rate": 1.9269240735941955e-05, + "loss": 1.5251, + "step": 3718 + }, + { + "epoch": 0.08673875460267574, + "grad_norm": 2.1184442043304443, + "learning_rate": 1.927442342575797e-05, + "loss": 1.7317, + "step": 3719 + }, + { + "epoch": 0.08676207774185367, + "grad_norm": 1.4140855073928833, + "learning_rate": 1.9279606115573987e-05, + "loss": 1.1828, + "step": 3720 + }, + { + "epoch": 0.08678540088103158, + "grad_norm": 1.673884630203247, + "learning_rate": 1.928478880539e-05, + "loss": 1.7712, + "step": 3721 + }, + { + "epoch": 0.0868087240202095, + "grad_norm": 1.5101699829101562, + "learning_rate": 1.9289971495206012e-05, + "loss": 1.4416, + "step": 3722 + }, + { + "epoch": 0.08683204715938742, + "grad_norm": 1.884077787399292, + "learning_rate": 1.9295154185022027e-05, + "loss": 1.1231, + "step": 3723 + }, + { + "epoch": 0.08685537029856534, + "grad_norm": 1.7956326007843018, + "learning_rate": 1.930033687483804e-05, + "loss": 1.9195, + "step": 3724 + }, + { + "epoch": 0.08687869343774325, + "grad_norm": 2.028334379196167, + "learning_rate": 1.9305519564654055e-05, + "loss": 1.6812, + "step": 3725 + }, + { + "epoch": 0.08690201657692118, + "grad_norm": 1.541755199432373, + "learning_rate": 1.931070225447007e-05, + "loss": 1.5305, + "step": 3726 + }, + { + "epoch": 0.08692533971609909, + "grad_norm": 2.167618989944458, + "learning_rate": 1.9315884944286084e-05, + "loss": 1.5456, + "step": 3727 + }, + { + "epoch": 0.08694866285527701, + "grad_norm": 1.8667240142822266, + "learning_rate": 1.93210676341021e-05, + "loss": 1.646, + "step": 3728 + }, + { + "epoch": 0.08697198599445492, + "grad_norm": 2.162578821182251, + "learning_rate": 1.9326250323918113e-05, + "loss": 1.53, + "step": 3729 + }, + { + "epoch": 0.08699530913363285, + "grad_norm": 1.4681026935577393, + "learning_rate": 1.9331433013734127e-05, + "loss": 1.4004, + "step": 3730 + }, + { + "epoch": 0.08701863227281076, + "grad_norm": 1.5720863342285156, + "learning_rate": 1.9336615703550145e-05, + "loss": 1.2065, + "step": 3731 + }, + { + "epoch": 0.08704195541198868, + "grad_norm": 1.8887262344360352, + "learning_rate": 1.934179839336616e-05, + "loss": 1.516, + "step": 3732 + }, + { + "epoch": 0.0870652785511666, + "grad_norm": 1.582972764968872, + "learning_rate": 1.9346981083182174e-05, + "loss": 1.2518, + "step": 3733 + }, + { + "epoch": 0.0870886016903445, + "grad_norm": 2.031712055206299, + "learning_rate": 1.9352163772998188e-05, + "loss": 1.2827, + "step": 3734 + }, + { + "epoch": 0.08711192482952243, + "grad_norm": 1.9298467636108398, + "learning_rate": 1.9357346462814202e-05, + "loss": 1.52, + "step": 3735 + }, + { + "epoch": 0.08713524796870034, + "grad_norm": 1.7419377565383911, + "learning_rate": 1.9362529152630217e-05, + "loss": 1.192, + "step": 3736 + }, + { + "epoch": 0.08715857110787827, + "grad_norm": 2.0010645389556885, + "learning_rate": 1.936771184244623e-05, + "loss": 1.3094, + "step": 3737 + }, + { + "epoch": 0.08718189424705618, + "grad_norm": 1.4915006160736084, + "learning_rate": 1.9372894532262245e-05, + "loss": 1.3347, + "step": 3738 + }, + { + "epoch": 0.0872052173862341, + "grad_norm": 1.9095317125320435, + "learning_rate": 1.937807722207826e-05, + "loss": 1.6573, + "step": 3739 + }, + { + "epoch": 0.08722854052541201, + "grad_norm": 1.959438443183899, + "learning_rate": 1.9383259911894274e-05, + "loss": 1.2514, + "step": 3740 + }, + { + "epoch": 0.08725186366458994, + "grad_norm": 1.4582146406173706, + "learning_rate": 1.938844260171029e-05, + "loss": 1.2887, + "step": 3741 + }, + { + "epoch": 0.08727518680376785, + "grad_norm": 1.5818802118301392, + "learning_rate": 1.9393625291526303e-05, + "loss": 1.1757, + "step": 3742 + }, + { + "epoch": 0.08729850994294577, + "grad_norm": 4.244632244110107, + "learning_rate": 1.9398807981342317e-05, + "loss": 1.6171, + "step": 3743 + }, + { + "epoch": 0.08732183308212368, + "grad_norm": 1.8370835781097412, + "learning_rate": 1.940399067115833e-05, + "loss": 1.2676, + "step": 3744 + }, + { + "epoch": 0.08734515622130161, + "grad_norm": 1.8897545337677002, + "learning_rate": 1.9409173360974346e-05, + "loss": 1.6854, + "step": 3745 + }, + { + "epoch": 0.08736847936047952, + "grad_norm": 1.985708475112915, + "learning_rate": 1.941435605079036e-05, + "loss": 1.3562, + "step": 3746 + }, + { + "epoch": 0.08739180249965744, + "grad_norm": 1.682483434677124, + "learning_rate": 1.9419538740606378e-05, + "loss": 1.375, + "step": 3747 + }, + { + "epoch": 0.08741512563883536, + "grad_norm": 1.743619441986084, + "learning_rate": 1.9424721430422392e-05, + "loss": 1.23, + "step": 3748 + }, + { + "epoch": 0.08743844877801328, + "grad_norm": 1.8465096950531006, + "learning_rate": 1.9429904120238407e-05, + "loss": 1.4852, + "step": 3749 + }, + { + "epoch": 0.08746177191719119, + "grad_norm": 1.4261040687561035, + "learning_rate": 1.943508681005442e-05, + "loss": 1.1589, + "step": 3750 + }, + { + "epoch": 0.08748509505636912, + "grad_norm": 1.8200268745422363, + "learning_rate": 1.9440269499870435e-05, + "loss": 1.5185, + "step": 3751 + }, + { + "epoch": 0.08750841819554703, + "grad_norm": 1.8040040731430054, + "learning_rate": 1.944545218968645e-05, + "loss": 1.2195, + "step": 3752 + }, + { + "epoch": 0.08753174133472495, + "grad_norm": 2.745818614959717, + "learning_rate": 1.9450634879502464e-05, + "loss": 1.5439, + "step": 3753 + }, + { + "epoch": 0.08755506447390286, + "grad_norm": 2.6260766983032227, + "learning_rate": 1.9455817569318478e-05, + "loss": 1.4478, + "step": 3754 + }, + { + "epoch": 0.08757838761308079, + "grad_norm": 1.6455649137496948, + "learning_rate": 1.9461000259134493e-05, + "loss": 1.5318, + "step": 3755 + }, + { + "epoch": 0.0876017107522587, + "grad_norm": 1.8115977048873901, + "learning_rate": 1.9466182948950507e-05, + "loss": 1.4069, + "step": 3756 + }, + { + "epoch": 0.08762503389143662, + "grad_norm": 2.10554575920105, + "learning_rate": 1.947136563876652e-05, + "loss": 1.8464, + "step": 3757 + }, + { + "epoch": 0.08764835703061453, + "grad_norm": 1.4829670190811157, + "learning_rate": 1.9476548328582536e-05, + "loss": 1.0982, + "step": 3758 + }, + { + "epoch": 0.08767168016979246, + "grad_norm": 1.6490533351898193, + "learning_rate": 1.948173101839855e-05, + "loss": 1.2205, + "step": 3759 + }, + { + "epoch": 0.08769500330897037, + "grad_norm": 1.7997782230377197, + "learning_rate": 1.9486913708214564e-05, + "loss": 1.6498, + "step": 3760 + }, + { + "epoch": 0.08771832644814828, + "grad_norm": 1.3423871994018555, + "learning_rate": 1.949209639803058e-05, + "loss": 1.2353, + "step": 3761 + }, + { + "epoch": 0.0877416495873262, + "grad_norm": 1.9823672771453857, + "learning_rate": 1.9497279087846593e-05, + "loss": 1.5512, + "step": 3762 + }, + { + "epoch": 0.08776497272650412, + "grad_norm": 1.8137081861495972, + "learning_rate": 1.9502461777662607e-05, + "loss": 1.1692, + "step": 3763 + }, + { + "epoch": 0.08778829586568204, + "grad_norm": 1.948642373085022, + "learning_rate": 1.9507644467478625e-05, + "loss": 1.1127, + "step": 3764 + }, + { + "epoch": 0.08781161900485995, + "grad_norm": 1.5273735523223877, + "learning_rate": 1.951282715729464e-05, + "loss": 1.258, + "step": 3765 + }, + { + "epoch": 0.08783494214403788, + "grad_norm": 1.8528945446014404, + "learning_rate": 1.9518009847110654e-05, + "loss": 1.283, + "step": 3766 + }, + { + "epoch": 0.08785826528321579, + "grad_norm": 2.193530559539795, + "learning_rate": 1.9523192536926668e-05, + "loss": 1.385, + "step": 3767 + }, + { + "epoch": 0.08788158842239371, + "grad_norm": 1.6347289085388184, + "learning_rate": 1.9528375226742683e-05, + "loss": 1.4411, + "step": 3768 + }, + { + "epoch": 0.08790491156157162, + "grad_norm": 1.4878569841384888, + "learning_rate": 1.9533557916558697e-05, + "loss": 1.338, + "step": 3769 + }, + { + "epoch": 0.08792823470074955, + "grad_norm": 1.523554801940918, + "learning_rate": 1.953874060637471e-05, + "loss": 1.4254, + "step": 3770 + }, + { + "epoch": 0.08795155783992746, + "grad_norm": 1.8349013328552246, + "learning_rate": 1.9543923296190726e-05, + "loss": 1.2373, + "step": 3771 + }, + { + "epoch": 0.08797488097910539, + "grad_norm": 1.794135332107544, + "learning_rate": 1.954910598600674e-05, + "loss": 1.2954, + "step": 3772 + }, + { + "epoch": 0.0879982041182833, + "grad_norm": 3.187321424484253, + "learning_rate": 1.9554288675822754e-05, + "loss": 1.0685, + "step": 3773 + }, + { + "epoch": 0.08802152725746122, + "grad_norm": 1.8911279439926147, + "learning_rate": 1.955947136563877e-05, + "loss": 1.4991, + "step": 3774 + }, + { + "epoch": 0.08804485039663913, + "grad_norm": 1.707911729812622, + "learning_rate": 1.9564654055454783e-05, + "loss": 1.6678, + "step": 3775 + }, + { + "epoch": 0.08806817353581706, + "grad_norm": 2.3012478351593018, + "learning_rate": 1.9569836745270797e-05, + "loss": 1.4112, + "step": 3776 + }, + { + "epoch": 0.08809149667499497, + "grad_norm": 1.5143091678619385, + "learning_rate": 1.957501943508681e-05, + "loss": 1.4209, + "step": 3777 + }, + { + "epoch": 0.08811481981417289, + "grad_norm": 1.869060754776001, + "learning_rate": 1.9580202124902826e-05, + "loss": 1.4921, + "step": 3778 + }, + { + "epoch": 0.0881381429533508, + "grad_norm": 2.0710952281951904, + "learning_rate": 1.958538481471884e-05, + "loss": 1.556, + "step": 3779 + }, + { + "epoch": 0.08816146609252873, + "grad_norm": 2.3364593982696533, + "learning_rate": 1.9590567504534855e-05, + "loss": 1.3446, + "step": 3780 + }, + { + "epoch": 0.08818478923170664, + "grad_norm": 1.7574700117111206, + "learning_rate": 1.959575019435087e-05, + "loss": 1.4665, + "step": 3781 + }, + { + "epoch": 0.08820811237088456, + "grad_norm": 1.8747336864471436, + "learning_rate": 1.9600932884166883e-05, + "loss": 1.3774, + "step": 3782 + }, + { + "epoch": 0.08823143551006248, + "grad_norm": 1.802948236465454, + "learning_rate": 1.9606115573982898e-05, + "loss": 1.3237, + "step": 3783 + }, + { + "epoch": 0.0882547586492404, + "grad_norm": 1.880509376525879, + "learning_rate": 1.9611298263798912e-05, + "loss": 1.6404, + "step": 3784 + }, + { + "epoch": 0.08827808178841831, + "grad_norm": 1.7206971645355225, + "learning_rate": 1.9616480953614926e-05, + "loss": 1.2631, + "step": 3785 + }, + { + "epoch": 0.08830140492759624, + "grad_norm": 2.0995943546295166, + "learning_rate": 1.962166364343094e-05, + "loss": 1.283, + "step": 3786 + }, + { + "epoch": 0.08832472806677415, + "grad_norm": 2.124122142791748, + "learning_rate": 1.9626846333246955e-05, + "loss": 1.2731, + "step": 3787 + }, + { + "epoch": 0.08834805120595207, + "grad_norm": 1.952669382095337, + "learning_rate": 1.963202902306297e-05, + "loss": 1.4505, + "step": 3788 + }, + { + "epoch": 0.08837137434512998, + "grad_norm": 1.7214585542678833, + "learning_rate": 1.9637211712878984e-05, + "loss": 1.7156, + "step": 3789 + }, + { + "epoch": 0.0883946974843079, + "grad_norm": 1.4622641801834106, + "learning_rate": 1.9642394402694998e-05, + "loss": 1.6118, + "step": 3790 + }, + { + "epoch": 0.08841802062348582, + "grad_norm": 1.7372667789459229, + "learning_rate": 1.9647577092511016e-05, + "loss": 1.5947, + "step": 3791 + }, + { + "epoch": 0.08844134376266373, + "grad_norm": 2.0285115242004395, + "learning_rate": 1.965275978232703e-05, + "loss": 1.8061, + "step": 3792 + }, + { + "epoch": 0.08846466690184165, + "grad_norm": 1.7647212743759155, + "learning_rate": 1.9657942472143045e-05, + "loss": 1.2375, + "step": 3793 + }, + { + "epoch": 0.08848799004101957, + "grad_norm": 2.0434303283691406, + "learning_rate": 1.966312516195906e-05, + "loss": 1.4688, + "step": 3794 + }, + { + "epoch": 0.08851131318019749, + "grad_norm": 2.1963582038879395, + "learning_rate": 1.9668307851775073e-05, + "loss": 1.3006, + "step": 3795 + }, + { + "epoch": 0.0885346363193754, + "grad_norm": 1.637125849723816, + "learning_rate": 1.9673490541591088e-05, + "loss": 1.4169, + "step": 3796 + }, + { + "epoch": 0.08855795945855333, + "grad_norm": 1.9857791662216187, + "learning_rate": 1.9678673231407102e-05, + "loss": 1.2184, + "step": 3797 + }, + { + "epoch": 0.08858128259773124, + "grad_norm": 1.8531452417373657, + "learning_rate": 1.9683855921223116e-05, + "loss": 1.6313, + "step": 3798 + }, + { + "epoch": 0.08860460573690916, + "grad_norm": 1.427011489868164, + "learning_rate": 1.968903861103913e-05, + "loss": 1.2982, + "step": 3799 + }, + { + "epoch": 0.08862792887608707, + "grad_norm": 2.0080931186676025, + "learning_rate": 1.9694221300855145e-05, + "loss": 1.0828, + "step": 3800 + }, + { + "epoch": 0.088651252015265, + "grad_norm": 1.6396167278289795, + "learning_rate": 1.969940399067116e-05, + "loss": 1.5011, + "step": 3801 + }, + { + "epoch": 0.08867457515444291, + "grad_norm": 1.9646265506744385, + "learning_rate": 1.9704586680487174e-05, + "loss": 1.7735, + "step": 3802 + }, + { + "epoch": 0.08869789829362083, + "grad_norm": 1.7817018032073975, + "learning_rate": 1.9709769370303188e-05, + "loss": 1.457, + "step": 3803 + }, + { + "epoch": 0.08872122143279874, + "grad_norm": 2.1395010948181152, + "learning_rate": 1.9714952060119202e-05, + "loss": 1.569, + "step": 3804 + }, + { + "epoch": 0.08874454457197667, + "grad_norm": 1.8735876083374023, + "learning_rate": 1.9720134749935217e-05, + "loss": 1.3087, + "step": 3805 + }, + { + "epoch": 0.08876786771115458, + "grad_norm": 1.593678593635559, + "learning_rate": 1.972531743975123e-05, + "loss": 1.6819, + "step": 3806 + }, + { + "epoch": 0.0887911908503325, + "grad_norm": 1.4559507369995117, + "learning_rate": 1.973050012956725e-05, + "loss": 1.6107, + "step": 3807 + }, + { + "epoch": 0.08881451398951042, + "grad_norm": 1.3212164640426636, + "learning_rate": 1.9735682819383263e-05, + "loss": 1.3423, + "step": 3808 + }, + { + "epoch": 0.08883783712868834, + "grad_norm": 1.7826329469680786, + "learning_rate": 1.9740865509199278e-05, + "loss": 1.3699, + "step": 3809 + }, + { + "epoch": 0.08886116026786625, + "grad_norm": 1.4961615800857544, + "learning_rate": 1.9746048199015292e-05, + "loss": 1.3433, + "step": 3810 + }, + { + "epoch": 0.08888448340704418, + "grad_norm": 1.5190882682800293, + "learning_rate": 1.9751230888831306e-05, + "loss": 1.2961, + "step": 3811 + }, + { + "epoch": 0.08890780654622209, + "grad_norm": 1.6436294317245483, + "learning_rate": 1.975641357864732e-05, + "loss": 1.2565, + "step": 3812 + }, + { + "epoch": 0.08893112968540001, + "grad_norm": 2.6425702571868896, + "learning_rate": 1.9761596268463335e-05, + "loss": 1.4867, + "step": 3813 + }, + { + "epoch": 0.08895445282457792, + "grad_norm": 1.6602445840835571, + "learning_rate": 1.976677895827935e-05, + "loss": 1.3292, + "step": 3814 + }, + { + "epoch": 0.08897777596375585, + "grad_norm": 2.169207811355591, + "learning_rate": 1.9771961648095364e-05, + "loss": 1.6807, + "step": 3815 + }, + { + "epoch": 0.08900109910293376, + "grad_norm": 1.6809145212173462, + "learning_rate": 1.9777144337911378e-05, + "loss": 1.4716, + "step": 3816 + }, + { + "epoch": 0.08902442224211168, + "grad_norm": 1.908535122871399, + "learning_rate": 1.9782327027727392e-05, + "loss": 1.3085, + "step": 3817 + }, + { + "epoch": 0.0890477453812896, + "grad_norm": 2.0437328815460205, + "learning_rate": 1.9787509717543407e-05, + "loss": 1.4063, + "step": 3818 + }, + { + "epoch": 0.0890710685204675, + "grad_norm": 1.5940383672714233, + "learning_rate": 1.979269240735942e-05, + "loss": 1.1642, + "step": 3819 + }, + { + "epoch": 0.08909439165964543, + "grad_norm": 2.0657832622528076, + "learning_rate": 1.9797875097175435e-05, + "loss": 1.329, + "step": 3820 + }, + { + "epoch": 0.08911771479882334, + "grad_norm": 1.712106704711914, + "learning_rate": 1.980305778699145e-05, + "loss": 1.5452, + "step": 3821 + }, + { + "epoch": 0.08914103793800127, + "grad_norm": 1.7552653551101685, + "learning_rate": 1.9808240476807464e-05, + "loss": 1.2633, + "step": 3822 + }, + { + "epoch": 0.08916436107717918, + "grad_norm": 2.0012056827545166, + "learning_rate": 1.981342316662348e-05, + "loss": 1.4567, + "step": 3823 + }, + { + "epoch": 0.0891876842163571, + "grad_norm": 1.6985337734222412, + "learning_rate": 1.9818605856439496e-05, + "loss": 1.496, + "step": 3824 + }, + { + "epoch": 0.08921100735553501, + "grad_norm": 1.4563976526260376, + "learning_rate": 1.982378854625551e-05, + "loss": 1.114, + "step": 3825 + }, + { + "epoch": 0.08923433049471294, + "grad_norm": 1.8010538816452026, + "learning_rate": 1.9828971236071525e-05, + "loss": 1.6439, + "step": 3826 + }, + { + "epoch": 0.08925765363389085, + "grad_norm": 1.8285574913024902, + "learning_rate": 1.983415392588754e-05, + "loss": 1.1611, + "step": 3827 + }, + { + "epoch": 0.08928097677306877, + "grad_norm": 1.7888448238372803, + "learning_rate": 1.9839336615703553e-05, + "loss": 1.4124, + "step": 3828 + }, + { + "epoch": 0.08930429991224668, + "grad_norm": 1.5003128051757812, + "learning_rate": 1.9844519305519568e-05, + "loss": 1.2427, + "step": 3829 + }, + { + "epoch": 0.08932762305142461, + "grad_norm": 2.063203811645508, + "learning_rate": 1.9849701995335582e-05, + "loss": 1.5464, + "step": 3830 + }, + { + "epoch": 0.08935094619060252, + "grad_norm": 1.610795497894287, + "learning_rate": 1.9854884685151597e-05, + "loss": 1.2449, + "step": 3831 + }, + { + "epoch": 0.08937426932978045, + "grad_norm": 1.6565496921539307, + "learning_rate": 1.9860067374967607e-05, + "loss": 1.4032, + "step": 3832 + }, + { + "epoch": 0.08939759246895836, + "grad_norm": 3.492591142654419, + "learning_rate": 1.9865250064783622e-05, + "loss": 1.4794, + "step": 3833 + }, + { + "epoch": 0.08942091560813628, + "grad_norm": 1.7358639240264893, + "learning_rate": 1.9870432754599636e-05, + "loss": 1.6205, + "step": 3834 + }, + { + "epoch": 0.08944423874731419, + "grad_norm": 1.711347222328186, + "learning_rate": 1.9875615444415654e-05, + "loss": 1.8102, + "step": 3835 + }, + { + "epoch": 0.08946756188649212, + "grad_norm": 1.66538667678833, + "learning_rate": 1.9880798134231668e-05, + "loss": 1.3878, + "step": 3836 + }, + { + "epoch": 0.08949088502567003, + "grad_norm": 2.1874492168426514, + "learning_rate": 1.9885980824047683e-05, + "loss": 1.4028, + "step": 3837 + }, + { + "epoch": 0.08951420816484795, + "grad_norm": 1.9095275402069092, + "learning_rate": 1.9891163513863697e-05, + "loss": 1.2287, + "step": 3838 + }, + { + "epoch": 0.08953753130402586, + "grad_norm": 1.9796547889709473, + "learning_rate": 1.989634620367971e-05, + "loss": 1.1588, + "step": 3839 + }, + { + "epoch": 0.08956085444320379, + "grad_norm": 1.461793065071106, + "learning_rate": 1.9901528893495726e-05, + "loss": 0.8892, + "step": 3840 + }, + { + "epoch": 0.0895841775823817, + "grad_norm": 1.6312065124511719, + "learning_rate": 1.990671158331174e-05, + "loss": 1.5716, + "step": 3841 + }, + { + "epoch": 0.08960750072155962, + "grad_norm": 1.985102653503418, + "learning_rate": 1.9911894273127754e-05, + "loss": 1.387, + "step": 3842 + }, + { + "epoch": 0.08963082386073754, + "grad_norm": 1.6724156141281128, + "learning_rate": 1.991707696294377e-05, + "loss": 1.48, + "step": 3843 + }, + { + "epoch": 0.08965414699991546, + "grad_norm": 2.1611175537109375, + "learning_rate": 1.9922259652759783e-05, + "loss": 1.586, + "step": 3844 + }, + { + "epoch": 0.08967747013909337, + "grad_norm": 1.7135130167007446, + "learning_rate": 1.9927442342575797e-05, + "loss": 1.3561, + "step": 3845 + }, + { + "epoch": 0.0897007932782713, + "grad_norm": 1.9912513494491577, + "learning_rate": 1.9932625032391812e-05, + "loss": 1.4008, + "step": 3846 + }, + { + "epoch": 0.0897241164174492, + "grad_norm": 1.6305721998214722, + "learning_rate": 1.9937807722207826e-05, + "loss": 1.187, + "step": 3847 + }, + { + "epoch": 0.08974743955662712, + "grad_norm": 1.52975594997406, + "learning_rate": 1.994299041202384e-05, + "loss": 1.3399, + "step": 3848 + }, + { + "epoch": 0.08977076269580504, + "grad_norm": 1.9327449798583984, + "learning_rate": 1.9948173101839855e-05, + "loss": 1.7132, + "step": 3849 + }, + { + "epoch": 0.08979408583498295, + "grad_norm": 1.8760170936584473, + "learning_rate": 1.995335579165587e-05, + "loss": 1.4139, + "step": 3850 + }, + { + "epoch": 0.08981740897416088, + "grad_norm": 2.0891501903533936, + "learning_rate": 1.9958538481471887e-05, + "loss": 1.5883, + "step": 3851 + }, + { + "epoch": 0.08984073211333879, + "grad_norm": 1.7092490196228027, + "learning_rate": 1.99637211712879e-05, + "loss": 1.7111, + "step": 3852 + }, + { + "epoch": 0.08986405525251671, + "grad_norm": 1.8873919248580933, + "learning_rate": 1.9968903861103916e-05, + "loss": 1.5662, + "step": 3853 + }, + { + "epoch": 0.08988737839169463, + "grad_norm": 1.5830549001693726, + "learning_rate": 1.997408655091993e-05, + "loss": 1.6033, + "step": 3854 + }, + { + "epoch": 0.08991070153087255, + "grad_norm": 2.1742613315582275, + "learning_rate": 1.9979269240735944e-05, + "loss": 1.7105, + "step": 3855 + }, + { + "epoch": 0.08993402467005046, + "grad_norm": 2.1118738651275635, + "learning_rate": 1.998445193055196e-05, + "loss": 1.4206, + "step": 3856 + }, + { + "epoch": 0.08995734780922839, + "grad_norm": 2.103388547897339, + "learning_rate": 1.9989634620367973e-05, + "loss": 1.6509, + "step": 3857 + }, + { + "epoch": 0.0899806709484063, + "grad_norm": 1.6943275928497314, + "learning_rate": 1.9994817310183987e-05, + "loss": 1.4174, + "step": 3858 + }, + { + "epoch": 0.09000399408758422, + "grad_norm": 2.7844085693359375, + "learning_rate": 2e-05, + "loss": 1.4774, + "step": 3859 + }, + { + "epoch": 0.09002731722676213, + "grad_norm": 1.8078947067260742, + "learning_rate": 1.9999999996829872e-05, + "loss": 1.2295, + "step": 3860 + }, + { + "epoch": 0.09005064036594006, + "grad_norm": 1.9623292684555054, + "learning_rate": 1.9999999987319475e-05, + "loss": 1.4901, + "step": 3861 + }, + { + "epoch": 0.09007396350511797, + "grad_norm": 1.6764906644821167, + "learning_rate": 1.9999999971468818e-05, + "loss": 1.2552, + "step": 3862 + }, + { + "epoch": 0.0900972866442959, + "grad_norm": 1.8891198635101318, + "learning_rate": 1.99999999492779e-05, + "loss": 1.7509, + "step": 3863 + }, + { + "epoch": 0.0901206097834738, + "grad_norm": 1.8767303228378296, + "learning_rate": 1.999999992074672e-05, + "loss": 1.495, + "step": 3864 + }, + { + "epoch": 0.09014393292265173, + "grad_norm": 2.5242903232574463, + "learning_rate": 1.9999999885875275e-05, + "loss": 1.6351, + "step": 3865 + }, + { + "epoch": 0.09016725606182964, + "grad_norm": 1.708688735961914, + "learning_rate": 1.9999999844663568e-05, + "loss": 1.4466, + "step": 3866 + }, + { + "epoch": 0.09019057920100756, + "grad_norm": 2.3333094120025635, + "learning_rate": 1.9999999797111602e-05, + "loss": 1.2606, + "step": 3867 + }, + { + "epoch": 0.09021390234018548, + "grad_norm": 1.7503955364227295, + "learning_rate": 1.999999974321937e-05, + "loss": 1.4878, + "step": 3868 + }, + { + "epoch": 0.0902372254793634, + "grad_norm": 2.1751153469085693, + "learning_rate": 1.9999999682986876e-05, + "loss": 1.5093, + "step": 3869 + }, + { + "epoch": 0.09026054861854131, + "grad_norm": 1.9692846536636353, + "learning_rate": 1.999999961641412e-05, + "loss": 1.2518, + "step": 3870 + }, + { + "epoch": 0.09028387175771924, + "grad_norm": 1.744352102279663, + "learning_rate": 1.9999999543501102e-05, + "loss": 1.4566, + "step": 3871 + }, + { + "epoch": 0.09030719489689715, + "grad_norm": 1.633447527885437, + "learning_rate": 1.999999946424782e-05, + "loss": 1.4261, + "step": 3872 + }, + { + "epoch": 0.09033051803607507, + "grad_norm": 1.6854172945022583, + "learning_rate": 1.999999937865428e-05, + "loss": 1.1874, + "step": 3873 + }, + { + "epoch": 0.09035384117525298, + "grad_norm": 1.6113791465759277, + "learning_rate": 1.9999999286720472e-05, + "loss": 1.0596, + "step": 3874 + }, + { + "epoch": 0.0903771643144309, + "grad_norm": 1.8399914503097534, + "learning_rate": 1.999999918844641e-05, + "loss": 1.6591, + "step": 3875 + }, + { + "epoch": 0.09040048745360882, + "grad_norm": 1.559280276298523, + "learning_rate": 1.999999908383208e-05, + "loss": 1.4071, + "step": 3876 + }, + { + "epoch": 0.09042381059278673, + "grad_norm": 2.0830109119415283, + "learning_rate": 1.999999897287749e-05, + "loss": 1.1728, + "step": 3877 + }, + { + "epoch": 0.09044713373196465, + "grad_norm": 2.0221612453460693, + "learning_rate": 1.9999998855582636e-05, + "loss": 1.4678, + "step": 3878 + }, + { + "epoch": 0.09047045687114257, + "grad_norm": 1.6200062036514282, + "learning_rate": 1.9999998731947522e-05, + "loss": 1.6207, + "step": 3879 + }, + { + "epoch": 0.09049378001032049, + "grad_norm": 1.6108903884887695, + "learning_rate": 1.9999998601972143e-05, + "loss": 0.9769, + "step": 3880 + }, + { + "epoch": 0.0905171031494984, + "grad_norm": 2.0176987648010254, + "learning_rate": 1.9999998465656505e-05, + "loss": 1.8439, + "step": 3881 + }, + { + "epoch": 0.09054042628867633, + "grad_norm": 1.806272268295288, + "learning_rate": 1.9999998323000608e-05, + "loss": 1.6001, + "step": 3882 + }, + { + "epoch": 0.09056374942785424, + "grad_norm": 1.7513991594314575, + "learning_rate": 1.9999998174004446e-05, + "loss": 1.4423, + "step": 3883 + }, + { + "epoch": 0.09058707256703216, + "grad_norm": 1.9214335680007935, + "learning_rate": 1.9999998018668023e-05, + "loss": 1.4684, + "step": 3884 + }, + { + "epoch": 0.09061039570621007, + "grad_norm": 1.46034836769104, + "learning_rate": 1.9999997856991338e-05, + "loss": 0.9545, + "step": 3885 + }, + { + "epoch": 0.090633718845388, + "grad_norm": 1.6021389961242676, + "learning_rate": 1.9999997688974394e-05, + "loss": 1.3276, + "step": 3886 + }, + { + "epoch": 0.09065704198456591, + "grad_norm": 1.7397795915603638, + "learning_rate": 1.9999997514617188e-05, + "loss": 1.5114, + "step": 3887 + }, + { + "epoch": 0.09068036512374383, + "grad_norm": 1.8831110000610352, + "learning_rate": 1.999999733391972e-05, + "loss": 1.8704, + "step": 3888 + }, + { + "epoch": 0.09070368826292174, + "grad_norm": 1.8681260347366333, + "learning_rate": 1.9999997146881995e-05, + "loss": 1.4932, + "step": 3889 + }, + { + "epoch": 0.09072701140209967, + "grad_norm": 1.951125979423523, + "learning_rate": 1.9999996953504003e-05, + "loss": 1.4437, + "step": 3890 + }, + { + "epoch": 0.09075033454127758, + "grad_norm": 2.1680264472961426, + "learning_rate": 1.9999996753785757e-05, + "loss": 1.3859, + "step": 3891 + }, + { + "epoch": 0.0907736576804555, + "grad_norm": 1.7976813316345215, + "learning_rate": 1.9999996547727246e-05, + "loss": 1.8385, + "step": 3892 + }, + { + "epoch": 0.09079698081963342, + "grad_norm": 1.821331262588501, + "learning_rate": 1.9999996335328476e-05, + "loss": 1.2497, + "step": 3893 + }, + { + "epoch": 0.09082030395881134, + "grad_norm": 2.1051344871520996, + "learning_rate": 1.9999996116589448e-05, + "loss": 1.5435, + "step": 3894 + }, + { + "epoch": 0.09084362709798925, + "grad_norm": 2.2845866680145264, + "learning_rate": 1.9999995891510154e-05, + "loss": 1.0889, + "step": 3895 + }, + { + "epoch": 0.09086695023716718, + "grad_norm": 1.772169589996338, + "learning_rate": 1.9999995660090605e-05, + "loss": 1.4212, + "step": 3896 + }, + { + "epoch": 0.09089027337634509, + "grad_norm": 1.899332046508789, + "learning_rate": 1.9999995422330798e-05, + "loss": 1.3668, + "step": 3897 + }, + { + "epoch": 0.09091359651552301, + "grad_norm": 1.870027780532837, + "learning_rate": 1.9999995178230726e-05, + "loss": 1.5513, + "step": 3898 + }, + { + "epoch": 0.09093691965470092, + "grad_norm": 1.521862506866455, + "learning_rate": 1.99999949277904e-05, + "loss": 1.4325, + "step": 3899 + }, + { + "epoch": 0.09096024279387885, + "grad_norm": 1.7939119338989258, + "learning_rate": 1.999999467100981e-05, + "loss": 1.6844, + "step": 3900 + }, + { + "epoch": 0.09098356593305676, + "grad_norm": 1.9809507131576538, + "learning_rate": 1.999999440788896e-05, + "loss": 1.4736, + "step": 3901 + }, + { + "epoch": 0.09100688907223468, + "grad_norm": 1.4991800785064697, + "learning_rate": 1.9999994138427855e-05, + "loss": 1.1604, + "step": 3902 + }, + { + "epoch": 0.0910302122114126, + "grad_norm": 1.4532808065414429, + "learning_rate": 1.999999386262649e-05, + "loss": 1.474, + "step": 3903 + }, + { + "epoch": 0.0910535353505905, + "grad_norm": 1.856518268585205, + "learning_rate": 1.9999993580484864e-05, + "loss": 1.4732, + "step": 3904 + }, + { + "epoch": 0.09107685848976843, + "grad_norm": 2.7852296829223633, + "learning_rate": 1.9999993292002983e-05, + "loss": 1.3989, + "step": 3905 + }, + { + "epoch": 0.09110018162894634, + "grad_norm": 2.3301925659179688, + "learning_rate": 1.9999992997180843e-05, + "loss": 1.0174, + "step": 3906 + }, + { + "epoch": 0.09112350476812427, + "grad_norm": 1.9908862113952637, + "learning_rate": 1.9999992696018445e-05, + "loss": 1.2663, + "step": 3907 + }, + { + "epoch": 0.09114682790730218, + "grad_norm": 1.4991986751556396, + "learning_rate": 1.9999992388515784e-05, + "loss": 1.3657, + "step": 3908 + }, + { + "epoch": 0.0911701510464801, + "grad_norm": 1.8717234134674072, + "learning_rate": 1.9999992074672873e-05, + "loss": 1.7159, + "step": 3909 + }, + { + "epoch": 0.09119347418565801, + "grad_norm": 1.4654630422592163, + "learning_rate": 1.99999917544897e-05, + "loss": 0.9469, + "step": 3910 + }, + { + "epoch": 0.09121679732483594, + "grad_norm": 1.6895606517791748, + "learning_rate": 1.999999142796627e-05, + "loss": 1.4659, + "step": 3911 + }, + { + "epoch": 0.09124012046401385, + "grad_norm": 1.6707404851913452, + "learning_rate": 1.9999991095102583e-05, + "loss": 1.2416, + "step": 3912 + }, + { + "epoch": 0.09126344360319177, + "grad_norm": 1.5567107200622559, + "learning_rate": 1.999999075589864e-05, + "loss": 1.5764, + "step": 3913 + }, + { + "epoch": 0.09128676674236968, + "grad_norm": 1.8884644508361816, + "learning_rate": 1.999999041035444e-05, + "loss": 1.5029, + "step": 3914 + }, + { + "epoch": 0.09131008988154761, + "grad_norm": 1.8918079137802124, + "learning_rate": 1.9999990058469984e-05, + "loss": 1.6129, + "step": 3915 + }, + { + "epoch": 0.09133341302072552, + "grad_norm": 1.551944375038147, + "learning_rate": 1.9999989700245273e-05, + "loss": 1.0586, + "step": 3916 + }, + { + "epoch": 0.09135673615990345, + "grad_norm": 1.844373345375061, + "learning_rate": 1.9999989335680304e-05, + "loss": 1.4028, + "step": 3917 + }, + { + "epoch": 0.09138005929908136, + "grad_norm": 1.399733543395996, + "learning_rate": 1.999998896477508e-05, + "loss": 1.34, + "step": 3918 + }, + { + "epoch": 0.09140338243825928, + "grad_norm": 2.2165145874023438, + "learning_rate": 1.9999988587529596e-05, + "loss": 1.3944, + "step": 3919 + }, + { + "epoch": 0.09142670557743719, + "grad_norm": 1.9752156734466553, + "learning_rate": 1.9999988203943862e-05, + "loss": 1.1461, + "step": 3920 + }, + { + "epoch": 0.09145002871661512, + "grad_norm": 2.117903470993042, + "learning_rate": 1.9999987814017872e-05, + "loss": 1.6446, + "step": 3921 + }, + { + "epoch": 0.09147335185579303, + "grad_norm": 1.4504482746124268, + "learning_rate": 1.9999987417751628e-05, + "loss": 1.3936, + "step": 3922 + }, + { + "epoch": 0.09149667499497095, + "grad_norm": 1.6209416389465332, + "learning_rate": 1.9999987015145128e-05, + "loss": 1.1825, + "step": 3923 + }, + { + "epoch": 0.09151999813414886, + "grad_norm": 1.6402407884597778, + "learning_rate": 1.9999986606198373e-05, + "loss": 1.5742, + "step": 3924 + }, + { + "epoch": 0.09154332127332679, + "grad_norm": 1.865707516670227, + "learning_rate": 1.9999986190911363e-05, + "loss": 1.1726, + "step": 3925 + }, + { + "epoch": 0.0915666444125047, + "grad_norm": 2.145291805267334, + "learning_rate": 1.9999985769284102e-05, + "loss": 1.3995, + "step": 3926 + }, + { + "epoch": 0.09158996755168262, + "grad_norm": 1.8508366346359253, + "learning_rate": 1.999998534131659e-05, + "loss": 1.5275, + "step": 3927 + }, + { + "epoch": 0.09161329069086054, + "grad_norm": 1.477274775505066, + "learning_rate": 1.999998490700882e-05, + "loss": 1.0495, + "step": 3928 + }, + { + "epoch": 0.09163661383003846, + "grad_norm": 1.945433259010315, + "learning_rate": 1.9999984466360797e-05, + "loss": 1.4533, + "step": 3929 + }, + { + "epoch": 0.09165993696921637, + "grad_norm": 1.7821992635726929, + "learning_rate": 1.9999984019372522e-05, + "loss": 1.3427, + "step": 3930 + }, + { + "epoch": 0.0916832601083943, + "grad_norm": 2.0035572052001953, + "learning_rate": 1.9999983566043996e-05, + "loss": 1.8269, + "step": 3931 + }, + { + "epoch": 0.09170658324757221, + "grad_norm": 1.5176634788513184, + "learning_rate": 1.9999983106375217e-05, + "loss": 1.3943, + "step": 3932 + }, + { + "epoch": 0.09172990638675012, + "grad_norm": 1.823038101196289, + "learning_rate": 1.9999982640366187e-05, + "loss": 1.1486, + "step": 3933 + }, + { + "epoch": 0.09175322952592804, + "grad_norm": 1.796398401260376, + "learning_rate": 1.9999982168016906e-05, + "loss": 1.4936, + "step": 3934 + }, + { + "epoch": 0.09177655266510595, + "grad_norm": 2.008840322494507, + "learning_rate": 1.999998168932737e-05, + "loss": 1.6116, + "step": 3935 + }, + { + "epoch": 0.09179987580428388, + "grad_norm": 1.9190891981124878, + "learning_rate": 1.9999981204297587e-05, + "loss": 1.7342, + "step": 3936 + }, + { + "epoch": 0.09182319894346179, + "grad_norm": 1.437469720840454, + "learning_rate": 1.999998071292755e-05, + "loss": 1.6174, + "step": 3937 + }, + { + "epoch": 0.09184652208263971, + "grad_norm": 1.8305333852767944, + "learning_rate": 1.9999980215217265e-05, + "loss": 1.8085, + "step": 3938 + }, + { + "epoch": 0.09186984522181763, + "grad_norm": 1.5656037330627441, + "learning_rate": 1.999997971116673e-05, + "loss": 1.2984, + "step": 3939 + }, + { + "epoch": 0.09189316836099555, + "grad_norm": 2.2326161861419678, + "learning_rate": 1.9999979200775947e-05, + "loss": 1.4892, + "step": 3940 + }, + { + "epoch": 0.09191649150017346, + "grad_norm": 1.7006014585494995, + "learning_rate": 1.999997868404491e-05, + "loss": 1.4769, + "step": 3941 + }, + { + "epoch": 0.09193981463935139, + "grad_norm": 2.0390775203704834, + "learning_rate": 1.999997816097363e-05, + "loss": 1.5616, + "step": 3942 + }, + { + "epoch": 0.0919631377785293, + "grad_norm": 2.844869613647461, + "learning_rate": 1.9999977631562096e-05, + "loss": 1.6326, + "step": 3943 + }, + { + "epoch": 0.09198646091770722, + "grad_norm": 1.5782520771026611, + "learning_rate": 1.9999977095810314e-05, + "loss": 1.3048, + "step": 3944 + }, + { + "epoch": 0.09200978405688513, + "grad_norm": 1.876190423965454, + "learning_rate": 1.9999976553718288e-05, + "loss": 1.5714, + "step": 3945 + }, + { + "epoch": 0.09203310719606306, + "grad_norm": 1.842055320739746, + "learning_rate": 1.9999976005286013e-05, + "loss": 1.293, + "step": 3946 + }, + { + "epoch": 0.09205643033524097, + "grad_norm": 1.8643383979797363, + "learning_rate": 1.9999975450513487e-05, + "loss": 1.5914, + "step": 3947 + }, + { + "epoch": 0.0920797534744189, + "grad_norm": 1.9277528524398804, + "learning_rate": 1.9999974889400716e-05, + "loss": 1.3995, + "step": 3948 + }, + { + "epoch": 0.0921030766135968, + "grad_norm": 1.9185326099395752, + "learning_rate": 1.99999743219477e-05, + "loss": 1.5797, + "step": 3949 + }, + { + "epoch": 0.09212639975277473, + "grad_norm": 1.56540048122406, + "learning_rate": 1.999997374815444e-05, + "loss": 1.283, + "step": 3950 + }, + { + "epoch": 0.09214972289195264, + "grad_norm": 1.7684359550476074, + "learning_rate": 1.9999973168020926e-05, + "loss": 1.383, + "step": 3951 + }, + { + "epoch": 0.09217304603113056, + "grad_norm": 2.0893876552581787, + "learning_rate": 1.9999972581547175e-05, + "loss": 1.1262, + "step": 3952 + }, + { + "epoch": 0.09219636917030848, + "grad_norm": 1.3322521448135376, + "learning_rate": 1.9999971988733173e-05, + "loss": 1.4106, + "step": 3953 + }, + { + "epoch": 0.0922196923094864, + "grad_norm": 1.9222743511199951, + "learning_rate": 1.999997138957893e-05, + "loss": 1.254, + "step": 3954 + }, + { + "epoch": 0.09224301544866431, + "grad_norm": 1.758832335472107, + "learning_rate": 1.999997078408444e-05, + "loss": 1.474, + "step": 3955 + }, + { + "epoch": 0.09226633858784224, + "grad_norm": 1.472639560699463, + "learning_rate": 1.9999970172249706e-05, + "loss": 1.2962, + "step": 3956 + }, + { + "epoch": 0.09228966172702015, + "grad_norm": 1.844859004020691, + "learning_rate": 1.999996955407473e-05, + "loss": 1.5245, + "step": 3957 + }, + { + "epoch": 0.09231298486619807, + "grad_norm": 1.6577290296554565, + "learning_rate": 1.999996892955951e-05, + "loss": 1.3443, + "step": 3958 + }, + { + "epoch": 0.09233630800537598, + "grad_norm": 2.016514539718628, + "learning_rate": 1.9999968298704045e-05, + "loss": 1.1707, + "step": 3959 + }, + { + "epoch": 0.09235963114455391, + "grad_norm": 2.0657806396484375, + "learning_rate": 1.9999967661508342e-05, + "loss": 1.2651, + "step": 3960 + }, + { + "epoch": 0.09238295428373182, + "grad_norm": 1.5270191431045532, + "learning_rate": 1.9999967017972394e-05, + "loss": 1.0796, + "step": 3961 + }, + { + "epoch": 0.09240627742290973, + "grad_norm": 1.3309963941574097, + "learning_rate": 1.9999966368096208e-05, + "loss": 1.4668, + "step": 3962 + }, + { + "epoch": 0.09242960056208765, + "grad_norm": 1.9748342037200928, + "learning_rate": 1.9999965711879773e-05, + "loss": 1.225, + "step": 3963 + }, + { + "epoch": 0.09245292370126557, + "grad_norm": 2.0958023071289062, + "learning_rate": 1.9999965049323104e-05, + "loss": 1.8684, + "step": 3964 + }, + { + "epoch": 0.09247624684044349, + "grad_norm": 1.841970443725586, + "learning_rate": 1.999996438042619e-05, + "loss": 1.5263, + "step": 3965 + }, + { + "epoch": 0.0924995699796214, + "grad_norm": 1.6390635967254639, + "learning_rate": 1.999996370518904e-05, + "loss": 1.1722, + "step": 3966 + }, + { + "epoch": 0.09252289311879933, + "grad_norm": 1.8602365255355835, + "learning_rate": 1.999996302361165e-05, + "loss": 1.4936, + "step": 3967 + }, + { + "epoch": 0.09254621625797724, + "grad_norm": 1.8336684703826904, + "learning_rate": 1.9999962335694022e-05, + "loss": 1.4065, + "step": 3968 + }, + { + "epoch": 0.09256953939715516, + "grad_norm": 2.072793960571289, + "learning_rate": 1.999996164143615e-05, + "loss": 1.7077, + "step": 3969 + }, + { + "epoch": 0.09259286253633307, + "grad_norm": 1.8812514543533325, + "learning_rate": 1.9999960940838047e-05, + "loss": 1.3055, + "step": 3970 + }, + { + "epoch": 0.092616185675511, + "grad_norm": 1.9260146617889404, + "learning_rate": 1.99999602338997e-05, + "loss": 1.4673, + "step": 3971 + }, + { + "epoch": 0.09263950881468891, + "grad_norm": 1.9745763540267944, + "learning_rate": 1.9999959520621116e-05, + "loss": 1.3011, + "step": 3972 + }, + { + "epoch": 0.09266283195386683, + "grad_norm": 1.6949586868286133, + "learning_rate": 1.99999588010023e-05, + "loss": 1.4868, + "step": 3973 + }, + { + "epoch": 0.09268615509304474, + "grad_norm": 1.3992211818695068, + "learning_rate": 1.9999958075043243e-05, + "loss": 1.4782, + "step": 3974 + }, + { + "epoch": 0.09270947823222267, + "grad_norm": 1.4269651174545288, + "learning_rate": 1.9999957342743955e-05, + "loss": 1.3966, + "step": 3975 + }, + { + "epoch": 0.09273280137140058, + "grad_norm": 2.057020902633667, + "learning_rate": 1.9999956604104428e-05, + "loss": 1.5686, + "step": 3976 + }, + { + "epoch": 0.0927561245105785, + "grad_norm": 1.815314769744873, + "learning_rate": 1.9999955859124663e-05, + "loss": 1.7073, + "step": 3977 + }, + { + "epoch": 0.09277944764975642, + "grad_norm": 1.6862220764160156, + "learning_rate": 1.999995510780467e-05, + "loss": 1.4345, + "step": 3978 + }, + { + "epoch": 0.09280277078893434, + "grad_norm": 2.4775919914245605, + "learning_rate": 1.999995435014444e-05, + "loss": 1.4523, + "step": 3979 + }, + { + "epoch": 0.09282609392811225, + "grad_norm": 2.2759830951690674, + "learning_rate": 1.9999953586143977e-05, + "loss": 1.6044, + "step": 3980 + }, + { + "epoch": 0.09284941706729018, + "grad_norm": 2.0369324684143066, + "learning_rate": 1.999995281580328e-05, + "loss": 1.2325, + "step": 3981 + }, + { + "epoch": 0.09287274020646809, + "grad_norm": 1.6006555557250977, + "learning_rate": 1.9999952039122348e-05, + "loss": 1.0778, + "step": 3982 + }, + { + "epoch": 0.09289606334564601, + "grad_norm": 1.7829402685165405, + "learning_rate": 1.9999951256101185e-05, + "loss": 1.3098, + "step": 3983 + }, + { + "epoch": 0.09291938648482392, + "grad_norm": 1.9090176820755005, + "learning_rate": 1.9999950466739794e-05, + "loss": 1.26, + "step": 3984 + }, + { + "epoch": 0.09294270962400185, + "grad_norm": 1.1756659746170044, + "learning_rate": 1.9999949671038168e-05, + "loss": 1.1115, + "step": 3985 + }, + { + "epoch": 0.09296603276317976, + "grad_norm": 1.5126562118530273, + "learning_rate": 1.9999948868996314e-05, + "loss": 1.1954, + "step": 3986 + }, + { + "epoch": 0.09298935590235768, + "grad_norm": 1.5372440814971924, + "learning_rate": 1.9999948060614226e-05, + "loss": 1.432, + "step": 3987 + }, + { + "epoch": 0.0930126790415356, + "grad_norm": 2.036574363708496, + "learning_rate": 1.9999947245891913e-05, + "loss": 1.6046, + "step": 3988 + }, + { + "epoch": 0.0930360021807135, + "grad_norm": 1.7800477743148804, + "learning_rate": 1.999994642482937e-05, + "loss": 1.1686, + "step": 3989 + }, + { + "epoch": 0.09305932531989143, + "grad_norm": 1.939978837966919, + "learning_rate": 1.9999945597426596e-05, + "loss": 1.524, + "step": 3990 + }, + { + "epoch": 0.09308264845906934, + "grad_norm": 3.610623598098755, + "learning_rate": 1.9999944763683596e-05, + "loss": 1.8123, + "step": 3991 + }, + { + "epoch": 0.09310597159824727, + "grad_norm": 1.878239393234253, + "learning_rate": 1.9999943923600368e-05, + "loss": 1.6922, + "step": 3992 + }, + { + "epoch": 0.09312929473742518, + "grad_norm": 1.7238104343414307, + "learning_rate": 1.9999943077176915e-05, + "loss": 1.4637, + "step": 3993 + }, + { + "epoch": 0.0931526178766031, + "grad_norm": 1.890332579612732, + "learning_rate": 1.999994222441323e-05, + "loss": 1.5306, + "step": 3994 + }, + { + "epoch": 0.09317594101578101, + "grad_norm": 1.9238927364349365, + "learning_rate": 1.9999941365309322e-05, + "loss": 1.4932, + "step": 3995 + }, + { + "epoch": 0.09319926415495894, + "grad_norm": 5.232817649841309, + "learning_rate": 1.9999940499865192e-05, + "loss": 1.466, + "step": 3996 + }, + { + "epoch": 0.09322258729413685, + "grad_norm": 1.527990460395813, + "learning_rate": 1.9999939628080834e-05, + "loss": 1.4395, + "step": 3997 + }, + { + "epoch": 0.09324591043331477, + "grad_norm": 1.881361961364746, + "learning_rate": 1.999993874995625e-05, + "loss": 1.4053, + "step": 3998 + }, + { + "epoch": 0.09326923357249269, + "grad_norm": 1.9249119758605957, + "learning_rate": 1.9999937865491444e-05, + "loss": 1.92, + "step": 3999 + }, + { + "epoch": 0.09329255671167061, + "grad_norm": 1.9354079961776733, + "learning_rate": 1.9999936974686416e-05, + "loss": 1.1394, + "step": 4000 + }, + { + "epoch": 0.09331587985084852, + "grad_norm": 2.135955810546875, + "learning_rate": 1.9999936077541163e-05, + "loss": 1.5347, + "step": 4001 + }, + { + "epoch": 0.09333920299002645, + "grad_norm": 1.5951508283615112, + "learning_rate": 1.9999935174055693e-05, + "loss": 1.4595, + "step": 4002 + }, + { + "epoch": 0.09336252612920436, + "grad_norm": 1.7067939043045044, + "learning_rate": 1.9999934264229998e-05, + "loss": 1.8495, + "step": 4003 + }, + { + "epoch": 0.09338584926838228, + "grad_norm": 1.8007570505142212, + "learning_rate": 1.999993334806408e-05, + "loss": 1.1062, + "step": 4004 + }, + { + "epoch": 0.09340917240756019, + "grad_norm": 1.5014314651489258, + "learning_rate": 1.9999932425557947e-05, + "loss": 1.2173, + "step": 4005 + }, + { + "epoch": 0.09343249554673812, + "grad_norm": 1.7208422422409058, + "learning_rate": 1.9999931496711592e-05, + "loss": 1.5016, + "step": 4006 + }, + { + "epoch": 0.09345581868591603, + "grad_norm": 2.022439479827881, + "learning_rate": 1.9999930561525016e-05, + "loss": 1.4124, + "step": 4007 + }, + { + "epoch": 0.09347914182509395, + "grad_norm": 1.6925363540649414, + "learning_rate": 1.999992961999822e-05, + "loss": 1.1304, + "step": 4008 + }, + { + "epoch": 0.09350246496427186, + "grad_norm": 1.7039397954940796, + "learning_rate": 1.999992867213121e-05, + "loss": 1.4684, + "step": 4009 + }, + { + "epoch": 0.09352578810344979, + "grad_norm": 1.7603716850280762, + "learning_rate": 1.9999927717923983e-05, + "loss": 1.3615, + "step": 4010 + }, + { + "epoch": 0.0935491112426277, + "grad_norm": 1.491625189781189, + "learning_rate": 1.999992675737654e-05, + "loss": 1.3722, + "step": 4011 + }, + { + "epoch": 0.09357243438180562, + "grad_norm": 3.466465473175049, + "learning_rate": 1.9999925790488873e-05, + "loss": 1.1393, + "step": 4012 + }, + { + "epoch": 0.09359575752098354, + "grad_norm": 1.9502816200256348, + "learning_rate": 1.9999924817261e-05, + "loss": 1.2407, + "step": 4013 + }, + { + "epoch": 0.09361908066016146, + "grad_norm": 1.3467035293579102, + "learning_rate": 1.9999923837692906e-05, + "loss": 1.3731, + "step": 4014 + }, + { + "epoch": 0.09364240379933937, + "grad_norm": 1.6433124542236328, + "learning_rate": 1.99999228517846e-05, + "loss": 1.0856, + "step": 4015 + }, + { + "epoch": 0.0936657269385173, + "grad_norm": 1.6822779178619385, + "learning_rate": 1.999992185953608e-05, + "loss": 1.3971, + "step": 4016 + }, + { + "epoch": 0.09368905007769521, + "grad_norm": 2.0529069900512695, + "learning_rate": 1.9999920860947347e-05, + "loss": 1.4433, + "step": 4017 + }, + { + "epoch": 0.09371237321687312, + "grad_norm": 1.6150270700454712, + "learning_rate": 1.9999919856018405e-05, + "loss": 1.2959, + "step": 4018 + }, + { + "epoch": 0.09373569635605104, + "grad_norm": 2.2174623012542725, + "learning_rate": 1.9999918844749245e-05, + "loss": 1.2943, + "step": 4019 + }, + { + "epoch": 0.09375901949522895, + "grad_norm": 2.1846296787261963, + "learning_rate": 1.999991782713988e-05, + "loss": 1.2629, + "step": 4020 + }, + { + "epoch": 0.09378234263440688, + "grad_norm": 1.459313154220581, + "learning_rate": 1.9999916803190304e-05, + "loss": 1.5141, + "step": 4021 + }, + { + "epoch": 0.09380566577358479, + "grad_norm": 2.6233582496643066, + "learning_rate": 1.9999915772900514e-05, + "loss": 1.6312, + "step": 4022 + }, + { + "epoch": 0.09382898891276271, + "grad_norm": 1.8272534608840942, + "learning_rate": 1.9999914736270518e-05, + "loss": 1.2389, + "step": 4023 + }, + { + "epoch": 0.09385231205194063, + "grad_norm": 1.7140253782272339, + "learning_rate": 1.9999913693300313e-05, + "loss": 1.2665, + "step": 4024 + }, + { + "epoch": 0.09387563519111855, + "grad_norm": 1.9143249988555908, + "learning_rate": 1.9999912643989898e-05, + "loss": 1.6875, + "step": 4025 + }, + { + "epoch": 0.09389895833029646, + "grad_norm": 1.8950088024139404, + "learning_rate": 1.9999911588339275e-05, + "loss": 1.651, + "step": 4026 + }, + { + "epoch": 0.09392228146947439, + "grad_norm": 1.800822138786316, + "learning_rate": 1.999991052634845e-05, + "loss": 0.9556, + "step": 4027 + }, + { + "epoch": 0.0939456046086523, + "grad_norm": 1.5436400175094604, + "learning_rate": 1.9999909458017416e-05, + "loss": 1.5147, + "step": 4028 + }, + { + "epoch": 0.09396892774783022, + "grad_norm": 1.5622018575668335, + "learning_rate": 1.9999908383346177e-05, + "loss": 1.1669, + "step": 4029 + }, + { + "epoch": 0.09399225088700813, + "grad_norm": 1.8378404378890991, + "learning_rate": 1.9999907302334737e-05, + "loss": 0.9293, + "step": 4030 + }, + { + "epoch": 0.09401557402618606, + "grad_norm": 1.7259092330932617, + "learning_rate": 1.999990621498309e-05, + "loss": 1.8631, + "step": 4031 + }, + { + "epoch": 0.09403889716536397, + "grad_norm": 1.811070203781128, + "learning_rate": 1.999990512129124e-05, + "loss": 1.5787, + "step": 4032 + }, + { + "epoch": 0.0940622203045419, + "grad_norm": 1.6709043979644775, + "learning_rate": 1.9999904021259188e-05, + "loss": 1.3278, + "step": 4033 + }, + { + "epoch": 0.0940855434437198, + "grad_norm": 1.8301382064819336, + "learning_rate": 1.9999902914886937e-05, + "loss": 1.5662, + "step": 4034 + }, + { + "epoch": 0.09410886658289773, + "grad_norm": 1.7583134174346924, + "learning_rate": 1.9999901802174483e-05, + "loss": 1.2864, + "step": 4035 + }, + { + "epoch": 0.09413218972207564, + "grad_norm": 3.2991154193878174, + "learning_rate": 1.9999900683121828e-05, + "loss": 1.4025, + "step": 4036 + }, + { + "epoch": 0.09415551286125357, + "grad_norm": 1.5830022096633911, + "learning_rate": 1.999989955772897e-05, + "loss": 1.4977, + "step": 4037 + }, + { + "epoch": 0.09417883600043148, + "grad_norm": 1.7454510927200317, + "learning_rate": 1.9999898425995918e-05, + "loss": 1.8235, + "step": 4038 + }, + { + "epoch": 0.0942021591396094, + "grad_norm": 1.6297307014465332, + "learning_rate": 1.999989728792267e-05, + "loss": 1.5289, + "step": 4039 + }, + { + "epoch": 0.09422548227878731, + "grad_norm": 2.0267040729522705, + "learning_rate": 1.999989614350922e-05, + "loss": 1.6013, + "step": 4040 + }, + { + "epoch": 0.09424880541796524, + "grad_norm": 1.4703158140182495, + "learning_rate": 1.9999894992755572e-05, + "loss": 1.5303, + "step": 4041 + }, + { + "epoch": 0.09427212855714315, + "grad_norm": 1.6133272647857666, + "learning_rate": 1.9999893835661736e-05, + "loss": 1.1337, + "step": 4042 + }, + { + "epoch": 0.09429545169632107, + "grad_norm": 1.936994194984436, + "learning_rate": 1.9999892672227698e-05, + "loss": 1.2797, + "step": 4043 + }, + { + "epoch": 0.09431877483549898, + "grad_norm": 1.6114977598190308, + "learning_rate": 1.9999891502453466e-05, + "loss": 1.5541, + "step": 4044 + }, + { + "epoch": 0.09434209797467691, + "grad_norm": 1.9083713293075562, + "learning_rate": 1.999989032633904e-05, + "loss": 1.6397, + "step": 4045 + }, + { + "epoch": 0.09436542111385482, + "grad_norm": 1.7421109676361084, + "learning_rate": 1.9999889143884424e-05, + "loss": 1.1853, + "step": 4046 + }, + { + "epoch": 0.09438874425303273, + "grad_norm": 1.8391941785812378, + "learning_rate": 1.999988795508961e-05, + "loss": 1.3414, + "step": 4047 + }, + { + "epoch": 0.09441206739221066, + "grad_norm": 1.8088597059249878, + "learning_rate": 1.9999886759954613e-05, + "loss": 1.3833, + "step": 4048 + }, + { + "epoch": 0.09443539053138857, + "grad_norm": 1.6824920177459717, + "learning_rate": 1.999988555847942e-05, + "loss": 1.2919, + "step": 4049 + }, + { + "epoch": 0.09445871367056649, + "grad_norm": 1.9679538011550903, + "learning_rate": 1.9999884350664035e-05, + "loss": 1.3646, + "step": 4050 + }, + { + "epoch": 0.0944820368097444, + "grad_norm": 1.8130271434783936, + "learning_rate": 1.9999883136508465e-05, + "loss": 1.2788, + "step": 4051 + }, + { + "epoch": 0.09450535994892233, + "grad_norm": 1.3992571830749512, + "learning_rate": 1.9999881916012708e-05, + "loss": 1.3074, + "step": 4052 + }, + { + "epoch": 0.09452868308810024, + "grad_norm": 1.8113707304000854, + "learning_rate": 1.999988068917676e-05, + "loss": 1.3189, + "step": 4053 + }, + { + "epoch": 0.09455200622727816, + "grad_norm": 2.156859874725342, + "learning_rate": 1.9999879456000625e-05, + "loss": 1.5443, + "step": 4054 + }, + { + "epoch": 0.09457532936645607, + "grad_norm": 1.979003667831421, + "learning_rate": 1.9999878216484302e-05, + "loss": 1.5466, + "step": 4055 + }, + { + "epoch": 0.094598652505634, + "grad_norm": 1.953880786895752, + "learning_rate": 1.9999876970627796e-05, + "loss": 1.4766, + "step": 4056 + }, + { + "epoch": 0.09462197564481191, + "grad_norm": 1.613236427307129, + "learning_rate": 1.999987571843111e-05, + "loss": 1.642, + "step": 4057 + }, + { + "epoch": 0.09464529878398983, + "grad_norm": 1.4525372982025146, + "learning_rate": 1.999987445989423e-05, + "loss": 1.4696, + "step": 4058 + }, + { + "epoch": 0.09466862192316775, + "grad_norm": 1.4650002717971802, + "learning_rate": 1.9999873195017177e-05, + "loss": 1.4614, + "step": 4059 + }, + { + "epoch": 0.09469194506234567, + "grad_norm": 1.5875036716461182, + "learning_rate": 1.9999871923799935e-05, + "loss": 0.8413, + "step": 4060 + }, + { + "epoch": 0.09471526820152358, + "grad_norm": 1.8234145641326904, + "learning_rate": 1.9999870646242512e-05, + "loss": 1.6689, + "step": 4061 + }, + { + "epoch": 0.0947385913407015, + "grad_norm": 1.4357320070266724, + "learning_rate": 1.9999869362344916e-05, + "loss": 1.04, + "step": 4062 + }, + { + "epoch": 0.09476191447987942, + "grad_norm": 1.530616283416748, + "learning_rate": 1.9999868072107133e-05, + "loss": 1.3837, + "step": 4063 + }, + { + "epoch": 0.09478523761905734, + "grad_norm": 2.2008919715881348, + "learning_rate": 1.9999866775529172e-05, + "loss": 1.3126, + "step": 4064 + }, + { + "epoch": 0.09480856075823525, + "grad_norm": 1.8661929368972778, + "learning_rate": 1.9999865472611034e-05, + "loss": 1.4651, + "step": 4065 + }, + { + "epoch": 0.09483188389741318, + "grad_norm": 1.8432332277297974, + "learning_rate": 1.999986416335272e-05, + "loss": 1.6874, + "step": 4066 + }, + { + "epoch": 0.09485520703659109, + "grad_norm": 1.8919012546539307, + "learning_rate": 1.999986284775423e-05, + "loss": 1.3761, + "step": 4067 + }, + { + "epoch": 0.09487853017576901, + "grad_norm": 1.7640912532806396, + "learning_rate": 1.999986152581556e-05, + "loss": 1.5106, + "step": 4068 + }, + { + "epoch": 0.09490185331494692, + "grad_norm": 1.3458236455917358, + "learning_rate": 1.999986019753672e-05, + "loss": 1.266, + "step": 4069 + }, + { + "epoch": 0.09492517645412485, + "grad_norm": 2.0958406925201416, + "learning_rate": 1.9999858862917705e-05, + "loss": 1.3424, + "step": 4070 + }, + { + "epoch": 0.09494849959330276, + "grad_norm": 1.7863316535949707, + "learning_rate": 1.9999857521958518e-05, + "loss": 1.5416, + "step": 4071 + }, + { + "epoch": 0.09497182273248068, + "grad_norm": 1.6585896015167236, + "learning_rate": 1.9999856174659157e-05, + "loss": 1.26, + "step": 4072 + }, + { + "epoch": 0.0949951458716586, + "grad_norm": 2.302985429763794, + "learning_rate": 1.9999854821019622e-05, + "loss": 1.2932, + "step": 4073 + }, + { + "epoch": 0.09501846901083652, + "grad_norm": 1.7254619598388672, + "learning_rate": 1.999985346103992e-05, + "loss": 1.2682, + "step": 4074 + }, + { + "epoch": 0.09504179215001443, + "grad_norm": 2.1413419246673584, + "learning_rate": 1.999985209472005e-05, + "loss": 1.38, + "step": 4075 + }, + { + "epoch": 0.09506511528919234, + "grad_norm": 1.833171010017395, + "learning_rate": 1.9999850722060007e-05, + "loss": 1.5078, + "step": 4076 + }, + { + "epoch": 0.09508843842837027, + "grad_norm": 1.5397589206695557, + "learning_rate": 1.9999849343059797e-05, + "loss": 1.3753, + "step": 4077 + }, + { + "epoch": 0.09511176156754818, + "grad_norm": 1.693420171737671, + "learning_rate": 1.999984795771942e-05, + "loss": 1.3732, + "step": 4078 + }, + { + "epoch": 0.0951350847067261, + "grad_norm": 1.6689990758895874, + "learning_rate": 1.9999846566038878e-05, + "loss": 1.2823, + "step": 4079 + }, + { + "epoch": 0.09515840784590401, + "grad_norm": 1.8790488243103027, + "learning_rate": 1.999984516801817e-05, + "loss": 1.3091, + "step": 4080 + }, + { + "epoch": 0.09518173098508194, + "grad_norm": 1.65789794921875, + "learning_rate": 1.9999843763657298e-05, + "loss": 1.4561, + "step": 4081 + }, + { + "epoch": 0.09520505412425985, + "grad_norm": 2.2451510429382324, + "learning_rate": 1.9999842352956264e-05, + "loss": 0.8032, + "step": 4082 + }, + { + "epoch": 0.09522837726343777, + "grad_norm": 1.5158308744430542, + "learning_rate": 1.9999840935915066e-05, + "loss": 1.3509, + "step": 4083 + }, + { + "epoch": 0.09525170040261569, + "grad_norm": 2.0572614669799805, + "learning_rate": 1.999983951253371e-05, + "loss": 0.9884, + "step": 4084 + }, + { + "epoch": 0.09527502354179361, + "grad_norm": 1.6487854719161987, + "learning_rate": 1.9999838082812187e-05, + "loss": 1.7546, + "step": 4085 + }, + { + "epoch": 0.09529834668097152, + "grad_norm": 2.0137455463409424, + "learning_rate": 1.9999836646750506e-05, + "loss": 1.3536, + "step": 4086 + }, + { + "epoch": 0.09532166982014945, + "grad_norm": 1.6713207960128784, + "learning_rate": 1.9999835204348667e-05, + "loss": 1.18, + "step": 4087 + }, + { + "epoch": 0.09534499295932736, + "grad_norm": 1.3659164905548096, + "learning_rate": 1.999983375560667e-05, + "loss": 1.228, + "step": 4088 + }, + { + "epoch": 0.09536831609850528, + "grad_norm": 2.0661914348602295, + "learning_rate": 1.9999832300524513e-05, + "loss": 1.5949, + "step": 4089 + }, + { + "epoch": 0.09539163923768319, + "grad_norm": 1.5829130411148071, + "learning_rate": 1.9999830839102205e-05, + "loss": 1.6185, + "step": 4090 + }, + { + "epoch": 0.09541496237686112, + "grad_norm": 1.7367058992385864, + "learning_rate": 1.999982937133974e-05, + "loss": 1.3331, + "step": 4091 + }, + { + "epoch": 0.09543828551603903, + "grad_norm": 1.6864798069000244, + "learning_rate": 1.9999827897237117e-05, + "loss": 1.4409, + "step": 4092 + }, + { + "epoch": 0.09546160865521695, + "grad_norm": 1.5921106338500977, + "learning_rate": 1.9999826416794345e-05, + "loss": 1.3799, + "step": 4093 + }, + { + "epoch": 0.09548493179439486, + "grad_norm": 2.0874788761138916, + "learning_rate": 1.999982493001142e-05, + "loss": 1.2227, + "step": 4094 + }, + { + "epoch": 0.09550825493357279, + "grad_norm": 2.1337532997131348, + "learning_rate": 1.999982343688834e-05, + "loss": 1.4877, + "step": 4095 + }, + { + "epoch": 0.0955315780727507, + "grad_norm": 2.3292899131774902, + "learning_rate": 1.9999821937425114e-05, + "loss": 1.5423, + "step": 4096 + }, + { + "epoch": 0.09555490121192863, + "grad_norm": 1.8624464273452759, + "learning_rate": 1.999982043162173e-05, + "loss": 1.2917, + "step": 4097 + }, + { + "epoch": 0.09557822435110654, + "grad_norm": 2.0876848697662354, + "learning_rate": 1.9999818919478205e-05, + "loss": 1.0749, + "step": 4098 + }, + { + "epoch": 0.09560154749028446, + "grad_norm": 1.882411241531372, + "learning_rate": 1.9999817400994533e-05, + "loss": 1.4965, + "step": 4099 + }, + { + "epoch": 0.09562487062946237, + "grad_norm": 1.6623929738998413, + "learning_rate": 1.999981587617071e-05, + "loss": 1.1074, + "step": 4100 + }, + { + "epoch": 0.0956481937686403, + "grad_norm": 1.3498985767364502, + "learning_rate": 1.9999814345006744e-05, + "loss": 1.1718, + "step": 4101 + }, + { + "epoch": 0.09567151690781821, + "grad_norm": 2.064556121826172, + "learning_rate": 1.9999812807502632e-05, + "loss": 1.4264, + "step": 4102 + }, + { + "epoch": 0.09569484004699612, + "grad_norm": 1.7603230476379395, + "learning_rate": 1.9999811263658376e-05, + "loss": 1.542, + "step": 4103 + }, + { + "epoch": 0.09571816318617404, + "grad_norm": 1.6809413433074951, + "learning_rate": 1.9999809713473977e-05, + "loss": 1.4546, + "step": 4104 + }, + { + "epoch": 0.09574148632535195, + "grad_norm": 1.659028172492981, + "learning_rate": 1.9999808156949435e-05, + "loss": 1.3343, + "step": 4105 + }, + { + "epoch": 0.09576480946452988, + "grad_norm": 1.9118539094924927, + "learning_rate": 1.9999806594084753e-05, + "loss": 1.6991, + "step": 4106 + }, + { + "epoch": 0.09578813260370779, + "grad_norm": 1.984724521636963, + "learning_rate": 1.9999805024879934e-05, + "loss": 1.5914, + "step": 4107 + }, + { + "epoch": 0.09581145574288571, + "grad_norm": 1.6731042861938477, + "learning_rate": 1.9999803449334972e-05, + "loss": 1.5506, + "step": 4108 + }, + { + "epoch": 0.09583477888206363, + "grad_norm": 1.8623292446136475, + "learning_rate": 1.9999801867449874e-05, + "loss": 1.3468, + "step": 4109 + }, + { + "epoch": 0.09585810202124155, + "grad_norm": 1.2610914707183838, + "learning_rate": 1.999980027922464e-05, + "loss": 1.2062, + "step": 4110 + }, + { + "epoch": 0.09588142516041946, + "grad_norm": 1.6934281587600708, + "learning_rate": 1.999979868465927e-05, + "loss": 1.3682, + "step": 4111 + }, + { + "epoch": 0.09590474829959739, + "grad_norm": 1.9726686477661133, + "learning_rate": 1.9999797083753764e-05, + "loss": 1.371, + "step": 4112 + }, + { + "epoch": 0.0959280714387753, + "grad_norm": 1.8796629905700684, + "learning_rate": 1.9999795476508123e-05, + "loss": 1.4994, + "step": 4113 + }, + { + "epoch": 0.09595139457795322, + "grad_norm": 1.68529212474823, + "learning_rate": 1.9999793862922352e-05, + "loss": 1.465, + "step": 4114 + }, + { + "epoch": 0.09597471771713113, + "grad_norm": 1.8680670261383057, + "learning_rate": 1.999979224299645e-05, + "loss": 1.6986, + "step": 4115 + }, + { + "epoch": 0.09599804085630906, + "grad_norm": 1.9507824182510376, + "learning_rate": 1.9999790616730412e-05, + "loss": 1.5233, + "step": 4116 + }, + { + "epoch": 0.09602136399548697, + "grad_norm": 1.6027692556381226, + "learning_rate": 1.9999788984124246e-05, + "loss": 1.5378, + "step": 4117 + }, + { + "epoch": 0.0960446871346649, + "grad_norm": 2.4456326961517334, + "learning_rate": 1.9999787345177953e-05, + "loss": 1.3405, + "step": 4118 + }, + { + "epoch": 0.0960680102738428, + "grad_norm": 1.7938956022262573, + "learning_rate": 1.999978569989153e-05, + "loss": 1.6514, + "step": 4119 + }, + { + "epoch": 0.09609133341302073, + "grad_norm": 1.6841145753860474, + "learning_rate": 1.999978404826498e-05, + "loss": 1.4499, + "step": 4120 + }, + { + "epoch": 0.09611465655219864, + "grad_norm": 1.6563725471496582, + "learning_rate": 1.9999782390298307e-05, + "loss": 1.7497, + "step": 4121 + }, + { + "epoch": 0.09613797969137657, + "grad_norm": 1.7308846712112427, + "learning_rate": 1.9999780725991512e-05, + "loss": 1.3619, + "step": 4122 + }, + { + "epoch": 0.09616130283055448, + "grad_norm": 2.038050413131714, + "learning_rate": 1.999977905534459e-05, + "loss": 1.4866, + "step": 4123 + }, + { + "epoch": 0.0961846259697324, + "grad_norm": 1.775932788848877, + "learning_rate": 1.9999777378357544e-05, + "loss": 1.5068, + "step": 4124 + }, + { + "epoch": 0.09620794910891031, + "grad_norm": 1.7475711107254028, + "learning_rate": 1.999977569503038e-05, + "loss": 1.2989, + "step": 4125 + }, + { + "epoch": 0.09623127224808824, + "grad_norm": 1.4972944259643555, + "learning_rate": 1.9999774005363094e-05, + "loss": 1.2221, + "step": 4126 + }, + { + "epoch": 0.09625459538726615, + "grad_norm": 1.9756770133972168, + "learning_rate": 1.999977230935569e-05, + "loss": 1.6019, + "step": 4127 + }, + { + "epoch": 0.09627791852644407, + "grad_norm": 1.3822906017303467, + "learning_rate": 1.9999770607008163e-05, + "loss": 1.4286, + "step": 4128 + }, + { + "epoch": 0.09630124166562198, + "grad_norm": 1.4641766548156738, + "learning_rate": 1.9999768898320525e-05, + "loss": 1.518, + "step": 4129 + }, + { + "epoch": 0.09632456480479991, + "grad_norm": 1.973097562789917, + "learning_rate": 1.9999767183292767e-05, + "loss": 1.4911, + "step": 4130 + }, + { + "epoch": 0.09634788794397782, + "grad_norm": 1.9415779113769531, + "learning_rate": 1.9999765461924893e-05, + "loss": 1.5681, + "step": 4131 + }, + { + "epoch": 0.09637121108315573, + "grad_norm": 1.7455964088439941, + "learning_rate": 1.9999763734216907e-05, + "loss": 1.8105, + "step": 4132 + }, + { + "epoch": 0.09639453422233366, + "grad_norm": 1.4776321649551392, + "learning_rate": 1.999976200016881e-05, + "loss": 1.4157, + "step": 4133 + }, + { + "epoch": 0.09641785736151157, + "grad_norm": 1.9864835739135742, + "learning_rate": 1.9999760259780602e-05, + "loss": 1.5405, + "step": 4134 + }, + { + "epoch": 0.09644118050068949, + "grad_norm": 1.9087644815444946, + "learning_rate": 1.999975851305228e-05, + "loss": 1.1071, + "step": 4135 + }, + { + "epoch": 0.0964645036398674, + "grad_norm": 1.7016524076461792, + "learning_rate": 1.999975675998385e-05, + "loss": 1.6047, + "step": 4136 + }, + { + "epoch": 0.09648782677904533, + "grad_norm": 1.7450127601623535, + "learning_rate": 1.9999755000575313e-05, + "loss": 1.7406, + "step": 4137 + }, + { + "epoch": 0.09651114991822324, + "grad_norm": 1.5106456279754639, + "learning_rate": 1.9999753234826667e-05, + "loss": 1.1769, + "step": 4138 + }, + { + "epoch": 0.09653447305740116, + "grad_norm": 1.538486361503601, + "learning_rate": 1.9999751462737915e-05, + "loss": 1.4457, + "step": 4139 + }, + { + "epoch": 0.09655779619657907, + "grad_norm": 1.83707594871521, + "learning_rate": 1.999974968430906e-05, + "loss": 1.8298, + "step": 4140 + }, + { + "epoch": 0.096581119335757, + "grad_norm": 1.85380220413208, + "learning_rate": 1.99997478995401e-05, + "loss": 1.5184, + "step": 4141 + }, + { + "epoch": 0.09660444247493491, + "grad_norm": 1.5038057565689087, + "learning_rate": 1.9999746108431036e-05, + "loss": 1.0325, + "step": 4142 + }, + { + "epoch": 0.09662776561411283, + "grad_norm": 1.5708307027816772, + "learning_rate": 1.9999744310981873e-05, + "loss": 1.0565, + "step": 4143 + }, + { + "epoch": 0.09665108875329075, + "grad_norm": 1.6708171367645264, + "learning_rate": 1.9999742507192608e-05, + "loss": 1.5968, + "step": 4144 + }, + { + "epoch": 0.09667441189246867, + "grad_norm": 2.186654806137085, + "learning_rate": 1.9999740697063244e-05, + "loss": 1.5134, + "step": 4145 + }, + { + "epoch": 0.09669773503164658, + "grad_norm": 2.057157039642334, + "learning_rate": 1.9999738880593783e-05, + "loss": 1.5829, + "step": 4146 + }, + { + "epoch": 0.0967210581708245, + "grad_norm": 2.131748914718628, + "learning_rate": 1.9999737057784224e-05, + "loss": 1.6146, + "step": 4147 + }, + { + "epoch": 0.09674438131000242, + "grad_norm": 1.6583216190338135, + "learning_rate": 1.9999735228634565e-05, + "loss": 1.3262, + "step": 4148 + }, + { + "epoch": 0.09676770444918034, + "grad_norm": 1.5584999322891235, + "learning_rate": 1.9999733393144818e-05, + "loss": 1.4176, + "step": 4149 + }, + { + "epoch": 0.09679102758835825, + "grad_norm": 1.7254819869995117, + "learning_rate": 1.9999731551314974e-05, + "loss": 1.3999, + "step": 4150 + }, + { + "epoch": 0.09681435072753618, + "grad_norm": 1.5138722658157349, + "learning_rate": 1.9999729703145038e-05, + "loss": 1.373, + "step": 4151 + }, + { + "epoch": 0.09683767386671409, + "grad_norm": 1.4465757608413696, + "learning_rate": 1.999972784863501e-05, + "loss": 1.2654, + "step": 4152 + }, + { + "epoch": 0.09686099700589201, + "grad_norm": 1.792013168334961, + "learning_rate": 1.9999725987784896e-05, + "loss": 1.4372, + "step": 4153 + }, + { + "epoch": 0.09688432014506992, + "grad_norm": 2.385373115539551, + "learning_rate": 1.999972412059469e-05, + "loss": 1.2813, + "step": 4154 + }, + { + "epoch": 0.09690764328424785, + "grad_norm": 1.9845149517059326, + "learning_rate": 1.9999722247064394e-05, + "loss": 1.6797, + "step": 4155 + }, + { + "epoch": 0.09693096642342576, + "grad_norm": 1.7642122507095337, + "learning_rate": 1.9999720367194016e-05, + "loss": 1.3114, + "step": 4156 + }, + { + "epoch": 0.09695428956260368, + "grad_norm": 1.6494389772415161, + "learning_rate": 1.9999718480983553e-05, + "loss": 1.7949, + "step": 4157 + }, + { + "epoch": 0.0969776127017816, + "grad_norm": 2.112375259399414, + "learning_rate": 1.9999716588433004e-05, + "loss": 1.6229, + "step": 4158 + }, + { + "epoch": 0.09700093584095952, + "grad_norm": 1.6811909675598145, + "learning_rate": 1.9999714689542373e-05, + "loss": 1.1249, + "step": 4159 + }, + { + "epoch": 0.09702425898013743, + "grad_norm": 2.01330304145813, + "learning_rate": 1.999971278431166e-05, + "loss": 1.25, + "step": 4160 + }, + { + "epoch": 0.09704758211931534, + "grad_norm": 1.8410260677337646, + "learning_rate": 1.9999710872740863e-05, + "loss": 1.6418, + "step": 4161 + }, + { + "epoch": 0.09707090525849327, + "grad_norm": 1.4903624057769775, + "learning_rate": 1.9999708954829992e-05, + "loss": 1.1562, + "step": 4162 + }, + { + "epoch": 0.09709422839767118, + "grad_norm": 1.8792493343353271, + "learning_rate": 1.9999707030579038e-05, + "loss": 1.2509, + "step": 4163 + }, + { + "epoch": 0.0971175515368491, + "grad_norm": 2.2513837814331055, + "learning_rate": 1.9999705099988012e-05, + "loss": 1.5141, + "step": 4164 + }, + { + "epoch": 0.09714087467602701, + "grad_norm": 1.8120847940444946, + "learning_rate": 1.9999703163056908e-05, + "loss": 1.5603, + "step": 4165 + }, + { + "epoch": 0.09716419781520494, + "grad_norm": 1.8012254238128662, + "learning_rate": 1.999970121978573e-05, + "loss": 1.7928, + "step": 4166 + }, + { + "epoch": 0.09718752095438285, + "grad_norm": 1.5914149284362793, + "learning_rate": 1.999969927017448e-05, + "loss": 1.398, + "step": 4167 + }, + { + "epoch": 0.09721084409356077, + "grad_norm": 2.2462551593780518, + "learning_rate": 1.9999697314223158e-05, + "loss": 1.3219, + "step": 4168 + }, + { + "epoch": 0.09723416723273869, + "grad_norm": 1.847602128982544, + "learning_rate": 1.999969535193176e-05, + "loss": 1.4635, + "step": 4169 + }, + { + "epoch": 0.09725749037191661, + "grad_norm": 1.968076467514038, + "learning_rate": 1.99996933833003e-05, + "loss": 1.6123, + "step": 4170 + }, + { + "epoch": 0.09728081351109452, + "grad_norm": 1.9207682609558105, + "learning_rate": 1.999969140832877e-05, + "loss": 1.3354, + "step": 4171 + }, + { + "epoch": 0.09730413665027245, + "grad_norm": 1.8169053792953491, + "learning_rate": 1.9999689427017174e-05, + "loss": 1.357, + "step": 4172 + }, + { + "epoch": 0.09732745978945036, + "grad_norm": 1.7797213792800903, + "learning_rate": 1.9999687439365506e-05, + "loss": 1.2203, + "step": 4173 + }, + { + "epoch": 0.09735078292862828, + "grad_norm": 1.9157795906066895, + "learning_rate": 1.999968544537378e-05, + "loss": 1.4148, + "step": 4174 + }, + { + "epoch": 0.0973741060678062, + "grad_norm": 1.6914931535720825, + "learning_rate": 1.999968344504199e-05, + "loss": 1.4039, + "step": 4175 + }, + { + "epoch": 0.09739742920698412, + "grad_norm": 6.293972015380859, + "learning_rate": 1.9999681438370134e-05, + "loss": 1.4866, + "step": 4176 + }, + { + "epoch": 0.09742075234616203, + "grad_norm": 1.5637798309326172, + "learning_rate": 1.999967942535822e-05, + "loss": 1.572, + "step": 4177 + }, + { + "epoch": 0.09744407548533995, + "grad_norm": 1.877367377281189, + "learning_rate": 1.999967740600625e-05, + "loss": 1.7372, + "step": 4178 + }, + { + "epoch": 0.09746739862451786, + "grad_norm": 1.6659308671951294, + "learning_rate": 1.999967538031422e-05, + "loss": 1.639, + "step": 4179 + }, + { + "epoch": 0.09749072176369579, + "grad_norm": 1.550208568572998, + "learning_rate": 1.999967334828213e-05, + "loss": 1.1717, + "step": 4180 + }, + { + "epoch": 0.0975140449028737, + "grad_norm": 2.298583745956421, + "learning_rate": 1.999967130990999e-05, + "loss": 1.5788, + "step": 4181 + }, + { + "epoch": 0.09753736804205163, + "grad_norm": 1.6380672454833984, + "learning_rate": 1.9999669265197796e-05, + "loss": 1.2321, + "step": 4182 + }, + { + "epoch": 0.09756069118122954, + "grad_norm": 1.7410871982574463, + "learning_rate": 1.9999667214145542e-05, + "loss": 1.1268, + "step": 4183 + }, + { + "epoch": 0.09758401432040746, + "grad_norm": 1.8795753717422485, + "learning_rate": 1.999966515675324e-05, + "loss": 1.3166, + "step": 4184 + }, + { + "epoch": 0.09760733745958537, + "grad_norm": 1.4578545093536377, + "learning_rate": 1.999966309302089e-05, + "loss": 1.2819, + "step": 4185 + }, + { + "epoch": 0.0976306605987633, + "grad_norm": 2.1507022380828857, + "learning_rate": 1.999966102294849e-05, + "loss": 1.3444, + "step": 4186 + }, + { + "epoch": 0.09765398373794121, + "grad_norm": 1.5323656797409058, + "learning_rate": 1.999965894653604e-05, + "loss": 1.4338, + "step": 4187 + }, + { + "epoch": 0.09767730687711913, + "grad_norm": 1.7071229219436646, + "learning_rate": 1.9999656863783546e-05, + "loss": 1.6295, + "step": 4188 + }, + { + "epoch": 0.09770063001629704, + "grad_norm": 1.5819435119628906, + "learning_rate": 1.999965477469101e-05, + "loss": 1.2241, + "step": 4189 + }, + { + "epoch": 0.09772395315547495, + "grad_norm": 1.9116559028625488, + "learning_rate": 1.9999652679258424e-05, + "loss": 1.5789, + "step": 4190 + }, + { + "epoch": 0.09774727629465288, + "grad_norm": 1.4237418174743652, + "learning_rate": 1.9999650577485797e-05, + "loss": 1.213, + "step": 4191 + }, + { + "epoch": 0.09777059943383079, + "grad_norm": 1.715041160583496, + "learning_rate": 1.9999648469373132e-05, + "loss": 1.4415, + "step": 4192 + }, + { + "epoch": 0.09779392257300872, + "grad_norm": 1.9360545873641968, + "learning_rate": 1.9999646354920425e-05, + "loss": 1.405, + "step": 4193 + }, + { + "epoch": 0.09781724571218663, + "grad_norm": 2.3374712467193604, + "learning_rate": 1.9999644234127684e-05, + "loss": 1.6741, + "step": 4194 + }, + { + "epoch": 0.09784056885136455, + "grad_norm": 2.0107362270355225, + "learning_rate": 1.99996421069949e-05, + "loss": 1.5874, + "step": 4195 + }, + { + "epoch": 0.09786389199054246, + "grad_norm": 2.5181028842926025, + "learning_rate": 1.9999639973522086e-05, + "loss": 1.6139, + "step": 4196 + }, + { + "epoch": 0.09788721512972039, + "grad_norm": 1.7766554355621338, + "learning_rate": 1.9999637833709233e-05, + "loss": 1.3219, + "step": 4197 + }, + { + "epoch": 0.0979105382688983, + "grad_norm": 2.135298013687134, + "learning_rate": 1.999963568755635e-05, + "loss": 1.598, + "step": 4198 + }, + { + "epoch": 0.09793386140807622, + "grad_norm": 2.5849833488464355, + "learning_rate": 1.9999633535063433e-05, + "loss": 1.19, + "step": 4199 + }, + { + "epoch": 0.09795718454725413, + "grad_norm": 1.7871876955032349, + "learning_rate": 1.999963137623049e-05, + "loss": 1.2969, + "step": 4200 + }, + { + "epoch": 0.09798050768643206, + "grad_norm": 1.6766856908798218, + "learning_rate": 1.9999629211057514e-05, + "loss": 1.7685, + "step": 4201 + }, + { + "epoch": 0.09800383082560997, + "grad_norm": 1.9163066148757935, + "learning_rate": 1.999962703954451e-05, + "loss": 1.6925, + "step": 4202 + }, + { + "epoch": 0.0980271539647879, + "grad_norm": 3.168355941772461, + "learning_rate": 1.9999624861691482e-05, + "loss": 1.1072, + "step": 4203 + }, + { + "epoch": 0.0980504771039658, + "grad_norm": 1.8867367506027222, + "learning_rate": 1.999962267749843e-05, + "loss": 0.8637, + "step": 4204 + }, + { + "epoch": 0.09807380024314373, + "grad_norm": 1.6280899047851562, + "learning_rate": 1.9999620486965356e-05, + "loss": 1.7693, + "step": 4205 + }, + { + "epoch": 0.09809712338232164, + "grad_norm": 1.8064197301864624, + "learning_rate": 1.9999618290092254e-05, + "loss": 1.579, + "step": 4206 + }, + { + "epoch": 0.09812044652149957, + "grad_norm": 1.6739108562469482, + "learning_rate": 1.9999616086879134e-05, + "loss": 1.326, + "step": 4207 + }, + { + "epoch": 0.09814376966067748, + "grad_norm": 1.659472107887268, + "learning_rate": 1.9999613877326e-05, + "loss": 1.4379, + "step": 4208 + }, + { + "epoch": 0.0981670927998554, + "grad_norm": 1.6488410234451294, + "learning_rate": 1.9999611661432844e-05, + "loss": 1.5683, + "step": 4209 + }, + { + "epoch": 0.09819041593903331, + "grad_norm": 1.5839974880218506, + "learning_rate": 1.999960943919967e-05, + "loss": 1.567, + "step": 4210 + }, + { + "epoch": 0.09821373907821124, + "grad_norm": 1.7465866804122925, + "learning_rate": 1.9999607210626485e-05, + "loss": 1.6317, + "step": 4211 + }, + { + "epoch": 0.09823706221738915, + "grad_norm": 1.8293263912200928, + "learning_rate": 1.9999604975713286e-05, + "loss": 1.6924, + "step": 4212 + }, + { + "epoch": 0.09826038535656707, + "grad_norm": 1.6221671104431152, + "learning_rate": 1.999960273446007e-05, + "loss": 1.5512, + "step": 4213 + }, + { + "epoch": 0.09828370849574498, + "grad_norm": 2.131727695465088, + "learning_rate": 1.9999600486866848e-05, + "loss": 1.6782, + "step": 4214 + }, + { + "epoch": 0.09830703163492291, + "grad_norm": 1.6054993867874146, + "learning_rate": 1.9999598232933615e-05, + "loss": 1.5069, + "step": 4215 + }, + { + "epoch": 0.09833035477410082, + "grad_norm": 1.7985066175460815, + "learning_rate": 1.999959597266038e-05, + "loss": 1.519, + "step": 4216 + }, + { + "epoch": 0.09835367791327873, + "grad_norm": 1.9092615842819214, + "learning_rate": 1.999959370604713e-05, + "loss": 1.6574, + "step": 4217 + }, + { + "epoch": 0.09837700105245666, + "grad_norm": 2.288851261138916, + "learning_rate": 1.9999591433093878e-05, + "loss": 1.5944, + "step": 4218 + }, + { + "epoch": 0.09840032419163457, + "grad_norm": 1.9145318269729614, + "learning_rate": 1.999958915380062e-05, + "loss": 1.7942, + "step": 4219 + }, + { + "epoch": 0.09842364733081249, + "grad_norm": 1.755913257598877, + "learning_rate": 1.9999586868167367e-05, + "loss": 1.3487, + "step": 4220 + }, + { + "epoch": 0.0984469704699904, + "grad_norm": 1.7752525806427002, + "learning_rate": 1.999958457619411e-05, + "loss": 1.4491, + "step": 4221 + }, + { + "epoch": 0.09847029360916833, + "grad_norm": 1.9941532611846924, + "learning_rate": 1.9999582277880852e-05, + "loss": 1.8066, + "step": 4222 + }, + { + "epoch": 0.09849361674834624, + "grad_norm": 1.6862519979476929, + "learning_rate": 1.99995799732276e-05, + "loss": 1.5079, + "step": 4223 + }, + { + "epoch": 0.09851693988752416, + "grad_norm": 1.443449854850769, + "learning_rate": 1.9999577662234347e-05, + "loss": 1.3898, + "step": 4224 + }, + { + "epoch": 0.09854026302670207, + "grad_norm": 1.8810759782791138, + "learning_rate": 1.9999575344901102e-05, + "loss": 1.5129, + "step": 4225 + }, + { + "epoch": 0.09856358616588, + "grad_norm": 1.6053975820541382, + "learning_rate": 1.9999573021227862e-05, + "loss": 1.3603, + "step": 4226 + }, + { + "epoch": 0.09858690930505791, + "grad_norm": 1.8535492420196533, + "learning_rate": 1.9999570691214632e-05, + "loss": 1.5042, + "step": 4227 + }, + { + "epoch": 0.09861023244423583, + "grad_norm": 1.8407317399978638, + "learning_rate": 1.999956835486141e-05, + "loss": 1.4641, + "step": 4228 + }, + { + "epoch": 0.09863355558341375, + "grad_norm": 1.6712974309921265, + "learning_rate": 1.9999566012168202e-05, + "loss": 1.2335, + "step": 4229 + }, + { + "epoch": 0.09865687872259167, + "grad_norm": 1.3778800964355469, + "learning_rate": 1.9999563663135006e-05, + "loss": 1.241, + "step": 4230 + }, + { + "epoch": 0.09868020186176958, + "grad_norm": 1.8800315856933594, + "learning_rate": 1.9999561307761823e-05, + "loss": 1.4065, + "step": 4231 + }, + { + "epoch": 0.0987035250009475, + "grad_norm": 2.0709855556488037, + "learning_rate": 1.9999558946048655e-05, + "loss": 1.5202, + "step": 4232 + }, + { + "epoch": 0.09872684814012542, + "grad_norm": 1.7393478155136108, + "learning_rate": 1.9999556577995504e-05, + "loss": 1.5224, + "step": 4233 + }, + { + "epoch": 0.09875017127930334, + "grad_norm": 1.7007765769958496, + "learning_rate": 1.999955420360237e-05, + "loss": 1.4261, + "step": 4234 + }, + { + "epoch": 0.09877349441848125, + "grad_norm": 1.7563802003860474, + "learning_rate": 1.9999551822869262e-05, + "loss": 1.4401, + "step": 4235 + }, + { + "epoch": 0.09879681755765918, + "grad_norm": 1.7062292098999023, + "learning_rate": 1.999954943579617e-05, + "loss": 1.6909, + "step": 4236 + }, + { + "epoch": 0.09882014069683709, + "grad_norm": 1.8759976625442505, + "learning_rate": 1.9999547042383102e-05, + "loss": 1.7044, + "step": 4237 + }, + { + "epoch": 0.09884346383601501, + "grad_norm": 1.8254616260528564, + "learning_rate": 1.999954464263006e-05, + "loss": 1.5238, + "step": 4238 + }, + { + "epoch": 0.09886678697519292, + "grad_norm": 2.0938169956207275, + "learning_rate": 1.9999542236537043e-05, + "loss": 0.9963, + "step": 4239 + }, + { + "epoch": 0.09889011011437085, + "grad_norm": 1.818045973777771, + "learning_rate": 1.9999539824104057e-05, + "loss": 1.5701, + "step": 4240 + }, + { + "epoch": 0.09891343325354876, + "grad_norm": 1.4690525531768799, + "learning_rate": 1.9999537405331098e-05, + "loss": 1.5044, + "step": 4241 + }, + { + "epoch": 0.09893675639272669, + "grad_norm": 1.7315692901611328, + "learning_rate": 1.999953498021817e-05, + "loss": 1.2657, + "step": 4242 + }, + { + "epoch": 0.0989600795319046, + "grad_norm": 2.039533853530884, + "learning_rate": 1.9999532548765272e-05, + "loss": 1.5758, + "step": 4243 + }, + { + "epoch": 0.09898340267108252, + "grad_norm": 2.1871633529663086, + "learning_rate": 1.999953011097241e-05, + "loss": 1.8441, + "step": 4244 + }, + { + "epoch": 0.09900672581026043, + "grad_norm": 1.8292895555496216, + "learning_rate": 1.9999527666839584e-05, + "loss": 1.4835, + "step": 4245 + }, + { + "epoch": 0.09903004894943834, + "grad_norm": 1.85469388961792, + "learning_rate": 1.9999525216366793e-05, + "loss": 1.6256, + "step": 4246 + }, + { + "epoch": 0.09905337208861627, + "grad_norm": 1.8915073871612549, + "learning_rate": 1.9999522759554045e-05, + "loss": 1.3652, + "step": 4247 + }, + { + "epoch": 0.09907669522779418, + "grad_norm": 1.4933258295059204, + "learning_rate": 1.999952029640133e-05, + "loss": 1.5552, + "step": 4248 + }, + { + "epoch": 0.0991000183669721, + "grad_norm": 2.428149938583374, + "learning_rate": 1.999951782690866e-05, + "loss": 1.4655, + "step": 4249 + }, + { + "epoch": 0.09912334150615001, + "grad_norm": 1.7580535411834717, + "learning_rate": 1.9999515351076036e-05, + "loss": 1.2424, + "step": 4250 + }, + { + "epoch": 0.09914666464532794, + "grad_norm": 1.9982497692108154, + "learning_rate": 1.9999512868903453e-05, + "loss": 1.4033, + "step": 4251 + }, + { + "epoch": 0.09916998778450585, + "grad_norm": 2.0487194061279297, + "learning_rate": 1.9999510380390917e-05, + "loss": 1.7197, + "step": 4252 + }, + { + "epoch": 0.09919331092368378, + "grad_norm": 1.6107131242752075, + "learning_rate": 1.999950788553843e-05, + "loss": 1.6606, + "step": 4253 + }, + { + "epoch": 0.09921663406286169, + "grad_norm": 2.1322474479675293, + "learning_rate": 1.999950538434599e-05, + "loss": 1.2501, + "step": 4254 + }, + { + "epoch": 0.09923995720203961, + "grad_norm": 1.7192622423171997, + "learning_rate": 1.9999502876813602e-05, + "loss": 1.4732, + "step": 4255 + }, + { + "epoch": 0.09926328034121752, + "grad_norm": 1.885438084602356, + "learning_rate": 1.9999500362941272e-05, + "loss": 1.6839, + "step": 4256 + }, + { + "epoch": 0.09928660348039545, + "grad_norm": 1.5883655548095703, + "learning_rate": 1.999949784272899e-05, + "loss": 1.478, + "step": 4257 + }, + { + "epoch": 0.09930992661957336, + "grad_norm": 2.0354058742523193, + "learning_rate": 1.9999495316176766e-05, + "loss": 1.3814, + "step": 4258 + }, + { + "epoch": 0.09933324975875128, + "grad_norm": 2.1163158416748047, + "learning_rate": 1.99994927832846e-05, + "loss": 1.677, + "step": 4259 + }, + { + "epoch": 0.0993565728979292, + "grad_norm": 2.6560354232788086, + "learning_rate": 1.999949024405249e-05, + "loss": 1.4997, + "step": 4260 + }, + { + "epoch": 0.09937989603710712, + "grad_norm": 2.152900218963623, + "learning_rate": 1.9999487698480443e-05, + "loss": 1.7245, + "step": 4261 + }, + { + "epoch": 0.09940321917628503, + "grad_norm": 1.2850993871688843, + "learning_rate": 1.999948514656846e-05, + "loss": 1.3858, + "step": 4262 + }, + { + "epoch": 0.09942654231546295, + "grad_norm": 2.006429433822632, + "learning_rate": 1.9999482588316537e-05, + "loss": 1.3124, + "step": 4263 + }, + { + "epoch": 0.09944986545464087, + "grad_norm": 2.0759809017181396, + "learning_rate": 1.999948002372468e-05, + "loss": 1.5445, + "step": 4264 + }, + { + "epoch": 0.09947318859381879, + "grad_norm": 1.7543976306915283, + "learning_rate": 1.999947745279289e-05, + "loss": 1.4313, + "step": 4265 + }, + { + "epoch": 0.0994965117329967, + "grad_norm": 1.5660021305084229, + "learning_rate": 1.9999474875521172e-05, + "loss": 1.3618, + "step": 4266 + }, + { + "epoch": 0.09951983487217463, + "grad_norm": 1.7976728677749634, + "learning_rate": 1.9999472291909523e-05, + "loss": 1.2068, + "step": 4267 + }, + { + "epoch": 0.09954315801135254, + "grad_norm": 1.2305090427398682, + "learning_rate": 1.9999469701957944e-05, + "loss": 0.9688, + "step": 4268 + }, + { + "epoch": 0.09956648115053046, + "grad_norm": 1.636967658996582, + "learning_rate": 1.9999467105666442e-05, + "loss": 1.2086, + "step": 4269 + }, + { + "epoch": 0.09958980428970837, + "grad_norm": 1.271087646484375, + "learning_rate": 1.9999464503035014e-05, + "loss": 0.8756, + "step": 4270 + }, + { + "epoch": 0.0996131274288863, + "grad_norm": 2.0414915084838867, + "learning_rate": 1.9999461894063663e-05, + "loss": 1.6801, + "step": 4271 + }, + { + "epoch": 0.09963645056806421, + "grad_norm": 1.539941430091858, + "learning_rate": 1.9999459278752392e-05, + "loss": 1.2946, + "step": 4272 + }, + { + "epoch": 0.09965977370724213, + "grad_norm": 2.676841974258423, + "learning_rate": 1.9999456657101198e-05, + "loss": 1.3578, + "step": 4273 + }, + { + "epoch": 0.09968309684642004, + "grad_norm": 1.3585354089736938, + "learning_rate": 1.9999454029110088e-05, + "loss": 1.087, + "step": 4274 + }, + { + "epoch": 0.09970641998559795, + "grad_norm": 1.530239224433899, + "learning_rate": 1.999945139477906e-05, + "loss": 1.3106, + "step": 4275 + }, + { + "epoch": 0.09972974312477588, + "grad_norm": 2.2003183364868164, + "learning_rate": 1.999944875410812e-05, + "loss": 1.381, + "step": 4276 + }, + { + "epoch": 0.09975306626395379, + "grad_norm": 2.0120465755462646, + "learning_rate": 1.9999446107097263e-05, + "loss": 1.5153, + "step": 4277 + }, + { + "epoch": 0.09977638940313172, + "grad_norm": 1.709818959236145, + "learning_rate": 1.9999443453746498e-05, + "loss": 1.5923, + "step": 4278 + }, + { + "epoch": 0.09979971254230963, + "grad_norm": 1.913179636001587, + "learning_rate": 1.9999440794055824e-05, + "loss": 1.5885, + "step": 4279 + }, + { + "epoch": 0.09982303568148755, + "grad_norm": 1.7344928979873657, + "learning_rate": 1.9999438128025243e-05, + "loss": 1.5675, + "step": 4280 + }, + { + "epoch": 0.09984635882066546, + "grad_norm": 1.9740248918533325, + "learning_rate": 1.999943545565475e-05, + "loss": 1.547, + "step": 4281 + }, + { + "epoch": 0.09986968195984339, + "grad_norm": 1.8129318952560425, + "learning_rate": 1.9999432776944357e-05, + "loss": 1.421, + "step": 4282 + }, + { + "epoch": 0.0998930050990213, + "grad_norm": 1.5063918828964233, + "learning_rate": 1.9999430091894058e-05, + "loss": 1.4418, + "step": 4283 + }, + { + "epoch": 0.09991632823819922, + "grad_norm": 1.7168734073638916, + "learning_rate": 1.999942740050386e-05, + "loss": 1.2619, + "step": 4284 + }, + { + "epoch": 0.09993965137737713, + "grad_norm": 1.9776476621627808, + "learning_rate": 1.9999424702773762e-05, + "loss": 1.222, + "step": 4285 + }, + { + "epoch": 0.09996297451655506, + "grad_norm": 2.2479588985443115, + "learning_rate": 1.999942199870377e-05, + "loss": 1.0917, + "step": 4286 + }, + { + "epoch": 0.09998629765573297, + "grad_norm": 1.6717889308929443, + "learning_rate": 1.9999419288293876e-05, + "loss": 1.086, + "step": 4287 + }, + { + "epoch": 0.1000096207949109, + "grad_norm": 1.878680944442749, + "learning_rate": 1.9999416571544088e-05, + "loss": 1.5367, + "step": 4288 + }, + { + "epoch": 0.1000329439340888, + "grad_norm": 1.7646178007125854, + "learning_rate": 1.9999413848454407e-05, + "loss": 1.4865, + "step": 4289 + }, + { + "epoch": 0.10005626707326673, + "grad_norm": 1.8721680641174316, + "learning_rate": 1.999941111902484e-05, + "loss": 1.3529, + "step": 4290 + }, + { + "epoch": 0.10007959021244464, + "grad_norm": 1.8077625036239624, + "learning_rate": 1.999940838325538e-05, + "loss": 1.4719, + "step": 4291 + }, + { + "epoch": 0.10010291335162257, + "grad_norm": 2.198845148086548, + "learning_rate": 1.9999405641146032e-05, + "loss": 1.6665, + "step": 4292 + }, + { + "epoch": 0.10012623649080048, + "grad_norm": 1.691707968711853, + "learning_rate": 1.99994028926968e-05, + "loss": 1.7262, + "step": 4293 + }, + { + "epoch": 0.1001495596299784, + "grad_norm": 1.6580450534820557, + "learning_rate": 1.9999400137907684e-05, + "loss": 1.4598, + "step": 4294 + }, + { + "epoch": 0.10017288276915631, + "grad_norm": 1.6645382642745972, + "learning_rate": 1.9999397376778685e-05, + "loss": 1.4816, + "step": 4295 + }, + { + "epoch": 0.10019620590833424, + "grad_norm": 1.5742464065551758, + "learning_rate": 1.9999394609309807e-05, + "loss": 1.5432, + "step": 4296 + }, + { + "epoch": 0.10021952904751215, + "grad_norm": 1.6496295928955078, + "learning_rate": 1.999939183550105e-05, + "loss": 1.6818, + "step": 4297 + }, + { + "epoch": 0.10024285218669007, + "grad_norm": 1.385653018951416, + "learning_rate": 1.9999389055352417e-05, + "loss": 1.2546, + "step": 4298 + }, + { + "epoch": 0.10026617532586798, + "grad_norm": 1.8121085166931152, + "learning_rate": 1.9999386268863906e-05, + "loss": 1.6767, + "step": 4299 + }, + { + "epoch": 0.10028949846504591, + "grad_norm": 1.7636923789978027, + "learning_rate": 1.9999383476035526e-05, + "loss": 1.3737, + "step": 4300 + }, + { + "epoch": 0.10031282160422382, + "grad_norm": 2.0811192989349365, + "learning_rate": 1.999938067686727e-05, + "loss": 1.193, + "step": 4301 + }, + { + "epoch": 0.10033614474340175, + "grad_norm": 1.491807460784912, + "learning_rate": 1.9999377871359145e-05, + "loss": 1.3031, + "step": 4302 + }, + { + "epoch": 0.10035946788257966, + "grad_norm": 1.7776809930801392, + "learning_rate": 1.999937505951115e-05, + "loss": 1.2694, + "step": 4303 + }, + { + "epoch": 0.10038279102175757, + "grad_norm": 2.235464334487915, + "learning_rate": 1.9999372241323295e-05, + "loss": 1.4957, + "step": 4304 + }, + { + "epoch": 0.10040611416093549, + "grad_norm": 2.128072500228882, + "learning_rate": 1.999936941679557e-05, + "loss": 1.4781, + "step": 4305 + }, + { + "epoch": 0.1004294373001134, + "grad_norm": 1.7737715244293213, + "learning_rate": 1.9999366585927984e-05, + "loss": 1.1417, + "step": 4306 + }, + { + "epoch": 0.10045276043929133, + "grad_norm": 1.5610971450805664, + "learning_rate": 1.9999363748720536e-05, + "loss": 0.8304, + "step": 4307 + }, + { + "epoch": 0.10047608357846924, + "grad_norm": 1.8552017211914062, + "learning_rate": 1.9999360905173232e-05, + "loss": 1.5004, + "step": 4308 + }, + { + "epoch": 0.10049940671764716, + "grad_norm": 1.4400602579116821, + "learning_rate": 1.9999358055286066e-05, + "loss": 1.3209, + "step": 4309 + }, + { + "epoch": 0.10052272985682507, + "grad_norm": 1.8621962070465088, + "learning_rate": 1.999935519905905e-05, + "loss": 1.6884, + "step": 4310 + }, + { + "epoch": 0.100546052996003, + "grad_norm": 1.6067652702331543, + "learning_rate": 1.9999352336492175e-05, + "loss": 1.1682, + "step": 4311 + }, + { + "epoch": 0.10056937613518091, + "grad_norm": 1.9501607418060303, + "learning_rate": 1.999934946758545e-05, + "loss": 1.4891, + "step": 4312 + }, + { + "epoch": 0.10059269927435883, + "grad_norm": 1.669527530670166, + "learning_rate": 1.9999346592338874e-05, + "loss": 1.3783, + "step": 4313 + }, + { + "epoch": 0.10061602241353675, + "grad_norm": 1.7186279296875, + "learning_rate": 1.999934371075245e-05, + "loss": 1.3234, + "step": 4314 + }, + { + "epoch": 0.10063934555271467, + "grad_norm": 1.6204017400741577, + "learning_rate": 1.9999340822826182e-05, + "loss": 1.267, + "step": 4315 + }, + { + "epoch": 0.10066266869189258, + "grad_norm": 1.8573533296585083, + "learning_rate": 1.9999337928560066e-05, + "loss": 1.5666, + "step": 4316 + }, + { + "epoch": 0.1006859918310705, + "grad_norm": 1.4896578788757324, + "learning_rate": 1.9999335027954113e-05, + "loss": 1.319, + "step": 4317 + }, + { + "epoch": 0.10070931497024842, + "grad_norm": 1.641883134841919, + "learning_rate": 1.9999332121008314e-05, + "loss": 1.5142, + "step": 4318 + }, + { + "epoch": 0.10073263810942634, + "grad_norm": 2.2819859981536865, + "learning_rate": 1.9999329207722677e-05, + "loss": 1.738, + "step": 4319 + }, + { + "epoch": 0.10075596124860425, + "grad_norm": 1.6085293292999268, + "learning_rate": 1.9999326288097204e-05, + "loss": 1.2742, + "step": 4320 + }, + { + "epoch": 0.10077928438778218, + "grad_norm": 1.5279383659362793, + "learning_rate": 1.9999323362131894e-05, + "loss": 1.3892, + "step": 4321 + }, + { + "epoch": 0.10080260752696009, + "grad_norm": 1.8018547296524048, + "learning_rate": 1.999932042982675e-05, + "loss": 1.2437, + "step": 4322 + }, + { + "epoch": 0.10082593066613801, + "grad_norm": 2.234227418899536, + "learning_rate": 1.9999317491181775e-05, + "loss": 1.8226, + "step": 4323 + }, + { + "epoch": 0.10084925380531592, + "grad_norm": 1.7504974603652954, + "learning_rate": 1.999931454619697e-05, + "loss": 1.1958, + "step": 4324 + }, + { + "epoch": 0.10087257694449385, + "grad_norm": 1.9000635147094727, + "learning_rate": 1.9999311594872337e-05, + "loss": 1.7756, + "step": 4325 + }, + { + "epoch": 0.10089590008367176, + "grad_norm": 1.346874475479126, + "learning_rate": 1.999930863720788e-05, + "loss": 1.241, + "step": 4326 + }, + { + "epoch": 0.10091922322284969, + "grad_norm": 1.410976529121399, + "learning_rate": 1.9999305673203596e-05, + "loss": 1.1809, + "step": 4327 + }, + { + "epoch": 0.1009425463620276, + "grad_norm": 1.805508017539978, + "learning_rate": 1.9999302702859492e-05, + "loss": 1.6081, + "step": 4328 + }, + { + "epoch": 0.10096586950120552, + "grad_norm": 1.7266390323638916, + "learning_rate": 1.9999299726175566e-05, + "loss": 1.3068, + "step": 4329 + }, + { + "epoch": 0.10098919264038343, + "grad_norm": 1.8772683143615723, + "learning_rate": 1.9999296743151823e-05, + "loss": 1.144, + "step": 4330 + }, + { + "epoch": 0.10101251577956134, + "grad_norm": 1.3331047296524048, + "learning_rate": 1.999929375378826e-05, + "loss": 1.2164, + "step": 4331 + }, + { + "epoch": 0.10103583891873927, + "grad_norm": 1.6070294380187988, + "learning_rate": 1.9999290758084885e-05, + "loss": 1.6565, + "step": 4332 + }, + { + "epoch": 0.10105916205791718, + "grad_norm": 1.5945744514465332, + "learning_rate": 1.9999287756041698e-05, + "loss": 1.298, + "step": 4333 + }, + { + "epoch": 0.1010824851970951, + "grad_norm": 1.7127183675765991, + "learning_rate": 1.9999284747658696e-05, + "loss": 1.6233, + "step": 4334 + }, + { + "epoch": 0.10110580833627301, + "grad_norm": 1.4577192068099976, + "learning_rate": 1.9999281732935886e-05, + "loss": 1.3159, + "step": 4335 + }, + { + "epoch": 0.10112913147545094, + "grad_norm": 2.0346920490264893, + "learning_rate": 1.9999278711873272e-05, + "loss": 1.3263, + "step": 4336 + }, + { + "epoch": 0.10115245461462885, + "grad_norm": 1.582476019859314, + "learning_rate": 1.999927568447085e-05, + "loss": 1.4117, + "step": 4337 + }, + { + "epoch": 0.10117577775380678, + "grad_norm": 1.9694240093231201, + "learning_rate": 1.999927265072863e-05, + "loss": 1.5795, + "step": 4338 + }, + { + "epoch": 0.10119910089298469, + "grad_norm": 1.84016752243042, + "learning_rate": 1.9999269610646603e-05, + "loss": 1.4963, + "step": 4339 + }, + { + "epoch": 0.10122242403216261, + "grad_norm": 1.886704444885254, + "learning_rate": 1.9999266564224777e-05, + "loss": 1.6491, + "step": 4340 + }, + { + "epoch": 0.10124574717134052, + "grad_norm": 1.9586236476898193, + "learning_rate": 1.9999263511463155e-05, + "loss": 1.5735, + "step": 4341 + }, + { + "epoch": 0.10126907031051845, + "grad_norm": 2.501908302307129, + "learning_rate": 1.9999260452361737e-05, + "loss": 0.9846, + "step": 4342 + }, + { + "epoch": 0.10129239344969636, + "grad_norm": 1.6577366590499878, + "learning_rate": 1.9999257386920526e-05, + "loss": 1.5666, + "step": 4343 + }, + { + "epoch": 0.10131571658887428, + "grad_norm": 1.3364449739456177, + "learning_rate": 1.9999254315139524e-05, + "loss": 1.1554, + "step": 4344 + }, + { + "epoch": 0.1013390397280522, + "grad_norm": 1.6358418464660645, + "learning_rate": 1.9999251237018727e-05, + "loss": 1.6063, + "step": 4345 + }, + { + "epoch": 0.10136236286723012, + "grad_norm": 1.7238401174545288, + "learning_rate": 1.999924815255815e-05, + "loss": 1.3965, + "step": 4346 + }, + { + "epoch": 0.10138568600640803, + "grad_norm": 1.815588116645813, + "learning_rate": 1.999924506175778e-05, + "loss": 1.1671, + "step": 4347 + }, + { + "epoch": 0.10140900914558595, + "grad_norm": 2.0672595500946045, + "learning_rate": 1.999924196461763e-05, + "loss": 1.5405, + "step": 4348 + }, + { + "epoch": 0.10143233228476387, + "grad_norm": 1.8933367729187012, + "learning_rate": 1.9999238861137696e-05, + "loss": 1.5728, + "step": 4349 + }, + { + "epoch": 0.10145565542394179, + "grad_norm": 1.6406468152999878, + "learning_rate": 1.9999235751317986e-05, + "loss": 1.2051, + "step": 4350 + }, + { + "epoch": 0.1014789785631197, + "grad_norm": 1.594834804534912, + "learning_rate": 1.9999232635158495e-05, + "loss": 1.6005, + "step": 4351 + }, + { + "epoch": 0.10150230170229763, + "grad_norm": 2.1673924922943115, + "learning_rate": 1.999922951265923e-05, + "loss": 1.1405, + "step": 4352 + }, + { + "epoch": 0.10152562484147554, + "grad_norm": 1.4452152252197266, + "learning_rate": 1.9999226383820187e-05, + "loss": 1.4138, + "step": 4353 + }, + { + "epoch": 0.10154894798065346, + "grad_norm": 1.6347564458847046, + "learning_rate": 1.9999223248641375e-05, + "loss": 1.3157, + "step": 4354 + }, + { + "epoch": 0.10157227111983137, + "grad_norm": 1.6961263418197632, + "learning_rate": 1.999922010712279e-05, + "loss": 1.5234, + "step": 4355 + }, + { + "epoch": 0.1015955942590093, + "grad_norm": 1.6713759899139404, + "learning_rate": 1.9999216959264444e-05, + "loss": 1.4066, + "step": 4356 + }, + { + "epoch": 0.10161891739818721, + "grad_norm": 1.6086574792861938, + "learning_rate": 1.9999213805066326e-05, + "loss": 1.5911, + "step": 4357 + }, + { + "epoch": 0.10164224053736513, + "grad_norm": 1.5126043558120728, + "learning_rate": 1.9999210644528444e-05, + "loss": 0.9794, + "step": 4358 + }, + { + "epoch": 0.10166556367654304, + "grad_norm": 1.6738828420639038, + "learning_rate": 1.9999207477650802e-05, + "loss": 1.3159, + "step": 4359 + }, + { + "epoch": 0.10168888681572096, + "grad_norm": 1.8790634870529175, + "learning_rate": 1.9999204304433397e-05, + "loss": 1.8513, + "step": 4360 + }, + { + "epoch": 0.10171220995489888, + "grad_norm": 1.7257745265960693, + "learning_rate": 1.9999201124876237e-05, + "loss": 1.4597, + "step": 4361 + }, + { + "epoch": 0.10173553309407679, + "grad_norm": 1.7159866094589233, + "learning_rate": 1.9999197938979317e-05, + "loss": 1.5366, + "step": 4362 + }, + { + "epoch": 0.10175885623325472, + "grad_norm": 1.5496243238449097, + "learning_rate": 1.9999194746742647e-05, + "loss": 1.5668, + "step": 4363 + }, + { + "epoch": 0.10178217937243263, + "grad_norm": 1.4768664836883545, + "learning_rate": 1.9999191548166227e-05, + "loss": 1.3203, + "step": 4364 + }, + { + "epoch": 0.10180550251161055, + "grad_norm": 1.7893187999725342, + "learning_rate": 1.9999188343250053e-05, + "loss": 1.527, + "step": 4365 + }, + { + "epoch": 0.10182882565078846, + "grad_norm": 1.770240306854248, + "learning_rate": 1.999918513199413e-05, + "loss": 1.5302, + "step": 4366 + }, + { + "epoch": 0.10185214878996639, + "grad_norm": 2.052283763885498, + "learning_rate": 1.9999181914398465e-05, + "loss": 1.6142, + "step": 4367 + }, + { + "epoch": 0.1018754719291443, + "grad_norm": 1.8870301246643066, + "learning_rate": 1.9999178690463054e-05, + "loss": 1.609, + "step": 4368 + }, + { + "epoch": 0.10189879506832222, + "grad_norm": 2.209730625152588, + "learning_rate": 1.99991754601879e-05, + "loss": 1.1869, + "step": 4369 + }, + { + "epoch": 0.10192211820750013, + "grad_norm": 1.906145453453064, + "learning_rate": 1.9999172223573008e-05, + "loss": 1.6161, + "step": 4370 + }, + { + "epoch": 0.10194544134667806, + "grad_norm": 1.7600947618484497, + "learning_rate": 1.9999168980618377e-05, + "loss": 1.7264, + "step": 4371 + }, + { + "epoch": 0.10196876448585597, + "grad_norm": 1.475500464439392, + "learning_rate": 1.9999165731324012e-05, + "loss": 1.1859, + "step": 4372 + }, + { + "epoch": 0.1019920876250339, + "grad_norm": 2.4348111152648926, + "learning_rate": 1.999916247568991e-05, + "loss": 1.4721, + "step": 4373 + }, + { + "epoch": 0.1020154107642118, + "grad_norm": 1.7595854997634888, + "learning_rate": 1.9999159213716083e-05, + "loss": 1.4112, + "step": 4374 + }, + { + "epoch": 0.10203873390338973, + "grad_norm": 2.1891770362854004, + "learning_rate": 1.9999155945402522e-05, + "loss": 1.4783, + "step": 4375 + }, + { + "epoch": 0.10206205704256764, + "grad_norm": 1.538309097290039, + "learning_rate": 1.9999152670749235e-05, + "loss": 1.5166, + "step": 4376 + }, + { + "epoch": 0.10208538018174557, + "grad_norm": 1.431139588356018, + "learning_rate": 1.9999149389756224e-05, + "loss": 1.1056, + "step": 4377 + }, + { + "epoch": 0.10210870332092348, + "grad_norm": 1.902206301689148, + "learning_rate": 1.9999146102423484e-05, + "loss": 1.3445, + "step": 4378 + }, + { + "epoch": 0.1021320264601014, + "grad_norm": 1.5516972541809082, + "learning_rate": 1.999914280875103e-05, + "loss": 1.3287, + "step": 4379 + }, + { + "epoch": 0.10215534959927931, + "grad_norm": 2.003575325012207, + "learning_rate": 1.999913950873885e-05, + "loss": 1.3581, + "step": 4380 + }, + { + "epoch": 0.10217867273845724, + "grad_norm": 2.0806925296783447, + "learning_rate": 1.9999136202386957e-05, + "loss": 1.7686, + "step": 4381 + }, + { + "epoch": 0.10220199587763515, + "grad_norm": 1.9133434295654297, + "learning_rate": 1.999913288969535e-05, + "loss": 1.5319, + "step": 4382 + }, + { + "epoch": 0.10222531901681307, + "grad_norm": 1.709938645362854, + "learning_rate": 1.999912957066403e-05, + "loss": 1.1938, + "step": 4383 + }, + { + "epoch": 0.10224864215599098, + "grad_norm": 2.0247511863708496, + "learning_rate": 1.9999126245292997e-05, + "loss": 1.5905, + "step": 4384 + }, + { + "epoch": 0.10227196529516891, + "grad_norm": 1.8318753242492676, + "learning_rate": 1.9999122913582256e-05, + "loss": 1.065, + "step": 4385 + }, + { + "epoch": 0.10229528843434682, + "grad_norm": 1.8385038375854492, + "learning_rate": 1.9999119575531808e-05, + "loss": 1.242, + "step": 4386 + }, + { + "epoch": 0.10231861157352475, + "grad_norm": 1.8718616962432861, + "learning_rate": 1.9999116231141658e-05, + "loss": 1.6333, + "step": 4387 + }, + { + "epoch": 0.10234193471270266, + "grad_norm": 1.9290051460266113, + "learning_rate": 1.9999112880411804e-05, + "loss": 1.7813, + "step": 4388 + }, + { + "epoch": 0.10236525785188057, + "grad_norm": 1.593847632408142, + "learning_rate": 1.9999109523342252e-05, + "loss": 1.5774, + "step": 4389 + }, + { + "epoch": 0.10238858099105849, + "grad_norm": 1.9645016193389893, + "learning_rate": 1.9999106159933e-05, + "loss": 1.3466, + "step": 4390 + }, + { + "epoch": 0.1024119041302364, + "grad_norm": 1.6337265968322754, + "learning_rate": 1.999910279018405e-05, + "loss": 0.938, + "step": 4391 + }, + { + "epoch": 0.10243522726941433, + "grad_norm": 1.5305380821228027, + "learning_rate": 1.9999099414095414e-05, + "loss": 1.4619, + "step": 4392 + }, + { + "epoch": 0.10245855040859224, + "grad_norm": 1.9405750036239624, + "learning_rate": 1.999909603166708e-05, + "loss": 1.7327, + "step": 4393 + }, + { + "epoch": 0.10248187354777016, + "grad_norm": 1.8218337297439575, + "learning_rate": 1.999909264289906e-05, + "loss": 1.4709, + "step": 4394 + }, + { + "epoch": 0.10250519668694807, + "grad_norm": 1.7523096799850464, + "learning_rate": 1.999908924779135e-05, + "loss": 1.4373, + "step": 4395 + }, + { + "epoch": 0.102528519826126, + "grad_norm": 1.9578635692596436, + "learning_rate": 1.999908584634396e-05, + "loss": 1.5482, + "step": 4396 + }, + { + "epoch": 0.10255184296530391, + "grad_norm": 2.027506113052368, + "learning_rate": 1.999908243855688e-05, + "loss": 1.7009, + "step": 4397 + }, + { + "epoch": 0.10257516610448184, + "grad_norm": 1.7978277206420898, + "learning_rate": 1.999907902443012e-05, + "loss": 1.4091, + "step": 4398 + }, + { + "epoch": 0.10259848924365975, + "grad_norm": 1.7331018447875977, + "learning_rate": 1.9999075603963685e-05, + "loss": 1.4605, + "step": 4399 + }, + { + "epoch": 0.10262181238283767, + "grad_norm": 1.545462727546692, + "learning_rate": 1.999907217715757e-05, + "loss": 1.2926, + "step": 4400 + }, + { + "epoch": 0.10264513552201558, + "grad_norm": 1.7517781257629395, + "learning_rate": 1.9999068744011785e-05, + "loss": 1.3769, + "step": 4401 + }, + { + "epoch": 0.1026684586611935, + "grad_norm": 1.7014862298965454, + "learning_rate": 1.9999065304526323e-05, + "loss": 1.5482, + "step": 4402 + }, + { + "epoch": 0.10269178180037142, + "grad_norm": 1.6097866296768188, + "learning_rate": 1.9999061858701195e-05, + "loss": 1.3104, + "step": 4403 + }, + { + "epoch": 0.10271510493954934, + "grad_norm": 1.8927146196365356, + "learning_rate": 1.99990584065364e-05, + "loss": 1.0695, + "step": 4404 + }, + { + "epoch": 0.10273842807872725, + "grad_norm": 1.688493251800537, + "learning_rate": 1.9999054948031937e-05, + "loss": 1.0971, + "step": 4405 + }, + { + "epoch": 0.10276175121790518, + "grad_norm": 1.8350069522857666, + "learning_rate": 1.999905148318781e-05, + "loss": 1.3597, + "step": 4406 + }, + { + "epoch": 0.10278507435708309, + "grad_norm": 1.635527491569519, + "learning_rate": 1.9999048012004026e-05, + "loss": 1.1458, + "step": 4407 + }, + { + "epoch": 0.10280839749626101, + "grad_norm": 1.957057237625122, + "learning_rate": 1.9999044534480578e-05, + "loss": 1.4101, + "step": 4408 + }, + { + "epoch": 0.10283172063543893, + "grad_norm": 1.4483978748321533, + "learning_rate": 1.9999041050617474e-05, + "loss": 1.2723, + "step": 4409 + }, + { + "epoch": 0.10285504377461685, + "grad_norm": 2.213428020477295, + "learning_rate": 1.999903756041472e-05, + "loss": 1.2912, + "step": 4410 + }, + { + "epoch": 0.10287836691379476, + "grad_norm": 1.8046005964279175, + "learning_rate": 1.9999034063872307e-05, + "loss": 1.5203, + "step": 4411 + }, + { + "epoch": 0.10290169005297269, + "grad_norm": 2.2218785285949707, + "learning_rate": 1.9999030560990248e-05, + "loss": 1.3761, + "step": 4412 + }, + { + "epoch": 0.1029250131921506, + "grad_norm": 2.170475482940674, + "learning_rate": 1.999902705176854e-05, + "loss": 1.3484, + "step": 4413 + }, + { + "epoch": 0.10294833633132852, + "grad_norm": 1.599300503730774, + "learning_rate": 1.9999023536207186e-05, + "loss": 1.3247, + "step": 4414 + }, + { + "epoch": 0.10297165947050643, + "grad_norm": 1.850488305091858, + "learning_rate": 1.999902001430619e-05, + "loss": 1.2363, + "step": 4415 + }, + { + "epoch": 0.10299498260968436, + "grad_norm": 1.4526803493499756, + "learning_rate": 1.9999016486065554e-05, + "loss": 1.4001, + "step": 4416 + }, + { + "epoch": 0.10301830574886227, + "grad_norm": 1.7001068592071533, + "learning_rate": 1.9999012951485278e-05, + "loss": 1.5615, + "step": 4417 + }, + { + "epoch": 0.10304162888804018, + "grad_norm": 1.7435797452926636, + "learning_rate": 1.999900941056536e-05, + "loss": 1.5585, + "step": 4418 + }, + { + "epoch": 0.1030649520272181, + "grad_norm": 1.8533247709274292, + "learning_rate": 1.9999005863305815e-05, + "loss": 1.6771, + "step": 4419 + }, + { + "epoch": 0.10308827516639602, + "grad_norm": 1.5894358158111572, + "learning_rate": 1.9999002309706635e-05, + "loss": 1.5284, + "step": 4420 + }, + { + "epoch": 0.10311159830557394, + "grad_norm": 2.0098137855529785, + "learning_rate": 1.9998998749767823e-05, + "loss": 1.3599, + "step": 4421 + }, + { + "epoch": 0.10313492144475185, + "grad_norm": 1.9894591569900513, + "learning_rate": 1.9998995183489385e-05, + "loss": 1.7052, + "step": 4422 + }, + { + "epoch": 0.10315824458392978, + "grad_norm": 1.7116707563400269, + "learning_rate": 1.9998991610871323e-05, + "loss": 1.3363, + "step": 4423 + }, + { + "epoch": 0.10318156772310769, + "grad_norm": 1.7851814031600952, + "learning_rate": 1.9998988031913637e-05, + "loss": 1.4181, + "step": 4424 + }, + { + "epoch": 0.10320489086228561, + "grad_norm": 1.8559414148330688, + "learning_rate": 1.9998984446616328e-05, + "loss": 1.4141, + "step": 4425 + }, + { + "epoch": 0.10322821400146352, + "grad_norm": 1.6662341356277466, + "learning_rate": 1.9998980854979405e-05, + "loss": 1.6026, + "step": 4426 + }, + { + "epoch": 0.10325153714064145, + "grad_norm": 2.0204248428344727, + "learning_rate": 1.999897725700286e-05, + "loss": 1.3882, + "step": 4427 + }, + { + "epoch": 0.10327486027981936, + "grad_norm": 1.7466933727264404, + "learning_rate": 1.9998973652686705e-05, + "loss": 1.4131, + "step": 4428 + }, + { + "epoch": 0.10329818341899728, + "grad_norm": 1.7322821617126465, + "learning_rate": 1.9998970042030938e-05, + "loss": 1.1848, + "step": 4429 + }, + { + "epoch": 0.1033215065581752, + "grad_norm": 1.723657250404358, + "learning_rate": 1.9998966425035556e-05, + "loss": 1.4047, + "step": 4430 + }, + { + "epoch": 0.10334482969735312, + "grad_norm": 2.131956100463867, + "learning_rate": 1.9998962801700573e-05, + "loss": 1.813, + "step": 4431 + }, + { + "epoch": 0.10336815283653103, + "grad_norm": 1.6857149600982666, + "learning_rate": 1.9998959172025982e-05, + "loss": 1.6333, + "step": 4432 + }, + { + "epoch": 0.10339147597570895, + "grad_norm": 2.233597755432129, + "learning_rate": 1.999895553601179e-05, + "loss": 1.4087, + "step": 4433 + }, + { + "epoch": 0.10341479911488687, + "grad_norm": 1.7040321826934814, + "learning_rate": 1.9998951893657997e-05, + "loss": 1.7103, + "step": 4434 + }, + { + "epoch": 0.10343812225406479, + "grad_norm": 1.805225133895874, + "learning_rate": 1.9998948244964606e-05, + "loss": 1.5802, + "step": 4435 + }, + { + "epoch": 0.1034614453932427, + "grad_norm": 1.5285016298294067, + "learning_rate": 1.999894458993162e-05, + "loss": 1.2591, + "step": 4436 + }, + { + "epoch": 0.10348476853242063, + "grad_norm": 1.8958289623260498, + "learning_rate": 1.9998940928559037e-05, + "loss": 1.3856, + "step": 4437 + }, + { + "epoch": 0.10350809167159854, + "grad_norm": 1.5795350074768066, + "learning_rate": 1.9998937260846866e-05, + "loss": 1.5508, + "step": 4438 + }, + { + "epoch": 0.10353141481077646, + "grad_norm": 1.894783854484558, + "learning_rate": 1.9998933586795108e-05, + "loss": 1.2985, + "step": 4439 + }, + { + "epoch": 0.10355473794995437, + "grad_norm": 1.511522650718689, + "learning_rate": 1.9998929906403762e-05, + "loss": 1.5026, + "step": 4440 + }, + { + "epoch": 0.1035780610891323, + "grad_norm": 1.7025114297866821, + "learning_rate": 1.9998926219672832e-05, + "loss": 1.5147, + "step": 4441 + }, + { + "epoch": 0.10360138422831021, + "grad_norm": 3.8708784580230713, + "learning_rate": 1.9998922526602317e-05, + "loss": 1.8936, + "step": 4442 + }, + { + "epoch": 0.10362470736748813, + "grad_norm": 1.8823548555374146, + "learning_rate": 1.9998918827192226e-05, + "loss": 1.4903, + "step": 4443 + }, + { + "epoch": 0.10364803050666604, + "grad_norm": 1.815712571144104, + "learning_rate": 1.999891512144256e-05, + "loss": 1.153, + "step": 4444 + }, + { + "epoch": 0.10367135364584396, + "grad_norm": 1.3985499143600464, + "learning_rate": 1.9998911409353313e-05, + "loss": 1.3144, + "step": 4445 + }, + { + "epoch": 0.10369467678502188, + "grad_norm": 1.7983309030532837, + "learning_rate": 1.99989076909245e-05, + "loss": 1.4602, + "step": 4446 + }, + { + "epoch": 0.10371799992419979, + "grad_norm": 1.762711763381958, + "learning_rate": 1.9998903966156114e-05, + "loss": 1.5969, + "step": 4447 + }, + { + "epoch": 0.10374132306337772, + "grad_norm": 1.8125685453414917, + "learning_rate": 1.999890023504816e-05, + "loss": 1.4237, + "step": 4448 + }, + { + "epoch": 0.10376464620255563, + "grad_norm": 1.8932796716690063, + "learning_rate": 1.9998896497600643e-05, + "loss": 1.6998, + "step": 4449 + }, + { + "epoch": 0.10378796934173355, + "grad_norm": 1.6668219566345215, + "learning_rate": 1.999889275381356e-05, + "loss": 1.4106, + "step": 4450 + }, + { + "epoch": 0.10381129248091146, + "grad_norm": 1.6609306335449219, + "learning_rate": 1.9998889003686917e-05, + "loss": 1.5416, + "step": 4451 + }, + { + "epoch": 0.10383461562008939, + "grad_norm": 2.811746597290039, + "learning_rate": 1.9998885247220716e-05, + "loss": 1.6334, + "step": 4452 + }, + { + "epoch": 0.1038579387592673, + "grad_norm": 1.5314466953277588, + "learning_rate": 1.999888148441496e-05, + "loss": 1.2161, + "step": 4453 + }, + { + "epoch": 0.10388126189844522, + "grad_norm": 2.1141018867492676, + "learning_rate": 1.9998877715269654e-05, + "loss": 1.2569, + "step": 4454 + }, + { + "epoch": 0.10390458503762313, + "grad_norm": 1.5534998178482056, + "learning_rate": 1.9998873939784792e-05, + "loss": 1.3211, + "step": 4455 + }, + { + "epoch": 0.10392790817680106, + "grad_norm": 1.6235876083374023, + "learning_rate": 1.9998870157960383e-05, + "loss": 1.2447, + "step": 4456 + }, + { + "epoch": 0.10395123131597897, + "grad_norm": 1.601912498474121, + "learning_rate": 1.999886636979643e-05, + "loss": 1.3548, + "step": 4457 + }, + { + "epoch": 0.1039745544551569, + "grad_norm": 1.9768726825714111, + "learning_rate": 1.999886257529293e-05, + "loss": 1.5343, + "step": 4458 + }, + { + "epoch": 0.1039978775943348, + "grad_norm": 1.945237159729004, + "learning_rate": 1.999885877444989e-05, + "loss": 1.3542, + "step": 4459 + }, + { + "epoch": 0.10402120073351273, + "grad_norm": 1.7455191612243652, + "learning_rate": 1.999885496726731e-05, + "loss": 1.4138, + "step": 4460 + }, + { + "epoch": 0.10404452387269064, + "grad_norm": 2.1632285118103027, + "learning_rate": 1.9998851153745193e-05, + "loss": 1.5867, + "step": 4461 + }, + { + "epoch": 0.10406784701186857, + "grad_norm": 1.6293684244155884, + "learning_rate": 1.9998847333883543e-05, + "loss": 1.2664, + "step": 4462 + }, + { + "epoch": 0.10409117015104648, + "grad_norm": 1.6983884572982788, + "learning_rate": 1.9998843507682363e-05, + "loss": 1.049, + "step": 4463 + }, + { + "epoch": 0.1041144932902244, + "grad_norm": 1.7191314697265625, + "learning_rate": 1.9998839675141653e-05, + "loss": 1.5225, + "step": 4464 + }, + { + "epoch": 0.10413781642940231, + "grad_norm": 1.8048819303512573, + "learning_rate": 1.9998835836261414e-05, + "loss": 1.4494, + "step": 4465 + }, + { + "epoch": 0.10416113956858024, + "grad_norm": 1.4836822748184204, + "learning_rate": 1.9998831991041654e-05, + "loss": 1.4242, + "step": 4466 + }, + { + "epoch": 0.10418446270775815, + "grad_norm": 1.657299518585205, + "learning_rate": 1.9998828139482368e-05, + "loss": 1.5711, + "step": 4467 + }, + { + "epoch": 0.10420778584693607, + "grad_norm": 1.707079291343689, + "learning_rate": 1.9998824281583565e-05, + "loss": 1.4206, + "step": 4468 + }, + { + "epoch": 0.10423110898611399, + "grad_norm": 1.547277569770813, + "learning_rate": 1.9998820417345243e-05, + "loss": 1.295, + "step": 4469 + }, + { + "epoch": 0.10425443212529191, + "grad_norm": 1.6435904502868652, + "learning_rate": 1.9998816546767407e-05, + "loss": 1.7703, + "step": 4470 + }, + { + "epoch": 0.10427775526446982, + "grad_norm": 2.265117645263672, + "learning_rate": 1.9998812669850062e-05, + "loss": 1.6327, + "step": 4471 + }, + { + "epoch": 0.10430107840364775, + "grad_norm": 1.6759183406829834, + "learning_rate": 1.9998808786593204e-05, + "loss": 1.3584, + "step": 4472 + }, + { + "epoch": 0.10432440154282566, + "grad_norm": 1.700777292251587, + "learning_rate": 1.999880489699684e-05, + "loss": 1.5565, + "step": 4473 + }, + { + "epoch": 0.10434772468200357, + "grad_norm": 1.345456838607788, + "learning_rate": 1.999880100106097e-05, + "loss": 1.2595, + "step": 4474 + }, + { + "epoch": 0.10437104782118149, + "grad_norm": 1.9874299764633179, + "learning_rate": 1.9998797098785598e-05, + "loss": 1.5397, + "step": 4475 + }, + { + "epoch": 0.1043943709603594, + "grad_norm": 1.6121760606765747, + "learning_rate": 1.9998793190170724e-05, + "loss": 1.2694, + "step": 4476 + }, + { + "epoch": 0.10441769409953733, + "grad_norm": 1.7067564725875854, + "learning_rate": 1.999878927521636e-05, + "loss": 1.6556, + "step": 4477 + }, + { + "epoch": 0.10444101723871524, + "grad_norm": 1.3974144458770752, + "learning_rate": 1.9998785353922493e-05, + "loss": 1.3259, + "step": 4478 + }, + { + "epoch": 0.10446434037789316, + "grad_norm": 1.415135383605957, + "learning_rate": 1.9998781426289135e-05, + "loss": 1.1619, + "step": 4479 + }, + { + "epoch": 0.10448766351707107, + "grad_norm": 1.335329294204712, + "learning_rate": 1.999877749231629e-05, + "loss": 1.3266, + "step": 4480 + }, + { + "epoch": 0.104510986656249, + "grad_norm": 1.7226165533065796, + "learning_rate": 1.9998773552003957e-05, + "loss": 1.1049, + "step": 4481 + }, + { + "epoch": 0.10453430979542691, + "grad_norm": 1.3039993047714233, + "learning_rate": 1.9998769605352137e-05, + "loss": 1.1237, + "step": 4482 + }, + { + "epoch": 0.10455763293460484, + "grad_norm": 2.0855329036712646, + "learning_rate": 1.9998765652360835e-05, + "loss": 1.4075, + "step": 4483 + }, + { + "epoch": 0.10458095607378275, + "grad_norm": 1.926064372062683, + "learning_rate": 1.9998761693030054e-05, + "loss": 1.8209, + "step": 4484 + }, + { + "epoch": 0.10460427921296067, + "grad_norm": 1.8120089769363403, + "learning_rate": 1.9998757727359797e-05, + "loss": 1.0948, + "step": 4485 + }, + { + "epoch": 0.10462760235213858, + "grad_norm": 2.193441390991211, + "learning_rate": 1.999875375535006e-05, + "loss": 1.7267, + "step": 4486 + }, + { + "epoch": 0.10465092549131651, + "grad_norm": 1.5877799987792969, + "learning_rate": 1.9998749777000856e-05, + "loss": 1.559, + "step": 4487 + }, + { + "epoch": 0.10467424863049442, + "grad_norm": 1.4523189067840576, + "learning_rate": 1.999874579231218e-05, + "loss": 1.2716, + "step": 4488 + }, + { + "epoch": 0.10469757176967234, + "grad_norm": 1.5455758571624756, + "learning_rate": 1.9998741801284038e-05, + "loss": 1.2111, + "step": 4489 + }, + { + "epoch": 0.10472089490885025, + "grad_norm": 1.694250464439392, + "learning_rate": 1.999873780391643e-05, + "loss": 1.1948, + "step": 4490 + }, + { + "epoch": 0.10474421804802818, + "grad_norm": 1.8756452798843384, + "learning_rate": 1.999873380020936e-05, + "loss": 1.1417, + "step": 4491 + }, + { + "epoch": 0.10476754118720609, + "grad_norm": 2.003406047821045, + "learning_rate": 1.999872979016283e-05, + "loss": 1.3391, + "step": 4492 + }, + { + "epoch": 0.10479086432638401, + "grad_norm": 1.6404839754104614, + "learning_rate": 1.9998725773776843e-05, + "loss": 1.5261, + "step": 4493 + }, + { + "epoch": 0.10481418746556193, + "grad_norm": 1.4287680387496948, + "learning_rate": 1.99987217510514e-05, + "loss": 1.2396, + "step": 4494 + }, + { + "epoch": 0.10483751060473985, + "grad_norm": 2.005293369293213, + "learning_rate": 1.999871772198651e-05, + "loss": 1.3859, + "step": 4495 + }, + { + "epoch": 0.10486083374391776, + "grad_norm": 1.467164397239685, + "learning_rate": 1.9998713686582164e-05, + "loss": 1.3692, + "step": 4496 + }, + { + "epoch": 0.10488415688309569, + "grad_norm": 1.782903790473938, + "learning_rate": 1.9998709644838373e-05, + "loss": 1.4374, + "step": 4497 + }, + { + "epoch": 0.1049074800222736, + "grad_norm": 1.6113072633743286, + "learning_rate": 1.9998705596755138e-05, + "loss": 1.2854, + "step": 4498 + }, + { + "epoch": 0.10493080316145152, + "grad_norm": 2.15000319480896, + "learning_rate": 1.9998701542332463e-05, + "loss": 1.2573, + "step": 4499 + }, + { + "epoch": 0.10495412630062943, + "grad_norm": 1.4802179336547852, + "learning_rate": 1.9998697481570347e-05, + "loss": 1.2542, + "step": 4500 + }, + { + "epoch": 0.10497744943980736, + "grad_norm": 1.824934482574463, + "learning_rate": 1.9998693414468793e-05, + "loss": 1.5855, + "step": 4501 + }, + { + "epoch": 0.10500077257898527, + "grad_norm": 1.978626012802124, + "learning_rate": 1.999868934102781e-05, + "loss": 1.5597, + "step": 4502 + }, + { + "epoch": 0.10502409571816318, + "grad_norm": 1.751846432685852, + "learning_rate": 1.9998685261247388e-05, + "loss": 1.5744, + "step": 4503 + }, + { + "epoch": 0.1050474188573411, + "grad_norm": 1.903283953666687, + "learning_rate": 1.999868117512754e-05, + "loss": 1.6709, + "step": 4504 + }, + { + "epoch": 0.10507074199651902, + "grad_norm": 1.7782729864120483, + "learning_rate": 1.9998677082668267e-05, + "loss": 1.1523, + "step": 4505 + }, + { + "epoch": 0.10509406513569694, + "grad_norm": 1.7197009325027466, + "learning_rate": 1.999867298386957e-05, + "loss": 0.9572, + "step": 4506 + }, + { + "epoch": 0.10511738827487485, + "grad_norm": 2.1358916759490967, + "learning_rate": 1.9998668878731452e-05, + "loss": 1.5962, + "step": 4507 + }, + { + "epoch": 0.10514071141405278, + "grad_norm": 1.4862276315689087, + "learning_rate": 1.9998664767253916e-05, + "loss": 1.3908, + "step": 4508 + }, + { + "epoch": 0.10516403455323069, + "grad_norm": 1.6323543787002563, + "learning_rate": 1.9998660649436962e-05, + "loss": 1.432, + "step": 4509 + }, + { + "epoch": 0.10518735769240861, + "grad_norm": 2.2728726863861084, + "learning_rate": 1.9998656525280596e-05, + "loss": 1.7178, + "step": 4510 + }, + { + "epoch": 0.10521068083158652, + "grad_norm": 1.7053133249282837, + "learning_rate": 1.999865239478482e-05, + "loss": 1.365, + "step": 4511 + }, + { + "epoch": 0.10523400397076445, + "grad_norm": 1.8892529010772705, + "learning_rate": 1.9998648257949633e-05, + "loss": 1.6853, + "step": 4512 + }, + { + "epoch": 0.10525732710994236, + "grad_norm": 1.7584593296051025, + "learning_rate": 1.9998644114775044e-05, + "loss": 1.1567, + "step": 4513 + }, + { + "epoch": 0.10528065024912028, + "grad_norm": 1.8040283918380737, + "learning_rate": 1.999863996526105e-05, + "loss": 1.8875, + "step": 4514 + }, + { + "epoch": 0.1053039733882982, + "grad_norm": 1.7637766599655151, + "learning_rate": 1.9998635809407653e-05, + "loss": 1.1819, + "step": 4515 + }, + { + "epoch": 0.10532729652747612, + "grad_norm": 1.8415699005126953, + "learning_rate": 1.9998631647214863e-05, + "loss": 1.2454, + "step": 4516 + }, + { + "epoch": 0.10535061966665403, + "grad_norm": 1.7374746799468994, + "learning_rate": 1.9998627478682678e-05, + "loss": 1.5977, + "step": 4517 + }, + { + "epoch": 0.10537394280583195, + "grad_norm": 1.87213134765625, + "learning_rate": 1.99986233038111e-05, + "loss": 1.4459, + "step": 4518 + }, + { + "epoch": 0.10539726594500987, + "grad_norm": 1.5321569442749023, + "learning_rate": 1.999861912260013e-05, + "loss": 1.6018, + "step": 4519 + }, + { + "epoch": 0.10542058908418779, + "grad_norm": 2.3718676567077637, + "learning_rate": 1.9998614935049774e-05, + "loss": 1.472, + "step": 4520 + }, + { + "epoch": 0.1054439122233657, + "grad_norm": 1.7300660610198975, + "learning_rate": 1.9998610741160035e-05, + "loss": 1.5318, + "step": 4521 + }, + { + "epoch": 0.10546723536254363, + "grad_norm": 1.7774510383605957, + "learning_rate": 1.9998606540930915e-05, + "loss": 1.1674, + "step": 4522 + }, + { + "epoch": 0.10549055850172154, + "grad_norm": 1.4192143678665161, + "learning_rate": 1.999860233436241e-05, + "loss": 1.2442, + "step": 4523 + }, + { + "epoch": 0.10551388164089946, + "grad_norm": 1.8033263683319092, + "learning_rate": 1.9998598121454536e-05, + "loss": 1.3566, + "step": 4524 + }, + { + "epoch": 0.10553720478007737, + "grad_norm": 1.5711365938186646, + "learning_rate": 1.9998593902207284e-05, + "loss": 1.5206, + "step": 4525 + }, + { + "epoch": 0.1055605279192553, + "grad_norm": 1.9537410736083984, + "learning_rate": 1.9998589676620664e-05, + "loss": 1.7395, + "step": 4526 + }, + { + "epoch": 0.10558385105843321, + "grad_norm": 1.5812814235687256, + "learning_rate": 1.999858544469467e-05, + "loss": 1.5238, + "step": 4527 + }, + { + "epoch": 0.10560717419761113, + "grad_norm": 1.6676138639450073, + "learning_rate": 1.9998581206429315e-05, + "loss": 1.4023, + "step": 4528 + }, + { + "epoch": 0.10563049733678904, + "grad_norm": 1.7668168544769287, + "learning_rate": 1.9998576961824597e-05, + "loss": 1.4974, + "step": 4529 + }, + { + "epoch": 0.10565382047596697, + "grad_norm": 2.2290921211242676, + "learning_rate": 1.9998572710880518e-05, + "loss": 1.3454, + "step": 4530 + }, + { + "epoch": 0.10567714361514488, + "grad_norm": 1.8207625150680542, + "learning_rate": 1.999856845359708e-05, + "loss": 1.253, + "step": 4531 + }, + { + "epoch": 0.10570046675432279, + "grad_norm": 2.1381027698516846, + "learning_rate": 1.9998564189974288e-05, + "loss": 1.3995, + "step": 4532 + }, + { + "epoch": 0.10572378989350072, + "grad_norm": 1.5824651718139648, + "learning_rate": 1.9998559920012144e-05, + "loss": 1.327, + "step": 4533 + }, + { + "epoch": 0.10574711303267863, + "grad_norm": 1.8291656970977783, + "learning_rate": 1.9998555643710653e-05, + "loss": 1.4791, + "step": 4534 + }, + { + "epoch": 0.10577043617185655, + "grad_norm": 1.7711025476455688, + "learning_rate": 1.999855136106981e-05, + "loss": 1.2384, + "step": 4535 + }, + { + "epoch": 0.10579375931103446, + "grad_norm": 1.7322211265563965, + "learning_rate": 1.9998547072089626e-05, + "loss": 1.5979, + "step": 4536 + }, + { + "epoch": 0.10581708245021239, + "grad_norm": 1.429113507270813, + "learning_rate": 1.9998542776770104e-05, + "loss": 1.1038, + "step": 4537 + }, + { + "epoch": 0.1058404055893903, + "grad_norm": 1.4199211597442627, + "learning_rate": 1.9998538475111238e-05, + "loss": 0.9931, + "step": 4538 + }, + { + "epoch": 0.10586372872856822, + "grad_norm": 1.6906683444976807, + "learning_rate": 1.9998534167113036e-05, + "loss": 1.2704, + "step": 4539 + }, + { + "epoch": 0.10588705186774613, + "grad_norm": 1.8531103134155273, + "learning_rate": 1.9998529852775506e-05, + "loss": 1.4959, + "step": 4540 + }, + { + "epoch": 0.10591037500692406, + "grad_norm": 1.4918322563171387, + "learning_rate": 1.999852553209864e-05, + "loss": 1.031, + "step": 4541 + }, + { + "epoch": 0.10593369814610197, + "grad_norm": 1.7900128364562988, + "learning_rate": 1.999852120508245e-05, + "loss": 1.3916, + "step": 4542 + }, + { + "epoch": 0.1059570212852799, + "grad_norm": 1.757381558418274, + "learning_rate": 1.9998516871726935e-05, + "loss": 1.5504, + "step": 4543 + }, + { + "epoch": 0.1059803444244578, + "grad_norm": 2.007852077484131, + "learning_rate": 1.9998512532032095e-05, + "loss": 1.2342, + "step": 4544 + }, + { + "epoch": 0.10600366756363573, + "grad_norm": 1.7031002044677734, + "learning_rate": 1.999850818599794e-05, + "loss": 1.4977, + "step": 4545 + }, + { + "epoch": 0.10602699070281364, + "grad_norm": 1.6903876066207886, + "learning_rate": 1.9998503833624463e-05, + "loss": 1.199, + "step": 4546 + }, + { + "epoch": 0.10605031384199157, + "grad_norm": 1.4657163619995117, + "learning_rate": 1.9998499474911674e-05, + "loss": 1.6284, + "step": 4547 + }, + { + "epoch": 0.10607363698116948, + "grad_norm": 1.5041929483413696, + "learning_rate": 1.9998495109859575e-05, + "loss": 1.4298, + "step": 4548 + }, + { + "epoch": 0.1060969601203474, + "grad_norm": 1.4863712787628174, + "learning_rate": 1.9998490738468166e-05, + "loss": 1.1771, + "step": 4549 + }, + { + "epoch": 0.10612028325952531, + "grad_norm": 1.8348568677902222, + "learning_rate": 1.999848636073745e-05, + "loss": 1.5684, + "step": 4550 + }, + { + "epoch": 0.10614360639870324, + "grad_norm": 1.5103323459625244, + "learning_rate": 1.9998481976667436e-05, + "loss": 1.282, + "step": 4551 + }, + { + "epoch": 0.10616692953788115, + "grad_norm": 1.5343869924545288, + "learning_rate": 1.999847758625812e-05, + "loss": 1.2171, + "step": 4552 + }, + { + "epoch": 0.10619025267705907, + "grad_norm": 1.3986709117889404, + "learning_rate": 1.9998473189509505e-05, + "loss": 1.3708, + "step": 4553 + }, + { + "epoch": 0.10621357581623699, + "grad_norm": 1.6608822345733643, + "learning_rate": 1.9998468786421598e-05, + "loss": 1.2693, + "step": 4554 + }, + { + "epoch": 0.10623689895541491, + "grad_norm": 1.467126488685608, + "learning_rate": 1.99984643769944e-05, + "loss": 1.4189, + "step": 4555 + }, + { + "epoch": 0.10626022209459282, + "grad_norm": 1.820080280303955, + "learning_rate": 1.9998459961227906e-05, + "loss": 1.5622, + "step": 4556 + }, + { + "epoch": 0.10628354523377075, + "grad_norm": 1.7327991724014282, + "learning_rate": 1.999845553912213e-05, + "loss": 1.4281, + "step": 4557 + }, + { + "epoch": 0.10630686837294866, + "grad_norm": 1.5667237043380737, + "learning_rate": 1.9998451110677073e-05, + "loss": 1.349, + "step": 4558 + }, + { + "epoch": 0.10633019151212657, + "grad_norm": 2.0479605197906494, + "learning_rate": 1.9998446675892733e-05, + "loss": 1.2973, + "step": 4559 + }, + { + "epoch": 0.10635351465130449, + "grad_norm": 1.624783992767334, + "learning_rate": 1.9998442234769116e-05, + "loss": 1.4658, + "step": 4560 + }, + { + "epoch": 0.1063768377904824, + "grad_norm": 1.7705590724945068, + "learning_rate": 1.9998437787306228e-05, + "loss": 0.8277, + "step": 4561 + }, + { + "epoch": 0.10640016092966033, + "grad_norm": 1.941250205039978, + "learning_rate": 1.9998433333504063e-05, + "loss": 1.4665, + "step": 4562 + }, + { + "epoch": 0.10642348406883824, + "grad_norm": 1.5868898630142212, + "learning_rate": 1.999842887336263e-05, + "loss": 1.6009, + "step": 4563 + }, + { + "epoch": 0.10644680720801616, + "grad_norm": 1.5634092092514038, + "learning_rate": 1.999842440688193e-05, + "loss": 1.4364, + "step": 4564 + }, + { + "epoch": 0.10647013034719408, + "grad_norm": 1.8790887594223022, + "learning_rate": 1.9998419934061968e-05, + "loss": 1.6344, + "step": 4565 + }, + { + "epoch": 0.106493453486372, + "grad_norm": 1.4845713376998901, + "learning_rate": 1.9998415454902743e-05, + "loss": 1.2816, + "step": 4566 + }, + { + "epoch": 0.10651677662554991, + "grad_norm": 1.503846287727356, + "learning_rate": 1.9998410969404264e-05, + "loss": 1.4821, + "step": 4567 + }, + { + "epoch": 0.10654009976472784, + "grad_norm": 1.9791507720947266, + "learning_rate": 1.9998406477566525e-05, + "loss": 1.3576, + "step": 4568 + }, + { + "epoch": 0.10656342290390575, + "grad_norm": 1.6551647186279297, + "learning_rate": 1.9998401979389537e-05, + "loss": 1.6536, + "step": 4569 + }, + { + "epoch": 0.10658674604308367, + "grad_norm": 1.733027458190918, + "learning_rate": 1.9998397474873298e-05, + "loss": 1.7508, + "step": 4570 + }, + { + "epoch": 0.10661006918226158, + "grad_norm": 1.4881467819213867, + "learning_rate": 1.9998392964017816e-05, + "loss": 1.2461, + "step": 4571 + }, + { + "epoch": 0.10663339232143951, + "grad_norm": 1.5950926542282104, + "learning_rate": 1.999838844682309e-05, + "loss": 1.5417, + "step": 4572 + }, + { + "epoch": 0.10665671546061742, + "grad_norm": 2.1507344245910645, + "learning_rate": 1.9998383923289116e-05, + "loss": 1.2756, + "step": 4573 + }, + { + "epoch": 0.10668003859979534, + "grad_norm": 1.5847529172897339, + "learning_rate": 1.999837939341591e-05, + "loss": 1.3398, + "step": 4574 + }, + { + "epoch": 0.10670336173897325, + "grad_norm": 1.9728766679763794, + "learning_rate": 1.999837485720347e-05, + "loss": 1.8232, + "step": 4575 + }, + { + "epoch": 0.10672668487815118, + "grad_norm": 1.8677773475646973, + "learning_rate": 1.9998370314651794e-05, + "loss": 1.4771, + "step": 4576 + }, + { + "epoch": 0.10675000801732909, + "grad_norm": 1.7102746963500977, + "learning_rate": 1.999836576576089e-05, + "loss": 1.4293, + "step": 4577 + }, + { + "epoch": 0.10677333115650701, + "grad_norm": 1.8963501453399658, + "learning_rate": 1.9998361210530762e-05, + "loss": 1.6308, + "step": 4578 + }, + { + "epoch": 0.10679665429568493, + "grad_norm": 2.2562477588653564, + "learning_rate": 1.999835664896141e-05, + "loss": 1.2972, + "step": 4579 + }, + { + "epoch": 0.10681997743486285, + "grad_norm": 1.480392575263977, + "learning_rate": 1.9998352081052834e-05, + "loss": 1.2876, + "step": 4580 + }, + { + "epoch": 0.10684330057404076, + "grad_norm": 2.29121470451355, + "learning_rate": 1.9998347506805044e-05, + "loss": 1.6237, + "step": 4581 + }, + { + "epoch": 0.10686662371321869, + "grad_norm": 1.8855741024017334, + "learning_rate": 1.9998342926218035e-05, + "loss": 1.7616, + "step": 4582 + }, + { + "epoch": 0.1068899468523966, + "grad_norm": 1.8817429542541504, + "learning_rate": 1.9998338339291815e-05, + "loss": 1.4687, + "step": 4583 + }, + { + "epoch": 0.10691326999157452, + "grad_norm": 1.6737823486328125, + "learning_rate": 1.999833374602639e-05, + "loss": 1.1106, + "step": 4584 + }, + { + "epoch": 0.10693659313075243, + "grad_norm": 1.6832067966461182, + "learning_rate": 1.9998329146421755e-05, + "loss": 1.2487, + "step": 4585 + }, + { + "epoch": 0.10695991626993036, + "grad_norm": 2.4442577362060547, + "learning_rate": 1.999832454047792e-05, + "loss": 1.6538, + "step": 4586 + }, + { + "epoch": 0.10698323940910827, + "grad_norm": 1.69062077999115, + "learning_rate": 1.9998319928194882e-05, + "loss": 1.2236, + "step": 4587 + }, + { + "epoch": 0.10700656254828618, + "grad_norm": 1.6283913850784302, + "learning_rate": 1.999831530957265e-05, + "loss": 1.2752, + "step": 4588 + }, + { + "epoch": 0.1070298856874641, + "grad_norm": 1.7547777891159058, + "learning_rate": 1.999831068461122e-05, + "loss": 1.4031, + "step": 4589 + }, + { + "epoch": 0.10705320882664202, + "grad_norm": 2.5570805072784424, + "learning_rate": 1.9998306053310598e-05, + "loss": 1.4149, + "step": 4590 + }, + { + "epoch": 0.10707653196581994, + "grad_norm": 1.416925072669983, + "learning_rate": 1.999830141567079e-05, + "loss": 1.3053, + "step": 4591 + }, + { + "epoch": 0.10709985510499785, + "grad_norm": 2.1444454193115234, + "learning_rate": 1.9998296771691796e-05, + "loss": 1.7971, + "step": 4592 + }, + { + "epoch": 0.10712317824417578, + "grad_norm": 1.529247522354126, + "learning_rate": 1.999829212137362e-05, + "loss": 1.4804, + "step": 4593 + }, + { + "epoch": 0.10714650138335369, + "grad_norm": 1.6415820121765137, + "learning_rate": 1.9998287464716264e-05, + "loss": 1.3182, + "step": 4594 + }, + { + "epoch": 0.10716982452253161, + "grad_norm": 1.848414421081543, + "learning_rate": 1.999828280171973e-05, + "loss": 1.5494, + "step": 4595 + }, + { + "epoch": 0.10719314766170952, + "grad_norm": 1.7559365034103394, + "learning_rate": 1.9998278132384024e-05, + "loss": 1.4093, + "step": 4596 + }, + { + "epoch": 0.10721647080088745, + "grad_norm": 1.9661378860473633, + "learning_rate": 1.9998273456709144e-05, + "loss": 1.4959, + "step": 4597 + }, + { + "epoch": 0.10723979394006536, + "grad_norm": 2.336165428161621, + "learning_rate": 1.99982687746951e-05, + "loss": 1.4747, + "step": 4598 + }, + { + "epoch": 0.10726311707924328, + "grad_norm": 1.5826678276062012, + "learning_rate": 1.999826408634189e-05, + "loss": 1.5984, + "step": 4599 + }, + { + "epoch": 0.1072864402184212, + "grad_norm": 1.8213845491409302, + "learning_rate": 1.9998259391649518e-05, + "loss": 1.5824, + "step": 4600 + }, + { + "epoch": 0.10730976335759912, + "grad_norm": 1.9903720617294312, + "learning_rate": 1.9998254690617985e-05, + "loss": 1.7531, + "step": 4601 + }, + { + "epoch": 0.10733308649677703, + "grad_norm": 1.8120417594909668, + "learning_rate": 1.99982499832473e-05, + "loss": 1.6241, + "step": 4602 + }, + { + "epoch": 0.10735640963595496, + "grad_norm": 1.43329656124115, + "learning_rate": 1.9998245269537455e-05, + "loss": 1.3178, + "step": 4603 + }, + { + "epoch": 0.10737973277513287, + "grad_norm": 1.7703838348388672, + "learning_rate": 1.9998240549488464e-05, + "loss": 1.5845, + "step": 4604 + }, + { + "epoch": 0.10740305591431079, + "grad_norm": 1.6270997524261475, + "learning_rate": 1.999823582310033e-05, + "loss": 1.5622, + "step": 4605 + }, + { + "epoch": 0.1074263790534887, + "grad_norm": 1.3259307146072388, + "learning_rate": 1.9998231090373048e-05, + "loss": 1.1944, + "step": 4606 + }, + { + "epoch": 0.10744970219266663, + "grad_norm": 1.5523977279663086, + "learning_rate": 1.9998226351306626e-05, + "loss": 1.5758, + "step": 4607 + }, + { + "epoch": 0.10747302533184454, + "grad_norm": 1.810379147529602, + "learning_rate": 1.9998221605901063e-05, + "loss": 1.7552, + "step": 4608 + }, + { + "epoch": 0.10749634847102246, + "grad_norm": 1.8421108722686768, + "learning_rate": 1.9998216854156367e-05, + "loss": 1.6426, + "step": 4609 + }, + { + "epoch": 0.10751967161020037, + "grad_norm": 1.9336531162261963, + "learning_rate": 1.999821209607254e-05, + "loss": 1.6727, + "step": 4610 + }, + { + "epoch": 0.1075429947493783, + "grad_norm": 1.3325319290161133, + "learning_rate": 1.9998207331649584e-05, + "loss": 1.3482, + "step": 4611 + }, + { + "epoch": 0.10756631788855621, + "grad_norm": 1.5683438777923584, + "learning_rate": 1.99982025608875e-05, + "loss": 1.4532, + "step": 4612 + }, + { + "epoch": 0.10758964102773413, + "grad_norm": 1.6363450288772583, + "learning_rate": 1.9998197783786298e-05, + "loss": 1.3097, + "step": 4613 + }, + { + "epoch": 0.10761296416691205, + "grad_norm": 1.9321645498275757, + "learning_rate": 1.999819300034597e-05, + "loss": 1.6566, + "step": 4614 + }, + { + "epoch": 0.10763628730608997, + "grad_norm": 1.6562678813934326, + "learning_rate": 1.999818821056653e-05, + "loss": 1.5087, + "step": 4615 + }, + { + "epoch": 0.10765961044526788, + "grad_norm": 1.3343039751052856, + "learning_rate": 1.9998183414447974e-05, + "loss": 1.7068, + "step": 4616 + }, + { + "epoch": 0.10768293358444579, + "grad_norm": 1.6911089420318604, + "learning_rate": 1.9998178611990308e-05, + "loss": 1.3978, + "step": 4617 + }, + { + "epoch": 0.10770625672362372, + "grad_norm": 1.6219162940979004, + "learning_rate": 1.999817380319353e-05, + "loss": 1.6076, + "step": 4618 + }, + { + "epoch": 0.10772957986280163, + "grad_norm": 1.6606857776641846, + "learning_rate": 1.9998168988057655e-05, + "loss": 1.2402, + "step": 4619 + }, + { + "epoch": 0.10775290300197955, + "grad_norm": 1.5331239700317383, + "learning_rate": 1.9998164166582672e-05, + "loss": 1.3521, + "step": 4620 + }, + { + "epoch": 0.10777622614115746, + "grad_norm": 1.4964178800582886, + "learning_rate": 1.9998159338768593e-05, + "loss": 1.242, + "step": 4621 + }, + { + "epoch": 0.10779954928033539, + "grad_norm": 1.9768437147140503, + "learning_rate": 1.9998154504615418e-05, + "loss": 1.3477, + "step": 4622 + }, + { + "epoch": 0.1078228724195133, + "grad_norm": 1.8231476545333862, + "learning_rate": 1.999814966412315e-05, + "loss": 1.8586, + "step": 4623 + }, + { + "epoch": 0.10784619555869122, + "grad_norm": 1.7720146179199219, + "learning_rate": 1.9998144817291795e-05, + "loss": 1.4332, + "step": 4624 + }, + { + "epoch": 0.10786951869786914, + "grad_norm": 1.5090407133102417, + "learning_rate": 1.999813996412135e-05, + "loss": 1.1353, + "step": 4625 + }, + { + "epoch": 0.10789284183704706, + "grad_norm": 2.024202585220337, + "learning_rate": 1.9998135104611825e-05, + "loss": 1.5498, + "step": 4626 + }, + { + "epoch": 0.10791616497622497, + "grad_norm": 1.5118862390518188, + "learning_rate": 1.999813023876322e-05, + "loss": 1.2549, + "step": 4627 + }, + { + "epoch": 0.1079394881154029, + "grad_norm": 1.607124924659729, + "learning_rate": 1.9998125366575537e-05, + "loss": 1.5745, + "step": 4628 + }, + { + "epoch": 0.1079628112545808, + "grad_norm": 2.0179083347320557, + "learning_rate": 1.999812048804878e-05, + "loss": 1.45, + "step": 4629 + }, + { + "epoch": 0.10798613439375873, + "grad_norm": 1.4259659051895142, + "learning_rate": 1.999811560318295e-05, + "loss": 1.4489, + "step": 4630 + }, + { + "epoch": 0.10800945753293664, + "grad_norm": 1.5856199264526367, + "learning_rate": 1.9998110711978055e-05, + "loss": 1.3063, + "step": 4631 + }, + { + "epoch": 0.10803278067211457, + "grad_norm": 1.8990132808685303, + "learning_rate": 1.9998105814434093e-05, + "loss": 1.4658, + "step": 4632 + }, + { + "epoch": 0.10805610381129248, + "grad_norm": 1.7485079765319824, + "learning_rate": 1.999810091055107e-05, + "loss": 1.3005, + "step": 4633 + }, + { + "epoch": 0.1080794269504704, + "grad_norm": 1.6214683055877686, + "learning_rate": 1.999809600032899e-05, + "loss": 1.1761, + "step": 4634 + }, + { + "epoch": 0.10810275008964831, + "grad_norm": 1.4445825815200806, + "learning_rate": 1.9998091083767855e-05, + "loss": 0.8278, + "step": 4635 + }, + { + "epoch": 0.10812607322882624, + "grad_norm": 1.336374044418335, + "learning_rate": 1.9998086160867663e-05, + "loss": 1.4518, + "step": 4636 + }, + { + "epoch": 0.10814939636800415, + "grad_norm": 2.1542108058929443, + "learning_rate": 1.999808123162843e-05, + "loss": 1.5429, + "step": 4637 + }, + { + "epoch": 0.10817271950718207, + "grad_norm": 1.8157985210418701, + "learning_rate": 1.9998076296050146e-05, + "loss": 1.2964, + "step": 4638 + }, + { + "epoch": 0.10819604264635999, + "grad_norm": 1.6846963167190552, + "learning_rate": 1.9998071354132818e-05, + "loss": 1.2976, + "step": 4639 + }, + { + "epoch": 0.10821936578553791, + "grad_norm": 1.6959627866744995, + "learning_rate": 1.999806640587645e-05, + "loss": 1.3066, + "step": 4640 + }, + { + "epoch": 0.10824268892471582, + "grad_norm": 2.090728998184204, + "learning_rate": 1.999806145128105e-05, + "loss": 1.4062, + "step": 4641 + }, + { + "epoch": 0.10826601206389375, + "grad_norm": 1.8047274351119995, + "learning_rate": 1.9998056490346615e-05, + "loss": 1.6949, + "step": 4642 + }, + { + "epoch": 0.10828933520307166, + "grad_norm": 2.0198192596435547, + "learning_rate": 1.999805152307315e-05, + "loss": 1.46, + "step": 4643 + }, + { + "epoch": 0.10831265834224958, + "grad_norm": 1.8486112356185913, + "learning_rate": 1.9998046549460654e-05, + "loss": 1.6438, + "step": 4644 + }, + { + "epoch": 0.10833598148142749, + "grad_norm": 1.8807770013809204, + "learning_rate": 1.9998041569509137e-05, + "loss": 1.4286, + "step": 4645 + }, + { + "epoch": 0.1083593046206054, + "grad_norm": 2.911431312561035, + "learning_rate": 1.99980365832186e-05, + "loss": 1.5008, + "step": 4646 + }, + { + "epoch": 0.10838262775978333, + "grad_norm": 1.6945289373397827, + "learning_rate": 1.9998031590589042e-05, + "loss": 1.592, + "step": 4647 + }, + { + "epoch": 0.10840595089896124, + "grad_norm": 1.5399086475372314, + "learning_rate": 1.999802659162047e-05, + "loss": 1.6958, + "step": 4648 + }, + { + "epoch": 0.10842927403813916, + "grad_norm": 1.7213293313980103, + "learning_rate": 1.999802158631289e-05, + "loss": 1.2511, + "step": 4649 + }, + { + "epoch": 0.10845259717731708, + "grad_norm": 1.6119493246078491, + "learning_rate": 1.99980165746663e-05, + "loss": 1.2312, + "step": 4650 + }, + { + "epoch": 0.108475920316495, + "grad_norm": 1.7431094646453857, + "learning_rate": 1.9998011556680703e-05, + "loss": 1.4495, + "step": 4651 + }, + { + "epoch": 0.10849924345567291, + "grad_norm": 1.5794720649719238, + "learning_rate": 1.9998006532356108e-05, + "loss": 1.1925, + "step": 4652 + }, + { + "epoch": 0.10852256659485084, + "grad_norm": 1.500409722328186, + "learning_rate": 1.9998001501692512e-05, + "loss": 1.2074, + "step": 4653 + }, + { + "epoch": 0.10854588973402875, + "grad_norm": 1.4552727937698364, + "learning_rate": 1.999799646468992e-05, + "loss": 1.2791, + "step": 4654 + }, + { + "epoch": 0.10856921287320667, + "grad_norm": 1.5714362859725952, + "learning_rate": 1.9997991421348336e-05, + "loss": 1.5333, + "step": 4655 + }, + { + "epoch": 0.10859253601238458, + "grad_norm": 1.6176605224609375, + "learning_rate": 1.9997986371667766e-05, + "loss": 1.6721, + "step": 4656 + }, + { + "epoch": 0.10861585915156251, + "grad_norm": 1.587973952293396, + "learning_rate": 1.999798131564821e-05, + "loss": 1.2112, + "step": 4657 + }, + { + "epoch": 0.10863918229074042, + "grad_norm": 1.6890060901641846, + "learning_rate": 1.9997976253289667e-05, + "loss": 1.2078, + "step": 4658 + }, + { + "epoch": 0.10866250542991834, + "grad_norm": 1.9246002435684204, + "learning_rate": 1.9997971184592147e-05, + "loss": 1.3152, + "step": 4659 + }, + { + "epoch": 0.10868582856909625, + "grad_norm": 2.5927438735961914, + "learning_rate": 1.999796610955565e-05, + "loss": 1.5726, + "step": 4660 + }, + { + "epoch": 0.10870915170827418, + "grad_norm": 1.6981041431427002, + "learning_rate": 1.999796102818018e-05, + "loss": 1.118, + "step": 4661 + }, + { + "epoch": 0.10873247484745209, + "grad_norm": 1.754831075668335, + "learning_rate": 1.999795594046574e-05, + "loss": 1.0722, + "step": 4662 + }, + { + "epoch": 0.10875579798663002, + "grad_norm": 2.376711845397949, + "learning_rate": 1.9997950846412334e-05, + "loss": 1.5271, + "step": 4663 + }, + { + "epoch": 0.10877912112580793, + "grad_norm": 1.8238438367843628, + "learning_rate": 1.9997945746019966e-05, + "loss": 1.533, + "step": 4664 + }, + { + "epoch": 0.10880244426498585, + "grad_norm": 1.9179550409317017, + "learning_rate": 1.9997940639288637e-05, + "loss": 1.535, + "step": 4665 + }, + { + "epoch": 0.10882576740416376, + "grad_norm": 2.026521682739258, + "learning_rate": 1.999793552621835e-05, + "loss": 1.3284, + "step": 4666 + }, + { + "epoch": 0.10884909054334169, + "grad_norm": 1.452511191368103, + "learning_rate": 1.999793040680911e-05, + "loss": 0.8912, + "step": 4667 + }, + { + "epoch": 0.1088724136825196, + "grad_norm": 1.7069728374481201, + "learning_rate": 1.9997925281060923e-05, + "loss": 1.5486, + "step": 4668 + }, + { + "epoch": 0.10889573682169752, + "grad_norm": 1.6261640787124634, + "learning_rate": 1.9997920148973785e-05, + "loss": 1.4506, + "step": 4669 + }, + { + "epoch": 0.10891905996087543, + "grad_norm": 1.6827213764190674, + "learning_rate": 1.9997915010547703e-05, + "loss": 1.0382, + "step": 4670 + }, + { + "epoch": 0.10894238310005336, + "grad_norm": 1.4625062942504883, + "learning_rate": 1.999790986578268e-05, + "loss": 1.4952, + "step": 4671 + }, + { + "epoch": 0.10896570623923127, + "grad_norm": 2.3439128398895264, + "learning_rate": 1.9997904714678724e-05, + "loss": 1.2131, + "step": 4672 + }, + { + "epoch": 0.10898902937840918, + "grad_norm": 1.8880949020385742, + "learning_rate": 1.9997899557235828e-05, + "loss": 1.2906, + "step": 4673 + }, + { + "epoch": 0.1090123525175871, + "grad_norm": 1.5955619812011719, + "learning_rate": 1.9997894393454008e-05, + "loss": 1.26, + "step": 4674 + }, + { + "epoch": 0.10903567565676502, + "grad_norm": 1.6589529514312744, + "learning_rate": 1.9997889223333254e-05, + "loss": 1.5532, + "step": 4675 + }, + { + "epoch": 0.10905899879594294, + "grad_norm": 1.7581021785736084, + "learning_rate": 1.999788404687358e-05, + "loss": 1.1995, + "step": 4676 + }, + { + "epoch": 0.10908232193512085, + "grad_norm": 2.3846583366394043, + "learning_rate": 1.9997878864074983e-05, + "loss": 1.2254, + "step": 4677 + }, + { + "epoch": 0.10910564507429878, + "grad_norm": 2.5479581356048584, + "learning_rate": 1.999787367493747e-05, + "loss": 1.552, + "step": 4678 + }, + { + "epoch": 0.10912896821347669, + "grad_norm": 1.7845826148986816, + "learning_rate": 1.999786847946104e-05, + "loss": 1.6222, + "step": 4679 + }, + { + "epoch": 0.10915229135265461, + "grad_norm": 1.794092059135437, + "learning_rate": 1.99978632776457e-05, + "loss": 1.0853, + "step": 4680 + }, + { + "epoch": 0.10917561449183252, + "grad_norm": 1.983144998550415, + "learning_rate": 1.9997858069491453e-05, + "loss": 1.7065, + "step": 4681 + }, + { + "epoch": 0.10919893763101045, + "grad_norm": 2.0391225814819336, + "learning_rate": 1.9997852854998302e-05, + "loss": 1.5509, + "step": 4682 + }, + { + "epoch": 0.10922226077018836, + "grad_norm": 2.037996768951416, + "learning_rate": 1.9997847634166248e-05, + "loss": 1.4287, + "step": 4683 + }, + { + "epoch": 0.10924558390936628, + "grad_norm": 2.2122488021850586, + "learning_rate": 1.9997842406995298e-05, + "loss": 1.5017, + "step": 4684 + }, + { + "epoch": 0.1092689070485442, + "grad_norm": 1.6282988786697388, + "learning_rate": 1.999783717348545e-05, + "loss": 1.7561, + "step": 4685 + }, + { + "epoch": 0.10929223018772212, + "grad_norm": 2.0643422603607178, + "learning_rate": 1.9997831933636715e-05, + "loss": 1.3793, + "step": 4686 + }, + { + "epoch": 0.10931555332690003, + "grad_norm": 1.6061822175979614, + "learning_rate": 1.999782668744909e-05, + "loss": 1.4003, + "step": 4687 + }, + { + "epoch": 0.10933887646607796, + "grad_norm": 1.6987704038619995, + "learning_rate": 1.999782143492258e-05, + "loss": 0.9222, + "step": 4688 + }, + { + "epoch": 0.10936219960525587, + "grad_norm": 1.7195863723754883, + "learning_rate": 1.9997816176057188e-05, + "loss": 1.1077, + "step": 4689 + }, + { + "epoch": 0.10938552274443379, + "grad_norm": 2.4128363132476807, + "learning_rate": 1.999781091085292e-05, + "loss": 1.5896, + "step": 4690 + }, + { + "epoch": 0.1094088458836117, + "grad_norm": 1.7924901247024536, + "learning_rate": 1.9997805639309776e-05, + "loss": 1.5602, + "step": 4691 + }, + { + "epoch": 0.10943216902278963, + "grad_norm": 1.849171757698059, + "learning_rate": 1.9997800361427763e-05, + "loss": 1.7518, + "step": 4692 + }, + { + "epoch": 0.10945549216196754, + "grad_norm": 1.589709997177124, + "learning_rate": 1.999779507720688e-05, + "loss": 1.3019, + "step": 4693 + }, + { + "epoch": 0.10947881530114546, + "grad_norm": 1.6170086860656738, + "learning_rate": 1.999778978664713e-05, + "loss": 1.3426, + "step": 4694 + }, + { + "epoch": 0.10950213844032337, + "grad_norm": 1.4563287496566772, + "learning_rate": 1.9997784489748527e-05, + "loss": 1.2167, + "step": 4695 + }, + { + "epoch": 0.1095254615795013, + "grad_norm": 1.3568425178527832, + "learning_rate": 1.999777918651106e-05, + "loss": 1.3701, + "step": 4696 + }, + { + "epoch": 0.10954878471867921, + "grad_norm": 1.3222202062606812, + "learning_rate": 1.9997773876934738e-05, + "loss": 1.2981, + "step": 4697 + }, + { + "epoch": 0.10957210785785713, + "grad_norm": 1.4239948987960815, + "learning_rate": 1.999776856101957e-05, + "loss": 1.1203, + "step": 4698 + }, + { + "epoch": 0.10959543099703505, + "grad_norm": 2.1866133213043213, + "learning_rate": 1.999776323876555e-05, + "loss": 1.5782, + "step": 4699 + }, + { + "epoch": 0.10961875413621297, + "grad_norm": 1.364729881286621, + "learning_rate": 1.9997757910172683e-05, + "loss": 1.4202, + "step": 4700 + }, + { + "epoch": 0.10964207727539088, + "grad_norm": 1.4880620241165161, + "learning_rate": 1.9997752575240978e-05, + "loss": 1.3443, + "step": 4701 + }, + { + "epoch": 0.10966540041456879, + "grad_norm": 2.045469045639038, + "learning_rate": 1.9997747233970438e-05, + "loss": 1.8308, + "step": 4702 + }, + { + "epoch": 0.10968872355374672, + "grad_norm": 1.7237162590026855, + "learning_rate": 1.999774188636106e-05, + "loss": 1.3901, + "step": 4703 + }, + { + "epoch": 0.10971204669292463, + "grad_norm": 2.113842725753784, + "learning_rate": 1.9997736532412854e-05, + "loss": 1.7517, + "step": 4704 + }, + { + "epoch": 0.10973536983210255, + "grad_norm": 1.5148341655731201, + "learning_rate": 1.9997731172125817e-05, + "loss": 1.2831, + "step": 4705 + }, + { + "epoch": 0.10975869297128046, + "grad_norm": 1.8842540979385376, + "learning_rate": 1.999772580549996e-05, + "loss": 1.4113, + "step": 4706 + }, + { + "epoch": 0.10978201611045839, + "grad_norm": 1.5884028673171997, + "learning_rate": 1.999772043253528e-05, + "loss": 1.4935, + "step": 4707 + }, + { + "epoch": 0.1098053392496363, + "grad_norm": 1.560294270515442, + "learning_rate": 1.999771505323178e-05, + "loss": 1.207, + "step": 4708 + }, + { + "epoch": 0.10982866238881422, + "grad_norm": 1.4098200798034668, + "learning_rate": 1.9997709667589473e-05, + "loss": 1.155, + "step": 4709 + }, + { + "epoch": 0.10985198552799214, + "grad_norm": 1.5581802129745483, + "learning_rate": 1.999770427560835e-05, + "loss": 1.3697, + "step": 4710 + }, + { + "epoch": 0.10987530866717006, + "grad_norm": 1.7986078262329102, + "learning_rate": 1.9997698877288423e-05, + "loss": 0.9984, + "step": 4711 + }, + { + "epoch": 0.10989863180634797, + "grad_norm": 1.4624197483062744, + "learning_rate": 1.9997693472629688e-05, + "loss": 1.3985, + "step": 4712 + }, + { + "epoch": 0.1099219549455259, + "grad_norm": 1.7539037466049194, + "learning_rate": 1.9997688061632157e-05, + "loss": 1.2829, + "step": 4713 + }, + { + "epoch": 0.1099452780847038, + "grad_norm": 1.590420126914978, + "learning_rate": 1.999768264429583e-05, + "loss": 1.249, + "step": 4714 + }, + { + "epoch": 0.10996860122388173, + "grad_norm": 2.3379104137420654, + "learning_rate": 1.999767722062071e-05, + "loss": 1.2626, + "step": 4715 + }, + { + "epoch": 0.10999192436305964, + "grad_norm": 2.0119636058807373, + "learning_rate": 1.9997671790606798e-05, + "loss": 1.4327, + "step": 4716 + }, + { + "epoch": 0.11001524750223757, + "grad_norm": 1.710679054260254, + "learning_rate": 1.99976663542541e-05, + "loss": 1.1288, + "step": 4717 + }, + { + "epoch": 0.11003857064141548, + "grad_norm": 2.4558565616607666, + "learning_rate": 1.9997660911562618e-05, + "loss": 1.4038, + "step": 4718 + }, + { + "epoch": 0.1100618937805934, + "grad_norm": 1.8955258131027222, + "learning_rate": 1.9997655462532356e-05, + "loss": 1.2128, + "step": 4719 + }, + { + "epoch": 0.11008521691977131, + "grad_norm": 2.1894257068634033, + "learning_rate": 1.999765000716332e-05, + "loss": 1.6796, + "step": 4720 + }, + { + "epoch": 0.11010854005894924, + "grad_norm": 1.8517687320709229, + "learning_rate": 1.9997644545455513e-05, + "loss": 1.4889, + "step": 4721 + }, + { + "epoch": 0.11013186319812715, + "grad_norm": 1.4544748067855835, + "learning_rate": 1.9997639077408935e-05, + "loss": 1.2158, + "step": 4722 + }, + { + "epoch": 0.11015518633730507, + "grad_norm": 1.4852479696273804, + "learning_rate": 1.999763360302359e-05, + "loss": 1.4004, + "step": 4723 + }, + { + "epoch": 0.11017850947648299, + "grad_norm": 1.441847324371338, + "learning_rate": 1.9997628122299485e-05, + "loss": 1.4372, + "step": 4724 + }, + { + "epoch": 0.11020183261566091, + "grad_norm": 1.5786794424057007, + "learning_rate": 1.9997622635236618e-05, + "loss": 1.4358, + "step": 4725 + }, + { + "epoch": 0.11022515575483882, + "grad_norm": 1.9009759426116943, + "learning_rate": 1.9997617141835e-05, + "loss": 1.3353, + "step": 4726 + }, + { + "epoch": 0.11024847889401675, + "grad_norm": 1.5824389457702637, + "learning_rate": 1.9997611642094627e-05, + "loss": 1.5705, + "step": 4727 + }, + { + "epoch": 0.11027180203319466, + "grad_norm": 2.0614171028137207, + "learning_rate": 1.9997606136015507e-05, + "loss": 1.1281, + "step": 4728 + }, + { + "epoch": 0.11029512517237258, + "grad_norm": 1.6476571559906006, + "learning_rate": 1.9997600623597643e-05, + "loss": 1.24, + "step": 4729 + }, + { + "epoch": 0.1103184483115505, + "grad_norm": 2.5238232612609863, + "learning_rate": 1.9997595104841038e-05, + "loss": 1.2169, + "step": 4730 + }, + { + "epoch": 0.1103417714507284, + "grad_norm": 1.9560441970825195, + "learning_rate": 1.9997589579745693e-05, + "loss": 1.4138, + "step": 4731 + }, + { + "epoch": 0.11036509458990633, + "grad_norm": 1.550877332687378, + "learning_rate": 1.9997584048311614e-05, + "loss": 1.1269, + "step": 4732 + }, + { + "epoch": 0.11038841772908424, + "grad_norm": 2.002683162689209, + "learning_rate": 1.9997578510538805e-05, + "loss": 1.5355, + "step": 4733 + }, + { + "epoch": 0.11041174086826216, + "grad_norm": 1.581544041633606, + "learning_rate": 1.999757296642727e-05, + "loss": 1.4729, + "step": 4734 + }, + { + "epoch": 0.11043506400744008, + "grad_norm": 2.1235954761505127, + "learning_rate": 1.999756741597701e-05, + "loss": 1.6424, + "step": 4735 + }, + { + "epoch": 0.110458387146618, + "grad_norm": 1.5193734169006348, + "learning_rate": 1.999756185918803e-05, + "loss": 1.0682, + "step": 4736 + }, + { + "epoch": 0.11048171028579591, + "grad_norm": 1.9458128213882446, + "learning_rate": 1.9997556296060332e-05, + "loss": 1.4472, + "step": 4737 + }, + { + "epoch": 0.11050503342497384, + "grad_norm": 1.8136341571807861, + "learning_rate": 1.9997550726593925e-05, + "loss": 1.7115, + "step": 4738 + }, + { + "epoch": 0.11052835656415175, + "grad_norm": 1.7098948955535889, + "learning_rate": 1.9997545150788805e-05, + "loss": 1.5029, + "step": 4739 + }, + { + "epoch": 0.11055167970332967, + "grad_norm": 1.6236376762390137, + "learning_rate": 1.999753956864498e-05, + "loss": 1.6369, + "step": 4740 + }, + { + "epoch": 0.11057500284250758, + "grad_norm": 1.7370587587356567, + "learning_rate": 1.999753398016245e-05, + "loss": 1.2514, + "step": 4741 + }, + { + "epoch": 0.11059832598168551, + "grad_norm": 2.082703113555908, + "learning_rate": 1.9997528385341223e-05, + "loss": 1.4751, + "step": 4742 + }, + { + "epoch": 0.11062164912086342, + "grad_norm": 1.9707653522491455, + "learning_rate": 1.99975227841813e-05, + "loss": 1.6404, + "step": 4743 + }, + { + "epoch": 0.11064497226004134, + "grad_norm": 1.470170259475708, + "learning_rate": 1.9997517176682687e-05, + "loss": 1.5853, + "step": 4744 + }, + { + "epoch": 0.11066829539921925, + "grad_norm": 1.8218199014663696, + "learning_rate": 1.9997511562845384e-05, + "loss": 1.2697, + "step": 4745 + }, + { + "epoch": 0.11069161853839718, + "grad_norm": 1.8697524070739746, + "learning_rate": 1.9997505942669394e-05, + "loss": 1.6328, + "step": 4746 + }, + { + "epoch": 0.11071494167757509, + "grad_norm": 1.5326578617095947, + "learning_rate": 1.999750031615473e-05, + "loss": 1.5544, + "step": 4747 + }, + { + "epoch": 0.11073826481675302, + "grad_norm": 1.7094053030014038, + "learning_rate": 1.999749468330138e-05, + "loss": 1.35, + "step": 4748 + }, + { + "epoch": 0.11076158795593093, + "grad_norm": 2.1253573894500732, + "learning_rate": 1.999748904410936e-05, + "loss": 1.4909, + "step": 4749 + }, + { + "epoch": 0.11078491109510885, + "grad_norm": 2.1014323234558105, + "learning_rate": 1.9997483398578668e-05, + "loss": 1.3446, + "step": 4750 + }, + { + "epoch": 0.11080823423428676, + "grad_norm": 1.844268560409546, + "learning_rate": 1.9997477746709308e-05, + "loss": 1.3962, + "step": 4751 + }, + { + "epoch": 0.11083155737346469, + "grad_norm": 2.438141345977783, + "learning_rate": 1.9997472088501285e-05, + "loss": 1.7026, + "step": 4752 + }, + { + "epoch": 0.1108548805126426, + "grad_norm": 1.7001752853393555, + "learning_rate": 1.9997466423954606e-05, + "loss": 1.2695, + "step": 4753 + }, + { + "epoch": 0.11087820365182052, + "grad_norm": 1.6355085372924805, + "learning_rate": 1.999746075306927e-05, + "loss": 1.0506, + "step": 4754 + }, + { + "epoch": 0.11090152679099843, + "grad_norm": 2.8396060466766357, + "learning_rate": 1.9997455075845278e-05, + "loss": 1.3554, + "step": 4755 + }, + { + "epoch": 0.11092484993017636, + "grad_norm": 1.9583638906478882, + "learning_rate": 1.999744939228264e-05, + "loss": 1.4245, + "step": 4756 + }, + { + "epoch": 0.11094817306935427, + "grad_norm": 1.8647066354751587, + "learning_rate": 1.9997443702381357e-05, + "loss": 1.6369, + "step": 4757 + }, + { + "epoch": 0.1109714962085322, + "grad_norm": 1.5789604187011719, + "learning_rate": 1.9997438006141426e-05, + "loss": 1.3321, + "step": 4758 + }, + { + "epoch": 0.1109948193477101, + "grad_norm": 1.3761906623840332, + "learning_rate": 1.9997432303562863e-05, + "loss": 0.9831, + "step": 4759 + }, + { + "epoch": 0.11101814248688802, + "grad_norm": 1.8360717296600342, + "learning_rate": 1.9997426594645664e-05, + "loss": 1.2267, + "step": 4760 + }, + { + "epoch": 0.11104146562606594, + "grad_norm": 1.647345781326294, + "learning_rate": 1.9997420879389834e-05, + "loss": 1.3699, + "step": 4761 + }, + { + "epoch": 0.11106478876524385, + "grad_norm": 1.7599635124206543, + "learning_rate": 1.9997415157795377e-05, + "loss": 1.9307, + "step": 4762 + }, + { + "epoch": 0.11108811190442178, + "grad_norm": 1.5892014503479004, + "learning_rate": 1.99974094298623e-05, + "loss": 1.4318, + "step": 4763 + }, + { + "epoch": 0.11111143504359969, + "grad_norm": 1.6504076719284058, + "learning_rate": 1.9997403695590596e-05, + "loss": 1.3844, + "step": 4764 + }, + { + "epoch": 0.11113475818277761, + "grad_norm": 2.1827478408813477, + "learning_rate": 1.9997397954980277e-05, + "loss": 1.802, + "step": 4765 + }, + { + "epoch": 0.11115808132195552, + "grad_norm": 1.9092530012130737, + "learning_rate": 1.9997392208031347e-05, + "loss": 1.6177, + "step": 4766 + }, + { + "epoch": 0.11118140446113345, + "grad_norm": 2.4833455085754395, + "learning_rate": 1.9997386454743808e-05, + "loss": 1.8023, + "step": 4767 + }, + { + "epoch": 0.11120472760031136, + "grad_norm": 1.7990853786468506, + "learning_rate": 1.9997380695117664e-05, + "loss": 1.7281, + "step": 4768 + }, + { + "epoch": 0.11122805073948928, + "grad_norm": 1.8523533344268799, + "learning_rate": 1.9997374929152915e-05, + "loss": 1.5475, + "step": 4769 + }, + { + "epoch": 0.1112513738786672, + "grad_norm": 1.3760818243026733, + "learning_rate": 1.9997369156849575e-05, + "loss": 1.3055, + "step": 4770 + }, + { + "epoch": 0.11127469701784512, + "grad_norm": 1.6081079244613647, + "learning_rate": 1.9997363378207632e-05, + "loss": 1.2354, + "step": 4771 + }, + { + "epoch": 0.11129802015702303, + "grad_norm": 1.8561193943023682, + "learning_rate": 1.9997357593227102e-05, + "loss": 1.3127, + "step": 4772 + }, + { + "epoch": 0.11132134329620096, + "grad_norm": 1.6075220108032227, + "learning_rate": 1.9997351801907984e-05, + "loss": 1.6588, + "step": 4773 + }, + { + "epoch": 0.11134466643537887, + "grad_norm": 1.7126284837722778, + "learning_rate": 1.9997346004250284e-05, + "loss": 1.4986, + "step": 4774 + }, + { + "epoch": 0.11136798957455679, + "grad_norm": 1.7078109979629517, + "learning_rate": 1.9997340200254003e-05, + "loss": 1.5173, + "step": 4775 + }, + { + "epoch": 0.1113913127137347, + "grad_norm": 1.885150671005249, + "learning_rate": 1.9997334389919144e-05, + "loss": 1.4414, + "step": 4776 + }, + { + "epoch": 0.11141463585291263, + "grad_norm": 1.6406018733978271, + "learning_rate": 1.9997328573245718e-05, + "loss": 1.1587, + "step": 4777 + }, + { + "epoch": 0.11143795899209054, + "grad_norm": 2.351262092590332, + "learning_rate": 1.999732275023372e-05, + "loss": 1.7137, + "step": 4778 + }, + { + "epoch": 0.11146128213126846, + "grad_norm": 1.8597307205200195, + "learning_rate": 1.9997316920883154e-05, + "loss": 1.2877, + "step": 4779 + }, + { + "epoch": 0.11148460527044637, + "grad_norm": 1.9467705488204956, + "learning_rate": 1.999731108519403e-05, + "loss": 1.4246, + "step": 4780 + }, + { + "epoch": 0.1115079284096243, + "grad_norm": 1.8004958629608154, + "learning_rate": 1.9997305243166347e-05, + "loss": 1.3196, + "step": 4781 + }, + { + "epoch": 0.11153125154880221, + "grad_norm": 1.4756791591644287, + "learning_rate": 1.999729939480011e-05, + "loss": 1.5004, + "step": 4782 + }, + { + "epoch": 0.11155457468798013, + "grad_norm": 1.7322580814361572, + "learning_rate": 1.9997293540095322e-05, + "loss": 1.5387, + "step": 4783 + }, + { + "epoch": 0.11157789782715805, + "grad_norm": 1.7892494201660156, + "learning_rate": 1.9997287679051992e-05, + "loss": 1.4908, + "step": 4784 + }, + { + "epoch": 0.11160122096633597, + "grad_norm": 1.6499955654144287, + "learning_rate": 1.9997281811670115e-05, + "loss": 1.1038, + "step": 4785 + }, + { + "epoch": 0.11162454410551388, + "grad_norm": 1.779866099357605, + "learning_rate": 1.99972759379497e-05, + "loss": 1.6099, + "step": 4786 + }, + { + "epoch": 0.11164786724469179, + "grad_norm": 1.6814109086990356, + "learning_rate": 1.999727005789075e-05, + "loss": 1.2583, + "step": 4787 + }, + { + "epoch": 0.11167119038386972, + "grad_norm": 2.033905506134033, + "learning_rate": 1.9997264171493266e-05, + "loss": 1.5961, + "step": 4788 + }, + { + "epoch": 0.11169451352304763, + "grad_norm": 1.79148530960083, + "learning_rate": 1.9997258278757256e-05, + "loss": 1.4551, + "step": 4789 + }, + { + "epoch": 0.11171783666222555, + "grad_norm": 2.1484973430633545, + "learning_rate": 1.9997252379682723e-05, + "loss": 1.6561, + "step": 4790 + }, + { + "epoch": 0.11174115980140346, + "grad_norm": 1.8857439756393433, + "learning_rate": 1.999724647426967e-05, + "loss": 1.8829, + "step": 4791 + }, + { + "epoch": 0.11176448294058139, + "grad_norm": 1.6438655853271484, + "learning_rate": 1.9997240562518095e-05, + "loss": 1.4009, + "step": 4792 + }, + { + "epoch": 0.1117878060797593, + "grad_norm": 1.4829570055007935, + "learning_rate": 1.999723464442801e-05, + "loss": 1.0155, + "step": 4793 + }, + { + "epoch": 0.11181112921893722, + "grad_norm": 1.896540641784668, + "learning_rate": 1.9997228719999417e-05, + "loss": 1.536, + "step": 4794 + }, + { + "epoch": 0.11183445235811514, + "grad_norm": 1.9464058876037598, + "learning_rate": 1.999722278923232e-05, + "loss": 1.3048, + "step": 4795 + }, + { + "epoch": 0.11185777549729306, + "grad_norm": 1.6867518424987793, + "learning_rate": 1.9997216852126718e-05, + "loss": 1.7278, + "step": 4796 + }, + { + "epoch": 0.11188109863647097, + "grad_norm": 1.5861499309539795, + "learning_rate": 1.9997210908682617e-05, + "loss": 1.4502, + "step": 4797 + }, + { + "epoch": 0.1119044217756489, + "grad_norm": 2.019364833831787, + "learning_rate": 1.9997204958900026e-05, + "loss": 1.3034, + "step": 4798 + }, + { + "epoch": 0.11192774491482681, + "grad_norm": 1.4557583332061768, + "learning_rate": 1.999719900277894e-05, + "loss": 1.3609, + "step": 4799 + }, + { + "epoch": 0.11195106805400473, + "grad_norm": 1.5779038667678833, + "learning_rate": 1.9997193040319368e-05, + "loss": 1.7772, + "step": 4800 + }, + { + "epoch": 0.11197439119318264, + "grad_norm": 2.1182780265808105, + "learning_rate": 1.9997187071521317e-05, + "loss": 1.7829, + "step": 4801 + }, + { + "epoch": 0.11199771433236057, + "grad_norm": 1.4608792066574097, + "learning_rate": 1.9997181096384787e-05, + "loss": 1.5513, + "step": 4802 + }, + { + "epoch": 0.11202103747153848, + "grad_norm": 1.8238307237625122, + "learning_rate": 1.9997175114909778e-05, + "loss": 1.2157, + "step": 4803 + }, + { + "epoch": 0.1120443606107164, + "grad_norm": 1.9729714393615723, + "learning_rate": 1.99971691270963e-05, + "loss": 1.3976, + "step": 4804 + }, + { + "epoch": 0.11206768374989431, + "grad_norm": 1.6585525274276733, + "learning_rate": 1.9997163132944353e-05, + "loss": 1.5639, + "step": 4805 + }, + { + "epoch": 0.11209100688907224, + "grad_norm": 1.906258463859558, + "learning_rate": 1.999715713245394e-05, + "loss": 1.4502, + "step": 4806 + }, + { + "epoch": 0.11211433002825015, + "grad_norm": 1.8018217086791992, + "learning_rate": 1.999715112562507e-05, + "loss": 1.4047, + "step": 4807 + }, + { + "epoch": 0.11213765316742808, + "grad_norm": 1.9643853902816772, + "learning_rate": 1.9997145112457744e-05, + "loss": 1.5038, + "step": 4808 + }, + { + "epoch": 0.11216097630660599, + "grad_norm": 1.5914634466171265, + "learning_rate": 1.9997139092951965e-05, + "loss": 1.4842, + "step": 4809 + }, + { + "epoch": 0.11218429944578391, + "grad_norm": 1.4541680812835693, + "learning_rate": 1.9997133067107736e-05, + "loss": 1.3954, + "step": 4810 + }, + { + "epoch": 0.11220762258496182, + "grad_norm": 1.8242120742797852, + "learning_rate": 1.9997127034925065e-05, + "loss": 1.4084, + "step": 4811 + }, + { + "epoch": 0.11223094572413975, + "grad_norm": 1.797040581703186, + "learning_rate": 1.9997120996403948e-05, + "loss": 1.5137, + "step": 4812 + }, + { + "epoch": 0.11225426886331766, + "grad_norm": 1.5651525259017944, + "learning_rate": 1.99971149515444e-05, + "loss": 1.4482, + "step": 4813 + }, + { + "epoch": 0.11227759200249558, + "grad_norm": 2.747147560119629, + "learning_rate": 1.9997108900346414e-05, + "loss": 1.349, + "step": 4814 + }, + { + "epoch": 0.1123009151416735, + "grad_norm": 1.3470970392227173, + "learning_rate": 1.999710284281e-05, + "loss": 1.0509, + "step": 4815 + }, + { + "epoch": 0.1123242382808514, + "grad_norm": 1.7725294828414917, + "learning_rate": 1.9997096778935162e-05, + "loss": 1.6337, + "step": 4816 + }, + { + "epoch": 0.11234756142002933, + "grad_norm": 1.618151068687439, + "learning_rate": 1.9997090708721897e-05, + "loss": 1.5256, + "step": 4817 + }, + { + "epoch": 0.11237088455920724, + "grad_norm": 1.4803017377853394, + "learning_rate": 1.9997084632170218e-05, + "loss": 1.2074, + "step": 4818 + }, + { + "epoch": 0.11239420769838517, + "grad_norm": 1.7909730672836304, + "learning_rate": 1.9997078549280126e-05, + "loss": 1.3639, + "step": 4819 + }, + { + "epoch": 0.11241753083756308, + "grad_norm": 2.1538288593292236, + "learning_rate": 1.9997072460051622e-05, + "loss": 1.386, + "step": 4820 + }, + { + "epoch": 0.112440853976741, + "grad_norm": 2.1644701957702637, + "learning_rate": 1.999706636448471e-05, + "loss": 1.8123, + "step": 4821 + }, + { + "epoch": 0.11246417711591891, + "grad_norm": 1.6548917293548584, + "learning_rate": 1.99970602625794e-05, + "loss": 1.5236, + "step": 4822 + }, + { + "epoch": 0.11248750025509684, + "grad_norm": 1.8485840559005737, + "learning_rate": 1.9997054154335687e-05, + "loss": 1.5656, + "step": 4823 + }, + { + "epoch": 0.11251082339427475, + "grad_norm": 1.8559378385543823, + "learning_rate": 1.9997048039753584e-05, + "loss": 1.5271, + "step": 4824 + }, + { + "epoch": 0.11253414653345267, + "grad_norm": 1.84505033493042, + "learning_rate": 1.9997041918833085e-05, + "loss": 1.6146, + "step": 4825 + }, + { + "epoch": 0.11255746967263058, + "grad_norm": 1.840195655822754, + "learning_rate": 1.99970357915742e-05, + "loss": 1.1284, + "step": 4826 + }, + { + "epoch": 0.11258079281180851, + "grad_norm": 1.8387125730514526, + "learning_rate": 1.9997029657976933e-05, + "loss": 1.6407, + "step": 4827 + }, + { + "epoch": 0.11260411595098642, + "grad_norm": 1.4782036542892456, + "learning_rate": 1.9997023518041286e-05, + "loss": 1.4918, + "step": 4828 + }, + { + "epoch": 0.11262743909016434, + "grad_norm": 1.4858899116516113, + "learning_rate": 1.9997017371767265e-05, + "loss": 0.9922, + "step": 4829 + }, + { + "epoch": 0.11265076222934226, + "grad_norm": 1.8135300874710083, + "learning_rate": 1.9997011219154873e-05, + "loss": 1.4396, + "step": 4830 + }, + { + "epoch": 0.11267408536852018, + "grad_norm": 1.5662935972213745, + "learning_rate": 1.9997005060204112e-05, + "loss": 1.3612, + "step": 4831 + }, + { + "epoch": 0.11269740850769809, + "grad_norm": 1.5332759618759155, + "learning_rate": 1.9996998894914984e-05, + "loss": 1.2752, + "step": 4832 + }, + { + "epoch": 0.11272073164687602, + "grad_norm": 1.4699090719223022, + "learning_rate": 1.9996992723287502e-05, + "loss": 1.5256, + "step": 4833 + }, + { + "epoch": 0.11274405478605393, + "grad_norm": 1.9431648254394531, + "learning_rate": 1.999698654532166e-05, + "loss": 1.3271, + "step": 4834 + }, + { + "epoch": 0.11276737792523185, + "grad_norm": 1.348463773727417, + "learning_rate": 1.999698036101747e-05, + "loss": 1.196, + "step": 4835 + }, + { + "epoch": 0.11279070106440976, + "grad_norm": 1.9276671409606934, + "learning_rate": 1.999697417037493e-05, + "loss": 1.4015, + "step": 4836 + }, + { + "epoch": 0.11281402420358769, + "grad_norm": 1.8967229127883911, + "learning_rate": 1.9996967973394046e-05, + "loss": 1.4184, + "step": 4837 + }, + { + "epoch": 0.1128373473427656, + "grad_norm": 1.8818328380584717, + "learning_rate": 1.999696177007482e-05, + "loss": 1.2718, + "step": 4838 + }, + { + "epoch": 0.11286067048194352, + "grad_norm": 1.7954262495040894, + "learning_rate": 1.999695556041726e-05, + "loss": 1.4324, + "step": 4839 + }, + { + "epoch": 0.11288399362112143, + "grad_norm": 1.7013620138168335, + "learning_rate": 1.9996949344421366e-05, + "loss": 1.2892, + "step": 4840 + }, + { + "epoch": 0.11290731676029936, + "grad_norm": 1.8514941930770874, + "learning_rate": 1.9996943122087144e-05, + "loss": 1.494, + "step": 4841 + }, + { + "epoch": 0.11293063989947727, + "grad_norm": 1.871381402015686, + "learning_rate": 1.9996936893414598e-05, + "loss": 1.6722, + "step": 4842 + }, + { + "epoch": 0.1129539630386552, + "grad_norm": 1.4767144918441772, + "learning_rate": 1.9996930658403735e-05, + "loss": 1.0865, + "step": 4843 + }, + { + "epoch": 0.1129772861778331, + "grad_norm": 2.1904797554016113, + "learning_rate": 1.999692441705455e-05, + "loss": 1.5067, + "step": 4844 + }, + { + "epoch": 0.11300060931701102, + "grad_norm": 1.3845102787017822, + "learning_rate": 1.9996918169367054e-05, + "loss": 1.4637, + "step": 4845 + }, + { + "epoch": 0.11302393245618894, + "grad_norm": 1.3896088600158691, + "learning_rate": 1.999691191534125e-05, + "loss": 1.4277, + "step": 4846 + }, + { + "epoch": 0.11304725559536685, + "grad_norm": 1.6329911947250366, + "learning_rate": 1.9996905654977143e-05, + "loss": 1.4913, + "step": 4847 + }, + { + "epoch": 0.11307057873454478, + "grad_norm": 1.9645805358886719, + "learning_rate": 1.999689938827473e-05, + "loss": 1.6053, + "step": 4848 + }, + { + "epoch": 0.11309390187372269, + "grad_norm": 1.4963576793670654, + "learning_rate": 1.9996893115234027e-05, + "loss": 1.5192, + "step": 4849 + }, + { + "epoch": 0.11311722501290061, + "grad_norm": 2.069765567779541, + "learning_rate": 1.999688683585503e-05, + "loss": 1.446, + "step": 4850 + }, + { + "epoch": 0.11314054815207852, + "grad_norm": 1.8455005884170532, + "learning_rate": 1.9996880550137742e-05, + "loss": 1.4464, + "step": 4851 + }, + { + "epoch": 0.11316387129125645, + "grad_norm": 1.8833606243133545, + "learning_rate": 1.999687425808217e-05, + "loss": 1.4146, + "step": 4852 + }, + { + "epoch": 0.11318719443043436, + "grad_norm": 2.2382304668426514, + "learning_rate": 1.999686795968832e-05, + "loss": 1.3997, + "step": 4853 + }, + { + "epoch": 0.11321051756961228, + "grad_norm": 1.5338480472564697, + "learning_rate": 1.9996861654956187e-05, + "loss": 1.2996, + "step": 4854 + }, + { + "epoch": 0.1132338407087902, + "grad_norm": 2.0787034034729004, + "learning_rate": 1.9996855343885787e-05, + "loss": 1.2946, + "step": 4855 + }, + { + "epoch": 0.11325716384796812, + "grad_norm": 1.8246303796768188, + "learning_rate": 1.9996849026477117e-05, + "loss": 1.5849, + "step": 4856 + }, + { + "epoch": 0.11328048698714603, + "grad_norm": 1.6724165678024292, + "learning_rate": 1.999684270273018e-05, + "loss": 1.2117, + "step": 4857 + }, + { + "epoch": 0.11330381012632396, + "grad_norm": 1.9172333478927612, + "learning_rate": 1.9996836372644982e-05, + "loss": 1.4549, + "step": 4858 + }, + { + "epoch": 0.11332713326550187, + "grad_norm": 2.3572545051574707, + "learning_rate": 1.999683003622153e-05, + "loss": 1.5828, + "step": 4859 + }, + { + "epoch": 0.11335045640467979, + "grad_norm": 1.7597121000289917, + "learning_rate": 1.9996823693459827e-05, + "loss": 1.3256, + "step": 4860 + }, + { + "epoch": 0.1133737795438577, + "grad_norm": 1.7418758869171143, + "learning_rate": 1.999681734435987e-05, + "loss": 1.5079, + "step": 4861 + }, + { + "epoch": 0.11339710268303563, + "grad_norm": 1.6415714025497437, + "learning_rate": 1.9996810988921675e-05, + "loss": 1.474, + "step": 4862 + }, + { + "epoch": 0.11342042582221354, + "grad_norm": 1.460545301437378, + "learning_rate": 1.9996804627145233e-05, + "loss": 1.4171, + "step": 4863 + }, + { + "epoch": 0.11344374896139146, + "grad_norm": 1.6469100713729858, + "learning_rate": 1.9996798259030555e-05, + "loss": 1.2755, + "step": 4864 + }, + { + "epoch": 0.11346707210056937, + "grad_norm": 1.7686614990234375, + "learning_rate": 1.9996791884577645e-05, + "loss": 1.3803, + "step": 4865 + }, + { + "epoch": 0.1134903952397473, + "grad_norm": 1.5787098407745361, + "learning_rate": 1.999678550378651e-05, + "loss": 1.3252, + "step": 4866 + }, + { + "epoch": 0.11351371837892521, + "grad_norm": 1.5128165483474731, + "learning_rate": 1.999677911665715e-05, + "loss": 1.3384, + "step": 4867 + }, + { + "epoch": 0.11353704151810314, + "grad_norm": 1.7517908811569214, + "learning_rate": 1.9996772723189566e-05, + "loss": 1.3678, + "step": 4868 + }, + { + "epoch": 0.11356036465728105, + "grad_norm": 2.1116812229156494, + "learning_rate": 1.999676632338377e-05, + "loss": 1.4632, + "step": 4869 + }, + { + "epoch": 0.11358368779645897, + "grad_norm": 1.730730414390564, + "learning_rate": 1.999675991723976e-05, + "loss": 1.2828, + "step": 4870 + }, + { + "epoch": 0.11360701093563688, + "grad_norm": 1.9060343503952026, + "learning_rate": 1.9996753504757538e-05, + "loss": 1.4953, + "step": 4871 + }, + { + "epoch": 0.1136303340748148, + "grad_norm": 1.9581001996994019, + "learning_rate": 1.9996747085937117e-05, + "loss": 1.6483, + "step": 4872 + }, + { + "epoch": 0.11365365721399272, + "grad_norm": 1.4604954719543457, + "learning_rate": 1.999674066077849e-05, + "loss": 1.3666, + "step": 4873 + }, + { + "epoch": 0.11367698035317063, + "grad_norm": 1.5647526979446411, + "learning_rate": 1.9996734229281672e-05, + "loss": 1.7014, + "step": 4874 + }, + { + "epoch": 0.11370030349234855, + "grad_norm": 1.5901166200637817, + "learning_rate": 1.9996727791446663e-05, + "loss": 1.564, + "step": 4875 + }, + { + "epoch": 0.11372362663152646, + "grad_norm": 1.818510890007019, + "learning_rate": 1.999672134727346e-05, + "loss": 1.6494, + "step": 4876 + }, + { + "epoch": 0.11374694977070439, + "grad_norm": 1.506116509437561, + "learning_rate": 1.9996714896762078e-05, + "loss": 1.486, + "step": 4877 + }, + { + "epoch": 0.1137702729098823, + "grad_norm": 1.6455276012420654, + "learning_rate": 1.9996708439912517e-05, + "loss": 1.4879, + "step": 4878 + }, + { + "epoch": 0.11379359604906022, + "grad_norm": 1.9927424192428589, + "learning_rate": 1.9996701976724778e-05, + "loss": 1.7139, + "step": 4879 + }, + { + "epoch": 0.11381691918823814, + "grad_norm": 1.825521469116211, + "learning_rate": 1.9996695507198867e-05, + "loss": 1.2458, + "step": 4880 + }, + { + "epoch": 0.11384024232741606, + "grad_norm": 2.3200228214263916, + "learning_rate": 1.999668903133479e-05, + "loss": 1.5115, + "step": 4881 + }, + { + "epoch": 0.11386356546659397, + "grad_norm": 2.874640464782715, + "learning_rate": 1.999668254913255e-05, + "loss": 1.2171, + "step": 4882 + }, + { + "epoch": 0.1138868886057719, + "grad_norm": 1.6035988330841064, + "learning_rate": 1.9996676060592152e-05, + "loss": 1.4418, + "step": 4883 + }, + { + "epoch": 0.11391021174494981, + "grad_norm": 1.5072420835494995, + "learning_rate": 1.9996669565713596e-05, + "loss": 1.4429, + "step": 4884 + }, + { + "epoch": 0.11393353488412773, + "grad_norm": 1.9192874431610107, + "learning_rate": 1.999666306449689e-05, + "loss": 1.5421, + "step": 4885 + }, + { + "epoch": 0.11395685802330564, + "grad_norm": 2.3463261127471924, + "learning_rate": 1.999665655694204e-05, + "loss": 1.9386, + "step": 4886 + }, + { + "epoch": 0.11398018116248357, + "grad_norm": 1.3098691701889038, + "learning_rate": 1.999665004304904e-05, + "loss": 1.5214, + "step": 4887 + }, + { + "epoch": 0.11400350430166148, + "grad_norm": 1.6770516633987427, + "learning_rate": 1.9996643522817905e-05, + "loss": 1.8164, + "step": 4888 + }, + { + "epoch": 0.1140268274408394, + "grad_norm": 1.6771339178085327, + "learning_rate": 1.999663699624864e-05, + "loss": 1.4531, + "step": 4889 + }, + { + "epoch": 0.11405015058001731, + "grad_norm": 1.4414012432098389, + "learning_rate": 1.999663046334124e-05, + "loss": 1.262, + "step": 4890 + }, + { + "epoch": 0.11407347371919524, + "grad_norm": 2.0771100521087646, + "learning_rate": 1.9996623924095714e-05, + "loss": 1.5053, + "step": 4891 + }, + { + "epoch": 0.11409679685837315, + "grad_norm": 1.397773027420044, + "learning_rate": 1.999661737851207e-05, + "loss": 1.4299, + "step": 4892 + }, + { + "epoch": 0.11412011999755108, + "grad_norm": 4.096678256988525, + "learning_rate": 1.9996610826590303e-05, + "loss": 1.276, + "step": 4893 + }, + { + "epoch": 0.11414344313672899, + "grad_norm": 2.586124897003174, + "learning_rate": 1.9996604268330424e-05, + "loss": 1.3923, + "step": 4894 + }, + { + "epoch": 0.11416676627590691, + "grad_norm": 2.3592286109924316, + "learning_rate": 1.9996597703732438e-05, + "loss": 1.5799, + "step": 4895 + }, + { + "epoch": 0.11419008941508482, + "grad_norm": 1.6324352025985718, + "learning_rate": 1.9996591132796345e-05, + "loss": 1.2847, + "step": 4896 + }, + { + "epoch": 0.11421341255426275, + "grad_norm": 1.6896082162857056, + "learning_rate": 1.999658455552215e-05, + "loss": 1.5221, + "step": 4897 + }, + { + "epoch": 0.11423673569344066, + "grad_norm": 1.6508216857910156, + "learning_rate": 1.9996577971909857e-05, + "loss": 1.094, + "step": 4898 + }, + { + "epoch": 0.11426005883261858, + "grad_norm": 2.0739083290100098, + "learning_rate": 1.9996571381959474e-05, + "loss": 1.4574, + "step": 4899 + }, + { + "epoch": 0.1142833819717965, + "grad_norm": 1.7015663385391235, + "learning_rate": 1.9996564785671e-05, + "loss": 1.506, + "step": 4900 + }, + { + "epoch": 0.1143067051109744, + "grad_norm": 1.7504394054412842, + "learning_rate": 1.9996558183044443e-05, + "loss": 1.3567, + "step": 4901 + }, + { + "epoch": 0.11433002825015233, + "grad_norm": 1.7425142526626587, + "learning_rate": 1.9996551574079802e-05, + "loss": 1.3854, + "step": 4902 + }, + { + "epoch": 0.11435335138933024, + "grad_norm": 1.5381382703781128, + "learning_rate": 1.999654495877709e-05, + "loss": 0.9359, + "step": 4903 + }, + { + "epoch": 0.11437667452850817, + "grad_norm": 1.9737110137939453, + "learning_rate": 1.9996538337136302e-05, + "loss": 1.2001, + "step": 4904 + }, + { + "epoch": 0.11439999766768608, + "grad_norm": 1.606192946434021, + "learning_rate": 1.999653170915745e-05, + "loss": 1.2032, + "step": 4905 + }, + { + "epoch": 0.114423320806864, + "grad_norm": 1.9052050113677979, + "learning_rate": 1.999652507484053e-05, + "loss": 1.3797, + "step": 4906 + }, + { + "epoch": 0.11444664394604191, + "grad_norm": 2.1226413249969482, + "learning_rate": 1.9996518434185558e-05, + "loss": 1.5129, + "step": 4907 + }, + { + "epoch": 0.11446996708521984, + "grad_norm": 1.5617464780807495, + "learning_rate": 1.9996511787192523e-05, + "loss": 1.2434, + "step": 4908 + }, + { + "epoch": 0.11449329022439775, + "grad_norm": 1.3686046600341797, + "learning_rate": 1.999650513386144e-05, + "loss": 1.1302, + "step": 4909 + }, + { + "epoch": 0.11451661336357567, + "grad_norm": 1.9747443199157715, + "learning_rate": 1.9996498474192312e-05, + "loss": 1.5502, + "step": 4910 + }, + { + "epoch": 0.11453993650275358, + "grad_norm": 1.9239708185195923, + "learning_rate": 1.999649180818514e-05, + "loss": 1.4629, + "step": 4911 + }, + { + "epoch": 0.11456325964193151, + "grad_norm": 1.6129769086837769, + "learning_rate": 1.999648513583993e-05, + "loss": 1.3739, + "step": 4912 + }, + { + "epoch": 0.11458658278110942, + "grad_norm": 1.641762614250183, + "learning_rate": 1.9996478457156682e-05, + "loss": 1.4724, + "step": 4913 + }, + { + "epoch": 0.11460990592028734, + "grad_norm": 1.5708448886871338, + "learning_rate": 1.999647177213541e-05, + "loss": 1.4224, + "step": 4914 + }, + { + "epoch": 0.11463322905946526, + "grad_norm": 1.589847207069397, + "learning_rate": 1.999646508077611e-05, + "loss": 1.7322, + "step": 4915 + }, + { + "epoch": 0.11465655219864318, + "grad_norm": 1.799497365951538, + "learning_rate": 1.9996458383078788e-05, + "loss": 1.7415, + "step": 4916 + }, + { + "epoch": 0.11467987533782109, + "grad_norm": 1.55296790599823, + "learning_rate": 1.999645167904345e-05, + "loss": 1.4574, + "step": 4917 + }, + { + "epoch": 0.11470319847699902, + "grad_norm": 2.3322129249572754, + "learning_rate": 1.9996444968670098e-05, + "loss": 1.4262, + "step": 4918 + }, + { + "epoch": 0.11472652161617693, + "grad_norm": 1.506402611732483, + "learning_rate": 1.999643825195874e-05, + "loss": 1.2566, + "step": 4919 + }, + { + "epoch": 0.11474984475535485, + "grad_norm": 1.5143823623657227, + "learning_rate": 1.9996431528909377e-05, + "loss": 1.5797, + "step": 4920 + }, + { + "epoch": 0.11477316789453276, + "grad_norm": 2.1306798458099365, + "learning_rate": 1.9996424799522015e-05, + "loss": 1.7044, + "step": 4921 + }, + { + "epoch": 0.11479649103371069, + "grad_norm": 1.8718247413635254, + "learning_rate": 1.9996418063796654e-05, + "loss": 1.4831, + "step": 4922 + }, + { + "epoch": 0.1148198141728886, + "grad_norm": 1.7717846632003784, + "learning_rate": 1.9996411321733302e-05, + "loss": 1.4004, + "step": 4923 + }, + { + "epoch": 0.11484313731206652, + "grad_norm": 1.2827835083007812, + "learning_rate": 1.999640457333196e-05, + "loss": 1.2938, + "step": 4924 + }, + { + "epoch": 0.11486646045124443, + "grad_norm": 1.321552038192749, + "learning_rate": 1.9996397818592643e-05, + "loss": 1.3143, + "step": 4925 + }, + { + "epoch": 0.11488978359042236, + "grad_norm": 2.193957567214966, + "learning_rate": 1.9996391057515342e-05, + "loss": 1.2899, + "step": 4926 + }, + { + "epoch": 0.11491310672960027, + "grad_norm": 1.7584928274154663, + "learning_rate": 1.9996384290100067e-05, + "loss": 1.5271, + "step": 4927 + }, + { + "epoch": 0.1149364298687782, + "grad_norm": 1.6716281175613403, + "learning_rate": 1.9996377516346823e-05, + "loss": 1.4389, + "step": 4928 + }, + { + "epoch": 0.1149597530079561, + "grad_norm": 2.298614740371704, + "learning_rate": 1.9996370736255612e-05, + "loss": 1.3807, + "step": 4929 + }, + { + "epoch": 0.11498307614713402, + "grad_norm": 2.030890464782715, + "learning_rate": 1.9996363949826443e-05, + "loss": 1.2514, + "step": 4930 + }, + { + "epoch": 0.11500639928631194, + "grad_norm": 1.8294849395751953, + "learning_rate": 1.9996357157059312e-05, + "loss": 1.5073, + "step": 4931 + }, + { + "epoch": 0.11502972242548985, + "grad_norm": 1.556658148765564, + "learning_rate": 1.999635035795423e-05, + "loss": 1.4253, + "step": 4932 + }, + { + "epoch": 0.11505304556466778, + "grad_norm": 1.7101117372512817, + "learning_rate": 1.99963435525112e-05, + "loss": 1.6612, + "step": 4933 + }, + { + "epoch": 0.11507636870384569, + "grad_norm": 2.06040096282959, + "learning_rate": 1.9996336740730226e-05, + "loss": 1.3494, + "step": 4934 + }, + { + "epoch": 0.11509969184302361, + "grad_norm": 1.6833947896957397, + "learning_rate": 1.999632992261131e-05, + "loss": 1.3294, + "step": 4935 + }, + { + "epoch": 0.11512301498220152, + "grad_norm": 1.780799388885498, + "learning_rate": 1.999632309815446e-05, + "loss": 1.3288, + "step": 4936 + }, + { + "epoch": 0.11514633812137945, + "grad_norm": 1.611113429069519, + "learning_rate": 1.9996316267359682e-05, + "loss": 1.483, + "step": 4937 + }, + { + "epoch": 0.11516966126055736, + "grad_norm": 1.5236009359359741, + "learning_rate": 1.9996309430226972e-05, + "loss": 1.5499, + "step": 4938 + }, + { + "epoch": 0.11519298439973528, + "grad_norm": 1.7466269731521606, + "learning_rate": 1.999630258675634e-05, + "loss": 1.4186, + "step": 4939 + }, + { + "epoch": 0.1152163075389132, + "grad_norm": 1.5798134803771973, + "learning_rate": 1.9996295736947792e-05, + "loss": 1.0816, + "step": 4940 + }, + { + "epoch": 0.11523963067809112, + "grad_norm": 1.6026114225387573, + "learning_rate": 1.9996288880801326e-05, + "loss": 1.4924, + "step": 4941 + }, + { + "epoch": 0.11526295381726903, + "grad_norm": 1.985709309577942, + "learning_rate": 1.9996282018316956e-05, + "loss": 1.5669, + "step": 4942 + }, + { + "epoch": 0.11528627695644696, + "grad_norm": 1.537269115447998, + "learning_rate": 1.9996275149494676e-05, + "loss": 1.4639, + "step": 4943 + }, + { + "epoch": 0.11530960009562487, + "grad_norm": 2.2672007083892822, + "learning_rate": 1.99962682743345e-05, + "loss": 1.7857, + "step": 4944 + }, + { + "epoch": 0.11533292323480279, + "grad_norm": 1.6579115390777588, + "learning_rate": 1.9996261392836422e-05, + "loss": 1.4009, + "step": 4945 + }, + { + "epoch": 0.1153562463739807, + "grad_norm": 1.4220229387283325, + "learning_rate": 1.9996254505000455e-05, + "loss": 1.4176, + "step": 4946 + }, + { + "epoch": 0.11537956951315863, + "grad_norm": 2.559842586517334, + "learning_rate": 1.99962476108266e-05, + "loss": 1.7005, + "step": 4947 + }, + { + "epoch": 0.11540289265233654, + "grad_norm": 2.431316614151001, + "learning_rate": 1.999624071031486e-05, + "loss": 1.243, + "step": 4948 + }, + { + "epoch": 0.11542621579151446, + "grad_norm": 2.263580799102783, + "learning_rate": 1.9996233803465242e-05, + "loss": 1.1228, + "step": 4949 + }, + { + "epoch": 0.11544953893069237, + "grad_norm": 2.068286895751953, + "learning_rate": 1.999622689027775e-05, + "loss": 1.3652, + "step": 4950 + }, + { + "epoch": 0.1154728620698703, + "grad_norm": 1.6092827320098877, + "learning_rate": 1.9996219970752387e-05, + "loss": 0.9919, + "step": 4951 + }, + { + "epoch": 0.11549618520904821, + "grad_norm": 1.8321871757507324, + "learning_rate": 1.999621304488916e-05, + "loss": 1.3283, + "step": 4952 + }, + { + "epoch": 0.11551950834822614, + "grad_norm": 1.6613811254501343, + "learning_rate": 1.9996206112688067e-05, + "loss": 1.281, + "step": 4953 + }, + { + "epoch": 0.11554283148740405, + "grad_norm": 1.8060203790664673, + "learning_rate": 1.999619917414912e-05, + "loss": 1.4334, + "step": 4954 + }, + { + "epoch": 0.11556615462658197, + "grad_norm": 1.5202760696411133, + "learning_rate": 1.9996192229272318e-05, + "loss": 1.0427, + "step": 4955 + }, + { + "epoch": 0.11558947776575988, + "grad_norm": 1.6206039190292358, + "learning_rate": 1.9996185278057673e-05, + "loss": 1.6303, + "step": 4956 + }, + { + "epoch": 0.1156128009049378, + "grad_norm": 1.7954822778701782, + "learning_rate": 1.999617832050518e-05, + "loss": 1.5478, + "step": 4957 + }, + { + "epoch": 0.11563612404411572, + "grad_norm": 2.0373642444610596, + "learning_rate": 1.9996171356614848e-05, + "loss": 1.8687, + "step": 4958 + }, + { + "epoch": 0.11565944718329363, + "grad_norm": 1.672339916229248, + "learning_rate": 1.9996164386386684e-05, + "loss": 1.8667, + "step": 4959 + }, + { + "epoch": 0.11568277032247155, + "grad_norm": 1.9105110168457031, + "learning_rate": 1.9996157409820682e-05, + "loss": 1.2615, + "step": 4960 + }, + { + "epoch": 0.11570609346164946, + "grad_norm": 1.501523494720459, + "learning_rate": 1.999615042691686e-05, + "loss": 1.3254, + "step": 4961 + }, + { + "epoch": 0.11572941660082739, + "grad_norm": 1.6732221841812134, + "learning_rate": 1.9996143437675216e-05, + "loss": 1.4104, + "step": 4962 + }, + { + "epoch": 0.1157527397400053, + "grad_norm": 1.6094794273376465, + "learning_rate": 1.9996136442095753e-05, + "loss": 1.4193, + "step": 4963 + }, + { + "epoch": 0.11577606287918323, + "grad_norm": 1.7315895557403564, + "learning_rate": 1.9996129440178475e-05, + "loss": 1.5297, + "step": 4964 + }, + { + "epoch": 0.11579938601836114, + "grad_norm": 1.7027325630187988, + "learning_rate": 1.999612243192339e-05, + "loss": 1.6268, + "step": 4965 + }, + { + "epoch": 0.11582270915753906, + "grad_norm": 1.5824599266052246, + "learning_rate": 1.9996115417330505e-05, + "loss": 1.3822, + "step": 4966 + }, + { + "epoch": 0.11584603229671697, + "grad_norm": 1.8605552911758423, + "learning_rate": 1.9996108396399813e-05, + "loss": 1.5074, + "step": 4967 + }, + { + "epoch": 0.1158693554358949, + "grad_norm": 1.8744456768035889, + "learning_rate": 1.999610136913133e-05, + "loss": 1.5779, + "step": 4968 + }, + { + "epoch": 0.11589267857507281, + "grad_norm": 1.5902279615402222, + "learning_rate": 1.9996094335525055e-05, + "loss": 1.6401, + "step": 4969 + }, + { + "epoch": 0.11591600171425073, + "grad_norm": 1.769426941871643, + "learning_rate": 1.9996087295580998e-05, + "loss": 1.3359, + "step": 4970 + }, + { + "epoch": 0.11593932485342864, + "grad_norm": 1.569571852684021, + "learning_rate": 1.9996080249299156e-05, + "loss": 1.703, + "step": 4971 + }, + { + "epoch": 0.11596264799260657, + "grad_norm": 1.530867338180542, + "learning_rate": 1.9996073196679535e-05, + "loss": 1.3807, + "step": 4972 + }, + { + "epoch": 0.11598597113178448, + "grad_norm": 1.6897547245025635, + "learning_rate": 1.9996066137722143e-05, + "loss": 1.3948, + "step": 4973 + }, + { + "epoch": 0.1160092942709624, + "grad_norm": 1.6676502227783203, + "learning_rate": 1.9996059072426983e-05, + "loss": 1.304, + "step": 4974 + }, + { + "epoch": 0.11603261741014032, + "grad_norm": 1.939272403717041, + "learning_rate": 1.9996052000794058e-05, + "loss": 1.7275, + "step": 4975 + }, + { + "epoch": 0.11605594054931824, + "grad_norm": 1.8904296159744263, + "learning_rate": 1.9996044922823375e-05, + "loss": 1.6057, + "step": 4976 + }, + { + "epoch": 0.11607926368849615, + "grad_norm": 1.7910157442092896, + "learning_rate": 1.9996037838514937e-05, + "loss": 1.5596, + "step": 4977 + }, + { + "epoch": 0.11610258682767408, + "grad_norm": 1.6389763355255127, + "learning_rate": 1.9996030747868745e-05, + "loss": 1.6748, + "step": 4978 + }, + { + "epoch": 0.11612590996685199, + "grad_norm": 1.6676112413406372, + "learning_rate": 1.9996023650884812e-05, + "loss": 1.7798, + "step": 4979 + }, + { + "epoch": 0.11614923310602991, + "grad_norm": 2.0701849460601807, + "learning_rate": 1.9996016547563134e-05, + "loss": 1.6253, + "step": 4980 + }, + { + "epoch": 0.11617255624520782, + "grad_norm": 1.6076871156692505, + "learning_rate": 1.9996009437903722e-05, + "loss": 1.605, + "step": 4981 + }, + { + "epoch": 0.11619587938438575, + "grad_norm": 1.608145833015442, + "learning_rate": 1.9996002321906576e-05, + "loss": 1.1783, + "step": 4982 + }, + { + "epoch": 0.11621920252356366, + "grad_norm": 1.4932103157043457, + "learning_rate": 1.9995995199571702e-05, + "loss": 1.2725, + "step": 4983 + }, + { + "epoch": 0.11624252566274158, + "grad_norm": 1.9273570775985718, + "learning_rate": 1.9995988070899104e-05, + "loss": 1.123, + "step": 4984 + }, + { + "epoch": 0.1162658488019195, + "grad_norm": 1.7300347089767456, + "learning_rate": 1.999598093588879e-05, + "loss": 1.5293, + "step": 4985 + }, + { + "epoch": 0.1162891719410974, + "grad_norm": 1.4431848526000977, + "learning_rate": 1.9995973794540756e-05, + "loss": 1.4358, + "step": 4986 + }, + { + "epoch": 0.11631249508027533, + "grad_norm": 2.3643648624420166, + "learning_rate": 1.999596664685502e-05, + "loss": 1.5636, + "step": 4987 + }, + { + "epoch": 0.11633581821945324, + "grad_norm": 1.670784592628479, + "learning_rate": 1.9995959492831573e-05, + "loss": 1.6598, + "step": 4988 + }, + { + "epoch": 0.11635914135863117, + "grad_norm": 2.2506015300750732, + "learning_rate": 1.9995952332470425e-05, + "loss": 1.3849, + "step": 4989 + }, + { + "epoch": 0.11638246449780908, + "grad_norm": 1.7601280212402344, + "learning_rate": 1.999594516577158e-05, + "loss": 1.6713, + "step": 4990 + }, + { + "epoch": 0.116405787636987, + "grad_norm": 1.5968663692474365, + "learning_rate": 1.999593799273505e-05, + "loss": 1.569, + "step": 4991 + }, + { + "epoch": 0.11642911077616491, + "grad_norm": 1.2882698774337769, + "learning_rate": 1.9995930813360828e-05, + "loss": 1.397, + "step": 4992 + }, + { + "epoch": 0.11645243391534284, + "grad_norm": 2.1098101139068604, + "learning_rate": 1.9995923627648923e-05, + "loss": 1.4164, + "step": 4993 + }, + { + "epoch": 0.11647575705452075, + "grad_norm": 1.6225732564926147, + "learning_rate": 1.999591643559934e-05, + "loss": 1.0269, + "step": 4994 + }, + { + "epoch": 0.11649908019369867, + "grad_norm": 1.720001220703125, + "learning_rate": 1.9995909237212086e-05, + "loss": 1.4198, + "step": 4995 + }, + { + "epoch": 0.11652240333287658, + "grad_norm": 1.8598988056182861, + "learning_rate": 1.999590203248716e-05, + "loss": 1.1092, + "step": 4996 + }, + { + "epoch": 0.11654572647205451, + "grad_norm": 2.120917320251465, + "learning_rate": 1.9995894821424576e-05, + "loss": 1.2088, + "step": 4997 + }, + { + "epoch": 0.11656904961123242, + "grad_norm": 1.8014084100723267, + "learning_rate": 1.9995887604024325e-05, + "loss": 1.6979, + "step": 4998 + }, + { + "epoch": 0.11659237275041034, + "grad_norm": 1.7007758617401123, + "learning_rate": 1.9995880380286424e-05, + "loss": 1.268, + "step": 4999 + }, + { + "epoch": 0.11661569588958826, + "grad_norm": 1.697314739227295, + "learning_rate": 1.9995873150210867e-05, + "loss": 1.4383, + "step": 5000 + } + ], + "logging_steps": 1, + "max_steps": 128625, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.275435348086096e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}