diff --git "a/checkpoint-9000/trainer_state.json" "b/checkpoint-9000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-9000/trainer_state.json" @@ -0,0 +1,63033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.20990825260125887, + "eval_steps": 500, + "global_step": 9000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.332313917791765e-05, + "grad_norm": 1.6235620975494385, + "learning_rate": 5.182689816014512e-09, + "loss": 1.9275, + "step": 1 + }, + { + "epoch": 4.66462783558353e-05, + "grad_norm": 1.5710082054138184, + "learning_rate": 1.0365379632029025e-08, + "loss": 1.5593, + "step": 2 + }, + { + "epoch": 6.996941753375295e-05, + "grad_norm": 2.3231985569000244, + "learning_rate": 1.5548069448043534e-08, + "loss": 2.0021, + "step": 3 + }, + { + "epoch": 9.32925567116706e-05, + "grad_norm": 1.8349288702011108, + "learning_rate": 2.073075926405805e-08, + "loss": 2.1141, + "step": 4 + }, + { + "epoch": 0.00011661569588958826, + "grad_norm": 2.039928436279297, + "learning_rate": 2.5913449080072562e-08, + "loss": 1.9361, + "step": 5 + }, + { + "epoch": 0.0001399388350675059, + "grad_norm": 1.8988783359527588, + "learning_rate": 3.109613889608707e-08, + "loss": 2.2441, + "step": 6 + }, + { + "epoch": 0.00016326197424542356, + "grad_norm": 1.4865813255310059, + "learning_rate": 3.6278828712101586e-08, + "loss": 1.8118, + "step": 7 + }, + { + "epoch": 0.0001865851134233412, + "grad_norm": 1.4033368825912476, + "learning_rate": 4.14615185281161e-08, + "loss": 1.8838, + "step": 8 + }, + { + "epoch": 0.00020990825260125886, + "grad_norm": 1.876894235610962, + "learning_rate": 4.6644208344130604e-08, + "loss": 1.9916, + "step": 9 + }, + { + "epoch": 0.00023323139177917651, + "grad_norm": 2.4104366302490234, + "learning_rate": 5.1826898160145123e-08, + "loss": 1.8618, + "step": 10 + }, + { + "epoch": 0.0002565545309570942, + "grad_norm": 1.8457229137420654, + "learning_rate": 5.700958797615963e-08, + "loss": 1.7303, + "step": 11 + }, + { + "epoch": 0.0002798776701350118, + "grad_norm": 1.940317988395691, + "learning_rate": 6.219227779217413e-08, + "loss": 2.2692, + "step": 12 + }, + { + "epoch": 0.0003032008093129295, + "grad_norm": 2.455432891845703, + "learning_rate": 6.737496760818865e-08, + "loss": 2.3401, + "step": 13 + }, + { + "epoch": 0.0003265239484908471, + "grad_norm": 1.5163850784301758, + "learning_rate": 7.255765742420317e-08, + "loss": 2.1687, + "step": 14 + }, + { + "epoch": 0.0003498470876687648, + "grad_norm": 1.3012642860412598, + "learning_rate": 7.774034724021768e-08, + "loss": 1.8693, + "step": 15 + }, + { + "epoch": 0.0003731702268466824, + "grad_norm": 2.0896522998809814, + "learning_rate": 8.29230370562322e-08, + "loss": 1.7031, + "step": 16 + }, + { + "epoch": 0.0003964933660246001, + "grad_norm": 1.7818728685379028, + "learning_rate": 8.810572687224672e-08, + "loss": 2.0829, + "step": 17 + }, + { + "epoch": 0.0004198165052025177, + "grad_norm": 2.569828510284424, + "learning_rate": 9.328841668826121e-08, + "loss": 1.8998, + "step": 18 + }, + { + "epoch": 0.0004431396443804354, + "grad_norm": 1.4619100093841553, + "learning_rate": 9.847110650427573e-08, + "loss": 1.5964, + "step": 19 + }, + { + "epoch": 0.00046646278355835303, + "grad_norm": 1.9832793474197388, + "learning_rate": 1.0365379632029025e-07, + "loss": 1.9292, + "step": 20 + }, + { + "epoch": 0.0004897859227362707, + "grad_norm": 2.0182175636291504, + "learning_rate": 1.0883648613630475e-07, + "loss": 2.0115, + "step": 21 + }, + { + "epoch": 0.0005131090619141884, + "grad_norm": 1.4642307758331299, + "learning_rate": 1.1401917595231926e-07, + "loss": 2.0291, + "step": 22 + }, + { + "epoch": 0.000536432201092106, + "grad_norm": 2.887909173965454, + "learning_rate": 1.1920186576833378e-07, + "loss": 2.1946, + "step": 23 + }, + { + "epoch": 0.0005597553402700236, + "grad_norm": 1.595544457435608, + "learning_rate": 1.2438455558434827e-07, + "loss": 2.0246, + "step": 24 + }, + { + "epoch": 0.0005830784794479413, + "grad_norm": 1.5648566484451294, + "learning_rate": 1.295672454003628e-07, + "loss": 2.1832, + "step": 25 + }, + { + "epoch": 0.000606401618625859, + "grad_norm": 1.4702372550964355, + "learning_rate": 1.347499352163773e-07, + "loss": 1.6395, + "step": 26 + }, + { + "epoch": 0.0006297247578037766, + "grad_norm": 1.7178195714950562, + "learning_rate": 1.399326250323918e-07, + "loss": 1.6264, + "step": 27 + }, + { + "epoch": 0.0006530478969816942, + "grad_norm": 2.1751515865325928, + "learning_rate": 1.4511531484840635e-07, + "loss": 2.511, + "step": 28 + }, + { + "epoch": 0.0006763710361596119, + "grad_norm": 2.9443299770355225, + "learning_rate": 1.5029800466442085e-07, + "loss": 2.229, + "step": 29 + }, + { + "epoch": 0.0006996941753375296, + "grad_norm": 1.8316481113433838, + "learning_rate": 1.5548069448043536e-07, + "loss": 1.8414, + "step": 30 + }, + { + "epoch": 0.0007230173145154472, + "grad_norm": 1.9659239053726196, + "learning_rate": 1.6066338429644986e-07, + "loss": 2.0109, + "step": 31 + }, + { + "epoch": 0.0007463404536933648, + "grad_norm": 2.1653449535369873, + "learning_rate": 1.658460741124644e-07, + "loss": 2.0155, + "step": 32 + }, + { + "epoch": 0.0007696635928712825, + "grad_norm": 1.8755710124969482, + "learning_rate": 1.710287639284789e-07, + "loss": 2.1105, + "step": 33 + }, + { + "epoch": 0.0007929867320492002, + "grad_norm": 1.5989196300506592, + "learning_rate": 1.7621145374449343e-07, + "loss": 2.1583, + "step": 34 + }, + { + "epoch": 0.0008163098712271178, + "grad_norm": 1.865307331085205, + "learning_rate": 1.813941435605079e-07, + "loss": 2.001, + "step": 35 + }, + { + "epoch": 0.0008396330104050355, + "grad_norm": 1.4584789276123047, + "learning_rate": 1.8657683337652242e-07, + "loss": 1.8854, + "step": 36 + }, + { + "epoch": 0.0008629561495829531, + "grad_norm": 2.6818912029266357, + "learning_rate": 1.9175952319253695e-07, + "loss": 2.1888, + "step": 37 + }, + { + "epoch": 0.0008862792887608708, + "grad_norm": 2.17561674118042, + "learning_rate": 1.9694221300855146e-07, + "loss": 1.9616, + "step": 38 + }, + { + "epoch": 0.0009096024279387884, + "grad_norm": 1.252475619316101, + "learning_rate": 2.02124902824566e-07, + "loss": 1.9585, + "step": 39 + }, + { + "epoch": 0.0009329255671167061, + "grad_norm": 1.884366750717163, + "learning_rate": 2.073075926405805e-07, + "loss": 2.2436, + "step": 40 + }, + { + "epoch": 0.0009562487062946237, + "grad_norm": 1.4951350688934326, + "learning_rate": 2.1249028245659497e-07, + "loss": 1.7149, + "step": 41 + }, + { + "epoch": 0.0009795718454725414, + "grad_norm": 1.891728162765503, + "learning_rate": 2.176729722726095e-07, + "loss": 2.0472, + "step": 42 + }, + { + "epoch": 0.001002894984650459, + "grad_norm": 1.8992432355880737, + "learning_rate": 2.22855662088624e-07, + "loss": 2.1471, + "step": 43 + }, + { + "epoch": 0.0010262181238283768, + "grad_norm": 1.3931283950805664, + "learning_rate": 2.2803835190463852e-07, + "loss": 1.5292, + "step": 44 + }, + { + "epoch": 0.0010495412630062942, + "grad_norm": 1.8894548416137695, + "learning_rate": 2.3322104172065305e-07, + "loss": 1.7759, + "step": 45 + }, + { + "epoch": 0.001072864402184212, + "grad_norm": 1.592050552368164, + "learning_rate": 2.3840373153666755e-07, + "loss": 2.2498, + "step": 46 + }, + { + "epoch": 0.0010961875413621296, + "grad_norm": 1.3746178150177002, + "learning_rate": 2.4358642135268203e-07, + "loss": 1.8503, + "step": 47 + }, + { + "epoch": 0.0011195106805400473, + "grad_norm": 2.0268595218658447, + "learning_rate": 2.4876911116869654e-07, + "loss": 1.9358, + "step": 48 + }, + { + "epoch": 0.001142833819717965, + "grad_norm": 1.7836228609085083, + "learning_rate": 2.539518009847111e-07, + "loss": 1.9855, + "step": 49 + }, + { + "epoch": 0.0011661569588958826, + "grad_norm": 1.829447627067566, + "learning_rate": 2.591344908007256e-07, + "loss": 2.2802, + "step": 50 + }, + { + "epoch": 0.0011894800980738003, + "grad_norm": 2.2813496589660645, + "learning_rate": 2.643171806167401e-07, + "loss": 2.1593, + "step": 51 + }, + { + "epoch": 0.001212803237251718, + "grad_norm": 3.019044876098633, + "learning_rate": 2.694998704327546e-07, + "loss": 1.9534, + "step": 52 + }, + { + "epoch": 0.0012361263764296354, + "grad_norm": 2.011425256729126, + "learning_rate": 2.746825602487691e-07, + "loss": 2.1284, + "step": 53 + }, + { + "epoch": 0.0012594495156075531, + "grad_norm": 2.207106590270996, + "learning_rate": 2.798652500647836e-07, + "loss": 2.2427, + "step": 54 + }, + { + "epoch": 0.0012827726547854708, + "grad_norm": 1.3172473907470703, + "learning_rate": 2.8504793988079813e-07, + "loss": 1.9782, + "step": 55 + }, + { + "epoch": 0.0013060957939633885, + "grad_norm": 1.522895097732544, + "learning_rate": 2.902306296968127e-07, + "loss": 1.9455, + "step": 56 + }, + { + "epoch": 0.0013294189331413062, + "grad_norm": 2.657248020172119, + "learning_rate": 2.954133195128272e-07, + "loss": 1.959, + "step": 57 + }, + { + "epoch": 0.0013527420723192238, + "grad_norm": 1.9738789796829224, + "learning_rate": 3.005960093288417e-07, + "loss": 1.7878, + "step": 58 + }, + { + "epoch": 0.0013760652114971415, + "grad_norm": 1.5549254417419434, + "learning_rate": 3.057786991448562e-07, + "loss": 1.9405, + "step": 59 + }, + { + "epoch": 0.0013993883506750592, + "grad_norm": 2.9688899517059326, + "learning_rate": 3.109613889608707e-07, + "loss": 1.9969, + "step": 60 + }, + { + "epoch": 0.0014227114898529767, + "grad_norm": 1.4602586030960083, + "learning_rate": 3.1614407877688527e-07, + "loss": 1.9339, + "step": 61 + }, + { + "epoch": 0.0014460346290308943, + "grad_norm": 2.4017045497894287, + "learning_rate": 3.213267685928997e-07, + "loss": 2.0842, + "step": 62 + }, + { + "epoch": 0.001469357768208812, + "grad_norm": 1.7433497905731201, + "learning_rate": 3.2650945840891423e-07, + "loss": 2.0223, + "step": 63 + }, + { + "epoch": 0.0014926809073867297, + "grad_norm": 1.7395591735839844, + "learning_rate": 3.316921482249288e-07, + "loss": 1.9257, + "step": 64 + }, + { + "epoch": 0.0015160040465646474, + "grad_norm": 1.8336257934570312, + "learning_rate": 3.3687483804094324e-07, + "loss": 1.948, + "step": 65 + }, + { + "epoch": 0.001539327185742565, + "grad_norm": 1.6493985652923584, + "learning_rate": 3.420575278569578e-07, + "loss": 1.8672, + "step": 66 + }, + { + "epoch": 0.0015626503249204827, + "grad_norm": 1.5789337158203125, + "learning_rate": 3.472402176729723e-07, + "loss": 1.9446, + "step": 67 + }, + { + "epoch": 0.0015859734640984004, + "grad_norm": 1.3755509853363037, + "learning_rate": 3.5242290748898686e-07, + "loss": 2.1796, + "step": 68 + }, + { + "epoch": 0.001609296603276318, + "grad_norm": 1.7978087663650513, + "learning_rate": 3.576055973050013e-07, + "loss": 1.8974, + "step": 69 + }, + { + "epoch": 0.0016326197424542355, + "grad_norm": 1.8888216018676758, + "learning_rate": 3.627882871210158e-07, + "loss": 1.915, + "step": 70 + }, + { + "epoch": 0.0016559428816321532, + "grad_norm": 2.6150593757629395, + "learning_rate": 3.679709769370304e-07, + "loss": 2.2133, + "step": 71 + }, + { + "epoch": 0.001679266020810071, + "grad_norm": 1.7009005546569824, + "learning_rate": 3.7315366675304483e-07, + "loss": 2.1024, + "step": 72 + }, + { + "epoch": 0.0017025891599879886, + "grad_norm": 1.741734266281128, + "learning_rate": 3.783363565690594e-07, + "loss": 2.1839, + "step": 73 + }, + { + "epoch": 0.0017259122991659063, + "grad_norm": 2.7715041637420654, + "learning_rate": 3.835190463850739e-07, + "loss": 2.0734, + "step": 74 + }, + { + "epoch": 0.001749235438343824, + "grad_norm": 1.9710502624511719, + "learning_rate": 3.8870173620108835e-07, + "loss": 2.18, + "step": 75 + }, + { + "epoch": 0.0017725585775217416, + "grad_norm": 2.077986478805542, + "learning_rate": 3.938844260171029e-07, + "loss": 2.1482, + "step": 76 + }, + { + "epoch": 0.0017958817166996593, + "grad_norm": 2.583721160888672, + "learning_rate": 3.990671158331174e-07, + "loss": 2.5364, + "step": 77 + }, + { + "epoch": 0.0018192048558775768, + "grad_norm": 1.3425930738449097, + "learning_rate": 4.04249805649132e-07, + "loss": 1.8194, + "step": 78 + }, + { + "epoch": 0.0018425279950554944, + "grad_norm": 2.1111888885498047, + "learning_rate": 4.0943249546514643e-07, + "loss": 1.7878, + "step": 79 + }, + { + "epoch": 0.0018658511342334121, + "grad_norm": 2.0795626640319824, + "learning_rate": 4.14615185281161e-07, + "loss": 2.3006, + "step": 80 + }, + { + "epoch": 0.0018891742734113298, + "grad_norm": 1.273370623588562, + "learning_rate": 4.197978750971755e-07, + "loss": 1.7599, + "step": 81 + }, + { + "epoch": 0.0019124974125892475, + "grad_norm": 1.6202706098556519, + "learning_rate": 4.2498056491318994e-07, + "loss": 2.1727, + "step": 82 + }, + { + "epoch": 0.0019358205517671651, + "grad_norm": 2.4593732357025146, + "learning_rate": 4.301632547292045e-07, + "loss": 2.4588, + "step": 83 + }, + { + "epoch": 0.001959143690945083, + "grad_norm": 1.2617835998535156, + "learning_rate": 4.35345944545219e-07, + "loss": 1.9078, + "step": 84 + }, + { + "epoch": 0.0019824668301230003, + "grad_norm": 2.2640504837036133, + "learning_rate": 4.405286343612335e-07, + "loss": 1.8983, + "step": 85 + }, + { + "epoch": 0.002005789969300918, + "grad_norm": 1.6804454326629639, + "learning_rate": 4.45711324177248e-07, + "loss": 2.1049, + "step": 86 + }, + { + "epoch": 0.0020291131084788356, + "grad_norm": 2.060009717941284, + "learning_rate": 4.5089401399326253e-07, + "loss": 2.0153, + "step": 87 + }, + { + "epoch": 0.0020524362476567535, + "grad_norm": 1.7166160345077515, + "learning_rate": 4.5607670380927703e-07, + "loss": 2.1093, + "step": 88 + }, + { + "epoch": 0.002075759386834671, + "grad_norm": 1.6695979833602905, + "learning_rate": 4.6125939362529154e-07, + "loss": 1.8607, + "step": 89 + }, + { + "epoch": 0.0020990825260125885, + "grad_norm": 1.4339056015014648, + "learning_rate": 4.664420834413061e-07, + "loss": 2.2632, + "step": 90 + }, + { + "epoch": 0.0021224056651905064, + "grad_norm": 1.5228222608566284, + "learning_rate": 4.7162477325732055e-07, + "loss": 2.0851, + "step": 91 + }, + { + "epoch": 0.002145728804368424, + "grad_norm": 1.540848731994629, + "learning_rate": 4.768074630733351e-07, + "loss": 2.1446, + "step": 92 + }, + { + "epoch": 0.0021690519435463417, + "grad_norm": 1.480702519416809, + "learning_rate": 4.819901528893496e-07, + "loss": 2.0718, + "step": 93 + }, + { + "epoch": 0.002192375082724259, + "grad_norm": 2.23518705368042, + "learning_rate": 4.871728427053641e-07, + "loss": 1.6198, + "step": 94 + }, + { + "epoch": 0.002215698221902177, + "grad_norm": 1.6477755308151245, + "learning_rate": 4.923555325213786e-07, + "loss": 2.1136, + "step": 95 + }, + { + "epoch": 0.0022390213610800945, + "grad_norm": 1.9548614025115967, + "learning_rate": 4.975382223373931e-07, + "loss": 1.9143, + "step": 96 + }, + { + "epoch": 0.0022623445002580124, + "grad_norm": 1.3557407855987549, + "learning_rate": 5.027209121534076e-07, + "loss": 2.0044, + "step": 97 + }, + { + "epoch": 0.00228566763943593, + "grad_norm": 2.2781455516815186, + "learning_rate": 5.079036019694222e-07, + "loss": 1.7761, + "step": 98 + }, + { + "epoch": 0.0023089907786138474, + "grad_norm": 2.1195600032806396, + "learning_rate": 5.130862917854368e-07, + "loss": 1.8174, + "step": 99 + }, + { + "epoch": 0.0023323139177917653, + "grad_norm": 2.0798068046569824, + "learning_rate": 5.182689816014512e-07, + "loss": 2.1431, + "step": 100 + }, + { + "epoch": 0.0023556370569696827, + "grad_norm": 1.8773006200790405, + "learning_rate": 5.234516714174657e-07, + "loss": 1.5221, + "step": 101 + }, + { + "epoch": 0.0023789601961476006, + "grad_norm": 1.7917876243591309, + "learning_rate": 5.286343612334802e-07, + "loss": 1.9383, + "step": 102 + }, + { + "epoch": 0.002402283335325518, + "grad_norm": 1.4980329275131226, + "learning_rate": 5.338170510494947e-07, + "loss": 1.846, + "step": 103 + }, + { + "epoch": 0.002425606474503436, + "grad_norm": 2.0081095695495605, + "learning_rate": 5.389997408655092e-07, + "loss": 1.8777, + "step": 104 + }, + { + "epoch": 0.0024489296136813534, + "grad_norm": 1.525317907333374, + "learning_rate": 5.441824306815238e-07, + "loss": 1.971, + "step": 105 + }, + { + "epoch": 0.002472252752859271, + "grad_norm": 1.4131786823272705, + "learning_rate": 5.493651204975382e-07, + "loss": 2.2224, + "step": 106 + }, + { + "epoch": 0.002495575892037189, + "grad_norm": 1.164492130279541, + "learning_rate": 5.545478103135528e-07, + "loss": 1.8909, + "step": 107 + }, + { + "epoch": 0.0025188990312151062, + "grad_norm": 1.9998016357421875, + "learning_rate": 5.597305001295673e-07, + "loss": 2.1197, + "step": 108 + }, + { + "epoch": 0.002542222170393024, + "grad_norm": 1.6218236684799194, + "learning_rate": 5.649131899455818e-07, + "loss": 1.7799, + "step": 109 + }, + { + "epoch": 0.0025655453095709416, + "grad_norm": 1.535388708114624, + "learning_rate": 5.700958797615963e-07, + "loss": 1.7878, + "step": 110 + }, + { + "epoch": 0.0025888684487488595, + "grad_norm": 1.4929994344711304, + "learning_rate": 5.752785695776108e-07, + "loss": 2.0802, + "step": 111 + }, + { + "epoch": 0.002612191587926777, + "grad_norm": 2.183293104171753, + "learning_rate": 5.804612593936254e-07, + "loss": 2.0506, + "step": 112 + }, + { + "epoch": 0.002635514727104695, + "grad_norm": 1.6339191198349, + "learning_rate": 5.856439492096398e-07, + "loss": 1.7152, + "step": 113 + }, + { + "epoch": 0.0026588378662826123, + "grad_norm": 1.4886974096298218, + "learning_rate": 5.908266390256544e-07, + "loss": 1.8327, + "step": 114 + }, + { + "epoch": 0.0026821610054605298, + "grad_norm": 1.4198302030563354, + "learning_rate": 5.960093288416688e-07, + "loss": 1.8342, + "step": 115 + }, + { + "epoch": 0.0027054841446384477, + "grad_norm": 2.041900157928467, + "learning_rate": 6.011920186576834e-07, + "loss": 1.9101, + "step": 116 + }, + { + "epoch": 0.002728807283816365, + "grad_norm": 1.7576725482940674, + "learning_rate": 6.063747084736979e-07, + "loss": 2.3793, + "step": 117 + }, + { + "epoch": 0.002752130422994283, + "grad_norm": 1.620440125465393, + "learning_rate": 6.115573982897124e-07, + "loss": 1.7363, + "step": 118 + }, + { + "epoch": 0.0027754535621722005, + "grad_norm": 1.972102403640747, + "learning_rate": 6.16740088105727e-07, + "loss": 2.0338, + "step": 119 + }, + { + "epoch": 0.0027987767013501184, + "grad_norm": 1.5385342836380005, + "learning_rate": 6.219227779217414e-07, + "loss": 1.829, + "step": 120 + }, + { + "epoch": 0.002822099840528036, + "grad_norm": 1.4439769983291626, + "learning_rate": 6.27105467737756e-07, + "loss": 1.9893, + "step": 121 + }, + { + "epoch": 0.0028454229797059533, + "grad_norm": 1.5146026611328125, + "learning_rate": 6.322881575537705e-07, + "loss": 1.6563, + "step": 122 + }, + { + "epoch": 0.002868746118883871, + "grad_norm": 1.7177401781082153, + "learning_rate": 6.374708473697849e-07, + "loss": 1.9483, + "step": 123 + }, + { + "epoch": 0.0028920692580617887, + "grad_norm": 2.484865188598633, + "learning_rate": 6.426535371857994e-07, + "loss": 2.0949, + "step": 124 + }, + { + "epoch": 0.0029153923972397066, + "grad_norm": 1.5320651531219482, + "learning_rate": 6.47836227001814e-07, + "loss": 1.8557, + "step": 125 + }, + { + "epoch": 0.002938715536417624, + "grad_norm": 1.3804417848587036, + "learning_rate": 6.530189168178285e-07, + "loss": 1.8733, + "step": 126 + }, + { + "epoch": 0.002962038675595542, + "grad_norm": 2.0832831859588623, + "learning_rate": 6.58201606633843e-07, + "loss": 1.8556, + "step": 127 + }, + { + "epoch": 0.0029853618147734594, + "grad_norm": 1.2582931518554688, + "learning_rate": 6.633842964498576e-07, + "loss": 2.1239, + "step": 128 + }, + { + "epoch": 0.0030086849539513773, + "grad_norm": 1.6449629068374634, + "learning_rate": 6.685669862658721e-07, + "loss": 2.1635, + "step": 129 + }, + { + "epoch": 0.0030320080931292947, + "grad_norm": 1.3350502252578735, + "learning_rate": 6.737496760818865e-07, + "loss": 1.801, + "step": 130 + }, + { + "epoch": 0.003055331232307212, + "grad_norm": 1.7689651250839233, + "learning_rate": 6.78932365897901e-07, + "loss": 1.7541, + "step": 131 + }, + { + "epoch": 0.00307865437148513, + "grad_norm": 1.4711276292800903, + "learning_rate": 6.841150557139156e-07, + "loss": 2.3916, + "step": 132 + }, + { + "epoch": 0.0031019775106630476, + "grad_norm": 1.2806516885757446, + "learning_rate": 6.892977455299301e-07, + "loss": 1.8609, + "step": 133 + }, + { + "epoch": 0.0031253006498409655, + "grad_norm": 1.5531939268112183, + "learning_rate": 6.944804353459446e-07, + "loss": 1.7721, + "step": 134 + }, + { + "epoch": 0.003148623789018883, + "grad_norm": 1.6541032791137695, + "learning_rate": 6.996631251619592e-07, + "loss": 2.1091, + "step": 135 + }, + { + "epoch": 0.003171946928196801, + "grad_norm": 2.050734281539917, + "learning_rate": 7.048458149779737e-07, + "loss": 1.8932, + "step": 136 + }, + { + "epoch": 0.0031952700673747183, + "grad_norm": 1.2903157472610474, + "learning_rate": 7.100285047939881e-07, + "loss": 2.0833, + "step": 137 + }, + { + "epoch": 0.003218593206552636, + "grad_norm": 1.3316091299057007, + "learning_rate": 7.152111946100026e-07, + "loss": 1.9307, + "step": 138 + }, + { + "epoch": 0.0032419163457305536, + "grad_norm": 1.441341519355774, + "learning_rate": 7.203938844260172e-07, + "loss": 2.2529, + "step": 139 + }, + { + "epoch": 0.003265239484908471, + "grad_norm": 2.159276008605957, + "learning_rate": 7.255765742420316e-07, + "loss": 1.847, + "step": 140 + }, + { + "epoch": 0.003288562624086389, + "grad_norm": 1.8410853147506714, + "learning_rate": 7.307592640580462e-07, + "loss": 2.2465, + "step": 141 + }, + { + "epoch": 0.0033118857632643064, + "grad_norm": 1.8678739070892334, + "learning_rate": 7.359419538740608e-07, + "loss": 1.9261, + "step": 142 + }, + { + "epoch": 0.0033352089024422243, + "grad_norm": 1.2097922563552856, + "learning_rate": 7.411246436900751e-07, + "loss": 2.0205, + "step": 143 + }, + { + "epoch": 0.003358532041620142, + "grad_norm": 1.733077883720398, + "learning_rate": 7.463073335060897e-07, + "loss": 1.8389, + "step": 144 + }, + { + "epoch": 0.0033818551807980597, + "grad_norm": 1.7118474245071411, + "learning_rate": 7.514900233221042e-07, + "loss": 1.9511, + "step": 145 + }, + { + "epoch": 0.003405178319975977, + "grad_norm": 1.6960872411727905, + "learning_rate": 7.566727131381188e-07, + "loss": 1.8828, + "step": 146 + }, + { + "epoch": 0.0034285014591538946, + "grad_norm": 1.2409390211105347, + "learning_rate": 7.618554029541332e-07, + "loss": 1.6878, + "step": 147 + }, + { + "epoch": 0.0034518245983318125, + "grad_norm": 1.3440965414047241, + "learning_rate": 7.670380927701478e-07, + "loss": 1.64, + "step": 148 + }, + { + "epoch": 0.00347514773750973, + "grad_norm": 1.539393663406372, + "learning_rate": 7.722207825861624e-07, + "loss": 1.6754, + "step": 149 + }, + { + "epoch": 0.003498470876687648, + "grad_norm": 1.5395653247833252, + "learning_rate": 7.774034724021767e-07, + "loss": 1.9761, + "step": 150 + }, + { + "epoch": 0.0035217940158655653, + "grad_norm": 2.0169472694396973, + "learning_rate": 7.825861622181913e-07, + "loss": 1.6927, + "step": 151 + }, + { + "epoch": 0.0035451171550434832, + "grad_norm": 1.8776079416275024, + "learning_rate": 7.877688520342058e-07, + "loss": 1.9273, + "step": 152 + }, + { + "epoch": 0.0035684402942214007, + "grad_norm": 2.078824043273926, + "learning_rate": 7.929515418502204e-07, + "loss": 1.6756, + "step": 153 + }, + { + "epoch": 0.0035917634333993186, + "grad_norm": 1.407560110092163, + "learning_rate": 7.981342316662348e-07, + "loss": 1.6038, + "step": 154 + }, + { + "epoch": 0.003615086572577236, + "grad_norm": 1.1770573854446411, + "learning_rate": 8.033169214822494e-07, + "loss": 1.6679, + "step": 155 + }, + { + "epoch": 0.0036384097117551535, + "grad_norm": 1.2057602405548096, + "learning_rate": 8.08499611298264e-07, + "loss": 1.7916, + "step": 156 + }, + { + "epoch": 0.0036617328509330714, + "grad_norm": 1.117970585823059, + "learning_rate": 8.136823011142783e-07, + "loss": 1.7974, + "step": 157 + }, + { + "epoch": 0.003685055990110989, + "grad_norm": 1.5996465682983398, + "learning_rate": 8.188649909302929e-07, + "loss": 1.6053, + "step": 158 + }, + { + "epoch": 0.0037083791292889068, + "grad_norm": 1.4170929193496704, + "learning_rate": 8.240476807463074e-07, + "loss": 1.7155, + "step": 159 + }, + { + "epoch": 0.0037317022684668242, + "grad_norm": 1.8114391565322876, + "learning_rate": 8.29230370562322e-07, + "loss": 1.9192, + "step": 160 + }, + { + "epoch": 0.003755025407644742, + "grad_norm": 1.3462793827056885, + "learning_rate": 8.344130603783364e-07, + "loss": 1.4624, + "step": 161 + }, + { + "epoch": 0.0037783485468226596, + "grad_norm": 1.6305956840515137, + "learning_rate": 8.39595750194351e-07, + "loss": 1.8017, + "step": 162 + }, + { + "epoch": 0.003801671686000577, + "grad_norm": 1.662576675415039, + "learning_rate": 8.447784400103655e-07, + "loss": 1.733, + "step": 163 + }, + { + "epoch": 0.003824994825178495, + "grad_norm": 1.556788682937622, + "learning_rate": 8.499611298263799e-07, + "loss": 1.9586, + "step": 164 + }, + { + "epoch": 0.0038483179643564124, + "grad_norm": 1.5282272100448608, + "learning_rate": 8.551438196423944e-07, + "loss": 1.8254, + "step": 165 + }, + { + "epoch": 0.0038716411035343303, + "grad_norm": 1.6790592670440674, + "learning_rate": 8.60326509458409e-07, + "loss": 2.1866, + "step": 166 + }, + { + "epoch": 0.0038949642427122478, + "grad_norm": 1.5164263248443604, + "learning_rate": 8.655091992744236e-07, + "loss": 1.6651, + "step": 167 + }, + { + "epoch": 0.003918287381890166, + "grad_norm": 1.5002336502075195, + "learning_rate": 8.70691889090438e-07, + "loss": 1.9295, + "step": 168 + }, + { + "epoch": 0.0039416105210680836, + "grad_norm": 1.2122441530227661, + "learning_rate": 8.758745789064526e-07, + "loss": 1.761, + "step": 169 + }, + { + "epoch": 0.003964933660246001, + "grad_norm": 1.637898564338684, + "learning_rate": 8.81057268722467e-07, + "loss": 1.8697, + "step": 170 + }, + { + "epoch": 0.0039882567994239185, + "grad_norm": 0.988777220249176, + "learning_rate": 8.862399585384815e-07, + "loss": 2.1249, + "step": 171 + }, + { + "epoch": 0.004011579938601836, + "grad_norm": 1.8833587169647217, + "learning_rate": 8.91422648354496e-07, + "loss": 1.6915, + "step": 172 + }, + { + "epoch": 0.004034903077779753, + "grad_norm": 1.8418108224868774, + "learning_rate": 8.966053381705106e-07, + "loss": 2.0019, + "step": 173 + }, + { + "epoch": 0.004058226216957671, + "grad_norm": 1.6375901699066162, + "learning_rate": 9.017880279865251e-07, + "loss": 1.7625, + "step": 174 + }, + { + "epoch": 0.004081549356135589, + "grad_norm": 1.8701720237731934, + "learning_rate": 9.069707178025396e-07, + "loss": 1.801, + "step": 175 + }, + { + "epoch": 0.004104872495313507, + "grad_norm": 1.4488773345947266, + "learning_rate": 9.121534076185541e-07, + "loss": 1.9971, + "step": 176 + }, + { + "epoch": 0.004128195634491424, + "grad_norm": 0.9587986469268799, + "learning_rate": 9.173360974345686e-07, + "loss": 1.6253, + "step": 177 + }, + { + "epoch": 0.004151518773669342, + "grad_norm": 2.6533186435699463, + "learning_rate": 9.225187872505831e-07, + "loss": 1.572, + "step": 178 + }, + { + "epoch": 0.00417484191284726, + "grad_norm": 2.4528841972351074, + "learning_rate": 9.277014770665976e-07, + "loss": 1.7586, + "step": 179 + }, + { + "epoch": 0.004198165052025177, + "grad_norm": 1.1871824264526367, + "learning_rate": 9.328841668826122e-07, + "loss": 1.6765, + "step": 180 + }, + { + "epoch": 0.004221488191203095, + "grad_norm": 1.1292660236358643, + "learning_rate": 9.380668566986266e-07, + "loss": 2.0673, + "step": 181 + }, + { + "epoch": 0.004244811330381013, + "grad_norm": 1.3055285215377808, + "learning_rate": 9.432495465146411e-07, + "loss": 1.8103, + "step": 182 + }, + { + "epoch": 0.004268134469558931, + "grad_norm": 1.5225868225097656, + "learning_rate": 9.484322363306557e-07, + "loss": 2.0813, + "step": 183 + }, + { + "epoch": 0.004291457608736848, + "grad_norm": 1.2439767122268677, + "learning_rate": 9.536149261466702e-07, + "loss": 1.6919, + "step": 184 + }, + { + "epoch": 0.0043147807479147655, + "grad_norm": 1.2424002885818481, + "learning_rate": 9.587976159626847e-07, + "loss": 1.9506, + "step": 185 + }, + { + "epoch": 0.0043381038870926834, + "grad_norm": 0.9796323776245117, + "learning_rate": 9.639803057786992e-07, + "loss": 1.7342, + "step": 186 + }, + { + "epoch": 0.0043614270262706005, + "grad_norm": 1.2240192890167236, + "learning_rate": 9.691629955947138e-07, + "loss": 2.0646, + "step": 187 + }, + { + "epoch": 0.004384750165448518, + "grad_norm": 0.8779449462890625, + "learning_rate": 9.743456854107281e-07, + "loss": 1.4535, + "step": 188 + }, + { + "epoch": 0.004408073304626436, + "grad_norm": 1.3131407499313354, + "learning_rate": 9.795283752267427e-07, + "loss": 1.9817, + "step": 189 + }, + { + "epoch": 0.004431396443804354, + "grad_norm": 1.3259912729263306, + "learning_rate": 9.847110650427573e-07, + "loss": 1.709, + "step": 190 + }, + { + "epoch": 0.004454719582982271, + "grad_norm": 1.4236465692520142, + "learning_rate": 9.898937548587718e-07, + "loss": 1.7059, + "step": 191 + }, + { + "epoch": 0.004478042722160189, + "grad_norm": 1.2791959047317505, + "learning_rate": 9.950764446747862e-07, + "loss": 1.9633, + "step": 192 + }, + { + "epoch": 0.004501365861338107, + "grad_norm": 0.9857053160667419, + "learning_rate": 1.0002591344908007e-06, + "loss": 1.807, + "step": 193 + }, + { + "epoch": 0.004524689000516025, + "grad_norm": 1.264302372932434, + "learning_rate": 1.0054418243068153e-06, + "loss": 1.5389, + "step": 194 + }, + { + "epoch": 0.004548012139693942, + "grad_norm": 1.2205390930175781, + "learning_rate": 1.0106245141228298e-06, + "loss": 1.4549, + "step": 195 + }, + { + "epoch": 0.00457133527887186, + "grad_norm": 1.055471420288086, + "learning_rate": 1.0158072039388444e-06, + "loss": 1.6931, + "step": 196 + }, + { + "epoch": 0.004594658418049778, + "grad_norm": 1.0585546493530273, + "learning_rate": 1.020989893754859e-06, + "loss": 1.8054, + "step": 197 + }, + { + "epoch": 0.004617981557227695, + "grad_norm": 2.16025972366333, + "learning_rate": 1.0261725835708735e-06, + "loss": 2.0077, + "step": 198 + }, + { + "epoch": 0.004641304696405613, + "grad_norm": 2.125786781311035, + "learning_rate": 1.0313552733868879e-06, + "loss": 1.9117, + "step": 199 + }, + { + "epoch": 0.0046646278355835305, + "grad_norm": 1.3560391664505005, + "learning_rate": 1.0365379632029024e-06, + "loss": 1.9871, + "step": 200 + }, + { + "epoch": 0.004687950974761448, + "grad_norm": 1.3505181074142456, + "learning_rate": 1.041720653018917e-06, + "loss": 1.714, + "step": 201 + }, + { + "epoch": 0.004711274113939365, + "grad_norm": 1.1724427938461304, + "learning_rate": 1.0469033428349313e-06, + "loss": 1.7611, + "step": 202 + }, + { + "epoch": 0.004734597253117283, + "grad_norm": 1.1746799945831299, + "learning_rate": 1.0520860326509459e-06, + "loss": 1.867, + "step": 203 + }, + { + "epoch": 0.004757920392295201, + "grad_norm": 1.0976382493972778, + "learning_rate": 1.0572687224669604e-06, + "loss": 1.808, + "step": 204 + }, + { + "epoch": 0.004781243531473118, + "grad_norm": 1.3842298984527588, + "learning_rate": 1.062451412282975e-06, + "loss": 1.7973, + "step": 205 + }, + { + "epoch": 0.004804566670651036, + "grad_norm": 1.6715288162231445, + "learning_rate": 1.0676341020989893e-06, + "loss": 1.9817, + "step": 206 + }, + { + "epoch": 0.004827889809828954, + "grad_norm": 1.0734590291976929, + "learning_rate": 1.072816791915004e-06, + "loss": 1.4297, + "step": 207 + }, + { + "epoch": 0.004851212949006872, + "grad_norm": 1.0182546377182007, + "learning_rate": 1.0779994817310185e-06, + "loss": 1.713, + "step": 208 + }, + { + "epoch": 0.004874536088184789, + "grad_norm": 1.1884313821792603, + "learning_rate": 1.083182171547033e-06, + "loss": 1.5234, + "step": 209 + }, + { + "epoch": 0.004897859227362707, + "grad_norm": 1.520266056060791, + "learning_rate": 1.0883648613630476e-06, + "loss": 2.0598, + "step": 210 + }, + { + "epoch": 0.004921182366540625, + "grad_norm": 1.1709904670715332, + "learning_rate": 1.0935475511790621e-06, + "loss": 2.1461, + "step": 211 + }, + { + "epoch": 0.004944505505718542, + "grad_norm": 1.2634027004241943, + "learning_rate": 1.0987302409950765e-06, + "loss": 1.5076, + "step": 212 + }, + { + "epoch": 0.00496782864489646, + "grad_norm": 1.490717887878418, + "learning_rate": 1.103912930811091e-06, + "loss": 1.8628, + "step": 213 + }, + { + "epoch": 0.004991151784074378, + "grad_norm": 2.077373743057251, + "learning_rate": 1.1090956206271056e-06, + "loss": 1.9295, + "step": 214 + }, + { + "epoch": 0.0050144749232522955, + "grad_norm": 1.647877812385559, + "learning_rate": 1.1142783104431202e-06, + "loss": 1.7929, + "step": 215 + }, + { + "epoch": 0.0050377980624302125, + "grad_norm": 1.1937353610992432, + "learning_rate": 1.1194610002591345e-06, + "loss": 1.6509, + "step": 216 + }, + { + "epoch": 0.00506112120160813, + "grad_norm": 1.0805108547210693, + "learning_rate": 1.124643690075149e-06, + "loss": 1.6447, + "step": 217 + }, + { + "epoch": 0.005084444340786048, + "grad_norm": 1.1077872514724731, + "learning_rate": 1.1298263798911636e-06, + "loss": 1.7675, + "step": 218 + }, + { + "epoch": 0.005107767479963966, + "grad_norm": 0.8648241758346558, + "learning_rate": 1.135009069707178e-06, + "loss": 1.6687, + "step": 219 + }, + { + "epoch": 0.005131090619141883, + "grad_norm": 1.0522700548171997, + "learning_rate": 1.1401917595231925e-06, + "loss": 1.2878, + "step": 220 + }, + { + "epoch": 0.005154413758319801, + "grad_norm": 1.3021256923675537, + "learning_rate": 1.145374449339207e-06, + "loss": 1.8535, + "step": 221 + }, + { + "epoch": 0.005177736897497719, + "grad_norm": 1.2912962436676025, + "learning_rate": 1.1505571391552216e-06, + "loss": 1.865, + "step": 222 + }, + { + "epoch": 0.005201060036675636, + "grad_norm": 1.6733994483947754, + "learning_rate": 1.1557398289712362e-06, + "loss": 1.5748, + "step": 223 + }, + { + "epoch": 0.005224383175853554, + "grad_norm": 1.0865724086761475, + "learning_rate": 1.1609225187872508e-06, + "loss": 1.8159, + "step": 224 + }, + { + "epoch": 0.005247706315031472, + "grad_norm": 1.1498301029205322, + "learning_rate": 1.1661052086032653e-06, + "loss": 1.8579, + "step": 225 + }, + { + "epoch": 0.00527102945420939, + "grad_norm": 1.9360573291778564, + "learning_rate": 1.1712878984192797e-06, + "loss": 1.7366, + "step": 226 + }, + { + "epoch": 0.005294352593387307, + "grad_norm": 1.0133939981460571, + "learning_rate": 1.1764705882352942e-06, + "loss": 1.4571, + "step": 227 + }, + { + "epoch": 0.005317675732565225, + "grad_norm": 1.6443811655044556, + "learning_rate": 1.1816532780513088e-06, + "loss": 1.5312, + "step": 228 + }, + { + "epoch": 0.0053409988717431425, + "grad_norm": 1.1923338174819946, + "learning_rate": 1.1868359678673233e-06, + "loss": 1.6993, + "step": 229 + }, + { + "epoch": 0.0053643220109210596, + "grad_norm": 1.0345349311828613, + "learning_rate": 1.1920186576833377e-06, + "loss": 1.5739, + "step": 230 + }, + { + "epoch": 0.0053876451500989775, + "grad_norm": 0.9833806753158569, + "learning_rate": 1.1972013474993522e-06, + "loss": 1.819, + "step": 231 + }, + { + "epoch": 0.005410968289276895, + "grad_norm": 1.3315545320510864, + "learning_rate": 1.2023840373153668e-06, + "loss": 1.9472, + "step": 232 + }, + { + "epoch": 0.005434291428454813, + "grad_norm": 1.0042314529418945, + "learning_rate": 1.2075667271313812e-06, + "loss": 1.993, + "step": 233 + }, + { + "epoch": 0.00545761456763273, + "grad_norm": 1.2731118202209473, + "learning_rate": 1.2127494169473957e-06, + "loss": 1.6763, + "step": 234 + }, + { + "epoch": 0.005480937706810648, + "grad_norm": 0.9664155840873718, + "learning_rate": 1.2179321067634103e-06, + "loss": 1.3091, + "step": 235 + }, + { + "epoch": 0.005504260845988566, + "grad_norm": 1.6930897235870361, + "learning_rate": 1.2231147965794248e-06, + "loss": 1.6111, + "step": 236 + }, + { + "epoch": 0.005527583985166483, + "grad_norm": 0.9807016253471375, + "learning_rate": 1.2282974863954394e-06, + "loss": 1.6131, + "step": 237 + }, + { + "epoch": 0.005550907124344401, + "grad_norm": 1.321951150894165, + "learning_rate": 1.233480176211454e-06, + "loss": 1.242, + "step": 238 + }, + { + "epoch": 0.005574230263522319, + "grad_norm": 1.1465637683868408, + "learning_rate": 1.2386628660274685e-06, + "loss": 1.7035, + "step": 239 + }, + { + "epoch": 0.005597553402700237, + "grad_norm": 2.4264347553253174, + "learning_rate": 1.2438455558434829e-06, + "loss": 1.9859, + "step": 240 + }, + { + "epoch": 0.005620876541878154, + "grad_norm": 1.429149866104126, + "learning_rate": 1.2490282456594974e-06, + "loss": 1.8249, + "step": 241 + }, + { + "epoch": 0.005644199681056072, + "grad_norm": 1.1119049787521362, + "learning_rate": 1.254210935475512e-06, + "loss": 1.8005, + "step": 242 + }, + { + "epoch": 0.00566752282023399, + "grad_norm": 1.9002227783203125, + "learning_rate": 1.2593936252915265e-06, + "loss": 1.6951, + "step": 243 + }, + { + "epoch": 0.005690845959411907, + "grad_norm": 1.067659854888916, + "learning_rate": 1.264576315107541e-06, + "loss": 1.799, + "step": 244 + }, + { + "epoch": 0.0057141690985898245, + "grad_norm": 1.2947990894317627, + "learning_rate": 1.2697590049235552e-06, + "loss": 1.7837, + "step": 245 + }, + { + "epoch": 0.005737492237767742, + "grad_norm": 1.0790272951126099, + "learning_rate": 1.2749416947395698e-06, + "loss": 1.67, + "step": 246 + }, + { + "epoch": 0.00576081537694566, + "grad_norm": 1.3589330911636353, + "learning_rate": 1.2801243845555843e-06, + "loss": 1.9282, + "step": 247 + }, + { + "epoch": 0.005784138516123577, + "grad_norm": 1.4140998125076294, + "learning_rate": 1.285307074371599e-06, + "loss": 1.6708, + "step": 248 + }, + { + "epoch": 0.005807461655301495, + "grad_norm": 1.000994086265564, + "learning_rate": 1.2904897641876135e-06, + "loss": 1.4077, + "step": 249 + }, + { + "epoch": 0.005830784794479413, + "grad_norm": 1.3655062913894653, + "learning_rate": 1.295672454003628e-06, + "loss": 1.8862, + "step": 250 + }, + { + "epoch": 0.005854107933657331, + "grad_norm": 1.1164065599441528, + "learning_rate": 1.3008551438196426e-06, + "loss": 1.528, + "step": 251 + }, + { + "epoch": 0.005877431072835248, + "grad_norm": 1.1792149543762207, + "learning_rate": 1.306037833635657e-06, + "loss": 1.2879, + "step": 252 + }, + { + "epoch": 0.005900754212013166, + "grad_norm": 2.236320734024048, + "learning_rate": 1.3112205234516715e-06, + "loss": 1.4929, + "step": 253 + }, + { + "epoch": 0.005924077351191084, + "grad_norm": 1.8795088529586792, + "learning_rate": 1.316403213267686e-06, + "loss": 1.2468, + "step": 254 + }, + { + "epoch": 0.005947400490369001, + "grad_norm": 1.2248806953430176, + "learning_rate": 1.3215859030837006e-06, + "loss": 1.769, + "step": 255 + }, + { + "epoch": 0.005970723629546919, + "grad_norm": 1.252236008644104, + "learning_rate": 1.3267685928997152e-06, + "loss": 1.9014, + "step": 256 + }, + { + "epoch": 0.005994046768724837, + "grad_norm": 1.3926386833190918, + "learning_rate": 1.3319512827157297e-06, + "loss": 1.9599, + "step": 257 + }, + { + "epoch": 0.0060173699079027546, + "grad_norm": 1.5681990385055542, + "learning_rate": 1.3371339725317443e-06, + "loss": 1.8109, + "step": 258 + }, + { + "epoch": 0.006040693047080672, + "grad_norm": 1.6841275691986084, + "learning_rate": 1.3423166623477584e-06, + "loss": 1.4601, + "step": 259 + }, + { + "epoch": 0.0060640161862585895, + "grad_norm": 1.5262291431427002, + "learning_rate": 1.347499352163773e-06, + "loss": 1.6493, + "step": 260 + }, + { + "epoch": 0.006087339325436507, + "grad_norm": 1.0905576944351196, + "learning_rate": 1.3526820419797875e-06, + "loss": 2.0847, + "step": 261 + }, + { + "epoch": 0.006110662464614424, + "grad_norm": 1.4682683944702148, + "learning_rate": 1.357864731795802e-06, + "loss": 1.6889, + "step": 262 + }, + { + "epoch": 0.006133985603792342, + "grad_norm": 1.1054515838623047, + "learning_rate": 1.3630474216118166e-06, + "loss": 1.55, + "step": 263 + }, + { + "epoch": 0.00615730874297026, + "grad_norm": 1.3931388854980469, + "learning_rate": 1.3682301114278312e-06, + "loss": 1.655, + "step": 264 + }, + { + "epoch": 0.006180631882148178, + "grad_norm": 1.1766420602798462, + "learning_rate": 1.3734128012438458e-06, + "loss": 1.9555, + "step": 265 + }, + { + "epoch": 0.006203955021326095, + "grad_norm": 1.1652954816818237, + "learning_rate": 1.3785954910598601e-06, + "loss": 1.8446, + "step": 266 + }, + { + "epoch": 0.006227278160504013, + "grad_norm": 1.378980278968811, + "learning_rate": 1.3837781808758747e-06, + "loss": 1.4449, + "step": 267 + }, + { + "epoch": 0.006250601299681931, + "grad_norm": 1.2017453908920288, + "learning_rate": 1.3889608706918892e-06, + "loss": 1.6272, + "step": 268 + }, + { + "epoch": 0.006273924438859848, + "grad_norm": 1.2221115827560425, + "learning_rate": 1.3941435605079038e-06, + "loss": 1.7299, + "step": 269 + }, + { + "epoch": 0.006297247578037766, + "grad_norm": 1.189775824546814, + "learning_rate": 1.3993262503239183e-06, + "loss": 1.1664, + "step": 270 + }, + { + "epoch": 0.006320570717215684, + "grad_norm": 1.0103381872177124, + "learning_rate": 1.404508940139933e-06, + "loss": 1.3519, + "step": 271 + }, + { + "epoch": 0.006343893856393602, + "grad_norm": 1.1243481636047363, + "learning_rate": 1.4096916299559475e-06, + "loss": 1.6704, + "step": 272 + }, + { + "epoch": 0.006367216995571519, + "grad_norm": 1.8137811422348022, + "learning_rate": 1.4148743197719616e-06, + "loss": 1.279, + "step": 273 + }, + { + "epoch": 0.0063905401347494365, + "grad_norm": 1.0875202417373657, + "learning_rate": 1.4200570095879762e-06, + "loss": 1.1564, + "step": 274 + }, + { + "epoch": 0.0064138632739273544, + "grad_norm": 1.0839550495147705, + "learning_rate": 1.4252396994039907e-06, + "loss": 1.7263, + "step": 275 + }, + { + "epoch": 0.006437186413105272, + "grad_norm": 1.7203173637390137, + "learning_rate": 1.4304223892200053e-06, + "loss": 1.9309, + "step": 276 + }, + { + "epoch": 0.006460509552283189, + "grad_norm": 1.3320658206939697, + "learning_rate": 1.4356050790360198e-06, + "loss": 1.8276, + "step": 277 + }, + { + "epoch": 0.006483832691461107, + "grad_norm": 1.5260910987854004, + "learning_rate": 1.4407877688520344e-06, + "loss": 1.413, + "step": 278 + }, + { + "epoch": 0.006507155830639025, + "grad_norm": 1.2401058673858643, + "learning_rate": 1.445970458668049e-06, + "loss": 1.4087, + "step": 279 + }, + { + "epoch": 0.006530478969816942, + "grad_norm": 1.2722922563552856, + "learning_rate": 1.4511531484840633e-06, + "loss": 1.6216, + "step": 280 + }, + { + "epoch": 0.00655380210899486, + "grad_norm": 1.2668229341506958, + "learning_rate": 1.4563358383000779e-06, + "loss": 1.6252, + "step": 281 + }, + { + "epoch": 0.006577125248172778, + "grad_norm": 1.4556583166122437, + "learning_rate": 1.4615185281160924e-06, + "loss": 2.3276, + "step": 282 + }, + { + "epoch": 0.006600448387350696, + "grad_norm": 1.537610411643982, + "learning_rate": 1.466701217932107e-06, + "loss": 1.4319, + "step": 283 + }, + { + "epoch": 0.006623771526528613, + "grad_norm": 1.3130170106887817, + "learning_rate": 1.4718839077481215e-06, + "loss": 1.4978, + "step": 284 + }, + { + "epoch": 0.006647094665706531, + "grad_norm": 1.5020934343338013, + "learning_rate": 1.477066597564136e-06, + "loss": 1.8697, + "step": 285 + }, + { + "epoch": 0.006670417804884449, + "grad_norm": 1.6949779987335205, + "learning_rate": 1.4822492873801502e-06, + "loss": 1.7433, + "step": 286 + }, + { + "epoch": 0.006693740944062366, + "grad_norm": 1.5566325187683105, + "learning_rate": 1.4874319771961648e-06, + "loss": 1.5674, + "step": 287 + }, + { + "epoch": 0.006717064083240284, + "grad_norm": 1.015093445777893, + "learning_rate": 1.4926146670121793e-06, + "loss": 1.9903, + "step": 288 + }, + { + "epoch": 0.0067403872224182015, + "grad_norm": 2.229853868484497, + "learning_rate": 1.497797356828194e-06, + "loss": 1.1905, + "step": 289 + }, + { + "epoch": 0.006763710361596119, + "grad_norm": 1.5241860151290894, + "learning_rate": 1.5029800466442085e-06, + "loss": 1.958, + "step": 290 + }, + { + "epoch": 0.006787033500774036, + "grad_norm": 0.8666454553604126, + "learning_rate": 1.508162736460223e-06, + "loss": 1.7141, + "step": 291 + }, + { + "epoch": 0.006810356639951954, + "grad_norm": 1.4594520330429077, + "learning_rate": 1.5133454262762376e-06, + "loss": 1.7235, + "step": 292 + }, + { + "epoch": 0.006833679779129872, + "grad_norm": 1.3267074823379517, + "learning_rate": 1.518528116092252e-06, + "loss": 1.6172, + "step": 293 + }, + { + "epoch": 0.006857002918307789, + "grad_norm": 1.5386312007904053, + "learning_rate": 1.5237108059082665e-06, + "loss": 1.4843, + "step": 294 + }, + { + "epoch": 0.006880326057485707, + "grad_norm": 1.3275539875030518, + "learning_rate": 1.528893495724281e-06, + "loss": 1.5444, + "step": 295 + }, + { + "epoch": 0.006903649196663625, + "grad_norm": 1.1002707481384277, + "learning_rate": 1.5340761855402956e-06, + "loss": 1.717, + "step": 296 + }, + { + "epoch": 0.006926972335841543, + "grad_norm": 1.172974944114685, + "learning_rate": 1.5392588753563102e-06, + "loss": 1.6963, + "step": 297 + }, + { + "epoch": 0.00695029547501946, + "grad_norm": 1.0728440284729004, + "learning_rate": 1.5444415651723247e-06, + "loss": 1.6228, + "step": 298 + }, + { + "epoch": 0.006973618614197378, + "grad_norm": 1.274348258972168, + "learning_rate": 1.5496242549883393e-06, + "loss": 1.2559, + "step": 299 + }, + { + "epoch": 0.006996941753375296, + "grad_norm": 1.2520028352737427, + "learning_rate": 1.5548069448043534e-06, + "loss": 1.6118, + "step": 300 + }, + { + "epoch": 0.007020264892553213, + "grad_norm": 1.5844305753707886, + "learning_rate": 1.559989634620368e-06, + "loss": 1.5645, + "step": 301 + }, + { + "epoch": 0.007043588031731131, + "grad_norm": 2.285438299179077, + "learning_rate": 1.5651723244363825e-06, + "loss": 1.4541, + "step": 302 + }, + { + "epoch": 0.007066911170909049, + "grad_norm": 1.2873152494430542, + "learning_rate": 1.570355014252397e-06, + "loss": 1.4835, + "step": 303 + }, + { + "epoch": 0.0070902343100869665, + "grad_norm": 1.1332640647888184, + "learning_rate": 1.5755377040684116e-06, + "loss": 1.8279, + "step": 304 + }, + { + "epoch": 0.0071135574492648835, + "grad_norm": 1.6483525037765503, + "learning_rate": 1.5807203938844262e-06, + "loss": 1.2509, + "step": 305 + }, + { + "epoch": 0.007136880588442801, + "grad_norm": 1.0219485759735107, + "learning_rate": 1.5859030837004408e-06, + "loss": 1.8421, + "step": 306 + }, + { + "epoch": 0.007160203727620719, + "grad_norm": 1.2478340864181519, + "learning_rate": 1.5910857735164551e-06, + "loss": 1.9144, + "step": 307 + }, + { + "epoch": 0.007183526866798637, + "grad_norm": 1.4016437530517578, + "learning_rate": 1.5962684633324697e-06, + "loss": 1.5146, + "step": 308 + }, + { + "epoch": 0.007206850005976554, + "grad_norm": 1.1399790048599243, + "learning_rate": 1.6014511531484842e-06, + "loss": 1.6714, + "step": 309 + }, + { + "epoch": 0.007230173145154472, + "grad_norm": 2.047961473464966, + "learning_rate": 1.6066338429644988e-06, + "loss": 1.1777, + "step": 310 + }, + { + "epoch": 0.00725349628433239, + "grad_norm": 1.1410201787948608, + "learning_rate": 1.6118165327805133e-06, + "loss": 1.6783, + "step": 311 + }, + { + "epoch": 0.007276819423510307, + "grad_norm": 1.2840640544891357, + "learning_rate": 1.616999222596528e-06, + "loss": 1.9351, + "step": 312 + }, + { + "epoch": 0.007300142562688225, + "grad_norm": 0.9116181135177612, + "learning_rate": 1.6221819124125425e-06, + "loss": 1.7705, + "step": 313 + }, + { + "epoch": 0.007323465701866143, + "grad_norm": 1.3190463781356812, + "learning_rate": 1.6273646022285566e-06, + "loss": 1.4484, + "step": 314 + }, + { + "epoch": 0.007346788841044061, + "grad_norm": 0.9988270401954651, + "learning_rate": 1.6325472920445712e-06, + "loss": 1.5159, + "step": 315 + }, + { + "epoch": 0.007370111980221978, + "grad_norm": 0.8620725870132446, + "learning_rate": 1.6377299818605857e-06, + "loss": 1.5605, + "step": 316 + }, + { + "epoch": 0.007393435119399896, + "grad_norm": 1.284604549407959, + "learning_rate": 1.6429126716766003e-06, + "loss": 1.4822, + "step": 317 + }, + { + "epoch": 0.0074167582585778135, + "grad_norm": 1.2546097040176392, + "learning_rate": 1.6480953614926148e-06, + "loss": 1.436, + "step": 318 + }, + { + "epoch": 0.0074400813977557306, + "grad_norm": 0.9116978645324707, + "learning_rate": 1.6532780513086294e-06, + "loss": 1.2708, + "step": 319 + }, + { + "epoch": 0.0074634045369336485, + "grad_norm": 0.9910548329353333, + "learning_rate": 1.658460741124644e-06, + "loss": 1.8144, + "step": 320 + }, + { + "epoch": 0.007486727676111566, + "grad_norm": 1.9879093170166016, + "learning_rate": 1.6636434309406583e-06, + "loss": 1.4826, + "step": 321 + }, + { + "epoch": 0.007510050815289484, + "grad_norm": 1.0845030546188354, + "learning_rate": 1.6688261207566729e-06, + "loss": 1.3364, + "step": 322 + }, + { + "epoch": 0.007533373954467401, + "grad_norm": 1.342966079711914, + "learning_rate": 1.6740088105726874e-06, + "loss": 1.6453, + "step": 323 + }, + { + "epoch": 0.007556697093645319, + "grad_norm": 0.9570252895355225, + "learning_rate": 1.679191500388702e-06, + "loss": 1.5384, + "step": 324 + }, + { + "epoch": 0.007580020232823237, + "grad_norm": 1.531516671180725, + "learning_rate": 1.6843741902047165e-06, + "loss": 1.5775, + "step": 325 + }, + { + "epoch": 0.007603343372001154, + "grad_norm": 1.4623240232467651, + "learning_rate": 1.689556880020731e-06, + "loss": 1.7159, + "step": 326 + }, + { + "epoch": 0.007626666511179072, + "grad_norm": 1.109586238861084, + "learning_rate": 1.6947395698367454e-06, + "loss": 1.7403, + "step": 327 + }, + { + "epoch": 0.00764998965035699, + "grad_norm": 1.3199604749679565, + "learning_rate": 1.6999222596527598e-06, + "loss": 1.7208, + "step": 328 + }, + { + "epoch": 0.007673312789534908, + "grad_norm": 1.0979784727096558, + "learning_rate": 1.7051049494687743e-06, + "loss": 1.6097, + "step": 329 + }, + { + "epoch": 0.007696635928712825, + "grad_norm": 1.0952926874160767, + "learning_rate": 1.710287639284789e-06, + "loss": 1.8262, + "step": 330 + }, + { + "epoch": 0.007719959067890743, + "grad_norm": 1.1149373054504395, + "learning_rate": 1.7154703291008035e-06, + "loss": 1.5762, + "step": 331 + }, + { + "epoch": 0.007743282207068661, + "grad_norm": 1.2090753316879272, + "learning_rate": 1.720653018916818e-06, + "loss": 1.6161, + "step": 332 + }, + { + "epoch": 0.007766605346246578, + "grad_norm": 1.3476163148880005, + "learning_rate": 1.7258357087328326e-06, + "loss": 1.6854, + "step": 333 + }, + { + "epoch": 0.0077899284854244955, + "grad_norm": 1.3222614526748657, + "learning_rate": 1.7310183985488471e-06, + "loss": 1.5996, + "step": 334 + }, + { + "epoch": 0.007813251624602413, + "grad_norm": 1.2350871562957764, + "learning_rate": 1.7362010883648615e-06, + "loss": 1.5052, + "step": 335 + }, + { + "epoch": 0.007836574763780331, + "grad_norm": 1.4628745317459106, + "learning_rate": 1.741383778180876e-06, + "loss": 1.6268, + "step": 336 + }, + { + "epoch": 0.00785989790295825, + "grad_norm": 1.3481048345565796, + "learning_rate": 1.7465664679968906e-06, + "loss": 1.4308, + "step": 337 + }, + { + "epoch": 0.007883221042136167, + "grad_norm": 1.0008901357650757, + "learning_rate": 1.7517491578129052e-06, + "loss": 1.6487, + "step": 338 + }, + { + "epoch": 0.007906544181314083, + "grad_norm": 2.4258437156677246, + "learning_rate": 1.7569318476289195e-06, + "loss": 1.5327, + "step": 339 + }, + { + "epoch": 0.007929867320492001, + "grad_norm": 1.3444914817810059, + "learning_rate": 1.762114537444934e-06, + "loss": 1.5257, + "step": 340 + }, + { + "epoch": 0.007953190459669919, + "grad_norm": 2.297591209411621, + "learning_rate": 1.7672972272609486e-06, + "loss": 1.9581, + "step": 341 + }, + { + "epoch": 0.007976513598847837, + "grad_norm": 1.107711672782898, + "learning_rate": 1.772479917076963e-06, + "loss": 1.3486, + "step": 342 + }, + { + "epoch": 0.007999836738025755, + "grad_norm": 1.4064106941223145, + "learning_rate": 1.7776626068929775e-06, + "loss": 1.3169, + "step": 343 + }, + { + "epoch": 0.008023159877203673, + "grad_norm": 1.1236720085144043, + "learning_rate": 1.782845296708992e-06, + "loss": 2.0225, + "step": 344 + }, + { + "epoch": 0.00804648301638159, + "grad_norm": 1.9214081764221191, + "learning_rate": 1.7880279865250066e-06, + "loss": 1.7269, + "step": 345 + }, + { + "epoch": 0.008069806155559507, + "grad_norm": 1.1544204950332642, + "learning_rate": 1.7932106763410212e-06, + "loss": 1.8407, + "step": 346 + }, + { + "epoch": 0.008093129294737425, + "grad_norm": 1.3266545534133911, + "learning_rate": 1.7983933661570358e-06, + "loss": 1.3316, + "step": 347 + }, + { + "epoch": 0.008116452433915343, + "grad_norm": 1.4208300113677979, + "learning_rate": 1.8035760559730501e-06, + "loss": 1.7712, + "step": 348 + }, + { + "epoch": 0.00813977557309326, + "grad_norm": 1.1849939823150635, + "learning_rate": 1.8087587457890647e-06, + "loss": 1.3843, + "step": 349 + }, + { + "epoch": 0.008163098712271178, + "grad_norm": 0.9147690534591675, + "learning_rate": 1.8139414356050792e-06, + "loss": 1.703, + "step": 350 + }, + { + "epoch": 0.008186421851449096, + "grad_norm": 1.2026822566986084, + "learning_rate": 1.8191241254210938e-06, + "loss": 1.642, + "step": 351 + }, + { + "epoch": 0.008209744990627014, + "grad_norm": 1.6620279550552368, + "learning_rate": 1.8243068152371081e-06, + "loss": 1.2861, + "step": 352 + }, + { + "epoch": 0.00823306812980493, + "grad_norm": 1.20318603515625, + "learning_rate": 1.8294895050531227e-06, + "loss": 1.7781, + "step": 353 + }, + { + "epoch": 0.008256391268982848, + "grad_norm": 1.117148756980896, + "learning_rate": 1.8346721948691372e-06, + "loss": 1.7056, + "step": 354 + }, + { + "epoch": 0.008279714408160766, + "grad_norm": 1.3435394763946533, + "learning_rate": 1.8398548846851516e-06, + "loss": 1.7352, + "step": 355 + }, + { + "epoch": 0.008303037547338684, + "grad_norm": 1.6550534963607788, + "learning_rate": 1.8450375745011662e-06, + "loss": 1.4283, + "step": 356 + }, + { + "epoch": 0.008326360686516602, + "grad_norm": 1.0326530933380127, + "learning_rate": 1.8502202643171807e-06, + "loss": 1.8726, + "step": 357 + }, + { + "epoch": 0.00834968382569452, + "grad_norm": 1.1237214803695679, + "learning_rate": 1.8554029541331953e-06, + "loss": 1.7547, + "step": 358 + }, + { + "epoch": 0.008373006964872438, + "grad_norm": 1.3457711935043335, + "learning_rate": 1.8605856439492098e-06, + "loss": 1.5047, + "step": 359 + }, + { + "epoch": 0.008396330104050354, + "grad_norm": 1.3615081310272217, + "learning_rate": 1.8657683337652244e-06, + "loss": 1.3476, + "step": 360 + }, + { + "epoch": 0.008419653243228272, + "grad_norm": 1.4443084001541138, + "learning_rate": 1.870951023581239e-06, + "loss": 1.4259, + "step": 361 + }, + { + "epoch": 0.00844297638240619, + "grad_norm": 0.9154095649719238, + "learning_rate": 1.8761337133972533e-06, + "loss": 1.6089, + "step": 362 + }, + { + "epoch": 0.008466299521584108, + "grad_norm": 1.1972756385803223, + "learning_rate": 1.8813164032132679e-06, + "loss": 1.5704, + "step": 363 + }, + { + "epoch": 0.008489622660762025, + "grad_norm": 1.1325738430023193, + "learning_rate": 1.8864990930292822e-06, + "loss": 1.7252, + "step": 364 + }, + { + "epoch": 0.008512945799939943, + "grad_norm": 1.2257301807403564, + "learning_rate": 1.8916817828452968e-06, + "loss": 1.5124, + "step": 365 + }, + { + "epoch": 0.008536268939117861, + "grad_norm": 1.7714002132415771, + "learning_rate": 1.8968644726613113e-06, + "loss": 1.5799, + "step": 366 + }, + { + "epoch": 0.008559592078295777, + "grad_norm": 1.1215579509735107, + "learning_rate": 1.9020471624773259e-06, + "loss": 1.7692, + "step": 367 + }, + { + "epoch": 0.008582915217473695, + "grad_norm": 1.3264069557189941, + "learning_rate": 1.9072298522933404e-06, + "loss": 1.7848, + "step": 368 + }, + { + "epoch": 0.008606238356651613, + "grad_norm": 0.9898104667663574, + "learning_rate": 1.912412542109355e-06, + "loss": 1.945, + "step": 369 + }, + { + "epoch": 0.008629561495829531, + "grad_norm": 0.9507944583892822, + "learning_rate": 1.9175952319253693e-06, + "loss": 1.6469, + "step": 370 + }, + { + "epoch": 0.008652884635007449, + "grad_norm": 1.1940997838974, + "learning_rate": 1.9227779217413837e-06, + "loss": 1.5144, + "step": 371 + }, + { + "epoch": 0.008676207774185367, + "grad_norm": 1.2926305532455444, + "learning_rate": 1.9279606115573985e-06, + "loss": 1.6527, + "step": 372 + }, + { + "epoch": 0.008699530913363285, + "grad_norm": 0.9909786581993103, + "learning_rate": 1.933143301373413e-06, + "loss": 1.8003, + "step": 373 + }, + { + "epoch": 0.008722854052541201, + "grad_norm": 1.3900662660598755, + "learning_rate": 1.9383259911894276e-06, + "loss": 1.7743, + "step": 374 + }, + { + "epoch": 0.008746177191719119, + "grad_norm": 0.9942039251327515, + "learning_rate": 1.943508681005442e-06, + "loss": 1.5635, + "step": 375 + }, + { + "epoch": 0.008769500330897037, + "grad_norm": 1.3887672424316406, + "learning_rate": 1.9486913708214563e-06, + "loss": 1.744, + "step": 376 + }, + { + "epoch": 0.008792823470074955, + "grad_norm": 1.2873059511184692, + "learning_rate": 1.953874060637471e-06, + "loss": 1.64, + "step": 377 + }, + { + "epoch": 0.008816146609252873, + "grad_norm": 1.2259247303009033, + "learning_rate": 1.9590567504534854e-06, + "loss": 1.6418, + "step": 378 + }, + { + "epoch": 0.00883946974843079, + "grad_norm": 1.5709097385406494, + "learning_rate": 1.9642394402695e-06, + "loss": 1.4343, + "step": 379 + }, + { + "epoch": 0.008862792887608708, + "grad_norm": 1.016625165939331, + "learning_rate": 1.9694221300855145e-06, + "loss": 1.5838, + "step": 380 + }, + { + "epoch": 0.008886116026786626, + "grad_norm": 1.5763674974441528, + "learning_rate": 1.9746048199015293e-06, + "loss": 1.3391, + "step": 381 + }, + { + "epoch": 0.008909439165964542, + "grad_norm": 1.014722466468811, + "learning_rate": 1.9797875097175436e-06, + "loss": 1.7185, + "step": 382 + }, + { + "epoch": 0.00893276230514246, + "grad_norm": 1.5255705118179321, + "learning_rate": 1.984970199533558e-06, + "loss": 1.5749, + "step": 383 + }, + { + "epoch": 0.008956085444320378, + "grad_norm": 1.4036648273468018, + "learning_rate": 1.9901528893495723e-06, + "loss": 1.4134, + "step": 384 + }, + { + "epoch": 0.008979408583498296, + "grad_norm": 1.327813982963562, + "learning_rate": 1.995335579165587e-06, + "loss": 1.8475, + "step": 385 + }, + { + "epoch": 0.009002731722676214, + "grad_norm": 1.357269287109375, + "learning_rate": 2.0005182689816014e-06, + "loss": 1.4145, + "step": 386 + }, + { + "epoch": 0.009026054861854132, + "grad_norm": 1.4663738012313843, + "learning_rate": 2.005700958797616e-06, + "loss": 1.5207, + "step": 387 + }, + { + "epoch": 0.00904937800103205, + "grad_norm": 0.9792691469192505, + "learning_rate": 2.0108836486136305e-06, + "loss": 1.7392, + "step": 388 + }, + { + "epoch": 0.009072701140209966, + "grad_norm": 1.9074856042861938, + "learning_rate": 2.0160663384296453e-06, + "loss": 1.5931, + "step": 389 + }, + { + "epoch": 0.009096024279387884, + "grad_norm": 1.562455654144287, + "learning_rate": 2.0212490282456597e-06, + "loss": 1.3503, + "step": 390 + }, + { + "epoch": 0.009119347418565802, + "grad_norm": 1.6827714443206787, + "learning_rate": 2.026431718061674e-06, + "loss": 1.8409, + "step": 391 + }, + { + "epoch": 0.00914267055774372, + "grad_norm": 0.969691276550293, + "learning_rate": 2.0316144078776888e-06, + "loss": 1.5167, + "step": 392 + }, + { + "epoch": 0.009165993696921637, + "grad_norm": 1.1107996702194214, + "learning_rate": 2.036797097693703e-06, + "loss": 1.5723, + "step": 393 + }, + { + "epoch": 0.009189316836099555, + "grad_norm": 0.9862359762191772, + "learning_rate": 2.041979787509718e-06, + "loss": 1.1188, + "step": 394 + }, + { + "epoch": 0.009212639975277473, + "grad_norm": 1.4997074604034424, + "learning_rate": 2.0471624773257322e-06, + "loss": 1.6742, + "step": 395 + }, + { + "epoch": 0.00923596311445539, + "grad_norm": 1.1336885690689087, + "learning_rate": 2.052345167141747e-06, + "loss": 1.5602, + "step": 396 + }, + { + "epoch": 0.009259286253633307, + "grad_norm": 1.4929397106170654, + "learning_rate": 2.057527856957761e-06, + "loss": 1.4891, + "step": 397 + }, + { + "epoch": 0.009282609392811225, + "grad_norm": 1.3118637800216675, + "learning_rate": 2.0627105467737757e-06, + "loss": 1.5758, + "step": 398 + }, + { + "epoch": 0.009305932531989143, + "grad_norm": 1.1043623685836792, + "learning_rate": 2.06789323658979e-06, + "loss": 1.9455, + "step": 399 + }, + { + "epoch": 0.009329255671167061, + "grad_norm": 1.3472813367843628, + "learning_rate": 2.073075926405805e-06, + "loss": 1.4657, + "step": 400 + }, + { + "epoch": 0.009352578810344979, + "grad_norm": 1.5614628791809082, + "learning_rate": 2.078258616221819e-06, + "loss": 1.3351, + "step": 401 + }, + { + "epoch": 0.009375901949522897, + "grad_norm": 1.393477439880371, + "learning_rate": 2.083441306037834e-06, + "loss": 1.8887, + "step": 402 + }, + { + "epoch": 0.009399225088700813, + "grad_norm": 1.0576095581054688, + "learning_rate": 2.0886239958538483e-06, + "loss": 1.7814, + "step": 403 + }, + { + "epoch": 0.00942254822787873, + "grad_norm": 1.5161347389221191, + "learning_rate": 2.0938066856698626e-06, + "loss": 1.2316, + "step": 404 + }, + { + "epoch": 0.009445871367056649, + "grad_norm": 1.05890691280365, + "learning_rate": 2.0989893754858774e-06, + "loss": 1.5303, + "step": 405 + }, + { + "epoch": 0.009469194506234567, + "grad_norm": 0.801816463470459, + "learning_rate": 2.1041720653018918e-06, + "loss": 1.5165, + "step": 406 + }, + { + "epoch": 0.009492517645412485, + "grad_norm": 1.2811832427978516, + "learning_rate": 2.1093547551179065e-06, + "loss": 1.8638, + "step": 407 + }, + { + "epoch": 0.009515840784590402, + "grad_norm": 1.2984956502914429, + "learning_rate": 2.114537444933921e-06, + "loss": 1.4195, + "step": 408 + }, + { + "epoch": 0.00953916392376832, + "grad_norm": 2.3772926330566406, + "learning_rate": 2.1197201347499356e-06, + "loss": 1.2616, + "step": 409 + }, + { + "epoch": 0.009562487062946236, + "grad_norm": 1.102181315422058, + "learning_rate": 2.12490282456595e-06, + "loss": 1.6683, + "step": 410 + }, + { + "epoch": 0.009585810202124154, + "grad_norm": 1.4473963975906372, + "learning_rate": 2.1300855143819643e-06, + "loss": 1.6474, + "step": 411 + }, + { + "epoch": 0.009609133341302072, + "grad_norm": 2.3995816707611084, + "learning_rate": 2.1352682041979787e-06, + "loss": 1.6203, + "step": 412 + }, + { + "epoch": 0.00963245648047999, + "grad_norm": 0.9490773677825928, + "learning_rate": 2.1404508940139935e-06, + "loss": 1.8082, + "step": 413 + }, + { + "epoch": 0.009655779619657908, + "grad_norm": 0.9358771443367004, + "learning_rate": 2.145633583830008e-06, + "loss": 1.5929, + "step": 414 + }, + { + "epoch": 0.009679102758835826, + "grad_norm": 0.9875616431236267, + "learning_rate": 2.1508162736460226e-06, + "loss": 1.4312, + "step": 415 + }, + { + "epoch": 0.009702425898013744, + "grad_norm": 1.197416067123413, + "learning_rate": 2.155998963462037e-06, + "loss": 1.3165, + "step": 416 + }, + { + "epoch": 0.00972574903719166, + "grad_norm": 2.0210750102996826, + "learning_rate": 2.1611816532780513e-06, + "loss": 1.4962, + "step": 417 + }, + { + "epoch": 0.009749072176369578, + "grad_norm": 1.2700085639953613, + "learning_rate": 2.166364343094066e-06, + "loss": 1.6101, + "step": 418 + }, + { + "epoch": 0.009772395315547496, + "grad_norm": 1.124679684638977, + "learning_rate": 2.1715470329100804e-06, + "loss": 1.7477, + "step": 419 + }, + { + "epoch": 0.009795718454725414, + "grad_norm": 1.178290843963623, + "learning_rate": 2.176729722726095e-06, + "loss": 1.4108, + "step": 420 + }, + { + "epoch": 0.009819041593903332, + "grad_norm": 1.792117953300476, + "learning_rate": 2.1819124125421095e-06, + "loss": 1.5568, + "step": 421 + }, + { + "epoch": 0.00984236473308125, + "grad_norm": 1.7381610870361328, + "learning_rate": 2.1870951023581243e-06, + "loss": 1.3229, + "step": 422 + }, + { + "epoch": 0.009865687872259167, + "grad_norm": 1.023553490638733, + "learning_rate": 2.1922777921741386e-06, + "loss": 1.1633, + "step": 423 + }, + { + "epoch": 0.009889011011437084, + "grad_norm": 1.5537900924682617, + "learning_rate": 2.197460481990153e-06, + "loss": 1.291, + "step": 424 + }, + { + "epoch": 0.009912334150615001, + "grad_norm": 1.722598671913147, + "learning_rate": 2.2026431718061673e-06, + "loss": 1.5201, + "step": 425 + }, + { + "epoch": 0.00993565728979292, + "grad_norm": 1.546295166015625, + "learning_rate": 2.207825861622182e-06, + "loss": 1.3554, + "step": 426 + }, + { + "epoch": 0.009958980428970837, + "grad_norm": 1.4075593948364258, + "learning_rate": 2.2130085514381964e-06, + "loss": 1.3831, + "step": 427 + }, + { + "epoch": 0.009982303568148755, + "grad_norm": 1.441125512123108, + "learning_rate": 2.218191241254211e-06, + "loss": 1.4806, + "step": 428 + }, + { + "epoch": 0.010005626707326673, + "grad_norm": 1.4198213815689087, + "learning_rate": 2.2233739310702255e-06, + "loss": 1.6962, + "step": 429 + }, + { + "epoch": 0.010028949846504591, + "grad_norm": 1.1716971397399902, + "learning_rate": 2.2285566208862403e-06, + "loss": 1.0423, + "step": 430 + }, + { + "epoch": 0.010052272985682507, + "grad_norm": 1.1271895170211792, + "learning_rate": 2.2337393107022547e-06, + "loss": 1.4246, + "step": 431 + }, + { + "epoch": 0.010075596124860425, + "grad_norm": 1.2987208366394043, + "learning_rate": 2.238922000518269e-06, + "loss": 1.5946, + "step": 432 + }, + { + "epoch": 0.010098919264038343, + "grad_norm": 1.7283997535705566, + "learning_rate": 2.2441046903342838e-06, + "loss": 1.5761, + "step": 433 + }, + { + "epoch": 0.01012224240321626, + "grad_norm": 1.635098934173584, + "learning_rate": 2.249287380150298e-06, + "loss": 1.6912, + "step": 434 + }, + { + "epoch": 0.010145565542394179, + "grad_norm": 2.1896469593048096, + "learning_rate": 2.254470069966313e-06, + "loss": 1.2961, + "step": 435 + }, + { + "epoch": 0.010168888681572097, + "grad_norm": 1.1874053478240967, + "learning_rate": 2.2596527597823272e-06, + "loss": 1.4999, + "step": 436 + }, + { + "epoch": 0.010192211820750014, + "grad_norm": 1.2898855209350586, + "learning_rate": 2.264835449598342e-06, + "loss": 1.7152, + "step": 437 + }, + { + "epoch": 0.010215534959927932, + "grad_norm": 0.792107105255127, + "learning_rate": 2.270018139414356e-06, + "loss": 1.4129, + "step": 438 + }, + { + "epoch": 0.010238858099105849, + "grad_norm": 1.2092666625976562, + "learning_rate": 2.2752008292303707e-06, + "loss": 1.4687, + "step": 439 + }, + { + "epoch": 0.010262181238283766, + "grad_norm": 1.2261115312576294, + "learning_rate": 2.280383519046385e-06, + "loss": 1.5548, + "step": 440 + }, + { + "epoch": 0.010285504377461684, + "grad_norm": 2.0835094451904297, + "learning_rate": 2.2855662088624e-06, + "loss": 1.5925, + "step": 441 + }, + { + "epoch": 0.010308827516639602, + "grad_norm": 1.075907826423645, + "learning_rate": 2.290748898678414e-06, + "loss": 1.4967, + "step": 442 + }, + { + "epoch": 0.01033215065581752, + "grad_norm": 0.9633646011352539, + "learning_rate": 2.295931588494429e-06, + "loss": 1.6798, + "step": 443 + }, + { + "epoch": 0.010355473794995438, + "grad_norm": 1.6833699941635132, + "learning_rate": 2.3011142783104433e-06, + "loss": 1.3053, + "step": 444 + }, + { + "epoch": 0.010378796934173356, + "grad_norm": 1.1333974599838257, + "learning_rate": 2.3062969681264576e-06, + "loss": 1.3658, + "step": 445 + }, + { + "epoch": 0.010402120073351272, + "grad_norm": 1.3382309675216675, + "learning_rate": 2.3114796579424724e-06, + "loss": 1.6492, + "step": 446 + }, + { + "epoch": 0.01042544321252919, + "grad_norm": 0.7148923873901367, + "learning_rate": 2.3166623477584868e-06, + "loss": 1.6269, + "step": 447 + }, + { + "epoch": 0.010448766351707108, + "grad_norm": 1.084245204925537, + "learning_rate": 2.3218450375745015e-06, + "loss": 2.0708, + "step": 448 + }, + { + "epoch": 0.010472089490885026, + "grad_norm": 1.1463004350662231, + "learning_rate": 2.327027727390516e-06, + "loss": 2.0115, + "step": 449 + }, + { + "epoch": 0.010495412630062944, + "grad_norm": 1.5500133037567139, + "learning_rate": 2.3322104172065306e-06, + "loss": 1.5454, + "step": 450 + }, + { + "epoch": 0.010518735769240862, + "grad_norm": 1.2993839979171753, + "learning_rate": 2.337393107022545e-06, + "loss": 1.5475, + "step": 451 + }, + { + "epoch": 0.01054205890841878, + "grad_norm": 1.295839786529541, + "learning_rate": 2.3425757968385593e-06, + "loss": 1.2895, + "step": 452 + }, + { + "epoch": 0.010565382047596696, + "grad_norm": 1.045040488243103, + "learning_rate": 2.3477584866545737e-06, + "loss": 1.7306, + "step": 453 + }, + { + "epoch": 0.010588705186774613, + "grad_norm": 1.4592766761779785, + "learning_rate": 2.3529411764705885e-06, + "loss": 1.7795, + "step": 454 + }, + { + "epoch": 0.010612028325952531, + "grad_norm": 0.9432761073112488, + "learning_rate": 2.358123866286603e-06, + "loss": 1.6963, + "step": 455 + }, + { + "epoch": 0.01063535146513045, + "grad_norm": 1.3770086765289307, + "learning_rate": 2.3633065561026176e-06, + "loss": 1.2003, + "step": 456 + }, + { + "epoch": 0.010658674604308367, + "grad_norm": 1.1453793048858643, + "learning_rate": 2.368489245918632e-06, + "loss": 1.9012, + "step": 457 + }, + { + "epoch": 0.010681997743486285, + "grad_norm": 1.2836976051330566, + "learning_rate": 2.3736719357346467e-06, + "loss": 1.4324, + "step": 458 + }, + { + "epoch": 0.010705320882664203, + "grad_norm": 1.6498123407363892, + "learning_rate": 2.378854625550661e-06, + "loss": 1.6212, + "step": 459 + }, + { + "epoch": 0.010728644021842119, + "grad_norm": 1.3681795597076416, + "learning_rate": 2.3840373153666754e-06, + "loss": 1.6047, + "step": 460 + }, + { + "epoch": 0.010751967161020037, + "grad_norm": 1.4474722146987915, + "learning_rate": 2.38922000518269e-06, + "loss": 1.5279, + "step": 461 + }, + { + "epoch": 0.010775290300197955, + "grad_norm": 1.4832510948181152, + "learning_rate": 2.3944026949987045e-06, + "loss": 1.7073, + "step": 462 + }, + { + "epoch": 0.010798613439375873, + "grad_norm": 1.343935251235962, + "learning_rate": 2.3995853848147193e-06, + "loss": 1.4637, + "step": 463 + }, + { + "epoch": 0.01082193657855379, + "grad_norm": 1.8285539150238037, + "learning_rate": 2.4047680746307336e-06, + "loss": 1.3944, + "step": 464 + }, + { + "epoch": 0.010845259717731709, + "grad_norm": 1.4653230905532837, + "learning_rate": 2.4099507644467484e-06, + "loss": 1.8847, + "step": 465 + }, + { + "epoch": 0.010868582856909626, + "grad_norm": 1.4410351514816284, + "learning_rate": 2.4151334542627623e-06, + "loss": 1.7298, + "step": 466 + }, + { + "epoch": 0.010891905996087543, + "grad_norm": 1.3057256937026978, + "learning_rate": 2.420316144078777e-06, + "loss": 1.6188, + "step": 467 + }, + { + "epoch": 0.01091522913526546, + "grad_norm": 1.574479103088379, + "learning_rate": 2.4254988338947914e-06, + "loss": 1.585, + "step": 468 + }, + { + "epoch": 0.010938552274443378, + "grad_norm": 1.4391696453094482, + "learning_rate": 2.430681523710806e-06, + "loss": 1.7272, + "step": 469 + }, + { + "epoch": 0.010961875413621296, + "grad_norm": 2.304706335067749, + "learning_rate": 2.4358642135268205e-06, + "loss": 1.7127, + "step": 470 + }, + { + "epoch": 0.010985198552799214, + "grad_norm": 1.2380545139312744, + "learning_rate": 2.4410469033428353e-06, + "loss": 1.5428, + "step": 471 + }, + { + "epoch": 0.011008521691977132, + "grad_norm": 1.303446888923645, + "learning_rate": 2.4462295931588497e-06, + "loss": 1.609, + "step": 472 + }, + { + "epoch": 0.01103184483115505, + "grad_norm": 1.3888837099075317, + "learning_rate": 2.451412282974864e-06, + "loss": 1.7134, + "step": 473 + }, + { + "epoch": 0.011055167970332966, + "grad_norm": 0.9802701473236084, + "learning_rate": 2.4565949727908788e-06, + "loss": 1.4401, + "step": 474 + }, + { + "epoch": 0.011078491109510884, + "grad_norm": 1.5808403491973877, + "learning_rate": 2.461777662606893e-06, + "loss": 1.7415, + "step": 475 + }, + { + "epoch": 0.011101814248688802, + "grad_norm": 1.299912691116333, + "learning_rate": 2.466960352422908e-06, + "loss": 1.361, + "step": 476 + }, + { + "epoch": 0.01112513738786672, + "grad_norm": 0.9326110482215881, + "learning_rate": 2.4721430422389222e-06, + "loss": 1.222, + "step": 477 + }, + { + "epoch": 0.011148460527044638, + "grad_norm": 1.0385396480560303, + "learning_rate": 2.477325732054937e-06, + "loss": 1.4813, + "step": 478 + }, + { + "epoch": 0.011171783666222556, + "grad_norm": 1.1004397869110107, + "learning_rate": 2.482508421870951e-06, + "loss": 1.5064, + "step": 479 + }, + { + "epoch": 0.011195106805400474, + "grad_norm": 1.274898886680603, + "learning_rate": 2.4876911116869657e-06, + "loss": 1.3046, + "step": 480 + }, + { + "epoch": 0.01121842994457839, + "grad_norm": 1.0818660259246826, + "learning_rate": 2.49287380150298e-06, + "loss": 1.878, + "step": 481 + }, + { + "epoch": 0.011241753083756308, + "grad_norm": 1.2744652032852173, + "learning_rate": 2.498056491318995e-06, + "loss": 1.6394, + "step": 482 + }, + { + "epoch": 0.011265076222934226, + "grad_norm": 1.0467538833618164, + "learning_rate": 2.503239181135009e-06, + "loss": 1.8949, + "step": 483 + }, + { + "epoch": 0.011288399362112143, + "grad_norm": 1.2507177591323853, + "learning_rate": 2.508421870951024e-06, + "loss": 1.5386, + "step": 484 + }, + { + "epoch": 0.011311722501290061, + "grad_norm": 2.0707380771636963, + "learning_rate": 2.5136045607670383e-06, + "loss": 1.3359, + "step": 485 + }, + { + "epoch": 0.01133504564046798, + "grad_norm": 1.0060955286026, + "learning_rate": 2.518787250583053e-06, + "loss": 1.5551, + "step": 486 + }, + { + "epoch": 0.011358368779645897, + "grad_norm": 2.1019294261932373, + "learning_rate": 2.5239699403990674e-06, + "loss": 1.4009, + "step": 487 + }, + { + "epoch": 0.011381691918823813, + "grad_norm": 1.2085974216461182, + "learning_rate": 2.529152630215082e-06, + "loss": 1.1264, + "step": 488 + }, + { + "epoch": 0.011405015058001731, + "grad_norm": 1.2670215368270874, + "learning_rate": 2.5343353200310965e-06, + "loss": 1.4005, + "step": 489 + }, + { + "epoch": 0.011428338197179649, + "grad_norm": 0.976809024810791, + "learning_rate": 2.5395180098471104e-06, + "loss": 1.6539, + "step": 490 + }, + { + "epoch": 0.011451661336357567, + "grad_norm": 1.8012447357177734, + "learning_rate": 2.5447006996631252e-06, + "loss": 1.5083, + "step": 491 + }, + { + "epoch": 0.011474984475535485, + "grad_norm": 2.0657784938812256, + "learning_rate": 2.5498833894791396e-06, + "loss": 1.4127, + "step": 492 + }, + { + "epoch": 0.011498307614713403, + "grad_norm": 1.4070103168487549, + "learning_rate": 2.5550660792951543e-06, + "loss": 1.4707, + "step": 493 + }, + { + "epoch": 0.01152163075389132, + "grad_norm": 0.859045147895813, + "learning_rate": 2.5602487691111687e-06, + "loss": 1.6301, + "step": 494 + }, + { + "epoch": 0.011544953893069239, + "grad_norm": 1.5209952592849731, + "learning_rate": 2.5654314589271835e-06, + "loss": 1.8438, + "step": 495 + }, + { + "epoch": 0.011568277032247155, + "grad_norm": 1.1508231163024902, + "learning_rate": 2.570614148743198e-06, + "loss": 1.2495, + "step": 496 + }, + { + "epoch": 0.011591600171425073, + "grad_norm": 0.9130313396453857, + "learning_rate": 2.5757968385592126e-06, + "loss": 1.1848, + "step": 497 + }, + { + "epoch": 0.01161492331060299, + "grad_norm": 1.5925562381744385, + "learning_rate": 2.580979528375227e-06, + "loss": 1.4745, + "step": 498 + }, + { + "epoch": 0.011638246449780908, + "grad_norm": 2.5118539333343506, + "learning_rate": 2.5861622181912417e-06, + "loss": 1.6218, + "step": 499 + }, + { + "epoch": 0.011661569588958826, + "grad_norm": 1.272691249847412, + "learning_rate": 2.591344908007256e-06, + "loss": 1.2147, + "step": 500 + }, + { + "epoch": 0.011684892728136744, + "grad_norm": 1.1436160802841187, + "learning_rate": 2.596527597823271e-06, + "loss": 1.5556, + "step": 501 + }, + { + "epoch": 0.011708215867314662, + "grad_norm": 1.0195647478103638, + "learning_rate": 2.601710287639285e-06, + "loss": 1.3303, + "step": 502 + }, + { + "epoch": 0.011731539006492578, + "grad_norm": 1.4576568603515625, + "learning_rate": 2.6068929774553e-06, + "loss": 1.6531, + "step": 503 + }, + { + "epoch": 0.011754862145670496, + "grad_norm": 1.360716462135315, + "learning_rate": 2.612075667271314e-06, + "loss": 1.1761, + "step": 504 + }, + { + "epoch": 0.011778185284848414, + "grad_norm": 2.7770462036132812, + "learning_rate": 2.617258357087328e-06, + "loss": 1.247, + "step": 505 + }, + { + "epoch": 0.011801508424026332, + "grad_norm": 1.3706661462783813, + "learning_rate": 2.622441046903343e-06, + "loss": 1.5103, + "step": 506 + }, + { + "epoch": 0.01182483156320425, + "grad_norm": 1.5405017137527466, + "learning_rate": 2.6276237367193573e-06, + "loss": 1.6827, + "step": 507 + }, + { + "epoch": 0.011848154702382168, + "grad_norm": 1.1809494495391846, + "learning_rate": 2.632806426535372e-06, + "loss": 1.7162, + "step": 508 + }, + { + "epoch": 0.011871477841560086, + "grad_norm": 1.085557222366333, + "learning_rate": 2.6379891163513864e-06, + "loss": 1.514, + "step": 509 + }, + { + "epoch": 0.011894800980738002, + "grad_norm": 1.2155910730361938, + "learning_rate": 2.643171806167401e-06, + "loss": 1.4029, + "step": 510 + }, + { + "epoch": 0.01191812411991592, + "grad_norm": 1.240242600440979, + "learning_rate": 2.6483544959834155e-06, + "loss": 1.4336, + "step": 511 + }, + { + "epoch": 0.011941447259093838, + "grad_norm": 1.649802327156067, + "learning_rate": 2.6535371857994303e-06, + "loss": 1.9082, + "step": 512 + }, + { + "epoch": 0.011964770398271755, + "grad_norm": 1.3479831218719482, + "learning_rate": 2.6587198756154447e-06, + "loss": 1.5424, + "step": 513 + }, + { + "epoch": 0.011988093537449673, + "grad_norm": 1.2537102699279785, + "learning_rate": 2.6639025654314594e-06, + "loss": 1.6061, + "step": 514 + }, + { + "epoch": 0.012011416676627591, + "grad_norm": 1.1049939393997192, + "learning_rate": 2.6690852552474738e-06, + "loss": 1.8361, + "step": 515 + }, + { + "epoch": 0.012034739815805509, + "grad_norm": 2.9946062564849854, + "learning_rate": 2.6742679450634885e-06, + "loss": 1.4471, + "step": 516 + }, + { + "epoch": 0.012058062954983425, + "grad_norm": 0.9455610513687134, + "learning_rate": 2.6794506348795025e-06, + "loss": 1.6831, + "step": 517 + }, + { + "epoch": 0.012081386094161343, + "grad_norm": 1.4750438928604126, + "learning_rate": 2.684633324695517e-06, + "loss": 1.3143, + "step": 518 + }, + { + "epoch": 0.012104709233339261, + "grad_norm": 1.1056557893753052, + "learning_rate": 2.6898160145115316e-06, + "loss": 1.5054, + "step": 519 + }, + { + "epoch": 0.012128032372517179, + "grad_norm": 0.9718064069747925, + "learning_rate": 2.694998704327546e-06, + "loss": 1.3134, + "step": 520 + }, + { + "epoch": 0.012151355511695097, + "grad_norm": 2.2384724617004395, + "learning_rate": 2.7001813941435607e-06, + "loss": 1.4851, + "step": 521 + }, + { + "epoch": 0.012174678650873015, + "grad_norm": 1.2468239068984985, + "learning_rate": 2.705364083959575e-06, + "loss": 1.4873, + "step": 522 + }, + { + "epoch": 0.012198001790050933, + "grad_norm": 1.4248602390289307, + "learning_rate": 2.71054677377559e-06, + "loss": 1.7643, + "step": 523 + }, + { + "epoch": 0.012221324929228849, + "grad_norm": 1.3377385139465332, + "learning_rate": 2.715729463591604e-06, + "loss": 1.7064, + "step": 524 + }, + { + "epoch": 0.012244648068406767, + "grad_norm": 0.9933966994285583, + "learning_rate": 2.720912153407619e-06, + "loss": 1.7187, + "step": 525 + }, + { + "epoch": 0.012267971207584685, + "grad_norm": 1.018750548362732, + "learning_rate": 2.7260948432236333e-06, + "loss": 1.5915, + "step": 526 + }, + { + "epoch": 0.012291294346762602, + "grad_norm": 1.356325387954712, + "learning_rate": 2.731277533039648e-06, + "loss": 1.7193, + "step": 527 + }, + { + "epoch": 0.01231461748594052, + "grad_norm": 1.2781217098236084, + "learning_rate": 2.7364602228556624e-06, + "loss": 1.5494, + "step": 528 + }, + { + "epoch": 0.012337940625118438, + "grad_norm": 1.561498761177063, + "learning_rate": 2.741642912671677e-06, + "loss": 1.6972, + "step": 529 + }, + { + "epoch": 0.012361263764296356, + "grad_norm": 1.1695748567581177, + "learning_rate": 2.7468256024876915e-06, + "loss": 2.1633, + "step": 530 + }, + { + "epoch": 0.012384586903474272, + "grad_norm": 1.4304964542388916, + "learning_rate": 2.7520082923037054e-06, + "loss": 1.6321, + "step": 531 + }, + { + "epoch": 0.01240791004265219, + "grad_norm": 1.0513828992843628, + "learning_rate": 2.7571909821197202e-06, + "loss": 1.2897, + "step": 532 + }, + { + "epoch": 0.012431233181830108, + "grad_norm": 1.0206960439682007, + "learning_rate": 2.7623736719357346e-06, + "loss": 1.7842, + "step": 533 + }, + { + "epoch": 0.012454556321008026, + "grad_norm": 1.1440876722335815, + "learning_rate": 2.7675563617517493e-06, + "loss": 1.4399, + "step": 534 + }, + { + "epoch": 0.012477879460185944, + "grad_norm": 1.0837441682815552, + "learning_rate": 2.7727390515677637e-06, + "loss": 1.5155, + "step": 535 + }, + { + "epoch": 0.012501202599363862, + "grad_norm": 1.071378231048584, + "learning_rate": 2.7779217413837785e-06, + "loss": 1.6459, + "step": 536 + }, + { + "epoch": 0.01252452573854178, + "grad_norm": 1.6966552734375, + "learning_rate": 2.783104431199793e-06, + "loss": 1.6015, + "step": 537 + }, + { + "epoch": 0.012547848877719696, + "grad_norm": 1.2789183855056763, + "learning_rate": 2.7882871210158076e-06, + "loss": 1.2423, + "step": 538 + }, + { + "epoch": 0.012571172016897614, + "grad_norm": 1.2072651386260986, + "learning_rate": 2.793469810831822e-06, + "loss": 1.69, + "step": 539 + }, + { + "epoch": 0.012594495156075532, + "grad_norm": 1.5257117748260498, + "learning_rate": 2.7986525006478367e-06, + "loss": 1.7608, + "step": 540 + }, + { + "epoch": 0.01261781829525345, + "grad_norm": 1.0233759880065918, + "learning_rate": 2.803835190463851e-06, + "loss": 1.1299, + "step": 541 + }, + { + "epoch": 0.012641141434431367, + "grad_norm": 1.8280616998672485, + "learning_rate": 2.809017880279866e-06, + "loss": 1.3338, + "step": 542 + }, + { + "epoch": 0.012664464573609285, + "grad_norm": 1.6891363859176636, + "learning_rate": 2.81420057009588e-06, + "loss": 1.5505, + "step": 543 + }, + { + "epoch": 0.012687787712787203, + "grad_norm": 1.1501421928405762, + "learning_rate": 2.819383259911895e-06, + "loss": 1.6788, + "step": 544 + }, + { + "epoch": 0.01271111085196512, + "grad_norm": 1.107029914855957, + "learning_rate": 2.824565949727909e-06, + "loss": 1.3782, + "step": 545 + }, + { + "epoch": 0.012734433991143037, + "grad_norm": 0.9627429246902466, + "learning_rate": 2.829748639543923e-06, + "loss": 1.3155, + "step": 546 + }, + { + "epoch": 0.012757757130320955, + "grad_norm": 2.330007791519165, + "learning_rate": 2.834931329359938e-06, + "loss": 1.425, + "step": 547 + }, + { + "epoch": 0.012781080269498873, + "grad_norm": 1.4026503562927246, + "learning_rate": 2.8401140191759523e-06, + "loss": 1.5578, + "step": 548 + }, + { + "epoch": 0.012804403408676791, + "grad_norm": 0.9430487155914307, + "learning_rate": 2.845296708991967e-06, + "loss": 1.6075, + "step": 549 + }, + { + "epoch": 0.012827726547854709, + "grad_norm": 1.0779294967651367, + "learning_rate": 2.8504793988079814e-06, + "loss": 1.5169, + "step": 550 + }, + { + "epoch": 0.012851049687032627, + "grad_norm": 1.130324125289917, + "learning_rate": 2.855662088623996e-06, + "loss": 1.5016, + "step": 551 + }, + { + "epoch": 0.012874372826210545, + "grad_norm": 1.0127092599868774, + "learning_rate": 2.8608447784400105e-06, + "loss": 1.8715, + "step": 552 + }, + { + "epoch": 0.01289769596538846, + "grad_norm": 1.1831302642822266, + "learning_rate": 2.8660274682560253e-06, + "loss": 1.678, + "step": 553 + }, + { + "epoch": 0.012921019104566379, + "grad_norm": 1.3394455909729004, + "learning_rate": 2.8712101580720397e-06, + "loss": 1.4129, + "step": 554 + }, + { + "epoch": 0.012944342243744297, + "grad_norm": 1.2189030647277832, + "learning_rate": 2.8763928478880544e-06, + "loss": 1.7364, + "step": 555 + }, + { + "epoch": 0.012967665382922215, + "grad_norm": 1.2808138132095337, + "learning_rate": 2.8815755377040688e-06, + "loss": 1.6274, + "step": 556 + }, + { + "epoch": 0.012990988522100132, + "grad_norm": 1.0384689569473267, + "learning_rate": 2.8867582275200835e-06, + "loss": 1.5942, + "step": 557 + }, + { + "epoch": 0.01301431166127805, + "grad_norm": 1.8520807027816772, + "learning_rate": 2.891940917336098e-06, + "loss": 1.3067, + "step": 558 + }, + { + "epoch": 0.013037634800455968, + "grad_norm": 1.1817374229431152, + "learning_rate": 2.897123607152112e-06, + "loss": 1.6405, + "step": 559 + }, + { + "epoch": 0.013060957939633884, + "grad_norm": 1.1010823249816895, + "learning_rate": 2.9023062969681266e-06, + "loss": 1.4339, + "step": 560 + }, + { + "epoch": 0.013084281078811802, + "grad_norm": 1.2461942434310913, + "learning_rate": 2.907488986784141e-06, + "loss": 1.9866, + "step": 561 + }, + { + "epoch": 0.01310760421798972, + "grad_norm": 1.1503125429153442, + "learning_rate": 2.9126716766001557e-06, + "loss": 1.585, + "step": 562 + }, + { + "epoch": 0.013130927357167638, + "grad_norm": 1.542434573173523, + "learning_rate": 2.91785436641617e-06, + "loss": 1.4524, + "step": 563 + }, + { + "epoch": 0.013154250496345556, + "grad_norm": 1.0469673871994019, + "learning_rate": 2.923037056232185e-06, + "loss": 1.6884, + "step": 564 + }, + { + "epoch": 0.013177573635523474, + "grad_norm": 1.5137437582015991, + "learning_rate": 2.928219746048199e-06, + "loss": 1.5377, + "step": 565 + }, + { + "epoch": 0.013200896774701392, + "grad_norm": 1.1454534530639648, + "learning_rate": 2.933402435864214e-06, + "loss": 1.8508, + "step": 566 + }, + { + "epoch": 0.013224219913879308, + "grad_norm": 1.310381531715393, + "learning_rate": 2.9385851256802283e-06, + "loss": 1.5774, + "step": 567 + }, + { + "epoch": 0.013247543053057226, + "grad_norm": 1.1223838329315186, + "learning_rate": 2.943767815496243e-06, + "loss": 1.4496, + "step": 568 + }, + { + "epoch": 0.013270866192235144, + "grad_norm": 1.4537910223007202, + "learning_rate": 2.9489505053122574e-06, + "loss": 1.4423, + "step": 569 + }, + { + "epoch": 0.013294189331413062, + "grad_norm": 1.1783167123794556, + "learning_rate": 2.954133195128272e-06, + "loss": 1.9314, + "step": 570 + }, + { + "epoch": 0.01331751247059098, + "grad_norm": 1.211719274520874, + "learning_rate": 2.9593158849442865e-06, + "loss": 1.5366, + "step": 571 + }, + { + "epoch": 0.013340835609768897, + "grad_norm": 2.9552671909332275, + "learning_rate": 2.9644985747603004e-06, + "loss": 1.3431, + "step": 572 + }, + { + "epoch": 0.013364158748946815, + "grad_norm": 1.2814795970916748, + "learning_rate": 2.9696812645763152e-06, + "loss": 1.3879, + "step": 573 + }, + { + "epoch": 0.013387481888124731, + "grad_norm": 1.2598010301589966, + "learning_rate": 2.9748639543923296e-06, + "loss": 1.4775, + "step": 574 + }, + { + "epoch": 0.01341080502730265, + "grad_norm": 1.3874925374984741, + "learning_rate": 2.9800466442083443e-06, + "loss": 1.4012, + "step": 575 + }, + { + "epoch": 0.013434128166480567, + "grad_norm": 1.1846306324005127, + "learning_rate": 2.9852293340243587e-06, + "loss": 1.4491, + "step": 576 + }, + { + "epoch": 0.013457451305658485, + "grad_norm": 1.388150691986084, + "learning_rate": 2.9904120238403734e-06, + "loss": 1.6913, + "step": 577 + }, + { + "epoch": 0.013480774444836403, + "grad_norm": 1.8026880025863647, + "learning_rate": 2.995594713656388e-06, + "loss": 1.1754, + "step": 578 + }, + { + "epoch": 0.013504097584014321, + "grad_norm": 1.9366620779037476, + "learning_rate": 3.0007774034724026e-06, + "loss": 1.4406, + "step": 579 + }, + { + "epoch": 0.013527420723192239, + "grad_norm": 1.039657473564148, + "learning_rate": 3.005960093288417e-06, + "loss": 1.4823, + "step": 580 + }, + { + "epoch": 0.013550743862370155, + "grad_norm": 1.0928449630737305, + "learning_rate": 3.0111427831044317e-06, + "loss": 1.4502, + "step": 581 + }, + { + "epoch": 0.013574067001548073, + "grad_norm": 2.408292531967163, + "learning_rate": 3.016325472920446e-06, + "loss": 1.4778, + "step": 582 + }, + { + "epoch": 0.01359739014072599, + "grad_norm": 1.2284953594207764, + "learning_rate": 3.021508162736461e-06, + "loss": 1.5887, + "step": 583 + }, + { + "epoch": 0.013620713279903909, + "grad_norm": 1.3841763734817505, + "learning_rate": 3.026690852552475e-06, + "loss": 1.3778, + "step": 584 + }, + { + "epoch": 0.013644036419081827, + "grad_norm": 1.305172324180603, + "learning_rate": 3.03187354236849e-06, + "loss": 1.2837, + "step": 585 + }, + { + "epoch": 0.013667359558259744, + "grad_norm": 1.087904691696167, + "learning_rate": 3.037056232184504e-06, + "loss": 1.4361, + "step": 586 + }, + { + "epoch": 0.013690682697437662, + "grad_norm": 1.1818716526031494, + "learning_rate": 3.042238922000518e-06, + "loss": 1.4903, + "step": 587 + }, + { + "epoch": 0.013714005836615578, + "grad_norm": 0.9969412088394165, + "learning_rate": 3.047421611816533e-06, + "loss": 1.6923, + "step": 588 + }, + { + "epoch": 0.013737328975793496, + "grad_norm": 1.3729232549667358, + "learning_rate": 3.0526043016325473e-06, + "loss": 1.4219, + "step": 589 + }, + { + "epoch": 0.013760652114971414, + "grad_norm": 1.091769814491272, + "learning_rate": 3.057786991448562e-06, + "loss": 1.6978, + "step": 590 + }, + { + "epoch": 0.013783975254149332, + "grad_norm": 1.1668254137039185, + "learning_rate": 3.0629696812645764e-06, + "loss": 1.4609, + "step": 591 + }, + { + "epoch": 0.01380729839332725, + "grad_norm": 1.3739502429962158, + "learning_rate": 3.068152371080591e-06, + "loss": 1.7247, + "step": 592 + }, + { + "epoch": 0.013830621532505168, + "grad_norm": 1.480758547782898, + "learning_rate": 3.0733350608966055e-06, + "loss": 1.6142, + "step": 593 + }, + { + "epoch": 0.013853944671683086, + "grad_norm": 0.853581964969635, + "learning_rate": 3.0785177507126203e-06, + "loss": 1.5563, + "step": 594 + }, + { + "epoch": 0.013877267810861002, + "grad_norm": 1.144692063331604, + "learning_rate": 3.0837004405286347e-06, + "loss": 1.6145, + "step": 595 + }, + { + "epoch": 0.01390059095003892, + "grad_norm": 1.2413440942764282, + "learning_rate": 3.0888831303446494e-06, + "loss": 1.5762, + "step": 596 + }, + { + "epoch": 0.013923914089216838, + "grad_norm": 1.147834062576294, + "learning_rate": 3.0940658201606638e-06, + "loss": 1.4478, + "step": 597 + }, + { + "epoch": 0.013947237228394756, + "grad_norm": 1.0349398851394653, + "learning_rate": 3.0992485099766785e-06, + "loss": 1.612, + "step": 598 + }, + { + "epoch": 0.013970560367572674, + "grad_norm": 1.4780391454696655, + "learning_rate": 3.104431199792693e-06, + "loss": 1.5179, + "step": 599 + }, + { + "epoch": 0.013993883506750592, + "grad_norm": 1.1395933628082275, + "learning_rate": 3.109613889608707e-06, + "loss": 1.4845, + "step": 600 + }, + { + "epoch": 0.01401720664592851, + "grad_norm": 1.37168550491333, + "learning_rate": 3.1147965794247216e-06, + "loss": 1.581, + "step": 601 + }, + { + "epoch": 0.014040529785106426, + "grad_norm": 1.8260347843170166, + "learning_rate": 3.119979269240736e-06, + "loss": 1.1221, + "step": 602 + }, + { + "epoch": 0.014063852924284343, + "grad_norm": 2.5528669357299805, + "learning_rate": 3.1251619590567507e-06, + "loss": 1.255, + "step": 603 + }, + { + "epoch": 0.014087176063462261, + "grad_norm": 1.3272032737731934, + "learning_rate": 3.130344648872765e-06, + "loss": 1.2713, + "step": 604 + }, + { + "epoch": 0.01411049920264018, + "grad_norm": 1.147449254989624, + "learning_rate": 3.13552733868878e-06, + "loss": 1.3694, + "step": 605 + }, + { + "epoch": 0.014133822341818097, + "grad_norm": 1.173793077468872, + "learning_rate": 3.140710028504794e-06, + "loss": 1.5818, + "step": 606 + }, + { + "epoch": 0.014157145480996015, + "grad_norm": 1.2347713708877563, + "learning_rate": 3.145892718320809e-06, + "loss": 1.501, + "step": 607 + }, + { + "epoch": 0.014180468620173933, + "grad_norm": 1.3945446014404297, + "learning_rate": 3.1510754081368233e-06, + "loss": 1.8674, + "step": 608 + }, + { + "epoch": 0.01420379175935185, + "grad_norm": 1.239762544631958, + "learning_rate": 3.156258097952838e-06, + "loss": 1.2516, + "step": 609 + }, + { + "epoch": 0.014227114898529767, + "grad_norm": 1.552531361579895, + "learning_rate": 3.1614407877688524e-06, + "loss": 1.5358, + "step": 610 + }, + { + "epoch": 0.014250438037707685, + "grad_norm": 1.576997995376587, + "learning_rate": 3.166623477584867e-06, + "loss": 1.7601, + "step": 611 + }, + { + "epoch": 0.014273761176885603, + "grad_norm": 1.3251402378082275, + "learning_rate": 3.1718061674008815e-06, + "loss": 1.2758, + "step": 612 + }, + { + "epoch": 0.01429708431606352, + "grad_norm": 1.2837574481964111, + "learning_rate": 3.1769888572168963e-06, + "loss": 1.528, + "step": 613 + }, + { + "epoch": 0.014320407455241439, + "grad_norm": 0.9697505831718445, + "learning_rate": 3.1821715470329102e-06, + "loss": 1.6359, + "step": 614 + }, + { + "epoch": 0.014343730594419356, + "grad_norm": 1.2682685852050781, + "learning_rate": 3.1873542368489246e-06, + "loss": 1.4759, + "step": 615 + }, + { + "epoch": 0.014367053733597274, + "grad_norm": 0.9607746005058289, + "learning_rate": 3.1925369266649393e-06, + "loss": 1.7474, + "step": 616 + }, + { + "epoch": 0.01439037687277519, + "grad_norm": 1.056736946105957, + "learning_rate": 3.1977196164809537e-06, + "loss": 1.8812, + "step": 617 + }, + { + "epoch": 0.014413700011953108, + "grad_norm": 1.1990852355957031, + "learning_rate": 3.2029023062969684e-06, + "loss": 1.6217, + "step": 618 + }, + { + "epoch": 0.014437023151131026, + "grad_norm": 1.1339764595031738, + "learning_rate": 3.208084996112983e-06, + "loss": 1.3557, + "step": 619 + }, + { + "epoch": 0.014460346290308944, + "grad_norm": 1.0672523975372314, + "learning_rate": 3.2132676859289976e-06, + "loss": 1.8239, + "step": 620 + }, + { + "epoch": 0.014483669429486862, + "grad_norm": 1.4371954202651978, + "learning_rate": 3.218450375745012e-06, + "loss": 1.4571, + "step": 621 + }, + { + "epoch": 0.01450699256866478, + "grad_norm": 1.9893105030059814, + "learning_rate": 3.2236330655610267e-06, + "loss": 1.3716, + "step": 622 + }, + { + "epoch": 0.014530315707842698, + "grad_norm": 1.7084318399429321, + "learning_rate": 3.228815755377041e-06, + "loss": 1.5201, + "step": 623 + }, + { + "epoch": 0.014553638847020614, + "grad_norm": 1.308225154876709, + "learning_rate": 3.233998445193056e-06, + "loss": 1.9173, + "step": 624 + }, + { + "epoch": 0.014576961986198532, + "grad_norm": 0.9914215803146362, + "learning_rate": 3.23918113500907e-06, + "loss": 1.7351, + "step": 625 + }, + { + "epoch": 0.01460028512537645, + "grad_norm": 1.0292766094207764, + "learning_rate": 3.244363824825085e-06, + "loss": 1.4073, + "step": 626 + }, + { + "epoch": 0.014623608264554368, + "grad_norm": 1.0998982191085815, + "learning_rate": 3.2495465146410993e-06, + "loss": 1.5979, + "step": 627 + }, + { + "epoch": 0.014646931403732286, + "grad_norm": 1.1409685611724854, + "learning_rate": 3.254729204457113e-06, + "loss": 1.3442, + "step": 628 + }, + { + "epoch": 0.014670254542910204, + "grad_norm": 1.7685736417770386, + "learning_rate": 3.259911894273128e-06, + "loss": 1.251, + "step": 629 + }, + { + "epoch": 0.014693577682088121, + "grad_norm": 1.6536918878555298, + "learning_rate": 3.2650945840891423e-06, + "loss": 1.4698, + "step": 630 + }, + { + "epoch": 0.014716900821266038, + "grad_norm": 2.046391248703003, + "learning_rate": 3.270277273905157e-06, + "loss": 1.5142, + "step": 631 + }, + { + "epoch": 0.014740223960443955, + "grad_norm": 1.3458948135375977, + "learning_rate": 3.2754599637211714e-06, + "loss": 1.3999, + "step": 632 + }, + { + "epoch": 0.014763547099621873, + "grad_norm": 1.7265046834945679, + "learning_rate": 3.280642653537186e-06, + "loss": 1.2212, + "step": 633 + }, + { + "epoch": 0.014786870238799791, + "grad_norm": 1.3191124200820923, + "learning_rate": 3.2858253433532005e-06, + "loss": 1.4354, + "step": 634 + }, + { + "epoch": 0.01481019337797771, + "grad_norm": 1.2317379713058472, + "learning_rate": 3.2910080331692153e-06, + "loss": 1.5661, + "step": 635 + }, + { + "epoch": 0.014833516517155627, + "grad_norm": 1.400969386100769, + "learning_rate": 3.2961907229852297e-06, + "loss": 1.462, + "step": 636 + }, + { + "epoch": 0.014856839656333545, + "grad_norm": 2.060718059539795, + "learning_rate": 3.3013734128012444e-06, + "loss": 1.7522, + "step": 637 + }, + { + "epoch": 0.014880162795511461, + "grad_norm": 1.138715386390686, + "learning_rate": 3.3065561026172588e-06, + "loss": 1.4923, + "step": 638 + }, + { + "epoch": 0.014903485934689379, + "grad_norm": 1.1973599195480347, + "learning_rate": 3.3117387924332735e-06, + "loss": 1.4462, + "step": 639 + }, + { + "epoch": 0.014926809073867297, + "grad_norm": 1.266867756843567, + "learning_rate": 3.316921482249288e-06, + "loss": 1.3159, + "step": 640 + }, + { + "epoch": 0.014950132213045215, + "grad_norm": 3.4681708812713623, + "learning_rate": 3.322104172065302e-06, + "loss": 1.3566, + "step": 641 + }, + { + "epoch": 0.014973455352223133, + "grad_norm": 1.248502492904663, + "learning_rate": 3.3272868618813166e-06, + "loss": 1.6299, + "step": 642 + }, + { + "epoch": 0.01499677849140105, + "grad_norm": 1.561563491821289, + "learning_rate": 3.332469551697331e-06, + "loss": 1.3246, + "step": 643 + }, + { + "epoch": 0.015020101630578968, + "grad_norm": 1.1922053098678589, + "learning_rate": 3.3376522415133457e-06, + "loss": 1.6847, + "step": 644 + }, + { + "epoch": 0.015043424769756885, + "grad_norm": 1.0779014825820923, + "learning_rate": 3.34283493132936e-06, + "loss": 1.8025, + "step": 645 + }, + { + "epoch": 0.015066747908934803, + "grad_norm": 1.5236597061157227, + "learning_rate": 3.348017621145375e-06, + "loss": 1.3894, + "step": 646 + }, + { + "epoch": 0.01509007104811272, + "grad_norm": 1.2087934017181396, + "learning_rate": 3.353200310961389e-06, + "loss": 1.9119, + "step": 647 + }, + { + "epoch": 0.015113394187290638, + "grad_norm": 1.435085654258728, + "learning_rate": 3.358383000777404e-06, + "loss": 1.4334, + "step": 648 + }, + { + "epoch": 0.015136717326468556, + "grad_norm": 1.3662467002868652, + "learning_rate": 3.3635656905934183e-06, + "loss": 1.6717, + "step": 649 + }, + { + "epoch": 0.015160040465646474, + "grad_norm": 1.379262924194336, + "learning_rate": 3.368748380409433e-06, + "loss": 1.0914, + "step": 650 + }, + { + "epoch": 0.015183363604824392, + "grad_norm": 1.436503529548645, + "learning_rate": 3.3739310702254474e-06, + "loss": 1.296, + "step": 651 + }, + { + "epoch": 0.015206686744002308, + "grad_norm": 1.0189919471740723, + "learning_rate": 3.379113760041462e-06, + "loss": 1.5578, + "step": 652 + }, + { + "epoch": 0.015230009883180226, + "grad_norm": 1.3371915817260742, + "learning_rate": 3.3842964498574765e-06, + "loss": 1.3883, + "step": 653 + }, + { + "epoch": 0.015253333022358144, + "grad_norm": 1.152949333190918, + "learning_rate": 3.389479139673491e-06, + "loss": 1.3408, + "step": 654 + }, + { + "epoch": 0.015276656161536062, + "grad_norm": 0.865856945514679, + "learning_rate": 3.3946618294895052e-06, + "loss": 1.8154, + "step": 655 + }, + { + "epoch": 0.01529997930071398, + "grad_norm": 1.3607538938522339, + "learning_rate": 3.3998445193055196e-06, + "loss": 1.5139, + "step": 656 + }, + { + "epoch": 0.015323302439891898, + "grad_norm": 1.0469399690628052, + "learning_rate": 3.4050272091215343e-06, + "loss": 1.4246, + "step": 657 + }, + { + "epoch": 0.015346625579069816, + "grad_norm": 1.2417982816696167, + "learning_rate": 3.4102098989375487e-06, + "loss": 1.4392, + "step": 658 + }, + { + "epoch": 0.015369948718247732, + "grad_norm": 2.018418073654175, + "learning_rate": 3.4153925887535634e-06, + "loss": 1.5175, + "step": 659 + }, + { + "epoch": 0.01539327185742565, + "grad_norm": 1.2593055963516235, + "learning_rate": 3.420575278569578e-06, + "loss": 1.6338, + "step": 660 + }, + { + "epoch": 0.015416594996603568, + "grad_norm": 1.0297298431396484, + "learning_rate": 3.4257579683855926e-06, + "loss": 1.6309, + "step": 661 + }, + { + "epoch": 0.015439918135781485, + "grad_norm": 1.2963732481002808, + "learning_rate": 3.430940658201607e-06, + "loss": 1.3099, + "step": 662 + }, + { + "epoch": 0.015463241274959403, + "grad_norm": 1.0868266820907593, + "learning_rate": 3.4361233480176217e-06, + "loss": 1.4949, + "step": 663 + }, + { + "epoch": 0.015486564414137321, + "grad_norm": 1.156296968460083, + "learning_rate": 3.441306037833636e-06, + "loss": 1.7845, + "step": 664 + }, + { + "epoch": 0.015509887553315239, + "grad_norm": 1.412965178489685, + "learning_rate": 3.446488727649651e-06, + "loss": 1.19, + "step": 665 + }, + { + "epoch": 0.015533210692493155, + "grad_norm": 1.0419931411743164, + "learning_rate": 3.451671417465665e-06, + "loss": 1.7125, + "step": 666 + }, + { + "epoch": 0.015556533831671073, + "grad_norm": 1.035372018814087, + "learning_rate": 3.4568541072816795e-06, + "loss": 1.7003, + "step": 667 + }, + { + "epoch": 0.015579856970848991, + "grad_norm": 1.1559805870056152, + "learning_rate": 3.4620367970976943e-06, + "loss": 1.981, + "step": 668 + }, + { + "epoch": 0.015603180110026909, + "grad_norm": 0.8634515404701233, + "learning_rate": 3.467219486913708e-06, + "loss": 1.2609, + "step": 669 + }, + { + "epoch": 0.015626503249204827, + "grad_norm": 1.1953692436218262, + "learning_rate": 3.472402176729723e-06, + "loss": 1.3956, + "step": 670 + }, + { + "epoch": 0.015649826388382745, + "grad_norm": 0.9668301939964294, + "learning_rate": 3.4775848665457373e-06, + "loss": 1.0568, + "step": 671 + }, + { + "epoch": 0.015673149527560663, + "grad_norm": 2.4868035316467285, + "learning_rate": 3.482767556361752e-06, + "loss": 1.364, + "step": 672 + }, + { + "epoch": 0.01569647266673858, + "grad_norm": 1.4255839586257935, + "learning_rate": 3.4879502461777664e-06, + "loss": 1.5207, + "step": 673 + }, + { + "epoch": 0.0157197958059165, + "grad_norm": 1.2752389907836914, + "learning_rate": 3.493132935993781e-06, + "loss": 1.5141, + "step": 674 + }, + { + "epoch": 0.015743118945094416, + "grad_norm": 1.2186245918273926, + "learning_rate": 3.4983156258097955e-06, + "loss": 1.3655, + "step": 675 + }, + { + "epoch": 0.015766442084272334, + "grad_norm": 1.3544304370880127, + "learning_rate": 3.5034983156258103e-06, + "loss": 1.7428, + "step": 676 + }, + { + "epoch": 0.01578976522345025, + "grad_norm": 1.0968130826950073, + "learning_rate": 3.5086810054418247e-06, + "loss": 1.3491, + "step": 677 + }, + { + "epoch": 0.015813088362628167, + "grad_norm": 1.1593806743621826, + "learning_rate": 3.513863695257839e-06, + "loss": 1.6708, + "step": 678 + }, + { + "epoch": 0.015836411501806084, + "grad_norm": 1.0408954620361328, + "learning_rate": 3.5190463850738538e-06, + "loss": 1.6977, + "step": 679 + }, + { + "epoch": 0.015859734640984002, + "grad_norm": 1.196632742881775, + "learning_rate": 3.524229074889868e-06, + "loss": 1.2019, + "step": 680 + }, + { + "epoch": 0.01588305778016192, + "grad_norm": 1.2698166370391846, + "learning_rate": 3.529411764705883e-06, + "loss": 1.8457, + "step": 681 + }, + { + "epoch": 0.015906380919339838, + "grad_norm": 0.9075011014938354, + "learning_rate": 3.5345944545218972e-06, + "loss": 1.2717, + "step": 682 + }, + { + "epoch": 0.015929704058517756, + "grad_norm": 1.0426501035690308, + "learning_rate": 3.5397771443379116e-06, + "loss": 1.6601, + "step": 683 + }, + { + "epoch": 0.015953027197695674, + "grad_norm": 1.4904205799102783, + "learning_rate": 3.544959834153926e-06, + "loss": 1.6324, + "step": 684 + }, + { + "epoch": 0.015976350336873592, + "grad_norm": 1.0664643049240112, + "learning_rate": 3.5501425239699407e-06, + "loss": 1.4896, + "step": 685 + }, + { + "epoch": 0.01599967347605151, + "grad_norm": 1.3758978843688965, + "learning_rate": 3.555325213785955e-06, + "loss": 1.5457, + "step": 686 + }, + { + "epoch": 0.016022996615229428, + "grad_norm": 1.4759879112243652, + "learning_rate": 3.56050790360197e-06, + "loss": 1.3865, + "step": 687 + }, + { + "epoch": 0.016046319754407345, + "grad_norm": 1.4678733348846436, + "learning_rate": 3.565690593417984e-06, + "loss": 1.223, + "step": 688 + }, + { + "epoch": 0.016069642893585263, + "grad_norm": 1.2057251930236816, + "learning_rate": 3.570873283233999e-06, + "loss": 1.4864, + "step": 689 + }, + { + "epoch": 0.01609296603276318, + "grad_norm": 1.3976320028305054, + "learning_rate": 3.5760559730500133e-06, + "loss": 1.3371, + "step": 690 + }, + { + "epoch": 0.016116289171941096, + "grad_norm": 1.0588197708129883, + "learning_rate": 3.5812386628660276e-06, + "loss": 1.264, + "step": 691 + }, + { + "epoch": 0.016139612311119014, + "grad_norm": 0.891678512096405, + "learning_rate": 3.5864213526820424e-06, + "loss": 1.6566, + "step": 692 + }, + { + "epoch": 0.01616293545029693, + "grad_norm": 1.1149228811264038, + "learning_rate": 3.5916040424980567e-06, + "loss": 1.6862, + "step": 693 + }, + { + "epoch": 0.01618625858947485, + "grad_norm": 1.463218331336975, + "learning_rate": 3.5967867323140715e-06, + "loss": 1.5771, + "step": 694 + }, + { + "epoch": 0.016209581728652767, + "grad_norm": 1.291648030281067, + "learning_rate": 3.601969422130086e-06, + "loss": 1.443, + "step": 695 + }, + { + "epoch": 0.016232904867830685, + "grad_norm": 1.1534149646759033, + "learning_rate": 3.6071521119461002e-06, + "loss": 1.76, + "step": 696 + }, + { + "epoch": 0.016256228007008603, + "grad_norm": 1.3349847793579102, + "learning_rate": 3.6123348017621146e-06, + "loss": 2.0584, + "step": 697 + }, + { + "epoch": 0.01627955114618652, + "grad_norm": 1.665682315826416, + "learning_rate": 3.6175174915781293e-06, + "loss": 1.5989, + "step": 698 + }, + { + "epoch": 0.01630287428536444, + "grad_norm": 1.6486263275146484, + "learning_rate": 3.6227001813941437e-06, + "loss": 1.7698, + "step": 699 + }, + { + "epoch": 0.016326197424542357, + "grad_norm": 1.5153722763061523, + "learning_rate": 3.6278828712101584e-06, + "loss": 1.3312, + "step": 700 + }, + { + "epoch": 0.016349520563720275, + "grad_norm": 1.3090248107910156, + "learning_rate": 3.633065561026173e-06, + "loss": 1.0735, + "step": 701 + }, + { + "epoch": 0.016372843702898193, + "grad_norm": 1.5462753772735596, + "learning_rate": 3.6382482508421876e-06, + "loss": 1.5408, + "step": 702 + }, + { + "epoch": 0.01639616684207611, + "grad_norm": 1.3447730541229248, + "learning_rate": 3.643430940658202e-06, + "loss": 1.5295, + "step": 703 + }, + { + "epoch": 0.01641948998125403, + "grad_norm": 1.232865571975708, + "learning_rate": 3.6486136304742163e-06, + "loss": 1.8686, + "step": 704 + }, + { + "epoch": 0.016442813120431946, + "grad_norm": 0.9742329120635986, + "learning_rate": 3.653796320290231e-06, + "loss": 1.5951, + "step": 705 + }, + { + "epoch": 0.01646613625960986, + "grad_norm": 1.1572047472000122, + "learning_rate": 3.6589790101062454e-06, + "loss": 1.5068, + "step": 706 + }, + { + "epoch": 0.01648945939878778, + "grad_norm": 1.2024304866790771, + "learning_rate": 3.66416169992226e-06, + "loss": 1.3933, + "step": 707 + }, + { + "epoch": 0.016512782537965696, + "grad_norm": 2.442342758178711, + "learning_rate": 3.6693443897382745e-06, + "loss": 1.0126, + "step": 708 + }, + { + "epoch": 0.016536105677143614, + "grad_norm": 1.2786589860916138, + "learning_rate": 3.6745270795542893e-06, + "loss": 1.6902, + "step": 709 + }, + { + "epoch": 0.016559428816321532, + "grad_norm": 0.9200882315635681, + "learning_rate": 3.679709769370303e-06, + "loss": 1.3918, + "step": 710 + }, + { + "epoch": 0.01658275195549945, + "grad_norm": 1.3768819570541382, + "learning_rate": 3.684892459186318e-06, + "loss": 1.6518, + "step": 711 + }, + { + "epoch": 0.016606075094677368, + "grad_norm": 1.274484395980835, + "learning_rate": 3.6900751490023323e-06, + "loss": 1.3728, + "step": 712 + }, + { + "epoch": 0.016629398233855286, + "grad_norm": 1.1752501726150513, + "learning_rate": 3.695257838818347e-06, + "loss": 1.4234, + "step": 713 + }, + { + "epoch": 0.016652721373033204, + "grad_norm": 1.4458903074264526, + "learning_rate": 3.7004405286343614e-06, + "loss": 1.5695, + "step": 714 + }, + { + "epoch": 0.01667604451221112, + "grad_norm": 1.2630547285079956, + "learning_rate": 3.705623218450376e-06, + "loss": 1.5334, + "step": 715 + }, + { + "epoch": 0.01669936765138904, + "grad_norm": 1.3754082918167114, + "learning_rate": 3.7108059082663905e-06, + "loss": 1.4807, + "step": 716 + }, + { + "epoch": 0.016722690790566958, + "grad_norm": 1.4704689979553223, + "learning_rate": 3.715988598082405e-06, + "loss": 1.5409, + "step": 717 + }, + { + "epoch": 0.016746013929744875, + "grad_norm": 1.4692633152008057, + "learning_rate": 3.7211712878984197e-06, + "loss": 1.5922, + "step": 718 + }, + { + "epoch": 0.016769337068922793, + "grad_norm": 1.2148405313491821, + "learning_rate": 3.726353977714434e-06, + "loss": 1.8115, + "step": 719 + }, + { + "epoch": 0.016792660208100708, + "grad_norm": 1.5564905405044556, + "learning_rate": 3.7315366675304488e-06, + "loss": 1.4189, + "step": 720 + }, + { + "epoch": 0.016815983347278626, + "grad_norm": 1.130292296409607, + "learning_rate": 3.736719357346463e-06, + "loss": 1.4455, + "step": 721 + }, + { + "epoch": 0.016839306486456544, + "grad_norm": 2.0609545707702637, + "learning_rate": 3.741902047162478e-06, + "loss": 1.6052, + "step": 722 + }, + { + "epoch": 0.01686262962563446, + "grad_norm": 1.0422543287277222, + "learning_rate": 3.7470847369784922e-06, + "loss": 1.5889, + "step": 723 + }, + { + "epoch": 0.01688595276481238, + "grad_norm": 1.7926782369613647, + "learning_rate": 3.7522674267945066e-06, + "loss": 1.2304, + "step": 724 + }, + { + "epoch": 0.016909275903990297, + "grad_norm": 1.2486250400543213, + "learning_rate": 3.757450116610521e-06, + "loss": 1.7512, + "step": 725 + }, + { + "epoch": 0.016932599043168215, + "grad_norm": 1.6907048225402832, + "learning_rate": 3.7626328064265357e-06, + "loss": 1.2031, + "step": 726 + }, + { + "epoch": 0.016955922182346133, + "grad_norm": 1.2899296283721924, + "learning_rate": 3.76781549624255e-06, + "loss": 1.3111, + "step": 727 + }, + { + "epoch": 0.01697924532152405, + "grad_norm": 2.320288896560669, + "learning_rate": 3.7729981860585644e-06, + "loss": 1.2764, + "step": 728 + }, + { + "epoch": 0.01700256846070197, + "grad_norm": 1.4165383577346802, + "learning_rate": 3.778180875874579e-06, + "loss": 1.2847, + "step": 729 + }, + { + "epoch": 0.017025891599879887, + "grad_norm": 1.1537601947784424, + "learning_rate": 3.7833635656905935e-06, + "loss": 1.6002, + "step": 730 + }, + { + "epoch": 0.017049214739057805, + "grad_norm": 1.3128899335861206, + "learning_rate": 3.7885462555066083e-06, + "loss": 1.4159, + "step": 731 + }, + { + "epoch": 0.017072537878235722, + "grad_norm": 0.9494642615318298, + "learning_rate": 3.7937289453226226e-06, + "loss": 1.5425, + "step": 732 + }, + { + "epoch": 0.01709586101741364, + "grad_norm": 1.8949923515319824, + "learning_rate": 3.7989116351386374e-06, + "loss": 1.109, + "step": 733 + }, + { + "epoch": 0.017119184156591555, + "grad_norm": 1.3136776685714722, + "learning_rate": 3.8040943249546517e-06, + "loss": 1.4208, + "step": 734 + }, + { + "epoch": 0.017142507295769473, + "grad_norm": 1.0108048915863037, + "learning_rate": 3.8092770147706665e-06, + "loss": 1.3101, + "step": 735 + }, + { + "epoch": 0.01716583043494739, + "grad_norm": 1.1397989988327026, + "learning_rate": 3.814459704586681e-06, + "loss": 1.6643, + "step": 736 + }, + { + "epoch": 0.01718915357412531, + "grad_norm": 0.9662717580795288, + "learning_rate": 3.819642394402696e-06, + "loss": 1.5524, + "step": 737 + }, + { + "epoch": 0.017212476713303226, + "grad_norm": 1.5264514684677124, + "learning_rate": 3.82482508421871e-06, + "loss": 1.6702, + "step": 738 + }, + { + "epoch": 0.017235799852481144, + "grad_norm": 1.1797709465026855, + "learning_rate": 3.830007774034724e-06, + "loss": 1.5751, + "step": 739 + }, + { + "epoch": 0.017259122991659062, + "grad_norm": 1.3964486122131348, + "learning_rate": 3.835190463850739e-06, + "loss": 1.3497, + "step": 740 + }, + { + "epoch": 0.01728244613083698, + "grad_norm": 1.0540798902511597, + "learning_rate": 3.840373153666753e-06, + "loss": 1.623, + "step": 741 + }, + { + "epoch": 0.017305769270014898, + "grad_norm": 1.8619107007980347, + "learning_rate": 3.845555843482767e-06, + "loss": 1.836, + "step": 742 + }, + { + "epoch": 0.017329092409192816, + "grad_norm": 1.190048098564148, + "learning_rate": 3.8507385332987826e-06, + "loss": 1.6031, + "step": 743 + }, + { + "epoch": 0.017352415548370734, + "grad_norm": 1.32784903049469, + "learning_rate": 3.855921223114797e-06, + "loss": 1.6144, + "step": 744 + }, + { + "epoch": 0.01737573868754865, + "grad_norm": 1.7393810749053955, + "learning_rate": 3.861103912930811e-06, + "loss": 1.4898, + "step": 745 + }, + { + "epoch": 0.01739906182672657, + "grad_norm": 1.008122444152832, + "learning_rate": 3.866286602746826e-06, + "loss": 1.6506, + "step": 746 + }, + { + "epoch": 0.017422384965904487, + "grad_norm": 1.3282239437103271, + "learning_rate": 3.871469292562841e-06, + "loss": 1.5178, + "step": 747 + }, + { + "epoch": 0.017445708105082402, + "grad_norm": 1.4479358196258545, + "learning_rate": 3.876651982378855e-06, + "loss": 1.5896, + "step": 748 + }, + { + "epoch": 0.01746903124426032, + "grad_norm": 1.9100661277770996, + "learning_rate": 3.8818346721948695e-06, + "loss": 1.2946, + "step": 749 + }, + { + "epoch": 0.017492354383438238, + "grad_norm": 1.269235610961914, + "learning_rate": 3.887017362010884e-06, + "loss": 1.5707, + "step": 750 + }, + { + "epoch": 0.017515677522616156, + "grad_norm": 1.3187369108200073, + "learning_rate": 3.892200051826899e-06, + "loss": 1.8153, + "step": 751 + }, + { + "epoch": 0.017539000661794073, + "grad_norm": 1.3091131448745728, + "learning_rate": 3.8973827416429125e-06, + "loss": 1.5973, + "step": 752 + }, + { + "epoch": 0.01756232380097199, + "grad_norm": 1.4826890230178833, + "learning_rate": 3.902565431458927e-06, + "loss": 1.3277, + "step": 753 + }, + { + "epoch": 0.01758564694014991, + "grad_norm": 1.2626949548721313, + "learning_rate": 3.907748121274942e-06, + "loss": 1.5531, + "step": 754 + }, + { + "epoch": 0.017608970079327827, + "grad_norm": 1.1990412473678589, + "learning_rate": 3.912930811090956e-06, + "loss": 1.349, + "step": 755 + }, + { + "epoch": 0.017632293218505745, + "grad_norm": 1.3036906719207764, + "learning_rate": 3.918113500906971e-06, + "loss": 1.5648, + "step": 756 + }, + { + "epoch": 0.017655616357683663, + "grad_norm": 1.3129525184631348, + "learning_rate": 3.923296190722985e-06, + "loss": 1.7147, + "step": 757 + }, + { + "epoch": 0.01767893949686158, + "grad_norm": 1.4686280488967896, + "learning_rate": 3.928478880539e-06, + "loss": 1.6136, + "step": 758 + }, + { + "epoch": 0.0177022626360395, + "grad_norm": 1.6845604181289673, + "learning_rate": 3.933661570355015e-06, + "loss": 1.763, + "step": 759 + }, + { + "epoch": 0.017725585775217417, + "grad_norm": 2.019049644470215, + "learning_rate": 3.938844260171029e-06, + "loss": 1.2543, + "step": 760 + }, + { + "epoch": 0.017748908914395334, + "grad_norm": 1.4184072017669678, + "learning_rate": 3.944026949987043e-06, + "loss": 1.596, + "step": 761 + }, + { + "epoch": 0.017772232053573252, + "grad_norm": 1.127982497215271, + "learning_rate": 3.9492096398030585e-06, + "loss": 1.5485, + "step": 762 + }, + { + "epoch": 0.017795555192751167, + "grad_norm": 1.5097321271896362, + "learning_rate": 3.954392329619073e-06, + "loss": 1.5452, + "step": 763 + }, + { + "epoch": 0.017818878331929085, + "grad_norm": 1.3832807540893555, + "learning_rate": 3.959575019435087e-06, + "loss": 1.3865, + "step": 764 + }, + { + "epoch": 0.017842201471107003, + "grad_norm": 1.065623164176941, + "learning_rate": 3.964757709251102e-06, + "loss": 1.2218, + "step": 765 + }, + { + "epoch": 0.01786552461028492, + "grad_norm": 1.2190065383911133, + "learning_rate": 3.969940399067116e-06, + "loss": 1.2169, + "step": 766 + }, + { + "epoch": 0.01788884774946284, + "grad_norm": 1.741749882698059, + "learning_rate": 3.97512308888313e-06, + "loss": 1.7316, + "step": 767 + }, + { + "epoch": 0.017912170888640756, + "grad_norm": 1.2072060108184814, + "learning_rate": 3.980305778699145e-06, + "loss": 1.815, + "step": 768 + }, + { + "epoch": 0.017935494027818674, + "grad_norm": 1.4645625352859497, + "learning_rate": 3.98548846851516e-06, + "loss": 1.2218, + "step": 769 + }, + { + "epoch": 0.017958817166996592, + "grad_norm": 1.4466350078582764, + "learning_rate": 3.990671158331174e-06, + "loss": 1.7291, + "step": 770 + }, + { + "epoch": 0.01798214030617451, + "grad_norm": 1.364358901977539, + "learning_rate": 3.9958538481471885e-06, + "loss": 1.6527, + "step": 771 + }, + { + "epoch": 0.018005463445352428, + "grad_norm": 1.2262394428253174, + "learning_rate": 4.001036537963203e-06, + "loss": 1.5522, + "step": 772 + }, + { + "epoch": 0.018028786584530346, + "grad_norm": 1.694001317024231, + "learning_rate": 4.006219227779218e-06, + "loss": 1.5791, + "step": 773 + }, + { + "epoch": 0.018052109723708264, + "grad_norm": 0.7941157817840576, + "learning_rate": 4.011401917595232e-06, + "loss": 1.23, + "step": 774 + }, + { + "epoch": 0.01807543286288618, + "grad_norm": 1.1942747831344604, + "learning_rate": 4.016584607411247e-06, + "loss": 1.4316, + "step": 775 + }, + { + "epoch": 0.0180987560020641, + "grad_norm": 1.5809072256088257, + "learning_rate": 4.021767297227261e-06, + "loss": 1.7361, + "step": 776 + }, + { + "epoch": 0.018122079141242014, + "grad_norm": 1.2918401956558228, + "learning_rate": 4.026949987043276e-06, + "loss": 1.3285, + "step": 777 + }, + { + "epoch": 0.018145402280419932, + "grad_norm": 1.966123342514038, + "learning_rate": 4.032132676859291e-06, + "loss": 1.2037, + "step": 778 + }, + { + "epoch": 0.01816872541959785, + "grad_norm": 1.3362590074539185, + "learning_rate": 4.037315366675304e-06, + "loss": 1.3811, + "step": 779 + }, + { + "epoch": 0.018192048558775768, + "grad_norm": 1.0375605821609497, + "learning_rate": 4.042498056491319e-06, + "loss": 1.481, + "step": 780 + }, + { + "epoch": 0.018215371697953685, + "grad_norm": 2.414684295654297, + "learning_rate": 4.047680746307334e-06, + "loss": 1.773, + "step": 781 + }, + { + "epoch": 0.018238694837131603, + "grad_norm": 1.2252676486968994, + "learning_rate": 4.052863436123348e-06, + "loss": 1.514, + "step": 782 + }, + { + "epoch": 0.01826201797630952, + "grad_norm": 1.517791748046875, + "learning_rate": 4.058046125939362e-06, + "loss": 1.3442, + "step": 783 + }, + { + "epoch": 0.01828534111548744, + "grad_norm": 1.0303611755371094, + "learning_rate": 4.0632288157553776e-06, + "loss": 1.5593, + "step": 784 + }, + { + "epoch": 0.018308664254665357, + "grad_norm": 1.3615033626556396, + "learning_rate": 4.068411505571392e-06, + "loss": 1.6971, + "step": 785 + }, + { + "epoch": 0.018331987393843275, + "grad_norm": 1.1224147081375122, + "learning_rate": 4.073594195387406e-06, + "loss": 1.2134, + "step": 786 + }, + { + "epoch": 0.018355310533021193, + "grad_norm": 1.3592679500579834, + "learning_rate": 4.078776885203421e-06, + "loss": 1.7391, + "step": 787 + }, + { + "epoch": 0.01837863367219911, + "grad_norm": 1.6286187171936035, + "learning_rate": 4.083959575019436e-06, + "loss": 1.7279, + "step": 788 + }, + { + "epoch": 0.01840195681137703, + "grad_norm": 1.2597742080688477, + "learning_rate": 4.08914226483545e-06, + "loss": 1.5227, + "step": 789 + }, + { + "epoch": 0.018425279950554947, + "grad_norm": 1.2776849269866943, + "learning_rate": 4.0943249546514645e-06, + "loss": 1.3575, + "step": 790 + }, + { + "epoch": 0.01844860308973286, + "grad_norm": 1.2529163360595703, + "learning_rate": 4.099507644467479e-06, + "loss": 1.6356, + "step": 791 + }, + { + "epoch": 0.01847192622891078, + "grad_norm": 1.184187650680542, + "learning_rate": 4.104690334283494e-06, + "loss": 1.734, + "step": 792 + }, + { + "epoch": 0.018495249368088697, + "grad_norm": 1.176222562789917, + "learning_rate": 4.1098730240995075e-06, + "loss": 1.5206, + "step": 793 + }, + { + "epoch": 0.018518572507266615, + "grad_norm": 1.0694701671600342, + "learning_rate": 4.115055713915522e-06, + "loss": 1.1824, + "step": 794 + }, + { + "epoch": 0.018541895646444533, + "grad_norm": 1.5169551372528076, + "learning_rate": 4.120238403731537e-06, + "loss": 1.3817, + "step": 795 + }, + { + "epoch": 0.01856521878562245, + "grad_norm": 1.0996246337890625, + "learning_rate": 4.125421093547551e-06, + "loss": 1.0921, + "step": 796 + }, + { + "epoch": 0.01858854192480037, + "grad_norm": 1.0202140808105469, + "learning_rate": 4.130603783363566e-06, + "loss": 1.2687, + "step": 797 + }, + { + "epoch": 0.018611865063978286, + "grad_norm": 2.089864730834961, + "learning_rate": 4.13578647317958e-06, + "loss": 1.5417, + "step": 798 + }, + { + "epoch": 0.018635188203156204, + "grad_norm": 1.1465847492218018, + "learning_rate": 4.140969162995595e-06, + "loss": 1.3415, + "step": 799 + }, + { + "epoch": 0.018658511342334122, + "grad_norm": 1.1085565090179443, + "learning_rate": 4.14615185281161e-06, + "loss": 1.4662, + "step": 800 + }, + { + "epoch": 0.01868183448151204, + "grad_norm": 1.2206768989562988, + "learning_rate": 4.151334542627624e-06, + "loss": 1.4954, + "step": 801 + }, + { + "epoch": 0.018705157620689958, + "grad_norm": 1.1540756225585938, + "learning_rate": 4.156517232443638e-06, + "loss": 1.4953, + "step": 802 + }, + { + "epoch": 0.018728480759867876, + "grad_norm": 1.9667025804519653, + "learning_rate": 4.1616999222596535e-06, + "loss": 1.1834, + "step": 803 + }, + { + "epoch": 0.018751803899045794, + "grad_norm": 1.2202988862991333, + "learning_rate": 4.166882612075668e-06, + "loss": 1.7045, + "step": 804 + }, + { + "epoch": 0.018775127038223708, + "grad_norm": 1.2399123907089233, + "learning_rate": 4.172065301891682e-06, + "loss": 1.4937, + "step": 805 + }, + { + "epoch": 0.018798450177401626, + "grad_norm": 1.5780203342437744, + "learning_rate": 4.177247991707697e-06, + "loss": 1.6386, + "step": 806 + }, + { + "epoch": 0.018821773316579544, + "grad_norm": 1.524564266204834, + "learning_rate": 4.182430681523711e-06, + "loss": 1.4951, + "step": 807 + }, + { + "epoch": 0.01884509645575746, + "grad_norm": 1.342991590499878, + "learning_rate": 4.187613371339725e-06, + "loss": 1.3007, + "step": 808 + }, + { + "epoch": 0.01886841959493538, + "grad_norm": 1.320813775062561, + "learning_rate": 4.19279606115574e-06, + "loss": 1.2112, + "step": 809 + }, + { + "epoch": 0.018891742734113297, + "grad_norm": 1.2329927682876587, + "learning_rate": 4.197978750971755e-06, + "loss": 1.333, + "step": 810 + }, + { + "epoch": 0.018915065873291215, + "grad_norm": 1.3429094552993774, + "learning_rate": 4.203161440787769e-06, + "loss": 1.4805, + "step": 811 + }, + { + "epoch": 0.018938389012469133, + "grad_norm": 1.643641710281372, + "learning_rate": 4.2083441306037835e-06, + "loss": 1.5665, + "step": 812 + }, + { + "epoch": 0.01896171215164705, + "grad_norm": 1.111887812614441, + "learning_rate": 4.213526820419798e-06, + "loss": 1.6087, + "step": 813 + }, + { + "epoch": 0.01898503529082497, + "grad_norm": 1.3594610691070557, + "learning_rate": 4.218709510235813e-06, + "loss": 1.7666, + "step": 814 + }, + { + "epoch": 0.019008358430002887, + "grad_norm": 1.2298046350479126, + "learning_rate": 4.223892200051827e-06, + "loss": 1.5032, + "step": 815 + }, + { + "epoch": 0.019031681569180805, + "grad_norm": 1.2679171562194824, + "learning_rate": 4.229074889867842e-06, + "loss": 1.4375, + "step": 816 + }, + { + "epoch": 0.019055004708358723, + "grad_norm": 1.0543935298919678, + "learning_rate": 4.234257579683856e-06, + "loss": 1.6645, + "step": 817 + }, + { + "epoch": 0.01907832784753664, + "grad_norm": 1.2821168899536133, + "learning_rate": 4.239440269499871e-06, + "loss": 1.1945, + "step": 818 + }, + { + "epoch": 0.01910165098671456, + "grad_norm": 1.5575084686279297, + "learning_rate": 4.244622959315886e-06, + "loss": 1.3262, + "step": 819 + }, + { + "epoch": 0.019124974125892473, + "grad_norm": 1.2359989881515503, + "learning_rate": 4.2498056491319e-06, + "loss": 1.4127, + "step": 820 + }, + { + "epoch": 0.01914829726507039, + "grad_norm": 1.0559273958206177, + "learning_rate": 4.254988338947914e-06, + "loss": 1.4455, + "step": 821 + }, + { + "epoch": 0.01917162040424831, + "grad_norm": 1.3651732206344604, + "learning_rate": 4.260171028763929e-06, + "loss": 1.245, + "step": 822 + }, + { + "epoch": 0.019194943543426227, + "grad_norm": 1.0067932605743408, + "learning_rate": 4.265353718579943e-06, + "loss": 1.4954, + "step": 823 + }, + { + "epoch": 0.019218266682604145, + "grad_norm": 1.7477822303771973, + "learning_rate": 4.270536408395957e-06, + "loss": 1.8164, + "step": 824 + }, + { + "epoch": 0.019241589821782062, + "grad_norm": 1.1976604461669922, + "learning_rate": 4.2757190982119726e-06, + "loss": 1.4552, + "step": 825 + }, + { + "epoch": 0.01926491296095998, + "grad_norm": 1.306269884109497, + "learning_rate": 4.280901788027987e-06, + "loss": 1.6348, + "step": 826 + }, + { + "epoch": 0.019288236100137898, + "grad_norm": 1.5786314010620117, + "learning_rate": 4.286084477844001e-06, + "loss": 1.4592, + "step": 827 + }, + { + "epoch": 0.019311559239315816, + "grad_norm": 1.4481762647628784, + "learning_rate": 4.291267167660016e-06, + "loss": 1.3409, + "step": 828 + }, + { + "epoch": 0.019334882378493734, + "grad_norm": 1.1410714387893677, + "learning_rate": 4.296449857476031e-06, + "loss": 1.5746, + "step": 829 + }, + { + "epoch": 0.019358205517671652, + "grad_norm": 1.363434076309204, + "learning_rate": 4.301632547292045e-06, + "loss": 1.0836, + "step": 830 + }, + { + "epoch": 0.01938152865684957, + "grad_norm": 1.1413646936416626, + "learning_rate": 4.3068152371080595e-06, + "loss": 1.8687, + "step": 831 + }, + { + "epoch": 0.019404851796027488, + "grad_norm": 1.9734309911727905, + "learning_rate": 4.311997926924074e-06, + "loss": 1.3295, + "step": 832 + }, + { + "epoch": 0.019428174935205406, + "grad_norm": 1.5119333267211914, + "learning_rate": 4.317180616740089e-06, + "loss": 1.6817, + "step": 833 + }, + { + "epoch": 0.01945149807438332, + "grad_norm": 1.3933395147323608, + "learning_rate": 4.3223633065561025e-06, + "loss": 1.5288, + "step": 834 + }, + { + "epoch": 0.019474821213561238, + "grad_norm": 1.3713746070861816, + "learning_rate": 4.327545996372117e-06, + "loss": 1.6361, + "step": 835 + }, + { + "epoch": 0.019498144352739156, + "grad_norm": 1.1849229335784912, + "learning_rate": 4.332728686188132e-06, + "loss": 1.6611, + "step": 836 + }, + { + "epoch": 0.019521467491917074, + "grad_norm": 2.122307777404785, + "learning_rate": 4.337911376004146e-06, + "loss": 1.6258, + "step": 837 + }, + { + "epoch": 0.01954479063109499, + "grad_norm": 1.221781611442566, + "learning_rate": 4.343094065820161e-06, + "loss": 1.9081, + "step": 838 + }, + { + "epoch": 0.01956811377027291, + "grad_norm": 1.2895511388778687, + "learning_rate": 4.348276755636175e-06, + "loss": 1.2742, + "step": 839 + }, + { + "epoch": 0.019591436909450827, + "grad_norm": 1.1531336307525635, + "learning_rate": 4.35345944545219e-06, + "loss": 1.587, + "step": 840 + }, + { + "epoch": 0.019614760048628745, + "grad_norm": 1.3979135751724243, + "learning_rate": 4.358642135268205e-06, + "loss": 1.5208, + "step": 841 + }, + { + "epoch": 0.019638083187806663, + "grad_norm": 1.3758100271224976, + "learning_rate": 4.363824825084219e-06, + "loss": 1.246, + "step": 842 + }, + { + "epoch": 0.01966140632698458, + "grad_norm": 1.3759677410125732, + "learning_rate": 4.369007514900233e-06, + "loss": 1.7344, + "step": 843 + }, + { + "epoch": 0.0196847294661625, + "grad_norm": 1.5575461387634277, + "learning_rate": 4.3741902047162485e-06, + "loss": 1.5554, + "step": 844 + }, + { + "epoch": 0.019708052605340417, + "grad_norm": 1.5018088817596436, + "learning_rate": 4.379372894532263e-06, + "loss": 1.3433, + "step": 845 + }, + { + "epoch": 0.019731375744518335, + "grad_norm": 1.4393954277038574, + "learning_rate": 4.384555584348277e-06, + "loss": 1.7277, + "step": 846 + }, + { + "epoch": 0.019754698883696253, + "grad_norm": 1.0249360799789429, + "learning_rate": 4.389738274164292e-06, + "loss": 1.6538, + "step": 847 + }, + { + "epoch": 0.019778022022874167, + "grad_norm": 1.128587007522583, + "learning_rate": 4.394920963980306e-06, + "loss": 1.2935, + "step": 848 + }, + { + "epoch": 0.019801345162052085, + "grad_norm": 1.301287293434143, + "learning_rate": 4.40010365379632e-06, + "loss": 1.4193, + "step": 849 + }, + { + "epoch": 0.019824668301230003, + "grad_norm": 1.5180747509002686, + "learning_rate": 4.405286343612335e-06, + "loss": 1.2061, + "step": 850 + }, + { + "epoch": 0.01984799144040792, + "grad_norm": 0.9110321402549744, + "learning_rate": 4.41046903342835e-06, + "loss": 1.2803, + "step": 851 + }, + { + "epoch": 0.01987131457958584, + "grad_norm": 1.68843674659729, + "learning_rate": 4.415651723244364e-06, + "loss": 1.2037, + "step": 852 + }, + { + "epoch": 0.019894637718763757, + "grad_norm": 1.2198610305786133, + "learning_rate": 4.4208344130603785e-06, + "loss": 1.6652, + "step": 853 + }, + { + "epoch": 0.019917960857941674, + "grad_norm": 1.579087257385254, + "learning_rate": 4.426017102876393e-06, + "loss": 1.5859, + "step": 854 + }, + { + "epoch": 0.019941283997119592, + "grad_norm": 1.7198874950408936, + "learning_rate": 4.431199792692408e-06, + "loss": 1.4662, + "step": 855 + }, + { + "epoch": 0.01996460713629751, + "grad_norm": 2.817178726196289, + "learning_rate": 4.436382482508422e-06, + "loss": 1.3427, + "step": 856 + }, + { + "epoch": 0.019987930275475428, + "grad_norm": 1.4508287906646729, + "learning_rate": 4.441565172324437e-06, + "loss": 1.2893, + "step": 857 + }, + { + "epoch": 0.020011253414653346, + "grad_norm": 1.29767644405365, + "learning_rate": 4.446747862140451e-06, + "loss": 1.5759, + "step": 858 + }, + { + "epoch": 0.020034576553831264, + "grad_norm": 1.84248685836792, + "learning_rate": 4.451930551956466e-06, + "loss": 2.1373, + "step": 859 + }, + { + "epoch": 0.020057899693009182, + "grad_norm": 1.6153839826583862, + "learning_rate": 4.457113241772481e-06, + "loss": 1.3915, + "step": 860 + }, + { + "epoch": 0.0200812228321871, + "grad_norm": 1.3203104734420776, + "learning_rate": 4.462295931588495e-06, + "loss": 1.569, + "step": 861 + }, + { + "epoch": 0.020104545971365014, + "grad_norm": 1.6475995779037476, + "learning_rate": 4.467478621404509e-06, + "loss": 1.6446, + "step": 862 + }, + { + "epoch": 0.020127869110542932, + "grad_norm": 1.165834665298462, + "learning_rate": 4.472661311220524e-06, + "loss": 1.7323, + "step": 863 + }, + { + "epoch": 0.02015119224972085, + "grad_norm": 1.3182172775268555, + "learning_rate": 4.477844001036538e-06, + "loss": 1.6265, + "step": 864 + }, + { + "epoch": 0.020174515388898768, + "grad_norm": 1.1236745119094849, + "learning_rate": 4.483026690852552e-06, + "loss": 1.2358, + "step": 865 + }, + { + "epoch": 0.020197838528076686, + "grad_norm": 1.2104893922805786, + "learning_rate": 4.4882093806685676e-06, + "loss": 1.4677, + "step": 866 + }, + { + "epoch": 0.020221161667254604, + "grad_norm": 1.6824678182601929, + "learning_rate": 4.493392070484582e-06, + "loss": 1.5802, + "step": 867 + }, + { + "epoch": 0.02024448480643252, + "grad_norm": 1.0679930448532104, + "learning_rate": 4.498574760300596e-06, + "loss": 1.4105, + "step": 868 + }, + { + "epoch": 0.02026780794561044, + "grad_norm": 1.3705253601074219, + "learning_rate": 4.503757450116611e-06, + "loss": 1.5095, + "step": 869 + }, + { + "epoch": 0.020291131084788357, + "grad_norm": 1.307491660118103, + "learning_rate": 4.508940139932626e-06, + "loss": 1.3987, + "step": 870 + }, + { + "epoch": 0.020314454223966275, + "grad_norm": 1.4814496040344238, + "learning_rate": 4.51412282974864e-06, + "loss": 1.635, + "step": 871 + }, + { + "epoch": 0.020337777363144193, + "grad_norm": 0.935867190361023, + "learning_rate": 4.5193055195646545e-06, + "loss": 1.6734, + "step": 872 + }, + { + "epoch": 0.02036110050232211, + "grad_norm": 1.3890215158462524, + "learning_rate": 4.524488209380669e-06, + "loss": 1.4458, + "step": 873 + }, + { + "epoch": 0.02038442364150003, + "grad_norm": 1.628081202507019, + "learning_rate": 4.529670899196684e-06, + "loss": 1.4814, + "step": 874 + }, + { + "epoch": 0.020407746780677947, + "grad_norm": 1.5255577564239502, + "learning_rate": 4.534853589012698e-06, + "loss": 1.3884, + "step": 875 + }, + { + "epoch": 0.020431069919855865, + "grad_norm": 2.09283185005188, + "learning_rate": 4.540036278828712e-06, + "loss": 1.7396, + "step": 876 + }, + { + "epoch": 0.02045439305903378, + "grad_norm": 0.9901561737060547, + "learning_rate": 4.545218968644727e-06, + "loss": 1.4941, + "step": 877 + }, + { + "epoch": 0.020477716198211697, + "grad_norm": 1.8444923162460327, + "learning_rate": 4.550401658460741e-06, + "loss": 1.2724, + "step": 878 + }, + { + "epoch": 0.020501039337389615, + "grad_norm": 1.414305567741394, + "learning_rate": 4.555584348276756e-06, + "loss": 1.5781, + "step": 879 + }, + { + "epoch": 0.020524362476567533, + "grad_norm": 1.1960091590881348, + "learning_rate": 4.56076703809277e-06, + "loss": 1.536, + "step": 880 + }, + { + "epoch": 0.02054768561574545, + "grad_norm": 2.241649627685547, + "learning_rate": 4.565949727908785e-06, + "loss": 1.6636, + "step": 881 + }, + { + "epoch": 0.02057100875492337, + "grad_norm": 1.0672343969345093, + "learning_rate": 4.5711324177248e-06, + "loss": 1.6369, + "step": 882 + }, + { + "epoch": 0.020594331894101287, + "grad_norm": 1.6761622428894043, + "learning_rate": 4.576315107540814e-06, + "loss": 1.2554, + "step": 883 + }, + { + "epoch": 0.020617655033279204, + "grad_norm": 1.1365658044815063, + "learning_rate": 4.581497797356828e-06, + "loss": 1.6271, + "step": 884 + }, + { + "epoch": 0.020640978172457122, + "grad_norm": 1.0631389617919922, + "learning_rate": 4.5866804871728435e-06, + "loss": 1.6393, + "step": 885 + }, + { + "epoch": 0.02066430131163504, + "grad_norm": 3.27304744720459, + "learning_rate": 4.591863176988858e-06, + "loss": 1.3521, + "step": 886 + }, + { + "epoch": 0.020687624450812958, + "grad_norm": 1.3354477882385254, + "learning_rate": 4.597045866804872e-06, + "loss": 1.5137, + "step": 887 + }, + { + "epoch": 0.020710947589990876, + "grad_norm": 2.192812919616699, + "learning_rate": 4.602228556620887e-06, + "loss": 1.7294, + "step": 888 + }, + { + "epoch": 0.020734270729168794, + "grad_norm": 0.9716669321060181, + "learning_rate": 4.607411246436901e-06, + "loss": 1.4244, + "step": 889 + }, + { + "epoch": 0.020757593868346712, + "grad_norm": 1.0377227067947388, + "learning_rate": 4.612593936252915e-06, + "loss": 1.3041, + "step": 890 + }, + { + "epoch": 0.020780917007524626, + "grad_norm": 1.971074104309082, + "learning_rate": 4.61777662606893e-06, + "loss": 1.4917, + "step": 891 + }, + { + "epoch": 0.020804240146702544, + "grad_norm": 1.3108222484588623, + "learning_rate": 4.622959315884945e-06, + "loss": 1.5923, + "step": 892 + }, + { + "epoch": 0.020827563285880462, + "grad_norm": 1.4194189310073853, + "learning_rate": 4.628142005700959e-06, + "loss": 1.2378, + "step": 893 + }, + { + "epoch": 0.02085088642505838, + "grad_norm": 1.5872682332992554, + "learning_rate": 4.6333246955169735e-06, + "loss": 1.3573, + "step": 894 + }, + { + "epoch": 0.020874209564236298, + "grad_norm": 1.351704716682434, + "learning_rate": 4.638507385332988e-06, + "loss": 1.8374, + "step": 895 + }, + { + "epoch": 0.020897532703414216, + "grad_norm": 1.15986168384552, + "learning_rate": 4.643690075149003e-06, + "loss": 1.4303, + "step": 896 + }, + { + "epoch": 0.020920855842592134, + "grad_norm": 1.912819743156433, + "learning_rate": 4.648872764965017e-06, + "loss": 1.7733, + "step": 897 + }, + { + "epoch": 0.02094417898177005, + "grad_norm": 1.6582539081573486, + "learning_rate": 4.654055454781032e-06, + "loss": 1.4696, + "step": 898 + }, + { + "epoch": 0.02096750212094797, + "grad_norm": 1.147661805152893, + "learning_rate": 4.659238144597046e-06, + "loss": 1.5037, + "step": 899 + }, + { + "epoch": 0.020990825260125887, + "grad_norm": 1.1773402690887451, + "learning_rate": 4.664420834413061e-06, + "loss": 1.604, + "step": 900 + }, + { + "epoch": 0.021014148399303805, + "grad_norm": 1.9128248691558838, + "learning_rate": 4.669603524229076e-06, + "loss": 1.3081, + "step": 901 + }, + { + "epoch": 0.021037471538481723, + "grad_norm": 1.0742683410644531, + "learning_rate": 4.67478621404509e-06, + "loss": 1.5619, + "step": 902 + }, + { + "epoch": 0.02106079467765964, + "grad_norm": 1.19862699508667, + "learning_rate": 4.679968903861104e-06, + "loss": 1.6896, + "step": 903 + }, + { + "epoch": 0.02108411781683756, + "grad_norm": 1.276283860206604, + "learning_rate": 4.685151593677119e-06, + "loss": 1.65, + "step": 904 + }, + { + "epoch": 0.021107440956015473, + "grad_norm": 1.3582435846328735, + "learning_rate": 4.690334283493133e-06, + "loss": 1.2686, + "step": 905 + }, + { + "epoch": 0.02113076409519339, + "grad_norm": 1.2145341634750366, + "learning_rate": 4.695516973309147e-06, + "loss": 1.8032, + "step": 906 + }, + { + "epoch": 0.02115408723437131, + "grad_norm": 1.1219233274459839, + "learning_rate": 4.7006996631251626e-06, + "loss": 1.7681, + "step": 907 + }, + { + "epoch": 0.021177410373549227, + "grad_norm": 1.0474015474319458, + "learning_rate": 4.705882352941177e-06, + "loss": 1.4555, + "step": 908 + }, + { + "epoch": 0.021200733512727145, + "grad_norm": 1.6325182914733887, + "learning_rate": 4.711065042757191e-06, + "loss": 1.432, + "step": 909 + }, + { + "epoch": 0.021224056651905063, + "grad_norm": 1.5804178714752197, + "learning_rate": 4.716247732573206e-06, + "loss": 1.7409, + "step": 910 + }, + { + "epoch": 0.02124737979108298, + "grad_norm": 1.226804256439209, + "learning_rate": 4.721430422389221e-06, + "loss": 1.8077, + "step": 911 + }, + { + "epoch": 0.0212707029302609, + "grad_norm": 1.0747625827789307, + "learning_rate": 4.726613112205235e-06, + "loss": 1.411, + "step": 912 + }, + { + "epoch": 0.021294026069438816, + "grad_norm": 1.2126623392105103, + "learning_rate": 4.7317958020212495e-06, + "loss": 1.6464, + "step": 913 + }, + { + "epoch": 0.021317349208616734, + "grad_norm": 1.196486473083496, + "learning_rate": 4.736978491837264e-06, + "loss": 1.4365, + "step": 914 + }, + { + "epoch": 0.021340672347794652, + "grad_norm": 1.4727115631103516, + "learning_rate": 4.742161181653279e-06, + "loss": 1.5059, + "step": 915 + }, + { + "epoch": 0.02136399548697257, + "grad_norm": 1.293938159942627, + "learning_rate": 4.747343871469293e-06, + "loss": 1.5508, + "step": 916 + }, + { + "epoch": 0.021387318626150488, + "grad_norm": 1.3074458837509155, + "learning_rate": 4.752526561285307e-06, + "loss": 1.364, + "step": 917 + }, + { + "epoch": 0.021410641765328406, + "grad_norm": 1.708522081375122, + "learning_rate": 4.757709251101322e-06, + "loss": 1.2891, + "step": 918 + }, + { + "epoch": 0.02143396490450632, + "grad_norm": 1.2926160097122192, + "learning_rate": 4.762891940917336e-06, + "loss": 1.1779, + "step": 919 + }, + { + "epoch": 0.021457288043684238, + "grad_norm": 1.7751168012619019, + "learning_rate": 4.768074630733351e-06, + "loss": 1.3136, + "step": 920 + }, + { + "epoch": 0.021480611182862156, + "grad_norm": 1.3698194026947021, + "learning_rate": 4.773257320549365e-06, + "loss": 1.5203, + "step": 921 + }, + { + "epoch": 0.021503934322040074, + "grad_norm": 1.4710402488708496, + "learning_rate": 4.77844001036538e-06, + "loss": 2.0632, + "step": 922 + }, + { + "epoch": 0.021527257461217992, + "grad_norm": 1.3340466022491455, + "learning_rate": 4.783622700181395e-06, + "loss": 0.9449, + "step": 923 + }, + { + "epoch": 0.02155058060039591, + "grad_norm": 1.990078330039978, + "learning_rate": 4.788805389997409e-06, + "loss": 1.4095, + "step": 924 + }, + { + "epoch": 0.021573903739573828, + "grad_norm": 2.6495463848114014, + "learning_rate": 4.793988079813423e-06, + "loss": 1.5914, + "step": 925 + }, + { + "epoch": 0.021597226878751746, + "grad_norm": 1.368868350982666, + "learning_rate": 4.7991707696294385e-06, + "loss": 1.8007, + "step": 926 + }, + { + "epoch": 0.021620550017929663, + "grad_norm": 1.3946820497512817, + "learning_rate": 4.804353459445453e-06, + "loss": 1.3846, + "step": 927 + }, + { + "epoch": 0.02164387315710758, + "grad_norm": 1.6035547256469727, + "learning_rate": 4.809536149261467e-06, + "loss": 1.6677, + "step": 928 + }, + { + "epoch": 0.0216671962962855, + "grad_norm": 1.29734468460083, + "learning_rate": 4.814718839077482e-06, + "loss": 1.3697, + "step": 929 + }, + { + "epoch": 0.021690519435463417, + "grad_norm": 1.1746439933776855, + "learning_rate": 4.819901528893497e-06, + "loss": 1.6134, + "step": 930 + }, + { + "epoch": 0.021713842574641335, + "grad_norm": 1.255861759185791, + "learning_rate": 4.82508421870951e-06, + "loss": 1.6253, + "step": 931 + }, + { + "epoch": 0.021737165713819253, + "grad_norm": 1.5499615669250488, + "learning_rate": 4.830266908525525e-06, + "loss": 1.2794, + "step": 932 + }, + { + "epoch": 0.02176048885299717, + "grad_norm": 1.6138273477554321, + "learning_rate": 4.83544959834154e-06, + "loss": 1.6365, + "step": 933 + }, + { + "epoch": 0.021783811992175085, + "grad_norm": 1.7135401964187622, + "learning_rate": 4.840632288157554e-06, + "loss": 1.509, + "step": 934 + }, + { + "epoch": 0.021807135131353003, + "grad_norm": 1.4290528297424316, + "learning_rate": 4.8458149779735685e-06, + "loss": 1.3415, + "step": 935 + }, + { + "epoch": 0.02183045827053092, + "grad_norm": 2.034870147705078, + "learning_rate": 4.850997667789583e-06, + "loss": 1.6834, + "step": 936 + }, + { + "epoch": 0.02185378140970884, + "grad_norm": 1.6626250743865967, + "learning_rate": 4.856180357605598e-06, + "loss": 1.3573, + "step": 937 + }, + { + "epoch": 0.021877104548886757, + "grad_norm": 1.2256288528442383, + "learning_rate": 4.861363047421612e-06, + "loss": 1.5497, + "step": 938 + }, + { + "epoch": 0.021900427688064675, + "grad_norm": 1.218955397605896, + "learning_rate": 4.866545737237627e-06, + "loss": 1.6823, + "step": 939 + }, + { + "epoch": 0.021923750827242593, + "grad_norm": 1.0629289150238037, + "learning_rate": 4.871728427053641e-06, + "loss": 1.3894, + "step": 940 + }, + { + "epoch": 0.02194707396642051, + "grad_norm": 2.6169822216033936, + "learning_rate": 4.876911116869656e-06, + "loss": 1.4063, + "step": 941 + }, + { + "epoch": 0.02197039710559843, + "grad_norm": 1.1517153978347778, + "learning_rate": 4.882093806685671e-06, + "loss": 1.3838, + "step": 942 + }, + { + "epoch": 0.021993720244776346, + "grad_norm": 1.6320403814315796, + "learning_rate": 4.887276496501685e-06, + "loss": 1.5752, + "step": 943 + }, + { + "epoch": 0.022017043383954264, + "grad_norm": 1.7344862222671509, + "learning_rate": 4.892459186317699e-06, + "loss": 1.3182, + "step": 944 + }, + { + "epoch": 0.022040366523132182, + "grad_norm": 1.2497214078903198, + "learning_rate": 4.897641876133714e-06, + "loss": 1.2266, + "step": 945 + }, + { + "epoch": 0.0220636896623101, + "grad_norm": 1.996893048286438, + "learning_rate": 4.902824565949728e-06, + "loss": 1.2708, + "step": 946 + }, + { + "epoch": 0.022087012801488018, + "grad_norm": 1.1130571365356445, + "learning_rate": 4.908007255765742e-06, + "loss": 1.4791, + "step": 947 + }, + { + "epoch": 0.022110335940665932, + "grad_norm": 1.2698702812194824, + "learning_rate": 4.9131899455817576e-06, + "loss": 1.3711, + "step": 948 + }, + { + "epoch": 0.02213365907984385, + "grad_norm": 1.0363445281982422, + "learning_rate": 4.918372635397772e-06, + "loss": 1.4153, + "step": 949 + }, + { + "epoch": 0.022156982219021768, + "grad_norm": 1.1418310403823853, + "learning_rate": 4.923555325213786e-06, + "loss": 1.3377, + "step": 950 + }, + { + "epoch": 0.022180305358199686, + "grad_norm": 1.3740698099136353, + "learning_rate": 4.928738015029801e-06, + "loss": 1.375, + "step": 951 + }, + { + "epoch": 0.022203628497377604, + "grad_norm": 1.5656532049179077, + "learning_rate": 4.933920704845816e-06, + "loss": 1.651, + "step": 952 + }, + { + "epoch": 0.022226951636555522, + "grad_norm": 1.209380865097046, + "learning_rate": 4.93910339466183e-06, + "loss": 1.6956, + "step": 953 + }, + { + "epoch": 0.02225027477573344, + "grad_norm": 1.9917747974395752, + "learning_rate": 4.9442860844778445e-06, + "loss": 1.2802, + "step": 954 + }, + { + "epoch": 0.022273597914911358, + "grad_norm": 2.168260097503662, + "learning_rate": 4.949468774293859e-06, + "loss": 1.9773, + "step": 955 + }, + { + "epoch": 0.022296921054089276, + "grad_norm": 1.113978624343872, + "learning_rate": 4.954651464109874e-06, + "loss": 1.8121, + "step": 956 + }, + { + "epoch": 0.022320244193267193, + "grad_norm": 1.4833635091781616, + "learning_rate": 4.959834153925888e-06, + "loss": 1.694, + "step": 957 + }, + { + "epoch": 0.02234356733244511, + "grad_norm": 1.3287935256958008, + "learning_rate": 4.965016843741902e-06, + "loss": 1.4865, + "step": 958 + }, + { + "epoch": 0.02236689047162303, + "grad_norm": 1.5515238046646118, + "learning_rate": 4.970199533557917e-06, + "loss": 1.6035, + "step": 959 + }, + { + "epoch": 0.022390213610800947, + "grad_norm": 1.2824245691299438, + "learning_rate": 4.975382223373931e-06, + "loss": 1.5124, + "step": 960 + }, + { + "epoch": 0.022413536749978865, + "grad_norm": 1.2062418460845947, + "learning_rate": 4.980564913189946e-06, + "loss": 1.5982, + "step": 961 + }, + { + "epoch": 0.02243685988915678, + "grad_norm": 1.2790741920471191, + "learning_rate": 4.98574760300596e-06, + "loss": 1.586, + "step": 962 + }, + { + "epoch": 0.022460183028334697, + "grad_norm": 1.202909231185913, + "learning_rate": 4.990930292821975e-06, + "loss": 1.7387, + "step": 963 + }, + { + "epoch": 0.022483506167512615, + "grad_norm": 1.328963041305542, + "learning_rate": 4.99611298263799e-06, + "loss": 1.5611, + "step": 964 + }, + { + "epoch": 0.022506829306690533, + "grad_norm": 1.3728841543197632, + "learning_rate": 5.001295672454004e-06, + "loss": 1.6887, + "step": 965 + }, + { + "epoch": 0.02253015244586845, + "grad_norm": 1.2474596500396729, + "learning_rate": 5.006478362270018e-06, + "loss": 1.7337, + "step": 966 + }, + { + "epoch": 0.02255347558504637, + "grad_norm": 1.4526808261871338, + "learning_rate": 5.0116610520860335e-06, + "loss": 1.4009, + "step": 967 + }, + { + "epoch": 0.022576798724224287, + "grad_norm": 1.74959397315979, + "learning_rate": 5.016843741902048e-06, + "loss": 1.4153, + "step": 968 + }, + { + "epoch": 0.022600121863402205, + "grad_norm": 1.7886738777160645, + "learning_rate": 5.022026431718062e-06, + "loss": 1.3897, + "step": 969 + }, + { + "epoch": 0.022623445002580123, + "grad_norm": 1.3122284412384033, + "learning_rate": 5.027209121534077e-06, + "loss": 1.6551, + "step": 970 + }, + { + "epoch": 0.02264676814175804, + "grad_norm": 1.5374927520751953, + "learning_rate": 5.032391811350092e-06, + "loss": 1.6396, + "step": 971 + }, + { + "epoch": 0.02267009128093596, + "grad_norm": 1.6476905345916748, + "learning_rate": 5.037574501166106e-06, + "loss": 1.733, + "step": 972 + }, + { + "epoch": 0.022693414420113876, + "grad_norm": 1.3407307863235474, + "learning_rate": 5.0427571909821205e-06, + "loss": 1.4984, + "step": 973 + }, + { + "epoch": 0.022716737559291794, + "grad_norm": 1.5565712451934814, + "learning_rate": 5.047939880798135e-06, + "loss": 1.6524, + "step": 974 + }, + { + "epoch": 0.022740060698469712, + "grad_norm": 1.381903052330017, + "learning_rate": 5.053122570614149e-06, + "loss": 1.5325, + "step": 975 + }, + { + "epoch": 0.022763383837647626, + "grad_norm": 1.916326880455017, + "learning_rate": 5.058305260430164e-06, + "loss": 1.2326, + "step": 976 + }, + { + "epoch": 0.022786706976825544, + "grad_norm": 1.1621575355529785, + "learning_rate": 5.063487950246179e-06, + "loss": 1.2568, + "step": 977 + }, + { + "epoch": 0.022810030116003462, + "grad_norm": 1.3575561046600342, + "learning_rate": 5.068670640062193e-06, + "loss": 1.3755, + "step": 978 + }, + { + "epoch": 0.02283335325518138, + "grad_norm": 1.482701063156128, + "learning_rate": 5.0738533298782065e-06, + "loss": 1.598, + "step": 979 + }, + { + "epoch": 0.022856676394359298, + "grad_norm": 1.2530887126922607, + "learning_rate": 5.079036019694221e-06, + "loss": 1.66, + "step": 980 + }, + { + "epoch": 0.022879999533537216, + "grad_norm": 1.4960439205169678, + "learning_rate": 5.084218709510236e-06, + "loss": 1.5341, + "step": 981 + }, + { + "epoch": 0.022903322672715134, + "grad_norm": 1.507735252380371, + "learning_rate": 5.0894013993262504e-06, + "loss": 1.3987, + "step": 982 + }, + { + "epoch": 0.022926645811893052, + "grad_norm": 2.0131475925445557, + "learning_rate": 5.094584089142265e-06, + "loss": 1.3134, + "step": 983 + }, + { + "epoch": 0.02294996895107097, + "grad_norm": 1.8096015453338623, + "learning_rate": 5.099766778958279e-06, + "loss": 1.3707, + "step": 984 + }, + { + "epoch": 0.022973292090248888, + "grad_norm": 1.0444198846817017, + "learning_rate": 5.104949468774294e-06, + "loss": 1.4119, + "step": 985 + }, + { + "epoch": 0.022996615229426805, + "grad_norm": 1.3110159635543823, + "learning_rate": 5.110132158590309e-06, + "loss": 1.2187, + "step": 986 + }, + { + "epoch": 0.023019938368604723, + "grad_norm": 1.3191614151000977, + "learning_rate": 5.115314848406323e-06, + "loss": 1.3691, + "step": 987 + }, + { + "epoch": 0.02304326150778264, + "grad_norm": 1.3888386487960815, + "learning_rate": 5.120497538222337e-06, + "loss": 1.1934, + "step": 988 + }, + { + "epoch": 0.02306658464696056, + "grad_norm": 1.2101585865020752, + "learning_rate": 5.1256802280383526e-06, + "loss": 1.4962, + "step": 989 + }, + { + "epoch": 0.023089907786138477, + "grad_norm": 1.2938464879989624, + "learning_rate": 5.130862917854367e-06, + "loss": 1.4601, + "step": 990 + }, + { + "epoch": 0.02311323092531639, + "grad_norm": 2.072444200515747, + "learning_rate": 5.136045607670381e-06, + "loss": 1.7241, + "step": 991 + }, + { + "epoch": 0.02313655406449431, + "grad_norm": 1.7139407396316528, + "learning_rate": 5.141228297486396e-06, + "loss": 1.394, + "step": 992 + }, + { + "epoch": 0.023159877203672227, + "grad_norm": 1.5825177431106567, + "learning_rate": 5.146410987302411e-06, + "loss": 1.4218, + "step": 993 + }, + { + "epoch": 0.023183200342850145, + "grad_norm": 1.2233787775039673, + "learning_rate": 5.151593677118425e-06, + "loss": 1.2882, + "step": 994 + }, + { + "epoch": 0.023206523482028063, + "grad_norm": 1.6474647521972656, + "learning_rate": 5.1567763669344395e-06, + "loss": 1.6499, + "step": 995 + }, + { + "epoch": 0.02322984662120598, + "grad_norm": 1.669651985168457, + "learning_rate": 5.161959056750454e-06, + "loss": 1.1727, + "step": 996 + }, + { + "epoch": 0.0232531697603839, + "grad_norm": 1.4976879358291626, + "learning_rate": 5.167141746566469e-06, + "loss": 1.2149, + "step": 997 + }, + { + "epoch": 0.023276492899561817, + "grad_norm": 1.4033470153808594, + "learning_rate": 5.172324436382483e-06, + "loss": 1.3004, + "step": 998 + }, + { + "epoch": 0.023299816038739735, + "grad_norm": 1.3042150735855103, + "learning_rate": 5.177507126198498e-06, + "loss": 1.3803, + "step": 999 + }, + { + "epoch": 0.023323139177917653, + "grad_norm": 1.4327346086502075, + "learning_rate": 5.182689816014512e-06, + "loss": 1.7267, + "step": 1000 + }, + { + "epoch": 0.02334646231709557, + "grad_norm": 1.4823616743087769, + "learning_rate": 5.187872505830526e-06, + "loss": 1.6386, + "step": 1001 + }, + { + "epoch": 0.02336978545627349, + "grad_norm": 1.7083938121795654, + "learning_rate": 5.193055195646542e-06, + "loss": 1.3112, + "step": 1002 + }, + { + "epoch": 0.023393108595451406, + "grad_norm": 1.51584792137146, + "learning_rate": 5.198237885462556e-06, + "loss": 1.6169, + "step": 1003 + }, + { + "epoch": 0.023416431734629324, + "grad_norm": 1.0864455699920654, + "learning_rate": 5.20342057527857e-06, + "loss": 1.3013, + "step": 1004 + }, + { + "epoch": 0.02343975487380724, + "grad_norm": 1.9760619401931763, + "learning_rate": 5.208603265094585e-06, + "loss": 1.7865, + "step": 1005 + }, + { + "epoch": 0.023463078012985156, + "grad_norm": 2.5747292041778564, + "learning_rate": 5.2137859549106e-06, + "loss": 1.3345, + "step": 1006 + }, + { + "epoch": 0.023486401152163074, + "grad_norm": 1.689779281616211, + "learning_rate": 5.218968644726613e-06, + "loss": 1.7856, + "step": 1007 + }, + { + "epoch": 0.023509724291340992, + "grad_norm": 1.9847980737686157, + "learning_rate": 5.224151334542628e-06, + "loss": 1.8401, + "step": 1008 + }, + { + "epoch": 0.02353304743051891, + "grad_norm": 1.3654876947402954, + "learning_rate": 5.229334024358642e-06, + "loss": 1.7705, + "step": 1009 + }, + { + "epoch": 0.023556370569696828, + "grad_norm": 1.7249932289123535, + "learning_rate": 5.234516714174656e-06, + "loss": 1.1657, + "step": 1010 + }, + { + "epoch": 0.023579693708874746, + "grad_norm": 1.0710606575012207, + "learning_rate": 5.2396994039906716e-06, + "loss": 1.1676, + "step": 1011 + }, + { + "epoch": 0.023603016848052664, + "grad_norm": 1.213040828704834, + "learning_rate": 5.244882093806686e-06, + "loss": 1.4183, + "step": 1012 + }, + { + "epoch": 0.02362633998723058, + "grad_norm": 1.6341387033462524, + "learning_rate": 5.2500647836227e-06, + "loss": 1.6092, + "step": 1013 + }, + { + "epoch": 0.0236496631264085, + "grad_norm": 1.6445837020874023, + "learning_rate": 5.255247473438715e-06, + "loss": 1.6693, + "step": 1014 + }, + { + "epoch": 0.023672986265586417, + "grad_norm": 1.2804230451583862, + "learning_rate": 5.26043016325473e-06, + "loss": 1.5687, + "step": 1015 + }, + { + "epoch": 0.023696309404764335, + "grad_norm": 1.8683735132217407, + "learning_rate": 5.265612853070744e-06, + "loss": 1.3944, + "step": 1016 + }, + { + "epoch": 0.023719632543942253, + "grad_norm": 1.6504722833633423, + "learning_rate": 5.2707955428867585e-06, + "loss": 1.3018, + "step": 1017 + }, + { + "epoch": 0.02374295568312017, + "grad_norm": 1.71793532371521, + "learning_rate": 5.275978232702773e-06, + "loss": 1.4581, + "step": 1018 + }, + { + "epoch": 0.023766278822298086, + "grad_norm": 1.1414326429367065, + "learning_rate": 5.281160922518788e-06, + "loss": 1.4924, + "step": 1019 + }, + { + "epoch": 0.023789601961476003, + "grad_norm": 1.6553568840026855, + "learning_rate": 5.286343612334802e-06, + "loss": 1.6926, + "step": 1020 + }, + { + "epoch": 0.02381292510065392, + "grad_norm": 1.4217321872711182, + "learning_rate": 5.291526302150817e-06, + "loss": 1.4806, + "step": 1021 + }, + { + "epoch": 0.02383624823983184, + "grad_norm": 1.4322501420974731, + "learning_rate": 5.296708991966831e-06, + "loss": 1.5978, + "step": 1022 + }, + { + "epoch": 0.023859571379009757, + "grad_norm": 1.9824562072753906, + "learning_rate": 5.3018916817828454e-06, + "loss": 1.493, + "step": 1023 + }, + { + "epoch": 0.023882894518187675, + "grad_norm": 1.3815537691116333, + "learning_rate": 5.307074371598861e-06, + "loss": 1.3702, + "step": 1024 + }, + { + "epoch": 0.023906217657365593, + "grad_norm": 1.101647138595581, + "learning_rate": 5.312257061414875e-06, + "loss": 1.1745, + "step": 1025 + }, + { + "epoch": 0.02392954079654351, + "grad_norm": 1.2983593940734863, + "learning_rate": 5.317439751230889e-06, + "loss": 1.7473, + "step": 1026 + }, + { + "epoch": 0.02395286393572143, + "grad_norm": 1.2676076889038086, + "learning_rate": 5.322622441046904e-06, + "loss": 1.6349, + "step": 1027 + }, + { + "epoch": 0.023976187074899347, + "grad_norm": 1.2923870086669922, + "learning_rate": 5.327805130862919e-06, + "loss": 1.619, + "step": 1028 + }, + { + "epoch": 0.023999510214077265, + "grad_norm": 1.4195587635040283, + "learning_rate": 5.332987820678933e-06, + "loss": 1.4933, + "step": 1029 + }, + { + "epoch": 0.024022833353255182, + "grad_norm": 1.3498200178146362, + "learning_rate": 5.3381705104949476e-06, + "loss": 1.489, + "step": 1030 + }, + { + "epoch": 0.0240461564924331, + "grad_norm": 1.473960280418396, + "learning_rate": 5.343353200310962e-06, + "loss": 1.5181, + "step": 1031 + }, + { + "epoch": 0.024069479631611018, + "grad_norm": 1.2730071544647217, + "learning_rate": 5.348535890126977e-06, + "loss": 1.5796, + "step": 1032 + }, + { + "epoch": 0.024092802770788933, + "grad_norm": 1.2243895530700684, + "learning_rate": 5.3537185799429914e-06, + "loss": 1.4051, + "step": 1033 + }, + { + "epoch": 0.02411612590996685, + "grad_norm": 2.1219441890716553, + "learning_rate": 5.358901269759005e-06, + "loss": 1.4317, + "step": 1034 + }, + { + "epoch": 0.02413944904914477, + "grad_norm": 1.0719225406646729, + "learning_rate": 5.364083959575019e-06, + "loss": 1.3937, + "step": 1035 + }, + { + "epoch": 0.024162772188322686, + "grad_norm": 1.6711935997009277, + "learning_rate": 5.369266649391034e-06, + "loss": 1.5832, + "step": 1036 + }, + { + "epoch": 0.024186095327500604, + "grad_norm": 1.33745276927948, + "learning_rate": 5.374449339207049e-06, + "loss": 1.4582, + "step": 1037 + }, + { + "epoch": 0.024209418466678522, + "grad_norm": 1.4278967380523682, + "learning_rate": 5.379632029023063e-06, + "loss": 1.6069, + "step": 1038 + }, + { + "epoch": 0.02423274160585644, + "grad_norm": 1.2003988027572632, + "learning_rate": 5.3848147188390775e-06, + "loss": 1.4942, + "step": 1039 + }, + { + "epoch": 0.024256064745034358, + "grad_norm": 1.7350938320159912, + "learning_rate": 5.389997408655092e-06, + "loss": 1.637, + "step": 1040 + }, + { + "epoch": 0.024279387884212276, + "grad_norm": 1.6094862222671509, + "learning_rate": 5.395180098471107e-06, + "loss": 1.6944, + "step": 1041 + }, + { + "epoch": 0.024302711023390194, + "grad_norm": 1.369091510772705, + "learning_rate": 5.400362788287121e-06, + "loss": 1.6905, + "step": 1042 + }, + { + "epoch": 0.02432603416256811, + "grad_norm": 1.275787353515625, + "learning_rate": 5.405545478103136e-06, + "loss": 1.6749, + "step": 1043 + }, + { + "epoch": 0.02434935730174603, + "grad_norm": 1.24448823928833, + "learning_rate": 5.41072816791915e-06, + "loss": 1.4275, + "step": 1044 + }, + { + "epoch": 0.024372680440923947, + "grad_norm": 1.7868009805679321, + "learning_rate": 5.415910857735165e-06, + "loss": 1.5942, + "step": 1045 + }, + { + "epoch": 0.024396003580101865, + "grad_norm": 1.5386407375335693, + "learning_rate": 5.42109354755118e-06, + "loss": 1.6505, + "step": 1046 + }, + { + "epoch": 0.024419326719279783, + "grad_norm": 1.9666537046432495, + "learning_rate": 5.426276237367194e-06, + "loss": 1.7035, + "step": 1047 + }, + { + "epoch": 0.024442649858457698, + "grad_norm": 1.7937966585159302, + "learning_rate": 5.431458927183208e-06, + "loss": 1.7956, + "step": 1048 + }, + { + "epoch": 0.024465972997635616, + "grad_norm": 1.1397721767425537, + "learning_rate": 5.436641616999223e-06, + "loss": 1.3459, + "step": 1049 + }, + { + "epoch": 0.024489296136813533, + "grad_norm": 1.28958261013031, + "learning_rate": 5.441824306815238e-06, + "loss": 1.0963, + "step": 1050 + }, + { + "epoch": 0.02451261927599145, + "grad_norm": 1.3734923601150513, + "learning_rate": 5.447006996631252e-06, + "loss": 1.3196, + "step": 1051 + }, + { + "epoch": 0.02453594241516937, + "grad_norm": 1.8763736486434937, + "learning_rate": 5.4521896864472666e-06, + "loss": 1.7322, + "step": 1052 + }, + { + "epoch": 0.024559265554347287, + "grad_norm": 1.5179871320724487, + "learning_rate": 5.457372376263281e-06, + "loss": 1.2844, + "step": 1053 + }, + { + "epoch": 0.024582588693525205, + "grad_norm": 1.4944384098052979, + "learning_rate": 5.462555066079296e-06, + "loss": 1.442, + "step": 1054 + }, + { + "epoch": 0.024605911832703123, + "grad_norm": 1.499028205871582, + "learning_rate": 5.4677377558953105e-06, + "loss": 1.394, + "step": 1055 + }, + { + "epoch": 0.02462923497188104, + "grad_norm": 1.1869397163391113, + "learning_rate": 5.472920445711325e-06, + "loss": 1.2928, + "step": 1056 + }, + { + "epoch": 0.02465255811105896, + "grad_norm": 1.3456541299819946, + "learning_rate": 5.478103135527339e-06, + "loss": 1.5983, + "step": 1057 + }, + { + "epoch": 0.024675881250236877, + "grad_norm": 1.5931065082550049, + "learning_rate": 5.483285825343354e-06, + "loss": 1.4794, + "step": 1058 + }, + { + "epoch": 0.024699204389414794, + "grad_norm": 1.4096170663833618, + "learning_rate": 5.488468515159369e-06, + "loss": 1.471, + "step": 1059 + }, + { + "epoch": 0.024722527528592712, + "grad_norm": 1.5033949613571167, + "learning_rate": 5.493651204975383e-06, + "loss": 1.2857, + "step": 1060 + }, + { + "epoch": 0.02474585066777063, + "grad_norm": 1.632089614868164, + "learning_rate": 5.498833894791397e-06, + "loss": 1.5157, + "step": 1061 + }, + { + "epoch": 0.024769173806948545, + "grad_norm": 1.563462495803833, + "learning_rate": 5.504016584607411e-06, + "loss": 1.5072, + "step": 1062 + }, + { + "epoch": 0.024792496946126463, + "grad_norm": 1.4055378437042236, + "learning_rate": 5.509199274423426e-06, + "loss": 1.1545, + "step": 1063 + }, + { + "epoch": 0.02481582008530438, + "grad_norm": 1.3467985391616821, + "learning_rate": 5.5143819642394404e-06, + "loss": 1.4615, + "step": 1064 + }, + { + "epoch": 0.0248391432244823, + "grad_norm": 1.6450691223144531, + "learning_rate": 5.519564654055455e-06, + "loss": 1.8051, + "step": 1065 + }, + { + "epoch": 0.024862466363660216, + "grad_norm": 1.247313141822815, + "learning_rate": 5.524747343871469e-06, + "loss": 1.5971, + "step": 1066 + }, + { + "epoch": 0.024885789502838134, + "grad_norm": 1.7429383993148804, + "learning_rate": 5.529930033687484e-06, + "loss": 1.5401, + "step": 1067 + }, + { + "epoch": 0.024909112642016052, + "grad_norm": 1.7351207733154297, + "learning_rate": 5.535112723503499e-06, + "loss": 1.4898, + "step": 1068 + }, + { + "epoch": 0.02493243578119397, + "grad_norm": 1.5003080368041992, + "learning_rate": 5.540295413319513e-06, + "loss": 1.773, + "step": 1069 + }, + { + "epoch": 0.024955758920371888, + "grad_norm": 1.370918869972229, + "learning_rate": 5.545478103135527e-06, + "loss": 1.6648, + "step": 1070 + }, + { + "epoch": 0.024979082059549806, + "grad_norm": 1.125687837600708, + "learning_rate": 5.5506607929515426e-06, + "loss": 1.5297, + "step": 1071 + }, + { + "epoch": 0.025002405198727724, + "grad_norm": 1.984605073928833, + "learning_rate": 5.555843482767557e-06, + "loss": 1.4637, + "step": 1072 + }, + { + "epoch": 0.02502572833790564, + "grad_norm": 1.6429048776626587, + "learning_rate": 5.561026172583571e-06, + "loss": 1.2794, + "step": 1073 + }, + { + "epoch": 0.02504905147708356, + "grad_norm": 1.8730500936508179, + "learning_rate": 5.566208862399586e-06, + "loss": 1.4462, + "step": 1074 + }, + { + "epoch": 0.025072374616261477, + "grad_norm": 1.536036729812622, + "learning_rate": 5.5713915522156e-06, + "loss": 1.2484, + "step": 1075 + }, + { + "epoch": 0.025095697755439392, + "grad_norm": 1.2056294679641724, + "learning_rate": 5.576574242031615e-06, + "loss": 1.7819, + "step": 1076 + }, + { + "epoch": 0.02511902089461731, + "grad_norm": 1.4317046403884888, + "learning_rate": 5.5817569318476295e-06, + "loss": 1.5005, + "step": 1077 + }, + { + "epoch": 0.025142344033795228, + "grad_norm": 1.5313549041748047, + "learning_rate": 5.586939621663644e-06, + "loss": 1.6916, + "step": 1078 + }, + { + "epoch": 0.025165667172973145, + "grad_norm": 1.2438437938690186, + "learning_rate": 5.592122311479658e-06, + "loss": 1.4453, + "step": 1079 + }, + { + "epoch": 0.025188990312151063, + "grad_norm": 1.665187954902649, + "learning_rate": 5.597305001295673e-06, + "loss": 1.1324, + "step": 1080 + }, + { + "epoch": 0.02521231345132898, + "grad_norm": 1.910433053970337, + "learning_rate": 5.602487691111688e-06, + "loss": 2.003, + "step": 1081 + }, + { + "epoch": 0.0252356365905069, + "grad_norm": 1.6894274950027466, + "learning_rate": 5.607670380927702e-06, + "loss": 1.5041, + "step": 1082 + }, + { + "epoch": 0.025258959729684817, + "grad_norm": 1.246095061302185, + "learning_rate": 5.612853070743716e-06, + "loss": 1.7421, + "step": 1083 + }, + { + "epoch": 0.025282282868862735, + "grad_norm": 1.7268954515457153, + "learning_rate": 5.618035760559732e-06, + "loss": 1.4601, + "step": 1084 + }, + { + "epoch": 0.025305606008040653, + "grad_norm": 1.2897146940231323, + "learning_rate": 5.623218450375746e-06, + "loss": 1.4538, + "step": 1085 + }, + { + "epoch": 0.02532892914721857, + "grad_norm": 1.329236388206482, + "learning_rate": 5.62840114019176e-06, + "loss": 1.6763, + "step": 1086 + }, + { + "epoch": 0.02535225228639649, + "grad_norm": 1.4001597166061401, + "learning_rate": 5.633583830007775e-06, + "loss": 1.4887, + "step": 1087 + }, + { + "epoch": 0.025375575425574406, + "grad_norm": 2.036400079727173, + "learning_rate": 5.63876651982379e-06, + "loss": 1.4996, + "step": 1088 + }, + { + "epoch": 0.025398898564752324, + "grad_norm": 1.4963785409927368, + "learning_rate": 5.643949209639803e-06, + "loss": 1.6515, + "step": 1089 + }, + { + "epoch": 0.02542222170393024, + "grad_norm": 1.4221199750900269, + "learning_rate": 5.649131899455818e-06, + "loss": 1.814, + "step": 1090 + }, + { + "epoch": 0.025445544843108157, + "grad_norm": 1.7034932374954224, + "learning_rate": 5.654314589271832e-06, + "loss": 1.478, + "step": 1091 + }, + { + "epoch": 0.025468867982286075, + "grad_norm": 1.5419113636016846, + "learning_rate": 5.659497279087846e-06, + "loss": 1.8225, + "step": 1092 + }, + { + "epoch": 0.025492191121463992, + "grad_norm": 1.8337044715881348, + "learning_rate": 5.6646799689038616e-06, + "loss": 1.5037, + "step": 1093 + }, + { + "epoch": 0.02551551426064191, + "grad_norm": 1.3712172508239746, + "learning_rate": 5.669862658719876e-06, + "loss": 1.4449, + "step": 1094 + }, + { + "epoch": 0.02553883739981983, + "grad_norm": 1.312258005142212, + "learning_rate": 5.67504534853589e-06, + "loss": 1.5159, + "step": 1095 + }, + { + "epoch": 0.025562160538997746, + "grad_norm": 1.5284754037857056, + "learning_rate": 5.680228038351905e-06, + "loss": 1.4479, + "step": 1096 + }, + { + "epoch": 0.025585483678175664, + "grad_norm": 1.1178314685821533, + "learning_rate": 5.68541072816792e-06, + "loss": 1.4729, + "step": 1097 + }, + { + "epoch": 0.025608806817353582, + "grad_norm": 1.2439149618148804, + "learning_rate": 5.690593417983934e-06, + "loss": 1.436, + "step": 1098 + }, + { + "epoch": 0.0256321299565315, + "grad_norm": 1.580632209777832, + "learning_rate": 5.6957761077999485e-06, + "loss": 1.2718, + "step": 1099 + }, + { + "epoch": 0.025655453095709418, + "grad_norm": 1.6244875192642212, + "learning_rate": 5.700958797615963e-06, + "loss": 1.6024, + "step": 1100 + }, + { + "epoch": 0.025678776234887336, + "grad_norm": 1.2542647123336792, + "learning_rate": 5.706141487431977e-06, + "loss": 1.4344, + "step": 1101 + }, + { + "epoch": 0.025702099374065254, + "grad_norm": 1.227737307548523, + "learning_rate": 5.711324177247992e-06, + "loss": 1.2912, + "step": 1102 + }, + { + "epoch": 0.02572542251324317, + "grad_norm": 1.705132007598877, + "learning_rate": 5.716506867064007e-06, + "loss": 1.7786, + "step": 1103 + }, + { + "epoch": 0.02574874565242109, + "grad_norm": 1.4411309957504272, + "learning_rate": 5.721689556880021e-06, + "loss": 1.6456, + "step": 1104 + }, + { + "epoch": 0.025772068791599004, + "grad_norm": 1.5248507261276245, + "learning_rate": 5.7268722466960354e-06, + "loss": 1.308, + "step": 1105 + }, + { + "epoch": 0.02579539193077692, + "grad_norm": 1.3953535556793213, + "learning_rate": 5.732054936512051e-06, + "loss": 1.7294, + "step": 1106 + }, + { + "epoch": 0.02581871506995484, + "grad_norm": 2.0566859245300293, + "learning_rate": 5.737237626328065e-06, + "loss": 1.4392, + "step": 1107 + }, + { + "epoch": 0.025842038209132757, + "grad_norm": 1.4723169803619385, + "learning_rate": 5.742420316144079e-06, + "loss": 1.4799, + "step": 1108 + }, + { + "epoch": 0.025865361348310675, + "grad_norm": 1.4092565774917603, + "learning_rate": 5.747603005960094e-06, + "loss": 1.199, + "step": 1109 + }, + { + "epoch": 0.025888684487488593, + "grad_norm": 1.277365803718567, + "learning_rate": 5.752785695776109e-06, + "loss": 1.6108, + "step": 1110 + }, + { + "epoch": 0.02591200762666651, + "grad_norm": 2.465951919555664, + "learning_rate": 5.757968385592123e-06, + "loss": 1.6563, + "step": 1111 + }, + { + "epoch": 0.02593533076584443, + "grad_norm": 1.8686498403549194, + "learning_rate": 5.7631510754081376e-06, + "loss": 1.4241, + "step": 1112 + }, + { + "epoch": 0.025958653905022347, + "grad_norm": 1.6791915893554688, + "learning_rate": 5.768333765224152e-06, + "loss": 1.5922, + "step": 1113 + }, + { + "epoch": 0.025981977044200265, + "grad_norm": 1.7679352760314941, + "learning_rate": 5.773516455040167e-06, + "loss": 1.3589, + "step": 1114 + }, + { + "epoch": 0.026005300183378183, + "grad_norm": 1.535530686378479, + "learning_rate": 5.7786991448561814e-06, + "loss": 1.1027, + "step": 1115 + }, + { + "epoch": 0.0260286233225561, + "grad_norm": 1.5171246528625488, + "learning_rate": 5.783881834672196e-06, + "loss": 1.5711, + "step": 1116 + }, + { + "epoch": 0.02605194646173402, + "grad_norm": 1.101453185081482, + "learning_rate": 5.789064524488209e-06, + "loss": 1.2025, + "step": 1117 + }, + { + "epoch": 0.026075269600911936, + "grad_norm": 1.4143930673599243, + "learning_rate": 5.794247214304224e-06, + "loss": 1.4293, + "step": 1118 + }, + { + "epoch": 0.02609859274008985, + "grad_norm": 1.4917521476745605, + "learning_rate": 5.799429904120239e-06, + "loss": 1.5479, + "step": 1119 + }, + { + "epoch": 0.02612191587926777, + "grad_norm": 1.4023706912994385, + "learning_rate": 5.804612593936253e-06, + "loss": 1.7088, + "step": 1120 + }, + { + "epoch": 0.026145239018445687, + "grad_norm": 1.4056384563446045, + "learning_rate": 5.8097952837522675e-06, + "loss": 1.3657, + "step": 1121 + }, + { + "epoch": 0.026168562157623605, + "grad_norm": 1.3393616676330566, + "learning_rate": 5.814977973568282e-06, + "loss": 1.1497, + "step": 1122 + }, + { + "epoch": 0.026191885296801522, + "grad_norm": 1.6090584993362427, + "learning_rate": 5.820160663384296e-06, + "loss": 1.391, + "step": 1123 + }, + { + "epoch": 0.02621520843597944, + "grad_norm": 1.4391287565231323, + "learning_rate": 5.825343353200311e-06, + "loss": 1.4316, + "step": 1124 + }, + { + "epoch": 0.026238531575157358, + "grad_norm": 1.0588252544403076, + "learning_rate": 5.830526043016326e-06, + "loss": 1.3495, + "step": 1125 + }, + { + "epoch": 0.026261854714335276, + "grad_norm": 1.2646477222442627, + "learning_rate": 5.83570873283234e-06, + "loss": 1.9107, + "step": 1126 + }, + { + "epoch": 0.026285177853513194, + "grad_norm": 1.2594728469848633, + "learning_rate": 5.8408914226483545e-06, + "loss": 1.3878, + "step": 1127 + }, + { + "epoch": 0.026308500992691112, + "grad_norm": 2.413245677947998, + "learning_rate": 5.84607411246437e-06, + "loss": 1.2988, + "step": 1128 + }, + { + "epoch": 0.02633182413186903, + "grad_norm": 1.8143887519836426, + "learning_rate": 5.851256802280384e-06, + "loss": 1.8778, + "step": 1129 + }, + { + "epoch": 0.026355147271046948, + "grad_norm": 1.4549977779388428, + "learning_rate": 5.856439492096398e-06, + "loss": 1.7828, + "step": 1130 + }, + { + "epoch": 0.026378470410224866, + "grad_norm": 1.370773196220398, + "learning_rate": 5.861622181912413e-06, + "loss": 1.6647, + "step": 1131 + }, + { + "epoch": 0.026401793549402783, + "grad_norm": 1.7972664833068848, + "learning_rate": 5.866804871728428e-06, + "loss": 1.8871, + "step": 1132 + }, + { + "epoch": 0.026425116688580698, + "grad_norm": 1.6887913942337036, + "learning_rate": 5.871987561544442e-06, + "loss": 1.4938, + "step": 1133 + }, + { + "epoch": 0.026448439827758616, + "grad_norm": 1.4011859893798828, + "learning_rate": 5.8771702513604566e-06, + "loss": 1.2893, + "step": 1134 + }, + { + "epoch": 0.026471762966936534, + "grad_norm": 1.2820593118667603, + "learning_rate": 5.882352941176471e-06, + "loss": 1.8028, + "step": 1135 + }, + { + "epoch": 0.02649508610611445, + "grad_norm": 1.5501364469528198, + "learning_rate": 5.887535630992486e-06, + "loss": 1.5666, + "step": 1136 + }, + { + "epoch": 0.02651840924529237, + "grad_norm": 1.635021686553955, + "learning_rate": 5.8927183208085005e-06, + "loss": 1.4217, + "step": 1137 + }, + { + "epoch": 0.026541732384470287, + "grad_norm": 1.780432105064392, + "learning_rate": 5.897901010624515e-06, + "loss": 1.5926, + "step": 1138 + }, + { + "epoch": 0.026565055523648205, + "grad_norm": 1.747233271598816, + "learning_rate": 5.903083700440529e-06, + "loss": 1.7011, + "step": 1139 + }, + { + "epoch": 0.026588378662826123, + "grad_norm": 1.6612962484359741, + "learning_rate": 5.908266390256544e-06, + "loss": 1.1466, + "step": 1140 + }, + { + "epoch": 0.02661170180200404, + "grad_norm": 1.906965732574463, + "learning_rate": 5.913449080072559e-06, + "loss": 1.2679, + "step": 1141 + }, + { + "epoch": 0.02663502494118196, + "grad_norm": 1.3008593320846558, + "learning_rate": 5.918631769888573e-06, + "loss": 1.1242, + "step": 1142 + }, + { + "epoch": 0.026658348080359877, + "grad_norm": 1.2631815671920776, + "learning_rate": 5.923814459704587e-06, + "loss": 1.6476, + "step": 1143 + }, + { + "epoch": 0.026681671219537795, + "grad_norm": 1.3338450193405151, + "learning_rate": 5.928997149520601e-06, + "loss": 1.6404, + "step": 1144 + }, + { + "epoch": 0.026704994358715713, + "grad_norm": 1.4749959707260132, + "learning_rate": 5.934179839336616e-06, + "loss": 1.4754, + "step": 1145 + }, + { + "epoch": 0.02672831749789363, + "grad_norm": 1.399997353553772, + "learning_rate": 5.9393625291526304e-06, + "loss": 1.776, + "step": 1146 + }, + { + "epoch": 0.026751640637071545, + "grad_norm": 1.6688719987869263, + "learning_rate": 5.944545218968645e-06, + "loss": 1.4341, + "step": 1147 + }, + { + "epoch": 0.026774963776249463, + "grad_norm": 1.2055866718292236, + "learning_rate": 5.949727908784659e-06, + "loss": 1.366, + "step": 1148 + }, + { + "epoch": 0.02679828691542738, + "grad_norm": 1.834375262260437, + "learning_rate": 5.9549105986006735e-06, + "loss": 1.7205, + "step": 1149 + }, + { + "epoch": 0.0268216100546053, + "grad_norm": 1.6463091373443604, + "learning_rate": 5.960093288416689e-06, + "loss": 1.2175, + "step": 1150 + }, + { + "epoch": 0.026844933193783217, + "grad_norm": 1.2439314126968384, + "learning_rate": 5.965275978232703e-06, + "loss": 1.1599, + "step": 1151 + }, + { + "epoch": 0.026868256332961134, + "grad_norm": 1.428876519203186, + "learning_rate": 5.970458668048717e-06, + "loss": 1.7428, + "step": 1152 + }, + { + "epoch": 0.026891579472139052, + "grad_norm": 1.3530622720718384, + "learning_rate": 5.975641357864732e-06, + "loss": 1.4968, + "step": 1153 + }, + { + "epoch": 0.02691490261131697, + "grad_norm": 2.7352559566497803, + "learning_rate": 5.980824047680747e-06, + "loss": 1.5478, + "step": 1154 + }, + { + "epoch": 0.026938225750494888, + "grad_norm": 1.8357428312301636, + "learning_rate": 5.986006737496761e-06, + "loss": 1.5217, + "step": 1155 + }, + { + "epoch": 0.026961548889672806, + "grad_norm": 1.3974493741989136, + "learning_rate": 5.991189427312776e-06, + "loss": 1.6203, + "step": 1156 + }, + { + "epoch": 0.026984872028850724, + "grad_norm": 1.3089922666549683, + "learning_rate": 5.99637211712879e-06, + "loss": 1.7992, + "step": 1157 + }, + { + "epoch": 0.027008195168028642, + "grad_norm": 1.8275575637817383, + "learning_rate": 6.001554806944805e-06, + "loss": 1.4841, + "step": 1158 + }, + { + "epoch": 0.02703151830720656, + "grad_norm": 2.55710506439209, + "learning_rate": 6.0067374967608195e-06, + "loss": 1.3043, + "step": 1159 + }, + { + "epoch": 0.027054841446384478, + "grad_norm": 2.4591903686523438, + "learning_rate": 6.011920186576834e-06, + "loss": 1.3368, + "step": 1160 + }, + { + "epoch": 0.027078164585562395, + "grad_norm": 1.9370126724243164, + "learning_rate": 6.017102876392848e-06, + "loss": 1.4075, + "step": 1161 + }, + { + "epoch": 0.02710148772474031, + "grad_norm": 1.4310760498046875, + "learning_rate": 6.022285566208863e-06, + "loss": 1.5424, + "step": 1162 + }, + { + "epoch": 0.027124810863918228, + "grad_norm": 1.3892368078231812, + "learning_rate": 6.027468256024878e-06, + "loss": 1.6432, + "step": 1163 + }, + { + "epoch": 0.027148134003096146, + "grad_norm": 1.4820071458816528, + "learning_rate": 6.032650945840892e-06, + "loss": 1.409, + "step": 1164 + }, + { + "epoch": 0.027171457142274064, + "grad_norm": 1.1135878562927246, + "learning_rate": 6.037833635656906e-06, + "loss": 1.5977, + "step": 1165 + }, + { + "epoch": 0.02719478028145198, + "grad_norm": 1.6016969680786133, + "learning_rate": 6.043016325472922e-06, + "loss": 1.6486, + "step": 1166 + }, + { + "epoch": 0.0272181034206299, + "grad_norm": 1.5183762311935425, + "learning_rate": 6.048199015288936e-06, + "loss": 1.4068, + "step": 1167 + }, + { + "epoch": 0.027241426559807817, + "grad_norm": 1.4730808734893799, + "learning_rate": 6.05338170510495e-06, + "loss": 1.6202, + "step": 1168 + }, + { + "epoch": 0.027264749698985735, + "grad_norm": 1.4382350444793701, + "learning_rate": 6.058564394920965e-06, + "loss": 1.7055, + "step": 1169 + }, + { + "epoch": 0.027288072838163653, + "grad_norm": 0.9570834040641785, + "learning_rate": 6.06374708473698e-06, + "loss": 0.8602, + "step": 1170 + }, + { + "epoch": 0.02731139597734157, + "grad_norm": 1.2127379179000854, + "learning_rate": 6.068929774552994e-06, + "loss": 1.5333, + "step": 1171 + }, + { + "epoch": 0.02733471911651949, + "grad_norm": 1.5822348594665527, + "learning_rate": 6.074112464369008e-06, + "loss": 0.9605, + "step": 1172 + }, + { + "epoch": 0.027358042255697407, + "grad_norm": 1.3108526468276978, + "learning_rate": 6.079295154185022e-06, + "loss": 1.1987, + "step": 1173 + }, + { + "epoch": 0.027381365394875325, + "grad_norm": 2.005154848098755, + "learning_rate": 6.084477844001036e-06, + "loss": 1.7214, + "step": 1174 + }, + { + "epoch": 0.027404688534053243, + "grad_norm": 2.299222707748413, + "learning_rate": 6.089660533817051e-06, + "loss": 1.5244, + "step": 1175 + }, + { + "epoch": 0.027428011673231157, + "grad_norm": 1.2665340900421143, + "learning_rate": 6.094843223633066e-06, + "loss": 1.1735, + "step": 1176 + }, + { + "epoch": 0.027451334812409075, + "grad_norm": 1.418123483657837, + "learning_rate": 6.10002591344908e-06, + "loss": 1.6755, + "step": 1177 + }, + { + "epoch": 0.027474657951586993, + "grad_norm": 1.4280682802200317, + "learning_rate": 6.105208603265095e-06, + "loss": 1.6664, + "step": 1178 + }, + { + "epoch": 0.02749798109076491, + "grad_norm": 2.0804097652435303, + "learning_rate": 6.110391293081109e-06, + "loss": 1.4688, + "step": 1179 + }, + { + "epoch": 0.02752130422994283, + "grad_norm": 1.7536234855651855, + "learning_rate": 6.115573982897124e-06, + "loss": 1.5823, + "step": 1180 + }, + { + "epoch": 0.027544627369120746, + "grad_norm": 1.1604044437408447, + "learning_rate": 6.1207566727131385e-06, + "loss": 1.4818, + "step": 1181 + }, + { + "epoch": 0.027567950508298664, + "grad_norm": 1.3865594863891602, + "learning_rate": 6.125939362529153e-06, + "loss": 1.5467, + "step": 1182 + }, + { + "epoch": 0.027591273647476582, + "grad_norm": 1.526190996170044, + "learning_rate": 6.131122052345167e-06, + "loss": 1.3397, + "step": 1183 + }, + { + "epoch": 0.0276145967866545, + "grad_norm": 1.6010215282440186, + "learning_rate": 6.136304742161182e-06, + "loss": 1.5507, + "step": 1184 + }, + { + "epoch": 0.027637919925832418, + "grad_norm": 1.4297575950622559, + "learning_rate": 6.141487431977197e-06, + "loss": 1.397, + "step": 1185 + }, + { + "epoch": 0.027661243065010336, + "grad_norm": 1.380254864692688, + "learning_rate": 6.146670121793211e-06, + "loss": 1.251, + "step": 1186 + }, + { + "epoch": 0.027684566204188254, + "grad_norm": 1.5398340225219727, + "learning_rate": 6.1518528116092254e-06, + "loss": 1.7319, + "step": 1187 + }, + { + "epoch": 0.027707889343366172, + "grad_norm": 1.8836907148361206, + "learning_rate": 6.157035501425241e-06, + "loss": 1.1504, + "step": 1188 + }, + { + "epoch": 0.02773121248254409, + "grad_norm": 1.200628399848938, + "learning_rate": 6.162218191241255e-06, + "loss": 1.5138, + "step": 1189 + }, + { + "epoch": 0.027754535621722004, + "grad_norm": 1.7400058507919312, + "learning_rate": 6.167400881057269e-06, + "loss": 1.5398, + "step": 1190 + }, + { + "epoch": 0.027777858760899922, + "grad_norm": 1.2723171710968018, + "learning_rate": 6.172583570873284e-06, + "loss": 1.1157, + "step": 1191 + }, + { + "epoch": 0.02780118190007784, + "grad_norm": 1.4392553567886353, + "learning_rate": 6.177766260689299e-06, + "loss": 1.7444, + "step": 1192 + }, + { + "epoch": 0.027824505039255758, + "grad_norm": 1.533337950706482, + "learning_rate": 6.182948950505313e-06, + "loss": 1.4784, + "step": 1193 + }, + { + "epoch": 0.027847828178433676, + "grad_norm": 1.5458931922912598, + "learning_rate": 6.1881316403213276e-06, + "loss": 1.8139, + "step": 1194 + }, + { + "epoch": 0.027871151317611594, + "grad_norm": 1.133946180343628, + "learning_rate": 6.193314330137342e-06, + "loss": 1.5137, + "step": 1195 + }, + { + "epoch": 0.02789447445678951, + "grad_norm": 1.458628535270691, + "learning_rate": 6.198497019953357e-06, + "loss": 1.3172, + "step": 1196 + }, + { + "epoch": 0.02791779759596743, + "grad_norm": 2.2303454875946045, + "learning_rate": 6.2036797097693714e-06, + "loss": 1.2295, + "step": 1197 + }, + { + "epoch": 0.027941120735145347, + "grad_norm": 1.2555915117263794, + "learning_rate": 6.208862399585386e-06, + "loss": 1.5021, + "step": 1198 + }, + { + "epoch": 0.027964443874323265, + "grad_norm": 1.7872976064682007, + "learning_rate": 6.2140450894014e-06, + "loss": 0.9375, + "step": 1199 + }, + { + "epoch": 0.027987767013501183, + "grad_norm": 1.5110255479812622, + "learning_rate": 6.219227779217414e-06, + "loss": 1.871, + "step": 1200 + }, + { + "epoch": 0.0280110901526791, + "grad_norm": 1.5963770151138306, + "learning_rate": 6.224410469033428e-06, + "loss": 1.6184, + "step": 1201 + }, + { + "epoch": 0.02803441329185702, + "grad_norm": 1.7600239515304565, + "learning_rate": 6.229593158849443e-06, + "loss": 1.5337, + "step": 1202 + }, + { + "epoch": 0.028057736431034937, + "grad_norm": 1.3252232074737549, + "learning_rate": 6.2347758486654575e-06, + "loss": 1.4088, + "step": 1203 + }, + { + "epoch": 0.02808105957021285, + "grad_norm": 1.3839343786239624, + "learning_rate": 6.239958538481472e-06, + "loss": 1.305, + "step": 1204 + }, + { + "epoch": 0.02810438270939077, + "grad_norm": 1.6570122241973877, + "learning_rate": 6.245141228297486e-06, + "loss": 1.5596, + "step": 1205 + }, + { + "epoch": 0.028127705848568687, + "grad_norm": 1.4685866832733154, + "learning_rate": 6.250323918113501e-06, + "loss": 1.4931, + "step": 1206 + }, + { + "epoch": 0.028151028987746605, + "grad_norm": 1.263984203338623, + "learning_rate": 6.255506607929516e-06, + "loss": 1.5393, + "step": 1207 + }, + { + "epoch": 0.028174352126924523, + "grad_norm": 1.8634412288665771, + "learning_rate": 6.26068929774553e-06, + "loss": 1.2369, + "step": 1208 + }, + { + "epoch": 0.02819767526610244, + "grad_norm": 1.676034927368164, + "learning_rate": 6.2658719875615444e-06, + "loss": 1.5886, + "step": 1209 + }, + { + "epoch": 0.02822099840528036, + "grad_norm": 1.7271007299423218, + "learning_rate": 6.27105467737756e-06, + "loss": 1.2692, + "step": 1210 + }, + { + "epoch": 0.028244321544458276, + "grad_norm": 1.4238859415054321, + "learning_rate": 6.276237367193574e-06, + "loss": 1.6261, + "step": 1211 + }, + { + "epoch": 0.028267644683636194, + "grad_norm": 2.13999080657959, + "learning_rate": 6.281420057009588e-06, + "loss": 1.7009, + "step": 1212 + }, + { + "epoch": 0.028290967822814112, + "grad_norm": 2.1164069175720215, + "learning_rate": 6.286602746825603e-06, + "loss": 1.4856, + "step": 1213 + }, + { + "epoch": 0.02831429096199203, + "grad_norm": 1.6996465921401978, + "learning_rate": 6.291785436641618e-06, + "loss": 1.4621, + "step": 1214 + }, + { + "epoch": 0.028337614101169948, + "grad_norm": 1.466536045074463, + "learning_rate": 6.296968126457632e-06, + "loss": 1.5882, + "step": 1215 + }, + { + "epoch": 0.028360937240347866, + "grad_norm": 1.7248129844665527, + "learning_rate": 6.3021508162736466e-06, + "loss": 1.658, + "step": 1216 + }, + { + "epoch": 0.028384260379525784, + "grad_norm": 1.7973899841308594, + "learning_rate": 6.307333506089661e-06, + "loss": 1.4981, + "step": 1217 + }, + { + "epoch": 0.0284075835187037, + "grad_norm": 1.4502708911895752, + "learning_rate": 6.312516195905676e-06, + "loss": 1.8872, + "step": 1218 + }, + { + "epoch": 0.028430906657881616, + "grad_norm": 1.592411756515503, + "learning_rate": 6.3176988857216905e-06, + "loss": 1.4145, + "step": 1219 + }, + { + "epoch": 0.028454229797059534, + "grad_norm": 1.931400179862976, + "learning_rate": 6.322881575537705e-06, + "loss": 1.6221, + "step": 1220 + }, + { + "epoch": 0.028477552936237452, + "grad_norm": 1.5922832489013672, + "learning_rate": 6.328064265353719e-06, + "loss": 1.3897, + "step": 1221 + }, + { + "epoch": 0.02850087607541537, + "grad_norm": 1.4899603128433228, + "learning_rate": 6.333246955169734e-06, + "loss": 1.66, + "step": 1222 + }, + { + "epoch": 0.028524199214593288, + "grad_norm": 1.3820170164108276, + "learning_rate": 6.338429644985749e-06, + "loss": 1.8425, + "step": 1223 + }, + { + "epoch": 0.028547522353771206, + "grad_norm": 1.6127132177352905, + "learning_rate": 6.343612334801763e-06, + "loss": 1.3965, + "step": 1224 + }, + { + "epoch": 0.028570845492949123, + "grad_norm": 1.927259922027588, + "learning_rate": 6.348795024617777e-06, + "loss": 1.486, + "step": 1225 + }, + { + "epoch": 0.02859416863212704, + "grad_norm": 1.5987411737442017, + "learning_rate": 6.353977714433793e-06, + "loss": 1.4371, + "step": 1226 + }, + { + "epoch": 0.02861749177130496, + "grad_norm": 1.7805335521697998, + "learning_rate": 6.359160404249805e-06, + "loss": 1.56, + "step": 1227 + }, + { + "epoch": 0.028640814910482877, + "grad_norm": 1.7960704565048218, + "learning_rate": 6.3643430940658204e-06, + "loss": 1.5536, + "step": 1228 + }, + { + "epoch": 0.028664138049660795, + "grad_norm": 1.4014300107955933, + "learning_rate": 6.369525783881835e-06, + "loss": 1.4391, + "step": 1229 + }, + { + "epoch": 0.028687461188838713, + "grad_norm": 1.7049264907836914, + "learning_rate": 6.374708473697849e-06, + "loss": 1.9225, + "step": 1230 + }, + { + "epoch": 0.02871078432801663, + "grad_norm": 1.9948570728302002, + "learning_rate": 6.3798911635138635e-06, + "loss": 1.6279, + "step": 1231 + }, + { + "epoch": 0.02873410746719455, + "grad_norm": 2.101736068725586, + "learning_rate": 6.385073853329879e-06, + "loss": 1.5433, + "step": 1232 + }, + { + "epoch": 0.028757430606372463, + "grad_norm": 1.342325210571289, + "learning_rate": 6.390256543145893e-06, + "loss": 1.3606, + "step": 1233 + }, + { + "epoch": 0.02878075374555038, + "grad_norm": 1.5539692640304565, + "learning_rate": 6.395439232961907e-06, + "loss": 1.4339, + "step": 1234 + }, + { + "epoch": 0.0288040768847283, + "grad_norm": 1.6053344011306763, + "learning_rate": 6.400621922777922e-06, + "loss": 1.5735, + "step": 1235 + }, + { + "epoch": 0.028827400023906217, + "grad_norm": 1.1527775526046753, + "learning_rate": 6.405804612593937e-06, + "loss": 1.3265, + "step": 1236 + }, + { + "epoch": 0.028850723163084135, + "grad_norm": 2.401747465133667, + "learning_rate": 6.410987302409951e-06, + "loss": 1.3331, + "step": 1237 + }, + { + "epoch": 0.028874046302262053, + "grad_norm": 1.372536301612854, + "learning_rate": 6.416169992225966e-06, + "loss": 1.6371, + "step": 1238 + }, + { + "epoch": 0.02889736944143997, + "grad_norm": 1.528669834136963, + "learning_rate": 6.42135268204198e-06, + "loss": 1.4658, + "step": 1239 + }, + { + "epoch": 0.02892069258061789, + "grad_norm": 1.7370809316635132, + "learning_rate": 6.426535371857995e-06, + "loss": 1.4893, + "step": 1240 + }, + { + "epoch": 0.028944015719795806, + "grad_norm": 1.5757806301116943, + "learning_rate": 6.4317180616740095e-06, + "loss": 1.2563, + "step": 1241 + }, + { + "epoch": 0.028967338858973724, + "grad_norm": 1.2458890676498413, + "learning_rate": 6.436900751490024e-06, + "loss": 1.6522, + "step": 1242 + }, + { + "epoch": 0.028990661998151642, + "grad_norm": 1.743046760559082, + "learning_rate": 6.442083441306038e-06, + "loss": 1.6444, + "step": 1243 + }, + { + "epoch": 0.02901398513732956, + "grad_norm": 1.5543162822723389, + "learning_rate": 6.447266131122053e-06, + "loss": 1.6381, + "step": 1244 + }, + { + "epoch": 0.029037308276507478, + "grad_norm": 1.3490428924560547, + "learning_rate": 6.452448820938068e-06, + "loss": 1.4615, + "step": 1245 + }, + { + "epoch": 0.029060631415685396, + "grad_norm": 1.3732086420059204, + "learning_rate": 6.457631510754082e-06, + "loss": 1.4085, + "step": 1246 + }, + { + "epoch": 0.02908395455486331, + "grad_norm": 2.9364993572235107, + "learning_rate": 6.462814200570096e-06, + "loss": 1.4811, + "step": 1247 + }, + { + "epoch": 0.029107277694041228, + "grad_norm": 1.2069623470306396, + "learning_rate": 6.467996890386112e-06, + "loss": 1.3635, + "step": 1248 + }, + { + "epoch": 0.029130600833219146, + "grad_norm": 1.2883137464523315, + "learning_rate": 6.473179580202126e-06, + "loss": 1.4202, + "step": 1249 + }, + { + "epoch": 0.029153923972397064, + "grad_norm": 1.592976689338684, + "learning_rate": 6.47836227001814e-06, + "loss": 2.1116, + "step": 1250 + }, + { + "epoch": 0.029177247111574982, + "grad_norm": 1.394774079322815, + "learning_rate": 6.483544959834155e-06, + "loss": 1.5042, + "step": 1251 + }, + { + "epoch": 0.0292005702507529, + "grad_norm": 1.2127888202667236, + "learning_rate": 6.48872764965017e-06, + "loss": 1.3806, + "step": 1252 + }, + { + "epoch": 0.029223893389930818, + "grad_norm": 1.5445924997329712, + "learning_rate": 6.493910339466184e-06, + "loss": 1.5067, + "step": 1253 + }, + { + "epoch": 0.029247216529108735, + "grad_norm": 2.4520442485809326, + "learning_rate": 6.4990930292821985e-06, + "loss": 1.3649, + "step": 1254 + }, + { + "epoch": 0.029270539668286653, + "grad_norm": 2.032709836959839, + "learning_rate": 6.504275719098212e-06, + "loss": 1.2058, + "step": 1255 + }, + { + "epoch": 0.02929386280746457, + "grad_norm": 1.3742554187774658, + "learning_rate": 6.509458408914226e-06, + "loss": 1.4328, + "step": 1256 + }, + { + "epoch": 0.02931718594664249, + "grad_norm": 1.4859979152679443, + "learning_rate": 6.514641098730241e-06, + "loss": 1.6409, + "step": 1257 + }, + { + "epoch": 0.029340509085820407, + "grad_norm": 1.6881428956985474, + "learning_rate": 6.519823788546256e-06, + "loss": 1.6298, + "step": 1258 + }, + { + "epoch": 0.029363832224998325, + "grad_norm": 1.892412543296814, + "learning_rate": 6.52500647836227e-06, + "loss": 1.6898, + "step": 1259 + }, + { + "epoch": 0.029387155364176243, + "grad_norm": 1.4890961647033691, + "learning_rate": 6.530189168178285e-06, + "loss": 1.6164, + "step": 1260 + }, + { + "epoch": 0.029410478503354157, + "grad_norm": 1.530034065246582, + "learning_rate": 6.535371857994299e-06, + "loss": 1.4036, + "step": 1261 + }, + { + "epoch": 0.029433801642532075, + "grad_norm": 1.4801392555236816, + "learning_rate": 6.540554547810314e-06, + "loss": 1.5928, + "step": 1262 + }, + { + "epoch": 0.029457124781709993, + "grad_norm": 1.4419362545013428, + "learning_rate": 6.5457372376263285e-06, + "loss": 1.7833, + "step": 1263 + }, + { + "epoch": 0.02948044792088791, + "grad_norm": 1.6963889598846436, + "learning_rate": 6.550919927442343e-06, + "loss": 1.7366, + "step": 1264 + }, + { + "epoch": 0.02950377106006583, + "grad_norm": 1.4853816032409668, + "learning_rate": 6.556102617258357e-06, + "loss": 1.2297, + "step": 1265 + }, + { + "epoch": 0.029527094199243747, + "grad_norm": 1.6151559352874756, + "learning_rate": 6.561285307074372e-06, + "loss": 2.0062, + "step": 1266 + }, + { + "epoch": 0.029550417338421665, + "grad_norm": 1.3132925033569336, + "learning_rate": 6.566467996890387e-06, + "loss": 1.7708, + "step": 1267 + }, + { + "epoch": 0.029573740477599583, + "grad_norm": 1.4057172536849976, + "learning_rate": 6.571650686706401e-06, + "loss": 1.5811, + "step": 1268 + }, + { + "epoch": 0.0295970636167775, + "grad_norm": 1.5369668006896973, + "learning_rate": 6.5768333765224154e-06, + "loss": 1.5121, + "step": 1269 + }, + { + "epoch": 0.02962038675595542, + "grad_norm": 1.6567087173461914, + "learning_rate": 6.582016066338431e-06, + "loss": 1.2413, + "step": 1270 + }, + { + "epoch": 0.029643709895133336, + "grad_norm": 1.3374396562576294, + "learning_rate": 6.587198756154445e-06, + "loss": 1.5594, + "step": 1271 + }, + { + "epoch": 0.029667033034311254, + "grad_norm": 1.4892241954803467, + "learning_rate": 6.592381445970459e-06, + "loss": 1.6287, + "step": 1272 + }, + { + "epoch": 0.029690356173489172, + "grad_norm": 2.012141466140747, + "learning_rate": 6.597564135786474e-06, + "loss": 1.7356, + "step": 1273 + }, + { + "epoch": 0.02971367931266709, + "grad_norm": 2.2330586910247803, + "learning_rate": 6.602746825602489e-06, + "loss": 1.1928, + "step": 1274 + }, + { + "epoch": 0.029737002451845004, + "grad_norm": 1.7101742029190063, + "learning_rate": 6.607929515418503e-06, + "loss": 1.497, + "step": 1275 + }, + { + "epoch": 0.029760325591022922, + "grad_norm": 1.4773057699203491, + "learning_rate": 6.6131122052345175e-06, + "loss": 1.4135, + "step": 1276 + }, + { + "epoch": 0.02978364873020084, + "grad_norm": 1.4007784128189087, + "learning_rate": 6.618294895050532e-06, + "loss": 1.3921, + "step": 1277 + }, + { + "epoch": 0.029806971869378758, + "grad_norm": 1.7430599927902222, + "learning_rate": 6.623477584866547e-06, + "loss": 1.5352, + "step": 1278 + }, + { + "epoch": 0.029830295008556676, + "grad_norm": 2.562096118927002, + "learning_rate": 6.6286602746825614e-06, + "loss": 1.5325, + "step": 1279 + }, + { + "epoch": 0.029853618147734594, + "grad_norm": 1.192498803138733, + "learning_rate": 6.633842964498576e-06, + "loss": 1.1816, + "step": 1280 + }, + { + "epoch": 0.02987694128691251, + "grad_norm": 2.39277982711792, + "learning_rate": 6.63902565431459e-06, + "loss": 1.3732, + "step": 1281 + }, + { + "epoch": 0.02990026442609043, + "grad_norm": 1.3731800317764282, + "learning_rate": 6.644208344130604e-06, + "loss": 1.4175, + "step": 1282 + }, + { + "epoch": 0.029923587565268348, + "grad_norm": 2.297088146209717, + "learning_rate": 6.649391033946618e-06, + "loss": 1.5919, + "step": 1283 + }, + { + "epoch": 0.029946910704446265, + "grad_norm": 1.1062113046646118, + "learning_rate": 6.654573723762633e-06, + "loss": 1.3707, + "step": 1284 + }, + { + "epoch": 0.029970233843624183, + "grad_norm": 2.175673246383667, + "learning_rate": 6.6597564135786475e-06, + "loss": 1.4268, + "step": 1285 + }, + { + "epoch": 0.0299935569828021, + "grad_norm": 1.57578444480896, + "learning_rate": 6.664939103394662e-06, + "loss": 1.6065, + "step": 1286 + }, + { + "epoch": 0.03001688012198002, + "grad_norm": 1.757105827331543, + "learning_rate": 6.670121793210676e-06, + "loss": 1.5827, + "step": 1287 + }, + { + "epoch": 0.030040203261157937, + "grad_norm": 1.6778910160064697, + "learning_rate": 6.675304483026691e-06, + "loss": 1.4697, + "step": 1288 + }, + { + "epoch": 0.030063526400335855, + "grad_norm": 1.4940367937088013, + "learning_rate": 6.680487172842706e-06, + "loss": 1.2309, + "step": 1289 + }, + { + "epoch": 0.03008684953951377, + "grad_norm": 2.175011157989502, + "learning_rate": 6.68566986265872e-06, + "loss": 0.9675, + "step": 1290 + }, + { + "epoch": 0.030110172678691687, + "grad_norm": 2.0137412548065186, + "learning_rate": 6.6908525524747344e-06, + "loss": 1.6618, + "step": 1291 + }, + { + "epoch": 0.030133495817869605, + "grad_norm": 1.3541489839553833, + "learning_rate": 6.69603524229075e-06, + "loss": 1.2989, + "step": 1292 + }, + { + "epoch": 0.030156818957047523, + "grad_norm": 1.9265953302383423, + "learning_rate": 6.701217932106764e-06, + "loss": 1.3859, + "step": 1293 + }, + { + "epoch": 0.03018014209622544, + "grad_norm": 1.899145483970642, + "learning_rate": 6.706400621922778e-06, + "loss": 1.2468, + "step": 1294 + }, + { + "epoch": 0.03020346523540336, + "grad_norm": 1.6764010190963745, + "learning_rate": 6.711583311738793e-06, + "loss": 1.4796, + "step": 1295 + }, + { + "epoch": 0.030226788374581277, + "grad_norm": 1.502276062965393, + "learning_rate": 6.716766001554808e-06, + "loss": 1.6102, + "step": 1296 + }, + { + "epoch": 0.030250111513759195, + "grad_norm": 1.742180347442627, + "learning_rate": 6.721948691370822e-06, + "loss": 1.4743, + "step": 1297 + }, + { + "epoch": 0.030273434652937112, + "grad_norm": 1.503127098083496, + "learning_rate": 6.7271313811868366e-06, + "loss": 1.7023, + "step": 1298 + }, + { + "epoch": 0.03029675779211503, + "grad_norm": 1.4494696855545044, + "learning_rate": 6.732314071002851e-06, + "loss": 1.6774, + "step": 1299 + }, + { + "epoch": 0.030320080931292948, + "grad_norm": 1.3726390600204468, + "learning_rate": 6.737496760818866e-06, + "loss": 1.6272, + "step": 1300 + }, + { + "epoch": 0.030343404070470866, + "grad_norm": 1.6922540664672852, + "learning_rate": 6.7426794506348805e-06, + "loss": 1.6249, + "step": 1301 + }, + { + "epoch": 0.030366727209648784, + "grad_norm": 1.3822194337844849, + "learning_rate": 6.747862140450895e-06, + "loss": 1.779, + "step": 1302 + }, + { + "epoch": 0.030390050348826702, + "grad_norm": 1.2841784954071045, + "learning_rate": 6.753044830266909e-06, + "loss": 1.2516, + "step": 1303 + }, + { + "epoch": 0.030413373488004616, + "grad_norm": 2.045302152633667, + "learning_rate": 6.758227520082924e-06, + "loss": 1.4461, + "step": 1304 + }, + { + "epoch": 0.030436696627182534, + "grad_norm": 1.6968058347702026, + "learning_rate": 6.763410209898939e-06, + "loss": 1.545, + "step": 1305 + }, + { + "epoch": 0.030460019766360452, + "grad_norm": 1.6409857273101807, + "learning_rate": 6.768592899714953e-06, + "loss": 1.7205, + "step": 1306 + }, + { + "epoch": 0.03048334290553837, + "grad_norm": 1.2925307750701904, + "learning_rate": 6.773775589530967e-06, + "loss": 1.5889, + "step": 1307 + }, + { + "epoch": 0.030506666044716288, + "grad_norm": 1.4610506296157837, + "learning_rate": 6.778958279346982e-06, + "loss": 1.49, + "step": 1308 + }, + { + "epoch": 0.030529989183894206, + "grad_norm": 1.5941089391708374, + "learning_rate": 6.784140969162997e-06, + "loss": 1.8275, + "step": 1309 + }, + { + "epoch": 0.030553312323072124, + "grad_norm": 1.2063391208648682, + "learning_rate": 6.7893236589790104e-06, + "loss": 1.2659, + "step": 1310 + }, + { + "epoch": 0.03057663546225004, + "grad_norm": 1.512366771697998, + "learning_rate": 6.794506348795025e-06, + "loss": 1.502, + "step": 1311 + }, + { + "epoch": 0.03059995860142796, + "grad_norm": 2.0490636825561523, + "learning_rate": 6.799689038611039e-06, + "loss": 1.4567, + "step": 1312 + }, + { + "epoch": 0.030623281740605877, + "grad_norm": 2.196171522140503, + "learning_rate": 6.8048717284270535e-06, + "loss": 1.7189, + "step": 1313 + }, + { + "epoch": 0.030646604879783795, + "grad_norm": 1.434403419494629, + "learning_rate": 6.810054418243069e-06, + "loss": 1.4947, + "step": 1314 + }, + { + "epoch": 0.030669928018961713, + "grad_norm": 1.3586199283599854, + "learning_rate": 6.815237108059083e-06, + "loss": 1.5511, + "step": 1315 + }, + { + "epoch": 0.03069325115813963, + "grad_norm": 1.7212327718734741, + "learning_rate": 6.820419797875097e-06, + "loss": 1.625, + "step": 1316 + }, + { + "epoch": 0.03071657429731755, + "grad_norm": 1.7246372699737549, + "learning_rate": 6.825602487691112e-06, + "loss": 1.6043, + "step": 1317 + }, + { + "epoch": 0.030739897436495463, + "grad_norm": 1.401949405670166, + "learning_rate": 6.830785177507127e-06, + "loss": 0.9642, + "step": 1318 + }, + { + "epoch": 0.03076322057567338, + "grad_norm": 1.6501095294952393, + "learning_rate": 6.835967867323141e-06, + "loss": 1.4776, + "step": 1319 + }, + { + "epoch": 0.0307865437148513, + "grad_norm": 1.266641616821289, + "learning_rate": 6.841150557139156e-06, + "loss": 1.1332, + "step": 1320 + }, + { + "epoch": 0.030809866854029217, + "grad_norm": 1.0934447050094604, + "learning_rate": 6.84633324695517e-06, + "loss": 1.6201, + "step": 1321 + }, + { + "epoch": 0.030833189993207135, + "grad_norm": 1.4711166620254517, + "learning_rate": 6.851515936771185e-06, + "loss": 1.401, + "step": 1322 + }, + { + "epoch": 0.030856513132385053, + "grad_norm": 1.609348177909851, + "learning_rate": 6.8566986265871995e-06, + "loss": 1.5497, + "step": 1323 + }, + { + "epoch": 0.03087983627156297, + "grad_norm": 1.277185082435608, + "learning_rate": 6.861881316403214e-06, + "loss": 1.5056, + "step": 1324 + }, + { + "epoch": 0.03090315941074089, + "grad_norm": 1.4644626379013062, + "learning_rate": 6.867064006219228e-06, + "loss": 1.3443, + "step": 1325 + }, + { + "epoch": 0.030926482549918807, + "grad_norm": 1.4824533462524414, + "learning_rate": 6.872246696035243e-06, + "loss": 1.5054, + "step": 1326 + }, + { + "epoch": 0.030949805689096724, + "grad_norm": 1.4885330200195312, + "learning_rate": 6.877429385851258e-06, + "loss": 1.4403, + "step": 1327 + }, + { + "epoch": 0.030973128828274642, + "grad_norm": 1.639889121055603, + "learning_rate": 6.882612075667272e-06, + "loss": 1.7286, + "step": 1328 + }, + { + "epoch": 0.03099645196745256, + "grad_norm": 1.2644333839416504, + "learning_rate": 6.887794765483286e-06, + "loss": 1.4472, + "step": 1329 + }, + { + "epoch": 0.031019775106630478, + "grad_norm": 1.4533531665802002, + "learning_rate": 6.892977455299302e-06, + "loss": 1.6504, + "step": 1330 + }, + { + "epoch": 0.031043098245808396, + "grad_norm": 1.5860834121704102, + "learning_rate": 6.898160145115316e-06, + "loss": 1.3219, + "step": 1331 + }, + { + "epoch": 0.03106642138498631, + "grad_norm": 1.4244756698608398, + "learning_rate": 6.90334283493133e-06, + "loss": 1.2863, + "step": 1332 + }, + { + "epoch": 0.03108974452416423, + "grad_norm": 1.7279314994812012, + "learning_rate": 6.908525524747345e-06, + "loss": 1.5325, + "step": 1333 + }, + { + "epoch": 0.031113067663342146, + "grad_norm": 1.3759844303131104, + "learning_rate": 6.913708214563359e-06, + "loss": 1.7333, + "step": 1334 + }, + { + "epoch": 0.031136390802520064, + "grad_norm": 1.3596171140670776, + "learning_rate": 6.918890904379374e-06, + "loss": 1.4572, + "step": 1335 + }, + { + "epoch": 0.031159713941697982, + "grad_norm": 1.4598828554153442, + "learning_rate": 6.9240735941953885e-06, + "loss": 1.5375, + "step": 1336 + }, + { + "epoch": 0.0311830370808759, + "grad_norm": 1.7578270435333252, + "learning_rate": 6.929256284011402e-06, + "loss": 1.7456, + "step": 1337 + }, + { + "epoch": 0.031206360220053818, + "grad_norm": 1.8432106971740723, + "learning_rate": 6.934438973827416e-06, + "loss": 1.3632, + "step": 1338 + }, + { + "epoch": 0.031229683359231736, + "grad_norm": 1.3926173448562622, + "learning_rate": 6.939621663643431e-06, + "loss": 1.5246, + "step": 1339 + }, + { + "epoch": 0.031253006498409654, + "grad_norm": 1.639283299446106, + "learning_rate": 6.944804353459446e-06, + "loss": 1.4081, + "step": 1340 + }, + { + "epoch": 0.03127632963758757, + "grad_norm": 1.818247675895691, + "learning_rate": 6.94998704327546e-06, + "loss": 1.4222, + "step": 1341 + }, + { + "epoch": 0.03129965277676549, + "grad_norm": 1.7598317861557007, + "learning_rate": 6.955169733091475e-06, + "loss": 1.5457, + "step": 1342 + }, + { + "epoch": 0.03132297591594341, + "grad_norm": 1.9077101945877075, + "learning_rate": 6.960352422907489e-06, + "loss": 1.2585, + "step": 1343 + }, + { + "epoch": 0.031346299055121325, + "grad_norm": 1.7100765705108643, + "learning_rate": 6.965535112723504e-06, + "loss": 1.5487, + "step": 1344 + }, + { + "epoch": 0.03136962219429924, + "grad_norm": 1.4282541275024414, + "learning_rate": 6.9707178025395185e-06, + "loss": 1.7457, + "step": 1345 + }, + { + "epoch": 0.03139294533347716, + "grad_norm": 1.5989662408828735, + "learning_rate": 6.975900492355533e-06, + "loss": 1.7449, + "step": 1346 + }, + { + "epoch": 0.03141626847265508, + "grad_norm": 1.2489700317382812, + "learning_rate": 6.981083182171547e-06, + "loss": 1.4873, + "step": 1347 + }, + { + "epoch": 0.031439591611833, + "grad_norm": 1.60476815700531, + "learning_rate": 6.986265871987562e-06, + "loss": 1.4751, + "step": 1348 + }, + { + "epoch": 0.031462914751010915, + "grad_norm": 1.5303354263305664, + "learning_rate": 6.991448561803577e-06, + "loss": 1.5709, + "step": 1349 + }, + { + "epoch": 0.03148623789018883, + "grad_norm": 1.462499737739563, + "learning_rate": 6.996631251619591e-06, + "loss": 1.7366, + "step": 1350 + }, + { + "epoch": 0.03150956102936675, + "grad_norm": 1.4246290922164917, + "learning_rate": 7.0018139414356054e-06, + "loss": 1.1592, + "step": 1351 + }, + { + "epoch": 0.03153288416854467, + "grad_norm": 1.8897913694381714, + "learning_rate": 7.006996631251621e-06, + "loss": 1.1699, + "step": 1352 + }, + { + "epoch": 0.031556207307722586, + "grad_norm": 1.6516541242599487, + "learning_rate": 7.012179321067635e-06, + "loss": 1.4705, + "step": 1353 + }, + { + "epoch": 0.0315795304469005, + "grad_norm": 1.816272258758545, + "learning_rate": 7.017362010883649e-06, + "loss": 1.3166, + "step": 1354 + }, + { + "epoch": 0.031602853586078415, + "grad_norm": 1.631224274635315, + "learning_rate": 7.022544700699664e-06, + "loss": 1.9471, + "step": 1355 + }, + { + "epoch": 0.03162617672525633, + "grad_norm": 1.7657747268676758, + "learning_rate": 7.027727390515678e-06, + "loss": 1.6623, + "step": 1356 + }, + { + "epoch": 0.03164949986443425, + "grad_norm": 1.5499768257141113, + "learning_rate": 7.032910080331693e-06, + "loss": 1.328, + "step": 1357 + }, + { + "epoch": 0.03167282300361217, + "grad_norm": 1.5339092016220093, + "learning_rate": 7.0380927701477075e-06, + "loss": 1.79, + "step": 1358 + }, + { + "epoch": 0.03169614614279009, + "grad_norm": 2.1172358989715576, + "learning_rate": 7.043275459963722e-06, + "loss": 1.719, + "step": 1359 + }, + { + "epoch": 0.031719469281968005, + "grad_norm": 1.5365610122680664, + "learning_rate": 7.048458149779736e-06, + "loss": 1.2236, + "step": 1360 + }, + { + "epoch": 0.03174279242114592, + "grad_norm": 1.7277380228042603, + "learning_rate": 7.0536408395957514e-06, + "loss": 1.768, + "step": 1361 + }, + { + "epoch": 0.03176611556032384, + "grad_norm": 3.0157341957092285, + "learning_rate": 7.058823529411766e-06, + "loss": 1.023, + "step": 1362 + }, + { + "epoch": 0.03178943869950176, + "grad_norm": 1.682496190071106, + "learning_rate": 7.06400621922778e-06, + "loss": 1.5555, + "step": 1363 + }, + { + "epoch": 0.031812761838679676, + "grad_norm": 1.6679117679595947, + "learning_rate": 7.0691889090437945e-06, + "loss": 1.762, + "step": 1364 + }, + { + "epoch": 0.031836084977857594, + "grad_norm": 1.5026060342788696, + "learning_rate": 7.074371598859808e-06, + "loss": 1.2893, + "step": 1365 + }, + { + "epoch": 0.03185940811703551, + "grad_norm": 1.8401672840118408, + "learning_rate": 7.079554288675823e-06, + "loss": 1.4318, + "step": 1366 + }, + { + "epoch": 0.03188273125621343, + "grad_norm": 1.6953387260437012, + "learning_rate": 7.0847369784918375e-06, + "loss": 1.5304, + "step": 1367 + }, + { + "epoch": 0.03190605439539135, + "grad_norm": 1.7483880519866943, + "learning_rate": 7.089919668307852e-06, + "loss": 1.763, + "step": 1368 + }, + { + "epoch": 0.031929377534569266, + "grad_norm": 1.6970646381378174, + "learning_rate": 7.095102358123866e-06, + "loss": 1.4232, + "step": 1369 + }, + { + "epoch": 0.031952700673747184, + "grad_norm": 1.4489586353302002, + "learning_rate": 7.100285047939881e-06, + "loss": 1.2495, + "step": 1370 + }, + { + "epoch": 0.0319760238129251, + "grad_norm": 1.8368195295333862, + "learning_rate": 7.105467737755896e-06, + "loss": 1.3631, + "step": 1371 + }, + { + "epoch": 0.03199934695210302, + "grad_norm": 2.073723077774048, + "learning_rate": 7.11065042757191e-06, + "loss": 1.4958, + "step": 1372 + }, + { + "epoch": 0.03202267009128094, + "grad_norm": 1.7000291347503662, + "learning_rate": 7.1158331173879244e-06, + "loss": 1.5018, + "step": 1373 + }, + { + "epoch": 0.032045993230458855, + "grad_norm": 1.896183729171753, + "learning_rate": 7.12101580720394e-06, + "loss": 1.4754, + "step": 1374 + }, + { + "epoch": 0.03206931636963677, + "grad_norm": 1.4250632524490356, + "learning_rate": 7.126198497019954e-06, + "loss": 1.2758, + "step": 1375 + }, + { + "epoch": 0.03209263950881469, + "grad_norm": 1.968647837638855, + "learning_rate": 7.131381186835968e-06, + "loss": 1.5062, + "step": 1376 + }, + { + "epoch": 0.03211596264799261, + "grad_norm": 1.5044890642166138, + "learning_rate": 7.136563876651983e-06, + "loss": 1.7057, + "step": 1377 + }, + { + "epoch": 0.03213928578717053, + "grad_norm": 1.5252755880355835, + "learning_rate": 7.141746566467998e-06, + "loss": 1.4311, + "step": 1378 + }, + { + "epoch": 0.032162608926348445, + "grad_norm": 1.7001562118530273, + "learning_rate": 7.146929256284012e-06, + "loss": 1.5573, + "step": 1379 + }, + { + "epoch": 0.03218593206552636, + "grad_norm": 2.1587064266204834, + "learning_rate": 7.1521119461000266e-06, + "loss": 1.1552, + "step": 1380 + }, + { + "epoch": 0.03220925520470428, + "grad_norm": 1.5938003063201904, + "learning_rate": 7.157294635916041e-06, + "loss": 1.3843, + "step": 1381 + }, + { + "epoch": 0.03223257834388219, + "grad_norm": 1.5198419094085693, + "learning_rate": 7.162477325732055e-06, + "loss": 1.4412, + "step": 1382 + }, + { + "epoch": 0.03225590148306011, + "grad_norm": 1.8579787015914917, + "learning_rate": 7.1676600155480705e-06, + "loss": 1.2986, + "step": 1383 + }, + { + "epoch": 0.03227922462223803, + "grad_norm": 1.5341622829437256, + "learning_rate": 7.172842705364085e-06, + "loss": 1.2032, + "step": 1384 + }, + { + "epoch": 0.032302547761415945, + "grad_norm": 2.0681440830230713, + "learning_rate": 7.178025395180099e-06, + "loss": 1.7171, + "step": 1385 + }, + { + "epoch": 0.03232587090059386, + "grad_norm": 1.7611883878707886, + "learning_rate": 7.1832080849961135e-06, + "loss": 1.3376, + "step": 1386 + }, + { + "epoch": 0.03234919403977178, + "grad_norm": 1.6917016506195068, + "learning_rate": 7.188390774812129e-06, + "loss": 1.3909, + "step": 1387 + }, + { + "epoch": 0.0323725171789497, + "grad_norm": 1.1238902807235718, + "learning_rate": 7.193573464628143e-06, + "loss": 1.1826, + "step": 1388 + }, + { + "epoch": 0.03239584031812762, + "grad_norm": 1.5484822988510132, + "learning_rate": 7.198756154444157e-06, + "loss": 1.4476, + "step": 1389 + }, + { + "epoch": 0.032419163457305535, + "grad_norm": 1.703244686126709, + "learning_rate": 7.203938844260172e-06, + "loss": 1.5256, + "step": 1390 + }, + { + "epoch": 0.03244248659648345, + "grad_norm": 2.350940465927124, + "learning_rate": 7.209121534076187e-06, + "loss": 1.4486, + "step": 1391 + }, + { + "epoch": 0.03246580973566137, + "grad_norm": 1.2115894556045532, + "learning_rate": 7.2143042238922004e-06, + "loss": 1.2387, + "step": 1392 + }, + { + "epoch": 0.03248913287483929, + "grad_norm": 1.4883688688278198, + "learning_rate": 7.219486913708215e-06, + "loss": 1.4499, + "step": 1393 + }, + { + "epoch": 0.032512456014017206, + "grad_norm": 1.2324401140213013, + "learning_rate": 7.224669603524229e-06, + "loss": 1.3548, + "step": 1394 + }, + { + "epoch": 0.032535779153195124, + "grad_norm": 2.054262638092041, + "learning_rate": 7.2298522933402435e-06, + "loss": 1.4986, + "step": 1395 + }, + { + "epoch": 0.03255910229237304, + "grad_norm": 1.7639497518539429, + "learning_rate": 7.235034983156259e-06, + "loss": 1.4023, + "step": 1396 + }, + { + "epoch": 0.03258242543155096, + "grad_norm": 1.3556314706802368, + "learning_rate": 7.240217672972273e-06, + "loss": 1.4122, + "step": 1397 + }, + { + "epoch": 0.03260574857072888, + "grad_norm": 1.8941506147384644, + "learning_rate": 7.245400362788287e-06, + "loss": 1.1754, + "step": 1398 + }, + { + "epoch": 0.032629071709906796, + "grad_norm": 1.7958110570907593, + "learning_rate": 7.250583052604302e-06, + "loss": 1.9056, + "step": 1399 + }, + { + "epoch": 0.032652394849084714, + "grad_norm": 1.3702186346054077, + "learning_rate": 7.255765742420317e-06, + "loss": 1.5533, + "step": 1400 + }, + { + "epoch": 0.03267571798826263, + "grad_norm": 1.4540181159973145, + "learning_rate": 7.260948432236331e-06, + "loss": 1.4704, + "step": 1401 + }, + { + "epoch": 0.03269904112744055, + "grad_norm": 1.6024681329727173, + "learning_rate": 7.266131122052346e-06, + "loss": 1.4394, + "step": 1402 + }, + { + "epoch": 0.03272236426661847, + "grad_norm": 1.5546940565109253, + "learning_rate": 7.27131381186836e-06, + "loss": 1.54, + "step": 1403 + }, + { + "epoch": 0.032745687405796385, + "grad_norm": 1.5781769752502441, + "learning_rate": 7.276496501684375e-06, + "loss": 1.3658, + "step": 1404 + }, + { + "epoch": 0.0327690105449743, + "grad_norm": 1.4951281547546387, + "learning_rate": 7.2816791915003895e-06, + "loss": 1.3768, + "step": 1405 + }, + { + "epoch": 0.03279233368415222, + "grad_norm": 1.9413893222808838, + "learning_rate": 7.286861881316404e-06, + "loss": 1.3878, + "step": 1406 + }, + { + "epoch": 0.03281565682333014, + "grad_norm": 1.6263363361358643, + "learning_rate": 7.292044571132418e-06, + "loss": 1.4236, + "step": 1407 + }, + { + "epoch": 0.03283897996250806, + "grad_norm": 2.2151589393615723, + "learning_rate": 7.2972272609484325e-06, + "loss": 1.7296, + "step": 1408 + }, + { + "epoch": 0.032862303101685975, + "grad_norm": 1.3772640228271484, + "learning_rate": 7.302409950764448e-06, + "loss": 1.292, + "step": 1409 + }, + { + "epoch": 0.03288562624086389, + "grad_norm": 1.7607418298721313, + "learning_rate": 7.307592640580462e-06, + "loss": 1.6019, + "step": 1410 + }, + { + "epoch": 0.0329089493800418, + "grad_norm": 1.9470393657684326, + "learning_rate": 7.312775330396476e-06, + "loss": 1.3396, + "step": 1411 + }, + { + "epoch": 0.03293227251921972, + "grad_norm": 2.021190881729126, + "learning_rate": 7.317958020212491e-06, + "loss": 1.6207, + "step": 1412 + }, + { + "epoch": 0.03295559565839764, + "grad_norm": 1.7311667203903198, + "learning_rate": 7.323140710028506e-06, + "loss": 1.6409, + "step": 1413 + }, + { + "epoch": 0.03297891879757556, + "grad_norm": 1.6784627437591553, + "learning_rate": 7.32832339984452e-06, + "loss": 1.5595, + "step": 1414 + }, + { + "epoch": 0.033002241936753475, + "grad_norm": 1.517193078994751, + "learning_rate": 7.333506089660535e-06, + "loss": 1.574, + "step": 1415 + }, + { + "epoch": 0.03302556507593139, + "grad_norm": 1.4831286668777466, + "learning_rate": 7.338688779476549e-06, + "loss": 0.8727, + "step": 1416 + }, + { + "epoch": 0.03304888821510931, + "grad_norm": 1.6477752923965454, + "learning_rate": 7.343871469292564e-06, + "loss": 1.559, + "step": 1417 + }, + { + "epoch": 0.03307221135428723, + "grad_norm": 1.853326678276062, + "learning_rate": 7.3490541591085785e-06, + "loss": 1.8523, + "step": 1418 + }, + { + "epoch": 0.03309553449346515, + "grad_norm": 1.6894885301589966, + "learning_rate": 7.354236848924593e-06, + "loss": 1.5844, + "step": 1419 + }, + { + "epoch": 0.033118857632643064, + "grad_norm": 1.6442736387252808, + "learning_rate": 7.359419538740606e-06, + "loss": 1.986, + "step": 1420 + }, + { + "epoch": 0.03314218077182098, + "grad_norm": 1.787266731262207, + "learning_rate": 7.364602228556621e-06, + "loss": 1.4822, + "step": 1421 + }, + { + "epoch": 0.0331655039109989, + "grad_norm": 2.073798418045044, + "learning_rate": 7.369784918372636e-06, + "loss": 1.637, + "step": 1422 + }, + { + "epoch": 0.03318882705017682, + "grad_norm": 1.3428417444229126, + "learning_rate": 7.37496760818865e-06, + "loss": 1.5598, + "step": 1423 + }, + { + "epoch": 0.033212150189354736, + "grad_norm": 1.5737829208374023, + "learning_rate": 7.380150298004665e-06, + "loss": 1.2274, + "step": 1424 + }, + { + "epoch": 0.033235473328532654, + "grad_norm": 2.1165404319763184, + "learning_rate": 7.385332987820679e-06, + "loss": 1.4134, + "step": 1425 + }, + { + "epoch": 0.03325879646771057, + "grad_norm": 1.5476047992706299, + "learning_rate": 7.390515677636694e-06, + "loss": 1.6364, + "step": 1426 + }, + { + "epoch": 0.03328211960688849, + "grad_norm": 1.6927748918533325, + "learning_rate": 7.3956983674527085e-06, + "loss": 1.6977, + "step": 1427 + }, + { + "epoch": 0.03330544274606641, + "grad_norm": 1.4677228927612305, + "learning_rate": 7.400881057268723e-06, + "loss": 1.4168, + "step": 1428 + }, + { + "epoch": 0.033328765885244326, + "grad_norm": 1.5205353498458862, + "learning_rate": 7.406063747084737e-06, + "loss": 1.2843, + "step": 1429 + }, + { + "epoch": 0.03335208902442224, + "grad_norm": 1.5447300672531128, + "learning_rate": 7.411246436900752e-06, + "loss": 1.5689, + "step": 1430 + }, + { + "epoch": 0.03337541216360016, + "grad_norm": 1.63996160030365, + "learning_rate": 7.416429126716767e-06, + "loss": 1.5884, + "step": 1431 + }, + { + "epoch": 0.03339873530277808, + "grad_norm": 1.452081322669983, + "learning_rate": 7.421611816532781e-06, + "loss": 1.3101, + "step": 1432 + }, + { + "epoch": 0.033422058441956, + "grad_norm": 1.7910422086715698, + "learning_rate": 7.426794506348795e-06, + "loss": 1.4715, + "step": 1433 + }, + { + "epoch": 0.033445381581133915, + "grad_norm": 1.983233094215393, + "learning_rate": 7.43197719616481e-06, + "loss": 1.5482, + "step": 1434 + }, + { + "epoch": 0.03346870472031183, + "grad_norm": 1.767785906791687, + "learning_rate": 7.437159885980825e-06, + "loss": 1.6462, + "step": 1435 + }, + { + "epoch": 0.03349202785948975, + "grad_norm": 1.6161593198776245, + "learning_rate": 7.442342575796839e-06, + "loss": 1.279, + "step": 1436 + }, + { + "epoch": 0.03351535099866767, + "grad_norm": 1.4756333827972412, + "learning_rate": 7.447525265612854e-06, + "loss": 1.5475, + "step": 1437 + }, + { + "epoch": 0.03353867413784559, + "grad_norm": 1.8089308738708496, + "learning_rate": 7.452707955428868e-06, + "loss": 1.8059, + "step": 1438 + }, + { + "epoch": 0.0335619972770235, + "grad_norm": 1.6815400123596191, + "learning_rate": 7.457890645244883e-06, + "loss": 1.7294, + "step": 1439 + }, + { + "epoch": 0.033585320416201415, + "grad_norm": 2.2101638317108154, + "learning_rate": 7.4630733350608975e-06, + "loss": 1.1257, + "step": 1440 + }, + { + "epoch": 0.03360864355537933, + "grad_norm": 1.4447871446609497, + "learning_rate": 7.468256024876912e-06, + "loss": 1.5934, + "step": 1441 + }, + { + "epoch": 0.03363196669455725, + "grad_norm": 1.8209795951843262, + "learning_rate": 7.473438714692926e-06, + "loss": 1.4218, + "step": 1442 + }, + { + "epoch": 0.03365528983373517, + "grad_norm": 1.4553669691085815, + "learning_rate": 7.4786214045089414e-06, + "loss": 1.2059, + "step": 1443 + }, + { + "epoch": 0.03367861297291309, + "grad_norm": 1.7106033563613892, + "learning_rate": 7.483804094324956e-06, + "loss": 1.1671, + "step": 1444 + }, + { + "epoch": 0.033701936112091005, + "grad_norm": 1.3894087076187134, + "learning_rate": 7.48898678414097e-06, + "loss": 1.4522, + "step": 1445 + }, + { + "epoch": 0.03372525925126892, + "grad_norm": 1.1842654943466187, + "learning_rate": 7.4941694739569845e-06, + "loss": 1.4706, + "step": 1446 + }, + { + "epoch": 0.03374858239044684, + "grad_norm": 2.5644612312316895, + "learning_rate": 7.499352163773e-06, + "loss": 1.6062, + "step": 1447 + }, + { + "epoch": 0.03377190552962476, + "grad_norm": 1.5129215717315674, + "learning_rate": 7.504534853589013e-06, + "loss": 1.2178, + "step": 1448 + }, + { + "epoch": 0.033795228668802677, + "grad_norm": 1.7350616455078125, + "learning_rate": 7.5097175434050275e-06, + "loss": 1.7579, + "step": 1449 + }, + { + "epoch": 0.033818551807980594, + "grad_norm": 2.163621187210083, + "learning_rate": 7.514900233221042e-06, + "loss": 1.6504, + "step": 1450 + }, + { + "epoch": 0.03384187494715851, + "grad_norm": 1.946423888206482, + "learning_rate": 7.520082923037056e-06, + "loss": 1.6524, + "step": 1451 + }, + { + "epoch": 0.03386519808633643, + "grad_norm": 1.766641616821289, + "learning_rate": 7.525265612853071e-06, + "loss": 1.1683, + "step": 1452 + }, + { + "epoch": 0.03388852122551435, + "grad_norm": 1.928938627243042, + "learning_rate": 7.530448302669086e-06, + "loss": 1.4919, + "step": 1453 + }, + { + "epoch": 0.033911844364692266, + "grad_norm": 1.5574640035629272, + "learning_rate": 7.5356309924851e-06, + "loss": 1.3775, + "step": 1454 + }, + { + "epoch": 0.033935167503870184, + "grad_norm": 1.6000114679336548, + "learning_rate": 7.5408136823011144e-06, + "loss": 1.8033, + "step": 1455 + }, + { + "epoch": 0.0339584906430481, + "grad_norm": 1.4576321840286255, + "learning_rate": 7.545996372117129e-06, + "loss": 1.6291, + "step": 1456 + }, + { + "epoch": 0.03398181378222602, + "grad_norm": 1.67397940158844, + "learning_rate": 7.551179061933144e-06, + "loss": 1.501, + "step": 1457 + }, + { + "epoch": 0.03400513692140394, + "grad_norm": 1.6351300477981567, + "learning_rate": 7.556361751749158e-06, + "loss": 1.4177, + "step": 1458 + }, + { + "epoch": 0.034028460060581855, + "grad_norm": 1.806840181350708, + "learning_rate": 7.561544441565173e-06, + "loss": 1.2173, + "step": 1459 + }, + { + "epoch": 0.03405178319975977, + "grad_norm": 2.1059956550598145, + "learning_rate": 7.566727131381187e-06, + "loss": 1.2487, + "step": 1460 + }, + { + "epoch": 0.03407510633893769, + "grad_norm": 1.5448449850082397, + "learning_rate": 7.571909821197202e-06, + "loss": 1.4264, + "step": 1461 + }, + { + "epoch": 0.03409842947811561, + "grad_norm": 2.8610997200012207, + "learning_rate": 7.5770925110132166e-06, + "loss": 1.3305, + "step": 1462 + }, + { + "epoch": 0.03412175261729353, + "grad_norm": 1.7565038204193115, + "learning_rate": 7.582275200829231e-06, + "loss": 1.6971, + "step": 1463 + }, + { + "epoch": 0.034145075756471445, + "grad_norm": 1.5691516399383545, + "learning_rate": 7.587457890645245e-06, + "loss": 1.6759, + "step": 1464 + }, + { + "epoch": 0.03416839889564936, + "grad_norm": 1.4603890180587769, + "learning_rate": 7.5926405804612605e-06, + "loss": 1.6264, + "step": 1465 + }, + { + "epoch": 0.03419172203482728, + "grad_norm": 1.5885038375854492, + "learning_rate": 7.597823270277275e-06, + "loss": 1.124, + "step": 1466 + }, + { + "epoch": 0.0342150451740052, + "grad_norm": 1.4058237075805664, + "learning_rate": 7.603005960093289e-06, + "loss": 1.4257, + "step": 1467 + }, + { + "epoch": 0.03423836831318311, + "grad_norm": 1.552217721939087, + "learning_rate": 7.6081886499093035e-06, + "loss": 1.2563, + "step": 1468 + }, + { + "epoch": 0.03426169145236103, + "grad_norm": 2.235629081726074, + "learning_rate": 7.613371339725319e-06, + "loss": 1.8083, + "step": 1469 + }, + { + "epoch": 0.034285014591538945, + "grad_norm": 1.8639624118804932, + "learning_rate": 7.618554029541333e-06, + "loss": 1.5186, + "step": 1470 + }, + { + "epoch": 0.03430833773071686, + "grad_norm": 2.1537373065948486, + "learning_rate": 7.623736719357347e-06, + "loss": 1.3531, + "step": 1471 + }, + { + "epoch": 0.03433166086989478, + "grad_norm": 1.9041272401809692, + "learning_rate": 7.628919409173362e-06, + "loss": 1.4382, + "step": 1472 + }, + { + "epoch": 0.0343549840090727, + "grad_norm": 1.5207409858703613, + "learning_rate": 7.634102098989377e-06, + "loss": 1.349, + "step": 1473 + }, + { + "epoch": 0.03437830714825062, + "grad_norm": 1.446553349494934, + "learning_rate": 7.639284788805391e-06, + "loss": 1.1513, + "step": 1474 + }, + { + "epoch": 0.034401630287428535, + "grad_norm": 1.5411823987960815, + "learning_rate": 7.644467478621404e-06, + "loss": 1.2673, + "step": 1475 + }, + { + "epoch": 0.03442495342660645, + "grad_norm": 1.588210105895996, + "learning_rate": 7.64965016843742e-06, + "loss": 1.3414, + "step": 1476 + }, + { + "epoch": 0.03444827656578437, + "grad_norm": 1.4371896982192993, + "learning_rate": 7.654832858253434e-06, + "loss": 1.3386, + "step": 1477 + }, + { + "epoch": 0.03447159970496229, + "grad_norm": 1.2713488340377808, + "learning_rate": 7.660015548069449e-06, + "loss": 1.3674, + "step": 1478 + }, + { + "epoch": 0.034494922844140206, + "grad_norm": 1.9180690050125122, + "learning_rate": 7.665198237885463e-06, + "loss": 1.3761, + "step": 1479 + }, + { + "epoch": 0.034518245983318124, + "grad_norm": 1.7977988719940186, + "learning_rate": 7.670380927701477e-06, + "loss": 1.5416, + "step": 1480 + }, + { + "epoch": 0.03454156912249604, + "grad_norm": 1.6764715909957886, + "learning_rate": 7.675563617517492e-06, + "loss": 1.9225, + "step": 1481 + }, + { + "epoch": 0.03456489226167396, + "grad_norm": 1.8952007293701172, + "learning_rate": 7.680746307333506e-06, + "loss": 1.545, + "step": 1482 + }, + { + "epoch": 0.03458821540085188, + "grad_norm": 1.2648754119873047, + "learning_rate": 7.68592899714952e-06, + "loss": 1.3556, + "step": 1483 + }, + { + "epoch": 0.034611538540029796, + "grad_norm": 1.5882269144058228, + "learning_rate": 7.691111686965535e-06, + "loss": 1.4676, + "step": 1484 + }, + { + "epoch": 0.034634861679207714, + "grad_norm": 1.4746918678283691, + "learning_rate": 7.69629437678155e-06, + "loss": 1.5869, + "step": 1485 + }, + { + "epoch": 0.03465818481838563, + "grad_norm": 1.6212809085845947, + "learning_rate": 7.701477066597565e-06, + "loss": 1.7056, + "step": 1486 + }, + { + "epoch": 0.03468150795756355, + "grad_norm": 2.3746814727783203, + "learning_rate": 7.70665975641358e-06, + "loss": 1.4726, + "step": 1487 + }, + { + "epoch": 0.03470483109674147, + "grad_norm": 1.5706418752670288, + "learning_rate": 7.711842446229594e-06, + "loss": 1.3319, + "step": 1488 + }, + { + "epoch": 0.034728154235919385, + "grad_norm": 1.6811712980270386, + "learning_rate": 7.717025136045608e-06, + "loss": 1.4744, + "step": 1489 + }, + { + "epoch": 0.0347514773750973, + "grad_norm": 1.839852213859558, + "learning_rate": 7.722207825861623e-06, + "loss": 1.3563, + "step": 1490 + }, + { + "epoch": 0.03477480051427522, + "grad_norm": 1.2929447889328003, + "learning_rate": 7.727390515677637e-06, + "loss": 1.7944, + "step": 1491 + }, + { + "epoch": 0.03479812365345314, + "grad_norm": 1.7659885883331299, + "learning_rate": 7.732573205493651e-06, + "loss": 1.6265, + "step": 1492 + }, + { + "epoch": 0.03482144679263106, + "grad_norm": 1.7670022249221802, + "learning_rate": 7.737755895309667e-06, + "loss": 1.6311, + "step": 1493 + }, + { + "epoch": 0.034844769931808975, + "grad_norm": 1.7348347902297974, + "learning_rate": 7.742938585125682e-06, + "loss": 1.4573, + "step": 1494 + }, + { + "epoch": 0.03486809307098689, + "grad_norm": 1.5826637744903564, + "learning_rate": 7.748121274941696e-06, + "loss": 1.8183, + "step": 1495 + }, + { + "epoch": 0.034891416210164804, + "grad_norm": 1.6276066303253174, + "learning_rate": 7.75330396475771e-06, + "loss": 1.5105, + "step": 1496 + }, + { + "epoch": 0.03491473934934272, + "grad_norm": 1.4175602197647095, + "learning_rate": 7.758486654573725e-06, + "loss": 1.4397, + "step": 1497 + }, + { + "epoch": 0.03493806248852064, + "grad_norm": 1.2575039863586426, + "learning_rate": 7.763669344389739e-06, + "loss": 1.3638, + "step": 1498 + }, + { + "epoch": 0.03496138562769856, + "grad_norm": 1.591441035270691, + "learning_rate": 7.768852034205753e-06, + "loss": 1.2515, + "step": 1499 + }, + { + "epoch": 0.034984708766876475, + "grad_norm": 1.8170280456542969, + "learning_rate": 7.774034724021768e-06, + "loss": 1.6124, + "step": 1500 + }, + { + "epoch": 0.03500803190605439, + "grad_norm": 1.825690507888794, + "learning_rate": 7.779217413837784e-06, + "loss": 1.5076, + "step": 1501 + }, + { + "epoch": 0.03503135504523231, + "grad_norm": 1.61045241355896, + "learning_rate": 7.784400103653798e-06, + "loss": 1.5944, + "step": 1502 + }, + { + "epoch": 0.03505467818441023, + "grad_norm": 2.1213035583496094, + "learning_rate": 7.78958279346981e-06, + "loss": 1.561, + "step": 1503 + }, + { + "epoch": 0.03507800132358815, + "grad_norm": 1.5680464506149292, + "learning_rate": 7.794765483285825e-06, + "loss": 1.1515, + "step": 1504 + }, + { + "epoch": 0.035101324462766065, + "grad_norm": 1.7792956829071045, + "learning_rate": 7.79994817310184e-06, + "loss": 1.7459, + "step": 1505 + }, + { + "epoch": 0.03512464760194398, + "grad_norm": 1.5262699127197266, + "learning_rate": 7.805130862917854e-06, + "loss": 1.4087, + "step": 1506 + }, + { + "epoch": 0.0351479707411219, + "grad_norm": 1.9013603925704956, + "learning_rate": 7.81031355273387e-06, + "loss": 1.745, + "step": 1507 + }, + { + "epoch": 0.03517129388029982, + "grad_norm": 2.1864850521087646, + "learning_rate": 7.815496242549884e-06, + "loss": 1.6892, + "step": 1508 + }, + { + "epoch": 0.035194617019477736, + "grad_norm": 1.6094999313354492, + "learning_rate": 7.820678932365898e-06, + "loss": 1.1677, + "step": 1509 + }, + { + "epoch": 0.035217940158655654, + "grad_norm": 1.6659038066864014, + "learning_rate": 7.825861622181913e-06, + "loss": 1.3676, + "step": 1510 + }, + { + "epoch": 0.03524126329783357, + "grad_norm": 1.5591635704040527, + "learning_rate": 7.831044311997927e-06, + "loss": 1.3353, + "step": 1511 + }, + { + "epoch": 0.03526458643701149, + "grad_norm": 1.6324151754379272, + "learning_rate": 7.836227001813942e-06, + "loss": 1.5816, + "step": 1512 + }, + { + "epoch": 0.03528790957618941, + "grad_norm": 1.8007915019989014, + "learning_rate": 7.841409691629956e-06, + "loss": 1.9207, + "step": 1513 + }, + { + "epoch": 0.035311232715367326, + "grad_norm": 1.6061041355133057, + "learning_rate": 7.84659238144597e-06, + "loss": 1.6949, + "step": 1514 + }, + { + "epoch": 0.035334555854545244, + "grad_norm": 1.5150330066680908, + "learning_rate": 7.851775071261986e-06, + "loss": 1.7975, + "step": 1515 + }, + { + "epoch": 0.03535787899372316, + "grad_norm": 1.7966561317443848, + "learning_rate": 7.856957761078e-06, + "loss": 1.3558, + "step": 1516 + }, + { + "epoch": 0.03538120213290108, + "grad_norm": 1.6751410961151123, + "learning_rate": 7.862140450894015e-06, + "loss": 1.3387, + "step": 1517 + }, + { + "epoch": 0.035404525272079, + "grad_norm": 1.7746779918670654, + "learning_rate": 7.86732314071003e-06, + "loss": 1.8068, + "step": 1518 + }, + { + "epoch": 0.035427848411256915, + "grad_norm": 1.4943839311599731, + "learning_rate": 7.872505830526044e-06, + "loss": 1.4922, + "step": 1519 + }, + { + "epoch": 0.03545117155043483, + "grad_norm": 1.3683398962020874, + "learning_rate": 7.877688520342058e-06, + "loss": 1.382, + "step": 1520 + }, + { + "epoch": 0.03547449468961275, + "grad_norm": 1.6939599514007568, + "learning_rate": 7.882871210158072e-06, + "loss": 1.6179, + "step": 1521 + }, + { + "epoch": 0.03549781782879067, + "grad_norm": 1.4292916059494019, + "learning_rate": 7.888053899974087e-06, + "loss": 1.4422, + "step": 1522 + }, + { + "epoch": 0.03552114096796859, + "grad_norm": 1.96234929561615, + "learning_rate": 7.893236589790103e-06, + "loss": 1.1728, + "step": 1523 + }, + { + "epoch": 0.035544464107146505, + "grad_norm": 1.8289707899093628, + "learning_rate": 7.898419279606117e-06, + "loss": 1.4281, + "step": 1524 + }, + { + "epoch": 0.035567787246324416, + "grad_norm": 1.563638687133789, + "learning_rate": 7.903601969422131e-06, + "loss": 1.441, + "step": 1525 + }, + { + "epoch": 0.035591110385502334, + "grad_norm": 1.7753417491912842, + "learning_rate": 7.908784659238146e-06, + "loss": 1.5371, + "step": 1526 + }, + { + "epoch": 0.03561443352468025, + "grad_norm": 1.4442288875579834, + "learning_rate": 7.91396734905416e-06, + "loss": 1.21, + "step": 1527 + }, + { + "epoch": 0.03563775666385817, + "grad_norm": 1.5175955295562744, + "learning_rate": 7.919150038870174e-06, + "loss": 1.2075, + "step": 1528 + }, + { + "epoch": 0.03566107980303609, + "grad_norm": 1.6752229928970337, + "learning_rate": 7.924332728686189e-06, + "loss": 1.2919, + "step": 1529 + }, + { + "epoch": 0.035684402942214005, + "grad_norm": 1.7506253719329834, + "learning_rate": 7.929515418502203e-06, + "loss": 1.5369, + "step": 1530 + }, + { + "epoch": 0.03570772608139192, + "grad_norm": 1.9442663192749023, + "learning_rate": 7.934698108318218e-06, + "loss": 1.6756, + "step": 1531 + }, + { + "epoch": 0.03573104922056984, + "grad_norm": 1.658495545387268, + "learning_rate": 7.939880798134232e-06, + "loss": 1.2362, + "step": 1532 + }, + { + "epoch": 0.03575437235974776, + "grad_norm": 1.2289533615112305, + "learning_rate": 7.945063487950246e-06, + "loss": 1.5051, + "step": 1533 + }, + { + "epoch": 0.03577769549892568, + "grad_norm": 1.5502135753631592, + "learning_rate": 7.95024617776626e-06, + "loss": 1.3702, + "step": 1534 + }, + { + "epoch": 0.035801018638103595, + "grad_norm": 1.8727954626083374, + "learning_rate": 7.955428867582275e-06, + "loss": 1.417, + "step": 1535 + }, + { + "epoch": 0.03582434177728151, + "grad_norm": 1.1890602111816406, + "learning_rate": 7.96061155739829e-06, + "loss": 1.1737, + "step": 1536 + }, + { + "epoch": 0.03584766491645943, + "grad_norm": 5.72725772857666, + "learning_rate": 7.965794247214305e-06, + "loss": 1.3097, + "step": 1537 + }, + { + "epoch": 0.03587098805563735, + "grad_norm": 1.2847952842712402, + "learning_rate": 7.97097693703032e-06, + "loss": 1.456, + "step": 1538 + }, + { + "epoch": 0.035894311194815266, + "grad_norm": 2.3652467727661133, + "learning_rate": 7.976159626846334e-06, + "loss": 1.7498, + "step": 1539 + }, + { + "epoch": 0.035917634333993184, + "grad_norm": 2.2748360633850098, + "learning_rate": 7.981342316662348e-06, + "loss": 1.4181, + "step": 1540 + }, + { + "epoch": 0.0359409574731711, + "grad_norm": 1.9288114309310913, + "learning_rate": 7.986525006478363e-06, + "loss": 1.4505, + "step": 1541 + }, + { + "epoch": 0.03596428061234902, + "grad_norm": 1.9735311269760132, + "learning_rate": 7.991707696294377e-06, + "loss": 1.5196, + "step": 1542 + }, + { + "epoch": 0.03598760375152694, + "grad_norm": 1.5026898384094238, + "learning_rate": 7.996890386110391e-06, + "loss": 1.2868, + "step": 1543 + }, + { + "epoch": 0.036010926890704856, + "grad_norm": 1.4773675203323364, + "learning_rate": 8.002073075926406e-06, + "loss": 1.3777, + "step": 1544 + }, + { + "epoch": 0.036034250029882774, + "grad_norm": 1.7095143795013428, + "learning_rate": 8.007255765742422e-06, + "loss": 1.2692, + "step": 1545 + }, + { + "epoch": 0.03605757316906069, + "grad_norm": 1.7218233346939087, + "learning_rate": 8.012438455558436e-06, + "loss": 1.4015, + "step": 1546 + }, + { + "epoch": 0.03608089630823861, + "grad_norm": 1.5240681171417236, + "learning_rate": 8.01762114537445e-06, + "loss": 1.5267, + "step": 1547 + }, + { + "epoch": 0.03610421944741653, + "grad_norm": 1.9092682600021362, + "learning_rate": 8.022803835190465e-06, + "loss": 1.2564, + "step": 1548 + }, + { + "epoch": 0.036127542586594445, + "grad_norm": 1.844650149345398, + "learning_rate": 8.027986525006479e-06, + "loss": 1.5158, + "step": 1549 + }, + { + "epoch": 0.03615086572577236, + "grad_norm": 1.5689501762390137, + "learning_rate": 8.033169214822493e-06, + "loss": 1.5708, + "step": 1550 + }, + { + "epoch": 0.03617418886495028, + "grad_norm": 2.210259437561035, + "learning_rate": 8.038351904638508e-06, + "loss": 1.5915, + "step": 1551 + }, + { + "epoch": 0.0361975120041282, + "grad_norm": 1.4000816345214844, + "learning_rate": 8.043534594454522e-06, + "loss": 1.2189, + "step": 1552 + }, + { + "epoch": 0.03622083514330611, + "grad_norm": 1.4790806770324707, + "learning_rate": 8.048717284270538e-06, + "loss": 1.3637, + "step": 1553 + }, + { + "epoch": 0.03624415828248403, + "grad_norm": 1.9432685375213623, + "learning_rate": 8.053899974086553e-06, + "loss": 1.4459, + "step": 1554 + }, + { + "epoch": 0.036267481421661946, + "grad_norm": 1.9427974224090576, + "learning_rate": 8.059082663902567e-06, + "loss": 1.8405, + "step": 1555 + }, + { + "epoch": 0.036290804560839864, + "grad_norm": 1.6169490814208984, + "learning_rate": 8.064265353718581e-06, + "loss": 1.5894, + "step": 1556 + }, + { + "epoch": 0.03631412770001778, + "grad_norm": 2.189110517501831, + "learning_rate": 8.069448043534596e-06, + "loss": 1.4458, + "step": 1557 + }, + { + "epoch": 0.0363374508391957, + "grad_norm": 1.6950788497924805, + "learning_rate": 8.074630733350608e-06, + "loss": 1.2485, + "step": 1558 + }, + { + "epoch": 0.03636077397837362, + "grad_norm": 1.5580222606658936, + "learning_rate": 8.079813423166624e-06, + "loss": 1.3971, + "step": 1559 + }, + { + "epoch": 0.036384097117551535, + "grad_norm": 1.68899405002594, + "learning_rate": 8.084996112982639e-06, + "loss": 1.5722, + "step": 1560 + }, + { + "epoch": 0.03640742025672945, + "grad_norm": 1.6812056303024292, + "learning_rate": 8.090178802798653e-06, + "loss": 1.8336, + "step": 1561 + }, + { + "epoch": 0.03643074339590737, + "grad_norm": 2.962195634841919, + "learning_rate": 8.095361492614667e-06, + "loss": 1.7488, + "step": 1562 + }, + { + "epoch": 0.03645406653508529, + "grad_norm": 1.6132487058639526, + "learning_rate": 8.100544182430682e-06, + "loss": 1.9727, + "step": 1563 + }, + { + "epoch": 0.03647738967426321, + "grad_norm": 1.6288578510284424, + "learning_rate": 8.105726872246696e-06, + "loss": 1.6962, + "step": 1564 + }, + { + "epoch": 0.036500712813441125, + "grad_norm": 1.5894676446914673, + "learning_rate": 8.11090956206271e-06, + "loss": 1.7313, + "step": 1565 + }, + { + "epoch": 0.03652403595261904, + "grad_norm": 1.702314019203186, + "learning_rate": 8.116092251878725e-06, + "loss": 1.5682, + "step": 1566 + }, + { + "epoch": 0.03654735909179696, + "grad_norm": 2.3464395999908447, + "learning_rate": 8.12127494169474e-06, + "loss": 1.1367, + "step": 1567 + }, + { + "epoch": 0.03657068223097488, + "grad_norm": 1.3930420875549316, + "learning_rate": 8.126457631510755e-06, + "loss": 1.2127, + "step": 1568 + }, + { + "epoch": 0.036594005370152796, + "grad_norm": 1.964519739151001, + "learning_rate": 8.13164032132677e-06, + "loss": 1.5458, + "step": 1569 + }, + { + "epoch": 0.036617328509330714, + "grad_norm": 1.7511687278747559, + "learning_rate": 8.136823011142784e-06, + "loss": 1.4957, + "step": 1570 + }, + { + "epoch": 0.03664065164850863, + "grad_norm": 1.403041958808899, + "learning_rate": 8.142005700958798e-06, + "loss": 1.5422, + "step": 1571 + }, + { + "epoch": 0.03666397478768655, + "grad_norm": 2.368617534637451, + "learning_rate": 8.147188390774813e-06, + "loss": 1.2203, + "step": 1572 + }, + { + "epoch": 0.03668729792686447, + "grad_norm": 1.7351584434509277, + "learning_rate": 8.152371080590827e-06, + "loss": 1.5534, + "step": 1573 + }, + { + "epoch": 0.036710621066042386, + "grad_norm": 1.7059663534164429, + "learning_rate": 8.157553770406841e-06, + "loss": 1.5554, + "step": 1574 + }, + { + "epoch": 0.036733944205220304, + "grad_norm": 1.9748015403747559, + "learning_rate": 8.162736460222857e-06, + "loss": 1.3837, + "step": 1575 + }, + { + "epoch": 0.03675726734439822, + "grad_norm": 1.7517926692962646, + "learning_rate": 8.167919150038872e-06, + "loss": 1.5008, + "step": 1576 + }, + { + "epoch": 0.03678059048357614, + "grad_norm": 2.074340343475342, + "learning_rate": 8.173101839854886e-06, + "loss": 1.1944, + "step": 1577 + }, + { + "epoch": 0.03680391362275406, + "grad_norm": 1.7943975925445557, + "learning_rate": 8.1782845296709e-06, + "loss": 1.5017, + "step": 1578 + }, + { + "epoch": 0.036827236761931975, + "grad_norm": 1.7202725410461426, + "learning_rate": 8.183467219486915e-06, + "loss": 1.3468, + "step": 1579 + }, + { + "epoch": 0.03685055990110989, + "grad_norm": 2.03446364402771, + "learning_rate": 8.188649909302929e-06, + "loss": 1.8081, + "step": 1580 + }, + { + "epoch": 0.03687388304028781, + "grad_norm": 1.8767874240875244, + "learning_rate": 8.193832599118943e-06, + "loss": 1.3877, + "step": 1581 + }, + { + "epoch": 0.03689720617946572, + "grad_norm": 1.4143779277801514, + "learning_rate": 8.199015288934958e-06, + "loss": 1.5551, + "step": 1582 + }, + { + "epoch": 0.03692052931864364, + "grad_norm": 1.4130569696426392, + "learning_rate": 8.204197978750974e-06, + "loss": 1.3058, + "step": 1583 + }, + { + "epoch": 0.03694385245782156, + "grad_norm": 1.4558956623077393, + "learning_rate": 8.209380668566988e-06, + "loss": 1.4228, + "step": 1584 + }, + { + "epoch": 0.036967175596999476, + "grad_norm": 2.6582729816436768, + "learning_rate": 8.214563358383e-06, + "loss": 1.5081, + "step": 1585 + }, + { + "epoch": 0.036990498736177393, + "grad_norm": 1.4754345417022705, + "learning_rate": 8.219746048199015e-06, + "loss": 1.5688, + "step": 1586 + }, + { + "epoch": 0.03701382187535531, + "grad_norm": 1.5351654291152954, + "learning_rate": 8.22492873801503e-06, + "loss": 1.8144, + "step": 1587 + }, + { + "epoch": 0.03703714501453323, + "grad_norm": 1.6197818517684937, + "learning_rate": 8.230111427831044e-06, + "loss": 1.5846, + "step": 1588 + }, + { + "epoch": 0.03706046815371115, + "grad_norm": 1.8108611106872559, + "learning_rate": 8.23529411764706e-06, + "loss": 1.3416, + "step": 1589 + }, + { + "epoch": 0.037083791292889065, + "grad_norm": 1.6245759725570679, + "learning_rate": 8.240476807463074e-06, + "loss": 1.4319, + "step": 1590 + }, + { + "epoch": 0.03710711443206698, + "grad_norm": 1.9573677778244019, + "learning_rate": 8.245659497279088e-06, + "loss": 1.4856, + "step": 1591 + }, + { + "epoch": 0.0371304375712449, + "grad_norm": 2.195033550262451, + "learning_rate": 8.250842187095103e-06, + "loss": 1.4553, + "step": 1592 + }, + { + "epoch": 0.03715376071042282, + "grad_norm": 1.7342851161956787, + "learning_rate": 8.256024876911117e-06, + "loss": 1.4633, + "step": 1593 + }, + { + "epoch": 0.03717708384960074, + "grad_norm": 1.499495506286621, + "learning_rate": 8.261207566727132e-06, + "loss": 1.6055, + "step": 1594 + }, + { + "epoch": 0.037200406988778655, + "grad_norm": 1.4192696809768677, + "learning_rate": 8.266390256543146e-06, + "loss": 1.3659, + "step": 1595 + }, + { + "epoch": 0.03722373012795657, + "grad_norm": 1.8910040855407715, + "learning_rate": 8.27157294635916e-06, + "loss": 1.703, + "step": 1596 + }, + { + "epoch": 0.03724705326713449, + "grad_norm": 1.4962915182113647, + "learning_rate": 8.276755636175176e-06, + "loss": 1.3212, + "step": 1597 + }, + { + "epoch": 0.03727037640631241, + "grad_norm": 2.1940252780914307, + "learning_rate": 8.28193832599119e-06, + "loss": 1.8816, + "step": 1598 + }, + { + "epoch": 0.037293699545490326, + "grad_norm": 1.415831208229065, + "learning_rate": 8.287121015807205e-06, + "loss": 1.229, + "step": 1599 + }, + { + "epoch": 0.037317022684668244, + "grad_norm": 1.5565876960754395, + "learning_rate": 8.29230370562322e-06, + "loss": 1.7503, + "step": 1600 + }, + { + "epoch": 0.03734034582384616, + "grad_norm": 2.6450204849243164, + "learning_rate": 8.297486395439234e-06, + "loss": 1.3618, + "step": 1601 + }, + { + "epoch": 0.03736366896302408, + "grad_norm": 1.5824869871139526, + "learning_rate": 8.302669085255248e-06, + "loss": 1.3587, + "step": 1602 + }, + { + "epoch": 0.037386992102202, + "grad_norm": 1.6635199785232544, + "learning_rate": 8.307851775071262e-06, + "loss": 1.7323, + "step": 1603 + }, + { + "epoch": 0.037410315241379916, + "grad_norm": 1.5391467809677124, + "learning_rate": 8.313034464887277e-06, + "loss": 1.7489, + "step": 1604 + }, + { + "epoch": 0.037433638380557833, + "grad_norm": 2.136975049972534, + "learning_rate": 8.318217154703293e-06, + "loss": 1.4696, + "step": 1605 + }, + { + "epoch": 0.03745696151973575, + "grad_norm": 1.4561282396316528, + "learning_rate": 8.323399844519307e-06, + "loss": 1.746, + "step": 1606 + }, + { + "epoch": 0.03748028465891367, + "grad_norm": 1.323926329612732, + "learning_rate": 8.328582534335321e-06, + "loss": 1.3654, + "step": 1607 + }, + { + "epoch": 0.03750360779809159, + "grad_norm": 1.6495275497436523, + "learning_rate": 8.333765224151336e-06, + "loss": 1.6208, + "step": 1608 + }, + { + "epoch": 0.037526930937269505, + "grad_norm": 1.4379764795303345, + "learning_rate": 8.33894791396735e-06, + "loss": 1.4988, + "step": 1609 + }, + { + "epoch": 0.037550254076447416, + "grad_norm": 1.676405668258667, + "learning_rate": 8.344130603783364e-06, + "loss": 1.5563, + "step": 1610 + }, + { + "epoch": 0.037573577215625334, + "grad_norm": 1.0886626243591309, + "learning_rate": 8.349313293599379e-06, + "loss": 1.2886, + "step": 1611 + }, + { + "epoch": 0.03759690035480325, + "grad_norm": 1.5499573945999146, + "learning_rate": 8.354495983415393e-06, + "loss": 1.6758, + "step": 1612 + }, + { + "epoch": 0.03762022349398117, + "grad_norm": 1.2256261110305786, + "learning_rate": 8.359678673231408e-06, + "loss": 1.1831, + "step": 1613 + }, + { + "epoch": 0.03764354663315909, + "grad_norm": 2.0278513431549072, + "learning_rate": 8.364861363047422e-06, + "loss": 1.5379, + "step": 1614 + }, + { + "epoch": 0.037666869772337006, + "grad_norm": 1.6582108736038208, + "learning_rate": 8.370044052863436e-06, + "loss": 1.4499, + "step": 1615 + }, + { + "epoch": 0.03769019291151492, + "grad_norm": 2.517474412918091, + "learning_rate": 8.37522674267945e-06, + "loss": 1.3365, + "step": 1616 + }, + { + "epoch": 0.03771351605069284, + "grad_norm": 1.651391863822937, + "learning_rate": 8.380409432495465e-06, + "loss": 1.5475, + "step": 1617 + }, + { + "epoch": 0.03773683918987076, + "grad_norm": 1.9716179370880127, + "learning_rate": 8.38559212231148e-06, + "loss": 1.4809, + "step": 1618 + }, + { + "epoch": 0.03776016232904868, + "grad_norm": 2.0555307865142822, + "learning_rate": 8.390774812127495e-06, + "loss": 1.7496, + "step": 1619 + }, + { + "epoch": 0.037783485468226595, + "grad_norm": 1.5695487260818481, + "learning_rate": 8.39595750194351e-06, + "loss": 1.3654, + "step": 1620 + }, + { + "epoch": 0.03780680860740451, + "grad_norm": 1.4122220277786255, + "learning_rate": 8.401140191759524e-06, + "loss": 1.7411, + "step": 1621 + }, + { + "epoch": 0.03783013174658243, + "grad_norm": 1.4024474620819092, + "learning_rate": 8.406322881575538e-06, + "loss": 1.3807, + "step": 1622 + }, + { + "epoch": 0.03785345488576035, + "grad_norm": 1.932897686958313, + "learning_rate": 8.411505571391553e-06, + "loss": 1.6327, + "step": 1623 + }, + { + "epoch": 0.03787677802493827, + "grad_norm": 1.3100526332855225, + "learning_rate": 8.416688261207567e-06, + "loss": 1.0531, + "step": 1624 + }, + { + "epoch": 0.037900101164116184, + "grad_norm": 1.5780110359191895, + "learning_rate": 8.421870951023581e-06, + "loss": 1.3187, + "step": 1625 + }, + { + "epoch": 0.0379234243032941, + "grad_norm": 1.905220866203308, + "learning_rate": 8.427053640839596e-06, + "loss": 1.5322, + "step": 1626 + }, + { + "epoch": 0.03794674744247202, + "grad_norm": 1.4416756629943848, + "learning_rate": 8.432236330655612e-06, + "loss": 1.3633, + "step": 1627 + }, + { + "epoch": 0.03797007058164994, + "grad_norm": 1.720937728881836, + "learning_rate": 8.437419020471626e-06, + "loss": 1.4178, + "step": 1628 + }, + { + "epoch": 0.037993393720827856, + "grad_norm": 1.891658902168274, + "learning_rate": 8.44260171028764e-06, + "loss": 1.5383, + "step": 1629 + }, + { + "epoch": 0.038016716860005774, + "grad_norm": 2.179572105407715, + "learning_rate": 8.447784400103655e-06, + "loss": 1.3399, + "step": 1630 + }, + { + "epoch": 0.03804003999918369, + "grad_norm": 1.4478271007537842, + "learning_rate": 8.452967089919669e-06, + "loss": 1.4122, + "step": 1631 + }, + { + "epoch": 0.03806336313836161, + "grad_norm": 1.4049443006515503, + "learning_rate": 8.458149779735683e-06, + "loss": 1.5164, + "step": 1632 + }, + { + "epoch": 0.03808668627753953, + "grad_norm": 2.1146810054779053, + "learning_rate": 8.463332469551698e-06, + "loss": 1.5005, + "step": 1633 + }, + { + "epoch": 0.038110009416717446, + "grad_norm": 1.8528714179992676, + "learning_rate": 8.468515159367712e-06, + "loss": 1.4598, + "step": 1634 + }, + { + "epoch": 0.03813333255589536, + "grad_norm": 2.274590492248535, + "learning_rate": 8.473697849183728e-06, + "loss": 1.4343, + "step": 1635 + }, + { + "epoch": 0.03815665569507328, + "grad_norm": 1.837266445159912, + "learning_rate": 8.478880538999743e-06, + "loss": 1.6039, + "step": 1636 + }, + { + "epoch": 0.0381799788342512, + "grad_norm": 1.735687494277954, + "learning_rate": 8.484063228815757e-06, + "loss": 1.3623, + "step": 1637 + }, + { + "epoch": 0.03820330197342912, + "grad_norm": 1.8133695125579834, + "learning_rate": 8.489245918631771e-06, + "loss": 1.2941, + "step": 1638 + }, + { + "epoch": 0.03822662511260703, + "grad_norm": 1.9450503587722778, + "learning_rate": 8.494428608447786e-06, + "loss": 1.4706, + "step": 1639 + }, + { + "epoch": 0.038249948251784946, + "grad_norm": 1.6004278659820557, + "learning_rate": 8.4996112982638e-06, + "loss": 1.4726, + "step": 1640 + }, + { + "epoch": 0.038273271390962864, + "grad_norm": 1.7052674293518066, + "learning_rate": 8.504793988079814e-06, + "loss": 1.5521, + "step": 1641 + }, + { + "epoch": 0.03829659453014078, + "grad_norm": 1.3694720268249512, + "learning_rate": 8.509976677895829e-06, + "loss": 1.1333, + "step": 1642 + }, + { + "epoch": 0.0383199176693187, + "grad_norm": 1.7958831787109375, + "learning_rate": 8.515159367711843e-06, + "loss": 1.6234, + "step": 1643 + }, + { + "epoch": 0.03834324080849662, + "grad_norm": 1.7349238395690918, + "learning_rate": 8.520342057527857e-06, + "loss": 1.3697, + "step": 1644 + }, + { + "epoch": 0.038366563947674535, + "grad_norm": 1.5960413217544556, + "learning_rate": 8.525524747343872e-06, + "loss": 1.4687, + "step": 1645 + }, + { + "epoch": 0.03838988708685245, + "grad_norm": 2.78328800201416, + "learning_rate": 8.530707437159886e-06, + "loss": 1.462, + "step": 1646 + }, + { + "epoch": 0.03841321022603037, + "grad_norm": 1.310705304145813, + "learning_rate": 8.5358901269759e-06, + "loss": 1.347, + "step": 1647 + }, + { + "epoch": 0.03843653336520829, + "grad_norm": 1.554968237876892, + "learning_rate": 8.541072816791915e-06, + "loss": 1.41, + "step": 1648 + }, + { + "epoch": 0.03845985650438621, + "grad_norm": 2.0181522369384766, + "learning_rate": 8.54625550660793e-06, + "loss": 1.3945, + "step": 1649 + }, + { + "epoch": 0.038483179643564125, + "grad_norm": 1.816375494003296, + "learning_rate": 8.551438196423945e-06, + "loss": 1.6109, + "step": 1650 + }, + { + "epoch": 0.03850650278274204, + "grad_norm": 2.1661388874053955, + "learning_rate": 8.55662088623996e-06, + "loss": 1.8344, + "step": 1651 + }, + { + "epoch": 0.03852982592191996, + "grad_norm": 1.9306049346923828, + "learning_rate": 8.561803576055974e-06, + "loss": 1.2227, + "step": 1652 + }, + { + "epoch": 0.03855314906109788, + "grad_norm": 1.3145751953125, + "learning_rate": 8.566986265871988e-06, + "loss": 1.3881, + "step": 1653 + }, + { + "epoch": 0.038576472200275796, + "grad_norm": 1.6416202783584595, + "learning_rate": 8.572168955688003e-06, + "loss": 1.2782, + "step": 1654 + }, + { + "epoch": 0.038599795339453714, + "grad_norm": 1.3195691108703613, + "learning_rate": 8.577351645504017e-06, + "loss": 1.5958, + "step": 1655 + }, + { + "epoch": 0.03862311847863163, + "grad_norm": 1.786651372909546, + "learning_rate": 8.582534335320031e-06, + "loss": 1.6379, + "step": 1656 + }, + { + "epoch": 0.03864644161780955, + "grad_norm": 1.685196876525879, + "learning_rate": 8.587717025136047e-06, + "loss": 1.2548, + "step": 1657 + }, + { + "epoch": 0.03866976475698747, + "grad_norm": 2.0508875846862793, + "learning_rate": 8.592899714952062e-06, + "loss": 1.635, + "step": 1658 + }, + { + "epoch": 0.038693087896165386, + "grad_norm": 1.7226320505142212, + "learning_rate": 8.598082404768076e-06, + "loss": 1.4694, + "step": 1659 + }, + { + "epoch": 0.038716411035343304, + "grad_norm": 1.5333112478256226, + "learning_rate": 8.60326509458409e-06, + "loss": 1.4825, + "step": 1660 + }, + { + "epoch": 0.03873973417452122, + "grad_norm": 1.4121674299240112, + "learning_rate": 8.608447784400105e-06, + "loss": 1.2056, + "step": 1661 + }, + { + "epoch": 0.03876305731369914, + "grad_norm": 1.6394184827804565, + "learning_rate": 8.613630474216119e-06, + "loss": 1.5131, + "step": 1662 + }, + { + "epoch": 0.03878638045287706, + "grad_norm": 2.2525839805603027, + "learning_rate": 8.618813164032133e-06, + "loss": 1.4413, + "step": 1663 + }, + { + "epoch": 0.038809703592054975, + "grad_norm": 1.6599324941635132, + "learning_rate": 8.623995853848148e-06, + "loss": 1.1568, + "step": 1664 + }, + { + "epoch": 0.03883302673123289, + "grad_norm": 1.930284857749939, + "learning_rate": 8.629178543664162e-06, + "loss": 1.2182, + "step": 1665 + }, + { + "epoch": 0.03885634987041081, + "grad_norm": 1.366219401359558, + "learning_rate": 8.634361233480178e-06, + "loss": 1.6951, + "step": 1666 + }, + { + "epoch": 0.03887967300958872, + "grad_norm": 1.8555302619934082, + "learning_rate": 8.639543923296192e-06, + "loss": 1.4508, + "step": 1667 + }, + { + "epoch": 0.03890299614876664, + "grad_norm": 2.110704183578491, + "learning_rate": 8.644726613112205e-06, + "loss": 1.5057, + "step": 1668 + }, + { + "epoch": 0.03892631928794456, + "grad_norm": 1.4422646760940552, + "learning_rate": 8.64990930292822e-06, + "loss": 1.5628, + "step": 1669 + }, + { + "epoch": 0.038949642427122476, + "grad_norm": 1.8097025156021118, + "learning_rate": 8.655091992744234e-06, + "loss": 1.5336, + "step": 1670 + }, + { + "epoch": 0.038972965566300394, + "grad_norm": 1.5321156978607178, + "learning_rate": 8.66027468256025e-06, + "loss": 1.4985, + "step": 1671 + }, + { + "epoch": 0.03899628870547831, + "grad_norm": 1.715100884437561, + "learning_rate": 8.665457372376264e-06, + "loss": 1.5365, + "step": 1672 + }, + { + "epoch": 0.03901961184465623, + "grad_norm": 1.7432835102081299, + "learning_rate": 8.670640062192278e-06, + "loss": 1.5822, + "step": 1673 + }, + { + "epoch": 0.03904293498383415, + "grad_norm": 1.7451759576797485, + "learning_rate": 8.675822752008293e-06, + "loss": 1.6363, + "step": 1674 + }, + { + "epoch": 0.039066258123012065, + "grad_norm": 1.6405068635940552, + "learning_rate": 8.681005441824307e-06, + "loss": 1.6819, + "step": 1675 + }, + { + "epoch": 0.03908958126218998, + "grad_norm": 1.7980347871780396, + "learning_rate": 8.686188131640322e-06, + "loss": 1.5362, + "step": 1676 + }, + { + "epoch": 0.0391129044013679, + "grad_norm": 1.6365665197372437, + "learning_rate": 8.691370821456336e-06, + "loss": 2.0277, + "step": 1677 + }, + { + "epoch": 0.03913622754054582, + "grad_norm": 1.9490535259246826, + "learning_rate": 8.69655351127235e-06, + "loss": 1.5076, + "step": 1678 + }, + { + "epoch": 0.03915955067972374, + "grad_norm": 1.4164410829544067, + "learning_rate": 8.701736201088366e-06, + "loss": 1.8005, + "step": 1679 + }, + { + "epoch": 0.039182873818901655, + "grad_norm": 1.4707103967666626, + "learning_rate": 8.70691889090438e-06, + "loss": 1.353, + "step": 1680 + }, + { + "epoch": 0.03920619695807957, + "grad_norm": 1.7562110424041748, + "learning_rate": 8.712101580720395e-06, + "loss": 1.5621, + "step": 1681 + }, + { + "epoch": 0.03922952009725749, + "grad_norm": 2.0748794078826904, + "learning_rate": 8.71728427053641e-06, + "loss": 1.4923, + "step": 1682 + }, + { + "epoch": 0.03925284323643541, + "grad_norm": 2.031003475189209, + "learning_rate": 8.722466960352424e-06, + "loss": 1.1706, + "step": 1683 + }, + { + "epoch": 0.039276166375613326, + "grad_norm": 2.4340038299560547, + "learning_rate": 8.727649650168438e-06, + "loss": 1.3371, + "step": 1684 + }, + { + "epoch": 0.039299489514791244, + "grad_norm": 2.129331111907959, + "learning_rate": 8.732832339984452e-06, + "loss": 1.558, + "step": 1685 + }, + { + "epoch": 0.03932281265396916, + "grad_norm": 1.907139778137207, + "learning_rate": 8.738015029800467e-06, + "loss": 1.6016, + "step": 1686 + }, + { + "epoch": 0.03934613579314708, + "grad_norm": 1.8079878091812134, + "learning_rate": 8.743197719616483e-06, + "loss": 1.4186, + "step": 1687 + }, + { + "epoch": 0.039369458932325, + "grad_norm": 1.9196524620056152, + "learning_rate": 8.748380409432497e-06, + "loss": 1.6435, + "step": 1688 + }, + { + "epoch": 0.039392782071502916, + "grad_norm": 1.5702369213104248, + "learning_rate": 8.753563099248511e-06, + "loss": 1.6279, + "step": 1689 + }, + { + "epoch": 0.039416105210680834, + "grad_norm": 1.8079639673233032, + "learning_rate": 8.758745789064526e-06, + "loss": 1.4299, + "step": 1690 + }, + { + "epoch": 0.03943942834985875, + "grad_norm": 1.5084450244903564, + "learning_rate": 8.76392847888054e-06, + "loss": 1.6051, + "step": 1691 + }, + { + "epoch": 0.03946275148903667, + "grad_norm": 1.8773257732391357, + "learning_rate": 8.769111168696554e-06, + "loss": 1.2258, + "step": 1692 + }, + { + "epoch": 0.03948607462821459, + "grad_norm": 1.662649154663086, + "learning_rate": 8.774293858512569e-06, + "loss": 1.5057, + "step": 1693 + }, + { + "epoch": 0.039509397767392505, + "grad_norm": 1.7742561101913452, + "learning_rate": 8.779476548328583e-06, + "loss": 1.5083, + "step": 1694 + }, + { + "epoch": 0.03953272090657042, + "grad_norm": 1.6094675064086914, + "learning_rate": 8.784659238144598e-06, + "loss": 1.4416, + "step": 1695 + }, + { + "epoch": 0.039556044045748334, + "grad_norm": 1.7892067432403564, + "learning_rate": 8.789841927960612e-06, + "loss": 1.6939, + "step": 1696 + }, + { + "epoch": 0.03957936718492625, + "grad_norm": 1.4669241905212402, + "learning_rate": 8.795024617776626e-06, + "loss": 1.3218, + "step": 1697 + }, + { + "epoch": 0.03960269032410417, + "grad_norm": 1.6289660930633545, + "learning_rate": 8.80020730759264e-06, + "loss": 1.4026, + "step": 1698 + }, + { + "epoch": 0.03962601346328209, + "grad_norm": 1.4103940725326538, + "learning_rate": 8.805389997408655e-06, + "loss": 1.5594, + "step": 1699 + }, + { + "epoch": 0.039649336602460006, + "grad_norm": 1.8094227313995361, + "learning_rate": 8.81057268722467e-06, + "loss": 1.4749, + "step": 1700 + }, + { + "epoch": 0.039672659741637924, + "grad_norm": 1.9171851873397827, + "learning_rate": 8.815755377040685e-06, + "loss": 1.5853, + "step": 1701 + }, + { + "epoch": 0.03969598288081584, + "grad_norm": 1.7482846975326538, + "learning_rate": 8.8209380668567e-06, + "loss": 1.8572, + "step": 1702 + }, + { + "epoch": 0.03971930601999376, + "grad_norm": 1.494166374206543, + "learning_rate": 8.826120756672714e-06, + "loss": 1.4618, + "step": 1703 + }, + { + "epoch": 0.03974262915917168, + "grad_norm": 1.8293770551681519, + "learning_rate": 8.831303446488728e-06, + "loss": 1.278, + "step": 1704 + }, + { + "epoch": 0.039765952298349595, + "grad_norm": 1.7367064952850342, + "learning_rate": 8.836486136304743e-06, + "loss": 1.65, + "step": 1705 + }, + { + "epoch": 0.03978927543752751, + "grad_norm": 1.783642292022705, + "learning_rate": 8.841668826120757e-06, + "loss": 1.5325, + "step": 1706 + }, + { + "epoch": 0.03981259857670543, + "grad_norm": 1.5297502279281616, + "learning_rate": 8.846851515936771e-06, + "loss": 1.4132, + "step": 1707 + }, + { + "epoch": 0.03983592171588335, + "grad_norm": 1.9751566648483276, + "learning_rate": 8.852034205752786e-06, + "loss": 1.4317, + "step": 1708 + }, + { + "epoch": 0.03985924485506127, + "grad_norm": 2.1414785385131836, + "learning_rate": 8.857216895568802e-06, + "loss": 1.6401, + "step": 1709 + }, + { + "epoch": 0.039882567994239185, + "grad_norm": 1.4582406282424927, + "learning_rate": 8.862399585384816e-06, + "loss": 1.4949, + "step": 1710 + }, + { + "epoch": 0.0399058911334171, + "grad_norm": 1.3729748725891113, + "learning_rate": 8.86758227520083e-06, + "loss": 0.8325, + "step": 1711 + }, + { + "epoch": 0.03992921427259502, + "grad_norm": 1.5666522979736328, + "learning_rate": 8.872764965016845e-06, + "loss": 1.6165, + "step": 1712 + }, + { + "epoch": 0.03995253741177294, + "grad_norm": 1.8730623722076416, + "learning_rate": 8.877947654832859e-06, + "loss": 1.5912, + "step": 1713 + }, + { + "epoch": 0.039975860550950856, + "grad_norm": 1.3995941877365112, + "learning_rate": 8.883130344648873e-06, + "loss": 1.4624, + "step": 1714 + }, + { + "epoch": 0.039999183690128774, + "grad_norm": 1.6787446737289429, + "learning_rate": 8.888313034464888e-06, + "loss": 1.7264, + "step": 1715 + }, + { + "epoch": 0.04002250682930669, + "grad_norm": 1.6797045469284058, + "learning_rate": 8.893495724280902e-06, + "loss": 1.458, + "step": 1716 + }, + { + "epoch": 0.04004582996848461, + "grad_norm": 1.4562252759933472, + "learning_rate": 8.898678414096917e-06, + "loss": 1.4469, + "step": 1717 + }, + { + "epoch": 0.04006915310766253, + "grad_norm": 1.8270559310913086, + "learning_rate": 8.903861103912933e-06, + "loss": 1.5524, + "step": 1718 + }, + { + "epoch": 0.040092476246840446, + "grad_norm": 2.2723021507263184, + "learning_rate": 8.909043793728947e-06, + "loss": 1.5524, + "step": 1719 + }, + { + "epoch": 0.040115799386018364, + "grad_norm": 1.6696120500564575, + "learning_rate": 8.914226483544961e-06, + "loss": 1.6466, + "step": 1720 + }, + { + "epoch": 0.04013912252519628, + "grad_norm": 1.8067409992218018, + "learning_rate": 8.919409173360976e-06, + "loss": 1.4901, + "step": 1721 + }, + { + "epoch": 0.0401624456643742, + "grad_norm": 1.6212742328643799, + "learning_rate": 8.92459186317699e-06, + "loss": 1.3791, + "step": 1722 + }, + { + "epoch": 0.04018576880355212, + "grad_norm": 1.5557783842086792, + "learning_rate": 8.929774552993004e-06, + "loss": 1.4122, + "step": 1723 + }, + { + "epoch": 0.04020909194273003, + "grad_norm": 2.65142822265625, + "learning_rate": 8.934957242809019e-06, + "loss": 1.3679, + "step": 1724 + }, + { + "epoch": 0.040232415081907946, + "grad_norm": 1.9991352558135986, + "learning_rate": 8.940139932625033e-06, + "loss": 1.4746, + "step": 1725 + }, + { + "epoch": 0.040255738221085864, + "grad_norm": 2.054579257965088, + "learning_rate": 8.945322622441047e-06, + "loss": 1.5759, + "step": 1726 + }, + { + "epoch": 0.04027906136026378, + "grad_norm": 1.62351393699646, + "learning_rate": 8.950505312257062e-06, + "loss": 1.4139, + "step": 1727 + }, + { + "epoch": 0.0403023844994417, + "grad_norm": 1.754712462425232, + "learning_rate": 8.955688002073076e-06, + "loss": 1.2871, + "step": 1728 + }, + { + "epoch": 0.04032570763861962, + "grad_norm": 1.744728922843933, + "learning_rate": 8.96087069188909e-06, + "loss": 1.5207, + "step": 1729 + }, + { + "epoch": 0.040349030777797536, + "grad_norm": 1.9871348142623901, + "learning_rate": 8.966053381705105e-06, + "loss": 1.7927, + "step": 1730 + }, + { + "epoch": 0.040372353916975454, + "grad_norm": 1.898793339729309, + "learning_rate": 8.97123607152112e-06, + "loss": 1.5487, + "step": 1731 + }, + { + "epoch": 0.04039567705615337, + "grad_norm": 1.6234720945358276, + "learning_rate": 8.976418761337135e-06, + "loss": 1.3666, + "step": 1732 + }, + { + "epoch": 0.04041900019533129, + "grad_norm": 1.7883436679840088, + "learning_rate": 8.98160145115315e-06, + "loss": 1.59, + "step": 1733 + }, + { + "epoch": 0.04044232333450921, + "grad_norm": 2.030747890472412, + "learning_rate": 8.986784140969164e-06, + "loss": 1.5484, + "step": 1734 + }, + { + "epoch": 0.040465646473687125, + "grad_norm": 1.5323489904403687, + "learning_rate": 8.991966830785178e-06, + "loss": 1.4076, + "step": 1735 + }, + { + "epoch": 0.04048896961286504, + "grad_norm": 1.545076847076416, + "learning_rate": 8.997149520601193e-06, + "loss": 1.9024, + "step": 1736 + }, + { + "epoch": 0.04051229275204296, + "grad_norm": 1.775343656539917, + "learning_rate": 9.002332210417207e-06, + "loss": 1.6269, + "step": 1737 + }, + { + "epoch": 0.04053561589122088, + "grad_norm": 1.5936089754104614, + "learning_rate": 9.007514900233221e-06, + "loss": 1.388, + "step": 1738 + }, + { + "epoch": 0.0405589390303988, + "grad_norm": 2.0282087326049805, + "learning_rate": 9.012697590049236e-06, + "loss": 1.5258, + "step": 1739 + }, + { + "epoch": 0.040582262169576715, + "grad_norm": 1.769651174545288, + "learning_rate": 9.017880279865252e-06, + "loss": 1.6468, + "step": 1740 + }, + { + "epoch": 0.04060558530875463, + "grad_norm": 1.671475887298584, + "learning_rate": 9.023062969681266e-06, + "loss": 1.457, + "step": 1741 + }, + { + "epoch": 0.04062890844793255, + "grad_norm": 1.5717363357543945, + "learning_rate": 9.02824565949728e-06, + "loss": 1.0661, + "step": 1742 + }, + { + "epoch": 0.04065223158711047, + "grad_norm": 2.1011769771575928, + "learning_rate": 9.033428349313295e-06, + "loss": 1.8212, + "step": 1743 + }, + { + "epoch": 0.040675554726288386, + "grad_norm": 1.8593213558197021, + "learning_rate": 9.038611039129309e-06, + "loss": 1.2838, + "step": 1744 + }, + { + "epoch": 0.040698877865466304, + "grad_norm": 3.45039963722229, + "learning_rate": 9.043793728945323e-06, + "loss": 1.2977, + "step": 1745 + }, + { + "epoch": 0.04072220100464422, + "grad_norm": 1.5961792469024658, + "learning_rate": 9.048976418761338e-06, + "loss": 1.598, + "step": 1746 + }, + { + "epoch": 0.04074552414382214, + "grad_norm": 1.7901935577392578, + "learning_rate": 9.054159108577352e-06, + "loss": 1.197, + "step": 1747 + }, + { + "epoch": 0.04076884728300006, + "grad_norm": 1.7534990310668945, + "learning_rate": 9.059341798393368e-06, + "loss": 1.5957, + "step": 1748 + }, + { + "epoch": 0.040792170422177976, + "grad_norm": 2.0215656757354736, + "learning_rate": 9.064524488209382e-06, + "loss": 1.4019, + "step": 1749 + }, + { + "epoch": 0.040815493561355894, + "grad_norm": 1.7355159521102905, + "learning_rate": 9.069707178025397e-06, + "loss": 1.6056, + "step": 1750 + }, + { + "epoch": 0.04083881670053381, + "grad_norm": 2.3358545303344727, + "learning_rate": 9.07488986784141e-06, + "loss": 1.3946, + "step": 1751 + }, + { + "epoch": 0.04086213983971173, + "grad_norm": 2.4582395553588867, + "learning_rate": 9.080072557657424e-06, + "loss": 1.3848, + "step": 1752 + }, + { + "epoch": 0.04088546297888964, + "grad_norm": 1.8667892217636108, + "learning_rate": 9.08525524747344e-06, + "loss": 1.5908, + "step": 1753 + }, + { + "epoch": 0.04090878611806756, + "grad_norm": 2.2128000259399414, + "learning_rate": 9.090437937289454e-06, + "loss": 1.4584, + "step": 1754 + }, + { + "epoch": 0.040932109257245476, + "grad_norm": 1.714179277420044, + "learning_rate": 9.095620627105468e-06, + "loss": 1.3882, + "step": 1755 + }, + { + "epoch": 0.040955432396423394, + "grad_norm": 1.7891523838043213, + "learning_rate": 9.100803316921483e-06, + "loss": 1.6921, + "step": 1756 + }, + { + "epoch": 0.04097875553560131, + "grad_norm": 2.0620603561401367, + "learning_rate": 9.105986006737497e-06, + "loss": 1.4833, + "step": 1757 + }, + { + "epoch": 0.04100207867477923, + "grad_norm": 1.4664239883422852, + "learning_rate": 9.111168696553512e-06, + "loss": 1.5, + "step": 1758 + }, + { + "epoch": 0.04102540181395715, + "grad_norm": 2.151362180709839, + "learning_rate": 9.116351386369526e-06, + "loss": 1.4189, + "step": 1759 + }, + { + "epoch": 0.041048724953135066, + "grad_norm": 2.1404523849487305, + "learning_rate": 9.12153407618554e-06, + "loss": 1.512, + "step": 1760 + }, + { + "epoch": 0.041072048092312984, + "grad_norm": 1.5175687074661255, + "learning_rate": 9.126716766001556e-06, + "loss": 1.3527, + "step": 1761 + }, + { + "epoch": 0.0410953712314909, + "grad_norm": 1.6199604272842407, + "learning_rate": 9.13189945581757e-06, + "loss": 1.1717, + "step": 1762 + }, + { + "epoch": 0.04111869437066882, + "grad_norm": 1.655900001525879, + "learning_rate": 9.137082145633585e-06, + "loss": 1.4903, + "step": 1763 + }, + { + "epoch": 0.04114201750984674, + "grad_norm": 1.6075772047042847, + "learning_rate": 9.1422648354496e-06, + "loss": 1.3745, + "step": 1764 + }, + { + "epoch": 0.041165340649024655, + "grad_norm": 1.5534958839416504, + "learning_rate": 9.147447525265614e-06, + "loss": 1.4659, + "step": 1765 + }, + { + "epoch": 0.04118866378820257, + "grad_norm": 2.197490930557251, + "learning_rate": 9.152630215081628e-06, + "loss": 1.5412, + "step": 1766 + }, + { + "epoch": 0.04121198692738049, + "grad_norm": 2.1121668815612793, + "learning_rate": 9.157812904897642e-06, + "loss": 1.7137, + "step": 1767 + }, + { + "epoch": 0.04123531006655841, + "grad_norm": 2.2003660202026367, + "learning_rate": 9.162995594713657e-06, + "loss": 1.6095, + "step": 1768 + }, + { + "epoch": 0.04125863320573633, + "grad_norm": 1.617874264717102, + "learning_rate": 9.168178284529671e-06, + "loss": 1.4913, + "step": 1769 + }, + { + "epoch": 0.041281956344914245, + "grad_norm": 1.6809815168380737, + "learning_rate": 9.173360974345687e-06, + "loss": 1.6014, + "step": 1770 + }, + { + "epoch": 0.04130527948409216, + "grad_norm": 1.8234214782714844, + "learning_rate": 9.178543664161701e-06, + "loss": 1.4921, + "step": 1771 + }, + { + "epoch": 0.04132860262327008, + "grad_norm": 1.605371117591858, + "learning_rate": 9.183726353977716e-06, + "loss": 1.526, + "step": 1772 + }, + { + "epoch": 0.041351925762448, + "grad_norm": 1.7158360481262207, + "learning_rate": 9.18890904379373e-06, + "loss": 1.6063, + "step": 1773 + }, + { + "epoch": 0.041375248901625916, + "grad_norm": 1.8888566493988037, + "learning_rate": 9.194091733609744e-06, + "loss": 1.3013, + "step": 1774 + }, + { + "epoch": 0.041398572040803834, + "grad_norm": 1.8596553802490234, + "learning_rate": 9.199274423425759e-06, + "loss": 1.3611, + "step": 1775 + }, + { + "epoch": 0.04142189517998175, + "grad_norm": 1.770941972732544, + "learning_rate": 9.204457113241773e-06, + "loss": 1.553, + "step": 1776 + }, + { + "epoch": 0.04144521831915967, + "grad_norm": 1.4563987255096436, + "learning_rate": 9.209639803057788e-06, + "loss": 1.3261, + "step": 1777 + }, + { + "epoch": 0.04146854145833759, + "grad_norm": 1.5590494871139526, + "learning_rate": 9.214822492873802e-06, + "loss": 1.6303, + "step": 1778 + }, + { + "epoch": 0.041491864597515506, + "grad_norm": 1.6040290594100952, + "learning_rate": 9.220005182689816e-06, + "loss": 1.6656, + "step": 1779 + }, + { + "epoch": 0.041515187736693424, + "grad_norm": 1.6253089904785156, + "learning_rate": 9.22518787250583e-06, + "loss": 1.3086, + "step": 1780 + }, + { + "epoch": 0.041538510875871335, + "grad_norm": 2.282277822494507, + "learning_rate": 9.230370562321845e-06, + "loss": 1.3154, + "step": 1781 + }, + { + "epoch": 0.04156183401504925, + "grad_norm": 1.6955877542495728, + "learning_rate": 9.23555325213786e-06, + "loss": 1.4742, + "step": 1782 + }, + { + "epoch": 0.04158515715422717, + "grad_norm": 2.6918323040008545, + "learning_rate": 9.240735941953875e-06, + "loss": 1.4942, + "step": 1783 + }, + { + "epoch": 0.04160848029340509, + "grad_norm": 2.111135244369507, + "learning_rate": 9.24591863176989e-06, + "loss": 1.4501, + "step": 1784 + }, + { + "epoch": 0.041631803432583006, + "grad_norm": 1.6524665355682373, + "learning_rate": 9.251101321585904e-06, + "loss": 1.2801, + "step": 1785 + }, + { + "epoch": 0.041655126571760924, + "grad_norm": 1.812553882598877, + "learning_rate": 9.256284011401918e-06, + "loss": 1.2928, + "step": 1786 + }, + { + "epoch": 0.04167844971093884, + "grad_norm": 1.7474865913391113, + "learning_rate": 9.261466701217933e-06, + "loss": 1.5489, + "step": 1787 + }, + { + "epoch": 0.04170177285011676, + "grad_norm": 1.91874098777771, + "learning_rate": 9.266649391033947e-06, + "loss": 1.5997, + "step": 1788 + }, + { + "epoch": 0.04172509598929468, + "grad_norm": 1.4715979099273682, + "learning_rate": 9.271832080849961e-06, + "loss": 0.921, + "step": 1789 + }, + { + "epoch": 0.041748419128472596, + "grad_norm": 1.599254846572876, + "learning_rate": 9.277014770665976e-06, + "loss": 1.5168, + "step": 1790 + }, + { + "epoch": 0.04177174226765051, + "grad_norm": 1.8970310688018799, + "learning_rate": 9.28219746048199e-06, + "loss": 1.4821, + "step": 1791 + }, + { + "epoch": 0.04179506540682843, + "grad_norm": 1.5975875854492188, + "learning_rate": 9.287380150298006e-06, + "loss": 1.4889, + "step": 1792 + }, + { + "epoch": 0.04181838854600635, + "grad_norm": 1.7852643728256226, + "learning_rate": 9.29256284011402e-06, + "loss": 1.4124, + "step": 1793 + }, + { + "epoch": 0.04184171168518427, + "grad_norm": 1.8535397052764893, + "learning_rate": 9.297745529930035e-06, + "loss": 1.5964, + "step": 1794 + }, + { + "epoch": 0.041865034824362185, + "grad_norm": 1.532125473022461, + "learning_rate": 9.302928219746049e-06, + "loss": 1.2431, + "step": 1795 + }, + { + "epoch": 0.0418883579635401, + "grad_norm": 1.542386531829834, + "learning_rate": 9.308110909562063e-06, + "loss": 1.6327, + "step": 1796 + }, + { + "epoch": 0.04191168110271802, + "grad_norm": 1.8671448230743408, + "learning_rate": 9.313293599378078e-06, + "loss": 1.7695, + "step": 1797 + }, + { + "epoch": 0.04193500424189594, + "grad_norm": 1.6148124933242798, + "learning_rate": 9.318476289194092e-06, + "loss": 1.7227, + "step": 1798 + }, + { + "epoch": 0.04195832738107386, + "grad_norm": 1.4859371185302734, + "learning_rate": 9.323658979010107e-06, + "loss": 1.2807, + "step": 1799 + }, + { + "epoch": 0.041981650520251775, + "grad_norm": 3.0297629833221436, + "learning_rate": 9.328841668826123e-06, + "loss": 1.3824, + "step": 1800 + }, + { + "epoch": 0.04200497365942969, + "grad_norm": 1.6791976690292358, + "learning_rate": 9.334024358642137e-06, + "loss": 1.7325, + "step": 1801 + }, + { + "epoch": 0.04202829679860761, + "grad_norm": 1.4695453643798828, + "learning_rate": 9.339207048458151e-06, + "loss": 1.2062, + "step": 1802 + }, + { + "epoch": 0.04205161993778553, + "grad_norm": 1.5592173337936401, + "learning_rate": 9.344389738274166e-06, + "loss": 1.1919, + "step": 1803 + }, + { + "epoch": 0.042074943076963446, + "grad_norm": 1.4761253595352173, + "learning_rate": 9.34957242809018e-06, + "loss": 1.2845, + "step": 1804 + }, + { + "epoch": 0.042098266216141364, + "grad_norm": 1.3584182262420654, + "learning_rate": 9.354755117906194e-06, + "loss": 1.6216, + "step": 1805 + }, + { + "epoch": 0.04212158935531928, + "grad_norm": 2.0344326496124268, + "learning_rate": 9.359937807722209e-06, + "loss": 1.2301, + "step": 1806 + }, + { + "epoch": 0.0421449124944972, + "grad_norm": 1.549643874168396, + "learning_rate": 9.365120497538223e-06, + "loss": 1.446, + "step": 1807 + }, + { + "epoch": 0.04216823563367512, + "grad_norm": 1.6695293188095093, + "learning_rate": 9.370303187354237e-06, + "loss": 1.7588, + "step": 1808 + }, + { + "epoch": 0.042191558772853036, + "grad_norm": 1.817617416381836, + "learning_rate": 9.375485877170252e-06, + "loss": 1.5394, + "step": 1809 + }, + { + "epoch": 0.04221488191203095, + "grad_norm": 1.917152762413025, + "learning_rate": 9.380668566986266e-06, + "loss": 1.6437, + "step": 1810 + }, + { + "epoch": 0.042238205051208864, + "grad_norm": 0.9892622828483582, + "learning_rate": 9.38585125680228e-06, + "loss": 1.1554, + "step": 1811 + }, + { + "epoch": 0.04226152819038678, + "grad_norm": 1.577576994895935, + "learning_rate": 9.391033946618295e-06, + "loss": 1.4737, + "step": 1812 + }, + { + "epoch": 0.0422848513295647, + "grad_norm": 1.739229679107666, + "learning_rate": 9.39621663643431e-06, + "loss": 1.4077, + "step": 1813 + }, + { + "epoch": 0.04230817446874262, + "grad_norm": 1.6817034482955933, + "learning_rate": 9.401399326250325e-06, + "loss": 0.9329, + "step": 1814 + }, + { + "epoch": 0.042331497607920536, + "grad_norm": 1.6616978645324707, + "learning_rate": 9.40658201606634e-06, + "loss": 1.6185, + "step": 1815 + }, + { + "epoch": 0.042354820747098454, + "grad_norm": 1.379654049873352, + "learning_rate": 9.411764705882354e-06, + "loss": 1.6863, + "step": 1816 + }, + { + "epoch": 0.04237814388627637, + "grad_norm": 2.3998191356658936, + "learning_rate": 9.416947395698368e-06, + "loss": 1.4281, + "step": 1817 + }, + { + "epoch": 0.04240146702545429, + "grad_norm": 2.078322410583496, + "learning_rate": 9.422130085514383e-06, + "loss": 1.2324, + "step": 1818 + }, + { + "epoch": 0.04242479016463221, + "grad_norm": 1.8474605083465576, + "learning_rate": 9.427312775330397e-06, + "loss": 1.3242, + "step": 1819 + }, + { + "epoch": 0.042448113303810125, + "grad_norm": 1.4538230895996094, + "learning_rate": 9.432495465146411e-06, + "loss": 1.3117, + "step": 1820 + }, + { + "epoch": 0.04247143644298804, + "grad_norm": 2.528913974761963, + "learning_rate": 9.437678154962426e-06, + "loss": 1.3846, + "step": 1821 + }, + { + "epoch": 0.04249475958216596, + "grad_norm": 1.5370780229568481, + "learning_rate": 9.442860844778442e-06, + "loss": 1.4712, + "step": 1822 + }, + { + "epoch": 0.04251808272134388, + "grad_norm": 1.7554328441619873, + "learning_rate": 9.448043534594456e-06, + "loss": 1.5354, + "step": 1823 + }, + { + "epoch": 0.0425414058605218, + "grad_norm": 1.490560531616211, + "learning_rate": 9.45322622441047e-06, + "loss": 1.1725, + "step": 1824 + }, + { + "epoch": 0.042564728999699715, + "grad_norm": 1.55622136592865, + "learning_rate": 9.458408914226485e-06, + "loss": 1.5401, + "step": 1825 + }, + { + "epoch": 0.04258805213887763, + "grad_norm": 1.6288939714431763, + "learning_rate": 9.463591604042499e-06, + "loss": 1.404, + "step": 1826 + }, + { + "epoch": 0.04261137527805555, + "grad_norm": 1.9815454483032227, + "learning_rate": 9.468774293858513e-06, + "loss": 1.538, + "step": 1827 + }, + { + "epoch": 0.04263469841723347, + "grad_norm": 1.8967722654342651, + "learning_rate": 9.473956983674528e-06, + "loss": 1.4561, + "step": 1828 + }, + { + "epoch": 0.04265802155641139, + "grad_norm": 2.010972023010254, + "learning_rate": 9.479139673490542e-06, + "loss": 1.692, + "step": 1829 + }, + { + "epoch": 0.042681344695589304, + "grad_norm": 1.82353937625885, + "learning_rate": 9.484322363306558e-06, + "loss": 1.6394, + "step": 1830 + }, + { + "epoch": 0.04270466783476722, + "grad_norm": 1.6288769245147705, + "learning_rate": 9.489505053122572e-06, + "loss": 1.7251, + "step": 1831 + }, + { + "epoch": 0.04272799097394514, + "grad_norm": 2.7632317543029785, + "learning_rate": 9.494687742938587e-06, + "loss": 1.5771, + "step": 1832 + }, + { + "epoch": 0.04275131411312306, + "grad_norm": 1.7157068252563477, + "learning_rate": 9.499870432754601e-06, + "loss": 1.9245, + "step": 1833 + }, + { + "epoch": 0.042774637252300976, + "grad_norm": 1.6728345155715942, + "learning_rate": 9.505053122570614e-06, + "loss": 1.5874, + "step": 1834 + }, + { + "epoch": 0.042797960391478894, + "grad_norm": 1.6265268325805664, + "learning_rate": 9.51023581238663e-06, + "loss": 1.6633, + "step": 1835 + }, + { + "epoch": 0.04282128353065681, + "grad_norm": 1.8013489246368408, + "learning_rate": 9.515418502202644e-06, + "loss": 1.3856, + "step": 1836 + }, + { + "epoch": 0.04284460666983473, + "grad_norm": 1.85427987575531, + "learning_rate": 9.520601192018658e-06, + "loss": 1.2233, + "step": 1837 + }, + { + "epoch": 0.04286792980901264, + "grad_norm": 1.6943988800048828, + "learning_rate": 9.525783881834673e-06, + "loss": 1.3198, + "step": 1838 + }, + { + "epoch": 0.04289125294819056, + "grad_norm": 1.7103756666183472, + "learning_rate": 9.530966571650687e-06, + "loss": 1.4118, + "step": 1839 + }, + { + "epoch": 0.042914576087368476, + "grad_norm": 2.0107672214508057, + "learning_rate": 9.536149261466702e-06, + "loss": 1.3456, + "step": 1840 + }, + { + "epoch": 0.042937899226546394, + "grad_norm": 1.505422830581665, + "learning_rate": 9.541331951282716e-06, + "loss": 1.2676, + "step": 1841 + }, + { + "epoch": 0.04296122236572431, + "grad_norm": 2.090595245361328, + "learning_rate": 9.54651464109873e-06, + "loss": 1.2113, + "step": 1842 + }, + { + "epoch": 0.04298454550490223, + "grad_norm": 1.7776191234588623, + "learning_rate": 9.551697330914745e-06, + "loss": 1.5694, + "step": 1843 + }, + { + "epoch": 0.04300786864408015, + "grad_norm": 3.0254878997802734, + "learning_rate": 9.55688002073076e-06, + "loss": 1.244, + "step": 1844 + }, + { + "epoch": 0.043031191783258066, + "grad_norm": 1.8657838106155396, + "learning_rate": 9.562062710546775e-06, + "loss": 1.9444, + "step": 1845 + }, + { + "epoch": 0.043054514922435984, + "grad_norm": 2.1006710529327393, + "learning_rate": 9.56724540036279e-06, + "loss": 1.3202, + "step": 1846 + }, + { + "epoch": 0.0430778380616139, + "grad_norm": 1.2389309406280518, + "learning_rate": 9.572428090178804e-06, + "loss": 1.1992, + "step": 1847 + }, + { + "epoch": 0.04310116120079182, + "grad_norm": 2.162818193435669, + "learning_rate": 9.577610779994818e-06, + "loss": 1.5446, + "step": 1848 + }, + { + "epoch": 0.04312448433996974, + "grad_norm": 2.476367950439453, + "learning_rate": 9.582793469810832e-06, + "loss": 1.6178, + "step": 1849 + }, + { + "epoch": 0.043147807479147655, + "grad_norm": 2.1805801391601562, + "learning_rate": 9.587976159626847e-06, + "loss": 1.5745, + "step": 1850 + }, + { + "epoch": 0.04317113061832557, + "grad_norm": 1.7875632047653198, + "learning_rate": 9.593158849442861e-06, + "loss": 1.6798, + "step": 1851 + }, + { + "epoch": 0.04319445375750349, + "grad_norm": 2.506103515625, + "learning_rate": 9.598341539258877e-06, + "loss": 1.2824, + "step": 1852 + }, + { + "epoch": 0.04321777689668141, + "grad_norm": 2.027400016784668, + "learning_rate": 9.603524229074891e-06, + "loss": 1.7745, + "step": 1853 + }, + { + "epoch": 0.04324110003585933, + "grad_norm": 1.5254895687103271, + "learning_rate": 9.608706918890906e-06, + "loss": 1.6716, + "step": 1854 + }, + { + "epoch": 0.043264423175037245, + "grad_norm": 1.9832854270935059, + "learning_rate": 9.61388960870692e-06, + "loss": 1.2432, + "step": 1855 + }, + { + "epoch": 0.04328774631421516, + "grad_norm": 1.3785820007324219, + "learning_rate": 9.619072298522934e-06, + "loss": 1.3452, + "step": 1856 + }, + { + "epoch": 0.04331106945339308, + "grad_norm": 2.0536274909973145, + "learning_rate": 9.624254988338949e-06, + "loss": 1.9594, + "step": 1857 + }, + { + "epoch": 0.043334392592571, + "grad_norm": 1.8014826774597168, + "learning_rate": 9.629437678154963e-06, + "loss": 1.4811, + "step": 1858 + }, + { + "epoch": 0.043357715731748916, + "grad_norm": 1.5722678899765015, + "learning_rate": 9.634620367970978e-06, + "loss": 1.2694, + "step": 1859 + }, + { + "epoch": 0.043381038870926834, + "grad_norm": 1.849761724472046, + "learning_rate": 9.639803057786994e-06, + "loss": 1.4856, + "step": 1860 + }, + { + "epoch": 0.04340436201010475, + "grad_norm": 1.412558913230896, + "learning_rate": 9.644985747603006e-06, + "loss": 1.4672, + "step": 1861 + }, + { + "epoch": 0.04342768514928267, + "grad_norm": 2.028230667114258, + "learning_rate": 9.65016843741902e-06, + "loss": 1.5573, + "step": 1862 + }, + { + "epoch": 0.04345100828846059, + "grad_norm": 2.5457494258880615, + "learning_rate": 9.655351127235035e-06, + "loss": 1.3734, + "step": 1863 + }, + { + "epoch": 0.043474331427638506, + "grad_norm": 1.6199779510498047, + "learning_rate": 9.66053381705105e-06, + "loss": 1.6676, + "step": 1864 + }, + { + "epoch": 0.043497654566816424, + "grad_norm": 1.4922274351119995, + "learning_rate": 9.665716506867064e-06, + "loss": 1.5909, + "step": 1865 + }, + { + "epoch": 0.04352097770599434, + "grad_norm": 1.545914649963379, + "learning_rate": 9.67089919668308e-06, + "loss": 1.0683, + "step": 1866 + }, + { + "epoch": 0.04354430084517225, + "grad_norm": 1.4928728342056274, + "learning_rate": 9.676081886499094e-06, + "loss": 1.1975, + "step": 1867 + }, + { + "epoch": 0.04356762398435017, + "grad_norm": 2.042757272720337, + "learning_rate": 9.681264576315108e-06, + "loss": 1.9166, + "step": 1868 + }, + { + "epoch": 0.04359094712352809, + "grad_norm": 1.9415842294692993, + "learning_rate": 9.686447266131123e-06, + "loss": 1.5207, + "step": 1869 + }, + { + "epoch": 0.043614270262706006, + "grad_norm": 1.6906239986419678, + "learning_rate": 9.691629955947137e-06, + "loss": 1.4171, + "step": 1870 + }, + { + "epoch": 0.043637593401883924, + "grad_norm": 1.5644055604934692, + "learning_rate": 9.696812645763151e-06, + "loss": 1.4997, + "step": 1871 + }, + { + "epoch": 0.04366091654106184, + "grad_norm": 1.7778024673461914, + "learning_rate": 9.701995335579166e-06, + "loss": 1.3872, + "step": 1872 + }, + { + "epoch": 0.04368423968023976, + "grad_norm": 1.9999544620513916, + "learning_rate": 9.70717802539518e-06, + "loss": 1.6039, + "step": 1873 + }, + { + "epoch": 0.04370756281941768, + "grad_norm": 2.1065220832824707, + "learning_rate": 9.712360715211196e-06, + "loss": 1.4525, + "step": 1874 + }, + { + "epoch": 0.043730885958595596, + "grad_norm": 1.785739541053772, + "learning_rate": 9.71754340502721e-06, + "loss": 1.7723, + "step": 1875 + }, + { + "epoch": 0.043754209097773514, + "grad_norm": 1.7912609577178955, + "learning_rate": 9.722726094843225e-06, + "loss": 1.8857, + "step": 1876 + }, + { + "epoch": 0.04377753223695143, + "grad_norm": 2.2229981422424316, + "learning_rate": 9.727908784659239e-06, + "loss": 1.7359, + "step": 1877 + }, + { + "epoch": 0.04380085537612935, + "grad_norm": 1.7545627355575562, + "learning_rate": 9.733091474475253e-06, + "loss": 1.3878, + "step": 1878 + }, + { + "epoch": 0.04382417851530727, + "grad_norm": 1.6687484979629517, + "learning_rate": 9.738274164291268e-06, + "loss": 1.3148, + "step": 1879 + }, + { + "epoch": 0.043847501654485185, + "grad_norm": 1.661619782447815, + "learning_rate": 9.743456854107282e-06, + "loss": 1.5319, + "step": 1880 + }, + { + "epoch": 0.0438708247936631, + "grad_norm": 1.6879695653915405, + "learning_rate": 9.748639543923297e-06, + "loss": 1.2871, + "step": 1881 + }, + { + "epoch": 0.04389414793284102, + "grad_norm": 1.614043116569519, + "learning_rate": 9.753822233739313e-06, + "loss": 1.0429, + "step": 1882 + }, + { + "epoch": 0.04391747107201894, + "grad_norm": 1.310645341873169, + "learning_rate": 9.759004923555327e-06, + "loss": 1.5535, + "step": 1883 + }, + { + "epoch": 0.04394079421119686, + "grad_norm": 1.677807092666626, + "learning_rate": 9.764187613371341e-06, + "loss": 1.5612, + "step": 1884 + }, + { + "epoch": 0.043964117350374775, + "grad_norm": 2.004786252975464, + "learning_rate": 9.769370303187356e-06, + "loss": 1.1547, + "step": 1885 + }, + { + "epoch": 0.04398744048955269, + "grad_norm": 2.4537112712860107, + "learning_rate": 9.77455299300337e-06, + "loss": 1.6863, + "step": 1886 + }, + { + "epoch": 0.04401076362873061, + "grad_norm": 1.8132030963897705, + "learning_rate": 9.779735682819384e-06, + "loss": 1.2049, + "step": 1887 + }, + { + "epoch": 0.04403408676790853, + "grad_norm": 1.954026699066162, + "learning_rate": 9.784918372635399e-06, + "loss": 1.3946, + "step": 1888 + }, + { + "epoch": 0.044057409907086446, + "grad_norm": 1.742790699005127, + "learning_rate": 9.790101062451413e-06, + "loss": 1.3851, + "step": 1889 + }, + { + "epoch": 0.044080733046264364, + "grad_norm": 2.010481357574463, + "learning_rate": 9.795283752267427e-06, + "loss": 1.4181, + "step": 1890 + }, + { + "epoch": 0.04410405618544228, + "grad_norm": 1.6661536693572998, + "learning_rate": 9.800466442083442e-06, + "loss": 1.1611, + "step": 1891 + }, + { + "epoch": 0.0441273793246202, + "grad_norm": 1.6758571863174438, + "learning_rate": 9.805649131899456e-06, + "loss": 1.0906, + "step": 1892 + }, + { + "epoch": 0.04415070246379812, + "grad_norm": 1.7925001382827759, + "learning_rate": 9.81083182171547e-06, + "loss": 1.4299, + "step": 1893 + }, + { + "epoch": 0.044174025602976036, + "grad_norm": 1.9415634870529175, + "learning_rate": 9.816014511531485e-06, + "loss": 1.5619, + "step": 1894 + }, + { + "epoch": 0.04419734874215395, + "grad_norm": 1.3546884059906006, + "learning_rate": 9.821197201347499e-06, + "loss": 1.4994, + "step": 1895 + }, + { + "epoch": 0.044220671881331865, + "grad_norm": 2.0756897926330566, + "learning_rate": 9.826379891163515e-06, + "loss": 1.7483, + "step": 1896 + }, + { + "epoch": 0.04424399502050978, + "grad_norm": 1.7983125448226929, + "learning_rate": 9.83156258097953e-06, + "loss": 1.586, + "step": 1897 + }, + { + "epoch": 0.0442673181596877, + "grad_norm": 1.5559202432632446, + "learning_rate": 9.836745270795544e-06, + "loss": 1.5093, + "step": 1898 + }, + { + "epoch": 0.04429064129886562, + "grad_norm": 1.772439956665039, + "learning_rate": 9.841927960611558e-06, + "loss": 1.3449, + "step": 1899 + }, + { + "epoch": 0.044313964438043536, + "grad_norm": 1.9158481359481812, + "learning_rate": 9.847110650427573e-06, + "loss": 1.3239, + "step": 1900 + }, + { + "epoch": 0.044337287577221454, + "grad_norm": 1.801500916481018, + "learning_rate": 9.852293340243587e-06, + "loss": 1.1534, + "step": 1901 + }, + { + "epoch": 0.04436061071639937, + "grad_norm": 1.5766456127166748, + "learning_rate": 9.857476030059601e-06, + "loss": 1.7678, + "step": 1902 + }, + { + "epoch": 0.04438393385557729, + "grad_norm": 1.852655053138733, + "learning_rate": 9.862658719875616e-06, + "loss": 1.7286, + "step": 1903 + }, + { + "epoch": 0.04440725699475521, + "grad_norm": 1.9849982261657715, + "learning_rate": 9.867841409691632e-06, + "loss": 1.3984, + "step": 1904 + }, + { + "epoch": 0.044430580133933126, + "grad_norm": 1.7213250398635864, + "learning_rate": 9.873024099507646e-06, + "loss": 1.6215, + "step": 1905 + }, + { + "epoch": 0.044453903273111044, + "grad_norm": 1.9416676759719849, + "learning_rate": 9.87820678932366e-06, + "loss": 1.6314, + "step": 1906 + }, + { + "epoch": 0.04447722641228896, + "grad_norm": 1.8408985137939453, + "learning_rate": 9.883389479139675e-06, + "loss": 1.6611, + "step": 1907 + }, + { + "epoch": 0.04450054955146688, + "grad_norm": 1.528350591659546, + "learning_rate": 9.888572168955689e-06, + "loss": 1.7559, + "step": 1908 + }, + { + "epoch": 0.0445238726906448, + "grad_norm": 1.6557738780975342, + "learning_rate": 9.893754858771703e-06, + "loss": 1.5072, + "step": 1909 + }, + { + "epoch": 0.044547195829822715, + "grad_norm": 2.0431089401245117, + "learning_rate": 9.898937548587718e-06, + "loss": 1.2895, + "step": 1910 + }, + { + "epoch": 0.04457051896900063, + "grad_norm": 1.8927110433578491, + "learning_rate": 9.904120238403732e-06, + "loss": 1.4221, + "step": 1911 + }, + { + "epoch": 0.04459384210817855, + "grad_norm": 1.547044038772583, + "learning_rate": 9.909302928219748e-06, + "loss": 1.2597, + "step": 1912 + }, + { + "epoch": 0.04461716524735647, + "grad_norm": 1.81504487991333, + "learning_rate": 9.914485618035762e-06, + "loss": 1.4845, + "step": 1913 + }, + { + "epoch": 0.04464048838653439, + "grad_norm": 3.442282199859619, + "learning_rate": 9.919668307851777e-06, + "loss": 1.0979, + "step": 1914 + }, + { + "epoch": 0.044663811525712305, + "grad_norm": 1.8255623579025269, + "learning_rate": 9.924850997667791e-06, + "loss": 1.6663, + "step": 1915 + }, + { + "epoch": 0.04468713466489022, + "grad_norm": 1.7657500505447388, + "learning_rate": 9.930033687483804e-06, + "loss": 1.765, + "step": 1916 + }, + { + "epoch": 0.04471045780406814, + "grad_norm": 1.6761666536331177, + "learning_rate": 9.935216377299818e-06, + "loss": 1.3525, + "step": 1917 + }, + { + "epoch": 0.04473378094324606, + "grad_norm": 2.3319602012634277, + "learning_rate": 9.940399067115834e-06, + "loss": 1.1265, + "step": 1918 + }, + { + "epoch": 0.044757104082423976, + "grad_norm": 1.6062688827514648, + "learning_rate": 9.945581756931848e-06, + "loss": 1.6085, + "step": 1919 + }, + { + "epoch": 0.044780427221601894, + "grad_norm": 1.4931232929229736, + "learning_rate": 9.950764446747863e-06, + "loss": 1.6418, + "step": 1920 + }, + { + "epoch": 0.04480375036077981, + "grad_norm": 2.0092151165008545, + "learning_rate": 9.955947136563877e-06, + "loss": 1.2352, + "step": 1921 + }, + { + "epoch": 0.04482707349995773, + "grad_norm": 2.2695815563201904, + "learning_rate": 9.961129826379892e-06, + "loss": 1.3626, + "step": 1922 + }, + { + "epoch": 0.04485039663913565, + "grad_norm": 1.6969548463821411, + "learning_rate": 9.966312516195906e-06, + "loss": 1.6971, + "step": 1923 + }, + { + "epoch": 0.04487371977831356, + "grad_norm": 1.8436291217803955, + "learning_rate": 9.97149520601192e-06, + "loss": 1.7701, + "step": 1924 + }, + { + "epoch": 0.04489704291749148, + "grad_norm": 1.7749122381210327, + "learning_rate": 9.976677895827935e-06, + "loss": 1.3771, + "step": 1925 + }, + { + "epoch": 0.044920366056669395, + "grad_norm": 1.9239168167114258, + "learning_rate": 9.98186058564395e-06, + "loss": 1.7554, + "step": 1926 + }, + { + "epoch": 0.04494368919584731, + "grad_norm": 1.5236059427261353, + "learning_rate": 9.987043275459965e-06, + "loss": 1.337, + "step": 1927 + }, + { + "epoch": 0.04496701233502523, + "grad_norm": 2.0506536960601807, + "learning_rate": 9.99222596527598e-06, + "loss": 1.7227, + "step": 1928 + }, + { + "epoch": 0.04499033547420315, + "grad_norm": 1.4491156339645386, + "learning_rate": 9.997408655091994e-06, + "loss": 1.4032, + "step": 1929 + }, + { + "epoch": 0.045013658613381066, + "grad_norm": 2.175860643386841, + "learning_rate": 1.0002591344908008e-05, + "loss": 1.5994, + "step": 1930 + }, + { + "epoch": 0.045036981752558984, + "grad_norm": 1.9326441287994385, + "learning_rate": 1.0007774034724022e-05, + "loss": 1.1194, + "step": 1931 + }, + { + "epoch": 0.0450603048917369, + "grad_norm": 1.8562779426574707, + "learning_rate": 1.0012956724540037e-05, + "loss": 1.7551, + "step": 1932 + }, + { + "epoch": 0.04508362803091482, + "grad_norm": 1.7570141553878784, + "learning_rate": 1.0018139414356051e-05, + "loss": 0.97, + "step": 1933 + }, + { + "epoch": 0.04510695117009274, + "grad_norm": 1.2578299045562744, + "learning_rate": 1.0023322104172067e-05, + "loss": 1.3259, + "step": 1934 + }, + { + "epoch": 0.045130274309270656, + "grad_norm": 2.211773633956909, + "learning_rate": 1.0028504793988081e-05, + "loss": 1.6072, + "step": 1935 + }, + { + "epoch": 0.045153597448448574, + "grad_norm": 1.7696832418441772, + "learning_rate": 1.0033687483804096e-05, + "loss": 1.4227, + "step": 1936 + }, + { + "epoch": 0.04517692058762649, + "grad_norm": 1.940531611442566, + "learning_rate": 1.003887017362011e-05, + "loss": 1.8458, + "step": 1937 + }, + { + "epoch": 0.04520024372680441, + "grad_norm": 2.282905101776123, + "learning_rate": 1.0044052863436124e-05, + "loss": 1.2556, + "step": 1938 + }, + { + "epoch": 0.04522356686598233, + "grad_norm": 1.643122673034668, + "learning_rate": 1.0049235553252139e-05, + "loss": 1.5571, + "step": 1939 + }, + { + "epoch": 0.045246890005160245, + "grad_norm": 1.6886086463928223, + "learning_rate": 1.0054418243068153e-05, + "loss": 1.481, + "step": 1940 + }, + { + "epoch": 0.04527021314433816, + "grad_norm": 2.349867105484009, + "learning_rate": 1.0059600932884168e-05, + "loss": 1.8651, + "step": 1941 + }, + { + "epoch": 0.04529353628351608, + "grad_norm": 2.0965826511383057, + "learning_rate": 1.0064783622700184e-05, + "loss": 1.5702, + "step": 1942 + }, + { + "epoch": 0.045316859422694, + "grad_norm": 1.4684425592422485, + "learning_rate": 1.0069966312516198e-05, + "loss": 1.4283, + "step": 1943 + }, + { + "epoch": 0.04534018256187192, + "grad_norm": 3.0096945762634277, + "learning_rate": 1.0075149002332212e-05, + "loss": 1.4832, + "step": 1944 + }, + { + "epoch": 0.045363505701049835, + "grad_norm": 2.2389118671417236, + "learning_rate": 1.0080331692148227e-05, + "loss": 1.6346, + "step": 1945 + }, + { + "epoch": 0.04538682884022775, + "grad_norm": 1.7624162435531616, + "learning_rate": 1.0085514381964241e-05, + "loss": 1.7017, + "step": 1946 + }, + { + "epoch": 0.04541015197940567, + "grad_norm": 1.8136117458343506, + "learning_rate": 1.0090697071780255e-05, + "loss": 1.5987, + "step": 1947 + }, + { + "epoch": 0.04543347511858359, + "grad_norm": 1.678236484527588, + "learning_rate": 1.009587976159627e-05, + "loss": 1.3684, + "step": 1948 + }, + { + "epoch": 0.045456798257761506, + "grad_norm": 1.7862106561660767, + "learning_rate": 1.0101062451412284e-05, + "loss": 1.7998, + "step": 1949 + }, + { + "epoch": 0.045480121396939424, + "grad_norm": 2.0441555976867676, + "learning_rate": 1.0106245141228298e-05, + "loss": 1.2902, + "step": 1950 + }, + { + "epoch": 0.04550344453611734, + "grad_norm": 1.5820708274841309, + "learning_rate": 1.0111427831044314e-05, + "loss": 1.2032, + "step": 1951 + }, + { + "epoch": 0.04552676767529525, + "grad_norm": 1.4560632705688477, + "learning_rate": 1.0116610520860329e-05, + "loss": 1.5599, + "step": 1952 + }, + { + "epoch": 0.04555009081447317, + "grad_norm": 2.3671185970306396, + "learning_rate": 1.0121793210676343e-05, + "loss": 1.6144, + "step": 1953 + }, + { + "epoch": 0.04557341395365109, + "grad_norm": 1.7525554895401, + "learning_rate": 1.0126975900492357e-05, + "loss": 1.913, + "step": 1954 + }, + { + "epoch": 0.04559673709282901, + "grad_norm": 1.2725483179092407, + "learning_rate": 1.0132158590308372e-05, + "loss": 1.2048, + "step": 1955 + }, + { + "epoch": 0.045620060232006925, + "grad_norm": 1.8041915893554688, + "learning_rate": 1.0137341280124386e-05, + "loss": 1.1796, + "step": 1956 + }, + { + "epoch": 0.04564338337118484, + "grad_norm": 2.3629374504089355, + "learning_rate": 1.01425239699404e-05, + "loss": 1.8434, + "step": 1957 + }, + { + "epoch": 0.04566670651036276, + "grad_norm": 1.3975788354873657, + "learning_rate": 1.0147706659756413e-05, + "loss": 1.5474, + "step": 1958 + }, + { + "epoch": 0.04569002964954068, + "grad_norm": 1.4148329496383667, + "learning_rate": 1.0152889349572427e-05, + "loss": 1.4695, + "step": 1959 + }, + { + "epoch": 0.045713352788718596, + "grad_norm": 3.3544209003448486, + "learning_rate": 1.0158072039388442e-05, + "loss": 1.6851, + "step": 1960 + }, + { + "epoch": 0.045736675927896514, + "grad_norm": 1.795784592628479, + "learning_rate": 1.0163254729204458e-05, + "loss": 1.2823, + "step": 1961 + }, + { + "epoch": 0.04575999906707443, + "grad_norm": 2.3135123252868652, + "learning_rate": 1.0168437419020472e-05, + "loss": 1.7222, + "step": 1962 + }, + { + "epoch": 0.04578332220625235, + "grad_norm": 1.62346351146698, + "learning_rate": 1.0173620108836487e-05, + "loss": 1.3822, + "step": 1963 + }, + { + "epoch": 0.04580664534543027, + "grad_norm": 1.9713786840438843, + "learning_rate": 1.0178802798652501e-05, + "loss": 1.1212, + "step": 1964 + }, + { + "epoch": 0.045829968484608186, + "grad_norm": 1.5502241849899292, + "learning_rate": 1.0183985488468515e-05, + "loss": 1.0937, + "step": 1965 + }, + { + "epoch": 0.045853291623786104, + "grad_norm": 1.893622875213623, + "learning_rate": 1.018916817828453e-05, + "loss": 1.7849, + "step": 1966 + }, + { + "epoch": 0.04587661476296402, + "grad_norm": 1.7515870332717896, + "learning_rate": 1.0194350868100544e-05, + "loss": 1.4724, + "step": 1967 + }, + { + "epoch": 0.04589993790214194, + "grad_norm": 1.7589161396026611, + "learning_rate": 1.0199533557916558e-05, + "loss": 1.4281, + "step": 1968 + }, + { + "epoch": 0.04592326104131986, + "grad_norm": 2.377809762954712, + "learning_rate": 1.0204716247732573e-05, + "loss": 1.0402, + "step": 1969 + }, + { + "epoch": 0.045946584180497775, + "grad_norm": 1.6169410943984985, + "learning_rate": 1.0209898937548589e-05, + "loss": 1.2902, + "step": 1970 + }, + { + "epoch": 0.04596990731967569, + "grad_norm": 1.7550357580184937, + "learning_rate": 1.0215081627364603e-05, + "loss": 1.274, + "step": 1971 + }, + { + "epoch": 0.04599323045885361, + "grad_norm": 1.846411943435669, + "learning_rate": 1.0220264317180617e-05, + "loss": 1.2554, + "step": 1972 + }, + { + "epoch": 0.04601655359803153, + "grad_norm": 1.880225419998169, + "learning_rate": 1.0225447006996632e-05, + "loss": 1.3451, + "step": 1973 + }, + { + "epoch": 0.04603987673720945, + "grad_norm": 1.6644784212112427, + "learning_rate": 1.0230629696812646e-05, + "loss": 1.5651, + "step": 1974 + }, + { + "epoch": 0.046063199876387365, + "grad_norm": 1.2287671566009521, + "learning_rate": 1.023581238662866e-05, + "loss": 1.2272, + "step": 1975 + }, + { + "epoch": 0.04608652301556528, + "grad_norm": 5.595534801483154, + "learning_rate": 1.0240995076444675e-05, + "loss": 1.2381, + "step": 1976 + }, + { + "epoch": 0.0461098461547432, + "grad_norm": 1.6219606399536133, + "learning_rate": 1.0246177766260689e-05, + "loss": 1.1357, + "step": 1977 + }, + { + "epoch": 0.04613316929392112, + "grad_norm": 1.7713710069656372, + "learning_rate": 1.0251360456076705e-05, + "loss": 1.3458, + "step": 1978 + }, + { + "epoch": 0.046156492433099036, + "grad_norm": 1.6285533905029297, + "learning_rate": 1.025654314589272e-05, + "loss": 1.6516, + "step": 1979 + }, + { + "epoch": 0.046179815572276954, + "grad_norm": 1.479745864868164, + "learning_rate": 1.0261725835708734e-05, + "loss": 1.2629, + "step": 1980 + }, + { + "epoch": 0.046203138711454865, + "grad_norm": 1.6205228567123413, + "learning_rate": 1.0266908525524748e-05, + "loss": 1.6772, + "step": 1981 + }, + { + "epoch": 0.04622646185063278, + "grad_norm": 1.845969319343567, + "learning_rate": 1.0272091215340763e-05, + "loss": 1.7172, + "step": 1982 + }, + { + "epoch": 0.0462497849898107, + "grad_norm": 1.71135413646698, + "learning_rate": 1.0277273905156777e-05, + "loss": 1.3776, + "step": 1983 + }, + { + "epoch": 0.04627310812898862, + "grad_norm": 1.5999668836593628, + "learning_rate": 1.0282456594972791e-05, + "loss": 1.3148, + "step": 1984 + }, + { + "epoch": 0.04629643126816654, + "grad_norm": 2.372850179672241, + "learning_rate": 1.0287639284788806e-05, + "loss": 1.5203, + "step": 1985 + }, + { + "epoch": 0.046319754407344454, + "grad_norm": 1.9471055269241333, + "learning_rate": 1.0292821974604822e-05, + "loss": 1.1852, + "step": 1986 + }, + { + "epoch": 0.04634307754652237, + "grad_norm": 2.035149574279785, + "learning_rate": 1.0298004664420836e-05, + "loss": 1.5986, + "step": 1987 + }, + { + "epoch": 0.04636640068570029, + "grad_norm": 1.9274436235427856, + "learning_rate": 1.030318735423685e-05, + "loss": 1.3578, + "step": 1988 + }, + { + "epoch": 0.04638972382487821, + "grad_norm": 1.8304780721664429, + "learning_rate": 1.0308370044052865e-05, + "loss": 1.2624, + "step": 1989 + }, + { + "epoch": 0.046413046964056126, + "grad_norm": 2.2276337146759033, + "learning_rate": 1.0313552733868879e-05, + "loss": 1.5508, + "step": 1990 + }, + { + "epoch": 0.046436370103234044, + "grad_norm": 1.7837759256362915, + "learning_rate": 1.0318735423684893e-05, + "loss": 1.4839, + "step": 1991 + }, + { + "epoch": 0.04645969324241196, + "grad_norm": 1.766287088394165, + "learning_rate": 1.0323918113500908e-05, + "loss": 1.7001, + "step": 1992 + }, + { + "epoch": 0.04648301638158988, + "grad_norm": 1.6771559715270996, + "learning_rate": 1.0329100803316922e-05, + "loss": 1.6349, + "step": 1993 + }, + { + "epoch": 0.0465063395207678, + "grad_norm": 1.7568877935409546, + "learning_rate": 1.0334283493132938e-05, + "loss": 1.4524, + "step": 1994 + }, + { + "epoch": 0.046529662659945716, + "grad_norm": 2.070405960083008, + "learning_rate": 1.0339466182948952e-05, + "loss": 1.3437, + "step": 1995 + }, + { + "epoch": 0.04655298579912363, + "grad_norm": 2.852936267852783, + "learning_rate": 1.0344648872764967e-05, + "loss": 1.2623, + "step": 1996 + }, + { + "epoch": 0.04657630893830155, + "grad_norm": 1.3660649061203003, + "learning_rate": 1.0349831562580981e-05, + "loss": 1.3146, + "step": 1997 + }, + { + "epoch": 0.04659963207747947, + "grad_norm": 1.672303318977356, + "learning_rate": 1.0355014252396995e-05, + "loss": 1.3361, + "step": 1998 + }, + { + "epoch": 0.04662295521665739, + "grad_norm": 1.6566362380981445, + "learning_rate": 1.036019694221301e-05, + "loss": 1.4374, + "step": 1999 + }, + { + "epoch": 0.046646278355835305, + "grad_norm": 1.6957907676696777, + "learning_rate": 1.0365379632029024e-05, + "loss": 1.4639, + "step": 2000 + }, + { + "epoch": 0.04666960149501322, + "grad_norm": 1.7481802701950073, + "learning_rate": 1.0370562321845038e-05, + "loss": 1.3724, + "step": 2001 + }, + { + "epoch": 0.04669292463419114, + "grad_norm": 1.7186965942382812, + "learning_rate": 1.0375745011661053e-05, + "loss": 1.4501, + "step": 2002 + }, + { + "epoch": 0.04671624777336906, + "grad_norm": 2.016763925552368, + "learning_rate": 1.0380927701477069e-05, + "loss": 1.598, + "step": 2003 + }, + { + "epoch": 0.04673957091254698, + "grad_norm": 2.0749828815460205, + "learning_rate": 1.0386110391293083e-05, + "loss": 1.6236, + "step": 2004 + }, + { + "epoch": 0.046762894051724894, + "grad_norm": 1.5764251947402954, + "learning_rate": 1.0391293081109098e-05, + "loss": 1.2068, + "step": 2005 + }, + { + "epoch": 0.04678621719090281, + "grad_norm": 1.7553889751434326, + "learning_rate": 1.0396475770925112e-05, + "loss": 1.5647, + "step": 2006 + }, + { + "epoch": 0.04680954033008073, + "grad_norm": 1.603076457977295, + "learning_rate": 1.0401658460741126e-05, + "loss": 1.32, + "step": 2007 + }, + { + "epoch": 0.04683286346925865, + "grad_norm": 1.6535013914108276, + "learning_rate": 1.040684115055714e-05, + "loss": 1.5769, + "step": 2008 + }, + { + "epoch": 0.04685618660843656, + "grad_norm": 2.1688969135284424, + "learning_rate": 1.0412023840373155e-05, + "loss": 1.3436, + "step": 2009 + }, + { + "epoch": 0.04687950974761448, + "grad_norm": 2.3349978923797607, + "learning_rate": 1.041720653018917e-05, + "loss": 1.5229, + "step": 2010 + }, + { + "epoch": 0.046902832886792395, + "grad_norm": 1.5256779193878174, + "learning_rate": 1.0422389220005185e-05, + "loss": 1.5068, + "step": 2011 + }, + { + "epoch": 0.04692615602597031, + "grad_norm": 1.6567631959915161, + "learning_rate": 1.04275719098212e-05, + "loss": 1.4358, + "step": 2012 + }, + { + "epoch": 0.04694947916514823, + "grad_norm": 1.9560909271240234, + "learning_rate": 1.043275459963721e-05, + "loss": 1.4106, + "step": 2013 + }, + { + "epoch": 0.04697280230432615, + "grad_norm": 1.7204327583312988, + "learning_rate": 1.0437937289453227e-05, + "loss": 1.6955, + "step": 2014 + }, + { + "epoch": 0.046996125443504067, + "grad_norm": 1.4477440118789673, + "learning_rate": 1.0443119979269241e-05, + "loss": 1.3587, + "step": 2015 + }, + { + "epoch": 0.047019448582681984, + "grad_norm": 1.5402978658676147, + "learning_rate": 1.0448302669085255e-05, + "loss": 1.4493, + "step": 2016 + }, + { + "epoch": 0.0470427717218599, + "grad_norm": 2.143346071243286, + "learning_rate": 1.045348535890127e-05, + "loss": 1.4861, + "step": 2017 + }, + { + "epoch": 0.04706609486103782, + "grad_norm": 1.7556196451187134, + "learning_rate": 1.0458668048717284e-05, + "loss": 1.3845, + "step": 2018 + }, + { + "epoch": 0.04708941800021574, + "grad_norm": 1.7759301662445068, + "learning_rate": 1.0463850738533298e-05, + "loss": 1.7286, + "step": 2019 + }, + { + "epoch": 0.047112741139393656, + "grad_norm": 1.6534310579299927, + "learning_rate": 1.0469033428349313e-05, + "loss": 1.5995, + "step": 2020 + }, + { + "epoch": 0.047136064278571574, + "grad_norm": 1.8520358800888062, + "learning_rate": 1.0474216118165327e-05, + "loss": 1.7078, + "step": 2021 + }, + { + "epoch": 0.04715938741774949, + "grad_norm": 1.762181043624878, + "learning_rate": 1.0479398807981343e-05, + "loss": 1.4365, + "step": 2022 + }, + { + "epoch": 0.04718271055692741, + "grad_norm": 2.2282192707061768, + "learning_rate": 1.0484581497797357e-05, + "loss": 1.0928, + "step": 2023 + }, + { + "epoch": 0.04720603369610533, + "grad_norm": 1.602739691734314, + "learning_rate": 1.0489764187613372e-05, + "loss": 1.8262, + "step": 2024 + }, + { + "epoch": 0.047229356835283245, + "grad_norm": 1.9949517250061035, + "learning_rate": 1.0494946877429386e-05, + "loss": 1.4207, + "step": 2025 + }, + { + "epoch": 0.04725267997446116, + "grad_norm": 1.738860011100769, + "learning_rate": 1.05001295672454e-05, + "loss": 1.3735, + "step": 2026 + }, + { + "epoch": 0.04727600311363908, + "grad_norm": 1.5074880123138428, + "learning_rate": 1.0505312257061415e-05, + "loss": 1.5181, + "step": 2027 + }, + { + "epoch": 0.047299326252817, + "grad_norm": 2.0777339935302734, + "learning_rate": 1.051049494687743e-05, + "loss": 1.1551, + "step": 2028 + }, + { + "epoch": 0.04732264939199492, + "grad_norm": 1.905203104019165, + "learning_rate": 1.0515677636693444e-05, + "loss": 1.4136, + "step": 2029 + }, + { + "epoch": 0.047345972531172835, + "grad_norm": 1.9061765670776367, + "learning_rate": 1.052086032650946e-05, + "loss": 1.4066, + "step": 2030 + }, + { + "epoch": 0.04736929567035075, + "grad_norm": 2.3726284503936768, + "learning_rate": 1.0526043016325474e-05, + "loss": 1.5807, + "step": 2031 + }, + { + "epoch": 0.04739261880952867, + "grad_norm": 1.7928005456924438, + "learning_rate": 1.0531225706141488e-05, + "loss": 1.4065, + "step": 2032 + }, + { + "epoch": 0.04741594194870659, + "grad_norm": 2.108304977416992, + "learning_rate": 1.0536408395957503e-05, + "loss": 1.4989, + "step": 2033 + }, + { + "epoch": 0.047439265087884507, + "grad_norm": 1.770139455795288, + "learning_rate": 1.0541591085773517e-05, + "loss": 1.5682, + "step": 2034 + }, + { + "epoch": 0.047462588227062424, + "grad_norm": 1.5590190887451172, + "learning_rate": 1.0546773775589531e-05, + "loss": 1.5977, + "step": 2035 + }, + { + "epoch": 0.04748591136624034, + "grad_norm": 1.8117510080337524, + "learning_rate": 1.0551956465405546e-05, + "loss": 1.366, + "step": 2036 + }, + { + "epoch": 0.04750923450541826, + "grad_norm": 1.4459258317947388, + "learning_rate": 1.055713915522156e-05, + "loss": 1.3128, + "step": 2037 + }, + { + "epoch": 0.04753255764459617, + "grad_norm": 1.661622166633606, + "learning_rate": 1.0562321845037576e-05, + "loss": 1.195, + "step": 2038 + }, + { + "epoch": 0.04755588078377409, + "grad_norm": 3.0413358211517334, + "learning_rate": 1.056750453485359e-05, + "loss": 1.2938, + "step": 2039 + }, + { + "epoch": 0.04757920392295201, + "grad_norm": 2.0208404064178467, + "learning_rate": 1.0572687224669605e-05, + "loss": 1.4222, + "step": 2040 + }, + { + "epoch": 0.047602527062129925, + "grad_norm": 1.5804784297943115, + "learning_rate": 1.0577869914485619e-05, + "loss": 1.4815, + "step": 2041 + }, + { + "epoch": 0.04762585020130784, + "grad_norm": 1.550583839416504, + "learning_rate": 1.0583052604301633e-05, + "loss": 1.2965, + "step": 2042 + }, + { + "epoch": 0.04764917334048576, + "grad_norm": 1.808358907699585, + "learning_rate": 1.0588235294117648e-05, + "loss": 1.7293, + "step": 2043 + }, + { + "epoch": 0.04767249647966368, + "grad_norm": 1.5591018199920654, + "learning_rate": 1.0593417983933662e-05, + "loss": 1.411, + "step": 2044 + }, + { + "epoch": 0.047695819618841596, + "grad_norm": 1.9220789670944214, + "learning_rate": 1.0598600673749677e-05, + "loss": 1.6004, + "step": 2045 + }, + { + "epoch": 0.047719142758019514, + "grad_norm": 1.454370141029358, + "learning_rate": 1.0603783363565691e-05, + "loss": 1.237, + "step": 2046 + }, + { + "epoch": 0.04774246589719743, + "grad_norm": 2.064704179763794, + "learning_rate": 1.0608966053381707e-05, + "loss": 1.5761, + "step": 2047 + }, + { + "epoch": 0.04776578903637535, + "grad_norm": 2.0700747966766357, + "learning_rate": 1.0614148743197721e-05, + "loss": 1.5053, + "step": 2048 + }, + { + "epoch": 0.04778911217555327, + "grad_norm": 2.590925931930542, + "learning_rate": 1.0619331433013736e-05, + "loss": 1.6171, + "step": 2049 + }, + { + "epoch": 0.047812435314731186, + "grad_norm": 1.7378954887390137, + "learning_rate": 1.062451412282975e-05, + "loss": 1.7035, + "step": 2050 + }, + { + "epoch": 0.047835758453909104, + "grad_norm": 1.74659264087677, + "learning_rate": 1.0629696812645764e-05, + "loss": 1.4361, + "step": 2051 + }, + { + "epoch": 0.04785908159308702, + "grad_norm": 1.1299657821655273, + "learning_rate": 1.0634879502461779e-05, + "loss": 1.1613, + "step": 2052 + }, + { + "epoch": 0.04788240473226494, + "grad_norm": 1.803884506225586, + "learning_rate": 1.0640062192277793e-05, + "loss": 1.8098, + "step": 2053 + }, + { + "epoch": 0.04790572787144286, + "grad_norm": 1.7978615760803223, + "learning_rate": 1.0645244882093807e-05, + "loss": 1.3783, + "step": 2054 + }, + { + "epoch": 0.047929051010620775, + "grad_norm": 2.5186221599578857, + "learning_rate": 1.0650427571909823e-05, + "loss": 1.3875, + "step": 2055 + }, + { + "epoch": 0.04795237414979869, + "grad_norm": 2.0152792930603027, + "learning_rate": 1.0655610261725838e-05, + "loss": 1.3065, + "step": 2056 + }, + { + "epoch": 0.04797569728897661, + "grad_norm": 1.6235994100570679, + "learning_rate": 1.0660792951541852e-05, + "loss": 1.5058, + "step": 2057 + }, + { + "epoch": 0.04799902042815453, + "grad_norm": 1.4507194757461548, + "learning_rate": 1.0665975641357866e-05, + "loss": 1.0629, + "step": 2058 + }, + { + "epoch": 0.04802234356733245, + "grad_norm": 1.8985979557037354, + "learning_rate": 1.067115833117388e-05, + "loss": 1.9146, + "step": 2059 + }, + { + "epoch": 0.048045666706510365, + "grad_norm": 2.050368309020996, + "learning_rate": 1.0676341020989895e-05, + "loss": 1.4296, + "step": 2060 + }, + { + "epoch": 0.04806898984568828, + "grad_norm": 1.9865542650222778, + "learning_rate": 1.068152371080591e-05, + "loss": 1.3698, + "step": 2061 + }, + { + "epoch": 0.0480923129848662, + "grad_norm": 1.6175458431243896, + "learning_rate": 1.0686706400621924e-05, + "loss": 1.3281, + "step": 2062 + }, + { + "epoch": 0.04811563612404412, + "grad_norm": 2.048891067504883, + "learning_rate": 1.069188909043794e-05, + "loss": 1.4731, + "step": 2063 + }, + { + "epoch": 0.048138959263222036, + "grad_norm": 1.917079210281372, + "learning_rate": 1.0697071780253954e-05, + "loss": 1.3246, + "step": 2064 + }, + { + "epoch": 0.048162282402399954, + "grad_norm": 1.8361105918884277, + "learning_rate": 1.0702254470069969e-05, + "loss": 1.5121, + "step": 2065 + }, + { + "epoch": 0.048185605541577865, + "grad_norm": 2.233614921569824, + "learning_rate": 1.0707437159885983e-05, + "loss": 1.6197, + "step": 2066 + }, + { + "epoch": 0.04820892868075578, + "grad_norm": 2.2280359268188477, + "learning_rate": 1.0712619849701997e-05, + "loss": 1.8298, + "step": 2067 + }, + { + "epoch": 0.0482322518199337, + "grad_norm": 1.7553162574768066, + "learning_rate": 1.071780253951801e-05, + "loss": 1.3408, + "step": 2068 + }, + { + "epoch": 0.04825557495911162, + "grad_norm": 1.6910414695739746, + "learning_rate": 1.0722985229334024e-05, + "loss": 1.473, + "step": 2069 + }, + { + "epoch": 0.04827889809828954, + "grad_norm": 2.053236961364746, + "learning_rate": 1.0728167919150039e-05, + "loss": 1.6323, + "step": 2070 + }, + { + "epoch": 0.048302221237467455, + "grad_norm": 1.5760650634765625, + "learning_rate": 1.0733350608966053e-05, + "loss": 1.4442, + "step": 2071 + }, + { + "epoch": 0.04832554437664537, + "grad_norm": 1.9398868083953857, + "learning_rate": 1.0738533298782067e-05, + "loss": 1.342, + "step": 2072 + }, + { + "epoch": 0.04834886751582329, + "grad_norm": 1.6530135869979858, + "learning_rate": 1.0743715988598082e-05, + "loss": 1.47, + "step": 2073 + }, + { + "epoch": 0.04837219065500121, + "grad_norm": 1.8386073112487793, + "learning_rate": 1.0748898678414098e-05, + "loss": 1.5633, + "step": 2074 + }, + { + "epoch": 0.048395513794179126, + "grad_norm": 2.6090152263641357, + "learning_rate": 1.0754081368230112e-05, + "loss": 1.5872, + "step": 2075 + }, + { + "epoch": 0.048418836933357044, + "grad_norm": 1.598777174949646, + "learning_rate": 1.0759264058046126e-05, + "loss": 1.3968, + "step": 2076 + }, + { + "epoch": 0.04844216007253496, + "grad_norm": 1.8660935163497925, + "learning_rate": 1.076444674786214e-05, + "loss": 1.3865, + "step": 2077 + }, + { + "epoch": 0.04846548321171288, + "grad_norm": 1.7573018074035645, + "learning_rate": 1.0769629437678155e-05, + "loss": 1.5532, + "step": 2078 + }, + { + "epoch": 0.0484888063508908, + "grad_norm": 1.8254483938217163, + "learning_rate": 1.077481212749417e-05, + "loss": 1.6758, + "step": 2079 + }, + { + "epoch": 0.048512129490068716, + "grad_norm": 2.102221727371216, + "learning_rate": 1.0779994817310184e-05, + "loss": 1.6375, + "step": 2080 + }, + { + "epoch": 0.048535452629246634, + "grad_norm": 1.6811659336090088, + "learning_rate": 1.0785177507126198e-05, + "loss": 1.4086, + "step": 2081 + }, + { + "epoch": 0.04855877576842455, + "grad_norm": 1.7381610870361328, + "learning_rate": 1.0790360196942214e-05, + "loss": 1.4361, + "step": 2082 + }, + { + "epoch": 0.04858209890760247, + "grad_norm": 1.346579670906067, + "learning_rate": 1.0795542886758228e-05, + "loss": 1.5014, + "step": 2083 + }, + { + "epoch": 0.04860542204678039, + "grad_norm": 1.4739258289337158, + "learning_rate": 1.0800725576574243e-05, + "loss": 0.922, + "step": 2084 + }, + { + "epoch": 0.048628745185958305, + "grad_norm": 1.5859721899032593, + "learning_rate": 1.0805908266390257e-05, + "loss": 1.4007, + "step": 2085 + }, + { + "epoch": 0.04865206832513622, + "grad_norm": 2.35955548286438, + "learning_rate": 1.0811090956206272e-05, + "loss": 1.3838, + "step": 2086 + }, + { + "epoch": 0.04867539146431414, + "grad_norm": 2.1148109436035156, + "learning_rate": 1.0816273646022286e-05, + "loss": 1.3424, + "step": 2087 + }, + { + "epoch": 0.04869871460349206, + "grad_norm": 1.6170052289962769, + "learning_rate": 1.08214563358383e-05, + "loss": 1.3171, + "step": 2088 + }, + { + "epoch": 0.04872203774266998, + "grad_norm": 1.8172898292541504, + "learning_rate": 1.0826639025654315e-05, + "loss": 1.8264, + "step": 2089 + }, + { + "epoch": 0.048745360881847895, + "grad_norm": 1.5396174192428589, + "learning_rate": 1.083182171547033e-05, + "loss": 1.2572, + "step": 2090 + }, + { + "epoch": 0.04876868402102581, + "grad_norm": 1.6363898515701294, + "learning_rate": 1.0837004405286345e-05, + "loss": 1.4644, + "step": 2091 + }, + { + "epoch": 0.04879200716020373, + "grad_norm": 2.310607433319092, + "learning_rate": 1.084218709510236e-05, + "loss": 1.773, + "step": 2092 + }, + { + "epoch": 0.04881533029938165, + "grad_norm": 1.9963150024414062, + "learning_rate": 1.0847369784918374e-05, + "loss": 1.7195, + "step": 2093 + }, + { + "epoch": 0.048838653438559566, + "grad_norm": 2.051004409790039, + "learning_rate": 1.0852552474734388e-05, + "loss": 1.4198, + "step": 2094 + }, + { + "epoch": 0.04886197657773748, + "grad_norm": 1.7132155895233154, + "learning_rate": 1.0857735164550402e-05, + "loss": 1.3747, + "step": 2095 + }, + { + "epoch": 0.048885299716915395, + "grad_norm": 1.9538960456848145, + "learning_rate": 1.0862917854366417e-05, + "loss": 1.7172, + "step": 2096 + }, + { + "epoch": 0.04890862285609331, + "grad_norm": 2.2270054817199707, + "learning_rate": 1.0868100544182431e-05, + "loss": 1.5241, + "step": 2097 + }, + { + "epoch": 0.04893194599527123, + "grad_norm": 1.859562873840332, + "learning_rate": 1.0873283233998445e-05, + "loss": 1.5629, + "step": 2098 + }, + { + "epoch": 0.04895526913444915, + "grad_norm": 1.9491593837738037, + "learning_rate": 1.0878465923814461e-05, + "loss": 1.4231, + "step": 2099 + }, + { + "epoch": 0.04897859227362707, + "grad_norm": 1.566665768623352, + "learning_rate": 1.0883648613630476e-05, + "loss": 1.6466, + "step": 2100 + }, + { + "epoch": 0.049001915412804985, + "grad_norm": 1.6445392370224, + "learning_rate": 1.088883130344649e-05, + "loss": 1.3313, + "step": 2101 + }, + { + "epoch": 0.0490252385519829, + "grad_norm": 2.30572772026062, + "learning_rate": 1.0894013993262504e-05, + "loss": 1.9519, + "step": 2102 + }, + { + "epoch": 0.04904856169116082, + "grad_norm": 1.9258235692977905, + "learning_rate": 1.0899196683078519e-05, + "loss": 1.6955, + "step": 2103 + }, + { + "epoch": 0.04907188483033874, + "grad_norm": 2.236351490020752, + "learning_rate": 1.0904379372894533e-05, + "loss": 1.576, + "step": 2104 + }, + { + "epoch": 0.049095207969516656, + "grad_norm": 1.592620849609375, + "learning_rate": 1.0909562062710547e-05, + "loss": 1.4411, + "step": 2105 + }, + { + "epoch": 0.049118531108694574, + "grad_norm": 1.689764380455017, + "learning_rate": 1.0914744752526562e-05, + "loss": 1.554, + "step": 2106 + }, + { + "epoch": 0.04914185424787249, + "grad_norm": 1.9864122867584229, + "learning_rate": 1.0919927442342578e-05, + "loss": 1.5606, + "step": 2107 + }, + { + "epoch": 0.04916517738705041, + "grad_norm": 1.8440738916397095, + "learning_rate": 1.0925110132158592e-05, + "loss": 1.3705, + "step": 2108 + }, + { + "epoch": 0.04918850052622833, + "grad_norm": 1.4290404319763184, + "learning_rate": 1.0930292821974607e-05, + "loss": 1.3095, + "step": 2109 + }, + { + "epoch": 0.049211823665406246, + "grad_norm": 2.0556204319000244, + "learning_rate": 1.0935475511790621e-05, + "loss": 1.2491, + "step": 2110 + }, + { + "epoch": 0.049235146804584164, + "grad_norm": 1.4986300468444824, + "learning_rate": 1.0940658201606635e-05, + "loss": 1.48, + "step": 2111 + }, + { + "epoch": 0.04925846994376208, + "grad_norm": 1.9182485342025757, + "learning_rate": 1.094584089142265e-05, + "loss": 1.7362, + "step": 2112 + }, + { + "epoch": 0.04928179308294, + "grad_norm": 1.789497971534729, + "learning_rate": 1.0951023581238664e-05, + "loss": 1.5304, + "step": 2113 + }, + { + "epoch": 0.04930511622211792, + "grad_norm": 1.468636155128479, + "learning_rate": 1.0956206271054678e-05, + "loss": 1.1878, + "step": 2114 + }, + { + "epoch": 0.049328439361295835, + "grad_norm": 1.8166897296905518, + "learning_rate": 1.0961388960870694e-05, + "loss": 1.4757, + "step": 2115 + }, + { + "epoch": 0.04935176250047375, + "grad_norm": 1.592802882194519, + "learning_rate": 1.0966571650686709e-05, + "loss": 1.0244, + "step": 2116 + }, + { + "epoch": 0.04937508563965167, + "grad_norm": 1.8792015314102173, + "learning_rate": 1.0971754340502723e-05, + "loss": 1.5129, + "step": 2117 + }, + { + "epoch": 0.04939840877882959, + "grad_norm": 1.6895866394042969, + "learning_rate": 1.0976937030318737e-05, + "loss": 1.1866, + "step": 2118 + }, + { + "epoch": 0.04942173191800751, + "grad_norm": 2.1797773838043213, + "learning_rate": 1.0982119720134752e-05, + "loss": 1.7618, + "step": 2119 + }, + { + "epoch": 0.049445055057185425, + "grad_norm": 1.7822822332382202, + "learning_rate": 1.0987302409950766e-05, + "loss": 1.1358, + "step": 2120 + }, + { + "epoch": 0.04946837819636334, + "grad_norm": 2.3683722019195557, + "learning_rate": 1.099248509976678e-05, + "loss": 1.5685, + "step": 2121 + }, + { + "epoch": 0.04949170133554126, + "grad_norm": 1.9489614963531494, + "learning_rate": 1.0997667789582795e-05, + "loss": 1.7013, + "step": 2122 + }, + { + "epoch": 0.04951502447471917, + "grad_norm": 2.0891358852386475, + "learning_rate": 1.1002850479398807e-05, + "loss": 1.4574, + "step": 2123 + }, + { + "epoch": 0.04953834761389709, + "grad_norm": 2.1940853595733643, + "learning_rate": 1.1008033169214822e-05, + "loss": 1.0502, + "step": 2124 + }, + { + "epoch": 0.04956167075307501, + "grad_norm": 1.7693442106246948, + "learning_rate": 1.1013215859030836e-05, + "loss": 1.3404, + "step": 2125 + }, + { + "epoch": 0.049584993892252925, + "grad_norm": 1.9649901390075684, + "learning_rate": 1.1018398548846852e-05, + "loss": 1.6006, + "step": 2126 + }, + { + "epoch": 0.04960831703143084, + "grad_norm": 1.1453593969345093, + "learning_rate": 1.1023581238662867e-05, + "loss": 1.3859, + "step": 2127 + }, + { + "epoch": 0.04963164017060876, + "grad_norm": 1.6021969318389893, + "learning_rate": 1.1028763928478881e-05, + "loss": 1.2068, + "step": 2128 + }, + { + "epoch": 0.04965496330978668, + "grad_norm": 1.8170448541641235, + "learning_rate": 1.1033946618294895e-05, + "loss": 1.7754, + "step": 2129 + }, + { + "epoch": 0.0496782864489646, + "grad_norm": 1.5090433359146118, + "learning_rate": 1.103912930811091e-05, + "loss": 1.2575, + "step": 2130 + }, + { + "epoch": 0.049701609588142515, + "grad_norm": 2.0015881061553955, + "learning_rate": 1.1044311997926924e-05, + "loss": 1.4633, + "step": 2131 + }, + { + "epoch": 0.04972493272732043, + "grad_norm": 1.5064808130264282, + "learning_rate": 1.1049494687742938e-05, + "loss": 1.1692, + "step": 2132 + }, + { + "epoch": 0.04974825586649835, + "grad_norm": 1.6928976774215698, + "learning_rate": 1.1054677377558953e-05, + "loss": 1.3624, + "step": 2133 + }, + { + "epoch": 0.04977157900567627, + "grad_norm": 1.6377867460250854, + "learning_rate": 1.1059860067374969e-05, + "loss": 1.6004, + "step": 2134 + }, + { + "epoch": 0.049794902144854186, + "grad_norm": 2.1778640747070312, + "learning_rate": 1.1065042757190983e-05, + "loss": 1.7913, + "step": 2135 + }, + { + "epoch": 0.049818225284032104, + "grad_norm": 1.695723295211792, + "learning_rate": 1.1070225447006997e-05, + "loss": 1.2567, + "step": 2136 + }, + { + "epoch": 0.04984154842321002, + "grad_norm": 1.6048933267593384, + "learning_rate": 1.1075408136823012e-05, + "loss": 1.5821, + "step": 2137 + }, + { + "epoch": 0.04986487156238794, + "grad_norm": 1.6514248847961426, + "learning_rate": 1.1080590826639026e-05, + "loss": 1.2986, + "step": 2138 + }, + { + "epoch": 0.04988819470156586, + "grad_norm": 3.6867258548736572, + "learning_rate": 1.108577351645504e-05, + "loss": 1.2123, + "step": 2139 + }, + { + "epoch": 0.049911517840743776, + "grad_norm": 2.162198543548584, + "learning_rate": 1.1090956206271055e-05, + "loss": 1.41, + "step": 2140 + }, + { + "epoch": 0.049934840979921694, + "grad_norm": 1.446838617324829, + "learning_rate": 1.1096138896087069e-05, + "loss": 1.4192, + "step": 2141 + }, + { + "epoch": 0.04995816411909961, + "grad_norm": 1.6596550941467285, + "learning_rate": 1.1101321585903085e-05, + "loss": 1.4474, + "step": 2142 + }, + { + "epoch": 0.04998148725827753, + "grad_norm": 1.4073383808135986, + "learning_rate": 1.11065042757191e-05, + "loss": 1.1819, + "step": 2143 + }, + { + "epoch": 0.05000481039745545, + "grad_norm": 1.4993380308151245, + "learning_rate": 1.1111686965535114e-05, + "loss": 1.5477, + "step": 2144 + }, + { + "epoch": 0.050028133536633365, + "grad_norm": 2.566545248031616, + "learning_rate": 1.1116869655351128e-05, + "loss": 0.9798, + "step": 2145 + }, + { + "epoch": 0.05005145667581128, + "grad_norm": 1.9660520553588867, + "learning_rate": 1.1122052345167142e-05, + "loss": 1.4595, + "step": 2146 + }, + { + "epoch": 0.0500747798149892, + "grad_norm": 1.6653189659118652, + "learning_rate": 1.1127235034983157e-05, + "loss": 1.3836, + "step": 2147 + }, + { + "epoch": 0.05009810295416712, + "grad_norm": 1.651076078414917, + "learning_rate": 1.1132417724799171e-05, + "loss": 1.2737, + "step": 2148 + }, + { + "epoch": 0.05012142609334504, + "grad_norm": 1.9957411289215088, + "learning_rate": 1.1137600414615186e-05, + "loss": 1.236, + "step": 2149 + }, + { + "epoch": 0.050144749232522955, + "grad_norm": 1.9232287406921387, + "learning_rate": 1.11427831044312e-05, + "loss": 1.3815, + "step": 2150 + }, + { + "epoch": 0.05016807237170087, + "grad_norm": 2.0008256435394287, + "learning_rate": 1.1147965794247216e-05, + "loss": 1.5746, + "step": 2151 + }, + { + "epoch": 0.050191395510878783, + "grad_norm": 1.3993918895721436, + "learning_rate": 1.115314848406323e-05, + "loss": 1.4401, + "step": 2152 + }, + { + "epoch": 0.0502147186500567, + "grad_norm": 1.95811927318573, + "learning_rate": 1.1158331173879245e-05, + "loss": 1.5971, + "step": 2153 + }, + { + "epoch": 0.05023804178923462, + "grad_norm": 1.6759461164474487, + "learning_rate": 1.1163513863695259e-05, + "loss": 1.5051, + "step": 2154 + }, + { + "epoch": 0.05026136492841254, + "grad_norm": 1.6531785726547241, + "learning_rate": 1.1168696553511273e-05, + "loss": 1.6752, + "step": 2155 + }, + { + "epoch": 0.050284688067590455, + "grad_norm": 1.9572185277938843, + "learning_rate": 1.1173879243327288e-05, + "loss": 1.5851, + "step": 2156 + }, + { + "epoch": 0.05030801120676837, + "grad_norm": 1.6343263387680054, + "learning_rate": 1.1179061933143302e-05, + "loss": 1.3371, + "step": 2157 + }, + { + "epoch": 0.05033133434594629, + "grad_norm": 1.6465532779693604, + "learning_rate": 1.1184244622959316e-05, + "loss": 1.2993, + "step": 2158 + }, + { + "epoch": 0.05035465748512421, + "grad_norm": 2.2306525707244873, + "learning_rate": 1.1189427312775332e-05, + "loss": 1.3089, + "step": 2159 + }, + { + "epoch": 0.05037798062430213, + "grad_norm": 1.6966320276260376, + "learning_rate": 1.1194610002591347e-05, + "loss": 1.2082, + "step": 2160 + }, + { + "epoch": 0.050401303763480045, + "grad_norm": 1.576805830001831, + "learning_rate": 1.1199792692407361e-05, + "loss": 1.6838, + "step": 2161 + }, + { + "epoch": 0.05042462690265796, + "grad_norm": 1.8179023265838623, + "learning_rate": 1.1204975382223375e-05, + "loss": 1.2606, + "step": 2162 + }, + { + "epoch": 0.05044795004183588, + "grad_norm": 1.7981466054916382, + "learning_rate": 1.121015807203939e-05, + "loss": 1.2698, + "step": 2163 + }, + { + "epoch": 0.0504712731810138, + "grad_norm": 2.0942420959472656, + "learning_rate": 1.1215340761855404e-05, + "loss": 1.4324, + "step": 2164 + }, + { + "epoch": 0.050494596320191716, + "grad_norm": 3.04243803024292, + "learning_rate": 1.1220523451671418e-05, + "loss": 1.725, + "step": 2165 + }, + { + "epoch": 0.050517919459369634, + "grad_norm": 1.5606476068496704, + "learning_rate": 1.1225706141487433e-05, + "loss": 1.2715, + "step": 2166 + }, + { + "epoch": 0.05054124259854755, + "grad_norm": 1.8899224996566772, + "learning_rate": 1.1230888831303449e-05, + "loss": 1.5831, + "step": 2167 + }, + { + "epoch": 0.05056456573772547, + "grad_norm": 2.1356899738311768, + "learning_rate": 1.1236071521119463e-05, + "loss": 1.3111, + "step": 2168 + }, + { + "epoch": 0.05058788887690339, + "grad_norm": 2.962841033935547, + "learning_rate": 1.1241254210935478e-05, + "loss": 1.2757, + "step": 2169 + }, + { + "epoch": 0.050611212016081306, + "grad_norm": 1.5891367197036743, + "learning_rate": 1.1246436900751492e-05, + "loss": 1.5705, + "step": 2170 + }, + { + "epoch": 0.050634535155259223, + "grad_norm": 1.9614683389663696, + "learning_rate": 1.1251619590567506e-05, + "loss": 1.273, + "step": 2171 + }, + { + "epoch": 0.05065785829443714, + "grad_norm": 1.9047058820724487, + "learning_rate": 1.125680228038352e-05, + "loss": 1.2976, + "step": 2172 + }, + { + "epoch": 0.05068118143361506, + "grad_norm": 1.6429585218429565, + "learning_rate": 1.1261984970199535e-05, + "loss": 1.1797, + "step": 2173 + }, + { + "epoch": 0.05070450457279298, + "grad_norm": 1.5334621667861938, + "learning_rate": 1.126716766001555e-05, + "loss": 1.3446, + "step": 2174 + }, + { + "epoch": 0.050727827711970895, + "grad_norm": 2.300473690032959, + "learning_rate": 1.1272350349831565e-05, + "loss": 1.471, + "step": 2175 + }, + { + "epoch": 0.05075115085114881, + "grad_norm": 2.3872854709625244, + "learning_rate": 1.127753303964758e-05, + "loss": 1.4409, + "step": 2176 + }, + { + "epoch": 0.05077447399032673, + "grad_norm": 1.5523571968078613, + "learning_rate": 1.1282715729463594e-05, + "loss": 1.2701, + "step": 2177 + }, + { + "epoch": 0.05079779712950465, + "grad_norm": 2.0226919651031494, + "learning_rate": 1.1287898419279607e-05, + "loss": 1.702, + "step": 2178 + }, + { + "epoch": 0.05082112026868257, + "grad_norm": 2.056716203689575, + "learning_rate": 1.1293081109095621e-05, + "loss": 1.3019, + "step": 2179 + }, + { + "epoch": 0.05084444340786048, + "grad_norm": 3.767144203186035, + "learning_rate": 1.1298263798911635e-05, + "loss": 1.6133, + "step": 2180 + }, + { + "epoch": 0.050867766547038396, + "grad_norm": 1.7097781896591187, + "learning_rate": 1.130344648872765e-05, + "loss": 1.119, + "step": 2181 + }, + { + "epoch": 0.05089108968621631, + "grad_norm": 2.574557304382324, + "learning_rate": 1.1308629178543664e-05, + "loss": 1.4325, + "step": 2182 + }, + { + "epoch": 0.05091441282539423, + "grad_norm": 1.7624635696411133, + "learning_rate": 1.1313811868359678e-05, + "loss": 1.3827, + "step": 2183 + }, + { + "epoch": 0.05093773596457215, + "grad_norm": 1.9757330417633057, + "learning_rate": 1.1318994558175693e-05, + "loss": 1.4845, + "step": 2184 + }, + { + "epoch": 0.05096105910375007, + "grad_norm": 1.4644198417663574, + "learning_rate": 1.1324177247991707e-05, + "loss": 1.1953, + "step": 2185 + }, + { + "epoch": 0.050984382242927985, + "grad_norm": 1.6254751682281494, + "learning_rate": 1.1329359937807723e-05, + "loss": 1.4711, + "step": 2186 + }, + { + "epoch": 0.0510077053821059, + "grad_norm": 1.4148911237716675, + "learning_rate": 1.1334542627623737e-05, + "loss": 1.1752, + "step": 2187 + }, + { + "epoch": 0.05103102852128382, + "grad_norm": 1.9235097169876099, + "learning_rate": 1.1339725317439752e-05, + "loss": 1.5117, + "step": 2188 + }, + { + "epoch": 0.05105435166046174, + "grad_norm": 2.0073230266571045, + "learning_rate": 1.1344908007255766e-05, + "loss": 1.7692, + "step": 2189 + }, + { + "epoch": 0.05107767479963966, + "grad_norm": 1.6105692386627197, + "learning_rate": 1.135009069707178e-05, + "loss": 1.2348, + "step": 2190 + }, + { + "epoch": 0.051100997938817574, + "grad_norm": 1.764140009880066, + "learning_rate": 1.1355273386887795e-05, + "loss": 1.5298, + "step": 2191 + }, + { + "epoch": 0.05112432107799549, + "grad_norm": 1.5853471755981445, + "learning_rate": 1.136045607670381e-05, + "loss": 1.1606, + "step": 2192 + }, + { + "epoch": 0.05114764421717341, + "grad_norm": 2.409334659576416, + "learning_rate": 1.1365638766519824e-05, + "loss": 1.7928, + "step": 2193 + }, + { + "epoch": 0.05117096735635133, + "grad_norm": 1.611031174659729, + "learning_rate": 1.137082145633584e-05, + "loss": 1.3917, + "step": 2194 + }, + { + "epoch": 0.051194290495529246, + "grad_norm": 1.5654594898223877, + "learning_rate": 1.1376004146151854e-05, + "loss": 1.5245, + "step": 2195 + }, + { + "epoch": 0.051217613634707164, + "grad_norm": 1.8865010738372803, + "learning_rate": 1.1381186835967868e-05, + "loss": 1.7595, + "step": 2196 + }, + { + "epoch": 0.05124093677388508, + "grad_norm": 1.7687900066375732, + "learning_rate": 1.1386369525783883e-05, + "loss": 1.7168, + "step": 2197 + }, + { + "epoch": 0.051264259913063, + "grad_norm": 2.0165562629699707, + "learning_rate": 1.1391552215599897e-05, + "loss": 1.2842, + "step": 2198 + }, + { + "epoch": 0.05128758305224092, + "grad_norm": 1.9831840991973877, + "learning_rate": 1.1396734905415911e-05, + "loss": 1.7535, + "step": 2199 + }, + { + "epoch": 0.051310906191418836, + "grad_norm": 1.753906488418579, + "learning_rate": 1.1401917595231926e-05, + "loss": 1.439, + "step": 2200 + }, + { + "epoch": 0.05133422933059675, + "grad_norm": 1.7206554412841797, + "learning_rate": 1.140710028504794e-05, + "loss": 1.5502, + "step": 2201 + }, + { + "epoch": 0.05135755246977467, + "grad_norm": 1.8260259628295898, + "learning_rate": 1.1412282974863954e-05, + "loss": 1.6683, + "step": 2202 + }, + { + "epoch": 0.05138087560895259, + "grad_norm": 1.3575698137283325, + "learning_rate": 1.141746566467997e-05, + "loss": 1.3584, + "step": 2203 + }, + { + "epoch": 0.05140419874813051, + "grad_norm": 1.5947502851486206, + "learning_rate": 1.1422648354495985e-05, + "loss": 1.2867, + "step": 2204 + }, + { + "epoch": 0.051427521887308425, + "grad_norm": 2.2568745613098145, + "learning_rate": 1.1427831044311999e-05, + "loss": 1.5639, + "step": 2205 + }, + { + "epoch": 0.05145084502648634, + "grad_norm": 2.484616756439209, + "learning_rate": 1.1433013734128013e-05, + "loss": 1.4994, + "step": 2206 + }, + { + "epoch": 0.05147416816566426, + "grad_norm": 2.137057065963745, + "learning_rate": 1.1438196423944028e-05, + "loss": 1.907, + "step": 2207 + }, + { + "epoch": 0.05149749130484218, + "grad_norm": 1.7077836990356445, + "learning_rate": 1.1443379113760042e-05, + "loss": 1.5018, + "step": 2208 + }, + { + "epoch": 0.05152081444402009, + "grad_norm": 1.6885474920272827, + "learning_rate": 1.1448561803576057e-05, + "loss": 1.7111, + "step": 2209 + }, + { + "epoch": 0.05154413758319801, + "grad_norm": 1.669134497642517, + "learning_rate": 1.1453744493392071e-05, + "loss": 1.5672, + "step": 2210 + }, + { + "epoch": 0.051567460722375925, + "grad_norm": 1.5086790323257446, + "learning_rate": 1.1458927183208087e-05, + "loss": 1.522, + "step": 2211 + }, + { + "epoch": 0.05159078386155384, + "grad_norm": 1.5217182636260986, + "learning_rate": 1.1464109873024101e-05, + "loss": 1.3006, + "step": 2212 + }, + { + "epoch": 0.05161410700073176, + "grad_norm": 1.4646259546279907, + "learning_rate": 1.1469292562840116e-05, + "loss": 1.2538, + "step": 2213 + }, + { + "epoch": 0.05163743013990968, + "grad_norm": 2.1693215370178223, + "learning_rate": 1.147447525265613e-05, + "loss": 1.4208, + "step": 2214 + }, + { + "epoch": 0.0516607532790876, + "grad_norm": 1.264846682548523, + "learning_rate": 1.1479657942472144e-05, + "loss": 0.9305, + "step": 2215 + }, + { + "epoch": 0.051684076418265515, + "grad_norm": 1.9992789030075073, + "learning_rate": 1.1484840632288159e-05, + "loss": 1.3709, + "step": 2216 + }, + { + "epoch": 0.05170739955744343, + "grad_norm": 1.6985483169555664, + "learning_rate": 1.1490023322104173e-05, + "loss": 1.7966, + "step": 2217 + }, + { + "epoch": 0.05173072269662135, + "grad_norm": 1.4030596017837524, + "learning_rate": 1.1495206011920187e-05, + "loss": 1.3329, + "step": 2218 + }, + { + "epoch": 0.05175404583579927, + "grad_norm": 1.765177607536316, + "learning_rate": 1.1500388701736203e-05, + "loss": 1.3921, + "step": 2219 + }, + { + "epoch": 0.051777368974977186, + "grad_norm": 2.0211474895477295, + "learning_rate": 1.1505571391552218e-05, + "loss": 1.9322, + "step": 2220 + }, + { + "epoch": 0.051800692114155104, + "grad_norm": 2.1338424682617188, + "learning_rate": 1.1510754081368232e-05, + "loss": 1.9375, + "step": 2221 + }, + { + "epoch": 0.05182401525333302, + "grad_norm": 2.1635642051696777, + "learning_rate": 1.1515936771184246e-05, + "loss": 1.4048, + "step": 2222 + }, + { + "epoch": 0.05184733839251094, + "grad_norm": 1.5882627964019775, + "learning_rate": 1.152111946100026e-05, + "loss": 1.4047, + "step": 2223 + }, + { + "epoch": 0.05187066153168886, + "grad_norm": 1.6761504411697388, + "learning_rate": 1.1526302150816275e-05, + "loss": 1.506, + "step": 2224 + }, + { + "epoch": 0.051893984670866776, + "grad_norm": 2.401423454284668, + "learning_rate": 1.153148484063229e-05, + "loss": 1.132, + "step": 2225 + }, + { + "epoch": 0.051917307810044694, + "grad_norm": 2.2021756172180176, + "learning_rate": 1.1536667530448304e-05, + "loss": 1.8485, + "step": 2226 + }, + { + "epoch": 0.05194063094922261, + "grad_norm": 1.436631441116333, + "learning_rate": 1.154185022026432e-05, + "loss": 1.3811, + "step": 2227 + }, + { + "epoch": 0.05196395408840053, + "grad_norm": 1.6710056066513062, + "learning_rate": 1.1547032910080334e-05, + "loss": 1.5939, + "step": 2228 + }, + { + "epoch": 0.05198727722757845, + "grad_norm": 1.295320749282837, + "learning_rate": 1.1552215599896349e-05, + "loss": 1.1543, + "step": 2229 + }, + { + "epoch": 0.052010600366756365, + "grad_norm": 1.9142168760299683, + "learning_rate": 1.1557398289712363e-05, + "loss": 1.3995, + "step": 2230 + }, + { + "epoch": 0.05203392350593428, + "grad_norm": 1.6744799613952637, + "learning_rate": 1.1562580979528377e-05, + "loss": 1.547, + "step": 2231 + }, + { + "epoch": 0.0520572466451122, + "grad_norm": 2.2781121730804443, + "learning_rate": 1.1567763669344392e-05, + "loss": 1.5105, + "step": 2232 + }, + { + "epoch": 0.05208056978429012, + "grad_norm": 1.9038276672363281, + "learning_rate": 1.1572946359160404e-05, + "loss": 1.265, + "step": 2233 + }, + { + "epoch": 0.05210389292346804, + "grad_norm": 1.9407936334609985, + "learning_rate": 1.1578129048976419e-05, + "loss": 1.4646, + "step": 2234 + }, + { + "epoch": 0.052127216062645955, + "grad_norm": 2.0184576511383057, + "learning_rate": 1.1583311738792433e-05, + "loss": 1.3433, + "step": 2235 + }, + { + "epoch": 0.05215053920182387, + "grad_norm": 2.160403251647949, + "learning_rate": 1.1588494428608447e-05, + "loss": 1.3387, + "step": 2236 + }, + { + "epoch": 0.052173862341001784, + "grad_norm": 1.5148425102233887, + "learning_rate": 1.1593677118424462e-05, + "loss": 1.2889, + "step": 2237 + }, + { + "epoch": 0.0521971854801797, + "grad_norm": 2.4264883995056152, + "learning_rate": 1.1598859808240478e-05, + "loss": 1.3715, + "step": 2238 + }, + { + "epoch": 0.05222050861935762, + "grad_norm": 1.9291166067123413, + "learning_rate": 1.1604042498056492e-05, + "loss": 1.9834, + "step": 2239 + }, + { + "epoch": 0.05224383175853554, + "grad_norm": 1.8168792724609375, + "learning_rate": 1.1609225187872506e-05, + "loss": 1.4135, + "step": 2240 + }, + { + "epoch": 0.052267154897713455, + "grad_norm": 1.6432805061340332, + "learning_rate": 1.161440787768852e-05, + "loss": 1.5285, + "step": 2241 + }, + { + "epoch": 0.05229047803689137, + "grad_norm": 1.6379437446594238, + "learning_rate": 1.1619590567504535e-05, + "loss": 1.3672, + "step": 2242 + }, + { + "epoch": 0.05231380117606929, + "grad_norm": 1.8223236799240112, + "learning_rate": 1.162477325732055e-05, + "loss": 1.6033, + "step": 2243 + }, + { + "epoch": 0.05233712431524721, + "grad_norm": 1.4181009531021118, + "learning_rate": 1.1629955947136564e-05, + "loss": 1.6317, + "step": 2244 + }, + { + "epoch": 0.05236044745442513, + "grad_norm": 2.042304039001465, + "learning_rate": 1.1635138636952578e-05, + "loss": 1.4046, + "step": 2245 + }, + { + "epoch": 0.052383770593603045, + "grad_norm": 1.937261700630188, + "learning_rate": 1.1640321326768592e-05, + "loss": 1.086, + "step": 2246 + }, + { + "epoch": 0.05240709373278096, + "grad_norm": 1.686892032623291, + "learning_rate": 1.1645504016584608e-05, + "loss": 1.6014, + "step": 2247 + }, + { + "epoch": 0.05243041687195888, + "grad_norm": 2.37595272064209, + "learning_rate": 1.1650686706400623e-05, + "loss": 1.464, + "step": 2248 + }, + { + "epoch": 0.0524537400111368, + "grad_norm": 1.8354530334472656, + "learning_rate": 1.1655869396216637e-05, + "loss": 1.3907, + "step": 2249 + }, + { + "epoch": 0.052477063150314716, + "grad_norm": 1.997286319732666, + "learning_rate": 1.1661052086032652e-05, + "loss": 1.1932, + "step": 2250 + }, + { + "epoch": 0.052500386289492634, + "grad_norm": 1.6411911249160767, + "learning_rate": 1.1666234775848666e-05, + "loss": 1.5849, + "step": 2251 + }, + { + "epoch": 0.05252370942867055, + "grad_norm": 1.7450475692749023, + "learning_rate": 1.167141746566468e-05, + "loss": 1.5098, + "step": 2252 + }, + { + "epoch": 0.05254703256784847, + "grad_norm": 2.1054482460021973, + "learning_rate": 1.1676600155480695e-05, + "loss": 1.7326, + "step": 2253 + }, + { + "epoch": 0.05257035570702639, + "grad_norm": 2.1427578926086426, + "learning_rate": 1.1681782845296709e-05, + "loss": 1.2461, + "step": 2254 + }, + { + "epoch": 0.052593678846204306, + "grad_norm": 1.820042371749878, + "learning_rate": 1.1686965535112725e-05, + "loss": 1.8495, + "step": 2255 + }, + { + "epoch": 0.052617001985382224, + "grad_norm": 1.908542275428772, + "learning_rate": 1.169214822492874e-05, + "loss": 1.2177, + "step": 2256 + }, + { + "epoch": 0.05264032512456014, + "grad_norm": 2.222275495529175, + "learning_rate": 1.1697330914744754e-05, + "loss": 1.8614, + "step": 2257 + }, + { + "epoch": 0.05266364826373806, + "grad_norm": 1.8124415874481201, + "learning_rate": 1.1702513604560768e-05, + "loss": 1.162, + "step": 2258 + }, + { + "epoch": 0.05268697140291598, + "grad_norm": 1.7822272777557373, + "learning_rate": 1.1707696294376782e-05, + "loss": 1.6498, + "step": 2259 + }, + { + "epoch": 0.052710294542093895, + "grad_norm": 2.066291570663452, + "learning_rate": 1.1712878984192797e-05, + "loss": 1.5203, + "step": 2260 + }, + { + "epoch": 0.05273361768127181, + "grad_norm": 1.8691362142562866, + "learning_rate": 1.1718061674008811e-05, + "loss": 1.6722, + "step": 2261 + }, + { + "epoch": 0.05275694082044973, + "grad_norm": 1.8264986276626587, + "learning_rate": 1.1723244363824825e-05, + "loss": 1.787, + "step": 2262 + }, + { + "epoch": 0.05278026395962765, + "grad_norm": 1.7741049528121948, + "learning_rate": 1.1728427053640841e-05, + "loss": 1.5503, + "step": 2263 + }, + { + "epoch": 0.05280358709880557, + "grad_norm": 1.8473697900772095, + "learning_rate": 1.1733609743456856e-05, + "loss": 1.337, + "step": 2264 + }, + { + "epoch": 0.052826910237983485, + "grad_norm": 2.0709192752838135, + "learning_rate": 1.173879243327287e-05, + "loss": 1.6331, + "step": 2265 + }, + { + "epoch": 0.052850233377161396, + "grad_norm": 2.1398303508758545, + "learning_rate": 1.1743975123088884e-05, + "loss": 0.9651, + "step": 2266 + }, + { + "epoch": 0.052873556516339314, + "grad_norm": 1.9882025718688965, + "learning_rate": 1.1749157812904899e-05, + "loss": 1.8047, + "step": 2267 + }, + { + "epoch": 0.05289687965551723, + "grad_norm": 1.8728996515274048, + "learning_rate": 1.1754340502720913e-05, + "loss": 1.4712, + "step": 2268 + }, + { + "epoch": 0.05292020279469515, + "grad_norm": 1.6529200077056885, + "learning_rate": 1.1759523192536927e-05, + "loss": 1.5364, + "step": 2269 + }, + { + "epoch": 0.05294352593387307, + "grad_norm": 2.0231099128723145, + "learning_rate": 1.1764705882352942e-05, + "loss": 1.46, + "step": 2270 + }, + { + "epoch": 0.052966849073050985, + "grad_norm": 1.8899494409561157, + "learning_rate": 1.1769888572168958e-05, + "loss": 1.6656, + "step": 2271 + }, + { + "epoch": 0.0529901722122289, + "grad_norm": 2.0990450382232666, + "learning_rate": 1.1775071261984972e-05, + "loss": 1.4712, + "step": 2272 + }, + { + "epoch": 0.05301349535140682, + "grad_norm": 2.2885234355926514, + "learning_rate": 1.1780253951800987e-05, + "loss": 1.3969, + "step": 2273 + }, + { + "epoch": 0.05303681849058474, + "grad_norm": 2.202568531036377, + "learning_rate": 1.1785436641617001e-05, + "loss": 1.0861, + "step": 2274 + }, + { + "epoch": 0.05306014162976266, + "grad_norm": 3.381636381149292, + "learning_rate": 1.1790619331433015e-05, + "loss": 1.2669, + "step": 2275 + }, + { + "epoch": 0.053083464768940575, + "grad_norm": 1.6885939836502075, + "learning_rate": 1.179580202124903e-05, + "loss": 1.6913, + "step": 2276 + }, + { + "epoch": 0.05310678790811849, + "grad_norm": 1.9537405967712402, + "learning_rate": 1.1800984711065044e-05, + "loss": 1.0473, + "step": 2277 + }, + { + "epoch": 0.05313011104729641, + "grad_norm": 1.9460904598236084, + "learning_rate": 1.1806167400881058e-05, + "loss": 1.131, + "step": 2278 + }, + { + "epoch": 0.05315343418647433, + "grad_norm": 2.025679111480713, + "learning_rate": 1.1811350090697073e-05, + "loss": 1.1503, + "step": 2279 + }, + { + "epoch": 0.053176757325652246, + "grad_norm": 1.8040107488632202, + "learning_rate": 1.1816532780513089e-05, + "loss": 1.6038, + "step": 2280 + }, + { + "epoch": 0.053200080464830164, + "grad_norm": 2.133852958679199, + "learning_rate": 1.1821715470329103e-05, + "loss": 1.6102, + "step": 2281 + }, + { + "epoch": 0.05322340360400808, + "grad_norm": 1.8510808944702148, + "learning_rate": 1.1826898160145117e-05, + "loss": 1.5358, + "step": 2282 + }, + { + "epoch": 0.053246726743186, + "grad_norm": 1.9468835592269897, + "learning_rate": 1.1832080849961132e-05, + "loss": 1.4974, + "step": 2283 + }, + { + "epoch": 0.05327004988236392, + "grad_norm": 1.580652117729187, + "learning_rate": 1.1837263539777146e-05, + "loss": 1.6189, + "step": 2284 + }, + { + "epoch": 0.053293373021541836, + "grad_norm": 1.6107277870178223, + "learning_rate": 1.184244622959316e-05, + "loss": 1.3356, + "step": 2285 + }, + { + "epoch": 0.053316696160719754, + "grad_norm": 1.7563567161560059, + "learning_rate": 1.1847628919409175e-05, + "loss": 1.4425, + "step": 2286 + }, + { + "epoch": 0.05334001929989767, + "grad_norm": 1.6008427143096924, + "learning_rate": 1.1852811609225189e-05, + "loss": 1.5226, + "step": 2287 + }, + { + "epoch": 0.05336334243907559, + "grad_norm": 1.8432244062423706, + "learning_rate": 1.1857994299041202e-05, + "loss": 1.4107, + "step": 2288 + }, + { + "epoch": 0.05338666557825351, + "grad_norm": 1.6150907278060913, + "learning_rate": 1.1863176988857216e-05, + "loss": 1.4161, + "step": 2289 + }, + { + "epoch": 0.053409988717431425, + "grad_norm": 1.8245857954025269, + "learning_rate": 1.1868359678673232e-05, + "loss": 1.417, + "step": 2290 + }, + { + "epoch": 0.05343331185660934, + "grad_norm": 1.7012311220169067, + "learning_rate": 1.1873542368489247e-05, + "loss": 1.2383, + "step": 2291 + }, + { + "epoch": 0.05345663499578726, + "grad_norm": 1.9361399412155151, + "learning_rate": 1.1878725058305261e-05, + "loss": 1.5359, + "step": 2292 + }, + { + "epoch": 0.05347995813496518, + "grad_norm": 1.9297102689743042, + "learning_rate": 1.1883907748121275e-05, + "loss": 1.3487, + "step": 2293 + }, + { + "epoch": 0.05350328127414309, + "grad_norm": 2.098510503768921, + "learning_rate": 1.188909043793729e-05, + "loss": 0.9242, + "step": 2294 + }, + { + "epoch": 0.05352660441332101, + "grad_norm": 1.745192050933838, + "learning_rate": 1.1894273127753304e-05, + "loss": 1.4062, + "step": 2295 + }, + { + "epoch": 0.053549927552498926, + "grad_norm": 2.441746473312378, + "learning_rate": 1.1899455817569318e-05, + "loss": 1.7816, + "step": 2296 + }, + { + "epoch": 0.053573250691676844, + "grad_norm": 2.045830488204956, + "learning_rate": 1.1904638507385333e-05, + "loss": 1.7797, + "step": 2297 + }, + { + "epoch": 0.05359657383085476, + "grad_norm": 1.6408205032348633, + "learning_rate": 1.1909821197201347e-05, + "loss": 1.2563, + "step": 2298 + }, + { + "epoch": 0.05361989697003268, + "grad_norm": 2.0717265605926514, + "learning_rate": 1.1915003887017363e-05, + "loss": 1.5343, + "step": 2299 + }, + { + "epoch": 0.0536432201092106, + "grad_norm": 1.457613468170166, + "learning_rate": 1.1920186576833377e-05, + "loss": 1.7042, + "step": 2300 + }, + { + "epoch": 0.053666543248388515, + "grad_norm": 2.1533775329589844, + "learning_rate": 1.1925369266649392e-05, + "loss": 1.3517, + "step": 2301 + }, + { + "epoch": 0.05368986638756643, + "grad_norm": 1.5491520166397095, + "learning_rate": 1.1930551956465406e-05, + "loss": 1.4162, + "step": 2302 + }, + { + "epoch": 0.05371318952674435, + "grad_norm": 1.7081563472747803, + "learning_rate": 1.193573464628142e-05, + "loss": 1.6408, + "step": 2303 + }, + { + "epoch": 0.05373651266592227, + "grad_norm": 1.8476041555404663, + "learning_rate": 1.1940917336097435e-05, + "loss": 1.2826, + "step": 2304 + }, + { + "epoch": 0.05375983580510019, + "grad_norm": 2.2153637409210205, + "learning_rate": 1.1946100025913449e-05, + "loss": 1.723, + "step": 2305 + }, + { + "epoch": 0.053783158944278105, + "grad_norm": 6.036977767944336, + "learning_rate": 1.1951282715729463e-05, + "loss": 1.4402, + "step": 2306 + }, + { + "epoch": 0.05380648208345602, + "grad_norm": 1.876021385192871, + "learning_rate": 1.195646540554548e-05, + "loss": 1.1415, + "step": 2307 + }, + { + "epoch": 0.05382980522263394, + "grad_norm": 2.157916307449341, + "learning_rate": 1.1961648095361494e-05, + "loss": 1.6102, + "step": 2308 + }, + { + "epoch": 0.05385312836181186, + "grad_norm": 1.8045432567596436, + "learning_rate": 1.1966830785177508e-05, + "loss": 1.5312, + "step": 2309 + }, + { + "epoch": 0.053876451500989776, + "grad_norm": 2.102548837661743, + "learning_rate": 1.1972013474993522e-05, + "loss": 1.3526, + "step": 2310 + }, + { + "epoch": 0.053899774640167694, + "grad_norm": 1.9561266899108887, + "learning_rate": 1.1977196164809537e-05, + "loss": 1.5143, + "step": 2311 + }, + { + "epoch": 0.05392309777934561, + "grad_norm": 1.5144243240356445, + "learning_rate": 1.1982378854625551e-05, + "loss": 1.5124, + "step": 2312 + }, + { + "epoch": 0.05394642091852353, + "grad_norm": 1.7776747941970825, + "learning_rate": 1.1987561544441566e-05, + "loss": 1.2017, + "step": 2313 + }, + { + "epoch": 0.05396974405770145, + "grad_norm": 1.5333037376403809, + "learning_rate": 1.199274423425758e-05, + "loss": 1.2877, + "step": 2314 + }, + { + "epoch": 0.053993067196879366, + "grad_norm": 1.7108148336410522, + "learning_rate": 1.1997926924073596e-05, + "loss": 1.5089, + "step": 2315 + }, + { + "epoch": 0.054016390336057284, + "grad_norm": 1.963119626045227, + "learning_rate": 1.200310961388961e-05, + "loss": 1.1509, + "step": 2316 + }, + { + "epoch": 0.0540397134752352, + "grad_norm": 1.6362985372543335, + "learning_rate": 1.2008292303705625e-05, + "loss": 1.4243, + "step": 2317 + }, + { + "epoch": 0.05406303661441312, + "grad_norm": 1.8509998321533203, + "learning_rate": 1.2013474993521639e-05, + "loss": 1.2237, + "step": 2318 + }, + { + "epoch": 0.05408635975359104, + "grad_norm": 1.904492735862732, + "learning_rate": 1.2018657683337653e-05, + "loss": 1.1659, + "step": 2319 + }, + { + "epoch": 0.054109682892768955, + "grad_norm": 1.8940964937210083, + "learning_rate": 1.2023840373153668e-05, + "loss": 1.6011, + "step": 2320 + }, + { + "epoch": 0.05413300603194687, + "grad_norm": 1.7062005996704102, + "learning_rate": 1.2029023062969682e-05, + "loss": 1.3888, + "step": 2321 + }, + { + "epoch": 0.05415632917112479, + "grad_norm": 2.0538320541381836, + "learning_rate": 1.2034205752785696e-05, + "loss": 1.4461, + "step": 2322 + }, + { + "epoch": 0.0541796523103027, + "grad_norm": 2.1501717567443848, + "learning_rate": 1.2039388442601712e-05, + "loss": 1.3813, + "step": 2323 + }, + { + "epoch": 0.05420297544948062, + "grad_norm": 1.9368064403533936, + "learning_rate": 1.2044571132417727e-05, + "loss": 1.466, + "step": 2324 + }, + { + "epoch": 0.05422629858865854, + "grad_norm": 1.8357282876968384, + "learning_rate": 1.2049753822233741e-05, + "loss": 1.3254, + "step": 2325 + }, + { + "epoch": 0.054249621727836456, + "grad_norm": 1.5460752248764038, + "learning_rate": 1.2054936512049755e-05, + "loss": 1.2867, + "step": 2326 + }, + { + "epoch": 0.054272944867014374, + "grad_norm": 1.5794566869735718, + "learning_rate": 1.206011920186577e-05, + "loss": 1.4563, + "step": 2327 + }, + { + "epoch": 0.05429626800619229, + "grad_norm": 1.7742433547973633, + "learning_rate": 1.2065301891681784e-05, + "loss": 1.9052, + "step": 2328 + }, + { + "epoch": 0.05431959114537021, + "grad_norm": 1.3682368993759155, + "learning_rate": 1.2070484581497798e-05, + "loss": 1.123, + "step": 2329 + }, + { + "epoch": 0.05434291428454813, + "grad_norm": 1.7420629262924194, + "learning_rate": 1.2075667271313813e-05, + "loss": 1.3116, + "step": 2330 + }, + { + "epoch": 0.054366237423726045, + "grad_norm": 1.6572750806808472, + "learning_rate": 1.2080849961129827e-05, + "loss": 1.2575, + "step": 2331 + }, + { + "epoch": 0.05438956056290396, + "grad_norm": 2.035689353942871, + "learning_rate": 1.2086032650945843e-05, + "loss": 1.5586, + "step": 2332 + }, + { + "epoch": 0.05441288370208188, + "grad_norm": 1.4352741241455078, + "learning_rate": 1.2091215340761858e-05, + "loss": 1.2754, + "step": 2333 + }, + { + "epoch": 0.0544362068412598, + "grad_norm": 1.4306999444961548, + "learning_rate": 1.2096398030577872e-05, + "loss": 1.2013, + "step": 2334 + }, + { + "epoch": 0.05445952998043772, + "grad_norm": 1.8308879137039185, + "learning_rate": 1.2101580720393886e-05, + "loss": 1.4611, + "step": 2335 + }, + { + "epoch": 0.054482853119615635, + "grad_norm": 1.5827016830444336, + "learning_rate": 1.21067634102099e-05, + "loss": 1.6502, + "step": 2336 + }, + { + "epoch": 0.05450617625879355, + "grad_norm": 2.0508148670196533, + "learning_rate": 1.2111946100025915e-05, + "loss": 1.4868, + "step": 2337 + }, + { + "epoch": 0.05452949939797147, + "grad_norm": 1.8490535020828247, + "learning_rate": 1.211712878984193e-05, + "loss": 1.4469, + "step": 2338 + }, + { + "epoch": 0.05455282253714939, + "grad_norm": 2.151078701019287, + "learning_rate": 1.2122311479657944e-05, + "loss": 1.5714, + "step": 2339 + }, + { + "epoch": 0.054576145676327306, + "grad_norm": 1.6393033266067505, + "learning_rate": 1.212749416947396e-05, + "loss": 1.5442, + "step": 2340 + }, + { + "epoch": 0.054599468815505224, + "grad_norm": 1.6875606775283813, + "learning_rate": 1.2132676859289974e-05, + "loss": 1.6064, + "step": 2341 + }, + { + "epoch": 0.05462279195468314, + "grad_norm": 1.5508856773376465, + "learning_rate": 1.2137859549105988e-05, + "loss": 1.4948, + "step": 2342 + }, + { + "epoch": 0.05464611509386106, + "grad_norm": 1.6648694276809692, + "learning_rate": 1.2143042238922001e-05, + "loss": 1.4836, + "step": 2343 + }, + { + "epoch": 0.05466943823303898, + "grad_norm": 1.7704250812530518, + "learning_rate": 1.2148224928738015e-05, + "loss": 1.5293, + "step": 2344 + }, + { + "epoch": 0.054692761372216896, + "grad_norm": 1.8191028833389282, + "learning_rate": 1.215340761855403e-05, + "loss": 1.6612, + "step": 2345 + }, + { + "epoch": 0.054716084511394814, + "grad_norm": 2.0659608840942383, + "learning_rate": 1.2158590308370044e-05, + "loss": 1.3825, + "step": 2346 + }, + { + "epoch": 0.05473940765057273, + "grad_norm": 2.3801848888397217, + "learning_rate": 1.2163772998186058e-05, + "loss": 1.6521, + "step": 2347 + }, + { + "epoch": 0.05476273078975065, + "grad_norm": 1.5148749351501465, + "learning_rate": 1.2168955688002073e-05, + "loss": 1.3766, + "step": 2348 + }, + { + "epoch": 0.05478605392892857, + "grad_norm": 2.612302780151367, + "learning_rate": 1.2174138377818087e-05, + "loss": 1.5648, + "step": 2349 + }, + { + "epoch": 0.054809377068106485, + "grad_norm": 1.318306803703308, + "learning_rate": 1.2179321067634101e-05, + "loss": 1.1209, + "step": 2350 + }, + { + "epoch": 0.054832700207284396, + "grad_norm": 1.9958271980285645, + "learning_rate": 1.2184503757450117e-05, + "loss": 1.4942, + "step": 2351 + }, + { + "epoch": 0.054856023346462314, + "grad_norm": 3.188138008117676, + "learning_rate": 1.2189686447266132e-05, + "loss": 1.4955, + "step": 2352 + }, + { + "epoch": 0.05487934648564023, + "grad_norm": 1.9520068168640137, + "learning_rate": 1.2194869137082146e-05, + "loss": 1.4372, + "step": 2353 + }, + { + "epoch": 0.05490266962481815, + "grad_norm": 1.7835674285888672, + "learning_rate": 1.220005182689816e-05, + "loss": 1.6209, + "step": 2354 + }, + { + "epoch": 0.05492599276399607, + "grad_norm": 1.5796537399291992, + "learning_rate": 1.2205234516714175e-05, + "loss": 1.5106, + "step": 2355 + }, + { + "epoch": 0.054949315903173986, + "grad_norm": 1.6579697132110596, + "learning_rate": 1.221041720653019e-05, + "loss": 1.6087, + "step": 2356 + }, + { + "epoch": 0.0549726390423519, + "grad_norm": 1.838502287864685, + "learning_rate": 1.2215599896346204e-05, + "loss": 1.5969, + "step": 2357 + }, + { + "epoch": 0.05499596218152982, + "grad_norm": 2.256510019302368, + "learning_rate": 1.2220782586162218e-05, + "loss": 1.3455, + "step": 2358 + }, + { + "epoch": 0.05501928532070774, + "grad_norm": 1.9825702905654907, + "learning_rate": 1.2225965275978234e-05, + "loss": 1.5903, + "step": 2359 + }, + { + "epoch": 0.05504260845988566, + "grad_norm": 1.8769842386245728, + "learning_rate": 1.2231147965794248e-05, + "loss": 1.3899, + "step": 2360 + }, + { + "epoch": 0.055065931599063575, + "grad_norm": 1.608070969581604, + "learning_rate": 1.2236330655610263e-05, + "loss": 1.1432, + "step": 2361 + }, + { + "epoch": 0.05508925473824149, + "grad_norm": 1.7243940830230713, + "learning_rate": 1.2241513345426277e-05, + "loss": 1.425, + "step": 2362 + }, + { + "epoch": 0.05511257787741941, + "grad_norm": 1.8134387731552124, + "learning_rate": 1.2246696035242291e-05, + "loss": 1.4092, + "step": 2363 + }, + { + "epoch": 0.05513590101659733, + "grad_norm": 1.6564340591430664, + "learning_rate": 1.2251878725058306e-05, + "loss": 1.4602, + "step": 2364 + }, + { + "epoch": 0.05515922415577525, + "grad_norm": 2.2256054878234863, + "learning_rate": 1.225706141487432e-05, + "loss": 1.6256, + "step": 2365 + }, + { + "epoch": 0.055182547294953165, + "grad_norm": 2.0529842376708984, + "learning_rate": 1.2262244104690334e-05, + "loss": 1.4628, + "step": 2366 + }, + { + "epoch": 0.05520587043413108, + "grad_norm": 1.3641233444213867, + "learning_rate": 1.226742679450635e-05, + "loss": 1.3189, + "step": 2367 + }, + { + "epoch": 0.055229193573309, + "grad_norm": 2.196507692337036, + "learning_rate": 1.2272609484322365e-05, + "loss": 1.7297, + "step": 2368 + }, + { + "epoch": 0.05525251671248692, + "grad_norm": 1.719078540802002, + "learning_rate": 1.2277792174138379e-05, + "loss": 1.6974, + "step": 2369 + }, + { + "epoch": 0.055275839851664836, + "grad_norm": 1.8548617362976074, + "learning_rate": 1.2282974863954393e-05, + "loss": 1.7735, + "step": 2370 + }, + { + "epoch": 0.055299162990842754, + "grad_norm": 2.1644322872161865, + "learning_rate": 1.2288157553770408e-05, + "loss": 1.2362, + "step": 2371 + }, + { + "epoch": 0.05532248613002067, + "grad_norm": 1.642645239830017, + "learning_rate": 1.2293340243586422e-05, + "loss": 1.3763, + "step": 2372 + }, + { + "epoch": 0.05534580926919859, + "grad_norm": 1.4598188400268555, + "learning_rate": 1.2298522933402437e-05, + "loss": 1.019, + "step": 2373 + }, + { + "epoch": 0.05536913240837651, + "grad_norm": 1.7192937135696411, + "learning_rate": 1.2303705623218451e-05, + "loss": 1.1431, + "step": 2374 + }, + { + "epoch": 0.055392455547554426, + "grad_norm": 1.5887870788574219, + "learning_rate": 1.2308888313034467e-05, + "loss": 1.3876, + "step": 2375 + }, + { + "epoch": 0.055415778686732343, + "grad_norm": 3.285121440887451, + "learning_rate": 1.2314071002850481e-05, + "loss": 1.6588, + "step": 2376 + }, + { + "epoch": 0.05543910182591026, + "grad_norm": 1.7930113077163696, + "learning_rate": 1.2319253692666496e-05, + "loss": 1.3764, + "step": 2377 + }, + { + "epoch": 0.05546242496508818, + "grad_norm": 1.4698500633239746, + "learning_rate": 1.232443638248251e-05, + "loss": 1.1216, + "step": 2378 + }, + { + "epoch": 0.0554857481042661, + "grad_norm": 2.0502030849456787, + "learning_rate": 1.2329619072298524e-05, + "loss": 1.3501, + "step": 2379 + }, + { + "epoch": 0.05550907124344401, + "grad_norm": 2.3289663791656494, + "learning_rate": 1.2334801762114539e-05, + "loss": 1.7261, + "step": 2380 + }, + { + "epoch": 0.055532394382621926, + "grad_norm": 1.5533864498138428, + "learning_rate": 1.2339984451930553e-05, + "loss": 1.1996, + "step": 2381 + }, + { + "epoch": 0.055555717521799844, + "grad_norm": 1.894768476486206, + "learning_rate": 1.2345167141746567e-05, + "loss": 1.1702, + "step": 2382 + }, + { + "epoch": 0.05557904066097776, + "grad_norm": 2.2866978645324707, + "learning_rate": 1.2350349831562582e-05, + "loss": 1.692, + "step": 2383 + }, + { + "epoch": 0.05560236380015568, + "grad_norm": 1.9464361667633057, + "learning_rate": 1.2355532521378598e-05, + "loss": 1.5588, + "step": 2384 + }, + { + "epoch": 0.0556256869393336, + "grad_norm": 1.5696362257003784, + "learning_rate": 1.2360715211194612e-05, + "loss": 1.5714, + "step": 2385 + }, + { + "epoch": 0.055649010078511515, + "grad_norm": 1.9288362264633179, + "learning_rate": 1.2365897901010626e-05, + "loss": 1.4401, + "step": 2386 + }, + { + "epoch": 0.05567233321768943, + "grad_norm": 1.8174824714660645, + "learning_rate": 1.237108059082664e-05, + "loss": 1.5862, + "step": 2387 + }, + { + "epoch": 0.05569565635686735, + "grad_norm": 1.8247565031051636, + "learning_rate": 1.2376263280642655e-05, + "loss": 1.4851, + "step": 2388 + }, + { + "epoch": 0.05571897949604527, + "grad_norm": 1.764724612236023, + "learning_rate": 1.238144597045867e-05, + "loss": 1.533, + "step": 2389 + }, + { + "epoch": 0.05574230263522319, + "grad_norm": 1.6865999698638916, + "learning_rate": 1.2386628660274684e-05, + "loss": 1.6418, + "step": 2390 + }, + { + "epoch": 0.055765625774401105, + "grad_norm": 1.3049399852752686, + "learning_rate": 1.2391811350090698e-05, + "loss": 1.2936, + "step": 2391 + }, + { + "epoch": 0.05578894891357902, + "grad_norm": 1.7230569124221802, + "learning_rate": 1.2396994039906714e-05, + "loss": 1.3043, + "step": 2392 + }, + { + "epoch": 0.05581227205275694, + "grad_norm": 1.7113614082336426, + "learning_rate": 1.2402176729722729e-05, + "loss": 1.0848, + "step": 2393 + }, + { + "epoch": 0.05583559519193486, + "grad_norm": 1.8730138540267944, + "learning_rate": 1.2407359419538743e-05, + "loss": 1.5867, + "step": 2394 + }, + { + "epoch": 0.05585891833111278, + "grad_norm": 1.5495893955230713, + "learning_rate": 1.2412542109354757e-05, + "loss": 1.1484, + "step": 2395 + }, + { + "epoch": 0.055882241470290694, + "grad_norm": 2.3786067962646484, + "learning_rate": 1.2417724799170772e-05, + "loss": 1.523, + "step": 2396 + }, + { + "epoch": 0.05590556460946861, + "grad_norm": 2.259895086288452, + "learning_rate": 1.2422907488986786e-05, + "loss": 1.7245, + "step": 2397 + }, + { + "epoch": 0.05592888774864653, + "grad_norm": 2.1496381759643555, + "learning_rate": 1.24280901788028e-05, + "loss": 1.2052, + "step": 2398 + }, + { + "epoch": 0.05595221088782445, + "grad_norm": 1.9179364442825317, + "learning_rate": 1.2433272868618813e-05, + "loss": 1.3639, + "step": 2399 + }, + { + "epoch": 0.055975534027002366, + "grad_norm": 1.6580770015716553, + "learning_rate": 1.2438455558434827e-05, + "loss": 1.3285, + "step": 2400 + }, + { + "epoch": 0.055998857166180284, + "grad_norm": 1.7400684356689453, + "learning_rate": 1.2443638248250842e-05, + "loss": 1.3427, + "step": 2401 + }, + { + "epoch": 0.0560221803053582, + "grad_norm": 1.6764858961105347, + "learning_rate": 1.2448820938066856e-05, + "loss": 1.276, + "step": 2402 + }, + { + "epoch": 0.05604550344453612, + "grad_norm": 2.2973010540008545, + "learning_rate": 1.2454003627882872e-05, + "loss": 1.4305, + "step": 2403 + }, + { + "epoch": 0.05606882658371404, + "grad_norm": 2.2047767639160156, + "learning_rate": 1.2459186317698886e-05, + "loss": 1.4465, + "step": 2404 + }, + { + "epoch": 0.056092149722891955, + "grad_norm": 2.275881052017212, + "learning_rate": 1.24643690075149e-05, + "loss": 1.337, + "step": 2405 + }, + { + "epoch": 0.05611547286206987, + "grad_norm": 1.8678287267684937, + "learning_rate": 1.2469551697330915e-05, + "loss": 1.49, + "step": 2406 + }, + { + "epoch": 0.05613879600124779, + "grad_norm": 1.6373817920684814, + "learning_rate": 1.247473438714693e-05, + "loss": 1.6221, + "step": 2407 + }, + { + "epoch": 0.0561621191404257, + "grad_norm": 1.689551591873169, + "learning_rate": 1.2479917076962944e-05, + "loss": 1.6474, + "step": 2408 + }, + { + "epoch": 0.05618544227960362, + "grad_norm": 1.5831496715545654, + "learning_rate": 1.2485099766778958e-05, + "loss": 1.5889, + "step": 2409 + }, + { + "epoch": 0.05620876541878154, + "grad_norm": 1.933913230895996, + "learning_rate": 1.2490282456594972e-05, + "loss": 1.5632, + "step": 2410 + }, + { + "epoch": 0.056232088557959456, + "grad_norm": 1.9560495615005493, + "learning_rate": 1.2495465146410988e-05, + "loss": 1.8688, + "step": 2411 + }, + { + "epoch": 0.056255411697137374, + "grad_norm": 1.767649531364441, + "learning_rate": 1.2500647836227003e-05, + "loss": 1.1946, + "step": 2412 + }, + { + "epoch": 0.05627873483631529, + "grad_norm": 1.344802975654602, + "learning_rate": 1.2505830526043017e-05, + "loss": 1.4424, + "step": 2413 + }, + { + "epoch": 0.05630205797549321, + "grad_norm": 1.9637680053710938, + "learning_rate": 1.2511013215859032e-05, + "loss": 1.3989, + "step": 2414 + }, + { + "epoch": 0.05632538111467113, + "grad_norm": 1.3612053394317627, + "learning_rate": 1.2516195905675046e-05, + "loss": 1.2406, + "step": 2415 + }, + { + "epoch": 0.056348704253849045, + "grad_norm": 1.7085059881210327, + "learning_rate": 1.252137859549106e-05, + "loss": 1.2047, + "step": 2416 + }, + { + "epoch": 0.05637202739302696, + "grad_norm": 2.3130269050598145, + "learning_rate": 1.2526561285307075e-05, + "loss": 1.441, + "step": 2417 + }, + { + "epoch": 0.05639535053220488, + "grad_norm": 1.6563998460769653, + "learning_rate": 1.2531743975123089e-05, + "loss": 1.1357, + "step": 2418 + }, + { + "epoch": 0.0564186736713828, + "grad_norm": 2.186823606491089, + "learning_rate": 1.2536926664939105e-05, + "loss": 1.541, + "step": 2419 + }, + { + "epoch": 0.05644199681056072, + "grad_norm": 1.8775302171707153, + "learning_rate": 1.254210935475512e-05, + "loss": 1.3811, + "step": 2420 + }, + { + "epoch": 0.056465319949738635, + "grad_norm": 1.6848359107971191, + "learning_rate": 1.2547292044571134e-05, + "loss": 1.2492, + "step": 2421 + }, + { + "epoch": 0.05648864308891655, + "grad_norm": 2.209897994995117, + "learning_rate": 1.2552474734387148e-05, + "loss": 1.4139, + "step": 2422 + }, + { + "epoch": 0.05651196622809447, + "grad_norm": 2.023489475250244, + "learning_rate": 1.2557657424203162e-05, + "loss": 1.7047, + "step": 2423 + }, + { + "epoch": 0.05653528936727239, + "grad_norm": 2.25374436378479, + "learning_rate": 1.2562840114019177e-05, + "loss": 1.6429, + "step": 2424 + }, + { + "epoch": 0.056558612506450306, + "grad_norm": 2.11035418510437, + "learning_rate": 1.2568022803835191e-05, + "loss": 1.7968, + "step": 2425 + }, + { + "epoch": 0.056581935645628224, + "grad_norm": 1.6524832248687744, + "learning_rate": 1.2573205493651205e-05, + "loss": 1.3035, + "step": 2426 + }, + { + "epoch": 0.05660525878480614, + "grad_norm": 1.9546128511428833, + "learning_rate": 1.2578388183467221e-05, + "loss": 1.6473, + "step": 2427 + }, + { + "epoch": 0.05662858192398406, + "grad_norm": 1.8195034265518188, + "learning_rate": 1.2583570873283236e-05, + "loss": 1.2349, + "step": 2428 + }, + { + "epoch": 0.05665190506316198, + "grad_norm": 2.0104966163635254, + "learning_rate": 1.258875356309925e-05, + "loss": 1.564, + "step": 2429 + }, + { + "epoch": 0.056675228202339896, + "grad_norm": 1.6539239883422852, + "learning_rate": 1.2593936252915264e-05, + "loss": 1.6515, + "step": 2430 + }, + { + "epoch": 0.056698551341517814, + "grad_norm": 1.92090904712677, + "learning_rate": 1.2599118942731279e-05, + "loss": 1.6177, + "step": 2431 + }, + { + "epoch": 0.05672187448069573, + "grad_norm": 1.899638056755066, + "learning_rate": 1.2604301632547293e-05, + "loss": 1.5904, + "step": 2432 + }, + { + "epoch": 0.05674519761987365, + "grad_norm": 1.7024785280227661, + "learning_rate": 1.2609484322363307e-05, + "loss": 1.7716, + "step": 2433 + }, + { + "epoch": 0.05676852075905157, + "grad_norm": 1.4970227479934692, + "learning_rate": 1.2614667012179322e-05, + "loss": 1.4859, + "step": 2434 + }, + { + "epoch": 0.056791843898229485, + "grad_norm": 1.5742204189300537, + "learning_rate": 1.2619849701995336e-05, + "loss": 1.3328, + "step": 2435 + }, + { + "epoch": 0.0568151670374074, + "grad_norm": 1.95245361328125, + "learning_rate": 1.2625032391811352e-05, + "loss": 1.1161, + "step": 2436 + }, + { + "epoch": 0.056838490176585314, + "grad_norm": 1.610113501548767, + "learning_rate": 1.2630215081627367e-05, + "loss": 1.447, + "step": 2437 + }, + { + "epoch": 0.05686181331576323, + "grad_norm": 1.517272710800171, + "learning_rate": 1.2635397771443381e-05, + "loss": 1.4569, + "step": 2438 + }, + { + "epoch": 0.05688513645494115, + "grad_norm": 1.9070580005645752, + "learning_rate": 1.2640580461259395e-05, + "loss": 1.5182, + "step": 2439 + }, + { + "epoch": 0.05690845959411907, + "grad_norm": 2.516972303390503, + "learning_rate": 1.264576315107541e-05, + "loss": 1.4891, + "step": 2440 + }, + { + "epoch": 0.056931782733296986, + "grad_norm": 1.7125346660614014, + "learning_rate": 1.2650945840891424e-05, + "loss": 1.2783, + "step": 2441 + }, + { + "epoch": 0.056955105872474904, + "grad_norm": 2.2393455505371094, + "learning_rate": 1.2656128530707438e-05, + "loss": 1.6549, + "step": 2442 + }, + { + "epoch": 0.05697842901165282, + "grad_norm": 1.6830679178237915, + "learning_rate": 1.2661311220523453e-05, + "loss": 1.6243, + "step": 2443 + }, + { + "epoch": 0.05700175215083074, + "grad_norm": 1.647987961769104, + "learning_rate": 1.2666493910339469e-05, + "loss": 0.9855, + "step": 2444 + }, + { + "epoch": 0.05702507529000866, + "grad_norm": 2.4067366123199463, + "learning_rate": 1.2671676600155483e-05, + "loss": 1.5937, + "step": 2445 + }, + { + "epoch": 0.057048398429186575, + "grad_norm": 1.5815311670303345, + "learning_rate": 1.2676859289971497e-05, + "loss": 1.4023, + "step": 2446 + }, + { + "epoch": 0.05707172156836449, + "grad_norm": 1.9707512855529785, + "learning_rate": 1.2682041979787512e-05, + "loss": 1.6465, + "step": 2447 + }, + { + "epoch": 0.05709504470754241, + "grad_norm": 1.5195176601409912, + "learning_rate": 1.2687224669603526e-05, + "loss": 1.2986, + "step": 2448 + }, + { + "epoch": 0.05711836784672033, + "grad_norm": 2.393983840942383, + "learning_rate": 1.269240735941954e-05, + "loss": 1.3601, + "step": 2449 + }, + { + "epoch": 0.05714169098589825, + "grad_norm": 1.6375079154968262, + "learning_rate": 1.2697590049235555e-05, + "loss": 1.2357, + "step": 2450 + }, + { + "epoch": 0.057165014125076165, + "grad_norm": 1.8043434619903564, + "learning_rate": 1.2702772739051569e-05, + "loss": 1.2337, + "step": 2451 + }, + { + "epoch": 0.05718833726425408, + "grad_norm": 1.841853141784668, + "learning_rate": 1.2707955428867585e-05, + "loss": 1.4255, + "step": 2452 + }, + { + "epoch": 0.057211660403432, + "grad_norm": 1.9364678859710693, + "learning_rate": 1.27131381186836e-05, + "loss": 1.7609, + "step": 2453 + }, + { + "epoch": 0.05723498354260992, + "grad_norm": 2.492034912109375, + "learning_rate": 1.271832080849961e-05, + "loss": 1.397, + "step": 2454 + }, + { + "epoch": 0.057258306681787836, + "grad_norm": 2.044177532196045, + "learning_rate": 1.2723503498315627e-05, + "loss": 1.1781, + "step": 2455 + }, + { + "epoch": 0.057281629820965754, + "grad_norm": 2.0078577995300293, + "learning_rate": 1.2728686188131641e-05, + "loss": 1.3808, + "step": 2456 + }, + { + "epoch": 0.05730495296014367, + "grad_norm": 2.386645793914795, + "learning_rate": 1.2733868877947655e-05, + "loss": 1.634, + "step": 2457 + }, + { + "epoch": 0.05732827609932159, + "grad_norm": 1.5789023637771606, + "learning_rate": 1.273905156776367e-05, + "loss": 1.0976, + "step": 2458 + }, + { + "epoch": 0.05735159923849951, + "grad_norm": 1.582072138786316, + "learning_rate": 1.2744234257579684e-05, + "loss": 1.5047, + "step": 2459 + }, + { + "epoch": 0.057374922377677426, + "grad_norm": 1.7116421461105347, + "learning_rate": 1.2749416947395698e-05, + "loss": 1.4517, + "step": 2460 + }, + { + "epoch": 0.057398245516855344, + "grad_norm": 1.992920994758606, + "learning_rate": 1.2754599637211713e-05, + "loss": 1.167, + "step": 2461 + }, + { + "epoch": 0.05742156865603326, + "grad_norm": 1.7484151124954224, + "learning_rate": 1.2759782327027727e-05, + "loss": 1.5353, + "step": 2462 + }, + { + "epoch": 0.05744489179521118, + "grad_norm": 1.54849374294281, + "learning_rate": 1.2764965016843743e-05, + "loss": 1.6425, + "step": 2463 + }, + { + "epoch": 0.0574682149343891, + "grad_norm": 1.6086442470550537, + "learning_rate": 1.2770147706659757e-05, + "loss": 1.4122, + "step": 2464 + }, + { + "epoch": 0.05749153807356701, + "grad_norm": 1.4398033618927002, + "learning_rate": 1.2775330396475772e-05, + "loss": 1.2049, + "step": 2465 + }, + { + "epoch": 0.057514861212744926, + "grad_norm": 1.8544633388519287, + "learning_rate": 1.2780513086291786e-05, + "loss": 1.4098, + "step": 2466 + }, + { + "epoch": 0.057538184351922844, + "grad_norm": 2.2655575275421143, + "learning_rate": 1.27856957761078e-05, + "loss": 1.8327, + "step": 2467 + }, + { + "epoch": 0.05756150749110076, + "grad_norm": 1.9104077816009521, + "learning_rate": 1.2790878465923815e-05, + "loss": 1.6752, + "step": 2468 + }, + { + "epoch": 0.05758483063027868, + "grad_norm": 1.6727238893508911, + "learning_rate": 1.2796061155739829e-05, + "loss": 1.4592, + "step": 2469 + }, + { + "epoch": 0.0576081537694566, + "grad_norm": 1.9189153909683228, + "learning_rate": 1.2801243845555843e-05, + "loss": 1.4464, + "step": 2470 + }, + { + "epoch": 0.057631476908634516, + "grad_norm": 2.133301258087158, + "learning_rate": 1.280642653537186e-05, + "loss": 1.3992, + "step": 2471 + }, + { + "epoch": 0.057654800047812434, + "grad_norm": 2.135209560394287, + "learning_rate": 1.2811609225187874e-05, + "loss": 1.5513, + "step": 2472 + }, + { + "epoch": 0.05767812318699035, + "grad_norm": 1.6449888944625854, + "learning_rate": 1.2816791915003888e-05, + "loss": 1.6559, + "step": 2473 + }, + { + "epoch": 0.05770144632616827, + "grad_norm": 1.9080917835235596, + "learning_rate": 1.2821974604819902e-05, + "loss": 1.4798, + "step": 2474 + }, + { + "epoch": 0.05772476946534619, + "grad_norm": 1.9846621751785278, + "learning_rate": 1.2827157294635917e-05, + "loss": 1.6511, + "step": 2475 + }, + { + "epoch": 0.057748092604524105, + "grad_norm": 1.755775809288025, + "learning_rate": 1.2832339984451931e-05, + "loss": 1.5076, + "step": 2476 + }, + { + "epoch": 0.05777141574370202, + "grad_norm": 2.24576735496521, + "learning_rate": 1.2837522674267946e-05, + "loss": 1.173, + "step": 2477 + }, + { + "epoch": 0.05779473888287994, + "grad_norm": 2.244088649749756, + "learning_rate": 1.284270536408396e-05, + "loss": 1.871, + "step": 2478 + }, + { + "epoch": 0.05781806202205786, + "grad_norm": 2.0059969425201416, + "learning_rate": 1.2847888053899974e-05, + "loss": 1.5696, + "step": 2479 + }, + { + "epoch": 0.05784138516123578, + "grad_norm": 1.586342453956604, + "learning_rate": 1.285307074371599e-05, + "loss": 1.7007, + "step": 2480 + }, + { + "epoch": 0.057864708300413695, + "grad_norm": 2.4792561531066895, + "learning_rate": 1.2858253433532005e-05, + "loss": 1.3885, + "step": 2481 + }, + { + "epoch": 0.05788803143959161, + "grad_norm": 1.8159295320510864, + "learning_rate": 1.2863436123348019e-05, + "loss": 1.4572, + "step": 2482 + }, + { + "epoch": 0.05791135457876953, + "grad_norm": 1.6010699272155762, + "learning_rate": 1.2868618813164033e-05, + "loss": 1.3239, + "step": 2483 + }, + { + "epoch": 0.05793467771794745, + "grad_norm": 2.294011116027832, + "learning_rate": 1.2873801502980048e-05, + "loss": 1.4858, + "step": 2484 + }, + { + "epoch": 0.057958000857125366, + "grad_norm": 1.7822222709655762, + "learning_rate": 1.2878984192796062e-05, + "loss": 1.6677, + "step": 2485 + }, + { + "epoch": 0.057981323996303284, + "grad_norm": 1.7381556034088135, + "learning_rate": 1.2884166882612076e-05, + "loss": 1.5102, + "step": 2486 + }, + { + "epoch": 0.0580046471354812, + "grad_norm": 1.5182676315307617, + "learning_rate": 1.288934957242809e-05, + "loss": 1.4902, + "step": 2487 + }, + { + "epoch": 0.05802797027465912, + "grad_norm": 2.3234503269195557, + "learning_rate": 1.2894532262244107e-05, + "loss": 1.7547, + "step": 2488 + }, + { + "epoch": 0.05805129341383704, + "grad_norm": 1.8449368476867676, + "learning_rate": 1.2899714952060121e-05, + "loss": 1.4566, + "step": 2489 + }, + { + "epoch": 0.058074616553014956, + "grad_norm": 1.6624127626419067, + "learning_rate": 1.2904897641876135e-05, + "loss": 1.3047, + "step": 2490 + }, + { + "epoch": 0.058097939692192874, + "grad_norm": 1.881435513496399, + "learning_rate": 1.291008033169215e-05, + "loss": 1.4678, + "step": 2491 + }, + { + "epoch": 0.05812126283137079, + "grad_norm": 1.8103615045547485, + "learning_rate": 1.2915263021508164e-05, + "loss": 1.4687, + "step": 2492 + }, + { + "epoch": 0.0581445859705487, + "grad_norm": 2.051447868347168, + "learning_rate": 1.2920445711324178e-05, + "loss": 1.45, + "step": 2493 + }, + { + "epoch": 0.05816790910972662, + "grad_norm": 1.8762485980987549, + "learning_rate": 1.2925628401140193e-05, + "loss": 1.2467, + "step": 2494 + }, + { + "epoch": 0.05819123224890454, + "grad_norm": 1.860500454902649, + "learning_rate": 1.2930811090956207e-05, + "loss": 1.4693, + "step": 2495 + }, + { + "epoch": 0.058214555388082456, + "grad_norm": 1.8621612787246704, + "learning_rate": 1.2935993780772223e-05, + "loss": 1.7739, + "step": 2496 + }, + { + "epoch": 0.058237878527260374, + "grad_norm": 2.166384220123291, + "learning_rate": 1.2941176470588238e-05, + "loss": 1.3166, + "step": 2497 + }, + { + "epoch": 0.05826120166643829, + "grad_norm": 1.7007169723510742, + "learning_rate": 1.2946359160404252e-05, + "loss": 1.2054, + "step": 2498 + }, + { + "epoch": 0.05828452480561621, + "grad_norm": 1.8515853881835938, + "learning_rate": 1.2951541850220266e-05, + "loss": 1.5207, + "step": 2499 + }, + { + "epoch": 0.05830784794479413, + "grad_norm": 1.6332519054412842, + "learning_rate": 1.295672454003628e-05, + "loss": 1.5376, + "step": 2500 + }, + { + "epoch": 0.058331171083972046, + "grad_norm": 1.839971899986267, + "learning_rate": 1.2961907229852295e-05, + "loss": 1.4077, + "step": 2501 + }, + { + "epoch": 0.058354494223149964, + "grad_norm": 1.689015507698059, + "learning_rate": 1.296708991966831e-05, + "loss": 1.5571, + "step": 2502 + }, + { + "epoch": 0.05837781736232788, + "grad_norm": 1.962945580482483, + "learning_rate": 1.2972272609484324e-05, + "loss": 1.5348, + "step": 2503 + }, + { + "epoch": 0.0584011405015058, + "grad_norm": 1.7358533143997192, + "learning_rate": 1.297745529930034e-05, + "loss": 1.4625, + "step": 2504 + }, + { + "epoch": 0.05842446364068372, + "grad_norm": 1.5950040817260742, + "learning_rate": 1.2982637989116354e-05, + "loss": 1.184, + "step": 2505 + }, + { + "epoch": 0.058447786779861635, + "grad_norm": 1.8383833169937134, + "learning_rate": 1.2987820678932368e-05, + "loss": 1.6773, + "step": 2506 + }, + { + "epoch": 0.05847110991903955, + "grad_norm": 1.7583168745040894, + "learning_rate": 1.2993003368748383e-05, + "loss": 1.564, + "step": 2507 + }, + { + "epoch": 0.05849443305821747, + "grad_norm": 1.9862589836120605, + "learning_rate": 1.2998186058564397e-05, + "loss": 1.411, + "step": 2508 + }, + { + "epoch": 0.05851775619739539, + "grad_norm": 1.8201732635498047, + "learning_rate": 1.300336874838041e-05, + "loss": 1.3807, + "step": 2509 + }, + { + "epoch": 0.05854107933657331, + "grad_norm": 1.6528314352035522, + "learning_rate": 1.3008551438196424e-05, + "loss": 1.5299, + "step": 2510 + }, + { + "epoch": 0.058564402475751225, + "grad_norm": 2.0610220432281494, + "learning_rate": 1.3013734128012438e-05, + "loss": 1.6129, + "step": 2511 + }, + { + "epoch": 0.05858772561492914, + "grad_norm": 1.6855478286743164, + "learning_rate": 1.3018916817828453e-05, + "loss": 1.0253, + "step": 2512 + }, + { + "epoch": 0.05861104875410706, + "grad_norm": 1.9679440259933472, + "learning_rate": 1.3024099507644467e-05, + "loss": 1.4495, + "step": 2513 + }, + { + "epoch": 0.05863437189328498, + "grad_norm": 1.7133574485778809, + "learning_rate": 1.3029282197460481e-05, + "loss": 1.4932, + "step": 2514 + }, + { + "epoch": 0.058657695032462896, + "grad_norm": 1.9857035875320435, + "learning_rate": 1.3034464887276497e-05, + "loss": 1.429, + "step": 2515 + }, + { + "epoch": 0.058681018171640814, + "grad_norm": 1.3343905210494995, + "learning_rate": 1.3039647577092512e-05, + "loss": 1.4605, + "step": 2516 + }, + { + "epoch": 0.05870434131081873, + "grad_norm": 1.6987974643707275, + "learning_rate": 1.3044830266908526e-05, + "loss": 1.0714, + "step": 2517 + }, + { + "epoch": 0.05872766444999665, + "grad_norm": 1.5600383281707764, + "learning_rate": 1.305001295672454e-05, + "loss": 1.3758, + "step": 2518 + }, + { + "epoch": 0.05875098758917457, + "grad_norm": 1.6523587703704834, + "learning_rate": 1.3055195646540555e-05, + "loss": 1.3751, + "step": 2519 + }, + { + "epoch": 0.058774310728352486, + "grad_norm": 2.1189181804656982, + "learning_rate": 1.306037833635657e-05, + "loss": 1.5012, + "step": 2520 + }, + { + "epoch": 0.058797633867530404, + "grad_norm": 1.5232523679733276, + "learning_rate": 1.3065561026172584e-05, + "loss": 1.697, + "step": 2521 + }, + { + "epoch": 0.058820957006708315, + "grad_norm": 1.6514025926589966, + "learning_rate": 1.3070743715988598e-05, + "loss": 1.2868, + "step": 2522 + }, + { + "epoch": 0.05884428014588623, + "grad_norm": 1.9371200799942017, + "learning_rate": 1.3075926405804614e-05, + "loss": 1.5558, + "step": 2523 + }, + { + "epoch": 0.05886760328506415, + "grad_norm": 2.4826416969299316, + "learning_rate": 1.3081109095620628e-05, + "loss": 1.4871, + "step": 2524 + }, + { + "epoch": 0.05889092642424207, + "grad_norm": 1.9945695400238037, + "learning_rate": 1.3086291785436643e-05, + "loss": 1.4965, + "step": 2525 + }, + { + "epoch": 0.058914249563419986, + "grad_norm": 2.7831015586853027, + "learning_rate": 1.3091474475252657e-05, + "loss": 1.42, + "step": 2526 + }, + { + "epoch": 0.058937572702597904, + "grad_norm": 1.4849002361297607, + "learning_rate": 1.3096657165068671e-05, + "loss": 1.2279, + "step": 2527 + }, + { + "epoch": 0.05896089584177582, + "grad_norm": 1.8503111600875854, + "learning_rate": 1.3101839854884686e-05, + "loss": 1.2645, + "step": 2528 + }, + { + "epoch": 0.05898421898095374, + "grad_norm": 1.8192991018295288, + "learning_rate": 1.31070225447007e-05, + "loss": 1.5411, + "step": 2529 + }, + { + "epoch": 0.05900754212013166, + "grad_norm": 1.799962043762207, + "learning_rate": 1.3112205234516714e-05, + "loss": 1.4876, + "step": 2530 + }, + { + "epoch": 0.059030865259309576, + "grad_norm": 1.8138644695281982, + "learning_rate": 1.3117387924332729e-05, + "loss": 1.0486, + "step": 2531 + }, + { + "epoch": 0.059054188398487494, + "grad_norm": 1.7154499292373657, + "learning_rate": 1.3122570614148745e-05, + "loss": 1.4033, + "step": 2532 + }, + { + "epoch": 0.05907751153766541, + "grad_norm": 2.8421483039855957, + "learning_rate": 1.3127753303964759e-05, + "loss": 1.3163, + "step": 2533 + }, + { + "epoch": 0.05910083467684333, + "grad_norm": 1.5384453535079956, + "learning_rate": 1.3132935993780773e-05, + "loss": 1.2216, + "step": 2534 + }, + { + "epoch": 0.05912415781602125, + "grad_norm": 2.0954272747039795, + "learning_rate": 1.3138118683596788e-05, + "loss": 1.5509, + "step": 2535 + }, + { + "epoch": 0.059147480955199165, + "grad_norm": 1.9540061950683594, + "learning_rate": 1.3143301373412802e-05, + "loss": 1.4651, + "step": 2536 + }, + { + "epoch": 0.05917080409437708, + "grad_norm": 2.361682176589966, + "learning_rate": 1.3148484063228817e-05, + "loss": 1.6031, + "step": 2537 + }, + { + "epoch": 0.059194127233555, + "grad_norm": 2.1207122802734375, + "learning_rate": 1.3153666753044831e-05, + "loss": 1.6092, + "step": 2538 + }, + { + "epoch": 0.05921745037273292, + "grad_norm": 1.5452579259872437, + "learning_rate": 1.3158849442860845e-05, + "loss": 1.5587, + "step": 2539 + }, + { + "epoch": 0.05924077351191084, + "grad_norm": 1.748274803161621, + "learning_rate": 1.3164032132676861e-05, + "loss": 1.1223, + "step": 2540 + }, + { + "epoch": 0.059264096651088755, + "grad_norm": 2.0728864669799805, + "learning_rate": 1.3169214822492876e-05, + "loss": 1.3435, + "step": 2541 + }, + { + "epoch": 0.05928741979026667, + "grad_norm": 1.6231045722961426, + "learning_rate": 1.317439751230889e-05, + "loss": 1.3008, + "step": 2542 + }, + { + "epoch": 0.05931074292944459, + "grad_norm": 1.9777765274047852, + "learning_rate": 1.3179580202124904e-05, + "loss": 1.3855, + "step": 2543 + }, + { + "epoch": 0.05933406606862251, + "grad_norm": 1.9784355163574219, + "learning_rate": 1.3184762891940919e-05, + "loss": 1.2233, + "step": 2544 + }, + { + "epoch": 0.059357389207800426, + "grad_norm": 1.5931442975997925, + "learning_rate": 1.3189945581756933e-05, + "loss": 1.277, + "step": 2545 + }, + { + "epoch": 0.059380712346978344, + "grad_norm": 1.490401268005371, + "learning_rate": 1.3195128271572947e-05, + "loss": 1.4339, + "step": 2546 + }, + { + "epoch": 0.05940403548615626, + "grad_norm": 1.848841905593872, + "learning_rate": 1.3200310961388962e-05, + "loss": 1.4852, + "step": 2547 + }, + { + "epoch": 0.05942735862533418, + "grad_norm": 2.0940616130828857, + "learning_rate": 1.3205493651204978e-05, + "loss": 1.4573, + "step": 2548 + }, + { + "epoch": 0.0594506817645121, + "grad_norm": 2.3672728538513184, + "learning_rate": 1.3210676341020992e-05, + "loss": 1.7065, + "step": 2549 + }, + { + "epoch": 0.05947400490369001, + "grad_norm": 4.1710028648376465, + "learning_rate": 1.3215859030837006e-05, + "loss": 1.4652, + "step": 2550 + }, + { + "epoch": 0.05949732804286793, + "grad_norm": 1.6404699087142944, + "learning_rate": 1.322104172065302e-05, + "loss": 1.4241, + "step": 2551 + }, + { + "epoch": 0.059520651182045844, + "grad_norm": 2.475877523422241, + "learning_rate": 1.3226224410469035e-05, + "loss": 1.2603, + "step": 2552 + }, + { + "epoch": 0.05954397432122376, + "grad_norm": 1.8253686428070068, + "learning_rate": 1.323140710028505e-05, + "loss": 1.4414, + "step": 2553 + }, + { + "epoch": 0.05956729746040168, + "grad_norm": 1.9098048210144043, + "learning_rate": 1.3236589790101064e-05, + "loss": 1.4837, + "step": 2554 + }, + { + "epoch": 0.0595906205995796, + "grad_norm": 1.7729190587997437, + "learning_rate": 1.3241772479917078e-05, + "loss": 1.4985, + "step": 2555 + }, + { + "epoch": 0.059613943738757516, + "grad_norm": 1.6407618522644043, + "learning_rate": 1.3246955169733094e-05, + "loss": 1.323, + "step": 2556 + }, + { + "epoch": 0.059637266877935434, + "grad_norm": 1.8097273111343384, + "learning_rate": 1.3252137859549109e-05, + "loss": 1.4201, + "step": 2557 + }, + { + "epoch": 0.05966059001711335, + "grad_norm": 1.6986953020095825, + "learning_rate": 1.3257320549365123e-05, + "loss": 1.2728, + "step": 2558 + }, + { + "epoch": 0.05968391315629127, + "grad_norm": 2.6232433319091797, + "learning_rate": 1.3262503239181137e-05, + "loss": 1.3089, + "step": 2559 + }, + { + "epoch": 0.05970723629546919, + "grad_norm": 1.9292854070663452, + "learning_rate": 1.3267685928997152e-05, + "loss": 1.7235, + "step": 2560 + }, + { + "epoch": 0.059730559434647106, + "grad_norm": 2.367253065109253, + "learning_rate": 1.3272868618813166e-05, + "loss": 1.5625, + "step": 2561 + }, + { + "epoch": 0.05975388257382502, + "grad_norm": 1.9255868196487427, + "learning_rate": 1.327805130862918e-05, + "loss": 1.4485, + "step": 2562 + }, + { + "epoch": 0.05977720571300294, + "grad_norm": 1.65224289894104, + "learning_rate": 1.3283233998445195e-05, + "loss": 1.058, + "step": 2563 + }, + { + "epoch": 0.05980052885218086, + "grad_norm": 1.5006394386291504, + "learning_rate": 1.3288416688261207e-05, + "loss": 1.5081, + "step": 2564 + }, + { + "epoch": 0.05982385199135878, + "grad_norm": 1.7269837856292725, + "learning_rate": 1.3293599378077222e-05, + "loss": 1.4554, + "step": 2565 + }, + { + "epoch": 0.059847175130536695, + "grad_norm": 1.8299132585525513, + "learning_rate": 1.3298782067893236e-05, + "loss": 1.3201, + "step": 2566 + }, + { + "epoch": 0.05987049826971461, + "grad_norm": 1.5136997699737549, + "learning_rate": 1.3303964757709252e-05, + "loss": 1.6177, + "step": 2567 + }, + { + "epoch": 0.05989382140889253, + "grad_norm": 1.5479817390441895, + "learning_rate": 1.3309147447525266e-05, + "loss": 1.3188, + "step": 2568 + }, + { + "epoch": 0.05991714454807045, + "grad_norm": 1.8136457204818726, + "learning_rate": 1.331433013734128e-05, + "loss": 1.0279, + "step": 2569 + }, + { + "epoch": 0.05994046768724837, + "grad_norm": 1.8352434635162354, + "learning_rate": 1.3319512827157295e-05, + "loss": 1.5215, + "step": 2570 + }, + { + "epoch": 0.059963790826426284, + "grad_norm": 2.270509719848633, + "learning_rate": 1.332469551697331e-05, + "loss": 1.6787, + "step": 2571 + }, + { + "epoch": 0.0599871139656042, + "grad_norm": 1.903822898864746, + "learning_rate": 1.3329878206789324e-05, + "loss": 1.5921, + "step": 2572 + }, + { + "epoch": 0.06001043710478212, + "grad_norm": 2.159048318862915, + "learning_rate": 1.3335060896605338e-05, + "loss": 1.6827, + "step": 2573 + }, + { + "epoch": 0.06003376024396004, + "grad_norm": 1.7216435670852661, + "learning_rate": 1.3340243586421352e-05, + "loss": 1.6214, + "step": 2574 + }, + { + "epoch": 0.060057083383137956, + "grad_norm": 1.6836422681808472, + "learning_rate": 1.3345426276237368e-05, + "loss": 1.336, + "step": 2575 + }, + { + "epoch": 0.060080406522315874, + "grad_norm": 2.070140838623047, + "learning_rate": 1.3350608966053383e-05, + "loss": 1.3277, + "step": 2576 + }, + { + "epoch": 0.06010372966149379, + "grad_norm": 1.3669464588165283, + "learning_rate": 1.3355791655869397e-05, + "loss": 1.3032, + "step": 2577 + }, + { + "epoch": 0.06012705280067171, + "grad_norm": 1.7620892524719238, + "learning_rate": 1.3360974345685412e-05, + "loss": 1.6768, + "step": 2578 + }, + { + "epoch": 0.06015037593984962, + "grad_norm": 2.349331855773926, + "learning_rate": 1.3366157035501426e-05, + "loss": 1.3301, + "step": 2579 + }, + { + "epoch": 0.06017369907902754, + "grad_norm": 1.7582626342773438, + "learning_rate": 1.337133972531744e-05, + "loss": 1.78, + "step": 2580 + }, + { + "epoch": 0.060197022218205457, + "grad_norm": 1.4992632865905762, + "learning_rate": 1.3376522415133455e-05, + "loss": 1.3913, + "step": 2581 + }, + { + "epoch": 0.060220345357383374, + "grad_norm": 2.119130849838257, + "learning_rate": 1.3381705104949469e-05, + "loss": 1.5424, + "step": 2582 + }, + { + "epoch": 0.06024366849656129, + "grad_norm": 2.205332040786743, + "learning_rate": 1.3386887794765483e-05, + "loss": 1.2993, + "step": 2583 + }, + { + "epoch": 0.06026699163573921, + "grad_norm": 1.7924771308898926, + "learning_rate": 1.33920704845815e-05, + "loss": 1.6227, + "step": 2584 + }, + { + "epoch": 0.06029031477491713, + "grad_norm": 2.209590435028076, + "learning_rate": 1.3397253174397514e-05, + "loss": 1.4697, + "step": 2585 + }, + { + "epoch": 0.060313637914095046, + "grad_norm": 1.8160085678100586, + "learning_rate": 1.3402435864213528e-05, + "loss": 1.6885, + "step": 2586 + }, + { + "epoch": 0.060336961053272964, + "grad_norm": 1.7656259536743164, + "learning_rate": 1.3407618554029542e-05, + "loss": 1.7152, + "step": 2587 + }, + { + "epoch": 0.06036028419245088, + "grad_norm": 1.9651085138320923, + "learning_rate": 1.3412801243845557e-05, + "loss": 1.3886, + "step": 2588 + }, + { + "epoch": 0.0603836073316288, + "grad_norm": 1.6476036310195923, + "learning_rate": 1.3417983933661571e-05, + "loss": 1.4977, + "step": 2589 + }, + { + "epoch": 0.06040693047080672, + "grad_norm": 2.9965789318084717, + "learning_rate": 1.3423166623477585e-05, + "loss": 1.6411, + "step": 2590 + }, + { + "epoch": 0.060430253609984635, + "grad_norm": 2.1225550174713135, + "learning_rate": 1.34283493132936e-05, + "loss": 1.7122, + "step": 2591 + }, + { + "epoch": 0.06045357674916255, + "grad_norm": 1.2732594013214111, + "learning_rate": 1.3433532003109616e-05, + "loss": 1.329, + "step": 2592 + }, + { + "epoch": 0.06047689988834047, + "grad_norm": 1.5653432607650757, + "learning_rate": 1.343871469292563e-05, + "loss": 1.2586, + "step": 2593 + }, + { + "epoch": 0.06050022302751839, + "grad_norm": 2.590881824493408, + "learning_rate": 1.3443897382741644e-05, + "loss": 1.3558, + "step": 2594 + }, + { + "epoch": 0.06052354616669631, + "grad_norm": 1.7460978031158447, + "learning_rate": 1.3449080072557659e-05, + "loss": 1.116, + "step": 2595 + }, + { + "epoch": 0.060546869305874225, + "grad_norm": 1.6606675386428833, + "learning_rate": 1.3454262762373673e-05, + "loss": 1.2989, + "step": 2596 + }, + { + "epoch": 0.06057019244505214, + "grad_norm": 1.8426719903945923, + "learning_rate": 1.3459445452189687e-05, + "loss": 1.322, + "step": 2597 + }, + { + "epoch": 0.06059351558423006, + "grad_norm": 1.3999927043914795, + "learning_rate": 1.3464628142005702e-05, + "loss": 1.3036, + "step": 2598 + }, + { + "epoch": 0.06061683872340798, + "grad_norm": 1.3142712116241455, + "learning_rate": 1.3469810831821716e-05, + "loss": 1.4356, + "step": 2599 + }, + { + "epoch": 0.060640161862585897, + "grad_norm": 2.233654737472534, + "learning_rate": 1.3474993521637732e-05, + "loss": 1.4797, + "step": 2600 + }, + { + "epoch": 0.060663485001763814, + "grad_norm": 2.094975471496582, + "learning_rate": 1.3480176211453747e-05, + "loss": 1.2695, + "step": 2601 + }, + { + "epoch": 0.06068680814094173, + "grad_norm": 1.5429056882858276, + "learning_rate": 1.3485358901269761e-05, + "loss": 1.5335, + "step": 2602 + }, + { + "epoch": 0.06071013128011965, + "grad_norm": 1.8557305335998535, + "learning_rate": 1.3490541591085775e-05, + "loss": 1.5586, + "step": 2603 + }, + { + "epoch": 0.06073345441929757, + "grad_norm": 1.8338919878005981, + "learning_rate": 1.349572428090179e-05, + "loss": 1.5284, + "step": 2604 + }, + { + "epoch": 0.060756777558475486, + "grad_norm": 2.071093797683716, + "learning_rate": 1.3500906970717804e-05, + "loss": 1.3638, + "step": 2605 + }, + { + "epoch": 0.060780100697653404, + "grad_norm": 1.9140174388885498, + "learning_rate": 1.3506089660533818e-05, + "loss": 1.4051, + "step": 2606 + }, + { + "epoch": 0.060803423836831315, + "grad_norm": 1.5910030603408813, + "learning_rate": 1.3511272350349833e-05, + "loss": 1.241, + "step": 2607 + }, + { + "epoch": 0.06082674697600923, + "grad_norm": 1.788118839263916, + "learning_rate": 1.3516455040165849e-05, + "loss": 1.6792, + "step": 2608 + }, + { + "epoch": 0.06085007011518715, + "grad_norm": 1.9293417930603027, + "learning_rate": 1.3521637729981863e-05, + "loss": 1.424, + "step": 2609 + }, + { + "epoch": 0.06087339325436507, + "grad_norm": 2.037588596343994, + "learning_rate": 1.3526820419797877e-05, + "loss": 1.3211, + "step": 2610 + }, + { + "epoch": 0.060896716393542986, + "grad_norm": 1.746131420135498, + "learning_rate": 1.3532003109613892e-05, + "loss": 1.4224, + "step": 2611 + }, + { + "epoch": 0.060920039532720904, + "grad_norm": 1.830556035041809, + "learning_rate": 1.3537185799429906e-05, + "loss": 1.3317, + "step": 2612 + }, + { + "epoch": 0.06094336267189882, + "grad_norm": 1.5884668827056885, + "learning_rate": 1.354236848924592e-05, + "loss": 1.3695, + "step": 2613 + }, + { + "epoch": 0.06096668581107674, + "grad_norm": 1.6232162714004517, + "learning_rate": 1.3547551179061935e-05, + "loss": 1.314, + "step": 2614 + }, + { + "epoch": 0.06099000895025466, + "grad_norm": 1.992008090019226, + "learning_rate": 1.3552733868877949e-05, + "loss": 1.8423, + "step": 2615 + }, + { + "epoch": 0.061013332089432576, + "grad_norm": 1.4518450498580933, + "learning_rate": 1.3557916558693963e-05, + "loss": 1.5026, + "step": 2616 + }, + { + "epoch": 0.061036655228610494, + "grad_norm": 2.025433301925659, + "learning_rate": 1.356309924850998e-05, + "loss": 1.5297, + "step": 2617 + }, + { + "epoch": 0.06105997836778841, + "grad_norm": 3.8088228702545166, + "learning_rate": 1.3568281938325994e-05, + "loss": 1.4006, + "step": 2618 + }, + { + "epoch": 0.06108330150696633, + "grad_norm": 1.7329661846160889, + "learning_rate": 1.3573464628142007e-05, + "loss": 1.5897, + "step": 2619 + }, + { + "epoch": 0.06110662464614425, + "grad_norm": 1.9908490180969238, + "learning_rate": 1.3578647317958021e-05, + "loss": 1.9205, + "step": 2620 + }, + { + "epoch": 0.061129947785322165, + "grad_norm": 1.6847058534622192, + "learning_rate": 1.3583830007774035e-05, + "loss": 1.4672, + "step": 2621 + }, + { + "epoch": 0.06115327092450008, + "grad_norm": 1.6240817308425903, + "learning_rate": 1.358901269759005e-05, + "loss": 1.6396, + "step": 2622 + }, + { + "epoch": 0.061176594063678, + "grad_norm": 1.4848893880844116, + "learning_rate": 1.3594195387406064e-05, + "loss": 1.077, + "step": 2623 + }, + { + "epoch": 0.06119991720285592, + "grad_norm": 1.7938750982284546, + "learning_rate": 1.3599378077222078e-05, + "loss": 1.4143, + "step": 2624 + }, + { + "epoch": 0.06122324034203384, + "grad_norm": 1.827236294746399, + "learning_rate": 1.3604560767038093e-05, + "loss": 1.4336, + "step": 2625 + }, + { + "epoch": 0.061246563481211755, + "grad_norm": 1.7886489629745483, + "learning_rate": 1.3609743456854107e-05, + "loss": 1.6841, + "step": 2626 + }, + { + "epoch": 0.06126988662038967, + "grad_norm": 1.934395432472229, + "learning_rate": 1.3614926146670123e-05, + "loss": 1.3356, + "step": 2627 + }, + { + "epoch": 0.06129320975956759, + "grad_norm": 2.428112745285034, + "learning_rate": 1.3620108836486137e-05, + "loss": 1.6694, + "step": 2628 + }, + { + "epoch": 0.06131653289874551, + "grad_norm": 2.5163893699645996, + "learning_rate": 1.3625291526302152e-05, + "loss": 1.412, + "step": 2629 + }, + { + "epoch": 0.061339856037923426, + "grad_norm": 1.8245412111282349, + "learning_rate": 1.3630474216118166e-05, + "loss": 1.3696, + "step": 2630 + }, + { + "epoch": 0.061363179177101344, + "grad_norm": 2.0314536094665527, + "learning_rate": 1.363565690593418e-05, + "loss": 1.5721, + "step": 2631 + }, + { + "epoch": 0.06138650231627926, + "grad_norm": 1.6534019708633423, + "learning_rate": 1.3640839595750195e-05, + "loss": 1.7137, + "step": 2632 + }, + { + "epoch": 0.06140982545545718, + "grad_norm": 1.8295602798461914, + "learning_rate": 1.3646022285566209e-05, + "loss": 1.7957, + "step": 2633 + }, + { + "epoch": 0.0614331485946351, + "grad_norm": 1.9154319763183594, + "learning_rate": 1.3651204975382223e-05, + "loss": 1.6792, + "step": 2634 + }, + { + "epoch": 0.061456471733813016, + "grad_norm": 1.9036335945129395, + "learning_rate": 1.3656387665198238e-05, + "loss": 1.5156, + "step": 2635 + }, + { + "epoch": 0.06147979487299093, + "grad_norm": 1.7407076358795166, + "learning_rate": 1.3661570355014254e-05, + "loss": 1.6499, + "step": 2636 + }, + { + "epoch": 0.061503118012168845, + "grad_norm": 1.8979015350341797, + "learning_rate": 1.3666753044830268e-05, + "loss": 1.6375, + "step": 2637 + }, + { + "epoch": 0.06152644115134676, + "grad_norm": 1.8352705240249634, + "learning_rate": 1.3671935734646282e-05, + "loss": 1.4799, + "step": 2638 + }, + { + "epoch": 0.06154976429052468, + "grad_norm": 2.0247464179992676, + "learning_rate": 1.3677118424462297e-05, + "loss": 1.3662, + "step": 2639 + }, + { + "epoch": 0.0615730874297026, + "grad_norm": 1.6887078285217285, + "learning_rate": 1.3682301114278311e-05, + "loss": 1.7703, + "step": 2640 + }, + { + "epoch": 0.061596410568880516, + "grad_norm": 1.784588098526001, + "learning_rate": 1.3687483804094326e-05, + "loss": 1.571, + "step": 2641 + }, + { + "epoch": 0.061619733708058434, + "grad_norm": 2.69565749168396, + "learning_rate": 1.369266649391034e-05, + "loss": 1.5766, + "step": 2642 + }, + { + "epoch": 0.06164305684723635, + "grad_norm": 1.9466243982315063, + "learning_rate": 1.3697849183726354e-05, + "loss": 1.6967, + "step": 2643 + }, + { + "epoch": 0.06166637998641427, + "grad_norm": 2.077322006225586, + "learning_rate": 1.370303187354237e-05, + "loss": 1.4202, + "step": 2644 + }, + { + "epoch": 0.06168970312559219, + "grad_norm": 1.9221806526184082, + "learning_rate": 1.3708214563358385e-05, + "loss": 1.6579, + "step": 2645 + }, + { + "epoch": 0.061713026264770106, + "grad_norm": 1.5117321014404297, + "learning_rate": 1.3713397253174399e-05, + "loss": 1.3543, + "step": 2646 + }, + { + "epoch": 0.061736349403948024, + "grad_norm": 1.7436460256576538, + "learning_rate": 1.3718579942990413e-05, + "loss": 1.6648, + "step": 2647 + }, + { + "epoch": 0.06175967254312594, + "grad_norm": 1.8213878870010376, + "learning_rate": 1.3723762632806428e-05, + "loss": 1.3991, + "step": 2648 + }, + { + "epoch": 0.06178299568230386, + "grad_norm": 2.2070603370666504, + "learning_rate": 1.3728945322622442e-05, + "loss": 1.531, + "step": 2649 + }, + { + "epoch": 0.06180631882148178, + "grad_norm": 2.077101945877075, + "learning_rate": 1.3734128012438456e-05, + "loss": 1.3124, + "step": 2650 + }, + { + "epoch": 0.061829641960659695, + "grad_norm": 2.2365119457244873, + "learning_rate": 1.373931070225447e-05, + "loss": 1.4602, + "step": 2651 + }, + { + "epoch": 0.06185296509983761, + "grad_norm": 1.6677820682525635, + "learning_rate": 1.3744493392070487e-05, + "loss": 1.3884, + "step": 2652 + }, + { + "epoch": 0.06187628823901553, + "grad_norm": 1.9608267545700073, + "learning_rate": 1.3749676081886501e-05, + "loss": 1.5826, + "step": 2653 + }, + { + "epoch": 0.06189961137819345, + "grad_norm": 2.236156463623047, + "learning_rate": 1.3754858771702515e-05, + "loss": 1.4431, + "step": 2654 + }, + { + "epoch": 0.06192293451737137, + "grad_norm": 2.1494522094726562, + "learning_rate": 1.376004146151853e-05, + "loss": 1.2584, + "step": 2655 + }, + { + "epoch": 0.061946257656549285, + "grad_norm": 1.94198477268219, + "learning_rate": 1.3765224151334544e-05, + "loss": 1.3086, + "step": 2656 + }, + { + "epoch": 0.0619695807957272, + "grad_norm": 1.835691213607788, + "learning_rate": 1.3770406841150558e-05, + "loss": 1.7029, + "step": 2657 + }, + { + "epoch": 0.06199290393490512, + "grad_norm": 1.8903217315673828, + "learning_rate": 1.3775589530966573e-05, + "loss": 1.652, + "step": 2658 + }, + { + "epoch": 0.06201622707408304, + "grad_norm": 1.707434892654419, + "learning_rate": 1.3780772220782587e-05, + "loss": 1.2228, + "step": 2659 + }, + { + "epoch": 0.062039550213260956, + "grad_norm": 1.802922248840332, + "learning_rate": 1.3785954910598603e-05, + "loss": 1.2687, + "step": 2660 + }, + { + "epoch": 0.062062873352438874, + "grad_norm": 2.5531184673309326, + "learning_rate": 1.3791137600414618e-05, + "loss": 1.4097, + "step": 2661 + }, + { + "epoch": 0.06208619649161679, + "grad_norm": 1.693434715270996, + "learning_rate": 1.3796320290230632e-05, + "loss": 1.4266, + "step": 2662 + }, + { + "epoch": 0.06210951963079471, + "grad_norm": 1.6222015619277954, + "learning_rate": 1.3801502980046646e-05, + "loss": 1.3221, + "step": 2663 + }, + { + "epoch": 0.06213284276997262, + "grad_norm": 1.7141613960266113, + "learning_rate": 1.380668566986266e-05, + "loss": 1.4362, + "step": 2664 + }, + { + "epoch": 0.06215616590915054, + "grad_norm": 2.7775673866271973, + "learning_rate": 1.3811868359678675e-05, + "loss": 1.6965, + "step": 2665 + }, + { + "epoch": 0.06217948904832846, + "grad_norm": 1.24868905544281, + "learning_rate": 1.381705104949469e-05, + "loss": 1.2357, + "step": 2666 + }, + { + "epoch": 0.062202812187506375, + "grad_norm": 1.7877827882766724, + "learning_rate": 1.3822233739310704e-05, + "loss": 1.6933, + "step": 2667 + }, + { + "epoch": 0.06222613532668429, + "grad_norm": 1.5741866827011108, + "learning_rate": 1.3827416429126718e-05, + "loss": 1.5295, + "step": 2668 + }, + { + "epoch": 0.06224945846586221, + "grad_norm": 1.7893970012664795, + "learning_rate": 1.3832599118942734e-05, + "loss": 1.3697, + "step": 2669 + }, + { + "epoch": 0.06227278160504013, + "grad_norm": 1.8457837104797363, + "learning_rate": 1.3837781808758748e-05, + "loss": 1.3283, + "step": 2670 + }, + { + "epoch": 0.062296104744218046, + "grad_norm": 1.6998080015182495, + "learning_rate": 1.3842964498574763e-05, + "loss": 1.4704, + "step": 2671 + }, + { + "epoch": 0.062319427883395964, + "grad_norm": 1.6351855993270874, + "learning_rate": 1.3848147188390777e-05, + "loss": 1.5497, + "step": 2672 + }, + { + "epoch": 0.06234275102257388, + "grad_norm": 2.2880098819732666, + "learning_rate": 1.3853329878206791e-05, + "loss": 1.279, + "step": 2673 + }, + { + "epoch": 0.0623660741617518, + "grad_norm": 1.5100282430648804, + "learning_rate": 1.3858512568022804e-05, + "loss": 1.389, + "step": 2674 + }, + { + "epoch": 0.06238939730092972, + "grad_norm": 2.57076358795166, + "learning_rate": 1.3863695257838818e-05, + "loss": 1.4024, + "step": 2675 + }, + { + "epoch": 0.062412720440107636, + "grad_norm": 1.4813395738601685, + "learning_rate": 1.3868877947654833e-05, + "loss": 1.2078, + "step": 2676 + }, + { + "epoch": 0.062436043579285554, + "grad_norm": 2.9532761573791504, + "learning_rate": 1.3874060637470847e-05, + "loss": 1.3444, + "step": 2677 + }, + { + "epoch": 0.06245936671846347, + "grad_norm": 2.0162930488586426, + "learning_rate": 1.3879243327286861e-05, + "loss": 1.646, + "step": 2678 + }, + { + "epoch": 0.06248268985764139, + "grad_norm": 2.062361717224121, + "learning_rate": 1.3884426017102876e-05, + "loss": 1.485, + "step": 2679 + }, + { + "epoch": 0.06250601299681931, + "grad_norm": 2.0320801734924316, + "learning_rate": 1.3889608706918892e-05, + "loss": 1.6613, + "step": 2680 + }, + { + "epoch": 0.06252933613599722, + "grad_norm": 1.9245129823684692, + "learning_rate": 1.3894791396734906e-05, + "loss": 1.6381, + "step": 2681 + }, + { + "epoch": 0.06255265927517514, + "grad_norm": 2.0062341690063477, + "learning_rate": 1.389997408655092e-05, + "loss": 1.5528, + "step": 2682 + }, + { + "epoch": 0.06257598241435305, + "grad_norm": 2.170292377471924, + "learning_rate": 1.3905156776366935e-05, + "loss": 1.4878, + "step": 2683 + }, + { + "epoch": 0.06259930555353098, + "grad_norm": 1.6300157308578491, + "learning_rate": 1.391033946618295e-05, + "loss": 1.195, + "step": 2684 + }, + { + "epoch": 0.06262262869270889, + "grad_norm": 2.239786386489868, + "learning_rate": 1.3915522155998964e-05, + "loss": 1.6731, + "step": 2685 + }, + { + "epoch": 0.06264595183188681, + "grad_norm": 1.7368957996368408, + "learning_rate": 1.3920704845814978e-05, + "loss": 1.1561, + "step": 2686 + }, + { + "epoch": 0.06266927497106473, + "grad_norm": 1.863064169883728, + "learning_rate": 1.3925887535630992e-05, + "loss": 1.3789, + "step": 2687 + }, + { + "epoch": 0.06269259811024265, + "grad_norm": 1.7001705169677734, + "learning_rate": 1.3931070225447008e-05, + "loss": 1.8909, + "step": 2688 + }, + { + "epoch": 0.06271592124942056, + "grad_norm": 2.388777256011963, + "learning_rate": 1.3936252915263023e-05, + "loss": 1.4946, + "step": 2689 + }, + { + "epoch": 0.06273924438859849, + "grad_norm": 1.803241491317749, + "learning_rate": 1.3941435605079037e-05, + "loss": 1.781, + "step": 2690 + }, + { + "epoch": 0.0627625675277764, + "grad_norm": 1.699387550354004, + "learning_rate": 1.3946618294895051e-05, + "loss": 1.2251, + "step": 2691 + }, + { + "epoch": 0.06278589066695432, + "grad_norm": 2.1750314235687256, + "learning_rate": 1.3951800984711066e-05, + "loss": 1.4449, + "step": 2692 + }, + { + "epoch": 0.06280921380613223, + "grad_norm": 1.7841440439224243, + "learning_rate": 1.395698367452708e-05, + "loss": 1.5959, + "step": 2693 + }, + { + "epoch": 0.06283253694531016, + "grad_norm": 1.9455244541168213, + "learning_rate": 1.3962166364343094e-05, + "loss": 1.3483, + "step": 2694 + }, + { + "epoch": 0.06285586008448807, + "grad_norm": 2.1182174682617188, + "learning_rate": 1.3967349054159109e-05, + "loss": 1.4448, + "step": 2695 + }, + { + "epoch": 0.062879183223666, + "grad_norm": 1.6244593858718872, + "learning_rate": 1.3972531743975125e-05, + "loss": 1.4739, + "step": 2696 + }, + { + "epoch": 0.0629025063628439, + "grad_norm": 1.751879096031189, + "learning_rate": 1.3977714433791139e-05, + "loss": 1.4888, + "step": 2697 + }, + { + "epoch": 0.06292582950202183, + "grad_norm": 2.2495086193084717, + "learning_rate": 1.3982897123607153e-05, + "loss": 1.6448, + "step": 2698 + }, + { + "epoch": 0.06294915264119974, + "grad_norm": 1.6507688760757446, + "learning_rate": 1.3988079813423168e-05, + "loss": 1.735, + "step": 2699 + }, + { + "epoch": 0.06297247578037767, + "grad_norm": 2.61356520652771, + "learning_rate": 1.3993262503239182e-05, + "loss": 1.7015, + "step": 2700 + }, + { + "epoch": 0.06299579891955558, + "grad_norm": 1.9515256881713867, + "learning_rate": 1.3998445193055197e-05, + "loss": 1.7929, + "step": 2701 + }, + { + "epoch": 0.0630191220587335, + "grad_norm": 1.7551541328430176, + "learning_rate": 1.4003627882871211e-05, + "loss": 1.578, + "step": 2702 + }, + { + "epoch": 0.06304244519791141, + "grad_norm": 1.6188023090362549, + "learning_rate": 1.4008810572687225e-05, + "loss": 1.5236, + "step": 2703 + }, + { + "epoch": 0.06306576833708934, + "grad_norm": 1.9739527702331543, + "learning_rate": 1.4013993262503241e-05, + "loss": 1.3597, + "step": 2704 + }, + { + "epoch": 0.06308909147626725, + "grad_norm": 6.611608505249023, + "learning_rate": 1.4019175952319256e-05, + "loss": 1.6558, + "step": 2705 + }, + { + "epoch": 0.06311241461544517, + "grad_norm": 1.7356557846069336, + "learning_rate": 1.402435864213527e-05, + "loss": 1.3899, + "step": 2706 + }, + { + "epoch": 0.06313573775462308, + "grad_norm": 1.846679449081421, + "learning_rate": 1.4029541331951284e-05, + "loss": 1.3879, + "step": 2707 + }, + { + "epoch": 0.063159060893801, + "grad_norm": 2.0728909969329834, + "learning_rate": 1.4034724021767299e-05, + "loss": 1.554, + "step": 2708 + }, + { + "epoch": 0.06318238403297892, + "grad_norm": 2.1357173919677734, + "learning_rate": 1.4039906711583313e-05, + "loss": 1.6166, + "step": 2709 + }, + { + "epoch": 0.06320570717215683, + "grad_norm": 2.6331140995025635, + "learning_rate": 1.4045089401399327e-05, + "loss": 1.6178, + "step": 2710 + }, + { + "epoch": 0.06322903031133476, + "grad_norm": 1.7224211692810059, + "learning_rate": 1.4050272091215342e-05, + "loss": 1.5338, + "step": 2711 + }, + { + "epoch": 0.06325235345051267, + "grad_norm": 1.7378405332565308, + "learning_rate": 1.4055454781031356e-05, + "loss": 1.5187, + "step": 2712 + }, + { + "epoch": 0.06327567658969059, + "grad_norm": 2.088463544845581, + "learning_rate": 1.4060637470847372e-05, + "loss": 1.8847, + "step": 2713 + }, + { + "epoch": 0.0632989997288685, + "grad_norm": 1.7667561769485474, + "learning_rate": 1.4065820160663386e-05, + "loss": 1.4182, + "step": 2714 + }, + { + "epoch": 0.06332232286804643, + "grad_norm": 2.654961109161377, + "learning_rate": 1.40710028504794e-05, + "loss": 1.6021, + "step": 2715 + }, + { + "epoch": 0.06334564600722434, + "grad_norm": 1.7463783025741577, + "learning_rate": 1.4076185540295415e-05, + "loss": 1.6907, + "step": 2716 + }, + { + "epoch": 0.06336896914640226, + "grad_norm": 1.81992769241333, + "learning_rate": 1.408136823011143e-05, + "loss": 1.1185, + "step": 2717 + }, + { + "epoch": 0.06339229228558017, + "grad_norm": 1.9757153987884521, + "learning_rate": 1.4086550919927444e-05, + "loss": 1.5705, + "step": 2718 + }, + { + "epoch": 0.0634156154247581, + "grad_norm": 1.7173175811767578, + "learning_rate": 1.4091733609743458e-05, + "loss": 1.3057, + "step": 2719 + }, + { + "epoch": 0.06343893856393601, + "grad_norm": 1.5778136253356934, + "learning_rate": 1.4096916299559472e-05, + "loss": 1.3756, + "step": 2720 + }, + { + "epoch": 0.06346226170311393, + "grad_norm": 1.7649437189102173, + "learning_rate": 1.4102098989375489e-05, + "loss": 1.3827, + "step": 2721 + }, + { + "epoch": 0.06348558484229185, + "grad_norm": 1.6660621166229248, + "learning_rate": 1.4107281679191503e-05, + "loss": 1.2465, + "step": 2722 + }, + { + "epoch": 0.06350890798146977, + "grad_norm": 1.7358027696609497, + "learning_rate": 1.4112464369007517e-05, + "loss": 1.6967, + "step": 2723 + }, + { + "epoch": 0.06353223112064768, + "grad_norm": 2.084941864013672, + "learning_rate": 1.4117647058823532e-05, + "loss": 1.445, + "step": 2724 + }, + { + "epoch": 0.0635555542598256, + "grad_norm": 2.192439556121826, + "learning_rate": 1.4122829748639546e-05, + "loss": 1.5963, + "step": 2725 + }, + { + "epoch": 0.06357887739900352, + "grad_norm": 2.5304958820343018, + "learning_rate": 1.412801243845556e-05, + "loss": 1.8453, + "step": 2726 + }, + { + "epoch": 0.06360220053818144, + "grad_norm": 1.7766294479370117, + "learning_rate": 1.4133195128271575e-05, + "loss": 1.5245, + "step": 2727 + }, + { + "epoch": 0.06362552367735935, + "grad_norm": 1.8651812076568604, + "learning_rate": 1.4138377818087589e-05, + "loss": 1.5013, + "step": 2728 + }, + { + "epoch": 0.06364884681653728, + "grad_norm": 2.4332542419433594, + "learning_rate": 1.4143560507903602e-05, + "loss": 1.4505, + "step": 2729 + }, + { + "epoch": 0.06367216995571519, + "grad_norm": 1.2664331197738647, + "learning_rate": 1.4148743197719616e-05, + "loss": 1.1377, + "step": 2730 + }, + { + "epoch": 0.06369549309489311, + "grad_norm": 2.3579812049865723, + "learning_rate": 1.415392588753563e-05, + "loss": 1.2938, + "step": 2731 + }, + { + "epoch": 0.06371881623407102, + "grad_norm": 1.6869444847106934, + "learning_rate": 1.4159108577351646e-05, + "loss": 1.6204, + "step": 2732 + }, + { + "epoch": 0.06374213937324895, + "grad_norm": 1.6905202865600586, + "learning_rate": 1.416429126716766e-05, + "loss": 1.4774, + "step": 2733 + }, + { + "epoch": 0.06376546251242686, + "grad_norm": 1.5543984174728394, + "learning_rate": 1.4169473956983675e-05, + "loss": 1.3287, + "step": 2734 + }, + { + "epoch": 0.06378878565160478, + "grad_norm": 2.0648207664489746, + "learning_rate": 1.417465664679969e-05, + "loss": 1.6893, + "step": 2735 + }, + { + "epoch": 0.0638121087907827, + "grad_norm": 2.0521440505981445, + "learning_rate": 1.4179839336615704e-05, + "loss": 1.5036, + "step": 2736 + }, + { + "epoch": 0.0638354319299606, + "grad_norm": 1.6368522644042969, + "learning_rate": 1.4185022026431718e-05, + "loss": 1.5613, + "step": 2737 + }, + { + "epoch": 0.06385875506913853, + "grad_norm": 1.7415629625320435, + "learning_rate": 1.4190204716247732e-05, + "loss": 1.3272, + "step": 2738 + }, + { + "epoch": 0.06388207820831644, + "grad_norm": 2.0426433086395264, + "learning_rate": 1.4195387406063747e-05, + "loss": 1.6206, + "step": 2739 + }, + { + "epoch": 0.06390540134749437, + "grad_norm": 1.856251835823059, + "learning_rate": 1.4200570095879763e-05, + "loss": 1.6922, + "step": 2740 + }, + { + "epoch": 0.06392872448667228, + "grad_norm": 1.6982864141464233, + "learning_rate": 1.4205752785695777e-05, + "loss": 1.5107, + "step": 2741 + }, + { + "epoch": 0.0639520476258502, + "grad_norm": 2.2496535778045654, + "learning_rate": 1.4210935475511792e-05, + "loss": 1.5137, + "step": 2742 + }, + { + "epoch": 0.06397537076502811, + "grad_norm": 1.5447698831558228, + "learning_rate": 1.4216118165327806e-05, + "loss": 1.3271, + "step": 2743 + }, + { + "epoch": 0.06399869390420604, + "grad_norm": 1.8603590726852417, + "learning_rate": 1.422130085514382e-05, + "loss": 1.4773, + "step": 2744 + }, + { + "epoch": 0.06402201704338395, + "grad_norm": 2.3592123985290527, + "learning_rate": 1.4226483544959835e-05, + "loss": 0.9334, + "step": 2745 + }, + { + "epoch": 0.06404534018256187, + "grad_norm": 2.216006278991699, + "learning_rate": 1.4231666234775849e-05, + "loss": 1.3389, + "step": 2746 + }, + { + "epoch": 0.06406866332173979, + "grad_norm": 1.652770757675171, + "learning_rate": 1.4236848924591863e-05, + "loss": 1.3359, + "step": 2747 + }, + { + "epoch": 0.06409198646091771, + "grad_norm": 1.717504620552063, + "learning_rate": 1.424203161440788e-05, + "loss": 1.4533, + "step": 2748 + }, + { + "epoch": 0.06411530960009562, + "grad_norm": 1.7929201126098633, + "learning_rate": 1.4247214304223894e-05, + "loss": 1.3263, + "step": 2749 + }, + { + "epoch": 0.06413863273927355, + "grad_norm": 1.9233717918395996, + "learning_rate": 1.4252396994039908e-05, + "loss": 1.4278, + "step": 2750 + }, + { + "epoch": 0.06416195587845146, + "grad_norm": 2.1122560501098633, + "learning_rate": 1.4257579683855922e-05, + "loss": 1.3754, + "step": 2751 + }, + { + "epoch": 0.06418527901762938, + "grad_norm": 2.260162353515625, + "learning_rate": 1.4262762373671937e-05, + "loss": 1.5928, + "step": 2752 + }, + { + "epoch": 0.06420860215680729, + "grad_norm": 2.246264696121216, + "learning_rate": 1.4267945063487951e-05, + "loss": 1.3268, + "step": 2753 + }, + { + "epoch": 0.06423192529598522, + "grad_norm": 1.7550286054611206, + "learning_rate": 1.4273127753303965e-05, + "loss": 1.393, + "step": 2754 + }, + { + "epoch": 0.06425524843516313, + "grad_norm": 2.4047062397003174, + "learning_rate": 1.427831044311998e-05, + "loss": 1.581, + "step": 2755 + }, + { + "epoch": 0.06427857157434105, + "grad_norm": 3.7945029735565186, + "learning_rate": 1.4283493132935996e-05, + "loss": 1.4191, + "step": 2756 + }, + { + "epoch": 0.06430189471351896, + "grad_norm": 1.6757818460464478, + "learning_rate": 1.428867582275201e-05, + "loss": 1.3387, + "step": 2757 + }, + { + "epoch": 0.06432521785269689, + "grad_norm": 2.4777257442474365, + "learning_rate": 1.4293858512568024e-05, + "loss": 1.7051, + "step": 2758 + }, + { + "epoch": 0.0643485409918748, + "grad_norm": 1.803855299949646, + "learning_rate": 1.4299041202384039e-05, + "loss": 1.3828, + "step": 2759 + }, + { + "epoch": 0.06437186413105273, + "grad_norm": 1.8653554916381836, + "learning_rate": 1.4304223892200053e-05, + "loss": 1.2711, + "step": 2760 + }, + { + "epoch": 0.06439518727023064, + "grad_norm": 1.588797688484192, + "learning_rate": 1.4309406582016067e-05, + "loss": 1.2614, + "step": 2761 + }, + { + "epoch": 0.06441851040940856, + "grad_norm": 2.4432761669158936, + "learning_rate": 1.4314589271832082e-05, + "loss": 1.1186, + "step": 2762 + }, + { + "epoch": 0.06444183354858647, + "grad_norm": 1.8135852813720703, + "learning_rate": 1.4319771961648096e-05, + "loss": 1.4257, + "step": 2763 + }, + { + "epoch": 0.06446515668776438, + "grad_norm": 1.841215968132019, + "learning_rate": 1.432495465146411e-05, + "loss": 1.3623, + "step": 2764 + }, + { + "epoch": 0.06448847982694231, + "grad_norm": 2.0197958946228027, + "learning_rate": 1.4330137341280127e-05, + "loss": 1.468, + "step": 2765 + }, + { + "epoch": 0.06451180296612022, + "grad_norm": 2.380474090576172, + "learning_rate": 1.4335320031096141e-05, + "loss": 1.2547, + "step": 2766 + }, + { + "epoch": 0.06453512610529814, + "grad_norm": 2.137549638748169, + "learning_rate": 1.4340502720912155e-05, + "loss": 1.3884, + "step": 2767 + }, + { + "epoch": 0.06455844924447605, + "grad_norm": 1.8818745613098145, + "learning_rate": 1.434568541072817e-05, + "loss": 1.305, + "step": 2768 + }, + { + "epoch": 0.06458177238365398, + "grad_norm": 1.7254643440246582, + "learning_rate": 1.4350868100544184e-05, + "loss": 1.1492, + "step": 2769 + }, + { + "epoch": 0.06460509552283189, + "grad_norm": 1.8451322317123413, + "learning_rate": 1.4356050790360198e-05, + "loss": 1.5132, + "step": 2770 + }, + { + "epoch": 0.06462841866200981, + "grad_norm": 2.049947738647461, + "learning_rate": 1.4361233480176213e-05, + "loss": 1.4172, + "step": 2771 + }, + { + "epoch": 0.06465174180118773, + "grad_norm": 2.0844335556030273, + "learning_rate": 1.4366416169992227e-05, + "loss": 1.3341, + "step": 2772 + }, + { + "epoch": 0.06467506494036565, + "grad_norm": 2.167858362197876, + "learning_rate": 1.4371598859808243e-05, + "loss": 1.4029, + "step": 2773 + }, + { + "epoch": 0.06469838807954356, + "grad_norm": 2.2055740356445312, + "learning_rate": 1.4376781549624257e-05, + "loss": 1.4169, + "step": 2774 + }, + { + "epoch": 0.06472171121872149, + "grad_norm": 1.6567565202713013, + "learning_rate": 1.4381964239440272e-05, + "loss": 1.4323, + "step": 2775 + }, + { + "epoch": 0.0647450343578994, + "grad_norm": 1.3381197452545166, + "learning_rate": 1.4387146929256286e-05, + "loss": 1.1591, + "step": 2776 + }, + { + "epoch": 0.06476835749707732, + "grad_norm": 1.877096176147461, + "learning_rate": 1.43923296190723e-05, + "loss": 1.4447, + "step": 2777 + }, + { + "epoch": 0.06479168063625523, + "grad_norm": 2.3243725299835205, + "learning_rate": 1.4397512308888315e-05, + "loss": 1.2583, + "step": 2778 + }, + { + "epoch": 0.06481500377543316, + "grad_norm": 2.023531198501587, + "learning_rate": 1.4402694998704329e-05, + "loss": 1.0388, + "step": 2779 + }, + { + "epoch": 0.06483832691461107, + "grad_norm": 1.5490742921829224, + "learning_rate": 1.4407877688520343e-05, + "loss": 1.3603, + "step": 2780 + }, + { + "epoch": 0.064861650053789, + "grad_norm": 1.8510228395462036, + "learning_rate": 1.441306037833636e-05, + "loss": 1.5173, + "step": 2781 + }, + { + "epoch": 0.0648849731929669, + "grad_norm": 1.8563857078552246, + "learning_rate": 1.4418243068152374e-05, + "loss": 1.6119, + "step": 2782 + }, + { + "epoch": 0.06490829633214483, + "grad_norm": 2.1504569053649902, + "learning_rate": 1.4423425757968388e-05, + "loss": 1.2287, + "step": 2783 + }, + { + "epoch": 0.06493161947132274, + "grad_norm": 1.629302978515625, + "learning_rate": 1.4428608447784401e-05, + "loss": 1.4008, + "step": 2784 + }, + { + "epoch": 0.06495494261050067, + "grad_norm": 1.6605809926986694, + "learning_rate": 1.4433791137600415e-05, + "loss": 1.5132, + "step": 2785 + }, + { + "epoch": 0.06497826574967858, + "grad_norm": 1.8998322486877441, + "learning_rate": 1.443897382741643e-05, + "loss": 1.4024, + "step": 2786 + }, + { + "epoch": 0.0650015888888565, + "grad_norm": 1.89012610912323, + "learning_rate": 1.4444156517232444e-05, + "loss": 1.5893, + "step": 2787 + }, + { + "epoch": 0.06502491202803441, + "grad_norm": 2.7149722576141357, + "learning_rate": 1.4449339207048458e-05, + "loss": 1.4361, + "step": 2788 + }, + { + "epoch": 0.06504823516721234, + "grad_norm": 1.397595763206482, + "learning_rate": 1.4454521896864473e-05, + "loss": 1.1094, + "step": 2789 + }, + { + "epoch": 0.06507155830639025, + "grad_norm": 1.8745239973068237, + "learning_rate": 1.4459704586680487e-05, + "loss": 1.6704, + "step": 2790 + }, + { + "epoch": 0.06509488144556817, + "grad_norm": 1.8384937047958374, + "learning_rate": 1.4464887276496501e-05, + "loss": 1.7918, + "step": 2791 + }, + { + "epoch": 0.06511820458474608, + "grad_norm": 1.9192211627960205, + "learning_rate": 1.4470069966312517e-05, + "loss": 1.459, + "step": 2792 + }, + { + "epoch": 0.065141527723924, + "grad_norm": 1.795638918876648, + "learning_rate": 1.4475252656128532e-05, + "loss": 1.3766, + "step": 2793 + }, + { + "epoch": 0.06516485086310192, + "grad_norm": 1.8964101076126099, + "learning_rate": 1.4480435345944546e-05, + "loss": 1.5834, + "step": 2794 + }, + { + "epoch": 0.06518817400227983, + "grad_norm": 1.563743233680725, + "learning_rate": 1.448561803576056e-05, + "loss": 1.1312, + "step": 2795 + }, + { + "epoch": 0.06521149714145776, + "grad_norm": 1.5765119791030884, + "learning_rate": 1.4490800725576575e-05, + "loss": 1.4893, + "step": 2796 + }, + { + "epoch": 0.06523482028063567, + "grad_norm": 1.7887187004089355, + "learning_rate": 1.4495983415392589e-05, + "loss": 1.2458, + "step": 2797 + }, + { + "epoch": 0.06525814341981359, + "grad_norm": 1.4799649715423584, + "learning_rate": 1.4501166105208603e-05, + "loss": 1.358, + "step": 2798 + }, + { + "epoch": 0.0652814665589915, + "grad_norm": 2.0007877349853516, + "learning_rate": 1.4506348795024618e-05, + "loss": 0.915, + "step": 2799 + }, + { + "epoch": 0.06530478969816943, + "grad_norm": 2.1305413246154785, + "learning_rate": 1.4511531484840634e-05, + "loss": 1.7505, + "step": 2800 + }, + { + "epoch": 0.06532811283734734, + "grad_norm": 3.3206400871276855, + "learning_rate": 1.4516714174656648e-05, + "loss": 1.8972, + "step": 2801 + }, + { + "epoch": 0.06535143597652526, + "grad_norm": 1.7682409286499023, + "learning_rate": 1.4521896864472662e-05, + "loss": 1.4471, + "step": 2802 + }, + { + "epoch": 0.06537475911570317, + "grad_norm": 1.81817626953125, + "learning_rate": 1.4527079554288677e-05, + "loss": 1.1318, + "step": 2803 + }, + { + "epoch": 0.0653980822548811, + "grad_norm": 1.710696816444397, + "learning_rate": 1.4532262244104691e-05, + "loss": 1.4271, + "step": 2804 + }, + { + "epoch": 0.06542140539405901, + "grad_norm": 2.3982298374176025, + "learning_rate": 1.4537444933920706e-05, + "loss": 1.5585, + "step": 2805 + }, + { + "epoch": 0.06544472853323693, + "grad_norm": 1.7883695363998413, + "learning_rate": 1.454262762373672e-05, + "loss": 1.3224, + "step": 2806 + }, + { + "epoch": 0.06546805167241485, + "grad_norm": 1.9640473127365112, + "learning_rate": 1.4547810313552734e-05, + "loss": 1.8388, + "step": 2807 + }, + { + "epoch": 0.06549137481159277, + "grad_norm": 2.2285878658294678, + "learning_rate": 1.455299300336875e-05, + "loss": 1.2853, + "step": 2808 + }, + { + "epoch": 0.06551469795077068, + "grad_norm": 1.8806909322738647, + "learning_rate": 1.4558175693184765e-05, + "loss": 1.2943, + "step": 2809 + }, + { + "epoch": 0.0655380210899486, + "grad_norm": 1.5270092487335205, + "learning_rate": 1.4563358383000779e-05, + "loss": 1.2686, + "step": 2810 + }, + { + "epoch": 0.06556134422912652, + "grad_norm": 2.655914783477783, + "learning_rate": 1.4568541072816793e-05, + "loss": 1.5861, + "step": 2811 + }, + { + "epoch": 0.06558466736830444, + "grad_norm": 1.6872950792312622, + "learning_rate": 1.4573723762632808e-05, + "loss": 1.4328, + "step": 2812 + }, + { + "epoch": 0.06560799050748235, + "grad_norm": 2.5100176334381104, + "learning_rate": 1.4578906452448822e-05, + "loss": 1.4183, + "step": 2813 + }, + { + "epoch": 0.06563131364666028, + "grad_norm": 2.044571876525879, + "learning_rate": 1.4584089142264836e-05, + "loss": 1.4343, + "step": 2814 + }, + { + "epoch": 0.06565463678583819, + "grad_norm": 1.8628580570220947, + "learning_rate": 1.458927183208085e-05, + "loss": 1.4403, + "step": 2815 + }, + { + "epoch": 0.06567795992501611, + "grad_norm": 1.4153575897216797, + "learning_rate": 1.4594454521896865e-05, + "loss": 1.3196, + "step": 2816 + }, + { + "epoch": 0.06570128306419402, + "grad_norm": 1.8334929943084717, + "learning_rate": 1.4599637211712881e-05, + "loss": 1.5043, + "step": 2817 + }, + { + "epoch": 0.06572460620337195, + "grad_norm": 1.9358466863632202, + "learning_rate": 1.4604819901528895e-05, + "loss": 1.4786, + "step": 2818 + }, + { + "epoch": 0.06574792934254986, + "grad_norm": 2.730907678604126, + "learning_rate": 1.461000259134491e-05, + "loss": 1.7105, + "step": 2819 + }, + { + "epoch": 0.06577125248172778, + "grad_norm": 1.9616193771362305, + "learning_rate": 1.4615185281160924e-05, + "loss": 1.2128, + "step": 2820 + }, + { + "epoch": 0.0657945756209057, + "grad_norm": 2.0155746936798096, + "learning_rate": 1.4620367970976938e-05, + "loss": 1.5347, + "step": 2821 + }, + { + "epoch": 0.0658178987600836, + "grad_norm": 2.0486910343170166, + "learning_rate": 1.4625550660792953e-05, + "loss": 1.2862, + "step": 2822 + }, + { + "epoch": 0.06584122189926153, + "grad_norm": 1.7724339962005615, + "learning_rate": 1.4630733350608967e-05, + "loss": 1.193, + "step": 2823 + }, + { + "epoch": 0.06586454503843944, + "grad_norm": 1.8260146379470825, + "learning_rate": 1.4635916040424981e-05, + "loss": 1.6844, + "step": 2824 + }, + { + "epoch": 0.06588786817761737, + "grad_norm": 1.7288804054260254, + "learning_rate": 1.4641098730240998e-05, + "loss": 1.3953, + "step": 2825 + }, + { + "epoch": 0.06591119131679528, + "grad_norm": 1.7598590850830078, + "learning_rate": 1.4646281420057012e-05, + "loss": 1.4005, + "step": 2826 + }, + { + "epoch": 0.0659345144559732, + "grad_norm": 1.7817363739013672, + "learning_rate": 1.4651464109873026e-05, + "loss": 1.2379, + "step": 2827 + }, + { + "epoch": 0.06595783759515111, + "grad_norm": 1.7752867937088013, + "learning_rate": 1.465664679968904e-05, + "loss": 1.5554, + "step": 2828 + }, + { + "epoch": 0.06598116073432904, + "grad_norm": 1.7931783199310303, + "learning_rate": 1.4661829489505055e-05, + "loss": 1.5309, + "step": 2829 + }, + { + "epoch": 0.06600448387350695, + "grad_norm": 1.4036725759506226, + "learning_rate": 1.466701217932107e-05, + "loss": 1.2221, + "step": 2830 + }, + { + "epoch": 0.06602780701268487, + "grad_norm": 2.4719862937927246, + "learning_rate": 1.4672194869137084e-05, + "loss": 1.5346, + "step": 2831 + }, + { + "epoch": 0.06605113015186279, + "grad_norm": 1.5505207777023315, + "learning_rate": 1.4677377558953098e-05, + "loss": 1.2664, + "step": 2832 + }, + { + "epoch": 0.06607445329104071, + "grad_norm": 1.8633780479431152, + "learning_rate": 1.4682560248769114e-05, + "loss": 1.4508, + "step": 2833 + }, + { + "epoch": 0.06609777643021862, + "grad_norm": 2.0683753490448, + "learning_rate": 1.4687742938585128e-05, + "loss": 1.5232, + "step": 2834 + }, + { + "epoch": 0.06612109956939655, + "grad_norm": 1.9980714321136475, + "learning_rate": 1.4692925628401143e-05, + "loss": 1.3419, + "step": 2835 + }, + { + "epoch": 0.06614442270857446, + "grad_norm": 1.9079272747039795, + "learning_rate": 1.4698108318217157e-05, + "loss": 1.3417, + "step": 2836 + }, + { + "epoch": 0.06616774584775238, + "grad_norm": 2.0348222255706787, + "learning_rate": 1.4703291008033171e-05, + "loss": 1.1548, + "step": 2837 + }, + { + "epoch": 0.0661910689869303, + "grad_norm": 1.6220494508743286, + "learning_rate": 1.4708473697849186e-05, + "loss": 1.3376, + "step": 2838 + }, + { + "epoch": 0.06621439212610822, + "grad_norm": 2.169706344604492, + "learning_rate": 1.47136563876652e-05, + "loss": 1.4463, + "step": 2839 + }, + { + "epoch": 0.06623771526528613, + "grad_norm": 1.7391022443771362, + "learning_rate": 1.4718839077481213e-05, + "loss": 1.4331, + "step": 2840 + }, + { + "epoch": 0.06626103840446405, + "grad_norm": 1.9789973497390747, + "learning_rate": 1.4724021767297227e-05, + "loss": 1.4105, + "step": 2841 + }, + { + "epoch": 0.06628436154364196, + "grad_norm": 2.1749653816223145, + "learning_rate": 1.4729204457113241e-05, + "loss": 1.7178, + "step": 2842 + }, + { + "epoch": 0.06630768468281989, + "grad_norm": 1.8664934635162354, + "learning_rate": 1.4734387146929256e-05, + "loss": 1.6212, + "step": 2843 + }, + { + "epoch": 0.0663310078219978, + "grad_norm": 1.8881118297576904, + "learning_rate": 1.4739569836745272e-05, + "loss": 1.22, + "step": 2844 + }, + { + "epoch": 0.06635433096117573, + "grad_norm": 1.743923544883728, + "learning_rate": 1.4744752526561286e-05, + "loss": 1.7732, + "step": 2845 + }, + { + "epoch": 0.06637765410035364, + "grad_norm": 1.611082673072815, + "learning_rate": 1.47499352163773e-05, + "loss": 1.6571, + "step": 2846 + }, + { + "epoch": 0.06640097723953156, + "grad_norm": 1.955450177192688, + "learning_rate": 1.4755117906193315e-05, + "loss": 1.4484, + "step": 2847 + }, + { + "epoch": 0.06642430037870947, + "grad_norm": 1.7298091650009155, + "learning_rate": 1.476030059600933e-05, + "loss": 1.561, + "step": 2848 + }, + { + "epoch": 0.0664476235178874, + "grad_norm": 1.5733643770217896, + "learning_rate": 1.4765483285825344e-05, + "loss": 1.3661, + "step": 2849 + }, + { + "epoch": 0.06647094665706531, + "grad_norm": 1.6763901710510254, + "learning_rate": 1.4770665975641358e-05, + "loss": 1.5303, + "step": 2850 + }, + { + "epoch": 0.06649426979624322, + "grad_norm": 1.6987820863723755, + "learning_rate": 1.4775848665457372e-05, + "loss": 1.3741, + "step": 2851 + }, + { + "epoch": 0.06651759293542114, + "grad_norm": 1.3298968076705933, + "learning_rate": 1.4781031355273388e-05, + "loss": 1.129, + "step": 2852 + }, + { + "epoch": 0.06654091607459905, + "grad_norm": 1.903433084487915, + "learning_rate": 1.4786214045089403e-05, + "loss": 1.3274, + "step": 2853 + }, + { + "epoch": 0.06656423921377698, + "grad_norm": 1.4711178541183472, + "learning_rate": 1.4791396734905417e-05, + "loss": 1.3581, + "step": 2854 + }, + { + "epoch": 0.06658756235295489, + "grad_norm": 1.758985996246338, + "learning_rate": 1.4796579424721431e-05, + "loss": 1.7431, + "step": 2855 + }, + { + "epoch": 0.06661088549213282, + "grad_norm": 2.4290409088134766, + "learning_rate": 1.4801762114537446e-05, + "loss": 1.6673, + "step": 2856 + }, + { + "epoch": 0.06663420863131073, + "grad_norm": 2.1742918491363525, + "learning_rate": 1.480694480435346e-05, + "loss": 1.3935, + "step": 2857 + }, + { + "epoch": 0.06665753177048865, + "grad_norm": 2.2304227352142334, + "learning_rate": 1.4812127494169474e-05, + "loss": 0.9519, + "step": 2858 + }, + { + "epoch": 0.06668085490966656, + "grad_norm": 2.019341468811035, + "learning_rate": 1.4817310183985489e-05, + "loss": 1.2995, + "step": 2859 + }, + { + "epoch": 0.06670417804884449, + "grad_norm": 1.9606066942214966, + "learning_rate": 1.4822492873801505e-05, + "loss": 1.3055, + "step": 2860 + }, + { + "epoch": 0.0667275011880224, + "grad_norm": 1.6898326873779297, + "learning_rate": 1.4827675563617519e-05, + "loss": 1.5552, + "step": 2861 + }, + { + "epoch": 0.06675082432720032, + "grad_norm": 1.844104290008545, + "learning_rate": 1.4832858253433533e-05, + "loss": 1.2242, + "step": 2862 + }, + { + "epoch": 0.06677414746637823, + "grad_norm": 1.9563182592391968, + "learning_rate": 1.4838040943249548e-05, + "loss": 0.9554, + "step": 2863 + }, + { + "epoch": 0.06679747060555616, + "grad_norm": 1.9188711643218994, + "learning_rate": 1.4843223633065562e-05, + "loss": 1.4186, + "step": 2864 + }, + { + "epoch": 0.06682079374473407, + "grad_norm": 2.0907742977142334, + "learning_rate": 1.4848406322881576e-05, + "loss": 1.4292, + "step": 2865 + }, + { + "epoch": 0.066844116883912, + "grad_norm": 1.5654215812683105, + "learning_rate": 1.485358901269759e-05, + "loss": 1.3211, + "step": 2866 + }, + { + "epoch": 0.0668674400230899, + "grad_norm": 1.6763267517089844, + "learning_rate": 1.4858771702513605e-05, + "loss": 1.4544, + "step": 2867 + }, + { + "epoch": 0.06689076316226783, + "grad_norm": 1.7230890989303589, + "learning_rate": 1.486395439232962e-05, + "loss": 1.269, + "step": 2868 + }, + { + "epoch": 0.06691408630144574, + "grad_norm": 1.7296384572982788, + "learning_rate": 1.4869137082145636e-05, + "loss": 1.3582, + "step": 2869 + }, + { + "epoch": 0.06693740944062367, + "grad_norm": 2.278798818588257, + "learning_rate": 1.487431977196165e-05, + "loss": 1.7924, + "step": 2870 + }, + { + "epoch": 0.06696073257980158, + "grad_norm": 1.5812768936157227, + "learning_rate": 1.4879502461777664e-05, + "loss": 1.5316, + "step": 2871 + }, + { + "epoch": 0.0669840557189795, + "grad_norm": 1.7185741662979126, + "learning_rate": 1.4884685151593679e-05, + "loss": 1.3088, + "step": 2872 + }, + { + "epoch": 0.06700737885815741, + "grad_norm": 1.9661529064178467, + "learning_rate": 1.4889867841409693e-05, + "loss": 1.898, + "step": 2873 + }, + { + "epoch": 0.06703070199733534, + "grad_norm": 1.6297186613082886, + "learning_rate": 1.4895050531225707e-05, + "loss": 1.3343, + "step": 2874 + }, + { + "epoch": 0.06705402513651325, + "grad_norm": 1.3750114440917969, + "learning_rate": 1.4900233221041722e-05, + "loss": 1.1608, + "step": 2875 + }, + { + "epoch": 0.06707734827569117, + "grad_norm": 1.9823325872421265, + "learning_rate": 1.4905415910857736e-05, + "loss": 1.575, + "step": 2876 + }, + { + "epoch": 0.06710067141486908, + "grad_norm": 2.4376564025878906, + "learning_rate": 1.4910598600673752e-05, + "loss": 1.47, + "step": 2877 + }, + { + "epoch": 0.067123994554047, + "grad_norm": 1.6154961585998535, + "learning_rate": 1.4915781290489766e-05, + "loss": 1.2653, + "step": 2878 + }, + { + "epoch": 0.06714731769322492, + "grad_norm": 1.5177198648452759, + "learning_rate": 1.492096398030578e-05, + "loss": 1.2946, + "step": 2879 + }, + { + "epoch": 0.06717064083240283, + "grad_norm": 1.9357798099517822, + "learning_rate": 1.4926146670121795e-05, + "loss": 1.2401, + "step": 2880 + }, + { + "epoch": 0.06719396397158076, + "grad_norm": 1.568678617477417, + "learning_rate": 1.493132935993781e-05, + "loss": 1.3365, + "step": 2881 + }, + { + "epoch": 0.06721728711075867, + "grad_norm": 1.8375293016433716, + "learning_rate": 1.4936512049753824e-05, + "loss": 1.3164, + "step": 2882 + }, + { + "epoch": 0.06724061024993659, + "grad_norm": 1.971502661705017, + "learning_rate": 1.4941694739569838e-05, + "loss": 1.2666, + "step": 2883 + }, + { + "epoch": 0.0672639333891145, + "grad_norm": 1.5636154413223267, + "learning_rate": 1.4946877429385852e-05, + "loss": 1.3742, + "step": 2884 + }, + { + "epoch": 0.06728725652829243, + "grad_norm": 1.8986397981643677, + "learning_rate": 1.4952060119201869e-05, + "loss": 1.3876, + "step": 2885 + }, + { + "epoch": 0.06731057966747034, + "grad_norm": 2.2729148864746094, + "learning_rate": 1.4957242809017883e-05, + "loss": 1.5104, + "step": 2886 + }, + { + "epoch": 0.06733390280664826, + "grad_norm": 1.8707177639007568, + "learning_rate": 1.4962425498833897e-05, + "loss": 1.5612, + "step": 2887 + }, + { + "epoch": 0.06735722594582617, + "grad_norm": 2.0465052127838135, + "learning_rate": 1.4967608188649912e-05, + "loss": 1.9165, + "step": 2888 + }, + { + "epoch": 0.0673805490850041, + "grad_norm": 2.444533348083496, + "learning_rate": 1.4972790878465926e-05, + "loss": 1.6854, + "step": 2889 + }, + { + "epoch": 0.06740387222418201, + "grad_norm": 2.5257554054260254, + "learning_rate": 1.497797356828194e-05, + "loss": 1.4987, + "step": 2890 + }, + { + "epoch": 0.06742719536335993, + "grad_norm": 2.1791629791259766, + "learning_rate": 1.4983156258097955e-05, + "loss": 1.2276, + "step": 2891 + }, + { + "epoch": 0.06745051850253785, + "grad_norm": 1.8654357194900513, + "learning_rate": 1.4988338947913969e-05, + "loss": 1.7785, + "step": 2892 + }, + { + "epoch": 0.06747384164171577, + "grad_norm": 2.5087015628814697, + "learning_rate": 1.4993521637729985e-05, + "loss": 1.4029, + "step": 2893 + }, + { + "epoch": 0.06749716478089368, + "grad_norm": 1.6082079410552979, + "learning_rate": 1.4998704327546e-05, + "loss": 1.0552, + "step": 2894 + }, + { + "epoch": 0.0675204879200716, + "grad_norm": 1.8922289609909058, + "learning_rate": 1.500388701736201e-05, + "loss": 1.2605, + "step": 2895 + }, + { + "epoch": 0.06754381105924952, + "grad_norm": 1.8346471786499023, + "learning_rate": 1.5009069707178026e-05, + "loss": 1.4355, + "step": 2896 + }, + { + "epoch": 0.06756713419842744, + "grad_norm": 1.8859299421310425, + "learning_rate": 1.501425239699404e-05, + "loss": 1.2775, + "step": 2897 + }, + { + "epoch": 0.06759045733760535, + "grad_norm": 1.3990947008132935, + "learning_rate": 1.5019435086810055e-05, + "loss": 0.9683, + "step": 2898 + }, + { + "epoch": 0.06761378047678328, + "grad_norm": 1.876051664352417, + "learning_rate": 1.502461777662607e-05, + "loss": 1.7135, + "step": 2899 + }, + { + "epoch": 0.06763710361596119, + "grad_norm": 1.5841714143753052, + "learning_rate": 1.5029800466442084e-05, + "loss": 1.5269, + "step": 2900 + }, + { + "epoch": 0.06766042675513911, + "grad_norm": 1.8639863729476929, + "learning_rate": 1.5034983156258098e-05, + "loss": 1.4331, + "step": 2901 + }, + { + "epoch": 0.06768374989431702, + "grad_norm": 1.6989810466766357, + "learning_rate": 1.5040165846074112e-05, + "loss": 1.6676, + "step": 2902 + }, + { + "epoch": 0.06770707303349495, + "grad_norm": 1.7870492935180664, + "learning_rate": 1.5045348535890127e-05, + "loss": 1.5192, + "step": 2903 + }, + { + "epoch": 0.06773039617267286, + "grad_norm": 2.058709144592285, + "learning_rate": 1.5050531225706143e-05, + "loss": 1.69, + "step": 2904 + }, + { + "epoch": 0.06775371931185079, + "grad_norm": 1.7233374118804932, + "learning_rate": 1.5055713915522157e-05, + "loss": 1.4893, + "step": 2905 + }, + { + "epoch": 0.0677770424510287, + "grad_norm": 1.888060212135315, + "learning_rate": 1.5060896605338171e-05, + "loss": 1.7769, + "step": 2906 + }, + { + "epoch": 0.06780036559020661, + "grad_norm": 2.2901411056518555, + "learning_rate": 1.5066079295154186e-05, + "loss": 1.6078, + "step": 2907 + }, + { + "epoch": 0.06782368872938453, + "grad_norm": 1.6352299451828003, + "learning_rate": 1.50712619849702e-05, + "loss": 1.7949, + "step": 2908 + }, + { + "epoch": 0.06784701186856244, + "grad_norm": 1.6780446767807007, + "learning_rate": 1.5076444674786215e-05, + "loss": 1.1521, + "step": 2909 + }, + { + "epoch": 0.06787033500774037, + "grad_norm": 1.8701889514923096, + "learning_rate": 1.5081627364602229e-05, + "loss": 1.4567, + "step": 2910 + }, + { + "epoch": 0.06789365814691828, + "grad_norm": 1.741023302078247, + "learning_rate": 1.5086810054418243e-05, + "loss": 1.1188, + "step": 2911 + }, + { + "epoch": 0.0679169812860962, + "grad_norm": 1.6472866535186768, + "learning_rate": 1.5091992744234258e-05, + "loss": 1.6849, + "step": 2912 + }, + { + "epoch": 0.06794030442527411, + "grad_norm": 1.4536561965942383, + "learning_rate": 1.5097175434050274e-05, + "loss": 1.4544, + "step": 2913 + }, + { + "epoch": 0.06796362756445204, + "grad_norm": 2.170592784881592, + "learning_rate": 1.5102358123866288e-05, + "loss": 1.5684, + "step": 2914 + }, + { + "epoch": 0.06798695070362995, + "grad_norm": 2.3381991386413574, + "learning_rate": 1.5107540813682302e-05, + "loss": 1.6957, + "step": 2915 + }, + { + "epoch": 0.06801027384280788, + "grad_norm": 1.36526620388031, + "learning_rate": 1.5112723503498317e-05, + "loss": 1.1307, + "step": 2916 + }, + { + "epoch": 0.06803359698198579, + "grad_norm": 2.0184521675109863, + "learning_rate": 1.5117906193314331e-05, + "loss": 1.5328, + "step": 2917 + }, + { + "epoch": 0.06805692012116371, + "grad_norm": 1.6322370767593384, + "learning_rate": 1.5123088883130345e-05, + "loss": 1.3401, + "step": 2918 + }, + { + "epoch": 0.06808024326034162, + "grad_norm": 2.306300401687622, + "learning_rate": 1.512827157294636e-05, + "loss": 1.6381, + "step": 2919 + }, + { + "epoch": 0.06810356639951955, + "grad_norm": 1.7915669679641724, + "learning_rate": 1.5133454262762374e-05, + "loss": 1.4884, + "step": 2920 + }, + { + "epoch": 0.06812688953869746, + "grad_norm": 1.6436505317687988, + "learning_rate": 1.513863695257839e-05, + "loss": 1.0716, + "step": 2921 + }, + { + "epoch": 0.06815021267787538, + "grad_norm": 2.4779422283172607, + "learning_rate": 1.5143819642394404e-05, + "loss": 1.5326, + "step": 2922 + }, + { + "epoch": 0.0681735358170533, + "grad_norm": 1.9814671277999878, + "learning_rate": 1.5149002332210419e-05, + "loss": 1.5269, + "step": 2923 + }, + { + "epoch": 0.06819685895623122, + "grad_norm": 1.9176750183105469, + "learning_rate": 1.5154185022026433e-05, + "loss": 1.4689, + "step": 2924 + }, + { + "epoch": 0.06822018209540913, + "grad_norm": 1.3773646354675293, + "learning_rate": 1.5159367711842447e-05, + "loss": 1.2972, + "step": 2925 + }, + { + "epoch": 0.06824350523458705, + "grad_norm": 2.0976767539978027, + "learning_rate": 1.5164550401658462e-05, + "loss": 1.3239, + "step": 2926 + }, + { + "epoch": 0.06826682837376497, + "grad_norm": 1.9581456184387207, + "learning_rate": 1.5169733091474476e-05, + "loss": 1.9213, + "step": 2927 + }, + { + "epoch": 0.06829015151294289, + "grad_norm": 1.74188232421875, + "learning_rate": 1.517491578129049e-05, + "loss": 1.5739, + "step": 2928 + }, + { + "epoch": 0.0683134746521208, + "grad_norm": 1.7404353618621826, + "learning_rate": 1.5180098471106507e-05, + "loss": 1.6213, + "step": 2929 + }, + { + "epoch": 0.06833679779129873, + "grad_norm": 1.6050968170166016, + "learning_rate": 1.5185281160922521e-05, + "loss": 1.6145, + "step": 2930 + }, + { + "epoch": 0.06836012093047664, + "grad_norm": 2.024853229522705, + "learning_rate": 1.5190463850738535e-05, + "loss": 1.344, + "step": 2931 + }, + { + "epoch": 0.06838344406965456, + "grad_norm": 1.593658685684204, + "learning_rate": 1.519564654055455e-05, + "loss": 1.2873, + "step": 2932 + }, + { + "epoch": 0.06840676720883247, + "grad_norm": 2.121755838394165, + "learning_rate": 1.5200829230370564e-05, + "loss": 1.6169, + "step": 2933 + }, + { + "epoch": 0.0684300903480104, + "grad_norm": 1.858398199081421, + "learning_rate": 1.5206011920186578e-05, + "loss": 1.5265, + "step": 2934 + }, + { + "epoch": 0.06845341348718831, + "grad_norm": 1.6854084730148315, + "learning_rate": 1.5211194610002593e-05, + "loss": 1.3703, + "step": 2935 + }, + { + "epoch": 0.06847673662636622, + "grad_norm": 2.447187900543213, + "learning_rate": 1.5216377299818607e-05, + "loss": 1.5762, + "step": 2936 + }, + { + "epoch": 0.06850005976554414, + "grad_norm": 1.7991342544555664, + "learning_rate": 1.5221559989634623e-05, + "loss": 1.4965, + "step": 2937 + }, + { + "epoch": 0.06852338290472205, + "grad_norm": 1.6656452417373657, + "learning_rate": 1.5226742679450637e-05, + "loss": 1.5852, + "step": 2938 + }, + { + "epoch": 0.06854670604389998, + "grad_norm": 2.030794143676758, + "learning_rate": 1.5231925369266652e-05, + "loss": 1.479, + "step": 2939 + }, + { + "epoch": 0.06857002918307789, + "grad_norm": 1.8245702981948853, + "learning_rate": 1.5237108059082666e-05, + "loss": 1.6709, + "step": 2940 + }, + { + "epoch": 0.06859335232225582, + "grad_norm": 1.472434163093567, + "learning_rate": 1.524229074889868e-05, + "loss": 1.3113, + "step": 2941 + }, + { + "epoch": 0.06861667546143373, + "grad_norm": 2.1557259559631348, + "learning_rate": 1.5247473438714695e-05, + "loss": 1.8385, + "step": 2942 + }, + { + "epoch": 0.06863999860061165, + "grad_norm": 1.5355175733566284, + "learning_rate": 1.5252656128530709e-05, + "loss": 1.2249, + "step": 2943 + }, + { + "epoch": 0.06866332173978956, + "grad_norm": 1.8443330526351929, + "learning_rate": 1.5257838818346723e-05, + "loss": 1.4584, + "step": 2944 + }, + { + "epoch": 0.06868664487896749, + "grad_norm": 1.898336410522461, + "learning_rate": 1.526302150816274e-05, + "loss": 1.4696, + "step": 2945 + }, + { + "epoch": 0.0687099680181454, + "grad_norm": 1.7229390144348145, + "learning_rate": 1.5268204197978754e-05, + "loss": 1.4675, + "step": 2946 + }, + { + "epoch": 0.06873329115732332, + "grad_norm": 1.6735163927078247, + "learning_rate": 1.5273386887794768e-05, + "loss": 1.8393, + "step": 2947 + }, + { + "epoch": 0.06875661429650123, + "grad_norm": 1.5430946350097656, + "learning_rate": 1.5278569577610783e-05, + "loss": 1.252, + "step": 2948 + }, + { + "epoch": 0.06877993743567916, + "grad_norm": 2.2710747718811035, + "learning_rate": 1.5283752267426797e-05, + "loss": 1.2602, + "step": 2949 + }, + { + "epoch": 0.06880326057485707, + "grad_norm": 2.163055658340454, + "learning_rate": 1.5288934957242808e-05, + "loss": 1.5387, + "step": 2950 + }, + { + "epoch": 0.068826583714035, + "grad_norm": 1.905474066734314, + "learning_rate": 1.5294117647058822e-05, + "loss": 1.4113, + "step": 2951 + }, + { + "epoch": 0.0688499068532129, + "grad_norm": 1.6778980493545532, + "learning_rate": 1.529930033687484e-05, + "loss": 1.4245, + "step": 2952 + }, + { + "epoch": 0.06887322999239083, + "grad_norm": 1.684782862663269, + "learning_rate": 1.5304483026690854e-05, + "loss": 1.6875, + "step": 2953 + }, + { + "epoch": 0.06889655313156874, + "grad_norm": 1.8047994375228882, + "learning_rate": 1.530966571650687e-05, + "loss": 1.6221, + "step": 2954 + }, + { + "epoch": 0.06891987627074667, + "grad_norm": 1.7538729906082153, + "learning_rate": 1.5314848406322883e-05, + "loss": 1.5855, + "step": 2955 + }, + { + "epoch": 0.06894319940992458, + "grad_norm": 2.5780930519104004, + "learning_rate": 1.5320031096138897e-05, + "loss": 1.4451, + "step": 2956 + }, + { + "epoch": 0.0689665225491025, + "grad_norm": 2.0026376247406006, + "learning_rate": 1.532521378595491e-05, + "loss": 1.372, + "step": 2957 + }, + { + "epoch": 0.06898984568828041, + "grad_norm": 2.1854658126831055, + "learning_rate": 1.5330396475770926e-05, + "loss": 1.3999, + "step": 2958 + }, + { + "epoch": 0.06901316882745834, + "grad_norm": 1.9142454862594604, + "learning_rate": 1.533557916558694e-05, + "loss": 1.3172, + "step": 2959 + }, + { + "epoch": 0.06903649196663625, + "grad_norm": 1.423509120941162, + "learning_rate": 1.5340761855402955e-05, + "loss": 1.1681, + "step": 2960 + }, + { + "epoch": 0.06905981510581417, + "grad_norm": 1.7319942712783813, + "learning_rate": 1.534594454521897e-05, + "loss": 1.2578, + "step": 2961 + }, + { + "epoch": 0.06908313824499208, + "grad_norm": 1.640005111694336, + "learning_rate": 1.5351127235034983e-05, + "loss": 1.1881, + "step": 2962 + }, + { + "epoch": 0.06910646138417001, + "grad_norm": 1.9526257514953613, + "learning_rate": 1.5356309924850998e-05, + "loss": 1.7211, + "step": 2963 + }, + { + "epoch": 0.06912978452334792, + "grad_norm": 1.7995927333831787, + "learning_rate": 1.5361492614667012e-05, + "loss": 1.5302, + "step": 2964 + }, + { + "epoch": 0.06915310766252583, + "grad_norm": 1.7013497352600098, + "learning_rate": 1.5366675304483026e-05, + "loss": 1.4952, + "step": 2965 + }, + { + "epoch": 0.06917643080170376, + "grad_norm": 1.8493554592132568, + "learning_rate": 1.537185799429904e-05, + "loss": 1.5287, + "step": 2966 + }, + { + "epoch": 0.06919975394088167, + "grad_norm": 2.2868854999542236, + "learning_rate": 1.5377040684115055e-05, + "loss": 1.1454, + "step": 2967 + }, + { + "epoch": 0.06922307708005959, + "grad_norm": 2.045095443725586, + "learning_rate": 1.538222337393107e-05, + "loss": 1.3912, + "step": 2968 + }, + { + "epoch": 0.0692464002192375, + "grad_norm": 1.8001420497894287, + "learning_rate": 1.5387406063747087e-05, + "loss": 1.4114, + "step": 2969 + }, + { + "epoch": 0.06926972335841543, + "grad_norm": 2.4776105880737305, + "learning_rate": 1.53925887535631e-05, + "loss": 1.4288, + "step": 2970 + }, + { + "epoch": 0.06929304649759334, + "grad_norm": 2.2437918186187744, + "learning_rate": 1.5397771443379116e-05, + "loss": 1.2949, + "step": 2971 + }, + { + "epoch": 0.06931636963677126, + "grad_norm": 1.7763323783874512, + "learning_rate": 1.540295413319513e-05, + "loss": 1.0882, + "step": 2972 + }, + { + "epoch": 0.06933969277594917, + "grad_norm": 2.110168933868408, + "learning_rate": 1.5408136823011145e-05, + "loss": 1.1464, + "step": 2973 + }, + { + "epoch": 0.0693630159151271, + "grad_norm": 1.5015451908111572, + "learning_rate": 1.541331951282716e-05, + "loss": 1.4931, + "step": 2974 + }, + { + "epoch": 0.06938633905430501, + "grad_norm": 2.220963716506958, + "learning_rate": 1.5418502202643173e-05, + "loss": 1.3172, + "step": 2975 + }, + { + "epoch": 0.06940966219348293, + "grad_norm": 1.5186398029327393, + "learning_rate": 1.5423684892459188e-05, + "loss": 1.2238, + "step": 2976 + }, + { + "epoch": 0.06943298533266085, + "grad_norm": 1.7325942516326904, + "learning_rate": 1.5428867582275202e-05, + "loss": 1.7801, + "step": 2977 + }, + { + "epoch": 0.06945630847183877, + "grad_norm": 2.1025593280792236, + "learning_rate": 1.5434050272091216e-05, + "loss": 1.5691, + "step": 2978 + }, + { + "epoch": 0.06947963161101668, + "grad_norm": 1.9171198606491089, + "learning_rate": 1.543923296190723e-05, + "loss": 1.3018, + "step": 2979 + }, + { + "epoch": 0.0695029547501946, + "grad_norm": 1.9191886186599731, + "learning_rate": 1.5444415651723245e-05, + "loss": 1.1185, + "step": 2980 + }, + { + "epoch": 0.06952627788937252, + "grad_norm": 1.8381496667861938, + "learning_rate": 1.544959834153926e-05, + "loss": 1.6724, + "step": 2981 + }, + { + "epoch": 0.06954960102855044, + "grad_norm": 2.372688055038452, + "learning_rate": 1.5454781031355274e-05, + "loss": 1.4596, + "step": 2982 + }, + { + "epoch": 0.06957292416772835, + "grad_norm": 2.1257786750793457, + "learning_rate": 1.5459963721171288e-05, + "loss": 1.7045, + "step": 2983 + }, + { + "epoch": 0.06959624730690628, + "grad_norm": 1.7516330480575562, + "learning_rate": 1.5465146410987302e-05, + "loss": 1.5536, + "step": 2984 + }, + { + "epoch": 0.06961957044608419, + "grad_norm": 2.6538538932800293, + "learning_rate": 1.547032910080332e-05, + "loss": 1.3659, + "step": 2985 + }, + { + "epoch": 0.06964289358526211, + "grad_norm": 1.7529492378234863, + "learning_rate": 1.5475511790619334e-05, + "loss": 1.6208, + "step": 2986 + }, + { + "epoch": 0.06966621672444002, + "grad_norm": 2.090695858001709, + "learning_rate": 1.548069448043535e-05, + "loss": 1.348, + "step": 2987 + }, + { + "epoch": 0.06968953986361795, + "grad_norm": 2.129758834838867, + "learning_rate": 1.5485877170251363e-05, + "loss": 1.5219, + "step": 2988 + }, + { + "epoch": 0.06971286300279586, + "grad_norm": 1.8315856456756592, + "learning_rate": 1.5491059860067378e-05, + "loss": 1.2473, + "step": 2989 + }, + { + "epoch": 0.06973618614197379, + "grad_norm": 1.7378078699111938, + "learning_rate": 1.5496242549883392e-05, + "loss": 1.6124, + "step": 2990 + }, + { + "epoch": 0.0697595092811517, + "grad_norm": 2.115633487701416, + "learning_rate": 1.5501425239699406e-05, + "loss": 1.2983, + "step": 2991 + }, + { + "epoch": 0.06978283242032961, + "grad_norm": 1.7724013328552246, + "learning_rate": 1.550660792951542e-05, + "loss": 1.4654, + "step": 2992 + }, + { + "epoch": 0.06980615555950753, + "grad_norm": 2.1570277214050293, + "learning_rate": 1.5511790619331435e-05, + "loss": 1.5257, + "step": 2993 + }, + { + "epoch": 0.06982947869868544, + "grad_norm": 1.644012689590454, + "learning_rate": 1.551697330914745e-05, + "loss": 1.277, + "step": 2994 + }, + { + "epoch": 0.06985280183786337, + "grad_norm": 1.7980034351348877, + "learning_rate": 1.5522155998963464e-05, + "loss": 1.5572, + "step": 2995 + }, + { + "epoch": 0.06987612497704128, + "grad_norm": 1.6826823949813843, + "learning_rate": 1.5527338688779478e-05, + "loss": 1.246, + "step": 2996 + }, + { + "epoch": 0.0698994481162192, + "grad_norm": 1.8310699462890625, + "learning_rate": 1.5532521378595492e-05, + "loss": 1.3459, + "step": 2997 + }, + { + "epoch": 0.06992277125539711, + "grad_norm": 1.914998173713684, + "learning_rate": 1.5537704068411507e-05, + "loss": 1.4237, + "step": 2998 + }, + { + "epoch": 0.06994609439457504, + "grad_norm": 1.4848195314407349, + "learning_rate": 1.554288675822752e-05, + "loss": 1.2903, + "step": 2999 + }, + { + "epoch": 0.06996941753375295, + "grad_norm": 2.3208260536193848, + "learning_rate": 1.5548069448043535e-05, + "loss": 1.972, + "step": 3000 + }, + { + "epoch": 0.06999274067293088, + "grad_norm": 2.3595495223999023, + "learning_rate": 1.555325213785955e-05, + "loss": 1.4813, + "step": 3001 + }, + { + "epoch": 0.07001606381210879, + "grad_norm": 2.143214464187622, + "learning_rate": 1.5558434827675567e-05, + "loss": 1.2124, + "step": 3002 + }, + { + "epoch": 0.07003938695128671, + "grad_norm": 1.6811202764511108, + "learning_rate": 1.5563617517491582e-05, + "loss": 1.3735, + "step": 3003 + }, + { + "epoch": 0.07006271009046462, + "grad_norm": 1.5104790925979614, + "learning_rate": 1.5568800207307596e-05, + "loss": 1.5616, + "step": 3004 + }, + { + "epoch": 0.07008603322964255, + "grad_norm": 2.121225357055664, + "learning_rate": 1.5573982897123607e-05, + "loss": 1.857, + "step": 3005 + }, + { + "epoch": 0.07010935636882046, + "grad_norm": 2.3359665870666504, + "learning_rate": 1.557916558693962e-05, + "loss": 1.426, + "step": 3006 + }, + { + "epoch": 0.07013267950799838, + "grad_norm": 1.9051471948623657, + "learning_rate": 1.5584348276755636e-05, + "loss": 1.5802, + "step": 3007 + }, + { + "epoch": 0.0701560026471763, + "grad_norm": 1.8195501565933228, + "learning_rate": 1.558953096657165e-05, + "loss": 1.5288, + "step": 3008 + }, + { + "epoch": 0.07017932578635422, + "grad_norm": 1.8926808834075928, + "learning_rate": 1.5594713656387664e-05, + "loss": 1.415, + "step": 3009 + }, + { + "epoch": 0.07020264892553213, + "grad_norm": 1.7610915899276733, + "learning_rate": 1.559989634620368e-05, + "loss": 1.3034, + "step": 3010 + }, + { + "epoch": 0.07022597206471005, + "grad_norm": 1.8621258735656738, + "learning_rate": 1.5605079036019693e-05, + "loss": 1.3214, + "step": 3011 + }, + { + "epoch": 0.07024929520388797, + "grad_norm": 2.0164079666137695, + "learning_rate": 1.5610261725835708e-05, + "loss": 1.3016, + "step": 3012 + }, + { + "epoch": 0.07027261834306589, + "grad_norm": 1.7668334245681763, + "learning_rate": 1.5615444415651725e-05, + "loss": 1.3767, + "step": 3013 + }, + { + "epoch": 0.0702959414822438, + "grad_norm": 2.2490322589874268, + "learning_rate": 1.562062710546774e-05, + "loss": 1.6974, + "step": 3014 + }, + { + "epoch": 0.07031926462142173, + "grad_norm": 2.4273457527160645, + "learning_rate": 1.5625809795283754e-05, + "loss": 1.5084, + "step": 3015 + }, + { + "epoch": 0.07034258776059964, + "grad_norm": 1.5331447124481201, + "learning_rate": 1.5630992485099768e-05, + "loss": 1.3963, + "step": 3016 + }, + { + "epoch": 0.07036591089977756, + "grad_norm": 1.5221631526947021, + "learning_rate": 1.5636175174915783e-05, + "loss": 1.3032, + "step": 3017 + }, + { + "epoch": 0.07038923403895547, + "grad_norm": 1.9873692989349365, + "learning_rate": 1.5641357864731797e-05, + "loss": 1.8065, + "step": 3018 + }, + { + "epoch": 0.0704125571781334, + "grad_norm": 1.7702462673187256, + "learning_rate": 1.564654055454781e-05, + "loss": 1.2632, + "step": 3019 + }, + { + "epoch": 0.07043588031731131, + "grad_norm": 1.7505241632461548, + "learning_rate": 1.5651723244363826e-05, + "loss": 1.2402, + "step": 3020 + }, + { + "epoch": 0.07045920345648922, + "grad_norm": 1.8434094190597534, + "learning_rate": 1.565690593417984e-05, + "loss": 1.4673, + "step": 3021 + }, + { + "epoch": 0.07048252659566714, + "grad_norm": 1.4408884048461914, + "learning_rate": 1.5662088623995854e-05, + "loss": 1.182, + "step": 3022 + }, + { + "epoch": 0.07050584973484506, + "grad_norm": 1.3842504024505615, + "learning_rate": 1.566727131381187e-05, + "loss": 1.4564, + "step": 3023 + }, + { + "epoch": 0.07052917287402298, + "grad_norm": 1.9442074298858643, + "learning_rate": 1.5672454003627883e-05, + "loss": 1.5722, + "step": 3024 + }, + { + "epoch": 0.07055249601320089, + "grad_norm": 1.6745545864105225, + "learning_rate": 1.5677636693443897e-05, + "loss": 1.3703, + "step": 3025 + }, + { + "epoch": 0.07057581915237882, + "grad_norm": 1.7542271614074707, + "learning_rate": 1.5682819383259912e-05, + "loss": 1.1784, + "step": 3026 + }, + { + "epoch": 0.07059914229155673, + "grad_norm": 1.6971882581710815, + "learning_rate": 1.5688002073075926e-05, + "loss": 1.1932, + "step": 3027 + }, + { + "epoch": 0.07062246543073465, + "grad_norm": 1.9626144170761108, + "learning_rate": 1.569318476289194e-05, + "loss": 1.7452, + "step": 3028 + }, + { + "epoch": 0.07064578856991256, + "grad_norm": 1.5161895751953125, + "learning_rate": 1.5698367452707958e-05, + "loss": 1.4136, + "step": 3029 + }, + { + "epoch": 0.07066911170909049, + "grad_norm": 2.5730278491973877, + "learning_rate": 1.5703550142523973e-05, + "loss": 1.5446, + "step": 3030 + }, + { + "epoch": 0.0706924348482684, + "grad_norm": 1.8499232530593872, + "learning_rate": 1.5708732832339987e-05, + "loss": 1.5568, + "step": 3031 + }, + { + "epoch": 0.07071575798744632, + "grad_norm": 1.6006439924240112, + "learning_rate": 1.5713915522156e-05, + "loss": 1.596, + "step": 3032 + }, + { + "epoch": 0.07073908112662423, + "grad_norm": 2.0837395191192627, + "learning_rate": 1.5719098211972016e-05, + "loss": 1.443, + "step": 3033 + }, + { + "epoch": 0.07076240426580216, + "grad_norm": 1.77737557888031, + "learning_rate": 1.572428090178803e-05, + "loss": 1.7192, + "step": 3034 + }, + { + "epoch": 0.07078572740498007, + "grad_norm": 1.7115414142608643, + "learning_rate": 1.5729463591604044e-05, + "loss": 1.2101, + "step": 3035 + }, + { + "epoch": 0.070809050544158, + "grad_norm": 1.6676453351974487, + "learning_rate": 1.573464628142006e-05, + "loss": 1.4841, + "step": 3036 + }, + { + "epoch": 0.0708323736833359, + "grad_norm": 2.0481326580047607, + "learning_rate": 1.5739828971236073e-05, + "loss": 1.6479, + "step": 3037 + }, + { + "epoch": 0.07085569682251383, + "grad_norm": 1.7933276891708374, + "learning_rate": 1.5745011661052087e-05, + "loss": 1.0619, + "step": 3038 + }, + { + "epoch": 0.07087901996169174, + "grad_norm": 2.1594574451446533, + "learning_rate": 1.57501943508681e-05, + "loss": 1.5329, + "step": 3039 + }, + { + "epoch": 0.07090234310086967, + "grad_norm": 1.6190122365951538, + "learning_rate": 1.5755377040684116e-05, + "loss": 1.1178, + "step": 3040 + }, + { + "epoch": 0.07092566624004758, + "grad_norm": 1.773872971534729, + "learning_rate": 1.576055973050013e-05, + "loss": 1.6123, + "step": 3041 + }, + { + "epoch": 0.0709489893792255, + "grad_norm": 1.6447725296020508, + "learning_rate": 1.5765742420316145e-05, + "loss": 1.2291, + "step": 3042 + }, + { + "epoch": 0.07097231251840341, + "grad_norm": 2.039640426635742, + "learning_rate": 1.577092511013216e-05, + "loss": 1.7433, + "step": 3043 + }, + { + "epoch": 0.07099563565758134, + "grad_norm": 1.6734812259674072, + "learning_rate": 1.5776107799948173e-05, + "loss": 1.6518, + "step": 3044 + }, + { + "epoch": 0.07101895879675925, + "grad_norm": 1.7559072971343994, + "learning_rate": 1.5781290489764188e-05, + "loss": 1.142, + "step": 3045 + }, + { + "epoch": 0.07104228193593717, + "grad_norm": 1.6769473552703857, + "learning_rate": 1.5786473179580205e-05, + "loss": 1.5003, + "step": 3046 + }, + { + "epoch": 0.07106560507511508, + "grad_norm": 1.8988209962844849, + "learning_rate": 1.579165586939622e-05, + "loss": 1.6845, + "step": 3047 + }, + { + "epoch": 0.07108892821429301, + "grad_norm": 1.9083596467971802, + "learning_rate": 1.5796838559212234e-05, + "loss": 1.144, + "step": 3048 + }, + { + "epoch": 0.07111225135347092, + "grad_norm": 2.1103906631469727, + "learning_rate": 1.580202124902825e-05, + "loss": 1.1897, + "step": 3049 + }, + { + "epoch": 0.07113557449264883, + "grad_norm": 1.7527072429656982, + "learning_rate": 1.5807203938844263e-05, + "loss": 1.4615, + "step": 3050 + }, + { + "epoch": 0.07115889763182676, + "grad_norm": 1.745250940322876, + "learning_rate": 1.5812386628660277e-05, + "loss": 1.6192, + "step": 3051 + }, + { + "epoch": 0.07118222077100467, + "grad_norm": 2.378971815109253, + "learning_rate": 1.581756931847629e-05, + "loss": 1.5468, + "step": 3052 + }, + { + "epoch": 0.07120554391018259, + "grad_norm": 1.9410420656204224, + "learning_rate": 1.5822752008292306e-05, + "loss": 1.1459, + "step": 3053 + }, + { + "epoch": 0.0712288670493605, + "grad_norm": 1.7086267471313477, + "learning_rate": 1.582793469810832e-05, + "loss": 1.2372, + "step": 3054 + }, + { + "epoch": 0.07125219018853843, + "grad_norm": 1.9025890827178955, + "learning_rate": 1.5833117387924335e-05, + "loss": 1.561, + "step": 3055 + }, + { + "epoch": 0.07127551332771634, + "grad_norm": 1.5841965675354004, + "learning_rate": 1.583830007774035e-05, + "loss": 1.4006, + "step": 3056 + }, + { + "epoch": 0.07129883646689426, + "grad_norm": 2.0455946922302246, + "learning_rate": 1.5843482767556363e-05, + "loss": 1.2966, + "step": 3057 + }, + { + "epoch": 0.07132215960607217, + "grad_norm": 1.6825358867645264, + "learning_rate": 1.5848665457372378e-05, + "loss": 1.4364, + "step": 3058 + }, + { + "epoch": 0.0713454827452501, + "grad_norm": 1.7199450731277466, + "learning_rate": 1.5853848147188392e-05, + "loss": 1.1987, + "step": 3059 + }, + { + "epoch": 0.07136880588442801, + "grad_norm": 1.9703516960144043, + "learning_rate": 1.5859030837004406e-05, + "loss": 1.609, + "step": 3060 + }, + { + "epoch": 0.07139212902360594, + "grad_norm": 1.8907127380371094, + "learning_rate": 1.586421352682042e-05, + "loss": 1.5658, + "step": 3061 + }, + { + "epoch": 0.07141545216278385, + "grad_norm": 1.7374207973480225, + "learning_rate": 1.5869396216636435e-05, + "loss": 1.4888, + "step": 3062 + }, + { + "epoch": 0.07143877530196177, + "grad_norm": 1.549346685409546, + "learning_rate": 1.587457890645245e-05, + "loss": 1.2308, + "step": 3063 + }, + { + "epoch": 0.07146209844113968, + "grad_norm": 1.2408068180084229, + "learning_rate": 1.5879761596268464e-05, + "loss": 1.2161, + "step": 3064 + }, + { + "epoch": 0.0714854215803176, + "grad_norm": 1.6506532430648804, + "learning_rate": 1.5884944286084478e-05, + "loss": 1.2904, + "step": 3065 + }, + { + "epoch": 0.07150874471949552, + "grad_norm": 2.1740195751190186, + "learning_rate": 1.5890126975900492e-05, + "loss": 1.5094, + "step": 3066 + }, + { + "epoch": 0.07153206785867344, + "grad_norm": 1.4976023435592651, + "learning_rate": 1.5895309665716507e-05, + "loss": 1.2849, + "step": 3067 + }, + { + "epoch": 0.07155539099785135, + "grad_norm": 1.8397139310836792, + "learning_rate": 1.590049235553252e-05, + "loss": 1.6023, + "step": 3068 + }, + { + "epoch": 0.07157871413702928, + "grad_norm": 1.8328680992126465, + "learning_rate": 1.5905675045348535e-05, + "loss": 1.3928, + "step": 3069 + }, + { + "epoch": 0.07160203727620719, + "grad_norm": 1.487030029296875, + "learning_rate": 1.591085773516455e-05, + "loss": 1.2776, + "step": 3070 + }, + { + "epoch": 0.07162536041538511, + "grad_norm": 2.1938352584838867, + "learning_rate": 1.5916040424980564e-05, + "loss": 1.6936, + "step": 3071 + }, + { + "epoch": 0.07164868355456303, + "grad_norm": 1.7803940773010254, + "learning_rate": 1.592122311479658e-05, + "loss": 1.4459, + "step": 3072 + }, + { + "epoch": 0.07167200669374095, + "grad_norm": 3.9215104579925537, + "learning_rate": 1.5926405804612596e-05, + "loss": 1.3872, + "step": 3073 + }, + { + "epoch": 0.07169532983291886, + "grad_norm": 2.229813575744629, + "learning_rate": 1.593158849442861e-05, + "loss": 1.5246, + "step": 3074 + }, + { + "epoch": 0.07171865297209679, + "grad_norm": 1.808138132095337, + "learning_rate": 1.5936771184244625e-05, + "loss": 1.313, + "step": 3075 + }, + { + "epoch": 0.0717419761112747, + "grad_norm": 1.741746187210083, + "learning_rate": 1.594195387406064e-05, + "loss": 1.7373, + "step": 3076 + }, + { + "epoch": 0.07176529925045262, + "grad_norm": 1.9369068145751953, + "learning_rate": 1.5947136563876654e-05, + "loss": 1.4971, + "step": 3077 + }, + { + "epoch": 0.07178862238963053, + "grad_norm": 1.7145378589630127, + "learning_rate": 1.5952319253692668e-05, + "loss": 1.2739, + "step": 3078 + }, + { + "epoch": 0.07181194552880844, + "grad_norm": 1.8517347574234009, + "learning_rate": 1.5957501943508682e-05, + "loss": 1.5945, + "step": 3079 + }, + { + "epoch": 0.07183526866798637, + "grad_norm": 2.492448091506958, + "learning_rate": 1.5962684633324697e-05, + "loss": 1.4941, + "step": 3080 + }, + { + "epoch": 0.07185859180716428, + "grad_norm": 2.4276201725006104, + "learning_rate": 1.596786732314071e-05, + "loss": 1.3294, + "step": 3081 + }, + { + "epoch": 0.0718819149463422, + "grad_norm": 1.579856276512146, + "learning_rate": 1.5973050012956725e-05, + "loss": 1.317, + "step": 3082 + }, + { + "epoch": 0.07190523808552012, + "grad_norm": 2.4683403968811035, + "learning_rate": 1.597823270277274e-05, + "loss": 1.6417, + "step": 3083 + }, + { + "epoch": 0.07192856122469804, + "grad_norm": 1.8689765930175781, + "learning_rate": 1.5983415392588754e-05, + "loss": 1.6266, + "step": 3084 + }, + { + "epoch": 0.07195188436387595, + "grad_norm": 1.729116678237915, + "learning_rate": 1.598859808240477e-05, + "loss": 1.2618, + "step": 3085 + }, + { + "epoch": 0.07197520750305388, + "grad_norm": 1.9072259664535522, + "learning_rate": 1.5993780772220783e-05, + "loss": 1.1818, + "step": 3086 + }, + { + "epoch": 0.07199853064223179, + "grad_norm": 2.139037847518921, + "learning_rate": 1.5998963462036797e-05, + "loss": 1.2447, + "step": 3087 + }, + { + "epoch": 0.07202185378140971, + "grad_norm": 1.903786540031433, + "learning_rate": 1.600414615185281e-05, + "loss": 1.2269, + "step": 3088 + }, + { + "epoch": 0.07204517692058762, + "grad_norm": 1.8438259363174438, + "learning_rate": 1.600932884166883e-05, + "loss": 1.3741, + "step": 3089 + }, + { + "epoch": 0.07206850005976555, + "grad_norm": 1.5920443534851074, + "learning_rate": 1.6014511531484844e-05, + "loss": 1.5759, + "step": 3090 + }, + { + "epoch": 0.07209182319894346, + "grad_norm": 1.9949947595596313, + "learning_rate": 1.6019694221300858e-05, + "loss": 1.5934, + "step": 3091 + }, + { + "epoch": 0.07211514633812138, + "grad_norm": 1.9100804328918457, + "learning_rate": 1.6024876911116872e-05, + "loss": 1.5361, + "step": 3092 + }, + { + "epoch": 0.0721384694772993, + "grad_norm": 1.7444504499435425, + "learning_rate": 1.6030059600932887e-05, + "loss": 1.4275, + "step": 3093 + }, + { + "epoch": 0.07216179261647722, + "grad_norm": 1.7154817581176758, + "learning_rate": 1.60352422907489e-05, + "loss": 1.4801, + "step": 3094 + }, + { + "epoch": 0.07218511575565513, + "grad_norm": 1.7063403129577637, + "learning_rate": 1.6040424980564915e-05, + "loss": 1.2088, + "step": 3095 + }, + { + "epoch": 0.07220843889483305, + "grad_norm": 1.652329444885254, + "learning_rate": 1.604560767038093e-05, + "loss": 1.5211, + "step": 3096 + }, + { + "epoch": 0.07223176203401097, + "grad_norm": 1.7522650957107544, + "learning_rate": 1.6050790360196944e-05, + "loss": 1.3027, + "step": 3097 + }, + { + "epoch": 0.07225508517318889, + "grad_norm": 1.740250825881958, + "learning_rate": 1.6055973050012958e-05, + "loss": 1.8394, + "step": 3098 + }, + { + "epoch": 0.0722784083123668, + "grad_norm": 1.6055843830108643, + "learning_rate": 1.6061155739828973e-05, + "loss": 1.3905, + "step": 3099 + }, + { + "epoch": 0.07230173145154473, + "grad_norm": 1.9458154439926147, + "learning_rate": 1.6066338429644987e-05, + "loss": 1.212, + "step": 3100 + }, + { + "epoch": 0.07232505459072264, + "grad_norm": 1.5730454921722412, + "learning_rate": 1.6071521119461e-05, + "loss": 1.672, + "step": 3101 + }, + { + "epoch": 0.07234837772990056, + "grad_norm": 2.145181179046631, + "learning_rate": 1.6076703809277016e-05, + "loss": 1.4367, + "step": 3102 + }, + { + "epoch": 0.07237170086907847, + "grad_norm": 1.999090313911438, + "learning_rate": 1.608188649909303e-05, + "loss": 1.2467, + "step": 3103 + }, + { + "epoch": 0.0723950240082564, + "grad_norm": 1.9247711896896362, + "learning_rate": 1.6087069188909044e-05, + "loss": 1.4135, + "step": 3104 + }, + { + "epoch": 0.07241834714743431, + "grad_norm": 1.3318367004394531, + "learning_rate": 1.609225187872506e-05, + "loss": 1.2565, + "step": 3105 + }, + { + "epoch": 0.07244167028661222, + "grad_norm": 1.9068901538848877, + "learning_rate": 1.6097434568541076e-05, + "loss": 1.4268, + "step": 3106 + }, + { + "epoch": 0.07246499342579014, + "grad_norm": 1.8222874402999878, + "learning_rate": 1.610261725835709e-05, + "loss": 1.3323, + "step": 3107 + }, + { + "epoch": 0.07248831656496806, + "grad_norm": 1.9893977642059326, + "learning_rate": 1.6107799948173105e-05, + "loss": 1.5658, + "step": 3108 + }, + { + "epoch": 0.07251163970414598, + "grad_norm": 1.3330308198928833, + "learning_rate": 1.611298263798912e-05, + "loss": 1.3951, + "step": 3109 + }, + { + "epoch": 0.07253496284332389, + "grad_norm": 1.514310359954834, + "learning_rate": 1.6118165327805134e-05, + "loss": 1.2224, + "step": 3110 + }, + { + "epoch": 0.07255828598250182, + "grad_norm": 1.8405115604400635, + "learning_rate": 1.6123348017621148e-05, + "loss": 1.7513, + "step": 3111 + }, + { + "epoch": 0.07258160912167973, + "grad_norm": 1.7965067625045776, + "learning_rate": 1.6128530707437163e-05, + "loss": 1.4956, + "step": 3112 + }, + { + "epoch": 0.07260493226085765, + "grad_norm": 1.817044734954834, + "learning_rate": 1.6133713397253177e-05, + "loss": 1.5514, + "step": 3113 + }, + { + "epoch": 0.07262825540003556, + "grad_norm": 1.7619121074676514, + "learning_rate": 1.613889608706919e-05, + "loss": 1.2955, + "step": 3114 + }, + { + "epoch": 0.07265157853921349, + "grad_norm": 2.3285348415374756, + "learning_rate": 1.6144078776885202e-05, + "loss": 1.5876, + "step": 3115 + }, + { + "epoch": 0.0726749016783914, + "grad_norm": 1.8489972352981567, + "learning_rate": 1.6149261466701217e-05, + "loss": 1.8576, + "step": 3116 + }, + { + "epoch": 0.07269822481756932, + "grad_norm": 2.0354526042938232, + "learning_rate": 1.6154444156517234e-05, + "loss": 1.3596, + "step": 3117 + }, + { + "epoch": 0.07272154795674723, + "grad_norm": 1.593621015548706, + "learning_rate": 1.615962684633325e-05, + "loss": 1.7181, + "step": 3118 + }, + { + "epoch": 0.07274487109592516, + "grad_norm": 1.8863354921340942, + "learning_rate": 1.6164809536149263e-05, + "loss": 1.1748, + "step": 3119 + }, + { + "epoch": 0.07276819423510307, + "grad_norm": 1.617163896560669, + "learning_rate": 1.6169992225965277e-05, + "loss": 1.5123, + "step": 3120 + }, + { + "epoch": 0.072791517374281, + "grad_norm": 1.991641640663147, + "learning_rate": 1.617517491578129e-05, + "loss": 1.5395, + "step": 3121 + }, + { + "epoch": 0.0728148405134589, + "grad_norm": 2.255173444747925, + "learning_rate": 1.6180357605597306e-05, + "loss": 1.3539, + "step": 3122 + }, + { + "epoch": 0.07283816365263683, + "grad_norm": 1.4541120529174805, + "learning_rate": 1.618554029541332e-05, + "loss": 1.4153, + "step": 3123 + }, + { + "epoch": 0.07286148679181474, + "grad_norm": 1.4140597581863403, + "learning_rate": 1.6190722985229335e-05, + "loss": 1.1464, + "step": 3124 + }, + { + "epoch": 0.07288480993099267, + "grad_norm": 1.547636866569519, + "learning_rate": 1.619590567504535e-05, + "loss": 1.4475, + "step": 3125 + }, + { + "epoch": 0.07290813307017058, + "grad_norm": 1.637631893157959, + "learning_rate": 1.6201088364861363e-05, + "loss": 1.474, + "step": 3126 + }, + { + "epoch": 0.0729314562093485, + "grad_norm": 1.7381575107574463, + "learning_rate": 1.6206271054677378e-05, + "loss": 1.3445, + "step": 3127 + }, + { + "epoch": 0.07295477934852641, + "grad_norm": 1.9380948543548584, + "learning_rate": 1.6211453744493392e-05, + "loss": 1.4927, + "step": 3128 + }, + { + "epoch": 0.07297810248770434, + "grad_norm": 2.575976848602295, + "learning_rate": 1.6216636434309406e-05, + "loss": 1.2629, + "step": 3129 + }, + { + "epoch": 0.07300142562688225, + "grad_norm": 1.896838665008545, + "learning_rate": 1.622181912412542e-05, + "loss": 1.3337, + "step": 3130 + }, + { + "epoch": 0.07302474876606017, + "grad_norm": 2.829563617706299, + "learning_rate": 1.6227001813941435e-05, + "loss": 1.7271, + "step": 3131 + }, + { + "epoch": 0.07304807190523809, + "grad_norm": 1.8833341598510742, + "learning_rate": 1.623218450375745e-05, + "loss": 1.5144, + "step": 3132 + }, + { + "epoch": 0.07307139504441601, + "grad_norm": 1.7675219774246216, + "learning_rate": 1.6237367193573467e-05, + "loss": 1.476, + "step": 3133 + }, + { + "epoch": 0.07309471818359392, + "grad_norm": 1.708404779434204, + "learning_rate": 1.624254988338948e-05, + "loss": 1.5413, + "step": 3134 + }, + { + "epoch": 0.07311804132277183, + "grad_norm": 1.4991116523742676, + "learning_rate": 1.6247732573205496e-05, + "loss": 1.4898, + "step": 3135 + }, + { + "epoch": 0.07314136446194976, + "grad_norm": 1.7512024641036987, + "learning_rate": 1.625291526302151e-05, + "loss": 1.3796, + "step": 3136 + }, + { + "epoch": 0.07316468760112767, + "grad_norm": 1.567833423614502, + "learning_rate": 1.6258097952837525e-05, + "loss": 1.4894, + "step": 3137 + }, + { + "epoch": 0.07318801074030559, + "grad_norm": 1.9110795259475708, + "learning_rate": 1.626328064265354e-05, + "loss": 1.5233, + "step": 3138 + }, + { + "epoch": 0.0732113338794835, + "grad_norm": 1.618751049041748, + "learning_rate": 1.6268463332469553e-05, + "loss": 1.5909, + "step": 3139 + }, + { + "epoch": 0.07323465701866143, + "grad_norm": 1.643726110458374, + "learning_rate": 1.6273646022285568e-05, + "loss": 1.5929, + "step": 3140 + }, + { + "epoch": 0.07325798015783934, + "grad_norm": 1.7553377151489258, + "learning_rate": 1.6278828712101582e-05, + "loss": 1.2939, + "step": 3141 + }, + { + "epoch": 0.07328130329701726, + "grad_norm": 2.0515267848968506, + "learning_rate": 1.6284011401917596e-05, + "loss": 1.0752, + "step": 3142 + }, + { + "epoch": 0.07330462643619517, + "grad_norm": 2.997544288635254, + "learning_rate": 1.628919409173361e-05, + "loss": 1.0736, + "step": 3143 + }, + { + "epoch": 0.0733279495753731, + "grad_norm": 1.3428021669387817, + "learning_rate": 1.6294376781549625e-05, + "loss": 1.2843, + "step": 3144 + }, + { + "epoch": 0.07335127271455101, + "grad_norm": 2.3941478729248047, + "learning_rate": 1.629955947136564e-05, + "loss": 1.3924, + "step": 3145 + }, + { + "epoch": 0.07337459585372894, + "grad_norm": 2.03312087059021, + "learning_rate": 1.6304742161181654e-05, + "loss": 1.4825, + "step": 3146 + }, + { + "epoch": 0.07339791899290685, + "grad_norm": 1.8753705024719238, + "learning_rate": 1.6309924850997668e-05, + "loss": 1.2441, + "step": 3147 + }, + { + "epoch": 0.07342124213208477, + "grad_norm": 1.5599260330200195, + "learning_rate": 1.6315107540813682e-05, + "loss": 1.6125, + "step": 3148 + }, + { + "epoch": 0.07344456527126268, + "grad_norm": 1.8200551271438599, + "learning_rate": 1.6320290230629697e-05, + "loss": 1.214, + "step": 3149 + }, + { + "epoch": 0.07346788841044061, + "grad_norm": 1.768222451210022, + "learning_rate": 1.6325472920445714e-05, + "loss": 1.2032, + "step": 3150 + }, + { + "epoch": 0.07349121154961852, + "grad_norm": 1.6056883335113525, + "learning_rate": 1.633065561026173e-05, + "loss": 1.1922, + "step": 3151 + }, + { + "epoch": 0.07351453468879644, + "grad_norm": 1.8109151124954224, + "learning_rate": 1.6335838300077743e-05, + "loss": 1.6903, + "step": 3152 + }, + { + "epoch": 0.07353785782797435, + "grad_norm": 2.121112108230591, + "learning_rate": 1.6341020989893758e-05, + "loss": 1.4178, + "step": 3153 + }, + { + "epoch": 0.07356118096715228, + "grad_norm": 1.5514973402023315, + "learning_rate": 1.6346203679709772e-05, + "loss": 1.3702, + "step": 3154 + }, + { + "epoch": 0.07358450410633019, + "grad_norm": 1.8449887037277222, + "learning_rate": 1.6351386369525786e-05, + "loss": 1.5122, + "step": 3155 + }, + { + "epoch": 0.07360782724550811, + "grad_norm": 1.642410397529602, + "learning_rate": 1.63565690593418e-05, + "loss": 1.5452, + "step": 3156 + }, + { + "epoch": 0.07363115038468603, + "grad_norm": 1.4537593126296997, + "learning_rate": 1.6361751749157815e-05, + "loss": 1.4577, + "step": 3157 + }, + { + "epoch": 0.07365447352386395, + "grad_norm": 1.6113066673278809, + "learning_rate": 1.636693443897383e-05, + "loss": 1.3522, + "step": 3158 + }, + { + "epoch": 0.07367779666304186, + "grad_norm": 2.0721449851989746, + "learning_rate": 1.6372117128789844e-05, + "loss": 1.6864, + "step": 3159 + }, + { + "epoch": 0.07370111980221979, + "grad_norm": 1.5149180889129639, + "learning_rate": 1.6377299818605858e-05, + "loss": 1.4672, + "step": 3160 + }, + { + "epoch": 0.0737244429413977, + "grad_norm": 1.5581163167953491, + "learning_rate": 1.6382482508421872e-05, + "loss": 1.5262, + "step": 3161 + }, + { + "epoch": 0.07374776608057562, + "grad_norm": 1.804286003112793, + "learning_rate": 1.6387665198237887e-05, + "loss": 1.3961, + "step": 3162 + }, + { + "epoch": 0.07377108921975353, + "grad_norm": 1.7984623908996582, + "learning_rate": 1.63928478880539e-05, + "loss": 1.4955, + "step": 3163 + }, + { + "epoch": 0.07379441235893144, + "grad_norm": 2.099043607711792, + "learning_rate": 1.6398030577869915e-05, + "loss": 1.4355, + "step": 3164 + }, + { + "epoch": 0.07381773549810937, + "grad_norm": 1.6302787065505981, + "learning_rate": 1.640321326768593e-05, + "loss": 1.5327, + "step": 3165 + }, + { + "epoch": 0.07384105863728728, + "grad_norm": 2.0226998329162598, + "learning_rate": 1.6408395957501947e-05, + "loss": 1.3417, + "step": 3166 + }, + { + "epoch": 0.0738643817764652, + "grad_norm": 1.6209826469421387, + "learning_rate": 1.6413578647317962e-05, + "loss": 1.2699, + "step": 3167 + }, + { + "epoch": 0.07388770491564312, + "grad_norm": 1.7179981470108032, + "learning_rate": 1.6418761337133976e-05, + "loss": 1.5453, + "step": 3168 + }, + { + "epoch": 0.07391102805482104, + "grad_norm": 4.081711769104004, + "learning_rate": 1.642394402694999e-05, + "loss": 1.5203, + "step": 3169 + }, + { + "epoch": 0.07393435119399895, + "grad_norm": 2.2758281230926514, + "learning_rate": 1.6429126716766e-05, + "loss": 1.5944, + "step": 3170 + }, + { + "epoch": 0.07395767433317688, + "grad_norm": 1.8237699270248413, + "learning_rate": 1.6434309406582016e-05, + "loss": 1.2495, + "step": 3171 + }, + { + "epoch": 0.07398099747235479, + "grad_norm": 2.163329601287842, + "learning_rate": 1.643949209639803e-05, + "loss": 1.8061, + "step": 3172 + }, + { + "epoch": 0.07400432061153271, + "grad_norm": 1.429808497428894, + "learning_rate": 1.6444674786214044e-05, + "loss": 1.0702, + "step": 3173 + }, + { + "epoch": 0.07402764375071062, + "grad_norm": 1.5716477632522583, + "learning_rate": 1.644985747603006e-05, + "loss": 1.4544, + "step": 3174 + }, + { + "epoch": 0.07405096688988855, + "grad_norm": 1.6473677158355713, + "learning_rate": 1.6455040165846073e-05, + "loss": 1.2386, + "step": 3175 + }, + { + "epoch": 0.07407429002906646, + "grad_norm": 1.6965765953063965, + "learning_rate": 1.6460222855662088e-05, + "loss": 1.4224, + "step": 3176 + }, + { + "epoch": 0.07409761316824438, + "grad_norm": 1.925316333770752, + "learning_rate": 1.6465405545478105e-05, + "loss": 1.4812, + "step": 3177 + }, + { + "epoch": 0.0741209363074223, + "grad_norm": 1.7018409967422485, + "learning_rate": 1.647058823529412e-05, + "loss": 1.1986, + "step": 3178 + }, + { + "epoch": 0.07414425944660022, + "grad_norm": 1.725853443145752, + "learning_rate": 1.6475770925110134e-05, + "loss": 1.3903, + "step": 3179 + }, + { + "epoch": 0.07416758258577813, + "grad_norm": 1.9766136407852173, + "learning_rate": 1.6480953614926148e-05, + "loss": 1.3297, + "step": 3180 + }, + { + "epoch": 0.07419090572495605, + "grad_norm": 1.682305932044983, + "learning_rate": 1.6486136304742163e-05, + "loss": 1.3133, + "step": 3181 + }, + { + "epoch": 0.07421422886413397, + "grad_norm": 2.0981059074401855, + "learning_rate": 1.6491318994558177e-05, + "loss": 1.5268, + "step": 3182 + }, + { + "epoch": 0.07423755200331189, + "grad_norm": 1.6008816957473755, + "learning_rate": 1.649650168437419e-05, + "loss": 1.6714, + "step": 3183 + }, + { + "epoch": 0.0742608751424898, + "grad_norm": 1.780184268951416, + "learning_rate": 1.6501684374190206e-05, + "loss": 1.4079, + "step": 3184 + }, + { + "epoch": 0.07428419828166773, + "grad_norm": 1.654382586479187, + "learning_rate": 1.650686706400622e-05, + "loss": 1.4573, + "step": 3185 + }, + { + "epoch": 0.07430752142084564, + "grad_norm": 1.8154877424240112, + "learning_rate": 1.6512049753822234e-05, + "loss": 1.5714, + "step": 3186 + }, + { + "epoch": 0.07433084456002356, + "grad_norm": 1.7593024969100952, + "learning_rate": 1.651723244363825e-05, + "loss": 1.6577, + "step": 3187 + }, + { + "epoch": 0.07435416769920147, + "grad_norm": 2.0384089946746826, + "learning_rate": 1.6522415133454263e-05, + "loss": 1.2007, + "step": 3188 + }, + { + "epoch": 0.0743774908383794, + "grad_norm": 2.002246141433716, + "learning_rate": 1.6527597823270277e-05, + "loss": 1.4625, + "step": 3189 + }, + { + "epoch": 0.07440081397755731, + "grad_norm": 1.3846033811569214, + "learning_rate": 1.6532780513086292e-05, + "loss": 1.3212, + "step": 3190 + }, + { + "epoch": 0.07442413711673523, + "grad_norm": 2.119779348373413, + "learning_rate": 1.6537963202902306e-05, + "loss": 1.4693, + "step": 3191 + }, + { + "epoch": 0.07444746025591314, + "grad_norm": 1.6722887754440308, + "learning_rate": 1.654314589271832e-05, + "loss": 1.7212, + "step": 3192 + }, + { + "epoch": 0.07447078339509106, + "grad_norm": 1.9082591533660889, + "learning_rate": 1.6548328582534338e-05, + "loss": 1.5593, + "step": 3193 + }, + { + "epoch": 0.07449410653426898, + "grad_norm": 1.6746292114257812, + "learning_rate": 1.6553511272350353e-05, + "loss": 1.2147, + "step": 3194 + }, + { + "epoch": 0.07451742967344689, + "grad_norm": 2.499532461166382, + "learning_rate": 1.6558693962166367e-05, + "loss": 1.3359, + "step": 3195 + }, + { + "epoch": 0.07454075281262482, + "grad_norm": 1.7034624814987183, + "learning_rate": 1.656387665198238e-05, + "loss": 1.4738, + "step": 3196 + }, + { + "epoch": 0.07456407595180273, + "grad_norm": 1.6847600936889648, + "learning_rate": 1.6569059341798396e-05, + "loss": 1.2525, + "step": 3197 + }, + { + "epoch": 0.07458739909098065, + "grad_norm": 1.8061450719833374, + "learning_rate": 1.657424203161441e-05, + "loss": 1.4098, + "step": 3198 + }, + { + "epoch": 0.07461072223015856, + "grad_norm": 2.442981004714966, + "learning_rate": 1.6579424721430424e-05, + "loss": 1.3678, + "step": 3199 + }, + { + "epoch": 0.07463404536933649, + "grad_norm": 1.8011338710784912, + "learning_rate": 1.658460741124644e-05, + "loss": 1.7033, + "step": 3200 + }, + { + "epoch": 0.0746573685085144, + "grad_norm": 1.6890759468078613, + "learning_rate": 1.6589790101062453e-05, + "loss": 1.5371, + "step": 3201 + }, + { + "epoch": 0.07468069164769232, + "grad_norm": 1.3462575674057007, + "learning_rate": 1.6594972790878467e-05, + "loss": 1.1441, + "step": 3202 + }, + { + "epoch": 0.07470401478687023, + "grad_norm": 1.9787263870239258, + "learning_rate": 1.660015548069448e-05, + "loss": 1.9626, + "step": 3203 + }, + { + "epoch": 0.07472733792604816, + "grad_norm": 1.8819395303726196, + "learning_rate": 1.6605338170510496e-05, + "loss": 1.6671, + "step": 3204 + }, + { + "epoch": 0.07475066106522607, + "grad_norm": 2.3060483932495117, + "learning_rate": 1.661052086032651e-05, + "loss": 1.6444, + "step": 3205 + }, + { + "epoch": 0.074773984204404, + "grad_norm": 2.166884660720825, + "learning_rate": 1.6615703550142525e-05, + "loss": 1.0422, + "step": 3206 + }, + { + "epoch": 0.0747973073435819, + "grad_norm": 1.8128820657730103, + "learning_rate": 1.662088623995854e-05, + "loss": 1.4259, + "step": 3207 + }, + { + "epoch": 0.07482063048275983, + "grad_norm": 2.623307943344116, + "learning_rate": 1.6626068929774553e-05, + "loss": 1.3053, + "step": 3208 + }, + { + "epoch": 0.07484395362193774, + "grad_norm": 1.608069658279419, + "learning_rate": 1.6631251619590568e-05, + "loss": 1.2981, + "step": 3209 + }, + { + "epoch": 0.07486727676111567, + "grad_norm": 3.4346506595611572, + "learning_rate": 1.6636434309406585e-05, + "loss": 1.5589, + "step": 3210 + }, + { + "epoch": 0.07489059990029358, + "grad_norm": 2.037238597869873, + "learning_rate": 1.66416169992226e-05, + "loss": 1.6254, + "step": 3211 + }, + { + "epoch": 0.0749139230394715, + "grad_norm": 2.3247506618499756, + "learning_rate": 1.6646799689038614e-05, + "loss": 1.4934, + "step": 3212 + }, + { + "epoch": 0.07493724617864941, + "grad_norm": 2.582305908203125, + "learning_rate": 1.665198237885463e-05, + "loss": 1.7699, + "step": 3213 + }, + { + "epoch": 0.07496056931782734, + "grad_norm": 2.17980694770813, + "learning_rate": 1.6657165068670643e-05, + "loss": 1.3112, + "step": 3214 + }, + { + "epoch": 0.07498389245700525, + "grad_norm": 1.8005729913711548, + "learning_rate": 1.6662347758486657e-05, + "loss": 1.7, + "step": 3215 + }, + { + "epoch": 0.07500721559618317, + "grad_norm": 1.8963232040405273, + "learning_rate": 1.666753044830267e-05, + "loss": 1.2914, + "step": 3216 + }, + { + "epoch": 0.07503053873536109, + "grad_norm": 1.8256468772888184, + "learning_rate": 1.6672713138118686e-05, + "loss": 2.131, + "step": 3217 + }, + { + "epoch": 0.07505386187453901, + "grad_norm": 1.7902743816375732, + "learning_rate": 1.66778958279347e-05, + "loss": 1.7294, + "step": 3218 + }, + { + "epoch": 0.07507718501371692, + "grad_norm": 1.7782840728759766, + "learning_rate": 1.6683078517750715e-05, + "loss": 1.7079, + "step": 3219 + }, + { + "epoch": 0.07510050815289483, + "grad_norm": 1.5772191286087036, + "learning_rate": 1.668826120756673e-05, + "loss": 1.4248, + "step": 3220 + }, + { + "epoch": 0.07512383129207276, + "grad_norm": 1.633786916732788, + "learning_rate": 1.6693443897382743e-05, + "loss": 1.6423, + "step": 3221 + }, + { + "epoch": 0.07514715443125067, + "grad_norm": 1.9768214225769043, + "learning_rate": 1.6698626587198758e-05, + "loss": 1.1813, + "step": 3222 + }, + { + "epoch": 0.07517047757042859, + "grad_norm": 1.732069969177246, + "learning_rate": 1.6703809277014772e-05, + "loss": 1.448, + "step": 3223 + }, + { + "epoch": 0.0751938007096065, + "grad_norm": 1.6145514249801636, + "learning_rate": 1.6708991966830786e-05, + "loss": 1.2867, + "step": 3224 + }, + { + "epoch": 0.07521712384878443, + "grad_norm": 2.4257102012634277, + "learning_rate": 1.67141746566468e-05, + "loss": 1.5449, + "step": 3225 + }, + { + "epoch": 0.07524044698796234, + "grad_norm": 1.795608639717102, + "learning_rate": 1.6719357346462815e-05, + "loss": 1.5071, + "step": 3226 + }, + { + "epoch": 0.07526377012714026, + "grad_norm": 2.01072096824646, + "learning_rate": 1.672454003627883e-05, + "loss": 1.375, + "step": 3227 + }, + { + "epoch": 0.07528709326631818, + "grad_norm": 2.260044574737549, + "learning_rate": 1.6729722726094844e-05, + "loss": 1.6497, + "step": 3228 + }, + { + "epoch": 0.0753104164054961, + "grad_norm": 2.038987398147583, + "learning_rate": 1.6734905415910858e-05, + "loss": 1.1102, + "step": 3229 + }, + { + "epoch": 0.07533373954467401, + "grad_norm": 1.4211676120758057, + "learning_rate": 1.6740088105726872e-05, + "loss": 1.1084, + "step": 3230 + }, + { + "epoch": 0.07535706268385194, + "grad_norm": 1.623024582862854, + "learning_rate": 1.6745270795542887e-05, + "loss": 1.2943, + "step": 3231 + }, + { + "epoch": 0.07538038582302985, + "grad_norm": 1.5547914505004883, + "learning_rate": 1.67504534853589e-05, + "loss": 1.2063, + "step": 3232 + }, + { + "epoch": 0.07540370896220777, + "grad_norm": 30.15711784362793, + "learning_rate": 1.6755636175174915e-05, + "loss": 1.5127, + "step": 3233 + }, + { + "epoch": 0.07542703210138568, + "grad_norm": 2.2802798748016357, + "learning_rate": 1.676081886499093e-05, + "loss": 1.5358, + "step": 3234 + }, + { + "epoch": 0.07545035524056361, + "grad_norm": 2.0905349254608154, + "learning_rate": 1.6766001554806944e-05, + "loss": 1.4242, + "step": 3235 + }, + { + "epoch": 0.07547367837974152, + "grad_norm": 1.9069223403930664, + "learning_rate": 1.677118424462296e-05, + "loss": 1.4346, + "step": 3236 + }, + { + "epoch": 0.07549700151891944, + "grad_norm": 1.7567555904388428, + "learning_rate": 1.6776366934438976e-05, + "loss": 1.5666, + "step": 3237 + }, + { + "epoch": 0.07552032465809735, + "grad_norm": 1.5660747289657593, + "learning_rate": 1.678154962425499e-05, + "loss": 1.5741, + "step": 3238 + }, + { + "epoch": 0.07554364779727528, + "grad_norm": 2.2778217792510986, + "learning_rate": 1.6786732314071005e-05, + "loss": 1.7733, + "step": 3239 + }, + { + "epoch": 0.07556697093645319, + "grad_norm": 1.894871473312378, + "learning_rate": 1.679191500388702e-05, + "loss": 1.4862, + "step": 3240 + }, + { + "epoch": 0.07559029407563111, + "grad_norm": 1.7588225603103638, + "learning_rate": 1.6797097693703034e-05, + "loss": 1.5237, + "step": 3241 + }, + { + "epoch": 0.07561361721480903, + "grad_norm": 1.6358667612075806, + "learning_rate": 1.6802280383519048e-05, + "loss": 1.6416, + "step": 3242 + }, + { + "epoch": 0.07563694035398695, + "grad_norm": 1.745793104171753, + "learning_rate": 1.6807463073335062e-05, + "loss": 1.5638, + "step": 3243 + }, + { + "epoch": 0.07566026349316486, + "grad_norm": 1.5481939315795898, + "learning_rate": 1.6812645763151077e-05, + "loss": 1.5219, + "step": 3244 + }, + { + "epoch": 0.07568358663234279, + "grad_norm": 1.800512433052063, + "learning_rate": 1.681782845296709e-05, + "loss": 1.4218, + "step": 3245 + }, + { + "epoch": 0.0757069097715207, + "grad_norm": 1.7766098976135254, + "learning_rate": 1.6823011142783105e-05, + "loss": 1.5556, + "step": 3246 + }, + { + "epoch": 0.07573023291069862, + "grad_norm": 1.8715254068374634, + "learning_rate": 1.682819383259912e-05, + "loss": 0.9959, + "step": 3247 + }, + { + "epoch": 0.07575355604987653, + "grad_norm": 1.6228104829788208, + "learning_rate": 1.6833376522415134e-05, + "loss": 1.3913, + "step": 3248 + }, + { + "epoch": 0.07577687918905444, + "grad_norm": 2.1543338298797607, + "learning_rate": 1.683855921223115e-05, + "loss": 1.6023, + "step": 3249 + }, + { + "epoch": 0.07580020232823237, + "grad_norm": 2.092386484146118, + "learning_rate": 1.6843741902047163e-05, + "loss": 1.3446, + "step": 3250 + }, + { + "epoch": 0.07582352546741028, + "grad_norm": 2.0272581577301025, + "learning_rate": 1.6848924591863177e-05, + "loss": 1.4331, + "step": 3251 + }, + { + "epoch": 0.0758468486065882, + "grad_norm": 2.3082563877105713, + "learning_rate": 1.685410728167919e-05, + "loss": 1.3317, + "step": 3252 + }, + { + "epoch": 0.07587017174576612, + "grad_norm": 1.919071912765503, + "learning_rate": 1.6859289971495206e-05, + "loss": 1.4169, + "step": 3253 + }, + { + "epoch": 0.07589349488494404, + "grad_norm": 2.4359421730041504, + "learning_rate": 1.6864472661311224e-05, + "loss": 1.4297, + "step": 3254 + }, + { + "epoch": 0.07591681802412195, + "grad_norm": 1.7315095663070679, + "learning_rate": 1.6869655351127238e-05, + "loss": 1.603, + "step": 3255 + }, + { + "epoch": 0.07594014116329988, + "grad_norm": 2.1361732482910156, + "learning_rate": 1.6874838040943252e-05, + "loss": 1.3271, + "step": 3256 + }, + { + "epoch": 0.07596346430247779, + "grad_norm": 1.3706244230270386, + "learning_rate": 1.6880020730759267e-05, + "loss": 1.2237, + "step": 3257 + }, + { + "epoch": 0.07598678744165571, + "grad_norm": 1.6399321556091309, + "learning_rate": 1.688520342057528e-05, + "loss": 1.3485, + "step": 3258 + }, + { + "epoch": 0.07601011058083362, + "grad_norm": 2.2018356323242188, + "learning_rate": 1.6890386110391295e-05, + "loss": 1.2108, + "step": 3259 + }, + { + "epoch": 0.07603343372001155, + "grad_norm": 2.161050796508789, + "learning_rate": 1.689556880020731e-05, + "loss": 1.1575, + "step": 3260 + }, + { + "epoch": 0.07605675685918946, + "grad_norm": 2.0206055641174316, + "learning_rate": 1.6900751490023324e-05, + "loss": 1.4697, + "step": 3261 + }, + { + "epoch": 0.07608007999836738, + "grad_norm": 1.917266845703125, + "learning_rate": 1.6905934179839338e-05, + "loss": 1.2532, + "step": 3262 + }, + { + "epoch": 0.0761034031375453, + "grad_norm": 1.5717905759811401, + "learning_rate": 1.6911116869655353e-05, + "loss": 1.4017, + "step": 3263 + }, + { + "epoch": 0.07612672627672322, + "grad_norm": 2.3592898845672607, + "learning_rate": 1.6916299559471367e-05, + "loss": 1.4454, + "step": 3264 + }, + { + "epoch": 0.07615004941590113, + "grad_norm": 1.5780028104782104, + "learning_rate": 1.692148224928738e-05, + "loss": 1.1047, + "step": 3265 + }, + { + "epoch": 0.07617337255507906, + "grad_norm": 1.644834280014038, + "learning_rate": 1.6926664939103396e-05, + "loss": 1.2353, + "step": 3266 + }, + { + "epoch": 0.07619669569425697, + "grad_norm": 2.0033533573150635, + "learning_rate": 1.693184762891941e-05, + "loss": 1.5045, + "step": 3267 + }, + { + "epoch": 0.07622001883343489, + "grad_norm": 2.138641357421875, + "learning_rate": 1.6937030318735424e-05, + "loss": 1.0741, + "step": 3268 + }, + { + "epoch": 0.0762433419726128, + "grad_norm": 2.192629814147949, + "learning_rate": 1.694221300855144e-05, + "loss": 1.3045, + "step": 3269 + }, + { + "epoch": 0.07626666511179073, + "grad_norm": 2.714752435684204, + "learning_rate": 1.6947395698367456e-05, + "loss": 1.7175, + "step": 3270 + }, + { + "epoch": 0.07628998825096864, + "grad_norm": 1.7902692556381226, + "learning_rate": 1.695257838818347e-05, + "loss": 1.3403, + "step": 3271 + }, + { + "epoch": 0.07631331139014656, + "grad_norm": 2.086465835571289, + "learning_rate": 1.6957761077999485e-05, + "loss": 1.2808, + "step": 3272 + }, + { + "epoch": 0.07633663452932447, + "grad_norm": 1.8977822065353394, + "learning_rate": 1.69629437678155e-05, + "loss": 1.4753, + "step": 3273 + }, + { + "epoch": 0.0763599576685024, + "grad_norm": 1.6644923686981201, + "learning_rate": 1.6968126457631514e-05, + "loss": 1.5319, + "step": 3274 + }, + { + "epoch": 0.07638328080768031, + "grad_norm": 1.8487331867218018, + "learning_rate": 1.6973309147447528e-05, + "loss": 1.8381, + "step": 3275 + }, + { + "epoch": 0.07640660394685823, + "grad_norm": 2.051358699798584, + "learning_rate": 1.6978491837263543e-05, + "loss": 1.6011, + "step": 3276 + }, + { + "epoch": 0.07642992708603615, + "grad_norm": 1.8193670511245728, + "learning_rate": 1.6983674527079557e-05, + "loss": 1.8248, + "step": 3277 + }, + { + "epoch": 0.07645325022521406, + "grad_norm": 1.618811011314392, + "learning_rate": 1.698885721689557e-05, + "loss": 1.3692, + "step": 3278 + }, + { + "epoch": 0.07647657336439198, + "grad_norm": 1.6162301301956177, + "learning_rate": 1.6994039906711586e-05, + "loss": 1.2837, + "step": 3279 + }, + { + "epoch": 0.07649989650356989, + "grad_norm": 1.7301234006881714, + "learning_rate": 1.69992225965276e-05, + "loss": 1.3661, + "step": 3280 + }, + { + "epoch": 0.07652321964274782, + "grad_norm": 2.038130760192871, + "learning_rate": 1.7004405286343614e-05, + "loss": 1.1453, + "step": 3281 + }, + { + "epoch": 0.07654654278192573, + "grad_norm": 2.0911741256713867, + "learning_rate": 1.700958797615963e-05, + "loss": 1.4767, + "step": 3282 + }, + { + "epoch": 0.07656986592110365, + "grad_norm": 1.517926573753357, + "learning_rate": 1.7014770665975643e-05, + "loss": 1.2982, + "step": 3283 + }, + { + "epoch": 0.07659318906028156, + "grad_norm": 1.6235179901123047, + "learning_rate": 1.7019953355791657e-05, + "loss": 1.5828, + "step": 3284 + }, + { + "epoch": 0.07661651219945949, + "grad_norm": 2.2295985221862793, + "learning_rate": 1.702513604560767e-05, + "loss": 1.407, + "step": 3285 + }, + { + "epoch": 0.0766398353386374, + "grad_norm": 1.3985662460327148, + "learning_rate": 1.7030318735423686e-05, + "loss": 1.7479, + "step": 3286 + }, + { + "epoch": 0.07666315847781532, + "grad_norm": 1.876427412033081, + "learning_rate": 1.70355014252397e-05, + "loss": 1.4209, + "step": 3287 + }, + { + "epoch": 0.07668648161699324, + "grad_norm": 1.9090443849563599, + "learning_rate": 1.7040684115055715e-05, + "loss": 1.7284, + "step": 3288 + }, + { + "epoch": 0.07670980475617116, + "grad_norm": 1.4606293439865112, + "learning_rate": 1.704586680487173e-05, + "loss": 1.3944, + "step": 3289 + }, + { + "epoch": 0.07673312789534907, + "grad_norm": 1.9733707904815674, + "learning_rate": 1.7051049494687743e-05, + "loss": 0.9537, + "step": 3290 + }, + { + "epoch": 0.076756451034527, + "grad_norm": 2.197521209716797, + "learning_rate": 1.7056232184503758e-05, + "loss": 1.5473, + "step": 3291 + }, + { + "epoch": 0.0767797741737049, + "grad_norm": 1.5921342372894287, + "learning_rate": 1.7061414874319772e-05, + "loss": 1.2972, + "step": 3292 + }, + { + "epoch": 0.07680309731288283, + "grad_norm": 1.6160104274749756, + "learning_rate": 1.7066597564135786e-05, + "loss": 1.2229, + "step": 3293 + }, + { + "epoch": 0.07682642045206074, + "grad_norm": 1.9350335597991943, + "learning_rate": 1.70717802539518e-05, + "loss": 1.7013, + "step": 3294 + }, + { + "epoch": 0.07684974359123867, + "grad_norm": 1.429025411605835, + "learning_rate": 1.7076962943767815e-05, + "loss": 1.2247, + "step": 3295 + }, + { + "epoch": 0.07687306673041658, + "grad_norm": 1.7935131788253784, + "learning_rate": 1.708214563358383e-05, + "loss": 1.53, + "step": 3296 + }, + { + "epoch": 0.0768963898695945, + "grad_norm": 2.194880247116089, + "learning_rate": 1.7087328323399844e-05, + "loss": 1.1259, + "step": 3297 + }, + { + "epoch": 0.07691971300877241, + "grad_norm": 2.297866106033325, + "learning_rate": 1.709251101321586e-05, + "loss": 1.5837, + "step": 3298 + }, + { + "epoch": 0.07694303614795034, + "grad_norm": 2.34386944770813, + "learning_rate": 1.7097693703031876e-05, + "loss": 1.6849, + "step": 3299 + }, + { + "epoch": 0.07696635928712825, + "grad_norm": 1.9773756265640259, + "learning_rate": 1.710287639284789e-05, + "loss": 1.5308, + "step": 3300 + }, + { + "epoch": 0.07698968242630617, + "grad_norm": 2.028454065322876, + "learning_rate": 1.7108059082663905e-05, + "loss": 1.4155, + "step": 3301 + }, + { + "epoch": 0.07701300556548409, + "grad_norm": 1.9162853956222534, + "learning_rate": 1.711324177247992e-05, + "loss": 1.5933, + "step": 3302 + }, + { + "epoch": 0.07703632870466201, + "grad_norm": 1.8800437450408936, + "learning_rate": 1.7118424462295933e-05, + "loss": 1.2576, + "step": 3303 + }, + { + "epoch": 0.07705965184383992, + "grad_norm": 1.9667344093322754, + "learning_rate": 1.7123607152111948e-05, + "loss": 1.7147, + "step": 3304 + }, + { + "epoch": 0.07708297498301783, + "grad_norm": 1.681564450263977, + "learning_rate": 1.7128789841927962e-05, + "loss": 1.6175, + "step": 3305 + }, + { + "epoch": 0.07710629812219576, + "grad_norm": 1.9889628887176514, + "learning_rate": 1.7133972531743976e-05, + "loss": 1.5713, + "step": 3306 + }, + { + "epoch": 0.07712962126137367, + "grad_norm": 1.895285725593567, + "learning_rate": 1.713915522155999e-05, + "loss": 1.5917, + "step": 3307 + }, + { + "epoch": 0.07715294440055159, + "grad_norm": 2.204185724258423, + "learning_rate": 1.7144337911376005e-05, + "loss": 1.4123, + "step": 3308 + }, + { + "epoch": 0.0771762675397295, + "grad_norm": 1.6684226989746094, + "learning_rate": 1.714952060119202e-05, + "loss": 1.4985, + "step": 3309 + }, + { + "epoch": 0.07719959067890743, + "grad_norm": 1.4817101955413818, + "learning_rate": 1.7154703291008034e-05, + "loss": 1.4513, + "step": 3310 + }, + { + "epoch": 0.07722291381808534, + "grad_norm": 1.655494213104248, + "learning_rate": 1.7159885980824048e-05, + "loss": 1.3627, + "step": 3311 + }, + { + "epoch": 0.07724623695726326, + "grad_norm": 1.5405232906341553, + "learning_rate": 1.7165068670640062e-05, + "loss": 1.6758, + "step": 3312 + }, + { + "epoch": 0.07726956009644118, + "grad_norm": 2.788048267364502, + "learning_rate": 1.7170251360456077e-05, + "loss": 1.2079, + "step": 3313 + }, + { + "epoch": 0.0772928832356191, + "grad_norm": 1.8536003828048706, + "learning_rate": 1.7175434050272094e-05, + "loss": 1.6091, + "step": 3314 + }, + { + "epoch": 0.07731620637479701, + "grad_norm": 1.742221713066101, + "learning_rate": 1.718061674008811e-05, + "loss": 1.441, + "step": 3315 + }, + { + "epoch": 0.07733952951397494, + "grad_norm": 1.849967122077942, + "learning_rate": 1.7185799429904123e-05, + "loss": 1.5461, + "step": 3316 + }, + { + "epoch": 0.07736285265315285, + "grad_norm": 2.1449637413024902, + "learning_rate": 1.7190982119720138e-05, + "loss": 1.1739, + "step": 3317 + }, + { + "epoch": 0.07738617579233077, + "grad_norm": 1.9928157329559326, + "learning_rate": 1.7196164809536152e-05, + "loss": 1.467, + "step": 3318 + }, + { + "epoch": 0.07740949893150868, + "grad_norm": 1.9001449346542358, + "learning_rate": 1.7201347499352166e-05, + "loss": 1.6135, + "step": 3319 + }, + { + "epoch": 0.07743282207068661, + "grad_norm": 1.5023950338363647, + "learning_rate": 1.720653018916818e-05, + "loss": 1.3905, + "step": 3320 + }, + { + "epoch": 0.07745614520986452, + "grad_norm": 2.290374755859375, + "learning_rate": 1.7211712878984195e-05, + "loss": 1.4503, + "step": 3321 + }, + { + "epoch": 0.07747946834904244, + "grad_norm": 1.5706013441085815, + "learning_rate": 1.721689556880021e-05, + "loss": 1.4569, + "step": 3322 + }, + { + "epoch": 0.07750279148822035, + "grad_norm": 2.7582640647888184, + "learning_rate": 1.7222078258616224e-05, + "loss": 1.5863, + "step": 3323 + }, + { + "epoch": 0.07752611462739828, + "grad_norm": 2.0270185470581055, + "learning_rate": 1.7227260948432238e-05, + "loss": 1.6337, + "step": 3324 + }, + { + "epoch": 0.07754943776657619, + "grad_norm": 1.835780143737793, + "learning_rate": 1.7232443638248252e-05, + "loss": 1.5795, + "step": 3325 + }, + { + "epoch": 0.07757276090575412, + "grad_norm": 1.7932450771331787, + "learning_rate": 1.7237626328064267e-05, + "loss": 1.5944, + "step": 3326 + }, + { + "epoch": 0.07759608404493203, + "grad_norm": 1.7863045930862427, + "learning_rate": 1.724280901788028e-05, + "loss": 1.579, + "step": 3327 + }, + { + "epoch": 0.07761940718410995, + "grad_norm": 1.7453893423080444, + "learning_rate": 1.7247991707696295e-05, + "loss": 1.4776, + "step": 3328 + }, + { + "epoch": 0.07764273032328786, + "grad_norm": 1.7224361896514893, + "learning_rate": 1.725317439751231e-05, + "loss": 1.1699, + "step": 3329 + }, + { + "epoch": 0.07766605346246579, + "grad_norm": 2.057270050048828, + "learning_rate": 1.7258357087328324e-05, + "loss": 1.4944, + "step": 3330 + }, + { + "epoch": 0.0776893766016437, + "grad_norm": 1.8803025484085083, + "learning_rate": 1.7263539777144342e-05, + "loss": 1.7737, + "step": 3331 + }, + { + "epoch": 0.07771269974082162, + "grad_norm": 1.7562426328659058, + "learning_rate": 1.7268722466960356e-05, + "loss": 1.6312, + "step": 3332 + }, + { + "epoch": 0.07773602287999953, + "grad_norm": 2.3190042972564697, + "learning_rate": 1.727390515677637e-05, + "loss": 1.7718, + "step": 3333 + }, + { + "epoch": 0.07775934601917744, + "grad_norm": 1.585045337677002, + "learning_rate": 1.7279087846592385e-05, + "loss": 1.1472, + "step": 3334 + }, + { + "epoch": 0.07778266915835537, + "grad_norm": 1.8282815217971802, + "learning_rate": 1.72842705364084e-05, + "loss": 1.8519, + "step": 3335 + }, + { + "epoch": 0.07780599229753328, + "grad_norm": 1.8750232458114624, + "learning_rate": 1.728945322622441e-05, + "loss": 1.7706, + "step": 3336 + }, + { + "epoch": 0.0778293154367112, + "grad_norm": 1.8886610269546509, + "learning_rate": 1.7294635916040424e-05, + "loss": 1.7408, + "step": 3337 + }, + { + "epoch": 0.07785263857588912, + "grad_norm": 1.4651943445205688, + "learning_rate": 1.729981860585644e-05, + "loss": 1.3464, + "step": 3338 + }, + { + "epoch": 0.07787596171506704, + "grad_norm": 3.5322890281677246, + "learning_rate": 1.7305001295672453e-05, + "loss": 1.4605, + "step": 3339 + }, + { + "epoch": 0.07789928485424495, + "grad_norm": 2.985595226287842, + "learning_rate": 1.7310183985488468e-05, + "loss": 1.1655, + "step": 3340 + }, + { + "epoch": 0.07792260799342288, + "grad_norm": 1.8383092880249023, + "learning_rate": 1.7315366675304485e-05, + "loss": 1.604, + "step": 3341 + }, + { + "epoch": 0.07794593113260079, + "grad_norm": 1.765061378479004, + "learning_rate": 1.73205493651205e-05, + "loss": 1.2413, + "step": 3342 + }, + { + "epoch": 0.07796925427177871, + "grad_norm": 2.067577362060547, + "learning_rate": 1.7325732054936514e-05, + "loss": 1.5542, + "step": 3343 + }, + { + "epoch": 0.07799257741095662, + "grad_norm": 1.8837525844573975, + "learning_rate": 1.7330914744752528e-05, + "loss": 1.5958, + "step": 3344 + }, + { + "epoch": 0.07801590055013455, + "grad_norm": 2.2309885025024414, + "learning_rate": 1.7336097434568543e-05, + "loss": 1.328, + "step": 3345 + }, + { + "epoch": 0.07803922368931246, + "grad_norm": 2.1510369777679443, + "learning_rate": 1.7341280124384557e-05, + "loss": 1.6049, + "step": 3346 + }, + { + "epoch": 0.07806254682849038, + "grad_norm": 2.2758655548095703, + "learning_rate": 1.734646281420057e-05, + "loss": 1.6463, + "step": 3347 + }, + { + "epoch": 0.0780858699676683, + "grad_norm": 1.5804762840270996, + "learning_rate": 1.7351645504016586e-05, + "loss": 1.3067, + "step": 3348 + }, + { + "epoch": 0.07810919310684622, + "grad_norm": 1.7285902500152588, + "learning_rate": 1.73568281938326e-05, + "loss": 1.1706, + "step": 3349 + }, + { + "epoch": 0.07813251624602413, + "grad_norm": 1.9170994758605957, + "learning_rate": 1.7362010883648614e-05, + "loss": 1.6914, + "step": 3350 + }, + { + "epoch": 0.07815583938520206, + "grad_norm": 1.8732631206512451, + "learning_rate": 1.736719357346463e-05, + "loss": 1.8266, + "step": 3351 + }, + { + "epoch": 0.07817916252437997, + "grad_norm": 2.3439199924468994, + "learning_rate": 1.7372376263280643e-05, + "loss": 1.5885, + "step": 3352 + }, + { + "epoch": 0.07820248566355789, + "grad_norm": 2.0440049171447754, + "learning_rate": 1.7377558953096657e-05, + "loss": 1.2152, + "step": 3353 + }, + { + "epoch": 0.0782258088027358, + "grad_norm": 1.5103175640106201, + "learning_rate": 1.7382741642912672e-05, + "loss": 1.2679, + "step": 3354 + }, + { + "epoch": 0.07824913194191373, + "grad_norm": 1.7911981344223022, + "learning_rate": 1.7387924332728686e-05, + "loss": 1.8322, + "step": 3355 + }, + { + "epoch": 0.07827245508109164, + "grad_norm": 1.7144136428833008, + "learning_rate": 1.73931070225447e-05, + "loss": 1.5417, + "step": 3356 + }, + { + "epoch": 0.07829577822026956, + "grad_norm": 1.7464350461959839, + "learning_rate": 1.7398289712360715e-05, + "loss": 1.6126, + "step": 3357 + }, + { + "epoch": 0.07831910135944747, + "grad_norm": 2.226100444793701, + "learning_rate": 1.7403472402176733e-05, + "loss": 1.7248, + "step": 3358 + }, + { + "epoch": 0.0783424244986254, + "grad_norm": 1.8423552513122559, + "learning_rate": 1.7408655091992747e-05, + "loss": 1.399, + "step": 3359 + }, + { + "epoch": 0.07836574763780331, + "grad_norm": 2.0295534133911133, + "learning_rate": 1.741383778180876e-05, + "loss": 1.4527, + "step": 3360 + }, + { + "epoch": 0.07838907077698123, + "grad_norm": 2.043365240097046, + "learning_rate": 1.7419020471624776e-05, + "loss": 1.2569, + "step": 3361 + }, + { + "epoch": 0.07841239391615915, + "grad_norm": 2.070237874984741, + "learning_rate": 1.742420316144079e-05, + "loss": 1.6359, + "step": 3362 + }, + { + "epoch": 0.07843571705533706, + "grad_norm": 1.8740978240966797, + "learning_rate": 1.7429385851256804e-05, + "loss": 1.4223, + "step": 3363 + }, + { + "epoch": 0.07845904019451498, + "grad_norm": 1.8014066219329834, + "learning_rate": 1.743456854107282e-05, + "loss": 1.6767, + "step": 3364 + }, + { + "epoch": 0.07848236333369289, + "grad_norm": 1.7680472135543823, + "learning_rate": 1.7439751230888833e-05, + "loss": 1.227, + "step": 3365 + }, + { + "epoch": 0.07850568647287082, + "grad_norm": 2.5365848541259766, + "learning_rate": 1.7444933920704847e-05, + "loss": 1.336, + "step": 3366 + }, + { + "epoch": 0.07852900961204873, + "grad_norm": 1.6996607780456543, + "learning_rate": 1.745011661052086e-05, + "loss": 1.3024, + "step": 3367 + }, + { + "epoch": 0.07855233275122665, + "grad_norm": 2.24592924118042, + "learning_rate": 1.7455299300336876e-05, + "loss": 1.565, + "step": 3368 + }, + { + "epoch": 0.07857565589040456, + "grad_norm": 1.5899155139923096, + "learning_rate": 1.746048199015289e-05, + "loss": 1.3918, + "step": 3369 + }, + { + "epoch": 0.07859897902958249, + "grad_norm": 1.5738404989242554, + "learning_rate": 1.7465664679968905e-05, + "loss": 1.5093, + "step": 3370 + }, + { + "epoch": 0.0786223021687604, + "grad_norm": 1.885108470916748, + "learning_rate": 1.747084736978492e-05, + "loss": 1.6744, + "step": 3371 + }, + { + "epoch": 0.07864562530793832, + "grad_norm": 1.7116379737854004, + "learning_rate": 1.7476030059600933e-05, + "loss": 1.329, + "step": 3372 + }, + { + "epoch": 0.07866894844711624, + "grad_norm": 2.3172607421875, + "learning_rate": 1.7481212749416948e-05, + "loss": 1.538, + "step": 3373 + }, + { + "epoch": 0.07869227158629416, + "grad_norm": 1.7801569700241089, + "learning_rate": 1.7486395439232965e-05, + "loss": 1.1738, + "step": 3374 + }, + { + "epoch": 0.07871559472547207, + "grad_norm": 1.6906585693359375, + "learning_rate": 1.749157812904898e-05, + "loss": 1.575, + "step": 3375 + }, + { + "epoch": 0.07873891786465, + "grad_norm": 2.6695775985717773, + "learning_rate": 1.7496760818864994e-05, + "loss": 1.5512, + "step": 3376 + }, + { + "epoch": 0.0787622410038279, + "grad_norm": 2.102708578109741, + "learning_rate": 1.750194350868101e-05, + "loss": 1.5507, + "step": 3377 + }, + { + "epoch": 0.07878556414300583, + "grad_norm": 1.8281930685043335, + "learning_rate": 1.7507126198497023e-05, + "loss": 1.538, + "step": 3378 + }, + { + "epoch": 0.07880888728218374, + "grad_norm": 1.6384347677230835, + "learning_rate": 1.7512308888313037e-05, + "loss": 1.2935, + "step": 3379 + }, + { + "epoch": 0.07883221042136167, + "grad_norm": 1.8894944190979004, + "learning_rate": 1.751749157812905e-05, + "loss": 1.2876, + "step": 3380 + }, + { + "epoch": 0.07885553356053958, + "grad_norm": 1.902371883392334, + "learning_rate": 1.7522674267945066e-05, + "loss": 1.476, + "step": 3381 + }, + { + "epoch": 0.0788788566997175, + "grad_norm": 1.6878814697265625, + "learning_rate": 1.752785695776108e-05, + "loss": 1.7224, + "step": 3382 + }, + { + "epoch": 0.07890217983889541, + "grad_norm": 1.608279824256897, + "learning_rate": 1.7533039647577095e-05, + "loss": 1.2821, + "step": 3383 + }, + { + "epoch": 0.07892550297807334, + "grad_norm": 1.9098149538040161, + "learning_rate": 1.753822233739311e-05, + "loss": 1.5575, + "step": 3384 + }, + { + "epoch": 0.07894882611725125, + "grad_norm": 2.8241662979125977, + "learning_rate": 1.7543405027209123e-05, + "loss": 1.3103, + "step": 3385 + }, + { + "epoch": 0.07897214925642917, + "grad_norm": 1.626420021057129, + "learning_rate": 1.7548587717025138e-05, + "loss": 1.4841, + "step": 3386 + }, + { + "epoch": 0.07899547239560709, + "grad_norm": 1.8856135606765747, + "learning_rate": 1.7553770406841152e-05, + "loss": 1.4818, + "step": 3387 + }, + { + "epoch": 0.07901879553478501, + "grad_norm": 1.5790698528289795, + "learning_rate": 1.7558953096657166e-05, + "loss": 1.4033, + "step": 3388 + }, + { + "epoch": 0.07904211867396292, + "grad_norm": 1.8913196325302124, + "learning_rate": 1.756413578647318e-05, + "loss": 1.6889, + "step": 3389 + }, + { + "epoch": 0.07906544181314085, + "grad_norm": 2.2722718715667725, + "learning_rate": 1.7569318476289195e-05, + "loss": 1.5644, + "step": 3390 + }, + { + "epoch": 0.07908876495231876, + "grad_norm": 2.0640485286712646, + "learning_rate": 1.757450116610521e-05, + "loss": 1.805, + "step": 3391 + }, + { + "epoch": 0.07911208809149667, + "grad_norm": 1.9282963275909424, + "learning_rate": 1.7579683855921224e-05, + "loss": 1.525, + "step": 3392 + }, + { + "epoch": 0.0791354112306746, + "grad_norm": 1.8700670003890991, + "learning_rate": 1.7584866545737238e-05, + "loss": 1.6906, + "step": 3393 + }, + { + "epoch": 0.0791587343698525, + "grad_norm": 1.6949808597564697, + "learning_rate": 1.7590049235553252e-05, + "loss": 1.4373, + "step": 3394 + }, + { + "epoch": 0.07918205750903043, + "grad_norm": 1.5579369068145752, + "learning_rate": 1.7595231925369267e-05, + "loss": 1.2127, + "step": 3395 + }, + { + "epoch": 0.07920538064820834, + "grad_norm": 1.8959749937057495, + "learning_rate": 1.760041461518528e-05, + "loss": 1.7395, + "step": 3396 + }, + { + "epoch": 0.07922870378738626, + "grad_norm": 1.2566922903060913, + "learning_rate": 1.7605597305001295e-05, + "loss": 1.2768, + "step": 3397 + }, + { + "epoch": 0.07925202692656418, + "grad_norm": 1.7211780548095703, + "learning_rate": 1.761077999481731e-05, + "loss": 1.6815, + "step": 3398 + }, + { + "epoch": 0.0792753500657421, + "grad_norm": 1.7850645780563354, + "learning_rate": 1.7615962684633324e-05, + "loss": 1.4944, + "step": 3399 + }, + { + "epoch": 0.07929867320492001, + "grad_norm": 1.666786789894104, + "learning_rate": 1.762114537444934e-05, + "loss": 1.5897, + "step": 3400 + }, + { + "epoch": 0.07932199634409794, + "grad_norm": 1.5484236478805542, + "learning_rate": 1.7626328064265353e-05, + "loss": 1.3968, + "step": 3401 + }, + { + "epoch": 0.07934531948327585, + "grad_norm": 1.8221678733825684, + "learning_rate": 1.763151075408137e-05, + "loss": 1.2536, + "step": 3402 + }, + { + "epoch": 0.07936864262245377, + "grad_norm": 1.67190420627594, + "learning_rate": 1.7636693443897385e-05, + "loss": 1.2301, + "step": 3403 + }, + { + "epoch": 0.07939196576163168, + "grad_norm": 1.5914485454559326, + "learning_rate": 1.76418761337134e-05, + "loss": 1.3554, + "step": 3404 + }, + { + "epoch": 0.07941528890080961, + "grad_norm": 1.7257155179977417, + "learning_rate": 1.7647058823529414e-05, + "loss": 1.3212, + "step": 3405 + }, + { + "epoch": 0.07943861203998752, + "grad_norm": 1.5732362270355225, + "learning_rate": 1.7652241513345428e-05, + "loss": 1.1798, + "step": 3406 + }, + { + "epoch": 0.07946193517916544, + "grad_norm": 1.9243180751800537, + "learning_rate": 1.7657424203161442e-05, + "loss": 1.3676, + "step": 3407 + }, + { + "epoch": 0.07948525831834335, + "grad_norm": 1.6544080972671509, + "learning_rate": 1.7662606892977457e-05, + "loss": 1.561, + "step": 3408 + }, + { + "epoch": 0.07950858145752128, + "grad_norm": 2.302537202835083, + "learning_rate": 1.766778958279347e-05, + "loss": 1.5584, + "step": 3409 + }, + { + "epoch": 0.07953190459669919, + "grad_norm": 1.5818437337875366, + "learning_rate": 1.7672972272609485e-05, + "loss": 1.4021, + "step": 3410 + }, + { + "epoch": 0.07955522773587712, + "grad_norm": 1.276625156402588, + "learning_rate": 1.76781549624255e-05, + "loss": 1.2437, + "step": 3411 + }, + { + "epoch": 0.07957855087505503, + "grad_norm": 1.9232205152511597, + "learning_rate": 1.7683337652241514e-05, + "loss": 1.3494, + "step": 3412 + }, + { + "epoch": 0.07960187401423295, + "grad_norm": 1.8550798892974854, + "learning_rate": 1.768852034205753e-05, + "loss": 1.6728, + "step": 3413 + }, + { + "epoch": 0.07962519715341086, + "grad_norm": 1.8926242589950562, + "learning_rate": 1.7693703031873543e-05, + "loss": 1.3786, + "step": 3414 + }, + { + "epoch": 0.07964852029258879, + "grad_norm": 1.7592382431030273, + "learning_rate": 1.7698885721689557e-05, + "loss": 1.3268, + "step": 3415 + }, + { + "epoch": 0.0796718434317667, + "grad_norm": 2.5954487323760986, + "learning_rate": 1.770406841150557e-05, + "loss": 1.791, + "step": 3416 + }, + { + "epoch": 0.07969516657094462, + "grad_norm": 2.1821093559265137, + "learning_rate": 1.7709251101321586e-05, + "loss": 1.7194, + "step": 3417 + }, + { + "epoch": 0.07971848971012253, + "grad_norm": 1.3791011571884155, + "learning_rate": 1.7714433791137604e-05, + "loss": 1.6048, + "step": 3418 + }, + { + "epoch": 0.07974181284930044, + "grad_norm": 1.2134246826171875, + "learning_rate": 1.7719616480953618e-05, + "loss": 1.0106, + "step": 3419 + }, + { + "epoch": 0.07976513598847837, + "grad_norm": 2.4315738677978516, + "learning_rate": 1.7724799170769632e-05, + "loss": 1.4947, + "step": 3420 + }, + { + "epoch": 0.07978845912765628, + "grad_norm": 1.6762149333953857, + "learning_rate": 1.7729981860585647e-05, + "loss": 1.29, + "step": 3421 + }, + { + "epoch": 0.0798117822668342, + "grad_norm": 2.2112157344818115, + "learning_rate": 1.773516455040166e-05, + "loss": 1.5833, + "step": 3422 + }, + { + "epoch": 0.07983510540601212, + "grad_norm": 2.0395634174346924, + "learning_rate": 1.7740347240217675e-05, + "loss": 1.4179, + "step": 3423 + }, + { + "epoch": 0.07985842854519004, + "grad_norm": 4.181065082550049, + "learning_rate": 1.774552993003369e-05, + "loss": 1.3738, + "step": 3424 + }, + { + "epoch": 0.07988175168436795, + "grad_norm": 1.4794611930847168, + "learning_rate": 1.7750712619849704e-05, + "loss": 1.5617, + "step": 3425 + }, + { + "epoch": 0.07990507482354588, + "grad_norm": 1.5973138809204102, + "learning_rate": 1.7755895309665718e-05, + "loss": 1.4281, + "step": 3426 + }, + { + "epoch": 0.07992839796272379, + "grad_norm": 2.451594591140747, + "learning_rate": 1.7761077999481733e-05, + "loss": 1.361, + "step": 3427 + }, + { + "epoch": 0.07995172110190171, + "grad_norm": 1.8390647172927856, + "learning_rate": 1.7766260689297747e-05, + "loss": 1.6535, + "step": 3428 + }, + { + "epoch": 0.07997504424107962, + "grad_norm": 2.335458755493164, + "learning_rate": 1.777144337911376e-05, + "loss": 1.5528, + "step": 3429 + }, + { + "epoch": 0.07999836738025755, + "grad_norm": 1.8207273483276367, + "learning_rate": 1.7776626068929776e-05, + "loss": 1.4269, + "step": 3430 + }, + { + "epoch": 0.08002169051943546, + "grad_norm": 1.643661379814148, + "learning_rate": 1.778180875874579e-05, + "loss": 1.4368, + "step": 3431 + }, + { + "epoch": 0.08004501365861338, + "grad_norm": 1.9254742860794067, + "learning_rate": 1.7786991448561804e-05, + "loss": 1.207, + "step": 3432 + }, + { + "epoch": 0.0800683367977913, + "grad_norm": 1.9777443408966064, + "learning_rate": 1.779217413837782e-05, + "loss": 1.5559, + "step": 3433 + }, + { + "epoch": 0.08009165993696922, + "grad_norm": 1.8416067361831665, + "learning_rate": 1.7797356828193833e-05, + "loss": 1.1761, + "step": 3434 + }, + { + "epoch": 0.08011498307614713, + "grad_norm": 1.6335779428482056, + "learning_rate": 1.780253951800985e-05, + "loss": 1.5841, + "step": 3435 + }, + { + "epoch": 0.08013830621532506, + "grad_norm": 1.7504732608795166, + "learning_rate": 1.7807722207825865e-05, + "loss": 1.0417, + "step": 3436 + }, + { + "epoch": 0.08016162935450297, + "grad_norm": 1.9436529874801636, + "learning_rate": 1.781290489764188e-05, + "loss": 1.6339, + "step": 3437 + }, + { + "epoch": 0.08018495249368089, + "grad_norm": 1.320713996887207, + "learning_rate": 1.7818087587457894e-05, + "loss": 1.1599, + "step": 3438 + }, + { + "epoch": 0.0802082756328588, + "grad_norm": 1.8651103973388672, + "learning_rate": 1.7823270277273908e-05, + "loss": 1.5779, + "step": 3439 + }, + { + "epoch": 0.08023159877203673, + "grad_norm": 2.5607051849365234, + "learning_rate": 1.7828452967089923e-05, + "loss": 1.0184, + "step": 3440 + }, + { + "epoch": 0.08025492191121464, + "grad_norm": 2.009211778640747, + "learning_rate": 1.7833635656905937e-05, + "loss": 1.2315, + "step": 3441 + }, + { + "epoch": 0.08027824505039256, + "grad_norm": 1.6827813386917114, + "learning_rate": 1.783881834672195e-05, + "loss": 1.5273, + "step": 3442 + }, + { + "epoch": 0.08030156818957047, + "grad_norm": 1.7755669355392456, + "learning_rate": 1.7844001036537966e-05, + "loss": 1.2431, + "step": 3443 + }, + { + "epoch": 0.0803248913287484, + "grad_norm": 1.8076794147491455, + "learning_rate": 1.784918372635398e-05, + "loss": 1.3305, + "step": 3444 + }, + { + "epoch": 0.08034821446792631, + "grad_norm": 1.6127822399139404, + "learning_rate": 1.7854366416169994e-05, + "loss": 1.6228, + "step": 3445 + }, + { + "epoch": 0.08037153760710423, + "grad_norm": 1.5132166147232056, + "learning_rate": 1.785954910598601e-05, + "loss": 1.1506, + "step": 3446 + }, + { + "epoch": 0.08039486074628215, + "grad_norm": 1.5347700119018555, + "learning_rate": 1.7864731795802023e-05, + "loss": 1.2827, + "step": 3447 + }, + { + "epoch": 0.08041818388546006, + "grad_norm": 1.5111736059188843, + "learning_rate": 1.7869914485618037e-05, + "loss": 1.3593, + "step": 3448 + }, + { + "epoch": 0.08044150702463798, + "grad_norm": 1.5753391981124878, + "learning_rate": 1.787509717543405e-05, + "loss": 1.4866, + "step": 3449 + }, + { + "epoch": 0.08046483016381589, + "grad_norm": 1.7266024351119995, + "learning_rate": 1.7880279865250066e-05, + "loss": 1.4463, + "step": 3450 + }, + { + "epoch": 0.08048815330299382, + "grad_norm": 2.0583856105804443, + "learning_rate": 1.788546255506608e-05, + "loss": 1.4841, + "step": 3451 + }, + { + "epoch": 0.08051147644217173, + "grad_norm": 1.7603833675384521, + "learning_rate": 1.7890645244882095e-05, + "loss": 1.6678, + "step": 3452 + }, + { + "epoch": 0.08053479958134965, + "grad_norm": 1.6743794679641724, + "learning_rate": 1.789582793469811e-05, + "loss": 1.0625, + "step": 3453 + }, + { + "epoch": 0.08055812272052756, + "grad_norm": 2.879643201828003, + "learning_rate": 1.7901010624514123e-05, + "loss": 1.4258, + "step": 3454 + }, + { + "epoch": 0.08058144585970549, + "grad_norm": 2.4089555740356445, + "learning_rate": 1.7906193314330138e-05, + "loss": 1.3974, + "step": 3455 + }, + { + "epoch": 0.0806047689988834, + "grad_norm": 2.077892780303955, + "learning_rate": 1.7911376004146152e-05, + "loss": 1.3107, + "step": 3456 + }, + { + "epoch": 0.08062809213806132, + "grad_norm": 2.7976572513580322, + "learning_rate": 1.7916558693962166e-05, + "loss": 1.1065, + "step": 3457 + }, + { + "epoch": 0.08065141527723924, + "grad_norm": 1.862610101699829, + "learning_rate": 1.792174138377818e-05, + "loss": 1.5416, + "step": 3458 + }, + { + "epoch": 0.08067473841641716, + "grad_norm": 1.8210248947143555, + "learning_rate": 1.7926924073594195e-05, + "loss": 1.7809, + "step": 3459 + }, + { + "epoch": 0.08069806155559507, + "grad_norm": 1.653671145439148, + "learning_rate": 1.793210676341021e-05, + "loss": 1.3246, + "step": 3460 + }, + { + "epoch": 0.080721384694773, + "grad_norm": 1.676255464553833, + "learning_rate": 1.7937289453226224e-05, + "loss": 1.3022, + "step": 3461 + }, + { + "epoch": 0.08074470783395091, + "grad_norm": 1.759718418121338, + "learning_rate": 1.794247214304224e-05, + "loss": 1.1314, + "step": 3462 + }, + { + "epoch": 0.08076803097312883, + "grad_norm": 1.7641233205795288, + "learning_rate": 1.7947654832858256e-05, + "loss": 1.4204, + "step": 3463 + }, + { + "epoch": 0.08079135411230674, + "grad_norm": 1.5273410081863403, + "learning_rate": 1.795283752267427e-05, + "loss": 1.2823, + "step": 3464 + }, + { + "epoch": 0.08081467725148467, + "grad_norm": 1.91709303855896, + "learning_rate": 1.7958020212490285e-05, + "loss": 1.692, + "step": 3465 + }, + { + "epoch": 0.08083800039066258, + "grad_norm": 1.7322471141815186, + "learning_rate": 1.79632029023063e-05, + "loss": 1.4564, + "step": 3466 + }, + { + "epoch": 0.0808613235298405, + "grad_norm": 1.8525621891021729, + "learning_rate": 1.7968385592122313e-05, + "loss": 1.3558, + "step": 3467 + }, + { + "epoch": 0.08088464666901841, + "grad_norm": 1.6870750188827515, + "learning_rate": 1.7973568281938328e-05, + "loss": 1.2403, + "step": 3468 + }, + { + "epoch": 0.08090796980819634, + "grad_norm": 1.80242919921875, + "learning_rate": 1.7978750971754342e-05, + "loss": 1.4841, + "step": 3469 + }, + { + "epoch": 0.08093129294737425, + "grad_norm": 1.543397307395935, + "learning_rate": 1.7983933661570356e-05, + "loss": 1.474, + "step": 3470 + }, + { + "epoch": 0.08095461608655218, + "grad_norm": 1.908703327178955, + "learning_rate": 1.798911635138637e-05, + "loss": 1.3324, + "step": 3471 + }, + { + "epoch": 0.08097793922573009, + "grad_norm": 1.846138596534729, + "learning_rate": 1.7994299041202385e-05, + "loss": 1.4899, + "step": 3472 + }, + { + "epoch": 0.08100126236490801, + "grad_norm": 1.8396351337432861, + "learning_rate": 1.79994817310184e-05, + "loss": 1.4697, + "step": 3473 + }, + { + "epoch": 0.08102458550408592, + "grad_norm": 1.653918981552124, + "learning_rate": 1.8004664420834414e-05, + "loss": 1.6469, + "step": 3474 + }, + { + "epoch": 0.08104790864326385, + "grad_norm": 2.1722609996795654, + "learning_rate": 1.8009847110650428e-05, + "loss": 1.4355, + "step": 3475 + }, + { + "epoch": 0.08107123178244176, + "grad_norm": 2.0860023498535156, + "learning_rate": 1.8015029800466442e-05, + "loss": 1.7017, + "step": 3476 + }, + { + "epoch": 0.08109455492161967, + "grad_norm": 1.9186869859695435, + "learning_rate": 1.8020212490282457e-05, + "loss": 1.289, + "step": 3477 + }, + { + "epoch": 0.0811178780607976, + "grad_norm": 1.6853832006454468, + "learning_rate": 1.802539518009847e-05, + "loss": 1.4602, + "step": 3478 + }, + { + "epoch": 0.0811412011999755, + "grad_norm": 2.2143218517303467, + "learning_rate": 1.803057786991449e-05, + "loss": 1.3986, + "step": 3479 + }, + { + "epoch": 0.08116452433915343, + "grad_norm": 1.786441683769226, + "learning_rate": 1.8035760559730503e-05, + "loss": 1.5675, + "step": 3480 + }, + { + "epoch": 0.08118784747833134, + "grad_norm": 1.891284704208374, + "learning_rate": 1.8040943249546518e-05, + "loss": 1.537, + "step": 3481 + }, + { + "epoch": 0.08121117061750927, + "grad_norm": 1.7284735441207886, + "learning_rate": 1.8046125939362532e-05, + "loss": 1.6959, + "step": 3482 + }, + { + "epoch": 0.08123449375668718, + "grad_norm": 2.38594388961792, + "learning_rate": 1.8051308629178546e-05, + "loss": 1.2144, + "step": 3483 + }, + { + "epoch": 0.0812578168958651, + "grad_norm": 1.965506911277771, + "learning_rate": 1.805649131899456e-05, + "loss": 1.6692, + "step": 3484 + }, + { + "epoch": 0.08128114003504301, + "grad_norm": 2.0930988788604736, + "learning_rate": 1.8061674008810575e-05, + "loss": 1.0515, + "step": 3485 + }, + { + "epoch": 0.08130446317422094, + "grad_norm": 1.7666972875595093, + "learning_rate": 1.806685669862659e-05, + "loss": 1.2833, + "step": 3486 + }, + { + "epoch": 0.08132778631339885, + "grad_norm": 1.7547857761383057, + "learning_rate": 1.8072039388442604e-05, + "loss": 1.5865, + "step": 3487 + }, + { + "epoch": 0.08135110945257677, + "grad_norm": 1.726447343826294, + "learning_rate": 1.8077222078258618e-05, + "loss": 1.1413, + "step": 3488 + }, + { + "epoch": 0.08137443259175468, + "grad_norm": 1.6850913763046265, + "learning_rate": 1.8082404768074632e-05, + "loss": 1.6147, + "step": 3489 + }, + { + "epoch": 0.08139775573093261, + "grad_norm": 1.7971402406692505, + "learning_rate": 1.8087587457890647e-05, + "loss": 1.1584, + "step": 3490 + }, + { + "epoch": 0.08142107887011052, + "grad_norm": 1.5679081678390503, + "learning_rate": 1.809277014770666e-05, + "loss": 1.0253, + "step": 3491 + }, + { + "epoch": 0.08144440200928844, + "grad_norm": 1.7024198770523071, + "learning_rate": 1.8097952837522675e-05, + "loss": 1.367, + "step": 3492 + }, + { + "epoch": 0.08146772514846636, + "grad_norm": 1.712080955505371, + "learning_rate": 1.810313552733869e-05, + "loss": 1.5415, + "step": 3493 + }, + { + "epoch": 0.08149104828764428, + "grad_norm": 1.7190017700195312, + "learning_rate": 1.8108318217154704e-05, + "loss": 1.6035, + "step": 3494 + }, + { + "epoch": 0.08151437142682219, + "grad_norm": 1.6702404022216797, + "learning_rate": 1.8113500906970722e-05, + "loss": 1.4485, + "step": 3495 + }, + { + "epoch": 0.08153769456600012, + "grad_norm": 1.7290270328521729, + "learning_rate": 1.8118683596786736e-05, + "loss": 1.3473, + "step": 3496 + }, + { + "epoch": 0.08156101770517803, + "grad_norm": 1.8638476133346558, + "learning_rate": 1.812386628660275e-05, + "loss": 1.1051, + "step": 3497 + }, + { + "epoch": 0.08158434084435595, + "grad_norm": 1.6890590190887451, + "learning_rate": 1.8129048976418765e-05, + "loss": 1.207, + "step": 3498 + }, + { + "epoch": 0.08160766398353386, + "grad_norm": 2.0224850177764893, + "learning_rate": 1.813423166623478e-05, + "loss": 1.7992, + "step": 3499 + }, + { + "epoch": 0.08163098712271179, + "grad_norm": 2.054816246032715, + "learning_rate": 1.8139414356050794e-05, + "loss": 1.4456, + "step": 3500 + }, + { + "epoch": 0.0816543102618897, + "grad_norm": 1.79837965965271, + "learning_rate": 1.8144597045866804e-05, + "loss": 1.6059, + "step": 3501 + }, + { + "epoch": 0.08167763340106762, + "grad_norm": 1.9790756702423096, + "learning_rate": 1.814977973568282e-05, + "loss": 1.3804, + "step": 3502 + }, + { + "epoch": 0.08170095654024553, + "grad_norm": 1.8916029930114746, + "learning_rate": 1.8154962425498833e-05, + "loss": 1.4934, + "step": 3503 + }, + { + "epoch": 0.08172427967942346, + "grad_norm": 2.5700888633728027, + "learning_rate": 1.8160145115314848e-05, + "loss": 1.14, + "step": 3504 + }, + { + "epoch": 0.08174760281860137, + "grad_norm": 1.945611834526062, + "learning_rate": 1.8165327805130862e-05, + "loss": 1.4387, + "step": 3505 + }, + { + "epoch": 0.08177092595777928, + "grad_norm": 1.7046513557434082, + "learning_rate": 1.817051049494688e-05, + "loss": 1.1947, + "step": 3506 + }, + { + "epoch": 0.0817942490969572, + "grad_norm": 1.9995052814483643, + "learning_rate": 1.8175693184762894e-05, + "loss": 1.2849, + "step": 3507 + }, + { + "epoch": 0.08181757223613512, + "grad_norm": 1.865929126739502, + "learning_rate": 1.8180875874578908e-05, + "loss": 1.6354, + "step": 3508 + }, + { + "epoch": 0.08184089537531304, + "grad_norm": 2.0877327919006348, + "learning_rate": 1.8186058564394923e-05, + "loss": 1.5864, + "step": 3509 + }, + { + "epoch": 0.08186421851449095, + "grad_norm": 1.8026949167251587, + "learning_rate": 1.8191241254210937e-05, + "loss": 1.3427, + "step": 3510 + }, + { + "epoch": 0.08188754165366888, + "grad_norm": 1.9686529636383057, + "learning_rate": 1.819642394402695e-05, + "loss": 0.8916, + "step": 3511 + }, + { + "epoch": 0.08191086479284679, + "grad_norm": 1.882365345954895, + "learning_rate": 1.8201606633842966e-05, + "loss": 1.6041, + "step": 3512 + }, + { + "epoch": 0.08193418793202471, + "grad_norm": 2.731876850128174, + "learning_rate": 1.820678932365898e-05, + "loss": 1.9984, + "step": 3513 + }, + { + "epoch": 0.08195751107120262, + "grad_norm": 2.3877131938934326, + "learning_rate": 1.8211972013474994e-05, + "loss": 1.5732, + "step": 3514 + }, + { + "epoch": 0.08198083421038055, + "grad_norm": 1.7990912199020386, + "learning_rate": 1.821715470329101e-05, + "loss": 1.2355, + "step": 3515 + }, + { + "epoch": 0.08200415734955846, + "grad_norm": 1.7821038961410522, + "learning_rate": 1.8222337393107023e-05, + "loss": 1.5518, + "step": 3516 + }, + { + "epoch": 0.08202748048873638, + "grad_norm": 1.5398446321487427, + "learning_rate": 1.8227520082923037e-05, + "loss": 1.536, + "step": 3517 + }, + { + "epoch": 0.0820508036279143, + "grad_norm": 1.8921490907669067, + "learning_rate": 1.8232702772739052e-05, + "loss": 1.2674, + "step": 3518 + }, + { + "epoch": 0.08207412676709222, + "grad_norm": 2.987426519393921, + "learning_rate": 1.8237885462555066e-05, + "loss": 1.358, + "step": 3519 + }, + { + "epoch": 0.08209744990627013, + "grad_norm": 2.238374710083008, + "learning_rate": 1.824306815237108e-05, + "loss": 1.5369, + "step": 3520 + }, + { + "epoch": 0.08212077304544806, + "grad_norm": 3.6904711723327637, + "learning_rate": 1.8248250842187095e-05, + "loss": 1.4271, + "step": 3521 + }, + { + "epoch": 0.08214409618462597, + "grad_norm": 1.9176346063613892, + "learning_rate": 1.8253433532003113e-05, + "loss": 1.3368, + "step": 3522 + }, + { + "epoch": 0.08216741932380389, + "grad_norm": 2.0872647762298584, + "learning_rate": 1.8258616221819127e-05, + "loss": 1.4909, + "step": 3523 + }, + { + "epoch": 0.0821907424629818, + "grad_norm": 1.2928736209869385, + "learning_rate": 1.826379891163514e-05, + "loss": 1.1043, + "step": 3524 + }, + { + "epoch": 0.08221406560215973, + "grad_norm": 1.8889038562774658, + "learning_rate": 1.8268981601451156e-05, + "loss": 1.4896, + "step": 3525 + }, + { + "epoch": 0.08223738874133764, + "grad_norm": 1.6184550523757935, + "learning_rate": 1.827416429126717e-05, + "loss": 1.1775, + "step": 3526 + }, + { + "epoch": 0.08226071188051556, + "grad_norm": 1.6747300624847412, + "learning_rate": 1.8279346981083184e-05, + "loss": 1.5762, + "step": 3527 + }, + { + "epoch": 0.08228403501969347, + "grad_norm": 1.707043170928955, + "learning_rate": 1.82845296708992e-05, + "loss": 1.5885, + "step": 3528 + }, + { + "epoch": 0.0823073581588714, + "grad_norm": 1.6010284423828125, + "learning_rate": 1.8289712360715213e-05, + "loss": 1.5545, + "step": 3529 + }, + { + "epoch": 0.08233068129804931, + "grad_norm": 1.565683364868164, + "learning_rate": 1.8294895050531227e-05, + "loss": 1.5154, + "step": 3530 + }, + { + "epoch": 0.08235400443722724, + "grad_norm": 1.707773208618164, + "learning_rate": 1.830007774034724e-05, + "loss": 1.1917, + "step": 3531 + }, + { + "epoch": 0.08237732757640515, + "grad_norm": 1.8261590003967285, + "learning_rate": 1.8305260430163256e-05, + "loss": 1.4657, + "step": 3532 + }, + { + "epoch": 0.08240065071558306, + "grad_norm": 1.7365014553070068, + "learning_rate": 1.831044311997927e-05, + "loss": 1.4251, + "step": 3533 + }, + { + "epoch": 0.08242397385476098, + "grad_norm": 2.4612667560577393, + "learning_rate": 1.8315625809795285e-05, + "loss": 1.2089, + "step": 3534 + }, + { + "epoch": 0.08244729699393889, + "grad_norm": 1.5900131464004517, + "learning_rate": 1.83208084996113e-05, + "loss": 1.6323, + "step": 3535 + }, + { + "epoch": 0.08247062013311682, + "grad_norm": 2.6279211044311523, + "learning_rate": 1.8325991189427313e-05, + "loss": 2.0529, + "step": 3536 + }, + { + "epoch": 0.08249394327229473, + "grad_norm": 1.79181969165802, + "learning_rate": 1.8331173879243328e-05, + "loss": 1.4461, + "step": 3537 + }, + { + "epoch": 0.08251726641147265, + "grad_norm": 1.5957787036895752, + "learning_rate": 1.8336356569059342e-05, + "loss": 1.2476, + "step": 3538 + }, + { + "epoch": 0.08254058955065056, + "grad_norm": 1.5493981838226318, + "learning_rate": 1.834153925887536e-05, + "loss": 1.4666, + "step": 3539 + }, + { + "epoch": 0.08256391268982849, + "grad_norm": 1.5983459949493408, + "learning_rate": 1.8346721948691374e-05, + "loss": 1.1337, + "step": 3540 + }, + { + "epoch": 0.0825872358290064, + "grad_norm": 1.900614857673645, + "learning_rate": 1.835190463850739e-05, + "loss": 1.2832, + "step": 3541 + }, + { + "epoch": 0.08261055896818432, + "grad_norm": 1.7099312543869019, + "learning_rate": 1.8357087328323403e-05, + "loss": 1.5274, + "step": 3542 + }, + { + "epoch": 0.08263388210736224, + "grad_norm": 1.7623786926269531, + "learning_rate": 1.8362270018139417e-05, + "loss": 1.5002, + "step": 3543 + }, + { + "epoch": 0.08265720524654016, + "grad_norm": 1.6291403770446777, + "learning_rate": 1.836745270795543e-05, + "loss": 1.4322, + "step": 3544 + }, + { + "epoch": 0.08268052838571807, + "grad_norm": 1.7882250547409058, + "learning_rate": 1.8372635397771446e-05, + "loss": 1.5406, + "step": 3545 + }, + { + "epoch": 0.082703851524896, + "grad_norm": 1.33836829662323, + "learning_rate": 1.837781808758746e-05, + "loss": 1.0274, + "step": 3546 + }, + { + "epoch": 0.08272717466407391, + "grad_norm": 2.4266624450683594, + "learning_rate": 1.8383000777403475e-05, + "loss": 1.4865, + "step": 3547 + }, + { + "epoch": 0.08275049780325183, + "grad_norm": 1.9168795347213745, + "learning_rate": 1.838818346721949e-05, + "loss": 1.3113, + "step": 3548 + }, + { + "epoch": 0.08277382094242974, + "grad_norm": 1.602070689201355, + "learning_rate": 1.8393366157035503e-05, + "loss": 1.3721, + "step": 3549 + }, + { + "epoch": 0.08279714408160767, + "grad_norm": 2.380582571029663, + "learning_rate": 1.8398548846851518e-05, + "loss": 1.3196, + "step": 3550 + }, + { + "epoch": 0.08282046722078558, + "grad_norm": 1.694897174835205, + "learning_rate": 1.8403731536667532e-05, + "loss": 1.3993, + "step": 3551 + }, + { + "epoch": 0.0828437903599635, + "grad_norm": 2.2976338863372803, + "learning_rate": 1.8408914226483546e-05, + "loss": 1.5756, + "step": 3552 + }, + { + "epoch": 0.08286711349914141, + "grad_norm": 2.3146839141845703, + "learning_rate": 1.841409691629956e-05, + "loss": 1.2754, + "step": 3553 + }, + { + "epoch": 0.08289043663831934, + "grad_norm": 1.9801183938980103, + "learning_rate": 1.8419279606115575e-05, + "loss": 1.2338, + "step": 3554 + }, + { + "epoch": 0.08291375977749725, + "grad_norm": 2.159083843231201, + "learning_rate": 1.8424462295931593e-05, + "loss": 1.623, + "step": 3555 + }, + { + "epoch": 0.08293708291667518, + "grad_norm": 1.8101762533187866, + "learning_rate": 1.8429644985747604e-05, + "loss": 1.7546, + "step": 3556 + }, + { + "epoch": 0.08296040605585309, + "grad_norm": 2.1166656017303467, + "learning_rate": 1.8434827675563618e-05, + "loss": 1.472, + "step": 3557 + }, + { + "epoch": 0.08298372919503101, + "grad_norm": 2.1947038173675537, + "learning_rate": 1.8440010365379632e-05, + "loss": 1.3757, + "step": 3558 + }, + { + "epoch": 0.08300705233420892, + "grad_norm": 1.8868519067764282, + "learning_rate": 1.8445193055195647e-05, + "loss": 1.317, + "step": 3559 + }, + { + "epoch": 0.08303037547338685, + "grad_norm": 2.1068966388702393, + "learning_rate": 1.845037574501166e-05, + "loss": 1.6346, + "step": 3560 + }, + { + "epoch": 0.08305369861256476, + "grad_norm": 1.756743311882019, + "learning_rate": 1.8455558434827675e-05, + "loss": 1.3374, + "step": 3561 + }, + { + "epoch": 0.08307702175174267, + "grad_norm": 1.6982694864273071, + "learning_rate": 1.846074112464369e-05, + "loss": 1.2253, + "step": 3562 + }, + { + "epoch": 0.0831003448909206, + "grad_norm": 1.6374770402908325, + "learning_rate": 1.8465923814459704e-05, + "loss": 1.8107, + "step": 3563 + }, + { + "epoch": 0.0831236680300985, + "grad_norm": 1.6520603895187378, + "learning_rate": 1.847110650427572e-05, + "loss": 1.3199, + "step": 3564 + }, + { + "epoch": 0.08314699116927643, + "grad_norm": 1.5398011207580566, + "learning_rate": 1.8476289194091733e-05, + "loss": 1.5716, + "step": 3565 + }, + { + "epoch": 0.08317031430845434, + "grad_norm": 1.6022202968597412, + "learning_rate": 1.848147188390775e-05, + "loss": 1.4777, + "step": 3566 + }, + { + "epoch": 0.08319363744763227, + "grad_norm": 2.1725592613220215, + "learning_rate": 1.8486654573723765e-05, + "loss": 1.4681, + "step": 3567 + }, + { + "epoch": 0.08321696058681018, + "grad_norm": 1.59706449508667, + "learning_rate": 1.849183726353978e-05, + "loss": 1.2141, + "step": 3568 + }, + { + "epoch": 0.0832402837259881, + "grad_norm": 1.6120070219039917, + "learning_rate": 1.8497019953355794e-05, + "loss": 1.4908, + "step": 3569 + }, + { + "epoch": 0.08326360686516601, + "grad_norm": 1.4892857074737549, + "learning_rate": 1.8502202643171808e-05, + "loss": 1.6314, + "step": 3570 + }, + { + "epoch": 0.08328693000434394, + "grad_norm": 2.104597806930542, + "learning_rate": 1.8507385332987822e-05, + "loss": 1.4717, + "step": 3571 + }, + { + "epoch": 0.08331025314352185, + "grad_norm": 1.6070716381072998, + "learning_rate": 1.8512568022803837e-05, + "loss": 1.5624, + "step": 3572 + }, + { + "epoch": 0.08333357628269977, + "grad_norm": 1.9566452503204346, + "learning_rate": 1.851775071261985e-05, + "loss": 1.5337, + "step": 3573 + }, + { + "epoch": 0.08335689942187768, + "grad_norm": 1.8620479106903076, + "learning_rate": 1.8522933402435865e-05, + "loss": 1.6287, + "step": 3574 + }, + { + "epoch": 0.08338022256105561, + "grad_norm": 3.037355661392212, + "learning_rate": 1.852811609225188e-05, + "loss": 1.0238, + "step": 3575 + }, + { + "epoch": 0.08340354570023352, + "grad_norm": 1.7178518772125244, + "learning_rate": 1.8533298782067894e-05, + "loss": 1.5673, + "step": 3576 + }, + { + "epoch": 0.08342686883941144, + "grad_norm": 1.6301993131637573, + "learning_rate": 1.853848147188391e-05, + "loss": 1.458, + "step": 3577 + }, + { + "epoch": 0.08345019197858936, + "grad_norm": 1.7615132331848145, + "learning_rate": 1.8543664161699923e-05, + "loss": 1.7595, + "step": 3578 + }, + { + "epoch": 0.08347351511776728, + "grad_norm": 4.415511131286621, + "learning_rate": 1.8548846851515937e-05, + "loss": 1.3477, + "step": 3579 + }, + { + "epoch": 0.08349683825694519, + "grad_norm": 1.829351782798767, + "learning_rate": 1.855402954133195e-05, + "loss": 1.6647, + "step": 3580 + }, + { + "epoch": 0.08352016139612312, + "grad_norm": 1.7743425369262695, + "learning_rate": 1.8559212231147966e-05, + "loss": 1.8199, + "step": 3581 + }, + { + "epoch": 0.08354348453530103, + "grad_norm": 1.833815097808838, + "learning_rate": 1.856439492096398e-05, + "loss": 1.3673, + "step": 3582 + }, + { + "epoch": 0.08356680767447895, + "grad_norm": 1.5730009078979492, + "learning_rate": 1.8569577610779998e-05, + "loss": 1.2953, + "step": 3583 + }, + { + "epoch": 0.08359013081365686, + "grad_norm": 1.7415244579315186, + "learning_rate": 1.8574760300596012e-05, + "loss": 1.4384, + "step": 3584 + }, + { + "epoch": 0.08361345395283479, + "grad_norm": 1.6725150346755981, + "learning_rate": 1.8579942990412027e-05, + "loss": 1.2592, + "step": 3585 + }, + { + "epoch": 0.0836367770920127, + "grad_norm": 1.7817398309707642, + "learning_rate": 1.858512568022804e-05, + "loss": 1.3653, + "step": 3586 + }, + { + "epoch": 0.08366010023119062, + "grad_norm": 1.7642518281936646, + "learning_rate": 1.8590308370044055e-05, + "loss": 1.2734, + "step": 3587 + }, + { + "epoch": 0.08368342337036853, + "grad_norm": 1.9906455278396606, + "learning_rate": 1.859549105986007e-05, + "loss": 1.4007, + "step": 3588 + }, + { + "epoch": 0.08370674650954646, + "grad_norm": 1.8507903814315796, + "learning_rate": 1.8600673749676084e-05, + "loss": 1.2213, + "step": 3589 + }, + { + "epoch": 0.08373006964872437, + "grad_norm": 1.7837014198303223, + "learning_rate": 1.8605856439492098e-05, + "loss": 1.5054, + "step": 3590 + }, + { + "epoch": 0.08375339278790228, + "grad_norm": 1.8760827779769897, + "learning_rate": 1.8611039129308113e-05, + "loss": 1.4422, + "step": 3591 + }, + { + "epoch": 0.0837767159270802, + "grad_norm": 2.278310775756836, + "learning_rate": 1.8616221819124127e-05, + "loss": 1.4177, + "step": 3592 + }, + { + "epoch": 0.08380003906625812, + "grad_norm": 1.8723113536834717, + "learning_rate": 1.862140450894014e-05, + "loss": 1.3213, + "step": 3593 + }, + { + "epoch": 0.08382336220543604, + "grad_norm": 1.8388547897338867, + "learning_rate": 1.8626587198756156e-05, + "loss": 1.4389, + "step": 3594 + }, + { + "epoch": 0.08384668534461395, + "grad_norm": 2.0076687335968018, + "learning_rate": 1.863176988857217e-05, + "loss": 1.7453, + "step": 3595 + }, + { + "epoch": 0.08387000848379188, + "grad_norm": 1.573243498802185, + "learning_rate": 1.8636952578388184e-05, + "loss": 1.3174, + "step": 3596 + }, + { + "epoch": 0.08389333162296979, + "grad_norm": 1.5967497825622559, + "learning_rate": 1.86421352682042e-05, + "loss": 1.2045, + "step": 3597 + }, + { + "epoch": 0.08391665476214771, + "grad_norm": 2.108859062194824, + "learning_rate": 1.8647317958020213e-05, + "loss": 1.3171, + "step": 3598 + }, + { + "epoch": 0.08393997790132562, + "grad_norm": 1.9356852769851685, + "learning_rate": 1.865250064783623e-05, + "loss": 1.3646, + "step": 3599 + }, + { + "epoch": 0.08396330104050355, + "grad_norm": 2.345694065093994, + "learning_rate": 1.8657683337652245e-05, + "loss": 1.3063, + "step": 3600 + }, + { + "epoch": 0.08398662417968146, + "grad_norm": 1.3503670692443848, + "learning_rate": 1.866286602746826e-05, + "loss": 1.2187, + "step": 3601 + }, + { + "epoch": 0.08400994731885938, + "grad_norm": 1.4740415811538696, + "learning_rate": 1.8668048717284274e-05, + "loss": 1.4042, + "step": 3602 + }, + { + "epoch": 0.0840332704580373, + "grad_norm": 1.6068682670593262, + "learning_rate": 1.8673231407100288e-05, + "loss": 1.6119, + "step": 3603 + }, + { + "epoch": 0.08405659359721522, + "grad_norm": 1.9428056478500366, + "learning_rate": 1.8678414096916303e-05, + "loss": 1.2609, + "step": 3604 + }, + { + "epoch": 0.08407991673639313, + "grad_norm": 1.8283860683441162, + "learning_rate": 1.8683596786732317e-05, + "loss": 1.5166, + "step": 3605 + }, + { + "epoch": 0.08410323987557106, + "grad_norm": 1.4029210805892944, + "learning_rate": 1.868877947654833e-05, + "loss": 1.2865, + "step": 3606 + }, + { + "epoch": 0.08412656301474897, + "grad_norm": 1.2441627979278564, + "learning_rate": 1.8693962166364346e-05, + "loss": 1.1379, + "step": 3607 + }, + { + "epoch": 0.08414988615392689, + "grad_norm": 1.568545937538147, + "learning_rate": 1.869914485618036e-05, + "loss": 1.3603, + "step": 3608 + }, + { + "epoch": 0.0841732092931048, + "grad_norm": 1.4704279899597168, + "learning_rate": 1.8704327545996374e-05, + "loss": 1.3639, + "step": 3609 + }, + { + "epoch": 0.08419653243228273, + "grad_norm": 1.745092511177063, + "learning_rate": 1.870951023581239e-05, + "loss": 1.0921, + "step": 3610 + }, + { + "epoch": 0.08421985557146064, + "grad_norm": 1.942427396774292, + "learning_rate": 1.8714692925628403e-05, + "loss": 1.5507, + "step": 3611 + }, + { + "epoch": 0.08424317871063856, + "grad_norm": 1.6115508079528809, + "learning_rate": 1.8719875615444417e-05, + "loss": 1.4687, + "step": 3612 + }, + { + "epoch": 0.08426650184981647, + "grad_norm": 2.0204665660858154, + "learning_rate": 1.872505830526043e-05, + "loss": 1.5153, + "step": 3613 + }, + { + "epoch": 0.0842898249889944, + "grad_norm": 2.009833574295044, + "learning_rate": 1.8730240995076446e-05, + "loss": 1.4983, + "step": 3614 + }, + { + "epoch": 0.08431314812817231, + "grad_norm": 1.68868088722229, + "learning_rate": 1.873542368489246e-05, + "loss": 1.3068, + "step": 3615 + }, + { + "epoch": 0.08433647126735024, + "grad_norm": 1.6193383932113647, + "learning_rate": 1.8740606374708475e-05, + "loss": 1.2426, + "step": 3616 + }, + { + "epoch": 0.08435979440652815, + "grad_norm": 2.242953062057495, + "learning_rate": 1.874578906452449e-05, + "loss": 1.4347, + "step": 3617 + }, + { + "epoch": 0.08438311754570607, + "grad_norm": 1.6163315773010254, + "learning_rate": 1.8750971754340503e-05, + "loss": 1.4287, + "step": 3618 + }, + { + "epoch": 0.08440644068488398, + "grad_norm": 1.638716697692871, + "learning_rate": 1.8756154444156518e-05, + "loss": 1.3548, + "step": 3619 + }, + { + "epoch": 0.0844297638240619, + "grad_norm": 1.4887486696243286, + "learning_rate": 1.8761337133972532e-05, + "loss": 1.4593, + "step": 3620 + }, + { + "epoch": 0.08445308696323982, + "grad_norm": 1.63435959815979, + "learning_rate": 1.8766519823788546e-05, + "loss": 1.4153, + "step": 3621 + }, + { + "epoch": 0.08447641010241773, + "grad_norm": 1.5852644443511963, + "learning_rate": 1.877170251360456e-05, + "loss": 1.4955, + "step": 3622 + }, + { + "epoch": 0.08449973324159565, + "grad_norm": 1.7639789581298828, + "learning_rate": 1.8776885203420575e-05, + "loss": 1.6683, + "step": 3623 + }, + { + "epoch": 0.08452305638077356, + "grad_norm": 2.21766996383667, + "learning_rate": 1.878206789323659e-05, + "loss": 1.1225, + "step": 3624 + }, + { + "epoch": 0.08454637951995149, + "grad_norm": 1.663895606994629, + "learning_rate": 1.8787250583052604e-05, + "loss": 1.3233, + "step": 3625 + }, + { + "epoch": 0.0845697026591294, + "grad_norm": 1.4655165672302246, + "learning_rate": 1.879243327286862e-05, + "loss": 0.8843, + "step": 3626 + }, + { + "epoch": 0.08459302579830733, + "grad_norm": 1.8070948123931885, + "learning_rate": 1.8797615962684636e-05, + "loss": 1.3909, + "step": 3627 + }, + { + "epoch": 0.08461634893748524, + "grad_norm": 2.0682473182678223, + "learning_rate": 1.880279865250065e-05, + "loss": 1.3427, + "step": 3628 + }, + { + "epoch": 0.08463967207666316, + "grad_norm": 1.361215591430664, + "learning_rate": 1.8807981342316665e-05, + "loss": 1.1998, + "step": 3629 + }, + { + "epoch": 0.08466299521584107, + "grad_norm": 2.013563394546509, + "learning_rate": 1.881316403213268e-05, + "loss": 1.4059, + "step": 3630 + }, + { + "epoch": 0.084686318355019, + "grad_norm": 1.9156532287597656, + "learning_rate": 1.8818346721948693e-05, + "loss": 1.4498, + "step": 3631 + }, + { + "epoch": 0.08470964149419691, + "grad_norm": 1.635302186012268, + "learning_rate": 1.8823529411764708e-05, + "loss": 1.5367, + "step": 3632 + }, + { + "epoch": 0.08473296463337483, + "grad_norm": 1.9218707084655762, + "learning_rate": 1.8828712101580722e-05, + "loss": 1.4478, + "step": 3633 + }, + { + "epoch": 0.08475628777255274, + "grad_norm": 3.0237009525299072, + "learning_rate": 1.8833894791396736e-05, + "loss": 1.6139, + "step": 3634 + }, + { + "epoch": 0.08477961091173067, + "grad_norm": 2.358811140060425, + "learning_rate": 1.883907748121275e-05, + "loss": 1.5581, + "step": 3635 + }, + { + "epoch": 0.08480293405090858, + "grad_norm": 2.286472797393799, + "learning_rate": 1.8844260171028765e-05, + "loss": 1.5672, + "step": 3636 + }, + { + "epoch": 0.0848262571900865, + "grad_norm": 1.866599440574646, + "learning_rate": 1.884944286084478e-05, + "loss": 1.5127, + "step": 3637 + }, + { + "epoch": 0.08484958032926442, + "grad_norm": 2.1738510131835938, + "learning_rate": 1.8854625550660794e-05, + "loss": 1.6779, + "step": 3638 + }, + { + "epoch": 0.08487290346844234, + "grad_norm": 1.97782564163208, + "learning_rate": 1.8859808240476808e-05, + "loss": 1.4563, + "step": 3639 + }, + { + "epoch": 0.08489622660762025, + "grad_norm": 2.0570616722106934, + "learning_rate": 1.8864990930292822e-05, + "loss": 1.6798, + "step": 3640 + }, + { + "epoch": 0.08491954974679818, + "grad_norm": 1.8737928867340088, + "learning_rate": 1.8870173620108837e-05, + "loss": 1.2149, + "step": 3641 + }, + { + "epoch": 0.08494287288597609, + "grad_norm": 1.6535226106643677, + "learning_rate": 1.887535630992485e-05, + "loss": 1.2412, + "step": 3642 + }, + { + "epoch": 0.08496619602515401, + "grad_norm": 1.6835589408874512, + "learning_rate": 1.888053899974087e-05, + "loss": 1.4337, + "step": 3643 + }, + { + "epoch": 0.08498951916433192, + "grad_norm": 2.3011474609375, + "learning_rate": 1.8885721689556883e-05, + "loss": 1.1645, + "step": 3644 + }, + { + "epoch": 0.08501284230350985, + "grad_norm": 2.1055448055267334, + "learning_rate": 1.8890904379372898e-05, + "loss": 1.8118, + "step": 3645 + }, + { + "epoch": 0.08503616544268776, + "grad_norm": 1.537703514099121, + "learning_rate": 1.8896087069188912e-05, + "loss": 1.3199, + "step": 3646 + }, + { + "epoch": 0.08505948858186567, + "grad_norm": 1.3928148746490479, + "learning_rate": 1.8901269759004926e-05, + "loss": 0.9513, + "step": 3647 + }, + { + "epoch": 0.0850828117210436, + "grad_norm": 1.7757236957550049, + "learning_rate": 1.890645244882094e-05, + "loss": 1.4446, + "step": 3648 + }, + { + "epoch": 0.0851061348602215, + "grad_norm": 1.6766371726989746, + "learning_rate": 1.8911635138636955e-05, + "loss": 1.2737, + "step": 3649 + }, + { + "epoch": 0.08512945799939943, + "grad_norm": 1.717544674873352, + "learning_rate": 1.891681782845297e-05, + "loss": 1.3209, + "step": 3650 + }, + { + "epoch": 0.08515278113857734, + "grad_norm": 1.769384741783142, + "learning_rate": 1.8922000518268984e-05, + "loss": 1.3914, + "step": 3651 + }, + { + "epoch": 0.08517610427775527, + "grad_norm": 2.045825481414795, + "learning_rate": 1.8927183208084998e-05, + "loss": 1.5438, + "step": 3652 + }, + { + "epoch": 0.08519942741693318, + "grad_norm": 1.7861770391464233, + "learning_rate": 1.8932365897901012e-05, + "loss": 1.6111, + "step": 3653 + }, + { + "epoch": 0.0852227505561111, + "grad_norm": 2.143268585205078, + "learning_rate": 1.8937548587717027e-05, + "loss": 1.4671, + "step": 3654 + }, + { + "epoch": 0.08524607369528901, + "grad_norm": 1.7686920166015625, + "learning_rate": 1.894273127753304e-05, + "loss": 1.5865, + "step": 3655 + }, + { + "epoch": 0.08526939683446694, + "grad_norm": 1.485282063484192, + "learning_rate": 1.8947913967349055e-05, + "loss": 1.5447, + "step": 3656 + }, + { + "epoch": 0.08529271997364485, + "grad_norm": 1.9272282123565674, + "learning_rate": 1.895309665716507e-05, + "loss": 1.465, + "step": 3657 + }, + { + "epoch": 0.08531604311282277, + "grad_norm": 2.2373533248901367, + "learning_rate": 1.8958279346981084e-05, + "loss": 1.5705, + "step": 3658 + }, + { + "epoch": 0.08533936625200068, + "grad_norm": 2.107943296432495, + "learning_rate": 1.8963462036797102e-05, + "loss": 1.8744, + "step": 3659 + }, + { + "epoch": 0.08536268939117861, + "grad_norm": 1.6185489892959595, + "learning_rate": 1.8968644726613116e-05, + "loss": 1.3097, + "step": 3660 + }, + { + "epoch": 0.08538601253035652, + "grad_norm": 1.7128583192825317, + "learning_rate": 1.897382741642913e-05, + "loss": 1.6936, + "step": 3661 + }, + { + "epoch": 0.08540933566953444, + "grad_norm": 1.5283836126327515, + "learning_rate": 1.8979010106245145e-05, + "loss": 1.2054, + "step": 3662 + }, + { + "epoch": 0.08543265880871236, + "grad_norm": 1.9866423606872559, + "learning_rate": 1.898419279606116e-05, + "loss": 1.6309, + "step": 3663 + }, + { + "epoch": 0.08545598194789028, + "grad_norm": 1.8106653690338135, + "learning_rate": 1.8989375485877173e-05, + "loss": 1.4622, + "step": 3664 + }, + { + "epoch": 0.08547930508706819, + "grad_norm": 1.4780253171920776, + "learning_rate": 1.8994558175693188e-05, + "loss": 1.4476, + "step": 3665 + }, + { + "epoch": 0.08550262822624612, + "grad_norm": 2.043191432952881, + "learning_rate": 1.8999740865509202e-05, + "loss": 1.4721, + "step": 3666 + }, + { + "epoch": 0.08552595136542403, + "grad_norm": 1.4947582483291626, + "learning_rate": 1.9004923555325213e-05, + "loss": 1.1956, + "step": 3667 + }, + { + "epoch": 0.08554927450460195, + "grad_norm": 1.7940278053283691, + "learning_rate": 1.9010106245141227e-05, + "loss": 1.0641, + "step": 3668 + }, + { + "epoch": 0.08557259764377986, + "grad_norm": 1.7259446382522583, + "learning_rate": 1.9015288934957242e-05, + "loss": 1.3405, + "step": 3669 + }, + { + "epoch": 0.08559592078295779, + "grad_norm": 1.8191657066345215, + "learning_rate": 1.902047162477326e-05, + "loss": 1.3307, + "step": 3670 + }, + { + "epoch": 0.0856192439221357, + "grad_norm": 1.7000370025634766, + "learning_rate": 1.9025654314589274e-05, + "loss": 1.1367, + "step": 3671 + }, + { + "epoch": 0.08564256706131362, + "grad_norm": 1.889972448348999, + "learning_rate": 1.9030837004405288e-05, + "loss": 1.415, + "step": 3672 + }, + { + "epoch": 0.08566589020049153, + "grad_norm": 2.0806078910827637, + "learning_rate": 1.9036019694221303e-05, + "loss": 1.3672, + "step": 3673 + }, + { + "epoch": 0.08568921333966946, + "grad_norm": 2.796506881713867, + "learning_rate": 1.9041202384037317e-05, + "loss": 1.7955, + "step": 3674 + }, + { + "epoch": 0.08571253647884737, + "grad_norm": 2.1324260234832764, + "learning_rate": 1.904638507385333e-05, + "loss": 1.3036, + "step": 3675 + }, + { + "epoch": 0.08573585961802528, + "grad_norm": 2.0976030826568604, + "learning_rate": 1.9051567763669346e-05, + "loss": 1.5918, + "step": 3676 + }, + { + "epoch": 0.0857591827572032, + "grad_norm": 1.7113218307495117, + "learning_rate": 1.905675045348536e-05, + "loss": 1.4825, + "step": 3677 + }, + { + "epoch": 0.08578250589638112, + "grad_norm": 1.7739856243133545, + "learning_rate": 1.9061933143301374e-05, + "loss": 1.5581, + "step": 3678 + }, + { + "epoch": 0.08580582903555904, + "grad_norm": 1.7396571636199951, + "learning_rate": 1.906711583311739e-05, + "loss": 1.1139, + "step": 3679 + }, + { + "epoch": 0.08582915217473695, + "grad_norm": 1.5428905487060547, + "learning_rate": 1.9072298522933403e-05, + "loss": 1.4518, + "step": 3680 + }, + { + "epoch": 0.08585247531391488, + "grad_norm": 1.8691036701202393, + "learning_rate": 1.9077481212749417e-05, + "loss": 1.4603, + "step": 3681 + }, + { + "epoch": 0.08587579845309279, + "grad_norm": 1.3939930200576782, + "learning_rate": 1.9082663902565432e-05, + "loss": 1.5307, + "step": 3682 + }, + { + "epoch": 0.08589912159227071, + "grad_norm": 1.632413625717163, + "learning_rate": 1.9087846592381446e-05, + "loss": 1.3989, + "step": 3683 + }, + { + "epoch": 0.08592244473144862, + "grad_norm": 2.055506706237793, + "learning_rate": 1.909302928219746e-05, + "loss": 1.9212, + "step": 3684 + }, + { + "epoch": 0.08594576787062655, + "grad_norm": 1.6543344259262085, + "learning_rate": 1.9098211972013475e-05, + "loss": 1.3353, + "step": 3685 + }, + { + "epoch": 0.08596909100980446, + "grad_norm": 1.8599748611450195, + "learning_rate": 1.910339466182949e-05, + "loss": 1.3546, + "step": 3686 + }, + { + "epoch": 0.08599241414898239, + "grad_norm": 1.7803127765655518, + "learning_rate": 1.9108577351645507e-05, + "loss": 1.5504, + "step": 3687 + }, + { + "epoch": 0.0860157372881603, + "grad_norm": 1.5186231136322021, + "learning_rate": 1.911376004146152e-05, + "loss": 1.0052, + "step": 3688 + }, + { + "epoch": 0.08603906042733822, + "grad_norm": 1.7595012187957764, + "learning_rate": 1.9118942731277536e-05, + "loss": 1.2889, + "step": 3689 + }, + { + "epoch": 0.08606238356651613, + "grad_norm": 1.9913487434387207, + "learning_rate": 1.912412542109355e-05, + "loss": 1.4898, + "step": 3690 + }, + { + "epoch": 0.08608570670569406, + "grad_norm": 1.6132420301437378, + "learning_rate": 1.9129308110909564e-05, + "loss": 1.3643, + "step": 3691 + }, + { + "epoch": 0.08610902984487197, + "grad_norm": 1.6206722259521484, + "learning_rate": 1.913449080072558e-05, + "loss": 1.4306, + "step": 3692 + }, + { + "epoch": 0.08613235298404989, + "grad_norm": 1.7904775142669678, + "learning_rate": 1.9139673490541593e-05, + "loss": 1.2245, + "step": 3693 + }, + { + "epoch": 0.0861556761232278, + "grad_norm": 1.7304673194885254, + "learning_rate": 1.9144856180357607e-05, + "loss": 1.5856, + "step": 3694 + }, + { + "epoch": 0.08617899926240573, + "grad_norm": 1.8599059581756592, + "learning_rate": 1.915003887017362e-05, + "loss": 1.6862, + "step": 3695 + }, + { + "epoch": 0.08620232240158364, + "grad_norm": 1.7575186491012573, + "learning_rate": 1.9155221559989636e-05, + "loss": 1.2792, + "step": 3696 + }, + { + "epoch": 0.08622564554076156, + "grad_norm": 2.756476640701294, + "learning_rate": 1.916040424980565e-05, + "loss": 1.6286, + "step": 3697 + }, + { + "epoch": 0.08624896867993948, + "grad_norm": 1.9986953735351562, + "learning_rate": 1.9165586939621665e-05, + "loss": 1.8472, + "step": 3698 + }, + { + "epoch": 0.0862722918191174, + "grad_norm": 1.4760515689849854, + "learning_rate": 1.917076962943768e-05, + "loss": 1.3095, + "step": 3699 + }, + { + "epoch": 0.08629561495829531, + "grad_norm": 1.581083059310913, + "learning_rate": 1.9175952319253693e-05, + "loss": 1.6524, + "step": 3700 + }, + { + "epoch": 0.08631893809747324, + "grad_norm": 2.1014351844787598, + "learning_rate": 1.9181135009069708e-05, + "loss": 1.4966, + "step": 3701 + }, + { + "epoch": 0.08634226123665115, + "grad_norm": 1.7185317277908325, + "learning_rate": 1.9186317698885722e-05, + "loss": 1.5509, + "step": 3702 + }, + { + "epoch": 0.08636558437582907, + "grad_norm": 1.9920899868011475, + "learning_rate": 1.919150038870174e-05, + "loss": 1.6712, + "step": 3703 + }, + { + "epoch": 0.08638890751500698, + "grad_norm": 1.8467237949371338, + "learning_rate": 1.9196683078517754e-05, + "loss": 1.2693, + "step": 3704 + }, + { + "epoch": 0.0864122306541849, + "grad_norm": 1.5148426294326782, + "learning_rate": 1.920186576833377e-05, + "loss": 1.5189, + "step": 3705 + }, + { + "epoch": 0.08643555379336282, + "grad_norm": 1.7767192125320435, + "learning_rate": 1.9207048458149783e-05, + "loss": 1.2703, + "step": 3706 + }, + { + "epoch": 0.08645887693254073, + "grad_norm": 1.9807710647583008, + "learning_rate": 1.9212231147965797e-05, + "loss": 1.5145, + "step": 3707 + }, + { + "epoch": 0.08648220007171865, + "grad_norm": 1.7413864135742188, + "learning_rate": 1.921741383778181e-05, + "loss": 1.6929, + "step": 3708 + }, + { + "epoch": 0.08650552321089656, + "grad_norm": 1.85059654712677, + "learning_rate": 1.9222596527597826e-05, + "loss": 1.3701, + "step": 3709 + }, + { + "epoch": 0.08652884635007449, + "grad_norm": 1.7285747528076172, + "learning_rate": 1.922777921741384e-05, + "loss": 1.6005, + "step": 3710 + }, + { + "epoch": 0.0865521694892524, + "grad_norm": 2.677480697631836, + "learning_rate": 1.9232961907229855e-05, + "loss": 1.4945, + "step": 3711 + }, + { + "epoch": 0.08657549262843033, + "grad_norm": 2.22979998588562, + "learning_rate": 1.923814459704587e-05, + "loss": 1.6705, + "step": 3712 + }, + { + "epoch": 0.08659881576760824, + "grad_norm": 1.4900656938552856, + "learning_rate": 1.9243327286861883e-05, + "loss": 1.2837, + "step": 3713 + }, + { + "epoch": 0.08662213890678616, + "grad_norm": 1.2594544887542725, + "learning_rate": 1.9248509976677898e-05, + "loss": 1.1585, + "step": 3714 + }, + { + "epoch": 0.08664546204596407, + "grad_norm": 2.0319337844848633, + "learning_rate": 1.9253692666493912e-05, + "loss": 1.6878, + "step": 3715 + }, + { + "epoch": 0.086668785185142, + "grad_norm": 2.0499188899993896, + "learning_rate": 1.9258875356309926e-05, + "loss": 1.4759, + "step": 3716 + }, + { + "epoch": 0.08669210832431991, + "grad_norm": 2.2834079265594482, + "learning_rate": 1.926405804612594e-05, + "loss": 1.5455, + "step": 3717 + }, + { + "epoch": 0.08671543146349783, + "grad_norm": 1.8450220823287964, + "learning_rate": 1.9269240735941955e-05, + "loss": 1.5251, + "step": 3718 + }, + { + "epoch": 0.08673875460267574, + "grad_norm": 2.1184442043304443, + "learning_rate": 1.927442342575797e-05, + "loss": 1.7317, + "step": 3719 + }, + { + "epoch": 0.08676207774185367, + "grad_norm": 1.4140855073928833, + "learning_rate": 1.9279606115573987e-05, + "loss": 1.1828, + "step": 3720 + }, + { + "epoch": 0.08678540088103158, + "grad_norm": 1.673884630203247, + "learning_rate": 1.928478880539e-05, + "loss": 1.7712, + "step": 3721 + }, + { + "epoch": 0.0868087240202095, + "grad_norm": 1.5101699829101562, + "learning_rate": 1.9289971495206012e-05, + "loss": 1.4416, + "step": 3722 + }, + { + "epoch": 0.08683204715938742, + "grad_norm": 1.884077787399292, + "learning_rate": 1.9295154185022027e-05, + "loss": 1.1231, + "step": 3723 + }, + { + "epoch": 0.08685537029856534, + "grad_norm": 1.7956326007843018, + "learning_rate": 1.930033687483804e-05, + "loss": 1.9195, + "step": 3724 + }, + { + "epoch": 0.08687869343774325, + "grad_norm": 2.028334379196167, + "learning_rate": 1.9305519564654055e-05, + "loss": 1.6812, + "step": 3725 + }, + { + "epoch": 0.08690201657692118, + "grad_norm": 1.541755199432373, + "learning_rate": 1.931070225447007e-05, + "loss": 1.5305, + "step": 3726 + }, + { + "epoch": 0.08692533971609909, + "grad_norm": 2.167618989944458, + "learning_rate": 1.9315884944286084e-05, + "loss": 1.5456, + "step": 3727 + }, + { + "epoch": 0.08694866285527701, + "grad_norm": 1.8667240142822266, + "learning_rate": 1.93210676341021e-05, + "loss": 1.646, + "step": 3728 + }, + { + "epoch": 0.08697198599445492, + "grad_norm": 2.162578821182251, + "learning_rate": 1.9326250323918113e-05, + "loss": 1.53, + "step": 3729 + }, + { + "epoch": 0.08699530913363285, + "grad_norm": 1.4681026935577393, + "learning_rate": 1.9331433013734127e-05, + "loss": 1.4004, + "step": 3730 + }, + { + "epoch": 0.08701863227281076, + "grad_norm": 1.5720863342285156, + "learning_rate": 1.9336615703550145e-05, + "loss": 1.2065, + "step": 3731 + }, + { + "epoch": 0.08704195541198868, + "grad_norm": 1.8887262344360352, + "learning_rate": 1.934179839336616e-05, + "loss": 1.516, + "step": 3732 + }, + { + "epoch": 0.0870652785511666, + "grad_norm": 1.582972764968872, + "learning_rate": 1.9346981083182174e-05, + "loss": 1.2518, + "step": 3733 + }, + { + "epoch": 0.0870886016903445, + "grad_norm": 2.031712055206299, + "learning_rate": 1.9352163772998188e-05, + "loss": 1.2827, + "step": 3734 + }, + { + "epoch": 0.08711192482952243, + "grad_norm": 1.9298467636108398, + "learning_rate": 1.9357346462814202e-05, + "loss": 1.52, + "step": 3735 + }, + { + "epoch": 0.08713524796870034, + "grad_norm": 1.7419377565383911, + "learning_rate": 1.9362529152630217e-05, + "loss": 1.192, + "step": 3736 + }, + { + "epoch": 0.08715857110787827, + "grad_norm": 2.0010645389556885, + "learning_rate": 1.936771184244623e-05, + "loss": 1.3094, + "step": 3737 + }, + { + "epoch": 0.08718189424705618, + "grad_norm": 1.4915006160736084, + "learning_rate": 1.9372894532262245e-05, + "loss": 1.3347, + "step": 3738 + }, + { + "epoch": 0.0872052173862341, + "grad_norm": 1.9095317125320435, + "learning_rate": 1.937807722207826e-05, + "loss": 1.6573, + "step": 3739 + }, + { + "epoch": 0.08722854052541201, + "grad_norm": 1.959438443183899, + "learning_rate": 1.9383259911894274e-05, + "loss": 1.2514, + "step": 3740 + }, + { + "epoch": 0.08725186366458994, + "grad_norm": 1.4582146406173706, + "learning_rate": 1.938844260171029e-05, + "loss": 1.2887, + "step": 3741 + }, + { + "epoch": 0.08727518680376785, + "grad_norm": 1.5818802118301392, + "learning_rate": 1.9393625291526303e-05, + "loss": 1.1757, + "step": 3742 + }, + { + "epoch": 0.08729850994294577, + "grad_norm": 4.244632244110107, + "learning_rate": 1.9398807981342317e-05, + "loss": 1.6171, + "step": 3743 + }, + { + "epoch": 0.08732183308212368, + "grad_norm": 1.8370835781097412, + "learning_rate": 1.940399067115833e-05, + "loss": 1.2676, + "step": 3744 + }, + { + "epoch": 0.08734515622130161, + "grad_norm": 1.8897545337677002, + "learning_rate": 1.9409173360974346e-05, + "loss": 1.6854, + "step": 3745 + }, + { + "epoch": 0.08736847936047952, + "grad_norm": 1.985708475112915, + "learning_rate": 1.941435605079036e-05, + "loss": 1.3562, + "step": 3746 + }, + { + "epoch": 0.08739180249965744, + "grad_norm": 1.682483434677124, + "learning_rate": 1.9419538740606378e-05, + "loss": 1.375, + "step": 3747 + }, + { + "epoch": 0.08741512563883536, + "grad_norm": 1.743619441986084, + "learning_rate": 1.9424721430422392e-05, + "loss": 1.23, + "step": 3748 + }, + { + "epoch": 0.08743844877801328, + "grad_norm": 1.8465096950531006, + "learning_rate": 1.9429904120238407e-05, + "loss": 1.4852, + "step": 3749 + }, + { + "epoch": 0.08746177191719119, + "grad_norm": 1.4261040687561035, + "learning_rate": 1.943508681005442e-05, + "loss": 1.1589, + "step": 3750 + }, + { + "epoch": 0.08748509505636912, + "grad_norm": 1.8200268745422363, + "learning_rate": 1.9440269499870435e-05, + "loss": 1.5185, + "step": 3751 + }, + { + "epoch": 0.08750841819554703, + "grad_norm": 1.8040040731430054, + "learning_rate": 1.944545218968645e-05, + "loss": 1.2195, + "step": 3752 + }, + { + "epoch": 0.08753174133472495, + "grad_norm": 2.745818614959717, + "learning_rate": 1.9450634879502464e-05, + "loss": 1.5439, + "step": 3753 + }, + { + "epoch": 0.08755506447390286, + "grad_norm": 2.6260766983032227, + "learning_rate": 1.9455817569318478e-05, + "loss": 1.4478, + "step": 3754 + }, + { + "epoch": 0.08757838761308079, + "grad_norm": 1.6455649137496948, + "learning_rate": 1.9461000259134493e-05, + "loss": 1.5318, + "step": 3755 + }, + { + "epoch": 0.0876017107522587, + "grad_norm": 1.8115977048873901, + "learning_rate": 1.9466182948950507e-05, + "loss": 1.4069, + "step": 3756 + }, + { + "epoch": 0.08762503389143662, + "grad_norm": 2.10554575920105, + "learning_rate": 1.947136563876652e-05, + "loss": 1.8464, + "step": 3757 + }, + { + "epoch": 0.08764835703061453, + "grad_norm": 1.4829670190811157, + "learning_rate": 1.9476548328582536e-05, + "loss": 1.0982, + "step": 3758 + }, + { + "epoch": 0.08767168016979246, + "grad_norm": 1.6490533351898193, + "learning_rate": 1.948173101839855e-05, + "loss": 1.2205, + "step": 3759 + }, + { + "epoch": 0.08769500330897037, + "grad_norm": 1.7997782230377197, + "learning_rate": 1.9486913708214564e-05, + "loss": 1.6498, + "step": 3760 + }, + { + "epoch": 0.08771832644814828, + "grad_norm": 1.3423871994018555, + "learning_rate": 1.949209639803058e-05, + "loss": 1.2353, + "step": 3761 + }, + { + "epoch": 0.0877416495873262, + "grad_norm": 1.9823672771453857, + "learning_rate": 1.9497279087846593e-05, + "loss": 1.5512, + "step": 3762 + }, + { + "epoch": 0.08776497272650412, + "grad_norm": 1.8137081861495972, + "learning_rate": 1.9502461777662607e-05, + "loss": 1.1692, + "step": 3763 + }, + { + "epoch": 0.08778829586568204, + "grad_norm": 1.948642373085022, + "learning_rate": 1.9507644467478625e-05, + "loss": 1.1127, + "step": 3764 + }, + { + "epoch": 0.08781161900485995, + "grad_norm": 1.5273735523223877, + "learning_rate": 1.951282715729464e-05, + "loss": 1.258, + "step": 3765 + }, + { + "epoch": 0.08783494214403788, + "grad_norm": 1.8528945446014404, + "learning_rate": 1.9518009847110654e-05, + "loss": 1.283, + "step": 3766 + }, + { + "epoch": 0.08785826528321579, + "grad_norm": 2.193530559539795, + "learning_rate": 1.9523192536926668e-05, + "loss": 1.385, + "step": 3767 + }, + { + "epoch": 0.08788158842239371, + "grad_norm": 1.6347289085388184, + "learning_rate": 1.9528375226742683e-05, + "loss": 1.4411, + "step": 3768 + }, + { + "epoch": 0.08790491156157162, + "grad_norm": 1.4878569841384888, + "learning_rate": 1.9533557916558697e-05, + "loss": 1.338, + "step": 3769 + }, + { + "epoch": 0.08792823470074955, + "grad_norm": 1.523554801940918, + "learning_rate": 1.953874060637471e-05, + "loss": 1.4254, + "step": 3770 + }, + { + "epoch": 0.08795155783992746, + "grad_norm": 1.8349013328552246, + "learning_rate": 1.9543923296190726e-05, + "loss": 1.2373, + "step": 3771 + }, + { + "epoch": 0.08797488097910539, + "grad_norm": 1.794135332107544, + "learning_rate": 1.954910598600674e-05, + "loss": 1.2954, + "step": 3772 + }, + { + "epoch": 0.0879982041182833, + "grad_norm": 3.187321424484253, + "learning_rate": 1.9554288675822754e-05, + "loss": 1.0685, + "step": 3773 + }, + { + "epoch": 0.08802152725746122, + "grad_norm": 1.8911279439926147, + "learning_rate": 1.955947136563877e-05, + "loss": 1.4991, + "step": 3774 + }, + { + "epoch": 0.08804485039663913, + "grad_norm": 1.707911729812622, + "learning_rate": 1.9564654055454783e-05, + "loss": 1.6678, + "step": 3775 + }, + { + "epoch": 0.08806817353581706, + "grad_norm": 2.3012478351593018, + "learning_rate": 1.9569836745270797e-05, + "loss": 1.4112, + "step": 3776 + }, + { + "epoch": 0.08809149667499497, + "grad_norm": 1.5143091678619385, + "learning_rate": 1.957501943508681e-05, + "loss": 1.4209, + "step": 3777 + }, + { + "epoch": 0.08811481981417289, + "grad_norm": 1.869060754776001, + "learning_rate": 1.9580202124902826e-05, + "loss": 1.4921, + "step": 3778 + }, + { + "epoch": 0.0881381429533508, + "grad_norm": 2.0710952281951904, + "learning_rate": 1.958538481471884e-05, + "loss": 1.556, + "step": 3779 + }, + { + "epoch": 0.08816146609252873, + "grad_norm": 2.3364593982696533, + "learning_rate": 1.9590567504534855e-05, + "loss": 1.3446, + "step": 3780 + }, + { + "epoch": 0.08818478923170664, + "grad_norm": 1.7574700117111206, + "learning_rate": 1.959575019435087e-05, + "loss": 1.4665, + "step": 3781 + }, + { + "epoch": 0.08820811237088456, + "grad_norm": 1.8747336864471436, + "learning_rate": 1.9600932884166883e-05, + "loss": 1.3774, + "step": 3782 + }, + { + "epoch": 0.08823143551006248, + "grad_norm": 1.802948236465454, + "learning_rate": 1.9606115573982898e-05, + "loss": 1.3237, + "step": 3783 + }, + { + "epoch": 0.0882547586492404, + "grad_norm": 1.880509376525879, + "learning_rate": 1.9611298263798912e-05, + "loss": 1.6404, + "step": 3784 + }, + { + "epoch": 0.08827808178841831, + "grad_norm": 1.7206971645355225, + "learning_rate": 1.9616480953614926e-05, + "loss": 1.2631, + "step": 3785 + }, + { + "epoch": 0.08830140492759624, + "grad_norm": 2.0995943546295166, + "learning_rate": 1.962166364343094e-05, + "loss": 1.283, + "step": 3786 + }, + { + "epoch": 0.08832472806677415, + "grad_norm": 2.124122142791748, + "learning_rate": 1.9626846333246955e-05, + "loss": 1.2731, + "step": 3787 + }, + { + "epoch": 0.08834805120595207, + "grad_norm": 1.952669382095337, + "learning_rate": 1.963202902306297e-05, + "loss": 1.4505, + "step": 3788 + }, + { + "epoch": 0.08837137434512998, + "grad_norm": 1.7214585542678833, + "learning_rate": 1.9637211712878984e-05, + "loss": 1.7156, + "step": 3789 + }, + { + "epoch": 0.0883946974843079, + "grad_norm": 1.4622641801834106, + "learning_rate": 1.9642394402694998e-05, + "loss": 1.6118, + "step": 3790 + }, + { + "epoch": 0.08841802062348582, + "grad_norm": 1.7372667789459229, + "learning_rate": 1.9647577092511016e-05, + "loss": 1.5947, + "step": 3791 + }, + { + "epoch": 0.08844134376266373, + "grad_norm": 2.0285115242004395, + "learning_rate": 1.965275978232703e-05, + "loss": 1.8061, + "step": 3792 + }, + { + "epoch": 0.08846466690184165, + "grad_norm": 1.7647212743759155, + "learning_rate": 1.9657942472143045e-05, + "loss": 1.2375, + "step": 3793 + }, + { + "epoch": 0.08848799004101957, + "grad_norm": 2.0434303283691406, + "learning_rate": 1.966312516195906e-05, + "loss": 1.4688, + "step": 3794 + }, + { + "epoch": 0.08851131318019749, + "grad_norm": 2.1963582038879395, + "learning_rate": 1.9668307851775073e-05, + "loss": 1.3006, + "step": 3795 + }, + { + "epoch": 0.0885346363193754, + "grad_norm": 1.637125849723816, + "learning_rate": 1.9673490541591088e-05, + "loss": 1.4169, + "step": 3796 + }, + { + "epoch": 0.08855795945855333, + "grad_norm": 1.9857791662216187, + "learning_rate": 1.9678673231407102e-05, + "loss": 1.2184, + "step": 3797 + }, + { + "epoch": 0.08858128259773124, + "grad_norm": 1.8531452417373657, + "learning_rate": 1.9683855921223116e-05, + "loss": 1.6313, + "step": 3798 + }, + { + "epoch": 0.08860460573690916, + "grad_norm": 1.427011489868164, + "learning_rate": 1.968903861103913e-05, + "loss": 1.2982, + "step": 3799 + }, + { + "epoch": 0.08862792887608707, + "grad_norm": 2.0080931186676025, + "learning_rate": 1.9694221300855145e-05, + "loss": 1.0828, + "step": 3800 + }, + { + "epoch": 0.088651252015265, + "grad_norm": 1.6396167278289795, + "learning_rate": 1.969940399067116e-05, + "loss": 1.5011, + "step": 3801 + }, + { + "epoch": 0.08867457515444291, + "grad_norm": 1.9646265506744385, + "learning_rate": 1.9704586680487174e-05, + "loss": 1.7735, + "step": 3802 + }, + { + "epoch": 0.08869789829362083, + "grad_norm": 1.7817018032073975, + "learning_rate": 1.9709769370303188e-05, + "loss": 1.457, + "step": 3803 + }, + { + "epoch": 0.08872122143279874, + "grad_norm": 2.1395010948181152, + "learning_rate": 1.9714952060119202e-05, + "loss": 1.569, + "step": 3804 + }, + { + "epoch": 0.08874454457197667, + "grad_norm": 1.8735876083374023, + "learning_rate": 1.9720134749935217e-05, + "loss": 1.3087, + "step": 3805 + }, + { + "epoch": 0.08876786771115458, + "grad_norm": 1.593678593635559, + "learning_rate": 1.972531743975123e-05, + "loss": 1.6819, + "step": 3806 + }, + { + "epoch": 0.0887911908503325, + "grad_norm": 1.4559507369995117, + "learning_rate": 1.973050012956725e-05, + "loss": 1.6107, + "step": 3807 + }, + { + "epoch": 0.08881451398951042, + "grad_norm": 1.3212164640426636, + "learning_rate": 1.9735682819383263e-05, + "loss": 1.3423, + "step": 3808 + }, + { + "epoch": 0.08883783712868834, + "grad_norm": 1.7826329469680786, + "learning_rate": 1.9740865509199278e-05, + "loss": 1.3699, + "step": 3809 + }, + { + "epoch": 0.08886116026786625, + "grad_norm": 1.4961615800857544, + "learning_rate": 1.9746048199015292e-05, + "loss": 1.3433, + "step": 3810 + }, + { + "epoch": 0.08888448340704418, + "grad_norm": 1.5190882682800293, + "learning_rate": 1.9751230888831306e-05, + "loss": 1.2961, + "step": 3811 + }, + { + "epoch": 0.08890780654622209, + "grad_norm": 1.6436294317245483, + "learning_rate": 1.975641357864732e-05, + "loss": 1.2565, + "step": 3812 + }, + { + "epoch": 0.08893112968540001, + "grad_norm": 2.6425702571868896, + "learning_rate": 1.9761596268463335e-05, + "loss": 1.4867, + "step": 3813 + }, + { + "epoch": 0.08895445282457792, + "grad_norm": 1.6602445840835571, + "learning_rate": 1.976677895827935e-05, + "loss": 1.3292, + "step": 3814 + }, + { + "epoch": 0.08897777596375585, + "grad_norm": 2.169207811355591, + "learning_rate": 1.9771961648095364e-05, + "loss": 1.6807, + "step": 3815 + }, + { + "epoch": 0.08900109910293376, + "grad_norm": 1.6809145212173462, + "learning_rate": 1.9777144337911378e-05, + "loss": 1.4716, + "step": 3816 + }, + { + "epoch": 0.08902442224211168, + "grad_norm": 1.908535122871399, + "learning_rate": 1.9782327027727392e-05, + "loss": 1.3085, + "step": 3817 + }, + { + "epoch": 0.0890477453812896, + "grad_norm": 2.0437328815460205, + "learning_rate": 1.9787509717543407e-05, + "loss": 1.4063, + "step": 3818 + }, + { + "epoch": 0.0890710685204675, + "grad_norm": 1.5940383672714233, + "learning_rate": 1.979269240735942e-05, + "loss": 1.1642, + "step": 3819 + }, + { + "epoch": 0.08909439165964543, + "grad_norm": 2.0657832622528076, + "learning_rate": 1.9797875097175435e-05, + "loss": 1.329, + "step": 3820 + }, + { + "epoch": 0.08911771479882334, + "grad_norm": 1.712106704711914, + "learning_rate": 1.980305778699145e-05, + "loss": 1.5452, + "step": 3821 + }, + { + "epoch": 0.08914103793800127, + "grad_norm": 1.7552653551101685, + "learning_rate": 1.9808240476807464e-05, + "loss": 1.2633, + "step": 3822 + }, + { + "epoch": 0.08916436107717918, + "grad_norm": 2.0012056827545166, + "learning_rate": 1.981342316662348e-05, + "loss": 1.4567, + "step": 3823 + }, + { + "epoch": 0.0891876842163571, + "grad_norm": 1.6985337734222412, + "learning_rate": 1.9818605856439496e-05, + "loss": 1.496, + "step": 3824 + }, + { + "epoch": 0.08921100735553501, + "grad_norm": 1.4563976526260376, + "learning_rate": 1.982378854625551e-05, + "loss": 1.114, + "step": 3825 + }, + { + "epoch": 0.08923433049471294, + "grad_norm": 1.8010538816452026, + "learning_rate": 1.9828971236071525e-05, + "loss": 1.6439, + "step": 3826 + }, + { + "epoch": 0.08925765363389085, + "grad_norm": 1.8285574913024902, + "learning_rate": 1.983415392588754e-05, + "loss": 1.1611, + "step": 3827 + }, + { + "epoch": 0.08928097677306877, + "grad_norm": 1.7888448238372803, + "learning_rate": 1.9839336615703553e-05, + "loss": 1.4124, + "step": 3828 + }, + { + "epoch": 0.08930429991224668, + "grad_norm": 1.5003128051757812, + "learning_rate": 1.9844519305519568e-05, + "loss": 1.2427, + "step": 3829 + }, + { + "epoch": 0.08932762305142461, + "grad_norm": 2.063203811645508, + "learning_rate": 1.9849701995335582e-05, + "loss": 1.5464, + "step": 3830 + }, + { + "epoch": 0.08935094619060252, + "grad_norm": 1.610795497894287, + "learning_rate": 1.9854884685151597e-05, + "loss": 1.2449, + "step": 3831 + }, + { + "epoch": 0.08937426932978045, + "grad_norm": 1.6565496921539307, + "learning_rate": 1.9860067374967607e-05, + "loss": 1.4032, + "step": 3832 + }, + { + "epoch": 0.08939759246895836, + "grad_norm": 3.492591142654419, + "learning_rate": 1.9865250064783622e-05, + "loss": 1.4794, + "step": 3833 + }, + { + "epoch": 0.08942091560813628, + "grad_norm": 1.7358639240264893, + "learning_rate": 1.9870432754599636e-05, + "loss": 1.6205, + "step": 3834 + }, + { + "epoch": 0.08944423874731419, + "grad_norm": 1.711347222328186, + "learning_rate": 1.9875615444415654e-05, + "loss": 1.8102, + "step": 3835 + }, + { + "epoch": 0.08946756188649212, + "grad_norm": 1.66538667678833, + "learning_rate": 1.9880798134231668e-05, + "loss": 1.3878, + "step": 3836 + }, + { + "epoch": 0.08949088502567003, + "grad_norm": 2.1874492168426514, + "learning_rate": 1.9885980824047683e-05, + "loss": 1.4028, + "step": 3837 + }, + { + "epoch": 0.08951420816484795, + "grad_norm": 1.9095275402069092, + "learning_rate": 1.9891163513863697e-05, + "loss": 1.2287, + "step": 3838 + }, + { + "epoch": 0.08953753130402586, + "grad_norm": 1.9796547889709473, + "learning_rate": 1.989634620367971e-05, + "loss": 1.1588, + "step": 3839 + }, + { + "epoch": 0.08956085444320379, + "grad_norm": 1.461793065071106, + "learning_rate": 1.9901528893495726e-05, + "loss": 0.8892, + "step": 3840 + }, + { + "epoch": 0.0895841775823817, + "grad_norm": 1.6312065124511719, + "learning_rate": 1.990671158331174e-05, + "loss": 1.5716, + "step": 3841 + }, + { + "epoch": 0.08960750072155962, + "grad_norm": 1.985102653503418, + "learning_rate": 1.9911894273127754e-05, + "loss": 1.387, + "step": 3842 + }, + { + "epoch": 0.08963082386073754, + "grad_norm": 1.6724156141281128, + "learning_rate": 1.991707696294377e-05, + "loss": 1.48, + "step": 3843 + }, + { + "epoch": 0.08965414699991546, + "grad_norm": 2.1611175537109375, + "learning_rate": 1.9922259652759783e-05, + "loss": 1.586, + "step": 3844 + }, + { + "epoch": 0.08967747013909337, + "grad_norm": 1.7135130167007446, + "learning_rate": 1.9927442342575797e-05, + "loss": 1.3561, + "step": 3845 + }, + { + "epoch": 0.0897007932782713, + "grad_norm": 1.9912513494491577, + "learning_rate": 1.9932625032391812e-05, + "loss": 1.4008, + "step": 3846 + }, + { + "epoch": 0.0897241164174492, + "grad_norm": 1.6305721998214722, + "learning_rate": 1.9937807722207826e-05, + "loss": 1.187, + "step": 3847 + }, + { + "epoch": 0.08974743955662712, + "grad_norm": 1.52975594997406, + "learning_rate": 1.994299041202384e-05, + "loss": 1.3399, + "step": 3848 + }, + { + "epoch": 0.08977076269580504, + "grad_norm": 1.9327449798583984, + "learning_rate": 1.9948173101839855e-05, + "loss": 1.7132, + "step": 3849 + }, + { + "epoch": 0.08979408583498295, + "grad_norm": 1.8760170936584473, + "learning_rate": 1.995335579165587e-05, + "loss": 1.4139, + "step": 3850 + }, + { + "epoch": 0.08981740897416088, + "grad_norm": 2.0891501903533936, + "learning_rate": 1.9958538481471887e-05, + "loss": 1.5883, + "step": 3851 + }, + { + "epoch": 0.08984073211333879, + "grad_norm": 1.7092490196228027, + "learning_rate": 1.99637211712879e-05, + "loss": 1.7111, + "step": 3852 + }, + { + "epoch": 0.08986405525251671, + "grad_norm": 1.8873919248580933, + "learning_rate": 1.9968903861103916e-05, + "loss": 1.5662, + "step": 3853 + }, + { + "epoch": 0.08988737839169463, + "grad_norm": 1.5830549001693726, + "learning_rate": 1.997408655091993e-05, + "loss": 1.6033, + "step": 3854 + }, + { + "epoch": 0.08991070153087255, + "grad_norm": 2.1742613315582275, + "learning_rate": 1.9979269240735944e-05, + "loss": 1.7105, + "step": 3855 + }, + { + "epoch": 0.08993402467005046, + "grad_norm": 2.1118738651275635, + "learning_rate": 1.998445193055196e-05, + "loss": 1.4206, + "step": 3856 + }, + { + "epoch": 0.08995734780922839, + "grad_norm": 2.103388547897339, + "learning_rate": 1.9989634620367973e-05, + "loss": 1.6509, + "step": 3857 + }, + { + "epoch": 0.0899806709484063, + "grad_norm": 1.6943275928497314, + "learning_rate": 1.9994817310183987e-05, + "loss": 1.4174, + "step": 3858 + }, + { + "epoch": 0.09000399408758422, + "grad_norm": 2.7844085693359375, + "learning_rate": 2e-05, + "loss": 1.4774, + "step": 3859 + }, + { + "epoch": 0.09002731722676213, + "grad_norm": 1.8078947067260742, + "learning_rate": 1.9999999996829872e-05, + "loss": 1.2295, + "step": 3860 + }, + { + "epoch": 0.09005064036594006, + "grad_norm": 1.9623292684555054, + "learning_rate": 1.9999999987319475e-05, + "loss": 1.4901, + "step": 3861 + }, + { + "epoch": 0.09007396350511797, + "grad_norm": 1.6764906644821167, + "learning_rate": 1.9999999971468818e-05, + "loss": 1.2552, + "step": 3862 + }, + { + "epoch": 0.0900972866442959, + "grad_norm": 1.8891198635101318, + "learning_rate": 1.99999999492779e-05, + "loss": 1.7509, + "step": 3863 + }, + { + "epoch": 0.0901206097834738, + "grad_norm": 1.8767303228378296, + "learning_rate": 1.999999992074672e-05, + "loss": 1.495, + "step": 3864 + }, + { + "epoch": 0.09014393292265173, + "grad_norm": 2.5242903232574463, + "learning_rate": 1.9999999885875275e-05, + "loss": 1.6351, + "step": 3865 + }, + { + "epoch": 0.09016725606182964, + "grad_norm": 1.708688735961914, + "learning_rate": 1.9999999844663568e-05, + "loss": 1.4466, + "step": 3866 + }, + { + "epoch": 0.09019057920100756, + "grad_norm": 2.3333094120025635, + "learning_rate": 1.9999999797111602e-05, + "loss": 1.2606, + "step": 3867 + }, + { + "epoch": 0.09021390234018548, + "grad_norm": 1.7503955364227295, + "learning_rate": 1.999999974321937e-05, + "loss": 1.4878, + "step": 3868 + }, + { + "epoch": 0.0902372254793634, + "grad_norm": 2.1751153469085693, + "learning_rate": 1.9999999682986876e-05, + "loss": 1.5093, + "step": 3869 + }, + { + "epoch": 0.09026054861854131, + "grad_norm": 1.9692846536636353, + "learning_rate": 1.999999961641412e-05, + "loss": 1.2518, + "step": 3870 + }, + { + "epoch": 0.09028387175771924, + "grad_norm": 1.744352102279663, + "learning_rate": 1.9999999543501102e-05, + "loss": 1.4566, + "step": 3871 + }, + { + "epoch": 0.09030719489689715, + "grad_norm": 1.633447527885437, + "learning_rate": 1.999999946424782e-05, + "loss": 1.4261, + "step": 3872 + }, + { + "epoch": 0.09033051803607507, + "grad_norm": 1.6854172945022583, + "learning_rate": 1.999999937865428e-05, + "loss": 1.1874, + "step": 3873 + }, + { + "epoch": 0.09035384117525298, + "grad_norm": 1.6113791465759277, + "learning_rate": 1.9999999286720472e-05, + "loss": 1.0596, + "step": 3874 + }, + { + "epoch": 0.0903771643144309, + "grad_norm": 1.8399914503097534, + "learning_rate": 1.999999918844641e-05, + "loss": 1.6591, + "step": 3875 + }, + { + "epoch": 0.09040048745360882, + "grad_norm": 1.559280276298523, + "learning_rate": 1.999999908383208e-05, + "loss": 1.4071, + "step": 3876 + }, + { + "epoch": 0.09042381059278673, + "grad_norm": 2.0830109119415283, + "learning_rate": 1.999999897287749e-05, + "loss": 1.1728, + "step": 3877 + }, + { + "epoch": 0.09044713373196465, + "grad_norm": 2.0221612453460693, + "learning_rate": 1.9999998855582636e-05, + "loss": 1.4678, + "step": 3878 + }, + { + "epoch": 0.09047045687114257, + "grad_norm": 1.6200062036514282, + "learning_rate": 1.9999998731947522e-05, + "loss": 1.6207, + "step": 3879 + }, + { + "epoch": 0.09049378001032049, + "grad_norm": 1.6108903884887695, + "learning_rate": 1.9999998601972143e-05, + "loss": 0.9769, + "step": 3880 + }, + { + "epoch": 0.0905171031494984, + "grad_norm": 2.0176987648010254, + "learning_rate": 1.9999998465656505e-05, + "loss": 1.8439, + "step": 3881 + }, + { + "epoch": 0.09054042628867633, + "grad_norm": 1.806272268295288, + "learning_rate": 1.9999998323000608e-05, + "loss": 1.6001, + "step": 3882 + }, + { + "epoch": 0.09056374942785424, + "grad_norm": 1.7513991594314575, + "learning_rate": 1.9999998174004446e-05, + "loss": 1.4423, + "step": 3883 + }, + { + "epoch": 0.09058707256703216, + "grad_norm": 1.9214335680007935, + "learning_rate": 1.9999998018668023e-05, + "loss": 1.4684, + "step": 3884 + }, + { + "epoch": 0.09061039570621007, + "grad_norm": 1.46034836769104, + "learning_rate": 1.9999997856991338e-05, + "loss": 0.9545, + "step": 3885 + }, + { + "epoch": 0.090633718845388, + "grad_norm": 1.6021389961242676, + "learning_rate": 1.9999997688974394e-05, + "loss": 1.3276, + "step": 3886 + }, + { + "epoch": 0.09065704198456591, + "grad_norm": 1.7397795915603638, + "learning_rate": 1.9999997514617188e-05, + "loss": 1.5114, + "step": 3887 + }, + { + "epoch": 0.09068036512374383, + "grad_norm": 1.8831110000610352, + "learning_rate": 1.999999733391972e-05, + "loss": 1.8704, + "step": 3888 + }, + { + "epoch": 0.09070368826292174, + "grad_norm": 1.8681260347366333, + "learning_rate": 1.9999997146881995e-05, + "loss": 1.4932, + "step": 3889 + }, + { + "epoch": 0.09072701140209967, + "grad_norm": 1.951125979423523, + "learning_rate": 1.9999996953504003e-05, + "loss": 1.4437, + "step": 3890 + }, + { + "epoch": 0.09075033454127758, + "grad_norm": 2.1680264472961426, + "learning_rate": 1.9999996753785757e-05, + "loss": 1.3859, + "step": 3891 + }, + { + "epoch": 0.0907736576804555, + "grad_norm": 1.7976813316345215, + "learning_rate": 1.9999996547727246e-05, + "loss": 1.8385, + "step": 3892 + }, + { + "epoch": 0.09079698081963342, + "grad_norm": 1.821331262588501, + "learning_rate": 1.9999996335328476e-05, + "loss": 1.2497, + "step": 3893 + }, + { + "epoch": 0.09082030395881134, + "grad_norm": 2.1051344871520996, + "learning_rate": 1.9999996116589448e-05, + "loss": 1.5435, + "step": 3894 + }, + { + "epoch": 0.09084362709798925, + "grad_norm": 2.2845866680145264, + "learning_rate": 1.9999995891510154e-05, + "loss": 1.0889, + "step": 3895 + }, + { + "epoch": 0.09086695023716718, + "grad_norm": 1.772169589996338, + "learning_rate": 1.9999995660090605e-05, + "loss": 1.4212, + "step": 3896 + }, + { + "epoch": 0.09089027337634509, + "grad_norm": 1.899332046508789, + "learning_rate": 1.9999995422330798e-05, + "loss": 1.3668, + "step": 3897 + }, + { + "epoch": 0.09091359651552301, + "grad_norm": 1.870027780532837, + "learning_rate": 1.9999995178230726e-05, + "loss": 1.5513, + "step": 3898 + }, + { + "epoch": 0.09093691965470092, + "grad_norm": 1.521862506866455, + "learning_rate": 1.99999949277904e-05, + "loss": 1.4325, + "step": 3899 + }, + { + "epoch": 0.09096024279387885, + "grad_norm": 1.7939119338989258, + "learning_rate": 1.999999467100981e-05, + "loss": 1.6844, + "step": 3900 + }, + { + "epoch": 0.09098356593305676, + "grad_norm": 1.9809507131576538, + "learning_rate": 1.999999440788896e-05, + "loss": 1.4736, + "step": 3901 + }, + { + "epoch": 0.09100688907223468, + "grad_norm": 1.4991800785064697, + "learning_rate": 1.9999994138427855e-05, + "loss": 1.1604, + "step": 3902 + }, + { + "epoch": 0.0910302122114126, + "grad_norm": 1.4532808065414429, + "learning_rate": 1.999999386262649e-05, + "loss": 1.474, + "step": 3903 + }, + { + "epoch": 0.0910535353505905, + "grad_norm": 1.856518268585205, + "learning_rate": 1.9999993580484864e-05, + "loss": 1.4732, + "step": 3904 + }, + { + "epoch": 0.09107685848976843, + "grad_norm": 2.7852296829223633, + "learning_rate": 1.9999993292002983e-05, + "loss": 1.3989, + "step": 3905 + }, + { + "epoch": 0.09110018162894634, + "grad_norm": 2.3301925659179688, + "learning_rate": 1.9999992997180843e-05, + "loss": 1.0174, + "step": 3906 + }, + { + "epoch": 0.09112350476812427, + "grad_norm": 1.9908862113952637, + "learning_rate": 1.9999992696018445e-05, + "loss": 1.2663, + "step": 3907 + }, + { + "epoch": 0.09114682790730218, + "grad_norm": 1.4991986751556396, + "learning_rate": 1.9999992388515784e-05, + "loss": 1.3657, + "step": 3908 + }, + { + "epoch": 0.0911701510464801, + "grad_norm": 1.8717234134674072, + "learning_rate": 1.9999992074672873e-05, + "loss": 1.7159, + "step": 3909 + }, + { + "epoch": 0.09119347418565801, + "grad_norm": 1.4654630422592163, + "learning_rate": 1.99999917544897e-05, + "loss": 0.9469, + "step": 3910 + }, + { + "epoch": 0.09121679732483594, + "grad_norm": 1.6895606517791748, + "learning_rate": 1.999999142796627e-05, + "loss": 1.4659, + "step": 3911 + }, + { + "epoch": 0.09124012046401385, + "grad_norm": 1.6707404851913452, + "learning_rate": 1.9999991095102583e-05, + "loss": 1.2416, + "step": 3912 + }, + { + "epoch": 0.09126344360319177, + "grad_norm": 1.5567107200622559, + "learning_rate": 1.999999075589864e-05, + "loss": 1.5764, + "step": 3913 + }, + { + "epoch": 0.09128676674236968, + "grad_norm": 1.8884644508361816, + "learning_rate": 1.999999041035444e-05, + "loss": 1.5029, + "step": 3914 + }, + { + "epoch": 0.09131008988154761, + "grad_norm": 1.8918079137802124, + "learning_rate": 1.9999990058469984e-05, + "loss": 1.6129, + "step": 3915 + }, + { + "epoch": 0.09133341302072552, + "grad_norm": 1.551944375038147, + "learning_rate": 1.9999989700245273e-05, + "loss": 1.0586, + "step": 3916 + }, + { + "epoch": 0.09135673615990345, + "grad_norm": 1.844373345375061, + "learning_rate": 1.9999989335680304e-05, + "loss": 1.4028, + "step": 3917 + }, + { + "epoch": 0.09138005929908136, + "grad_norm": 1.399733543395996, + "learning_rate": 1.999998896477508e-05, + "loss": 1.34, + "step": 3918 + }, + { + "epoch": 0.09140338243825928, + "grad_norm": 2.2165145874023438, + "learning_rate": 1.9999988587529596e-05, + "loss": 1.3944, + "step": 3919 + }, + { + "epoch": 0.09142670557743719, + "grad_norm": 1.9752156734466553, + "learning_rate": 1.9999988203943862e-05, + "loss": 1.1461, + "step": 3920 + }, + { + "epoch": 0.09145002871661512, + "grad_norm": 2.117903470993042, + "learning_rate": 1.9999987814017872e-05, + "loss": 1.6446, + "step": 3921 + }, + { + "epoch": 0.09147335185579303, + "grad_norm": 1.4504482746124268, + "learning_rate": 1.9999987417751628e-05, + "loss": 1.3936, + "step": 3922 + }, + { + "epoch": 0.09149667499497095, + "grad_norm": 1.6209416389465332, + "learning_rate": 1.9999987015145128e-05, + "loss": 1.1825, + "step": 3923 + }, + { + "epoch": 0.09151999813414886, + "grad_norm": 1.6402407884597778, + "learning_rate": 1.9999986606198373e-05, + "loss": 1.5742, + "step": 3924 + }, + { + "epoch": 0.09154332127332679, + "grad_norm": 1.865707516670227, + "learning_rate": 1.9999986190911363e-05, + "loss": 1.1726, + "step": 3925 + }, + { + "epoch": 0.0915666444125047, + "grad_norm": 2.145291805267334, + "learning_rate": 1.9999985769284102e-05, + "loss": 1.3995, + "step": 3926 + }, + { + "epoch": 0.09158996755168262, + "grad_norm": 1.8508366346359253, + "learning_rate": 1.999998534131659e-05, + "loss": 1.5275, + "step": 3927 + }, + { + "epoch": 0.09161329069086054, + "grad_norm": 1.477274775505066, + "learning_rate": 1.999998490700882e-05, + "loss": 1.0495, + "step": 3928 + }, + { + "epoch": 0.09163661383003846, + "grad_norm": 1.945433259010315, + "learning_rate": 1.9999984466360797e-05, + "loss": 1.4533, + "step": 3929 + }, + { + "epoch": 0.09165993696921637, + "grad_norm": 1.7821992635726929, + "learning_rate": 1.9999984019372522e-05, + "loss": 1.3427, + "step": 3930 + }, + { + "epoch": 0.0916832601083943, + "grad_norm": 2.0035572052001953, + "learning_rate": 1.9999983566043996e-05, + "loss": 1.8269, + "step": 3931 + }, + { + "epoch": 0.09170658324757221, + "grad_norm": 1.5176634788513184, + "learning_rate": 1.9999983106375217e-05, + "loss": 1.3943, + "step": 3932 + }, + { + "epoch": 0.09172990638675012, + "grad_norm": 1.823038101196289, + "learning_rate": 1.9999982640366187e-05, + "loss": 1.1486, + "step": 3933 + }, + { + "epoch": 0.09175322952592804, + "grad_norm": 1.796398401260376, + "learning_rate": 1.9999982168016906e-05, + "loss": 1.4936, + "step": 3934 + }, + { + "epoch": 0.09177655266510595, + "grad_norm": 2.008840322494507, + "learning_rate": 1.999998168932737e-05, + "loss": 1.6116, + "step": 3935 + }, + { + "epoch": 0.09179987580428388, + "grad_norm": 1.9190891981124878, + "learning_rate": 1.9999981204297587e-05, + "loss": 1.7342, + "step": 3936 + }, + { + "epoch": 0.09182319894346179, + "grad_norm": 1.437469720840454, + "learning_rate": 1.999998071292755e-05, + "loss": 1.6174, + "step": 3937 + }, + { + "epoch": 0.09184652208263971, + "grad_norm": 1.8305333852767944, + "learning_rate": 1.9999980215217265e-05, + "loss": 1.8085, + "step": 3938 + }, + { + "epoch": 0.09186984522181763, + "grad_norm": 1.5656037330627441, + "learning_rate": 1.999997971116673e-05, + "loss": 1.2984, + "step": 3939 + }, + { + "epoch": 0.09189316836099555, + "grad_norm": 2.2326161861419678, + "learning_rate": 1.9999979200775947e-05, + "loss": 1.4892, + "step": 3940 + }, + { + "epoch": 0.09191649150017346, + "grad_norm": 1.7006014585494995, + "learning_rate": 1.999997868404491e-05, + "loss": 1.4769, + "step": 3941 + }, + { + "epoch": 0.09193981463935139, + "grad_norm": 2.0390775203704834, + "learning_rate": 1.999997816097363e-05, + "loss": 1.5616, + "step": 3942 + }, + { + "epoch": 0.0919631377785293, + "grad_norm": 2.844869613647461, + "learning_rate": 1.9999977631562096e-05, + "loss": 1.6326, + "step": 3943 + }, + { + "epoch": 0.09198646091770722, + "grad_norm": 1.5782520771026611, + "learning_rate": 1.9999977095810314e-05, + "loss": 1.3048, + "step": 3944 + }, + { + "epoch": 0.09200978405688513, + "grad_norm": 1.876190423965454, + "learning_rate": 1.9999976553718288e-05, + "loss": 1.5714, + "step": 3945 + }, + { + "epoch": 0.09203310719606306, + "grad_norm": 1.842055320739746, + "learning_rate": 1.9999976005286013e-05, + "loss": 1.293, + "step": 3946 + }, + { + "epoch": 0.09205643033524097, + "grad_norm": 1.8643383979797363, + "learning_rate": 1.9999975450513487e-05, + "loss": 1.5914, + "step": 3947 + }, + { + "epoch": 0.0920797534744189, + "grad_norm": 1.9277528524398804, + "learning_rate": 1.9999974889400716e-05, + "loss": 1.3995, + "step": 3948 + }, + { + "epoch": 0.0921030766135968, + "grad_norm": 1.9185326099395752, + "learning_rate": 1.99999743219477e-05, + "loss": 1.5797, + "step": 3949 + }, + { + "epoch": 0.09212639975277473, + "grad_norm": 1.56540048122406, + "learning_rate": 1.999997374815444e-05, + "loss": 1.283, + "step": 3950 + }, + { + "epoch": 0.09214972289195264, + "grad_norm": 1.7684359550476074, + "learning_rate": 1.9999973168020926e-05, + "loss": 1.383, + "step": 3951 + }, + { + "epoch": 0.09217304603113056, + "grad_norm": 2.0893876552581787, + "learning_rate": 1.9999972581547175e-05, + "loss": 1.1262, + "step": 3952 + }, + { + "epoch": 0.09219636917030848, + "grad_norm": 1.3322521448135376, + "learning_rate": 1.9999971988733173e-05, + "loss": 1.4106, + "step": 3953 + }, + { + "epoch": 0.0922196923094864, + "grad_norm": 1.9222743511199951, + "learning_rate": 1.999997138957893e-05, + "loss": 1.254, + "step": 3954 + }, + { + "epoch": 0.09224301544866431, + "grad_norm": 1.758832335472107, + "learning_rate": 1.999997078408444e-05, + "loss": 1.474, + "step": 3955 + }, + { + "epoch": 0.09226633858784224, + "grad_norm": 1.472639560699463, + "learning_rate": 1.9999970172249706e-05, + "loss": 1.2962, + "step": 3956 + }, + { + "epoch": 0.09228966172702015, + "grad_norm": 1.844859004020691, + "learning_rate": 1.999996955407473e-05, + "loss": 1.5245, + "step": 3957 + }, + { + "epoch": 0.09231298486619807, + "grad_norm": 1.6577290296554565, + "learning_rate": 1.999996892955951e-05, + "loss": 1.3443, + "step": 3958 + }, + { + "epoch": 0.09233630800537598, + "grad_norm": 2.016514539718628, + "learning_rate": 1.9999968298704045e-05, + "loss": 1.1707, + "step": 3959 + }, + { + "epoch": 0.09235963114455391, + "grad_norm": 2.0657806396484375, + "learning_rate": 1.9999967661508342e-05, + "loss": 1.2651, + "step": 3960 + }, + { + "epoch": 0.09238295428373182, + "grad_norm": 1.5270191431045532, + "learning_rate": 1.9999967017972394e-05, + "loss": 1.0796, + "step": 3961 + }, + { + "epoch": 0.09240627742290973, + "grad_norm": 1.3309963941574097, + "learning_rate": 1.9999966368096208e-05, + "loss": 1.4668, + "step": 3962 + }, + { + "epoch": 0.09242960056208765, + "grad_norm": 1.9748342037200928, + "learning_rate": 1.9999965711879773e-05, + "loss": 1.225, + "step": 3963 + }, + { + "epoch": 0.09245292370126557, + "grad_norm": 2.0958023071289062, + "learning_rate": 1.9999965049323104e-05, + "loss": 1.8684, + "step": 3964 + }, + { + "epoch": 0.09247624684044349, + "grad_norm": 1.841970443725586, + "learning_rate": 1.999996438042619e-05, + "loss": 1.5263, + "step": 3965 + }, + { + "epoch": 0.0924995699796214, + "grad_norm": 1.6390635967254639, + "learning_rate": 1.999996370518904e-05, + "loss": 1.1722, + "step": 3966 + }, + { + "epoch": 0.09252289311879933, + "grad_norm": 1.8602365255355835, + "learning_rate": 1.999996302361165e-05, + "loss": 1.4936, + "step": 3967 + }, + { + "epoch": 0.09254621625797724, + "grad_norm": 1.8336684703826904, + "learning_rate": 1.9999962335694022e-05, + "loss": 1.4065, + "step": 3968 + }, + { + "epoch": 0.09256953939715516, + "grad_norm": 2.072793960571289, + "learning_rate": 1.999996164143615e-05, + "loss": 1.7077, + "step": 3969 + }, + { + "epoch": 0.09259286253633307, + "grad_norm": 1.8812514543533325, + "learning_rate": 1.9999960940838047e-05, + "loss": 1.3055, + "step": 3970 + }, + { + "epoch": 0.092616185675511, + "grad_norm": 1.9260146617889404, + "learning_rate": 1.99999602338997e-05, + "loss": 1.4673, + "step": 3971 + }, + { + "epoch": 0.09263950881468891, + "grad_norm": 1.9745763540267944, + "learning_rate": 1.9999959520621116e-05, + "loss": 1.3011, + "step": 3972 + }, + { + "epoch": 0.09266283195386683, + "grad_norm": 1.6949586868286133, + "learning_rate": 1.99999588010023e-05, + "loss": 1.4868, + "step": 3973 + }, + { + "epoch": 0.09268615509304474, + "grad_norm": 1.3992211818695068, + "learning_rate": 1.9999958075043243e-05, + "loss": 1.4782, + "step": 3974 + }, + { + "epoch": 0.09270947823222267, + "grad_norm": 1.4269651174545288, + "learning_rate": 1.9999957342743955e-05, + "loss": 1.3966, + "step": 3975 + }, + { + "epoch": 0.09273280137140058, + "grad_norm": 2.057020902633667, + "learning_rate": 1.9999956604104428e-05, + "loss": 1.5686, + "step": 3976 + }, + { + "epoch": 0.0927561245105785, + "grad_norm": 1.815314769744873, + "learning_rate": 1.9999955859124663e-05, + "loss": 1.7073, + "step": 3977 + }, + { + "epoch": 0.09277944764975642, + "grad_norm": 1.6862220764160156, + "learning_rate": 1.999995510780467e-05, + "loss": 1.4345, + "step": 3978 + }, + { + "epoch": 0.09280277078893434, + "grad_norm": 2.4775919914245605, + "learning_rate": 1.999995435014444e-05, + "loss": 1.4523, + "step": 3979 + }, + { + "epoch": 0.09282609392811225, + "grad_norm": 2.2759830951690674, + "learning_rate": 1.9999953586143977e-05, + "loss": 1.6044, + "step": 3980 + }, + { + "epoch": 0.09284941706729018, + "grad_norm": 2.0369324684143066, + "learning_rate": 1.999995281580328e-05, + "loss": 1.2325, + "step": 3981 + }, + { + "epoch": 0.09287274020646809, + "grad_norm": 1.6006555557250977, + "learning_rate": 1.9999952039122348e-05, + "loss": 1.0778, + "step": 3982 + }, + { + "epoch": 0.09289606334564601, + "grad_norm": 1.7829402685165405, + "learning_rate": 1.9999951256101185e-05, + "loss": 1.3098, + "step": 3983 + }, + { + "epoch": 0.09291938648482392, + "grad_norm": 1.9090176820755005, + "learning_rate": 1.9999950466739794e-05, + "loss": 1.26, + "step": 3984 + }, + { + "epoch": 0.09294270962400185, + "grad_norm": 1.1756659746170044, + "learning_rate": 1.9999949671038168e-05, + "loss": 1.1115, + "step": 3985 + }, + { + "epoch": 0.09296603276317976, + "grad_norm": 1.5126562118530273, + "learning_rate": 1.9999948868996314e-05, + "loss": 1.1954, + "step": 3986 + }, + { + "epoch": 0.09298935590235768, + "grad_norm": 1.5372440814971924, + "learning_rate": 1.9999948060614226e-05, + "loss": 1.432, + "step": 3987 + }, + { + "epoch": 0.0930126790415356, + "grad_norm": 2.036574363708496, + "learning_rate": 1.9999947245891913e-05, + "loss": 1.6046, + "step": 3988 + }, + { + "epoch": 0.0930360021807135, + "grad_norm": 1.7800477743148804, + "learning_rate": 1.999994642482937e-05, + "loss": 1.1686, + "step": 3989 + }, + { + "epoch": 0.09305932531989143, + "grad_norm": 1.939978837966919, + "learning_rate": 1.9999945597426596e-05, + "loss": 1.524, + "step": 3990 + }, + { + "epoch": 0.09308264845906934, + "grad_norm": 3.610623598098755, + "learning_rate": 1.9999944763683596e-05, + "loss": 1.8123, + "step": 3991 + }, + { + "epoch": 0.09310597159824727, + "grad_norm": 1.878239393234253, + "learning_rate": 1.9999943923600368e-05, + "loss": 1.6922, + "step": 3992 + }, + { + "epoch": 0.09312929473742518, + "grad_norm": 1.7238104343414307, + "learning_rate": 1.9999943077176915e-05, + "loss": 1.4637, + "step": 3993 + }, + { + "epoch": 0.0931526178766031, + "grad_norm": 1.890332579612732, + "learning_rate": 1.999994222441323e-05, + "loss": 1.5306, + "step": 3994 + }, + { + "epoch": 0.09317594101578101, + "grad_norm": 1.9238927364349365, + "learning_rate": 1.9999941365309322e-05, + "loss": 1.4932, + "step": 3995 + }, + { + "epoch": 0.09319926415495894, + "grad_norm": 5.232817649841309, + "learning_rate": 1.9999940499865192e-05, + "loss": 1.466, + "step": 3996 + }, + { + "epoch": 0.09322258729413685, + "grad_norm": 1.527990460395813, + "learning_rate": 1.9999939628080834e-05, + "loss": 1.4395, + "step": 3997 + }, + { + "epoch": 0.09324591043331477, + "grad_norm": 1.881361961364746, + "learning_rate": 1.999993874995625e-05, + "loss": 1.4053, + "step": 3998 + }, + { + "epoch": 0.09326923357249269, + "grad_norm": 1.9249119758605957, + "learning_rate": 1.9999937865491444e-05, + "loss": 1.92, + "step": 3999 + }, + { + "epoch": 0.09329255671167061, + "grad_norm": 1.9354079961776733, + "learning_rate": 1.9999936974686416e-05, + "loss": 1.1394, + "step": 4000 + }, + { + "epoch": 0.09331587985084852, + "grad_norm": 2.135955810546875, + "learning_rate": 1.9999936077541163e-05, + "loss": 1.5347, + "step": 4001 + }, + { + "epoch": 0.09333920299002645, + "grad_norm": 1.5951508283615112, + "learning_rate": 1.9999935174055693e-05, + "loss": 1.4595, + "step": 4002 + }, + { + "epoch": 0.09336252612920436, + "grad_norm": 1.7067939043045044, + "learning_rate": 1.9999934264229998e-05, + "loss": 1.8495, + "step": 4003 + }, + { + "epoch": 0.09338584926838228, + "grad_norm": 1.8007570505142212, + "learning_rate": 1.999993334806408e-05, + "loss": 1.1062, + "step": 4004 + }, + { + "epoch": 0.09340917240756019, + "grad_norm": 1.5014314651489258, + "learning_rate": 1.9999932425557947e-05, + "loss": 1.2173, + "step": 4005 + }, + { + "epoch": 0.09343249554673812, + "grad_norm": 1.7208422422409058, + "learning_rate": 1.9999931496711592e-05, + "loss": 1.5016, + "step": 4006 + }, + { + "epoch": 0.09345581868591603, + "grad_norm": 2.022439479827881, + "learning_rate": 1.9999930561525016e-05, + "loss": 1.4124, + "step": 4007 + }, + { + "epoch": 0.09347914182509395, + "grad_norm": 1.6925363540649414, + "learning_rate": 1.999992961999822e-05, + "loss": 1.1304, + "step": 4008 + }, + { + "epoch": 0.09350246496427186, + "grad_norm": 1.7039397954940796, + "learning_rate": 1.999992867213121e-05, + "loss": 1.4684, + "step": 4009 + }, + { + "epoch": 0.09352578810344979, + "grad_norm": 1.7603716850280762, + "learning_rate": 1.9999927717923983e-05, + "loss": 1.3615, + "step": 4010 + }, + { + "epoch": 0.0935491112426277, + "grad_norm": 1.491625189781189, + "learning_rate": 1.999992675737654e-05, + "loss": 1.3722, + "step": 4011 + }, + { + "epoch": 0.09357243438180562, + "grad_norm": 3.466465473175049, + "learning_rate": 1.9999925790488873e-05, + "loss": 1.1393, + "step": 4012 + }, + { + "epoch": 0.09359575752098354, + "grad_norm": 1.9502816200256348, + "learning_rate": 1.9999924817261e-05, + "loss": 1.2407, + "step": 4013 + }, + { + "epoch": 0.09361908066016146, + "grad_norm": 1.3467035293579102, + "learning_rate": 1.9999923837692906e-05, + "loss": 1.3731, + "step": 4014 + }, + { + "epoch": 0.09364240379933937, + "grad_norm": 1.6433124542236328, + "learning_rate": 1.99999228517846e-05, + "loss": 1.0856, + "step": 4015 + }, + { + "epoch": 0.0936657269385173, + "grad_norm": 1.6822779178619385, + "learning_rate": 1.999992185953608e-05, + "loss": 1.3971, + "step": 4016 + }, + { + "epoch": 0.09368905007769521, + "grad_norm": 2.0529069900512695, + "learning_rate": 1.9999920860947347e-05, + "loss": 1.4433, + "step": 4017 + }, + { + "epoch": 0.09371237321687312, + "grad_norm": 1.6150270700454712, + "learning_rate": 1.9999919856018405e-05, + "loss": 1.2959, + "step": 4018 + }, + { + "epoch": 0.09373569635605104, + "grad_norm": 2.2174623012542725, + "learning_rate": 1.9999918844749245e-05, + "loss": 1.2943, + "step": 4019 + }, + { + "epoch": 0.09375901949522895, + "grad_norm": 2.1846296787261963, + "learning_rate": 1.999991782713988e-05, + "loss": 1.2629, + "step": 4020 + }, + { + "epoch": 0.09378234263440688, + "grad_norm": 1.459313154220581, + "learning_rate": 1.9999916803190304e-05, + "loss": 1.5141, + "step": 4021 + }, + { + "epoch": 0.09380566577358479, + "grad_norm": 2.6233582496643066, + "learning_rate": 1.9999915772900514e-05, + "loss": 1.6312, + "step": 4022 + }, + { + "epoch": 0.09382898891276271, + "grad_norm": 1.8272534608840942, + "learning_rate": 1.9999914736270518e-05, + "loss": 1.2389, + "step": 4023 + }, + { + "epoch": 0.09385231205194063, + "grad_norm": 1.7140253782272339, + "learning_rate": 1.9999913693300313e-05, + "loss": 1.2665, + "step": 4024 + }, + { + "epoch": 0.09387563519111855, + "grad_norm": 1.9143249988555908, + "learning_rate": 1.9999912643989898e-05, + "loss": 1.6875, + "step": 4025 + }, + { + "epoch": 0.09389895833029646, + "grad_norm": 1.8950088024139404, + "learning_rate": 1.9999911588339275e-05, + "loss": 1.651, + "step": 4026 + }, + { + "epoch": 0.09392228146947439, + "grad_norm": 1.800822138786316, + "learning_rate": 1.999991052634845e-05, + "loss": 0.9556, + "step": 4027 + }, + { + "epoch": 0.0939456046086523, + "grad_norm": 1.5436400175094604, + "learning_rate": 1.9999909458017416e-05, + "loss": 1.5147, + "step": 4028 + }, + { + "epoch": 0.09396892774783022, + "grad_norm": 1.5622018575668335, + "learning_rate": 1.9999908383346177e-05, + "loss": 1.1669, + "step": 4029 + }, + { + "epoch": 0.09399225088700813, + "grad_norm": 1.8378404378890991, + "learning_rate": 1.9999907302334737e-05, + "loss": 0.9293, + "step": 4030 + }, + { + "epoch": 0.09401557402618606, + "grad_norm": 1.7259092330932617, + "learning_rate": 1.999990621498309e-05, + "loss": 1.8631, + "step": 4031 + }, + { + "epoch": 0.09403889716536397, + "grad_norm": 1.811070203781128, + "learning_rate": 1.999990512129124e-05, + "loss": 1.5787, + "step": 4032 + }, + { + "epoch": 0.0940622203045419, + "grad_norm": 1.6709043979644775, + "learning_rate": 1.9999904021259188e-05, + "loss": 1.3278, + "step": 4033 + }, + { + "epoch": 0.0940855434437198, + "grad_norm": 1.8301382064819336, + "learning_rate": 1.9999902914886937e-05, + "loss": 1.5662, + "step": 4034 + }, + { + "epoch": 0.09410886658289773, + "grad_norm": 1.7583134174346924, + "learning_rate": 1.9999901802174483e-05, + "loss": 1.2864, + "step": 4035 + }, + { + "epoch": 0.09413218972207564, + "grad_norm": 3.2991154193878174, + "learning_rate": 1.9999900683121828e-05, + "loss": 1.4025, + "step": 4036 + }, + { + "epoch": 0.09415551286125357, + "grad_norm": 1.5830022096633911, + "learning_rate": 1.999989955772897e-05, + "loss": 1.4977, + "step": 4037 + }, + { + "epoch": 0.09417883600043148, + "grad_norm": 1.7454510927200317, + "learning_rate": 1.9999898425995918e-05, + "loss": 1.8235, + "step": 4038 + }, + { + "epoch": 0.0942021591396094, + "grad_norm": 1.6297307014465332, + "learning_rate": 1.999989728792267e-05, + "loss": 1.5289, + "step": 4039 + }, + { + "epoch": 0.09422548227878731, + "grad_norm": 2.0267040729522705, + "learning_rate": 1.999989614350922e-05, + "loss": 1.6013, + "step": 4040 + }, + { + "epoch": 0.09424880541796524, + "grad_norm": 1.4703158140182495, + "learning_rate": 1.9999894992755572e-05, + "loss": 1.5303, + "step": 4041 + }, + { + "epoch": 0.09427212855714315, + "grad_norm": 1.6133272647857666, + "learning_rate": 1.9999893835661736e-05, + "loss": 1.1337, + "step": 4042 + }, + { + "epoch": 0.09429545169632107, + "grad_norm": 1.936994194984436, + "learning_rate": 1.9999892672227698e-05, + "loss": 1.2797, + "step": 4043 + }, + { + "epoch": 0.09431877483549898, + "grad_norm": 1.6114977598190308, + "learning_rate": 1.9999891502453466e-05, + "loss": 1.5541, + "step": 4044 + }, + { + "epoch": 0.09434209797467691, + "grad_norm": 1.9083713293075562, + "learning_rate": 1.999989032633904e-05, + "loss": 1.6397, + "step": 4045 + }, + { + "epoch": 0.09436542111385482, + "grad_norm": 1.7421109676361084, + "learning_rate": 1.9999889143884424e-05, + "loss": 1.1853, + "step": 4046 + }, + { + "epoch": 0.09438874425303273, + "grad_norm": 1.8391941785812378, + "learning_rate": 1.999988795508961e-05, + "loss": 1.3414, + "step": 4047 + }, + { + "epoch": 0.09441206739221066, + "grad_norm": 1.8088597059249878, + "learning_rate": 1.9999886759954613e-05, + "loss": 1.3833, + "step": 4048 + }, + { + "epoch": 0.09443539053138857, + "grad_norm": 1.6824920177459717, + "learning_rate": 1.999988555847942e-05, + "loss": 1.2919, + "step": 4049 + }, + { + "epoch": 0.09445871367056649, + "grad_norm": 1.9679538011550903, + "learning_rate": 1.9999884350664035e-05, + "loss": 1.3646, + "step": 4050 + }, + { + "epoch": 0.0944820368097444, + "grad_norm": 1.8130271434783936, + "learning_rate": 1.9999883136508465e-05, + "loss": 1.2788, + "step": 4051 + }, + { + "epoch": 0.09450535994892233, + "grad_norm": 1.3992571830749512, + "learning_rate": 1.9999881916012708e-05, + "loss": 1.3074, + "step": 4052 + }, + { + "epoch": 0.09452868308810024, + "grad_norm": 1.8113707304000854, + "learning_rate": 1.999988068917676e-05, + "loss": 1.3189, + "step": 4053 + }, + { + "epoch": 0.09455200622727816, + "grad_norm": 2.156859874725342, + "learning_rate": 1.9999879456000625e-05, + "loss": 1.5443, + "step": 4054 + }, + { + "epoch": 0.09457532936645607, + "grad_norm": 1.979003667831421, + "learning_rate": 1.9999878216484302e-05, + "loss": 1.5466, + "step": 4055 + }, + { + "epoch": 0.094598652505634, + "grad_norm": 1.953880786895752, + "learning_rate": 1.9999876970627796e-05, + "loss": 1.4766, + "step": 4056 + }, + { + "epoch": 0.09462197564481191, + "grad_norm": 1.613236427307129, + "learning_rate": 1.999987571843111e-05, + "loss": 1.642, + "step": 4057 + }, + { + "epoch": 0.09464529878398983, + "grad_norm": 1.4525372982025146, + "learning_rate": 1.999987445989423e-05, + "loss": 1.4696, + "step": 4058 + }, + { + "epoch": 0.09466862192316775, + "grad_norm": 1.4650002717971802, + "learning_rate": 1.9999873195017177e-05, + "loss": 1.4614, + "step": 4059 + }, + { + "epoch": 0.09469194506234567, + "grad_norm": 1.5875036716461182, + "learning_rate": 1.9999871923799935e-05, + "loss": 0.8413, + "step": 4060 + }, + { + "epoch": 0.09471526820152358, + "grad_norm": 1.8234145641326904, + "learning_rate": 1.9999870646242512e-05, + "loss": 1.6689, + "step": 4061 + }, + { + "epoch": 0.0947385913407015, + "grad_norm": 1.4357320070266724, + "learning_rate": 1.9999869362344916e-05, + "loss": 1.04, + "step": 4062 + }, + { + "epoch": 0.09476191447987942, + "grad_norm": 1.530616283416748, + "learning_rate": 1.9999868072107133e-05, + "loss": 1.3837, + "step": 4063 + }, + { + "epoch": 0.09478523761905734, + "grad_norm": 2.2008919715881348, + "learning_rate": 1.9999866775529172e-05, + "loss": 1.3126, + "step": 4064 + }, + { + "epoch": 0.09480856075823525, + "grad_norm": 1.8661929368972778, + "learning_rate": 1.9999865472611034e-05, + "loss": 1.4651, + "step": 4065 + }, + { + "epoch": 0.09483188389741318, + "grad_norm": 1.8432332277297974, + "learning_rate": 1.999986416335272e-05, + "loss": 1.6874, + "step": 4066 + }, + { + "epoch": 0.09485520703659109, + "grad_norm": 1.8919012546539307, + "learning_rate": 1.999986284775423e-05, + "loss": 1.3761, + "step": 4067 + }, + { + "epoch": 0.09487853017576901, + "grad_norm": 1.7640912532806396, + "learning_rate": 1.999986152581556e-05, + "loss": 1.5106, + "step": 4068 + }, + { + "epoch": 0.09490185331494692, + "grad_norm": 1.3458236455917358, + "learning_rate": 1.999986019753672e-05, + "loss": 1.266, + "step": 4069 + }, + { + "epoch": 0.09492517645412485, + "grad_norm": 2.0958406925201416, + "learning_rate": 1.9999858862917705e-05, + "loss": 1.3424, + "step": 4070 + }, + { + "epoch": 0.09494849959330276, + "grad_norm": 1.7863316535949707, + "learning_rate": 1.9999857521958518e-05, + "loss": 1.5416, + "step": 4071 + }, + { + "epoch": 0.09497182273248068, + "grad_norm": 1.6585896015167236, + "learning_rate": 1.9999856174659157e-05, + "loss": 1.26, + "step": 4072 + }, + { + "epoch": 0.0949951458716586, + "grad_norm": 2.302985429763794, + "learning_rate": 1.9999854821019622e-05, + "loss": 1.2932, + "step": 4073 + }, + { + "epoch": 0.09501846901083652, + "grad_norm": 1.7254619598388672, + "learning_rate": 1.999985346103992e-05, + "loss": 1.2682, + "step": 4074 + }, + { + "epoch": 0.09504179215001443, + "grad_norm": 2.1413419246673584, + "learning_rate": 1.999985209472005e-05, + "loss": 1.38, + "step": 4075 + }, + { + "epoch": 0.09506511528919234, + "grad_norm": 1.833171010017395, + "learning_rate": 1.9999850722060007e-05, + "loss": 1.5078, + "step": 4076 + }, + { + "epoch": 0.09508843842837027, + "grad_norm": 1.5397589206695557, + "learning_rate": 1.9999849343059797e-05, + "loss": 1.3753, + "step": 4077 + }, + { + "epoch": 0.09511176156754818, + "grad_norm": 1.693420171737671, + "learning_rate": 1.999984795771942e-05, + "loss": 1.3732, + "step": 4078 + }, + { + "epoch": 0.0951350847067261, + "grad_norm": 1.6689990758895874, + "learning_rate": 1.9999846566038878e-05, + "loss": 1.2823, + "step": 4079 + }, + { + "epoch": 0.09515840784590401, + "grad_norm": 1.8790488243103027, + "learning_rate": 1.999984516801817e-05, + "loss": 1.3091, + "step": 4080 + }, + { + "epoch": 0.09518173098508194, + "grad_norm": 1.65789794921875, + "learning_rate": 1.9999843763657298e-05, + "loss": 1.4561, + "step": 4081 + }, + { + "epoch": 0.09520505412425985, + "grad_norm": 2.2451510429382324, + "learning_rate": 1.9999842352956264e-05, + "loss": 0.8032, + "step": 4082 + }, + { + "epoch": 0.09522837726343777, + "grad_norm": 1.5158308744430542, + "learning_rate": 1.9999840935915066e-05, + "loss": 1.3509, + "step": 4083 + }, + { + "epoch": 0.09525170040261569, + "grad_norm": 2.0572614669799805, + "learning_rate": 1.999983951253371e-05, + "loss": 0.9884, + "step": 4084 + }, + { + "epoch": 0.09527502354179361, + "grad_norm": 1.6487854719161987, + "learning_rate": 1.9999838082812187e-05, + "loss": 1.7546, + "step": 4085 + }, + { + "epoch": 0.09529834668097152, + "grad_norm": 2.0137455463409424, + "learning_rate": 1.9999836646750506e-05, + "loss": 1.3536, + "step": 4086 + }, + { + "epoch": 0.09532166982014945, + "grad_norm": 1.6713207960128784, + "learning_rate": 1.9999835204348667e-05, + "loss": 1.18, + "step": 4087 + }, + { + "epoch": 0.09534499295932736, + "grad_norm": 1.3659164905548096, + "learning_rate": 1.999983375560667e-05, + "loss": 1.228, + "step": 4088 + }, + { + "epoch": 0.09536831609850528, + "grad_norm": 2.0661914348602295, + "learning_rate": 1.9999832300524513e-05, + "loss": 1.5949, + "step": 4089 + }, + { + "epoch": 0.09539163923768319, + "grad_norm": 1.5829130411148071, + "learning_rate": 1.9999830839102205e-05, + "loss": 1.6185, + "step": 4090 + }, + { + "epoch": 0.09541496237686112, + "grad_norm": 1.7367058992385864, + "learning_rate": 1.999982937133974e-05, + "loss": 1.3331, + "step": 4091 + }, + { + "epoch": 0.09543828551603903, + "grad_norm": 1.6864798069000244, + "learning_rate": 1.9999827897237117e-05, + "loss": 1.4409, + "step": 4092 + }, + { + "epoch": 0.09546160865521695, + "grad_norm": 1.5921106338500977, + "learning_rate": 1.9999826416794345e-05, + "loss": 1.3799, + "step": 4093 + }, + { + "epoch": 0.09548493179439486, + "grad_norm": 2.0874788761138916, + "learning_rate": 1.999982493001142e-05, + "loss": 1.2227, + "step": 4094 + }, + { + "epoch": 0.09550825493357279, + "grad_norm": 2.1337532997131348, + "learning_rate": 1.999982343688834e-05, + "loss": 1.4877, + "step": 4095 + }, + { + "epoch": 0.0955315780727507, + "grad_norm": 2.3292899131774902, + "learning_rate": 1.9999821937425114e-05, + "loss": 1.5423, + "step": 4096 + }, + { + "epoch": 0.09555490121192863, + "grad_norm": 1.8624464273452759, + "learning_rate": 1.999982043162173e-05, + "loss": 1.2917, + "step": 4097 + }, + { + "epoch": 0.09557822435110654, + "grad_norm": 2.0876848697662354, + "learning_rate": 1.9999818919478205e-05, + "loss": 1.0749, + "step": 4098 + }, + { + "epoch": 0.09560154749028446, + "grad_norm": 1.882411241531372, + "learning_rate": 1.9999817400994533e-05, + "loss": 1.4965, + "step": 4099 + }, + { + "epoch": 0.09562487062946237, + "grad_norm": 1.6623929738998413, + "learning_rate": 1.999981587617071e-05, + "loss": 1.1074, + "step": 4100 + }, + { + "epoch": 0.0956481937686403, + "grad_norm": 1.3498985767364502, + "learning_rate": 1.9999814345006744e-05, + "loss": 1.1718, + "step": 4101 + }, + { + "epoch": 0.09567151690781821, + "grad_norm": 2.064556121826172, + "learning_rate": 1.9999812807502632e-05, + "loss": 1.4264, + "step": 4102 + }, + { + "epoch": 0.09569484004699612, + "grad_norm": 1.7603230476379395, + "learning_rate": 1.9999811263658376e-05, + "loss": 1.542, + "step": 4103 + }, + { + "epoch": 0.09571816318617404, + "grad_norm": 1.6809413433074951, + "learning_rate": 1.9999809713473977e-05, + "loss": 1.4546, + "step": 4104 + }, + { + "epoch": 0.09574148632535195, + "grad_norm": 1.659028172492981, + "learning_rate": 1.9999808156949435e-05, + "loss": 1.3343, + "step": 4105 + }, + { + "epoch": 0.09576480946452988, + "grad_norm": 1.9118539094924927, + "learning_rate": 1.9999806594084753e-05, + "loss": 1.6991, + "step": 4106 + }, + { + "epoch": 0.09578813260370779, + "grad_norm": 1.984724521636963, + "learning_rate": 1.9999805024879934e-05, + "loss": 1.5914, + "step": 4107 + }, + { + "epoch": 0.09581145574288571, + "grad_norm": 1.6731042861938477, + "learning_rate": 1.9999803449334972e-05, + "loss": 1.5506, + "step": 4108 + }, + { + "epoch": 0.09583477888206363, + "grad_norm": 1.8623292446136475, + "learning_rate": 1.9999801867449874e-05, + "loss": 1.3468, + "step": 4109 + }, + { + "epoch": 0.09585810202124155, + "grad_norm": 1.2610914707183838, + "learning_rate": 1.999980027922464e-05, + "loss": 1.2062, + "step": 4110 + }, + { + "epoch": 0.09588142516041946, + "grad_norm": 1.6934281587600708, + "learning_rate": 1.999979868465927e-05, + "loss": 1.3682, + "step": 4111 + }, + { + "epoch": 0.09590474829959739, + "grad_norm": 1.9726686477661133, + "learning_rate": 1.9999797083753764e-05, + "loss": 1.371, + "step": 4112 + }, + { + "epoch": 0.0959280714387753, + "grad_norm": 1.8796629905700684, + "learning_rate": 1.9999795476508123e-05, + "loss": 1.4994, + "step": 4113 + }, + { + "epoch": 0.09595139457795322, + "grad_norm": 1.68529212474823, + "learning_rate": 1.9999793862922352e-05, + "loss": 1.465, + "step": 4114 + }, + { + "epoch": 0.09597471771713113, + "grad_norm": 1.8680670261383057, + "learning_rate": 1.999979224299645e-05, + "loss": 1.6986, + "step": 4115 + }, + { + "epoch": 0.09599804085630906, + "grad_norm": 1.9507824182510376, + "learning_rate": 1.9999790616730412e-05, + "loss": 1.5233, + "step": 4116 + }, + { + "epoch": 0.09602136399548697, + "grad_norm": 1.6027692556381226, + "learning_rate": 1.9999788984124246e-05, + "loss": 1.5378, + "step": 4117 + }, + { + "epoch": 0.0960446871346649, + "grad_norm": 2.4456326961517334, + "learning_rate": 1.9999787345177953e-05, + "loss": 1.3405, + "step": 4118 + }, + { + "epoch": 0.0960680102738428, + "grad_norm": 1.7938956022262573, + "learning_rate": 1.999978569989153e-05, + "loss": 1.6514, + "step": 4119 + }, + { + "epoch": 0.09609133341302073, + "grad_norm": 1.6841145753860474, + "learning_rate": 1.999978404826498e-05, + "loss": 1.4499, + "step": 4120 + }, + { + "epoch": 0.09611465655219864, + "grad_norm": 1.6563725471496582, + "learning_rate": 1.9999782390298307e-05, + "loss": 1.7497, + "step": 4121 + }, + { + "epoch": 0.09613797969137657, + "grad_norm": 1.7308846712112427, + "learning_rate": 1.9999780725991512e-05, + "loss": 1.3619, + "step": 4122 + }, + { + "epoch": 0.09616130283055448, + "grad_norm": 2.038050413131714, + "learning_rate": 1.999977905534459e-05, + "loss": 1.4866, + "step": 4123 + }, + { + "epoch": 0.0961846259697324, + "grad_norm": 1.775932788848877, + "learning_rate": 1.9999777378357544e-05, + "loss": 1.5068, + "step": 4124 + }, + { + "epoch": 0.09620794910891031, + "grad_norm": 1.7475711107254028, + "learning_rate": 1.999977569503038e-05, + "loss": 1.2989, + "step": 4125 + }, + { + "epoch": 0.09623127224808824, + "grad_norm": 1.4972944259643555, + "learning_rate": 1.9999774005363094e-05, + "loss": 1.2221, + "step": 4126 + }, + { + "epoch": 0.09625459538726615, + "grad_norm": 1.9756770133972168, + "learning_rate": 1.999977230935569e-05, + "loss": 1.6019, + "step": 4127 + }, + { + "epoch": 0.09627791852644407, + "grad_norm": 1.3822906017303467, + "learning_rate": 1.9999770607008163e-05, + "loss": 1.4286, + "step": 4128 + }, + { + "epoch": 0.09630124166562198, + "grad_norm": 1.4641766548156738, + "learning_rate": 1.9999768898320525e-05, + "loss": 1.518, + "step": 4129 + }, + { + "epoch": 0.09632456480479991, + "grad_norm": 1.973097562789917, + "learning_rate": 1.9999767183292767e-05, + "loss": 1.4911, + "step": 4130 + }, + { + "epoch": 0.09634788794397782, + "grad_norm": 1.9415779113769531, + "learning_rate": 1.9999765461924893e-05, + "loss": 1.5681, + "step": 4131 + }, + { + "epoch": 0.09637121108315573, + "grad_norm": 1.7455964088439941, + "learning_rate": 1.9999763734216907e-05, + "loss": 1.8105, + "step": 4132 + }, + { + "epoch": 0.09639453422233366, + "grad_norm": 1.4776321649551392, + "learning_rate": 1.999976200016881e-05, + "loss": 1.4157, + "step": 4133 + }, + { + "epoch": 0.09641785736151157, + "grad_norm": 1.9864835739135742, + "learning_rate": 1.9999760259780602e-05, + "loss": 1.5405, + "step": 4134 + }, + { + "epoch": 0.09644118050068949, + "grad_norm": 1.9087644815444946, + "learning_rate": 1.999975851305228e-05, + "loss": 1.1071, + "step": 4135 + }, + { + "epoch": 0.0964645036398674, + "grad_norm": 1.7016524076461792, + "learning_rate": 1.999975675998385e-05, + "loss": 1.6047, + "step": 4136 + }, + { + "epoch": 0.09648782677904533, + "grad_norm": 1.7450127601623535, + "learning_rate": 1.9999755000575313e-05, + "loss": 1.7406, + "step": 4137 + }, + { + "epoch": 0.09651114991822324, + "grad_norm": 1.5106456279754639, + "learning_rate": 1.9999753234826667e-05, + "loss": 1.1769, + "step": 4138 + }, + { + "epoch": 0.09653447305740116, + "grad_norm": 1.538486361503601, + "learning_rate": 1.9999751462737915e-05, + "loss": 1.4457, + "step": 4139 + }, + { + "epoch": 0.09655779619657907, + "grad_norm": 1.83707594871521, + "learning_rate": 1.999974968430906e-05, + "loss": 1.8298, + "step": 4140 + }, + { + "epoch": 0.096581119335757, + "grad_norm": 1.85380220413208, + "learning_rate": 1.99997478995401e-05, + "loss": 1.5184, + "step": 4141 + }, + { + "epoch": 0.09660444247493491, + "grad_norm": 1.5038057565689087, + "learning_rate": 1.9999746108431036e-05, + "loss": 1.0325, + "step": 4142 + }, + { + "epoch": 0.09662776561411283, + "grad_norm": 1.5708307027816772, + "learning_rate": 1.9999744310981873e-05, + "loss": 1.0565, + "step": 4143 + }, + { + "epoch": 0.09665108875329075, + "grad_norm": 1.6708171367645264, + "learning_rate": 1.9999742507192608e-05, + "loss": 1.5968, + "step": 4144 + }, + { + "epoch": 0.09667441189246867, + "grad_norm": 2.186654806137085, + "learning_rate": 1.9999740697063244e-05, + "loss": 1.5134, + "step": 4145 + }, + { + "epoch": 0.09669773503164658, + "grad_norm": 2.057157039642334, + "learning_rate": 1.9999738880593783e-05, + "loss": 1.5829, + "step": 4146 + }, + { + "epoch": 0.0967210581708245, + "grad_norm": 2.131748914718628, + "learning_rate": 1.9999737057784224e-05, + "loss": 1.6146, + "step": 4147 + }, + { + "epoch": 0.09674438131000242, + "grad_norm": 1.6583216190338135, + "learning_rate": 1.9999735228634565e-05, + "loss": 1.3262, + "step": 4148 + }, + { + "epoch": 0.09676770444918034, + "grad_norm": 1.5584999322891235, + "learning_rate": 1.9999733393144818e-05, + "loss": 1.4176, + "step": 4149 + }, + { + "epoch": 0.09679102758835825, + "grad_norm": 1.7254819869995117, + "learning_rate": 1.9999731551314974e-05, + "loss": 1.3999, + "step": 4150 + }, + { + "epoch": 0.09681435072753618, + "grad_norm": 1.5138722658157349, + "learning_rate": 1.9999729703145038e-05, + "loss": 1.373, + "step": 4151 + }, + { + "epoch": 0.09683767386671409, + "grad_norm": 1.4465757608413696, + "learning_rate": 1.999972784863501e-05, + "loss": 1.2654, + "step": 4152 + }, + { + "epoch": 0.09686099700589201, + "grad_norm": 1.792013168334961, + "learning_rate": 1.9999725987784896e-05, + "loss": 1.4372, + "step": 4153 + }, + { + "epoch": 0.09688432014506992, + "grad_norm": 2.385373115539551, + "learning_rate": 1.999972412059469e-05, + "loss": 1.2813, + "step": 4154 + }, + { + "epoch": 0.09690764328424785, + "grad_norm": 1.9845149517059326, + "learning_rate": 1.9999722247064394e-05, + "loss": 1.6797, + "step": 4155 + }, + { + "epoch": 0.09693096642342576, + "grad_norm": 1.7642122507095337, + "learning_rate": 1.9999720367194016e-05, + "loss": 1.3114, + "step": 4156 + }, + { + "epoch": 0.09695428956260368, + "grad_norm": 1.6494389772415161, + "learning_rate": 1.9999718480983553e-05, + "loss": 1.7949, + "step": 4157 + }, + { + "epoch": 0.0969776127017816, + "grad_norm": 2.112375259399414, + "learning_rate": 1.9999716588433004e-05, + "loss": 1.6229, + "step": 4158 + }, + { + "epoch": 0.09700093584095952, + "grad_norm": 1.6811909675598145, + "learning_rate": 1.9999714689542373e-05, + "loss": 1.1249, + "step": 4159 + }, + { + "epoch": 0.09702425898013743, + "grad_norm": 2.01330304145813, + "learning_rate": 1.999971278431166e-05, + "loss": 1.25, + "step": 4160 + }, + { + "epoch": 0.09704758211931534, + "grad_norm": 1.8410260677337646, + "learning_rate": 1.9999710872740863e-05, + "loss": 1.6418, + "step": 4161 + }, + { + "epoch": 0.09707090525849327, + "grad_norm": 1.4903624057769775, + "learning_rate": 1.9999708954829992e-05, + "loss": 1.1562, + "step": 4162 + }, + { + "epoch": 0.09709422839767118, + "grad_norm": 1.8792493343353271, + "learning_rate": 1.9999707030579038e-05, + "loss": 1.2509, + "step": 4163 + }, + { + "epoch": 0.0971175515368491, + "grad_norm": 2.2513837814331055, + "learning_rate": 1.9999705099988012e-05, + "loss": 1.5141, + "step": 4164 + }, + { + "epoch": 0.09714087467602701, + "grad_norm": 1.8120847940444946, + "learning_rate": 1.9999703163056908e-05, + "loss": 1.5603, + "step": 4165 + }, + { + "epoch": 0.09716419781520494, + "grad_norm": 1.8012254238128662, + "learning_rate": 1.999970121978573e-05, + "loss": 1.7928, + "step": 4166 + }, + { + "epoch": 0.09718752095438285, + "grad_norm": 1.5914149284362793, + "learning_rate": 1.999969927017448e-05, + "loss": 1.398, + "step": 4167 + }, + { + "epoch": 0.09721084409356077, + "grad_norm": 2.2462551593780518, + "learning_rate": 1.9999697314223158e-05, + "loss": 1.3219, + "step": 4168 + }, + { + "epoch": 0.09723416723273869, + "grad_norm": 1.847602128982544, + "learning_rate": 1.999969535193176e-05, + "loss": 1.4635, + "step": 4169 + }, + { + "epoch": 0.09725749037191661, + "grad_norm": 1.968076467514038, + "learning_rate": 1.99996933833003e-05, + "loss": 1.6123, + "step": 4170 + }, + { + "epoch": 0.09728081351109452, + "grad_norm": 1.9207682609558105, + "learning_rate": 1.999969140832877e-05, + "loss": 1.3354, + "step": 4171 + }, + { + "epoch": 0.09730413665027245, + "grad_norm": 1.8169053792953491, + "learning_rate": 1.9999689427017174e-05, + "loss": 1.357, + "step": 4172 + }, + { + "epoch": 0.09732745978945036, + "grad_norm": 1.7797213792800903, + "learning_rate": 1.9999687439365506e-05, + "loss": 1.2203, + "step": 4173 + }, + { + "epoch": 0.09735078292862828, + "grad_norm": 1.9157795906066895, + "learning_rate": 1.999968544537378e-05, + "loss": 1.4148, + "step": 4174 + }, + { + "epoch": 0.0973741060678062, + "grad_norm": 1.6914931535720825, + "learning_rate": 1.999968344504199e-05, + "loss": 1.4039, + "step": 4175 + }, + { + "epoch": 0.09739742920698412, + "grad_norm": 6.293972015380859, + "learning_rate": 1.9999681438370134e-05, + "loss": 1.4866, + "step": 4176 + }, + { + "epoch": 0.09742075234616203, + "grad_norm": 1.5637798309326172, + "learning_rate": 1.999967942535822e-05, + "loss": 1.572, + "step": 4177 + }, + { + "epoch": 0.09744407548533995, + "grad_norm": 1.877367377281189, + "learning_rate": 1.999967740600625e-05, + "loss": 1.7372, + "step": 4178 + }, + { + "epoch": 0.09746739862451786, + "grad_norm": 1.6659308671951294, + "learning_rate": 1.999967538031422e-05, + "loss": 1.639, + "step": 4179 + }, + { + "epoch": 0.09749072176369579, + "grad_norm": 1.550208568572998, + "learning_rate": 1.999967334828213e-05, + "loss": 1.1717, + "step": 4180 + }, + { + "epoch": 0.0975140449028737, + "grad_norm": 2.298583745956421, + "learning_rate": 1.999967130990999e-05, + "loss": 1.5788, + "step": 4181 + }, + { + "epoch": 0.09753736804205163, + "grad_norm": 1.6380672454833984, + "learning_rate": 1.9999669265197796e-05, + "loss": 1.2321, + "step": 4182 + }, + { + "epoch": 0.09756069118122954, + "grad_norm": 1.7410871982574463, + "learning_rate": 1.9999667214145542e-05, + "loss": 1.1268, + "step": 4183 + }, + { + "epoch": 0.09758401432040746, + "grad_norm": 1.8795753717422485, + "learning_rate": 1.999966515675324e-05, + "loss": 1.3166, + "step": 4184 + }, + { + "epoch": 0.09760733745958537, + "grad_norm": 1.4578545093536377, + "learning_rate": 1.999966309302089e-05, + "loss": 1.2819, + "step": 4185 + }, + { + "epoch": 0.0976306605987633, + "grad_norm": 2.1507022380828857, + "learning_rate": 1.999966102294849e-05, + "loss": 1.3444, + "step": 4186 + }, + { + "epoch": 0.09765398373794121, + "grad_norm": 1.5323656797409058, + "learning_rate": 1.999965894653604e-05, + "loss": 1.4338, + "step": 4187 + }, + { + "epoch": 0.09767730687711913, + "grad_norm": 1.7071229219436646, + "learning_rate": 1.9999656863783546e-05, + "loss": 1.6295, + "step": 4188 + }, + { + "epoch": 0.09770063001629704, + "grad_norm": 1.5819435119628906, + "learning_rate": 1.999965477469101e-05, + "loss": 1.2241, + "step": 4189 + }, + { + "epoch": 0.09772395315547495, + "grad_norm": 1.9116559028625488, + "learning_rate": 1.9999652679258424e-05, + "loss": 1.5789, + "step": 4190 + }, + { + "epoch": 0.09774727629465288, + "grad_norm": 1.4237418174743652, + "learning_rate": 1.9999650577485797e-05, + "loss": 1.213, + "step": 4191 + }, + { + "epoch": 0.09777059943383079, + "grad_norm": 1.715041160583496, + "learning_rate": 1.9999648469373132e-05, + "loss": 1.4415, + "step": 4192 + }, + { + "epoch": 0.09779392257300872, + "grad_norm": 1.9360545873641968, + "learning_rate": 1.9999646354920425e-05, + "loss": 1.405, + "step": 4193 + }, + { + "epoch": 0.09781724571218663, + "grad_norm": 2.3374712467193604, + "learning_rate": 1.9999644234127684e-05, + "loss": 1.6741, + "step": 4194 + }, + { + "epoch": 0.09784056885136455, + "grad_norm": 2.0107362270355225, + "learning_rate": 1.99996421069949e-05, + "loss": 1.5874, + "step": 4195 + }, + { + "epoch": 0.09786389199054246, + "grad_norm": 2.5181028842926025, + "learning_rate": 1.9999639973522086e-05, + "loss": 1.6139, + "step": 4196 + }, + { + "epoch": 0.09788721512972039, + "grad_norm": 1.7766554355621338, + "learning_rate": 1.9999637833709233e-05, + "loss": 1.3219, + "step": 4197 + }, + { + "epoch": 0.0979105382688983, + "grad_norm": 2.135298013687134, + "learning_rate": 1.999963568755635e-05, + "loss": 1.598, + "step": 4198 + }, + { + "epoch": 0.09793386140807622, + "grad_norm": 2.5849833488464355, + "learning_rate": 1.9999633535063433e-05, + "loss": 1.19, + "step": 4199 + }, + { + "epoch": 0.09795718454725413, + "grad_norm": 1.7871876955032349, + "learning_rate": 1.999963137623049e-05, + "loss": 1.2969, + "step": 4200 + }, + { + "epoch": 0.09798050768643206, + "grad_norm": 1.6766856908798218, + "learning_rate": 1.9999629211057514e-05, + "loss": 1.7685, + "step": 4201 + }, + { + "epoch": 0.09800383082560997, + "grad_norm": 1.9163066148757935, + "learning_rate": 1.999962703954451e-05, + "loss": 1.6925, + "step": 4202 + }, + { + "epoch": 0.0980271539647879, + "grad_norm": 3.168355941772461, + "learning_rate": 1.9999624861691482e-05, + "loss": 1.1072, + "step": 4203 + }, + { + "epoch": 0.0980504771039658, + "grad_norm": 1.8867367506027222, + "learning_rate": 1.999962267749843e-05, + "loss": 0.8637, + "step": 4204 + }, + { + "epoch": 0.09807380024314373, + "grad_norm": 1.6280899047851562, + "learning_rate": 1.9999620486965356e-05, + "loss": 1.7693, + "step": 4205 + }, + { + "epoch": 0.09809712338232164, + "grad_norm": 1.8064197301864624, + "learning_rate": 1.9999618290092254e-05, + "loss": 1.579, + "step": 4206 + }, + { + "epoch": 0.09812044652149957, + "grad_norm": 1.6739108562469482, + "learning_rate": 1.9999616086879134e-05, + "loss": 1.326, + "step": 4207 + }, + { + "epoch": 0.09814376966067748, + "grad_norm": 1.659472107887268, + "learning_rate": 1.9999613877326e-05, + "loss": 1.4379, + "step": 4208 + }, + { + "epoch": 0.0981670927998554, + "grad_norm": 1.6488410234451294, + "learning_rate": 1.9999611661432844e-05, + "loss": 1.5683, + "step": 4209 + }, + { + "epoch": 0.09819041593903331, + "grad_norm": 1.5839974880218506, + "learning_rate": 1.999960943919967e-05, + "loss": 1.567, + "step": 4210 + }, + { + "epoch": 0.09821373907821124, + "grad_norm": 1.7465866804122925, + "learning_rate": 1.9999607210626485e-05, + "loss": 1.6317, + "step": 4211 + }, + { + "epoch": 0.09823706221738915, + "grad_norm": 1.8293263912200928, + "learning_rate": 1.9999604975713286e-05, + "loss": 1.6924, + "step": 4212 + }, + { + "epoch": 0.09826038535656707, + "grad_norm": 1.6221671104431152, + "learning_rate": 1.999960273446007e-05, + "loss": 1.5512, + "step": 4213 + }, + { + "epoch": 0.09828370849574498, + "grad_norm": 2.131727695465088, + "learning_rate": 1.9999600486866848e-05, + "loss": 1.6782, + "step": 4214 + }, + { + "epoch": 0.09830703163492291, + "grad_norm": 1.6054993867874146, + "learning_rate": 1.9999598232933615e-05, + "loss": 1.5069, + "step": 4215 + }, + { + "epoch": 0.09833035477410082, + "grad_norm": 1.7985066175460815, + "learning_rate": 1.999959597266038e-05, + "loss": 1.519, + "step": 4216 + }, + { + "epoch": 0.09835367791327873, + "grad_norm": 1.9092615842819214, + "learning_rate": 1.999959370604713e-05, + "loss": 1.6574, + "step": 4217 + }, + { + "epoch": 0.09837700105245666, + "grad_norm": 2.288851261138916, + "learning_rate": 1.9999591433093878e-05, + "loss": 1.5944, + "step": 4218 + }, + { + "epoch": 0.09840032419163457, + "grad_norm": 1.9145318269729614, + "learning_rate": 1.999958915380062e-05, + "loss": 1.7942, + "step": 4219 + }, + { + "epoch": 0.09842364733081249, + "grad_norm": 1.755913257598877, + "learning_rate": 1.9999586868167367e-05, + "loss": 1.3487, + "step": 4220 + }, + { + "epoch": 0.0984469704699904, + "grad_norm": 1.7752525806427002, + "learning_rate": 1.999958457619411e-05, + "loss": 1.4491, + "step": 4221 + }, + { + "epoch": 0.09847029360916833, + "grad_norm": 1.9941532611846924, + "learning_rate": 1.9999582277880852e-05, + "loss": 1.8066, + "step": 4222 + }, + { + "epoch": 0.09849361674834624, + "grad_norm": 1.6862519979476929, + "learning_rate": 1.99995799732276e-05, + "loss": 1.5079, + "step": 4223 + }, + { + "epoch": 0.09851693988752416, + "grad_norm": 1.443449854850769, + "learning_rate": 1.9999577662234347e-05, + "loss": 1.3898, + "step": 4224 + }, + { + "epoch": 0.09854026302670207, + "grad_norm": 1.8810759782791138, + "learning_rate": 1.9999575344901102e-05, + "loss": 1.5129, + "step": 4225 + }, + { + "epoch": 0.09856358616588, + "grad_norm": 1.6053975820541382, + "learning_rate": 1.9999573021227862e-05, + "loss": 1.3603, + "step": 4226 + }, + { + "epoch": 0.09858690930505791, + "grad_norm": 1.8535492420196533, + "learning_rate": 1.9999570691214632e-05, + "loss": 1.5042, + "step": 4227 + }, + { + "epoch": 0.09861023244423583, + "grad_norm": 1.8407317399978638, + "learning_rate": 1.999956835486141e-05, + "loss": 1.4641, + "step": 4228 + }, + { + "epoch": 0.09863355558341375, + "grad_norm": 1.6712974309921265, + "learning_rate": 1.9999566012168202e-05, + "loss": 1.2335, + "step": 4229 + }, + { + "epoch": 0.09865687872259167, + "grad_norm": 1.3778800964355469, + "learning_rate": 1.9999563663135006e-05, + "loss": 1.241, + "step": 4230 + }, + { + "epoch": 0.09868020186176958, + "grad_norm": 1.8800315856933594, + "learning_rate": 1.9999561307761823e-05, + "loss": 1.4065, + "step": 4231 + }, + { + "epoch": 0.0987035250009475, + "grad_norm": 2.0709855556488037, + "learning_rate": 1.9999558946048655e-05, + "loss": 1.5202, + "step": 4232 + }, + { + "epoch": 0.09872684814012542, + "grad_norm": 1.7393478155136108, + "learning_rate": 1.9999556577995504e-05, + "loss": 1.5224, + "step": 4233 + }, + { + "epoch": 0.09875017127930334, + "grad_norm": 1.7007765769958496, + "learning_rate": 1.999955420360237e-05, + "loss": 1.4261, + "step": 4234 + }, + { + "epoch": 0.09877349441848125, + "grad_norm": 1.7563802003860474, + "learning_rate": 1.9999551822869262e-05, + "loss": 1.4401, + "step": 4235 + }, + { + "epoch": 0.09879681755765918, + "grad_norm": 1.7062292098999023, + "learning_rate": 1.999954943579617e-05, + "loss": 1.6909, + "step": 4236 + }, + { + "epoch": 0.09882014069683709, + "grad_norm": 1.8759976625442505, + "learning_rate": 1.9999547042383102e-05, + "loss": 1.7044, + "step": 4237 + }, + { + "epoch": 0.09884346383601501, + "grad_norm": 1.8254616260528564, + "learning_rate": 1.999954464263006e-05, + "loss": 1.5238, + "step": 4238 + }, + { + "epoch": 0.09886678697519292, + "grad_norm": 2.0938169956207275, + "learning_rate": 1.9999542236537043e-05, + "loss": 0.9963, + "step": 4239 + }, + { + "epoch": 0.09889011011437085, + "grad_norm": 1.818045973777771, + "learning_rate": 1.9999539824104057e-05, + "loss": 1.5701, + "step": 4240 + }, + { + "epoch": 0.09891343325354876, + "grad_norm": 1.4690525531768799, + "learning_rate": 1.9999537405331098e-05, + "loss": 1.5044, + "step": 4241 + }, + { + "epoch": 0.09893675639272669, + "grad_norm": 1.7315692901611328, + "learning_rate": 1.999953498021817e-05, + "loss": 1.2657, + "step": 4242 + }, + { + "epoch": 0.0989600795319046, + "grad_norm": 2.039533853530884, + "learning_rate": 1.9999532548765272e-05, + "loss": 1.5758, + "step": 4243 + }, + { + "epoch": 0.09898340267108252, + "grad_norm": 2.1871633529663086, + "learning_rate": 1.999953011097241e-05, + "loss": 1.8441, + "step": 4244 + }, + { + "epoch": 0.09900672581026043, + "grad_norm": 1.8292895555496216, + "learning_rate": 1.9999527666839584e-05, + "loss": 1.4835, + "step": 4245 + }, + { + "epoch": 0.09903004894943834, + "grad_norm": 1.85469388961792, + "learning_rate": 1.9999525216366793e-05, + "loss": 1.6256, + "step": 4246 + }, + { + "epoch": 0.09905337208861627, + "grad_norm": 1.8915073871612549, + "learning_rate": 1.9999522759554045e-05, + "loss": 1.3652, + "step": 4247 + }, + { + "epoch": 0.09907669522779418, + "grad_norm": 1.4933258295059204, + "learning_rate": 1.999952029640133e-05, + "loss": 1.5552, + "step": 4248 + }, + { + "epoch": 0.0991000183669721, + "grad_norm": 2.428149938583374, + "learning_rate": 1.999951782690866e-05, + "loss": 1.4655, + "step": 4249 + }, + { + "epoch": 0.09912334150615001, + "grad_norm": 1.7580535411834717, + "learning_rate": 1.9999515351076036e-05, + "loss": 1.2424, + "step": 4250 + }, + { + "epoch": 0.09914666464532794, + "grad_norm": 1.9982497692108154, + "learning_rate": 1.9999512868903453e-05, + "loss": 1.4033, + "step": 4251 + }, + { + "epoch": 0.09916998778450585, + "grad_norm": 2.0487194061279297, + "learning_rate": 1.9999510380390917e-05, + "loss": 1.7197, + "step": 4252 + }, + { + "epoch": 0.09919331092368378, + "grad_norm": 1.6107131242752075, + "learning_rate": 1.999950788553843e-05, + "loss": 1.6606, + "step": 4253 + }, + { + "epoch": 0.09921663406286169, + "grad_norm": 2.1322474479675293, + "learning_rate": 1.999950538434599e-05, + "loss": 1.2501, + "step": 4254 + }, + { + "epoch": 0.09923995720203961, + "grad_norm": 1.7192622423171997, + "learning_rate": 1.9999502876813602e-05, + "loss": 1.4732, + "step": 4255 + }, + { + "epoch": 0.09926328034121752, + "grad_norm": 1.885438084602356, + "learning_rate": 1.9999500362941272e-05, + "loss": 1.6839, + "step": 4256 + }, + { + "epoch": 0.09928660348039545, + "grad_norm": 1.5883655548095703, + "learning_rate": 1.999949784272899e-05, + "loss": 1.478, + "step": 4257 + }, + { + "epoch": 0.09930992661957336, + "grad_norm": 2.0354058742523193, + "learning_rate": 1.9999495316176766e-05, + "loss": 1.3814, + "step": 4258 + }, + { + "epoch": 0.09933324975875128, + "grad_norm": 2.1163158416748047, + "learning_rate": 1.99994927832846e-05, + "loss": 1.677, + "step": 4259 + }, + { + "epoch": 0.0993565728979292, + "grad_norm": 2.6560354232788086, + "learning_rate": 1.999949024405249e-05, + "loss": 1.4997, + "step": 4260 + }, + { + "epoch": 0.09937989603710712, + "grad_norm": 2.152900218963623, + "learning_rate": 1.9999487698480443e-05, + "loss": 1.7245, + "step": 4261 + }, + { + "epoch": 0.09940321917628503, + "grad_norm": 1.2850993871688843, + "learning_rate": 1.999948514656846e-05, + "loss": 1.3858, + "step": 4262 + }, + { + "epoch": 0.09942654231546295, + "grad_norm": 2.006429433822632, + "learning_rate": 1.9999482588316537e-05, + "loss": 1.3124, + "step": 4263 + }, + { + "epoch": 0.09944986545464087, + "grad_norm": 2.0759809017181396, + "learning_rate": 1.999948002372468e-05, + "loss": 1.5445, + "step": 4264 + }, + { + "epoch": 0.09947318859381879, + "grad_norm": 1.7543976306915283, + "learning_rate": 1.999947745279289e-05, + "loss": 1.4313, + "step": 4265 + }, + { + "epoch": 0.0994965117329967, + "grad_norm": 1.5660021305084229, + "learning_rate": 1.9999474875521172e-05, + "loss": 1.3618, + "step": 4266 + }, + { + "epoch": 0.09951983487217463, + "grad_norm": 1.7976728677749634, + "learning_rate": 1.9999472291909523e-05, + "loss": 1.2068, + "step": 4267 + }, + { + "epoch": 0.09954315801135254, + "grad_norm": 1.2305090427398682, + "learning_rate": 1.9999469701957944e-05, + "loss": 0.9688, + "step": 4268 + }, + { + "epoch": 0.09956648115053046, + "grad_norm": 1.636967658996582, + "learning_rate": 1.9999467105666442e-05, + "loss": 1.2086, + "step": 4269 + }, + { + "epoch": 0.09958980428970837, + "grad_norm": 1.271087646484375, + "learning_rate": 1.9999464503035014e-05, + "loss": 0.8756, + "step": 4270 + }, + { + "epoch": 0.0996131274288863, + "grad_norm": 2.0414915084838867, + "learning_rate": 1.9999461894063663e-05, + "loss": 1.6801, + "step": 4271 + }, + { + "epoch": 0.09963645056806421, + "grad_norm": 1.539941430091858, + "learning_rate": 1.9999459278752392e-05, + "loss": 1.2946, + "step": 4272 + }, + { + "epoch": 0.09965977370724213, + "grad_norm": 2.676841974258423, + "learning_rate": 1.9999456657101198e-05, + "loss": 1.3578, + "step": 4273 + }, + { + "epoch": 0.09968309684642004, + "grad_norm": 1.3585354089736938, + "learning_rate": 1.9999454029110088e-05, + "loss": 1.087, + "step": 4274 + }, + { + "epoch": 0.09970641998559795, + "grad_norm": 1.530239224433899, + "learning_rate": 1.999945139477906e-05, + "loss": 1.3106, + "step": 4275 + }, + { + "epoch": 0.09972974312477588, + "grad_norm": 2.2003183364868164, + "learning_rate": 1.999944875410812e-05, + "loss": 1.381, + "step": 4276 + }, + { + "epoch": 0.09975306626395379, + "grad_norm": 2.0120465755462646, + "learning_rate": 1.9999446107097263e-05, + "loss": 1.5153, + "step": 4277 + }, + { + "epoch": 0.09977638940313172, + "grad_norm": 1.709818959236145, + "learning_rate": 1.9999443453746498e-05, + "loss": 1.5923, + "step": 4278 + }, + { + "epoch": 0.09979971254230963, + "grad_norm": 1.913179636001587, + "learning_rate": 1.9999440794055824e-05, + "loss": 1.5885, + "step": 4279 + }, + { + "epoch": 0.09982303568148755, + "grad_norm": 1.7344928979873657, + "learning_rate": 1.9999438128025243e-05, + "loss": 1.5675, + "step": 4280 + }, + { + "epoch": 0.09984635882066546, + "grad_norm": 1.9740248918533325, + "learning_rate": 1.999943545565475e-05, + "loss": 1.547, + "step": 4281 + }, + { + "epoch": 0.09986968195984339, + "grad_norm": 1.8129318952560425, + "learning_rate": 1.9999432776944357e-05, + "loss": 1.421, + "step": 4282 + }, + { + "epoch": 0.0998930050990213, + "grad_norm": 1.5063918828964233, + "learning_rate": 1.9999430091894058e-05, + "loss": 1.4418, + "step": 4283 + }, + { + "epoch": 0.09991632823819922, + "grad_norm": 1.7168734073638916, + "learning_rate": 1.999942740050386e-05, + "loss": 1.2619, + "step": 4284 + }, + { + "epoch": 0.09993965137737713, + "grad_norm": 1.9776476621627808, + "learning_rate": 1.9999424702773762e-05, + "loss": 1.222, + "step": 4285 + }, + { + "epoch": 0.09996297451655506, + "grad_norm": 2.2479588985443115, + "learning_rate": 1.999942199870377e-05, + "loss": 1.0917, + "step": 4286 + }, + { + "epoch": 0.09998629765573297, + "grad_norm": 1.6717889308929443, + "learning_rate": 1.9999419288293876e-05, + "loss": 1.086, + "step": 4287 + }, + { + "epoch": 0.1000096207949109, + "grad_norm": 1.878680944442749, + "learning_rate": 1.9999416571544088e-05, + "loss": 1.5367, + "step": 4288 + }, + { + "epoch": 0.1000329439340888, + "grad_norm": 1.7646178007125854, + "learning_rate": 1.9999413848454407e-05, + "loss": 1.4865, + "step": 4289 + }, + { + "epoch": 0.10005626707326673, + "grad_norm": 1.8721680641174316, + "learning_rate": 1.999941111902484e-05, + "loss": 1.3529, + "step": 4290 + }, + { + "epoch": 0.10007959021244464, + "grad_norm": 1.8077625036239624, + "learning_rate": 1.999940838325538e-05, + "loss": 1.4719, + "step": 4291 + }, + { + "epoch": 0.10010291335162257, + "grad_norm": 2.198845148086548, + "learning_rate": 1.9999405641146032e-05, + "loss": 1.6665, + "step": 4292 + }, + { + "epoch": 0.10012623649080048, + "grad_norm": 1.691707968711853, + "learning_rate": 1.99994028926968e-05, + "loss": 1.7262, + "step": 4293 + }, + { + "epoch": 0.1001495596299784, + "grad_norm": 1.6580450534820557, + "learning_rate": 1.9999400137907684e-05, + "loss": 1.4598, + "step": 4294 + }, + { + "epoch": 0.10017288276915631, + "grad_norm": 1.6645382642745972, + "learning_rate": 1.9999397376778685e-05, + "loss": 1.4816, + "step": 4295 + }, + { + "epoch": 0.10019620590833424, + "grad_norm": 1.5742464065551758, + "learning_rate": 1.9999394609309807e-05, + "loss": 1.5432, + "step": 4296 + }, + { + "epoch": 0.10021952904751215, + "grad_norm": 1.6496295928955078, + "learning_rate": 1.999939183550105e-05, + "loss": 1.6818, + "step": 4297 + }, + { + "epoch": 0.10024285218669007, + "grad_norm": 1.385653018951416, + "learning_rate": 1.9999389055352417e-05, + "loss": 1.2546, + "step": 4298 + }, + { + "epoch": 0.10026617532586798, + "grad_norm": 1.8121085166931152, + "learning_rate": 1.9999386268863906e-05, + "loss": 1.6767, + "step": 4299 + }, + { + "epoch": 0.10028949846504591, + "grad_norm": 1.7636923789978027, + "learning_rate": 1.9999383476035526e-05, + "loss": 1.3737, + "step": 4300 + }, + { + "epoch": 0.10031282160422382, + "grad_norm": 2.0811192989349365, + "learning_rate": 1.999938067686727e-05, + "loss": 1.193, + "step": 4301 + }, + { + "epoch": 0.10033614474340175, + "grad_norm": 1.491807460784912, + "learning_rate": 1.9999377871359145e-05, + "loss": 1.3031, + "step": 4302 + }, + { + "epoch": 0.10035946788257966, + "grad_norm": 1.7776809930801392, + "learning_rate": 1.999937505951115e-05, + "loss": 1.2694, + "step": 4303 + }, + { + "epoch": 0.10038279102175757, + "grad_norm": 2.235464334487915, + "learning_rate": 1.9999372241323295e-05, + "loss": 1.4957, + "step": 4304 + }, + { + "epoch": 0.10040611416093549, + "grad_norm": 2.128072500228882, + "learning_rate": 1.999936941679557e-05, + "loss": 1.4781, + "step": 4305 + }, + { + "epoch": 0.1004294373001134, + "grad_norm": 1.7737715244293213, + "learning_rate": 1.9999366585927984e-05, + "loss": 1.1417, + "step": 4306 + }, + { + "epoch": 0.10045276043929133, + "grad_norm": 1.5610971450805664, + "learning_rate": 1.9999363748720536e-05, + "loss": 0.8304, + "step": 4307 + }, + { + "epoch": 0.10047608357846924, + "grad_norm": 1.8552017211914062, + "learning_rate": 1.9999360905173232e-05, + "loss": 1.5004, + "step": 4308 + }, + { + "epoch": 0.10049940671764716, + "grad_norm": 1.4400602579116821, + "learning_rate": 1.9999358055286066e-05, + "loss": 1.3209, + "step": 4309 + }, + { + "epoch": 0.10052272985682507, + "grad_norm": 1.8621962070465088, + "learning_rate": 1.999935519905905e-05, + "loss": 1.6884, + "step": 4310 + }, + { + "epoch": 0.100546052996003, + "grad_norm": 1.6067652702331543, + "learning_rate": 1.9999352336492175e-05, + "loss": 1.1682, + "step": 4311 + }, + { + "epoch": 0.10056937613518091, + "grad_norm": 1.9501607418060303, + "learning_rate": 1.999934946758545e-05, + "loss": 1.4891, + "step": 4312 + }, + { + "epoch": 0.10059269927435883, + "grad_norm": 1.669527530670166, + "learning_rate": 1.9999346592338874e-05, + "loss": 1.3783, + "step": 4313 + }, + { + "epoch": 0.10061602241353675, + "grad_norm": 1.7186279296875, + "learning_rate": 1.999934371075245e-05, + "loss": 1.3234, + "step": 4314 + }, + { + "epoch": 0.10063934555271467, + "grad_norm": 1.6204017400741577, + "learning_rate": 1.9999340822826182e-05, + "loss": 1.267, + "step": 4315 + }, + { + "epoch": 0.10066266869189258, + "grad_norm": 1.8573533296585083, + "learning_rate": 1.9999337928560066e-05, + "loss": 1.5666, + "step": 4316 + }, + { + "epoch": 0.1006859918310705, + "grad_norm": 1.4896578788757324, + "learning_rate": 1.9999335027954113e-05, + "loss": 1.319, + "step": 4317 + }, + { + "epoch": 0.10070931497024842, + "grad_norm": 1.641883134841919, + "learning_rate": 1.9999332121008314e-05, + "loss": 1.5142, + "step": 4318 + }, + { + "epoch": 0.10073263810942634, + "grad_norm": 2.2819859981536865, + "learning_rate": 1.9999329207722677e-05, + "loss": 1.738, + "step": 4319 + }, + { + "epoch": 0.10075596124860425, + "grad_norm": 1.6085293292999268, + "learning_rate": 1.9999326288097204e-05, + "loss": 1.2742, + "step": 4320 + }, + { + "epoch": 0.10077928438778218, + "grad_norm": 1.5279383659362793, + "learning_rate": 1.9999323362131894e-05, + "loss": 1.3892, + "step": 4321 + }, + { + "epoch": 0.10080260752696009, + "grad_norm": 1.8018547296524048, + "learning_rate": 1.999932042982675e-05, + "loss": 1.2437, + "step": 4322 + }, + { + "epoch": 0.10082593066613801, + "grad_norm": 2.234227418899536, + "learning_rate": 1.9999317491181775e-05, + "loss": 1.8226, + "step": 4323 + }, + { + "epoch": 0.10084925380531592, + "grad_norm": 1.7504974603652954, + "learning_rate": 1.999931454619697e-05, + "loss": 1.1958, + "step": 4324 + }, + { + "epoch": 0.10087257694449385, + "grad_norm": 1.9000635147094727, + "learning_rate": 1.9999311594872337e-05, + "loss": 1.7756, + "step": 4325 + }, + { + "epoch": 0.10089590008367176, + "grad_norm": 1.346874475479126, + "learning_rate": 1.999930863720788e-05, + "loss": 1.241, + "step": 4326 + }, + { + "epoch": 0.10091922322284969, + "grad_norm": 1.410976529121399, + "learning_rate": 1.9999305673203596e-05, + "loss": 1.1809, + "step": 4327 + }, + { + "epoch": 0.1009425463620276, + "grad_norm": 1.805508017539978, + "learning_rate": 1.9999302702859492e-05, + "loss": 1.6081, + "step": 4328 + }, + { + "epoch": 0.10096586950120552, + "grad_norm": 1.7266390323638916, + "learning_rate": 1.9999299726175566e-05, + "loss": 1.3068, + "step": 4329 + }, + { + "epoch": 0.10098919264038343, + "grad_norm": 1.8772683143615723, + "learning_rate": 1.9999296743151823e-05, + "loss": 1.144, + "step": 4330 + }, + { + "epoch": 0.10101251577956134, + "grad_norm": 1.3331047296524048, + "learning_rate": 1.999929375378826e-05, + "loss": 1.2164, + "step": 4331 + }, + { + "epoch": 0.10103583891873927, + "grad_norm": 1.6070294380187988, + "learning_rate": 1.9999290758084885e-05, + "loss": 1.6565, + "step": 4332 + }, + { + "epoch": 0.10105916205791718, + "grad_norm": 1.5945744514465332, + "learning_rate": 1.9999287756041698e-05, + "loss": 1.298, + "step": 4333 + }, + { + "epoch": 0.1010824851970951, + "grad_norm": 1.7127183675765991, + "learning_rate": 1.9999284747658696e-05, + "loss": 1.6233, + "step": 4334 + }, + { + "epoch": 0.10110580833627301, + "grad_norm": 1.4577192068099976, + "learning_rate": 1.9999281732935886e-05, + "loss": 1.3159, + "step": 4335 + }, + { + "epoch": 0.10112913147545094, + "grad_norm": 2.0346920490264893, + "learning_rate": 1.9999278711873272e-05, + "loss": 1.3263, + "step": 4336 + }, + { + "epoch": 0.10115245461462885, + "grad_norm": 1.582476019859314, + "learning_rate": 1.999927568447085e-05, + "loss": 1.4117, + "step": 4337 + }, + { + "epoch": 0.10117577775380678, + "grad_norm": 1.9694240093231201, + "learning_rate": 1.999927265072863e-05, + "loss": 1.5795, + "step": 4338 + }, + { + "epoch": 0.10119910089298469, + "grad_norm": 1.84016752243042, + "learning_rate": 1.9999269610646603e-05, + "loss": 1.4963, + "step": 4339 + }, + { + "epoch": 0.10122242403216261, + "grad_norm": 1.886704444885254, + "learning_rate": 1.9999266564224777e-05, + "loss": 1.6491, + "step": 4340 + }, + { + "epoch": 0.10124574717134052, + "grad_norm": 1.9586236476898193, + "learning_rate": 1.9999263511463155e-05, + "loss": 1.5735, + "step": 4341 + }, + { + "epoch": 0.10126907031051845, + "grad_norm": 2.501908302307129, + "learning_rate": 1.9999260452361737e-05, + "loss": 0.9846, + "step": 4342 + }, + { + "epoch": 0.10129239344969636, + "grad_norm": 1.6577366590499878, + "learning_rate": 1.9999257386920526e-05, + "loss": 1.5666, + "step": 4343 + }, + { + "epoch": 0.10131571658887428, + "grad_norm": 1.3364449739456177, + "learning_rate": 1.9999254315139524e-05, + "loss": 1.1554, + "step": 4344 + }, + { + "epoch": 0.1013390397280522, + "grad_norm": 1.6358418464660645, + "learning_rate": 1.9999251237018727e-05, + "loss": 1.6063, + "step": 4345 + }, + { + "epoch": 0.10136236286723012, + "grad_norm": 1.7238401174545288, + "learning_rate": 1.999924815255815e-05, + "loss": 1.3965, + "step": 4346 + }, + { + "epoch": 0.10138568600640803, + "grad_norm": 1.815588116645813, + "learning_rate": 1.999924506175778e-05, + "loss": 1.1671, + "step": 4347 + }, + { + "epoch": 0.10140900914558595, + "grad_norm": 2.0672595500946045, + "learning_rate": 1.999924196461763e-05, + "loss": 1.5405, + "step": 4348 + }, + { + "epoch": 0.10143233228476387, + "grad_norm": 1.8933367729187012, + "learning_rate": 1.9999238861137696e-05, + "loss": 1.5728, + "step": 4349 + }, + { + "epoch": 0.10145565542394179, + "grad_norm": 1.6406468152999878, + "learning_rate": 1.9999235751317986e-05, + "loss": 1.2051, + "step": 4350 + }, + { + "epoch": 0.1014789785631197, + "grad_norm": 1.594834804534912, + "learning_rate": 1.9999232635158495e-05, + "loss": 1.6005, + "step": 4351 + }, + { + "epoch": 0.10150230170229763, + "grad_norm": 2.1673924922943115, + "learning_rate": 1.999922951265923e-05, + "loss": 1.1405, + "step": 4352 + }, + { + "epoch": 0.10152562484147554, + "grad_norm": 1.4452152252197266, + "learning_rate": 1.9999226383820187e-05, + "loss": 1.4138, + "step": 4353 + }, + { + "epoch": 0.10154894798065346, + "grad_norm": 1.6347564458847046, + "learning_rate": 1.9999223248641375e-05, + "loss": 1.3157, + "step": 4354 + }, + { + "epoch": 0.10157227111983137, + "grad_norm": 1.6961263418197632, + "learning_rate": 1.999922010712279e-05, + "loss": 1.5234, + "step": 4355 + }, + { + "epoch": 0.1015955942590093, + "grad_norm": 1.6713759899139404, + "learning_rate": 1.9999216959264444e-05, + "loss": 1.4066, + "step": 4356 + }, + { + "epoch": 0.10161891739818721, + "grad_norm": 1.6086574792861938, + "learning_rate": 1.9999213805066326e-05, + "loss": 1.5911, + "step": 4357 + }, + { + "epoch": 0.10164224053736513, + "grad_norm": 1.5126043558120728, + "learning_rate": 1.9999210644528444e-05, + "loss": 0.9794, + "step": 4358 + }, + { + "epoch": 0.10166556367654304, + "grad_norm": 1.6738828420639038, + "learning_rate": 1.9999207477650802e-05, + "loss": 1.3159, + "step": 4359 + }, + { + "epoch": 0.10168888681572096, + "grad_norm": 1.8790634870529175, + "learning_rate": 1.9999204304433397e-05, + "loss": 1.8513, + "step": 4360 + }, + { + "epoch": 0.10171220995489888, + "grad_norm": 1.7257745265960693, + "learning_rate": 1.9999201124876237e-05, + "loss": 1.4597, + "step": 4361 + }, + { + "epoch": 0.10173553309407679, + "grad_norm": 1.7159866094589233, + "learning_rate": 1.9999197938979317e-05, + "loss": 1.5366, + "step": 4362 + }, + { + "epoch": 0.10175885623325472, + "grad_norm": 1.5496243238449097, + "learning_rate": 1.9999194746742647e-05, + "loss": 1.5668, + "step": 4363 + }, + { + "epoch": 0.10178217937243263, + "grad_norm": 1.4768664836883545, + "learning_rate": 1.9999191548166227e-05, + "loss": 1.3203, + "step": 4364 + }, + { + "epoch": 0.10180550251161055, + "grad_norm": 1.7893187999725342, + "learning_rate": 1.9999188343250053e-05, + "loss": 1.527, + "step": 4365 + }, + { + "epoch": 0.10182882565078846, + "grad_norm": 1.770240306854248, + "learning_rate": 1.999918513199413e-05, + "loss": 1.5302, + "step": 4366 + }, + { + "epoch": 0.10185214878996639, + "grad_norm": 2.052283763885498, + "learning_rate": 1.9999181914398465e-05, + "loss": 1.6142, + "step": 4367 + }, + { + "epoch": 0.1018754719291443, + "grad_norm": 1.8870301246643066, + "learning_rate": 1.9999178690463054e-05, + "loss": 1.609, + "step": 4368 + }, + { + "epoch": 0.10189879506832222, + "grad_norm": 2.209730625152588, + "learning_rate": 1.99991754601879e-05, + "loss": 1.1869, + "step": 4369 + }, + { + "epoch": 0.10192211820750013, + "grad_norm": 1.906145453453064, + "learning_rate": 1.9999172223573008e-05, + "loss": 1.6161, + "step": 4370 + }, + { + "epoch": 0.10194544134667806, + "grad_norm": 1.7600947618484497, + "learning_rate": 1.9999168980618377e-05, + "loss": 1.7264, + "step": 4371 + }, + { + "epoch": 0.10196876448585597, + "grad_norm": 1.475500464439392, + "learning_rate": 1.9999165731324012e-05, + "loss": 1.1859, + "step": 4372 + }, + { + "epoch": 0.1019920876250339, + "grad_norm": 2.4348111152648926, + "learning_rate": 1.999916247568991e-05, + "loss": 1.4721, + "step": 4373 + }, + { + "epoch": 0.1020154107642118, + "grad_norm": 1.7595854997634888, + "learning_rate": 1.9999159213716083e-05, + "loss": 1.4112, + "step": 4374 + }, + { + "epoch": 0.10203873390338973, + "grad_norm": 2.1891770362854004, + "learning_rate": 1.9999155945402522e-05, + "loss": 1.4783, + "step": 4375 + }, + { + "epoch": 0.10206205704256764, + "grad_norm": 1.538309097290039, + "learning_rate": 1.9999152670749235e-05, + "loss": 1.5166, + "step": 4376 + }, + { + "epoch": 0.10208538018174557, + "grad_norm": 1.431139588356018, + "learning_rate": 1.9999149389756224e-05, + "loss": 1.1056, + "step": 4377 + }, + { + "epoch": 0.10210870332092348, + "grad_norm": 1.902206301689148, + "learning_rate": 1.9999146102423484e-05, + "loss": 1.3445, + "step": 4378 + }, + { + "epoch": 0.1021320264601014, + "grad_norm": 1.5516972541809082, + "learning_rate": 1.999914280875103e-05, + "loss": 1.3287, + "step": 4379 + }, + { + "epoch": 0.10215534959927931, + "grad_norm": 2.003575325012207, + "learning_rate": 1.999913950873885e-05, + "loss": 1.3581, + "step": 4380 + }, + { + "epoch": 0.10217867273845724, + "grad_norm": 2.0806925296783447, + "learning_rate": 1.9999136202386957e-05, + "loss": 1.7686, + "step": 4381 + }, + { + "epoch": 0.10220199587763515, + "grad_norm": 1.9133434295654297, + "learning_rate": 1.999913288969535e-05, + "loss": 1.5319, + "step": 4382 + }, + { + "epoch": 0.10222531901681307, + "grad_norm": 1.709938645362854, + "learning_rate": 1.999912957066403e-05, + "loss": 1.1938, + "step": 4383 + }, + { + "epoch": 0.10224864215599098, + "grad_norm": 2.0247511863708496, + "learning_rate": 1.9999126245292997e-05, + "loss": 1.5905, + "step": 4384 + }, + { + "epoch": 0.10227196529516891, + "grad_norm": 1.8318753242492676, + "learning_rate": 1.9999122913582256e-05, + "loss": 1.065, + "step": 4385 + }, + { + "epoch": 0.10229528843434682, + "grad_norm": 1.8385038375854492, + "learning_rate": 1.9999119575531808e-05, + "loss": 1.242, + "step": 4386 + }, + { + "epoch": 0.10231861157352475, + "grad_norm": 1.8718616962432861, + "learning_rate": 1.9999116231141658e-05, + "loss": 1.6333, + "step": 4387 + }, + { + "epoch": 0.10234193471270266, + "grad_norm": 1.9290051460266113, + "learning_rate": 1.9999112880411804e-05, + "loss": 1.7813, + "step": 4388 + }, + { + "epoch": 0.10236525785188057, + "grad_norm": 1.593847632408142, + "learning_rate": 1.9999109523342252e-05, + "loss": 1.5774, + "step": 4389 + }, + { + "epoch": 0.10238858099105849, + "grad_norm": 1.9645016193389893, + "learning_rate": 1.9999106159933e-05, + "loss": 1.3466, + "step": 4390 + }, + { + "epoch": 0.1024119041302364, + "grad_norm": 1.6337265968322754, + "learning_rate": 1.999910279018405e-05, + "loss": 0.938, + "step": 4391 + }, + { + "epoch": 0.10243522726941433, + "grad_norm": 1.5305380821228027, + "learning_rate": 1.9999099414095414e-05, + "loss": 1.4619, + "step": 4392 + }, + { + "epoch": 0.10245855040859224, + "grad_norm": 1.9405750036239624, + "learning_rate": 1.999909603166708e-05, + "loss": 1.7327, + "step": 4393 + }, + { + "epoch": 0.10248187354777016, + "grad_norm": 1.8218337297439575, + "learning_rate": 1.999909264289906e-05, + "loss": 1.4709, + "step": 4394 + }, + { + "epoch": 0.10250519668694807, + "grad_norm": 1.7523096799850464, + "learning_rate": 1.999908924779135e-05, + "loss": 1.4373, + "step": 4395 + }, + { + "epoch": 0.102528519826126, + "grad_norm": 1.9578635692596436, + "learning_rate": 1.999908584634396e-05, + "loss": 1.5482, + "step": 4396 + }, + { + "epoch": 0.10255184296530391, + "grad_norm": 2.027506113052368, + "learning_rate": 1.999908243855688e-05, + "loss": 1.7009, + "step": 4397 + }, + { + "epoch": 0.10257516610448184, + "grad_norm": 1.7978277206420898, + "learning_rate": 1.999907902443012e-05, + "loss": 1.4091, + "step": 4398 + }, + { + "epoch": 0.10259848924365975, + "grad_norm": 1.7331018447875977, + "learning_rate": 1.9999075603963685e-05, + "loss": 1.4605, + "step": 4399 + }, + { + "epoch": 0.10262181238283767, + "grad_norm": 1.545462727546692, + "learning_rate": 1.999907217715757e-05, + "loss": 1.2926, + "step": 4400 + }, + { + "epoch": 0.10264513552201558, + "grad_norm": 1.7517781257629395, + "learning_rate": 1.9999068744011785e-05, + "loss": 1.3769, + "step": 4401 + }, + { + "epoch": 0.1026684586611935, + "grad_norm": 1.7014862298965454, + "learning_rate": 1.9999065304526323e-05, + "loss": 1.5482, + "step": 4402 + }, + { + "epoch": 0.10269178180037142, + "grad_norm": 1.6097866296768188, + "learning_rate": 1.9999061858701195e-05, + "loss": 1.3104, + "step": 4403 + }, + { + "epoch": 0.10271510493954934, + "grad_norm": 1.8927146196365356, + "learning_rate": 1.99990584065364e-05, + "loss": 1.0695, + "step": 4404 + }, + { + "epoch": 0.10273842807872725, + "grad_norm": 1.688493251800537, + "learning_rate": 1.9999054948031937e-05, + "loss": 1.0971, + "step": 4405 + }, + { + "epoch": 0.10276175121790518, + "grad_norm": 1.8350069522857666, + "learning_rate": 1.999905148318781e-05, + "loss": 1.3597, + "step": 4406 + }, + { + "epoch": 0.10278507435708309, + "grad_norm": 1.635527491569519, + "learning_rate": 1.9999048012004026e-05, + "loss": 1.1458, + "step": 4407 + }, + { + "epoch": 0.10280839749626101, + "grad_norm": 1.957057237625122, + "learning_rate": 1.9999044534480578e-05, + "loss": 1.4101, + "step": 4408 + }, + { + "epoch": 0.10283172063543893, + "grad_norm": 1.4483978748321533, + "learning_rate": 1.9999041050617474e-05, + "loss": 1.2723, + "step": 4409 + }, + { + "epoch": 0.10285504377461685, + "grad_norm": 2.213428020477295, + "learning_rate": 1.999903756041472e-05, + "loss": 1.2912, + "step": 4410 + }, + { + "epoch": 0.10287836691379476, + "grad_norm": 1.8046005964279175, + "learning_rate": 1.9999034063872307e-05, + "loss": 1.5203, + "step": 4411 + }, + { + "epoch": 0.10290169005297269, + "grad_norm": 2.2218785285949707, + "learning_rate": 1.9999030560990248e-05, + "loss": 1.3761, + "step": 4412 + }, + { + "epoch": 0.1029250131921506, + "grad_norm": 2.170475482940674, + "learning_rate": 1.999902705176854e-05, + "loss": 1.3484, + "step": 4413 + }, + { + "epoch": 0.10294833633132852, + "grad_norm": 1.599300503730774, + "learning_rate": 1.9999023536207186e-05, + "loss": 1.3247, + "step": 4414 + }, + { + "epoch": 0.10297165947050643, + "grad_norm": 1.850488305091858, + "learning_rate": 1.999902001430619e-05, + "loss": 1.2363, + "step": 4415 + }, + { + "epoch": 0.10299498260968436, + "grad_norm": 1.4526803493499756, + "learning_rate": 1.9999016486065554e-05, + "loss": 1.4001, + "step": 4416 + }, + { + "epoch": 0.10301830574886227, + "grad_norm": 1.7001068592071533, + "learning_rate": 1.9999012951485278e-05, + "loss": 1.5615, + "step": 4417 + }, + { + "epoch": 0.10304162888804018, + "grad_norm": 1.7435797452926636, + "learning_rate": 1.999900941056536e-05, + "loss": 1.5585, + "step": 4418 + }, + { + "epoch": 0.1030649520272181, + "grad_norm": 1.8533247709274292, + "learning_rate": 1.9999005863305815e-05, + "loss": 1.6771, + "step": 4419 + }, + { + "epoch": 0.10308827516639602, + "grad_norm": 1.5894358158111572, + "learning_rate": 1.9999002309706635e-05, + "loss": 1.5284, + "step": 4420 + }, + { + "epoch": 0.10311159830557394, + "grad_norm": 2.0098137855529785, + "learning_rate": 1.9998998749767823e-05, + "loss": 1.3599, + "step": 4421 + }, + { + "epoch": 0.10313492144475185, + "grad_norm": 1.9894591569900513, + "learning_rate": 1.9998995183489385e-05, + "loss": 1.7052, + "step": 4422 + }, + { + "epoch": 0.10315824458392978, + "grad_norm": 1.7116707563400269, + "learning_rate": 1.9998991610871323e-05, + "loss": 1.3363, + "step": 4423 + }, + { + "epoch": 0.10318156772310769, + "grad_norm": 1.7851814031600952, + "learning_rate": 1.9998988031913637e-05, + "loss": 1.4181, + "step": 4424 + }, + { + "epoch": 0.10320489086228561, + "grad_norm": 1.8559414148330688, + "learning_rate": 1.9998984446616328e-05, + "loss": 1.4141, + "step": 4425 + }, + { + "epoch": 0.10322821400146352, + "grad_norm": 1.6662341356277466, + "learning_rate": 1.9998980854979405e-05, + "loss": 1.6026, + "step": 4426 + }, + { + "epoch": 0.10325153714064145, + "grad_norm": 2.0204248428344727, + "learning_rate": 1.999897725700286e-05, + "loss": 1.3882, + "step": 4427 + }, + { + "epoch": 0.10327486027981936, + "grad_norm": 1.7466933727264404, + "learning_rate": 1.9998973652686705e-05, + "loss": 1.4131, + "step": 4428 + }, + { + "epoch": 0.10329818341899728, + "grad_norm": 1.7322821617126465, + "learning_rate": 1.9998970042030938e-05, + "loss": 1.1848, + "step": 4429 + }, + { + "epoch": 0.1033215065581752, + "grad_norm": 1.723657250404358, + "learning_rate": 1.9998966425035556e-05, + "loss": 1.4047, + "step": 4430 + }, + { + "epoch": 0.10334482969735312, + "grad_norm": 2.131956100463867, + "learning_rate": 1.9998962801700573e-05, + "loss": 1.813, + "step": 4431 + }, + { + "epoch": 0.10336815283653103, + "grad_norm": 1.6857149600982666, + "learning_rate": 1.9998959172025982e-05, + "loss": 1.6333, + "step": 4432 + }, + { + "epoch": 0.10339147597570895, + "grad_norm": 2.233597755432129, + "learning_rate": 1.999895553601179e-05, + "loss": 1.4087, + "step": 4433 + }, + { + "epoch": 0.10341479911488687, + "grad_norm": 1.7040321826934814, + "learning_rate": 1.9998951893657997e-05, + "loss": 1.7103, + "step": 4434 + }, + { + "epoch": 0.10343812225406479, + "grad_norm": 1.805225133895874, + "learning_rate": 1.9998948244964606e-05, + "loss": 1.5802, + "step": 4435 + }, + { + "epoch": 0.1034614453932427, + "grad_norm": 1.5285016298294067, + "learning_rate": 1.999894458993162e-05, + "loss": 1.2591, + "step": 4436 + }, + { + "epoch": 0.10348476853242063, + "grad_norm": 1.8958289623260498, + "learning_rate": 1.9998940928559037e-05, + "loss": 1.3856, + "step": 4437 + }, + { + "epoch": 0.10350809167159854, + "grad_norm": 1.5795350074768066, + "learning_rate": 1.9998937260846866e-05, + "loss": 1.5508, + "step": 4438 + }, + { + "epoch": 0.10353141481077646, + "grad_norm": 1.894783854484558, + "learning_rate": 1.9998933586795108e-05, + "loss": 1.2985, + "step": 4439 + }, + { + "epoch": 0.10355473794995437, + "grad_norm": 1.511522650718689, + "learning_rate": 1.9998929906403762e-05, + "loss": 1.5026, + "step": 4440 + }, + { + "epoch": 0.1035780610891323, + "grad_norm": 1.7025114297866821, + "learning_rate": 1.9998926219672832e-05, + "loss": 1.5147, + "step": 4441 + }, + { + "epoch": 0.10360138422831021, + "grad_norm": 3.8708784580230713, + "learning_rate": 1.9998922526602317e-05, + "loss": 1.8936, + "step": 4442 + }, + { + "epoch": 0.10362470736748813, + "grad_norm": 1.8823548555374146, + "learning_rate": 1.9998918827192226e-05, + "loss": 1.4903, + "step": 4443 + }, + { + "epoch": 0.10364803050666604, + "grad_norm": 1.815712571144104, + "learning_rate": 1.999891512144256e-05, + "loss": 1.153, + "step": 4444 + }, + { + "epoch": 0.10367135364584396, + "grad_norm": 1.3985499143600464, + "learning_rate": 1.9998911409353313e-05, + "loss": 1.3144, + "step": 4445 + }, + { + "epoch": 0.10369467678502188, + "grad_norm": 1.7983309030532837, + "learning_rate": 1.99989076909245e-05, + "loss": 1.4602, + "step": 4446 + }, + { + "epoch": 0.10371799992419979, + "grad_norm": 1.762711763381958, + "learning_rate": 1.9998903966156114e-05, + "loss": 1.5969, + "step": 4447 + }, + { + "epoch": 0.10374132306337772, + "grad_norm": 1.8125685453414917, + "learning_rate": 1.999890023504816e-05, + "loss": 1.4237, + "step": 4448 + }, + { + "epoch": 0.10376464620255563, + "grad_norm": 1.8932796716690063, + "learning_rate": 1.9998896497600643e-05, + "loss": 1.6998, + "step": 4449 + }, + { + "epoch": 0.10378796934173355, + "grad_norm": 1.6668219566345215, + "learning_rate": 1.999889275381356e-05, + "loss": 1.4106, + "step": 4450 + }, + { + "epoch": 0.10381129248091146, + "grad_norm": 1.6609306335449219, + "learning_rate": 1.9998889003686917e-05, + "loss": 1.5416, + "step": 4451 + }, + { + "epoch": 0.10383461562008939, + "grad_norm": 2.811746597290039, + "learning_rate": 1.9998885247220716e-05, + "loss": 1.6334, + "step": 4452 + }, + { + "epoch": 0.1038579387592673, + "grad_norm": 1.5314466953277588, + "learning_rate": 1.999888148441496e-05, + "loss": 1.2161, + "step": 4453 + }, + { + "epoch": 0.10388126189844522, + "grad_norm": 2.1141018867492676, + "learning_rate": 1.9998877715269654e-05, + "loss": 1.2569, + "step": 4454 + }, + { + "epoch": 0.10390458503762313, + "grad_norm": 1.5534998178482056, + "learning_rate": 1.9998873939784792e-05, + "loss": 1.3211, + "step": 4455 + }, + { + "epoch": 0.10392790817680106, + "grad_norm": 1.6235876083374023, + "learning_rate": 1.9998870157960383e-05, + "loss": 1.2447, + "step": 4456 + }, + { + "epoch": 0.10395123131597897, + "grad_norm": 1.601912498474121, + "learning_rate": 1.999886636979643e-05, + "loss": 1.3548, + "step": 4457 + }, + { + "epoch": 0.1039745544551569, + "grad_norm": 1.9768726825714111, + "learning_rate": 1.999886257529293e-05, + "loss": 1.5343, + "step": 4458 + }, + { + "epoch": 0.1039978775943348, + "grad_norm": 1.945237159729004, + "learning_rate": 1.999885877444989e-05, + "loss": 1.3542, + "step": 4459 + }, + { + "epoch": 0.10402120073351273, + "grad_norm": 1.7455191612243652, + "learning_rate": 1.999885496726731e-05, + "loss": 1.4138, + "step": 4460 + }, + { + "epoch": 0.10404452387269064, + "grad_norm": 2.1632285118103027, + "learning_rate": 1.9998851153745193e-05, + "loss": 1.5867, + "step": 4461 + }, + { + "epoch": 0.10406784701186857, + "grad_norm": 1.6293684244155884, + "learning_rate": 1.9998847333883543e-05, + "loss": 1.2664, + "step": 4462 + }, + { + "epoch": 0.10409117015104648, + "grad_norm": 1.6983884572982788, + "learning_rate": 1.9998843507682363e-05, + "loss": 1.049, + "step": 4463 + }, + { + "epoch": 0.1041144932902244, + "grad_norm": 1.7191314697265625, + "learning_rate": 1.9998839675141653e-05, + "loss": 1.5225, + "step": 4464 + }, + { + "epoch": 0.10413781642940231, + "grad_norm": 1.8048819303512573, + "learning_rate": 1.9998835836261414e-05, + "loss": 1.4494, + "step": 4465 + }, + { + "epoch": 0.10416113956858024, + "grad_norm": 1.4836822748184204, + "learning_rate": 1.9998831991041654e-05, + "loss": 1.4242, + "step": 4466 + }, + { + "epoch": 0.10418446270775815, + "grad_norm": 1.657299518585205, + "learning_rate": 1.9998828139482368e-05, + "loss": 1.5711, + "step": 4467 + }, + { + "epoch": 0.10420778584693607, + "grad_norm": 1.707079291343689, + "learning_rate": 1.9998824281583565e-05, + "loss": 1.4206, + "step": 4468 + }, + { + "epoch": 0.10423110898611399, + "grad_norm": 1.547277569770813, + "learning_rate": 1.9998820417345243e-05, + "loss": 1.295, + "step": 4469 + }, + { + "epoch": 0.10425443212529191, + "grad_norm": 1.6435904502868652, + "learning_rate": 1.9998816546767407e-05, + "loss": 1.7703, + "step": 4470 + }, + { + "epoch": 0.10427775526446982, + "grad_norm": 2.265117645263672, + "learning_rate": 1.9998812669850062e-05, + "loss": 1.6327, + "step": 4471 + }, + { + "epoch": 0.10430107840364775, + "grad_norm": 1.6759183406829834, + "learning_rate": 1.9998808786593204e-05, + "loss": 1.3584, + "step": 4472 + }, + { + "epoch": 0.10432440154282566, + "grad_norm": 1.700777292251587, + "learning_rate": 1.999880489699684e-05, + "loss": 1.5565, + "step": 4473 + }, + { + "epoch": 0.10434772468200357, + "grad_norm": 1.345456838607788, + "learning_rate": 1.999880100106097e-05, + "loss": 1.2595, + "step": 4474 + }, + { + "epoch": 0.10437104782118149, + "grad_norm": 1.9874299764633179, + "learning_rate": 1.9998797098785598e-05, + "loss": 1.5397, + "step": 4475 + }, + { + "epoch": 0.1043943709603594, + "grad_norm": 1.6121760606765747, + "learning_rate": 1.9998793190170724e-05, + "loss": 1.2694, + "step": 4476 + }, + { + "epoch": 0.10441769409953733, + "grad_norm": 1.7067564725875854, + "learning_rate": 1.999878927521636e-05, + "loss": 1.6556, + "step": 4477 + }, + { + "epoch": 0.10444101723871524, + "grad_norm": 1.3974144458770752, + "learning_rate": 1.9998785353922493e-05, + "loss": 1.3259, + "step": 4478 + }, + { + "epoch": 0.10446434037789316, + "grad_norm": 1.415135383605957, + "learning_rate": 1.9998781426289135e-05, + "loss": 1.1619, + "step": 4479 + }, + { + "epoch": 0.10448766351707107, + "grad_norm": 1.335329294204712, + "learning_rate": 1.999877749231629e-05, + "loss": 1.3266, + "step": 4480 + }, + { + "epoch": 0.104510986656249, + "grad_norm": 1.7226165533065796, + "learning_rate": 1.9998773552003957e-05, + "loss": 1.1049, + "step": 4481 + }, + { + "epoch": 0.10453430979542691, + "grad_norm": 1.3039993047714233, + "learning_rate": 1.9998769605352137e-05, + "loss": 1.1237, + "step": 4482 + }, + { + "epoch": 0.10455763293460484, + "grad_norm": 2.0855329036712646, + "learning_rate": 1.9998765652360835e-05, + "loss": 1.4075, + "step": 4483 + }, + { + "epoch": 0.10458095607378275, + "grad_norm": 1.926064372062683, + "learning_rate": 1.9998761693030054e-05, + "loss": 1.8209, + "step": 4484 + }, + { + "epoch": 0.10460427921296067, + "grad_norm": 1.8120089769363403, + "learning_rate": 1.9998757727359797e-05, + "loss": 1.0948, + "step": 4485 + }, + { + "epoch": 0.10462760235213858, + "grad_norm": 2.193441390991211, + "learning_rate": 1.999875375535006e-05, + "loss": 1.7267, + "step": 4486 + }, + { + "epoch": 0.10465092549131651, + "grad_norm": 1.5877799987792969, + "learning_rate": 1.9998749777000856e-05, + "loss": 1.559, + "step": 4487 + }, + { + "epoch": 0.10467424863049442, + "grad_norm": 1.4523189067840576, + "learning_rate": 1.999874579231218e-05, + "loss": 1.2716, + "step": 4488 + }, + { + "epoch": 0.10469757176967234, + "grad_norm": 1.5455758571624756, + "learning_rate": 1.9998741801284038e-05, + "loss": 1.2111, + "step": 4489 + }, + { + "epoch": 0.10472089490885025, + "grad_norm": 1.694250464439392, + "learning_rate": 1.999873780391643e-05, + "loss": 1.1948, + "step": 4490 + }, + { + "epoch": 0.10474421804802818, + "grad_norm": 1.8756452798843384, + "learning_rate": 1.999873380020936e-05, + "loss": 1.1417, + "step": 4491 + }, + { + "epoch": 0.10476754118720609, + "grad_norm": 2.003406047821045, + "learning_rate": 1.999872979016283e-05, + "loss": 1.3391, + "step": 4492 + }, + { + "epoch": 0.10479086432638401, + "grad_norm": 1.6404839754104614, + "learning_rate": 1.9998725773776843e-05, + "loss": 1.5261, + "step": 4493 + }, + { + "epoch": 0.10481418746556193, + "grad_norm": 1.4287680387496948, + "learning_rate": 1.99987217510514e-05, + "loss": 1.2396, + "step": 4494 + }, + { + "epoch": 0.10483751060473985, + "grad_norm": 2.005293369293213, + "learning_rate": 1.999871772198651e-05, + "loss": 1.3859, + "step": 4495 + }, + { + "epoch": 0.10486083374391776, + "grad_norm": 1.467164397239685, + "learning_rate": 1.9998713686582164e-05, + "loss": 1.3692, + "step": 4496 + }, + { + "epoch": 0.10488415688309569, + "grad_norm": 1.782903790473938, + "learning_rate": 1.9998709644838373e-05, + "loss": 1.4374, + "step": 4497 + }, + { + "epoch": 0.1049074800222736, + "grad_norm": 1.6113072633743286, + "learning_rate": 1.9998705596755138e-05, + "loss": 1.2854, + "step": 4498 + }, + { + "epoch": 0.10493080316145152, + "grad_norm": 2.15000319480896, + "learning_rate": 1.9998701542332463e-05, + "loss": 1.2573, + "step": 4499 + }, + { + "epoch": 0.10495412630062943, + "grad_norm": 1.4802179336547852, + "learning_rate": 1.9998697481570347e-05, + "loss": 1.2542, + "step": 4500 + }, + { + "epoch": 0.10497744943980736, + "grad_norm": 1.824934482574463, + "learning_rate": 1.9998693414468793e-05, + "loss": 1.5855, + "step": 4501 + }, + { + "epoch": 0.10500077257898527, + "grad_norm": 1.978626012802124, + "learning_rate": 1.999868934102781e-05, + "loss": 1.5597, + "step": 4502 + }, + { + "epoch": 0.10502409571816318, + "grad_norm": 1.751846432685852, + "learning_rate": 1.9998685261247388e-05, + "loss": 1.5744, + "step": 4503 + }, + { + "epoch": 0.1050474188573411, + "grad_norm": 1.903283953666687, + "learning_rate": 1.999868117512754e-05, + "loss": 1.6709, + "step": 4504 + }, + { + "epoch": 0.10507074199651902, + "grad_norm": 1.7782729864120483, + "learning_rate": 1.9998677082668267e-05, + "loss": 1.1523, + "step": 4505 + }, + { + "epoch": 0.10509406513569694, + "grad_norm": 1.7197009325027466, + "learning_rate": 1.999867298386957e-05, + "loss": 0.9572, + "step": 4506 + }, + { + "epoch": 0.10511738827487485, + "grad_norm": 2.1358916759490967, + "learning_rate": 1.9998668878731452e-05, + "loss": 1.5962, + "step": 4507 + }, + { + "epoch": 0.10514071141405278, + "grad_norm": 1.4862276315689087, + "learning_rate": 1.9998664767253916e-05, + "loss": 1.3908, + "step": 4508 + }, + { + "epoch": 0.10516403455323069, + "grad_norm": 1.6323543787002563, + "learning_rate": 1.9998660649436962e-05, + "loss": 1.432, + "step": 4509 + }, + { + "epoch": 0.10518735769240861, + "grad_norm": 2.2728726863861084, + "learning_rate": 1.9998656525280596e-05, + "loss": 1.7178, + "step": 4510 + }, + { + "epoch": 0.10521068083158652, + "grad_norm": 1.7053133249282837, + "learning_rate": 1.999865239478482e-05, + "loss": 1.365, + "step": 4511 + }, + { + "epoch": 0.10523400397076445, + "grad_norm": 1.8892529010772705, + "learning_rate": 1.9998648257949633e-05, + "loss": 1.6853, + "step": 4512 + }, + { + "epoch": 0.10525732710994236, + "grad_norm": 1.7584593296051025, + "learning_rate": 1.9998644114775044e-05, + "loss": 1.1567, + "step": 4513 + }, + { + "epoch": 0.10528065024912028, + "grad_norm": 1.8040283918380737, + "learning_rate": 1.999863996526105e-05, + "loss": 1.8875, + "step": 4514 + }, + { + "epoch": 0.1053039733882982, + "grad_norm": 1.7637766599655151, + "learning_rate": 1.9998635809407653e-05, + "loss": 1.1819, + "step": 4515 + }, + { + "epoch": 0.10532729652747612, + "grad_norm": 1.8415699005126953, + "learning_rate": 1.9998631647214863e-05, + "loss": 1.2454, + "step": 4516 + }, + { + "epoch": 0.10535061966665403, + "grad_norm": 1.7374746799468994, + "learning_rate": 1.9998627478682678e-05, + "loss": 1.5977, + "step": 4517 + }, + { + "epoch": 0.10537394280583195, + "grad_norm": 1.87213134765625, + "learning_rate": 1.99986233038111e-05, + "loss": 1.4459, + "step": 4518 + }, + { + "epoch": 0.10539726594500987, + "grad_norm": 1.5321569442749023, + "learning_rate": 1.999861912260013e-05, + "loss": 1.6018, + "step": 4519 + }, + { + "epoch": 0.10542058908418779, + "grad_norm": 2.3718676567077637, + "learning_rate": 1.9998614935049774e-05, + "loss": 1.472, + "step": 4520 + }, + { + "epoch": 0.1054439122233657, + "grad_norm": 1.7300660610198975, + "learning_rate": 1.9998610741160035e-05, + "loss": 1.5318, + "step": 4521 + }, + { + "epoch": 0.10546723536254363, + "grad_norm": 1.7774510383605957, + "learning_rate": 1.9998606540930915e-05, + "loss": 1.1674, + "step": 4522 + }, + { + "epoch": 0.10549055850172154, + "grad_norm": 1.4192143678665161, + "learning_rate": 1.999860233436241e-05, + "loss": 1.2442, + "step": 4523 + }, + { + "epoch": 0.10551388164089946, + "grad_norm": 1.8033263683319092, + "learning_rate": 1.9998598121454536e-05, + "loss": 1.3566, + "step": 4524 + }, + { + "epoch": 0.10553720478007737, + "grad_norm": 1.5711365938186646, + "learning_rate": 1.9998593902207284e-05, + "loss": 1.5206, + "step": 4525 + }, + { + "epoch": 0.1055605279192553, + "grad_norm": 1.9537410736083984, + "learning_rate": 1.9998589676620664e-05, + "loss": 1.7395, + "step": 4526 + }, + { + "epoch": 0.10558385105843321, + "grad_norm": 1.5812814235687256, + "learning_rate": 1.999858544469467e-05, + "loss": 1.5238, + "step": 4527 + }, + { + "epoch": 0.10560717419761113, + "grad_norm": 1.6676138639450073, + "learning_rate": 1.9998581206429315e-05, + "loss": 1.4023, + "step": 4528 + }, + { + "epoch": 0.10563049733678904, + "grad_norm": 1.7668168544769287, + "learning_rate": 1.9998576961824597e-05, + "loss": 1.4974, + "step": 4529 + }, + { + "epoch": 0.10565382047596697, + "grad_norm": 2.2290921211242676, + "learning_rate": 1.9998572710880518e-05, + "loss": 1.3454, + "step": 4530 + }, + { + "epoch": 0.10567714361514488, + "grad_norm": 1.8207625150680542, + "learning_rate": 1.999856845359708e-05, + "loss": 1.253, + "step": 4531 + }, + { + "epoch": 0.10570046675432279, + "grad_norm": 2.1381027698516846, + "learning_rate": 1.9998564189974288e-05, + "loss": 1.3995, + "step": 4532 + }, + { + "epoch": 0.10572378989350072, + "grad_norm": 1.5824651718139648, + "learning_rate": 1.9998559920012144e-05, + "loss": 1.327, + "step": 4533 + }, + { + "epoch": 0.10574711303267863, + "grad_norm": 1.8291656970977783, + "learning_rate": 1.9998555643710653e-05, + "loss": 1.4791, + "step": 4534 + }, + { + "epoch": 0.10577043617185655, + "grad_norm": 1.7711025476455688, + "learning_rate": 1.999855136106981e-05, + "loss": 1.2384, + "step": 4535 + }, + { + "epoch": 0.10579375931103446, + "grad_norm": 1.7322211265563965, + "learning_rate": 1.9998547072089626e-05, + "loss": 1.5979, + "step": 4536 + }, + { + "epoch": 0.10581708245021239, + "grad_norm": 1.429113507270813, + "learning_rate": 1.9998542776770104e-05, + "loss": 1.1038, + "step": 4537 + }, + { + "epoch": 0.1058404055893903, + "grad_norm": 1.4199211597442627, + "learning_rate": 1.9998538475111238e-05, + "loss": 0.9931, + "step": 4538 + }, + { + "epoch": 0.10586372872856822, + "grad_norm": 1.6906683444976807, + "learning_rate": 1.9998534167113036e-05, + "loss": 1.2704, + "step": 4539 + }, + { + "epoch": 0.10588705186774613, + "grad_norm": 1.8531103134155273, + "learning_rate": 1.9998529852775506e-05, + "loss": 1.4959, + "step": 4540 + }, + { + "epoch": 0.10591037500692406, + "grad_norm": 1.4918322563171387, + "learning_rate": 1.999852553209864e-05, + "loss": 1.031, + "step": 4541 + }, + { + "epoch": 0.10593369814610197, + "grad_norm": 1.7900128364562988, + "learning_rate": 1.999852120508245e-05, + "loss": 1.3916, + "step": 4542 + }, + { + "epoch": 0.1059570212852799, + "grad_norm": 1.757381558418274, + "learning_rate": 1.9998516871726935e-05, + "loss": 1.5504, + "step": 4543 + }, + { + "epoch": 0.1059803444244578, + "grad_norm": 2.007852077484131, + "learning_rate": 1.9998512532032095e-05, + "loss": 1.2342, + "step": 4544 + }, + { + "epoch": 0.10600366756363573, + "grad_norm": 1.7031002044677734, + "learning_rate": 1.999850818599794e-05, + "loss": 1.4977, + "step": 4545 + }, + { + "epoch": 0.10602699070281364, + "grad_norm": 1.6903876066207886, + "learning_rate": 1.9998503833624463e-05, + "loss": 1.199, + "step": 4546 + }, + { + "epoch": 0.10605031384199157, + "grad_norm": 1.4657163619995117, + "learning_rate": 1.9998499474911674e-05, + "loss": 1.6284, + "step": 4547 + }, + { + "epoch": 0.10607363698116948, + "grad_norm": 1.5041929483413696, + "learning_rate": 1.9998495109859575e-05, + "loss": 1.4298, + "step": 4548 + }, + { + "epoch": 0.1060969601203474, + "grad_norm": 1.4863712787628174, + "learning_rate": 1.9998490738468166e-05, + "loss": 1.1771, + "step": 4549 + }, + { + "epoch": 0.10612028325952531, + "grad_norm": 1.8348568677902222, + "learning_rate": 1.999848636073745e-05, + "loss": 1.5684, + "step": 4550 + }, + { + "epoch": 0.10614360639870324, + "grad_norm": 1.5103323459625244, + "learning_rate": 1.9998481976667436e-05, + "loss": 1.282, + "step": 4551 + }, + { + "epoch": 0.10616692953788115, + "grad_norm": 1.5343869924545288, + "learning_rate": 1.999847758625812e-05, + "loss": 1.2171, + "step": 4552 + }, + { + "epoch": 0.10619025267705907, + "grad_norm": 1.3986709117889404, + "learning_rate": 1.9998473189509505e-05, + "loss": 1.3708, + "step": 4553 + }, + { + "epoch": 0.10621357581623699, + "grad_norm": 1.6608822345733643, + "learning_rate": 1.9998468786421598e-05, + "loss": 1.2693, + "step": 4554 + }, + { + "epoch": 0.10623689895541491, + "grad_norm": 1.467126488685608, + "learning_rate": 1.99984643769944e-05, + "loss": 1.4189, + "step": 4555 + }, + { + "epoch": 0.10626022209459282, + "grad_norm": 1.820080280303955, + "learning_rate": 1.9998459961227906e-05, + "loss": 1.5622, + "step": 4556 + }, + { + "epoch": 0.10628354523377075, + "grad_norm": 1.7327991724014282, + "learning_rate": 1.999845553912213e-05, + "loss": 1.4281, + "step": 4557 + }, + { + "epoch": 0.10630686837294866, + "grad_norm": 1.5667237043380737, + "learning_rate": 1.9998451110677073e-05, + "loss": 1.349, + "step": 4558 + }, + { + "epoch": 0.10633019151212657, + "grad_norm": 2.0479605197906494, + "learning_rate": 1.9998446675892733e-05, + "loss": 1.2973, + "step": 4559 + }, + { + "epoch": 0.10635351465130449, + "grad_norm": 1.624783992767334, + "learning_rate": 1.9998442234769116e-05, + "loss": 1.4658, + "step": 4560 + }, + { + "epoch": 0.1063768377904824, + "grad_norm": 1.7705590724945068, + "learning_rate": 1.9998437787306228e-05, + "loss": 0.8277, + "step": 4561 + }, + { + "epoch": 0.10640016092966033, + "grad_norm": 1.941250205039978, + "learning_rate": 1.9998433333504063e-05, + "loss": 1.4665, + "step": 4562 + }, + { + "epoch": 0.10642348406883824, + "grad_norm": 1.5868898630142212, + "learning_rate": 1.999842887336263e-05, + "loss": 1.6009, + "step": 4563 + }, + { + "epoch": 0.10644680720801616, + "grad_norm": 1.5634092092514038, + "learning_rate": 1.999842440688193e-05, + "loss": 1.4364, + "step": 4564 + }, + { + "epoch": 0.10647013034719408, + "grad_norm": 1.8790887594223022, + "learning_rate": 1.9998419934061968e-05, + "loss": 1.6344, + "step": 4565 + }, + { + "epoch": 0.106493453486372, + "grad_norm": 1.4845713376998901, + "learning_rate": 1.9998415454902743e-05, + "loss": 1.2816, + "step": 4566 + }, + { + "epoch": 0.10651677662554991, + "grad_norm": 1.503846287727356, + "learning_rate": 1.9998410969404264e-05, + "loss": 1.4821, + "step": 4567 + }, + { + "epoch": 0.10654009976472784, + "grad_norm": 1.9791507720947266, + "learning_rate": 1.9998406477566525e-05, + "loss": 1.3576, + "step": 4568 + }, + { + "epoch": 0.10656342290390575, + "grad_norm": 1.6551647186279297, + "learning_rate": 1.9998401979389537e-05, + "loss": 1.6536, + "step": 4569 + }, + { + "epoch": 0.10658674604308367, + "grad_norm": 1.733027458190918, + "learning_rate": 1.9998397474873298e-05, + "loss": 1.7508, + "step": 4570 + }, + { + "epoch": 0.10661006918226158, + "grad_norm": 1.4881467819213867, + "learning_rate": 1.9998392964017816e-05, + "loss": 1.2461, + "step": 4571 + }, + { + "epoch": 0.10663339232143951, + "grad_norm": 1.5950926542282104, + "learning_rate": 1.999838844682309e-05, + "loss": 1.5417, + "step": 4572 + }, + { + "epoch": 0.10665671546061742, + "grad_norm": 2.1507344245910645, + "learning_rate": 1.9998383923289116e-05, + "loss": 1.2756, + "step": 4573 + }, + { + "epoch": 0.10668003859979534, + "grad_norm": 1.5847529172897339, + "learning_rate": 1.999837939341591e-05, + "loss": 1.3398, + "step": 4574 + }, + { + "epoch": 0.10670336173897325, + "grad_norm": 1.9728766679763794, + "learning_rate": 1.999837485720347e-05, + "loss": 1.8232, + "step": 4575 + }, + { + "epoch": 0.10672668487815118, + "grad_norm": 1.8677773475646973, + "learning_rate": 1.9998370314651794e-05, + "loss": 1.4771, + "step": 4576 + }, + { + "epoch": 0.10675000801732909, + "grad_norm": 1.7102746963500977, + "learning_rate": 1.999836576576089e-05, + "loss": 1.4293, + "step": 4577 + }, + { + "epoch": 0.10677333115650701, + "grad_norm": 1.8963501453399658, + "learning_rate": 1.9998361210530762e-05, + "loss": 1.6308, + "step": 4578 + }, + { + "epoch": 0.10679665429568493, + "grad_norm": 2.2562477588653564, + "learning_rate": 1.999835664896141e-05, + "loss": 1.2972, + "step": 4579 + }, + { + "epoch": 0.10681997743486285, + "grad_norm": 1.480392575263977, + "learning_rate": 1.9998352081052834e-05, + "loss": 1.2876, + "step": 4580 + }, + { + "epoch": 0.10684330057404076, + "grad_norm": 2.29121470451355, + "learning_rate": 1.9998347506805044e-05, + "loss": 1.6237, + "step": 4581 + }, + { + "epoch": 0.10686662371321869, + "grad_norm": 1.8855741024017334, + "learning_rate": 1.9998342926218035e-05, + "loss": 1.7616, + "step": 4582 + }, + { + "epoch": 0.1068899468523966, + "grad_norm": 1.8817429542541504, + "learning_rate": 1.9998338339291815e-05, + "loss": 1.4687, + "step": 4583 + }, + { + "epoch": 0.10691326999157452, + "grad_norm": 1.6737823486328125, + "learning_rate": 1.999833374602639e-05, + "loss": 1.1106, + "step": 4584 + }, + { + "epoch": 0.10693659313075243, + "grad_norm": 1.6832067966461182, + "learning_rate": 1.9998329146421755e-05, + "loss": 1.2487, + "step": 4585 + }, + { + "epoch": 0.10695991626993036, + "grad_norm": 2.4442577362060547, + "learning_rate": 1.999832454047792e-05, + "loss": 1.6538, + "step": 4586 + }, + { + "epoch": 0.10698323940910827, + "grad_norm": 1.69062077999115, + "learning_rate": 1.9998319928194882e-05, + "loss": 1.2236, + "step": 4587 + }, + { + "epoch": 0.10700656254828618, + "grad_norm": 1.6283913850784302, + "learning_rate": 1.999831530957265e-05, + "loss": 1.2752, + "step": 4588 + }, + { + "epoch": 0.1070298856874641, + "grad_norm": 1.7547777891159058, + "learning_rate": 1.999831068461122e-05, + "loss": 1.4031, + "step": 4589 + }, + { + "epoch": 0.10705320882664202, + "grad_norm": 2.5570805072784424, + "learning_rate": 1.9998306053310598e-05, + "loss": 1.4149, + "step": 4590 + }, + { + "epoch": 0.10707653196581994, + "grad_norm": 1.416925072669983, + "learning_rate": 1.999830141567079e-05, + "loss": 1.3053, + "step": 4591 + }, + { + "epoch": 0.10709985510499785, + "grad_norm": 2.1444454193115234, + "learning_rate": 1.9998296771691796e-05, + "loss": 1.7971, + "step": 4592 + }, + { + "epoch": 0.10712317824417578, + "grad_norm": 1.529247522354126, + "learning_rate": 1.999829212137362e-05, + "loss": 1.4804, + "step": 4593 + }, + { + "epoch": 0.10714650138335369, + "grad_norm": 1.6415820121765137, + "learning_rate": 1.9998287464716264e-05, + "loss": 1.3182, + "step": 4594 + }, + { + "epoch": 0.10716982452253161, + "grad_norm": 1.848414421081543, + "learning_rate": 1.999828280171973e-05, + "loss": 1.5494, + "step": 4595 + }, + { + "epoch": 0.10719314766170952, + "grad_norm": 1.7559365034103394, + "learning_rate": 1.9998278132384024e-05, + "loss": 1.4093, + "step": 4596 + }, + { + "epoch": 0.10721647080088745, + "grad_norm": 1.9661378860473633, + "learning_rate": 1.9998273456709144e-05, + "loss": 1.4959, + "step": 4597 + }, + { + "epoch": 0.10723979394006536, + "grad_norm": 2.336165428161621, + "learning_rate": 1.99982687746951e-05, + "loss": 1.4747, + "step": 4598 + }, + { + "epoch": 0.10726311707924328, + "grad_norm": 1.5826678276062012, + "learning_rate": 1.999826408634189e-05, + "loss": 1.5984, + "step": 4599 + }, + { + "epoch": 0.1072864402184212, + "grad_norm": 1.8213845491409302, + "learning_rate": 1.9998259391649518e-05, + "loss": 1.5824, + "step": 4600 + }, + { + "epoch": 0.10730976335759912, + "grad_norm": 1.9903720617294312, + "learning_rate": 1.9998254690617985e-05, + "loss": 1.7531, + "step": 4601 + }, + { + "epoch": 0.10733308649677703, + "grad_norm": 1.8120417594909668, + "learning_rate": 1.99982499832473e-05, + "loss": 1.6241, + "step": 4602 + }, + { + "epoch": 0.10735640963595496, + "grad_norm": 1.43329656124115, + "learning_rate": 1.9998245269537455e-05, + "loss": 1.3178, + "step": 4603 + }, + { + "epoch": 0.10737973277513287, + "grad_norm": 1.7703838348388672, + "learning_rate": 1.9998240549488464e-05, + "loss": 1.5845, + "step": 4604 + }, + { + "epoch": 0.10740305591431079, + "grad_norm": 1.6270997524261475, + "learning_rate": 1.999823582310033e-05, + "loss": 1.5622, + "step": 4605 + }, + { + "epoch": 0.1074263790534887, + "grad_norm": 1.3259307146072388, + "learning_rate": 1.9998231090373048e-05, + "loss": 1.1944, + "step": 4606 + }, + { + "epoch": 0.10744970219266663, + "grad_norm": 1.5523977279663086, + "learning_rate": 1.9998226351306626e-05, + "loss": 1.5758, + "step": 4607 + }, + { + "epoch": 0.10747302533184454, + "grad_norm": 1.810379147529602, + "learning_rate": 1.9998221605901063e-05, + "loss": 1.7552, + "step": 4608 + }, + { + "epoch": 0.10749634847102246, + "grad_norm": 1.8421108722686768, + "learning_rate": 1.9998216854156367e-05, + "loss": 1.6426, + "step": 4609 + }, + { + "epoch": 0.10751967161020037, + "grad_norm": 1.9336531162261963, + "learning_rate": 1.999821209607254e-05, + "loss": 1.6727, + "step": 4610 + }, + { + "epoch": 0.1075429947493783, + "grad_norm": 1.3325319290161133, + "learning_rate": 1.9998207331649584e-05, + "loss": 1.3482, + "step": 4611 + }, + { + "epoch": 0.10756631788855621, + "grad_norm": 1.5683438777923584, + "learning_rate": 1.99982025608875e-05, + "loss": 1.4532, + "step": 4612 + }, + { + "epoch": 0.10758964102773413, + "grad_norm": 1.6363450288772583, + "learning_rate": 1.9998197783786298e-05, + "loss": 1.3097, + "step": 4613 + }, + { + "epoch": 0.10761296416691205, + "grad_norm": 1.9321645498275757, + "learning_rate": 1.999819300034597e-05, + "loss": 1.6566, + "step": 4614 + }, + { + "epoch": 0.10763628730608997, + "grad_norm": 1.6562678813934326, + "learning_rate": 1.999818821056653e-05, + "loss": 1.5087, + "step": 4615 + }, + { + "epoch": 0.10765961044526788, + "grad_norm": 1.3343039751052856, + "learning_rate": 1.9998183414447974e-05, + "loss": 1.7068, + "step": 4616 + }, + { + "epoch": 0.10768293358444579, + "grad_norm": 1.6911089420318604, + "learning_rate": 1.9998178611990308e-05, + "loss": 1.3978, + "step": 4617 + }, + { + "epoch": 0.10770625672362372, + "grad_norm": 1.6219162940979004, + "learning_rate": 1.999817380319353e-05, + "loss": 1.6076, + "step": 4618 + }, + { + "epoch": 0.10772957986280163, + "grad_norm": 1.6606857776641846, + "learning_rate": 1.9998168988057655e-05, + "loss": 1.2402, + "step": 4619 + }, + { + "epoch": 0.10775290300197955, + "grad_norm": 1.5331239700317383, + "learning_rate": 1.9998164166582672e-05, + "loss": 1.3521, + "step": 4620 + }, + { + "epoch": 0.10777622614115746, + "grad_norm": 1.4964178800582886, + "learning_rate": 1.9998159338768593e-05, + "loss": 1.242, + "step": 4621 + }, + { + "epoch": 0.10779954928033539, + "grad_norm": 1.9768437147140503, + "learning_rate": 1.9998154504615418e-05, + "loss": 1.3477, + "step": 4622 + }, + { + "epoch": 0.1078228724195133, + "grad_norm": 1.8231476545333862, + "learning_rate": 1.999814966412315e-05, + "loss": 1.8586, + "step": 4623 + }, + { + "epoch": 0.10784619555869122, + "grad_norm": 1.7720146179199219, + "learning_rate": 1.9998144817291795e-05, + "loss": 1.4332, + "step": 4624 + }, + { + "epoch": 0.10786951869786914, + "grad_norm": 1.5090407133102417, + "learning_rate": 1.999813996412135e-05, + "loss": 1.1353, + "step": 4625 + }, + { + "epoch": 0.10789284183704706, + "grad_norm": 2.024202585220337, + "learning_rate": 1.9998135104611825e-05, + "loss": 1.5498, + "step": 4626 + }, + { + "epoch": 0.10791616497622497, + "grad_norm": 1.5118862390518188, + "learning_rate": 1.999813023876322e-05, + "loss": 1.2549, + "step": 4627 + }, + { + "epoch": 0.1079394881154029, + "grad_norm": 1.607124924659729, + "learning_rate": 1.9998125366575537e-05, + "loss": 1.5745, + "step": 4628 + }, + { + "epoch": 0.1079628112545808, + "grad_norm": 2.0179083347320557, + "learning_rate": 1.999812048804878e-05, + "loss": 1.45, + "step": 4629 + }, + { + "epoch": 0.10798613439375873, + "grad_norm": 1.4259659051895142, + "learning_rate": 1.999811560318295e-05, + "loss": 1.4489, + "step": 4630 + }, + { + "epoch": 0.10800945753293664, + "grad_norm": 1.5856199264526367, + "learning_rate": 1.9998110711978055e-05, + "loss": 1.3063, + "step": 4631 + }, + { + "epoch": 0.10803278067211457, + "grad_norm": 1.8990132808685303, + "learning_rate": 1.9998105814434093e-05, + "loss": 1.4658, + "step": 4632 + }, + { + "epoch": 0.10805610381129248, + "grad_norm": 1.7485079765319824, + "learning_rate": 1.999810091055107e-05, + "loss": 1.3005, + "step": 4633 + }, + { + "epoch": 0.1080794269504704, + "grad_norm": 1.6214683055877686, + "learning_rate": 1.999809600032899e-05, + "loss": 1.1761, + "step": 4634 + }, + { + "epoch": 0.10810275008964831, + "grad_norm": 1.4445825815200806, + "learning_rate": 1.9998091083767855e-05, + "loss": 0.8278, + "step": 4635 + }, + { + "epoch": 0.10812607322882624, + "grad_norm": 1.336374044418335, + "learning_rate": 1.9998086160867663e-05, + "loss": 1.4518, + "step": 4636 + }, + { + "epoch": 0.10814939636800415, + "grad_norm": 2.1542108058929443, + "learning_rate": 1.999808123162843e-05, + "loss": 1.5429, + "step": 4637 + }, + { + "epoch": 0.10817271950718207, + "grad_norm": 1.8157985210418701, + "learning_rate": 1.9998076296050146e-05, + "loss": 1.2964, + "step": 4638 + }, + { + "epoch": 0.10819604264635999, + "grad_norm": 1.6846963167190552, + "learning_rate": 1.9998071354132818e-05, + "loss": 1.2976, + "step": 4639 + }, + { + "epoch": 0.10821936578553791, + "grad_norm": 1.6959627866744995, + "learning_rate": 1.999806640587645e-05, + "loss": 1.3066, + "step": 4640 + }, + { + "epoch": 0.10824268892471582, + "grad_norm": 2.090728998184204, + "learning_rate": 1.999806145128105e-05, + "loss": 1.4062, + "step": 4641 + }, + { + "epoch": 0.10826601206389375, + "grad_norm": 1.8047274351119995, + "learning_rate": 1.9998056490346615e-05, + "loss": 1.6949, + "step": 4642 + }, + { + "epoch": 0.10828933520307166, + "grad_norm": 2.0198192596435547, + "learning_rate": 1.999805152307315e-05, + "loss": 1.46, + "step": 4643 + }, + { + "epoch": 0.10831265834224958, + "grad_norm": 1.8486112356185913, + "learning_rate": 1.9998046549460654e-05, + "loss": 1.6438, + "step": 4644 + }, + { + "epoch": 0.10833598148142749, + "grad_norm": 1.8807770013809204, + "learning_rate": 1.9998041569509137e-05, + "loss": 1.4286, + "step": 4645 + }, + { + "epoch": 0.1083593046206054, + "grad_norm": 2.911431312561035, + "learning_rate": 1.99980365832186e-05, + "loss": 1.5008, + "step": 4646 + }, + { + "epoch": 0.10838262775978333, + "grad_norm": 1.6945289373397827, + "learning_rate": 1.9998031590589042e-05, + "loss": 1.592, + "step": 4647 + }, + { + "epoch": 0.10840595089896124, + "grad_norm": 1.5399086475372314, + "learning_rate": 1.999802659162047e-05, + "loss": 1.6958, + "step": 4648 + }, + { + "epoch": 0.10842927403813916, + "grad_norm": 1.7213293313980103, + "learning_rate": 1.999802158631289e-05, + "loss": 1.2511, + "step": 4649 + }, + { + "epoch": 0.10845259717731708, + "grad_norm": 1.6119493246078491, + "learning_rate": 1.99980165746663e-05, + "loss": 1.2312, + "step": 4650 + }, + { + "epoch": 0.108475920316495, + "grad_norm": 1.7431094646453857, + "learning_rate": 1.9998011556680703e-05, + "loss": 1.4495, + "step": 4651 + }, + { + "epoch": 0.10849924345567291, + "grad_norm": 1.5794720649719238, + "learning_rate": 1.9998006532356108e-05, + "loss": 1.1925, + "step": 4652 + }, + { + "epoch": 0.10852256659485084, + "grad_norm": 1.500409722328186, + "learning_rate": 1.9998001501692512e-05, + "loss": 1.2074, + "step": 4653 + }, + { + "epoch": 0.10854588973402875, + "grad_norm": 1.4552727937698364, + "learning_rate": 1.999799646468992e-05, + "loss": 1.2791, + "step": 4654 + }, + { + "epoch": 0.10856921287320667, + "grad_norm": 1.5714362859725952, + "learning_rate": 1.9997991421348336e-05, + "loss": 1.5333, + "step": 4655 + }, + { + "epoch": 0.10859253601238458, + "grad_norm": 1.6176605224609375, + "learning_rate": 1.9997986371667766e-05, + "loss": 1.6721, + "step": 4656 + }, + { + "epoch": 0.10861585915156251, + "grad_norm": 1.587973952293396, + "learning_rate": 1.999798131564821e-05, + "loss": 1.2112, + "step": 4657 + }, + { + "epoch": 0.10863918229074042, + "grad_norm": 1.6890060901641846, + "learning_rate": 1.9997976253289667e-05, + "loss": 1.2078, + "step": 4658 + }, + { + "epoch": 0.10866250542991834, + "grad_norm": 1.9246002435684204, + "learning_rate": 1.9997971184592147e-05, + "loss": 1.3152, + "step": 4659 + }, + { + "epoch": 0.10868582856909625, + "grad_norm": 2.5927438735961914, + "learning_rate": 1.999796610955565e-05, + "loss": 1.5726, + "step": 4660 + }, + { + "epoch": 0.10870915170827418, + "grad_norm": 1.6981041431427002, + "learning_rate": 1.999796102818018e-05, + "loss": 1.118, + "step": 4661 + }, + { + "epoch": 0.10873247484745209, + "grad_norm": 1.754831075668335, + "learning_rate": 1.999795594046574e-05, + "loss": 1.0722, + "step": 4662 + }, + { + "epoch": 0.10875579798663002, + "grad_norm": 2.376711845397949, + "learning_rate": 1.9997950846412334e-05, + "loss": 1.5271, + "step": 4663 + }, + { + "epoch": 0.10877912112580793, + "grad_norm": 1.8238438367843628, + "learning_rate": 1.9997945746019966e-05, + "loss": 1.533, + "step": 4664 + }, + { + "epoch": 0.10880244426498585, + "grad_norm": 1.9179550409317017, + "learning_rate": 1.9997940639288637e-05, + "loss": 1.535, + "step": 4665 + }, + { + "epoch": 0.10882576740416376, + "grad_norm": 2.026521682739258, + "learning_rate": 1.999793552621835e-05, + "loss": 1.3284, + "step": 4666 + }, + { + "epoch": 0.10884909054334169, + "grad_norm": 1.452511191368103, + "learning_rate": 1.999793040680911e-05, + "loss": 0.8912, + "step": 4667 + }, + { + "epoch": 0.1088724136825196, + "grad_norm": 1.7069728374481201, + "learning_rate": 1.9997925281060923e-05, + "loss": 1.5486, + "step": 4668 + }, + { + "epoch": 0.10889573682169752, + "grad_norm": 1.6261640787124634, + "learning_rate": 1.9997920148973785e-05, + "loss": 1.4506, + "step": 4669 + }, + { + "epoch": 0.10891905996087543, + "grad_norm": 1.6827213764190674, + "learning_rate": 1.9997915010547703e-05, + "loss": 1.0382, + "step": 4670 + }, + { + "epoch": 0.10894238310005336, + "grad_norm": 1.4625062942504883, + "learning_rate": 1.999790986578268e-05, + "loss": 1.4952, + "step": 4671 + }, + { + "epoch": 0.10896570623923127, + "grad_norm": 2.3439128398895264, + "learning_rate": 1.9997904714678724e-05, + "loss": 1.2131, + "step": 4672 + }, + { + "epoch": 0.10898902937840918, + "grad_norm": 1.8880949020385742, + "learning_rate": 1.9997899557235828e-05, + "loss": 1.2906, + "step": 4673 + }, + { + "epoch": 0.1090123525175871, + "grad_norm": 1.5955619812011719, + "learning_rate": 1.9997894393454008e-05, + "loss": 1.26, + "step": 4674 + }, + { + "epoch": 0.10903567565676502, + "grad_norm": 1.6589529514312744, + "learning_rate": 1.9997889223333254e-05, + "loss": 1.5532, + "step": 4675 + }, + { + "epoch": 0.10905899879594294, + "grad_norm": 1.7581021785736084, + "learning_rate": 1.999788404687358e-05, + "loss": 1.1995, + "step": 4676 + }, + { + "epoch": 0.10908232193512085, + "grad_norm": 2.3846583366394043, + "learning_rate": 1.9997878864074983e-05, + "loss": 1.2254, + "step": 4677 + }, + { + "epoch": 0.10910564507429878, + "grad_norm": 2.5479581356048584, + "learning_rate": 1.999787367493747e-05, + "loss": 1.552, + "step": 4678 + }, + { + "epoch": 0.10912896821347669, + "grad_norm": 1.7845826148986816, + "learning_rate": 1.999786847946104e-05, + "loss": 1.6222, + "step": 4679 + }, + { + "epoch": 0.10915229135265461, + "grad_norm": 1.794092059135437, + "learning_rate": 1.99978632776457e-05, + "loss": 1.0853, + "step": 4680 + }, + { + "epoch": 0.10917561449183252, + "grad_norm": 1.983144998550415, + "learning_rate": 1.9997858069491453e-05, + "loss": 1.7065, + "step": 4681 + }, + { + "epoch": 0.10919893763101045, + "grad_norm": 2.0391225814819336, + "learning_rate": 1.9997852854998302e-05, + "loss": 1.5509, + "step": 4682 + }, + { + "epoch": 0.10922226077018836, + "grad_norm": 2.037996768951416, + "learning_rate": 1.9997847634166248e-05, + "loss": 1.4287, + "step": 4683 + }, + { + "epoch": 0.10924558390936628, + "grad_norm": 2.2122488021850586, + "learning_rate": 1.9997842406995298e-05, + "loss": 1.5017, + "step": 4684 + }, + { + "epoch": 0.1092689070485442, + "grad_norm": 1.6282988786697388, + "learning_rate": 1.999783717348545e-05, + "loss": 1.7561, + "step": 4685 + }, + { + "epoch": 0.10929223018772212, + "grad_norm": 2.0643422603607178, + "learning_rate": 1.9997831933636715e-05, + "loss": 1.3793, + "step": 4686 + }, + { + "epoch": 0.10931555332690003, + "grad_norm": 1.6061822175979614, + "learning_rate": 1.999782668744909e-05, + "loss": 1.4003, + "step": 4687 + }, + { + "epoch": 0.10933887646607796, + "grad_norm": 1.6987704038619995, + "learning_rate": 1.999782143492258e-05, + "loss": 0.9222, + "step": 4688 + }, + { + "epoch": 0.10936219960525587, + "grad_norm": 1.7195863723754883, + "learning_rate": 1.9997816176057188e-05, + "loss": 1.1077, + "step": 4689 + }, + { + "epoch": 0.10938552274443379, + "grad_norm": 2.4128363132476807, + "learning_rate": 1.999781091085292e-05, + "loss": 1.5896, + "step": 4690 + }, + { + "epoch": 0.1094088458836117, + "grad_norm": 1.7924901247024536, + "learning_rate": 1.9997805639309776e-05, + "loss": 1.5602, + "step": 4691 + }, + { + "epoch": 0.10943216902278963, + "grad_norm": 1.849171757698059, + "learning_rate": 1.9997800361427763e-05, + "loss": 1.7518, + "step": 4692 + }, + { + "epoch": 0.10945549216196754, + "grad_norm": 1.589709997177124, + "learning_rate": 1.999779507720688e-05, + "loss": 1.3019, + "step": 4693 + }, + { + "epoch": 0.10947881530114546, + "grad_norm": 1.6170086860656738, + "learning_rate": 1.999778978664713e-05, + "loss": 1.3426, + "step": 4694 + }, + { + "epoch": 0.10950213844032337, + "grad_norm": 1.4563287496566772, + "learning_rate": 1.9997784489748527e-05, + "loss": 1.2167, + "step": 4695 + }, + { + "epoch": 0.1095254615795013, + "grad_norm": 1.3568425178527832, + "learning_rate": 1.999777918651106e-05, + "loss": 1.3701, + "step": 4696 + }, + { + "epoch": 0.10954878471867921, + "grad_norm": 1.3222202062606812, + "learning_rate": 1.9997773876934738e-05, + "loss": 1.2981, + "step": 4697 + }, + { + "epoch": 0.10957210785785713, + "grad_norm": 1.4239948987960815, + "learning_rate": 1.999776856101957e-05, + "loss": 1.1203, + "step": 4698 + }, + { + "epoch": 0.10959543099703505, + "grad_norm": 2.1866133213043213, + "learning_rate": 1.999776323876555e-05, + "loss": 1.5782, + "step": 4699 + }, + { + "epoch": 0.10961875413621297, + "grad_norm": 1.364729881286621, + "learning_rate": 1.9997757910172683e-05, + "loss": 1.4202, + "step": 4700 + }, + { + "epoch": 0.10964207727539088, + "grad_norm": 1.4880620241165161, + "learning_rate": 1.9997752575240978e-05, + "loss": 1.3443, + "step": 4701 + }, + { + "epoch": 0.10966540041456879, + "grad_norm": 2.045469045639038, + "learning_rate": 1.9997747233970438e-05, + "loss": 1.8308, + "step": 4702 + }, + { + "epoch": 0.10968872355374672, + "grad_norm": 1.7237162590026855, + "learning_rate": 1.999774188636106e-05, + "loss": 1.3901, + "step": 4703 + }, + { + "epoch": 0.10971204669292463, + "grad_norm": 2.113842725753784, + "learning_rate": 1.9997736532412854e-05, + "loss": 1.7517, + "step": 4704 + }, + { + "epoch": 0.10973536983210255, + "grad_norm": 1.5148341655731201, + "learning_rate": 1.9997731172125817e-05, + "loss": 1.2831, + "step": 4705 + }, + { + "epoch": 0.10975869297128046, + "grad_norm": 1.8842540979385376, + "learning_rate": 1.999772580549996e-05, + "loss": 1.4113, + "step": 4706 + }, + { + "epoch": 0.10978201611045839, + "grad_norm": 1.5884028673171997, + "learning_rate": 1.999772043253528e-05, + "loss": 1.4935, + "step": 4707 + }, + { + "epoch": 0.1098053392496363, + "grad_norm": 1.560294270515442, + "learning_rate": 1.999771505323178e-05, + "loss": 1.207, + "step": 4708 + }, + { + "epoch": 0.10982866238881422, + "grad_norm": 1.4098200798034668, + "learning_rate": 1.9997709667589473e-05, + "loss": 1.155, + "step": 4709 + }, + { + "epoch": 0.10985198552799214, + "grad_norm": 1.5581802129745483, + "learning_rate": 1.999770427560835e-05, + "loss": 1.3697, + "step": 4710 + }, + { + "epoch": 0.10987530866717006, + "grad_norm": 1.7986078262329102, + "learning_rate": 1.9997698877288423e-05, + "loss": 0.9984, + "step": 4711 + }, + { + "epoch": 0.10989863180634797, + "grad_norm": 1.4624197483062744, + "learning_rate": 1.9997693472629688e-05, + "loss": 1.3985, + "step": 4712 + }, + { + "epoch": 0.1099219549455259, + "grad_norm": 1.7539037466049194, + "learning_rate": 1.9997688061632157e-05, + "loss": 1.2829, + "step": 4713 + }, + { + "epoch": 0.1099452780847038, + "grad_norm": 1.590420126914978, + "learning_rate": 1.999768264429583e-05, + "loss": 1.249, + "step": 4714 + }, + { + "epoch": 0.10996860122388173, + "grad_norm": 2.3379104137420654, + "learning_rate": 1.999767722062071e-05, + "loss": 1.2626, + "step": 4715 + }, + { + "epoch": 0.10999192436305964, + "grad_norm": 2.0119636058807373, + "learning_rate": 1.9997671790606798e-05, + "loss": 1.4327, + "step": 4716 + }, + { + "epoch": 0.11001524750223757, + "grad_norm": 1.710679054260254, + "learning_rate": 1.99976663542541e-05, + "loss": 1.1288, + "step": 4717 + }, + { + "epoch": 0.11003857064141548, + "grad_norm": 2.4558565616607666, + "learning_rate": 1.9997660911562618e-05, + "loss": 1.4038, + "step": 4718 + }, + { + "epoch": 0.1100618937805934, + "grad_norm": 1.8955258131027222, + "learning_rate": 1.9997655462532356e-05, + "loss": 1.2128, + "step": 4719 + }, + { + "epoch": 0.11008521691977131, + "grad_norm": 2.1894257068634033, + "learning_rate": 1.999765000716332e-05, + "loss": 1.6796, + "step": 4720 + }, + { + "epoch": 0.11010854005894924, + "grad_norm": 1.8517687320709229, + "learning_rate": 1.9997644545455513e-05, + "loss": 1.4889, + "step": 4721 + }, + { + "epoch": 0.11013186319812715, + "grad_norm": 1.4544748067855835, + "learning_rate": 1.9997639077408935e-05, + "loss": 1.2158, + "step": 4722 + }, + { + "epoch": 0.11015518633730507, + "grad_norm": 1.4852479696273804, + "learning_rate": 1.999763360302359e-05, + "loss": 1.4004, + "step": 4723 + }, + { + "epoch": 0.11017850947648299, + "grad_norm": 1.441847324371338, + "learning_rate": 1.9997628122299485e-05, + "loss": 1.4372, + "step": 4724 + }, + { + "epoch": 0.11020183261566091, + "grad_norm": 1.5786794424057007, + "learning_rate": 1.9997622635236618e-05, + "loss": 1.4358, + "step": 4725 + }, + { + "epoch": 0.11022515575483882, + "grad_norm": 1.9009759426116943, + "learning_rate": 1.9997617141835e-05, + "loss": 1.3353, + "step": 4726 + }, + { + "epoch": 0.11024847889401675, + "grad_norm": 1.5824389457702637, + "learning_rate": 1.9997611642094627e-05, + "loss": 1.5705, + "step": 4727 + }, + { + "epoch": 0.11027180203319466, + "grad_norm": 2.0614171028137207, + "learning_rate": 1.9997606136015507e-05, + "loss": 1.1281, + "step": 4728 + }, + { + "epoch": 0.11029512517237258, + "grad_norm": 1.6476571559906006, + "learning_rate": 1.9997600623597643e-05, + "loss": 1.24, + "step": 4729 + }, + { + "epoch": 0.1103184483115505, + "grad_norm": 2.5238232612609863, + "learning_rate": 1.9997595104841038e-05, + "loss": 1.2169, + "step": 4730 + }, + { + "epoch": 0.1103417714507284, + "grad_norm": 1.9560441970825195, + "learning_rate": 1.9997589579745693e-05, + "loss": 1.4138, + "step": 4731 + }, + { + "epoch": 0.11036509458990633, + "grad_norm": 1.550877332687378, + "learning_rate": 1.9997584048311614e-05, + "loss": 1.1269, + "step": 4732 + }, + { + "epoch": 0.11038841772908424, + "grad_norm": 2.002683162689209, + "learning_rate": 1.9997578510538805e-05, + "loss": 1.5355, + "step": 4733 + }, + { + "epoch": 0.11041174086826216, + "grad_norm": 1.581544041633606, + "learning_rate": 1.999757296642727e-05, + "loss": 1.4729, + "step": 4734 + }, + { + "epoch": 0.11043506400744008, + "grad_norm": 2.1235954761505127, + "learning_rate": 1.999756741597701e-05, + "loss": 1.6424, + "step": 4735 + }, + { + "epoch": 0.110458387146618, + "grad_norm": 1.5193734169006348, + "learning_rate": 1.999756185918803e-05, + "loss": 1.0682, + "step": 4736 + }, + { + "epoch": 0.11048171028579591, + "grad_norm": 1.9458128213882446, + "learning_rate": 1.9997556296060332e-05, + "loss": 1.4472, + "step": 4737 + }, + { + "epoch": 0.11050503342497384, + "grad_norm": 1.8136341571807861, + "learning_rate": 1.9997550726593925e-05, + "loss": 1.7115, + "step": 4738 + }, + { + "epoch": 0.11052835656415175, + "grad_norm": 1.7098948955535889, + "learning_rate": 1.9997545150788805e-05, + "loss": 1.5029, + "step": 4739 + }, + { + "epoch": 0.11055167970332967, + "grad_norm": 1.6236376762390137, + "learning_rate": 1.999753956864498e-05, + "loss": 1.6369, + "step": 4740 + }, + { + "epoch": 0.11057500284250758, + "grad_norm": 1.7370587587356567, + "learning_rate": 1.999753398016245e-05, + "loss": 1.2514, + "step": 4741 + }, + { + "epoch": 0.11059832598168551, + "grad_norm": 2.082703113555908, + "learning_rate": 1.9997528385341223e-05, + "loss": 1.4751, + "step": 4742 + }, + { + "epoch": 0.11062164912086342, + "grad_norm": 1.9707653522491455, + "learning_rate": 1.99975227841813e-05, + "loss": 1.6404, + "step": 4743 + }, + { + "epoch": 0.11064497226004134, + "grad_norm": 1.470170259475708, + "learning_rate": 1.9997517176682687e-05, + "loss": 1.5853, + "step": 4744 + }, + { + "epoch": 0.11066829539921925, + "grad_norm": 1.8218199014663696, + "learning_rate": 1.9997511562845384e-05, + "loss": 1.2697, + "step": 4745 + }, + { + "epoch": 0.11069161853839718, + "grad_norm": 1.8697524070739746, + "learning_rate": 1.9997505942669394e-05, + "loss": 1.6328, + "step": 4746 + }, + { + "epoch": 0.11071494167757509, + "grad_norm": 1.5326578617095947, + "learning_rate": 1.999750031615473e-05, + "loss": 1.5544, + "step": 4747 + }, + { + "epoch": 0.11073826481675302, + "grad_norm": 1.7094053030014038, + "learning_rate": 1.999749468330138e-05, + "loss": 1.35, + "step": 4748 + }, + { + "epoch": 0.11076158795593093, + "grad_norm": 2.1253573894500732, + "learning_rate": 1.999748904410936e-05, + "loss": 1.4909, + "step": 4749 + }, + { + "epoch": 0.11078491109510885, + "grad_norm": 2.1014323234558105, + "learning_rate": 1.9997483398578668e-05, + "loss": 1.3446, + "step": 4750 + }, + { + "epoch": 0.11080823423428676, + "grad_norm": 1.844268560409546, + "learning_rate": 1.9997477746709308e-05, + "loss": 1.3962, + "step": 4751 + }, + { + "epoch": 0.11083155737346469, + "grad_norm": 2.438141345977783, + "learning_rate": 1.9997472088501285e-05, + "loss": 1.7026, + "step": 4752 + }, + { + "epoch": 0.1108548805126426, + "grad_norm": 1.7001752853393555, + "learning_rate": 1.9997466423954606e-05, + "loss": 1.2695, + "step": 4753 + }, + { + "epoch": 0.11087820365182052, + "grad_norm": 1.6355085372924805, + "learning_rate": 1.999746075306927e-05, + "loss": 1.0506, + "step": 4754 + }, + { + "epoch": 0.11090152679099843, + "grad_norm": 2.8396060466766357, + "learning_rate": 1.9997455075845278e-05, + "loss": 1.3554, + "step": 4755 + }, + { + "epoch": 0.11092484993017636, + "grad_norm": 1.9583638906478882, + "learning_rate": 1.999744939228264e-05, + "loss": 1.4245, + "step": 4756 + }, + { + "epoch": 0.11094817306935427, + "grad_norm": 1.8647066354751587, + "learning_rate": 1.9997443702381357e-05, + "loss": 1.6369, + "step": 4757 + }, + { + "epoch": 0.1109714962085322, + "grad_norm": 1.5789604187011719, + "learning_rate": 1.9997438006141426e-05, + "loss": 1.3321, + "step": 4758 + }, + { + "epoch": 0.1109948193477101, + "grad_norm": 1.3761906623840332, + "learning_rate": 1.9997432303562863e-05, + "loss": 0.9831, + "step": 4759 + }, + { + "epoch": 0.11101814248688802, + "grad_norm": 1.8360717296600342, + "learning_rate": 1.9997426594645664e-05, + "loss": 1.2267, + "step": 4760 + }, + { + "epoch": 0.11104146562606594, + "grad_norm": 1.647345781326294, + "learning_rate": 1.9997420879389834e-05, + "loss": 1.3699, + "step": 4761 + }, + { + "epoch": 0.11106478876524385, + "grad_norm": 1.7599635124206543, + "learning_rate": 1.9997415157795377e-05, + "loss": 1.9307, + "step": 4762 + }, + { + "epoch": 0.11108811190442178, + "grad_norm": 1.5892014503479004, + "learning_rate": 1.99974094298623e-05, + "loss": 1.4318, + "step": 4763 + }, + { + "epoch": 0.11111143504359969, + "grad_norm": 1.6504076719284058, + "learning_rate": 1.9997403695590596e-05, + "loss": 1.3844, + "step": 4764 + }, + { + "epoch": 0.11113475818277761, + "grad_norm": 2.1827478408813477, + "learning_rate": 1.9997397954980277e-05, + "loss": 1.802, + "step": 4765 + }, + { + "epoch": 0.11115808132195552, + "grad_norm": 1.9092530012130737, + "learning_rate": 1.9997392208031347e-05, + "loss": 1.6177, + "step": 4766 + }, + { + "epoch": 0.11118140446113345, + "grad_norm": 2.4833455085754395, + "learning_rate": 1.9997386454743808e-05, + "loss": 1.8023, + "step": 4767 + }, + { + "epoch": 0.11120472760031136, + "grad_norm": 1.7990853786468506, + "learning_rate": 1.9997380695117664e-05, + "loss": 1.7281, + "step": 4768 + }, + { + "epoch": 0.11122805073948928, + "grad_norm": 1.8523533344268799, + "learning_rate": 1.9997374929152915e-05, + "loss": 1.5475, + "step": 4769 + }, + { + "epoch": 0.1112513738786672, + "grad_norm": 1.3760818243026733, + "learning_rate": 1.9997369156849575e-05, + "loss": 1.3055, + "step": 4770 + }, + { + "epoch": 0.11127469701784512, + "grad_norm": 1.6081079244613647, + "learning_rate": 1.9997363378207632e-05, + "loss": 1.2354, + "step": 4771 + }, + { + "epoch": 0.11129802015702303, + "grad_norm": 1.8561193943023682, + "learning_rate": 1.9997357593227102e-05, + "loss": 1.3127, + "step": 4772 + }, + { + "epoch": 0.11132134329620096, + "grad_norm": 1.6075220108032227, + "learning_rate": 1.9997351801907984e-05, + "loss": 1.6588, + "step": 4773 + }, + { + "epoch": 0.11134466643537887, + "grad_norm": 1.7126284837722778, + "learning_rate": 1.9997346004250284e-05, + "loss": 1.4986, + "step": 4774 + }, + { + "epoch": 0.11136798957455679, + "grad_norm": 1.7078109979629517, + "learning_rate": 1.9997340200254003e-05, + "loss": 1.5173, + "step": 4775 + }, + { + "epoch": 0.1113913127137347, + "grad_norm": 1.885150671005249, + "learning_rate": 1.9997334389919144e-05, + "loss": 1.4414, + "step": 4776 + }, + { + "epoch": 0.11141463585291263, + "grad_norm": 1.6406018733978271, + "learning_rate": 1.9997328573245718e-05, + "loss": 1.1587, + "step": 4777 + }, + { + "epoch": 0.11143795899209054, + "grad_norm": 2.351262092590332, + "learning_rate": 1.999732275023372e-05, + "loss": 1.7137, + "step": 4778 + }, + { + "epoch": 0.11146128213126846, + "grad_norm": 1.8597307205200195, + "learning_rate": 1.9997316920883154e-05, + "loss": 1.2877, + "step": 4779 + }, + { + "epoch": 0.11148460527044637, + "grad_norm": 1.9467705488204956, + "learning_rate": 1.999731108519403e-05, + "loss": 1.4246, + "step": 4780 + }, + { + "epoch": 0.1115079284096243, + "grad_norm": 1.8004958629608154, + "learning_rate": 1.9997305243166347e-05, + "loss": 1.3196, + "step": 4781 + }, + { + "epoch": 0.11153125154880221, + "grad_norm": 1.4756791591644287, + "learning_rate": 1.999729939480011e-05, + "loss": 1.5004, + "step": 4782 + }, + { + "epoch": 0.11155457468798013, + "grad_norm": 1.7322580814361572, + "learning_rate": 1.9997293540095322e-05, + "loss": 1.5387, + "step": 4783 + }, + { + "epoch": 0.11157789782715805, + "grad_norm": 1.7892494201660156, + "learning_rate": 1.9997287679051992e-05, + "loss": 1.4908, + "step": 4784 + }, + { + "epoch": 0.11160122096633597, + "grad_norm": 1.6499955654144287, + "learning_rate": 1.9997281811670115e-05, + "loss": 1.1038, + "step": 4785 + }, + { + "epoch": 0.11162454410551388, + "grad_norm": 1.779866099357605, + "learning_rate": 1.99972759379497e-05, + "loss": 1.6099, + "step": 4786 + }, + { + "epoch": 0.11164786724469179, + "grad_norm": 1.6814109086990356, + "learning_rate": 1.999727005789075e-05, + "loss": 1.2583, + "step": 4787 + }, + { + "epoch": 0.11167119038386972, + "grad_norm": 2.033905506134033, + "learning_rate": 1.9997264171493266e-05, + "loss": 1.5961, + "step": 4788 + }, + { + "epoch": 0.11169451352304763, + "grad_norm": 1.79148530960083, + "learning_rate": 1.9997258278757256e-05, + "loss": 1.4551, + "step": 4789 + }, + { + "epoch": 0.11171783666222555, + "grad_norm": 2.1484973430633545, + "learning_rate": 1.9997252379682723e-05, + "loss": 1.6561, + "step": 4790 + }, + { + "epoch": 0.11174115980140346, + "grad_norm": 1.8857439756393433, + "learning_rate": 1.999724647426967e-05, + "loss": 1.8829, + "step": 4791 + }, + { + "epoch": 0.11176448294058139, + "grad_norm": 1.6438655853271484, + "learning_rate": 1.9997240562518095e-05, + "loss": 1.4009, + "step": 4792 + }, + { + "epoch": 0.1117878060797593, + "grad_norm": 1.4829570055007935, + "learning_rate": 1.999723464442801e-05, + "loss": 1.0155, + "step": 4793 + }, + { + "epoch": 0.11181112921893722, + "grad_norm": 1.896540641784668, + "learning_rate": 1.9997228719999417e-05, + "loss": 1.536, + "step": 4794 + }, + { + "epoch": 0.11183445235811514, + "grad_norm": 1.9464058876037598, + "learning_rate": 1.999722278923232e-05, + "loss": 1.3048, + "step": 4795 + }, + { + "epoch": 0.11185777549729306, + "grad_norm": 1.6867518424987793, + "learning_rate": 1.9997216852126718e-05, + "loss": 1.7278, + "step": 4796 + }, + { + "epoch": 0.11188109863647097, + "grad_norm": 1.5861499309539795, + "learning_rate": 1.9997210908682617e-05, + "loss": 1.4502, + "step": 4797 + }, + { + "epoch": 0.1119044217756489, + "grad_norm": 2.019364833831787, + "learning_rate": 1.9997204958900026e-05, + "loss": 1.3034, + "step": 4798 + }, + { + "epoch": 0.11192774491482681, + "grad_norm": 1.4557583332061768, + "learning_rate": 1.999719900277894e-05, + "loss": 1.3609, + "step": 4799 + }, + { + "epoch": 0.11195106805400473, + "grad_norm": 1.5779038667678833, + "learning_rate": 1.9997193040319368e-05, + "loss": 1.7772, + "step": 4800 + }, + { + "epoch": 0.11197439119318264, + "grad_norm": 2.1182780265808105, + "learning_rate": 1.9997187071521317e-05, + "loss": 1.7829, + "step": 4801 + }, + { + "epoch": 0.11199771433236057, + "grad_norm": 1.4608792066574097, + "learning_rate": 1.9997181096384787e-05, + "loss": 1.5513, + "step": 4802 + }, + { + "epoch": 0.11202103747153848, + "grad_norm": 1.8238307237625122, + "learning_rate": 1.9997175114909778e-05, + "loss": 1.2157, + "step": 4803 + }, + { + "epoch": 0.1120443606107164, + "grad_norm": 1.9729714393615723, + "learning_rate": 1.99971691270963e-05, + "loss": 1.3976, + "step": 4804 + }, + { + "epoch": 0.11206768374989431, + "grad_norm": 1.6585525274276733, + "learning_rate": 1.9997163132944353e-05, + "loss": 1.5639, + "step": 4805 + }, + { + "epoch": 0.11209100688907224, + "grad_norm": 1.906258463859558, + "learning_rate": 1.999715713245394e-05, + "loss": 1.4502, + "step": 4806 + }, + { + "epoch": 0.11211433002825015, + "grad_norm": 1.8018217086791992, + "learning_rate": 1.999715112562507e-05, + "loss": 1.4047, + "step": 4807 + }, + { + "epoch": 0.11213765316742808, + "grad_norm": 1.9643853902816772, + "learning_rate": 1.9997145112457744e-05, + "loss": 1.5038, + "step": 4808 + }, + { + "epoch": 0.11216097630660599, + "grad_norm": 1.5914634466171265, + "learning_rate": 1.9997139092951965e-05, + "loss": 1.4842, + "step": 4809 + }, + { + "epoch": 0.11218429944578391, + "grad_norm": 1.4541680812835693, + "learning_rate": 1.9997133067107736e-05, + "loss": 1.3954, + "step": 4810 + }, + { + "epoch": 0.11220762258496182, + "grad_norm": 1.8242120742797852, + "learning_rate": 1.9997127034925065e-05, + "loss": 1.4084, + "step": 4811 + }, + { + "epoch": 0.11223094572413975, + "grad_norm": 1.797040581703186, + "learning_rate": 1.9997120996403948e-05, + "loss": 1.5137, + "step": 4812 + }, + { + "epoch": 0.11225426886331766, + "grad_norm": 1.5651525259017944, + "learning_rate": 1.99971149515444e-05, + "loss": 1.4482, + "step": 4813 + }, + { + "epoch": 0.11227759200249558, + "grad_norm": 2.747147560119629, + "learning_rate": 1.9997108900346414e-05, + "loss": 1.349, + "step": 4814 + }, + { + "epoch": 0.1123009151416735, + "grad_norm": 1.3470970392227173, + "learning_rate": 1.999710284281e-05, + "loss": 1.0509, + "step": 4815 + }, + { + "epoch": 0.1123242382808514, + "grad_norm": 1.7725294828414917, + "learning_rate": 1.9997096778935162e-05, + "loss": 1.6337, + "step": 4816 + }, + { + "epoch": 0.11234756142002933, + "grad_norm": 1.618151068687439, + "learning_rate": 1.9997090708721897e-05, + "loss": 1.5256, + "step": 4817 + }, + { + "epoch": 0.11237088455920724, + "grad_norm": 1.4803017377853394, + "learning_rate": 1.9997084632170218e-05, + "loss": 1.2074, + "step": 4818 + }, + { + "epoch": 0.11239420769838517, + "grad_norm": 1.7909730672836304, + "learning_rate": 1.9997078549280126e-05, + "loss": 1.3639, + "step": 4819 + }, + { + "epoch": 0.11241753083756308, + "grad_norm": 2.1538288593292236, + "learning_rate": 1.9997072460051622e-05, + "loss": 1.386, + "step": 4820 + }, + { + "epoch": 0.112440853976741, + "grad_norm": 2.1644701957702637, + "learning_rate": 1.999706636448471e-05, + "loss": 1.8123, + "step": 4821 + }, + { + "epoch": 0.11246417711591891, + "grad_norm": 1.6548917293548584, + "learning_rate": 1.99970602625794e-05, + "loss": 1.5236, + "step": 4822 + }, + { + "epoch": 0.11248750025509684, + "grad_norm": 1.8485840559005737, + "learning_rate": 1.9997054154335687e-05, + "loss": 1.5656, + "step": 4823 + }, + { + "epoch": 0.11251082339427475, + "grad_norm": 1.8559378385543823, + "learning_rate": 1.9997048039753584e-05, + "loss": 1.5271, + "step": 4824 + }, + { + "epoch": 0.11253414653345267, + "grad_norm": 1.84505033493042, + "learning_rate": 1.9997041918833085e-05, + "loss": 1.6146, + "step": 4825 + }, + { + "epoch": 0.11255746967263058, + "grad_norm": 1.840195655822754, + "learning_rate": 1.99970357915742e-05, + "loss": 1.1284, + "step": 4826 + }, + { + "epoch": 0.11258079281180851, + "grad_norm": 1.8387125730514526, + "learning_rate": 1.9997029657976933e-05, + "loss": 1.6407, + "step": 4827 + }, + { + "epoch": 0.11260411595098642, + "grad_norm": 1.4782036542892456, + "learning_rate": 1.9997023518041286e-05, + "loss": 1.4918, + "step": 4828 + }, + { + "epoch": 0.11262743909016434, + "grad_norm": 1.4858899116516113, + "learning_rate": 1.9997017371767265e-05, + "loss": 0.9922, + "step": 4829 + }, + { + "epoch": 0.11265076222934226, + "grad_norm": 1.8135300874710083, + "learning_rate": 1.9997011219154873e-05, + "loss": 1.4396, + "step": 4830 + }, + { + "epoch": 0.11267408536852018, + "grad_norm": 1.5662935972213745, + "learning_rate": 1.9997005060204112e-05, + "loss": 1.3612, + "step": 4831 + }, + { + "epoch": 0.11269740850769809, + "grad_norm": 1.5332759618759155, + "learning_rate": 1.9996998894914984e-05, + "loss": 1.2752, + "step": 4832 + }, + { + "epoch": 0.11272073164687602, + "grad_norm": 1.4699090719223022, + "learning_rate": 1.9996992723287502e-05, + "loss": 1.5256, + "step": 4833 + }, + { + "epoch": 0.11274405478605393, + "grad_norm": 1.9431648254394531, + "learning_rate": 1.999698654532166e-05, + "loss": 1.3271, + "step": 4834 + }, + { + "epoch": 0.11276737792523185, + "grad_norm": 1.348463773727417, + "learning_rate": 1.999698036101747e-05, + "loss": 1.196, + "step": 4835 + }, + { + "epoch": 0.11279070106440976, + "grad_norm": 1.9276671409606934, + "learning_rate": 1.999697417037493e-05, + "loss": 1.4015, + "step": 4836 + }, + { + "epoch": 0.11281402420358769, + "grad_norm": 1.8967229127883911, + "learning_rate": 1.9996967973394046e-05, + "loss": 1.4184, + "step": 4837 + }, + { + "epoch": 0.1128373473427656, + "grad_norm": 1.8818328380584717, + "learning_rate": 1.999696177007482e-05, + "loss": 1.2718, + "step": 4838 + }, + { + "epoch": 0.11286067048194352, + "grad_norm": 1.7954262495040894, + "learning_rate": 1.999695556041726e-05, + "loss": 1.4324, + "step": 4839 + }, + { + "epoch": 0.11288399362112143, + "grad_norm": 1.7013620138168335, + "learning_rate": 1.9996949344421366e-05, + "loss": 1.2892, + "step": 4840 + }, + { + "epoch": 0.11290731676029936, + "grad_norm": 1.8514941930770874, + "learning_rate": 1.9996943122087144e-05, + "loss": 1.494, + "step": 4841 + }, + { + "epoch": 0.11293063989947727, + "grad_norm": 1.871381402015686, + "learning_rate": 1.9996936893414598e-05, + "loss": 1.6722, + "step": 4842 + }, + { + "epoch": 0.1129539630386552, + "grad_norm": 1.4767144918441772, + "learning_rate": 1.9996930658403735e-05, + "loss": 1.0865, + "step": 4843 + }, + { + "epoch": 0.1129772861778331, + "grad_norm": 2.1904797554016113, + "learning_rate": 1.999692441705455e-05, + "loss": 1.5067, + "step": 4844 + }, + { + "epoch": 0.11300060931701102, + "grad_norm": 1.3845102787017822, + "learning_rate": 1.9996918169367054e-05, + "loss": 1.4637, + "step": 4845 + }, + { + "epoch": 0.11302393245618894, + "grad_norm": 1.3896088600158691, + "learning_rate": 1.999691191534125e-05, + "loss": 1.4277, + "step": 4846 + }, + { + "epoch": 0.11304725559536685, + "grad_norm": 1.6329911947250366, + "learning_rate": 1.9996905654977143e-05, + "loss": 1.4913, + "step": 4847 + }, + { + "epoch": 0.11307057873454478, + "grad_norm": 1.9645805358886719, + "learning_rate": 1.999689938827473e-05, + "loss": 1.6053, + "step": 4848 + }, + { + "epoch": 0.11309390187372269, + "grad_norm": 1.4963576793670654, + "learning_rate": 1.9996893115234027e-05, + "loss": 1.5192, + "step": 4849 + }, + { + "epoch": 0.11311722501290061, + "grad_norm": 2.069765567779541, + "learning_rate": 1.999688683585503e-05, + "loss": 1.446, + "step": 4850 + }, + { + "epoch": 0.11314054815207852, + "grad_norm": 1.8455005884170532, + "learning_rate": 1.9996880550137742e-05, + "loss": 1.4464, + "step": 4851 + }, + { + "epoch": 0.11316387129125645, + "grad_norm": 1.8833606243133545, + "learning_rate": 1.999687425808217e-05, + "loss": 1.4146, + "step": 4852 + }, + { + "epoch": 0.11318719443043436, + "grad_norm": 2.2382304668426514, + "learning_rate": 1.999686795968832e-05, + "loss": 1.3997, + "step": 4853 + }, + { + "epoch": 0.11321051756961228, + "grad_norm": 1.5338480472564697, + "learning_rate": 1.9996861654956187e-05, + "loss": 1.2996, + "step": 4854 + }, + { + "epoch": 0.1132338407087902, + "grad_norm": 2.0787034034729004, + "learning_rate": 1.9996855343885787e-05, + "loss": 1.2946, + "step": 4855 + }, + { + "epoch": 0.11325716384796812, + "grad_norm": 1.8246303796768188, + "learning_rate": 1.9996849026477117e-05, + "loss": 1.5849, + "step": 4856 + }, + { + "epoch": 0.11328048698714603, + "grad_norm": 1.6724165678024292, + "learning_rate": 1.999684270273018e-05, + "loss": 1.2117, + "step": 4857 + }, + { + "epoch": 0.11330381012632396, + "grad_norm": 1.9172333478927612, + "learning_rate": 1.9996836372644982e-05, + "loss": 1.4549, + "step": 4858 + }, + { + "epoch": 0.11332713326550187, + "grad_norm": 2.3572545051574707, + "learning_rate": 1.999683003622153e-05, + "loss": 1.5828, + "step": 4859 + }, + { + "epoch": 0.11335045640467979, + "grad_norm": 1.7597121000289917, + "learning_rate": 1.9996823693459827e-05, + "loss": 1.3256, + "step": 4860 + }, + { + "epoch": 0.1133737795438577, + "grad_norm": 1.7418758869171143, + "learning_rate": 1.999681734435987e-05, + "loss": 1.5079, + "step": 4861 + }, + { + "epoch": 0.11339710268303563, + "grad_norm": 1.6415714025497437, + "learning_rate": 1.9996810988921675e-05, + "loss": 1.474, + "step": 4862 + }, + { + "epoch": 0.11342042582221354, + "grad_norm": 1.460545301437378, + "learning_rate": 1.9996804627145233e-05, + "loss": 1.4171, + "step": 4863 + }, + { + "epoch": 0.11344374896139146, + "grad_norm": 1.6469100713729858, + "learning_rate": 1.9996798259030555e-05, + "loss": 1.2755, + "step": 4864 + }, + { + "epoch": 0.11346707210056937, + "grad_norm": 1.7686614990234375, + "learning_rate": 1.9996791884577645e-05, + "loss": 1.3803, + "step": 4865 + }, + { + "epoch": 0.1134903952397473, + "grad_norm": 1.5787098407745361, + "learning_rate": 1.999678550378651e-05, + "loss": 1.3252, + "step": 4866 + }, + { + "epoch": 0.11351371837892521, + "grad_norm": 1.5128165483474731, + "learning_rate": 1.999677911665715e-05, + "loss": 1.3384, + "step": 4867 + }, + { + "epoch": 0.11353704151810314, + "grad_norm": 1.7517908811569214, + "learning_rate": 1.9996772723189566e-05, + "loss": 1.3678, + "step": 4868 + }, + { + "epoch": 0.11356036465728105, + "grad_norm": 2.1116812229156494, + "learning_rate": 1.999676632338377e-05, + "loss": 1.4632, + "step": 4869 + }, + { + "epoch": 0.11358368779645897, + "grad_norm": 1.730730414390564, + "learning_rate": 1.999675991723976e-05, + "loss": 1.2828, + "step": 4870 + }, + { + "epoch": 0.11360701093563688, + "grad_norm": 1.9060343503952026, + "learning_rate": 1.9996753504757538e-05, + "loss": 1.4953, + "step": 4871 + }, + { + "epoch": 0.1136303340748148, + "grad_norm": 1.9581001996994019, + "learning_rate": 1.9996747085937117e-05, + "loss": 1.6483, + "step": 4872 + }, + { + "epoch": 0.11365365721399272, + "grad_norm": 1.4604954719543457, + "learning_rate": 1.999674066077849e-05, + "loss": 1.3666, + "step": 4873 + }, + { + "epoch": 0.11367698035317063, + "grad_norm": 1.5647526979446411, + "learning_rate": 1.9996734229281672e-05, + "loss": 1.7014, + "step": 4874 + }, + { + "epoch": 0.11370030349234855, + "grad_norm": 1.5901166200637817, + "learning_rate": 1.9996727791446663e-05, + "loss": 1.564, + "step": 4875 + }, + { + "epoch": 0.11372362663152646, + "grad_norm": 1.818510890007019, + "learning_rate": 1.999672134727346e-05, + "loss": 1.6494, + "step": 4876 + }, + { + "epoch": 0.11374694977070439, + "grad_norm": 1.506116509437561, + "learning_rate": 1.9996714896762078e-05, + "loss": 1.486, + "step": 4877 + }, + { + "epoch": 0.1137702729098823, + "grad_norm": 1.6455276012420654, + "learning_rate": 1.9996708439912517e-05, + "loss": 1.4879, + "step": 4878 + }, + { + "epoch": 0.11379359604906022, + "grad_norm": 1.9927424192428589, + "learning_rate": 1.9996701976724778e-05, + "loss": 1.7139, + "step": 4879 + }, + { + "epoch": 0.11381691918823814, + "grad_norm": 1.825521469116211, + "learning_rate": 1.9996695507198867e-05, + "loss": 1.2458, + "step": 4880 + }, + { + "epoch": 0.11384024232741606, + "grad_norm": 2.3200228214263916, + "learning_rate": 1.999668903133479e-05, + "loss": 1.5115, + "step": 4881 + }, + { + "epoch": 0.11386356546659397, + "grad_norm": 2.874640464782715, + "learning_rate": 1.999668254913255e-05, + "loss": 1.2171, + "step": 4882 + }, + { + "epoch": 0.1138868886057719, + "grad_norm": 1.6035988330841064, + "learning_rate": 1.9996676060592152e-05, + "loss": 1.4418, + "step": 4883 + }, + { + "epoch": 0.11391021174494981, + "grad_norm": 1.5072420835494995, + "learning_rate": 1.9996669565713596e-05, + "loss": 1.4429, + "step": 4884 + }, + { + "epoch": 0.11393353488412773, + "grad_norm": 1.9192874431610107, + "learning_rate": 1.999666306449689e-05, + "loss": 1.5421, + "step": 4885 + }, + { + "epoch": 0.11395685802330564, + "grad_norm": 2.3463261127471924, + "learning_rate": 1.999665655694204e-05, + "loss": 1.9386, + "step": 4886 + }, + { + "epoch": 0.11398018116248357, + "grad_norm": 1.3098691701889038, + "learning_rate": 1.999665004304904e-05, + "loss": 1.5214, + "step": 4887 + }, + { + "epoch": 0.11400350430166148, + "grad_norm": 1.6770516633987427, + "learning_rate": 1.9996643522817905e-05, + "loss": 1.8164, + "step": 4888 + }, + { + "epoch": 0.1140268274408394, + "grad_norm": 1.6771339178085327, + "learning_rate": 1.999663699624864e-05, + "loss": 1.4531, + "step": 4889 + }, + { + "epoch": 0.11405015058001731, + "grad_norm": 1.4414012432098389, + "learning_rate": 1.999663046334124e-05, + "loss": 1.262, + "step": 4890 + }, + { + "epoch": 0.11407347371919524, + "grad_norm": 2.0771100521087646, + "learning_rate": 1.9996623924095714e-05, + "loss": 1.5053, + "step": 4891 + }, + { + "epoch": 0.11409679685837315, + "grad_norm": 1.397773027420044, + "learning_rate": 1.999661737851207e-05, + "loss": 1.4299, + "step": 4892 + }, + { + "epoch": 0.11412011999755108, + "grad_norm": 4.096678256988525, + "learning_rate": 1.9996610826590303e-05, + "loss": 1.276, + "step": 4893 + }, + { + "epoch": 0.11414344313672899, + "grad_norm": 2.586124897003174, + "learning_rate": 1.9996604268330424e-05, + "loss": 1.3923, + "step": 4894 + }, + { + "epoch": 0.11416676627590691, + "grad_norm": 2.3592286109924316, + "learning_rate": 1.9996597703732438e-05, + "loss": 1.5799, + "step": 4895 + }, + { + "epoch": 0.11419008941508482, + "grad_norm": 1.6324352025985718, + "learning_rate": 1.9996591132796345e-05, + "loss": 1.2847, + "step": 4896 + }, + { + "epoch": 0.11421341255426275, + "grad_norm": 1.6896082162857056, + "learning_rate": 1.999658455552215e-05, + "loss": 1.5221, + "step": 4897 + }, + { + "epoch": 0.11423673569344066, + "grad_norm": 1.6508216857910156, + "learning_rate": 1.9996577971909857e-05, + "loss": 1.094, + "step": 4898 + }, + { + "epoch": 0.11426005883261858, + "grad_norm": 2.0739083290100098, + "learning_rate": 1.9996571381959474e-05, + "loss": 1.4574, + "step": 4899 + }, + { + "epoch": 0.1142833819717965, + "grad_norm": 1.7015663385391235, + "learning_rate": 1.9996564785671e-05, + "loss": 1.506, + "step": 4900 + }, + { + "epoch": 0.1143067051109744, + "grad_norm": 1.7504394054412842, + "learning_rate": 1.9996558183044443e-05, + "loss": 1.3567, + "step": 4901 + }, + { + "epoch": 0.11433002825015233, + "grad_norm": 1.7425142526626587, + "learning_rate": 1.9996551574079802e-05, + "loss": 1.3854, + "step": 4902 + }, + { + "epoch": 0.11435335138933024, + "grad_norm": 1.5381382703781128, + "learning_rate": 1.999654495877709e-05, + "loss": 0.9359, + "step": 4903 + }, + { + "epoch": 0.11437667452850817, + "grad_norm": 1.9737110137939453, + "learning_rate": 1.9996538337136302e-05, + "loss": 1.2001, + "step": 4904 + }, + { + "epoch": 0.11439999766768608, + "grad_norm": 1.606192946434021, + "learning_rate": 1.999653170915745e-05, + "loss": 1.2032, + "step": 4905 + }, + { + "epoch": 0.114423320806864, + "grad_norm": 1.9052050113677979, + "learning_rate": 1.999652507484053e-05, + "loss": 1.3797, + "step": 4906 + }, + { + "epoch": 0.11444664394604191, + "grad_norm": 2.1226413249969482, + "learning_rate": 1.9996518434185558e-05, + "loss": 1.5129, + "step": 4907 + }, + { + "epoch": 0.11446996708521984, + "grad_norm": 1.5617464780807495, + "learning_rate": 1.9996511787192523e-05, + "loss": 1.2434, + "step": 4908 + }, + { + "epoch": 0.11449329022439775, + "grad_norm": 1.3686046600341797, + "learning_rate": 1.999650513386144e-05, + "loss": 1.1302, + "step": 4909 + }, + { + "epoch": 0.11451661336357567, + "grad_norm": 1.9747443199157715, + "learning_rate": 1.9996498474192312e-05, + "loss": 1.5502, + "step": 4910 + }, + { + "epoch": 0.11453993650275358, + "grad_norm": 1.9239708185195923, + "learning_rate": 1.999649180818514e-05, + "loss": 1.4629, + "step": 4911 + }, + { + "epoch": 0.11456325964193151, + "grad_norm": 1.6129769086837769, + "learning_rate": 1.999648513583993e-05, + "loss": 1.3739, + "step": 4912 + }, + { + "epoch": 0.11458658278110942, + "grad_norm": 1.641762614250183, + "learning_rate": 1.9996478457156682e-05, + "loss": 1.4724, + "step": 4913 + }, + { + "epoch": 0.11460990592028734, + "grad_norm": 1.5708448886871338, + "learning_rate": 1.999647177213541e-05, + "loss": 1.4224, + "step": 4914 + }, + { + "epoch": 0.11463322905946526, + "grad_norm": 1.589847207069397, + "learning_rate": 1.999646508077611e-05, + "loss": 1.7322, + "step": 4915 + }, + { + "epoch": 0.11465655219864318, + "grad_norm": 1.799497365951538, + "learning_rate": 1.9996458383078788e-05, + "loss": 1.7415, + "step": 4916 + }, + { + "epoch": 0.11467987533782109, + "grad_norm": 1.55296790599823, + "learning_rate": 1.999645167904345e-05, + "loss": 1.4574, + "step": 4917 + }, + { + "epoch": 0.11470319847699902, + "grad_norm": 2.3322129249572754, + "learning_rate": 1.9996444968670098e-05, + "loss": 1.4262, + "step": 4918 + }, + { + "epoch": 0.11472652161617693, + "grad_norm": 1.506402611732483, + "learning_rate": 1.999643825195874e-05, + "loss": 1.2566, + "step": 4919 + }, + { + "epoch": 0.11474984475535485, + "grad_norm": 1.5143823623657227, + "learning_rate": 1.9996431528909377e-05, + "loss": 1.5797, + "step": 4920 + }, + { + "epoch": 0.11477316789453276, + "grad_norm": 2.1306798458099365, + "learning_rate": 1.9996424799522015e-05, + "loss": 1.7044, + "step": 4921 + }, + { + "epoch": 0.11479649103371069, + "grad_norm": 1.8718247413635254, + "learning_rate": 1.9996418063796654e-05, + "loss": 1.4831, + "step": 4922 + }, + { + "epoch": 0.1148198141728886, + "grad_norm": 1.7717846632003784, + "learning_rate": 1.9996411321733302e-05, + "loss": 1.4004, + "step": 4923 + }, + { + "epoch": 0.11484313731206652, + "grad_norm": 1.2827835083007812, + "learning_rate": 1.999640457333196e-05, + "loss": 1.2938, + "step": 4924 + }, + { + "epoch": 0.11486646045124443, + "grad_norm": 1.321552038192749, + "learning_rate": 1.9996397818592643e-05, + "loss": 1.3143, + "step": 4925 + }, + { + "epoch": 0.11488978359042236, + "grad_norm": 2.193957567214966, + "learning_rate": 1.9996391057515342e-05, + "loss": 1.2899, + "step": 4926 + }, + { + "epoch": 0.11491310672960027, + "grad_norm": 1.7584928274154663, + "learning_rate": 1.9996384290100067e-05, + "loss": 1.5271, + "step": 4927 + }, + { + "epoch": 0.1149364298687782, + "grad_norm": 1.6716281175613403, + "learning_rate": 1.9996377516346823e-05, + "loss": 1.4389, + "step": 4928 + }, + { + "epoch": 0.1149597530079561, + "grad_norm": 2.298614740371704, + "learning_rate": 1.9996370736255612e-05, + "loss": 1.3807, + "step": 4929 + }, + { + "epoch": 0.11498307614713402, + "grad_norm": 2.030890464782715, + "learning_rate": 1.9996363949826443e-05, + "loss": 1.2514, + "step": 4930 + }, + { + "epoch": 0.11500639928631194, + "grad_norm": 1.8294849395751953, + "learning_rate": 1.9996357157059312e-05, + "loss": 1.5073, + "step": 4931 + }, + { + "epoch": 0.11502972242548985, + "grad_norm": 1.556658148765564, + "learning_rate": 1.999635035795423e-05, + "loss": 1.4253, + "step": 4932 + }, + { + "epoch": 0.11505304556466778, + "grad_norm": 1.7101117372512817, + "learning_rate": 1.99963435525112e-05, + "loss": 1.6612, + "step": 4933 + }, + { + "epoch": 0.11507636870384569, + "grad_norm": 2.06040096282959, + "learning_rate": 1.9996336740730226e-05, + "loss": 1.3494, + "step": 4934 + }, + { + "epoch": 0.11509969184302361, + "grad_norm": 1.6833947896957397, + "learning_rate": 1.999632992261131e-05, + "loss": 1.3294, + "step": 4935 + }, + { + "epoch": 0.11512301498220152, + "grad_norm": 1.780799388885498, + "learning_rate": 1.999632309815446e-05, + "loss": 1.3288, + "step": 4936 + }, + { + "epoch": 0.11514633812137945, + "grad_norm": 1.611113429069519, + "learning_rate": 1.9996316267359682e-05, + "loss": 1.483, + "step": 4937 + }, + { + "epoch": 0.11516966126055736, + "grad_norm": 1.5236009359359741, + "learning_rate": 1.9996309430226972e-05, + "loss": 1.5499, + "step": 4938 + }, + { + "epoch": 0.11519298439973528, + "grad_norm": 1.7466269731521606, + "learning_rate": 1.999630258675634e-05, + "loss": 1.4186, + "step": 4939 + }, + { + "epoch": 0.1152163075389132, + "grad_norm": 1.5798134803771973, + "learning_rate": 1.9996295736947792e-05, + "loss": 1.0816, + "step": 4940 + }, + { + "epoch": 0.11523963067809112, + "grad_norm": 1.6026114225387573, + "learning_rate": 1.9996288880801326e-05, + "loss": 1.4924, + "step": 4941 + }, + { + "epoch": 0.11526295381726903, + "grad_norm": 1.985709309577942, + "learning_rate": 1.9996282018316956e-05, + "loss": 1.5669, + "step": 4942 + }, + { + "epoch": 0.11528627695644696, + "grad_norm": 1.537269115447998, + "learning_rate": 1.9996275149494676e-05, + "loss": 1.4639, + "step": 4943 + }, + { + "epoch": 0.11530960009562487, + "grad_norm": 2.2672007083892822, + "learning_rate": 1.99962682743345e-05, + "loss": 1.7857, + "step": 4944 + }, + { + "epoch": 0.11533292323480279, + "grad_norm": 1.6579115390777588, + "learning_rate": 1.9996261392836422e-05, + "loss": 1.4009, + "step": 4945 + }, + { + "epoch": 0.1153562463739807, + "grad_norm": 1.4220229387283325, + "learning_rate": 1.9996254505000455e-05, + "loss": 1.4176, + "step": 4946 + }, + { + "epoch": 0.11537956951315863, + "grad_norm": 2.559842586517334, + "learning_rate": 1.99962476108266e-05, + "loss": 1.7005, + "step": 4947 + }, + { + "epoch": 0.11540289265233654, + "grad_norm": 2.431316614151001, + "learning_rate": 1.999624071031486e-05, + "loss": 1.243, + "step": 4948 + }, + { + "epoch": 0.11542621579151446, + "grad_norm": 2.263580799102783, + "learning_rate": 1.9996233803465242e-05, + "loss": 1.1228, + "step": 4949 + }, + { + "epoch": 0.11544953893069237, + "grad_norm": 2.068286895751953, + "learning_rate": 1.999622689027775e-05, + "loss": 1.3652, + "step": 4950 + }, + { + "epoch": 0.1154728620698703, + "grad_norm": 1.6092827320098877, + "learning_rate": 1.9996219970752387e-05, + "loss": 0.9919, + "step": 4951 + }, + { + "epoch": 0.11549618520904821, + "grad_norm": 1.8321871757507324, + "learning_rate": 1.999621304488916e-05, + "loss": 1.3283, + "step": 4952 + }, + { + "epoch": 0.11551950834822614, + "grad_norm": 1.6613811254501343, + "learning_rate": 1.9996206112688067e-05, + "loss": 1.281, + "step": 4953 + }, + { + "epoch": 0.11554283148740405, + "grad_norm": 1.8060203790664673, + "learning_rate": 1.999619917414912e-05, + "loss": 1.4334, + "step": 4954 + }, + { + "epoch": 0.11556615462658197, + "grad_norm": 1.5202760696411133, + "learning_rate": 1.9996192229272318e-05, + "loss": 1.0427, + "step": 4955 + }, + { + "epoch": 0.11558947776575988, + "grad_norm": 1.6206039190292358, + "learning_rate": 1.9996185278057673e-05, + "loss": 1.6303, + "step": 4956 + }, + { + "epoch": 0.1156128009049378, + "grad_norm": 1.7954822778701782, + "learning_rate": 1.999617832050518e-05, + "loss": 1.5478, + "step": 4957 + }, + { + "epoch": 0.11563612404411572, + "grad_norm": 2.0373642444610596, + "learning_rate": 1.9996171356614848e-05, + "loss": 1.8687, + "step": 4958 + }, + { + "epoch": 0.11565944718329363, + "grad_norm": 1.672339916229248, + "learning_rate": 1.9996164386386684e-05, + "loss": 1.8667, + "step": 4959 + }, + { + "epoch": 0.11568277032247155, + "grad_norm": 1.9105110168457031, + "learning_rate": 1.9996157409820682e-05, + "loss": 1.2615, + "step": 4960 + }, + { + "epoch": 0.11570609346164946, + "grad_norm": 1.501523494720459, + "learning_rate": 1.999615042691686e-05, + "loss": 1.3254, + "step": 4961 + }, + { + "epoch": 0.11572941660082739, + "grad_norm": 1.6732221841812134, + "learning_rate": 1.9996143437675216e-05, + "loss": 1.4104, + "step": 4962 + }, + { + "epoch": 0.1157527397400053, + "grad_norm": 1.6094794273376465, + "learning_rate": 1.9996136442095753e-05, + "loss": 1.4193, + "step": 4963 + }, + { + "epoch": 0.11577606287918323, + "grad_norm": 1.7315895557403564, + "learning_rate": 1.9996129440178475e-05, + "loss": 1.5297, + "step": 4964 + }, + { + "epoch": 0.11579938601836114, + "grad_norm": 1.7027325630187988, + "learning_rate": 1.999612243192339e-05, + "loss": 1.6268, + "step": 4965 + }, + { + "epoch": 0.11582270915753906, + "grad_norm": 1.5824599266052246, + "learning_rate": 1.9996115417330505e-05, + "loss": 1.3822, + "step": 4966 + }, + { + "epoch": 0.11584603229671697, + "grad_norm": 1.8605552911758423, + "learning_rate": 1.9996108396399813e-05, + "loss": 1.5074, + "step": 4967 + }, + { + "epoch": 0.1158693554358949, + "grad_norm": 1.8744456768035889, + "learning_rate": 1.999610136913133e-05, + "loss": 1.5779, + "step": 4968 + }, + { + "epoch": 0.11589267857507281, + "grad_norm": 1.5902279615402222, + "learning_rate": 1.9996094335525055e-05, + "loss": 1.6401, + "step": 4969 + }, + { + "epoch": 0.11591600171425073, + "grad_norm": 1.769426941871643, + "learning_rate": 1.9996087295580998e-05, + "loss": 1.3359, + "step": 4970 + }, + { + "epoch": 0.11593932485342864, + "grad_norm": 1.569571852684021, + "learning_rate": 1.9996080249299156e-05, + "loss": 1.703, + "step": 4971 + }, + { + "epoch": 0.11596264799260657, + "grad_norm": 1.530867338180542, + "learning_rate": 1.9996073196679535e-05, + "loss": 1.3807, + "step": 4972 + }, + { + "epoch": 0.11598597113178448, + "grad_norm": 1.6897547245025635, + "learning_rate": 1.9996066137722143e-05, + "loss": 1.3948, + "step": 4973 + }, + { + "epoch": 0.1160092942709624, + "grad_norm": 1.6676502227783203, + "learning_rate": 1.9996059072426983e-05, + "loss": 1.304, + "step": 4974 + }, + { + "epoch": 0.11603261741014032, + "grad_norm": 1.939272403717041, + "learning_rate": 1.9996052000794058e-05, + "loss": 1.7275, + "step": 4975 + }, + { + "epoch": 0.11605594054931824, + "grad_norm": 1.8904296159744263, + "learning_rate": 1.9996044922823375e-05, + "loss": 1.6057, + "step": 4976 + }, + { + "epoch": 0.11607926368849615, + "grad_norm": 1.7910157442092896, + "learning_rate": 1.9996037838514937e-05, + "loss": 1.5596, + "step": 4977 + }, + { + "epoch": 0.11610258682767408, + "grad_norm": 1.6389763355255127, + "learning_rate": 1.9996030747868745e-05, + "loss": 1.6748, + "step": 4978 + }, + { + "epoch": 0.11612590996685199, + "grad_norm": 1.6676112413406372, + "learning_rate": 1.9996023650884812e-05, + "loss": 1.7798, + "step": 4979 + }, + { + "epoch": 0.11614923310602991, + "grad_norm": 2.0701849460601807, + "learning_rate": 1.9996016547563134e-05, + "loss": 1.6253, + "step": 4980 + }, + { + "epoch": 0.11617255624520782, + "grad_norm": 1.6076871156692505, + "learning_rate": 1.9996009437903722e-05, + "loss": 1.605, + "step": 4981 + }, + { + "epoch": 0.11619587938438575, + "grad_norm": 1.608145833015442, + "learning_rate": 1.9996002321906576e-05, + "loss": 1.1783, + "step": 4982 + }, + { + "epoch": 0.11621920252356366, + "grad_norm": 1.4932103157043457, + "learning_rate": 1.9995995199571702e-05, + "loss": 1.2725, + "step": 4983 + }, + { + "epoch": 0.11624252566274158, + "grad_norm": 1.9273570775985718, + "learning_rate": 1.9995988070899104e-05, + "loss": 1.123, + "step": 4984 + }, + { + "epoch": 0.1162658488019195, + "grad_norm": 1.7300347089767456, + "learning_rate": 1.999598093588879e-05, + "loss": 1.5293, + "step": 4985 + }, + { + "epoch": 0.1162891719410974, + "grad_norm": 1.4431848526000977, + "learning_rate": 1.9995973794540756e-05, + "loss": 1.4358, + "step": 4986 + }, + { + "epoch": 0.11631249508027533, + "grad_norm": 2.3643648624420166, + "learning_rate": 1.999596664685502e-05, + "loss": 1.5636, + "step": 4987 + }, + { + "epoch": 0.11633581821945324, + "grad_norm": 1.670784592628479, + "learning_rate": 1.9995959492831573e-05, + "loss": 1.6598, + "step": 4988 + }, + { + "epoch": 0.11635914135863117, + "grad_norm": 2.2506015300750732, + "learning_rate": 1.9995952332470425e-05, + "loss": 1.3849, + "step": 4989 + }, + { + "epoch": 0.11638246449780908, + "grad_norm": 1.7601280212402344, + "learning_rate": 1.999594516577158e-05, + "loss": 1.6713, + "step": 4990 + }, + { + "epoch": 0.116405787636987, + "grad_norm": 1.5968663692474365, + "learning_rate": 1.999593799273505e-05, + "loss": 1.569, + "step": 4991 + }, + { + "epoch": 0.11642911077616491, + "grad_norm": 1.2882698774337769, + "learning_rate": 1.9995930813360828e-05, + "loss": 1.397, + "step": 4992 + }, + { + "epoch": 0.11645243391534284, + "grad_norm": 2.1098101139068604, + "learning_rate": 1.9995923627648923e-05, + "loss": 1.4164, + "step": 4993 + }, + { + "epoch": 0.11647575705452075, + "grad_norm": 1.6225732564926147, + "learning_rate": 1.999591643559934e-05, + "loss": 1.0269, + "step": 4994 + }, + { + "epoch": 0.11649908019369867, + "grad_norm": 1.720001220703125, + "learning_rate": 1.9995909237212086e-05, + "loss": 1.4198, + "step": 4995 + }, + { + "epoch": 0.11652240333287658, + "grad_norm": 1.8598988056182861, + "learning_rate": 1.999590203248716e-05, + "loss": 1.1092, + "step": 4996 + }, + { + "epoch": 0.11654572647205451, + "grad_norm": 2.120917320251465, + "learning_rate": 1.9995894821424576e-05, + "loss": 1.2088, + "step": 4997 + }, + { + "epoch": 0.11656904961123242, + "grad_norm": 1.8014084100723267, + "learning_rate": 1.9995887604024325e-05, + "loss": 1.6979, + "step": 4998 + }, + { + "epoch": 0.11659237275041034, + "grad_norm": 1.7007758617401123, + "learning_rate": 1.9995880380286424e-05, + "loss": 1.268, + "step": 4999 + }, + { + "epoch": 0.11661569588958826, + "grad_norm": 1.697314739227295, + "learning_rate": 1.9995873150210867e-05, + "loss": 1.4383, + "step": 5000 + }, + { + "epoch": 0.11663901902876618, + "grad_norm": 1.3730716705322266, + "learning_rate": 1.999586591379767e-05, + "loss": 1.4153, + "step": 5001 + }, + { + "epoch": 0.11666234216794409, + "grad_norm": 2.5582022666931152, + "learning_rate": 1.9995858671046827e-05, + "loss": 1.4287, + "step": 5002 + }, + { + "epoch": 0.11668566530712202, + "grad_norm": 1.8424543142318726, + "learning_rate": 1.999585142195835e-05, + "loss": 1.597, + "step": 5003 + }, + { + "epoch": 0.11670898844629993, + "grad_norm": 1.3542181253433228, + "learning_rate": 1.9995844166532237e-05, + "loss": 1.332, + "step": 5004 + }, + { + "epoch": 0.11673231158547785, + "grad_norm": 1.617600440979004, + "learning_rate": 1.99958369047685e-05, + "loss": 1.2476, + "step": 5005 + }, + { + "epoch": 0.11675563472465576, + "grad_norm": 3.108065366744995, + "learning_rate": 1.999582963666714e-05, + "loss": 1.3547, + "step": 5006 + }, + { + "epoch": 0.11677895786383369, + "grad_norm": 1.577943205833435, + "learning_rate": 1.9995822362228157e-05, + "loss": 1.377, + "step": 5007 + }, + { + "epoch": 0.1168022810030116, + "grad_norm": 1.4541665315628052, + "learning_rate": 1.9995815081451563e-05, + "loss": 1.1261, + "step": 5008 + }, + { + "epoch": 0.11682560414218952, + "grad_norm": 1.7075183391571045, + "learning_rate": 1.9995807794337362e-05, + "loss": 1.4891, + "step": 5009 + }, + { + "epoch": 0.11684892728136743, + "grad_norm": 1.8407599925994873, + "learning_rate": 1.9995800500885557e-05, + "loss": 1.3477, + "step": 5010 + }, + { + "epoch": 0.11687225042054536, + "grad_norm": 1.8975273370742798, + "learning_rate": 1.999579320109615e-05, + "loss": 1.5896, + "step": 5011 + }, + { + "epoch": 0.11689557355972327, + "grad_norm": 1.623226523399353, + "learning_rate": 1.9995785894969145e-05, + "loss": 1.3282, + "step": 5012 + }, + { + "epoch": 0.1169188966989012, + "grad_norm": 2.066908121109009, + "learning_rate": 1.9995778582504553e-05, + "loss": 1.7712, + "step": 5013 + }, + { + "epoch": 0.1169422198380791, + "grad_norm": 2.7155771255493164, + "learning_rate": 1.9995771263702376e-05, + "loss": 1.3274, + "step": 5014 + }, + { + "epoch": 0.11696554297725702, + "grad_norm": 1.5611293315887451, + "learning_rate": 1.9995763938562614e-05, + "loss": 1.3813, + "step": 5015 + }, + { + "epoch": 0.11698886611643494, + "grad_norm": 1.651684284210205, + "learning_rate": 1.9995756607085273e-05, + "loss": 1.4984, + "step": 5016 + }, + { + "epoch": 0.11701218925561285, + "grad_norm": 1.7349296808242798, + "learning_rate": 1.9995749269270364e-05, + "loss": 1.4722, + "step": 5017 + }, + { + "epoch": 0.11703551239479078, + "grad_norm": 1.508918285369873, + "learning_rate": 1.999574192511789e-05, + "loss": 1.6077, + "step": 5018 + }, + { + "epoch": 0.11705883553396869, + "grad_norm": 1.7078007459640503, + "learning_rate": 1.999573457462785e-05, + "loss": 1.1514, + "step": 5019 + }, + { + "epoch": 0.11708215867314661, + "grad_norm": 1.7976502180099487, + "learning_rate": 1.999572721780025e-05, + "loss": 1.3914, + "step": 5020 + }, + { + "epoch": 0.11710548181232452, + "grad_norm": 1.448433756828308, + "learning_rate": 1.99957198546351e-05, + "loss": 1.5623, + "step": 5021 + }, + { + "epoch": 0.11712880495150245, + "grad_norm": 1.7487640380859375, + "learning_rate": 1.99957124851324e-05, + "loss": 1.6158, + "step": 5022 + }, + { + "epoch": 0.11715212809068036, + "grad_norm": 1.7348347902297974, + "learning_rate": 1.9995705109292152e-05, + "loss": 1.4923, + "step": 5023 + }, + { + "epoch": 0.11717545122985829, + "grad_norm": 1.7495927810668945, + "learning_rate": 1.999569772711437e-05, + "loss": 1.6697, + "step": 5024 + }, + { + "epoch": 0.1171987743690362, + "grad_norm": 1.8108551502227783, + "learning_rate": 1.9995690338599054e-05, + "loss": 1.4447, + "step": 5025 + }, + { + "epoch": 0.11722209750821412, + "grad_norm": 1.8381212949752808, + "learning_rate": 1.9995682943746203e-05, + "loss": 1.612, + "step": 5026 + }, + { + "epoch": 0.11724542064739203, + "grad_norm": 1.7819368839263916, + "learning_rate": 1.9995675542555832e-05, + "loss": 1.1363, + "step": 5027 + }, + { + "epoch": 0.11726874378656996, + "grad_norm": 1.6602860689163208, + "learning_rate": 1.9995668135027936e-05, + "loss": 1.252, + "step": 5028 + }, + { + "epoch": 0.11729206692574787, + "grad_norm": 1.930919885635376, + "learning_rate": 1.9995660721162523e-05, + "loss": 1.2841, + "step": 5029 + }, + { + "epoch": 0.11731539006492579, + "grad_norm": 1.5136758089065552, + "learning_rate": 1.9995653300959602e-05, + "loss": 1.2137, + "step": 5030 + }, + { + "epoch": 0.1173387132041037, + "grad_norm": 1.8446388244628906, + "learning_rate": 1.9995645874419174e-05, + "loss": 1.5271, + "step": 5031 + }, + { + "epoch": 0.11736203634328163, + "grad_norm": 1.893015742301941, + "learning_rate": 1.9995638441541243e-05, + "loss": 1.4715, + "step": 5032 + }, + { + "epoch": 0.11738535948245954, + "grad_norm": 1.5898796319961548, + "learning_rate": 1.9995631002325814e-05, + "loss": 1.7304, + "step": 5033 + }, + { + "epoch": 0.11740868262163746, + "grad_norm": 1.6934348344802856, + "learning_rate": 1.9995623556772895e-05, + "loss": 1.3509, + "step": 5034 + }, + { + "epoch": 0.11743200576081538, + "grad_norm": 1.8448076248168945, + "learning_rate": 1.9995616104882485e-05, + "loss": 1.651, + "step": 5035 + }, + { + "epoch": 0.1174553288999933, + "grad_norm": 1.6741981506347656, + "learning_rate": 1.9995608646654595e-05, + "loss": 1.4272, + "step": 5036 + }, + { + "epoch": 0.11747865203917121, + "grad_norm": 1.569404125213623, + "learning_rate": 1.9995601182089225e-05, + "loss": 1.24, + "step": 5037 + }, + { + "epoch": 0.11750197517834914, + "grad_norm": 1.7793101072311401, + "learning_rate": 1.9995593711186382e-05, + "loss": 1.6022, + "step": 5038 + }, + { + "epoch": 0.11752529831752705, + "grad_norm": 1.755360722541809, + "learning_rate": 1.999558623394607e-05, + "loss": 1.4453, + "step": 5039 + }, + { + "epoch": 0.11754862145670497, + "grad_norm": 1.8099433183670044, + "learning_rate": 1.9995578750368296e-05, + "loss": 1.4977, + "step": 5040 + }, + { + "epoch": 0.11757194459588288, + "grad_norm": 1.7078289985656738, + "learning_rate": 1.9995571260453056e-05, + "loss": 1.3513, + "step": 5041 + }, + { + "epoch": 0.11759526773506081, + "grad_norm": 1.6589043140411377, + "learning_rate": 1.9995563764200366e-05, + "loss": 1.5807, + "step": 5042 + }, + { + "epoch": 0.11761859087423872, + "grad_norm": 1.5111606121063232, + "learning_rate": 1.9995556261610227e-05, + "loss": 1.222, + "step": 5043 + }, + { + "epoch": 0.11764191401341663, + "grad_norm": 1.7153401374816895, + "learning_rate": 1.9995548752682645e-05, + "loss": 1.4885, + "step": 5044 + }, + { + "epoch": 0.11766523715259455, + "grad_norm": 1.4604908227920532, + "learning_rate": 1.9995541237417617e-05, + "loss": 1.3504, + "step": 5045 + }, + { + "epoch": 0.11768856029177246, + "grad_norm": 1.9099212884902954, + "learning_rate": 1.9995533715815156e-05, + "loss": 1.5163, + "step": 5046 + }, + { + "epoch": 0.11771188343095039, + "grad_norm": 1.5202598571777344, + "learning_rate": 1.999552618787527e-05, + "loss": 1.2716, + "step": 5047 + }, + { + "epoch": 0.1177352065701283, + "grad_norm": 1.5597397089004517, + "learning_rate": 1.999551865359795e-05, + "loss": 1.333, + "step": 5048 + }, + { + "epoch": 0.11775852970930623, + "grad_norm": 1.9359806776046753, + "learning_rate": 1.999551111298321e-05, + "loss": 1.8203, + "step": 5049 + }, + { + "epoch": 0.11778185284848414, + "grad_norm": 1.762131690979004, + "learning_rate": 1.9995503566031055e-05, + "loss": 1.2897, + "step": 5050 + }, + { + "epoch": 0.11780517598766206, + "grad_norm": 1.976355791091919, + "learning_rate": 1.9995496012741487e-05, + "loss": 1.6226, + "step": 5051 + }, + { + "epoch": 0.11782849912683997, + "grad_norm": 1.5695573091506958, + "learning_rate": 1.9995488453114514e-05, + "loss": 1.4698, + "step": 5052 + }, + { + "epoch": 0.1178518222660179, + "grad_norm": 1.871893286705017, + "learning_rate": 1.9995480887150136e-05, + "loss": 1.4913, + "step": 5053 + }, + { + "epoch": 0.11787514540519581, + "grad_norm": 2.0343329906463623, + "learning_rate": 1.9995473314848365e-05, + "loss": 1.3842, + "step": 5054 + }, + { + "epoch": 0.11789846854437373, + "grad_norm": 1.577577829360962, + "learning_rate": 1.99954657362092e-05, + "loss": 1.1719, + "step": 5055 + }, + { + "epoch": 0.11792179168355164, + "grad_norm": 1.5345379114151, + "learning_rate": 1.9995458151232644e-05, + "loss": 1.0927, + "step": 5056 + }, + { + "epoch": 0.11794511482272957, + "grad_norm": 1.613741397857666, + "learning_rate": 1.9995450559918707e-05, + "loss": 1.5349, + "step": 5057 + }, + { + "epoch": 0.11796843796190748, + "grad_norm": 1.5391262769699097, + "learning_rate": 1.9995442962267392e-05, + "loss": 1.6071, + "step": 5058 + }, + { + "epoch": 0.1179917611010854, + "grad_norm": 1.8823424577713013, + "learning_rate": 1.9995435358278705e-05, + "loss": 1.4934, + "step": 5059 + }, + { + "epoch": 0.11801508424026332, + "grad_norm": 1.7665133476257324, + "learning_rate": 1.999542774795265e-05, + "loss": 1.497, + "step": 5060 + }, + { + "epoch": 0.11803840737944124, + "grad_norm": 1.4514787197113037, + "learning_rate": 1.999542013128923e-05, + "loss": 1.0035, + "step": 5061 + }, + { + "epoch": 0.11806173051861915, + "grad_norm": 1.4382503032684326, + "learning_rate": 1.999541250828845e-05, + "loss": 1.4106, + "step": 5062 + }, + { + "epoch": 0.11808505365779708, + "grad_norm": 1.5556076765060425, + "learning_rate": 1.9995404878950317e-05, + "loss": 1.4639, + "step": 5063 + }, + { + "epoch": 0.11810837679697499, + "grad_norm": 1.7828418016433716, + "learning_rate": 1.9995397243274835e-05, + "loss": 1.3349, + "step": 5064 + }, + { + "epoch": 0.11813169993615291, + "grad_norm": 1.6392643451690674, + "learning_rate": 1.9995389601262013e-05, + "loss": 1.3422, + "step": 5065 + }, + { + "epoch": 0.11815502307533082, + "grad_norm": 1.656132698059082, + "learning_rate": 1.9995381952911845e-05, + "loss": 1.4973, + "step": 5066 + }, + { + "epoch": 0.11817834621450875, + "grad_norm": 1.872511386871338, + "learning_rate": 1.9995374298224343e-05, + "loss": 1.2819, + "step": 5067 + }, + { + "epoch": 0.11820166935368666, + "grad_norm": 2.293154001235962, + "learning_rate": 1.9995366637199517e-05, + "loss": 1.4985, + "step": 5068 + }, + { + "epoch": 0.11822499249286458, + "grad_norm": 1.6052125692367554, + "learning_rate": 1.9995358969837363e-05, + "loss": 1.3017, + "step": 5069 + }, + { + "epoch": 0.1182483156320425, + "grad_norm": 1.852829933166504, + "learning_rate": 1.9995351296137885e-05, + "loss": 1.2533, + "step": 5070 + }, + { + "epoch": 0.11827163877122042, + "grad_norm": 1.7062973976135254, + "learning_rate": 1.9995343616101096e-05, + "loss": 1.4356, + "step": 5071 + }, + { + "epoch": 0.11829496191039833, + "grad_norm": 1.66119384765625, + "learning_rate": 1.9995335929726996e-05, + "loss": 1.2236, + "step": 5072 + }, + { + "epoch": 0.11831828504957624, + "grad_norm": 1.9246375560760498, + "learning_rate": 1.999532823701559e-05, + "loss": 1.3517, + "step": 5073 + }, + { + "epoch": 0.11834160818875417, + "grad_norm": 1.5225536823272705, + "learning_rate": 1.9995320537966884e-05, + "loss": 1.3621, + "step": 5074 + }, + { + "epoch": 0.11836493132793208, + "grad_norm": 1.7596664428710938, + "learning_rate": 1.9995312832580886e-05, + "loss": 1.5504, + "step": 5075 + }, + { + "epoch": 0.11838825446711, + "grad_norm": 1.956938624382019, + "learning_rate": 1.999530512085759e-05, + "loss": 1.414, + "step": 5076 + }, + { + "epoch": 0.11841157760628791, + "grad_norm": 1.7286083698272705, + "learning_rate": 1.9995297402797014e-05, + "loss": 1.2745, + "step": 5077 + }, + { + "epoch": 0.11843490074546584, + "grad_norm": 1.4133533239364624, + "learning_rate": 1.9995289678399155e-05, + "loss": 1.3255, + "step": 5078 + }, + { + "epoch": 0.11845822388464375, + "grad_norm": 1.7407102584838867, + "learning_rate": 1.9995281947664022e-05, + "loss": 1.3016, + "step": 5079 + }, + { + "epoch": 0.11848154702382167, + "grad_norm": 1.9496867656707764, + "learning_rate": 1.9995274210591612e-05, + "loss": 1.5491, + "step": 5080 + }, + { + "epoch": 0.11850487016299958, + "grad_norm": 2.137552261352539, + "learning_rate": 1.999526646718194e-05, + "loss": 1.642, + "step": 5081 + }, + { + "epoch": 0.11852819330217751, + "grad_norm": 1.8528087139129639, + "learning_rate": 1.9995258717435005e-05, + "loss": 1.5119, + "step": 5082 + }, + { + "epoch": 0.11855151644135542, + "grad_norm": 1.840563416481018, + "learning_rate": 1.9995250961350816e-05, + "loss": 1.5336, + "step": 5083 + }, + { + "epoch": 0.11857483958053334, + "grad_norm": 3.5251901149749756, + "learning_rate": 1.9995243198929373e-05, + "loss": 1.2675, + "step": 5084 + }, + { + "epoch": 0.11859816271971126, + "grad_norm": 1.6450800895690918, + "learning_rate": 1.9995235430170687e-05, + "loss": 1.3796, + "step": 5085 + }, + { + "epoch": 0.11862148585888918, + "grad_norm": 1.4160854816436768, + "learning_rate": 1.9995227655074755e-05, + "loss": 1.263, + "step": 5086 + }, + { + "epoch": 0.11864480899806709, + "grad_norm": 1.878952980041504, + "learning_rate": 1.9995219873641586e-05, + "loss": 1.6077, + "step": 5087 + }, + { + "epoch": 0.11866813213724502, + "grad_norm": 1.9942275285720825, + "learning_rate": 1.9995212085871188e-05, + "loss": 1.59, + "step": 5088 + }, + { + "epoch": 0.11869145527642293, + "grad_norm": 1.8743102550506592, + "learning_rate": 1.9995204291763564e-05, + "loss": 1.188, + "step": 5089 + }, + { + "epoch": 0.11871477841560085, + "grad_norm": 1.6744074821472168, + "learning_rate": 1.9995196491318717e-05, + "loss": 1.3511, + "step": 5090 + }, + { + "epoch": 0.11873810155477876, + "grad_norm": 1.771701455116272, + "learning_rate": 1.9995188684536654e-05, + "loss": 1.5823, + "step": 5091 + }, + { + "epoch": 0.11876142469395669, + "grad_norm": 1.5408809185028076, + "learning_rate": 1.9995180871417376e-05, + "loss": 1.3132, + "step": 5092 + }, + { + "epoch": 0.1187847478331346, + "grad_norm": 1.5890802145004272, + "learning_rate": 1.999517305196089e-05, + "loss": 1.4174, + "step": 5093 + }, + { + "epoch": 0.11880807097231252, + "grad_norm": 1.790546178817749, + "learning_rate": 1.9995165226167205e-05, + "loss": 1.504, + "step": 5094 + }, + { + "epoch": 0.11883139411149043, + "grad_norm": 2.384721040725708, + "learning_rate": 1.9995157394036322e-05, + "loss": 1.3735, + "step": 5095 + }, + { + "epoch": 0.11885471725066836, + "grad_norm": 1.727489948272705, + "learning_rate": 1.9995149555568244e-05, + "loss": 1.4592, + "step": 5096 + }, + { + "epoch": 0.11887804038984627, + "grad_norm": 1.5763919353485107, + "learning_rate": 1.9995141710762985e-05, + "loss": 1.5103, + "step": 5097 + }, + { + "epoch": 0.1189013635290242, + "grad_norm": 1.709712266921997, + "learning_rate": 1.999513385962054e-05, + "loss": 1.3255, + "step": 5098 + }, + { + "epoch": 0.1189246866682021, + "grad_norm": 1.4619394540786743, + "learning_rate": 1.9995126002140915e-05, + "loss": 1.2382, + "step": 5099 + }, + { + "epoch": 0.11894800980738002, + "grad_norm": 1.9581271409988403, + "learning_rate": 1.9995118138324123e-05, + "loss": 1.5246, + "step": 5100 + }, + { + "epoch": 0.11897133294655794, + "grad_norm": 1.578933835029602, + "learning_rate": 1.9995110268170162e-05, + "loss": 1.9479, + "step": 5101 + }, + { + "epoch": 0.11899465608573585, + "grad_norm": 1.6125664710998535, + "learning_rate": 1.999510239167904e-05, + "loss": 1.1328, + "step": 5102 + }, + { + "epoch": 0.11901797922491378, + "grad_norm": 1.7455062866210938, + "learning_rate": 1.9995094508850755e-05, + "loss": 1.7469, + "step": 5103 + }, + { + "epoch": 0.11904130236409169, + "grad_norm": 1.6053768396377563, + "learning_rate": 1.9995086619685323e-05, + "loss": 1.4065, + "step": 5104 + }, + { + "epoch": 0.11906462550326961, + "grad_norm": 1.5340065956115723, + "learning_rate": 1.9995078724182742e-05, + "loss": 1.234, + "step": 5105 + }, + { + "epoch": 0.11908794864244752, + "grad_norm": 1.577673316001892, + "learning_rate": 1.999507082234302e-05, + "loss": 1.2571, + "step": 5106 + }, + { + "epoch": 0.11911127178162545, + "grad_norm": 2.056129217147827, + "learning_rate": 1.9995062914166157e-05, + "loss": 1.4707, + "step": 5107 + }, + { + "epoch": 0.11913459492080336, + "grad_norm": 1.2501311302185059, + "learning_rate": 1.9995054999652166e-05, + "loss": 1.4259, + "step": 5108 + }, + { + "epoch": 0.11915791805998129, + "grad_norm": 1.5332967042922974, + "learning_rate": 1.9995047078801043e-05, + "loss": 1.5529, + "step": 5109 + }, + { + "epoch": 0.1191812411991592, + "grad_norm": 1.9253953695297241, + "learning_rate": 1.9995039151612803e-05, + "loss": 1.4761, + "step": 5110 + }, + { + "epoch": 0.11920456433833712, + "grad_norm": 1.6318061351776123, + "learning_rate": 1.9995031218087446e-05, + "loss": 1.8184, + "step": 5111 + }, + { + "epoch": 0.11922788747751503, + "grad_norm": 1.9887782335281372, + "learning_rate": 1.999502327822497e-05, + "loss": 1.7311, + "step": 5112 + }, + { + "epoch": 0.11925121061669296, + "grad_norm": 1.704048991203308, + "learning_rate": 1.999501533202539e-05, + "loss": 0.9308, + "step": 5113 + }, + { + "epoch": 0.11927453375587087, + "grad_norm": 1.799851417541504, + "learning_rate": 1.999500737948871e-05, + "loss": 1.8301, + "step": 5114 + }, + { + "epoch": 0.11929785689504879, + "grad_norm": 1.4910862445831299, + "learning_rate": 1.9994999420614933e-05, + "loss": 1.3, + "step": 5115 + }, + { + "epoch": 0.1193211800342267, + "grad_norm": 1.6470119953155518, + "learning_rate": 1.9994991455404062e-05, + "loss": 1.2041, + "step": 5116 + }, + { + "epoch": 0.11934450317340463, + "grad_norm": 1.6899874210357666, + "learning_rate": 1.9994983483856107e-05, + "loss": 1.2277, + "step": 5117 + }, + { + "epoch": 0.11936782631258254, + "grad_norm": 4.254084587097168, + "learning_rate": 1.9994975505971064e-05, + "loss": 1.4531, + "step": 5118 + }, + { + "epoch": 0.11939114945176046, + "grad_norm": 1.997368335723877, + "learning_rate": 1.9994967521748952e-05, + "loss": 1.2339, + "step": 5119 + }, + { + "epoch": 0.11941447259093838, + "grad_norm": 1.6572461128234863, + "learning_rate": 1.9994959531189762e-05, + "loss": 1.5654, + "step": 5120 + }, + { + "epoch": 0.1194377957301163, + "grad_norm": 2.1383442878723145, + "learning_rate": 1.9994951534293506e-05, + "loss": 1.3202, + "step": 5121 + }, + { + "epoch": 0.11946111886929421, + "grad_norm": 1.874514102935791, + "learning_rate": 1.9994943531060193e-05, + "loss": 1.4094, + "step": 5122 + }, + { + "epoch": 0.11948444200847214, + "grad_norm": 1.7515826225280762, + "learning_rate": 1.999493552148982e-05, + "loss": 1.4491, + "step": 5123 + }, + { + "epoch": 0.11950776514765005, + "grad_norm": 1.6808871030807495, + "learning_rate": 1.9994927505582394e-05, + "loss": 1.455, + "step": 5124 + }, + { + "epoch": 0.11953108828682797, + "grad_norm": 2.196124792098999, + "learning_rate": 1.9994919483337924e-05, + "loss": 1.372, + "step": 5125 + }, + { + "epoch": 0.11955441142600588, + "grad_norm": 1.775083065032959, + "learning_rate": 1.9994911454756408e-05, + "loss": 1.2551, + "step": 5126 + }, + { + "epoch": 0.11957773456518381, + "grad_norm": 1.6314014196395874, + "learning_rate": 1.9994903419837863e-05, + "loss": 1.5135, + "step": 5127 + }, + { + "epoch": 0.11960105770436172, + "grad_norm": 1.7472255229949951, + "learning_rate": 1.9994895378582284e-05, + "loss": 1.2635, + "step": 5128 + }, + { + "epoch": 0.11962438084353963, + "grad_norm": 1.6916024684906006, + "learning_rate": 1.999488733098968e-05, + "loss": 1.3864, + "step": 5129 + }, + { + "epoch": 0.11964770398271755, + "grad_norm": 1.9242362976074219, + "learning_rate": 1.999487927706005e-05, + "loss": 1.5841, + "step": 5130 + }, + { + "epoch": 0.11967102712189547, + "grad_norm": 2.0054173469543457, + "learning_rate": 1.999487121679341e-05, + "loss": 1.5605, + "step": 5131 + }, + { + "epoch": 0.11969435026107339, + "grad_norm": 1.9183844327926636, + "learning_rate": 1.9994863150189752e-05, + "loss": 1.7237, + "step": 5132 + }, + { + "epoch": 0.1197176734002513, + "grad_norm": 1.5911062955856323, + "learning_rate": 1.9994855077249097e-05, + "loss": 1.504, + "step": 5133 + }, + { + "epoch": 0.11974099653942923, + "grad_norm": 1.6197603940963745, + "learning_rate": 1.9994846997971436e-05, + "loss": 1.5239, + "step": 5134 + }, + { + "epoch": 0.11976431967860714, + "grad_norm": 1.7943172454833984, + "learning_rate": 1.999483891235678e-05, + "loss": 1.602, + "step": 5135 + }, + { + "epoch": 0.11978764281778506, + "grad_norm": 1.5191656351089478, + "learning_rate": 1.9994830820405134e-05, + "loss": 1.4079, + "step": 5136 + }, + { + "epoch": 0.11981096595696297, + "grad_norm": 2.047614812850952, + "learning_rate": 1.9994822722116506e-05, + "loss": 1.3455, + "step": 5137 + }, + { + "epoch": 0.1198342890961409, + "grad_norm": 1.773034691810608, + "learning_rate": 1.9994814617490897e-05, + "loss": 1.5533, + "step": 5138 + }, + { + "epoch": 0.11985761223531881, + "grad_norm": 1.7154484987258911, + "learning_rate": 1.999480650652831e-05, + "loss": 1.4436, + "step": 5139 + }, + { + "epoch": 0.11988093537449673, + "grad_norm": 1.9495506286621094, + "learning_rate": 1.9994798389228754e-05, + "loss": 1.4085, + "step": 5140 + }, + { + "epoch": 0.11990425851367464, + "grad_norm": 1.5221166610717773, + "learning_rate": 1.9994790265592235e-05, + "loss": 1.6403, + "step": 5141 + }, + { + "epoch": 0.11992758165285257, + "grad_norm": 1.6362521648406982, + "learning_rate": 1.9994782135618758e-05, + "loss": 1.3437, + "step": 5142 + }, + { + "epoch": 0.11995090479203048, + "grad_norm": 1.937803030014038, + "learning_rate": 1.9994773999308322e-05, + "loss": 1.6012, + "step": 5143 + }, + { + "epoch": 0.1199742279312084, + "grad_norm": 2.1985485553741455, + "learning_rate": 1.9994765856660942e-05, + "loss": 1.3364, + "step": 5144 + }, + { + "epoch": 0.11999755107038632, + "grad_norm": 1.5723555088043213, + "learning_rate": 1.9994757707676613e-05, + "loss": 1.2644, + "step": 5145 + }, + { + "epoch": 0.12002087420956424, + "grad_norm": 1.2739875316619873, + "learning_rate": 1.999474955235535e-05, + "loss": 1.2311, + "step": 5146 + }, + { + "epoch": 0.12004419734874215, + "grad_norm": 1.7219253778457642, + "learning_rate": 1.999474139069715e-05, + "loss": 1.5847, + "step": 5147 + }, + { + "epoch": 0.12006752048792008, + "grad_norm": 1.7871161699295044, + "learning_rate": 1.999473322270202e-05, + "loss": 1.5213, + "step": 5148 + }, + { + "epoch": 0.12009084362709799, + "grad_norm": 1.6022684574127197, + "learning_rate": 1.999472504836997e-05, + "loss": 1.6142, + "step": 5149 + }, + { + "epoch": 0.12011416676627591, + "grad_norm": 1.74579918384552, + "learning_rate": 1.9994716867701002e-05, + "loss": 1.3925, + "step": 5150 + }, + { + "epoch": 0.12013748990545382, + "grad_norm": 1.4294853210449219, + "learning_rate": 1.9994708680695123e-05, + "loss": 1.4432, + "step": 5151 + }, + { + "epoch": 0.12016081304463175, + "grad_norm": 1.5389394760131836, + "learning_rate": 1.9994700487352333e-05, + "loss": 1.2024, + "step": 5152 + }, + { + "epoch": 0.12018413618380966, + "grad_norm": 1.9452849626541138, + "learning_rate": 1.999469228767264e-05, + "loss": 1.2874, + "step": 5153 + }, + { + "epoch": 0.12020745932298758, + "grad_norm": 1.6799707412719727, + "learning_rate": 1.9994684081656053e-05, + "loss": 1.424, + "step": 5154 + }, + { + "epoch": 0.1202307824621655, + "grad_norm": 1.7166920900344849, + "learning_rate": 1.999467586930257e-05, + "loss": 1.3648, + "step": 5155 + }, + { + "epoch": 0.12025410560134342, + "grad_norm": 1.649631381034851, + "learning_rate": 1.9994667650612204e-05, + "loss": 1.2777, + "step": 5156 + }, + { + "epoch": 0.12027742874052133, + "grad_norm": 1.787745714187622, + "learning_rate": 1.9994659425584953e-05, + "loss": 1.6375, + "step": 5157 + }, + { + "epoch": 0.12030075187969924, + "grad_norm": 1.5783100128173828, + "learning_rate": 1.999465119422083e-05, + "loss": 1.1562, + "step": 5158 + }, + { + "epoch": 0.12032407501887717, + "grad_norm": 1.4640908241271973, + "learning_rate": 1.999464295651983e-05, + "loss": 1.6112, + "step": 5159 + }, + { + "epoch": 0.12034739815805508, + "grad_norm": 1.7693891525268555, + "learning_rate": 1.9994634712481966e-05, + "loss": 1.6203, + "step": 5160 + }, + { + "epoch": 0.120370721297233, + "grad_norm": 1.7248945236206055, + "learning_rate": 1.999462646210724e-05, + "loss": 1.2511, + "step": 5161 + }, + { + "epoch": 0.12039404443641091, + "grad_norm": 1.6078987121582031, + "learning_rate": 1.9994618205395663e-05, + "loss": 1.2726, + "step": 5162 + }, + { + "epoch": 0.12041736757558884, + "grad_norm": 1.4516338109970093, + "learning_rate": 1.999460994234723e-05, + "loss": 1.5845, + "step": 5163 + }, + { + "epoch": 0.12044069071476675, + "grad_norm": 1.2444758415222168, + "learning_rate": 1.9994601672961957e-05, + "loss": 0.9315, + "step": 5164 + }, + { + "epoch": 0.12046401385394467, + "grad_norm": 1.7344197034835815, + "learning_rate": 1.9994593397239842e-05, + "loss": 1.2271, + "step": 5165 + }, + { + "epoch": 0.12048733699312258, + "grad_norm": 1.5861235857009888, + "learning_rate": 1.999458511518089e-05, + "loss": 1.4443, + "step": 5166 + }, + { + "epoch": 0.12051066013230051, + "grad_norm": 1.5649861097335815, + "learning_rate": 1.9994576826785114e-05, + "loss": 1.5495, + "step": 5167 + }, + { + "epoch": 0.12053398327147842, + "grad_norm": 1.4794871807098389, + "learning_rate": 1.999456853205251e-05, + "loss": 1.4513, + "step": 5168 + }, + { + "epoch": 0.12055730641065635, + "grad_norm": 1.9839303493499756, + "learning_rate": 1.999456023098309e-05, + "loss": 1.4484, + "step": 5169 + }, + { + "epoch": 0.12058062954983426, + "grad_norm": 1.6965582370758057, + "learning_rate": 1.9994551923576854e-05, + "loss": 1.4189, + "step": 5170 + }, + { + "epoch": 0.12060395268901218, + "grad_norm": 1.8013246059417725, + "learning_rate": 1.999454360983381e-05, + "loss": 1.4711, + "step": 5171 + }, + { + "epoch": 0.12062727582819009, + "grad_norm": 2.126880168914795, + "learning_rate": 1.999453528975396e-05, + "loss": 1.4898, + "step": 5172 + }, + { + "epoch": 0.12065059896736802, + "grad_norm": 1.8758491277694702, + "learning_rate": 1.9994526963337318e-05, + "loss": 1.6876, + "step": 5173 + }, + { + "epoch": 0.12067392210654593, + "grad_norm": 1.616296648979187, + "learning_rate": 1.999451863058388e-05, + "loss": 1.2443, + "step": 5174 + }, + { + "epoch": 0.12069724524572385, + "grad_norm": 1.395225167274475, + "learning_rate": 1.9994510291493653e-05, + "loss": 1.0155, + "step": 5175 + }, + { + "epoch": 0.12072056838490176, + "grad_norm": 2.22456431388855, + "learning_rate": 1.999450194606665e-05, + "loss": 1.3071, + "step": 5176 + }, + { + "epoch": 0.12074389152407969, + "grad_norm": 1.6486225128173828, + "learning_rate": 1.9994493594302867e-05, + "loss": 1.5037, + "step": 5177 + }, + { + "epoch": 0.1207672146632576, + "grad_norm": 1.5535529851913452, + "learning_rate": 1.999448523620231e-05, + "loss": 1.3644, + "step": 5178 + }, + { + "epoch": 0.12079053780243552, + "grad_norm": 1.8484206199645996, + "learning_rate": 1.999447687176499e-05, + "loss": 1.2954, + "step": 5179 + }, + { + "epoch": 0.12081386094161344, + "grad_norm": 1.864751935005188, + "learning_rate": 1.999446850099091e-05, + "loss": 1.6541, + "step": 5180 + }, + { + "epoch": 0.12083718408079136, + "grad_norm": 1.8509420156478882, + "learning_rate": 1.9994460123880077e-05, + "loss": 1.0777, + "step": 5181 + }, + { + "epoch": 0.12086050721996927, + "grad_norm": 1.809068202972412, + "learning_rate": 1.9994451740432484e-05, + "loss": 1.5271, + "step": 5182 + }, + { + "epoch": 0.1208838303591472, + "grad_norm": 1.9914056062698364, + "learning_rate": 1.9994443350648158e-05, + "loss": 1.4236, + "step": 5183 + }, + { + "epoch": 0.1209071534983251, + "grad_norm": 1.7929853200912476, + "learning_rate": 1.9994434954527086e-05, + "loss": 1.769, + "step": 5184 + }, + { + "epoch": 0.12093047663750303, + "grad_norm": 1.5338125228881836, + "learning_rate": 1.999442655206928e-05, + "loss": 1.4364, + "step": 5185 + }, + { + "epoch": 0.12095379977668094, + "grad_norm": 1.911595344543457, + "learning_rate": 1.9994418143274744e-05, + "loss": 1.4768, + "step": 5186 + }, + { + "epoch": 0.12097712291585885, + "grad_norm": 1.9640648365020752, + "learning_rate": 1.999440972814349e-05, + "loss": 1.5574, + "step": 5187 + }, + { + "epoch": 0.12100044605503678, + "grad_norm": 1.9400410652160645, + "learning_rate": 1.999440130667551e-05, + "loss": 1.2392, + "step": 5188 + }, + { + "epoch": 0.12102376919421469, + "grad_norm": 1.4426178932189941, + "learning_rate": 1.9994392878870826e-05, + "loss": 1.0032, + "step": 5189 + }, + { + "epoch": 0.12104709233339261, + "grad_norm": 1.4520666599273682, + "learning_rate": 1.9994384444729427e-05, + "loss": 1.2546, + "step": 5190 + }, + { + "epoch": 0.12107041547257053, + "grad_norm": 1.5349781513214111, + "learning_rate": 1.999437600425133e-05, + "loss": 1.0804, + "step": 5191 + }, + { + "epoch": 0.12109373861174845, + "grad_norm": 1.783485770225525, + "learning_rate": 1.9994367557436532e-05, + "loss": 1.5303, + "step": 5192 + }, + { + "epoch": 0.12111706175092636, + "grad_norm": 1.787907600402832, + "learning_rate": 1.9994359104285047e-05, + "loss": 1.519, + "step": 5193 + }, + { + "epoch": 0.12114038489010429, + "grad_norm": 1.7436468601226807, + "learning_rate": 1.9994350644796875e-05, + "loss": 1.3104, + "step": 5194 + }, + { + "epoch": 0.1211637080292822, + "grad_norm": 1.7883096933364868, + "learning_rate": 1.9994342178972025e-05, + "loss": 1.174, + "step": 5195 + }, + { + "epoch": 0.12118703116846012, + "grad_norm": 1.7746797800064087, + "learning_rate": 1.9994333706810495e-05, + "loss": 1.8052, + "step": 5196 + }, + { + "epoch": 0.12121035430763803, + "grad_norm": 2.0402567386627197, + "learning_rate": 1.9994325228312297e-05, + "loss": 1.2368, + "step": 5197 + }, + { + "epoch": 0.12123367744681596, + "grad_norm": 1.561352252960205, + "learning_rate": 1.9994316743477436e-05, + "loss": 1.4033, + "step": 5198 + }, + { + "epoch": 0.12125700058599387, + "grad_norm": 1.7803243398666382, + "learning_rate": 1.9994308252305915e-05, + "loss": 1.6437, + "step": 5199 + }, + { + "epoch": 0.12128032372517179, + "grad_norm": 2.1737868785858154, + "learning_rate": 1.9994299754797737e-05, + "loss": 1.6835, + "step": 5200 + }, + { + "epoch": 0.1213036468643497, + "grad_norm": 1.8227667808532715, + "learning_rate": 1.9994291250952912e-05, + "loss": 1.3213, + "step": 5201 + }, + { + "epoch": 0.12132697000352763, + "grad_norm": 1.833287000656128, + "learning_rate": 1.9994282740771444e-05, + "loss": 1.1586, + "step": 5202 + }, + { + "epoch": 0.12135029314270554, + "grad_norm": 1.9518423080444336, + "learning_rate": 1.999427422425334e-05, + "loss": 1.5104, + "step": 5203 + }, + { + "epoch": 0.12137361628188346, + "grad_norm": 1.8010410070419312, + "learning_rate": 1.9994265701398602e-05, + "loss": 1.408, + "step": 5204 + }, + { + "epoch": 0.12139693942106138, + "grad_norm": 2.276822805404663, + "learning_rate": 1.9994257172207238e-05, + "loss": 1.819, + "step": 5205 + }, + { + "epoch": 0.1214202625602393, + "grad_norm": 1.5620603561401367, + "learning_rate": 1.999424863667925e-05, + "loss": 1.134, + "step": 5206 + }, + { + "epoch": 0.12144358569941721, + "grad_norm": 1.9355251789093018, + "learning_rate": 1.999424009481465e-05, + "loss": 1.6547, + "step": 5207 + }, + { + "epoch": 0.12146690883859514, + "grad_norm": 1.6456209421157837, + "learning_rate": 1.999423154661344e-05, + "loss": 1.7093, + "step": 5208 + }, + { + "epoch": 0.12149023197777305, + "grad_norm": 1.6039224863052368, + "learning_rate": 1.999422299207562e-05, + "loss": 1.1602, + "step": 5209 + }, + { + "epoch": 0.12151355511695097, + "grad_norm": 1.6987042427062988, + "learning_rate": 1.9994214431201204e-05, + "loss": 1.4111, + "step": 5210 + }, + { + "epoch": 0.12153687825612888, + "grad_norm": 1.7661241292953491, + "learning_rate": 1.9994205863990194e-05, + "loss": 1.2221, + "step": 5211 + }, + { + "epoch": 0.12156020139530681, + "grad_norm": 1.8467557430267334, + "learning_rate": 1.9994197290442594e-05, + "loss": 1.6562, + "step": 5212 + }, + { + "epoch": 0.12158352453448472, + "grad_norm": 1.6450353860855103, + "learning_rate": 1.999418871055841e-05, + "loss": 1.0889, + "step": 5213 + }, + { + "epoch": 0.12160684767366263, + "grad_norm": 1.9565606117248535, + "learning_rate": 1.9994180124337648e-05, + "loss": 1.6823, + "step": 5214 + }, + { + "epoch": 0.12163017081284055, + "grad_norm": 2.1629135608673096, + "learning_rate": 1.999417153178031e-05, + "loss": 1.0564, + "step": 5215 + }, + { + "epoch": 0.12165349395201847, + "grad_norm": 1.6880738735198975, + "learning_rate": 1.9994162932886413e-05, + "loss": 1.6544, + "step": 5216 + }, + { + "epoch": 0.12167681709119639, + "grad_norm": 1.9772284030914307, + "learning_rate": 1.999415432765595e-05, + "loss": 1.2833, + "step": 5217 + }, + { + "epoch": 0.1217001402303743, + "grad_norm": 1.785373568534851, + "learning_rate": 1.999414571608893e-05, + "loss": 1.4655, + "step": 5218 + }, + { + "epoch": 0.12172346336955223, + "grad_norm": 1.741829514503479, + "learning_rate": 1.999413709818536e-05, + "loss": 1.3966, + "step": 5219 + }, + { + "epoch": 0.12174678650873014, + "grad_norm": 1.6098731756210327, + "learning_rate": 1.9994128473945243e-05, + "loss": 1.434, + "step": 5220 + }, + { + "epoch": 0.12177010964790806, + "grad_norm": 1.8476279973983765, + "learning_rate": 1.9994119843368585e-05, + "loss": 1.6221, + "step": 5221 + }, + { + "epoch": 0.12179343278708597, + "grad_norm": 1.6263066530227661, + "learning_rate": 1.9994111206455396e-05, + "loss": 1.4709, + "step": 5222 + }, + { + "epoch": 0.1218167559262639, + "grad_norm": 1.6135395765304565, + "learning_rate": 1.999410256320568e-05, + "loss": 1.4385, + "step": 5223 + }, + { + "epoch": 0.12184007906544181, + "grad_norm": 1.915542721748352, + "learning_rate": 1.9994093913619433e-05, + "loss": 1.5076, + "step": 5224 + }, + { + "epoch": 0.12186340220461973, + "grad_norm": 1.8600517511367798, + "learning_rate": 1.9994085257696677e-05, + "loss": 1.2045, + "step": 5225 + }, + { + "epoch": 0.12188672534379764, + "grad_norm": 1.452391266822815, + "learning_rate": 1.9994076595437402e-05, + "loss": 1.5381, + "step": 5226 + }, + { + "epoch": 0.12191004848297557, + "grad_norm": 2.175764560699463, + "learning_rate": 1.9994067926841623e-05, + "loss": 1.6802, + "step": 5227 + }, + { + "epoch": 0.12193337162215348, + "grad_norm": 2.144850969314575, + "learning_rate": 1.9994059251909343e-05, + "loss": 1.4444, + "step": 5228 + }, + { + "epoch": 0.1219566947613314, + "grad_norm": 2.0190577507019043, + "learning_rate": 1.9994050570640566e-05, + "loss": 1.1841, + "step": 5229 + }, + { + "epoch": 0.12198001790050932, + "grad_norm": 2.01167893409729, + "learning_rate": 1.99940418830353e-05, + "loss": 1.512, + "step": 5230 + }, + { + "epoch": 0.12200334103968724, + "grad_norm": 1.9324928522109985, + "learning_rate": 1.999403318909355e-05, + "loss": 1.5614, + "step": 5231 + }, + { + "epoch": 0.12202666417886515, + "grad_norm": 1.6999582052230835, + "learning_rate": 1.9994024488815315e-05, + "loss": 1.669, + "step": 5232 + }, + { + "epoch": 0.12204998731804308, + "grad_norm": 1.7121517658233643, + "learning_rate": 1.999401578220061e-05, + "loss": 1.5156, + "step": 5233 + }, + { + "epoch": 0.12207331045722099, + "grad_norm": 1.92389976978302, + "learning_rate": 1.9994007069249437e-05, + "loss": 1.6186, + "step": 5234 + }, + { + "epoch": 0.12209663359639891, + "grad_norm": 1.759657382965088, + "learning_rate": 1.99939983499618e-05, + "loss": 1.6581, + "step": 5235 + }, + { + "epoch": 0.12211995673557682, + "grad_norm": 1.6198968887329102, + "learning_rate": 1.9993989624337704e-05, + "loss": 1.5227, + "step": 5236 + }, + { + "epoch": 0.12214327987475475, + "grad_norm": 1.703206181526184, + "learning_rate": 1.999398089237716e-05, + "loss": 1.4978, + "step": 5237 + }, + { + "epoch": 0.12216660301393266, + "grad_norm": 2.1598548889160156, + "learning_rate": 1.9993972154080167e-05, + "loss": 1.578, + "step": 5238 + }, + { + "epoch": 0.12218992615311058, + "grad_norm": 2.1588728427886963, + "learning_rate": 1.9993963409446735e-05, + "loss": 1.4551, + "step": 5239 + }, + { + "epoch": 0.1222132492922885, + "grad_norm": 1.5524038076400757, + "learning_rate": 1.9993954658476867e-05, + "loss": 1.4851, + "step": 5240 + }, + { + "epoch": 0.12223657243146642, + "grad_norm": 1.9396812915802002, + "learning_rate": 1.999394590117057e-05, + "loss": 1.5557, + "step": 5241 + }, + { + "epoch": 0.12225989557064433, + "grad_norm": 1.645156741142273, + "learning_rate": 1.9993937137527847e-05, + "loss": 1.4925, + "step": 5242 + }, + { + "epoch": 0.12228321870982224, + "grad_norm": 2.092407464981079, + "learning_rate": 1.9993928367548706e-05, + "loss": 1.7038, + "step": 5243 + }, + { + "epoch": 0.12230654184900017, + "grad_norm": 1.6597508192062378, + "learning_rate": 1.999391959123315e-05, + "loss": 1.6631, + "step": 5244 + }, + { + "epoch": 0.12232986498817808, + "grad_norm": 1.4524447917938232, + "learning_rate": 1.999391080858119e-05, + "loss": 1.2192, + "step": 5245 + }, + { + "epoch": 0.122353188127356, + "grad_norm": 1.9489341974258423, + "learning_rate": 1.9993902019592828e-05, + "loss": 1.3674, + "step": 5246 + }, + { + "epoch": 0.12237651126653391, + "grad_norm": 2.279472589492798, + "learning_rate": 1.999389322426807e-05, + "loss": 1.3308, + "step": 5247 + }, + { + "epoch": 0.12239983440571184, + "grad_norm": 1.964882493019104, + "learning_rate": 1.999388442260692e-05, + "loss": 1.3785, + "step": 5248 + }, + { + "epoch": 0.12242315754488975, + "grad_norm": 1.4800372123718262, + "learning_rate": 1.9993875614609387e-05, + "loss": 1.5374, + "step": 5249 + }, + { + "epoch": 0.12244648068406767, + "grad_norm": 1.6324028968811035, + "learning_rate": 1.9993866800275474e-05, + "loss": 1.3005, + "step": 5250 + }, + { + "epoch": 0.12246980382324558, + "grad_norm": 2.160531520843506, + "learning_rate": 1.9993857979605185e-05, + "loss": 1.3442, + "step": 5251 + }, + { + "epoch": 0.12249312696242351, + "grad_norm": 1.8576264381408691, + "learning_rate": 1.9993849152598527e-05, + "loss": 1.4702, + "step": 5252 + }, + { + "epoch": 0.12251645010160142, + "grad_norm": 2.558415412902832, + "learning_rate": 1.9993840319255508e-05, + "loss": 1.4287, + "step": 5253 + }, + { + "epoch": 0.12253977324077935, + "grad_norm": 1.551154613494873, + "learning_rate": 1.9993831479576133e-05, + "loss": 1.1179, + "step": 5254 + }, + { + "epoch": 0.12256309637995726, + "grad_norm": 1.7038143873214722, + "learning_rate": 1.9993822633560406e-05, + "loss": 1.5632, + "step": 5255 + }, + { + "epoch": 0.12258641951913518, + "grad_norm": 2.110316276550293, + "learning_rate": 1.999381378120833e-05, + "loss": 1.4701, + "step": 5256 + }, + { + "epoch": 0.12260974265831309, + "grad_norm": 1.6642460823059082, + "learning_rate": 1.9993804922519916e-05, + "loss": 1.5745, + "step": 5257 + }, + { + "epoch": 0.12263306579749102, + "grad_norm": 1.7695634365081787, + "learning_rate": 1.9993796057495167e-05, + "loss": 1.8309, + "step": 5258 + }, + { + "epoch": 0.12265638893666893, + "grad_norm": 1.4462796449661255, + "learning_rate": 1.999378718613409e-05, + "loss": 1.4294, + "step": 5259 + }, + { + "epoch": 0.12267971207584685, + "grad_norm": 1.5394349098205566, + "learning_rate": 1.9993778308436688e-05, + "loss": 1.5729, + "step": 5260 + }, + { + "epoch": 0.12270303521502476, + "grad_norm": 1.590888261795044, + "learning_rate": 1.9993769424402968e-05, + "loss": 1.4218, + "step": 5261 + }, + { + "epoch": 0.12272635835420269, + "grad_norm": 1.7652955055236816, + "learning_rate": 1.9993760534032937e-05, + "loss": 1.4049, + "step": 5262 + }, + { + "epoch": 0.1227496814933806, + "grad_norm": 1.9118036031723022, + "learning_rate": 1.99937516373266e-05, + "loss": 1.3574, + "step": 5263 + }, + { + "epoch": 0.12277300463255852, + "grad_norm": 1.849724292755127, + "learning_rate": 1.999374273428396e-05, + "loss": 1.6295, + "step": 5264 + }, + { + "epoch": 0.12279632777173644, + "grad_norm": 2.1726441383361816, + "learning_rate": 1.999373382490503e-05, + "loss": 1.5143, + "step": 5265 + }, + { + "epoch": 0.12281965091091436, + "grad_norm": 1.7471739053726196, + "learning_rate": 1.9993724909189804e-05, + "loss": 1.6585, + "step": 5266 + }, + { + "epoch": 0.12284297405009227, + "grad_norm": 1.6282989978790283, + "learning_rate": 1.9993715987138294e-05, + "loss": 1.7422, + "step": 5267 + }, + { + "epoch": 0.1228662971892702, + "grad_norm": 1.4847203493118286, + "learning_rate": 1.9993707058750507e-05, + "loss": 1.0243, + "step": 5268 + }, + { + "epoch": 0.12288962032844811, + "grad_norm": 3.8482096195220947, + "learning_rate": 1.999369812402645e-05, + "loss": 1.2423, + "step": 5269 + }, + { + "epoch": 0.12291294346762603, + "grad_norm": 1.5470126867294312, + "learning_rate": 1.9993689182966128e-05, + "loss": 1.372, + "step": 5270 + }, + { + "epoch": 0.12293626660680394, + "grad_norm": 1.5429582595825195, + "learning_rate": 1.999368023556954e-05, + "loss": 1.1523, + "step": 5271 + }, + { + "epoch": 0.12295958974598185, + "grad_norm": 1.8086215257644653, + "learning_rate": 1.9993671281836697e-05, + "loss": 1.314, + "step": 5272 + }, + { + "epoch": 0.12298291288515978, + "grad_norm": 1.4215608835220337, + "learning_rate": 1.9993662321767604e-05, + "loss": 1.1558, + "step": 5273 + }, + { + "epoch": 0.12300623602433769, + "grad_norm": 1.3274673223495483, + "learning_rate": 1.9993653355362264e-05, + "loss": 1.2743, + "step": 5274 + }, + { + "epoch": 0.12302955916351561, + "grad_norm": 1.5016624927520752, + "learning_rate": 1.999364438262069e-05, + "loss": 1.3474, + "step": 5275 + }, + { + "epoch": 0.12305288230269353, + "grad_norm": 1.6643635034561157, + "learning_rate": 1.9993635403542885e-05, + "loss": 1.2829, + "step": 5276 + }, + { + "epoch": 0.12307620544187145, + "grad_norm": 1.565714716911316, + "learning_rate": 1.999362641812885e-05, + "loss": 1.5664, + "step": 5277 + }, + { + "epoch": 0.12309952858104936, + "grad_norm": 1.909883737564087, + "learning_rate": 1.9993617426378592e-05, + "loss": 1.4488, + "step": 5278 + }, + { + "epoch": 0.12312285172022729, + "grad_norm": 1.5170515775680542, + "learning_rate": 1.9993608428292116e-05, + "loss": 1.6755, + "step": 5279 + }, + { + "epoch": 0.1231461748594052, + "grad_norm": 1.8806242942810059, + "learning_rate": 1.9993599423869434e-05, + "loss": 1.494, + "step": 5280 + }, + { + "epoch": 0.12316949799858312, + "grad_norm": 2.5287435054779053, + "learning_rate": 1.999359041311055e-05, + "loss": 1.4288, + "step": 5281 + }, + { + "epoch": 0.12319282113776103, + "grad_norm": 1.600243091583252, + "learning_rate": 1.999358139601546e-05, + "loss": 1.1991, + "step": 5282 + }, + { + "epoch": 0.12321614427693896, + "grad_norm": 1.5978606939315796, + "learning_rate": 1.9993572372584184e-05, + "loss": 1.3261, + "step": 5283 + }, + { + "epoch": 0.12323946741611687, + "grad_norm": 2.0403010845184326, + "learning_rate": 1.9993563342816714e-05, + "loss": 1.654, + "step": 5284 + }, + { + "epoch": 0.1232627905552948, + "grad_norm": 1.4866753816604614, + "learning_rate": 1.9993554306713068e-05, + "loss": 1.296, + "step": 5285 + }, + { + "epoch": 0.1232861136944727, + "grad_norm": 1.6702882051467896, + "learning_rate": 1.9993545264273243e-05, + "loss": 1.2592, + "step": 5286 + }, + { + "epoch": 0.12330943683365063, + "grad_norm": 1.4910953044891357, + "learning_rate": 1.999353621549725e-05, + "loss": 1.2759, + "step": 5287 + }, + { + "epoch": 0.12333275997282854, + "grad_norm": 1.55330491065979, + "learning_rate": 1.999352716038509e-05, + "loss": 1.0809, + "step": 5288 + }, + { + "epoch": 0.12335608311200646, + "grad_norm": 1.8966494798660278, + "learning_rate": 1.9993518098936776e-05, + "loss": 1.2786, + "step": 5289 + }, + { + "epoch": 0.12337940625118438, + "grad_norm": 1.6760200262069702, + "learning_rate": 1.9993509031152305e-05, + "loss": 1.4831, + "step": 5290 + }, + { + "epoch": 0.1234027293903623, + "grad_norm": 1.8702013492584229, + "learning_rate": 1.999349995703169e-05, + "loss": 1.4444, + "step": 5291 + }, + { + "epoch": 0.12342605252954021, + "grad_norm": 2.501312017440796, + "learning_rate": 1.999349087657493e-05, + "loss": 1.3903, + "step": 5292 + }, + { + "epoch": 0.12344937566871814, + "grad_norm": 1.6620635986328125, + "learning_rate": 1.999348178978204e-05, + "loss": 1.0562, + "step": 5293 + }, + { + "epoch": 0.12347269880789605, + "grad_norm": 1.4270248413085938, + "learning_rate": 1.9993472696653016e-05, + "loss": 1.2261, + "step": 5294 + }, + { + "epoch": 0.12349602194707397, + "grad_norm": 1.4495210647583008, + "learning_rate": 1.9993463597187867e-05, + "loss": 1.4482, + "step": 5295 + }, + { + "epoch": 0.12351934508625188, + "grad_norm": 1.9414210319519043, + "learning_rate": 1.99934544913866e-05, + "loss": 1.3528, + "step": 5296 + }, + { + "epoch": 0.12354266822542981, + "grad_norm": 1.8413746356964111, + "learning_rate": 1.9993445379249224e-05, + "loss": 1.6812, + "step": 5297 + }, + { + "epoch": 0.12356599136460772, + "grad_norm": 1.6159254312515259, + "learning_rate": 1.9993436260775738e-05, + "loss": 1.3213, + "step": 5298 + }, + { + "epoch": 0.12358931450378564, + "grad_norm": 1.4749362468719482, + "learning_rate": 1.9993427135966153e-05, + "loss": 1.7113, + "step": 5299 + }, + { + "epoch": 0.12361263764296355, + "grad_norm": 1.6183795928955078, + "learning_rate": 1.999341800482047e-05, + "loss": 1.2728, + "step": 5300 + }, + { + "epoch": 0.12363596078214147, + "grad_norm": 1.4831708669662476, + "learning_rate": 1.99934088673387e-05, + "loss": 1.3532, + "step": 5301 + }, + { + "epoch": 0.12365928392131939, + "grad_norm": 2.036362648010254, + "learning_rate": 1.9993399723520847e-05, + "loss": 1.83, + "step": 5302 + }, + { + "epoch": 0.1236826070604973, + "grad_norm": 1.5864958763122559, + "learning_rate": 1.9993390573366915e-05, + "loss": 1.3698, + "step": 5303 + }, + { + "epoch": 0.12370593019967523, + "grad_norm": 1.6193420886993408, + "learning_rate": 1.999338141687691e-05, + "loss": 1.3619, + "step": 5304 + }, + { + "epoch": 0.12372925333885314, + "grad_norm": 1.7173324823379517, + "learning_rate": 1.9993372254050843e-05, + "loss": 1.3796, + "step": 5305 + }, + { + "epoch": 0.12375257647803106, + "grad_norm": 1.6187129020690918, + "learning_rate": 1.999336308488871e-05, + "loss": 1.3185, + "step": 5306 + }, + { + "epoch": 0.12377589961720897, + "grad_norm": 1.9758219718933105, + "learning_rate": 1.9993353909390526e-05, + "loss": 1.5527, + "step": 5307 + }, + { + "epoch": 0.1237992227563869, + "grad_norm": 3.1125433444976807, + "learning_rate": 1.999334472755629e-05, + "loss": 1.4692, + "step": 5308 + }, + { + "epoch": 0.12382254589556481, + "grad_norm": 2.098446846008301, + "learning_rate": 1.9993335539386013e-05, + "loss": 1.2302, + "step": 5309 + }, + { + "epoch": 0.12384586903474273, + "grad_norm": 1.7464897632598877, + "learning_rate": 1.99933263448797e-05, + "loss": 1.6329, + "step": 5310 + }, + { + "epoch": 0.12386919217392064, + "grad_norm": 1.4976500272750854, + "learning_rate": 1.9993317144037357e-05, + "loss": 1.3796, + "step": 5311 + }, + { + "epoch": 0.12389251531309857, + "grad_norm": 1.660172939300537, + "learning_rate": 1.9993307936858985e-05, + "loss": 1.3155, + "step": 5312 + }, + { + "epoch": 0.12391583845227648, + "grad_norm": 1.7088837623596191, + "learning_rate": 1.9993298723344597e-05, + "loss": 1.6556, + "step": 5313 + }, + { + "epoch": 0.1239391615914544, + "grad_norm": 1.738512635231018, + "learning_rate": 1.9993289503494192e-05, + "loss": 1.3999, + "step": 5314 + }, + { + "epoch": 0.12396248473063232, + "grad_norm": 1.8124831914901733, + "learning_rate": 1.999328027730778e-05, + "loss": 1.4265, + "step": 5315 + }, + { + "epoch": 0.12398580786981024, + "grad_norm": 1.63262939453125, + "learning_rate": 1.9993271044785367e-05, + "loss": 1.3605, + "step": 5316 + }, + { + "epoch": 0.12400913100898815, + "grad_norm": 2.261629581451416, + "learning_rate": 1.9993261805926957e-05, + "loss": 1.7492, + "step": 5317 + }, + { + "epoch": 0.12403245414816608, + "grad_norm": 1.7597124576568604, + "learning_rate": 1.9993252560732558e-05, + "loss": 1.5009, + "step": 5318 + }, + { + "epoch": 0.12405577728734399, + "grad_norm": 1.7686125040054321, + "learning_rate": 1.9993243309202172e-05, + "loss": 1.3649, + "step": 5319 + }, + { + "epoch": 0.12407910042652191, + "grad_norm": 1.6629806756973267, + "learning_rate": 1.9993234051335807e-05, + "loss": 1.5079, + "step": 5320 + }, + { + "epoch": 0.12410242356569982, + "grad_norm": 1.739463448524475, + "learning_rate": 1.999322478713347e-05, + "loss": 1.3648, + "step": 5321 + }, + { + "epoch": 0.12412574670487775, + "grad_norm": 1.677564263343811, + "learning_rate": 1.999321551659517e-05, + "loss": 1.3415, + "step": 5322 + }, + { + "epoch": 0.12414906984405566, + "grad_norm": 2.385530710220337, + "learning_rate": 1.9993206239720905e-05, + "loss": 1.5282, + "step": 5323 + }, + { + "epoch": 0.12417239298323358, + "grad_norm": 1.9856312274932861, + "learning_rate": 1.9993196956510688e-05, + "loss": 1.3629, + "step": 5324 + }, + { + "epoch": 0.1241957161224115, + "grad_norm": 1.944859266281128, + "learning_rate": 1.999318766696452e-05, + "loss": 1.4692, + "step": 5325 + }, + { + "epoch": 0.12421903926158942, + "grad_norm": 1.8123466968536377, + "learning_rate": 1.9993178371082407e-05, + "loss": 1.2638, + "step": 5326 + }, + { + "epoch": 0.12424236240076733, + "grad_norm": 2.0977327823638916, + "learning_rate": 1.9993169068864357e-05, + "loss": 1.5563, + "step": 5327 + }, + { + "epoch": 0.12426568553994524, + "grad_norm": 2.071078062057495, + "learning_rate": 1.9993159760310378e-05, + "loss": 1.5508, + "step": 5328 + }, + { + "epoch": 0.12428900867912317, + "grad_norm": 1.8341642618179321, + "learning_rate": 1.9993150445420468e-05, + "loss": 1.6579, + "step": 5329 + }, + { + "epoch": 0.12431233181830108, + "grad_norm": 2.746169328689575, + "learning_rate": 1.9993141124194643e-05, + "loss": 1.1951, + "step": 5330 + }, + { + "epoch": 0.124335654957479, + "grad_norm": 2.1346287727355957, + "learning_rate": 1.9993131796632903e-05, + "loss": 1.1677, + "step": 5331 + }, + { + "epoch": 0.12435897809665691, + "grad_norm": 1.6425179243087769, + "learning_rate": 1.9993122462735255e-05, + "loss": 1.4173, + "step": 5332 + }, + { + "epoch": 0.12438230123583484, + "grad_norm": 2.0015392303466797, + "learning_rate": 1.9993113122501705e-05, + "loss": 1.095, + "step": 5333 + }, + { + "epoch": 0.12440562437501275, + "grad_norm": 2.0040698051452637, + "learning_rate": 1.999310377593226e-05, + "loss": 1.6499, + "step": 5334 + }, + { + "epoch": 0.12442894751419067, + "grad_norm": 2.078094244003296, + "learning_rate": 1.9993094423026923e-05, + "loss": 1.7472, + "step": 5335 + }, + { + "epoch": 0.12445227065336859, + "grad_norm": 1.9659695625305176, + "learning_rate": 1.9993085063785703e-05, + "loss": 1.1946, + "step": 5336 + }, + { + "epoch": 0.12447559379254651, + "grad_norm": 2.4215340614318848, + "learning_rate": 1.9993075698208603e-05, + "loss": 1.4901, + "step": 5337 + }, + { + "epoch": 0.12449891693172442, + "grad_norm": 1.626921534538269, + "learning_rate": 1.9993066326295635e-05, + "loss": 1.5455, + "step": 5338 + }, + { + "epoch": 0.12452224007090235, + "grad_norm": 1.913625955581665, + "learning_rate": 1.9993056948046796e-05, + "loss": 1.6662, + "step": 5339 + }, + { + "epoch": 0.12454556321008026, + "grad_norm": 2.2055249214172363, + "learning_rate": 1.9993047563462097e-05, + "loss": 1.4621, + "step": 5340 + }, + { + "epoch": 0.12456888634925818, + "grad_norm": 1.74037766456604, + "learning_rate": 1.9993038172541547e-05, + "loss": 1.1442, + "step": 5341 + }, + { + "epoch": 0.12459220948843609, + "grad_norm": 1.6794970035552979, + "learning_rate": 1.9993028775285147e-05, + "loss": 1.2495, + "step": 5342 + }, + { + "epoch": 0.12461553262761402, + "grad_norm": 1.421152949333191, + "learning_rate": 1.9993019371692903e-05, + "loss": 1.309, + "step": 5343 + }, + { + "epoch": 0.12463885576679193, + "grad_norm": 1.4981755018234253, + "learning_rate": 1.999300996176482e-05, + "loss": 1.6778, + "step": 5344 + }, + { + "epoch": 0.12466217890596985, + "grad_norm": 1.900719404220581, + "learning_rate": 1.999300054550091e-05, + "loss": 1.3013, + "step": 5345 + }, + { + "epoch": 0.12468550204514776, + "grad_norm": 1.4546258449554443, + "learning_rate": 1.9992991122901176e-05, + "loss": 1.264, + "step": 5346 + }, + { + "epoch": 0.12470882518432569, + "grad_norm": 1.4211875200271606, + "learning_rate": 1.999298169396562e-05, + "loss": 1.169, + "step": 5347 + }, + { + "epoch": 0.1247321483235036, + "grad_norm": 2.315979480743408, + "learning_rate": 1.9992972258694253e-05, + "loss": 1.3283, + "step": 5348 + }, + { + "epoch": 0.12475547146268152, + "grad_norm": 1.5772291421890259, + "learning_rate": 1.999296281708708e-05, + "loss": 1.4897, + "step": 5349 + }, + { + "epoch": 0.12477879460185944, + "grad_norm": 1.7055156230926514, + "learning_rate": 1.9992953369144105e-05, + "loss": 1.4096, + "step": 5350 + }, + { + "epoch": 0.12480211774103736, + "grad_norm": 1.9063371419906616, + "learning_rate": 1.9992943914865335e-05, + "loss": 1.5404, + "step": 5351 + }, + { + "epoch": 0.12482544088021527, + "grad_norm": 1.9070608615875244, + "learning_rate": 1.9992934454250775e-05, + "loss": 1.0824, + "step": 5352 + }, + { + "epoch": 0.1248487640193932, + "grad_norm": 4.212851524353027, + "learning_rate": 1.9992924987300436e-05, + "loss": 1.8992, + "step": 5353 + }, + { + "epoch": 0.12487208715857111, + "grad_norm": 1.9343132972717285, + "learning_rate": 1.9992915514014318e-05, + "loss": 1.249, + "step": 5354 + }, + { + "epoch": 0.12489541029774903, + "grad_norm": 1.4675266742706299, + "learning_rate": 1.999290603439243e-05, + "loss": 1.1419, + "step": 5355 + }, + { + "epoch": 0.12491873343692694, + "grad_norm": 1.9047664403915405, + "learning_rate": 1.9992896548434777e-05, + "loss": 1.2167, + "step": 5356 + }, + { + "epoch": 0.12494205657610485, + "grad_norm": 1.951023817062378, + "learning_rate": 1.9992887056141365e-05, + "loss": 1.4933, + "step": 5357 + }, + { + "epoch": 0.12496537971528278, + "grad_norm": 1.965632438659668, + "learning_rate": 1.99928775575122e-05, + "loss": 1.5671, + "step": 5358 + }, + { + "epoch": 0.12498870285446069, + "grad_norm": 1.7256234884262085, + "learning_rate": 1.999286805254729e-05, + "loss": 1.3747, + "step": 5359 + }, + { + "epoch": 0.12501202599363861, + "grad_norm": 1.7359243631362915, + "learning_rate": 1.9992858541246635e-05, + "loss": 1.4043, + "step": 5360 + }, + { + "epoch": 0.12503534913281653, + "grad_norm": 1.7163021564483643, + "learning_rate": 1.999284902361025e-05, + "loss": 1.0962, + "step": 5361 + }, + { + "epoch": 0.12505867227199444, + "grad_norm": 1.806894302368164, + "learning_rate": 1.999283949963813e-05, + "loss": 1.1988, + "step": 5362 + }, + { + "epoch": 0.12508199541117238, + "grad_norm": 1.628098964691162, + "learning_rate": 1.9992829969330297e-05, + "loss": 1.4655, + "step": 5363 + }, + { + "epoch": 0.1251053185503503, + "grad_norm": 23.135787963867188, + "learning_rate": 1.999282043268674e-05, + "loss": 0.9943, + "step": 5364 + }, + { + "epoch": 0.1251286416895282, + "grad_norm": 1.7356553077697754, + "learning_rate": 1.9992810889707477e-05, + "loss": 1.6763, + "step": 5365 + }, + { + "epoch": 0.1251519648287061, + "grad_norm": 1.85964834690094, + "learning_rate": 1.9992801340392508e-05, + "loss": 1.5229, + "step": 5366 + }, + { + "epoch": 0.12517528796788405, + "grad_norm": 1.9864522218704224, + "learning_rate": 1.9992791784741837e-05, + "loss": 1.3506, + "step": 5367 + }, + { + "epoch": 0.12519861110706196, + "grad_norm": 1.5905979871749878, + "learning_rate": 1.999278222275548e-05, + "loss": 1.5013, + "step": 5368 + }, + { + "epoch": 0.12522193424623987, + "grad_norm": 1.534953236579895, + "learning_rate": 1.9992772654433433e-05, + "loss": 1.1682, + "step": 5369 + }, + { + "epoch": 0.12524525738541778, + "grad_norm": 1.7068012952804565, + "learning_rate": 1.9992763079775702e-05, + "loss": 1.5048, + "step": 5370 + }, + { + "epoch": 0.12526858052459572, + "grad_norm": 1.8716068267822266, + "learning_rate": 1.9992753498782304e-05, + "loss": 1.6322, + "step": 5371 + }, + { + "epoch": 0.12529190366377363, + "grad_norm": 1.9307128190994263, + "learning_rate": 1.9992743911453235e-05, + "loss": 1.3273, + "step": 5372 + }, + { + "epoch": 0.12531522680295154, + "grad_norm": 1.5258667469024658, + "learning_rate": 1.9992734317788502e-05, + "loss": 1.2468, + "step": 5373 + }, + { + "epoch": 0.12533854994212945, + "grad_norm": 2.5228090286254883, + "learning_rate": 1.9992724717788118e-05, + "loss": 1.5027, + "step": 5374 + }, + { + "epoch": 0.1253618730813074, + "grad_norm": 1.451471209526062, + "learning_rate": 1.999271511145208e-05, + "loss": 1.4452, + "step": 5375 + }, + { + "epoch": 0.1253851962204853, + "grad_norm": 1.7861031293869019, + "learning_rate": 1.99927054987804e-05, + "loss": 1.5037, + "step": 5376 + }, + { + "epoch": 0.1254085193596632, + "grad_norm": 2.0532500743865967, + "learning_rate": 1.999269587977308e-05, + "loss": 1.6209, + "step": 5377 + }, + { + "epoch": 0.12543184249884112, + "grad_norm": 2.0508968830108643, + "learning_rate": 1.9992686254430128e-05, + "loss": 1.2473, + "step": 5378 + }, + { + "epoch": 0.12545516563801906, + "grad_norm": 1.5398067235946655, + "learning_rate": 1.9992676622751556e-05, + "loss": 1.5357, + "step": 5379 + }, + { + "epoch": 0.12547848877719697, + "grad_norm": 1.7507586479187012, + "learning_rate": 1.999266698473736e-05, + "loss": 1.6371, + "step": 5380 + }, + { + "epoch": 0.12550181191637488, + "grad_norm": 1.5591888427734375, + "learning_rate": 1.9992657340387554e-05, + "loss": 0.9845, + "step": 5381 + }, + { + "epoch": 0.1255251350555528, + "grad_norm": 2.0980377197265625, + "learning_rate": 1.9992647689702138e-05, + "loss": 1.3421, + "step": 5382 + }, + { + "epoch": 0.12554845819473073, + "grad_norm": 1.6887876987457275, + "learning_rate": 1.9992638032681123e-05, + "loss": 1.4285, + "step": 5383 + }, + { + "epoch": 0.12557178133390864, + "grad_norm": 2.206519842147827, + "learning_rate": 1.999262836932451e-05, + "loss": 1.7444, + "step": 5384 + }, + { + "epoch": 0.12559510447308656, + "grad_norm": 1.93436598777771, + "learning_rate": 1.9992618699632313e-05, + "loss": 1.5265, + "step": 5385 + }, + { + "epoch": 0.12561842761226447, + "grad_norm": 1.553687572479248, + "learning_rate": 1.999260902360453e-05, + "loss": 1.402, + "step": 5386 + }, + { + "epoch": 0.12564175075144238, + "grad_norm": 1.9996453523635864, + "learning_rate": 1.999259934124117e-05, + "loss": 1.5589, + "step": 5387 + }, + { + "epoch": 0.12566507389062032, + "grad_norm": 1.6471625566482544, + "learning_rate": 1.999258965254224e-05, + "loss": 1.5799, + "step": 5388 + }, + { + "epoch": 0.12568839702979823, + "grad_norm": 1.766697883605957, + "learning_rate": 1.999257995750775e-05, + "loss": 1.5823, + "step": 5389 + }, + { + "epoch": 0.12571172016897614, + "grad_norm": 2.0508477687835693, + "learning_rate": 1.99925702561377e-05, + "loss": 1.6147, + "step": 5390 + }, + { + "epoch": 0.12573504330815405, + "grad_norm": 1.7768009901046753, + "learning_rate": 1.9992560548432095e-05, + "loss": 1.5132, + "step": 5391 + }, + { + "epoch": 0.125758366447332, + "grad_norm": 1.7441701889038086, + "learning_rate": 1.9992550834390945e-05, + "loss": 1.7477, + "step": 5392 + }, + { + "epoch": 0.1257816895865099, + "grad_norm": 1.9432305097579956, + "learning_rate": 1.9992541114014258e-05, + "loss": 1.312, + "step": 5393 + }, + { + "epoch": 0.1258050127256878, + "grad_norm": 1.5495012998580933, + "learning_rate": 1.9992531387302038e-05, + "loss": 1.5193, + "step": 5394 + }, + { + "epoch": 0.12582833586486572, + "grad_norm": 1.9450881481170654, + "learning_rate": 1.999252165425429e-05, + "loss": 1.3406, + "step": 5395 + }, + { + "epoch": 0.12585165900404366, + "grad_norm": 1.2811824083328247, + "learning_rate": 1.999251191487102e-05, + "loss": 1.2533, + "step": 5396 + }, + { + "epoch": 0.12587498214322157, + "grad_norm": 1.9915629625320435, + "learning_rate": 1.9992502169152233e-05, + "loss": 1.7604, + "step": 5397 + }, + { + "epoch": 0.12589830528239948, + "grad_norm": 1.8594021797180176, + "learning_rate": 1.999249241709794e-05, + "loss": 1.525, + "step": 5398 + }, + { + "epoch": 0.1259216284215774, + "grad_norm": 1.283684253692627, + "learning_rate": 1.9992482658708144e-05, + "loss": 0.7236, + "step": 5399 + }, + { + "epoch": 0.12594495156075533, + "grad_norm": 1.7537599802017212, + "learning_rate": 1.999247289398285e-05, + "loss": 1.5792, + "step": 5400 + }, + { + "epoch": 0.12596827469993324, + "grad_norm": 1.8533450365066528, + "learning_rate": 1.999246312292207e-05, + "loss": 1.3474, + "step": 5401 + }, + { + "epoch": 0.12599159783911115, + "grad_norm": 1.9097071886062622, + "learning_rate": 1.9992453345525803e-05, + "loss": 1.2923, + "step": 5402 + }, + { + "epoch": 0.12601492097828906, + "grad_norm": 1.326207160949707, + "learning_rate": 1.9992443561794056e-05, + "loss": 1.2066, + "step": 5403 + }, + { + "epoch": 0.126038244117467, + "grad_norm": 1.9948997497558594, + "learning_rate": 1.999243377172684e-05, + "loss": 1.3208, + "step": 5404 + }, + { + "epoch": 0.1260615672566449, + "grad_norm": 1.56023108959198, + "learning_rate": 1.9992423975324163e-05, + "loss": 1.3878, + "step": 5405 + }, + { + "epoch": 0.12608489039582282, + "grad_norm": 1.7121708393096924, + "learning_rate": 1.9992414172586023e-05, + "loss": 1.2812, + "step": 5406 + }, + { + "epoch": 0.12610821353500073, + "grad_norm": 1.687635898590088, + "learning_rate": 1.999240436351243e-05, + "loss": 1.4624, + "step": 5407 + }, + { + "epoch": 0.12613153667417867, + "grad_norm": 1.7005701065063477, + "learning_rate": 1.9992394548103387e-05, + "loss": 0.9738, + "step": 5408 + }, + { + "epoch": 0.12615485981335658, + "grad_norm": 1.6180371046066284, + "learning_rate": 1.9992384726358906e-05, + "loss": 1.2548, + "step": 5409 + }, + { + "epoch": 0.1261781829525345, + "grad_norm": 1.5682753324508667, + "learning_rate": 1.9992374898278994e-05, + "loss": 1.6364, + "step": 5410 + }, + { + "epoch": 0.1262015060917124, + "grad_norm": 2.1962804794311523, + "learning_rate": 1.9992365063863648e-05, + "loss": 1.3374, + "step": 5411 + }, + { + "epoch": 0.12622482923089035, + "grad_norm": 1.8850183486938477, + "learning_rate": 1.9992355223112885e-05, + "loss": 1.5913, + "step": 5412 + }, + { + "epoch": 0.12624815237006826, + "grad_norm": 1.8761965036392212, + "learning_rate": 1.9992345376026706e-05, + "loss": 1.5692, + "step": 5413 + }, + { + "epoch": 0.12627147550924617, + "grad_norm": 1.4552438259124756, + "learning_rate": 1.9992335522605116e-05, + "loss": 1.2213, + "step": 5414 + }, + { + "epoch": 0.12629479864842408, + "grad_norm": 1.7658276557922363, + "learning_rate": 1.9992325662848123e-05, + "loss": 1.5189, + "step": 5415 + }, + { + "epoch": 0.126318121787602, + "grad_norm": 1.5983761548995972, + "learning_rate": 1.9992315796755734e-05, + "loss": 1.5318, + "step": 5416 + }, + { + "epoch": 0.12634144492677993, + "grad_norm": 1.8737430572509766, + "learning_rate": 1.999230592432795e-05, + "loss": 1.3566, + "step": 5417 + }, + { + "epoch": 0.12636476806595784, + "grad_norm": 1.7715520858764648, + "learning_rate": 1.999229604556479e-05, + "loss": 1.4259, + "step": 5418 + }, + { + "epoch": 0.12638809120513575, + "grad_norm": 1.8800181150436401, + "learning_rate": 1.9992286160466245e-05, + "loss": 1.8555, + "step": 5419 + }, + { + "epoch": 0.12641141434431366, + "grad_norm": 1.9288504123687744, + "learning_rate": 1.9992276269032334e-05, + "loss": 1.5633, + "step": 5420 + }, + { + "epoch": 0.1264347374834916, + "grad_norm": 1.8111616373062134, + "learning_rate": 1.9992266371263054e-05, + "loss": 1.3368, + "step": 5421 + }, + { + "epoch": 0.1264580606226695, + "grad_norm": 1.5306679010391235, + "learning_rate": 1.9992256467158414e-05, + "loss": 1.1889, + "step": 5422 + }, + { + "epoch": 0.12648138376184742, + "grad_norm": 1.628777027130127, + "learning_rate": 1.9992246556718423e-05, + "loss": 1.5189, + "step": 5423 + }, + { + "epoch": 0.12650470690102533, + "grad_norm": 1.5897852182388306, + "learning_rate": 1.9992236639943085e-05, + "loss": 1.439, + "step": 5424 + }, + { + "epoch": 0.12652803004020327, + "grad_norm": 1.8103121519088745, + "learning_rate": 1.999222671683241e-05, + "loss": 1.6832, + "step": 5425 + }, + { + "epoch": 0.12655135317938118, + "grad_norm": 1.8806496858596802, + "learning_rate": 1.9992216787386393e-05, + "loss": 1.4253, + "step": 5426 + }, + { + "epoch": 0.1265746763185591, + "grad_norm": 1.582359790802002, + "learning_rate": 1.9992206851605053e-05, + "loss": 1.4256, + "step": 5427 + }, + { + "epoch": 0.126597999457737, + "grad_norm": 2.1066372394561768, + "learning_rate": 1.9992196909488394e-05, + "loss": 2.1375, + "step": 5428 + }, + { + "epoch": 0.12662132259691494, + "grad_norm": 1.5111849308013916, + "learning_rate": 1.9992186961036416e-05, + "loss": 1.2058, + "step": 5429 + }, + { + "epoch": 0.12664464573609285, + "grad_norm": 1.3043365478515625, + "learning_rate": 1.9992177006249126e-05, + "loss": 1.2673, + "step": 5430 + }, + { + "epoch": 0.12666796887527076, + "grad_norm": 2.044915199279785, + "learning_rate": 1.999216704512654e-05, + "loss": 1.6466, + "step": 5431 + }, + { + "epoch": 0.12669129201444868, + "grad_norm": 1.512298345565796, + "learning_rate": 1.9992157077668656e-05, + "loss": 1.625, + "step": 5432 + }, + { + "epoch": 0.12671461515362661, + "grad_norm": 1.5365688800811768, + "learning_rate": 1.999214710387548e-05, + "loss": 1.1824, + "step": 5433 + }, + { + "epoch": 0.12673793829280453, + "grad_norm": 1.6051990985870361, + "learning_rate": 1.9992137123747023e-05, + "loss": 1.5006, + "step": 5434 + }, + { + "epoch": 0.12676126143198244, + "grad_norm": 1.6746844053268433, + "learning_rate": 1.9992127137283287e-05, + "loss": 1.7264, + "step": 5435 + }, + { + "epoch": 0.12678458457116035, + "grad_norm": 1.9029208421707153, + "learning_rate": 1.999211714448428e-05, + "loss": 1.5162, + "step": 5436 + }, + { + "epoch": 0.12680790771033829, + "grad_norm": 2.1730802059173584, + "learning_rate": 1.9992107145350007e-05, + "loss": 1.5358, + "step": 5437 + }, + { + "epoch": 0.1268312308495162, + "grad_norm": 1.850447416305542, + "learning_rate": 1.999209713988048e-05, + "loss": 1.5311, + "step": 5438 + }, + { + "epoch": 0.1268545539886941, + "grad_norm": 1.932978868484497, + "learning_rate": 1.99920871280757e-05, + "loss": 1.8633, + "step": 5439 + }, + { + "epoch": 0.12687787712787202, + "grad_norm": 2.062108278274536, + "learning_rate": 1.999207710993567e-05, + "loss": 1.3954, + "step": 5440 + }, + { + "epoch": 0.12690120026704996, + "grad_norm": 1.5972346067428589, + "learning_rate": 1.9992067085460404e-05, + "loss": 1.6961, + "step": 5441 + }, + { + "epoch": 0.12692452340622787, + "grad_norm": 1.686582088470459, + "learning_rate": 1.9992057054649905e-05, + "loss": 1.4629, + "step": 5442 + }, + { + "epoch": 0.12694784654540578, + "grad_norm": 2.015822649002075, + "learning_rate": 1.9992047017504183e-05, + "loss": 1.6751, + "step": 5443 + }, + { + "epoch": 0.1269711696845837, + "grad_norm": 1.7255650758743286, + "learning_rate": 1.9992036974023233e-05, + "loss": 1.3924, + "step": 5444 + }, + { + "epoch": 0.1269944928237616, + "grad_norm": 1.890475869178772, + "learning_rate": 1.9992026924207077e-05, + "loss": 1.295, + "step": 5445 + }, + { + "epoch": 0.12701781596293954, + "grad_norm": 1.798271656036377, + "learning_rate": 1.9992016868055707e-05, + "loss": 1.6618, + "step": 5446 + }, + { + "epoch": 0.12704113910211745, + "grad_norm": 1.8091495037078857, + "learning_rate": 1.999200680556914e-05, + "loss": 1.3049, + "step": 5447 + }, + { + "epoch": 0.12706446224129536, + "grad_norm": 1.6635726690292358, + "learning_rate": 1.9991996736747377e-05, + "loss": 1.5212, + "step": 5448 + }, + { + "epoch": 0.12708778538047327, + "grad_norm": 1.3743265867233276, + "learning_rate": 1.9991986661590428e-05, + "loss": 1.3127, + "step": 5449 + }, + { + "epoch": 0.1271111085196512, + "grad_norm": 1.867031216621399, + "learning_rate": 1.9991976580098295e-05, + "loss": 1.6738, + "step": 5450 + }, + { + "epoch": 0.12713443165882912, + "grad_norm": 1.4890648126602173, + "learning_rate": 1.9991966492270986e-05, + "loss": 1.4069, + "step": 5451 + }, + { + "epoch": 0.12715775479800703, + "grad_norm": 1.666800618171692, + "learning_rate": 1.9991956398108508e-05, + "loss": 1.4785, + "step": 5452 + }, + { + "epoch": 0.12718107793718494, + "grad_norm": 2.0823330879211426, + "learning_rate": 1.999194629761087e-05, + "loss": 1.5522, + "step": 5453 + }, + { + "epoch": 0.12720440107636288, + "grad_norm": 1.3950648307800293, + "learning_rate": 1.9991936190778074e-05, + "loss": 1.0546, + "step": 5454 + }, + { + "epoch": 0.1272277242155408, + "grad_norm": 1.78944730758667, + "learning_rate": 1.9991926077610128e-05, + "loss": 1.5421, + "step": 5455 + }, + { + "epoch": 0.1272510473547187, + "grad_norm": 1.553252935409546, + "learning_rate": 1.999191595810704e-05, + "loss": 1.1469, + "step": 5456 + }, + { + "epoch": 0.12727437049389662, + "grad_norm": 1.5558414459228516, + "learning_rate": 1.9991905832268812e-05, + "loss": 1.3023, + "step": 5457 + }, + { + "epoch": 0.12729769363307455, + "grad_norm": 1.6362791061401367, + "learning_rate": 1.9991895700095456e-05, + "loss": 1.4798, + "step": 5458 + }, + { + "epoch": 0.12732101677225247, + "grad_norm": 1.833692193031311, + "learning_rate": 1.9991885561586974e-05, + "loss": 1.2441, + "step": 5459 + }, + { + "epoch": 0.12734433991143038, + "grad_norm": 2.2851738929748535, + "learning_rate": 1.9991875416743374e-05, + "loss": 1.56, + "step": 5460 + }, + { + "epoch": 0.1273676630506083, + "grad_norm": 1.321982741355896, + "learning_rate": 1.9991865265564665e-05, + "loss": 1.4398, + "step": 5461 + }, + { + "epoch": 0.12739098618978623, + "grad_norm": 1.610442876815796, + "learning_rate": 1.9991855108050848e-05, + "loss": 1.4944, + "step": 5462 + }, + { + "epoch": 0.12741430932896414, + "grad_norm": 1.7381945848464966, + "learning_rate": 1.9991844944201935e-05, + "loss": 1.6252, + "step": 5463 + }, + { + "epoch": 0.12743763246814205, + "grad_norm": 1.8495548963546753, + "learning_rate": 1.9991834774017928e-05, + "loss": 1.6641, + "step": 5464 + }, + { + "epoch": 0.12746095560731996, + "grad_norm": 1.7316395044326782, + "learning_rate": 1.9991824597498836e-05, + "loss": 1.5605, + "step": 5465 + }, + { + "epoch": 0.1274842787464979, + "grad_norm": 1.7098296880722046, + "learning_rate": 1.9991814414644666e-05, + "loss": 1.4834, + "step": 5466 + }, + { + "epoch": 0.1275076018856758, + "grad_norm": 1.9938747882843018, + "learning_rate": 1.9991804225455422e-05, + "loss": 1.5607, + "step": 5467 + }, + { + "epoch": 0.12753092502485372, + "grad_norm": 2.192286491394043, + "learning_rate": 1.9991794029931113e-05, + "loss": 1.6055, + "step": 5468 + }, + { + "epoch": 0.12755424816403163, + "grad_norm": 1.9479405879974365, + "learning_rate": 1.9991783828071746e-05, + "loss": 1.1237, + "step": 5469 + }, + { + "epoch": 0.12757757130320957, + "grad_norm": 1.8798227310180664, + "learning_rate": 1.9991773619877325e-05, + "loss": 1.4722, + "step": 5470 + }, + { + "epoch": 0.12760089444238748, + "grad_norm": 1.7707977294921875, + "learning_rate": 1.9991763405347853e-05, + "loss": 1.3561, + "step": 5471 + }, + { + "epoch": 0.1276242175815654, + "grad_norm": 2.2476325035095215, + "learning_rate": 1.999175318448334e-05, + "loss": 1.5415, + "step": 5472 + }, + { + "epoch": 0.1276475407207433, + "grad_norm": 2.1162397861480713, + "learning_rate": 1.99917429572838e-05, + "loss": 1.7181, + "step": 5473 + }, + { + "epoch": 0.1276708638599212, + "grad_norm": 1.7683426141738892, + "learning_rate": 1.9991732723749228e-05, + "loss": 1.2474, + "step": 5474 + }, + { + "epoch": 0.12769418699909915, + "grad_norm": 1.456568956375122, + "learning_rate": 1.999172248387964e-05, + "loss": 1.3521, + "step": 5475 + }, + { + "epoch": 0.12771751013827706, + "grad_norm": 1.7632622718811035, + "learning_rate": 1.9991712237675033e-05, + "loss": 1.468, + "step": 5476 + }, + { + "epoch": 0.12774083327745497, + "grad_norm": 1.6316269636154175, + "learning_rate": 1.9991701985135418e-05, + "loss": 1.579, + "step": 5477 + }, + { + "epoch": 0.12776415641663288, + "grad_norm": 1.7061409950256348, + "learning_rate": 1.9991691726260803e-05, + "loss": 1.2391, + "step": 5478 + }, + { + "epoch": 0.12778747955581082, + "grad_norm": 1.5627297163009644, + "learning_rate": 1.9991681461051192e-05, + "loss": 1.4331, + "step": 5479 + }, + { + "epoch": 0.12781080269498873, + "grad_norm": 1.62676203250885, + "learning_rate": 1.9991671189506594e-05, + "loss": 1.2626, + "step": 5480 + }, + { + "epoch": 0.12783412583416665, + "grad_norm": 1.7847851514816284, + "learning_rate": 1.9991660911627013e-05, + "loss": 1.4807, + "step": 5481 + }, + { + "epoch": 0.12785744897334456, + "grad_norm": 1.6543134450912476, + "learning_rate": 1.999165062741246e-05, + "loss": 1.5356, + "step": 5482 + }, + { + "epoch": 0.1278807721125225, + "grad_norm": 1.5396884679794312, + "learning_rate": 1.9991640336862933e-05, + "loss": 1.2102, + "step": 5483 + }, + { + "epoch": 0.1279040952517004, + "grad_norm": 1.7303301095962524, + "learning_rate": 1.9991630039978446e-05, + "loss": 1.4038, + "step": 5484 + }, + { + "epoch": 0.12792741839087832, + "grad_norm": 1.7263692617416382, + "learning_rate": 1.9991619736759004e-05, + "loss": 1.2765, + "step": 5485 + }, + { + "epoch": 0.12795074153005623, + "grad_norm": 1.686104655265808, + "learning_rate": 1.9991609427204612e-05, + "loss": 1.4193, + "step": 5486 + }, + { + "epoch": 0.12797406466923417, + "grad_norm": 1.9811973571777344, + "learning_rate": 1.999159911131528e-05, + "loss": 1.2459, + "step": 5487 + }, + { + "epoch": 0.12799738780841208, + "grad_norm": 2.3382718563079834, + "learning_rate": 1.999158878909101e-05, + "loss": 1.1248, + "step": 5488 + }, + { + "epoch": 0.12802071094759, + "grad_norm": 1.6756922006607056, + "learning_rate": 1.9991578460531807e-05, + "loss": 1.2561, + "step": 5489 + }, + { + "epoch": 0.1280440340867679, + "grad_norm": 2.043768882751465, + "learning_rate": 1.9991568125637682e-05, + "loss": 1.0274, + "step": 5490 + }, + { + "epoch": 0.12806735722594584, + "grad_norm": 1.6697497367858887, + "learning_rate": 1.9991557784408643e-05, + "loss": 1.6783, + "step": 5491 + }, + { + "epoch": 0.12809068036512375, + "grad_norm": 1.409879446029663, + "learning_rate": 1.9991547436844695e-05, + "loss": 0.8973, + "step": 5492 + }, + { + "epoch": 0.12811400350430166, + "grad_norm": 2.2282512187957764, + "learning_rate": 1.999153708294584e-05, + "loss": 1.3042, + "step": 5493 + }, + { + "epoch": 0.12813732664347957, + "grad_norm": 1.9955637454986572, + "learning_rate": 1.999152672271209e-05, + "loss": 1.3585, + "step": 5494 + }, + { + "epoch": 0.1281606497826575, + "grad_norm": 1.7140265703201294, + "learning_rate": 1.999151635614345e-05, + "loss": 1.2718, + "step": 5495 + }, + { + "epoch": 0.12818397292183542, + "grad_norm": 1.758014440536499, + "learning_rate": 1.9991505983239925e-05, + "loss": 1.2066, + "step": 5496 + }, + { + "epoch": 0.12820729606101333, + "grad_norm": 1.6678245067596436, + "learning_rate": 1.9991495604001525e-05, + "loss": 1.3076, + "step": 5497 + }, + { + "epoch": 0.12823061920019124, + "grad_norm": 1.4391366243362427, + "learning_rate": 1.9991485218428254e-05, + "loss": 1.4211, + "step": 5498 + }, + { + "epoch": 0.12825394233936915, + "grad_norm": 1.9126677513122559, + "learning_rate": 1.9991474826520115e-05, + "loss": 1.7957, + "step": 5499 + }, + { + "epoch": 0.1282772654785471, + "grad_norm": 2.7926347255706787, + "learning_rate": 1.9991464428277126e-05, + "loss": 1.1987, + "step": 5500 + }, + { + "epoch": 0.128300588617725, + "grad_norm": 2.019442081451416, + "learning_rate": 1.999145402369928e-05, + "loss": 1.3248, + "step": 5501 + }, + { + "epoch": 0.12832391175690291, + "grad_norm": 1.7478313446044922, + "learning_rate": 1.9991443612786593e-05, + "loss": 1.6049, + "step": 5502 + }, + { + "epoch": 0.12834723489608083, + "grad_norm": 1.6614384651184082, + "learning_rate": 1.9991433195539067e-05, + "loss": 1.3455, + "step": 5503 + }, + { + "epoch": 0.12837055803525876, + "grad_norm": 1.4363422393798828, + "learning_rate": 1.999142277195671e-05, + "loss": 1.3315, + "step": 5504 + }, + { + "epoch": 0.12839388117443667, + "grad_norm": 1.639754056930542, + "learning_rate": 1.999141234203953e-05, + "loss": 1.4283, + "step": 5505 + }, + { + "epoch": 0.12841720431361459, + "grad_norm": 1.5692765712738037, + "learning_rate": 1.9991401905787532e-05, + "loss": 1.2873, + "step": 5506 + }, + { + "epoch": 0.1284405274527925, + "grad_norm": 2.008570671081543, + "learning_rate": 1.999139146320072e-05, + "loss": 1.2838, + "step": 5507 + }, + { + "epoch": 0.12846385059197044, + "grad_norm": 2.0946836471557617, + "learning_rate": 1.999138101427911e-05, + "loss": 1.2448, + "step": 5508 + }, + { + "epoch": 0.12848717373114835, + "grad_norm": 1.8349133729934692, + "learning_rate": 1.9991370559022698e-05, + "loss": 1.7411, + "step": 5509 + }, + { + "epoch": 0.12851049687032626, + "grad_norm": 1.7716482877731323, + "learning_rate": 1.9991360097431493e-05, + "loss": 1.3124, + "step": 5510 + }, + { + "epoch": 0.12853382000950417, + "grad_norm": 1.7455962896347046, + "learning_rate": 1.9991349629505505e-05, + "loss": 1.1583, + "step": 5511 + }, + { + "epoch": 0.1285571431486821, + "grad_norm": 1.9326730966567993, + "learning_rate": 1.999133915524474e-05, + "loss": 1.6983, + "step": 5512 + }, + { + "epoch": 0.12858046628786002, + "grad_norm": 1.992364764213562, + "learning_rate": 1.9991328674649202e-05, + "loss": 1.2729, + "step": 5513 + }, + { + "epoch": 0.12860378942703793, + "grad_norm": 1.7640149593353271, + "learning_rate": 1.99913181877189e-05, + "loss": 1.4982, + "step": 5514 + }, + { + "epoch": 0.12862711256621584, + "grad_norm": 1.6854970455169678, + "learning_rate": 1.999130769445384e-05, + "loss": 1.5468, + "step": 5515 + }, + { + "epoch": 0.12865043570539378, + "grad_norm": 1.5444120168685913, + "learning_rate": 1.999129719485403e-05, + "loss": 1.2955, + "step": 5516 + }, + { + "epoch": 0.1286737588445717, + "grad_norm": 1.7126643657684326, + "learning_rate": 1.9991286688919473e-05, + "loss": 1.2174, + "step": 5517 + }, + { + "epoch": 0.1286970819837496, + "grad_norm": 1.499959945678711, + "learning_rate": 1.9991276176650182e-05, + "loss": 1.512, + "step": 5518 + }, + { + "epoch": 0.1287204051229275, + "grad_norm": 2.1801090240478516, + "learning_rate": 1.999126565804615e-05, + "loss": 1.4187, + "step": 5519 + }, + { + "epoch": 0.12874372826210545, + "grad_norm": 1.9057375192642212, + "learning_rate": 1.9991255133107402e-05, + "loss": 1.3905, + "step": 5520 + }, + { + "epoch": 0.12876705140128336, + "grad_norm": 1.7352635860443115, + "learning_rate": 1.9991244601833934e-05, + "loss": 1.4871, + "step": 5521 + }, + { + "epoch": 0.12879037454046127, + "grad_norm": 1.3701163530349731, + "learning_rate": 1.9991234064225754e-05, + "loss": 1.0181, + "step": 5522 + }, + { + "epoch": 0.12881369767963918, + "grad_norm": 1.8005037307739258, + "learning_rate": 1.9991223520282868e-05, + "loss": 1.3757, + "step": 5523 + }, + { + "epoch": 0.12883702081881712, + "grad_norm": 1.6191970109939575, + "learning_rate": 1.999121297000529e-05, + "loss": 1.5706, + "step": 5524 + }, + { + "epoch": 0.12886034395799503, + "grad_norm": 1.6916390657424927, + "learning_rate": 1.9991202413393015e-05, + "loss": 1.5843, + "step": 5525 + }, + { + "epoch": 0.12888366709717294, + "grad_norm": 1.5621001720428467, + "learning_rate": 1.9991191850446058e-05, + "loss": 1.1276, + "step": 5526 + }, + { + "epoch": 0.12890699023635085, + "grad_norm": 1.721367359161377, + "learning_rate": 1.999118128116442e-05, + "loss": 1.1073, + "step": 5527 + }, + { + "epoch": 0.12893031337552877, + "grad_norm": 1.9346935749053955, + "learning_rate": 1.999117070554811e-05, + "loss": 1.6193, + "step": 5528 + }, + { + "epoch": 0.1289536365147067, + "grad_norm": 1.5504980087280273, + "learning_rate": 1.9991160123597143e-05, + "loss": 1.5058, + "step": 5529 + }, + { + "epoch": 0.12897695965388462, + "grad_norm": 1.8259384632110596, + "learning_rate": 1.999114953531151e-05, + "loss": 1.2885, + "step": 5530 + }, + { + "epoch": 0.12900028279306253, + "grad_norm": 1.7581303119659424, + "learning_rate": 1.999113894069123e-05, + "loss": 1.5147, + "step": 5531 + }, + { + "epoch": 0.12902360593224044, + "grad_norm": 1.6676334142684937, + "learning_rate": 1.9991128339736305e-05, + "loss": 1.1922, + "step": 5532 + }, + { + "epoch": 0.12904692907141838, + "grad_norm": 1.5131217241287231, + "learning_rate": 1.9991117732446742e-05, + "loss": 1.2488, + "step": 5533 + }, + { + "epoch": 0.1290702522105963, + "grad_norm": 1.7727043628692627, + "learning_rate": 1.9991107118822546e-05, + "loss": 0.8065, + "step": 5534 + }, + { + "epoch": 0.1290935753497742, + "grad_norm": 1.340714693069458, + "learning_rate": 1.999109649886373e-05, + "loss": 1.1928, + "step": 5535 + }, + { + "epoch": 0.1291168984889521, + "grad_norm": 1.6228406429290771, + "learning_rate": 1.9991085872570294e-05, + "loss": 1.8014, + "step": 5536 + }, + { + "epoch": 0.12914022162813005, + "grad_norm": 1.5064109563827515, + "learning_rate": 1.9991075239942246e-05, + "loss": 1.5698, + "step": 5537 + }, + { + "epoch": 0.12916354476730796, + "grad_norm": 1.5737848281860352, + "learning_rate": 1.9991064600979594e-05, + "loss": 1.6916, + "step": 5538 + }, + { + "epoch": 0.12918686790648587, + "grad_norm": 1.5255625247955322, + "learning_rate": 1.999105395568235e-05, + "loss": 1.2596, + "step": 5539 + }, + { + "epoch": 0.12921019104566378, + "grad_norm": 1.6454182863235474, + "learning_rate": 1.999104330405051e-05, + "loss": 1.3305, + "step": 5540 + }, + { + "epoch": 0.12923351418484172, + "grad_norm": 3.4412131309509277, + "learning_rate": 1.9991032646084087e-05, + "loss": 1.4387, + "step": 5541 + }, + { + "epoch": 0.12925683732401963, + "grad_norm": 1.4733779430389404, + "learning_rate": 1.999102198178309e-05, + "loss": 1.1945, + "step": 5542 + }, + { + "epoch": 0.12928016046319754, + "grad_norm": 1.9390318393707275, + "learning_rate": 1.999101131114752e-05, + "loss": 1.5764, + "step": 5543 + }, + { + "epoch": 0.12930348360237545, + "grad_norm": 1.4425219297409058, + "learning_rate": 1.9991000634177385e-05, + "loss": 1.2732, + "step": 5544 + }, + { + "epoch": 0.1293268067415534, + "grad_norm": 1.736465573310852, + "learning_rate": 1.9990989950872697e-05, + "loss": 1.2525, + "step": 5545 + }, + { + "epoch": 0.1293501298807313, + "grad_norm": 2.4079246520996094, + "learning_rate": 1.9990979261233456e-05, + "loss": 1.5843, + "step": 5546 + }, + { + "epoch": 0.1293734530199092, + "grad_norm": 2.3070828914642334, + "learning_rate": 1.9990968565259673e-05, + "loss": 1.2316, + "step": 5547 + }, + { + "epoch": 0.12939677615908712, + "grad_norm": 1.598985195159912, + "learning_rate": 1.9990957862951354e-05, + "loss": 1.5797, + "step": 5548 + }, + { + "epoch": 0.12942009929826506, + "grad_norm": 1.7216198444366455, + "learning_rate": 1.9990947154308506e-05, + "loss": 1.1824, + "step": 5549 + }, + { + "epoch": 0.12944342243744297, + "grad_norm": 1.9760773181915283, + "learning_rate": 1.9990936439331135e-05, + "loss": 1.4615, + "step": 5550 + }, + { + "epoch": 0.12946674557662088, + "grad_norm": 1.9602164030075073, + "learning_rate": 1.999092571801925e-05, + "loss": 1.3384, + "step": 5551 + }, + { + "epoch": 0.1294900687157988, + "grad_norm": 1.6859123706817627, + "learning_rate": 1.999091499037285e-05, + "loss": 0.8868, + "step": 5552 + }, + { + "epoch": 0.12951339185497673, + "grad_norm": 1.6201549768447876, + "learning_rate": 1.9990904256391953e-05, + "loss": 1.1629, + "step": 5553 + }, + { + "epoch": 0.12953671499415464, + "grad_norm": 1.928131103515625, + "learning_rate": 1.9990893516076557e-05, + "loss": 1.302, + "step": 5554 + }, + { + "epoch": 0.12956003813333256, + "grad_norm": 2.15155291557312, + "learning_rate": 1.9990882769426677e-05, + "loss": 1.4763, + "step": 5555 + }, + { + "epoch": 0.12958336127251047, + "grad_norm": 2.073052167892456, + "learning_rate": 1.999087201644231e-05, + "loss": 1.5798, + "step": 5556 + }, + { + "epoch": 0.12960668441168838, + "grad_norm": 1.6741328239440918, + "learning_rate": 1.9990861257123467e-05, + "loss": 1.1659, + "step": 5557 + }, + { + "epoch": 0.12963000755086632, + "grad_norm": 1.866536021232605, + "learning_rate": 1.9990850491470158e-05, + "loss": 1.323, + "step": 5558 + }, + { + "epoch": 0.12965333069004423, + "grad_norm": 1.677143931388855, + "learning_rate": 1.999083971948239e-05, + "loss": 1.3388, + "step": 5559 + }, + { + "epoch": 0.12967665382922214, + "grad_norm": 1.5677324533462524, + "learning_rate": 1.9990828941160166e-05, + "loss": 1.3152, + "step": 5560 + }, + { + "epoch": 0.12969997696840005, + "grad_norm": 1.6078276634216309, + "learning_rate": 1.999081815650349e-05, + "loss": 1.3093, + "step": 5561 + }, + { + "epoch": 0.129723300107578, + "grad_norm": 1.6168437004089355, + "learning_rate": 1.9990807365512377e-05, + "loss": 1.318, + "step": 5562 + }, + { + "epoch": 0.1297466232467559, + "grad_norm": 1.74843168258667, + "learning_rate": 1.999079656818683e-05, + "loss": 1.6746, + "step": 5563 + }, + { + "epoch": 0.1297699463859338, + "grad_norm": 1.6328022480010986, + "learning_rate": 1.9990785764526853e-05, + "loss": 1.2749, + "step": 5564 + }, + { + "epoch": 0.12979326952511172, + "grad_norm": 1.7836475372314453, + "learning_rate": 1.999077495453246e-05, + "loss": 1.3339, + "step": 5565 + }, + { + "epoch": 0.12981659266428966, + "grad_norm": 1.7767078876495361, + "learning_rate": 1.9990764138203647e-05, + "loss": 1.2536, + "step": 5566 + }, + { + "epoch": 0.12983991580346757, + "grad_norm": 1.8038657903671265, + "learning_rate": 1.9990753315540434e-05, + "loss": 1.2296, + "step": 5567 + }, + { + "epoch": 0.12986323894264548, + "grad_norm": 1.7227442264556885, + "learning_rate": 1.9990742486542816e-05, + "loss": 1.2814, + "step": 5568 + }, + { + "epoch": 0.1298865620818234, + "grad_norm": 1.8721829652786255, + "learning_rate": 1.9990731651210805e-05, + "loss": 1.6344, + "step": 5569 + }, + { + "epoch": 0.12990988522100133, + "grad_norm": 2.031453847885132, + "learning_rate": 1.999072080954441e-05, + "loss": 1.2227, + "step": 5570 + }, + { + "epoch": 0.12993320836017924, + "grad_norm": 1.5956660509109497, + "learning_rate": 1.9990709961543636e-05, + "loss": 1.6121, + "step": 5571 + }, + { + "epoch": 0.12995653149935715, + "grad_norm": 1.3925299644470215, + "learning_rate": 1.999069910720849e-05, + "loss": 1.6553, + "step": 5572 + }, + { + "epoch": 0.12997985463853506, + "grad_norm": 1.4701756238937378, + "learning_rate": 1.9990688246538977e-05, + "loss": 1.1665, + "step": 5573 + }, + { + "epoch": 0.130003177777713, + "grad_norm": 1.8524373769760132, + "learning_rate": 1.9990677379535106e-05, + "loss": 1.7061, + "step": 5574 + }, + { + "epoch": 0.1300265009168909, + "grad_norm": 1.6537120342254639, + "learning_rate": 1.9990666506196886e-05, + "loss": 1.4967, + "step": 5575 + }, + { + "epoch": 0.13004982405606882, + "grad_norm": 1.6119301319122314, + "learning_rate": 1.9990655626524316e-05, + "loss": 1.4172, + "step": 5576 + }, + { + "epoch": 0.13007314719524674, + "grad_norm": 1.4853447675704956, + "learning_rate": 1.9990644740517407e-05, + "loss": 1.4393, + "step": 5577 + }, + { + "epoch": 0.13009647033442467, + "grad_norm": 1.6495577096939087, + "learning_rate": 1.9990633848176172e-05, + "loss": 1.3208, + "step": 5578 + }, + { + "epoch": 0.13011979347360259, + "grad_norm": 2.002061128616333, + "learning_rate": 1.9990622949500615e-05, + "loss": 1.5294, + "step": 5579 + }, + { + "epoch": 0.1301431166127805, + "grad_norm": 1.8930041790008545, + "learning_rate": 1.9990612044490735e-05, + "loss": 1.4592, + "step": 5580 + }, + { + "epoch": 0.1301664397519584, + "grad_norm": 1.770723581314087, + "learning_rate": 1.9990601133146546e-05, + "loss": 1.2854, + "step": 5581 + }, + { + "epoch": 0.13018976289113635, + "grad_norm": 1.7341417074203491, + "learning_rate": 1.9990590215468052e-05, + "loss": 1.2771, + "step": 5582 + }, + { + "epoch": 0.13021308603031426, + "grad_norm": 2.093170166015625, + "learning_rate": 1.9990579291455266e-05, + "loss": 1.5747, + "step": 5583 + }, + { + "epoch": 0.13023640916949217, + "grad_norm": 1.7738980054855347, + "learning_rate": 1.9990568361108188e-05, + "loss": 1.3147, + "step": 5584 + }, + { + "epoch": 0.13025973230867008, + "grad_norm": 1.3787620067596436, + "learning_rate": 1.9990557424426828e-05, + "loss": 1.0273, + "step": 5585 + }, + { + "epoch": 0.130283055447848, + "grad_norm": 1.4266560077667236, + "learning_rate": 1.999054648141119e-05, + "loss": 1.7451, + "step": 5586 + }, + { + "epoch": 0.13030637858702593, + "grad_norm": 1.9341579675674438, + "learning_rate": 1.9990535532061286e-05, + "loss": 1.3356, + "step": 5587 + }, + { + "epoch": 0.13032970172620384, + "grad_norm": 1.9046876430511475, + "learning_rate": 1.9990524576377118e-05, + "loss": 1.7924, + "step": 5588 + }, + { + "epoch": 0.13035302486538175, + "grad_norm": 1.904853343963623, + "learning_rate": 1.9990513614358695e-05, + "loss": 1.8474, + "step": 5589 + }, + { + "epoch": 0.13037634800455966, + "grad_norm": 1.645563006401062, + "learning_rate": 1.9990502646006025e-05, + "loss": 1.309, + "step": 5590 + }, + { + "epoch": 0.1303996711437376, + "grad_norm": 1.3481857776641846, + "learning_rate": 1.9990491671319113e-05, + "loss": 1.16, + "step": 5591 + }, + { + "epoch": 0.1304229942829155, + "grad_norm": 1.8357621431350708, + "learning_rate": 1.999048069029797e-05, + "loss": 1.6692, + "step": 5592 + }, + { + "epoch": 0.13044631742209342, + "grad_norm": 1.7055871486663818, + "learning_rate": 1.99904697029426e-05, + "loss": 1.5492, + "step": 5593 + }, + { + "epoch": 0.13046964056127133, + "grad_norm": 1.7857271432876587, + "learning_rate": 1.9990458709253006e-05, + "loss": 1.6562, + "step": 5594 + }, + { + "epoch": 0.13049296370044927, + "grad_norm": 1.7274487018585205, + "learning_rate": 1.99904477092292e-05, + "loss": 1.5326, + "step": 5595 + }, + { + "epoch": 0.13051628683962718, + "grad_norm": 1.7780487537384033, + "learning_rate": 1.999043670287119e-05, + "loss": 1.373, + "step": 5596 + }, + { + "epoch": 0.1305396099788051, + "grad_norm": 1.9081625938415527, + "learning_rate": 1.9990425690178982e-05, + "loss": 1.8362, + "step": 5597 + }, + { + "epoch": 0.130562933117983, + "grad_norm": 1.6644736528396606, + "learning_rate": 1.999041467115258e-05, + "loss": 1.53, + "step": 5598 + }, + { + "epoch": 0.13058625625716094, + "grad_norm": 1.6263360977172852, + "learning_rate": 1.999040364579199e-05, + "loss": 1.7188, + "step": 5599 + }, + { + "epoch": 0.13060957939633885, + "grad_norm": 1.9173109531402588, + "learning_rate": 1.9990392614097226e-05, + "loss": 1.5303, + "step": 5600 + }, + { + "epoch": 0.13063290253551677, + "grad_norm": 1.4638445377349854, + "learning_rate": 1.999038157606829e-05, + "loss": 1.4713, + "step": 5601 + }, + { + "epoch": 0.13065622567469468, + "grad_norm": 1.9877592325210571, + "learning_rate": 1.999037053170519e-05, + "loss": 1.6401, + "step": 5602 + }, + { + "epoch": 0.13067954881387261, + "grad_norm": 1.562218189239502, + "learning_rate": 1.999035948100793e-05, + "loss": 1.5051, + "step": 5603 + }, + { + "epoch": 0.13070287195305053, + "grad_norm": 1.8412750959396362, + "learning_rate": 1.9990348423976516e-05, + "loss": 1.401, + "step": 5604 + }, + { + "epoch": 0.13072619509222844, + "grad_norm": 1.4882012605667114, + "learning_rate": 1.9990337360610967e-05, + "loss": 1.3083, + "step": 5605 + }, + { + "epoch": 0.13074951823140635, + "grad_norm": 2.0613253116607666, + "learning_rate": 1.999032629091128e-05, + "loss": 1.5288, + "step": 5606 + }, + { + "epoch": 0.1307728413705843, + "grad_norm": 1.7240405082702637, + "learning_rate": 1.999031521487746e-05, + "loss": 1.4161, + "step": 5607 + }, + { + "epoch": 0.1307961645097622, + "grad_norm": 1.847396969795227, + "learning_rate": 1.999030413250952e-05, + "loss": 1.3063, + "step": 5608 + }, + { + "epoch": 0.1308194876489401, + "grad_norm": 1.443750023841858, + "learning_rate": 1.9990293043807468e-05, + "loss": 1.2454, + "step": 5609 + }, + { + "epoch": 0.13084281078811802, + "grad_norm": 2.23435640335083, + "learning_rate": 1.9990281948771307e-05, + "loss": 1.2374, + "step": 5610 + }, + { + "epoch": 0.13086613392729596, + "grad_norm": 1.8044589757919312, + "learning_rate": 1.9990270847401043e-05, + "loss": 1.3916, + "step": 5611 + }, + { + "epoch": 0.13088945706647387, + "grad_norm": 2.0202250480651855, + "learning_rate": 1.9990259739696683e-05, + "loss": 1.4802, + "step": 5612 + }, + { + "epoch": 0.13091278020565178, + "grad_norm": 1.6977739334106445, + "learning_rate": 1.9990248625658242e-05, + "loss": 1.5839, + "step": 5613 + }, + { + "epoch": 0.1309361033448297, + "grad_norm": 1.6863194704055786, + "learning_rate": 1.9990237505285716e-05, + "loss": 1.3921, + "step": 5614 + }, + { + "epoch": 0.1309594264840076, + "grad_norm": 1.6709233522415161, + "learning_rate": 1.9990226378579118e-05, + "loss": 1.1667, + "step": 5615 + }, + { + "epoch": 0.13098274962318554, + "grad_norm": 1.550304889678955, + "learning_rate": 1.9990215245538454e-05, + "loss": 1.3381, + "step": 5616 + }, + { + "epoch": 0.13100607276236345, + "grad_norm": 1.5540437698364258, + "learning_rate": 1.9990204106163733e-05, + "loss": 1.4065, + "step": 5617 + }, + { + "epoch": 0.13102939590154136, + "grad_norm": 2.689389228820801, + "learning_rate": 1.999019296045496e-05, + "loss": 1.1326, + "step": 5618 + }, + { + "epoch": 0.13105271904071927, + "grad_norm": 1.8067376613616943, + "learning_rate": 1.999018180841214e-05, + "loss": 1.4505, + "step": 5619 + }, + { + "epoch": 0.1310760421798972, + "grad_norm": 1.9633121490478516, + "learning_rate": 1.9990170650035288e-05, + "loss": 1.3664, + "step": 5620 + }, + { + "epoch": 0.13109936531907512, + "grad_norm": 1.7463382482528687, + "learning_rate": 1.99901594853244e-05, + "loss": 1.5872, + "step": 5621 + }, + { + "epoch": 0.13112268845825303, + "grad_norm": 1.695978045463562, + "learning_rate": 1.999014831427949e-05, + "loss": 1.4773, + "step": 5622 + }, + { + "epoch": 0.13114601159743094, + "grad_norm": 1.4996044635772705, + "learning_rate": 1.9990137136900563e-05, + "loss": 1.1959, + "step": 5623 + }, + { + "epoch": 0.13116933473660888, + "grad_norm": 1.9717403650283813, + "learning_rate": 1.9990125953187628e-05, + "loss": 1.3907, + "step": 5624 + }, + { + "epoch": 0.1311926578757868, + "grad_norm": 1.952599287033081, + "learning_rate": 1.9990114763140692e-05, + "loss": 1.6086, + "step": 5625 + }, + { + "epoch": 0.1312159810149647, + "grad_norm": 1.9476780891418457, + "learning_rate": 1.999010356675976e-05, + "loss": 1.3522, + "step": 5626 + }, + { + "epoch": 0.13123930415414262, + "grad_norm": 1.7530410289764404, + "learning_rate": 1.999009236404484e-05, + "loss": 1.4332, + "step": 5627 + }, + { + "epoch": 0.13126262729332056, + "grad_norm": 1.517337679862976, + "learning_rate": 1.9990081154995938e-05, + "loss": 1.0702, + "step": 5628 + }, + { + "epoch": 0.13128595043249847, + "grad_norm": 2.4697213172912598, + "learning_rate": 1.9990069939613064e-05, + "loss": 1.3862, + "step": 5629 + }, + { + "epoch": 0.13130927357167638, + "grad_norm": 1.5803735256195068, + "learning_rate": 1.9990058717896224e-05, + "loss": 1.1741, + "step": 5630 + }, + { + "epoch": 0.1313325967108543, + "grad_norm": 1.7363076210021973, + "learning_rate": 1.9990047489845423e-05, + "loss": 1.4223, + "step": 5631 + }, + { + "epoch": 0.13135591985003223, + "grad_norm": 1.7471119165420532, + "learning_rate": 1.9990036255460666e-05, + "loss": 1.5598, + "step": 5632 + }, + { + "epoch": 0.13137924298921014, + "grad_norm": 1.571816086769104, + "learning_rate": 1.999002501474197e-05, + "loss": 1.2546, + "step": 5633 + }, + { + "epoch": 0.13140256612838805, + "grad_norm": 1.9666074514389038, + "learning_rate": 1.9990013767689334e-05, + "loss": 1.2296, + "step": 5634 + }, + { + "epoch": 0.13142588926756596, + "grad_norm": 3.2497670650482178, + "learning_rate": 1.9990002514302765e-05, + "loss": 1.4525, + "step": 5635 + }, + { + "epoch": 0.1314492124067439, + "grad_norm": 1.780283808708191, + "learning_rate": 1.9989991254582277e-05, + "loss": 1.2258, + "step": 5636 + }, + { + "epoch": 0.1314725355459218, + "grad_norm": 1.600226640701294, + "learning_rate": 1.9989979988527866e-05, + "loss": 1.4494, + "step": 5637 + }, + { + "epoch": 0.13149585868509972, + "grad_norm": 2.133347988128662, + "learning_rate": 1.9989968716139546e-05, + "loss": 1.6596, + "step": 5638 + }, + { + "epoch": 0.13151918182427763, + "grad_norm": 1.8568822145462036, + "learning_rate": 1.9989957437417327e-05, + "loss": 1.1597, + "step": 5639 + }, + { + "epoch": 0.13154250496345557, + "grad_norm": 1.9567466974258423, + "learning_rate": 1.998994615236121e-05, + "loss": 1.1788, + "step": 5640 + }, + { + "epoch": 0.13156582810263348, + "grad_norm": 1.767482876777649, + "learning_rate": 1.9989934860971206e-05, + "loss": 1.4208, + "step": 5641 + }, + { + "epoch": 0.1315891512418114, + "grad_norm": 1.6550018787384033, + "learning_rate": 1.9989923563247324e-05, + "loss": 1.1864, + "step": 5642 + }, + { + "epoch": 0.1316124743809893, + "grad_norm": 1.7208259105682373, + "learning_rate": 1.9989912259189564e-05, + "loss": 1.7372, + "step": 5643 + }, + { + "epoch": 0.1316357975201672, + "grad_norm": 1.7256152629852295, + "learning_rate": 1.9989900948797942e-05, + "loss": 1.3785, + "step": 5644 + }, + { + "epoch": 0.13165912065934515, + "grad_norm": 1.560926914215088, + "learning_rate": 1.9989889632072455e-05, + "loss": 1.558, + "step": 5645 + }, + { + "epoch": 0.13168244379852306, + "grad_norm": 1.796082854270935, + "learning_rate": 1.9989878309013116e-05, + "loss": 1.1878, + "step": 5646 + }, + { + "epoch": 0.13170576693770097, + "grad_norm": 1.7200309038162231, + "learning_rate": 1.9989866979619936e-05, + "loss": 1.418, + "step": 5647 + }, + { + "epoch": 0.13172909007687889, + "grad_norm": 2.0337626934051514, + "learning_rate": 1.9989855643892915e-05, + "loss": 1.707, + "step": 5648 + }, + { + "epoch": 0.13175241321605682, + "grad_norm": 1.985156774520874, + "learning_rate": 1.9989844301832063e-05, + "loss": 1.1133, + "step": 5649 + }, + { + "epoch": 0.13177573635523473, + "grad_norm": 1.705230712890625, + "learning_rate": 1.998983295343739e-05, + "loss": 1.4385, + "step": 5650 + }, + { + "epoch": 0.13179905949441265, + "grad_norm": 1.5013837814331055, + "learning_rate": 1.9989821598708898e-05, + "loss": 1.3295, + "step": 5651 + }, + { + "epoch": 0.13182238263359056, + "grad_norm": 1.8703545331954956, + "learning_rate": 1.99898102376466e-05, + "loss": 1.2098, + "step": 5652 + }, + { + "epoch": 0.1318457057727685, + "grad_norm": 1.499680757522583, + "learning_rate": 1.9989798870250497e-05, + "loss": 1.1251, + "step": 5653 + }, + { + "epoch": 0.1318690289119464, + "grad_norm": 1.602441668510437, + "learning_rate": 1.99897874965206e-05, + "loss": 1.4855, + "step": 5654 + }, + { + "epoch": 0.13189235205112432, + "grad_norm": 1.6819969415664673, + "learning_rate": 1.9989776116456917e-05, + "loss": 1.3356, + "step": 5655 + }, + { + "epoch": 0.13191567519030223, + "grad_norm": 1.6827677488327026, + "learning_rate": 1.998976473005945e-05, + "loss": 1.42, + "step": 5656 + }, + { + "epoch": 0.13193899832948017, + "grad_norm": 1.6931458711624146, + "learning_rate": 1.9989753337328216e-05, + "loss": 1.5455, + "step": 5657 + }, + { + "epoch": 0.13196232146865808, + "grad_norm": 1.921716332435608, + "learning_rate": 1.998974193826321e-05, + "loss": 1.2829, + "step": 5658 + }, + { + "epoch": 0.131985644607836, + "grad_norm": 2.079650640487671, + "learning_rate": 1.9989730532864448e-05, + "loss": 1.7596, + "step": 5659 + }, + { + "epoch": 0.1320089677470139, + "grad_norm": 1.6767427921295166, + "learning_rate": 1.9989719121131937e-05, + "loss": 1.3297, + "step": 5660 + }, + { + "epoch": 0.13203229088619184, + "grad_norm": 1.9462519884109497, + "learning_rate": 1.9989707703065676e-05, + "loss": 1.3486, + "step": 5661 + }, + { + "epoch": 0.13205561402536975, + "grad_norm": 1.6335049867630005, + "learning_rate": 1.9989696278665683e-05, + "loss": 1.1636, + "step": 5662 + }, + { + "epoch": 0.13207893716454766, + "grad_norm": 1.9665284156799316, + "learning_rate": 1.998968484793196e-05, + "loss": 1.6473, + "step": 5663 + }, + { + "epoch": 0.13210226030372557, + "grad_norm": 1.676806926727295, + "learning_rate": 1.998967341086451e-05, + "loss": 1.4059, + "step": 5664 + }, + { + "epoch": 0.1321255834429035, + "grad_norm": 1.8982559442520142, + "learning_rate": 1.998966196746335e-05, + "loss": 1.4388, + "step": 5665 + }, + { + "epoch": 0.13214890658208142, + "grad_norm": 1.8911465406417847, + "learning_rate": 1.998965051772848e-05, + "loss": 1.2531, + "step": 5666 + }, + { + "epoch": 0.13217222972125933, + "grad_norm": 2.246480703353882, + "learning_rate": 1.9989639061659908e-05, + "loss": 1.3387, + "step": 5667 + }, + { + "epoch": 0.13219555286043724, + "grad_norm": 1.913788914680481, + "learning_rate": 1.9989627599257643e-05, + "loss": 1.5634, + "step": 5668 + }, + { + "epoch": 0.13221887599961518, + "grad_norm": 1.6013555526733398, + "learning_rate": 1.9989616130521695e-05, + "loss": 1.195, + "step": 5669 + }, + { + "epoch": 0.1322421991387931, + "grad_norm": 2.518117904663086, + "learning_rate": 1.9989604655452065e-05, + "loss": 1.6266, + "step": 5670 + }, + { + "epoch": 0.132265522277971, + "grad_norm": 2.05083966255188, + "learning_rate": 1.9989593174048764e-05, + "loss": 1.0822, + "step": 5671 + }, + { + "epoch": 0.13228884541714891, + "grad_norm": 2.1194233894348145, + "learning_rate": 1.99895816863118e-05, + "loss": 1.4618, + "step": 5672 + }, + { + "epoch": 0.13231216855632683, + "grad_norm": 1.6310014724731445, + "learning_rate": 1.9989570192241175e-05, + "loss": 1.3309, + "step": 5673 + }, + { + "epoch": 0.13233549169550476, + "grad_norm": 1.812276005744934, + "learning_rate": 1.9989558691836905e-05, + "loss": 1.3051, + "step": 5674 + }, + { + "epoch": 0.13235881483468268, + "grad_norm": 1.7325299978256226, + "learning_rate": 1.998954718509899e-05, + "loss": 1.2206, + "step": 5675 + }, + { + "epoch": 0.1323821379738606, + "grad_norm": 1.6696186065673828, + "learning_rate": 1.998953567202744e-05, + "loss": 1.3367, + "step": 5676 + }, + { + "epoch": 0.1324054611130385, + "grad_norm": 1.6422505378723145, + "learning_rate": 1.998952415262226e-05, + "loss": 1.3354, + "step": 5677 + }, + { + "epoch": 0.13242878425221644, + "grad_norm": 1.643275499343872, + "learning_rate": 1.9989512626883462e-05, + "loss": 1.0802, + "step": 5678 + }, + { + "epoch": 0.13245210739139435, + "grad_norm": 1.5790220499038696, + "learning_rate": 1.9989501094811052e-05, + "loss": 1.4997, + "step": 5679 + }, + { + "epoch": 0.13247543053057226, + "grad_norm": 1.994425654411316, + "learning_rate": 1.9989489556405036e-05, + "loss": 1.7596, + "step": 5680 + }, + { + "epoch": 0.13249875366975017, + "grad_norm": 1.8514608144760132, + "learning_rate": 1.998947801166542e-05, + "loss": 1.5788, + "step": 5681 + }, + { + "epoch": 0.1325220768089281, + "grad_norm": 1.7819243669509888, + "learning_rate": 1.998946646059221e-05, + "loss": 1.5955, + "step": 5682 + }, + { + "epoch": 0.13254539994810602, + "grad_norm": 1.7954121828079224, + "learning_rate": 1.9989454903185417e-05, + "loss": 1.4176, + "step": 5683 + }, + { + "epoch": 0.13256872308728393, + "grad_norm": 1.661253809928894, + "learning_rate": 1.9989443339445055e-05, + "loss": 1.3857, + "step": 5684 + }, + { + "epoch": 0.13259204622646184, + "grad_norm": 1.847687840461731, + "learning_rate": 1.9989431769371116e-05, + "loss": 1.9831, + "step": 5685 + }, + { + "epoch": 0.13261536936563978, + "grad_norm": 1.5928136110305786, + "learning_rate": 1.9989420192963616e-05, + "loss": 1.121, + "step": 5686 + }, + { + "epoch": 0.1326386925048177, + "grad_norm": 1.7076382637023926, + "learning_rate": 1.998940861022256e-05, + "loss": 1.3647, + "step": 5687 + }, + { + "epoch": 0.1326620156439956, + "grad_norm": 1.8012871742248535, + "learning_rate": 1.998939702114796e-05, + "loss": 1.4799, + "step": 5688 + }, + { + "epoch": 0.1326853387831735, + "grad_norm": 1.734161376953125, + "learning_rate": 1.9989385425739818e-05, + "loss": 1.5009, + "step": 5689 + }, + { + "epoch": 0.13270866192235145, + "grad_norm": 1.4776747226715088, + "learning_rate": 1.9989373823998145e-05, + "loss": 1.1202, + "step": 5690 + }, + { + "epoch": 0.13273198506152936, + "grad_norm": 1.8409889936447144, + "learning_rate": 1.9989362215922943e-05, + "loss": 1.2246, + "step": 5691 + }, + { + "epoch": 0.13275530820070727, + "grad_norm": 2.2336180210113525, + "learning_rate": 1.998935060151423e-05, + "loss": 1.7343, + "step": 5692 + }, + { + "epoch": 0.13277863133988518, + "grad_norm": 2.068862199783325, + "learning_rate": 1.9989338980771998e-05, + "loss": 1.4449, + "step": 5693 + }, + { + "epoch": 0.13280195447906312, + "grad_norm": 1.6301617622375488, + "learning_rate": 1.9989327353696272e-05, + "loss": 1.2272, + "step": 5694 + }, + { + "epoch": 0.13282527761824103, + "grad_norm": 1.4859644174575806, + "learning_rate": 1.9989315720287043e-05, + "loss": 1.2215, + "step": 5695 + }, + { + "epoch": 0.13284860075741894, + "grad_norm": 1.431028127670288, + "learning_rate": 1.998930408054433e-05, + "loss": 1.1849, + "step": 5696 + }, + { + "epoch": 0.13287192389659686, + "grad_norm": 1.6294249296188354, + "learning_rate": 1.9989292434468132e-05, + "loss": 1.1808, + "step": 5697 + }, + { + "epoch": 0.1328952470357748, + "grad_norm": 1.648274540901184, + "learning_rate": 1.9989280782058463e-05, + "loss": 1.4383, + "step": 5698 + }, + { + "epoch": 0.1329185701749527, + "grad_norm": 1.834496021270752, + "learning_rate": 1.9989269123315326e-05, + "loss": 1.7121, + "step": 5699 + }, + { + "epoch": 0.13294189331413062, + "grad_norm": 1.638672947883606, + "learning_rate": 1.998925745823873e-05, + "loss": 1.6214, + "step": 5700 + }, + { + "epoch": 0.13296521645330853, + "grad_norm": 2.1762537956237793, + "learning_rate": 1.9989245786828686e-05, + "loss": 1.357, + "step": 5701 + }, + { + "epoch": 0.13298853959248644, + "grad_norm": 1.7930190563201904, + "learning_rate": 1.9989234109085194e-05, + "loss": 1.7046, + "step": 5702 + }, + { + "epoch": 0.13301186273166438, + "grad_norm": 1.844916582107544, + "learning_rate": 1.9989222425008267e-05, + "loss": 1.5751, + "step": 5703 + }, + { + "epoch": 0.1330351858708423, + "grad_norm": 5.638031005859375, + "learning_rate": 1.998921073459791e-05, + "loss": 1.3669, + "step": 5704 + }, + { + "epoch": 0.1330585090100202, + "grad_norm": 1.466515064239502, + "learning_rate": 1.9989199037854132e-05, + "loss": 1.5931, + "step": 5705 + }, + { + "epoch": 0.1330818321491981, + "grad_norm": 1.703168272972107, + "learning_rate": 1.998918733477694e-05, + "loss": 1.5492, + "step": 5706 + }, + { + "epoch": 0.13310515528837605, + "grad_norm": 1.7279112339019775, + "learning_rate": 1.998917562536634e-05, + "loss": 1.4521, + "step": 5707 + }, + { + "epoch": 0.13312847842755396, + "grad_norm": 1.8984036445617676, + "learning_rate": 1.998916390962234e-05, + "loss": 1.3514, + "step": 5708 + }, + { + "epoch": 0.13315180156673187, + "grad_norm": 1.7442967891693115, + "learning_rate": 1.998915218754495e-05, + "loss": 1.283, + "step": 5709 + }, + { + "epoch": 0.13317512470590978, + "grad_norm": 1.6294081211090088, + "learning_rate": 1.9989140459134173e-05, + "loss": 1.605, + "step": 5710 + }, + { + "epoch": 0.13319844784508772, + "grad_norm": 1.462605595588684, + "learning_rate": 1.9989128724390018e-05, + "loss": 1.433, + "step": 5711 + }, + { + "epoch": 0.13322177098426563, + "grad_norm": 1.8922992944717407, + "learning_rate": 1.9989116983312497e-05, + "loss": 1.3874, + "step": 5712 + }, + { + "epoch": 0.13324509412344354, + "grad_norm": 2.048027276992798, + "learning_rate": 1.998910523590161e-05, + "loss": 1.4394, + "step": 5713 + }, + { + "epoch": 0.13326841726262145, + "grad_norm": 2.1792821884155273, + "learning_rate": 1.9989093482157366e-05, + "loss": 1.6302, + "step": 5714 + }, + { + "epoch": 0.1332917404017994, + "grad_norm": 1.7628233432769775, + "learning_rate": 1.9989081722079778e-05, + "loss": 1.6911, + "step": 5715 + }, + { + "epoch": 0.1333150635409773, + "grad_norm": 1.5108592510223389, + "learning_rate": 1.998906995566885e-05, + "loss": 1.0619, + "step": 5716 + }, + { + "epoch": 0.1333383866801552, + "grad_norm": 1.504599928855896, + "learning_rate": 1.998905818292459e-05, + "loss": 1.4192, + "step": 5717 + }, + { + "epoch": 0.13336170981933312, + "grad_norm": 1.8720074892044067, + "learning_rate": 1.9989046403847004e-05, + "loss": 1.5808, + "step": 5718 + }, + { + "epoch": 0.13338503295851106, + "grad_norm": 1.4277008771896362, + "learning_rate": 1.99890346184361e-05, + "loss": 1.3481, + "step": 5719 + }, + { + "epoch": 0.13340835609768897, + "grad_norm": 1.6472880840301514, + "learning_rate": 1.9989022826691883e-05, + "loss": 1.126, + "step": 5720 + }, + { + "epoch": 0.13343167923686688, + "grad_norm": 2.031614065170288, + "learning_rate": 1.998901102861437e-05, + "loss": 1.1102, + "step": 5721 + }, + { + "epoch": 0.1334550023760448, + "grad_norm": 2.070117712020874, + "learning_rate": 1.9988999224203555e-05, + "loss": 1.8744, + "step": 5722 + }, + { + "epoch": 0.13347832551522273, + "grad_norm": 1.612544059753418, + "learning_rate": 1.9988987413459457e-05, + "loss": 1.5193, + "step": 5723 + }, + { + "epoch": 0.13350164865440065, + "grad_norm": 1.6464709043502808, + "learning_rate": 1.9988975596382077e-05, + "loss": 1.1822, + "step": 5724 + }, + { + "epoch": 0.13352497179357856, + "grad_norm": 1.441470742225647, + "learning_rate": 1.9988963772971422e-05, + "loss": 0.9961, + "step": 5725 + }, + { + "epoch": 0.13354829493275647, + "grad_norm": 1.9952623844146729, + "learning_rate": 1.9988951943227505e-05, + "loss": 1.2173, + "step": 5726 + }, + { + "epoch": 0.13357161807193438, + "grad_norm": 1.6786301136016846, + "learning_rate": 1.998894010715033e-05, + "loss": 1.3935, + "step": 5727 + }, + { + "epoch": 0.13359494121111232, + "grad_norm": 1.9806087017059326, + "learning_rate": 1.9988928264739905e-05, + "loss": 1.7098, + "step": 5728 + }, + { + "epoch": 0.13361826435029023, + "grad_norm": 1.3902217149734497, + "learning_rate": 1.9988916415996232e-05, + "loss": 1.1507, + "step": 5729 + }, + { + "epoch": 0.13364158748946814, + "grad_norm": 1.8615084886550903, + "learning_rate": 1.998890456091933e-05, + "loss": 1.7547, + "step": 5730 + }, + { + "epoch": 0.13366491062864605, + "grad_norm": 1.9365155696868896, + "learning_rate": 1.9988892699509196e-05, + "loss": 1.6085, + "step": 5731 + }, + { + "epoch": 0.133688233767824, + "grad_norm": 1.6931182146072388, + "learning_rate": 1.9988880831765845e-05, + "loss": 1.4243, + "step": 5732 + }, + { + "epoch": 0.1337115569070019, + "grad_norm": 1.6634023189544678, + "learning_rate": 1.998886895768928e-05, + "loss": 1.5052, + "step": 5733 + }, + { + "epoch": 0.1337348800461798, + "grad_norm": 1.895603895187378, + "learning_rate": 1.9988857077279512e-05, + "loss": 1.7746, + "step": 5734 + }, + { + "epoch": 0.13375820318535772, + "grad_norm": 1.4033432006835938, + "learning_rate": 1.9988845190536542e-05, + "loss": 1.2937, + "step": 5735 + }, + { + "epoch": 0.13378152632453566, + "grad_norm": 1.740257740020752, + "learning_rate": 1.9988833297460384e-05, + "loss": 1.4429, + "step": 5736 + }, + { + "epoch": 0.13380484946371357, + "grad_norm": 2.326231002807617, + "learning_rate": 1.9988821398051046e-05, + "loss": 1.5061, + "step": 5737 + }, + { + "epoch": 0.13382817260289148, + "grad_norm": 1.5010144710540771, + "learning_rate": 1.998880949230853e-05, + "loss": 1.4468, + "step": 5738 + }, + { + "epoch": 0.1338514957420694, + "grad_norm": 1.9904086589813232, + "learning_rate": 1.9988797580232845e-05, + "loss": 1.2319, + "step": 5739 + }, + { + "epoch": 0.13387481888124733, + "grad_norm": 1.8015780448913574, + "learning_rate": 1.9988785661824002e-05, + "loss": 1.3146, + "step": 5740 + }, + { + "epoch": 0.13389814202042524, + "grad_norm": 1.4130935668945312, + "learning_rate": 1.9988773737082006e-05, + "loss": 1.2075, + "step": 5741 + }, + { + "epoch": 0.13392146515960315, + "grad_norm": 1.4542776346206665, + "learning_rate": 1.9988761806006867e-05, + "loss": 1.2411, + "step": 5742 + }, + { + "epoch": 0.13394478829878106, + "grad_norm": 2.4227020740509033, + "learning_rate": 1.998874986859859e-05, + "loss": 1.5775, + "step": 5743 + }, + { + "epoch": 0.133968111437959, + "grad_norm": 2.2216720581054688, + "learning_rate": 1.9988737924857184e-05, + "loss": 1.2722, + "step": 5744 + }, + { + "epoch": 0.13399143457713691, + "grad_norm": 1.916792631149292, + "learning_rate": 1.9988725974782655e-05, + "loss": 1.5183, + "step": 5745 + }, + { + "epoch": 0.13401475771631483, + "grad_norm": 2.5872554779052734, + "learning_rate": 1.998871401837501e-05, + "loss": 1.6305, + "step": 5746 + }, + { + "epoch": 0.13403808085549274, + "grad_norm": 1.8227702379226685, + "learning_rate": 1.9988702055634262e-05, + "loss": 1.1386, + "step": 5747 + }, + { + "epoch": 0.13406140399467067, + "grad_norm": 1.6415590047836304, + "learning_rate": 1.998869008656041e-05, + "loss": 1.8479, + "step": 5748 + }, + { + "epoch": 0.13408472713384859, + "grad_norm": 1.7136770486831665, + "learning_rate": 1.998867811115347e-05, + "loss": 1.569, + "step": 5749 + }, + { + "epoch": 0.1341080502730265, + "grad_norm": 1.260227084159851, + "learning_rate": 1.9988666129413445e-05, + "loss": 1.0573, + "step": 5750 + }, + { + "epoch": 0.1341313734122044, + "grad_norm": 1.722510814666748, + "learning_rate": 1.9988654141340344e-05, + "loss": 1.4895, + "step": 5751 + }, + { + "epoch": 0.13415469655138235, + "grad_norm": 1.6698564291000366, + "learning_rate": 1.998864214693417e-05, + "loss": 1.4579, + "step": 5752 + }, + { + "epoch": 0.13417801969056026, + "grad_norm": 2.162900686264038, + "learning_rate": 1.9988630146194938e-05, + "loss": 1.4051, + "step": 5753 + }, + { + "epoch": 0.13420134282973817, + "grad_norm": 1.9218720197677612, + "learning_rate": 1.998861813912265e-05, + "loss": 1.7127, + "step": 5754 + }, + { + "epoch": 0.13422466596891608, + "grad_norm": 1.220187783241272, + "learning_rate": 1.998860612571732e-05, + "loss": 1.4373, + "step": 5755 + }, + { + "epoch": 0.134247989108094, + "grad_norm": 1.6740177869796753, + "learning_rate": 1.998859410597895e-05, + "loss": 1.3647, + "step": 5756 + }, + { + "epoch": 0.13427131224727193, + "grad_norm": 1.7140318155288696, + "learning_rate": 1.9988582079907548e-05, + "loss": 1.4046, + "step": 5757 + }, + { + "epoch": 0.13429463538644984, + "grad_norm": 1.5587612390518188, + "learning_rate": 1.998857004750312e-05, + "loss": 1.5312, + "step": 5758 + }, + { + "epoch": 0.13431795852562775, + "grad_norm": 1.595076560974121, + "learning_rate": 1.9988558008765682e-05, + "loss": 1.6365, + "step": 5759 + }, + { + "epoch": 0.13434128166480566, + "grad_norm": 1.6314584016799927, + "learning_rate": 1.9988545963695235e-05, + "loss": 1.7579, + "step": 5760 + }, + { + "epoch": 0.1343646048039836, + "grad_norm": 1.453542947769165, + "learning_rate": 1.9988533912291785e-05, + "loss": 1.337, + "step": 5761 + }, + { + "epoch": 0.1343879279431615, + "grad_norm": 1.555601954460144, + "learning_rate": 1.998852185455534e-05, + "loss": 1.6533, + "step": 5762 + }, + { + "epoch": 0.13441125108233942, + "grad_norm": 2.4596481323242188, + "learning_rate": 1.9988509790485913e-05, + "loss": 1.3532, + "step": 5763 + }, + { + "epoch": 0.13443457422151733, + "grad_norm": 1.381247878074646, + "learning_rate": 1.998849772008351e-05, + "loss": 1.6203, + "step": 5764 + }, + { + "epoch": 0.13445789736069527, + "grad_norm": 1.821500301361084, + "learning_rate": 1.9988485643348133e-05, + "loss": 1.5795, + "step": 5765 + }, + { + "epoch": 0.13448122049987318, + "grad_norm": 1.7372137308120728, + "learning_rate": 1.99884735602798e-05, + "loss": 1.5436, + "step": 5766 + }, + { + "epoch": 0.1345045436390511, + "grad_norm": 1.9504942893981934, + "learning_rate": 1.9988461470878507e-05, + "loss": 1.2909, + "step": 5767 + }, + { + "epoch": 0.134527866778229, + "grad_norm": 1.7436232566833496, + "learning_rate": 1.9988449375144273e-05, + "loss": 1.2289, + "step": 5768 + }, + { + "epoch": 0.13455118991740694, + "grad_norm": 1.546929121017456, + "learning_rate": 1.998843727307709e-05, + "loss": 1.4017, + "step": 5769 + }, + { + "epoch": 0.13457451305658485, + "grad_norm": 1.577972412109375, + "learning_rate": 1.9988425164676986e-05, + "loss": 1.4044, + "step": 5770 + }, + { + "epoch": 0.13459783619576277, + "grad_norm": 1.9270371198654175, + "learning_rate": 1.9988413049943955e-05, + "loss": 1.3849, + "step": 5771 + }, + { + "epoch": 0.13462115933494068, + "grad_norm": 1.6733180284500122, + "learning_rate": 1.9988400928878004e-05, + "loss": 1.5125, + "step": 5772 + }, + { + "epoch": 0.13464448247411862, + "grad_norm": 1.4023698568344116, + "learning_rate": 1.998838880147915e-05, + "loss": 1.193, + "step": 5773 + }, + { + "epoch": 0.13466780561329653, + "grad_norm": 1.6021320819854736, + "learning_rate": 1.998837666774739e-05, + "loss": 1.5431, + "step": 5774 + }, + { + "epoch": 0.13469112875247444, + "grad_norm": 1.6890685558319092, + "learning_rate": 1.998836452768274e-05, + "loss": 1.47, + "step": 5775 + }, + { + "epoch": 0.13471445189165235, + "grad_norm": 1.6689343452453613, + "learning_rate": 1.9988352381285205e-05, + "loss": 1.3217, + "step": 5776 + }, + { + "epoch": 0.1347377750308303, + "grad_norm": 1.6584681272506714, + "learning_rate": 1.998834022855479e-05, + "loss": 1.1732, + "step": 5777 + }, + { + "epoch": 0.1347610981700082, + "grad_norm": 1.5587846040725708, + "learning_rate": 1.998832806949151e-05, + "loss": 1.2888, + "step": 5778 + }, + { + "epoch": 0.1347844213091861, + "grad_norm": 1.7748236656188965, + "learning_rate": 1.9988315904095362e-05, + "loss": 1.5823, + "step": 5779 + }, + { + "epoch": 0.13480774444836402, + "grad_norm": 2.1331613063812256, + "learning_rate": 1.9988303732366365e-05, + "loss": 1.2202, + "step": 5780 + }, + { + "epoch": 0.13483106758754196, + "grad_norm": 1.7328897714614868, + "learning_rate": 1.9988291554304515e-05, + "loss": 1.3635, + "step": 5781 + }, + { + "epoch": 0.13485439072671987, + "grad_norm": 1.4480783939361572, + "learning_rate": 1.998827936990983e-05, + "loss": 1.2221, + "step": 5782 + }, + { + "epoch": 0.13487771386589778, + "grad_norm": 1.460590124130249, + "learning_rate": 1.998826717918231e-05, + "loss": 1.4233, + "step": 5783 + }, + { + "epoch": 0.1349010370050757, + "grad_norm": 1.6205053329467773, + "learning_rate": 1.998825498212197e-05, + "loss": 1.1772, + "step": 5784 + }, + { + "epoch": 0.1349243601442536, + "grad_norm": 1.6262480020523071, + "learning_rate": 1.9988242778728815e-05, + "loss": 1.7672, + "step": 5785 + }, + { + "epoch": 0.13494768328343154, + "grad_norm": 1.7244691848754883, + "learning_rate": 1.9988230569002847e-05, + "loss": 1.5407, + "step": 5786 + }, + { + "epoch": 0.13497100642260945, + "grad_norm": 2.783017158508301, + "learning_rate": 1.9988218352944086e-05, + "loss": 1.3019, + "step": 5787 + }, + { + "epoch": 0.13499432956178736, + "grad_norm": 1.8015656471252441, + "learning_rate": 1.9988206130552527e-05, + "loss": 1.5851, + "step": 5788 + }, + { + "epoch": 0.13501765270096527, + "grad_norm": 1.7585573196411133, + "learning_rate": 1.9988193901828183e-05, + "loss": 1.4238, + "step": 5789 + }, + { + "epoch": 0.1350409758401432, + "grad_norm": 1.664803147315979, + "learning_rate": 1.9988181666771062e-05, + "loss": 1.2124, + "step": 5790 + }, + { + "epoch": 0.13506429897932112, + "grad_norm": 1.4842329025268555, + "learning_rate": 1.998816942538117e-05, + "loss": 1.7294, + "step": 5791 + }, + { + "epoch": 0.13508762211849903, + "grad_norm": 1.938599705696106, + "learning_rate": 1.9988157177658522e-05, + "loss": 1.5568, + "step": 5792 + }, + { + "epoch": 0.13511094525767695, + "grad_norm": 2.0551724433898926, + "learning_rate": 1.9988144923603116e-05, + "loss": 1.377, + "step": 5793 + }, + { + "epoch": 0.13513426839685488, + "grad_norm": 1.7043901681900024, + "learning_rate": 1.9988132663214966e-05, + "loss": 1.3524, + "step": 5794 + }, + { + "epoch": 0.1351575915360328, + "grad_norm": 1.7444751262664795, + "learning_rate": 1.9988120396494073e-05, + "loss": 1.6895, + "step": 5795 + }, + { + "epoch": 0.1351809146752107, + "grad_norm": 1.809798240661621, + "learning_rate": 1.9988108123440457e-05, + "loss": 1.7041, + "step": 5796 + }, + { + "epoch": 0.13520423781438862, + "grad_norm": 1.4428502321243286, + "learning_rate": 1.9988095844054114e-05, + "loss": 1.5373, + "step": 5797 + }, + { + "epoch": 0.13522756095356656, + "grad_norm": 2.005716562271118, + "learning_rate": 1.9988083558335055e-05, + "loss": 1.7188, + "step": 5798 + }, + { + "epoch": 0.13525088409274447, + "grad_norm": 1.372430682182312, + "learning_rate": 1.9988071266283293e-05, + "loss": 1.1229, + "step": 5799 + }, + { + "epoch": 0.13527420723192238, + "grad_norm": 1.91326105594635, + "learning_rate": 1.9988058967898824e-05, + "loss": 1.4492, + "step": 5800 + }, + { + "epoch": 0.1352975303711003, + "grad_norm": 2.1554465293884277, + "learning_rate": 1.998804666318167e-05, + "loss": 1.3589, + "step": 5801 + }, + { + "epoch": 0.13532085351027823, + "grad_norm": 1.7652627229690552, + "learning_rate": 1.9988034352131832e-05, + "loss": 1.3842, + "step": 5802 + }, + { + "epoch": 0.13534417664945614, + "grad_norm": 1.683996319770813, + "learning_rate": 1.9988022034749316e-05, + "loss": 1.2538, + "step": 5803 + }, + { + "epoch": 0.13536749978863405, + "grad_norm": 2.0080783367156982, + "learning_rate": 1.998800971103413e-05, + "loss": 1.4925, + "step": 5804 + }, + { + "epoch": 0.13539082292781196, + "grad_norm": 1.728408694267273, + "learning_rate": 1.9987997380986286e-05, + "loss": 1.3652, + "step": 5805 + }, + { + "epoch": 0.1354141460669899, + "grad_norm": 1.7004222869873047, + "learning_rate": 1.998798504460579e-05, + "loss": 1.2393, + "step": 5806 + }, + { + "epoch": 0.1354374692061678, + "grad_norm": 1.4838793277740479, + "learning_rate": 1.9987972701892645e-05, + "loss": 1.4242, + "step": 5807 + }, + { + "epoch": 0.13546079234534572, + "grad_norm": 1.9853605031967163, + "learning_rate": 1.9987960352846868e-05, + "loss": 1.3261, + "step": 5808 + }, + { + "epoch": 0.13548411548452363, + "grad_norm": 2.4148504734039307, + "learning_rate": 1.998794799746846e-05, + "loss": 1.2635, + "step": 5809 + }, + { + "epoch": 0.13550743862370157, + "grad_norm": 1.8783892393112183, + "learning_rate": 1.9987935635757432e-05, + "loss": 1.4869, + "step": 5810 + }, + { + "epoch": 0.13553076176287948, + "grad_norm": 1.5341246128082275, + "learning_rate": 1.998792326771379e-05, + "loss": 1.4731, + "step": 5811 + }, + { + "epoch": 0.1355540849020574, + "grad_norm": 1.6594231128692627, + "learning_rate": 1.998791089333754e-05, + "loss": 1.6493, + "step": 5812 + }, + { + "epoch": 0.1355774080412353, + "grad_norm": 1.8053441047668457, + "learning_rate": 1.9987898512628697e-05, + "loss": 1.376, + "step": 5813 + }, + { + "epoch": 0.13560073118041321, + "grad_norm": 1.8567432165145874, + "learning_rate": 1.9987886125587258e-05, + "loss": 1.3293, + "step": 5814 + }, + { + "epoch": 0.13562405431959115, + "grad_norm": 1.8904708623886108, + "learning_rate": 1.9987873732213242e-05, + "loss": 1.5981, + "step": 5815 + }, + { + "epoch": 0.13564737745876906, + "grad_norm": 1.6645971536636353, + "learning_rate": 1.9987861332506648e-05, + "loss": 1.6767, + "step": 5816 + }, + { + "epoch": 0.13567070059794697, + "grad_norm": 1.5917236804962158, + "learning_rate": 1.9987848926467494e-05, + "loss": 1.6082, + "step": 5817 + }, + { + "epoch": 0.13569402373712489, + "grad_norm": 1.7671866416931152, + "learning_rate": 1.9987836514095775e-05, + "loss": 1.756, + "step": 5818 + }, + { + "epoch": 0.13571734687630282, + "grad_norm": 1.599556565284729, + "learning_rate": 1.998782409539151e-05, + "loss": 1.6617, + "step": 5819 + }, + { + "epoch": 0.13574067001548074, + "grad_norm": 1.5918755531311035, + "learning_rate": 1.9987811670354697e-05, + "loss": 1.3291, + "step": 5820 + }, + { + "epoch": 0.13576399315465865, + "grad_norm": 2.0007095336914062, + "learning_rate": 1.9987799238985355e-05, + "loss": 1.5134, + "step": 5821 + }, + { + "epoch": 0.13578731629383656, + "grad_norm": 1.2896279096603394, + "learning_rate": 1.9987786801283482e-05, + "loss": 1.0118, + "step": 5822 + }, + { + "epoch": 0.1358106394330145, + "grad_norm": 1.734490156173706, + "learning_rate": 1.9987774357249092e-05, + "loss": 1.6322, + "step": 5823 + }, + { + "epoch": 0.1358339625721924, + "grad_norm": 1.5640089511871338, + "learning_rate": 1.998776190688219e-05, + "loss": 1.4472, + "step": 5824 + }, + { + "epoch": 0.13585728571137032, + "grad_norm": 1.5651336908340454, + "learning_rate": 1.9987749450182784e-05, + "loss": 1.5271, + "step": 5825 + }, + { + "epoch": 0.13588060885054823, + "grad_norm": 1.6101703643798828, + "learning_rate": 1.9987736987150886e-05, + "loss": 1.6941, + "step": 5826 + }, + { + "epoch": 0.13590393198972617, + "grad_norm": 1.4283570051193237, + "learning_rate": 1.9987724517786495e-05, + "loss": 1.2762, + "step": 5827 + }, + { + "epoch": 0.13592725512890408, + "grad_norm": 1.6016660928726196, + "learning_rate": 1.9987712042089627e-05, + "loss": 1.5863, + "step": 5828 + }, + { + "epoch": 0.135950578268082, + "grad_norm": 1.5825492143630981, + "learning_rate": 1.998769956006029e-05, + "loss": 1.44, + "step": 5829 + }, + { + "epoch": 0.1359739014072599, + "grad_norm": 1.7828080654144287, + "learning_rate": 1.9987687071698486e-05, + "loss": 1.0777, + "step": 5830 + }, + { + "epoch": 0.13599722454643784, + "grad_norm": 1.6209686994552612, + "learning_rate": 1.9987674577004228e-05, + "loss": 1.6032, + "step": 5831 + }, + { + "epoch": 0.13602054768561575, + "grad_norm": 1.9211755990982056, + "learning_rate": 1.998766207597752e-05, + "loss": 1.6645, + "step": 5832 + }, + { + "epoch": 0.13604387082479366, + "grad_norm": 1.7378299236297607, + "learning_rate": 1.9987649568618373e-05, + "loss": 1.5676, + "step": 5833 + }, + { + "epoch": 0.13606719396397157, + "grad_norm": 1.794023871421814, + "learning_rate": 1.9987637054926796e-05, + "loss": 1.7505, + "step": 5834 + }, + { + "epoch": 0.1360905171031495, + "grad_norm": 1.7501341104507446, + "learning_rate": 1.9987624534902794e-05, + "loss": 1.4716, + "step": 5835 + }, + { + "epoch": 0.13611384024232742, + "grad_norm": 1.4883496761322021, + "learning_rate": 1.9987612008546374e-05, + "loss": 1.1932, + "step": 5836 + }, + { + "epoch": 0.13613716338150533, + "grad_norm": 1.6233457326889038, + "learning_rate": 1.9987599475857545e-05, + "loss": 1.321, + "step": 5837 + }, + { + "epoch": 0.13616048652068324, + "grad_norm": 1.6802023649215698, + "learning_rate": 1.9987586936836322e-05, + "loss": 1.4126, + "step": 5838 + }, + { + "epoch": 0.13618380965986118, + "grad_norm": 1.37955641746521, + "learning_rate": 1.9987574391482698e-05, + "loss": 1.4074, + "step": 5839 + }, + { + "epoch": 0.1362071327990391, + "grad_norm": 1.6011735200881958, + "learning_rate": 1.9987561839796695e-05, + "loss": 1.6959, + "step": 5840 + }, + { + "epoch": 0.136230455938217, + "grad_norm": 1.75760018825531, + "learning_rate": 1.9987549281778316e-05, + "loss": 1.5986, + "step": 5841 + }, + { + "epoch": 0.13625377907739492, + "grad_norm": 1.7888107299804688, + "learning_rate": 1.9987536717427565e-05, + "loss": 1.2823, + "step": 5842 + }, + { + "epoch": 0.13627710221657283, + "grad_norm": 1.727973222732544, + "learning_rate": 1.9987524146744457e-05, + "loss": 1.1134, + "step": 5843 + }, + { + "epoch": 0.13630042535575077, + "grad_norm": 2.212238073348999, + "learning_rate": 1.9987511569728995e-05, + "loss": 1.4435, + "step": 5844 + }, + { + "epoch": 0.13632374849492868, + "grad_norm": 1.9471218585968018, + "learning_rate": 1.998749898638119e-05, + "loss": 1.7248, + "step": 5845 + }, + { + "epoch": 0.1363470716341066, + "grad_norm": 1.893047571182251, + "learning_rate": 1.9987486396701047e-05, + "loss": 1.3725, + "step": 5846 + }, + { + "epoch": 0.1363703947732845, + "grad_norm": 1.8620859384536743, + "learning_rate": 1.9987473800688574e-05, + "loss": 1.0031, + "step": 5847 + }, + { + "epoch": 0.13639371791246244, + "grad_norm": 1.9266870021820068, + "learning_rate": 1.998746119834378e-05, + "loss": 1.639, + "step": 5848 + }, + { + "epoch": 0.13641704105164035, + "grad_norm": 1.3625257015228271, + "learning_rate": 1.9987448589666674e-05, + "loss": 1.1538, + "step": 5849 + }, + { + "epoch": 0.13644036419081826, + "grad_norm": 1.4815322160720825, + "learning_rate": 1.9987435974657265e-05, + "loss": 1.3088, + "step": 5850 + }, + { + "epoch": 0.13646368732999617, + "grad_norm": 1.5348459482192993, + "learning_rate": 1.998742335331556e-05, + "loss": 1.5016, + "step": 5851 + }, + { + "epoch": 0.1364870104691741, + "grad_norm": 1.8962697982788086, + "learning_rate": 1.9987410725641564e-05, + "loss": 1.3317, + "step": 5852 + }, + { + "epoch": 0.13651033360835202, + "grad_norm": 1.8656713962554932, + "learning_rate": 1.998739809163529e-05, + "loss": 1.4903, + "step": 5853 + }, + { + "epoch": 0.13653365674752993, + "grad_norm": 1.5151500701904297, + "learning_rate": 1.9987385451296742e-05, + "loss": 1.317, + "step": 5854 + }, + { + "epoch": 0.13655697988670784, + "grad_norm": 2.1853296756744385, + "learning_rate": 1.9987372804625932e-05, + "loss": 1.9944, + "step": 5855 + }, + { + "epoch": 0.13658030302588578, + "grad_norm": 1.5933955907821655, + "learning_rate": 1.998736015162286e-05, + "loss": 1.3042, + "step": 5856 + }, + { + "epoch": 0.1366036261650637, + "grad_norm": 1.6365134716033936, + "learning_rate": 1.9987347492287542e-05, + "loss": 1.7024, + "step": 5857 + }, + { + "epoch": 0.1366269493042416, + "grad_norm": 1.6374387741088867, + "learning_rate": 1.9987334826619986e-05, + "loss": 1.5208, + "step": 5858 + }, + { + "epoch": 0.1366502724434195, + "grad_norm": 1.8526194095611572, + "learning_rate": 1.9987322154620194e-05, + "loss": 1.474, + "step": 5859 + }, + { + "epoch": 0.13667359558259745, + "grad_norm": 1.5601885318756104, + "learning_rate": 1.998730947628818e-05, + "loss": 1.3996, + "step": 5860 + }, + { + "epoch": 0.13669691872177536, + "grad_norm": 1.780155062675476, + "learning_rate": 1.9987296791623948e-05, + "loss": 1.2727, + "step": 5861 + }, + { + "epoch": 0.13672024186095327, + "grad_norm": 1.7737231254577637, + "learning_rate": 1.9987284100627508e-05, + "loss": 1.1482, + "step": 5862 + }, + { + "epoch": 0.13674356500013118, + "grad_norm": 1.6502045392990112, + "learning_rate": 1.9987271403298866e-05, + "loss": 1.3824, + "step": 5863 + }, + { + "epoch": 0.13676688813930912, + "grad_norm": 1.7158825397491455, + "learning_rate": 1.9987258699638033e-05, + "loss": 1.7508, + "step": 5864 + }, + { + "epoch": 0.13679021127848703, + "grad_norm": 1.5892664194107056, + "learning_rate": 1.9987245989645015e-05, + "loss": 1.5751, + "step": 5865 + }, + { + "epoch": 0.13681353441766494, + "grad_norm": 1.6207221746444702, + "learning_rate": 1.9987233273319823e-05, + "loss": 1.355, + "step": 5866 + }, + { + "epoch": 0.13683685755684286, + "grad_norm": 1.9680052995681763, + "learning_rate": 1.9987220550662464e-05, + "loss": 1.4701, + "step": 5867 + }, + { + "epoch": 0.1368601806960208, + "grad_norm": 1.5549229383468628, + "learning_rate": 1.9987207821672943e-05, + "loss": 1.343, + "step": 5868 + }, + { + "epoch": 0.1368835038351987, + "grad_norm": 1.935019612312317, + "learning_rate": 1.998719508635127e-05, + "loss": 1.4097, + "step": 5869 + }, + { + "epoch": 0.13690682697437662, + "grad_norm": 1.7179702520370483, + "learning_rate": 1.9987182344697454e-05, + "loss": 1.267, + "step": 5870 + }, + { + "epoch": 0.13693015011355453, + "grad_norm": 1.856831669807434, + "learning_rate": 1.99871695967115e-05, + "loss": 1.1633, + "step": 5871 + }, + { + "epoch": 0.13695347325273244, + "grad_norm": 1.538719892501831, + "learning_rate": 1.998715684239342e-05, + "loss": 1.1936, + "step": 5872 + }, + { + "epoch": 0.13697679639191038, + "grad_norm": 2.0261423587799072, + "learning_rate": 1.9987144081743217e-05, + "loss": 1.4518, + "step": 5873 + }, + { + "epoch": 0.1370001195310883, + "grad_norm": 1.705793023109436, + "learning_rate": 1.9987131314760904e-05, + "loss": 1.4938, + "step": 5874 + }, + { + "epoch": 0.1370234426702662, + "grad_norm": 1.6730681657791138, + "learning_rate": 1.9987118541446492e-05, + "loss": 1.3952, + "step": 5875 + }, + { + "epoch": 0.1370467658094441, + "grad_norm": 2.075650215148926, + "learning_rate": 1.998710576179998e-05, + "loss": 1.5905, + "step": 5876 + }, + { + "epoch": 0.13707008894862205, + "grad_norm": 2.0692715644836426, + "learning_rate": 1.9987092975821385e-05, + "loss": 1.747, + "step": 5877 + }, + { + "epoch": 0.13709341208779996, + "grad_norm": 1.419540286064148, + "learning_rate": 1.9987080183510707e-05, + "loss": 1.1716, + "step": 5878 + }, + { + "epoch": 0.13711673522697787, + "grad_norm": 1.4417150020599365, + "learning_rate": 1.9987067384867956e-05, + "loss": 1.1546, + "step": 5879 + }, + { + "epoch": 0.13714005836615578, + "grad_norm": 1.4849956035614014, + "learning_rate": 1.998705457989315e-05, + "loss": 1.5233, + "step": 5880 + }, + { + "epoch": 0.13716338150533372, + "grad_norm": 1.627962350845337, + "learning_rate": 1.9987041768586284e-05, + "loss": 1.2824, + "step": 5881 + }, + { + "epoch": 0.13718670464451163, + "grad_norm": 1.7242908477783203, + "learning_rate": 1.998702895094737e-05, + "loss": 1.4096, + "step": 5882 + }, + { + "epoch": 0.13721002778368954, + "grad_norm": 1.9949018955230713, + "learning_rate": 1.998701612697642e-05, + "loss": 1.2226, + "step": 5883 + }, + { + "epoch": 0.13723335092286745, + "grad_norm": 1.559213638305664, + "learning_rate": 1.9987003296673437e-05, + "loss": 1.5843, + "step": 5884 + }, + { + "epoch": 0.1372566740620454, + "grad_norm": 1.5795493125915527, + "learning_rate": 1.9986990460038433e-05, + "loss": 0.931, + "step": 5885 + }, + { + "epoch": 0.1372799972012233, + "grad_norm": 1.8416163921356201, + "learning_rate": 1.9986977617071417e-05, + "loss": 1.576, + "step": 5886 + }, + { + "epoch": 0.1373033203404012, + "grad_norm": 1.7067124843597412, + "learning_rate": 1.9986964767772393e-05, + "loss": 1.3873, + "step": 5887 + }, + { + "epoch": 0.13732664347957912, + "grad_norm": 1.7658650875091553, + "learning_rate": 1.998695191214137e-05, + "loss": 1.5238, + "step": 5888 + }, + { + "epoch": 0.13734996661875706, + "grad_norm": 1.8947843313217163, + "learning_rate": 1.998693905017836e-05, + "loss": 1.4994, + "step": 5889 + }, + { + "epoch": 0.13737328975793497, + "grad_norm": 1.3939714431762695, + "learning_rate": 1.998692618188337e-05, + "loss": 1.4281, + "step": 5890 + }, + { + "epoch": 0.13739661289711289, + "grad_norm": 1.5842130184173584, + "learning_rate": 1.99869133072564e-05, + "loss": 1.3824, + "step": 5891 + }, + { + "epoch": 0.1374199360362908, + "grad_norm": 1.6344809532165527, + "learning_rate": 1.998690042629747e-05, + "loss": 1.3019, + "step": 5892 + }, + { + "epoch": 0.13744325917546873, + "grad_norm": 2.048349618911743, + "learning_rate": 1.998688753900658e-05, + "loss": 1.3927, + "step": 5893 + }, + { + "epoch": 0.13746658231464665, + "grad_norm": 2.7215261459350586, + "learning_rate": 1.9986874645383746e-05, + "loss": 1.7618, + "step": 5894 + }, + { + "epoch": 0.13748990545382456, + "grad_norm": 1.806826114654541, + "learning_rate": 1.9986861745428963e-05, + "loss": 1.338, + "step": 5895 + }, + { + "epoch": 0.13751322859300247, + "grad_norm": 1.5835437774658203, + "learning_rate": 1.9986848839142254e-05, + "loss": 1.5083, + "step": 5896 + }, + { + "epoch": 0.1375365517321804, + "grad_norm": 1.7267756462097168, + "learning_rate": 1.9986835926523617e-05, + "loss": 1.3759, + "step": 5897 + }, + { + "epoch": 0.13755987487135832, + "grad_norm": 1.7098757028579712, + "learning_rate": 1.9986823007573067e-05, + "loss": 1.5995, + "step": 5898 + }, + { + "epoch": 0.13758319801053623, + "grad_norm": 1.897100567817688, + "learning_rate": 1.998681008229061e-05, + "loss": 1.2634, + "step": 5899 + }, + { + "epoch": 0.13760652114971414, + "grad_norm": 1.6434178352355957, + "learning_rate": 1.998679715067625e-05, + "loss": 1.5113, + "step": 5900 + }, + { + "epoch": 0.13762984428889205, + "grad_norm": 1.4281799793243408, + "learning_rate": 1.998678421273e-05, + "loss": 1.3666, + "step": 5901 + }, + { + "epoch": 0.13765316742807, + "grad_norm": 1.8021718263626099, + "learning_rate": 1.9986771268451867e-05, + "loss": 1.336, + "step": 5902 + }, + { + "epoch": 0.1376764905672479, + "grad_norm": 1.419053316116333, + "learning_rate": 1.9986758317841858e-05, + "loss": 1.3577, + "step": 5903 + }, + { + "epoch": 0.1376998137064258, + "grad_norm": 1.8294508457183838, + "learning_rate": 1.998674536089998e-05, + "loss": 1.265, + "step": 5904 + }, + { + "epoch": 0.13772313684560372, + "grad_norm": 1.5300843715667725, + "learning_rate": 1.9986732397626245e-05, + "loss": 1.7839, + "step": 5905 + }, + { + "epoch": 0.13774645998478166, + "grad_norm": 1.7234196662902832, + "learning_rate": 1.9986719428020658e-05, + "loss": 1.4359, + "step": 5906 + }, + { + "epoch": 0.13776978312395957, + "grad_norm": 1.9976733922958374, + "learning_rate": 1.998670645208323e-05, + "loss": 1.6696, + "step": 5907 + }, + { + "epoch": 0.13779310626313748, + "grad_norm": 1.6174886226654053, + "learning_rate": 1.998669346981397e-05, + "loss": 1.0726, + "step": 5908 + }, + { + "epoch": 0.1378164294023154, + "grad_norm": 1.701895833015442, + "learning_rate": 1.9986680481212883e-05, + "loss": 1.3765, + "step": 5909 + }, + { + "epoch": 0.13783975254149333, + "grad_norm": 1.5521197319030762, + "learning_rate": 1.9986667486279974e-05, + "loss": 1.6321, + "step": 5910 + }, + { + "epoch": 0.13786307568067124, + "grad_norm": 2.126631736755371, + "learning_rate": 1.998665448501526e-05, + "loss": 1.4743, + "step": 5911 + }, + { + "epoch": 0.13788639881984915, + "grad_norm": 1.643113374710083, + "learning_rate": 1.998664147741875e-05, + "loss": 1.3438, + "step": 5912 + }, + { + "epoch": 0.13790972195902707, + "grad_norm": 2.1543045043945312, + "learning_rate": 1.9986628463490438e-05, + "loss": 1.2992, + "step": 5913 + }, + { + "epoch": 0.137933045098205, + "grad_norm": 1.5609725713729858, + "learning_rate": 1.9986615443230343e-05, + "loss": 1.3241, + "step": 5914 + }, + { + "epoch": 0.13795636823738291, + "grad_norm": 1.9155333042144775, + "learning_rate": 1.9986602416638473e-05, + "loss": 1.4536, + "step": 5915 + }, + { + "epoch": 0.13797969137656083, + "grad_norm": 1.3895609378814697, + "learning_rate": 1.9986589383714835e-05, + "loss": 1.1264, + "step": 5916 + }, + { + "epoch": 0.13800301451573874, + "grad_norm": 1.943770408630371, + "learning_rate": 1.998657634445944e-05, + "loss": 1.6614, + "step": 5917 + }, + { + "epoch": 0.13802633765491668, + "grad_norm": 1.637864351272583, + "learning_rate": 1.998656329887229e-05, + "loss": 1.3344, + "step": 5918 + }, + { + "epoch": 0.1380496607940946, + "grad_norm": 1.8925068378448486, + "learning_rate": 1.9986550246953397e-05, + "loss": 1.5077, + "step": 5919 + }, + { + "epoch": 0.1380729839332725, + "grad_norm": 1.7850497961044312, + "learning_rate": 1.998653718870277e-05, + "loss": 1.3675, + "step": 5920 + }, + { + "epoch": 0.1380963070724504, + "grad_norm": 1.8133920431137085, + "learning_rate": 1.9986524124120415e-05, + "loss": 1.1895, + "step": 5921 + }, + { + "epoch": 0.13811963021162835, + "grad_norm": 1.7098134756088257, + "learning_rate": 1.9986511053206343e-05, + "loss": 1.5657, + "step": 5922 + }, + { + "epoch": 0.13814295335080626, + "grad_norm": 2.2276642322540283, + "learning_rate": 1.9986497975960558e-05, + "loss": 1.5623, + "step": 5923 + }, + { + "epoch": 0.13816627648998417, + "grad_norm": 1.7869797945022583, + "learning_rate": 1.9986484892383076e-05, + "loss": 1.6699, + "step": 5924 + }, + { + "epoch": 0.13818959962916208, + "grad_norm": 1.9060138463974, + "learning_rate": 1.9986471802473898e-05, + "loss": 1.0772, + "step": 5925 + }, + { + "epoch": 0.13821292276834002, + "grad_norm": 1.7892858982086182, + "learning_rate": 1.9986458706233033e-05, + "loss": 1.2455, + "step": 5926 + }, + { + "epoch": 0.13823624590751793, + "grad_norm": 1.5083891153335571, + "learning_rate": 1.998644560366049e-05, + "loss": 1.4511, + "step": 5927 + }, + { + "epoch": 0.13825956904669584, + "grad_norm": 1.8618026971817017, + "learning_rate": 1.998643249475628e-05, + "loss": 1.7313, + "step": 5928 + }, + { + "epoch": 0.13828289218587375, + "grad_norm": 1.7617428302764893, + "learning_rate": 1.998641937952041e-05, + "loss": 1.4426, + "step": 5929 + }, + { + "epoch": 0.13830621532505166, + "grad_norm": 1.7681574821472168, + "learning_rate": 1.998640625795289e-05, + "loss": 1.4612, + "step": 5930 + }, + { + "epoch": 0.1383295384642296, + "grad_norm": 1.5730996131896973, + "learning_rate": 1.9986393130053723e-05, + "loss": 1.7798, + "step": 5931 + }, + { + "epoch": 0.1383528616034075, + "grad_norm": 1.6683655977249146, + "learning_rate": 1.9986379995822922e-05, + "loss": 1.4263, + "step": 5932 + }, + { + "epoch": 0.13837618474258542, + "grad_norm": 1.9588593244552612, + "learning_rate": 1.9986366855260493e-05, + "loss": 1.1437, + "step": 5933 + }, + { + "epoch": 0.13839950788176333, + "grad_norm": 1.5498912334442139, + "learning_rate": 1.9986353708366448e-05, + "loss": 1.5542, + "step": 5934 + }, + { + "epoch": 0.13842283102094127, + "grad_norm": 1.4728989601135254, + "learning_rate": 1.9986340555140787e-05, + "loss": 1.3032, + "step": 5935 + }, + { + "epoch": 0.13844615416011918, + "grad_norm": 1.865103006362915, + "learning_rate": 1.998632739558353e-05, + "loss": 1.5746, + "step": 5936 + }, + { + "epoch": 0.1384694772992971, + "grad_norm": 2.1221933364868164, + "learning_rate": 1.9986314229694678e-05, + "loss": 1.3656, + "step": 5937 + }, + { + "epoch": 0.138492800438475, + "grad_norm": 2.028693914413452, + "learning_rate": 1.9986301057474236e-05, + "loss": 1.259, + "step": 5938 + }, + { + "epoch": 0.13851612357765294, + "grad_norm": 1.7283875942230225, + "learning_rate": 1.9986287878922223e-05, + "loss": 1.3185, + "step": 5939 + }, + { + "epoch": 0.13853944671683086, + "grad_norm": 2.430925130844116, + "learning_rate": 1.9986274694038635e-05, + "loss": 1.4215, + "step": 5940 + }, + { + "epoch": 0.13856276985600877, + "grad_norm": 2.350492238998413, + "learning_rate": 1.9986261502823493e-05, + "loss": 1.3892, + "step": 5941 + }, + { + "epoch": 0.13858609299518668, + "grad_norm": 2.417433977127075, + "learning_rate": 1.9986248305276796e-05, + "loss": 1.8118, + "step": 5942 + }, + { + "epoch": 0.13860941613436462, + "grad_norm": 1.8303501605987549, + "learning_rate": 1.9986235101398556e-05, + "loss": 1.2223, + "step": 5943 + }, + { + "epoch": 0.13863273927354253, + "grad_norm": 1.3595937490463257, + "learning_rate": 1.998622189118878e-05, + "loss": 1.2737, + "step": 5944 + }, + { + "epoch": 0.13865606241272044, + "grad_norm": 1.9302918910980225, + "learning_rate": 1.9986208674647475e-05, + "loss": 1.1931, + "step": 5945 + }, + { + "epoch": 0.13867938555189835, + "grad_norm": 1.8451989889144897, + "learning_rate": 1.9986195451774656e-05, + "loss": 1.4484, + "step": 5946 + }, + { + "epoch": 0.1387027086910763, + "grad_norm": 1.769647240638733, + "learning_rate": 1.9986182222570324e-05, + "loss": 1.2957, + "step": 5947 + }, + { + "epoch": 0.1387260318302542, + "grad_norm": 1.4282586574554443, + "learning_rate": 1.9986168987034494e-05, + "loss": 1.5363, + "step": 5948 + }, + { + "epoch": 0.1387493549694321, + "grad_norm": 2.138019561767578, + "learning_rate": 1.9986155745167165e-05, + "loss": 1.4282, + "step": 5949 + }, + { + "epoch": 0.13877267810861002, + "grad_norm": 1.9666112661361694, + "learning_rate": 1.9986142496968355e-05, + "loss": 1.7417, + "step": 5950 + }, + { + "epoch": 0.13879600124778796, + "grad_norm": 1.6037325859069824, + "learning_rate": 1.9986129242438066e-05, + "loss": 1.4991, + "step": 5951 + }, + { + "epoch": 0.13881932438696587, + "grad_norm": 1.6661661863327026, + "learning_rate": 1.998611598157631e-05, + "loss": 1.4651, + "step": 5952 + }, + { + "epoch": 0.13884264752614378, + "grad_norm": 1.7917193174362183, + "learning_rate": 1.9986102714383095e-05, + "loss": 1.5217, + "step": 5953 + }, + { + "epoch": 0.1388659706653217, + "grad_norm": 1.9374152421951294, + "learning_rate": 1.9986089440858428e-05, + "loss": 1.5581, + "step": 5954 + }, + { + "epoch": 0.1388892938044996, + "grad_norm": 1.9052860736846924, + "learning_rate": 1.998607616100232e-05, + "loss": 1.514, + "step": 5955 + }, + { + "epoch": 0.13891261694367754, + "grad_norm": 1.7253270149230957, + "learning_rate": 1.9986062874814775e-05, + "loss": 1.3399, + "step": 5956 + }, + { + "epoch": 0.13893594008285545, + "grad_norm": 1.7499825954437256, + "learning_rate": 1.9986049582295807e-05, + "loss": 1.3011, + "step": 5957 + }, + { + "epoch": 0.13895926322203336, + "grad_norm": 1.579527735710144, + "learning_rate": 1.9986036283445417e-05, + "loss": 1.2057, + "step": 5958 + }, + { + "epoch": 0.13898258636121127, + "grad_norm": 1.8384255170822144, + "learning_rate": 1.998602297826362e-05, + "loss": 1.5699, + "step": 5959 + }, + { + "epoch": 0.1390059095003892, + "grad_norm": 1.5306830406188965, + "learning_rate": 1.9986009666750424e-05, + "loss": 1.2356, + "step": 5960 + }, + { + "epoch": 0.13902923263956712, + "grad_norm": 2.1967837810516357, + "learning_rate": 1.9985996348905834e-05, + "loss": 1.1663, + "step": 5961 + }, + { + "epoch": 0.13905255577874504, + "grad_norm": 1.7975126504898071, + "learning_rate": 1.9985983024729857e-05, + "loss": 1.7858, + "step": 5962 + }, + { + "epoch": 0.13907587891792295, + "grad_norm": 1.723204255104065, + "learning_rate": 1.998596969422251e-05, + "loss": 1.2506, + "step": 5963 + }, + { + "epoch": 0.13909920205710088, + "grad_norm": 1.8693764209747314, + "learning_rate": 1.9985956357383795e-05, + "loss": 1.5436, + "step": 5964 + }, + { + "epoch": 0.1391225251962788, + "grad_norm": 1.6206592321395874, + "learning_rate": 1.998594301421372e-05, + "loss": 1.2804, + "step": 5965 + }, + { + "epoch": 0.1391458483354567, + "grad_norm": 1.7973251342773438, + "learning_rate": 1.9985929664712294e-05, + "loss": 1.2463, + "step": 5966 + }, + { + "epoch": 0.13916917147463462, + "grad_norm": 1.9007964134216309, + "learning_rate": 1.998591630887953e-05, + "loss": 1.2544, + "step": 5967 + }, + { + "epoch": 0.13919249461381256, + "grad_norm": 2.0027081966400146, + "learning_rate": 1.998590294671543e-05, + "loss": 1.5609, + "step": 5968 + }, + { + "epoch": 0.13921581775299047, + "grad_norm": 2.0610148906707764, + "learning_rate": 1.9985889578220005e-05, + "loss": 1.2126, + "step": 5969 + }, + { + "epoch": 0.13923914089216838, + "grad_norm": 1.7119241952896118, + "learning_rate": 1.9985876203393264e-05, + "loss": 1.2958, + "step": 5970 + }, + { + "epoch": 0.1392624640313463, + "grad_norm": 2.198431968688965, + "learning_rate": 1.9985862822235215e-05, + "loss": 1.091, + "step": 5971 + }, + { + "epoch": 0.13928578717052423, + "grad_norm": 1.4475634098052979, + "learning_rate": 1.9985849434745872e-05, + "loss": 0.9317, + "step": 5972 + }, + { + "epoch": 0.13930911030970214, + "grad_norm": 1.6330029964447021, + "learning_rate": 1.9985836040925233e-05, + "loss": 1.6857, + "step": 5973 + }, + { + "epoch": 0.13933243344888005, + "grad_norm": 1.8736703395843506, + "learning_rate": 1.998582264077331e-05, + "loss": 1.731, + "step": 5974 + }, + { + "epoch": 0.13935575658805796, + "grad_norm": 1.9036253690719604, + "learning_rate": 1.998580923429012e-05, + "loss": 1.2292, + "step": 5975 + }, + { + "epoch": 0.1393790797272359, + "grad_norm": 2.0656869411468506, + "learning_rate": 1.998579582147566e-05, + "loss": 1.1653, + "step": 5976 + }, + { + "epoch": 0.1394024028664138, + "grad_norm": 1.8336554765701294, + "learning_rate": 1.9985782402329942e-05, + "loss": 1.7517, + "step": 5977 + }, + { + "epoch": 0.13942572600559172, + "grad_norm": 1.6612292528152466, + "learning_rate": 1.998576897685298e-05, + "loss": 1.5828, + "step": 5978 + }, + { + "epoch": 0.13944904914476963, + "grad_norm": 2.3920350074768066, + "learning_rate": 1.9985755545044778e-05, + "loss": 1.1367, + "step": 5979 + }, + { + "epoch": 0.13947237228394757, + "grad_norm": 2.2012009620666504, + "learning_rate": 1.9985742106905342e-05, + "loss": 1.2318, + "step": 5980 + }, + { + "epoch": 0.13949569542312548, + "grad_norm": 1.7054622173309326, + "learning_rate": 1.9985728662434684e-05, + "loss": 1.3642, + "step": 5981 + }, + { + "epoch": 0.1395190185623034, + "grad_norm": 1.954516887664795, + "learning_rate": 1.9985715211632813e-05, + "loss": 1.5542, + "step": 5982 + }, + { + "epoch": 0.1395423417014813, + "grad_norm": 1.4238861799240112, + "learning_rate": 1.9985701754499738e-05, + "loss": 1.0473, + "step": 5983 + }, + { + "epoch": 0.13956566484065921, + "grad_norm": 1.7874926328659058, + "learning_rate": 1.998568829103546e-05, + "loss": 1.5126, + "step": 5984 + }, + { + "epoch": 0.13958898797983715, + "grad_norm": 1.665626883506775, + "learning_rate": 1.998567482124e-05, + "loss": 1.4718, + "step": 5985 + }, + { + "epoch": 0.13961231111901506, + "grad_norm": 1.6119762659072876, + "learning_rate": 1.9985661345113357e-05, + "loss": 1.2944, + "step": 5986 + }, + { + "epoch": 0.13963563425819298, + "grad_norm": 1.8473111391067505, + "learning_rate": 1.998564786265554e-05, + "loss": 1.6694, + "step": 5987 + }, + { + "epoch": 0.1396589573973709, + "grad_norm": 1.9592182636260986, + "learning_rate": 1.9985634373866564e-05, + "loss": 1.524, + "step": 5988 + }, + { + "epoch": 0.13968228053654883, + "grad_norm": 1.7166043519973755, + "learning_rate": 1.9985620878746435e-05, + "loss": 1.608, + "step": 5989 + }, + { + "epoch": 0.13970560367572674, + "grad_norm": 1.7846938371658325, + "learning_rate": 1.9985607377295156e-05, + "loss": 1.3213, + "step": 5990 + }, + { + "epoch": 0.13972892681490465, + "grad_norm": 1.802619218826294, + "learning_rate": 1.998559386951274e-05, + "loss": 1.3188, + "step": 5991 + }, + { + "epoch": 0.13975224995408256, + "grad_norm": 2.7373600006103516, + "learning_rate": 1.9985580355399198e-05, + "loss": 1.2913, + "step": 5992 + }, + { + "epoch": 0.1397755730932605, + "grad_norm": 1.9509944915771484, + "learning_rate": 1.9985566834954536e-05, + "loss": 1.5278, + "step": 5993 + }, + { + "epoch": 0.1397988962324384, + "grad_norm": 1.68146812915802, + "learning_rate": 1.998555330817876e-05, + "loss": 1.6428, + "step": 5994 + }, + { + "epoch": 0.13982221937161632, + "grad_norm": 2.0023419857025146, + "learning_rate": 1.998553977507188e-05, + "loss": 1.7057, + "step": 5995 + }, + { + "epoch": 0.13984554251079423, + "grad_norm": 1.6589012145996094, + "learning_rate": 1.998552623563391e-05, + "loss": 1.1189, + "step": 5996 + }, + { + "epoch": 0.13986886564997217, + "grad_norm": 1.6851308345794678, + "learning_rate": 1.9985512689864852e-05, + "loss": 1.4658, + "step": 5997 + }, + { + "epoch": 0.13989218878915008, + "grad_norm": 1.5446650981903076, + "learning_rate": 1.9985499137764716e-05, + "loss": 1.1115, + "step": 5998 + }, + { + "epoch": 0.139915511928328, + "grad_norm": 1.6361374855041504, + "learning_rate": 1.9985485579333513e-05, + "loss": 1.2511, + "step": 5999 + }, + { + "epoch": 0.1399388350675059, + "grad_norm": 2.139852523803711, + "learning_rate": 1.998547201457125e-05, + "loss": 1.1905, + "step": 6000 + }, + { + "epoch": 0.13996215820668384, + "grad_norm": 1.6102153062820435, + "learning_rate": 1.9985458443477935e-05, + "loss": 1.326, + "step": 6001 + }, + { + "epoch": 0.13998548134586175, + "grad_norm": 1.9492695331573486, + "learning_rate": 1.9985444866053577e-05, + "loss": 1.5153, + "step": 6002 + }, + { + "epoch": 0.14000880448503966, + "grad_norm": 1.8968290090560913, + "learning_rate": 1.9985431282298183e-05, + "loss": 1.3207, + "step": 6003 + }, + { + "epoch": 0.14003212762421757, + "grad_norm": 2.0098559856414795, + "learning_rate": 1.9985417692211767e-05, + "loss": 1.708, + "step": 6004 + }, + { + "epoch": 0.1400554507633955, + "grad_norm": 2.1262738704681396, + "learning_rate": 1.998540409579433e-05, + "loss": 1.4273, + "step": 6005 + }, + { + "epoch": 0.14007877390257342, + "grad_norm": 2.353754997253418, + "learning_rate": 1.9985390493045887e-05, + "loss": 1.3932, + "step": 6006 + }, + { + "epoch": 0.14010209704175133, + "grad_norm": 1.7316818237304688, + "learning_rate": 1.9985376883966444e-05, + "loss": 1.7993, + "step": 6007 + }, + { + "epoch": 0.14012542018092924, + "grad_norm": 1.966521143913269, + "learning_rate": 1.998536326855601e-05, + "loss": 1.8455, + "step": 6008 + }, + { + "epoch": 0.14014874332010718, + "grad_norm": 2.211327314376831, + "learning_rate": 1.9985349646814596e-05, + "loss": 1.3215, + "step": 6009 + }, + { + "epoch": 0.1401720664592851, + "grad_norm": 2.1321232318878174, + "learning_rate": 1.9985336018742205e-05, + "loss": 1.2153, + "step": 6010 + }, + { + "epoch": 0.140195389598463, + "grad_norm": 1.9806395769119263, + "learning_rate": 1.998532238433885e-05, + "loss": 1.7741, + "step": 6011 + }, + { + "epoch": 0.14021871273764092, + "grad_norm": 1.6093829870224, + "learning_rate": 1.9985308743604538e-05, + "loss": 1.3344, + "step": 6012 + }, + { + "epoch": 0.14024203587681883, + "grad_norm": 1.559790015220642, + "learning_rate": 1.9985295096539275e-05, + "loss": 1.2971, + "step": 6013 + }, + { + "epoch": 0.14026535901599677, + "grad_norm": 1.5524859428405762, + "learning_rate": 1.998528144314308e-05, + "loss": 1.2704, + "step": 6014 + }, + { + "epoch": 0.14028868215517468, + "grad_norm": 1.3505932092666626, + "learning_rate": 1.9985267783415948e-05, + "loss": 1.2587, + "step": 6015 + }, + { + "epoch": 0.1403120052943526, + "grad_norm": 3.4421284198760986, + "learning_rate": 1.9985254117357896e-05, + "loss": 1.5112, + "step": 6016 + }, + { + "epoch": 0.1403353284335305, + "grad_norm": 1.7249492406845093, + "learning_rate": 1.998524044496893e-05, + "loss": 1.1893, + "step": 6017 + }, + { + "epoch": 0.14035865157270844, + "grad_norm": 1.4829734563827515, + "learning_rate": 1.998522676624906e-05, + "loss": 1.1448, + "step": 6018 + }, + { + "epoch": 0.14038197471188635, + "grad_norm": 1.785304307937622, + "learning_rate": 1.9985213081198295e-05, + "loss": 1.6817, + "step": 6019 + }, + { + "epoch": 0.14040529785106426, + "grad_norm": 2.0902676582336426, + "learning_rate": 1.9985199389816645e-05, + "loss": 1.1452, + "step": 6020 + }, + { + "epoch": 0.14042862099024217, + "grad_norm": 1.9945709705352783, + "learning_rate": 1.998518569210411e-05, + "loss": 1.2422, + "step": 6021 + }, + { + "epoch": 0.1404519441294201, + "grad_norm": 2.506932497024536, + "learning_rate": 1.998517198806071e-05, + "loss": 1.509, + "step": 6022 + }, + { + "epoch": 0.14047526726859802, + "grad_norm": 1.608056664466858, + "learning_rate": 1.9985158277686448e-05, + "loss": 1.5649, + "step": 6023 + }, + { + "epoch": 0.14049859040777593, + "grad_norm": 1.8673421144485474, + "learning_rate": 1.9985144560981332e-05, + "loss": 1.8225, + "step": 6024 + }, + { + "epoch": 0.14052191354695384, + "grad_norm": 1.7653878927230835, + "learning_rate": 1.9985130837945376e-05, + "loss": 1.1869, + "step": 6025 + }, + { + "epoch": 0.14054523668613178, + "grad_norm": 1.5135056972503662, + "learning_rate": 1.998511710857858e-05, + "loss": 1.5364, + "step": 6026 + }, + { + "epoch": 0.1405685598253097, + "grad_norm": 1.7755826711654663, + "learning_rate": 1.9985103372880964e-05, + "loss": 1.5993, + "step": 6027 + }, + { + "epoch": 0.1405918829644876, + "grad_norm": 1.6194390058517456, + "learning_rate": 1.9985089630852524e-05, + "loss": 1.1574, + "step": 6028 + }, + { + "epoch": 0.1406152061036655, + "grad_norm": 2.509934425354004, + "learning_rate": 1.9985075882493278e-05, + "loss": 1.7279, + "step": 6029 + }, + { + "epoch": 0.14063852924284345, + "grad_norm": 1.7961852550506592, + "learning_rate": 1.998506212780323e-05, + "loss": 1.7813, + "step": 6030 + }, + { + "epoch": 0.14066185238202136, + "grad_norm": 1.6347060203552246, + "learning_rate": 1.998504836678239e-05, + "loss": 1.3222, + "step": 6031 + }, + { + "epoch": 0.14068517552119927, + "grad_norm": 2.0044994354248047, + "learning_rate": 1.9985034599430772e-05, + "loss": 1.4786, + "step": 6032 + }, + { + "epoch": 0.14070849866037718, + "grad_norm": 1.552268624305725, + "learning_rate": 1.9985020825748375e-05, + "loss": 1.4359, + "step": 6033 + }, + { + "epoch": 0.14073182179955512, + "grad_norm": 1.6400984525680542, + "learning_rate": 1.9985007045735214e-05, + "loss": 1.3523, + "step": 6034 + }, + { + "epoch": 0.14075514493873303, + "grad_norm": 1.768753170967102, + "learning_rate": 1.9984993259391297e-05, + "loss": 1.0996, + "step": 6035 + }, + { + "epoch": 0.14077846807791095, + "grad_norm": 1.563262701034546, + "learning_rate": 1.998497946671663e-05, + "loss": 1.508, + "step": 6036 + }, + { + "epoch": 0.14080179121708886, + "grad_norm": 1.7188997268676758, + "learning_rate": 1.9984965667711224e-05, + "loss": 1.4617, + "step": 6037 + }, + { + "epoch": 0.1408251143562668, + "grad_norm": 1.620224118232727, + "learning_rate": 1.9984951862375094e-05, + "loss": 1.4751, + "step": 6038 + }, + { + "epoch": 0.1408484374954447, + "grad_norm": 1.7192742824554443, + "learning_rate": 1.9984938050708234e-05, + "loss": 1.3163, + "step": 6039 + }, + { + "epoch": 0.14087176063462262, + "grad_norm": 1.695589542388916, + "learning_rate": 1.9984924232710665e-05, + "loss": 1.4246, + "step": 6040 + }, + { + "epoch": 0.14089508377380053, + "grad_norm": 1.8395006656646729, + "learning_rate": 1.9984910408382393e-05, + "loss": 1.4144, + "step": 6041 + }, + { + "epoch": 0.14091840691297844, + "grad_norm": 1.3727190494537354, + "learning_rate": 1.9984896577723424e-05, + "loss": 1.3329, + "step": 6042 + }, + { + "epoch": 0.14094173005215638, + "grad_norm": 1.518173336982727, + "learning_rate": 1.9984882740733768e-05, + "loss": 1.7846, + "step": 6043 + }, + { + "epoch": 0.1409650531913343, + "grad_norm": 1.6783645153045654, + "learning_rate": 1.9984868897413435e-05, + "loss": 1.4808, + "step": 6044 + }, + { + "epoch": 0.1409883763305122, + "grad_norm": 1.6759785413742065, + "learning_rate": 1.9984855047762432e-05, + "loss": 1.314, + "step": 6045 + }, + { + "epoch": 0.1410116994696901, + "grad_norm": 1.9766584634780884, + "learning_rate": 1.998484119178077e-05, + "loss": 1.8433, + "step": 6046 + }, + { + "epoch": 0.14103502260886805, + "grad_norm": 1.3789831399917603, + "learning_rate": 1.9984827329468457e-05, + "loss": 1.2249, + "step": 6047 + }, + { + "epoch": 0.14105834574804596, + "grad_norm": 1.8732948303222656, + "learning_rate": 1.9984813460825498e-05, + "loss": 1.4802, + "step": 6048 + }, + { + "epoch": 0.14108166888722387, + "grad_norm": 1.7507439851760864, + "learning_rate": 1.998479958585191e-05, + "loss": 1.6732, + "step": 6049 + }, + { + "epoch": 0.14110499202640178, + "grad_norm": 1.6961098909378052, + "learning_rate": 1.9984785704547692e-05, + "loss": 1.8398, + "step": 6050 + }, + { + "epoch": 0.14112831516557972, + "grad_norm": 1.7741752862930298, + "learning_rate": 1.9984771816912862e-05, + "loss": 1.5246, + "step": 6051 + }, + { + "epoch": 0.14115163830475763, + "grad_norm": 1.9861652851104736, + "learning_rate": 1.9984757922947423e-05, + "loss": 1.792, + "step": 6052 + }, + { + "epoch": 0.14117496144393554, + "grad_norm": 1.6446164846420288, + "learning_rate": 1.9984744022651385e-05, + "loss": 1.462, + "step": 6053 + }, + { + "epoch": 0.14119828458311345, + "grad_norm": 1.7127081155776978, + "learning_rate": 1.9984730116024758e-05, + "loss": 1.2577, + "step": 6054 + }, + { + "epoch": 0.1412216077222914, + "grad_norm": 2.0493862628936768, + "learning_rate": 1.998471620306755e-05, + "loss": 1.4209, + "step": 6055 + }, + { + "epoch": 0.1412449308614693, + "grad_norm": 2.262053966522217, + "learning_rate": 1.9984702283779765e-05, + "loss": 1.174, + "step": 6056 + }, + { + "epoch": 0.14126825400064721, + "grad_norm": 1.6032483577728271, + "learning_rate": 1.9984688358161423e-05, + "loss": 1.1465, + "step": 6057 + }, + { + "epoch": 0.14129157713982513, + "grad_norm": 2.6073451042175293, + "learning_rate": 1.9984674426212526e-05, + "loss": 1.5099, + "step": 6058 + }, + { + "epoch": 0.14131490027900306, + "grad_norm": 1.50624680519104, + "learning_rate": 1.9984660487933078e-05, + "loss": 1.6381, + "step": 6059 + }, + { + "epoch": 0.14133822341818097, + "grad_norm": 1.6334619522094727, + "learning_rate": 1.9984646543323095e-05, + "loss": 1.5765, + "step": 6060 + }, + { + "epoch": 0.14136154655735889, + "grad_norm": 1.7184330224990845, + "learning_rate": 1.9984632592382588e-05, + "loss": 1.4832, + "step": 6061 + }, + { + "epoch": 0.1413848696965368, + "grad_norm": 1.7488726377487183, + "learning_rate": 1.998461863511156e-05, + "loss": 1.4686, + "step": 6062 + }, + { + "epoch": 0.14140819283571474, + "grad_norm": 1.9074149131774902, + "learning_rate": 1.998460467151002e-05, + "loss": 1.3417, + "step": 6063 + }, + { + "epoch": 0.14143151597489265, + "grad_norm": 1.7359164953231812, + "learning_rate": 1.998459070157798e-05, + "loss": 1.3991, + "step": 6064 + }, + { + "epoch": 0.14145483911407056, + "grad_norm": 1.4685072898864746, + "learning_rate": 1.9984576725315447e-05, + "loss": 1.0244, + "step": 6065 + }, + { + "epoch": 0.14147816225324847, + "grad_norm": 1.3788610696792603, + "learning_rate": 1.998456274272243e-05, + "loss": 1.2246, + "step": 6066 + }, + { + "epoch": 0.1415014853924264, + "grad_norm": 1.7546474933624268, + "learning_rate": 1.998454875379894e-05, + "loss": 1.172, + "step": 6067 + }, + { + "epoch": 0.14152480853160432, + "grad_norm": 1.8506349325180054, + "learning_rate": 1.998453475854498e-05, + "loss": 1.3342, + "step": 6068 + }, + { + "epoch": 0.14154813167078223, + "grad_norm": 1.7376810312271118, + "learning_rate": 1.998452075696056e-05, + "loss": 1.3008, + "step": 6069 + }, + { + "epoch": 0.14157145480996014, + "grad_norm": 1.648751139640808, + "learning_rate": 1.9984506749045697e-05, + "loss": 1.4869, + "step": 6070 + }, + { + "epoch": 0.14159477794913805, + "grad_norm": 1.3988120555877686, + "learning_rate": 1.9984492734800397e-05, + "loss": 1.0069, + "step": 6071 + }, + { + "epoch": 0.141618101088316, + "grad_norm": 1.8989641666412354, + "learning_rate": 1.9984478714224664e-05, + "loss": 1.3959, + "step": 6072 + }, + { + "epoch": 0.1416414242274939, + "grad_norm": 1.6855353116989136, + "learning_rate": 1.9984464687318508e-05, + "loss": 1.4939, + "step": 6073 + }, + { + "epoch": 0.1416647473666718, + "grad_norm": 1.5465542078018188, + "learning_rate": 1.998445065408194e-05, + "loss": 1.5399, + "step": 6074 + }, + { + "epoch": 0.14168807050584972, + "grad_norm": 2.1934516429901123, + "learning_rate": 1.998443661451497e-05, + "loss": 1.7079, + "step": 6075 + }, + { + "epoch": 0.14171139364502766, + "grad_norm": 1.6067827939987183, + "learning_rate": 1.9984422568617603e-05, + "loss": 1.6389, + "step": 6076 + }, + { + "epoch": 0.14173471678420557, + "grad_norm": 1.6995450258255005, + "learning_rate": 1.9984408516389848e-05, + "loss": 1.5188, + "step": 6077 + }, + { + "epoch": 0.14175803992338348, + "grad_norm": 1.5899747610092163, + "learning_rate": 1.9984394457831718e-05, + "loss": 1.2731, + "step": 6078 + }, + { + "epoch": 0.1417813630625614, + "grad_norm": 1.8115757703781128, + "learning_rate": 1.9984380392943222e-05, + "loss": 1.1092, + "step": 6079 + }, + { + "epoch": 0.14180468620173933, + "grad_norm": 1.8454803228378296, + "learning_rate": 1.9984366321724365e-05, + "loss": 1.1613, + "step": 6080 + }, + { + "epoch": 0.14182800934091724, + "grad_norm": 2.673640251159668, + "learning_rate": 1.9984352244175157e-05, + "loss": 1.1975, + "step": 6081 + }, + { + "epoch": 0.14185133248009515, + "grad_norm": 1.578275203704834, + "learning_rate": 1.9984338160295607e-05, + "loss": 1.3263, + "step": 6082 + }, + { + "epoch": 0.14187465561927307, + "grad_norm": 1.7498990297317505, + "learning_rate": 1.9984324070085725e-05, + "loss": 1.2659, + "step": 6083 + }, + { + "epoch": 0.141897978758451, + "grad_norm": 1.5607050657272339, + "learning_rate": 1.998430997354552e-05, + "loss": 1.0995, + "step": 6084 + }, + { + "epoch": 0.14192130189762892, + "grad_norm": 1.9511826038360596, + "learning_rate": 1.9984295870675e-05, + "loss": 1.4303, + "step": 6085 + }, + { + "epoch": 0.14194462503680683, + "grad_norm": 1.731707215309143, + "learning_rate": 1.9984281761474172e-05, + "loss": 1.3738, + "step": 6086 + }, + { + "epoch": 0.14196794817598474, + "grad_norm": 2.0200207233428955, + "learning_rate": 1.998426764594305e-05, + "loss": 1.5839, + "step": 6087 + }, + { + "epoch": 0.14199127131516268, + "grad_norm": 1.776933193206787, + "learning_rate": 1.9984253524081642e-05, + "loss": 1.2857, + "step": 6088 + }, + { + "epoch": 0.1420145944543406, + "grad_norm": 1.8621747493743896, + "learning_rate": 1.9984239395889953e-05, + "loss": 1.2597, + "step": 6089 + }, + { + "epoch": 0.1420379175935185, + "grad_norm": 1.3612380027770996, + "learning_rate": 1.9984225261367994e-05, + "loss": 1.463, + "step": 6090 + }, + { + "epoch": 0.1420612407326964, + "grad_norm": 2.0647497177124023, + "learning_rate": 1.9984211120515774e-05, + "loss": 1.5595, + "step": 6091 + }, + { + "epoch": 0.14208456387187435, + "grad_norm": 1.9524799585342407, + "learning_rate": 1.99841969733333e-05, + "loss": 1.3274, + "step": 6092 + }, + { + "epoch": 0.14210788701105226, + "grad_norm": 1.677502155303955, + "learning_rate": 1.998418281982059e-05, + "loss": 1.3088, + "step": 6093 + }, + { + "epoch": 0.14213121015023017, + "grad_norm": 1.9468404054641724, + "learning_rate": 1.9984168659977637e-05, + "loss": 1.6402, + "step": 6094 + }, + { + "epoch": 0.14215453328940808, + "grad_norm": 1.9174586534500122, + "learning_rate": 1.9984154493804465e-05, + "loss": 1.4074, + "step": 6095 + }, + { + "epoch": 0.14217785642858602, + "grad_norm": 1.4662233591079712, + "learning_rate": 1.9984140321301074e-05, + "loss": 1.0374, + "step": 6096 + }, + { + "epoch": 0.14220117956776393, + "grad_norm": 1.74077308177948, + "learning_rate": 1.9984126142467477e-05, + "loss": 1.825, + "step": 6097 + }, + { + "epoch": 0.14222450270694184, + "grad_norm": 1.6266309022903442, + "learning_rate": 1.998411195730368e-05, + "loss": 1.9217, + "step": 6098 + }, + { + "epoch": 0.14224782584611975, + "grad_norm": 1.7686604261398315, + "learning_rate": 1.9984097765809697e-05, + "loss": 1.5284, + "step": 6099 + }, + { + "epoch": 0.14227114898529766, + "grad_norm": 1.6298646926879883, + "learning_rate": 1.9984083567985533e-05, + "loss": 1.5221, + "step": 6100 + }, + { + "epoch": 0.1422944721244756, + "grad_norm": 1.7376065254211426, + "learning_rate": 1.9984069363831197e-05, + "loss": 1.646, + "step": 6101 + }, + { + "epoch": 0.1423177952636535, + "grad_norm": 1.3586046695709229, + "learning_rate": 1.99840551533467e-05, + "loss": 1.169, + "step": 6102 + }, + { + "epoch": 0.14234111840283142, + "grad_norm": 1.4665659666061401, + "learning_rate": 1.9984040936532048e-05, + "loss": 1.5004, + "step": 6103 + }, + { + "epoch": 0.14236444154200933, + "grad_norm": 1.7270456552505493, + "learning_rate": 1.9984026713387254e-05, + "loss": 1.6951, + "step": 6104 + }, + { + "epoch": 0.14238776468118727, + "grad_norm": 1.6258809566497803, + "learning_rate": 1.9984012483912325e-05, + "loss": 1.5982, + "step": 6105 + }, + { + "epoch": 0.14241108782036518, + "grad_norm": 1.8847723007202148, + "learning_rate": 1.9983998248107268e-05, + "loss": 1.3552, + "step": 6106 + }, + { + "epoch": 0.1424344109595431, + "grad_norm": 2.151219606399536, + "learning_rate": 1.9983984005972094e-05, + "loss": 1.4544, + "step": 6107 + }, + { + "epoch": 0.142457734098721, + "grad_norm": 1.8810778856277466, + "learning_rate": 1.9983969757506815e-05, + "loss": 1.7113, + "step": 6108 + }, + { + "epoch": 0.14248105723789894, + "grad_norm": 1.7763164043426514, + "learning_rate": 1.9983955502711432e-05, + "loss": 1.6847, + "step": 6109 + }, + { + "epoch": 0.14250438037707686, + "grad_norm": 1.719817042350769, + "learning_rate": 1.9983941241585966e-05, + "loss": 1.5407, + "step": 6110 + }, + { + "epoch": 0.14252770351625477, + "grad_norm": 1.9127790927886963, + "learning_rate": 1.9983926974130415e-05, + "loss": 1.1192, + "step": 6111 + }, + { + "epoch": 0.14255102665543268, + "grad_norm": 1.6890794038772583, + "learning_rate": 1.998391270034479e-05, + "loss": 1.3354, + "step": 6112 + }, + { + "epoch": 0.14257434979461062, + "grad_norm": 1.6422415971755981, + "learning_rate": 1.9983898420229106e-05, + "loss": 1.5008, + "step": 6113 + }, + { + "epoch": 0.14259767293378853, + "grad_norm": 2.2658350467681885, + "learning_rate": 1.9983884133783365e-05, + "loss": 1.3011, + "step": 6114 + }, + { + "epoch": 0.14262099607296644, + "grad_norm": 2.12793231010437, + "learning_rate": 1.9983869841007584e-05, + "loss": 1.5791, + "step": 6115 + }, + { + "epoch": 0.14264431921214435, + "grad_norm": 1.543769121170044, + "learning_rate": 1.9983855541901763e-05, + "loss": 1.0595, + "step": 6116 + }, + { + "epoch": 0.1426676423513223, + "grad_norm": 1.6334683895111084, + "learning_rate": 1.998384123646592e-05, + "loss": 1.3127, + "step": 6117 + }, + { + "epoch": 0.1426909654905002, + "grad_norm": 1.9104158878326416, + "learning_rate": 1.9983826924700052e-05, + "loss": 1.4317, + "step": 6118 + }, + { + "epoch": 0.1427142886296781, + "grad_norm": 1.602925181388855, + "learning_rate": 1.9983812606604186e-05, + "loss": 1.2167, + "step": 6119 + }, + { + "epoch": 0.14273761176885602, + "grad_norm": 1.5853163003921509, + "learning_rate": 1.9983798282178314e-05, + "loss": 1.5809, + "step": 6120 + }, + { + "epoch": 0.14276093490803396, + "grad_norm": 1.4522337913513184, + "learning_rate": 1.998378395142245e-05, + "loss": 1.2491, + "step": 6121 + }, + { + "epoch": 0.14278425804721187, + "grad_norm": 1.691820502281189, + "learning_rate": 1.998376961433661e-05, + "loss": 1.3375, + "step": 6122 + }, + { + "epoch": 0.14280758118638978, + "grad_norm": 1.6772396564483643, + "learning_rate": 1.9983755270920795e-05, + "loss": 1.5535, + "step": 6123 + }, + { + "epoch": 0.1428309043255677, + "grad_norm": 1.589551329612732, + "learning_rate": 1.9983740921175018e-05, + "loss": 1.5387, + "step": 6124 + }, + { + "epoch": 0.14285422746474563, + "grad_norm": 1.7393651008605957, + "learning_rate": 1.9983726565099285e-05, + "loss": 1.3167, + "step": 6125 + }, + { + "epoch": 0.14287755060392354, + "grad_norm": 2.027698040008545, + "learning_rate": 1.998371220269361e-05, + "loss": 1.8813, + "step": 6126 + }, + { + "epoch": 0.14290087374310145, + "grad_norm": 1.567395567893982, + "learning_rate": 1.9983697833958e-05, + "loss": 1.3043, + "step": 6127 + }, + { + "epoch": 0.14292419688227936, + "grad_norm": 1.5137680768966675, + "learning_rate": 1.998368345889246e-05, + "loss": 1.5617, + "step": 6128 + }, + { + "epoch": 0.14294752002145727, + "grad_norm": 2.1351406574249268, + "learning_rate": 1.9983669077497004e-05, + "loss": 1.6296, + "step": 6129 + }, + { + "epoch": 0.1429708431606352, + "grad_norm": 1.9155551195144653, + "learning_rate": 1.9983654689771642e-05, + "loss": 1.4952, + "step": 6130 + }, + { + "epoch": 0.14299416629981312, + "grad_norm": 2.2925221920013428, + "learning_rate": 1.998364029571638e-05, + "loss": 1.5835, + "step": 6131 + }, + { + "epoch": 0.14301748943899104, + "grad_norm": 1.660978078842163, + "learning_rate": 1.9983625895331228e-05, + "loss": 1.6656, + "step": 6132 + }, + { + "epoch": 0.14304081257816895, + "grad_norm": 1.8334819078445435, + "learning_rate": 1.9983611488616193e-05, + "loss": 1.571, + "step": 6133 + }, + { + "epoch": 0.14306413571734689, + "grad_norm": 1.664991855621338, + "learning_rate": 1.9983597075571285e-05, + "loss": 1.2742, + "step": 6134 + }, + { + "epoch": 0.1430874588565248, + "grad_norm": 1.9092906713485718, + "learning_rate": 1.998358265619652e-05, + "loss": 1.7917, + "step": 6135 + }, + { + "epoch": 0.1431107819957027, + "grad_norm": 1.7395601272583008, + "learning_rate": 1.9983568230491897e-05, + "loss": 1.2414, + "step": 6136 + }, + { + "epoch": 0.14313410513488062, + "grad_norm": 1.9282655715942383, + "learning_rate": 1.9983553798457428e-05, + "loss": 1.2613, + "step": 6137 + }, + { + "epoch": 0.14315742827405856, + "grad_norm": 1.9499088525772095, + "learning_rate": 1.9983539360093126e-05, + "loss": 1.2537, + "step": 6138 + }, + { + "epoch": 0.14318075141323647, + "grad_norm": 1.868593454360962, + "learning_rate": 1.9983524915399e-05, + "loss": 1.3487, + "step": 6139 + }, + { + "epoch": 0.14320407455241438, + "grad_norm": 2.008288621902466, + "learning_rate": 1.9983510464375056e-05, + "loss": 1.3357, + "step": 6140 + }, + { + "epoch": 0.1432273976915923, + "grad_norm": 1.9047181606292725, + "learning_rate": 1.9983496007021305e-05, + "loss": 1.1795, + "step": 6141 + }, + { + "epoch": 0.14325072083077023, + "grad_norm": 1.7448071241378784, + "learning_rate": 1.998348154333775e-05, + "loss": 1.117, + "step": 6142 + }, + { + "epoch": 0.14327404396994814, + "grad_norm": 1.7927271127700806, + "learning_rate": 1.9983467073324412e-05, + "loss": 1.4406, + "step": 6143 + }, + { + "epoch": 0.14329736710912605, + "grad_norm": 2.0013065338134766, + "learning_rate": 1.998345259698129e-05, + "loss": 1.7044, + "step": 6144 + }, + { + "epoch": 0.14332069024830396, + "grad_norm": 1.900384545326233, + "learning_rate": 1.9983438114308404e-05, + "loss": 1.5505, + "step": 6145 + }, + { + "epoch": 0.1433440133874819, + "grad_norm": 1.8173954486846924, + "learning_rate": 1.9983423625305748e-05, + "loss": 1.3698, + "step": 6146 + }, + { + "epoch": 0.1433673365266598, + "grad_norm": 2.0743818283081055, + "learning_rate": 1.998340912997334e-05, + "loss": 1.4185, + "step": 6147 + }, + { + "epoch": 0.14339065966583772, + "grad_norm": 1.7615689039230347, + "learning_rate": 1.998339462831119e-05, + "loss": 1.5295, + "step": 6148 + }, + { + "epoch": 0.14341398280501563, + "grad_norm": 1.543298602104187, + "learning_rate": 1.9983380120319307e-05, + "loss": 1.3294, + "step": 6149 + }, + { + "epoch": 0.14343730594419357, + "grad_norm": 1.8405792713165283, + "learning_rate": 1.99833656059977e-05, + "loss": 1.3359, + "step": 6150 + }, + { + "epoch": 0.14346062908337148, + "grad_norm": 1.8143842220306396, + "learning_rate": 1.9983351085346374e-05, + "loss": 1.2314, + "step": 6151 + }, + { + "epoch": 0.1434839522225494, + "grad_norm": 1.2752970457077026, + "learning_rate": 1.9983336558365343e-05, + "loss": 1.3063, + "step": 6152 + }, + { + "epoch": 0.1435072753617273, + "grad_norm": 1.308411955833435, + "learning_rate": 1.9983322025054614e-05, + "loss": 1.1683, + "step": 6153 + }, + { + "epoch": 0.14353059850090524, + "grad_norm": 1.8233157396316528, + "learning_rate": 1.9983307485414197e-05, + "loss": 1.3901, + "step": 6154 + }, + { + "epoch": 0.14355392164008315, + "grad_norm": 2.0783843994140625, + "learning_rate": 1.99832929394441e-05, + "loss": 1.4295, + "step": 6155 + }, + { + "epoch": 0.14357724477926107, + "grad_norm": 1.5064458847045898, + "learning_rate": 1.9983278387144332e-05, + "loss": 1.0984, + "step": 6156 + }, + { + "epoch": 0.14360056791843898, + "grad_norm": 1.9447945356369019, + "learning_rate": 1.9983263828514907e-05, + "loss": 1.6027, + "step": 6157 + }, + { + "epoch": 0.1436238910576169, + "grad_norm": 1.8615599870681763, + "learning_rate": 1.998324926355583e-05, + "loss": 1.371, + "step": 6158 + }, + { + "epoch": 0.14364721419679483, + "grad_norm": 2.176682949066162, + "learning_rate": 1.9983234692267107e-05, + "loss": 1.5532, + "step": 6159 + }, + { + "epoch": 0.14367053733597274, + "grad_norm": 1.7804512977600098, + "learning_rate": 1.9983220114648756e-05, + "loss": 1.341, + "step": 6160 + }, + { + "epoch": 0.14369386047515065, + "grad_norm": 1.544712781906128, + "learning_rate": 1.998320553070078e-05, + "loss": 1.4943, + "step": 6161 + }, + { + "epoch": 0.14371718361432856, + "grad_norm": 1.3159451484680176, + "learning_rate": 1.9983190940423186e-05, + "loss": 1.3127, + "step": 6162 + }, + { + "epoch": 0.1437405067535065, + "grad_norm": 2.388256311416626, + "learning_rate": 1.9983176343815992e-05, + "loss": 1.6049, + "step": 6163 + }, + { + "epoch": 0.1437638298926844, + "grad_norm": 1.5700067281723022, + "learning_rate": 1.99831617408792e-05, + "loss": 1.1873, + "step": 6164 + }, + { + "epoch": 0.14378715303186232, + "grad_norm": 1.8588881492614746, + "learning_rate": 1.998314713161282e-05, + "loss": 1.6263, + "step": 6165 + }, + { + "epoch": 0.14381047617104023, + "grad_norm": 2.047004222869873, + "learning_rate": 1.9983132516016862e-05, + "loss": 1.7287, + "step": 6166 + }, + { + "epoch": 0.14383379931021817, + "grad_norm": 1.6787564754486084, + "learning_rate": 1.9983117894091338e-05, + "loss": 1.7092, + "step": 6167 + }, + { + "epoch": 0.14385712244939608, + "grad_norm": 1.9672847986221313, + "learning_rate": 1.9983103265836256e-05, + "loss": 1.188, + "step": 6168 + }, + { + "epoch": 0.143880445588574, + "grad_norm": 2.422473907470703, + "learning_rate": 1.9983088631251625e-05, + "loss": 1.3494, + "step": 6169 + }, + { + "epoch": 0.1439037687277519, + "grad_norm": 1.6457571983337402, + "learning_rate": 1.998307399033745e-05, + "loss": 1.2711, + "step": 6170 + }, + { + "epoch": 0.14392709186692984, + "grad_norm": 1.6237685680389404, + "learning_rate": 1.9983059343093748e-05, + "loss": 1.3457, + "step": 6171 + }, + { + "epoch": 0.14395041500610775, + "grad_norm": 2.0398764610290527, + "learning_rate": 1.9983044689520522e-05, + "loss": 1.695, + "step": 6172 + }, + { + "epoch": 0.14397373814528566, + "grad_norm": 2.4313852787017822, + "learning_rate": 1.998303002961778e-05, + "loss": 1.2997, + "step": 6173 + }, + { + "epoch": 0.14399706128446357, + "grad_norm": 1.6472761631011963, + "learning_rate": 1.998301536338554e-05, + "loss": 1.5706, + "step": 6174 + }, + { + "epoch": 0.1440203844236415, + "grad_norm": 2.012556552886963, + "learning_rate": 1.9983000690823804e-05, + "loss": 1.2949, + "step": 6175 + }, + { + "epoch": 0.14404370756281942, + "grad_norm": 1.4228172302246094, + "learning_rate": 1.9982986011932587e-05, + "loss": 1.6881, + "step": 6176 + }, + { + "epoch": 0.14406703070199733, + "grad_norm": 1.5519723892211914, + "learning_rate": 1.998297132671189e-05, + "loss": 1.2734, + "step": 6177 + }, + { + "epoch": 0.14409035384117524, + "grad_norm": 1.9358651638031006, + "learning_rate": 1.998295663516173e-05, + "loss": 1.3786, + "step": 6178 + }, + { + "epoch": 0.14411367698035318, + "grad_norm": 1.5061684846878052, + "learning_rate": 1.9982941937282112e-05, + "loss": 1.0558, + "step": 6179 + }, + { + "epoch": 0.1441370001195311, + "grad_norm": 1.4675248861312866, + "learning_rate": 1.9982927233073047e-05, + "loss": 1.1563, + "step": 6180 + }, + { + "epoch": 0.144160323258709, + "grad_norm": 1.8398594856262207, + "learning_rate": 1.9982912522534545e-05, + "loss": 1.1871, + "step": 6181 + }, + { + "epoch": 0.14418364639788692, + "grad_norm": 1.4180809259414673, + "learning_rate": 1.9982897805666616e-05, + "loss": 1.0769, + "step": 6182 + }, + { + "epoch": 0.14420696953706483, + "grad_norm": 1.7799228429794312, + "learning_rate": 1.9982883082469264e-05, + "loss": 1.4718, + "step": 6183 + }, + { + "epoch": 0.14423029267624277, + "grad_norm": 2.484895944595337, + "learning_rate": 1.9982868352942502e-05, + "loss": 1.7347, + "step": 6184 + }, + { + "epoch": 0.14425361581542068, + "grad_norm": 1.8689348697662354, + "learning_rate": 1.998285361708634e-05, + "loss": 1.4675, + "step": 6185 + }, + { + "epoch": 0.1442769389545986, + "grad_norm": 2.0851986408233643, + "learning_rate": 1.9982838874900787e-05, + "loss": 1.6437, + "step": 6186 + }, + { + "epoch": 0.1443002620937765, + "grad_norm": 1.6126699447631836, + "learning_rate": 1.9982824126385854e-05, + "loss": 1.1683, + "step": 6187 + }, + { + "epoch": 0.14432358523295444, + "grad_norm": 1.7898924350738525, + "learning_rate": 1.998280937154155e-05, + "loss": 0.9045, + "step": 6188 + }, + { + "epoch": 0.14434690837213235, + "grad_norm": 1.9518897533416748, + "learning_rate": 1.9982794610367877e-05, + "loss": 1.5413, + "step": 6189 + }, + { + "epoch": 0.14437023151131026, + "grad_norm": 2.0393829345703125, + "learning_rate": 1.998277984286485e-05, + "loss": 1.7706, + "step": 6190 + }, + { + "epoch": 0.14439355465048817, + "grad_norm": 2.091890573501587, + "learning_rate": 1.9982765069032483e-05, + "loss": 1.3571, + "step": 6191 + }, + { + "epoch": 0.1444168777896661, + "grad_norm": 1.767745018005371, + "learning_rate": 1.998275028887078e-05, + "loss": 1.455, + "step": 6192 + }, + { + "epoch": 0.14444020092884402, + "grad_norm": 1.4978595972061157, + "learning_rate": 1.998273550237975e-05, + "loss": 1.6303, + "step": 6193 + }, + { + "epoch": 0.14446352406802193, + "grad_norm": 1.638681173324585, + "learning_rate": 1.9982720709559404e-05, + "loss": 1.3891, + "step": 6194 + }, + { + "epoch": 0.14448684720719984, + "grad_norm": 2.0340912342071533, + "learning_rate": 1.9982705910409753e-05, + "loss": 1.3799, + "step": 6195 + }, + { + "epoch": 0.14451017034637778, + "grad_norm": 1.9758403301239014, + "learning_rate": 1.99826911049308e-05, + "loss": 1.5578, + "step": 6196 + }, + { + "epoch": 0.1445334934855557, + "grad_norm": 1.4881457090377808, + "learning_rate": 1.998267629312256e-05, + "loss": 1.2504, + "step": 6197 + }, + { + "epoch": 0.1445568166247336, + "grad_norm": 1.694440245628357, + "learning_rate": 1.9982661474985042e-05, + "loss": 1.6447, + "step": 6198 + }, + { + "epoch": 0.1445801397639115, + "grad_norm": 1.6758443117141724, + "learning_rate": 1.9982646650518257e-05, + "loss": 1.4907, + "step": 6199 + }, + { + "epoch": 0.14460346290308945, + "grad_norm": 1.521031379699707, + "learning_rate": 1.998263181972221e-05, + "loss": 0.9921, + "step": 6200 + }, + { + "epoch": 0.14462678604226736, + "grad_norm": 1.5668867826461792, + "learning_rate": 1.9982616982596912e-05, + "loss": 1.3527, + "step": 6201 + }, + { + "epoch": 0.14465010918144527, + "grad_norm": 1.7445333003997803, + "learning_rate": 1.9982602139142373e-05, + "loss": 1.3936, + "step": 6202 + }, + { + "epoch": 0.14467343232062319, + "grad_norm": 1.7091766595840454, + "learning_rate": 1.9982587289358603e-05, + "loss": 1.3193, + "step": 6203 + }, + { + "epoch": 0.14469675545980112, + "grad_norm": 1.5586146116256714, + "learning_rate": 1.9982572433245607e-05, + "loss": 1.4521, + "step": 6204 + }, + { + "epoch": 0.14472007859897904, + "grad_norm": 1.849519968032837, + "learning_rate": 1.9982557570803404e-05, + "loss": 1.8506, + "step": 6205 + }, + { + "epoch": 0.14474340173815695, + "grad_norm": 1.8658661842346191, + "learning_rate": 1.998254270203199e-05, + "loss": 1.4541, + "step": 6206 + }, + { + "epoch": 0.14476672487733486, + "grad_norm": 1.9396770000457764, + "learning_rate": 1.998252782693139e-05, + "loss": 1.7934, + "step": 6207 + }, + { + "epoch": 0.1447900480165128, + "grad_norm": 2.2900960445404053, + "learning_rate": 1.9982512945501598e-05, + "loss": 1.5647, + "step": 6208 + }, + { + "epoch": 0.1448133711556907, + "grad_norm": 1.5928584337234497, + "learning_rate": 1.9982498057742634e-05, + "loss": 1.1862, + "step": 6209 + }, + { + "epoch": 0.14483669429486862, + "grad_norm": 1.608139991760254, + "learning_rate": 1.9982483163654505e-05, + "loss": 1.3259, + "step": 6210 + }, + { + "epoch": 0.14486001743404653, + "grad_norm": 1.733022689819336, + "learning_rate": 1.998246826323722e-05, + "loss": 1.1973, + "step": 6211 + }, + { + "epoch": 0.14488334057322444, + "grad_norm": 1.90958833694458, + "learning_rate": 1.9982453356490782e-05, + "loss": 1.7954, + "step": 6212 + }, + { + "epoch": 0.14490666371240238, + "grad_norm": 1.583602786064148, + "learning_rate": 1.9982438443415213e-05, + "loss": 1.3206, + "step": 6213 + }, + { + "epoch": 0.1449299868515803, + "grad_norm": 1.6032410860061646, + "learning_rate": 1.9982423524010516e-05, + "loss": 1.3206, + "step": 6214 + }, + { + "epoch": 0.1449533099907582, + "grad_norm": 1.9835193157196045, + "learning_rate": 1.9982408598276696e-05, + "loss": 1.3731, + "step": 6215 + }, + { + "epoch": 0.1449766331299361, + "grad_norm": 1.67859947681427, + "learning_rate": 1.998239366621377e-05, + "loss": 1.5772, + "step": 6216 + }, + { + "epoch": 0.14499995626911405, + "grad_norm": 1.6912411451339722, + "learning_rate": 1.9982378727821744e-05, + "loss": 1.5823, + "step": 6217 + }, + { + "epoch": 0.14502327940829196, + "grad_norm": 1.6941653490066528, + "learning_rate": 1.9982363783100627e-05, + "loss": 1.4471, + "step": 6218 + }, + { + "epoch": 0.14504660254746987, + "grad_norm": 1.6394405364990234, + "learning_rate": 1.998234883205043e-05, + "loss": 1.4236, + "step": 6219 + }, + { + "epoch": 0.14506992568664778, + "grad_norm": 1.7556601762771606, + "learning_rate": 1.9982333874671162e-05, + "loss": 1.3592, + "step": 6220 + }, + { + "epoch": 0.14509324882582572, + "grad_norm": 2.3149731159210205, + "learning_rate": 1.9982318910962833e-05, + "loss": 1.1884, + "step": 6221 + }, + { + "epoch": 0.14511657196500363, + "grad_norm": 1.9247477054595947, + "learning_rate": 1.998230394092545e-05, + "loss": 1.2506, + "step": 6222 + }, + { + "epoch": 0.14513989510418154, + "grad_norm": 2.2213850021362305, + "learning_rate": 1.9982288964559023e-05, + "loss": 1.32, + "step": 6223 + }, + { + "epoch": 0.14516321824335945, + "grad_norm": 1.4995282888412476, + "learning_rate": 1.9982273981863563e-05, + "loss": 1.3913, + "step": 6224 + }, + { + "epoch": 0.1451865413825374, + "grad_norm": 1.541117787361145, + "learning_rate": 1.9982258992839083e-05, + "loss": 1.2822, + "step": 6225 + }, + { + "epoch": 0.1452098645217153, + "grad_norm": 1.7992687225341797, + "learning_rate": 1.9982243997485586e-05, + "loss": 1.3662, + "step": 6226 + }, + { + "epoch": 0.14523318766089321, + "grad_norm": 1.5719181299209595, + "learning_rate": 1.9982228995803083e-05, + "loss": 1.4968, + "step": 6227 + }, + { + "epoch": 0.14525651080007113, + "grad_norm": 1.5120388269424438, + "learning_rate": 1.9982213987791587e-05, + "loss": 1.2898, + "step": 6228 + }, + { + "epoch": 0.14527983393924906, + "grad_norm": 1.6571401357650757, + "learning_rate": 1.9982198973451105e-05, + "loss": 1.2515, + "step": 6229 + }, + { + "epoch": 0.14530315707842698, + "grad_norm": 1.3121395111083984, + "learning_rate": 1.9982183952781646e-05, + "loss": 1.1124, + "step": 6230 + }, + { + "epoch": 0.1453264802176049, + "grad_norm": 2.017638921737671, + "learning_rate": 1.9982168925783222e-05, + "loss": 1.7416, + "step": 6231 + }, + { + "epoch": 0.1453498033567828, + "grad_norm": 1.780114769935608, + "learning_rate": 1.998215389245584e-05, + "loss": 1.2815, + "step": 6232 + }, + { + "epoch": 0.14537312649596074, + "grad_norm": 2.1087558269500732, + "learning_rate": 1.998213885279951e-05, + "loss": 1.4622, + "step": 6233 + }, + { + "epoch": 0.14539644963513865, + "grad_norm": 1.7390282154083252, + "learning_rate": 1.998212380681424e-05, + "loss": 1.1713, + "step": 6234 + }, + { + "epoch": 0.14541977277431656, + "grad_norm": 2.0155489444732666, + "learning_rate": 1.9982108754500044e-05, + "loss": 1.5964, + "step": 6235 + }, + { + "epoch": 0.14544309591349447, + "grad_norm": 1.6597096920013428, + "learning_rate": 1.9982093695856926e-05, + "loss": 1.138, + "step": 6236 + }, + { + "epoch": 0.1454664190526724, + "grad_norm": 1.7902147769927979, + "learning_rate": 1.9982078630884904e-05, + "loss": 1.532, + "step": 6237 + }, + { + "epoch": 0.14548974219185032, + "grad_norm": 1.6770360469818115, + "learning_rate": 1.9982063559583977e-05, + "loss": 1.6072, + "step": 6238 + }, + { + "epoch": 0.14551306533102823, + "grad_norm": 1.7789121866226196, + "learning_rate": 1.9982048481954162e-05, + "loss": 1.2907, + "step": 6239 + }, + { + "epoch": 0.14553638847020614, + "grad_norm": 1.768479585647583, + "learning_rate": 1.9982033397995466e-05, + "loss": 1.2593, + "step": 6240 + }, + { + "epoch": 0.14555971160938405, + "grad_norm": 1.6811294555664062, + "learning_rate": 1.99820183077079e-05, + "loss": 1.6821, + "step": 6241 + }, + { + "epoch": 0.145583034748562, + "grad_norm": 1.980768084526062, + "learning_rate": 1.9982003211091467e-05, + "loss": 1.5062, + "step": 6242 + }, + { + "epoch": 0.1456063578877399, + "grad_norm": 1.8397732973098755, + "learning_rate": 1.9981988108146185e-05, + "loss": 1.515, + "step": 6243 + }, + { + "epoch": 0.1456296810269178, + "grad_norm": 2.2778093814849854, + "learning_rate": 1.9981972998872063e-05, + "loss": 1.5415, + "step": 6244 + }, + { + "epoch": 0.14565300416609572, + "grad_norm": 1.9170689582824707, + "learning_rate": 1.9981957883269107e-05, + "loss": 1.4491, + "step": 6245 + }, + { + "epoch": 0.14567632730527366, + "grad_norm": 1.9864757061004639, + "learning_rate": 1.9981942761337327e-05, + "loss": 1.3964, + "step": 6246 + }, + { + "epoch": 0.14569965044445157, + "grad_norm": 2.3734593391418457, + "learning_rate": 1.998192763307673e-05, + "loss": 1.5044, + "step": 6247 + }, + { + "epoch": 0.14572297358362948, + "grad_norm": 1.7962688207626343, + "learning_rate": 1.9981912498487335e-05, + "loss": 1.3108, + "step": 6248 + }, + { + "epoch": 0.1457462967228074, + "grad_norm": 1.8185255527496338, + "learning_rate": 1.9981897357569142e-05, + "loss": 1.6581, + "step": 6249 + }, + { + "epoch": 0.14576961986198533, + "grad_norm": 1.6769304275512695, + "learning_rate": 1.9981882210322164e-05, + "loss": 1.4067, + "step": 6250 + }, + { + "epoch": 0.14579294300116324, + "grad_norm": 1.8980764150619507, + "learning_rate": 1.9981867056746412e-05, + "loss": 1.4054, + "step": 6251 + }, + { + "epoch": 0.14581626614034116, + "grad_norm": 1.206648588180542, + "learning_rate": 1.998185189684189e-05, + "loss": 0.9875, + "step": 6252 + }, + { + "epoch": 0.14583958927951907, + "grad_norm": 2.0364739894866943, + "learning_rate": 1.998183673060862e-05, + "loss": 1.3392, + "step": 6253 + }, + { + "epoch": 0.145862912418697, + "grad_norm": 1.6145824193954468, + "learning_rate": 1.9981821558046593e-05, + "loss": 1.3968, + "step": 6254 + }, + { + "epoch": 0.14588623555787492, + "grad_norm": 1.7379778623580933, + "learning_rate": 1.9981806379155836e-05, + "loss": 1.1958, + "step": 6255 + }, + { + "epoch": 0.14590955869705283, + "grad_norm": 1.6130266189575195, + "learning_rate": 1.998179119393635e-05, + "loss": 1.1073, + "step": 6256 + }, + { + "epoch": 0.14593288183623074, + "grad_norm": 1.541894793510437, + "learning_rate": 1.998177600238815e-05, + "loss": 1.268, + "step": 6257 + }, + { + "epoch": 0.14595620497540868, + "grad_norm": 1.765008568763733, + "learning_rate": 1.9981760804511236e-05, + "loss": 1.3751, + "step": 6258 + }, + { + "epoch": 0.1459795281145866, + "grad_norm": 1.8454070091247559, + "learning_rate": 1.9981745600305628e-05, + "loss": 1.4114, + "step": 6259 + }, + { + "epoch": 0.1460028512537645, + "grad_norm": 1.7741239070892334, + "learning_rate": 1.998173038977133e-05, + "loss": 1.1736, + "step": 6260 + }, + { + "epoch": 0.1460261743929424, + "grad_norm": 1.5152240991592407, + "learning_rate": 1.998171517290835e-05, + "loss": 1.5241, + "step": 6261 + }, + { + "epoch": 0.14604949753212035, + "grad_norm": 1.65603768825531, + "learning_rate": 1.99816999497167e-05, + "loss": 1.511, + "step": 6262 + }, + { + "epoch": 0.14607282067129826, + "grad_norm": 1.8949073553085327, + "learning_rate": 1.9981684720196392e-05, + "loss": 1.8411, + "step": 6263 + }, + { + "epoch": 0.14609614381047617, + "grad_norm": 1.719747543334961, + "learning_rate": 1.9981669484347436e-05, + "loss": 1.4842, + "step": 6264 + }, + { + "epoch": 0.14611946694965408, + "grad_norm": 1.5213533639907837, + "learning_rate": 1.9981654242169836e-05, + "loss": 1.2176, + "step": 6265 + }, + { + "epoch": 0.14614279008883202, + "grad_norm": 1.4638770818710327, + "learning_rate": 1.998163899366361e-05, + "loss": 1.2112, + "step": 6266 + }, + { + "epoch": 0.14616611322800993, + "grad_norm": 2.062044620513916, + "learning_rate": 1.998162373882876e-05, + "loss": 1.1089, + "step": 6267 + }, + { + "epoch": 0.14618943636718784, + "grad_norm": 1.5639095306396484, + "learning_rate": 1.9981608477665298e-05, + "loss": 1.3685, + "step": 6268 + }, + { + "epoch": 0.14621275950636575, + "grad_norm": 1.5224006175994873, + "learning_rate": 1.9981593210173234e-05, + "loss": 1.2416, + "step": 6269 + }, + { + "epoch": 0.14623608264554366, + "grad_norm": 1.6118040084838867, + "learning_rate": 1.9981577936352577e-05, + "loss": 1.451, + "step": 6270 + }, + { + "epoch": 0.1462594057847216, + "grad_norm": 1.8948736190795898, + "learning_rate": 1.998156265620334e-05, + "loss": 1.3398, + "step": 6271 + }, + { + "epoch": 0.1462827289238995, + "grad_norm": 1.4965900182724, + "learning_rate": 1.9981547369725528e-05, + "loss": 1.4583, + "step": 6272 + }, + { + "epoch": 0.14630605206307742, + "grad_norm": 2.3831355571746826, + "learning_rate": 1.9981532076919156e-05, + "loss": 1.5213, + "step": 6273 + }, + { + "epoch": 0.14632937520225534, + "grad_norm": 2.1660211086273193, + "learning_rate": 1.9981516777784228e-05, + "loss": 1.6619, + "step": 6274 + }, + { + "epoch": 0.14635269834143327, + "grad_norm": 1.609186053276062, + "learning_rate": 1.9981501472320758e-05, + "loss": 1.4018, + "step": 6275 + }, + { + "epoch": 0.14637602148061118, + "grad_norm": 1.8440485000610352, + "learning_rate": 1.9981486160528752e-05, + "loss": 1.3104, + "step": 6276 + }, + { + "epoch": 0.1463993446197891, + "grad_norm": 1.6569260358810425, + "learning_rate": 1.9981470842408225e-05, + "loss": 1.2758, + "step": 6277 + }, + { + "epoch": 0.146422667758967, + "grad_norm": 1.7688606977462769, + "learning_rate": 1.9981455517959182e-05, + "loss": 1.1114, + "step": 6278 + }, + { + "epoch": 0.14644599089814495, + "grad_norm": 3.1900360584259033, + "learning_rate": 1.998144018718163e-05, + "loss": 1.4462, + "step": 6279 + }, + { + "epoch": 0.14646931403732286, + "grad_norm": 1.974710464477539, + "learning_rate": 1.998142485007559e-05, + "loss": 1.5234, + "step": 6280 + }, + { + "epoch": 0.14649263717650077, + "grad_norm": 1.5688261985778809, + "learning_rate": 1.998140950664106e-05, + "loss": 1.1338, + "step": 6281 + }, + { + "epoch": 0.14651596031567868, + "grad_norm": 1.4419082403182983, + "learning_rate": 1.9981394156878052e-05, + "loss": 1.2841, + "step": 6282 + }, + { + "epoch": 0.14653928345485662, + "grad_norm": 1.6446573734283447, + "learning_rate": 1.9981378800786583e-05, + "loss": 1.1957, + "step": 6283 + }, + { + "epoch": 0.14656260659403453, + "grad_norm": 2.722903251647949, + "learning_rate": 1.9981363438366656e-05, + "loss": 1.5655, + "step": 6284 + }, + { + "epoch": 0.14658592973321244, + "grad_norm": 1.8135250806808472, + "learning_rate": 1.9981348069618284e-05, + "loss": 1.3129, + "step": 6285 + }, + { + "epoch": 0.14660925287239035, + "grad_norm": 1.892442226409912, + "learning_rate": 1.9981332694541472e-05, + "loss": 1.8082, + "step": 6286 + }, + { + "epoch": 0.1466325760115683, + "grad_norm": 1.9138824939727783, + "learning_rate": 1.9981317313136233e-05, + "loss": 1.1554, + "step": 6287 + }, + { + "epoch": 0.1466558991507462, + "grad_norm": 2.0759596824645996, + "learning_rate": 1.9981301925402584e-05, + "loss": 1.7002, + "step": 6288 + }, + { + "epoch": 0.1466792222899241, + "grad_norm": 1.4600279331207275, + "learning_rate": 1.998128653134052e-05, + "loss": 1.2624, + "step": 6289 + }, + { + "epoch": 0.14670254542910202, + "grad_norm": 2.0629379749298096, + "learning_rate": 1.9981271130950062e-05, + "loss": 1.297, + "step": 6290 + }, + { + "epoch": 0.14672586856827996, + "grad_norm": 1.4304953813552856, + "learning_rate": 1.9981255724231213e-05, + "loss": 1.4381, + "step": 6291 + }, + { + "epoch": 0.14674919170745787, + "grad_norm": 1.431989312171936, + "learning_rate": 1.9981240311183988e-05, + "loss": 1.1335, + "step": 6292 + }, + { + "epoch": 0.14677251484663578, + "grad_norm": 1.8915808200836182, + "learning_rate": 1.9981224891808393e-05, + "loss": 1.6591, + "step": 6293 + }, + { + "epoch": 0.1467958379858137, + "grad_norm": 1.753125548362732, + "learning_rate": 1.9981209466104442e-05, + "loss": 1.4789, + "step": 6294 + }, + { + "epoch": 0.14681916112499163, + "grad_norm": 1.6588149070739746, + "learning_rate": 1.998119403407214e-05, + "loss": 1.2405, + "step": 6295 + }, + { + "epoch": 0.14684248426416954, + "grad_norm": 1.7429893016815186, + "learning_rate": 1.9981178595711496e-05, + "loss": 1.4793, + "step": 6296 + }, + { + "epoch": 0.14686580740334745, + "grad_norm": 1.6872663497924805, + "learning_rate": 1.998116315102253e-05, + "loss": 1.2561, + "step": 6297 + }, + { + "epoch": 0.14688913054252536, + "grad_norm": 1.5151638984680176, + "learning_rate": 1.998114770000524e-05, + "loss": 1.2937, + "step": 6298 + }, + { + "epoch": 0.14691245368170328, + "grad_norm": 1.7517789602279663, + "learning_rate": 1.998113224265964e-05, + "loss": 1.6803, + "step": 6299 + }, + { + "epoch": 0.14693577682088121, + "grad_norm": 1.7625466585159302, + "learning_rate": 1.998111677898574e-05, + "loss": 1.3151, + "step": 6300 + }, + { + "epoch": 0.14695909996005913, + "grad_norm": 1.6429600715637207, + "learning_rate": 1.9981101308983552e-05, + "loss": 1.2651, + "step": 6301 + }, + { + "epoch": 0.14698242309923704, + "grad_norm": 1.4725855588912964, + "learning_rate": 1.9981085832653086e-05, + "loss": 1.2581, + "step": 6302 + }, + { + "epoch": 0.14700574623841495, + "grad_norm": 1.815018653869629, + "learning_rate": 1.998107034999434e-05, + "loss": 1.6052, + "step": 6303 + }, + { + "epoch": 0.14702906937759289, + "grad_norm": 1.798174500465393, + "learning_rate": 1.9981054861007343e-05, + "loss": 1.4409, + "step": 6304 + }, + { + "epoch": 0.1470523925167708, + "grad_norm": 1.5628280639648438, + "learning_rate": 1.998103936569209e-05, + "loss": 1.23, + "step": 6305 + }, + { + "epoch": 0.1470757156559487, + "grad_norm": 1.5914443731307983, + "learning_rate": 1.99810238640486e-05, + "loss": 1.3851, + "step": 6306 + }, + { + "epoch": 0.14709903879512662, + "grad_norm": 1.769966959953308, + "learning_rate": 1.9981008356076875e-05, + "loss": 1.4637, + "step": 6307 + }, + { + "epoch": 0.14712236193430456, + "grad_norm": 1.8808379173278809, + "learning_rate": 1.998099284177693e-05, + "loss": 1.2935, + "step": 6308 + }, + { + "epoch": 0.14714568507348247, + "grad_norm": 1.9115874767303467, + "learning_rate": 1.9980977321148772e-05, + "loss": 1.2437, + "step": 6309 + }, + { + "epoch": 0.14716900821266038, + "grad_norm": 1.8019105195999146, + "learning_rate": 1.9980961794192417e-05, + "loss": 1.2596, + "step": 6310 + }, + { + "epoch": 0.1471923313518383, + "grad_norm": 1.2985539436340332, + "learning_rate": 1.998094626090787e-05, + "loss": 0.995, + "step": 6311 + }, + { + "epoch": 0.14721565449101623, + "grad_norm": 1.730983018875122, + "learning_rate": 1.9980930721295134e-05, + "loss": 1.2336, + "step": 6312 + }, + { + "epoch": 0.14723897763019414, + "grad_norm": 1.7046700716018677, + "learning_rate": 1.9980915175354233e-05, + "loss": 1.3821, + "step": 6313 + }, + { + "epoch": 0.14726230076937205, + "grad_norm": 1.6626967191696167, + "learning_rate": 1.9980899623085165e-05, + "loss": 1.3474, + "step": 6314 + }, + { + "epoch": 0.14728562390854996, + "grad_norm": 2.1308975219726562, + "learning_rate": 1.9980884064487945e-05, + "loss": 1.5086, + "step": 6315 + }, + { + "epoch": 0.1473089470477279, + "grad_norm": 2.055668592453003, + "learning_rate": 1.9980868499562586e-05, + "loss": 1.7346, + "step": 6316 + }, + { + "epoch": 0.1473322701869058, + "grad_norm": 1.908616304397583, + "learning_rate": 1.998085292830909e-05, + "loss": 1.7066, + "step": 6317 + }, + { + "epoch": 0.14735559332608372, + "grad_norm": 1.7856305837631226, + "learning_rate": 1.9980837350727475e-05, + "loss": 1.4127, + "step": 6318 + }, + { + "epoch": 0.14737891646526163, + "grad_norm": 1.784224033355713, + "learning_rate": 1.9980821766817746e-05, + "loss": 1.3026, + "step": 6319 + }, + { + "epoch": 0.14740223960443957, + "grad_norm": 1.9266581535339355, + "learning_rate": 1.9980806176579915e-05, + "loss": 1.217, + "step": 6320 + }, + { + "epoch": 0.14742556274361748, + "grad_norm": 2.2485921382904053, + "learning_rate": 1.9980790580013986e-05, + "loss": 1.6592, + "step": 6321 + }, + { + "epoch": 0.1474488858827954, + "grad_norm": 1.627590298652649, + "learning_rate": 1.998077497711998e-05, + "loss": 1.4816, + "step": 6322 + }, + { + "epoch": 0.1474722090219733, + "grad_norm": 1.7750526666641235, + "learning_rate": 1.9980759367897896e-05, + "loss": 1.4028, + "step": 6323 + }, + { + "epoch": 0.14749553216115124, + "grad_norm": 1.7806413173675537, + "learning_rate": 1.9980743752347753e-05, + "loss": 1.3278, + "step": 6324 + }, + { + "epoch": 0.14751885530032915, + "grad_norm": 1.822229027748108, + "learning_rate": 1.9980728130469552e-05, + "loss": 1.2962, + "step": 6325 + }, + { + "epoch": 0.14754217843950707, + "grad_norm": 1.8200899362564087, + "learning_rate": 1.998071250226331e-05, + "loss": 1.52, + "step": 6326 + }, + { + "epoch": 0.14756550157868498, + "grad_norm": 2.0332694053649902, + "learning_rate": 1.998069686772903e-05, + "loss": 1.6824, + "step": 6327 + }, + { + "epoch": 0.1475888247178629, + "grad_norm": 1.6384061574935913, + "learning_rate": 1.998068122686673e-05, + "loss": 1.2046, + "step": 6328 + }, + { + "epoch": 0.14761214785704083, + "grad_norm": 1.9524784088134766, + "learning_rate": 1.9980665579676417e-05, + "loss": 1.4129, + "step": 6329 + }, + { + "epoch": 0.14763547099621874, + "grad_norm": 1.6985217332839966, + "learning_rate": 1.99806499261581e-05, + "loss": 1.3566, + "step": 6330 + }, + { + "epoch": 0.14765879413539665, + "grad_norm": 2.629821300506592, + "learning_rate": 1.9980634266311786e-05, + "loss": 1.3667, + "step": 6331 + }, + { + "epoch": 0.14768211727457456, + "grad_norm": 1.7384490966796875, + "learning_rate": 1.9980618600137492e-05, + "loss": 1.369, + "step": 6332 + }, + { + "epoch": 0.1477054404137525, + "grad_norm": 1.8491767644882202, + "learning_rate": 1.998060292763522e-05, + "loss": 1.7044, + "step": 6333 + }, + { + "epoch": 0.1477287635529304, + "grad_norm": 1.5684857368469238, + "learning_rate": 1.9980587248804982e-05, + "loss": 1.2894, + "step": 6334 + }, + { + "epoch": 0.14775208669210832, + "grad_norm": 2.069748878479004, + "learning_rate": 1.9980571563646793e-05, + "loss": 1.6364, + "step": 6335 + }, + { + "epoch": 0.14777540983128623, + "grad_norm": 2.025808811187744, + "learning_rate": 1.998055587216066e-05, + "loss": 1.1513, + "step": 6336 + }, + { + "epoch": 0.14779873297046417, + "grad_norm": 1.739645004272461, + "learning_rate": 1.9980540174346593e-05, + "loss": 1.5545, + "step": 6337 + }, + { + "epoch": 0.14782205610964208, + "grad_norm": 1.8423300981521606, + "learning_rate": 1.99805244702046e-05, + "loss": 1.4438, + "step": 6338 + }, + { + "epoch": 0.14784537924882, + "grad_norm": 1.6627358198165894, + "learning_rate": 1.998050875973469e-05, + "loss": 1.4511, + "step": 6339 + }, + { + "epoch": 0.1478687023879979, + "grad_norm": 2.385854482650757, + "learning_rate": 1.998049304293688e-05, + "loss": 1.5281, + "step": 6340 + }, + { + "epoch": 0.14789202552717584, + "grad_norm": 1.6110758781433105, + "learning_rate": 1.9980477319811174e-05, + "loss": 1.5689, + "step": 6341 + }, + { + "epoch": 0.14791534866635375, + "grad_norm": 1.502506971359253, + "learning_rate": 1.998046159035758e-05, + "loss": 1.2348, + "step": 6342 + }, + { + "epoch": 0.14793867180553166, + "grad_norm": 2.742504119873047, + "learning_rate": 1.9980445854576117e-05, + "loss": 1.4501, + "step": 6343 + }, + { + "epoch": 0.14796199494470957, + "grad_norm": 1.8370131254196167, + "learning_rate": 1.9980430112466783e-05, + "loss": 1.225, + "step": 6344 + }, + { + "epoch": 0.1479853180838875, + "grad_norm": 1.7540124654769897, + "learning_rate": 1.9980414364029598e-05, + "loss": 1.2098, + "step": 6345 + }, + { + "epoch": 0.14800864122306542, + "grad_norm": 1.8304939270019531, + "learning_rate": 1.998039860926457e-05, + "loss": 1.4693, + "step": 6346 + }, + { + "epoch": 0.14803196436224333, + "grad_norm": 1.869245171546936, + "learning_rate": 1.9980382848171702e-05, + "loss": 1.4976, + "step": 6347 + }, + { + "epoch": 0.14805528750142125, + "grad_norm": 1.7381535768508911, + "learning_rate": 1.9980367080751014e-05, + "loss": 1.669, + "step": 6348 + }, + { + "epoch": 0.14807861064059918, + "grad_norm": 1.6199153661727905, + "learning_rate": 1.998035130700251e-05, + "loss": 1.3269, + "step": 6349 + }, + { + "epoch": 0.1481019337797771, + "grad_norm": 2.246129274368286, + "learning_rate": 1.99803355269262e-05, + "loss": 1.2174, + "step": 6350 + }, + { + "epoch": 0.148125256918955, + "grad_norm": 1.695785403251648, + "learning_rate": 1.9980319740522096e-05, + "loss": 1.4156, + "step": 6351 + }, + { + "epoch": 0.14814858005813292, + "grad_norm": 1.4945704936981201, + "learning_rate": 1.9980303947790207e-05, + "loss": 1.4054, + "step": 6352 + }, + { + "epoch": 0.14817190319731086, + "grad_norm": 1.8915104866027832, + "learning_rate": 1.9980288148730545e-05, + "loss": 1.2275, + "step": 6353 + }, + { + "epoch": 0.14819522633648877, + "grad_norm": 1.646942138671875, + "learning_rate": 1.9980272343343117e-05, + "loss": 1.4376, + "step": 6354 + }, + { + "epoch": 0.14821854947566668, + "grad_norm": 1.5930567979812622, + "learning_rate": 1.9980256531627936e-05, + "loss": 1.7221, + "step": 6355 + }, + { + "epoch": 0.1482418726148446, + "grad_norm": 1.9852091073989868, + "learning_rate": 1.9980240713585005e-05, + "loss": 1.616, + "step": 6356 + }, + { + "epoch": 0.1482651957540225, + "grad_norm": 1.8610070943832397, + "learning_rate": 1.9980224889214342e-05, + "loss": 1.7411, + "step": 6357 + }, + { + "epoch": 0.14828851889320044, + "grad_norm": 1.6750504970550537, + "learning_rate": 1.9980209058515957e-05, + "loss": 1.2728, + "step": 6358 + }, + { + "epoch": 0.14831184203237835, + "grad_norm": 1.7558854818344116, + "learning_rate": 1.9980193221489856e-05, + "loss": 1.5902, + "step": 6359 + }, + { + "epoch": 0.14833516517155626, + "grad_norm": 1.665004849433899, + "learning_rate": 1.998017737813605e-05, + "loss": 1.5172, + "step": 6360 + }, + { + "epoch": 0.14835848831073417, + "grad_norm": 1.7077908515930176, + "learning_rate": 1.998016152845455e-05, + "loss": 1.4948, + "step": 6361 + }, + { + "epoch": 0.1483818114499121, + "grad_norm": 1.7445045709609985, + "learning_rate": 1.9980145672445367e-05, + "loss": 1.6677, + "step": 6362 + }, + { + "epoch": 0.14840513458909002, + "grad_norm": 1.9059146642684937, + "learning_rate": 1.9980129810108506e-05, + "loss": 1.5568, + "step": 6363 + }, + { + "epoch": 0.14842845772826793, + "grad_norm": 1.7856662273406982, + "learning_rate": 1.9980113941443982e-05, + "loss": 1.5833, + "step": 6364 + }, + { + "epoch": 0.14845178086744584, + "grad_norm": 1.9401298761367798, + "learning_rate": 1.9980098066451805e-05, + "loss": 1.4729, + "step": 6365 + }, + { + "epoch": 0.14847510400662378, + "grad_norm": 1.5855766534805298, + "learning_rate": 1.9980082185131983e-05, + "loss": 1.6215, + "step": 6366 + }, + { + "epoch": 0.1484984271458017, + "grad_norm": 1.7955992221832275, + "learning_rate": 1.998006629748453e-05, + "loss": 1.2094, + "step": 6367 + }, + { + "epoch": 0.1485217502849796, + "grad_norm": 1.8252360820770264, + "learning_rate": 1.998005040350945e-05, + "loss": 1.5357, + "step": 6368 + }, + { + "epoch": 0.14854507342415751, + "grad_norm": 1.6559171676635742, + "learning_rate": 1.9980034503206754e-05, + "loss": 1.5619, + "step": 6369 + }, + { + "epoch": 0.14856839656333545, + "grad_norm": 1.6331405639648438, + "learning_rate": 1.9980018596576456e-05, + "loss": 1.0916, + "step": 6370 + }, + { + "epoch": 0.14859171970251336, + "grad_norm": 1.792582631111145, + "learning_rate": 1.9980002683618564e-05, + "loss": 1.2332, + "step": 6371 + }, + { + "epoch": 0.14861504284169127, + "grad_norm": 2.238001823425293, + "learning_rate": 1.997998676433309e-05, + "loss": 1.9084, + "step": 6372 + }, + { + "epoch": 0.14863836598086919, + "grad_norm": 1.7714085578918457, + "learning_rate": 1.997997083872004e-05, + "loss": 1.2718, + "step": 6373 + }, + { + "epoch": 0.14866168912004712, + "grad_norm": 2.5147957801818848, + "learning_rate": 1.997995490677943e-05, + "loss": 1.1215, + "step": 6374 + }, + { + "epoch": 0.14868501225922504, + "grad_norm": 2.174163341522217, + "learning_rate": 1.997993896851126e-05, + "loss": 1.6837, + "step": 6375 + }, + { + "epoch": 0.14870833539840295, + "grad_norm": 1.533829689025879, + "learning_rate": 1.997992302391555e-05, + "loss": 1.3963, + "step": 6376 + }, + { + "epoch": 0.14873165853758086, + "grad_norm": 1.8047055006027222, + "learning_rate": 1.997990707299231e-05, + "loss": 1.1422, + "step": 6377 + }, + { + "epoch": 0.1487549816767588, + "grad_norm": 1.8243073225021362, + "learning_rate": 1.9979891115741547e-05, + "loss": 1.3383, + "step": 6378 + }, + { + "epoch": 0.1487783048159367, + "grad_norm": 1.8305389881134033, + "learning_rate": 1.9979875152163265e-05, + "loss": 1.469, + "step": 6379 + }, + { + "epoch": 0.14880162795511462, + "grad_norm": 1.694000005722046, + "learning_rate": 1.9979859182257485e-05, + "loss": 1.3852, + "step": 6380 + }, + { + "epoch": 0.14882495109429253, + "grad_norm": 1.9437839984893799, + "learning_rate": 1.997984320602421e-05, + "loss": 1.2002, + "step": 6381 + }, + { + "epoch": 0.14884827423347047, + "grad_norm": 1.7798144817352295, + "learning_rate": 1.997982722346345e-05, + "loss": 1.2684, + "step": 6382 + }, + { + "epoch": 0.14887159737264838, + "grad_norm": 1.7351757287979126, + "learning_rate": 1.9979811234575222e-05, + "loss": 1.3483, + "step": 6383 + }, + { + "epoch": 0.1488949205118263, + "grad_norm": 1.5627840757369995, + "learning_rate": 1.9979795239359535e-05, + "loss": 1.3657, + "step": 6384 + }, + { + "epoch": 0.1489182436510042, + "grad_norm": 1.7215213775634766, + "learning_rate": 1.997977923781639e-05, + "loss": 1.6804, + "step": 6385 + }, + { + "epoch": 0.1489415667901821, + "grad_norm": 1.7614259719848633, + "learning_rate": 1.9979763229945803e-05, + "loss": 1.4823, + "step": 6386 + }, + { + "epoch": 0.14896488992936005, + "grad_norm": 1.8664698600769043, + "learning_rate": 1.9979747215747785e-05, + "loss": 1.579, + "step": 6387 + }, + { + "epoch": 0.14898821306853796, + "grad_norm": 1.9672660827636719, + "learning_rate": 1.9979731195222344e-05, + "loss": 1.3812, + "step": 6388 + }, + { + "epoch": 0.14901153620771587, + "grad_norm": 1.7829407453536987, + "learning_rate": 1.9979715168369495e-05, + "loss": 1.5902, + "step": 6389 + }, + { + "epoch": 0.14903485934689378, + "grad_norm": 2.2432830333709717, + "learning_rate": 1.9979699135189242e-05, + "loss": 1.5942, + "step": 6390 + }, + { + "epoch": 0.14905818248607172, + "grad_norm": 1.8594882488250732, + "learning_rate": 1.99796830956816e-05, + "loss": 1.3613, + "step": 6391 + }, + { + "epoch": 0.14908150562524963, + "grad_norm": 1.764748454093933, + "learning_rate": 1.9979667049846575e-05, + "loss": 1.279, + "step": 6392 + }, + { + "epoch": 0.14910482876442754, + "grad_norm": 1.6265698671340942, + "learning_rate": 1.997965099768418e-05, + "loss": 1.2791, + "step": 6393 + }, + { + "epoch": 0.14912815190360545, + "grad_norm": 2.353329658508301, + "learning_rate": 1.9979634939194422e-05, + "loss": 1.3027, + "step": 6394 + }, + { + "epoch": 0.1491514750427834, + "grad_norm": 1.7532522678375244, + "learning_rate": 1.9979618874377316e-05, + "loss": 1.2593, + "step": 6395 + }, + { + "epoch": 0.1491747981819613, + "grad_norm": 1.6128414869308472, + "learning_rate": 1.9979602803232867e-05, + "loss": 1.2884, + "step": 6396 + }, + { + "epoch": 0.14919812132113922, + "grad_norm": 2.1591663360595703, + "learning_rate": 1.9979586725761088e-05, + "loss": 1.2383, + "step": 6397 + }, + { + "epoch": 0.14922144446031713, + "grad_norm": 1.4252793788909912, + "learning_rate": 1.997957064196199e-05, + "loss": 1.5374, + "step": 6398 + }, + { + "epoch": 0.14924476759949507, + "grad_norm": 1.838645339012146, + "learning_rate": 1.9979554551835584e-05, + "loss": 1.2094, + "step": 6399 + }, + { + "epoch": 0.14926809073867298, + "grad_norm": 2.757004499435425, + "learning_rate": 1.9979538455381878e-05, + "loss": 1.0902, + "step": 6400 + }, + { + "epoch": 0.1492914138778509, + "grad_norm": 1.4752918481826782, + "learning_rate": 1.997952235260088e-05, + "loss": 1.5916, + "step": 6401 + }, + { + "epoch": 0.1493147370170288, + "grad_norm": 1.768027901649475, + "learning_rate": 1.9979506243492604e-05, + "loss": 1.3541, + "step": 6402 + }, + { + "epoch": 0.14933806015620674, + "grad_norm": 1.8835238218307495, + "learning_rate": 1.9979490128057063e-05, + "loss": 1.455, + "step": 6403 + }, + { + "epoch": 0.14936138329538465, + "grad_norm": 2.0347955226898193, + "learning_rate": 1.9979474006294257e-05, + "loss": 1.3051, + "step": 6404 + }, + { + "epoch": 0.14938470643456256, + "grad_norm": 1.5055944919586182, + "learning_rate": 1.9979457878204207e-05, + "loss": 1.4491, + "step": 6405 + }, + { + "epoch": 0.14940802957374047, + "grad_norm": 1.8938367366790771, + "learning_rate": 1.9979441743786915e-05, + "loss": 1.4598, + "step": 6406 + }, + { + "epoch": 0.1494313527129184, + "grad_norm": 1.7014673948287964, + "learning_rate": 1.99794256030424e-05, + "loss": 1.6592, + "step": 6407 + }, + { + "epoch": 0.14945467585209632, + "grad_norm": 2.487757444381714, + "learning_rate": 1.9979409455970665e-05, + "loss": 1.7027, + "step": 6408 + }, + { + "epoch": 0.14947799899127423, + "grad_norm": 2.082040309906006, + "learning_rate": 1.997939330257172e-05, + "loss": 1.5729, + "step": 6409 + }, + { + "epoch": 0.14950132213045214, + "grad_norm": 1.8862005472183228, + "learning_rate": 1.997937714284558e-05, + "loss": 1.5448, + "step": 6410 + }, + { + "epoch": 0.14952464526963005, + "grad_norm": 1.3823610544204712, + "learning_rate": 1.997936097679225e-05, + "loss": 1.4054, + "step": 6411 + }, + { + "epoch": 0.149547968408808, + "grad_norm": 1.669594407081604, + "learning_rate": 1.9979344804411744e-05, + "loss": 1.4086, + "step": 6412 + }, + { + "epoch": 0.1495712915479859, + "grad_norm": 1.6962192058563232, + "learning_rate": 1.9979328625704072e-05, + "loss": 1.7387, + "step": 6413 + }, + { + "epoch": 0.1495946146871638, + "grad_norm": 1.6447385549545288, + "learning_rate": 1.9979312440669246e-05, + "loss": 1.3169, + "step": 6414 + }, + { + "epoch": 0.14961793782634172, + "grad_norm": 2.078167676925659, + "learning_rate": 1.9979296249307274e-05, + "loss": 1.6391, + "step": 6415 + }, + { + "epoch": 0.14964126096551966, + "grad_norm": 1.6783324480056763, + "learning_rate": 1.9979280051618163e-05, + "loss": 1.4871, + "step": 6416 + }, + { + "epoch": 0.14966458410469757, + "grad_norm": 2.452965497970581, + "learning_rate": 1.997926384760193e-05, + "loss": 1.4015, + "step": 6417 + }, + { + "epoch": 0.14968790724387548, + "grad_norm": 1.6726702451705933, + "learning_rate": 1.9979247637258575e-05, + "loss": 1.7195, + "step": 6418 + }, + { + "epoch": 0.1497112303830534, + "grad_norm": 1.9917819499969482, + "learning_rate": 1.997923142058812e-05, + "loss": 1.6801, + "step": 6419 + }, + { + "epoch": 0.14973455352223133, + "grad_norm": 1.935634970664978, + "learning_rate": 1.9979215197590572e-05, + "loss": 1.5383, + "step": 6420 + }, + { + "epoch": 0.14975787666140924, + "grad_norm": 1.5669339895248413, + "learning_rate": 1.9979198968265935e-05, + "loss": 1.4116, + "step": 6421 + }, + { + "epoch": 0.14978119980058716, + "grad_norm": 1.6722368001937866, + "learning_rate": 1.997918273261423e-05, + "loss": 1.296, + "step": 6422 + }, + { + "epoch": 0.14980452293976507, + "grad_norm": 1.9812976121902466, + "learning_rate": 1.9979166490635456e-05, + "loss": 1.0995, + "step": 6423 + }, + { + "epoch": 0.149827846078943, + "grad_norm": 2.0594632625579834, + "learning_rate": 1.9979150242329628e-05, + "loss": 1.7338, + "step": 6424 + }, + { + "epoch": 0.14985116921812092, + "grad_norm": 1.9277935028076172, + "learning_rate": 1.9979133987696758e-05, + "loss": 1.6301, + "step": 6425 + }, + { + "epoch": 0.14987449235729883, + "grad_norm": 1.9171355962753296, + "learning_rate": 1.997911772673686e-05, + "loss": 1.6942, + "step": 6426 + }, + { + "epoch": 0.14989781549647674, + "grad_norm": 1.5507508516311646, + "learning_rate": 1.997910145944993e-05, + "loss": 1.072, + "step": 6427 + }, + { + "epoch": 0.14992113863565468, + "grad_norm": 1.6676766872406006, + "learning_rate": 1.9979085185835995e-05, + "loss": 1.3225, + "step": 6428 + }, + { + "epoch": 0.1499444617748326, + "grad_norm": 1.7878844738006592, + "learning_rate": 1.9979068905895055e-05, + "loss": 1.5722, + "step": 6429 + }, + { + "epoch": 0.1499677849140105, + "grad_norm": 1.924278736114502, + "learning_rate": 1.9979052619627124e-05, + "loss": 1.2868, + "step": 6430 + }, + { + "epoch": 0.1499911080531884, + "grad_norm": 1.7272096872329712, + "learning_rate": 1.997903632703221e-05, + "loss": 1.3176, + "step": 6431 + }, + { + "epoch": 0.15001443119236635, + "grad_norm": 1.628877878189087, + "learning_rate": 1.9979020028110328e-05, + "loss": 1.4085, + "step": 6432 + }, + { + "epoch": 0.15003775433154426, + "grad_norm": 1.6059218645095825, + "learning_rate": 1.9979003722861483e-05, + "loss": 1.597, + "step": 6433 + }, + { + "epoch": 0.15006107747072217, + "grad_norm": 2.2409846782684326, + "learning_rate": 1.997898741128569e-05, + "loss": 1.4655, + "step": 6434 + }, + { + "epoch": 0.15008440060990008, + "grad_norm": 1.577303409576416, + "learning_rate": 1.9978971093382953e-05, + "loss": 1.187, + "step": 6435 + }, + { + "epoch": 0.15010772374907802, + "grad_norm": 1.542309284210205, + "learning_rate": 1.9978954769153293e-05, + "loss": 1.7159, + "step": 6436 + }, + { + "epoch": 0.15013104688825593, + "grad_norm": 1.7154550552368164, + "learning_rate": 1.9978938438596706e-05, + "loss": 1.6311, + "step": 6437 + }, + { + "epoch": 0.15015437002743384, + "grad_norm": 1.847980260848999, + "learning_rate": 1.9978922101713217e-05, + "loss": 1.4868, + "step": 6438 + }, + { + "epoch": 0.15017769316661175, + "grad_norm": 1.5811986923217773, + "learning_rate": 1.9978905758502825e-05, + "loss": 1.3122, + "step": 6439 + }, + { + "epoch": 0.15020101630578966, + "grad_norm": 1.5300745964050293, + "learning_rate": 1.9978889408965548e-05, + "loss": 1.6769, + "step": 6440 + }, + { + "epoch": 0.1502243394449676, + "grad_norm": 2.312790632247925, + "learning_rate": 1.997887305310139e-05, + "loss": 1.4959, + "step": 6441 + }, + { + "epoch": 0.1502476625841455, + "grad_norm": 1.855665922164917, + "learning_rate": 1.9978856690910364e-05, + "loss": 1.4539, + "step": 6442 + }, + { + "epoch": 0.15027098572332342, + "grad_norm": 1.6974115371704102, + "learning_rate": 1.997884032239248e-05, + "loss": 1.3456, + "step": 6443 + }, + { + "epoch": 0.15029430886250134, + "grad_norm": 1.5362883806228638, + "learning_rate": 1.9978823947547754e-05, + "loss": 1.6035, + "step": 6444 + }, + { + "epoch": 0.15031763200167927, + "grad_norm": 1.4362261295318604, + "learning_rate": 1.9978807566376188e-05, + "loss": 1.3136, + "step": 6445 + }, + { + "epoch": 0.15034095514085719, + "grad_norm": 1.5727035999298096, + "learning_rate": 1.9978791178877798e-05, + "loss": 1.5213, + "step": 6446 + }, + { + "epoch": 0.1503642782800351, + "grad_norm": 1.47102952003479, + "learning_rate": 1.9978774785052593e-05, + "loss": 1.4537, + "step": 6447 + }, + { + "epoch": 0.150387601419213, + "grad_norm": 1.2262845039367676, + "learning_rate": 1.997875838490058e-05, + "loss": 1.167, + "step": 6448 + }, + { + "epoch": 0.15041092455839095, + "grad_norm": 2.3677914142608643, + "learning_rate": 1.9978741978421775e-05, + "loss": 1.7975, + "step": 6449 + }, + { + "epoch": 0.15043424769756886, + "grad_norm": 1.4873206615447998, + "learning_rate": 1.9978725565616185e-05, + "loss": 1.2652, + "step": 6450 + }, + { + "epoch": 0.15045757083674677, + "grad_norm": 1.8219521045684814, + "learning_rate": 1.9978709146483818e-05, + "loss": 1.4345, + "step": 6451 + }, + { + "epoch": 0.15048089397592468, + "grad_norm": 1.6071401834487915, + "learning_rate": 1.997869272102469e-05, + "loss": 1.3622, + "step": 6452 + }, + { + "epoch": 0.15050421711510262, + "grad_norm": 2.2650022506713867, + "learning_rate": 1.997867628923881e-05, + "loss": 1.7672, + "step": 6453 + }, + { + "epoch": 0.15052754025428053, + "grad_norm": 1.5832401514053345, + "learning_rate": 1.9978659851126183e-05, + "loss": 1.5779, + "step": 6454 + }, + { + "epoch": 0.15055086339345844, + "grad_norm": 1.6860471963882446, + "learning_rate": 1.9978643406686827e-05, + "loss": 1.4339, + "step": 6455 + }, + { + "epoch": 0.15057418653263635, + "grad_norm": 1.957720398902893, + "learning_rate": 1.9978626955920747e-05, + "loss": 1.5964, + "step": 6456 + }, + { + "epoch": 0.1505975096718143, + "grad_norm": 1.667849063873291, + "learning_rate": 1.9978610498827958e-05, + "loss": 1.0868, + "step": 6457 + }, + { + "epoch": 0.1506208328109922, + "grad_norm": 1.9904413223266602, + "learning_rate": 1.9978594035408466e-05, + "loss": 1.3838, + "step": 6458 + }, + { + "epoch": 0.1506441559501701, + "grad_norm": 1.8178479671478271, + "learning_rate": 1.9978577565662286e-05, + "loss": 1.3676, + "step": 6459 + }, + { + "epoch": 0.15066747908934802, + "grad_norm": 1.8836613893508911, + "learning_rate": 1.9978561089589427e-05, + "loss": 1.3298, + "step": 6460 + }, + { + "epoch": 0.15069080222852596, + "grad_norm": 1.9004164934158325, + "learning_rate": 1.9978544607189892e-05, + "loss": 1.7459, + "step": 6461 + }, + { + "epoch": 0.15071412536770387, + "grad_norm": 1.6539698839187622, + "learning_rate": 1.9978528118463702e-05, + "loss": 1.5108, + "step": 6462 + }, + { + "epoch": 0.15073744850688178, + "grad_norm": 1.710914969444275, + "learning_rate": 1.9978511623410864e-05, + "loss": 1.5018, + "step": 6463 + }, + { + "epoch": 0.1507607716460597, + "grad_norm": 1.3733198642730713, + "learning_rate": 1.9978495122031388e-05, + "loss": 1.1493, + "step": 6464 + }, + { + "epoch": 0.15078409478523763, + "grad_norm": 1.7734730243682861, + "learning_rate": 1.997847861432528e-05, + "loss": 1.7355, + "step": 6465 + }, + { + "epoch": 0.15080741792441554, + "grad_norm": 2.04535174369812, + "learning_rate": 1.9978462100292558e-05, + "loss": 1.097, + "step": 6466 + }, + { + "epoch": 0.15083074106359345, + "grad_norm": 1.492285132408142, + "learning_rate": 1.9978445579933225e-05, + "loss": 1.0915, + "step": 6467 + }, + { + "epoch": 0.15085406420277137, + "grad_norm": 1.6826637983322144, + "learning_rate": 1.9978429053247297e-05, + "loss": 1.2832, + "step": 6468 + }, + { + "epoch": 0.15087738734194928, + "grad_norm": 2.174293041229248, + "learning_rate": 1.997841252023479e-05, + "loss": 1.6778, + "step": 6469 + }, + { + "epoch": 0.15090071048112721, + "grad_norm": 2.450093984603882, + "learning_rate": 1.99783959808957e-05, + "loss": 1.9501, + "step": 6470 + }, + { + "epoch": 0.15092403362030513, + "grad_norm": 2.112665891647339, + "learning_rate": 1.9978379435230046e-05, + "loss": 1.2682, + "step": 6471 + }, + { + "epoch": 0.15094735675948304, + "grad_norm": 1.7335790395736694, + "learning_rate": 1.9978362883237837e-05, + "loss": 1.0613, + "step": 6472 + }, + { + "epoch": 0.15097067989866095, + "grad_norm": 1.874605655670166, + "learning_rate": 1.9978346324919088e-05, + "loss": 1.4522, + "step": 6473 + }, + { + "epoch": 0.1509940030378389, + "grad_norm": 1.7361513376235962, + "learning_rate": 1.9978329760273803e-05, + "loss": 1.3894, + "step": 6474 + }, + { + "epoch": 0.1510173261770168, + "grad_norm": 1.5649514198303223, + "learning_rate": 1.9978313189301994e-05, + "loss": 1.2021, + "step": 6475 + }, + { + "epoch": 0.1510406493161947, + "grad_norm": 1.6188381910324097, + "learning_rate": 1.9978296612003673e-05, + "loss": 1.0341, + "step": 6476 + }, + { + "epoch": 0.15106397245537262, + "grad_norm": 1.5095072984695435, + "learning_rate": 1.997828002837885e-05, + "loss": 1.3471, + "step": 6477 + }, + { + "epoch": 0.15108729559455056, + "grad_norm": 2.0995352268218994, + "learning_rate": 1.9978263438427533e-05, + "loss": 1.5183, + "step": 6478 + }, + { + "epoch": 0.15111061873372847, + "grad_norm": 1.7003324031829834, + "learning_rate": 1.997824684214974e-05, + "loss": 1.5548, + "step": 6479 + }, + { + "epoch": 0.15113394187290638, + "grad_norm": 1.7376984357833862, + "learning_rate": 1.9978230239545475e-05, + "loss": 1.3929, + "step": 6480 + }, + { + "epoch": 0.1511572650120843, + "grad_norm": 1.6754313707351685, + "learning_rate": 1.997821363061475e-05, + "loss": 1.7666, + "step": 6481 + }, + { + "epoch": 0.15118058815126223, + "grad_norm": 1.5937837362289429, + "learning_rate": 1.9978197015357574e-05, + "loss": 1.4147, + "step": 6482 + }, + { + "epoch": 0.15120391129044014, + "grad_norm": 1.6369043588638306, + "learning_rate": 1.997818039377396e-05, + "loss": 1.5992, + "step": 6483 + }, + { + "epoch": 0.15122723442961805, + "grad_norm": 2.1818599700927734, + "learning_rate": 1.9978163765863918e-05, + "loss": 1.3684, + "step": 6484 + }, + { + "epoch": 0.15125055756879596, + "grad_norm": 3.3784594535827637, + "learning_rate": 1.9978147131627456e-05, + "loss": 1.5897, + "step": 6485 + }, + { + "epoch": 0.1512738807079739, + "grad_norm": 1.8567595481872559, + "learning_rate": 1.997813049106459e-05, + "loss": 1.4494, + "step": 6486 + }, + { + "epoch": 0.1512972038471518, + "grad_norm": 1.8432731628417969, + "learning_rate": 1.9978113844175326e-05, + "loss": 1.6747, + "step": 6487 + }, + { + "epoch": 0.15132052698632972, + "grad_norm": 1.642920970916748, + "learning_rate": 1.9978097190959674e-05, + "loss": 1.3777, + "step": 6488 + }, + { + "epoch": 0.15134385012550763, + "grad_norm": 1.8871757984161377, + "learning_rate": 1.997808053141765e-05, + "loss": 1.4572, + "step": 6489 + }, + { + "epoch": 0.15136717326468557, + "grad_norm": 2.2074224948883057, + "learning_rate": 1.9978063865549257e-05, + "loss": 1.1339, + "step": 6490 + }, + { + "epoch": 0.15139049640386348, + "grad_norm": 1.860770344734192, + "learning_rate": 1.9978047193354514e-05, + "loss": 1.5212, + "step": 6491 + }, + { + "epoch": 0.1514138195430414, + "grad_norm": 1.8878321647644043, + "learning_rate": 1.9978030514833425e-05, + "loss": 1.3368, + "step": 6492 + }, + { + "epoch": 0.1514371426822193, + "grad_norm": 1.7903316020965576, + "learning_rate": 1.9978013829986e-05, + "loss": 1.3396, + "step": 6493 + }, + { + "epoch": 0.15146046582139724, + "grad_norm": 1.9103925228118896, + "learning_rate": 1.9977997138812257e-05, + "loss": 1.0665, + "step": 6494 + }, + { + "epoch": 0.15148378896057516, + "grad_norm": 1.822698950767517, + "learning_rate": 1.9977980441312198e-05, + "loss": 1.4066, + "step": 6495 + }, + { + "epoch": 0.15150711209975307, + "grad_norm": 1.8628785610198975, + "learning_rate": 1.997796373748584e-05, + "loss": 1.5485, + "step": 6496 + }, + { + "epoch": 0.15153043523893098, + "grad_norm": 1.5208309888839722, + "learning_rate": 1.997794702733319e-05, + "loss": 1.3008, + "step": 6497 + }, + { + "epoch": 0.1515537583781089, + "grad_norm": 2.140714645385742, + "learning_rate": 1.997793031085426e-05, + "loss": 1.5179, + "step": 6498 + }, + { + "epoch": 0.15157708151728683, + "grad_norm": 1.988162875175476, + "learning_rate": 1.997791358804906e-05, + "loss": 1.2495, + "step": 6499 + }, + { + "epoch": 0.15160040465646474, + "grad_norm": 2.1209254264831543, + "learning_rate": 1.9977896858917602e-05, + "loss": 1.5261, + "step": 6500 + }, + { + "epoch": 0.15162372779564265, + "grad_norm": 2.0882837772369385, + "learning_rate": 1.9977880123459893e-05, + "loss": 0.9897, + "step": 6501 + }, + { + "epoch": 0.15164705093482056, + "grad_norm": 1.7871941328048706, + "learning_rate": 1.9977863381675946e-05, + "loss": 1.6516, + "step": 6502 + }, + { + "epoch": 0.1516703740739985, + "grad_norm": 1.9040305614471436, + "learning_rate": 1.9977846633565774e-05, + "loss": 1.6347, + "step": 6503 + }, + { + "epoch": 0.1516936972131764, + "grad_norm": 1.9576791524887085, + "learning_rate": 1.9977829879129383e-05, + "loss": 1.2796, + "step": 6504 + }, + { + "epoch": 0.15171702035235432, + "grad_norm": 1.7492595911026, + "learning_rate": 1.9977813118366786e-05, + "loss": 1.5032, + "step": 6505 + }, + { + "epoch": 0.15174034349153223, + "grad_norm": 1.689988613128662, + "learning_rate": 1.9977796351277997e-05, + "loss": 1.3516, + "step": 6506 + }, + { + "epoch": 0.15176366663071017, + "grad_norm": 1.831680417060852, + "learning_rate": 1.997777957786302e-05, + "loss": 1.4084, + "step": 6507 + }, + { + "epoch": 0.15178698976988808, + "grad_norm": 2.5989856719970703, + "learning_rate": 1.997776279812187e-05, + "loss": 1.6565, + "step": 6508 + }, + { + "epoch": 0.151810312909066, + "grad_norm": 1.674654245376587, + "learning_rate": 1.9977746012054555e-05, + "loss": 1.4803, + "step": 6509 + }, + { + "epoch": 0.1518336360482439, + "grad_norm": 2.247751474380493, + "learning_rate": 1.9977729219661088e-05, + "loss": 1.3014, + "step": 6510 + }, + { + "epoch": 0.15185695918742184, + "grad_norm": 1.8973122835159302, + "learning_rate": 1.997771242094148e-05, + "loss": 1.5883, + "step": 6511 + }, + { + "epoch": 0.15188028232659975, + "grad_norm": 1.621720552444458, + "learning_rate": 1.997769561589574e-05, + "loss": 1.4257, + "step": 6512 + }, + { + "epoch": 0.15190360546577766, + "grad_norm": 1.5546311140060425, + "learning_rate": 1.9977678804523877e-05, + "loss": 1.2801, + "step": 6513 + }, + { + "epoch": 0.15192692860495557, + "grad_norm": 1.8398672342300415, + "learning_rate": 1.9977661986825906e-05, + "loss": 1.8715, + "step": 6514 + }, + { + "epoch": 0.1519502517441335, + "grad_norm": 1.7678091526031494, + "learning_rate": 1.9977645162801833e-05, + "loss": 1.6086, + "step": 6515 + }, + { + "epoch": 0.15197357488331142, + "grad_norm": 2.146550416946411, + "learning_rate": 1.997762833245167e-05, + "loss": 1.5868, + "step": 6516 + }, + { + "epoch": 0.15199689802248934, + "grad_norm": 1.8834812641143799, + "learning_rate": 1.997761149577543e-05, + "loss": 1.5714, + "step": 6517 + }, + { + "epoch": 0.15202022116166725, + "grad_norm": 1.730095386505127, + "learning_rate": 1.9977594652773127e-05, + "loss": 1.6212, + "step": 6518 + }, + { + "epoch": 0.15204354430084518, + "grad_norm": 1.641436219215393, + "learning_rate": 1.9977577803444764e-05, + "loss": 1.6385, + "step": 6519 + }, + { + "epoch": 0.1520668674400231, + "grad_norm": 1.968949794769287, + "learning_rate": 1.9977560947790353e-05, + "loss": 1.4906, + "step": 6520 + }, + { + "epoch": 0.152090190579201, + "grad_norm": 1.6064125299453735, + "learning_rate": 1.9977544085809906e-05, + "loss": 1.641, + "step": 6521 + }, + { + "epoch": 0.15211351371837892, + "grad_norm": 1.8463408946990967, + "learning_rate": 1.9977527217503438e-05, + "loss": 1.4831, + "step": 6522 + }, + { + "epoch": 0.15213683685755686, + "grad_norm": 1.7644277811050415, + "learning_rate": 1.997751034287095e-05, + "loss": 1.4995, + "step": 6523 + }, + { + "epoch": 0.15216015999673477, + "grad_norm": 1.8047285079956055, + "learning_rate": 1.9977493461912464e-05, + "loss": 1.5235, + "step": 6524 + }, + { + "epoch": 0.15218348313591268, + "grad_norm": 2.555365800857544, + "learning_rate": 1.9977476574627982e-05, + "loss": 1.2483, + "step": 6525 + }, + { + "epoch": 0.1522068062750906, + "grad_norm": 1.70860755443573, + "learning_rate": 1.997745968101752e-05, + "loss": 1.3158, + "step": 6526 + }, + { + "epoch": 0.1522301294142685, + "grad_norm": 1.7148840427398682, + "learning_rate": 1.9977442781081086e-05, + "loss": 1.086, + "step": 6527 + }, + { + "epoch": 0.15225345255344644, + "grad_norm": 1.980648159980774, + "learning_rate": 1.997742587481869e-05, + "loss": 1.2188, + "step": 6528 + }, + { + "epoch": 0.15227677569262435, + "grad_norm": 1.8529666662216187, + "learning_rate": 1.9977408962230344e-05, + "loss": 1.2604, + "step": 6529 + }, + { + "epoch": 0.15230009883180226, + "grad_norm": 1.4504103660583496, + "learning_rate": 1.9977392043316063e-05, + "loss": 1.2932, + "step": 6530 + }, + { + "epoch": 0.15232342197098017, + "grad_norm": 2.077039957046509, + "learning_rate": 1.997737511807585e-05, + "loss": 1.37, + "step": 6531 + }, + { + "epoch": 0.1523467451101581, + "grad_norm": 1.5881446599960327, + "learning_rate": 1.9977358186509717e-05, + "loss": 1.4382, + "step": 6532 + }, + { + "epoch": 0.15237006824933602, + "grad_norm": 1.8780879974365234, + "learning_rate": 1.997734124861768e-05, + "loss": 1.2797, + "step": 6533 + }, + { + "epoch": 0.15239339138851393, + "grad_norm": 1.6316667795181274, + "learning_rate": 1.9977324304399746e-05, + "loss": 1.2751, + "step": 6534 + }, + { + "epoch": 0.15241671452769184, + "grad_norm": 2.035203695297241, + "learning_rate": 1.9977307353855927e-05, + "loss": 1.5434, + "step": 6535 + }, + { + "epoch": 0.15244003766686978, + "grad_norm": 1.874140977859497, + "learning_rate": 1.997729039698623e-05, + "loss": 1.2624, + "step": 6536 + }, + { + "epoch": 0.1524633608060477, + "grad_norm": 1.658991813659668, + "learning_rate": 1.9977273433790674e-05, + "loss": 1.3825, + "step": 6537 + }, + { + "epoch": 0.1524866839452256, + "grad_norm": 2.5232551097869873, + "learning_rate": 1.997725646426926e-05, + "loss": 1.5688, + "step": 6538 + }, + { + "epoch": 0.15251000708440351, + "grad_norm": 2.214003324508667, + "learning_rate": 1.9977239488422005e-05, + "loss": 1.7858, + "step": 6539 + }, + { + "epoch": 0.15253333022358145, + "grad_norm": 1.6410120725631714, + "learning_rate": 1.9977222506248922e-05, + "loss": 1.2271, + "step": 6540 + }, + { + "epoch": 0.15255665336275936, + "grad_norm": 1.8160293102264404, + "learning_rate": 1.9977205517750017e-05, + "loss": 1.6119, + "step": 6541 + }, + { + "epoch": 0.15257997650193728, + "grad_norm": 1.975029468536377, + "learning_rate": 1.9977188522925297e-05, + "loss": 1.2518, + "step": 6542 + }, + { + "epoch": 0.1526032996411152, + "grad_norm": 1.7145087718963623, + "learning_rate": 1.9977171521774782e-05, + "loss": 1.0661, + "step": 6543 + }, + { + "epoch": 0.15262662278029313, + "grad_norm": 1.8253360986709595, + "learning_rate": 1.9977154514298473e-05, + "loss": 1.5222, + "step": 6544 + }, + { + "epoch": 0.15264994591947104, + "grad_norm": 2.014852285385132, + "learning_rate": 1.997713750049639e-05, + "loss": 1.7369, + "step": 6545 + }, + { + "epoch": 0.15267326905864895, + "grad_norm": 1.85854172706604, + "learning_rate": 1.9977120480368543e-05, + "loss": 1.3594, + "step": 6546 + }, + { + "epoch": 0.15269659219782686, + "grad_norm": 1.8542810678482056, + "learning_rate": 1.9977103453914935e-05, + "loss": 1.3398, + "step": 6547 + }, + { + "epoch": 0.1527199153370048, + "grad_norm": 1.7564404010772705, + "learning_rate": 1.9977086421135583e-05, + "loss": 1.154, + "step": 6548 + }, + { + "epoch": 0.1527432384761827, + "grad_norm": 1.9835318326950073, + "learning_rate": 1.9977069382030495e-05, + "loss": 1.7294, + "step": 6549 + }, + { + "epoch": 0.15276656161536062, + "grad_norm": 2.2860097885131836, + "learning_rate": 1.9977052336599683e-05, + "loss": 1.364, + "step": 6550 + }, + { + "epoch": 0.15278988475453853, + "grad_norm": 1.6872493028640747, + "learning_rate": 1.997703528484316e-05, + "loss": 1.5011, + "step": 6551 + }, + { + "epoch": 0.15281320789371647, + "grad_norm": 1.4953231811523438, + "learning_rate": 1.9977018226760933e-05, + "loss": 1.4451, + "step": 6552 + }, + { + "epoch": 0.15283653103289438, + "grad_norm": 2.006622791290283, + "learning_rate": 1.9977001162353015e-05, + "loss": 1.2026, + "step": 6553 + }, + { + "epoch": 0.1528598541720723, + "grad_norm": 1.8958851099014282, + "learning_rate": 1.9976984091619415e-05, + "loss": 1.4197, + "step": 6554 + }, + { + "epoch": 0.1528831773112502, + "grad_norm": 1.663368582725525, + "learning_rate": 1.997696701456015e-05, + "loss": 1.3074, + "step": 6555 + }, + { + "epoch": 0.1529065004504281, + "grad_norm": 1.7730720043182373, + "learning_rate": 1.9976949931175224e-05, + "loss": 1.6625, + "step": 6556 + }, + { + "epoch": 0.15292982358960605, + "grad_norm": 1.7763030529022217, + "learning_rate": 1.9976932841464647e-05, + "loss": 1.5853, + "step": 6557 + }, + { + "epoch": 0.15295314672878396, + "grad_norm": 1.6516815423965454, + "learning_rate": 1.9976915745428434e-05, + "loss": 1.154, + "step": 6558 + }, + { + "epoch": 0.15297646986796187, + "grad_norm": 1.5111597776412964, + "learning_rate": 1.9976898643066593e-05, + "loss": 1.0464, + "step": 6559 + }, + { + "epoch": 0.15299979300713978, + "grad_norm": 1.7658908367156982, + "learning_rate": 1.9976881534379137e-05, + "loss": 1.3063, + "step": 6560 + }, + { + "epoch": 0.15302311614631772, + "grad_norm": 1.6070139408111572, + "learning_rate": 1.9976864419366077e-05, + "loss": 1.4139, + "step": 6561 + }, + { + "epoch": 0.15304643928549563, + "grad_norm": 1.9629520177841187, + "learning_rate": 1.9976847298027425e-05, + "loss": 1.7102, + "step": 6562 + }, + { + "epoch": 0.15306976242467354, + "grad_norm": 1.6279053688049316, + "learning_rate": 1.9976830170363185e-05, + "loss": 1.6218, + "step": 6563 + }, + { + "epoch": 0.15309308556385146, + "grad_norm": 1.913233995437622, + "learning_rate": 1.9976813036373377e-05, + "loss": 1.6707, + "step": 6564 + }, + { + "epoch": 0.1531164087030294, + "grad_norm": 1.9475053548812866, + "learning_rate": 1.997679589605801e-05, + "loss": 1.762, + "step": 6565 + }, + { + "epoch": 0.1531397318422073, + "grad_norm": 2.150068998336792, + "learning_rate": 1.9976778749417084e-05, + "loss": 1.2494, + "step": 6566 + }, + { + "epoch": 0.15316305498138522, + "grad_norm": 1.6049610376358032, + "learning_rate": 1.9976761596450625e-05, + "loss": 1.4975, + "step": 6567 + }, + { + "epoch": 0.15318637812056313, + "grad_norm": 1.5477256774902344, + "learning_rate": 1.9976744437158636e-05, + "loss": 1.4893, + "step": 6568 + }, + { + "epoch": 0.15320970125974107, + "grad_norm": 1.8656615018844604, + "learning_rate": 1.9976727271541126e-05, + "loss": 1.505, + "step": 6569 + }, + { + "epoch": 0.15323302439891898, + "grad_norm": 1.7299082279205322, + "learning_rate": 1.9976710099598114e-05, + "loss": 1.3042, + "step": 6570 + }, + { + "epoch": 0.1532563475380969, + "grad_norm": 1.5539323091506958, + "learning_rate": 1.9976692921329605e-05, + "loss": 1.0444, + "step": 6571 + }, + { + "epoch": 0.1532796706772748, + "grad_norm": 1.462532877922058, + "learning_rate": 1.9976675736735606e-05, + "loss": 1.0247, + "step": 6572 + }, + { + "epoch": 0.15330299381645274, + "grad_norm": 1.7787960767745972, + "learning_rate": 1.9976658545816135e-05, + "loss": 1.2049, + "step": 6573 + }, + { + "epoch": 0.15332631695563065, + "grad_norm": 1.535142421722412, + "learning_rate": 1.99766413485712e-05, + "loss": 1.0657, + "step": 6574 + }, + { + "epoch": 0.15334964009480856, + "grad_norm": 1.9274897575378418, + "learning_rate": 1.9976624145000816e-05, + "loss": 1.5395, + "step": 6575 + }, + { + "epoch": 0.15337296323398647, + "grad_norm": 2.2430832386016846, + "learning_rate": 1.9976606935104988e-05, + "loss": 1.1804, + "step": 6576 + }, + { + "epoch": 0.1533962863731644, + "grad_norm": 1.6187376976013184, + "learning_rate": 1.997658971888373e-05, + "loss": 1.2581, + "step": 6577 + }, + { + "epoch": 0.15341960951234232, + "grad_norm": 2.1292576789855957, + "learning_rate": 1.9976572496337053e-05, + "loss": 1.4201, + "step": 6578 + }, + { + "epoch": 0.15344293265152023, + "grad_norm": 1.9969521760940552, + "learning_rate": 1.9976555267464963e-05, + "loss": 1.2763, + "step": 6579 + }, + { + "epoch": 0.15346625579069814, + "grad_norm": 1.6187206506729126, + "learning_rate": 1.9976538032267478e-05, + "loss": 1.5049, + "step": 6580 + }, + { + "epoch": 0.15348957892987608, + "grad_norm": 1.7974942922592163, + "learning_rate": 1.9976520790744605e-05, + "loss": 1.6748, + "step": 6581 + }, + { + "epoch": 0.153512902069054, + "grad_norm": 2.093141555786133, + "learning_rate": 1.9976503542896358e-05, + "loss": 1.5338, + "step": 6582 + }, + { + "epoch": 0.1535362252082319, + "grad_norm": 1.8473402261734009, + "learning_rate": 1.9976486288722743e-05, + "loss": 1.0662, + "step": 6583 + }, + { + "epoch": 0.1535595483474098, + "grad_norm": 1.8073716163635254, + "learning_rate": 1.9976469028223774e-05, + "loss": 1.2896, + "step": 6584 + }, + { + "epoch": 0.15358287148658772, + "grad_norm": 1.926819086074829, + "learning_rate": 1.9976451761399465e-05, + "loss": 1.3485, + "step": 6585 + }, + { + "epoch": 0.15360619462576566, + "grad_norm": 1.990258812904358, + "learning_rate": 1.9976434488249822e-05, + "loss": 1.2535, + "step": 6586 + }, + { + "epoch": 0.15362951776494357, + "grad_norm": 1.6320831775665283, + "learning_rate": 1.9976417208774855e-05, + "loss": 1.6234, + "step": 6587 + }, + { + "epoch": 0.15365284090412148, + "grad_norm": 1.3978424072265625, + "learning_rate": 1.9976399922974582e-05, + "loss": 1.2573, + "step": 6588 + }, + { + "epoch": 0.1536761640432994, + "grad_norm": 1.6854643821716309, + "learning_rate": 1.9976382630849005e-05, + "loss": 1.219, + "step": 6589 + }, + { + "epoch": 0.15369948718247733, + "grad_norm": 1.743311882019043, + "learning_rate": 1.9976365332398143e-05, + "loss": 1.2314, + "step": 6590 + }, + { + "epoch": 0.15372281032165525, + "grad_norm": 1.5786190032958984, + "learning_rate": 1.9976348027622e-05, + "loss": 1.6609, + "step": 6591 + }, + { + "epoch": 0.15374613346083316, + "grad_norm": 2.682681083679199, + "learning_rate": 1.9976330716520595e-05, + "loss": 1.3582, + "step": 6592 + }, + { + "epoch": 0.15376945660001107, + "grad_norm": 2.111043930053711, + "learning_rate": 1.997631339909393e-05, + "loss": 1.359, + "step": 6593 + }, + { + "epoch": 0.153792779739189, + "grad_norm": 1.6102101802825928, + "learning_rate": 1.9976296075342025e-05, + "loss": 1.4807, + "step": 6594 + }, + { + "epoch": 0.15381610287836692, + "grad_norm": 1.682159423828125, + "learning_rate": 1.9976278745264883e-05, + "loss": 1.3404, + "step": 6595 + }, + { + "epoch": 0.15383942601754483, + "grad_norm": 1.5754555463790894, + "learning_rate": 1.997626140886252e-05, + "loss": 1.2176, + "step": 6596 + }, + { + "epoch": 0.15386274915672274, + "grad_norm": 1.7442524433135986, + "learning_rate": 1.997624406613494e-05, + "loss": 0.9646, + "step": 6597 + }, + { + "epoch": 0.15388607229590068, + "grad_norm": 2.2065911293029785, + "learning_rate": 1.9976226717082167e-05, + "loss": 1.5093, + "step": 6598 + }, + { + "epoch": 0.1539093954350786, + "grad_norm": 1.2307199239730835, + "learning_rate": 1.99762093617042e-05, + "loss": 0.9137, + "step": 6599 + }, + { + "epoch": 0.1539327185742565, + "grad_norm": 1.7049083709716797, + "learning_rate": 1.997619200000106e-05, + "loss": 1.1474, + "step": 6600 + }, + { + "epoch": 0.1539560417134344, + "grad_norm": 2.1069562435150146, + "learning_rate": 1.9976174631972745e-05, + "loss": 1.3523, + "step": 6601 + }, + { + "epoch": 0.15397936485261235, + "grad_norm": 1.7707322835922241, + "learning_rate": 1.9976157257619276e-05, + "loss": 1.4683, + "step": 6602 + }, + { + "epoch": 0.15400268799179026, + "grad_norm": 2.370839834213257, + "learning_rate": 1.997613987694066e-05, + "loss": 1.2749, + "step": 6603 + }, + { + "epoch": 0.15402601113096817, + "grad_norm": 1.7016270160675049, + "learning_rate": 1.9976122489936915e-05, + "loss": 1.1921, + "step": 6604 + }, + { + "epoch": 0.15404933427014608, + "grad_norm": 2.3219470977783203, + "learning_rate": 1.9976105096608038e-05, + "loss": 1.3731, + "step": 6605 + }, + { + "epoch": 0.15407265740932402, + "grad_norm": 1.5784168243408203, + "learning_rate": 1.9976087696954055e-05, + "loss": 1.0343, + "step": 6606 + }, + { + "epoch": 0.15409598054850193, + "grad_norm": 1.608054280281067, + "learning_rate": 1.997607029097497e-05, + "loss": 1.4794, + "step": 6607 + }, + { + "epoch": 0.15411930368767984, + "grad_norm": 2.4747016429901123, + "learning_rate": 1.9976052878670792e-05, + "loss": 1.5144, + "step": 6608 + }, + { + "epoch": 0.15414262682685775, + "grad_norm": 1.7735477685928345, + "learning_rate": 1.9976035460041533e-05, + "loss": 1.2392, + "step": 6609 + }, + { + "epoch": 0.15416594996603566, + "grad_norm": 2.03709077835083, + "learning_rate": 1.997601803508721e-05, + "loss": 1.3911, + "step": 6610 + }, + { + "epoch": 0.1541892731052136, + "grad_norm": 1.6344530582427979, + "learning_rate": 1.9976000603807826e-05, + "loss": 1.3024, + "step": 6611 + }, + { + "epoch": 0.15421259624439151, + "grad_norm": 2.1996278762817383, + "learning_rate": 1.99759831662034e-05, + "loss": 1.5863, + "step": 6612 + }, + { + "epoch": 0.15423591938356943, + "grad_norm": 1.6755666732788086, + "learning_rate": 1.9975965722273933e-05, + "loss": 1.509, + "step": 6613 + }, + { + "epoch": 0.15425924252274734, + "grad_norm": 1.8049845695495605, + "learning_rate": 1.9975948272019448e-05, + "loss": 1.4801, + "step": 6614 + }, + { + "epoch": 0.15428256566192527, + "grad_norm": 1.7644851207733154, + "learning_rate": 1.9975930815439945e-05, + "loss": 1.489, + "step": 6615 + }, + { + "epoch": 0.15430588880110319, + "grad_norm": 1.8081635236740112, + "learning_rate": 1.997591335253544e-05, + "loss": 1.7064, + "step": 6616 + }, + { + "epoch": 0.1543292119402811, + "grad_norm": 1.7060184478759766, + "learning_rate": 1.9975895883305947e-05, + "loss": 1.4251, + "step": 6617 + }, + { + "epoch": 0.154352535079459, + "grad_norm": 1.5712186098098755, + "learning_rate": 1.9975878407751476e-05, + "loss": 1.5077, + "step": 6618 + }, + { + "epoch": 0.15437585821863695, + "grad_norm": 1.7551257610321045, + "learning_rate": 1.997586092587203e-05, + "loss": 1.5307, + "step": 6619 + }, + { + "epoch": 0.15439918135781486, + "grad_norm": 1.6834040880203247, + "learning_rate": 1.997584343766763e-05, + "loss": 1.4455, + "step": 6620 + }, + { + "epoch": 0.15442250449699277, + "grad_norm": 1.510616421699524, + "learning_rate": 1.9975825943138286e-05, + "loss": 1.3996, + "step": 6621 + }, + { + "epoch": 0.15444582763617068, + "grad_norm": 2.5418965816497803, + "learning_rate": 1.9975808442283998e-05, + "loss": 1.3904, + "step": 6622 + }, + { + "epoch": 0.15446915077534862, + "grad_norm": 2.0235586166381836, + "learning_rate": 1.9975790935104796e-05, + "loss": 1.7278, + "step": 6623 + }, + { + "epoch": 0.15449247391452653, + "grad_norm": 1.45208740234375, + "learning_rate": 1.9975773421600674e-05, + "loss": 0.8894, + "step": 6624 + }, + { + "epoch": 0.15451579705370444, + "grad_norm": 1.8872328996658325, + "learning_rate": 1.9975755901771648e-05, + "loss": 1.4595, + "step": 6625 + }, + { + "epoch": 0.15453912019288235, + "grad_norm": 1.7560652494430542, + "learning_rate": 1.9975738375617736e-05, + "loss": 1.1951, + "step": 6626 + }, + { + "epoch": 0.1545624433320603, + "grad_norm": 1.8685259819030762, + "learning_rate": 1.997572084313894e-05, + "loss": 1.4705, + "step": 6627 + }, + { + "epoch": 0.1545857664712382, + "grad_norm": 1.4958946704864502, + "learning_rate": 1.997570330433528e-05, + "loss": 1.6251, + "step": 6628 + }, + { + "epoch": 0.1546090896104161, + "grad_norm": 2.04071044921875, + "learning_rate": 1.997568575920676e-05, + "loss": 1.7738, + "step": 6629 + }, + { + "epoch": 0.15463241274959402, + "grad_norm": 2.07562518119812, + "learning_rate": 1.9975668207753392e-05, + "loss": 1.5428, + "step": 6630 + }, + { + "epoch": 0.15465573588877196, + "grad_norm": 1.6336381435394287, + "learning_rate": 1.997565064997519e-05, + "loss": 1.353, + "step": 6631 + }, + { + "epoch": 0.15467905902794987, + "grad_norm": 2.258463144302368, + "learning_rate": 1.997563308587216e-05, + "loss": 1.4666, + "step": 6632 + }, + { + "epoch": 0.15470238216712778, + "grad_norm": 2.67533540725708, + "learning_rate": 1.997561551544432e-05, + "loss": 1.6113, + "step": 6633 + }, + { + "epoch": 0.1547257053063057, + "grad_norm": 1.5503212213516235, + "learning_rate": 1.9975597938691678e-05, + "loss": 1.3946, + "step": 6634 + }, + { + "epoch": 0.15474902844548363, + "grad_norm": 2.220231056213379, + "learning_rate": 1.9975580355614244e-05, + "loss": 1.0386, + "step": 6635 + }, + { + "epoch": 0.15477235158466154, + "grad_norm": 1.5641748905181885, + "learning_rate": 1.997556276621203e-05, + "loss": 1.2402, + "step": 6636 + }, + { + "epoch": 0.15479567472383945, + "grad_norm": 1.6891651153564453, + "learning_rate": 1.9975545170485047e-05, + "loss": 1.1679, + "step": 6637 + }, + { + "epoch": 0.15481899786301737, + "grad_norm": 1.9122427701950073, + "learning_rate": 1.9975527568433307e-05, + "loss": 1.1499, + "step": 6638 + }, + { + "epoch": 0.15484232100219528, + "grad_norm": 2.0877859592437744, + "learning_rate": 1.997550996005682e-05, + "loss": 1.4607, + "step": 6639 + }, + { + "epoch": 0.15486564414137322, + "grad_norm": 1.6109565496444702, + "learning_rate": 1.99754923453556e-05, + "loss": 1.3616, + "step": 6640 + }, + { + "epoch": 0.15488896728055113, + "grad_norm": 1.7341620922088623, + "learning_rate": 1.9975474724329655e-05, + "loss": 1.2823, + "step": 6641 + }, + { + "epoch": 0.15491229041972904, + "grad_norm": 2.1587436199188232, + "learning_rate": 1.9975457096978995e-05, + "loss": 1.3406, + "step": 6642 + }, + { + "epoch": 0.15493561355890695, + "grad_norm": 1.6364099979400635, + "learning_rate": 1.9975439463303636e-05, + "loss": 1.3457, + "step": 6643 + }, + { + "epoch": 0.1549589366980849, + "grad_norm": 1.8146032094955444, + "learning_rate": 1.9975421823303586e-05, + "loss": 1.3537, + "step": 6644 + }, + { + "epoch": 0.1549822598372628, + "grad_norm": 1.84891676902771, + "learning_rate": 1.9975404176978857e-05, + "loss": 1.7385, + "step": 6645 + }, + { + "epoch": 0.1550055829764407, + "grad_norm": 1.7921994924545288, + "learning_rate": 1.9975386524329458e-05, + "loss": 1.528, + "step": 6646 + }, + { + "epoch": 0.15502890611561862, + "grad_norm": 1.7305406332015991, + "learning_rate": 1.9975368865355403e-05, + "loss": 1.408, + "step": 6647 + }, + { + "epoch": 0.15505222925479656, + "grad_norm": 2.0734448432922363, + "learning_rate": 1.99753512000567e-05, + "loss": 1.695, + "step": 6648 + }, + { + "epoch": 0.15507555239397447, + "grad_norm": 2.416996717453003, + "learning_rate": 1.9975333528433368e-05, + "loss": 1.6213, + "step": 6649 + }, + { + "epoch": 0.15509887553315238, + "grad_norm": 2.126291275024414, + "learning_rate": 1.997531585048541e-05, + "loss": 1.2444, + "step": 6650 + }, + { + "epoch": 0.1551221986723303, + "grad_norm": 1.7532602548599243, + "learning_rate": 1.9975298166212838e-05, + "loss": 1.5105, + "step": 6651 + }, + { + "epoch": 0.15514552181150823, + "grad_norm": 1.6512845754623413, + "learning_rate": 1.9975280475615665e-05, + "loss": 1.4917, + "step": 6652 + }, + { + "epoch": 0.15516884495068614, + "grad_norm": 1.4222108125686646, + "learning_rate": 1.9975262778693903e-05, + "loss": 1.3406, + "step": 6653 + }, + { + "epoch": 0.15519216808986405, + "grad_norm": 1.4187642335891724, + "learning_rate": 1.9975245075447564e-05, + "loss": 1.283, + "step": 6654 + }, + { + "epoch": 0.15521549122904196, + "grad_norm": 1.5749461650848389, + "learning_rate": 1.9975227365876656e-05, + "loss": 1.4405, + "step": 6655 + }, + { + "epoch": 0.1552388143682199, + "grad_norm": 1.9636822938919067, + "learning_rate": 1.9975209649981194e-05, + "loss": 1.6201, + "step": 6656 + }, + { + "epoch": 0.1552621375073978, + "grad_norm": 2.371785879135132, + "learning_rate": 1.9975191927761188e-05, + "loss": 1.3969, + "step": 6657 + }, + { + "epoch": 0.15528546064657572, + "grad_norm": 1.7409937381744385, + "learning_rate": 1.9975174199216644e-05, + "loss": 1.4693, + "step": 6658 + }, + { + "epoch": 0.15530878378575363, + "grad_norm": 1.833670735359192, + "learning_rate": 1.997515646434758e-05, + "loss": 1.6418, + "step": 6659 + }, + { + "epoch": 0.15533210692493157, + "grad_norm": 1.6446765661239624, + "learning_rate": 1.9975138723154005e-05, + "loss": 1.6299, + "step": 6660 + }, + { + "epoch": 0.15535543006410948, + "grad_norm": 1.7893515825271606, + "learning_rate": 1.997512097563593e-05, + "loss": 1.2745, + "step": 6661 + }, + { + "epoch": 0.1553787532032874, + "grad_norm": 2.5941355228424072, + "learning_rate": 1.9975103221793368e-05, + "loss": 1.3054, + "step": 6662 + }, + { + "epoch": 0.1554020763424653, + "grad_norm": 1.934799313545227, + "learning_rate": 1.9975085461626326e-05, + "loss": 1.3949, + "step": 6663 + }, + { + "epoch": 0.15542539948164324, + "grad_norm": 1.6852493286132812, + "learning_rate": 1.997506769513482e-05, + "loss": 1.6524, + "step": 6664 + }, + { + "epoch": 0.15544872262082116, + "grad_norm": 2.0462136268615723, + "learning_rate": 1.997504992231886e-05, + "loss": 1.1957, + "step": 6665 + }, + { + "epoch": 0.15547204575999907, + "grad_norm": 1.5753675699234009, + "learning_rate": 1.9975032143178453e-05, + "loss": 1.3169, + "step": 6666 + }, + { + "epoch": 0.15549536889917698, + "grad_norm": 2.026578426361084, + "learning_rate": 1.9975014357713617e-05, + "loss": 1.4244, + "step": 6667 + }, + { + "epoch": 0.1555186920383549, + "grad_norm": 2.166987895965576, + "learning_rate": 1.9974996565924356e-05, + "loss": 1.5595, + "step": 6668 + }, + { + "epoch": 0.15554201517753283, + "grad_norm": 1.8792282342910767, + "learning_rate": 1.997497876781069e-05, + "loss": 1.3776, + "step": 6669 + }, + { + "epoch": 0.15556533831671074, + "grad_norm": 1.8747671842575073, + "learning_rate": 1.9974960963372623e-05, + "loss": 1.3708, + "step": 6670 + }, + { + "epoch": 0.15558866145588865, + "grad_norm": 2.0838735103607178, + "learning_rate": 1.9974943152610165e-05, + "loss": 1.2766, + "step": 6671 + }, + { + "epoch": 0.15561198459506656, + "grad_norm": 1.746366262435913, + "learning_rate": 1.997492533552334e-05, + "loss": 1.0505, + "step": 6672 + }, + { + "epoch": 0.1556353077342445, + "grad_norm": 1.5280516147613525, + "learning_rate": 1.9974907512112146e-05, + "loss": 1.3384, + "step": 6673 + }, + { + "epoch": 0.1556586308734224, + "grad_norm": 1.8798584938049316, + "learning_rate": 1.9974889682376597e-05, + "loss": 1.6077, + "step": 6674 + }, + { + "epoch": 0.15568195401260032, + "grad_norm": 1.9050573110580444, + "learning_rate": 1.9974871846316708e-05, + "loss": 1.5385, + "step": 6675 + }, + { + "epoch": 0.15570527715177823, + "grad_norm": 1.3099949359893799, + "learning_rate": 1.9974854003932488e-05, + "loss": 1.1917, + "step": 6676 + }, + { + "epoch": 0.15572860029095617, + "grad_norm": 1.584046721458435, + "learning_rate": 1.997483615522395e-05, + "loss": 1.2905, + "step": 6677 + }, + { + "epoch": 0.15575192343013408, + "grad_norm": 1.6466825008392334, + "learning_rate": 1.9974818300191102e-05, + "loss": 1.4862, + "step": 6678 + }, + { + "epoch": 0.155775246569312, + "grad_norm": 1.3746250867843628, + "learning_rate": 1.9974800438833957e-05, + "loss": 1.5604, + "step": 6679 + }, + { + "epoch": 0.1557985697084899, + "grad_norm": 2.245645761489868, + "learning_rate": 1.997478257115253e-05, + "loss": 1.2202, + "step": 6680 + }, + { + "epoch": 0.15582189284766784, + "grad_norm": 1.7908310890197754, + "learning_rate": 1.9974764697146824e-05, + "loss": 1.3758, + "step": 6681 + }, + { + "epoch": 0.15584521598684575, + "grad_norm": 2.050823211669922, + "learning_rate": 1.997474681681686e-05, + "loss": 1.4698, + "step": 6682 + }, + { + "epoch": 0.15586853912602366, + "grad_norm": 1.971430778503418, + "learning_rate": 1.997472893016264e-05, + "loss": 1.4667, + "step": 6683 + }, + { + "epoch": 0.15589186226520158, + "grad_norm": 1.547082543373108, + "learning_rate": 1.9974711037184184e-05, + "loss": 1.3488, + "step": 6684 + }, + { + "epoch": 0.1559151854043795, + "grad_norm": 1.5321309566497803, + "learning_rate": 1.9974693137881498e-05, + "loss": 1.3336, + "step": 6685 + }, + { + "epoch": 0.15593850854355742, + "grad_norm": 1.6248953342437744, + "learning_rate": 1.9974675232254593e-05, + "loss": 1.0957, + "step": 6686 + }, + { + "epoch": 0.15596183168273534, + "grad_norm": 2.627365827560425, + "learning_rate": 1.9974657320303486e-05, + "loss": 1.2141, + "step": 6687 + }, + { + "epoch": 0.15598515482191325, + "grad_norm": 1.4590063095092773, + "learning_rate": 1.9974639402028183e-05, + "loss": 1.0617, + "step": 6688 + }, + { + "epoch": 0.15600847796109119, + "grad_norm": 1.5410219430923462, + "learning_rate": 1.9974621477428692e-05, + "loss": 1.1891, + "step": 6689 + }, + { + "epoch": 0.1560318011002691, + "grad_norm": 5.158645153045654, + "learning_rate": 1.9974603546505036e-05, + "loss": 1.5829, + "step": 6690 + }, + { + "epoch": 0.156055124239447, + "grad_norm": 1.5512306690216064, + "learning_rate": 1.9974585609257215e-05, + "loss": 1.3996, + "step": 6691 + }, + { + "epoch": 0.15607844737862492, + "grad_norm": 2.5099856853485107, + "learning_rate": 1.9974567665685246e-05, + "loss": 1.4971, + "step": 6692 + }, + { + "epoch": 0.15610177051780286, + "grad_norm": 1.719483733177185, + "learning_rate": 1.997454971578914e-05, + "loss": 1.402, + "step": 6693 + }, + { + "epoch": 0.15612509365698077, + "grad_norm": 1.6442888975143433, + "learning_rate": 1.9974531759568906e-05, + "loss": 1.6019, + "step": 6694 + }, + { + "epoch": 0.15614841679615868, + "grad_norm": 1.8424293994903564, + "learning_rate": 1.9974513797024558e-05, + "loss": 1.5562, + "step": 6695 + }, + { + "epoch": 0.1561717399353366, + "grad_norm": 1.844246506690979, + "learning_rate": 1.9974495828156106e-05, + "loss": 1.5199, + "step": 6696 + }, + { + "epoch": 0.1561950630745145, + "grad_norm": 1.5487847328186035, + "learning_rate": 1.9974477852963564e-05, + "loss": 1.2344, + "step": 6697 + }, + { + "epoch": 0.15621838621369244, + "grad_norm": 2.296854019165039, + "learning_rate": 1.9974459871446942e-05, + "loss": 1.2332, + "step": 6698 + }, + { + "epoch": 0.15624170935287035, + "grad_norm": 1.8721725940704346, + "learning_rate": 1.9974441883606243e-05, + "loss": 1.3737, + "step": 6699 + }, + { + "epoch": 0.15626503249204826, + "grad_norm": 1.7151719331741333, + "learning_rate": 1.9974423889441494e-05, + "loss": 1.173, + "step": 6700 + }, + { + "epoch": 0.15628835563122617, + "grad_norm": 1.461303472518921, + "learning_rate": 1.9974405888952696e-05, + "loss": 1.2968, + "step": 6701 + }, + { + "epoch": 0.1563116787704041, + "grad_norm": 1.794656753540039, + "learning_rate": 1.997438788213986e-05, + "loss": 1.6953, + "step": 6702 + }, + { + "epoch": 0.15633500190958202, + "grad_norm": 1.8690533638000488, + "learning_rate": 1.9974369869003e-05, + "loss": 1.4083, + "step": 6703 + }, + { + "epoch": 0.15635832504875993, + "grad_norm": 1.635138988494873, + "learning_rate": 1.9974351849542132e-05, + "loss": 1.3456, + "step": 6704 + }, + { + "epoch": 0.15638164818793784, + "grad_norm": 1.587601900100708, + "learning_rate": 1.997433382375726e-05, + "loss": 0.9619, + "step": 6705 + }, + { + "epoch": 0.15640497132711578, + "grad_norm": 1.7306455373764038, + "learning_rate": 1.99743157916484e-05, + "loss": 1.1387, + "step": 6706 + }, + { + "epoch": 0.1564282944662937, + "grad_norm": 1.6445868015289307, + "learning_rate": 1.9974297753215562e-05, + "loss": 1.4319, + "step": 6707 + }, + { + "epoch": 0.1564516176054716, + "grad_norm": 1.7855867147445679, + "learning_rate": 1.9974279708458756e-05, + "loss": 1.5157, + "step": 6708 + }, + { + "epoch": 0.15647494074464952, + "grad_norm": 1.8095883131027222, + "learning_rate": 1.9974261657377997e-05, + "loss": 1.3021, + "step": 6709 + }, + { + "epoch": 0.15649826388382745, + "grad_norm": 1.8344168663024902, + "learning_rate": 1.9974243599973295e-05, + "loss": 1.6525, + "step": 6710 + }, + { + "epoch": 0.15652158702300537, + "grad_norm": 1.8539479970932007, + "learning_rate": 1.9974225536244657e-05, + "loss": 1.1229, + "step": 6711 + }, + { + "epoch": 0.15654491016218328, + "grad_norm": 2.025026559829712, + "learning_rate": 1.99742074661921e-05, + "loss": 1.7504, + "step": 6712 + }, + { + "epoch": 0.1565682333013612, + "grad_norm": 1.597999930381775, + "learning_rate": 1.9974189389815634e-05, + "loss": 1.1692, + "step": 6713 + }, + { + "epoch": 0.15659155644053913, + "grad_norm": 2.394742965698242, + "learning_rate": 1.9974171307115268e-05, + "loss": 1.4318, + "step": 6714 + }, + { + "epoch": 0.15661487957971704, + "grad_norm": 1.6257845163345337, + "learning_rate": 1.997415321809102e-05, + "loss": 1.4437, + "step": 6715 + }, + { + "epoch": 0.15663820271889495, + "grad_norm": 1.5955930948257446, + "learning_rate": 1.9974135122742892e-05, + "loss": 1.344, + "step": 6716 + }, + { + "epoch": 0.15666152585807286, + "grad_norm": 1.8653430938720703, + "learning_rate": 1.99741170210709e-05, + "loss": 1.3607, + "step": 6717 + }, + { + "epoch": 0.1566848489972508, + "grad_norm": 1.9283877611160278, + "learning_rate": 1.997409891307506e-05, + "loss": 1.1396, + "step": 6718 + }, + { + "epoch": 0.1567081721364287, + "grad_norm": 1.6683522462844849, + "learning_rate": 1.9974080798755374e-05, + "loss": 1.2991, + "step": 6719 + }, + { + "epoch": 0.15673149527560662, + "grad_norm": 2.015848398208618, + "learning_rate": 1.9974062678111865e-05, + "loss": 1.5503, + "step": 6720 + }, + { + "epoch": 0.15675481841478453, + "grad_norm": 1.7482249736785889, + "learning_rate": 1.9974044551144534e-05, + "loss": 1.2537, + "step": 6721 + }, + { + "epoch": 0.15677814155396247, + "grad_norm": 1.6192493438720703, + "learning_rate": 1.99740264178534e-05, + "loss": 1.4485, + "step": 6722 + }, + { + "epoch": 0.15680146469314038, + "grad_norm": 1.7025721073150635, + "learning_rate": 1.9974008278238472e-05, + "loss": 1.2134, + "step": 6723 + }, + { + "epoch": 0.1568247878323183, + "grad_norm": 1.9277011156082153, + "learning_rate": 1.9973990132299758e-05, + "loss": 1.6158, + "step": 6724 + }, + { + "epoch": 0.1568481109714962, + "grad_norm": 1.7736979722976685, + "learning_rate": 1.9973971980037275e-05, + "loss": 1.6605, + "step": 6725 + }, + { + "epoch": 0.1568714341106741, + "grad_norm": 1.7737277746200562, + "learning_rate": 1.9973953821451028e-05, + "loss": 1.0986, + "step": 6726 + }, + { + "epoch": 0.15689475724985205, + "grad_norm": 1.5077496767044067, + "learning_rate": 1.9973935656541036e-05, + "loss": 1.4735, + "step": 6727 + }, + { + "epoch": 0.15691808038902996, + "grad_norm": 1.6117923259735107, + "learning_rate": 1.997391748530731e-05, + "loss": 1.6942, + "step": 6728 + }, + { + "epoch": 0.15694140352820787, + "grad_norm": 1.5320708751678467, + "learning_rate": 1.997389930774985e-05, + "loss": 1.3674, + "step": 6729 + }, + { + "epoch": 0.15696472666738578, + "grad_norm": 1.4641287326812744, + "learning_rate": 1.9973881123868682e-05, + "loss": 1.3591, + "step": 6730 + }, + { + "epoch": 0.15698804980656372, + "grad_norm": 1.7848944664001465, + "learning_rate": 1.997386293366381e-05, + "loss": 1.2286, + "step": 6731 + }, + { + "epoch": 0.15701137294574163, + "grad_norm": 2.297985553741455, + "learning_rate": 1.9973844737135246e-05, + "loss": 1.0139, + "step": 6732 + }, + { + "epoch": 0.15703469608491955, + "grad_norm": 1.5454133749008179, + "learning_rate": 1.9973826534283002e-05, + "loss": 1.4371, + "step": 6733 + }, + { + "epoch": 0.15705801922409746, + "grad_norm": 2.0432276725769043, + "learning_rate": 1.9973808325107092e-05, + "loss": 1.7083, + "step": 6734 + }, + { + "epoch": 0.1570813423632754, + "grad_norm": 2.0467238426208496, + "learning_rate": 1.9973790109607525e-05, + "loss": 1.3067, + "step": 6735 + }, + { + "epoch": 0.1571046655024533, + "grad_norm": 1.6913917064666748, + "learning_rate": 1.9973771887784315e-05, + "loss": 1.5092, + "step": 6736 + }, + { + "epoch": 0.15712798864163122, + "grad_norm": 1.64970064163208, + "learning_rate": 1.997375365963747e-05, + "loss": 1.3449, + "step": 6737 + }, + { + "epoch": 0.15715131178080913, + "grad_norm": 1.7438068389892578, + "learning_rate": 1.9973735425167003e-05, + "loss": 1.3812, + "step": 6738 + }, + { + "epoch": 0.15717463491998707, + "grad_norm": 1.8489435911178589, + "learning_rate": 1.9973717184372928e-05, + "loss": 1.2187, + "step": 6739 + }, + { + "epoch": 0.15719795805916498, + "grad_norm": 1.8721107244491577, + "learning_rate": 1.9973698937255254e-05, + "loss": 1.1979, + "step": 6740 + }, + { + "epoch": 0.1572212811983429, + "grad_norm": 1.6620566844940186, + "learning_rate": 1.997368068381399e-05, + "loss": 1.4267, + "step": 6741 + }, + { + "epoch": 0.1572446043375208, + "grad_norm": 1.8906793594360352, + "learning_rate": 1.9973662424049153e-05, + "loss": 1.9623, + "step": 6742 + }, + { + "epoch": 0.15726792747669874, + "grad_norm": 1.9150793552398682, + "learning_rate": 1.9973644157960754e-05, + "loss": 1.1983, + "step": 6743 + }, + { + "epoch": 0.15729125061587665, + "grad_norm": 2.782684564590454, + "learning_rate": 1.99736258855488e-05, + "loss": 1.2631, + "step": 6744 + }, + { + "epoch": 0.15731457375505456, + "grad_norm": 1.6907206773757935, + "learning_rate": 1.9973607606813303e-05, + "loss": 0.9995, + "step": 6745 + }, + { + "epoch": 0.15733789689423247, + "grad_norm": 1.882422924041748, + "learning_rate": 1.997358932175428e-05, + "loss": 1.2082, + "step": 6746 + }, + { + "epoch": 0.1573612200334104, + "grad_norm": 2.3805737495422363, + "learning_rate": 1.997357103037174e-05, + "loss": 1.2265, + "step": 6747 + }, + { + "epoch": 0.15738454317258832, + "grad_norm": 1.754375696182251, + "learning_rate": 1.997355273266569e-05, + "loss": 1.4863, + "step": 6748 + }, + { + "epoch": 0.15740786631176623, + "grad_norm": 1.7270593643188477, + "learning_rate": 1.997353442863615e-05, + "loss": 1.1557, + "step": 6749 + }, + { + "epoch": 0.15743118945094414, + "grad_norm": 2.6254312992095947, + "learning_rate": 1.9973516118283128e-05, + "loss": 1.7985, + "step": 6750 + }, + { + "epoch": 0.15745451259012208, + "grad_norm": 1.692750096321106, + "learning_rate": 1.997349780160663e-05, + "loss": 1.2584, + "step": 6751 + }, + { + "epoch": 0.1574778357293, + "grad_norm": 1.632986068725586, + "learning_rate": 1.997347947860668e-05, + "loss": 1.4841, + "step": 6752 + }, + { + "epoch": 0.1575011588684779, + "grad_norm": 2.3704335689544678, + "learning_rate": 1.9973461149283276e-05, + "loss": 1.2312, + "step": 6753 + }, + { + "epoch": 0.1575244820076558, + "grad_norm": 1.8180193901062012, + "learning_rate": 1.997344281363644e-05, + "loss": 1.4821, + "step": 6754 + }, + { + "epoch": 0.15754780514683372, + "grad_norm": 1.671475887298584, + "learning_rate": 1.9973424471666177e-05, + "loss": 1.4956, + "step": 6755 + }, + { + "epoch": 0.15757112828601166, + "grad_norm": 1.8579593896865845, + "learning_rate": 1.99734061233725e-05, + "loss": 1.3269, + "step": 6756 + }, + { + "epoch": 0.15759445142518957, + "grad_norm": 1.7671566009521484, + "learning_rate": 1.9973387768755423e-05, + "loss": 1.312, + "step": 6757 + }, + { + "epoch": 0.15761777456436749, + "grad_norm": 2.11686110496521, + "learning_rate": 1.997336940781496e-05, + "loss": 1.5192, + "step": 6758 + }, + { + "epoch": 0.1576410977035454, + "grad_norm": 1.9094016551971436, + "learning_rate": 1.9973351040551113e-05, + "loss": 1.6546, + "step": 6759 + }, + { + "epoch": 0.15766442084272334, + "grad_norm": 1.628429651260376, + "learning_rate": 1.9973332666963902e-05, + "loss": 1.3838, + "step": 6760 + }, + { + "epoch": 0.15768774398190125, + "grad_norm": 1.7725311517715454, + "learning_rate": 1.9973314287053336e-05, + "loss": 1.5766, + "step": 6761 + }, + { + "epoch": 0.15771106712107916, + "grad_norm": 1.5493593215942383, + "learning_rate": 1.9973295900819428e-05, + "loss": 1.0989, + "step": 6762 + }, + { + "epoch": 0.15773439026025707, + "grad_norm": 1.6573630571365356, + "learning_rate": 1.9973277508262185e-05, + "loss": 1.4873, + "step": 6763 + }, + { + "epoch": 0.157757713399435, + "grad_norm": 1.6448739767074585, + "learning_rate": 1.9973259109381625e-05, + "loss": 1.4385, + "step": 6764 + }, + { + "epoch": 0.15778103653861292, + "grad_norm": 1.5988082885742188, + "learning_rate": 1.9973240704177757e-05, + "loss": 1.2067, + "step": 6765 + }, + { + "epoch": 0.15780435967779083, + "grad_norm": 1.4983617067337036, + "learning_rate": 1.9973222292650592e-05, + "loss": 1.2972, + "step": 6766 + }, + { + "epoch": 0.15782768281696874, + "grad_norm": 1.949512243270874, + "learning_rate": 1.9973203874800143e-05, + "loss": 1.1035, + "step": 6767 + }, + { + "epoch": 0.15785100595614668, + "grad_norm": 2.0653328895568848, + "learning_rate": 1.997318545062642e-05, + "loss": 1.264, + "step": 6768 + }, + { + "epoch": 0.1578743290953246, + "grad_norm": 1.8771398067474365, + "learning_rate": 1.9973167020129437e-05, + "loss": 1.0905, + "step": 6769 + }, + { + "epoch": 0.1578976522345025, + "grad_norm": 1.3598778247833252, + "learning_rate": 1.99731485833092e-05, + "loss": 1.1124, + "step": 6770 + }, + { + "epoch": 0.1579209753736804, + "grad_norm": 2.103792190551758, + "learning_rate": 1.997313014016573e-05, + "loss": 1.0179, + "step": 6771 + }, + { + "epoch": 0.15794429851285835, + "grad_norm": 2.0136849880218506, + "learning_rate": 1.997311169069903e-05, + "loss": 1.6813, + "step": 6772 + }, + { + "epoch": 0.15796762165203626, + "grad_norm": 2.02883243560791, + "learning_rate": 1.997309323490912e-05, + "loss": 1.2157, + "step": 6773 + }, + { + "epoch": 0.15799094479121417, + "grad_norm": 1.5233128070831299, + "learning_rate": 1.9973074772796004e-05, + "loss": 1.4825, + "step": 6774 + }, + { + "epoch": 0.15801426793039208, + "grad_norm": 2.325742721557617, + "learning_rate": 1.9973056304359697e-05, + "loss": 1.4318, + "step": 6775 + }, + { + "epoch": 0.15803759106957002, + "grad_norm": 1.9791834354400635, + "learning_rate": 1.997303782960021e-05, + "loss": 1.5169, + "step": 6776 + }, + { + "epoch": 0.15806091420874793, + "grad_norm": 1.5982872247695923, + "learning_rate": 1.9973019348517556e-05, + "loss": 1.6131, + "step": 6777 + }, + { + "epoch": 0.15808423734792584, + "grad_norm": 1.770094394683838, + "learning_rate": 1.9973000861111743e-05, + "loss": 1.4003, + "step": 6778 + }, + { + "epoch": 0.15810756048710375, + "grad_norm": 1.7591725587844849, + "learning_rate": 1.997298236738279e-05, + "loss": 1.0755, + "step": 6779 + }, + { + "epoch": 0.1581308836262817, + "grad_norm": 1.9108467102050781, + "learning_rate": 1.9972963867330705e-05, + "loss": 1.576, + "step": 6780 + }, + { + "epoch": 0.1581542067654596, + "grad_norm": 2.162534475326538, + "learning_rate": 1.9972945360955497e-05, + "loss": 1.3765, + "step": 6781 + }, + { + "epoch": 0.15817752990463751, + "grad_norm": 2.1520111560821533, + "learning_rate": 1.997292684825718e-05, + "loss": 1.5708, + "step": 6782 + }, + { + "epoch": 0.15820085304381543, + "grad_norm": 2.040696859359741, + "learning_rate": 1.9972908329235762e-05, + "loss": 1.5354, + "step": 6783 + }, + { + "epoch": 0.15822417618299334, + "grad_norm": 1.828201413154602, + "learning_rate": 1.9972889803891262e-05, + "loss": 1.1399, + "step": 6784 + }, + { + "epoch": 0.15824749932217128, + "grad_norm": 1.7604451179504395, + "learning_rate": 1.9972871272223688e-05, + "loss": 0.8447, + "step": 6785 + }, + { + "epoch": 0.1582708224613492, + "grad_norm": 1.724661946296692, + "learning_rate": 1.9972852734233046e-05, + "loss": 1.4101, + "step": 6786 + }, + { + "epoch": 0.1582941456005271, + "grad_norm": 1.6871005296707153, + "learning_rate": 1.9972834189919362e-05, + "loss": 1.3734, + "step": 6787 + }, + { + "epoch": 0.158317468739705, + "grad_norm": 1.43993079662323, + "learning_rate": 1.9972815639282637e-05, + "loss": 1.1234, + "step": 6788 + }, + { + "epoch": 0.15834079187888295, + "grad_norm": 1.9368919134140015, + "learning_rate": 1.9972797082322883e-05, + "loss": 1.5464, + "step": 6789 + }, + { + "epoch": 0.15836411501806086, + "grad_norm": 2.2056307792663574, + "learning_rate": 1.9972778519040116e-05, + "loss": 1.4798, + "step": 6790 + }, + { + "epoch": 0.15838743815723877, + "grad_norm": 1.7927438020706177, + "learning_rate": 1.9972759949434342e-05, + "loss": 1.3831, + "step": 6791 + }, + { + "epoch": 0.15841076129641668, + "grad_norm": 2.0403501987457275, + "learning_rate": 1.997274137350558e-05, + "loss": 1.0281, + "step": 6792 + }, + { + "epoch": 0.15843408443559462, + "grad_norm": 1.5637606382369995, + "learning_rate": 1.9972722791253835e-05, + "loss": 1.4427, + "step": 6793 + }, + { + "epoch": 0.15845740757477253, + "grad_norm": 1.7798465490341187, + "learning_rate": 1.9972704202679125e-05, + "loss": 1.2417, + "step": 6794 + }, + { + "epoch": 0.15848073071395044, + "grad_norm": 2.513185977935791, + "learning_rate": 1.997268560778146e-05, + "loss": 1.297, + "step": 6795 + }, + { + "epoch": 0.15850405385312835, + "grad_norm": 1.8942391872406006, + "learning_rate": 1.9972667006560845e-05, + "loss": 1.2666, + "step": 6796 + }, + { + "epoch": 0.1585273769923063, + "grad_norm": 1.8632128238677979, + "learning_rate": 1.99726483990173e-05, + "loss": 1.2204, + "step": 6797 + }, + { + "epoch": 0.1585507001314842, + "grad_norm": 1.780616283416748, + "learning_rate": 1.9972629785150833e-05, + "loss": 1.1574, + "step": 6798 + }, + { + "epoch": 0.1585740232706621, + "grad_norm": 1.8262677192687988, + "learning_rate": 1.997261116496146e-05, + "loss": 1.4737, + "step": 6799 + }, + { + "epoch": 0.15859734640984002, + "grad_norm": 1.8839592933654785, + "learning_rate": 1.997259253844919e-05, + "loss": 1.1453, + "step": 6800 + }, + { + "epoch": 0.15862066954901796, + "grad_norm": 1.4337824583053589, + "learning_rate": 1.9972573905614032e-05, + "loss": 1.277, + "step": 6801 + }, + { + "epoch": 0.15864399268819587, + "grad_norm": 1.5538688898086548, + "learning_rate": 1.9972555266456002e-05, + "loss": 1.0072, + "step": 6802 + }, + { + "epoch": 0.15866731582737378, + "grad_norm": 1.690869688987732, + "learning_rate": 1.9972536620975108e-05, + "loss": 1.537, + "step": 6803 + }, + { + "epoch": 0.1586906389665517, + "grad_norm": 1.7880688905715942, + "learning_rate": 1.9972517969171363e-05, + "loss": 1.5519, + "step": 6804 + }, + { + "epoch": 0.15871396210572963, + "grad_norm": 2.035653829574585, + "learning_rate": 1.9972499311044782e-05, + "loss": 0.8677, + "step": 6805 + }, + { + "epoch": 0.15873728524490754, + "grad_norm": 2.0863165855407715, + "learning_rate": 1.9972480646595374e-05, + "loss": 1.4978, + "step": 6806 + }, + { + "epoch": 0.15876060838408546, + "grad_norm": 1.6389529705047607, + "learning_rate": 1.997246197582315e-05, + "loss": 1.2295, + "step": 6807 + }, + { + "epoch": 0.15878393152326337, + "grad_norm": 3.292201280593872, + "learning_rate": 1.997244329872813e-05, + "loss": 1.4913, + "step": 6808 + }, + { + "epoch": 0.1588072546624413, + "grad_norm": 1.437779426574707, + "learning_rate": 1.9972424615310314e-05, + "loss": 1.3202, + "step": 6809 + }, + { + "epoch": 0.15883057780161922, + "grad_norm": 1.4648094177246094, + "learning_rate": 1.997240592556972e-05, + "loss": 1.3008, + "step": 6810 + }, + { + "epoch": 0.15885390094079713, + "grad_norm": 1.4451961517333984, + "learning_rate": 1.9972387229506355e-05, + "loss": 1.6112, + "step": 6811 + }, + { + "epoch": 0.15887722407997504, + "grad_norm": 1.4557857513427734, + "learning_rate": 1.997236852712024e-05, + "loss": 1.402, + "step": 6812 + }, + { + "epoch": 0.15890054721915295, + "grad_norm": 1.776967167854309, + "learning_rate": 1.9972349818411378e-05, + "loss": 0.9228, + "step": 6813 + }, + { + "epoch": 0.1589238703583309, + "grad_norm": 2.00443696975708, + "learning_rate": 1.9972331103379785e-05, + "loss": 1.5063, + "step": 6814 + }, + { + "epoch": 0.1589471934975088, + "grad_norm": 1.43555748462677, + "learning_rate": 1.9972312382025474e-05, + "loss": 1.4547, + "step": 6815 + }, + { + "epoch": 0.1589705166366867, + "grad_norm": 1.5375853776931763, + "learning_rate": 1.9972293654348455e-05, + "loss": 1.1662, + "step": 6816 + }, + { + "epoch": 0.15899383977586462, + "grad_norm": 1.8221385478973389, + "learning_rate": 1.997227492034874e-05, + "loss": 1.6261, + "step": 6817 + }, + { + "epoch": 0.15901716291504256, + "grad_norm": 2.4301693439483643, + "learning_rate": 1.9972256180026338e-05, + "loss": 1.2217, + "step": 6818 + }, + { + "epoch": 0.15904048605422047, + "grad_norm": 1.7987867593765259, + "learning_rate": 1.9972237433381266e-05, + "loss": 1.4584, + "step": 6819 + }, + { + "epoch": 0.15906380919339838, + "grad_norm": 1.622259497642517, + "learning_rate": 1.9972218680413534e-05, + "loss": 1.3583, + "step": 6820 + }, + { + "epoch": 0.1590871323325763, + "grad_norm": 1.935543179512024, + "learning_rate": 1.9972199921123152e-05, + "loss": 1.5266, + "step": 6821 + }, + { + "epoch": 0.15911045547175423, + "grad_norm": 1.5773462057113647, + "learning_rate": 1.9972181155510136e-05, + "loss": 1.2908, + "step": 6822 + }, + { + "epoch": 0.15913377861093214, + "grad_norm": 1.8542346954345703, + "learning_rate": 1.997216238357449e-05, + "loss": 1.2901, + "step": 6823 + }, + { + "epoch": 0.15915710175011005, + "grad_norm": 1.7497351169586182, + "learning_rate": 1.9972143605316235e-05, + "loss": 1.7695, + "step": 6824 + }, + { + "epoch": 0.15918042488928796, + "grad_norm": 2.3112666606903076, + "learning_rate": 1.997212482073538e-05, + "loss": 1.4925, + "step": 6825 + }, + { + "epoch": 0.1592037480284659, + "grad_norm": 1.9577590227127075, + "learning_rate": 1.9972106029831936e-05, + "loss": 1.3851, + "step": 6826 + }, + { + "epoch": 0.1592270711676438, + "grad_norm": 1.6315211057662964, + "learning_rate": 1.9972087232605916e-05, + "loss": 1.2738, + "step": 6827 + }, + { + "epoch": 0.15925039430682172, + "grad_norm": 2.0599303245544434, + "learning_rate": 1.9972068429057327e-05, + "loss": 1.8131, + "step": 6828 + }, + { + "epoch": 0.15927371744599964, + "grad_norm": 1.9331268072128296, + "learning_rate": 1.9972049619186186e-05, + "loss": 1.4477, + "step": 6829 + }, + { + "epoch": 0.15929704058517757, + "grad_norm": 1.6079254150390625, + "learning_rate": 1.9972030802992506e-05, + "loss": 1.2436, + "step": 6830 + }, + { + "epoch": 0.15932036372435548, + "grad_norm": 1.500957727432251, + "learning_rate": 1.9972011980476294e-05, + "loss": 1.6946, + "step": 6831 + }, + { + "epoch": 0.1593436868635334, + "grad_norm": 1.7917413711547852, + "learning_rate": 1.9971993151637568e-05, + "loss": 1.3268, + "step": 6832 + }, + { + "epoch": 0.1593670100027113, + "grad_norm": 1.9288430213928223, + "learning_rate": 1.9971974316476334e-05, + "loss": 1.4615, + "step": 6833 + }, + { + "epoch": 0.15939033314188925, + "grad_norm": 1.9726512432098389, + "learning_rate": 1.9971955474992605e-05, + "loss": 1.7005, + "step": 6834 + }, + { + "epoch": 0.15941365628106716, + "grad_norm": 2.108999729156494, + "learning_rate": 1.9971936627186396e-05, + "loss": 1.2724, + "step": 6835 + }, + { + "epoch": 0.15943697942024507, + "grad_norm": 1.6765875816345215, + "learning_rate": 1.997191777305772e-05, + "loss": 1.4647, + "step": 6836 + }, + { + "epoch": 0.15946030255942298, + "grad_norm": 4.092576503753662, + "learning_rate": 1.997189891260658e-05, + "loss": 1.6062, + "step": 6837 + }, + { + "epoch": 0.1594836256986009, + "grad_norm": 1.579464077949524, + "learning_rate": 1.9971880045833e-05, + "loss": 1.3075, + "step": 6838 + }, + { + "epoch": 0.15950694883777883, + "grad_norm": 1.6589868068695068, + "learning_rate": 1.9971861172736985e-05, + "loss": 1.6071, + "step": 6839 + }, + { + "epoch": 0.15953027197695674, + "grad_norm": 2.050368070602417, + "learning_rate": 1.9971842293318547e-05, + "loss": 1.8545, + "step": 6840 + }, + { + "epoch": 0.15955359511613465, + "grad_norm": 1.9863017797470093, + "learning_rate": 1.9971823407577703e-05, + "loss": 1.4001, + "step": 6841 + }, + { + "epoch": 0.15957691825531256, + "grad_norm": 1.6969603300094604, + "learning_rate": 1.9971804515514455e-05, + "loss": 0.9803, + "step": 6842 + }, + { + "epoch": 0.1596002413944905, + "grad_norm": 2.0984017848968506, + "learning_rate": 1.9971785617128825e-05, + "loss": 1.3447, + "step": 6843 + }, + { + "epoch": 0.1596235645336684, + "grad_norm": 2.058102607727051, + "learning_rate": 1.9971766712420822e-05, + "loss": 1.4627, + "step": 6844 + }, + { + "epoch": 0.15964688767284632, + "grad_norm": 2.0890908241271973, + "learning_rate": 1.9971747801390454e-05, + "loss": 1.7778, + "step": 6845 + }, + { + "epoch": 0.15967021081202423, + "grad_norm": 1.4315153360366821, + "learning_rate": 1.9971728884037737e-05, + "loss": 0.9965, + "step": 6846 + }, + { + "epoch": 0.15969353395120217, + "grad_norm": 2.011157512664795, + "learning_rate": 1.9971709960362685e-05, + "loss": 1.2496, + "step": 6847 + }, + { + "epoch": 0.15971685709038008, + "grad_norm": 1.520325779914856, + "learning_rate": 1.99716910303653e-05, + "loss": 1.3732, + "step": 6848 + }, + { + "epoch": 0.159740180229558, + "grad_norm": 1.5078030824661255, + "learning_rate": 1.9971672094045608e-05, + "loss": 1.2771, + "step": 6849 + }, + { + "epoch": 0.1597635033687359, + "grad_norm": 1.8892250061035156, + "learning_rate": 1.997165315140361e-05, + "loss": 1.3925, + "step": 6850 + }, + { + "epoch": 0.15978682650791384, + "grad_norm": 1.7715742588043213, + "learning_rate": 1.9971634202439326e-05, + "loss": 1.3464, + "step": 6851 + }, + { + "epoch": 0.15981014964709175, + "grad_norm": 1.8896077871322632, + "learning_rate": 1.9971615247152764e-05, + "loss": 1.5632, + "step": 6852 + }, + { + "epoch": 0.15983347278626966, + "grad_norm": 1.5427639484405518, + "learning_rate": 1.997159628554393e-05, + "loss": 1.3368, + "step": 6853 + }, + { + "epoch": 0.15985679592544758, + "grad_norm": 1.8526346683502197, + "learning_rate": 1.9971577317612846e-05, + "loss": 1.3783, + "step": 6854 + }, + { + "epoch": 0.15988011906462551, + "grad_norm": 2.5592474937438965, + "learning_rate": 1.9971558343359522e-05, + "loss": 1.5924, + "step": 6855 + }, + { + "epoch": 0.15990344220380343, + "grad_norm": 1.6828668117523193, + "learning_rate": 1.9971539362783968e-05, + "loss": 1.2389, + "step": 6856 + }, + { + "epoch": 0.15992676534298134, + "grad_norm": 1.9929118156433105, + "learning_rate": 1.9971520375886196e-05, + "loss": 1.5167, + "step": 6857 + }, + { + "epoch": 0.15995008848215925, + "grad_norm": 1.573113203048706, + "learning_rate": 1.9971501382666215e-05, + "loss": 1.2493, + "step": 6858 + }, + { + "epoch": 0.15997341162133719, + "grad_norm": 1.6516765356063843, + "learning_rate": 1.9971482383124045e-05, + "loss": 1.5362, + "step": 6859 + }, + { + "epoch": 0.1599967347605151, + "grad_norm": 1.562059998512268, + "learning_rate": 1.997146337725969e-05, + "loss": 1.4781, + "step": 6860 + }, + { + "epoch": 0.160020057899693, + "grad_norm": 1.6389825344085693, + "learning_rate": 1.9971444365073166e-05, + "loss": 1.036, + "step": 6861 + }, + { + "epoch": 0.16004338103887092, + "grad_norm": 1.910739779472351, + "learning_rate": 1.9971425346564484e-05, + "loss": 1.4821, + "step": 6862 + }, + { + "epoch": 0.16006670417804886, + "grad_norm": 1.9274989366531372, + "learning_rate": 1.9971406321733657e-05, + "loss": 1.5445, + "step": 6863 + }, + { + "epoch": 0.16009002731722677, + "grad_norm": 1.6977241039276123, + "learning_rate": 1.9971387290580698e-05, + "loss": 1.4468, + "step": 6864 + }, + { + "epoch": 0.16011335045640468, + "grad_norm": 1.4958165884017944, + "learning_rate": 1.9971368253105617e-05, + "loss": 1.1568, + "step": 6865 + }, + { + "epoch": 0.1601366735955826, + "grad_norm": 2.2128617763519287, + "learning_rate": 1.9971349209308428e-05, + "loss": 1.4054, + "step": 6866 + }, + { + "epoch": 0.1601599967347605, + "grad_norm": 1.7472902536392212, + "learning_rate": 1.997133015918914e-05, + "loss": 1.4596, + "step": 6867 + }, + { + "epoch": 0.16018331987393844, + "grad_norm": 1.6749449968338013, + "learning_rate": 1.9971311102747766e-05, + "loss": 1.4251, + "step": 6868 + }, + { + "epoch": 0.16020664301311635, + "grad_norm": 2.2691407203674316, + "learning_rate": 1.997129203998432e-05, + "loss": 1.3161, + "step": 6869 + }, + { + "epoch": 0.16022996615229426, + "grad_norm": 2.4141719341278076, + "learning_rate": 1.9971272970898813e-05, + "loss": 1.8045, + "step": 6870 + }, + { + "epoch": 0.16025328929147217, + "grad_norm": 1.6759750843048096, + "learning_rate": 1.997125389549126e-05, + "loss": 1.5677, + "step": 6871 + }, + { + "epoch": 0.1602766124306501, + "grad_norm": 1.9316600561141968, + "learning_rate": 1.9971234813761665e-05, + "loss": 1.5186, + "step": 6872 + }, + { + "epoch": 0.16029993556982802, + "grad_norm": 1.5170408487319946, + "learning_rate": 1.997121572571005e-05, + "loss": 1.0636, + "step": 6873 + }, + { + "epoch": 0.16032325870900593, + "grad_norm": 1.6830649375915527, + "learning_rate": 1.997119663133642e-05, + "loss": 1.3046, + "step": 6874 + }, + { + "epoch": 0.16034658184818384, + "grad_norm": 1.7859324216842651, + "learning_rate": 1.997117753064079e-05, + "loss": 1.8614, + "step": 6875 + }, + { + "epoch": 0.16036990498736178, + "grad_norm": 1.696930170059204, + "learning_rate": 1.997115842362317e-05, + "loss": 1.4656, + "step": 6876 + }, + { + "epoch": 0.1603932281265397, + "grad_norm": 2.0053863525390625, + "learning_rate": 1.9971139310283577e-05, + "loss": 1.5921, + "step": 6877 + }, + { + "epoch": 0.1604165512657176, + "grad_norm": 1.8283029794692993, + "learning_rate": 1.9971120190622018e-05, + "loss": 1.7856, + "step": 6878 + }, + { + "epoch": 0.16043987440489552, + "grad_norm": 2.024780035018921, + "learning_rate": 1.9971101064638506e-05, + "loss": 1.3185, + "step": 6879 + }, + { + "epoch": 0.16046319754407345, + "grad_norm": 1.783490777015686, + "learning_rate": 1.9971081932333055e-05, + "loss": 1.5163, + "step": 6880 + }, + { + "epoch": 0.16048652068325137, + "grad_norm": 1.7589796781539917, + "learning_rate": 1.997106279370568e-05, + "loss": 1.6658, + "step": 6881 + }, + { + "epoch": 0.16050984382242928, + "grad_norm": 1.7256914377212524, + "learning_rate": 1.9971043648756385e-05, + "loss": 1.2966, + "step": 6882 + }, + { + "epoch": 0.1605331669616072, + "grad_norm": 1.872182011604309, + "learning_rate": 1.9971024497485186e-05, + "loss": 1.2795, + "step": 6883 + }, + { + "epoch": 0.16055649010078513, + "grad_norm": 2.169407367706299, + "learning_rate": 1.99710053398921e-05, + "loss": 1.2944, + "step": 6884 + }, + { + "epoch": 0.16057981323996304, + "grad_norm": 1.770311713218689, + "learning_rate": 1.997098617597713e-05, + "loss": 1.5595, + "step": 6885 + }, + { + "epoch": 0.16060313637914095, + "grad_norm": 1.7672606706619263, + "learning_rate": 1.9970967005740297e-05, + "loss": 1.2704, + "step": 6886 + }, + { + "epoch": 0.16062645951831886, + "grad_norm": 2.180710554122925, + "learning_rate": 1.9970947829181607e-05, + "loss": 1.6284, + "step": 6887 + }, + { + "epoch": 0.1606497826574968, + "grad_norm": 2.0260558128356934, + "learning_rate": 1.9970928646301072e-05, + "loss": 1.5453, + "step": 6888 + }, + { + "epoch": 0.1606731057966747, + "grad_norm": 1.5749690532684326, + "learning_rate": 1.997090945709871e-05, + "loss": 1.7417, + "step": 6889 + }, + { + "epoch": 0.16069642893585262, + "grad_norm": 3.691758394241333, + "learning_rate": 1.997089026157453e-05, + "loss": 1.5391, + "step": 6890 + }, + { + "epoch": 0.16071975207503053, + "grad_norm": 1.595268726348877, + "learning_rate": 1.997087105972854e-05, + "loss": 1.2346, + "step": 6891 + }, + { + "epoch": 0.16074307521420847, + "grad_norm": 1.6389485597610474, + "learning_rate": 1.9970851851560763e-05, + "loss": 1.3954, + "step": 6892 + }, + { + "epoch": 0.16076639835338638, + "grad_norm": 1.8842037916183472, + "learning_rate": 1.99708326370712e-05, + "loss": 1.5401, + "step": 6893 + }, + { + "epoch": 0.1607897214925643, + "grad_norm": 2.236269950866699, + "learning_rate": 1.9970813416259865e-05, + "loss": 1.6513, + "step": 6894 + }, + { + "epoch": 0.1608130446317422, + "grad_norm": 1.9487333297729492, + "learning_rate": 1.9970794189126776e-05, + "loss": 1.5055, + "step": 6895 + }, + { + "epoch": 0.1608363677709201, + "grad_norm": 1.7102299928665161, + "learning_rate": 1.9970774955671937e-05, + "loss": 1.471, + "step": 6896 + }, + { + "epoch": 0.16085969091009805, + "grad_norm": 1.9718152284622192, + "learning_rate": 1.997075571589537e-05, + "loss": 1.6506, + "step": 6897 + }, + { + "epoch": 0.16088301404927596, + "grad_norm": 1.6216707229614258, + "learning_rate": 1.997073646979708e-05, + "loss": 1.3616, + "step": 6898 + }, + { + "epoch": 0.16090633718845387, + "grad_norm": 1.7869497537612915, + "learning_rate": 1.997071721737708e-05, + "loss": 1.5377, + "step": 6899 + }, + { + "epoch": 0.16092966032763178, + "grad_norm": 1.7608362436294556, + "learning_rate": 1.9970697958635386e-05, + "loss": 1.3979, + "step": 6900 + }, + { + "epoch": 0.16095298346680972, + "grad_norm": 1.556756615638733, + "learning_rate": 1.9970678693572005e-05, + "loss": 1.2175, + "step": 6901 + }, + { + "epoch": 0.16097630660598763, + "grad_norm": 1.540128469467163, + "learning_rate": 1.9970659422186953e-05, + "loss": 1.3331, + "step": 6902 + }, + { + "epoch": 0.16099962974516555, + "grad_norm": 1.5031625032424927, + "learning_rate": 1.9970640144480244e-05, + "loss": 1.4308, + "step": 6903 + }, + { + "epoch": 0.16102295288434346, + "grad_norm": 1.883215308189392, + "learning_rate": 1.9970620860451887e-05, + "loss": 1.1143, + "step": 6904 + }, + { + "epoch": 0.1610462760235214, + "grad_norm": 2.0445594787597656, + "learning_rate": 1.9970601570101892e-05, + "loss": 1.2949, + "step": 6905 + }, + { + "epoch": 0.1610695991626993, + "grad_norm": 2.028017044067383, + "learning_rate": 1.997058227343027e-05, + "loss": 1.3578, + "step": 6906 + }, + { + "epoch": 0.16109292230187722, + "grad_norm": 2.0462186336517334, + "learning_rate": 1.9970562970437046e-05, + "loss": 1.8628, + "step": 6907 + }, + { + "epoch": 0.16111624544105513, + "grad_norm": 1.7676228284835815, + "learning_rate": 1.9970543661122218e-05, + "loss": 1.3558, + "step": 6908 + }, + { + "epoch": 0.16113956858023307, + "grad_norm": 2.02531099319458, + "learning_rate": 1.9970524345485803e-05, + "loss": 1.0512, + "step": 6909 + }, + { + "epoch": 0.16116289171941098, + "grad_norm": 1.8282703161239624, + "learning_rate": 1.997050502352782e-05, + "loss": 1.3406, + "step": 6910 + }, + { + "epoch": 0.1611862148585889, + "grad_norm": 1.5266894102096558, + "learning_rate": 1.997048569524827e-05, + "loss": 1.4841, + "step": 6911 + }, + { + "epoch": 0.1612095379977668, + "grad_norm": 1.603039026260376, + "learning_rate": 1.9970466360647166e-05, + "loss": 1.2915, + "step": 6912 + }, + { + "epoch": 0.16123286113694474, + "grad_norm": 1.8709169626235962, + "learning_rate": 1.997044701972453e-05, + "loss": 1.6629, + "step": 6913 + }, + { + "epoch": 0.16125618427612265, + "grad_norm": 1.6525893211364746, + "learning_rate": 1.9970427672480367e-05, + "loss": 1.5528, + "step": 6914 + }, + { + "epoch": 0.16127950741530056, + "grad_norm": 1.6227173805236816, + "learning_rate": 1.9970408318914694e-05, + "loss": 1.3298, + "step": 6915 + }, + { + "epoch": 0.16130283055447847, + "grad_norm": 1.8322980403900146, + "learning_rate": 1.997038895902752e-05, + "loss": 1.3597, + "step": 6916 + }, + { + "epoch": 0.1613261536936564, + "grad_norm": 1.7330659627914429, + "learning_rate": 1.9970369592818857e-05, + "loss": 1.8001, + "step": 6917 + }, + { + "epoch": 0.16134947683283432, + "grad_norm": 1.788169026374817, + "learning_rate": 1.9970350220288717e-05, + "loss": 1.5454, + "step": 6918 + }, + { + "epoch": 0.16137279997201223, + "grad_norm": 1.954914927482605, + "learning_rate": 1.9970330841437112e-05, + "loss": 1.3778, + "step": 6919 + }, + { + "epoch": 0.16139612311119014, + "grad_norm": 1.874538779258728, + "learning_rate": 1.9970311456264056e-05, + "loss": 1.3965, + "step": 6920 + }, + { + "epoch": 0.16141944625036808, + "grad_norm": 1.6227762699127197, + "learning_rate": 1.9970292064769563e-05, + "loss": 1.3139, + "step": 6921 + }, + { + "epoch": 0.161442769389546, + "grad_norm": 1.9832369089126587, + "learning_rate": 1.9970272666953642e-05, + "loss": 1.3209, + "step": 6922 + }, + { + "epoch": 0.1614660925287239, + "grad_norm": 1.8870136737823486, + "learning_rate": 1.9970253262816304e-05, + "loss": 1.2687, + "step": 6923 + }, + { + "epoch": 0.16148941566790181, + "grad_norm": 1.919926643371582, + "learning_rate": 1.9970233852357567e-05, + "loss": 1.2895, + "step": 6924 + }, + { + "epoch": 0.16151273880707973, + "grad_norm": 1.4992543458938599, + "learning_rate": 1.997021443557744e-05, + "loss": 1.0926, + "step": 6925 + }, + { + "epoch": 0.16153606194625766, + "grad_norm": 1.596825361251831, + "learning_rate": 1.997019501247593e-05, + "loss": 1.2567, + "step": 6926 + }, + { + "epoch": 0.16155938508543558, + "grad_norm": 1.5956554412841797, + "learning_rate": 1.997017558305306e-05, + "loss": 1.4354, + "step": 6927 + }, + { + "epoch": 0.16158270822461349, + "grad_norm": 1.9804601669311523, + "learning_rate": 1.997015614730884e-05, + "loss": 1.5303, + "step": 6928 + }, + { + "epoch": 0.1616060313637914, + "grad_norm": 2.4778785705566406, + "learning_rate": 1.9970136705243273e-05, + "loss": 1.3828, + "step": 6929 + }, + { + "epoch": 0.16162935450296934, + "grad_norm": 1.7353850603103638, + "learning_rate": 1.9970117256856375e-05, + "loss": 1.591, + "step": 6930 + }, + { + "epoch": 0.16165267764214725, + "grad_norm": 1.7676527500152588, + "learning_rate": 1.997009780214817e-05, + "loss": 1.1895, + "step": 6931 + }, + { + "epoch": 0.16167600078132516, + "grad_norm": 1.8275247812271118, + "learning_rate": 1.9970078341118654e-05, + "loss": 1.2562, + "step": 6932 + }, + { + "epoch": 0.16169932392050307, + "grad_norm": 2.3520774841308594, + "learning_rate": 1.9970058873767847e-05, + "loss": 1.6699, + "step": 6933 + }, + { + "epoch": 0.161722647059681, + "grad_norm": 2.107851266860962, + "learning_rate": 1.9970039400095766e-05, + "loss": 1.5269, + "step": 6934 + }, + { + "epoch": 0.16174597019885892, + "grad_norm": 1.653956413269043, + "learning_rate": 1.9970019920102416e-05, + "loss": 1.2963, + "step": 6935 + }, + { + "epoch": 0.16176929333803683, + "grad_norm": 1.7972065210342407, + "learning_rate": 1.9970000433787812e-05, + "loss": 1.3813, + "step": 6936 + }, + { + "epoch": 0.16179261647721474, + "grad_norm": 1.9668611288070679, + "learning_rate": 1.9969980941151964e-05, + "loss": 1.5222, + "step": 6937 + }, + { + "epoch": 0.16181593961639268, + "grad_norm": 2.47537899017334, + "learning_rate": 1.9969961442194885e-05, + "loss": 1.8132, + "step": 6938 + }, + { + "epoch": 0.1618392627555706, + "grad_norm": 1.7840425968170166, + "learning_rate": 1.9969941936916593e-05, + "loss": 1.4882, + "step": 6939 + }, + { + "epoch": 0.1618625858947485, + "grad_norm": 2.087616443634033, + "learning_rate": 1.9969922425317093e-05, + "loss": 1.3951, + "step": 6940 + }, + { + "epoch": 0.1618859090339264, + "grad_norm": 1.9563913345336914, + "learning_rate": 1.99699029073964e-05, + "loss": 1.616, + "step": 6941 + }, + { + "epoch": 0.16190923217310435, + "grad_norm": 1.3666577339172363, + "learning_rate": 1.9969883383154527e-05, + "loss": 1.5317, + "step": 6942 + }, + { + "epoch": 0.16193255531228226, + "grad_norm": 2.185541868209839, + "learning_rate": 1.9969863852591488e-05, + "loss": 1.6775, + "step": 6943 + }, + { + "epoch": 0.16195587845146017, + "grad_norm": 1.76042640209198, + "learning_rate": 1.9969844315707293e-05, + "loss": 1.7692, + "step": 6944 + }, + { + "epoch": 0.16197920159063808, + "grad_norm": 1.7053894996643066, + "learning_rate": 1.9969824772501955e-05, + "loss": 1.4177, + "step": 6945 + }, + { + "epoch": 0.16200252472981602, + "grad_norm": 1.7546114921569824, + "learning_rate": 1.9969805222975487e-05, + "loss": 1.5052, + "step": 6946 + }, + { + "epoch": 0.16202584786899393, + "grad_norm": 1.70600426197052, + "learning_rate": 1.99697856671279e-05, + "loss": 1.339, + "step": 6947 + }, + { + "epoch": 0.16204917100817184, + "grad_norm": 1.6352766752243042, + "learning_rate": 1.9969766104959207e-05, + "loss": 1.1468, + "step": 6948 + }, + { + "epoch": 0.16207249414734975, + "grad_norm": 1.6745156049728394, + "learning_rate": 1.996974653646942e-05, + "loss": 1.2691, + "step": 6949 + }, + { + "epoch": 0.1620958172865277, + "grad_norm": 1.397991418838501, + "learning_rate": 1.9969726961658552e-05, + "loss": 1.2144, + "step": 6950 + }, + { + "epoch": 0.1621191404257056, + "grad_norm": 1.6745747327804565, + "learning_rate": 1.9969707380526616e-05, + "loss": 1.4635, + "step": 6951 + }, + { + "epoch": 0.16214246356488352, + "grad_norm": 1.8930832147598267, + "learning_rate": 1.9969687793073625e-05, + "loss": 1.4523, + "step": 6952 + }, + { + "epoch": 0.16216578670406143, + "grad_norm": 1.7865025997161865, + "learning_rate": 1.9969668199299587e-05, + "loss": 1.6846, + "step": 6953 + }, + { + "epoch": 0.16218910984323934, + "grad_norm": 1.8963770866394043, + "learning_rate": 1.996964859920452e-05, + "loss": 1.142, + "step": 6954 + }, + { + "epoch": 0.16221243298241728, + "grad_norm": 1.569000244140625, + "learning_rate": 1.9969628992788435e-05, + "loss": 1.3626, + "step": 6955 + }, + { + "epoch": 0.1622357561215952, + "grad_norm": 2.0674116611480713, + "learning_rate": 1.996960938005134e-05, + "loss": 1.6715, + "step": 6956 + }, + { + "epoch": 0.1622590792607731, + "grad_norm": 1.974768042564392, + "learning_rate": 1.9969589760993254e-05, + "loss": 1.7881, + "step": 6957 + }, + { + "epoch": 0.162282402399951, + "grad_norm": 1.986040711402893, + "learning_rate": 1.9969570135614184e-05, + "loss": 1.525, + "step": 6958 + }, + { + "epoch": 0.16230572553912895, + "grad_norm": 1.9898048639297485, + "learning_rate": 1.9969550503914147e-05, + "loss": 1.3851, + "step": 6959 + }, + { + "epoch": 0.16232904867830686, + "grad_norm": 1.5382859706878662, + "learning_rate": 1.9969530865893153e-05, + "loss": 1.1379, + "step": 6960 + }, + { + "epoch": 0.16235237181748477, + "grad_norm": 1.9350718259811401, + "learning_rate": 1.9969511221551214e-05, + "loss": 1.3584, + "step": 6961 + }, + { + "epoch": 0.16237569495666268, + "grad_norm": 1.9872678518295288, + "learning_rate": 1.9969491570888342e-05, + "loss": 1.5838, + "step": 6962 + }, + { + "epoch": 0.16239901809584062, + "grad_norm": 1.4732686281204224, + "learning_rate": 1.996947191390455e-05, + "loss": 1.3253, + "step": 6963 + }, + { + "epoch": 0.16242234123501853, + "grad_norm": 1.8161550760269165, + "learning_rate": 1.9969452250599855e-05, + "loss": 1.3767, + "step": 6964 + }, + { + "epoch": 0.16244566437419644, + "grad_norm": 1.8465569019317627, + "learning_rate": 1.9969432580974264e-05, + "loss": 1.3911, + "step": 6965 + }, + { + "epoch": 0.16246898751337435, + "grad_norm": 1.6987483501434326, + "learning_rate": 1.996941290502779e-05, + "loss": 1.5322, + "step": 6966 + }, + { + "epoch": 0.1624923106525523, + "grad_norm": 1.7524677515029907, + "learning_rate": 1.9969393222760446e-05, + "loss": 1.3264, + "step": 6967 + }, + { + "epoch": 0.1625156337917302, + "grad_norm": 1.7579399347305298, + "learning_rate": 1.9969373534172246e-05, + "loss": 1.3732, + "step": 6968 + }, + { + "epoch": 0.1625389569309081, + "grad_norm": 1.7114061117172241, + "learning_rate": 1.9969353839263202e-05, + "loss": 1.5419, + "step": 6969 + }, + { + "epoch": 0.16256228007008602, + "grad_norm": 1.6196529865264893, + "learning_rate": 1.9969334138033324e-05, + "loss": 1.2772, + "step": 6970 + }, + { + "epoch": 0.16258560320926396, + "grad_norm": 1.7949297428131104, + "learning_rate": 1.996931443048263e-05, + "loss": 1.069, + "step": 6971 + }, + { + "epoch": 0.16260892634844187, + "grad_norm": 2.2059574127197266, + "learning_rate": 1.9969294716611126e-05, + "loss": 1.8824, + "step": 6972 + }, + { + "epoch": 0.16263224948761978, + "grad_norm": 1.816112995147705, + "learning_rate": 1.996927499641883e-05, + "loss": 1.3603, + "step": 6973 + }, + { + "epoch": 0.1626555726267977, + "grad_norm": 1.7519638538360596, + "learning_rate": 1.9969255269905747e-05, + "loss": 1.4346, + "step": 6974 + }, + { + "epoch": 0.16267889576597563, + "grad_norm": 1.913529634475708, + "learning_rate": 1.9969235537071897e-05, + "loss": 1.3277, + "step": 6975 + }, + { + "epoch": 0.16270221890515355, + "grad_norm": 1.88670814037323, + "learning_rate": 1.996921579791729e-05, + "loss": 1.3718, + "step": 6976 + }, + { + "epoch": 0.16272554204433146, + "grad_norm": 1.962681531906128, + "learning_rate": 1.996919605244194e-05, + "loss": 1.3515, + "step": 6977 + }, + { + "epoch": 0.16274886518350937, + "grad_norm": 2.084559917449951, + "learning_rate": 1.9969176300645857e-05, + "loss": 1.3992, + "step": 6978 + }, + { + "epoch": 0.1627721883226873, + "grad_norm": 2.1220080852508545, + "learning_rate": 1.9969156542529053e-05, + "loss": 1.2996, + "step": 6979 + }, + { + "epoch": 0.16279551146186522, + "grad_norm": 2.3124918937683105, + "learning_rate": 1.9969136778091543e-05, + "loss": 1.4948, + "step": 6980 + }, + { + "epoch": 0.16281883460104313, + "grad_norm": 2.203763246536255, + "learning_rate": 1.996911700733334e-05, + "loss": 1.4182, + "step": 6981 + }, + { + "epoch": 0.16284215774022104, + "grad_norm": 1.7209028005599976, + "learning_rate": 1.996909723025445e-05, + "loss": 1.1128, + "step": 6982 + }, + { + "epoch": 0.16286548087939895, + "grad_norm": 1.8497819900512695, + "learning_rate": 1.9969077446854896e-05, + "loss": 1.5973, + "step": 6983 + }, + { + "epoch": 0.1628888040185769, + "grad_norm": 1.8451229333877563, + "learning_rate": 1.9969057657134687e-05, + "loss": 1.0865, + "step": 6984 + }, + { + "epoch": 0.1629121271577548, + "grad_norm": 1.8536510467529297, + "learning_rate": 1.996903786109383e-05, + "loss": 1.4085, + "step": 6985 + }, + { + "epoch": 0.1629354502969327, + "grad_norm": 2.1494762897491455, + "learning_rate": 1.996901805873234e-05, + "loss": 1.4014, + "step": 6986 + }, + { + "epoch": 0.16295877343611062, + "grad_norm": 1.5268653631210327, + "learning_rate": 1.996899825005023e-05, + "loss": 1.1855, + "step": 6987 + }, + { + "epoch": 0.16298209657528856, + "grad_norm": 1.681309700012207, + "learning_rate": 1.9968978435047517e-05, + "loss": 1.4547, + "step": 6988 + }, + { + "epoch": 0.16300541971446647, + "grad_norm": 1.62532377243042, + "learning_rate": 1.996895861372421e-05, + "loss": 1.4909, + "step": 6989 + }, + { + "epoch": 0.16302874285364438, + "grad_norm": 1.7449836730957031, + "learning_rate": 1.996893878608032e-05, + "loss": 1.159, + "step": 6990 + }, + { + "epoch": 0.1630520659928223, + "grad_norm": 1.8509869575500488, + "learning_rate": 1.9968918952115862e-05, + "loss": 1.6086, + "step": 6991 + }, + { + "epoch": 0.16307538913200023, + "grad_norm": 1.4284809827804565, + "learning_rate": 1.9968899111830845e-05, + "loss": 1.2295, + "step": 6992 + }, + { + "epoch": 0.16309871227117814, + "grad_norm": 1.9893970489501953, + "learning_rate": 1.9968879265225288e-05, + "loss": 1.9101, + "step": 6993 + }, + { + "epoch": 0.16312203541035605, + "grad_norm": 2.0493311882019043, + "learning_rate": 1.99688594122992e-05, + "loss": 1.2667, + "step": 6994 + }, + { + "epoch": 0.16314535854953396, + "grad_norm": 1.9017201662063599, + "learning_rate": 1.996883955305259e-05, + "loss": 1.8983, + "step": 6995 + }, + { + "epoch": 0.1631686816887119, + "grad_norm": 1.5711759328842163, + "learning_rate": 1.9968819687485475e-05, + "loss": 1.3654, + "step": 6996 + }, + { + "epoch": 0.16319200482788981, + "grad_norm": 1.6462739706039429, + "learning_rate": 1.996879981559787e-05, + "loss": 1.5808, + "step": 6997 + }, + { + "epoch": 0.16321532796706772, + "grad_norm": 1.6169548034667969, + "learning_rate": 1.9968779937389777e-05, + "loss": 1.4737, + "step": 6998 + }, + { + "epoch": 0.16323865110624564, + "grad_norm": 2.1660170555114746, + "learning_rate": 1.9968760052861224e-05, + "loss": 1.9446, + "step": 6999 + }, + { + "epoch": 0.16326197424542357, + "grad_norm": 1.8017972707748413, + "learning_rate": 1.996874016201221e-05, + "loss": 1.5504, + "step": 7000 + }, + { + "epoch": 0.16328529738460149, + "grad_norm": 1.6860605478286743, + "learning_rate": 1.9968720264842753e-05, + "loss": 1.6342, + "step": 7001 + }, + { + "epoch": 0.1633086205237794, + "grad_norm": 1.8465931415557861, + "learning_rate": 1.9968700361352865e-05, + "loss": 1.6139, + "step": 7002 + }, + { + "epoch": 0.1633319436629573, + "grad_norm": 1.5825061798095703, + "learning_rate": 1.9968680451542564e-05, + "loss": 1.345, + "step": 7003 + }, + { + "epoch": 0.16335526680213525, + "grad_norm": 1.5611317157745361, + "learning_rate": 1.9968660535411855e-05, + "loss": 1.1099, + "step": 7004 + }, + { + "epoch": 0.16337858994131316, + "grad_norm": 2.158984422683716, + "learning_rate": 1.9968640612960756e-05, + "loss": 1.485, + "step": 7005 + }, + { + "epoch": 0.16340191308049107, + "grad_norm": 1.9534698724746704, + "learning_rate": 1.9968620684189273e-05, + "loss": 0.9893, + "step": 7006 + }, + { + "epoch": 0.16342523621966898, + "grad_norm": 1.4820966720581055, + "learning_rate": 1.9968600749097423e-05, + "loss": 1.2288, + "step": 7007 + }, + { + "epoch": 0.16344855935884692, + "grad_norm": 1.7916117906570435, + "learning_rate": 1.996858080768522e-05, + "loss": 1.3898, + "step": 7008 + }, + { + "epoch": 0.16347188249802483, + "grad_norm": 1.7616554498672485, + "learning_rate": 1.9968560859952678e-05, + "loss": 1.7303, + "step": 7009 + }, + { + "epoch": 0.16349520563720274, + "grad_norm": 2.009220600128174, + "learning_rate": 1.99685409058998e-05, + "loss": 1.2409, + "step": 7010 + }, + { + "epoch": 0.16351852877638065, + "grad_norm": 1.708229660987854, + "learning_rate": 1.996852094552661e-05, + "loss": 1.0896, + "step": 7011 + }, + { + "epoch": 0.16354185191555856, + "grad_norm": 1.81900954246521, + "learning_rate": 1.9968500978833117e-05, + "loss": 1.7152, + "step": 7012 + }, + { + "epoch": 0.1635651750547365, + "grad_norm": 1.8087997436523438, + "learning_rate": 1.996848100581933e-05, + "loss": 1.5202, + "step": 7013 + }, + { + "epoch": 0.1635884981939144, + "grad_norm": 2.1490230560302734, + "learning_rate": 1.9968461026485264e-05, + "loss": 1.8941, + "step": 7014 + }, + { + "epoch": 0.16361182133309232, + "grad_norm": 2.3672919273376465, + "learning_rate": 1.9968441040830934e-05, + "loss": 1.7129, + "step": 7015 + }, + { + "epoch": 0.16363514447227023, + "grad_norm": 1.6247576475143433, + "learning_rate": 1.996842104885635e-05, + "loss": 1.2042, + "step": 7016 + }, + { + "epoch": 0.16365846761144817, + "grad_norm": 1.8973323106765747, + "learning_rate": 1.9968401050561524e-05, + "loss": 1.2227, + "step": 7017 + }, + { + "epoch": 0.16368179075062608, + "grad_norm": 2.0967612266540527, + "learning_rate": 1.9968381045946474e-05, + "loss": 1.8886, + "step": 7018 + }, + { + "epoch": 0.163705113889804, + "grad_norm": 1.8892911672592163, + "learning_rate": 1.9968361035011202e-05, + "loss": 1.506, + "step": 7019 + }, + { + "epoch": 0.1637284370289819, + "grad_norm": 2.2188339233398438, + "learning_rate": 1.9968341017755734e-05, + "loss": 1.7251, + "step": 7020 + }, + { + "epoch": 0.16375176016815984, + "grad_norm": 2.489098072052002, + "learning_rate": 1.996832099418007e-05, + "loss": 1.5135, + "step": 7021 + }, + { + "epoch": 0.16377508330733775, + "grad_norm": 2.011087417602539, + "learning_rate": 1.9968300964284235e-05, + "loss": 1.4212, + "step": 7022 + }, + { + "epoch": 0.16379840644651567, + "grad_norm": 1.8064333200454712, + "learning_rate": 1.9968280928068232e-05, + "loss": 1.7932, + "step": 7023 + }, + { + "epoch": 0.16382172958569358, + "grad_norm": 1.5356563329696655, + "learning_rate": 1.996826088553208e-05, + "loss": 1.122, + "step": 7024 + }, + { + "epoch": 0.16384505272487151, + "grad_norm": 1.7291035652160645, + "learning_rate": 1.9968240836675787e-05, + "loss": 1.8771, + "step": 7025 + }, + { + "epoch": 0.16386837586404943, + "grad_norm": 1.5458476543426514, + "learning_rate": 1.9968220781499364e-05, + "loss": 1.2568, + "step": 7026 + }, + { + "epoch": 0.16389169900322734, + "grad_norm": 1.5525649785995483, + "learning_rate": 1.9968200720002833e-05, + "loss": 1.5439, + "step": 7027 + }, + { + "epoch": 0.16391502214240525, + "grad_norm": 1.7532949447631836, + "learning_rate": 1.99681806521862e-05, + "loss": 1.2055, + "step": 7028 + }, + { + "epoch": 0.1639383452815832, + "grad_norm": 1.7019926309585571, + "learning_rate": 1.9968160578049476e-05, + "loss": 1.5521, + "step": 7029 + }, + { + "epoch": 0.1639616684207611, + "grad_norm": 1.4859123229980469, + "learning_rate": 1.9968140497592675e-05, + "loss": 1.3693, + "step": 7030 + }, + { + "epoch": 0.163984991559939, + "grad_norm": 1.8772660493850708, + "learning_rate": 1.9968120410815815e-05, + "loss": 1.2304, + "step": 7031 + }, + { + "epoch": 0.16400831469911692, + "grad_norm": 1.6126536130905151, + "learning_rate": 1.9968100317718907e-05, + "loss": 1.4126, + "step": 7032 + }, + { + "epoch": 0.16403163783829486, + "grad_norm": 1.4203507900238037, + "learning_rate": 1.9968080218301957e-05, + "loss": 1.1222, + "step": 7033 + }, + { + "epoch": 0.16405496097747277, + "grad_norm": 2.122234344482422, + "learning_rate": 1.9968060112564983e-05, + "loss": 0.9916, + "step": 7034 + }, + { + "epoch": 0.16407828411665068, + "grad_norm": 1.634675145149231, + "learning_rate": 1.9968040000508e-05, + "loss": 1.7078, + "step": 7035 + }, + { + "epoch": 0.1641016072558286, + "grad_norm": 2.0541889667510986, + "learning_rate": 1.9968019882131016e-05, + "loss": 1.4425, + "step": 7036 + }, + { + "epoch": 0.16412493039500653, + "grad_norm": 1.79814875125885, + "learning_rate": 1.9967999757434046e-05, + "loss": 1.323, + "step": 7037 + }, + { + "epoch": 0.16414825353418444, + "grad_norm": 1.7532646656036377, + "learning_rate": 1.9967979626417102e-05, + "loss": 1.5984, + "step": 7038 + }, + { + "epoch": 0.16417157667336235, + "grad_norm": 1.729487419128418, + "learning_rate": 1.99679594890802e-05, + "loss": 1.5229, + "step": 7039 + }, + { + "epoch": 0.16419489981254026, + "grad_norm": 2.194769859313965, + "learning_rate": 1.9967939345423345e-05, + "loss": 1.4132, + "step": 7040 + }, + { + "epoch": 0.16421822295171817, + "grad_norm": 1.9310816526412964, + "learning_rate": 1.996791919544656e-05, + "loss": 1.5776, + "step": 7041 + }, + { + "epoch": 0.1642415460908961, + "grad_norm": 2.033327579498291, + "learning_rate": 1.996789903914985e-05, + "loss": 1.1316, + "step": 7042 + }, + { + "epoch": 0.16426486923007402, + "grad_norm": 1.9263747930526733, + "learning_rate": 1.996787887653323e-05, + "loss": 1.8699, + "step": 7043 + }, + { + "epoch": 0.16428819236925193, + "grad_norm": 1.3683924674987793, + "learning_rate": 1.9967858707596712e-05, + "loss": 1.2279, + "step": 7044 + }, + { + "epoch": 0.16431151550842985, + "grad_norm": 1.9059028625488281, + "learning_rate": 1.996783853234031e-05, + "loss": 1.3895, + "step": 7045 + }, + { + "epoch": 0.16433483864760778, + "grad_norm": 1.7717410326004028, + "learning_rate": 1.996781835076404e-05, + "loss": 1.3983, + "step": 7046 + }, + { + "epoch": 0.1643581617867857, + "grad_norm": 1.610771656036377, + "learning_rate": 1.996779816286791e-05, + "loss": 1.2286, + "step": 7047 + }, + { + "epoch": 0.1643814849259636, + "grad_norm": 2.1119332313537598, + "learning_rate": 1.9967777968651933e-05, + "loss": 1.5223, + "step": 7048 + }, + { + "epoch": 0.16440480806514152, + "grad_norm": 2.2004001140594482, + "learning_rate": 1.9967757768116122e-05, + "loss": 1.4184, + "step": 7049 + }, + { + "epoch": 0.16442813120431946, + "grad_norm": 2.3330156803131104, + "learning_rate": 1.996773756126049e-05, + "loss": 1.4714, + "step": 7050 + }, + { + "epoch": 0.16445145434349737, + "grad_norm": 1.790088176727295, + "learning_rate": 1.9967717348085057e-05, + "loss": 1.3945, + "step": 7051 + }, + { + "epoch": 0.16447477748267528, + "grad_norm": 1.7344818115234375, + "learning_rate": 1.9967697128589827e-05, + "loss": 1.2152, + "step": 7052 + }, + { + "epoch": 0.1644981006218532, + "grad_norm": 1.9308151006698608, + "learning_rate": 1.9967676902774813e-05, + "loss": 1.4019, + "step": 7053 + }, + { + "epoch": 0.16452142376103113, + "grad_norm": 1.8574289083480835, + "learning_rate": 1.996765667064003e-05, + "loss": 1.3299, + "step": 7054 + }, + { + "epoch": 0.16454474690020904, + "grad_norm": 1.4864813089370728, + "learning_rate": 1.9967636432185494e-05, + "loss": 1.2277, + "step": 7055 + }, + { + "epoch": 0.16456807003938695, + "grad_norm": 1.9565708637237549, + "learning_rate": 1.9967616187411215e-05, + "loss": 1.4115, + "step": 7056 + }, + { + "epoch": 0.16459139317856486, + "grad_norm": 2.6394917964935303, + "learning_rate": 1.99675959363172e-05, + "loss": 1.5166, + "step": 7057 + }, + { + "epoch": 0.1646147163177428, + "grad_norm": 1.521405577659607, + "learning_rate": 1.9967575678903473e-05, + "loss": 1.4184, + "step": 7058 + }, + { + "epoch": 0.1646380394569207, + "grad_norm": 1.6250877380371094, + "learning_rate": 1.996755541517004e-05, + "loss": 1.1636, + "step": 7059 + }, + { + "epoch": 0.16466136259609862, + "grad_norm": 2.1334705352783203, + "learning_rate": 1.9967535145116914e-05, + "loss": 1.5203, + "step": 7060 + }, + { + "epoch": 0.16468468573527653, + "grad_norm": 4.5847649574279785, + "learning_rate": 1.996751486874411e-05, + "loss": 1.3412, + "step": 7061 + }, + { + "epoch": 0.16470800887445447, + "grad_norm": 1.5987204313278198, + "learning_rate": 1.996749458605164e-05, + "loss": 1.271, + "step": 7062 + }, + { + "epoch": 0.16473133201363238, + "grad_norm": 1.6528213024139404, + "learning_rate": 1.9967474297039517e-05, + "loss": 1.3885, + "step": 7063 + }, + { + "epoch": 0.1647546551528103, + "grad_norm": 2.1325011253356934, + "learning_rate": 1.996745400170775e-05, + "loss": 1.6278, + "step": 7064 + }, + { + "epoch": 0.1647779782919882, + "grad_norm": 2.441575765609741, + "learning_rate": 1.996743370005636e-05, + "loss": 1.5449, + "step": 7065 + }, + { + "epoch": 0.16480130143116611, + "grad_norm": 2.1928791999816895, + "learning_rate": 1.9967413392085355e-05, + "loss": 1.3818, + "step": 7066 + }, + { + "epoch": 0.16482462457034405, + "grad_norm": 1.6368918418884277, + "learning_rate": 1.9967393077794747e-05, + "loss": 1.3269, + "step": 7067 + }, + { + "epoch": 0.16484794770952196, + "grad_norm": 4.704078674316406, + "learning_rate": 1.996737275718455e-05, + "loss": 1.4742, + "step": 7068 + }, + { + "epoch": 0.16487127084869987, + "grad_norm": 2.0634074211120605, + "learning_rate": 1.9967352430254777e-05, + "loss": 1.5856, + "step": 7069 + }, + { + "epoch": 0.16489459398787779, + "grad_norm": 1.7057820558547974, + "learning_rate": 1.996733209700544e-05, + "loss": 1.4149, + "step": 7070 + }, + { + "epoch": 0.16491791712705572, + "grad_norm": 1.5194261074066162, + "learning_rate": 1.9967311757436554e-05, + "loss": 1.4383, + "step": 7071 + }, + { + "epoch": 0.16494124026623364, + "grad_norm": 1.777976632118225, + "learning_rate": 1.996729141154813e-05, + "loss": 1.4549, + "step": 7072 + }, + { + "epoch": 0.16496456340541155, + "grad_norm": 1.826999306678772, + "learning_rate": 1.9967271059340182e-05, + "loss": 1.6944, + "step": 7073 + }, + { + "epoch": 0.16498788654458946, + "grad_norm": 1.7993879318237305, + "learning_rate": 1.996725070081272e-05, + "loss": 1.3918, + "step": 7074 + }, + { + "epoch": 0.1650112096837674, + "grad_norm": 2.16528058052063, + "learning_rate": 1.9967230335965765e-05, + "loss": 1.2996, + "step": 7075 + }, + { + "epoch": 0.1650345328229453, + "grad_norm": 1.7694876194000244, + "learning_rate": 1.996720996479932e-05, + "loss": 1.1816, + "step": 7076 + }, + { + "epoch": 0.16505785596212322, + "grad_norm": 1.9555736780166626, + "learning_rate": 1.9967189587313405e-05, + "loss": 1.7992, + "step": 7077 + }, + { + "epoch": 0.16508117910130113, + "grad_norm": 2.025458574295044, + "learning_rate": 1.9967169203508027e-05, + "loss": 1.2218, + "step": 7078 + }, + { + "epoch": 0.16510450224047907, + "grad_norm": 1.5428764820098877, + "learning_rate": 1.9967148813383207e-05, + "loss": 1.6011, + "step": 7079 + }, + { + "epoch": 0.16512782537965698, + "grad_norm": 1.9191975593566895, + "learning_rate": 1.9967128416938948e-05, + "loss": 1.5261, + "step": 7080 + }, + { + "epoch": 0.1651511485188349, + "grad_norm": 1.510679006576538, + "learning_rate": 1.996710801417527e-05, + "loss": 1.3609, + "step": 7081 + }, + { + "epoch": 0.1651744716580128, + "grad_norm": 1.7697256803512573, + "learning_rate": 1.996708760509218e-05, + "loss": 1.7494, + "step": 7082 + }, + { + "epoch": 0.16519779479719074, + "grad_norm": 1.969698429107666, + "learning_rate": 1.99670671896897e-05, + "loss": 1.7952, + "step": 7083 + }, + { + "epoch": 0.16522111793636865, + "grad_norm": 1.6848329305648804, + "learning_rate": 1.9967046767967835e-05, + "loss": 1.2637, + "step": 7084 + }, + { + "epoch": 0.16524444107554656, + "grad_norm": 2.1741111278533936, + "learning_rate": 1.9967026339926602e-05, + "loss": 1.3464, + "step": 7085 + }, + { + "epoch": 0.16526776421472447, + "grad_norm": 1.6760141849517822, + "learning_rate": 1.996700590556601e-05, + "loss": 1.2399, + "step": 7086 + }, + { + "epoch": 0.1652910873539024, + "grad_norm": 2.5988433361053467, + "learning_rate": 1.9966985464886077e-05, + "loss": 1.5534, + "step": 7087 + }, + { + "epoch": 0.16531441049308032, + "grad_norm": 1.753618836402893, + "learning_rate": 1.9966965017886813e-05, + "loss": 1.394, + "step": 7088 + }, + { + "epoch": 0.16533773363225823, + "grad_norm": 2.032742977142334, + "learning_rate": 1.9966944564568233e-05, + "loss": 1.5625, + "step": 7089 + }, + { + "epoch": 0.16536105677143614, + "grad_norm": 1.727437973022461, + "learning_rate": 1.9966924104930346e-05, + "loss": 1.5113, + "step": 7090 + }, + { + "epoch": 0.16538437991061408, + "grad_norm": 1.6378858089447021, + "learning_rate": 1.996690363897317e-05, + "loss": 1.3349, + "step": 7091 + }, + { + "epoch": 0.165407703049792, + "grad_norm": 1.5332919359207153, + "learning_rate": 1.9966883166696715e-05, + "loss": 1.306, + "step": 7092 + }, + { + "epoch": 0.1654310261889699, + "grad_norm": 1.6865594387054443, + "learning_rate": 1.996686268810099e-05, + "loss": 1.3704, + "step": 7093 + }, + { + "epoch": 0.16545434932814782, + "grad_norm": 1.778311014175415, + "learning_rate": 1.9966842203186018e-05, + "loss": 1.3108, + "step": 7094 + }, + { + "epoch": 0.16547767246732573, + "grad_norm": 1.7964422702789307, + "learning_rate": 1.9966821711951806e-05, + "loss": 1.4089, + "step": 7095 + }, + { + "epoch": 0.16550099560650366, + "grad_norm": 1.699665904045105, + "learning_rate": 1.9966801214398363e-05, + "loss": 1.1574, + "step": 7096 + }, + { + "epoch": 0.16552431874568158, + "grad_norm": 1.7493106126785278, + "learning_rate": 1.996678071052571e-05, + "loss": 1.4367, + "step": 7097 + }, + { + "epoch": 0.1655476418848595, + "grad_norm": 1.6605054140090942, + "learning_rate": 1.9966760200333857e-05, + "loss": 1.3299, + "step": 7098 + }, + { + "epoch": 0.1655709650240374, + "grad_norm": 1.6909204721450806, + "learning_rate": 1.9966739683822814e-05, + "loss": 1.3259, + "step": 7099 + }, + { + "epoch": 0.16559428816321534, + "grad_norm": 1.6388825178146362, + "learning_rate": 1.9966719160992596e-05, + "loss": 1.4812, + "step": 7100 + }, + { + "epoch": 0.16561761130239325, + "grad_norm": 1.8611371517181396, + "learning_rate": 1.996669863184322e-05, + "loss": 1.2404, + "step": 7101 + }, + { + "epoch": 0.16564093444157116, + "grad_norm": 1.6462162733078003, + "learning_rate": 1.996667809637469e-05, + "loss": 1.4788, + "step": 7102 + }, + { + "epoch": 0.16566425758074907, + "grad_norm": 1.6674835681915283, + "learning_rate": 1.996665755458703e-05, + "loss": 1.5092, + "step": 7103 + }, + { + "epoch": 0.165687580719927, + "grad_norm": 1.6079919338226318, + "learning_rate": 1.996663700648024e-05, + "loss": 1.3137, + "step": 7104 + }, + { + "epoch": 0.16571090385910492, + "grad_norm": 1.609168529510498, + "learning_rate": 1.9966616452054345e-05, + "loss": 1.7199, + "step": 7105 + }, + { + "epoch": 0.16573422699828283, + "grad_norm": 1.5320578813552856, + "learning_rate": 1.9966595891309354e-05, + "loss": 1.3413, + "step": 7106 + }, + { + "epoch": 0.16575755013746074, + "grad_norm": 1.838019609451294, + "learning_rate": 1.996657532424528e-05, + "loss": 1.6791, + "step": 7107 + }, + { + "epoch": 0.16578087327663868, + "grad_norm": 1.8124356269836426, + "learning_rate": 1.9966554750862138e-05, + "loss": 1.8691, + "step": 7108 + }, + { + "epoch": 0.1658041964158166, + "grad_norm": 1.901100516319275, + "learning_rate": 1.9966534171159933e-05, + "loss": 1.7064, + "step": 7109 + }, + { + "epoch": 0.1658275195549945, + "grad_norm": 2.071823835372925, + "learning_rate": 1.9966513585138686e-05, + "loss": 1.0339, + "step": 7110 + }, + { + "epoch": 0.1658508426941724, + "grad_norm": 2.172261953353882, + "learning_rate": 1.9966492992798407e-05, + "loss": 1.4656, + "step": 7111 + }, + { + "epoch": 0.16587416583335035, + "grad_norm": 1.721413016319275, + "learning_rate": 1.996647239413911e-05, + "loss": 1.6139, + "step": 7112 + }, + { + "epoch": 0.16589748897252826, + "grad_norm": 1.6631319522857666, + "learning_rate": 1.996645178916081e-05, + "loss": 1.4706, + "step": 7113 + }, + { + "epoch": 0.16592081211170617, + "grad_norm": 1.823330044746399, + "learning_rate": 1.9966431177863516e-05, + "loss": 1.684, + "step": 7114 + }, + { + "epoch": 0.16594413525088408, + "grad_norm": 1.6135430335998535, + "learning_rate": 1.9966410560247246e-05, + "loss": 1.1659, + "step": 7115 + }, + { + "epoch": 0.16596745839006202, + "grad_norm": 1.8139829635620117, + "learning_rate": 1.9966389936312004e-05, + "loss": 1.3776, + "step": 7116 + }, + { + "epoch": 0.16599078152923993, + "grad_norm": 1.9071526527404785, + "learning_rate": 1.9966369306057813e-05, + "loss": 1.4584, + "step": 7117 + }, + { + "epoch": 0.16601410466841784, + "grad_norm": 1.7981209754943848, + "learning_rate": 1.996634866948468e-05, + "loss": 1.4229, + "step": 7118 + }, + { + "epoch": 0.16603742780759576, + "grad_norm": 2.0948495864868164, + "learning_rate": 1.9966328026592622e-05, + "loss": 1.0076, + "step": 7119 + }, + { + "epoch": 0.1660607509467737, + "grad_norm": 1.8014196157455444, + "learning_rate": 1.996630737738165e-05, + "loss": 1.236, + "step": 7120 + }, + { + "epoch": 0.1660840740859516, + "grad_norm": 1.6294891834259033, + "learning_rate": 1.9966286721851778e-05, + "loss": 1.2827, + "step": 7121 + }, + { + "epoch": 0.16610739722512952, + "grad_norm": 1.767707347869873, + "learning_rate": 1.996626606000302e-05, + "loss": 1.4478, + "step": 7122 + }, + { + "epoch": 0.16613072036430743, + "grad_norm": 2.174978256225586, + "learning_rate": 1.9966245391835387e-05, + "loss": 1.4606, + "step": 7123 + }, + { + "epoch": 0.16615404350348534, + "grad_norm": 2.0141193866729736, + "learning_rate": 1.996622471734889e-05, + "loss": 1.6759, + "step": 7124 + }, + { + "epoch": 0.16617736664266328, + "grad_norm": 1.4914830923080444, + "learning_rate": 1.996620403654355e-05, + "loss": 1.2386, + "step": 7125 + }, + { + "epoch": 0.1662006897818412, + "grad_norm": 1.703888177871704, + "learning_rate": 1.996618334941937e-05, + "loss": 1.3241, + "step": 7126 + }, + { + "epoch": 0.1662240129210191, + "grad_norm": 2.273123025894165, + "learning_rate": 1.996616265597637e-05, + "loss": 1.4787, + "step": 7127 + }, + { + "epoch": 0.166247336060197, + "grad_norm": 1.6673020124435425, + "learning_rate": 1.996614195621456e-05, + "loss": 1.3458, + "step": 7128 + }, + { + "epoch": 0.16627065919937495, + "grad_norm": 1.9732284545898438, + "learning_rate": 1.9966121250133955e-05, + "loss": 1.4743, + "step": 7129 + }, + { + "epoch": 0.16629398233855286, + "grad_norm": 2.3098034858703613, + "learning_rate": 1.996610053773457e-05, + "loss": 1.714, + "step": 7130 + }, + { + "epoch": 0.16631730547773077, + "grad_norm": 1.8179925680160522, + "learning_rate": 1.9966079819016412e-05, + "loss": 1.5515, + "step": 7131 + }, + { + "epoch": 0.16634062861690868, + "grad_norm": 1.6882884502410889, + "learning_rate": 1.9966059093979498e-05, + "loss": 1.4241, + "step": 7132 + }, + { + "epoch": 0.16636395175608662, + "grad_norm": 1.8849619626998901, + "learning_rate": 1.9966038362623843e-05, + "loss": 1.2974, + "step": 7133 + }, + { + "epoch": 0.16638727489526453, + "grad_norm": 1.8820072412490845, + "learning_rate": 1.9966017624949455e-05, + "loss": 1.5605, + "step": 7134 + }, + { + "epoch": 0.16641059803444244, + "grad_norm": 1.7704676389694214, + "learning_rate": 1.9965996880956357e-05, + "loss": 1.5283, + "step": 7135 + }, + { + "epoch": 0.16643392117362035, + "grad_norm": 1.8763025999069214, + "learning_rate": 1.996597613064455e-05, + "loss": 1.4043, + "step": 7136 + }, + { + "epoch": 0.1664572443127983, + "grad_norm": 1.5356117486953735, + "learning_rate": 1.996595537401405e-05, + "loss": 1.3362, + "step": 7137 + }, + { + "epoch": 0.1664805674519762, + "grad_norm": 1.674979329109192, + "learning_rate": 1.9965934611064877e-05, + "loss": 1.468, + "step": 7138 + }, + { + "epoch": 0.1665038905911541, + "grad_norm": 2.3137106895446777, + "learning_rate": 1.9965913841797037e-05, + "loss": 1.138, + "step": 7139 + }, + { + "epoch": 0.16652721373033202, + "grad_norm": 1.6028974056243896, + "learning_rate": 1.996589306621055e-05, + "loss": 1.2343, + "step": 7140 + }, + { + "epoch": 0.16655053686950996, + "grad_norm": 1.64051353931427, + "learning_rate": 1.996587228430542e-05, + "loss": 1.3448, + "step": 7141 + }, + { + "epoch": 0.16657386000868787, + "grad_norm": 1.5828136205673218, + "learning_rate": 1.996585149608167e-05, + "loss": 1.5132, + "step": 7142 + }, + { + "epoch": 0.16659718314786578, + "grad_norm": 1.4626339673995972, + "learning_rate": 1.9965830701539307e-05, + "loss": 0.9941, + "step": 7143 + }, + { + "epoch": 0.1666205062870437, + "grad_norm": 1.8194259405136108, + "learning_rate": 1.9965809900678345e-05, + "loss": 1.6585, + "step": 7144 + }, + { + "epoch": 0.16664382942622163, + "grad_norm": 1.9046740531921387, + "learning_rate": 1.9965789093498794e-05, + "loss": 1.5215, + "step": 7145 + }, + { + "epoch": 0.16666715256539955, + "grad_norm": 2.04133677482605, + "learning_rate": 1.996576828000068e-05, + "loss": 1.2349, + "step": 7146 + }, + { + "epoch": 0.16669047570457746, + "grad_norm": 1.6033862829208374, + "learning_rate": 1.9965747460184e-05, + "loss": 1.4442, + "step": 7147 + }, + { + "epoch": 0.16671379884375537, + "grad_norm": 1.9133673906326294, + "learning_rate": 1.9965726634048778e-05, + "loss": 1.6702, + "step": 7148 + }, + { + "epoch": 0.1667371219829333, + "grad_norm": 2.3222618103027344, + "learning_rate": 1.996570580159502e-05, + "loss": 1.3979, + "step": 7149 + }, + { + "epoch": 0.16676044512211122, + "grad_norm": 1.864763617515564, + "learning_rate": 1.9965684962822748e-05, + "loss": 1.5716, + "step": 7150 + }, + { + "epoch": 0.16678376826128913, + "grad_norm": 2.300797939300537, + "learning_rate": 1.9965664117731968e-05, + "loss": 1.9432, + "step": 7151 + }, + { + "epoch": 0.16680709140046704, + "grad_norm": 1.9790103435516357, + "learning_rate": 1.9965643266322695e-05, + "loss": 1.7029, + "step": 7152 + }, + { + "epoch": 0.16683041453964495, + "grad_norm": 2.7141332626342773, + "learning_rate": 1.9965622408594942e-05, + "loss": 1.6891, + "step": 7153 + }, + { + "epoch": 0.1668537376788229, + "grad_norm": 1.7064889669418335, + "learning_rate": 1.9965601544548724e-05, + "loss": 1.3981, + "step": 7154 + }, + { + "epoch": 0.1668770608180008, + "grad_norm": 1.9003815650939941, + "learning_rate": 1.996558067418405e-05, + "loss": 1.2768, + "step": 7155 + }, + { + "epoch": 0.1669003839571787, + "grad_norm": 1.9486384391784668, + "learning_rate": 1.996555979750094e-05, + "loss": 1.8508, + "step": 7156 + }, + { + "epoch": 0.16692370709635662, + "grad_norm": 1.6710269451141357, + "learning_rate": 1.99655389144994e-05, + "loss": 1.5321, + "step": 7157 + }, + { + "epoch": 0.16694703023553456, + "grad_norm": 1.919404149055481, + "learning_rate": 1.996551802517945e-05, + "loss": 1.4401, + "step": 7158 + }, + { + "epoch": 0.16697035337471247, + "grad_norm": 1.5106115341186523, + "learning_rate": 1.9965497129541095e-05, + "loss": 1.285, + "step": 7159 + }, + { + "epoch": 0.16699367651389038, + "grad_norm": 1.7402207851409912, + "learning_rate": 1.9965476227584358e-05, + "loss": 1.4301, + "step": 7160 + }, + { + "epoch": 0.1670169996530683, + "grad_norm": 1.9264389276504517, + "learning_rate": 1.9965455319309247e-05, + "loss": 1.7805, + "step": 7161 + }, + { + "epoch": 0.16704032279224623, + "grad_norm": 1.8590655326843262, + "learning_rate": 1.9965434404715774e-05, + "loss": 1.674, + "step": 7162 + }, + { + "epoch": 0.16706364593142414, + "grad_norm": 1.7098841667175293, + "learning_rate": 1.9965413483803955e-05, + "loss": 1.4243, + "step": 7163 + }, + { + "epoch": 0.16708696907060205, + "grad_norm": 1.7281845808029175, + "learning_rate": 1.99653925565738e-05, + "loss": 1.5485, + "step": 7164 + }, + { + "epoch": 0.16711029220977996, + "grad_norm": 1.67409348487854, + "learning_rate": 1.9965371623025328e-05, + "loss": 1.3313, + "step": 7165 + }, + { + "epoch": 0.1671336153489579, + "grad_norm": 2.0242795944213867, + "learning_rate": 1.9965350683158547e-05, + "loss": 1.3106, + "step": 7166 + }, + { + "epoch": 0.16715693848813581, + "grad_norm": 1.5257019996643066, + "learning_rate": 1.996532973697347e-05, + "loss": 1.4357, + "step": 7167 + }, + { + "epoch": 0.16718026162731373, + "grad_norm": 2.0941483974456787, + "learning_rate": 1.9965308784470117e-05, + "loss": 1.5558, + "step": 7168 + }, + { + "epoch": 0.16720358476649164, + "grad_norm": 2.469031810760498, + "learning_rate": 1.9965287825648494e-05, + "loss": 1.6816, + "step": 7169 + }, + { + "epoch": 0.16722690790566958, + "grad_norm": 1.765934705734253, + "learning_rate": 1.9965266860508615e-05, + "loss": 1.6717, + "step": 7170 + }, + { + "epoch": 0.16725023104484749, + "grad_norm": 1.5918166637420654, + "learning_rate": 1.9965245889050497e-05, + "loss": 1.1194, + "step": 7171 + }, + { + "epoch": 0.1672735541840254, + "grad_norm": 1.67392098903656, + "learning_rate": 1.9965224911274154e-05, + "loss": 1.5278, + "step": 7172 + }, + { + "epoch": 0.1672968773232033, + "grad_norm": 3.2219200134277344, + "learning_rate": 1.9965203927179596e-05, + "loss": 1.6443, + "step": 7173 + }, + { + "epoch": 0.16732020046238125, + "grad_norm": 2.0264484882354736, + "learning_rate": 1.9965182936766832e-05, + "loss": 1.441, + "step": 7174 + }, + { + "epoch": 0.16734352360155916, + "grad_norm": 2.057920455932617, + "learning_rate": 1.9965161940035887e-05, + "loss": 1.1619, + "step": 7175 + }, + { + "epoch": 0.16736684674073707, + "grad_norm": 1.9368195533752441, + "learning_rate": 1.9965140936986764e-05, + "loss": 1.353, + "step": 7176 + }, + { + "epoch": 0.16739016987991498, + "grad_norm": 2.2538695335388184, + "learning_rate": 1.9965119927619477e-05, + "loss": 1.3727, + "step": 7177 + }, + { + "epoch": 0.16741349301909292, + "grad_norm": 2.958879232406616, + "learning_rate": 1.996509891193405e-05, + "loss": 1.3719, + "step": 7178 + }, + { + "epoch": 0.16743681615827083, + "grad_norm": 1.6595252752304077, + "learning_rate": 1.9965077889930484e-05, + "loss": 1.2144, + "step": 7179 + }, + { + "epoch": 0.16746013929744874, + "grad_norm": 1.5306041240692139, + "learning_rate": 1.9965056861608798e-05, + "loss": 1.3319, + "step": 7180 + }, + { + "epoch": 0.16748346243662665, + "grad_norm": 1.520293116569519, + "learning_rate": 1.9965035826969005e-05, + "loss": 1.54, + "step": 7181 + }, + { + "epoch": 0.16750678557580456, + "grad_norm": 1.6744221448898315, + "learning_rate": 1.9965014786011116e-05, + "loss": 1.5144, + "step": 7182 + }, + { + "epoch": 0.1675301087149825, + "grad_norm": 1.7226622104644775, + "learning_rate": 1.9964993738735147e-05, + "loss": 1.4599, + "step": 7183 + }, + { + "epoch": 0.1675534318541604, + "grad_norm": 1.6248610019683838, + "learning_rate": 1.9964972685141108e-05, + "loss": 1.1309, + "step": 7184 + }, + { + "epoch": 0.16757675499333832, + "grad_norm": 1.8633750677108765, + "learning_rate": 1.9964951625229017e-05, + "loss": 1.6777, + "step": 7185 + }, + { + "epoch": 0.16760007813251623, + "grad_norm": 2.2037839889526367, + "learning_rate": 1.9964930558998887e-05, + "loss": 1.2941, + "step": 7186 + }, + { + "epoch": 0.16762340127169417, + "grad_norm": 1.765029788017273, + "learning_rate": 1.9964909486450724e-05, + "loss": 1.0097, + "step": 7187 + }, + { + "epoch": 0.16764672441087208, + "grad_norm": 1.7806310653686523, + "learning_rate": 1.996488840758455e-05, + "loss": 1.3139, + "step": 7188 + }, + { + "epoch": 0.16767004755005, + "grad_norm": 2.1823582649230957, + "learning_rate": 1.9964867322400376e-05, + "loss": 1.446, + "step": 7189 + }, + { + "epoch": 0.1676933706892279, + "grad_norm": 1.8053174018859863, + "learning_rate": 1.9964846230898215e-05, + "loss": 1.2409, + "step": 7190 + }, + { + "epoch": 0.16771669382840584, + "grad_norm": 1.960282802581787, + "learning_rate": 1.9964825133078076e-05, + "loss": 1.4659, + "step": 7191 + }, + { + "epoch": 0.16774001696758375, + "grad_norm": 1.8729490041732788, + "learning_rate": 1.996480402893998e-05, + "loss": 1.5572, + "step": 7192 + }, + { + "epoch": 0.16776334010676167, + "grad_norm": 1.4355278015136719, + "learning_rate": 1.9964782918483934e-05, + "loss": 1.3477, + "step": 7193 + }, + { + "epoch": 0.16778666324593958, + "grad_norm": 1.7145345211029053, + "learning_rate": 1.9964761801709955e-05, + "loss": 1.3994, + "step": 7194 + }, + { + "epoch": 0.16780998638511752, + "grad_norm": 1.5740821361541748, + "learning_rate": 1.9964740678618055e-05, + "loss": 1.3037, + "step": 7195 + }, + { + "epoch": 0.16783330952429543, + "grad_norm": 1.7540680170059204, + "learning_rate": 1.996471954920825e-05, + "loss": 1.3145, + "step": 7196 + }, + { + "epoch": 0.16785663266347334, + "grad_norm": 1.9058054685592651, + "learning_rate": 1.9964698413480546e-05, + "loss": 1.2606, + "step": 7197 + }, + { + "epoch": 0.16787995580265125, + "grad_norm": 1.8215430974960327, + "learning_rate": 1.9964677271434967e-05, + "loss": 1.2557, + "step": 7198 + }, + { + "epoch": 0.1679032789418292, + "grad_norm": 1.8879115581512451, + "learning_rate": 1.996465612307152e-05, + "loss": 1.5438, + "step": 7199 + }, + { + "epoch": 0.1679266020810071, + "grad_norm": 1.8122518062591553, + "learning_rate": 1.996463496839022e-05, + "loss": 1.2999, + "step": 7200 + }, + { + "epoch": 0.167949925220185, + "grad_norm": 1.8456789255142212, + "learning_rate": 1.9964613807391074e-05, + "loss": 1.6591, + "step": 7201 + }, + { + "epoch": 0.16797324835936292, + "grad_norm": 1.6927258968353271, + "learning_rate": 1.996459264007411e-05, + "loss": 1.4637, + "step": 7202 + }, + { + "epoch": 0.16799657149854086, + "grad_norm": 1.800552487373352, + "learning_rate": 1.9964571466439325e-05, + "loss": 1.7278, + "step": 7203 + }, + { + "epoch": 0.16801989463771877, + "grad_norm": 1.9658238887786865, + "learning_rate": 1.9964550286486745e-05, + "loss": 1.6857, + "step": 7204 + }, + { + "epoch": 0.16804321777689668, + "grad_norm": 1.4997915029525757, + "learning_rate": 1.996452910021638e-05, + "loss": 1.2049, + "step": 7205 + }, + { + "epoch": 0.1680665409160746, + "grad_norm": 1.7877811193466187, + "learning_rate": 1.9964507907628235e-05, + "loss": 1.3692, + "step": 7206 + }, + { + "epoch": 0.16808986405525253, + "grad_norm": 1.739357829093933, + "learning_rate": 1.9964486708722333e-05, + "loss": 1.4918, + "step": 7207 + }, + { + "epoch": 0.16811318719443044, + "grad_norm": 1.739524006843567, + "learning_rate": 1.996446550349869e-05, + "loss": 1.6183, + "step": 7208 + }, + { + "epoch": 0.16813651033360835, + "grad_norm": 2.2582924365997314, + "learning_rate": 1.996444429195731e-05, + "loss": 1.1531, + "step": 7209 + }, + { + "epoch": 0.16815983347278626, + "grad_norm": 2.1701419353485107, + "learning_rate": 1.996442307409821e-05, + "loss": 1.8706, + "step": 7210 + }, + { + "epoch": 0.16818315661196417, + "grad_norm": 1.687563419342041, + "learning_rate": 1.9964401849921405e-05, + "loss": 1.2408, + "step": 7211 + }, + { + "epoch": 0.1682064797511421, + "grad_norm": 1.6890877485275269, + "learning_rate": 1.9964380619426907e-05, + "loss": 1.4024, + "step": 7212 + }, + { + "epoch": 0.16822980289032002, + "grad_norm": 1.8228999376296997, + "learning_rate": 1.996435938261473e-05, + "loss": 1.2111, + "step": 7213 + }, + { + "epoch": 0.16825312602949793, + "grad_norm": 1.6760106086730957, + "learning_rate": 1.9964338139484888e-05, + "loss": 1.3794, + "step": 7214 + }, + { + "epoch": 0.16827644916867585, + "grad_norm": 1.7331671714782715, + "learning_rate": 1.9964316890037398e-05, + "loss": 1.5604, + "step": 7215 + }, + { + "epoch": 0.16829977230785378, + "grad_norm": 2.1284615993499756, + "learning_rate": 1.9964295634272265e-05, + "loss": 1.4723, + "step": 7216 + }, + { + "epoch": 0.1683230954470317, + "grad_norm": 1.8090451955795288, + "learning_rate": 1.9964274372189506e-05, + "loss": 1.4425, + "step": 7217 + }, + { + "epoch": 0.1683464185862096, + "grad_norm": 1.694318175315857, + "learning_rate": 1.996425310378914e-05, + "loss": 1.8858, + "step": 7218 + }, + { + "epoch": 0.16836974172538752, + "grad_norm": 1.8148483037948608, + "learning_rate": 1.9964231829071172e-05, + "loss": 1.3133, + "step": 7219 + }, + { + "epoch": 0.16839306486456546, + "grad_norm": 1.907382607460022, + "learning_rate": 1.996421054803562e-05, + "loss": 1.2057, + "step": 7220 + }, + { + "epoch": 0.16841638800374337, + "grad_norm": 1.607773780822754, + "learning_rate": 1.99641892606825e-05, + "loss": 1.2427, + "step": 7221 + }, + { + "epoch": 0.16843971114292128, + "grad_norm": 1.879823923110962, + "learning_rate": 1.996416796701182e-05, + "loss": 1.9061, + "step": 7222 + }, + { + "epoch": 0.1684630342820992, + "grad_norm": 2.1378557682037354, + "learning_rate": 1.9964146667023592e-05, + "loss": 1.3632, + "step": 7223 + }, + { + "epoch": 0.16848635742127713, + "grad_norm": 2.0743753910064697, + "learning_rate": 1.996412536071784e-05, + "loss": 1.8272, + "step": 7224 + }, + { + "epoch": 0.16850968056045504, + "grad_norm": 1.7573965787887573, + "learning_rate": 1.9964104048094566e-05, + "loss": 1.3803, + "step": 7225 + }, + { + "epoch": 0.16853300369963295, + "grad_norm": 1.6198687553405762, + "learning_rate": 1.9964082729153793e-05, + "loss": 1.6499, + "step": 7226 + }, + { + "epoch": 0.16855632683881086, + "grad_norm": 2.224005699157715, + "learning_rate": 1.996406140389553e-05, + "loss": 1.472, + "step": 7227 + }, + { + "epoch": 0.1685796499779888, + "grad_norm": 1.5503630638122559, + "learning_rate": 1.9964040072319785e-05, + "loss": 1.2588, + "step": 7228 + }, + { + "epoch": 0.1686029731171667, + "grad_norm": 1.5661872625350952, + "learning_rate": 1.996401873442658e-05, + "loss": 1.3442, + "step": 7229 + }, + { + "epoch": 0.16862629625634462, + "grad_norm": 1.882939338684082, + "learning_rate": 1.9963997390215928e-05, + "loss": 1.2877, + "step": 7230 + }, + { + "epoch": 0.16864961939552253, + "grad_norm": 1.669281244277954, + "learning_rate": 1.9963976039687835e-05, + "loss": 0.9725, + "step": 7231 + }, + { + "epoch": 0.16867294253470047, + "grad_norm": 1.556573748588562, + "learning_rate": 1.9963954682842324e-05, + "loss": 1.1517, + "step": 7232 + }, + { + "epoch": 0.16869626567387838, + "grad_norm": 1.868234634399414, + "learning_rate": 1.9963933319679403e-05, + "loss": 1.4761, + "step": 7233 + }, + { + "epoch": 0.1687195888130563, + "grad_norm": 1.435646891593933, + "learning_rate": 1.9963911950199084e-05, + "loss": 1.3009, + "step": 7234 + }, + { + "epoch": 0.1687429119522342, + "grad_norm": 1.7980226278305054, + "learning_rate": 1.9963890574401386e-05, + "loss": 1.564, + "step": 7235 + }, + { + "epoch": 0.16876623509141214, + "grad_norm": 1.656304955482483, + "learning_rate": 1.996386919228632e-05, + "loss": 1.3012, + "step": 7236 + }, + { + "epoch": 0.16878955823059005, + "grad_norm": 1.7057609558105469, + "learning_rate": 1.9963847803853897e-05, + "loss": 1.4827, + "step": 7237 + }, + { + "epoch": 0.16881288136976796, + "grad_norm": 1.6094976663589478, + "learning_rate": 1.9963826409104135e-05, + "loss": 1.4931, + "step": 7238 + }, + { + "epoch": 0.16883620450894588, + "grad_norm": 1.62102210521698, + "learning_rate": 1.9963805008037046e-05, + "loss": 1.3114, + "step": 7239 + }, + { + "epoch": 0.1688595276481238, + "grad_norm": 1.839906930923462, + "learning_rate": 1.9963783600652642e-05, + "loss": 1.5903, + "step": 7240 + }, + { + "epoch": 0.16888285078730172, + "grad_norm": 2.154062271118164, + "learning_rate": 1.996376218695094e-05, + "loss": 1.6608, + "step": 7241 + }, + { + "epoch": 0.16890617392647964, + "grad_norm": 2.086367607116699, + "learning_rate": 1.9963740766931944e-05, + "loss": 1.6589, + "step": 7242 + }, + { + "epoch": 0.16892949706565755, + "grad_norm": 1.879960298538208, + "learning_rate": 1.996371934059568e-05, + "loss": 1.4494, + "step": 7243 + }, + { + "epoch": 0.16895282020483546, + "grad_norm": 1.759545087814331, + "learning_rate": 1.9963697907942157e-05, + "loss": 1.4571, + "step": 7244 + }, + { + "epoch": 0.1689761433440134, + "grad_norm": 1.6885735988616943, + "learning_rate": 1.9963676468971386e-05, + "loss": 1.4822, + "step": 7245 + }, + { + "epoch": 0.1689994664831913, + "grad_norm": 1.7630573511123657, + "learning_rate": 1.9963655023683384e-05, + "loss": 1.7039, + "step": 7246 + }, + { + "epoch": 0.16902278962236922, + "grad_norm": 1.9831657409667969, + "learning_rate": 1.996363357207816e-05, + "loss": 1.4948, + "step": 7247 + }, + { + "epoch": 0.16904611276154713, + "grad_norm": 1.627004623413086, + "learning_rate": 1.9963612114155734e-05, + "loss": 1.3898, + "step": 7248 + }, + { + "epoch": 0.16906943590072507, + "grad_norm": 1.7681422233581543, + "learning_rate": 1.9963590649916118e-05, + "loss": 1.4567, + "step": 7249 + }, + { + "epoch": 0.16909275903990298, + "grad_norm": 1.7141947746276855, + "learning_rate": 1.996356917935932e-05, + "loss": 1.6394, + "step": 7250 + }, + { + "epoch": 0.1691160821790809, + "grad_norm": 1.6076695919036865, + "learning_rate": 1.996354770248536e-05, + "loss": 1.1328, + "step": 7251 + }, + { + "epoch": 0.1691394053182588, + "grad_norm": 1.7933634519577026, + "learning_rate": 1.996352621929425e-05, + "loss": 1.4971, + "step": 7252 + }, + { + "epoch": 0.16916272845743674, + "grad_norm": 2.2354485988616943, + "learning_rate": 1.9963504729786e-05, + "loss": 1.657, + "step": 7253 + }, + { + "epoch": 0.16918605159661465, + "grad_norm": 1.699470043182373, + "learning_rate": 1.9963483233960627e-05, + "loss": 1.4368, + "step": 7254 + }, + { + "epoch": 0.16920937473579256, + "grad_norm": 1.5841920375823975, + "learning_rate": 1.9963461731818143e-05, + "loss": 1.1731, + "step": 7255 + }, + { + "epoch": 0.16923269787497047, + "grad_norm": 1.90166175365448, + "learning_rate": 1.9963440223358565e-05, + "loss": 1.4857, + "step": 7256 + }, + { + "epoch": 0.1692560210141484, + "grad_norm": 1.7348477840423584, + "learning_rate": 1.9963418708581904e-05, + "loss": 1.1685, + "step": 7257 + }, + { + "epoch": 0.16927934415332632, + "grad_norm": 2.053377151489258, + "learning_rate": 1.9963397187488172e-05, + "loss": 1.4698, + "step": 7258 + }, + { + "epoch": 0.16930266729250423, + "grad_norm": 2.8906612396240234, + "learning_rate": 1.9963375660077388e-05, + "loss": 1.2653, + "step": 7259 + }, + { + "epoch": 0.16932599043168214, + "grad_norm": 1.4677352905273438, + "learning_rate": 1.996335412634956e-05, + "loss": 1.1502, + "step": 7260 + }, + { + "epoch": 0.16934931357086008, + "grad_norm": 1.7461823225021362, + "learning_rate": 1.9963332586304703e-05, + "loss": 1.5145, + "step": 7261 + }, + { + "epoch": 0.169372636710038, + "grad_norm": 1.6560553312301636, + "learning_rate": 1.996331103994283e-05, + "loss": 1.3874, + "step": 7262 + }, + { + "epoch": 0.1693959598492159, + "grad_norm": 1.724831461906433, + "learning_rate": 1.9963289487263962e-05, + "loss": 1.4612, + "step": 7263 + }, + { + "epoch": 0.16941928298839382, + "grad_norm": 3.735696315765381, + "learning_rate": 1.9963267928268105e-05, + "loss": 1.202, + "step": 7264 + }, + { + "epoch": 0.16944260612757175, + "grad_norm": 1.6487818956375122, + "learning_rate": 1.996324636295527e-05, + "loss": 1.1299, + "step": 7265 + }, + { + "epoch": 0.16946592926674967, + "grad_norm": 1.4822131395339966, + "learning_rate": 1.996322479132548e-05, + "loss": 1.1543, + "step": 7266 + }, + { + "epoch": 0.16948925240592758, + "grad_norm": 1.6810075044631958, + "learning_rate": 1.996320321337874e-05, + "loss": 1.2764, + "step": 7267 + }, + { + "epoch": 0.1695125755451055, + "grad_norm": 1.6501418352127075, + "learning_rate": 1.996318162911507e-05, + "loss": 1.5249, + "step": 7268 + }, + { + "epoch": 0.1695358986842834, + "grad_norm": 2.0641491413116455, + "learning_rate": 1.9963160038534483e-05, + "loss": 1.3715, + "step": 7269 + }, + { + "epoch": 0.16955922182346134, + "grad_norm": 2.091966152191162, + "learning_rate": 1.9963138441636988e-05, + "loss": 1.4848, + "step": 7270 + }, + { + "epoch": 0.16958254496263925, + "grad_norm": 1.7986277341842651, + "learning_rate": 1.9963116838422606e-05, + "loss": 1.7356, + "step": 7271 + }, + { + "epoch": 0.16960586810181716, + "grad_norm": 2.0773775577545166, + "learning_rate": 1.9963095228891343e-05, + "loss": 1.521, + "step": 7272 + }, + { + "epoch": 0.16962919124099507, + "grad_norm": 1.8262965679168701, + "learning_rate": 1.9963073613043214e-05, + "loss": 1.4819, + "step": 7273 + }, + { + "epoch": 0.169652514380173, + "grad_norm": 1.8210368156433105, + "learning_rate": 1.996305199087824e-05, + "loss": 1.65, + "step": 7274 + }, + { + "epoch": 0.16967583751935092, + "grad_norm": 1.873431921005249, + "learning_rate": 1.9963030362396427e-05, + "loss": 1.3606, + "step": 7275 + }, + { + "epoch": 0.16969916065852883, + "grad_norm": 1.8186115026474, + "learning_rate": 1.996300872759779e-05, + "loss": 1.5232, + "step": 7276 + }, + { + "epoch": 0.16972248379770674, + "grad_norm": 1.8831424713134766, + "learning_rate": 1.9962987086482345e-05, + "loss": 1.4222, + "step": 7277 + }, + { + "epoch": 0.16974580693688468, + "grad_norm": 1.9786714315414429, + "learning_rate": 1.9962965439050102e-05, + "loss": 1.2625, + "step": 7278 + }, + { + "epoch": 0.1697691300760626, + "grad_norm": 1.595861792564392, + "learning_rate": 1.996294378530108e-05, + "loss": 1.3231, + "step": 7279 + }, + { + "epoch": 0.1697924532152405, + "grad_norm": 1.7728326320648193, + "learning_rate": 1.996292212523529e-05, + "loss": 1.1445, + "step": 7280 + }, + { + "epoch": 0.1698157763544184, + "grad_norm": 1.929486870765686, + "learning_rate": 1.9962900458852745e-05, + "loss": 1.133, + "step": 7281 + }, + { + "epoch": 0.16983909949359635, + "grad_norm": 1.8549599647521973, + "learning_rate": 1.9962878786153462e-05, + "loss": 1.1759, + "step": 7282 + }, + { + "epoch": 0.16986242263277426, + "grad_norm": 2.059351682662964, + "learning_rate": 1.996285710713745e-05, + "loss": 1.7028, + "step": 7283 + }, + { + "epoch": 0.16988574577195217, + "grad_norm": 1.7382444143295288, + "learning_rate": 1.9962835421804726e-05, + "loss": 1.8668, + "step": 7284 + }, + { + "epoch": 0.16990906891113008, + "grad_norm": 1.9037734270095825, + "learning_rate": 1.99628137301553e-05, + "loss": 1.4782, + "step": 7285 + }, + { + "epoch": 0.16993239205030802, + "grad_norm": 2.109489679336548, + "learning_rate": 1.996279203218919e-05, + "loss": 1.6, + "step": 7286 + }, + { + "epoch": 0.16995571518948593, + "grad_norm": 1.9111486673355103, + "learning_rate": 1.9962770327906413e-05, + "loss": 1.6095, + "step": 7287 + }, + { + "epoch": 0.16997903832866385, + "grad_norm": 1.631386399269104, + "learning_rate": 1.9962748617306972e-05, + "loss": 1.35, + "step": 7288 + }, + { + "epoch": 0.17000236146784176, + "grad_norm": 1.9402213096618652, + "learning_rate": 1.996272690039089e-05, + "loss": 1.372, + "step": 7289 + }, + { + "epoch": 0.1700256846070197, + "grad_norm": 1.6260658502578735, + "learning_rate": 1.9962705177158176e-05, + "loss": 1.4418, + "step": 7290 + }, + { + "epoch": 0.1700490077461976, + "grad_norm": 2.009672164916992, + "learning_rate": 1.9962683447608847e-05, + "loss": 1.6506, + "step": 7291 + }, + { + "epoch": 0.17007233088537552, + "grad_norm": 1.7285696268081665, + "learning_rate": 1.996266171174291e-05, + "loss": 1.5241, + "step": 7292 + }, + { + "epoch": 0.17009565402455343, + "grad_norm": 2.1354665756225586, + "learning_rate": 1.9962639969560387e-05, + "loss": 1.4205, + "step": 7293 + }, + { + "epoch": 0.17011897716373134, + "grad_norm": 1.7839125394821167, + "learning_rate": 1.996261822106129e-05, + "loss": 1.6104, + "step": 7294 + }, + { + "epoch": 0.17014230030290928, + "grad_norm": 2.111867666244507, + "learning_rate": 1.9962596466245633e-05, + "loss": 1.4986, + "step": 7295 + }, + { + "epoch": 0.1701656234420872, + "grad_norm": 1.9032070636749268, + "learning_rate": 1.9962574705113427e-05, + "loss": 1.5528, + "step": 7296 + }, + { + "epoch": 0.1701889465812651, + "grad_norm": 2.484650135040283, + "learning_rate": 1.9962552937664686e-05, + "loss": 1.5925, + "step": 7297 + }, + { + "epoch": 0.170212269720443, + "grad_norm": 1.8242621421813965, + "learning_rate": 1.9962531163899424e-05, + "loss": 1.3941, + "step": 7298 + }, + { + "epoch": 0.17023559285962095, + "grad_norm": 1.5441805124282837, + "learning_rate": 1.9962509383817658e-05, + "loss": 1.5267, + "step": 7299 + }, + { + "epoch": 0.17025891599879886, + "grad_norm": 1.8445488214492798, + "learning_rate": 1.9962487597419397e-05, + "loss": 1.5954, + "step": 7300 + }, + { + "epoch": 0.17028223913797677, + "grad_norm": 1.9592714309692383, + "learning_rate": 1.996246580470466e-05, + "loss": 1.4085, + "step": 7301 + }, + { + "epoch": 0.17030556227715468, + "grad_norm": 1.8544763326644897, + "learning_rate": 1.9962444005673455e-05, + "loss": 1.5938, + "step": 7302 + }, + { + "epoch": 0.17032888541633262, + "grad_norm": 2.2029178142547607, + "learning_rate": 1.9962422200325803e-05, + "loss": 1.4138, + "step": 7303 + }, + { + "epoch": 0.17035220855551053, + "grad_norm": 1.7620607614517212, + "learning_rate": 1.9962400388661712e-05, + "loss": 1.1659, + "step": 7304 + }, + { + "epoch": 0.17037553169468844, + "grad_norm": 1.5933129787445068, + "learning_rate": 1.9962378570681195e-05, + "loss": 1.3796, + "step": 7305 + }, + { + "epoch": 0.17039885483386635, + "grad_norm": 1.940445899963379, + "learning_rate": 1.996235674638427e-05, + "loss": 1.4915, + "step": 7306 + }, + { + "epoch": 0.1704221779730443, + "grad_norm": 2.0876340866088867, + "learning_rate": 1.996233491577095e-05, + "loss": 1.3587, + "step": 7307 + }, + { + "epoch": 0.1704455011122222, + "grad_norm": 1.8238178491592407, + "learning_rate": 1.996231307884125e-05, + "loss": 1.431, + "step": 7308 + }, + { + "epoch": 0.17046882425140011, + "grad_norm": 1.887245774269104, + "learning_rate": 1.996229123559518e-05, + "loss": 1.5855, + "step": 7309 + }, + { + "epoch": 0.17049214739057802, + "grad_norm": 2.0348429679870605, + "learning_rate": 1.9962269386032754e-05, + "loss": 1.4762, + "step": 7310 + }, + { + "epoch": 0.17051547052975596, + "grad_norm": 2.1599831581115723, + "learning_rate": 1.996224753015399e-05, + "loss": 1.2882, + "step": 7311 + }, + { + "epoch": 0.17053879366893387, + "grad_norm": 2.517827272415161, + "learning_rate": 1.9962225667958898e-05, + "loss": 1.3917, + "step": 7312 + }, + { + "epoch": 0.17056211680811179, + "grad_norm": 2.142540216445923, + "learning_rate": 1.99622037994475e-05, + "loss": 1.1496, + "step": 7313 + }, + { + "epoch": 0.1705854399472897, + "grad_norm": 1.8890477418899536, + "learning_rate": 1.996218192461979e-05, + "loss": 1.3631, + "step": 7314 + }, + { + "epoch": 0.17060876308646764, + "grad_norm": 1.6192525625228882, + "learning_rate": 1.9962160043475806e-05, + "loss": 1.3797, + "step": 7315 + }, + { + "epoch": 0.17063208622564555, + "grad_norm": 1.771726131439209, + "learning_rate": 1.9962138156015548e-05, + "loss": 1.3737, + "step": 7316 + }, + { + "epoch": 0.17065540936482346, + "grad_norm": 1.4178739786148071, + "learning_rate": 1.9962116262239032e-05, + "loss": 1.2781, + "step": 7317 + }, + { + "epoch": 0.17067873250400137, + "grad_norm": 1.606480360031128, + "learning_rate": 1.9962094362146274e-05, + "loss": 1.3683, + "step": 7318 + }, + { + "epoch": 0.1707020556431793, + "grad_norm": 1.5734496116638184, + "learning_rate": 1.996207245573729e-05, + "loss": 0.9601, + "step": 7319 + }, + { + "epoch": 0.17072537878235722, + "grad_norm": 1.4303914308547974, + "learning_rate": 1.996205054301208e-05, + "loss": 1.2904, + "step": 7320 + }, + { + "epoch": 0.17074870192153513, + "grad_norm": 1.8955293893814087, + "learning_rate": 1.9962028623970676e-05, + "loss": 1.4578, + "step": 7321 + }, + { + "epoch": 0.17077202506071304, + "grad_norm": 2.023099422454834, + "learning_rate": 1.9962006698613082e-05, + "loss": 1.4774, + "step": 7322 + }, + { + "epoch": 0.17079534819989095, + "grad_norm": 3.883347988128662, + "learning_rate": 1.9961984766939317e-05, + "loss": 1.5777, + "step": 7323 + }, + { + "epoch": 0.1708186713390689, + "grad_norm": 1.8709912300109863, + "learning_rate": 1.996196282894939e-05, + "loss": 1.6563, + "step": 7324 + }, + { + "epoch": 0.1708419944782468, + "grad_norm": 1.8157764673233032, + "learning_rate": 1.996194088464332e-05, + "loss": 1.2787, + "step": 7325 + }, + { + "epoch": 0.1708653176174247, + "grad_norm": 1.886013388633728, + "learning_rate": 1.9961918934021115e-05, + "loss": 1.7499, + "step": 7326 + }, + { + "epoch": 0.17088864075660262, + "grad_norm": 1.750476598739624, + "learning_rate": 1.996189697708279e-05, + "loss": 1.3723, + "step": 7327 + }, + { + "epoch": 0.17091196389578056, + "grad_norm": 1.8375768661499023, + "learning_rate": 1.9961875013828363e-05, + "loss": 1.2884, + "step": 7328 + }, + { + "epoch": 0.17093528703495847, + "grad_norm": 1.8985698223114014, + "learning_rate": 1.9961853044257845e-05, + "loss": 1.4474, + "step": 7329 + }, + { + "epoch": 0.17095861017413638, + "grad_norm": 1.723751187324524, + "learning_rate": 1.9961831068371253e-05, + "loss": 1.2133, + "step": 7330 + }, + { + "epoch": 0.1709819333133143, + "grad_norm": 1.690781831741333, + "learning_rate": 1.9961809086168594e-05, + "loss": 1.3405, + "step": 7331 + }, + { + "epoch": 0.17100525645249223, + "grad_norm": 1.8595227003097534, + "learning_rate": 1.9961787097649892e-05, + "loss": 1.5546, + "step": 7332 + }, + { + "epoch": 0.17102857959167014, + "grad_norm": 1.7749403715133667, + "learning_rate": 1.9961765102815153e-05, + "loss": 1.7123, + "step": 7333 + }, + { + "epoch": 0.17105190273084805, + "grad_norm": 1.5466742515563965, + "learning_rate": 1.9961743101664394e-05, + "loss": 1.1188, + "step": 7334 + }, + { + "epoch": 0.17107522587002597, + "grad_norm": 1.8160669803619385, + "learning_rate": 1.996172109419763e-05, + "loss": 1.1894, + "step": 7335 + }, + { + "epoch": 0.1710985490092039, + "grad_norm": 1.9019039869308472, + "learning_rate": 1.9961699080414868e-05, + "loss": 1.4289, + "step": 7336 + }, + { + "epoch": 0.17112187214838182, + "grad_norm": 4.011369228363037, + "learning_rate": 1.996167706031613e-05, + "loss": 1.2411, + "step": 7337 + }, + { + "epoch": 0.17114519528755973, + "grad_norm": 1.6779215335845947, + "learning_rate": 1.9961655033901426e-05, + "loss": 1.6196, + "step": 7338 + }, + { + "epoch": 0.17116851842673764, + "grad_norm": 1.800858736038208, + "learning_rate": 1.996163300117077e-05, + "loss": 1.4937, + "step": 7339 + }, + { + "epoch": 0.17119184156591558, + "grad_norm": 1.6043599843978882, + "learning_rate": 1.9961610962124185e-05, + "loss": 1.3004, + "step": 7340 + }, + { + "epoch": 0.1712151647050935, + "grad_norm": 1.8198878765106201, + "learning_rate": 1.996158891676167e-05, + "loss": 1.7781, + "step": 7341 + }, + { + "epoch": 0.1712384878442714, + "grad_norm": 1.8257044553756714, + "learning_rate": 1.996156686508325e-05, + "loss": 1.3438, + "step": 7342 + }, + { + "epoch": 0.1712618109834493, + "grad_norm": 1.847093939781189, + "learning_rate": 1.996154480708893e-05, + "loss": 1.9422, + "step": 7343 + }, + { + "epoch": 0.17128513412262725, + "grad_norm": 1.908883810043335, + "learning_rate": 1.9961522742778735e-05, + "loss": 1.1885, + "step": 7344 + }, + { + "epoch": 0.17130845726180516, + "grad_norm": 1.6020523309707642, + "learning_rate": 1.996150067215267e-05, + "loss": 1.2662, + "step": 7345 + }, + { + "epoch": 0.17133178040098307, + "grad_norm": 1.555682897567749, + "learning_rate": 1.9961478595210752e-05, + "loss": 1.3253, + "step": 7346 + }, + { + "epoch": 0.17135510354016098, + "grad_norm": 1.7264537811279297, + "learning_rate": 1.9961456511952993e-05, + "loss": 1.2488, + "step": 7347 + }, + { + "epoch": 0.17137842667933892, + "grad_norm": 1.8023107051849365, + "learning_rate": 1.996143442237941e-05, + "loss": 1.5922, + "step": 7348 + }, + { + "epoch": 0.17140174981851683, + "grad_norm": 1.8738701343536377, + "learning_rate": 1.996141232649002e-05, + "loss": 1.504, + "step": 7349 + }, + { + "epoch": 0.17142507295769474, + "grad_norm": 2.0497548580169678, + "learning_rate": 1.996139022428483e-05, + "loss": 1.0777, + "step": 7350 + }, + { + "epoch": 0.17144839609687265, + "grad_norm": 2.327644109725952, + "learning_rate": 1.9961368115763858e-05, + "loss": 1.4711, + "step": 7351 + }, + { + "epoch": 0.17147171923605056, + "grad_norm": 1.833647608757019, + "learning_rate": 1.9961346000927115e-05, + "loss": 1.5743, + "step": 7352 + }, + { + "epoch": 0.1714950423752285, + "grad_norm": 2.0540010929107666, + "learning_rate": 1.9961323879774617e-05, + "loss": 1.1418, + "step": 7353 + }, + { + "epoch": 0.1715183655144064, + "grad_norm": 2.0669445991516113, + "learning_rate": 1.996130175230638e-05, + "loss": 1.0494, + "step": 7354 + }, + { + "epoch": 0.17154168865358432, + "grad_norm": 1.673045039176941, + "learning_rate": 1.9961279618522415e-05, + "loss": 1.4874, + "step": 7355 + }, + { + "epoch": 0.17156501179276223, + "grad_norm": 2.0173518657684326, + "learning_rate": 1.996125747842274e-05, + "loss": 1.5697, + "step": 7356 + }, + { + "epoch": 0.17158833493194017, + "grad_norm": 1.703830599784851, + "learning_rate": 1.9961235332007364e-05, + "loss": 1.1647, + "step": 7357 + }, + { + "epoch": 0.17161165807111808, + "grad_norm": 1.6575689315795898, + "learning_rate": 1.9961213179276306e-05, + "loss": 1.5479, + "step": 7358 + }, + { + "epoch": 0.171634981210296, + "grad_norm": 1.771578073501587, + "learning_rate": 1.9961191020229573e-05, + "loss": 1.2048, + "step": 7359 + }, + { + "epoch": 0.1716583043494739, + "grad_norm": 2.0495445728302, + "learning_rate": 1.9961168854867186e-05, + "loss": 1.1726, + "step": 7360 + }, + { + "epoch": 0.17168162748865184, + "grad_norm": 1.6176714897155762, + "learning_rate": 1.9961146683189155e-05, + "loss": 1.1814, + "step": 7361 + }, + { + "epoch": 0.17170495062782976, + "grad_norm": 1.6740190982818604, + "learning_rate": 1.9961124505195495e-05, + "loss": 1.3065, + "step": 7362 + }, + { + "epoch": 0.17172827376700767, + "grad_norm": 1.8634616136550903, + "learning_rate": 1.9961102320886224e-05, + "loss": 1.5083, + "step": 7363 + }, + { + "epoch": 0.17175159690618558, + "grad_norm": 1.2991069555282593, + "learning_rate": 1.996108013026135e-05, + "loss": 1.0809, + "step": 7364 + }, + { + "epoch": 0.17177492004536352, + "grad_norm": 1.7909940481185913, + "learning_rate": 1.996105793332089e-05, + "loss": 1.3165, + "step": 7365 + }, + { + "epoch": 0.17179824318454143, + "grad_norm": 2.052440643310547, + "learning_rate": 1.9961035730064856e-05, + "loss": 1.3404, + "step": 7366 + }, + { + "epoch": 0.17182156632371934, + "grad_norm": 2.088730573654175, + "learning_rate": 1.9961013520493265e-05, + "loss": 1.4628, + "step": 7367 + }, + { + "epoch": 0.17184488946289725, + "grad_norm": 2.1689155101776123, + "learning_rate": 1.9960991304606132e-05, + "loss": 1.3762, + "step": 7368 + }, + { + "epoch": 0.1718682126020752, + "grad_norm": 1.62769615650177, + "learning_rate": 1.9960969082403467e-05, + "loss": 1.3, + "step": 7369 + }, + { + "epoch": 0.1718915357412531, + "grad_norm": 2.146425247192383, + "learning_rate": 1.9960946853885284e-05, + "loss": 1.5153, + "step": 7370 + }, + { + "epoch": 0.171914858880431, + "grad_norm": 1.6554946899414062, + "learning_rate": 1.9960924619051603e-05, + "loss": 1.6156, + "step": 7371 + }, + { + "epoch": 0.17193818201960892, + "grad_norm": 1.7955783605575562, + "learning_rate": 1.9960902377902434e-05, + "loss": 1.2538, + "step": 7372 + }, + { + "epoch": 0.17196150515878686, + "grad_norm": 1.7063616514205933, + "learning_rate": 1.996088013043779e-05, + "loss": 1.5204, + "step": 7373 + }, + { + "epoch": 0.17198482829796477, + "grad_norm": 2.144882917404175, + "learning_rate": 1.996085787665769e-05, + "loss": 1.7344, + "step": 7374 + }, + { + "epoch": 0.17200815143714268, + "grad_norm": 1.6654056310653687, + "learning_rate": 1.996083561656214e-05, + "loss": 1.3071, + "step": 7375 + }, + { + "epoch": 0.1720314745763206, + "grad_norm": 2.0944314002990723, + "learning_rate": 1.996081335015116e-05, + "loss": 1.7446, + "step": 7376 + }, + { + "epoch": 0.17205479771549853, + "grad_norm": 1.9592459201812744, + "learning_rate": 1.9960791077424763e-05, + "loss": 1.7852, + "step": 7377 + }, + { + "epoch": 0.17207812085467644, + "grad_norm": 1.7489794492721558, + "learning_rate": 1.9960768798382962e-05, + "loss": 1.3748, + "step": 7378 + }, + { + "epoch": 0.17210144399385435, + "grad_norm": 2.324974536895752, + "learning_rate": 1.9960746513025775e-05, + "loss": 1.369, + "step": 7379 + }, + { + "epoch": 0.17212476713303226, + "grad_norm": 1.9958993196487427, + "learning_rate": 1.996072422135321e-05, + "loss": 1.6394, + "step": 7380 + }, + { + "epoch": 0.17214809027221017, + "grad_norm": 1.8678290843963623, + "learning_rate": 1.9960701923365286e-05, + "loss": 1.4112, + "step": 7381 + }, + { + "epoch": 0.1721714134113881, + "grad_norm": 2.0843727588653564, + "learning_rate": 1.9960679619062015e-05, + "loss": 1.4366, + "step": 7382 + }, + { + "epoch": 0.17219473655056602, + "grad_norm": 1.6551953554153442, + "learning_rate": 1.996065730844341e-05, + "loss": 1.1877, + "step": 7383 + }, + { + "epoch": 0.17221805968974394, + "grad_norm": 1.8490272760391235, + "learning_rate": 1.996063499150949e-05, + "loss": 1.4618, + "step": 7384 + }, + { + "epoch": 0.17224138282892185, + "grad_norm": 1.9955418109893799, + "learning_rate": 1.9960612668260263e-05, + "loss": 1.4489, + "step": 7385 + }, + { + "epoch": 0.17226470596809978, + "grad_norm": 1.737937331199646, + "learning_rate": 1.996059033869575e-05, + "loss": 1.4282, + "step": 7386 + }, + { + "epoch": 0.1722880291072777, + "grad_norm": 1.855420470237732, + "learning_rate": 1.9960568002815957e-05, + "loss": 1.1258, + "step": 7387 + }, + { + "epoch": 0.1723113522464556, + "grad_norm": 2.0366740226745605, + "learning_rate": 1.9960545660620904e-05, + "loss": 1.6804, + "step": 7388 + }, + { + "epoch": 0.17233467538563352, + "grad_norm": 1.880635142326355, + "learning_rate": 1.9960523312110603e-05, + "loss": 1.2003, + "step": 7389 + }, + { + "epoch": 0.17235799852481146, + "grad_norm": 1.6550692319869995, + "learning_rate": 1.996050095728507e-05, + "loss": 1.392, + "step": 7390 + }, + { + "epoch": 0.17238132166398937, + "grad_norm": 2.008336305618286, + "learning_rate": 1.9960478596144315e-05, + "loss": 1.3317, + "step": 7391 + }, + { + "epoch": 0.17240464480316728, + "grad_norm": 1.6898361444473267, + "learning_rate": 1.9960456228688358e-05, + "loss": 1.2289, + "step": 7392 + }, + { + "epoch": 0.1724279679423452, + "grad_norm": 2.0439491271972656, + "learning_rate": 1.996043385491721e-05, + "loss": 1.4943, + "step": 7393 + }, + { + "epoch": 0.17245129108152313, + "grad_norm": 2.73618745803833, + "learning_rate": 1.9960411474830885e-05, + "loss": 1.4013, + "step": 7394 + }, + { + "epoch": 0.17247461422070104, + "grad_norm": 1.8021255731582642, + "learning_rate": 1.9960389088429395e-05, + "loss": 1.5953, + "step": 7395 + }, + { + "epoch": 0.17249793735987895, + "grad_norm": 1.936721920967102, + "learning_rate": 1.9960366695712764e-05, + "loss": 1.4701, + "step": 7396 + }, + { + "epoch": 0.17252126049905686, + "grad_norm": 1.8027193546295166, + "learning_rate": 1.9960344296680994e-05, + "loss": 1.4896, + "step": 7397 + }, + { + "epoch": 0.1725445836382348, + "grad_norm": 1.808326005935669, + "learning_rate": 1.9960321891334105e-05, + "loss": 1.794, + "step": 7398 + }, + { + "epoch": 0.1725679067774127, + "grad_norm": 1.60789954662323, + "learning_rate": 1.996029947967211e-05, + "loss": 1.0746, + "step": 7399 + }, + { + "epoch": 0.17259122991659062, + "grad_norm": 2.0105931758880615, + "learning_rate": 1.9960277061695023e-05, + "loss": 1.3413, + "step": 7400 + }, + { + "epoch": 0.17261455305576853, + "grad_norm": 2.054696798324585, + "learning_rate": 1.9960254637402857e-05, + "loss": 1.1728, + "step": 7401 + }, + { + "epoch": 0.17263787619494647, + "grad_norm": 1.643372893333435, + "learning_rate": 1.9960232206795633e-05, + "loss": 1.3505, + "step": 7402 + }, + { + "epoch": 0.17266119933412438, + "grad_norm": 1.7273507118225098, + "learning_rate": 1.996020976987336e-05, + "loss": 1.4824, + "step": 7403 + }, + { + "epoch": 0.1726845224733023, + "grad_norm": 1.667300820350647, + "learning_rate": 1.996018732663605e-05, + "loss": 1.6327, + "step": 7404 + }, + { + "epoch": 0.1727078456124802, + "grad_norm": 1.8596947193145752, + "learning_rate": 1.9960164877083715e-05, + "loss": 1.6462, + "step": 7405 + }, + { + "epoch": 0.17273116875165814, + "grad_norm": 1.7757184505462646, + "learning_rate": 1.9960142421216383e-05, + "loss": 1.4335, + "step": 7406 + }, + { + "epoch": 0.17275449189083605, + "grad_norm": 1.6777504682540894, + "learning_rate": 1.9960119959034055e-05, + "loss": 1.5025, + "step": 7407 + }, + { + "epoch": 0.17277781503001396, + "grad_norm": 1.6041209697723389, + "learning_rate": 1.9960097490536747e-05, + "loss": 1.2403, + "step": 7408 + }, + { + "epoch": 0.17280113816919188, + "grad_norm": 1.8389767408370972, + "learning_rate": 1.996007501572448e-05, + "loss": 1.5236, + "step": 7409 + }, + { + "epoch": 0.1728244613083698, + "grad_norm": 1.706955909729004, + "learning_rate": 1.996005253459726e-05, + "loss": 1.4785, + "step": 7410 + }, + { + "epoch": 0.17284778444754773, + "grad_norm": 1.508392572402954, + "learning_rate": 1.996003004715511e-05, + "loss": 1.3597, + "step": 7411 + }, + { + "epoch": 0.17287110758672564, + "grad_norm": 1.642703652381897, + "learning_rate": 1.9960007553398037e-05, + "loss": 1.4064, + "step": 7412 + }, + { + "epoch": 0.17289443072590355, + "grad_norm": 1.712317943572998, + "learning_rate": 1.9959985053326058e-05, + "loss": 1.4196, + "step": 7413 + }, + { + "epoch": 0.17291775386508146, + "grad_norm": 1.5978766679763794, + "learning_rate": 1.9959962546939187e-05, + "loss": 1.1459, + "step": 7414 + }, + { + "epoch": 0.1729410770042594, + "grad_norm": 2.1440157890319824, + "learning_rate": 1.995994003423744e-05, + "loss": 1.2601, + "step": 7415 + }, + { + "epoch": 0.1729644001434373, + "grad_norm": 2.0604898929595947, + "learning_rate": 1.9959917515220826e-05, + "loss": 1.501, + "step": 7416 + }, + { + "epoch": 0.17298772328261522, + "grad_norm": 1.9424153566360474, + "learning_rate": 1.9959894989889362e-05, + "loss": 1.5569, + "step": 7417 + }, + { + "epoch": 0.17301104642179313, + "grad_norm": 2.0561015605926514, + "learning_rate": 1.9959872458243068e-05, + "loss": 1.4556, + "step": 7418 + }, + { + "epoch": 0.17303436956097107, + "grad_norm": 1.8013122081756592, + "learning_rate": 1.995984992028195e-05, + "loss": 1.6372, + "step": 7419 + }, + { + "epoch": 0.17305769270014898, + "grad_norm": 1.5892102718353271, + "learning_rate": 1.995982737600603e-05, + "loss": 1.7014, + "step": 7420 + }, + { + "epoch": 0.1730810158393269, + "grad_norm": 2.5263142585754395, + "learning_rate": 1.995980482541531e-05, + "loss": 1.1461, + "step": 7421 + }, + { + "epoch": 0.1731043389785048, + "grad_norm": 2.1230969429016113, + "learning_rate": 1.9959782268509816e-05, + "loss": 1.5626, + "step": 7422 + }, + { + "epoch": 0.17312766211768274, + "grad_norm": 1.638551950454712, + "learning_rate": 1.9959759705289562e-05, + "loss": 1.4081, + "step": 7423 + }, + { + "epoch": 0.17315098525686065, + "grad_norm": 2.00850248336792, + "learning_rate": 1.9959737135754556e-05, + "loss": 1.501, + "step": 7424 + }, + { + "epoch": 0.17317430839603856, + "grad_norm": 1.6464558839797974, + "learning_rate": 1.9959714559904814e-05, + "loss": 1.2613, + "step": 7425 + }, + { + "epoch": 0.17319763153521647, + "grad_norm": 1.97989821434021, + "learning_rate": 1.9959691977740353e-05, + "loss": 1.3028, + "step": 7426 + }, + { + "epoch": 0.1732209546743944, + "grad_norm": 2.013129472732544, + "learning_rate": 1.9959669389261184e-05, + "loss": 1.8812, + "step": 7427 + }, + { + "epoch": 0.17324427781357232, + "grad_norm": 2.034947156906128, + "learning_rate": 1.9959646794467326e-05, + "loss": 1.3143, + "step": 7428 + }, + { + "epoch": 0.17326760095275023, + "grad_norm": 1.6890825033187866, + "learning_rate": 1.9959624193358787e-05, + "loss": 1.4729, + "step": 7429 + }, + { + "epoch": 0.17329092409192814, + "grad_norm": 2.0438952445983887, + "learning_rate": 1.9959601585935587e-05, + "loss": 1.4283, + "step": 7430 + }, + { + "epoch": 0.17331424723110608, + "grad_norm": 1.687237024307251, + "learning_rate": 1.995957897219774e-05, + "loss": 1.1062, + "step": 7431 + }, + { + "epoch": 0.173337570370284, + "grad_norm": 1.8502991199493408, + "learning_rate": 1.9959556352145255e-05, + "loss": 1.3845, + "step": 7432 + }, + { + "epoch": 0.1733608935094619, + "grad_norm": 1.6245472431182861, + "learning_rate": 1.995953372577815e-05, + "loss": 1.4537, + "step": 7433 + }, + { + "epoch": 0.17338421664863982, + "grad_norm": 1.8942934274673462, + "learning_rate": 1.9959511093096444e-05, + "loss": 1.2242, + "step": 7434 + }, + { + "epoch": 0.17340753978781775, + "grad_norm": 1.7095831632614136, + "learning_rate": 1.995948845410014e-05, + "loss": 1.6305, + "step": 7435 + }, + { + "epoch": 0.17343086292699567, + "grad_norm": 1.8893944025039673, + "learning_rate": 1.9959465808789262e-05, + "loss": 1.5255, + "step": 7436 + }, + { + "epoch": 0.17345418606617358, + "grad_norm": 1.8854620456695557, + "learning_rate": 1.995944315716382e-05, + "loss": 1.5482, + "step": 7437 + }, + { + "epoch": 0.1734775092053515, + "grad_norm": 2.0252392292022705, + "learning_rate": 1.995942049922383e-05, + "loss": 1.4968, + "step": 7438 + }, + { + "epoch": 0.1735008323445294, + "grad_norm": 1.642406702041626, + "learning_rate": 1.995939783496931e-05, + "loss": 1.3289, + "step": 7439 + }, + { + "epoch": 0.17352415548370734, + "grad_norm": 1.6644012928009033, + "learning_rate": 1.9959375164400265e-05, + "loss": 1.3511, + "step": 7440 + }, + { + "epoch": 0.17354747862288525, + "grad_norm": 1.755669355392456, + "learning_rate": 1.9959352487516717e-05, + "loss": 1.118, + "step": 7441 + }, + { + "epoch": 0.17357080176206316, + "grad_norm": 1.8327054977416992, + "learning_rate": 1.9959329804318678e-05, + "loss": 1.3437, + "step": 7442 + }, + { + "epoch": 0.17359412490124107, + "grad_norm": 2.2309327125549316, + "learning_rate": 1.995930711480616e-05, + "loss": 1.278, + "step": 7443 + }, + { + "epoch": 0.173617448040419, + "grad_norm": 1.882546067237854, + "learning_rate": 1.9959284418979182e-05, + "loss": 1.2738, + "step": 7444 + }, + { + "epoch": 0.17364077117959692, + "grad_norm": 1.9131346940994263, + "learning_rate": 1.9959261716837756e-05, + "loss": 1.2236, + "step": 7445 + }, + { + "epoch": 0.17366409431877483, + "grad_norm": 1.5388514995574951, + "learning_rate": 1.9959239008381896e-05, + "loss": 1.2165, + "step": 7446 + }, + { + "epoch": 0.17368741745795274, + "grad_norm": 1.828491449356079, + "learning_rate": 1.995921629361162e-05, + "loss": 1.3933, + "step": 7447 + }, + { + "epoch": 0.17371074059713068, + "grad_norm": 1.8521742820739746, + "learning_rate": 1.9959193572526934e-05, + "loss": 1.3324, + "step": 7448 + }, + { + "epoch": 0.1737340637363086, + "grad_norm": 1.6978318691253662, + "learning_rate": 1.9959170845127863e-05, + "loss": 1.0602, + "step": 7449 + }, + { + "epoch": 0.1737573868754865, + "grad_norm": 1.8695061206817627, + "learning_rate": 1.9959148111414412e-05, + "loss": 1.1591, + "step": 7450 + }, + { + "epoch": 0.1737807100146644, + "grad_norm": 2.5344576835632324, + "learning_rate": 1.9959125371386602e-05, + "loss": 1.3498, + "step": 7451 + }, + { + "epoch": 0.17380403315384235, + "grad_norm": 1.913453459739685, + "learning_rate": 1.9959102625044445e-05, + "loss": 1.3066, + "step": 7452 + }, + { + "epoch": 0.17382735629302026, + "grad_norm": 1.6568899154663086, + "learning_rate": 1.9959079872387956e-05, + "loss": 1.3877, + "step": 7453 + }, + { + "epoch": 0.17385067943219817, + "grad_norm": 1.9239006042480469, + "learning_rate": 1.9959057113417145e-05, + "loss": 1.5509, + "step": 7454 + }, + { + "epoch": 0.17387400257137609, + "grad_norm": 2.2151055335998535, + "learning_rate": 1.9959034348132035e-05, + "loss": 1.9646, + "step": 7455 + }, + { + "epoch": 0.17389732571055402, + "grad_norm": 1.6090549230575562, + "learning_rate": 1.9959011576532633e-05, + "loss": 1.3097, + "step": 7456 + }, + { + "epoch": 0.17392064884973193, + "grad_norm": 2.2213821411132812, + "learning_rate": 1.995898879861896e-05, + "loss": 1.4774, + "step": 7457 + }, + { + "epoch": 0.17394397198890985, + "grad_norm": 1.7382277250289917, + "learning_rate": 1.9958966014391022e-05, + "loss": 1.7051, + "step": 7458 + }, + { + "epoch": 0.17396729512808776, + "grad_norm": 1.491949200630188, + "learning_rate": 1.995894322384884e-05, + "loss": 1.2318, + "step": 7459 + }, + { + "epoch": 0.1739906182672657, + "grad_norm": 1.8409011363983154, + "learning_rate": 1.9958920426992427e-05, + "loss": 1.3673, + "step": 7460 + }, + { + "epoch": 0.1740139414064436, + "grad_norm": 1.8713476657867432, + "learning_rate": 1.9958897623821796e-05, + "loss": 1.2973, + "step": 7461 + }, + { + "epoch": 0.17403726454562152, + "grad_norm": 1.773823857307434, + "learning_rate": 1.9958874814336962e-05, + "loss": 1.4187, + "step": 7462 + }, + { + "epoch": 0.17406058768479943, + "grad_norm": 2.2369003295898438, + "learning_rate": 1.995885199853794e-05, + "loss": 1.5246, + "step": 7463 + }, + { + "epoch": 0.17408391082397737, + "grad_norm": 2.2250094413757324, + "learning_rate": 1.9958829176424747e-05, + "loss": 1.6894, + "step": 7464 + }, + { + "epoch": 0.17410723396315528, + "grad_norm": 2.0792441368103027, + "learning_rate": 1.9958806347997394e-05, + "loss": 1.4963, + "step": 7465 + }, + { + "epoch": 0.1741305571023332, + "grad_norm": 1.9938381910324097, + "learning_rate": 1.9958783513255894e-05, + "loss": 1.3687, + "step": 7466 + }, + { + "epoch": 0.1741538802415111, + "grad_norm": 1.6669647693634033, + "learning_rate": 1.9958760672200265e-05, + "loss": 1.4074, + "step": 7467 + }, + { + "epoch": 0.174177203380689, + "grad_norm": 1.8183245658874512, + "learning_rate": 1.995873782483052e-05, + "loss": 1.289, + "step": 7468 + }, + { + "epoch": 0.17420052651986695, + "grad_norm": 1.5924521684646606, + "learning_rate": 1.9958714971146677e-05, + "loss": 1.4224, + "step": 7469 + }, + { + "epoch": 0.17422384965904486, + "grad_norm": 2.1422579288482666, + "learning_rate": 1.995869211114874e-05, + "loss": 1.5068, + "step": 7470 + }, + { + "epoch": 0.17424717279822277, + "grad_norm": 1.45002281665802, + "learning_rate": 1.9958669244836738e-05, + "loss": 1.2915, + "step": 7471 + }, + { + "epoch": 0.17427049593740068, + "grad_norm": 1.591386318206787, + "learning_rate": 1.9958646372210676e-05, + "loss": 1.4222, + "step": 7472 + }, + { + "epoch": 0.17429381907657862, + "grad_norm": 1.8106725215911865, + "learning_rate": 1.995862349327057e-05, + "loss": 1.762, + "step": 7473 + }, + { + "epoch": 0.17431714221575653, + "grad_norm": 1.7604529857635498, + "learning_rate": 1.9958600608016434e-05, + "loss": 1.7642, + "step": 7474 + }, + { + "epoch": 0.17434046535493444, + "grad_norm": 1.7146596908569336, + "learning_rate": 1.9958577716448286e-05, + "loss": 1.524, + "step": 7475 + }, + { + "epoch": 0.17436378849411235, + "grad_norm": 1.6630059480667114, + "learning_rate": 1.9958554818566135e-05, + "loss": 1.1701, + "step": 7476 + }, + { + "epoch": 0.1743871116332903, + "grad_norm": 1.6740410327911377, + "learning_rate": 1.9958531914370007e-05, + "loss": 1.3548, + "step": 7477 + }, + { + "epoch": 0.1744104347724682, + "grad_norm": 1.7843668460845947, + "learning_rate": 1.99585090038599e-05, + "loss": 1.405, + "step": 7478 + }, + { + "epoch": 0.17443375791164611, + "grad_norm": 1.7200489044189453, + "learning_rate": 1.9958486087035838e-05, + "loss": 1.0387, + "step": 7479 + }, + { + "epoch": 0.17445708105082403, + "grad_norm": 1.8557630777359009, + "learning_rate": 1.9958463163897837e-05, + "loss": 1.2048, + "step": 7480 + }, + { + "epoch": 0.17448040419000196, + "grad_norm": 2.407553195953369, + "learning_rate": 1.9958440234445907e-05, + "loss": 1.537, + "step": 7481 + }, + { + "epoch": 0.17450372732917988, + "grad_norm": 2.958691120147705, + "learning_rate": 1.9958417298680065e-05, + "loss": 1.9173, + "step": 7482 + }, + { + "epoch": 0.1745270504683578, + "grad_norm": 1.8521904945373535, + "learning_rate": 1.9958394356600324e-05, + "loss": 1.399, + "step": 7483 + }, + { + "epoch": 0.1745503736075357, + "grad_norm": 1.7346025705337524, + "learning_rate": 1.99583714082067e-05, + "loss": 1.1985, + "step": 7484 + }, + { + "epoch": 0.17457369674671364, + "grad_norm": 1.4364285469055176, + "learning_rate": 1.995834845349921e-05, + "loss": 0.8637, + "step": 7485 + }, + { + "epoch": 0.17459701988589155, + "grad_norm": 1.8489397764205933, + "learning_rate": 1.9958325492477864e-05, + "loss": 1.5346, + "step": 7486 + }, + { + "epoch": 0.17462034302506946, + "grad_norm": 2.029994010925293, + "learning_rate": 1.9958302525142677e-05, + "loss": 1.4969, + "step": 7487 + }, + { + "epoch": 0.17464366616424737, + "grad_norm": 1.786665916442871, + "learning_rate": 1.9958279551493666e-05, + "loss": 1.5858, + "step": 7488 + }, + { + "epoch": 0.1746669893034253, + "grad_norm": 1.6301120519638062, + "learning_rate": 1.9958256571530844e-05, + "loss": 1.6973, + "step": 7489 + }, + { + "epoch": 0.17469031244260322, + "grad_norm": 1.9332666397094727, + "learning_rate": 1.9958233585254225e-05, + "loss": 1.4344, + "step": 7490 + }, + { + "epoch": 0.17471363558178113, + "grad_norm": 1.5923314094543457, + "learning_rate": 1.9958210592663826e-05, + "loss": 1.5026, + "step": 7491 + }, + { + "epoch": 0.17473695872095904, + "grad_norm": 1.6616913080215454, + "learning_rate": 1.9958187593759656e-05, + "loss": 1.5028, + "step": 7492 + }, + { + "epoch": 0.17476028186013698, + "grad_norm": 1.801474928855896, + "learning_rate": 1.9958164588541737e-05, + "loss": 1.3489, + "step": 7493 + }, + { + "epoch": 0.1747836049993149, + "grad_norm": 2.3320960998535156, + "learning_rate": 1.9958141577010082e-05, + "loss": 1.6853, + "step": 7494 + }, + { + "epoch": 0.1748069281384928, + "grad_norm": 2.266580820083618, + "learning_rate": 1.99581185591647e-05, + "loss": 1.3332, + "step": 7495 + }, + { + "epoch": 0.1748302512776707, + "grad_norm": 2.0603036880493164, + "learning_rate": 1.995809553500561e-05, + "loss": 1.4419, + "step": 7496 + }, + { + "epoch": 0.17485357441684862, + "grad_norm": 2.0398075580596924, + "learning_rate": 1.995807250453283e-05, + "loss": 1.6396, + "step": 7497 + }, + { + "epoch": 0.17487689755602656, + "grad_norm": 1.5097776651382446, + "learning_rate": 1.9958049467746364e-05, + "loss": 1.1263, + "step": 7498 + }, + { + "epoch": 0.17490022069520447, + "grad_norm": 2.1187610626220703, + "learning_rate": 1.9958026424646238e-05, + "loss": 1.205, + "step": 7499 + }, + { + "epoch": 0.17492354383438238, + "grad_norm": 2.2780778408050537, + "learning_rate": 1.995800337523246e-05, + "loss": 1.2434, + "step": 7500 + }, + { + "epoch": 0.1749468669735603, + "grad_norm": 1.7291078567504883, + "learning_rate": 1.995798031950505e-05, + "loss": 1.3686, + "step": 7501 + }, + { + "epoch": 0.17497019011273823, + "grad_norm": 1.6485790014266968, + "learning_rate": 1.9957957257464016e-05, + "loss": 1.3344, + "step": 7502 + }, + { + "epoch": 0.17499351325191614, + "grad_norm": 1.811347484588623, + "learning_rate": 1.9957934189109372e-05, + "loss": 1.1058, + "step": 7503 + }, + { + "epoch": 0.17501683639109405, + "grad_norm": 1.794769287109375, + "learning_rate": 1.9957911114441142e-05, + "loss": 1.3808, + "step": 7504 + }, + { + "epoch": 0.17504015953027197, + "grad_norm": 1.6334198713302612, + "learning_rate": 1.995788803345933e-05, + "loss": 1.2926, + "step": 7505 + }, + { + "epoch": 0.1750634826694499, + "grad_norm": 2.117227077484131, + "learning_rate": 1.9957864946163963e-05, + "loss": 1.2627, + "step": 7506 + }, + { + "epoch": 0.17508680580862782, + "grad_norm": 1.6352413892745972, + "learning_rate": 1.9957841852555042e-05, + "loss": 1.4342, + "step": 7507 + }, + { + "epoch": 0.17511012894780573, + "grad_norm": 1.7545527219772339, + "learning_rate": 1.995781875263259e-05, + "loss": 1.0597, + "step": 7508 + }, + { + "epoch": 0.17513345208698364, + "grad_norm": 1.9575591087341309, + "learning_rate": 1.995779564639662e-05, + "loss": 1.2881, + "step": 7509 + }, + { + "epoch": 0.17515677522616158, + "grad_norm": 1.7080249786376953, + "learning_rate": 1.9957772533847144e-05, + "loss": 1.4235, + "step": 7510 + }, + { + "epoch": 0.1751800983653395, + "grad_norm": 1.7985914945602417, + "learning_rate": 1.995774941498418e-05, + "loss": 1.4527, + "step": 7511 + }, + { + "epoch": 0.1752034215045174, + "grad_norm": 3.360142469406128, + "learning_rate": 1.995772628980774e-05, + "loss": 1.4593, + "step": 7512 + }, + { + "epoch": 0.1752267446436953, + "grad_norm": 1.7174075841903687, + "learning_rate": 1.9957703158317843e-05, + "loss": 1.4802, + "step": 7513 + }, + { + "epoch": 0.17525006778287325, + "grad_norm": 2.442721128463745, + "learning_rate": 1.9957680020514497e-05, + "loss": 1.5331, + "step": 7514 + }, + { + "epoch": 0.17527339092205116, + "grad_norm": 1.6225166320800781, + "learning_rate": 1.9957656876397723e-05, + "loss": 1.4475, + "step": 7515 + }, + { + "epoch": 0.17529671406122907, + "grad_norm": 1.7712762355804443, + "learning_rate": 1.9957633725967535e-05, + "loss": 1.6375, + "step": 7516 + }, + { + "epoch": 0.17532003720040698, + "grad_norm": 1.7618972063064575, + "learning_rate": 1.9957610569223943e-05, + "loss": 1.5224, + "step": 7517 + }, + { + "epoch": 0.17534336033958492, + "grad_norm": 1.7801103591918945, + "learning_rate": 1.9957587406166963e-05, + "loss": 1.7208, + "step": 7518 + }, + { + "epoch": 0.17536668347876283, + "grad_norm": 1.8361573219299316, + "learning_rate": 1.9957564236796614e-05, + "loss": 1.6857, + "step": 7519 + }, + { + "epoch": 0.17539000661794074, + "grad_norm": 1.9152382612228394, + "learning_rate": 1.9957541061112907e-05, + "loss": 1.3629, + "step": 7520 + }, + { + "epoch": 0.17541332975711865, + "grad_norm": 1.6304298639297485, + "learning_rate": 1.9957517879115858e-05, + "loss": 1.3315, + "step": 7521 + }, + { + "epoch": 0.17543665289629656, + "grad_norm": 1.9285849332809448, + "learning_rate": 1.9957494690805483e-05, + "loss": 1.6289, + "step": 7522 + }, + { + "epoch": 0.1754599760354745, + "grad_norm": 1.9097024202346802, + "learning_rate": 1.995747149618179e-05, + "loss": 1.6606, + "step": 7523 + }, + { + "epoch": 0.1754832991746524, + "grad_norm": 1.999050498008728, + "learning_rate": 1.99574482952448e-05, + "loss": 1.6991, + "step": 7524 + }, + { + "epoch": 0.17550662231383032, + "grad_norm": 1.9476864337921143, + "learning_rate": 1.9957425087994527e-05, + "loss": 1.7806, + "step": 7525 + }, + { + "epoch": 0.17552994545300823, + "grad_norm": 1.531092882156372, + "learning_rate": 1.995740187443099e-05, + "loss": 1.2195, + "step": 7526 + }, + { + "epoch": 0.17555326859218617, + "grad_norm": 1.938628077507019, + "learning_rate": 1.9957378654554193e-05, + "loss": 1.3229, + "step": 7527 + }, + { + "epoch": 0.17557659173136408, + "grad_norm": 1.8217439651489258, + "learning_rate": 1.995735542836416e-05, + "loss": 1.3692, + "step": 7528 + }, + { + "epoch": 0.175599914870542, + "grad_norm": 2.082664728164673, + "learning_rate": 1.99573321958609e-05, + "loss": 1.568, + "step": 7529 + }, + { + "epoch": 0.1756232380097199, + "grad_norm": 1.7567086219787598, + "learning_rate": 1.995730895704443e-05, + "loss": 1.4269, + "step": 7530 + }, + { + "epoch": 0.17564656114889785, + "grad_norm": 2.4646668434143066, + "learning_rate": 1.9957285711914765e-05, + "loss": 1.3296, + "step": 7531 + }, + { + "epoch": 0.17566988428807576, + "grad_norm": 1.582240104675293, + "learning_rate": 1.995726246047192e-05, + "loss": 1.2029, + "step": 7532 + }, + { + "epoch": 0.17569320742725367, + "grad_norm": 1.7753610610961914, + "learning_rate": 1.9957239202715905e-05, + "loss": 1.3487, + "step": 7533 + }, + { + "epoch": 0.17571653056643158, + "grad_norm": 1.64071786403656, + "learning_rate": 1.9957215938646747e-05, + "loss": 1.609, + "step": 7534 + }, + { + "epoch": 0.17573985370560952, + "grad_norm": 1.8634204864501953, + "learning_rate": 1.9957192668264446e-05, + "loss": 1.3848, + "step": 7535 + }, + { + "epoch": 0.17576317684478743, + "grad_norm": 2.1266956329345703, + "learning_rate": 1.9957169391569025e-05, + "loss": 1.5076, + "step": 7536 + }, + { + "epoch": 0.17578649998396534, + "grad_norm": 1.9144436120986938, + "learning_rate": 1.9957146108560498e-05, + "loss": 1.5407, + "step": 7537 + }, + { + "epoch": 0.17580982312314325, + "grad_norm": 1.3567081689834595, + "learning_rate": 1.995712281923888e-05, + "loss": 1.1153, + "step": 7538 + }, + { + "epoch": 0.1758331462623212, + "grad_norm": 1.4810099601745605, + "learning_rate": 1.9957099523604183e-05, + "loss": 1.4039, + "step": 7539 + }, + { + "epoch": 0.1758564694014991, + "grad_norm": 1.4987905025482178, + "learning_rate": 1.9957076221656423e-05, + "loss": 1.2833, + "step": 7540 + }, + { + "epoch": 0.175879792540677, + "grad_norm": 1.7592594623565674, + "learning_rate": 1.9957052913395615e-05, + "loss": 1.4488, + "step": 7541 + }, + { + "epoch": 0.17590311567985492, + "grad_norm": 2.0585262775421143, + "learning_rate": 1.995702959882178e-05, + "loss": 1.8028, + "step": 7542 + }, + { + "epoch": 0.17592643881903286, + "grad_norm": 1.829224705696106, + "learning_rate": 1.9957006277934917e-05, + "loss": 1.565, + "step": 7543 + }, + { + "epoch": 0.17594976195821077, + "grad_norm": 1.5405752658843994, + "learning_rate": 1.995698295073506e-05, + "loss": 1.2102, + "step": 7544 + }, + { + "epoch": 0.17597308509738868, + "grad_norm": 2.2146971225738525, + "learning_rate": 1.995695961722221e-05, + "loss": 1.3387, + "step": 7545 + }, + { + "epoch": 0.1759964082365666, + "grad_norm": 1.6367859840393066, + "learning_rate": 1.9956936277396382e-05, + "loss": 1.4268, + "step": 7546 + }, + { + "epoch": 0.17601973137574453, + "grad_norm": 1.6368263959884644, + "learning_rate": 1.9956912931257602e-05, + "loss": 1.4074, + "step": 7547 + }, + { + "epoch": 0.17604305451492244, + "grad_norm": 1.9027099609375, + "learning_rate": 1.9956889578805875e-05, + "loss": 1.5604, + "step": 7548 + }, + { + "epoch": 0.17606637765410035, + "grad_norm": 1.6178146600723267, + "learning_rate": 1.995686622004122e-05, + "loss": 1.2442, + "step": 7549 + }, + { + "epoch": 0.17608970079327826, + "grad_norm": 1.9754891395568848, + "learning_rate": 1.995684285496365e-05, + "loss": 1.4551, + "step": 7550 + }, + { + "epoch": 0.17611302393245618, + "grad_norm": 1.6860450506210327, + "learning_rate": 1.9956819483573177e-05, + "loss": 1.4199, + "step": 7551 + }, + { + "epoch": 0.17613634707163411, + "grad_norm": 1.6076298952102661, + "learning_rate": 1.9956796105869822e-05, + "loss": 1.3903, + "step": 7552 + }, + { + "epoch": 0.17615967021081202, + "grad_norm": 2.156200408935547, + "learning_rate": 1.9956772721853595e-05, + "loss": 1.185, + "step": 7553 + }, + { + "epoch": 0.17618299334998994, + "grad_norm": 1.792455792427063, + "learning_rate": 1.9956749331524517e-05, + "loss": 1.1988, + "step": 7554 + }, + { + "epoch": 0.17620631648916785, + "grad_norm": 2.15897798538208, + "learning_rate": 1.9956725934882593e-05, + "loss": 1.5833, + "step": 7555 + }, + { + "epoch": 0.17622963962834579, + "grad_norm": 2.179049253463745, + "learning_rate": 1.9956702531927848e-05, + "loss": 1.294, + "step": 7556 + }, + { + "epoch": 0.1762529627675237, + "grad_norm": 1.5361319780349731, + "learning_rate": 1.995667912266029e-05, + "loss": 1.2582, + "step": 7557 + }, + { + "epoch": 0.1762762859067016, + "grad_norm": 1.6376020908355713, + "learning_rate": 1.9956655707079935e-05, + "loss": 1.4979, + "step": 7558 + }, + { + "epoch": 0.17629960904587952, + "grad_norm": 1.8840078115463257, + "learning_rate": 1.99566322851868e-05, + "loss": 1.2726, + "step": 7559 + }, + { + "epoch": 0.17632293218505746, + "grad_norm": 2.3798086643218994, + "learning_rate": 1.99566088569809e-05, + "loss": 1.2838, + "step": 7560 + }, + { + "epoch": 0.17634625532423537, + "grad_norm": 1.5987768173217773, + "learning_rate": 1.9956585422462247e-05, + "loss": 1.2724, + "step": 7561 + }, + { + "epoch": 0.17636957846341328, + "grad_norm": 1.6699000597000122, + "learning_rate": 1.9956561981630855e-05, + "loss": 1.6578, + "step": 7562 + }, + { + "epoch": 0.1763929016025912, + "grad_norm": 1.581972599029541, + "learning_rate": 1.9956538534486746e-05, + "loss": 1.3302, + "step": 7563 + }, + { + "epoch": 0.17641622474176913, + "grad_norm": 1.8507814407348633, + "learning_rate": 1.995651508102993e-05, + "loss": 1.6153, + "step": 7564 + }, + { + "epoch": 0.17643954788094704, + "grad_norm": 1.7788665294647217, + "learning_rate": 1.9956491621260418e-05, + "loss": 1.5928, + "step": 7565 + }, + { + "epoch": 0.17646287102012495, + "grad_norm": 1.8744378089904785, + "learning_rate": 1.9956468155178232e-05, + "loss": 1.2372, + "step": 7566 + }, + { + "epoch": 0.17648619415930286, + "grad_norm": 2.0118894577026367, + "learning_rate": 1.995644468278338e-05, + "loss": 1.6873, + "step": 7567 + }, + { + "epoch": 0.1765095172984808, + "grad_norm": 1.5521833896636963, + "learning_rate": 1.9956421204075887e-05, + "loss": 1.4139, + "step": 7568 + }, + { + "epoch": 0.1765328404376587, + "grad_norm": 2.0811495780944824, + "learning_rate": 1.995639771905576e-05, + "loss": 1.2691, + "step": 7569 + }, + { + "epoch": 0.17655616357683662, + "grad_norm": 1.9467955827713013, + "learning_rate": 1.995637422772301e-05, + "loss": 1.1366, + "step": 7570 + }, + { + "epoch": 0.17657948671601453, + "grad_norm": 2.0793039798736572, + "learning_rate": 1.995635073007766e-05, + "loss": 1.1672, + "step": 7571 + }, + { + "epoch": 0.17660280985519247, + "grad_norm": 1.8685739040374756, + "learning_rate": 1.9956327226119722e-05, + "loss": 1.2943, + "step": 7572 + }, + { + "epoch": 0.17662613299437038, + "grad_norm": 1.592610239982605, + "learning_rate": 1.9956303715849213e-05, + "loss": 1.2456, + "step": 7573 + }, + { + "epoch": 0.1766494561335483, + "grad_norm": 2.042036771774292, + "learning_rate": 1.9956280199266147e-05, + "loss": 1.5035, + "step": 7574 + }, + { + "epoch": 0.1766727792727262, + "grad_norm": 1.7709101438522339, + "learning_rate": 1.9956256676370537e-05, + "loss": 0.7421, + "step": 7575 + }, + { + "epoch": 0.17669610241190414, + "grad_norm": 1.701223611831665, + "learning_rate": 1.9956233147162397e-05, + "loss": 1.2938, + "step": 7576 + }, + { + "epoch": 0.17671942555108205, + "grad_norm": 1.7159689664840698, + "learning_rate": 1.9956209611641744e-05, + "loss": 1.4417, + "step": 7577 + }, + { + "epoch": 0.17674274869025997, + "grad_norm": 1.8911962509155273, + "learning_rate": 1.9956186069808595e-05, + "loss": 1.4872, + "step": 7578 + }, + { + "epoch": 0.17676607182943788, + "grad_norm": 2.066342353820801, + "learning_rate": 1.995616252166296e-05, + "loss": 1.3539, + "step": 7579 + }, + { + "epoch": 0.1767893949686158, + "grad_norm": 1.6544362306594849, + "learning_rate": 1.995613896720486e-05, + "loss": 1.4763, + "step": 7580 + }, + { + "epoch": 0.17681271810779373, + "grad_norm": 1.7198089361190796, + "learning_rate": 1.9956115406434304e-05, + "loss": 1.4307, + "step": 7581 + }, + { + "epoch": 0.17683604124697164, + "grad_norm": 1.7077969312667847, + "learning_rate": 1.995609183935131e-05, + "loss": 1.4646, + "step": 7582 + }, + { + "epoch": 0.17685936438614955, + "grad_norm": 2.0743556022644043, + "learning_rate": 1.995606826595589e-05, + "loss": 1.2905, + "step": 7583 + }, + { + "epoch": 0.17688268752532746, + "grad_norm": 1.6784299612045288, + "learning_rate": 1.9956044686248063e-05, + "loss": 1.1482, + "step": 7584 + }, + { + "epoch": 0.1769060106645054, + "grad_norm": 1.6246466636657715, + "learning_rate": 1.9956021100227844e-05, + "loss": 1.4036, + "step": 7585 + }, + { + "epoch": 0.1769293338036833, + "grad_norm": 1.8272980451583862, + "learning_rate": 1.9955997507895245e-05, + "loss": 1.4105, + "step": 7586 + }, + { + "epoch": 0.17695265694286122, + "grad_norm": 1.8344686031341553, + "learning_rate": 1.9955973909250285e-05, + "loss": 1.4691, + "step": 7587 + }, + { + "epoch": 0.17697598008203913, + "grad_norm": 1.8780725002288818, + "learning_rate": 1.995595030429297e-05, + "loss": 1.7068, + "step": 7588 + }, + { + "epoch": 0.17699930322121707, + "grad_norm": 1.601305365562439, + "learning_rate": 1.9955926693023327e-05, + "loss": 1.1699, + "step": 7589 + }, + { + "epoch": 0.17702262636039498, + "grad_norm": 2.2963593006134033, + "learning_rate": 1.9955903075441364e-05, + "loss": 1.3301, + "step": 7590 + }, + { + "epoch": 0.1770459494995729, + "grad_norm": 2.3291563987731934, + "learning_rate": 1.9955879451547093e-05, + "loss": 1.3803, + "step": 7591 + }, + { + "epoch": 0.1770692726387508, + "grad_norm": 1.7723854780197144, + "learning_rate": 1.9955855821340537e-05, + "loss": 1.4771, + "step": 7592 + }, + { + "epoch": 0.17709259577792874, + "grad_norm": 1.6509628295898438, + "learning_rate": 1.9955832184821705e-05, + "loss": 1.5636, + "step": 7593 + }, + { + "epoch": 0.17711591891710665, + "grad_norm": 2.0127668380737305, + "learning_rate": 1.9955808541990614e-05, + "loss": 1.4667, + "step": 7594 + }, + { + "epoch": 0.17713924205628456, + "grad_norm": 1.786911129951477, + "learning_rate": 1.995578489284728e-05, + "loss": 1.4516, + "step": 7595 + }, + { + "epoch": 0.17716256519546247, + "grad_norm": 1.5668991804122925, + "learning_rate": 1.9955761237391717e-05, + "loss": 1.2536, + "step": 7596 + }, + { + "epoch": 0.1771858883346404, + "grad_norm": 2.0865986347198486, + "learning_rate": 1.995573757562394e-05, + "loss": 1.2561, + "step": 7597 + }, + { + "epoch": 0.17720921147381832, + "grad_norm": 1.6549034118652344, + "learning_rate": 1.9955713907543966e-05, + "loss": 1.5479, + "step": 7598 + }, + { + "epoch": 0.17723253461299623, + "grad_norm": 1.5690453052520752, + "learning_rate": 1.9955690233151804e-05, + "loss": 1.2661, + "step": 7599 + }, + { + "epoch": 0.17725585775217415, + "grad_norm": 1.7996708154678345, + "learning_rate": 1.9955666552447478e-05, + "loss": 1.2485, + "step": 7600 + }, + { + "epoch": 0.17727918089135208, + "grad_norm": 1.578605055809021, + "learning_rate": 1.995564286543099e-05, + "loss": 1.0789, + "step": 7601 + }, + { + "epoch": 0.17730250403053, + "grad_norm": 1.75475013256073, + "learning_rate": 1.995561917210237e-05, + "loss": 1.0795, + "step": 7602 + }, + { + "epoch": 0.1773258271697079, + "grad_norm": 1.8244965076446533, + "learning_rate": 1.9955595472461624e-05, + "loss": 1.1132, + "step": 7603 + }, + { + "epoch": 0.17734915030888582, + "grad_norm": 1.8258978128433228, + "learning_rate": 1.995557176650877e-05, + "loss": 1.563, + "step": 7604 + }, + { + "epoch": 0.17737247344806376, + "grad_norm": 1.833031177520752, + "learning_rate": 1.995554805424382e-05, + "loss": 1.5069, + "step": 7605 + }, + { + "epoch": 0.17739579658724167, + "grad_norm": 1.9621665477752686, + "learning_rate": 1.995552433566679e-05, + "loss": 1.3458, + "step": 7606 + }, + { + "epoch": 0.17741911972641958, + "grad_norm": 1.8530577421188354, + "learning_rate": 1.9955500610777703e-05, + "loss": 1.4873, + "step": 7607 + }, + { + "epoch": 0.1774424428655975, + "grad_norm": 2.2823164463043213, + "learning_rate": 1.995547687957656e-05, + "loss": 1.373, + "step": 7608 + }, + { + "epoch": 0.1774657660047754, + "grad_norm": 1.9768073558807373, + "learning_rate": 1.9955453142063387e-05, + "loss": 1.4895, + "step": 7609 + }, + { + "epoch": 0.17748908914395334, + "grad_norm": 1.8940035104751587, + "learning_rate": 1.9955429398238196e-05, + "loss": 1.491, + "step": 7610 + }, + { + "epoch": 0.17751241228313125, + "grad_norm": 2.055825710296631, + "learning_rate": 1.9955405648100997e-05, + "loss": 1.8061, + "step": 7611 + }, + { + "epoch": 0.17753573542230916, + "grad_norm": 1.8181272745132446, + "learning_rate": 1.9955381891651814e-05, + "loss": 1.0888, + "step": 7612 + }, + { + "epoch": 0.17755905856148707, + "grad_norm": 1.676640510559082, + "learning_rate": 1.9955358128890656e-05, + "loss": 1.3759, + "step": 7613 + }, + { + "epoch": 0.177582381700665, + "grad_norm": 1.8350028991699219, + "learning_rate": 1.995533435981754e-05, + "loss": 1.2514, + "step": 7614 + }, + { + "epoch": 0.17760570483984292, + "grad_norm": 2.285431146621704, + "learning_rate": 1.995531058443248e-05, + "loss": 1.6577, + "step": 7615 + }, + { + "epoch": 0.17762902797902083, + "grad_norm": 2.132215976715088, + "learning_rate": 1.995528680273549e-05, + "loss": 1.5325, + "step": 7616 + }, + { + "epoch": 0.17765235111819874, + "grad_norm": 1.9450451135635376, + "learning_rate": 1.995526301472659e-05, + "loss": 1.522, + "step": 7617 + }, + { + "epoch": 0.17767567425737668, + "grad_norm": 1.761906623840332, + "learning_rate": 1.995523922040579e-05, + "loss": 1.2768, + "step": 7618 + }, + { + "epoch": 0.1776989973965546, + "grad_norm": 1.8472840785980225, + "learning_rate": 1.995521541977311e-05, + "loss": 1.1036, + "step": 7619 + }, + { + "epoch": 0.1777223205357325, + "grad_norm": 1.7682737112045288, + "learning_rate": 1.9955191612828558e-05, + "loss": 1.512, + "step": 7620 + }, + { + "epoch": 0.17774564367491041, + "grad_norm": 1.4685750007629395, + "learning_rate": 1.995516779957215e-05, + "loss": 1.3279, + "step": 7621 + }, + { + "epoch": 0.17776896681408835, + "grad_norm": 1.703102707862854, + "learning_rate": 1.9955143980003912e-05, + "loss": 1.6442, + "step": 7622 + }, + { + "epoch": 0.17779228995326626, + "grad_norm": 2.3316774368286133, + "learning_rate": 1.9955120154123848e-05, + "loss": 1.2365, + "step": 7623 + }, + { + "epoch": 0.17781561309244417, + "grad_norm": 2.0295238494873047, + "learning_rate": 1.995509632193198e-05, + "loss": 1.3972, + "step": 7624 + }, + { + "epoch": 0.17783893623162209, + "grad_norm": 1.4795743227005005, + "learning_rate": 1.995507248342831e-05, + "loss": 1.0349, + "step": 7625 + }, + { + "epoch": 0.17786225937080002, + "grad_norm": 1.6969382762908936, + "learning_rate": 1.9955048638612873e-05, + "loss": 1.2816, + "step": 7626 + }, + { + "epoch": 0.17788558250997794, + "grad_norm": 1.660207748413086, + "learning_rate": 1.9955024787485667e-05, + "loss": 1.1243, + "step": 7627 + }, + { + "epoch": 0.17790890564915585, + "grad_norm": 1.9735254049301147, + "learning_rate": 1.995500093004672e-05, + "loss": 1.5368, + "step": 7628 + }, + { + "epoch": 0.17793222878833376, + "grad_norm": 2.5160105228424072, + "learning_rate": 1.9954977066296037e-05, + "loss": 1.3223, + "step": 7629 + }, + { + "epoch": 0.1779555519275117, + "grad_norm": 1.5423505306243896, + "learning_rate": 1.9954953196233637e-05, + "loss": 1.0097, + "step": 7630 + }, + { + "epoch": 0.1779788750666896, + "grad_norm": 1.8010445833206177, + "learning_rate": 1.9954929319859536e-05, + "loss": 1.4765, + "step": 7631 + }, + { + "epoch": 0.17800219820586752, + "grad_norm": 1.7566972970962524, + "learning_rate": 1.995490543717375e-05, + "loss": 1.4627, + "step": 7632 + }, + { + "epoch": 0.17802552134504543, + "grad_norm": 1.9390946626663208, + "learning_rate": 1.995488154817629e-05, + "loss": 1.0446, + "step": 7633 + }, + { + "epoch": 0.17804884448422337, + "grad_norm": 1.5867067575454712, + "learning_rate": 1.9954857652867178e-05, + "loss": 1.5296, + "step": 7634 + }, + { + "epoch": 0.17807216762340128, + "grad_norm": 1.9449414014816284, + "learning_rate": 1.9954833751246417e-05, + "loss": 1.5667, + "step": 7635 + }, + { + "epoch": 0.1780954907625792, + "grad_norm": 1.9395817518234253, + "learning_rate": 1.9954809843314038e-05, + "loss": 1.4323, + "step": 7636 + }, + { + "epoch": 0.1781188139017571, + "grad_norm": 1.8652347326278687, + "learning_rate": 1.9954785929070043e-05, + "loss": 1.4049, + "step": 7637 + }, + { + "epoch": 0.178142137040935, + "grad_norm": 1.7469221353530884, + "learning_rate": 1.9954762008514453e-05, + "loss": 1.1035, + "step": 7638 + }, + { + "epoch": 0.17816546018011295, + "grad_norm": 2.4852421283721924, + "learning_rate": 1.9954738081647286e-05, + "loss": 1.3271, + "step": 7639 + }, + { + "epoch": 0.17818878331929086, + "grad_norm": 1.9622604846954346, + "learning_rate": 1.9954714148468554e-05, + "loss": 1.3292, + "step": 7640 + }, + { + "epoch": 0.17821210645846877, + "grad_norm": 1.7717835903167725, + "learning_rate": 1.9954690208978268e-05, + "loss": 1.3661, + "step": 7641 + }, + { + "epoch": 0.17823542959764668, + "grad_norm": 1.8761320114135742, + "learning_rate": 1.995466626317645e-05, + "loss": 1.8629, + "step": 7642 + }, + { + "epoch": 0.17825875273682462, + "grad_norm": 2.09906268119812, + "learning_rate": 1.995464231106311e-05, + "loss": 1.4514, + "step": 7643 + }, + { + "epoch": 0.17828207587600253, + "grad_norm": 1.7465312480926514, + "learning_rate": 1.9954618352638267e-05, + "loss": 1.2813, + "step": 7644 + }, + { + "epoch": 0.17830539901518044, + "grad_norm": 1.7027121782302856, + "learning_rate": 1.9954594387901934e-05, + "loss": 1.0443, + "step": 7645 + }, + { + "epoch": 0.17832872215435835, + "grad_norm": 1.8701332807540894, + "learning_rate": 1.9954570416854127e-05, + "loss": 1.4999, + "step": 7646 + }, + { + "epoch": 0.1783520452935363, + "grad_norm": 1.5521444082260132, + "learning_rate": 1.995454643949486e-05, + "loss": 1.3537, + "step": 7647 + }, + { + "epoch": 0.1783753684327142, + "grad_norm": 1.7324097156524658, + "learning_rate": 1.995452245582415e-05, + "loss": 1.3853, + "step": 7648 + }, + { + "epoch": 0.17839869157189212, + "grad_norm": 2.01025128364563, + "learning_rate": 1.995449846584201e-05, + "loss": 1.7232, + "step": 7649 + }, + { + "epoch": 0.17842201471107003, + "grad_norm": 1.9316757917404175, + "learning_rate": 1.9954474469548458e-05, + "loss": 1.6255, + "step": 7650 + }, + { + "epoch": 0.17844533785024796, + "grad_norm": 1.5871044397354126, + "learning_rate": 1.995445046694351e-05, + "loss": 1.4814, + "step": 7651 + }, + { + "epoch": 0.17846866098942588, + "grad_norm": 1.7314268350601196, + "learning_rate": 1.9954426458027175e-05, + "loss": 1.4757, + "step": 7652 + }, + { + "epoch": 0.1784919841286038, + "grad_norm": 1.774599313735962, + "learning_rate": 1.9954402442799476e-05, + "loss": 1.6899, + "step": 7653 + }, + { + "epoch": 0.1785153072677817, + "grad_norm": 1.9956495761871338, + "learning_rate": 1.995437842126042e-05, + "loss": 1.3684, + "step": 7654 + }, + { + "epoch": 0.17853863040695964, + "grad_norm": 2.1670079231262207, + "learning_rate": 1.995435439341003e-05, + "loss": 1.5502, + "step": 7655 + }, + { + "epoch": 0.17856195354613755, + "grad_norm": 1.7335801124572754, + "learning_rate": 1.9954330359248318e-05, + "loss": 1.3009, + "step": 7656 + }, + { + "epoch": 0.17858527668531546, + "grad_norm": 1.4420621395111084, + "learning_rate": 1.99543063187753e-05, + "loss": 1.2506, + "step": 7657 + }, + { + "epoch": 0.17860859982449337, + "grad_norm": 1.7339067459106445, + "learning_rate": 1.9954282271990987e-05, + "loss": 1.4506, + "step": 7658 + }, + { + "epoch": 0.1786319229636713, + "grad_norm": 1.8854928016662598, + "learning_rate": 1.9954258218895402e-05, + "loss": 1.5568, + "step": 7659 + }, + { + "epoch": 0.17865524610284922, + "grad_norm": 1.734731674194336, + "learning_rate": 1.995423415948855e-05, + "loss": 1.2894, + "step": 7660 + }, + { + "epoch": 0.17867856924202713, + "grad_norm": 1.9439735412597656, + "learning_rate": 1.9954210093770457e-05, + "loss": 1.3408, + "step": 7661 + }, + { + "epoch": 0.17870189238120504, + "grad_norm": 1.8619352579116821, + "learning_rate": 1.9954186021741135e-05, + "loss": 1.2505, + "step": 7662 + }, + { + "epoch": 0.17872521552038298, + "grad_norm": 2.1091971397399902, + "learning_rate": 1.9954161943400593e-05, + "loss": 1.2589, + "step": 7663 + }, + { + "epoch": 0.1787485386595609, + "grad_norm": 1.8055461645126343, + "learning_rate": 1.9954137858748853e-05, + "loss": 1.3902, + "step": 7664 + }, + { + "epoch": 0.1787718617987388, + "grad_norm": 1.5422627925872803, + "learning_rate": 1.995411376778593e-05, + "loss": 1.4803, + "step": 7665 + }, + { + "epoch": 0.1787951849379167, + "grad_norm": 1.7025034427642822, + "learning_rate": 1.9954089670511836e-05, + "loss": 1.5973, + "step": 7666 + }, + { + "epoch": 0.17881850807709462, + "grad_norm": 4.555098056793213, + "learning_rate": 1.995406556692659e-05, + "loss": 1.3775, + "step": 7667 + }, + { + "epoch": 0.17884183121627256, + "grad_norm": 1.9661860466003418, + "learning_rate": 1.99540414570302e-05, + "loss": 1.4594, + "step": 7668 + }, + { + "epoch": 0.17886515435545047, + "grad_norm": 1.7500932216644287, + "learning_rate": 1.9954017340822692e-05, + "loss": 1.1972, + "step": 7669 + }, + { + "epoch": 0.17888847749462838, + "grad_norm": 2.186952829360962, + "learning_rate": 1.995399321830407e-05, + "loss": 1.3076, + "step": 7670 + }, + { + "epoch": 0.1789118006338063, + "grad_norm": 1.5743695497512817, + "learning_rate": 1.9953969089474358e-05, + "loss": 1.0066, + "step": 7671 + }, + { + "epoch": 0.17893512377298423, + "grad_norm": 2.0963456630706787, + "learning_rate": 1.995394495433357e-05, + "loss": 1.6933, + "step": 7672 + }, + { + "epoch": 0.17895844691216214, + "grad_norm": 1.927824854850769, + "learning_rate": 1.9953920812881717e-05, + "loss": 1.5635, + "step": 7673 + }, + { + "epoch": 0.17898177005134006, + "grad_norm": 1.9599376916885376, + "learning_rate": 1.9953896665118816e-05, + "loss": 1.3818, + "step": 7674 + }, + { + "epoch": 0.17900509319051797, + "grad_norm": 2.200408697128296, + "learning_rate": 1.9953872511044886e-05, + "loss": 1.367, + "step": 7675 + }, + { + "epoch": 0.1790284163296959, + "grad_norm": 2.042245388031006, + "learning_rate": 1.9953848350659937e-05, + "loss": 1.6858, + "step": 7676 + }, + { + "epoch": 0.17905173946887382, + "grad_norm": 2.0669524669647217, + "learning_rate": 1.9953824183963987e-05, + "loss": 1.6614, + "step": 7677 + }, + { + "epoch": 0.17907506260805173, + "grad_norm": 2.2936909198760986, + "learning_rate": 1.9953800010957054e-05, + "loss": 1.1258, + "step": 7678 + }, + { + "epoch": 0.17909838574722964, + "grad_norm": 1.9498659372329712, + "learning_rate": 1.9953775831639147e-05, + "loss": 1.5419, + "step": 7679 + }, + { + "epoch": 0.17912170888640758, + "grad_norm": 1.6897910833358765, + "learning_rate": 1.9953751646010285e-05, + "loss": 1.5197, + "step": 7680 + }, + { + "epoch": 0.1791450320255855, + "grad_norm": 1.6281800270080566, + "learning_rate": 1.995372745407049e-05, + "loss": 1.3633, + "step": 7681 + }, + { + "epoch": 0.1791683551647634, + "grad_norm": 1.791869044303894, + "learning_rate": 1.9953703255819762e-05, + "loss": 1.407, + "step": 7682 + }, + { + "epoch": 0.1791916783039413, + "grad_norm": 1.7550508975982666, + "learning_rate": 1.995367905125813e-05, + "loss": 1.3475, + "step": 7683 + }, + { + "epoch": 0.17921500144311925, + "grad_norm": 1.8093650341033936, + "learning_rate": 1.99536548403856e-05, + "loss": 1.3675, + "step": 7684 + }, + { + "epoch": 0.17923832458229716, + "grad_norm": 1.8545957803726196, + "learning_rate": 1.9953630623202194e-05, + "loss": 1.4556, + "step": 7685 + }, + { + "epoch": 0.17926164772147507, + "grad_norm": 1.828643560409546, + "learning_rate": 1.9953606399707924e-05, + "loss": 1.3361, + "step": 7686 + }, + { + "epoch": 0.17928497086065298, + "grad_norm": 2.1794252395629883, + "learning_rate": 1.9953582169902807e-05, + "loss": 1.6516, + "step": 7687 + }, + { + "epoch": 0.17930829399983092, + "grad_norm": 1.990898847579956, + "learning_rate": 1.9953557933786855e-05, + "loss": 1.4582, + "step": 7688 + }, + { + "epoch": 0.17933161713900883, + "grad_norm": 1.745801329612732, + "learning_rate": 1.995353369136009e-05, + "loss": 1.609, + "step": 7689 + }, + { + "epoch": 0.17935494027818674, + "grad_norm": 1.7669341564178467, + "learning_rate": 1.9953509442622524e-05, + "loss": 1.2883, + "step": 7690 + }, + { + "epoch": 0.17937826341736465, + "grad_norm": 1.6127214431762695, + "learning_rate": 1.9953485187574166e-05, + "loss": 1.7413, + "step": 7691 + }, + { + "epoch": 0.1794015865565426, + "grad_norm": 1.757543921470642, + "learning_rate": 1.995346092621504e-05, + "loss": 1.5064, + "step": 7692 + }, + { + "epoch": 0.1794249096957205, + "grad_norm": 1.8118480443954468, + "learning_rate": 1.995343665854516e-05, + "loss": 1.488, + "step": 7693 + }, + { + "epoch": 0.1794482328348984, + "grad_norm": 1.913851261138916, + "learning_rate": 1.995341238456454e-05, + "loss": 1.2413, + "step": 7694 + }, + { + "epoch": 0.17947155597407632, + "grad_norm": 1.9650887250900269, + "learning_rate": 1.9953388104273193e-05, + "loss": 1.1789, + "step": 7695 + }, + { + "epoch": 0.17949487911325424, + "grad_norm": 1.8011995553970337, + "learning_rate": 1.9953363817671136e-05, + "loss": 1.6274, + "step": 7696 + }, + { + "epoch": 0.17951820225243217, + "grad_norm": 2.012784481048584, + "learning_rate": 1.995333952475839e-05, + "loss": 1.3999, + "step": 7697 + }, + { + "epoch": 0.17954152539161009, + "grad_norm": 2.3164222240448, + "learning_rate": 1.995331522553496e-05, + "loss": 1.0031, + "step": 7698 + }, + { + "epoch": 0.179564848530788, + "grad_norm": 1.8640482425689697, + "learning_rate": 1.995329092000087e-05, + "loss": 1.4932, + "step": 7699 + }, + { + "epoch": 0.1795881716699659, + "grad_norm": 2.0448622703552246, + "learning_rate": 1.995326660815613e-05, + "loss": 1.2193, + "step": 7700 + }, + { + "epoch": 0.17961149480914385, + "grad_norm": 1.5516481399536133, + "learning_rate": 1.9953242290000756e-05, + "loss": 1.2949, + "step": 7701 + }, + { + "epoch": 0.17963481794832176, + "grad_norm": 1.8063503503799438, + "learning_rate": 1.9953217965534768e-05, + "loss": 1.1561, + "step": 7702 + }, + { + "epoch": 0.17965814108749967, + "grad_norm": 2.1006388664245605, + "learning_rate": 1.9953193634758178e-05, + "loss": 1.2707, + "step": 7703 + }, + { + "epoch": 0.17968146422667758, + "grad_norm": 1.677132487297058, + "learning_rate": 1.9953169297671002e-05, + "loss": 1.2189, + "step": 7704 + }, + { + "epoch": 0.17970478736585552, + "grad_norm": 1.7836931943893433, + "learning_rate": 1.9953144954273255e-05, + "loss": 1.3136, + "step": 7705 + }, + { + "epoch": 0.17972811050503343, + "grad_norm": 1.736484408378601, + "learning_rate": 1.9953120604564953e-05, + "loss": 1.2979, + "step": 7706 + }, + { + "epoch": 0.17975143364421134, + "grad_norm": 1.5010826587677002, + "learning_rate": 1.995309624854611e-05, + "loss": 1.4009, + "step": 7707 + }, + { + "epoch": 0.17977475678338925, + "grad_norm": 1.6489570140838623, + "learning_rate": 1.9953071886216747e-05, + "loss": 1.4107, + "step": 7708 + }, + { + "epoch": 0.1797980799225672, + "grad_norm": 1.5576456785202026, + "learning_rate": 1.9953047517576872e-05, + "loss": 1.5408, + "step": 7709 + }, + { + "epoch": 0.1798214030617451, + "grad_norm": 1.9113500118255615, + "learning_rate": 1.9953023142626504e-05, + "loss": 1.6715, + "step": 7710 + }, + { + "epoch": 0.179844726200923, + "grad_norm": 2.117830276489258, + "learning_rate": 1.995299876136566e-05, + "loss": 2.1567, + "step": 7711 + }, + { + "epoch": 0.17986804934010092, + "grad_norm": 2.0453438758850098, + "learning_rate": 1.9952974373794348e-05, + "loss": 1.6922, + "step": 7712 + }, + { + "epoch": 0.17989137247927886, + "grad_norm": 1.6134496927261353, + "learning_rate": 1.9952949979912593e-05, + "loss": 0.946, + "step": 7713 + }, + { + "epoch": 0.17991469561845677, + "grad_norm": 1.7333590984344482, + "learning_rate": 1.9952925579720406e-05, + "loss": 1.4392, + "step": 7714 + }, + { + "epoch": 0.17993801875763468, + "grad_norm": 1.9809272289276123, + "learning_rate": 1.9952901173217803e-05, + "loss": 1.8501, + "step": 7715 + }, + { + "epoch": 0.1799613418968126, + "grad_norm": 2.0089478492736816, + "learning_rate": 1.9952876760404798e-05, + "loss": 1.2344, + "step": 7716 + }, + { + "epoch": 0.17998466503599053, + "grad_norm": 2.1440131664276123, + "learning_rate": 1.995285234128141e-05, + "loss": 1.7311, + "step": 7717 + }, + { + "epoch": 0.18000798817516844, + "grad_norm": 1.9179160594940186, + "learning_rate": 1.995282791584765e-05, + "loss": 1.8116, + "step": 7718 + }, + { + "epoch": 0.18003131131434635, + "grad_norm": 1.9084237813949585, + "learning_rate": 1.995280348410354e-05, + "loss": 1.3408, + "step": 7719 + }, + { + "epoch": 0.18005463445352426, + "grad_norm": 2.0958609580993652, + "learning_rate": 1.9952779046049084e-05, + "loss": 1.3809, + "step": 7720 + }, + { + "epoch": 0.1800779575927022, + "grad_norm": 1.6694787740707397, + "learning_rate": 1.995275460168431e-05, + "loss": 1.2998, + "step": 7721 + }, + { + "epoch": 0.18010128073188011, + "grad_norm": 2.0068771839141846, + "learning_rate": 1.9952730151009225e-05, + "loss": 0.8224, + "step": 7722 + }, + { + "epoch": 0.18012460387105803, + "grad_norm": 2.424816846847534, + "learning_rate": 1.9952705694023852e-05, + "loss": 1.3996, + "step": 7723 + }, + { + "epoch": 0.18014792701023594, + "grad_norm": 1.9995489120483398, + "learning_rate": 1.9952681230728198e-05, + "loss": 1.4086, + "step": 7724 + }, + { + "epoch": 0.18017125014941385, + "grad_norm": 2.0435433387756348, + "learning_rate": 1.9952656761122288e-05, + "loss": 1.7112, + "step": 7725 + }, + { + "epoch": 0.1801945732885918, + "grad_norm": 1.9436379671096802, + "learning_rate": 1.9952632285206127e-05, + "loss": 1.4289, + "step": 7726 + }, + { + "epoch": 0.1802178964277697, + "grad_norm": 1.607019305229187, + "learning_rate": 1.995260780297974e-05, + "loss": 0.9455, + "step": 7727 + }, + { + "epoch": 0.1802412195669476, + "grad_norm": 1.7733348608016968, + "learning_rate": 1.9952583314443136e-05, + "loss": 1.2941, + "step": 7728 + }, + { + "epoch": 0.18026454270612552, + "grad_norm": 1.9602652788162231, + "learning_rate": 1.995255881959633e-05, + "loss": 1.5627, + "step": 7729 + }, + { + "epoch": 0.18028786584530346, + "grad_norm": 2.0037474632263184, + "learning_rate": 1.995253431843935e-05, + "loss": 1.3459, + "step": 7730 + }, + { + "epoch": 0.18031118898448137, + "grad_norm": 2.603443145751953, + "learning_rate": 1.995250981097219e-05, + "loss": 1.9365, + "step": 7731 + }, + { + "epoch": 0.18033451212365928, + "grad_norm": 1.5938153266906738, + "learning_rate": 1.9952485297194887e-05, + "loss": 1.3482, + "step": 7732 + }, + { + "epoch": 0.1803578352628372, + "grad_norm": 1.636825442314148, + "learning_rate": 1.995246077710744e-05, + "loss": 1.4048, + "step": 7733 + }, + { + "epoch": 0.18038115840201513, + "grad_norm": 1.674079179763794, + "learning_rate": 1.9952436250709878e-05, + "loss": 1.3529, + "step": 7734 + }, + { + "epoch": 0.18040448154119304, + "grad_norm": 1.9464101791381836, + "learning_rate": 1.9952411718002205e-05, + "loss": 1.4172, + "step": 7735 + }, + { + "epoch": 0.18042780468037095, + "grad_norm": 1.9305797815322876, + "learning_rate": 1.9952387178984445e-05, + "loss": 1.5855, + "step": 7736 + }, + { + "epoch": 0.18045112781954886, + "grad_norm": 1.7572907209396362, + "learning_rate": 1.9952362633656608e-05, + "loss": 1.0293, + "step": 7737 + }, + { + "epoch": 0.1804744509587268, + "grad_norm": 1.5664640665054321, + "learning_rate": 1.995233808201871e-05, + "loss": 1.1804, + "step": 7738 + }, + { + "epoch": 0.1804977740979047, + "grad_norm": 1.517714500427246, + "learning_rate": 1.9952313524070775e-05, + "loss": 1.1558, + "step": 7739 + }, + { + "epoch": 0.18052109723708262, + "grad_norm": 1.7013710737228394, + "learning_rate": 1.995228895981281e-05, + "loss": 1.4108, + "step": 7740 + }, + { + "epoch": 0.18054442037626053, + "grad_norm": 1.934568166732788, + "learning_rate": 1.995226438924483e-05, + "loss": 1.4829, + "step": 7741 + }, + { + "epoch": 0.18056774351543847, + "grad_norm": 1.794360876083374, + "learning_rate": 1.9952239812366852e-05, + "loss": 1.4404, + "step": 7742 + }, + { + "epoch": 0.18059106665461638, + "grad_norm": 1.7504444122314453, + "learning_rate": 1.9952215229178895e-05, + "loss": 1.6702, + "step": 7743 + }, + { + "epoch": 0.1806143897937943, + "grad_norm": 2.0151686668395996, + "learning_rate": 1.995219063968097e-05, + "loss": 1.3546, + "step": 7744 + }, + { + "epoch": 0.1806377129329722, + "grad_norm": 1.813712239265442, + "learning_rate": 1.9952166043873097e-05, + "loss": 1.5818, + "step": 7745 + }, + { + "epoch": 0.18066103607215014, + "grad_norm": 2.1814630031585693, + "learning_rate": 1.995214144175529e-05, + "loss": 1.3947, + "step": 7746 + }, + { + "epoch": 0.18068435921132805, + "grad_norm": 1.995795726776123, + "learning_rate": 1.9952116833327563e-05, + "loss": 1.5124, + "step": 7747 + }, + { + "epoch": 0.18070768235050597, + "grad_norm": 2.118664503097534, + "learning_rate": 1.995209221858993e-05, + "loss": 1.1901, + "step": 7748 + }, + { + "epoch": 0.18073100548968388, + "grad_norm": 1.9460968971252441, + "learning_rate": 1.9952067597542413e-05, + "loss": 1.4506, + "step": 7749 + }, + { + "epoch": 0.1807543286288618, + "grad_norm": 1.8565677404403687, + "learning_rate": 1.9952042970185022e-05, + "loss": 1.3443, + "step": 7750 + }, + { + "epoch": 0.18077765176803973, + "grad_norm": 2.223280668258667, + "learning_rate": 1.9952018336517774e-05, + "loss": 1.2614, + "step": 7751 + }, + { + "epoch": 0.18080097490721764, + "grad_norm": 1.7500272989273071, + "learning_rate": 1.9951993696540687e-05, + "loss": 1.5245, + "step": 7752 + }, + { + "epoch": 0.18082429804639555, + "grad_norm": 1.6643186807632446, + "learning_rate": 1.9951969050253774e-05, + "loss": 1.268, + "step": 7753 + }, + { + "epoch": 0.18084762118557346, + "grad_norm": 1.9510525465011597, + "learning_rate": 1.995194439765705e-05, + "loss": 1.2547, + "step": 7754 + }, + { + "epoch": 0.1808709443247514, + "grad_norm": 1.9294110536575317, + "learning_rate": 1.9951919738750535e-05, + "loss": 1.4543, + "step": 7755 + }, + { + "epoch": 0.1808942674639293, + "grad_norm": 2.3303568363189697, + "learning_rate": 1.995189507353424e-05, + "loss": 1.6689, + "step": 7756 + }, + { + "epoch": 0.18091759060310722, + "grad_norm": 1.8439604043960571, + "learning_rate": 1.9951870402008182e-05, + "loss": 1.3558, + "step": 7757 + }, + { + "epoch": 0.18094091374228513, + "grad_norm": 1.987000584602356, + "learning_rate": 1.9951845724172378e-05, + "loss": 1.5797, + "step": 7758 + }, + { + "epoch": 0.18096423688146307, + "grad_norm": 1.9180837869644165, + "learning_rate": 1.9951821040026842e-05, + "loss": 1.3592, + "step": 7759 + }, + { + "epoch": 0.18098756002064098, + "grad_norm": 2.3658576011657715, + "learning_rate": 1.995179634957159e-05, + "loss": 1.4963, + "step": 7760 + }, + { + "epoch": 0.1810108831598189, + "grad_norm": 1.8923370838165283, + "learning_rate": 1.9951771652806634e-05, + "loss": 1.2647, + "step": 7761 + }, + { + "epoch": 0.1810342062989968, + "grad_norm": 3.9141342639923096, + "learning_rate": 1.9951746949732e-05, + "loss": 1.2409, + "step": 7762 + }, + { + "epoch": 0.18105752943817474, + "grad_norm": 1.7689353227615356, + "learning_rate": 1.9951722240347694e-05, + "loss": 1.226, + "step": 7763 + }, + { + "epoch": 0.18108085257735265, + "grad_norm": 1.891399621963501, + "learning_rate": 1.9951697524653735e-05, + "loss": 1.3668, + "step": 7764 + }, + { + "epoch": 0.18110417571653056, + "grad_norm": 2.060267448425293, + "learning_rate": 1.9951672802650136e-05, + "loss": 1.4403, + "step": 7765 + }, + { + "epoch": 0.18112749885570847, + "grad_norm": 2.0051429271698, + "learning_rate": 1.995164807433692e-05, + "loss": 1.3455, + "step": 7766 + }, + { + "epoch": 0.1811508219948864, + "grad_norm": 2.3693885803222656, + "learning_rate": 1.9951623339714093e-05, + "loss": 1.1971, + "step": 7767 + }, + { + "epoch": 0.18117414513406432, + "grad_norm": 1.4533361196517944, + "learning_rate": 1.995159859878168e-05, + "loss": 1.177, + "step": 7768 + }, + { + "epoch": 0.18119746827324223, + "grad_norm": 1.8532360792160034, + "learning_rate": 1.995157385153969e-05, + "loss": 1.4304, + "step": 7769 + }, + { + "epoch": 0.18122079141242015, + "grad_norm": 1.8564075231552124, + "learning_rate": 1.995154909798814e-05, + "loss": 1.7541, + "step": 7770 + }, + { + "epoch": 0.18124411455159808, + "grad_norm": 1.4625389575958252, + "learning_rate": 1.995152433812705e-05, + "loss": 1.4585, + "step": 7771 + }, + { + "epoch": 0.181267437690776, + "grad_norm": 1.7740625143051147, + "learning_rate": 1.995149957195643e-05, + "loss": 1.8495, + "step": 7772 + }, + { + "epoch": 0.1812907608299539, + "grad_norm": 2.132014751434326, + "learning_rate": 1.9951474799476295e-05, + "loss": 1.2292, + "step": 7773 + }, + { + "epoch": 0.18131408396913182, + "grad_norm": 1.9407774209976196, + "learning_rate": 1.995145002068667e-05, + "loss": 1.5272, + "step": 7774 + }, + { + "epoch": 0.18133740710830976, + "grad_norm": 2.316868543624878, + "learning_rate": 1.995142523558756e-05, + "loss": 1.5848, + "step": 7775 + }, + { + "epoch": 0.18136073024748767, + "grad_norm": 2.0947868824005127, + "learning_rate": 1.9951400444178985e-05, + "loss": 1.3105, + "step": 7776 + }, + { + "epoch": 0.18138405338666558, + "grad_norm": 1.5654137134552002, + "learning_rate": 1.9951375646460965e-05, + "loss": 0.9631, + "step": 7777 + }, + { + "epoch": 0.1814073765258435, + "grad_norm": 1.708593726158142, + "learning_rate": 1.9951350842433508e-05, + "loss": 1.0616, + "step": 7778 + }, + { + "epoch": 0.1814306996650214, + "grad_norm": 1.8287885189056396, + "learning_rate": 1.9951326032096634e-05, + "loss": 1.4118, + "step": 7779 + }, + { + "epoch": 0.18145402280419934, + "grad_norm": 1.764153003692627, + "learning_rate": 1.9951301215450358e-05, + "loss": 1.3811, + "step": 7780 + }, + { + "epoch": 0.18147734594337725, + "grad_norm": 1.9842628240585327, + "learning_rate": 1.9951276392494696e-05, + "loss": 1.4299, + "step": 7781 + }, + { + "epoch": 0.18150066908255516, + "grad_norm": 2.128582000732422, + "learning_rate": 1.995125156322966e-05, + "loss": 1.2794, + "step": 7782 + }, + { + "epoch": 0.18152399222173307, + "grad_norm": 2.1670703887939453, + "learning_rate": 1.995122672765528e-05, + "loss": 1.3001, + "step": 7783 + }, + { + "epoch": 0.181547315360911, + "grad_norm": 1.7968361377716064, + "learning_rate": 1.995120188577155e-05, + "loss": 1.1823, + "step": 7784 + }, + { + "epoch": 0.18157063850008892, + "grad_norm": 1.8632546663284302, + "learning_rate": 1.9951177037578498e-05, + "loss": 1.3404, + "step": 7785 + }, + { + "epoch": 0.18159396163926683, + "grad_norm": 1.6482899188995361, + "learning_rate": 1.9951152183076144e-05, + "loss": 1.1589, + "step": 7786 + }, + { + "epoch": 0.18161728477844474, + "grad_norm": 2.170668601989746, + "learning_rate": 1.9951127322264493e-05, + "loss": 1.4277, + "step": 7787 + }, + { + "epoch": 0.18164060791762268, + "grad_norm": 1.5821505784988403, + "learning_rate": 1.995110245514357e-05, + "loss": 1.4434, + "step": 7788 + }, + { + "epoch": 0.1816639310568006, + "grad_norm": 1.6057379245758057, + "learning_rate": 1.995107758171338e-05, + "loss": 1.4562, + "step": 7789 + }, + { + "epoch": 0.1816872541959785, + "grad_norm": 1.5650666952133179, + "learning_rate": 1.9951052701973955e-05, + "loss": 1.2628, + "step": 7790 + }, + { + "epoch": 0.18171057733515641, + "grad_norm": 1.578609824180603, + "learning_rate": 1.9951027815925296e-05, + "loss": 1.5376, + "step": 7791 + }, + { + "epoch": 0.18173390047433435, + "grad_norm": 1.7028483152389526, + "learning_rate": 1.9951002923567424e-05, + "loss": 1.3087, + "step": 7792 + }, + { + "epoch": 0.18175722361351226, + "grad_norm": 1.7956820726394653, + "learning_rate": 1.995097802490036e-05, + "loss": 1.3792, + "step": 7793 + }, + { + "epoch": 0.18178054675269018, + "grad_norm": 2.034055233001709, + "learning_rate": 1.9950953119924108e-05, + "loss": 1.8915, + "step": 7794 + }, + { + "epoch": 0.1818038698918681, + "grad_norm": 1.92952561378479, + "learning_rate": 1.995092820863869e-05, + "loss": 1.4018, + "step": 7795 + }, + { + "epoch": 0.18182719303104602, + "grad_norm": 1.6291195154190063, + "learning_rate": 1.995090329104413e-05, + "loss": 1.3854, + "step": 7796 + }, + { + "epoch": 0.18185051617022394, + "grad_norm": 1.756932258605957, + "learning_rate": 1.995087836714043e-05, + "loss": 1.4034, + "step": 7797 + }, + { + "epoch": 0.18187383930940185, + "grad_norm": 1.7828006744384766, + "learning_rate": 1.9950853436927615e-05, + "loss": 1.4662, + "step": 7798 + }, + { + "epoch": 0.18189716244857976, + "grad_norm": 1.8222310543060303, + "learning_rate": 1.995082850040569e-05, + "loss": 1.4408, + "step": 7799 + }, + { + "epoch": 0.1819204855877577, + "grad_norm": 1.5640708208084106, + "learning_rate": 1.9950803557574688e-05, + "loss": 1.1088, + "step": 7800 + }, + { + "epoch": 0.1819438087269356, + "grad_norm": 1.5882632732391357, + "learning_rate": 1.995077860843461e-05, + "loss": 1.2513, + "step": 7801 + }, + { + "epoch": 0.18196713186611352, + "grad_norm": 1.6316782236099243, + "learning_rate": 1.995075365298548e-05, + "loss": 1.1751, + "step": 7802 + }, + { + "epoch": 0.18199045500529143, + "grad_norm": 1.8012183904647827, + "learning_rate": 1.995072869122731e-05, + "loss": 1.1257, + "step": 7803 + }, + { + "epoch": 0.18201377814446937, + "grad_norm": 2.3012046813964844, + "learning_rate": 1.9950703723160115e-05, + "loss": 1.6665, + "step": 7804 + }, + { + "epoch": 0.18203710128364728, + "grad_norm": 1.6385375261306763, + "learning_rate": 1.9950678748783917e-05, + "loss": 1.6655, + "step": 7805 + }, + { + "epoch": 0.1820604244228252, + "grad_norm": 1.723596453666687, + "learning_rate": 1.9950653768098723e-05, + "loss": 1.0, + "step": 7806 + }, + { + "epoch": 0.1820837475620031, + "grad_norm": 1.857401967048645, + "learning_rate": 1.9950628781104556e-05, + "loss": 1.4103, + "step": 7807 + }, + { + "epoch": 0.182107070701181, + "grad_norm": 1.8278862237930298, + "learning_rate": 1.9950603787801427e-05, + "loss": 1.376, + "step": 7808 + }, + { + "epoch": 0.18213039384035895, + "grad_norm": 1.9029324054718018, + "learning_rate": 1.9950578788189353e-05, + "loss": 1.4689, + "step": 7809 + }, + { + "epoch": 0.18215371697953686, + "grad_norm": 1.9172009229660034, + "learning_rate": 1.9950553782268354e-05, + "loss": 1.249, + "step": 7810 + }, + { + "epoch": 0.18217704011871477, + "grad_norm": 1.5534167289733887, + "learning_rate": 1.9950528770038443e-05, + "loss": 1.2669, + "step": 7811 + }, + { + "epoch": 0.18220036325789268, + "grad_norm": 1.7160600423812866, + "learning_rate": 1.995050375149963e-05, + "loss": 1.4793, + "step": 7812 + }, + { + "epoch": 0.18222368639707062, + "grad_norm": 2.357579469680786, + "learning_rate": 1.9950478726651942e-05, + "loss": 1.5548, + "step": 7813 + }, + { + "epoch": 0.18224700953624853, + "grad_norm": 2.302377223968506, + "learning_rate": 1.9950453695495388e-05, + "loss": 0.9838, + "step": 7814 + }, + { + "epoch": 0.18227033267542644, + "grad_norm": 1.7904486656188965, + "learning_rate": 1.9950428658029984e-05, + "loss": 1.4548, + "step": 7815 + }, + { + "epoch": 0.18229365581460436, + "grad_norm": 1.4344203472137451, + "learning_rate": 1.995040361425575e-05, + "loss": 1.2973, + "step": 7816 + }, + { + "epoch": 0.1823169789537823, + "grad_norm": 1.6197093725204468, + "learning_rate": 1.9950378564172696e-05, + "loss": 1.0221, + "step": 7817 + }, + { + "epoch": 0.1823403020929602, + "grad_norm": 1.8861066102981567, + "learning_rate": 1.995035350778084e-05, + "loss": 1.3761, + "step": 7818 + }, + { + "epoch": 0.18236362523213812, + "grad_norm": 1.7642205953598022, + "learning_rate": 1.99503284450802e-05, + "loss": 1.6522, + "step": 7819 + }, + { + "epoch": 0.18238694837131603, + "grad_norm": 1.8410332202911377, + "learning_rate": 1.9950303376070786e-05, + "loss": 1.2023, + "step": 7820 + }, + { + "epoch": 0.18241027151049397, + "grad_norm": 1.6206011772155762, + "learning_rate": 1.9950278300752625e-05, + "loss": 1.3395, + "step": 7821 + }, + { + "epoch": 0.18243359464967188, + "grad_norm": 2.164038896560669, + "learning_rate": 1.9950253219125725e-05, + "loss": 1.5496, + "step": 7822 + }, + { + "epoch": 0.1824569177888498, + "grad_norm": 1.8630125522613525, + "learning_rate": 1.99502281311901e-05, + "loss": 1.4755, + "step": 7823 + }, + { + "epoch": 0.1824802409280277, + "grad_norm": 1.6058955192565918, + "learning_rate": 1.9950203036945774e-05, + "loss": 1.4143, + "step": 7824 + }, + { + "epoch": 0.18250356406720564, + "grad_norm": 1.9605003595352173, + "learning_rate": 1.9950177936392756e-05, + "loss": 1.196, + "step": 7825 + }, + { + "epoch": 0.18252688720638355, + "grad_norm": 1.7179044485092163, + "learning_rate": 1.995015282953106e-05, + "loss": 1.3728, + "step": 7826 + }, + { + "epoch": 0.18255021034556146, + "grad_norm": 2.0115163326263428, + "learning_rate": 1.995012771636071e-05, + "loss": 1.465, + "step": 7827 + }, + { + "epoch": 0.18257353348473937, + "grad_norm": 1.9200307130813599, + "learning_rate": 1.995010259688172e-05, + "loss": 1.1474, + "step": 7828 + }, + { + "epoch": 0.1825968566239173, + "grad_norm": 1.5416468381881714, + "learning_rate": 1.99500774710941e-05, + "loss": 1.5158, + "step": 7829 + }, + { + "epoch": 0.18262017976309522, + "grad_norm": 1.5020161867141724, + "learning_rate": 1.9950052338997872e-05, + "loss": 1.1586, + "step": 7830 + }, + { + "epoch": 0.18264350290227313, + "grad_norm": 1.6229578256607056, + "learning_rate": 1.9950027200593046e-05, + "loss": 1.5399, + "step": 7831 + }, + { + "epoch": 0.18266682604145104, + "grad_norm": 1.5579558610916138, + "learning_rate": 1.9950002055879644e-05, + "loss": 1.4313, + "step": 7832 + }, + { + "epoch": 0.18269014918062898, + "grad_norm": 2.0480518341064453, + "learning_rate": 1.994997690485768e-05, + "loss": 1.1419, + "step": 7833 + }, + { + "epoch": 0.1827134723198069, + "grad_norm": 1.9527422189712524, + "learning_rate": 1.994995174752717e-05, + "loss": 1.6549, + "step": 7834 + }, + { + "epoch": 0.1827367954589848, + "grad_norm": 1.853236198425293, + "learning_rate": 1.9949926583888123e-05, + "loss": 1.3254, + "step": 7835 + }, + { + "epoch": 0.1827601185981627, + "grad_norm": 1.632718801498413, + "learning_rate": 1.9949901413940566e-05, + "loss": 1.3404, + "step": 7836 + }, + { + "epoch": 0.18278344173734062, + "grad_norm": 1.8052852153778076, + "learning_rate": 1.9949876237684514e-05, + "loss": 1.5772, + "step": 7837 + }, + { + "epoch": 0.18280676487651856, + "grad_norm": 1.8948980569839478, + "learning_rate": 1.9949851055119974e-05, + "loss": 1.4877, + "step": 7838 + }, + { + "epoch": 0.18283008801569647, + "grad_norm": 1.9826672077178955, + "learning_rate": 1.9949825866246966e-05, + "loss": 1.2956, + "step": 7839 + }, + { + "epoch": 0.18285341115487438, + "grad_norm": 2.2389214038848877, + "learning_rate": 1.9949800671065515e-05, + "loss": 1.5039, + "step": 7840 + }, + { + "epoch": 0.1828767342940523, + "grad_norm": 1.8382562398910522, + "learning_rate": 1.994977546957562e-05, + "loss": 1.4504, + "step": 7841 + }, + { + "epoch": 0.18290005743323023, + "grad_norm": 1.8261767625808716, + "learning_rate": 1.9949750261777314e-05, + "loss": 1.4329, + "step": 7842 + }, + { + "epoch": 0.18292338057240815, + "grad_norm": 2.0445351600646973, + "learning_rate": 1.99497250476706e-05, + "loss": 1.4831, + "step": 7843 + }, + { + "epoch": 0.18294670371158606, + "grad_norm": 1.9984127283096313, + "learning_rate": 1.99496998272555e-05, + "loss": 1.7045, + "step": 7844 + }, + { + "epoch": 0.18297002685076397, + "grad_norm": 1.9160175323486328, + "learning_rate": 1.9949674600532033e-05, + "loss": 1.0752, + "step": 7845 + }, + { + "epoch": 0.1829933499899419, + "grad_norm": 1.86298406124115, + "learning_rate": 1.9949649367500203e-05, + "loss": 1.5798, + "step": 7846 + }, + { + "epoch": 0.18301667312911982, + "grad_norm": 1.5580376386642456, + "learning_rate": 1.994962412816004e-05, + "loss": 1.6269, + "step": 7847 + }, + { + "epoch": 0.18303999626829773, + "grad_norm": 1.9273715019226074, + "learning_rate": 1.9949598882511553e-05, + "loss": 1.3724, + "step": 7848 + }, + { + "epoch": 0.18306331940747564, + "grad_norm": 1.7421609163284302, + "learning_rate": 1.994957363055476e-05, + "loss": 1.3843, + "step": 7849 + }, + { + "epoch": 0.18308664254665358, + "grad_norm": 2.112464666366577, + "learning_rate": 1.9949548372289677e-05, + "loss": 1.2381, + "step": 7850 + }, + { + "epoch": 0.1831099656858315, + "grad_norm": 1.757556676864624, + "learning_rate": 1.9949523107716315e-05, + "loss": 1.3811, + "step": 7851 + }, + { + "epoch": 0.1831332888250094, + "grad_norm": 2.1633970737457275, + "learning_rate": 1.9949497836834695e-05, + "loss": 1.3936, + "step": 7852 + }, + { + "epoch": 0.1831566119641873, + "grad_norm": 1.8878066539764404, + "learning_rate": 1.9949472559644832e-05, + "loss": 1.7053, + "step": 7853 + }, + { + "epoch": 0.18317993510336525, + "grad_norm": 2.2800045013427734, + "learning_rate": 1.994944727614675e-05, + "loss": 1.331, + "step": 7854 + }, + { + "epoch": 0.18320325824254316, + "grad_norm": 2.029324769973755, + "learning_rate": 1.9949421986340448e-05, + "loss": 1.2419, + "step": 7855 + }, + { + "epoch": 0.18322658138172107, + "grad_norm": 1.7914031744003296, + "learning_rate": 1.9949396690225954e-05, + "loss": 1.406, + "step": 7856 + }, + { + "epoch": 0.18324990452089898, + "grad_norm": 1.7080191373825073, + "learning_rate": 1.994937138780328e-05, + "loss": 1.4495, + "step": 7857 + }, + { + "epoch": 0.18327322766007692, + "grad_norm": 1.8241506814956665, + "learning_rate": 1.9949346079072445e-05, + "loss": 1.5079, + "step": 7858 + }, + { + "epoch": 0.18329655079925483, + "grad_norm": 1.4758386611938477, + "learning_rate": 1.9949320764033463e-05, + "loss": 1.1653, + "step": 7859 + }, + { + "epoch": 0.18331987393843274, + "grad_norm": 1.950578212738037, + "learning_rate": 1.9949295442686352e-05, + "loss": 1.4683, + "step": 7860 + }, + { + "epoch": 0.18334319707761065, + "grad_norm": 1.9383132457733154, + "learning_rate": 1.9949270115031127e-05, + "loss": 1.396, + "step": 7861 + }, + { + "epoch": 0.1833665202167886, + "grad_norm": 2.133117198944092, + "learning_rate": 1.99492447810678e-05, + "loss": 1.1046, + "step": 7862 + }, + { + "epoch": 0.1833898433559665, + "grad_norm": 1.4439677000045776, + "learning_rate": 1.9949219440796394e-05, + "loss": 1.2416, + "step": 7863 + }, + { + "epoch": 0.18341316649514441, + "grad_norm": 1.7757548093795776, + "learning_rate": 1.9949194094216917e-05, + "loss": 1.3335, + "step": 7864 + }, + { + "epoch": 0.18343648963432233, + "grad_norm": 2.072064161300659, + "learning_rate": 1.9949168741329393e-05, + "loss": 1.6219, + "step": 7865 + }, + { + "epoch": 0.18345981277350024, + "grad_norm": 1.83402681350708, + "learning_rate": 1.9949143382133835e-05, + "loss": 1.3725, + "step": 7866 + }, + { + "epoch": 0.18348313591267817, + "grad_norm": 1.798584222793579, + "learning_rate": 1.9949118016630256e-05, + "loss": 1.2537, + "step": 7867 + }, + { + "epoch": 0.18350645905185609, + "grad_norm": 1.7333439588546753, + "learning_rate": 1.9949092644818678e-05, + "loss": 1.6023, + "step": 7868 + }, + { + "epoch": 0.183529782191034, + "grad_norm": 1.559775948524475, + "learning_rate": 1.9949067266699113e-05, + "loss": 1.6489, + "step": 7869 + }, + { + "epoch": 0.1835531053302119, + "grad_norm": 1.6801786422729492, + "learning_rate": 1.994904188227158e-05, + "loss": 1.0308, + "step": 7870 + }, + { + "epoch": 0.18357642846938985, + "grad_norm": 2.0101044178009033, + "learning_rate": 1.994901649153609e-05, + "loss": 1.8475, + "step": 7871 + }, + { + "epoch": 0.18359975160856776, + "grad_norm": 1.735427975654602, + "learning_rate": 1.9948991094492665e-05, + "loss": 1.4016, + "step": 7872 + }, + { + "epoch": 0.18362307474774567, + "grad_norm": 2.236271381378174, + "learning_rate": 1.9948965691141317e-05, + "loss": 1.8247, + "step": 7873 + }, + { + "epoch": 0.18364639788692358, + "grad_norm": 1.6231406927108765, + "learning_rate": 1.9948940281482066e-05, + "loss": 1.2666, + "step": 7874 + }, + { + "epoch": 0.18366972102610152, + "grad_norm": 1.5862990617752075, + "learning_rate": 1.994891486551492e-05, + "loss": 1.0966, + "step": 7875 + }, + { + "epoch": 0.18369304416527943, + "grad_norm": 1.5115714073181152, + "learning_rate": 1.9948889443239908e-05, + "loss": 1.3332, + "step": 7876 + }, + { + "epoch": 0.18371636730445734, + "grad_norm": 1.5919275283813477, + "learning_rate": 1.994886401465703e-05, + "loss": 1.4316, + "step": 7877 + }, + { + "epoch": 0.18373969044363525, + "grad_norm": 1.4632034301757812, + "learning_rate": 1.994883857976632e-05, + "loss": 1.0894, + "step": 7878 + }, + { + "epoch": 0.1837630135828132, + "grad_norm": 2.1663873195648193, + "learning_rate": 1.994881313856778e-05, + "loss": 1.6187, + "step": 7879 + }, + { + "epoch": 0.1837863367219911, + "grad_norm": 1.6479003429412842, + "learning_rate": 1.9948787691061434e-05, + "loss": 1.2336, + "step": 7880 + }, + { + "epoch": 0.183809659861169, + "grad_norm": 2.011998414993286, + "learning_rate": 1.9948762237247295e-05, + "loss": 1.9481, + "step": 7881 + }, + { + "epoch": 0.18383298300034692, + "grad_norm": 1.9634594917297363, + "learning_rate": 1.994873677712538e-05, + "loss": 1.7112, + "step": 7882 + }, + { + "epoch": 0.18385630613952486, + "grad_norm": 1.568006157875061, + "learning_rate": 1.9948711310695704e-05, + "loss": 1.4836, + "step": 7883 + }, + { + "epoch": 0.18387962927870277, + "grad_norm": 1.7757046222686768, + "learning_rate": 1.994868583795828e-05, + "loss": 1.5037, + "step": 7884 + }, + { + "epoch": 0.18390295241788068, + "grad_norm": 1.5361061096191406, + "learning_rate": 1.9948660358913133e-05, + "loss": 1.5535, + "step": 7885 + }, + { + "epoch": 0.1839262755570586, + "grad_norm": 1.9387253522872925, + "learning_rate": 1.994863487356027e-05, + "loss": 1.6374, + "step": 7886 + }, + { + "epoch": 0.18394959869623653, + "grad_norm": 1.7119598388671875, + "learning_rate": 1.9948609381899714e-05, + "loss": 1.3282, + "step": 7887 + }, + { + "epoch": 0.18397292183541444, + "grad_norm": 1.6209826469421387, + "learning_rate": 1.994858388393148e-05, + "loss": 1.433, + "step": 7888 + }, + { + "epoch": 0.18399624497459235, + "grad_norm": 2.0493063926696777, + "learning_rate": 1.994855837965558e-05, + "loss": 1.465, + "step": 7889 + }, + { + "epoch": 0.18401956811377027, + "grad_norm": 1.7230323553085327, + "learning_rate": 1.9948532869072034e-05, + "loss": 1.3467, + "step": 7890 + }, + { + "epoch": 0.1840428912529482, + "grad_norm": 1.8008559942245483, + "learning_rate": 1.9948507352180856e-05, + "loss": 1.3258, + "step": 7891 + }, + { + "epoch": 0.18406621439212612, + "grad_norm": 2.363764762878418, + "learning_rate": 1.9948481828982064e-05, + "loss": 1.6337, + "step": 7892 + }, + { + "epoch": 0.18408953753130403, + "grad_norm": 1.7654318809509277, + "learning_rate": 1.9948456299475675e-05, + "loss": 1.5968, + "step": 7893 + }, + { + "epoch": 0.18411286067048194, + "grad_norm": 1.8745006322860718, + "learning_rate": 1.99484307636617e-05, + "loss": 1.0991, + "step": 7894 + }, + { + "epoch": 0.18413618380965985, + "grad_norm": 1.5488570928573608, + "learning_rate": 1.994840522154016e-05, + "loss": 1.0296, + "step": 7895 + }, + { + "epoch": 0.1841595069488378, + "grad_norm": 4.654823303222656, + "learning_rate": 1.994837967311107e-05, + "loss": 1.8138, + "step": 7896 + }, + { + "epoch": 0.1841828300880157, + "grad_norm": 2.2157368659973145, + "learning_rate": 1.9948354118374445e-05, + "loss": 1.1345, + "step": 7897 + }, + { + "epoch": 0.1842061532271936, + "grad_norm": 2.1762731075286865, + "learning_rate": 1.9948328557330305e-05, + "loss": 1.4445, + "step": 7898 + }, + { + "epoch": 0.18422947636637152, + "grad_norm": 1.9618299007415771, + "learning_rate": 1.994830298997866e-05, + "loss": 1.147, + "step": 7899 + }, + { + "epoch": 0.18425279950554946, + "grad_norm": 2.0561485290527344, + "learning_rate": 1.9948277416319533e-05, + "loss": 2.022, + "step": 7900 + }, + { + "epoch": 0.18427612264472737, + "grad_norm": 1.801593542098999, + "learning_rate": 1.9948251836352935e-05, + "loss": 1.334, + "step": 7901 + }, + { + "epoch": 0.18429944578390528, + "grad_norm": 2.034825563430786, + "learning_rate": 1.9948226250078886e-05, + "loss": 1.5302, + "step": 7902 + }, + { + "epoch": 0.1843227689230832, + "grad_norm": 1.9712649583816528, + "learning_rate": 1.99482006574974e-05, + "loss": 1.6461, + "step": 7903 + }, + { + "epoch": 0.18434609206226113, + "grad_norm": 1.6814563274383545, + "learning_rate": 1.994817505860849e-05, + "loss": 1.2784, + "step": 7904 + }, + { + "epoch": 0.18436941520143904, + "grad_norm": 1.9082130193710327, + "learning_rate": 1.9948149453412183e-05, + "loss": 1.2921, + "step": 7905 + }, + { + "epoch": 0.18439273834061695, + "grad_norm": 1.7603179216384888, + "learning_rate": 1.9948123841908482e-05, + "loss": 1.2773, + "step": 7906 + }, + { + "epoch": 0.18441606147979486, + "grad_norm": 2.4115519523620605, + "learning_rate": 1.994809822409741e-05, + "loss": 1.4807, + "step": 7907 + }, + { + "epoch": 0.1844393846189728, + "grad_norm": 1.894085168838501, + "learning_rate": 1.9948072599978986e-05, + "loss": 1.4898, + "step": 7908 + }, + { + "epoch": 0.1844627077581507, + "grad_norm": 1.9746428728103638, + "learning_rate": 1.994804696955322e-05, + "loss": 1.4724, + "step": 7909 + }, + { + "epoch": 0.18448603089732862, + "grad_norm": 1.759278655052185, + "learning_rate": 1.994802133282013e-05, + "loss": 1.4526, + "step": 7910 + }, + { + "epoch": 0.18450935403650653, + "grad_norm": 1.6658263206481934, + "learning_rate": 1.9947995689779735e-05, + "loss": 1.4929, + "step": 7911 + }, + { + "epoch": 0.18453267717568447, + "grad_norm": 2.0002615451812744, + "learning_rate": 1.994797004043205e-05, + "loss": 1.2647, + "step": 7912 + }, + { + "epoch": 0.18455600031486238, + "grad_norm": 1.4965345859527588, + "learning_rate": 1.994794438477709e-05, + "loss": 1.3285, + "step": 7913 + }, + { + "epoch": 0.1845793234540403, + "grad_norm": 1.4657378196716309, + "learning_rate": 1.9947918722814873e-05, + "loss": 1.2983, + "step": 7914 + }, + { + "epoch": 0.1846026465932182, + "grad_norm": 2.1683332920074463, + "learning_rate": 1.9947893054545412e-05, + "loss": 1.7678, + "step": 7915 + }, + { + "epoch": 0.18462596973239614, + "grad_norm": 1.7204591035842896, + "learning_rate": 1.9947867379968727e-05, + "loss": 1.0738, + "step": 7916 + }, + { + "epoch": 0.18464929287157406, + "grad_norm": 1.9574320316314697, + "learning_rate": 1.9947841699084834e-05, + "loss": 1.4893, + "step": 7917 + }, + { + "epoch": 0.18467261601075197, + "grad_norm": 1.807695746421814, + "learning_rate": 1.9947816011893745e-05, + "loss": 1.255, + "step": 7918 + }, + { + "epoch": 0.18469593914992988, + "grad_norm": 2.0153746604919434, + "learning_rate": 1.9947790318395482e-05, + "loss": 1.4044, + "step": 7919 + }, + { + "epoch": 0.18471926228910782, + "grad_norm": 1.8730332851409912, + "learning_rate": 1.994776461859006e-05, + "loss": 1.2058, + "step": 7920 + }, + { + "epoch": 0.18474258542828573, + "grad_norm": 2.0137884616851807, + "learning_rate": 1.9947738912477494e-05, + "loss": 1.7921, + "step": 7921 + }, + { + "epoch": 0.18476590856746364, + "grad_norm": 1.6705057621002197, + "learning_rate": 1.99477132000578e-05, + "loss": 1.2416, + "step": 7922 + }, + { + "epoch": 0.18478923170664155, + "grad_norm": 1.7816013097763062, + "learning_rate": 1.9947687481330992e-05, + "loss": 1.6055, + "step": 7923 + }, + { + "epoch": 0.18481255484581946, + "grad_norm": 1.6914864778518677, + "learning_rate": 1.994766175629709e-05, + "loss": 1.52, + "step": 7924 + }, + { + "epoch": 0.1848358779849974, + "grad_norm": 1.6175390481948853, + "learning_rate": 1.994763602495611e-05, + "loss": 1.424, + "step": 7925 + }, + { + "epoch": 0.1848592011241753, + "grad_norm": 2.0779285430908203, + "learning_rate": 1.994761028730807e-05, + "loss": 1.37, + "step": 7926 + }, + { + "epoch": 0.18488252426335322, + "grad_norm": 1.708913803100586, + "learning_rate": 1.994758454335298e-05, + "loss": 1.4573, + "step": 7927 + }, + { + "epoch": 0.18490584740253113, + "grad_norm": 2.074214220046997, + "learning_rate": 1.994755879309086e-05, + "loss": 1.4446, + "step": 7928 + }, + { + "epoch": 0.18492917054170907, + "grad_norm": 2.137077808380127, + "learning_rate": 1.9947533036521728e-05, + "loss": 1.4644, + "step": 7929 + }, + { + "epoch": 0.18495249368088698, + "grad_norm": 2.0304949283599854, + "learning_rate": 1.9947507273645602e-05, + "loss": 1.5185, + "step": 7930 + }, + { + "epoch": 0.1849758168200649, + "grad_norm": 1.7847139835357666, + "learning_rate": 1.994748150446249e-05, + "loss": 1.1471, + "step": 7931 + }, + { + "epoch": 0.1849991399592428, + "grad_norm": 1.67708420753479, + "learning_rate": 1.9947455728972416e-05, + "loss": 1.2985, + "step": 7932 + }, + { + "epoch": 0.18502246309842074, + "grad_norm": 2.097397565841675, + "learning_rate": 1.9947429947175394e-05, + "loss": 1.1422, + "step": 7933 + }, + { + "epoch": 0.18504578623759865, + "grad_norm": 2.0835020542144775, + "learning_rate": 1.9947404159071438e-05, + "loss": 1.4407, + "step": 7934 + }, + { + "epoch": 0.18506910937677656, + "grad_norm": 1.7284703254699707, + "learning_rate": 1.994737836466057e-05, + "loss": 1.105, + "step": 7935 + }, + { + "epoch": 0.18509243251595447, + "grad_norm": 2.159390926361084, + "learning_rate": 1.99473525639428e-05, + "loss": 1.582, + "step": 7936 + }, + { + "epoch": 0.1851157556551324, + "grad_norm": 1.9313980340957642, + "learning_rate": 1.994732675691815e-05, + "loss": 1.6096, + "step": 7937 + }, + { + "epoch": 0.18513907879431032, + "grad_norm": 2.401472568511963, + "learning_rate": 1.9947300943586634e-05, + "loss": 1.7527, + "step": 7938 + }, + { + "epoch": 0.18516240193348824, + "grad_norm": 2.1686158180236816, + "learning_rate": 1.9947275123948265e-05, + "loss": 1.353, + "step": 7939 + }, + { + "epoch": 0.18518572507266615, + "grad_norm": 1.7129522562026978, + "learning_rate": 1.9947249298003066e-05, + "loss": 1.4647, + "step": 7940 + }, + { + "epoch": 0.18520904821184409, + "grad_norm": 1.9989738464355469, + "learning_rate": 1.9947223465751048e-05, + "loss": 1.7509, + "step": 7941 + }, + { + "epoch": 0.185232371351022, + "grad_norm": 2.116532802581787, + "learning_rate": 1.994719762719223e-05, + "loss": 1.2229, + "step": 7942 + }, + { + "epoch": 0.1852556944901999, + "grad_norm": 2.018582344055176, + "learning_rate": 1.9947171782326625e-05, + "loss": 1.6193, + "step": 7943 + }, + { + "epoch": 0.18527901762937782, + "grad_norm": 1.9196641445159912, + "learning_rate": 1.9947145931154253e-05, + "loss": 1.3477, + "step": 7944 + }, + { + "epoch": 0.18530234076855576, + "grad_norm": 1.7149230241775513, + "learning_rate": 1.994712007367513e-05, + "loss": 1.5486, + "step": 7945 + }, + { + "epoch": 0.18532566390773367, + "grad_norm": 2.124716281890869, + "learning_rate": 1.9947094209889273e-05, + "loss": 1.2705, + "step": 7946 + }, + { + "epoch": 0.18534898704691158, + "grad_norm": 1.7222310304641724, + "learning_rate": 1.9947068339796693e-05, + "loss": 1.2072, + "step": 7947 + }, + { + "epoch": 0.1853723101860895, + "grad_norm": 1.5903428792953491, + "learning_rate": 1.9947042463397416e-05, + "loss": 1.4994, + "step": 7948 + }, + { + "epoch": 0.18539563332526743, + "grad_norm": 1.560240626335144, + "learning_rate": 1.9947016580691448e-05, + "loss": 1.2752, + "step": 7949 + }, + { + "epoch": 0.18541895646444534, + "grad_norm": 2.0885708332061768, + "learning_rate": 1.994699069167881e-05, + "loss": 1.3853, + "step": 7950 + }, + { + "epoch": 0.18544227960362325, + "grad_norm": 1.8798753023147583, + "learning_rate": 1.9946964796359522e-05, + "loss": 1.3853, + "step": 7951 + }, + { + "epoch": 0.18546560274280116, + "grad_norm": 2.1947786808013916, + "learning_rate": 1.99469388947336e-05, + "loss": 1.5042, + "step": 7952 + }, + { + "epoch": 0.18548892588197907, + "grad_norm": 1.6437233686447144, + "learning_rate": 1.9946912986801054e-05, + "loss": 1.3633, + "step": 7953 + }, + { + "epoch": 0.185512249021157, + "grad_norm": 2.2299771308898926, + "learning_rate": 1.9946887072561902e-05, + "loss": 1.434, + "step": 7954 + }, + { + "epoch": 0.18553557216033492, + "grad_norm": 1.5597115755081177, + "learning_rate": 1.9946861152016167e-05, + "loss": 1.1671, + "step": 7955 + }, + { + "epoch": 0.18555889529951283, + "grad_norm": 1.588959813117981, + "learning_rate": 1.9946835225163857e-05, + "loss": 1.4668, + "step": 7956 + }, + { + "epoch": 0.18558221843869074, + "grad_norm": 2.8228628635406494, + "learning_rate": 1.994680929200499e-05, + "loss": 1.5046, + "step": 7957 + }, + { + "epoch": 0.18560554157786868, + "grad_norm": 1.5038644075393677, + "learning_rate": 1.9946783352539592e-05, + "loss": 1.3631, + "step": 7958 + }, + { + "epoch": 0.1856288647170466, + "grad_norm": 1.5900075435638428, + "learning_rate": 1.9946757406767667e-05, + "loss": 1.2752, + "step": 7959 + }, + { + "epoch": 0.1856521878562245, + "grad_norm": 1.9840675592422485, + "learning_rate": 1.994673145468924e-05, + "loss": 1.3292, + "step": 7960 + }, + { + "epoch": 0.18567551099540242, + "grad_norm": 1.9660426378250122, + "learning_rate": 1.9946705496304322e-05, + "loss": 1.138, + "step": 7961 + }, + { + "epoch": 0.18569883413458035, + "grad_norm": 1.448322057723999, + "learning_rate": 1.994667953161293e-05, + "loss": 1.2275, + "step": 7962 + }, + { + "epoch": 0.18572215727375826, + "grad_norm": 1.7390021085739136, + "learning_rate": 1.9946653560615087e-05, + "loss": 1.4812, + "step": 7963 + }, + { + "epoch": 0.18574548041293618, + "grad_norm": 3.5953032970428467, + "learning_rate": 1.99466275833108e-05, + "loss": 1.1249, + "step": 7964 + }, + { + "epoch": 0.1857688035521141, + "grad_norm": 1.7775022983551025, + "learning_rate": 1.994660159970009e-05, + "loss": 1.3036, + "step": 7965 + }, + { + "epoch": 0.18579212669129203, + "grad_norm": 3.1567859649658203, + "learning_rate": 1.9946575609782977e-05, + "loss": 1.2808, + "step": 7966 + }, + { + "epoch": 0.18581544983046994, + "grad_norm": 1.883466124534607, + "learning_rate": 1.9946549613559473e-05, + "loss": 1.3621, + "step": 7967 + }, + { + "epoch": 0.18583877296964785, + "grad_norm": 2.5921695232391357, + "learning_rate": 1.994652361102959e-05, + "loss": 1.0041, + "step": 7968 + }, + { + "epoch": 0.18586209610882576, + "grad_norm": 1.8654274940490723, + "learning_rate": 1.9946497602193356e-05, + "loss": 1.4541, + "step": 7969 + }, + { + "epoch": 0.1858854192480037, + "grad_norm": 2.1639301776885986, + "learning_rate": 1.994647158705078e-05, + "loss": 1.5828, + "step": 7970 + }, + { + "epoch": 0.1859087423871816, + "grad_norm": 2.2094762325286865, + "learning_rate": 1.994644556560188e-05, + "loss": 1.8598, + "step": 7971 + }, + { + "epoch": 0.18593206552635952, + "grad_norm": 2.0362534523010254, + "learning_rate": 1.994641953784667e-05, + "loss": 1.257, + "step": 7972 + }, + { + "epoch": 0.18595538866553743, + "grad_norm": 1.8667770624160767, + "learning_rate": 1.994639350378517e-05, + "loss": 1.6087, + "step": 7973 + }, + { + "epoch": 0.18597871180471537, + "grad_norm": 2.361189126968384, + "learning_rate": 1.9946367463417396e-05, + "loss": 1.2616, + "step": 7974 + }, + { + "epoch": 0.18600203494389328, + "grad_norm": 1.7298156023025513, + "learning_rate": 1.9946341416743366e-05, + "loss": 1.2566, + "step": 7975 + }, + { + "epoch": 0.1860253580830712, + "grad_norm": 1.6245572566986084, + "learning_rate": 1.9946315363763094e-05, + "loss": 1.5809, + "step": 7976 + }, + { + "epoch": 0.1860486812222491, + "grad_norm": 1.4986340999603271, + "learning_rate": 1.9946289304476593e-05, + "loss": 1.1742, + "step": 7977 + }, + { + "epoch": 0.186072004361427, + "grad_norm": 1.9172253608703613, + "learning_rate": 1.9946263238883885e-05, + "loss": 1.2802, + "step": 7978 + }, + { + "epoch": 0.18609532750060495, + "grad_norm": 2.1387524604797363, + "learning_rate": 1.9946237166984987e-05, + "loss": 1.5364, + "step": 7979 + }, + { + "epoch": 0.18611865063978286, + "grad_norm": 2.082533597946167, + "learning_rate": 1.994621108877991e-05, + "loss": 1.3603, + "step": 7980 + }, + { + "epoch": 0.18614197377896077, + "grad_norm": 2.1373279094696045, + "learning_rate": 1.9946185004268678e-05, + "loss": 1.4161, + "step": 7981 + }, + { + "epoch": 0.18616529691813868, + "grad_norm": 1.6638572216033936, + "learning_rate": 1.9946158913451302e-05, + "loss": 1.4324, + "step": 7982 + }, + { + "epoch": 0.18618862005731662, + "grad_norm": 2.010808229446411, + "learning_rate": 1.9946132816327803e-05, + "loss": 1.7157, + "step": 7983 + }, + { + "epoch": 0.18621194319649453, + "grad_norm": 2.016139507293701, + "learning_rate": 1.9946106712898187e-05, + "loss": 1.3872, + "step": 7984 + }, + { + "epoch": 0.18623526633567244, + "grad_norm": 1.898192286491394, + "learning_rate": 1.9946080603162486e-05, + "loss": 1.4716, + "step": 7985 + }, + { + "epoch": 0.18625858947485036, + "grad_norm": 2.282742738723755, + "learning_rate": 1.9946054487120702e-05, + "loss": 1.7555, + "step": 7986 + }, + { + "epoch": 0.1862819126140283, + "grad_norm": 1.9659360647201538, + "learning_rate": 1.9946028364772863e-05, + "loss": 1.3135, + "step": 7987 + }, + { + "epoch": 0.1863052357532062, + "grad_norm": 1.7083957195281982, + "learning_rate": 1.9946002236118983e-05, + "loss": 1.2011, + "step": 7988 + }, + { + "epoch": 0.18632855889238412, + "grad_norm": 1.4872326850891113, + "learning_rate": 1.994597610115907e-05, + "loss": 1.2929, + "step": 7989 + }, + { + "epoch": 0.18635188203156203, + "grad_norm": 1.591417908668518, + "learning_rate": 1.994594995989315e-05, + "loss": 1.2564, + "step": 7990 + }, + { + "epoch": 0.18637520517073997, + "grad_norm": 1.9930405616760254, + "learning_rate": 1.9945923812321237e-05, + "loss": 1.7572, + "step": 7991 + }, + { + "epoch": 0.18639852830991788, + "grad_norm": 1.6667670011520386, + "learning_rate": 1.994589765844335e-05, + "loss": 1.5365, + "step": 7992 + }, + { + "epoch": 0.1864218514490958, + "grad_norm": 1.870199203491211, + "learning_rate": 1.99458714982595e-05, + "loss": 1.3736, + "step": 7993 + }, + { + "epoch": 0.1864451745882737, + "grad_norm": 1.3614585399627686, + "learning_rate": 1.9945845331769705e-05, + "loss": 1.0934, + "step": 7994 + }, + { + "epoch": 0.18646849772745164, + "grad_norm": 1.7856804132461548, + "learning_rate": 1.9945819158973986e-05, + "loss": 1.4461, + "step": 7995 + }, + { + "epoch": 0.18649182086662955, + "grad_norm": 1.4931102991104126, + "learning_rate": 1.9945792979872355e-05, + "loss": 1.2985, + "step": 7996 + }, + { + "epoch": 0.18651514400580746, + "grad_norm": 2.0775625705718994, + "learning_rate": 1.9945766794464827e-05, + "loss": 1.4307, + "step": 7997 + }, + { + "epoch": 0.18653846714498537, + "grad_norm": 1.823507308959961, + "learning_rate": 1.9945740602751424e-05, + "loss": 1.5225, + "step": 7998 + }, + { + "epoch": 0.1865617902841633, + "grad_norm": 1.7743809223175049, + "learning_rate": 1.9945714404732162e-05, + "loss": 1.4676, + "step": 7999 + }, + { + "epoch": 0.18658511342334122, + "grad_norm": 1.7636680603027344, + "learning_rate": 1.9945688200407055e-05, + "loss": 0.8919, + "step": 8000 + }, + { + "epoch": 0.18660843656251913, + "grad_norm": 2.207695722579956, + "learning_rate": 1.9945661989776123e-05, + "loss": 1.8364, + "step": 8001 + }, + { + "epoch": 0.18663175970169704, + "grad_norm": 2.1753907203674316, + "learning_rate": 1.9945635772839376e-05, + "loss": 1.385, + "step": 8002 + }, + { + "epoch": 0.18665508284087498, + "grad_norm": 1.8537788391113281, + "learning_rate": 1.9945609549596835e-05, + "loss": 1.5613, + "step": 8003 + }, + { + "epoch": 0.1866784059800529, + "grad_norm": 2.003807783126831, + "learning_rate": 1.9945583320048517e-05, + "loss": 1.3628, + "step": 8004 + }, + { + "epoch": 0.1867017291192308, + "grad_norm": 1.829106330871582, + "learning_rate": 1.994555708419444e-05, + "loss": 1.4376, + "step": 8005 + }, + { + "epoch": 0.1867250522584087, + "grad_norm": 2.1139347553253174, + "learning_rate": 1.994553084203462e-05, + "loss": 1.4208, + "step": 8006 + }, + { + "epoch": 0.18674837539758662, + "grad_norm": 1.9346517324447632, + "learning_rate": 1.9945504593569066e-05, + "loss": 1.2677, + "step": 8007 + }, + { + "epoch": 0.18677169853676456, + "grad_norm": 1.983305811882019, + "learning_rate": 1.9945478338797807e-05, + "loss": 0.9442, + "step": 8008 + }, + { + "epoch": 0.18679502167594247, + "grad_norm": 2.406250476837158, + "learning_rate": 1.994545207772085e-05, + "loss": 1.4273, + "step": 8009 + }, + { + "epoch": 0.18681834481512039, + "grad_norm": 1.7576619386672974, + "learning_rate": 1.994542581033822e-05, + "loss": 1.3803, + "step": 8010 + }, + { + "epoch": 0.1868416679542983, + "grad_norm": 1.7855862379074097, + "learning_rate": 1.9945399536649926e-05, + "loss": 1.3361, + "step": 8011 + }, + { + "epoch": 0.18686499109347623, + "grad_norm": 3.4729361534118652, + "learning_rate": 1.9945373256655984e-05, + "loss": 1.3804, + "step": 8012 + }, + { + "epoch": 0.18688831423265415, + "grad_norm": 1.842843770980835, + "learning_rate": 1.994534697035642e-05, + "loss": 1.3715, + "step": 8013 + }, + { + "epoch": 0.18691163737183206, + "grad_norm": 1.673634648323059, + "learning_rate": 1.9945320677751243e-05, + "loss": 1.3377, + "step": 8014 + }, + { + "epoch": 0.18693496051100997, + "grad_norm": 1.689892292022705, + "learning_rate": 1.994529437884047e-05, + "loss": 1.6465, + "step": 8015 + }, + { + "epoch": 0.1869582836501879, + "grad_norm": 1.6069334745407104, + "learning_rate": 1.994526807362412e-05, + "loss": 1.3692, + "step": 8016 + }, + { + "epoch": 0.18698160678936582, + "grad_norm": 1.6543606519699097, + "learning_rate": 1.994524176210221e-05, + "loss": 1.356, + "step": 8017 + }, + { + "epoch": 0.18700492992854373, + "grad_norm": 2.767754077911377, + "learning_rate": 1.9945215444274755e-05, + "loss": 1.3487, + "step": 8018 + }, + { + "epoch": 0.18702825306772164, + "grad_norm": 2.0824317932128906, + "learning_rate": 1.994518912014177e-05, + "loss": 1.4738, + "step": 8019 + }, + { + "epoch": 0.18705157620689958, + "grad_norm": 2.4381985664367676, + "learning_rate": 1.994516278970328e-05, + "loss": 1.25, + "step": 8020 + }, + { + "epoch": 0.1870748993460775, + "grad_norm": 1.8610236644744873, + "learning_rate": 1.994513645295929e-05, + "loss": 1.4521, + "step": 8021 + }, + { + "epoch": 0.1870982224852554, + "grad_norm": 1.769096851348877, + "learning_rate": 1.9945110109909823e-05, + "loss": 1.4287, + "step": 8022 + }, + { + "epoch": 0.1871215456244333, + "grad_norm": 2.075126886367798, + "learning_rate": 1.9945083760554898e-05, + "loss": 1.2237, + "step": 8023 + }, + { + "epoch": 0.18714486876361125, + "grad_norm": 2.218397617340088, + "learning_rate": 1.9945057404894528e-05, + "loss": 1.5074, + "step": 8024 + }, + { + "epoch": 0.18716819190278916, + "grad_norm": 1.8922489881515503, + "learning_rate": 1.994503104292873e-05, + "loss": 1.9894, + "step": 8025 + }, + { + "epoch": 0.18719151504196707, + "grad_norm": 1.7199231386184692, + "learning_rate": 1.994500467465752e-05, + "loss": 1.3731, + "step": 8026 + }, + { + "epoch": 0.18721483818114498, + "grad_norm": 1.6674853563308716, + "learning_rate": 1.9944978300080916e-05, + "loss": 1.2887, + "step": 8027 + }, + { + "epoch": 0.18723816132032292, + "grad_norm": 1.5613702535629272, + "learning_rate": 1.9944951919198937e-05, + "loss": 1.3634, + "step": 8028 + }, + { + "epoch": 0.18726148445950083, + "grad_norm": 1.809898853302002, + "learning_rate": 1.9944925532011593e-05, + "loss": 1.494, + "step": 8029 + }, + { + "epoch": 0.18728480759867874, + "grad_norm": 2.0476179122924805, + "learning_rate": 1.994489913851891e-05, + "loss": 1.6709, + "step": 8030 + }, + { + "epoch": 0.18730813073785665, + "grad_norm": 1.7805126905441284, + "learning_rate": 1.99448727387209e-05, + "loss": 1.1503, + "step": 8031 + }, + { + "epoch": 0.1873314538770346, + "grad_norm": 1.7238508462905884, + "learning_rate": 1.9944846332617575e-05, + "loss": 1.3155, + "step": 8032 + }, + { + "epoch": 0.1873547770162125, + "grad_norm": 1.9023475646972656, + "learning_rate": 1.9944819920208958e-05, + "loss": 1.7491, + "step": 8033 + }, + { + "epoch": 0.18737810015539041, + "grad_norm": 1.6812968254089355, + "learning_rate": 1.9944793501495065e-05, + "loss": 1.6871, + "step": 8034 + }, + { + "epoch": 0.18740142329456833, + "grad_norm": 1.6871724128723145, + "learning_rate": 1.9944767076475914e-05, + "loss": 1.2066, + "step": 8035 + }, + { + "epoch": 0.18742474643374624, + "grad_norm": 1.9575331211090088, + "learning_rate": 1.9944740645151516e-05, + "loss": 1.6621, + "step": 8036 + }, + { + "epoch": 0.18744806957292418, + "grad_norm": 1.4951329231262207, + "learning_rate": 1.994471420752189e-05, + "loss": 1.3975, + "step": 8037 + }, + { + "epoch": 0.1874713927121021, + "grad_norm": 1.585641622543335, + "learning_rate": 1.994468776358706e-05, + "loss": 1.3611, + "step": 8038 + }, + { + "epoch": 0.18749471585128, + "grad_norm": 1.741133451461792, + "learning_rate": 1.9944661313347034e-05, + "loss": 1.1002, + "step": 8039 + }, + { + "epoch": 0.1875180389904579, + "grad_norm": 1.6459580659866333, + "learning_rate": 1.994463485680183e-05, + "loss": 1.6053, + "step": 8040 + }, + { + "epoch": 0.18754136212963585, + "grad_norm": 2.058819532394409, + "learning_rate": 1.9944608393951467e-05, + "loss": 1.2962, + "step": 8041 + }, + { + "epoch": 0.18756468526881376, + "grad_norm": 1.8824231624603271, + "learning_rate": 1.9944581924795962e-05, + "loss": 1.3528, + "step": 8042 + }, + { + "epoch": 0.18758800840799167, + "grad_norm": 1.7201250791549683, + "learning_rate": 1.994455544933533e-05, + "loss": 1.2389, + "step": 8043 + }, + { + "epoch": 0.18761133154716958, + "grad_norm": 1.6976569890975952, + "learning_rate": 1.994452896756959e-05, + "loss": 1.2481, + "step": 8044 + }, + { + "epoch": 0.18763465468634752, + "grad_norm": 1.670017123222351, + "learning_rate": 1.994450247949876e-05, + "loss": 1.2618, + "step": 8045 + }, + { + "epoch": 0.18765797782552543, + "grad_norm": 1.6716724634170532, + "learning_rate": 1.994447598512285e-05, + "loss": 1.3585, + "step": 8046 + }, + { + "epoch": 0.18768130096470334, + "grad_norm": 1.8448940515518188, + "learning_rate": 1.9944449484441882e-05, + "loss": 1.3258, + "step": 8047 + }, + { + "epoch": 0.18770462410388125, + "grad_norm": 1.909206509590149, + "learning_rate": 1.9944422977455874e-05, + "loss": 1.3532, + "step": 8048 + }, + { + "epoch": 0.1877279472430592, + "grad_norm": 1.9497190713882446, + "learning_rate": 1.994439646416484e-05, + "loss": 1.3155, + "step": 8049 + }, + { + "epoch": 0.1877512703822371, + "grad_norm": 1.8700488805770874, + "learning_rate": 1.9944369944568795e-05, + "loss": 1.3001, + "step": 8050 + }, + { + "epoch": 0.187774593521415, + "grad_norm": 1.807328462600708, + "learning_rate": 1.994434341866776e-05, + "loss": 1.3008, + "step": 8051 + }, + { + "epoch": 0.18779791666059292, + "grad_norm": 2.675473213195801, + "learning_rate": 1.994431688646175e-05, + "loss": 1.3074, + "step": 8052 + }, + { + "epoch": 0.18782123979977086, + "grad_norm": 1.877972960472107, + "learning_rate": 1.9944290347950785e-05, + "loss": 1.3972, + "step": 8053 + }, + { + "epoch": 0.18784456293894877, + "grad_norm": 2.1658921241760254, + "learning_rate": 1.9944263803134872e-05, + "loss": 1.0549, + "step": 8054 + }, + { + "epoch": 0.18786788607812668, + "grad_norm": 1.9666105508804321, + "learning_rate": 1.9944237252014043e-05, + "loss": 1.3477, + "step": 8055 + }, + { + "epoch": 0.1878912092173046, + "grad_norm": 1.710775375366211, + "learning_rate": 1.99442106945883e-05, + "loss": 1.6234, + "step": 8056 + }, + { + "epoch": 0.18791453235648253, + "grad_norm": 1.8230870962142944, + "learning_rate": 1.994418413085767e-05, + "loss": 1.2233, + "step": 8057 + }, + { + "epoch": 0.18793785549566044, + "grad_norm": 1.753082513809204, + "learning_rate": 1.9944157560822165e-05, + "loss": 1.4733, + "step": 8058 + }, + { + "epoch": 0.18796117863483836, + "grad_norm": 1.6921221017837524, + "learning_rate": 1.9944130984481803e-05, + "loss": 1.5471, + "step": 8059 + }, + { + "epoch": 0.18798450177401627, + "grad_norm": 1.82761549949646, + "learning_rate": 1.9944104401836603e-05, + "loss": 1.5045, + "step": 8060 + }, + { + "epoch": 0.1880078249131942, + "grad_norm": 1.9594773054122925, + "learning_rate": 1.9944077812886574e-05, + "loss": 1.7704, + "step": 8061 + }, + { + "epoch": 0.18803114805237212, + "grad_norm": 1.6918596029281616, + "learning_rate": 1.9944051217631743e-05, + "loss": 1.4566, + "step": 8062 + }, + { + "epoch": 0.18805447119155003, + "grad_norm": 2.213282585144043, + "learning_rate": 1.994402461607212e-05, + "loss": 1.5426, + "step": 8063 + }, + { + "epoch": 0.18807779433072794, + "grad_norm": 2.0702004432678223, + "learning_rate": 1.994399800820773e-05, + "loss": 1.3526, + "step": 8064 + }, + { + "epoch": 0.18810111746990585, + "grad_norm": 1.8484736680984497, + "learning_rate": 1.9943971394038577e-05, + "loss": 1.2083, + "step": 8065 + }, + { + "epoch": 0.1881244406090838, + "grad_norm": 2.404710292816162, + "learning_rate": 1.994394477356469e-05, + "loss": 1.5698, + "step": 8066 + }, + { + "epoch": 0.1881477637482617, + "grad_norm": 2.142174482345581, + "learning_rate": 1.994391814678608e-05, + "loss": 1.3843, + "step": 8067 + }, + { + "epoch": 0.1881710868874396, + "grad_norm": 1.6937545537948608, + "learning_rate": 1.9943891513702764e-05, + "loss": 1.3174, + "step": 8068 + }, + { + "epoch": 0.18819441002661752, + "grad_norm": 1.9351191520690918, + "learning_rate": 1.994386487431476e-05, + "loss": 1.5521, + "step": 8069 + }, + { + "epoch": 0.18821773316579546, + "grad_norm": 1.9769344329833984, + "learning_rate": 1.9943838228622084e-05, + "loss": 1.1974, + "step": 8070 + }, + { + "epoch": 0.18824105630497337, + "grad_norm": 1.6110507249832153, + "learning_rate": 1.9943811576624754e-05, + "loss": 1.4979, + "step": 8071 + }, + { + "epoch": 0.18826437944415128, + "grad_norm": 1.7549691200256348, + "learning_rate": 1.9943784918322787e-05, + "loss": 1.6069, + "step": 8072 + }, + { + "epoch": 0.1882877025833292, + "grad_norm": 1.6509368419647217, + "learning_rate": 1.9943758253716196e-05, + "loss": 1.3733, + "step": 8073 + }, + { + "epoch": 0.18831102572250713, + "grad_norm": 1.928328275680542, + "learning_rate": 1.9943731582805005e-05, + "loss": 1.4462, + "step": 8074 + }, + { + "epoch": 0.18833434886168504, + "grad_norm": 1.787205457687378, + "learning_rate": 1.9943704905589224e-05, + "loss": 1.114, + "step": 8075 + }, + { + "epoch": 0.18835767200086295, + "grad_norm": 1.811537742614746, + "learning_rate": 1.9943678222068877e-05, + "loss": 1.3102, + "step": 8076 + }, + { + "epoch": 0.18838099514004086, + "grad_norm": 1.5359605550765991, + "learning_rate": 1.9943651532243974e-05, + "loss": 1.4107, + "step": 8077 + }, + { + "epoch": 0.1884043182792188, + "grad_norm": 1.966202735900879, + "learning_rate": 1.9943624836114536e-05, + "loss": 1.3086, + "step": 8078 + }, + { + "epoch": 0.1884276414183967, + "grad_norm": 1.8312933444976807, + "learning_rate": 1.9943598133680577e-05, + "loss": 1.786, + "step": 8079 + }, + { + "epoch": 0.18845096455757462, + "grad_norm": 2.050481081008911, + "learning_rate": 1.994357142494212e-05, + "loss": 1.4257, + "step": 8080 + }, + { + "epoch": 0.18847428769675253, + "grad_norm": 1.5473322868347168, + "learning_rate": 1.9943544709899174e-05, + "loss": 1.4178, + "step": 8081 + }, + { + "epoch": 0.18849761083593047, + "grad_norm": 1.9566378593444824, + "learning_rate": 1.994351798855176e-05, + "loss": 1.5021, + "step": 8082 + }, + { + "epoch": 0.18852093397510838, + "grad_norm": 1.685613751411438, + "learning_rate": 1.9943491260899895e-05, + "loss": 1.477, + "step": 8083 + }, + { + "epoch": 0.1885442571142863, + "grad_norm": 2.1665124893188477, + "learning_rate": 1.9943464526943596e-05, + "loss": 1.4167, + "step": 8084 + }, + { + "epoch": 0.1885675802534642, + "grad_norm": 1.6005083322525024, + "learning_rate": 1.994343778668288e-05, + "loss": 1.3154, + "step": 8085 + }, + { + "epoch": 0.18859090339264215, + "grad_norm": 1.801766276359558, + "learning_rate": 1.994341104011776e-05, + "loss": 1.1059, + "step": 8086 + }, + { + "epoch": 0.18861422653182006, + "grad_norm": 1.7353712320327759, + "learning_rate": 1.994338428724826e-05, + "loss": 1.8245, + "step": 8087 + }, + { + "epoch": 0.18863754967099797, + "grad_norm": 1.7517952919006348, + "learning_rate": 1.9943357528074392e-05, + "loss": 1.6825, + "step": 8088 + }, + { + "epoch": 0.18866087281017588, + "grad_norm": 1.4427310228347778, + "learning_rate": 1.9943330762596175e-05, + "loss": 1.1498, + "step": 8089 + }, + { + "epoch": 0.18868419594935382, + "grad_norm": 2.2579891681671143, + "learning_rate": 1.9943303990813626e-05, + "loss": 1.643, + "step": 8090 + }, + { + "epoch": 0.18870751908853173, + "grad_norm": 1.8416584730148315, + "learning_rate": 1.9943277212726758e-05, + "loss": 1.331, + "step": 8091 + }, + { + "epoch": 0.18873084222770964, + "grad_norm": 1.7211225032806396, + "learning_rate": 1.9943250428335592e-05, + "loss": 1.2361, + "step": 8092 + }, + { + "epoch": 0.18875416536688755, + "grad_norm": 1.7025599479675293, + "learning_rate": 1.9943223637640147e-05, + "loss": 1.5869, + "step": 8093 + }, + { + "epoch": 0.18877748850606546, + "grad_norm": 1.8268306255340576, + "learning_rate": 1.9943196840640432e-05, + "loss": 1.3484, + "step": 8094 + }, + { + "epoch": 0.1888008116452434, + "grad_norm": 1.8497521877288818, + "learning_rate": 1.9943170037336474e-05, + "loss": 1.6849, + "step": 8095 + }, + { + "epoch": 0.1888241347844213, + "grad_norm": 1.8361694812774658, + "learning_rate": 1.9943143227728284e-05, + "loss": 1.4866, + "step": 8096 + }, + { + "epoch": 0.18884745792359922, + "grad_norm": 2.5526516437530518, + "learning_rate": 1.9943116411815878e-05, + "loss": 1.4491, + "step": 8097 + }, + { + "epoch": 0.18887078106277713, + "grad_norm": 1.5819147825241089, + "learning_rate": 1.9943089589599277e-05, + "loss": 1.4978, + "step": 8098 + }, + { + "epoch": 0.18889410420195507, + "grad_norm": 1.6812257766723633, + "learning_rate": 1.99430627610785e-05, + "loss": 1.3715, + "step": 8099 + }, + { + "epoch": 0.18891742734113298, + "grad_norm": 2.1168878078460693, + "learning_rate": 1.9943035926253555e-05, + "loss": 1.4707, + "step": 8100 + }, + { + "epoch": 0.1889407504803109, + "grad_norm": 1.9880980253219604, + "learning_rate": 1.9943009085124463e-05, + "loss": 1.6262, + "step": 8101 + }, + { + "epoch": 0.1889640736194888, + "grad_norm": 1.6023935079574585, + "learning_rate": 1.9942982237691244e-05, + "loss": 1.4451, + "step": 8102 + }, + { + "epoch": 0.18898739675866674, + "grad_norm": 1.5494956970214844, + "learning_rate": 1.9942955383953915e-05, + "loss": 0.8919, + "step": 8103 + }, + { + "epoch": 0.18901071989784465, + "grad_norm": 1.7539976835250854, + "learning_rate": 1.9942928523912492e-05, + "loss": 1.3114, + "step": 8104 + }, + { + "epoch": 0.18903404303702256, + "grad_norm": 2.1129629611968994, + "learning_rate": 1.9942901657566986e-05, + "loss": 1.6568, + "step": 8105 + }, + { + "epoch": 0.18905736617620048, + "grad_norm": 2.1358554363250732, + "learning_rate": 1.9942874784917423e-05, + "loss": 1.5842, + "step": 8106 + }, + { + "epoch": 0.18908068931537841, + "grad_norm": 1.8218640089035034, + "learning_rate": 1.9942847905963818e-05, + "loss": 1.2124, + "step": 8107 + }, + { + "epoch": 0.18910401245455633, + "grad_norm": 1.9492056369781494, + "learning_rate": 1.9942821020706184e-05, + "loss": 1.2896, + "step": 8108 + }, + { + "epoch": 0.18912733559373424, + "grad_norm": 1.777604579925537, + "learning_rate": 1.994279412914454e-05, + "loss": 1.3758, + "step": 8109 + }, + { + "epoch": 0.18915065873291215, + "grad_norm": 1.6945788860321045, + "learning_rate": 1.9942767231278902e-05, + "loss": 1.3345, + "step": 8110 + }, + { + "epoch": 0.18917398187209009, + "grad_norm": 1.644107699394226, + "learning_rate": 1.9942740327109292e-05, + "loss": 1.4362, + "step": 8111 + }, + { + "epoch": 0.189197305011268, + "grad_norm": 1.9671415090560913, + "learning_rate": 1.9942713416635724e-05, + "loss": 1.5851, + "step": 8112 + }, + { + "epoch": 0.1892206281504459, + "grad_norm": 1.5927369594573975, + "learning_rate": 1.994268649985821e-05, + "loss": 1.5235, + "step": 8113 + }, + { + "epoch": 0.18924395128962382, + "grad_norm": 2.0805578231811523, + "learning_rate": 1.9942659576776778e-05, + "loss": 1.7172, + "step": 8114 + }, + { + "epoch": 0.18926727442880176, + "grad_norm": 2.035644292831421, + "learning_rate": 1.9942632647391434e-05, + "loss": 1.7488, + "step": 8115 + }, + { + "epoch": 0.18929059756797967, + "grad_norm": 2.7134170532226562, + "learning_rate": 1.9942605711702203e-05, + "loss": 1.8708, + "step": 8116 + }, + { + "epoch": 0.18931392070715758, + "grad_norm": 1.614647388458252, + "learning_rate": 1.9942578769709096e-05, + "loss": 1.3746, + "step": 8117 + }, + { + "epoch": 0.1893372438463355, + "grad_norm": 1.6869707107543945, + "learning_rate": 1.9942551821412136e-05, + "loss": 1.4048, + "step": 8118 + }, + { + "epoch": 0.18936056698551343, + "grad_norm": 2.027705669403076, + "learning_rate": 1.9942524866811336e-05, + "loss": 1.4787, + "step": 8119 + }, + { + "epoch": 0.18938389012469134, + "grad_norm": 1.6126846075057983, + "learning_rate": 1.9942497905906714e-05, + "loss": 1.0186, + "step": 8120 + }, + { + "epoch": 0.18940721326386925, + "grad_norm": 1.8642371892929077, + "learning_rate": 1.9942470938698286e-05, + "loss": 1.1917, + "step": 8121 + }, + { + "epoch": 0.18943053640304716, + "grad_norm": 1.6385704278945923, + "learning_rate": 1.9942443965186073e-05, + "loss": 1.1947, + "step": 8122 + }, + { + "epoch": 0.18945385954222507, + "grad_norm": 2.0080950260162354, + "learning_rate": 1.9942416985370085e-05, + "loss": 1.5238, + "step": 8123 + }, + { + "epoch": 0.189477182681403, + "grad_norm": 1.7528451681137085, + "learning_rate": 1.9942389999250345e-05, + "loss": 1.4537, + "step": 8124 + }, + { + "epoch": 0.18950050582058092, + "grad_norm": 1.492655873298645, + "learning_rate": 1.994236300682687e-05, + "loss": 1.2211, + "step": 8125 + }, + { + "epoch": 0.18952382895975883, + "grad_norm": 1.956758975982666, + "learning_rate": 1.9942336008099677e-05, + "loss": 1.2076, + "step": 8126 + }, + { + "epoch": 0.18954715209893674, + "grad_norm": 2.105048418045044, + "learning_rate": 1.9942309003068782e-05, + "loss": 1.7738, + "step": 8127 + }, + { + "epoch": 0.18957047523811468, + "grad_norm": 1.71355140209198, + "learning_rate": 1.9942281991734203e-05, + "loss": 1.1747, + "step": 8128 + }, + { + "epoch": 0.1895937983772926, + "grad_norm": 1.5540777444839478, + "learning_rate": 1.994225497409595e-05, + "loss": 1.5639, + "step": 8129 + }, + { + "epoch": 0.1896171215164705, + "grad_norm": 1.809600591659546, + "learning_rate": 1.9942227950154053e-05, + "loss": 1.7625, + "step": 8130 + }, + { + "epoch": 0.18964044465564842, + "grad_norm": 1.4709620475769043, + "learning_rate": 1.9942200919908518e-05, + "loss": 1.2442, + "step": 8131 + }, + { + "epoch": 0.18966376779482635, + "grad_norm": 1.7274200916290283, + "learning_rate": 1.994217388335937e-05, + "loss": 1.547, + "step": 8132 + }, + { + "epoch": 0.18968709093400427, + "grad_norm": 1.774335503578186, + "learning_rate": 1.9942146840506622e-05, + "loss": 1.8524, + "step": 8133 + }, + { + "epoch": 0.18971041407318218, + "grad_norm": 2.0405828952789307, + "learning_rate": 1.994211979135029e-05, + "loss": 1.3801, + "step": 8134 + }, + { + "epoch": 0.1897337372123601, + "grad_norm": 1.4285510778427124, + "learning_rate": 1.9942092735890398e-05, + "loss": 1.2567, + "step": 8135 + }, + { + "epoch": 0.18975706035153803, + "grad_norm": 1.6399363279342651, + "learning_rate": 1.9942065674126954e-05, + "loss": 1.4007, + "step": 8136 + }, + { + "epoch": 0.18978038349071594, + "grad_norm": 1.9271926879882812, + "learning_rate": 1.994203860605998e-05, + "loss": 1.5349, + "step": 8137 + }, + { + "epoch": 0.18980370662989385, + "grad_norm": 2.0672764778137207, + "learning_rate": 1.9942011531689493e-05, + "loss": 1.4373, + "step": 8138 + }, + { + "epoch": 0.18982702976907176, + "grad_norm": 1.8264596462249756, + "learning_rate": 1.9941984451015506e-05, + "loss": 1.4586, + "step": 8139 + }, + { + "epoch": 0.1898503529082497, + "grad_norm": 1.599394679069519, + "learning_rate": 1.9941957364038043e-05, + "loss": 1.543, + "step": 8140 + }, + { + "epoch": 0.1898736760474276, + "grad_norm": 1.9549330472946167, + "learning_rate": 1.9941930270757116e-05, + "loss": 1.2742, + "step": 8141 + }, + { + "epoch": 0.18989699918660552, + "grad_norm": 1.809180498123169, + "learning_rate": 1.9941903171172746e-05, + "loss": 1.8312, + "step": 8142 + }, + { + "epoch": 0.18992032232578343, + "grad_norm": 1.4620929956436157, + "learning_rate": 1.994187606528495e-05, + "loss": 1.0797, + "step": 8143 + }, + { + "epoch": 0.18994364546496137, + "grad_norm": 1.6927831172943115, + "learning_rate": 1.994184895309374e-05, + "loss": 1.4469, + "step": 8144 + }, + { + "epoch": 0.18996696860413928, + "grad_norm": 1.9087426662445068, + "learning_rate": 1.994182183459914e-05, + "loss": 1.7712, + "step": 8145 + }, + { + "epoch": 0.1899902917433172, + "grad_norm": 1.6097540855407715, + "learning_rate": 1.9941794709801165e-05, + "loss": 1.301, + "step": 8146 + }, + { + "epoch": 0.1900136148824951, + "grad_norm": 1.695923924446106, + "learning_rate": 1.9941767578699827e-05, + "loss": 1.5565, + "step": 8147 + }, + { + "epoch": 0.19003693802167304, + "grad_norm": 1.8039361238479614, + "learning_rate": 1.9941740441295148e-05, + "loss": 1.3068, + "step": 8148 + }, + { + "epoch": 0.19006026116085095, + "grad_norm": 1.503901481628418, + "learning_rate": 1.9941713297587145e-05, + "loss": 1.2126, + "step": 8149 + }, + { + "epoch": 0.19008358430002886, + "grad_norm": 1.7467129230499268, + "learning_rate": 1.9941686147575837e-05, + "loss": 1.3303, + "step": 8150 + }, + { + "epoch": 0.19010690743920677, + "grad_norm": 2.086000680923462, + "learning_rate": 1.9941658991261236e-05, + "loss": 1.7307, + "step": 8151 + }, + { + "epoch": 0.19013023057838468, + "grad_norm": 1.9080475568771362, + "learning_rate": 1.9941631828643368e-05, + "loss": 1.5615, + "step": 8152 + }, + { + "epoch": 0.19015355371756262, + "grad_norm": 2.4976491928100586, + "learning_rate": 1.9941604659722236e-05, + "loss": 1.4729, + "step": 8153 + }, + { + "epoch": 0.19017687685674053, + "grad_norm": 2.011699676513672, + "learning_rate": 1.994157748449787e-05, + "loss": 1.6697, + "step": 8154 + }, + { + "epoch": 0.19020019999591845, + "grad_norm": 1.579748272895813, + "learning_rate": 1.9941550302970284e-05, + "loss": 1.404, + "step": 8155 + }, + { + "epoch": 0.19022352313509636, + "grad_norm": 2.2332043647766113, + "learning_rate": 1.994152311513949e-05, + "loss": 1.469, + "step": 8156 + }, + { + "epoch": 0.1902468462742743, + "grad_norm": 2.1241250038146973, + "learning_rate": 1.9941495921005515e-05, + "loss": 1.5282, + "step": 8157 + }, + { + "epoch": 0.1902701694134522, + "grad_norm": 1.9136888980865479, + "learning_rate": 1.9941468720568368e-05, + "loss": 1.3127, + "step": 8158 + }, + { + "epoch": 0.19029349255263012, + "grad_norm": 2.1656668186187744, + "learning_rate": 1.994144151382807e-05, + "loss": 1.3418, + "step": 8159 + }, + { + "epoch": 0.19031681569180803, + "grad_norm": 2.0331249237060547, + "learning_rate": 1.9941414300784632e-05, + "loss": 1.2434, + "step": 8160 + }, + { + "epoch": 0.19034013883098597, + "grad_norm": 1.9308236837387085, + "learning_rate": 1.994138708143808e-05, + "loss": 1.3219, + "step": 8161 + }, + { + "epoch": 0.19036346197016388, + "grad_norm": 1.8444973230361938, + "learning_rate": 1.994135985578843e-05, + "loss": 1.1319, + "step": 8162 + }, + { + "epoch": 0.1903867851093418, + "grad_norm": 1.738200306892395, + "learning_rate": 1.9941332623835692e-05, + "loss": 1.4609, + "step": 8163 + }, + { + "epoch": 0.1904101082485197, + "grad_norm": 2.131331205368042, + "learning_rate": 1.994130538557989e-05, + "loss": 1.6053, + "step": 8164 + }, + { + "epoch": 0.19043343138769764, + "grad_norm": 1.9915144443511963, + "learning_rate": 1.994127814102104e-05, + "loss": 1.1201, + "step": 8165 + }, + { + "epoch": 0.19045675452687555, + "grad_norm": 1.7361172437667847, + "learning_rate": 1.9941250890159157e-05, + "loss": 1.4459, + "step": 8166 + }, + { + "epoch": 0.19048007766605346, + "grad_norm": 1.6310242414474487, + "learning_rate": 1.994122363299426e-05, + "loss": 1.7441, + "step": 8167 + }, + { + "epoch": 0.19050340080523137, + "grad_norm": 1.64780855178833, + "learning_rate": 1.994119636952637e-05, + "loss": 1.5425, + "step": 8168 + }, + { + "epoch": 0.1905267239444093, + "grad_norm": 1.6156033277511597, + "learning_rate": 1.9941169099755497e-05, + "loss": 1.2656, + "step": 8169 + }, + { + "epoch": 0.19055004708358722, + "grad_norm": 2.1582720279693604, + "learning_rate": 1.9941141823681662e-05, + "loss": 1.7187, + "step": 8170 + }, + { + "epoch": 0.19057337022276513, + "grad_norm": 1.742720127105713, + "learning_rate": 1.9941114541304885e-05, + "loss": 1.1932, + "step": 8171 + }, + { + "epoch": 0.19059669336194304, + "grad_norm": 1.6932387351989746, + "learning_rate": 1.9941087252625177e-05, + "loss": 1.421, + "step": 8172 + }, + { + "epoch": 0.19062001650112098, + "grad_norm": 2.1907033920288086, + "learning_rate": 1.994105995764256e-05, + "loss": 1.126, + "step": 8173 + }, + { + "epoch": 0.1906433396402989, + "grad_norm": 1.831464409828186, + "learning_rate": 1.994103265635705e-05, + "loss": 1.4429, + "step": 8174 + }, + { + "epoch": 0.1906666627794768, + "grad_norm": 1.551263451576233, + "learning_rate": 1.9941005348768663e-05, + "loss": 1.4275, + "step": 8175 + }, + { + "epoch": 0.19068998591865471, + "grad_norm": 1.668155550956726, + "learning_rate": 1.994097803487742e-05, + "loss": 1.3945, + "step": 8176 + }, + { + "epoch": 0.19071330905783265, + "grad_norm": 2.0183050632476807, + "learning_rate": 1.9940950714683335e-05, + "loss": 1.3959, + "step": 8177 + }, + { + "epoch": 0.19073663219701056, + "grad_norm": 1.7435818910598755, + "learning_rate": 1.9940923388186428e-05, + "loss": 1.4235, + "step": 8178 + }, + { + "epoch": 0.19075995533618847, + "grad_norm": 2.131803274154663, + "learning_rate": 1.9940896055386712e-05, + "loss": 1.3167, + "step": 8179 + }, + { + "epoch": 0.19078327847536639, + "grad_norm": 2.0928549766540527, + "learning_rate": 1.9940868716284206e-05, + "loss": 1.5546, + "step": 8180 + }, + { + "epoch": 0.1908066016145443, + "grad_norm": 1.8970998525619507, + "learning_rate": 1.9940841370878932e-05, + "loss": 1.3789, + "step": 8181 + }, + { + "epoch": 0.19082992475372224, + "grad_norm": 1.8596991300582886, + "learning_rate": 1.99408140191709e-05, + "loss": 1.5757, + "step": 8182 + }, + { + "epoch": 0.19085324789290015, + "grad_norm": 1.9836488962173462, + "learning_rate": 1.994078666116013e-05, + "loss": 1.7786, + "step": 8183 + }, + { + "epoch": 0.19087657103207806, + "grad_norm": 2.0053515434265137, + "learning_rate": 1.9940759296846645e-05, + "loss": 1.4643, + "step": 8184 + }, + { + "epoch": 0.19089989417125597, + "grad_norm": 1.9837238788604736, + "learning_rate": 1.9940731926230455e-05, + "loss": 1.2548, + "step": 8185 + }, + { + "epoch": 0.1909232173104339, + "grad_norm": 1.9303414821624756, + "learning_rate": 1.994070454931158e-05, + "loss": 1.4516, + "step": 8186 + }, + { + "epoch": 0.19094654044961182, + "grad_norm": 1.9113258123397827, + "learning_rate": 1.994067716609004e-05, + "loss": 1.495, + "step": 8187 + }, + { + "epoch": 0.19096986358878973, + "grad_norm": 2.4014432430267334, + "learning_rate": 1.9940649776565845e-05, + "loss": 1.397, + "step": 8188 + }, + { + "epoch": 0.19099318672796764, + "grad_norm": 1.7973248958587646, + "learning_rate": 1.994062238073902e-05, + "loss": 1.4857, + "step": 8189 + }, + { + "epoch": 0.19101650986714558, + "grad_norm": 2.1522111892700195, + "learning_rate": 1.994059497860958e-05, + "loss": 1.7626, + "step": 8190 + }, + { + "epoch": 0.1910398330063235, + "grad_norm": 1.9723261594772339, + "learning_rate": 1.994056757017754e-05, + "loss": 1.5435, + "step": 8191 + }, + { + "epoch": 0.1910631561455014, + "grad_norm": 2.4121131896972656, + "learning_rate": 1.994054015544292e-05, + "loss": 1.3905, + "step": 8192 + }, + { + "epoch": 0.1910864792846793, + "grad_norm": 1.960960030555725, + "learning_rate": 1.9940512734405734e-05, + "loss": 1.5881, + "step": 8193 + }, + { + "epoch": 0.19110980242385725, + "grad_norm": 3.1330838203430176, + "learning_rate": 1.9940485307066003e-05, + "loss": 1.711, + "step": 8194 + }, + { + "epoch": 0.19113312556303516, + "grad_norm": 1.6552695035934448, + "learning_rate": 1.9940457873423745e-05, + "loss": 1.4651, + "step": 8195 + }, + { + "epoch": 0.19115644870221307, + "grad_norm": 1.606136441230774, + "learning_rate": 1.9940430433478975e-05, + "loss": 1.2181, + "step": 8196 + }, + { + "epoch": 0.19117977184139098, + "grad_norm": 1.7391279935836792, + "learning_rate": 1.994040298723171e-05, + "loss": 1.4736, + "step": 8197 + }, + { + "epoch": 0.19120309498056892, + "grad_norm": 1.5154005289077759, + "learning_rate": 1.9940375534681973e-05, + "loss": 0.9942, + "step": 8198 + }, + { + "epoch": 0.19122641811974683, + "grad_norm": 1.5764151811599731, + "learning_rate": 1.9940348075829775e-05, + "loss": 1.2371, + "step": 8199 + }, + { + "epoch": 0.19124974125892474, + "grad_norm": 2.1152052879333496, + "learning_rate": 1.9940320610675133e-05, + "loss": 1.3679, + "step": 8200 + }, + { + "epoch": 0.19127306439810265, + "grad_norm": 2.0302867889404297, + "learning_rate": 1.9940293139218068e-05, + "loss": 1.3589, + "step": 8201 + }, + { + "epoch": 0.1912963875372806, + "grad_norm": 1.7224639654159546, + "learning_rate": 1.9940265661458597e-05, + "loss": 1.2444, + "step": 8202 + }, + { + "epoch": 0.1913197106764585, + "grad_norm": 1.805887222290039, + "learning_rate": 1.9940238177396736e-05, + "loss": 1.1172, + "step": 8203 + }, + { + "epoch": 0.19134303381563642, + "grad_norm": 1.9463870525360107, + "learning_rate": 1.9940210687032503e-05, + "loss": 1.3261, + "step": 8204 + }, + { + "epoch": 0.19136635695481433, + "grad_norm": 2.283754348754883, + "learning_rate": 1.9940183190365914e-05, + "loss": 1.6568, + "step": 8205 + }, + { + "epoch": 0.19138968009399224, + "grad_norm": 1.6482785940170288, + "learning_rate": 1.994015568739699e-05, + "loss": 1.2817, + "step": 8206 + }, + { + "epoch": 0.19141300323317018, + "grad_norm": 1.8738189935684204, + "learning_rate": 1.9940128178125742e-05, + "loss": 1.3948, + "step": 8207 + }, + { + "epoch": 0.1914363263723481, + "grad_norm": 1.9089139699935913, + "learning_rate": 1.99401006625522e-05, + "loss": 1.5015, + "step": 8208 + }, + { + "epoch": 0.191459649511526, + "grad_norm": 2.5860321521759033, + "learning_rate": 1.994007314067637e-05, + "loss": 1.3626, + "step": 8209 + }, + { + "epoch": 0.1914829726507039, + "grad_norm": 1.9446245431900024, + "learning_rate": 1.9940045612498268e-05, + "loss": 1.3897, + "step": 8210 + }, + { + "epoch": 0.19150629578988185, + "grad_norm": 1.932563304901123, + "learning_rate": 1.994001807801792e-05, + "loss": 1.6446, + "step": 8211 + }, + { + "epoch": 0.19152961892905976, + "grad_norm": 1.754152774810791, + "learning_rate": 1.9939990537235337e-05, + "loss": 1.5565, + "step": 8212 + }, + { + "epoch": 0.19155294206823767, + "grad_norm": 1.8276276588439941, + "learning_rate": 1.9939962990150543e-05, + "loss": 1.8008, + "step": 8213 + }, + { + "epoch": 0.19157626520741558, + "grad_norm": 2.1867787837982178, + "learning_rate": 1.9939935436763548e-05, + "loss": 1.6536, + "step": 8214 + }, + { + "epoch": 0.19159958834659352, + "grad_norm": 1.856912612915039, + "learning_rate": 1.9939907877074375e-05, + "loss": 1.4362, + "step": 8215 + }, + { + "epoch": 0.19162291148577143, + "grad_norm": 1.7540032863616943, + "learning_rate": 1.993988031108304e-05, + "loss": 1.4562, + "step": 8216 + }, + { + "epoch": 0.19164623462494934, + "grad_norm": 1.659964680671692, + "learning_rate": 1.9939852738789557e-05, + "loss": 1.5884, + "step": 8217 + }, + { + "epoch": 0.19166955776412725, + "grad_norm": 1.6010884046554565, + "learning_rate": 1.9939825160193948e-05, + "loss": 1.2433, + "step": 8218 + }, + { + "epoch": 0.1916928809033052, + "grad_norm": 2.1334609985351562, + "learning_rate": 1.9939797575296226e-05, + "loss": 1.2439, + "step": 8219 + }, + { + "epoch": 0.1917162040424831, + "grad_norm": 1.5269479751586914, + "learning_rate": 1.9939769984096417e-05, + "loss": 1.28, + "step": 8220 + }, + { + "epoch": 0.191739527181661, + "grad_norm": 2.059091806411743, + "learning_rate": 1.993974238659453e-05, + "loss": 1.5162, + "step": 8221 + }, + { + "epoch": 0.19176285032083892, + "grad_norm": 1.6710606813430786, + "learning_rate": 1.9939714782790584e-05, + "loss": 1.4241, + "step": 8222 + }, + { + "epoch": 0.19178617346001686, + "grad_norm": 1.7723292112350464, + "learning_rate": 1.99396871726846e-05, + "loss": 1.6087, + "step": 8223 + }, + { + "epoch": 0.19180949659919477, + "grad_norm": 1.5672788619995117, + "learning_rate": 1.9939659556276595e-05, + "loss": 0.986, + "step": 8224 + }, + { + "epoch": 0.19183281973837268, + "grad_norm": 1.6078954935073853, + "learning_rate": 1.993963193356658e-05, + "loss": 1.1278, + "step": 8225 + }, + { + "epoch": 0.1918561428775506, + "grad_norm": 1.7335697412490845, + "learning_rate": 1.993960430455458e-05, + "loss": 1.3327, + "step": 8226 + }, + { + "epoch": 0.19187946601672853, + "grad_norm": 2.223494529724121, + "learning_rate": 1.993957666924061e-05, + "loss": 1.6315, + "step": 8227 + }, + { + "epoch": 0.19190278915590644, + "grad_norm": 1.922817349433899, + "learning_rate": 1.993954902762469e-05, + "loss": 1.2411, + "step": 8228 + }, + { + "epoch": 0.19192611229508436, + "grad_norm": 2.0232694149017334, + "learning_rate": 1.9939521379706834e-05, + "loss": 1.5902, + "step": 8229 + }, + { + "epoch": 0.19194943543426227, + "grad_norm": 1.9064017534255981, + "learning_rate": 1.9939493725487055e-05, + "loss": 1.4238, + "step": 8230 + }, + { + "epoch": 0.1919727585734402, + "grad_norm": 2.1726772785186768, + "learning_rate": 1.9939466064965382e-05, + "loss": 1.4383, + "step": 8231 + }, + { + "epoch": 0.19199608171261812, + "grad_norm": 1.883219599723816, + "learning_rate": 1.9939438398141824e-05, + "loss": 1.5913, + "step": 8232 + }, + { + "epoch": 0.19201940485179603, + "grad_norm": 1.8388091325759888, + "learning_rate": 1.9939410725016403e-05, + "loss": 1.427, + "step": 8233 + }, + { + "epoch": 0.19204272799097394, + "grad_norm": 1.9217509031295776, + "learning_rate": 1.993938304558913e-05, + "loss": 1.5574, + "step": 8234 + }, + { + "epoch": 0.19206605113015185, + "grad_norm": 2.0600483417510986, + "learning_rate": 1.9939355359860034e-05, + "loss": 1.3138, + "step": 8235 + }, + { + "epoch": 0.1920893742693298, + "grad_norm": 1.8839313983917236, + "learning_rate": 1.9939327667829122e-05, + "loss": 1.2559, + "step": 8236 + }, + { + "epoch": 0.1921126974085077, + "grad_norm": 2.019608736038208, + "learning_rate": 1.9939299969496413e-05, + "loss": 1.2768, + "step": 8237 + }, + { + "epoch": 0.1921360205476856, + "grad_norm": 1.7075214385986328, + "learning_rate": 1.9939272264861928e-05, + "loss": 1.078, + "step": 8238 + }, + { + "epoch": 0.19215934368686352, + "grad_norm": 1.9671835899353027, + "learning_rate": 1.9939244553925684e-05, + "loss": 1.2695, + "step": 8239 + }, + { + "epoch": 0.19218266682604146, + "grad_norm": 2.107933282852173, + "learning_rate": 1.99392168366877e-05, + "loss": 1.3553, + "step": 8240 + }, + { + "epoch": 0.19220598996521937, + "grad_norm": 1.9310173988342285, + "learning_rate": 1.9939189113147993e-05, + "loss": 1.5225, + "step": 8241 + }, + { + "epoch": 0.19222931310439728, + "grad_norm": 1.822830080986023, + "learning_rate": 1.9939161383306577e-05, + "loss": 1.6149, + "step": 8242 + }, + { + "epoch": 0.1922526362435752, + "grad_norm": 1.9030147790908813, + "learning_rate": 1.9939133647163472e-05, + "loss": 1.4139, + "step": 8243 + }, + { + "epoch": 0.19227595938275313, + "grad_norm": 2.1023008823394775, + "learning_rate": 1.9939105904718694e-05, + "loss": 1.6931, + "step": 8244 + }, + { + "epoch": 0.19229928252193104, + "grad_norm": 1.9022200107574463, + "learning_rate": 1.993907815597226e-05, + "loss": 1.5072, + "step": 8245 + }, + { + "epoch": 0.19232260566110895, + "grad_norm": 2.298271417617798, + "learning_rate": 1.993905040092419e-05, + "loss": 1.1554, + "step": 8246 + }, + { + "epoch": 0.19234592880028686, + "grad_norm": 2.1457529067993164, + "learning_rate": 1.9939022639574504e-05, + "loss": 1.3937, + "step": 8247 + }, + { + "epoch": 0.1923692519394648, + "grad_norm": 1.7208447456359863, + "learning_rate": 1.9938994871923217e-05, + "loss": 1.4657, + "step": 8248 + }, + { + "epoch": 0.1923925750786427, + "grad_norm": 1.661069393157959, + "learning_rate": 1.9938967097970344e-05, + "loss": 1.6141, + "step": 8249 + }, + { + "epoch": 0.19241589821782062, + "grad_norm": 1.6546950340270996, + "learning_rate": 1.9938939317715905e-05, + "loss": 1.3783, + "step": 8250 + }, + { + "epoch": 0.19243922135699854, + "grad_norm": 1.9210864305496216, + "learning_rate": 1.993891153115992e-05, + "loss": 1.3776, + "step": 8251 + }, + { + "epoch": 0.19246254449617647, + "grad_norm": 2.0697920322418213, + "learning_rate": 1.9938883738302404e-05, + "loss": 1.3164, + "step": 8252 + }, + { + "epoch": 0.19248586763535439, + "grad_norm": 1.8057719469070435, + "learning_rate": 1.9938855939143372e-05, + "loss": 1.8588, + "step": 8253 + }, + { + "epoch": 0.1925091907745323, + "grad_norm": 2.334623098373413, + "learning_rate": 1.993882813368285e-05, + "loss": 1.1505, + "step": 8254 + }, + { + "epoch": 0.1925325139137102, + "grad_norm": 1.8724678754806519, + "learning_rate": 1.9938800321920844e-05, + "loss": 1.2595, + "step": 8255 + }, + { + "epoch": 0.19255583705288815, + "grad_norm": 2.0478932857513428, + "learning_rate": 1.993877250385738e-05, + "loss": 1.4533, + "step": 8256 + }, + { + "epoch": 0.19257916019206606, + "grad_norm": 1.7206997871398926, + "learning_rate": 1.9938744679492473e-05, + "loss": 1.3614, + "step": 8257 + }, + { + "epoch": 0.19260248333124397, + "grad_norm": 1.703972339630127, + "learning_rate": 1.993871684882614e-05, + "loss": 1.2832, + "step": 8258 + }, + { + "epoch": 0.19262580647042188, + "grad_norm": 2.436664581298828, + "learning_rate": 1.99386890118584e-05, + "loss": 1.2964, + "step": 8259 + }, + { + "epoch": 0.19264912960959982, + "grad_norm": 2.16241717338562, + "learning_rate": 1.9938661168589273e-05, + "loss": 1.4524, + "step": 8260 + }, + { + "epoch": 0.19267245274877773, + "grad_norm": 1.5384286642074585, + "learning_rate": 1.993863331901877e-05, + "loss": 1.3384, + "step": 8261 + }, + { + "epoch": 0.19269577588795564, + "grad_norm": 1.8156768083572388, + "learning_rate": 1.9938605463146915e-05, + "loss": 1.2819, + "step": 8262 + }, + { + "epoch": 0.19271909902713355, + "grad_norm": 1.953841209411621, + "learning_rate": 1.993857760097372e-05, + "loss": 1.4241, + "step": 8263 + }, + { + "epoch": 0.19274242216631146, + "grad_norm": 1.863234281539917, + "learning_rate": 1.9938549732499212e-05, + "loss": 1.5635, + "step": 8264 + }, + { + "epoch": 0.1927657453054894, + "grad_norm": 1.9012207984924316, + "learning_rate": 1.9938521857723397e-05, + "loss": 1.408, + "step": 8265 + }, + { + "epoch": 0.1927890684446673, + "grad_norm": 2.1410226821899414, + "learning_rate": 1.9938493976646298e-05, + "loss": 1.5889, + "step": 8266 + }, + { + "epoch": 0.19281239158384522, + "grad_norm": 1.8135639429092407, + "learning_rate": 1.9938466089267938e-05, + "loss": 1.3625, + "step": 8267 + }, + { + "epoch": 0.19283571472302313, + "grad_norm": 1.847774863243103, + "learning_rate": 1.9938438195588326e-05, + "loss": 1.3395, + "step": 8268 + }, + { + "epoch": 0.19285903786220107, + "grad_norm": 1.7511404752731323, + "learning_rate": 1.993841029560748e-05, + "loss": 1.0757, + "step": 8269 + }, + { + "epoch": 0.19288236100137898, + "grad_norm": 1.901037335395813, + "learning_rate": 1.9938382389325426e-05, + "loss": 1.6033, + "step": 8270 + }, + { + "epoch": 0.1929056841405569, + "grad_norm": 1.7303388118743896, + "learning_rate": 1.9938354476742177e-05, + "loss": 1.0348, + "step": 8271 + }, + { + "epoch": 0.1929290072797348, + "grad_norm": 1.800917387008667, + "learning_rate": 1.993832655785775e-05, + "loss": 1.0826, + "step": 8272 + }, + { + "epoch": 0.19295233041891274, + "grad_norm": 1.8702244758605957, + "learning_rate": 1.993829863267216e-05, + "loss": 1.3946, + "step": 8273 + }, + { + "epoch": 0.19297565355809065, + "grad_norm": 2.0794334411621094, + "learning_rate": 1.993827070118543e-05, + "loss": 1.6631, + "step": 8274 + }, + { + "epoch": 0.19299897669726856, + "grad_norm": 1.828033685684204, + "learning_rate": 1.9938242763397575e-05, + "loss": 1.1395, + "step": 8275 + }, + { + "epoch": 0.19302229983644648, + "grad_norm": 1.4373477697372437, + "learning_rate": 1.9938214819308614e-05, + "loss": 1.3966, + "step": 8276 + }, + { + "epoch": 0.19304562297562441, + "grad_norm": 2.033454418182373, + "learning_rate": 1.993818686891856e-05, + "loss": 1.4578, + "step": 8277 + }, + { + "epoch": 0.19306894611480233, + "grad_norm": 1.8383326530456543, + "learning_rate": 1.993815891222744e-05, + "loss": 1.4925, + "step": 8278 + }, + { + "epoch": 0.19309226925398024, + "grad_norm": 1.8341922760009766, + "learning_rate": 1.9938130949235265e-05, + "loss": 1.2592, + "step": 8279 + }, + { + "epoch": 0.19311559239315815, + "grad_norm": 1.7840993404388428, + "learning_rate": 1.993810297994205e-05, + "loss": 1.4799, + "step": 8280 + }, + { + "epoch": 0.1931389155323361, + "grad_norm": 2.1023433208465576, + "learning_rate": 1.993807500434782e-05, + "loss": 1.3603, + "step": 8281 + }, + { + "epoch": 0.193162238671514, + "grad_norm": 1.6232949495315552, + "learning_rate": 1.993804702245259e-05, + "loss": 1.2889, + "step": 8282 + }, + { + "epoch": 0.1931855618106919, + "grad_norm": 1.9029287099838257, + "learning_rate": 1.9938019034256374e-05, + "loss": 1.2462, + "step": 8283 + }, + { + "epoch": 0.19320888494986982, + "grad_norm": 1.722389817237854, + "learning_rate": 1.9937991039759198e-05, + "loss": 1.3609, + "step": 8284 + }, + { + "epoch": 0.19323220808904776, + "grad_norm": 1.72370183467865, + "learning_rate": 1.9937963038961073e-05, + "loss": 1.338, + "step": 8285 + }, + { + "epoch": 0.19325553122822567, + "grad_norm": 1.5311522483825684, + "learning_rate": 1.993793503186202e-05, + "loss": 1.2694, + "step": 8286 + }, + { + "epoch": 0.19327885436740358, + "grad_norm": 1.5350550413131714, + "learning_rate": 1.993790701846205e-05, + "loss": 1.2806, + "step": 8287 + }, + { + "epoch": 0.1933021775065815, + "grad_norm": 1.815001368522644, + "learning_rate": 1.9937878998761187e-05, + "loss": 1.4768, + "step": 8288 + }, + { + "epoch": 0.19332550064575943, + "grad_norm": 2.1854968070983887, + "learning_rate": 1.993785097275945e-05, + "loss": 1.3703, + "step": 8289 + }, + { + "epoch": 0.19334882378493734, + "grad_norm": 1.7093299627304077, + "learning_rate": 1.9937822940456857e-05, + "loss": 0.984, + "step": 8290 + }, + { + "epoch": 0.19337214692411525, + "grad_norm": 2.3020713329315186, + "learning_rate": 1.993779490185342e-05, + "loss": 1.2495, + "step": 8291 + }, + { + "epoch": 0.19339547006329316, + "grad_norm": 2.0324885845184326, + "learning_rate": 1.993776685694916e-05, + "loss": 1.7025, + "step": 8292 + }, + { + "epoch": 0.19341879320247107, + "grad_norm": 2.1256227493286133, + "learning_rate": 1.9937738805744097e-05, + "loss": 1.5928, + "step": 8293 + }, + { + "epoch": 0.193442116341649, + "grad_norm": 1.8027822971343994, + "learning_rate": 1.9937710748238244e-05, + "loss": 1.1877, + "step": 8294 + }, + { + "epoch": 0.19346543948082692, + "grad_norm": 1.8855466842651367, + "learning_rate": 1.9937682684431622e-05, + "loss": 1.6135, + "step": 8295 + }, + { + "epoch": 0.19348876262000483, + "grad_norm": 1.685184121131897, + "learning_rate": 1.993765461432425e-05, + "loss": 1.5247, + "step": 8296 + }, + { + "epoch": 0.19351208575918274, + "grad_norm": 1.8112637996673584, + "learning_rate": 1.9937626537916145e-05, + "loss": 1.6907, + "step": 8297 + }, + { + "epoch": 0.19353540889836068, + "grad_norm": 1.6806775331497192, + "learning_rate": 1.9937598455207322e-05, + "loss": 1.5199, + "step": 8298 + }, + { + "epoch": 0.1935587320375386, + "grad_norm": 1.9604697227478027, + "learning_rate": 1.9937570366197803e-05, + "loss": 1.7577, + "step": 8299 + }, + { + "epoch": 0.1935820551767165, + "grad_norm": 1.73598051071167, + "learning_rate": 1.99375422708876e-05, + "loss": 1.5872, + "step": 8300 + }, + { + "epoch": 0.19360537831589442, + "grad_norm": 1.7190916538238525, + "learning_rate": 1.9937514169276734e-05, + "loss": 1.3338, + "step": 8301 + }, + { + "epoch": 0.19362870145507236, + "grad_norm": 1.8900700807571411, + "learning_rate": 1.9937486061365225e-05, + "loss": 1.4146, + "step": 8302 + }, + { + "epoch": 0.19365202459425027, + "grad_norm": 2.3338429927825928, + "learning_rate": 1.9937457947153088e-05, + "loss": 1.6415, + "step": 8303 + }, + { + "epoch": 0.19367534773342818, + "grad_norm": 1.632159948348999, + "learning_rate": 1.9937429826640344e-05, + "loss": 1.3692, + "step": 8304 + }, + { + "epoch": 0.1936986708726061, + "grad_norm": 1.8497684001922607, + "learning_rate": 1.993740169982701e-05, + "loss": 1.7593, + "step": 8305 + }, + { + "epoch": 0.19372199401178403, + "grad_norm": 1.8938544988632202, + "learning_rate": 1.99373735667131e-05, + "loss": 1.4039, + "step": 8306 + }, + { + "epoch": 0.19374531715096194, + "grad_norm": 1.692214846611023, + "learning_rate": 1.9937345427298634e-05, + "loss": 1.5541, + "step": 8307 + }, + { + "epoch": 0.19376864029013985, + "grad_norm": 1.6402863264083862, + "learning_rate": 1.9937317281583628e-05, + "loss": 1.4334, + "step": 8308 + }, + { + "epoch": 0.19379196342931776, + "grad_norm": 1.7711254358291626, + "learning_rate": 1.9937289129568102e-05, + "loss": 1.5076, + "step": 8309 + }, + { + "epoch": 0.1938152865684957, + "grad_norm": 1.532541036605835, + "learning_rate": 1.9937260971252078e-05, + "loss": 0.9471, + "step": 8310 + }, + { + "epoch": 0.1938386097076736, + "grad_norm": 1.8225079774856567, + "learning_rate": 1.9937232806635568e-05, + "loss": 1.3895, + "step": 8311 + }, + { + "epoch": 0.19386193284685152, + "grad_norm": 2.6701905727386475, + "learning_rate": 1.993720463571859e-05, + "loss": 1.8402, + "step": 8312 + }, + { + "epoch": 0.19388525598602943, + "grad_norm": 1.7007910013198853, + "learning_rate": 1.9937176458501164e-05, + "loss": 1.0612, + "step": 8313 + }, + { + "epoch": 0.19390857912520737, + "grad_norm": 1.7752270698547363, + "learning_rate": 1.9937148274983307e-05, + "loss": 1.5183, + "step": 8314 + }, + { + "epoch": 0.19393190226438528, + "grad_norm": 1.84112548828125, + "learning_rate": 1.9937120085165037e-05, + "loss": 1.5197, + "step": 8315 + }, + { + "epoch": 0.1939552254035632, + "grad_norm": 1.7148497104644775, + "learning_rate": 1.9937091889046372e-05, + "loss": 1.178, + "step": 8316 + }, + { + "epoch": 0.1939785485427411, + "grad_norm": 1.8114657402038574, + "learning_rate": 1.993706368662733e-05, + "loss": 1.053, + "step": 8317 + }, + { + "epoch": 0.19400187168191904, + "grad_norm": 1.714318871498108, + "learning_rate": 1.993703547790793e-05, + "loss": 1.2557, + "step": 8318 + }, + { + "epoch": 0.19402519482109695, + "grad_norm": 1.7611223459243774, + "learning_rate": 1.993700726288819e-05, + "loss": 1.1467, + "step": 8319 + }, + { + "epoch": 0.19404851796027486, + "grad_norm": 1.7090764045715332, + "learning_rate": 1.9936979041568123e-05, + "loss": 1.341, + "step": 8320 + }, + { + "epoch": 0.19407184109945277, + "grad_norm": 1.8492157459259033, + "learning_rate": 1.9936950813947755e-05, + "loss": 1.595, + "step": 8321 + }, + { + "epoch": 0.19409516423863069, + "grad_norm": 2.048724889755249, + "learning_rate": 1.9936922580027094e-05, + "loss": 1.4831, + "step": 8322 + }, + { + "epoch": 0.19411848737780862, + "grad_norm": 1.8928842544555664, + "learning_rate": 1.9936894339806165e-05, + "loss": 1.0957, + "step": 8323 + }, + { + "epoch": 0.19414181051698653, + "grad_norm": 2.4616496562957764, + "learning_rate": 1.9936866093284984e-05, + "loss": 1.0411, + "step": 8324 + }, + { + "epoch": 0.19416513365616445, + "grad_norm": 2.000389575958252, + "learning_rate": 1.9936837840463566e-05, + "loss": 1.8461, + "step": 8325 + }, + { + "epoch": 0.19418845679534236, + "grad_norm": 2.0179214477539062, + "learning_rate": 1.9936809581341937e-05, + "loss": 1.6596, + "step": 8326 + }, + { + "epoch": 0.1942117799345203, + "grad_norm": 1.8602657318115234, + "learning_rate": 1.9936781315920108e-05, + "loss": 1.4661, + "step": 8327 + }, + { + "epoch": 0.1942351030736982, + "grad_norm": 1.7421925067901611, + "learning_rate": 1.99367530441981e-05, + "loss": 1.3653, + "step": 8328 + }, + { + "epoch": 0.19425842621287612, + "grad_norm": 1.7605869770050049, + "learning_rate": 1.9936724766175928e-05, + "loss": 1.4307, + "step": 8329 + }, + { + "epoch": 0.19428174935205403, + "grad_norm": 2.1662378311157227, + "learning_rate": 1.9936696481853612e-05, + "loss": 1.4849, + "step": 8330 + }, + { + "epoch": 0.19430507249123197, + "grad_norm": 1.81052827835083, + "learning_rate": 1.993666819123117e-05, + "loss": 1.5526, + "step": 8331 + }, + { + "epoch": 0.19432839563040988, + "grad_norm": 2.163729190826416, + "learning_rate": 1.993663989430862e-05, + "loss": 1.1936, + "step": 8332 + }, + { + "epoch": 0.1943517187695878, + "grad_norm": 2.029008388519287, + "learning_rate": 1.9936611591085976e-05, + "loss": 1.5758, + "step": 8333 + }, + { + "epoch": 0.1943750419087657, + "grad_norm": 2.1959445476531982, + "learning_rate": 1.993658328156326e-05, + "loss": 1.4522, + "step": 8334 + }, + { + "epoch": 0.19439836504794364, + "grad_norm": 1.7279036045074463, + "learning_rate": 1.9936554965740497e-05, + "loss": 1.5621, + "step": 8335 + }, + { + "epoch": 0.19442168818712155, + "grad_norm": 1.5420937538146973, + "learning_rate": 1.993652664361769e-05, + "loss": 1.2472, + "step": 8336 + }, + { + "epoch": 0.19444501132629946, + "grad_norm": 2.157424211502075, + "learning_rate": 1.9936498315194867e-05, + "loss": 1.592, + "step": 8337 + }, + { + "epoch": 0.19446833446547737, + "grad_norm": 1.8372224569320679, + "learning_rate": 1.993646998047204e-05, + "loss": 1.6362, + "step": 8338 + }, + { + "epoch": 0.1944916576046553, + "grad_norm": 1.9427121877670288, + "learning_rate": 1.9936441639449234e-05, + "loss": 1.7334, + "step": 8339 + }, + { + "epoch": 0.19451498074383322, + "grad_norm": 1.8625268936157227, + "learning_rate": 1.993641329212646e-05, + "loss": 1.4163, + "step": 8340 + }, + { + "epoch": 0.19453830388301113, + "grad_norm": 2.0741419792175293, + "learning_rate": 1.9936384938503738e-05, + "loss": 1.5314, + "step": 8341 + }, + { + "epoch": 0.19456162702218904, + "grad_norm": 1.6607245206832886, + "learning_rate": 1.9936356578581092e-05, + "loss": 1.4889, + "step": 8342 + }, + { + "epoch": 0.19458495016136698, + "grad_norm": 1.9155807495117188, + "learning_rate": 1.993632821235853e-05, + "loss": 1.7107, + "step": 8343 + }, + { + "epoch": 0.1946082733005449, + "grad_norm": 1.676759123802185, + "learning_rate": 1.993629983983608e-05, + "loss": 1.4258, + "step": 8344 + }, + { + "epoch": 0.1946315964397228, + "grad_norm": 1.9480493068695068, + "learning_rate": 1.9936271461013753e-05, + "loss": 1.3307, + "step": 8345 + }, + { + "epoch": 0.19465491957890071, + "grad_norm": 1.9305952787399292, + "learning_rate": 1.993624307589157e-05, + "loss": 1.1002, + "step": 8346 + }, + { + "epoch": 0.19467824271807865, + "grad_norm": 1.811282753944397, + "learning_rate": 1.9936214684469543e-05, + "loss": 1.3501, + "step": 8347 + }, + { + "epoch": 0.19470156585725656, + "grad_norm": 1.5984883308410645, + "learning_rate": 1.99361862867477e-05, + "loss": 1.4709, + "step": 8348 + }, + { + "epoch": 0.19472488899643448, + "grad_norm": 3.960463047027588, + "learning_rate": 1.993615788272605e-05, + "loss": 1.6045, + "step": 8349 + }, + { + "epoch": 0.1947482121356124, + "grad_norm": 1.7727078199386597, + "learning_rate": 1.993612947240462e-05, + "loss": 1.271, + "step": 8350 + }, + { + "epoch": 0.1947715352747903, + "grad_norm": 1.784284234046936, + "learning_rate": 1.993610105578342e-05, + "loss": 1.3612, + "step": 8351 + }, + { + "epoch": 0.19479485841396824, + "grad_norm": 1.7001585960388184, + "learning_rate": 1.9936072632862473e-05, + "loss": 1.3249, + "step": 8352 + }, + { + "epoch": 0.19481818155314615, + "grad_norm": 1.700894832611084, + "learning_rate": 1.9936044203641795e-05, + "loss": 1.2342, + "step": 8353 + }, + { + "epoch": 0.19484150469232406, + "grad_norm": 2.0045692920684814, + "learning_rate": 1.99360157681214e-05, + "loss": 1.6919, + "step": 8354 + }, + { + "epoch": 0.19486482783150197, + "grad_norm": 2.304435968399048, + "learning_rate": 1.9935987326301314e-05, + "loss": 1.4972, + "step": 8355 + }, + { + "epoch": 0.1948881509706799, + "grad_norm": 1.9663649797439575, + "learning_rate": 1.9935958878181553e-05, + "loss": 1.351, + "step": 8356 + }, + { + "epoch": 0.19491147410985782, + "grad_norm": 2.026777982711792, + "learning_rate": 1.9935930423762132e-05, + "loss": 1.5526, + "step": 8357 + }, + { + "epoch": 0.19493479724903573, + "grad_norm": 1.7235180139541626, + "learning_rate": 1.9935901963043064e-05, + "loss": 1.3958, + "step": 8358 + }, + { + "epoch": 0.19495812038821364, + "grad_norm": 2.060513496398926, + "learning_rate": 1.993587349602438e-05, + "loss": 1.3645, + "step": 8359 + }, + { + "epoch": 0.19498144352739158, + "grad_norm": 1.6914254426956177, + "learning_rate": 1.993584502270609e-05, + "loss": 1.3862, + "step": 8360 + }, + { + "epoch": 0.1950047666665695, + "grad_norm": 1.839327335357666, + "learning_rate": 1.9935816543088212e-05, + "loss": 1.7362, + "step": 8361 + }, + { + "epoch": 0.1950280898057474, + "grad_norm": 1.9103553295135498, + "learning_rate": 1.9935788057170767e-05, + "loss": 1.5803, + "step": 8362 + }, + { + "epoch": 0.1950514129449253, + "grad_norm": 1.9524987936019897, + "learning_rate": 1.9935759564953772e-05, + "loss": 1.7747, + "step": 8363 + }, + { + "epoch": 0.19507473608410325, + "grad_norm": 2.0070862770080566, + "learning_rate": 1.9935731066437244e-05, + "loss": 1.4792, + "step": 8364 + }, + { + "epoch": 0.19509805922328116, + "grad_norm": 1.81747567653656, + "learning_rate": 1.99357025616212e-05, + "loss": 1.5766, + "step": 8365 + }, + { + "epoch": 0.19512138236245907, + "grad_norm": 1.6705777645111084, + "learning_rate": 1.993567405050566e-05, + "loss": 1.2761, + "step": 8366 + }, + { + "epoch": 0.19514470550163698, + "grad_norm": 1.6513408422470093, + "learning_rate": 1.993564553309064e-05, + "loss": 1.4495, + "step": 8367 + }, + { + "epoch": 0.19516802864081492, + "grad_norm": 1.986993432044983, + "learning_rate": 1.9935617009376166e-05, + "loss": 1.4228, + "step": 8368 + }, + { + "epoch": 0.19519135177999283, + "grad_norm": 1.7411285638809204, + "learning_rate": 1.9935588479362247e-05, + "loss": 1.2399, + "step": 8369 + }, + { + "epoch": 0.19521467491917074, + "grad_norm": 1.782590389251709, + "learning_rate": 1.9935559943048903e-05, + "loss": 1.3768, + "step": 8370 + }, + { + "epoch": 0.19523799805834866, + "grad_norm": 2.074950933456421, + "learning_rate": 1.9935531400436155e-05, + "loss": 1.4575, + "step": 8371 + }, + { + "epoch": 0.1952613211975266, + "grad_norm": 1.9944992065429688, + "learning_rate": 1.9935502851524015e-05, + "loss": 1.4174, + "step": 8372 + }, + { + "epoch": 0.1952846443367045, + "grad_norm": 1.663567066192627, + "learning_rate": 1.993547429631251e-05, + "loss": 1.3109, + "step": 8373 + }, + { + "epoch": 0.19530796747588242, + "grad_norm": 2.102902889251709, + "learning_rate": 1.993544573480165e-05, + "loss": 1.4009, + "step": 8374 + }, + { + "epoch": 0.19533129061506033, + "grad_norm": 2.3223838806152344, + "learning_rate": 1.993541716699146e-05, + "loss": 1.4741, + "step": 8375 + }, + { + "epoch": 0.19535461375423827, + "grad_norm": 1.7912638187408447, + "learning_rate": 1.993538859288195e-05, + "loss": 1.015, + "step": 8376 + }, + { + "epoch": 0.19537793689341618, + "grad_norm": 2.424626111984253, + "learning_rate": 1.993536001247315e-05, + "loss": 1.442, + "step": 8377 + }, + { + "epoch": 0.1954012600325941, + "grad_norm": 1.7092453241348267, + "learning_rate": 1.9935331425765065e-05, + "loss": 1.433, + "step": 8378 + }, + { + "epoch": 0.195424583171772, + "grad_norm": 1.706035852432251, + "learning_rate": 1.9935302832757722e-05, + "loss": 1.4083, + "step": 8379 + }, + { + "epoch": 0.1954479063109499, + "grad_norm": 1.8730711936950684, + "learning_rate": 1.993527423345113e-05, + "loss": 1.4119, + "step": 8380 + }, + { + "epoch": 0.19547122945012785, + "grad_norm": 2.1011226177215576, + "learning_rate": 1.9935245627845322e-05, + "loss": 1.3162, + "step": 8381 + }, + { + "epoch": 0.19549455258930576, + "grad_norm": 1.7778310775756836, + "learning_rate": 1.99352170159403e-05, + "loss": 1.2743, + "step": 8382 + }, + { + "epoch": 0.19551787572848367, + "grad_norm": 1.8772116899490356, + "learning_rate": 1.9935188397736097e-05, + "loss": 1.4373, + "step": 8383 + }, + { + "epoch": 0.19554119886766158, + "grad_norm": 2.2312886714935303, + "learning_rate": 1.9935159773232718e-05, + "loss": 1.3826, + "step": 8384 + }, + { + "epoch": 0.19556452200683952, + "grad_norm": 1.867264747619629, + "learning_rate": 1.993513114243019e-05, + "loss": 1.4405, + "step": 8385 + }, + { + "epoch": 0.19558784514601743, + "grad_norm": 1.6862198114395142, + "learning_rate": 1.9935102505328527e-05, + "loss": 1.3709, + "step": 8386 + }, + { + "epoch": 0.19561116828519534, + "grad_norm": 1.5756068229675293, + "learning_rate": 1.9935073861927748e-05, + "loss": 1.6684, + "step": 8387 + }, + { + "epoch": 0.19563449142437325, + "grad_norm": 1.5222783088684082, + "learning_rate": 1.993504521222787e-05, + "loss": 1.293, + "step": 8388 + }, + { + "epoch": 0.1956578145635512, + "grad_norm": 1.8556909561157227, + "learning_rate": 1.9935016556228913e-05, + "loss": 1.5397, + "step": 8389 + }, + { + "epoch": 0.1956811377027291, + "grad_norm": 1.6563314199447632, + "learning_rate": 1.9934987893930894e-05, + "loss": 1.4494, + "step": 8390 + }, + { + "epoch": 0.195704460841907, + "grad_norm": 1.921899437904358, + "learning_rate": 1.9934959225333833e-05, + "loss": 1.6434, + "step": 8391 + }, + { + "epoch": 0.19572778398108492, + "grad_norm": 2.3415145874023438, + "learning_rate": 1.993493055043775e-05, + "loss": 1.3609, + "step": 8392 + }, + { + "epoch": 0.19575110712026286, + "grad_norm": 3.0161237716674805, + "learning_rate": 1.9934901869242657e-05, + "loss": 1.4848, + "step": 8393 + }, + { + "epoch": 0.19577443025944077, + "grad_norm": 2.0257351398468018, + "learning_rate": 1.993487318174858e-05, + "loss": 1.7626, + "step": 8394 + }, + { + "epoch": 0.19579775339861868, + "grad_norm": 2.156226396560669, + "learning_rate": 1.9934844487955526e-05, + "loss": 1.5458, + "step": 8395 + }, + { + "epoch": 0.1958210765377966, + "grad_norm": 2.186569929122925, + "learning_rate": 1.993481578786352e-05, + "loss": 1.1547, + "step": 8396 + }, + { + "epoch": 0.19584439967697453, + "grad_norm": 1.679700493812561, + "learning_rate": 1.9934787081472584e-05, + "loss": 1.1922, + "step": 8397 + }, + { + "epoch": 0.19586772281615245, + "grad_norm": 1.9217767715454102, + "learning_rate": 1.993475836878273e-05, + "loss": 1.3956, + "step": 8398 + }, + { + "epoch": 0.19589104595533036, + "grad_norm": 1.8655316829681396, + "learning_rate": 1.9934729649793977e-05, + "loss": 1.4308, + "step": 8399 + }, + { + "epoch": 0.19591436909450827, + "grad_norm": 1.8242164850234985, + "learning_rate": 1.9934700924506346e-05, + "loss": 1.3624, + "step": 8400 + }, + { + "epoch": 0.1959376922336862, + "grad_norm": 1.5966317653656006, + "learning_rate": 1.9934672192919856e-05, + "loss": 1.6537, + "step": 8401 + }, + { + "epoch": 0.19596101537286412, + "grad_norm": 1.7605122327804565, + "learning_rate": 1.993464345503452e-05, + "loss": 1.198, + "step": 8402 + }, + { + "epoch": 0.19598433851204203, + "grad_norm": 1.7923510074615479, + "learning_rate": 1.993461471085036e-05, + "loss": 1.3318, + "step": 8403 + }, + { + "epoch": 0.19600766165121994, + "grad_norm": 1.7124557495117188, + "learning_rate": 1.9934585960367395e-05, + "loss": 1.6144, + "step": 8404 + }, + { + "epoch": 0.19603098479039785, + "grad_norm": 2.038177251815796, + "learning_rate": 1.993455720358564e-05, + "loss": 1.4963, + "step": 8405 + }, + { + "epoch": 0.1960543079295758, + "grad_norm": 1.973798394203186, + "learning_rate": 1.9934528440505116e-05, + "loss": 1.8071, + "step": 8406 + }, + { + "epoch": 0.1960776310687537, + "grad_norm": 1.618823766708374, + "learning_rate": 1.9934499671125838e-05, + "loss": 1.5493, + "step": 8407 + }, + { + "epoch": 0.1961009542079316, + "grad_norm": 1.5235133171081543, + "learning_rate": 1.9934470895447827e-05, + "loss": 1.1931, + "step": 8408 + }, + { + "epoch": 0.19612427734710952, + "grad_norm": 1.6298905611038208, + "learning_rate": 1.9934442113471103e-05, + "loss": 1.3901, + "step": 8409 + }, + { + "epoch": 0.19614760048628746, + "grad_norm": 1.8593742847442627, + "learning_rate": 1.993441332519568e-05, + "loss": 1.5012, + "step": 8410 + }, + { + "epoch": 0.19617092362546537, + "grad_norm": 1.7803932428359985, + "learning_rate": 1.993438453062158e-05, + "loss": 1.4103, + "step": 8411 + }, + { + "epoch": 0.19619424676464328, + "grad_norm": 1.643013834953308, + "learning_rate": 1.993435572974882e-05, + "loss": 1.2547, + "step": 8412 + }, + { + "epoch": 0.1962175699038212, + "grad_norm": 1.9514284133911133, + "learning_rate": 1.993432692257741e-05, + "loss": 1.4125, + "step": 8413 + }, + { + "epoch": 0.19624089304299913, + "grad_norm": 1.6965689659118652, + "learning_rate": 1.9934298109107382e-05, + "loss": 1.522, + "step": 8414 + }, + { + "epoch": 0.19626421618217704, + "grad_norm": 1.6323589086532593, + "learning_rate": 1.993426928933875e-05, + "loss": 1.7213, + "step": 8415 + }, + { + "epoch": 0.19628753932135495, + "grad_norm": 2.008805513381958, + "learning_rate": 1.9934240463271527e-05, + "loss": 1.2416, + "step": 8416 + }, + { + "epoch": 0.19631086246053286, + "grad_norm": 2.1970055103302, + "learning_rate": 1.9934211630905738e-05, + "loss": 1.6097, + "step": 8417 + }, + { + "epoch": 0.1963341855997108, + "grad_norm": 1.7546666860580444, + "learning_rate": 1.9934182792241395e-05, + "loss": 1.3337, + "step": 8418 + }, + { + "epoch": 0.19635750873888871, + "grad_norm": 1.8595305681228638, + "learning_rate": 1.9934153947278522e-05, + "loss": 1.4366, + "step": 8419 + }, + { + "epoch": 0.19638083187806663, + "grad_norm": 2.333857297897339, + "learning_rate": 1.993412509601713e-05, + "loss": 1.2228, + "step": 8420 + }, + { + "epoch": 0.19640415501724454, + "grad_norm": 1.9866163730621338, + "learning_rate": 1.9934096238457242e-05, + "loss": 1.5973, + "step": 8421 + }, + { + "epoch": 0.19642747815642247, + "grad_norm": 2.110611915588379, + "learning_rate": 1.9934067374598884e-05, + "loss": 1.4892, + "step": 8422 + }, + { + "epoch": 0.19645080129560039, + "grad_norm": 1.8320127725601196, + "learning_rate": 1.9934038504442055e-05, + "loss": 1.149, + "step": 8423 + }, + { + "epoch": 0.1964741244347783, + "grad_norm": 1.9715638160705566, + "learning_rate": 1.9934009627986795e-05, + "loss": 1.4006, + "step": 8424 + }, + { + "epoch": 0.1964974475739562, + "grad_norm": 1.8911473751068115, + "learning_rate": 1.9933980745233107e-05, + "loss": 1.4139, + "step": 8425 + }, + { + "epoch": 0.19652077071313415, + "grad_norm": 1.958382248878479, + "learning_rate": 1.9933951856181017e-05, + "loss": 1.68, + "step": 8426 + }, + { + "epoch": 0.19654409385231206, + "grad_norm": 1.945895791053772, + "learning_rate": 1.993392296083054e-05, + "loss": 1.5419, + "step": 8427 + }, + { + "epoch": 0.19656741699148997, + "grad_norm": 1.8021414279937744, + "learning_rate": 1.993389405918169e-05, + "loss": 1.0763, + "step": 8428 + }, + { + "epoch": 0.19659074013066788, + "grad_norm": 2.082706928253174, + "learning_rate": 1.9933865151234497e-05, + "loss": 1.2996, + "step": 8429 + }, + { + "epoch": 0.19661406326984582, + "grad_norm": 1.7989561557769775, + "learning_rate": 1.993383623698897e-05, + "loss": 0.9398, + "step": 8430 + }, + { + "epoch": 0.19663738640902373, + "grad_norm": 2.0141401290893555, + "learning_rate": 1.993380731644513e-05, + "loss": 1.913, + "step": 8431 + }, + { + "epoch": 0.19666070954820164, + "grad_norm": 1.95287024974823, + "learning_rate": 1.9933778389602998e-05, + "loss": 1.4019, + "step": 8432 + }, + { + "epoch": 0.19668403268737955, + "grad_norm": 1.8718055486679077, + "learning_rate": 1.9933749456462585e-05, + "loss": 1.4403, + "step": 8433 + }, + { + "epoch": 0.19670735582655746, + "grad_norm": 1.7382854223251343, + "learning_rate": 1.9933720517023917e-05, + "loss": 1.5327, + "step": 8434 + }, + { + "epoch": 0.1967306789657354, + "grad_norm": 1.8627125024795532, + "learning_rate": 1.993369157128701e-05, + "loss": 1.6061, + "step": 8435 + }, + { + "epoch": 0.1967540021049133, + "grad_norm": 2.3834621906280518, + "learning_rate": 1.9933662619251877e-05, + "loss": 1.3788, + "step": 8436 + }, + { + "epoch": 0.19677732524409122, + "grad_norm": 2.0460476875305176, + "learning_rate": 1.9933633660918547e-05, + "loss": 1.2991, + "step": 8437 + }, + { + "epoch": 0.19680064838326913, + "grad_norm": 1.512341022491455, + "learning_rate": 1.993360469628703e-05, + "loss": 1.2888, + "step": 8438 + }, + { + "epoch": 0.19682397152244707, + "grad_norm": 2.2218823432922363, + "learning_rate": 1.9933575725357345e-05, + "loss": 1.4719, + "step": 8439 + }, + { + "epoch": 0.19684729466162498, + "grad_norm": 2.313004493713379, + "learning_rate": 1.9933546748129515e-05, + "loss": 1.0424, + "step": 8440 + }, + { + "epoch": 0.1968706178008029, + "grad_norm": 1.938903570175171, + "learning_rate": 1.9933517764603554e-05, + "loss": 1.3931, + "step": 8441 + }, + { + "epoch": 0.1968939409399808, + "grad_norm": 1.8541818857192993, + "learning_rate": 1.9933488774779484e-05, + "loss": 1.4483, + "step": 8442 + }, + { + "epoch": 0.19691726407915874, + "grad_norm": 1.9733726978302002, + "learning_rate": 1.993345977865732e-05, + "loss": 1.7058, + "step": 8443 + }, + { + "epoch": 0.19694058721833665, + "grad_norm": 1.6924889087677002, + "learning_rate": 1.9933430776237083e-05, + "loss": 1.3036, + "step": 8444 + }, + { + "epoch": 0.19696391035751457, + "grad_norm": 1.772095799446106, + "learning_rate": 1.993340176751879e-05, + "loss": 1.5046, + "step": 8445 + }, + { + "epoch": 0.19698723349669248, + "grad_norm": 1.9810925722122192, + "learning_rate": 1.993337275250246e-05, + "loss": 1.5741, + "step": 8446 + }, + { + "epoch": 0.19701055663587042, + "grad_norm": 1.8354748487472534, + "learning_rate": 1.9933343731188105e-05, + "loss": 1.1581, + "step": 8447 + }, + { + "epoch": 0.19703387977504833, + "grad_norm": 1.7307049036026, + "learning_rate": 1.9933314703575754e-05, + "loss": 1.5071, + "step": 8448 + }, + { + "epoch": 0.19705720291422624, + "grad_norm": 2.0288522243499756, + "learning_rate": 1.9933285669665417e-05, + "loss": 1.1799, + "step": 8449 + }, + { + "epoch": 0.19708052605340415, + "grad_norm": 1.9966660737991333, + "learning_rate": 1.9933256629457123e-05, + "loss": 1.5485, + "step": 8450 + }, + { + "epoch": 0.1971038491925821, + "grad_norm": 1.7407492399215698, + "learning_rate": 1.9933227582950878e-05, + "loss": 1.2877, + "step": 8451 + }, + { + "epoch": 0.19712717233176, + "grad_norm": 1.6298173666000366, + "learning_rate": 1.993319853014671e-05, + "loss": 1.5518, + "step": 8452 + }, + { + "epoch": 0.1971504954709379, + "grad_norm": 1.5235477685928345, + "learning_rate": 1.993316947104463e-05, + "loss": 1.5917, + "step": 8453 + }, + { + "epoch": 0.19717381861011582, + "grad_norm": 1.6726043224334717, + "learning_rate": 1.993314040564466e-05, + "loss": 1.6742, + "step": 8454 + }, + { + "epoch": 0.19719714174929376, + "grad_norm": 2.669126272201538, + "learning_rate": 1.993311133394682e-05, + "loss": 1.4229, + "step": 8455 + }, + { + "epoch": 0.19722046488847167, + "grad_norm": 1.787628173828125, + "learning_rate": 1.9933082255951125e-05, + "loss": 1.3999, + "step": 8456 + }, + { + "epoch": 0.19724378802764958, + "grad_norm": 2.18803071975708, + "learning_rate": 1.9933053171657595e-05, + "loss": 1.3056, + "step": 8457 + }, + { + "epoch": 0.1972671111668275, + "grad_norm": 1.9719247817993164, + "learning_rate": 1.993302408106625e-05, + "loss": 1.4606, + "step": 8458 + }, + { + "epoch": 0.19729043430600543, + "grad_norm": 1.6693620681762695, + "learning_rate": 1.9932994984177106e-05, + "loss": 1.552, + "step": 8459 + }, + { + "epoch": 0.19731375744518334, + "grad_norm": 2.258840322494507, + "learning_rate": 1.9932965880990184e-05, + "loss": 1.4734, + "step": 8460 + }, + { + "epoch": 0.19733708058436125, + "grad_norm": 1.8174350261688232, + "learning_rate": 1.9932936771505498e-05, + "loss": 1.546, + "step": 8461 + }, + { + "epoch": 0.19736040372353916, + "grad_norm": 2.116029977798462, + "learning_rate": 1.993290765572307e-05, + "loss": 1.5311, + "step": 8462 + }, + { + "epoch": 0.19738372686271707, + "grad_norm": 14.94326400756836, + "learning_rate": 1.993287853364292e-05, + "loss": 1.4776, + "step": 8463 + }, + { + "epoch": 0.197407050001895, + "grad_norm": 1.796077847480774, + "learning_rate": 1.9932849405265062e-05, + "loss": 1.5255, + "step": 8464 + }, + { + "epoch": 0.19743037314107292, + "grad_norm": 1.7261086702346802, + "learning_rate": 1.9932820270589518e-05, + "loss": 1.4443, + "step": 8465 + }, + { + "epoch": 0.19745369628025083, + "grad_norm": 2.0750107765197754, + "learning_rate": 1.9932791129616304e-05, + "loss": 1.2516, + "step": 8466 + }, + { + "epoch": 0.19747701941942875, + "grad_norm": 1.9281182289123535, + "learning_rate": 1.9932761982345437e-05, + "loss": 1.8904, + "step": 8467 + }, + { + "epoch": 0.19750034255860668, + "grad_norm": 1.9967366456985474, + "learning_rate": 1.9932732828776945e-05, + "loss": 1.2234, + "step": 8468 + }, + { + "epoch": 0.1975236656977846, + "grad_norm": 1.7236995697021484, + "learning_rate": 1.9932703668910834e-05, + "loss": 1.5208, + "step": 8469 + }, + { + "epoch": 0.1975469888369625, + "grad_norm": 2.094712972640991, + "learning_rate": 1.993267450274713e-05, + "loss": 1.3343, + "step": 8470 + }, + { + "epoch": 0.19757031197614042, + "grad_norm": 1.75678288936615, + "learning_rate": 1.993264533028585e-05, + "loss": 1.2848, + "step": 8471 + }, + { + "epoch": 0.19759363511531836, + "grad_norm": 2.091444969177246, + "learning_rate": 1.9932616151527015e-05, + "loss": 1.2873, + "step": 8472 + }, + { + "epoch": 0.19761695825449627, + "grad_norm": 1.8612337112426758, + "learning_rate": 1.9932586966470635e-05, + "loss": 1.462, + "step": 8473 + }, + { + "epoch": 0.19764028139367418, + "grad_norm": 1.602911353111267, + "learning_rate": 1.9932557775116735e-05, + "loss": 1.3486, + "step": 8474 + }, + { + "epoch": 0.1976636045328521, + "grad_norm": 1.660016655921936, + "learning_rate": 1.9932528577465334e-05, + "loss": 1.4297, + "step": 8475 + }, + { + "epoch": 0.19768692767203003, + "grad_norm": 1.901232361793518, + "learning_rate": 1.993249937351645e-05, + "loss": 1.5246, + "step": 8476 + }, + { + "epoch": 0.19771025081120794, + "grad_norm": 1.8903714418411255, + "learning_rate": 1.99324701632701e-05, + "loss": 1.2257, + "step": 8477 + }, + { + "epoch": 0.19773357395038585, + "grad_norm": 2.4235196113586426, + "learning_rate": 1.99324409467263e-05, + "loss": 1.4321, + "step": 8478 + }, + { + "epoch": 0.19775689708956376, + "grad_norm": 2.1022965908050537, + "learning_rate": 1.993241172388508e-05, + "loss": 1.4087, + "step": 8479 + }, + { + "epoch": 0.1977802202287417, + "grad_norm": 1.6765600442886353, + "learning_rate": 1.9932382494746442e-05, + "loss": 1.1597, + "step": 8480 + }, + { + "epoch": 0.1978035433679196, + "grad_norm": 1.620179295539856, + "learning_rate": 1.9932353259310416e-05, + "loss": 1.437, + "step": 8481 + }, + { + "epoch": 0.19782686650709752, + "grad_norm": 2.2046236991882324, + "learning_rate": 1.9932324017577016e-05, + "loss": 1.5628, + "step": 8482 + }, + { + "epoch": 0.19785018964627543, + "grad_norm": 1.8506474494934082, + "learning_rate": 1.9932294769546264e-05, + "loss": 1.3129, + "step": 8483 + }, + { + "epoch": 0.19787351278545337, + "grad_norm": 1.9097737073898315, + "learning_rate": 1.9932265515218175e-05, + "loss": 1.3553, + "step": 8484 + }, + { + "epoch": 0.19789683592463128, + "grad_norm": 2.0041489601135254, + "learning_rate": 1.9932236254592768e-05, + "loss": 1.4395, + "step": 8485 + }, + { + "epoch": 0.1979201590638092, + "grad_norm": 1.943987250328064, + "learning_rate": 1.993220698767006e-05, + "loss": 1.1899, + "step": 8486 + }, + { + "epoch": 0.1979434822029871, + "grad_norm": 1.8285917043685913, + "learning_rate": 1.9932177714450078e-05, + "loss": 1.3239, + "step": 8487 + }, + { + "epoch": 0.19796680534216504, + "grad_norm": 1.5781675577163696, + "learning_rate": 1.993214843493283e-05, + "loss": 1.3484, + "step": 8488 + }, + { + "epoch": 0.19799012848134295, + "grad_norm": 2.2479825019836426, + "learning_rate": 1.993211914911834e-05, + "loss": 1.4274, + "step": 8489 + }, + { + "epoch": 0.19801345162052086, + "grad_norm": 2.09220290184021, + "learning_rate": 1.993208985700663e-05, + "loss": 1.752, + "step": 8490 + }, + { + "epoch": 0.19803677475969877, + "grad_norm": 2.9764270782470703, + "learning_rate": 1.993206055859771e-05, + "loss": 1.6049, + "step": 8491 + }, + { + "epoch": 0.19806009789887669, + "grad_norm": 1.8535959720611572, + "learning_rate": 1.9932031253891603e-05, + "loss": 1.4721, + "step": 8492 + }, + { + "epoch": 0.19808342103805462, + "grad_norm": 1.937528371810913, + "learning_rate": 1.993200194288833e-05, + "loss": 1.7421, + "step": 8493 + }, + { + "epoch": 0.19810674417723254, + "grad_norm": 1.5672012567520142, + "learning_rate": 1.9931972625587904e-05, + "loss": 1.3571, + "step": 8494 + }, + { + "epoch": 0.19813006731641045, + "grad_norm": 1.7897682189941406, + "learning_rate": 1.993194330199035e-05, + "loss": 1.1632, + "step": 8495 + }, + { + "epoch": 0.19815339045558836, + "grad_norm": 1.8621209859848022, + "learning_rate": 1.993191397209568e-05, + "loss": 1.4293, + "step": 8496 + }, + { + "epoch": 0.1981767135947663, + "grad_norm": 2.0851376056671143, + "learning_rate": 1.9931884635903915e-05, + "loss": 1.1419, + "step": 8497 + }, + { + "epoch": 0.1982000367339442, + "grad_norm": 1.7102707624435425, + "learning_rate": 1.9931855293415078e-05, + "loss": 1.4313, + "step": 8498 + }, + { + "epoch": 0.19822335987312212, + "grad_norm": 1.7569828033447266, + "learning_rate": 1.993182594462918e-05, + "loss": 1.5157, + "step": 8499 + }, + { + "epoch": 0.19824668301230003, + "grad_norm": 1.9593416452407837, + "learning_rate": 1.9931796589546248e-05, + "loss": 1.3844, + "step": 8500 + }, + { + "epoch": 0.19827000615147797, + "grad_norm": 1.7128303050994873, + "learning_rate": 1.9931767228166296e-05, + "loss": 1.4206, + "step": 8501 + }, + { + "epoch": 0.19829332929065588, + "grad_norm": 1.9299733638763428, + "learning_rate": 1.993173786048934e-05, + "loss": 1.6216, + "step": 8502 + }, + { + "epoch": 0.1983166524298338, + "grad_norm": 1.583783745765686, + "learning_rate": 1.99317084865154e-05, + "loss": 1.4987, + "step": 8503 + }, + { + "epoch": 0.1983399755690117, + "grad_norm": 1.7246174812316895, + "learning_rate": 1.9931679106244498e-05, + "loss": 1.5168, + "step": 8504 + }, + { + "epoch": 0.19836329870818964, + "grad_norm": 1.6178362369537354, + "learning_rate": 1.9931649719676653e-05, + "loss": 1.4585, + "step": 8505 + }, + { + "epoch": 0.19838662184736755, + "grad_norm": 2.0836119651794434, + "learning_rate": 1.993162032681188e-05, + "loss": 1.373, + "step": 8506 + }, + { + "epoch": 0.19840994498654546, + "grad_norm": 1.7251373529434204, + "learning_rate": 1.9931590927650196e-05, + "loss": 1.4048, + "step": 8507 + }, + { + "epoch": 0.19843326812572337, + "grad_norm": 2.1542091369628906, + "learning_rate": 1.993156152219163e-05, + "loss": 1.3016, + "step": 8508 + }, + { + "epoch": 0.1984565912649013, + "grad_norm": 1.6134393215179443, + "learning_rate": 1.9931532110436184e-05, + "loss": 1.198, + "step": 8509 + }, + { + "epoch": 0.19847991440407922, + "grad_norm": 2.5934648513793945, + "learning_rate": 1.993150269238389e-05, + "loss": 0.9192, + "step": 8510 + }, + { + "epoch": 0.19850323754325713, + "grad_norm": 2.2866601943969727, + "learning_rate": 1.9931473268034764e-05, + "loss": 1.2081, + "step": 8511 + }, + { + "epoch": 0.19852656068243504, + "grad_norm": 2.229982614517212, + "learning_rate": 1.993144383738882e-05, + "loss": 1.3032, + "step": 8512 + }, + { + "epoch": 0.19854988382161298, + "grad_norm": 2.184948682785034, + "learning_rate": 1.9931414400446083e-05, + "loss": 1.4859, + "step": 8513 + }, + { + "epoch": 0.1985732069607909, + "grad_norm": 1.8512685298919678, + "learning_rate": 1.9931384957206567e-05, + "loss": 1.1557, + "step": 8514 + }, + { + "epoch": 0.1985965300999688, + "grad_norm": 1.8854377269744873, + "learning_rate": 1.9931355507670292e-05, + "loss": 1.6258, + "step": 8515 + }, + { + "epoch": 0.19861985323914672, + "grad_norm": 2.4306952953338623, + "learning_rate": 1.993132605183728e-05, + "loss": 1.3071, + "step": 8516 + }, + { + "epoch": 0.19864317637832465, + "grad_norm": 1.9151445627212524, + "learning_rate": 1.9931296589707542e-05, + "loss": 1.4528, + "step": 8517 + }, + { + "epoch": 0.19866649951750256, + "grad_norm": 1.9716284275054932, + "learning_rate": 1.99312671212811e-05, + "loss": 1.4552, + "step": 8518 + }, + { + "epoch": 0.19868982265668048, + "grad_norm": 1.9568582773208618, + "learning_rate": 1.9931237646557983e-05, + "loss": 1.8142, + "step": 8519 + }, + { + "epoch": 0.1987131457958584, + "grad_norm": 1.772557020187378, + "learning_rate": 1.9931208165538192e-05, + "loss": 1.2108, + "step": 8520 + }, + { + "epoch": 0.1987364689350363, + "grad_norm": 2.101158857345581, + "learning_rate": 1.9931178678221757e-05, + "loss": 1.6281, + "step": 8521 + }, + { + "epoch": 0.19875979207421424, + "grad_norm": 1.8404721021652222, + "learning_rate": 1.9931149184608693e-05, + "loss": 1.6477, + "step": 8522 + }, + { + "epoch": 0.19878311521339215, + "grad_norm": 1.7882312536239624, + "learning_rate": 1.993111968469902e-05, + "loss": 1.3042, + "step": 8523 + }, + { + "epoch": 0.19880643835257006, + "grad_norm": 1.9858789443969727, + "learning_rate": 1.9931090178492758e-05, + "loss": 1.2813, + "step": 8524 + }, + { + "epoch": 0.19882976149174797, + "grad_norm": 1.962374210357666, + "learning_rate": 1.9931060665989924e-05, + "loss": 0.8552, + "step": 8525 + }, + { + "epoch": 0.1988530846309259, + "grad_norm": 2.9652040004730225, + "learning_rate": 1.9931031147190536e-05, + "loss": 1.7061, + "step": 8526 + }, + { + "epoch": 0.19887640777010382, + "grad_norm": 1.9163116216659546, + "learning_rate": 1.9931001622094612e-05, + "loss": 1.5735, + "step": 8527 + }, + { + "epoch": 0.19889973090928173, + "grad_norm": 1.7331854104995728, + "learning_rate": 1.9930972090702175e-05, + "loss": 1.6385, + "step": 8528 + }, + { + "epoch": 0.19892305404845964, + "grad_norm": 1.6774250268936157, + "learning_rate": 1.993094255301324e-05, + "loss": 1.5119, + "step": 8529 + }, + { + "epoch": 0.19894637718763758, + "grad_norm": 1.5432186126708984, + "learning_rate": 1.9930913009027824e-05, + "loss": 1.398, + "step": 8530 + }, + { + "epoch": 0.1989697003268155, + "grad_norm": 2.4643189907073975, + "learning_rate": 1.9930883458745952e-05, + "loss": 1.336, + "step": 8531 + }, + { + "epoch": 0.1989930234659934, + "grad_norm": 2.23405122756958, + "learning_rate": 1.9930853902167637e-05, + "loss": 1.165, + "step": 8532 + }, + { + "epoch": 0.1990163466051713, + "grad_norm": 1.6167285442352295, + "learning_rate": 1.9930824339292902e-05, + "loss": 1.5255, + "step": 8533 + }, + { + "epoch": 0.19903966974434925, + "grad_norm": 2.2048158645629883, + "learning_rate": 1.9930794770121762e-05, + "loss": 1.1898, + "step": 8534 + }, + { + "epoch": 0.19906299288352716, + "grad_norm": 1.5268076658248901, + "learning_rate": 1.993076519465424e-05, + "loss": 1.2922, + "step": 8535 + }, + { + "epoch": 0.19908631602270507, + "grad_norm": 1.7449623346328735, + "learning_rate": 1.9930735612890348e-05, + "loss": 1.1434, + "step": 8536 + }, + { + "epoch": 0.19910963916188298, + "grad_norm": 2.3672146797180176, + "learning_rate": 1.9930706024830112e-05, + "loss": 1.4381, + "step": 8537 + }, + { + "epoch": 0.19913296230106092, + "grad_norm": 1.8534438610076904, + "learning_rate": 1.9930676430473545e-05, + "loss": 1.4847, + "step": 8538 + }, + { + "epoch": 0.19915628544023883, + "grad_norm": 1.7172642946243286, + "learning_rate": 1.993064682982067e-05, + "loss": 1.5198, + "step": 8539 + }, + { + "epoch": 0.19917960857941674, + "grad_norm": 2.1724014282226562, + "learning_rate": 1.9930617222871504e-05, + "loss": 1.2473, + "step": 8540 + }, + { + "epoch": 0.19920293171859466, + "grad_norm": 1.6071799993515015, + "learning_rate": 1.9930587609626068e-05, + "loss": 1.1529, + "step": 8541 + }, + { + "epoch": 0.1992262548577726, + "grad_norm": 1.7100168466567993, + "learning_rate": 1.9930557990084375e-05, + "loss": 1.3967, + "step": 8542 + }, + { + "epoch": 0.1992495779969505, + "grad_norm": 1.7153252363204956, + "learning_rate": 1.993052836424645e-05, + "loss": 1.2877, + "step": 8543 + }, + { + "epoch": 0.19927290113612842, + "grad_norm": 1.8357136249542236, + "learning_rate": 1.9930498732112307e-05, + "loss": 1.5028, + "step": 8544 + }, + { + "epoch": 0.19929622427530633, + "grad_norm": 1.5904570817947388, + "learning_rate": 1.993046909368197e-05, + "loss": 1.3046, + "step": 8545 + }, + { + "epoch": 0.19931954741448427, + "grad_norm": 1.4243555068969727, + "learning_rate": 1.9930439448955453e-05, + "loss": 0.7939, + "step": 8546 + }, + { + "epoch": 0.19934287055366218, + "grad_norm": 2.0232248306274414, + "learning_rate": 1.9930409797932778e-05, + "loss": 1.251, + "step": 8547 + }, + { + "epoch": 0.1993661936928401, + "grad_norm": 2.077549934387207, + "learning_rate": 1.993038014061396e-05, + "loss": 1.4089, + "step": 8548 + }, + { + "epoch": 0.199389516832018, + "grad_norm": 2.039625406265259, + "learning_rate": 1.9930350476999023e-05, + "loss": 1.5803, + "step": 8549 + }, + { + "epoch": 0.1994128399711959, + "grad_norm": 1.78765869140625, + "learning_rate": 1.9930320807087984e-05, + "loss": 1.2045, + "step": 8550 + }, + { + "epoch": 0.19943616311037385, + "grad_norm": 1.716369390487671, + "learning_rate": 1.9930291130880858e-05, + "loss": 1.1589, + "step": 8551 + }, + { + "epoch": 0.19945948624955176, + "grad_norm": 1.9728193283081055, + "learning_rate": 1.9930261448377668e-05, + "loss": 1.5191, + "step": 8552 + }, + { + "epoch": 0.19948280938872967, + "grad_norm": 1.8161976337432861, + "learning_rate": 1.993023175957843e-05, + "loss": 1.3584, + "step": 8553 + }, + { + "epoch": 0.19950613252790758, + "grad_norm": 2.175579071044922, + "learning_rate": 1.9930202064483165e-05, + "loss": 1.5044, + "step": 8554 + }, + { + "epoch": 0.19952945566708552, + "grad_norm": 1.6410448551177979, + "learning_rate": 1.9930172363091892e-05, + "loss": 1.3973, + "step": 8555 + }, + { + "epoch": 0.19955277880626343, + "grad_norm": 2.159579038619995, + "learning_rate": 1.993014265540463e-05, + "loss": 1.6108, + "step": 8556 + }, + { + "epoch": 0.19957610194544134, + "grad_norm": 1.858014702796936, + "learning_rate": 1.9930112941421397e-05, + "loss": 1.518, + "step": 8557 + }, + { + "epoch": 0.19959942508461925, + "grad_norm": 1.7806264162063599, + "learning_rate": 1.9930083221142208e-05, + "loss": 1.2978, + "step": 8558 + }, + { + "epoch": 0.1996227482237972, + "grad_norm": 2.0742080211639404, + "learning_rate": 1.9930053494567087e-05, + "loss": 1.1868, + "step": 8559 + }, + { + "epoch": 0.1996460713629751, + "grad_norm": 1.6594098806381226, + "learning_rate": 1.9930023761696055e-05, + "loss": 1.6583, + "step": 8560 + }, + { + "epoch": 0.199669394502153, + "grad_norm": 2.0368945598602295, + "learning_rate": 1.9929994022529122e-05, + "loss": 1.2844, + "step": 8561 + }, + { + "epoch": 0.19969271764133092, + "grad_norm": 1.653772234916687, + "learning_rate": 1.9929964277066316e-05, + "loss": 1.6852, + "step": 8562 + }, + { + "epoch": 0.19971604078050886, + "grad_norm": 1.761475920677185, + "learning_rate": 1.9929934525307652e-05, + "loss": 1.3267, + "step": 8563 + }, + { + "epoch": 0.19973936391968677, + "grad_norm": 1.4252156019210815, + "learning_rate": 1.9929904767253148e-05, + "loss": 1.1293, + "step": 8564 + }, + { + "epoch": 0.19976268705886469, + "grad_norm": 1.7329976558685303, + "learning_rate": 1.992987500290282e-05, + "loss": 1.4515, + "step": 8565 + }, + { + "epoch": 0.1997860101980426, + "grad_norm": 1.7763077020645142, + "learning_rate": 1.9929845232256695e-05, + "loss": 1.3109, + "step": 8566 + }, + { + "epoch": 0.19980933333722053, + "grad_norm": 2.011014461517334, + "learning_rate": 1.9929815455314786e-05, + "loss": 1.3432, + "step": 8567 + }, + { + "epoch": 0.19983265647639845, + "grad_norm": 2.408893585205078, + "learning_rate": 1.9929785672077112e-05, + "loss": 1.5558, + "step": 8568 + }, + { + "epoch": 0.19985597961557636, + "grad_norm": 2.4763290882110596, + "learning_rate": 1.9929755882543696e-05, + "loss": 1.4367, + "step": 8569 + }, + { + "epoch": 0.19987930275475427, + "grad_norm": 2.267183542251587, + "learning_rate": 1.9929726086714552e-05, + "loss": 1.6151, + "step": 8570 + }, + { + "epoch": 0.1999026258939322, + "grad_norm": 2.1209559440612793, + "learning_rate": 1.99296962845897e-05, + "loss": 1.5078, + "step": 8571 + }, + { + "epoch": 0.19992594903311012, + "grad_norm": 1.9684478044509888, + "learning_rate": 1.9929666476169163e-05, + "loss": 1.5521, + "step": 8572 + }, + { + "epoch": 0.19994927217228803, + "grad_norm": 2.004594564437866, + "learning_rate": 1.9929636661452956e-05, + "loss": 1.204, + "step": 8573 + }, + { + "epoch": 0.19997259531146594, + "grad_norm": 1.6245849132537842, + "learning_rate": 1.9929606840441096e-05, + "loss": 1.4088, + "step": 8574 + }, + { + "epoch": 0.19999591845064388, + "grad_norm": 2.3038241863250732, + "learning_rate": 1.992957701313361e-05, + "loss": 1.3, + "step": 8575 + }, + { + "epoch": 0.2000192415898218, + "grad_norm": 1.7967134714126587, + "learning_rate": 1.9929547179530505e-05, + "loss": 0.8908, + "step": 8576 + }, + { + "epoch": 0.2000425647289997, + "grad_norm": 1.9462894201278687, + "learning_rate": 1.992951733963181e-05, + "loss": 1.5149, + "step": 8577 + }, + { + "epoch": 0.2000658878681776, + "grad_norm": 1.9436413049697876, + "learning_rate": 1.992948749343754e-05, + "loss": 1.3846, + "step": 8578 + }, + { + "epoch": 0.20008921100735552, + "grad_norm": 2.1051881313323975, + "learning_rate": 1.9929457640947712e-05, + "loss": 1.4316, + "step": 8579 + }, + { + "epoch": 0.20011253414653346, + "grad_norm": 1.6956332921981812, + "learning_rate": 1.992942778216235e-05, + "loss": 1.2556, + "step": 8580 + }, + { + "epoch": 0.20013585728571137, + "grad_norm": 2.0119099617004395, + "learning_rate": 1.992939791708147e-05, + "loss": 1.8544, + "step": 8581 + }, + { + "epoch": 0.20015918042488928, + "grad_norm": 2.085557699203491, + "learning_rate": 1.992936804570509e-05, + "loss": 1.4618, + "step": 8582 + }, + { + "epoch": 0.2001825035640672, + "grad_norm": 1.6107583045959473, + "learning_rate": 1.9929338168033227e-05, + "loss": 1.3139, + "step": 8583 + }, + { + "epoch": 0.20020582670324513, + "grad_norm": 2.1184771060943604, + "learning_rate": 1.992930828406591e-05, + "loss": 1.8629, + "step": 8584 + }, + { + "epoch": 0.20022914984242304, + "grad_norm": 1.6549603939056396, + "learning_rate": 1.9929278393803146e-05, + "loss": 1.216, + "step": 8585 + }, + { + "epoch": 0.20025247298160095, + "grad_norm": 1.7777299880981445, + "learning_rate": 1.9929248497244956e-05, + "loss": 1.6253, + "step": 8586 + }, + { + "epoch": 0.20027579612077887, + "grad_norm": 1.789884328842163, + "learning_rate": 1.992921859439137e-05, + "loss": 1.0988, + "step": 8587 + }, + { + "epoch": 0.2002991192599568, + "grad_norm": 1.6690226793289185, + "learning_rate": 1.9929188685242393e-05, + "loss": 1.2647, + "step": 8588 + }, + { + "epoch": 0.20032244239913471, + "grad_norm": 1.8306410312652588, + "learning_rate": 1.9929158769798048e-05, + "loss": 1.5433, + "step": 8589 + }, + { + "epoch": 0.20034576553831263, + "grad_norm": 2.274371385574341, + "learning_rate": 1.992912884805836e-05, + "loss": 1.8103, + "step": 8590 + }, + { + "epoch": 0.20036908867749054, + "grad_norm": 1.6047557592391968, + "learning_rate": 1.9929098920023345e-05, + "loss": 1.6261, + "step": 8591 + }, + { + "epoch": 0.20039241181666848, + "grad_norm": 1.817678689956665, + "learning_rate": 1.9929068985693014e-05, + "loss": 1.4836, + "step": 8592 + }, + { + "epoch": 0.2004157349558464, + "grad_norm": 1.796488881111145, + "learning_rate": 1.99290390450674e-05, + "loss": 1.5, + "step": 8593 + }, + { + "epoch": 0.2004390580950243, + "grad_norm": 2.0216352939605713, + "learning_rate": 1.992900909814651e-05, + "loss": 1.5251, + "step": 8594 + }, + { + "epoch": 0.2004623812342022, + "grad_norm": 2.112666606903076, + "learning_rate": 1.9928979144930367e-05, + "loss": 1.2229, + "step": 8595 + }, + { + "epoch": 0.20048570437338015, + "grad_norm": 1.9515166282653809, + "learning_rate": 1.9928949185418994e-05, + "loss": 1.577, + "step": 8596 + }, + { + "epoch": 0.20050902751255806, + "grad_norm": 1.7351491451263428, + "learning_rate": 1.9928919219612404e-05, + "loss": 1.5138, + "step": 8597 + }, + { + "epoch": 0.20053235065173597, + "grad_norm": 1.6844878196716309, + "learning_rate": 1.9928889247510618e-05, + "loss": 1.5027, + "step": 8598 + }, + { + "epoch": 0.20055567379091388, + "grad_norm": 2.9149434566497803, + "learning_rate": 1.992885926911366e-05, + "loss": 1.4791, + "step": 8599 + }, + { + "epoch": 0.20057899693009182, + "grad_norm": 1.690651297569275, + "learning_rate": 1.9928829284421538e-05, + "loss": 1.1719, + "step": 8600 + }, + { + "epoch": 0.20060232006926973, + "grad_norm": 2.2557668685913086, + "learning_rate": 1.992879929343428e-05, + "loss": 1.3211, + "step": 8601 + }, + { + "epoch": 0.20062564320844764, + "grad_norm": 1.64676833152771, + "learning_rate": 1.9928769296151904e-05, + "loss": 1.4015, + "step": 8602 + }, + { + "epoch": 0.20064896634762555, + "grad_norm": 1.682283878326416, + "learning_rate": 1.9928739292574427e-05, + "loss": 1.3068, + "step": 8603 + }, + { + "epoch": 0.2006722894868035, + "grad_norm": 2.1106038093566895, + "learning_rate": 1.992870928270187e-05, + "loss": 1.5022, + "step": 8604 + }, + { + "epoch": 0.2006956126259814, + "grad_norm": 1.7299132347106934, + "learning_rate": 1.992867926653425e-05, + "loss": 1.2536, + "step": 8605 + }, + { + "epoch": 0.2007189357651593, + "grad_norm": 1.9662495851516724, + "learning_rate": 1.9928649244071587e-05, + "loss": 1.4383, + "step": 8606 + }, + { + "epoch": 0.20074225890433722, + "grad_norm": 1.5652048587799072, + "learning_rate": 1.9928619215313894e-05, + "loss": 1.2395, + "step": 8607 + }, + { + "epoch": 0.20076558204351513, + "grad_norm": 1.680894136428833, + "learning_rate": 1.9928589180261202e-05, + "loss": 1.25, + "step": 8608 + }, + { + "epoch": 0.20078890518269307, + "grad_norm": 1.7617830038070679, + "learning_rate": 1.992855913891352e-05, + "loss": 1.6103, + "step": 8609 + }, + { + "epoch": 0.20081222832187098, + "grad_norm": 1.9974735975265503, + "learning_rate": 1.9928529091270877e-05, + "loss": 1.369, + "step": 8610 + }, + { + "epoch": 0.2008355514610489, + "grad_norm": 1.9050729274749756, + "learning_rate": 1.992849903733328e-05, + "loss": 1.0756, + "step": 8611 + }, + { + "epoch": 0.2008588746002268, + "grad_norm": 1.7208689451217651, + "learning_rate": 1.9928468977100755e-05, + "loss": 1.2512, + "step": 8612 + }, + { + "epoch": 0.20088219773940474, + "grad_norm": 1.9844028949737549, + "learning_rate": 1.992843891057332e-05, + "loss": 1.1617, + "step": 8613 + }, + { + "epoch": 0.20090552087858266, + "grad_norm": 1.7846125364303589, + "learning_rate": 1.9928408837750997e-05, + "loss": 1.4224, + "step": 8614 + }, + { + "epoch": 0.20092884401776057, + "grad_norm": 2.0279417037963867, + "learning_rate": 1.99283787586338e-05, + "loss": 1.3022, + "step": 8615 + }, + { + "epoch": 0.20095216715693848, + "grad_norm": 2.204399347305298, + "learning_rate": 1.9928348673221747e-05, + "loss": 1.7996, + "step": 8616 + }, + { + "epoch": 0.20097549029611642, + "grad_norm": 1.9394294023513794, + "learning_rate": 1.9928318581514867e-05, + "loss": 1.3811, + "step": 8617 + }, + { + "epoch": 0.20099881343529433, + "grad_norm": 1.7966961860656738, + "learning_rate": 1.9928288483513167e-05, + "loss": 1.084, + "step": 8618 + }, + { + "epoch": 0.20102213657447224, + "grad_norm": 1.8421030044555664, + "learning_rate": 1.9928258379216673e-05, + "loss": 1.6016, + "step": 8619 + }, + { + "epoch": 0.20104545971365015, + "grad_norm": 2.0328686237335205, + "learning_rate": 1.9928228268625403e-05, + "loss": 1.1737, + "step": 8620 + }, + { + "epoch": 0.2010687828528281, + "grad_norm": 1.673363447189331, + "learning_rate": 1.9928198151739377e-05, + "loss": 1.3223, + "step": 8621 + }, + { + "epoch": 0.201092105992006, + "grad_norm": 1.574405312538147, + "learning_rate": 1.992816802855861e-05, + "loss": 1.566, + "step": 8622 + }, + { + "epoch": 0.2011154291311839, + "grad_norm": 1.8290834426879883, + "learning_rate": 1.9928137899083126e-05, + "loss": 1.2777, + "step": 8623 + }, + { + "epoch": 0.20113875227036182, + "grad_norm": 2.2401552200317383, + "learning_rate": 1.992810776331294e-05, + "loss": 1.3386, + "step": 8624 + }, + { + "epoch": 0.20116207540953976, + "grad_norm": 2.142089366912842, + "learning_rate": 1.9928077621248075e-05, + "loss": 1.4547, + "step": 8625 + }, + { + "epoch": 0.20118539854871767, + "grad_norm": 1.5809448957443237, + "learning_rate": 1.9928047472888545e-05, + "loss": 1.3224, + "step": 8626 + }, + { + "epoch": 0.20120872168789558, + "grad_norm": 2.037525177001953, + "learning_rate": 1.9928017318234377e-05, + "loss": 1.6199, + "step": 8627 + }, + { + "epoch": 0.2012320448270735, + "grad_norm": 2.1955416202545166, + "learning_rate": 1.9927987157285583e-05, + "loss": 1.381, + "step": 8628 + }, + { + "epoch": 0.20125536796625143, + "grad_norm": 1.4721949100494385, + "learning_rate": 1.9927956990042184e-05, + "loss": 1.6143, + "step": 8629 + }, + { + "epoch": 0.20127869110542934, + "grad_norm": 1.780194640159607, + "learning_rate": 1.99279268165042e-05, + "loss": 1.6453, + "step": 8630 + }, + { + "epoch": 0.20130201424460725, + "grad_norm": 1.8660014867782593, + "learning_rate": 1.992789663667165e-05, + "loss": 1.1227, + "step": 8631 + }, + { + "epoch": 0.20132533738378516, + "grad_norm": 2.272500991821289, + "learning_rate": 1.992786645054455e-05, + "loss": 1.4123, + "step": 8632 + }, + { + "epoch": 0.20134866052296307, + "grad_norm": 1.909055233001709, + "learning_rate": 1.9927836258122924e-05, + "loss": 1.7311, + "step": 8633 + }, + { + "epoch": 0.201371983662141, + "grad_norm": 1.6715971231460571, + "learning_rate": 1.992780605940679e-05, + "loss": 1.0716, + "step": 8634 + }, + { + "epoch": 0.20139530680131892, + "grad_norm": 2.029350996017456, + "learning_rate": 1.9927775854396168e-05, + "loss": 1.5559, + "step": 8635 + }, + { + "epoch": 0.20141862994049683, + "grad_norm": 1.988607406616211, + "learning_rate": 1.992774564309107e-05, + "loss": 1.516, + "step": 8636 + }, + { + "epoch": 0.20144195307967475, + "grad_norm": 2.102781295776367, + "learning_rate": 1.9927715425491527e-05, + "loss": 1.0337, + "step": 8637 + }, + { + "epoch": 0.20146527621885268, + "grad_norm": 2.192837953567505, + "learning_rate": 1.992768520159755e-05, + "loss": 1.4841, + "step": 8638 + }, + { + "epoch": 0.2014885993580306, + "grad_norm": 1.6843454837799072, + "learning_rate": 1.9927654971409158e-05, + "loss": 1.4046, + "step": 8639 + }, + { + "epoch": 0.2015119224972085, + "grad_norm": 2.6186683177948, + "learning_rate": 1.9927624734926373e-05, + "loss": 1.625, + "step": 8640 + }, + { + "epoch": 0.20153524563638642, + "grad_norm": 1.5319478511810303, + "learning_rate": 1.992759449214921e-05, + "loss": 1.1996, + "step": 8641 + }, + { + "epoch": 0.20155856877556436, + "grad_norm": 1.786947250366211, + "learning_rate": 1.9927564243077694e-05, + "loss": 1.3614, + "step": 8642 + }, + { + "epoch": 0.20158189191474227, + "grad_norm": 1.9961731433868408, + "learning_rate": 1.9927533987711844e-05, + "loss": 1.583, + "step": 8643 + }, + { + "epoch": 0.20160521505392018, + "grad_norm": 1.8245047330856323, + "learning_rate": 1.9927503726051675e-05, + "loss": 1.7002, + "step": 8644 + }, + { + "epoch": 0.2016285381930981, + "grad_norm": 1.7347209453582764, + "learning_rate": 1.992747345809721e-05, + "loss": 1.4671, + "step": 8645 + }, + { + "epoch": 0.20165186133227603, + "grad_norm": 1.5864899158477783, + "learning_rate": 1.9927443183848463e-05, + "loss": 1.2217, + "step": 8646 + }, + { + "epoch": 0.20167518447145394, + "grad_norm": 1.8574601411819458, + "learning_rate": 1.9927412903305457e-05, + "loss": 1.5846, + "step": 8647 + }, + { + "epoch": 0.20169850761063185, + "grad_norm": 2.4750797748565674, + "learning_rate": 1.9927382616468213e-05, + "loss": 1.465, + "step": 8648 + }, + { + "epoch": 0.20172183074980976, + "grad_norm": 1.6023808717727661, + "learning_rate": 1.9927352323336746e-05, + "loss": 1.2668, + "step": 8649 + }, + { + "epoch": 0.2017451538889877, + "grad_norm": 1.719538927078247, + "learning_rate": 1.9927322023911074e-05, + "loss": 1.5154, + "step": 8650 + }, + { + "epoch": 0.2017684770281656, + "grad_norm": 2.036890745162964, + "learning_rate": 1.9927291718191223e-05, + "loss": 1.4945, + "step": 8651 + }, + { + "epoch": 0.20179180016734352, + "grad_norm": 2.223398208618164, + "learning_rate": 1.992726140617721e-05, + "loss": 1.7082, + "step": 8652 + }, + { + "epoch": 0.20181512330652143, + "grad_norm": 2.0190601348876953, + "learning_rate": 1.992723108786905e-05, + "loss": 1.1958, + "step": 8653 + }, + { + "epoch": 0.20183844644569937, + "grad_norm": 2.0929315090179443, + "learning_rate": 1.9927200763266764e-05, + "loss": 1.5001, + "step": 8654 + }, + { + "epoch": 0.20186176958487728, + "grad_norm": 1.743127465248108, + "learning_rate": 1.9927170432370373e-05, + "loss": 1.3011, + "step": 8655 + }, + { + "epoch": 0.2018850927240552, + "grad_norm": 1.8542795181274414, + "learning_rate": 1.9927140095179897e-05, + "loss": 1.7652, + "step": 8656 + }, + { + "epoch": 0.2019084158632331, + "grad_norm": 1.6031721830368042, + "learning_rate": 1.992710975169535e-05, + "loss": 1.0514, + "step": 8657 + }, + { + "epoch": 0.20193173900241104, + "grad_norm": 1.9875092506408691, + "learning_rate": 1.9927079401916758e-05, + "loss": 0.8593, + "step": 8658 + }, + { + "epoch": 0.20195506214158895, + "grad_norm": 2.047773838043213, + "learning_rate": 1.9927049045844135e-05, + "loss": 1.9972, + "step": 8659 + }, + { + "epoch": 0.20197838528076686, + "grad_norm": 2.0824506282806396, + "learning_rate": 1.9927018683477502e-05, + "loss": 1.3114, + "step": 8660 + }, + { + "epoch": 0.20200170841994478, + "grad_norm": 2.1166629791259766, + "learning_rate": 1.992698831481688e-05, + "loss": 1.3794, + "step": 8661 + }, + { + "epoch": 0.2020250315591227, + "grad_norm": 1.9460465908050537, + "learning_rate": 1.992695793986229e-05, + "loss": 1.4684, + "step": 8662 + }, + { + "epoch": 0.20204835469830063, + "grad_norm": 1.6829252243041992, + "learning_rate": 1.9926927558613744e-05, + "loss": 1.6767, + "step": 8663 + }, + { + "epoch": 0.20207167783747854, + "grad_norm": 1.7887227535247803, + "learning_rate": 1.9926897171071266e-05, + "loss": 1.3593, + "step": 8664 + }, + { + "epoch": 0.20209500097665645, + "grad_norm": 1.558276653289795, + "learning_rate": 1.9926866777234872e-05, + "loss": 1.3381, + "step": 8665 + }, + { + "epoch": 0.20211832411583436, + "grad_norm": 2.137524127960205, + "learning_rate": 1.992683637710459e-05, + "loss": 1.5931, + "step": 8666 + }, + { + "epoch": 0.2021416472550123, + "grad_norm": 1.6543177366256714, + "learning_rate": 1.992680597068043e-05, + "loss": 1.1084, + "step": 8667 + }, + { + "epoch": 0.2021649703941902, + "grad_norm": 1.9740118980407715, + "learning_rate": 1.992677555796241e-05, + "loss": 1.3241, + "step": 8668 + }, + { + "epoch": 0.20218829353336812, + "grad_norm": 1.655184268951416, + "learning_rate": 1.9926745138950558e-05, + "loss": 1.5705, + "step": 8669 + }, + { + "epoch": 0.20221161667254603, + "grad_norm": 1.645855188369751, + "learning_rate": 1.9926714713644887e-05, + "loss": 1.4062, + "step": 8670 + }, + { + "epoch": 0.20223493981172397, + "grad_norm": 2.6039462089538574, + "learning_rate": 1.992668428204542e-05, + "loss": 1.767, + "step": 8671 + }, + { + "epoch": 0.20225826295090188, + "grad_norm": 1.8323744535446167, + "learning_rate": 1.9926653844152173e-05, + "loss": 1.279, + "step": 8672 + }, + { + "epoch": 0.2022815860900798, + "grad_norm": 1.901573896408081, + "learning_rate": 1.992662339996517e-05, + "loss": 1.6495, + "step": 8673 + }, + { + "epoch": 0.2023049092292577, + "grad_norm": 1.9171459674835205, + "learning_rate": 1.9926592949484423e-05, + "loss": 1.6408, + "step": 8674 + }, + { + "epoch": 0.20232823236843564, + "grad_norm": 1.8259592056274414, + "learning_rate": 1.9926562492709957e-05, + "loss": 1.4305, + "step": 8675 + }, + { + "epoch": 0.20235155550761355, + "grad_norm": 1.8189347982406616, + "learning_rate": 1.9926532029641793e-05, + "loss": 1.1183, + "step": 8676 + }, + { + "epoch": 0.20237487864679146, + "grad_norm": 2.762092113494873, + "learning_rate": 1.9926501560279944e-05, + "loss": 1.43, + "step": 8677 + }, + { + "epoch": 0.20239820178596937, + "grad_norm": 2.1132094860076904, + "learning_rate": 1.9926471084624436e-05, + "loss": 1.1977, + "step": 8678 + }, + { + "epoch": 0.2024215249251473, + "grad_norm": 2.0202882289886475, + "learning_rate": 1.9926440602675278e-05, + "loss": 1.4328, + "step": 8679 + }, + { + "epoch": 0.20244484806432522, + "grad_norm": 1.9041361808776855, + "learning_rate": 1.9926410114432498e-05, + "loss": 1.6001, + "step": 8680 + }, + { + "epoch": 0.20246817120350313, + "grad_norm": 2.513190507888794, + "learning_rate": 1.9926379619896118e-05, + "loss": 1.6316, + "step": 8681 + }, + { + "epoch": 0.20249149434268104, + "grad_norm": 1.7311691045761108, + "learning_rate": 1.992634911906615e-05, + "loss": 1.5237, + "step": 8682 + }, + { + "epoch": 0.20251481748185898, + "grad_norm": 1.688225507736206, + "learning_rate": 1.9926318611942614e-05, + "loss": 1.2565, + "step": 8683 + }, + { + "epoch": 0.2025381406210369, + "grad_norm": 1.8149502277374268, + "learning_rate": 1.9926288098525534e-05, + "loss": 1.4996, + "step": 8684 + }, + { + "epoch": 0.2025614637602148, + "grad_norm": 1.7872846126556396, + "learning_rate": 1.9926257578814928e-05, + "loss": 1.7019, + "step": 8685 + }, + { + "epoch": 0.20258478689939272, + "grad_norm": 1.686917781829834, + "learning_rate": 1.9926227052810813e-05, + "loss": 1.1004, + "step": 8686 + }, + { + "epoch": 0.20260811003857065, + "grad_norm": 1.878817081451416, + "learning_rate": 1.9926196520513206e-05, + "loss": 1.4728, + "step": 8687 + }, + { + "epoch": 0.20263143317774857, + "grad_norm": 2.56231427192688, + "learning_rate": 1.992616598192213e-05, + "loss": 1.3895, + "step": 8688 + }, + { + "epoch": 0.20265475631692648, + "grad_norm": 1.7516465187072754, + "learning_rate": 1.9926135437037608e-05, + "loss": 1.1023, + "step": 8689 + }, + { + "epoch": 0.2026780794561044, + "grad_norm": 1.869144082069397, + "learning_rate": 1.9926104885859653e-05, + "loss": 1.3844, + "step": 8690 + }, + { + "epoch": 0.2027014025952823, + "grad_norm": 1.7043218612670898, + "learning_rate": 1.9926074328388288e-05, + "loss": 1.5623, + "step": 8691 + }, + { + "epoch": 0.20272472573446024, + "grad_norm": 1.7223814725875854, + "learning_rate": 1.9926043764623533e-05, + "loss": 1.3279, + "step": 8692 + }, + { + "epoch": 0.20274804887363815, + "grad_norm": 2.0240321159362793, + "learning_rate": 1.9926013194565403e-05, + "loss": 1.6416, + "step": 8693 + }, + { + "epoch": 0.20277137201281606, + "grad_norm": 2.0044267177581787, + "learning_rate": 1.992598261821392e-05, + "loss": 1.2893, + "step": 8694 + }, + { + "epoch": 0.20279469515199397, + "grad_norm": 1.549274206161499, + "learning_rate": 1.9925952035569106e-05, + "loss": 1.2753, + "step": 8695 + }, + { + "epoch": 0.2028180182911719, + "grad_norm": 2.364896774291992, + "learning_rate": 1.9925921446630975e-05, + "loss": 1.6467, + "step": 8696 + }, + { + "epoch": 0.20284134143034982, + "grad_norm": 2.6411540508270264, + "learning_rate": 1.992589085139955e-05, + "loss": 1.3512, + "step": 8697 + }, + { + "epoch": 0.20286466456952773, + "grad_norm": 2.0302560329437256, + "learning_rate": 1.992586024987485e-05, + "loss": 1.4654, + "step": 8698 + }, + { + "epoch": 0.20288798770870564, + "grad_norm": 2.0057921409606934, + "learning_rate": 1.9925829642056894e-05, + "loss": 1.4444, + "step": 8699 + }, + { + "epoch": 0.20291131084788358, + "grad_norm": 1.9912748336791992, + "learning_rate": 1.99257990279457e-05, + "loss": 1.6062, + "step": 8700 + }, + { + "epoch": 0.2029346339870615, + "grad_norm": 1.704063057899475, + "learning_rate": 1.9925768407541294e-05, + "loss": 1.3408, + "step": 8701 + }, + { + "epoch": 0.2029579571262394, + "grad_norm": 1.5035160779953003, + "learning_rate": 1.9925737780843685e-05, + "loss": 1.446, + "step": 8702 + }, + { + "epoch": 0.2029812802654173, + "grad_norm": 1.84805428981781, + "learning_rate": 1.9925707147852898e-05, + "loss": 1.2996, + "step": 8703 + }, + { + "epoch": 0.20300460340459525, + "grad_norm": 2.0463736057281494, + "learning_rate": 1.9925676508568955e-05, + "loss": 1.2612, + "step": 8704 + }, + { + "epoch": 0.20302792654377316, + "grad_norm": 1.9819360971450806, + "learning_rate": 1.992564586299187e-05, + "loss": 1.6805, + "step": 8705 + }, + { + "epoch": 0.20305124968295107, + "grad_norm": 1.903632402420044, + "learning_rate": 1.9925615211121667e-05, + "loss": 1.7348, + "step": 8706 + }, + { + "epoch": 0.20307457282212898, + "grad_norm": 2.221893072128296, + "learning_rate": 1.992558455295836e-05, + "loss": 1.4662, + "step": 8707 + }, + { + "epoch": 0.20309789596130692, + "grad_norm": 1.7994598150253296, + "learning_rate": 1.9925553888501975e-05, + "loss": 1.3152, + "step": 8708 + }, + { + "epoch": 0.20312121910048483, + "grad_norm": 1.75600266456604, + "learning_rate": 1.9925523217752523e-05, + "loss": 1.164, + "step": 8709 + }, + { + "epoch": 0.20314454223966275, + "grad_norm": 2.030285596847534, + "learning_rate": 1.9925492540710034e-05, + "loss": 1.1364, + "step": 8710 + }, + { + "epoch": 0.20316786537884066, + "grad_norm": 1.6719369888305664, + "learning_rate": 1.9925461857374522e-05, + "loss": 1.2113, + "step": 8711 + }, + { + "epoch": 0.2031911885180186, + "grad_norm": 1.7777940034866333, + "learning_rate": 1.9925431167746004e-05, + "loss": 1.3115, + "step": 8712 + }, + { + "epoch": 0.2032145116571965, + "grad_norm": 1.6478523015975952, + "learning_rate": 1.9925400471824505e-05, + "loss": 1.5389, + "step": 8713 + }, + { + "epoch": 0.20323783479637442, + "grad_norm": 2.1136772632598877, + "learning_rate": 1.9925369769610042e-05, + "loss": 1.2938, + "step": 8714 + }, + { + "epoch": 0.20326115793555233, + "grad_norm": 2.039743661880493, + "learning_rate": 1.992533906110263e-05, + "loss": 1.1118, + "step": 8715 + }, + { + "epoch": 0.20328448107473027, + "grad_norm": 1.6830363273620605, + "learning_rate": 1.9925308346302294e-05, + "loss": 1.2009, + "step": 8716 + }, + { + "epoch": 0.20330780421390818, + "grad_norm": 1.8598593473434448, + "learning_rate": 1.9925277625209053e-05, + "loss": 1.6406, + "step": 8717 + }, + { + "epoch": 0.2033311273530861, + "grad_norm": 1.7212172746658325, + "learning_rate": 1.9925246897822923e-05, + "loss": 1.5523, + "step": 8718 + }, + { + "epoch": 0.203354450492264, + "grad_norm": 2.0677688121795654, + "learning_rate": 1.992521616414393e-05, + "loss": 1.1334, + "step": 8719 + }, + { + "epoch": 0.2033777736314419, + "grad_norm": 1.6897939443588257, + "learning_rate": 1.9925185424172086e-05, + "loss": 1.4238, + "step": 8720 + }, + { + "epoch": 0.20340109677061985, + "grad_norm": 1.6465643644332886, + "learning_rate": 1.9925154677907415e-05, + "loss": 1.399, + "step": 8721 + }, + { + "epoch": 0.20342441990979776, + "grad_norm": 1.6797456741333008, + "learning_rate": 1.9925123925349935e-05, + "loss": 1.6383, + "step": 8722 + }, + { + "epoch": 0.20344774304897567, + "grad_norm": 1.6820688247680664, + "learning_rate": 1.9925093166499665e-05, + "loss": 1.3603, + "step": 8723 + }, + { + "epoch": 0.20347106618815358, + "grad_norm": 1.83897864818573, + "learning_rate": 1.9925062401356627e-05, + "loss": 1.6247, + "step": 8724 + }, + { + "epoch": 0.20349438932733152, + "grad_norm": 1.9001243114471436, + "learning_rate": 1.992503162992084e-05, + "loss": 1.4808, + "step": 8725 + }, + { + "epoch": 0.20351771246650943, + "grad_norm": 1.9484024047851562, + "learning_rate": 1.992500085219232e-05, + "loss": 1.1417, + "step": 8726 + }, + { + "epoch": 0.20354103560568734, + "grad_norm": 1.6613361835479736, + "learning_rate": 1.992497006817109e-05, + "loss": 1.491, + "step": 8727 + }, + { + "epoch": 0.20356435874486525, + "grad_norm": 1.7884044647216797, + "learning_rate": 1.9924939277857167e-05, + "loss": 0.9364, + "step": 8728 + }, + { + "epoch": 0.2035876818840432, + "grad_norm": 1.7043697834014893, + "learning_rate": 1.9924908481250573e-05, + "loss": 1.3981, + "step": 8729 + }, + { + "epoch": 0.2036110050232211, + "grad_norm": 1.8538833856582642, + "learning_rate": 1.9924877678351327e-05, + "loss": 1.476, + "step": 8730 + }, + { + "epoch": 0.20363432816239901, + "grad_norm": 2.285177707672119, + "learning_rate": 1.9924846869159445e-05, + "loss": 1.6123, + "step": 8731 + }, + { + "epoch": 0.20365765130157693, + "grad_norm": 2.364985942840576, + "learning_rate": 1.9924816053674953e-05, + "loss": 1.5083, + "step": 8732 + }, + { + "epoch": 0.20368097444075486, + "grad_norm": 2.1389856338500977, + "learning_rate": 1.9924785231897866e-05, + "loss": 1.4382, + "step": 8733 + }, + { + "epoch": 0.20370429757993277, + "grad_norm": 2.118715524673462, + "learning_rate": 1.9924754403828203e-05, + "loss": 1.776, + "step": 8734 + }, + { + "epoch": 0.20372762071911069, + "grad_norm": 1.484419345855713, + "learning_rate": 1.992472356946599e-05, + "loss": 0.9963, + "step": 8735 + }, + { + "epoch": 0.2037509438582886, + "grad_norm": 1.9472377300262451, + "learning_rate": 1.9924692728811238e-05, + "loss": 1.5576, + "step": 8736 + }, + { + "epoch": 0.20377426699746654, + "grad_norm": 1.895979404449463, + "learning_rate": 1.992466188186397e-05, + "loss": 0.9937, + "step": 8737 + }, + { + "epoch": 0.20379759013664445, + "grad_norm": 1.6635349988937378, + "learning_rate": 1.9924631028624205e-05, + "loss": 1.5651, + "step": 8738 + }, + { + "epoch": 0.20382091327582236, + "grad_norm": 2.0051400661468506, + "learning_rate": 1.9924600169091966e-05, + "loss": 1.367, + "step": 8739 + }, + { + "epoch": 0.20384423641500027, + "grad_norm": 1.6480952501296997, + "learning_rate": 1.992456930326727e-05, + "loss": 1.1594, + "step": 8740 + }, + { + "epoch": 0.2038675595541782, + "grad_norm": 1.6607714891433716, + "learning_rate": 1.9924538431150134e-05, + "loss": 1.2025, + "step": 8741 + }, + { + "epoch": 0.20389088269335612, + "grad_norm": 2.307953119277954, + "learning_rate": 1.992450755274058e-05, + "loss": 1.2454, + "step": 8742 + }, + { + "epoch": 0.20391420583253403, + "grad_norm": 1.680513858795166, + "learning_rate": 1.992447666803863e-05, + "loss": 1.2715, + "step": 8743 + }, + { + "epoch": 0.20393752897171194, + "grad_norm": 1.8988139629364014, + "learning_rate": 1.99244457770443e-05, + "loss": 1.3859, + "step": 8744 + }, + { + "epoch": 0.20396085211088988, + "grad_norm": 1.8415597677230835, + "learning_rate": 1.992441487975761e-05, + "loss": 1.3106, + "step": 8745 + }, + { + "epoch": 0.2039841752500678, + "grad_norm": 1.7065378427505493, + "learning_rate": 1.9924383976178582e-05, + "loss": 0.9613, + "step": 8746 + }, + { + "epoch": 0.2040074983892457, + "grad_norm": 2.0893259048461914, + "learning_rate": 1.9924353066307233e-05, + "loss": 1.3543, + "step": 8747 + }, + { + "epoch": 0.2040308215284236, + "grad_norm": 2.249833345413208, + "learning_rate": 1.9924322150143585e-05, + "loss": 1.4813, + "step": 8748 + }, + { + "epoch": 0.20405414466760152, + "grad_norm": 2.202148199081421, + "learning_rate": 1.9924291227687654e-05, + "loss": 1.3918, + "step": 8749 + }, + { + "epoch": 0.20407746780677946, + "grad_norm": 1.9354767799377441, + "learning_rate": 1.9924260298939468e-05, + "loss": 1.4875, + "step": 8750 + }, + { + "epoch": 0.20410079094595737, + "grad_norm": 1.8233189582824707, + "learning_rate": 1.9924229363899033e-05, + "loss": 1.4094, + "step": 8751 + }, + { + "epoch": 0.20412411408513528, + "grad_norm": 1.723515510559082, + "learning_rate": 1.992419842256638e-05, + "loss": 1.4259, + "step": 8752 + }, + { + "epoch": 0.2041474372243132, + "grad_norm": 1.7915992736816406, + "learning_rate": 1.992416747494152e-05, + "loss": 1.4247, + "step": 8753 + }, + { + "epoch": 0.20417076036349113, + "grad_norm": 1.6213306188583374, + "learning_rate": 1.992413652102448e-05, + "loss": 1.2403, + "step": 8754 + }, + { + "epoch": 0.20419408350266904, + "grad_norm": 1.60962975025177, + "learning_rate": 1.9924105560815278e-05, + "loss": 1.3427, + "step": 8755 + }, + { + "epoch": 0.20421740664184695, + "grad_norm": 1.8469569683074951, + "learning_rate": 1.9924074594313932e-05, + "loss": 1.3315, + "step": 8756 + }, + { + "epoch": 0.20424072978102487, + "grad_norm": 1.9690977334976196, + "learning_rate": 1.9924043621520463e-05, + "loss": 1.383, + "step": 8757 + }, + { + "epoch": 0.2042640529202028, + "grad_norm": 1.3607131242752075, + "learning_rate": 1.9924012642434887e-05, + "loss": 1.2414, + "step": 8758 + }, + { + "epoch": 0.20428737605938072, + "grad_norm": 3.4915966987609863, + "learning_rate": 1.992398165705723e-05, + "loss": 1.4393, + "step": 8759 + }, + { + "epoch": 0.20431069919855863, + "grad_norm": 1.5933380126953125, + "learning_rate": 1.9923950665387505e-05, + "loss": 1.2841, + "step": 8760 + }, + { + "epoch": 0.20433402233773654, + "grad_norm": 1.5613057613372803, + "learning_rate": 1.9923919667425736e-05, + "loss": 1.2373, + "step": 8761 + }, + { + "epoch": 0.20435734547691448, + "grad_norm": 1.6714661121368408, + "learning_rate": 1.9923888663171942e-05, + "loss": 1.346, + "step": 8762 + }, + { + "epoch": 0.2043806686160924, + "grad_norm": 1.7481532096862793, + "learning_rate": 1.9923857652626143e-05, + "loss": 1.4136, + "step": 8763 + }, + { + "epoch": 0.2044039917552703, + "grad_norm": 2.2159597873687744, + "learning_rate": 1.9923826635788356e-05, + "loss": 1.1866, + "step": 8764 + }, + { + "epoch": 0.2044273148944482, + "grad_norm": 1.657617211341858, + "learning_rate": 1.9923795612658602e-05, + "loss": 1.0427, + "step": 8765 + }, + { + "epoch": 0.20445063803362615, + "grad_norm": 1.6769158840179443, + "learning_rate": 1.99237645832369e-05, + "loss": 1.4225, + "step": 8766 + }, + { + "epoch": 0.20447396117280406, + "grad_norm": 1.8116823434829712, + "learning_rate": 1.9923733547523273e-05, + "loss": 1.4399, + "step": 8767 + }, + { + "epoch": 0.20449728431198197, + "grad_norm": 1.5913594961166382, + "learning_rate": 1.992370250551774e-05, + "loss": 1.396, + "step": 8768 + }, + { + "epoch": 0.20452060745115988, + "grad_norm": 1.9643194675445557, + "learning_rate": 1.9923671457220315e-05, + "loss": 1.4328, + "step": 8769 + }, + { + "epoch": 0.20454393059033782, + "grad_norm": 1.8307501077651978, + "learning_rate": 1.9923640402631023e-05, + "loss": 1.2509, + "step": 8770 + }, + { + "epoch": 0.20456725372951573, + "grad_norm": 1.800936222076416, + "learning_rate": 1.9923609341749884e-05, + "loss": 1.5155, + "step": 8771 + }, + { + "epoch": 0.20459057686869364, + "grad_norm": 2.303677558898926, + "learning_rate": 1.9923578274576917e-05, + "loss": 1.3478, + "step": 8772 + }, + { + "epoch": 0.20461390000787155, + "grad_norm": 2.0325570106506348, + "learning_rate": 1.9923547201112136e-05, + "loss": 1.4897, + "step": 8773 + }, + { + "epoch": 0.2046372231470495, + "grad_norm": 1.9240190982818604, + "learning_rate": 1.992351612135557e-05, + "loss": 1.4139, + "step": 8774 + }, + { + "epoch": 0.2046605462862274, + "grad_norm": 1.8259384632110596, + "learning_rate": 1.9923485035307232e-05, + "loss": 1.5495, + "step": 8775 + }, + { + "epoch": 0.2046838694254053, + "grad_norm": 1.4900740385055542, + "learning_rate": 1.9923453942967145e-05, + "loss": 1.3862, + "step": 8776 + }, + { + "epoch": 0.20470719256458322, + "grad_norm": 1.721463680267334, + "learning_rate": 1.9923422844335327e-05, + "loss": 1.2483, + "step": 8777 + }, + { + "epoch": 0.20473051570376113, + "grad_norm": 1.936254620552063, + "learning_rate": 1.9923391739411796e-05, + "loss": 1.6259, + "step": 8778 + }, + { + "epoch": 0.20475383884293907, + "grad_norm": 1.9787451028823853, + "learning_rate": 1.9923360628196578e-05, + "loss": 1.1056, + "step": 8779 + }, + { + "epoch": 0.20477716198211698, + "grad_norm": 1.792060136795044, + "learning_rate": 1.9923329510689688e-05, + "loss": 1.2141, + "step": 8780 + }, + { + "epoch": 0.2048004851212949, + "grad_norm": 1.6591053009033203, + "learning_rate": 1.9923298386891146e-05, + "loss": 1.4437, + "step": 8781 + }, + { + "epoch": 0.2048238082604728, + "grad_norm": 1.7790863513946533, + "learning_rate": 1.9923267256800975e-05, + "loss": 1.5406, + "step": 8782 + }, + { + "epoch": 0.20484713139965074, + "grad_norm": 2.250915765762329, + "learning_rate": 1.992323612041919e-05, + "loss": 1.4576, + "step": 8783 + }, + { + "epoch": 0.20487045453882866, + "grad_norm": 1.631272554397583, + "learning_rate": 1.992320497774581e-05, + "loss": 1.5338, + "step": 8784 + }, + { + "epoch": 0.20489377767800657, + "grad_norm": 2.0557973384857178, + "learning_rate": 1.9923173828780863e-05, + "loss": 1.6981, + "step": 8785 + }, + { + "epoch": 0.20491710081718448, + "grad_norm": 2.0663645267486572, + "learning_rate": 1.9923142673524357e-05, + "loss": 1.3737, + "step": 8786 + }, + { + "epoch": 0.20494042395636242, + "grad_norm": 1.84579336643219, + "learning_rate": 1.992311151197632e-05, + "loss": 1.5768, + "step": 8787 + }, + { + "epoch": 0.20496374709554033, + "grad_norm": 1.8822888135910034, + "learning_rate": 1.9923080344136775e-05, + "loss": 1.1937, + "step": 8788 + }, + { + "epoch": 0.20498707023471824, + "grad_norm": 1.9036933183670044, + "learning_rate": 1.992304917000573e-05, + "loss": 1.5067, + "step": 8789 + }, + { + "epoch": 0.20501039337389615, + "grad_norm": 1.7210967540740967, + "learning_rate": 1.9923017989583214e-05, + "loss": 1.4302, + "step": 8790 + }, + { + "epoch": 0.2050337165130741, + "grad_norm": 1.9354089498519897, + "learning_rate": 1.9922986802869243e-05, + "loss": 1.3187, + "step": 8791 + }, + { + "epoch": 0.205057039652252, + "grad_norm": 1.9072645902633667, + "learning_rate": 1.992295560986384e-05, + "loss": 1.4165, + "step": 8792 + }, + { + "epoch": 0.2050803627914299, + "grad_norm": 1.5938284397125244, + "learning_rate": 1.9922924410567018e-05, + "loss": 1.1451, + "step": 8793 + }, + { + "epoch": 0.20510368593060782, + "grad_norm": 1.992186188697815, + "learning_rate": 1.9922893204978808e-05, + "loss": 0.9548, + "step": 8794 + }, + { + "epoch": 0.20512700906978576, + "grad_norm": 1.7065699100494385, + "learning_rate": 1.992286199309922e-05, + "loss": 1.3368, + "step": 8795 + }, + { + "epoch": 0.20515033220896367, + "grad_norm": 2.340069055557251, + "learning_rate": 1.9922830774928275e-05, + "loss": 1.1548, + "step": 8796 + }, + { + "epoch": 0.20517365534814158, + "grad_norm": 1.9976458549499512, + "learning_rate": 1.9922799550466e-05, + "loss": 1.5394, + "step": 8797 + }, + { + "epoch": 0.2051969784873195, + "grad_norm": 2.1043970584869385, + "learning_rate": 1.9922768319712404e-05, + "loss": 1.7279, + "step": 8798 + }, + { + "epoch": 0.20522030162649743, + "grad_norm": 1.7726044654846191, + "learning_rate": 1.9922737082667515e-05, + "loss": 1.536, + "step": 8799 + }, + { + "epoch": 0.20524362476567534, + "grad_norm": 1.9535636901855469, + "learning_rate": 1.992270583933135e-05, + "loss": 1.6134, + "step": 8800 + }, + { + "epoch": 0.20526694790485325, + "grad_norm": 1.8124380111694336, + "learning_rate": 1.992267458970393e-05, + "loss": 1.3651, + "step": 8801 + }, + { + "epoch": 0.20529027104403116, + "grad_norm": 1.7399616241455078, + "learning_rate": 1.992264333378527e-05, + "loss": 1.5408, + "step": 8802 + }, + { + "epoch": 0.2053135941832091, + "grad_norm": 1.7448316812515259, + "learning_rate": 1.99226120715754e-05, + "loss": 1.4518, + "step": 8803 + }, + { + "epoch": 0.205336917322387, + "grad_norm": 1.7521073818206787, + "learning_rate": 1.9922580803074326e-05, + "loss": 1.2033, + "step": 8804 + }, + { + "epoch": 0.20536024046156492, + "grad_norm": 2.092233419418335, + "learning_rate": 1.992254952828208e-05, + "loss": 1.0959, + "step": 8805 + }, + { + "epoch": 0.20538356360074284, + "grad_norm": 1.6378101110458374, + "learning_rate": 1.9922518247198675e-05, + "loss": 1.3844, + "step": 8806 + }, + { + "epoch": 0.20540688673992075, + "grad_norm": 1.849848747253418, + "learning_rate": 1.9922486959824135e-05, + "loss": 1.5196, + "step": 8807 + }, + { + "epoch": 0.20543020987909869, + "grad_norm": 1.8436238765716553, + "learning_rate": 1.992245566615848e-05, + "loss": 1.3191, + "step": 8808 + }, + { + "epoch": 0.2054535330182766, + "grad_norm": 2.73968243598938, + "learning_rate": 1.992242436620172e-05, + "loss": 1.4113, + "step": 8809 + }, + { + "epoch": 0.2054768561574545, + "grad_norm": 2.0241315364837646, + "learning_rate": 1.9922393059953888e-05, + "loss": 1.519, + "step": 8810 + }, + { + "epoch": 0.20550017929663242, + "grad_norm": 1.9673908948898315, + "learning_rate": 1.9922361747415e-05, + "loss": 1.4782, + "step": 8811 + }, + { + "epoch": 0.20552350243581036, + "grad_norm": 2.1451499462127686, + "learning_rate": 1.9922330428585067e-05, + "loss": 1.7216, + "step": 8812 + }, + { + "epoch": 0.20554682557498827, + "grad_norm": 1.7855838537216187, + "learning_rate": 1.9922299103464124e-05, + "loss": 1.6039, + "step": 8813 + }, + { + "epoch": 0.20557014871416618, + "grad_norm": 1.6132923364639282, + "learning_rate": 1.9922267772052178e-05, + "loss": 1.2745, + "step": 8814 + }, + { + "epoch": 0.2055934718533441, + "grad_norm": 1.478954553604126, + "learning_rate": 1.9922236434349255e-05, + "loss": 1.4165, + "step": 8815 + }, + { + "epoch": 0.20561679499252203, + "grad_norm": 1.9536352157592773, + "learning_rate": 1.992220509035537e-05, + "loss": 1.3855, + "step": 8816 + }, + { + "epoch": 0.20564011813169994, + "grad_norm": 1.918891191482544, + "learning_rate": 1.992217374007055e-05, + "loss": 1.4946, + "step": 8817 + }, + { + "epoch": 0.20566344127087785, + "grad_norm": 2.120265483856201, + "learning_rate": 1.9922142383494812e-05, + "loss": 1.4393, + "step": 8818 + }, + { + "epoch": 0.20568676441005576, + "grad_norm": 1.618989109992981, + "learning_rate": 1.9922111020628177e-05, + "loss": 1.5332, + "step": 8819 + }, + { + "epoch": 0.2057100875492337, + "grad_norm": 1.767477035522461, + "learning_rate": 1.992207965147066e-05, + "loss": 1.1539, + "step": 8820 + }, + { + "epoch": 0.2057334106884116, + "grad_norm": 2.1358931064605713, + "learning_rate": 1.9922048276022282e-05, + "loss": 1.4643, + "step": 8821 + }, + { + "epoch": 0.20575673382758952, + "grad_norm": 1.7090671062469482, + "learning_rate": 1.9922016894283068e-05, + "loss": 1.1219, + "step": 8822 + }, + { + "epoch": 0.20578005696676743, + "grad_norm": 1.8848412036895752, + "learning_rate": 1.9921985506253035e-05, + "loss": 1.1351, + "step": 8823 + }, + { + "epoch": 0.20580338010594537, + "grad_norm": 1.7191115617752075, + "learning_rate": 1.9921954111932203e-05, + "loss": 1.339, + "step": 8824 + }, + { + "epoch": 0.20582670324512328, + "grad_norm": 2.4290459156036377, + "learning_rate": 1.992192271132059e-05, + "loss": 1.5941, + "step": 8825 + }, + { + "epoch": 0.2058500263843012, + "grad_norm": 2.552546977996826, + "learning_rate": 1.992189130441822e-05, + "loss": 1.8204, + "step": 8826 + }, + { + "epoch": 0.2058733495234791, + "grad_norm": 1.975969910621643, + "learning_rate": 1.9921859891225107e-05, + "loss": 1.1902, + "step": 8827 + }, + { + "epoch": 0.20589667266265704, + "grad_norm": 1.9177992343902588, + "learning_rate": 1.9921828471741274e-05, + "loss": 1.4441, + "step": 8828 + }, + { + "epoch": 0.20591999580183495, + "grad_norm": 1.7927231788635254, + "learning_rate": 1.9921797045966748e-05, + "loss": 1.4061, + "step": 8829 + }, + { + "epoch": 0.20594331894101287, + "grad_norm": 2.1434812545776367, + "learning_rate": 1.9921765613901532e-05, + "loss": 1.2991, + "step": 8830 + }, + { + "epoch": 0.20596664208019078, + "grad_norm": 1.827897071838379, + "learning_rate": 1.9921734175545663e-05, + "loss": 1.1525, + "step": 8831 + }, + { + "epoch": 0.20598996521936871, + "grad_norm": 1.8619117736816406, + "learning_rate": 1.992170273089915e-05, + "loss": 1.2108, + "step": 8832 + }, + { + "epoch": 0.20601328835854663, + "grad_norm": 1.6954329013824463, + "learning_rate": 1.9921671279962024e-05, + "loss": 1.5327, + "step": 8833 + }, + { + "epoch": 0.20603661149772454, + "grad_norm": 1.8722609281539917, + "learning_rate": 1.9921639822734292e-05, + "loss": 1.5927, + "step": 8834 + }, + { + "epoch": 0.20605993463690245, + "grad_norm": 1.865293025970459, + "learning_rate": 1.992160835921598e-05, + "loss": 1.311, + "step": 8835 + }, + { + "epoch": 0.20608325777608036, + "grad_norm": 2.044468641281128, + "learning_rate": 1.992157688940711e-05, + "loss": 1.4472, + "step": 8836 + }, + { + "epoch": 0.2061065809152583, + "grad_norm": 1.694170594215393, + "learning_rate": 1.99215454133077e-05, + "loss": 1.3139, + "step": 8837 + }, + { + "epoch": 0.2061299040544362, + "grad_norm": 1.898816466331482, + "learning_rate": 1.992151393091777e-05, + "loss": 1.2368, + "step": 8838 + }, + { + "epoch": 0.20615322719361412, + "grad_norm": 1.9734357595443726, + "learning_rate": 1.9921482442237338e-05, + "loss": 1.2225, + "step": 8839 + }, + { + "epoch": 0.20617655033279203, + "grad_norm": 1.9752990007400513, + "learning_rate": 1.9921450947266424e-05, + "loss": 1.4305, + "step": 8840 + }, + { + "epoch": 0.20619987347196997, + "grad_norm": 1.791849970817566, + "learning_rate": 1.9921419446005053e-05, + "loss": 1.3757, + "step": 8841 + }, + { + "epoch": 0.20622319661114788, + "grad_norm": 1.8073325157165527, + "learning_rate": 1.9921387938453237e-05, + "loss": 1.6193, + "step": 8842 + }, + { + "epoch": 0.2062465197503258, + "grad_norm": 2.058304786682129, + "learning_rate": 1.9921356424611005e-05, + "loss": 1.5403, + "step": 8843 + }, + { + "epoch": 0.2062698428895037, + "grad_norm": 1.9174165725708008, + "learning_rate": 1.992132490447837e-05, + "loss": 1.571, + "step": 8844 + }, + { + "epoch": 0.20629316602868164, + "grad_norm": 1.888541340827942, + "learning_rate": 1.9921293378055357e-05, + "loss": 1.3371, + "step": 8845 + }, + { + "epoch": 0.20631648916785955, + "grad_norm": 1.7331855297088623, + "learning_rate": 1.992126184534198e-05, + "loss": 1.08, + "step": 8846 + }, + { + "epoch": 0.20633981230703746, + "grad_norm": 1.8521579504013062, + "learning_rate": 1.9921230306338265e-05, + "loss": 1.36, + "step": 8847 + }, + { + "epoch": 0.20636313544621537, + "grad_norm": 1.7578471899032593, + "learning_rate": 1.9921198761044232e-05, + "loss": 1.2992, + "step": 8848 + }, + { + "epoch": 0.2063864585853933, + "grad_norm": 1.8637561798095703, + "learning_rate": 1.9921167209459896e-05, + "loss": 1.3249, + "step": 8849 + }, + { + "epoch": 0.20640978172457122, + "grad_norm": 1.8409662246704102, + "learning_rate": 1.992113565158528e-05, + "loss": 1.3963, + "step": 8850 + }, + { + "epoch": 0.20643310486374913, + "grad_norm": 1.9264672994613647, + "learning_rate": 1.9921104087420403e-05, + "loss": 1.6858, + "step": 8851 + }, + { + "epoch": 0.20645642800292704, + "grad_norm": 2.1116433143615723, + "learning_rate": 1.9921072516965287e-05, + "loss": 1.5501, + "step": 8852 + }, + { + "epoch": 0.20647975114210498, + "grad_norm": 1.825519323348999, + "learning_rate": 1.992104094021995e-05, + "loss": 1.3821, + "step": 8853 + }, + { + "epoch": 0.2065030742812829, + "grad_norm": 1.9409719705581665, + "learning_rate": 1.9921009357184408e-05, + "loss": 1.4545, + "step": 8854 + }, + { + "epoch": 0.2065263974204608, + "grad_norm": 1.9195404052734375, + "learning_rate": 1.992097776785869e-05, + "loss": 1.1922, + "step": 8855 + }, + { + "epoch": 0.20654972055963872, + "grad_norm": 1.652100920677185, + "learning_rate": 1.9920946172242808e-05, + "loss": 1.3986, + "step": 8856 + }, + { + "epoch": 0.20657304369881666, + "grad_norm": 1.8900972604751587, + "learning_rate": 1.992091457033679e-05, + "loss": 1.531, + "step": 8857 + }, + { + "epoch": 0.20659636683799457, + "grad_norm": 1.4838114976882935, + "learning_rate": 1.992088296214065e-05, + "loss": 1.4327, + "step": 8858 + }, + { + "epoch": 0.20661968997717248, + "grad_norm": 1.4899314641952515, + "learning_rate": 1.992085134765441e-05, + "loss": 1.2292, + "step": 8859 + }, + { + "epoch": 0.2066430131163504, + "grad_norm": 1.5614373683929443, + "learning_rate": 1.9920819726878087e-05, + "loss": 1.4186, + "step": 8860 + }, + { + "epoch": 0.2066663362555283, + "grad_norm": 2.139237642288208, + "learning_rate": 1.992078809981171e-05, + "loss": 1.5737, + "step": 8861 + }, + { + "epoch": 0.20668965939470624, + "grad_norm": 2.1542747020721436, + "learning_rate": 1.9920756466455288e-05, + "loss": 1.1823, + "step": 8862 + }, + { + "epoch": 0.20671298253388415, + "grad_norm": 2.5610742568969727, + "learning_rate": 1.992072482680885e-05, + "loss": 1.4806, + "step": 8863 + }, + { + "epoch": 0.20673630567306206, + "grad_norm": 1.64136803150177, + "learning_rate": 1.992069318087241e-05, + "loss": 1.3098, + "step": 8864 + }, + { + "epoch": 0.20675962881223997, + "grad_norm": 2.099841833114624, + "learning_rate": 1.992066152864599e-05, + "loss": 1.2724, + "step": 8865 + }, + { + "epoch": 0.2067829519514179, + "grad_norm": 1.7435263395309448, + "learning_rate": 1.9920629870129608e-05, + "loss": 1.1815, + "step": 8866 + }, + { + "epoch": 0.20680627509059582, + "grad_norm": 1.7112057209014893, + "learning_rate": 1.992059820532329e-05, + "loss": 1.5124, + "step": 8867 + }, + { + "epoch": 0.20682959822977373, + "grad_norm": 1.7069768905639648, + "learning_rate": 1.9920566534227045e-05, + "loss": 1.2702, + "step": 8868 + }, + { + "epoch": 0.20685292136895164, + "grad_norm": 1.9476577043533325, + "learning_rate": 1.9920534856840908e-05, + "loss": 1.4219, + "step": 8869 + }, + { + "epoch": 0.20687624450812958, + "grad_norm": 2.1855361461639404, + "learning_rate": 1.9920503173164886e-05, + "loss": 1.454, + "step": 8870 + }, + { + "epoch": 0.2068995676473075, + "grad_norm": 1.4870364665985107, + "learning_rate": 1.992047148319901e-05, + "loss": 1.0694, + "step": 8871 + }, + { + "epoch": 0.2069228907864854, + "grad_norm": 2.0364720821380615, + "learning_rate": 1.992043978694329e-05, + "loss": 1.5936, + "step": 8872 + }, + { + "epoch": 0.2069462139256633, + "grad_norm": 2.615400552749634, + "learning_rate": 1.9920408084397755e-05, + "loss": 1.7353, + "step": 8873 + }, + { + "epoch": 0.20696953706484125, + "grad_norm": 1.9306919574737549, + "learning_rate": 1.9920376375562416e-05, + "loss": 1.8241, + "step": 8874 + }, + { + "epoch": 0.20699286020401916, + "grad_norm": 1.9919795989990234, + "learning_rate": 1.99203446604373e-05, + "loss": 1.3006, + "step": 8875 + }, + { + "epoch": 0.20701618334319707, + "grad_norm": 1.9794896841049194, + "learning_rate": 1.9920312939022425e-05, + "loss": 1.3783, + "step": 8876 + }, + { + "epoch": 0.20703950648237499, + "grad_norm": 1.6727242469787598, + "learning_rate": 1.9920281211317815e-05, + "loss": 1.2052, + "step": 8877 + }, + { + "epoch": 0.20706282962155292, + "grad_norm": 1.7483283281326294, + "learning_rate": 1.992024947732348e-05, + "loss": 1.277, + "step": 8878 + }, + { + "epoch": 0.20708615276073083, + "grad_norm": 1.6579687595367432, + "learning_rate": 1.992021773703945e-05, + "loss": 1.4, + "step": 8879 + }, + { + "epoch": 0.20710947589990875, + "grad_norm": 1.9297069311141968, + "learning_rate": 1.992018599046574e-05, + "loss": 1.5716, + "step": 8880 + }, + { + "epoch": 0.20713279903908666, + "grad_norm": 2.1345489025115967, + "learning_rate": 1.9920154237602374e-05, + "loss": 1.8763, + "step": 8881 + }, + { + "epoch": 0.2071561221782646, + "grad_norm": 1.9327037334442139, + "learning_rate": 1.992012247844937e-05, + "loss": 1.5438, + "step": 8882 + }, + { + "epoch": 0.2071794453174425, + "grad_norm": 1.8208495378494263, + "learning_rate": 1.9920090713006743e-05, + "loss": 1.3595, + "step": 8883 + }, + { + "epoch": 0.20720276845662042, + "grad_norm": 1.770696759223938, + "learning_rate": 1.9920058941274522e-05, + "loss": 1.4383, + "step": 8884 + }, + { + "epoch": 0.20722609159579833, + "grad_norm": 1.6497207880020142, + "learning_rate": 1.9920027163252725e-05, + "loss": 1.2984, + "step": 8885 + }, + { + "epoch": 0.20724941473497627, + "grad_norm": 2.6174473762512207, + "learning_rate": 1.9919995378941367e-05, + "loss": 1.4454, + "step": 8886 + }, + { + "epoch": 0.20727273787415418, + "grad_norm": 1.7459523677825928, + "learning_rate": 1.991996358834047e-05, + "loss": 1.5941, + "step": 8887 + }, + { + "epoch": 0.2072960610133321, + "grad_norm": 1.976410150527954, + "learning_rate": 1.991993179145006e-05, + "loss": 1.2245, + "step": 8888 + }, + { + "epoch": 0.20731938415251, + "grad_norm": 1.9827830791473389, + "learning_rate": 1.9919899988270148e-05, + "loss": 1.2809, + "step": 8889 + }, + { + "epoch": 0.2073427072916879, + "grad_norm": 1.4886244535446167, + "learning_rate": 1.9919868178800762e-05, + "loss": 1.2854, + "step": 8890 + }, + { + "epoch": 0.20736603043086585, + "grad_norm": 1.7278337478637695, + "learning_rate": 1.991983636304192e-05, + "loss": 1.3221, + "step": 8891 + }, + { + "epoch": 0.20738935357004376, + "grad_norm": 2.2526237964630127, + "learning_rate": 1.9919804540993638e-05, + "loss": 1.6167, + "step": 8892 + }, + { + "epoch": 0.20741267670922167, + "grad_norm": 1.8424715995788574, + "learning_rate": 1.9919772712655943e-05, + "loss": 1.5498, + "step": 8893 + }, + { + "epoch": 0.20743599984839958, + "grad_norm": 1.7487908601760864, + "learning_rate": 1.9919740878028852e-05, + "loss": 1.6001, + "step": 8894 + }, + { + "epoch": 0.20745932298757752, + "grad_norm": 2.056684732437134, + "learning_rate": 1.9919709037112382e-05, + "loss": 1.4393, + "step": 8895 + }, + { + "epoch": 0.20748264612675543, + "grad_norm": 2.1060032844543457, + "learning_rate": 1.9919677189906558e-05, + "loss": 1.629, + "step": 8896 + }, + { + "epoch": 0.20750596926593334, + "grad_norm": 2.238145351409912, + "learning_rate": 1.9919645336411398e-05, + "loss": 1.4371, + "step": 8897 + }, + { + "epoch": 0.20752929240511125, + "grad_norm": 1.8802592754364014, + "learning_rate": 1.991961347662692e-05, + "loss": 1.2715, + "step": 8898 + }, + { + "epoch": 0.2075526155442892, + "grad_norm": 1.659878134727478, + "learning_rate": 1.991958161055315e-05, + "loss": 1.5157, + "step": 8899 + }, + { + "epoch": 0.2075759386834671, + "grad_norm": 2.106590747833252, + "learning_rate": 1.9919549738190103e-05, + "loss": 1.3419, + "step": 8900 + }, + { + "epoch": 0.20759926182264501, + "grad_norm": 2.1046085357666016, + "learning_rate": 1.9919517859537797e-05, + "loss": 1.5822, + "step": 8901 + }, + { + "epoch": 0.20762258496182293, + "grad_norm": 1.6544556617736816, + "learning_rate": 1.9919485974596262e-05, + "loss": 1.489, + "step": 8902 + }, + { + "epoch": 0.20764590810100086, + "grad_norm": 2.0881459712982178, + "learning_rate": 1.9919454083365513e-05, + "loss": 1.4799, + "step": 8903 + }, + { + "epoch": 0.20766923124017878, + "grad_norm": 1.8463943004608154, + "learning_rate": 1.9919422185845566e-05, + "loss": 1.5057, + "step": 8904 + }, + { + "epoch": 0.2076925543793567, + "grad_norm": 2.0330324172973633, + "learning_rate": 1.991939028203645e-05, + "loss": 1.6595, + "step": 8905 + }, + { + "epoch": 0.2077158775185346, + "grad_norm": 2.062559127807617, + "learning_rate": 1.9919358371938174e-05, + "loss": 1.4329, + "step": 8906 + }, + { + "epoch": 0.20773920065771254, + "grad_norm": 1.8217717409133911, + "learning_rate": 1.9919326455550767e-05, + "loss": 1.2091, + "step": 8907 + }, + { + "epoch": 0.20776252379689045, + "grad_norm": 2.250155448913574, + "learning_rate": 1.9919294532874246e-05, + "loss": 1.6404, + "step": 8908 + }, + { + "epoch": 0.20778584693606836, + "grad_norm": 1.8394792079925537, + "learning_rate": 1.991926260390863e-05, + "loss": 1.3912, + "step": 8909 + }, + { + "epoch": 0.20780917007524627, + "grad_norm": 1.7551653385162354, + "learning_rate": 1.9919230668653947e-05, + "loss": 1.4195, + "step": 8910 + }, + { + "epoch": 0.2078324932144242, + "grad_norm": 1.9586412906646729, + "learning_rate": 1.9919198727110207e-05, + "loss": 1.2036, + "step": 8911 + }, + { + "epoch": 0.20785581635360212, + "grad_norm": 1.888896107673645, + "learning_rate": 1.9919166779277438e-05, + "loss": 1.3466, + "step": 8912 + }, + { + "epoch": 0.20787913949278003, + "grad_norm": 2.0014193058013916, + "learning_rate": 1.991913482515565e-05, + "loss": 1.2896, + "step": 8913 + }, + { + "epoch": 0.20790246263195794, + "grad_norm": 1.9210609197616577, + "learning_rate": 1.9919102864744876e-05, + "loss": 1.5128, + "step": 8914 + }, + { + "epoch": 0.20792578577113588, + "grad_norm": 1.7737550735473633, + "learning_rate": 1.991907089804513e-05, + "loss": 1.5872, + "step": 8915 + }, + { + "epoch": 0.2079491089103138, + "grad_norm": 1.9794683456420898, + "learning_rate": 1.9919038925056433e-05, + "loss": 1.5188, + "step": 8916 + }, + { + "epoch": 0.2079724320494917, + "grad_norm": 2.306197166442871, + "learning_rate": 1.99190069457788e-05, + "loss": 1.0732, + "step": 8917 + }, + { + "epoch": 0.2079957551886696, + "grad_norm": 1.9478257894515991, + "learning_rate": 1.9918974960212262e-05, + "loss": 1.1285, + "step": 8918 + }, + { + "epoch": 0.20801907832784752, + "grad_norm": 1.8708724975585938, + "learning_rate": 1.991894296835683e-05, + "loss": 1.2511, + "step": 8919 + }, + { + "epoch": 0.20804240146702546, + "grad_norm": 1.604002594947815, + "learning_rate": 1.9918910970212528e-05, + "loss": 1.35, + "step": 8920 + }, + { + "epoch": 0.20806572460620337, + "grad_norm": 1.7653684616088867, + "learning_rate": 1.9918878965779377e-05, + "loss": 1.3181, + "step": 8921 + }, + { + "epoch": 0.20808904774538128, + "grad_norm": 1.9717421531677246, + "learning_rate": 1.9918846955057396e-05, + "loss": 1.5109, + "step": 8922 + }, + { + "epoch": 0.2081123708845592, + "grad_norm": 1.49480402469635, + "learning_rate": 1.991881493804661e-05, + "loss": 1.1379, + "step": 8923 + }, + { + "epoch": 0.20813569402373713, + "grad_norm": 2.0267350673675537, + "learning_rate": 1.9918782914747025e-05, + "loss": 1.3152, + "step": 8924 + }, + { + "epoch": 0.20815901716291504, + "grad_norm": 2.402163028717041, + "learning_rate": 1.991875088515868e-05, + "loss": 1.5545, + "step": 8925 + }, + { + "epoch": 0.20818234030209296, + "grad_norm": 1.9399529695510864, + "learning_rate": 1.991871884928158e-05, + "loss": 1.5063, + "step": 8926 + }, + { + "epoch": 0.20820566344127087, + "grad_norm": 1.9817485809326172, + "learning_rate": 1.9918686807115755e-05, + "loss": 1.4723, + "step": 8927 + }, + { + "epoch": 0.2082289865804488, + "grad_norm": 1.828019380569458, + "learning_rate": 1.9918654758661224e-05, + "loss": 1.2797, + "step": 8928 + }, + { + "epoch": 0.20825230971962672, + "grad_norm": 2.1117448806762695, + "learning_rate": 1.9918622703918006e-05, + "loss": 1.4576, + "step": 8929 + }, + { + "epoch": 0.20827563285880463, + "grad_norm": 1.6421380043029785, + "learning_rate": 1.9918590642886117e-05, + "loss": 1.3288, + "step": 8930 + }, + { + "epoch": 0.20829895599798254, + "grad_norm": 1.8511327505111694, + "learning_rate": 1.9918558575565584e-05, + "loss": 1.3026, + "step": 8931 + }, + { + "epoch": 0.20832227913716048, + "grad_norm": 1.8834081888198853, + "learning_rate": 1.991852650195642e-05, + "loss": 1.0507, + "step": 8932 + }, + { + "epoch": 0.2083456022763384, + "grad_norm": 2.1445794105529785, + "learning_rate": 1.9918494422058655e-05, + "loss": 1.5095, + "step": 8933 + }, + { + "epoch": 0.2083689254155163, + "grad_norm": 1.6548465490341187, + "learning_rate": 1.9918462335872303e-05, + "loss": 1.1371, + "step": 8934 + }, + { + "epoch": 0.2083922485546942, + "grad_norm": 1.772477149963379, + "learning_rate": 1.9918430243397385e-05, + "loss": 1.6767, + "step": 8935 + }, + { + "epoch": 0.20841557169387215, + "grad_norm": 1.6616123914718628, + "learning_rate": 1.9918398144633922e-05, + "loss": 1.5473, + "step": 8936 + }, + { + "epoch": 0.20843889483305006, + "grad_norm": 1.8380600214004517, + "learning_rate": 1.991836603958193e-05, + "loss": 1.4756, + "step": 8937 + }, + { + "epoch": 0.20846221797222797, + "grad_norm": 1.7076685428619385, + "learning_rate": 1.991833392824144e-05, + "loss": 1.3724, + "step": 8938 + }, + { + "epoch": 0.20848554111140588, + "grad_norm": 1.8073532581329346, + "learning_rate": 1.9918301810612463e-05, + "loss": 1.1355, + "step": 8939 + }, + { + "epoch": 0.20850886425058382, + "grad_norm": 1.9371896982192993, + "learning_rate": 1.9918269686695022e-05, + "loss": 1.3418, + "step": 8940 + }, + { + "epoch": 0.20853218738976173, + "grad_norm": 2.1045961380004883, + "learning_rate": 1.991823755648914e-05, + "loss": 1.3505, + "step": 8941 + }, + { + "epoch": 0.20855551052893964, + "grad_norm": 1.512935757637024, + "learning_rate": 1.991820541999483e-05, + "loss": 1.3489, + "step": 8942 + }, + { + "epoch": 0.20857883366811755, + "grad_norm": 1.7579790353775024, + "learning_rate": 1.9918173277212123e-05, + "loss": 1.487, + "step": 8943 + }, + { + "epoch": 0.2086021568072955, + "grad_norm": 2.223262310028076, + "learning_rate": 1.991814112814103e-05, + "loss": 1.2722, + "step": 8944 + }, + { + "epoch": 0.2086254799464734, + "grad_norm": 2.3253536224365234, + "learning_rate": 1.9918108972781575e-05, + "loss": 1.647, + "step": 8945 + }, + { + "epoch": 0.2086488030856513, + "grad_norm": 1.940615177154541, + "learning_rate": 1.991807681113378e-05, + "loss": 1.173, + "step": 8946 + }, + { + "epoch": 0.20867212622482922, + "grad_norm": 2.31413197517395, + "learning_rate": 1.9918044643197665e-05, + "loss": 1.4387, + "step": 8947 + }, + { + "epoch": 0.20869544936400714, + "grad_norm": 2.253386974334717, + "learning_rate": 1.9918012468973246e-05, + "loss": 1.6272, + "step": 8948 + }, + { + "epoch": 0.20871877250318507, + "grad_norm": 2.0728704929351807, + "learning_rate": 1.991798028846055e-05, + "loss": 1.6382, + "step": 8949 + }, + { + "epoch": 0.20874209564236298, + "grad_norm": 1.941677212715149, + "learning_rate": 1.9917948101659596e-05, + "loss": 1.8738, + "step": 8950 + }, + { + "epoch": 0.2087654187815409, + "grad_norm": 1.6983940601348877, + "learning_rate": 1.9917915908570395e-05, + "loss": 1.4235, + "step": 8951 + }, + { + "epoch": 0.2087887419207188, + "grad_norm": 1.7597581148147583, + "learning_rate": 1.991788370919298e-05, + "loss": 1.3217, + "step": 8952 + }, + { + "epoch": 0.20881206505989675, + "grad_norm": 1.9837123155593872, + "learning_rate": 1.9917851503527366e-05, + "loss": 0.9669, + "step": 8953 + }, + { + "epoch": 0.20883538819907466, + "grad_norm": 2.034768581390381, + "learning_rate": 1.9917819291573575e-05, + "loss": 1.4135, + "step": 8954 + }, + { + "epoch": 0.20885871133825257, + "grad_norm": 1.9721709489822388, + "learning_rate": 1.9917787073331622e-05, + "loss": 1.3557, + "step": 8955 + }, + { + "epoch": 0.20888203447743048, + "grad_norm": 1.836617350578308, + "learning_rate": 1.9917754848801537e-05, + "loss": 1.3505, + "step": 8956 + }, + { + "epoch": 0.20890535761660842, + "grad_norm": 1.7302457094192505, + "learning_rate": 1.991772261798333e-05, + "loss": 1.4848, + "step": 8957 + }, + { + "epoch": 0.20892868075578633, + "grad_norm": 1.9135240316390991, + "learning_rate": 1.991769038087703e-05, + "loss": 1.5674, + "step": 8958 + }, + { + "epoch": 0.20895200389496424, + "grad_norm": 1.7172304391860962, + "learning_rate": 1.9917658137482652e-05, + "loss": 0.8983, + "step": 8959 + }, + { + "epoch": 0.20897532703414215, + "grad_norm": 1.8652372360229492, + "learning_rate": 1.991762588780022e-05, + "loss": 1.4429, + "step": 8960 + }, + { + "epoch": 0.2089986501733201, + "grad_norm": 2.126966714859009, + "learning_rate": 1.991759363182975e-05, + "loss": 1.0319, + "step": 8961 + }, + { + "epoch": 0.209021973312498, + "grad_norm": 1.5285292863845825, + "learning_rate": 1.9917561369571267e-05, + "loss": 1.0554, + "step": 8962 + }, + { + "epoch": 0.2090452964516759, + "grad_norm": 1.8848071098327637, + "learning_rate": 1.991752910102479e-05, + "loss": 1.2429, + "step": 8963 + }, + { + "epoch": 0.20906861959085382, + "grad_norm": 2.039638042449951, + "learning_rate": 1.991749682619034e-05, + "loss": 1.4376, + "step": 8964 + }, + { + "epoch": 0.20909194273003176, + "grad_norm": 1.9621413946151733, + "learning_rate": 1.9917464545067936e-05, + "loss": 1.2362, + "step": 8965 + }, + { + "epoch": 0.20911526586920967, + "grad_norm": 2.0251457691192627, + "learning_rate": 1.9917432257657595e-05, + "loss": 1.5834, + "step": 8966 + }, + { + "epoch": 0.20913858900838758, + "grad_norm": 2.412130117416382, + "learning_rate": 1.9917399963959348e-05, + "loss": 1.7807, + "step": 8967 + }, + { + "epoch": 0.2091619121475655, + "grad_norm": 1.6178112030029297, + "learning_rate": 1.9917367663973206e-05, + "loss": 1.46, + "step": 8968 + }, + { + "epoch": 0.20918523528674343, + "grad_norm": 1.969982385635376, + "learning_rate": 1.9917335357699194e-05, + "loss": 1.8451, + "step": 8969 + }, + { + "epoch": 0.20920855842592134, + "grad_norm": 1.9167366027832031, + "learning_rate": 1.9917303045137327e-05, + "loss": 1.5903, + "step": 8970 + }, + { + "epoch": 0.20923188156509925, + "grad_norm": 2.287653684616089, + "learning_rate": 1.9917270726287633e-05, + "loss": 1.6476, + "step": 8971 + }, + { + "epoch": 0.20925520470427716, + "grad_norm": 2.193716049194336, + "learning_rate": 1.991723840115013e-05, + "loss": 1.433, + "step": 8972 + }, + { + "epoch": 0.2092785278434551, + "grad_norm": 1.622120976448059, + "learning_rate": 1.9917206069724834e-05, + "loss": 1.4023, + "step": 8973 + }, + { + "epoch": 0.20930185098263301, + "grad_norm": 1.7062550783157349, + "learning_rate": 1.991717373201177e-05, + "loss": 1.471, + "step": 8974 + }, + { + "epoch": 0.20932517412181093, + "grad_norm": 1.5441185235977173, + "learning_rate": 1.991714138801096e-05, + "loss": 1.3545, + "step": 8975 + }, + { + "epoch": 0.20934849726098884, + "grad_norm": 1.4008463621139526, + "learning_rate": 1.991710903772242e-05, + "loss": 1.326, + "step": 8976 + }, + { + "epoch": 0.20937182040016675, + "grad_norm": 1.732420563697815, + "learning_rate": 1.9917076681146173e-05, + "loss": 1.4796, + "step": 8977 + }, + { + "epoch": 0.20939514353934469, + "grad_norm": 1.8088231086730957, + "learning_rate": 1.9917044318282238e-05, + "loss": 1.671, + "step": 8978 + }, + { + "epoch": 0.2094184666785226, + "grad_norm": 1.6906882524490356, + "learning_rate": 1.991701194913064e-05, + "loss": 1.1608, + "step": 8979 + }, + { + "epoch": 0.2094417898177005, + "grad_norm": 2.563253402709961, + "learning_rate": 1.9916979573691392e-05, + "loss": 1.4332, + "step": 8980 + }, + { + "epoch": 0.20946511295687842, + "grad_norm": 2.9737558364868164, + "learning_rate": 1.991694719196452e-05, + "loss": 1.7824, + "step": 8981 + }, + { + "epoch": 0.20948843609605636, + "grad_norm": 2.010021209716797, + "learning_rate": 1.9916914803950043e-05, + "loss": 1.1484, + "step": 8982 + }, + { + "epoch": 0.20951175923523427, + "grad_norm": 1.8764690160751343, + "learning_rate": 1.9916882409647983e-05, + "loss": 1.3175, + "step": 8983 + }, + { + "epoch": 0.20953508237441218, + "grad_norm": 1.9616918563842773, + "learning_rate": 1.9916850009058356e-05, + "loss": 1.3959, + "step": 8984 + }, + { + "epoch": 0.2095584055135901, + "grad_norm": 1.6082313060760498, + "learning_rate": 1.9916817602181186e-05, + "loss": 1.3168, + "step": 8985 + }, + { + "epoch": 0.20958172865276803, + "grad_norm": 2.006948232650757, + "learning_rate": 1.9916785189016498e-05, + "loss": 1.4186, + "step": 8986 + }, + { + "epoch": 0.20960505179194594, + "grad_norm": 2.04799485206604, + "learning_rate": 1.9916752769564304e-05, + "loss": 1.5701, + "step": 8987 + }, + { + "epoch": 0.20962837493112385, + "grad_norm": 1.5707134008407593, + "learning_rate": 1.991672034382463e-05, + "loss": 1.1264, + "step": 8988 + }, + { + "epoch": 0.20965169807030176, + "grad_norm": 1.97533118724823, + "learning_rate": 1.9916687911797493e-05, + "loss": 1.3683, + "step": 8989 + }, + { + "epoch": 0.2096750212094797, + "grad_norm": 1.4880280494689941, + "learning_rate": 1.9916655473482915e-05, + "loss": 1.089, + "step": 8990 + }, + { + "epoch": 0.2096983443486576, + "grad_norm": 2.3304264545440674, + "learning_rate": 1.9916623028880918e-05, + "loss": 1.1888, + "step": 8991 + }, + { + "epoch": 0.20972166748783552, + "grad_norm": 1.771702766418457, + "learning_rate": 1.9916590577991527e-05, + "loss": 1.4043, + "step": 8992 + }, + { + "epoch": 0.20974499062701343, + "grad_norm": 2.0741004943847656, + "learning_rate": 1.9916558120814752e-05, + "loss": 1.79, + "step": 8993 + }, + { + "epoch": 0.20976831376619137, + "grad_norm": 1.7142226696014404, + "learning_rate": 1.9916525657350618e-05, + "loss": 1.1207, + "step": 8994 + }, + { + "epoch": 0.20979163690536928, + "grad_norm": 1.9289735555648804, + "learning_rate": 1.991649318759915e-05, + "loss": 1.3861, + "step": 8995 + }, + { + "epoch": 0.2098149600445472, + "grad_norm": 1.7312073707580566, + "learning_rate": 1.991646071156036e-05, + "loss": 1.408, + "step": 8996 + }, + { + "epoch": 0.2098382831837251, + "grad_norm": 1.6762757301330566, + "learning_rate": 1.9916428229234276e-05, + "loss": 1.4668, + "step": 8997 + }, + { + "epoch": 0.20986160632290304, + "grad_norm": 2.100386619567871, + "learning_rate": 1.9916395740620917e-05, + "loss": 1.3201, + "step": 8998 + }, + { + "epoch": 0.20988492946208095, + "grad_norm": 1.804983139038086, + "learning_rate": 1.9916363245720304e-05, + "loss": 1.2731, + "step": 8999 + }, + { + "epoch": 0.20990825260125887, + "grad_norm": 2.026416540145874, + "learning_rate": 1.9916330744532453e-05, + "loss": 1.2903, + "step": 9000 + } + ], + "logging_steps": 1, + "max_steps": 128625, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.297411924549763e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}