{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 4690, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010660980810234541, "grad_norm": 5.4528142214861735, "learning_rate": 1.7057569296375268e-07, "loss": 0.837, "step": 1 }, { "epoch": 0.0021321961620469083, "grad_norm": 5.511675498609771, "learning_rate": 3.4115138592750537e-07, "loss": 0.8508, "step": 2 }, { "epoch": 0.0031982942430703624, "grad_norm": 5.464511849599797, "learning_rate": 5.11727078891258e-07, "loss": 0.8447, "step": 3 }, { "epoch": 0.0042643923240938165, "grad_norm": 5.447268946229452, "learning_rate": 6.823027718550107e-07, "loss": 0.836, "step": 4 }, { "epoch": 0.005330490405117271, "grad_norm": 5.381379257626523, "learning_rate": 8.528784648187634e-07, "loss": 0.8463, "step": 5 }, { "epoch": 0.006396588486140725, "grad_norm": 5.4441126548183725, "learning_rate": 1.023454157782516e-06, "loss": 0.8514, "step": 6 }, { "epoch": 0.007462686567164179, "grad_norm": 5.008014360316408, "learning_rate": 1.1940298507462686e-06, "loss": 0.8215, "step": 7 }, { "epoch": 0.008528784648187633, "grad_norm": 5.000283670760778, "learning_rate": 1.3646055437100215e-06, "loss": 0.8361, "step": 8 }, { "epoch": 0.009594882729211088, "grad_norm": 3.964243964413472, "learning_rate": 1.5351812366737743e-06, "loss": 0.7919, "step": 9 }, { "epoch": 0.010660980810234541, "grad_norm": 3.7864879773612823, "learning_rate": 1.7057569296375267e-06, "loss": 0.7889, "step": 10 }, { "epoch": 0.011727078891257996, "grad_norm": 2.512068820205037, "learning_rate": 1.8763326226012796e-06, "loss": 0.7586, "step": 11 }, { "epoch": 0.01279317697228145, "grad_norm": 2.259289045541819, "learning_rate": 2.046908315565032e-06, "loss": 0.7527, "step": 12 }, { "epoch": 0.013859275053304905, "grad_norm": 2.138291149917742, "learning_rate": 2.217484008528785e-06, "loss": 0.7504, "step": 13 }, { "epoch": 0.014925373134328358, "grad_norm": 1.9179684528050671, "learning_rate": 2.3880597014925373e-06, "loss": 0.7449, "step": 14 }, { "epoch": 0.015991471215351813, "grad_norm": 2.0680127043179235, "learning_rate": 2.55863539445629e-06, "loss": 0.7272, "step": 15 }, { "epoch": 0.017057569296375266, "grad_norm": 3.1604717469534447, "learning_rate": 2.729211087420043e-06, "loss": 0.7213, "step": 16 }, { "epoch": 0.01812366737739872, "grad_norm": 3.278566086414656, "learning_rate": 2.8997867803837954e-06, "loss": 0.7202, "step": 17 }, { "epoch": 0.019189765458422176, "grad_norm": 3.1700600356728033, "learning_rate": 3.0703624733475486e-06, "loss": 0.7189, "step": 18 }, { "epoch": 0.02025586353944563, "grad_norm": 3.0282075577668857, "learning_rate": 3.240938166311301e-06, "loss": 0.7239, "step": 19 }, { "epoch": 0.021321961620469083, "grad_norm": 2.652594115580126, "learning_rate": 3.4115138592750535e-06, "loss": 0.701, "step": 20 }, { "epoch": 0.022388059701492536, "grad_norm": 1.9034175357194445, "learning_rate": 3.582089552238806e-06, "loss": 0.6933, "step": 21 }, { "epoch": 0.023454157782515993, "grad_norm": 1.5538529247302069, "learning_rate": 3.752665245202559e-06, "loss": 0.6571, "step": 22 }, { "epoch": 0.024520255863539446, "grad_norm": 1.2849096876637858, "learning_rate": 3.9232409381663116e-06, "loss": 0.6563, "step": 23 }, { "epoch": 0.0255863539445629, "grad_norm": 1.0948864208805895, "learning_rate": 4.093816631130064e-06, "loss": 0.6466, "step": 24 }, { "epoch": 0.026652452025586353, "grad_norm": 1.0624300872894943, "learning_rate": 4.264392324093816e-06, "loss": 0.6424, "step": 25 }, { "epoch": 0.02771855010660981, "grad_norm": 1.131312650641839, "learning_rate": 4.43496801705757e-06, "loss": 0.6369, "step": 26 }, { "epoch": 0.028784648187633263, "grad_norm": 1.0990409054108197, "learning_rate": 4.605543710021322e-06, "loss": 0.624, "step": 27 }, { "epoch": 0.029850746268656716, "grad_norm": 0.9649930973741134, "learning_rate": 4.7761194029850745e-06, "loss": 0.6149, "step": 28 }, { "epoch": 0.03091684434968017, "grad_norm": 0.7025445114737157, "learning_rate": 4.946695095948828e-06, "loss": 0.6143, "step": 29 }, { "epoch": 0.031982942430703626, "grad_norm": 0.7153092735896356, "learning_rate": 5.11727078891258e-06, "loss": 0.6087, "step": 30 }, { "epoch": 0.03304904051172708, "grad_norm": 0.8673451563666814, "learning_rate": 5.2878464818763335e-06, "loss": 0.6062, "step": 31 }, { "epoch": 0.03411513859275053, "grad_norm": 0.8051803897020771, "learning_rate": 5.458422174840086e-06, "loss": 0.5961, "step": 32 }, { "epoch": 0.035181236673773986, "grad_norm": 0.6282947287043709, "learning_rate": 5.628997867803838e-06, "loss": 0.5945, "step": 33 }, { "epoch": 0.03624733475479744, "grad_norm": 0.5284036506183948, "learning_rate": 5.799573560767591e-06, "loss": 0.5873, "step": 34 }, { "epoch": 0.03731343283582089, "grad_norm": 0.6560915453306406, "learning_rate": 5.970149253731343e-06, "loss": 0.5926, "step": 35 }, { "epoch": 0.03837953091684435, "grad_norm": 0.6254389118129263, "learning_rate": 6.140724946695097e-06, "loss": 0.5925, "step": 36 }, { "epoch": 0.039445628997867806, "grad_norm": 0.4747128371854551, "learning_rate": 6.31130063965885e-06, "loss": 0.5789, "step": 37 }, { "epoch": 0.04051172707889126, "grad_norm": 0.43328964711656565, "learning_rate": 6.481876332622602e-06, "loss": 0.5777, "step": 38 }, { "epoch": 0.04157782515991471, "grad_norm": 0.5041187156835155, "learning_rate": 6.6524520255863545e-06, "loss": 0.5731, "step": 39 }, { "epoch": 0.042643923240938165, "grad_norm": 0.4996573987096877, "learning_rate": 6.823027718550107e-06, "loss": 0.5704, "step": 40 }, { "epoch": 0.04371002132196162, "grad_norm": 0.40178653209472504, "learning_rate": 6.993603411513859e-06, "loss": 0.5687, "step": 41 }, { "epoch": 0.04477611940298507, "grad_norm": 0.39678565068179716, "learning_rate": 7.164179104477612e-06, "loss": 0.5606, "step": 42 }, { "epoch": 0.04584221748400853, "grad_norm": 0.3838477339215644, "learning_rate": 7.334754797441366e-06, "loss": 0.5551, "step": 43 }, { "epoch": 0.046908315565031986, "grad_norm": 0.39376377958875114, "learning_rate": 7.505330490405118e-06, "loss": 0.5586, "step": 44 }, { "epoch": 0.04797441364605544, "grad_norm": 0.4089383682673747, "learning_rate": 7.67590618336887e-06, "loss": 0.544, "step": 45 }, { "epoch": 0.04904051172707889, "grad_norm": 0.3674905464499231, "learning_rate": 7.846481876332623e-06, "loss": 0.5562, "step": 46 }, { "epoch": 0.050106609808102345, "grad_norm": 0.3391811295227044, "learning_rate": 8.017057569296376e-06, "loss": 0.5489, "step": 47 }, { "epoch": 0.0511727078891258, "grad_norm": 0.3771487479148055, "learning_rate": 8.187633262260128e-06, "loss": 0.5553, "step": 48 }, { "epoch": 0.05223880597014925, "grad_norm": 0.3264015224929875, "learning_rate": 8.35820895522388e-06, "loss": 0.5505, "step": 49 }, { "epoch": 0.053304904051172705, "grad_norm": 0.3331723647408018, "learning_rate": 8.528784648187633e-06, "loss": 0.5357, "step": 50 }, { "epoch": 0.054371002132196165, "grad_norm": 0.37485261764922745, "learning_rate": 8.699360341151387e-06, "loss": 0.5441, "step": 51 }, { "epoch": 0.05543710021321962, "grad_norm": 0.9120774222301191, "learning_rate": 8.86993603411514e-06, "loss": 0.5324, "step": 52 }, { "epoch": 0.05650319829424307, "grad_norm": 0.3263354859744661, "learning_rate": 9.040511727078892e-06, "loss": 0.5352, "step": 53 }, { "epoch": 0.057569296375266525, "grad_norm": 0.35885753984901503, "learning_rate": 9.211087420042644e-06, "loss": 0.5405, "step": 54 }, { "epoch": 0.05863539445628998, "grad_norm": 0.29649061716771913, "learning_rate": 9.381663113006397e-06, "loss": 0.5361, "step": 55 }, { "epoch": 0.05970149253731343, "grad_norm": 0.24951946598497796, "learning_rate": 9.552238805970149e-06, "loss": 0.5301, "step": 56 }, { "epoch": 0.060767590618336885, "grad_norm": 0.3772047271710351, "learning_rate": 9.722814498933903e-06, "loss": 0.5448, "step": 57 }, { "epoch": 0.06183368869936034, "grad_norm": 0.2493874964259867, "learning_rate": 9.893390191897656e-06, "loss": 0.522, "step": 58 }, { "epoch": 0.0628997867803838, "grad_norm": 0.24173737961013422, "learning_rate": 1.0063965884861408e-05, "loss": 0.5295, "step": 59 }, { "epoch": 0.06396588486140725, "grad_norm": 0.3214864858366852, "learning_rate": 1.023454157782516e-05, "loss": 0.534, "step": 60 }, { "epoch": 0.0650319829424307, "grad_norm": 0.22660214109074844, "learning_rate": 1.0405117270788913e-05, "loss": 0.5207, "step": 61 }, { "epoch": 0.06609808102345416, "grad_norm": 0.22692046089904563, "learning_rate": 1.0575692963752667e-05, "loss": 0.524, "step": 62 }, { "epoch": 0.06716417910447761, "grad_norm": 0.267684560903461, "learning_rate": 1.074626865671642e-05, "loss": 0.5241, "step": 63 }, { "epoch": 0.06823027718550106, "grad_norm": 0.2145100827755043, "learning_rate": 1.0916844349680172e-05, "loss": 0.5267, "step": 64 }, { "epoch": 0.06929637526652452, "grad_norm": 0.2581454974388823, "learning_rate": 1.1087420042643924e-05, "loss": 0.5263, "step": 65 }, { "epoch": 0.07036247334754797, "grad_norm": 0.23991967383468807, "learning_rate": 1.1257995735607677e-05, "loss": 0.5165, "step": 66 }, { "epoch": 0.07142857142857142, "grad_norm": 0.19942140384569496, "learning_rate": 1.1428571428571429e-05, "loss": 0.5168, "step": 67 }, { "epoch": 0.07249466950959488, "grad_norm": 0.2412614596538889, "learning_rate": 1.1599147121535181e-05, "loss": 0.5137, "step": 68 }, { "epoch": 0.07356076759061833, "grad_norm": 0.213075163701682, "learning_rate": 1.1769722814498934e-05, "loss": 0.5177, "step": 69 }, { "epoch": 0.07462686567164178, "grad_norm": 0.2078353315515298, "learning_rate": 1.1940298507462686e-05, "loss": 0.5203, "step": 70 }, { "epoch": 0.07569296375266525, "grad_norm": 0.24803061207482333, "learning_rate": 1.2110874200426442e-05, "loss": 0.5127, "step": 71 }, { "epoch": 0.0767590618336887, "grad_norm": 0.200388439144567, "learning_rate": 1.2281449893390195e-05, "loss": 0.524, "step": 72 }, { "epoch": 0.07782515991471216, "grad_norm": 0.26609196507744726, "learning_rate": 1.2452025586353947e-05, "loss": 0.5166, "step": 73 }, { "epoch": 0.07889125799573561, "grad_norm": 0.2746212953481589, "learning_rate": 1.26226012793177e-05, "loss": 0.5097, "step": 74 }, { "epoch": 0.07995735607675906, "grad_norm": 0.25408109337829365, "learning_rate": 1.2793176972281452e-05, "loss": 0.5124, "step": 75 }, { "epoch": 0.08102345415778252, "grad_norm": 0.23041327392909428, "learning_rate": 1.2963752665245204e-05, "loss": 0.5114, "step": 76 }, { "epoch": 0.08208955223880597, "grad_norm": 0.25359601440739266, "learning_rate": 1.3134328358208957e-05, "loss": 0.5126, "step": 77 }, { "epoch": 0.08315565031982942, "grad_norm": 0.26693374625690547, "learning_rate": 1.3304904051172709e-05, "loss": 0.5026, "step": 78 }, { "epoch": 0.08422174840085288, "grad_norm": 0.32689327987808636, "learning_rate": 1.3475479744136461e-05, "loss": 0.51, "step": 79 }, { "epoch": 0.08528784648187633, "grad_norm": 0.3372086808231497, "learning_rate": 1.3646055437100214e-05, "loss": 0.5026, "step": 80 }, { "epoch": 0.08635394456289978, "grad_norm": 0.3813830415514882, "learning_rate": 1.3816631130063966e-05, "loss": 0.5066, "step": 81 }, { "epoch": 0.08742004264392324, "grad_norm": 0.5438093416524072, "learning_rate": 1.3987206823027719e-05, "loss": 0.5164, "step": 82 }, { "epoch": 0.08848614072494669, "grad_norm": 0.6495392831189992, "learning_rate": 1.4157782515991471e-05, "loss": 0.5043, "step": 83 }, { "epoch": 0.08955223880597014, "grad_norm": 0.6977147730974188, "learning_rate": 1.4328358208955224e-05, "loss": 0.5036, "step": 84 }, { "epoch": 0.0906183368869936, "grad_norm": 0.6222679447369283, "learning_rate": 1.4498933901918976e-05, "loss": 0.5098, "step": 85 }, { "epoch": 0.09168443496801706, "grad_norm": 0.47152661245187916, "learning_rate": 1.4669509594882732e-05, "loss": 0.5055, "step": 86 }, { "epoch": 0.09275053304904052, "grad_norm": 0.3720647971705006, "learning_rate": 1.4840085287846484e-05, "loss": 0.5093, "step": 87 }, { "epoch": 0.09381663113006397, "grad_norm": 0.429318588389719, "learning_rate": 1.5010660980810237e-05, "loss": 0.5012, "step": 88 }, { "epoch": 0.09488272921108742, "grad_norm": 0.5114262103736821, "learning_rate": 1.5181236673773989e-05, "loss": 0.5057, "step": 89 }, { "epoch": 0.09594882729211088, "grad_norm": 0.4452652595986177, "learning_rate": 1.535181236673774e-05, "loss": 0.5044, "step": 90 }, { "epoch": 0.09701492537313433, "grad_norm": 0.30399112681775725, "learning_rate": 1.5522388059701494e-05, "loss": 0.5039, "step": 91 }, { "epoch": 0.09808102345415778, "grad_norm": 0.44638035180999586, "learning_rate": 1.5692963752665246e-05, "loss": 0.5088, "step": 92 }, { "epoch": 0.09914712153518124, "grad_norm": 0.5355105792075635, "learning_rate": 1.5863539445629e-05, "loss": 0.5068, "step": 93 }, { "epoch": 0.10021321961620469, "grad_norm": 1.6435340842916062, "learning_rate": 1.603411513859275e-05, "loss": 0.5044, "step": 94 }, { "epoch": 0.10127931769722814, "grad_norm": 0.41709741899745906, "learning_rate": 1.6204690831556504e-05, "loss": 0.4993, "step": 95 }, { "epoch": 0.1023454157782516, "grad_norm": 0.8489238666616422, "learning_rate": 1.6375266524520256e-05, "loss": 0.4982, "step": 96 }, { "epoch": 0.10341151385927505, "grad_norm": 0.7950895739539379, "learning_rate": 1.654584221748401e-05, "loss": 0.4955, "step": 97 }, { "epoch": 0.1044776119402985, "grad_norm": 0.3524964920301634, "learning_rate": 1.671641791044776e-05, "loss": 0.4896, "step": 98 }, { "epoch": 0.10554371002132196, "grad_norm": 0.48403087660030325, "learning_rate": 1.6886993603411513e-05, "loss": 0.4938, "step": 99 }, { "epoch": 0.10660980810234541, "grad_norm": 0.8862953843822765, "learning_rate": 1.7057569296375266e-05, "loss": 0.5003, "step": 100 }, { "epoch": 0.10767590618336886, "grad_norm": 0.84687190937384, "learning_rate": 1.7228144989339018e-05, "loss": 0.498, "step": 101 }, { "epoch": 0.10874200426439233, "grad_norm": 0.7104553983279852, "learning_rate": 1.7398720682302774e-05, "loss": 0.5002, "step": 102 }, { "epoch": 0.10980810234541578, "grad_norm": 1.1434719823658013, "learning_rate": 1.7569296375266526e-05, "loss": 0.4992, "step": 103 }, { "epoch": 0.11087420042643924, "grad_norm": 0.8704506872447706, "learning_rate": 1.773987206823028e-05, "loss": 0.4951, "step": 104 }, { "epoch": 0.11194029850746269, "grad_norm": 0.897490279226691, "learning_rate": 1.791044776119403e-05, "loss": 0.4916, "step": 105 }, { "epoch": 0.11300639658848614, "grad_norm": 1.484307049969091, "learning_rate": 1.8081023454157784e-05, "loss": 0.4943, "step": 106 }, { "epoch": 0.1140724946695096, "grad_norm": 0.6421017701917927, "learning_rate": 1.8251599147121536e-05, "loss": 0.4906, "step": 107 }, { "epoch": 0.11513859275053305, "grad_norm": 1.6387082014606487, "learning_rate": 1.842217484008529e-05, "loss": 0.4976, "step": 108 }, { "epoch": 0.1162046908315565, "grad_norm": 0.7508434661676452, "learning_rate": 1.859275053304904e-05, "loss": 0.4893, "step": 109 }, { "epoch": 0.11727078891257996, "grad_norm": 1.5878659815817608, "learning_rate": 1.8763326226012793e-05, "loss": 0.4936, "step": 110 }, { "epoch": 0.11833688699360341, "grad_norm": 1.2700012445274647, "learning_rate": 1.8933901918976546e-05, "loss": 0.4976, "step": 111 }, { "epoch": 0.11940298507462686, "grad_norm": 1.4468232793600246, "learning_rate": 1.9104477611940298e-05, "loss": 0.4916, "step": 112 }, { "epoch": 0.12046908315565032, "grad_norm": 1.2073925718708678, "learning_rate": 1.9275053304904054e-05, "loss": 0.4924, "step": 113 }, { "epoch": 0.12153518123667377, "grad_norm": 1.382482149756203, "learning_rate": 1.9445628997867806e-05, "loss": 0.495, "step": 114 }, { "epoch": 0.12260127931769722, "grad_norm": 1.1681250872190037, "learning_rate": 1.961620469083156e-05, "loss": 0.4929, "step": 115 }, { "epoch": 0.12366737739872068, "grad_norm": 1.1644844991590728, "learning_rate": 1.978678038379531e-05, "loss": 0.487, "step": 116 }, { "epoch": 0.12473347547974413, "grad_norm": 1.1590131735118943, "learning_rate": 1.9957356076759064e-05, "loss": 0.491, "step": 117 }, { "epoch": 0.1257995735607676, "grad_norm": 0.8845517269114772, "learning_rate": 2.0127931769722816e-05, "loss": 0.4899, "step": 118 }, { "epoch": 0.12686567164179105, "grad_norm": 0.8720137285450001, "learning_rate": 2.029850746268657e-05, "loss": 0.4948, "step": 119 }, { "epoch": 0.1279317697228145, "grad_norm": 0.8270825293355472, "learning_rate": 2.046908315565032e-05, "loss": 0.4873, "step": 120 }, { "epoch": 0.12899786780383796, "grad_norm": 0.7112733086734145, "learning_rate": 2.0639658848614073e-05, "loss": 0.4852, "step": 121 }, { "epoch": 0.1300639658848614, "grad_norm": 0.63087466396226, "learning_rate": 2.0810234541577826e-05, "loss": 0.4842, "step": 122 }, { "epoch": 0.13113006396588486, "grad_norm": 0.6768967900200205, "learning_rate": 2.098081023454158e-05, "loss": 0.4892, "step": 123 }, { "epoch": 0.13219616204690832, "grad_norm": 0.4441105086375026, "learning_rate": 2.1151385927505334e-05, "loss": 0.49, "step": 124 }, { "epoch": 0.13326226012793177, "grad_norm": 0.7871223305057534, "learning_rate": 2.1321961620469086e-05, "loss": 0.4861, "step": 125 }, { "epoch": 0.13432835820895522, "grad_norm": 0.7473471277752516, "learning_rate": 2.149253731343284e-05, "loss": 0.4859, "step": 126 }, { "epoch": 0.13539445628997868, "grad_norm": 0.6860378579705178, "learning_rate": 2.166311300639659e-05, "loss": 0.483, "step": 127 }, { "epoch": 0.13646055437100213, "grad_norm": 0.6877050783906112, "learning_rate": 2.1833688699360344e-05, "loss": 0.4778, "step": 128 }, { "epoch": 0.13752665245202558, "grad_norm": 0.7802953077555831, "learning_rate": 2.2004264392324096e-05, "loss": 0.4841, "step": 129 }, { "epoch": 0.13859275053304904, "grad_norm": 0.6105421403859265, "learning_rate": 2.217484008528785e-05, "loss": 0.4888, "step": 130 }, { "epoch": 0.1396588486140725, "grad_norm": 0.8085739454604434, "learning_rate": 2.23454157782516e-05, "loss": 0.4844, "step": 131 }, { "epoch": 0.14072494669509594, "grad_norm": 0.599568911512429, "learning_rate": 2.2515991471215353e-05, "loss": 0.4859, "step": 132 }, { "epoch": 0.1417910447761194, "grad_norm": 0.7298411511687625, "learning_rate": 2.2686567164179106e-05, "loss": 0.4839, "step": 133 }, { "epoch": 0.14285714285714285, "grad_norm": 0.5022968348693084, "learning_rate": 2.2857142857142858e-05, "loss": 0.4799, "step": 134 }, { "epoch": 0.1439232409381663, "grad_norm": 0.7782509559217928, "learning_rate": 2.302771855010661e-05, "loss": 0.4811, "step": 135 }, { "epoch": 0.14498933901918976, "grad_norm": 0.8576912283821915, "learning_rate": 2.3198294243070363e-05, "loss": 0.4855, "step": 136 }, { "epoch": 0.1460554371002132, "grad_norm": 0.75095345536952, "learning_rate": 2.3368869936034115e-05, "loss": 0.4831, "step": 137 }, { "epoch": 0.14712153518123666, "grad_norm": 1.1021094209824167, "learning_rate": 2.3539445628997868e-05, "loss": 0.485, "step": 138 }, { "epoch": 0.14818763326226012, "grad_norm": 1.3850416902760037, "learning_rate": 2.371002132196162e-05, "loss": 0.478, "step": 139 }, { "epoch": 0.14925373134328357, "grad_norm": 0.5373300308852162, "learning_rate": 2.3880597014925373e-05, "loss": 0.4743, "step": 140 }, { "epoch": 0.15031982942430705, "grad_norm": 1.2965983051499783, "learning_rate": 2.405117270788913e-05, "loss": 0.4877, "step": 141 }, { "epoch": 0.1513859275053305, "grad_norm": 0.9983577194279637, "learning_rate": 2.4221748400852884e-05, "loss": 0.4814, "step": 142 }, { "epoch": 0.15245202558635396, "grad_norm": 0.720371040224616, "learning_rate": 2.4392324093816637e-05, "loss": 0.4809, "step": 143 }, { "epoch": 0.1535181236673774, "grad_norm": 0.8215017978389719, "learning_rate": 2.456289978678039e-05, "loss": 0.4751, "step": 144 }, { "epoch": 0.15458422174840086, "grad_norm": 0.8563504324634484, "learning_rate": 2.473347547974414e-05, "loss": 0.4788, "step": 145 }, { "epoch": 0.15565031982942432, "grad_norm": 1.0397801044745016, "learning_rate": 2.4904051172707894e-05, "loss": 0.4828, "step": 146 }, { "epoch": 0.15671641791044777, "grad_norm": 0.8499495288588741, "learning_rate": 2.5074626865671646e-05, "loss": 0.4823, "step": 147 }, { "epoch": 0.15778251599147122, "grad_norm": 0.7506494495443314, "learning_rate": 2.52452025586354e-05, "loss": 0.4719, "step": 148 }, { "epoch": 0.15884861407249468, "grad_norm": 0.8572573387032592, "learning_rate": 2.541577825159915e-05, "loss": 0.4779, "step": 149 }, { "epoch": 0.15991471215351813, "grad_norm": 0.8980484862912236, "learning_rate": 2.5586353944562904e-05, "loss": 0.4669, "step": 150 }, { "epoch": 0.16098081023454158, "grad_norm": 1.3008931725410442, "learning_rate": 2.5756929637526656e-05, "loss": 0.4778, "step": 151 }, { "epoch": 0.16204690831556504, "grad_norm": 0.5404519887326864, "learning_rate": 2.592750533049041e-05, "loss": 0.4731, "step": 152 }, { "epoch": 0.1631130063965885, "grad_norm": 1.0716812716539528, "learning_rate": 2.609808102345416e-05, "loss": 0.4798, "step": 153 }, { "epoch": 0.16417910447761194, "grad_norm": 1.2709697777670215, "learning_rate": 2.6268656716417913e-05, "loss": 0.4871, "step": 154 }, { "epoch": 0.1652452025586354, "grad_norm": 0.537320261984439, "learning_rate": 2.6439232409381666e-05, "loss": 0.4749, "step": 155 }, { "epoch": 0.16631130063965885, "grad_norm": 1.1514684368533086, "learning_rate": 2.6609808102345418e-05, "loss": 0.4753, "step": 156 }, { "epoch": 0.1673773987206823, "grad_norm": 0.9432669540338412, "learning_rate": 2.678038379530917e-05, "loss": 0.4807, "step": 157 }, { "epoch": 0.16844349680170576, "grad_norm": 0.642515332753071, "learning_rate": 2.6950959488272923e-05, "loss": 0.477, "step": 158 }, { "epoch": 0.1695095948827292, "grad_norm": 0.8211012768546435, "learning_rate": 2.7121535181236675e-05, "loss": 0.4766, "step": 159 }, { "epoch": 0.17057569296375266, "grad_norm": 0.8776070226460816, "learning_rate": 2.7292110874200428e-05, "loss": 0.4717, "step": 160 }, { "epoch": 0.17164179104477612, "grad_norm": 0.7718966074497755, "learning_rate": 2.746268656716418e-05, "loss": 0.4687, "step": 161 }, { "epoch": 0.17270788912579957, "grad_norm": 0.7779381456798092, "learning_rate": 2.7633262260127933e-05, "loss": 0.4679, "step": 162 }, { "epoch": 0.17377398720682302, "grad_norm": 0.5901156093872161, "learning_rate": 2.7803837953091685e-05, "loss": 0.4751, "step": 163 }, { "epoch": 0.17484008528784648, "grad_norm": 0.7154596410625235, "learning_rate": 2.7974413646055437e-05, "loss": 0.4728, "step": 164 }, { "epoch": 0.17590618336886993, "grad_norm": 0.7000348159687018, "learning_rate": 2.814498933901919e-05, "loss": 0.4725, "step": 165 }, { "epoch": 0.17697228144989338, "grad_norm": 0.7657735939109871, "learning_rate": 2.8315565031982942e-05, "loss": 0.4784, "step": 166 }, { "epoch": 0.17803837953091683, "grad_norm": 0.8475989590871966, "learning_rate": 2.8486140724946695e-05, "loss": 0.4711, "step": 167 }, { "epoch": 0.1791044776119403, "grad_norm": 1.0922356892583558, "learning_rate": 2.8656716417910447e-05, "loss": 0.478, "step": 168 }, { "epoch": 0.18017057569296374, "grad_norm": 0.970068377938896, "learning_rate": 2.88272921108742e-05, "loss": 0.4653, "step": 169 }, { "epoch": 0.1812366737739872, "grad_norm": 0.7059217860390857, "learning_rate": 2.8997867803837952e-05, "loss": 0.474, "step": 170 }, { "epoch": 0.18230277185501065, "grad_norm": 1.0368427776415303, "learning_rate": 2.9168443496801708e-05, "loss": 0.4745, "step": 171 }, { "epoch": 0.18336886993603413, "grad_norm": 1.1739249199645363, "learning_rate": 2.9339019189765464e-05, "loss": 0.4713, "step": 172 }, { "epoch": 0.18443496801705758, "grad_norm": 0.772697311844866, "learning_rate": 2.9509594882729216e-05, "loss": 0.4718, "step": 173 }, { "epoch": 0.18550106609808104, "grad_norm": 0.8330939252501977, "learning_rate": 2.968017057569297e-05, "loss": 0.4722, "step": 174 }, { "epoch": 0.1865671641791045, "grad_norm": 0.831387960842226, "learning_rate": 2.985074626865672e-05, "loss": 0.472, "step": 175 }, { "epoch": 0.18763326226012794, "grad_norm": 0.7976609685235961, "learning_rate": 3.0021321961620473e-05, "loss": 0.47, "step": 176 }, { "epoch": 0.1886993603411514, "grad_norm": 0.8823919254760567, "learning_rate": 3.0191897654584226e-05, "loss": 0.4759, "step": 177 }, { "epoch": 0.18976545842217485, "grad_norm": 0.8804820784317922, "learning_rate": 3.0362473347547978e-05, "loss": 0.4659, "step": 178 }, { "epoch": 0.1908315565031983, "grad_norm": 0.9681666262840064, "learning_rate": 3.053304904051173e-05, "loss": 0.4729, "step": 179 }, { "epoch": 0.19189765458422176, "grad_norm": 1.1048871958759772, "learning_rate": 3.070362473347548e-05, "loss": 0.4665, "step": 180 }, { "epoch": 0.1929637526652452, "grad_norm": 0.8870408442356555, "learning_rate": 3.0874200426439235e-05, "loss": 0.4745, "step": 181 }, { "epoch": 0.19402985074626866, "grad_norm": 0.9631165384838795, "learning_rate": 3.104477611940299e-05, "loss": 0.4698, "step": 182 }, { "epoch": 0.19509594882729211, "grad_norm": 0.9385999582540799, "learning_rate": 3.121535181236674e-05, "loss": 0.4705, "step": 183 }, { "epoch": 0.19616204690831557, "grad_norm": 0.7939219655274056, "learning_rate": 3.138592750533049e-05, "loss": 0.4732, "step": 184 }, { "epoch": 0.19722814498933902, "grad_norm": 0.736357203214717, "learning_rate": 3.1556503198294245e-05, "loss": 0.4746, "step": 185 }, { "epoch": 0.19829424307036247, "grad_norm": 0.8037466041799969, "learning_rate": 3.1727078891258e-05, "loss": 0.4636, "step": 186 }, { "epoch": 0.19936034115138593, "grad_norm": 1.0193532786616932, "learning_rate": 3.189765458422175e-05, "loss": 0.4721, "step": 187 }, { "epoch": 0.20042643923240938, "grad_norm": 1.4921054947249843, "learning_rate": 3.20682302771855e-05, "loss": 0.4718, "step": 188 }, { "epoch": 0.20149253731343283, "grad_norm": 0.7621564304858446, "learning_rate": 3.2238805970149255e-05, "loss": 0.4653, "step": 189 }, { "epoch": 0.2025586353944563, "grad_norm": 1.070754972082962, "learning_rate": 3.240938166311301e-05, "loss": 0.4654, "step": 190 }, { "epoch": 0.20362473347547974, "grad_norm": 1.5652147505824308, "learning_rate": 3.257995735607676e-05, "loss": 0.472, "step": 191 }, { "epoch": 0.2046908315565032, "grad_norm": 0.869980401463994, "learning_rate": 3.275053304904051e-05, "loss": 0.4646, "step": 192 }, { "epoch": 0.20575692963752665, "grad_norm": 1.4005482830515732, "learning_rate": 3.2921108742004264e-05, "loss": 0.4697, "step": 193 }, { "epoch": 0.2068230277185501, "grad_norm": 1.2810330542613626, "learning_rate": 3.309168443496802e-05, "loss": 0.4631, "step": 194 }, { "epoch": 0.20788912579957355, "grad_norm": 1.036250299110791, "learning_rate": 3.326226012793177e-05, "loss": 0.4757, "step": 195 }, { "epoch": 0.208955223880597, "grad_norm": 0.8072040874296945, "learning_rate": 3.343283582089552e-05, "loss": 0.4629, "step": 196 }, { "epoch": 0.21002132196162046, "grad_norm": 1.4196573201874585, "learning_rate": 3.3603411513859274e-05, "loss": 0.4684, "step": 197 }, { "epoch": 0.21108742004264391, "grad_norm": 0.6191497943271066, "learning_rate": 3.3773987206823026e-05, "loss": 0.4631, "step": 198 }, { "epoch": 0.21215351812366737, "grad_norm": 1.2158443187059038, "learning_rate": 3.394456289978678e-05, "loss": 0.4671, "step": 199 }, { "epoch": 0.21321961620469082, "grad_norm": 0.8129012874793476, "learning_rate": 3.411513859275053e-05, "loss": 0.4618, "step": 200 }, { "epoch": 0.21428571428571427, "grad_norm": 1.0518786895781704, "learning_rate": 3.4285714285714284e-05, "loss": 0.4697, "step": 201 }, { "epoch": 0.21535181236673773, "grad_norm": 0.9688679826055264, "learning_rate": 3.4456289978678036e-05, "loss": 0.4683, "step": 202 }, { "epoch": 0.21641791044776118, "grad_norm": 1.3724882136741294, "learning_rate": 3.462686567164179e-05, "loss": 0.4718, "step": 203 }, { "epoch": 0.21748400852878466, "grad_norm": 0.5641264907256553, "learning_rate": 3.479744136460555e-05, "loss": 0.4551, "step": 204 }, { "epoch": 0.21855010660980811, "grad_norm": 1.0701546739161645, "learning_rate": 3.49680170575693e-05, "loss": 0.4612, "step": 205 }, { "epoch": 0.21961620469083157, "grad_norm": 1.049942985757254, "learning_rate": 3.513859275053305e-05, "loss": 0.4696, "step": 206 }, { "epoch": 0.22068230277185502, "grad_norm": 1.1283220333372375, "learning_rate": 3.5309168443496805e-05, "loss": 0.4632, "step": 207 }, { "epoch": 0.22174840085287847, "grad_norm": 0.9308516582288439, "learning_rate": 3.547974413646056e-05, "loss": 0.4661, "step": 208 }, { "epoch": 0.22281449893390193, "grad_norm": 1.1185548874871707, "learning_rate": 3.565031982942431e-05, "loss": 0.4641, "step": 209 }, { "epoch": 0.22388059701492538, "grad_norm": 0.919634938330087, "learning_rate": 3.582089552238806e-05, "loss": 0.4707, "step": 210 }, { "epoch": 0.22494669509594883, "grad_norm": 1.4776402736329493, "learning_rate": 3.5991471215351815e-05, "loss": 0.4652, "step": 211 }, { "epoch": 0.2260127931769723, "grad_norm": 0.7167195593546435, "learning_rate": 3.616204690831557e-05, "loss": 0.4595, "step": 212 }, { "epoch": 0.22707889125799574, "grad_norm": 1.2140834565384866, "learning_rate": 3.633262260127932e-05, "loss": 0.4711, "step": 213 }, { "epoch": 0.2281449893390192, "grad_norm": 1.0029544450885874, "learning_rate": 3.650319829424307e-05, "loss": 0.4596, "step": 214 }, { "epoch": 0.22921108742004265, "grad_norm": 1.5043815628272939, "learning_rate": 3.6673773987206824e-05, "loss": 0.4638, "step": 215 }, { "epoch": 0.2302771855010661, "grad_norm": 0.8569767572345512, "learning_rate": 3.684434968017058e-05, "loss": 0.4598, "step": 216 }, { "epoch": 0.23134328358208955, "grad_norm": 1.3925636332034894, "learning_rate": 3.701492537313433e-05, "loss": 0.4604, "step": 217 }, { "epoch": 0.232409381663113, "grad_norm": 1.1816721080629815, "learning_rate": 3.718550106609808e-05, "loss": 0.4518, "step": 218 }, { "epoch": 0.23347547974413646, "grad_norm": 0.8651413005301738, "learning_rate": 3.7356076759061834e-05, "loss": 0.465, "step": 219 }, { "epoch": 0.2345415778251599, "grad_norm": 1.1820130745722497, "learning_rate": 3.7526652452025586e-05, "loss": 0.4637, "step": 220 }, { "epoch": 0.23560767590618337, "grad_norm": 0.8427968342892499, "learning_rate": 3.769722814498934e-05, "loss": 0.4617, "step": 221 }, { "epoch": 0.23667377398720682, "grad_norm": 1.11073913884723, "learning_rate": 3.786780383795309e-05, "loss": 0.4655, "step": 222 }, { "epoch": 0.23773987206823027, "grad_norm": 0.9189934131123165, "learning_rate": 3.8038379530916844e-05, "loss": 0.4766, "step": 223 }, { "epoch": 0.23880597014925373, "grad_norm": 1.1535170456681654, "learning_rate": 3.8208955223880596e-05, "loss": 0.4625, "step": 224 }, { "epoch": 0.23987206823027718, "grad_norm": 0.9374181623692117, "learning_rate": 3.8379530916844355e-05, "loss": 0.4669, "step": 225 }, { "epoch": 0.24093816631130063, "grad_norm": 1.0563319134423954, "learning_rate": 3.855010660980811e-05, "loss": 0.4611, "step": 226 }, { "epoch": 0.2420042643923241, "grad_norm": 1.168785335326744, "learning_rate": 3.872068230277186e-05, "loss": 0.4634, "step": 227 }, { "epoch": 0.24307036247334754, "grad_norm": 1.0248065818014773, "learning_rate": 3.889125799573561e-05, "loss": 0.4593, "step": 228 }, { "epoch": 0.244136460554371, "grad_norm": 0.8637465871329612, "learning_rate": 3.9061833688699365e-05, "loss": 0.4632, "step": 229 }, { "epoch": 0.24520255863539445, "grad_norm": 0.8460415005099113, "learning_rate": 3.923240938166312e-05, "loss": 0.4624, "step": 230 }, { "epoch": 0.2462686567164179, "grad_norm": 0.8955587399970688, "learning_rate": 3.940298507462687e-05, "loss": 0.457, "step": 231 }, { "epoch": 0.24733475479744135, "grad_norm": 1.1657262128526866, "learning_rate": 3.957356076759062e-05, "loss": 0.4645, "step": 232 }, { "epoch": 0.2484008528784648, "grad_norm": 1.0166441196536635, "learning_rate": 3.9744136460554375e-05, "loss": 0.4634, "step": 233 }, { "epoch": 0.24946695095948826, "grad_norm": 0.9826615671115314, "learning_rate": 3.991471215351813e-05, "loss": 0.4614, "step": 234 }, { "epoch": 0.2505330490405117, "grad_norm": 1.0599689795674345, "learning_rate": 4.008528784648188e-05, "loss": 0.4731, "step": 235 }, { "epoch": 0.2515991471215352, "grad_norm": 0.9093543668098233, "learning_rate": 4.025586353944563e-05, "loss": 0.4566, "step": 236 }, { "epoch": 0.2526652452025586, "grad_norm": 1.013585300158083, "learning_rate": 4.0426439232409384e-05, "loss": 0.468, "step": 237 }, { "epoch": 0.2537313432835821, "grad_norm": 1.036723508797436, "learning_rate": 4.059701492537314e-05, "loss": 0.4584, "step": 238 }, { "epoch": 0.2547974413646055, "grad_norm": 1.316129392280748, "learning_rate": 4.076759061833689e-05, "loss": 0.4675, "step": 239 }, { "epoch": 0.255863539445629, "grad_norm": 1.0896330595803854, "learning_rate": 4.093816631130064e-05, "loss": 0.4623, "step": 240 }, { "epoch": 0.25692963752665243, "grad_norm": 1.1510385991266068, "learning_rate": 4.1108742004264394e-05, "loss": 0.4649, "step": 241 }, { "epoch": 0.2579957356076759, "grad_norm": 0.8229957414542415, "learning_rate": 4.1279317697228146e-05, "loss": 0.4584, "step": 242 }, { "epoch": 0.25906183368869934, "grad_norm": 0.9568171883082784, "learning_rate": 4.14498933901919e-05, "loss": 0.4628, "step": 243 }, { "epoch": 0.2601279317697228, "grad_norm": 0.9570238151905792, "learning_rate": 4.162046908315565e-05, "loss": 0.4544, "step": 244 }, { "epoch": 0.26119402985074625, "grad_norm": 1.1790086045153048, "learning_rate": 4.1791044776119404e-05, "loss": 0.4565, "step": 245 }, { "epoch": 0.2622601279317697, "grad_norm": 0.772255483862789, "learning_rate": 4.196162046908316e-05, "loss": 0.4575, "step": 246 }, { "epoch": 0.26332622601279315, "grad_norm": 1.1363053470021465, "learning_rate": 4.213219616204691e-05, "loss": 0.4612, "step": 247 }, { "epoch": 0.26439232409381663, "grad_norm": 0.7565375099785713, "learning_rate": 4.230277185501067e-05, "loss": 0.456, "step": 248 }, { "epoch": 0.26545842217484006, "grad_norm": 0.6892425990836365, "learning_rate": 4.247334754797441e-05, "loss": 0.4559, "step": 249 }, { "epoch": 0.26652452025586354, "grad_norm": 0.7417047962613619, "learning_rate": 4.264392324093817e-05, "loss": 0.4508, "step": 250 }, { "epoch": 0.267590618336887, "grad_norm": 0.7470072441966399, "learning_rate": 4.2814498933901925e-05, "loss": 0.461, "step": 251 }, { "epoch": 0.26865671641791045, "grad_norm": 0.9617786110698198, "learning_rate": 4.298507462686568e-05, "loss": 0.4513, "step": 252 }, { "epoch": 0.2697228144989339, "grad_norm": 1.327440065045842, "learning_rate": 4.315565031982943e-05, "loss": 0.4638, "step": 253 }, { "epoch": 0.27078891257995735, "grad_norm": 0.7442017560708861, "learning_rate": 4.332622601279318e-05, "loss": 0.4557, "step": 254 }, { "epoch": 0.27185501066098083, "grad_norm": 0.7202532764515195, "learning_rate": 4.3496801705756935e-05, "loss": 0.4543, "step": 255 }, { "epoch": 0.27292110874200426, "grad_norm": 0.8784340624592276, "learning_rate": 4.366737739872069e-05, "loss": 0.4579, "step": 256 }, { "epoch": 0.27398720682302774, "grad_norm": 0.8848099666534693, "learning_rate": 4.383795309168444e-05, "loss": 0.4527, "step": 257 }, { "epoch": 0.27505330490405117, "grad_norm": 0.7690402407029407, "learning_rate": 4.400852878464819e-05, "loss": 0.4576, "step": 258 }, { "epoch": 0.27611940298507465, "grad_norm": 0.828823436313651, "learning_rate": 4.4179104477611944e-05, "loss": 0.4601, "step": 259 }, { "epoch": 0.2771855010660981, "grad_norm": 1.186866608247454, "learning_rate": 4.43496801705757e-05, "loss": 0.4606, "step": 260 }, { "epoch": 0.27825159914712155, "grad_norm": 0.9191614369421419, "learning_rate": 4.452025586353945e-05, "loss": 0.457, "step": 261 }, { "epoch": 0.279317697228145, "grad_norm": 0.9275759467424034, "learning_rate": 4.46908315565032e-05, "loss": 0.4531, "step": 262 }, { "epoch": 0.28038379530916846, "grad_norm": 1.3018942692140634, "learning_rate": 4.4861407249466954e-05, "loss": 0.4519, "step": 263 }, { "epoch": 0.2814498933901919, "grad_norm": 1.2828905543825992, "learning_rate": 4.5031982942430706e-05, "loss": 0.4517, "step": 264 }, { "epoch": 0.28251599147121537, "grad_norm": 0.8444164365212923, "learning_rate": 4.5202558635394466e-05, "loss": 0.4579, "step": 265 }, { "epoch": 0.2835820895522388, "grad_norm": 0.7390719742847955, "learning_rate": 4.537313432835821e-05, "loss": 0.4564, "step": 266 }, { "epoch": 0.2846481876332623, "grad_norm": 0.804814881156404, "learning_rate": 4.554371002132197e-05, "loss": 0.4516, "step": 267 }, { "epoch": 0.2857142857142857, "grad_norm": 1.232886488524498, "learning_rate": 4.5714285714285716e-05, "loss": 0.4552, "step": 268 }, { "epoch": 0.2867803837953092, "grad_norm": 1.1058003798103109, "learning_rate": 4.5884861407249475e-05, "loss": 0.4531, "step": 269 }, { "epoch": 0.2878464818763326, "grad_norm": 1.1891457338823357, "learning_rate": 4.605543710021322e-05, "loss": 0.4511, "step": 270 }, { "epoch": 0.2889125799573561, "grad_norm": 1.0630830374386846, "learning_rate": 4.622601279317698e-05, "loss": 0.4506, "step": 271 }, { "epoch": 0.2899786780383795, "grad_norm": 0.8558226373887493, "learning_rate": 4.6396588486140726e-05, "loss": 0.4533, "step": 272 }, { "epoch": 0.291044776119403, "grad_norm": 0.8431544624031982, "learning_rate": 4.6567164179104485e-05, "loss": 0.4489, "step": 273 }, { "epoch": 0.2921108742004264, "grad_norm": 1.0780225196774107, "learning_rate": 4.673773987206823e-05, "loss": 0.4535, "step": 274 }, { "epoch": 0.2931769722814499, "grad_norm": 1.040484818452518, "learning_rate": 4.690831556503199e-05, "loss": 0.4526, "step": 275 }, { "epoch": 0.2942430703624733, "grad_norm": 1.4317419430672074, "learning_rate": 4.7078891257995735e-05, "loss": 0.4491, "step": 276 }, { "epoch": 0.2953091684434968, "grad_norm": 0.7350004141525754, "learning_rate": 4.7249466950959495e-05, "loss": 0.4508, "step": 277 }, { "epoch": 0.29637526652452023, "grad_norm": 1.0597151973403933, "learning_rate": 4.742004264392324e-05, "loss": 0.4536, "step": 278 }, { "epoch": 0.2974413646055437, "grad_norm": 1.7726271804234848, "learning_rate": 4.7590618336887e-05, "loss": 0.4533, "step": 279 }, { "epoch": 0.29850746268656714, "grad_norm": 0.7058843328989329, "learning_rate": 4.7761194029850745e-05, "loss": 0.4546, "step": 280 }, { "epoch": 0.2995735607675906, "grad_norm": 2.0525677674617597, "learning_rate": 4.7931769722814504e-05, "loss": 0.4591, "step": 281 }, { "epoch": 0.3006396588486141, "grad_norm": 1.0197143782821263, "learning_rate": 4.810234541577826e-05, "loss": 0.4539, "step": 282 }, { "epoch": 0.3017057569296375, "grad_norm": 2.682566079537771, "learning_rate": 4.827292110874201e-05, "loss": 0.4696, "step": 283 }, { "epoch": 0.302771855010661, "grad_norm": 2.7347833899867395, "learning_rate": 4.844349680170577e-05, "loss": 0.4854, "step": 284 }, { "epoch": 0.30383795309168443, "grad_norm": 1.4147621114370599, "learning_rate": 4.8614072494669514e-05, "loss": 0.4613, "step": 285 }, { "epoch": 0.3049040511727079, "grad_norm": 1.4281612301500841, "learning_rate": 4.878464818763327e-05, "loss": 0.4609, "step": 286 }, { "epoch": 0.30597014925373134, "grad_norm": 1.0989723638165922, "learning_rate": 4.895522388059702e-05, "loss": 0.4627, "step": 287 }, { "epoch": 0.3070362473347548, "grad_norm": 1.025429892081604, "learning_rate": 4.912579957356078e-05, "loss": 0.4516, "step": 288 }, { "epoch": 0.30810234541577824, "grad_norm": 1.1574096426201355, "learning_rate": 4.9296375266524524e-05, "loss": 0.463, "step": 289 }, { "epoch": 0.3091684434968017, "grad_norm": 0.9660522145703263, "learning_rate": 4.946695095948828e-05, "loss": 0.4551, "step": 290 }, { "epoch": 0.31023454157782515, "grad_norm": 0.7154708715646663, "learning_rate": 4.963752665245203e-05, "loss": 0.4592, "step": 291 }, { "epoch": 0.31130063965884863, "grad_norm": 0.6930388295609867, "learning_rate": 4.980810234541579e-05, "loss": 0.4541, "step": 292 }, { "epoch": 0.31236673773987206, "grad_norm": 0.9114648684646235, "learning_rate": 4.997867803837953e-05, "loss": 0.4574, "step": 293 }, { "epoch": 0.31343283582089554, "grad_norm": 1.245129676815517, "learning_rate": 5.014925373134329e-05, "loss": 0.4542, "step": 294 }, { "epoch": 0.31449893390191896, "grad_norm": 0.9624168019988285, "learning_rate": 5.031982942430704e-05, "loss": 0.4536, "step": 295 }, { "epoch": 0.31556503198294245, "grad_norm": 0.8009138019519386, "learning_rate": 5.04904051172708e-05, "loss": 0.4512, "step": 296 }, { "epoch": 0.31663113006396587, "grad_norm": 0.6547749384104303, "learning_rate": 5.066098081023454e-05, "loss": 0.4519, "step": 297 }, { "epoch": 0.31769722814498935, "grad_norm": 0.83857949506653, "learning_rate": 5.08315565031983e-05, "loss": 0.4513, "step": 298 }, { "epoch": 0.3187633262260128, "grad_norm": 0.934818630813169, "learning_rate": 5.100213219616205e-05, "loss": 0.4547, "step": 299 }, { "epoch": 0.31982942430703626, "grad_norm": 1.1298040583791609, "learning_rate": 5.117270788912581e-05, "loss": 0.4573, "step": 300 }, { "epoch": 0.3208955223880597, "grad_norm": 1.1242753661227198, "learning_rate": 5.134328358208955e-05, "loss": 0.4555, "step": 301 }, { "epoch": 0.32196162046908317, "grad_norm": 0.8386711391896823, "learning_rate": 5.151385927505331e-05, "loss": 0.4454, "step": 302 }, { "epoch": 0.3230277185501066, "grad_norm": 1.0644491026122258, "learning_rate": 5.168443496801706e-05, "loss": 0.4544, "step": 303 }, { "epoch": 0.32409381663113007, "grad_norm": 1.0964593961058677, "learning_rate": 5.185501066098082e-05, "loss": 0.4556, "step": 304 }, { "epoch": 0.3251599147121535, "grad_norm": 0.9831083339112521, "learning_rate": 5.202558635394456e-05, "loss": 0.4467, "step": 305 }, { "epoch": 0.326226012793177, "grad_norm": 0.9900349752767852, "learning_rate": 5.219616204690832e-05, "loss": 0.4516, "step": 306 }, { "epoch": 0.3272921108742004, "grad_norm": 1.0598608792836142, "learning_rate": 5.236673773987207e-05, "loss": 0.4498, "step": 307 }, { "epoch": 0.3283582089552239, "grad_norm": 0.9889355725212928, "learning_rate": 5.2537313432835826e-05, "loss": 0.4514, "step": 308 }, { "epoch": 0.3294243070362473, "grad_norm": 0.9865582250975005, "learning_rate": 5.270788912579957e-05, "loss": 0.4513, "step": 309 }, { "epoch": 0.3304904051172708, "grad_norm": 1.252376286795114, "learning_rate": 5.287846481876333e-05, "loss": 0.4523, "step": 310 }, { "epoch": 0.3315565031982942, "grad_norm": 0.6493116364566459, "learning_rate": 5.304904051172708e-05, "loss": 0.445, "step": 311 }, { "epoch": 0.3326226012793177, "grad_norm": 1.0832555318476744, "learning_rate": 5.3219616204690836e-05, "loss": 0.4551, "step": 312 }, { "epoch": 0.3336886993603412, "grad_norm": 1.0891590791019572, "learning_rate": 5.3390191897654595e-05, "loss": 0.4501, "step": 313 }, { "epoch": 0.3347547974413646, "grad_norm": 0.5965442300858297, "learning_rate": 5.356076759061834e-05, "loss": 0.4433, "step": 314 }, { "epoch": 0.3358208955223881, "grad_norm": 0.9796043883687924, "learning_rate": 5.37313432835821e-05, "loss": 0.4517, "step": 315 }, { "epoch": 0.3368869936034115, "grad_norm": 0.993909895121157, "learning_rate": 5.3901918976545846e-05, "loss": 0.452, "step": 316 }, { "epoch": 0.337953091684435, "grad_norm": 0.6471597791846867, "learning_rate": 5.4072494669509605e-05, "loss": 0.4489, "step": 317 }, { "epoch": 0.3390191897654584, "grad_norm": 1.200049501099017, "learning_rate": 5.424307036247335e-05, "loss": 0.4519, "step": 318 }, { "epoch": 0.3400852878464819, "grad_norm": 0.8547891891381943, "learning_rate": 5.441364605543711e-05, "loss": 0.4491, "step": 319 }, { "epoch": 0.3411513859275053, "grad_norm": 0.8872211588826856, "learning_rate": 5.4584221748400855e-05, "loss": 0.4522, "step": 320 }, { "epoch": 0.3422174840085288, "grad_norm": 1.114364988812215, "learning_rate": 5.4754797441364615e-05, "loss": 0.4503, "step": 321 }, { "epoch": 0.34328358208955223, "grad_norm": 0.9431484820101103, "learning_rate": 5.492537313432836e-05, "loss": 0.4433, "step": 322 }, { "epoch": 0.3443496801705757, "grad_norm": 1.0650041065819467, "learning_rate": 5.509594882729212e-05, "loss": 0.4484, "step": 323 }, { "epoch": 0.34541577825159914, "grad_norm": 1.1014629307818748, "learning_rate": 5.5266524520255865e-05, "loss": 0.4439, "step": 324 }, { "epoch": 0.3464818763326226, "grad_norm": 1.3284550365482601, "learning_rate": 5.5437100213219624e-05, "loss": 0.4489, "step": 325 }, { "epoch": 0.34754797441364604, "grad_norm": 0.6740188587682626, "learning_rate": 5.560767590618337e-05, "loss": 0.4461, "step": 326 }, { "epoch": 0.3486140724946695, "grad_norm": 0.8535552014500319, "learning_rate": 5.577825159914713e-05, "loss": 0.4469, "step": 327 }, { "epoch": 0.34968017057569295, "grad_norm": 0.9344951335512115, "learning_rate": 5.5948827292110875e-05, "loss": 0.4403, "step": 328 }, { "epoch": 0.35074626865671643, "grad_norm": 1.6493682506419003, "learning_rate": 5.6119402985074634e-05, "loss": 0.4471, "step": 329 }, { "epoch": 0.35181236673773986, "grad_norm": 0.5980828549596681, "learning_rate": 5.628997867803838e-05, "loss": 0.4407, "step": 330 }, { "epoch": 0.35287846481876334, "grad_norm": 1.3993066518146489, "learning_rate": 5.646055437100214e-05, "loss": 0.4463, "step": 331 }, { "epoch": 0.35394456289978676, "grad_norm": 1.364713422133084, "learning_rate": 5.6631130063965884e-05, "loss": 0.4515, "step": 332 }, { "epoch": 0.35501066098081024, "grad_norm": 0.6912706432665139, "learning_rate": 5.6801705756929644e-05, "loss": 0.4495, "step": 333 }, { "epoch": 0.35607675906183367, "grad_norm": 1.0070213135643369, "learning_rate": 5.697228144989339e-05, "loss": 0.4444, "step": 334 }, { "epoch": 0.35714285714285715, "grad_norm": 1.60072329010338, "learning_rate": 5.714285714285715e-05, "loss": 0.4491, "step": 335 }, { "epoch": 0.3582089552238806, "grad_norm": 0.7418596944182502, "learning_rate": 5.7313432835820894e-05, "loss": 0.4426, "step": 336 }, { "epoch": 0.35927505330490406, "grad_norm": 1.5682849618752128, "learning_rate": 5.748400852878465e-05, "loss": 0.4557, "step": 337 }, { "epoch": 0.3603411513859275, "grad_norm": 0.9757128890719955, "learning_rate": 5.76545842217484e-05, "loss": 0.4434, "step": 338 }, { "epoch": 0.36140724946695096, "grad_norm": 1.023456270027187, "learning_rate": 5.782515991471216e-05, "loss": 0.4493, "step": 339 }, { "epoch": 0.3624733475479744, "grad_norm": 1.1069512289563967, "learning_rate": 5.7995735607675904e-05, "loss": 0.4487, "step": 340 }, { "epoch": 0.36353944562899787, "grad_norm": 1.2762171846197148, "learning_rate": 5.816631130063966e-05, "loss": 0.4494, "step": 341 }, { "epoch": 0.3646055437100213, "grad_norm": 0.9563297488524058, "learning_rate": 5.8336886993603415e-05, "loss": 0.4538, "step": 342 }, { "epoch": 0.3656716417910448, "grad_norm": 0.8844708150282824, "learning_rate": 5.850746268656717e-05, "loss": 0.4599, "step": 343 }, { "epoch": 0.36673773987206826, "grad_norm": 0.9248540817564378, "learning_rate": 5.867803837953093e-05, "loss": 0.4532, "step": 344 }, { "epoch": 0.3678038379530917, "grad_norm": 0.897676935037186, "learning_rate": 5.884861407249467e-05, "loss": 0.4424, "step": 345 }, { "epoch": 0.36886993603411516, "grad_norm": 1.3371410250762796, "learning_rate": 5.901918976545843e-05, "loss": 0.4528, "step": 346 }, { "epoch": 0.3699360341151386, "grad_norm": 0.710536412391709, "learning_rate": 5.918976545842218e-05, "loss": 0.4496, "step": 347 }, { "epoch": 0.37100213219616207, "grad_norm": 0.9252679189981835, "learning_rate": 5.936034115138594e-05, "loss": 0.4512, "step": 348 }, { "epoch": 0.3720682302771855, "grad_norm": 1.2517055018731635, "learning_rate": 5.953091684434968e-05, "loss": 0.4473, "step": 349 }, { "epoch": 0.373134328358209, "grad_norm": 0.9298497655115758, "learning_rate": 5.970149253731344e-05, "loss": 0.4411, "step": 350 }, { "epoch": 0.3742004264392324, "grad_norm": 0.7999639695039473, "learning_rate": 5.987206823027719e-05, "loss": 0.4525, "step": 351 }, { "epoch": 0.3752665245202559, "grad_norm": 0.9596155583258207, "learning_rate": 6.0042643923240946e-05, "loss": 0.4472, "step": 352 }, { "epoch": 0.3763326226012793, "grad_norm": 1.4841986378845615, "learning_rate": 6.021321961620469e-05, "loss": 0.453, "step": 353 }, { "epoch": 0.3773987206823028, "grad_norm": 0.704856727131918, "learning_rate": 6.038379530916845e-05, "loss": 0.4418, "step": 354 }, { "epoch": 0.3784648187633262, "grad_norm": 1.1708028615880235, "learning_rate": 6.05543710021322e-05, "loss": 0.4483, "step": 355 }, { "epoch": 0.3795309168443497, "grad_norm": 1.6336721778285583, "learning_rate": 6.0724946695095956e-05, "loss": 0.4523, "step": 356 }, { "epoch": 0.3805970149253731, "grad_norm": 0.7634972844271057, "learning_rate": 6.08955223880597e-05, "loss": 0.438, "step": 357 }, { "epoch": 0.3816631130063966, "grad_norm": 2.0551249748572276, "learning_rate": 6.106609808102346e-05, "loss": 0.4465, "step": 358 }, { "epoch": 0.38272921108742003, "grad_norm": 1.1247068935515665, "learning_rate": 6.123667377398721e-05, "loss": 0.4528, "step": 359 }, { "epoch": 0.3837953091684435, "grad_norm": 2.399781202126181, "learning_rate": 6.140724946695097e-05, "loss": 0.4638, "step": 360 }, { "epoch": 0.38486140724946694, "grad_norm": 2.227691194378861, "learning_rate": 6.157782515991472e-05, "loss": 0.4605, "step": 361 }, { "epoch": 0.3859275053304904, "grad_norm": 1.286105573757533, "learning_rate": 6.174840085287847e-05, "loss": 0.4549, "step": 362 }, { "epoch": 0.38699360341151384, "grad_norm": 1.5661635396554183, "learning_rate": 6.191897654584222e-05, "loss": 0.4553, "step": 363 }, { "epoch": 0.3880597014925373, "grad_norm": 1.1356847004536101, "learning_rate": 6.208955223880598e-05, "loss": 0.4551, "step": 364 }, { "epoch": 0.38912579957356075, "grad_norm": 1.5226959517373695, "learning_rate": 6.226012793176973e-05, "loss": 0.4527, "step": 365 }, { "epoch": 0.39019189765458423, "grad_norm": 1.2678150824491392, "learning_rate": 6.243070362473348e-05, "loss": 0.4535, "step": 366 }, { "epoch": 0.39125799573560766, "grad_norm": 1.2767687308223976, "learning_rate": 6.260127931769723e-05, "loss": 0.4533, "step": 367 }, { "epoch": 0.39232409381663114, "grad_norm": 1.2583112180411133, "learning_rate": 6.277185501066099e-05, "loss": 0.4479, "step": 368 }, { "epoch": 0.39339019189765456, "grad_norm": 0.8712817882443821, "learning_rate": 6.294243070362474e-05, "loss": 0.4487, "step": 369 }, { "epoch": 0.39445628997867804, "grad_norm": 1.367324065405987, "learning_rate": 6.311300639658849e-05, "loss": 0.4478, "step": 370 }, { "epoch": 0.39552238805970147, "grad_norm": 0.8890072622408898, "learning_rate": 6.328358208955224e-05, "loss": 0.448, "step": 371 }, { "epoch": 0.39658848614072495, "grad_norm": 1.3147387007679658, "learning_rate": 6.3454157782516e-05, "loss": 0.4462, "step": 372 }, { "epoch": 0.3976545842217484, "grad_norm": 1.0068738388038798, "learning_rate": 6.362473347547975e-05, "loss": 0.4477, "step": 373 }, { "epoch": 0.39872068230277186, "grad_norm": 0.8943784235638984, "learning_rate": 6.37953091684435e-05, "loss": 0.4386, "step": 374 }, { "epoch": 0.3997867803837953, "grad_norm": 0.9000653302119703, "learning_rate": 6.396588486140725e-05, "loss": 0.4473, "step": 375 }, { "epoch": 0.40085287846481876, "grad_norm": 0.9661659593224818, "learning_rate": 6.4136460554371e-05, "loss": 0.438, "step": 376 }, { "epoch": 0.40191897654584224, "grad_norm": 1.2385940487697282, "learning_rate": 6.430703624733477e-05, "loss": 0.4466, "step": 377 }, { "epoch": 0.40298507462686567, "grad_norm": 0.7438645095725858, "learning_rate": 6.447761194029851e-05, "loss": 0.4419, "step": 378 }, { "epoch": 0.40405117270788915, "grad_norm": 0.7506871300171193, "learning_rate": 6.464818763326228e-05, "loss": 0.4448, "step": 379 }, { "epoch": 0.4051172707889126, "grad_norm": 0.9063302132407942, "learning_rate": 6.481876332622601e-05, "loss": 0.443, "step": 380 }, { "epoch": 0.40618336886993606, "grad_norm": 0.7598737346973297, "learning_rate": 6.498933901918978e-05, "loss": 0.4387, "step": 381 }, { "epoch": 0.4072494669509595, "grad_norm": 0.7700124455332041, "learning_rate": 6.515991471215352e-05, "loss": 0.4427, "step": 382 }, { "epoch": 0.40831556503198296, "grad_norm": 0.8602508992172722, "learning_rate": 6.533049040511728e-05, "loss": 0.4417, "step": 383 }, { "epoch": 0.4093816631130064, "grad_norm": 1.0830008793393127, "learning_rate": 6.550106609808102e-05, "loss": 0.4438, "step": 384 }, { "epoch": 0.41044776119402987, "grad_norm": 1.556613073128131, "learning_rate": 6.567164179104479e-05, "loss": 0.4564, "step": 385 }, { "epoch": 0.4115138592750533, "grad_norm": 0.6513365090394059, "learning_rate": 6.584221748400853e-05, "loss": 0.4549, "step": 386 }, { "epoch": 0.4125799573560768, "grad_norm": 1.7019947089510525, "learning_rate": 6.60127931769723e-05, "loss": 0.4577, "step": 387 }, { "epoch": 0.4136460554371002, "grad_norm": 0.8602634122130145, "learning_rate": 6.618336886993603e-05, "loss": 0.4453, "step": 388 }, { "epoch": 0.4147121535181237, "grad_norm": 1.4701074651429544, "learning_rate": 6.63539445628998e-05, "loss": 0.4553, "step": 389 }, { "epoch": 0.4157782515991471, "grad_norm": 1.0042942676179272, "learning_rate": 6.652452025586354e-05, "loss": 0.4517, "step": 390 }, { "epoch": 0.4168443496801706, "grad_norm": 1.2563707215527287, "learning_rate": 6.66950959488273e-05, "loss": 0.4583, "step": 391 }, { "epoch": 0.417910447761194, "grad_norm": 0.8702664926874648, "learning_rate": 6.686567164179104e-05, "loss": 0.4447, "step": 392 }, { "epoch": 0.4189765458422175, "grad_norm": 1.2431851878497833, "learning_rate": 6.703624733475481e-05, "loss": 0.4561, "step": 393 }, { "epoch": 0.4200426439232409, "grad_norm": 0.74543869723929, "learning_rate": 6.720682302771855e-05, "loss": 0.4433, "step": 394 }, { "epoch": 0.4211087420042644, "grad_norm": 1.0726381467701929, "learning_rate": 6.737739872068231e-05, "loss": 0.4435, "step": 395 }, { "epoch": 0.42217484008528783, "grad_norm": 0.8897029348079197, "learning_rate": 6.754797441364605e-05, "loss": 0.4432, "step": 396 }, { "epoch": 0.4232409381663113, "grad_norm": 0.9052325689021318, "learning_rate": 6.771855010660982e-05, "loss": 0.4468, "step": 397 }, { "epoch": 0.42430703624733473, "grad_norm": 0.9903919600049477, "learning_rate": 6.788912579957356e-05, "loss": 0.4421, "step": 398 }, { "epoch": 0.4253731343283582, "grad_norm": 1.2565089097200555, "learning_rate": 6.805970149253732e-05, "loss": 0.4484, "step": 399 }, { "epoch": 0.42643923240938164, "grad_norm": 1.217499983640722, "learning_rate": 6.823027718550106e-05, "loss": 0.45, "step": 400 }, { "epoch": 0.4275053304904051, "grad_norm": 0.8648975399735686, "learning_rate": 6.840085287846483e-05, "loss": 0.4411, "step": 401 }, { "epoch": 0.42857142857142855, "grad_norm": 0.7803010542574316, "learning_rate": 6.857142857142857e-05, "loss": 0.4452, "step": 402 }, { "epoch": 0.42963752665245203, "grad_norm": 1.070484531211215, "learning_rate": 6.874200426439233e-05, "loss": 0.4415, "step": 403 }, { "epoch": 0.43070362473347545, "grad_norm": 1.0518099835702677, "learning_rate": 6.891257995735607e-05, "loss": 0.4483, "step": 404 }, { "epoch": 0.43176972281449894, "grad_norm": 1.0987178262200696, "learning_rate": 6.908315565031984e-05, "loss": 0.4504, "step": 405 }, { "epoch": 0.43283582089552236, "grad_norm": 1.298631454730131, "learning_rate": 6.925373134328358e-05, "loss": 0.4484, "step": 406 }, { "epoch": 0.43390191897654584, "grad_norm": 0.7350226964695964, "learning_rate": 6.942430703624734e-05, "loss": 0.4469, "step": 407 }, { "epoch": 0.4349680170575693, "grad_norm": 1.003473508893585, "learning_rate": 6.95948827292111e-05, "loss": 0.4473, "step": 408 }, { "epoch": 0.43603411513859275, "grad_norm": 1.5274417692020188, "learning_rate": 6.976545842217485e-05, "loss": 0.4406, "step": 409 }, { "epoch": 0.43710021321961623, "grad_norm": 0.795881436552545, "learning_rate": 6.99360341151386e-05, "loss": 0.4458, "step": 410 }, { "epoch": 0.43816631130063965, "grad_norm": 1.8802864262284982, "learning_rate": 7.010660980810235e-05, "loss": 0.452, "step": 411 }, { "epoch": 0.43923240938166314, "grad_norm": 1.0226364397036434, "learning_rate": 7.02771855010661e-05, "loss": 0.4439, "step": 412 }, { "epoch": 0.44029850746268656, "grad_norm": 2.2181657239884736, "learning_rate": 7.044776119402986e-05, "loss": 0.4635, "step": 413 }, { "epoch": 0.44136460554371004, "grad_norm": 1.9407812835735618, "learning_rate": 7.061833688699361e-05, "loss": 0.4592, "step": 414 }, { "epoch": 0.44243070362473347, "grad_norm": 1.309070344848072, "learning_rate": 7.078891257995736e-05, "loss": 0.4465, "step": 415 }, { "epoch": 0.44349680170575695, "grad_norm": 1.2883250095052161, "learning_rate": 7.095948827292111e-05, "loss": 0.4444, "step": 416 }, { "epoch": 0.4445628997867804, "grad_norm": 1.0903343923529414, "learning_rate": 7.113006396588487e-05, "loss": 0.4496, "step": 417 }, { "epoch": 0.44562899786780386, "grad_norm": 1.183664382780786, "learning_rate": 7.130063965884862e-05, "loss": 0.4438, "step": 418 }, { "epoch": 0.4466950959488273, "grad_norm": 0.658411366668824, "learning_rate": 7.147121535181237e-05, "loss": 0.4487, "step": 419 }, { "epoch": 0.44776119402985076, "grad_norm": 0.9629596016665242, "learning_rate": 7.164179104477612e-05, "loss": 0.4492, "step": 420 }, { "epoch": 0.4488272921108742, "grad_norm": 0.6918711094052541, "learning_rate": 7.181236673773988e-05, "loss": 0.4459, "step": 421 }, { "epoch": 0.44989339019189767, "grad_norm": 0.8563536919057261, "learning_rate": 7.198294243070363e-05, "loss": 0.4414, "step": 422 }, { "epoch": 0.4509594882729211, "grad_norm": 0.6997094585258404, "learning_rate": 7.215351812366738e-05, "loss": 0.4371, "step": 423 }, { "epoch": 0.4520255863539446, "grad_norm": 0.9078702301189076, "learning_rate": 7.232409381663113e-05, "loss": 0.4385, "step": 424 }, { "epoch": 0.453091684434968, "grad_norm": 0.9105908002111917, "learning_rate": 7.249466950959489e-05, "loss": 0.4435, "step": 425 }, { "epoch": 0.4541577825159915, "grad_norm": 1.17133908131614, "learning_rate": 7.266524520255864e-05, "loss": 0.4436, "step": 426 }, { "epoch": 0.4552238805970149, "grad_norm": 0.9871418897305207, "learning_rate": 7.283582089552239e-05, "loss": 0.4382, "step": 427 }, { "epoch": 0.4562899786780384, "grad_norm": 1.1416965993237809, "learning_rate": 7.300639658848614e-05, "loss": 0.4413, "step": 428 }, { "epoch": 0.4573560767590618, "grad_norm": 0.8995759612900418, "learning_rate": 7.31769722814499e-05, "loss": 0.441, "step": 429 }, { "epoch": 0.4584221748400853, "grad_norm": 0.9218576549939405, "learning_rate": 7.334754797441365e-05, "loss": 0.4384, "step": 430 }, { "epoch": 0.4594882729211087, "grad_norm": 0.858746072858543, "learning_rate": 7.35181236673774e-05, "loss": 0.4431, "step": 431 }, { "epoch": 0.4605543710021322, "grad_norm": 0.935134912271826, "learning_rate": 7.368869936034115e-05, "loss": 0.4419, "step": 432 }, { "epoch": 0.4616204690831556, "grad_norm": 0.9646962439556347, "learning_rate": 7.38592750533049e-05, "loss": 0.4447, "step": 433 }, { "epoch": 0.4626865671641791, "grad_norm": 1.1656061718106945, "learning_rate": 7.402985074626866e-05, "loss": 0.4449, "step": 434 }, { "epoch": 0.46375266524520253, "grad_norm": 0.7326350434710983, "learning_rate": 7.420042643923241e-05, "loss": 0.4306, "step": 435 }, { "epoch": 0.464818763326226, "grad_norm": 0.4426357716461027, "learning_rate": 7.437100213219616e-05, "loss": 0.4391, "step": 436 }, { "epoch": 0.46588486140724944, "grad_norm": 0.6078030487605691, "learning_rate": 7.454157782515992e-05, "loss": 0.4347, "step": 437 }, { "epoch": 0.4669509594882729, "grad_norm": 0.8052535102563257, "learning_rate": 7.471215351812367e-05, "loss": 0.4399, "step": 438 }, { "epoch": 0.4680170575692964, "grad_norm": 0.993986917853917, "learning_rate": 7.488272921108743e-05, "loss": 0.4491, "step": 439 }, { "epoch": 0.4690831556503198, "grad_norm": 1.1313255325891831, "learning_rate": 7.505330490405117e-05, "loss": 0.4493, "step": 440 }, { "epoch": 0.4701492537313433, "grad_norm": 0.6842654100869138, "learning_rate": 7.522388059701494e-05, "loss": 0.4381, "step": 441 }, { "epoch": 0.47121535181236673, "grad_norm": 0.914667240873549, "learning_rate": 7.539445628997868e-05, "loss": 0.4298, "step": 442 }, { "epoch": 0.4722814498933902, "grad_norm": 1.079253155327782, "learning_rate": 7.556503198294244e-05, "loss": 0.4419, "step": 443 }, { "epoch": 0.47334754797441364, "grad_norm": 0.7735237895647086, "learning_rate": 7.573560767590618e-05, "loss": 0.4363, "step": 444 }, { "epoch": 0.4744136460554371, "grad_norm": 0.8494163950632265, "learning_rate": 7.590618336886995e-05, "loss": 0.4424, "step": 445 }, { "epoch": 0.47547974413646055, "grad_norm": 0.8832423465229242, "learning_rate": 7.607675906183369e-05, "loss": 0.4433, "step": 446 }, { "epoch": 0.47654584221748403, "grad_norm": 0.9230538736492173, "learning_rate": 7.624733475479745e-05, "loss": 0.4373, "step": 447 }, { "epoch": 0.47761194029850745, "grad_norm": 1.0547271971615721, "learning_rate": 7.641791044776119e-05, "loss": 0.4383, "step": 448 }, { "epoch": 0.47867803837953093, "grad_norm": 0.8897022777657935, "learning_rate": 7.658848614072496e-05, "loss": 0.442, "step": 449 }, { "epoch": 0.47974413646055436, "grad_norm": 0.9036068567889025, "learning_rate": 7.675906183368871e-05, "loss": 0.4438, "step": 450 }, { "epoch": 0.48081023454157784, "grad_norm": 0.881957048697129, "learning_rate": 7.692963752665246e-05, "loss": 0.4417, "step": 451 }, { "epoch": 0.48187633262260127, "grad_norm": 0.9500253572061279, "learning_rate": 7.710021321961622e-05, "loss": 0.4366, "step": 452 }, { "epoch": 0.48294243070362475, "grad_norm": 1.2095267797004439, "learning_rate": 7.727078891257997e-05, "loss": 0.4362, "step": 453 }, { "epoch": 0.4840085287846482, "grad_norm": 1.0482792150572497, "learning_rate": 7.744136460554372e-05, "loss": 0.4322, "step": 454 }, { "epoch": 0.48507462686567165, "grad_norm": 1.0807249109863046, "learning_rate": 7.761194029850747e-05, "loss": 0.4463, "step": 455 }, { "epoch": 0.4861407249466951, "grad_norm": 1.0949378713445743, "learning_rate": 7.778251599147123e-05, "loss": 0.4384, "step": 456 }, { "epoch": 0.48720682302771856, "grad_norm": 1.1230076391837633, "learning_rate": 7.795309168443498e-05, "loss": 0.4328, "step": 457 }, { "epoch": 0.488272921108742, "grad_norm": 0.9284196769926588, "learning_rate": 7.812366737739873e-05, "loss": 0.433, "step": 458 }, { "epoch": 0.48933901918976547, "grad_norm": 0.8307602310830237, "learning_rate": 7.829424307036248e-05, "loss": 0.4355, "step": 459 }, { "epoch": 0.4904051172707889, "grad_norm": 0.6845844913178624, "learning_rate": 7.846481876332623e-05, "loss": 0.4324, "step": 460 }, { "epoch": 0.4914712153518124, "grad_norm": 0.6727473022234879, "learning_rate": 7.863539445628999e-05, "loss": 0.4343, "step": 461 }, { "epoch": 0.4925373134328358, "grad_norm": 0.9118111002510331, "learning_rate": 7.880597014925374e-05, "loss": 0.4332, "step": 462 }, { "epoch": 0.4936034115138593, "grad_norm": 1.0468419215009597, "learning_rate": 7.897654584221749e-05, "loss": 0.4303, "step": 463 }, { "epoch": 0.4946695095948827, "grad_norm": 1.0423557702670259, "learning_rate": 7.914712153518124e-05, "loss": 0.4468, "step": 464 }, { "epoch": 0.4957356076759062, "grad_norm": 1.0941125408721255, "learning_rate": 7.9317697228145e-05, "loss": 0.437, "step": 465 }, { "epoch": 0.4968017057569296, "grad_norm": 1.009365737040848, "learning_rate": 7.948827292110875e-05, "loss": 0.4344, "step": 466 }, { "epoch": 0.4978678038379531, "grad_norm": 0.976490369885011, "learning_rate": 7.96588486140725e-05, "loss": 0.4405, "step": 467 }, { "epoch": 0.4989339019189765, "grad_norm": 1.0336971441325562, "learning_rate": 7.982942430703625e-05, "loss": 0.4415, "step": 468 }, { "epoch": 0.5, "grad_norm": 1.1162353727508352, "learning_rate": 8e-05, "loss": 0.441, "step": 469 }, { "epoch": 0.5010660980810234, "grad_norm": 0.870590499557518, "learning_rate": 7.999998892103944e-05, "loss": 0.4385, "step": 470 }, { "epoch": 0.502132196162047, "grad_norm": 1.069571130953163, "learning_rate": 7.999995568416386e-05, "loss": 0.436, "step": 471 }, { "epoch": 0.5031982942430704, "grad_norm": 0.9278286961514632, "learning_rate": 7.99999002893917e-05, "loss": 0.439, "step": 472 }, { "epoch": 0.5042643923240938, "grad_norm": 0.7961408606111287, "learning_rate": 7.999982273675363e-05, "loss": 0.4411, "step": 473 }, { "epoch": 0.5053304904051172, "grad_norm": 0.8234137572564111, "learning_rate": 7.999972302629264e-05, "loss": 0.4301, "step": 474 }, { "epoch": 0.5063965884861408, "grad_norm": 0.8624956407119708, "learning_rate": 7.999960115806391e-05, "loss": 0.4329, "step": 475 }, { "epoch": 0.5074626865671642, "grad_norm": 0.8970331272898654, "learning_rate": 7.999945713213499e-05, "loss": 0.4326, "step": 476 }, { "epoch": 0.5085287846481876, "grad_norm": 1.0727116898193436, "learning_rate": 7.999929094858566e-05, "loss": 0.4345, "step": 477 }, { "epoch": 0.509594882729211, "grad_norm": 1.3157426521337816, "learning_rate": 7.999910260750796e-05, "loss": 0.4431, "step": 478 }, { "epoch": 0.5106609808102346, "grad_norm": 0.8323751755101396, "learning_rate": 7.999889210900623e-05, "loss": 0.4368, "step": 479 }, { "epoch": 0.511727078891258, "grad_norm": 0.7429493009987966, "learning_rate": 7.999865945319708e-05, "loss": 0.4352, "step": 480 }, { "epoch": 0.5127931769722814, "grad_norm": 0.7605461290267354, "learning_rate": 7.999840464020936e-05, "loss": 0.438, "step": 481 }, { "epoch": 0.5138592750533049, "grad_norm": 1.0898292270229712, "learning_rate": 7.999812767018428e-05, "loss": 0.4397, "step": 482 }, { "epoch": 0.5149253731343284, "grad_norm": 1.1380886467744538, "learning_rate": 7.999782854327523e-05, "loss": 0.4367, "step": 483 }, { "epoch": 0.5159914712153518, "grad_norm": 0.923359986404487, "learning_rate": 7.99975072596479e-05, "loss": 0.4384, "step": 484 }, { "epoch": 0.5170575692963753, "grad_norm": 0.8375674330038942, "learning_rate": 7.999716381948029e-05, "loss": 0.436, "step": 485 }, { "epoch": 0.5181236673773987, "grad_norm": 0.7570588135036903, "learning_rate": 7.999679822296263e-05, "loss": 0.4333, "step": 486 }, { "epoch": 0.5191897654584222, "grad_norm": 0.8549659681016898, "learning_rate": 7.999641047029747e-05, "loss": 0.4356, "step": 487 }, { "epoch": 0.5202558635394456, "grad_norm": 0.8852253500570002, "learning_rate": 7.999600056169956e-05, "loss": 0.4367, "step": 488 }, { "epoch": 0.5213219616204691, "grad_norm": 0.9048960575709987, "learning_rate": 7.9995568497396e-05, "loss": 0.435, "step": 489 }, { "epoch": 0.5223880597014925, "grad_norm": 0.88758605404917, "learning_rate": 7.999511427762612e-05, "loss": 0.4326, "step": 490 }, { "epoch": 0.523454157782516, "grad_norm": 0.9051344896994227, "learning_rate": 7.999463790264155e-05, "loss": 0.4345, "step": 491 }, { "epoch": 0.5245202558635395, "grad_norm": 0.9863125025310046, "learning_rate": 7.999413937270616e-05, "loss": 0.4302, "step": 492 }, { "epoch": 0.5255863539445629, "grad_norm": 1.0414867859886516, "learning_rate": 7.999361868809611e-05, "loss": 0.4339, "step": 493 }, { "epoch": 0.5266524520255863, "grad_norm": 0.9324905753531538, "learning_rate": 7.999307584909985e-05, "loss": 0.4411, "step": 494 }, { "epoch": 0.5277185501066098, "grad_norm": 0.8218386585042996, "learning_rate": 7.999251085601804e-05, "loss": 0.4349, "step": 495 }, { "epoch": 0.5287846481876333, "grad_norm": 0.6954291556313583, "learning_rate": 7.999192370916371e-05, "loss": 0.4406, "step": 496 }, { "epoch": 0.5298507462686567, "grad_norm": 0.599463593583278, "learning_rate": 7.999131440886208e-05, "loss": 0.4294, "step": 497 }, { "epoch": 0.5309168443496801, "grad_norm": 0.6547393051285391, "learning_rate": 7.999068295545068e-05, "loss": 0.4257, "step": 498 }, { "epoch": 0.5319829424307037, "grad_norm": 0.5821330571530523, "learning_rate": 7.99900293492793e-05, "loss": 0.4283, "step": 499 }, { "epoch": 0.5330490405117271, "grad_norm": 0.6804915712665963, "learning_rate": 7.998935359071001e-05, "loss": 0.4328, "step": 500 }, { "epoch": 0.5341151385927505, "grad_norm": 0.8261518180898303, "learning_rate": 7.998865568011713e-05, "loss": 0.4272, "step": 501 }, { "epoch": 0.535181236673774, "grad_norm": 0.984885443645453, "learning_rate": 7.998793561788727e-05, "loss": 0.4304, "step": 502 }, { "epoch": 0.5362473347547975, "grad_norm": 1.0533079413867938, "learning_rate": 7.998719340441933e-05, "loss": 0.4325, "step": 503 }, { "epoch": 0.5373134328358209, "grad_norm": 0.7930549662599339, "learning_rate": 7.998642904012442e-05, "loss": 0.4318, "step": 504 }, { "epoch": 0.5383795309168443, "grad_norm": 0.7826200841082803, "learning_rate": 7.998564252542599e-05, "loss": 0.4335, "step": 505 }, { "epoch": 0.5394456289978679, "grad_norm": 0.9783531802962122, "learning_rate": 7.998483386075972e-05, "loss": 0.4359, "step": 506 }, { "epoch": 0.5405117270788913, "grad_norm": 1.0305053743252428, "learning_rate": 7.998400304657356e-05, "loss": 0.4295, "step": 507 }, { "epoch": 0.5415778251599147, "grad_norm": 1.0480888416790912, "learning_rate": 7.998315008332773e-05, "loss": 0.4312, "step": 508 }, { "epoch": 0.5426439232409381, "grad_norm": 1.0913103402452111, "learning_rate": 7.998227497149475e-05, "loss": 0.4359, "step": 509 }, { "epoch": 0.5437100213219617, "grad_norm": 0.7828175941941907, "learning_rate": 7.998137771155938e-05, "loss": 0.4253, "step": 510 }, { "epoch": 0.5447761194029851, "grad_norm": 0.753244557765304, "learning_rate": 7.998045830401864e-05, "loss": 0.4393, "step": 511 }, { "epoch": 0.5458422174840085, "grad_norm": 0.7846311196148755, "learning_rate": 7.997951674938185e-05, "loss": 0.4381, "step": 512 }, { "epoch": 0.5469083155650319, "grad_norm": 0.5773342532146049, "learning_rate": 7.997855304817059e-05, "loss": 0.4324, "step": 513 }, { "epoch": 0.5479744136460555, "grad_norm": 0.5590862741693943, "learning_rate": 7.997756720091866e-05, "loss": 0.4347, "step": 514 }, { "epoch": 0.5490405117270789, "grad_norm": 0.5750952292279558, "learning_rate": 7.997655920817222e-05, "loss": 0.4316, "step": 515 }, { "epoch": 0.5501066098081023, "grad_norm": 0.6003198820780814, "learning_rate": 7.997552907048961e-05, "loss": 0.4315, "step": 516 }, { "epoch": 0.5511727078891258, "grad_norm": 0.7339386833494144, "learning_rate": 7.997447678844148e-05, "loss": 0.4334, "step": 517 }, { "epoch": 0.5522388059701493, "grad_norm": 0.901683791960714, "learning_rate": 7.997340236261076e-05, "loss": 0.4301, "step": 518 }, { "epoch": 0.5533049040511727, "grad_norm": 1.1497802402889508, "learning_rate": 7.997230579359261e-05, "loss": 0.4305, "step": 519 }, { "epoch": 0.5543710021321961, "grad_norm": 1.0986758974484423, "learning_rate": 7.997118708199447e-05, "loss": 0.4319, "step": 520 }, { "epoch": 0.5554371002132196, "grad_norm": 0.9768374453514556, "learning_rate": 7.997004622843603e-05, "loss": 0.4298, "step": 521 }, { "epoch": 0.5565031982942431, "grad_norm": 0.8738815441198616, "learning_rate": 7.996888323354932e-05, "loss": 0.4334, "step": 522 }, { "epoch": 0.5575692963752665, "grad_norm": 0.7838623932210756, "learning_rate": 7.996769809797851e-05, "loss": 0.4188, "step": 523 }, { "epoch": 0.55863539445629, "grad_norm": 0.7503706641165158, "learning_rate": 7.996649082238015e-05, "loss": 0.4293, "step": 524 }, { "epoch": 0.5597014925373134, "grad_norm": 0.7732501093769599, "learning_rate": 7.9965261407423e-05, "loss": 0.4222, "step": 525 }, { "epoch": 0.5607675906183369, "grad_norm": 0.7009167747253003, "learning_rate": 7.996400985378807e-05, "loss": 0.4208, "step": 526 }, { "epoch": 0.5618336886993603, "grad_norm": 0.6810945772619069, "learning_rate": 7.996273616216868e-05, "loss": 0.4327, "step": 527 }, { "epoch": 0.5628997867803838, "grad_norm": 0.683953064924211, "learning_rate": 7.996144033327038e-05, "loss": 0.432, "step": 528 }, { "epoch": 0.5639658848614072, "grad_norm": 0.6899323492718954, "learning_rate": 7.9960122367811e-05, "loss": 0.4213, "step": 529 }, { "epoch": 0.5650319829424307, "grad_norm": 0.7321597867806313, "learning_rate": 7.995878226652061e-05, "loss": 0.4324, "step": 530 }, { "epoch": 0.5660980810234542, "grad_norm": 0.8962870617353619, "learning_rate": 7.995742003014156e-05, "loss": 0.4331, "step": 531 }, { "epoch": 0.5671641791044776, "grad_norm": 0.9482010305629667, "learning_rate": 7.995603565942846e-05, "loss": 0.4305, "step": 532 }, { "epoch": 0.5682302771855011, "grad_norm": 0.8547510485520907, "learning_rate": 7.995462915514819e-05, "loss": 0.428, "step": 533 }, { "epoch": 0.5692963752665245, "grad_norm": 0.7205166965219559, "learning_rate": 7.995320051807987e-05, "loss": 0.4274, "step": 534 }, { "epoch": 0.570362473347548, "grad_norm": 0.6501576811903161, "learning_rate": 7.995174974901489e-05, "loss": 0.4349, "step": 535 }, { "epoch": 0.5714285714285714, "grad_norm": 0.7124693365482168, "learning_rate": 7.99502768487569e-05, "loss": 0.4332, "step": 536 }, { "epoch": 0.5724946695095949, "grad_norm": 0.9137269789343048, "learning_rate": 7.994878181812181e-05, "loss": 0.4329, "step": 537 }, { "epoch": 0.5735607675906184, "grad_norm": 1.042428645077687, "learning_rate": 7.994726465793782e-05, "loss": 0.4251, "step": 538 }, { "epoch": 0.5746268656716418, "grad_norm": 0.9004373071226697, "learning_rate": 7.994572536904529e-05, "loss": 0.4285, "step": 539 }, { "epoch": 0.5756929637526652, "grad_norm": 0.8473522603996991, "learning_rate": 7.994416395229696e-05, "loss": 0.4366, "step": 540 }, { "epoch": 0.5767590618336887, "grad_norm": 0.7717855677683063, "learning_rate": 7.994258040855776e-05, "loss": 0.426, "step": 541 }, { "epoch": 0.5778251599147122, "grad_norm": 0.7447673936934801, "learning_rate": 7.99409747387049e-05, "loss": 0.4301, "step": 542 }, { "epoch": 0.5788912579957356, "grad_norm": 0.7651727619884234, "learning_rate": 7.993934694362782e-05, "loss": 0.429, "step": 543 }, { "epoch": 0.579957356076759, "grad_norm": 0.771334240782246, "learning_rate": 7.993769702422824e-05, "loss": 0.4351, "step": 544 }, { "epoch": 0.5810234541577826, "grad_norm": 0.8138260174231243, "learning_rate": 7.993602498142015e-05, "loss": 0.4296, "step": 545 }, { "epoch": 0.582089552238806, "grad_norm": 0.9083617257757822, "learning_rate": 7.993433081612975e-05, "loss": 0.4207, "step": 546 }, { "epoch": 0.5831556503198294, "grad_norm": 1.015947743360487, "learning_rate": 7.993261452929551e-05, "loss": 0.435, "step": 547 }, { "epoch": 0.5842217484008528, "grad_norm": 0.9636126108742088, "learning_rate": 7.99308761218682e-05, "loss": 0.43, "step": 548 }, { "epoch": 0.5852878464818764, "grad_norm": 0.9480436655111312, "learning_rate": 7.992911559481077e-05, "loss": 0.4333, "step": 549 }, { "epoch": 0.5863539445628998, "grad_norm": 0.9004301184201612, "learning_rate": 7.992733294909848e-05, "loss": 0.4287, "step": 550 }, { "epoch": 0.5874200426439232, "grad_norm": 0.8680389446297649, "learning_rate": 7.992552818571883e-05, "loss": 0.4286, "step": 551 }, { "epoch": 0.5884861407249466, "grad_norm": 0.8221740761869751, "learning_rate": 7.992370130567155e-05, "loss": 0.4264, "step": 552 }, { "epoch": 0.5895522388059702, "grad_norm": 0.6856491729915255, "learning_rate": 7.992185230996864e-05, "loss": 0.4271, "step": 553 }, { "epoch": 0.5906183368869936, "grad_norm": 0.587285396525197, "learning_rate": 7.991998119963436e-05, "loss": 0.4249, "step": 554 }, { "epoch": 0.591684434968017, "grad_norm": 0.6202171961164545, "learning_rate": 7.991808797570519e-05, "loss": 0.437, "step": 555 }, { "epoch": 0.5927505330490405, "grad_norm": 0.5703467722471379, "learning_rate": 7.991617263922988e-05, "loss": 0.4261, "step": 556 }, { "epoch": 0.593816631130064, "grad_norm": 0.5467752825405972, "learning_rate": 7.991423519126945e-05, "loss": 0.4261, "step": 557 }, { "epoch": 0.5948827292110874, "grad_norm": 0.584586914451865, "learning_rate": 7.991227563289713e-05, "loss": 0.4306, "step": 558 }, { "epoch": 0.5959488272921108, "grad_norm": 0.5995377319354194, "learning_rate": 7.991029396519839e-05, "loss": 0.4295, "step": 559 }, { "epoch": 0.5970149253731343, "grad_norm": 0.5825912670190095, "learning_rate": 7.9908290189271e-05, "loss": 0.4246, "step": 560 }, { "epoch": 0.5980810234541578, "grad_norm": 0.5445221492611624, "learning_rate": 7.990626430622494e-05, "loss": 0.4258, "step": 561 }, { "epoch": 0.5991471215351812, "grad_norm": 0.6388885844122285, "learning_rate": 7.990421631718244e-05, "loss": 0.4237, "step": 562 }, { "epoch": 0.6002132196162047, "grad_norm": 0.7985864321661731, "learning_rate": 7.9902146223278e-05, "loss": 0.4263, "step": 563 }, { "epoch": 0.6012793176972282, "grad_norm": 0.9243564293353863, "learning_rate": 7.990005402565831e-05, "loss": 0.4319, "step": 564 }, { "epoch": 0.6023454157782516, "grad_norm": 1.0041597610953659, "learning_rate": 7.989793972548236e-05, "loss": 0.4317, "step": 565 }, { "epoch": 0.603411513859275, "grad_norm": 0.9913301041254353, "learning_rate": 7.989580332392137e-05, "loss": 0.4228, "step": 566 }, { "epoch": 0.6044776119402985, "grad_norm": 0.9215600177004183, "learning_rate": 7.989364482215878e-05, "loss": 0.4349, "step": 567 }, { "epoch": 0.605543710021322, "grad_norm": 0.8910563956821462, "learning_rate": 7.989146422139029e-05, "loss": 0.4256, "step": 568 }, { "epoch": 0.6066098081023454, "grad_norm": 0.9133910919719787, "learning_rate": 7.988926152282384e-05, "loss": 0.4269, "step": 569 }, { "epoch": 0.6076759061833689, "grad_norm": 0.964984044461557, "learning_rate": 7.988703672767962e-05, "loss": 0.4271, "step": 570 }, { "epoch": 0.6087420042643923, "grad_norm": 1.0759011919787849, "learning_rate": 7.988478983719003e-05, "loss": 0.4252, "step": 571 }, { "epoch": 0.6098081023454158, "grad_norm": 0.8837314544429662, "learning_rate": 7.988252085259976e-05, "loss": 0.4297, "step": 572 }, { "epoch": 0.6108742004264393, "grad_norm": 0.6734363195402464, "learning_rate": 7.988022977516569e-05, "loss": 0.4295, "step": 573 }, { "epoch": 0.6119402985074627, "grad_norm": 0.6034655355722486, "learning_rate": 7.987791660615695e-05, "loss": 0.4251, "step": 574 }, { "epoch": 0.6130063965884861, "grad_norm": 0.5138841355005869, "learning_rate": 7.987558134685494e-05, "loss": 0.4257, "step": 575 }, { "epoch": 0.6140724946695096, "grad_norm": 0.5645600852787206, "learning_rate": 7.987322399855324e-05, "loss": 0.4241, "step": 576 }, { "epoch": 0.6151385927505331, "grad_norm": 0.5906802943834981, "learning_rate": 7.987084456255773e-05, "loss": 0.4315, "step": 577 }, { "epoch": 0.6162046908315565, "grad_norm": 0.6711742756532632, "learning_rate": 7.986844304018649e-05, "loss": 0.429, "step": 578 }, { "epoch": 0.6172707889125799, "grad_norm": 0.8364188220124186, "learning_rate": 7.986601943276982e-05, "loss": 0.4195, "step": 579 }, { "epoch": 0.6183368869936035, "grad_norm": 0.9774403295859477, "learning_rate": 7.986357374165028e-05, "loss": 0.4332, "step": 580 }, { "epoch": 0.6194029850746269, "grad_norm": 1.04494654978909, "learning_rate": 7.986110596818265e-05, "loss": 0.428, "step": 581 }, { "epoch": 0.6204690831556503, "grad_norm": 0.9878512856031877, "learning_rate": 7.985861611373397e-05, "loss": 0.4248, "step": 582 }, { "epoch": 0.6215351812366737, "grad_norm": 0.9524713066725912, "learning_rate": 7.985610417968348e-05, "loss": 0.4251, "step": 583 }, { "epoch": 0.6226012793176973, "grad_norm": 0.74751402505921, "learning_rate": 7.985357016742264e-05, "loss": 0.4319, "step": 584 }, { "epoch": 0.6236673773987207, "grad_norm": 0.4034413623064052, "learning_rate": 7.985101407835519e-05, "loss": 0.4271, "step": 585 }, { "epoch": 0.6247334754797441, "grad_norm": 0.43538759493027, "learning_rate": 7.984843591389706e-05, "loss": 0.4295, "step": 586 }, { "epoch": 0.6257995735607675, "grad_norm": 0.5770360682304801, "learning_rate": 7.984583567547643e-05, "loss": 0.4247, "step": 587 }, { "epoch": 0.6268656716417911, "grad_norm": 0.601220491680326, "learning_rate": 7.984321336453364e-05, "loss": 0.4217, "step": 588 }, { "epoch": 0.6279317697228145, "grad_norm": 0.5416809102850237, "learning_rate": 7.984056898252141e-05, "loss": 0.4178, "step": 589 }, { "epoch": 0.6289978678038379, "grad_norm": 0.5991300962093767, "learning_rate": 7.983790253090452e-05, "loss": 0.4268, "step": 590 }, { "epoch": 0.6300639658848614, "grad_norm": 0.5939600973200748, "learning_rate": 7.983521401116005e-05, "loss": 0.4201, "step": 591 }, { "epoch": 0.6311300639658849, "grad_norm": 0.5745808408760033, "learning_rate": 7.983250342477733e-05, "loss": 0.4263, "step": 592 }, { "epoch": 0.6321961620469083, "grad_norm": 0.6369439414081867, "learning_rate": 7.982977077325788e-05, "loss": 0.4221, "step": 593 }, { "epoch": 0.6332622601279317, "grad_norm": 0.7105979728558217, "learning_rate": 7.98270160581154e-05, "loss": 0.4235, "step": 594 }, { "epoch": 0.6343283582089553, "grad_norm": 0.6814030775883104, "learning_rate": 7.982423928087593e-05, "loss": 0.4255, "step": 595 }, { "epoch": 0.6353944562899787, "grad_norm": 0.6048922616611062, "learning_rate": 7.982144044307762e-05, "loss": 0.4216, "step": 596 }, { "epoch": 0.6364605543710021, "grad_norm": 0.6271148450150956, "learning_rate": 7.981861954627088e-05, "loss": 0.429, "step": 597 }, { "epoch": 0.6375266524520256, "grad_norm": 0.7648541301808355, "learning_rate": 7.981577659201833e-05, "loss": 0.425, "step": 598 }, { "epoch": 0.6385927505330491, "grad_norm": 0.7722433311546377, "learning_rate": 7.981291158189486e-05, "loss": 0.4266, "step": 599 }, { "epoch": 0.6396588486140725, "grad_norm": 0.8045597536473933, "learning_rate": 7.98100245174875e-05, "loss": 0.4226, "step": 600 }, { "epoch": 0.6407249466950959, "grad_norm": 0.985587897420224, "learning_rate": 7.980711540039554e-05, "loss": 0.422, "step": 601 }, { "epoch": 0.6417910447761194, "grad_norm": 1.1286184314617196, "learning_rate": 7.980418423223049e-05, "loss": 0.4215, "step": 602 }, { "epoch": 0.6428571428571429, "grad_norm": 0.7080020035877171, "learning_rate": 7.980123101461606e-05, "loss": 0.4247, "step": 603 }, { "epoch": 0.6439232409381663, "grad_norm": 0.5626555829828844, "learning_rate": 7.979825574918818e-05, "loss": 0.419, "step": 604 }, { "epoch": 0.6449893390191898, "grad_norm": 0.636169281039428, "learning_rate": 7.979525843759499e-05, "loss": 0.4278, "step": 605 }, { "epoch": 0.6460554371002132, "grad_norm": 0.7198635719149019, "learning_rate": 7.979223908149685e-05, "loss": 0.4255, "step": 606 }, { "epoch": 0.6471215351812367, "grad_norm": 0.8342343801146002, "learning_rate": 7.978919768256631e-05, "loss": 0.4227, "step": 607 }, { "epoch": 0.6481876332622601, "grad_norm": 0.9031065878045232, "learning_rate": 7.978613424248818e-05, "loss": 0.4308, "step": 608 }, { "epoch": 0.6492537313432836, "grad_norm": 0.8023633786215478, "learning_rate": 7.978304876295941e-05, "loss": 0.419, "step": 609 }, { "epoch": 0.650319829424307, "grad_norm": 0.9239599844896783, "learning_rate": 7.977994124568922e-05, "loss": 0.424, "step": 610 }, { "epoch": 0.6513859275053305, "grad_norm": 0.9482115919888203, "learning_rate": 7.9776811692399e-05, "loss": 0.4186, "step": 611 }, { "epoch": 0.652452025586354, "grad_norm": 0.786317946938952, "learning_rate": 7.977366010482236e-05, "loss": 0.4141, "step": 612 }, { "epoch": 0.6535181236673774, "grad_norm": 0.6264488608057049, "learning_rate": 7.977048648470513e-05, "loss": 0.4209, "step": 613 }, { "epoch": 0.6545842217484008, "grad_norm": 0.7117064313531933, "learning_rate": 7.976729083380532e-05, "loss": 0.4262, "step": 614 }, { "epoch": 0.6556503198294243, "grad_norm": 0.7175291742270824, "learning_rate": 7.976407315389314e-05, "loss": 0.4251, "step": 615 }, { "epoch": 0.6567164179104478, "grad_norm": 0.6490475195317971, "learning_rate": 7.976083344675105e-05, "loss": 0.4137, "step": 616 }, { "epoch": 0.6577825159914712, "grad_norm": 0.6621191439676352, "learning_rate": 7.975757171417365e-05, "loss": 0.4216, "step": 617 }, { "epoch": 0.6588486140724946, "grad_norm": 0.6960151950050518, "learning_rate": 7.97542879579678e-05, "loss": 0.4276, "step": 618 }, { "epoch": 0.6599147121535182, "grad_norm": 0.6648695880576607, "learning_rate": 7.975098217995248e-05, "loss": 0.4209, "step": 619 }, { "epoch": 0.6609808102345416, "grad_norm": 0.6121386791069903, "learning_rate": 7.974765438195897e-05, "loss": 0.419, "step": 620 }, { "epoch": 0.662046908315565, "grad_norm": 0.5748855152032502, "learning_rate": 7.974430456583069e-05, "loss": 0.4249, "step": 621 }, { "epoch": 0.6631130063965884, "grad_norm": 0.5788717188362661, "learning_rate": 7.974093273342325e-05, "loss": 0.4204, "step": 622 }, { "epoch": 0.664179104477612, "grad_norm": 0.499053345156499, "learning_rate": 7.973753888660446e-05, "loss": 0.4258, "step": 623 }, { "epoch": 0.6652452025586354, "grad_norm": 0.6143099826003396, "learning_rate": 7.973412302725435e-05, "loss": 0.4233, "step": 624 }, { "epoch": 0.6663113006396588, "grad_norm": 0.7815047847063621, "learning_rate": 7.973068515726514e-05, "loss": 0.4226, "step": 625 }, { "epoch": 0.6673773987206824, "grad_norm": 0.8746470159019124, "learning_rate": 7.972722527854119e-05, "loss": 0.4286, "step": 626 }, { "epoch": 0.6684434968017058, "grad_norm": 0.8412210250981513, "learning_rate": 7.972374339299915e-05, "loss": 0.4222, "step": 627 }, { "epoch": 0.6695095948827292, "grad_norm": 0.8583591463497402, "learning_rate": 7.972023950256775e-05, "loss": 0.4248, "step": 628 }, { "epoch": 0.6705756929637526, "grad_norm": 0.8975533200078734, "learning_rate": 7.9716713609188e-05, "loss": 0.4332, "step": 629 }, { "epoch": 0.6716417910447762, "grad_norm": 0.9787622637115202, "learning_rate": 7.971316571481306e-05, "loss": 0.4305, "step": 630 }, { "epoch": 0.6727078891257996, "grad_norm": 0.9814190373327007, "learning_rate": 7.970959582140825e-05, "loss": 0.4339, "step": 631 }, { "epoch": 0.673773987206823, "grad_norm": 0.9082356663533697, "learning_rate": 7.970600393095113e-05, "loss": 0.4262, "step": 632 }, { "epoch": 0.6748400852878464, "grad_norm": 0.7973168134733778, "learning_rate": 7.970239004543141e-05, "loss": 0.4263, "step": 633 }, { "epoch": 0.67590618336887, "grad_norm": 0.7223873106620373, "learning_rate": 7.969875416685101e-05, "loss": 0.4174, "step": 634 }, { "epoch": 0.6769722814498934, "grad_norm": 0.7431984730465593, "learning_rate": 7.9695096297224e-05, "loss": 0.4296, "step": 635 }, { "epoch": 0.6780383795309168, "grad_norm": 0.7282657781320044, "learning_rate": 7.969141643857665e-05, "loss": 0.4267, "step": 636 }, { "epoch": 0.6791044776119403, "grad_norm": 0.5869814576553907, "learning_rate": 7.968771459294742e-05, "loss": 0.4191, "step": 637 }, { "epoch": 0.6801705756929638, "grad_norm": 0.5004088468685179, "learning_rate": 7.968399076238694e-05, "loss": 0.4198, "step": 638 }, { "epoch": 0.6812366737739872, "grad_norm": 0.6199279637183518, "learning_rate": 7.968024494895802e-05, "loss": 0.423, "step": 639 }, { "epoch": 0.6823027718550106, "grad_norm": 0.6026223540177856, "learning_rate": 7.967647715473563e-05, "loss": 0.4252, "step": 640 }, { "epoch": 0.6833688699360341, "grad_norm": 0.5355415437778251, "learning_rate": 7.967268738180694e-05, "loss": 0.4131, "step": 641 }, { "epoch": 0.6844349680170576, "grad_norm": 0.5663522144573513, "learning_rate": 7.966887563227132e-05, "loss": 0.4252, "step": 642 }, { "epoch": 0.685501066098081, "grad_norm": 0.5690136220036935, "learning_rate": 7.966504190824021e-05, "loss": 0.4186, "step": 643 }, { "epoch": 0.6865671641791045, "grad_norm": 0.6635581651982301, "learning_rate": 7.966118621183735e-05, "loss": 0.4161, "step": 644 }, { "epoch": 0.6876332622601279, "grad_norm": 0.7754746339129004, "learning_rate": 7.96573085451986e-05, "loss": 0.4285, "step": 645 }, { "epoch": 0.6886993603411514, "grad_norm": 0.9415320034442117, "learning_rate": 7.965340891047196e-05, "loss": 0.4242, "step": 646 }, { "epoch": 0.6897654584221748, "grad_norm": 1.133439191591166, "learning_rate": 7.964948730981763e-05, "loss": 0.4264, "step": 647 }, { "epoch": 0.6908315565031983, "grad_norm": 0.8248991981565036, "learning_rate": 7.964554374540797e-05, "loss": 0.4271, "step": 648 }, { "epoch": 0.6918976545842217, "grad_norm": 0.49941087039661575, "learning_rate": 7.964157821942752e-05, "loss": 0.4185, "step": 649 }, { "epoch": 0.6929637526652452, "grad_norm": 0.5088744299454488, "learning_rate": 7.963759073407297e-05, "loss": 0.4228, "step": 650 }, { "epoch": 0.6940298507462687, "grad_norm": 0.5561088080079999, "learning_rate": 7.963358129155318e-05, "loss": 0.4199, "step": 651 }, { "epoch": 0.6950959488272921, "grad_norm": 0.5380479849912571, "learning_rate": 7.962954989408916e-05, "loss": 0.4251, "step": 652 }, { "epoch": 0.6961620469083155, "grad_norm": 0.5750528453656307, "learning_rate": 7.962549654391412e-05, "loss": 0.4199, "step": 653 }, { "epoch": 0.697228144989339, "grad_norm": 0.6325892582842803, "learning_rate": 7.962142124327338e-05, "loss": 0.4206, "step": 654 }, { "epoch": 0.6982942430703625, "grad_norm": 0.7898343271492005, "learning_rate": 7.961732399442448e-05, "loss": 0.422, "step": 655 }, { "epoch": 0.6993603411513859, "grad_norm": 0.9578185478631844, "learning_rate": 7.961320479963703e-05, "loss": 0.4158, "step": 656 }, { "epoch": 0.7004264392324094, "grad_norm": 1.064875777304307, "learning_rate": 7.96090636611929e-05, "loss": 0.4212, "step": 657 }, { "epoch": 0.7014925373134329, "grad_norm": 0.870822886466798, "learning_rate": 7.960490058138604e-05, "loss": 0.4294, "step": 658 }, { "epoch": 0.7025586353944563, "grad_norm": 0.7665000673413538, "learning_rate": 7.960071556252259e-05, "loss": 0.4293, "step": 659 }, { "epoch": 0.7036247334754797, "grad_norm": 0.7943565964649919, "learning_rate": 7.959650860692082e-05, "loss": 0.4153, "step": 660 }, { "epoch": 0.7046908315565032, "grad_norm": 0.820866857806031, "learning_rate": 7.959227971691118e-05, "loss": 0.4218, "step": 661 }, { "epoch": 0.7057569296375267, "grad_norm": 0.6751476503197974, "learning_rate": 7.958802889483626e-05, "loss": 0.4193, "step": 662 }, { "epoch": 0.7068230277185501, "grad_norm": 0.3829844790409102, "learning_rate": 7.958375614305076e-05, "loss": 0.42, "step": 663 }, { "epoch": 0.7078891257995735, "grad_norm": 0.5522600827629381, "learning_rate": 7.957946146392159e-05, "loss": 0.4224, "step": 664 }, { "epoch": 0.7089552238805971, "grad_norm": 0.7878606313595656, "learning_rate": 7.957514485982778e-05, "loss": 0.4144, "step": 665 }, { "epoch": 0.7100213219616205, "grad_norm": 0.7385093188465528, "learning_rate": 7.95708063331605e-05, "loss": 0.4259, "step": 666 }, { "epoch": 0.7110874200426439, "grad_norm": 0.58178433551703, "learning_rate": 7.956644588632307e-05, "loss": 0.422, "step": 667 }, { "epoch": 0.7121535181236673, "grad_norm": 0.6284258703892865, "learning_rate": 7.956206352173093e-05, "loss": 0.421, "step": 668 }, { "epoch": 0.7132196162046909, "grad_norm": 0.7936414413541673, "learning_rate": 7.95576592418117e-05, "loss": 0.4259, "step": 669 }, { "epoch": 0.7142857142857143, "grad_norm": 0.8521993846503206, "learning_rate": 7.955323304900514e-05, "loss": 0.4228, "step": 670 }, { "epoch": 0.7153518123667377, "grad_norm": 0.7127889848840783, "learning_rate": 7.954878494576312e-05, "loss": 0.4201, "step": 671 }, { "epoch": 0.7164179104477612, "grad_norm": 0.6074747505915871, "learning_rate": 7.954431493454964e-05, "loss": 0.4213, "step": 672 }, { "epoch": 0.7174840085287847, "grad_norm": 0.7129990882836668, "learning_rate": 7.953982301784085e-05, "loss": 0.4234, "step": 673 }, { "epoch": 0.7185501066098081, "grad_norm": 0.7722302015202298, "learning_rate": 7.953530919812506e-05, "loss": 0.4191, "step": 674 }, { "epoch": 0.7196162046908315, "grad_norm": 0.7433421116192664, "learning_rate": 7.95307734779027e-05, "loss": 0.4211, "step": 675 }, { "epoch": 0.720682302771855, "grad_norm": 0.8264421753718556, "learning_rate": 7.95262158596863e-05, "loss": 0.4218, "step": 676 }, { "epoch": 0.7217484008528785, "grad_norm": 1.0203826309064437, "learning_rate": 7.952163634600055e-05, "loss": 0.4171, "step": 677 }, { "epoch": 0.7228144989339019, "grad_norm": 0.8482809756462577, "learning_rate": 7.951703493938226e-05, "loss": 0.4194, "step": 678 }, { "epoch": 0.7238805970149254, "grad_norm": 0.5426953720506598, "learning_rate": 7.951241164238039e-05, "loss": 0.4191, "step": 679 }, { "epoch": 0.7249466950959488, "grad_norm": 0.7193984844467518, "learning_rate": 7.950776645755596e-05, "loss": 0.4213, "step": 680 }, { "epoch": 0.7260127931769723, "grad_norm": 0.7142598516288384, "learning_rate": 7.950309938748221e-05, "loss": 0.4205, "step": 681 }, { "epoch": 0.7270788912579957, "grad_norm": 0.39915429482120396, "learning_rate": 7.949841043474445e-05, "loss": 0.4147, "step": 682 }, { "epoch": 0.7281449893390192, "grad_norm": 0.5327714348171071, "learning_rate": 7.949369960194009e-05, "loss": 0.4183, "step": 683 }, { "epoch": 0.7292110874200426, "grad_norm": 0.5612054527272023, "learning_rate": 7.94889668916787e-05, "loss": 0.4185, "step": 684 }, { "epoch": 0.7302771855010661, "grad_norm": 0.4666415892203282, "learning_rate": 7.948421230658196e-05, "loss": 0.4182, "step": 685 }, { "epoch": 0.7313432835820896, "grad_norm": 0.520314689388758, "learning_rate": 7.947943584928364e-05, "loss": 0.4237, "step": 686 }, { "epoch": 0.732409381663113, "grad_norm": 0.41852601792232424, "learning_rate": 7.947463752242968e-05, "loss": 0.4136, "step": 687 }, { "epoch": 0.7334754797441365, "grad_norm": 0.44187419017345353, "learning_rate": 7.946981732867808e-05, "loss": 0.4188, "step": 688 }, { "epoch": 0.7345415778251599, "grad_norm": 0.5519060149192045, "learning_rate": 7.946497527069898e-05, "loss": 0.4178, "step": 689 }, { "epoch": 0.7356076759061834, "grad_norm": 0.5189998274103562, "learning_rate": 7.946011135117466e-05, "loss": 0.4156, "step": 690 }, { "epoch": 0.7366737739872068, "grad_norm": 0.5845444307770654, "learning_rate": 7.945522557279944e-05, "loss": 0.413, "step": 691 }, { "epoch": 0.7377398720682303, "grad_norm": 0.7034648713070897, "learning_rate": 7.94503179382798e-05, "loss": 0.4234, "step": 692 }, { "epoch": 0.7388059701492538, "grad_norm": 0.8180017083563943, "learning_rate": 7.944538845033431e-05, "loss": 0.4181, "step": 693 }, { "epoch": 0.7398720682302772, "grad_norm": 1.0411030646276598, "learning_rate": 7.944043711169367e-05, "loss": 0.4177, "step": 694 }, { "epoch": 0.7409381663113006, "grad_norm": 1.1183438708338787, "learning_rate": 7.943546392510065e-05, "loss": 0.4245, "step": 695 }, { "epoch": 0.7420042643923241, "grad_norm": 0.7568820848776477, "learning_rate": 7.943046889331013e-05, "loss": 0.4223, "step": 696 }, { "epoch": 0.7430703624733476, "grad_norm": 0.4694996552198355, "learning_rate": 7.94254520190891e-05, "loss": 0.4204, "step": 697 }, { "epoch": 0.744136460554371, "grad_norm": 0.42506834068311544, "learning_rate": 7.942041330521665e-05, "loss": 0.4133, "step": 698 }, { "epoch": 0.7452025586353944, "grad_norm": 0.7432323993182501, "learning_rate": 7.941535275448399e-05, "loss": 0.4204, "step": 699 }, { "epoch": 0.746268656716418, "grad_norm": 1.017099030937784, "learning_rate": 7.941027036969437e-05, "loss": 0.4151, "step": 700 }, { "epoch": 0.7473347547974414, "grad_norm": 1.042873613244398, "learning_rate": 7.940516615366318e-05, "loss": 0.4225, "step": 701 }, { "epoch": 0.7484008528784648, "grad_norm": 0.672037071535835, "learning_rate": 7.940004010921787e-05, "loss": 0.4155, "step": 702 }, { "epoch": 0.7494669509594882, "grad_norm": 0.47977822737782094, "learning_rate": 7.939489223919803e-05, "loss": 0.4239, "step": 703 }, { "epoch": 0.7505330490405118, "grad_norm": 0.5839565536019953, "learning_rate": 7.938972254645529e-05, "loss": 0.4157, "step": 704 }, { "epoch": 0.7515991471215352, "grad_norm": 0.659589798795434, "learning_rate": 7.938453103385343e-05, "loss": 0.4297, "step": 705 }, { "epoch": 0.7526652452025586, "grad_norm": 0.6224572447097061, "learning_rate": 7.937931770426825e-05, "loss": 0.4178, "step": 706 }, { "epoch": 0.753731343283582, "grad_norm": 0.5223926429775323, "learning_rate": 7.937408256058764e-05, "loss": 0.4195, "step": 707 }, { "epoch": 0.7547974413646056, "grad_norm": 0.5263236599515704, "learning_rate": 7.936882560571165e-05, "loss": 0.4225, "step": 708 }, { "epoch": 0.755863539445629, "grad_norm": 0.6701920085872527, "learning_rate": 7.936354684255231e-05, "loss": 0.423, "step": 709 }, { "epoch": 0.7569296375266524, "grad_norm": 0.8271768873467236, "learning_rate": 7.935824627403382e-05, "loss": 0.4197, "step": 710 }, { "epoch": 0.7579957356076759, "grad_norm": 0.8903386742919085, "learning_rate": 7.935292390309239e-05, "loss": 0.4161, "step": 711 }, { "epoch": 0.7590618336886994, "grad_norm": 0.8971908647303243, "learning_rate": 7.934757973267636e-05, "loss": 0.4212, "step": 712 }, { "epoch": 0.7601279317697228, "grad_norm": 0.8738601854484014, "learning_rate": 7.93422137657461e-05, "loss": 0.4145, "step": 713 }, { "epoch": 0.7611940298507462, "grad_norm": 0.8238424939346762, "learning_rate": 7.93368260052741e-05, "loss": 0.4231, "step": 714 }, { "epoch": 0.7622601279317697, "grad_norm": 0.8322879935982136, "learning_rate": 7.933141645424489e-05, "loss": 0.4123, "step": 715 }, { "epoch": 0.7633262260127932, "grad_norm": 0.7762441733310584, "learning_rate": 7.932598511565506e-05, "loss": 0.4139, "step": 716 }, { "epoch": 0.7643923240938166, "grad_norm": 0.7714495728551265, "learning_rate": 7.932053199251332e-05, "loss": 0.4172, "step": 717 }, { "epoch": 0.7654584221748401, "grad_norm": 0.6994068976148212, "learning_rate": 7.931505708784042e-05, "loss": 0.4209, "step": 718 }, { "epoch": 0.7665245202558635, "grad_norm": 0.4918911877930488, "learning_rate": 7.930956040466912e-05, "loss": 0.4187, "step": 719 }, { "epoch": 0.767590618336887, "grad_norm": 0.4206999757056118, "learning_rate": 7.930404194604436e-05, "loss": 0.4197, "step": 720 }, { "epoch": 0.7686567164179104, "grad_norm": 0.491102787820834, "learning_rate": 7.929850171502304e-05, "loss": 0.4223, "step": 721 }, { "epoch": 0.7697228144989339, "grad_norm": 0.5414023421551015, "learning_rate": 7.92929397146742e-05, "loss": 0.4158, "step": 722 }, { "epoch": 0.7707889125799574, "grad_norm": 0.43007919824776014, "learning_rate": 7.928735594807885e-05, "loss": 0.4197, "step": 723 }, { "epoch": 0.7718550106609808, "grad_norm": 0.357619381539661, "learning_rate": 7.928175041833014e-05, "loss": 0.4111, "step": 724 }, { "epoch": 0.7729211087420043, "grad_norm": 0.44213212514382777, "learning_rate": 7.927612312853321e-05, "loss": 0.4233, "step": 725 }, { "epoch": 0.7739872068230277, "grad_norm": 0.4580051310888454, "learning_rate": 7.927047408180533e-05, "loss": 0.409, "step": 726 }, { "epoch": 0.7750533049040512, "grad_norm": 0.5077929282870794, "learning_rate": 7.926480328127573e-05, "loss": 0.4198, "step": 727 }, { "epoch": 0.7761194029850746, "grad_norm": 0.6011826055428627, "learning_rate": 7.925911073008578e-05, "loss": 0.4217, "step": 728 }, { "epoch": 0.7771855010660981, "grad_norm": 0.9129504733266989, "learning_rate": 7.925339643138885e-05, "loss": 0.4231, "step": 729 }, { "epoch": 0.7782515991471215, "grad_norm": 1.2218378663823979, "learning_rate": 7.924766038835035e-05, "loss": 0.4243, "step": 730 }, { "epoch": 0.779317697228145, "grad_norm": 0.7903329161767807, "learning_rate": 7.924190260414776e-05, "loss": 0.4208, "step": 731 }, { "epoch": 0.7803837953091685, "grad_norm": 0.6805821512299308, "learning_rate": 7.923612308197058e-05, "loss": 0.42, "step": 732 }, { "epoch": 0.7814498933901919, "grad_norm": 0.6416299900106649, "learning_rate": 7.923032182502037e-05, "loss": 0.4201, "step": 733 }, { "epoch": 0.7825159914712153, "grad_norm": 0.6659708417529658, "learning_rate": 7.922449883651074e-05, "loss": 0.4144, "step": 734 }, { "epoch": 0.7835820895522388, "grad_norm": 0.5293729604666099, "learning_rate": 7.92186541196673e-05, "loss": 0.4219, "step": 735 }, { "epoch": 0.7846481876332623, "grad_norm": 0.4286984074785958, "learning_rate": 7.921278767772774e-05, "loss": 0.4141, "step": 736 }, { "epoch": 0.7857142857142857, "grad_norm": 0.49672004967076516, "learning_rate": 7.920689951394175e-05, "loss": 0.4222, "step": 737 }, { "epoch": 0.7867803837953091, "grad_norm": 0.6284915886717239, "learning_rate": 7.920098963157108e-05, "loss": 0.4252, "step": 738 }, { "epoch": 0.7878464818763327, "grad_norm": 0.787902960458551, "learning_rate": 7.919505803388949e-05, "loss": 0.4248, "step": 739 }, { "epoch": 0.7889125799573561, "grad_norm": 0.8686899347398345, "learning_rate": 7.918910472418278e-05, "loss": 0.415, "step": 740 }, { "epoch": 0.7899786780383795, "grad_norm": 0.8437670042808346, "learning_rate": 7.918312970574875e-05, "loss": 0.4164, "step": 741 }, { "epoch": 0.7910447761194029, "grad_norm": 0.8540882179405255, "learning_rate": 7.917713298189728e-05, "loss": 0.4253, "step": 742 }, { "epoch": 0.7921108742004265, "grad_norm": 0.9478085360927803, "learning_rate": 7.917111455595023e-05, "loss": 0.4196, "step": 743 }, { "epoch": 0.7931769722814499, "grad_norm": 0.9327562081026817, "learning_rate": 7.916507443124153e-05, "loss": 0.4242, "step": 744 }, { "epoch": 0.7942430703624733, "grad_norm": 0.8551898160448683, "learning_rate": 7.915901261111703e-05, "loss": 0.4178, "step": 745 }, { "epoch": 0.7953091684434968, "grad_norm": 0.6425189560126273, "learning_rate": 7.91529290989347e-05, "loss": 0.4213, "step": 746 }, { "epoch": 0.7963752665245203, "grad_norm": 0.5781816257700045, "learning_rate": 7.91468238980645e-05, "loss": 0.4291, "step": 747 }, { "epoch": 0.7974413646055437, "grad_norm": 0.6921643667155062, "learning_rate": 7.914069701188837e-05, "loss": 0.4197, "step": 748 }, { "epoch": 0.7985074626865671, "grad_norm": 0.7614603203693651, "learning_rate": 7.913454844380031e-05, "loss": 0.4156, "step": 749 }, { "epoch": 0.7995735607675906, "grad_norm": 0.7095354236567498, "learning_rate": 7.912837819720628e-05, "loss": 0.4127, "step": 750 }, { "epoch": 0.8006396588486141, "grad_norm": 0.6873164412119117, "learning_rate": 7.91221862755243e-05, "loss": 0.417, "step": 751 }, { "epoch": 0.8017057569296375, "grad_norm": 0.718103908521537, "learning_rate": 7.911597268218435e-05, "loss": 0.4234, "step": 752 }, { "epoch": 0.802771855010661, "grad_norm": 0.6911031732156341, "learning_rate": 7.910973742062847e-05, "loss": 0.4142, "step": 753 }, { "epoch": 0.8038379530916845, "grad_norm": 0.7537929051659943, "learning_rate": 7.910348049431064e-05, "loss": 0.4161, "step": 754 }, { "epoch": 0.8049040511727079, "grad_norm": 0.845377153307119, "learning_rate": 7.909720190669689e-05, "loss": 0.4135, "step": 755 }, { "epoch": 0.8059701492537313, "grad_norm": 0.9386808147663165, "learning_rate": 7.909090166126523e-05, "loss": 0.4175, "step": 756 }, { "epoch": 0.8070362473347548, "grad_norm": 0.874198077040556, "learning_rate": 7.908457976150565e-05, "loss": 0.4168, "step": 757 }, { "epoch": 0.8081023454157783, "grad_norm": 0.734655468857772, "learning_rate": 7.907823621092017e-05, "loss": 0.4218, "step": 758 }, { "epoch": 0.8091684434968017, "grad_norm": 0.664396615528394, "learning_rate": 7.907187101302279e-05, "loss": 0.4095, "step": 759 }, { "epoch": 0.8102345415778252, "grad_norm": 0.5039424928453715, "learning_rate": 7.90654841713395e-05, "loss": 0.4102, "step": 760 }, { "epoch": 0.8113006396588486, "grad_norm": 0.38736014140762187, "learning_rate": 7.905907568940825e-05, "loss": 0.4075, "step": 761 }, { "epoch": 0.8123667377398721, "grad_norm": 0.5398794260824558, "learning_rate": 7.905264557077905e-05, "loss": 0.4185, "step": 762 }, { "epoch": 0.8134328358208955, "grad_norm": 0.6961113732246285, "learning_rate": 7.904619381901382e-05, "loss": 0.419, "step": 763 }, { "epoch": 0.814498933901919, "grad_norm": 0.7099747167015686, "learning_rate": 7.903972043768652e-05, "loss": 0.4192, "step": 764 }, { "epoch": 0.8155650319829424, "grad_norm": 0.7111718390365964, "learning_rate": 7.903322543038302e-05, "loss": 0.4231, "step": 765 }, { "epoch": 0.8166311300639659, "grad_norm": 0.7243885981248829, "learning_rate": 7.902670880070126e-05, "loss": 0.4213, "step": 766 }, { "epoch": 0.8176972281449894, "grad_norm": 0.8443210902757722, "learning_rate": 7.902017055225111e-05, "loss": 0.4173, "step": 767 }, { "epoch": 0.8187633262260128, "grad_norm": 0.9889792776736421, "learning_rate": 7.901361068865441e-05, "loss": 0.4188, "step": 768 }, { "epoch": 0.8198294243070362, "grad_norm": 1.1016684504340584, "learning_rate": 7.9007029213545e-05, "loss": 0.4206, "step": 769 }, { "epoch": 0.8208955223880597, "grad_norm": 0.6894548537797883, "learning_rate": 7.900042613056864e-05, "loss": 0.4147, "step": 770 }, { "epoch": 0.8219616204690832, "grad_norm": 0.4780239669310881, "learning_rate": 7.899380144338313e-05, "loss": 0.4098, "step": 771 }, { "epoch": 0.8230277185501066, "grad_norm": 0.7367707580027851, "learning_rate": 7.898715515565817e-05, "loss": 0.425, "step": 772 }, { "epoch": 0.82409381663113, "grad_norm": 0.9588196347102074, "learning_rate": 7.898048727107549e-05, "loss": 0.4201, "step": 773 }, { "epoch": 0.8251599147121536, "grad_norm": 0.9328164536411601, "learning_rate": 7.897379779332873e-05, "loss": 0.4086, "step": 774 }, { "epoch": 0.826226012793177, "grad_norm": 0.7663371340787454, "learning_rate": 7.896708672612352e-05, "loss": 0.4157, "step": 775 }, { "epoch": 0.8272921108742004, "grad_norm": 0.5139058112934316, "learning_rate": 7.896035407317746e-05, "loss": 0.417, "step": 776 }, { "epoch": 0.8283582089552238, "grad_norm": 0.5600567127054864, "learning_rate": 7.895359983822004e-05, "loss": 0.4164, "step": 777 }, { "epoch": 0.8294243070362474, "grad_norm": 0.7393036162635956, "learning_rate": 7.894682402499283e-05, "loss": 0.4254, "step": 778 }, { "epoch": 0.8304904051172708, "grad_norm": 0.7834777517715102, "learning_rate": 7.894002663724921e-05, "loss": 0.4156, "step": 779 }, { "epoch": 0.8315565031982942, "grad_norm": 0.760746777211738, "learning_rate": 7.89332076787546e-05, "loss": 0.4183, "step": 780 }, { "epoch": 0.8326226012793176, "grad_norm": 0.7072386884303129, "learning_rate": 7.892636715328638e-05, "loss": 0.4085, "step": 781 }, { "epoch": 0.8336886993603412, "grad_norm": 0.5891467963351983, "learning_rate": 7.89195050646338e-05, "loss": 0.4174, "step": 782 }, { "epoch": 0.8347547974413646, "grad_norm": 0.4907232824657599, "learning_rate": 7.891262141659812e-05, "loss": 0.4172, "step": 783 }, { "epoch": 0.835820895522388, "grad_norm": 0.46554454666441136, "learning_rate": 7.890571621299252e-05, "loss": 0.4135, "step": 784 }, { "epoch": 0.8368869936034116, "grad_norm": 0.448212012101914, "learning_rate": 7.889878945764215e-05, "loss": 0.4187, "step": 785 }, { "epoch": 0.837953091684435, "grad_norm": 0.6062486130131177, "learning_rate": 7.889184115438403e-05, "loss": 0.4184, "step": 786 }, { "epoch": 0.8390191897654584, "grad_norm": 0.6256523573917208, "learning_rate": 7.888487130706719e-05, "loss": 0.4108, "step": 787 }, { "epoch": 0.8400852878464818, "grad_norm": 0.5794981444661234, "learning_rate": 7.887787991955254e-05, "loss": 0.4156, "step": 788 }, { "epoch": 0.8411513859275054, "grad_norm": 0.5092450399277311, "learning_rate": 7.887086699571297e-05, "loss": 0.4138, "step": 789 }, { "epoch": 0.8422174840085288, "grad_norm": 0.3853513561237604, "learning_rate": 7.886383253943326e-05, "loss": 0.4101, "step": 790 }, { "epoch": 0.8432835820895522, "grad_norm": 0.5174800575237039, "learning_rate": 7.885677655461013e-05, "loss": 0.415, "step": 791 }, { "epoch": 0.8443496801705757, "grad_norm": 0.7565196574017777, "learning_rate": 7.884969904515224e-05, "loss": 0.4163, "step": 792 }, { "epoch": 0.8454157782515992, "grad_norm": 0.9209372045007519, "learning_rate": 7.884260001498015e-05, "loss": 0.4153, "step": 793 }, { "epoch": 0.8464818763326226, "grad_norm": 1.0713514936667083, "learning_rate": 7.883547946802637e-05, "loss": 0.4158, "step": 794 }, { "epoch": 0.847547974413646, "grad_norm": 0.8424265293921103, "learning_rate": 7.882833740823531e-05, "loss": 0.4085, "step": 795 }, { "epoch": 0.8486140724946695, "grad_norm": 0.4698049947846306, "learning_rate": 7.882117383956328e-05, "loss": 0.4176, "step": 796 }, { "epoch": 0.849680170575693, "grad_norm": 0.45769071058539357, "learning_rate": 7.881398876597855e-05, "loss": 0.4122, "step": 797 }, { "epoch": 0.8507462686567164, "grad_norm": 0.5978831801648813, "learning_rate": 7.880678219146125e-05, "loss": 0.4134, "step": 798 }, { "epoch": 0.8518123667377399, "grad_norm": 0.6151035364335734, "learning_rate": 7.879955412000348e-05, "loss": 0.413, "step": 799 }, { "epoch": 0.8528784648187633, "grad_norm": 0.5747882183128944, "learning_rate": 7.87923045556092e-05, "loss": 0.4119, "step": 800 }, { "epoch": 0.8539445628997868, "grad_norm": 0.5283977724250003, "learning_rate": 7.878503350229428e-05, "loss": 0.4123, "step": 801 }, { "epoch": 0.8550106609808102, "grad_norm": 0.5661813646706929, "learning_rate": 7.877774096408652e-05, "loss": 0.4173, "step": 802 }, { "epoch": 0.8560767590618337, "grad_norm": 0.7555232731803762, "learning_rate": 7.87704269450256e-05, "loss": 0.414, "step": 803 }, { "epoch": 0.8571428571428571, "grad_norm": 0.9330140626616096, "learning_rate": 7.876309144916312e-05, "loss": 0.4174, "step": 804 }, { "epoch": 0.8582089552238806, "grad_norm": 1.0609474276194846, "learning_rate": 7.875573448056255e-05, "loss": 0.4184, "step": 805 }, { "epoch": 0.8592750533049041, "grad_norm": 0.9156452459182889, "learning_rate": 7.874835604329928e-05, "loss": 0.4129, "step": 806 }, { "epoch": 0.8603411513859275, "grad_norm": 0.7492113965064349, "learning_rate": 7.874095614146057e-05, "loss": 0.4134, "step": 807 }, { "epoch": 0.8614072494669509, "grad_norm": 0.5003348192655799, "learning_rate": 7.873353477914559e-05, "loss": 0.4194, "step": 808 }, { "epoch": 0.8624733475479744, "grad_norm": 0.5127452250412682, "learning_rate": 7.872609196046537e-05, "loss": 0.4098, "step": 809 }, { "epoch": 0.8635394456289979, "grad_norm": 0.6332821308118973, "learning_rate": 7.871862768954285e-05, "loss": 0.4158, "step": 810 }, { "epoch": 0.8646055437100213, "grad_norm": 0.7546855273362159, "learning_rate": 7.871114197051289e-05, "loss": 0.4182, "step": 811 }, { "epoch": 0.8656716417910447, "grad_norm": 0.7220245458809873, "learning_rate": 7.870363480752214e-05, "loss": 0.4088, "step": 812 }, { "epoch": 0.8667377398720683, "grad_norm": 0.6636256075414101, "learning_rate": 7.869610620472918e-05, "loss": 0.4169, "step": 813 }, { "epoch": 0.8678038379530917, "grad_norm": 0.5862374738875737, "learning_rate": 7.86885561663045e-05, "loss": 0.4182, "step": 814 }, { "epoch": 0.8688699360341151, "grad_norm": 0.47483648705484865, "learning_rate": 7.868098469643039e-05, "loss": 0.4151, "step": 815 }, { "epoch": 0.8699360341151386, "grad_norm": 0.45874603100581357, "learning_rate": 7.867339179930108e-05, "loss": 0.4139, "step": 816 }, { "epoch": 0.8710021321961621, "grad_norm": 0.5617556695876103, "learning_rate": 7.866577747912262e-05, "loss": 0.4123, "step": 817 }, { "epoch": 0.8720682302771855, "grad_norm": 0.6132829210691816, "learning_rate": 7.865814174011295e-05, "loss": 0.4102, "step": 818 }, { "epoch": 0.8731343283582089, "grad_norm": 0.6010015927279606, "learning_rate": 7.86504845865019e-05, "loss": 0.419, "step": 819 }, { "epoch": 0.8742004264392325, "grad_norm": 0.7239480812231027, "learning_rate": 7.864280602253109e-05, "loss": 0.4208, "step": 820 }, { "epoch": 0.8752665245202559, "grad_norm": 0.8304065457671317, "learning_rate": 7.863510605245409e-05, "loss": 0.4241, "step": 821 }, { "epoch": 0.8763326226012793, "grad_norm": 0.9073014145416783, "learning_rate": 7.862738468053625e-05, "loss": 0.4175, "step": 822 }, { "epoch": 0.8773987206823027, "grad_norm": 0.8823598817021182, "learning_rate": 7.861964191105483e-05, "loss": 0.4191, "step": 823 }, { "epoch": 0.8784648187633263, "grad_norm": 0.7779060734831246, "learning_rate": 7.861187774829891e-05, "loss": 0.4101, "step": 824 }, { "epoch": 0.8795309168443497, "grad_norm": 0.7580593362013702, "learning_rate": 7.860409219656942e-05, "loss": 0.4122, "step": 825 }, { "epoch": 0.8805970149253731, "grad_norm": 0.877095795887976, "learning_rate": 7.85962852601792e-05, "loss": 0.4186, "step": 826 }, { "epoch": 0.8816631130063965, "grad_norm": 0.965490750532556, "learning_rate": 7.858845694345283e-05, "loss": 0.4114, "step": 827 }, { "epoch": 0.8827292110874201, "grad_norm": 0.9339934525386837, "learning_rate": 7.858060725072682e-05, "loss": 0.4119, "step": 828 }, { "epoch": 0.8837953091684435, "grad_norm": 0.7470003853390679, "learning_rate": 7.857273618634949e-05, "loss": 0.411, "step": 829 }, { "epoch": 0.8848614072494669, "grad_norm": 0.5531734893024891, "learning_rate": 7.8564843754681e-05, "loss": 0.4061, "step": 830 }, { "epoch": 0.8859275053304904, "grad_norm": 0.569407252019787, "learning_rate": 7.855692996009332e-05, "loss": 0.409, "step": 831 }, { "epoch": 0.8869936034115139, "grad_norm": 0.6880810896792248, "learning_rate": 7.854899480697033e-05, "loss": 0.4112, "step": 832 }, { "epoch": 0.8880597014925373, "grad_norm": 0.7188833271346624, "learning_rate": 7.854103829970765e-05, "loss": 0.4159, "step": 833 }, { "epoch": 0.8891257995735607, "grad_norm": 0.6200962748194073, "learning_rate": 7.853306044271281e-05, "loss": 0.4104, "step": 834 }, { "epoch": 0.8901918976545842, "grad_norm": 0.6029255662836998, "learning_rate": 7.852506124040509e-05, "loss": 0.41, "step": 835 }, { "epoch": 0.8912579957356077, "grad_norm": 0.5455766236674676, "learning_rate": 7.851704069721567e-05, "loss": 0.4155, "step": 836 }, { "epoch": 0.8923240938166311, "grad_norm": 0.59815834722666, "learning_rate": 7.850899881758746e-05, "loss": 0.4099, "step": 837 }, { "epoch": 0.8933901918976546, "grad_norm": 0.6269668637980782, "learning_rate": 7.850093560597529e-05, "loss": 0.4086, "step": 838 }, { "epoch": 0.894456289978678, "grad_norm": 0.5257832120028597, "learning_rate": 7.849285106684576e-05, "loss": 0.41, "step": 839 }, { "epoch": 0.8955223880597015, "grad_norm": 0.6116996403200065, "learning_rate": 7.848474520467727e-05, "loss": 0.4201, "step": 840 }, { "epoch": 0.896588486140725, "grad_norm": 0.6371232956894439, "learning_rate": 7.847661802396004e-05, "loss": 0.412, "step": 841 }, { "epoch": 0.8976545842217484, "grad_norm": 0.5766017052591853, "learning_rate": 7.84684695291961e-05, "loss": 0.4166, "step": 842 }, { "epoch": 0.8987206823027718, "grad_norm": 0.6967805160059684, "learning_rate": 7.846029972489932e-05, "loss": 0.4134, "step": 843 }, { "epoch": 0.8997867803837953, "grad_norm": 0.7884289858058776, "learning_rate": 7.845210861559533e-05, "loss": 0.4128, "step": 844 }, { "epoch": 0.9008528784648188, "grad_norm": 0.7180820620379316, "learning_rate": 7.84438962058216e-05, "loss": 0.4151, "step": 845 }, { "epoch": 0.9019189765458422, "grad_norm": 0.595379884431672, "learning_rate": 7.843566250012734e-05, "loss": 0.4197, "step": 846 }, { "epoch": 0.9029850746268657, "grad_norm": 0.44736210153445316, "learning_rate": 7.842740750307362e-05, "loss": 0.4096, "step": 847 }, { "epoch": 0.9040511727078892, "grad_norm": 0.41043276693031083, "learning_rate": 7.841913121923327e-05, "loss": 0.4179, "step": 848 }, { "epoch": 0.9051172707889126, "grad_norm": 0.3848671265951802, "learning_rate": 7.841083365319093e-05, "loss": 0.4119, "step": 849 }, { "epoch": 0.906183368869936, "grad_norm": 0.3978522634845732, "learning_rate": 7.840251480954302e-05, "loss": 0.4109, "step": 850 }, { "epoch": 0.9072494669509595, "grad_norm": 0.5471539558932556, "learning_rate": 7.839417469289773e-05, "loss": 0.4137, "step": 851 }, { "epoch": 0.908315565031983, "grad_norm": 0.6676260565465565, "learning_rate": 7.838581330787508e-05, "loss": 0.4132, "step": 852 }, { "epoch": 0.9093816631130064, "grad_norm": 0.5826596489602703, "learning_rate": 7.837743065910682e-05, "loss": 0.4066, "step": 853 }, { "epoch": 0.9104477611940298, "grad_norm": 0.5230535811454794, "learning_rate": 7.83690267512365e-05, "loss": 0.405, "step": 854 }, { "epoch": 0.9115138592750534, "grad_norm": 0.5626160739139322, "learning_rate": 7.836060158891947e-05, "loss": 0.4089, "step": 855 }, { "epoch": 0.9125799573560768, "grad_norm": 0.4697100322492265, "learning_rate": 7.835215517682282e-05, "loss": 0.4143, "step": 856 }, { "epoch": 0.9136460554371002, "grad_norm": 0.36866479086378523, "learning_rate": 7.834368751962542e-05, "loss": 0.405, "step": 857 }, { "epoch": 0.9147121535181236, "grad_norm": 0.4787182490306844, "learning_rate": 7.833519862201791e-05, "loss": 0.4144, "step": 858 }, { "epoch": 0.9157782515991472, "grad_norm": 0.5878869019859624, "learning_rate": 7.83266884887027e-05, "loss": 0.4183, "step": 859 }, { "epoch": 0.9168443496801706, "grad_norm": 0.7152505035199795, "learning_rate": 7.831815712439397e-05, "loss": 0.4095, "step": 860 }, { "epoch": 0.917910447761194, "grad_norm": 0.7805590910529553, "learning_rate": 7.830960453381764e-05, "loss": 0.4106, "step": 861 }, { "epoch": 0.9189765458422174, "grad_norm": 0.7572280791863363, "learning_rate": 7.830103072171142e-05, "loss": 0.4102, "step": 862 }, { "epoch": 0.920042643923241, "grad_norm": 0.7493776976712094, "learning_rate": 7.829243569282473e-05, "loss": 0.4088, "step": 863 }, { "epoch": 0.9211087420042644, "grad_norm": 0.7609395686522328, "learning_rate": 7.828381945191879e-05, "loss": 0.4111, "step": 864 }, { "epoch": 0.9221748400852878, "grad_norm": 0.8239887481859134, "learning_rate": 7.827518200376654e-05, "loss": 0.4195, "step": 865 }, { "epoch": 0.9232409381663113, "grad_norm": 0.8016884574752428, "learning_rate": 7.826652335315268e-05, "loss": 0.4056, "step": 866 }, { "epoch": 0.9243070362473348, "grad_norm": 0.7736961735336623, "learning_rate": 7.825784350487365e-05, "loss": 0.4184, "step": 867 }, { "epoch": 0.9253731343283582, "grad_norm": 0.8458442232897897, "learning_rate": 7.824914246373764e-05, "loss": 0.4159, "step": 868 }, { "epoch": 0.9264392324093816, "grad_norm": 0.9770784658028318, "learning_rate": 7.824042023456458e-05, "loss": 0.414, "step": 869 }, { "epoch": 0.9275053304904051, "grad_norm": 0.9265743591320977, "learning_rate": 7.823167682218611e-05, "loss": 0.4145, "step": 870 }, { "epoch": 0.9285714285714286, "grad_norm": 0.7609304260280078, "learning_rate": 7.822291223144564e-05, "loss": 0.4133, "step": 871 }, { "epoch": 0.929637526652452, "grad_norm": 0.6283447272354892, "learning_rate": 7.821412646719829e-05, "loss": 0.412, "step": 872 }, { "epoch": 0.9307036247334755, "grad_norm": 0.6090160541584229, "learning_rate": 7.820531953431093e-05, "loss": 0.4083, "step": 873 }, { "epoch": 0.9317697228144989, "grad_norm": 0.7086782043281862, "learning_rate": 7.819649143766215e-05, "loss": 0.4121, "step": 874 }, { "epoch": 0.9328358208955224, "grad_norm": 0.6602890170546788, "learning_rate": 7.818764218214224e-05, "loss": 0.403, "step": 875 }, { "epoch": 0.9339019189765458, "grad_norm": 0.5431433609726145, "learning_rate": 7.817877177265323e-05, "loss": 0.4145, "step": 876 }, { "epoch": 0.9349680170575693, "grad_norm": 0.5717886780149007, "learning_rate": 7.816988021410885e-05, "loss": 0.4162, "step": 877 }, { "epoch": 0.9360341151385928, "grad_norm": 0.518417714023232, "learning_rate": 7.816096751143459e-05, "loss": 0.4058, "step": 878 }, { "epoch": 0.9371002132196162, "grad_norm": 0.43803136693398226, "learning_rate": 7.815203366956762e-05, "loss": 0.4071, "step": 879 }, { "epoch": 0.9381663113006397, "grad_norm": 0.5244904028290258, "learning_rate": 7.814307869345682e-05, "loss": 0.4214, "step": 880 }, { "epoch": 0.9392324093816631, "grad_norm": 0.637533228472145, "learning_rate": 7.813410258806275e-05, "loss": 0.4178, "step": 881 }, { "epoch": 0.9402985074626866, "grad_norm": 0.7047581414477792, "learning_rate": 7.812510535835775e-05, "loss": 0.4073, "step": 882 }, { "epoch": 0.94136460554371, "grad_norm": 0.641172550785535, "learning_rate": 7.811608700932582e-05, "loss": 0.4103, "step": 883 }, { "epoch": 0.9424307036247335, "grad_norm": 0.5994428458639327, "learning_rate": 7.810704754596263e-05, "loss": 0.4135, "step": 884 }, { "epoch": 0.9434968017057569, "grad_norm": 0.6070504903362325, "learning_rate": 7.809798697327558e-05, "loss": 0.4168, "step": 885 }, { "epoch": 0.9445628997867804, "grad_norm": 0.6389004815644979, "learning_rate": 7.808890529628374e-05, "loss": 0.4126, "step": 886 }, { "epoch": 0.9456289978678039, "grad_norm": 0.5841071688845674, "learning_rate": 7.807980252001791e-05, "loss": 0.4163, "step": 887 }, { "epoch": 0.9466950959488273, "grad_norm": 0.5441565394837324, "learning_rate": 7.807067864952055e-05, "loss": 0.4073, "step": 888 }, { "epoch": 0.9477611940298507, "grad_norm": 0.5817224526693675, "learning_rate": 7.806153368984583e-05, "loss": 0.4095, "step": 889 }, { "epoch": 0.9488272921108742, "grad_norm": 0.6150981739518016, "learning_rate": 7.805236764605954e-05, "loss": 0.4111, "step": 890 }, { "epoch": 0.9498933901918977, "grad_norm": 0.6063228912727467, "learning_rate": 7.804318052323922e-05, "loss": 0.4118, "step": 891 }, { "epoch": 0.9509594882729211, "grad_norm": 0.5902912758191065, "learning_rate": 7.803397232647406e-05, "loss": 0.4145, "step": 892 }, { "epoch": 0.9520255863539445, "grad_norm": 0.5500221760128661, "learning_rate": 7.80247430608649e-05, "loss": 0.4173, "step": 893 }, { "epoch": 0.9530916844349681, "grad_norm": 0.5579754203833759, "learning_rate": 7.80154927315243e-05, "loss": 0.4088, "step": 894 }, { "epoch": 0.9541577825159915, "grad_norm": 0.6650648355002524, "learning_rate": 7.800622134357644e-05, "loss": 0.405, "step": 895 }, { "epoch": 0.9552238805970149, "grad_norm": 0.7556125255322795, "learning_rate": 7.799692890215721e-05, "loss": 0.4125, "step": 896 }, { "epoch": 0.9562899786780383, "grad_norm": 0.7165371046432817, "learning_rate": 7.798761541241413e-05, "loss": 0.4086, "step": 897 }, { "epoch": 0.9573560767590619, "grad_norm": 0.6697196433407722, "learning_rate": 7.797828087950637e-05, "loss": 0.4125, "step": 898 }, { "epoch": 0.9584221748400853, "grad_norm": 0.6629917240602252, "learning_rate": 7.79689253086048e-05, "loss": 0.4172, "step": 899 }, { "epoch": 0.9594882729211087, "grad_norm": 0.6424942713817352, "learning_rate": 7.795954870489191e-05, "loss": 0.4131, "step": 900 }, { "epoch": 0.9605543710021321, "grad_norm": 0.4594700765306279, "learning_rate": 7.795015107356186e-05, "loss": 0.4122, "step": 901 }, { "epoch": 0.9616204690831557, "grad_norm": 0.40226566645245615, "learning_rate": 7.794073241982043e-05, "loss": 0.4141, "step": 902 }, { "epoch": 0.9626865671641791, "grad_norm": 0.5260998304778581, "learning_rate": 7.793129274888508e-05, "loss": 0.4119, "step": 903 }, { "epoch": 0.9637526652452025, "grad_norm": 0.628398237724531, "learning_rate": 7.792183206598491e-05, "loss": 0.4088, "step": 904 }, { "epoch": 0.964818763326226, "grad_norm": 0.6607508208979607, "learning_rate": 7.791235037636062e-05, "loss": 0.4126, "step": 905 }, { "epoch": 0.9658848614072495, "grad_norm": 0.7250208135809759, "learning_rate": 7.79028476852646e-05, "loss": 0.4159, "step": 906 }, { "epoch": 0.9669509594882729, "grad_norm": 0.7410413741786345, "learning_rate": 7.789332399796079e-05, "loss": 0.4099, "step": 907 }, { "epoch": 0.9680170575692963, "grad_norm": 0.6585611107689859, "learning_rate": 7.78837793197249e-05, "loss": 0.4099, "step": 908 }, { "epoch": 0.9690831556503199, "grad_norm": 0.6090162859858071, "learning_rate": 7.787421365584414e-05, "loss": 0.4113, "step": 909 }, { "epoch": 0.9701492537313433, "grad_norm": 0.6733319020369772, "learning_rate": 7.786462701161738e-05, "loss": 0.4093, "step": 910 }, { "epoch": 0.9712153518123667, "grad_norm": 0.7485470209020791, "learning_rate": 7.785501939235513e-05, "loss": 0.4141, "step": 911 }, { "epoch": 0.9722814498933902, "grad_norm": 0.6601939210618039, "learning_rate": 7.784539080337955e-05, "loss": 0.4079, "step": 912 }, { "epoch": 0.9733475479744137, "grad_norm": 0.5436166438340034, "learning_rate": 7.783574125002432e-05, "loss": 0.4041, "step": 913 }, { "epoch": 0.9744136460554371, "grad_norm": 0.617167761753248, "learning_rate": 7.782607073763484e-05, "loss": 0.4086, "step": 914 }, { "epoch": 0.9754797441364605, "grad_norm": 0.6443414561615247, "learning_rate": 7.781637927156804e-05, "loss": 0.4104, "step": 915 }, { "epoch": 0.976545842217484, "grad_norm": 0.5741265835867478, "learning_rate": 7.780666685719249e-05, "loss": 0.4148, "step": 916 }, { "epoch": 0.9776119402985075, "grad_norm": 0.42635079769878254, "learning_rate": 7.779693349988839e-05, "loss": 0.4048, "step": 917 }, { "epoch": 0.9786780383795309, "grad_norm": 0.47310627157488677, "learning_rate": 7.77871792050475e-05, "loss": 0.409, "step": 918 }, { "epoch": 0.9797441364605544, "grad_norm": 0.5694527852620955, "learning_rate": 7.777740397807316e-05, "loss": 0.4093, "step": 919 }, { "epoch": 0.9808102345415778, "grad_norm": 0.5924875514072048, "learning_rate": 7.776760782438038e-05, "loss": 0.4111, "step": 920 }, { "epoch": 0.9818763326226013, "grad_norm": 0.49411522824636867, "learning_rate": 7.775779074939571e-05, "loss": 0.4154, "step": 921 }, { "epoch": 0.9829424307036247, "grad_norm": 0.5511529858288959, "learning_rate": 7.77479527585573e-05, "loss": 0.4132, "step": 922 }, { "epoch": 0.9840085287846482, "grad_norm": 0.6881028326855904, "learning_rate": 7.773809385731487e-05, "loss": 0.4013, "step": 923 }, { "epoch": 0.9850746268656716, "grad_norm": 0.8172075890856048, "learning_rate": 7.772821405112974e-05, "loss": 0.417, "step": 924 }, { "epoch": 0.9861407249466951, "grad_norm": 0.9223848954361227, "learning_rate": 7.771831334547483e-05, "loss": 0.4071, "step": 925 }, { "epoch": 0.9872068230277186, "grad_norm": 0.9603810119097379, "learning_rate": 7.77083917458346e-05, "loss": 0.4114, "step": 926 }, { "epoch": 0.988272921108742, "grad_norm": 1.0088704018315873, "learning_rate": 7.769844925770512e-05, "loss": 0.4168, "step": 927 }, { "epoch": 0.9893390191897654, "grad_norm": 0.9263247355978909, "learning_rate": 7.7688485886594e-05, "loss": 0.4068, "step": 928 }, { "epoch": 0.990405117270789, "grad_norm": 0.6336714165848655, "learning_rate": 7.76785016380204e-05, "loss": 0.4039, "step": 929 }, { "epoch": 0.9914712153518124, "grad_norm": 0.3609254347454348, "learning_rate": 7.766849651751512e-05, "loss": 0.4094, "step": 930 }, { "epoch": 0.9925373134328358, "grad_norm": 0.37973843808767777, "learning_rate": 7.765847053062046e-05, "loss": 0.4124, "step": 931 }, { "epoch": 0.9936034115138592, "grad_norm": 0.6199111977710898, "learning_rate": 7.764842368289028e-05, "loss": 0.4065, "step": 932 }, { "epoch": 0.9946695095948828, "grad_norm": 0.7388812044022451, "learning_rate": 7.763835597989005e-05, "loss": 0.4126, "step": 933 }, { "epoch": 0.9957356076759062, "grad_norm": 0.6631941161213446, "learning_rate": 7.762826742719672e-05, "loss": 0.4077, "step": 934 }, { "epoch": 0.9968017057569296, "grad_norm": 0.5013692544152526, "learning_rate": 7.761815803039883e-05, "loss": 0.4088, "step": 935 }, { "epoch": 0.997867803837953, "grad_norm": 0.5138742991856071, "learning_rate": 7.760802779509647e-05, "loss": 0.4109, "step": 936 }, { "epoch": 0.9989339019189766, "grad_norm": 0.5647725207929322, "learning_rate": 7.759787672690124e-05, "loss": 0.4136, "step": 937 }, { "epoch": 1.0, "grad_norm": 0.49646274447841793, "learning_rate": 7.758770483143634e-05, "loss": 0.4046, "step": 938 }, { "epoch": 1.0010660980810235, "grad_norm": 0.41415428574276214, "learning_rate": 7.757751211433646e-05, "loss": 0.3978, "step": 939 }, { "epoch": 1.0021321961620469, "grad_norm": 0.4418990401143746, "learning_rate": 7.75672985812478e-05, "loss": 0.4007, "step": 940 }, { "epoch": 1.0031982942430704, "grad_norm": 0.5702542511470434, "learning_rate": 7.75570642378282e-05, "loss": 0.4008, "step": 941 }, { "epoch": 1.004264392324094, "grad_norm": 0.6510908049787664, "learning_rate": 7.754680908974687e-05, "loss": 0.4006, "step": 942 }, { "epoch": 1.0053304904051172, "grad_norm": 0.7117289661008723, "learning_rate": 7.75365331426847e-05, "loss": 0.4029, "step": 943 }, { "epoch": 1.0063965884861408, "grad_norm": 0.8003621078354415, "learning_rate": 7.752623640233398e-05, "loss": 0.4028, "step": 944 }, { "epoch": 1.007462686567164, "grad_norm": 0.7457196896824791, "learning_rate": 7.751591887439859e-05, "loss": 0.4077, "step": 945 }, { "epoch": 1.0085287846481876, "grad_norm": 0.650649322119117, "learning_rate": 7.75055805645939e-05, "loss": 0.4018, "step": 946 }, { "epoch": 1.0095948827292112, "grad_norm": 0.7354343157171254, "learning_rate": 7.749522147864681e-05, "loss": 0.397, "step": 947 }, { "epoch": 1.0106609808102345, "grad_norm": 0.8392238195122467, "learning_rate": 7.748484162229572e-05, "loss": 0.3956, "step": 948 }, { "epoch": 1.011727078891258, "grad_norm": 0.6811125765381959, "learning_rate": 7.747444100129048e-05, "loss": 0.4008, "step": 949 }, { "epoch": 1.0127931769722816, "grad_norm": 0.5072610455516853, "learning_rate": 7.746401962139255e-05, "loss": 0.3975, "step": 950 }, { "epoch": 1.0138592750533049, "grad_norm": 0.475527294382562, "learning_rate": 7.745357748837482e-05, "loss": 0.3977, "step": 951 }, { "epoch": 1.0149253731343284, "grad_norm": 0.6012477231886632, "learning_rate": 7.744311460802166e-05, "loss": 0.4016, "step": 952 }, { "epoch": 1.0159914712153517, "grad_norm": 0.6774620182327961, "learning_rate": 7.7432630986129e-05, "loss": 0.395, "step": 953 }, { "epoch": 1.0170575692963753, "grad_norm": 0.6361012701129908, "learning_rate": 7.742212662850421e-05, "loss": 0.3961, "step": 954 }, { "epoch": 1.0181236673773988, "grad_norm": 0.4593566880462273, "learning_rate": 7.741160154096614e-05, "loss": 0.4015, "step": 955 }, { "epoch": 1.019189765458422, "grad_norm": 0.4253384790692246, "learning_rate": 7.740105572934516e-05, "loss": 0.3969, "step": 956 }, { "epoch": 1.0202558635394456, "grad_norm": 0.5377056528363119, "learning_rate": 7.739048919948309e-05, "loss": 0.3925, "step": 957 }, { "epoch": 1.0213219616204692, "grad_norm": 0.4792515921280752, "learning_rate": 7.737990195723325e-05, "loss": 0.3954, "step": 958 }, { "epoch": 1.0223880597014925, "grad_norm": 0.37572937988220495, "learning_rate": 7.736929400846041e-05, "loss": 0.3946, "step": 959 }, { "epoch": 1.023454157782516, "grad_norm": 0.4748049142013816, "learning_rate": 7.735866535904083e-05, "loss": 0.3938, "step": 960 }, { "epoch": 1.0245202558635393, "grad_norm": 0.5674490975829488, "learning_rate": 7.734801601486224e-05, "loss": 0.4053, "step": 961 }, { "epoch": 1.0255863539445629, "grad_norm": 0.49630911394420096, "learning_rate": 7.733734598182379e-05, "loss": 0.391, "step": 962 }, { "epoch": 1.0266524520255864, "grad_norm": 0.41816138182017965, "learning_rate": 7.732665526583616e-05, "loss": 0.3974, "step": 963 }, { "epoch": 1.0277185501066097, "grad_norm": 0.48628841101926257, "learning_rate": 7.731594387282144e-05, "loss": 0.3897, "step": 964 }, { "epoch": 1.0287846481876333, "grad_norm": 0.5327676523101473, "learning_rate": 7.730521180871317e-05, "loss": 0.3958, "step": 965 }, { "epoch": 1.0298507462686568, "grad_norm": 0.5698192622774071, "learning_rate": 7.729445907945637e-05, "loss": 0.3996, "step": 966 }, { "epoch": 1.0309168443496801, "grad_norm": 0.6600227288417884, "learning_rate": 7.728368569100749e-05, "loss": 0.398, "step": 967 }, { "epoch": 1.0319829424307037, "grad_norm": 0.7199584858108914, "learning_rate": 7.727289164933443e-05, "loss": 0.3952, "step": 968 }, { "epoch": 1.033049040511727, "grad_norm": 0.6780630536094696, "learning_rate": 7.726207696041653e-05, "loss": 0.3901, "step": 969 }, { "epoch": 1.0341151385927505, "grad_norm": 0.6051350564389102, "learning_rate": 7.725124163024456e-05, "loss": 0.4027, "step": 970 }, { "epoch": 1.035181236673774, "grad_norm": 0.6703569464507834, "learning_rate": 7.724038566482073e-05, "loss": 0.3982, "step": 971 }, { "epoch": 1.0362473347547974, "grad_norm": 0.701997393773405, "learning_rate": 7.722950907015867e-05, "loss": 0.4064, "step": 972 }, { "epoch": 1.037313432835821, "grad_norm": 0.7033610282992381, "learning_rate": 7.721861185228347e-05, "loss": 0.4024, "step": 973 }, { "epoch": 1.0383795309168444, "grad_norm": 0.7094358785590401, "learning_rate": 7.72076940172316e-05, "loss": 0.4075, "step": 974 }, { "epoch": 1.0394456289978677, "grad_norm": 0.5778245595035633, "learning_rate": 7.719675557105101e-05, "loss": 0.399, "step": 975 }, { "epoch": 1.0405117270788913, "grad_norm": 0.495341844608196, "learning_rate": 7.718579651980099e-05, "loss": 0.3987, "step": 976 }, { "epoch": 1.0415778251599148, "grad_norm": 0.5705131082078807, "learning_rate": 7.717481686955231e-05, "loss": 0.4012, "step": 977 }, { "epoch": 1.0426439232409381, "grad_norm": 0.7460655829453172, "learning_rate": 7.71638166263871e-05, "loss": 0.3996, "step": 978 }, { "epoch": 1.0437100213219617, "grad_norm": 0.8323692131007797, "learning_rate": 7.715279579639895e-05, "loss": 0.3893, "step": 979 }, { "epoch": 1.044776119402985, "grad_norm": 0.7432520305478516, "learning_rate": 7.714175438569282e-05, "loss": 0.398, "step": 980 }, { "epoch": 1.0458422174840085, "grad_norm": 0.6728948975330855, "learning_rate": 7.713069240038506e-05, "loss": 0.4017, "step": 981 }, { "epoch": 1.046908315565032, "grad_norm": 0.4983139656283151, "learning_rate": 7.711960984660346e-05, "loss": 0.394, "step": 982 }, { "epoch": 1.0479744136460554, "grad_norm": 0.33754531605342536, "learning_rate": 7.710850673048717e-05, "loss": 0.3944, "step": 983 }, { "epoch": 1.049040511727079, "grad_norm": 0.3725573631734465, "learning_rate": 7.709738305818674e-05, "loss": 0.4005, "step": 984 }, { "epoch": 1.0501066098081024, "grad_norm": 0.4551248038249476, "learning_rate": 7.708623883586409e-05, "loss": 0.3971, "step": 985 }, { "epoch": 1.0511727078891258, "grad_norm": 0.41338624681327313, "learning_rate": 7.707507406969256e-05, "loss": 0.4075, "step": 986 }, { "epoch": 1.0522388059701493, "grad_norm": 0.3693936866798565, "learning_rate": 7.706388876585685e-05, "loss": 0.4035, "step": 987 }, { "epoch": 1.0533049040511726, "grad_norm": 0.3457954659560392, "learning_rate": 7.705268293055302e-05, "loss": 0.3988, "step": 988 }, { "epoch": 1.0543710021321961, "grad_norm": 0.3988913386622119, "learning_rate": 7.704145656998853e-05, "loss": 0.3896, "step": 989 }, { "epoch": 1.0554371002132197, "grad_norm": 0.48450611938317545, "learning_rate": 7.703020969038222e-05, "loss": 0.392, "step": 990 }, { "epoch": 1.056503198294243, "grad_norm": 0.5311541916427419, "learning_rate": 7.701894229796424e-05, "loss": 0.3952, "step": 991 }, { "epoch": 1.0575692963752665, "grad_norm": 0.5299103995739938, "learning_rate": 7.700765439897616e-05, "loss": 0.3959, "step": 992 }, { "epoch": 1.05863539445629, "grad_norm": 0.5180100310098522, "learning_rate": 7.69963459996709e-05, "loss": 0.3953, "step": 993 }, { "epoch": 1.0597014925373134, "grad_norm": 0.6335245544459096, "learning_rate": 7.69850171063127e-05, "loss": 0.3963, "step": 994 }, { "epoch": 1.060767590618337, "grad_norm": 0.7565263057674985, "learning_rate": 7.697366772517719e-05, "loss": 0.4011, "step": 995 }, { "epoch": 1.0618336886993602, "grad_norm": 0.7626681326172062, "learning_rate": 7.696229786255136e-05, "loss": 0.3973, "step": 996 }, { "epoch": 1.0628997867803838, "grad_norm": 0.7180860950610816, "learning_rate": 7.695090752473348e-05, "loss": 0.4045, "step": 997 }, { "epoch": 1.0639658848614073, "grad_norm": 0.6227425406154337, "learning_rate": 7.693949671803323e-05, "loss": 0.3976, "step": 998 }, { "epoch": 1.0650319829424306, "grad_norm": 0.7201441487555086, "learning_rate": 7.69280654487716e-05, "loss": 0.3985, "step": 999 }, { "epoch": 1.0660980810234542, "grad_norm": 0.8997609053026269, "learning_rate": 7.691661372328093e-05, "loss": 0.3986, "step": 1000 }, { "epoch": 1.0671641791044777, "grad_norm": 1.000428399051643, "learning_rate": 7.690514154790485e-05, "loss": 0.3952, "step": 1001 }, { "epoch": 1.068230277185501, "grad_norm": 0.8713614689438668, "learning_rate": 7.689364892899838e-05, "loss": 0.397, "step": 1002 }, { "epoch": 1.0692963752665245, "grad_norm": 0.7060560170201216, "learning_rate": 7.688213587292783e-05, "loss": 0.398, "step": 1003 }, { "epoch": 1.070362473347548, "grad_norm": 0.5741051379265467, "learning_rate": 7.687060238607082e-05, "loss": 0.3979, "step": 1004 }, { "epoch": 1.0714285714285714, "grad_norm": 0.46041215316462103, "learning_rate": 7.685904847481631e-05, "loss": 0.3959, "step": 1005 }, { "epoch": 1.072494669509595, "grad_norm": 0.3666940747393567, "learning_rate": 7.684747414556457e-05, "loss": 0.3933, "step": 1006 }, { "epoch": 1.0735607675906182, "grad_norm": 0.3941609183280189, "learning_rate": 7.683587940472716e-05, "loss": 0.4004, "step": 1007 }, { "epoch": 1.0746268656716418, "grad_norm": 0.46186462913236076, "learning_rate": 7.6824264258727e-05, "loss": 0.4004, "step": 1008 }, { "epoch": 1.0756929637526653, "grad_norm": 0.5333763291785089, "learning_rate": 7.681262871399824e-05, "loss": 0.3989, "step": 1009 }, { "epoch": 1.0767590618336886, "grad_norm": 0.4861997016882574, "learning_rate": 7.680097277698637e-05, "loss": 0.3974, "step": 1010 }, { "epoch": 1.0778251599147122, "grad_norm": 0.420953916020597, "learning_rate": 7.678929645414822e-05, "loss": 0.4036, "step": 1011 }, { "epoch": 1.0788912579957357, "grad_norm": 0.41234855570784623, "learning_rate": 7.67775997519518e-05, "loss": 0.3946, "step": 1012 }, { "epoch": 1.079957356076759, "grad_norm": 0.542278036057536, "learning_rate": 7.676588267687651e-05, "loss": 0.3957, "step": 1013 }, { "epoch": 1.0810234541577826, "grad_norm": 0.6900107120803527, "learning_rate": 7.6754145235413e-05, "loss": 0.3974, "step": 1014 }, { "epoch": 1.0820895522388059, "grad_norm": 0.7866757471169271, "learning_rate": 7.674238743406319e-05, "loss": 0.4015, "step": 1015 }, { "epoch": 1.0831556503198294, "grad_norm": 0.8352910554164142, "learning_rate": 7.673060927934032e-05, "loss": 0.404, "step": 1016 }, { "epoch": 1.084221748400853, "grad_norm": 0.7695238174542417, "learning_rate": 7.671881077776884e-05, "loss": 0.3926, "step": 1017 }, { "epoch": 1.0852878464818763, "grad_norm": 0.5536485102257792, "learning_rate": 7.670699193588453e-05, "loss": 0.3985, "step": 1018 }, { "epoch": 1.0863539445628998, "grad_norm": 0.4228656833300754, "learning_rate": 7.66951527602344e-05, "loss": 0.3996, "step": 1019 }, { "epoch": 1.0874200426439233, "grad_norm": 0.46124002487349797, "learning_rate": 7.668329325737674e-05, "loss": 0.3891, "step": 1020 }, { "epoch": 1.0884861407249466, "grad_norm": 0.4882167434505241, "learning_rate": 7.667141343388111e-05, "loss": 0.4026, "step": 1021 }, { "epoch": 1.0895522388059702, "grad_norm": 0.44551496006705965, "learning_rate": 7.665951329632829e-05, "loss": 0.4028, "step": 1022 }, { "epoch": 1.0906183368869935, "grad_norm": 0.3735062426653466, "learning_rate": 7.664759285131039e-05, "loss": 0.3934, "step": 1023 }, { "epoch": 1.091684434968017, "grad_norm": 0.3517866272882918, "learning_rate": 7.663565210543065e-05, "loss": 0.3908, "step": 1024 }, { "epoch": 1.0927505330490406, "grad_norm": 0.36578234461021986, "learning_rate": 7.662369106530367e-05, "loss": 0.401, "step": 1025 }, { "epoch": 1.0938166311300639, "grad_norm": 0.37034761078040107, "learning_rate": 7.661170973755523e-05, "loss": 0.3963, "step": 1026 }, { "epoch": 1.0948827292110874, "grad_norm": 0.44787182009231896, "learning_rate": 7.659970812882236e-05, "loss": 0.396, "step": 1027 }, { "epoch": 1.095948827292111, "grad_norm": 0.46802593434023804, "learning_rate": 7.658768624575331e-05, "loss": 0.3895, "step": 1028 }, { "epoch": 1.0970149253731343, "grad_norm": 0.46152503018911306, "learning_rate": 7.657564409500763e-05, "loss": 0.4017, "step": 1029 }, { "epoch": 1.0980810234541578, "grad_norm": 0.46014261998376804, "learning_rate": 7.6563581683256e-05, "loss": 0.3986, "step": 1030 }, { "epoch": 1.0991471215351813, "grad_norm": 0.5602013316636195, "learning_rate": 7.655149901718038e-05, "loss": 0.3998, "step": 1031 }, { "epoch": 1.1002132196162047, "grad_norm": 0.6361354791978112, "learning_rate": 7.653939610347393e-05, "loss": 0.3931, "step": 1032 }, { "epoch": 1.1012793176972282, "grad_norm": 0.6096074276843847, "learning_rate": 7.652727294884107e-05, "loss": 0.403, "step": 1033 }, { "epoch": 1.1023454157782515, "grad_norm": 0.6436284856894914, "learning_rate": 7.651512955999737e-05, "loss": 0.4062, "step": 1034 }, { "epoch": 1.103411513859275, "grad_norm": 0.7789944154844247, "learning_rate": 7.650296594366962e-05, "loss": 0.4001, "step": 1035 }, { "epoch": 1.1044776119402986, "grad_norm": 0.7656806635135831, "learning_rate": 7.649078210659587e-05, "loss": 0.3941, "step": 1036 }, { "epoch": 1.105543710021322, "grad_norm": 0.6743206765604671, "learning_rate": 7.647857805552532e-05, "loss": 0.3982, "step": 1037 }, { "epoch": 1.1066098081023454, "grad_norm": 0.6738594516444768, "learning_rate": 7.646635379721837e-05, "loss": 0.3995, "step": 1038 }, { "epoch": 1.1076759061833688, "grad_norm": 0.5665383403100596, "learning_rate": 7.645410933844663e-05, "loss": 0.3996, "step": 1039 }, { "epoch": 1.1087420042643923, "grad_norm": 0.4665621519906685, "learning_rate": 7.644184468599289e-05, "loss": 0.3986, "step": 1040 }, { "epoch": 1.1098081023454158, "grad_norm": 0.39730689152255527, "learning_rate": 7.642955984665113e-05, "loss": 0.4026, "step": 1041 }, { "epoch": 1.1108742004264391, "grad_norm": 0.4095924968703311, "learning_rate": 7.641725482722651e-05, "loss": 0.3944, "step": 1042 }, { "epoch": 1.1119402985074627, "grad_norm": 0.42742701569060193, "learning_rate": 7.640492963453538e-05, "loss": 0.4004, "step": 1043 }, { "epoch": 1.1130063965884862, "grad_norm": 0.4714508774381032, "learning_rate": 7.639258427540526e-05, "loss": 0.3923, "step": 1044 }, { "epoch": 1.1140724946695095, "grad_norm": 0.43458932184535304, "learning_rate": 7.638021875667483e-05, "loss": 0.4013, "step": 1045 }, { "epoch": 1.115138592750533, "grad_norm": 0.4337664712771011, "learning_rate": 7.636783308519394e-05, "loss": 0.3971, "step": 1046 }, { "epoch": 1.1162046908315566, "grad_norm": 0.5374420364179895, "learning_rate": 7.63554272678236e-05, "loss": 0.3987, "step": 1047 }, { "epoch": 1.11727078891258, "grad_norm": 0.599484068723148, "learning_rate": 7.634300131143601e-05, "loss": 0.4031, "step": 1048 }, { "epoch": 1.1183368869936035, "grad_norm": 0.5397453012699273, "learning_rate": 7.63305552229145e-05, "loss": 0.3929, "step": 1049 }, { "epoch": 1.1194029850746268, "grad_norm": 0.49396537617545705, "learning_rate": 7.631808900915355e-05, "loss": 0.4011, "step": 1050 }, { "epoch": 1.1204690831556503, "grad_norm": 0.5246477358086288, "learning_rate": 7.63056026770588e-05, "loss": 0.395, "step": 1051 }, { "epoch": 1.1215351812366738, "grad_norm": 0.4694349641844175, "learning_rate": 7.6293096233547e-05, "loss": 0.4, "step": 1052 }, { "epoch": 1.1226012793176972, "grad_norm": 0.49101259479940984, "learning_rate": 7.628056968554613e-05, "loss": 0.4023, "step": 1053 }, { "epoch": 1.1236673773987207, "grad_norm": 0.5743556431588469, "learning_rate": 7.626802303999519e-05, "loss": 0.3978, "step": 1054 }, { "epoch": 1.1247334754797442, "grad_norm": 0.5806632736863245, "learning_rate": 7.62554563038444e-05, "loss": 0.3934, "step": 1055 }, { "epoch": 1.1257995735607675, "grad_norm": 0.5458834450664017, "learning_rate": 7.624286948405506e-05, "loss": 0.4, "step": 1056 }, { "epoch": 1.126865671641791, "grad_norm": 0.5294205958455052, "learning_rate": 7.623026258759963e-05, "loss": 0.4058, "step": 1057 }, { "epoch": 1.1279317697228146, "grad_norm": 0.517163593116652, "learning_rate": 7.621763562146167e-05, "loss": 0.3987, "step": 1058 }, { "epoch": 1.128997867803838, "grad_norm": 0.4383142153955423, "learning_rate": 7.620498859263584e-05, "loss": 0.3953, "step": 1059 }, { "epoch": 1.1300639658848615, "grad_norm": 0.48624167313347716, "learning_rate": 7.619232150812799e-05, "loss": 0.4012, "step": 1060 }, { "epoch": 1.1311300639658848, "grad_norm": 0.5650158980091758, "learning_rate": 7.617963437495498e-05, "loss": 0.3989, "step": 1061 }, { "epoch": 1.1321961620469083, "grad_norm": 0.6366745345835356, "learning_rate": 7.616692720014484e-05, "loss": 0.3948, "step": 1062 }, { "epoch": 1.1332622601279319, "grad_norm": 0.7232155780875018, "learning_rate": 7.615419999073667e-05, "loss": 0.3894, "step": 1063 }, { "epoch": 1.1343283582089552, "grad_norm": 0.8621803906529987, "learning_rate": 7.614145275378072e-05, "loss": 0.4004, "step": 1064 }, { "epoch": 1.1353944562899787, "grad_norm": 1.031817218923734, "learning_rate": 7.612868549633825e-05, "loss": 0.3913, "step": 1065 }, { "epoch": 1.136460554371002, "grad_norm": 0.995716991471701, "learning_rate": 7.611589822548168e-05, "loss": 0.3989, "step": 1066 }, { "epoch": 1.1375266524520256, "grad_norm": 0.9135720092202024, "learning_rate": 7.61030909482945e-05, "loss": 0.4013, "step": 1067 }, { "epoch": 1.138592750533049, "grad_norm": 0.7949810308021543, "learning_rate": 7.609026367187125e-05, "loss": 0.3977, "step": 1068 }, { "epoch": 1.1396588486140724, "grad_norm": 0.7103297637427972, "learning_rate": 7.607741640331761e-05, "loss": 0.3925, "step": 1069 }, { "epoch": 1.140724946695096, "grad_norm": 0.6765477797632053, "learning_rate": 7.606454914975029e-05, "loss": 0.3957, "step": 1070 }, { "epoch": 1.1417910447761195, "grad_norm": 0.6061563911680509, "learning_rate": 7.605166191829705e-05, "loss": 0.3952, "step": 1071 }, { "epoch": 1.1428571428571428, "grad_norm": 0.6115108481298734, "learning_rate": 7.603875471609677e-05, "loss": 0.4006, "step": 1072 }, { "epoch": 1.1439232409381663, "grad_norm": 0.7609527472496548, "learning_rate": 7.602582755029938e-05, "loss": 0.3987, "step": 1073 }, { "epoch": 1.1449893390191899, "grad_norm": 0.7173940431562806, "learning_rate": 7.601288042806582e-05, "loss": 0.3995, "step": 1074 }, { "epoch": 1.1460554371002132, "grad_norm": 0.5756472561097495, "learning_rate": 7.599991335656817e-05, "loss": 0.3929, "step": 1075 }, { "epoch": 1.1471215351812367, "grad_norm": 0.5051191561774657, "learning_rate": 7.598692634298949e-05, "loss": 0.3974, "step": 1076 }, { "epoch": 1.14818763326226, "grad_norm": 0.506878322162707, "learning_rate": 7.59739193945239e-05, "loss": 0.3988, "step": 1077 }, { "epoch": 1.1492537313432836, "grad_norm": 0.4945186749103627, "learning_rate": 7.596089251837659e-05, "loss": 0.3967, "step": 1078 }, { "epoch": 1.150319829424307, "grad_norm": 0.5094561991761961, "learning_rate": 7.594784572176378e-05, "loss": 0.3945, "step": 1079 }, { "epoch": 1.1513859275053304, "grad_norm": 0.5005613849300656, "learning_rate": 7.593477901191268e-05, "loss": 0.3962, "step": 1080 }, { "epoch": 1.152452025586354, "grad_norm": 0.39722111509884156, "learning_rate": 7.592169239606161e-05, "loss": 0.407, "step": 1081 }, { "epoch": 1.1535181236673775, "grad_norm": 0.3086629383868203, "learning_rate": 7.590858588145985e-05, "loss": 0.3968, "step": 1082 }, { "epoch": 1.1545842217484008, "grad_norm": 0.32105906743248047, "learning_rate": 7.589545947536774e-05, "loss": 0.4023, "step": 1083 }, { "epoch": 1.1556503198294243, "grad_norm": 0.393821577059538, "learning_rate": 7.588231318505661e-05, "loss": 0.4, "step": 1084 }, { "epoch": 1.1567164179104479, "grad_norm": 0.44231624171196116, "learning_rate": 7.586914701780886e-05, "loss": 0.4008, "step": 1085 }, { "epoch": 1.1577825159914712, "grad_norm": 0.4026841091750872, "learning_rate": 7.585596098091782e-05, "loss": 0.3966, "step": 1086 }, { "epoch": 1.1588486140724947, "grad_norm": 0.4378761127318457, "learning_rate": 7.58427550816879e-05, "loss": 0.3967, "step": 1087 }, { "epoch": 1.159914712153518, "grad_norm": 0.5373953761219459, "learning_rate": 7.582952932743445e-05, "loss": 0.4052, "step": 1088 }, { "epoch": 1.1609808102345416, "grad_norm": 0.6376217391121629, "learning_rate": 7.581628372548388e-05, "loss": 0.3968, "step": 1089 }, { "epoch": 1.1620469083155651, "grad_norm": 0.7124571886953409, "learning_rate": 7.580301828317354e-05, "loss": 0.3967, "step": 1090 }, { "epoch": 1.1631130063965884, "grad_norm": 0.710369838726693, "learning_rate": 7.578973300785182e-05, "loss": 0.3926, "step": 1091 }, { "epoch": 1.164179104477612, "grad_norm": 0.601219772248201, "learning_rate": 7.577642790687805e-05, "loss": 0.3977, "step": 1092 }, { "epoch": 1.1652452025586353, "grad_norm": 0.4939938541558801, "learning_rate": 7.57631029876226e-05, "loss": 0.3976, "step": 1093 }, { "epoch": 1.1663113006396588, "grad_norm": 0.5843429305407729, "learning_rate": 7.574975825746673e-05, "loss": 0.3985, "step": 1094 }, { "epoch": 1.1673773987206824, "grad_norm": 0.7423520288508058, "learning_rate": 7.573639372380277e-05, "loss": 0.3937, "step": 1095 }, { "epoch": 1.1684434968017057, "grad_norm": 0.6911804134372755, "learning_rate": 7.572300939403395e-05, "loss": 0.3998, "step": 1096 }, { "epoch": 1.1695095948827292, "grad_norm": 0.5980230638871175, "learning_rate": 7.570960527557452e-05, "loss": 0.3927, "step": 1097 }, { "epoch": 1.1705756929637527, "grad_norm": 0.5740905237257593, "learning_rate": 7.569618137584964e-05, "loss": 0.3966, "step": 1098 }, { "epoch": 1.171641791044776, "grad_norm": 0.5934276451380195, "learning_rate": 7.568273770229546e-05, "loss": 0.3958, "step": 1099 }, { "epoch": 1.1727078891257996, "grad_norm": 0.6018407068991795, "learning_rate": 7.566927426235909e-05, "loss": 0.392, "step": 1100 }, { "epoch": 1.1737739872068231, "grad_norm": 0.5584205973152891, "learning_rate": 7.565579106349857e-05, "loss": 0.403, "step": 1101 }, { "epoch": 1.1748400852878464, "grad_norm": 0.5611127016532973, "learning_rate": 7.564228811318288e-05, "loss": 0.3946, "step": 1102 }, { "epoch": 1.17590618336887, "grad_norm": 0.6160481911439404, "learning_rate": 7.562876541889195e-05, "loss": 0.3942, "step": 1103 }, { "epoch": 1.1769722814498933, "grad_norm": 0.5055559528539143, "learning_rate": 7.561522298811667e-05, "loss": 0.3928, "step": 1104 }, { "epoch": 1.1780383795309168, "grad_norm": 0.4031249818431398, "learning_rate": 7.560166082835883e-05, "loss": 0.3947, "step": 1105 }, { "epoch": 1.1791044776119404, "grad_norm": 0.4844578748616443, "learning_rate": 7.558807894713116e-05, "loss": 0.4, "step": 1106 }, { "epoch": 1.1801705756929637, "grad_norm": 0.5923421522905422, "learning_rate": 7.557447735195732e-05, "loss": 0.3932, "step": 1107 }, { "epoch": 1.1812366737739872, "grad_norm": 0.5732059132565734, "learning_rate": 7.556085605037191e-05, "loss": 0.396, "step": 1108 }, { "epoch": 1.1823027718550105, "grad_norm": 0.5727038238228728, "learning_rate": 7.554721504992038e-05, "loss": 0.3942, "step": 1109 }, { "epoch": 1.183368869936034, "grad_norm": 0.5941550331742329, "learning_rate": 7.553355435815915e-05, "loss": 0.3921, "step": 1110 }, { "epoch": 1.1844349680170576, "grad_norm": 0.6162721480823822, "learning_rate": 7.551987398265554e-05, "loss": 0.3973, "step": 1111 }, { "epoch": 1.1855010660980811, "grad_norm": 0.6503625421341107, "learning_rate": 7.550617393098777e-05, "loss": 0.4, "step": 1112 }, { "epoch": 1.1865671641791045, "grad_norm": 0.6428847327632391, "learning_rate": 7.549245421074496e-05, "loss": 0.4032, "step": 1113 }, { "epoch": 1.187633262260128, "grad_norm": 0.6516650235926786, "learning_rate": 7.54787148295271e-05, "loss": 0.3972, "step": 1114 }, { "epoch": 1.1886993603411513, "grad_norm": 0.6996227895669477, "learning_rate": 7.546495579494512e-05, "loss": 0.3977, "step": 1115 }, { "epoch": 1.1897654584221748, "grad_norm": 0.6011748919765059, "learning_rate": 7.54511771146208e-05, "loss": 0.3937, "step": 1116 }, { "epoch": 1.1908315565031984, "grad_norm": 0.4608169210715356, "learning_rate": 7.54373787961868e-05, "loss": 0.4012, "step": 1117 }, { "epoch": 1.1918976545842217, "grad_norm": 0.41047190471676254, "learning_rate": 7.542356084728669e-05, "loss": 0.3999, "step": 1118 }, { "epoch": 1.1929637526652452, "grad_norm": 0.41683875837640383, "learning_rate": 7.540972327557487e-05, "loss": 0.3962, "step": 1119 }, { "epoch": 1.1940298507462686, "grad_norm": 0.4017382310711894, "learning_rate": 7.539586608871667e-05, "loss": 0.3971, "step": 1120 }, { "epoch": 1.195095948827292, "grad_norm": 0.3327575746497003, "learning_rate": 7.538198929438823e-05, "loss": 0.3998, "step": 1121 }, { "epoch": 1.1961620469083156, "grad_norm": 0.36060866682655995, "learning_rate": 7.536809290027657e-05, "loss": 0.3892, "step": 1122 }, { "epoch": 1.197228144989339, "grad_norm": 0.4016423989687354, "learning_rate": 7.53541769140796e-05, "loss": 0.3928, "step": 1123 }, { "epoch": 1.1982942430703625, "grad_norm": 0.4667699426682389, "learning_rate": 7.5340241343506e-05, "loss": 0.3927, "step": 1124 }, { "epoch": 1.199360341151386, "grad_norm": 0.52739114765906, "learning_rate": 7.532628619627541e-05, "loss": 0.3965, "step": 1125 }, { "epoch": 1.2004264392324093, "grad_norm": 0.5145519240447523, "learning_rate": 7.531231148011821e-05, "loss": 0.3934, "step": 1126 }, { "epoch": 1.2014925373134329, "grad_norm": 0.46867941356549736, "learning_rate": 7.529831720277569e-05, "loss": 0.4006, "step": 1127 }, { "epoch": 1.2025586353944564, "grad_norm": 0.394042649849781, "learning_rate": 7.528430337199995e-05, "loss": 0.4003, "step": 1128 }, { "epoch": 1.2036247334754797, "grad_norm": 0.449136494508411, "learning_rate": 7.527026999555393e-05, "loss": 0.3978, "step": 1129 }, { "epoch": 1.2046908315565032, "grad_norm": 0.5439656094014544, "learning_rate": 7.525621708121136e-05, "loss": 0.3974, "step": 1130 }, { "epoch": 1.2057569296375266, "grad_norm": 0.6999516585400055, "learning_rate": 7.524214463675686e-05, "loss": 0.4066, "step": 1131 }, { "epoch": 1.20682302771855, "grad_norm": 0.9223862712790244, "learning_rate": 7.522805266998582e-05, "loss": 0.3935, "step": 1132 }, { "epoch": 1.2078891257995736, "grad_norm": 1.005522314616768, "learning_rate": 7.521394118870446e-05, "loss": 0.3996, "step": 1133 }, { "epoch": 1.208955223880597, "grad_norm": 0.9189093535473554, "learning_rate": 7.519981020072979e-05, "loss": 0.3986, "step": 1134 }, { "epoch": 1.2100213219616205, "grad_norm": 0.7474366825523275, "learning_rate": 7.518565971388967e-05, "loss": 0.3975, "step": 1135 }, { "epoch": 1.2110874200426438, "grad_norm": 0.517722383461889, "learning_rate": 7.51714897360227e-05, "loss": 0.3915, "step": 1136 }, { "epoch": 1.2121535181236673, "grad_norm": 0.460287240625528, "learning_rate": 7.515730027497836e-05, "loss": 0.3991, "step": 1137 }, { "epoch": 1.2132196162046909, "grad_norm": 0.5513920397892742, "learning_rate": 7.514309133861684e-05, "loss": 0.3876, "step": 1138 }, { "epoch": 1.2142857142857142, "grad_norm": 0.5926185140798839, "learning_rate": 7.512886293480914e-05, "loss": 0.3987, "step": 1139 }, { "epoch": 1.2153518123667377, "grad_norm": 0.6015497667677062, "learning_rate": 7.51146150714371e-05, "loss": 0.3942, "step": 1140 }, { "epoch": 1.2164179104477613, "grad_norm": 0.4979357814188255, "learning_rate": 7.510034775639324e-05, "loss": 0.3962, "step": 1141 }, { "epoch": 1.2174840085287846, "grad_norm": 0.3795778689452729, "learning_rate": 7.508606099758097e-05, "loss": 0.4005, "step": 1142 }, { "epoch": 1.2185501066098081, "grad_norm": 0.3928656441581091, "learning_rate": 7.507175480291437e-05, "loss": 0.3984, "step": 1143 }, { "epoch": 1.2196162046908317, "grad_norm": 0.3930681703186637, "learning_rate": 7.505742918031836e-05, "loss": 0.3984, "step": 1144 }, { "epoch": 1.220682302771855, "grad_norm": 0.4091879483768399, "learning_rate": 7.504308413772856e-05, "loss": 0.3933, "step": 1145 }, { "epoch": 1.2217484008528785, "grad_norm": 0.46891356577284954, "learning_rate": 7.502871968309139e-05, "loss": 0.3988, "step": 1146 }, { "epoch": 1.2228144989339018, "grad_norm": 0.5103860857976652, "learning_rate": 7.5014335824364e-05, "loss": 0.3935, "step": 1147 }, { "epoch": 1.2238805970149254, "grad_norm": 0.5082917446932953, "learning_rate": 7.499993256951433e-05, "loss": 0.3927, "step": 1148 }, { "epoch": 1.224946695095949, "grad_norm": 0.49379137994921035, "learning_rate": 7.498550992652101e-05, "loss": 0.4001, "step": 1149 }, { "epoch": 1.2260127931769722, "grad_norm": 0.5353013303840448, "learning_rate": 7.497106790337345e-05, "loss": 0.3929, "step": 1150 }, { "epoch": 1.2270788912579957, "grad_norm": 0.5876139459836854, "learning_rate": 7.495660650807174e-05, "loss": 0.3994, "step": 1151 }, { "epoch": 1.2281449893390193, "grad_norm": 0.734210823795574, "learning_rate": 7.494212574862682e-05, "loss": 0.3967, "step": 1152 }, { "epoch": 1.2292110874200426, "grad_norm": 0.7913917318938646, "learning_rate": 7.49276256330602e-05, "loss": 0.3936, "step": 1153 }, { "epoch": 1.2302771855010661, "grad_norm": 0.7405044856069717, "learning_rate": 7.491310616940422e-05, "loss": 0.4, "step": 1154 }, { "epoch": 1.2313432835820897, "grad_norm": 0.6663912293042208, "learning_rate": 7.489856736570192e-05, "loss": 0.4047, "step": 1155 }, { "epoch": 1.232409381663113, "grad_norm": 0.6021454402370039, "learning_rate": 7.488400923000703e-05, "loss": 0.3907, "step": 1156 }, { "epoch": 1.2334754797441365, "grad_norm": 0.5514602002833817, "learning_rate": 7.4869431770384e-05, "loss": 0.3978, "step": 1157 }, { "epoch": 1.2345415778251598, "grad_norm": 0.48903597186198305, "learning_rate": 7.485483499490799e-05, "loss": 0.4005, "step": 1158 }, { "epoch": 1.2356076759061834, "grad_norm": 0.5332694417143288, "learning_rate": 7.484021891166486e-05, "loss": 0.3952, "step": 1159 }, { "epoch": 1.236673773987207, "grad_norm": 0.49543835364305194, "learning_rate": 7.482558352875113e-05, "loss": 0.3986, "step": 1160 }, { "epoch": 1.2377398720682302, "grad_norm": 0.4261853368714419, "learning_rate": 7.481092885427408e-05, "loss": 0.39, "step": 1161 }, { "epoch": 1.2388059701492538, "grad_norm": 0.4265314566032864, "learning_rate": 7.479625489635162e-05, "loss": 0.3911, "step": 1162 }, { "epoch": 1.239872068230277, "grad_norm": 0.5029650666087466, "learning_rate": 7.478156166311236e-05, "loss": 0.3994, "step": 1163 }, { "epoch": 1.2409381663113006, "grad_norm": 0.4455729452597588, "learning_rate": 7.476684916269559e-05, "loss": 0.3952, "step": 1164 }, { "epoch": 1.2420042643923241, "grad_norm": 0.47338288349681196, "learning_rate": 7.475211740325127e-05, "loss": 0.3969, "step": 1165 }, { "epoch": 1.2430703624733475, "grad_norm": 0.4305124151121982, "learning_rate": 7.473736639294004e-05, "loss": 0.3967, "step": 1166 }, { "epoch": 1.244136460554371, "grad_norm": 0.40696000141373095, "learning_rate": 7.472259613993316e-05, "loss": 0.393, "step": 1167 }, { "epoch": 1.2452025586353945, "grad_norm": 0.45477308879606393, "learning_rate": 7.470780665241262e-05, "loss": 0.3958, "step": 1168 }, { "epoch": 1.2462686567164178, "grad_norm": 0.5704960820413824, "learning_rate": 7.469299793857101e-05, "loss": 0.3989, "step": 1169 }, { "epoch": 1.2473347547974414, "grad_norm": 0.6135689230632373, "learning_rate": 7.467817000661159e-05, "loss": 0.3921, "step": 1170 }, { "epoch": 1.248400852878465, "grad_norm": 0.6777534760469583, "learning_rate": 7.466332286474826e-05, "loss": 0.3962, "step": 1171 }, { "epoch": 1.2494669509594882, "grad_norm": 0.6898047595171254, "learning_rate": 7.464845652120557e-05, "loss": 0.3915, "step": 1172 }, { "epoch": 1.2505330490405118, "grad_norm": 0.7123893467359865, "learning_rate": 7.46335709842187e-05, "loss": 0.4025, "step": 1173 }, { "epoch": 1.251599147121535, "grad_norm": 0.6820123806207717, "learning_rate": 7.461866626203348e-05, "loss": 0.3941, "step": 1174 }, { "epoch": 1.2526652452025586, "grad_norm": 0.6533571683107087, "learning_rate": 7.460374236290631e-05, "loss": 0.4025, "step": 1175 }, { "epoch": 1.2537313432835822, "grad_norm": 0.6571872810240704, "learning_rate": 7.45887992951043e-05, "loss": 0.3976, "step": 1176 }, { "epoch": 1.2547974413646055, "grad_norm": 0.6849183239389355, "learning_rate": 7.457383706690511e-05, "loss": 0.3976, "step": 1177 }, { "epoch": 1.255863539445629, "grad_norm": 0.6130858125092414, "learning_rate": 7.455885568659705e-05, "loss": 0.3917, "step": 1178 }, { "epoch": 1.2569296375266523, "grad_norm": 0.5653602132939001, "learning_rate": 7.454385516247899e-05, "loss": 0.3898, "step": 1179 }, { "epoch": 1.2579957356076759, "grad_norm": 0.6091279178319541, "learning_rate": 7.452883550286049e-05, "loss": 0.3928, "step": 1180 }, { "epoch": 1.2590618336886994, "grad_norm": 0.5595753993224933, "learning_rate": 7.451379671606162e-05, "loss": 0.3957, "step": 1181 }, { "epoch": 1.260127931769723, "grad_norm": 0.5079986278587565, "learning_rate": 7.449873881041312e-05, "loss": 0.3893, "step": 1182 }, { "epoch": 1.2611940298507462, "grad_norm": 0.464090085159201, "learning_rate": 7.448366179425628e-05, "loss": 0.3926, "step": 1183 }, { "epoch": 1.2622601279317698, "grad_norm": 0.510243217894761, "learning_rate": 7.446856567594294e-05, "loss": 0.3937, "step": 1184 }, { "epoch": 1.263326226012793, "grad_norm": 0.46150844552833925, "learning_rate": 7.445345046383563e-05, "loss": 0.3947, "step": 1185 }, { "epoch": 1.2643923240938166, "grad_norm": 0.3949887975107038, "learning_rate": 7.443831616630734e-05, "loss": 0.3884, "step": 1186 }, { "epoch": 1.2654584221748402, "grad_norm": 0.5026720516501154, "learning_rate": 7.442316279174172e-05, "loss": 0.3906, "step": 1187 }, { "epoch": 1.2665245202558635, "grad_norm": 0.502955271204735, "learning_rate": 7.440799034853294e-05, "loss": 0.3906, "step": 1188 }, { "epoch": 1.267590618336887, "grad_norm": 0.43070711981801013, "learning_rate": 7.439279884508573e-05, "loss": 0.3959, "step": 1189 }, { "epoch": 1.2686567164179103, "grad_norm": 0.3992942313925878, "learning_rate": 7.437758828981542e-05, "loss": 0.3979, "step": 1190 }, { "epoch": 1.2697228144989339, "grad_norm": 0.41776106633422166, "learning_rate": 7.436235869114785e-05, "loss": 0.3888, "step": 1191 }, { "epoch": 1.2707889125799574, "grad_norm": 0.3678243374968389, "learning_rate": 7.434711005751942e-05, "loss": 0.3929, "step": 1192 }, { "epoch": 1.271855010660981, "grad_norm": 0.4462778916558436, "learning_rate": 7.43318423973771e-05, "loss": 0.4052, "step": 1193 }, { "epoch": 1.2729211087420043, "grad_norm": 0.4072295381721561, "learning_rate": 7.431655571917835e-05, "loss": 0.3901, "step": 1194 }, { "epoch": 1.2739872068230278, "grad_norm": 0.33668391253724705, "learning_rate": 7.430125003139124e-05, "loss": 0.395, "step": 1195 }, { "epoch": 1.275053304904051, "grad_norm": 0.49984395156609207, "learning_rate": 7.428592534249429e-05, "loss": 0.3921, "step": 1196 }, { "epoch": 1.2761194029850746, "grad_norm": 0.4961401301018006, "learning_rate": 7.42705816609766e-05, "loss": 0.3986, "step": 1197 }, { "epoch": 1.2771855010660982, "grad_norm": 0.36987736865515364, "learning_rate": 7.425521899533776e-05, "loss": 0.3863, "step": 1198 }, { "epoch": 1.2782515991471215, "grad_norm": 0.37433483688551117, "learning_rate": 7.42398373540879e-05, "loss": 0.4008, "step": 1199 }, { "epoch": 1.279317697228145, "grad_norm": 0.3960664301566479, "learning_rate": 7.422443674574764e-05, "loss": 0.3936, "step": 1200 }, { "epoch": 1.2803837953091683, "grad_norm": 0.42962763499848106, "learning_rate": 7.42090171788481e-05, "loss": 0.3968, "step": 1201 }, { "epoch": 1.2814498933901919, "grad_norm": 0.4947438283060391, "learning_rate": 7.419357866193097e-05, "loss": 0.3974, "step": 1202 }, { "epoch": 1.2825159914712154, "grad_norm": 0.7638002002790177, "learning_rate": 7.417812120354833e-05, "loss": 0.3987, "step": 1203 }, { "epoch": 1.2835820895522387, "grad_norm": 0.6618293847952098, "learning_rate": 7.416264481226284e-05, "loss": 0.3954, "step": 1204 }, { "epoch": 1.2846481876332623, "grad_norm": 0.7564644303522643, "learning_rate": 7.414714949664761e-05, "loss": 0.4025, "step": 1205 }, { "epoch": 1.2857142857142856, "grad_norm": 0.7341957818072022, "learning_rate": 7.413163526528623e-05, "loss": 0.3972, "step": 1206 }, { "epoch": 1.2867803837953091, "grad_norm": 0.7172296615238862, "learning_rate": 7.41161021267728e-05, "loss": 0.4024, "step": 1207 }, { "epoch": 1.2878464818763327, "grad_norm": 0.6773018518559085, "learning_rate": 7.410055008971186e-05, "loss": 0.3986, "step": 1208 }, { "epoch": 1.2889125799573562, "grad_norm": 0.7499881724261244, "learning_rate": 7.40849791627184e-05, "loss": 0.401, "step": 1209 }, { "epoch": 1.2899786780383795, "grad_norm": 0.6552329889222517, "learning_rate": 7.406938935441795e-05, "loss": 0.399, "step": 1210 }, { "epoch": 1.291044776119403, "grad_norm": 0.6452927503574722, "learning_rate": 7.405378067344645e-05, "loss": 0.4023, "step": 1211 }, { "epoch": 1.2921108742004264, "grad_norm": 0.6122823637184173, "learning_rate": 7.403815312845027e-05, "loss": 0.4003, "step": 1212 }, { "epoch": 1.29317697228145, "grad_norm": 0.4885206292712561, "learning_rate": 7.402250672808627e-05, "loss": 0.3948, "step": 1213 }, { "epoch": 1.2942430703624734, "grad_norm": 0.36971065365308836, "learning_rate": 7.400684148102175e-05, "loss": 0.3983, "step": 1214 }, { "epoch": 1.2953091684434968, "grad_norm": 0.5341249123573403, "learning_rate": 7.399115739593444e-05, "loss": 0.3998, "step": 1215 }, { "epoch": 1.2963752665245203, "grad_norm": 0.6951670150170282, "learning_rate": 7.397545448151249e-05, "loss": 0.3917, "step": 1216 }, { "epoch": 1.2974413646055436, "grad_norm": 0.6307408026599629, "learning_rate": 7.395973274645452e-05, "loss": 0.3966, "step": 1217 }, { "epoch": 1.2985074626865671, "grad_norm": 0.41244623390075813, "learning_rate": 7.394399219946955e-05, "loss": 0.3943, "step": 1218 }, { "epoch": 1.2995735607675907, "grad_norm": 0.4959351860092428, "learning_rate": 7.392823284927704e-05, "loss": 0.3969, "step": 1219 }, { "epoch": 1.3006396588486142, "grad_norm": 0.6354141360008508, "learning_rate": 7.391245470460682e-05, "loss": 0.3912, "step": 1220 }, { "epoch": 1.3017057569296375, "grad_norm": 0.5545244238798931, "learning_rate": 7.389665777419916e-05, "loss": 0.3959, "step": 1221 }, { "epoch": 1.302771855010661, "grad_norm": 0.4981798925494521, "learning_rate": 7.388084206680477e-05, "loss": 0.3975, "step": 1222 }, { "epoch": 1.3038379530916844, "grad_norm": 0.6996743346801269, "learning_rate": 7.386500759118472e-05, "loss": 0.3969, "step": 1223 }, { "epoch": 1.304904051172708, "grad_norm": 0.8545480674302531, "learning_rate": 7.384915435611047e-05, "loss": 0.3911, "step": 1224 }, { "epoch": 1.3059701492537314, "grad_norm": 0.9034425556959509, "learning_rate": 7.38332823703639e-05, "loss": 0.3984, "step": 1225 }, { "epoch": 1.3070362473347548, "grad_norm": 0.8405317648162198, "learning_rate": 7.381739164273727e-05, "loss": 0.3919, "step": 1226 }, { "epoch": 1.3081023454157783, "grad_norm": 0.7611193691575167, "learning_rate": 7.380148218203321e-05, "loss": 0.3936, "step": 1227 }, { "epoch": 1.3091684434968016, "grad_norm": 0.7529452227808444, "learning_rate": 7.378555399706473e-05, "loss": 0.3976, "step": 1228 }, { "epoch": 1.3102345415778252, "grad_norm": 0.7498395498416355, "learning_rate": 7.376960709665522e-05, "loss": 0.3938, "step": 1229 }, { "epoch": 1.3113006396588487, "grad_norm": 0.617409199350578, "learning_rate": 7.375364148963845e-05, "loss": 0.4, "step": 1230 }, { "epoch": 1.312366737739872, "grad_norm": 0.5012176007830088, "learning_rate": 7.373765718485851e-05, "loss": 0.3898, "step": 1231 }, { "epoch": 1.3134328358208955, "grad_norm": 0.43946542231558117, "learning_rate": 7.37216541911699e-05, "loss": 0.4011, "step": 1232 }, { "epoch": 1.3144989339019189, "grad_norm": 0.37173647672592797, "learning_rate": 7.370563251743743e-05, "loss": 0.4035, "step": 1233 }, { "epoch": 1.3155650319829424, "grad_norm": 0.3453534441885958, "learning_rate": 7.368959217253627e-05, "loss": 0.3997, "step": 1234 }, { "epoch": 1.316631130063966, "grad_norm": 0.35475360394383565, "learning_rate": 7.367353316535195e-05, "loss": 0.396, "step": 1235 }, { "epoch": 1.3176972281449895, "grad_norm": 0.4807258068112041, "learning_rate": 7.365745550478034e-05, "loss": 0.3981, "step": 1236 }, { "epoch": 1.3187633262260128, "grad_norm": 0.5770643962715295, "learning_rate": 7.364135919972759e-05, "loss": 0.397, "step": 1237 }, { "epoch": 1.3198294243070363, "grad_norm": 0.5494777777905933, "learning_rate": 7.362524425911024e-05, "loss": 0.3937, "step": 1238 }, { "epoch": 1.3208955223880596, "grad_norm": 0.4823030916464439, "learning_rate": 7.360911069185513e-05, "loss": 0.3946, "step": 1239 }, { "epoch": 1.3219616204690832, "grad_norm": 0.44738974205793797, "learning_rate": 7.35929585068994e-05, "loss": 0.3987, "step": 1240 }, { "epoch": 1.3230277185501067, "grad_norm": 0.4647985462455625, "learning_rate": 7.357678771319055e-05, "loss": 0.3989, "step": 1241 }, { "epoch": 1.32409381663113, "grad_norm": 0.4907957965266697, "learning_rate": 7.356059831968634e-05, "loss": 0.3928, "step": 1242 }, { "epoch": 1.3251599147121536, "grad_norm": 0.4834632280662174, "learning_rate": 7.354439033535486e-05, "loss": 0.3925, "step": 1243 }, { "epoch": 1.3262260127931769, "grad_norm": 0.5312494947802512, "learning_rate": 7.352816376917448e-05, "loss": 0.395, "step": 1244 }, { "epoch": 1.3272921108742004, "grad_norm": 0.6134924022779664, "learning_rate": 7.351191863013387e-05, "loss": 0.3952, "step": 1245 }, { "epoch": 1.328358208955224, "grad_norm": 0.6854173044473056, "learning_rate": 7.349565492723204e-05, "loss": 0.3993, "step": 1246 }, { "epoch": 1.3294243070362473, "grad_norm": 0.6787952149521692, "learning_rate": 7.347937266947817e-05, "loss": 0.388, "step": 1247 }, { "epoch": 1.3304904051172708, "grad_norm": 0.5985162637767774, "learning_rate": 7.346307186589183e-05, "loss": 0.3964, "step": 1248 }, { "epoch": 1.331556503198294, "grad_norm": 0.5267115292559772, "learning_rate": 7.344675252550278e-05, "loss": 0.397, "step": 1249 }, { "epoch": 1.3326226012793176, "grad_norm": 0.4627942903946516, "learning_rate": 7.343041465735115e-05, "loss": 0.398, "step": 1250 }, { "epoch": 1.3336886993603412, "grad_norm": 0.38809726070928446, "learning_rate": 7.34140582704872e-05, "loss": 0.3912, "step": 1251 }, { "epoch": 1.3347547974413647, "grad_norm": 0.38768792360594717, "learning_rate": 7.339768337397156e-05, "loss": 0.3976, "step": 1252 }, { "epoch": 1.335820895522388, "grad_norm": 0.8280978007735911, "learning_rate": 7.338128997687505e-05, "loss": 0.3916, "step": 1253 }, { "epoch": 1.3368869936034116, "grad_norm": 0.5768542990929353, "learning_rate": 7.336487808827878e-05, "loss": 0.4006, "step": 1254 }, { "epoch": 1.3379530916844349, "grad_norm": 0.5660915592706494, "learning_rate": 7.334844771727407e-05, "loss": 0.3889, "step": 1255 }, { "epoch": 1.3390191897654584, "grad_norm": 0.5472277393608467, "learning_rate": 7.333199887296249e-05, "loss": 0.3966, "step": 1256 }, { "epoch": 1.340085287846482, "grad_norm": 0.591414528568585, "learning_rate": 7.331553156445585e-05, "loss": 0.3942, "step": 1257 }, { "epoch": 1.3411513859275053, "grad_norm": 0.551915916206759, "learning_rate": 7.329904580087618e-05, "loss": 0.3981, "step": 1258 }, { "epoch": 1.3422174840085288, "grad_norm": 0.5979571239616868, "learning_rate": 7.328254159135575e-05, "loss": 0.391, "step": 1259 }, { "epoch": 1.3432835820895521, "grad_norm": 0.6293378397487646, "learning_rate": 7.326601894503703e-05, "loss": 0.39, "step": 1260 }, { "epoch": 1.3443496801705757, "grad_norm": 0.6163722703678122, "learning_rate": 7.324947787107267e-05, "loss": 0.3967, "step": 1261 }, { "epoch": 1.3454157782515992, "grad_norm": 0.5746957535175811, "learning_rate": 7.323291837862561e-05, "loss": 0.3948, "step": 1262 }, { "epoch": 1.3464818763326227, "grad_norm": 0.5421145237930761, "learning_rate": 7.321634047686895e-05, "loss": 0.3914, "step": 1263 }, { "epoch": 1.347547974413646, "grad_norm": 0.5487443991159977, "learning_rate": 7.319974417498594e-05, "loss": 0.3895, "step": 1264 }, { "epoch": 1.3486140724946696, "grad_norm": 0.5644289944018167, "learning_rate": 7.318312948217012e-05, "loss": 0.3928, "step": 1265 }, { "epoch": 1.349680170575693, "grad_norm": 0.6211957603031802, "learning_rate": 7.316649640762515e-05, "loss": 0.3914, "step": 1266 }, { "epoch": 1.3507462686567164, "grad_norm": 0.6243920858668569, "learning_rate": 7.314984496056487e-05, "loss": 0.3918, "step": 1267 }, { "epoch": 1.35181236673774, "grad_norm": 0.48121327241122974, "learning_rate": 7.313317515021334e-05, "loss": 0.3949, "step": 1268 }, { "epoch": 1.3528784648187633, "grad_norm": 0.42705279500592713, "learning_rate": 7.311648698580475e-05, "loss": 0.3914, "step": 1269 }, { "epoch": 1.3539445628997868, "grad_norm": 0.4868001076889179, "learning_rate": 7.309978047658348e-05, "loss": 0.3969, "step": 1270 }, { "epoch": 1.3550106609808101, "grad_norm": 0.5141089848205457, "learning_rate": 7.308305563180409e-05, "loss": 0.3947, "step": 1271 }, { "epoch": 1.3560767590618337, "grad_norm": 0.5120767848632203, "learning_rate": 7.306631246073125e-05, "loss": 0.393, "step": 1272 }, { "epoch": 1.3571428571428572, "grad_norm": 0.5231030391185801, "learning_rate": 7.30495509726398e-05, "loss": 0.3897, "step": 1273 }, { "epoch": 1.3582089552238805, "grad_norm": 0.43134602113074105, "learning_rate": 7.303277117681475e-05, "loss": 0.3955, "step": 1274 }, { "epoch": 1.359275053304904, "grad_norm": 0.2918563082914735, "learning_rate": 7.301597308255124e-05, "loss": 0.3933, "step": 1275 }, { "epoch": 1.3603411513859274, "grad_norm": 0.2886464392254564, "learning_rate": 7.299915669915454e-05, "loss": 0.3947, "step": 1276 }, { "epoch": 1.361407249466951, "grad_norm": 0.3719373094507802, "learning_rate": 7.298232203594003e-05, "loss": 0.3972, "step": 1277 }, { "epoch": 1.3624733475479744, "grad_norm": 0.3893538092048672, "learning_rate": 7.296546910223327e-05, "loss": 0.3976, "step": 1278 }, { "epoch": 1.363539445628998, "grad_norm": 0.33287474885809093, "learning_rate": 7.294859790736989e-05, "loss": 0.3899, "step": 1279 }, { "epoch": 1.3646055437100213, "grad_norm": 0.44445251040661327, "learning_rate": 7.293170846069564e-05, "loss": 0.3982, "step": 1280 }, { "epoch": 1.3656716417910448, "grad_norm": 0.5752395269544717, "learning_rate": 7.291480077156642e-05, "loss": 0.3968, "step": 1281 }, { "epoch": 1.3667377398720681, "grad_norm": 0.5884059035360013, "learning_rate": 7.289787484934823e-05, "loss": 0.387, "step": 1282 }, { "epoch": 1.3678038379530917, "grad_norm": 0.6411711941306089, "learning_rate": 7.288093070341709e-05, "loss": 0.3979, "step": 1283 }, { "epoch": 1.3688699360341152, "grad_norm": 0.7297554798439111, "learning_rate": 7.286396834315925e-05, "loss": 0.3975, "step": 1284 }, { "epoch": 1.3699360341151385, "grad_norm": 0.8133268333459146, "learning_rate": 7.284698777797091e-05, "loss": 0.3958, "step": 1285 }, { "epoch": 1.371002132196162, "grad_norm": 0.8171389709979388, "learning_rate": 7.282998901725846e-05, "loss": 0.3935, "step": 1286 }, { "epoch": 1.3720682302771854, "grad_norm": 0.6715918102841708, "learning_rate": 7.281297207043832e-05, "loss": 0.3963, "step": 1287 }, { "epoch": 1.373134328358209, "grad_norm": 0.5821302267603919, "learning_rate": 7.279593694693698e-05, "loss": 0.3957, "step": 1288 }, { "epoch": 1.3742004264392325, "grad_norm": 0.5793056879165892, "learning_rate": 7.277888365619104e-05, "loss": 0.399, "step": 1289 }, { "epoch": 1.375266524520256, "grad_norm": 0.6131326531548551, "learning_rate": 7.276181220764713e-05, "loss": 0.391, "step": 1290 }, { "epoch": 1.3763326226012793, "grad_norm": 0.6655360716979257, "learning_rate": 7.274472261076192e-05, "loss": 0.3936, "step": 1291 }, { "epoch": 1.3773987206823028, "grad_norm": 0.6514488453949313, "learning_rate": 7.272761487500219e-05, "loss": 0.389, "step": 1292 }, { "epoch": 1.3784648187633262, "grad_norm": 0.6004752538724129, "learning_rate": 7.271048900984473e-05, "loss": 0.3979, "step": 1293 }, { "epoch": 1.3795309168443497, "grad_norm": 0.5286751186792054, "learning_rate": 7.269334502477636e-05, "loss": 0.3975, "step": 1294 }, { "epoch": 1.3805970149253732, "grad_norm": 0.3782372078245128, "learning_rate": 7.267618292929398e-05, "loss": 0.3965, "step": 1295 }, { "epoch": 1.3816631130063965, "grad_norm": 0.4478939359799176, "learning_rate": 7.265900273290448e-05, "loss": 0.3882, "step": 1296 }, { "epoch": 1.38272921108742, "grad_norm": 0.40504400102254523, "learning_rate": 7.264180444512481e-05, "loss": 0.3979, "step": 1297 }, { "epoch": 1.3837953091684434, "grad_norm": 0.38764841393473026, "learning_rate": 7.262458807548191e-05, "loss": 0.3963, "step": 1298 }, { "epoch": 1.384861407249467, "grad_norm": 0.3656453484587496, "learning_rate": 7.260735363351278e-05, "loss": 0.3938, "step": 1299 }, { "epoch": 1.3859275053304905, "grad_norm": 0.48752680564499556, "learning_rate": 7.259010112876437e-05, "loss": 0.3929, "step": 1300 }, { "epoch": 1.3869936034115138, "grad_norm": 0.5660988157671498, "learning_rate": 7.257283057079371e-05, "loss": 0.3965, "step": 1301 }, { "epoch": 1.3880597014925373, "grad_norm": 0.5415116199650241, "learning_rate": 7.255554196916777e-05, "loss": 0.3969, "step": 1302 }, { "epoch": 1.3891257995735606, "grad_norm": 0.48579681088844257, "learning_rate": 7.253823533346353e-05, "loss": 0.3963, "step": 1303 }, { "epoch": 1.3901918976545842, "grad_norm": 0.4521402830070269, "learning_rate": 7.2520910673268e-05, "loss": 0.3889, "step": 1304 }, { "epoch": 1.3912579957356077, "grad_norm": 0.3868095009917062, "learning_rate": 7.250356799817811e-05, "loss": 0.3847, "step": 1305 }, { "epoch": 1.3923240938166312, "grad_norm": 0.3772982157721458, "learning_rate": 7.24862073178008e-05, "loss": 0.3898, "step": 1306 }, { "epoch": 1.3933901918976546, "grad_norm": 0.3776649103582633, "learning_rate": 7.2468828641753e-05, "loss": 0.3917, "step": 1307 }, { "epoch": 1.394456289978678, "grad_norm": 0.3824919655119669, "learning_rate": 7.245143197966158e-05, "loss": 0.3938, "step": 1308 }, { "epoch": 1.3955223880597014, "grad_norm": 0.5015303372462967, "learning_rate": 7.243401734116341e-05, "loss": 0.3896, "step": 1309 }, { "epoch": 1.396588486140725, "grad_norm": 0.594587592040505, "learning_rate": 7.241658473590526e-05, "loss": 0.3927, "step": 1310 }, { "epoch": 1.3976545842217485, "grad_norm": 0.560051836440119, "learning_rate": 7.239913417354393e-05, "loss": 0.3956, "step": 1311 }, { "epoch": 1.3987206823027718, "grad_norm": 0.4940699663851546, "learning_rate": 7.238166566374607e-05, "loss": 0.3976, "step": 1312 }, { "epoch": 1.3997867803837953, "grad_norm": 0.4683362628695071, "learning_rate": 7.236417921618839e-05, "loss": 0.3934, "step": 1313 }, { "epoch": 1.4008528784648187, "grad_norm": 0.46229731243535377, "learning_rate": 7.234667484055742e-05, "loss": 0.3897, "step": 1314 }, { "epoch": 1.4019189765458422, "grad_norm": 0.4363083782167336, "learning_rate": 7.232915254654968e-05, "loss": 0.3925, "step": 1315 }, { "epoch": 1.4029850746268657, "grad_norm": 0.4257678057099821, "learning_rate": 7.231161234387165e-05, "loss": 0.3926, "step": 1316 }, { "epoch": 1.4040511727078893, "grad_norm": 0.4846979047099743, "learning_rate": 7.229405424223967e-05, "loss": 0.3913, "step": 1317 }, { "epoch": 1.4051172707889126, "grad_norm": 0.5325656515388444, "learning_rate": 7.227647825137998e-05, "loss": 0.3922, "step": 1318 }, { "epoch": 1.4061833688699361, "grad_norm": 0.544343816677594, "learning_rate": 7.225888438102882e-05, "loss": 0.3888, "step": 1319 }, { "epoch": 1.4072494669509594, "grad_norm": 0.5779022828781076, "learning_rate": 7.224127264093225e-05, "loss": 0.4001, "step": 1320 }, { "epoch": 1.408315565031983, "grad_norm": 0.7411524315870592, "learning_rate": 7.222364304084627e-05, "loss": 0.3919, "step": 1321 }, { "epoch": 1.4093816631130065, "grad_norm": 0.9208203333396868, "learning_rate": 7.220599559053676e-05, "loss": 0.3985, "step": 1322 }, { "epoch": 1.4104477611940298, "grad_norm": 0.9562178280419171, "learning_rate": 7.218833029977948e-05, "loss": 0.4012, "step": 1323 }, { "epoch": 1.4115138592750534, "grad_norm": 0.8517901376976156, "learning_rate": 7.217064717836009e-05, "loss": 0.3941, "step": 1324 }, { "epoch": 1.4125799573560767, "grad_norm": 0.6475331249527876, "learning_rate": 7.215294623607414e-05, "loss": 0.3952, "step": 1325 }, { "epoch": 1.4136460554371002, "grad_norm": 0.4701406210795358, "learning_rate": 7.213522748272699e-05, "loss": 0.3912, "step": 1326 }, { "epoch": 1.4147121535181237, "grad_norm": 0.519797694341973, "learning_rate": 7.211749092813395e-05, "loss": 0.3941, "step": 1327 }, { "epoch": 1.415778251599147, "grad_norm": 0.5546483656534481, "learning_rate": 7.20997365821201e-05, "loss": 0.39, "step": 1328 }, { "epoch": 1.4168443496801706, "grad_norm": 0.5322694569447141, "learning_rate": 7.208196445452048e-05, "loss": 0.3933, "step": 1329 }, { "epoch": 1.417910447761194, "grad_norm": 0.5615390005781126, "learning_rate": 7.20641745551799e-05, "loss": 0.389, "step": 1330 }, { "epoch": 1.4189765458422174, "grad_norm": 0.5359170050084836, "learning_rate": 7.204636689395304e-05, "loss": 0.3915, "step": 1331 }, { "epoch": 1.420042643923241, "grad_norm": 0.46419011837159624, "learning_rate": 7.202854148070443e-05, "loss": 0.3879, "step": 1332 }, { "epoch": 1.4211087420042645, "grad_norm": 0.40341695125405613, "learning_rate": 7.201069832530838e-05, "loss": 0.3926, "step": 1333 }, { "epoch": 1.4221748400852878, "grad_norm": 0.37929724565888073, "learning_rate": 7.199283743764913e-05, "loss": 0.3876, "step": 1334 }, { "epoch": 1.4232409381663114, "grad_norm": 0.3708060382902423, "learning_rate": 7.197495882762065e-05, "loss": 0.394, "step": 1335 }, { "epoch": 1.4243070362473347, "grad_norm": 0.42952036732595084, "learning_rate": 7.195706250512676e-05, "loss": 0.3911, "step": 1336 }, { "epoch": 1.4253731343283582, "grad_norm": 0.3633415202092657, "learning_rate": 7.19391484800811e-05, "loss": 0.3881, "step": 1337 }, { "epoch": 1.4264392324093818, "grad_norm": 0.3233744886500194, "learning_rate": 7.192121676240713e-05, "loss": 0.3969, "step": 1338 }, { "epoch": 1.427505330490405, "grad_norm": 0.35414304904628113, "learning_rate": 7.190326736203805e-05, "loss": 0.3968, "step": 1339 }, { "epoch": 1.4285714285714286, "grad_norm": 0.36418321869973685, "learning_rate": 7.188530028891691e-05, "loss": 0.3932, "step": 1340 }, { "epoch": 1.429637526652452, "grad_norm": 0.32413601902229694, "learning_rate": 7.186731555299654e-05, "loss": 0.3936, "step": 1341 }, { "epoch": 1.4307036247334755, "grad_norm": 0.3875245106191369, "learning_rate": 7.184931316423955e-05, "loss": 0.3956, "step": 1342 }, { "epoch": 1.431769722814499, "grad_norm": 0.40455914476109633, "learning_rate": 7.183129313261833e-05, "loss": 0.393, "step": 1343 }, { "epoch": 1.4328358208955223, "grad_norm": 0.4415785397804514, "learning_rate": 7.181325546811503e-05, "loss": 0.3961, "step": 1344 }, { "epoch": 1.4339019189765458, "grad_norm": 0.5500841020492822, "learning_rate": 7.179520018072158e-05, "loss": 0.3954, "step": 1345 }, { "epoch": 1.4349680170575694, "grad_norm": 0.6263260139567476, "learning_rate": 7.177712728043967e-05, "loss": 0.3981, "step": 1346 }, { "epoch": 1.4360341151385927, "grad_norm": 0.7401499402434726, "learning_rate": 7.175903677728077e-05, "loss": 0.3974, "step": 1347 }, { "epoch": 1.4371002132196162, "grad_norm": 0.8726338431226237, "learning_rate": 7.174092868126604e-05, "loss": 0.3871, "step": 1348 }, { "epoch": 1.4381663113006398, "grad_norm": 0.8511811546689546, "learning_rate": 7.172280300242648e-05, "loss": 0.3935, "step": 1349 }, { "epoch": 1.439232409381663, "grad_norm": 0.8021327414666732, "learning_rate": 7.17046597508027e-05, "loss": 0.3989, "step": 1350 }, { "epoch": 1.4402985074626866, "grad_norm": 0.7465635016649216, "learning_rate": 7.168649893644517e-05, "loss": 0.3911, "step": 1351 }, { "epoch": 1.44136460554371, "grad_norm": 0.722112156696109, "learning_rate": 7.166832056941405e-05, "loss": 0.4008, "step": 1352 }, { "epoch": 1.4424307036247335, "grad_norm": 0.6928780087951198, "learning_rate": 7.165012465977916e-05, "loss": 0.3944, "step": 1353 }, { "epoch": 1.443496801705757, "grad_norm": 0.5324488681456964, "learning_rate": 7.163191121762012e-05, "loss": 0.3923, "step": 1354 }, { "epoch": 1.4445628997867803, "grad_norm": 0.37600301998098723, "learning_rate": 7.161368025302622e-05, "loss": 0.3931, "step": 1355 }, { "epoch": 1.4456289978678039, "grad_norm": 0.43884431846324246, "learning_rate": 7.159543177609648e-05, "loss": 0.4019, "step": 1356 }, { "epoch": 1.4466950959488272, "grad_norm": 0.4943768560008293, "learning_rate": 7.15771657969396e-05, "loss": 0.3918, "step": 1357 }, { "epoch": 1.4477611940298507, "grad_norm": 0.5078112364255435, "learning_rate": 7.155888232567396e-05, "loss": 0.3931, "step": 1358 }, { "epoch": 1.4488272921108742, "grad_norm": 0.5262670681715143, "learning_rate": 7.15405813724277e-05, "loss": 0.3901, "step": 1359 }, { "epoch": 1.4498933901918978, "grad_norm": 0.44931799250014814, "learning_rate": 7.152226294733857e-05, "loss": 0.3928, "step": 1360 }, { "epoch": 1.450959488272921, "grad_norm": 0.3766185113949834, "learning_rate": 7.150392706055401e-05, "loss": 0.3888, "step": 1361 }, { "epoch": 1.4520255863539446, "grad_norm": 0.31545132670855125, "learning_rate": 7.148557372223118e-05, "loss": 0.3917, "step": 1362 }, { "epoch": 1.453091684434968, "grad_norm": 0.4218667315766686, "learning_rate": 7.146720294253687e-05, "loss": 0.3958, "step": 1363 }, { "epoch": 1.4541577825159915, "grad_norm": 0.5982279310643812, "learning_rate": 7.14488147316475e-05, "loss": 0.3923, "step": 1364 }, { "epoch": 1.455223880597015, "grad_norm": 0.6848592048351014, "learning_rate": 7.143040909974923e-05, "loss": 0.3984, "step": 1365 }, { "epoch": 1.4562899786780383, "grad_norm": 0.6819349249907524, "learning_rate": 7.14119860570378e-05, "loss": 0.3939, "step": 1366 }, { "epoch": 1.4573560767590619, "grad_norm": 0.6440204606115185, "learning_rate": 7.139354561371863e-05, "loss": 0.3997, "step": 1367 }, { "epoch": 1.4584221748400852, "grad_norm": 0.630295208498841, "learning_rate": 7.137508778000676e-05, "loss": 0.3948, "step": 1368 }, { "epoch": 1.4594882729211087, "grad_norm": 0.5314317110342841, "learning_rate": 7.135661256612688e-05, "loss": 0.3889, "step": 1369 }, { "epoch": 1.4605543710021323, "grad_norm": 0.34981486673823153, "learning_rate": 7.133811998231327e-05, "loss": 0.3935, "step": 1370 }, { "epoch": 1.4616204690831556, "grad_norm": 0.35709223587817546, "learning_rate": 7.131961003880989e-05, "loss": 0.3934, "step": 1371 }, { "epoch": 1.462686567164179, "grad_norm": 0.45371546795069073, "learning_rate": 7.130108274587027e-05, "loss": 0.3968, "step": 1372 }, { "epoch": 1.4637526652452024, "grad_norm": 0.5253418087756354, "learning_rate": 7.128253811375759e-05, "loss": 0.394, "step": 1373 }, { "epoch": 1.464818763326226, "grad_norm": 0.4991498770169075, "learning_rate": 7.126397615274459e-05, "loss": 0.3846, "step": 1374 }, { "epoch": 1.4658848614072495, "grad_norm": 0.49528918882508394, "learning_rate": 7.124539687311362e-05, "loss": 0.3945, "step": 1375 }, { "epoch": 1.466950959488273, "grad_norm": 0.5068485592396924, "learning_rate": 7.122680028515668e-05, "loss": 0.3959, "step": 1376 }, { "epoch": 1.4680170575692963, "grad_norm": 0.5465623764731636, "learning_rate": 7.120818639917527e-05, "loss": 0.3983, "step": 1377 }, { "epoch": 1.4690831556503199, "grad_norm": 0.6180418289803238, "learning_rate": 7.118955522548053e-05, "loss": 0.3906, "step": 1378 }, { "epoch": 1.4701492537313432, "grad_norm": 0.6374392706839199, "learning_rate": 7.117090677439317e-05, "loss": 0.4016, "step": 1379 }, { "epoch": 1.4712153518123667, "grad_norm": 0.5780281794791482, "learning_rate": 7.115224105624346e-05, "loss": 0.3891, "step": 1380 }, { "epoch": 1.4722814498933903, "grad_norm": 0.5215481763118731, "learning_rate": 7.113355808137122e-05, "loss": 0.3937, "step": 1381 }, { "epoch": 1.4733475479744136, "grad_norm": 0.5508793693931563, "learning_rate": 7.111485786012588e-05, "loss": 0.3993, "step": 1382 }, { "epoch": 1.4744136460554371, "grad_norm": 0.6235158688200941, "learning_rate": 7.109614040286636e-05, "loss": 0.3909, "step": 1383 }, { "epoch": 1.4754797441364604, "grad_norm": 0.6200848574811243, "learning_rate": 7.107740571996118e-05, "loss": 0.4002, "step": 1384 }, { "epoch": 1.476545842217484, "grad_norm": 0.5904012350207049, "learning_rate": 7.105865382178836e-05, "loss": 0.3934, "step": 1385 }, { "epoch": 1.4776119402985075, "grad_norm": 0.5289964058379455, "learning_rate": 7.10398847187355e-05, "loss": 0.3914, "step": 1386 }, { "epoch": 1.478678038379531, "grad_norm": 0.443464050960444, "learning_rate": 7.10210984211997e-05, "loss": 0.392, "step": 1387 }, { "epoch": 1.4797441364605544, "grad_norm": 0.4427955685061585, "learning_rate": 7.100229493958757e-05, "loss": 0.3969, "step": 1388 }, { "epoch": 1.480810234541578, "grad_norm": 0.46393678647424846, "learning_rate": 7.09834742843153e-05, "loss": 0.3934, "step": 1389 }, { "epoch": 1.4818763326226012, "grad_norm": 0.39212911960500413, "learning_rate": 7.096463646580853e-05, "loss": 0.3969, "step": 1390 }, { "epoch": 1.4829424307036247, "grad_norm": 0.31518826807678046, "learning_rate": 7.094578149450243e-05, "loss": 0.3965, "step": 1391 }, { "epoch": 1.4840085287846483, "grad_norm": 0.35137446772893605, "learning_rate": 7.092690938084168e-05, "loss": 0.394, "step": 1392 }, { "epoch": 1.4850746268656716, "grad_norm": 0.43189084089863594, "learning_rate": 7.090802013528047e-05, "loss": 0.3907, "step": 1393 }, { "epoch": 1.4861407249466951, "grad_norm": 0.4502661111740289, "learning_rate": 7.088911376828241e-05, "loss": 0.3946, "step": 1394 }, { "epoch": 1.4872068230277184, "grad_norm": 0.44527023350907197, "learning_rate": 7.087019029032071e-05, "loss": 0.3977, "step": 1395 }, { "epoch": 1.488272921108742, "grad_norm": 0.45794515563438, "learning_rate": 7.085124971187794e-05, "loss": 0.3974, "step": 1396 }, { "epoch": 1.4893390191897655, "grad_norm": 0.46227336496153126, "learning_rate": 7.083229204344623e-05, "loss": 0.3861, "step": 1397 }, { "epoch": 1.4904051172707888, "grad_norm": 0.4503424372012637, "learning_rate": 7.081331729552712e-05, "loss": 0.3951, "step": 1398 }, { "epoch": 1.4914712153518124, "grad_norm": 0.4826116426207179, "learning_rate": 7.079432547863164e-05, "loss": 0.3883, "step": 1399 }, { "epoch": 1.4925373134328357, "grad_norm": 0.526692112887762, "learning_rate": 7.077531660328028e-05, "loss": 0.3908, "step": 1400 }, { "epoch": 1.4936034115138592, "grad_norm": 0.4637243917340167, "learning_rate": 7.075629068000297e-05, "loss": 0.3939, "step": 1401 }, { "epoch": 1.4946695095948828, "grad_norm": 0.4348762733252528, "learning_rate": 7.073724771933906e-05, "loss": 0.3859, "step": 1402 }, { "epoch": 1.4957356076759063, "grad_norm": 0.4277816665773673, "learning_rate": 7.071818773183738e-05, "loss": 0.3969, "step": 1403 }, { "epoch": 1.4968017057569296, "grad_norm": 0.4767523309718055, "learning_rate": 7.069911072805618e-05, "loss": 0.3918, "step": 1404 }, { "epoch": 1.4978678038379531, "grad_norm": 0.4905037191035888, "learning_rate": 7.068001671856309e-05, "loss": 0.3906, "step": 1405 }, { "epoch": 1.4989339019189765, "grad_norm": 0.4362857431277552, "learning_rate": 7.066090571393524e-05, "loss": 0.3909, "step": 1406 }, { "epoch": 1.5, "grad_norm": 0.3290996660217568, "learning_rate": 7.064177772475912e-05, "loss": 0.3976, "step": 1407 }, { "epoch": 1.5010660980810235, "grad_norm": 0.3744727365069323, "learning_rate": 7.062263276163064e-05, "loss": 0.3962, "step": 1408 }, { "epoch": 1.502132196162047, "grad_norm": 0.5302924684211919, "learning_rate": 7.060347083515511e-05, "loss": 0.3847, "step": 1409 }, { "epoch": 1.5031982942430704, "grad_norm": 0.636023047266614, "learning_rate": 7.058429195594727e-05, "loss": 0.3952, "step": 1410 }, { "epoch": 1.5042643923240937, "grad_norm": 0.5944045338529551, "learning_rate": 7.056509613463118e-05, "loss": 0.3901, "step": 1411 }, { "epoch": 1.5053304904051172, "grad_norm": 0.4426101595482755, "learning_rate": 7.054588338184034e-05, "loss": 0.3949, "step": 1412 }, { "epoch": 1.5063965884861408, "grad_norm": 0.3582450742335668, "learning_rate": 7.052665370821764e-05, "loss": 0.3882, "step": 1413 }, { "epoch": 1.5074626865671643, "grad_norm": 0.3954777880467866, "learning_rate": 7.050740712441528e-05, "loss": 0.3972, "step": 1414 }, { "epoch": 1.5085287846481876, "grad_norm": 0.4416647029712508, "learning_rate": 7.04881436410949e-05, "loss": 0.39, "step": 1415 }, { "epoch": 1.509594882729211, "grad_norm": 0.44707077361093805, "learning_rate": 7.046886326892747e-05, "loss": 0.3855, "step": 1416 }, { "epoch": 1.5106609808102345, "grad_norm": 0.40402387918207455, "learning_rate": 7.044956601859329e-05, "loss": 0.3928, "step": 1417 }, { "epoch": 1.511727078891258, "grad_norm": 0.31989191868470707, "learning_rate": 7.043025190078205e-05, "loss": 0.3919, "step": 1418 }, { "epoch": 1.5127931769722816, "grad_norm": 0.2810822260181101, "learning_rate": 7.041092092619277e-05, "loss": 0.3852, "step": 1419 }, { "epoch": 1.5138592750533049, "grad_norm": 0.38427721507681256, "learning_rate": 7.039157310553378e-05, "loss": 0.3975, "step": 1420 }, { "epoch": 1.5149253731343284, "grad_norm": 0.43787056804713514, "learning_rate": 7.03722084495228e-05, "loss": 0.3986, "step": 1421 }, { "epoch": 1.5159914712153517, "grad_norm": 0.39271707463092426, "learning_rate": 7.035282696888684e-05, "loss": 0.393, "step": 1422 }, { "epoch": 1.5170575692963753, "grad_norm": 0.41954871590765713, "learning_rate": 7.033342867436221e-05, "loss": 0.3979, "step": 1423 }, { "epoch": 1.5181236673773988, "grad_norm": 0.488489761777447, "learning_rate": 7.031401357669456e-05, "loss": 0.3881, "step": 1424 }, { "epoch": 1.5191897654584223, "grad_norm": 0.5515408415736078, "learning_rate": 7.029458168663887e-05, "loss": 0.3862, "step": 1425 }, { "epoch": 1.5202558635394456, "grad_norm": 0.6316116354397406, "learning_rate": 7.027513301495937e-05, "loss": 0.3889, "step": 1426 }, { "epoch": 1.521321961620469, "grad_norm": 0.6866508216670858, "learning_rate": 7.025566757242962e-05, "loss": 0.3927, "step": 1427 }, { "epoch": 1.5223880597014925, "grad_norm": 0.6861740365624607, "learning_rate": 7.023618536983249e-05, "loss": 0.3976, "step": 1428 }, { "epoch": 1.523454157782516, "grad_norm": 0.6832689017816802, "learning_rate": 7.021668641796008e-05, "loss": 0.3935, "step": 1429 }, { "epoch": 1.5245202558635396, "grad_norm": 0.678454190085777, "learning_rate": 7.019717072761377e-05, "loss": 0.3879, "step": 1430 }, { "epoch": 1.5255863539445629, "grad_norm": 0.6514965485012345, "learning_rate": 7.01776383096043e-05, "loss": 0.3917, "step": 1431 }, { "epoch": 1.5266524520255862, "grad_norm": 0.5811169015448752, "learning_rate": 7.01580891747516e-05, "loss": 0.3942, "step": 1432 }, { "epoch": 1.5277185501066097, "grad_norm": 0.569097620467461, "learning_rate": 7.013852333388483e-05, "loss": 0.3955, "step": 1433 }, { "epoch": 1.5287846481876333, "grad_norm": 0.4808827995992214, "learning_rate": 7.011894079784248e-05, "loss": 0.3908, "step": 1434 }, { "epoch": 1.5298507462686568, "grad_norm": 0.3656185727390282, "learning_rate": 7.009934157747227e-05, "loss": 0.3946, "step": 1435 }, { "epoch": 1.5309168443496801, "grad_norm": 0.5145348764394406, "learning_rate": 7.007972568363112e-05, "loss": 0.3882, "step": 1436 }, { "epoch": 1.5319829424307037, "grad_norm": 0.6632114181696835, "learning_rate": 7.006009312718525e-05, "loss": 0.3912, "step": 1437 }, { "epoch": 1.533049040511727, "grad_norm": 0.6714409788868576, "learning_rate": 7.004044391901005e-05, "loss": 0.3946, "step": 1438 }, { "epoch": 1.5341151385927505, "grad_norm": 0.5938831067807631, "learning_rate": 7.002077806999016e-05, "loss": 0.3911, "step": 1439 }, { "epoch": 1.535181236673774, "grad_norm": 0.5658150891169751, "learning_rate": 7.000109559101944e-05, "loss": 0.39, "step": 1440 }, { "epoch": 1.5362473347547976, "grad_norm": 0.5719079089059944, "learning_rate": 6.998139649300097e-05, "loss": 0.3963, "step": 1441 }, { "epoch": 1.537313432835821, "grad_norm": 0.4711947234722605, "learning_rate": 6.996168078684702e-05, "loss": 0.3909, "step": 1442 }, { "epoch": 1.5383795309168442, "grad_norm": 0.4358846322039788, "learning_rate": 6.994194848347908e-05, "loss": 0.3933, "step": 1443 }, { "epoch": 1.5394456289978677, "grad_norm": 0.6131004479680616, "learning_rate": 6.99221995938278e-05, "loss": 0.3995, "step": 1444 }, { "epoch": 1.5405117270788913, "grad_norm": 0.6962404795880364, "learning_rate": 6.990243412883304e-05, "loss": 0.3925, "step": 1445 }, { "epoch": 1.5415778251599148, "grad_norm": 0.6694951018990504, "learning_rate": 6.988265209944387e-05, "loss": 0.3892, "step": 1446 }, { "epoch": 1.5426439232409381, "grad_norm": 0.5156094912368259, "learning_rate": 6.986285351661847e-05, "loss": 0.3892, "step": 1447 }, { "epoch": 1.5437100213219617, "grad_norm": 0.41423025778526606, "learning_rate": 6.984303839132425e-05, "loss": 0.3964, "step": 1448 }, { "epoch": 1.544776119402985, "grad_norm": 0.3696131621291919, "learning_rate": 6.982320673453773e-05, "loss": 0.3905, "step": 1449 }, { "epoch": 1.5458422174840085, "grad_norm": 0.37566704365434295, "learning_rate": 6.980335855724465e-05, "loss": 0.3898, "step": 1450 }, { "epoch": 1.546908315565032, "grad_norm": 0.44656110415033884, "learning_rate": 6.978349387043986e-05, "loss": 0.3954, "step": 1451 }, { "epoch": 1.5479744136460556, "grad_norm": 0.5097702199954142, "learning_rate": 6.976361268512735e-05, "loss": 0.3916, "step": 1452 }, { "epoch": 1.549040511727079, "grad_norm": 0.6115774145379382, "learning_rate": 6.974371501232027e-05, "loss": 0.4001, "step": 1453 }, { "epoch": 1.5501066098081022, "grad_norm": 0.6298592763126765, "learning_rate": 6.97238008630409e-05, "loss": 0.3913, "step": 1454 }, { "epoch": 1.5511727078891258, "grad_norm": 0.5640252014215751, "learning_rate": 6.970387024832066e-05, "loss": 0.3972, "step": 1455 }, { "epoch": 1.5522388059701493, "grad_norm": 0.5637434287286831, "learning_rate": 6.968392317920005e-05, "loss": 0.3914, "step": 1456 }, { "epoch": 1.5533049040511728, "grad_norm": 0.5758115244979767, "learning_rate": 6.96639596667287e-05, "loss": 0.3888, "step": 1457 }, { "epoch": 1.5543710021321961, "grad_norm": 0.579669625426786, "learning_rate": 6.96439797219654e-05, "loss": 0.3964, "step": 1458 }, { "epoch": 1.5554371002132195, "grad_norm": 0.5774000389217503, "learning_rate": 6.962398335597798e-05, "loss": 0.391, "step": 1459 }, { "epoch": 1.556503198294243, "grad_norm": 0.5665645013725628, "learning_rate": 6.960397057984336e-05, "loss": 0.3925, "step": 1460 }, { "epoch": 1.5575692963752665, "grad_norm": 0.5243134790028094, "learning_rate": 6.958394140464761e-05, "loss": 0.3931, "step": 1461 }, { "epoch": 1.55863539445629, "grad_norm": 0.5072252071927075, "learning_rate": 6.956389584148586e-05, "loss": 0.3874, "step": 1462 }, { "epoch": 1.5597014925373134, "grad_norm": 0.42649762130370344, "learning_rate": 6.954383390146228e-05, "loss": 0.3902, "step": 1463 }, { "epoch": 1.560767590618337, "grad_norm": 0.33264595497577854, "learning_rate": 6.952375559569016e-05, "loss": 0.3863, "step": 1464 }, { "epoch": 1.5618336886993602, "grad_norm": 0.3759252125909897, "learning_rate": 6.950366093529184e-05, "loss": 0.393, "step": 1465 }, { "epoch": 1.5628997867803838, "grad_norm": 0.4265770921364556, "learning_rate": 6.94835499313987e-05, "loss": 0.3852, "step": 1466 }, { "epoch": 1.5639658848614073, "grad_norm": 0.4579903850772894, "learning_rate": 6.946342259515122e-05, "loss": 0.3911, "step": 1467 }, { "epoch": 1.5650319829424308, "grad_norm": 0.5479175031792544, "learning_rate": 6.944327893769887e-05, "loss": 0.3962, "step": 1468 }, { "epoch": 1.5660980810234542, "grad_norm": 0.6055176521095669, "learning_rate": 6.94231189702002e-05, "loss": 0.3864, "step": 1469 }, { "epoch": 1.5671641791044775, "grad_norm": 0.6727380474108113, "learning_rate": 6.940294270382278e-05, "loss": 0.3921, "step": 1470 }, { "epoch": 1.568230277185501, "grad_norm": 0.6356621826960686, "learning_rate": 6.938275014974323e-05, "loss": 0.3947, "step": 1471 }, { "epoch": 1.5692963752665245, "grad_norm": 0.5243747711823524, "learning_rate": 6.936254131914717e-05, "loss": 0.3959, "step": 1472 }, { "epoch": 1.570362473347548, "grad_norm": 0.46811482341260074, "learning_rate": 6.934231622322923e-05, "loss": 0.3904, "step": 1473 }, { "epoch": 1.5714285714285714, "grad_norm": 0.3465924585456931, "learning_rate": 6.932207487319305e-05, "loss": 0.393, "step": 1474 }, { "epoch": 1.572494669509595, "grad_norm": 0.3025626410259389, "learning_rate": 6.930181728025133e-05, "loss": 0.3925, "step": 1475 }, { "epoch": 1.5735607675906182, "grad_norm": 0.391223961099086, "learning_rate": 6.928154345562569e-05, "loss": 0.39, "step": 1476 }, { "epoch": 1.5746268656716418, "grad_norm": 0.41487388189624425, "learning_rate": 6.926125341054676e-05, "loss": 0.3924, "step": 1477 }, { "epoch": 1.5756929637526653, "grad_norm": 0.45859115746381557, "learning_rate": 6.92409471562542e-05, "loss": 0.3948, "step": 1478 }, { "epoch": 1.5767590618336889, "grad_norm": 0.5136700870287039, "learning_rate": 6.922062470399663e-05, "loss": 0.3861, "step": 1479 }, { "epoch": 1.5778251599147122, "grad_norm": 0.5017752722816414, "learning_rate": 6.920028606503161e-05, "loss": 0.3912, "step": 1480 }, { "epoch": 1.5788912579957355, "grad_norm": 0.46815074536424645, "learning_rate": 6.91799312506257e-05, "loss": 0.3923, "step": 1481 }, { "epoch": 1.579957356076759, "grad_norm": 0.4683376068962054, "learning_rate": 6.915956027205438e-05, "loss": 0.3891, "step": 1482 }, { "epoch": 1.5810234541577826, "grad_norm": 0.3838678721557934, "learning_rate": 6.913917314060215e-05, "loss": 0.3884, "step": 1483 }, { "epoch": 1.582089552238806, "grad_norm": 0.28512465563571143, "learning_rate": 6.911876986756241e-05, "loss": 0.3946, "step": 1484 }, { "epoch": 1.5831556503198294, "grad_norm": 0.2798136165240047, "learning_rate": 6.90983504642375e-05, "loss": 0.3925, "step": 1485 }, { "epoch": 1.5842217484008527, "grad_norm": 0.32595231025368043, "learning_rate": 6.907791494193873e-05, "loss": 0.3933, "step": 1486 }, { "epoch": 1.5852878464818763, "grad_norm": 0.33263930837599737, "learning_rate": 6.905746331198631e-05, "loss": 0.3882, "step": 1487 }, { "epoch": 1.5863539445628998, "grad_norm": 0.3001000433354499, "learning_rate": 6.903699558570935e-05, "loss": 0.3944, "step": 1488 }, { "epoch": 1.5874200426439233, "grad_norm": 0.33691283404723554, "learning_rate": 6.901651177444596e-05, "loss": 0.3901, "step": 1489 }, { "epoch": 1.5884861407249466, "grad_norm": 0.36842928704561506, "learning_rate": 6.899601188954306e-05, "loss": 0.3891, "step": 1490 }, { "epoch": 1.5895522388059702, "grad_norm": 0.37394858126001956, "learning_rate": 6.897549594235654e-05, "loss": 0.3964, "step": 1491 }, { "epoch": 1.5906183368869935, "grad_norm": 0.3288296151355204, "learning_rate": 6.895496394425118e-05, "loss": 0.3894, "step": 1492 }, { "epoch": 1.591684434968017, "grad_norm": 0.33186383529359254, "learning_rate": 6.893441590660064e-05, "loss": 0.3964, "step": 1493 }, { "epoch": 1.5927505330490406, "grad_norm": 0.32170536578140646, "learning_rate": 6.891385184078744e-05, "loss": 0.3892, "step": 1494 }, { "epoch": 1.593816631130064, "grad_norm": 0.34823720165895466, "learning_rate": 6.889327175820302e-05, "loss": 0.3912, "step": 1495 }, { "epoch": 1.5948827292110874, "grad_norm": 0.36827171478815857, "learning_rate": 6.887267567024767e-05, "loss": 0.3877, "step": 1496 }, { "epoch": 1.5959488272921107, "grad_norm": 0.324329055251669, "learning_rate": 6.885206358833056e-05, "loss": 0.3907, "step": 1497 }, { "epoch": 1.5970149253731343, "grad_norm": 0.3724585692258156, "learning_rate": 6.883143552386971e-05, "loss": 0.3882, "step": 1498 }, { "epoch": 1.5980810234541578, "grad_norm": 0.3726849206910898, "learning_rate": 6.881079148829198e-05, "loss": 0.3905, "step": 1499 }, { "epoch": 1.5991471215351813, "grad_norm": 0.37495710080763744, "learning_rate": 6.879013149303312e-05, "loss": 0.3935, "step": 1500 }, { "epoch": 1.6002132196162047, "grad_norm": 0.5021479834874241, "learning_rate": 6.876945554953767e-05, "loss": 0.3925, "step": 1501 }, { "epoch": 1.6012793176972282, "grad_norm": 0.6598112081849651, "learning_rate": 6.874876366925904e-05, "loss": 0.3917, "step": 1502 }, { "epoch": 1.6023454157782515, "grad_norm": 0.7759163516332811, "learning_rate": 6.872805586365947e-05, "loss": 0.3885, "step": 1503 }, { "epoch": 1.603411513859275, "grad_norm": 0.8164276907570536, "learning_rate": 6.870733214420998e-05, "loss": 0.3928, "step": 1504 }, { "epoch": 1.6044776119402986, "grad_norm": 0.8744103848461371, "learning_rate": 6.868659252239045e-05, "loss": 0.394, "step": 1505 }, { "epoch": 1.6055437100213221, "grad_norm": 0.8615414040135513, "learning_rate": 6.866583700968954e-05, "loss": 0.3838, "step": 1506 }, { "epoch": 1.6066098081023454, "grad_norm": 0.7600225280740014, "learning_rate": 6.864506561760474e-05, "loss": 0.3963, "step": 1507 }, { "epoch": 1.6076759061833688, "grad_norm": 0.5613832665703143, "learning_rate": 6.862427835764231e-05, "loss": 0.3995, "step": 1508 }, { "epoch": 1.6087420042643923, "grad_norm": 0.3881251295902559, "learning_rate": 6.860347524131733e-05, "loss": 0.3852, "step": 1509 }, { "epoch": 1.6098081023454158, "grad_norm": 0.3100627174569291, "learning_rate": 6.858265628015362e-05, "loss": 0.3914, "step": 1510 }, { "epoch": 1.6108742004264394, "grad_norm": 0.3947566702558035, "learning_rate": 6.856182148568382e-05, "loss": 0.3961, "step": 1511 }, { "epoch": 1.6119402985074627, "grad_norm": 0.49178390954875206, "learning_rate": 6.854097086944932e-05, "loss": 0.3941, "step": 1512 }, { "epoch": 1.613006396588486, "grad_norm": 0.5414001966948181, "learning_rate": 6.852010444300028e-05, "loss": 0.3926, "step": 1513 }, { "epoch": 1.6140724946695095, "grad_norm": 0.49531130512745414, "learning_rate": 6.849922221789561e-05, "loss": 0.3915, "step": 1514 }, { "epoch": 1.615138592750533, "grad_norm": 0.41543603344069213, "learning_rate": 6.847832420570298e-05, "loss": 0.3883, "step": 1515 }, { "epoch": 1.6162046908315566, "grad_norm": 0.33242146656919136, "learning_rate": 6.84574104179988e-05, "loss": 0.3948, "step": 1516 }, { "epoch": 1.61727078891258, "grad_norm": 0.3892193202210671, "learning_rate": 6.843648086636822e-05, "loss": 0.3943, "step": 1517 }, { "epoch": 1.6183368869936035, "grad_norm": 0.5073950555317511, "learning_rate": 6.841553556240514e-05, "loss": 0.3899, "step": 1518 }, { "epoch": 1.6194029850746268, "grad_norm": 0.543311094919119, "learning_rate": 6.839457451771214e-05, "loss": 0.3915, "step": 1519 }, { "epoch": 1.6204690831556503, "grad_norm": 0.5283833705504892, "learning_rate": 6.837359774390058e-05, "loss": 0.3874, "step": 1520 }, { "epoch": 1.6215351812366738, "grad_norm": 0.5077032134792163, "learning_rate": 6.835260525259048e-05, "loss": 0.3904, "step": 1521 }, { "epoch": 1.6226012793176974, "grad_norm": 0.5106764897990909, "learning_rate": 6.83315970554106e-05, "loss": 0.3898, "step": 1522 }, { "epoch": 1.6236673773987207, "grad_norm": 0.5286416559892751, "learning_rate": 6.831057316399839e-05, "loss": 0.3886, "step": 1523 }, { "epoch": 1.624733475479744, "grad_norm": 0.409097737243627, "learning_rate": 6.828953358999998e-05, "loss": 0.3899, "step": 1524 }, { "epoch": 1.6257995735607675, "grad_norm": 0.3213269166849657, "learning_rate": 6.826847834507024e-05, "loss": 0.3892, "step": 1525 }, { "epoch": 1.626865671641791, "grad_norm": 0.3126601387070219, "learning_rate": 6.824740744087262e-05, "loss": 0.3861, "step": 1526 }, { "epoch": 1.6279317697228146, "grad_norm": 0.39308792247161223, "learning_rate": 6.822632088907937e-05, "loss": 0.3912, "step": 1527 }, { "epoch": 1.628997867803838, "grad_norm": 0.48501314817610786, "learning_rate": 6.820521870137129e-05, "loss": 0.3901, "step": 1528 }, { "epoch": 1.6300639658848612, "grad_norm": 0.5074473205522086, "learning_rate": 6.818410088943791e-05, "loss": 0.391, "step": 1529 }, { "epoch": 1.6311300639658848, "grad_norm": 0.5821772106338331, "learning_rate": 6.816296746497744e-05, "loss": 0.3868, "step": 1530 }, { "epoch": 1.6321961620469083, "grad_norm": 0.6719640615154807, "learning_rate": 6.814181843969664e-05, "loss": 0.3974, "step": 1531 }, { "epoch": 1.6332622601279319, "grad_norm": 0.599159712940155, "learning_rate": 6.812065382531101e-05, "loss": 0.391, "step": 1532 }, { "epoch": 1.6343283582089554, "grad_norm": 0.5406155214874598, "learning_rate": 6.809947363354464e-05, "loss": 0.3907, "step": 1533 }, { "epoch": 1.6353944562899787, "grad_norm": 0.5404737249091326, "learning_rate": 6.807827787613024e-05, "loss": 0.3933, "step": 1534 }, { "epoch": 1.636460554371002, "grad_norm": 0.5080707037084039, "learning_rate": 6.805706656480917e-05, "loss": 0.3802, "step": 1535 }, { "epoch": 1.6375266524520256, "grad_norm": 0.45278920241628184, "learning_rate": 6.803583971133139e-05, "loss": 0.3965, "step": 1536 }, { "epoch": 1.638592750533049, "grad_norm": 0.46161933055133486, "learning_rate": 6.801459732745547e-05, "loss": 0.4023, "step": 1537 }, { "epoch": 1.6396588486140726, "grad_norm": 0.5047213330551431, "learning_rate": 6.799333942494861e-05, "loss": 0.386, "step": 1538 }, { "epoch": 1.640724946695096, "grad_norm": 0.4450247054157199, "learning_rate": 6.797206601558654e-05, "loss": 0.3933, "step": 1539 }, { "epoch": 1.6417910447761193, "grad_norm": 0.3955698230219666, "learning_rate": 6.795077711115368e-05, "loss": 0.3941, "step": 1540 }, { "epoch": 1.6428571428571428, "grad_norm": 0.4975244131254161, "learning_rate": 6.792947272344292e-05, "loss": 0.3985, "step": 1541 }, { "epoch": 1.6439232409381663, "grad_norm": 0.4201752241177451, "learning_rate": 6.790815286425581e-05, "loss": 0.3899, "step": 1542 }, { "epoch": 1.6449893390191899, "grad_norm": 0.3070985227762259, "learning_rate": 6.788681754540245e-05, "loss": 0.3931, "step": 1543 }, { "epoch": 1.6460554371002132, "grad_norm": 0.40901085056872827, "learning_rate": 6.78654667787015e-05, "loss": 0.3911, "step": 1544 }, { "epoch": 1.6471215351812367, "grad_norm": 0.4407517158255465, "learning_rate": 6.784410057598016e-05, "loss": 0.3888, "step": 1545 }, { "epoch": 1.64818763326226, "grad_norm": 0.3680848288610597, "learning_rate": 6.782271894907419e-05, "loss": 0.3867, "step": 1546 }, { "epoch": 1.6492537313432836, "grad_norm": 0.38558182531740504, "learning_rate": 6.78013219098279e-05, "loss": 0.3873, "step": 1547 }, { "epoch": 1.650319829424307, "grad_norm": 0.43293933049711597, "learning_rate": 6.777990947009418e-05, "loss": 0.3876, "step": 1548 }, { "epoch": 1.6513859275053306, "grad_norm": 0.5130367741977939, "learning_rate": 6.775848164173436e-05, "loss": 0.3971, "step": 1549 }, { "epoch": 1.652452025586354, "grad_norm": 0.5953290932568412, "learning_rate": 6.773703843661837e-05, "loss": 0.3919, "step": 1550 }, { "epoch": 1.6535181236673773, "grad_norm": 0.603488942008407, "learning_rate": 6.771557986662462e-05, "loss": 0.39, "step": 1551 }, { "epoch": 1.6545842217484008, "grad_norm": 0.5050491129848195, "learning_rate": 6.769410594364004e-05, "loss": 0.3875, "step": 1552 }, { "epoch": 1.6556503198294243, "grad_norm": 0.4097987145826878, "learning_rate": 6.767261667956009e-05, "loss": 0.3923, "step": 1553 }, { "epoch": 1.6567164179104479, "grad_norm": 0.38259958335783656, "learning_rate": 6.765111208628866e-05, "loss": 0.394, "step": 1554 }, { "epoch": 1.6577825159914712, "grad_norm": 0.36285793649423737, "learning_rate": 6.762959217573823e-05, "loss": 0.3902, "step": 1555 }, { "epoch": 1.6588486140724945, "grad_norm": 0.38527741858086, "learning_rate": 6.760805695982967e-05, "loss": 0.3934, "step": 1556 }, { "epoch": 1.659914712153518, "grad_norm": 0.43180049398669657, "learning_rate": 6.75865064504924e-05, "loss": 0.3948, "step": 1557 }, { "epoch": 1.6609808102345416, "grad_norm": 0.4605936531276027, "learning_rate": 6.756494065966426e-05, "loss": 0.3853, "step": 1558 }, { "epoch": 1.6620469083155651, "grad_norm": 0.5637754400106197, "learning_rate": 6.754335959929159e-05, "loss": 0.3906, "step": 1559 }, { "epoch": 1.6631130063965884, "grad_norm": 0.6706100285105229, "learning_rate": 6.752176328132918e-05, "loss": 0.3896, "step": 1560 }, { "epoch": 1.664179104477612, "grad_norm": 0.6676286093604591, "learning_rate": 6.750015171774025e-05, "loss": 0.3916, "step": 1561 }, { "epoch": 1.6652452025586353, "grad_norm": 0.5819487463144472, "learning_rate": 6.747852492049648e-05, "loss": 0.3865, "step": 1562 }, { "epoch": 1.6663113006396588, "grad_norm": 0.4906481414544558, "learning_rate": 6.745688290157803e-05, "loss": 0.3901, "step": 1563 }, { "epoch": 1.6673773987206824, "grad_norm": 0.43576919495858, "learning_rate": 6.743522567297341e-05, "loss": 0.3946, "step": 1564 }, { "epoch": 1.668443496801706, "grad_norm": 0.3626934309190754, "learning_rate": 6.741355324667963e-05, "loss": 0.3859, "step": 1565 }, { "epoch": 1.6695095948827292, "grad_norm": 0.30739074025899915, "learning_rate": 6.739186563470208e-05, "loss": 0.3855, "step": 1566 }, { "epoch": 1.6705756929637525, "grad_norm": 0.39101458085234175, "learning_rate": 6.737016284905455e-05, "loss": 0.3931, "step": 1567 }, { "epoch": 1.671641791044776, "grad_norm": 0.5051971466609865, "learning_rate": 6.734844490175929e-05, "loss": 0.3881, "step": 1568 }, { "epoch": 1.6727078891257996, "grad_norm": 0.5097545702694593, "learning_rate": 6.732671180484687e-05, "loss": 0.385, "step": 1569 }, { "epoch": 1.6737739872068231, "grad_norm": 0.39153153015185177, "learning_rate": 6.730496357035634e-05, "loss": 0.3876, "step": 1570 }, { "epoch": 1.6748400852878464, "grad_norm": 0.3456157078554192, "learning_rate": 6.728320021033509e-05, "loss": 0.3913, "step": 1571 }, { "epoch": 1.67590618336887, "grad_norm": 0.38289300413638977, "learning_rate": 6.726142173683884e-05, "loss": 0.3889, "step": 1572 }, { "epoch": 1.6769722814498933, "grad_norm": 0.36549754916811916, "learning_rate": 6.723962816193178e-05, "loss": 0.389, "step": 1573 }, { "epoch": 1.6780383795309168, "grad_norm": 0.33051094759676547, "learning_rate": 6.721781949768639e-05, "loss": 0.3919, "step": 1574 }, { "epoch": 1.6791044776119404, "grad_norm": 0.36416872005257267, "learning_rate": 6.719599575618357e-05, "loss": 0.3912, "step": 1575 }, { "epoch": 1.680170575692964, "grad_norm": 0.34559502107142376, "learning_rate": 6.717415694951251e-05, "loss": 0.3851, "step": 1576 }, { "epoch": 1.6812366737739872, "grad_norm": 0.3116613026962773, "learning_rate": 6.715230308977078e-05, "loss": 0.3916, "step": 1577 }, { "epoch": 1.6823027718550105, "grad_norm": 0.30050046517095325, "learning_rate": 6.713043418906428e-05, "loss": 0.3962, "step": 1578 }, { "epoch": 1.683368869936034, "grad_norm": 0.3577516859630259, "learning_rate": 6.710855025950727e-05, "loss": 0.3888, "step": 1579 }, { "epoch": 1.6844349680170576, "grad_norm": 0.4364133412686821, "learning_rate": 6.708665131322227e-05, "loss": 0.3863, "step": 1580 }, { "epoch": 1.6855010660980811, "grad_norm": 0.4687325703384506, "learning_rate": 6.706473736234018e-05, "loss": 0.3915, "step": 1581 }, { "epoch": 1.6865671641791045, "grad_norm": 0.5090267219412645, "learning_rate": 6.704280841900019e-05, "loss": 0.3839, "step": 1582 }, { "epoch": 1.6876332622601278, "grad_norm": 0.5457964796096376, "learning_rate": 6.70208644953498e-05, "loss": 0.3956, "step": 1583 }, { "epoch": 1.6886993603411513, "grad_norm": 0.6368745088599855, "learning_rate": 6.699890560354478e-05, "loss": 0.3902, "step": 1584 }, { "epoch": 1.6897654584221748, "grad_norm": 0.7026042269970657, "learning_rate": 6.697693175574923e-05, "loss": 0.3898, "step": 1585 }, { "epoch": 1.6908315565031984, "grad_norm": 0.6804973492768571, "learning_rate": 6.695494296413554e-05, "loss": 0.3835, "step": 1586 }, { "epoch": 1.6918976545842217, "grad_norm": 0.6670494458267645, "learning_rate": 6.693293924088432e-05, "loss": 0.3951, "step": 1587 }, { "epoch": 1.6929637526652452, "grad_norm": 0.6433701055713933, "learning_rate": 6.691092059818451e-05, "loss": 0.3884, "step": 1588 }, { "epoch": 1.6940298507462686, "grad_norm": 0.6565301872411362, "learning_rate": 6.688888704823329e-05, "loss": 0.3951, "step": 1589 }, { "epoch": 1.695095948827292, "grad_norm": 0.6960842723329089, "learning_rate": 6.686683860323611e-05, "loss": 0.3946, "step": 1590 }, { "epoch": 1.6961620469083156, "grad_norm": 0.6818586763465625, "learning_rate": 6.684477527540664e-05, "loss": 0.3946, "step": 1591 }, { "epoch": 1.6972281449893392, "grad_norm": 0.5572883382416285, "learning_rate": 6.682269707696685e-05, "loss": 0.3867, "step": 1592 }, { "epoch": 1.6982942430703625, "grad_norm": 0.3990760403818603, "learning_rate": 6.680060402014689e-05, "loss": 0.3854, "step": 1593 }, { "epoch": 1.6993603411513858, "grad_norm": 0.4118819183924536, "learning_rate": 6.677849611718515e-05, "loss": 0.3864, "step": 1594 }, { "epoch": 1.7004264392324093, "grad_norm": 0.49548805198701124, "learning_rate": 6.67563733803283e-05, "loss": 0.3974, "step": 1595 }, { "epoch": 1.7014925373134329, "grad_norm": 0.5484992450512396, "learning_rate": 6.673423582183117e-05, "loss": 0.3895, "step": 1596 }, { "epoch": 1.7025586353944564, "grad_norm": 0.5575698301270008, "learning_rate": 6.67120834539568e-05, "loss": 0.3877, "step": 1597 }, { "epoch": 1.7036247334754797, "grad_norm": 0.5151550662716535, "learning_rate": 6.668991628897648e-05, "loss": 0.3877, "step": 1598 }, { "epoch": 1.7046908315565032, "grad_norm": 0.4728544267085605, "learning_rate": 6.666773433916965e-05, "loss": 0.3869, "step": 1599 }, { "epoch": 1.7057569296375266, "grad_norm": 0.47045760280939164, "learning_rate": 6.664553761682395e-05, "loss": 0.3851, "step": 1600 }, { "epoch": 1.70682302771855, "grad_norm": 0.45385159010082227, "learning_rate": 6.662332613423522e-05, "loss": 0.3957, "step": 1601 }, { "epoch": 1.7078891257995736, "grad_norm": 0.39340233247394074, "learning_rate": 6.660109990370747e-05, "loss": 0.3831, "step": 1602 }, { "epoch": 1.7089552238805972, "grad_norm": 0.4438034696481542, "learning_rate": 6.657885893755288e-05, "loss": 0.3941, "step": 1603 }, { "epoch": 1.7100213219616205, "grad_norm": 0.5254817458294889, "learning_rate": 6.655660324809177e-05, "loss": 0.387, "step": 1604 }, { "epoch": 1.7110874200426438, "grad_norm": 0.4913954053153753, "learning_rate": 6.653433284765266e-05, "loss": 0.3826, "step": 1605 }, { "epoch": 1.7121535181236673, "grad_norm": 0.36756522486239, "learning_rate": 6.651204774857218e-05, "loss": 0.4007, "step": 1606 }, { "epoch": 1.7132196162046909, "grad_norm": 0.34100380243193384, "learning_rate": 6.648974796319512e-05, "loss": 0.3932, "step": 1607 }, { "epoch": 1.7142857142857144, "grad_norm": 0.38180887373133354, "learning_rate": 6.646743350387438e-05, "loss": 0.3857, "step": 1608 }, { "epoch": 1.7153518123667377, "grad_norm": 0.4155049597785398, "learning_rate": 6.644510438297105e-05, "loss": 0.3925, "step": 1609 }, { "epoch": 1.716417910447761, "grad_norm": 0.4422223445035819, "learning_rate": 6.642276061285428e-05, "loss": 0.387, "step": 1610 }, { "epoch": 1.7174840085287846, "grad_norm": 0.5794607050629841, "learning_rate": 6.640040220590136e-05, "loss": 0.3887, "step": 1611 }, { "epoch": 1.7185501066098081, "grad_norm": 0.6678832046144233, "learning_rate": 6.63780291744977e-05, "loss": 0.3899, "step": 1612 }, { "epoch": 1.7196162046908317, "grad_norm": 0.6122776429863264, "learning_rate": 6.635564153103677e-05, "loss": 0.3929, "step": 1613 }, { "epoch": 1.720682302771855, "grad_norm": 0.5059533853234256, "learning_rate": 6.633323928792018e-05, "loss": 0.3915, "step": 1614 }, { "epoch": 1.7217484008528785, "grad_norm": 0.4164548494103394, "learning_rate": 6.631082245755762e-05, "loss": 0.3962, "step": 1615 }, { "epoch": 1.7228144989339018, "grad_norm": 0.39276463156642827, "learning_rate": 6.628839105236681e-05, "loss": 0.3904, "step": 1616 }, { "epoch": 1.7238805970149254, "grad_norm": 0.41559718909247517, "learning_rate": 6.626594508477361e-05, "loss": 0.3917, "step": 1617 }, { "epoch": 1.724946695095949, "grad_norm": 0.44665059431355064, "learning_rate": 6.624348456721191e-05, "loss": 0.3904, "step": 1618 }, { "epoch": 1.7260127931769724, "grad_norm": 0.4665709219109089, "learning_rate": 6.622100951212368e-05, "loss": 0.3901, "step": 1619 }, { "epoch": 1.7270788912579957, "grad_norm": 0.5150232707357006, "learning_rate": 6.619851993195893e-05, "loss": 0.3922, "step": 1620 }, { "epoch": 1.728144989339019, "grad_norm": 0.6026588085567588, "learning_rate": 6.61760158391757e-05, "loss": 0.3972, "step": 1621 }, { "epoch": 1.7292110874200426, "grad_norm": 0.6385069316475197, "learning_rate": 6.615349724624012e-05, "loss": 0.3963, "step": 1622 }, { "epoch": 1.7302771855010661, "grad_norm": 0.6066727609006709, "learning_rate": 6.61309641656263e-05, "loss": 0.3916, "step": 1623 }, { "epoch": 1.7313432835820897, "grad_norm": 0.5241084773133352, "learning_rate": 6.610841660981639e-05, "loss": 0.3897, "step": 1624 }, { "epoch": 1.732409381663113, "grad_norm": 0.4853459956518966, "learning_rate": 6.608585459130057e-05, "loss": 0.3943, "step": 1625 }, { "epoch": 1.7334754797441365, "grad_norm": 0.43877664392065524, "learning_rate": 6.606327812257705e-05, "loss": 0.3889, "step": 1626 }, { "epoch": 1.7345415778251598, "grad_norm": 0.39226116124002375, "learning_rate": 6.604068721615198e-05, "loss": 0.394, "step": 1627 }, { "epoch": 1.7356076759061834, "grad_norm": 0.34110686996523315, "learning_rate": 6.601808188453957e-05, "loss": 0.3912, "step": 1628 }, { "epoch": 1.736673773987207, "grad_norm": 0.3299899773295145, "learning_rate": 6.599546214026199e-05, "loss": 0.396, "step": 1629 }, { "epoch": 1.7377398720682304, "grad_norm": 0.376767657465232, "learning_rate": 6.597282799584941e-05, "loss": 0.3878, "step": 1630 }, { "epoch": 1.7388059701492538, "grad_norm": 0.3870812218773304, "learning_rate": 6.595017946383998e-05, "loss": 0.3924, "step": 1631 }, { "epoch": 1.739872068230277, "grad_norm": 0.32816223635729713, "learning_rate": 6.59275165567798e-05, "loss": 0.3891, "step": 1632 }, { "epoch": 1.7409381663113006, "grad_norm": 0.3368154332955513, "learning_rate": 6.590483928722293e-05, "loss": 0.3867, "step": 1633 }, { "epoch": 1.7420042643923241, "grad_norm": 0.32846108321184647, "learning_rate": 6.58821476677314e-05, "loss": 0.3899, "step": 1634 }, { "epoch": 1.7430703624733477, "grad_norm": 0.30960636706164546, "learning_rate": 6.585944171087521e-05, "loss": 0.3967, "step": 1635 }, { "epoch": 1.744136460554371, "grad_norm": 0.3431044349932443, "learning_rate": 6.583672142923226e-05, "loss": 0.3957, "step": 1636 }, { "epoch": 1.7452025586353943, "grad_norm": 0.3817314249874563, "learning_rate": 6.581398683538842e-05, "loss": 0.3938, "step": 1637 }, { "epoch": 1.7462686567164178, "grad_norm": 0.3846143335468199, "learning_rate": 6.579123794193746e-05, "loss": 0.3875, "step": 1638 }, { "epoch": 1.7473347547974414, "grad_norm": 0.40698506124086886, "learning_rate": 6.576847476148109e-05, "loss": 0.3878, "step": 1639 }, { "epoch": 1.748400852878465, "grad_norm": 0.5160395538152548, "learning_rate": 6.574569730662893e-05, "loss": 0.3935, "step": 1640 }, { "epoch": 1.7494669509594882, "grad_norm": 0.6377912942089725, "learning_rate": 6.57229055899985e-05, "loss": 0.392, "step": 1641 }, { "epoch": 1.7505330490405118, "grad_norm": 0.6901659496461329, "learning_rate": 6.570009962421523e-05, "loss": 0.3912, "step": 1642 }, { "epoch": 1.751599147121535, "grad_norm": 0.6169852874235238, "learning_rate": 6.567727942191246e-05, "loss": 0.391, "step": 1643 }, { "epoch": 1.7526652452025586, "grad_norm": 0.4948279659725487, "learning_rate": 6.565444499573136e-05, "loss": 0.3893, "step": 1644 }, { "epoch": 1.7537313432835822, "grad_norm": 0.5319346837051117, "learning_rate": 6.563159635832105e-05, "loss": 0.396, "step": 1645 }, { "epoch": 1.7547974413646057, "grad_norm": 0.6070963028163101, "learning_rate": 6.560873352233846e-05, "loss": 0.3879, "step": 1646 }, { "epoch": 1.755863539445629, "grad_norm": 0.5500219234930699, "learning_rate": 6.558585650044842e-05, "loss": 0.3962, "step": 1647 }, { "epoch": 1.7569296375266523, "grad_norm": 0.3774689595977208, "learning_rate": 6.556296530532364e-05, "loss": 0.3853, "step": 1648 }, { "epoch": 1.7579957356076759, "grad_norm": 0.3097496369115005, "learning_rate": 6.554005994964459e-05, "loss": 0.3889, "step": 1649 }, { "epoch": 1.7590618336886994, "grad_norm": 0.3192713728666256, "learning_rate": 6.55171404460997e-05, "loss": 0.3904, "step": 1650 }, { "epoch": 1.760127931769723, "grad_norm": 0.32864817312795563, "learning_rate": 6.549420680738516e-05, "loss": 0.3944, "step": 1651 }, { "epoch": 1.7611940298507462, "grad_norm": 0.32211875803439644, "learning_rate": 6.547125904620504e-05, "loss": 0.385, "step": 1652 }, { "epoch": 1.7622601279317696, "grad_norm": 0.30017127275221656, "learning_rate": 6.544829717527118e-05, "loss": 0.3885, "step": 1653 }, { "epoch": 1.763326226012793, "grad_norm": 0.3646601960483686, "learning_rate": 6.542532120730327e-05, "loss": 0.3905, "step": 1654 }, { "epoch": 1.7643923240938166, "grad_norm": 0.4908471802914999, "learning_rate": 6.540233115502881e-05, "loss": 0.3889, "step": 1655 }, { "epoch": 1.7654584221748402, "grad_norm": 0.5574952396992828, "learning_rate": 6.537932703118308e-05, "loss": 0.3881, "step": 1656 }, { "epoch": 1.7665245202558635, "grad_norm": 0.5663694328254277, "learning_rate": 6.535630884850917e-05, "loss": 0.3862, "step": 1657 }, { "epoch": 1.767590618336887, "grad_norm": 0.5503251345750754, "learning_rate": 6.533327661975799e-05, "loss": 0.3826, "step": 1658 }, { "epoch": 1.7686567164179103, "grad_norm": 0.4691123534761767, "learning_rate": 6.531023035768815e-05, "loss": 0.3905, "step": 1659 }, { "epoch": 1.7697228144989339, "grad_norm": 0.41955605601856755, "learning_rate": 6.528717007506612e-05, "loss": 0.396, "step": 1660 }, { "epoch": 1.7707889125799574, "grad_norm": 0.4274791936147579, "learning_rate": 6.526409578466606e-05, "loss": 0.3874, "step": 1661 }, { "epoch": 1.771855010660981, "grad_norm": 0.4609745881429261, "learning_rate": 6.524100749926997e-05, "loss": 0.3965, "step": 1662 }, { "epoch": 1.7729211087420043, "grad_norm": 0.5004816396707016, "learning_rate": 6.521790523166752e-05, "loss": 0.391, "step": 1663 }, { "epoch": 1.7739872068230276, "grad_norm": 0.5323254145339794, "learning_rate": 6.51947889946562e-05, "loss": 0.3855, "step": 1664 }, { "epoch": 1.775053304904051, "grad_norm": 0.5367601341161515, "learning_rate": 6.517165880104119e-05, "loss": 0.395, "step": 1665 }, { "epoch": 1.7761194029850746, "grad_norm": 0.5721670713464904, "learning_rate": 6.51485146636354e-05, "loss": 0.3831, "step": 1666 }, { "epoch": 1.7771855010660982, "grad_norm": 0.6751411525618288, "learning_rate": 6.51253565952595e-05, "loss": 0.3897, "step": 1667 }, { "epoch": 1.7782515991471215, "grad_norm": 0.7475500173594719, "learning_rate": 6.510218460874186e-05, "loss": 0.3918, "step": 1668 }, { "epoch": 1.779317697228145, "grad_norm": 0.7174243154443929, "learning_rate": 6.507899871691852e-05, "loss": 0.3872, "step": 1669 }, { "epoch": 1.7803837953091683, "grad_norm": 0.5785749621394422, "learning_rate": 6.50557989326333e-05, "loss": 0.3898, "step": 1670 }, { "epoch": 1.7814498933901919, "grad_norm": 0.4450886636964885, "learning_rate": 6.503258526873767e-05, "loss": 0.3824, "step": 1671 }, { "epoch": 1.7825159914712154, "grad_norm": 0.4254053253981925, "learning_rate": 6.500935773809076e-05, "loss": 0.3911, "step": 1672 }, { "epoch": 1.783582089552239, "grad_norm": 0.4474966076928897, "learning_rate": 6.498611635355947e-05, "loss": 0.39, "step": 1673 }, { "epoch": 1.7846481876332623, "grad_norm": 0.45486240848719184, "learning_rate": 6.496286112801826e-05, "loss": 0.3914, "step": 1674 }, { "epoch": 1.7857142857142856, "grad_norm": 0.514667280468136, "learning_rate": 6.493959207434934e-05, "loss": 0.3861, "step": 1675 }, { "epoch": 1.7867803837953091, "grad_norm": 0.5337184119841549, "learning_rate": 6.491630920544257e-05, "loss": 0.3871, "step": 1676 }, { "epoch": 1.7878464818763327, "grad_norm": 0.4780735793732188, "learning_rate": 6.489301253419545e-05, "loss": 0.3821, "step": 1677 }, { "epoch": 1.7889125799573562, "grad_norm": 0.4231898830746279, "learning_rate": 6.48697020735131e-05, "loss": 0.386, "step": 1678 }, { "epoch": 1.7899786780383795, "grad_norm": 0.3663697733311359, "learning_rate": 6.484637783630832e-05, "loss": 0.385, "step": 1679 }, { "epoch": 1.7910447761194028, "grad_norm": 0.30984371824344287, "learning_rate": 6.482303983550151e-05, "loss": 0.391, "step": 1680 }, { "epoch": 1.7921108742004264, "grad_norm": 0.35751913701235005, "learning_rate": 6.479968808402075e-05, "loss": 0.3957, "step": 1681 }, { "epoch": 1.79317697228145, "grad_norm": 0.4093914369810105, "learning_rate": 6.477632259480165e-05, "loss": 0.3935, "step": 1682 }, { "epoch": 1.7942430703624734, "grad_norm": 0.41377357296377376, "learning_rate": 6.475294338078752e-05, "loss": 0.3844, "step": 1683 }, { "epoch": 1.7953091684434968, "grad_norm": 0.39868902718714505, "learning_rate": 6.472955045492918e-05, "loss": 0.392, "step": 1684 }, { "epoch": 1.7963752665245203, "grad_norm": 0.42005660101933967, "learning_rate": 6.470614383018512e-05, "loss": 0.3932, "step": 1685 }, { "epoch": 1.7974413646055436, "grad_norm": 0.4135157735982948, "learning_rate": 6.468272351952141e-05, "loss": 0.3938, "step": 1686 }, { "epoch": 1.7985074626865671, "grad_norm": 0.42801661015742615, "learning_rate": 6.465928953591165e-05, "loss": 0.3879, "step": 1687 }, { "epoch": 1.7995735607675907, "grad_norm": 0.4816981612374122, "learning_rate": 6.463584189233709e-05, "loss": 0.3951, "step": 1688 }, { "epoch": 1.8006396588486142, "grad_norm": 0.46743748656875295, "learning_rate": 6.461238060178647e-05, "loss": 0.3939, "step": 1689 }, { "epoch": 1.8017057569296375, "grad_norm": 0.4222158155941226, "learning_rate": 6.458890567725614e-05, "loss": 0.3906, "step": 1690 }, { "epoch": 1.8027718550106608, "grad_norm": 0.3658953583582503, "learning_rate": 6.456541713174999e-05, "loss": 0.3962, "step": 1691 }, { "epoch": 1.8038379530916844, "grad_norm": 0.33668688496020444, "learning_rate": 6.454191497827945e-05, "loss": 0.3921, "step": 1692 }, { "epoch": 1.804904051172708, "grad_norm": 0.3206971308888266, "learning_rate": 6.451839922986349e-05, "loss": 0.3883, "step": 1693 }, { "epoch": 1.8059701492537314, "grad_norm": 0.3143057384612725, "learning_rate": 6.449486989952863e-05, "loss": 0.3896, "step": 1694 }, { "epoch": 1.8070362473347548, "grad_norm": 0.34510492572302714, "learning_rate": 6.447132700030887e-05, "loss": 0.3826, "step": 1695 }, { "epoch": 1.8081023454157783, "grad_norm": 0.4547140804511741, "learning_rate": 6.444777054524576e-05, "loss": 0.3884, "step": 1696 }, { "epoch": 1.8091684434968016, "grad_norm": 0.47384268369658744, "learning_rate": 6.442420054738837e-05, "loss": 0.3834, "step": 1697 }, { "epoch": 1.8102345415778252, "grad_norm": 0.47381168033622467, "learning_rate": 6.440061701979323e-05, "loss": 0.3812, "step": 1698 }, { "epoch": 1.8113006396588487, "grad_norm": 0.4548557810738229, "learning_rate": 6.43770199755244e-05, "loss": 0.3879, "step": 1699 }, { "epoch": 1.8123667377398722, "grad_norm": 0.4563303678565612, "learning_rate": 6.435340942765341e-05, "loss": 0.3881, "step": 1700 }, { "epoch": 1.8134328358208955, "grad_norm": 0.4345832462667786, "learning_rate": 6.432978538925928e-05, "loss": 0.3875, "step": 1701 }, { "epoch": 1.8144989339019189, "grad_norm": 0.36369291668844994, "learning_rate": 6.430614787342853e-05, "loss": 0.3891, "step": 1702 }, { "epoch": 1.8155650319829424, "grad_norm": 0.351336330190162, "learning_rate": 6.428249689325505e-05, "loss": 0.3884, "step": 1703 }, { "epoch": 1.816631130063966, "grad_norm": 0.3720689107883433, "learning_rate": 6.425883246184031e-05, "loss": 0.3917, "step": 1704 }, { "epoch": 1.8176972281449895, "grad_norm": 0.3277156951268358, "learning_rate": 6.423515459229313e-05, "loss": 0.3857, "step": 1705 }, { "epoch": 1.8187633262260128, "grad_norm": 0.3146191623979247, "learning_rate": 6.421146329772988e-05, "loss": 0.3881, "step": 1706 }, { "epoch": 1.819829424307036, "grad_norm": 0.39372224035911607, "learning_rate": 6.418775859127424e-05, "loss": 0.3847, "step": 1707 }, { "epoch": 1.8208955223880596, "grad_norm": 0.4723072413424627, "learning_rate": 6.416404048605744e-05, "loss": 0.3946, "step": 1708 }, { "epoch": 1.8219616204690832, "grad_norm": 0.5461469639906784, "learning_rate": 6.414030899521802e-05, "loss": 0.3977, "step": 1709 }, { "epoch": 1.8230277185501067, "grad_norm": 0.5644867900194028, "learning_rate": 6.411656413190205e-05, "loss": 0.3854, "step": 1710 }, { "epoch": 1.82409381663113, "grad_norm": 0.48007304376329935, "learning_rate": 6.409280590926292e-05, "loss": 0.3947, "step": 1711 }, { "epoch": 1.8251599147121536, "grad_norm": 0.41131804519163606, "learning_rate": 6.406903434046146e-05, "loss": 0.3913, "step": 1712 }, { "epoch": 1.8262260127931769, "grad_norm": 0.37379945962827266, "learning_rate": 6.404524943866588e-05, "loss": 0.3807, "step": 1713 }, { "epoch": 1.8272921108742004, "grad_norm": 0.42604642417053085, "learning_rate": 6.402145121705178e-05, "loss": 0.3953, "step": 1714 }, { "epoch": 1.828358208955224, "grad_norm": 0.541443650037536, "learning_rate": 6.399763968880214e-05, "loss": 0.3915, "step": 1715 }, { "epoch": 1.8294243070362475, "grad_norm": 0.5461544504898415, "learning_rate": 6.397381486710728e-05, "loss": 0.3917, "step": 1716 }, { "epoch": 1.8304904051172708, "grad_norm": 0.4620349232311522, "learning_rate": 6.394997676516497e-05, "loss": 0.3951, "step": 1717 }, { "epoch": 1.831556503198294, "grad_norm": 0.3606206070971171, "learning_rate": 6.392612539618024e-05, "loss": 0.3877, "step": 1718 }, { "epoch": 1.8326226012793176, "grad_norm": 0.3665152949807116, "learning_rate": 6.39022607733655e-05, "loss": 0.3911, "step": 1719 }, { "epoch": 1.8336886993603412, "grad_norm": 0.3617479779123929, "learning_rate": 6.387838290994056e-05, "loss": 0.3864, "step": 1720 }, { "epoch": 1.8347547974413647, "grad_norm": 0.35982095746420845, "learning_rate": 6.385449181913246e-05, "loss": 0.387, "step": 1721 }, { "epoch": 1.835820895522388, "grad_norm": 0.42991157325695156, "learning_rate": 6.383058751417566e-05, "loss": 0.3872, "step": 1722 }, { "epoch": 1.8368869936034116, "grad_norm": 0.4025624229827069, "learning_rate": 6.380667000831188e-05, "loss": 0.3879, "step": 1723 }, { "epoch": 1.8379530916844349, "grad_norm": 0.37576823662421394, "learning_rate": 6.37827393147902e-05, "loss": 0.3934, "step": 1724 }, { "epoch": 1.8390191897654584, "grad_norm": 0.4724397154377103, "learning_rate": 6.375879544686695e-05, "loss": 0.3907, "step": 1725 }, { "epoch": 1.840085287846482, "grad_norm": 0.49890032962764724, "learning_rate": 6.37348384178058e-05, "loss": 0.3905, "step": 1726 }, { "epoch": 1.8411513859275055, "grad_norm": 0.5271786209536122, "learning_rate": 6.371086824087772e-05, "loss": 0.3879, "step": 1727 }, { "epoch": 1.8422174840085288, "grad_norm": 0.5789984364979213, "learning_rate": 6.368688492936091e-05, "loss": 0.3968, "step": 1728 }, { "epoch": 1.8432835820895521, "grad_norm": 0.5914257408271724, "learning_rate": 6.366288849654091e-05, "loss": 0.3917, "step": 1729 }, { "epoch": 1.8443496801705757, "grad_norm": 0.4741221375005763, "learning_rate": 6.363887895571045e-05, "loss": 0.3845, "step": 1730 }, { "epoch": 1.8454157782515992, "grad_norm": 0.3731893195453408, "learning_rate": 6.361485632016963e-05, "loss": 0.3927, "step": 1731 }, { "epoch": 1.8464818763326227, "grad_norm": 0.36291074847148286, "learning_rate": 6.359082060322569e-05, "loss": 0.3834, "step": 1732 }, { "epoch": 1.847547974413646, "grad_norm": 0.33824345650854754, "learning_rate": 6.356677181819319e-05, "loss": 0.3923, "step": 1733 }, { "epoch": 1.8486140724946694, "grad_norm": 0.3590581537534159, "learning_rate": 6.35427099783939e-05, "loss": 0.3885, "step": 1734 }, { "epoch": 1.849680170575693, "grad_norm": 0.393824937605615, "learning_rate": 6.351863509715684e-05, "loss": 0.3808, "step": 1735 }, { "epoch": 1.8507462686567164, "grad_norm": 0.38639667752223744, "learning_rate": 6.349454718781822e-05, "loss": 0.3893, "step": 1736 }, { "epoch": 1.85181236673774, "grad_norm": 0.3933756538854629, "learning_rate": 6.347044626372153e-05, "loss": 0.3866, "step": 1737 }, { "epoch": 1.8528784648187633, "grad_norm": 0.4385122841126179, "learning_rate": 6.34463323382174e-05, "loss": 0.388, "step": 1738 }, { "epoch": 1.8539445628997868, "grad_norm": 0.4219487489482543, "learning_rate": 6.342220542466368e-05, "loss": 0.3902, "step": 1739 }, { "epoch": 1.8550106609808101, "grad_norm": 0.42067268031548977, "learning_rate": 6.339806553642545e-05, "loss": 0.3873, "step": 1740 }, { "epoch": 1.8560767590618337, "grad_norm": 0.4563965192041482, "learning_rate": 6.337391268687495e-05, "loss": 0.3865, "step": 1741 }, { "epoch": 1.8571428571428572, "grad_norm": 0.4746787282948449, "learning_rate": 6.334974688939161e-05, "loss": 0.3832, "step": 1742 }, { "epoch": 1.8582089552238807, "grad_norm": 0.4616609792809343, "learning_rate": 6.3325568157362e-05, "loss": 0.3878, "step": 1743 }, { "epoch": 1.859275053304904, "grad_norm": 0.4503741534784094, "learning_rate": 6.33013765041799e-05, "loss": 0.3857, "step": 1744 }, { "epoch": 1.8603411513859274, "grad_norm": 0.3431922287079442, "learning_rate": 6.327717194324622e-05, "loss": 0.3903, "step": 1745 }, { "epoch": 1.861407249466951, "grad_norm": 0.2609150587662325, "learning_rate": 6.325295448796903e-05, "loss": 0.391, "step": 1746 }, { "epoch": 1.8624733475479744, "grad_norm": 0.33270455925916104, "learning_rate": 6.322872415176356e-05, "loss": 0.3934, "step": 1747 }, { "epoch": 1.863539445628998, "grad_norm": 0.43954183331919916, "learning_rate": 6.320448094805214e-05, "loss": 0.3873, "step": 1748 }, { "epoch": 1.8646055437100213, "grad_norm": 0.5101457999652256, "learning_rate": 6.318022489026425e-05, "loss": 0.3906, "step": 1749 }, { "epoch": 1.8656716417910446, "grad_norm": 0.5395181016945636, "learning_rate": 6.315595599183646e-05, "loss": 0.3858, "step": 1750 }, { "epoch": 1.8667377398720681, "grad_norm": 0.6031812678730777, "learning_rate": 6.313167426621253e-05, "loss": 0.3868, "step": 1751 }, { "epoch": 1.8678038379530917, "grad_norm": 0.6041907845639172, "learning_rate": 6.310737972684322e-05, "loss": 0.3902, "step": 1752 }, { "epoch": 1.8688699360341152, "grad_norm": 0.5089623629692637, "learning_rate": 6.308307238718649e-05, "loss": 0.385, "step": 1753 }, { "epoch": 1.8699360341151388, "grad_norm": 0.40212189395827613, "learning_rate": 6.305875226070729e-05, "loss": 0.3914, "step": 1754 }, { "epoch": 1.871002132196162, "grad_norm": 0.35938255962719357, "learning_rate": 6.303441936087776e-05, "loss": 0.3867, "step": 1755 }, { "epoch": 1.8720682302771854, "grad_norm": 0.42202040294266446, "learning_rate": 6.301007370117703e-05, "loss": 0.3902, "step": 1756 }, { "epoch": 1.873134328358209, "grad_norm": 0.5647279892704614, "learning_rate": 6.298571529509135e-05, "loss": 0.3845, "step": 1757 }, { "epoch": 1.8742004264392325, "grad_norm": 0.6637603262956403, "learning_rate": 6.296134415611399e-05, "loss": 0.3854, "step": 1758 }, { "epoch": 1.875266524520256, "grad_norm": 0.7031726822916566, "learning_rate": 6.29369602977453e-05, "loss": 0.3855, "step": 1759 }, { "epoch": 1.8763326226012793, "grad_norm": 0.7834542862723698, "learning_rate": 6.291256373349269e-05, "loss": 0.3945, "step": 1760 }, { "epoch": 1.8773987206823026, "grad_norm": 0.8226206202632336, "learning_rate": 6.288815447687056e-05, "loss": 0.3901, "step": 1761 }, { "epoch": 1.8784648187633262, "grad_norm": 0.7994895608418134, "learning_rate": 6.286373254140038e-05, "loss": 0.3893, "step": 1762 }, { "epoch": 1.8795309168443497, "grad_norm": 0.7178750160169209, "learning_rate": 6.283929794061065e-05, "loss": 0.3917, "step": 1763 }, { "epoch": 1.8805970149253732, "grad_norm": 0.5231999499518026, "learning_rate": 6.281485068803683e-05, "loss": 0.3857, "step": 1764 }, { "epoch": 1.8816631130063965, "grad_norm": 0.39458981834934215, "learning_rate": 6.279039079722147e-05, "loss": 0.3873, "step": 1765 }, { "epoch": 1.88272921108742, "grad_norm": 0.5291211636623102, "learning_rate": 6.276591828171406e-05, "loss": 0.386, "step": 1766 }, { "epoch": 1.8837953091684434, "grad_norm": 0.6451705117892833, "learning_rate": 6.274143315507108e-05, "loss": 0.3861, "step": 1767 }, { "epoch": 1.884861407249467, "grad_norm": 0.6150320216897379, "learning_rate": 6.271693543085607e-05, "loss": 0.3891, "step": 1768 }, { "epoch": 1.8859275053304905, "grad_norm": 0.4359432097990482, "learning_rate": 6.269242512263945e-05, "loss": 0.3929, "step": 1769 }, { "epoch": 1.886993603411514, "grad_norm": 0.31560057559310783, "learning_rate": 6.266790224399867e-05, "loss": 0.3887, "step": 1770 }, { "epoch": 1.8880597014925373, "grad_norm": 0.4422794491574624, "learning_rate": 6.264336680851813e-05, "loss": 0.3902, "step": 1771 }, { "epoch": 1.8891257995735606, "grad_norm": 0.5106376023504305, "learning_rate": 6.26188188297892e-05, "loss": 0.3891, "step": 1772 }, { "epoch": 1.8901918976545842, "grad_norm": 0.4415611667008926, "learning_rate": 6.259425832141017e-05, "loss": 0.3859, "step": 1773 }, { "epoch": 1.8912579957356077, "grad_norm": 0.40641816501262484, "learning_rate": 6.256968529698628e-05, "loss": 0.3937, "step": 1774 }, { "epoch": 1.8923240938166312, "grad_norm": 0.448521267716046, "learning_rate": 6.254509977012972e-05, "loss": 0.3793, "step": 1775 }, { "epoch": 1.8933901918976546, "grad_norm": 0.5095261552719494, "learning_rate": 6.252050175445959e-05, "loss": 0.3887, "step": 1776 }, { "epoch": 1.8944562899786779, "grad_norm": 0.5730353742123191, "learning_rate": 6.249589126360192e-05, "loss": 0.3876, "step": 1777 }, { "epoch": 1.8955223880597014, "grad_norm": 0.6087785843423836, "learning_rate": 6.247126831118962e-05, "loss": 0.391, "step": 1778 }, { "epoch": 1.896588486140725, "grad_norm": 0.5935074468788941, "learning_rate": 6.244663291086256e-05, "loss": 0.392, "step": 1779 }, { "epoch": 1.8976545842217485, "grad_norm": 0.5590142718060106, "learning_rate": 6.242198507626746e-05, "loss": 0.3893, "step": 1780 }, { "epoch": 1.8987206823027718, "grad_norm": 0.5336039934734914, "learning_rate": 6.23973248210579e-05, "loss": 0.3855, "step": 1781 }, { "epoch": 1.8997867803837953, "grad_norm": 0.445389003613856, "learning_rate": 6.237265215889444e-05, "loss": 0.3886, "step": 1782 }, { "epoch": 1.9008528784648187, "grad_norm": 0.3716042168347448, "learning_rate": 6.234796710344441e-05, "loss": 0.3798, "step": 1783 }, { "epoch": 1.9019189765458422, "grad_norm": 0.4696131039929026, "learning_rate": 6.232326966838207e-05, "loss": 0.3924, "step": 1784 }, { "epoch": 1.9029850746268657, "grad_norm": 0.5401242190818367, "learning_rate": 6.229855986738851e-05, "loss": 0.3868, "step": 1785 }, { "epoch": 1.9040511727078893, "grad_norm": 0.4997589943313716, "learning_rate": 6.227383771415166e-05, "loss": 0.3889, "step": 1786 }, { "epoch": 1.9051172707889126, "grad_norm": 0.4625159920961848, "learning_rate": 6.224910322236634e-05, "loss": 0.3902, "step": 1787 }, { "epoch": 1.906183368869936, "grad_norm": 0.4473030136236895, "learning_rate": 6.222435640573414e-05, "loss": 0.3922, "step": 1788 }, { "epoch": 1.9072494669509594, "grad_norm": 0.3688121331850802, "learning_rate": 6.219959727796354e-05, "loss": 0.3819, "step": 1789 }, { "epoch": 1.908315565031983, "grad_norm": 0.27997416856472385, "learning_rate": 6.217482585276979e-05, "loss": 0.3935, "step": 1790 }, { "epoch": 1.9093816631130065, "grad_norm": 0.2996081067229179, "learning_rate": 6.215004214387497e-05, "loss": 0.3889, "step": 1791 }, { "epoch": 1.9104477611940298, "grad_norm": 0.4281784065305154, "learning_rate": 6.212524616500798e-05, "loss": 0.3865, "step": 1792 }, { "epoch": 1.9115138592750534, "grad_norm": 0.44659189904413066, "learning_rate": 6.210043792990449e-05, "loss": 0.3868, "step": 1793 }, { "epoch": 1.9125799573560767, "grad_norm": 0.3623900185596153, "learning_rate": 6.2075617452307e-05, "loss": 0.3823, "step": 1794 }, { "epoch": 1.9136460554371002, "grad_norm": 0.37626885568568225, "learning_rate": 6.205078474596473e-05, "loss": 0.3873, "step": 1795 }, { "epoch": 1.9147121535181237, "grad_norm": 0.3768400015936373, "learning_rate": 6.202593982463373e-05, "loss": 0.39, "step": 1796 }, { "epoch": 1.9157782515991473, "grad_norm": 0.4025791301348382, "learning_rate": 6.200108270207679e-05, "loss": 0.3791, "step": 1797 }, { "epoch": 1.9168443496801706, "grad_norm": 0.5071478998115798, "learning_rate": 6.197621339206345e-05, "loss": 0.3819, "step": 1798 }, { "epoch": 1.917910447761194, "grad_norm": 0.5571967213799047, "learning_rate": 6.195133190837004e-05, "loss": 0.3892, "step": 1799 }, { "epoch": 1.9189765458422174, "grad_norm": 0.5126160369273954, "learning_rate": 6.192643826477959e-05, "loss": 0.3835, "step": 1800 }, { "epoch": 1.920042643923241, "grad_norm": 0.44604407636296517, "learning_rate": 6.190153247508189e-05, "loss": 0.3953, "step": 1801 }, { "epoch": 1.9211087420042645, "grad_norm": 0.39601219897116957, "learning_rate": 6.187661455307346e-05, "loss": 0.3916, "step": 1802 }, { "epoch": 1.9221748400852878, "grad_norm": 0.38498451897362057, "learning_rate": 6.185168451255752e-05, "loss": 0.393, "step": 1803 }, { "epoch": 1.9232409381663111, "grad_norm": 0.36258385258732584, "learning_rate": 6.182674236734404e-05, "loss": 0.3858, "step": 1804 }, { "epoch": 1.9243070362473347, "grad_norm": 0.29544150204233544, "learning_rate": 6.180178813124965e-05, "loss": 0.385, "step": 1805 }, { "epoch": 1.9253731343283582, "grad_norm": 0.3264189147080742, "learning_rate": 6.177682181809772e-05, "loss": 0.3851, "step": 1806 }, { "epoch": 1.9264392324093818, "grad_norm": 0.3541649704837544, "learning_rate": 6.175184344171827e-05, "loss": 0.3885, "step": 1807 }, { "epoch": 1.927505330490405, "grad_norm": 0.30666928053826004, "learning_rate": 6.172685301594802e-05, "loss": 0.3873, "step": 1808 }, { "epoch": 1.9285714285714286, "grad_norm": 0.2639778431840215, "learning_rate": 6.170185055463039e-05, "loss": 0.3854, "step": 1809 }, { "epoch": 1.929637526652452, "grad_norm": 0.277287436322927, "learning_rate": 6.167683607161542e-05, "loss": 0.3865, "step": 1810 }, { "epoch": 1.9307036247334755, "grad_norm": 0.3219135209443072, "learning_rate": 6.165180958075985e-05, "loss": 0.3848, "step": 1811 }, { "epoch": 1.931769722814499, "grad_norm": 0.3303760398066151, "learning_rate": 6.162677109592704e-05, "loss": 0.386, "step": 1812 }, { "epoch": 1.9328358208955225, "grad_norm": 0.3084131821261726, "learning_rate": 6.160172063098703e-05, "loss": 0.3884, "step": 1813 }, { "epoch": 1.9339019189765458, "grad_norm": 0.31138352082161325, "learning_rate": 6.157665819981646e-05, "loss": 0.3832, "step": 1814 }, { "epoch": 1.9349680170575692, "grad_norm": 0.3417574287817514, "learning_rate": 6.155158381629863e-05, "loss": 0.39, "step": 1815 }, { "epoch": 1.9360341151385927, "grad_norm": 0.29411408973125125, "learning_rate": 6.152649749432343e-05, "loss": 0.3857, "step": 1816 }, { "epoch": 1.9371002132196162, "grad_norm": 0.28770100823418326, "learning_rate": 6.150139924778738e-05, "loss": 0.3868, "step": 1817 }, { "epoch": 1.9381663113006398, "grad_norm": 0.3600800861434762, "learning_rate": 6.14762890905936e-05, "loss": 0.3907, "step": 1818 }, { "epoch": 1.939232409381663, "grad_norm": 0.40220665801015487, "learning_rate": 6.145116703665184e-05, "loss": 0.3839, "step": 1819 }, { "epoch": 1.9402985074626866, "grad_norm": 0.4133033373544916, "learning_rate": 6.142603309987838e-05, "loss": 0.387, "step": 1820 }, { "epoch": 1.94136460554371, "grad_norm": 0.47788470275197026, "learning_rate": 6.140088729419613e-05, "loss": 0.3903, "step": 1821 }, { "epoch": 1.9424307036247335, "grad_norm": 0.4896797272268825, "learning_rate": 6.137572963353455e-05, "loss": 0.385, "step": 1822 }, { "epoch": 1.943496801705757, "grad_norm": 0.45525617704254706, "learning_rate": 6.135056013182969e-05, "loss": 0.3949, "step": 1823 }, { "epoch": 1.9445628997867805, "grad_norm": 0.41754979578711426, "learning_rate": 6.132537880302412e-05, "loss": 0.3901, "step": 1824 }, { "epoch": 1.9456289978678039, "grad_norm": 0.36895879752123967, "learning_rate": 6.130018566106702e-05, "loss": 0.3845, "step": 1825 }, { "epoch": 1.9466950959488272, "grad_norm": 0.2966243721883261, "learning_rate": 6.127498071991406e-05, "loss": 0.3894, "step": 1826 }, { "epoch": 1.9477611940298507, "grad_norm": 0.29871248582481646, "learning_rate": 6.12497639935275e-05, "loss": 0.3859, "step": 1827 }, { "epoch": 1.9488272921108742, "grad_norm": 0.3298826130551037, "learning_rate": 6.122453549587603e-05, "loss": 0.3857, "step": 1828 }, { "epoch": 1.9498933901918978, "grad_norm": 0.3676840915758954, "learning_rate": 6.119929524093499e-05, "loss": 0.3894, "step": 1829 }, { "epoch": 1.950959488272921, "grad_norm": 0.3611638397068088, "learning_rate": 6.117404324268615e-05, "loss": 0.3917, "step": 1830 }, { "epoch": 1.9520255863539444, "grad_norm": 0.36356753037609535, "learning_rate": 6.11487795151178e-05, "loss": 0.3885, "step": 1831 }, { "epoch": 1.953091684434968, "grad_norm": 0.4256434759309199, "learning_rate": 6.112350407222472e-05, "loss": 0.3871, "step": 1832 }, { "epoch": 1.9541577825159915, "grad_norm": 0.5131488919037342, "learning_rate": 6.109821692800822e-05, "loss": 0.3925, "step": 1833 }, { "epoch": 1.955223880597015, "grad_norm": 0.5200430235069933, "learning_rate": 6.107291809647603e-05, "loss": 0.3898, "step": 1834 }, { "epoch": 1.9562899786780383, "grad_norm": 0.44124171195877854, "learning_rate": 6.104760759164242e-05, "loss": 0.3887, "step": 1835 }, { "epoch": 1.9573560767590619, "grad_norm": 0.3699718554474287, "learning_rate": 6.102228542752809e-05, "loss": 0.3901, "step": 1836 }, { "epoch": 1.9584221748400852, "grad_norm": 0.3748188674691452, "learning_rate": 6.0996951618160164e-05, "loss": 0.3948, "step": 1837 }, { "epoch": 1.9594882729211087, "grad_norm": 0.3898699056134908, "learning_rate": 6.097160617757231e-05, "loss": 0.3906, "step": 1838 }, { "epoch": 1.9605543710021323, "grad_norm": 0.3988044604464649, "learning_rate": 6.094624911980455e-05, "loss": 0.3859, "step": 1839 }, { "epoch": 1.9616204690831558, "grad_norm": 0.3913844382375237, "learning_rate": 6.0920880458903396e-05, "loss": 0.3861, "step": 1840 }, { "epoch": 1.962686567164179, "grad_norm": 0.32899811131426077, "learning_rate": 6.089550020892175e-05, "loss": 0.3866, "step": 1841 }, { "epoch": 1.9637526652452024, "grad_norm": 0.31976863842060194, "learning_rate": 6.0870108383918964e-05, "loss": 0.3827, "step": 1842 }, { "epoch": 1.964818763326226, "grad_norm": 0.4262488192151024, "learning_rate": 6.084470499796077e-05, "loss": 0.389, "step": 1843 }, { "epoch": 1.9658848614072495, "grad_norm": 0.4077721083204829, "learning_rate": 6.081929006511935e-05, "loss": 0.3839, "step": 1844 }, { "epoch": 1.966950959488273, "grad_norm": 0.3347619429742381, "learning_rate": 6.079386359947325e-05, "loss": 0.3831, "step": 1845 }, { "epoch": 1.9680170575692963, "grad_norm": 0.24957267266383398, "learning_rate": 6.07684256151074e-05, "loss": 0.3821, "step": 1846 }, { "epoch": 1.9690831556503199, "grad_norm": 0.2976405468436129, "learning_rate": 6.074297612611312e-05, "loss": 0.3833, "step": 1847 }, { "epoch": 1.9701492537313432, "grad_norm": 0.4249489763296379, "learning_rate": 6.071751514658811e-05, "loss": 0.39, "step": 1848 }, { "epoch": 1.9712153518123667, "grad_norm": 0.45599309991133324, "learning_rate": 6.069204269063644e-05, "loss": 0.3922, "step": 1849 }, { "epoch": 1.9722814498933903, "grad_norm": 0.38232128582738506, "learning_rate": 6.066655877236851e-05, "loss": 0.3889, "step": 1850 }, { "epoch": 1.9733475479744138, "grad_norm": 0.34695673920538606, "learning_rate": 6.06410634059011e-05, "loss": 0.3894, "step": 1851 }, { "epoch": 1.9744136460554371, "grad_norm": 0.4535787896409531, "learning_rate": 6.06155566053573e-05, "loss": 0.3886, "step": 1852 }, { "epoch": 1.9754797441364604, "grad_norm": 0.5028521097525349, "learning_rate": 6.059003838486657e-05, "loss": 0.3914, "step": 1853 }, { "epoch": 1.976545842217484, "grad_norm": 0.43918619216494054, "learning_rate": 6.056450875856467e-05, "loss": 0.3849, "step": 1854 }, { "epoch": 1.9776119402985075, "grad_norm": 0.45035177429943046, "learning_rate": 6.053896774059368e-05, "loss": 0.391, "step": 1855 }, { "epoch": 1.978678038379531, "grad_norm": 0.5612538235109835, "learning_rate": 6.051341534510201e-05, "loss": 0.3929, "step": 1856 }, { "epoch": 1.9797441364605544, "grad_norm": 0.6454231135981549, "learning_rate": 6.048785158624436e-05, "loss": 0.3871, "step": 1857 }, { "epoch": 1.9808102345415777, "grad_norm": 0.7284809133654999, "learning_rate": 6.0462276478181696e-05, "loss": 0.391, "step": 1858 }, { "epoch": 1.9818763326226012, "grad_norm": 0.7042942526408297, "learning_rate": 6.043669003508134e-05, "loss": 0.3827, "step": 1859 }, { "epoch": 1.9829424307036247, "grad_norm": 0.6203970586107856, "learning_rate": 6.0411092271116815e-05, "loss": 0.393, "step": 1860 }, { "epoch": 1.9840085287846483, "grad_norm": 0.5381848565811816, "learning_rate": 6.038548320046797e-05, "loss": 0.3884, "step": 1861 }, { "epoch": 1.9850746268656716, "grad_norm": 0.3673908310108203, "learning_rate": 6.035986283732091e-05, "loss": 0.3858, "step": 1862 }, { "epoch": 1.9861407249466951, "grad_norm": 0.2992259271172484, "learning_rate": 6.0334231195867954e-05, "loss": 0.3876, "step": 1863 }, { "epoch": 1.9872068230277184, "grad_norm": 0.5048589299680658, "learning_rate": 6.030858829030773e-05, "loss": 0.3915, "step": 1864 }, { "epoch": 1.988272921108742, "grad_norm": 0.6398688231174923, "learning_rate": 6.0282934134845055e-05, "loss": 0.3871, "step": 1865 }, { "epoch": 1.9893390191897655, "grad_norm": 0.6041356427710289, "learning_rate": 6.025726874369101e-05, "loss": 0.3906, "step": 1866 }, { "epoch": 1.990405117270789, "grad_norm": 0.436971182640899, "learning_rate": 6.023159213106288e-05, "loss": 0.3859, "step": 1867 }, { "epoch": 1.9914712153518124, "grad_norm": 0.3014364362613937, "learning_rate": 6.020590431118417e-05, "loss": 0.3871, "step": 1868 }, { "epoch": 1.9925373134328357, "grad_norm": 0.34001157186356745, "learning_rate": 6.018020529828461e-05, "loss": 0.3852, "step": 1869 }, { "epoch": 1.9936034115138592, "grad_norm": 0.4404981881926425, "learning_rate": 6.0154495106600094e-05, "loss": 0.3844, "step": 1870 }, { "epoch": 1.9946695095948828, "grad_norm": 0.48771270351289164, "learning_rate": 6.012877375037278e-05, "loss": 0.3866, "step": 1871 }, { "epoch": 1.9957356076759063, "grad_norm": 0.4990494207144115, "learning_rate": 6.01030412438509e-05, "loss": 0.3858, "step": 1872 }, { "epoch": 1.9968017057569296, "grad_norm": 0.468499732746224, "learning_rate": 6.007729760128898e-05, "loss": 0.3843, "step": 1873 }, { "epoch": 1.997867803837953, "grad_norm": 0.41065101398677445, "learning_rate": 6.0051542836947625e-05, "loss": 0.3886, "step": 1874 }, { "epoch": 1.9989339019189765, "grad_norm": 0.444948728644674, "learning_rate": 6.002577696509365e-05, "loss": 0.3855, "step": 1875 }, { "epoch": 2.0, "grad_norm": 0.5080421482170958, "learning_rate": 6.000000000000001e-05, "loss": 0.3761, "step": 1876 }, { "epoch": 2.0010660980810235, "grad_norm": 0.5368826782068931, "learning_rate": 5.9974211955945795e-05, "loss": 0.3645, "step": 1877 }, { "epoch": 2.002132196162047, "grad_norm": 0.5340141540515194, "learning_rate": 5.9948412847216255e-05, "loss": 0.3729, "step": 1878 }, { "epoch": 2.00319829424307, "grad_norm": 0.48248902079325257, "learning_rate": 5.992260268810273e-05, "loss": 0.3696, "step": 1879 }, { "epoch": 2.0042643923240937, "grad_norm": 0.4867113475226342, "learning_rate": 5.989678149290274e-05, "loss": 0.3736, "step": 1880 }, { "epoch": 2.0053304904051172, "grad_norm": 0.5127638734663647, "learning_rate": 5.987094927591987e-05, "loss": 0.3655, "step": 1881 }, { "epoch": 2.0063965884861408, "grad_norm": 0.5333958878222103, "learning_rate": 5.9845106051463815e-05, "loss": 0.3731, "step": 1882 }, { "epoch": 2.0074626865671643, "grad_norm": 0.496391847282843, "learning_rate": 5.9819251833850395e-05, "loss": 0.3634, "step": 1883 }, { "epoch": 2.008528784648188, "grad_norm": 0.3918376522955584, "learning_rate": 5.979338663740149e-05, "loss": 0.3748, "step": 1884 }, { "epoch": 2.009594882729211, "grad_norm": 0.3947052793986647, "learning_rate": 5.9767510476445097e-05, "loss": 0.3721, "step": 1885 }, { "epoch": 2.0106609808102345, "grad_norm": 0.41802301612396614, "learning_rate": 5.974162336531522e-05, "loss": 0.368, "step": 1886 }, { "epoch": 2.011727078891258, "grad_norm": 0.4284040217771835, "learning_rate": 5.9715725318352024e-05, "loss": 0.366, "step": 1887 }, { "epoch": 2.0127931769722816, "grad_norm": 0.4723022945666149, "learning_rate": 5.968981634990164e-05, "loss": 0.3688, "step": 1888 }, { "epoch": 2.013859275053305, "grad_norm": 0.44534446364431024, "learning_rate": 5.9663896474316325e-05, "loss": 0.3685, "step": 1889 }, { "epoch": 2.014925373134328, "grad_norm": 0.4512122720314313, "learning_rate": 5.9637965705954316e-05, "loss": 0.3675, "step": 1890 }, { "epoch": 2.0159914712153517, "grad_norm": 0.41496594298021805, "learning_rate": 5.961202405917993e-05, "loss": 0.3708, "step": 1891 }, { "epoch": 2.0170575692963753, "grad_norm": 0.4351254782206402, "learning_rate": 5.9586071548363475e-05, "loss": 0.3649, "step": 1892 }, { "epoch": 2.018123667377399, "grad_norm": 0.42476919414349634, "learning_rate": 5.95601081878813e-05, "loss": 0.3663, "step": 1893 }, { "epoch": 2.0191897654584223, "grad_norm": 0.3868685718287423, "learning_rate": 5.9534133992115766e-05, "loss": 0.3696, "step": 1894 }, { "epoch": 2.0202558635394454, "grad_norm": 0.3249684583348523, "learning_rate": 5.9508148975455216e-05, "loss": 0.368, "step": 1895 }, { "epoch": 2.021321961620469, "grad_norm": 0.35899462535259774, "learning_rate": 5.9482153152294e-05, "loss": 0.3673, "step": 1896 }, { "epoch": 2.0223880597014925, "grad_norm": 0.3843529775756281, "learning_rate": 5.945614653703245e-05, "loss": 0.3671, "step": 1897 }, { "epoch": 2.023454157782516, "grad_norm": 0.4180113835206048, "learning_rate": 5.9430129144076894e-05, "loss": 0.3731, "step": 1898 }, { "epoch": 2.0245202558635396, "grad_norm": 0.37475722594758115, "learning_rate": 5.9404100987839594e-05, "loss": 0.3649, "step": 1899 }, { "epoch": 2.025586353944563, "grad_norm": 0.28629598775419896, "learning_rate": 5.937806208273881e-05, "loss": 0.3679, "step": 1900 }, { "epoch": 2.026652452025586, "grad_norm": 0.300199898611435, "learning_rate": 5.9352012443198744e-05, "loss": 0.3751, "step": 1901 }, { "epoch": 2.0277185501066097, "grad_norm": 0.33065361440169816, "learning_rate": 5.932595208364954e-05, "loss": 0.3674, "step": 1902 }, { "epoch": 2.0287846481876333, "grad_norm": 0.45055044275702016, "learning_rate": 5.9299881018527286e-05, "loss": 0.3669, "step": 1903 }, { "epoch": 2.029850746268657, "grad_norm": 0.5359016977011789, "learning_rate": 5.927379926227398e-05, "loss": 0.3736, "step": 1904 }, { "epoch": 2.0309168443496803, "grad_norm": 0.4728072949818313, "learning_rate": 5.924770682933758e-05, "loss": 0.3673, "step": 1905 }, { "epoch": 2.0319829424307034, "grad_norm": 0.35779722355951155, "learning_rate": 5.9221603734171916e-05, "loss": 0.3712, "step": 1906 }, { "epoch": 2.033049040511727, "grad_norm": 0.25778926704533023, "learning_rate": 5.919548999123677e-05, "loss": 0.3605, "step": 1907 }, { "epoch": 2.0341151385927505, "grad_norm": 0.30958936271010357, "learning_rate": 5.9169365614997786e-05, "loss": 0.3633, "step": 1908 }, { "epoch": 2.035181236673774, "grad_norm": 0.3966071294002005, "learning_rate": 5.914323061992651e-05, "loss": 0.3712, "step": 1909 }, { "epoch": 2.0362473347547976, "grad_norm": 0.418075833743344, "learning_rate": 5.9117085020500375e-05, "loss": 0.367, "step": 1910 }, { "epoch": 2.0373134328358207, "grad_norm": 0.4289422566733278, "learning_rate": 5.909092883120269e-05, "loss": 0.3677, "step": 1911 }, { "epoch": 2.038379530916844, "grad_norm": 0.38407217216684475, "learning_rate": 5.9064762066522614e-05, "loss": 0.3657, "step": 1912 }, { "epoch": 2.0394456289978677, "grad_norm": 0.3152326073948108, "learning_rate": 5.9038584740955166e-05, "loss": 0.3775, "step": 1913 }, { "epoch": 2.0405117270788913, "grad_norm": 0.2902867416768938, "learning_rate": 5.9012396869001255e-05, "loss": 0.3629, "step": 1914 }, { "epoch": 2.041577825159915, "grad_norm": 0.33254014516271413, "learning_rate": 5.8986198465167566e-05, "loss": 0.3652, "step": 1915 }, { "epoch": 2.0426439232409384, "grad_norm": 0.3355861259716711, "learning_rate": 5.895998954396669e-05, "loss": 0.3739, "step": 1916 }, { "epoch": 2.0437100213219614, "grad_norm": 0.33027480184371727, "learning_rate": 5.893377011991696e-05, "loss": 0.3642, "step": 1917 }, { "epoch": 2.044776119402985, "grad_norm": 0.36437673592398223, "learning_rate": 5.8907540207542616e-05, "loss": 0.3673, "step": 1918 }, { "epoch": 2.0458422174840085, "grad_norm": 0.37919989065780685, "learning_rate": 5.888129982137364e-05, "loss": 0.3697, "step": 1919 }, { "epoch": 2.046908315565032, "grad_norm": 0.39804386255773294, "learning_rate": 5.885504897594587e-05, "loss": 0.3706, "step": 1920 }, { "epoch": 2.0479744136460556, "grad_norm": 0.4048265506757637, "learning_rate": 5.882878768580089e-05, "loss": 0.368, "step": 1921 }, { "epoch": 2.0490405117270787, "grad_norm": 0.39214048772357035, "learning_rate": 5.880251596548608e-05, "loss": 0.3729, "step": 1922 }, { "epoch": 2.050106609808102, "grad_norm": 0.3768034830806379, "learning_rate": 5.877623382955463e-05, "loss": 0.366, "step": 1923 }, { "epoch": 2.0511727078891258, "grad_norm": 0.36912589068028795, "learning_rate": 5.874994129256546e-05, "loss": 0.3704, "step": 1924 }, { "epoch": 2.0522388059701493, "grad_norm": 0.34472606046121007, "learning_rate": 5.872363836908328e-05, "loss": 0.3605, "step": 1925 }, { "epoch": 2.053304904051173, "grad_norm": 0.3447390942221336, "learning_rate": 5.869732507367854e-05, "loss": 0.3632, "step": 1926 }, { "epoch": 2.0543710021321964, "grad_norm": 0.39703909632410184, "learning_rate": 5.867100142092745e-05, "loss": 0.3685, "step": 1927 }, { "epoch": 2.0554371002132195, "grad_norm": 0.44058895200097586, "learning_rate": 5.864466742541192e-05, "loss": 0.3698, "step": 1928 }, { "epoch": 2.056503198294243, "grad_norm": 0.4721233594093033, "learning_rate": 5.861832310171963e-05, "loss": 0.3764, "step": 1929 }, { "epoch": 2.0575692963752665, "grad_norm": 0.4144618882599998, "learning_rate": 5.8591968464443964e-05, "loss": 0.3667, "step": 1930 }, { "epoch": 2.05863539445629, "grad_norm": 0.302798084281156, "learning_rate": 5.856560352818403e-05, "loss": 0.3753, "step": 1931 }, { "epoch": 2.0597014925373136, "grad_norm": 0.35481671123309483, "learning_rate": 5.853922830754462e-05, "loss": 0.3725, "step": 1932 }, { "epoch": 2.0607675906183367, "grad_norm": 0.3547278123247793, "learning_rate": 5.851284281713623e-05, "loss": 0.3718, "step": 1933 }, { "epoch": 2.0618336886993602, "grad_norm": 0.3404365623104542, "learning_rate": 5.848644707157508e-05, "loss": 0.3768, "step": 1934 }, { "epoch": 2.0628997867803838, "grad_norm": 0.3550698273564781, "learning_rate": 5.8460041085483004e-05, "loss": 0.3727, "step": 1935 }, { "epoch": 2.0639658848614073, "grad_norm": 0.35208070245068634, "learning_rate": 5.8433624873487577e-05, "loss": 0.37, "step": 1936 }, { "epoch": 2.065031982942431, "grad_norm": 0.34247134066073665, "learning_rate": 5.840719845022198e-05, "loss": 0.3687, "step": 1937 }, { "epoch": 2.066098081023454, "grad_norm": 0.35093264721237094, "learning_rate": 5.8380761830325095e-05, "loss": 0.3736, "step": 1938 }, { "epoch": 2.0671641791044775, "grad_norm": 0.3638250203992304, "learning_rate": 5.8354315028441434e-05, "loss": 0.368, "step": 1939 }, { "epoch": 2.068230277185501, "grad_norm": 0.31391019565989364, "learning_rate": 5.832785805922115e-05, "loss": 0.3755, "step": 1940 }, { "epoch": 2.0692963752665245, "grad_norm": 0.32954228352966447, "learning_rate": 5.830139093732003e-05, "loss": 0.3695, "step": 1941 }, { "epoch": 2.070362473347548, "grad_norm": 0.39242675952607037, "learning_rate": 5.827491367739948e-05, "loss": 0.3732, "step": 1942 }, { "epoch": 2.0714285714285716, "grad_norm": 0.45718271088467405, "learning_rate": 5.824842629412653e-05, "loss": 0.3711, "step": 1943 }, { "epoch": 2.0724946695095947, "grad_norm": 0.449089038740122, "learning_rate": 5.822192880217381e-05, "loss": 0.3682, "step": 1944 }, { "epoch": 2.0735607675906182, "grad_norm": 0.37218601001021756, "learning_rate": 5.819542121621955e-05, "loss": 0.3672, "step": 1945 }, { "epoch": 2.074626865671642, "grad_norm": 0.3365200917538621, "learning_rate": 5.8168903550947586e-05, "loss": 0.3667, "step": 1946 }, { "epoch": 2.0756929637526653, "grad_norm": 0.3776897290391663, "learning_rate": 5.814237582104732e-05, "loss": 0.369, "step": 1947 }, { "epoch": 2.076759061833689, "grad_norm": 0.4241260819869886, "learning_rate": 5.811583804121373e-05, "loss": 0.3721, "step": 1948 }, { "epoch": 2.077825159914712, "grad_norm": 0.43159827522631167, "learning_rate": 5.808929022614738e-05, "loss": 0.3707, "step": 1949 }, { "epoch": 2.0788912579957355, "grad_norm": 0.46068845826576565, "learning_rate": 5.806273239055437e-05, "loss": 0.3665, "step": 1950 }, { "epoch": 2.079957356076759, "grad_norm": 0.4492734276793744, "learning_rate": 5.803616454914636e-05, "loss": 0.3687, "step": 1951 }, { "epoch": 2.0810234541577826, "grad_norm": 0.4521877821462862, "learning_rate": 5.800958671664057e-05, "loss": 0.3638, "step": 1952 }, { "epoch": 2.082089552238806, "grad_norm": 0.41545909536895476, "learning_rate": 5.798299890775971e-05, "loss": 0.3738, "step": 1953 }, { "epoch": 2.0831556503198296, "grad_norm": 0.2972587597086915, "learning_rate": 5.795640113723207e-05, "loss": 0.367, "step": 1954 }, { "epoch": 2.0842217484008527, "grad_norm": 0.27641022547500294, "learning_rate": 5.7929793419791416e-05, "loss": 0.3711, "step": 1955 }, { "epoch": 2.0852878464818763, "grad_norm": 0.39396716547715316, "learning_rate": 5.790317577017705e-05, "loss": 0.3698, "step": 1956 }, { "epoch": 2.0863539445629, "grad_norm": 0.38769077302687954, "learning_rate": 5.787654820313376e-05, "loss": 0.371, "step": 1957 }, { "epoch": 2.0874200426439233, "grad_norm": 0.31630048451704645, "learning_rate": 5.784991073341184e-05, "loss": 0.3697, "step": 1958 }, { "epoch": 2.088486140724947, "grad_norm": 0.23632112461394705, "learning_rate": 5.782326337576705e-05, "loss": 0.3674, "step": 1959 }, { "epoch": 2.08955223880597, "grad_norm": 0.2503975390702189, "learning_rate": 5.779660614496067e-05, "loss": 0.367, "step": 1960 }, { "epoch": 2.0906183368869935, "grad_norm": 0.31776105349081096, "learning_rate": 5.776993905575939e-05, "loss": 0.3656, "step": 1961 }, { "epoch": 2.091684434968017, "grad_norm": 0.29292989917695805, "learning_rate": 5.77432621229354e-05, "loss": 0.3712, "step": 1962 }, { "epoch": 2.0927505330490406, "grad_norm": 0.2736016829783113, "learning_rate": 5.771657536126634e-05, "loss": 0.3682, "step": 1963 }, { "epoch": 2.093816631130064, "grad_norm": 0.301999599571911, "learning_rate": 5.768987878553531e-05, "loss": 0.3683, "step": 1964 }, { "epoch": 2.094882729211087, "grad_norm": 0.3234173819094893, "learning_rate": 5.766317241053077e-05, "loss": 0.369, "step": 1965 }, { "epoch": 2.0959488272921107, "grad_norm": 0.324842749445397, "learning_rate": 5.763645625104673e-05, "loss": 0.3634, "step": 1966 }, { "epoch": 2.0970149253731343, "grad_norm": 0.29363848696991995, "learning_rate": 5.7609730321882495e-05, "loss": 0.364, "step": 1967 }, { "epoch": 2.098081023454158, "grad_norm": 0.2645265350384704, "learning_rate": 5.758299463784287e-05, "loss": 0.365, "step": 1968 }, { "epoch": 2.0991471215351813, "grad_norm": 0.2474828784122483, "learning_rate": 5.755624921373805e-05, "loss": 0.3743, "step": 1969 }, { "epoch": 2.100213219616205, "grad_norm": 0.2310435326688506, "learning_rate": 5.752949406438357e-05, "loss": 0.372, "step": 1970 }, { "epoch": 2.101279317697228, "grad_norm": 0.22201440839590442, "learning_rate": 5.7502729204600416e-05, "loss": 0.3721, "step": 1971 }, { "epoch": 2.1023454157782515, "grad_norm": 0.2541491754650197, "learning_rate": 5.747595464921493e-05, "loss": 0.3715, "step": 1972 }, { "epoch": 2.103411513859275, "grad_norm": 0.31165144411877344, "learning_rate": 5.744917041305882e-05, "loss": 0.3654, "step": 1973 }, { "epoch": 2.1044776119402986, "grad_norm": 0.28922154074403444, "learning_rate": 5.7422376510969165e-05, "loss": 0.3728, "step": 1974 }, { "epoch": 2.105543710021322, "grad_norm": 0.27560948128394847, "learning_rate": 5.739557295778838e-05, "loss": 0.3695, "step": 1975 }, { "epoch": 2.106609808102345, "grad_norm": 0.2625778705797159, "learning_rate": 5.736875976836426e-05, "loss": 0.3701, "step": 1976 }, { "epoch": 2.1076759061833688, "grad_norm": 0.3217515539116153, "learning_rate": 5.73419369575499e-05, "loss": 0.3724, "step": 1977 }, { "epoch": 2.1087420042643923, "grad_norm": 0.3509916580574595, "learning_rate": 5.731510454020377e-05, "loss": 0.3658, "step": 1978 }, { "epoch": 2.109808102345416, "grad_norm": 0.3227845858529989, "learning_rate": 5.728826253118961e-05, "loss": 0.3709, "step": 1979 }, { "epoch": 2.1108742004264394, "grad_norm": 0.3134700723493593, "learning_rate": 5.7261410945376496e-05, "loss": 0.3759, "step": 1980 }, { "epoch": 2.111940298507463, "grad_norm": 0.31039720803106174, "learning_rate": 5.723454979763882e-05, "loss": 0.3724, "step": 1981 }, { "epoch": 2.113006396588486, "grad_norm": 0.32706272995692337, "learning_rate": 5.720767910285626e-05, "loss": 0.3722, "step": 1982 }, { "epoch": 2.1140724946695095, "grad_norm": 0.30080459490071365, "learning_rate": 5.718079887591381e-05, "loss": 0.3733, "step": 1983 }, { "epoch": 2.115138592750533, "grad_norm": 0.3043060216756326, "learning_rate": 5.715390913170167e-05, "loss": 0.375, "step": 1984 }, { "epoch": 2.1162046908315566, "grad_norm": 0.28712340226310096, "learning_rate": 5.7127009885115394e-05, "loss": 0.3733, "step": 1985 }, { "epoch": 2.11727078891258, "grad_norm": 0.31229581889122415, "learning_rate": 5.710010115105576e-05, "loss": 0.3717, "step": 1986 }, { "epoch": 2.1183368869936032, "grad_norm": 0.3112982453246555, "learning_rate": 5.707318294442881e-05, "loss": 0.375, "step": 1987 }, { "epoch": 2.1194029850746268, "grad_norm": 0.2935803415699457, "learning_rate": 5.704625528014582e-05, "loss": 0.3688, "step": 1988 }, { "epoch": 2.1204690831556503, "grad_norm": 0.34557760313152336, "learning_rate": 5.701931817312334e-05, "loss": 0.3654, "step": 1989 }, { "epoch": 2.121535181236674, "grad_norm": 0.32408207030896125, "learning_rate": 5.6992371638283094e-05, "loss": 0.3681, "step": 1990 }, { "epoch": 2.1226012793176974, "grad_norm": 0.3209402684629185, "learning_rate": 5.6965415690552083e-05, "loss": 0.3691, "step": 1991 }, { "epoch": 2.1236673773987205, "grad_norm": 0.37804296374542823, "learning_rate": 5.693845034486251e-05, "loss": 0.3717, "step": 1992 }, { "epoch": 2.124733475479744, "grad_norm": 0.402239674935612, "learning_rate": 5.691147561615175e-05, "loss": 0.3676, "step": 1993 }, { "epoch": 2.1257995735607675, "grad_norm": 0.40844484750895815, "learning_rate": 5.688449151936243e-05, "loss": 0.3724, "step": 1994 }, { "epoch": 2.126865671641791, "grad_norm": 0.42766169546473953, "learning_rate": 5.6857498069442306e-05, "loss": 0.3709, "step": 1995 }, { "epoch": 2.1279317697228146, "grad_norm": 0.37265857219610005, "learning_rate": 5.683049528134437e-05, "loss": 0.3737, "step": 1996 }, { "epoch": 2.128997867803838, "grad_norm": 0.3122405362957184, "learning_rate": 5.680348317002676e-05, "loss": 0.3677, "step": 1997 }, { "epoch": 2.1300639658848612, "grad_norm": 0.38579576284804146, "learning_rate": 5.677646175045276e-05, "loss": 0.3716, "step": 1998 }, { "epoch": 2.131130063965885, "grad_norm": 0.49480827662022464, "learning_rate": 5.674943103759086e-05, "loss": 0.3778, "step": 1999 }, { "epoch": 2.1321961620469083, "grad_norm": 0.5608752285357496, "learning_rate": 5.672239104641466e-05, "loss": 0.3734, "step": 2000 }, { "epoch": 2.133262260127932, "grad_norm": 0.4928793043710862, "learning_rate": 5.669534179190289e-05, "loss": 0.3682, "step": 2001 }, { "epoch": 2.1343283582089554, "grad_norm": 0.38504633757152384, "learning_rate": 5.666828328903947e-05, "loss": 0.3745, "step": 2002 }, { "epoch": 2.1353944562899785, "grad_norm": 0.2618645941857424, "learning_rate": 5.664121555281339e-05, "loss": 0.3747, "step": 2003 }, { "epoch": 2.136460554371002, "grad_norm": 0.2755495972773531, "learning_rate": 5.661413859821874e-05, "loss": 0.3791, "step": 2004 }, { "epoch": 2.1375266524520256, "grad_norm": 0.34215532694497924, "learning_rate": 5.658705244025479e-05, "loss": 0.3773, "step": 2005 }, { "epoch": 2.138592750533049, "grad_norm": 0.30880880152023477, "learning_rate": 5.6559957093925826e-05, "loss": 0.3748, "step": 2006 }, { "epoch": 2.1396588486140726, "grad_norm": 0.3224170651717293, "learning_rate": 5.653285257424129e-05, "loss": 0.3693, "step": 2007 }, { "epoch": 2.140724946695096, "grad_norm": 0.35605595038557464, "learning_rate": 5.650573889621566e-05, "loss": 0.3713, "step": 2008 }, { "epoch": 2.1417910447761193, "grad_norm": 0.3377860560809023, "learning_rate": 5.6478616074868506e-05, "loss": 0.376, "step": 2009 }, { "epoch": 2.142857142857143, "grad_norm": 0.2591813760532827, "learning_rate": 5.645148412522447e-05, "loss": 0.3745, "step": 2010 }, { "epoch": 2.1439232409381663, "grad_norm": 0.28268845158045547, "learning_rate": 5.642434306231323e-05, "loss": 0.3708, "step": 2011 }, { "epoch": 2.14498933901919, "grad_norm": 0.35060571169428506, "learning_rate": 5.639719290116954e-05, "loss": 0.3654, "step": 2012 }, { "epoch": 2.1460554371002134, "grad_norm": 0.3694429698768061, "learning_rate": 5.637003365683317e-05, "loss": 0.3671, "step": 2013 }, { "epoch": 2.1471215351812365, "grad_norm": 0.34971403364074133, "learning_rate": 5.6342865344348935e-05, "loss": 0.3677, "step": 2014 }, { "epoch": 2.14818763326226, "grad_norm": 0.27907178596426185, "learning_rate": 5.631568797876665e-05, "loss": 0.3741, "step": 2015 }, { "epoch": 2.1492537313432836, "grad_norm": 0.27207794051401735, "learning_rate": 5.628850157514118e-05, "loss": 0.3658, "step": 2016 }, { "epoch": 2.150319829424307, "grad_norm": 0.28041793573401386, "learning_rate": 5.6261306148532377e-05, "loss": 0.3673, "step": 2017 }, { "epoch": 2.1513859275053306, "grad_norm": 0.24028826632892578, "learning_rate": 5.62341017140051e-05, "loss": 0.3708, "step": 2018 }, { "epoch": 2.1524520255863537, "grad_norm": 0.3066348756647207, "learning_rate": 5.6206888286629186e-05, "loss": 0.3722, "step": 2019 }, { "epoch": 2.1535181236673773, "grad_norm": 0.2776104929301282, "learning_rate": 5.6179665881479444e-05, "loss": 0.371, "step": 2020 }, { "epoch": 2.154584221748401, "grad_norm": 0.2932002920422834, "learning_rate": 5.61524345136357e-05, "loss": 0.3765, "step": 2021 }, { "epoch": 2.1556503198294243, "grad_norm": 0.42204445414488995, "learning_rate": 5.6125194198182683e-05, "loss": 0.3747, "step": 2022 }, { "epoch": 2.156716417910448, "grad_norm": 0.4892242798710215, "learning_rate": 5.609794495021016e-05, "loss": 0.3751, "step": 2023 }, { "epoch": 2.1577825159914714, "grad_norm": 0.43720511006835944, "learning_rate": 5.607068678481274e-05, "loss": 0.368, "step": 2024 }, { "epoch": 2.1588486140724945, "grad_norm": 0.33287359123355437, "learning_rate": 5.6043419717090075e-05, "loss": 0.3694, "step": 2025 }, { "epoch": 2.159914712153518, "grad_norm": 0.28896027090363063, "learning_rate": 5.6016143762146685e-05, "loss": 0.3688, "step": 2026 }, { "epoch": 2.1609808102345416, "grad_norm": 0.2887140479083496, "learning_rate": 5.598885893509203e-05, "loss": 0.3737, "step": 2027 }, { "epoch": 2.162046908315565, "grad_norm": 0.2695168201652278, "learning_rate": 5.59615652510405e-05, "loss": 0.3705, "step": 2028 }, { "epoch": 2.1631130063965887, "grad_norm": 0.2407806636654491, "learning_rate": 5.593426272511136e-05, "loss": 0.3698, "step": 2029 }, { "epoch": 2.1641791044776117, "grad_norm": 0.32911747478553777, "learning_rate": 5.590695137242881e-05, "loss": 0.3752, "step": 2030 }, { "epoch": 2.1652452025586353, "grad_norm": 0.3868451813947213, "learning_rate": 5.587963120812189e-05, "loss": 0.3794, "step": 2031 }, { "epoch": 2.166311300639659, "grad_norm": 0.34786974694646033, "learning_rate": 5.585230224732458e-05, "loss": 0.3652, "step": 2032 }, { "epoch": 2.1673773987206824, "grad_norm": 0.35478109819952885, "learning_rate": 5.582496450517569e-05, "loss": 0.3795, "step": 2033 }, { "epoch": 2.168443496801706, "grad_norm": 0.3508750515755295, "learning_rate": 5.5797617996818915e-05, "loss": 0.3646, "step": 2034 }, { "epoch": 2.1695095948827294, "grad_norm": 0.3447886192528013, "learning_rate": 5.57702627374028e-05, "loss": 0.3676, "step": 2035 }, { "epoch": 2.1705756929637525, "grad_norm": 0.41611174037016146, "learning_rate": 5.5742898742080726e-05, "loss": 0.3703, "step": 2036 }, { "epoch": 2.171641791044776, "grad_norm": 0.4442311781729241, "learning_rate": 5.5715526026010935e-05, "loss": 0.3659, "step": 2037 }, { "epoch": 2.1727078891257996, "grad_norm": 0.3670273857196127, "learning_rate": 5.568814460435649e-05, "loss": 0.3768, "step": 2038 }, { "epoch": 2.173773987206823, "grad_norm": 0.2593513887640986, "learning_rate": 5.5660754492285264e-05, "loss": 0.3669, "step": 2039 }, { "epoch": 2.1748400852878467, "grad_norm": 0.28282579166446825, "learning_rate": 5.563335570496996e-05, "loss": 0.3739, "step": 2040 }, { "epoch": 2.1759061833688698, "grad_norm": 0.36977014027652605, "learning_rate": 5.560594825758809e-05, "loss": 0.3615, "step": 2041 }, { "epoch": 2.1769722814498933, "grad_norm": 0.37906406180450586, "learning_rate": 5.557853216532194e-05, "loss": 0.374, "step": 2042 }, { "epoch": 2.178038379530917, "grad_norm": 0.3003073120798734, "learning_rate": 5.555110744335863e-05, "loss": 0.3688, "step": 2043 }, { "epoch": 2.1791044776119404, "grad_norm": 0.30581675422030474, "learning_rate": 5.552367410688999e-05, "loss": 0.3709, "step": 2044 }, { "epoch": 2.180170575692964, "grad_norm": 0.4243247655082495, "learning_rate": 5.5496232171112703e-05, "loss": 0.3725, "step": 2045 }, { "epoch": 2.181236673773987, "grad_norm": 0.5418944865918137, "learning_rate": 5.546878165122815e-05, "loss": 0.3747, "step": 2046 }, { "epoch": 2.1823027718550105, "grad_norm": 0.5717964571682613, "learning_rate": 5.544132256244249e-05, "loss": 0.3701, "step": 2047 }, { "epoch": 2.183368869936034, "grad_norm": 0.5656819581120287, "learning_rate": 5.5413854919966654e-05, "loss": 0.3721, "step": 2048 }, { "epoch": 2.1844349680170576, "grad_norm": 0.48959229163286955, "learning_rate": 5.538637873901626e-05, "loss": 0.3697, "step": 2049 }, { "epoch": 2.185501066098081, "grad_norm": 0.39474515330398696, "learning_rate": 5.5358894034811705e-05, "loss": 0.3723, "step": 2050 }, { "epoch": 2.1865671641791047, "grad_norm": 0.29811160693240174, "learning_rate": 5.533140082257808e-05, "loss": 0.3765, "step": 2051 }, { "epoch": 2.1876332622601278, "grad_norm": 0.2398629529634624, "learning_rate": 5.530389911754519e-05, "loss": 0.3702, "step": 2052 }, { "epoch": 2.1886993603411513, "grad_norm": 0.2641014816643812, "learning_rate": 5.527638893494755e-05, "loss": 0.375, "step": 2053 }, { "epoch": 2.189765458422175, "grad_norm": 0.26988134759785193, "learning_rate": 5.5248870290024396e-05, "loss": 0.3702, "step": 2054 }, { "epoch": 2.1908315565031984, "grad_norm": 0.26917332673147915, "learning_rate": 5.5221343198019596e-05, "loss": 0.3695, "step": 2055 }, { "epoch": 2.191897654584222, "grad_norm": 0.32231195092404136, "learning_rate": 5.519380767418176e-05, "loss": 0.3685, "step": 2056 }, { "epoch": 2.192963752665245, "grad_norm": 0.3242363910277114, "learning_rate": 5.5166263733764096e-05, "loss": 0.3695, "step": 2057 }, { "epoch": 2.1940298507462686, "grad_norm": 0.30073683294689246, "learning_rate": 5.5138711392024545e-05, "loss": 0.3704, "step": 2058 }, { "epoch": 2.195095948827292, "grad_norm": 0.26491871391522587, "learning_rate": 5.5111150664225665e-05, "loss": 0.3715, "step": 2059 }, { "epoch": 2.1961620469083156, "grad_norm": 0.294788631674408, "learning_rate": 5.508358156563466e-05, "loss": 0.3695, "step": 2060 }, { "epoch": 2.197228144989339, "grad_norm": 0.3148068642489263, "learning_rate": 5.505600411152341e-05, "loss": 0.3723, "step": 2061 }, { "epoch": 2.1982942430703627, "grad_norm": 0.2787016615117562, "learning_rate": 5.502841831716833e-05, "loss": 0.3739, "step": 2062 }, { "epoch": 2.199360341151386, "grad_norm": 0.2643411187566, "learning_rate": 5.5000824197850575e-05, "loss": 0.3739, "step": 2063 }, { "epoch": 2.2004264392324093, "grad_norm": 0.2991815067944646, "learning_rate": 5.497322176885582e-05, "loss": 0.3734, "step": 2064 }, { "epoch": 2.201492537313433, "grad_norm": 0.31374929510374666, "learning_rate": 5.494561104547439e-05, "loss": 0.3668, "step": 2065 }, { "epoch": 2.2025586353944564, "grad_norm": 0.2983982343106373, "learning_rate": 5.491799204300119e-05, "loss": 0.3756, "step": 2066 }, { "epoch": 2.20362473347548, "grad_norm": 0.3055575842818981, "learning_rate": 5.489036477673571e-05, "loss": 0.3739, "step": 2067 }, { "epoch": 2.204690831556503, "grad_norm": 0.36274620979363453, "learning_rate": 5.486272926198202e-05, "loss": 0.3701, "step": 2068 }, { "epoch": 2.2057569296375266, "grad_norm": 0.38535842211347343, "learning_rate": 5.483508551404875e-05, "loss": 0.3683, "step": 2069 }, { "epoch": 2.20682302771855, "grad_norm": 0.36592619429217743, "learning_rate": 5.4807433548249106e-05, "loss": 0.3719, "step": 2070 }, { "epoch": 2.2078891257995736, "grad_norm": 0.320538300126649, "learning_rate": 5.4779773379900856e-05, "loss": 0.3681, "step": 2071 }, { "epoch": 2.208955223880597, "grad_norm": 0.3284949841600627, "learning_rate": 5.4752105024326265e-05, "loss": 0.3722, "step": 2072 }, { "epoch": 2.2100213219616203, "grad_norm": 0.36352567522233187, "learning_rate": 5.4724428496852184e-05, "loss": 0.3742, "step": 2073 }, { "epoch": 2.211087420042644, "grad_norm": 0.3708212783274699, "learning_rate": 5.469674381280997e-05, "loss": 0.3723, "step": 2074 }, { "epoch": 2.2121535181236673, "grad_norm": 0.34144099793040533, "learning_rate": 5.4669050987535504e-05, "loss": 0.3716, "step": 2075 }, { "epoch": 2.213219616204691, "grad_norm": 0.31816989140951235, "learning_rate": 5.464135003636914e-05, "loss": 0.3651, "step": 2076 }, { "epoch": 2.2142857142857144, "grad_norm": 0.29852456569861313, "learning_rate": 5.461364097465581e-05, "loss": 0.3711, "step": 2077 }, { "epoch": 2.2153518123667375, "grad_norm": 0.2455931272133471, "learning_rate": 5.4585923817744864e-05, "loss": 0.3718, "step": 2078 }, { "epoch": 2.216417910447761, "grad_norm": 0.26413290094993985, "learning_rate": 5.455819858099018e-05, "loss": 0.365, "step": 2079 }, { "epoch": 2.2174840085287846, "grad_norm": 0.24066490453483533, "learning_rate": 5.4530465279750087e-05, "loss": 0.3705, "step": 2080 }, { "epoch": 2.218550106609808, "grad_norm": 0.2618435559346739, "learning_rate": 5.450272392938742e-05, "loss": 0.3735, "step": 2081 }, { "epoch": 2.2196162046908317, "grad_norm": 0.24293872764002158, "learning_rate": 5.4474974545269394e-05, "loss": 0.3717, "step": 2082 }, { "epoch": 2.220682302771855, "grad_norm": 0.2368196730572011, "learning_rate": 5.444721714276778e-05, "loss": 0.3639, "step": 2083 }, { "epoch": 2.2217484008528783, "grad_norm": 0.27260123271017317, "learning_rate": 5.44194517372587e-05, "loss": 0.3685, "step": 2084 }, { "epoch": 2.222814498933902, "grad_norm": 0.270272968775527, "learning_rate": 5.439167834412277e-05, "loss": 0.3672, "step": 2085 }, { "epoch": 2.2238805970149254, "grad_norm": 0.29536457376389963, "learning_rate": 5.436389697874499e-05, "loss": 0.3656, "step": 2086 }, { "epoch": 2.224946695095949, "grad_norm": 0.38141850154306833, "learning_rate": 5.4336107656514796e-05, "loss": 0.3718, "step": 2087 }, { "epoch": 2.2260127931769724, "grad_norm": 0.38555373110953656, "learning_rate": 5.430831039282603e-05, "loss": 0.3653, "step": 2088 }, { "epoch": 2.227078891257996, "grad_norm": 0.33806578284967886, "learning_rate": 5.428050520307693e-05, "loss": 0.3651, "step": 2089 }, { "epoch": 2.228144989339019, "grad_norm": 0.3782268091971875, "learning_rate": 5.425269210267013e-05, "loss": 0.3711, "step": 2090 }, { "epoch": 2.2292110874200426, "grad_norm": 0.4292891226917417, "learning_rate": 5.422487110701263e-05, "loss": 0.3702, "step": 2091 }, { "epoch": 2.230277185501066, "grad_norm": 0.41142332636108486, "learning_rate": 5.419704223151584e-05, "loss": 0.3661, "step": 2092 }, { "epoch": 2.2313432835820897, "grad_norm": 0.3326023012950207, "learning_rate": 5.416920549159549e-05, "loss": 0.3721, "step": 2093 }, { "epoch": 2.232409381663113, "grad_norm": 0.26521116383654986, "learning_rate": 5.4141360902671696e-05, "loss": 0.3723, "step": 2094 }, { "epoch": 2.2334754797441363, "grad_norm": 0.2612433990558302, "learning_rate": 5.411350848016891e-05, "loss": 0.3601, "step": 2095 }, { "epoch": 2.23454157782516, "grad_norm": 0.3434339511954728, "learning_rate": 5.4085648239515914e-05, "loss": 0.3692, "step": 2096 }, { "epoch": 2.2356076759061834, "grad_norm": 0.3918166163519152, "learning_rate": 5.4057780196145856e-05, "loss": 0.3601, "step": 2097 }, { "epoch": 2.236673773987207, "grad_norm": 0.3818529019674707, "learning_rate": 5.402990436549617e-05, "loss": 0.3703, "step": 2098 }, { "epoch": 2.2377398720682304, "grad_norm": 0.36098975401837946, "learning_rate": 5.4002020763008624e-05, "loss": 0.3711, "step": 2099 }, { "epoch": 2.2388059701492535, "grad_norm": 0.31093262243992925, "learning_rate": 5.397412940412927e-05, "loss": 0.3698, "step": 2100 }, { "epoch": 2.239872068230277, "grad_norm": 0.3234247785754041, "learning_rate": 5.39462303043085e-05, "loss": 0.3699, "step": 2101 }, { "epoch": 2.2409381663113006, "grad_norm": 0.3388924115506885, "learning_rate": 5.391832347900095e-05, "loss": 0.3714, "step": 2102 }, { "epoch": 2.242004264392324, "grad_norm": 0.2867861763859992, "learning_rate": 5.389040894366554e-05, "loss": 0.3681, "step": 2103 }, { "epoch": 2.2430703624733477, "grad_norm": 0.27923094302877083, "learning_rate": 5.386248671376549e-05, "loss": 0.3711, "step": 2104 }, { "epoch": 2.2441364605543708, "grad_norm": 0.31103472441375013, "learning_rate": 5.383455680476824e-05, "loss": 0.3647, "step": 2105 }, { "epoch": 2.2452025586353943, "grad_norm": 0.2851322771459375, "learning_rate": 5.380661923214553e-05, "loss": 0.3692, "step": 2106 }, { "epoch": 2.246268656716418, "grad_norm": 0.32537762914509355, "learning_rate": 5.377867401137332e-05, "loss": 0.372, "step": 2107 }, { "epoch": 2.2473347547974414, "grad_norm": 0.36286224273560747, "learning_rate": 5.375072115793181e-05, "loss": 0.3736, "step": 2108 }, { "epoch": 2.248400852878465, "grad_norm": 0.39983878699233394, "learning_rate": 5.3722760687305414e-05, "loss": 0.3726, "step": 2109 }, { "epoch": 2.2494669509594885, "grad_norm": 0.3468587900831904, "learning_rate": 5.3694792614982794e-05, "loss": 0.3701, "step": 2110 }, { "epoch": 2.2505330490405115, "grad_norm": 0.29431394759332513, "learning_rate": 5.366681695645681e-05, "loss": 0.3741, "step": 2111 }, { "epoch": 2.251599147121535, "grad_norm": 0.292448439837048, "learning_rate": 5.363883372722452e-05, "loss": 0.3705, "step": 2112 }, { "epoch": 2.2526652452025586, "grad_norm": 0.3477064399327982, "learning_rate": 5.3610842942787156e-05, "loss": 0.3698, "step": 2113 }, { "epoch": 2.253731343283582, "grad_norm": 0.30311348425697426, "learning_rate": 5.3582844618650196e-05, "loss": 0.3734, "step": 2114 }, { "epoch": 2.2547974413646057, "grad_norm": 0.29982583065786783, "learning_rate": 5.355483877032324e-05, "loss": 0.376, "step": 2115 }, { "epoch": 2.2558635394456292, "grad_norm": 0.2708619876359533, "learning_rate": 5.352682541332006e-05, "loss": 0.3743, "step": 2116 }, { "epoch": 2.2569296375266523, "grad_norm": 0.28890106497206663, "learning_rate": 5.349880456315862e-05, "loss": 0.3715, "step": 2117 }, { "epoch": 2.257995735607676, "grad_norm": 0.2663781257854233, "learning_rate": 5.347077623536099e-05, "loss": 0.3679, "step": 2118 }, { "epoch": 2.2590618336886994, "grad_norm": 0.2674833564078522, "learning_rate": 5.344274044545344e-05, "loss": 0.3669, "step": 2119 }, { "epoch": 2.260127931769723, "grad_norm": 0.28497773715596586, "learning_rate": 5.3414697208966315e-05, "loss": 0.3669, "step": 2120 }, { "epoch": 2.2611940298507465, "grad_norm": 0.26215123696658, "learning_rate": 5.3386646541434126e-05, "loss": 0.3706, "step": 2121 }, { "epoch": 2.2622601279317696, "grad_norm": 0.32005074927132454, "learning_rate": 5.335858845839546e-05, "loss": 0.3702, "step": 2122 }, { "epoch": 2.263326226012793, "grad_norm": 0.32411250381064693, "learning_rate": 5.333052297539308e-05, "loss": 0.3699, "step": 2123 }, { "epoch": 2.2643923240938166, "grad_norm": 0.29000255006330194, "learning_rate": 5.3302450107973766e-05, "loss": 0.368, "step": 2124 }, { "epoch": 2.26545842217484, "grad_norm": 0.27431618415808595, "learning_rate": 5.327436987168844e-05, "loss": 0.371, "step": 2125 }, { "epoch": 2.2665245202558637, "grad_norm": 0.378024359549344, "learning_rate": 5.324628228209209e-05, "loss": 0.3721, "step": 2126 }, { "epoch": 2.267590618336887, "grad_norm": 0.43512140087622997, "learning_rate": 5.321818735474379e-05, "loss": 0.3715, "step": 2127 }, { "epoch": 2.2686567164179103, "grad_norm": 0.3750669758497868, "learning_rate": 5.3190085105206666e-05, "loss": 0.3764, "step": 2128 }, { "epoch": 2.269722814498934, "grad_norm": 0.28073557975371866, "learning_rate": 5.31619755490479e-05, "loss": 0.3748, "step": 2129 }, { "epoch": 2.2707889125799574, "grad_norm": 0.2949560743757881, "learning_rate": 5.3133858701838735e-05, "loss": 0.3709, "step": 2130 }, { "epoch": 2.271855010660981, "grad_norm": 0.3204542728152737, "learning_rate": 5.310573457915443e-05, "loss": 0.3703, "step": 2131 }, { "epoch": 2.272921108742004, "grad_norm": 0.2868376825764523, "learning_rate": 5.307760319657429e-05, "loss": 0.3708, "step": 2132 }, { "epoch": 2.2739872068230276, "grad_norm": 0.251294097973084, "learning_rate": 5.3049464569681654e-05, "loss": 0.3689, "step": 2133 }, { "epoch": 2.275053304904051, "grad_norm": 0.27463227847743765, "learning_rate": 5.3021318714063834e-05, "loss": 0.3761, "step": 2134 }, { "epoch": 2.2761194029850746, "grad_norm": 0.33592667040636226, "learning_rate": 5.299316564531219e-05, "loss": 0.3731, "step": 2135 }, { "epoch": 2.277185501066098, "grad_norm": 0.38903214147148135, "learning_rate": 5.296500537902205e-05, "loss": 0.37, "step": 2136 }, { "epoch": 2.2782515991471217, "grad_norm": 0.34915279703933755, "learning_rate": 5.293683793079274e-05, "loss": 0.3649, "step": 2137 }, { "epoch": 2.279317697228145, "grad_norm": 0.3506016673900483, "learning_rate": 5.2908663316227577e-05, "loss": 0.3734, "step": 2138 }, { "epoch": 2.2803837953091683, "grad_norm": 0.3878071683023028, "learning_rate": 5.28804815509338e-05, "loss": 0.3707, "step": 2139 }, { "epoch": 2.281449893390192, "grad_norm": 0.35568233275956773, "learning_rate": 5.285229265052268e-05, "loss": 0.3758, "step": 2140 }, { "epoch": 2.2825159914712154, "grad_norm": 0.3418784225462311, "learning_rate": 5.2824096630609385e-05, "loss": 0.37, "step": 2141 }, { "epoch": 2.283582089552239, "grad_norm": 0.3607422406160099, "learning_rate": 5.2795893506813024e-05, "loss": 0.3728, "step": 2142 }, { "epoch": 2.2846481876332625, "grad_norm": 0.3721645564569364, "learning_rate": 5.276768329475671e-05, "loss": 0.3717, "step": 2143 }, { "epoch": 2.2857142857142856, "grad_norm": 0.3220957092565807, "learning_rate": 5.2739466010067385e-05, "loss": 0.3739, "step": 2144 }, { "epoch": 2.286780383795309, "grad_norm": 0.2920281939799502, "learning_rate": 5.271124166837599e-05, "loss": 0.366, "step": 2145 }, { "epoch": 2.2878464818763327, "grad_norm": 0.3080101489292651, "learning_rate": 5.2683010285317333e-05, "loss": 0.3675, "step": 2146 }, { "epoch": 2.288912579957356, "grad_norm": 0.3042289029809852, "learning_rate": 5.265477187653012e-05, "loss": 0.369, "step": 2147 }, { "epoch": 2.2899786780383797, "grad_norm": 0.3475082339985931, "learning_rate": 5.262652645765699e-05, "loss": 0.3702, "step": 2148 }, { "epoch": 2.291044776119403, "grad_norm": 0.3115045075355101, "learning_rate": 5.2598274044344414e-05, "loss": 0.3687, "step": 2149 }, { "epoch": 2.2921108742004264, "grad_norm": 0.32610593675887717, "learning_rate": 5.257001465224278e-05, "loss": 0.3692, "step": 2150 }, { "epoch": 2.29317697228145, "grad_norm": 0.37456808053238067, "learning_rate": 5.2541748297006306e-05, "loss": 0.3721, "step": 2151 }, { "epoch": 2.2942430703624734, "grad_norm": 0.3350461025624831, "learning_rate": 5.251347499429309e-05, "loss": 0.3732, "step": 2152 }, { "epoch": 2.295309168443497, "grad_norm": 0.2568059568097965, "learning_rate": 5.2485194759765074e-05, "loss": 0.3691, "step": 2153 }, { "epoch": 2.29637526652452, "grad_norm": 0.27831384466881337, "learning_rate": 5.245690760908803e-05, "loss": 0.3686, "step": 2154 }, { "epoch": 2.2974413646055436, "grad_norm": 0.26602965147812224, "learning_rate": 5.242861355793157e-05, "loss": 0.3645, "step": 2155 }, { "epoch": 2.298507462686567, "grad_norm": 0.2587039755436819, "learning_rate": 5.240031262196914e-05, "loss": 0.3757, "step": 2156 }, { "epoch": 2.2995735607675907, "grad_norm": 0.27151417310136094, "learning_rate": 5.237200481687798e-05, "loss": 0.368, "step": 2157 }, { "epoch": 2.300639658848614, "grad_norm": 0.26058762590380463, "learning_rate": 5.234369015833914e-05, "loss": 0.3697, "step": 2158 }, { "epoch": 2.3017057569296373, "grad_norm": 0.33421242107141547, "learning_rate": 5.2315368662037485e-05, "loss": 0.3675, "step": 2159 }, { "epoch": 2.302771855010661, "grad_norm": 0.3659615591367041, "learning_rate": 5.228704034366162e-05, "loss": 0.3703, "step": 2160 }, { "epoch": 2.3038379530916844, "grad_norm": 0.27978795201644363, "learning_rate": 5.2258705218904005e-05, "loss": 0.371, "step": 2161 }, { "epoch": 2.304904051172708, "grad_norm": 0.21852529768913895, "learning_rate": 5.2230363303460794e-05, "loss": 0.3683, "step": 2162 }, { "epoch": 2.3059701492537314, "grad_norm": 0.3118783856447842, "learning_rate": 5.220201461303193e-05, "loss": 0.3714, "step": 2163 }, { "epoch": 2.307036247334755, "grad_norm": 0.3392350656900747, "learning_rate": 5.2173659163321145e-05, "loss": 0.3682, "step": 2164 }, { "epoch": 2.308102345415778, "grad_norm": 0.32734701181884485, "learning_rate": 5.2145296970035846e-05, "loss": 0.3744, "step": 2165 }, { "epoch": 2.3091684434968016, "grad_norm": 0.303313761017607, "learning_rate": 5.211692804888726e-05, "loss": 0.3744, "step": 2166 }, { "epoch": 2.310234541577825, "grad_norm": 0.2905681835769858, "learning_rate": 5.2088552415590254e-05, "loss": 0.3713, "step": 2167 }, { "epoch": 2.3113006396588487, "grad_norm": 0.3534515955282182, "learning_rate": 5.2060170085863484e-05, "loss": 0.3703, "step": 2168 }, { "epoch": 2.3123667377398722, "grad_norm": 0.4264619477891961, "learning_rate": 5.203178107542925e-05, "loss": 0.3715, "step": 2169 }, { "epoch": 2.3134328358208958, "grad_norm": 0.47866741810652247, "learning_rate": 5.200338540001364e-05, "loss": 0.3706, "step": 2170 }, { "epoch": 2.314498933901919, "grad_norm": 0.4733916257306672, "learning_rate": 5.1974983075346335e-05, "loss": 0.3639, "step": 2171 }, { "epoch": 2.3155650319829424, "grad_norm": 0.42832051016045336, "learning_rate": 5.194657411716076e-05, "loss": 0.3698, "step": 2172 }, { "epoch": 2.316631130063966, "grad_norm": 0.43789085333356936, "learning_rate": 5.1918158541194014e-05, "loss": 0.3735, "step": 2173 }, { "epoch": 2.3176972281449895, "grad_norm": 0.4573284771159797, "learning_rate": 5.188973636318684e-05, "loss": 0.3713, "step": 2174 }, { "epoch": 2.318763326226013, "grad_norm": 0.3916370694042795, "learning_rate": 5.1861307598883644e-05, "loss": 0.3723, "step": 2175 }, { "epoch": 2.319829424307036, "grad_norm": 0.2585801987726249, "learning_rate": 5.1832872264032495e-05, "loss": 0.37, "step": 2176 }, { "epoch": 2.3208955223880596, "grad_norm": 0.2602560441928257, "learning_rate": 5.180443037438508e-05, "loss": 0.3711, "step": 2177 }, { "epoch": 2.321961620469083, "grad_norm": 0.36519338651936206, "learning_rate": 5.1775981945696736e-05, "loss": 0.3701, "step": 2178 }, { "epoch": 2.3230277185501067, "grad_norm": 0.34363559674089617, "learning_rate": 5.1747526993726406e-05, "loss": 0.3701, "step": 2179 }, { "epoch": 2.3240938166311302, "grad_norm": 0.34656810990367415, "learning_rate": 5.1719065534236665e-05, "loss": 0.3784, "step": 2180 }, { "epoch": 2.3251599147121533, "grad_norm": 0.3632003951397555, "learning_rate": 5.169059758299367e-05, "loss": 0.3667, "step": 2181 }, { "epoch": 2.326226012793177, "grad_norm": 0.33113190305897466, "learning_rate": 5.1662123155767195e-05, "loss": 0.3672, "step": 2182 }, { "epoch": 2.3272921108742004, "grad_norm": 0.3490341083520184, "learning_rate": 5.163364226833058e-05, "loss": 0.3715, "step": 2183 }, { "epoch": 2.328358208955224, "grad_norm": 0.30252710599573834, "learning_rate": 5.1605154936460774e-05, "loss": 0.3699, "step": 2184 }, { "epoch": 2.3294243070362475, "grad_norm": 0.23971322338603304, "learning_rate": 5.1576661175938274e-05, "loss": 0.3684, "step": 2185 }, { "epoch": 2.3304904051172706, "grad_norm": 0.2935195353785945, "learning_rate": 5.154816100254714e-05, "loss": 0.3728, "step": 2186 }, { "epoch": 2.331556503198294, "grad_norm": 0.32527117670349187, "learning_rate": 5.151965443207498e-05, "loss": 0.3672, "step": 2187 }, { "epoch": 2.3326226012793176, "grad_norm": 0.29601539551210887, "learning_rate": 5.149114148031296e-05, "loss": 0.3722, "step": 2188 }, { "epoch": 2.333688699360341, "grad_norm": 0.29110059563391355, "learning_rate": 5.1462622163055764e-05, "loss": 0.3711, "step": 2189 }, { "epoch": 2.3347547974413647, "grad_norm": 0.2777132993013124, "learning_rate": 5.143409649610163e-05, "loss": 0.3733, "step": 2190 }, { "epoch": 2.3358208955223883, "grad_norm": 0.2838065564769729, "learning_rate": 5.14055644952523e-05, "loss": 0.3677, "step": 2191 }, { "epoch": 2.3368869936034113, "grad_norm": 0.26546607714553994, "learning_rate": 5.137702617631299e-05, "loss": 0.3705, "step": 2192 }, { "epoch": 2.337953091684435, "grad_norm": 0.2691865136150305, "learning_rate": 5.134848155509245e-05, "loss": 0.3693, "step": 2193 }, { "epoch": 2.3390191897654584, "grad_norm": 0.3248422617546964, "learning_rate": 5.131993064740293e-05, "loss": 0.373, "step": 2194 }, { "epoch": 2.340085287846482, "grad_norm": 0.44219209612841087, "learning_rate": 5.1291373469060156e-05, "loss": 0.3746, "step": 2195 }, { "epoch": 2.3411513859275055, "grad_norm": 0.4487435007125376, "learning_rate": 5.12628100358833e-05, "loss": 0.3693, "step": 2196 }, { "epoch": 2.342217484008529, "grad_norm": 0.39660870409128357, "learning_rate": 5.123424036369504e-05, "loss": 0.3748, "step": 2197 }, { "epoch": 2.343283582089552, "grad_norm": 0.35851143039701705, "learning_rate": 5.120566446832146e-05, "loss": 0.3691, "step": 2198 }, { "epoch": 2.3443496801705757, "grad_norm": 0.334759825605026, "learning_rate": 5.117708236559216e-05, "loss": 0.3673, "step": 2199 }, { "epoch": 2.345415778251599, "grad_norm": 0.2941272239375261, "learning_rate": 5.114849407134012e-05, "loss": 0.3643, "step": 2200 }, { "epoch": 2.3464818763326227, "grad_norm": 0.2789608400358062, "learning_rate": 5.111989960140175e-05, "loss": 0.3671, "step": 2201 }, { "epoch": 2.3475479744136463, "grad_norm": 0.3040283299295704, "learning_rate": 5.109129897161694e-05, "loss": 0.3772, "step": 2202 }, { "epoch": 2.3486140724946694, "grad_norm": 0.319463979431584, "learning_rate": 5.106269219782891e-05, "loss": 0.3749, "step": 2203 }, { "epoch": 2.349680170575693, "grad_norm": 0.2870715537863343, "learning_rate": 5.1034079295884366e-05, "loss": 0.3702, "step": 2204 }, { "epoch": 2.3507462686567164, "grad_norm": 0.23446913387159526, "learning_rate": 5.100546028163334e-05, "loss": 0.3753, "step": 2205 }, { "epoch": 2.35181236673774, "grad_norm": 0.25437833515945757, "learning_rate": 5.0976835170929296e-05, "loss": 0.3666, "step": 2206 }, { "epoch": 2.3528784648187635, "grad_norm": 0.30198542630658826, "learning_rate": 5.0948203979629046e-05, "loss": 0.3673, "step": 2207 }, { "epoch": 2.3539445628997866, "grad_norm": 0.3141893057528808, "learning_rate": 5.091956672359279e-05, "loss": 0.3627, "step": 2208 }, { "epoch": 2.35501066098081, "grad_norm": 0.2805349073413673, "learning_rate": 5.089092341868407e-05, "loss": 0.3676, "step": 2209 }, { "epoch": 2.3560767590618337, "grad_norm": 0.2464031232660798, "learning_rate": 5.08622740807698e-05, "loss": 0.3719, "step": 2210 }, { "epoch": 2.357142857142857, "grad_norm": 0.23438323370618938, "learning_rate": 5.0833618725720214e-05, "loss": 0.3615, "step": 2211 }, { "epoch": 2.3582089552238807, "grad_norm": 0.2997661424210569, "learning_rate": 5.080495736940889e-05, "loss": 0.3705, "step": 2212 }, { "epoch": 2.359275053304904, "grad_norm": 0.28620092721857604, "learning_rate": 5.077629002771274e-05, "loss": 0.3713, "step": 2213 }, { "epoch": 2.3603411513859274, "grad_norm": 0.2468965983835025, "learning_rate": 5.074761671651198e-05, "loss": 0.3696, "step": 2214 }, { "epoch": 2.361407249466951, "grad_norm": 0.25371334633765574, "learning_rate": 5.071893745169012e-05, "loss": 0.3656, "step": 2215 }, { "epoch": 2.3624733475479744, "grad_norm": 0.24096589331073096, "learning_rate": 5.0690252249133986e-05, "loss": 0.3709, "step": 2216 }, { "epoch": 2.363539445628998, "grad_norm": 0.2152100387619128, "learning_rate": 5.066156112473371e-05, "loss": 0.3727, "step": 2217 }, { "epoch": 2.364605543710021, "grad_norm": 0.19650715541656136, "learning_rate": 5.063286409438265e-05, "loss": 0.3678, "step": 2218 }, { "epoch": 2.3656716417910446, "grad_norm": 0.21642059764416482, "learning_rate": 5.0604161173977504e-05, "loss": 0.3701, "step": 2219 }, { "epoch": 2.366737739872068, "grad_norm": 0.27848392769400904, "learning_rate": 5.057545237941818e-05, "loss": 0.3697, "step": 2220 }, { "epoch": 2.3678038379530917, "grad_norm": 0.3063826642109227, "learning_rate": 5.054673772660785e-05, "loss": 0.3669, "step": 2221 }, { "epoch": 2.368869936034115, "grad_norm": 0.32561764160202256, "learning_rate": 5.0518017231452965e-05, "loss": 0.369, "step": 2222 }, { "epoch": 2.3699360341151388, "grad_norm": 0.2857238132037038, "learning_rate": 5.048929090986315e-05, "loss": 0.3658, "step": 2223 }, { "epoch": 2.3710021321961623, "grad_norm": 0.24657414607956593, "learning_rate": 5.046055877775134e-05, "loss": 0.3689, "step": 2224 }, { "epoch": 2.3720682302771854, "grad_norm": 0.25182509640136513, "learning_rate": 5.04318208510336e-05, "loss": 0.3737, "step": 2225 }, { "epoch": 2.373134328358209, "grad_norm": 0.2663592462420767, "learning_rate": 5.040307714562928e-05, "loss": 0.3654, "step": 2226 }, { "epoch": 2.3742004264392325, "grad_norm": 0.2879071667851132, "learning_rate": 5.0374327677460865e-05, "loss": 0.362, "step": 2227 }, { "epoch": 2.375266524520256, "grad_norm": 0.31268904527265867, "learning_rate": 5.034557246245411e-05, "loss": 0.3716, "step": 2228 }, { "epoch": 2.3763326226012795, "grad_norm": 0.33284965204702893, "learning_rate": 5.031681151653788e-05, "loss": 0.3736, "step": 2229 }, { "epoch": 2.3773987206823026, "grad_norm": 0.33195392118584366, "learning_rate": 5.028804485564424e-05, "loss": 0.3677, "step": 2230 }, { "epoch": 2.378464818763326, "grad_norm": 0.3258278442178994, "learning_rate": 5.025927249570844e-05, "loss": 0.3693, "step": 2231 }, { "epoch": 2.3795309168443497, "grad_norm": 0.27343346069158153, "learning_rate": 5.0230494452668864e-05, "loss": 0.3733, "step": 2232 }, { "epoch": 2.3805970149253732, "grad_norm": 0.25449187697759107, "learning_rate": 5.020171074246707e-05, "loss": 0.3668, "step": 2233 }, { "epoch": 2.3816631130063968, "grad_norm": 0.30392088734456846, "learning_rate": 5.01729213810477e-05, "loss": 0.3728, "step": 2234 }, { "epoch": 2.38272921108742, "grad_norm": 0.3421952140379261, "learning_rate": 5.014412638435861e-05, "loss": 0.3684, "step": 2235 }, { "epoch": 2.3837953091684434, "grad_norm": 0.2857424583185023, "learning_rate": 5.011532576835069e-05, "loss": 0.3629, "step": 2236 }, { "epoch": 2.384861407249467, "grad_norm": 0.2704981763316653, "learning_rate": 5.008651954897802e-05, "loss": 0.3686, "step": 2237 }, { "epoch": 2.3859275053304905, "grad_norm": 0.31308011375671224, "learning_rate": 5.005770774219771e-05, "loss": 0.3668, "step": 2238 }, { "epoch": 2.386993603411514, "grad_norm": 0.3213528795416127, "learning_rate": 5.002889036397005e-05, "loss": 0.3732, "step": 2239 }, { "epoch": 2.388059701492537, "grad_norm": 0.34520492416317516, "learning_rate": 5.000006743025834e-05, "loss": 0.3745, "step": 2240 }, { "epoch": 2.3891257995735606, "grad_norm": 0.30507013156130014, "learning_rate": 4.997123895702898e-05, "loss": 0.3705, "step": 2241 }, { "epoch": 2.390191897654584, "grad_norm": 0.2626181582745989, "learning_rate": 4.994240496025147e-05, "loss": 0.3727, "step": 2242 }, { "epoch": 2.3912579957356077, "grad_norm": 0.28512789584756165, "learning_rate": 4.9913565455898327e-05, "loss": 0.3756, "step": 2243 }, { "epoch": 2.3923240938166312, "grad_norm": 0.34534631086868595, "learning_rate": 4.988472045994515e-05, "loss": 0.3713, "step": 2244 }, { "epoch": 2.3933901918976543, "grad_norm": 0.3429792538710119, "learning_rate": 4.9855869988370566e-05, "loss": 0.3703, "step": 2245 }, { "epoch": 2.394456289978678, "grad_norm": 0.25240157643598227, "learning_rate": 4.982701405715622e-05, "loss": 0.3677, "step": 2246 }, { "epoch": 2.3955223880597014, "grad_norm": 0.2549810372821131, "learning_rate": 4.9798152682286824e-05, "loss": 0.3711, "step": 2247 }, { "epoch": 2.396588486140725, "grad_norm": 0.351895077255566, "learning_rate": 4.976928587975006e-05, "loss": 0.3686, "step": 2248 }, { "epoch": 2.3976545842217485, "grad_norm": 0.30952942264717365, "learning_rate": 4.974041366553665e-05, "loss": 0.372, "step": 2249 }, { "epoch": 2.398720682302772, "grad_norm": 0.2747608348019261, "learning_rate": 4.9711536055640285e-05, "loss": 0.3728, "step": 2250 }, { "epoch": 2.399786780383795, "grad_norm": 0.29969340676529904, "learning_rate": 4.9682653066057676e-05, "loss": 0.3696, "step": 2251 }, { "epoch": 2.4008528784648187, "grad_norm": 0.32127749277577844, "learning_rate": 4.965376471278848e-05, "loss": 0.3733, "step": 2252 }, { "epoch": 2.401918976545842, "grad_norm": 0.3048300891627042, "learning_rate": 4.962487101183536e-05, "loss": 0.369, "step": 2253 }, { "epoch": 2.4029850746268657, "grad_norm": 0.3219753451808507, "learning_rate": 4.959597197920392e-05, "loss": 0.3681, "step": 2254 }, { "epoch": 2.4040511727078893, "grad_norm": 0.2965082879251037, "learning_rate": 4.956706763090272e-05, "loss": 0.3724, "step": 2255 }, { "epoch": 2.405117270788913, "grad_norm": 0.275785450919491, "learning_rate": 4.953815798294327e-05, "loss": 0.3675, "step": 2256 }, { "epoch": 2.406183368869936, "grad_norm": 0.8759881578646651, "learning_rate": 4.950924305134001e-05, "loss": 0.3698, "step": 2257 }, { "epoch": 2.4072494669509594, "grad_norm": 0.2354192544695265, "learning_rate": 4.948032285211031e-05, "loss": 0.3693, "step": 2258 }, { "epoch": 2.408315565031983, "grad_norm": 0.28320526847754757, "learning_rate": 4.945139740127444e-05, "loss": 0.3685, "step": 2259 }, { "epoch": 2.4093816631130065, "grad_norm": 0.27994748944288567, "learning_rate": 4.9422466714855635e-05, "loss": 0.3716, "step": 2260 }, { "epoch": 2.41044776119403, "grad_norm": 0.2577423187481169, "learning_rate": 4.939353080887996e-05, "loss": 0.3736, "step": 2261 }, { "epoch": 2.411513859275053, "grad_norm": 0.20189014886728646, "learning_rate": 4.936458969937642e-05, "loss": 0.3634, "step": 2262 }, { "epoch": 2.4125799573560767, "grad_norm": 0.27831002089150636, "learning_rate": 4.933564340237687e-05, "loss": 0.3698, "step": 2263 }, { "epoch": 2.4136460554371, "grad_norm": 0.33503367593001443, "learning_rate": 4.930669193391607e-05, "loss": 0.3759, "step": 2264 }, { "epoch": 2.4147121535181237, "grad_norm": 0.2557075329669632, "learning_rate": 4.927773531003161e-05, "loss": 0.3719, "step": 2265 }, { "epoch": 2.4157782515991473, "grad_norm": 0.1959575254371725, "learning_rate": 4.9248773546763984e-05, "loss": 0.3666, "step": 2266 }, { "epoch": 2.4168443496801704, "grad_norm": 0.2566851026971127, "learning_rate": 4.921980666015647e-05, "loss": 0.3722, "step": 2267 }, { "epoch": 2.417910447761194, "grad_norm": 0.3373730577896526, "learning_rate": 4.919083466625524e-05, "loss": 0.3704, "step": 2268 }, { "epoch": 2.4189765458422174, "grad_norm": 0.29147620272337893, "learning_rate": 4.916185758110928e-05, "loss": 0.3764, "step": 2269 }, { "epoch": 2.420042643923241, "grad_norm": 0.23015006905725324, "learning_rate": 4.913287542077035e-05, "loss": 0.3765, "step": 2270 }, { "epoch": 2.4211087420042645, "grad_norm": 0.2666917022092777, "learning_rate": 4.91038882012931e-05, "loss": 0.3685, "step": 2271 }, { "epoch": 2.4221748400852876, "grad_norm": 0.2807196321927748, "learning_rate": 4.907489593873493e-05, "loss": 0.3673, "step": 2272 }, { "epoch": 2.423240938166311, "grad_norm": 0.23362271125449918, "learning_rate": 4.904589864915605e-05, "loss": 0.367, "step": 2273 }, { "epoch": 2.4243070362473347, "grad_norm": 0.2813715337244537, "learning_rate": 4.901689634861943e-05, "loss": 0.3712, "step": 2274 }, { "epoch": 2.425373134328358, "grad_norm": 0.29016723056538085, "learning_rate": 4.898788905319087e-05, "loss": 0.373, "step": 2275 }, { "epoch": 2.4264392324093818, "grad_norm": 0.302081673689402, "learning_rate": 4.895887677893889e-05, "loss": 0.3654, "step": 2276 }, { "epoch": 2.4275053304904053, "grad_norm": 0.28261113280230443, "learning_rate": 4.892985954193478e-05, "loss": 0.3748, "step": 2277 }, { "epoch": 2.4285714285714284, "grad_norm": 0.32074482716991354, "learning_rate": 4.890083735825258e-05, "loss": 0.3649, "step": 2278 }, { "epoch": 2.429637526652452, "grad_norm": 0.3198080528899319, "learning_rate": 4.887181024396907e-05, "loss": 0.3593, "step": 2279 }, { "epoch": 2.4307036247334755, "grad_norm": 0.25493105066167904, "learning_rate": 4.884277821516377e-05, "loss": 0.3661, "step": 2280 }, { "epoch": 2.431769722814499, "grad_norm": 0.21652193711164933, "learning_rate": 4.881374128791892e-05, "loss": 0.3712, "step": 2281 }, { "epoch": 2.4328358208955225, "grad_norm": 0.25095432049789496, "learning_rate": 4.878469947831945e-05, "loss": 0.3704, "step": 2282 }, { "epoch": 2.433901918976546, "grad_norm": 0.26679049451296216, "learning_rate": 4.875565280245303e-05, "loss": 0.3741, "step": 2283 }, { "epoch": 2.434968017057569, "grad_norm": 0.2999335227622482, "learning_rate": 4.872660127640999e-05, "loss": 0.3713, "step": 2284 }, { "epoch": 2.4360341151385927, "grad_norm": 0.2695301789461439, "learning_rate": 4.8697544916283386e-05, "loss": 0.3659, "step": 2285 }, { "epoch": 2.4371002132196162, "grad_norm": 0.21471471033527165, "learning_rate": 4.866848373816893e-05, "loss": 0.3682, "step": 2286 }, { "epoch": 2.4381663113006398, "grad_norm": 0.23947760320903563, "learning_rate": 4.863941775816498e-05, "loss": 0.372, "step": 2287 }, { "epoch": 2.4392324093816633, "grad_norm": 0.267331462460001, "learning_rate": 4.8610346992372603e-05, "loss": 0.3654, "step": 2288 }, { "epoch": 2.4402985074626864, "grad_norm": 0.20555064622823938, "learning_rate": 4.85812714568955e-05, "loss": 0.3693, "step": 2289 }, { "epoch": 2.44136460554371, "grad_norm": 0.21294997398754437, "learning_rate": 4.855219116783997e-05, "loss": 0.3728, "step": 2290 }, { "epoch": 2.4424307036247335, "grad_norm": 0.21976358289050846, "learning_rate": 4.8523106141315005e-05, "loss": 0.3792, "step": 2291 }, { "epoch": 2.443496801705757, "grad_norm": 0.23884227282875203, "learning_rate": 4.8494016393432205e-05, "loss": 0.3607, "step": 2292 }, { "epoch": 2.4445628997867805, "grad_norm": 0.2624658049626465, "learning_rate": 4.846492194030577e-05, "loss": 0.3707, "step": 2293 }, { "epoch": 2.4456289978678036, "grad_norm": 0.28918877258810133, "learning_rate": 4.843582279805251e-05, "loss": 0.3737, "step": 2294 }, { "epoch": 2.446695095948827, "grad_norm": 0.31404526252223114, "learning_rate": 4.840671898279185e-05, "loss": 0.3659, "step": 2295 }, { "epoch": 2.4477611940298507, "grad_norm": 0.29838133141624607, "learning_rate": 4.837761051064579e-05, "loss": 0.3687, "step": 2296 }, { "epoch": 2.4488272921108742, "grad_norm": 0.22887592628169673, "learning_rate": 4.834849739773889e-05, "loss": 0.3722, "step": 2297 }, { "epoch": 2.449893390191898, "grad_norm": 0.23115344121100373, "learning_rate": 4.8319379660198316e-05, "loss": 0.3737, "step": 2298 }, { "epoch": 2.450959488272921, "grad_norm": 0.23213200641703416, "learning_rate": 4.829025731415378e-05, "loss": 0.364, "step": 2299 }, { "epoch": 2.4520255863539444, "grad_norm": 0.2116924844746429, "learning_rate": 4.826113037573756e-05, "loss": 0.3674, "step": 2300 }, { "epoch": 2.453091684434968, "grad_norm": 0.2559342759140697, "learning_rate": 4.823199886108445e-05, "loss": 0.3752, "step": 2301 }, { "epoch": 2.4541577825159915, "grad_norm": 0.293845262944136, "learning_rate": 4.82028627863318e-05, "loss": 0.3709, "step": 2302 }, { "epoch": 2.455223880597015, "grad_norm": 0.29454888718180283, "learning_rate": 4.817372216761948e-05, "loss": 0.3722, "step": 2303 }, { "epoch": 2.4562899786780386, "grad_norm": 0.24511640910718516, "learning_rate": 4.8144577021089884e-05, "loss": 0.3747, "step": 2304 }, { "epoch": 2.4573560767590616, "grad_norm": 0.2763149476728892, "learning_rate": 4.81154273628879e-05, "loss": 0.363, "step": 2305 }, { "epoch": 2.458422174840085, "grad_norm": 0.3243976443000684, "learning_rate": 4.8086273209160936e-05, "loss": 0.3693, "step": 2306 }, { "epoch": 2.4594882729211087, "grad_norm": 0.28035443109857583, "learning_rate": 4.8057114576058863e-05, "loss": 0.3709, "step": 2307 }, { "epoch": 2.4605543710021323, "grad_norm": 0.20090541292771277, "learning_rate": 4.802795147973406e-05, "loss": 0.3666, "step": 2308 }, { "epoch": 2.461620469083156, "grad_norm": 0.2580300489910844, "learning_rate": 4.799878393634136e-05, "loss": 0.3668, "step": 2309 }, { "epoch": 2.4626865671641793, "grad_norm": 0.25079828700699003, "learning_rate": 4.796961196203806e-05, "loss": 0.3763, "step": 2310 }, { "epoch": 2.4637526652452024, "grad_norm": 0.24137932132689202, "learning_rate": 4.7940435572983936e-05, "loss": 0.3705, "step": 2311 }, { "epoch": 2.464818763326226, "grad_norm": 0.24074250093617247, "learning_rate": 4.791125478534118e-05, "loss": 0.3701, "step": 2312 }, { "epoch": 2.4658848614072495, "grad_norm": 0.21896631390072002, "learning_rate": 4.7882069615274435e-05, "loss": 0.3683, "step": 2313 }, { "epoch": 2.466950959488273, "grad_norm": 0.22445307173012358, "learning_rate": 4.7852880078950764e-05, "loss": 0.3721, "step": 2314 }, { "epoch": 2.4680170575692966, "grad_norm": 0.27306888281171515, "learning_rate": 4.782368619253965e-05, "loss": 0.3687, "step": 2315 }, { "epoch": 2.4690831556503197, "grad_norm": 0.36336555042461727, "learning_rate": 4.7794487972213e-05, "loss": 0.3639, "step": 2316 }, { "epoch": 2.470149253731343, "grad_norm": 0.3601488690657708, "learning_rate": 4.776528543414511e-05, "loss": 0.3738, "step": 2317 }, { "epoch": 2.4712153518123667, "grad_norm": 0.35339799743518896, "learning_rate": 4.773607859451265e-05, "loss": 0.3713, "step": 2318 }, { "epoch": 2.4722814498933903, "grad_norm": 0.3693531153358677, "learning_rate": 4.770686746949472e-05, "loss": 0.3681, "step": 2319 }, { "epoch": 2.473347547974414, "grad_norm": 0.39311860926438036, "learning_rate": 4.767765207527275e-05, "loss": 0.3696, "step": 2320 }, { "epoch": 2.474413646055437, "grad_norm": 0.37612079543054977, "learning_rate": 4.764843242803053e-05, "loss": 0.3661, "step": 2321 }, { "epoch": 2.4754797441364604, "grad_norm": 0.3521543006142998, "learning_rate": 4.761920854395426e-05, "loss": 0.3716, "step": 2322 }, { "epoch": 2.476545842217484, "grad_norm": 0.26305773466563936, "learning_rate": 4.7589980439232433e-05, "loss": 0.3707, "step": 2323 }, { "epoch": 2.4776119402985075, "grad_norm": 0.2559553568605443, "learning_rate": 4.756074813005591e-05, "loss": 0.3704, "step": 2324 }, { "epoch": 2.478678038379531, "grad_norm": 0.2890296801712053, "learning_rate": 4.753151163261787e-05, "loss": 0.3669, "step": 2325 }, { "epoch": 2.479744136460554, "grad_norm": 0.268964216758793, "learning_rate": 4.75022709631138e-05, "loss": 0.3704, "step": 2326 }, { "epoch": 2.4808102345415777, "grad_norm": 0.29305303643593816, "learning_rate": 4.747302613774153e-05, "loss": 0.3667, "step": 2327 }, { "epoch": 2.481876332622601, "grad_norm": 0.3343530498108033, "learning_rate": 4.7443777172701146e-05, "loss": 0.3703, "step": 2328 }, { "epoch": 2.4829424307036247, "grad_norm": 0.3429097239396157, "learning_rate": 4.74145240841951e-05, "loss": 0.374, "step": 2329 }, { "epoch": 2.4840085287846483, "grad_norm": 0.27000380770235705, "learning_rate": 4.738526688842803e-05, "loss": 0.371, "step": 2330 }, { "epoch": 2.485074626865672, "grad_norm": 0.24578044590662604, "learning_rate": 4.735600560160695e-05, "loss": 0.3713, "step": 2331 }, { "epoch": 2.486140724946695, "grad_norm": 0.2595315485397087, "learning_rate": 4.7326740239941054e-05, "loss": 0.3748, "step": 2332 }, { "epoch": 2.4872068230277184, "grad_norm": 0.2207117203267181, "learning_rate": 4.729747081964185e-05, "loss": 0.3635, "step": 2333 }, { "epoch": 2.488272921108742, "grad_norm": 0.22525118617182993, "learning_rate": 4.7268197356923076e-05, "loss": 0.3693, "step": 2334 }, { "epoch": 2.4893390191897655, "grad_norm": 0.26088710793376507, "learning_rate": 4.7238919868000704e-05, "loss": 0.3609, "step": 2335 }, { "epoch": 2.490405117270789, "grad_norm": 0.2885428062747875, "learning_rate": 4.720963836909295e-05, "loss": 0.3762, "step": 2336 }, { "epoch": 2.4914712153518126, "grad_norm": 0.25259361954597753, "learning_rate": 4.718035287642022e-05, "loss": 0.3714, "step": 2337 }, { "epoch": 2.4925373134328357, "grad_norm": 0.24137669671809825, "learning_rate": 4.715106340620518e-05, "loss": 0.3677, "step": 2338 }, { "epoch": 2.4936034115138592, "grad_norm": 0.2686347142176362, "learning_rate": 4.712176997467266e-05, "loss": 0.3706, "step": 2339 }, { "epoch": 2.4946695095948828, "grad_norm": 0.2812137990571536, "learning_rate": 4.709247259804971e-05, "loss": 0.371, "step": 2340 }, { "epoch": 2.4957356076759063, "grad_norm": 0.24346310160667034, "learning_rate": 4.706317129256554e-05, "loss": 0.3716, "step": 2341 }, { "epoch": 2.49680170575693, "grad_norm": 0.23214815216215345, "learning_rate": 4.703386607445157e-05, "loss": 0.3712, "step": 2342 }, { "epoch": 2.497867803837953, "grad_norm": 0.2739545324648638, "learning_rate": 4.7004556959941335e-05, "loss": 0.3651, "step": 2343 }, { "epoch": 2.4989339019189765, "grad_norm": 0.25915247784626605, "learning_rate": 4.69752439652706e-05, "loss": 0.3663, "step": 2344 }, { "epoch": 2.5, "grad_norm": 0.25618378366217026, "learning_rate": 4.694592710667723e-05, "loss": 0.374, "step": 2345 }, { "epoch": 2.5010660980810235, "grad_norm": 0.30631526116242996, "learning_rate": 4.69166064004012e-05, "loss": 0.3716, "step": 2346 }, { "epoch": 2.502132196162047, "grad_norm": 0.3366426624946492, "learning_rate": 4.688728186268472e-05, "loss": 0.3714, "step": 2347 }, { "epoch": 2.50319829424307, "grad_norm": 0.25931098502063027, "learning_rate": 4.685795350977202e-05, "loss": 0.3715, "step": 2348 }, { "epoch": 2.5042643923240937, "grad_norm": 0.2123807260445706, "learning_rate": 4.6828621357909494e-05, "loss": 0.3649, "step": 2349 }, { "epoch": 2.5053304904051172, "grad_norm": 0.19519486858966945, "learning_rate": 4.679928542334564e-05, "loss": 0.3756, "step": 2350 }, { "epoch": 2.5063965884861408, "grad_norm": 0.24863053693547293, "learning_rate": 4.676994572233101e-05, "loss": 0.3676, "step": 2351 }, { "epoch": 2.5074626865671643, "grad_norm": 0.25591846224110576, "learning_rate": 4.674060227111831e-05, "loss": 0.3746, "step": 2352 }, { "epoch": 2.5085287846481874, "grad_norm": 0.2281497146449445, "learning_rate": 4.6711255085962275e-05, "loss": 0.3636, "step": 2353 }, { "epoch": 2.509594882729211, "grad_norm": 0.2994568082565574, "learning_rate": 4.66819041831197e-05, "loss": 0.373, "step": 2354 }, { "epoch": 2.5106609808102345, "grad_norm": 0.29113727854820887, "learning_rate": 4.66525495788495e-05, "loss": 0.3723, "step": 2355 }, { "epoch": 2.511727078891258, "grad_norm": 0.2845367853179552, "learning_rate": 4.662319128941256e-05, "loss": 0.3659, "step": 2356 }, { "epoch": 2.5127931769722816, "grad_norm": 0.24591581925857822, "learning_rate": 4.6593829331071854e-05, "loss": 0.3777, "step": 2357 }, { "epoch": 2.5138592750533046, "grad_norm": 0.2322117630084555, "learning_rate": 4.6564463720092405e-05, "loss": 0.3679, "step": 2358 }, { "epoch": 2.5149253731343286, "grad_norm": 0.22223391656686764, "learning_rate": 4.653509447274121e-05, "loss": 0.3685, "step": 2359 }, { "epoch": 2.5159914712153517, "grad_norm": 0.2403114798612083, "learning_rate": 4.650572160528733e-05, "loss": 0.3661, "step": 2360 }, { "epoch": 2.5170575692963753, "grad_norm": 0.2414630524740656, "learning_rate": 4.647634513400178e-05, "loss": 0.3682, "step": 2361 }, { "epoch": 2.518123667377399, "grad_norm": 0.2006519271395349, "learning_rate": 4.644696507515762e-05, "loss": 0.3654, "step": 2362 }, { "epoch": 2.5191897654584223, "grad_norm": 0.24007590073272467, "learning_rate": 4.641758144502985e-05, "loss": 0.3659, "step": 2363 }, { "epoch": 2.520255863539446, "grad_norm": 0.26804872309010386, "learning_rate": 4.638819425989551e-05, "loss": 0.3709, "step": 2364 }, { "epoch": 2.521321961620469, "grad_norm": 0.24000354250919373, "learning_rate": 4.635880353603356e-05, "loss": 0.3712, "step": 2365 }, { "epoch": 2.5223880597014925, "grad_norm": 0.2652528639205553, "learning_rate": 4.632940928972491e-05, "loss": 0.3688, "step": 2366 }, { "epoch": 2.523454157782516, "grad_norm": 0.2580420852617307, "learning_rate": 4.630001153725247e-05, "loss": 0.3703, "step": 2367 }, { "epoch": 2.5245202558635396, "grad_norm": 0.22089649535392641, "learning_rate": 4.627061029490105e-05, "loss": 0.3711, "step": 2368 }, { "epoch": 2.525586353944563, "grad_norm": 0.2545584727108516, "learning_rate": 4.6241205578957435e-05, "loss": 0.3737, "step": 2369 }, { "epoch": 2.526652452025586, "grad_norm": 0.2967974395396264, "learning_rate": 4.6211797405710285e-05, "loss": 0.3712, "step": 2370 }, { "epoch": 2.5277185501066097, "grad_norm": 0.2791964105337959, "learning_rate": 4.618238579145022e-05, "loss": 0.3732, "step": 2371 }, { "epoch": 2.5287846481876333, "grad_norm": 0.26473846775672055, "learning_rate": 4.6152970752469716e-05, "loss": 0.3672, "step": 2372 }, { "epoch": 2.529850746268657, "grad_norm": 0.2049402609403739, "learning_rate": 4.612355230506321e-05, "loss": 0.3691, "step": 2373 }, { "epoch": 2.5309168443496803, "grad_norm": 0.1856152488020746, "learning_rate": 4.609413046552697e-05, "loss": 0.3741, "step": 2374 }, { "epoch": 2.5319829424307034, "grad_norm": 0.2258681946492881, "learning_rate": 4.606470525015917e-05, "loss": 0.369, "step": 2375 }, { "epoch": 2.533049040511727, "grad_norm": 0.25975437520850114, "learning_rate": 4.603527667525987e-05, "loss": 0.3712, "step": 2376 }, { "epoch": 2.5341151385927505, "grad_norm": 0.2306018312278267, "learning_rate": 4.6005844757130937e-05, "loss": 0.3644, "step": 2377 }, { "epoch": 2.535181236673774, "grad_norm": 0.24072038440875435, "learning_rate": 4.597640951207615e-05, "loss": 0.3737, "step": 2378 }, { "epoch": 2.5362473347547976, "grad_norm": 0.2686312117022459, "learning_rate": 4.5946970956401086e-05, "loss": 0.3646, "step": 2379 }, { "epoch": 2.5373134328358207, "grad_norm": 0.2940976442002358, "learning_rate": 4.59175291064132e-05, "loss": 0.3668, "step": 2380 }, { "epoch": 2.538379530916844, "grad_norm": 0.2926304524554624, "learning_rate": 4.588808397842172e-05, "loss": 0.3662, "step": 2381 }, { "epoch": 2.5394456289978677, "grad_norm": 0.24031871433591176, "learning_rate": 4.585863558873774e-05, "loss": 0.365, "step": 2382 }, { "epoch": 2.5405117270788913, "grad_norm": 0.23386304551259074, "learning_rate": 4.582918395367412e-05, "loss": 0.3708, "step": 2383 }, { "epoch": 2.541577825159915, "grad_norm": 0.23138470902625616, "learning_rate": 4.5799729089545546e-05, "loss": 0.3666, "step": 2384 }, { "epoch": 2.542643923240938, "grad_norm": 0.20493257086499, "learning_rate": 4.577027101266847e-05, "loss": 0.3641, "step": 2385 }, { "epoch": 2.543710021321962, "grad_norm": 0.2742256917065172, "learning_rate": 4.574080973936115e-05, "loss": 0.3656, "step": 2386 }, { "epoch": 2.544776119402985, "grad_norm": 0.2573784786511887, "learning_rate": 4.5711345285943585e-05, "loss": 0.3722, "step": 2387 }, { "epoch": 2.5458422174840085, "grad_norm": 0.220635079626304, "learning_rate": 4.568187766873757e-05, "loss": 0.3707, "step": 2388 }, { "epoch": 2.546908315565032, "grad_norm": 0.24947774232184444, "learning_rate": 4.565240690406661e-05, "loss": 0.3685, "step": 2389 }, { "epoch": 2.5479744136460556, "grad_norm": 0.22455688901803345, "learning_rate": 4.5622933008256e-05, "loss": 0.3663, "step": 2390 }, { "epoch": 2.549040511727079, "grad_norm": 0.24210657316421766, "learning_rate": 4.559345599763273e-05, "loss": 0.3715, "step": 2391 }, { "epoch": 2.550106609808102, "grad_norm": 0.23303940791069144, "learning_rate": 4.556397588852553e-05, "loss": 0.3748, "step": 2392 }, { "epoch": 2.5511727078891258, "grad_norm": 0.20346922824676458, "learning_rate": 4.553449269726487e-05, "loss": 0.378, "step": 2393 }, { "epoch": 2.5522388059701493, "grad_norm": 0.25118366195456854, "learning_rate": 4.550500644018289e-05, "loss": 0.3722, "step": 2394 }, { "epoch": 2.553304904051173, "grad_norm": 0.23799851735663155, "learning_rate": 4.547551713361344e-05, "loss": 0.3654, "step": 2395 }, { "epoch": 2.5543710021321964, "grad_norm": 0.24008761159127065, "learning_rate": 4.544602479389207e-05, "loss": 0.3715, "step": 2396 }, { "epoch": 2.5554371002132195, "grad_norm": 0.3224169072622209, "learning_rate": 4.5416529437355996e-05, "loss": 0.3772, "step": 2397 }, { "epoch": 2.556503198294243, "grad_norm": 0.28620149997931676, "learning_rate": 4.538703108034414e-05, "loss": 0.3678, "step": 2398 }, { "epoch": 2.5575692963752665, "grad_norm": 0.2192934282403724, "learning_rate": 4.535752973919701e-05, "loss": 0.3703, "step": 2399 }, { "epoch": 2.55863539445629, "grad_norm": 0.1709676106553397, "learning_rate": 4.532802543025686e-05, "loss": 0.3699, "step": 2400 }, { "epoch": 2.5597014925373136, "grad_norm": 0.24392777534293011, "learning_rate": 4.529851816986752e-05, "loss": 0.3687, "step": 2401 }, { "epoch": 2.5607675906183367, "grad_norm": 0.24891562920270666, "learning_rate": 4.5269007974374494e-05, "loss": 0.3697, "step": 2402 }, { "epoch": 2.5618336886993602, "grad_norm": 0.2384548720836165, "learning_rate": 4.5239494860124895e-05, "loss": 0.3685, "step": 2403 }, { "epoch": 2.5628997867803838, "grad_norm": 0.27098692831127463, "learning_rate": 4.5209978843467436e-05, "loss": 0.3678, "step": 2404 }, { "epoch": 2.5639658848614073, "grad_norm": 0.20778244660818992, "learning_rate": 4.5180459940752474e-05, "loss": 0.3667, "step": 2405 }, { "epoch": 2.565031982942431, "grad_norm": 0.22120855767567077, "learning_rate": 4.515093816833193e-05, "loss": 0.3682, "step": 2406 }, { "epoch": 2.566098081023454, "grad_norm": 0.2634670562341265, "learning_rate": 4.512141354255935e-05, "loss": 0.3689, "step": 2407 }, { "epoch": 2.5671641791044775, "grad_norm": 0.2583204612978333, "learning_rate": 4.509188607978983e-05, "loss": 0.365, "step": 2408 }, { "epoch": 2.568230277185501, "grad_norm": 0.279446147717918, "learning_rate": 4.5062355796380066e-05, "loss": 0.3685, "step": 2409 }, { "epoch": 2.5692963752665245, "grad_norm": 0.2489838679141536, "learning_rate": 4.503282270868829e-05, "loss": 0.3699, "step": 2410 }, { "epoch": 2.570362473347548, "grad_norm": 0.2818230446222446, "learning_rate": 4.500328683307428e-05, "loss": 0.3735, "step": 2411 }, { "epoch": 2.571428571428571, "grad_norm": 0.32345371657550076, "learning_rate": 4.4973748185899416e-05, "loss": 0.3729, "step": 2412 }, { "epoch": 2.572494669509595, "grad_norm": 0.27902407224490827, "learning_rate": 4.4944206783526536e-05, "loss": 0.3723, "step": 2413 }, { "epoch": 2.5735607675906182, "grad_norm": 0.2424292615142393, "learning_rate": 4.4914662642320054e-05, "loss": 0.3687, "step": 2414 }, { "epoch": 2.574626865671642, "grad_norm": 0.251352116384431, "learning_rate": 4.4885115778645895e-05, "loss": 0.3727, "step": 2415 }, { "epoch": 2.5756929637526653, "grad_norm": 0.24325740030245527, "learning_rate": 4.485556620887148e-05, "loss": 0.3674, "step": 2416 }, { "epoch": 2.576759061833689, "grad_norm": 0.30803045624118025, "learning_rate": 4.482601394936573e-05, "loss": 0.3707, "step": 2417 }, { "epoch": 2.5778251599147124, "grad_norm": 0.3292401151984711, "learning_rate": 4.479645901649908e-05, "loss": 0.3732, "step": 2418 }, { "epoch": 2.5788912579957355, "grad_norm": 0.29582277126522766, "learning_rate": 4.476690142664342e-05, "loss": 0.3744, "step": 2419 }, { "epoch": 2.579957356076759, "grad_norm": 0.26686256357070676, "learning_rate": 4.47373411961721e-05, "loss": 0.3687, "step": 2420 }, { "epoch": 2.5810234541577826, "grad_norm": 0.24943332540729626, "learning_rate": 4.470777834145997e-05, "loss": 0.3658, "step": 2421 }, { "epoch": 2.582089552238806, "grad_norm": 0.2121258978338484, "learning_rate": 4.467821287888331e-05, "loss": 0.3668, "step": 2422 }, { "epoch": 2.5831556503198296, "grad_norm": 0.2156156797629481, "learning_rate": 4.464864482481984e-05, "loss": 0.3642, "step": 2423 }, { "epoch": 2.5842217484008527, "grad_norm": 0.2006814924668892, "learning_rate": 4.461907419564874e-05, "loss": 0.3697, "step": 2424 }, { "epoch": 2.5852878464818763, "grad_norm": 0.210853772833808, "learning_rate": 4.4589501007750595e-05, "loss": 0.3749, "step": 2425 }, { "epoch": 2.5863539445629, "grad_norm": 0.25717175869949604, "learning_rate": 4.4559925277507416e-05, "loss": 0.3698, "step": 2426 }, { "epoch": 2.5874200426439233, "grad_norm": 0.25073349786641386, "learning_rate": 4.4530347021302626e-05, "loss": 0.3713, "step": 2427 }, { "epoch": 2.588486140724947, "grad_norm": 0.2643644113163801, "learning_rate": 4.450076625552102e-05, "loss": 0.3747, "step": 2428 }, { "epoch": 2.58955223880597, "grad_norm": 0.24060221681359292, "learning_rate": 4.447118299654883e-05, "loss": 0.3599, "step": 2429 }, { "epoch": 2.5906183368869935, "grad_norm": 0.2720300708647726, "learning_rate": 4.444159726077363e-05, "loss": 0.3718, "step": 2430 }, { "epoch": 2.591684434968017, "grad_norm": 0.3229511582480334, "learning_rate": 4.4412009064584384e-05, "loss": 0.3666, "step": 2431 }, { "epoch": 2.5927505330490406, "grad_norm": 0.3888728224230281, "learning_rate": 4.438241842437142e-05, "loss": 0.3654, "step": 2432 }, { "epoch": 2.593816631130064, "grad_norm": 0.38553138291681316, "learning_rate": 4.435282535652641e-05, "loss": 0.3724, "step": 2433 }, { "epoch": 2.594882729211087, "grad_norm": 0.31969838656564453, "learning_rate": 4.4323229877442374e-05, "loss": 0.3694, "step": 2434 }, { "epoch": 2.5959488272921107, "grad_norm": 0.40489827746268775, "learning_rate": 4.429363200351366e-05, "loss": 0.3701, "step": 2435 }, { "epoch": 2.5970149253731343, "grad_norm": 0.4247590203835174, "learning_rate": 4.426403175113598e-05, "loss": 0.3751, "step": 2436 }, { "epoch": 2.598081023454158, "grad_norm": 0.3951246174491148, "learning_rate": 4.423442913670631e-05, "loss": 0.3693, "step": 2437 }, { "epoch": 2.5991471215351813, "grad_norm": 0.3111985733932929, "learning_rate": 4.420482417662297e-05, "loss": 0.3704, "step": 2438 }, { "epoch": 2.6002132196162044, "grad_norm": 0.18873972891323956, "learning_rate": 4.417521688728556e-05, "loss": 0.3661, "step": 2439 }, { "epoch": 2.6012793176972284, "grad_norm": 0.2495555063976354, "learning_rate": 4.4145607285095e-05, "loss": 0.3755, "step": 2440 }, { "epoch": 2.6023454157782515, "grad_norm": 0.36376650288976037, "learning_rate": 4.411599538645345e-05, "loss": 0.3681, "step": 2441 }, { "epoch": 2.603411513859275, "grad_norm": 0.4693799411527801, "learning_rate": 4.408638120776436e-05, "loss": 0.3705, "step": 2442 }, { "epoch": 2.6044776119402986, "grad_norm": 0.46887250402317515, "learning_rate": 4.405676476543247e-05, "loss": 0.3671, "step": 2443 }, { "epoch": 2.605543710021322, "grad_norm": 0.36949597097867015, "learning_rate": 4.402714607586373e-05, "loss": 0.3738, "step": 2444 }, { "epoch": 2.6066098081023457, "grad_norm": 0.3009704252138755, "learning_rate": 4.399752515546538e-05, "loss": 0.3681, "step": 2445 }, { "epoch": 2.6076759061833688, "grad_norm": 0.237310962341784, "learning_rate": 4.396790202064583e-05, "loss": 0.3686, "step": 2446 }, { "epoch": 2.6087420042643923, "grad_norm": 0.23276180946430972, "learning_rate": 4.393827668781478e-05, "loss": 0.3683, "step": 2447 }, { "epoch": 2.609808102345416, "grad_norm": 0.32501790451361395, "learning_rate": 4.390864917338313e-05, "loss": 0.3708, "step": 2448 }, { "epoch": 2.6108742004264394, "grad_norm": 0.3613827214092068, "learning_rate": 4.387901949376297e-05, "loss": 0.3651, "step": 2449 }, { "epoch": 2.611940298507463, "grad_norm": 0.3553110668616482, "learning_rate": 4.3849387665367614e-05, "loss": 0.3728, "step": 2450 }, { "epoch": 2.613006396588486, "grad_norm": 0.292471368689879, "learning_rate": 4.381975370461155e-05, "loss": 0.3745, "step": 2451 }, { "epoch": 2.6140724946695095, "grad_norm": 0.2016201008306948, "learning_rate": 4.379011762791045e-05, "loss": 0.3671, "step": 2452 }, { "epoch": 2.615138592750533, "grad_norm": 0.2616397181578619, "learning_rate": 4.3760479451681164e-05, "loss": 0.3705, "step": 2453 }, { "epoch": 2.6162046908315566, "grad_norm": 0.3997537994958033, "learning_rate": 4.3730839192341705e-05, "loss": 0.3699, "step": 2454 }, { "epoch": 2.61727078891258, "grad_norm": 0.4296512845219555, "learning_rate": 4.370119686631123e-05, "loss": 0.3654, "step": 2455 }, { "epoch": 2.6183368869936032, "grad_norm": 0.327116358825709, "learning_rate": 4.3671552490010036e-05, "loss": 0.372, "step": 2456 }, { "epoch": 2.6194029850746268, "grad_norm": 0.27235595479261326, "learning_rate": 4.3641906079859584e-05, "loss": 0.3698, "step": 2457 }, { "epoch": 2.6204690831556503, "grad_norm": 0.35732218737027005, "learning_rate": 4.3612257652282446e-05, "loss": 0.3667, "step": 2458 }, { "epoch": 2.621535181236674, "grad_norm": 0.37424865085620856, "learning_rate": 4.358260722370229e-05, "loss": 0.364, "step": 2459 }, { "epoch": 2.6226012793176974, "grad_norm": 0.42605572507707096, "learning_rate": 4.355295481054394e-05, "loss": 0.3735, "step": 2460 }, { "epoch": 2.6236673773987205, "grad_norm": 0.45443453638837905, "learning_rate": 4.352330042923328e-05, "loss": 0.372, "step": 2461 }, { "epoch": 2.624733475479744, "grad_norm": 0.41383949227823347, "learning_rate": 4.3493644096197274e-05, "loss": 0.3717, "step": 2462 }, { "epoch": 2.6257995735607675, "grad_norm": 0.2860714841801369, "learning_rate": 4.3463985827864024e-05, "loss": 0.3646, "step": 2463 }, { "epoch": 2.626865671641791, "grad_norm": 0.23137129814316412, "learning_rate": 4.343432564066264e-05, "loss": 0.3701, "step": 2464 }, { "epoch": 2.6279317697228146, "grad_norm": 0.2512842592763747, "learning_rate": 4.340466355102336e-05, "loss": 0.3672, "step": 2465 }, { "epoch": 2.6289978678038377, "grad_norm": 0.28901573296850475, "learning_rate": 4.3374999575377393e-05, "loss": 0.3651, "step": 2466 }, { "epoch": 2.6300639658848612, "grad_norm": 0.2993536460359868, "learning_rate": 4.334533373015709e-05, "loss": 0.3688, "step": 2467 }, { "epoch": 2.631130063965885, "grad_norm": 0.2614300970595663, "learning_rate": 4.3315666031795736e-05, "loss": 0.3678, "step": 2468 }, { "epoch": 2.6321961620469083, "grad_norm": 0.25189646001058735, "learning_rate": 4.328599649672774e-05, "loss": 0.3712, "step": 2469 }, { "epoch": 2.633262260127932, "grad_norm": 0.24766325430481256, "learning_rate": 4.3256325141388464e-05, "loss": 0.3641, "step": 2470 }, { "epoch": 2.6343283582089554, "grad_norm": 0.3035417706772448, "learning_rate": 4.3226651982214286e-05, "loss": 0.3684, "step": 2471 }, { "epoch": 2.635394456289979, "grad_norm": 0.2636223605711239, "learning_rate": 4.319697703564261e-05, "loss": 0.3688, "step": 2472 }, { "epoch": 2.636460554371002, "grad_norm": 0.20666359945758703, "learning_rate": 4.3167300318111805e-05, "loss": 0.3636, "step": 2473 }, { "epoch": 2.6375266524520256, "grad_norm": 0.24577933602515428, "learning_rate": 4.313762184606124e-05, "loss": 0.3681, "step": 2474 }, { "epoch": 2.638592750533049, "grad_norm": 0.23261579434937743, "learning_rate": 4.310794163593123e-05, "loss": 0.3665, "step": 2475 }, { "epoch": 2.6396588486140726, "grad_norm": 0.21528938082866045, "learning_rate": 4.307825970416308e-05, "loss": 0.3712, "step": 2476 }, { "epoch": 2.640724946695096, "grad_norm": 0.2560143422062812, "learning_rate": 4.3048576067199034e-05, "loss": 0.3668, "step": 2477 }, { "epoch": 2.6417910447761193, "grad_norm": 0.24544814002376064, "learning_rate": 4.3018890741482296e-05, "loss": 0.3705, "step": 2478 }, { "epoch": 2.642857142857143, "grad_norm": 0.2439487587446761, "learning_rate": 4.298920374345698e-05, "loss": 0.3656, "step": 2479 }, { "epoch": 2.6439232409381663, "grad_norm": 0.25308388826969747, "learning_rate": 4.295951508956814e-05, "loss": 0.3638, "step": 2480 }, { "epoch": 2.64498933901919, "grad_norm": 0.30616158821118233, "learning_rate": 4.292982479626175e-05, "loss": 0.3628, "step": 2481 }, { "epoch": 2.6460554371002134, "grad_norm": 0.2635196635922222, "learning_rate": 4.290013287998469e-05, "loss": 0.369, "step": 2482 }, { "epoch": 2.6471215351812365, "grad_norm": 0.2737120868957713, "learning_rate": 4.287043935718474e-05, "loss": 0.3729, "step": 2483 }, { "epoch": 2.64818763326226, "grad_norm": 0.31387047069745105, "learning_rate": 4.2840744244310565e-05, "loss": 0.3749, "step": 2484 }, { "epoch": 2.6492537313432836, "grad_norm": 0.29584704047191807, "learning_rate": 4.281104755781172e-05, "loss": 0.371, "step": 2485 }, { "epoch": 2.650319829424307, "grad_norm": 0.31359007568593433, "learning_rate": 4.278134931413862e-05, "loss": 0.3698, "step": 2486 }, { "epoch": 2.6513859275053306, "grad_norm": 0.2606911686791426, "learning_rate": 4.275164952974256e-05, "loss": 0.3712, "step": 2487 }, { "epoch": 2.6524520255863537, "grad_norm": 0.2110093354675097, "learning_rate": 4.272194822107566e-05, "loss": 0.3699, "step": 2488 }, { "epoch": 2.6535181236673773, "grad_norm": 0.1961521426965646, "learning_rate": 4.2692245404590906e-05, "loss": 0.366, "step": 2489 }, { "epoch": 2.654584221748401, "grad_norm": 0.2515365045262743, "learning_rate": 4.266254109674213e-05, "loss": 0.3683, "step": 2490 }, { "epoch": 2.6556503198294243, "grad_norm": 0.3249680113246906, "learning_rate": 4.263283531398395e-05, "loss": 0.3637, "step": 2491 }, { "epoch": 2.656716417910448, "grad_norm": 0.3118378628242833, "learning_rate": 4.260312807277185e-05, "loss": 0.3719, "step": 2492 }, { "epoch": 2.657782515991471, "grad_norm": 0.22886489658313464, "learning_rate": 4.2573419389562074e-05, "loss": 0.3715, "step": 2493 }, { "epoch": 2.6588486140724945, "grad_norm": 0.1662131591027839, "learning_rate": 4.254370928081171e-05, "loss": 0.3772, "step": 2494 }, { "epoch": 2.659914712153518, "grad_norm": 0.1969499093984148, "learning_rate": 4.25139977629786e-05, "loss": 0.3699, "step": 2495 }, { "epoch": 2.6609808102345416, "grad_norm": 0.25930037314010335, "learning_rate": 4.248428485252139e-05, "loss": 0.3655, "step": 2496 }, { "epoch": 2.662046908315565, "grad_norm": 0.26415096213448547, "learning_rate": 4.2454570565899476e-05, "loss": 0.3734, "step": 2497 }, { "epoch": 2.663113006396588, "grad_norm": 0.27116001496004927, "learning_rate": 4.242485491957305e-05, "loss": 0.3657, "step": 2498 }, { "epoch": 2.664179104477612, "grad_norm": 0.25863792879198805, "learning_rate": 4.239513793000301e-05, "loss": 0.3753, "step": 2499 }, { "epoch": 2.6652452025586353, "grad_norm": 0.23133617297550452, "learning_rate": 4.2365419613651035e-05, "loss": 0.3737, "step": 2500 }, { "epoch": 2.666311300639659, "grad_norm": 0.20980481301021206, "learning_rate": 4.233569998697954e-05, "loss": 0.3701, "step": 2501 }, { "epoch": 2.6673773987206824, "grad_norm": 0.24297415494208804, "learning_rate": 4.2305979066451626e-05, "loss": 0.3677, "step": 2502 }, { "epoch": 2.668443496801706, "grad_norm": 0.2493426212110305, "learning_rate": 4.2276256868531165e-05, "loss": 0.3652, "step": 2503 }, { "epoch": 2.6695095948827294, "grad_norm": 0.2303457530378945, "learning_rate": 4.22465334096827e-05, "loss": 0.3771, "step": 2504 }, { "epoch": 2.6705756929637525, "grad_norm": 0.223769372564616, "learning_rate": 4.221680870637148e-05, "loss": 0.3673, "step": 2505 }, { "epoch": 2.671641791044776, "grad_norm": 0.2590177839797346, "learning_rate": 4.2187082775063436e-05, "loss": 0.3685, "step": 2506 }, { "epoch": 2.6727078891257996, "grad_norm": 0.24760592724404568, "learning_rate": 4.2157355632225216e-05, "loss": 0.368, "step": 2507 }, { "epoch": 2.673773987206823, "grad_norm": 0.22621973085952393, "learning_rate": 4.2127627294324095e-05, "loss": 0.3676, "step": 2508 }, { "epoch": 2.6748400852878467, "grad_norm": 0.2335594327305558, "learning_rate": 4.2097897777828025e-05, "loss": 0.3743, "step": 2509 }, { "epoch": 2.6759061833688698, "grad_norm": 0.2281132024099097, "learning_rate": 4.2068167099205625e-05, "loss": 0.3692, "step": 2510 }, { "epoch": 2.6769722814498933, "grad_norm": 0.22341724033801894, "learning_rate": 4.203843527492613e-05, "loss": 0.3702, "step": 2511 }, { "epoch": 2.678038379530917, "grad_norm": 0.2424014361387363, "learning_rate": 4.200870232145943e-05, "loss": 0.3706, "step": 2512 }, { "epoch": 2.6791044776119404, "grad_norm": 0.1953899351329716, "learning_rate": 4.1978968255276043e-05, "loss": 0.3652, "step": 2513 }, { "epoch": 2.680170575692964, "grad_norm": 0.22170371983312567, "learning_rate": 4.1949233092847095e-05, "loss": 0.3684, "step": 2514 }, { "epoch": 2.681236673773987, "grad_norm": 0.23765221374061374, "learning_rate": 4.1919496850644316e-05, "loss": 0.3637, "step": 2515 }, { "epoch": 2.6823027718550105, "grad_norm": 0.23687777883894273, "learning_rate": 4.1889759545140045e-05, "loss": 0.3655, "step": 2516 }, { "epoch": 2.683368869936034, "grad_norm": 0.22338536825632072, "learning_rate": 4.186002119280718e-05, "loss": 0.3707, "step": 2517 }, { "epoch": 2.6844349680170576, "grad_norm": 0.22005314669076753, "learning_rate": 4.183028181011927e-05, "loss": 0.374, "step": 2518 }, { "epoch": 2.685501066098081, "grad_norm": 0.2303045665765263, "learning_rate": 4.180054141355035e-05, "loss": 0.3747, "step": 2519 }, { "epoch": 2.6865671641791042, "grad_norm": 0.18534817689847655, "learning_rate": 4.177080001957506e-05, "loss": 0.3667, "step": 2520 }, { "epoch": 2.6876332622601278, "grad_norm": 0.2629553843000235, "learning_rate": 4.174105764466859e-05, "loss": 0.3705, "step": 2521 }, { "epoch": 2.6886993603411513, "grad_norm": 0.29148782597094064, "learning_rate": 4.1711314305306676e-05, "loss": 0.3704, "step": 2522 }, { "epoch": 2.689765458422175, "grad_norm": 0.2811624835341636, "learning_rate": 4.168157001796557e-05, "loss": 0.3723, "step": 2523 }, { "epoch": 2.6908315565031984, "grad_norm": 0.24880403947380741, "learning_rate": 4.165182479912208e-05, "loss": 0.367, "step": 2524 }, { "epoch": 2.6918976545842215, "grad_norm": 0.2008980746882621, "learning_rate": 4.1622078665253486e-05, "loss": 0.3661, "step": 2525 }, { "epoch": 2.6929637526652455, "grad_norm": 0.23350481252873526, "learning_rate": 4.159233163283762e-05, "loss": 0.3762, "step": 2526 }, { "epoch": 2.6940298507462686, "grad_norm": 0.20261861943939447, "learning_rate": 4.156258371835279e-05, "loss": 0.3701, "step": 2527 }, { "epoch": 2.695095948827292, "grad_norm": 0.25996452870016273, "learning_rate": 4.153283493827777e-05, "loss": 0.3776, "step": 2528 }, { "epoch": 2.6961620469083156, "grad_norm": 0.19687929153495226, "learning_rate": 4.150308530909187e-05, "loss": 0.3636, "step": 2529 }, { "epoch": 2.697228144989339, "grad_norm": 0.19556973866759192, "learning_rate": 4.147333484727484e-05, "loss": 0.3586, "step": 2530 }, { "epoch": 2.6982942430703627, "grad_norm": 0.21721440727421978, "learning_rate": 4.144358356930686e-05, "loss": 0.3685, "step": 2531 }, { "epoch": 2.699360341151386, "grad_norm": 0.2289561477297396, "learning_rate": 4.141383149166861e-05, "loss": 0.3693, "step": 2532 }, { "epoch": 2.7004264392324093, "grad_norm": 0.21622654938339614, "learning_rate": 4.138407863084119e-05, "loss": 0.3676, "step": 2533 }, { "epoch": 2.701492537313433, "grad_norm": 0.20523535800109563, "learning_rate": 4.1354325003306146e-05, "loss": 0.369, "step": 2534 }, { "epoch": 2.7025586353944564, "grad_norm": 0.24463058005414473, "learning_rate": 4.132457062554543e-05, "loss": 0.3677, "step": 2535 }, { "epoch": 2.70362473347548, "grad_norm": 0.1851731295878307, "learning_rate": 4.129481551404143e-05, "loss": 0.374, "step": 2536 }, { "epoch": 2.704690831556503, "grad_norm": 0.19922292111977952, "learning_rate": 4.1265059685276936e-05, "loss": 0.3654, "step": 2537 }, { "epoch": 2.7057569296375266, "grad_norm": 0.2752693144696535, "learning_rate": 4.123530315573512e-05, "loss": 0.3694, "step": 2538 }, { "epoch": 2.70682302771855, "grad_norm": 0.23479171085838138, "learning_rate": 4.120554594189955e-05, "loss": 0.3734, "step": 2539 }, { "epoch": 2.7078891257995736, "grad_norm": 0.1999301041207985, "learning_rate": 4.117578806025419e-05, "loss": 0.3674, "step": 2540 }, { "epoch": 2.708955223880597, "grad_norm": 0.21844202367376767, "learning_rate": 4.114602952728335e-05, "loss": 0.3695, "step": 2541 }, { "epoch": 2.7100213219616203, "grad_norm": 0.22666372255765088, "learning_rate": 4.111627035947171e-05, "loss": 0.3675, "step": 2542 }, { "epoch": 2.711087420042644, "grad_norm": 0.20627199369884083, "learning_rate": 4.108651057330432e-05, "loss": 0.375, "step": 2543 }, { "epoch": 2.7121535181236673, "grad_norm": 0.17843299855865702, "learning_rate": 4.1056750185266515e-05, "loss": 0.3685, "step": 2544 }, { "epoch": 2.713219616204691, "grad_norm": 0.2435265879503595, "learning_rate": 4.102698921184405e-05, "loss": 0.37, "step": 2545 }, { "epoch": 2.7142857142857144, "grad_norm": 0.2682269696770955, "learning_rate": 4.0997227669522924e-05, "loss": 0.3607, "step": 2546 }, { "epoch": 2.7153518123667375, "grad_norm": 0.23667515131488986, "learning_rate": 4.096746557478949e-05, "loss": 0.372, "step": 2547 }, { "epoch": 2.716417910447761, "grad_norm": 0.20880475425387418, "learning_rate": 4.0937702944130426e-05, "loss": 0.368, "step": 2548 }, { "epoch": 2.7174840085287846, "grad_norm": 0.18392143745348416, "learning_rate": 4.0907939794032654e-05, "loss": 0.3733, "step": 2549 }, { "epoch": 2.718550106609808, "grad_norm": 0.2165652019434693, "learning_rate": 4.087817614098343e-05, "loss": 0.3751, "step": 2550 }, { "epoch": 2.7196162046908317, "grad_norm": 0.24037086751445425, "learning_rate": 4.084841200147025e-05, "loss": 0.3739, "step": 2551 }, { "epoch": 2.7206823027718547, "grad_norm": 0.2501714427485925, "learning_rate": 4.0818647391980926e-05, "loss": 0.3737, "step": 2552 }, { "epoch": 2.7217484008528787, "grad_norm": 0.2344141067398134, "learning_rate": 4.078888232900349e-05, "loss": 0.3698, "step": 2553 }, { "epoch": 2.722814498933902, "grad_norm": 0.2349463019216503, "learning_rate": 4.075911682902625e-05, "loss": 0.3736, "step": 2554 }, { "epoch": 2.7238805970149254, "grad_norm": 0.20029237944534903, "learning_rate": 4.0729350908537724e-05, "loss": 0.3757, "step": 2555 }, { "epoch": 2.724946695095949, "grad_norm": 0.27878488904776033, "learning_rate": 4.069958458402671e-05, "loss": 0.3683, "step": 2556 }, { "epoch": 2.7260127931769724, "grad_norm": 0.29774631195776696, "learning_rate": 4.06698178719822e-05, "loss": 0.3685, "step": 2557 }, { "epoch": 2.727078891257996, "grad_norm": 0.2104346445232383, "learning_rate": 4.0640050788893386e-05, "loss": 0.3652, "step": 2558 }, { "epoch": 2.728144989339019, "grad_norm": 0.2317958367423688, "learning_rate": 4.0610283351249716e-05, "loss": 0.3692, "step": 2559 }, { "epoch": 2.7292110874200426, "grad_norm": 0.24510244686355384, "learning_rate": 4.058051557554078e-05, "loss": 0.3665, "step": 2560 }, { "epoch": 2.730277185501066, "grad_norm": 0.19542448243466262, "learning_rate": 4.0550747478256384e-05, "loss": 0.3648, "step": 2561 }, { "epoch": 2.7313432835820897, "grad_norm": 0.2389772106956452, "learning_rate": 4.052097907588652e-05, "loss": 0.3703, "step": 2562 }, { "epoch": 2.732409381663113, "grad_norm": 0.3107936261643165, "learning_rate": 4.049121038492131e-05, "loss": 0.3788, "step": 2563 }, { "epoch": 2.7334754797441363, "grad_norm": 0.2919215370644085, "learning_rate": 4.0461441421851075e-05, "loss": 0.3606, "step": 2564 }, { "epoch": 2.73454157782516, "grad_norm": 0.24187272238294624, "learning_rate": 4.043167220316628e-05, "loss": 0.3758, "step": 2565 }, { "epoch": 2.7356076759061834, "grad_norm": 0.24598827183680877, "learning_rate": 4.040190274535752e-05, "loss": 0.3715, "step": 2566 }, { "epoch": 2.736673773987207, "grad_norm": 0.21275237895976437, "learning_rate": 4.037213306491552e-05, "loss": 0.3739, "step": 2567 }, { "epoch": 2.7377398720682304, "grad_norm": 0.25491281948796235, "learning_rate": 4.0342363178331146e-05, "loss": 0.3671, "step": 2568 }, { "epoch": 2.7388059701492535, "grad_norm": 0.2833091097168945, "learning_rate": 4.031259310209536e-05, "loss": 0.371, "step": 2569 }, { "epoch": 2.739872068230277, "grad_norm": 0.18872311871080574, "learning_rate": 4.028282285269925e-05, "loss": 0.3759, "step": 2570 }, { "epoch": 2.7409381663113006, "grad_norm": 0.19447861987862541, "learning_rate": 4.0253052446633966e-05, "loss": 0.3713, "step": 2571 }, { "epoch": 2.742004264392324, "grad_norm": 0.20648269121909482, "learning_rate": 4.022328190039079e-05, "loss": 0.3647, "step": 2572 }, { "epoch": 2.7430703624733477, "grad_norm": 0.1961509547861183, "learning_rate": 4.019351123046104e-05, "loss": 0.3594, "step": 2573 }, { "epoch": 2.7441364605543708, "grad_norm": 0.21126880262710795, "learning_rate": 4.0163740453336125e-05, "loss": 0.3675, "step": 2574 }, { "epoch": 2.7452025586353943, "grad_norm": 0.20602815602021118, "learning_rate": 4.0133969585507514e-05, "loss": 0.3674, "step": 2575 }, { "epoch": 2.746268656716418, "grad_norm": 0.21819379544686981, "learning_rate": 4.010419864346671e-05, "loss": 0.3759, "step": 2576 }, { "epoch": 2.7473347547974414, "grad_norm": 0.2076650890871053, "learning_rate": 4.0074427643705274e-05, "loss": 0.3676, "step": 2577 }, { "epoch": 2.748400852878465, "grad_norm": 0.2017797586755664, "learning_rate": 4.004465660271479e-05, "loss": 0.3685, "step": 2578 }, { "epoch": 2.749466950959488, "grad_norm": 0.2744626448821499, "learning_rate": 4.001488553698687e-05, "loss": 0.3691, "step": 2579 }, { "epoch": 2.750533049040512, "grad_norm": 0.22575769767487924, "learning_rate": 3.998511446301315e-05, "loss": 0.3713, "step": 2580 }, { "epoch": 2.751599147121535, "grad_norm": 0.18090826243386235, "learning_rate": 3.995534339728522e-05, "loss": 0.3643, "step": 2581 }, { "epoch": 2.7526652452025586, "grad_norm": 0.1810203385011346, "learning_rate": 3.992557235629473e-05, "loss": 0.3679, "step": 2582 }, { "epoch": 2.753731343283582, "grad_norm": 0.2004645021897271, "learning_rate": 3.989580135653329e-05, "loss": 0.3623, "step": 2583 }, { "epoch": 2.7547974413646057, "grad_norm": 0.20665259172252912, "learning_rate": 3.98660304144925e-05, "loss": 0.3709, "step": 2584 }, { "epoch": 2.7558635394456292, "grad_norm": 0.21652327508884206, "learning_rate": 3.983625954666389e-05, "loss": 0.3638, "step": 2585 }, { "epoch": 2.7569296375266523, "grad_norm": 0.1905584143867772, "learning_rate": 3.9806488769538966e-05, "loss": 0.3698, "step": 2586 }, { "epoch": 2.757995735607676, "grad_norm": 0.2122311206240018, "learning_rate": 3.977671809960923e-05, "loss": 0.3671, "step": 2587 }, { "epoch": 2.7590618336886994, "grad_norm": 0.29171836988191924, "learning_rate": 3.974694755336604e-05, "loss": 0.3693, "step": 2588 }, { "epoch": 2.760127931769723, "grad_norm": 0.27181002702035517, "learning_rate": 3.971717714730076e-05, "loss": 0.3665, "step": 2589 }, { "epoch": 2.7611940298507465, "grad_norm": 0.21014364337275343, "learning_rate": 3.968740689790464e-05, "loss": 0.3729, "step": 2590 }, { "epoch": 2.7622601279317696, "grad_norm": 0.2526245805801581, "learning_rate": 3.9657636821668874e-05, "loss": 0.3713, "step": 2591 }, { "epoch": 2.763326226012793, "grad_norm": 0.25768688989060495, "learning_rate": 3.9627866935084496e-05, "loss": 0.3689, "step": 2592 }, { "epoch": 2.7643923240938166, "grad_norm": 0.2526926794684304, "learning_rate": 3.959809725464249e-05, "loss": 0.369, "step": 2593 }, { "epoch": 2.76545842217484, "grad_norm": 0.28824876900515534, "learning_rate": 3.956832779683374e-05, "loss": 0.3716, "step": 2594 }, { "epoch": 2.7665245202558637, "grad_norm": 0.26584418015578326, "learning_rate": 3.953855857814894e-05, "loss": 0.3703, "step": 2595 }, { "epoch": 2.767590618336887, "grad_norm": 0.23043100400918048, "learning_rate": 3.950878961507871e-05, "loss": 0.3652, "step": 2596 }, { "epoch": 2.7686567164179103, "grad_norm": 0.2502510778112517, "learning_rate": 3.94790209241135e-05, "loss": 0.3672, "step": 2597 }, { "epoch": 2.769722814498934, "grad_norm": 0.2639439073598543, "learning_rate": 3.944925252174363e-05, "loss": 0.3788, "step": 2598 }, { "epoch": 2.7707889125799574, "grad_norm": 0.2841116000042142, "learning_rate": 3.9419484424459235e-05, "loss": 0.3727, "step": 2599 }, { "epoch": 2.771855010660981, "grad_norm": 0.2735556859075221, "learning_rate": 3.938971664875029e-05, "loss": 0.3717, "step": 2600 }, { "epoch": 2.772921108742004, "grad_norm": 0.28637597882164606, "learning_rate": 3.935994921110661e-05, "loss": 0.3688, "step": 2601 }, { "epoch": 2.7739872068230276, "grad_norm": 0.265199180250517, "learning_rate": 3.933018212801782e-05, "loss": 0.3756, "step": 2602 }, { "epoch": 2.775053304904051, "grad_norm": 0.24641376266091428, "learning_rate": 3.9300415415973295e-05, "loss": 0.3694, "step": 2603 }, { "epoch": 2.7761194029850746, "grad_norm": 0.2029121854284716, "learning_rate": 3.9270649091462276e-05, "loss": 0.3619, "step": 2604 }, { "epoch": 2.777185501066098, "grad_norm": 0.31971921589949576, "learning_rate": 3.9240883170973776e-05, "loss": 0.3664, "step": 2605 }, { "epoch": 2.7782515991471213, "grad_norm": 0.34026995782754504, "learning_rate": 3.9211117670996524e-05, "loss": 0.3709, "step": 2606 }, { "epoch": 2.7793176972281453, "grad_norm": 0.31436146070519905, "learning_rate": 3.918135260801908e-05, "loss": 0.3697, "step": 2607 }, { "epoch": 2.7803837953091683, "grad_norm": 0.30334671667064705, "learning_rate": 3.9151587998529754e-05, "loss": 0.3694, "step": 2608 }, { "epoch": 2.781449893390192, "grad_norm": 0.26731009437526165, "learning_rate": 3.912182385901659e-05, "loss": 0.368, "step": 2609 }, { "epoch": 2.7825159914712154, "grad_norm": 0.27147115708275976, "learning_rate": 3.909206020596736e-05, "loss": 0.3626, "step": 2610 }, { "epoch": 2.783582089552239, "grad_norm": 0.32889706955527953, "learning_rate": 3.906229705586959e-05, "loss": 0.3696, "step": 2611 }, { "epoch": 2.7846481876332625, "grad_norm": 0.3499202779561899, "learning_rate": 3.903253442521051e-05, "loss": 0.368, "step": 2612 }, { "epoch": 2.7857142857142856, "grad_norm": 0.2870452604692784, "learning_rate": 3.9002772330477096e-05, "loss": 0.3678, "step": 2613 }, { "epoch": 2.786780383795309, "grad_norm": 0.2959372810749118, "learning_rate": 3.897301078815597e-05, "loss": 0.3666, "step": 2614 }, { "epoch": 2.7878464818763327, "grad_norm": 0.251783830117275, "learning_rate": 3.894324981473349e-05, "loss": 0.369, "step": 2615 }, { "epoch": 2.788912579957356, "grad_norm": 0.20368390077877208, "learning_rate": 3.891348942669571e-05, "loss": 0.3659, "step": 2616 }, { "epoch": 2.7899786780383797, "grad_norm": 0.3014830515949927, "learning_rate": 3.88837296405283e-05, "loss": 0.3645, "step": 2617 }, { "epoch": 2.791044776119403, "grad_norm": 0.26718545932498405, "learning_rate": 3.8853970472716656e-05, "loss": 0.365, "step": 2618 }, { "epoch": 2.7921108742004264, "grad_norm": 0.24899810548394205, "learning_rate": 3.882421193974581e-05, "loss": 0.369, "step": 2619 }, { "epoch": 2.79317697228145, "grad_norm": 0.2285678956121162, "learning_rate": 3.879445405810047e-05, "loss": 0.3654, "step": 2620 }, { "epoch": 2.7942430703624734, "grad_norm": 0.25841393123219936, "learning_rate": 3.876469684426489e-05, "loss": 0.3705, "step": 2621 }, { "epoch": 2.795309168443497, "grad_norm": 0.21957675621749626, "learning_rate": 3.873494031472307e-05, "loss": 0.3706, "step": 2622 }, { "epoch": 2.79637526652452, "grad_norm": 0.20277378161854126, "learning_rate": 3.870518448595858e-05, "loss": 0.3708, "step": 2623 }, { "epoch": 2.7974413646055436, "grad_norm": 0.19507078530612013, "learning_rate": 3.867542937445458e-05, "loss": 0.3695, "step": 2624 }, { "epoch": 2.798507462686567, "grad_norm": 0.22778058725192096, "learning_rate": 3.864567499669387e-05, "loss": 0.3704, "step": 2625 }, { "epoch": 2.7995735607675907, "grad_norm": 0.2139823499648018, "learning_rate": 3.861592136915881e-05, "loss": 0.368, "step": 2626 }, { "epoch": 2.800639658848614, "grad_norm": 0.2755772492578195, "learning_rate": 3.858616850833141e-05, "loss": 0.3744, "step": 2627 }, { "epoch": 2.8017057569296373, "grad_norm": 0.22789410651650585, "learning_rate": 3.855641643069316e-05, "loss": 0.3691, "step": 2628 }, { "epoch": 2.802771855010661, "grad_norm": 0.19420102545723644, "learning_rate": 3.852666515272517e-05, "loss": 0.3701, "step": 2629 }, { "epoch": 2.8038379530916844, "grad_norm": 0.22416479381704973, "learning_rate": 3.849691469090814e-05, "loss": 0.3657, "step": 2630 }, { "epoch": 2.804904051172708, "grad_norm": 0.2226881113118721, "learning_rate": 3.846716506172224e-05, "loss": 0.3712, "step": 2631 }, { "epoch": 2.8059701492537314, "grad_norm": 0.1957187061751904, "learning_rate": 3.8437416281647226e-05, "loss": 0.3686, "step": 2632 }, { "epoch": 2.8070362473347545, "grad_norm": 0.196407100670983, "learning_rate": 3.8407668367162397e-05, "loss": 0.3638, "step": 2633 }, { "epoch": 2.8081023454157785, "grad_norm": 0.1860094576496804, "learning_rate": 3.837792133474653e-05, "loss": 0.364, "step": 2634 }, { "epoch": 2.8091684434968016, "grad_norm": 0.1954370294920811, "learning_rate": 3.8348175200877934e-05, "loss": 0.3699, "step": 2635 }, { "epoch": 2.810234541577825, "grad_norm": 0.2202007081382702, "learning_rate": 3.831842998203444e-05, "loss": 0.367, "step": 2636 }, { "epoch": 2.8113006396588487, "grad_norm": 0.18691426984371892, "learning_rate": 3.828868569469333e-05, "loss": 0.3675, "step": 2637 }, { "epoch": 2.8123667377398722, "grad_norm": 0.16936594094488316, "learning_rate": 3.825894235533143e-05, "loss": 0.3606, "step": 2638 }, { "epoch": 2.8134328358208958, "grad_norm": 0.2188119066765755, "learning_rate": 3.8229199980424957e-05, "loss": 0.371, "step": 2639 }, { "epoch": 2.814498933901919, "grad_norm": 0.19720285663307738, "learning_rate": 3.819945858644966e-05, "loss": 0.3647, "step": 2640 }, { "epoch": 2.8155650319829424, "grad_norm": 0.2081234890857164, "learning_rate": 3.8169718189880735e-05, "loss": 0.3663, "step": 2641 }, { "epoch": 2.816631130063966, "grad_norm": 0.1684341668760121, "learning_rate": 3.8139978807192824e-05, "loss": 0.367, "step": 2642 }, { "epoch": 2.8176972281449895, "grad_norm": 0.1883701323467958, "learning_rate": 3.8110240454859975e-05, "loss": 0.3665, "step": 2643 }, { "epoch": 2.818763326226013, "grad_norm": 0.18509131366924267, "learning_rate": 3.808050314935569e-05, "loss": 0.3658, "step": 2644 }, { "epoch": 2.819829424307036, "grad_norm": 0.18175683445064622, "learning_rate": 3.8050766907152925e-05, "loss": 0.3685, "step": 2645 }, { "epoch": 2.8208955223880596, "grad_norm": 0.1749386564896647, "learning_rate": 3.802103174472397e-05, "loss": 0.364, "step": 2646 }, { "epoch": 2.821961620469083, "grad_norm": 0.1877228031849092, "learning_rate": 3.799129767854058e-05, "loss": 0.3659, "step": 2647 }, { "epoch": 2.8230277185501067, "grad_norm": 0.23830061584479004, "learning_rate": 3.796156472507388e-05, "loss": 0.3681, "step": 2648 }, { "epoch": 2.8240938166311302, "grad_norm": 0.20524827433645737, "learning_rate": 3.79318329007944e-05, "loss": 0.3688, "step": 2649 }, { "epoch": 2.8251599147121533, "grad_norm": 0.20637998521159778, "learning_rate": 3.790210222217199e-05, "loss": 0.3704, "step": 2650 }, { "epoch": 2.826226012793177, "grad_norm": 0.18452122451122424, "learning_rate": 3.787237270567591e-05, "loss": 0.3687, "step": 2651 }, { "epoch": 2.8272921108742004, "grad_norm": 0.2102244859465425, "learning_rate": 3.7842644367774804e-05, "loss": 0.3719, "step": 2652 }, { "epoch": 2.828358208955224, "grad_norm": 0.17836976761573398, "learning_rate": 3.781291722493657e-05, "loss": 0.3701, "step": 2653 }, { "epoch": 2.8294243070362475, "grad_norm": 0.20742348576542263, "learning_rate": 3.7783191293628535e-05, "loss": 0.3703, "step": 2654 }, { "epoch": 2.8304904051172706, "grad_norm": 0.19693977146930702, "learning_rate": 3.775346659031731e-05, "loss": 0.3684, "step": 2655 }, { "epoch": 2.831556503198294, "grad_norm": 0.2143540021763547, "learning_rate": 3.7723743131468855e-05, "loss": 0.3677, "step": 2656 }, { "epoch": 2.8326226012793176, "grad_norm": 0.2456330609899814, "learning_rate": 3.769402093354838e-05, "loss": 0.3673, "step": 2657 }, { "epoch": 2.833688699360341, "grad_norm": 0.2504165053266609, "learning_rate": 3.766430001302047e-05, "loss": 0.3684, "step": 2658 }, { "epoch": 2.8347547974413647, "grad_norm": 0.20402042897097772, "learning_rate": 3.7634580386348965e-05, "loss": 0.3686, "step": 2659 }, { "epoch": 2.835820895522388, "grad_norm": 0.2261020174040382, "learning_rate": 3.7604862069997006e-05, "loss": 0.3646, "step": 2660 }, { "epoch": 2.836886993603412, "grad_norm": 0.18453288873034465, "learning_rate": 3.7575145080426966e-05, "loss": 0.3645, "step": 2661 }, { "epoch": 2.837953091684435, "grad_norm": 0.22623379991968529, "learning_rate": 3.7545429434100524e-05, "loss": 0.3717, "step": 2662 }, { "epoch": 2.8390191897654584, "grad_norm": 0.2781308273184132, "learning_rate": 3.751571514747863e-05, "loss": 0.3727, "step": 2663 }, { "epoch": 2.840085287846482, "grad_norm": 0.2095156006824654, "learning_rate": 3.748600223702141e-05, "loss": 0.3709, "step": 2664 }, { "epoch": 2.8411513859275055, "grad_norm": 0.19821375447623568, "learning_rate": 3.7456290719188294e-05, "loss": 0.3716, "step": 2665 }, { "epoch": 2.842217484008529, "grad_norm": 0.19589404239503924, "learning_rate": 3.742658061043793e-05, "loss": 0.3686, "step": 2666 }, { "epoch": 2.843283582089552, "grad_norm": 0.2663508610327882, "learning_rate": 3.7396871927228165e-05, "loss": 0.3699, "step": 2667 }, { "epoch": 2.8443496801705757, "grad_norm": 0.22076482818199494, "learning_rate": 3.7367164686016055e-05, "loss": 0.3738, "step": 2668 }, { "epoch": 2.845415778251599, "grad_norm": 0.15927170037082947, "learning_rate": 3.733745890325788e-05, "loss": 0.3658, "step": 2669 }, { "epoch": 2.8464818763326227, "grad_norm": 0.20652944107978283, "learning_rate": 3.7307754595409094e-05, "loss": 0.3637, "step": 2670 }, { "epoch": 2.8475479744136463, "grad_norm": 0.24357837304067567, "learning_rate": 3.727805177892435e-05, "loss": 0.3693, "step": 2671 }, { "epoch": 2.8486140724946694, "grad_norm": 0.245995883074642, "learning_rate": 3.7248350470257456e-05, "loss": 0.3672, "step": 2672 }, { "epoch": 2.849680170575693, "grad_norm": 0.19478885206841665, "learning_rate": 3.721865068586138e-05, "loss": 0.37, "step": 2673 }, { "epoch": 2.8507462686567164, "grad_norm": 0.2530400593412467, "learning_rate": 3.71889524421883e-05, "loss": 0.3649, "step": 2674 }, { "epoch": 2.85181236673774, "grad_norm": 0.2694796185289994, "learning_rate": 3.715925575568945e-05, "loss": 0.3674, "step": 2675 }, { "epoch": 2.8528784648187635, "grad_norm": 0.2246054293784152, "learning_rate": 3.712956064281527e-05, "loss": 0.3628, "step": 2676 }, { "epoch": 2.8539445628997866, "grad_norm": 0.204940141551566, "learning_rate": 3.7099867120015316e-05, "loss": 0.3711, "step": 2677 }, { "epoch": 2.85501066098081, "grad_norm": 0.2634467161792027, "learning_rate": 3.707017520373827e-05, "loss": 0.3711, "step": 2678 }, { "epoch": 2.8560767590618337, "grad_norm": 0.2790273311361743, "learning_rate": 3.7040484910431874e-05, "loss": 0.3692, "step": 2679 }, { "epoch": 2.857142857142857, "grad_norm": 0.1836831295255749, "learning_rate": 3.7010796256543034e-05, "loss": 0.3698, "step": 2680 }, { "epoch": 2.8582089552238807, "grad_norm": 0.2364870469211339, "learning_rate": 3.6981109258517724e-05, "loss": 0.3703, "step": 2681 }, { "epoch": 2.859275053304904, "grad_norm": 0.27041513703480374, "learning_rate": 3.695142393280098e-05, "loss": 0.3665, "step": 2682 }, { "epoch": 2.8603411513859274, "grad_norm": 0.20251757763705422, "learning_rate": 3.692174029583693e-05, "loss": 0.3744, "step": 2683 }, { "epoch": 2.861407249466951, "grad_norm": 0.23004902486067985, "learning_rate": 3.689205836406878e-05, "loss": 0.3666, "step": 2684 }, { "epoch": 2.8624733475479744, "grad_norm": 0.210050309914908, "learning_rate": 3.686237815393878e-05, "loss": 0.3726, "step": 2685 }, { "epoch": 2.863539445628998, "grad_norm": 0.2447284603799976, "learning_rate": 3.683269968188821e-05, "loss": 0.3665, "step": 2686 }, { "epoch": 2.864605543710021, "grad_norm": 0.1878574669176508, "learning_rate": 3.68030229643574e-05, "loss": 0.3739, "step": 2687 }, { "epoch": 2.8656716417910446, "grad_norm": 0.20854764527126085, "learning_rate": 3.6773348017785714e-05, "loss": 0.3671, "step": 2688 }, { "epoch": 2.866737739872068, "grad_norm": 0.20789517420801748, "learning_rate": 3.6743674858611556e-05, "loss": 0.3767, "step": 2689 }, { "epoch": 2.8678038379530917, "grad_norm": 0.17804786727873217, "learning_rate": 3.6714003503272265e-05, "loss": 0.3627, "step": 2690 }, { "epoch": 2.868869936034115, "grad_norm": 0.1808758602651246, "learning_rate": 3.668433396820426e-05, "loss": 0.3702, "step": 2691 }, { "epoch": 2.8699360341151388, "grad_norm": 0.23463363469678927, "learning_rate": 3.665466626984294e-05, "loss": 0.3703, "step": 2692 }, { "epoch": 2.8710021321961623, "grad_norm": 0.246472560729465, "learning_rate": 3.662500042462262e-05, "loss": 0.361, "step": 2693 }, { "epoch": 2.8720682302771854, "grad_norm": 0.20564287257852504, "learning_rate": 3.6595336448976655e-05, "loss": 0.3701, "step": 2694 }, { "epoch": 2.873134328358209, "grad_norm": 0.2106812492583941, "learning_rate": 3.656567435933736e-05, "loss": 0.3628, "step": 2695 }, { "epoch": 2.8742004264392325, "grad_norm": 0.20453409365105865, "learning_rate": 3.6536014172135996e-05, "loss": 0.3649, "step": 2696 }, { "epoch": 2.875266524520256, "grad_norm": 0.20646291714378256, "learning_rate": 3.650635590380274e-05, "loss": 0.3666, "step": 2697 }, { "epoch": 2.8763326226012795, "grad_norm": 0.1836576189111925, "learning_rate": 3.647669957076674e-05, "loss": 0.3686, "step": 2698 }, { "epoch": 2.8773987206823026, "grad_norm": 0.21929175594887504, "learning_rate": 3.644704518945607e-05, "loss": 0.3747, "step": 2699 }, { "epoch": 2.878464818763326, "grad_norm": 0.18711665960997673, "learning_rate": 3.641739277629772e-05, "loss": 0.3641, "step": 2700 }, { "epoch": 2.8795309168443497, "grad_norm": 0.17522013442035012, "learning_rate": 3.638774234771757e-05, "loss": 0.3799, "step": 2701 }, { "epoch": 2.8805970149253732, "grad_norm": 0.17988902594802456, "learning_rate": 3.635809392014042e-05, "loss": 0.3703, "step": 2702 }, { "epoch": 2.8816631130063968, "grad_norm": 0.17377049259647168, "learning_rate": 3.632844750998998e-05, "loss": 0.3755, "step": 2703 }, { "epoch": 2.88272921108742, "grad_norm": 0.19092248556619265, "learning_rate": 3.629880313368879e-05, "loss": 0.3632, "step": 2704 }, { "epoch": 2.8837953091684434, "grad_norm": 0.18451802560202984, "learning_rate": 3.6269160807658315e-05, "loss": 0.3685, "step": 2705 }, { "epoch": 2.884861407249467, "grad_norm": 0.16498456390879312, "learning_rate": 3.6239520548318836e-05, "loss": 0.3645, "step": 2706 }, { "epoch": 2.8859275053304905, "grad_norm": 0.19518509163645606, "learning_rate": 3.620988237208956e-05, "loss": 0.3779, "step": 2707 }, { "epoch": 2.886993603411514, "grad_norm": 0.2093799236223249, "learning_rate": 3.6180246295388465e-05, "loss": 0.3683, "step": 2708 }, { "epoch": 2.888059701492537, "grad_norm": 0.18513792614540311, "learning_rate": 3.615061233463239e-05, "loss": 0.3673, "step": 2709 }, { "epoch": 2.8891257995735606, "grad_norm": 0.18716040338636258, "learning_rate": 3.612098050623705e-05, "loss": 0.3648, "step": 2710 }, { "epoch": 2.890191897654584, "grad_norm": 0.1570679754787949, "learning_rate": 3.6091350826616886e-05, "loss": 0.3602, "step": 2711 }, { "epoch": 2.8912579957356077, "grad_norm": 0.1646466111643218, "learning_rate": 3.606172331218523e-05, "loss": 0.3715, "step": 2712 }, { "epoch": 2.8923240938166312, "grad_norm": 0.19255818967368557, "learning_rate": 3.603209797935418e-05, "loss": 0.3643, "step": 2713 }, { "epoch": 2.8933901918976543, "grad_norm": 0.23514975868653648, "learning_rate": 3.600247484453465e-05, "loss": 0.3767, "step": 2714 }, { "epoch": 2.894456289978678, "grad_norm": 0.1715556714292468, "learning_rate": 3.597285392413628e-05, "loss": 0.3653, "step": 2715 }, { "epoch": 2.8955223880597014, "grad_norm": 0.19278886011586072, "learning_rate": 3.5943235234567534e-05, "loss": 0.3688, "step": 2716 }, { "epoch": 2.896588486140725, "grad_norm": 0.20799089269786109, "learning_rate": 3.591361879223564e-05, "loss": 0.3684, "step": 2717 }, { "epoch": 2.8976545842217485, "grad_norm": 0.18186928796074078, "learning_rate": 3.588400461354657e-05, "loss": 0.3641, "step": 2718 }, { "epoch": 2.8987206823027716, "grad_norm": 0.2508899499795832, "learning_rate": 3.5854392714905015e-05, "loss": 0.3701, "step": 2719 }, { "epoch": 2.8997867803837956, "grad_norm": 0.24536404303538248, "learning_rate": 3.582478311271445e-05, "loss": 0.3684, "step": 2720 }, { "epoch": 2.9008528784648187, "grad_norm": 0.21642088139256604, "learning_rate": 3.579517582337705e-05, "loss": 0.3717, "step": 2721 }, { "epoch": 2.901918976545842, "grad_norm": 0.24159904518479744, "learning_rate": 3.57655708632937e-05, "loss": 0.3698, "step": 2722 }, { "epoch": 2.9029850746268657, "grad_norm": 0.20752353712283125, "learning_rate": 3.573596824886403e-05, "loss": 0.3687, "step": 2723 }, { "epoch": 2.9040511727078893, "grad_norm": 0.23067889172808448, "learning_rate": 3.570636799648634e-05, "loss": 0.3699, "step": 2724 }, { "epoch": 2.905117270788913, "grad_norm": 0.20818039787632228, "learning_rate": 3.5676770122557646e-05, "loss": 0.3684, "step": 2725 }, { "epoch": 2.906183368869936, "grad_norm": 0.21491825417478544, "learning_rate": 3.5647174643473605e-05, "loss": 0.369, "step": 2726 }, { "epoch": 2.9072494669509594, "grad_norm": 0.19045430642939612, "learning_rate": 3.5617581575628586e-05, "loss": 0.3727, "step": 2727 }, { "epoch": 2.908315565031983, "grad_norm": 0.2022812002014471, "learning_rate": 3.5587990935415616e-05, "loss": 0.3632, "step": 2728 }, { "epoch": 2.9093816631130065, "grad_norm": 0.23520827934411384, "learning_rate": 3.555840273922638e-05, "loss": 0.3698, "step": 2729 }, { "epoch": 2.91044776119403, "grad_norm": 0.22589615939607646, "learning_rate": 3.552881700345118e-05, "loss": 0.3674, "step": 2730 }, { "epoch": 2.911513859275053, "grad_norm": 0.17879660376241868, "learning_rate": 3.5499233744478986e-05, "loss": 0.3711, "step": 2731 }, { "epoch": 2.9125799573560767, "grad_norm": 0.1701981830839262, "learning_rate": 3.54696529786974e-05, "loss": 0.373, "step": 2732 }, { "epoch": 2.9136460554371, "grad_norm": 0.17388147057917147, "learning_rate": 3.54400747224926e-05, "loss": 0.3694, "step": 2733 }, { "epoch": 2.9147121535181237, "grad_norm": 0.198912284935368, "learning_rate": 3.541049899224941e-05, "loss": 0.3638, "step": 2734 }, { "epoch": 2.9157782515991473, "grad_norm": 0.1991247673513127, "learning_rate": 3.538092580435127e-05, "loss": 0.3689, "step": 2735 }, { "epoch": 2.9168443496801704, "grad_norm": 0.21753207227603405, "learning_rate": 3.5351355175180175e-05, "loss": 0.3677, "step": 2736 }, { "epoch": 2.917910447761194, "grad_norm": 0.19550372191277038, "learning_rate": 3.53217871211167e-05, "loss": 0.3635, "step": 2737 }, { "epoch": 2.9189765458422174, "grad_norm": 0.18563885663326216, "learning_rate": 3.529222165854005e-05, "loss": 0.3649, "step": 2738 }, { "epoch": 2.920042643923241, "grad_norm": 0.22917352839550473, "learning_rate": 3.526265880382791e-05, "loss": 0.3781, "step": 2739 }, { "epoch": 2.9211087420042645, "grad_norm": 0.1757204517843413, "learning_rate": 3.523309857335659e-05, "loss": 0.3714, "step": 2740 }, { "epoch": 2.9221748400852876, "grad_norm": 0.19335871653526074, "learning_rate": 3.5203540983500925e-05, "loss": 0.364, "step": 2741 }, { "epoch": 2.923240938166311, "grad_norm": 0.2516292353731809, "learning_rate": 3.517398605063426e-05, "loss": 0.3645, "step": 2742 }, { "epoch": 2.9243070362473347, "grad_norm": 0.20904716360457082, "learning_rate": 3.514443379112853e-05, "loss": 0.3732, "step": 2743 }, { "epoch": 2.925373134328358, "grad_norm": 0.22653010522456504, "learning_rate": 3.511488422135412e-05, "loss": 0.3732, "step": 2744 }, { "epoch": 2.9264392324093818, "grad_norm": 0.21041114421861964, "learning_rate": 3.508533735767995e-05, "loss": 0.3677, "step": 2745 }, { "epoch": 2.927505330490405, "grad_norm": 0.19966036046378136, "learning_rate": 3.505579321647347e-05, "loss": 0.3655, "step": 2746 }, { "epoch": 2.928571428571429, "grad_norm": 0.1739262049297657, "learning_rate": 3.5026251814100604e-05, "loss": 0.3688, "step": 2747 }, { "epoch": 2.929637526652452, "grad_norm": 0.18367778908948013, "learning_rate": 3.4996713166925724e-05, "loss": 0.3673, "step": 2748 }, { "epoch": 2.9307036247334755, "grad_norm": 0.2188532499554241, "learning_rate": 3.496717729131172e-05, "loss": 0.3669, "step": 2749 }, { "epoch": 2.931769722814499, "grad_norm": 0.20187002981612032, "learning_rate": 3.493764420361995e-05, "loss": 0.3705, "step": 2750 }, { "epoch": 2.9328358208955225, "grad_norm": 0.16322026861126518, "learning_rate": 3.490811392021018e-05, "loss": 0.3611, "step": 2751 }, { "epoch": 2.933901918976546, "grad_norm": 0.19108916099257472, "learning_rate": 3.4878586457440655e-05, "loss": 0.3699, "step": 2752 }, { "epoch": 2.934968017057569, "grad_norm": 0.21458163159618987, "learning_rate": 3.484906183166807e-05, "loss": 0.3687, "step": 2753 }, { "epoch": 2.9360341151385927, "grad_norm": 0.1904908632261195, "learning_rate": 3.481954005924755e-05, "loss": 0.3657, "step": 2754 }, { "epoch": 2.9371002132196162, "grad_norm": 0.18685652839997016, "learning_rate": 3.4790021156532585e-05, "loss": 0.3688, "step": 2755 }, { "epoch": 2.9381663113006398, "grad_norm": 0.19812866208833532, "learning_rate": 3.476050513987512e-05, "loss": 0.3665, "step": 2756 }, { "epoch": 2.9392324093816633, "grad_norm": 0.19248170405628037, "learning_rate": 3.4730992025625506e-05, "loss": 0.3705, "step": 2757 }, { "epoch": 2.9402985074626864, "grad_norm": 0.17211944249351815, "learning_rate": 3.4701481830132486e-05, "loss": 0.3644, "step": 2758 }, { "epoch": 2.94136460554371, "grad_norm": 0.1964548683909922, "learning_rate": 3.467197456974315e-05, "loss": 0.3652, "step": 2759 }, { "epoch": 2.9424307036247335, "grad_norm": 0.20371355156065427, "learning_rate": 3.4642470260802986e-05, "loss": 0.3749, "step": 2760 }, { "epoch": 2.943496801705757, "grad_norm": 0.1469689938993747, "learning_rate": 3.4612968919655886e-05, "loss": 0.3682, "step": 2761 }, { "epoch": 2.9445628997867805, "grad_norm": 0.1964620626188304, "learning_rate": 3.458347056264401e-05, "loss": 0.3726, "step": 2762 }, { "epoch": 2.9456289978678036, "grad_norm": 0.18358666019195752, "learning_rate": 3.4553975206107944e-05, "loss": 0.3668, "step": 2763 }, { "epoch": 2.946695095948827, "grad_norm": 0.16170424306429584, "learning_rate": 3.452448286638657e-05, "loss": 0.3735, "step": 2764 }, { "epoch": 2.9477611940298507, "grad_norm": 0.18591540468001805, "learning_rate": 3.4494993559817134e-05, "loss": 0.37, "step": 2765 }, { "epoch": 2.9488272921108742, "grad_norm": 0.22260097019813457, "learning_rate": 3.446550730273514e-05, "loss": 0.3656, "step": 2766 }, { "epoch": 2.949893390191898, "grad_norm": 0.2482136891258535, "learning_rate": 3.4436024111474475e-05, "loss": 0.3683, "step": 2767 }, { "epoch": 2.950959488272921, "grad_norm": 0.19730939109015577, "learning_rate": 3.440654400236729e-05, "loss": 0.378, "step": 2768 }, { "epoch": 2.9520255863539444, "grad_norm": 0.20731684739497086, "learning_rate": 3.437706699174402e-05, "loss": 0.364, "step": 2769 }, { "epoch": 2.953091684434968, "grad_norm": 0.22896154197636323, "learning_rate": 3.43475930959334e-05, "loss": 0.3685, "step": 2770 }, { "epoch": 2.9541577825159915, "grad_norm": 0.20500126289635404, "learning_rate": 3.431812233126245e-05, "loss": 0.3748, "step": 2771 }, { "epoch": 2.955223880597015, "grad_norm": 0.1633714645059073, "learning_rate": 3.428865471405643e-05, "loss": 0.3591, "step": 2772 }, { "epoch": 2.956289978678038, "grad_norm": 0.21578628128041608, "learning_rate": 3.425919026063886e-05, "loss": 0.3745, "step": 2773 }, { "epoch": 2.957356076759062, "grad_norm": 0.16947850465245867, "learning_rate": 3.422972898733154e-05, "loss": 0.3695, "step": 2774 }, { "epoch": 2.958422174840085, "grad_norm": 0.15700808274896483, "learning_rate": 3.420027091045446e-05, "loss": 0.3627, "step": 2775 }, { "epoch": 2.9594882729211087, "grad_norm": 0.1897865970091229, "learning_rate": 3.417081604632589e-05, "loss": 0.3624, "step": 2776 }, { "epoch": 2.9605543710021323, "grad_norm": 0.19673063754579187, "learning_rate": 3.414136441126227e-05, "loss": 0.3692, "step": 2777 }, { "epoch": 2.961620469083156, "grad_norm": 0.17553015896004215, "learning_rate": 3.4111916021578285e-05, "loss": 0.3678, "step": 2778 }, { "epoch": 2.9626865671641793, "grad_norm": 0.2183249284173145, "learning_rate": 3.408247089358681e-05, "loss": 0.3747, "step": 2779 }, { "epoch": 2.9637526652452024, "grad_norm": 0.21927885211369533, "learning_rate": 3.405302904359893e-05, "loss": 0.3673, "step": 2780 }, { "epoch": 2.964818763326226, "grad_norm": 0.18731814539514857, "learning_rate": 3.402359048792386e-05, "loss": 0.3716, "step": 2781 }, { "epoch": 2.9658848614072495, "grad_norm": 0.2338787632638593, "learning_rate": 3.399415524286907e-05, "loss": 0.3732, "step": 2782 }, { "epoch": 2.966950959488273, "grad_norm": 0.21152997804401275, "learning_rate": 3.396472332474015e-05, "loss": 0.3669, "step": 2783 }, { "epoch": 2.9680170575692966, "grad_norm": 0.17079876375720443, "learning_rate": 3.393529474984083e-05, "loss": 0.3714, "step": 2784 }, { "epoch": 2.9690831556503197, "grad_norm": 0.20326343887457965, "learning_rate": 3.390586953447304e-05, "loss": 0.3685, "step": 2785 }, { "epoch": 2.970149253731343, "grad_norm": 0.265596968296806, "learning_rate": 3.38764476949368e-05, "loss": 0.3701, "step": 2786 }, { "epoch": 2.9712153518123667, "grad_norm": 0.20949193099124558, "learning_rate": 3.38470292475303e-05, "loss": 0.3634, "step": 2787 }, { "epoch": 2.9722814498933903, "grad_norm": 0.19275153065422299, "learning_rate": 3.3817614208549796e-05, "loss": 0.3652, "step": 2788 }, { "epoch": 2.973347547974414, "grad_norm": 0.25513016177139153, "learning_rate": 3.378820259428972e-05, "loss": 0.3678, "step": 2789 }, { "epoch": 2.974413646055437, "grad_norm": 0.20641698835987704, "learning_rate": 3.3758794421042585e-05, "loss": 0.3686, "step": 2790 }, { "epoch": 2.9754797441364604, "grad_norm": 0.21686027546060258, "learning_rate": 3.3729389705098956e-05, "loss": 0.3721, "step": 2791 }, { "epoch": 2.976545842217484, "grad_norm": 0.21228392649559347, "learning_rate": 3.3699988462747536e-05, "loss": 0.3619, "step": 2792 }, { "epoch": 2.9776119402985075, "grad_norm": 0.15906755651360552, "learning_rate": 3.3670590710275095e-05, "loss": 0.3634, "step": 2793 }, { "epoch": 2.978678038379531, "grad_norm": 0.23673718965759863, "learning_rate": 3.3641196463966466e-05, "loss": 0.3682, "step": 2794 }, { "epoch": 2.979744136460554, "grad_norm": 0.2179359974386122, "learning_rate": 3.36118057401045e-05, "loss": 0.3661, "step": 2795 }, { "epoch": 2.9808102345415777, "grad_norm": 0.18870552087602346, "learning_rate": 3.358241855497015e-05, "loss": 0.3692, "step": 2796 }, { "epoch": 2.981876332622601, "grad_norm": 0.21000830418856412, "learning_rate": 3.35530349248424e-05, "loss": 0.3723, "step": 2797 }, { "epoch": 2.9829424307036247, "grad_norm": 0.1834568664565684, "learning_rate": 3.352365486599823e-05, "loss": 0.3699, "step": 2798 }, { "epoch": 2.9840085287846483, "grad_norm": 0.21955054980022862, "learning_rate": 3.349427839471268e-05, "loss": 0.3693, "step": 2799 }, { "epoch": 2.9850746268656714, "grad_norm": 0.19131385619838284, "learning_rate": 3.346490552725879e-05, "loss": 0.3633, "step": 2800 }, { "epoch": 2.9861407249466954, "grad_norm": 0.22039397671168487, "learning_rate": 3.3435536279907615e-05, "loss": 0.3668, "step": 2801 }, { "epoch": 2.9872068230277184, "grad_norm": 0.25817755908994, "learning_rate": 3.340617066892815e-05, "loss": 0.3694, "step": 2802 }, { "epoch": 2.988272921108742, "grad_norm": 0.17992068437591005, "learning_rate": 3.3376808710587456e-05, "loss": 0.3608, "step": 2803 }, { "epoch": 2.9893390191897655, "grad_norm": 0.19725409347741943, "learning_rate": 3.334745042115052e-05, "loss": 0.3674, "step": 2804 }, { "epoch": 2.990405117270789, "grad_norm": 0.2199411430669973, "learning_rate": 3.331809581688031e-05, "loss": 0.3685, "step": 2805 }, { "epoch": 2.9914712153518126, "grad_norm": 0.186450392495569, "learning_rate": 3.328874491403774e-05, "loss": 0.3652, "step": 2806 }, { "epoch": 2.9925373134328357, "grad_norm": 0.2010767437452293, "learning_rate": 3.32593977288817e-05, "loss": 0.3795, "step": 2807 }, { "epoch": 2.9936034115138592, "grad_norm": 0.19580642217271252, "learning_rate": 3.3230054277668994e-05, "loss": 0.3688, "step": 2808 }, { "epoch": 2.9946695095948828, "grad_norm": 0.17801010293169015, "learning_rate": 3.320071457665437e-05, "loss": 0.3694, "step": 2809 }, { "epoch": 2.9957356076759063, "grad_norm": 0.183255241372143, "learning_rate": 3.317137864209051e-05, "loss": 0.3711, "step": 2810 }, { "epoch": 2.99680170575693, "grad_norm": 0.1656711458529668, "learning_rate": 3.3142046490227984e-05, "loss": 0.3668, "step": 2811 }, { "epoch": 2.997867803837953, "grad_norm": 0.15999468329558042, "learning_rate": 3.311271813731529e-05, "loss": 0.3625, "step": 2812 }, { "epoch": 2.9989339019189765, "grad_norm": 0.2031732214049462, "learning_rate": 3.3083393599598804e-05, "loss": 0.3667, "step": 2813 }, { "epoch": 3.0, "grad_norm": 0.1833937765340318, "learning_rate": 3.305407289332279e-05, "loss": 0.356, "step": 2814 }, { "epoch": 3.0010660980810235, "grad_norm": 0.16676086865014514, "learning_rate": 3.3024756034729403e-05, "loss": 0.3446, "step": 2815 }, { "epoch": 3.002132196162047, "grad_norm": 0.1944616035010321, "learning_rate": 3.299544304005867e-05, "loss": 0.3499, "step": 2816 }, { "epoch": 3.00319829424307, "grad_norm": 0.21726317154600533, "learning_rate": 3.296613392554845e-05, "loss": 0.3447, "step": 2817 }, { "epoch": 3.0042643923240937, "grad_norm": 0.18162073687276598, "learning_rate": 3.293682870743446e-05, "loss": 0.3449, "step": 2818 }, { "epoch": 3.0053304904051172, "grad_norm": 0.19337743208072578, "learning_rate": 3.2907527401950314e-05, "loss": 0.3438, "step": 2819 }, { "epoch": 3.0063965884861408, "grad_norm": 0.20006928027604456, "learning_rate": 3.287823002532735e-05, "loss": 0.348, "step": 2820 }, { "epoch": 3.0074626865671643, "grad_norm": 0.21921022479851462, "learning_rate": 3.284893659379483e-05, "loss": 0.355, "step": 2821 }, { "epoch": 3.008528784648188, "grad_norm": 0.19159498596282584, "learning_rate": 3.2819647123579785e-05, "loss": 0.3471, "step": 2822 }, { "epoch": 3.009594882729211, "grad_norm": 0.1975964755116819, "learning_rate": 3.2790361630907073e-05, "loss": 0.351, "step": 2823 }, { "epoch": 3.0106609808102345, "grad_norm": 0.19162779591776102, "learning_rate": 3.276108013199931e-05, "loss": 0.3471, "step": 2824 }, { "epoch": 3.011727078891258, "grad_norm": 0.20069049454309051, "learning_rate": 3.273180264307693e-05, "loss": 0.3473, "step": 2825 }, { "epoch": 3.0127931769722816, "grad_norm": 0.18162811798879747, "learning_rate": 3.270252918035817e-05, "loss": 0.3497, "step": 2826 }, { "epoch": 3.013859275053305, "grad_norm": 0.2190938039478538, "learning_rate": 3.2673259760058966e-05, "loss": 0.342, "step": 2827 }, { "epoch": 3.014925373134328, "grad_norm": 0.20368209134299048, "learning_rate": 3.264399439839307e-05, "loss": 0.3565, "step": 2828 }, { "epoch": 3.0159914712153517, "grad_norm": 0.21564518091097337, "learning_rate": 3.261473311157197e-05, "loss": 0.3477, "step": 2829 }, { "epoch": 3.0170575692963753, "grad_norm": 0.22960493863723194, "learning_rate": 3.258547591580493e-05, "loss": 0.355, "step": 2830 }, { "epoch": 3.018123667377399, "grad_norm": 0.20630353684777758, "learning_rate": 3.255622282729886e-05, "loss": 0.3491, "step": 2831 }, { "epoch": 3.0191897654584223, "grad_norm": 0.19432289775660633, "learning_rate": 3.252697386225848e-05, "loss": 0.3463, "step": 2832 }, { "epoch": 3.0202558635394454, "grad_norm": 0.22226714326699387, "learning_rate": 3.24977290368862e-05, "loss": 0.3474, "step": 2833 }, { "epoch": 3.021321961620469, "grad_norm": 0.20919728574830324, "learning_rate": 3.2468488367382146e-05, "loss": 0.3543, "step": 2834 }, { "epoch": 3.0223880597014925, "grad_norm": 0.2097539980157466, "learning_rate": 3.2439251869944096e-05, "loss": 0.3546, "step": 2835 }, { "epoch": 3.023454157782516, "grad_norm": 0.2378340506907076, "learning_rate": 3.2410019560767566e-05, "loss": 0.3507, "step": 2836 }, { "epoch": 3.0245202558635396, "grad_norm": 0.19806299173295153, "learning_rate": 3.238079145604576e-05, "loss": 0.3497, "step": 2837 }, { "epoch": 3.025586353944563, "grad_norm": 0.21295221351192548, "learning_rate": 3.235156757196948e-05, "loss": 0.3427, "step": 2838 }, { "epoch": 3.026652452025586, "grad_norm": 0.22762110787574957, "learning_rate": 3.2322347924727264e-05, "loss": 0.352, "step": 2839 }, { "epoch": 3.0277185501066097, "grad_norm": 0.20297127463286518, "learning_rate": 3.229313253050529e-05, "loss": 0.3481, "step": 2840 }, { "epoch": 3.0287846481876333, "grad_norm": 0.1801189363866266, "learning_rate": 3.2263921405487356e-05, "loss": 0.3491, "step": 2841 }, { "epoch": 3.029850746268657, "grad_norm": 0.2148709514009852, "learning_rate": 3.2234714565854895e-05, "loss": 0.3562, "step": 2842 }, { "epoch": 3.0309168443496803, "grad_norm": 0.1905295271571919, "learning_rate": 3.2205512027787005e-05, "loss": 0.3468, "step": 2843 }, { "epoch": 3.0319829424307034, "grad_norm": 0.2416529548256475, "learning_rate": 3.2176313807460355e-05, "loss": 0.3521, "step": 2844 }, { "epoch": 3.033049040511727, "grad_norm": 0.19945666420393923, "learning_rate": 3.214711992104925e-05, "loss": 0.3519, "step": 2845 }, { "epoch": 3.0341151385927505, "grad_norm": 0.1969420298529883, "learning_rate": 3.211793038472558e-05, "loss": 0.3489, "step": 2846 }, { "epoch": 3.035181236673774, "grad_norm": 0.24393952090774143, "learning_rate": 3.208874521465882e-05, "loss": 0.3555, "step": 2847 }, { "epoch": 3.0362473347547976, "grad_norm": 0.17068118729565857, "learning_rate": 3.205956442701607e-05, "loss": 0.3492, "step": 2848 }, { "epoch": 3.0373134328358207, "grad_norm": 0.18265684276116836, "learning_rate": 3.203038803796195e-05, "loss": 0.3536, "step": 2849 }, { "epoch": 3.038379530916844, "grad_norm": 0.1750878986387194, "learning_rate": 3.200121606365865e-05, "loss": 0.3467, "step": 2850 }, { "epoch": 3.0394456289978677, "grad_norm": 0.1501654395365917, "learning_rate": 3.197204852026595e-05, "loss": 0.3463, "step": 2851 }, { "epoch": 3.0405117270788913, "grad_norm": 0.1881586833141222, "learning_rate": 3.194288542394116e-05, "loss": 0.3485, "step": 2852 }, { "epoch": 3.041577825159915, "grad_norm": 0.17391665540804127, "learning_rate": 3.191372679083908e-05, "loss": 0.349, "step": 2853 }, { "epoch": 3.0426439232409384, "grad_norm": 0.1855229052361205, "learning_rate": 3.18845726371121e-05, "loss": 0.3522, "step": 2854 }, { "epoch": 3.0437100213219614, "grad_norm": 0.19940822544482417, "learning_rate": 3.1855422978910136e-05, "loss": 0.3482, "step": 2855 }, { "epoch": 3.044776119402985, "grad_norm": 0.16112705274220399, "learning_rate": 3.182627783238053e-05, "loss": 0.347, "step": 2856 }, { "epoch": 3.0458422174840085, "grad_norm": 0.20167791211166475, "learning_rate": 3.179713721366821e-05, "loss": 0.3534, "step": 2857 }, { "epoch": 3.046908315565032, "grad_norm": 0.18479202502339434, "learning_rate": 3.176800113891556e-05, "loss": 0.351, "step": 2858 }, { "epoch": 3.0479744136460556, "grad_norm": 0.16272058166990602, "learning_rate": 3.173886962426246e-05, "loss": 0.3494, "step": 2859 }, { "epoch": 3.0490405117270787, "grad_norm": 0.16165195964515638, "learning_rate": 3.1709742685846224e-05, "loss": 0.3473, "step": 2860 }, { "epoch": 3.050106609808102, "grad_norm": 0.17386330305907433, "learning_rate": 3.168062033980169e-05, "loss": 0.3497, "step": 2861 }, { "epoch": 3.0511727078891258, "grad_norm": 0.1458661691624279, "learning_rate": 3.165150260226112e-05, "loss": 0.35, "step": 2862 }, { "epoch": 3.0522388059701493, "grad_norm": 0.20435764871354212, "learning_rate": 3.162238948935423e-05, "loss": 0.3486, "step": 2863 }, { "epoch": 3.053304904051173, "grad_norm": 0.19071168515585496, "learning_rate": 3.159328101720816e-05, "loss": 0.3471, "step": 2864 }, { "epoch": 3.0543710021321964, "grad_norm": 0.18693245756126411, "learning_rate": 3.156417720194749e-05, "loss": 0.3408, "step": 2865 }, { "epoch": 3.0554371002132195, "grad_norm": 0.19376415825853904, "learning_rate": 3.153507805969425e-05, "loss": 0.3462, "step": 2866 }, { "epoch": 3.056503198294243, "grad_norm": 0.23970364168448857, "learning_rate": 3.150598360656781e-05, "loss": 0.3585, "step": 2867 }, { "epoch": 3.0575692963752665, "grad_norm": 0.24686273765529104, "learning_rate": 3.1476893858685e-05, "loss": 0.3522, "step": 2868 }, { "epoch": 3.05863539445629, "grad_norm": 0.20134496461618384, "learning_rate": 3.1447808832160034e-05, "loss": 0.3534, "step": 2869 }, { "epoch": 3.0597014925373136, "grad_norm": 0.20811412001280552, "learning_rate": 3.141872854310452e-05, "loss": 0.3529, "step": 2870 }, { "epoch": 3.0607675906183367, "grad_norm": 0.2252811945149839, "learning_rate": 3.13896530076274e-05, "loss": 0.3507, "step": 2871 }, { "epoch": 3.0618336886993602, "grad_norm": 0.27899336334092645, "learning_rate": 3.1360582241835025e-05, "loss": 0.3445, "step": 2872 }, { "epoch": 3.0628997867803838, "grad_norm": 0.18208464259977086, "learning_rate": 3.13315162618311e-05, "loss": 0.3512, "step": 2873 }, { "epoch": 3.0639658848614073, "grad_norm": 0.19082089290473603, "learning_rate": 3.130245508371663e-05, "loss": 0.3528, "step": 2874 }, { "epoch": 3.065031982942431, "grad_norm": 0.2131583906254748, "learning_rate": 3.127339872359002e-05, "loss": 0.3521, "step": 2875 }, { "epoch": 3.066098081023454, "grad_norm": 0.17934067948301102, "learning_rate": 3.1244347197546986e-05, "loss": 0.3484, "step": 2876 }, { "epoch": 3.0671641791044775, "grad_norm": 0.21329220073762686, "learning_rate": 3.1215300521680564e-05, "loss": 0.3564, "step": 2877 }, { "epoch": 3.068230277185501, "grad_norm": 0.21979298064989125, "learning_rate": 3.118625871208109e-05, "loss": 0.3441, "step": 2878 }, { "epoch": 3.0692963752665245, "grad_norm": 0.1899827348463154, "learning_rate": 3.115722178483624e-05, "loss": 0.3548, "step": 2879 }, { "epoch": 3.070362473347548, "grad_norm": 0.19208909798825324, "learning_rate": 3.1128189756030934e-05, "loss": 0.3472, "step": 2880 }, { "epoch": 3.0714285714285716, "grad_norm": 0.15715982773841544, "learning_rate": 3.109916264174743e-05, "loss": 0.3447, "step": 2881 }, { "epoch": 3.0724946695095947, "grad_norm": 0.19051789498649285, "learning_rate": 3.1070140458065235e-05, "loss": 0.3478, "step": 2882 }, { "epoch": 3.0735607675906182, "grad_norm": 0.17970300278684168, "learning_rate": 3.104112322106112e-05, "loss": 0.3468, "step": 2883 }, { "epoch": 3.074626865671642, "grad_norm": 0.1733355341791082, "learning_rate": 3.1012110946809134e-05, "loss": 0.355, "step": 2884 }, { "epoch": 3.0756929637526653, "grad_norm": 0.1997157707879784, "learning_rate": 3.0983103651380574e-05, "loss": 0.3472, "step": 2885 }, { "epoch": 3.076759061833689, "grad_norm": 0.1683478624827942, "learning_rate": 3.0954101350843966e-05, "loss": 0.3472, "step": 2886 }, { "epoch": 3.077825159914712, "grad_norm": 0.17517827837363226, "learning_rate": 3.092510406126508e-05, "loss": 0.3466, "step": 2887 }, { "epoch": 3.0788912579957355, "grad_norm": 0.18556306694129251, "learning_rate": 3.089611179870691e-05, "loss": 0.3498, "step": 2888 }, { "epoch": 3.079957356076759, "grad_norm": 0.17804006539183986, "learning_rate": 3.086712457922966e-05, "loss": 0.3459, "step": 2889 }, { "epoch": 3.0810234541577826, "grad_norm": 0.17926971743225267, "learning_rate": 3.083814241889074e-05, "loss": 0.3485, "step": 2890 }, { "epoch": 3.082089552238806, "grad_norm": 0.18765772737419348, "learning_rate": 3.0809165333744765e-05, "loss": 0.3555, "step": 2891 }, { "epoch": 3.0831556503198296, "grad_norm": 0.1665179916020946, "learning_rate": 3.0780193339843545e-05, "loss": 0.346, "step": 2892 }, { "epoch": 3.0842217484008527, "grad_norm": 0.16253422719498084, "learning_rate": 3.075122645323603e-05, "loss": 0.347, "step": 2893 }, { "epoch": 3.0852878464818763, "grad_norm": 0.1705949117676651, "learning_rate": 3.072226468996839e-05, "loss": 0.3531, "step": 2894 }, { "epoch": 3.0863539445629, "grad_norm": 0.16669859994578132, "learning_rate": 3.0693308066083954e-05, "loss": 0.3489, "step": 2895 }, { "epoch": 3.0874200426439233, "grad_norm": 0.15555031037234748, "learning_rate": 3.0664356597623144e-05, "loss": 0.3505, "step": 2896 }, { "epoch": 3.088486140724947, "grad_norm": 0.18596041745219372, "learning_rate": 3.0635410300623596e-05, "loss": 0.354, "step": 2897 }, { "epoch": 3.08955223880597, "grad_norm": 0.1905420596562174, "learning_rate": 3.060646919112004e-05, "loss": 0.3481, "step": 2898 }, { "epoch": 3.0906183368869935, "grad_norm": 0.18015092645672678, "learning_rate": 3.057753328514438e-05, "loss": 0.3492, "step": 2899 }, { "epoch": 3.091684434968017, "grad_norm": 0.1773289970330081, "learning_rate": 3.0548602598725564e-05, "loss": 0.3523, "step": 2900 }, { "epoch": 3.0927505330490406, "grad_norm": 0.19120781604022777, "learning_rate": 3.0519677147889705e-05, "loss": 0.3533, "step": 2901 }, { "epoch": 3.093816631130064, "grad_norm": 0.18705169216756135, "learning_rate": 3.0490756948660017e-05, "loss": 0.3486, "step": 2902 }, { "epoch": 3.094882729211087, "grad_norm": 0.21344733721957102, "learning_rate": 3.046184201705675e-05, "loss": 0.3546, "step": 2903 }, { "epoch": 3.0959488272921107, "grad_norm": 0.16522989881515734, "learning_rate": 3.043293236909729e-05, "loss": 0.3486, "step": 2904 }, { "epoch": 3.0970149253731343, "grad_norm": 0.1819572191490791, "learning_rate": 3.0404028020796087e-05, "loss": 0.3496, "step": 2905 }, { "epoch": 3.098081023454158, "grad_norm": 0.15871225159309935, "learning_rate": 3.0375128988164655e-05, "loss": 0.3476, "step": 2906 }, { "epoch": 3.0991471215351813, "grad_norm": 0.17994107490800862, "learning_rate": 3.0346235287211532e-05, "loss": 0.3509, "step": 2907 }, { "epoch": 3.100213219616205, "grad_norm": 0.16605077988407455, "learning_rate": 3.0317346933942337e-05, "loss": 0.3563, "step": 2908 }, { "epoch": 3.101279317697228, "grad_norm": 0.16665287698505823, "learning_rate": 3.028846394435973e-05, "loss": 0.3468, "step": 2909 }, { "epoch": 3.1023454157782515, "grad_norm": 0.19464014834448382, "learning_rate": 3.0259586334463366e-05, "loss": 0.3461, "step": 2910 }, { "epoch": 3.103411513859275, "grad_norm": 0.1387449385010934, "learning_rate": 3.0230714120249947e-05, "loss": 0.351, "step": 2911 }, { "epoch": 3.1044776119402986, "grad_norm": 0.17872611874146532, "learning_rate": 3.020184731771319e-05, "loss": 0.3488, "step": 2912 }, { "epoch": 3.105543710021322, "grad_norm": 0.18416826482656345, "learning_rate": 3.017298594284379e-05, "loss": 0.3487, "step": 2913 }, { "epoch": 3.106609808102345, "grad_norm": 0.17275054090026867, "learning_rate": 3.0144130011629448e-05, "loss": 0.354, "step": 2914 }, { "epoch": 3.1076759061833688, "grad_norm": 0.17763788392460803, "learning_rate": 3.011527954005486e-05, "loss": 0.3485, "step": 2915 }, { "epoch": 3.1087420042643923, "grad_norm": 0.1945341807582054, "learning_rate": 3.0086434544101677e-05, "loss": 0.3489, "step": 2916 }, { "epoch": 3.109808102345416, "grad_norm": 0.14591971732114, "learning_rate": 3.005759503974854e-05, "loss": 0.3499, "step": 2917 }, { "epoch": 3.1108742004264394, "grad_norm": 0.16096483078908078, "learning_rate": 3.0028761042971028e-05, "loss": 0.3446, "step": 2918 }, { "epoch": 3.111940298507463, "grad_norm": 0.1877410826166536, "learning_rate": 2.9999932569741673e-05, "loss": 0.3543, "step": 2919 }, { "epoch": 3.113006396588486, "grad_norm": 0.1583922934703547, "learning_rate": 2.9971109636029952e-05, "loss": 0.3454, "step": 2920 }, { "epoch": 3.1140724946695095, "grad_norm": 0.16306779868577526, "learning_rate": 2.994229225780229e-05, "loss": 0.3419, "step": 2921 }, { "epoch": 3.115138592750533, "grad_norm": 0.1919727737485944, "learning_rate": 2.991348045102199e-05, "loss": 0.3479, "step": 2922 }, { "epoch": 3.1162046908315566, "grad_norm": 0.14431740367185825, "learning_rate": 2.988467423164931e-05, "loss": 0.3519, "step": 2923 }, { "epoch": 3.11727078891258, "grad_norm": 0.19602916545574073, "learning_rate": 2.9855873615641414e-05, "loss": 0.3489, "step": 2924 }, { "epoch": 3.1183368869936032, "grad_norm": 0.15941172314892962, "learning_rate": 2.982707861895231e-05, "loss": 0.3473, "step": 2925 }, { "epoch": 3.1194029850746268, "grad_norm": 0.15400208783886424, "learning_rate": 2.9798289257532946e-05, "loss": 0.347, "step": 2926 }, { "epoch": 3.1204690831556503, "grad_norm": 0.16733091547516227, "learning_rate": 2.976950554733114e-05, "loss": 0.3487, "step": 2927 }, { "epoch": 3.121535181236674, "grad_norm": 0.16131815693424373, "learning_rate": 2.9740727504291577e-05, "loss": 0.3515, "step": 2928 }, { "epoch": 3.1226012793176974, "grad_norm": 0.18988500484746434, "learning_rate": 2.9711955144355776e-05, "loss": 0.3475, "step": 2929 }, { "epoch": 3.1236673773987205, "grad_norm": 0.16873039013955726, "learning_rate": 2.9683188483462135e-05, "loss": 0.3506, "step": 2930 }, { "epoch": 3.124733475479744, "grad_norm": 0.18997760424236068, "learning_rate": 2.9654427537545915e-05, "loss": 0.3527, "step": 2931 }, { "epoch": 3.1257995735607675, "grad_norm": 0.16015025466863764, "learning_rate": 2.962567232253914e-05, "loss": 0.349, "step": 2932 }, { "epoch": 3.126865671641791, "grad_norm": 0.14231292340170754, "learning_rate": 2.9596922854370737e-05, "loss": 0.3487, "step": 2933 }, { "epoch": 3.1279317697228146, "grad_norm": 0.16799598139417324, "learning_rate": 2.9568179148966406e-05, "loss": 0.3504, "step": 2934 }, { "epoch": 3.128997867803838, "grad_norm": 0.13092435516555, "learning_rate": 2.9539441222248685e-05, "loss": 0.3502, "step": 2935 }, { "epoch": 3.1300639658848612, "grad_norm": 0.16201697102720602, "learning_rate": 2.9510709090136855e-05, "loss": 0.3507, "step": 2936 }, { "epoch": 3.131130063965885, "grad_norm": 0.13920375866502643, "learning_rate": 2.9481982768547048e-05, "loss": 0.3484, "step": 2937 }, { "epoch": 3.1321961620469083, "grad_norm": 0.1608670042161187, "learning_rate": 2.945326227339215e-05, "loss": 0.3505, "step": 2938 }, { "epoch": 3.133262260127932, "grad_norm": 0.15274868977241857, "learning_rate": 2.942454762058184e-05, "loss": 0.353, "step": 2939 }, { "epoch": 3.1343283582089554, "grad_norm": 0.1728035181021134, "learning_rate": 2.939583882602251e-05, "loss": 0.3537, "step": 2940 }, { "epoch": 3.1353944562899785, "grad_norm": 0.20061129935035046, "learning_rate": 2.936713590561735e-05, "loss": 0.3537, "step": 2941 }, { "epoch": 3.136460554371002, "grad_norm": 0.16121457578954346, "learning_rate": 2.9338438875266315e-05, "loss": 0.3493, "step": 2942 }, { "epoch": 3.1375266524520256, "grad_norm": 0.1985681459680183, "learning_rate": 2.930974775086602e-05, "loss": 0.3494, "step": 2943 }, { "epoch": 3.138592750533049, "grad_norm": 0.13586545126970054, "learning_rate": 2.928106254830989e-05, "loss": 0.3485, "step": 2944 }, { "epoch": 3.1396588486140726, "grad_norm": 0.193935638265073, "learning_rate": 2.9252383283488038e-05, "loss": 0.3506, "step": 2945 }, { "epoch": 3.140724946695096, "grad_norm": 0.15997507985832404, "learning_rate": 2.9223709972287274e-05, "loss": 0.3501, "step": 2946 }, { "epoch": 3.1417910447761193, "grad_norm": 0.16394293111831423, "learning_rate": 2.9195042630591115e-05, "loss": 0.3428, "step": 2947 }, { "epoch": 3.142857142857143, "grad_norm": 0.17865359760016075, "learning_rate": 2.9166381274279803e-05, "loss": 0.3519, "step": 2948 }, { "epoch": 3.1439232409381663, "grad_norm": 0.17991378591531254, "learning_rate": 2.913772591923021e-05, "loss": 0.3485, "step": 2949 }, { "epoch": 3.14498933901919, "grad_norm": 0.1693801011865618, "learning_rate": 2.9109076581315937e-05, "loss": 0.3559, "step": 2950 }, { "epoch": 3.1460554371002134, "grad_norm": 0.14463952221236143, "learning_rate": 2.908043327640723e-05, "loss": 0.3579, "step": 2951 }, { "epoch": 3.1471215351812365, "grad_norm": 0.1573143821262481, "learning_rate": 2.9051796020370964e-05, "loss": 0.3488, "step": 2952 }, { "epoch": 3.14818763326226, "grad_norm": 0.16678149027123818, "learning_rate": 2.9023164829070718e-05, "loss": 0.3559, "step": 2953 }, { "epoch": 3.1492537313432836, "grad_norm": 0.140262490078489, "learning_rate": 2.8994539718366672e-05, "loss": 0.3535, "step": 2954 }, { "epoch": 3.150319829424307, "grad_norm": 0.2046721039374505, "learning_rate": 2.8965920704115644e-05, "loss": 0.3511, "step": 2955 }, { "epoch": 3.1513859275053306, "grad_norm": 0.1438158341906599, "learning_rate": 2.8937307802171085e-05, "loss": 0.3443, "step": 2956 }, { "epoch": 3.1524520255863537, "grad_norm": 0.14253271019539046, "learning_rate": 2.8908701028383084e-05, "loss": 0.3475, "step": 2957 }, { "epoch": 3.1535181236673773, "grad_norm": 0.14190886636267147, "learning_rate": 2.888010039859826e-05, "loss": 0.3465, "step": 2958 }, { "epoch": 3.154584221748401, "grad_norm": 0.1635398501114063, "learning_rate": 2.8851505928659896e-05, "loss": 0.3521, "step": 2959 }, { "epoch": 3.1556503198294243, "grad_norm": 0.1520685576715207, "learning_rate": 2.8822917634407858e-05, "loss": 0.3463, "step": 2960 }, { "epoch": 3.156716417910448, "grad_norm": 0.1723966595861352, "learning_rate": 2.8794335531678545e-05, "loss": 0.3481, "step": 2961 }, { "epoch": 3.1577825159914714, "grad_norm": 0.16208068603676945, "learning_rate": 2.8765759636304973e-05, "loss": 0.3575, "step": 2962 }, { "epoch": 3.1588486140724945, "grad_norm": 0.1453800336738485, "learning_rate": 2.8737189964116705e-05, "loss": 0.3471, "step": 2963 }, { "epoch": 3.159914712153518, "grad_norm": 0.1858091083588778, "learning_rate": 2.8708626530939865e-05, "loss": 0.3506, "step": 2964 }, { "epoch": 3.1609808102345416, "grad_norm": 0.18692761383630035, "learning_rate": 2.868006935259708e-05, "loss": 0.3467, "step": 2965 }, { "epoch": 3.162046908315565, "grad_norm": 0.14846566192818872, "learning_rate": 2.8651518444907556e-05, "loss": 0.3471, "step": 2966 }, { "epoch": 3.1631130063965887, "grad_norm": 0.16153996741706544, "learning_rate": 2.862297382368702e-05, "loss": 0.3542, "step": 2967 }, { "epoch": 3.1641791044776117, "grad_norm": 0.16957351241623203, "learning_rate": 2.8594435504747724e-05, "loss": 0.3525, "step": 2968 }, { "epoch": 3.1652452025586353, "grad_norm": 0.13030665419204615, "learning_rate": 2.856590350389837e-05, "loss": 0.3463, "step": 2969 }, { "epoch": 3.166311300639659, "grad_norm": 0.13338420990478728, "learning_rate": 2.8537377836944232e-05, "loss": 0.3566, "step": 2970 }, { "epoch": 3.1673773987206824, "grad_norm": 0.15453541002399912, "learning_rate": 2.850885851968706e-05, "loss": 0.3455, "step": 2971 }, { "epoch": 3.168443496801706, "grad_norm": 0.1539275910876287, "learning_rate": 2.8480345567925036e-05, "loss": 0.3505, "step": 2972 }, { "epoch": 3.1695095948827294, "grad_norm": 0.1601347292242495, "learning_rate": 2.8451838997452875e-05, "loss": 0.355, "step": 2973 }, { "epoch": 3.1705756929637525, "grad_norm": 0.16109585673660984, "learning_rate": 2.8423338824061732e-05, "loss": 0.3494, "step": 2974 }, { "epoch": 3.171641791044776, "grad_norm": 0.15883551225228731, "learning_rate": 2.839484506353924e-05, "loss": 0.3461, "step": 2975 }, { "epoch": 3.1727078891257996, "grad_norm": 0.18127651611679274, "learning_rate": 2.836635773166943e-05, "loss": 0.3533, "step": 2976 }, { "epoch": 3.173773987206823, "grad_norm": 0.14832179685684777, "learning_rate": 2.833787684423282e-05, "loss": 0.3512, "step": 2977 }, { "epoch": 3.1748400852878467, "grad_norm": 0.1833917506129557, "learning_rate": 2.8309402417006344e-05, "loss": 0.3488, "step": 2978 }, { "epoch": 3.1759061833688698, "grad_norm": 0.16898776691695933, "learning_rate": 2.8280934465763352e-05, "loss": 0.3451, "step": 2979 }, { "epoch": 3.1769722814498933, "grad_norm": 0.14677271644886078, "learning_rate": 2.82524730062736e-05, "loss": 0.3509, "step": 2980 }, { "epoch": 3.178038379530917, "grad_norm": 0.19057648965710483, "learning_rate": 2.8224018054303278e-05, "loss": 0.3551, "step": 2981 }, { "epoch": 3.1791044776119404, "grad_norm": 0.13538971093481636, "learning_rate": 2.8195569625614933e-05, "loss": 0.3456, "step": 2982 }, { "epoch": 3.180170575692964, "grad_norm": 0.1390167814695544, "learning_rate": 2.816712773596751e-05, "loss": 0.3531, "step": 2983 }, { "epoch": 3.181236673773987, "grad_norm": 0.15503459765163632, "learning_rate": 2.8138692401116366e-05, "loss": 0.3496, "step": 2984 }, { "epoch": 3.1823027718550105, "grad_norm": 0.14462829278663678, "learning_rate": 2.811026363681317e-05, "loss": 0.3553, "step": 2985 }, { "epoch": 3.183368869936034, "grad_norm": 0.17695463207505402, "learning_rate": 2.8081841458806002e-05, "loss": 0.3533, "step": 2986 }, { "epoch": 3.1844349680170576, "grad_norm": 0.1591547097918123, "learning_rate": 2.8053425882839252e-05, "loss": 0.3471, "step": 2987 }, { "epoch": 3.185501066098081, "grad_norm": 0.23045461569297743, "learning_rate": 2.802501692465368e-05, "loss": 0.3558, "step": 2988 }, { "epoch": 3.1865671641791047, "grad_norm": 0.17997948094924288, "learning_rate": 2.799661459998638e-05, "loss": 0.3451, "step": 2989 }, { "epoch": 3.1876332622601278, "grad_norm": 0.15073757914110497, "learning_rate": 2.7968218924570757e-05, "loss": 0.3499, "step": 2990 }, { "epoch": 3.1886993603411513, "grad_norm": 0.149109855986733, "learning_rate": 2.7939829914136533e-05, "loss": 0.3505, "step": 2991 }, { "epoch": 3.189765458422175, "grad_norm": 0.1828583325585577, "learning_rate": 2.791144758440975e-05, "loss": 0.3534, "step": 2992 }, { "epoch": 3.1908315565031984, "grad_norm": 0.1835467829513033, "learning_rate": 2.788307195111276e-05, "loss": 0.3571, "step": 2993 }, { "epoch": 3.191897654584222, "grad_norm": 0.14112165071165333, "learning_rate": 2.7854703029964157e-05, "loss": 0.349, "step": 2994 }, { "epoch": 3.192963752665245, "grad_norm": 0.14994590904845737, "learning_rate": 2.7826340836678868e-05, "loss": 0.3512, "step": 2995 }, { "epoch": 3.1940298507462686, "grad_norm": 0.15001218329008587, "learning_rate": 2.779798538696807e-05, "loss": 0.3508, "step": 2996 }, { "epoch": 3.195095948827292, "grad_norm": 0.14167484348076181, "learning_rate": 2.776963669653923e-05, "loss": 0.3531, "step": 2997 }, { "epoch": 3.1961620469083156, "grad_norm": 0.16415685787588938, "learning_rate": 2.7741294781096008e-05, "loss": 0.3475, "step": 2998 }, { "epoch": 3.197228144989339, "grad_norm": 0.18767514637702254, "learning_rate": 2.7712959656338375e-05, "loss": 0.3503, "step": 2999 }, { "epoch": 3.1982942430703627, "grad_norm": 0.1318270842245805, "learning_rate": 2.7684631337962535e-05, "loss": 0.3483, "step": 3000 }, { "epoch": 3.199360341151386, "grad_norm": 0.17743086770734862, "learning_rate": 2.7656309841660864e-05, "loss": 0.3497, "step": 3001 }, { "epoch": 3.2004264392324093, "grad_norm": 0.20226460565691426, "learning_rate": 2.7627995183122025e-05, "loss": 0.3548, "step": 3002 }, { "epoch": 3.201492537313433, "grad_norm": 0.1404795955564027, "learning_rate": 2.7599687378030862e-05, "loss": 0.3456, "step": 3003 }, { "epoch": 3.2025586353944564, "grad_norm": 0.1664198795587612, "learning_rate": 2.7571386442068443e-05, "loss": 0.348, "step": 3004 }, { "epoch": 3.20362473347548, "grad_norm": 0.1871561566919217, "learning_rate": 2.754309239091199e-05, "loss": 0.3487, "step": 3005 }, { "epoch": 3.204690831556503, "grad_norm": 0.1762158383532131, "learning_rate": 2.7514805240234942e-05, "loss": 0.3489, "step": 3006 }, { "epoch": 3.2057569296375266, "grad_norm": 0.1409821970880933, "learning_rate": 2.7486525005706915e-05, "loss": 0.3462, "step": 3007 }, { "epoch": 3.20682302771855, "grad_norm": 0.1919928381276622, "learning_rate": 2.745825170299371e-05, "loss": 0.3431, "step": 3008 }, { "epoch": 3.2078891257995736, "grad_norm": 0.14051119559763037, "learning_rate": 2.7429985347757232e-05, "loss": 0.3448, "step": 3009 }, { "epoch": 3.208955223880597, "grad_norm": 0.1581183828916611, "learning_rate": 2.7401725955655582e-05, "loss": 0.3494, "step": 3010 }, { "epoch": 3.2100213219616203, "grad_norm": 0.16965574848535128, "learning_rate": 2.7373473542343023e-05, "loss": 0.3447, "step": 3011 }, { "epoch": 3.211087420042644, "grad_norm": 0.1596889434072171, "learning_rate": 2.7345228123469886e-05, "loss": 0.3483, "step": 3012 }, { "epoch": 3.2121535181236673, "grad_norm": 0.17408804946301867, "learning_rate": 2.731698971468268e-05, "loss": 0.3567, "step": 3013 }, { "epoch": 3.213219616204691, "grad_norm": 0.16081014646576602, "learning_rate": 2.7288758331624025e-05, "loss": 0.3447, "step": 3014 }, { "epoch": 3.2142857142857144, "grad_norm": 0.1387088635896051, "learning_rate": 2.7260533989932628e-05, "loss": 0.3512, "step": 3015 }, { "epoch": 3.2153518123667375, "grad_norm": 0.15123466339681416, "learning_rate": 2.7232316705243305e-05, "loss": 0.3481, "step": 3016 }, { "epoch": 3.216417910447761, "grad_norm": 0.15290558325073092, "learning_rate": 2.720410649318698e-05, "loss": 0.3488, "step": 3017 }, { "epoch": 3.2174840085287846, "grad_norm": 0.17149168701297954, "learning_rate": 2.7175903369390638e-05, "loss": 0.3517, "step": 3018 }, { "epoch": 3.218550106609808, "grad_norm": 0.16114167670939453, "learning_rate": 2.7147707349477327e-05, "loss": 0.352, "step": 3019 }, { "epoch": 3.2196162046908317, "grad_norm": 0.14097634690323843, "learning_rate": 2.7119518449066205e-05, "loss": 0.349, "step": 3020 }, { "epoch": 3.220682302771855, "grad_norm": 0.16246129452859187, "learning_rate": 2.7091336683772437e-05, "loss": 0.3496, "step": 3021 }, { "epoch": 3.2217484008528783, "grad_norm": 0.16336429392891585, "learning_rate": 2.7063162069207262e-05, "loss": 0.3474, "step": 3022 }, { "epoch": 3.222814498933902, "grad_norm": 0.14988643488196962, "learning_rate": 2.7034994620977965e-05, "loss": 0.3477, "step": 3023 }, { "epoch": 3.2238805970149254, "grad_norm": 0.16792178240326763, "learning_rate": 2.700683435468782e-05, "loss": 0.3515, "step": 3024 }, { "epoch": 3.224946695095949, "grad_norm": 0.14865516690552993, "learning_rate": 2.6978681285936176e-05, "loss": 0.3535, "step": 3025 }, { "epoch": 3.2260127931769724, "grad_norm": 0.15929449524500977, "learning_rate": 2.6950535430318373e-05, "loss": 0.3519, "step": 3026 }, { "epoch": 3.227078891257996, "grad_norm": 0.1599091744019697, "learning_rate": 2.692239680342572e-05, "loss": 0.3529, "step": 3027 }, { "epoch": 3.228144989339019, "grad_norm": 0.1484966105614425, "learning_rate": 2.689426542084558e-05, "loss": 0.3579, "step": 3028 }, { "epoch": 3.2292110874200426, "grad_norm": 0.1385274858632669, "learning_rate": 2.686614129816129e-05, "loss": 0.3497, "step": 3029 }, { "epoch": 3.230277185501066, "grad_norm": 0.14991864182995415, "learning_rate": 2.683802445095211e-05, "loss": 0.3525, "step": 3030 }, { "epoch": 3.2313432835820897, "grad_norm": 0.15476961485342514, "learning_rate": 2.6809914894793344e-05, "loss": 0.3424, "step": 3031 }, { "epoch": 3.232409381663113, "grad_norm": 0.12397588075589204, "learning_rate": 2.6781812645256216e-05, "loss": 0.3426, "step": 3032 }, { "epoch": 3.2334754797441363, "grad_norm": 0.15055980357741625, "learning_rate": 2.6753717717907925e-05, "loss": 0.3515, "step": 3033 }, { "epoch": 3.23454157782516, "grad_norm": 0.14872467882760348, "learning_rate": 2.672563012831158e-05, "loss": 0.3456, "step": 3034 }, { "epoch": 3.2356076759061834, "grad_norm": 0.14261242873484498, "learning_rate": 2.6697549892026247e-05, "loss": 0.3544, "step": 3035 }, { "epoch": 3.236673773987207, "grad_norm": 0.15857980357826498, "learning_rate": 2.666947702460693e-05, "loss": 0.3546, "step": 3036 }, { "epoch": 3.2377398720682304, "grad_norm": 0.16110840435766394, "learning_rate": 2.6641411541604544e-05, "loss": 0.3536, "step": 3037 }, { "epoch": 3.2388059701492535, "grad_norm": 0.16212693241700432, "learning_rate": 2.6613353458565887e-05, "loss": 0.3511, "step": 3038 }, { "epoch": 3.239872068230277, "grad_norm": 0.14587623221049775, "learning_rate": 2.6585302791033688e-05, "loss": 0.3522, "step": 3039 }, { "epoch": 3.2409381663113006, "grad_norm": 0.13597680071298357, "learning_rate": 2.6557259554546577e-05, "loss": 0.3447, "step": 3040 }, { "epoch": 3.242004264392324, "grad_norm": 0.16519790924407182, "learning_rate": 2.6529223764639013e-05, "loss": 0.3531, "step": 3041 }, { "epoch": 3.2430703624733477, "grad_norm": 0.12865148005181068, "learning_rate": 2.650119543684139e-05, "loss": 0.3453, "step": 3042 }, { "epoch": 3.2441364605543708, "grad_norm": 0.16931532502182053, "learning_rate": 2.6473174586679947e-05, "loss": 0.3481, "step": 3043 }, { "epoch": 3.2452025586353943, "grad_norm": 0.20153378061586566, "learning_rate": 2.644516122967678e-05, "loss": 0.3474, "step": 3044 }, { "epoch": 3.246268656716418, "grad_norm": 0.15391567159601363, "learning_rate": 2.6417155381349814e-05, "loss": 0.3555, "step": 3045 }, { "epoch": 3.2473347547974414, "grad_norm": 0.18485138313641797, "learning_rate": 2.638915705721284e-05, "loss": 0.348, "step": 3046 }, { "epoch": 3.248400852878465, "grad_norm": 0.19279732548240924, "learning_rate": 2.6361166272775503e-05, "loss": 0.3473, "step": 3047 }, { "epoch": 3.2494669509594885, "grad_norm": 0.14914227409372177, "learning_rate": 2.6333183043543207e-05, "loss": 0.3491, "step": 3048 }, { "epoch": 3.2505330490405115, "grad_norm": 0.198163678392842, "learning_rate": 2.630520738501721e-05, "loss": 0.3537, "step": 3049 }, { "epoch": 3.251599147121535, "grad_norm": 0.1495697517853574, "learning_rate": 2.6277239312694596e-05, "loss": 0.3521, "step": 3050 }, { "epoch": 3.2526652452025586, "grad_norm": 0.18049308488761015, "learning_rate": 2.624927884206821e-05, "loss": 0.3559, "step": 3051 }, { "epoch": 3.253731343283582, "grad_norm": 0.1554880781007249, "learning_rate": 2.6221325988626686e-05, "loss": 0.3474, "step": 3052 }, { "epoch": 3.2547974413646057, "grad_norm": 0.17014877400454104, "learning_rate": 2.619338076785448e-05, "loss": 0.3471, "step": 3053 }, { "epoch": 3.2558635394456292, "grad_norm": 0.2175697746157974, "learning_rate": 2.6165443195231763e-05, "loss": 0.3526, "step": 3054 }, { "epoch": 3.2569296375266523, "grad_norm": 0.1533228757448852, "learning_rate": 2.6137513286234528e-05, "loss": 0.355, "step": 3055 }, { "epoch": 3.257995735607676, "grad_norm": 0.170158319003947, "learning_rate": 2.6109591056334474e-05, "loss": 0.3497, "step": 3056 }, { "epoch": 3.2590618336886994, "grad_norm": 0.2110756154467986, "learning_rate": 2.608167652099906e-05, "loss": 0.3439, "step": 3057 }, { "epoch": 3.260127931769723, "grad_norm": 0.14837278713007337, "learning_rate": 2.6053769695691507e-05, "loss": 0.3423, "step": 3058 }, { "epoch": 3.2611940298507465, "grad_norm": 0.1809824101682613, "learning_rate": 2.6025870595870733e-05, "loss": 0.3487, "step": 3059 }, { "epoch": 3.2622601279317696, "grad_norm": 0.21033652261955751, "learning_rate": 2.5997979236991386e-05, "loss": 0.3527, "step": 3060 }, { "epoch": 3.263326226012793, "grad_norm": 0.21776809754829038, "learning_rate": 2.5970095634503833e-05, "loss": 0.3472, "step": 3061 }, { "epoch": 3.2643923240938166, "grad_norm": 0.15831570347972593, "learning_rate": 2.5942219803854168e-05, "loss": 0.3437, "step": 3062 }, { "epoch": 3.26545842217484, "grad_norm": 0.17838265493780678, "learning_rate": 2.59143517604841e-05, "loss": 0.3476, "step": 3063 }, { "epoch": 3.2665245202558637, "grad_norm": 0.20607722523788694, "learning_rate": 2.588649151983111e-05, "loss": 0.3472, "step": 3064 }, { "epoch": 3.267590618336887, "grad_norm": 0.13745662304832107, "learning_rate": 2.5858639097328314e-05, "loss": 0.351, "step": 3065 }, { "epoch": 3.2686567164179103, "grad_norm": 0.1941806894488011, "learning_rate": 2.583079450840453e-05, "loss": 0.3537, "step": 3066 }, { "epoch": 3.269722814498934, "grad_norm": 0.21824169614328015, "learning_rate": 2.5802957768484173e-05, "loss": 0.3461, "step": 3067 }, { "epoch": 3.2707889125799574, "grad_norm": 0.16071832337632452, "learning_rate": 2.5775128892987368e-05, "loss": 0.3505, "step": 3068 }, { "epoch": 3.271855010660981, "grad_norm": 0.163987167314734, "learning_rate": 2.574730789732989e-05, "loss": 0.345, "step": 3069 }, { "epoch": 3.272921108742004, "grad_norm": 0.19905464540458992, "learning_rate": 2.5719494796923085e-05, "loss": 0.3574, "step": 3070 }, { "epoch": 3.2739872068230276, "grad_norm": 0.16383647964999232, "learning_rate": 2.569168960717398e-05, "loss": 0.3495, "step": 3071 }, { "epoch": 3.275053304904051, "grad_norm": 0.17693978324786094, "learning_rate": 2.5663892343485214e-05, "loss": 0.3499, "step": 3072 }, { "epoch": 3.2761194029850746, "grad_norm": 0.18500426824572122, "learning_rate": 2.5636103021255026e-05, "loss": 0.3549, "step": 3073 }, { "epoch": 3.277185501066098, "grad_norm": 0.19504056601794661, "learning_rate": 2.5608321655877243e-05, "loss": 0.3525, "step": 3074 }, { "epoch": 3.2782515991471217, "grad_norm": 0.16037712681498448, "learning_rate": 2.5580548262741304e-05, "loss": 0.3474, "step": 3075 }, { "epoch": 3.279317697228145, "grad_norm": 0.17838055168396916, "learning_rate": 2.5552782857232238e-05, "loss": 0.3519, "step": 3076 }, { "epoch": 3.2803837953091683, "grad_norm": 0.18837820237068092, "learning_rate": 2.5525025454730612e-05, "loss": 0.3497, "step": 3077 }, { "epoch": 3.281449893390192, "grad_norm": 0.1403381013776277, "learning_rate": 2.54972760706126e-05, "loss": 0.3505, "step": 3078 }, { "epoch": 3.2825159914712154, "grad_norm": 0.234296775090172, "learning_rate": 2.546953472024991e-05, "loss": 0.3502, "step": 3079 }, { "epoch": 3.283582089552239, "grad_norm": 0.17074806676345755, "learning_rate": 2.5441801419009835e-05, "loss": 0.3449, "step": 3080 }, { "epoch": 3.2846481876332625, "grad_norm": 0.17367375722672437, "learning_rate": 2.541407618225515e-05, "loss": 0.3503, "step": 3081 }, { "epoch": 3.2857142857142856, "grad_norm": 0.22550229833239124, "learning_rate": 2.53863590253442e-05, "loss": 0.3477, "step": 3082 }, { "epoch": 3.286780383795309, "grad_norm": 0.13550221130573722, "learning_rate": 2.5358649963630867e-05, "loss": 0.3449, "step": 3083 }, { "epoch": 3.2878464818763327, "grad_norm": 0.2501969239886366, "learning_rate": 2.533094901246452e-05, "loss": 0.3528, "step": 3084 }, { "epoch": 3.288912579957356, "grad_norm": 0.20047206624632516, "learning_rate": 2.5303256187190038e-05, "loss": 0.3466, "step": 3085 }, { "epoch": 3.2899786780383797, "grad_norm": 0.14385706915661878, "learning_rate": 2.527557150314783e-05, "loss": 0.348, "step": 3086 }, { "epoch": 3.291044776119403, "grad_norm": 0.23455244794599786, "learning_rate": 2.524789497567375e-05, "loss": 0.3575, "step": 3087 }, { "epoch": 3.2921108742004264, "grad_norm": 0.19720110623267217, "learning_rate": 2.522022662009916e-05, "loss": 0.355, "step": 3088 }, { "epoch": 3.29317697228145, "grad_norm": 0.16009116202356855, "learning_rate": 2.5192566451750904e-05, "loss": 0.3483, "step": 3089 }, { "epoch": 3.2942430703624734, "grad_norm": 0.19300246066961, "learning_rate": 2.516491448595126e-05, "loss": 0.3537, "step": 3090 }, { "epoch": 3.295309168443497, "grad_norm": 0.17653040799060407, "learning_rate": 2.5137270738018e-05, "loss": 0.3463, "step": 3091 }, { "epoch": 3.29637526652452, "grad_norm": 0.19537049403171847, "learning_rate": 2.5109635223264305e-05, "loss": 0.3424, "step": 3092 }, { "epoch": 3.2974413646055436, "grad_norm": 0.18015777011327852, "learning_rate": 2.5082007956998817e-05, "loss": 0.3476, "step": 3093 }, { "epoch": 3.298507462686567, "grad_norm": 0.20459512311591024, "learning_rate": 2.505438895452562e-05, "loss": 0.351, "step": 3094 }, { "epoch": 3.2995735607675907, "grad_norm": 0.12366732649317048, "learning_rate": 2.5026778231144194e-05, "loss": 0.354, "step": 3095 }, { "epoch": 3.300639658848614, "grad_norm": 0.18890110077159672, "learning_rate": 2.4999175802149438e-05, "loss": 0.3523, "step": 3096 }, { "epoch": 3.3017057569296373, "grad_norm": 0.1626946147400098, "learning_rate": 2.4971581682831668e-05, "loss": 0.3527, "step": 3097 }, { "epoch": 3.302771855010661, "grad_norm": 0.14801464468188533, "learning_rate": 2.494399588847662e-05, "loss": 0.3533, "step": 3098 }, { "epoch": 3.3038379530916844, "grad_norm": 0.1593500938211409, "learning_rate": 2.4916418434365346e-05, "loss": 0.3528, "step": 3099 }, { "epoch": 3.304904051172708, "grad_norm": 0.19644211588346217, "learning_rate": 2.488884933577434e-05, "loss": 0.3454, "step": 3100 }, { "epoch": 3.3059701492537314, "grad_norm": 0.1353438588547366, "learning_rate": 2.4861288607975458e-05, "loss": 0.3516, "step": 3101 }, { "epoch": 3.307036247334755, "grad_norm": 0.17358657320991785, "learning_rate": 2.4833736266235917e-05, "loss": 0.3499, "step": 3102 }, { "epoch": 3.308102345415778, "grad_norm": 0.15831348237018122, "learning_rate": 2.4806192325818258e-05, "loss": 0.3531, "step": 3103 }, { "epoch": 3.3091684434968016, "grad_norm": 0.13584971050526615, "learning_rate": 2.47786568019804e-05, "loss": 0.3593, "step": 3104 }, { "epoch": 3.310234541577825, "grad_norm": 0.15418755306420462, "learning_rate": 2.475112970997562e-05, "loss": 0.341, "step": 3105 }, { "epoch": 3.3113006396588487, "grad_norm": 0.1615129100696353, "learning_rate": 2.472361106505245e-05, "loss": 0.3514, "step": 3106 }, { "epoch": 3.3123667377398722, "grad_norm": 0.15930398883446692, "learning_rate": 2.4696100882454817e-05, "loss": 0.351, "step": 3107 }, { "epoch": 3.3134328358208958, "grad_norm": 0.1403230514001666, "learning_rate": 2.466859917742193e-05, "loss": 0.3505, "step": 3108 }, { "epoch": 3.314498933901919, "grad_norm": 0.17920374641541106, "learning_rate": 2.464110596518831e-05, "loss": 0.3449, "step": 3109 }, { "epoch": 3.3155650319829424, "grad_norm": 0.1254186924644907, "learning_rate": 2.4613621260983755e-05, "loss": 0.3461, "step": 3110 }, { "epoch": 3.316631130063966, "grad_norm": 0.13780163296736875, "learning_rate": 2.458614508003336e-05, "loss": 0.3511, "step": 3111 }, { "epoch": 3.3176972281449895, "grad_norm": 0.143814949009123, "learning_rate": 2.455867743755751e-05, "loss": 0.3505, "step": 3112 }, { "epoch": 3.318763326226013, "grad_norm": 0.12209116221359133, "learning_rate": 2.4531218348771866e-05, "loss": 0.3443, "step": 3113 }, { "epoch": 3.319829424307036, "grad_norm": 0.14799939510773816, "learning_rate": 2.450376782888731e-05, "loss": 0.3471, "step": 3114 }, { "epoch": 3.3208955223880596, "grad_norm": 0.14123259883639513, "learning_rate": 2.4476325893110008e-05, "loss": 0.3502, "step": 3115 }, { "epoch": 3.321961620469083, "grad_norm": 0.14375390587624387, "learning_rate": 2.4448892556641393e-05, "loss": 0.3463, "step": 3116 }, { "epoch": 3.3230277185501067, "grad_norm": 0.17421629802465335, "learning_rate": 2.4421467834678067e-05, "loss": 0.3475, "step": 3117 }, { "epoch": 3.3240938166311302, "grad_norm": 0.14674808581087775, "learning_rate": 2.439405174241192e-05, "loss": 0.3451, "step": 3118 }, { "epoch": 3.3251599147121533, "grad_norm": 0.21187149354636753, "learning_rate": 2.4366644295030054e-05, "loss": 0.3483, "step": 3119 }, { "epoch": 3.326226012793177, "grad_norm": 0.2012599089170034, "learning_rate": 2.433924550771476e-05, "loss": 0.3483, "step": 3120 }, { "epoch": 3.3272921108742004, "grad_norm": 0.16780980496521763, "learning_rate": 2.4311855395643527e-05, "loss": 0.3462, "step": 3121 }, { "epoch": 3.328358208955224, "grad_norm": 0.16005417142229794, "learning_rate": 2.428447397398908e-05, "loss": 0.3533, "step": 3122 }, { "epoch": 3.3294243070362475, "grad_norm": 0.15099935350620822, "learning_rate": 2.425710125791929e-05, "loss": 0.35, "step": 3123 }, { "epoch": 3.3304904051172706, "grad_norm": 0.15768848577018113, "learning_rate": 2.4229737262597216e-05, "loss": 0.3498, "step": 3124 }, { "epoch": 3.331556503198294, "grad_norm": 0.155851400636398, "learning_rate": 2.4202382003181098e-05, "loss": 0.3557, "step": 3125 }, { "epoch": 3.3326226012793176, "grad_norm": 0.14267053890933395, "learning_rate": 2.4175035494824316e-05, "loss": 0.3486, "step": 3126 }, { "epoch": 3.333688699360341, "grad_norm": 0.15843384932692273, "learning_rate": 2.4147697752675433e-05, "loss": 0.3515, "step": 3127 }, { "epoch": 3.3347547974413647, "grad_norm": 0.17371867125289828, "learning_rate": 2.4120368791878122e-05, "loss": 0.3491, "step": 3128 }, { "epoch": 3.3358208955223883, "grad_norm": 0.16084952359302623, "learning_rate": 2.4093048627571206e-05, "loss": 0.3529, "step": 3129 }, { "epoch": 3.3368869936034113, "grad_norm": 0.14408159186617614, "learning_rate": 2.4065737274888646e-05, "loss": 0.3525, "step": 3130 }, { "epoch": 3.337953091684435, "grad_norm": 0.1656284179555205, "learning_rate": 2.403843474895952e-05, "loss": 0.3486, "step": 3131 }, { "epoch": 3.3390191897654584, "grad_norm": 0.14146974935511217, "learning_rate": 2.401114106490798e-05, "loss": 0.3482, "step": 3132 }, { "epoch": 3.340085287846482, "grad_norm": 0.1573564750602216, "learning_rate": 2.3983856237853322e-05, "loss": 0.3543, "step": 3133 }, { "epoch": 3.3411513859275055, "grad_norm": 0.1757062911156589, "learning_rate": 2.395658028290995e-05, "loss": 0.3538, "step": 3134 }, { "epoch": 3.342217484008529, "grad_norm": 0.13319696217453428, "learning_rate": 2.3929313215187274e-05, "loss": 0.3488, "step": 3135 }, { "epoch": 3.343283582089552, "grad_norm": 0.15216598233478518, "learning_rate": 2.390205504978986e-05, "loss": 0.3574, "step": 3136 }, { "epoch": 3.3443496801705757, "grad_norm": 0.14909080713095296, "learning_rate": 2.3874805801817313e-05, "loss": 0.3488, "step": 3137 }, { "epoch": 3.345415778251599, "grad_norm": 0.12886400979675366, "learning_rate": 2.384756548636432e-05, "loss": 0.3474, "step": 3138 }, { "epoch": 3.3464818763326227, "grad_norm": 0.17178937827304538, "learning_rate": 2.3820334118520566e-05, "loss": 0.3487, "step": 3139 }, { "epoch": 3.3475479744136463, "grad_norm": 0.14792535442811752, "learning_rate": 2.3793111713370824e-05, "loss": 0.3493, "step": 3140 }, { "epoch": 3.3486140724946694, "grad_norm": 0.13063840167660581, "learning_rate": 2.3765898285994898e-05, "loss": 0.3436, "step": 3141 }, { "epoch": 3.349680170575693, "grad_norm": 0.1415902890105865, "learning_rate": 2.3738693851467627e-05, "loss": 0.3449, "step": 3142 }, { "epoch": 3.3507462686567164, "grad_norm": 0.14468858858404718, "learning_rate": 2.371149842485882e-05, "loss": 0.3515, "step": 3143 }, { "epoch": 3.35181236673774, "grad_norm": 0.12569229542636323, "learning_rate": 2.3684312021233353e-05, "loss": 0.3531, "step": 3144 }, { "epoch": 3.3528784648187635, "grad_norm": 0.13410217015876344, "learning_rate": 2.3657134655651085e-05, "loss": 0.3493, "step": 3145 }, { "epoch": 3.3539445628997866, "grad_norm": 0.12050748503992254, "learning_rate": 2.3629966343166836e-05, "loss": 0.3475, "step": 3146 }, { "epoch": 3.35501066098081, "grad_norm": 0.13625653337833818, "learning_rate": 2.3602807098830462e-05, "loss": 0.3527, "step": 3147 }, { "epoch": 3.3560767590618337, "grad_norm": 0.1270892484979118, "learning_rate": 2.3575656937686765e-05, "loss": 0.3451, "step": 3148 }, { "epoch": 3.357142857142857, "grad_norm": 0.1292034055414621, "learning_rate": 2.3548515874775547e-05, "loss": 0.3468, "step": 3149 }, { "epoch": 3.3582089552238807, "grad_norm": 0.13474465507291278, "learning_rate": 2.3521383925131508e-05, "loss": 0.3475, "step": 3150 }, { "epoch": 3.359275053304904, "grad_norm": 0.14535330585186346, "learning_rate": 2.349426110378435e-05, "loss": 0.349, "step": 3151 }, { "epoch": 3.3603411513859274, "grad_norm": 0.14304977788811093, "learning_rate": 2.3467147425758735e-05, "loss": 0.3497, "step": 3152 }, { "epoch": 3.361407249466951, "grad_norm": 0.13088921289643768, "learning_rate": 2.3440042906074187e-05, "loss": 0.3476, "step": 3153 }, { "epoch": 3.3624733475479744, "grad_norm": 0.13796584925331404, "learning_rate": 2.3412947559745226e-05, "loss": 0.3463, "step": 3154 }, { "epoch": 3.363539445628998, "grad_norm": 0.1388788022895029, "learning_rate": 2.338586140178127e-05, "loss": 0.3485, "step": 3155 }, { "epoch": 3.364605543710021, "grad_norm": 0.12618907252498665, "learning_rate": 2.335878444718663e-05, "loss": 0.3547, "step": 3156 }, { "epoch": 3.3656716417910446, "grad_norm": 0.14196252167717818, "learning_rate": 2.3331716710960536e-05, "loss": 0.3495, "step": 3157 }, { "epoch": 3.366737739872068, "grad_norm": 0.13807258992426674, "learning_rate": 2.3304658208097105e-05, "loss": 0.3498, "step": 3158 }, { "epoch": 3.3678038379530917, "grad_norm": 0.12981419614673304, "learning_rate": 2.3277608953585346e-05, "loss": 0.3441, "step": 3159 }, { "epoch": 3.368869936034115, "grad_norm": 0.16657533516126402, "learning_rate": 2.3250568962409155e-05, "loss": 0.353, "step": 3160 }, { "epoch": 3.3699360341151388, "grad_norm": 0.12148830285530396, "learning_rate": 2.322353824954725e-05, "loss": 0.3523, "step": 3161 }, { "epoch": 3.3710021321961623, "grad_norm": 0.1706380629428814, "learning_rate": 2.319651682997325e-05, "loss": 0.3487, "step": 3162 }, { "epoch": 3.3720682302771854, "grad_norm": 0.14429216152158644, "learning_rate": 2.316950471865564e-05, "loss": 0.3511, "step": 3163 }, { "epoch": 3.373134328358209, "grad_norm": 0.13448516843067299, "learning_rate": 2.31425019305577e-05, "loss": 0.3542, "step": 3164 }, { "epoch": 3.3742004264392325, "grad_norm": 0.16508680435233514, "learning_rate": 2.3115508480637575e-05, "loss": 0.3477, "step": 3165 }, { "epoch": 3.375266524520256, "grad_norm": 0.1326808111603962, "learning_rate": 2.308852438384824e-05, "loss": 0.349, "step": 3166 }, { "epoch": 3.3763326226012795, "grad_norm": 0.14047757472242012, "learning_rate": 2.3061549655137498e-05, "loss": 0.3513, "step": 3167 }, { "epoch": 3.3773987206823026, "grad_norm": 0.1441083727611898, "learning_rate": 2.3034584309447913e-05, "loss": 0.3479, "step": 3168 }, { "epoch": 3.378464818763326, "grad_norm": 0.13591931890052086, "learning_rate": 2.3007628361716902e-05, "loss": 0.3515, "step": 3169 }, { "epoch": 3.3795309168443497, "grad_norm": 0.1255576121479874, "learning_rate": 2.298068182687666e-05, "loss": 0.3519, "step": 3170 }, { "epoch": 3.3805970149253732, "grad_norm": 0.14967290050741613, "learning_rate": 2.295374471985418e-05, "loss": 0.3482, "step": 3171 }, { "epoch": 3.3816631130063968, "grad_norm": 0.12421430145394535, "learning_rate": 2.2926817055571194e-05, "loss": 0.3526, "step": 3172 }, { "epoch": 3.38272921108742, "grad_norm": 0.17262897437577504, "learning_rate": 2.289989884894425e-05, "loss": 0.3442, "step": 3173 }, { "epoch": 3.3837953091684434, "grad_norm": 0.14546213806847885, "learning_rate": 2.287299011488461e-05, "loss": 0.3531, "step": 3174 }, { "epoch": 3.384861407249467, "grad_norm": 0.1574355823375362, "learning_rate": 2.2846090868298333e-05, "loss": 0.3474, "step": 3175 }, { "epoch": 3.3859275053304905, "grad_norm": 0.15336705073042523, "learning_rate": 2.2819201124086216e-05, "loss": 0.3503, "step": 3176 }, { "epoch": 3.386993603411514, "grad_norm": 0.13640452357535515, "learning_rate": 2.279232089714374e-05, "loss": 0.3505, "step": 3177 }, { "epoch": 3.388059701492537, "grad_norm": 0.15004013469173, "learning_rate": 2.2765450202361186e-05, "loss": 0.3519, "step": 3178 }, { "epoch": 3.3891257995735606, "grad_norm": 0.14151494212433288, "learning_rate": 2.273858905462353e-05, "loss": 0.3518, "step": 3179 }, { "epoch": 3.390191897654584, "grad_norm": 0.1345161110814909, "learning_rate": 2.2711737468810418e-05, "loss": 0.3481, "step": 3180 }, { "epoch": 3.3912579957356077, "grad_norm": 0.12660333007474672, "learning_rate": 2.268489545979625e-05, "loss": 0.3484, "step": 3181 }, { "epoch": 3.3923240938166312, "grad_norm": 0.13060878781996574, "learning_rate": 2.265806304245012e-05, "loss": 0.349, "step": 3182 }, { "epoch": 3.3933901918976543, "grad_norm": 0.13348764262187832, "learning_rate": 2.263124023163576e-05, "loss": 0.3471, "step": 3183 }, { "epoch": 3.394456289978678, "grad_norm": 0.12898484952700903, "learning_rate": 2.2604427042211633e-05, "loss": 0.3521, "step": 3184 }, { "epoch": 3.3955223880597014, "grad_norm": 0.14350540973892006, "learning_rate": 2.2577623489030865e-05, "loss": 0.3485, "step": 3185 }, { "epoch": 3.396588486140725, "grad_norm": 0.11886459992083558, "learning_rate": 2.25508295869412e-05, "loss": 0.3518, "step": 3186 }, { "epoch": 3.3976545842217485, "grad_norm": 0.12711816829481948, "learning_rate": 2.2524045350785088e-05, "loss": 0.3508, "step": 3187 }, { "epoch": 3.398720682302772, "grad_norm": 0.12078532866593578, "learning_rate": 2.2497270795399598e-05, "loss": 0.3519, "step": 3188 }, { "epoch": 3.399786780383795, "grad_norm": 0.14360930786740936, "learning_rate": 2.2470505935616457e-05, "loss": 0.3496, "step": 3189 }, { "epoch": 3.4008528784648187, "grad_norm": 0.12963425574071996, "learning_rate": 2.2443750786261976e-05, "loss": 0.3553, "step": 3190 }, { "epoch": 3.401918976545842, "grad_norm": 0.13049268581835108, "learning_rate": 2.2417005362157135e-05, "loss": 0.3517, "step": 3191 }, { "epoch": 3.4029850746268657, "grad_norm": 0.15413582639564263, "learning_rate": 2.2390269678117525e-05, "loss": 0.3517, "step": 3192 }, { "epoch": 3.4040511727078893, "grad_norm": 0.12663844292929444, "learning_rate": 2.2363543748953296e-05, "loss": 0.3466, "step": 3193 }, { "epoch": 3.405117270788913, "grad_norm": 0.13821512264352942, "learning_rate": 2.2336827589469232e-05, "loss": 0.3523, "step": 3194 }, { "epoch": 3.406183368869936, "grad_norm": 0.11822096798559724, "learning_rate": 2.2310121214464706e-05, "loss": 0.3439, "step": 3195 }, { "epoch": 3.4072494669509594, "grad_norm": 0.12664000688547125, "learning_rate": 2.228342463873367e-05, "loss": 0.3566, "step": 3196 }, { "epoch": 3.408315565031983, "grad_norm": 0.13892075241413948, "learning_rate": 2.2256737877064607e-05, "loss": 0.3459, "step": 3197 }, { "epoch": 3.4093816631130065, "grad_norm": 0.12083777967142596, "learning_rate": 2.2230060944240623e-05, "loss": 0.3519, "step": 3198 }, { "epoch": 3.41044776119403, "grad_norm": 0.14125853339995179, "learning_rate": 2.220339385503934e-05, "loss": 0.351, "step": 3199 }, { "epoch": 3.411513859275053, "grad_norm": 0.13762034609216564, "learning_rate": 2.2176736624232964e-05, "loss": 0.3427, "step": 3200 }, { "epoch": 3.4125799573560767, "grad_norm": 0.17537017660770718, "learning_rate": 2.2150089266588173e-05, "loss": 0.356, "step": 3201 }, { "epoch": 3.4136460554371, "grad_norm": 0.13074766627409623, "learning_rate": 2.2123451796866247e-05, "loss": 0.3531, "step": 3202 }, { "epoch": 3.4147121535181237, "grad_norm": 0.15980968926250733, "learning_rate": 2.2096824229822973e-05, "loss": 0.348, "step": 3203 }, { "epoch": 3.4157782515991473, "grad_norm": 0.1180753762858572, "learning_rate": 2.2070206580208598e-05, "loss": 0.3505, "step": 3204 }, { "epoch": 3.4168443496801704, "grad_norm": 0.14397710716700302, "learning_rate": 2.2043598862767937e-05, "loss": 0.3539, "step": 3205 }, { "epoch": 3.417910447761194, "grad_norm": 0.15418073714942476, "learning_rate": 2.2017001092240288e-05, "loss": 0.349, "step": 3206 }, { "epoch": 3.4189765458422174, "grad_norm": 0.15081085999678343, "learning_rate": 2.1990413283359447e-05, "loss": 0.3483, "step": 3207 }, { "epoch": 3.420042643923241, "grad_norm": 0.11082213481326116, "learning_rate": 2.1963835450853646e-05, "loss": 0.3508, "step": 3208 }, { "epoch": 3.4211087420042645, "grad_norm": 0.15412690956286193, "learning_rate": 2.1937267609445634e-05, "loss": 0.3537, "step": 3209 }, { "epoch": 3.4221748400852876, "grad_norm": 0.13940387381283997, "learning_rate": 2.191070977385264e-05, "loss": 0.3509, "step": 3210 }, { "epoch": 3.423240938166311, "grad_norm": 0.12890974324334237, "learning_rate": 2.1884161958786283e-05, "loss": 0.3516, "step": 3211 }, { "epoch": 3.4243070362473347, "grad_norm": 0.14545024164766443, "learning_rate": 2.1857624178952693e-05, "loss": 0.3447, "step": 3212 }, { "epoch": 3.425373134328358, "grad_norm": 0.1330648345375272, "learning_rate": 2.1831096449052424e-05, "loss": 0.3448, "step": 3213 }, { "epoch": 3.4264392324093818, "grad_norm": 0.12671343057546725, "learning_rate": 2.1804578783780465e-05, "loss": 0.3516, "step": 3214 }, { "epoch": 3.4275053304904053, "grad_norm": 0.14329897961042323, "learning_rate": 2.177807119782621e-05, "loss": 0.3501, "step": 3215 }, { "epoch": 3.4285714285714284, "grad_norm": 0.1337530920580984, "learning_rate": 2.175157370587348e-05, "loss": 0.3466, "step": 3216 }, { "epoch": 3.429637526652452, "grad_norm": 0.16912612472386718, "learning_rate": 2.1725086322600526e-05, "loss": 0.3507, "step": 3217 }, { "epoch": 3.4307036247334755, "grad_norm": 0.14266480292217754, "learning_rate": 2.1698609062679985e-05, "loss": 0.3545, "step": 3218 }, { "epoch": 3.431769722814499, "grad_norm": 0.14472729674524853, "learning_rate": 2.167214194077886e-05, "loss": 0.3496, "step": 3219 }, { "epoch": 3.4328358208955225, "grad_norm": 0.16298798619576543, "learning_rate": 2.1645684971558572e-05, "loss": 0.3474, "step": 3220 }, { "epoch": 3.433901918976546, "grad_norm": 0.1468219073239334, "learning_rate": 2.1619238169674918e-05, "loss": 0.353, "step": 3221 }, { "epoch": 3.434968017057569, "grad_norm": 0.19187830974162423, "learning_rate": 2.1592801549778034e-05, "loss": 0.3518, "step": 3222 }, { "epoch": 3.4360341151385927, "grad_norm": 0.17741138849918361, "learning_rate": 2.1566375126512437e-05, "loss": 0.3464, "step": 3223 }, { "epoch": 3.4371002132196162, "grad_norm": 0.1603692083176825, "learning_rate": 2.1539958914517e-05, "loss": 0.3476, "step": 3224 }, { "epoch": 3.4381663113006398, "grad_norm": 0.1663268863315761, "learning_rate": 2.151355292842494e-05, "loss": 0.3496, "step": 3225 }, { "epoch": 3.4392324093816633, "grad_norm": 0.1767760891428736, "learning_rate": 2.1487157182863773e-05, "loss": 0.3465, "step": 3226 }, { "epoch": 3.4402985074626864, "grad_norm": 0.14846390418886782, "learning_rate": 2.1460771692455388e-05, "loss": 0.3572, "step": 3227 }, { "epoch": 3.44136460554371, "grad_norm": 0.1805328018495273, "learning_rate": 2.143439647181597e-05, "loss": 0.3521, "step": 3228 }, { "epoch": 3.4424307036247335, "grad_norm": 0.15018285087651392, "learning_rate": 2.140803153555604e-05, "loss": 0.356, "step": 3229 }, { "epoch": 3.443496801705757, "grad_norm": 0.16606433943154644, "learning_rate": 2.1381676898280372e-05, "loss": 0.3473, "step": 3230 }, { "epoch": 3.4445628997867805, "grad_norm": 0.16838391423331878, "learning_rate": 2.135533257458808e-05, "loss": 0.3482, "step": 3231 }, { "epoch": 3.4456289978678036, "grad_norm": 0.1327187010033925, "learning_rate": 2.1328998579072566e-05, "loss": 0.3562, "step": 3232 }, { "epoch": 3.446695095948827, "grad_norm": 0.15065705283311742, "learning_rate": 2.130267492632146e-05, "loss": 0.3471, "step": 3233 }, { "epoch": 3.4477611940298507, "grad_norm": 0.11852053817686456, "learning_rate": 2.1276361630916718e-05, "loss": 0.3542, "step": 3234 }, { "epoch": 3.4488272921108742, "grad_norm": 0.13152597541802122, "learning_rate": 2.125005870743453e-05, "loss": 0.3539, "step": 3235 }, { "epoch": 3.449893390191898, "grad_norm": 0.14103478135014386, "learning_rate": 2.1223766170445383e-05, "loss": 0.3508, "step": 3236 }, { "epoch": 3.450959488272921, "grad_norm": 0.11147740547754476, "learning_rate": 2.1197484034513927e-05, "loss": 0.3453, "step": 3237 }, { "epoch": 3.4520255863539444, "grad_norm": 0.12515850742634446, "learning_rate": 2.1171212314199117e-05, "loss": 0.3512, "step": 3238 }, { "epoch": 3.453091684434968, "grad_norm": 0.12854446210826828, "learning_rate": 2.1144951024054144e-05, "loss": 0.3471, "step": 3239 }, { "epoch": 3.4541577825159915, "grad_norm": 0.12150324184398577, "learning_rate": 2.111870017862636e-05, "loss": 0.3472, "step": 3240 }, { "epoch": 3.455223880597015, "grad_norm": 0.1402826179243876, "learning_rate": 2.1092459792457384e-05, "loss": 0.355, "step": 3241 }, { "epoch": 3.4562899786780386, "grad_norm": 0.12712753288646708, "learning_rate": 2.1066229880083035e-05, "loss": 0.35, "step": 3242 }, { "epoch": 3.4573560767590616, "grad_norm": 0.13839576676511672, "learning_rate": 2.104001045603333e-05, "loss": 0.3527, "step": 3243 }, { "epoch": 3.458422174840085, "grad_norm": 0.13644673504349322, "learning_rate": 2.1013801534832434e-05, "loss": 0.3509, "step": 3244 }, { "epoch": 3.4594882729211087, "grad_norm": 0.14306136604732483, "learning_rate": 2.0987603130998745e-05, "loss": 0.3507, "step": 3245 }, { "epoch": 3.4605543710021323, "grad_norm": 0.15950680626735908, "learning_rate": 2.096141525904484e-05, "loss": 0.3485, "step": 3246 }, { "epoch": 3.461620469083156, "grad_norm": 0.11275460669446143, "learning_rate": 2.09352379334774e-05, "loss": 0.3529, "step": 3247 }, { "epoch": 3.4626865671641793, "grad_norm": 0.1745871935155396, "learning_rate": 2.0909071168797332e-05, "loss": 0.3509, "step": 3248 }, { "epoch": 3.4637526652452024, "grad_norm": 0.12473410126901693, "learning_rate": 2.0882914979499635e-05, "loss": 0.353, "step": 3249 }, { "epoch": 3.464818763326226, "grad_norm": 0.15474894939169337, "learning_rate": 2.0856769380073497e-05, "loss": 0.3544, "step": 3250 }, { "epoch": 3.4658848614072495, "grad_norm": 0.14452008614896242, "learning_rate": 2.0830634385002234e-05, "loss": 0.3528, "step": 3251 }, { "epoch": 3.466950959488273, "grad_norm": 0.1418673442984818, "learning_rate": 2.080451000876325e-05, "loss": 0.3476, "step": 3252 }, { "epoch": 3.4680170575692966, "grad_norm": 0.16943424994947526, "learning_rate": 2.0778396265828097e-05, "loss": 0.356, "step": 3253 }, { "epoch": 3.4690831556503197, "grad_norm": 0.14164905105719902, "learning_rate": 2.0752293170662453e-05, "loss": 0.3492, "step": 3254 }, { "epoch": 3.470149253731343, "grad_norm": 0.17297216086140343, "learning_rate": 2.0726200737726044e-05, "loss": 0.3456, "step": 3255 }, { "epoch": 3.4712153518123667, "grad_norm": 0.13358545880627057, "learning_rate": 2.0700118981472737e-05, "loss": 0.35, "step": 3256 }, { "epoch": 3.4722814498933903, "grad_norm": 0.130949084809917, "learning_rate": 2.0674047916350472e-05, "loss": 0.3485, "step": 3257 }, { "epoch": 3.473347547974414, "grad_norm": 0.15140183416957245, "learning_rate": 2.0647987556801276e-05, "loss": 0.3488, "step": 3258 }, { "epoch": 3.474413646055437, "grad_norm": 0.13537402309349822, "learning_rate": 2.0621937917261202e-05, "loss": 0.3526, "step": 3259 }, { "epoch": 3.4754797441364604, "grad_norm": 0.12927192577901372, "learning_rate": 2.059589901216042e-05, "loss": 0.3578, "step": 3260 }, { "epoch": 3.476545842217484, "grad_norm": 0.13927688677691658, "learning_rate": 2.0569870855923133e-05, "loss": 0.3514, "step": 3261 }, { "epoch": 3.4776119402985075, "grad_norm": 0.1446653742467127, "learning_rate": 2.0543853462967568e-05, "loss": 0.3476, "step": 3262 }, { "epoch": 3.478678038379531, "grad_norm": 0.14655142804794563, "learning_rate": 2.0517846847706018e-05, "loss": 0.3521, "step": 3263 }, { "epoch": 3.479744136460554, "grad_norm": 0.15775326994652664, "learning_rate": 2.0491851024544798e-05, "loss": 0.3515, "step": 3264 }, { "epoch": 3.4808102345415777, "grad_norm": 0.14124595672889267, "learning_rate": 2.0465866007884254e-05, "loss": 0.3519, "step": 3265 }, { "epoch": 3.481876332622601, "grad_norm": 0.14159196279556782, "learning_rate": 2.0439891812118713e-05, "loss": 0.3467, "step": 3266 }, { "epoch": 3.4829424307036247, "grad_norm": 0.15538385639162236, "learning_rate": 2.0413928451636532e-05, "loss": 0.3478, "step": 3267 }, { "epoch": 3.4840085287846483, "grad_norm": 0.12201107022654291, "learning_rate": 2.038797594082009e-05, "loss": 0.3468, "step": 3268 }, { "epoch": 3.485074626865672, "grad_norm": 0.13632684112183946, "learning_rate": 2.0362034294045694e-05, "loss": 0.3461, "step": 3269 }, { "epoch": 3.486140724946695, "grad_norm": 0.14100032656692765, "learning_rate": 2.0336103525683685e-05, "loss": 0.3519, "step": 3270 }, { "epoch": 3.4872068230277184, "grad_norm": 0.13444787195465405, "learning_rate": 2.0310183650098357e-05, "loss": 0.3495, "step": 3271 }, { "epoch": 3.488272921108742, "grad_norm": 0.12769235515457522, "learning_rate": 2.0284274681647993e-05, "loss": 0.3453, "step": 3272 }, { "epoch": 3.4893390191897655, "grad_norm": 0.12220035736690575, "learning_rate": 2.0258376634684786e-05, "loss": 0.3496, "step": 3273 }, { "epoch": 3.490405117270789, "grad_norm": 0.1568430272842312, "learning_rate": 2.023248952355492e-05, "loss": 0.354, "step": 3274 }, { "epoch": 3.4914712153518126, "grad_norm": 0.11426477449378872, "learning_rate": 2.0206613362598507e-05, "loss": 0.3476, "step": 3275 }, { "epoch": 3.4925373134328357, "grad_norm": 0.15450922714432697, "learning_rate": 2.018074816614962e-05, "loss": 0.351, "step": 3276 }, { "epoch": 3.4936034115138592, "grad_norm": 0.13239244497688343, "learning_rate": 2.0154893948536195e-05, "loss": 0.3516, "step": 3277 }, { "epoch": 3.4946695095948828, "grad_norm": 0.1591598013421958, "learning_rate": 2.0129050724080138e-05, "loss": 0.3499, "step": 3278 }, { "epoch": 3.4957356076759063, "grad_norm": 0.15772096557717954, "learning_rate": 2.0103218507097274e-05, "loss": 0.3497, "step": 3279 }, { "epoch": 3.49680170575693, "grad_norm": 0.12425309706820227, "learning_rate": 2.0077397311897274e-05, "loss": 0.3474, "step": 3280 }, { "epoch": 3.497867803837953, "grad_norm": 0.1506011178175042, "learning_rate": 2.005158715278376e-05, "loss": 0.3472, "step": 3281 }, { "epoch": 3.4989339019189765, "grad_norm": 0.11149966822962049, "learning_rate": 2.0025788044054212e-05, "loss": 0.3504, "step": 3282 }, { "epoch": 3.5, "grad_norm": 0.15273982349883355, "learning_rate": 2.0000000000000012e-05, "loss": 0.3485, "step": 3283 }, { "epoch": 3.5010660980810235, "grad_norm": 0.12108827495133537, "learning_rate": 1.9974223034906362e-05, "loss": 0.3521, "step": 3284 }, { "epoch": 3.502132196162047, "grad_norm": 0.14237030818834184, "learning_rate": 1.9948457163052385e-05, "loss": 0.3519, "step": 3285 }, { "epoch": 3.50319829424307, "grad_norm": 0.11471865230712094, "learning_rate": 1.9922702398711026e-05, "loss": 0.3536, "step": 3286 }, { "epoch": 3.5042643923240937, "grad_norm": 0.13762079947986805, "learning_rate": 1.989695875614911e-05, "loss": 0.3496, "step": 3287 }, { "epoch": 3.5053304904051172, "grad_norm": 0.12762020300370305, "learning_rate": 1.987122624962724e-05, "loss": 0.3483, "step": 3288 }, { "epoch": 3.5063965884861408, "grad_norm": 0.13891339336483755, "learning_rate": 1.9845504893399906e-05, "loss": 0.3494, "step": 3289 }, { "epoch": 3.5074626865671643, "grad_norm": 0.12926973335164987, "learning_rate": 1.9819794701715412e-05, "loss": 0.3489, "step": 3290 }, { "epoch": 3.5085287846481874, "grad_norm": 0.11926976710182068, "learning_rate": 1.9794095688815846e-05, "loss": 0.3496, "step": 3291 }, { "epoch": 3.509594882729211, "grad_norm": 0.1169531192619173, "learning_rate": 1.9768407868937136e-05, "loss": 0.3536, "step": 3292 }, { "epoch": 3.5106609808102345, "grad_norm": 0.11709738090603788, "learning_rate": 1.9742731256308997e-05, "loss": 0.3505, "step": 3293 }, { "epoch": 3.511727078891258, "grad_norm": 0.11720868076698791, "learning_rate": 1.9717065865154962e-05, "loss": 0.3555, "step": 3294 }, { "epoch": 3.5127931769722816, "grad_norm": 0.11598807611455429, "learning_rate": 1.969141170969228e-05, "loss": 0.3492, "step": 3295 }, { "epoch": 3.5138592750533046, "grad_norm": 0.11677301458081207, "learning_rate": 1.9665768804132046e-05, "loss": 0.3513, "step": 3296 }, { "epoch": 3.5149253731343286, "grad_norm": 0.11769361312971446, "learning_rate": 1.9640137162679108e-05, "loss": 0.358, "step": 3297 }, { "epoch": 3.5159914712153517, "grad_norm": 0.11945244360571523, "learning_rate": 1.9614516799532035e-05, "loss": 0.3508, "step": 3298 }, { "epoch": 3.5170575692963753, "grad_norm": 0.10553148957707399, "learning_rate": 1.9588907728883192e-05, "loss": 0.3527, "step": 3299 }, { "epoch": 3.518123667377399, "grad_norm": 0.12506068947753565, "learning_rate": 1.9563309964918664e-05, "loss": 0.3525, "step": 3300 }, { "epoch": 3.5191897654584223, "grad_norm": 0.1274298809654222, "learning_rate": 1.9537723521818307e-05, "loss": 0.3586, "step": 3301 }, { "epoch": 3.520255863539446, "grad_norm": 0.11963014257012117, "learning_rate": 1.9512148413755653e-05, "loss": 0.3554, "step": 3302 }, { "epoch": 3.521321961620469, "grad_norm": 0.1426495501126215, "learning_rate": 1.9486584654897987e-05, "loss": 0.3544, "step": 3303 }, { "epoch": 3.5223880597014925, "grad_norm": 0.11378394285714087, "learning_rate": 1.9461032259406317e-05, "loss": 0.357, "step": 3304 }, { "epoch": 3.523454157782516, "grad_norm": 0.13151158677780098, "learning_rate": 1.9435491241435343e-05, "loss": 0.3504, "step": 3305 }, { "epoch": 3.5245202558635396, "grad_norm": 0.11047177913070025, "learning_rate": 1.9409961615133435e-05, "loss": 0.3449, "step": 3306 }, { "epoch": 3.525586353944563, "grad_norm": 0.14518194022909464, "learning_rate": 1.9384443394642697e-05, "loss": 0.3537, "step": 3307 }, { "epoch": 3.526652452025586, "grad_norm": 0.13288265424403722, "learning_rate": 1.9358936594098915e-05, "loss": 0.3472, "step": 3308 }, { "epoch": 3.5277185501066097, "grad_norm": 0.13481285136461793, "learning_rate": 1.9333441227631494e-05, "loss": 0.3522, "step": 3309 }, { "epoch": 3.5287846481876333, "grad_norm": 0.12955597370536356, "learning_rate": 1.9307957309363562e-05, "loss": 0.3591, "step": 3310 }, { "epoch": 3.529850746268657, "grad_norm": 0.13070491932515202, "learning_rate": 1.928248485341188e-05, "loss": 0.3523, "step": 3311 }, { "epoch": 3.5309168443496803, "grad_norm": 0.12150544834528254, "learning_rate": 1.9257023873886885e-05, "loss": 0.3548, "step": 3312 }, { "epoch": 3.5319829424307034, "grad_norm": 0.12389814641248816, "learning_rate": 1.9231574384892608e-05, "loss": 0.3491, "step": 3313 }, { "epoch": 3.533049040511727, "grad_norm": 0.11812191547579137, "learning_rate": 1.9206136400526753e-05, "loss": 0.3552, "step": 3314 }, { "epoch": 3.5341151385927505, "grad_norm": 0.11705609586266169, "learning_rate": 1.9180709934880657e-05, "loss": 0.3567, "step": 3315 }, { "epoch": 3.535181236673774, "grad_norm": 0.12705382222301062, "learning_rate": 1.915529500203923e-05, "loss": 0.3522, "step": 3316 }, { "epoch": 3.5362473347547976, "grad_norm": 0.1195951170844615, "learning_rate": 1.9129891616081045e-05, "loss": 0.3459, "step": 3317 }, { "epoch": 3.5373134328358207, "grad_norm": 0.11156152984847112, "learning_rate": 1.910449979107827e-05, "loss": 0.3524, "step": 3318 }, { "epoch": 3.538379530916844, "grad_norm": 0.12607688725487942, "learning_rate": 1.907911954109662e-05, "loss": 0.3505, "step": 3319 }, { "epoch": 3.5394456289978677, "grad_norm": 0.12385238816738517, "learning_rate": 1.9053750880195453e-05, "loss": 0.353, "step": 3320 }, { "epoch": 3.5405117270788913, "grad_norm": 0.11986136600785428, "learning_rate": 1.9028393822427707e-05, "loss": 0.3534, "step": 3321 }, { "epoch": 3.541577825159915, "grad_norm": 0.1306944687194926, "learning_rate": 1.900304838183984e-05, "loss": 0.3451, "step": 3322 }, { "epoch": 3.542643923240938, "grad_norm": 0.12521720177645257, "learning_rate": 1.8977714572471942e-05, "loss": 0.3463, "step": 3323 }, { "epoch": 3.543710021321962, "grad_norm": 0.14676298985824088, "learning_rate": 1.8952392408357596e-05, "loss": 0.348, "step": 3324 }, { "epoch": 3.544776119402985, "grad_norm": 0.1270324568921544, "learning_rate": 1.892708190352398e-05, "loss": 0.3469, "step": 3325 }, { "epoch": 3.5458422174840085, "grad_norm": 0.13384984086554236, "learning_rate": 1.890178307199181e-05, "loss": 0.3586, "step": 3326 }, { "epoch": 3.546908315565032, "grad_norm": 0.1403291759377033, "learning_rate": 1.88764959277753e-05, "loss": 0.3544, "step": 3327 }, { "epoch": 3.5479744136460556, "grad_norm": 0.1578563606515396, "learning_rate": 1.8851220484882223e-05, "loss": 0.3484, "step": 3328 }, { "epoch": 3.549040511727079, "grad_norm": 0.1332388726289066, "learning_rate": 1.8825956757313864e-05, "loss": 0.3546, "step": 3329 }, { "epoch": 3.550106609808102, "grad_norm": 0.14974892308226678, "learning_rate": 1.8800704759065027e-05, "loss": 0.3465, "step": 3330 }, { "epoch": 3.5511727078891258, "grad_norm": 0.13311408301746736, "learning_rate": 1.877546450412398e-05, "loss": 0.3425, "step": 3331 }, { "epoch": 3.5522388059701493, "grad_norm": 0.1486101241297523, "learning_rate": 1.8750236006472525e-05, "loss": 0.3448, "step": 3332 }, { "epoch": 3.553304904051173, "grad_norm": 0.126555084429222, "learning_rate": 1.872501928008594e-05, "loss": 0.3476, "step": 3333 }, { "epoch": 3.5543710021321964, "grad_norm": 0.1288483891538286, "learning_rate": 1.8699814338933e-05, "loss": 0.3517, "step": 3334 }, { "epoch": 3.5554371002132195, "grad_norm": 0.13278424016806864, "learning_rate": 1.8674621196975892e-05, "loss": 0.352, "step": 3335 }, { "epoch": 3.556503198294243, "grad_norm": 0.13632340052187167, "learning_rate": 1.864943986817033e-05, "loss": 0.3486, "step": 3336 }, { "epoch": 3.5575692963752665, "grad_norm": 0.12666253550466774, "learning_rate": 1.8624270366465476e-05, "loss": 0.351, "step": 3337 }, { "epoch": 3.55863539445629, "grad_norm": 0.13872865389172592, "learning_rate": 1.8599112705803894e-05, "loss": 0.3546, "step": 3338 }, { "epoch": 3.5597014925373136, "grad_norm": 0.11819669860101821, "learning_rate": 1.857396690012163e-05, "loss": 0.3496, "step": 3339 }, { "epoch": 3.5607675906183367, "grad_norm": 0.1324098142656928, "learning_rate": 1.8548832963348167e-05, "loss": 0.3512, "step": 3340 }, { "epoch": 3.5618336886993602, "grad_norm": 0.1213227275635731, "learning_rate": 1.8523710909406408e-05, "loss": 0.3529, "step": 3341 }, { "epoch": 3.5628997867803838, "grad_norm": 0.13139162851354952, "learning_rate": 1.8498600752212633e-05, "loss": 0.3463, "step": 3342 }, { "epoch": 3.5639658848614073, "grad_norm": 0.12022593351799091, "learning_rate": 1.847350250567658e-05, "loss": 0.349, "step": 3343 }, { "epoch": 3.565031982942431, "grad_norm": 0.14848931387258837, "learning_rate": 1.8448416183701387e-05, "loss": 0.3523, "step": 3344 }, { "epoch": 3.566098081023454, "grad_norm": 0.13293877643754679, "learning_rate": 1.8423341800183547e-05, "loss": 0.3548, "step": 3345 }, { "epoch": 3.5671641791044775, "grad_norm": 0.13819112943650344, "learning_rate": 1.8398279369012975e-05, "loss": 0.3586, "step": 3346 }, { "epoch": 3.568230277185501, "grad_norm": 0.15221572285365978, "learning_rate": 1.8373228904072958e-05, "loss": 0.3487, "step": 3347 }, { "epoch": 3.5692963752665245, "grad_norm": 0.12335264810660222, "learning_rate": 1.8348190419240168e-05, "loss": 0.3576, "step": 3348 }, { "epoch": 3.570362473347548, "grad_norm": 0.14402755325400285, "learning_rate": 1.8323163928384597e-05, "loss": 0.3557, "step": 3349 }, { "epoch": 3.571428571428571, "grad_norm": 0.121086447817541, "learning_rate": 1.829814944536963e-05, "loss": 0.3545, "step": 3350 }, { "epoch": 3.572494669509595, "grad_norm": 0.1456232992893829, "learning_rate": 1.8273146984051987e-05, "loss": 0.3508, "step": 3351 }, { "epoch": 3.5735607675906182, "grad_norm": 0.13621927041516765, "learning_rate": 1.8248156558281756e-05, "loss": 0.3479, "step": 3352 }, { "epoch": 3.574626865671642, "grad_norm": 0.12188450870001802, "learning_rate": 1.8223178181902296e-05, "loss": 0.3514, "step": 3353 }, { "epoch": 3.5756929637526653, "grad_norm": 0.13329743005861297, "learning_rate": 1.8198211868750352e-05, "loss": 0.3495, "step": 3354 }, { "epoch": 3.576759061833689, "grad_norm": 0.118410852419534, "learning_rate": 1.8173257632655973e-05, "loss": 0.3482, "step": 3355 }, { "epoch": 3.5778251599147124, "grad_norm": 0.12392097571524742, "learning_rate": 1.8148315487442482e-05, "loss": 0.3509, "step": 3356 }, { "epoch": 3.5788912579957355, "grad_norm": 0.149797120649777, "learning_rate": 1.8123385446926546e-05, "loss": 0.351, "step": 3357 }, { "epoch": 3.579957356076759, "grad_norm": 0.11637490638409546, "learning_rate": 1.8098467524918114e-05, "loss": 0.348, "step": 3358 }, { "epoch": 3.5810234541577826, "grad_norm": 0.1302506928123163, "learning_rate": 1.807356173522043e-05, "loss": 0.3473, "step": 3359 }, { "epoch": 3.582089552238806, "grad_norm": 0.11734314593339212, "learning_rate": 1.8048668091629976e-05, "loss": 0.3556, "step": 3360 }, { "epoch": 3.5831556503198296, "grad_norm": 0.13699195492761537, "learning_rate": 1.802378660793656e-05, "loss": 0.3509, "step": 3361 }, { "epoch": 3.5842217484008527, "grad_norm": 0.1192014977364567, "learning_rate": 1.7998917297923224e-05, "loss": 0.3471, "step": 3362 }, { "epoch": 3.5852878464818763, "grad_norm": 0.1296315171549434, "learning_rate": 1.7974060175366288e-05, "loss": 0.3465, "step": 3363 }, { "epoch": 3.5863539445629, "grad_norm": 0.13858112246619184, "learning_rate": 1.794921525403528e-05, "loss": 0.3497, "step": 3364 }, { "epoch": 3.5874200426439233, "grad_norm": 0.1119373071168273, "learning_rate": 1.7924382547693006e-05, "loss": 0.3545, "step": 3365 }, { "epoch": 3.588486140724947, "grad_norm": 0.13479994066024156, "learning_rate": 1.7899562070095517e-05, "loss": 0.3459, "step": 3366 }, { "epoch": 3.58955223880597, "grad_norm": 0.11961904622503035, "learning_rate": 1.7874753834992023e-05, "loss": 0.3486, "step": 3367 }, { "epoch": 3.5906183368869935, "grad_norm": 0.1295577113450289, "learning_rate": 1.7849957856125032e-05, "loss": 0.3437, "step": 3368 }, { "epoch": 3.591684434968017, "grad_norm": 0.13384088967192623, "learning_rate": 1.7825174147230212e-05, "loss": 0.3508, "step": 3369 }, { "epoch": 3.5927505330490406, "grad_norm": 0.14860888147610143, "learning_rate": 1.780040272203647e-05, "loss": 0.3456, "step": 3370 }, { "epoch": 3.593816631130064, "grad_norm": 0.12817477422964021, "learning_rate": 1.7775643594265858e-05, "loss": 0.3502, "step": 3371 }, { "epoch": 3.594882729211087, "grad_norm": 0.15577728782871172, "learning_rate": 1.775089677763366e-05, "loss": 0.3537, "step": 3372 }, { "epoch": 3.5959488272921107, "grad_norm": 0.12786761103356425, "learning_rate": 1.7726162285848343e-05, "loss": 0.3447, "step": 3373 }, { "epoch": 3.5970149253731343, "grad_norm": 0.14028709617612561, "learning_rate": 1.77014401326115e-05, "loss": 0.3463, "step": 3374 }, { "epoch": 3.598081023454158, "grad_norm": 0.12954870341900906, "learning_rate": 1.7676730331617934e-05, "loss": 0.3519, "step": 3375 }, { "epoch": 3.5991471215351813, "grad_norm": 0.12691468222674968, "learning_rate": 1.765203289655559e-05, "loss": 0.3491, "step": 3376 }, { "epoch": 3.6002132196162044, "grad_norm": 0.15452666879821078, "learning_rate": 1.7627347841105575e-05, "loss": 0.3469, "step": 3377 }, { "epoch": 3.6012793176972284, "grad_norm": 0.12502978476826676, "learning_rate": 1.7602675178942102e-05, "loss": 0.3441, "step": 3378 }, { "epoch": 3.6023454157782515, "grad_norm": 0.13006787492356886, "learning_rate": 1.7578014923732558e-05, "loss": 0.3501, "step": 3379 }, { "epoch": 3.603411513859275, "grad_norm": 0.10724477831469756, "learning_rate": 1.7553367089137438e-05, "loss": 0.3501, "step": 3380 }, { "epoch": 3.6044776119402986, "grad_norm": 0.13170260076841805, "learning_rate": 1.7528731688810383e-05, "loss": 0.3505, "step": 3381 }, { "epoch": 3.605543710021322, "grad_norm": 0.11121721943362509, "learning_rate": 1.7504108736398087e-05, "loss": 0.3523, "step": 3382 }, { "epoch": 3.6066098081023457, "grad_norm": 0.14589447941044803, "learning_rate": 1.747949824554041e-05, "loss": 0.3546, "step": 3383 }, { "epoch": 3.6076759061833688, "grad_norm": 0.13319282485716943, "learning_rate": 1.745490022987029e-05, "loss": 0.3496, "step": 3384 }, { "epoch": 3.6087420042643923, "grad_norm": 0.12337788148881837, "learning_rate": 1.7430314703013727e-05, "loss": 0.3504, "step": 3385 }, { "epoch": 3.609808102345416, "grad_norm": 0.14541264966288076, "learning_rate": 1.7405741678589838e-05, "loss": 0.3507, "step": 3386 }, { "epoch": 3.6108742004264394, "grad_norm": 0.11959380106376388, "learning_rate": 1.7381181170210814e-05, "loss": 0.3511, "step": 3387 }, { "epoch": 3.611940298507463, "grad_norm": 0.1245152479458827, "learning_rate": 1.7356633191481877e-05, "loss": 0.3469, "step": 3388 }, { "epoch": 3.613006396588486, "grad_norm": 0.10647154581436648, "learning_rate": 1.7332097756001335e-05, "loss": 0.3512, "step": 3389 }, { "epoch": 3.6140724946695095, "grad_norm": 0.11205355107663482, "learning_rate": 1.730757487736057e-05, "loss": 0.3449, "step": 3390 }, { "epoch": 3.615138592750533, "grad_norm": 0.1071566104296763, "learning_rate": 1.7283064569143947e-05, "loss": 0.349, "step": 3391 }, { "epoch": 3.6162046908315566, "grad_norm": 0.10713350170160939, "learning_rate": 1.7258566844928915e-05, "loss": 0.3495, "step": 3392 }, { "epoch": 3.61727078891258, "grad_norm": 0.11331625037340512, "learning_rate": 1.7234081718285965e-05, "loss": 0.351, "step": 3393 }, { "epoch": 3.6183368869936032, "grad_norm": 0.115641631530902, "learning_rate": 1.7209609202778542e-05, "loss": 0.3487, "step": 3394 }, { "epoch": 3.6194029850746268, "grad_norm": 0.12437375198658421, "learning_rate": 1.7185149311963186e-05, "loss": 0.3513, "step": 3395 }, { "epoch": 3.6204690831556503, "grad_norm": 0.11954602193026391, "learning_rate": 1.716070205938938e-05, "loss": 0.3523, "step": 3396 }, { "epoch": 3.621535181236674, "grad_norm": 0.1310194394078668, "learning_rate": 1.7136267458599633e-05, "loss": 0.3495, "step": 3397 }, { "epoch": 3.6226012793176974, "grad_norm": 0.11894966733116948, "learning_rate": 1.711184552312945e-05, "loss": 0.3491, "step": 3398 }, { "epoch": 3.6236673773987205, "grad_norm": 0.1260933584074009, "learning_rate": 1.7087436266507333e-05, "loss": 0.3516, "step": 3399 }, { "epoch": 3.624733475479744, "grad_norm": 0.11495853607822465, "learning_rate": 1.706303970225471e-05, "loss": 0.3507, "step": 3400 }, { "epoch": 3.6257995735607675, "grad_norm": 0.1580722155106881, "learning_rate": 1.703865584388602e-05, "loss": 0.349, "step": 3401 }, { "epoch": 3.626865671641791, "grad_norm": 0.1166417464300119, "learning_rate": 1.7014284704908673e-05, "loss": 0.3504, "step": 3402 }, { "epoch": 3.6279317697228146, "grad_norm": 0.13094944084926075, "learning_rate": 1.6989926298822977e-05, "loss": 0.3517, "step": 3403 }, { "epoch": 3.6289978678038377, "grad_norm": 0.1091018787371315, "learning_rate": 1.6965580639122247e-05, "loss": 0.347, "step": 3404 }, { "epoch": 3.6300639658848612, "grad_norm": 0.13573951697931824, "learning_rate": 1.694124773929271e-05, "loss": 0.3506, "step": 3405 }, { "epoch": 3.631130063965885, "grad_norm": 0.11997387496896626, "learning_rate": 1.691692761281354e-05, "loss": 0.3505, "step": 3406 }, { "epoch": 3.6321961620469083, "grad_norm": 0.1581248499900699, "learning_rate": 1.6892620273156795e-05, "loss": 0.351, "step": 3407 }, { "epoch": 3.633262260127932, "grad_norm": 0.1253330836795441, "learning_rate": 1.686832573378749e-05, "loss": 0.3527, "step": 3408 }, { "epoch": 3.6343283582089554, "grad_norm": 0.12308261327495958, "learning_rate": 1.684404400816354e-05, "loss": 0.3485, "step": 3409 }, { "epoch": 3.635394456289979, "grad_norm": 0.1094772181271851, "learning_rate": 1.6819775109735777e-05, "loss": 0.3526, "step": 3410 }, { "epoch": 3.636460554371002, "grad_norm": 0.14783525680446527, "learning_rate": 1.6795519051947877e-05, "loss": 0.3493, "step": 3411 }, { "epoch": 3.6375266524520256, "grad_norm": 0.11897120110784674, "learning_rate": 1.6771275848236447e-05, "loss": 0.3482, "step": 3412 }, { "epoch": 3.638592750533049, "grad_norm": 0.1346934227512216, "learning_rate": 1.674704551203098e-05, "loss": 0.3498, "step": 3413 }, { "epoch": 3.6396588486140726, "grad_norm": 0.11012404823074266, "learning_rate": 1.6722828056753794e-05, "loss": 0.35, "step": 3414 }, { "epoch": 3.640724946695096, "grad_norm": 0.13021214801697548, "learning_rate": 1.6698623495820117e-05, "loss": 0.3483, "step": 3415 }, { "epoch": 3.6417910447761193, "grad_norm": 0.10956855386418592, "learning_rate": 1.6674431842638014e-05, "loss": 0.3492, "step": 3416 }, { "epoch": 3.642857142857143, "grad_norm": 0.11548747621938685, "learning_rate": 1.6650253110608415e-05, "loss": 0.345, "step": 3417 }, { "epoch": 3.6439232409381663, "grad_norm": 0.10744538789719912, "learning_rate": 1.662608731312506e-05, "loss": 0.3513, "step": 3418 }, { "epoch": 3.64498933901919, "grad_norm": 0.12523119814259084, "learning_rate": 1.6601934463574553e-05, "loss": 0.349, "step": 3419 }, { "epoch": 3.6460554371002134, "grad_norm": 0.1183786159156495, "learning_rate": 1.657779457533632e-05, "loss": 0.3497, "step": 3420 }, { "epoch": 3.6471215351812365, "grad_norm": 0.1258499101414295, "learning_rate": 1.6553667661782624e-05, "loss": 0.3471, "step": 3421 }, { "epoch": 3.64818763326226, "grad_norm": 0.11610530838109377, "learning_rate": 1.652955373627848e-05, "loss": 0.3533, "step": 3422 }, { "epoch": 3.6492537313432836, "grad_norm": 0.12286306836120377, "learning_rate": 1.6505452812181775e-05, "loss": 0.3483, "step": 3423 }, { "epoch": 3.650319829424307, "grad_norm": 0.09734440175920124, "learning_rate": 1.648136490284318e-05, "loss": 0.344, "step": 3424 }, { "epoch": 3.6513859275053306, "grad_norm": 0.13317697498320796, "learning_rate": 1.645729002160611e-05, "loss": 0.3485, "step": 3425 }, { "epoch": 3.6524520255863537, "grad_norm": 0.11968047549008518, "learning_rate": 1.643322818180682e-05, "loss": 0.3504, "step": 3426 }, { "epoch": 3.6535181236673773, "grad_norm": 0.12970016078696814, "learning_rate": 1.6409179396774317e-05, "loss": 0.3494, "step": 3427 }, { "epoch": 3.654584221748401, "grad_norm": 0.10979013596405256, "learning_rate": 1.638514367983039e-05, "loss": 0.3475, "step": 3428 }, { "epoch": 3.6556503198294243, "grad_norm": 0.11958509569515018, "learning_rate": 1.6361121044289553e-05, "loss": 0.3424, "step": 3429 }, { "epoch": 3.656716417910448, "grad_norm": 0.1210201578994878, "learning_rate": 1.6337111503459104e-05, "loss": 0.3443, "step": 3430 }, { "epoch": 3.657782515991471, "grad_norm": 0.11828921119105222, "learning_rate": 1.63131150706391e-05, "loss": 0.3442, "step": 3431 }, { "epoch": 3.6588486140724945, "grad_norm": 0.12968866909782567, "learning_rate": 1.6289131759122292e-05, "loss": 0.3488, "step": 3432 }, { "epoch": 3.659914712153518, "grad_norm": 0.11513145282740704, "learning_rate": 1.62651615821942e-05, "loss": 0.3515, "step": 3433 }, { "epoch": 3.6609808102345416, "grad_norm": 0.10887008435723226, "learning_rate": 1.6241204553133054e-05, "loss": 0.3422, "step": 3434 }, { "epoch": 3.662046908315565, "grad_norm": 0.10848962774276776, "learning_rate": 1.6217260685209815e-05, "loss": 0.3508, "step": 3435 }, { "epoch": 3.663113006396588, "grad_norm": 0.10984253318601138, "learning_rate": 1.619332999168812e-05, "loss": 0.3473, "step": 3436 }, { "epoch": 3.664179104477612, "grad_norm": 0.10354031929218735, "learning_rate": 1.6169412485824343e-05, "loss": 0.3511, "step": 3437 }, { "epoch": 3.6652452025586353, "grad_norm": 0.11266732298785025, "learning_rate": 1.6145508180867537e-05, "loss": 0.3438, "step": 3438 }, { "epoch": 3.666311300639659, "grad_norm": 0.11007091054264355, "learning_rate": 1.6121617090059455e-05, "loss": 0.3519, "step": 3439 }, { "epoch": 3.6673773987206824, "grad_norm": 0.11010507294324125, "learning_rate": 1.6097739226634494e-05, "loss": 0.3503, "step": 3440 }, { "epoch": 3.668443496801706, "grad_norm": 0.1107608574897804, "learning_rate": 1.6073874603819767e-05, "loss": 0.3444, "step": 3441 }, { "epoch": 3.6695095948827294, "grad_norm": 0.09996660829470029, "learning_rate": 1.605002323483505e-05, "loss": 0.3508, "step": 3442 }, { "epoch": 3.6705756929637525, "grad_norm": 0.10541897341054957, "learning_rate": 1.6026185132892722e-05, "loss": 0.3473, "step": 3443 }, { "epoch": 3.671641791044776, "grad_norm": 0.09364677982992888, "learning_rate": 1.6002360311197874e-05, "loss": 0.345, "step": 3444 }, { "epoch": 3.6727078891257996, "grad_norm": 0.11633515138576979, "learning_rate": 1.5978548782948228e-05, "loss": 0.3537, "step": 3445 }, { "epoch": 3.673773987206823, "grad_norm": 0.10455339919916513, "learning_rate": 1.595475056133413e-05, "loss": 0.3441, "step": 3446 }, { "epoch": 3.6748400852878467, "grad_norm": 0.10817600283132285, "learning_rate": 1.5930965659538547e-05, "loss": 0.3513, "step": 3447 }, { "epoch": 3.6759061833688698, "grad_norm": 0.11724298481759776, "learning_rate": 1.590719409073708e-05, "loss": 0.3471, "step": 3448 }, { "epoch": 3.6769722814498933, "grad_norm": 0.11034160049080355, "learning_rate": 1.5883435868097942e-05, "loss": 0.3494, "step": 3449 }, { "epoch": 3.678038379530917, "grad_norm": 0.10935682929664699, "learning_rate": 1.5859691004781977e-05, "loss": 0.3531, "step": 3450 }, { "epoch": 3.6791044776119404, "grad_norm": 0.10669839160222076, "learning_rate": 1.5835959513942577e-05, "loss": 0.3464, "step": 3451 }, { "epoch": 3.680170575692964, "grad_norm": 0.10499647433795345, "learning_rate": 1.5812241408725757e-05, "loss": 0.3484, "step": 3452 }, { "epoch": 3.681236673773987, "grad_norm": 0.10722258869475547, "learning_rate": 1.5788536702270136e-05, "loss": 0.3498, "step": 3453 }, { "epoch": 3.6823027718550105, "grad_norm": 0.1120874379953065, "learning_rate": 1.5764845407706863e-05, "loss": 0.3561, "step": 3454 }, { "epoch": 3.683368869936034, "grad_norm": 0.11848268184587527, "learning_rate": 1.5741167538159697e-05, "loss": 0.3464, "step": 3455 }, { "epoch": 3.6844349680170576, "grad_norm": 0.12047369839392386, "learning_rate": 1.5717503106744957e-05, "loss": 0.3572, "step": 3456 }, { "epoch": 3.685501066098081, "grad_norm": 0.12166580342753797, "learning_rate": 1.569385212657149e-05, "loss": 0.3489, "step": 3457 }, { "epoch": 3.6865671641791042, "grad_norm": 0.12106373222633703, "learning_rate": 1.567021461074071e-05, "loss": 0.3516, "step": 3458 }, { "epoch": 3.6876332622601278, "grad_norm": 0.11484973241212469, "learning_rate": 1.5646590572346596e-05, "loss": 0.3524, "step": 3459 }, { "epoch": 3.6886993603411513, "grad_norm": 0.10937051777873967, "learning_rate": 1.5622980024475608e-05, "loss": 0.3439, "step": 3460 }, { "epoch": 3.689765458422175, "grad_norm": 0.12137658902823265, "learning_rate": 1.5599382980206773e-05, "loss": 0.3516, "step": 3461 }, { "epoch": 3.6908315565031984, "grad_norm": 0.12974179145102452, "learning_rate": 1.5575799452611647e-05, "loss": 0.3545, "step": 3462 }, { "epoch": 3.6918976545842215, "grad_norm": 0.13163480419478382, "learning_rate": 1.5552229454754245e-05, "loss": 0.3462, "step": 3463 }, { "epoch": 3.6929637526652455, "grad_norm": 0.09995629474690315, "learning_rate": 1.5528672999691137e-05, "loss": 0.3463, "step": 3464 }, { "epoch": 3.6940298507462686, "grad_norm": 0.12284728073753405, "learning_rate": 1.550513010047139e-05, "loss": 0.3478, "step": 3465 }, { "epoch": 3.695095948827292, "grad_norm": 0.11912635744633349, "learning_rate": 1.5481600770136512e-05, "loss": 0.3469, "step": 3466 }, { "epoch": 3.6961620469083156, "grad_norm": 0.1046940554664763, "learning_rate": 1.5458085021720557e-05, "loss": 0.349, "step": 3467 }, { "epoch": 3.697228144989339, "grad_norm": 0.10784839095846516, "learning_rate": 1.543458286825003e-05, "loss": 0.3533, "step": 3468 }, { "epoch": 3.6982942430703627, "grad_norm": 0.1195343323249887, "learning_rate": 1.5411094322743875e-05, "loss": 0.3508, "step": 3469 }, { "epoch": 3.699360341151386, "grad_norm": 0.10962783595704657, "learning_rate": 1.5387619398213543e-05, "loss": 0.3471, "step": 3470 }, { "epoch": 3.7004264392324093, "grad_norm": 0.1150816905348978, "learning_rate": 1.5364158107662935e-05, "loss": 0.3518, "step": 3471 }, { "epoch": 3.701492537313433, "grad_norm": 0.13829883854757222, "learning_rate": 1.534071046408836e-05, "loss": 0.3521, "step": 3472 }, { "epoch": 3.7025586353944564, "grad_norm": 0.12445105825556152, "learning_rate": 1.531727648047861e-05, "loss": 0.3554, "step": 3473 }, { "epoch": 3.70362473347548, "grad_norm": 0.1157531505696079, "learning_rate": 1.5293856169814885e-05, "loss": 0.3489, "step": 3474 }, { "epoch": 3.704690831556503, "grad_norm": 0.1330840415067106, "learning_rate": 1.527044954507084e-05, "loss": 0.35, "step": 3475 }, { "epoch": 3.7057569296375266, "grad_norm": 0.10784494484991011, "learning_rate": 1.5247056619212507e-05, "loss": 0.3488, "step": 3476 }, { "epoch": 3.70682302771855, "grad_norm": 0.12127485364434097, "learning_rate": 1.5223677405198354e-05, "loss": 0.3499, "step": 3477 }, { "epoch": 3.7078891257995736, "grad_norm": 0.11013802869572574, "learning_rate": 1.5200311915979255e-05, "loss": 0.3421, "step": 3478 }, { "epoch": 3.708955223880597, "grad_norm": 0.12178309778215979, "learning_rate": 1.517696016449849e-05, "loss": 0.3527, "step": 3479 }, { "epoch": 3.7100213219616203, "grad_norm": 0.11504029087431382, "learning_rate": 1.515362216369169e-05, "loss": 0.3483, "step": 3480 }, { "epoch": 3.711087420042644, "grad_norm": 0.12440538915523314, "learning_rate": 1.5130297926486908e-05, "loss": 0.3481, "step": 3481 }, { "epoch": 3.7121535181236673, "grad_norm": 0.1157756581499601, "learning_rate": 1.5106987465804572e-05, "loss": 0.3472, "step": 3482 }, { "epoch": 3.713219616204691, "grad_norm": 0.13884491519651773, "learning_rate": 1.5083690794557435e-05, "loss": 0.3491, "step": 3483 }, { "epoch": 3.7142857142857144, "grad_norm": 0.11160024061874728, "learning_rate": 1.5060407925650662e-05, "loss": 0.3442, "step": 3484 }, { "epoch": 3.7153518123667375, "grad_norm": 0.1388304685706655, "learning_rate": 1.503713887198175e-05, "loss": 0.3524, "step": 3485 }, { "epoch": 3.716417910447761, "grad_norm": 0.10291882827022071, "learning_rate": 1.5013883646440555e-05, "loss": 0.3469, "step": 3486 }, { "epoch": 3.7174840085287846, "grad_norm": 0.11581595497958068, "learning_rate": 1.499064226190924e-05, "loss": 0.3478, "step": 3487 }, { "epoch": 3.718550106609808, "grad_norm": 0.12039433257965831, "learning_rate": 1.4967414731262339e-05, "loss": 0.3476, "step": 3488 }, { "epoch": 3.7196162046908317, "grad_norm": 0.11994551705543056, "learning_rate": 1.494420106736671e-05, "loss": 0.349, "step": 3489 }, { "epoch": 3.7206823027718547, "grad_norm": 0.10943678193661488, "learning_rate": 1.4921001283081488e-05, "loss": 0.3443, "step": 3490 }, { "epoch": 3.7217484008528787, "grad_norm": 0.12365465123838881, "learning_rate": 1.489781539125816e-05, "loss": 0.3428, "step": 3491 }, { "epoch": 3.722814498933902, "grad_norm": 0.12996918119430528, "learning_rate": 1.4874643404740505e-05, "loss": 0.3492, "step": 3492 }, { "epoch": 3.7238805970149254, "grad_norm": 0.12320823229427783, "learning_rate": 1.4851485336364616e-05, "loss": 0.3555, "step": 3493 }, { "epoch": 3.724946695095949, "grad_norm": 0.11983024758002496, "learning_rate": 1.4828341198958827e-05, "loss": 0.3529, "step": 3494 }, { "epoch": 3.7260127931769724, "grad_norm": 0.11090723612747737, "learning_rate": 1.4805211005343804e-05, "loss": 0.3497, "step": 3495 }, { "epoch": 3.727078891257996, "grad_norm": 0.11889325003301451, "learning_rate": 1.4782094768332477e-05, "loss": 0.3475, "step": 3496 }, { "epoch": 3.728144989339019, "grad_norm": 0.11039401592347903, "learning_rate": 1.4758992500730047e-05, "loss": 0.3506, "step": 3497 }, { "epoch": 3.7292110874200426, "grad_norm": 0.11723348316498812, "learning_rate": 1.4735904215333942e-05, "loss": 0.3491, "step": 3498 }, { "epoch": 3.730277185501066, "grad_norm": 0.1096667948027839, "learning_rate": 1.4712829924933888e-05, "loss": 0.35, "step": 3499 }, { "epoch": 3.7313432835820897, "grad_norm": 0.12827667700436324, "learning_rate": 1.4689769642311862e-05, "loss": 0.3488, "step": 3500 }, { "epoch": 3.732409381663113, "grad_norm": 0.11597291937955526, "learning_rate": 1.466672338024202e-05, "loss": 0.3529, "step": 3501 }, { "epoch": 3.7334754797441363, "grad_norm": 0.12098185152644861, "learning_rate": 1.4643691151490825e-05, "loss": 0.3543, "step": 3502 }, { "epoch": 3.73454157782516, "grad_norm": 0.1264853588754496, "learning_rate": 1.462067296881692e-05, "loss": 0.3494, "step": 3503 }, { "epoch": 3.7356076759061834, "grad_norm": 0.11554640924569769, "learning_rate": 1.4597668844971203e-05, "loss": 0.3477, "step": 3504 }, { "epoch": 3.736673773987207, "grad_norm": 0.1263291190867985, "learning_rate": 1.4574678792696735e-05, "loss": 0.3464, "step": 3505 }, { "epoch": 3.7377398720682304, "grad_norm": 0.1140310154067713, "learning_rate": 1.4551702824728824e-05, "loss": 0.3534, "step": 3506 }, { "epoch": 3.7388059701492535, "grad_norm": 0.1374330707269001, "learning_rate": 1.452874095379496e-05, "loss": 0.3525, "step": 3507 }, { "epoch": 3.739872068230277, "grad_norm": 0.10373341206088472, "learning_rate": 1.4505793192614838e-05, "loss": 0.3487, "step": 3508 }, { "epoch": 3.7409381663113006, "grad_norm": 0.12296501711153075, "learning_rate": 1.4482859553900302e-05, "loss": 0.3523, "step": 3509 }, { "epoch": 3.742004264392324, "grad_norm": 0.09959644973450633, "learning_rate": 1.4459940050355412e-05, "loss": 0.3459, "step": 3510 }, { "epoch": 3.7430703624733477, "grad_norm": 0.12381271538841246, "learning_rate": 1.4437034694676388e-05, "loss": 0.3442, "step": 3511 }, { "epoch": 3.7441364605543708, "grad_norm": 0.11420172023496247, "learning_rate": 1.4414143499551583e-05, "loss": 0.3435, "step": 3512 }, { "epoch": 3.7452025586353943, "grad_norm": 0.12761045293800952, "learning_rate": 1.4391266477661545e-05, "loss": 0.3502, "step": 3513 }, { "epoch": 3.746268656716418, "grad_norm": 0.12938266287837766, "learning_rate": 1.4368403641678951e-05, "loss": 0.353, "step": 3514 }, { "epoch": 3.7473347547974414, "grad_norm": 0.12147529927692077, "learning_rate": 1.434555500426864e-05, "loss": 0.3529, "step": 3515 }, { "epoch": 3.748400852878465, "grad_norm": 0.12991621986150606, "learning_rate": 1.4322720578087546e-05, "loss": 0.3508, "step": 3516 }, { "epoch": 3.749466950959488, "grad_norm": 0.11210702326795871, "learning_rate": 1.4299900375784761e-05, "loss": 0.3491, "step": 3517 }, { "epoch": 3.750533049040512, "grad_norm": 0.12221491366601378, "learning_rate": 1.4277094410001508e-05, "loss": 0.3486, "step": 3518 }, { "epoch": 3.751599147121535, "grad_norm": 0.11758736893333697, "learning_rate": 1.4254302693371083e-05, "loss": 0.3525, "step": 3519 }, { "epoch": 3.7526652452025586, "grad_norm": 0.11720998722860178, "learning_rate": 1.4231525238518917e-05, "loss": 0.3526, "step": 3520 }, { "epoch": 3.753731343283582, "grad_norm": 0.11702149986171113, "learning_rate": 1.4208762058062546e-05, "loss": 0.3567, "step": 3521 }, { "epoch": 3.7547974413646057, "grad_norm": 0.11781947922119294, "learning_rate": 1.4186013164611593e-05, "loss": 0.3476, "step": 3522 }, { "epoch": 3.7558635394456292, "grad_norm": 0.11028285471866768, "learning_rate": 1.4163278570767744e-05, "loss": 0.3472, "step": 3523 }, { "epoch": 3.7569296375266523, "grad_norm": 0.11480800401974497, "learning_rate": 1.4140558289124795e-05, "loss": 0.3498, "step": 3524 }, { "epoch": 3.757995735607676, "grad_norm": 0.10511476764554756, "learning_rate": 1.411785233226861e-05, "loss": 0.346, "step": 3525 }, { "epoch": 3.7590618336886994, "grad_norm": 0.11246301899459499, "learning_rate": 1.4095160712777087e-05, "loss": 0.3463, "step": 3526 }, { "epoch": 3.760127931769723, "grad_norm": 0.11075751087710273, "learning_rate": 1.4072483443220213e-05, "loss": 0.351, "step": 3527 }, { "epoch": 3.7611940298507465, "grad_norm": 0.11457686956190802, "learning_rate": 1.4049820536160033e-05, "loss": 0.3507, "step": 3528 }, { "epoch": 3.7622601279317696, "grad_norm": 0.13257820543611734, "learning_rate": 1.4027172004150594e-05, "loss": 0.3483, "step": 3529 }, { "epoch": 3.763326226012793, "grad_norm": 0.09934268832616149, "learning_rate": 1.400453785973801e-05, "loss": 0.3471, "step": 3530 }, { "epoch": 3.7643923240938166, "grad_norm": 0.1217605278748467, "learning_rate": 1.3981918115460448e-05, "loss": 0.3452, "step": 3531 }, { "epoch": 3.76545842217484, "grad_norm": 0.1025857492577431, "learning_rate": 1.3959312783848033e-05, "loss": 0.3496, "step": 3532 }, { "epoch": 3.7665245202558637, "grad_norm": 0.12375183362556455, "learning_rate": 1.3936721877422965e-05, "loss": 0.3532, "step": 3533 }, { "epoch": 3.767590618336887, "grad_norm": 0.10188334610518712, "learning_rate": 1.3914145408699437e-05, "loss": 0.3436, "step": 3534 }, { "epoch": 3.7686567164179103, "grad_norm": 0.12657161432596883, "learning_rate": 1.3891583390183621e-05, "loss": 0.3517, "step": 3535 }, { "epoch": 3.769722814498934, "grad_norm": 0.11261285266481309, "learning_rate": 1.3869035834373712e-05, "loss": 0.3533, "step": 3536 }, { "epoch": 3.7707889125799574, "grad_norm": 0.11006622035677963, "learning_rate": 1.3846502753759899e-05, "loss": 0.3475, "step": 3537 }, { "epoch": 3.771855010660981, "grad_norm": 0.10803610024542715, "learning_rate": 1.3823984160824306e-05, "loss": 0.3521, "step": 3538 }, { "epoch": 3.772921108742004, "grad_norm": 0.10308686673663463, "learning_rate": 1.3801480068041083e-05, "loss": 0.3486, "step": 3539 }, { "epoch": 3.7739872068230276, "grad_norm": 0.11418577872966433, "learning_rate": 1.3778990487876338e-05, "loss": 0.3527, "step": 3540 }, { "epoch": 3.775053304904051, "grad_norm": 0.115083278681719, "learning_rate": 1.3756515432788105e-05, "loss": 0.3547, "step": 3541 }, { "epoch": 3.7761194029850746, "grad_norm": 0.105339094797037, "learning_rate": 1.3734054915226405e-05, "loss": 0.3509, "step": 3542 }, { "epoch": 3.777185501066098, "grad_norm": 0.11586176556566104, "learning_rate": 1.3711608947633202e-05, "loss": 0.3508, "step": 3543 }, { "epoch": 3.7782515991471213, "grad_norm": 0.1066837228197777, "learning_rate": 1.3689177542442406e-05, "loss": 0.3477, "step": 3544 }, { "epoch": 3.7793176972281453, "grad_norm": 0.11175162407532467, "learning_rate": 1.3666760712079828e-05, "loss": 0.3486, "step": 3545 }, { "epoch": 3.7803837953091683, "grad_norm": 0.10538041603411623, "learning_rate": 1.3644358468963233e-05, "loss": 0.3477, "step": 3546 }, { "epoch": 3.781449893390192, "grad_norm": 0.12393679042543049, "learning_rate": 1.3621970825502317e-05, "loss": 0.3535, "step": 3547 }, { "epoch": 3.7825159914712154, "grad_norm": 0.10491158418048802, "learning_rate": 1.3599597794098648e-05, "loss": 0.3562, "step": 3548 }, { "epoch": 3.783582089552239, "grad_norm": 0.12508143946693426, "learning_rate": 1.3577239387145729e-05, "loss": 0.3513, "step": 3549 }, { "epoch": 3.7846481876332625, "grad_norm": 0.11761432608648041, "learning_rate": 1.3554895617028958e-05, "loss": 0.3502, "step": 3550 }, { "epoch": 3.7857142857142856, "grad_norm": 0.12468188783060895, "learning_rate": 1.3532566496125634e-05, "loss": 0.349, "step": 3551 }, { "epoch": 3.786780383795309, "grad_norm": 0.11776026667363994, "learning_rate": 1.3510252036804907e-05, "loss": 0.3496, "step": 3552 }, { "epoch": 3.7878464818763327, "grad_norm": 0.10743422269996004, "learning_rate": 1.348795225142784e-05, "loss": 0.3512, "step": 3553 }, { "epoch": 3.788912579957356, "grad_norm": 0.10720339132099257, "learning_rate": 1.3465667152347353e-05, "loss": 0.346, "step": 3554 }, { "epoch": 3.7899786780383797, "grad_norm": 0.10732063941636186, "learning_rate": 1.3443396751908243e-05, "loss": 0.3459, "step": 3555 }, { "epoch": 3.791044776119403, "grad_norm": 0.1113815053455615, "learning_rate": 1.3421141062447136e-05, "loss": 0.3491, "step": 3556 }, { "epoch": 3.7921108742004264, "grad_norm": 0.11139797758194125, "learning_rate": 1.3398900096292535e-05, "loss": 0.3513, "step": 3557 }, { "epoch": 3.79317697228145, "grad_norm": 0.11752612657415666, "learning_rate": 1.3376673865764796e-05, "loss": 0.354, "step": 3558 }, { "epoch": 3.7942430703624734, "grad_norm": 0.11709263878140849, "learning_rate": 1.3354462383176064e-05, "loss": 0.3547, "step": 3559 }, { "epoch": 3.795309168443497, "grad_norm": 0.12982606895742937, "learning_rate": 1.3332265660830364e-05, "loss": 0.3467, "step": 3560 }, { "epoch": 3.79637526652452, "grad_norm": 0.11565736542146775, "learning_rate": 1.3310083711023527e-05, "loss": 0.3516, "step": 3561 }, { "epoch": 3.7974413646055436, "grad_norm": 0.09828691895649047, "learning_rate": 1.3287916546043209e-05, "loss": 0.3487, "step": 3562 }, { "epoch": 3.798507462686567, "grad_norm": 0.1155849088844352, "learning_rate": 1.3265764178168841e-05, "loss": 0.3512, "step": 3563 }, { "epoch": 3.7995735607675907, "grad_norm": 0.10134447160803774, "learning_rate": 1.3243626619671704e-05, "loss": 0.3541, "step": 3564 }, { "epoch": 3.800639658848614, "grad_norm": 0.10380565104455901, "learning_rate": 1.3221503882814846e-05, "loss": 0.3466, "step": 3565 }, { "epoch": 3.8017057569296373, "grad_norm": 0.10299695475323739, "learning_rate": 1.3199395979853132e-05, "loss": 0.3503, "step": 3566 }, { "epoch": 3.802771855010661, "grad_norm": 0.131615477948487, "learning_rate": 1.3177302923033164e-05, "loss": 0.3515, "step": 3567 }, { "epoch": 3.8038379530916844, "grad_norm": 0.09761956730459642, "learning_rate": 1.3155224724593364e-05, "loss": 0.3516, "step": 3568 }, { "epoch": 3.804904051172708, "grad_norm": 0.10306669274819454, "learning_rate": 1.3133161396763909e-05, "loss": 0.3456, "step": 3569 }, { "epoch": 3.8059701492537314, "grad_norm": 0.10145722545444867, "learning_rate": 1.311111295176672e-05, "loss": 0.3439, "step": 3570 }, { "epoch": 3.8070362473347545, "grad_norm": 0.10770148837143485, "learning_rate": 1.3089079401815497e-05, "loss": 0.3496, "step": 3571 }, { "epoch": 3.8081023454157785, "grad_norm": 0.11320765417229854, "learning_rate": 1.3067060759115684e-05, "loss": 0.3542, "step": 3572 }, { "epoch": 3.8091684434968016, "grad_norm": 0.1163579650557563, "learning_rate": 1.3045057035864477e-05, "loss": 0.3554, "step": 3573 }, { "epoch": 3.810234541577825, "grad_norm": 0.09905809774744204, "learning_rate": 1.302306824425077e-05, "loss": 0.3486, "step": 3574 }, { "epoch": 3.8113006396588487, "grad_norm": 0.12608559744995004, "learning_rate": 1.3001094396455223e-05, "loss": 0.3514, "step": 3575 }, { "epoch": 3.8123667377398722, "grad_norm": 0.09595835889524401, "learning_rate": 1.297913550465022e-05, "loss": 0.3462, "step": 3576 }, { "epoch": 3.8134328358208958, "grad_norm": 0.13266355171728472, "learning_rate": 1.2957191580999821e-05, "loss": 0.3542, "step": 3577 }, { "epoch": 3.814498933901919, "grad_norm": 0.09688667994875337, "learning_rate": 1.2935262637659824e-05, "loss": 0.3512, "step": 3578 }, { "epoch": 3.8155650319829424, "grad_norm": 0.13077081224692166, "learning_rate": 1.2913348686777734e-05, "loss": 0.3463, "step": 3579 }, { "epoch": 3.816631130063966, "grad_norm": 0.1000818706820891, "learning_rate": 1.2891449740492749e-05, "loss": 0.3499, "step": 3580 }, { "epoch": 3.8176972281449895, "grad_norm": 0.12244332857777668, "learning_rate": 1.2869565810935724e-05, "loss": 0.3483, "step": 3581 }, { "epoch": 3.818763326226013, "grad_norm": 0.10413603824918938, "learning_rate": 1.2847696910229228e-05, "loss": 0.3513, "step": 3582 }, { "epoch": 3.819829424307036, "grad_norm": 0.10738770551253633, "learning_rate": 1.2825843050487495e-05, "loss": 0.3474, "step": 3583 }, { "epoch": 3.8208955223880596, "grad_norm": 0.1181068563058338, "learning_rate": 1.2804004243816444e-05, "loss": 0.3541, "step": 3584 }, { "epoch": 3.821961620469083, "grad_norm": 0.10846504895224321, "learning_rate": 1.2782180502313609e-05, "loss": 0.3499, "step": 3585 }, { "epoch": 3.8230277185501067, "grad_norm": 0.10970803247483393, "learning_rate": 1.2760371838068228e-05, "loss": 0.3463, "step": 3586 }, { "epoch": 3.8240938166311302, "grad_norm": 0.11300737915406556, "learning_rate": 1.2738578263161175e-05, "loss": 0.3497, "step": 3587 }, { "epoch": 3.8251599147121533, "grad_norm": 0.10850041595269576, "learning_rate": 1.2716799789664931e-05, "loss": 0.3503, "step": 3588 }, { "epoch": 3.826226012793177, "grad_norm": 0.1242287822226434, "learning_rate": 1.2695036429643657e-05, "loss": 0.3464, "step": 3589 }, { "epoch": 3.8272921108742004, "grad_norm": 0.11234434571174157, "learning_rate": 1.2673288195153118e-05, "loss": 0.349, "step": 3590 }, { "epoch": 3.828358208955224, "grad_norm": 0.11607729586786202, "learning_rate": 1.2651555098240724e-05, "loss": 0.3502, "step": 3591 }, { "epoch": 3.8294243070362475, "grad_norm": 0.11570224496079395, "learning_rate": 1.2629837150945447e-05, "loss": 0.3499, "step": 3592 }, { "epoch": 3.8304904051172706, "grad_norm": 0.12136791006711244, "learning_rate": 1.2608134365297922e-05, "loss": 0.354, "step": 3593 }, { "epoch": 3.831556503198294, "grad_norm": 0.10914056771119587, "learning_rate": 1.2586446753320374e-05, "loss": 0.3538, "step": 3594 }, { "epoch": 3.8326226012793176, "grad_norm": 0.12699655682552763, "learning_rate": 1.256477432702659e-05, "loss": 0.3469, "step": 3595 }, { "epoch": 3.833688699360341, "grad_norm": 0.11727167103613097, "learning_rate": 1.2543117098421976e-05, "loss": 0.345, "step": 3596 }, { "epoch": 3.8347547974413647, "grad_norm": 0.1207939527624419, "learning_rate": 1.2521475079503524e-05, "loss": 0.3477, "step": 3597 }, { "epoch": 3.835820895522388, "grad_norm": 0.11622965193657621, "learning_rate": 1.2499848282259767e-05, "loss": 0.3489, "step": 3598 }, { "epoch": 3.836886993603412, "grad_norm": 0.13564634838395118, "learning_rate": 1.2478236718670834e-05, "loss": 0.3498, "step": 3599 }, { "epoch": 3.837953091684435, "grad_norm": 0.11550765632689088, "learning_rate": 1.2456640400708424e-05, "loss": 0.3515, "step": 3600 }, { "epoch": 3.8390191897654584, "grad_norm": 0.10993793619669771, "learning_rate": 1.2435059340335753e-05, "loss": 0.3435, "step": 3601 }, { "epoch": 3.840085287846482, "grad_norm": 0.11694097576572215, "learning_rate": 1.241349354950761e-05, "loss": 0.3487, "step": 3602 }, { "epoch": 3.8411513859275055, "grad_norm": 0.10341436996831099, "learning_rate": 1.2391943040170343e-05, "loss": 0.348, "step": 3603 }, { "epoch": 3.842217484008529, "grad_norm": 0.12769414945414873, "learning_rate": 1.2370407824261785e-05, "loss": 0.3489, "step": 3604 }, { "epoch": 3.843283582089552, "grad_norm": 0.09401881452558605, "learning_rate": 1.2348887913711343e-05, "loss": 0.3522, "step": 3605 }, { "epoch": 3.8443496801705757, "grad_norm": 0.11795172219684597, "learning_rate": 1.2327383320439937e-05, "loss": 0.3449, "step": 3606 }, { "epoch": 3.845415778251599, "grad_norm": 0.1051555043312947, "learning_rate": 1.2305894056359967e-05, "loss": 0.3499, "step": 3607 }, { "epoch": 3.8464818763326227, "grad_norm": 0.12236877442049936, "learning_rate": 1.2284420133375385e-05, "loss": 0.3522, "step": 3608 }, { "epoch": 3.8475479744136463, "grad_norm": 0.10450987023264936, "learning_rate": 1.2262961563381643e-05, "loss": 0.35, "step": 3609 }, { "epoch": 3.8486140724946694, "grad_norm": 0.11632851457098174, "learning_rate": 1.224151835826565e-05, "loss": 0.3425, "step": 3610 }, { "epoch": 3.849680170575693, "grad_norm": 0.1313585390733969, "learning_rate": 1.222009052990583e-05, "loss": 0.3501, "step": 3611 }, { "epoch": 3.8507462686567164, "grad_norm": 0.09885089090066136, "learning_rate": 1.2198678090172096e-05, "loss": 0.3476, "step": 3612 }, { "epoch": 3.85181236673774, "grad_norm": 0.12386048454800534, "learning_rate": 1.2177281050925829e-05, "loss": 0.3533, "step": 3613 }, { "epoch": 3.8528784648187635, "grad_norm": 0.10109383676324435, "learning_rate": 1.2155899424019864e-05, "loss": 0.3479, "step": 3614 }, { "epoch": 3.8539445628997866, "grad_norm": 0.11463340028361436, "learning_rate": 1.2134533221298517e-05, "loss": 0.3547, "step": 3615 }, { "epoch": 3.85501066098081, "grad_norm": 0.10969960215345126, "learning_rate": 1.2113182454597565e-05, "loss": 0.3514, "step": 3616 }, { "epoch": 3.8560767590618337, "grad_norm": 0.11112827116259213, "learning_rate": 1.2091847135744198e-05, "loss": 0.349, "step": 3617 }, { "epoch": 3.857142857142857, "grad_norm": 0.11843720838928255, "learning_rate": 1.2070527276557092e-05, "loss": 0.3441, "step": 3618 }, { "epoch": 3.8582089552238807, "grad_norm": 0.10458805732550369, "learning_rate": 1.2049222888846334e-05, "loss": 0.3561, "step": 3619 }, { "epoch": 3.859275053304904, "grad_norm": 0.11054412675997474, "learning_rate": 1.2027933984413469e-05, "loss": 0.3527, "step": 3620 }, { "epoch": 3.8603411513859274, "grad_norm": 0.10285700468758723, "learning_rate": 1.2006660575051407e-05, "loss": 0.3471, "step": 3621 }, { "epoch": 3.861407249466951, "grad_norm": 0.1136506392149834, "learning_rate": 1.1985402672544532e-05, "loss": 0.3453, "step": 3622 }, { "epoch": 3.8624733475479744, "grad_norm": 0.09831280484139478, "learning_rate": 1.1964160288668629e-05, "loss": 0.3506, "step": 3623 }, { "epoch": 3.863539445628998, "grad_norm": 0.11187723045529688, "learning_rate": 1.1942933435190845e-05, "loss": 0.3499, "step": 3624 }, { "epoch": 3.864605543710021, "grad_norm": 0.09266608981403165, "learning_rate": 1.1921722123869773e-05, "loss": 0.351, "step": 3625 }, { "epoch": 3.8656716417910446, "grad_norm": 0.11511057624858669, "learning_rate": 1.1900526366455369e-05, "loss": 0.3468, "step": 3626 }, { "epoch": 3.866737739872068, "grad_norm": 0.10956860731954124, "learning_rate": 1.1879346174689e-05, "loss": 0.3492, "step": 3627 }, { "epoch": 3.8678038379530917, "grad_norm": 0.10708959205937109, "learning_rate": 1.1858181560303366e-05, "loss": 0.3469, "step": 3628 }, { "epoch": 3.868869936034115, "grad_norm": 0.11831696552244149, "learning_rate": 1.183703253502257e-05, "loss": 0.3462, "step": 3629 }, { "epoch": 3.8699360341151388, "grad_norm": 0.11785222740074164, "learning_rate": 1.1815899110562081e-05, "loss": 0.3504, "step": 3630 }, { "epoch": 3.8710021321961623, "grad_norm": 0.11362853709274134, "learning_rate": 1.1794781298628725e-05, "loss": 0.349, "step": 3631 }, { "epoch": 3.8720682302771854, "grad_norm": 0.11822864612991446, "learning_rate": 1.1773679110920648e-05, "loss": 0.3477, "step": 3632 }, { "epoch": 3.873134328358209, "grad_norm": 0.10399345332909624, "learning_rate": 1.1752592559127378e-05, "loss": 0.3473, "step": 3633 }, { "epoch": 3.8742004264392325, "grad_norm": 0.10956934208464444, "learning_rate": 1.1731521654929785e-05, "loss": 0.3497, "step": 3634 }, { "epoch": 3.875266524520256, "grad_norm": 0.12301783671574228, "learning_rate": 1.1710466410000021e-05, "loss": 0.3505, "step": 3635 }, { "epoch": 3.8763326226012795, "grad_norm": 0.11590042052538112, "learning_rate": 1.1689426836001618e-05, "loss": 0.3478, "step": 3636 }, { "epoch": 3.8773987206823026, "grad_norm": 0.10524997397293638, "learning_rate": 1.1668402944589405e-05, "loss": 0.3476, "step": 3637 }, { "epoch": 3.878464818763326, "grad_norm": 0.13955869940094448, "learning_rate": 1.1647394747409538e-05, "loss": 0.3486, "step": 3638 }, { "epoch": 3.8795309168443497, "grad_norm": 0.09733253071530201, "learning_rate": 1.1626402256099439e-05, "loss": 0.3501, "step": 3639 }, { "epoch": 3.8805970149253732, "grad_norm": 0.13743368067281408, "learning_rate": 1.1605425482287869e-05, "loss": 0.3448, "step": 3640 }, { "epoch": 3.8816631130063968, "grad_norm": 0.13018722986242173, "learning_rate": 1.1584464437594875e-05, "loss": 0.3507, "step": 3641 }, { "epoch": 3.88272921108742, "grad_norm": 0.11788165392769949, "learning_rate": 1.1563519133631793e-05, "loss": 0.3506, "step": 3642 }, { "epoch": 3.8837953091684434, "grad_norm": 0.13701749120457338, "learning_rate": 1.1542589582001215e-05, "loss": 0.3456, "step": 3643 }, { "epoch": 3.884861407249467, "grad_norm": 0.10813589381609363, "learning_rate": 1.1521675794297028e-05, "loss": 0.3483, "step": 3644 }, { "epoch": 3.8859275053304905, "grad_norm": 0.1224953861353445, "learning_rate": 1.1500777782104406e-05, "loss": 0.3417, "step": 3645 }, { "epoch": 3.886993603411514, "grad_norm": 0.11152538352587757, "learning_rate": 1.1479895556999731e-05, "loss": 0.3514, "step": 3646 }, { "epoch": 3.888059701492537, "grad_norm": 0.11316497383817867, "learning_rate": 1.145902913055068e-05, "loss": 0.3523, "step": 3647 }, { "epoch": 3.8891257995735606, "grad_norm": 0.11019891396827555, "learning_rate": 1.1438178514316181e-05, "loss": 0.3498, "step": 3648 }, { "epoch": 3.890191897654584, "grad_norm": 0.10842084319722904, "learning_rate": 1.1417343719846387e-05, "loss": 0.354, "step": 3649 }, { "epoch": 3.8912579957356077, "grad_norm": 0.1056924090606256, "learning_rate": 1.1396524758682678e-05, "loss": 0.3456, "step": 3650 }, { "epoch": 3.8923240938166312, "grad_norm": 0.10490460770794618, "learning_rate": 1.137572164235769e-05, "loss": 0.3492, "step": 3651 }, { "epoch": 3.8933901918976543, "grad_norm": 0.09975927894418671, "learning_rate": 1.1354934382395272e-05, "loss": 0.3509, "step": 3652 }, { "epoch": 3.894456289978678, "grad_norm": 0.10145902183398905, "learning_rate": 1.1334162990310471e-05, "loss": 0.3447, "step": 3653 }, { "epoch": 3.8955223880597014, "grad_norm": 0.10481224542330943, "learning_rate": 1.1313407477609561e-05, "loss": 0.347, "step": 3654 }, { "epoch": 3.896588486140725, "grad_norm": 0.10001648400892982, "learning_rate": 1.1292667855790027e-05, "loss": 0.3513, "step": 3655 }, { "epoch": 3.8976545842217485, "grad_norm": 0.09792997968349304, "learning_rate": 1.1271944136340544e-05, "loss": 0.3423, "step": 3656 }, { "epoch": 3.8987206823027716, "grad_norm": 0.1087579485635863, "learning_rate": 1.1251236330740962e-05, "loss": 0.3518, "step": 3657 }, { "epoch": 3.8997867803837956, "grad_norm": 0.10273076928541128, "learning_rate": 1.123054445046233e-05, "loss": 0.3403, "step": 3658 }, { "epoch": 3.9008528784648187, "grad_norm": 0.11360397048458296, "learning_rate": 1.1209868506966881e-05, "loss": 0.3479, "step": 3659 }, { "epoch": 3.901918976545842, "grad_norm": 0.09676431212644183, "learning_rate": 1.118920851170803e-05, "loss": 0.3412, "step": 3660 }, { "epoch": 3.9029850746268657, "grad_norm": 0.09307333554900554, "learning_rate": 1.1168564476130301e-05, "loss": 0.3482, "step": 3661 }, { "epoch": 3.9040511727078893, "grad_norm": 0.10397754347232523, "learning_rate": 1.1147936411669446e-05, "loss": 0.3481, "step": 3662 }, { "epoch": 3.905117270788913, "grad_norm": 0.08937777918666276, "learning_rate": 1.1127324329752342e-05, "loss": 0.3483, "step": 3663 }, { "epoch": 3.906183368869936, "grad_norm": 0.09042966313131054, "learning_rate": 1.110672824179699e-05, "loss": 0.3487, "step": 3664 }, { "epoch": 3.9072494669509594, "grad_norm": 0.09557462612752123, "learning_rate": 1.1086148159212562e-05, "loss": 0.3536, "step": 3665 }, { "epoch": 3.908315565031983, "grad_norm": 0.09035818870121247, "learning_rate": 1.1065584093399373e-05, "loss": 0.3519, "step": 3666 }, { "epoch": 3.9093816631130065, "grad_norm": 0.0942584757184624, "learning_rate": 1.1045036055748817e-05, "loss": 0.3448, "step": 3667 }, { "epoch": 3.91044776119403, "grad_norm": 0.09364257142264377, "learning_rate": 1.102450405764345e-05, "loss": 0.3504, "step": 3668 }, { "epoch": 3.911513859275053, "grad_norm": 0.09007339517858659, "learning_rate": 1.100398811045695e-05, "loss": 0.3501, "step": 3669 }, { "epoch": 3.9125799573560767, "grad_norm": 0.08911386586011126, "learning_rate": 1.0983488225554053e-05, "loss": 0.3493, "step": 3670 }, { "epoch": 3.9136460554371, "grad_norm": 0.09675034869903551, "learning_rate": 1.0963004414290653e-05, "loss": 0.3572, "step": 3671 }, { "epoch": 3.9147121535181237, "grad_norm": 0.09188043129655729, "learning_rate": 1.0942536688013713e-05, "loss": 0.3541, "step": 3672 }, { "epoch": 3.9157782515991473, "grad_norm": 0.09937464367903363, "learning_rate": 1.0922085058061285e-05, "loss": 0.3482, "step": 3673 }, { "epoch": 3.9168443496801704, "grad_norm": 0.09926679761687571, "learning_rate": 1.0901649535762506e-05, "loss": 0.3567, "step": 3674 }, { "epoch": 3.917910447761194, "grad_norm": 0.09965290213224968, "learning_rate": 1.0881230132437608e-05, "loss": 0.3462, "step": 3675 }, { "epoch": 3.9189765458422174, "grad_norm": 0.1003530954835924, "learning_rate": 1.0860826859397862e-05, "loss": 0.3472, "step": 3676 }, { "epoch": 3.920042643923241, "grad_norm": 0.09596106891446698, "learning_rate": 1.0840439727945626e-05, "loss": 0.3479, "step": 3677 }, { "epoch": 3.9211087420042645, "grad_norm": 0.09919666795826218, "learning_rate": 1.0820068749374327e-05, "loss": 0.3449, "step": 3678 }, { "epoch": 3.9221748400852876, "grad_norm": 0.1012132428938292, "learning_rate": 1.0799713934968406e-05, "loss": 0.3512, "step": 3679 }, { "epoch": 3.923240938166311, "grad_norm": 0.09755542244495895, "learning_rate": 1.0779375296003374e-05, "loss": 0.352, "step": 3680 }, { "epoch": 3.9243070362473347, "grad_norm": 0.09000645950013296, "learning_rate": 1.0759052843745806e-05, "loss": 0.3468, "step": 3681 }, { "epoch": 3.925373134328358, "grad_norm": 0.09876937193470242, "learning_rate": 1.073874658945325e-05, "loss": 0.3504, "step": 3682 }, { "epoch": 3.9264392324093818, "grad_norm": 0.09239651298132856, "learning_rate": 1.0718456544374333e-05, "loss": 0.3491, "step": 3683 }, { "epoch": 3.927505330490405, "grad_norm": 0.10058467408628505, "learning_rate": 1.0698182719748682e-05, "loss": 0.3511, "step": 3684 }, { "epoch": 3.928571428571429, "grad_norm": 0.08600735325668393, "learning_rate": 1.0677925126806956e-05, "loss": 0.3421, "step": 3685 }, { "epoch": 3.929637526652452, "grad_norm": 0.09083257850771187, "learning_rate": 1.0657683776770788e-05, "loss": 0.3493, "step": 3686 }, { "epoch": 3.9307036247334755, "grad_norm": 0.09314333955394852, "learning_rate": 1.0637458680852841e-05, "loss": 0.3533, "step": 3687 }, { "epoch": 3.931769722814499, "grad_norm": 0.09360495013685566, "learning_rate": 1.0617249850256766e-05, "loss": 0.349, "step": 3688 }, { "epoch": 3.9328358208955225, "grad_norm": 0.09627416992471283, "learning_rate": 1.0597057296177225e-05, "loss": 0.3516, "step": 3689 }, { "epoch": 3.933901918976546, "grad_norm": 0.09541087809563052, "learning_rate": 1.0576881029799808e-05, "loss": 0.3543, "step": 3690 }, { "epoch": 3.934968017057569, "grad_norm": 0.10348458965898262, "learning_rate": 1.0556721062301141e-05, "loss": 0.3567, "step": 3691 }, { "epoch": 3.9360341151385927, "grad_norm": 0.09872750858387926, "learning_rate": 1.05365774048488e-05, "loss": 0.3491, "step": 3692 }, { "epoch": 3.9371002132196162, "grad_norm": 0.10350018663429181, "learning_rate": 1.051645006860131e-05, "loss": 0.3498, "step": 3693 }, { "epoch": 3.9381663113006398, "grad_norm": 0.10594157752622746, "learning_rate": 1.0496339064708172e-05, "loss": 0.3506, "step": 3694 }, { "epoch": 3.9392324093816633, "grad_norm": 0.10372816420096216, "learning_rate": 1.0476244404309846e-05, "loss": 0.3514, "step": 3695 }, { "epoch": 3.9402985074626864, "grad_norm": 0.10601590445244279, "learning_rate": 1.0456166098537737e-05, "loss": 0.3444, "step": 3696 }, { "epoch": 3.94136460554371, "grad_norm": 0.11008051797371131, "learning_rate": 1.0436104158514158e-05, "loss": 0.3473, "step": 3697 }, { "epoch": 3.9424307036247335, "grad_norm": 0.09606351168418183, "learning_rate": 1.0416058595352391e-05, "loss": 0.3484, "step": 3698 }, { "epoch": 3.943496801705757, "grad_norm": 0.1071996803306346, "learning_rate": 1.039602942015664e-05, "loss": 0.3476, "step": 3699 }, { "epoch": 3.9445628997867805, "grad_norm": 0.09623667142559411, "learning_rate": 1.0376016644022044e-05, "loss": 0.3447, "step": 3700 }, { "epoch": 3.9456289978678036, "grad_norm": 0.10721079752870356, "learning_rate": 1.035602027803461e-05, "loss": 0.3469, "step": 3701 }, { "epoch": 3.946695095948827, "grad_norm": 0.1025843055534788, "learning_rate": 1.0336040333271295e-05, "loss": 0.3447, "step": 3702 }, { "epoch": 3.9477611940298507, "grad_norm": 0.09871032111134458, "learning_rate": 1.0316076820799968e-05, "loss": 0.3529, "step": 3703 }, { "epoch": 3.9488272921108742, "grad_norm": 0.10311671354974446, "learning_rate": 1.029612975167935e-05, "loss": 0.3474, "step": 3704 }, { "epoch": 3.949893390191898, "grad_norm": 0.10336281514714324, "learning_rate": 1.0276199136959097e-05, "loss": 0.3512, "step": 3705 }, { "epoch": 3.950959488272921, "grad_norm": 0.10601850200687606, "learning_rate": 1.025628498767973e-05, "loss": 0.3514, "step": 3706 }, { "epoch": 3.9520255863539444, "grad_norm": 0.08908113002105139, "learning_rate": 1.0236387314872664e-05, "loss": 0.3435, "step": 3707 }, { "epoch": 3.953091684434968, "grad_norm": 0.10671007341009846, "learning_rate": 1.0216506129560155e-05, "loss": 0.3551, "step": 3708 }, { "epoch": 3.9541577825159915, "grad_norm": 0.09784872373144313, "learning_rate": 1.0196641442755354e-05, "loss": 0.3466, "step": 3709 }, { "epoch": 3.955223880597015, "grad_norm": 0.11088241430701273, "learning_rate": 1.0176793265462282e-05, "loss": 0.3455, "step": 3710 }, { "epoch": 3.956289978678038, "grad_norm": 0.0910427685127364, "learning_rate": 1.0156961608675768e-05, "loss": 0.3505, "step": 3711 }, { "epoch": 3.957356076759062, "grad_norm": 0.10461641359651551, "learning_rate": 1.0137146483381538e-05, "loss": 0.354, "step": 3712 }, { "epoch": 3.958422174840085, "grad_norm": 0.09627545711712145, "learning_rate": 1.0117347900556137e-05, "loss": 0.3513, "step": 3713 }, { "epoch": 3.9594882729211087, "grad_norm": 0.1073622509994735, "learning_rate": 1.0097565871166961e-05, "loss": 0.3546, "step": 3714 }, { "epoch": 3.9605543710021323, "grad_norm": 0.09484625771277952, "learning_rate": 1.0077800406172207e-05, "loss": 0.3524, "step": 3715 }, { "epoch": 3.961620469083156, "grad_norm": 0.08894481152860242, "learning_rate": 1.0058051516520929e-05, "loss": 0.3474, "step": 3716 }, { "epoch": 3.9626865671641793, "grad_norm": 0.09733312978906855, "learning_rate": 1.0038319213152979e-05, "loss": 0.3495, "step": 3717 }, { "epoch": 3.9637526652452024, "grad_norm": 0.09864930949926093, "learning_rate": 1.001860350699904e-05, "loss": 0.3496, "step": 3718 }, { "epoch": 3.964818763326226, "grad_norm": 0.0981894103889243, "learning_rate": 9.99890440898057e-06, "loss": 0.3529, "step": 3719 }, { "epoch": 3.9658848614072495, "grad_norm": 0.10482896841184723, "learning_rate": 9.97922193000985e-06, "loss": 0.3519, "step": 3720 }, { "epoch": 3.966950959488273, "grad_norm": 0.10498811386677907, "learning_rate": 9.959556080989973e-06, "loss": 0.3488, "step": 3721 }, { "epoch": 3.9680170575692966, "grad_norm": 0.11366999497073144, "learning_rate": 9.939906872814764e-06, "loss": 0.3535, "step": 3722 }, { "epoch": 3.9690831556503197, "grad_norm": 0.09665112411107446, "learning_rate": 9.920274316368879e-06, "loss": 0.3511, "step": 3723 }, { "epoch": 3.970149253731343, "grad_norm": 0.10146451503365203, "learning_rate": 9.900658422527734e-06, "loss": 0.346, "step": 3724 }, { "epoch": 3.9712153518123667, "grad_norm": 0.10925095180031567, "learning_rate": 9.881059202157525e-06, "loss": 0.3507, "step": 3725 }, { "epoch": 3.9722814498933903, "grad_norm": 0.09225433257638278, "learning_rate": 9.86147666611518e-06, "loss": 0.3482, "step": 3726 }, { "epoch": 3.973347547974414, "grad_norm": 0.0943825856008904, "learning_rate": 9.841910825248412e-06, "loss": 0.3464, "step": 3727 }, { "epoch": 3.974413646055437, "grad_norm": 0.09290035671178411, "learning_rate": 9.82236169039569e-06, "loss": 0.3449, "step": 3728 }, { "epoch": 3.9754797441364604, "grad_norm": 0.0921926282044607, "learning_rate": 9.802829272386227e-06, "loss": 0.3554, "step": 3729 }, { "epoch": 3.976545842217484, "grad_norm": 0.0960555525918967, "learning_rate": 9.783313582039935e-06, "loss": 0.3483, "step": 3730 }, { "epoch": 3.9776119402985075, "grad_norm": 0.0917415114727513, "learning_rate": 9.763814630167516e-06, "loss": 0.3458, "step": 3731 }, { "epoch": 3.978678038379531, "grad_norm": 0.08801795546770015, "learning_rate": 9.744332427570384e-06, "loss": 0.3436, "step": 3732 }, { "epoch": 3.979744136460554, "grad_norm": 0.09754716872379321, "learning_rate": 9.72486698504064e-06, "loss": 0.3406, "step": 3733 }, { "epoch": 3.9808102345415777, "grad_norm": 0.09042401583271688, "learning_rate": 9.705418313361141e-06, "loss": 0.3486, "step": 3734 }, { "epoch": 3.981876332622601, "grad_norm": 0.08028758020380314, "learning_rate": 9.685986423305449e-06, "loss": 0.3494, "step": 3735 }, { "epoch": 3.9829424307036247, "grad_norm": 0.09007532159921652, "learning_rate": 9.666571325637806e-06, "loss": 0.3482, "step": 3736 }, { "epoch": 3.9840085287846483, "grad_norm": 0.09260516061644863, "learning_rate": 9.647173031113173e-06, "loss": 0.3452, "step": 3737 }, { "epoch": 3.9850746268656714, "grad_norm": 0.09089810495372147, "learning_rate": 9.627791550477209e-06, "loss": 0.3462, "step": 3738 }, { "epoch": 3.9861407249466954, "grad_norm": 0.09771736284518276, "learning_rate": 9.608426894466225e-06, "loss": 0.3532, "step": 3739 }, { "epoch": 3.9872068230277184, "grad_norm": 0.1259794070436244, "learning_rate": 9.589079073807244e-06, "loss": 0.349, "step": 3740 }, { "epoch": 3.988272921108742, "grad_norm": 0.09140483649945423, "learning_rate": 9.569748099217962e-06, "loss": 0.3433, "step": 3741 }, { "epoch": 3.9893390191897655, "grad_norm": 0.12248710790523741, "learning_rate": 9.55043398140672e-06, "loss": 0.3522, "step": 3742 }, { "epoch": 3.990405117270789, "grad_norm": 0.09833501236467464, "learning_rate": 9.53113673107254e-06, "loss": 0.3531, "step": 3743 }, { "epoch": 3.9914712153518126, "grad_norm": 0.10722069561647879, "learning_rate": 9.511856358905108e-06, "loss": 0.3513, "step": 3744 }, { "epoch": 3.9925373134328357, "grad_norm": 0.12373429125821357, "learning_rate": 9.492592875584728e-06, "loss": 0.3498, "step": 3745 }, { "epoch": 3.9936034115138592, "grad_norm": 0.10792510560456024, "learning_rate": 9.473346291782376e-06, "loss": 0.3532, "step": 3746 }, { "epoch": 3.9946695095948828, "grad_norm": 0.10969296551778704, "learning_rate": 9.454116618159675e-06, "loss": 0.3481, "step": 3747 }, { "epoch": 3.9957356076759063, "grad_norm": 0.10095085119982077, "learning_rate": 9.434903865368837e-06, "loss": 0.3484, "step": 3748 }, { "epoch": 3.99680170575693, "grad_norm": 0.1008779164840716, "learning_rate": 9.415708044052744e-06, "loss": 0.3515, "step": 3749 }, { "epoch": 3.997867803837953, "grad_norm": 0.09334484031468043, "learning_rate": 9.396529164844893e-06, "loss": 0.3536, "step": 3750 }, { "epoch": 3.9989339019189765, "grad_norm": 0.08642194713513292, "learning_rate": 9.377367238369368e-06, "loss": 0.3451, "step": 3751 }, { "epoch": 4.0, "grad_norm": 0.1134408652616496, "learning_rate": 9.358222275240884e-06, "loss": 0.336, "step": 3752 }, { "epoch": 4.001066098081023, "grad_norm": 0.104623946486961, "learning_rate": 9.33909428606476e-06, "loss": 0.3372, "step": 3753 }, { "epoch": 4.002132196162047, "grad_norm": 0.10983574887630808, "learning_rate": 9.31998328143692e-06, "loss": 0.3316, "step": 3754 }, { "epoch": 4.00319829424307, "grad_norm": 0.10886704749727195, "learning_rate": 9.30088927194384e-06, "loss": 0.3346, "step": 3755 }, { "epoch": 4.004264392324094, "grad_norm": 0.11057981475076427, "learning_rate": 9.281812268162626e-06, "loss": 0.3377, "step": 3756 }, { "epoch": 4.005330490405117, "grad_norm": 0.12400384882704346, "learning_rate": 9.262752280660944e-06, "loss": 0.3342, "step": 3757 }, { "epoch": 4.00639658848614, "grad_norm": 0.11298478774918766, "learning_rate": 9.243709319997047e-06, "loss": 0.338, "step": 3758 }, { "epoch": 4.007462686567164, "grad_norm": 0.13596408021241022, "learning_rate": 9.224683396719728e-06, "loss": 0.335, "step": 3759 }, { "epoch": 4.008528784648187, "grad_norm": 0.11672755460945555, "learning_rate": 9.205674521368362e-06, "loss": 0.3316, "step": 3760 }, { "epoch": 4.009594882729211, "grad_norm": 0.11331525635027023, "learning_rate": 9.186682704472898e-06, "loss": 0.3294, "step": 3761 }, { "epoch": 4.0106609808102345, "grad_norm": 0.1195438195692172, "learning_rate": 9.167707956553787e-06, "loss": 0.3343, "step": 3762 }, { "epoch": 4.011727078891258, "grad_norm": 0.1091957644805815, "learning_rate": 9.148750288122063e-06, "loss": 0.3348, "step": 3763 }, { "epoch": 4.0127931769722816, "grad_norm": 0.11204673273341918, "learning_rate": 9.129809709679297e-06, "loss": 0.3321, "step": 3764 }, { "epoch": 4.013859275053305, "grad_norm": 0.10282221122253991, "learning_rate": 9.110886231717595e-06, "loss": 0.3339, "step": 3765 }, { "epoch": 4.014925373134329, "grad_norm": 0.10864476435323264, "learning_rate": 9.09197986471955e-06, "loss": 0.3355, "step": 3766 }, { "epoch": 4.015991471215352, "grad_norm": 0.10772893714735123, "learning_rate": 9.073090619158322e-06, "loss": 0.3306, "step": 3767 }, { "epoch": 4.017057569296376, "grad_norm": 0.11249780372035993, "learning_rate": 9.054218505497587e-06, "loss": 0.3343, "step": 3768 }, { "epoch": 4.018123667377399, "grad_norm": 0.1126010464588701, "learning_rate": 9.035363534191486e-06, "loss": 0.3312, "step": 3769 }, { "epoch": 4.019189765458422, "grad_norm": 0.12907028042017912, "learning_rate": 9.016525715684711e-06, "loss": 0.3327, "step": 3770 }, { "epoch": 4.020255863539446, "grad_norm": 0.11807831450158258, "learning_rate": 8.99770506041243e-06, "loss": 0.3386, "step": 3771 }, { "epoch": 4.021321961620469, "grad_norm": 0.10963014735265454, "learning_rate": 8.978901578800316e-06, "loss": 0.3309, "step": 3772 }, { "epoch": 4.022388059701493, "grad_norm": 0.11867805956803665, "learning_rate": 8.960115281264507e-06, "loss": 0.3423, "step": 3773 }, { "epoch": 4.023454157782516, "grad_norm": 0.11598119651875034, "learning_rate": 8.941346178211639e-06, "loss": 0.3281, "step": 3774 }, { "epoch": 4.024520255863539, "grad_norm": 0.10424229151793267, "learning_rate": 8.922594280038823e-06, "loss": 0.323, "step": 3775 }, { "epoch": 4.025586353944563, "grad_norm": 0.10762211731941046, "learning_rate": 8.903859597133646e-06, "loss": 0.3344, "step": 3776 }, { "epoch": 4.026652452025586, "grad_norm": 0.11394090140716011, "learning_rate": 8.88514213987413e-06, "loss": 0.3331, "step": 3777 }, { "epoch": 4.02771855010661, "grad_norm": 0.09343973744188745, "learning_rate": 8.866441918628777e-06, "loss": 0.333, "step": 3778 }, { "epoch": 4.028784648187633, "grad_norm": 0.11583112915467723, "learning_rate": 8.847758943756556e-06, "loss": 0.3385, "step": 3779 }, { "epoch": 4.029850746268656, "grad_norm": 0.10675906671526333, "learning_rate": 8.829093225606842e-06, "loss": 0.3338, "step": 3780 }, { "epoch": 4.03091684434968, "grad_norm": 0.10418803649968214, "learning_rate": 8.810444774519475e-06, "loss": 0.3383, "step": 3781 }, { "epoch": 4.031982942430703, "grad_norm": 0.12106960416467054, "learning_rate": 8.791813600824742e-06, "loss": 0.3378, "step": 3782 }, { "epoch": 4.033049040511727, "grad_norm": 0.11769429634565491, "learning_rate": 8.773199714843339e-06, "loss": 0.3326, "step": 3783 }, { "epoch": 4.0341151385927505, "grad_norm": 0.11355409773845418, "learning_rate": 8.754603126886385e-06, "loss": 0.3351, "step": 3784 }, { "epoch": 4.035181236673774, "grad_norm": 0.09880842808847773, "learning_rate": 8.736023847255426e-06, "loss": 0.3356, "step": 3785 }, { "epoch": 4.036247334754798, "grad_norm": 0.11158321630050068, "learning_rate": 8.71746188624242e-06, "loss": 0.3363, "step": 3786 }, { "epoch": 4.037313432835821, "grad_norm": 0.1161769963643131, "learning_rate": 8.698917254129732e-06, "loss": 0.3328, "step": 3787 }, { "epoch": 4.038379530916845, "grad_norm": 0.11351757271831203, "learning_rate": 8.680389961190116e-06, "loss": 0.3311, "step": 3788 }, { "epoch": 4.039445628997868, "grad_norm": 0.10521775128019155, "learning_rate": 8.66188001768673e-06, "loss": 0.3346, "step": 3789 }, { "epoch": 4.040511727078891, "grad_norm": 0.10112600207752875, "learning_rate": 8.64338743387314e-06, "loss": 0.3365, "step": 3790 }, { "epoch": 4.041577825159915, "grad_norm": 0.10617915170837067, "learning_rate": 8.624912219993248e-06, "loss": 0.3332, "step": 3791 }, { "epoch": 4.042643923240938, "grad_norm": 0.10169862043274906, "learning_rate": 8.606454386281368e-06, "loss": 0.3276, "step": 3792 }, { "epoch": 4.043710021321962, "grad_norm": 0.10543707108073325, "learning_rate": 8.588013942962195e-06, "loss": 0.337, "step": 3793 }, { "epoch": 4.044776119402985, "grad_norm": 0.10597483996500817, "learning_rate": 8.569590900250775e-06, "loss": 0.3389, "step": 3794 }, { "epoch": 4.045842217484009, "grad_norm": 0.10326713662353049, "learning_rate": 8.551185268352502e-06, "loss": 0.3337, "step": 3795 }, { "epoch": 4.046908315565032, "grad_norm": 0.0966216388278116, "learning_rate": 8.532797057463145e-06, "loss": 0.3398, "step": 3796 }, { "epoch": 4.047974413646055, "grad_norm": 0.09422934050818582, "learning_rate": 8.51442627776883e-06, "loss": 0.3373, "step": 3797 }, { "epoch": 4.049040511727079, "grad_norm": 0.1036405776308619, "learning_rate": 8.496072939445997e-06, "loss": 0.3336, "step": 3798 }, { "epoch": 4.050106609808102, "grad_norm": 0.10297234333425688, "learning_rate": 8.477737052661444e-06, "loss": 0.3357, "step": 3799 }, { "epoch": 4.051172707889126, "grad_norm": 0.09051843440055769, "learning_rate": 8.459418627572304e-06, "loss": 0.3356, "step": 3800 }, { "epoch": 4.052238805970149, "grad_norm": 0.09979444399952614, "learning_rate": 8.44111767432604e-06, "loss": 0.3383, "step": 3801 }, { "epoch": 4.053304904051172, "grad_norm": 0.09217956240397418, "learning_rate": 8.422834203060418e-06, "loss": 0.3324, "step": 3802 }, { "epoch": 4.054371002132196, "grad_norm": 0.09053551898130377, "learning_rate": 8.404568223903529e-06, "loss": 0.3336, "step": 3803 }, { "epoch": 4.0554371002132195, "grad_norm": 0.09177723542384852, "learning_rate": 8.386319746973787e-06, "loss": 0.3339, "step": 3804 }, { "epoch": 4.056503198294243, "grad_norm": 0.09189669268340435, "learning_rate": 8.36808878237989e-06, "loss": 0.3382, "step": 3805 }, { "epoch": 4.0575692963752665, "grad_norm": 0.0942192210220161, "learning_rate": 8.349875340220847e-06, "loss": 0.327, "step": 3806 }, { "epoch": 4.05863539445629, "grad_norm": 0.09403630781253433, "learning_rate": 8.331679430585971e-06, "loss": 0.3406, "step": 3807 }, { "epoch": 4.059701492537314, "grad_norm": 0.10368304373550687, "learning_rate": 8.313501063554827e-06, "loss": 0.3329, "step": 3808 }, { "epoch": 4.060767590618337, "grad_norm": 0.09734440431718942, "learning_rate": 8.295340249197301e-06, "loss": 0.3339, "step": 3809 }, { "epoch": 4.061833688699361, "grad_norm": 0.09789072226393697, "learning_rate": 8.277196997573545e-06, "loss": 0.3392, "step": 3810 }, { "epoch": 4.062899786780384, "grad_norm": 0.11027941919044942, "learning_rate": 8.259071318733962e-06, "loss": 0.3326, "step": 3811 }, { "epoch": 4.063965884861407, "grad_norm": 0.09341818511371246, "learning_rate": 8.240963222719243e-06, "loss": 0.3304, "step": 3812 }, { "epoch": 4.065031982942431, "grad_norm": 0.0985386753548714, "learning_rate": 8.222872719560339e-06, "loss": 0.3373, "step": 3813 }, { "epoch": 4.066098081023454, "grad_norm": 0.09601340900666522, "learning_rate": 8.204799819278438e-06, "loss": 0.3352, "step": 3814 }, { "epoch": 4.067164179104478, "grad_norm": 0.11111998488141665, "learning_rate": 8.186744531884989e-06, "loss": 0.3399, "step": 3815 }, { "epoch": 4.068230277185501, "grad_norm": 0.09039086409906084, "learning_rate": 8.168706867381692e-06, "loss": 0.3316, "step": 3816 }, { "epoch": 4.069296375266524, "grad_norm": 0.09819313076741153, "learning_rate": 8.150686835760467e-06, "loss": 0.3389, "step": 3817 }, { "epoch": 4.070362473347548, "grad_norm": 0.10154340371524484, "learning_rate": 8.132684447003471e-06, "loss": 0.3348, "step": 3818 }, { "epoch": 4.071428571428571, "grad_norm": 0.08699449860021097, "learning_rate": 8.114699711083113e-06, "loss": 0.3377, "step": 3819 }, { "epoch": 4.072494669509595, "grad_norm": 0.10459986672211814, "learning_rate": 8.096732637961974e-06, "loss": 0.3366, "step": 3820 }, { "epoch": 4.073560767590618, "grad_norm": 0.09394697895679324, "learning_rate": 8.078783237592894e-06, "loss": 0.3309, "step": 3821 }, { "epoch": 4.074626865671641, "grad_norm": 0.0914538941685346, "learning_rate": 8.060851519918901e-06, "loss": 0.3361, "step": 3822 }, { "epoch": 4.075692963752665, "grad_norm": 0.08804168096705205, "learning_rate": 8.042937494873255e-06, "loss": 0.3342, "step": 3823 }, { "epoch": 4.076759061833688, "grad_norm": 0.08507323171593308, "learning_rate": 8.025041172379366e-06, "loss": 0.3338, "step": 3824 }, { "epoch": 4.077825159914712, "grad_norm": 0.08903348958715734, "learning_rate": 8.007162562350882e-06, "loss": 0.3372, "step": 3825 }, { "epoch": 4.0788912579957355, "grad_norm": 0.08728752400888612, "learning_rate": 7.989301674691634e-06, "loss": 0.3299, "step": 3826 }, { "epoch": 4.0799573560767595, "grad_norm": 0.10425110765874443, "learning_rate": 7.971458519295598e-06, "loss": 0.3363, "step": 3827 }, { "epoch": 4.081023454157783, "grad_norm": 0.08654110423017557, "learning_rate": 7.953633106046971e-06, "loss": 0.3362, "step": 3828 }, { "epoch": 4.082089552238806, "grad_norm": 0.09648199398820004, "learning_rate": 7.935825444820109e-06, "loss": 0.3336, "step": 3829 }, { "epoch": 4.08315565031983, "grad_norm": 0.09601930808087997, "learning_rate": 7.918035545479532e-06, "loss": 0.3331, "step": 3830 }, { "epoch": 4.084221748400853, "grad_norm": 0.08913546055906947, "learning_rate": 7.900263417879905e-06, "loss": 0.3333, "step": 3831 }, { "epoch": 4.085287846481877, "grad_norm": 0.08746186482392278, "learning_rate": 7.882509071866074e-06, "loss": 0.3351, "step": 3832 }, { "epoch": 4.0863539445629, "grad_norm": 0.09436654519402421, "learning_rate": 7.864772517273019e-06, "loss": 0.3287, "step": 3833 }, { "epoch": 4.087420042643923, "grad_norm": 0.09443565575400105, "learning_rate": 7.847053763925884e-06, "loss": 0.3306, "step": 3834 }, { "epoch": 4.088486140724947, "grad_norm": 0.09012184838426746, "learning_rate": 7.829352821639915e-06, "loss": 0.3345, "step": 3835 }, { "epoch": 4.08955223880597, "grad_norm": 0.08459214899851109, "learning_rate": 7.811669700220523e-06, "loss": 0.3338, "step": 3836 }, { "epoch": 4.090618336886994, "grad_norm": 0.0940823716359374, "learning_rate": 7.794004409463256e-06, "loss": 0.3333, "step": 3837 }, { "epoch": 4.091684434968017, "grad_norm": 0.09658670184543283, "learning_rate": 7.77635695915374e-06, "loss": 0.332, "step": 3838 }, { "epoch": 4.09275053304904, "grad_norm": 0.09760335361225825, "learning_rate": 7.758727359067752e-06, "loss": 0.334, "step": 3839 }, { "epoch": 4.093816631130064, "grad_norm": 0.09269720585969814, "learning_rate": 7.741115618971182e-06, "loss": 0.3381, "step": 3840 }, { "epoch": 4.094882729211087, "grad_norm": 0.08955928469567048, "learning_rate": 7.723521748620023e-06, "loss": 0.334, "step": 3841 }, { "epoch": 4.095948827292111, "grad_norm": 0.09989251564772907, "learning_rate": 7.705945757760349e-06, "loss": 0.3324, "step": 3842 }, { "epoch": 4.097014925373134, "grad_norm": 0.08940858807005556, "learning_rate": 7.688387656128355e-06, "loss": 0.3462, "step": 3843 }, { "epoch": 4.098081023454157, "grad_norm": 0.09359897320243363, "learning_rate": 7.670847453450325e-06, "loss": 0.3359, "step": 3844 }, { "epoch": 4.099147121535181, "grad_norm": 0.09401701464605473, "learning_rate": 7.653325159442597e-06, "loss": 0.3352, "step": 3845 }, { "epoch": 4.100213219616204, "grad_norm": 0.08829874201235319, "learning_rate": 7.63582078381163e-06, "loss": 0.3375, "step": 3846 }, { "epoch": 4.101279317697228, "grad_norm": 0.10865348890518493, "learning_rate": 7.618334336253927e-06, "loss": 0.3344, "step": 3847 }, { "epoch": 4.1023454157782515, "grad_norm": 0.09122759099076949, "learning_rate": 7.60086582645609e-06, "loss": 0.3311, "step": 3848 }, { "epoch": 4.103411513859275, "grad_norm": 0.08531789969850555, "learning_rate": 7.5834152640947444e-06, "loss": 0.3326, "step": 3849 }, { "epoch": 4.104477611940299, "grad_norm": 0.108511169138579, "learning_rate": 7.565982658836599e-06, "loss": 0.3299, "step": 3850 }, { "epoch": 4.105543710021322, "grad_norm": 0.07907053738511821, "learning_rate": 7.548568020338418e-06, "loss": 0.3335, "step": 3851 }, { "epoch": 4.106609808102346, "grad_norm": 0.09975341694550234, "learning_rate": 7.531171358247009e-06, "loss": 0.3352, "step": 3852 }, { "epoch": 4.107675906183369, "grad_norm": 0.08769932023249331, "learning_rate": 7.5137926821992055e-06, "loss": 0.334, "step": 3853 }, { "epoch": 4.108742004264393, "grad_norm": 0.10031133415920226, "learning_rate": 7.496432001821898e-06, "loss": 0.3311, "step": 3854 }, { "epoch": 4.109808102345416, "grad_norm": 0.09300616655005942, "learning_rate": 7.4790893267320115e-06, "loss": 0.3347, "step": 3855 }, { "epoch": 4.110874200426439, "grad_norm": 0.09170488621474711, "learning_rate": 7.461764666536471e-06, "loss": 0.3334, "step": 3856 }, { "epoch": 4.111940298507463, "grad_norm": 0.10055859853923502, "learning_rate": 7.444458030832238e-06, "loss": 0.3395, "step": 3857 }, { "epoch": 4.113006396588486, "grad_norm": 0.08680033935993048, "learning_rate": 7.427169429206294e-06, "loss": 0.3411, "step": 3858 }, { "epoch": 4.11407249466951, "grad_norm": 0.09485145644525105, "learning_rate": 7.409898871235639e-06, "loss": 0.333, "step": 3859 }, { "epoch": 4.115138592750533, "grad_norm": 0.09123152248422452, "learning_rate": 7.39264636648724e-06, "loss": 0.3331, "step": 3860 }, { "epoch": 4.116204690831556, "grad_norm": 0.10403836531483734, "learning_rate": 7.375411924518099e-06, "loss": 0.328, "step": 3861 }, { "epoch": 4.11727078891258, "grad_norm": 0.08367336466007777, "learning_rate": 7.358195554875203e-06, "loss": 0.3337, "step": 3862 }, { "epoch": 4.118336886993603, "grad_norm": 0.10846732273711228, "learning_rate": 7.340997267095535e-06, "loss": 0.3333, "step": 3863 }, { "epoch": 4.119402985074627, "grad_norm": 0.12164518439766003, "learning_rate": 7.323817070706036e-06, "loss": 0.3341, "step": 3864 }, { "epoch": 4.12046908315565, "grad_norm": 0.09560739668306972, "learning_rate": 7.3066549752236435e-06, "loss": 0.3364, "step": 3865 }, { "epoch": 4.121535181236673, "grad_norm": 0.09724336285167177, "learning_rate": 7.289510990155286e-06, "loss": 0.3264, "step": 3866 }, { "epoch": 4.122601279317697, "grad_norm": 0.1017353520878921, "learning_rate": 7.2723851249978114e-06, "loss": 0.3305, "step": 3867 }, { "epoch": 4.1236673773987205, "grad_norm": 0.1069625874611212, "learning_rate": 7.255277389238075e-06, "loss": 0.335, "step": 3868 }, { "epoch": 4.1247334754797444, "grad_norm": 0.09669306285547304, "learning_rate": 7.238187792352871e-06, "loss": 0.328, "step": 3869 }, { "epoch": 4.1257995735607675, "grad_norm": 0.10282239370897285, "learning_rate": 7.221116343808963e-06, "loss": 0.3311, "step": 3870 }, { "epoch": 4.126865671641791, "grad_norm": 0.10105373041143043, "learning_rate": 7.2040630530630175e-06, "loss": 0.334, "step": 3871 }, { "epoch": 4.127931769722815, "grad_norm": 0.0854802255803901, "learning_rate": 7.187027929561683e-06, "loss": 0.3374, "step": 3872 }, { "epoch": 4.128997867803838, "grad_norm": 0.10520223879341609, "learning_rate": 7.170010982741549e-06, "loss": 0.3332, "step": 3873 }, { "epoch": 4.130063965884862, "grad_norm": 0.09932971506774815, "learning_rate": 7.153012222029097e-06, "loss": 0.3313, "step": 3874 }, { "epoch": 4.131130063965885, "grad_norm": 0.09719494548254369, "learning_rate": 7.136031656840763e-06, "loss": 0.3308, "step": 3875 }, { "epoch": 4.132196162046908, "grad_norm": 0.08740775740958265, "learning_rate": 7.1190692965829126e-06, "loss": 0.3353, "step": 3876 }, { "epoch": 4.133262260127932, "grad_norm": 0.08603123144210989, "learning_rate": 7.102125150651784e-06, "loss": 0.3288, "step": 3877 }, { "epoch": 4.134328358208955, "grad_norm": 0.09256501786020642, "learning_rate": 7.085199228433577e-06, "loss": 0.3333, "step": 3878 }, { "epoch": 4.135394456289979, "grad_norm": 0.09108945673753906, "learning_rate": 7.068291539304368e-06, "loss": 0.3381, "step": 3879 }, { "epoch": 4.136460554371002, "grad_norm": 0.0796397200521715, "learning_rate": 7.0514020926301285e-06, "loss": 0.3314, "step": 3880 }, { "epoch": 4.137526652452026, "grad_norm": 0.09620207070657663, "learning_rate": 7.034530897766738e-06, "loss": 0.3348, "step": 3881 }, { "epoch": 4.138592750533049, "grad_norm": 0.08902821892098343, "learning_rate": 7.017677964059979e-06, "loss": 0.336, "step": 3882 }, { "epoch": 4.139658848614072, "grad_norm": 0.08377254099774022, "learning_rate": 7.000843300845473e-06, "loss": 0.3312, "step": 3883 }, { "epoch": 4.140724946695096, "grad_norm": 0.09387188110362456, "learning_rate": 6.984026917448763e-06, "loss": 0.3355, "step": 3884 }, { "epoch": 4.141791044776119, "grad_norm": 0.08819576291353799, "learning_rate": 6.967228823185257e-06, "loss": 0.3306, "step": 3885 }, { "epoch": 4.142857142857143, "grad_norm": 0.09756341988481407, "learning_rate": 6.950449027360213e-06, "loss": 0.3359, "step": 3886 }, { "epoch": 4.143923240938166, "grad_norm": 0.08464278902185567, "learning_rate": 6.9336875392687695e-06, "loss": 0.3336, "step": 3887 }, { "epoch": 4.144989339019189, "grad_norm": 0.08869325843091481, "learning_rate": 6.91694436819593e-06, "loss": 0.3325, "step": 3888 }, { "epoch": 4.146055437100213, "grad_norm": 0.08580192830087008, "learning_rate": 6.9002195234165295e-06, "loss": 0.329, "step": 3889 }, { "epoch": 4.1471215351812365, "grad_norm": 0.09091187799203043, "learning_rate": 6.8835130141952625e-06, "loss": 0.3319, "step": 3890 }, { "epoch": 4.1481876332622605, "grad_norm": 0.08949013100276429, "learning_rate": 6.866824849786673e-06, "loss": 0.3348, "step": 3891 }, { "epoch": 4.149253731343284, "grad_norm": 0.07727474516040857, "learning_rate": 6.850155039435145e-06, "loss": 0.3353, "step": 3892 }, { "epoch": 4.150319829424307, "grad_norm": 0.09632177794333178, "learning_rate": 6.833503592374864e-06, "loss": 0.3387, "step": 3893 }, { "epoch": 4.151385927505331, "grad_norm": 0.08865373218038916, "learning_rate": 6.8168705178298835e-06, "loss": 0.3309, "step": 3894 }, { "epoch": 4.152452025586354, "grad_norm": 0.09466620554710381, "learning_rate": 6.800255825014063e-06, "loss": 0.333, "step": 3895 }, { "epoch": 4.153518123667378, "grad_norm": 0.08200431933116618, "learning_rate": 6.78365952313107e-06, "loss": 0.3304, "step": 3896 }, { "epoch": 4.154584221748401, "grad_norm": 0.0857938915091229, "learning_rate": 6.767081621374392e-06, "loss": 0.3335, "step": 3897 }, { "epoch": 4.155650319829424, "grad_norm": 0.09483772251293643, "learning_rate": 6.750522128927332e-06, "loss": 0.3366, "step": 3898 }, { "epoch": 4.156716417910448, "grad_norm": 0.0963326853227681, "learning_rate": 6.733981054962995e-06, "loss": 0.3287, "step": 3899 }, { "epoch": 4.157782515991471, "grad_norm": 0.10620078154702665, "learning_rate": 6.717458408644262e-06, "loss": 0.3337, "step": 3900 }, { "epoch": 4.158848614072495, "grad_norm": 0.0807624229086923, "learning_rate": 6.700954199123821e-06, "loss": 0.3351, "step": 3901 }, { "epoch": 4.159914712153518, "grad_norm": 0.08529115263300027, "learning_rate": 6.68446843554416e-06, "loss": 0.3324, "step": 3902 }, { "epoch": 4.160980810234541, "grad_norm": 0.09055299262053584, "learning_rate": 6.66800112703752e-06, "loss": 0.3337, "step": 3903 }, { "epoch": 4.162046908315565, "grad_norm": 0.10436288204758602, "learning_rate": 6.6515522827259414e-06, "loss": 0.3353, "step": 3904 }, { "epoch": 4.163113006396588, "grad_norm": 0.09038774397915303, "learning_rate": 6.63512191172123e-06, "loss": 0.3332, "step": 3905 }, { "epoch": 4.164179104477612, "grad_norm": 0.10833203568207923, "learning_rate": 6.618710023124961e-06, "loss": 0.3335, "step": 3906 }, { "epoch": 4.165245202558635, "grad_norm": 0.08665653296892867, "learning_rate": 6.6023166260284555e-06, "loss": 0.3384, "step": 3907 }, { "epoch": 4.166311300639659, "grad_norm": 0.08387664029237912, "learning_rate": 6.585941729512808e-06, "loss": 0.3385, "step": 3908 }, { "epoch": 4.167377398720682, "grad_norm": 0.08361794082456976, "learning_rate": 6.569585342648861e-06, "loss": 0.3333, "step": 3909 }, { "epoch": 4.1684434968017055, "grad_norm": 0.08647483778408228, "learning_rate": 6.55324747449722e-06, "loss": 0.3383, "step": 3910 }, { "epoch": 4.169509594882729, "grad_norm": 0.08605099292944113, "learning_rate": 6.536928134108183e-06, "loss": 0.3436, "step": 3911 }, { "epoch": 4.1705756929637525, "grad_norm": 0.1009274470047588, "learning_rate": 6.520627330521838e-06, "loss": 0.337, "step": 3912 }, { "epoch": 4.1716417910447765, "grad_norm": 0.08627645906743135, "learning_rate": 6.504345072767986e-06, "loss": 0.3391, "step": 3913 }, { "epoch": 4.1727078891258, "grad_norm": 0.08829399505823976, "learning_rate": 6.48808136986613e-06, "loss": 0.3321, "step": 3914 }, { "epoch": 4.173773987206823, "grad_norm": 0.08315647833828722, "learning_rate": 6.471836230825533e-06, "loss": 0.3386, "step": 3915 }, { "epoch": 4.174840085287847, "grad_norm": 0.08482870520076757, "learning_rate": 6.455609664645153e-06, "loss": 0.3345, "step": 3916 }, { "epoch": 4.17590618336887, "grad_norm": 0.08464445899920194, "learning_rate": 6.439401680313677e-06, "loss": 0.3321, "step": 3917 }, { "epoch": 4.176972281449894, "grad_norm": 0.10394390037142433, "learning_rate": 6.423212286809462e-06, "loss": 0.3308, "step": 3918 }, { "epoch": 4.178038379530917, "grad_norm": 0.09043061918479862, "learning_rate": 6.407041493100603e-06, "loss": 0.3375, "step": 3919 }, { "epoch": 4.17910447761194, "grad_norm": 0.08930388093603546, "learning_rate": 6.390889308144879e-06, "loss": 0.3412, "step": 3920 }, { "epoch": 4.180170575692964, "grad_norm": 0.09541571773419251, "learning_rate": 6.374755740889775e-06, "loss": 0.3381, "step": 3921 }, { "epoch": 4.181236673773987, "grad_norm": 0.09036041961820482, "learning_rate": 6.3586408002724195e-06, "loss": 0.3319, "step": 3922 }, { "epoch": 4.182302771855011, "grad_norm": 0.09106038987637816, "learning_rate": 6.342544495219671e-06, "loss": 0.3308, "step": 3923 }, { "epoch": 4.183368869936034, "grad_norm": 0.08457513382844575, "learning_rate": 6.326466834648055e-06, "loss": 0.3318, "step": 3924 }, { "epoch": 4.184434968017057, "grad_norm": 0.08404396446642012, "learning_rate": 6.310407827463736e-06, "loss": 0.3295, "step": 3925 }, { "epoch": 4.185501066098081, "grad_norm": 0.09617345450918036, "learning_rate": 6.29436748256258e-06, "loss": 0.331, "step": 3926 }, { "epoch": 4.186567164179104, "grad_norm": 0.09146742715237753, "learning_rate": 6.278345808830102e-06, "loss": 0.3346, "step": 3927 }, { "epoch": 4.187633262260128, "grad_norm": 0.08405415079436931, "learning_rate": 6.262342815141495e-06, "loss": 0.3346, "step": 3928 }, { "epoch": 4.188699360341151, "grad_norm": 0.09298813400757024, "learning_rate": 6.246358510361559e-06, "loss": 0.3318, "step": 3929 }, { "epoch": 4.189765458422174, "grad_norm": 0.08261784827703691, "learning_rate": 6.230392903344777e-06, "loss": 0.3314, "step": 3930 }, { "epoch": 4.190831556503198, "grad_norm": 0.09664506287358952, "learning_rate": 6.214446002935282e-06, "loss": 0.3275, "step": 3931 }, { "epoch": 4.1918976545842215, "grad_norm": 0.10130324537487397, "learning_rate": 6.198517817966805e-06, "loss": 0.3392, "step": 3932 }, { "epoch": 4.1929637526652455, "grad_norm": 0.0885774081244104, "learning_rate": 6.182608357262738e-06, "loss": 0.332, "step": 3933 }, { "epoch": 4.1940298507462686, "grad_norm": 0.09028658637288987, "learning_rate": 6.166717629636103e-06, "loss": 0.3273, "step": 3934 }, { "epoch": 4.1950959488272925, "grad_norm": 0.09059894957809664, "learning_rate": 6.150845643889542e-06, "loss": 0.3391, "step": 3935 }, { "epoch": 4.196162046908316, "grad_norm": 0.0947415382692891, "learning_rate": 6.1349924088152905e-06, "loss": 0.3329, "step": 3936 }, { "epoch": 4.197228144989339, "grad_norm": 0.07775785182191923, "learning_rate": 6.119157933195232e-06, "loss": 0.3337, "step": 3937 }, { "epoch": 4.198294243070363, "grad_norm": 0.09230865898141519, "learning_rate": 6.1033422258008364e-06, "loss": 0.3356, "step": 3938 }, { "epoch": 4.199360341151386, "grad_norm": 0.09075042427461232, "learning_rate": 6.087545295393198e-06, "loss": 0.3366, "step": 3939 }, { "epoch": 4.20042643923241, "grad_norm": 0.08591434886921874, "learning_rate": 6.071767150722974e-06, "loss": 0.3329, "step": 3940 }, { "epoch": 4.201492537313433, "grad_norm": 0.08354052632216036, "learning_rate": 6.056007800530444e-06, "loss": 0.337, "step": 3941 }, { "epoch": 4.202558635394456, "grad_norm": 0.09416369800009326, "learning_rate": 6.040267253545482e-06, "loss": 0.3336, "step": 3942 }, { "epoch": 4.20362473347548, "grad_norm": 0.08465467278990185, "learning_rate": 6.024545518487515e-06, "loss": 0.3357, "step": 3943 }, { "epoch": 4.204690831556503, "grad_norm": 0.08504324724039129, "learning_rate": 6.0088426040655704e-06, "loss": 0.3328, "step": 3944 }, { "epoch": 4.205756929637527, "grad_norm": 0.07972041873768856, "learning_rate": 5.993158518978255e-06, "loss": 0.3296, "step": 3945 }, { "epoch": 4.20682302771855, "grad_norm": 0.08077528320595509, "learning_rate": 5.977493271913739e-06, "loss": 0.331, "step": 3946 }, { "epoch": 4.207889125799573, "grad_norm": 0.07617274164835192, "learning_rate": 5.961846871549739e-06, "loss": 0.331, "step": 3947 }, { "epoch": 4.208955223880597, "grad_norm": 0.09002104870840356, "learning_rate": 5.946219326553557e-06, "loss": 0.3404, "step": 3948 }, { "epoch": 4.21002132196162, "grad_norm": 0.09718872414198558, "learning_rate": 5.930610645582051e-06, "loss": 0.3367, "step": 3949 }, { "epoch": 4.211087420042644, "grad_norm": 0.08759177753235749, "learning_rate": 5.915020837281602e-06, "loss": 0.3389, "step": 3950 }, { "epoch": 4.212153518123667, "grad_norm": 0.08550266499801752, "learning_rate": 5.899449910288169e-06, "loss": 0.3314, "step": 3951 }, { "epoch": 4.21321961620469, "grad_norm": 0.08610889895227287, "learning_rate": 5.883897873227216e-06, "loss": 0.3321, "step": 3952 }, { "epoch": 4.214285714285714, "grad_norm": 0.07857770022593799, "learning_rate": 5.868364734713776e-06, "loss": 0.3407, "step": 3953 }, { "epoch": 4.2153518123667375, "grad_norm": 0.10475981108457424, "learning_rate": 5.852850503352407e-06, "loss": 0.3384, "step": 3954 }, { "epoch": 4.2164179104477615, "grad_norm": 0.07804224672207336, "learning_rate": 5.837355187737172e-06, "loss": 0.3368, "step": 3955 }, { "epoch": 4.217484008528785, "grad_norm": 0.08058373335851238, "learning_rate": 5.821878796451681e-06, "loss": 0.3322, "step": 3956 }, { "epoch": 4.218550106609808, "grad_norm": 0.08404369349869568, "learning_rate": 5.806421338069053e-06, "loss": 0.337, "step": 3957 }, { "epoch": 4.219616204690832, "grad_norm": 0.08080561168777425, "learning_rate": 5.790982821151905e-06, "loss": 0.3339, "step": 3958 }, { "epoch": 4.220682302771855, "grad_norm": 0.0828839824572233, "learning_rate": 5.7755632542523744e-06, "loss": 0.3308, "step": 3959 }, { "epoch": 4.221748400852879, "grad_norm": 0.09417702718528696, "learning_rate": 5.7601626459121175e-06, "loss": 0.3337, "step": 3960 }, { "epoch": 4.222814498933902, "grad_norm": 0.08229090459967245, "learning_rate": 5.744781004662247e-06, "loss": 0.3349, "step": 3961 }, { "epoch": 4.223880597014926, "grad_norm": 0.08396218859737672, "learning_rate": 5.729418339023407e-06, "loss": 0.3415, "step": 3962 }, { "epoch": 4.224946695095949, "grad_norm": 0.08423179577148057, "learning_rate": 5.714074657505708e-06, "loss": 0.3349, "step": 3963 }, { "epoch": 4.226012793176972, "grad_norm": 0.0747196908529032, "learning_rate": 5.6987499686087695e-06, "loss": 0.3361, "step": 3964 }, { "epoch": 4.227078891257996, "grad_norm": 0.09430188688349524, "learning_rate": 5.683444280821651e-06, "loss": 0.3381, "step": 3965 }, { "epoch": 4.228144989339019, "grad_norm": 0.07893437331383663, "learning_rate": 5.668157602622914e-06, "loss": 0.333, "step": 3966 }, { "epoch": 4.229211087420043, "grad_norm": 0.07954371770687908, "learning_rate": 5.6528899424805886e-06, "loss": 0.3354, "step": 3967 }, { "epoch": 4.230277185501066, "grad_norm": 0.07863279825537235, "learning_rate": 5.637641308852169e-06, "loss": 0.3242, "step": 3968 }, { "epoch": 4.231343283582089, "grad_norm": 0.08110711920649173, "learning_rate": 5.622411710184592e-06, "loss": 0.3359, "step": 3969 }, { "epoch": 4.232409381663113, "grad_norm": 0.08076149758379426, "learning_rate": 5.607201154914275e-06, "loss": 0.3315, "step": 3970 }, { "epoch": 4.233475479744136, "grad_norm": 0.08084146195418195, "learning_rate": 5.592009651467081e-06, "loss": 0.3333, "step": 3971 }, { "epoch": 4.23454157782516, "grad_norm": 0.07585877249863301, "learning_rate": 5.5768372082582925e-06, "loss": 0.3276, "step": 3972 }, { "epoch": 4.235607675906183, "grad_norm": 0.07846952562528792, "learning_rate": 5.561683833692666e-06, "loss": 0.335, "step": 3973 }, { "epoch": 4.2366737739872065, "grad_norm": 0.08010327334926544, "learning_rate": 5.546549536164381e-06, "loss": 0.3341, "step": 3974 }, { "epoch": 4.23773987206823, "grad_norm": 0.0799875870943478, "learning_rate": 5.531434324057068e-06, "loss": 0.336, "step": 3975 }, { "epoch": 4.2388059701492535, "grad_norm": 0.08230353675191754, "learning_rate": 5.516338205743745e-06, "loss": 0.3299, "step": 3976 }, { "epoch": 4.2398720682302775, "grad_norm": 0.08053168122604346, "learning_rate": 5.501261189586889e-06, "loss": 0.3339, "step": 3977 }, { "epoch": 4.240938166311301, "grad_norm": 0.07665080765192847, "learning_rate": 5.486203283938376e-06, "loss": 0.3365, "step": 3978 }, { "epoch": 4.242004264392324, "grad_norm": 0.08493277690607401, "learning_rate": 5.471164497139523e-06, "loss": 0.3352, "step": 3979 }, { "epoch": 4.243070362473348, "grad_norm": 0.08757131176321435, "learning_rate": 5.456144837521012e-06, "loss": 0.336, "step": 3980 }, { "epoch": 4.244136460554371, "grad_norm": 0.08088307895934461, "learning_rate": 5.441144313402964e-06, "loss": 0.3352, "step": 3981 }, { "epoch": 4.245202558635395, "grad_norm": 0.10303950476433886, "learning_rate": 5.426162933094898e-06, "loss": 0.3363, "step": 3982 }, { "epoch": 4.246268656716418, "grad_norm": 0.08551651606703374, "learning_rate": 5.411200704895705e-06, "loss": 0.3419, "step": 3983 }, { "epoch": 4.247334754797441, "grad_norm": 0.08310317399531018, "learning_rate": 5.396257637093687e-06, "loss": 0.334, "step": 3984 }, { "epoch": 4.248400852878465, "grad_norm": 0.08656256571909457, "learning_rate": 5.381333737966525e-06, "loss": 0.3345, "step": 3985 }, { "epoch": 4.249466950959488, "grad_norm": 0.08701071532379277, "learning_rate": 5.3664290157813005e-06, "loss": 0.3326, "step": 3986 }, { "epoch": 4.250533049040512, "grad_norm": 0.08260458610600334, "learning_rate": 5.3515434787944295e-06, "loss": 0.3357, "step": 3987 }, { "epoch": 4.251599147121535, "grad_norm": 0.09118839401494544, "learning_rate": 5.336677135251744e-06, "loss": 0.3309, "step": 3988 }, { "epoch": 4.252665245202559, "grad_norm": 0.0833732616912321, "learning_rate": 5.321829993388421e-06, "loss": 0.3332, "step": 3989 }, { "epoch": 4.253731343283582, "grad_norm": 0.0770950611344399, "learning_rate": 5.307002061429001e-06, "loss": 0.3295, "step": 3990 }, { "epoch": 4.254797441364605, "grad_norm": 0.08654137149689174, "learning_rate": 5.292193347587389e-06, "loss": 0.3335, "step": 3991 }, { "epoch": 4.255863539445629, "grad_norm": 0.08023871846633238, "learning_rate": 5.277403860066841e-06, "loss": 0.3318, "step": 3992 }, { "epoch": 4.256929637526652, "grad_norm": 0.08681903564804878, "learning_rate": 5.262633607059982e-06, "loss": 0.3386, "step": 3993 }, { "epoch": 4.257995735607676, "grad_norm": 0.08369350417988806, "learning_rate": 5.247882596748737e-06, "loss": 0.3356, "step": 3994 }, { "epoch": 4.259061833688699, "grad_norm": 0.0911097900892705, "learning_rate": 5.233150837304415e-06, "loss": 0.3389, "step": 3995 }, { "epoch": 4.2601279317697225, "grad_norm": 0.08091682403105288, "learning_rate": 5.218438336887643e-06, "loss": 0.3418, "step": 3996 }, { "epoch": 4.2611940298507465, "grad_norm": 0.07957719556291395, "learning_rate": 5.203745103648392e-06, "loss": 0.3353, "step": 3997 }, { "epoch": 4.26226012793177, "grad_norm": 0.077328833322233, "learning_rate": 5.189071145725928e-06, "loss": 0.3383, "step": 3998 }, { "epoch": 4.2633262260127935, "grad_norm": 0.08081742747511964, "learning_rate": 5.174416471248873e-06, "loss": 0.3325, "step": 3999 }, { "epoch": 4.264392324093817, "grad_norm": 0.07861878616801701, "learning_rate": 5.159781088335161e-06, "loss": 0.3342, "step": 4000 }, { "epoch": 4.26545842217484, "grad_norm": 0.08400538210654203, "learning_rate": 5.145165005092017e-06, "loss": 0.3327, "step": 4001 }, { "epoch": 4.266524520255864, "grad_norm": 0.07589313698458251, "learning_rate": 5.130568229616004e-06, "loss": 0.3332, "step": 4002 }, { "epoch": 4.267590618336887, "grad_norm": 0.08147340980126598, "learning_rate": 5.115990769992971e-06, "loss": 0.3281, "step": 4003 }, { "epoch": 4.268656716417911, "grad_norm": 0.07835055312388887, "learning_rate": 5.101432634298089e-06, "loss": 0.3339, "step": 4004 }, { "epoch": 4.269722814498934, "grad_norm": 0.07947896584006585, "learning_rate": 5.086893830595783e-06, "loss": 0.3382, "step": 4005 }, { "epoch": 4.270788912579957, "grad_norm": 0.07841316295017421, "learning_rate": 5.07237436693981e-06, "loss": 0.333, "step": 4006 }, { "epoch": 4.271855010660981, "grad_norm": 0.09374263052005652, "learning_rate": 5.057874251373194e-06, "loss": 0.3332, "step": 4007 }, { "epoch": 4.272921108742004, "grad_norm": 0.07579864101983415, "learning_rate": 5.0433934919282525e-06, "loss": 0.3324, "step": 4008 }, { "epoch": 4.273987206823028, "grad_norm": 0.0804083559161328, "learning_rate": 5.0289320966265645e-06, "loss": 0.3325, "step": 4009 }, { "epoch": 4.275053304904051, "grad_norm": 0.08512014086820764, "learning_rate": 5.014490073478993e-06, "loss": 0.342, "step": 4010 }, { "epoch": 4.276119402985074, "grad_norm": 0.08530022661651851, "learning_rate": 5.00006743048568e-06, "loss": 0.3348, "step": 4011 }, { "epoch": 4.277185501066098, "grad_norm": 0.07751500270346133, "learning_rate": 4.985664175636e-06, "loss": 0.3377, "step": 4012 }, { "epoch": 4.278251599147121, "grad_norm": 0.075817452063534, "learning_rate": 4.97128031690862e-06, "loss": 0.3324, "step": 4013 }, { "epoch": 4.279317697228145, "grad_norm": 0.07989341231264155, "learning_rate": 4.956915862271445e-06, "loss": 0.3371, "step": 4014 }, { "epoch": 4.280383795309168, "grad_norm": 0.08374182318320397, "learning_rate": 4.942570819681649e-06, "loss": 0.3349, "step": 4015 }, { "epoch": 4.281449893390192, "grad_norm": 0.08077261991520684, "learning_rate": 4.928245197085626e-06, "loss": 0.3418, "step": 4016 }, { "epoch": 4.282515991471215, "grad_norm": 0.09142010449739497, "learning_rate": 4.913939002419028e-06, "loss": 0.3403, "step": 4017 }, { "epoch": 4.2835820895522385, "grad_norm": 0.08261620442873233, "learning_rate": 4.899652243606752e-06, "loss": 0.3342, "step": 4018 }, { "epoch": 4.2846481876332625, "grad_norm": 0.07949162404713021, "learning_rate": 4.88538492856291e-06, "loss": 0.3342, "step": 4019 }, { "epoch": 4.285714285714286, "grad_norm": 0.08029027369166614, "learning_rate": 4.871137065190854e-06, "loss": 0.3347, "step": 4020 }, { "epoch": 4.28678038379531, "grad_norm": 0.0792316332451311, "learning_rate": 4.856908661383175e-06, "loss": 0.339, "step": 4021 }, { "epoch": 4.287846481876333, "grad_norm": 0.0801437973547572, "learning_rate": 4.842699725021649e-06, "loss": 0.331, "step": 4022 }, { "epoch": 4.288912579957356, "grad_norm": 0.08488879536646476, "learning_rate": 4.828510263977295e-06, "loss": 0.3306, "step": 4023 }, { "epoch": 4.28997867803838, "grad_norm": 0.07756702190689371, "learning_rate": 4.814340286110346e-06, "loss": 0.3321, "step": 4024 }, { "epoch": 4.291044776119403, "grad_norm": 0.09257841600912252, "learning_rate": 4.800189799270221e-06, "loss": 0.3341, "step": 4025 }, { "epoch": 4.292110874200427, "grad_norm": 0.08241075259625737, "learning_rate": 4.786058811295564e-06, "loss": 0.3379, "step": 4026 }, { "epoch": 4.29317697228145, "grad_norm": 0.08900515508524041, "learning_rate": 4.771947330014195e-06, "loss": 0.3335, "step": 4027 }, { "epoch": 4.294243070362473, "grad_norm": 0.08609235279832582, "learning_rate": 4.757855363243149e-06, "loss": 0.3395, "step": 4028 }, { "epoch": 4.295309168443497, "grad_norm": 0.08284456057342021, "learning_rate": 4.743782918788653e-06, "loss": 0.3291, "step": 4029 }, { "epoch": 4.29637526652452, "grad_norm": 0.0713526073589266, "learning_rate": 4.729730004446094e-06, "loss": 0.3265, "step": 4030 }, { "epoch": 4.297441364605544, "grad_norm": 0.08416229502175335, "learning_rate": 4.715696628000057e-06, "loss": 0.3343, "step": 4031 }, { "epoch": 4.298507462686567, "grad_norm": 0.07860706489090766, "learning_rate": 4.701682797224316e-06, "loss": 0.3342, "step": 4032 }, { "epoch": 4.29957356076759, "grad_norm": 0.07625660551514413, "learning_rate": 4.687688519881799e-06, "loss": 0.3352, "step": 4033 }, { "epoch": 4.300639658848614, "grad_norm": 0.08922308300609538, "learning_rate": 4.673713803724602e-06, "loss": 0.3264, "step": 4034 }, { "epoch": 4.301705756929637, "grad_norm": 0.07964888287437034, "learning_rate": 4.659758656494e-06, "loss": 0.3347, "step": 4035 }, { "epoch": 4.302771855010661, "grad_norm": 0.08135734469936476, "learning_rate": 4.645823085920409e-06, "loss": 0.3411, "step": 4036 }, { "epoch": 4.303837953091684, "grad_norm": 0.07699749391475862, "learning_rate": 4.6319070997234315e-06, "loss": 0.3366, "step": 4037 }, { "epoch": 4.3049040511727075, "grad_norm": 0.079215440831635, "learning_rate": 4.618010705611777e-06, "loss": 0.3404, "step": 4038 }, { "epoch": 4.3059701492537314, "grad_norm": 0.08102338622902823, "learning_rate": 4.604133911283333e-06, "loss": 0.3379, "step": 4039 }, { "epoch": 4.3070362473347545, "grad_norm": 0.08630172945404184, "learning_rate": 4.590276724425136e-06, "loss": 0.3319, "step": 4040 }, { "epoch": 4.3081023454157785, "grad_norm": 0.09511994379660102, "learning_rate": 4.576439152713326e-06, "loss": 0.3315, "step": 4041 }, { "epoch": 4.309168443496802, "grad_norm": 0.08335455005241084, "learning_rate": 4.562621203813211e-06, "loss": 0.3316, "step": 4042 }, { "epoch": 4.310234541577826, "grad_norm": 0.08307933355980296, "learning_rate": 4.548822885379212e-06, "loss": 0.336, "step": 4043 }, { "epoch": 4.311300639658849, "grad_norm": 0.0843309027586609, "learning_rate": 4.535044205054893e-06, "loss": 0.3295, "step": 4044 }, { "epoch": 4.312366737739872, "grad_norm": 0.08460136640969265, "learning_rate": 4.521285170472904e-06, "loss": 0.336, "step": 4045 }, { "epoch": 4.313432835820896, "grad_norm": 0.07814369280046486, "learning_rate": 4.507545789255052e-06, "loss": 0.3332, "step": 4046 }, { "epoch": 4.314498933901919, "grad_norm": 0.0855030516487757, "learning_rate": 4.4938260690122435e-06, "loss": 0.3342, "step": 4047 }, { "epoch": 4.315565031982943, "grad_norm": 0.08812581908559658, "learning_rate": 4.480126017344471e-06, "loss": 0.3329, "step": 4048 }, { "epoch": 4.316631130063966, "grad_norm": 0.07749089456051617, "learning_rate": 4.466445641840862e-06, "loss": 0.3333, "step": 4049 }, { "epoch": 4.317697228144989, "grad_norm": 0.07815807671811865, "learning_rate": 4.45278495007964e-06, "loss": 0.3363, "step": 4050 }, { "epoch": 4.318763326226013, "grad_norm": 0.07358888097958845, "learning_rate": 4.439143949628118e-06, "loss": 0.3295, "step": 4051 }, { "epoch": 4.319829424307036, "grad_norm": 0.08431613734886675, "learning_rate": 4.425522648042684e-06, "loss": 0.3275, "step": 4052 }, { "epoch": 4.32089552238806, "grad_norm": 0.07450901993676554, "learning_rate": 4.411921052868846e-06, "loss": 0.33, "step": 4053 }, { "epoch": 4.321961620469083, "grad_norm": 0.08108642538320732, "learning_rate": 4.3983391716411775e-06, "loss": 0.3374, "step": 4054 }, { "epoch": 4.323027718550106, "grad_norm": 0.08057675359517547, "learning_rate": 4.384777011883343e-06, "loss": 0.3327, "step": 4055 }, { "epoch": 4.32409381663113, "grad_norm": 0.07748909745389584, "learning_rate": 4.371234581108059e-06, "loss": 0.3264, "step": 4056 }, { "epoch": 4.325159914712153, "grad_norm": 0.0920305873286274, "learning_rate": 4.3577118868171335e-06, "loss": 0.3326, "step": 4057 }, { "epoch": 4.326226012793177, "grad_norm": 0.07953455564884193, "learning_rate": 4.344208936501449e-06, "loss": 0.3327, "step": 4058 }, { "epoch": 4.3272921108742, "grad_norm": 0.08324725165522431, "learning_rate": 4.3307257376409155e-06, "loss": 0.3329, "step": 4059 }, { "epoch": 4.3283582089552235, "grad_norm": 0.08278692213524609, "learning_rate": 4.317262297704541e-06, "loss": 0.3374, "step": 4060 }, { "epoch": 4.3294243070362475, "grad_norm": 0.07643673415451332, "learning_rate": 4.3038186241503644e-06, "loss": 0.338, "step": 4061 }, { "epoch": 4.330490405117271, "grad_norm": 0.07581981080944176, "learning_rate": 4.290394724425495e-06, "loss": 0.3384, "step": 4062 }, { "epoch": 4.3315565031982945, "grad_norm": 0.07771968698621509, "learning_rate": 4.276990605966056e-06, "loss": 0.3337, "step": 4063 }, { "epoch": 4.332622601279318, "grad_norm": 0.07895244133624792, "learning_rate": 4.2636062761972406e-06, "loss": 0.3409, "step": 4064 }, { "epoch": 4.333688699360341, "grad_norm": 0.07797323195669634, "learning_rate": 4.2502417425332746e-06, "loss": 0.3376, "step": 4065 }, { "epoch": 4.334754797441365, "grad_norm": 0.07997960877114406, "learning_rate": 4.236897012377421e-06, "loss": 0.3371, "step": 4066 }, { "epoch": 4.335820895522388, "grad_norm": 0.07600199349205194, "learning_rate": 4.223572093121951e-06, "loss": 0.3364, "step": 4067 }, { "epoch": 4.336886993603412, "grad_norm": 0.08268817910100379, "learning_rate": 4.210266992148188e-06, "loss": 0.336, "step": 4068 }, { "epoch": 4.337953091684435, "grad_norm": 0.0745603318232261, "learning_rate": 4.196981716826471e-06, "loss": 0.3323, "step": 4069 }, { "epoch": 4.339019189765459, "grad_norm": 0.09208585615342714, "learning_rate": 4.183716274516134e-06, "loss": 0.3314, "step": 4070 }, { "epoch": 4.340085287846482, "grad_norm": 0.07711887841230759, "learning_rate": 4.170470672565557e-06, "loss": 0.3331, "step": 4071 }, { "epoch": 4.341151385927505, "grad_norm": 0.08621935992949319, "learning_rate": 4.157244918312113e-06, "loss": 0.3341, "step": 4072 }, { "epoch": 4.342217484008529, "grad_norm": 0.08226222595272614, "learning_rate": 4.144039019082184e-06, "loss": 0.333, "step": 4073 }, { "epoch": 4.343283582089552, "grad_norm": 0.07808395675180577, "learning_rate": 4.1308529821911495e-06, "loss": 0.3303, "step": 4074 }, { "epoch": 4.344349680170576, "grad_norm": 0.0769818042274672, "learning_rate": 4.117686814943382e-06, "loss": 0.3417, "step": 4075 }, { "epoch": 4.345415778251599, "grad_norm": 0.07879847363496587, "learning_rate": 4.104540524632268e-06, "loss": 0.3382, "step": 4076 }, { "epoch": 4.346481876332622, "grad_norm": 0.08293035530571763, "learning_rate": 4.091414118540158e-06, "loss": 0.3331, "step": 4077 }, { "epoch": 4.347547974413646, "grad_norm": 0.07727360738321705, "learning_rate": 4.078307603938397e-06, "loss": 0.337, "step": 4078 }, { "epoch": 4.348614072494669, "grad_norm": 0.07551576922057891, "learning_rate": 4.0652209880873214e-06, "loss": 0.328, "step": 4079 }, { "epoch": 4.349680170575693, "grad_norm": 0.07917186089359039, "learning_rate": 4.052154278236242e-06, "loss": 0.334, "step": 4080 }, { "epoch": 4.350746268656716, "grad_norm": 0.07567017656306375, "learning_rate": 4.039107481623417e-06, "loss": 0.3345, "step": 4081 }, { "epoch": 4.3518123667377395, "grad_norm": 0.07605848497463297, "learning_rate": 4.026080605476104e-06, "loss": 0.3311, "step": 4082 }, { "epoch": 4.3528784648187635, "grad_norm": 0.08119621016465227, "learning_rate": 4.013073657010518e-06, "loss": 0.334, "step": 4083 }, { "epoch": 4.353944562899787, "grad_norm": 0.07763766653584449, "learning_rate": 4.000086643431838e-06, "loss": 0.3346, "step": 4084 }, { "epoch": 4.355010660980811, "grad_norm": 0.08680938596509069, "learning_rate": 3.987119571934179e-06, "loss": 0.3379, "step": 4085 }, { "epoch": 4.356076759061834, "grad_norm": 0.07539433250460861, "learning_rate": 3.974172449700633e-06, "loss": 0.3312, "step": 4086 }, { "epoch": 4.357142857142857, "grad_norm": 0.08358711815421738, "learning_rate": 3.961245283903239e-06, "loss": 0.3346, "step": 4087 }, { "epoch": 4.358208955223881, "grad_norm": 0.07975462632947826, "learning_rate": 3.948338081702958e-06, "loss": 0.3278, "step": 4088 }, { "epoch": 4.359275053304904, "grad_norm": 0.07783915298575807, "learning_rate": 3.935450850249725e-06, "loss": 0.3356, "step": 4089 }, { "epoch": 4.360341151385928, "grad_norm": 0.07816454670420965, "learning_rate": 3.9225835966823966e-06, "loss": 0.3361, "step": 4090 }, { "epoch": 4.361407249466951, "grad_norm": 0.08148461720782253, "learning_rate": 3.909736328128748e-06, "loss": 0.3333, "step": 4091 }, { "epoch": 4.362473347547974, "grad_norm": 0.08042384846739183, "learning_rate": 3.896909051705509e-06, "loss": 0.3321, "step": 4092 }, { "epoch": 4.363539445628998, "grad_norm": 0.08319011697671452, "learning_rate": 3.884101774518327e-06, "loss": 0.3354, "step": 4093 }, { "epoch": 4.364605543710021, "grad_norm": 0.0782252777997426, "learning_rate": 3.871314503661761e-06, "loss": 0.3361, "step": 4094 }, { "epoch": 4.365671641791045, "grad_norm": 0.07065927300340862, "learning_rate": 3.858547246219293e-06, "loss": 0.3312, "step": 4095 }, { "epoch": 4.366737739872068, "grad_norm": 0.08468956849885999, "learning_rate": 3.845800009263334e-06, "loss": 0.3307, "step": 4096 }, { "epoch": 4.367803837953092, "grad_norm": 0.08901806534396679, "learning_rate": 3.833072799855173e-06, "loss": 0.3315, "step": 4097 }, { "epoch": 4.368869936034115, "grad_norm": 0.07301214410208626, "learning_rate": 3.820365625045037e-06, "loss": 0.3331, "step": 4098 }, { "epoch": 4.369936034115138, "grad_norm": 0.08198685107248344, "learning_rate": 3.8076784918720242e-06, "loss": 0.3311, "step": 4099 }, { "epoch": 4.371002132196162, "grad_norm": 0.08355479249429133, "learning_rate": 3.7950114073641573e-06, "loss": 0.3355, "step": 4100 }, { "epoch": 4.372068230277185, "grad_norm": 0.07726712718675953, "learning_rate": 3.7823643785383434e-06, "loss": 0.333, "step": 4101 }, { "epoch": 4.373134328358209, "grad_norm": 0.07279799524384821, "learning_rate": 3.7697374124003872e-06, "loss": 0.3303, "step": 4102 }, { "epoch": 4.3742004264392325, "grad_norm": 0.08528796671091622, "learning_rate": 3.757130515944951e-06, "loss": 0.3342, "step": 4103 }, { "epoch": 4.3752665245202556, "grad_norm": 0.08572585263620255, "learning_rate": 3.7445436961556135e-06, "loss": 0.3369, "step": 4104 }, { "epoch": 4.3763326226012795, "grad_norm": 0.0748149784520539, "learning_rate": 3.7319769600048237e-06, "loss": 0.3387, "step": 4105 }, { "epoch": 4.377398720682303, "grad_norm": 0.07657381538769502, "learning_rate": 3.7194303144538847e-06, "loss": 0.3327, "step": 4106 }, { "epoch": 4.378464818763327, "grad_norm": 0.0807481452009072, "learning_rate": 3.706903766452996e-06, "loss": 0.3343, "step": 4107 }, { "epoch": 4.37953091684435, "grad_norm": 0.08393196828122008, "learning_rate": 3.6943973229412124e-06, "loss": 0.3374, "step": 4108 }, { "epoch": 4.380597014925373, "grad_norm": 0.0782549467161215, "learning_rate": 3.681910990846462e-06, "loss": 0.3287, "step": 4109 }, { "epoch": 4.381663113006397, "grad_norm": 0.08070850418302111, "learning_rate": 3.669444777085507e-06, "loss": 0.337, "step": 4110 }, { "epoch": 4.38272921108742, "grad_norm": 0.07534023101241813, "learning_rate": 3.6569986885639954e-06, "loss": 0.3341, "step": 4111 }, { "epoch": 4.383795309168444, "grad_norm": 0.07838807058793515, "learning_rate": 3.6445727321764035e-06, "loss": 0.3353, "step": 4112 }, { "epoch": 4.384861407249467, "grad_norm": 0.07822860155918042, "learning_rate": 3.6321669148060833e-06, "loss": 0.3342, "step": 4113 }, { "epoch": 4.38592750533049, "grad_norm": 0.07732616158756847, "learning_rate": 3.619781243325187e-06, "loss": 0.3382, "step": 4114 }, { "epoch": 4.386993603411514, "grad_norm": 0.0767992597048576, "learning_rate": 3.6074157245947495e-06, "loss": 0.3392, "step": 4115 }, { "epoch": 4.388059701492537, "grad_norm": 0.08304386197535715, "learning_rate": 3.5950703654646303e-06, "loss": 0.3368, "step": 4116 }, { "epoch": 4.389125799573561, "grad_norm": 0.07467146074464837, "learning_rate": 3.5827451727735007e-06, "loss": 0.333, "step": 4117 }, { "epoch": 4.390191897654584, "grad_norm": 0.07659772726124718, "learning_rate": 3.5704401533488865e-06, "loss": 0.3332, "step": 4118 }, { "epoch": 4.391257995735607, "grad_norm": 0.0779901058287516, "learning_rate": 3.5581553140071256e-06, "loss": 0.3353, "step": 4119 }, { "epoch": 4.392324093816631, "grad_norm": 0.08271167127027722, "learning_rate": 3.5458906615533883e-06, "loss": 0.3325, "step": 4120 }, { "epoch": 4.393390191897654, "grad_norm": 0.07640335606088489, "learning_rate": 3.53364620278164e-06, "loss": 0.3336, "step": 4121 }, { "epoch": 4.394456289978678, "grad_norm": 0.08042193883025106, "learning_rate": 3.5214219444746856e-06, "loss": 0.3384, "step": 4122 }, { "epoch": 4.395522388059701, "grad_norm": 0.07840556030820514, "learning_rate": 3.5092178934041353e-06, "loss": 0.3386, "step": 4123 }, { "epoch": 4.396588486140725, "grad_norm": 0.08151804289922143, "learning_rate": 3.497034056330382e-06, "loss": 0.3317, "step": 4124 }, { "epoch": 4.3976545842217485, "grad_norm": 0.08168021320582781, "learning_rate": 3.4848704400026434e-06, "loss": 0.3321, "step": 4125 }, { "epoch": 4.398720682302772, "grad_norm": 0.07609160414759338, "learning_rate": 3.4727270511589396e-06, "loss": 0.3363, "step": 4126 }, { "epoch": 4.399786780383796, "grad_norm": 0.07842449906210139, "learning_rate": 3.4606038965260715e-06, "loss": 0.3281, "step": 4127 }, { "epoch": 4.400852878464819, "grad_norm": 0.07972055102539237, "learning_rate": 3.4485009828196357e-06, "loss": 0.3348, "step": 4128 }, { "epoch": 4.401918976545843, "grad_norm": 0.07239605085558472, "learning_rate": 3.4364183167440123e-06, "loss": 0.3314, "step": 4129 }, { "epoch": 4.402985074626866, "grad_norm": 0.07272119283107233, "learning_rate": 3.4243559049923803e-06, "loss": 0.3345, "step": 4130 }, { "epoch": 4.404051172707889, "grad_norm": 0.07698965215086702, "learning_rate": 3.412313754246688e-06, "loss": 0.3364, "step": 4131 }, { "epoch": 4.405117270788913, "grad_norm": 0.07835880219239048, "learning_rate": 3.400291871177652e-06, "loss": 0.3325, "step": 4132 }, { "epoch": 4.406183368869936, "grad_norm": 0.07883352671429816, "learning_rate": 3.3882902624447777e-06, "loss": 0.3302, "step": 4133 }, { "epoch": 4.40724946695096, "grad_norm": 0.07598330265933242, "learning_rate": 3.3763089346963417e-06, "loss": 0.3315, "step": 4134 }, { "epoch": 4.408315565031983, "grad_norm": 0.07233756048331239, "learning_rate": 3.3643478945693552e-06, "loss": 0.3344, "step": 4135 }, { "epoch": 4.409381663113006, "grad_norm": 0.07815412688938289, "learning_rate": 3.352407148689625e-06, "loss": 0.3355, "step": 4136 }, { "epoch": 4.41044776119403, "grad_norm": 0.08194740814774278, "learning_rate": 3.3404867036716994e-06, "loss": 0.3353, "step": 4137 }, { "epoch": 4.411513859275053, "grad_norm": 0.07605509113317131, "learning_rate": 3.328586566118901e-06, "loss": 0.3352, "step": 4138 }, { "epoch": 4.412579957356077, "grad_norm": 0.07758256960053231, "learning_rate": 3.316706742623268e-06, "loss": 0.3391, "step": 4139 }, { "epoch": 4.4136460554371, "grad_norm": 0.07526062814521468, "learning_rate": 3.3048472397656115e-06, "loss": 0.3331, "step": 4140 }, { "epoch": 4.414712153518123, "grad_norm": 0.08486437341532607, "learning_rate": 3.2930080641154816e-06, "loss": 0.3323, "step": 4141 }, { "epoch": 4.415778251599147, "grad_norm": 0.0830672341181274, "learning_rate": 3.2811892222311694e-06, "loss": 0.3319, "step": 4142 }, { "epoch": 4.41684434968017, "grad_norm": 0.07212703483795091, "learning_rate": 3.269390720659691e-06, "loss": 0.3332, "step": 4143 }, { "epoch": 4.417910447761194, "grad_norm": 0.0790520570244998, "learning_rate": 3.257612565936805e-06, "loss": 0.3359, "step": 4144 }, { "epoch": 4.418976545842217, "grad_norm": 0.07333741287228444, "learning_rate": 3.2458547645870086e-06, "loss": 0.3285, "step": 4145 }, { "epoch": 4.4200426439232405, "grad_norm": 0.07631456166587812, "learning_rate": 3.2341173231234956e-06, "loss": 0.3362, "step": 4146 }, { "epoch": 4.4211087420042645, "grad_norm": 0.07268449281095922, "learning_rate": 3.2224002480482075e-06, "loss": 0.335, "step": 4147 }, { "epoch": 4.422174840085288, "grad_norm": 0.07265611750504483, "learning_rate": 3.210703545851792e-06, "loss": 0.3323, "step": 4148 }, { "epoch": 4.423240938166312, "grad_norm": 0.07887619669716188, "learning_rate": 3.1990272230136266e-06, "loss": 0.3305, "step": 4149 }, { "epoch": 4.424307036247335, "grad_norm": 0.07492581724533613, "learning_rate": 3.187371286001768e-06, "loss": 0.3387, "step": 4150 }, { "epoch": 4.425373134328359, "grad_norm": 0.07375810082278332, "learning_rate": 3.175735741273007e-06, "loss": 0.3323, "step": 4151 }, { "epoch": 4.426439232409382, "grad_norm": 0.0756484197290268, "learning_rate": 3.164120595272837e-06, "loss": 0.3339, "step": 4152 }, { "epoch": 4.427505330490405, "grad_norm": 0.07829165152170597, "learning_rate": 3.1525258544354354e-06, "loss": 0.3358, "step": 4153 }, { "epoch": 4.428571428571429, "grad_norm": 0.07587711508081742, "learning_rate": 3.140951525183691e-06, "loss": 0.3389, "step": 4154 }, { "epoch": 4.429637526652452, "grad_norm": 0.07159911504321198, "learning_rate": 3.1293976139291814e-06, "loss": 0.3293, "step": 4155 }, { "epoch": 4.430703624733475, "grad_norm": 0.0699462370372574, "learning_rate": 3.117864127072179e-06, "loss": 0.3384, "step": 4156 }, { "epoch": 4.431769722814499, "grad_norm": 0.07194403996523205, "learning_rate": 3.106351071001621e-06, "loss": 0.3298, "step": 4157 }, { "epoch": 4.432835820895522, "grad_norm": 0.07348866543415371, "learning_rate": 3.0948584520951488e-06, "loss": 0.3327, "step": 4158 }, { "epoch": 4.433901918976546, "grad_norm": 0.07901403560235466, "learning_rate": 3.083386276719087e-06, "loss": 0.3354, "step": 4159 }, { "epoch": 4.434968017057569, "grad_norm": 0.07614682029865204, "learning_rate": 3.071934551228406e-06, "loss": 0.3399, "step": 4160 }, { "epoch": 4.436034115138593, "grad_norm": 0.07095662489839566, "learning_rate": 3.060503281966778e-06, "loss": 0.3364, "step": 4161 }, { "epoch": 4.437100213219616, "grad_norm": 0.07661517083301671, "learning_rate": 3.049092475266533e-06, "loss": 0.3407, "step": 4162 }, { "epoch": 4.438166311300639, "grad_norm": 0.07612860870888066, "learning_rate": 3.037702137448659e-06, "loss": 0.336, "step": 4163 }, { "epoch": 4.439232409381663, "grad_norm": 0.07257317111602125, "learning_rate": 3.0263322748228117e-06, "loss": 0.3347, "step": 4164 }, { "epoch": 4.440298507462686, "grad_norm": 0.06942974184000834, "learning_rate": 3.0149828936873084e-06, "loss": 0.3379, "step": 4165 }, { "epoch": 4.44136460554371, "grad_norm": 0.07367479114255061, "learning_rate": 3.003654000329115e-06, "loss": 0.3325, "step": 4166 }, { "epoch": 4.4424307036247335, "grad_norm": 0.07405338625538695, "learning_rate": 2.9923456010238426e-06, "loss": 0.3377, "step": 4167 }, { "epoch": 4.443496801705757, "grad_norm": 0.07391480199022413, "learning_rate": 2.98105770203577e-06, "loss": 0.3336, "step": 4168 }, { "epoch": 4.4445628997867805, "grad_norm": 0.0697155481700767, "learning_rate": 2.9697903096177973e-06, "loss": 0.3323, "step": 4169 }, { "epoch": 4.445628997867804, "grad_norm": 0.07479820610427203, "learning_rate": 2.9585434300114734e-06, "loss": 0.332, "step": 4170 }, { "epoch": 4.446695095948828, "grad_norm": 0.07917174679867747, "learning_rate": 2.9473170694469934e-06, "loss": 0.3346, "step": 4171 }, { "epoch": 4.447761194029851, "grad_norm": 0.07299094494689012, "learning_rate": 2.9361112341431643e-06, "loss": 0.3303, "step": 4172 }, { "epoch": 4.448827292110874, "grad_norm": 0.07367637209674983, "learning_rate": 2.924925930307447e-06, "loss": 0.3346, "step": 4173 }, { "epoch": 4.449893390191898, "grad_norm": 0.07495993556247976, "learning_rate": 2.9137611641359222e-06, "loss": 0.3321, "step": 4174 }, { "epoch": 4.450959488272921, "grad_norm": 0.07267443806340775, "learning_rate": 2.902616941813281e-06, "loss": 0.3299, "step": 4175 }, { "epoch": 4.452025586353945, "grad_norm": 0.0708925866597283, "learning_rate": 2.8914932695128393e-06, "loss": 0.3332, "step": 4176 }, { "epoch": 4.453091684434968, "grad_norm": 0.07269636700113985, "learning_rate": 2.880390153396544e-06, "loss": 0.3303, "step": 4177 }, { "epoch": 4.454157782515992, "grad_norm": 0.07332256346717313, "learning_rate": 2.86930759961495e-06, "loss": 0.3294, "step": 4178 }, { "epoch": 4.455223880597015, "grad_norm": 0.07309550795550446, "learning_rate": 2.8582456143071956e-06, "loss": 0.3318, "step": 4179 }, { "epoch": 4.456289978678038, "grad_norm": 0.07177315229711953, "learning_rate": 2.8472042036010594e-06, "loss": 0.3333, "step": 4180 }, { "epoch": 4.457356076759062, "grad_norm": 0.0747294310777951, "learning_rate": 2.8361833736129107e-06, "loss": 0.3239, "step": 4181 }, { "epoch": 4.458422174840085, "grad_norm": 0.07554108527569321, "learning_rate": 2.8251831304477108e-06, "loss": 0.3381, "step": 4182 }, { "epoch": 4.459488272921108, "grad_norm": 0.07278816202889332, "learning_rate": 2.81420348019902e-06, "loss": 0.3347, "step": 4183 }, { "epoch": 4.460554371002132, "grad_norm": 0.07323691646275544, "learning_rate": 2.8032444289490012e-06, "loss": 0.3337, "step": 4184 }, { "epoch": 4.461620469083155, "grad_norm": 0.07761545782310925, "learning_rate": 2.792305982768402e-06, "loss": 0.3331, "step": 4185 }, { "epoch": 4.462686567164179, "grad_norm": 0.0739374400916545, "learning_rate": 2.7813881477165395e-06, "loss": 0.3279, "step": 4186 }, { "epoch": 4.463752665245202, "grad_norm": 0.07242003332809858, "learning_rate": 2.7704909298413362e-06, "loss": 0.3363, "step": 4187 }, { "epoch": 4.464818763326226, "grad_norm": 0.07187298705284119, "learning_rate": 2.7596143351792837e-06, "loss": 0.3386, "step": 4188 }, { "epoch": 4.4658848614072495, "grad_norm": 0.07239042923705526, "learning_rate": 2.7487583697554555e-06, "loss": 0.3323, "step": 4189 }, { "epoch": 4.466950959488273, "grad_norm": 0.07678589531267026, "learning_rate": 2.7379230395834764e-06, "loss": 0.3329, "step": 4190 }, { "epoch": 4.468017057569297, "grad_norm": 0.07189456346112492, "learning_rate": 2.7271083506655728e-06, "loss": 0.3382, "step": 4191 }, { "epoch": 4.46908315565032, "grad_norm": 0.07515170520654751, "learning_rate": 2.716314308992516e-06, "loss": 0.3385, "step": 4192 }, { "epoch": 4.470149253731344, "grad_norm": 0.07313542483059168, "learning_rate": 2.7055409205436346e-06, "loss": 0.3316, "step": 4193 }, { "epoch": 4.471215351812367, "grad_norm": 0.07066165546162541, "learning_rate": 2.6947881912868346e-06, "loss": 0.3315, "step": 4194 }, { "epoch": 4.47228144989339, "grad_norm": 0.07272630195981485, "learning_rate": 2.6840561271785694e-06, "loss": 0.3354, "step": 4195 }, { "epoch": 4.473347547974414, "grad_norm": 0.07161497516187117, "learning_rate": 2.6733447341638472e-06, "loss": 0.3388, "step": 4196 }, { "epoch": 4.474413646055437, "grad_norm": 0.07454202797819427, "learning_rate": 2.662654018176212e-06, "loss": 0.3333, "step": 4197 }, { "epoch": 4.475479744136461, "grad_norm": 0.07508088009677, "learning_rate": 2.6519839851377737e-06, "loss": 0.3387, "step": 4198 }, { "epoch": 4.476545842217484, "grad_norm": 0.07804787962517462, "learning_rate": 2.6413346409591745e-06, "loss": 0.3317, "step": 4199 }, { "epoch": 4.477611940298507, "grad_norm": 0.07128589665687961, "learning_rate": 2.630705991539602e-06, "loss": 0.3251, "step": 4200 }, { "epoch": 4.478678038379531, "grad_norm": 0.07355996310666547, "learning_rate": 2.6200980427667635e-06, "loss": 0.3297, "step": 4201 }, { "epoch": 4.479744136460554, "grad_norm": 0.07500471190524303, "learning_rate": 2.6095108005169188e-06, "loss": 0.3292, "step": 4202 }, { "epoch": 4.480810234541578, "grad_norm": 0.07946453180732672, "learning_rate": 2.5989442706548574e-06, "loss": 0.3352, "step": 4203 }, { "epoch": 4.481876332622601, "grad_norm": 0.06970897348276822, "learning_rate": 2.5883984590338738e-06, "loss": 0.3279, "step": 4204 }, { "epoch": 4.482942430703625, "grad_norm": 0.08238106944138114, "learning_rate": 2.5778733714958027e-06, "loss": 0.3324, "step": 4205 }, { "epoch": 4.484008528784648, "grad_norm": 0.07235582940293811, "learning_rate": 2.5673690138710018e-06, "loss": 0.3365, "step": 4206 }, { "epoch": 4.485074626865671, "grad_norm": 0.07346252026606727, "learning_rate": 2.556885391978341e-06, "loss": 0.3349, "step": 4207 }, { "epoch": 4.486140724946695, "grad_norm": 0.07558966474067527, "learning_rate": 2.5464225116251886e-06, "loss": 0.3358, "step": 4208 }, { "epoch": 4.4872068230277184, "grad_norm": 0.07040408366742829, "learning_rate": 2.53598037860745e-06, "loss": 0.331, "step": 4209 }, { "epoch": 4.4882729211087415, "grad_norm": 0.07488263115221298, "learning_rate": 2.5255589987095207e-06, "loss": 0.3342, "step": 4210 }, { "epoch": 4.4893390191897655, "grad_norm": 0.07051801023243459, "learning_rate": 2.5151583777042988e-06, "loss": 0.3306, "step": 4211 }, { "epoch": 4.490405117270789, "grad_norm": 0.07182136451708207, "learning_rate": 2.5047785213531882e-06, "loss": 0.3272, "step": 4212 }, { "epoch": 4.491471215351813, "grad_norm": 0.07727816037885224, "learning_rate": 2.494419435406097e-06, "loss": 0.3354, "step": 4213 }, { "epoch": 4.492537313432836, "grad_norm": 0.07344914255236583, "learning_rate": 2.4840811256014164e-06, "loss": 0.3293, "step": 4214 }, { "epoch": 4.49360341151386, "grad_norm": 0.07192568760462288, "learning_rate": 2.4737635976660325e-06, "loss": 0.3287, "step": 4215 }, { "epoch": 4.494669509594883, "grad_norm": 0.0701357026382714, "learning_rate": 2.4634668573153154e-06, "loss": 0.3386, "step": 4216 }, { "epoch": 4.495735607675906, "grad_norm": 0.06637179767672738, "learning_rate": 2.4531909102531294e-06, "loss": 0.3325, "step": 4217 }, { "epoch": 4.49680170575693, "grad_norm": 0.07656271024622706, "learning_rate": 2.442935762171819e-06, "loss": 0.3394, "step": 4218 }, { "epoch": 4.497867803837953, "grad_norm": 0.0714666290950529, "learning_rate": 2.4327014187521948e-06, "loss": 0.3367, "step": 4219 }, { "epoch": 4.498933901918977, "grad_norm": 0.07606369364800027, "learning_rate": 2.422487885663554e-06, "loss": 0.3386, "step": 4220 }, { "epoch": 4.5, "grad_norm": 0.06844926088782544, "learning_rate": 2.4122951685636674e-06, "loss": 0.3326, "step": 4221 }, { "epoch": 4.501066098081023, "grad_norm": 0.07046377328122992, "learning_rate": 2.4021232730987622e-06, "loss": 0.3352, "step": 4222 }, { "epoch": 4.502132196162047, "grad_norm": 0.07467074933049231, "learning_rate": 2.3919722049035433e-06, "loss": 0.3334, "step": 4223 }, { "epoch": 4.50319829424307, "grad_norm": 0.07060359053074924, "learning_rate": 2.38184196960118e-06, "loss": 0.3362, "step": 4224 }, { "epoch": 4.504264392324094, "grad_norm": 0.07379325677186419, "learning_rate": 2.3717325728032935e-06, "loss": 0.3429, "step": 4225 }, { "epoch": 4.505330490405117, "grad_norm": 0.07310277967696212, "learning_rate": 2.3616440201099567e-06, "loss": 0.3388, "step": 4226 }, { "epoch": 4.50639658848614, "grad_norm": 0.07006559210490895, "learning_rate": 2.3515763171097115e-06, "loss": 0.3355, "step": 4227 }, { "epoch": 4.507462686567164, "grad_norm": 0.07133173745971073, "learning_rate": 2.341529469379551e-06, "loss": 0.3318, "step": 4228 }, { "epoch": 4.508528784648187, "grad_norm": 0.07098735264122637, "learning_rate": 2.3315034824848846e-06, "loss": 0.3346, "step": 4229 }, { "epoch": 4.509594882729211, "grad_norm": 0.0777511243612946, "learning_rate": 2.3214983619795995e-06, "loss": 0.3382, "step": 4230 }, { "epoch": 4.5106609808102345, "grad_norm": 0.06908524988843785, "learning_rate": 2.3115141134060215e-06, "loss": 0.3351, "step": 4231 }, { "epoch": 4.5117270788912585, "grad_norm": 0.07463758297713084, "learning_rate": 2.301550742294887e-06, "loss": 0.3385, "step": 4232 }, { "epoch": 4.5127931769722816, "grad_norm": 0.07271121038748618, "learning_rate": 2.2916082541653983e-06, "loss": 0.3349, "step": 4233 }, { "epoch": 4.513859275053305, "grad_norm": 0.07475883982198514, "learning_rate": 2.281686654525177e-06, "loss": 0.333, "step": 4234 }, { "epoch": 4.514925373134329, "grad_norm": 0.07660331206011552, "learning_rate": 2.2717859488702665e-06, "loss": 0.3378, "step": 4235 }, { "epoch": 4.515991471215352, "grad_norm": 0.07040590430776542, "learning_rate": 2.2619061426851463e-06, "loss": 0.3353, "step": 4236 }, { "epoch": 4.517057569296375, "grad_norm": 0.07585806379146305, "learning_rate": 2.252047241442723e-06, "loss": 0.3395, "step": 4237 }, { "epoch": 4.518123667377399, "grad_norm": 0.07561687413038395, "learning_rate": 2.2422092506043036e-06, "loss": 0.3367, "step": 4238 }, { "epoch": 4.519189765458422, "grad_norm": 0.06884029998313776, "learning_rate": 2.2323921756196263e-06, "loss": 0.336, "step": 4239 }, { "epoch": 4.520255863539446, "grad_norm": 0.0699098475783151, "learning_rate": 2.2225960219268526e-06, "loss": 0.3348, "step": 4240 }, { "epoch": 4.521321961620469, "grad_norm": 0.07422236573639096, "learning_rate": 2.212820794952526e-06, "loss": 0.3304, "step": 4241 }, { "epoch": 4.522388059701493, "grad_norm": 0.07002733474761201, "learning_rate": 2.2030665001116213e-06, "loss": 0.3297, "step": 4242 }, { "epoch": 4.523454157782516, "grad_norm": 0.07060390805754775, "learning_rate": 2.1933331428075146e-06, "loss": 0.3305, "step": 4243 }, { "epoch": 4.524520255863539, "grad_norm": 0.07131829668068262, "learning_rate": 2.1836207284319724e-06, "loss": 0.331, "step": 4244 }, { "epoch": 4.525586353944563, "grad_norm": 0.07092568911405243, "learning_rate": 2.1739292623651755e-06, "loss": 0.3324, "step": 4245 }, { "epoch": 4.526652452025586, "grad_norm": 0.07054221449199592, "learning_rate": 2.164258749975683e-06, "loss": 0.3366, "step": 4246 }, { "epoch": 4.52771855010661, "grad_norm": 0.07706442180790107, "learning_rate": 2.154609196620472e-06, "loss": 0.3371, "step": 4247 }, { "epoch": 4.528784648187633, "grad_norm": 0.07508166585212031, "learning_rate": 2.144980607644871e-06, "loss": 0.3321, "step": 4248 }, { "epoch": 4.529850746268656, "grad_norm": 0.07179015508589877, "learning_rate": 2.135372988382636e-06, "loss": 0.3326, "step": 4249 }, { "epoch": 4.53091684434968, "grad_norm": 0.07298872176356, "learning_rate": 2.1257863441558867e-06, "loss": 0.3336, "step": 4250 }, { "epoch": 4.531982942430703, "grad_norm": 0.07322352264632, "learning_rate": 2.116220680275114e-06, "loss": 0.335, "step": 4251 }, { "epoch": 4.533049040511727, "grad_norm": 0.07412360547888225, "learning_rate": 2.1066760020392075e-06, "loss": 0.333, "step": 4252 }, { "epoch": 4.5341151385927505, "grad_norm": 0.07037038219362778, "learning_rate": 2.0971523147354224e-06, "loss": 0.3345, "step": 4253 }, { "epoch": 4.535181236673774, "grad_norm": 0.07385081135383645, "learning_rate": 2.0876496236393915e-06, "loss": 0.3361, "step": 4254 }, { "epoch": 4.536247334754798, "grad_norm": 0.07977224842487596, "learning_rate": 2.0781679340151007e-06, "loss": 0.3322, "step": 4255 }, { "epoch": 4.537313432835821, "grad_norm": 0.07150280522896328, "learning_rate": 2.0687072511149207e-06, "loss": 0.3343, "step": 4256 }, { "epoch": 4.538379530916845, "grad_norm": 0.07550230379695838, "learning_rate": 2.0592675801795715e-06, "loss": 0.3289, "step": 4257 }, { "epoch": 4.539445628997868, "grad_norm": 0.07468614314295811, "learning_rate": 2.0498489264381537e-06, "loss": 0.3364, "step": 4258 }, { "epoch": 4.540511727078892, "grad_norm": 0.07766324270385316, "learning_rate": 2.040451295108099e-06, "loss": 0.3332, "step": 4259 }, { "epoch": 4.541577825159915, "grad_norm": 0.0736473092823455, "learning_rate": 2.0310746913952075e-06, "loss": 0.3307, "step": 4260 }, { "epoch": 4.542643923240938, "grad_norm": 0.0734896770686822, "learning_rate": 2.0217191204936393e-06, "loss": 0.3396, "step": 4261 }, { "epoch": 4.543710021321962, "grad_norm": 0.06952100029741023, "learning_rate": 2.012384587585885e-06, "loss": 0.3346, "step": 4262 }, { "epoch": 4.544776119402985, "grad_norm": 0.07462393218580446, "learning_rate": 2.003071097842795e-06, "loss": 0.3356, "step": 4263 }, { "epoch": 4.545842217484008, "grad_norm": 0.07095210368849357, "learning_rate": 1.993778656423557e-06, "loss": 0.3299, "step": 4264 }, { "epoch": 4.546908315565032, "grad_norm": 0.07173596162673962, "learning_rate": 1.9845072684757084e-06, "loss": 0.3331, "step": 4265 }, { "epoch": 4.547974413646055, "grad_norm": 0.07462211744625402, "learning_rate": 1.975256939135104e-06, "loss": 0.3331, "step": 4266 }, { "epoch": 4.549040511727079, "grad_norm": 0.06797661761770143, "learning_rate": 1.966027673525952e-06, "loss": 0.3349, "step": 4267 }, { "epoch": 4.550106609808102, "grad_norm": 0.06909061390055764, "learning_rate": 1.9568194767607897e-06, "loss": 0.333, "step": 4268 }, { "epoch": 4.551172707889126, "grad_norm": 0.07230285207266503, "learning_rate": 1.9476323539404697e-06, "loss": 0.336, "step": 4269 }, { "epoch": 4.552238805970149, "grad_norm": 0.06795386640951454, "learning_rate": 1.9384663101541834e-06, "loss": 0.3306, "step": 4270 }, { "epoch": 4.553304904051172, "grad_norm": 0.07185284208858062, "learning_rate": 1.9293213504794474e-06, "loss": 0.3329, "step": 4271 }, { "epoch": 4.554371002132196, "grad_norm": 0.06964066381645194, "learning_rate": 1.9201974799820976e-06, "loss": 0.3336, "step": 4272 }, { "epoch": 4.5554371002132195, "grad_norm": 0.06841885093169003, "learning_rate": 1.911094703716274e-06, "loss": 0.3373, "step": 4273 }, { "epoch": 4.556503198294243, "grad_norm": 0.07829671714309364, "learning_rate": 1.9020130267244408e-06, "loss": 0.3337, "step": 4274 }, { "epoch": 4.5575692963752665, "grad_norm": 0.07118332494914492, "learning_rate": 1.8929524540373868e-06, "loss": 0.3337, "step": 4275 }, { "epoch": 4.55863539445629, "grad_norm": 0.07251578667557201, "learning_rate": 1.8839129906741903e-06, "loss": 0.3374, "step": 4276 }, { "epoch": 4.559701492537314, "grad_norm": 0.06723960489550845, "learning_rate": 1.8748946416422464e-06, "loss": 0.3306, "step": 4277 }, { "epoch": 4.560767590618337, "grad_norm": 0.07111432053727844, "learning_rate": 1.8658974119372475e-06, "loss": 0.3356, "step": 4278 }, { "epoch": 4.561833688699361, "grad_norm": 0.06575145639594351, "learning_rate": 1.856921306543198e-06, "loss": 0.3326, "step": 4279 }, { "epoch": 4.562899786780384, "grad_norm": 0.07466010683175027, "learning_rate": 1.847966330432387e-06, "loss": 0.3331, "step": 4280 }, { "epoch": 4.563965884861407, "grad_norm": 0.07631299648143247, "learning_rate": 1.839032488565411e-06, "loss": 0.3351, "step": 4281 }, { "epoch": 4.565031982942431, "grad_norm": 0.06992148350886665, "learning_rate": 1.8301197858911512e-06, "loss": 0.3341, "step": 4282 }, { "epoch": 4.566098081023454, "grad_norm": 0.06950406008223693, "learning_rate": 1.8212282273467874e-06, "loss": 0.3326, "step": 4283 }, { "epoch": 4.567164179104478, "grad_norm": 0.0715513717486181, "learning_rate": 1.8123578178577706e-06, "loss": 0.3314, "step": 4284 }, { "epoch": 4.568230277185501, "grad_norm": 0.07759734713431235, "learning_rate": 1.8035085623378544e-06, "loss": 0.3284, "step": 4285 }, { "epoch": 4.569296375266525, "grad_norm": 0.07289404960905593, "learning_rate": 1.7946804656890648e-06, "loss": 0.3339, "step": 4286 }, { "epoch": 4.570362473347548, "grad_norm": 0.07031085270085344, "learning_rate": 1.7858735328017119e-06, "loss": 0.3352, "step": 4287 }, { "epoch": 4.571428571428571, "grad_norm": 0.0683840513250094, "learning_rate": 1.7770877685543687e-06, "loss": 0.3319, "step": 4288 }, { "epoch": 4.572494669509595, "grad_norm": 0.07276081186481145, "learning_rate": 1.768323177813902e-06, "loss": 0.3314, "step": 4289 }, { "epoch": 4.573560767590618, "grad_norm": 0.07091066982025185, "learning_rate": 1.7595797654354374e-06, "loss": 0.3327, "step": 4290 }, { "epoch": 4.574626865671641, "grad_norm": 0.07166306063065245, "learning_rate": 1.750857536262367e-06, "loss": 0.3349, "step": 4291 }, { "epoch": 4.575692963752665, "grad_norm": 0.07054961140916155, "learning_rate": 1.7421564951263547e-06, "loss": 0.332, "step": 4292 }, { "epoch": 4.576759061833688, "grad_norm": 0.07361301895145772, "learning_rate": 1.7334766468473275e-06, "loss": 0.3337, "step": 4293 }, { "epoch": 4.577825159914712, "grad_norm": 0.07045798280358322, "learning_rate": 1.7248179962334699e-06, "loss": 0.3359, "step": 4294 }, { "epoch": 4.5788912579957355, "grad_norm": 0.06774482441192077, "learning_rate": 1.7161805480812166e-06, "loss": 0.3327, "step": 4295 }, { "epoch": 4.5799573560767595, "grad_norm": 0.0718262080789473, "learning_rate": 1.7075643071752735e-06, "loss": 0.3345, "step": 4296 }, { "epoch": 4.581023454157783, "grad_norm": 0.0691615331261111, "learning_rate": 1.6989692782885914e-06, "loss": 0.332, "step": 4297 }, { "epoch": 4.582089552238806, "grad_norm": 0.0731179918926363, "learning_rate": 1.6903954661823618e-06, "loss": 0.3332, "step": 4298 }, { "epoch": 4.58315565031983, "grad_norm": 0.07078605872262074, "learning_rate": 1.6818428756060346e-06, "loss": 0.3331, "step": 4299 }, { "epoch": 4.584221748400853, "grad_norm": 0.06734701136745615, "learning_rate": 1.6733115112973042e-06, "loss": 0.3328, "step": 4300 }, { "epoch": 4.585287846481877, "grad_norm": 0.06790749484835054, "learning_rate": 1.6648013779820972e-06, "loss": 0.3385, "step": 4301 }, { "epoch": 4.5863539445629, "grad_norm": 0.07187466083950145, "learning_rate": 1.656312480374589e-06, "loss": 0.3356, "step": 4302 }, { "epoch": 4.587420042643923, "grad_norm": 0.07459252483448003, "learning_rate": 1.6478448231771914e-06, "loss": 0.333, "step": 4303 }, { "epoch": 4.588486140724947, "grad_norm": 0.0717675220591279, "learning_rate": 1.639398411080535e-06, "loss": 0.3319, "step": 4304 }, { "epoch": 4.58955223880597, "grad_norm": 0.07118074349730492, "learning_rate": 1.6309732487634989e-06, "loss": 0.3318, "step": 4305 }, { "epoch": 4.590618336886994, "grad_norm": 0.06815517038121584, "learning_rate": 1.6225693408931898e-06, "loss": 0.3395, "step": 4306 }, { "epoch": 4.591684434968017, "grad_norm": 0.06997889027085387, "learning_rate": 1.6141866921249282e-06, "loss": 0.331, "step": 4307 }, { "epoch": 4.59275053304904, "grad_norm": 0.07279040293236479, "learning_rate": 1.6058253071022711e-06, "loss": 0.3322, "step": 4308 }, { "epoch": 4.593816631130064, "grad_norm": 0.0745884036380402, "learning_rate": 1.5974851904569931e-06, "loss": 0.3353, "step": 4309 }, { "epoch": 4.594882729211087, "grad_norm": 0.06892857071091296, "learning_rate": 1.589166346809079e-06, "loss": 0.3336, "step": 4310 }, { "epoch": 4.595948827292111, "grad_norm": 0.06646283141120592, "learning_rate": 1.58086878076674e-06, "loss": 0.3355, "step": 4311 }, { "epoch": 4.597014925373134, "grad_norm": 0.06892657692261639, "learning_rate": 1.5725924969263973e-06, "loss": 0.3341, "step": 4312 }, { "epoch": 4.598081023454158, "grad_norm": 0.07247620559120913, "learning_rate": 1.5643374998726768e-06, "loss": 0.3371, "step": 4313 }, { "epoch": 4.599147121535181, "grad_norm": 0.0682871385404355, "learning_rate": 1.5561037941784184e-06, "loss": 0.337, "step": 4314 }, { "epoch": 4.600213219616204, "grad_norm": 0.07060756058234041, "learning_rate": 1.5478913844046716e-06, "loss": 0.3356, "step": 4315 }, { "epoch": 4.601279317697228, "grad_norm": 0.06829225466485105, "learning_rate": 1.5397002751006863e-06, "loss": 0.337, "step": 4316 }, { "epoch": 4.6023454157782515, "grad_norm": 0.06766530449323094, "learning_rate": 1.531530470803908e-06, "loss": 0.3321, "step": 4317 }, { "epoch": 4.603411513859275, "grad_norm": 0.0670931989643599, "learning_rate": 1.5233819760399793e-06, "loss": 0.335, "step": 4318 }, { "epoch": 4.604477611940299, "grad_norm": 0.06938288654468625, "learning_rate": 1.5152547953227515e-06, "loss": 0.3318, "step": 4319 }, { "epoch": 4.605543710021322, "grad_norm": 0.06867733948754896, "learning_rate": 1.5071489331542543e-06, "loss": 0.3314, "step": 4320 }, { "epoch": 4.606609808102346, "grad_norm": 0.07777991942380198, "learning_rate": 1.499064394024714e-06, "loss": 0.3385, "step": 4321 }, { "epoch": 4.607675906183369, "grad_norm": 0.06967592936560713, "learning_rate": 1.4910011824125436e-06, "loss": 0.3323, "step": 4322 }, { "epoch": 4.608742004264393, "grad_norm": 0.06751157296924212, "learning_rate": 1.482959302784357e-06, "loss": 0.3393, "step": 4323 }, { "epoch": 4.609808102345416, "grad_norm": 0.06979920693263406, "learning_rate": 1.4749387595949195e-06, "loss": 0.336, "step": 4324 }, { "epoch": 4.610874200426439, "grad_norm": 0.07053350912123188, "learning_rate": 1.4669395572872015e-06, "loss": 0.3298, "step": 4325 }, { "epoch": 4.611940298507463, "grad_norm": 0.07371928778646851, "learning_rate": 1.4589617002923516e-06, "loss": 0.3342, "step": 4326 }, { "epoch": 4.613006396588486, "grad_norm": 0.07223172335830214, "learning_rate": 1.4510051930296799e-06, "loss": 0.3321, "step": 4327 }, { "epoch": 4.61407249466951, "grad_norm": 0.066958488669665, "learning_rate": 1.4430700399066821e-06, "loss": 0.3312, "step": 4328 }, { "epoch": 4.615138592750533, "grad_norm": 0.07053436931086877, "learning_rate": 1.435156245319016e-06, "loss": 0.3372, "step": 4329 }, { "epoch": 4.616204690831556, "grad_norm": 0.06968744357208878, "learning_rate": 1.4272638136505257e-06, "loss": 0.3368, "step": 4330 }, { "epoch": 4.61727078891258, "grad_norm": 0.06859881353886961, "learning_rate": 1.4193927492731897e-06, "loss": 0.3277, "step": 4331 }, { "epoch": 4.618336886993603, "grad_norm": 0.07231488171540013, "learning_rate": 1.4115430565471776e-06, "loss": 0.3321, "step": 4332 }, { "epoch": 4.619402985074627, "grad_norm": 0.0666791642191909, "learning_rate": 1.4037147398208118e-06, "loss": 0.3371, "step": 4333 }, { "epoch": 4.62046908315565, "grad_norm": 0.0776009579849058, "learning_rate": 1.3959078034305785e-06, "loss": 0.3347, "step": 4334 }, { "epoch": 4.621535181236673, "grad_norm": 0.06614100139929324, "learning_rate": 1.3881222517010983e-06, "loss": 0.3376, "step": 4335 }, { "epoch": 4.622601279317697, "grad_norm": 0.06757259397296564, "learning_rate": 1.3803580889451795e-06, "loss": 0.3353, "step": 4336 }, { "epoch": 4.6236673773987205, "grad_norm": 0.07131647416320316, "learning_rate": 1.3726153194637548e-06, "loss": 0.3441, "step": 4337 }, { "epoch": 4.6247334754797444, "grad_norm": 0.06932530118685151, "learning_rate": 1.3648939475459178e-06, "loss": 0.329, "step": 4338 }, { "epoch": 4.6257995735607675, "grad_norm": 0.07255409329631657, "learning_rate": 1.3571939774689091e-06, "loss": 0.3288, "step": 4339 }, { "epoch": 4.6268656716417915, "grad_norm": 0.06673195482515673, "learning_rate": 1.3495154134981126e-06, "loss": 0.3354, "step": 4340 }, { "epoch": 4.627931769722815, "grad_norm": 0.06572556100183624, "learning_rate": 1.34185825988705e-06, "loss": 0.3329, "step": 4341 }, { "epoch": 4.628997867803838, "grad_norm": 0.06490987870262467, "learning_rate": 1.3342225208773906e-06, "loss": 0.3354, "step": 4342 }, { "epoch": 4.630063965884862, "grad_norm": 0.07309588285106411, "learning_rate": 1.3266082006989333e-06, "loss": 0.331, "step": 4343 }, { "epoch": 4.631130063965885, "grad_norm": 0.07038414592289387, "learning_rate": 1.3190153035696196e-06, "loss": 0.3435, "step": 4344 }, { "epoch": 4.632196162046908, "grad_norm": 0.06928635687457206, "learning_rate": 1.311443833695516e-06, "loss": 0.3364, "step": 4345 }, { "epoch": 4.633262260127932, "grad_norm": 0.06537020208079272, "learning_rate": 1.303893795270823e-06, "loss": 0.3316, "step": 4346 }, { "epoch": 4.634328358208955, "grad_norm": 0.06896350457838996, "learning_rate": 1.296365192477871e-06, "loss": 0.3367, "step": 4347 }, { "epoch": 4.635394456289979, "grad_norm": 0.06599969292032785, "learning_rate": 1.2888580294871233e-06, "loss": 0.3346, "step": 4348 }, { "epoch": 4.636460554371002, "grad_norm": 0.06925949481192152, "learning_rate": 1.281372310457143e-06, "loss": 0.3349, "step": 4349 }, { "epoch": 4.637526652452026, "grad_norm": 0.07032567037244174, "learning_rate": 1.2739080395346347e-06, "loss": 0.3324, "step": 4350 }, { "epoch": 4.638592750533049, "grad_norm": 0.06808103931489896, "learning_rate": 1.2664652208544205e-06, "loss": 0.3364, "step": 4351 }, { "epoch": 4.639658848614072, "grad_norm": 0.07415952469477234, "learning_rate": 1.2590438585394372e-06, "loss": 0.3368, "step": 4352 }, { "epoch": 4.640724946695096, "grad_norm": 0.07472723302057231, "learning_rate": 1.2516439567007254e-06, "loss": 0.3312, "step": 4353 }, { "epoch": 4.641791044776119, "grad_norm": 0.06723931963836592, "learning_rate": 1.2442655194374464e-06, "loss": 0.3363, "step": 4354 }, { "epoch": 4.642857142857143, "grad_norm": 0.06731787952306181, "learning_rate": 1.2369085508368862e-06, "loss": 0.3384, "step": 4355 }, { "epoch": 4.643923240938166, "grad_norm": 0.06995325462163068, "learning_rate": 1.2295730549744023e-06, "loss": 0.3331, "step": 4356 }, { "epoch": 4.644989339019189, "grad_norm": 0.07264991945604682, "learning_rate": 1.2222590359134868e-06, "loss": 0.3335, "step": 4357 }, { "epoch": 4.646055437100213, "grad_norm": 0.0664979493499962, "learning_rate": 1.2149664977057296e-06, "loss": 0.331, "step": 4358 }, { "epoch": 4.6471215351812365, "grad_norm": 0.06852799522357257, "learning_rate": 1.2076954443908152e-06, "loss": 0.3342, "step": 4359 }, { "epoch": 4.6481876332622605, "grad_norm": 0.0700385586316551, "learning_rate": 1.20044587999653e-06, "loss": 0.3324, "step": 4360 }, { "epoch": 4.649253731343284, "grad_norm": 0.06744640423267867, "learning_rate": 1.1932178085387514e-06, "loss": 0.3336, "step": 4361 }, { "epoch": 4.650319829424307, "grad_norm": 0.07133474365028102, "learning_rate": 1.1860112340214624e-06, "loss": 0.3357, "step": 4362 }, { "epoch": 4.651385927505331, "grad_norm": 0.06780843638739921, "learning_rate": 1.178826160436728e-06, "loss": 0.3339, "step": 4363 }, { "epoch": 4.652452025586354, "grad_norm": 0.06642970597324976, "learning_rate": 1.1716625917647018e-06, "loss": 0.3341, "step": 4364 }, { "epoch": 4.653518123667378, "grad_norm": 0.07245838112455406, "learning_rate": 1.1645205319736318e-06, "loss": 0.3345, "step": 4365 }, { "epoch": 4.654584221748401, "grad_norm": 0.07452837382246935, "learning_rate": 1.1573999850198515e-06, "loss": 0.3311, "step": 4366 }, { "epoch": 4.655650319829425, "grad_norm": 0.07068223858842627, "learning_rate": 1.1503009548477695e-06, "loss": 0.3382, "step": 4367 }, { "epoch": 4.656716417910448, "grad_norm": 0.06557927350005839, "learning_rate": 1.143223445389876e-06, "loss": 0.3304, "step": 4368 }, { "epoch": 4.657782515991471, "grad_norm": 0.0663596931630506, "learning_rate": 1.1361674605667505e-06, "loss": 0.3306, "step": 4369 }, { "epoch": 4.658848614072495, "grad_norm": 0.06703039652535904, "learning_rate": 1.1291330042870396e-06, "loss": 0.3375, "step": 4370 }, { "epoch": 4.659914712153518, "grad_norm": 0.06939523423465807, "learning_rate": 1.122120080447462e-06, "loss": 0.3369, "step": 4371 }, { "epoch": 4.660980810234541, "grad_norm": 0.07088459747200242, "learning_rate": 1.115128692932821e-06, "loss": 0.3397, "step": 4372 }, { "epoch": 4.662046908315565, "grad_norm": 0.07022817101518151, "learning_rate": 1.1081588456159786e-06, "loss": 0.3359, "step": 4373 }, { "epoch": 4.663113006396588, "grad_norm": 0.07123058169602929, "learning_rate": 1.10121054235786e-06, "loss": 0.3333, "step": 4374 }, { "epoch": 4.664179104477612, "grad_norm": 0.0675246173798063, "learning_rate": 1.0942837870074795e-06, "loss": 0.3333, "step": 4375 }, { "epoch": 4.665245202558635, "grad_norm": 0.06471932900262051, "learning_rate": 1.087378583401888e-06, "loss": 0.3321, "step": 4376 }, { "epoch": 4.666311300639659, "grad_norm": 0.06686992898419163, "learning_rate": 1.080494935366212e-06, "loss": 0.3331, "step": 4377 }, { "epoch": 4.667377398720682, "grad_norm": 0.06937875917999614, "learning_rate": 1.073632846713637e-06, "loss": 0.3362, "step": 4378 }, { "epoch": 4.6684434968017055, "grad_norm": 0.06987137809858991, "learning_rate": 1.0667923212454023e-06, "loss": 0.3368, "step": 4379 }, { "epoch": 4.669509594882729, "grad_norm": 0.06867240275257108, "learning_rate": 1.0599733627508013e-06, "loss": 0.3346, "step": 4380 }, { "epoch": 4.6705756929637525, "grad_norm": 0.06826231476314719, "learning_rate": 1.0531759750071857e-06, "loss": 0.3313, "step": 4381 }, { "epoch": 4.6716417910447765, "grad_norm": 0.06977322414302481, "learning_rate": 1.0464001617799525e-06, "loss": 0.3349, "step": 4382 }, { "epoch": 4.6727078891258, "grad_norm": 0.07065917384047074, "learning_rate": 1.0396459268225523e-06, "loss": 0.3303, "step": 4383 }, { "epoch": 4.673773987206823, "grad_norm": 0.06723899201810622, "learning_rate": 1.0329132738764814e-06, "loss": 0.3376, "step": 4384 }, { "epoch": 4.674840085287847, "grad_norm": 0.06708033630230664, "learning_rate": 1.0262022066712763e-06, "loss": 0.3368, "step": 4385 }, { "epoch": 4.67590618336887, "grad_norm": 0.06703109426648023, "learning_rate": 1.0195127289245188e-06, "loss": 0.3305, "step": 4386 }, { "epoch": 4.676972281449894, "grad_norm": 0.07014172973051326, "learning_rate": 1.0128448443418315e-06, "loss": 0.3354, "step": 4387 }, { "epoch": 4.678038379530917, "grad_norm": 0.07076031888009388, "learning_rate": 1.0061985566168863e-06, "loss": 0.3393, "step": 4388 }, { "epoch": 4.67910447761194, "grad_norm": 0.06971401068803788, "learning_rate": 9.99573869431365e-07, "loss": 0.3405, "step": 4389 }, { "epoch": 4.680170575692964, "grad_norm": 0.06595206438851949, "learning_rate": 9.92970786455012e-07, "loss": 0.3291, "step": 4390 }, { "epoch": 4.681236673773987, "grad_norm": 0.07143631976414835, "learning_rate": 9.863893113455857e-07, "loss": 0.3294, "step": 4391 }, { "epoch": 4.682302771855011, "grad_norm": 0.06704484678485922, "learning_rate": 9.798294477488901e-07, "loss": 0.3355, "step": 4392 }, { "epoch": 4.683368869936034, "grad_norm": 0.06626390001420994, "learning_rate": 9.732911992987382e-07, "loss": 0.3352, "step": 4393 }, { "epoch": 4.684434968017058, "grad_norm": 0.06399841711925291, "learning_rate": 9.667745696169839e-07, "loss": 0.3365, "step": 4394 }, { "epoch": 4.685501066098081, "grad_norm": 0.06442430133602112, "learning_rate": 9.602795623135042e-07, "loss": 0.3306, "step": 4395 }, { "epoch": 4.686567164179104, "grad_norm": 0.06647357574669362, "learning_rate": 9.5380618098619e-07, "loss": 0.3311, "step": 4396 }, { "epoch": 4.687633262260128, "grad_norm": 0.06493583228185477, "learning_rate": 9.473544292209591e-07, "loss": 0.327, "step": 4397 }, { "epoch": 4.688699360341151, "grad_norm": 0.06666580308482166, "learning_rate": 9.409243105917532e-07, "loss": 0.334, "step": 4398 }, { "epoch": 4.689765458422174, "grad_norm": 0.06921334075814284, "learning_rate": 9.345158286605182e-07, "loss": 0.3363, "step": 4399 }, { "epoch": 4.690831556503198, "grad_norm": 0.06673171229849625, "learning_rate": 9.281289869772192e-07, "loss": 0.328, "step": 4400 }, { "epoch": 4.6918976545842215, "grad_norm": 0.06584271320243143, "learning_rate": 9.217637890798348e-07, "loss": 0.3394, "step": 4401 }, { "epoch": 4.6929637526652455, "grad_norm": 0.0671007285287849, "learning_rate": 9.154202384943622e-07, "loss": 0.3318, "step": 4402 }, { "epoch": 4.6940298507462686, "grad_norm": 0.06331709951463382, "learning_rate": 9.090983387347863e-07, "loss": 0.3347, "step": 4403 }, { "epoch": 4.6950959488272925, "grad_norm": 0.06898127420033802, "learning_rate": 9.027980933031188e-07, "loss": 0.3374, "step": 4404 }, { "epoch": 4.696162046908316, "grad_norm": 0.07051509740595827, "learning_rate": 8.965195056893638e-07, "loss": 0.3351, "step": 4405 }, { "epoch": 4.697228144989339, "grad_norm": 0.06642503187647329, "learning_rate": 8.902625793715391e-07, "loss": 0.3289, "step": 4406 }, { "epoch": 4.698294243070363, "grad_norm": 0.062149770848458644, "learning_rate": 8.8402731781565e-07, "loss": 0.3321, "step": 4407 }, { "epoch": 4.699360341151386, "grad_norm": 0.06584174207599507, "learning_rate": 8.778137244757068e-07, "loss": 0.3303, "step": 4408 }, { "epoch": 4.70042643923241, "grad_norm": 0.06691401972973397, "learning_rate": 8.716218027937251e-07, "loss": 0.3305, "step": 4409 }, { "epoch": 4.701492537313433, "grad_norm": 0.0690126967878217, "learning_rate": 8.654515561997034e-07, "loss": 0.3354, "step": 4410 }, { "epoch": 4.702558635394456, "grad_norm": 0.06740746680029383, "learning_rate": 8.593029881116322e-07, "loss": 0.3242, "step": 4411 }, { "epoch": 4.70362473347548, "grad_norm": 0.06898874376741002, "learning_rate": 8.531761019355067e-07, "loss": 0.3387, "step": 4412 }, { "epoch": 4.704690831556503, "grad_norm": 0.06501918561728087, "learning_rate": 8.470709010653011e-07, "loss": 0.3329, "step": 4413 }, { "epoch": 4.705756929637527, "grad_norm": 0.06601690952715261, "learning_rate": 8.409873888829768e-07, "loss": 0.3378, "step": 4414 }, { "epoch": 4.70682302771855, "grad_norm": 0.06584452917029096, "learning_rate": 8.349255687584867e-07, "loss": 0.3418, "step": 4415 }, { "epoch": 4.707889125799573, "grad_norm": 0.07132046643473576, "learning_rate": 8.288854440497629e-07, "loss": 0.3356, "step": 4416 }, { "epoch": 4.708955223880597, "grad_norm": 0.06688057560968289, "learning_rate": 8.228670181027199e-07, "loss": 0.3406, "step": 4417 }, { "epoch": 4.71002132196162, "grad_norm": 0.06467424384256552, "learning_rate": 8.168702942512507e-07, "loss": 0.3404, "step": 4418 }, { "epoch": 4.711087420042644, "grad_norm": 0.06658144857820635, "learning_rate": 8.108952758172317e-07, "loss": 0.3373, "step": 4419 }, { "epoch": 4.712153518123667, "grad_norm": 0.06579984809073838, "learning_rate": 8.049419661105129e-07, "loss": 0.3327, "step": 4420 }, { "epoch": 4.713219616204691, "grad_norm": 0.06783834549966072, "learning_rate": 7.99010368428923e-07, "loss": 0.3363, "step": 4421 }, { "epoch": 4.714285714285714, "grad_norm": 0.065981531581556, "learning_rate": 7.93100486058247e-07, "loss": 0.3362, "step": 4422 }, { "epoch": 4.7153518123667375, "grad_norm": 0.066232413118734, "learning_rate": 7.872123222722617e-07, "loss": 0.3285, "step": 4423 }, { "epoch": 4.7164179104477615, "grad_norm": 0.06404021502472225, "learning_rate": 7.813458803327001e-07, "loss": 0.333, "step": 4424 }, { "epoch": 4.717484008528785, "grad_norm": 0.06489096407537417, "learning_rate": 7.755011634892695e-07, "loss": 0.3336, "step": 4425 }, { "epoch": 4.718550106609808, "grad_norm": 0.06614640728593563, "learning_rate": 7.696781749796333e-07, "loss": 0.3326, "step": 4426 }, { "epoch": 4.719616204690832, "grad_norm": 0.06662313517511247, "learning_rate": 7.638769180294292e-07, "loss": 0.3354, "step": 4427 }, { "epoch": 4.720682302771855, "grad_norm": 0.06630182982279928, "learning_rate": 7.580973958522553e-07, "loss": 0.3274, "step": 4428 }, { "epoch": 4.721748400852879, "grad_norm": 0.06231304613251542, "learning_rate": 7.523396116496573e-07, "loss": 0.3351, "step": 4429 }, { "epoch": 4.722814498933902, "grad_norm": 0.06561797780372054, "learning_rate": 7.46603568611155e-07, "loss": 0.3303, "step": 4430 }, { "epoch": 4.723880597014926, "grad_norm": 0.06606066948383217, "learning_rate": 7.408892699142156e-07, "loss": 0.3337, "step": 4431 }, { "epoch": 4.724946695095949, "grad_norm": 0.06532372136865669, "learning_rate": 7.35196718724267e-07, "loss": 0.3361, "step": 4432 }, { "epoch": 4.726012793176972, "grad_norm": 0.07451746009716578, "learning_rate": 7.295259181946801e-07, "loss": 0.3353, "step": 4433 }, { "epoch": 4.727078891257996, "grad_norm": 0.06275057701061912, "learning_rate": 7.23876871466791e-07, "loss": 0.3384, "step": 4434 }, { "epoch": 4.728144989339019, "grad_norm": 0.06554740340061337, "learning_rate": 7.182495816698787e-07, "loss": 0.3302, "step": 4435 }, { "epoch": 4.729211087420042, "grad_norm": 0.0690906174501306, "learning_rate": 7.126440519211608e-07, "loss": 0.3378, "step": 4436 }, { "epoch": 4.730277185501066, "grad_norm": 0.06708686099386059, "learning_rate": 7.070602853258112e-07, "loss": 0.3358, "step": 4437 }, { "epoch": 4.731343283582089, "grad_norm": 0.06891228927845029, "learning_rate": 7.014982849769558e-07, "loss": 0.3354, "step": 4438 }, { "epoch": 4.732409381663113, "grad_norm": 0.06537602757352802, "learning_rate": 6.95958053955641e-07, "loss": 0.3371, "step": 4439 }, { "epoch": 4.733475479744136, "grad_norm": 0.06604287763417778, "learning_rate": 6.904395953308784e-07, "loss": 0.328, "step": 4440 }, { "epoch": 4.73454157782516, "grad_norm": 0.06405394013935223, "learning_rate": 6.849429121596007e-07, "loss": 0.3346, "step": 4441 }, { "epoch": 4.735607675906183, "grad_norm": 0.06528070970380216, "learning_rate": 6.794680074866833e-07, "loss": 0.3419, "step": 4442 }, { "epoch": 4.7366737739872065, "grad_norm": 0.06512815748529073, "learning_rate": 6.740148843449401e-07, "loss": 0.3331, "step": 4443 }, { "epoch": 4.73773987206823, "grad_norm": 0.06646695491242108, "learning_rate": 6.685835457551281e-07, "loss": 0.3346, "step": 4444 }, { "epoch": 4.7388059701492535, "grad_norm": 0.0672561173174234, "learning_rate": 6.631739947259075e-07, "loss": 0.3368, "step": 4445 }, { "epoch": 4.7398720682302775, "grad_norm": 0.06724341190669111, "learning_rate": 6.577862342539032e-07, "loss": 0.3291, "step": 4446 }, { "epoch": 4.740938166311301, "grad_norm": 0.06595018864181441, "learning_rate": 6.524202673236524e-07, "loss": 0.3335, "step": 4447 }, { "epoch": 4.742004264392325, "grad_norm": 0.06439992810416004, "learning_rate": 6.470760969076173e-07, "loss": 0.3371, "step": 4448 }, { "epoch": 4.743070362473348, "grad_norm": 0.06515103944575167, "learning_rate": 6.417537259661899e-07, "loss": 0.3392, "step": 4449 }, { "epoch": 4.744136460554371, "grad_norm": 0.06518098968521178, "learning_rate": 6.364531574476962e-07, "loss": 0.3263, "step": 4450 }, { "epoch": 4.745202558635395, "grad_norm": 0.06553602384394039, "learning_rate": 6.311743942883652e-07, "loss": 0.3319, "step": 4451 }, { "epoch": 4.746268656716418, "grad_norm": 0.06646799447298098, "learning_rate": 6.259174394123602e-07, "loss": 0.3287, "step": 4452 }, { "epoch": 4.747334754797441, "grad_norm": 0.06614233529232587, "learning_rate": 6.206822957317693e-07, "loss": 0.3309, "step": 4453 }, { "epoch": 4.748400852878465, "grad_norm": 0.06540305031790555, "learning_rate": 6.154689661465752e-07, "loss": 0.3389, "step": 4454 }, { "epoch": 4.749466950959488, "grad_norm": 0.06512815324295504, "learning_rate": 6.102774535447031e-07, "loss": 0.3341, "step": 4455 }, { "epoch": 4.750533049040512, "grad_norm": 0.06377242318608925, "learning_rate": 6.051077608019773e-07, "loss": 0.3335, "step": 4456 }, { "epoch": 4.751599147121535, "grad_norm": 0.06507854548241519, "learning_rate": 5.99959890782138e-07, "loss": 0.3292, "step": 4457 }, { "epoch": 4.752665245202559, "grad_norm": 0.06569932677501229, "learning_rate": 5.948338463368419e-07, "loss": 0.3407, "step": 4458 }, { "epoch": 4.753731343283582, "grad_norm": 0.06613270042766406, "learning_rate": 5.897296303056444e-07, "loss": 0.3418, "step": 4459 }, { "epoch": 4.754797441364605, "grad_norm": 0.06517187632853641, "learning_rate": 5.846472455160213e-07, "loss": 0.3361, "step": 4460 }, { "epoch": 4.755863539445629, "grad_norm": 0.06689709594911152, "learning_rate": 5.795866947833472e-07, "loss": 0.3365, "step": 4461 }, { "epoch": 4.756929637526652, "grad_norm": 0.06666505765328634, "learning_rate": 5.745479809109045e-07, "loss": 0.3306, "step": 4462 }, { "epoch": 4.757995735607675, "grad_norm": 0.06646312328818217, "learning_rate": 5.695311066898779e-07, "loss": 0.3335, "step": 4463 }, { "epoch": 4.759061833688699, "grad_norm": 0.06785871586907363, "learning_rate": 5.6453607489936e-07, "loss": 0.3364, "step": 4464 }, { "epoch": 4.7601279317697225, "grad_norm": 0.06542533321108031, "learning_rate": 5.595628883063331e-07, "loss": 0.3355, "step": 4465 }, { "epoch": 4.7611940298507465, "grad_norm": 0.06297758066267968, "learning_rate": 5.546115496656867e-07, "loss": 0.3383, "step": 4466 }, { "epoch": 4.76226012793177, "grad_norm": 0.065278353186685, "learning_rate": 5.496820617202047e-07, "loss": 0.3349, "step": 4467 }, { "epoch": 4.7633262260127935, "grad_norm": 0.06538720202179056, "learning_rate": 5.447744272005695e-07, "loss": 0.3331, "step": 4468 }, { "epoch": 4.764392324093817, "grad_norm": 0.06718790884542368, "learning_rate": 5.398886488253485e-07, "loss": 0.3379, "step": 4469 }, { "epoch": 4.76545842217484, "grad_norm": 0.06449471524894189, "learning_rate": 5.350247293010169e-07, "loss": 0.3349, "step": 4470 }, { "epoch": 4.766524520255864, "grad_norm": 0.06433921241405373, "learning_rate": 5.301826713219305e-07, "loss": 0.3319, "step": 4471 }, { "epoch": 4.767590618336887, "grad_norm": 0.06817594418030926, "learning_rate": 5.253624775703347e-07, "loss": 0.3361, "step": 4472 }, { "epoch": 4.768656716417911, "grad_norm": 0.06600839287904284, "learning_rate": 5.205641507163694e-07, "loss": 0.3343, "step": 4473 }, { "epoch": 4.769722814498934, "grad_norm": 0.06616379620806581, "learning_rate": 5.157876934180551e-07, "loss": 0.3318, "step": 4474 }, { "epoch": 4.770788912579958, "grad_norm": 0.06606131200753886, "learning_rate": 5.110331083213105e-07, "loss": 0.3376, "step": 4475 }, { "epoch": 4.771855010660981, "grad_norm": 0.06329917860131302, "learning_rate": 5.063003980599179e-07, "loss": 0.3366, "step": 4476 }, { "epoch": 4.772921108742004, "grad_norm": 0.06481792763454347, "learning_rate": 5.01589565255558e-07, "loss": 0.3394, "step": 4477 }, { "epoch": 4.773987206823028, "grad_norm": 0.0687947416522106, "learning_rate": 4.969006125177834e-07, "loss": 0.3368, "step": 4478 }, { "epoch": 4.775053304904051, "grad_norm": 0.06404270869284907, "learning_rate": 4.922335424440361e-07, "loss": 0.3323, "step": 4479 }, { "epoch": 4.776119402985074, "grad_norm": 0.06757244081482576, "learning_rate": 4.875883576196261e-07, "loss": 0.3367, "step": 4480 }, { "epoch": 4.777185501066098, "grad_norm": 0.06305226854333544, "learning_rate": 4.829650606177438e-07, "loss": 0.3345, "step": 4481 }, { "epoch": 4.778251599147121, "grad_norm": 0.06783144251481625, "learning_rate": 4.783636539994607e-07, "loss": 0.3381, "step": 4482 }, { "epoch": 4.779317697228145, "grad_norm": 0.07112258236166988, "learning_rate": 4.737841403137067e-07, "loss": 0.3319, "step": 4483 }, { "epoch": 4.780383795309168, "grad_norm": 0.06778618377309222, "learning_rate": 4.692265220973058e-07, "loss": 0.3368, "step": 4484 }, { "epoch": 4.781449893390192, "grad_norm": 0.061819307755361846, "learning_rate": 4.6469080187493634e-07, "loss": 0.3305, "step": 4485 }, { "epoch": 4.782515991471215, "grad_norm": 0.06472343334221725, "learning_rate": 4.601769821591529e-07, "loss": 0.3301, "step": 4486 }, { "epoch": 4.7835820895522385, "grad_norm": 0.06769820854581794, "learning_rate": 4.5568506545037305e-07, "loss": 0.3363, "step": 4487 }, { "epoch": 4.7846481876332625, "grad_norm": 0.06599930627838296, "learning_rate": 4.512150542368909e-07, "loss": 0.3344, "step": 4488 }, { "epoch": 4.785714285714286, "grad_norm": 0.06669160582080108, "learning_rate": 4.467669509948591e-07, "loss": 0.335, "step": 4489 }, { "epoch": 4.786780383795309, "grad_norm": 0.06657029869145847, "learning_rate": 4.423407581882932e-07, "loss": 0.3318, "step": 4490 }, { "epoch": 4.787846481876333, "grad_norm": 0.06875440174953255, "learning_rate": 4.3793647826907203e-07, "loss": 0.3301, "step": 4491 }, { "epoch": 4.788912579957356, "grad_norm": 0.06843506786311358, "learning_rate": 4.3355411367694164e-07, "loss": 0.3377, "step": 4492 }, { "epoch": 4.78997867803838, "grad_norm": 0.06459340857709406, "learning_rate": 4.2919366683951135e-07, "loss": 0.3336, "step": 4493 }, { "epoch": 4.791044776119403, "grad_norm": 0.06386692441739224, "learning_rate": 4.2485514017222674e-07, "loss": 0.333, "step": 4494 }, { "epoch": 4.792110874200427, "grad_norm": 0.06649809538469802, "learning_rate": 4.205385360784142e-07, "loss": 0.337, "step": 4495 }, { "epoch": 4.79317697228145, "grad_norm": 0.06494463953807365, "learning_rate": 4.162438569492455e-07, "loss": 0.3381, "step": 4496 }, { "epoch": 4.794243070362473, "grad_norm": 0.06443760229121664, "learning_rate": 4.119711051637554e-07, "loss": 0.331, "step": 4497 }, { "epoch": 4.795309168443497, "grad_norm": 0.06694814400064329, "learning_rate": 4.077202830888238e-07, "loss": 0.3366, "step": 4498 }, { "epoch": 4.79637526652452, "grad_norm": 0.061826241061163335, "learning_rate": 4.0349139307918063e-07, "loss": 0.3229, "step": 4499 }, { "epoch": 4.797441364605544, "grad_norm": 0.06413189534977501, "learning_rate": 3.992844374774141e-07, "loss": 0.3357, "step": 4500 }, { "epoch": 4.798507462686567, "grad_norm": 0.06301453498466782, "learning_rate": 3.950994186139623e-07, "loss": 0.3262, "step": 4501 }, { "epoch": 4.79957356076759, "grad_norm": 0.0669025986464228, "learning_rate": 3.909363388071041e-07, "loss": 0.3333, "step": 4502 }, { "epoch": 4.800639658848614, "grad_norm": 0.06727116449771486, "learning_rate": 3.8679520036296823e-07, "loss": 0.3285, "step": 4503 }, { "epoch": 4.801705756929637, "grad_norm": 0.06423952902920181, "learning_rate": 3.826760055755374e-07, "loss": 0.3298, "step": 4504 }, { "epoch": 4.802771855010661, "grad_norm": 0.06287554539652727, "learning_rate": 3.785787567266219e-07, "loss": 0.3314, "step": 4505 }, { "epoch": 4.803837953091684, "grad_norm": 0.06428325628977336, "learning_rate": 3.745034560858907e-07, "loss": 0.3374, "step": 4506 }, { "epoch": 4.8049040511727075, "grad_norm": 0.0615240827531194, "learning_rate": 3.7045010591084893e-07, "loss": 0.3346, "step": 4507 }, { "epoch": 4.8059701492537314, "grad_norm": 0.06350727088776514, "learning_rate": 3.66418708446834e-07, "loss": 0.331, "step": 4508 }, { "epoch": 4.8070362473347545, "grad_norm": 0.06394487503770653, "learning_rate": 3.6240926592704173e-07, "loss": 0.3281, "step": 4509 }, { "epoch": 4.8081023454157785, "grad_norm": 0.06562664075180806, "learning_rate": 3.58421780572491e-07, "loss": 0.3364, "step": 4510 }, { "epoch": 4.809168443496802, "grad_norm": 0.0641531508096308, "learning_rate": 3.5445625459203715e-07, "loss": 0.3311, "step": 4511 }, { "epoch": 4.810234541577826, "grad_norm": 0.06317633180319338, "learning_rate": 3.5051269018238075e-07, "loss": 0.3327, "step": 4512 }, { "epoch": 4.811300639658849, "grad_norm": 0.06640788788183018, "learning_rate": 3.4659108952805e-07, "loss": 0.3339, "step": 4513 }, { "epoch": 4.812366737739872, "grad_norm": 0.06476401703214942, "learning_rate": 3.4269145480140486e-07, "loss": 0.3283, "step": 4514 }, { "epoch": 4.813432835820896, "grad_norm": 0.06632322608724953, "learning_rate": 3.388137881626419e-07, "loss": 0.3333, "step": 4515 }, { "epoch": 4.814498933901919, "grad_norm": 0.06584746139018403, "learning_rate": 3.3495809175978944e-07, "loss": 0.3341, "step": 4516 }, { "epoch": 4.815565031982942, "grad_norm": 0.06225839649171285, "learning_rate": 3.311243677287035e-07, "loss": 0.339, "step": 4517 }, { "epoch": 4.816631130063966, "grad_norm": 0.06266507769908893, "learning_rate": 3.2731261819306306e-07, "loss": 0.3338, "step": 4518 }, { "epoch": 4.817697228144989, "grad_norm": 0.06361144124581149, "learning_rate": 3.2352284526438347e-07, "loss": 0.3262, "step": 4519 }, { "epoch": 4.818763326226013, "grad_norm": 0.06657050940016936, "learning_rate": 3.1975505104199446e-07, "loss": 0.3347, "step": 4520 }, { "epoch": 4.819829424307036, "grad_norm": 0.06336689637535396, "learning_rate": 3.1600923761307077e-07, "loss": 0.3314, "step": 4521 }, { "epoch": 4.82089552238806, "grad_norm": 0.06697036655954806, "learning_rate": 3.1228540705258827e-07, "loss": 0.3373, "step": 4522 }, { "epoch": 4.821961620469083, "grad_norm": 0.06669058010931536, "learning_rate": 3.08583561423359e-07, "loss": 0.3333, "step": 4523 }, { "epoch": 4.823027718550106, "grad_norm": 0.06204857673664051, "learning_rate": 3.0490370277601375e-07, "loss": 0.3305, "step": 4524 }, { "epoch": 4.82409381663113, "grad_norm": 0.06414424868604003, "learning_rate": 3.0124583314900204e-07, "loss": 0.3285, "step": 4525 }, { "epoch": 4.825159914712153, "grad_norm": 0.06489342987918388, "learning_rate": 2.976099545685962e-07, "loss": 0.3389, "step": 4526 }, { "epoch": 4.826226012793177, "grad_norm": 0.06392477993982196, "learning_rate": 2.9399606904887856e-07, "loss": 0.3301, "step": 4527 }, { "epoch": 4.8272921108742, "grad_norm": 0.06216530530535471, "learning_rate": 2.9040417859175884e-07, "loss": 0.3278, "step": 4528 }, { "epoch": 4.8283582089552235, "grad_norm": 0.06907666472180728, "learning_rate": 2.8683428518695654e-07, "loss": 0.3462, "step": 4529 }, { "epoch": 4.8294243070362475, "grad_norm": 0.06389830168648562, "learning_rate": 2.832863908120009e-07, "loss": 0.3316, "step": 4530 }, { "epoch": 4.830490405117271, "grad_norm": 0.0659969650902936, "learning_rate": 2.7976049743224876e-07, "loss": 0.334, "step": 4531 }, { "epoch": 4.8315565031982945, "grad_norm": 0.06337199459457302, "learning_rate": 2.762566070008621e-07, "loss": 0.3379, "step": 4532 }, { "epoch": 4.832622601279318, "grad_norm": 0.06123098557327691, "learning_rate": 2.7277472145880837e-07, "loss": 0.3337, "step": 4533 }, { "epoch": 4.833688699360341, "grad_norm": 0.06421391981205138, "learning_rate": 2.6931484273487796e-07, "loss": 0.3351, "step": 4534 }, { "epoch": 4.834754797441365, "grad_norm": 0.06631306221864534, "learning_rate": 2.658769727456534e-07, "loss": 0.3395, "step": 4535 }, { "epoch": 4.835820895522388, "grad_norm": 0.06390409569316938, "learning_rate": 2.6246111339554903e-07, "loss": 0.335, "step": 4536 }, { "epoch": 4.836886993603412, "grad_norm": 0.06255999806490187, "learning_rate": 2.5906726657676685e-07, "loss": 0.3371, "step": 4537 }, { "epoch": 4.837953091684435, "grad_norm": 0.0638021517251054, "learning_rate": 2.556954341693185e-07, "loss": 0.3324, "step": 4538 }, { "epoch": 4.839019189765459, "grad_norm": 0.06516423398474619, "learning_rate": 2.5234561804102556e-07, "loss": 0.3363, "step": 4539 }, { "epoch": 4.840085287846482, "grad_norm": 0.06196390275858242, "learning_rate": 2.4901782004751905e-07, "loss": 0.3265, "step": 4540 }, { "epoch": 4.841151385927505, "grad_norm": 0.06281847969805605, "learning_rate": 2.457120420322179e-07, "loss": 0.3373, "step": 4541 }, { "epoch": 4.842217484008529, "grad_norm": 0.06560695196791491, "learning_rate": 2.424282858263549e-07, "loss": 0.3375, "step": 4542 }, { "epoch": 4.843283582089552, "grad_norm": 0.06533330528579777, "learning_rate": 2.3916655324895953e-07, "loss": 0.3293, "step": 4543 }, { "epoch": 4.844349680170575, "grad_norm": 0.06480021279499126, "learning_rate": 2.35926846106862e-07, "loss": 0.3355, "step": 4544 }, { "epoch": 4.845415778251599, "grad_norm": 0.06287240019527855, "learning_rate": 2.3270916619469342e-07, "loss": 0.3364, "step": 4545 }, { "epoch": 4.846481876332622, "grad_norm": 0.06700125191000252, "learning_rate": 2.2951351529487685e-07, "loss": 0.338, "step": 4546 }, { "epoch": 4.847547974413646, "grad_norm": 0.06356697255958557, "learning_rate": 2.2633989517764522e-07, "loss": 0.335, "step": 4547 }, { "epoch": 4.848614072494669, "grad_norm": 0.06622567910030012, "learning_rate": 2.2318830760100995e-07, "loss": 0.3398, "step": 4548 }, { "epoch": 4.849680170575693, "grad_norm": 0.06340820773787822, "learning_rate": 2.2005875431078794e-07, "loss": 0.3361, "step": 4549 }, { "epoch": 4.850746268656716, "grad_norm": 0.06467319645032904, "learning_rate": 2.169512370405924e-07, "loss": 0.3362, "step": 4550 }, { "epoch": 4.8518123667377395, "grad_norm": 0.06494652144642511, "learning_rate": 2.138657575118286e-07, "loss": 0.334, "step": 4551 }, { "epoch": 4.8528784648187635, "grad_norm": 0.06344916494239451, "learning_rate": 2.1080231743368928e-07, "loss": 0.3261, "step": 4552 }, { "epoch": 4.853944562899787, "grad_norm": 0.06552581044666182, "learning_rate": 2.0776091850315483e-07, "loss": 0.3338, "step": 4553 }, { "epoch": 4.855010660980811, "grad_norm": 0.06182298175401099, "learning_rate": 2.0474156240501086e-07, "loss": 0.3335, "step": 4554 }, { "epoch": 4.856076759061834, "grad_norm": 0.06542556498649621, "learning_rate": 2.0174425081182615e-07, "loss": 0.3364, "step": 4555 }, { "epoch": 4.857142857142857, "grad_norm": 0.06371796771439006, "learning_rate": 1.9876898538394362e-07, "loss": 0.3353, "step": 4556 }, { "epoch": 4.858208955223881, "grad_norm": 0.06396421709422397, "learning_rate": 1.9581576776951605e-07, "loss": 0.3295, "step": 4557 }, { "epoch": 4.859275053304904, "grad_norm": 0.06288608503660843, "learning_rate": 1.9288459960446592e-07, "loss": 0.3292, "step": 4558 }, { "epoch": 4.860341151385928, "grad_norm": 0.06363937784876181, "learning_rate": 1.899754825125122e-07, "loss": 0.3366, "step": 4559 }, { "epoch": 4.861407249466951, "grad_norm": 0.06336696226394795, "learning_rate": 1.8708841810515244e-07, "loss": 0.3274, "step": 4560 }, { "epoch": 4.862473347547974, "grad_norm": 0.0635413388603882, "learning_rate": 1.8422340798167181e-07, "loss": 0.3359, "step": 4561 }, { "epoch": 4.863539445628998, "grad_norm": 0.06482344768083516, "learning_rate": 1.8138045372913414e-07, "loss": 0.3387, "step": 4562 }, { "epoch": 4.864605543710021, "grad_norm": 0.06543147651304172, "learning_rate": 1.7855955692239525e-07, "loss": 0.3369, "step": 4563 }, { "epoch": 4.865671641791045, "grad_norm": 0.0634061615569733, "learning_rate": 1.757607191240762e-07, "loss": 0.3337, "step": 4564 }, { "epoch": 4.866737739872068, "grad_norm": 0.06497196758674298, "learning_rate": 1.7298394188459466e-07, "loss": 0.3337, "step": 4565 }, { "epoch": 4.867803837953092, "grad_norm": 0.0643026256592065, "learning_rate": 1.7022922674213793e-07, "loss": 0.3358, "step": 4566 }, { "epoch": 4.868869936034115, "grad_norm": 0.06380088154716912, "learning_rate": 1.6749657522267205e-07, "loss": 0.3323, "step": 4567 }, { "epoch": 4.869936034115138, "grad_norm": 0.062292059451068596, "learning_rate": 1.6478598883995057e-07, "loss": 0.3297, "step": 4568 }, { "epoch": 4.871002132196162, "grad_norm": 0.06429186614170503, "learning_rate": 1.620974690954924e-07, "loss": 0.3318, "step": 4569 }, { "epoch": 4.872068230277185, "grad_norm": 0.061982887675499386, "learning_rate": 1.59431017478604e-07, "loss": 0.3376, "step": 4570 }, { "epoch": 4.8731343283582085, "grad_norm": 0.0636117943991852, "learning_rate": 1.5678663546634832e-07, "loss": 0.3342, "step": 4571 }, { "epoch": 4.8742004264392325, "grad_norm": 0.06390306822281343, "learning_rate": 1.541643245235891e-07, "loss": 0.3332, "step": 4572 }, { "epoch": 4.8752665245202556, "grad_norm": 0.06414487391507678, "learning_rate": 1.515640861029466e-07, "loss": 0.3319, "step": 4573 }, { "epoch": 4.8763326226012795, "grad_norm": 0.06234839482693829, "learning_rate": 1.4898592164481528e-07, "loss": 0.3345, "step": 4574 }, { "epoch": 4.877398720682303, "grad_norm": 0.06282579096563232, "learning_rate": 1.4642983257736388e-07, "loss": 0.3318, "step": 4575 }, { "epoch": 4.878464818763327, "grad_norm": 0.06413128319015131, "learning_rate": 1.4389582031653525e-07, "loss": 0.3288, "step": 4576 }, { "epoch": 4.87953091684435, "grad_norm": 0.061604350840344914, "learning_rate": 1.4138388626603772e-07, "loss": 0.3293, "step": 4577 }, { "epoch": 4.880597014925373, "grad_norm": 0.06492239286709452, "learning_rate": 1.388940318173537e-07, "loss": 0.3359, "step": 4578 }, { "epoch": 4.881663113006397, "grad_norm": 0.06750216444127496, "learning_rate": 1.3642625834973555e-07, "loss": 0.3319, "step": 4579 }, { "epoch": 4.88272921108742, "grad_norm": 0.06551862088137983, "learning_rate": 1.3398056723019638e-07, "loss": 0.3371, "step": 4580 }, { "epoch": 4.883795309168444, "grad_norm": 0.06303973420478017, "learning_rate": 1.3155695981352356e-07, "loss": 0.3339, "step": 4581 }, { "epoch": 4.884861407249467, "grad_norm": 0.06431600254532914, "learning_rate": 1.2915543744227433e-07, "loss": 0.3387, "step": 4582 }, { "epoch": 4.88592750533049, "grad_norm": 0.06224741808886467, "learning_rate": 1.267760014467667e-07, "loss": 0.3303, "step": 4583 }, { "epoch": 4.886993603411514, "grad_norm": 0.0686255077912375, "learning_rate": 1.2441865314507529e-07, "loss": 0.3315, "step": 4584 }, { "epoch": 4.888059701492537, "grad_norm": 0.06375747762463076, "learning_rate": 1.220833938430621e-07, "loss": 0.3329, "step": 4585 }, { "epoch": 4.889125799573561, "grad_norm": 0.06360467146598325, "learning_rate": 1.1977022483432355e-07, "loss": 0.3304, "step": 4586 }, { "epoch": 4.890191897654584, "grad_norm": 0.06371846108482049, "learning_rate": 1.1747914740025235e-07, "loss": 0.3316, "step": 4587 }, { "epoch": 4.891257995735607, "grad_norm": 0.06123471557161623, "learning_rate": 1.152101628099711e-07, "loss": 0.333, "step": 4588 }, { "epoch": 4.892324093816631, "grad_norm": 0.0653623572630271, "learning_rate": 1.1296327232038995e-07, "loss": 0.3383, "step": 4589 }, { "epoch": 4.893390191897654, "grad_norm": 0.06317084300404086, "learning_rate": 1.1073847717616659e-07, "loss": 0.3355, "step": 4590 }, { "epoch": 4.894456289978678, "grad_norm": 0.06436959516579645, "learning_rate": 1.0853577860971965e-07, "loss": 0.3375, "step": 4591 }, { "epoch": 4.895522388059701, "grad_norm": 0.06436654116700972, "learning_rate": 1.0635517784122862e-07, "loss": 0.3361, "step": 4592 }, { "epoch": 4.896588486140725, "grad_norm": 0.0638870216071855, "learning_rate": 1.0419667607863393e-07, "loss": 0.3359, "step": 4593 }, { "epoch": 4.8976545842217485, "grad_norm": 0.06402271945130204, "learning_rate": 1.0206027451764133e-07, "loss": 0.3365, "step": 4594 }, { "epoch": 4.898720682302772, "grad_norm": 0.06360664197572094, "learning_rate": 9.994597434169529e-08, "loss": 0.3333, "step": 4595 }, { "epoch": 4.899786780383796, "grad_norm": 0.06298163310834395, "learning_rate": 9.785377672201002e-08, "loss": 0.3304, "step": 4596 }, { "epoch": 4.900852878464819, "grad_norm": 0.06222784082540263, "learning_rate": 9.578368281756068e-08, "loss": 0.3307, "step": 4597 }, { "epoch": 4.901918976545842, "grad_norm": 0.06603440394113538, "learning_rate": 9.373569377506553e-08, "loss": 0.3351, "step": 4598 }, { "epoch": 4.902985074626866, "grad_norm": 0.061308075618675476, "learning_rate": 9.17098107290082e-08, "loss": 0.3373, "step": 4599 }, { "epoch": 4.904051172707889, "grad_norm": 0.06278471458473436, "learning_rate": 8.970603480161988e-08, "loss": 0.3301, "step": 4600 }, { "epoch": 4.905117270788913, "grad_norm": 0.0626450228202153, "learning_rate": 8.772436710288822e-08, "loss": 0.3327, "step": 4601 }, { "epoch": 4.906183368869936, "grad_norm": 0.06235194061204951, "learning_rate": 8.576480873055737e-08, "loss": 0.3337, "step": 4602 }, { "epoch": 4.90724946695096, "grad_norm": 0.06548793549407254, "learning_rate": 8.382736077011899e-08, "loss": 0.3319, "step": 4603 }, { "epoch": 4.908315565031983, "grad_norm": 0.06161345936082664, "learning_rate": 8.191202429481681e-08, "loss": 0.3343, "step": 4604 }, { "epoch": 4.909381663113006, "grad_norm": 0.06227151592088014, "learning_rate": 8.001880036565102e-08, "loss": 0.3299, "step": 4605 }, { "epoch": 4.91044776119403, "grad_norm": 0.0628595755152068, "learning_rate": 7.814769003136491e-08, "loss": 0.335, "step": 4606 }, { "epoch": 4.911513859275053, "grad_norm": 0.06133289029888643, "learning_rate": 7.629869432845827e-08, "loss": 0.3327, "step": 4607 }, { "epoch": 4.912579957356077, "grad_norm": 0.06399713009061905, "learning_rate": 7.447181428117844e-08, "loss": 0.3368, "step": 4608 }, { "epoch": 4.9136460554371, "grad_norm": 0.06471533837047432, "learning_rate": 7.266705090152482e-08, "loss": 0.3371, "step": 4609 }, { "epoch": 4.914712153518123, "grad_norm": 0.0631935092408263, "learning_rate": 7.08844051892399e-08, "loss": 0.3346, "step": 4610 }, { "epoch": 4.915778251599147, "grad_norm": 0.06449225156722846, "learning_rate": 6.912387813181376e-08, "loss": 0.3322, "step": 4611 }, { "epoch": 4.91684434968017, "grad_norm": 0.06188700970750854, "learning_rate": 6.738547070449297e-08, "loss": 0.3325, "step": 4612 }, { "epoch": 4.917910447761194, "grad_norm": 0.06443747535001423, "learning_rate": 6.566918387026278e-08, "loss": 0.335, "step": 4613 }, { "epoch": 4.918976545842217, "grad_norm": 0.06527583600959296, "learning_rate": 6.397501857985599e-08, "loss": 0.3299, "step": 4614 }, { "epoch": 4.9200426439232405, "grad_norm": 0.06254955888214284, "learning_rate": 6.230297577175304e-08, "loss": 0.3318, "step": 4615 }, { "epoch": 4.9211087420042645, "grad_norm": 0.06654203572641401, "learning_rate": 6.065305637218188e-08, "loss": 0.3377, "step": 4616 }, { "epoch": 4.922174840085288, "grad_norm": 0.06615365835912221, "learning_rate": 5.902526129510477e-08, "loss": 0.3324, "step": 4617 }, { "epoch": 4.923240938166312, "grad_norm": 0.06358574578582833, "learning_rate": 5.741959144223597e-08, "loss": 0.3405, "step": 4618 }, { "epoch": 4.924307036247335, "grad_norm": 0.06592424042080675, "learning_rate": 5.583604770304174e-08, "loss": 0.3329, "step": 4619 }, { "epoch": 4.925373134328359, "grad_norm": 0.06489809675701481, "learning_rate": 5.427463095471375e-08, "loss": 0.3331, "step": 4620 }, { "epoch": 4.926439232409382, "grad_norm": 0.06346784271705076, "learning_rate": 5.27353420622001e-08, "loss": 0.3378, "step": 4621 }, { "epoch": 4.927505330490405, "grad_norm": 0.06492417082655748, "learning_rate": 5.121818187818761e-08, "loss": 0.3352, "step": 4622 }, { "epoch": 4.928571428571429, "grad_norm": 0.06263743897374244, "learning_rate": 4.9723151243106225e-08, "loss": 0.3375, "step": 4623 }, { "epoch": 4.929637526652452, "grad_norm": 0.06117815118487387, "learning_rate": 4.825025098512015e-08, "loss": 0.3407, "step": 4624 }, { "epoch": 4.930703624733475, "grad_norm": 0.06245193201459552, "learning_rate": 4.679948192013673e-08, "loss": 0.3279, "step": 4625 }, { "epoch": 4.931769722814499, "grad_norm": 0.06712207626671726, "learning_rate": 4.537084485181531e-08, "loss": 0.3347, "step": 4626 }, { "epoch": 4.932835820895522, "grad_norm": 0.0637857168923382, "learning_rate": 4.396434057154508e-08, "loss": 0.3303, "step": 4627 }, { "epoch": 4.933901918976546, "grad_norm": 0.06368341666107143, "learning_rate": 4.257996985844948e-08, "loss": 0.3293, "step": 4628 }, { "epoch": 4.934968017057569, "grad_norm": 0.06745977801668324, "learning_rate": 4.121773347940394e-08, "loss": 0.3329, "step": 4629 }, { "epoch": 4.936034115138593, "grad_norm": 0.06316814202138803, "learning_rate": 3.987763218901375e-08, "loss": 0.3323, "step": 4630 }, { "epoch": 4.937100213219616, "grad_norm": 0.06238635312813817, "learning_rate": 3.85596667296273e-08, "loss": 0.3319, "step": 4631 }, { "epoch": 4.938166311300639, "grad_norm": 0.06453862518488947, "learning_rate": 3.7263837831327255e-08, "loss": 0.3316, "step": 4632 }, { "epoch": 4.939232409381663, "grad_norm": 0.06561075891909876, "learning_rate": 3.5990146211939416e-08, "loss": 0.3357, "step": 4633 }, { "epoch": 4.940298507462686, "grad_norm": 0.06243805915307683, "learning_rate": 3.473859257701495e-08, "loss": 0.3357, "step": 4634 }, { "epoch": 4.94136460554371, "grad_norm": 0.06580397289868427, "learning_rate": 3.3509177619857056e-08, "loss": 0.3354, "step": 4635 }, { "epoch": 4.9424307036247335, "grad_norm": 0.06077030430444124, "learning_rate": 3.2301902021494304e-08, "loss": 0.3247, "step": 4636 }, { "epoch": 4.943496801705757, "grad_norm": 0.06466626231200338, "learning_rate": 3.111676645069839e-08, "loss": 0.3321, "step": 4637 }, { "epoch": 4.9445628997867805, "grad_norm": 0.06068940940138413, "learning_rate": 2.9953771563966396e-08, "loss": 0.3288, "step": 4638 }, { "epoch": 4.945628997867804, "grad_norm": 0.06278652639592003, "learning_rate": 2.881291800554298e-08, "loss": 0.3372, "step": 4639 }, { "epoch": 4.946695095948828, "grad_norm": 0.061309406774551704, "learning_rate": 2.769420640739817e-08, "loss": 0.3353, "step": 4640 }, { "epoch": 4.947761194029851, "grad_norm": 0.06281507165537796, "learning_rate": 2.6597637389240704e-08, "loss": 0.3321, "step": 4641 }, { "epoch": 4.948827292110874, "grad_norm": 0.061550308085559016, "learning_rate": 2.552321155851356e-08, "loss": 0.3325, "step": 4642 }, { "epoch": 4.949893390191898, "grad_norm": 0.06375020106010763, "learning_rate": 2.447092951038954e-08, "loss": 0.3315, "step": 4643 }, { "epoch": 4.950959488272921, "grad_norm": 0.06552775876706644, "learning_rate": 2.344079182778458e-08, "loss": 0.3445, "step": 4644 }, { "epoch": 4.952025586353945, "grad_norm": 0.06809372509479077, "learning_rate": 2.2432799081339997e-08, "loss": 0.3325, "step": 4645 }, { "epoch": 4.953091684434968, "grad_norm": 0.06352213483924132, "learning_rate": 2.1446951829422468e-08, "loss": 0.3363, "step": 4646 }, { "epoch": 4.954157782515992, "grad_norm": 0.0628161589783622, "learning_rate": 2.0483250618150708e-08, "loss": 0.334, "step": 4647 }, { "epoch": 4.955223880597015, "grad_norm": 0.06170202655763391, "learning_rate": 1.954169598136435e-08, "loss": 0.3379, "step": 4648 }, { "epoch": 4.956289978678038, "grad_norm": 0.06284704035908611, "learning_rate": 1.862228844062841e-08, "loss": 0.334, "step": 4649 }, { "epoch": 4.957356076759062, "grad_norm": 0.06343922078643444, "learning_rate": 1.772502850525548e-08, "loss": 0.3358, "step": 4650 }, { "epoch": 4.958422174840085, "grad_norm": 0.06367103465728541, "learning_rate": 1.6849916672270206e-08, "loss": 0.3362, "step": 4651 }, { "epoch": 4.959488272921108, "grad_norm": 0.06339160396502194, "learning_rate": 1.5996953426449245e-08, "loss": 0.3362, "step": 4652 }, { "epoch": 4.960554371002132, "grad_norm": 0.06146054352349314, "learning_rate": 1.51661392402902e-08, "loss": 0.3327, "step": 4653 }, { "epoch": 4.961620469083155, "grad_norm": 0.0644259680452347, "learning_rate": 1.4357474574011598e-08, "loss": 0.3275, "step": 4654 }, { "epoch": 4.962686567164179, "grad_norm": 0.06489614895198179, "learning_rate": 1.3570959875579547e-08, "loss": 0.3354, "step": 4655 }, { "epoch": 4.963752665245202, "grad_norm": 0.061701893851838184, "learning_rate": 1.2806595580676651e-08, "loss": 0.3296, "step": 4656 }, { "epoch": 4.964818763326226, "grad_norm": 0.06305988708818967, "learning_rate": 1.2064382112728646e-08, "loss": 0.3271, "step": 4657 }, { "epoch": 4.9658848614072495, "grad_norm": 0.06467949485812308, "learning_rate": 1.1344319882873323e-08, "loss": 0.3373, "step": 4658 }, { "epoch": 4.966950959488273, "grad_norm": 0.0616143550364817, "learning_rate": 1.064640928999605e-08, "loss": 0.3352, "step": 4659 }, { "epoch": 4.968017057569297, "grad_norm": 0.06426893511659539, "learning_rate": 9.970650720703135e-09, "loss": 0.3364, "step": 4660 }, { "epoch": 4.96908315565032, "grad_norm": 0.06068623678209842, "learning_rate": 9.317044549321808e-09, "loss": 0.3317, "step": 4661 }, { "epoch": 4.970149253731344, "grad_norm": 0.06197114220318593, "learning_rate": 8.685591137922444e-09, "loss": 0.3286, "step": 4662 }, { "epoch": 4.971215351812367, "grad_norm": 0.06386456846237887, "learning_rate": 8.076290836296352e-09, "loss": 0.3352, "step": 4663 }, { "epoch": 4.97228144989339, "grad_norm": 0.06241553173429086, "learning_rate": 7.489143981960211e-09, "loss": 0.3375, "step": 4664 }, { "epoch": 4.973347547974414, "grad_norm": 0.06569632030334341, "learning_rate": 6.924150900169402e-09, "loss": 0.334, "step": 4665 }, { "epoch": 4.974413646055437, "grad_norm": 0.06246704081644333, "learning_rate": 6.381311903900234e-09, "loss": 0.3328, "step": 4666 }, { "epoch": 4.975479744136461, "grad_norm": 0.06349233758317888, "learning_rate": 5.860627293849952e-09, "loss": 0.3371, "step": 4667 }, { "epoch": 4.976545842217484, "grad_norm": 0.061921416439339974, "learning_rate": 5.36209735845894e-09, "loss": 0.3386, "step": 4668 }, { "epoch": 4.977611940298507, "grad_norm": 0.062197087906872645, "learning_rate": 4.885722373879631e-09, "loss": 0.3402, "step": 4669 }, { "epoch": 4.978678038379531, "grad_norm": 0.06266534129824461, "learning_rate": 4.431502604003157e-09, "loss": 0.3249, "step": 4670 }, { "epoch": 4.979744136460554, "grad_norm": 0.062785232042293, "learning_rate": 3.999438300446024e-09, "loss": 0.3328, "step": 4671 }, { "epoch": 4.980810234541578, "grad_norm": 0.06336115371955366, "learning_rate": 3.5895297025456686e-09, "loss": 0.3298, "step": 4672 }, { "epoch": 4.981876332622601, "grad_norm": 0.06207536666619536, "learning_rate": 3.2017770373737877e-09, "loss": 0.3376, "step": 4673 }, { "epoch": 4.982942430703625, "grad_norm": 0.06040648920327252, "learning_rate": 2.8361805197185678e-09, "loss": 0.3313, "step": 4674 }, { "epoch": 4.984008528784648, "grad_norm": 0.06070989404951965, "learning_rate": 2.4927403521068926e-09, "loss": 0.3359, "step": 4675 }, { "epoch": 4.985074626865671, "grad_norm": 0.061482753937327835, "learning_rate": 2.1714567247821393e-09, "loss": 0.3363, "step": 4676 }, { "epoch": 4.986140724946695, "grad_norm": 0.06142287227143837, "learning_rate": 1.872329815726381e-09, "loss": 0.3397, "step": 4677 }, { "epoch": 4.9872068230277184, "grad_norm": 0.06141544242318959, "learning_rate": 1.5953597906337437e-09, "loss": 0.3335, "step": 4678 }, { "epoch": 4.9882729211087415, "grad_norm": 0.06367883708761624, "learning_rate": 1.3405468029370484e-09, "loss": 0.3371, "step": 4679 }, { "epoch": 4.9893390191897655, "grad_norm": 0.06116208249352169, "learning_rate": 1.1078909937811688e-09, "loss": 0.3295, "step": 4680 }, { "epoch": 4.990405117270789, "grad_norm": 0.06315685975189177, "learning_rate": 8.973924920541166e-10, "loss": 0.3355, "step": 4681 }, { "epoch": 4.991471215351813, "grad_norm": 0.06339091373459602, "learning_rate": 7.090514143515137e-10, "loss": 0.3304, "step": 4682 }, { "epoch": 4.992537313432836, "grad_norm": 0.06281856911763233, "learning_rate": 5.428678650165609e-10, "loss": 0.3336, "step": 4683 }, { "epoch": 4.99360341151386, "grad_norm": 0.062428937737431266, "learning_rate": 3.988419360956286e-10, "loss": 0.3368, "step": 4684 }, { "epoch": 4.994669509594883, "grad_norm": 0.063210979241412, "learning_rate": 2.769737073737844e-10, "loss": 0.3338, "step": 4685 }, { "epoch": 4.995735607675906, "grad_norm": 0.06278318354034783, "learning_rate": 1.7726324636591075e-10, "loss": 0.3374, "step": 4686 }, { "epoch": 4.99680170575693, "grad_norm": 0.06485031870184303, "learning_rate": 9.971060830338275e-11, "loss": 0.3327, "step": 4687 }, { "epoch": 4.997867803837953, "grad_norm": 0.062487731895542534, "learning_rate": 4.431583613850876e-11, "loss": 0.3312, "step": 4688 }, { "epoch": 4.998933901918977, "grad_norm": 0.06242272501094243, "learning_rate": 1.1078960571175856e-11, "loss": 0.3316, "step": 4689 }, { "epoch": 5.0, "grad_norm": 0.08020564798786287, "learning_rate": 0.0, "loss": 0.3297, "step": 4690 }, { "epoch": 5.0, "step": 4690, "total_flos": 7.868010719084544e+16, "train_loss": 0.37999461384724453, "train_runtime": 78929.6999, "train_samples_per_second": 30.397, "train_steps_per_second": 0.059 } ], "logging_steps": 1, "max_steps": 4690, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.868010719084544e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }