{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5034168564920274, "eval_steps": 500, "global_step": 660, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002277904328018223, "grad_norm": 0.0, "learning_rate": 1.999771975829438e-05, "loss": 1.9165, "step": 1 }, { "epoch": 0.004555808656036446, "grad_norm": 0.0, "learning_rate": 1.9995434832230086e-05, "loss": 2.8215, "step": 2 }, { "epoch": 0.00683371298405467, "grad_norm": 0.0, "learning_rate": 1.9993145207357477e-05, "loss": 2.8218, "step": 3 }, { "epoch": 0.009111617312072893, "grad_norm": 0.0, "learning_rate": 1.999085086916743e-05, "loss": 2.25, "step": 4 }, { "epoch": 0.011389521640091117, "grad_norm": 0.0, "learning_rate": 1.9988551803091016e-05, "loss": 2.0216, "step": 5 }, { "epoch": 0.01366742596810934, "grad_norm": 0.0, "learning_rate": 1.99862479944992e-05, "loss": 2.0384, "step": 6 }, { "epoch": 0.015945330296127564, "grad_norm": 0.0, "learning_rate": 1.9983939428702538e-05, "loss": 2.0138, "step": 7 }, { "epoch": 0.018223234624145785, "grad_norm": 0.0, "learning_rate": 1.9981626090950854e-05, "loss": 2.1282, "step": 8 }, { "epoch": 0.02050113895216401, "grad_norm": 0.0, "learning_rate": 1.9979307966432923e-05, "loss": 2.1009, "step": 9 }, { "epoch": 0.022779043280182234, "grad_norm": 0.0, "learning_rate": 1.9976985040276184e-05, "loss": 1.9757, "step": 10 }, { "epoch": 0.025056947608200455, "grad_norm": 0.0, "learning_rate": 1.9974657297546367e-05, "loss": 1.9489, "step": 11 }, { "epoch": 0.02733485193621868, "grad_norm": 0.0, "learning_rate": 1.9972324723247235e-05, "loss": 2.0921, "step": 12 }, { "epoch": 0.029612756264236904, "grad_norm": 0.0, "learning_rate": 1.9969987302320216e-05, "loss": 2.0912, "step": 13 }, { "epoch": 0.03189066059225513, "grad_norm": 0.0, "learning_rate": 1.9967645019644098e-05, "loss": 1.979, "step": 14 }, { "epoch": 0.03416856492027335, "grad_norm": 0.0, "learning_rate": 1.99652978600347e-05, "loss": 1.9276, "step": 15 }, { "epoch": 0.03644646924829157, "grad_norm": 0.0, "learning_rate": 1.996294580824456e-05, "loss": 2.0417, "step": 16 }, { "epoch": 0.0387243735763098, "grad_norm": 0.0, "learning_rate": 1.996058884896256e-05, "loss": 1.9939, "step": 17 }, { "epoch": 0.04100227790432802, "grad_norm": 0.0, "learning_rate": 1.9958226966813648e-05, "loss": 2.0766, "step": 18 }, { "epoch": 0.04328018223234624, "grad_norm": 0.0, "learning_rate": 1.9955860146358464e-05, "loss": 1.9781, "step": 19 }, { "epoch": 0.04555808656036447, "grad_norm": 0.0, "learning_rate": 1.9953488372093024e-05, "loss": 2.0005, "step": 20 }, { "epoch": 0.04783599088838269, "grad_norm": 0.0, "learning_rate": 1.995111162844838e-05, "loss": 2.006, "step": 21 }, { "epoch": 0.05011389521640091, "grad_norm": 0.0, "learning_rate": 1.994872989979026e-05, "loss": 1.9772, "step": 22 }, { "epoch": 0.05239179954441914, "grad_norm": 0.0, "learning_rate": 1.9946343170418757e-05, "loss": 2.0346, "step": 23 }, { "epoch": 0.05466970387243736, "grad_norm": 0.0, "learning_rate": 1.9943951424567963e-05, "loss": 1.8697, "step": 24 }, { "epoch": 0.05694760820045558, "grad_norm": 0.0, "learning_rate": 1.994155464640561e-05, "loss": 2.0224, "step": 25 }, { "epoch": 0.05922551252847381, "grad_norm": 0.0, "learning_rate": 1.9939152820032765e-05, "loss": 1.9172, "step": 26 }, { "epoch": 0.06150341685649203, "grad_norm": 0.0, "learning_rate": 1.9936745929483427e-05, "loss": 1.948, "step": 27 }, { "epoch": 0.06378132118451026, "grad_norm": 0.0, "learning_rate": 1.9934333958724203e-05, "loss": 1.9458, "step": 28 }, { "epoch": 0.06605922551252848, "grad_norm": 0.0, "learning_rate": 1.993191689165395e-05, "loss": 1.8594, "step": 29 }, { "epoch": 0.0683371298405467, "grad_norm": 0.0, "learning_rate": 1.992949471210341e-05, "loss": 1.9468, "step": 30 }, { "epoch": 0.07061503416856492, "grad_norm": 0.0, "learning_rate": 1.9927067403834844e-05, "loss": 1.9019, "step": 31 }, { "epoch": 0.07289293849658314, "grad_norm": 0.0, "learning_rate": 1.9924634950541687e-05, "loss": 1.9821, "step": 32 }, { "epoch": 0.07517084282460136, "grad_norm": 0.0, "learning_rate": 1.992219733584817e-05, "loss": 1.8991, "step": 33 }, { "epoch": 0.0774487471526196, "grad_norm": 0.0, "learning_rate": 1.9919754543308948e-05, "loss": 1.97, "step": 34 }, { "epoch": 0.07972665148063782, "grad_norm": 0.0, "learning_rate": 1.9917306556408743e-05, "loss": 1.925, "step": 35 }, { "epoch": 0.08200455580865604, "grad_norm": 0.0, "learning_rate": 1.991485335856197e-05, "loss": 1.8711, "step": 36 }, { "epoch": 0.08428246013667426, "grad_norm": 0.0, "learning_rate": 1.991239493311235e-05, "loss": 1.8825, "step": 37 }, { "epoch": 0.08656036446469248, "grad_norm": 0.0, "learning_rate": 1.9909931263332544e-05, "loss": 1.8301, "step": 38 }, { "epoch": 0.0888382687927107, "grad_norm": 0.0, "learning_rate": 1.9907462332423777e-05, "loss": 1.8035, "step": 39 }, { "epoch": 0.09111617312072894, "grad_norm": 0.0, "learning_rate": 1.990498812351544e-05, "loss": 1.8363, "step": 40 }, { "epoch": 0.09339407744874716, "grad_norm": 0.0, "learning_rate": 1.9902508619664727e-05, "loss": 1.9067, "step": 41 }, { "epoch": 0.09567198177676538, "grad_norm": 0.0, "learning_rate": 1.9900023803856224e-05, "loss": 1.921, "step": 42 }, { "epoch": 0.0979498861047836, "grad_norm": 0.0, "learning_rate": 1.9897533659001547e-05, "loss": 1.862, "step": 43 }, { "epoch": 0.10022779043280182, "grad_norm": 0.0, "learning_rate": 1.9895038167938934e-05, "loss": 1.9782, "step": 44 }, { "epoch": 0.10250569476082004, "grad_norm": 0.0, "learning_rate": 1.9892537313432837e-05, "loss": 2.0042, "step": 45 }, { "epoch": 0.10478359908883828, "grad_norm": 0.0, "learning_rate": 1.989003107817356e-05, "loss": 1.9677, "step": 46 }, { "epoch": 0.1070615034168565, "grad_norm": 0.0, "learning_rate": 1.9887519444776835e-05, "loss": 1.7879, "step": 47 }, { "epoch": 0.10933940774487472, "grad_norm": 0.0, "learning_rate": 1.988500239578342e-05, "loss": 1.9285, "step": 48 }, { "epoch": 0.11161731207289294, "grad_norm": 0.0, "learning_rate": 1.9882479913658713e-05, "loss": 1.8736, "step": 49 }, { "epoch": 0.11389521640091116, "grad_norm": 0.0, "learning_rate": 1.9879951980792316e-05, "loss": 1.9533, "step": 50 }, { "epoch": 0.11617312072892938, "grad_norm": 0.0, "learning_rate": 1.987741857949766e-05, "loss": 1.8078, "step": 51 }, { "epoch": 0.11845102505694761, "grad_norm": 0.0, "learning_rate": 1.9874879692011553e-05, "loss": 1.9007, "step": 52 }, { "epoch": 0.12072892938496584, "grad_norm": 0.0, "learning_rate": 1.98723353004938e-05, "loss": 1.9432, "step": 53 }, { "epoch": 0.12300683371298406, "grad_norm": 0.0, "learning_rate": 1.9869785387026768e-05, "loss": 1.9253, "step": 54 }, { "epoch": 0.1252847380410023, "grad_norm": 0.0, "learning_rate": 1.986722993361497e-05, "loss": 1.723, "step": 55 }, { "epoch": 0.1275626423690205, "grad_norm": 0.0, "learning_rate": 1.986466892218463e-05, "loss": 1.8857, "step": 56 }, { "epoch": 0.12984054669703873, "grad_norm": 0.0, "learning_rate": 1.9862102334583286e-05, "loss": 1.828, "step": 57 }, { "epoch": 0.13211845102505695, "grad_norm": 0.0, "learning_rate": 1.9859530152579322e-05, "loss": 2.0451, "step": 58 }, { "epoch": 0.13439635535307518, "grad_norm": 0.0, "learning_rate": 1.985695235786156e-05, "loss": 1.9004, "step": 59 }, { "epoch": 0.1366742596810934, "grad_norm": 0.0, "learning_rate": 1.9854368932038836e-05, "loss": 1.8175, "step": 60 }, { "epoch": 0.13895216400911162, "grad_norm": 0.0, "learning_rate": 1.9851779856639535e-05, "loss": 1.8686, "step": 61 }, { "epoch": 0.14123006833712984, "grad_norm": 0.0, "learning_rate": 1.9849185113111166e-05, "loss": 1.812, "step": 62 }, { "epoch": 0.14350797266514806, "grad_norm": 0.0, "learning_rate": 1.984658468281992e-05, "loss": 1.9148, "step": 63 }, { "epoch": 0.14578587699316628, "grad_norm": 0.0, "learning_rate": 1.9843978547050223e-05, "loss": 1.9053, "step": 64 }, { "epoch": 0.1480637813211845, "grad_norm": 0.0, "learning_rate": 1.9841366687004273e-05, "loss": 1.8183, "step": 65 }, { "epoch": 0.15034168564920272, "grad_norm": 0.0, "learning_rate": 1.9838749083801612e-05, "loss": 1.8301, "step": 66 }, { "epoch": 0.15261958997722094, "grad_norm": 0.0, "learning_rate": 1.9836125718478662e-05, "loss": 1.8372, "step": 67 }, { "epoch": 0.1548974943052392, "grad_norm": 0.0, "learning_rate": 1.983349657198825e-05, "loss": 2.0162, "step": 68 }, { "epoch": 0.1571753986332574, "grad_norm": 0.0, "learning_rate": 1.9830861625199166e-05, "loss": 1.961, "step": 69 }, { "epoch": 0.15945330296127563, "grad_norm": 0.0, "learning_rate": 1.9828220858895707e-05, "loss": 1.8757, "step": 70 }, { "epoch": 0.16173120728929385, "grad_norm": 0.0, "learning_rate": 1.982557425377718e-05, "loss": 1.731, "step": 71 }, { "epoch": 0.16400911161731208, "grad_norm": 0.0, "learning_rate": 1.9822921790457455e-05, "loss": 2.0243, "step": 72 }, { "epoch": 0.1662870159453303, "grad_norm": 0.0, "learning_rate": 1.9820263449464484e-05, "loss": 1.952, "step": 73 }, { "epoch": 0.16856492027334852, "grad_norm": 0.0, "learning_rate": 1.9817599211239834e-05, "loss": 1.8578, "step": 74 }, { "epoch": 0.17084282460136674, "grad_norm": 0.0, "learning_rate": 1.9814929056138188e-05, "loss": 1.875, "step": 75 }, { "epoch": 0.17312072892938496, "grad_norm": 0.0, "learning_rate": 1.9812252964426878e-05, "loss": 1.9307, "step": 76 }, { "epoch": 0.17539863325740318, "grad_norm": 0.0, "learning_rate": 1.9809570916285398e-05, "loss": 1.9268, "step": 77 }, { "epoch": 0.1776765375854214, "grad_norm": 0.0, "learning_rate": 1.9806882891804902e-05, "loss": 1.868, "step": 78 }, { "epoch": 0.17995444191343962, "grad_norm": 0.0, "learning_rate": 1.9804188870987733e-05, "loss": 2.0597, "step": 79 }, { "epoch": 0.18223234624145787, "grad_norm": 0.0, "learning_rate": 1.98014888337469e-05, "loss": 1.9474, "step": 80 }, { "epoch": 0.1845102505694761, "grad_norm": 0.0, "learning_rate": 1.9798782759905605e-05, "loss": 1.8404, "step": 81 }, { "epoch": 0.1867881548974943, "grad_norm": 0.0, "learning_rate": 1.9796070629196717e-05, "loss": 1.7861, "step": 82 }, { "epoch": 0.18906605922551253, "grad_norm": 0.0, "learning_rate": 1.9793352421262293e-05, "loss": 1.8087, "step": 83 }, { "epoch": 0.19134396355353075, "grad_norm": 0.0, "learning_rate": 1.9790628115653045e-05, "loss": 1.9031, "step": 84 }, { "epoch": 0.19362186788154898, "grad_norm": 0.0, "learning_rate": 1.9787897691827823e-05, "loss": 1.9287, "step": 85 }, { "epoch": 0.1958997722095672, "grad_norm": 0.0, "learning_rate": 1.9785161129153136e-05, "loss": 1.9303, "step": 86 }, { "epoch": 0.19817767653758542, "grad_norm": 0.0, "learning_rate": 1.978241840690259e-05, "loss": 1.7859, "step": 87 }, { "epoch": 0.20045558086560364, "grad_norm": 0.0, "learning_rate": 1.9779669504256388e-05, "loss": 1.7389, "step": 88 }, { "epoch": 0.20273348519362186, "grad_norm": 0.0, "learning_rate": 1.977691440030079e-05, "loss": 1.9208, "step": 89 }, { "epoch": 0.20501138952164008, "grad_norm": 0.0, "learning_rate": 1.9774153074027606e-05, "loss": 1.9071, "step": 90 }, { "epoch": 0.2072892938496583, "grad_norm": 0.0, "learning_rate": 1.977138550433363e-05, "loss": 1.8619, "step": 91 }, { "epoch": 0.20956719817767655, "grad_norm": 0.0, "learning_rate": 1.9768611670020123e-05, "loss": 1.6974, "step": 92 }, { "epoch": 0.21184510250569477, "grad_norm": 0.0, "learning_rate": 1.9765831549792272e-05, "loss": 1.8706, "step": 93 }, { "epoch": 0.214123006833713, "grad_norm": 0.0, "learning_rate": 1.9763045122258635e-05, "loss": 1.7, "step": 94 }, { "epoch": 0.2164009111617312, "grad_norm": 0.0, "learning_rate": 1.97602523659306e-05, "loss": 1.7257, "step": 95 }, { "epoch": 0.21867881548974943, "grad_norm": 0.0, "learning_rate": 1.975745325922183e-05, "loss": 1.8205, "step": 96 }, { "epoch": 0.22095671981776766, "grad_norm": 0.0, "learning_rate": 1.9754647780447705e-05, "loss": 1.9095, "step": 97 }, { "epoch": 0.22323462414578588, "grad_norm": 0.0, "learning_rate": 1.9751835907824768e-05, "loss": 1.8607, "step": 98 }, { "epoch": 0.2255125284738041, "grad_norm": 0.0, "learning_rate": 1.974901761947015e-05, "loss": 1.8098, "step": 99 }, { "epoch": 0.22779043280182232, "grad_norm": 0.0, "learning_rate": 1.9746192893401017e-05, "loss": 1.816, "step": 100 }, { "epoch": 0.23006833712984054, "grad_norm": 0.0, "learning_rate": 1.9743361707533988e-05, "loss": 1.8485, "step": 101 }, { "epoch": 0.23234624145785876, "grad_norm": 0.0, "learning_rate": 1.9740524039684558e-05, "loss": 1.6743, "step": 102 }, { "epoch": 0.23462414578587698, "grad_norm": 0.0, "learning_rate": 1.9737679867566535e-05, "loss": 1.9887, "step": 103 }, { "epoch": 0.23690205011389523, "grad_norm": 0.0, "learning_rate": 1.9734829168791436e-05, "loss": 1.8318, "step": 104 }, { "epoch": 0.23917995444191345, "grad_norm": 0.0, "learning_rate": 1.9731971920867902e-05, "loss": 1.9805, "step": 105 }, { "epoch": 0.24145785876993167, "grad_norm": 0.0, "learning_rate": 1.9729108101201127e-05, "loss": 1.7662, "step": 106 }, { "epoch": 0.2437357630979499, "grad_norm": 0.0, "learning_rate": 1.9726237687092234e-05, "loss": 1.9131, "step": 107 }, { "epoch": 0.2460136674259681, "grad_norm": 0.0, "learning_rate": 1.9723360655737707e-05, "loss": 1.7177, "step": 108 }, { "epoch": 0.24829157175398633, "grad_norm": 0.0, "learning_rate": 1.972047698422875e-05, "loss": 1.8605, "step": 109 }, { "epoch": 0.2505694760820046, "grad_norm": 0.0, "learning_rate": 1.9717586649550706e-05, "loss": 1.9709, "step": 110 }, { "epoch": 0.2528473804100228, "grad_norm": 0.0, "learning_rate": 1.9714689628582445e-05, "loss": 1.7844, "step": 111 }, { "epoch": 0.255125284738041, "grad_norm": 0.0, "learning_rate": 1.971178589809573e-05, "loss": 1.8793, "step": 112 }, { "epoch": 0.25740318906605925, "grad_norm": 0.0, "learning_rate": 1.9708875434754607e-05, "loss": 1.993, "step": 113 }, { "epoch": 0.25968109339407747, "grad_norm": 0.0, "learning_rate": 1.9705958215114784e-05, "loss": 1.8287, "step": 114 }, { "epoch": 0.2619589977220957, "grad_norm": 0.0, "learning_rate": 1.9703034215622984e-05, "loss": 1.7294, "step": 115 }, { "epoch": 0.2642369020501139, "grad_norm": 0.0, "learning_rate": 1.970010341261634e-05, "loss": 1.7535, "step": 116 }, { "epoch": 0.26651480637813213, "grad_norm": 0.0, "learning_rate": 1.969716578232173e-05, "loss": 1.8323, "step": 117 }, { "epoch": 0.26879271070615035, "grad_norm": 0.0, "learning_rate": 1.969422130085515e-05, "loss": 1.8709, "step": 118 }, { "epoch": 0.27107061503416857, "grad_norm": 0.0, "learning_rate": 1.9691269944221044e-05, "loss": 1.8039, "step": 119 }, { "epoch": 0.2733485193621868, "grad_norm": 0.0, "learning_rate": 1.968831168831169e-05, "loss": 2.0044, "step": 120 }, { "epoch": 0.275626423690205, "grad_norm": 0.0, "learning_rate": 1.9685346508906514e-05, "loss": 1.8278, "step": 121 }, { "epoch": 0.27790432801822323, "grad_norm": 0.0, "learning_rate": 1.9682374381671442e-05, "loss": 1.7364, "step": 122 }, { "epoch": 0.28018223234624146, "grad_norm": 0.0, "learning_rate": 1.9679395282158218e-05, "loss": 2.0259, "step": 123 }, { "epoch": 0.2824601366742597, "grad_norm": 0.0, "learning_rate": 1.967640918580376e-05, "loss": 1.7846, "step": 124 }, { "epoch": 0.2847380410022779, "grad_norm": 0.0, "learning_rate": 1.967341606792946e-05, "loss": 1.8559, "step": 125 }, { "epoch": 0.2870159453302961, "grad_norm": 0.0, "learning_rate": 1.967041590374052e-05, "loss": 1.866, "step": 126 }, { "epoch": 0.28929384965831434, "grad_norm": 0.0, "learning_rate": 1.966740866832526e-05, "loss": 1.6675, "step": 127 }, { "epoch": 0.29157175398633256, "grad_norm": 0.0, "learning_rate": 1.9664394336654432e-05, "loss": 1.7645, "step": 128 }, { "epoch": 0.2938496583143508, "grad_norm": 0.0, "learning_rate": 1.9661372883580523e-05, "loss": 1.9748, "step": 129 }, { "epoch": 0.296127562642369, "grad_norm": 0.0, "learning_rate": 1.9658344283837056e-05, "loss": 1.8305, "step": 130 }, { "epoch": 0.2984054669703872, "grad_norm": 0.0, "learning_rate": 1.965530851203789e-05, "loss": 1.9718, "step": 131 }, { "epoch": 0.30068337129840544, "grad_norm": 0.0, "learning_rate": 1.9652265542676502e-05, "loss": 1.7814, "step": 132 }, { "epoch": 0.30296127562642367, "grad_norm": 0.0, "learning_rate": 1.9649215350125283e-05, "loss": 1.9832, "step": 133 }, { "epoch": 0.3052391799544419, "grad_norm": 0.0, "learning_rate": 1.9646157908634805e-05, "loss": 1.6894, "step": 134 }, { "epoch": 0.30751708428246016, "grad_norm": 0.0, "learning_rate": 1.9643093192333114e-05, "loss": 1.86, "step": 135 }, { "epoch": 0.3097949886104784, "grad_norm": 0.0, "learning_rate": 1.9640021175224987e-05, "loss": 1.72, "step": 136 }, { "epoch": 0.3120728929384966, "grad_norm": 0.0, "learning_rate": 1.9636941831191202e-05, "loss": 1.8442, "step": 137 }, { "epoch": 0.3143507972665148, "grad_norm": 0.0, "learning_rate": 1.9633855133987797e-05, "loss": 1.9141, "step": 138 }, { "epoch": 0.31662870159453305, "grad_norm": 0.0, "learning_rate": 1.963076105724532e-05, "loss": 1.8632, "step": 139 }, { "epoch": 0.31890660592255127, "grad_norm": 0.0, "learning_rate": 1.962765957446809e-05, "loss": 1.823, "step": 140 }, { "epoch": 0.3211845102505695, "grad_norm": 0.0, "learning_rate": 1.962455065903342e-05, "loss": 1.9247, "step": 141 }, { "epoch": 0.3234624145785877, "grad_norm": 0.0, "learning_rate": 1.9621434284190885e-05, "loss": 1.9465, "step": 142 }, { "epoch": 0.32574031890660593, "grad_norm": 0.0, "learning_rate": 1.9618310423061524e-05, "loss": 1.8873, "step": 143 }, { "epoch": 0.32801822323462415, "grad_norm": 0.0, "learning_rate": 1.9615179048637097e-05, "loss": 1.9864, "step": 144 }, { "epoch": 0.33029612756264237, "grad_norm": 0.0, "learning_rate": 1.9612040133779266e-05, "loss": 1.9989, "step": 145 }, { "epoch": 0.3325740318906606, "grad_norm": 0.0, "learning_rate": 1.960889365121886e-05, "loss": 1.7546, "step": 146 }, { "epoch": 0.3348519362186788, "grad_norm": 0.0, "learning_rate": 1.9605739573555053e-05, "loss": 1.8092, "step": 147 }, { "epoch": 0.33712984054669703, "grad_norm": 0.0, "learning_rate": 1.9602577873254568e-05, "loss": 1.8627, "step": 148 }, { "epoch": 0.33940774487471526, "grad_norm": 0.0, "learning_rate": 1.9599408522650895e-05, "loss": 1.8892, "step": 149 }, { "epoch": 0.3416856492027335, "grad_norm": 0.0, "learning_rate": 1.9596231493943474e-05, "loss": 1.8105, "step": 150 }, { "epoch": 0.3439635535307517, "grad_norm": 0.0, "learning_rate": 1.9593046759196877e-05, "loss": 1.8469, "step": 151 }, { "epoch": 0.3462414578587699, "grad_norm": 0.0, "learning_rate": 1.958985429033999e-05, "loss": 1.8693, "step": 152 }, { "epoch": 0.34851936218678814, "grad_norm": 0.0, "learning_rate": 1.9586654059165205e-05, "loss": 1.8949, "step": 153 }, { "epoch": 0.35079726651480636, "grad_norm": 0.0, "learning_rate": 1.9583446037327565e-05, "loss": 1.8999, "step": 154 }, { "epoch": 0.3530751708428246, "grad_norm": 0.0, "learning_rate": 1.9580230196343943e-05, "loss": 1.8187, "step": 155 }, { "epoch": 0.3553530751708428, "grad_norm": 0.0, "learning_rate": 1.957700650759219e-05, "loss": 1.68, "step": 156 }, { "epoch": 0.357630979498861, "grad_norm": 0.0, "learning_rate": 1.9573774942310304e-05, "loss": 1.7054, "step": 157 }, { "epoch": 0.35990888382687924, "grad_norm": 0.0, "learning_rate": 1.9570535471595545e-05, "loss": 1.8753, "step": 158 }, { "epoch": 0.3621867881548975, "grad_norm": 0.0, "learning_rate": 1.9567288066403597e-05, "loss": 1.8248, "step": 159 }, { "epoch": 0.36446469248291574, "grad_norm": 0.0, "learning_rate": 1.9564032697547687e-05, "loss": 1.8118, "step": 160 }, { "epoch": 0.36674259681093396, "grad_norm": 0.0, "learning_rate": 1.9560769335697723e-05, "loss": 1.8175, "step": 161 }, { "epoch": 0.3690205011389522, "grad_norm": 0.0, "learning_rate": 1.9557497951379406e-05, "loss": 1.8191, "step": 162 }, { "epoch": 0.3712984054669704, "grad_norm": 0.0, "learning_rate": 1.9554218514973334e-05, "loss": 1.8828, "step": 163 }, { "epoch": 0.3735763097949886, "grad_norm": 0.0, "learning_rate": 1.955093099671413e-05, "loss": 1.9217, "step": 164 }, { "epoch": 0.37585421412300685, "grad_norm": 0.0, "learning_rate": 1.9547635366689516e-05, "loss": 1.9383, "step": 165 }, { "epoch": 0.37813211845102507, "grad_norm": 0.0, "learning_rate": 1.954433159483942e-05, "loss": 1.9053, "step": 166 }, { "epoch": 0.3804100227790433, "grad_norm": 0.0, "learning_rate": 1.9541019650955066e-05, "loss": 1.7693, "step": 167 }, { "epoch": 0.3826879271070615, "grad_norm": 0.0, "learning_rate": 1.9537699504678044e-05, "loss": 1.8591, "step": 168 }, { "epoch": 0.38496583143507973, "grad_norm": 0.0, "learning_rate": 1.953437112549938e-05, "loss": 1.9008, "step": 169 }, { "epoch": 0.38724373576309795, "grad_norm": 0.0, "learning_rate": 1.953103448275862e-05, "loss": 1.7367, "step": 170 }, { "epoch": 0.3895216400911162, "grad_norm": 0.0, "learning_rate": 1.952768954564287e-05, "loss": 1.9731, "step": 171 }, { "epoch": 0.3917995444191344, "grad_norm": 0.0, "learning_rate": 1.9524336283185842e-05, "loss": 1.8188, "step": 172 }, { "epoch": 0.3940774487471526, "grad_norm": 0.0, "learning_rate": 1.9520974664266928e-05, "loss": 1.8901, "step": 173 }, { "epoch": 0.39635535307517084, "grad_norm": 0.0, "learning_rate": 1.9517604657610205e-05, "loss": 1.733, "step": 174 }, { "epoch": 0.39863325740318906, "grad_norm": 0.0, "learning_rate": 1.9514226231783483e-05, "loss": 1.9324, "step": 175 }, { "epoch": 0.4009111617312073, "grad_norm": 0.0, "learning_rate": 1.9510839355197333e-05, "loss": 2.0288, "step": 176 }, { "epoch": 0.4031890660592255, "grad_norm": 0.0, "learning_rate": 1.9507443996104078e-05, "loss": 1.8382, "step": 177 }, { "epoch": 0.4054669703872437, "grad_norm": 0.0, "learning_rate": 1.9504040122596826e-05, "loss": 1.7701, "step": 178 }, { "epoch": 0.40774487471526194, "grad_norm": 0.0, "learning_rate": 1.9500627702608454e-05, "loss": 1.8828, "step": 179 }, { "epoch": 0.41002277904328016, "grad_norm": 0.0, "learning_rate": 1.9497206703910617e-05, "loss": 1.7847, "step": 180 }, { "epoch": 0.4123006833712984, "grad_norm": 0.0, "learning_rate": 1.9493777094112713e-05, "loss": 1.9039, "step": 181 }, { "epoch": 0.4145785876993166, "grad_norm": 0.0, "learning_rate": 1.949033884066088e-05, "loss": 1.8635, "step": 182 }, { "epoch": 0.4168564920273349, "grad_norm": 0.0, "learning_rate": 1.9486891910836954e-05, "loss": 1.671, "step": 183 }, { "epoch": 0.4191343963553531, "grad_norm": 0.0, "learning_rate": 1.9483436271757442e-05, "loss": 2.0151, "step": 184 }, { "epoch": 0.4214123006833713, "grad_norm": 0.0, "learning_rate": 1.9479971890372456e-05, "loss": 1.8151, "step": 185 }, { "epoch": 0.42369020501138954, "grad_norm": 0.0, "learning_rate": 1.947649873346468e-05, "loss": 1.9972, "step": 186 }, { "epoch": 0.42596810933940776, "grad_norm": 0.0, "learning_rate": 1.9473016767648306e-05, "loss": 1.8369, "step": 187 }, { "epoch": 0.428246013667426, "grad_norm": 0.0, "learning_rate": 1.9469525959367946e-05, "loss": 1.7784, "step": 188 }, { "epoch": 0.4305239179954442, "grad_norm": 0.0, "learning_rate": 1.9466026274897585e-05, "loss": 1.7226, "step": 189 }, { "epoch": 0.4328018223234624, "grad_norm": 0.0, "learning_rate": 1.9462517680339464e-05, "loss": 1.8151, "step": 190 }, { "epoch": 0.43507972665148065, "grad_norm": 0.0, "learning_rate": 1.9459000141623002e-05, "loss": 1.8637, "step": 191 }, { "epoch": 0.43735763097949887, "grad_norm": 0.0, "learning_rate": 1.945547362450369e-05, "loss": 1.8807, "step": 192 }, { "epoch": 0.4396355353075171, "grad_norm": 0.0, "learning_rate": 1.945193809456198e-05, "loss": 1.8704, "step": 193 }, { "epoch": 0.4419134396355353, "grad_norm": 0.0, "learning_rate": 1.9448393517202162e-05, "loss": 1.8055, "step": 194 }, { "epoch": 0.44419134396355353, "grad_norm": 0.0, "learning_rate": 1.9444839857651248e-05, "loss": 1.7756, "step": 195 }, { "epoch": 0.44646924829157175, "grad_norm": 0.0, "learning_rate": 1.9441277080957813e-05, "loss": 1.8105, "step": 196 }, { "epoch": 0.44874715261959, "grad_norm": 0.0, "learning_rate": 1.9437705151990866e-05, "loss": 1.9053, "step": 197 }, { "epoch": 0.4510250569476082, "grad_norm": 0.0, "learning_rate": 1.94341240354387e-05, "loss": 1.975, "step": 198 }, { "epoch": 0.4533029612756264, "grad_norm": 0.0, "learning_rate": 1.94305336958077e-05, "loss": 1.7839, "step": 199 }, { "epoch": 0.45558086560364464, "grad_norm": 0.0, "learning_rate": 1.9426934097421205e-05, "loss": 1.765, "step": 200 }, { "epoch": 0.45785876993166286, "grad_norm": 0.0, "learning_rate": 1.9423325204418306e-05, "loss": 1.9151, "step": 201 }, { "epoch": 0.4601366742596811, "grad_norm": 0.0, "learning_rate": 1.941970698075266e-05, "loss": 2.0266, "step": 202 }, { "epoch": 0.4624145785876993, "grad_norm": 0.0, "learning_rate": 1.9416079390191285e-05, "loss": 1.8082, "step": 203 }, { "epoch": 0.4646924829157175, "grad_norm": 0.0, "learning_rate": 1.9412442396313366e-05, "loss": 1.8551, "step": 204 }, { "epoch": 0.46697038724373574, "grad_norm": 0.0, "learning_rate": 1.9408795962509016e-05, "loss": 1.7932, "step": 205 }, { "epoch": 0.46924829157175396, "grad_norm": 0.0, "learning_rate": 1.9405140051978056e-05, "loss": 1.7762, "step": 206 }, { "epoch": 0.4715261958997722, "grad_norm": 0.0, "learning_rate": 1.9401474627728788e-05, "loss": 1.9933, "step": 207 }, { "epoch": 0.47380410022779046, "grad_norm": 0.0, "learning_rate": 1.9397799652576727e-05, "loss": 1.9953, "step": 208 }, { "epoch": 0.4760820045558087, "grad_norm": 0.0, "learning_rate": 1.9394115089143356e-05, "loss": 1.7761, "step": 209 }, { "epoch": 0.4783599088838269, "grad_norm": 0.0, "learning_rate": 1.939042089985486e-05, "loss": 1.6961, "step": 210 }, { "epoch": 0.4806378132118451, "grad_norm": 0.0, "learning_rate": 1.9386717046940853e-05, "loss": 1.7874, "step": 211 }, { "epoch": 0.48291571753986334, "grad_norm": 0.0, "learning_rate": 1.9383003492433062e-05, "loss": 2.0687, "step": 212 }, { "epoch": 0.48519362186788156, "grad_norm": 0.0, "learning_rate": 1.937928019816407e-05, "loss": 1.8469, "step": 213 }, { "epoch": 0.4874715261958998, "grad_norm": 0.0, "learning_rate": 1.9375547125765978e-05, "loss": 1.861, "step": 214 }, { "epoch": 0.489749430523918, "grad_norm": 0.0, "learning_rate": 1.9371804236669104e-05, "loss": 1.8835, "step": 215 }, { "epoch": 0.4920273348519362, "grad_norm": 0.0, "learning_rate": 1.9368051492100644e-05, "loss": 1.8315, "step": 216 }, { "epoch": 0.49430523917995445, "grad_norm": 0.0, "learning_rate": 1.9364288853083345e-05, "loss": 1.9031, "step": 217 }, { "epoch": 0.49658314350797267, "grad_norm": 0.0, "learning_rate": 1.9360516280434146e-05, "loss": 1.9127, "step": 218 }, { "epoch": 0.4988610478359909, "grad_norm": 0.0, "learning_rate": 1.9356733734762817e-05, "loss": 1.7706, "step": 219 }, { "epoch": 0.5011389521640092, "grad_norm": 0.0, "learning_rate": 1.935294117647059e-05, "loss": 1.9683, "step": 220 }, { "epoch": 0.5034168564920274, "grad_norm": 0.0, "learning_rate": 1.9349138565748787e-05, "loss": 1.6735, "step": 221 }, { "epoch": 0.5056947608200456, "grad_norm": 0.0, "learning_rate": 1.9345325862577412e-05, "loss": 1.8115, "step": 222 }, { "epoch": 0.5079726651480638, "grad_norm": 0.0, "learning_rate": 1.9341503026723757e-05, "loss": 1.8155, "step": 223 }, { "epoch": 0.510250569476082, "grad_norm": 0.0, "learning_rate": 1.933767001774098e-05, "loss": 1.8103, "step": 224 }, { "epoch": 0.5125284738041003, "grad_norm": 0.0, "learning_rate": 1.9333826794966695e-05, "loss": 1.7367, "step": 225 }, { "epoch": 0.5148063781321185, "grad_norm": 0.0, "learning_rate": 1.9329973317521497e-05, "loss": 1.7631, "step": 226 }, { "epoch": 0.5170842824601367, "grad_norm": 0.0, "learning_rate": 1.9326109544307555e-05, "loss": 1.8696, "step": 227 }, { "epoch": 0.5193621867881549, "grad_norm": 0.0, "learning_rate": 1.9322235434007134e-05, "loss": 1.8613, "step": 228 }, { "epoch": 0.5216400911161732, "grad_norm": 0.0, "learning_rate": 1.9318350945081115e-05, "loss": 1.6882, "step": 229 }, { "epoch": 0.5239179954441914, "grad_norm": 0.0, "learning_rate": 1.9314456035767514e-05, "loss": 1.8036, "step": 230 }, { "epoch": 0.5261958997722096, "grad_norm": 0.0, "learning_rate": 1.9310550664079988e-05, "loss": 1.7296, "step": 231 }, { "epoch": 0.5284738041002278, "grad_norm": 0.0, "learning_rate": 1.930663478780634e-05, "loss": 1.6914, "step": 232 }, { "epoch": 0.530751708428246, "grad_norm": 0.0, "learning_rate": 1.930270836450696e-05, "loss": 1.9104, "step": 233 }, { "epoch": 0.5330296127562643, "grad_norm": 0.0, "learning_rate": 1.9298771351513336e-05, "loss": 1.7403, "step": 234 }, { "epoch": 0.5353075170842825, "grad_norm": 0.0, "learning_rate": 1.9294823705926484e-05, "loss": 1.8471, "step": 235 }, { "epoch": 0.5375854214123007, "grad_norm": 0.0, "learning_rate": 1.9290865384615384e-05, "loss": 1.8498, "step": 236 }, { "epoch": 0.5398633257403189, "grad_norm": 0.0, "learning_rate": 1.9286896344215435e-05, "loss": 1.8415, "step": 237 }, { "epoch": 0.5421412300683371, "grad_norm": 0.0, "learning_rate": 1.9282916541126848e-05, "loss": 1.868, "step": 238 }, { "epoch": 0.5444191343963554, "grad_norm": 0.0, "learning_rate": 1.927892593151305e-05, "loss": 1.8843, "step": 239 }, { "epoch": 0.5466970387243736, "grad_norm": 0.0, "learning_rate": 1.9274924471299096e-05, "loss": 1.9664, "step": 240 }, { "epoch": 0.5489749430523918, "grad_norm": 0.0, "learning_rate": 1.927091211617002e-05, "loss": 1.6881, "step": 241 }, { "epoch": 0.55125284738041, "grad_norm": 0.0, "learning_rate": 1.9266888821569223e-05, "loss": 1.7565, "step": 242 }, { "epoch": 0.5535307517084282, "grad_norm": 0.0, "learning_rate": 1.92628545426968e-05, "loss": 1.7555, "step": 243 }, { "epoch": 0.5558086560364465, "grad_norm": 0.0, "learning_rate": 1.92588092345079e-05, "loss": 1.719, "step": 244 }, { "epoch": 0.5580865603644647, "grad_norm": 0.0, "learning_rate": 1.9254752851711028e-05, "loss": 1.8767, "step": 245 }, { "epoch": 0.5603644646924829, "grad_norm": 0.0, "learning_rate": 1.9250685348766376e-05, "loss": 1.8385, "step": 246 }, { "epoch": 0.5626423690205011, "grad_norm": 0.0, "learning_rate": 1.9246606679884093e-05, "loss": 1.8968, "step": 247 }, { "epoch": 0.5649202733485194, "grad_norm": 0.0, "learning_rate": 1.9242516799022603e-05, "loss": 1.6632, "step": 248 }, { "epoch": 0.5671981776765376, "grad_norm": 0.0, "learning_rate": 1.9238415659886835e-05, "loss": 1.9958, "step": 249 }, { "epoch": 0.5694760820045558, "grad_norm": 0.0, "learning_rate": 1.9234303215926494e-05, "loss": 1.7144, "step": 250 }, { "epoch": 0.571753986332574, "grad_norm": 0.0, "learning_rate": 1.9230179420334307e-05, "loss": 1.8842, "step": 251 }, { "epoch": 0.5740318906605922, "grad_norm": 0.0, "learning_rate": 1.922604422604423e-05, "loss": 1.7184, "step": 252 }, { "epoch": 0.5763097949886105, "grad_norm": 0.0, "learning_rate": 1.9221897585729664e-05, "loss": 1.8718, "step": 253 }, { "epoch": 0.5785876993166287, "grad_norm": 0.0, "learning_rate": 1.9217739451801666e-05, "loss": 1.726, "step": 254 }, { "epoch": 0.5808656036446469, "grad_norm": 0.0, "learning_rate": 1.9213569776407097e-05, "loss": 1.7056, "step": 255 }, { "epoch": 0.5831435079726651, "grad_norm": 0.0, "learning_rate": 1.9209388511426808e-05, "loss": 1.7493, "step": 256 }, { "epoch": 0.5854214123006833, "grad_norm": 0.0, "learning_rate": 1.920519560847379e-05, "loss": 1.8129, "step": 257 }, { "epoch": 0.5876993166287016, "grad_norm": 0.0, "learning_rate": 1.9200991018891298e-05, "loss": 1.8607, "step": 258 }, { "epoch": 0.5899772209567198, "grad_norm": 0.0, "learning_rate": 1.9196774693750973e-05, "loss": 1.7757, "step": 259 }, { "epoch": 0.592255125284738, "grad_norm": 0.0, "learning_rate": 1.9192546583850936e-05, "loss": 1.7289, "step": 260 }, { "epoch": 0.5945330296127562, "grad_norm": 0.0, "learning_rate": 1.918830663971389e-05, "loss": 1.7827, "step": 261 }, { "epoch": 0.5968109339407744, "grad_norm": 0.0, "learning_rate": 1.9184054811585176e-05, "loss": 1.8738, "step": 262 }, { "epoch": 0.5990888382687927, "grad_norm": 0.0, "learning_rate": 1.9179791049430845e-05, "loss": 1.7512, "step": 263 }, { "epoch": 0.6013667425968109, "grad_norm": 0.0, "learning_rate": 1.9175515302935666e-05, "loss": 1.9041, "step": 264 }, { "epoch": 0.6036446469248291, "grad_norm": 0.0, "learning_rate": 1.9171227521501178e-05, "loss": 1.761, "step": 265 }, { "epoch": 0.6059225512528473, "grad_norm": 0.0, "learning_rate": 1.9166927654243663e-05, "loss": 1.8059, "step": 266 }, { "epoch": 0.6082004555808656, "grad_norm": 0.0, "learning_rate": 1.916261564999216e-05, "loss": 1.8029, "step": 267 }, { "epoch": 0.6104783599088838, "grad_norm": 0.0, "learning_rate": 1.915829145728643e-05, "loss": 1.7574, "step": 268 }, { "epoch": 0.6127562642369021, "grad_norm": 0.0, "learning_rate": 1.9153955024374905e-05, "loss": 1.7662, "step": 269 }, { "epoch": 0.6150341685649203, "grad_norm": 0.0, "learning_rate": 1.91496062992126e-05, "loss": 1.8448, "step": 270 }, { "epoch": 0.6173120728929385, "grad_norm": 0.0, "learning_rate": 1.9145245229459078e-05, "loss": 1.6852, "step": 271 }, { "epoch": 0.6195899772209568, "grad_norm": 0.0, "learning_rate": 1.9140871762476314e-05, "loss": 1.773, "step": 272 }, { "epoch": 0.621867881548975, "grad_norm": 0.0, "learning_rate": 1.913648584532659e-05, "loss": 1.8345, "step": 273 }, { "epoch": 0.6241457858769932, "grad_norm": 0.0, "learning_rate": 1.9132087424770354e-05, "loss": 1.7964, "step": 274 }, { "epoch": 0.6264236902050114, "grad_norm": 0.0, "learning_rate": 1.912767644726408e-05, "loss": 1.8436, "step": 275 }, { "epoch": 0.6287015945330297, "grad_norm": 0.0, "learning_rate": 1.912325285895807e-05, "loss": 1.9349, "step": 276 }, { "epoch": 0.6309794988610479, "grad_norm": 0.0, "learning_rate": 1.911881660569429e-05, "loss": 1.7675, "step": 277 }, { "epoch": 0.6332574031890661, "grad_norm": 0.0, "learning_rate": 1.911436763300414e-05, "loss": 1.8384, "step": 278 }, { "epoch": 0.6355353075170843, "grad_norm": 0.0, "learning_rate": 1.910990588610624e-05, "loss": 1.8475, "step": 279 }, { "epoch": 0.6378132118451025, "grad_norm": 0.0, "learning_rate": 1.9105431309904158e-05, "loss": 1.852, "step": 280 }, { "epoch": 0.6400911161731208, "grad_norm": 0.0, "learning_rate": 1.9100943848984165e-05, "loss": 1.7185, "step": 281 }, { "epoch": 0.642369020501139, "grad_norm": 0.0, "learning_rate": 1.9096443447612948e-05, "loss": 1.7923, "step": 282 }, { "epoch": 0.6446469248291572, "grad_norm": 0.0, "learning_rate": 1.9091930049735282e-05, "loss": 1.9492, "step": 283 }, { "epoch": 0.6469248291571754, "grad_norm": 0.0, "learning_rate": 1.9087403598971723e-05, "loss": 1.9283, "step": 284 }, { "epoch": 0.6492027334851936, "grad_norm": 0.0, "learning_rate": 1.9082864038616254e-05, "loss": 1.8545, "step": 285 }, { "epoch": 0.6514806378132119, "grad_norm": 0.0, "learning_rate": 1.9078311311633904e-05, "loss": 1.8646, "step": 286 }, { "epoch": 0.6537585421412301, "grad_norm": 0.0, "learning_rate": 1.9073745360658385e-05, "loss": 1.8924, "step": 287 }, { "epoch": 0.6560364464692483, "grad_norm": 0.0, "learning_rate": 1.906916612798966e-05, "loss": 1.8022, "step": 288 }, { "epoch": 0.6583143507972665, "grad_norm": 0.0, "learning_rate": 1.9064573555591522e-05, "loss": 1.8965, "step": 289 }, { "epoch": 0.6605922551252847, "grad_norm": 0.0, "learning_rate": 1.9059967585089144e-05, "loss": 1.9303, "step": 290 }, { "epoch": 0.662870159453303, "grad_norm": 0.0, "learning_rate": 1.9055348157766597e-05, "loss": 1.8566, "step": 291 }, { "epoch": 0.6651480637813212, "grad_norm": 0.0, "learning_rate": 1.9050715214564374e-05, "loss": 1.7818, "step": 292 }, { "epoch": 0.6674259681093394, "grad_norm": 0.0, "learning_rate": 1.904606869607684e-05, "loss": 1.9262, "step": 293 }, { "epoch": 0.6697038724373576, "grad_norm": 0.0, "learning_rate": 1.9041408542549725e-05, "loss": 1.6944, "step": 294 }, { "epoch": 0.6719817767653758, "grad_norm": 0.0, "learning_rate": 1.9036734693877552e-05, "loss": 1.7894, "step": 295 }, { "epoch": 0.6742596810933941, "grad_norm": 0.0, "learning_rate": 1.9032047089601048e-05, "loss": 1.9153, "step": 296 }, { "epoch": 0.6765375854214123, "grad_norm": 0.0, "learning_rate": 1.9027345668904537e-05, "loss": 1.8007, "step": 297 }, { "epoch": 0.6788154897494305, "grad_norm": 0.0, "learning_rate": 1.9022630370613315e-05, "loss": 1.8416, "step": 298 }, { "epoch": 0.6810933940774487, "grad_norm": 0.0, "learning_rate": 1.9017901133191004e-05, "loss": 1.6625, "step": 299 }, { "epoch": 0.683371298405467, "grad_norm": 0.0, "learning_rate": 1.9013157894736845e-05, "loss": 1.8183, "step": 300 }, { "epoch": 0.6856492027334852, "grad_norm": 0.0, "learning_rate": 1.9008400592983036e-05, "loss": 1.8195, "step": 301 }, { "epoch": 0.6879271070615034, "grad_norm": 0.0, "learning_rate": 1.9003629165291983e-05, "loss": 1.7959, "step": 302 }, { "epoch": 0.6902050113895216, "grad_norm": 0.0, "learning_rate": 1.899884354865356e-05, "loss": 1.7854, "step": 303 }, { "epoch": 0.6924829157175398, "grad_norm": 0.0, "learning_rate": 1.899404367968233e-05, "loss": 1.7821, "step": 304 }, { "epoch": 0.6947608200455581, "grad_norm": 0.0, "learning_rate": 1.8989229494614746e-05, "loss": 1.9069, "step": 305 }, { "epoch": 0.6970387243735763, "grad_norm": 0.0, "learning_rate": 1.8984400929306342e-05, "loss": 1.7895, "step": 306 }, { "epoch": 0.6993166287015945, "grad_norm": 0.0, "learning_rate": 1.8979557919228855e-05, "loss": 1.818, "step": 307 }, { "epoch": 0.7015945330296127, "grad_norm": 0.0, "learning_rate": 1.897470039946738e-05, "loss": 1.8809, "step": 308 }, { "epoch": 0.7038724373576309, "grad_norm": 0.0, "learning_rate": 1.8969828304717456e-05, "loss": 1.6762, "step": 309 }, { "epoch": 0.7061503416856492, "grad_norm": 0.0, "learning_rate": 1.8964941569282137e-05, "loss": 1.7813, "step": 310 }, { "epoch": 0.7084282460136674, "grad_norm": 0.0, "learning_rate": 1.896004012706905e-05, "loss": 1.8738, "step": 311 }, { "epoch": 0.7107061503416856, "grad_norm": 0.0, "learning_rate": 1.8955123911587412e-05, "loss": 1.7677, "step": 312 }, { "epoch": 0.7129840546697038, "grad_norm": 0.0, "learning_rate": 1.8950192855945e-05, "loss": 1.7614, "step": 313 }, { "epoch": 0.715261958997722, "grad_norm": 0.0, "learning_rate": 1.894524689284515e-05, "loss": 1.9156, "step": 314 }, { "epoch": 0.7175398633257403, "grad_norm": 0.0, "learning_rate": 1.8940285954583686e-05, "loss": 1.6247, "step": 315 }, { "epoch": 0.7198177676537585, "grad_norm": 0.0, "learning_rate": 1.8935309973045825e-05, "loss": 1.7746, "step": 316 }, { "epoch": 0.7220956719817767, "grad_norm": 0.0, "learning_rate": 1.8930318879703055e-05, "loss": 1.6941, "step": 317 }, { "epoch": 0.724373576309795, "grad_norm": 0.0, "learning_rate": 1.8925312605610003e-05, "loss": 1.7349, "step": 318 }, { "epoch": 0.7266514806378133, "grad_norm": 0.0, "learning_rate": 1.8920291081401257e-05, "loss": 1.786, "step": 319 }, { "epoch": 0.7289293849658315, "grad_norm": 0.0, "learning_rate": 1.8915254237288138e-05, "loss": 1.87, "step": 320 }, { "epoch": 0.7312072892938497, "grad_norm": 0.0, "learning_rate": 1.891020200305551e-05, "loss": 1.8122, "step": 321 }, { "epoch": 0.7334851936218679, "grad_norm": 0.0, "learning_rate": 1.8905134308058486e-05, "loss": 1.9103, "step": 322 }, { "epoch": 0.7357630979498861, "grad_norm": 0.0, "learning_rate": 1.8900051081219138e-05, "loss": 1.7457, "step": 323 }, { "epoch": 0.7380410022779044, "grad_norm": 0.0, "learning_rate": 1.8894952251023194e-05, "loss": 1.7686, "step": 324 }, { "epoch": 0.7403189066059226, "grad_norm": 0.0, "learning_rate": 1.8889837745516653e-05, "loss": 1.7409, "step": 325 }, { "epoch": 0.7425968109339408, "grad_norm": 0.0, "learning_rate": 1.8884707492302432e-05, "loss": 1.9199, "step": 326 }, { "epoch": 0.744874715261959, "grad_norm": 0.0, "learning_rate": 1.8879561418536924e-05, "loss": 1.844, "step": 327 }, { "epoch": 0.7471526195899773, "grad_norm": 0.0, "learning_rate": 1.8874399450926562e-05, "loss": 1.6653, "step": 328 }, { "epoch": 0.7494305239179955, "grad_norm": 0.0, "learning_rate": 1.886922151572435e-05, "loss": 1.9141, "step": 329 }, { "epoch": 0.7517084282460137, "grad_norm": 0.0, "learning_rate": 1.8864027538726336e-05, "loss": 1.8721, "step": 330 }, { "epoch": 0.7539863325740319, "grad_norm": 0.0, "learning_rate": 1.885881744526806e-05, "loss": 1.6069, "step": 331 }, { "epoch": 0.7562642369020501, "grad_norm": 0.0, "learning_rate": 1.8853591160221e-05, "loss": 1.9076, "step": 332 }, { "epoch": 0.7585421412300684, "grad_norm": 0.0, "learning_rate": 1.8848348607988935e-05, "loss": 1.7952, "step": 333 }, { "epoch": 0.7608200455580866, "grad_norm": 0.0, "learning_rate": 1.884308971250433e-05, "loss": 1.6736, "step": 334 }, { "epoch": 0.7630979498861048, "grad_norm": 0.0, "learning_rate": 1.8837814397224635e-05, "loss": 1.9223, "step": 335 }, { "epoch": 0.765375854214123, "grad_norm": 0.0, "learning_rate": 1.8832522585128564e-05, "loss": 1.7733, "step": 336 }, { "epoch": 0.7676537585421412, "grad_norm": 0.0, "learning_rate": 1.8827214198712374e-05, "loss": 1.7597, "step": 337 }, { "epoch": 0.7699316628701595, "grad_norm": 0.0, "learning_rate": 1.882188915998606e-05, "loss": 1.6681, "step": 338 }, { "epoch": 0.7722095671981777, "grad_norm": 0.0, "learning_rate": 1.8816547390469545e-05, "loss": 1.6973, "step": 339 }, { "epoch": 0.7744874715261959, "grad_norm": 0.0, "learning_rate": 1.8811188811188814e-05, "loss": 1.7656, "step": 340 }, { "epoch": 0.7767653758542141, "grad_norm": 0.0, "learning_rate": 1.880581334267204e-05, "loss": 2.0147, "step": 341 }, { "epoch": 0.7790432801822323, "grad_norm": 0.0, "learning_rate": 1.8800420904945635e-05, "loss": 1.7254, "step": 342 }, { "epoch": 0.7813211845102506, "grad_norm": 0.0, "learning_rate": 1.87950114175303e-05, "loss": 1.864, "step": 343 }, { "epoch": 0.7835990888382688, "grad_norm": 0.0, "learning_rate": 1.8789584799437017e-05, "loss": 1.7519, "step": 344 }, { "epoch": 0.785876993166287, "grad_norm": 0.0, "learning_rate": 1.8784140969162997e-05, "loss": 1.8621, "step": 345 }, { "epoch": 0.7881548974943052, "grad_norm": 0.0, "learning_rate": 1.8778679844687614e-05, "loss": 1.8879, "step": 346 }, { "epoch": 0.7904328018223234, "grad_norm": 0.0, "learning_rate": 1.8773201343468273e-05, "loss": 1.6798, "step": 347 }, { "epoch": 0.7927107061503417, "grad_norm": 0.0, "learning_rate": 1.8767705382436264e-05, "loss": 1.9381, "step": 348 }, { "epoch": 0.7949886104783599, "grad_norm": 0.0, "learning_rate": 1.8762191877992554e-05, "loss": 2.0407, "step": 349 }, { "epoch": 0.7972665148063781, "grad_norm": 0.0, "learning_rate": 1.8756660746003553e-05, "loss": 1.7358, "step": 350 }, { "epoch": 0.7995444191343963, "grad_norm": 0.0, "learning_rate": 1.8751111901796833e-05, "loss": 1.8904, "step": 351 }, { "epoch": 0.8018223234624146, "grad_norm": 0.0, "learning_rate": 1.8745545260156807e-05, "loss": 1.8213, "step": 352 }, { "epoch": 0.8041002277904328, "grad_norm": 0.0, "learning_rate": 1.8739960735320364e-05, "loss": 1.9779, "step": 353 }, { "epoch": 0.806378132118451, "grad_norm": 0.0, "learning_rate": 1.873435824097247e-05, "loss": 1.7869, "step": 354 }, { "epoch": 0.8086560364464692, "grad_norm": 0.0, "learning_rate": 1.872873769024172e-05, "loss": 1.7634, "step": 355 }, { "epoch": 0.8109339407744874, "grad_norm": 0.0, "learning_rate": 1.872309899569584e-05, "loss": 1.7391, "step": 356 }, { "epoch": 0.8132118451025057, "grad_norm": 0.0, "learning_rate": 1.8717442069337165e-05, "loss": 1.7673, "step": 357 }, { "epoch": 0.8154897494305239, "grad_norm": 0.0, "learning_rate": 1.8711766822598056e-05, "loss": 1.839, "step": 358 }, { "epoch": 0.8177676537585421, "grad_norm": 0.0, "learning_rate": 1.870607316633628e-05, "loss": 1.693, "step": 359 }, { "epoch": 0.8200455580865603, "grad_norm": 0.0, "learning_rate": 1.8700361010830327e-05, "loss": 1.8105, "step": 360 }, { "epoch": 0.8223234624145785, "grad_norm": 0.0, "learning_rate": 1.8694630265774728e-05, "loss": 1.7951, "step": 361 }, { "epoch": 0.8246013667425968, "grad_norm": 0.0, "learning_rate": 1.8688880840275266e-05, "loss": 1.6912, "step": 362 }, { "epoch": 0.826879271070615, "grad_norm": 0.0, "learning_rate": 1.8683112642844187e-05, "loss": 1.7662, "step": 363 }, { "epoch": 0.8291571753986332, "grad_norm": 0.0, "learning_rate": 1.8677325581395348e-05, "loss": 1.9301, "step": 364 }, { "epoch": 0.8314350797266514, "grad_norm": 0.0, "learning_rate": 1.867151956323931e-05, "loss": 1.7746, "step": 365 }, { "epoch": 0.8337129840546698, "grad_norm": 0.0, "learning_rate": 1.8665694495078384e-05, "loss": 1.8972, "step": 366 }, { "epoch": 0.835990888382688, "grad_norm": 0.0, "learning_rate": 1.8659850283001647e-05, "loss": 1.9447, "step": 367 }, { "epoch": 0.8382687927107062, "grad_norm": 0.0, "learning_rate": 1.8653986832479884e-05, "loss": 1.7172, "step": 368 }, { "epoch": 0.8405466970387244, "grad_norm": 0.0, "learning_rate": 1.8648104048360505e-05, "loss": 1.8894, "step": 369 }, { "epoch": 0.8428246013667426, "grad_norm": 0.0, "learning_rate": 1.8642201834862387e-05, "loss": 1.8513, "step": 370 }, { "epoch": 0.8451025056947609, "grad_norm": 0.0, "learning_rate": 1.8636280095570668e-05, "loss": 1.7918, "step": 371 }, { "epoch": 0.8473804100227791, "grad_norm": 0.0, "learning_rate": 1.863033873343152e-05, "loss": 1.7627, "step": 372 }, { "epoch": 0.8496583143507973, "grad_norm": 0.0, "learning_rate": 1.8624377650746822e-05, "loss": 1.9015, "step": 373 }, { "epoch": 0.8519362186788155, "grad_norm": 0.0, "learning_rate": 1.8618396749168824e-05, "loss": 1.8908, "step": 374 }, { "epoch": 0.8542141230068337, "grad_norm": 0.0, "learning_rate": 1.8612395929694728e-05, "loss": 1.6868, "step": 375 }, { "epoch": 0.856492027334852, "grad_norm": 0.0, "learning_rate": 1.8606375092661234e-05, "loss": 1.8831, "step": 376 }, { "epoch": 0.8587699316628702, "grad_norm": 0.0, "learning_rate": 1.8600334137739e-05, "loss": 1.8061, "step": 377 }, { "epoch": 0.8610478359908884, "grad_norm": 0.0, "learning_rate": 1.8594272963927112e-05, "loss": 1.753, "step": 378 }, { "epoch": 0.8633257403189066, "grad_norm": 0.0, "learning_rate": 1.8588191469547407e-05, "loss": 1.8116, "step": 379 }, { "epoch": 0.8656036446469249, "grad_norm": 0.0, "learning_rate": 1.858208955223881e-05, "loss": 1.8972, "step": 380 }, { "epoch": 0.8678815489749431, "grad_norm": 0.0, "learning_rate": 1.85759671089516e-05, "loss": 1.769, "step": 381 }, { "epoch": 0.8701594533029613, "grad_norm": 0.0, "learning_rate": 1.85698240359416e-05, "loss": 1.677, "step": 382 }, { "epoch": 0.8724373576309795, "grad_norm": 0.0, "learning_rate": 1.85636602287643e-05, "loss": 1.8313, "step": 383 }, { "epoch": 0.8747152619589977, "grad_norm": 0.0, "learning_rate": 1.8557475582268972e-05, "loss": 1.8365, "step": 384 }, { "epoch": 0.876993166287016, "grad_norm": 0.0, "learning_rate": 1.8551269990592663e-05, "loss": 1.7318, "step": 385 }, { "epoch": 0.8792710706150342, "grad_norm": 0.0, "learning_rate": 1.8545043347154167e-05, "loss": 1.7535, "step": 386 }, { "epoch": 0.8815489749430524, "grad_norm": 0.0, "learning_rate": 1.8538795544647917e-05, "loss": 1.8066, "step": 387 }, { "epoch": 0.8838268792710706, "grad_norm": 0.0, "learning_rate": 1.8532526475037826e-05, "loss": 1.8251, "step": 388 }, { "epoch": 0.8861047835990888, "grad_norm": 0.0, "learning_rate": 1.8526236029551052e-05, "loss": 1.7838, "step": 389 }, { "epoch": 0.8883826879271071, "grad_norm": 0.0, "learning_rate": 1.851992409867173e-05, "loss": 1.9, "step": 390 }, { "epoch": 0.8906605922551253, "grad_norm": 0.0, "learning_rate": 1.8513590572134575e-05, "loss": 1.72, "step": 391 }, { "epoch": 0.8929384965831435, "grad_norm": 0.0, "learning_rate": 1.8507235338918506e-05, "loss": 1.7089, "step": 392 }, { "epoch": 0.8952164009111617, "grad_norm": 0.0, "learning_rate": 1.8500858287240127e-05, "loss": 1.864, "step": 393 }, { "epoch": 0.89749430523918, "grad_norm": 0.0, "learning_rate": 1.8494459304547195e-05, "loss": 1.8561, "step": 394 }, { "epoch": 0.8997722095671982, "grad_norm": 0.0, "learning_rate": 1.8488038277511965e-05, "loss": 1.9116, "step": 395 }, { "epoch": 0.9020501138952164, "grad_norm": 0.0, "learning_rate": 1.8481595092024542e-05, "loss": 1.895, "step": 396 }, { "epoch": 0.9043280182232346, "grad_norm": 0.0, "learning_rate": 1.8475129633186098e-05, "loss": 1.7424, "step": 397 }, { "epoch": 0.9066059225512528, "grad_norm": 0.0, "learning_rate": 1.846864178530204e-05, "loss": 1.7934, "step": 398 }, { "epoch": 0.908883826879271, "grad_norm": 0.0, "learning_rate": 1.8462131431875122e-05, "loss": 1.7733, "step": 399 }, { "epoch": 0.9111617312072893, "grad_norm": 0.0, "learning_rate": 1.8455598455598454e-05, "loss": 1.8057, "step": 400 }, { "epoch": 0.9134396355353075, "grad_norm": 0.0, "learning_rate": 1.8449042738348483e-05, "loss": 1.7436, "step": 401 }, { "epoch": 0.9157175398633257, "grad_norm": 0.0, "learning_rate": 1.844246416117784e-05, "loss": 1.842, "step": 402 }, { "epoch": 0.9179954441913439, "grad_norm": 0.0, "learning_rate": 1.8435862604308172e-05, "loss": 1.9916, "step": 403 }, { "epoch": 0.9202733485193622, "grad_norm": 0.0, "learning_rate": 1.842923794712286e-05, "loss": 1.8973, "step": 404 }, { "epoch": 0.9225512528473804, "grad_norm": 0.0, "learning_rate": 1.842259006815969e-05, "loss": 1.8911, "step": 405 }, { "epoch": 0.9248291571753986, "grad_norm": 0.0, "learning_rate": 1.8415918845103393e-05, "loss": 1.8594, "step": 406 }, { "epoch": 0.9271070615034168, "grad_norm": 0.0, "learning_rate": 1.840922415477819e-05, "loss": 1.8265, "step": 407 }, { "epoch": 0.929384965831435, "grad_norm": 0.0, "learning_rate": 1.8402505873140175e-05, "loss": 1.8717, "step": 408 }, { "epoch": 0.9316628701594533, "grad_norm": 0.0, "learning_rate": 1.8395763875269664e-05, "loss": 1.857, "step": 409 }, { "epoch": 0.9339407744874715, "grad_norm": 0.0, "learning_rate": 1.838899803536346e-05, "loss": 1.8825, "step": 410 }, { "epoch": 0.9362186788154897, "grad_norm": 0.0, "learning_rate": 1.8382208226727024e-05, "loss": 1.7305, "step": 411 }, { "epoch": 0.9384965831435079, "grad_norm": 0.0, "learning_rate": 1.8375394321766563e-05, "loss": 1.9017, "step": 412 }, { "epoch": 0.9407744874715261, "grad_norm": 0.0, "learning_rate": 1.836855619198104e-05, "loss": 2.0254, "step": 413 }, { "epoch": 0.9430523917995444, "grad_norm": 0.0, "learning_rate": 1.83616937079541e-05, "loss": 1.9443, "step": 414 }, { "epoch": 0.9453302961275627, "grad_norm": 0.0, "learning_rate": 1.835480673934589e-05, "loss": 1.9315, "step": 415 }, { "epoch": 0.9476082004555809, "grad_norm": 0.0, "learning_rate": 1.834789515488483e-05, "loss": 1.8397, "step": 416 }, { "epoch": 0.9498861047835991, "grad_norm": 0.0, "learning_rate": 1.834095882235926e-05, "loss": 1.6211, "step": 417 }, { "epoch": 0.9521640091116174, "grad_norm": 0.0, "learning_rate": 1.8333997608609008e-05, "loss": 1.8534, "step": 418 }, { "epoch": 0.9544419134396356, "grad_norm": 0.0, "learning_rate": 1.832701137951687e-05, "loss": 1.6394, "step": 419 }, { "epoch": 0.9567198177676538, "grad_norm": 0.0, "learning_rate": 1.832e-05, "loss": 1.7309, "step": 420 }, { "epoch": 0.958997722095672, "grad_norm": 0.0, "learning_rate": 1.8312963334001203e-05, "loss": 1.7431, "step": 421 }, { "epoch": 0.9612756264236902, "grad_norm": 0.0, "learning_rate": 1.8305901244480132e-05, "loss": 1.6638, "step": 422 }, { "epoch": 0.9635535307517085, "grad_norm": 0.0, "learning_rate": 1.8298813593404385e-05, "loss": 1.6408, "step": 423 }, { "epoch": 0.9658314350797267, "grad_norm": 0.0, "learning_rate": 1.8291700241740533e-05, "loss": 1.8247, "step": 424 }, { "epoch": 0.9681093394077449, "grad_norm": 0.0, "learning_rate": 1.8284561049445004e-05, "loss": 1.8894, "step": 425 }, { "epoch": 0.9703872437357631, "grad_norm": 0.0, "learning_rate": 1.8277395875454913e-05, "loss": 1.7693, "step": 426 }, { "epoch": 0.9726651480637813, "grad_norm": 0.0, "learning_rate": 1.827020457767875e-05, "loss": 1.9301, "step": 427 }, { "epoch": 0.9749430523917996, "grad_norm": 0.0, "learning_rate": 1.8262987012987013e-05, "loss": 1.7352, "step": 428 }, { "epoch": 0.9772209567198178, "grad_norm": 0.0, "learning_rate": 1.8255743037202683e-05, "loss": 1.7201, "step": 429 }, { "epoch": 0.979498861047836, "grad_norm": 0.0, "learning_rate": 1.824847250509165e-05, "loss": 1.7524, "step": 430 }, { "epoch": 0.9817767653758542, "grad_norm": 0.0, "learning_rate": 1.824117527035299e-05, "loss": 1.9939, "step": 431 }, { "epoch": 0.9840546697038725, "grad_norm": 0.0, "learning_rate": 1.823385118560916e-05, "loss": 1.7163, "step": 432 }, { "epoch": 0.9863325740318907, "grad_norm": 0.0, "learning_rate": 1.8226500102396067e-05, "loss": 1.8772, "step": 433 }, { "epoch": 0.9886104783599089, "grad_norm": 0.0, "learning_rate": 1.821912187115306e-05, "loss": 1.7732, "step": 434 }, { "epoch": 0.9908883826879271, "grad_norm": 0.0, "learning_rate": 1.8211716341212746e-05, "loss": 1.9645, "step": 435 }, { "epoch": 0.9931662870159453, "grad_norm": 0.0, "learning_rate": 1.8204283360790775e-05, "loss": 1.8126, "step": 436 }, { "epoch": 0.9954441913439636, "grad_norm": 0.0, "learning_rate": 1.819682277697545e-05, "loss": 1.933, "step": 437 }, { "epoch": 0.9977220956719818, "grad_norm": 0.0, "learning_rate": 1.818933443571724e-05, "loss": 2.0602, "step": 438 }, { "epoch": 1.0, "grad_norm": 0.0, "learning_rate": 1.8181818181818182e-05, "loss": 2.1612, "step": 439 }, { "epoch": 1.0022779043280183, "grad_norm": 0.0, "learning_rate": 1.8174273858921162e-05, "loss": 1.894, "step": 440 }, { "epoch": 1.0045558086560364, "grad_norm": 0.0, "learning_rate": 1.8166701309499066e-05, "loss": 1.7733, "step": 441 }, { "epoch": 1.0068337129840548, "grad_norm": 0.0, "learning_rate": 1.8159100374843813e-05, "loss": 1.8797, "step": 442 }, { "epoch": 1.0091116173120729, "grad_norm": 0.0, "learning_rate": 1.815147089505529e-05, "loss": 1.851, "step": 443 }, { "epoch": 1.0113895216400912, "grad_norm": 0.0, "learning_rate": 1.8143812709030104e-05, "loss": 1.8259, "step": 444 }, { "epoch": 1.0136674259681093, "grad_norm": 0.0, "learning_rate": 1.813612565445026e-05, "loss": 1.8208, "step": 445 }, { "epoch": 1.0159453302961277, "grad_norm": 0.0, "learning_rate": 1.812840956777172e-05, "loss": 1.8305, "step": 446 }, { "epoch": 1.0182232346241458, "grad_norm": 0.0, "learning_rate": 1.812066428421274e-05, "loss": 1.8653, "step": 447 }, { "epoch": 1.020501138952164, "grad_norm": 0.0, "learning_rate": 1.811288963774221e-05, "loss": 1.6481, "step": 448 }, { "epoch": 1.0227790432801822, "grad_norm": 0.0, "learning_rate": 1.8105085461067737e-05, "loss": 1.9251, "step": 449 }, { "epoch": 1.0250569476082005, "grad_norm": 0.0, "learning_rate": 1.809725158562368e-05, "loss": 1.8392, "step": 450 }, { "epoch": 1.0273348519362187, "grad_norm": 0.0, "learning_rate": 1.808938784155899e-05, "loss": 1.6409, "step": 451 }, { "epoch": 1.029612756264237, "grad_norm": 0.0, "learning_rate": 1.808149405772496e-05, "loss": 1.7448, "step": 452 }, { "epoch": 1.031890660592255, "grad_norm": 0.0, "learning_rate": 1.807357006166277e-05, "loss": 1.687, "step": 453 }, { "epoch": 1.0341685649202734, "grad_norm": 0.0, "learning_rate": 1.806561567959097e-05, "loss": 1.798, "step": 454 }, { "epoch": 1.0364464692482915, "grad_norm": 0.0, "learning_rate": 1.8057630736392745e-05, "loss": 1.9294, "step": 455 }, { "epoch": 1.0387243735763099, "grad_norm": 0.0, "learning_rate": 1.804961505560308e-05, "loss": 1.7994, "step": 456 }, { "epoch": 1.041002277904328, "grad_norm": 0.0, "learning_rate": 1.804156845939576e-05, "loss": 1.8258, "step": 457 }, { "epoch": 1.0432801822323463, "grad_norm": 0.0, "learning_rate": 1.8033490768570207e-05, "loss": 1.7976, "step": 458 }, { "epoch": 1.0455580865603644, "grad_norm": 0.0, "learning_rate": 1.802538180253818e-05, "loss": 1.7135, "step": 459 }, { "epoch": 1.0478359908883828, "grad_norm": 0.0, "learning_rate": 1.8017241379310346e-05, "loss": 1.7839, "step": 460 }, { "epoch": 1.0501138952164009, "grad_norm": 0.0, "learning_rate": 1.800906931548262e-05, "loss": 1.9235, "step": 461 }, { "epoch": 1.0523917995444192, "grad_norm": 0.0, "learning_rate": 1.8000865426222415e-05, "loss": 1.8343, "step": 462 }, { "epoch": 1.0546697038724373, "grad_norm": 0.0, "learning_rate": 1.7992629525254716e-05, "loss": 1.7827, "step": 463 }, { "epoch": 1.0569476082004556, "grad_norm": 0.0, "learning_rate": 1.798436142484796e-05, "loss": 1.6388, "step": 464 }, { "epoch": 1.0592255125284737, "grad_norm": 0.0, "learning_rate": 1.7976060935799784e-05, "loss": 1.8052, "step": 465 }, { "epoch": 1.061503416856492, "grad_norm": 0.0, "learning_rate": 1.796772786742259e-05, "loss": 1.758, "step": 466 }, { "epoch": 1.0637813211845102, "grad_norm": 0.0, "learning_rate": 1.795936202752895e-05, "loss": 1.6495, "step": 467 }, { "epoch": 1.0660592255125285, "grad_norm": 0.0, "learning_rate": 1.7950963222416815e-05, "loss": 1.8546, "step": 468 }, { "epoch": 1.0683371298405466, "grad_norm": 0.0, "learning_rate": 1.7942531256854574e-05, "loss": 1.844, "step": 469 }, { "epoch": 1.070615034168565, "grad_norm": 0.0, "learning_rate": 1.7934065934065938e-05, "loss": 1.8132, "step": 470 }, { "epoch": 1.072892938496583, "grad_norm": 0.0, "learning_rate": 1.7925567055714605e-05, "loss": 1.6304, "step": 471 }, { "epoch": 1.0751708428246014, "grad_norm": 0.0, "learning_rate": 1.791703442188879e-05, "loss": 1.6719, "step": 472 }, { "epoch": 1.0774487471526195, "grad_norm": 0.0, "learning_rate": 1.7908467831085566e-05, "loss": 1.7527, "step": 473 }, { "epoch": 1.0797266514806378, "grad_norm": 0.0, "learning_rate": 1.789986708019495e-05, "loss": 1.8619, "step": 474 }, { "epoch": 1.082004555808656, "grad_norm": 0.0, "learning_rate": 1.7891231964483905e-05, "loss": 1.759, "step": 475 }, { "epoch": 1.0842824601366743, "grad_norm": 0.0, "learning_rate": 1.7882562277580073e-05, "loss": 1.8707, "step": 476 }, { "epoch": 1.0865603644646924, "grad_norm": 0.0, "learning_rate": 1.7873857811455315e-05, "loss": 1.8425, "step": 477 }, { "epoch": 1.0888382687927107, "grad_norm": 0.0, "learning_rate": 1.786511835640911e-05, "loss": 1.862, "step": 478 }, { "epoch": 1.0911161731207288, "grad_norm": 0.0, "learning_rate": 1.785634370105169e-05, "loss": 1.7945, "step": 479 }, { "epoch": 1.0933940774487472, "grad_norm": 0.0, "learning_rate": 1.7847533632286997e-05, "loss": 1.775, "step": 480 }, { "epoch": 1.0956719817767653, "grad_norm": 0.0, "learning_rate": 1.783868793529544e-05, "loss": 1.9576, "step": 481 }, { "epoch": 1.0979498861047836, "grad_norm": 0.0, "learning_rate": 1.7829806393516437e-05, "loss": 1.6999, "step": 482 }, { "epoch": 1.1002277904328017, "grad_norm": 0.0, "learning_rate": 1.7820888788630725e-05, "loss": 1.9236, "step": 483 }, { "epoch": 1.10250569476082, "grad_norm": 0.0, "learning_rate": 1.7811934900542496e-05, "loss": 1.8691, "step": 484 }, { "epoch": 1.1047835990888384, "grad_norm": 0.0, "learning_rate": 1.780294450736127e-05, "loss": 1.7682, "step": 485 }, { "epoch": 1.1070615034168565, "grad_norm": 0.0, "learning_rate": 1.7793917385383568e-05, "loss": 1.7202, "step": 486 }, { "epoch": 1.1093394077448746, "grad_norm": 0.0, "learning_rate": 1.7784853309074372e-05, "loss": 1.9333, "step": 487 }, { "epoch": 1.111617312072893, "grad_norm": 0.0, "learning_rate": 1.7775752051048316e-05, "loss": 1.6755, "step": 488 }, { "epoch": 1.1138952164009113, "grad_norm": 0.0, "learning_rate": 1.7766613382050697e-05, "loss": 1.7697, "step": 489 }, { "epoch": 1.1161731207289294, "grad_norm": 0.0, "learning_rate": 1.7757437070938218e-05, "loss": 1.7659, "step": 490 }, { "epoch": 1.1184510250569477, "grad_norm": 0.0, "learning_rate": 1.7748222884659486e-05, "loss": 2.0891, "step": 491 }, { "epoch": 1.1207289293849658, "grad_norm": 0.0, "learning_rate": 1.7738970588235293e-05, "loss": 1.7104, "step": 492 }, { "epoch": 1.1230068337129842, "grad_norm": 0.0, "learning_rate": 1.7729679944738662e-05, "loss": 1.9044, "step": 493 }, { "epoch": 1.1252847380410023, "grad_norm": 0.0, "learning_rate": 1.7720350715274574e-05, "loss": 1.8321, "step": 494 }, { "epoch": 1.1275626423690206, "grad_norm": 0.0, "learning_rate": 1.771098265895954e-05, "loss": 1.7164, "step": 495 }, { "epoch": 1.1298405466970387, "grad_norm": 0.0, "learning_rate": 1.7701575532900836e-05, "loss": 1.9128, "step": 496 }, { "epoch": 1.132118451025057, "grad_norm": 0.0, "learning_rate": 1.7692129092175527e-05, "loss": 1.7434, "step": 497 }, { "epoch": 1.1343963553530751, "grad_norm": 0.0, "learning_rate": 1.7682643089809216e-05, "loss": 1.7226, "step": 498 }, { "epoch": 1.1366742596810935, "grad_norm": 0.0, "learning_rate": 1.767311727675449e-05, "loss": 1.7137, "step": 499 }, { "epoch": 1.1389521640091116, "grad_norm": 0.0, "learning_rate": 1.7663551401869158e-05, "loss": 1.8651, "step": 500 }, { "epoch": 1.14123006833713, "grad_norm": 0.0, "learning_rate": 1.765394521189417e-05, "loss": 1.8598, "step": 501 }, { "epoch": 1.143507972665148, "grad_norm": 0.0, "learning_rate": 1.7644298451431255e-05, "loss": 1.7573, "step": 502 }, { "epoch": 1.1457858769931664, "grad_norm": 0.0, "learning_rate": 1.7634610862920293e-05, "loss": 1.9351, "step": 503 }, { "epoch": 1.1480637813211845, "grad_norm": 0.0, "learning_rate": 1.76248821866164e-05, "loss": 1.9187, "step": 504 }, { "epoch": 1.1503416856492028, "grad_norm": 0.0, "learning_rate": 1.761511216056671e-05, "loss": 1.8173, "step": 505 }, { "epoch": 1.152619589977221, "grad_norm": 0.0, "learning_rate": 1.7605300520586845e-05, "loss": 1.6676, "step": 506 }, { "epoch": 1.1548974943052392, "grad_norm": 0.0, "learning_rate": 1.7595447000237135e-05, "loss": 1.6559, "step": 507 }, { "epoch": 1.1571753986332574, "grad_norm": 0.0, "learning_rate": 1.758555133079848e-05, "loss": 1.6879, "step": 508 }, { "epoch": 1.1594533029612757, "grad_norm": 0.0, "learning_rate": 1.7575613241247916e-05, "loss": 1.8797, "step": 509 }, { "epoch": 1.1617312072892938, "grad_norm": 0.0, "learning_rate": 1.7565632458233892e-05, "loss": 1.8194, "step": 510 }, { "epoch": 1.1640091116173121, "grad_norm": 0.0, "learning_rate": 1.7555608706051185e-05, "loss": 1.7954, "step": 511 }, { "epoch": 1.1662870159453302, "grad_norm": 0.0, "learning_rate": 1.754554170661553e-05, "loss": 1.797, "step": 512 }, { "epoch": 1.1685649202733486, "grad_norm": 0.0, "learning_rate": 1.7535431179437905e-05, "loss": 1.9259, "step": 513 }, { "epoch": 1.1708428246013667, "grad_norm": 0.0, "learning_rate": 1.752527684159846e-05, "loss": 1.8345, "step": 514 }, { "epoch": 1.173120728929385, "grad_norm": 0.0, "learning_rate": 1.7515078407720147e-05, "loss": 1.771, "step": 515 }, { "epoch": 1.1753986332574031, "grad_norm": 0.0, "learning_rate": 1.7504835589941973e-05, "loss": 1.8106, "step": 516 }, { "epoch": 1.1776765375854215, "grad_norm": 0.0, "learning_rate": 1.749454809789193e-05, "loss": 1.978, "step": 517 }, { "epoch": 1.1799544419134396, "grad_norm": 0.0, "learning_rate": 1.7484215638659544e-05, "loss": 1.9003, "step": 518 }, { "epoch": 1.182232346241458, "grad_norm": 0.0, "learning_rate": 1.7473837916768072e-05, "loss": 1.7944, "step": 519 }, { "epoch": 1.184510250569476, "grad_norm": 0.0, "learning_rate": 1.7463414634146342e-05, "loss": 1.793, "step": 520 }, { "epoch": 1.1867881548974943, "grad_norm": 0.0, "learning_rate": 1.745294549010022e-05, "loss": 1.8594, "step": 521 }, { "epoch": 1.1890660592255125, "grad_norm": 0.0, "learning_rate": 1.7442430181283685e-05, "loss": 1.7914, "step": 522 }, { "epoch": 1.1913439635535308, "grad_norm": 0.0, "learning_rate": 1.7431868401669533e-05, "loss": 1.8498, "step": 523 }, { "epoch": 1.193621867881549, "grad_norm": 0.0, "learning_rate": 1.7421259842519685e-05, "loss": 1.9455, "step": 524 }, { "epoch": 1.1958997722095672, "grad_norm": 0.0, "learning_rate": 1.7410604192355117e-05, "loss": 1.7882, "step": 525 }, { "epoch": 1.1981776765375853, "grad_norm": 0.0, "learning_rate": 1.739990113692536e-05, "loss": 1.8283, "step": 526 }, { "epoch": 1.2004555808656037, "grad_norm": 0.0, "learning_rate": 1.7389150359177605e-05, "loss": 1.6758, "step": 527 }, { "epoch": 1.2027334851936218, "grad_norm": 0.0, "learning_rate": 1.7378351539225422e-05, "loss": 1.7471, "step": 528 }, { "epoch": 1.20501138952164, "grad_norm": 0.0, "learning_rate": 1.7367504354316996e-05, "loss": 1.7923, "step": 529 }, { "epoch": 1.2072892938496582, "grad_norm": 0.0, "learning_rate": 1.7356608478802993e-05, "loss": 1.8953, "step": 530 }, { "epoch": 1.2095671981776766, "grad_norm": 0.0, "learning_rate": 1.7345663584103974e-05, "loss": 1.8777, "step": 531 }, { "epoch": 1.2118451025056949, "grad_norm": 0.0, "learning_rate": 1.7334669338677356e-05, "loss": 1.8428, "step": 532 }, { "epoch": 1.214123006833713, "grad_norm": 0.0, "learning_rate": 1.7323625407983932e-05, "loss": 1.5904, "step": 533 }, { "epoch": 1.216400911161731, "grad_norm": 0.0, "learning_rate": 1.731253145445395e-05, "loss": 1.6985, "step": 534 }, { "epoch": 1.2186788154897494, "grad_norm": 0.0, "learning_rate": 1.7301387137452713e-05, "loss": 1.6751, "step": 535 }, { "epoch": 1.2209567198177678, "grad_norm": 0.0, "learning_rate": 1.7290192113245704e-05, "loss": 1.7691, "step": 536 }, { "epoch": 1.2232346241457859, "grad_norm": 0.0, "learning_rate": 1.7278946034963264e-05, "loss": 1.8451, "step": 537 }, { "epoch": 1.225512528473804, "grad_norm": 0.0, "learning_rate": 1.7267648552564757e-05, "loss": 1.7276, "step": 538 }, { "epoch": 1.2277904328018223, "grad_norm": 0.0, "learning_rate": 1.725629931280224e-05, "loss": 1.7447, "step": 539 }, { "epoch": 1.2300683371298406, "grad_norm": 0.0, "learning_rate": 1.7244897959183674e-05, "loss": 1.8618, "step": 540 }, { "epoch": 1.2323462414578588, "grad_norm": 0.0, "learning_rate": 1.7233444131935567e-05, "loss": 1.909, "step": 541 }, { "epoch": 1.2346241457858769, "grad_norm": 0.0, "learning_rate": 1.7221937467965147e-05, "loss": 1.7861, "step": 542 }, { "epoch": 1.2369020501138952, "grad_norm": 0.0, "learning_rate": 1.7210377600821992e-05, "loss": 1.8093, "step": 543 }, { "epoch": 1.2391799544419135, "grad_norm": 0.0, "learning_rate": 1.7198764160659114e-05, "loss": 1.8772, "step": 544 }, { "epoch": 1.2414578587699316, "grad_norm": 0.0, "learning_rate": 1.7187096774193548e-05, "loss": 1.7474, "step": 545 }, { "epoch": 1.24373576309795, "grad_norm": 0.0, "learning_rate": 1.7175375064666324e-05, "loss": 1.8976, "step": 546 }, { "epoch": 1.246013667425968, "grad_norm": 0.0, "learning_rate": 1.716359865180192e-05, "loss": 1.9038, "step": 547 }, { "epoch": 1.2482915717539864, "grad_norm": 0.0, "learning_rate": 1.7151767151767152e-05, "loss": 1.8241, "step": 548 }, { "epoch": 1.2505694760820045, "grad_norm": 0.0, "learning_rate": 1.7139880177129463e-05, "loss": 1.8321, "step": 549 }, { "epoch": 1.2528473804100229, "grad_norm": 0.0, "learning_rate": 1.7127937336814622e-05, "loss": 1.7573, "step": 550 }, { "epoch": 1.255125284738041, "grad_norm": 0.0, "learning_rate": 1.7115938236063858e-05, "loss": 1.8974, "step": 551 }, { "epoch": 1.2574031890660593, "grad_norm": 0.0, "learning_rate": 1.710388247639035e-05, "loss": 1.6982, "step": 552 }, { "epoch": 1.2596810933940774, "grad_norm": 0.0, "learning_rate": 1.7091769655535105e-05, "loss": 1.8387, "step": 553 }, { "epoch": 1.2619589977220957, "grad_norm": 0.0, "learning_rate": 1.7079599367422247e-05, "loss": 1.935, "step": 554 }, { "epoch": 1.2642369020501139, "grad_norm": 0.0, "learning_rate": 1.706737120211361e-05, "loss": 1.873, "step": 555 }, { "epoch": 1.2665148063781322, "grad_norm": 0.0, "learning_rate": 1.705508474576271e-05, "loss": 1.9897, "step": 556 }, { "epoch": 1.2687927107061503, "grad_norm": 0.0, "learning_rate": 1.7042739580568093e-05, "loss": 1.9435, "step": 557 }, { "epoch": 1.2710706150341686, "grad_norm": 0.0, "learning_rate": 1.703033528472592e-05, "loss": 2.0429, "step": 558 }, { "epoch": 1.2733485193621867, "grad_norm": 0.0, "learning_rate": 1.701787143238197e-05, "loss": 1.7305, "step": 559 }, { "epoch": 1.275626423690205, "grad_norm": 0.0, "learning_rate": 1.700534759358289e-05, "loss": 1.8407, "step": 560 }, { "epoch": 1.2779043280182232, "grad_norm": 0.0, "learning_rate": 1.699276333422675e-05, "loss": 1.8374, "step": 561 }, { "epoch": 1.2801822323462415, "grad_norm": 0.0, "learning_rate": 1.69801182160129e-05, "loss": 1.8984, "step": 562 }, { "epoch": 1.2824601366742596, "grad_norm": 0.0, "learning_rate": 1.696741179639106e-05, "loss": 1.8733, "step": 563 }, { "epoch": 1.284738041002278, "grad_norm": 0.0, "learning_rate": 1.695464362850972e-05, "loss": 1.7977, "step": 564 }, { "epoch": 1.287015945330296, "grad_norm": 0.0, "learning_rate": 1.6941813261163737e-05, "loss": 1.8713, "step": 565 }, { "epoch": 1.2892938496583144, "grad_norm": 0.0, "learning_rate": 1.692892023874118e-05, "loss": 2.0701, "step": 566 }, { "epoch": 1.2915717539863325, "grad_norm": 0.0, "learning_rate": 1.6915964101169432e-05, "loss": 1.9373, "step": 567 }, { "epoch": 1.2938496583143508, "grad_norm": 0.0, "learning_rate": 1.6902944383860415e-05, "loss": 1.9324, "step": 568 }, { "epoch": 1.296127562642369, "grad_norm": 0.0, "learning_rate": 1.6889860617655097e-05, "loss": 1.8836, "step": 569 }, { "epoch": 1.2984054669703873, "grad_norm": 0.0, "learning_rate": 1.6876712328767124e-05, "loss": 1.6978, "step": 570 }, { "epoch": 1.3006833712984054, "grad_norm": 0.0, "learning_rate": 1.6863499038725624e-05, "loss": 1.7005, "step": 571 }, { "epoch": 1.3029612756264237, "grad_norm": 0.0, "learning_rate": 1.685022026431718e-05, "loss": 1.7452, "step": 572 }, { "epoch": 1.3052391799544418, "grad_norm": 0.0, "learning_rate": 1.6836875517526915e-05, "loss": 1.7374, "step": 573 }, { "epoch": 1.3075170842824602, "grad_norm": 0.0, "learning_rate": 1.6823464305478693e-05, "loss": 1.955, "step": 574 }, { "epoch": 1.3097949886104785, "grad_norm": 0.0, "learning_rate": 1.680998613037448e-05, "loss": 1.9013, "step": 575 }, { "epoch": 1.3120728929384966, "grad_norm": 0.0, "learning_rate": 1.6796440489432706e-05, "loss": 1.771, "step": 576 }, { "epoch": 1.3143507972665147, "grad_norm": 0.0, "learning_rate": 1.678282687482576e-05, "loss": 1.8485, "step": 577 }, { "epoch": 1.316628701594533, "grad_norm": 0.0, "learning_rate": 1.6769144773616545e-05, "loss": 1.8416, "step": 578 }, { "epoch": 1.3189066059225514, "grad_norm": 0.0, "learning_rate": 1.6755393667694036e-05, "loss": 1.83, "step": 579 }, { "epoch": 1.3211845102505695, "grad_norm": 0.0, "learning_rate": 1.6741573033707864e-05, "loss": 1.7065, "step": 580 }, { "epoch": 1.3234624145785876, "grad_norm": 0.0, "learning_rate": 1.6727682343001974e-05, "loss": 1.72, "step": 581 }, { "epoch": 1.325740318906606, "grad_norm": 0.0, "learning_rate": 1.6713721061547152e-05, "loss": 1.7954, "step": 582 }, { "epoch": 1.3280182232346243, "grad_norm": 0.0, "learning_rate": 1.6699688649872628e-05, "loss": 1.7777, "step": 583 }, { "epoch": 1.3302961275626424, "grad_norm": 0.0, "learning_rate": 1.6685584562996594e-05, "loss": 1.6499, "step": 584 }, { "epoch": 1.3325740318906605, "grad_norm": 0.0, "learning_rate": 1.667140825035562e-05, "loss": 1.9445, "step": 585 }, { "epoch": 1.3348519362186788, "grad_norm": 0.0, "learning_rate": 1.665715915573303e-05, "loss": 1.7321, "step": 586 }, { "epoch": 1.3371298405466971, "grad_norm": 0.0, "learning_rate": 1.6642836717186162e-05, "loss": 1.8359, "step": 587 }, { "epoch": 1.3394077448747153, "grad_norm": 0.0, "learning_rate": 1.6628440366972475e-05, "loss": 1.8457, "step": 588 }, { "epoch": 1.3416856492027334, "grad_norm": 0.0, "learning_rate": 1.6613969531474562e-05, "loss": 1.6885, "step": 589 }, { "epoch": 1.3439635535307517, "grad_norm": 0.0, "learning_rate": 1.6599423631123922e-05, "loss": 1.8215, "step": 590 }, { "epoch": 1.34624145785877, "grad_norm": 0.0, "learning_rate": 1.6584802080323605e-05, "loss": 2.0192, "step": 591 }, { "epoch": 1.3485193621867881, "grad_norm": 0.0, "learning_rate": 1.6570104287369643e-05, "loss": 1.7816, "step": 592 }, { "epoch": 1.3507972665148062, "grad_norm": 0.0, "learning_rate": 1.655532965437119e-05, "loss": 1.8412, "step": 593 }, { "epoch": 1.3530751708428246, "grad_norm": 0.0, "learning_rate": 1.654047757716948e-05, "loss": 1.7873, "step": 594 }, { "epoch": 1.355353075170843, "grad_norm": 0.0, "learning_rate": 1.6525547445255474e-05, "loss": 1.6725, "step": 595 }, { "epoch": 1.357630979498861, "grad_norm": 0.0, "learning_rate": 1.6510538641686185e-05, "loss": 1.8629, "step": 596 }, { "epoch": 1.3599088838268791, "grad_norm": 0.0, "learning_rate": 1.649545054299971e-05, "loss": 1.7792, "step": 597 }, { "epoch": 1.3621867881548975, "grad_norm": 0.0, "learning_rate": 1.64802825191289e-05, "loss": 1.7525, "step": 598 }, { "epoch": 1.3644646924829158, "grad_norm": 0.0, "learning_rate": 1.6465033933313664e-05, "loss": 1.7946, "step": 599 }, { "epoch": 1.366742596810934, "grad_norm": 0.0, "learning_rate": 1.6449704142011833e-05, "loss": 1.8881, "step": 600 }, { "epoch": 1.3690205011389522, "grad_norm": 0.0, "learning_rate": 1.6434292494808663e-05, "loss": 1.7876, "step": 601 }, { "epoch": 1.3712984054669703, "grad_norm": 0.0, "learning_rate": 1.641879833432481e-05, "loss": 1.726, "step": 602 }, { "epoch": 1.3735763097949887, "grad_norm": 0.0, "learning_rate": 1.6403220996122876e-05, "loss": 1.8706, "step": 603 }, { "epoch": 1.3758542141230068, "grad_norm": 0.0, "learning_rate": 1.638755980861244e-05, "loss": 1.7448, "step": 604 }, { "epoch": 1.3781321184510251, "grad_norm": 0.0, "learning_rate": 1.6371814092953527e-05, "loss": 1.94, "step": 605 }, { "epoch": 1.3804100227790432, "grad_norm": 0.0, "learning_rate": 1.6355983162958508e-05, "loss": 1.8216, "step": 606 }, { "epoch": 1.3826879271070616, "grad_norm": 0.0, "learning_rate": 1.6340066324992464e-05, "loss": 1.8271, "step": 607 }, { "epoch": 1.3849658314350797, "grad_norm": 0.0, "learning_rate": 1.6324062877871826e-05, "loss": 1.9633, "step": 608 }, { "epoch": 1.387243735763098, "grad_norm": 0.0, "learning_rate": 1.6307972112761445e-05, "loss": 1.8003, "step": 609 }, { "epoch": 1.3895216400911161, "grad_norm": 0.0, "learning_rate": 1.629179331306991e-05, "loss": 1.7099, "step": 610 }, { "epoch": 1.3917995444191344, "grad_norm": 0.0, "learning_rate": 1.6275525754343188e-05, "loss": 1.8283, "step": 611 }, { "epoch": 1.3940774487471526, "grad_norm": 0.0, "learning_rate": 1.6259168704156484e-05, "loss": 1.6879, "step": 612 }, { "epoch": 1.396355353075171, "grad_norm": 0.0, "learning_rate": 1.624272142200429e-05, "loss": 1.8087, "step": 613 }, { "epoch": 1.398633257403189, "grad_norm": 0.0, "learning_rate": 1.6226183159188693e-05, "loss": 1.6436, "step": 614 }, { "epoch": 1.4009111617312073, "grad_norm": 0.0, "learning_rate": 1.62095531587057e-05, "loss": 1.8455, "step": 615 }, { "epoch": 1.4031890660592254, "grad_norm": 0.0, "learning_rate": 1.619283065512979e-05, "loss": 1.7757, "step": 616 }, { "epoch": 1.4054669703872438, "grad_norm": 0.0, "learning_rate": 1.6176014874496436e-05, "loss": 1.7478, "step": 617 }, { "epoch": 1.4077448747152619, "grad_norm": 0.0, "learning_rate": 1.615910503418272e-05, "loss": 1.7987, "step": 618 }, { "epoch": 1.4100227790432802, "grad_norm": 0.0, "learning_rate": 1.6142100342785915e-05, "loss": 1.8381, "step": 619 }, { "epoch": 1.4123006833712983, "grad_norm": 0.0, "learning_rate": 1.6125000000000002e-05, "loss": 1.8054, "step": 620 }, { "epoch": 1.4145785876993167, "grad_norm": 0.0, "learning_rate": 1.6107803196490126e-05, "loss": 1.878, "step": 621 }, { "epoch": 1.416856492027335, "grad_norm": 0.0, "learning_rate": 1.6090509113764927e-05, "loss": 1.8996, "step": 622 }, { "epoch": 1.419134396355353, "grad_norm": 0.0, "learning_rate": 1.6073116924046647e-05, "loss": 1.6438, "step": 623 }, { "epoch": 1.4214123006833712, "grad_norm": 0.0, "learning_rate": 1.6055625790139063e-05, "loss": 1.9268, "step": 624 }, { "epoch": 1.4236902050113895, "grad_norm": 0.0, "learning_rate": 1.6038034865293186e-05, "loss": 1.7209, "step": 625 }, { "epoch": 1.4259681093394079, "grad_norm": 0.0, "learning_rate": 1.6020343293070568e-05, "loss": 1.7906, "step": 626 }, { "epoch": 1.428246013667426, "grad_norm": 0.0, "learning_rate": 1.6002550207204335e-05, "loss": 1.779, "step": 627 }, { "epoch": 1.430523917995444, "grad_norm": 0.0, "learning_rate": 1.59846547314578e-05, "loss": 1.6729, "step": 628 }, { "epoch": 1.4328018223234624, "grad_norm": 0.0, "learning_rate": 1.5966655979480607e-05, "loss": 1.7129, "step": 629 }, { "epoch": 1.4350797266514808, "grad_norm": 0.0, "learning_rate": 1.594855305466238e-05, "loss": 1.7471, "step": 630 }, { "epoch": 1.4373576309794989, "grad_norm": 0.0, "learning_rate": 1.5930345049983877e-05, "loss": 1.7756, "step": 631 }, { "epoch": 1.439635535307517, "grad_norm": 0.0, "learning_rate": 1.591203104786546e-05, "loss": 1.8302, "step": 632 }, { "epoch": 1.4419134396355353, "grad_norm": 0.0, "learning_rate": 1.5893610120012975e-05, "loss": 1.7776, "step": 633 }, { "epoch": 1.4441913439635536, "grad_norm": 0.0, "learning_rate": 1.58750813272609e-05, "loss": 1.7861, "step": 634 }, { "epoch": 1.4464692482915718, "grad_norm": 0.0, "learning_rate": 1.5856443719412723e-05, "loss": 1.8694, "step": 635 }, { "epoch": 1.4487471526195899, "grad_norm": 0.0, "learning_rate": 1.5837696335078534e-05, "loss": 1.7036, "step": 636 }, { "epoch": 1.4510250569476082, "grad_norm": 0.0, "learning_rate": 1.5818838201509683e-05, "loss": 1.9997, "step": 637 }, { "epoch": 1.4533029612756265, "grad_norm": 0.0, "learning_rate": 1.579986833443055e-05, "loss": 1.8658, "step": 638 }, { "epoch": 1.4555808656036446, "grad_norm": 0.0, "learning_rate": 1.5780785737867282e-05, "loss": 1.7876, "step": 639 }, { "epoch": 1.4578587699316627, "grad_norm": 0.0, "learning_rate": 1.576158940397351e-05, "loss": 1.8197, "step": 640 }, { "epoch": 1.460136674259681, "grad_norm": 0.0, "learning_rate": 1.5742278312852876e-05, "loss": 1.7887, "step": 641 }, { "epoch": 1.4624145785876994, "grad_norm": 0.0, "learning_rate": 1.5722851432378414e-05, "loss": 2.0391, "step": 642 }, { "epoch": 1.4646924829157175, "grad_norm": 0.0, "learning_rate": 1.570330771800869e-05, "loss": 1.9192, "step": 643 }, { "epoch": 1.4669703872437356, "grad_norm": 0.0, "learning_rate": 1.5683646112600534e-05, "loss": 1.7891, "step": 644 }, { "epoch": 1.469248291571754, "grad_norm": 0.0, "learning_rate": 1.5663865546218488e-05, "loss": 1.7657, "step": 645 }, { "epoch": 1.4715261958997723, "grad_norm": 0.0, "learning_rate": 1.5643964935940662e-05, "loss": 1.8506, "step": 646 }, { "epoch": 1.4738041002277904, "grad_norm": 0.0, "learning_rate": 1.5623943185661144e-05, "loss": 1.8685, "step": 647 }, { "epoch": 1.4760820045558087, "grad_norm": 0.0, "learning_rate": 1.5603799185888738e-05, "loss": 1.7975, "step": 648 }, { "epoch": 1.4783599088838268, "grad_norm": 0.0, "learning_rate": 1.5583531813542022e-05, "loss": 1.8862, "step": 649 }, { "epoch": 1.4806378132118452, "grad_norm": 0.0, "learning_rate": 1.5563139931740613e-05, "loss": 1.7348, "step": 650 }, { "epoch": 1.4829157175398633, "grad_norm": 0.0, "learning_rate": 1.5542622389592607e-05, "loss": 1.6889, "step": 651 }, { "epoch": 1.4851936218678816, "grad_norm": 0.0, "learning_rate": 1.5521978021978023e-05, "loss": 1.9386, "step": 652 }, { "epoch": 1.4874715261958997, "grad_norm": 0.0, "learning_rate": 1.550120564932828e-05, "loss": 1.9238, "step": 653 }, { "epoch": 1.489749430523918, "grad_norm": 0.0, "learning_rate": 1.548030407740152e-05, "loss": 1.8025, "step": 654 }, { "epoch": 1.4920273348519362, "grad_norm": 0.0, "learning_rate": 1.545927209705373e-05, "loss": 1.7167, "step": 655 }, { "epoch": 1.4943052391799545, "grad_norm": 0.0, "learning_rate": 1.5438108484005562e-05, "loss": 1.7308, "step": 656 }, { "epoch": 1.4965831435079726, "grad_norm": 0.0, "learning_rate": 1.5416811998604815e-05, "loss": 1.8564, "step": 657 }, { "epoch": 1.498861047835991, "grad_norm": 0.0, "learning_rate": 1.5395381385584325e-05, "loss": 1.621, "step": 658 }, { "epoch": 1.501138952164009, "grad_norm": 0.0, "learning_rate": 1.537381537381537e-05, "loss": 1.8461, "step": 659 }, { "epoch": 1.5034168564920274, "grad_norm": 0.0, "learning_rate": 1.535211267605634e-05, "loss": 1.859, "step": 660 } ], "logging_steps": 1, "max_steps": 878, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 220, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.847387829381366e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }