{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 12920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007739938080495356, "grad_norm": 8.506338119506836, "learning_rate": 1.998606811145511e-05, "loss": 2.0064, "step": 10 }, { "epoch": 0.015479876160990712, "grad_norm": 4.7707037925720215, "learning_rate": 1.997058823529412e-05, "loss": 1.3355, "step": 20 }, { "epoch": 0.02321981424148607, "grad_norm": 11.43391227722168, "learning_rate": 1.995510835913313e-05, "loss": 3.5956, "step": 30 }, { "epoch": 0.030959752321981424, "grad_norm": 0.21515123546123505, "learning_rate": 1.9939628482972137e-05, "loss": 3.4264, "step": 40 }, { "epoch": 0.03869969040247678, "grad_norm": 2.740605354309082, "learning_rate": 1.9924148606811147e-05, "loss": 3.0759, "step": 50 }, { "epoch": 0.04643962848297214, "grad_norm": 4.352437496185303, "learning_rate": 1.9908668730650157e-05, "loss": 3.3508, "step": 60 }, { "epoch": 0.05417956656346749, "grad_norm": 0.05267472565174103, "learning_rate": 1.9893188854489167e-05, "loss": 1.594, "step": 70 }, { "epoch": 0.06191950464396285, "grad_norm": 10.989522933959961, "learning_rate": 1.9877708978328173e-05, "loss": 2.8205, "step": 80 }, { "epoch": 0.0696594427244582, "grad_norm": 8.449481964111328, "learning_rate": 1.9862229102167187e-05, "loss": 2.4271, "step": 90 }, { "epoch": 0.07739938080495357, "grad_norm": 0.29544803500175476, "learning_rate": 1.984829721362229e-05, "loss": 1.3092, "step": 100 }, { "epoch": 0.08513931888544891, "grad_norm": 0.10880222171545029, "learning_rate": 1.98328173374613e-05, "loss": 2.5392, "step": 110 }, { "epoch": 0.09287925696594428, "grad_norm": 9.368032455444336, "learning_rate": 1.981733746130031e-05, "loss": 1.5991, "step": 120 }, { "epoch": 0.10061919504643962, "grad_norm": 11.469496726989746, "learning_rate": 1.980185758513932e-05, "loss": 2.0813, "step": 130 }, { "epoch": 
0.10835913312693499, "grad_norm": 9.862323760986328, "learning_rate": 1.978637770897833e-05, "loss": 2.649, "step": 140 }, { "epoch": 0.11609907120743033, "grad_norm": 8.737398147583008, "learning_rate": 1.9770897832817338e-05, "loss": 1.5553, "step": 150 }, { "epoch": 0.1238390092879257, "grad_norm": 6.55385160446167, "learning_rate": 1.9755417956656348e-05, "loss": 1.743, "step": 160 }, { "epoch": 0.13157894736842105, "grad_norm": 6.633718490600586, "learning_rate": 1.9739938080495358e-05, "loss": 0.7426, "step": 170 }, { "epoch": 0.1393188854489164, "grad_norm": 6.836884021759033, "learning_rate": 1.9724458204334368e-05, "loss": 1.0845, "step": 180 }, { "epoch": 0.14705882352941177, "grad_norm": 0.0683404728770256, "learning_rate": 1.9708978328173375e-05, "loss": 0.7103, "step": 190 }, { "epoch": 0.15479876160990713, "grad_norm": 16.651533126831055, "learning_rate": 1.9693498452012385e-05, "loss": 1.531, "step": 200 }, { "epoch": 0.16253869969040247, "grad_norm": 9.910208702087402, "learning_rate": 1.9678018575851395e-05, "loss": 2.8037, "step": 210 }, { "epoch": 0.17027863777089783, "grad_norm": 1.3444373607635498, "learning_rate": 1.9662538699690405e-05, "loss": 2.7936, "step": 220 }, { "epoch": 0.1780185758513932, "grad_norm": 12.756002426147461, "learning_rate": 1.964705882352941e-05, "loss": 2.1467, "step": 230 }, { "epoch": 0.18575851393188855, "grad_norm": 9.536038398742676, "learning_rate": 1.9631578947368425e-05, "loss": 1.2144, "step": 240 }, { "epoch": 0.19349845201238391, "grad_norm": 17.46320152282715, "learning_rate": 1.961609907120743e-05, "loss": 1.8731, "step": 250 }, { "epoch": 0.20123839009287925, "grad_norm": 18.28465461730957, "learning_rate": 1.960061919504644e-05, "loss": 1.6671, "step": 260 }, { "epoch": 0.2089783281733746, "grad_norm": 6.441016674041748, "learning_rate": 1.958513931888545e-05, "loss": 1.7131, "step": 270 }, { "epoch": 0.21671826625386997, "grad_norm": 15.477100372314453, "learning_rate": 1.956965944272446e-05, "loss": 
1.4353, "step": 280 }, { "epoch": 0.22445820433436534, "grad_norm": 14.33431625366211, "learning_rate": 1.9554179566563468e-05, "loss": 1.5589, "step": 290 }, { "epoch": 0.23219814241486067, "grad_norm": 12.015625, "learning_rate": 1.9538699690402478e-05, "loss": 1.1182, "step": 300 }, { "epoch": 0.23993808049535603, "grad_norm": 0.09702865034341812, "learning_rate": 1.9523219814241488e-05, "loss": 1.2126, "step": 310 }, { "epoch": 0.2476780185758514, "grad_norm": 13.268048286437988, "learning_rate": 1.9507739938080498e-05, "loss": 1.0105, "step": 320 }, { "epoch": 0.25541795665634676, "grad_norm": 7.2029924392700195, "learning_rate": 1.9492260061919505e-05, "loss": 0.4993, "step": 330 }, { "epoch": 0.2631578947368421, "grad_norm": 7.723134994506836, "learning_rate": 1.9476780185758515e-05, "loss": 0.2176, "step": 340 }, { "epoch": 0.2708978328173375, "grad_norm": 16.317968368530273, "learning_rate": 1.9461300309597525e-05, "loss": 1.7535, "step": 350 }, { "epoch": 0.2786377708978328, "grad_norm": 2.531498670578003, "learning_rate": 1.9445820433436535e-05, "loss": 1.7211, "step": 360 }, { "epoch": 0.28637770897832815, "grad_norm": 2.1575376987457275, "learning_rate": 1.9430340557275545e-05, "loss": 0.9047, "step": 370 }, { "epoch": 0.29411764705882354, "grad_norm": 17.665647506713867, "learning_rate": 1.9414860681114555e-05, "loss": 1.2776, "step": 380 }, { "epoch": 0.3018575851393189, "grad_norm": 7.1050124168396, "learning_rate": 1.939938080495356e-05, "loss": 1.4221, "step": 390 }, { "epoch": 0.30959752321981426, "grad_norm": 3.0147929191589355, "learning_rate": 1.938390092879257e-05, "loss": 0.859, "step": 400 }, { "epoch": 0.3173374613003096, "grad_norm": 20.308488845825195, "learning_rate": 1.936842105263158e-05, "loss": 0.8, "step": 410 }, { "epoch": 0.32507739938080493, "grad_norm": 3.4890947341918945, "learning_rate": 1.935294117647059e-05, "loss": 1.1082, "step": 420 }, { "epoch": 0.3328173374613003, "grad_norm": 0.8235903978347778, "learning_rate": 
1.9337461300309598e-05, "loss": 1.6303, "step": 430 }, { "epoch": 0.34055727554179566, "grad_norm": 0.04457276687026024, "learning_rate": 1.9321981424148608e-05, "loss": 0.2507, "step": 440 }, { "epoch": 0.34829721362229105, "grad_norm": 14.781039237976074, "learning_rate": 1.9306501547987618e-05, "loss": 1.0837, "step": 450 }, { "epoch": 0.3560371517027864, "grad_norm": 12.6392183303833, "learning_rate": 1.9291021671826628e-05, "loss": 1.1298, "step": 460 }, { "epoch": 0.3637770897832817, "grad_norm": 0.32890239357948303, "learning_rate": 1.9275541795665635e-05, "loss": 1.9762, "step": 470 }, { "epoch": 0.3715170278637771, "grad_norm": 0.32050493359565735, "learning_rate": 1.9260061919504645e-05, "loss": 1.4656, "step": 480 }, { "epoch": 0.37925696594427244, "grad_norm": 2.434720516204834, "learning_rate": 1.9244582043343655e-05, "loss": 0.6325, "step": 490 }, { "epoch": 0.38699690402476783, "grad_norm": 11.567814826965332, "learning_rate": 1.9229102167182665e-05, "loss": 0.3975, "step": 500 }, { "epoch": 0.39473684210526316, "grad_norm": 18.813968658447266, "learning_rate": 1.9213622291021675e-05, "loss": 0.7963, "step": 510 }, { "epoch": 0.4024767801857585, "grad_norm": 20.558246612548828, "learning_rate": 1.9198142414860685e-05, "loss": 1.9874, "step": 520 }, { "epoch": 0.4102167182662539, "grad_norm": 0.22306686639785767, "learning_rate": 1.918266253869969e-05, "loss": 0.4433, "step": 530 }, { "epoch": 0.4179566563467492, "grad_norm": 10.95577621459961, "learning_rate": 1.91671826625387e-05, "loss": 0.9135, "step": 540 }, { "epoch": 0.42569659442724456, "grad_norm": 4.70222282409668, "learning_rate": 1.915170278637771e-05, "loss": 0.9007, "step": 550 }, { "epoch": 0.43343653250773995, "grad_norm": 1.1274303197860718, "learning_rate": 1.913622291021672e-05, "loss": 0.7367, "step": 560 }, { "epoch": 0.4411764705882353, "grad_norm": 0.007167542353272438, "learning_rate": 1.9120743034055728e-05, "loss": 1.5318, "step": 570 }, { "epoch": 0.44891640866873067, 
"grad_norm": 23.503353118896484, "learning_rate": 1.9105263157894738e-05, "loss": 0.6634, "step": 580 }, { "epoch": 0.456656346749226, "grad_norm": 0.37044626474380493, "learning_rate": 1.9089783281733748e-05, "loss": 0.5184, "step": 590 }, { "epoch": 0.46439628482972134, "grad_norm": 16.5557804107666, "learning_rate": 1.9074303405572754e-05, "loss": 0.8234, "step": 600 }, { "epoch": 0.47213622291021673, "grad_norm": 25.89270782470703, "learning_rate": 1.9058823529411764e-05, "loss": 1.6588, "step": 610 }, { "epoch": 0.47987616099071206, "grad_norm": 25.049070358276367, "learning_rate": 1.9043343653250778e-05, "loss": 1.6225, "step": 620 }, { "epoch": 0.48761609907120745, "grad_norm": 0.03764468804001808, "learning_rate": 1.9027863777089784e-05, "loss": 0.6432, "step": 630 }, { "epoch": 0.4953560371517028, "grad_norm": 0.086015485227108, "learning_rate": 1.9012383900928794e-05, "loss": 1.0614, "step": 640 }, { "epoch": 0.5030959752321982, "grad_norm": 2.7963881492614746, "learning_rate": 1.8996904024767804e-05, "loss": 0.7412, "step": 650 }, { "epoch": 0.5108359133126935, "grad_norm": 0.2601015865802765, "learning_rate": 1.8981424148606814e-05, "loss": 0.634, "step": 660 }, { "epoch": 0.5185758513931888, "grad_norm": 13.00772762298584, "learning_rate": 1.896594427244582e-05, "loss": 1.1349, "step": 670 }, { "epoch": 0.5263157894736842, "grad_norm": 0.2349524348974228, "learning_rate": 1.895046439628483e-05, "loss": 0.7847, "step": 680 }, { "epoch": 0.5340557275541795, "grad_norm": 8.338811874389648, "learning_rate": 1.893498452012384e-05, "loss": 1.2663, "step": 690 }, { "epoch": 0.541795665634675, "grad_norm": 0.013970553874969482, "learning_rate": 1.891950464396285e-05, "loss": 0.6119, "step": 700 }, { "epoch": 0.5495356037151703, "grad_norm": 5.991543292999268, "learning_rate": 1.8904024767801858e-05, "loss": 2.1364, "step": 710 }, { "epoch": 0.5572755417956656, "grad_norm": 20.14146614074707, "learning_rate": 1.8888544891640868e-05, "loss": 0.5414, "step": 720 
}, { "epoch": 0.565015479876161, "grad_norm": 26.391416549682617, "learning_rate": 1.8873065015479878e-05, "loss": 1.1469, "step": 730 }, { "epoch": 0.5727554179566563, "grad_norm": 0.09505829960107803, "learning_rate": 1.8857585139318888e-05, "loss": 0.9982, "step": 740 }, { "epoch": 0.5804953560371517, "grad_norm": 0.011161567643284798, "learning_rate": 1.8842105263157898e-05, "loss": 0.9547, "step": 750 }, { "epoch": 0.5882352941176471, "grad_norm": 57.349815368652344, "learning_rate": 1.8826625386996908e-05, "loss": 0.5702, "step": 760 }, { "epoch": 0.5959752321981424, "grad_norm": 9.467449188232422, "learning_rate": 1.8811145510835914e-05, "loss": 0.1995, "step": 770 }, { "epoch": 0.6037151702786377, "grad_norm": 0.02475058287382126, "learning_rate": 1.8795665634674924e-05, "loss": 1.3749, "step": 780 }, { "epoch": 0.6114551083591331, "grad_norm": 5.4445624351501465, "learning_rate": 1.8780185758513934e-05, "loss": 0.4923, "step": 790 }, { "epoch": 0.6191950464396285, "grad_norm": 0.0061449650675058365, "learning_rate": 1.8764705882352944e-05, "loss": 0.587, "step": 800 }, { "epoch": 0.6269349845201239, "grad_norm": 2.2460343837738037, "learning_rate": 1.874922600619195e-05, "loss": 0.125, "step": 810 }, { "epoch": 0.6346749226006192, "grad_norm": 0.007582011632621288, "learning_rate": 1.873374613003096e-05, "loss": 0.6083, "step": 820 }, { "epoch": 0.6424148606811145, "grad_norm": 2.325960636138916, "learning_rate": 1.871826625386997e-05, "loss": 1.0961, "step": 830 }, { "epoch": 0.6501547987616099, "grad_norm": 1.7312732934951782, "learning_rate": 1.8702786377708978e-05, "loss": 0.5809, "step": 840 }, { "epoch": 0.6578947368421053, "grad_norm": 30.86764144897461, "learning_rate": 1.8687306501547988e-05, "loss": 1.5849, "step": 850 }, { "epoch": 0.6656346749226006, "grad_norm": 0.1302964836359024, "learning_rate": 1.8671826625387e-05, "loss": 1.395, "step": 860 }, { "epoch": 0.673374613003096, "grad_norm": 0.037994563579559326, "learning_rate": 
1.8656346749226008e-05, "loss": 0.281, "step": 870 }, { "epoch": 0.6811145510835913, "grad_norm": 0.021319111809134483, "learning_rate": 1.8640866873065018e-05, "loss": 0.0646, "step": 880 }, { "epoch": 0.6888544891640866, "grad_norm": 15.3261137008667, "learning_rate": 1.8625386996904028e-05, "loss": 2.8183, "step": 890 }, { "epoch": 0.6965944272445821, "grad_norm": 0.3605537712574005, "learning_rate": 1.8609907120743038e-05, "loss": 0.8517, "step": 900 }, { "epoch": 0.7043343653250774, "grad_norm": 23.898944854736328, "learning_rate": 1.8594427244582044e-05, "loss": 1.3356, "step": 910 }, { "epoch": 0.7120743034055728, "grad_norm": 0.037733446806669235, "learning_rate": 1.8578947368421054e-05, "loss": 0.3386, "step": 920 }, { "epoch": 0.7198142414860681, "grad_norm": 0.8914482593536377, "learning_rate": 1.8563467492260064e-05, "loss": 1.6529, "step": 930 }, { "epoch": 0.7275541795665634, "grad_norm": 0.03205841779708862, "learning_rate": 1.854798761609907e-05, "loss": 2.2015, "step": 940 }, { "epoch": 0.7352941176470589, "grad_norm": 44.147037506103516, "learning_rate": 1.853250773993808e-05, "loss": 0.5081, "step": 950 }, { "epoch": 0.7430340557275542, "grad_norm": 0.3772943615913391, "learning_rate": 1.851702786377709e-05, "loss": 0.6404, "step": 960 }, { "epoch": 0.7507739938080495, "grad_norm": 0.7029751539230347, "learning_rate": 1.85015479876161e-05, "loss": 0.8721, "step": 970 }, { "epoch": 0.7585139318885449, "grad_norm": 29.153505325317383, "learning_rate": 1.8486068111455107e-05, "loss": 0.7126, "step": 980 }, { "epoch": 0.7662538699690402, "grad_norm": 0.02660970762372017, "learning_rate": 1.847058823529412e-05, "loss": 0.4565, "step": 990 }, { "epoch": 0.7739938080495357, "grad_norm": 24.370214462280273, "learning_rate": 1.845510835913313e-05, "loss": 1.7868, "step": 1000 }, { "epoch": 0.781733746130031, "grad_norm": 9.742637634277344, "learning_rate": 1.8439628482972137e-05, "loss": 0.3382, "step": 1010 }, { "epoch": 0.7894736842105263, "grad_norm": 
1.175869345664978, "learning_rate": 1.8424148606811147e-05, "loss": 0.3069, "step": 1020 }, { "epoch": 0.7972136222910217, "grad_norm": 0.04197060316801071, "learning_rate": 1.8408668730650157e-05, "loss": 1.6935, "step": 1030 }, { "epoch": 0.804953560371517, "grad_norm": 0.22277726233005524, "learning_rate": 1.8393188854489164e-05, "loss": 0.3969, "step": 1040 }, { "epoch": 0.8126934984520123, "grad_norm": 4.776662826538086, "learning_rate": 1.8377708978328174e-05, "loss": 0.8469, "step": 1050 }, { "epoch": 0.8204334365325078, "grad_norm": 13.338683128356934, "learning_rate": 1.8362229102167184e-05, "loss": 1.4713, "step": 1060 }, { "epoch": 0.8281733746130031, "grad_norm": 7.291410446166992, "learning_rate": 1.8346749226006194e-05, "loss": 0.6544, "step": 1070 }, { "epoch": 0.8359133126934984, "grad_norm": 17.760698318481445, "learning_rate": 1.83312693498452e-05, "loss": 1.6695, "step": 1080 }, { "epoch": 0.8436532507739938, "grad_norm": 14.199136734008789, "learning_rate": 1.831578947368421e-05, "loss": 1.8401, "step": 1090 }, { "epoch": 0.8513931888544891, "grad_norm": 0.3752070367336273, "learning_rate": 1.830030959752322e-05, "loss": 0.9054, "step": 1100 }, { "epoch": 0.8591331269349846, "grad_norm": 18.48265266418457, "learning_rate": 1.828482972136223e-05, "loss": 1.2207, "step": 1110 }, { "epoch": 0.8668730650154799, "grad_norm": 0.05996339023113251, "learning_rate": 1.826934984520124e-05, "loss": 0.9683, "step": 1120 }, { "epoch": 0.8746130030959752, "grad_norm": 0.030478883534669876, "learning_rate": 1.825386996904025e-05, "loss": 1.5157, "step": 1130 }, { "epoch": 0.8823529411764706, "grad_norm": 0.03278845176100731, "learning_rate": 1.823839009287926e-05, "loss": 0.4431, "step": 1140 }, { "epoch": 0.8900928792569659, "grad_norm": 25.75578498840332, "learning_rate": 1.8222910216718267e-05, "loss": 1.718, "step": 1150 }, { "epoch": 0.8978328173374613, "grad_norm": 0.019352613016963005, "learning_rate": 1.8207430340557277e-05, "loss": 1.7718, "step": 
1160 }, { "epoch": 0.9055727554179567, "grad_norm": 24.575620651245117, "learning_rate": 1.8191950464396287e-05, "loss": 0.6765, "step": 1170 }, { "epoch": 0.913312693498452, "grad_norm": 26.581939697265625, "learning_rate": 1.8176470588235294e-05, "loss": 0.7462, "step": 1180 }, { "epoch": 0.9210526315789473, "grad_norm": 47.84867477416992, "learning_rate": 1.8160990712074304e-05, "loss": 1.1642, "step": 1190 }, { "epoch": 0.9287925696594427, "grad_norm": 14.253039360046387, "learning_rate": 1.8145510835913314e-05, "loss": 0.5097, "step": 1200 }, { "epoch": 0.9365325077399381, "grad_norm": 0.0883980244398117, "learning_rate": 1.8130030959752324e-05, "loss": 0.6184, "step": 1210 }, { "epoch": 0.9442724458204335, "grad_norm": 0.028200991451740265, "learning_rate": 1.811455108359133e-05, "loss": 0.4538, "step": 1220 }, { "epoch": 0.9520123839009288, "grad_norm": 0.6853126287460327, "learning_rate": 1.809907120743034e-05, "loss": 0.9124, "step": 1230 }, { "epoch": 0.9597523219814241, "grad_norm": 16.458114624023438, "learning_rate": 1.8083591331269354e-05, "loss": 1.0246, "step": 1240 }, { "epoch": 0.9674922600619195, "grad_norm": 3.0164809226989746, "learning_rate": 1.806811145510836e-05, "loss": 1.6219, "step": 1250 }, { "epoch": 0.9752321981424149, "grad_norm": 29.956066131591797, "learning_rate": 1.805263157894737e-05, "loss": 1.3312, "step": 1260 }, { "epoch": 0.9829721362229102, "grad_norm": 0.4175031781196594, "learning_rate": 1.803715170278638e-05, "loss": 0.1467, "step": 1270 }, { "epoch": 0.9907120743034056, "grad_norm": 3.1807973384857178, "learning_rate": 1.8021671826625387e-05, "loss": 0.6078, "step": 1280 }, { "epoch": 0.9984520123839009, "grad_norm": 13.216703414916992, "learning_rate": 1.8006191950464397e-05, "loss": 2.0101, "step": 1290 }, { "epoch": 1.0, "eval_accuracy": 0.66, "eval_f1": 0.6518757259001162, "eval_loss": 1.308005690574646, "eval_runtime": 1.308, "eval_samples_per_second": 38.225, "eval_steps_per_second": 38.225, "step": 1292 }, { 
"epoch": 1.0061919504643964, "grad_norm": 2.3001062870025635, "learning_rate": 1.7990712074303407e-05, "loss": 0.7115, "step": 1300 }, { "epoch": 1.0139318885448916, "grad_norm": 18.025787353515625, "learning_rate": 1.7975232198142417e-05, "loss": 1.0015, "step": 1310 }, { "epoch": 1.021671826625387, "grad_norm": 0.5099636316299438, "learning_rate": 1.7959752321981424e-05, "loss": 0.1076, "step": 1320 }, { "epoch": 1.0294117647058822, "grad_norm": 0.3632793724536896, "learning_rate": 1.7944272445820434e-05, "loss": 1.6257, "step": 1330 }, { "epoch": 1.0371517027863777, "grad_norm": 0.07609053701162338, "learning_rate": 1.7928792569659444e-05, "loss": 0.7241, "step": 1340 }, { "epoch": 1.0448916408668731, "grad_norm": 0.22190319001674652, "learning_rate": 1.7913312693498454e-05, "loss": 1.532, "step": 1350 }, { "epoch": 1.0526315789473684, "grad_norm": 21.99039649963379, "learning_rate": 1.7897832817337464e-05, "loss": 1.1934, "step": 1360 }, { "epoch": 1.0603715170278638, "grad_norm": 0.14937640726566315, "learning_rate": 1.7882352941176474e-05, "loss": 0.9303, "step": 1370 }, { "epoch": 1.068111455108359, "grad_norm": 0.007174472790211439, "learning_rate": 1.786687306501548e-05, "loss": 0.3152, "step": 1380 }, { "epoch": 1.0758513931888545, "grad_norm": 35.69093704223633, "learning_rate": 1.785139318885449e-05, "loss": 1.3553, "step": 1390 }, { "epoch": 1.08359133126935, "grad_norm": 0.019195424392819405, "learning_rate": 1.78359133126935e-05, "loss": 0.948, "step": 1400 }, { "epoch": 1.0913312693498451, "grad_norm": 25.10902214050293, "learning_rate": 1.782043343653251e-05, "loss": 0.7343, "step": 1410 }, { "epoch": 1.0990712074303406, "grad_norm": 1.5584079027175903, "learning_rate": 1.7804953560371517e-05, "loss": 0.9831, "step": 1420 }, { "epoch": 1.1068111455108358, "grad_norm": 31.298072814941406, "learning_rate": 1.7789473684210527e-05, "loss": 0.7376, "step": 1430 }, { "epoch": 1.1145510835913313, "grad_norm": 0.17294420301914215, "learning_rate": 
1.7773993808049537e-05, "loss": 0.0251, "step": 1440 }, { "epoch": 1.1222910216718267, "grad_norm": 65.6518325805664, "learning_rate": 1.7758513931888547e-05, "loss": 0.6924, "step": 1450 }, { "epoch": 1.130030959752322, "grad_norm": 0.4761132001876831, "learning_rate": 1.7743034055727554e-05, "loss": 0.8977, "step": 1460 }, { "epoch": 1.1377708978328174, "grad_norm": 0.025213854387402534, "learning_rate": 1.7727554179566564e-05, "loss": 1.843, "step": 1470 }, { "epoch": 1.1455108359133126, "grad_norm": 34.82686996459961, "learning_rate": 1.7712074303405574e-05, "loss": 1.0225, "step": 1480 }, { "epoch": 1.153250773993808, "grad_norm": 32.56637954711914, "learning_rate": 1.7696594427244584e-05, "loss": 0.6607, "step": 1490 }, { "epoch": 1.1609907120743035, "grad_norm": 1.7202068567276, "learning_rate": 1.7681114551083594e-05, "loss": 0.8367, "step": 1500 }, { "epoch": 1.1687306501547987, "grad_norm": 11.974830627441406, "learning_rate": 1.7665634674922604e-05, "loss": 1.645, "step": 1510 }, { "epoch": 1.1764705882352942, "grad_norm": 0.9545431137084961, "learning_rate": 1.765015479876161e-05, "loss": 1.0642, "step": 1520 }, { "epoch": 1.1842105263157894, "grad_norm": 0.6323758363723755, "learning_rate": 1.763467492260062e-05, "loss": 0.258, "step": 1530 }, { "epoch": 1.1919504643962848, "grad_norm": 22.921859741210938, "learning_rate": 1.761919504643963e-05, "loss": 0.5059, "step": 1540 }, { "epoch": 1.1996904024767803, "grad_norm": 0.024385841563344002, "learning_rate": 1.760371517027864e-05, "loss": 1.0363, "step": 1550 }, { "epoch": 1.2074303405572755, "grad_norm": 1.279547095298767, "learning_rate": 1.7588235294117647e-05, "loss": 0.5037, "step": 1560 }, { "epoch": 1.215170278637771, "grad_norm": 0.13738131523132324, "learning_rate": 1.7572755417956657e-05, "loss": 0.2811, "step": 1570 }, { "epoch": 1.2229102167182662, "grad_norm": 0.06290140748023987, "learning_rate": 1.7557275541795667e-05, "loss": 1.4449, "step": 1580 }, { "epoch": 1.2306501547987616, 
"grad_norm": 21.201923370361328, "learning_rate": 1.7541795665634677e-05, "loss": 0.1935, "step": 1590 }, { "epoch": 1.238390092879257, "grad_norm": 0.20100288093090057, "learning_rate": 1.7526315789473683e-05, "loss": 0.8282, "step": 1600 }, { "epoch": 1.2461300309597523, "grad_norm": 1.5086302757263184, "learning_rate": 1.7510835913312697e-05, "loss": 1.2196, "step": 1610 }, { "epoch": 1.2538699690402477, "grad_norm": 17.41587257385254, "learning_rate": 1.7495356037151704e-05, "loss": 1.0463, "step": 1620 }, { "epoch": 1.261609907120743, "grad_norm": 30.081039428710938, "learning_rate": 1.7479876160990714e-05, "loss": 0.7323, "step": 1630 }, { "epoch": 1.2693498452012384, "grad_norm": 13.7053804397583, "learning_rate": 1.7464396284829724e-05, "loss": 2.3771, "step": 1640 }, { "epoch": 1.2770897832817338, "grad_norm": 0.7035312652587891, "learning_rate": 1.7448916408668734e-05, "loss": 0.587, "step": 1650 }, { "epoch": 1.284829721362229, "grad_norm": 12.712949752807617, "learning_rate": 1.743343653250774e-05, "loss": 0.8693, "step": 1660 }, { "epoch": 1.2925696594427245, "grad_norm": 27.038021087646484, "learning_rate": 1.741795665634675e-05, "loss": 0.4911, "step": 1670 }, { "epoch": 1.3003095975232197, "grad_norm": 35.02375411987305, "learning_rate": 1.740247678018576e-05, "loss": 0.4655, "step": 1680 }, { "epoch": 1.3080495356037152, "grad_norm": 27.82124900817871, "learning_rate": 1.738699690402477e-05, "loss": 1.7654, "step": 1690 }, { "epoch": 1.3157894736842106, "grad_norm": 30.71179962158203, "learning_rate": 1.7371517027863777e-05, "loss": 0.241, "step": 1700 }, { "epoch": 1.3235294117647058, "grad_norm": 15.270631790161133, "learning_rate": 1.7356037151702787e-05, "loss": 0.5628, "step": 1710 }, { "epoch": 1.3312693498452013, "grad_norm": 1.8740601539611816, "learning_rate": 1.7340557275541797e-05, "loss": 1.0145, "step": 1720 }, { "epoch": 1.3390092879256965, "grad_norm": 0.01709054224193096, "learning_rate": 1.7325077399380807e-05, "loss": 0.554, 
"step": 1730 }, { "epoch": 1.346749226006192, "grad_norm": 0.08021119236946106, "learning_rate": 1.7309597523219817e-05, "loss": 0.5256, "step": 1740 }, { "epoch": 1.3544891640866874, "grad_norm": 33.3912239074707, "learning_rate": 1.7294117647058827e-05, "loss": 1.6058, "step": 1750 }, { "epoch": 1.3622291021671826, "grad_norm": 19.775798797607422, "learning_rate": 1.7278637770897833e-05, "loss": 0.8272, "step": 1760 }, { "epoch": 1.369969040247678, "grad_norm": 0.19204963743686676, "learning_rate": 1.7264705882352945e-05, "loss": 1.0239, "step": 1770 }, { "epoch": 1.3777089783281733, "grad_norm": 18.580913543701172, "learning_rate": 1.7249226006191952e-05, "loss": 0.4475, "step": 1780 }, { "epoch": 1.3854489164086687, "grad_norm": 1.0312788486480713, "learning_rate": 1.7233746130030962e-05, "loss": 0.2649, "step": 1790 }, { "epoch": 1.3931888544891642, "grad_norm": 0.35893258452415466, "learning_rate": 1.7218266253869972e-05, "loss": 0.5901, "step": 1800 }, { "epoch": 1.4009287925696594, "grad_norm": 0.03804398700594902, "learning_rate": 1.720278637770898e-05, "loss": 0.7927, "step": 1810 }, { "epoch": 1.4086687306501549, "grad_norm": 37.0048942565918, "learning_rate": 1.718730650154799e-05, "loss": 1.0768, "step": 1820 }, { "epoch": 1.41640866873065, "grad_norm": 0.021752920001745224, "learning_rate": 1.7171826625387e-05, "loss": 0.688, "step": 1830 }, { "epoch": 1.4241486068111455, "grad_norm": 0.02591339498758316, "learning_rate": 1.715634674922601e-05, "loss": 1.1263, "step": 1840 }, { "epoch": 1.431888544891641, "grad_norm": 11.517024993896484, "learning_rate": 1.7140866873065015e-05, "loss": 1.142, "step": 1850 }, { "epoch": 1.4396284829721362, "grad_norm": 20.871641159057617, "learning_rate": 1.7125386996904025e-05, "loss": 1.2256, "step": 1860 }, { "epoch": 1.4473684210526316, "grad_norm": 6.332366466522217, "learning_rate": 1.7109907120743035e-05, "loss": 0.0861, "step": 1870 }, { "epoch": 1.4551083591331269, "grad_norm": 0.021078817546367645, 
"learning_rate": 1.7094427244582045e-05, "loss": 0.3372, "step": 1880 }, { "epoch": 1.4628482972136223, "grad_norm": 35.069541931152344, "learning_rate": 1.7078947368421055e-05, "loss": 1.5673, "step": 1890 }, { "epoch": 1.4705882352941178, "grad_norm": 0.07564757019281387, "learning_rate": 1.7063467492260065e-05, "loss": 1.7361, "step": 1900 }, { "epoch": 1.478328173374613, "grad_norm": 1.1005617380142212, "learning_rate": 1.704798761609907e-05, "loss": 0.6976, "step": 1910 }, { "epoch": 1.4860681114551084, "grad_norm": 54.3600959777832, "learning_rate": 1.703250773993808e-05, "loss": 0.2337, "step": 1920 }, { "epoch": 1.4938080495356036, "grad_norm": 0.05760627239942551, "learning_rate": 1.701702786377709e-05, "loss": 1.6809, "step": 1930 }, { "epoch": 1.501547987616099, "grad_norm": 16.619665145874023, "learning_rate": 1.70015479876161e-05, "loss": 0.9658, "step": 1940 }, { "epoch": 1.5092879256965945, "grad_norm": 0.09029588103294373, "learning_rate": 1.6986068111455108e-05, "loss": 0.7521, "step": 1950 }, { "epoch": 1.5170278637770898, "grad_norm": 4.400278091430664, "learning_rate": 1.6970588235294118e-05, "loss": 0.807, "step": 1960 }, { "epoch": 1.524767801857585, "grad_norm": 0.014661544933915138, "learning_rate": 1.6955108359133128e-05, "loss": 0.9708, "step": 1970 }, { "epoch": 1.5325077399380804, "grad_norm": 13.40234661102295, "learning_rate": 1.6939628482972138e-05, "loss": 0.7447, "step": 1980 }, { "epoch": 1.5402476780185759, "grad_norm": 1.9354584217071533, "learning_rate": 1.6924148606811145e-05, "loss": 0.4995, "step": 1990 }, { "epoch": 1.5479876160990713, "grad_norm": 28.48665428161621, "learning_rate": 1.6908668730650155e-05, "loss": 0.942, "step": 2000 }, { "epoch": 1.5557275541795665, "grad_norm": 0.27385973930358887, "learning_rate": 1.6893188854489165e-05, "loss": 1.2704, "step": 2010 }, { "epoch": 1.5634674922600618, "grad_norm": 31.140518188476562, "learning_rate": 1.6877708978328175e-05, "loss": 0.973, "step": 2020 }, { "epoch": 
1.5712074303405572, "grad_norm": 1.630332350730896, "learning_rate": 1.6862229102167185e-05, "loss": 0.825, "step": 2030 }, { "epoch": 1.5789473684210527, "grad_norm": 26.483957290649414, "learning_rate": 1.6846749226006195e-05, "loss": 0.6268, "step": 2040 }, { "epoch": 1.586687306501548, "grad_norm": 0.22045840322971344, "learning_rate": 1.68312693498452e-05, "loss": 0.546, "step": 2050 }, { "epoch": 1.5944272445820433, "grad_norm": 8.974145889282227, "learning_rate": 1.681578947368421e-05, "loss": 1.2905, "step": 2060 }, { "epoch": 1.6021671826625385, "grad_norm": 9.615457534790039, "learning_rate": 1.680030959752322e-05, "loss": 1.3837, "step": 2070 }, { "epoch": 1.609907120743034, "grad_norm": 0.13241997361183167, "learning_rate": 1.678482972136223e-05, "loss": 0.077, "step": 2080 }, { "epoch": 1.6176470588235294, "grad_norm": 32.08259963989258, "learning_rate": 1.6769349845201238e-05, "loss": 1.0013, "step": 2090 }, { "epoch": 1.6253869969040249, "grad_norm": 25.423423767089844, "learning_rate": 1.6753869969040248e-05, "loss": 0.4924, "step": 2100 }, { "epoch": 1.63312693498452, "grad_norm": 0.07571066915988922, "learning_rate": 1.6738390092879258e-05, "loss": 0.0948, "step": 2110 }, { "epoch": 1.6408668730650153, "grad_norm": 37.0987663269043, "learning_rate": 1.6722910216718268e-05, "loss": 0.6736, "step": 2120 }, { "epoch": 1.6486068111455108, "grad_norm": 0.1293116807937622, "learning_rate": 1.6707430340557278e-05, "loss": 0.9082, "step": 2130 }, { "epoch": 1.6563467492260062, "grad_norm": 1.2246845960617065, "learning_rate": 1.6691950464396288e-05, "loss": 0.1941, "step": 2140 }, { "epoch": 1.6640866873065017, "grad_norm": 0.09594547003507614, "learning_rate": 1.6676470588235295e-05, "loss": 0.2725, "step": 2150 }, { "epoch": 1.671826625386997, "grad_norm": 27.792299270629883, "learning_rate": 1.6660990712074305e-05, "loss": 1.173, "step": 2160 }, { "epoch": 1.6795665634674921, "grad_norm": 39.21937561035156, "learning_rate": 1.6645510835913315e-05, 
"loss": 0.6402, "step": 2170 }, { "epoch": 1.6873065015479876, "grad_norm": 0.12942864000797272, "learning_rate": 1.6630030959752325e-05, "loss": 1.89, "step": 2180 }, { "epoch": 1.695046439628483, "grad_norm": 0.04539963975548744, "learning_rate": 1.661455108359133e-05, "loss": 0.9752, "step": 2190 }, { "epoch": 1.7027863777089784, "grad_norm": 12.6195650100708, "learning_rate": 1.659907120743034e-05, "loss": 1.4136, "step": 2200 }, { "epoch": 1.7105263157894737, "grad_norm": 0.04622410237789154, "learning_rate": 1.658359133126935e-05, "loss": 1.3654, "step": 2210 }, { "epoch": 1.718266253869969, "grad_norm": 13.263771057128906, "learning_rate": 1.656811145510836e-05, "loss": 0.639, "step": 2220 }, { "epoch": 1.7260061919504643, "grad_norm": 10.771790504455566, "learning_rate": 1.6552631578947368e-05, "loss": 0.6251, "step": 2230 }, { "epoch": 1.7337461300309598, "grad_norm": 0.0699467584490776, "learning_rate": 1.6537151702786378e-05, "loss": 0.6933, "step": 2240 }, { "epoch": 1.7414860681114552, "grad_norm": 8.696311950683594, "learning_rate": 1.6521671826625388e-05, "loss": 1.0198, "step": 2250 }, { "epoch": 1.7492260061919505, "grad_norm": 0.26265281438827515, "learning_rate": 1.6506191950464398e-05, "loss": 1.5758, "step": 2260 }, { "epoch": 1.7569659442724457, "grad_norm": 0.0739678293466568, "learning_rate": 1.6490712074303408e-05, "loss": 0.6469, "step": 2270 }, { "epoch": 1.7647058823529411, "grad_norm": 16.67066192626953, "learning_rate": 1.6475232198142418e-05, "loss": 1.2836, "step": 2280 }, { "epoch": 1.7724458204334366, "grad_norm": 12.96693229675293, "learning_rate": 1.6459752321981425e-05, "loss": 0.8455, "step": 2290 }, { "epoch": 1.780185758513932, "grad_norm": 15.42802906036377, "learning_rate": 1.6444272445820435e-05, "loss": 1.096, "step": 2300 }, { "epoch": 1.7879256965944272, "grad_norm": 0.6122158765792847, "learning_rate": 1.6428792569659445e-05, "loss": 1.1153, "step": 2310 }, { "epoch": 1.7956656346749225, "grad_norm": 25.35101318359375, 
"learning_rate": 1.6413312693498455e-05, "loss": 1.6331, "step": 2320 }, { "epoch": 1.803405572755418, "grad_norm": 0.13095131516456604, "learning_rate": 1.639783281733746e-05, "loss": 0.8313, "step": 2330 }, { "epoch": 1.8111455108359134, "grad_norm": 0.0269408468157053, "learning_rate": 1.638235294117647e-05, "loss": 0.4396, "step": 2340 }, { "epoch": 1.8188854489164088, "grad_norm": 0.0338105633854866, "learning_rate": 1.636687306501548e-05, "loss": 0.5798, "step": 2350 }, { "epoch": 1.826625386996904, "grad_norm": 0.09239430725574493, "learning_rate": 1.635139318885449e-05, "loss": 0.7027, "step": 2360 }, { "epoch": 1.8343653250773992, "grad_norm": 1.0628262758255005, "learning_rate": 1.6335913312693498e-05, "loss": 0.4091, "step": 2370 }, { "epoch": 1.8421052631578947, "grad_norm": 0.09572208672761917, "learning_rate": 1.632043343653251e-05, "loss": 0.6929, "step": 2380 }, { "epoch": 1.8498452012383901, "grad_norm": 6.947652339935303, "learning_rate": 1.6304953560371518e-05, "loss": 1.516, "step": 2390 }, { "epoch": 1.8575851393188856, "grad_norm": 35.1392822265625, "learning_rate": 1.6289473684210528e-05, "loss": 1.4484, "step": 2400 }, { "epoch": 1.8653250773993808, "grad_norm": 1.5096639394760132, "learning_rate": 1.6273993808049538e-05, "loss": 0.6211, "step": 2410 }, { "epoch": 1.873065015479876, "grad_norm": 0.06429480761289597, "learning_rate": 1.6258513931888548e-05, "loss": 1.5146, "step": 2420 }, { "epoch": 1.8808049535603715, "grad_norm": 6.4479451179504395, "learning_rate": 1.6243034055727554e-05, "loss": 0.5785, "step": 2430 }, { "epoch": 1.888544891640867, "grad_norm": 42.45814895629883, "learning_rate": 1.6227554179566564e-05, "loss": 1.3982, "step": 2440 }, { "epoch": 1.8962848297213624, "grad_norm": 0.32254356145858765, "learning_rate": 1.6212074303405574e-05, "loss": 0.7849, "step": 2450 }, { "epoch": 1.9040247678018576, "grad_norm": 0.028771542012691498, "learning_rate": 1.6196594427244584e-05, "loss": 1.7047, "step": 2460 }, { "epoch": 
1.9117647058823528, "grad_norm": 27.303747177124023, "learning_rate": 1.618111455108359e-05, "loss": 0.7179, "step": 2470 }, { "epoch": 1.9195046439628483, "grad_norm": 0.3763441741466522, "learning_rate": 1.61656346749226e-05, "loss": 0.3346, "step": 2480 }, { "epoch": 1.9272445820433437, "grad_norm": 0.11286848038434982, "learning_rate": 1.615015479876161e-05, "loss": 0.8063, "step": 2490 }, { "epoch": 1.9349845201238391, "grad_norm": 21.59604263305664, "learning_rate": 1.613467492260062e-05, "loss": 1.4715, "step": 2500 }, { "epoch": 1.9427244582043344, "grad_norm": 27.21000862121582, "learning_rate": 1.611919504643963e-05, "loss": 0.5636, "step": 2510 }, { "epoch": 1.9504643962848296, "grad_norm": 0.10806488245725632, "learning_rate": 1.610371517027864e-05, "loss": 0.6911, "step": 2520 }, { "epoch": 1.958204334365325, "grad_norm": 0.11146494001150131, "learning_rate": 1.6088235294117648e-05, "loss": 0.7332, "step": 2530 }, { "epoch": 1.9659442724458205, "grad_norm": 0.02843203768134117, "learning_rate": 1.6072755417956658e-05, "loss": 0.7937, "step": 2540 }, { "epoch": 1.973684210526316, "grad_norm": 18.05766487121582, "learning_rate": 1.6057275541795668e-05, "loss": 1.0725, "step": 2550 }, { "epoch": 1.9814241486068112, "grad_norm": 0.062031399458646774, "learning_rate": 1.6041795665634678e-05, "loss": 0.4602, "step": 2560 }, { "epoch": 1.9891640866873064, "grad_norm": 52.70853805541992, "learning_rate": 1.6026315789473684e-05, "loss": 0.3279, "step": 2570 }, { "epoch": 1.9969040247678018, "grad_norm": 0.12377507239580154, "learning_rate": 1.6010835913312694e-05, "loss": 1.4705, "step": 2580 }, { "epoch": 2.0, "eval_accuracy": 0.72, "eval_f1": 0.7168273504273505, "eval_loss": 1.1612240076065063, "eval_runtime": 1.3077, "eval_samples_per_second": 38.236, "eval_steps_per_second": 38.236, "step": 2584 }, { "epoch": 2.0046439628482973, "grad_norm": 0.04444746673107147, "learning_rate": 1.5995356037151704e-05, "loss": 0.1044, "step": 2590 }, { "epoch": 
2.0123839009287927, "grad_norm": 15.327290534973145, "learning_rate": 1.5979876160990714e-05, "loss": 0.4224, "step": 2600 }, { "epoch": 2.0201238390092877, "grad_norm": 0.10992716997861862, "learning_rate": 1.596439628482972e-05, "loss": 0.9307, "step": 2610 }, { "epoch": 2.027863777089783, "grad_norm": 21.27305793762207, "learning_rate": 1.594891640866873e-05, "loss": 0.4775, "step": 2620 }, { "epoch": 2.0356037151702786, "grad_norm": 0.056346993893384933, "learning_rate": 1.593343653250774e-05, "loss": 0.6093, "step": 2630 }, { "epoch": 2.043343653250774, "grad_norm": 32.548954010009766, "learning_rate": 1.591795665634675e-05, "loss": 1.3185, "step": 2640 }, { "epoch": 2.0510835913312695, "grad_norm": 2.01470947265625, "learning_rate": 1.590247678018576e-05, "loss": 0.2576, "step": 2650 }, { "epoch": 2.0588235294117645, "grad_norm": 1.1700825691223145, "learning_rate": 1.588699690402477e-05, "loss": 0.3939, "step": 2660 }, { "epoch": 2.06656346749226, "grad_norm": 0.4983654022216797, "learning_rate": 1.5871517027863778e-05, "loss": 0.8454, "step": 2670 }, { "epoch": 2.0743034055727554, "grad_norm": 36.228965759277344, "learning_rate": 1.5856037151702788e-05, "loss": 0.6533, "step": 2680 }, { "epoch": 2.082043343653251, "grad_norm": 0.03695230185985565, "learning_rate": 1.5840557275541798e-05, "loss": 1.2504, "step": 2690 }, { "epoch": 2.0897832817337463, "grad_norm": 0.199468195438385, "learning_rate": 1.5825077399380808e-05, "loss": 1.1332, "step": 2700 }, { "epoch": 2.0975232198142413, "grad_norm": 0.03587948530912399, "learning_rate": 1.5809597523219814e-05, "loss": 0.169, "step": 2710 }, { "epoch": 2.1052631578947367, "grad_norm": 0.38496339321136475, "learning_rate": 1.5794117647058824e-05, "loss": 0.6628, "step": 2720 }, { "epoch": 2.113003095975232, "grad_norm": 37.080039978027344, "learning_rate": 1.5778637770897834e-05, "loss": 0.9078, "step": 2730 }, { "epoch": 2.1207430340557276, "grad_norm": 0.4930584728717804, "learning_rate": 1.576315789473684e-05, 
"loss": 1.4462, "step": 2740 }, { "epoch": 2.128482972136223, "grad_norm": 0.13436377048492432, "learning_rate": 1.5747678018575854e-05, "loss": 0.459, "step": 2750 }, { "epoch": 2.136222910216718, "grad_norm": 31.625699996948242, "learning_rate": 1.5732198142414864e-05, "loss": 0.4214, "step": 2760 }, { "epoch": 2.1439628482972135, "grad_norm": 0.11084353923797607, "learning_rate": 1.571671826625387e-05, "loss": 0.2174, "step": 2770 }, { "epoch": 2.151702786377709, "grad_norm": 8.879451751708984, "learning_rate": 1.570123839009288e-05, "loss": 1.0263, "step": 2780 }, { "epoch": 2.1594427244582044, "grad_norm": 0.9105573296546936, "learning_rate": 1.568575851393189e-05, "loss": 0.3543, "step": 2790 }, { "epoch": 2.1671826625387, "grad_norm": 1.504798173904419, "learning_rate": 1.56702786377709e-05, "loss": 0.8409, "step": 2800 }, { "epoch": 2.174922600619195, "grad_norm": 31.001140594482422, "learning_rate": 1.5654798761609907e-05, "loss": 0.9455, "step": 2810 }, { "epoch": 2.1826625386996903, "grad_norm": 4.5337748527526855, "learning_rate": 1.5639318885448917e-05, "loss": 1.4447, "step": 2820 }, { "epoch": 2.1904024767801857, "grad_norm": 33.933013916015625, "learning_rate": 1.5623839009287927e-05, "loss": 1.1236, "step": 2830 }, { "epoch": 2.198142414860681, "grad_norm": 23.865520477294922, "learning_rate": 1.5608359133126934e-05, "loss": 1.4619, "step": 2840 }, { "epoch": 2.2058823529411766, "grad_norm": 0.37970438599586487, "learning_rate": 1.5592879256965944e-05, "loss": 1.1784, "step": 2850 }, { "epoch": 2.2136222910216716, "grad_norm": 0.5323224663734436, "learning_rate": 1.5577399380804954e-05, "loss": 0.0141, "step": 2860 }, { "epoch": 2.221362229102167, "grad_norm": 86.63206481933594, "learning_rate": 1.5561919504643964e-05, "loss": 0.4059, "step": 2870 }, { "epoch": 2.2291021671826625, "grad_norm": 12.29488468170166, "learning_rate": 1.5546439628482974e-05, "loss": 0.9241, "step": 2880 }, { "epoch": 2.236842105263158, "grad_norm": 5.396212577819824, 
"learning_rate": 1.5530959752321984e-05, "loss": 0.7794, "step": 2890 }, { "epoch": 2.2445820433436534, "grad_norm": 24.045963287353516, "learning_rate": 1.5515479876160994e-05, "loss": 1.4417, "step": 2900 }, { "epoch": 2.2523219814241484, "grad_norm": 0.03553316369652748, "learning_rate": 1.55e-05, "loss": 0.5436, "step": 2910 }, { "epoch": 2.260061919504644, "grad_norm": 0.06873774528503418, "learning_rate": 1.548452012383901e-05, "loss": 0.6071, "step": 2920 }, { "epoch": 2.2678018575851393, "grad_norm": 17.724674224853516, "learning_rate": 1.546904024767802e-05, "loss": 0.5822, "step": 2930 }, { "epoch": 2.2755417956656347, "grad_norm": 0.29069697856903076, "learning_rate": 1.545356037151703e-05, "loss": 0.6545, "step": 2940 }, { "epoch": 2.28328173374613, "grad_norm": 0.06973748654127121, "learning_rate": 1.5438080495356037e-05, "loss": 0.3732, "step": 2950 }, { "epoch": 2.291021671826625, "grad_norm": 0.040797796100378036, "learning_rate": 1.5422600619195047e-05, "loss": 1.5001, "step": 2960 }, { "epoch": 2.2987616099071206, "grad_norm": 0.19929097592830658, "learning_rate": 1.5407120743034057e-05, "loss": 0.456, "step": 2970 }, { "epoch": 2.306501547987616, "grad_norm": 69.55315399169922, "learning_rate": 1.5391640866873064e-05, "loss": 0.7225, "step": 2980 }, { "epoch": 2.3142414860681115, "grad_norm": 0.03111192025244236, "learning_rate": 1.5376160990712074e-05, "loss": 0.0568, "step": 2990 }, { "epoch": 2.321981424148607, "grad_norm": 0.05499303340911865, "learning_rate": 1.5360681114551087e-05, "loss": 1.267, "step": 3000 }, { "epoch": 2.329721362229102, "grad_norm": 21.98712158203125, "learning_rate": 1.5345201238390094e-05, "loss": 1.1007, "step": 3010 }, { "epoch": 2.3374613003095974, "grad_norm": 0.03790492191910744, "learning_rate": 1.5329721362229104e-05, "loss": 1.2134, "step": 3020 }, { "epoch": 2.345201238390093, "grad_norm": 33.72384262084961, "learning_rate": 1.5314241486068114e-05, "loss": 1.1896, "step": 3030 }, { "epoch": 
2.3529411764705883, "grad_norm": 38.13752365112305, "learning_rate": 1.5298761609907124e-05, "loss": 0.981, "step": 3040 }, { "epoch": 2.3606811145510838, "grad_norm": 50.29285430908203, "learning_rate": 1.528328173374613e-05, "loss": 0.4338, "step": 3050 }, { "epoch": 2.3684210526315788, "grad_norm": 0.4301454424858093, "learning_rate": 1.526780185758514e-05, "loss": 0.7296, "step": 3060 }, { "epoch": 2.376160990712074, "grad_norm": 0.16649337112903595, "learning_rate": 1.5252321981424149e-05, "loss": 0.3775, "step": 3070 }, { "epoch": 2.3839009287925697, "grad_norm": 0.23876243829727173, "learning_rate": 1.5236842105263159e-05, "loss": 0.4495, "step": 3080 }, { "epoch": 2.391640866873065, "grad_norm": 0.21197867393493652, "learning_rate": 1.5221362229102167e-05, "loss": 1.1692, "step": 3090 }, { "epoch": 2.3993808049535605, "grad_norm": 0.07017607986927032, "learning_rate": 1.5205882352941177e-05, "loss": 0.2007, "step": 3100 }, { "epoch": 2.4071207430340555, "grad_norm": 0.17968213558197021, "learning_rate": 1.5190402476780185e-05, "loss": 0.8, "step": 3110 }, { "epoch": 2.414860681114551, "grad_norm": 31.117765426635742, "learning_rate": 1.5174922600619195e-05, "loss": 1.2805, "step": 3120 }, { "epoch": 2.4226006191950464, "grad_norm": 1.7993072271347046, "learning_rate": 1.5159442724458207e-05, "loss": 0.6417, "step": 3130 }, { "epoch": 2.430340557275542, "grad_norm": 0.03542105853557587, "learning_rate": 1.5143962848297215e-05, "loss": 0.7523, "step": 3140 }, { "epoch": 2.4380804953560373, "grad_norm": 0.06181148439645767, "learning_rate": 1.5128482972136225e-05, "loss": 0.0153, "step": 3150 }, { "epoch": 2.4458204334365323, "grad_norm": 0.07718396931886673, "learning_rate": 1.5113003095975234e-05, "loss": 0.9057, "step": 3160 }, { "epoch": 2.4535603715170278, "grad_norm": 36.24127960205078, "learning_rate": 1.5097523219814244e-05, "loss": 0.5393, "step": 3170 }, { "epoch": 2.461300309597523, "grad_norm": 0.38641583919525146, "learning_rate": 
1.5082043343653252e-05, "loss": 0.356, "step": 3180 }, { "epoch": 2.4690402476780187, "grad_norm": 2.2882111072540283, "learning_rate": 1.506656346749226e-05, "loss": 0.2642, "step": 3190 }, { "epoch": 2.476780185758514, "grad_norm": 0.02347792498767376, "learning_rate": 1.505108359133127e-05, "loss": 0.6202, "step": 3200 }, { "epoch": 2.484520123839009, "grad_norm": 0.0761231854557991, "learning_rate": 1.5035603715170279e-05, "loss": 0.6635, "step": 3210 }, { "epoch": 2.4922600619195046, "grad_norm": 6.28391170501709, "learning_rate": 1.5020123839009289e-05, "loss": 0.6006, "step": 3220 }, { "epoch": 2.5, "grad_norm": 0.18807028234004974, "learning_rate": 1.5004643962848297e-05, "loss": 0.1729, "step": 3230 }, { "epoch": 2.5077399380804954, "grad_norm": 37.76420593261719, "learning_rate": 1.4989164086687307e-05, "loss": 0.4392, "step": 3240 }, { "epoch": 2.515479876160991, "grad_norm": 0.10174084454774857, "learning_rate": 1.4973684210526319e-05, "loss": 0.809, "step": 3250 }, { "epoch": 2.523219814241486, "grad_norm": 0.22918260097503662, "learning_rate": 1.4958204334365327e-05, "loss": 0.9881, "step": 3260 }, { "epoch": 2.5309597523219813, "grad_norm": 0.3912219703197479, "learning_rate": 1.4942724458204337e-05, "loss": 0.018, "step": 3270 }, { "epoch": 2.538699690402477, "grad_norm": 0.11304505169391632, "learning_rate": 1.4927244582043345e-05, "loss": 0.2779, "step": 3280 }, { "epoch": 2.5464396284829722, "grad_norm": 29.893775939941406, "learning_rate": 1.4911764705882354e-05, "loss": 0.3155, "step": 3290 }, { "epoch": 2.5541795665634677, "grad_norm": 0.10493505746126175, "learning_rate": 1.4896284829721364e-05, "loss": 1.8129, "step": 3300 }, { "epoch": 2.5619195046439627, "grad_norm": 61.5128173828125, "learning_rate": 1.4880804953560372e-05, "loss": 1.9661, "step": 3310 }, { "epoch": 2.569659442724458, "grad_norm": 0.0851774662733078, "learning_rate": 1.4865325077399382e-05, "loss": 0.5278, "step": 3320 }, { "epoch": 2.5773993808049536, "grad_norm": 
9.292902946472168, "learning_rate": 1.484984520123839e-05, "loss": 0.8827, "step": 3330 }, { "epoch": 2.585139318885449, "grad_norm": 0.07514331489801407, "learning_rate": 1.48343653250774e-05, "loss": 0.4951, "step": 3340 }, { "epoch": 2.5928792569659445, "grad_norm": 0.08628485351800919, "learning_rate": 1.4818885448916409e-05, "loss": 1.2251, "step": 3350 }, { "epoch": 2.6006191950464395, "grad_norm": 0.5280678868293762, "learning_rate": 1.4803405572755419e-05, "loss": 0.6581, "step": 3360 }, { "epoch": 2.608359133126935, "grad_norm": 0.037730976939201355, "learning_rate": 1.478792569659443e-05, "loss": 0.6062, "step": 3370 }, { "epoch": 2.6160990712074303, "grad_norm": 0.7146918177604675, "learning_rate": 1.4772445820433439e-05, "loss": 0.1867, "step": 3380 }, { "epoch": 2.623839009287926, "grad_norm": 0.6190572381019592, "learning_rate": 1.4756965944272449e-05, "loss": 0.8437, "step": 3390 }, { "epoch": 2.6315789473684212, "grad_norm": 14.108997344970703, "learning_rate": 1.4741486068111457e-05, "loss": 0.5242, "step": 3400 }, { "epoch": 2.6393188854489162, "grad_norm": 0.051540836691856384, "learning_rate": 1.4726006191950465e-05, "loss": 0.5259, "step": 3410 }, { "epoch": 2.6470588235294117, "grad_norm": 21.807861328125, "learning_rate": 1.4710526315789475e-05, "loss": 0.8135, "step": 3420 }, { "epoch": 2.654798761609907, "grad_norm": 51.1254997253418, "learning_rate": 1.4695046439628484e-05, "loss": 1.08, "step": 3430 }, { "epoch": 2.6625386996904026, "grad_norm": 0.1269294023513794, "learning_rate": 1.4679566563467494e-05, "loss": 0.5714, "step": 3440 }, { "epoch": 2.670278637770898, "grad_norm": 0.2944725751876831, "learning_rate": 1.4664086687306502e-05, "loss": 0.4598, "step": 3450 }, { "epoch": 2.678018575851393, "grad_norm": 0.050833508372306824, "learning_rate": 1.4648606811145512e-05, "loss": 0.5808, "step": 3460 }, { "epoch": 2.6857585139318885, "grad_norm": 6.656693458557129, "learning_rate": 1.463312693498452e-05, "loss": 0.0297, "step": 3470 }, 
{ "epoch": 2.693498452012384, "grad_norm": 35.97060775756836, "learning_rate": 1.461764705882353e-05, "loss": 0.4452, "step": 3480 }, { "epoch": 2.7012383900928794, "grad_norm": 0.047542016953229904, "learning_rate": 1.4602167182662538e-05, "loss": 1.6265, "step": 3490 }, { "epoch": 2.708978328173375, "grad_norm": 0.06533114612102509, "learning_rate": 1.458668730650155e-05, "loss": 0.3721, "step": 3500 }, { "epoch": 2.71671826625387, "grad_norm": 0.04569459334015846, "learning_rate": 1.4571207430340558e-05, "loss": 0.3591, "step": 3510 }, { "epoch": 2.7244582043343653, "grad_norm": 0.03699889034032822, "learning_rate": 1.4555727554179568e-05, "loss": 0.5377, "step": 3520 }, { "epoch": 2.7321981424148607, "grad_norm": 23.03508758544922, "learning_rate": 1.4540247678018577e-05, "loss": 0.1592, "step": 3530 }, { "epoch": 2.739938080495356, "grad_norm": 1.258757472038269, "learning_rate": 1.4524767801857587e-05, "loss": 1.3272, "step": 3540 }, { "epoch": 2.7476780185758516, "grad_norm": 29.31418800354004, "learning_rate": 1.4509287925696595e-05, "loss": 0.3853, "step": 3550 }, { "epoch": 2.7554179566563466, "grad_norm": 0.05578644946217537, "learning_rate": 1.4493808049535605e-05, "loss": 0.3107, "step": 3560 }, { "epoch": 2.763157894736842, "grad_norm": 0.08776181191205978, "learning_rate": 1.4478328173374613e-05, "loss": 0.369, "step": 3570 }, { "epoch": 2.7708978328173375, "grad_norm": 0.36477822065353394, "learning_rate": 1.4462848297213623e-05, "loss": 1.1087, "step": 3580 }, { "epoch": 2.778637770897833, "grad_norm": 65.80636596679688, "learning_rate": 1.4447368421052632e-05, "loss": 0.757, "step": 3590 }, { "epoch": 2.7863777089783284, "grad_norm": 1.9613147974014282, "learning_rate": 1.4431888544891642e-05, "loss": 0.4403, "step": 3600 }, { "epoch": 2.7941176470588234, "grad_norm": 40.365142822265625, "learning_rate": 1.441640866873065e-05, "loss": 0.852, "step": 3610 }, { "epoch": 2.801857585139319, "grad_norm": 0.03506498783826828, "learning_rate": 
1.4400928792569662e-05, "loss": 0.9358, "step": 3620 }, { "epoch": 2.8095975232198143, "grad_norm": 11.416526794433594, "learning_rate": 1.438544891640867e-05, "loss": 1.2572, "step": 3630 }, { "epoch": 2.8173374613003097, "grad_norm": 0.20842906832695007, "learning_rate": 1.436996904024768e-05, "loss": 0.6845, "step": 3640 }, { "epoch": 2.825077399380805, "grad_norm": 0.3351168632507324, "learning_rate": 1.4354489164086688e-05, "loss": 0.8545, "step": 3650 }, { "epoch": 2.8328173374613, "grad_norm": 0.06866948306560516, "learning_rate": 1.4339009287925698e-05, "loss": 1.8003, "step": 3660 }, { "epoch": 2.8405572755417956, "grad_norm": 0.05061683431267738, "learning_rate": 1.4323529411764707e-05, "loss": 0.2537, "step": 3670 }, { "epoch": 2.848297213622291, "grad_norm": 8.898414611816406, "learning_rate": 1.4308049535603717e-05, "loss": 0.6529, "step": 3680 }, { "epoch": 2.8560371517027865, "grad_norm": 8.195051193237305, "learning_rate": 1.4292569659442725e-05, "loss": 0.4924, "step": 3690 }, { "epoch": 2.863777089783282, "grad_norm": 0.9663034081459045, "learning_rate": 1.4277089783281735e-05, "loss": 0.1161, "step": 3700 }, { "epoch": 2.871517027863777, "grad_norm": 0.09168121218681335, "learning_rate": 1.4261609907120743e-05, "loss": 0.6791, "step": 3710 }, { "epoch": 2.8792569659442724, "grad_norm": 0.4560064375400543, "learning_rate": 1.4246130030959753e-05, "loss": 0.3524, "step": 3720 }, { "epoch": 2.886996904024768, "grad_norm": 0.05033763125538826, "learning_rate": 1.4230650154798762e-05, "loss": 0.8217, "step": 3730 }, { "epoch": 2.8947368421052633, "grad_norm": 4.335209369659424, "learning_rate": 1.4215170278637772e-05, "loss": 1.0966, "step": 3740 }, { "epoch": 2.9024767801857587, "grad_norm": 0.0823301449418068, "learning_rate": 1.4199690402476782e-05, "loss": 0.1504, "step": 3750 }, { "epoch": 2.9102167182662537, "grad_norm": 2.030927896499634, "learning_rate": 1.4184210526315792e-05, "loss": 0.1468, "step": 3760 }, { "epoch": 2.917956656346749, 
"grad_norm": 11.15888786315918, "learning_rate": 1.41687306501548e-05, "loss": 0.132, "step": 3770 }, { "epoch": 2.9256965944272446, "grad_norm": 42.66494369506836, "learning_rate": 1.415325077399381e-05, "loss": 1.4952, "step": 3780 }, { "epoch": 2.93343653250774, "grad_norm": 0.563718318939209, "learning_rate": 1.4137770897832818e-05, "loss": 0.2536, "step": 3790 }, { "epoch": 2.9411764705882355, "grad_norm": 0.08501070737838745, "learning_rate": 1.4122291021671828e-05, "loss": 0.0205, "step": 3800 }, { "epoch": 2.9489164086687305, "grad_norm": 0.034077223390340805, "learning_rate": 1.4106811145510837e-05, "loss": 2.1656, "step": 3810 }, { "epoch": 2.956656346749226, "grad_norm": 0.7754413485527039, "learning_rate": 1.4091331269349847e-05, "loss": 0.5234, "step": 3820 }, { "epoch": 2.9643962848297214, "grad_norm": 35.28246307373047, "learning_rate": 1.4075851393188855e-05, "loss": 1.11, "step": 3830 }, { "epoch": 2.972136222910217, "grad_norm": 0.39776936173439026, "learning_rate": 1.4060371517027865e-05, "loss": 0.3902, "step": 3840 }, { "epoch": 2.9798761609907123, "grad_norm": 0.3578409254550934, "learning_rate": 1.4044891640866873e-05, "loss": 0.858, "step": 3850 }, { "epoch": 2.9876160990712073, "grad_norm": 0.040049389004707336, "learning_rate": 1.4029411764705883e-05, "loss": 1.6966, "step": 3860 }, { "epoch": 2.9953560371517027, "grad_norm": 0.05782315880060196, "learning_rate": 1.4013931888544893e-05, "loss": 0.6452, "step": 3870 }, { "epoch": 3.0, "eval_accuracy": 0.76, "eval_f1": 0.7598095238095237, "eval_loss": 0.9912994503974915, "eval_runtime": 1.2965, "eval_samples_per_second": 38.565, "eval_steps_per_second": 38.565, "step": 3876 }, { "epoch": 3.003095975232198, "grad_norm": 49.298065185546875, "learning_rate": 1.3998452012383903e-05, "loss": 0.5044, "step": 3880 }, { "epoch": 3.0108359133126936, "grad_norm": 0.04470430314540863, "learning_rate": 1.3982972136222911e-05, "loss": 0.3059, "step": 3890 }, { "epoch": 3.018575851393189, "grad_norm": 
0.08331624418497086, "learning_rate": 1.3967492260061921e-05, "loss": 0.7811, "step": 3900 }, { "epoch": 3.026315789473684, "grad_norm": 0.09133301675319672, "learning_rate": 1.395201238390093e-05, "loss": 0.8812, "step": 3910 }, { "epoch": 3.0340557275541795, "grad_norm": 44.76803970336914, "learning_rate": 1.393653250773994e-05, "loss": 0.9143, "step": 3920 }, { "epoch": 3.041795665634675, "grad_norm": 0.11133873462677002, "learning_rate": 1.3921052631578948e-05, "loss": 1.1227, "step": 3930 }, { "epoch": 3.0495356037151704, "grad_norm": 0.2368796318769455, "learning_rate": 1.3905572755417958e-05, "loss": 0.0585, "step": 3940 }, { "epoch": 3.057275541795666, "grad_norm": 0.029614118859171867, "learning_rate": 1.3890092879256966e-05, "loss": 0.3534, "step": 3950 }, { "epoch": 3.065015479876161, "grad_norm": 8.862608909606934, "learning_rate": 1.3874613003095976e-05, "loss": 0.1063, "step": 3960 }, { "epoch": 3.0727554179566563, "grad_norm": 0.5369104146957397, "learning_rate": 1.3859133126934985e-05, "loss": 1.1207, "step": 3970 }, { "epoch": 3.0804953560371517, "grad_norm": 1.7445117235183716, "learning_rate": 1.3843653250773995e-05, "loss": 0.1972, "step": 3980 }, { "epoch": 3.088235294117647, "grad_norm": 0.06086982041597366, "learning_rate": 1.3828173374613003e-05, "loss": 0.5166, "step": 3990 }, { "epoch": 3.0959752321981426, "grad_norm": 0.5017050504684448, "learning_rate": 1.3812693498452015e-05, "loss": 1.5453, "step": 4000 }, { "epoch": 3.1037151702786376, "grad_norm": 0.04809604585170746, "learning_rate": 1.3797213622291023e-05, "loss": 0.0112, "step": 4010 }, { "epoch": 3.111455108359133, "grad_norm": 0.02549964003264904, "learning_rate": 1.3781733746130033e-05, "loss": 0.5002, "step": 4020 }, { "epoch": 3.1191950464396285, "grad_norm": 0.11794928461313248, "learning_rate": 1.3766253869969041e-05, "loss": 0.023, "step": 4030 }, { "epoch": 3.126934984520124, "grad_norm": 0.059615358710289, "learning_rate": 1.3750773993808051e-05, "loss": 0.2827, "step": 
4040 }, { "epoch": 3.1346749226006194, "grad_norm": 50.299591064453125, "learning_rate": 1.373529411764706e-05, "loss": 1.3132, "step": 4050 }, { "epoch": 3.1424148606811144, "grad_norm": 28.69629669189453, "learning_rate": 1.371981424148607e-05, "loss": 1.2628, "step": 4060 }, { "epoch": 3.15015479876161, "grad_norm": 0.0947827473282814, "learning_rate": 1.3704334365325078e-05, "loss": 0.3177, "step": 4070 }, { "epoch": 3.1578947368421053, "grad_norm": 16.990875244140625, "learning_rate": 1.3688854489164088e-05, "loss": 0.437, "step": 4080 }, { "epoch": 3.1656346749226008, "grad_norm": 0.045119963586330414, "learning_rate": 1.3673374613003096e-05, "loss": 0.0592, "step": 4090 }, { "epoch": 3.173374613003096, "grad_norm": 0.10691500455141068, "learning_rate": 1.3657894736842106e-05, "loss": 0.5114, "step": 4100 }, { "epoch": 3.181114551083591, "grad_norm": 0.19114838540554047, "learning_rate": 1.3642414860681115e-05, "loss": 1.1726, "step": 4110 }, { "epoch": 3.1888544891640866, "grad_norm": 0.057168181985616684, "learning_rate": 1.3626934984520126e-05, "loss": 0.6743, "step": 4120 }, { "epoch": 3.196594427244582, "grad_norm": 2.531748056411743, "learning_rate": 1.3611455108359135e-05, "loss": 0.5969, "step": 4130 }, { "epoch": 3.2043343653250775, "grad_norm": 0.03029949963092804, "learning_rate": 1.3595975232198145e-05, "loss": 1.3093, "step": 4140 }, { "epoch": 3.212074303405573, "grad_norm": 0.04683234915137291, "learning_rate": 1.3580495356037153e-05, "loss": 0.7787, "step": 4150 }, { "epoch": 3.219814241486068, "grad_norm": 1.2075011730194092, "learning_rate": 1.3565015479876163e-05, "loss": 0.2168, "step": 4160 }, { "epoch": 3.2275541795665634, "grad_norm": 0.08152808248996735, "learning_rate": 1.3549535603715171e-05, "loss": 0.6047, "step": 4170 }, { "epoch": 3.235294117647059, "grad_norm": 30.323863983154297, "learning_rate": 1.3534055727554181e-05, "loss": 1.2015, "step": 4180 }, { "epoch": 3.2430340557275543, "grad_norm": 0.11758198589086533, 
"learning_rate": 1.351857585139319e-05, "loss": 0.4192, "step": 4190 }, { "epoch": 3.2507739938080498, "grad_norm": 9.75107479095459, "learning_rate": 1.35030959752322e-05, "loss": 1.0658, "step": 4200 }, { "epoch": 3.2585139318885448, "grad_norm": 0.030274180695414543, "learning_rate": 1.3487616099071208e-05, "loss": 1.3553, "step": 4210 }, { "epoch": 3.26625386996904, "grad_norm": 0.1400892287492752, "learning_rate": 1.3472136222910218e-05, "loss": 0.0268, "step": 4220 }, { "epoch": 3.2739938080495357, "grad_norm": 0.18268881738185883, "learning_rate": 1.3456656346749226e-05, "loss": 0.0315, "step": 4230 }, { "epoch": 3.281733746130031, "grad_norm": 7.974414348602295, "learning_rate": 1.3441176470588238e-05, "loss": 1.2321, "step": 4240 }, { "epoch": 3.2894736842105265, "grad_norm": 0.08304005116224289, "learning_rate": 1.3425696594427246e-05, "loss": 0.4877, "step": 4250 }, { "epoch": 3.2972136222910216, "grad_norm": 3.9400973320007324, "learning_rate": 1.3410216718266256e-05, "loss": 0.3918, "step": 4260 }, { "epoch": 3.304953560371517, "grad_norm": 3.499447822570801, "learning_rate": 1.3394736842105264e-05, "loss": 0.6642, "step": 4270 }, { "epoch": 3.3126934984520124, "grad_norm": 41.200531005859375, "learning_rate": 1.3379256965944274e-05, "loss": 1.0846, "step": 4280 }, { "epoch": 3.320433436532508, "grad_norm": 17.365467071533203, "learning_rate": 1.3363777089783283e-05, "loss": 0.9964, "step": 4290 }, { "epoch": 3.3281733746130033, "grad_norm": 0.07096298038959503, "learning_rate": 1.3348297213622293e-05, "loss": 1.0846, "step": 4300 }, { "epoch": 3.3359133126934983, "grad_norm": 0.042028624564409256, "learning_rate": 1.3332817337461301e-05, "loss": 0.9487, "step": 4310 }, { "epoch": 3.343653250773994, "grad_norm": 1.071221113204956, "learning_rate": 1.3317337461300311e-05, "loss": 0.6097, "step": 4320 }, { "epoch": 3.3513931888544892, "grad_norm": 47.64500045776367, "learning_rate": 1.330185758513932e-05, "loss": 0.5178, "step": 4330 }, { "epoch": 
3.3591331269349847, "grad_norm": 0.06479723006486893, "learning_rate": 1.328637770897833e-05, "loss": 0.9685, "step": 4340 }, { "epoch": 3.36687306501548, "grad_norm": 2.724148988723755, "learning_rate": 1.3270897832817338e-05, "loss": 0.1263, "step": 4350 }, { "epoch": 3.374613003095975, "grad_norm": 27.397666931152344, "learning_rate": 1.3255417956656346e-05, "loss": 1.0686, "step": 4360 }, { "epoch": 3.3823529411764706, "grad_norm": 0.4985007047653198, "learning_rate": 1.3239938080495358e-05, "loss": 1.2225, "step": 4370 }, { "epoch": 3.390092879256966, "grad_norm": 43.03326416015625, "learning_rate": 1.3224458204334368e-05, "loss": 0.3144, "step": 4380 }, { "epoch": 3.3978328173374615, "grad_norm": 0.035674821585416794, "learning_rate": 1.3208978328173376e-05, "loss": 0.1895, "step": 4390 }, { "epoch": 3.405572755417957, "grad_norm": 0.23882943391799927, "learning_rate": 1.3193498452012386e-05, "loss": 0.5685, "step": 4400 }, { "epoch": 3.413312693498452, "grad_norm": 0.17329153418540955, "learning_rate": 1.3178018575851394e-05, "loss": 1.7336, "step": 4410 }, { "epoch": 3.4210526315789473, "grad_norm": 0.314429372549057, "learning_rate": 1.3162538699690404e-05, "loss": 0.4558, "step": 4420 }, { "epoch": 3.428792569659443, "grad_norm": 0.029965396970510483, "learning_rate": 1.3147058823529413e-05, "loss": 0.0109, "step": 4430 }, { "epoch": 3.4365325077399382, "grad_norm": 0.5258276462554932, "learning_rate": 1.3131578947368423e-05, "loss": 0.4063, "step": 4440 }, { "epoch": 3.4442724458204337, "grad_norm": 43.69557571411133, "learning_rate": 1.3116099071207431e-05, "loss": 0.5241, "step": 4450 }, { "epoch": 3.4520123839009287, "grad_norm": 0.07699594646692276, "learning_rate": 1.310061919504644e-05, "loss": 0.2252, "step": 4460 }, { "epoch": 3.459752321981424, "grad_norm": 0.037656739354133606, "learning_rate": 1.308513931888545e-05, "loss": 0.7423, "step": 4470 }, { "epoch": 3.4674922600619196, "grad_norm": 0.01498746033757925, "learning_rate": 
1.3069659442724458e-05, "loss": 0.1875, "step": 4480 }, { "epoch": 3.475232198142415, "grad_norm": 0.9402735233306885, "learning_rate": 1.305417956656347e-05, "loss": 0.2188, "step": 4490 }, { "epoch": 3.4829721362229105, "grad_norm": 49.73341751098633, "learning_rate": 1.303869969040248e-05, "loss": 0.6394, "step": 4500 }, { "epoch": 3.4907120743034055, "grad_norm": 8.402969360351562, "learning_rate": 1.3023219814241488e-05, "loss": 0.5003, "step": 4510 }, { "epoch": 3.498452012383901, "grad_norm": 0.04983089119195938, "learning_rate": 1.3007739938080498e-05, "loss": 1.1356, "step": 4520 }, { "epoch": 3.5061919504643964, "grad_norm": 3.2768659591674805, "learning_rate": 1.2992260061919506e-05, "loss": 1.2293, "step": 4530 }, { "epoch": 3.513931888544892, "grad_norm": 22.382577896118164, "learning_rate": 1.2976780185758516e-05, "loss": 0.4556, "step": 4540 }, { "epoch": 3.5216718266253872, "grad_norm": 0.26380711793899536, "learning_rate": 1.2961300309597524e-05, "loss": 0.0621, "step": 4550 }, { "epoch": 3.5294117647058822, "grad_norm": 8.952470779418945, "learning_rate": 1.2945820433436534e-05, "loss": 1.0895, "step": 4560 }, { "epoch": 3.5371517027863777, "grad_norm": 0.28140974044799805, "learning_rate": 1.2930340557275543e-05, "loss": 0.7888, "step": 4570 }, { "epoch": 3.544891640866873, "grad_norm": 0.034215766936540604, "learning_rate": 1.291486068111455e-05, "loss": 0.0267, "step": 4580 }, { "epoch": 3.5526315789473686, "grad_norm": 0.03821820765733719, "learning_rate": 1.289938080495356e-05, "loss": 0.7483, "step": 4590 }, { "epoch": 3.560371517027864, "grad_norm": 4.500551700592041, "learning_rate": 1.2883900928792569e-05, "loss": 0.0459, "step": 4600 }, { "epoch": 3.568111455108359, "grad_norm": 0.021589813753962517, "learning_rate": 1.2868421052631579e-05, "loss": 0.8281, "step": 4610 }, { "epoch": 3.5758513931888545, "grad_norm": 0.03971223905682564, "learning_rate": 1.285294117647059e-05, "loss": 0.9086, "step": 4620 }, { "epoch": 3.58359133126935, 
"grad_norm": 0.18547774851322174, "learning_rate": 1.2837461300309599e-05, "loss": 0.0172, "step": 4630 }, { "epoch": 3.5913312693498454, "grad_norm": 1.1909455060958862, "learning_rate": 1.2821981424148609e-05, "loss": 0.0213, "step": 4640 }, { "epoch": 3.599071207430341, "grad_norm": 0.036741238087415695, "learning_rate": 1.2806501547987617e-05, "loss": 0.5754, "step": 4650 }, { "epoch": 3.606811145510836, "grad_norm": 0.309771329164505, "learning_rate": 1.2791021671826627e-05, "loss": 0.2633, "step": 4660 }, { "epoch": 3.6145510835913313, "grad_norm": 0.10040708631277084, "learning_rate": 1.2775541795665636e-05, "loss": 0.0441, "step": 4670 }, { "epoch": 3.6222910216718267, "grad_norm": 25.874502182006836, "learning_rate": 1.2760061919504644e-05, "loss": 0.532, "step": 4680 }, { "epoch": 3.6300309597523217, "grad_norm": 0.06081942841410637, "learning_rate": 1.2744582043343654e-05, "loss": 0.5983, "step": 4690 }, { "epoch": 3.6377708978328176, "grad_norm": 0.05650794506072998, "learning_rate": 1.2729102167182662e-05, "loss": 0.037, "step": 4700 }, { "epoch": 3.6455108359133126, "grad_norm": 0.04229149967432022, "learning_rate": 1.2713622291021672e-05, "loss": 0.1202, "step": 4710 }, { "epoch": 3.653250773993808, "grad_norm": 0.42258989810943604, "learning_rate": 1.269814241486068e-05, "loss": 0.4526, "step": 4720 }, { "epoch": 3.6609907120743035, "grad_norm": 1.6414612531661987, "learning_rate": 1.268266253869969e-05, "loss": 0.0191, "step": 4730 }, { "epoch": 3.6687306501547985, "grad_norm": 0.029779210686683655, "learning_rate": 1.2667182662538702e-05, "loss": 0.6119, "step": 4740 }, { "epoch": 3.6764705882352944, "grad_norm": 0.03590656816959381, "learning_rate": 1.265170278637771e-05, "loss": 0.5064, "step": 4750 }, { "epoch": 3.6842105263157894, "grad_norm": 0.040363505482673645, "learning_rate": 1.263622291021672e-05, "loss": 0.2463, "step": 4760 }, { "epoch": 3.691950464396285, "grad_norm": 0.06570034474134445, "learning_rate": 1.2620743034055729e-05, 
"loss": 1.0068, "step": 4770 }, { "epoch": 3.6996904024767803, "grad_norm": 0.11740195751190186, "learning_rate": 1.2605263157894739e-05, "loss": 1.1699, "step": 4780 }, { "epoch": 3.7074303405572753, "grad_norm": 36.13241958618164, "learning_rate": 1.2589783281733747e-05, "loss": 0.1493, "step": 4790 }, { "epoch": 3.715170278637771, "grad_norm": 0.05624663457274437, "learning_rate": 1.2574303405572756e-05, "loss": 0.0808, "step": 4800 }, { "epoch": 3.722910216718266, "grad_norm": 0.018279779702425003, "learning_rate": 1.2558823529411766e-05, "loss": 0.1165, "step": 4810 }, { "epoch": 3.7306501547987616, "grad_norm": 0.03757398948073387, "learning_rate": 1.2543343653250774e-05, "loss": 0.0845, "step": 4820 }, { "epoch": 3.738390092879257, "grad_norm": 0.01982535794377327, "learning_rate": 1.2527863777089784e-05, "loss": 0.3013, "step": 4830 }, { "epoch": 3.746130030959752, "grad_norm": 38.11370086669922, "learning_rate": 1.2512383900928792e-05, "loss": 0.7985, "step": 4840 }, { "epoch": 3.753869969040248, "grad_norm": 0.06611485779285431, "learning_rate": 1.2498452012383902e-05, "loss": 0.2028, "step": 4850 }, { "epoch": 3.761609907120743, "grad_norm": 0.029886670410633087, "learning_rate": 1.248297213622291e-05, "loss": 0.2808, "step": 4860 }, { "epoch": 3.7693498452012384, "grad_norm": 0.01799454726278782, "learning_rate": 1.2467492260061919e-05, "loss": 0.7497, "step": 4870 }, { "epoch": 3.777089783281734, "grad_norm": 0.8087806701660156, "learning_rate": 1.2452012383900929e-05, "loss": 0.7306, "step": 4880 }, { "epoch": 3.784829721362229, "grad_norm": 0.026307599619030952, "learning_rate": 1.243653250773994e-05, "loss": 1.0238, "step": 4890 }, { "epoch": 3.7925696594427247, "grad_norm": 0.4978444278240204, "learning_rate": 1.2421052631578949e-05, "loss": 0.0621, "step": 4900 }, { "epoch": 3.8003095975232197, "grad_norm": 0.027980167418718338, "learning_rate": 1.2405572755417959e-05, "loss": 0.5647, "step": 4910 }, { "epoch": 3.808049535603715, "grad_norm": 
0.026450349017977715, "learning_rate": 1.2390092879256967e-05, "loss": 0.7702, "step": 4920 }, { "epoch": 3.8157894736842106, "grad_norm": 0.039206575602293015, "learning_rate": 1.2374613003095977e-05, "loss": 0.5643, "step": 4930 }, { "epoch": 3.8235294117647056, "grad_norm": 0.08837167918682098, "learning_rate": 1.2359133126934986e-05, "loss": 0.3886, "step": 4940 }, { "epoch": 3.8312693498452015, "grad_norm": 0.3407689929008484, "learning_rate": 1.2343653250773996e-05, "loss": 0.6649, "step": 4950 }, { "epoch": 3.8390092879256965, "grad_norm": 26.24752426147461, "learning_rate": 1.2328173374613004e-05, "loss": 0.5842, "step": 4960 }, { "epoch": 3.846749226006192, "grad_norm": 0.03525685891509056, "learning_rate": 1.2312693498452014e-05, "loss": 0.6775, "step": 4970 }, { "epoch": 3.8544891640866874, "grad_norm": 0.03134894371032715, "learning_rate": 1.2297213622291022e-05, "loss": 0.3044, "step": 4980 }, { "epoch": 3.8622291021671824, "grad_norm": 0.11389874666929245, "learning_rate": 1.228173374613003e-05, "loss": 0.828, "step": 4990 }, { "epoch": 3.8699690402476783, "grad_norm": 27.99997901916504, "learning_rate": 1.226625386996904e-05, "loss": 0.1932, "step": 5000 }, { "epoch": 3.8777089783281733, "grad_norm": 0.16149520874023438, "learning_rate": 1.2250773993808049e-05, "loss": 0.0413, "step": 5010 }, { "epoch": 3.8854489164086687, "grad_norm": 0.1479569673538208, "learning_rate": 1.223529411764706e-05, "loss": 1.0729, "step": 5020 }, { "epoch": 3.893188854489164, "grad_norm": 22.204856872558594, "learning_rate": 1.221981424148607e-05, "loss": 1.0195, "step": 5030 }, { "epoch": 3.900928792569659, "grad_norm": 0.05954609811306, "learning_rate": 1.2204334365325079e-05, "loss": 0.0788, "step": 5040 }, { "epoch": 3.9086687306501546, "grad_norm": 7.796119213104248, "learning_rate": 1.2188854489164089e-05, "loss": 1.1504, "step": 5050 }, { "epoch": 3.91640866873065, "grad_norm": 2.9817376136779785, "learning_rate": 1.2173374613003097e-05, "loss": 0.5464, "step": 
5060 }, { "epoch": 3.9241486068111455, "grad_norm": 0.0389409214258194, "learning_rate": 1.2157894736842107e-05, "loss": 1.1231, "step": 5070 }, { "epoch": 3.931888544891641, "grad_norm": 4.861098289489746, "learning_rate": 1.2142414860681115e-05, "loss": 0.0601, "step": 5080 }, { "epoch": 3.939628482972136, "grad_norm": 0.09085500985383987, "learning_rate": 1.2126934984520124e-05, "loss": 0.6197, "step": 5090 }, { "epoch": 3.9473684210526314, "grad_norm": 33.50436019897461, "learning_rate": 1.2111455108359134e-05, "loss": 1.2418, "step": 5100 }, { "epoch": 3.955108359133127, "grad_norm": 56.94590759277344, "learning_rate": 1.2095975232198142e-05, "loss": 1.7285, "step": 5110 }, { "epoch": 3.9628482972136223, "grad_norm": 0.020337959751486778, "learning_rate": 1.2080495356037152e-05, "loss": 0.6613, "step": 5120 }, { "epoch": 3.9705882352941178, "grad_norm": 0.029728984460234642, "learning_rate": 1.206501547987616e-05, "loss": 0.6961, "step": 5130 }, { "epoch": 3.9783281733746128, "grad_norm": 19.383586883544922, "learning_rate": 1.2049535603715172e-05, "loss": 0.878, "step": 5140 }, { "epoch": 3.986068111455108, "grad_norm": 0.02791929431259632, "learning_rate": 1.2034055727554182e-05, "loss": 0.9056, "step": 5150 }, { "epoch": 3.9938080495356036, "grad_norm": 0.03993268683552742, "learning_rate": 1.201857585139319e-05, "loss": 0.0372, "step": 5160 }, { "epoch": 4.0, "eval_accuracy": 0.8, "eval_f1": 0.7991111111111111, "eval_loss": 0.884955883026123, "eval_runtime": 1.3066, "eval_samples_per_second": 38.267, "eval_steps_per_second": 38.267, "step": 5168 }, { "epoch": 4.001547987616099, "grad_norm": 0.03281019628047943, "learning_rate": 1.20030959752322e-05, "loss": 0.6162, "step": 5170 }, { "epoch": 4.0092879256965945, "grad_norm": 8.634324073791504, "learning_rate": 1.1987616099071209e-05, "loss": 2.1323, "step": 5180 }, { "epoch": 4.0170278637770895, "grad_norm": 29.158843994140625, "learning_rate": 1.1972136222910219e-05, "loss": 0.8002, "step": 5190 }, { 
"epoch": 4.024767801857585, "grad_norm": 10.542095184326172, "learning_rate": 1.1956656346749227e-05, "loss": 1.1514, "step": 5200 }, { "epoch": 4.03250773993808, "grad_norm": 2.3038291931152344, "learning_rate": 1.1941176470588235e-05, "loss": 0.0266, "step": 5210 }, { "epoch": 4.040247678018575, "grad_norm": 0.039619531482458115, "learning_rate": 1.1925696594427245e-05, "loss": 0.5284, "step": 5220 }, { "epoch": 4.047987616099071, "grad_norm": 0.04173695296049118, "learning_rate": 1.1910216718266254e-05, "loss": 0.2846, "step": 5230 }, { "epoch": 4.055727554179566, "grad_norm": 27.401147842407227, "learning_rate": 1.1894736842105264e-05, "loss": 0.5535, "step": 5240 }, { "epoch": 4.063467492260062, "grad_norm": 2.5797548294067383, "learning_rate": 1.1879256965944272e-05, "loss": 0.5543, "step": 5250 }, { "epoch": 4.071207430340557, "grad_norm": 0.037547528743743896, "learning_rate": 1.1863777089783284e-05, "loss": 0.0111, "step": 5260 }, { "epoch": 4.078947368421052, "grad_norm": 0.10080958902835846, "learning_rate": 1.1848297213622294e-05, "loss": 0.0622, "step": 5270 }, { "epoch": 4.086687306501548, "grad_norm": 0.032087188214063644, "learning_rate": 1.1832817337461302e-05, "loss": 0.9024, "step": 5280 }, { "epoch": 4.094427244582043, "grad_norm": 0.03558613359928131, "learning_rate": 1.1817337461300312e-05, "loss": 1.0445, "step": 5290 }, { "epoch": 4.102167182662539, "grad_norm": 21.2044734954834, "learning_rate": 1.180185758513932e-05, "loss": 0.5232, "step": 5300 }, { "epoch": 4.109907120743034, "grad_norm": 85.69635009765625, "learning_rate": 1.1786377708978329e-05, "loss": 0.6963, "step": 5310 }, { "epoch": 4.117647058823529, "grad_norm": 0.033341363072395325, "learning_rate": 1.1770897832817339e-05, "loss": 0.0511, "step": 5320 }, { "epoch": 4.125386996904025, "grad_norm": 0.09053324162960052, "learning_rate": 1.1755417956656347e-05, "loss": 0.5567, "step": 5330 }, { "epoch": 4.13312693498452, "grad_norm": 85.49312591552734, "learning_rate": 
1.1739938080495357e-05, "loss": 0.6859, "step": 5340 }, { "epoch": 4.140866873065016, "grad_norm": 0.05283334106206894, "learning_rate": 1.1724458204334365e-05, "loss": 0.0604, "step": 5350 }, { "epoch": 4.148606811145511, "grad_norm": 144.27883911132812, "learning_rate": 1.1708978328173375e-05, "loss": 0.3717, "step": 5360 }, { "epoch": 4.156346749226006, "grad_norm": 0.03461809083819389, "learning_rate": 1.1693498452012383e-05, "loss": 0.3953, "step": 5370 }, { "epoch": 4.164086687306502, "grad_norm": 1.8080397844314575, "learning_rate": 1.1678018575851393e-05, "loss": 0.0138, "step": 5380 }, { "epoch": 4.171826625386997, "grad_norm": 0.935083270072937, "learning_rate": 1.1662538699690405e-05, "loss": 0.3896, "step": 5390 }, { "epoch": 4.179566563467493, "grad_norm": 23.78544807434082, "learning_rate": 1.1647058823529413e-05, "loss": 0.776, "step": 5400 }, { "epoch": 4.187306501547988, "grad_norm": 69.31302642822266, "learning_rate": 1.1631578947368423e-05, "loss": 0.3748, "step": 5410 }, { "epoch": 4.195046439628483, "grad_norm": 0.023863522335886955, "learning_rate": 1.1616099071207432e-05, "loss": 0.8235, "step": 5420 }, { "epoch": 4.2027863777089784, "grad_norm": 0.03563246876001358, "learning_rate": 1.160061919504644e-05, "loss": 0.4829, "step": 5430 }, { "epoch": 4.2105263157894735, "grad_norm": 0.04414466395974159, "learning_rate": 1.158513931888545e-05, "loss": 1.1075, "step": 5440 }, { "epoch": 4.218266253869969, "grad_norm": 0.02487238496541977, "learning_rate": 1.1569659442724458e-05, "loss": 0.6934, "step": 5450 }, { "epoch": 4.226006191950464, "grad_norm": 0.01940145343542099, "learning_rate": 1.1554179566563468e-05, "loss": 0.478, "step": 5460 }, { "epoch": 4.233746130030959, "grad_norm": 0.03658424690365791, "learning_rate": 1.1538699690402477e-05, "loss": 0.0379, "step": 5470 }, { "epoch": 4.241486068111455, "grad_norm": 70.93460083007812, "learning_rate": 1.1523219814241487e-05, "loss": 0.8949, "step": 5480 }, { "epoch": 4.24922600619195, 
"grad_norm": 0.21980462968349457, "learning_rate": 1.1507739938080495e-05, "loss": 0.5011, "step": 5490 }, { "epoch": 4.256965944272446, "grad_norm": 8.605793952941895, "learning_rate": 1.1492260061919505e-05, "loss": 1.4279, "step": 5500 }, { "epoch": 4.264705882352941, "grad_norm": 0.016837310045957565, "learning_rate": 1.1476780185758517e-05, "loss": 1.1886, "step": 5510 }, { "epoch": 4.272445820433436, "grad_norm": 0.14988501369953156, "learning_rate": 1.1461300309597525e-05, "loss": 1.5584, "step": 5520 }, { "epoch": 4.280185758513932, "grad_norm": 0.05577724054455757, "learning_rate": 1.1445820433436533e-05, "loss": 0.6809, "step": 5530 }, { "epoch": 4.287925696594427, "grad_norm": 0.03546776622533798, "learning_rate": 1.1430340557275543e-05, "loss": 0.0124, "step": 5540 }, { "epoch": 4.295665634674923, "grad_norm": 1.6593599319458008, "learning_rate": 1.1414860681114552e-05, "loss": 0.1384, "step": 5550 }, { "epoch": 4.303405572755418, "grad_norm": 1.9435834884643555, "learning_rate": 1.1399380804953562e-05, "loss": 0.4613, "step": 5560 }, { "epoch": 4.311145510835913, "grad_norm": 23.787334442138672, "learning_rate": 1.138390092879257e-05, "loss": 0.7504, "step": 5570 }, { "epoch": 4.318885448916409, "grad_norm": 0.04316208139061928, "learning_rate": 1.136842105263158e-05, "loss": 0.2828, "step": 5580 }, { "epoch": 4.326625386996904, "grad_norm": 0.0201573483645916, "learning_rate": 1.1352941176470588e-05, "loss": 1.3077, "step": 5590 }, { "epoch": 4.3343653250774, "grad_norm": 0.11980602145195007, "learning_rate": 1.1337461300309598e-05, "loss": 0.4597, "step": 5600 }, { "epoch": 4.342105263157895, "grad_norm": 0.03457239270210266, "learning_rate": 1.1321981424148607e-05, "loss": 0.7496, "step": 5610 }, { "epoch": 4.34984520123839, "grad_norm": 0.040638796985149384, "learning_rate": 1.1306501547987617e-05, "loss": 0.352, "step": 5620 }, { "epoch": 4.357585139318886, "grad_norm": 29.464889526367188, "learning_rate": 1.1291021671826625e-05, "loss": 1.0502, 
"step": 5630 }, { "epoch": 4.365325077399381, "grad_norm": 0.049538224935531616, "learning_rate": 1.1275541795665637e-05, "loss": 0.5088, "step": 5640 }, { "epoch": 4.3730650154798765, "grad_norm": 0.03715967386960983, "learning_rate": 1.1260061919504645e-05, "loss": 0.1493, "step": 5650 }, { "epoch": 4.3808049535603715, "grad_norm": 0.043973058462142944, "learning_rate": 1.1244582043343655e-05, "loss": 0.8111, "step": 5660 }, { "epoch": 4.3885448916408665, "grad_norm": 88.6701431274414, "learning_rate": 1.1229102167182663e-05, "loss": 0.6645, "step": 5670 }, { "epoch": 4.396284829721362, "grad_norm": 0.11968568712472916, "learning_rate": 1.1213622291021673e-05, "loss": 0.0106, "step": 5680 }, { "epoch": 4.404024767801857, "grad_norm": 0.033197708427906036, "learning_rate": 1.1198142414860682e-05, "loss": 0.0223, "step": 5690 }, { "epoch": 4.411764705882353, "grad_norm": 0.047147709876298904, "learning_rate": 1.1182662538699692e-05, "loss": 0.3239, "step": 5700 }, { "epoch": 4.419504643962848, "grad_norm": 10.11269474029541, "learning_rate": 1.11671826625387e-05, "loss": 1.0759, "step": 5710 }, { "epoch": 4.427244582043343, "grad_norm": 0.3081977963447571, "learning_rate": 1.115170278637771e-05, "loss": 0.7713, "step": 5720 }, { "epoch": 4.434984520123839, "grad_norm": 9.754612922668457, "learning_rate": 1.1136222910216718e-05, "loss": 1.5546, "step": 5730 }, { "epoch": 4.442724458204334, "grad_norm": 0.022936569526791573, "learning_rate": 1.1120743034055728e-05, "loss": 0.4399, "step": 5740 }, { "epoch": 4.45046439628483, "grad_norm": 2.199319362640381, "learning_rate": 1.1105263157894736e-05, "loss": 0.4875, "step": 5750 }, { "epoch": 4.458204334365325, "grad_norm": 0.3061660826206207, "learning_rate": 1.1089783281733748e-05, "loss": 0.0099, "step": 5760 }, { "epoch": 4.46594427244582, "grad_norm": 0.1751408874988556, "learning_rate": 1.1074303405572756e-05, "loss": 0.6985, "step": 5770 }, { "epoch": 4.473684210526316, "grad_norm": 0.014674928970634937, 
"learning_rate": 1.1058823529411766e-05, "loss": 1.1971, "step": 5780 }, { "epoch": 4.481424148606811, "grad_norm": 0.04052010178565979, "learning_rate": 1.1043343653250775e-05, "loss": 0.0905, "step": 5790 }, { "epoch": 4.489164086687307, "grad_norm": 0.045240990817546844, "learning_rate": 1.1027863777089785e-05, "loss": 0.72, "step": 5800 }, { "epoch": 4.496904024767802, "grad_norm": 0.022863060235977173, "learning_rate": 1.1012383900928793e-05, "loss": 0.2276, "step": 5810 }, { "epoch": 4.504643962848297, "grad_norm": 0.13357888162136078, "learning_rate": 1.0996904024767803e-05, "loss": 1.058, "step": 5820 }, { "epoch": 4.512383900928793, "grad_norm": 8.226449012756348, "learning_rate": 1.0981424148606811e-05, "loss": 0.7676, "step": 5830 }, { "epoch": 4.520123839009288, "grad_norm": 0.04828062653541565, "learning_rate": 1.0965944272445821e-05, "loss": 0.7964, "step": 5840 }, { "epoch": 4.527863777089784, "grad_norm": 0.9141461253166199, "learning_rate": 1.095046439628483e-05, "loss": 0.046, "step": 5850 }, { "epoch": 4.535603715170279, "grad_norm": 10.115195274353027, "learning_rate": 1.093498452012384e-05, "loss": 0.718, "step": 5860 }, { "epoch": 4.543343653250774, "grad_norm": 0.36758652329444885, "learning_rate": 1.0919504643962848e-05, "loss": 0.0624, "step": 5870 }, { "epoch": 4.5510835913312695, "grad_norm": 0.012661722488701344, "learning_rate": 1.0904024767801858e-05, "loss": 0.6111, "step": 5880 }, { "epoch": 4.5588235294117645, "grad_norm": 1.6662551164627075, "learning_rate": 1.0888544891640868e-05, "loss": 0.0221, "step": 5890 }, { "epoch": 4.56656346749226, "grad_norm": 0.6792493462562561, "learning_rate": 1.0873065015479878e-05, "loss": 1.2557, "step": 5900 }, { "epoch": 4.574303405572755, "grad_norm": 0.12748956680297852, "learning_rate": 1.0857585139318886e-05, "loss": 0.4236, "step": 5910 }, { "epoch": 4.58204334365325, "grad_norm": 0.05110849067568779, "learning_rate": 1.0842105263157896e-05, "loss": 0.0375, "step": 5920 }, { "epoch": 
4.589783281733746, "grad_norm": 0.16009318828582764, "learning_rate": 1.0826625386996905e-05, "loss": 0.2892, "step": 5930 }, { "epoch": 4.597523219814241, "grad_norm": 0.029583079740405083, "learning_rate": 1.0811145510835915e-05, "loss": 0.2872, "step": 5940 }, { "epoch": 4.605263157894737, "grad_norm": 0.17274527251720428, "learning_rate": 1.0795665634674923e-05, "loss": 0.6504, "step": 5950 }, { "epoch": 4.613003095975232, "grad_norm": 19.10822868347168, "learning_rate": 1.0780185758513933e-05, "loss": 1.4017, "step": 5960 }, { "epoch": 4.620743034055727, "grad_norm": 0.07249661535024643, "learning_rate": 1.0764705882352941e-05, "loss": 0.0081, "step": 5970 }, { "epoch": 4.628482972136223, "grad_norm": 0.022549929097294807, "learning_rate": 1.0749226006191951e-05, "loss": 1.3443, "step": 5980 }, { "epoch": 4.636222910216718, "grad_norm": 0.2669324576854706, "learning_rate": 1.073374613003096e-05, "loss": 0.7952, "step": 5990 }, { "epoch": 4.643962848297214, "grad_norm": 0.038402359932661057, "learning_rate": 1.071826625386997e-05, "loss": 1.0963, "step": 6000 }, { "epoch": 4.651702786377709, "grad_norm": 0.024487299844622612, "learning_rate": 1.070278637770898e-05, "loss": 1.0496, "step": 6010 }, { "epoch": 4.659442724458204, "grad_norm": 0.038668934255838394, "learning_rate": 1.068730650154799e-05, "loss": 0.6435, "step": 6020 }, { "epoch": 4.6671826625387, "grad_norm": 40.28995132446289, "learning_rate": 1.0671826625386998e-05, "loss": 1.1581, "step": 6030 }, { "epoch": 4.674922600619195, "grad_norm": 0.0357036255300045, "learning_rate": 1.0656346749226008e-05, "loss": 0.0887, "step": 6040 }, { "epoch": 4.682662538699691, "grad_norm": 0.04441540688276291, "learning_rate": 1.0640866873065016e-05, "loss": 0.4153, "step": 6050 }, { "epoch": 4.690402476780186, "grad_norm": 0.17303596436977386, "learning_rate": 1.0625386996904026e-05, "loss": 0.547, "step": 6060 }, { "epoch": 4.698142414860681, "grad_norm": 0.15647529065608978, "learning_rate": 
1.0609907120743034e-05, "loss": 0.0175, "step": 6070 }, { "epoch": 4.705882352941177, "grad_norm": 67.16046905517578, "learning_rate": 1.0594427244582045e-05, "loss": 0.4754, "step": 6080 }, { "epoch": 4.713622291021672, "grad_norm": 0.11639310419559479, "learning_rate": 1.0578947368421053e-05, "loss": 0.0091, "step": 6090 }, { "epoch": 4.7213622291021675, "grad_norm": 0.03444235399365425, "learning_rate": 1.0563467492260063e-05, "loss": 0.8087, "step": 6100 }, { "epoch": 4.7291021671826625, "grad_norm": 0.2738116383552551, "learning_rate": 1.0547987616099071e-05, "loss": 0.5292, "step": 6110 }, { "epoch": 4.7368421052631575, "grad_norm": 0.06224815919995308, "learning_rate": 1.0532507739938081e-05, "loss": 0.0098, "step": 6120 }, { "epoch": 4.744582043343653, "grad_norm": 0.03102247416973114, "learning_rate": 1.0517027863777091e-05, "loss": 0.4147, "step": 6130 }, { "epoch": 4.752321981424148, "grad_norm": 34.31550598144531, "learning_rate": 1.0501547987616101e-05, "loss": 1.5746, "step": 6140 }, { "epoch": 4.760061919504644, "grad_norm": 29.851680755615234, "learning_rate": 1.048606811145511e-05, "loss": 0.8295, "step": 6150 }, { "epoch": 4.767801857585139, "grad_norm": 0.10100807249546051, "learning_rate": 1.047058823529412e-05, "loss": 0.2769, "step": 6160 }, { "epoch": 4.775541795665634, "grad_norm": 29.823352813720703, "learning_rate": 1.0455108359133128e-05, "loss": 1.6038, "step": 6170 }, { "epoch": 4.78328173374613, "grad_norm": 0.12566882371902466, "learning_rate": 1.0439628482972138e-05, "loss": 1.2563, "step": 6180 }, { "epoch": 4.791021671826625, "grad_norm": 0.06456822156906128, "learning_rate": 1.0424148606811146e-05, "loss": 0.022, "step": 6190 }, { "epoch": 4.798761609907121, "grad_norm": 0.02737046405673027, "learning_rate": 1.0408668730650156e-05, "loss": 0.1894, "step": 6200 }, { "epoch": 4.806501547987616, "grad_norm": 0.02114146761596203, "learning_rate": 1.0393188854489164e-05, "loss": 0.1725, "step": 6210 }, { "epoch": 4.814241486068111, 
"grad_norm": 0.07709182053804398, "learning_rate": 1.0377708978328174e-05, "loss": 0.2006, "step": 6220 }, { "epoch": 4.821981424148607, "grad_norm": 16.739532470703125, "learning_rate": 1.0362229102167183e-05, "loss": 0.1989, "step": 6230 }, { "epoch": 4.829721362229102, "grad_norm": 0.024075696244835854, "learning_rate": 1.0346749226006193e-05, "loss": 0.0089, "step": 6240 }, { "epoch": 4.837461300309598, "grad_norm": 0.0897703468799591, "learning_rate": 1.0331269349845201e-05, "loss": 0.7651, "step": 6250 }, { "epoch": 4.845201238390093, "grad_norm": 0.07373488694429398, "learning_rate": 1.0315789473684213e-05, "loss": 1.0775, "step": 6260 }, { "epoch": 4.852941176470588, "grad_norm": 3.4655709266662598, "learning_rate": 1.0300309597523221e-05, "loss": 0.7153, "step": 6270 }, { "epoch": 4.860681114551084, "grad_norm": 0.02915586717426777, "learning_rate": 1.0284829721362231e-05, "loss": 0.5916, "step": 6280 }, { "epoch": 4.868421052631579, "grad_norm": 25.028549194335938, "learning_rate": 1.026934984520124e-05, "loss": 1.176, "step": 6290 }, { "epoch": 4.876160990712075, "grad_norm": 50.710845947265625, "learning_rate": 1.025386996904025e-05, "loss": 0.372, "step": 6300 }, { "epoch": 4.88390092879257, "grad_norm": 0.17811448872089386, "learning_rate": 1.0238390092879258e-05, "loss": 0.4337, "step": 6310 }, { "epoch": 4.891640866873065, "grad_norm": 0.7899062037467957, "learning_rate": 1.0222910216718268e-05, "loss": 0.0359, "step": 6320 }, { "epoch": 4.8993808049535605, "grad_norm": 0.057269543409347534, "learning_rate": 1.0207430340557276e-05, "loss": 0.4854, "step": 6330 }, { "epoch": 4.9071207430340555, "grad_norm": 40.77609634399414, "learning_rate": 1.0191950464396286e-05, "loss": 0.1279, "step": 6340 }, { "epoch": 4.914860681114551, "grad_norm": 0.031198320910334587, "learning_rate": 1.0176470588235294e-05, "loss": 0.2879, "step": 6350 }, { "epoch": 4.922600619195046, "grad_norm": 0.056804269552230835, "learning_rate": 1.0160990712074304e-05, "loss": 
1.5842, "step": 6360 }, { "epoch": 4.930340557275541, "grad_norm": 24.026704788208008, "learning_rate": 1.0145510835913313e-05, "loss": 0.6426, "step": 6370 }, { "epoch": 4.938080495356037, "grad_norm": 0.21523810923099518, "learning_rate": 1.0130030959752324e-05, "loss": 0.2442, "step": 6380 }, { "epoch": 4.945820433436532, "grad_norm": 26.67902946472168, "learning_rate": 1.0114551083591333e-05, "loss": 1.0699, "step": 6390 }, { "epoch": 4.953560371517028, "grad_norm": 0.03276975080370903, "learning_rate": 1.0099071207430343e-05, "loss": 0.1181, "step": 6400 }, { "epoch": 4.961300309597523, "grad_norm": 0.028117669746279716, "learning_rate": 1.0083591331269351e-05, "loss": 0.2968, "step": 6410 }, { "epoch": 4.969040247678018, "grad_norm": 38.684696197509766, "learning_rate": 1.0068111455108361e-05, "loss": 0.8345, "step": 6420 }, { "epoch": 4.976780185758514, "grad_norm": 41.38811492919922, "learning_rate": 1.005263157894737e-05, "loss": 0.7747, "step": 6430 }, { "epoch": 4.984520123839009, "grad_norm": 0.05249479040503502, "learning_rate": 1.003715170278638e-05, "loss": 0.284, "step": 6440 }, { "epoch": 4.992260061919505, "grad_norm": 0.038099758327007294, "learning_rate": 1.0021671826625387e-05, "loss": 0.4573, "step": 6450 }, { "epoch": 5.0, "grad_norm": 3.36181902885437, "learning_rate": 1.0006191950464397e-05, "loss": 1.0612, "step": 6460 }, { "epoch": 5.0, "eval_accuracy": 0.82, "eval_f1": 0.817712096332786, "eval_loss": 0.8156516551971436, "eval_runtime": 1.3225, "eval_samples_per_second": 37.807, "eval_steps_per_second": 37.807, "step": 6460 }, { "epoch": 5.007739938080495, "grad_norm": 10.3129243850708, "learning_rate": 9.990712074303406e-06, "loss": 1.0171, "step": 6470 }, { "epoch": 5.015479876160991, "grad_norm": 0.1671464890241623, "learning_rate": 9.975232198142416e-06, "loss": 0.6974, "step": 6480 }, { "epoch": 5.023219814241486, "grad_norm": 37.45733642578125, "learning_rate": 9.959752321981426e-06, "loss": 0.8142, "step": 6490 }, { "epoch": 
5.030959752321982, "grad_norm": 43.19553756713867, "learning_rate": 9.944272445820434e-06, "loss": 0.8863, "step": 6500 }, { "epoch": 5.038699690402477, "grad_norm": 0.06552344560623169, "learning_rate": 9.928792569659444e-06, "loss": 1.1843, "step": 6510 }, { "epoch": 5.046439628482972, "grad_norm": 0.2929050624370575, "learning_rate": 9.913312693498452e-06, "loss": 0.086, "step": 6520 }, { "epoch": 5.054179566563468, "grad_norm": 0.1430848389863968, "learning_rate": 9.897832817337462e-06, "loss": 0.0746, "step": 6530 }, { "epoch": 5.061919504643963, "grad_norm": 0.6365134716033936, "learning_rate": 9.882352941176472e-06, "loss": 0.7373, "step": 6540 }, { "epoch": 5.069659442724459, "grad_norm": 0.08395009487867355, "learning_rate": 9.86687306501548e-06, "loss": 0.567, "step": 6550 }, { "epoch": 5.077399380804954, "grad_norm": 0.02187863551080227, "learning_rate": 9.85139318885449e-06, "loss": 0.4922, "step": 6560 }, { "epoch": 5.085139318885449, "grad_norm": 0.11344428360462189, "learning_rate": 9.835913312693499e-06, "loss": 0.9952, "step": 6570 }, { "epoch": 5.0928792569659445, "grad_norm": 0.37315574288368225, "learning_rate": 9.820433436532509e-06, "loss": 0.2982, "step": 6580 }, { "epoch": 5.1006191950464395, "grad_norm": 0.08135538548231125, "learning_rate": 9.804953560371517e-06, "loss": 0.9768, "step": 6590 }, { "epoch": 5.108359133126935, "grad_norm": 0.47782257199287415, "learning_rate": 9.789473684210527e-06, "loss": 1.0762, "step": 6600 }, { "epoch": 5.11609907120743, "grad_norm": 0.054620515555143356, "learning_rate": 9.773993808049537e-06, "loss": 0.0112, "step": 6610 }, { "epoch": 5.123839009287925, "grad_norm": 0.04898601397871971, "learning_rate": 9.758513931888546e-06, "loss": 0.1815, "step": 6620 }, { "epoch": 5.131578947368421, "grad_norm": 0.06143815815448761, "learning_rate": 9.743034055727556e-06, "loss": 0.5481, "step": 6630 }, { "epoch": 5.139318885448916, "grad_norm": 10.986651420593262, "learning_rate": 9.727554179566564e-06, "loss": 
1.3822, "step": 6640 }, { "epoch": 5.147058823529412, "grad_norm": 11.385930061340332, "learning_rate": 9.712074303405572e-06, "loss": 0.3746, "step": 6650 }, { "epoch": 5.154798761609907, "grad_norm": 0.06475861370563507, "learning_rate": 9.696594427244584e-06, "loss": 0.016, "step": 6660 }, { "epoch": 5.162538699690402, "grad_norm": 0.03605762869119644, "learning_rate": 9.681114551083592e-06, "loss": 1.2274, "step": 6670 }, { "epoch": 5.170278637770898, "grad_norm": 0.06123343110084534, "learning_rate": 9.665634674922602e-06, "loss": 0.8978, "step": 6680 }, { "epoch": 5.178018575851393, "grad_norm": 0.014502284117043018, "learning_rate": 9.65015479876161e-06, "loss": 0.0927, "step": 6690 }, { "epoch": 5.185758513931889, "grad_norm": 0.0276712104678154, "learning_rate": 9.634674922600619e-06, "loss": 0.1808, "step": 6700 }, { "epoch": 5.193498452012384, "grad_norm": 1.2984869480133057, "learning_rate": 9.619195046439629e-06, "loss": 0.0118, "step": 6710 }, { "epoch": 5.201238390092879, "grad_norm": 6.035773277282715, "learning_rate": 9.603715170278639e-06, "loss": 0.0427, "step": 6720 }, { "epoch": 5.208978328173375, "grad_norm": 0.03757443651556969, "learning_rate": 9.588235294117649e-06, "loss": 0.0833, "step": 6730 }, { "epoch": 5.21671826625387, "grad_norm": 0.04457368329167366, "learning_rate": 9.572755417956657e-06, "loss": 1.0578, "step": 6740 }, { "epoch": 5.224458204334366, "grad_norm": 0.04282601177692413, "learning_rate": 9.557275541795667e-06, "loss": 0.1295, "step": 6750 }, { "epoch": 5.232198142414861, "grad_norm": 0.02639881707727909, "learning_rate": 9.541795665634676e-06, "loss": 0.8937, "step": 6760 }, { "epoch": 5.239938080495356, "grad_norm": 0.023649532347917557, "learning_rate": 9.526315789473684e-06, "loss": 0.1342, "step": 6770 }, { "epoch": 5.247678018575852, "grad_norm": 0.030982544645667076, "learning_rate": 9.510835913312694e-06, "loss": 1.6762, "step": 6780 }, { "epoch": 5.255417956656347, "grad_norm": 0.017185375094413757, 
"learning_rate": 9.495356037151704e-06, "loss": 0.1797, "step": 6790 }, { "epoch": 5.2631578947368425, "grad_norm": 0.08222652226686478, "learning_rate": 9.479876160990714e-06, "loss": 0.0171, "step": 6800 }, { "epoch": 5.2708978328173375, "grad_norm": 0.04039729759097099, "learning_rate": 9.464396284829722e-06, "loss": 0.4627, "step": 6810 }, { "epoch": 5.2786377708978325, "grad_norm": 0.5557768940925598, "learning_rate": 9.44891640866873e-06, "loss": 0.9593, "step": 6820 }, { "epoch": 5.286377708978328, "grad_norm": 0.289194792509079, "learning_rate": 9.43343653250774e-06, "loss": 0.3434, "step": 6830 }, { "epoch": 5.294117647058823, "grad_norm": 9.574075698852539, "learning_rate": 9.417956656346749e-06, "loss": 0.0384, "step": 6840 }, { "epoch": 5.301857585139319, "grad_norm": 0.0871453583240509, "learning_rate": 9.40247678018576e-06, "loss": 0.2016, "step": 6850 }, { "epoch": 5.309597523219814, "grad_norm": 0.03614587336778641, "learning_rate": 9.386996904024769e-06, "loss": 0.164, "step": 6860 }, { "epoch": 5.317337461300309, "grad_norm": 0.03367859125137329, "learning_rate": 9.371517027863777e-06, "loss": 0.6166, "step": 6870 }, { "epoch": 5.325077399380805, "grad_norm": 0.6445887684822083, "learning_rate": 9.356037151702787e-06, "loss": 0.0476, "step": 6880 }, { "epoch": 5.3328173374613, "grad_norm": 0.02883070893585682, "learning_rate": 9.340557275541795e-06, "loss": 0.0096, "step": 6890 }, { "epoch": 5.340557275541796, "grad_norm": 0.03978569433093071, "learning_rate": 9.325077399380805e-06, "loss": 0.7187, "step": 6900 }, { "epoch": 5.348297213622291, "grad_norm": 0.5020480155944824, "learning_rate": 9.309597523219815e-06, "loss": 0.1887, "step": 6910 }, { "epoch": 5.356037151702786, "grad_norm": 0.23880119621753693, "learning_rate": 9.294117647058824e-06, "loss": 1.6212, "step": 6920 }, { "epoch": 5.363777089783282, "grad_norm": 0.07249978184700012, "learning_rate": 9.278637770897834e-06, "loss": 0.283, "step": 6930 }, { "epoch": 5.371517027863777, 
"grad_norm": 0.039158158004283905, "learning_rate": 9.263157894736842e-06, "loss": 0.4894, "step": 6940 }, { "epoch": 5.379256965944273, "grad_norm": 0.04014299437403679, "learning_rate": 9.249226006191952e-06, "loss": 0.5263, "step": 6950 }, { "epoch": 5.386996904024768, "grad_norm": 0.06752253323793411, "learning_rate": 9.23374613003096e-06, "loss": 1.3858, "step": 6960 }, { "epoch": 5.394736842105263, "grad_norm": 0.05008096620440483, "learning_rate": 9.21826625386997e-06, "loss": 0.4359, "step": 6970 }, { "epoch": 5.402476780185759, "grad_norm": 0.049045976251363754, "learning_rate": 9.202786377708979e-06, "loss": 0.7796, "step": 6980 }, { "epoch": 5.410216718266254, "grad_norm": 0.11052145808935165, "learning_rate": 9.187306501547989e-06, "loss": 0.5549, "step": 6990 }, { "epoch": 5.41795665634675, "grad_norm": 118.4418716430664, "learning_rate": 9.171826625386999e-06, "loss": 0.8204, "step": 7000 }, { "epoch": 5.425696594427245, "grad_norm": 0.6176676154136658, "learning_rate": 9.156346749226007e-06, "loss": 0.6642, "step": 7010 }, { "epoch": 5.43343653250774, "grad_norm": 81.53767395019531, "learning_rate": 9.140866873065017e-06, "loss": 1.466, "step": 7020 }, { "epoch": 5.4411764705882355, "grad_norm": 0.05234875902533531, "learning_rate": 9.125386996904025e-06, "loss": 1.7874, "step": 7030 }, { "epoch": 5.4489164086687305, "grad_norm": 0.023949066177010536, "learning_rate": 9.109907120743035e-06, "loss": 0.0155, "step": 7040 }, { "epoch": 5.456656346749226, "grad_norm": 0.03363420069217682, "learning_rate": 9.094427244582044e-06, "loss": 0.5064, "step": 7050 }, { "epoch": 5.464396284829721, "grad_norm": 0.04458250105381012, "learning_rate": 9.078947368421054e-06, "loss": 0.4344, "step": 7060 }, { "epoch": 5.472136222910216, "grad_norm": 2.157409906387329, "learning_rate": 9.063467492260064e-06, "loss": 0.3105, "step": 7070 }, { "epoch": 5.479876160990712, "grad_norm": 0.033306073397397995, "learning_rate": 9.047987616099072e-06, "loss": 0.4501, "step": 
7080 }, { "epoch": 5.487616099071207, "grad_norm": 0.17406100034713745, "learning_rate": 9.032507739938082e-06, "loss": 0.3477, "step": 7090 }, { "epoch": 5.495356037151703, "grad_norm": 4.555756568908691, "learning_rate": 9.01702786377709e-06, "loss": 0.024, "step": 7100 }, { "epoch": 5.503095975232198, "grad_norm": 0.1760099232196808, "learning_rate": 9.001547987616099e-06, "loss": 0.7186, "step": 7110 }, { "epoch": 5.510835913312693, "grad_norm": 2.6890175342559814, "learning_rate": 8.98606811145511e-06, "loss": 0.7622, "step": 7120 }, { "epoch": 5.518575851393189, "grad_norm": 0.05589776486158371, "learning_rate": 8.970588235294119e-06, "loss": 0.0896, "step": 7130 }, { "epoch": 5.526315789473684, "grad_norm": 0.03822312876582146, "learning_rate": 8.955108359133129e-06, "loss": 0.2354, "step": 7140 }, { "epoch": 5.534055727554179, "grad_norm": 0.04057033732533455, "learning_rate": 8.939628482972137e-06, "loss": 0.0292, "step": 7150 }, { "epoch": 5.541795665634675, "grad_norm": 0.07189463824033737, "learning_rate": 8.924148606811147e-06, "loss": 0.9391, "step": 7160 }, { "epoch": 5.54953560371517, "grad_norm": 0.04728087782859802, "learning_rate": 8.908668730650155e-06, "loss": 0.2709, "step": 7170 }, { "epoch": 5.557275541795666, "grad_norm": 0.10525445640087128, "learning_rate": 8.893188854489165e-06, "loss": 0.2316, "step": 7180 }, { "epoch": 5.565015479876161, "grad_norm": 0.4815293252468109, "learning_rate": 8.877708978328175e-06, "loss": 0.322, "step": 7190 }, { "epoch": 5.572755417956657, "grad_norm": 9.037779808044434, "learning_rate": 8.862229102167183e-06, "loss": 1.0722, "step": 7200 }, { "epoch": 5.580495356037152, "grad_norm": 0.15179972350597382, "learning_rate": 8.846749226006193e-06, "loss": 0.6824, "step": 7210 }, { "epoch": 5.588235294117647, "grad_norm": 0.02039916440844536, "learning_rate": 8.831269349845202e-06, "loss": 0.6267, "step": 7220 }, { "epoch": 5.595975232198143, "grad_norm": 0.04403864964842796, "learning_rate": 
8.81578947368421e-06, "loss": 0.1246, "step": 7230 }, { "epoch": 5.603715170278638, "grad_norm": 0.03592114523053169, "learning_rate": 8.80030959752322e-06, "loss": 1.6872, "step": 7240 }, { "epoch": 5.611455108359133, "grad_norm": 0.07168206572532654, "learning_rate": 8.78482972136223e-06, "loss": 0.0484, "step": 7250 }, { "epoch": 5.6191950464396285, "grad_norm": 0.2258443981409073, "learning_rate": 8.76934984520124e-06, "loss": 0.0074, "step": 7260 }, { "epoch": 5.6269349845201235, "grad_norm": 35.460472106933594, "learning_rate": 8.753869969040248e-06, "loss": 1.1906, "step": 7270 }, { "epoch": 5.634674922600619, "grad_norm": 0.01941072568297386, "learning_rate": 8.738390092879257e-06, "loss": 1.0085, "step": 7280 }, { "epoch": 5.642414860681114, "grad_norm": 15.37927532196045, "learning_rate": 8.722910216718267e-06, "loss": 0.0431, "step": 7290 }, { "epoch": 5.65015479876161, "grad_norm": 0.03524412959814072, "learning_rate": 8.707430340557275e-06, "loss": 0.2264, "step": 7300 }, { "epoch": 5.657894736842105, "grad_norm": 0.05594306066632271, "learning_rate": 8.691950464396287e-06, "loss": 0.0221, "step": 7310 }, { "epoch": 5.6656346749226, "grad_norm": 47.52218246459961, "learning_rate": 8.676470588235295e-06, "loss": 0.5628, "step": 7320 }, { "epoch": 5.673374613003096, "grad_norm": 0.02925911359488964, "learning_rate": 8.660990712074303e-06, "loss": 0.3226, "step": 7330 }, { "epoch": 5.681114551083591, "grad_norm": 1.5460282564163208, "learning_rate": 8.645510835913313e-06, "loss": 0.0413, "step": 7340 }, { "epoch": 5.688854489164086, "grad_norm": 0.03300033509731293, "learning_rate": 8.630030959752322e-06, "loss": 0.6455, "step": 7350 }, { "epoch": 5.696594427244582, "grad_norm": 0.10905322432518005, "learning_rate": 8.614551083591332e-06, "loss": 0.076, "step": 7360 }, { "epoch": 5.704334365325077, "grad_norm": 0.03878280147910118, "learning_rate": 8.599071207430342e-06, "loss": 0.5817, "step": 7370 }, { "epoch": 5.712074303405573, "grad_norm": 
0.74409419298172, "learning_rate": 8.583591331269352e-06, "loss": 0.4119, "step": 7380 }, { "epoch": 5.719814241486068, "grad_norm": 0.03640138357877731, "learning_rate": 8.56811145510836e-06, "loss": 0.4941, "step": 7390 }, { "epoch": 5.727554179566564, "grad_norm": 0.037649523466825485, "learning_rate": 8.552631578947368e-06, "loss": 0.3802, "step": 7400 }, { "epoch": 5.735294117647059, "grad_norm": 1.30524742603302, "learning_rate": 8.537151702786378e-06, "loss": 0.9587, "step": 7410 }, { "epoch": 5.743034055727554, "grad_norm": 0.11640552431344986, "learning_rate": 8.521671826625387e-06, "loss": 0.4949, "step": 7420 }, { "epoch": 5.75077399380805, "grad_norm": 0.062201518565416336, "learning_rate": 8.506191950464398e-06, "loss": 0.4555, "step": 7430 }, { "epoch": 5.758513931888545, "grad_norm": 28.5511531829834, "learning_rate": 8.490712074303407e-06, "loss": 0.4528, "step": 7440 }, { "epoch": 5.76625386996904, "grad_norm": 43.70050811767578, "learning_rate": 8.475232198142415e-06, "loss": 1.036, "step": 7450 }, { "epoch": 5.773993808049536, "grad_norm": 23.438291549682617, "learning_rate": 8.459752321981425e-06, "loss": 0.7974, "step": 7460 }, { "epoch": 5.781733746130031, "grad_norm": 0.03143590688705444, "learning_rate": 8.444272445820433e-06, "loss": 0.8049, "step": 7470 }, { "epoch": 5.7894736842105265, "grad_norm": 0.026820233091711998, "learning_rate": 8.428792569659443e-06, "loss": 0.3405, "step": 7480 }, { "epoch": 5.7972136222910216, "grad_norm": 0.033795323222875595, "learning_rate": 8.413312693498453e-06, "loss": 1.1518, "step": 7490 }, { "epoch": 5.804953560371517, "grad_norm": 69.6653060913086, "learning_rate": 8.397832817337462e-06, "loss": 0.7706, "step": 7500 }, { "epoch": 5.812693498452012, "grad_norm": 0.13078753650188446, "learning_rate": 8.382352941176472e-06, "loss": 0.9108, "step": 7510 }, { "epoch": 5.820433436532507, "grad_norm": 0.025900380685925484, "learning_rate": 8.36687306501548e-06, "loss": 0.1567, "step": 7520 }, { "epoch": 
5.828173374613003, "grad_norm": 0.01936858333647251, "learning_rate": 8.35139318885449e-06, "loss": 0.0079, "step": 7530 }, { "epoch": 5.835913312693498, "grad_norm": 0.07614408433437347, "learning_rate": 8.335913312693498e-06, "loss": 0.463, "step": 7540 }, { "epoch": 5.843653250773993, "grad_norm": 0.3218093514442444, "learning_rate": 8.320433436532508e-06, "loss": 0.228, "step": 7550 }, { "epoch": 5.851393188854489, "grad_norm": 81.01888275146484, "learning_rate": 8.304953560371518e-06, "loss": 0.8016, "step": 7560 }, { "epoch": 5.859133126934984, "grad_norm": 0.0149591825902462, "learning_rate": 8.289473684210526e-06, "loss": 0.3493, "step": 7570 }, { "epoch": 5.86687306501548, "grad_norm": 0.06928674876689911, "learning_rate": 8.273993808049536e-06, "loss": 0.8052, "step": 7580 }, { "epoch": 5.874613003095975, "grad_norm": 2.6595444679260254, "learning_rate": 8.258513931888545e-06, "loss": 0.108, "step": 7590 }, { "epoch": 5.882352941176471, "grad_norm": 0.6622132658958435, "learning_rate": 8.243034055727555e-06, "loss": 0.016, "step": 7600 }, { "epoch": 5.890092879256966, "grad_norm": 0.02797524817287922, "learning_rate": 8.227554179566563e-06, "loss": 0.3307, "step": 7610 }, { "epoch": 5.897832817337461, "grad_norm": 0.015555012971162796, "learning_rate": 8.212074303405573e-06, "loss": 0.4981, "step": 7620 }, { "epoch": 5.905572755417957, "grad_norm": 0.030535627156496048, "learning_rate": 8.196594427244583e-06, "loss": 0.9077, "step": 7630 }, { "epoch": 5.913312693498452, "grad_norm": 0.021094361320137978, "learning_rate": 8.181114551083591e-06, "loss": 0.6739, "step": 7640 }, { "epoch": 5.921052631578947, "grad_norm": 0.05732201412320137, "learning_rate": 8.165634674922601e-06, "loss": 0.0236, "step": 7650 }, { "epoch": 5.928792569659443, "grad_norm": 0.07169334590435028, "learning_rate": 8.15015479876161e-06, "loss": 0.7167, "step": 7660 }, { "epoch": 5.936532507739938, "grad_norm": 9.617886543273926, "learning_rate": 8.13467492260062e-06, "loss": 0.905, 
"step": 7670 }, { "epoch": 5.944272445820434, "grad_norm": 0.3554931879043579, "learning_rate": 8.11919504643963e-06, "loss": 0.497, "step": 7680 }, { "epoch": 5.952012383900929, "grad_norm": 89.34410858154297, "learning_rate": 8.103715170278638e-06, "loss": 0.8098, "step": 7690 }, { "epoch": 5.959752321981425, "grad_norm": 0.34099361300468445, "learning_rate": 8.088235294117648e-06, "loss": 0.5604, "step": 7700 }, { "epoch": 5.96749226006192, "grad_norm": 0.028241179883480072, "learning_rate": 8.072755417956656e-06, "loss": 0.0108, "step": 7710 }, { "epoch": 5.975232198142415, "grad_norm": 0.03413006290793419, "learning_rate": 8.057275541795666e-06, "loss": 0.9646, "step": 7720 }, { "epoch": 5.9829721362229105, "grad_norm": 0.49321335554122925, "learning_rate": 8.041795665634675e-06, "loss": 0.0637, "step": 7730 }, { "epoch": 5.9907120743034055, "grad_norm": 0.02558736875653267, "learning_rate": 8.026315789473685e-06, "loss": 0.4093, "step": 7740 }, { "epoch": 5.9984520123839005, "grad_norm": 25.80232810974121, "learning_rate": 8.010835913312695e-06, "loss": 0.4589, "step": 7750 }, { "epoch": 6.0, "eval_accuracy": 0.84, "eval_f1": 0.8386100386100387, "eval_loss": 0.7758023738861084, "eval_runtime": 1.3174, "eval_samples_per_second": 37.955, "eval_steps_per_second": 37.955, "step": 7752 }, { "epoch": 6.006191950464396, "grad_norm": 0.05321564897894859, "learning_rate": 7.995356037151703e-06, "loss": 0.4591, "step": 7760 }, { "epoch": 6.013931888544891, "grad_norm": 0.011337725445628166, "learning_rate": 7.979876160990713e-06, "loss": 0.5767, "step": 7770 }, { "epoch": 6.021671826625387, "grad_norm": 55.26630783081055, "learning_rate": 7.964396284829721e-06, "loss": 0.3219, "step": 7780 }, { "epoch": 6.029411764705882, "grad_norm": 0.033986181020736694, "learning_rate": 7.948916408668731e-06, "loss": 0.4561, "step": 7790 }, { "epoch": 6.037151702786378, "grad_norm": 37.220947265625, "learning_rate": 7.93343653250774e-06, "loss": 0.4386, "step": 7800 }, { "epoch": 
6.044891640866873, "grad_norm": 0.4084802567958832, "learning_rate": 7.91795665634675e-06, "loss": 0.1413, "step": 7810 }, { "epoch": 6.052631578947368, "grad_norm": 0.03546377271413803, "learning_rate": 7.90247678018576e-06, "loss": 0.0075, "step": 7820 }, { "epoch": 6.060371517027864, "grad_norm": 0.023248814046382904, "learning_rate": 7.886996904024768e-06, "loss": 0.3886, "step": 7830 }, { "epoch": 6.068111455108359, "grad_norm": 20.20774269104004, "learning_rate": 7.871517027863778e-06, "loss": 0.8309, "step": 7840 }, { "epoch": 6.075851393188855, "grad_norm": 0.02238421142101288, "learning_rate": 7.856037151702786e-06, "loss": 0.895, "step": 7850 }, { "epoch": 6.08359133126935, "grad_norm": 0.03560001775622368, "learning_rate": 7.840557275541796e-06, "loss": 0.0115, "step": 7860 }, { "epoch": 6.091331269349845, "grad_norm": 0.0887119472026825, "learning_rate": 7.825077399380806e-06, "loss": 0.0094, "step": 7870 }, { "epoch": 6.099071207430341, "grad_norm": 27.18391990661621, "learning_rate": 7.809597523219815e-06, "loss": 0.8418, "step": 7880 }, { "epoch": 6.106811145510836, "grad_norm": 28.69709587097168, "learning_rate": 7.794117647058825e-06, "loss": 0.6666, "step": 7890 }, { "epoch": 6.114551083591332, "grad_norm": 0.28389397263526917, "learning_rate": 7.778637770897833e-06, "loss": 0.2675, "step": 7900 }, { "epoch": 6.122291021671827, "grad_norm": 0.1673971265554428, "learning_rate": 7.763157894736843e-06, "loss": 0.0922, "step": 7910 }, { "epoch": 6.130030959752322, "grad_norm": 0.05517880246043205, "learning_rate": 7.747678018575851e-06, "loss": 0.9095, "step": 7920 }, { "epoch": 6.137770897832818, "grad_norm": 10.325530052185059, "learning_rate": 7.732198142414861e-06, "loss": 0.8915, "step": 7930 }, { "epoch": 6.145510835913313, "grad_norm": 0.04550738260149956, "learning_rate": 7.716718266253871e-06, "loss": 0.3955, "step": 7940 }, { "epoch": 6.153250773993808, "grad_norm": 1.3613225221633911, "learning_rate": 7.70123839009288e-06, "loss": 0.0551, 
"step": 7950 }, { "epoch": 6.1609907120743035, "grad_norm": 0.15682007372379303, "learning_rate": 7.68575851393189e-06, "loss": 0.2452, "step": 7960 }, { "epoch": 6.1687306501547985, "grad_norm": 0.17441047728061676, "learning_rate": 7.670278637770898e-06, "loss": 0.0076, "step": 7970 }, { "epoch": 6.176470588235294, "grad_norm": 0.014211468398571014, "learning_rate": 7.654798761609908e-06, "loss": 0.2265, "step": 7980 }, { "epoch": 6.184210526315789, "grad_norm": 0.025167670100927353, "learning_rate": 7.639318885448918e-06, "loss": 0.1718, "step": 7990 }, { "epoch": 6.191950464396285, "grad_norm": 0.02954765595495701, "learning_rate": 7.623839009287927e-06, "loss": 0.937, "step": 8000 }, { "epoch": 6.19969040247678, "grad_norm": 0.020528504624962807, "learning_rate": 7.608359133126936e-06, "loss": 0.2188, "step": 8010 }, { "epoch": 6.207430340557275, "grad_norm": 0.05648907274007797, "learning_rate": 7.592879256965945e-06, "loss": 0.6138, "step": 8020 }, { "epoch": 6.215170278637771, "grad_norm": 0.05865255743265152, "learning_rate": 7.5773993808049536e-06, "loss": 1.2876, "step": 8030 }, { "epoch": 6.222910216718266, "grad_norm": 7.47597599029541, "learning_rate": 7.561919504643963e-06, "loss": 0.7536, "step": 8040 }, { "epoch": 6.230650154798761, "grad_norm": 0.07647200673818588, "learning_rate": 7.5464396284829736e-06, "loss": 0.6844, "step": 8050 }, { "epoch": 6.238390092879257, "grad_norm": 7.578645706176758, "learning_rate": 7.530959752321983e-06, "loss": 1.2935, "step": 8060 }, { "epoch": 6.246130030959752, "grad_norm": 0.03884744644165039, "learning_rate": 7.515479876160992e-06, "loss": 0.5505, "step": 8070 }, { "epoch": 6.253869969040248, "grad_norm": 63.417232513427734, "learning_rate": 7.500000000000001e-06, "loss": 1.2129, "step": 8080 }, { "epoch": 6.261609907120743, "grad_norm": 0.02632290869951248, "learning_rate": 7.484520123839009e-06, "loss": 0.0359, "step": 8090 }, { "epoch": 6.269349845201239, "grad_norm": 0.04231875017285347, "learning_rate": 
7.4690402476780185e-06, "loss": 0.5081, "step": 8100 }, { "epoch": 6.277089783281734, "grad_norm": 10.248048782348633, "learning_rate": 7.453560371517028e-06, "loss": 1.1047, "step": 8110 }, { "epoch": 6.284829721362229, "grad_norm": 0.025162868201732635, "learning_rate": 7.4380804953560385e-06, "loss": 0.3609, "step": 8120 }, { "epoch": 6.292569659442725, "grad_norm": 0.06309065222740173, "learning_rate": 7.422600619195048e-06, "loss": 0.0368, "step": 8130 }, { "epoch": 6.30030959752322, "grad_norm": 18.042043685913086, "learning_rate": 7.407120743034056e-06, "loss": 0.0452, "step": 8140 }, { "epoch": 6.308049535603715, "grad_norm": 0.048326391726732254, "learning_rate": 7.391640866873065e-06, "loss": 0.4325, "step": 8150 }, { "epoch": 6.315789473684211, "grad_norm": 0.8153269290924072, "learning_rate": 7.376160990712074e-06, "loss": 0.6428, "step": 8160 }, { "epoch": 6.323529411764706, "grad_norm": 0.06932692974805832, "learning_rate": 7.3606811145510834e-06, "loss": 1.1045, "step": 8170 }, { "epoch": 6.3312693498452015, "grad_norm": 0.014919210225343704, "learning_rate": 7.345201238390094e-06, "loss": 0.2637, "step": 8180 }, { "epoch": 6.3390092879256965, "grad_norm": 0.0241678636521101, "learning_rate": 7.3297213622291034e-06, "loss": 0.0272, "step": 8190 }, { "epoch": 6.346749226006192, "grad_norm": 0.03343890607357025, "learning_rate": 7.314241486068112e-06, "loss": 1.0674, "step": 8200 }, { "epoch": 6.354489164086687, "grad_norm": 0.014633177779614925, "learning_rate": 7.298761609907121e-06, "loss": 1.001, "step": 8210 }, { "epoch": 6.362229102167182, "grad_norm": 0.06903253495693207, "learning_rate": 7.28328173374613e-06, "loss": 0.1546, "step": 8220 }, { "epoch": 6.369969040247678, "grad_norm": 0.09170027077198029, "learning_rate": 7.267801857585139e-06, "loss": 0.6324, "step": 8230 }, { "epoch": 6.377708978328173, "grad_norm": 0.027656368911266327, "learning_rate": 7.25232198142415e-06, "loss": 0.3448, "step": 8240 }, { "epoch": 6.385448916408668, 
"grad_norm": 1.1068994998931885, "learning_rate": 7.236842105263158e-06, "loss": 0.2811, "step": 8250 }, { "epoch": 6.393188854489164, "grad_norm": 0.02312501333653927, "learning_rate": 7.2213622291021675e-06, "loss": 0.0728, "step": 8260 }, { "epoch": 6.400928792569659, "grad_norm": 17.569738388061523, "learning_rate": 7.205882352941177e-06, "loss": 1.2367, "step": 8270 }, { "epoch": 6.408668730650155, "grad_norm": 0.01699947752058506, "learning_rate": 7.190402476780186e-06, "loss": 0.0826, "step": 8280 }, { "epoch": 6.41640866873065, "grad_norm": 15.992704391479492, "learning_rate": 7.174922600619195e-06, "loss": 0.6937, "step": 8290 }, { "epoch": 6.424148606811146, "grad_norm": 2.9142398834228516, "learning_rate": 7.159442724458206e-06, "loss": 0.0385, "step": 8300 }, { "epoch": 6.431888544891641, "grad_norm": 0.017890112474560738, "learning_rate": 7.143962848297214e-06, "loss": 0.2427, "step": 8310 }, { "epoch": 6.439628482972136, "grad_norm": 0.02168497070670128, "learning_rate": 7.128482972136223e-06, "loss": 0.5588, "step": 8320 }, { "epoch": 6.447368421052632, "grad_norm": 0.06378014385700226, "learning_rate": 7.1130030959752325e-06, "loss": 0.251, "step": 8330 }, { "epoch": 6.455108359133127, "grad_norm": 0.46395790576934814, "learning_rate": 7.097523219814242e-06, "loss": 0.5745, "step": 8340 }, { "epoch": 6.462848297213622, "grad_norm": 0.06932064145803452, "learning_rate": 7.082043343653251e-06, "loss": 1.0122, "step": 8350 }, { "epoch": 6.470588235294118, "grad_norm": 0.8538666367530823, "learning_rate": 7.066563467492261e-06, "loss": 0.5598, "step": 8360 }, { "epoch": 6.478328173374613, "grad_norm": 0.04000185430049896, "learning_rate": 7.05108359133127e-06, "loss": 0.7076, "step": 8370 }, { "epoch": 6.486068111455109, "grad_norm": 1.4221463203430176, "learning_rate": 7.035603715170279e-06, "loss": 0.7204, "step": 8380 }, { "epoch": 6.493808049535604, "grad_norm": 76.32229614257812, "learning_rate": 7.020123839009288e-06, "loss": 0.6399, "step": 8390 
}, { "epoch": 6.5015479876160995, "grad_norm": 0.0218702033162117, "learning_rate": 7.004643962848297e-06, "loss": 1.3044, "step": 8400 }, { "epoch": 6.5092879256965945, "grad_norm": 0.05823648348450661, "learning_rate": 6.9891640866873066e-06, "loss": 0.5163, "step": 8410 }, { "epoch": 6.5170278637770895, "grad_norm": 0.022042028605937958, "learning_rate": 6.973684210526316e-06, "loss": 0.5662, "step": 8420 }, { "epoch": 6.524767801857585, "grad_norm": 0.056647755205631256, "learning_rate": 6.958204334365326e-06, "loss": 0.4895, "step": 8430 }, { "epoch": 6.53250773993808, "grad_norm": 0.02915012091398239, "learning_rate": 6.942724458204335e-06, "loss": 0.7852, "step": 8440 }, { "epoch": 6.540247678018575, "grad_norm": 0.8901259899139404, "learning_rate": 6.927244582043344e-06, "loss": 0.4311, "step": 8450 }, { "epoch": 6.547987616099071, "grad_norm": 0.10570290684700012, "learning_rate": 6.911764705882353e-06, "loss": 0.0094, "step": 8460 }, { "epoch": 6.555727554179566, "grad_norm": 0.02990216389298439, "learning_rate": 6.896284829721362e-06, "loss": 0.3964, "step": 8470 }, { "epoch": 6.563467492260062, "grad_norm": 0.04897937551140785, "learning_rate": 6.8808049535603715e-06, "loss": 0.995, "step": 8480 }, { "epoch": 6.571207430340557, "grad_norm": 0.15522898733615875, "learning_rate": 6.8653250773993815e-06, "loss": 0.7436, "step": 8490 }, { "epoch": 6.578947368421053, "grad_norm": 3.2360148429870605, "learning_rate": 6.849845201238391e-06, "loss": 0.0254, "step": 8500 }, { "epoch": 6.586687306501548, "grad_norm": 0.04028725624084473, "learning_rate": 6.8343653250774e-06, "loss": 0.0115, "step": 8510 }, { "epoch": 6.594427244582043, "grad_norm": 0.44617602229118347, "learning_rate": 6.818885448916409e-06, "loss": 1.2386, "step": 8520 }, { "epoch": 6.602167182662539, "grad_norm": 0.0869651511311531, "learning_rate": 6.803405572755418e-06, "loss": 0.6177, "step": 8530 }, { "epoch": 6.609907120743034, "grad_norm": 0.058694057166576385, "learning_rate": 
6.787925696594427e-06, "loss": 1.2809, "step": 8540 }, { "epoch": 6.617647058823529, "grad_norm": 0.04547760263085365, "learning_rate": 6.772445820433437e-06, "loss": 0.4662, "step": 8550 }, { "epoch": 6.625386996904025, "grad_norm": 0.012752283364534378, "learning_rate": 6.7569659442724464e-06, "loss": 0.928, "step": 8560 }, { "epoch": 6.63312693498452, "grad_norm": 0.04218968749046326, "learning_rate": 6.741486068111456e-06, "loss": 0.318, "step": 8570 }, { "epoch": 6.640866873065016, "grad_norm": 10.485530853271484, "learning_rate": 6.726006191950465e-06, "loss": 1.0848, "step": 8580 }, { "epoch": 6.648606811145511, "grad_norm": 1.3498311042785645, "learning_rate": 6.710526315789474e-06, "loss": 0.0836, "step": 8590 }, { "epoch": 6.656346749226007, "grad_norm": 0.17515872418880463, "learning_rate": 6.695046439628483e-06, "loss": 0.2477, "step": 8600 }, { "epoch": 6.664086687306502, "grad_norm": 0.05281757563352585, "learning_rate": 6.679566563467493e-06, "loss": 0.5538, "step": 8610 }, { "epoch": 6.671826625386997, "grad_norm": 0.8526882529258728, "learning_rate": 6.664086687306502e-06, "loss": 0.4636, "step": 8620 }, { "epoch": 6.679566563467493, "grad_norm": 0.7361013293266296, "learning_rate": 6.648606811145511e-06, "loss": 0.0374, "step": 8630 }, { "epoch": 6.687306501547988, "grad_norm": 0.06277801841497421, "learning_rate": 6.6331269349845205e-06, "loss": 0.7779, "step": 8640 }, { "epoch": 6.695046439628483, "grad_norm": 0.2898872196674347, "learning_rate": 6.61764705882353e-06, "loss": 0.507, "step": 8650 }, { "epoch": 6.7027863777089784, "grad_norm": 0.03553653135895729, "learning_rate": 6.602167182662539e-06, "loss": 1.1524, "step": 8660 }, { "epoch": 6.7105263157894735, "grad_norm": 16.942094802856445, "learning_rate": 6.586687306501548e-06, "loss": 0.5067, "step": 8670 }, { "epoch": 6.718266253869969, "grad_norm": 0.4304063618183136, "learning_rate": 6.571207430340558e-06, "loss": 0.0207, "step": 8680 }, { "epoch": 6.726006191950464, "grad_norm": 
0.06903474777936935, "learning_rate": 6.555727554179567e-06, "loss": 0.6542, "step": 8690 }, { "epoch": 6.73374613003096, "grad_norm": 0.04965166002511978, "learning_rate": 6.540247678018576e-06, "loss": 0.0383, "step": 8700 }, { "epoch": 6.741486068111455, "grad_norm": 0.03767887502908707, "learning_rate": 6.5247678018575855e-06, "loss": 0.6611, "step": 8710 }, { "epoch": 6.74922600619195, "grad_norm": 0.0658891573548317, "learning_rate": 6.509287925696595e-06, "loss": 0.0977, "step": 8720 }, { "epoch": 6.756965944272446, "grad_norm": 3.215214252471924, "learning_rate": 6.493808049535604e-06, "loss": 0.1595, "step": 8730 }, { "epoch": 6.764705882352941, "grad_norm": 0.03766189143061638, "learning_rate": 6.478328173374614e-06, "loss": 0.6046, "step": 8740 }, { "epoch": 6.772445820433436, "grad_norm": 0.014408232644200325, "learning_rate": 6.462848297213623e-06, "loss": 0.3998, "step": 8750 }, { "epoch": 6.780185758513932, "grad_norm": 0.02371295355260372, "learning_rate": 6.447368421052632e-06, "loss": 0.3583, "step": 8760 }, { "epoch": 6.787925696594427, "grad_norm": 0.13063675165176392, "learning_rate": 6.431888544891641e-06, "loss": 0.3432, "step": 8770 }, { "epoch": 6.795665634674923, "grad_norm": 2.951378345489502, "learning_rate": 6.41640866873065e-06, "loss": 0.5286, "step": 8780 }, { "epoch": 6.803405572755418, "grad_norm": 28.54975128173828, "learning_rate": 6.4009287925696596e-06, "loss": 0.0399, "step": 8790 }, { "epoch": 6.811145510835914, "grad_norm": 0.03477620705962181, "learning_rate": 6.3854489164086696e-06, "loss": 0.5798, "step": 8800 }, { "epoch": 6.818885448916409, "grad_norm": 17.899215698242188, "learning_rate": 6.369969040247679e-06, "loss": 0.6742, "step": 8810 }, { "epoch": 6.826625386996904, "grad_norm": 0.027265943586826324, "learning_rate": 6.354489164086688e-06, "loss": 0.6302, "step": 8820 }, { "epoch": 6.8343653250774, "grad_norm": 0.0378473736345768, "learning_rate": 6.339009287925697e-06, "loss": 0.4832, "step": 8830 }, { "epoch": 
6.842105263157895, "grad_norm": 38.767906188964844, "learning_rate": 6.323529411764706e-06, "loss": 1.3961, "step": 8840 }, { "epoch": 6.84984520123839, "grad_norm": 29.71550750732422, "learning_rate": 6.308049535603715e-06, "loss": 1.4259, "step": 8850 }, { "epoch": 6.857585139318886, "grad_norm": 0.03491980955004692, "learning_rate": 6.292569659442725e-06, "loss": 0.5028, "step": 8860 }, { "epoch": 6.865325077399381, "grad_norm": 0.43304648995399475, "learning_rate": 6.2770897832817345e-06, "loss": 0.2357, "step": 8870 }, { "epoch": 6.8730650154798765, "grad_norm": 0.019276734441518784, "learning_rate": 6.261609907120744e-06, "loss": 0.0258, "step": 8880 }, { "epoch": 6.8808049535603715, "grad_norm": 0.09815941751003265, "learning_rate": 6.246130030959753e-06, "loss": 0.4473, "step": 8890 }, { "epoch": 6.888544891640867, "grad_norm": 0.034711070358753204, "learning_rate": 6.230650154798762e-06, "loss": 0.5598, "step": 8900 }, { "epoch": 6.896284829721362, "grad_norm": 0.04081498831510544, "learning_rate": 6.215170278637771e-06, "loss": 0.5652, "step": 8910 }, { "epoch": 6.904024767801857, "grad_norm": 0.036093372851610184, "learning_rate": 6.199690402476781e-06, "loss": 0.0192, "step": 8920 }, { "epoch": 6.911764705882353, "grad_norm": 64.654052734375, "learning_rate": 6.18421052631579e-06, "loss": 0.4782, "step": 8930 }, { "epoch": 6.919504643962848, "grad_norm": 0.018247678875923157, "learning_rate": 6.1687306501547994e-06, "loss": 0.8251, "step": 8940 }, { "epoch": 6.927244582043343, "grad_norm": 0.02691546268761158, "learning_rate": 6.153250773993809e-06, "loss": 0.8432, "step": 8950 }, { "epoch": 6.934984520123839, "grad_norm": 0.07483381032943726, "learning_rate": 6.137770897832818e-06, "loss": 0.4064, "step": 8960 }, { "epoch": 6.942724458204334, "grad_norm": 0.14961549639701843, "learning_rate": 6.123839009287926e-06, "loss": 0.887, "step": 8970 }, { "epoch": 6.95046439628483, "grad_norm": 0.05196261778473854, "learning_rate": 6.108359133126935e-06, 
"loss": 0.0128, "step": 8980 }, { "epoch": 6.958204334365325, "grad_norm": 0.02424195222556591, "learning_rate": 6.092879256965944e-06, "loss": 0.0115, "step": 8990 }, { "epoch": 6.965944272445821, "grad_norm": 0.013693656772375107, "learning_rate": 6.0773993808049535e-06, "loss": 1.4998, "step": 9000 }, { "epoch": 6.973684210526316, "grad_norm": 5.620328903198242, "learning_rate": 6.0619195046439635e-06, "loss": 0.5313, "step": 9010 }, { "epoch": 6.981424148606811, "grad_norm": 3.050372362136841, "learning_rate": 6.046439628482973e-06, "loss": 0.3213, "step": 9020 }, { "epoch": 6.989164086687307, "grad_norm": 5.6900315284729, "learning_rate": 6.030959752321982e-06, "loss": 0.3541, "step": 9030 }, { "epoch": 6.996904024767802, "grad_norm": 0.059771519154310226, "learning_rate": 6.015479876160991e-06, "loss": 0.2809, "step": 9040 }, { "epoch": 7.0, "eval_accuracy": 0.84, "eval_f1": 0.8386100386100387, "eval_loss": 0.7678514122962952, "eval_runtime": 1.3061, "eval_samples_per_second": 38.283, "eval_steps_per_second": 38.283, "step": 9044 }, { "epoch": 7.004643962848297, "grad_norm": 0.053383901715278625, "learning_rate": 6e-06, "loss": 1.279, "step": 9050 }, { "epoch": 7.012383900928793, "grad_norm": 0.07739610224962234, "learning_rate": 5.984520123839009e-06, "loss": 0.709, "step": 9060 }, { "epoch": 7.020123839009288, "grad_norm": 61.138858795166016, "learning_rate": 5.969040247678019e-06, "loss": 1.485, "step": 9070 }, { "epoch": 7.027863777089784, "grad_norm": 49.466190338134766, "learning_rate": 5.9535603715170285e-06, "loss": 0.2435, "step": 9080 }, { "epoch": 7.035603715170279, "grad_norm": 12.681292533874512, "learning_rate": 5.938080495356038e-06, "loss": 0.6268, "step": 9090 }, { "epoch": 7.043343653250774, "grad_norm": 0.137156680226326, "learning_rate": 5.922600619195047e-06, "loss": 0.4507, "step": 9100 }, { "epoch": 7.0510835913312695, "grad_norm": 0.021559931337833405, "learning_rate": 5.907120743034056e-06, "loss": 0.1113, "step": 9110 }, { "epoch": 
7.0588235294117645, "grad_norm": 3.805744171142578, "learning_rate": 5.891640866873065e-06, "loss": 0.9724, "step": 9120 }, { "epoch": 7.06656346749226, "grad_norm": 0.05050995573401451, "learning_rate": 5.876160990712074e-06, "loss": 1.4029, "step": 9130 }, { "epoch": 7.074303405572755, "grad_norm": 0.07934688031673431, "learning_rate": 5.860681114551084e-06, "loss": 0.1794, "step": 9140 }, { "epoch": 7.08204334365325, "grad_norm": 3.874812126159668, "learning_rate": 5.845201238390093e-06, "loss": 1.1601, "step": 9150 }, { "epoch": 7.089783281733746, "grad_norm": 0.7336801290512085, "learning_rate": 5.8297213622291026e-06, "loss": 0.0603, "step": 9160 }, { "epoch": 7.097523219814241, "grad_norm": 0.2147514373064041, "learning_rate": 5.814241486068112e-06, "loss": 0.4185, "step": 9170 }, { "epoch": 7.105263157894737, "grad_norm": 0.05408315733075142, "learning_rate": 5.798761609907121e-06, "loss": 0.0334, "step": 9180 }, { "epoch": 7.113003095975232, "grad_norm": 0.03233062103390694, "learning_rate": 5.78328173374613e-06, "loss": 0.3426, "step": 9190 }, { "epoch": 7.120743034055727, "grad_norm": 0.02257455699145794, "learning_rate": 5.76780185758514e-06, "loss": 0.8589, "step": 9200 }, { "epoch": 7.128482972136223, "grad_norm": 0.04232734814286232, "learning_rate": 5.752321981424149e-06, "loss": 0.751, "step": 9210 }, { "epoch": 7.136222910216718, "grad_norm": 0.03816604986786842, "learning_rate": 5.736842105263158e-06, "loss": 1.1984, "step": 9220 }, { "epoch": 7.143962848297214, "grad_norm": 0.047452472150325775, "learning_rate": 5.7213622291021675e-06, "loss": 0.4418, "step": 9230 }, { "epoch": 7.151702786377709, "grad_norm": 0.04547743871808052, "learning_rate": 5.705882352941177e-06, "loss": 0.4915, "step": 9240 }, { "epoch": 7.159442724458204, "grad_norm": 0.05770479887723923, "learning_rate": 5.690402476780186e-06, "loss": 0.3321, "step": 9250 }, { "epoch": 7.1671826625387, "grad_norm": 0.2581842541694641, "learning_rate": 5.674922600619196e-06, "loss": 
0.0164, "step": 9260 }, { "epoch": 7.174922600619195, "grad_norm": 0.3079908788204193, "learning_rate": 5.659442724458205e-06, "loss": 0.3508, "step": 9270 }, { "epoch": 7.182662538699691, "grad_norm": 13.49255657196045, "learning_rate": 5.643962848297214e-06, "loss": 1.284, "step": 9280 }, { "epoch": 7.190402476780186, "grad_norm": 66.24537658691406, "learning_rate": 5.628482972136223e-06, "loss": 0.9771, "step": 9290 }, { "epoch": 7.198142414860681, "grad_norm": 10.241527557373047, "learning_rate": 5.6130030959752324e-06, "loss": 1.0646, "step": 9300 }, { "epoch": 7.205882352941177, "grad_norm": 20.768251419067383, "learning_rate": 5.597523219814242e-06, "loss": 0.3348, "step": 9310 }, { "epoch": 7.213622291021672, "grad_norm": 58.89311599731445, "learning_rate": 5.582043343653252e-06, "loss": 0.4057, "step": 9320 }, { "epoch": 7.2213622291021675, "grad_norm": 64.81076049804688, "learning_rate": 5.566563467492261e-06, "loss": 0.2846, "step": 9330 }, { "epoch": 7.2291021671826625, "grad_norm": 0.03546363115310669, "learning_rate": 5.55108359133127e-06, "loss": 0.4741, "step": 9340 }, { "epoch": 7.2368421052631575, "grad_norm": 9.74581241607666, "learning_rate": 5.535603715170279e-06, "loss": 1.1338, "step": 9350 }, { "epoch": 7.244582043343653, "grad_norm": 0.5772343873977661, "learning_rate": 5.520123839009288e-06, "loss": 0.0496, "step": 9360 }, { "epoch": 7.252321981424148, "grad_norm": 47.405967712402344, "learning_rate": 5.504643962848297e-06, "loss": 0.4243, "step": 9370 }, { "epoch": 7.260061919504644, "grad_norm": 0.03672086447477341, "learning_rate": 5.489164086687307e-06, "loss": 1.6398, "step": 9380 }, { "epoch": 7.267801857585139, "grad_norm": 0.13642112910747528, "learning_rate": 5.4736842105263165e-06, "loss": 1.2174, "step": 9390 }, { "epoch": 7.275541795665634, "grad_norm": 9.857564926147461, "learning_rate": 5.458204334365326e-06, "loss": 0.7216, "step": 9400 }, { "epoch": 7.28328173374613, "grad_norm": 28.288990020751953, "learning_rate": 
5.442724458204335e-06, "loss": 0.2523, "step": 9410 }, { "epoch": 7.291021671826625, "grad_norm": 57.741695404052734, "learning_rate": 5.427244582043344e-06, "loss": 0.7204, "step": 9420 }, { "epoch": 7.298761609907121, "grad_norm": 0.0713956281542778, "learning_rate": 5.411764705882353e-06, "loss": 1.6416, "step": 9430 }, { "epoch": 7.306501547987616, "grad_norm": 44.71361541748047, "learning_rate": 5.396284829721362e-06, "loss": 0.092, "step": 9440 }, { "epoch": 7.314241486068111, "grad_norm": 0.02380669116973877, "learning_rate": 5.380804953560372e-06, "loss": 0.0336, "step": 9450 }, { "epoch": 7.321981424148607, "grad_norm": 2.231900691986084, "learning_rate": 5.3653250773993815e-06, "loss": 0.4397, "step": 9460 }, { "epoch": 7.329721362229102, "grad_norm": 0.2188711315393448, "learning_rate": 5.349845201238391e-06, "loss": 0.0284, "step": 9470 }, { "epoch": 7.337461300309598, "grad_norm": 0.035856977105140686, "learning_rate": 5.3343653250774e-06, "loss": 0.3627, "step": 9480 }, { "epoch": 7.345201238390093, "grad_norm": 0.031220568343997, "learning_rate": 5.318885448916409e-06, "loss": 0.5151, "step": 9490 }, { "epoch": 7.352941176470588, "grad_norm": 2.283757209777832, "learning_rate": 5.303405572755418e-06, "loss": 0.4283, "step": 9500 }, { "epoch": 7.360681114551084, "grad_norm": 0.13574466109275818, "learning_rate": 5.287925696594428e-06, "loss": 0.2437, "step": 9510 }, { "epoch": 7.368421052631579, "grad_norm": 0.04867443069815636, "learning_rate": 5.272445820433437e-06, "loss": 0.8115, "step": 9520 }, { "epoch": 7.376160990712075, "grad_norm": 0.030066927894949913, "learning_rate": 5.256965944272446e-06, "loss": 0.8971, "step": 9530 }, { "epoch": 7.38390092879257, "grad_norm": 46.0897331237793, "learning_rate": 5.2414860681114555e-06, "loss": 0.5933, "step": 9540 }, { "epoch": 7.391640866873065, "grad_norm": 0.027414608746767044, "learning_rate": 5.226006191950465e-06, "loss": 0.1437, "step": 9550 }, { "epoch": 7.3993808049535605, "grad_norm": 
0.06854629516601562, "learning_rate": 5.210526315789474e-06, "loss": 1.0164, "step": 9560 }, { "epoch": 7.4071207430340555, "grad_norm": 0.04333164542913437, "learning_rate": 5.195046439628484e-06, "loss": 0.3303, "step": 9570 }, { "epoch": 7.414860681114551, "grad_norm": 30.47051429748535, "learning_rate": 5.179566563467493e-06, "loss": 0.1376, "step": 9580 }, { "epoch": 7.422600619195046, "grad_norm": 0.06368532031774521, "learning_rate": 5.164086687306502e-06, "loss": 0.5414, "step": 9590 }, { "epoch": 7.430340557275541, "grad_norm": 0.6605077981948853, "learning_rate": 5.148606811145511e-06, "loss": 0.5301, "step": 9600 }, { "epoch": 7.438080495356037, "grad_norm": 61.88010025024414, "learning_rate": 5.1331269349845205e-06, "loss": 0.8437, "step": 9610 }, { "epoch": 7.445820433436532, "grad_norm": 0.05211161449551582, "learning_rate": 5.11764705882353e-06, "loss": 0.7713, "step": 9620 }, { "epoch": 7.453560371517028, "grad_norm": 0.6048247218132019, "learning_rate": 5.10216718266254e-06, "loss": 0.5187, "step": 9630 }, { "epoch": 7.461300309597523, "grad_norm": 0.9319159984588623, "learning_rate": 5.086687306501549e-06, "loss": 0.0533, "step": 9640 }, { "epoch": 7.469040247678018, "grad_norm": 31.288846969604492, "learning_rate": 5.071207430340558e-06, "loss": 0.6297, "step": 9650 }, { "epoch": 7.476780185758514, "grad_norm": 0.14734207093715668, "learning_rate": 5.055727554179567e-06, "loss": 0.0338, "step": 9660 }, { "epoch": 7.484520123839009, "grad_norm": 10.709227561950684, "learning_rate": 5.040247678018576e-06, "loss": 0.9159, "step": 9670 }, { "epoch": 7.492260061919505, "grad_norm": 31.05280113220215, "learning_rate": 5.024767801857585e-06, "loss": 0.9866, "step": 9680 }, { "epoch": 7.5, "grad_norm": 0.035152412950992584, "learning_rate": 5.0092879256965954e-06, "loss": 0.1535, "step": 9690 }, { "epoch": 7.507739938080495, "grad_norm": 0.044311169534921646, "learning_rate": 4.993808049535604e-06, "loss": 0.0412, "step": 9700 }, { "epoch": 
7.515479876160991, "grad_norm": 0.022498058155179024, "learning_rate": 4.978328173374614e-06, "loss": 0.0979, "step": 9710 }, { "epoch": 7.523219814241486, "grad_norm": 0.040518198162317276, "learning_rate": 4.962848297213623e-06, "loss": 0.4396, "step": 9720 }, { "epoch": 7.530959752321982, "grad_norm": 0.09141626209020615, "learning_rate": 4.947368421052632e-06, "loss": 0.0384, "step": 9730 }, { "epoch": 7.538699690402477, "grad_norm": 0.06551285833120346, "learning_rate": 4.931888544891641e-06, "loss": 0.0518, "step": 9740 }, { "epoch": 7.546439628482972, "grad_norm": 0.04375738650560379, "learning_rate": 4.91640866873065e-06, "loss": 1.0439, "step": 9750 }, { "epoch": 7.554179566563468, "grad_norm": 1.9471406936645508, "learning_rate": 4.9009287925696595e-06, "loss": 0.1638, "step": 9760 }, { "epoch": 7.561919504643963, "grad_norm": 0.3752700388431549, "learning_rate": 4.8854489164086695e-06, "loss": 0.5112, "step": 9770 }, { "epoch": 7.569659442724459, "grad_norm": 0.02589801885187626, "learning_rate": 4.869969040247679e-06, "loss": 0.0309, "step": 9780 }, { "epoch": 7.577399380804954, "grad_norm": 12.285547256469727, "learning_rate": 4.854489164086688e-06, "loss": 0.5626, "step": 9790 }, { "epoch": 7.585139318885449, "grad_norm": 0.03315320983529091, "learning_rate": 4.839009287925697e-06, "loss": 0.0769, "step": 9800 }, { "epoch": 7.5928792569659445, "grad_norm": 0.47495362162590027, "learning_rate": 4.823529411764706e-06, "loss": 0.8151, "step": 9810 }, { "epoch": 7.6006191950464395, "grad_norm": 0.17149126529693604, "learning_rate": 4.808049535603715e-06, "loss": 0.5982, "step": 9820 }, { "epoch": 7.608359133126935, "grad_norm": 0.033966850489377975, "learning_rate": 4.792569659442725e-06, "loss": 0.3156, "step": 9830 }, { "epoch": 7.61609907120743, "grad_norm": 0.08276546746492386, "learning_rate": 4.7770897832817345e-06, "loss": 0.2289, "step": 9840 }, { "epoch": 7.623839009287925, "grad_norm": 0.024919508025050163, "learning_rate": 
4.761609907120744e-06, "loss": 0.1095, "step": 9850 }, { "epoch": 7.631578947368421, "grad_norm": 0.17491966485977173, "learning_rate": 4.746130030959753e-06, "loss": 0.2621, "step": 9860 }, { "epoch": 7.639318885448916, "grad_norm": 0.027620425447821617, "learning_rate": 4.730650154798762e-06, "loss": 0.3022, "step": 9870 }, { "epoch": 7.647058823529412, "grad_norm": 0.06240718066692352, "learning_rate": 4.715170278637771e-06, "loss": 1.0774, "step": 9880 }, { "epoch": 7.654798761609907, "grad_norm": 0.043136876076459885, "learning_rate": 4.69969040247678e-06, "loss": 0.0113, "step": 9890 }, { "epoch": 7.662538699690402, "grad_norm": 0.016377059742808342, "learning_rate": 4.68421052631579e-06, "loss": 0.0628, "step": 9900 }, { "epoch": 7.670278637770898, "grad_norm": 0.02461991272866726, "learning_rate": 4.668730650154799e-06, "loss": 0.4368, "step": 9910 }, { "epoch": 7.678018575851393, "grad_norm": 0.03219279274344444, "learning_rate": 4.6532507739938085e-06, "loss": 0.7435, "step": 9920 }, { "epoch": 7.685758513931889, "grad_norm": 1.2825266122817993, "learning_rate": 4.637770897832818e-06, "loss": 0.0178, "step": 9930 }, { "epoch": 7.693498452012384, "grad_norm": 0.06038570776581764, "learning_rate": 4.622291021671827e-06, "loss": 0.0412, "step": 9940 }, { "epoch": 7.701238390092879, "grad_norm": 0.010630088858306408, "learning_rate": 4.606811145510836e-06, "loss": 0.6543, "step": 9950 }, { "epoch": 7.708978328173375, "grad_norm": 34.03522491455078, "learning_rate": 4.591331269349846e-06, "loss": 0.4795, "step": 9960 }, { "epoch": 7.71671826625387, "grad_norm": 0.04777824506163597, "learning_rate": 4.575851393188855e-06, "loss": 0.1216, "step": 9970 }, { "epoch": 7.724458204334366, "grad_norm": 0.028296375647187233, "learning_rate": 4.560371517027864e-06, "loss": 0.4903, "step": 9980 }, { "epoch": 7.732198142414861, "grad_norm": 0.4013304114341736, "learning_rate": 4.5448916408668735e-06, "loss": 0.0108, "step": 9990 }, { "epoch": 7.739938080495356, 
"grad_norm": 0.3253251910209656, "learning_rate": 4.529411764705883e-06, "loss": 0.0152, "step": 10000 }, { "epoch": 7.747678018575852, "grad_norm": 0.056400660425424576, "learning_rate": 4.513931888544892e-06, "loss": 0.9604, "step": 10010 }, { "epoch": 7.755417956656347, "grad_norm": 35.374900817871094, "learning_rate": 4.498452012383902e-06, "loss": 0.3504, "step": 10020 }, { "epoch": 7.7631578947368425, "grad_norm": 1.2675303220748901, "learning_rate": 4.482972136222911e-06, "loss": 0.2354, "step": 10030 }, { "epoch": 7.7708978328173375, "grad_norm": 0.03075457364320755, "learning_rate": 4.46749226006192e-06, "loss": 1.3869, "step": 10040 }, { "epoch": 7.7786377708978325, "grad_norm": 0.022344134747982025, "learning_rate": 4.452012383900929e-06, "loss": 0.101, "step": 10050 }, { "epoch": 7.786377708978328, "grad_norm": 0.02855776809155941, "learning_rate": 4.436532507739938e-06, "loss": 0.0205, "step": 10060 }, { "epoch": 7.794117647058823, "grad_norm": 32.65413284301758, "learning_rate": 4.4210526315789476e-06, "loss": 0.7439, "step": 10070 }, { "epoch": 7.801857585139319, "grad_norm": 0.05484652891755104, "learning_rate": 4.4055727554179576e-06, "loss": 0.5829, "step": 10080 }, { "epoch": 7.809597523219814, "grad_norm": 0.02822122909128666, "learning_rate": 4.390092879256967e-06, "loss": 0.2481, "step": 10090 }, { "epoch": 7.817337461300309, "grad_norm": 0.7090976238250732, "learning_rate": 4.374613003095976e-06, "loss": 0.0634, "step": 10100 }, { "epoch": 7.825077399380805, "grad_norm": 0.027229061350226402, "learning_rate": 4.359133126934985e-06, "loss": 0.0992, "step": 10110 }, { "epoch": 7.8328173374613, "grad_norm": 0.034177087247371674, "learning_rate": 4.343653250773994e-06, "loss": 0.0234, "step": 10120 }, { "epoch": 7.840557275541796, "grad_norm": 0.13867497444152832, "learning_rate": 4.328173374613003e-06, "loss": 0.7203, "step": 10130 }, { "epoch": 7.848297213622291, "grad_norm": 0.11251337826251984, "learning_rate": 4.3126934984520125e-06, "loss": 
0.4493, "step": 10140 }, { "epoch": 7.856037151702786, "grad_norm": 0.021405575796961784, "learning_rate": 4.2972136222910225e-06, "loss": 0.1519, "step": 10150 }, { "epoch": 7.863777089783282, "grad_norm": 0.4485969841480255, "learning_rate": 4.281733746130031e-06, "loss": 0.0096, "step": 10160 }, { "epoch": 7.871517027863777, "grad_norm": 0.03996383026242256, "learning_rate": 4.26625386996904e-06, "loss": 0.0229, "step": 10170 }, { "epoch": 7.879256965944273, "grad_norm": 0.0937669426202774, "learning_rate": 4.25077399380805e-06, "loss": 0.0558, "step": 10180 }, { "epoch": 7.886996904024768, "grad_norm": 0.023265693336725235, "learning_rate": 4.235294117647059e-06, "loss": 0.1326, "step": 10190 }, { "epoch": 7.894736842105263, "grad_norm": 0.030038027092814445, "learning_rate": 4.219814241486068e-06, "loss": 0.0971, "step": 10200 }, { "epoch": 7.902476780185759, "grad_norm": 14.853510856628418, "learning_rate": 4.204334365325078e-06, "loss": 0.5487, "step": 10210 }, { "epoch": 7.910216718266254, "grad_norm": 0.011156076565384865, "learning_rate": 4.188854489164087e-06, "loss": 0.4948, "step": 10220 }, { "epoch": 7.91795665634675, "grad_norm": 0.044642455875873566, "learning_rate": 4.173374613003096e-06, "loss": 0.2767, "step": 10230 }, { "epoch": 7.925696594427245, "grad_norm": 0.017108885571360588, "learning_rate": 4.157894736842106e-06, "loss": 0.0081, "step": 10240 }, { "epoch": 7.93343653250774, "grad_norm": 0.028285803273320198, "learning_rate": 4.142414860681115e-06, "loss": 0.009, "step": 10250 }, { "epoch": 7.9411764705882355, "grad_norm": 73.00074768066406, "learning_rate": 4.126934984520124e-06, "loss": 0.3513, "step": 10260 }, { "epoch": 7.9489164086687305, "grad_norm": 0.040326014161109924, "learning_rate": 4.111455108359133e-06, "loss": 0.2766, "step": 10270 }, { "epoch": 7.956656346749226, "grad_norm": 2.273895025253296, "learning_rate": 4.095975232198142e-06, "loss": 0.4503, "step": 10280 }, { "epoch": 7.964396284829721, "grad_norm": 
0.029970407485961914, "learning_rate": 4.0804953560371515e-06, "loss": 1.4588, "step": 10290 }, { "epoch": 7.972136222910216, "grad_norm": 0.029040377587080002, "learning_rate": 4.0650154798761615e-06, "loss": 0.0084, "step": 10300 }, { "epoch": 7.979876160990712, "grad_norm": 0.24737367033958435, "learning_rate": 4.049535603715171e-06, "loss": 0.0075, "step": 10310 }, { "epoch": 7.987616099071207, "grad_norm": 0.7249574661254883, "learning_rate": 4.03405572755418e-06, "loss": 0.4685, "step": 10320 }, { "epoch": 7.995356037151703, "grad_norm": 0.8763111233711243, "learning_rate": 4.018575851393189e-06, "loss": 0.0096, "step": 10330 }, { "epoch": 8.0, "eval_accuracy": 0.84, "eval_f1": 0.8386100386100387, "eval_loss": 0.7574268579483032, "eval_runtime": 1.3074, "eval_samples_per_second": 38.243, "eval_steps_per_second": 38.243, "step": 10336 }, { "epoch": 8.003095975232197, "grad_norm": 0.5625190138816833, "learning_rate": 4.003095975232198e-06, "loss": 0.1157, "step": 10340 }, { "epoch": 8.010835913312693, "grad_norm": 1.6426266431808472, "learning_rate": 3.987616099071207e-06, "loss": 0.4731, "step": 10350 }, { "epoch": 8.018575851393189, "grad_norm": 34.64018630981445, "learning_rate": 3.972136222910217e-06, "loss": 0.1029, "step": 10360 }, { "epoch": 8.026315789473685, "grad_norm": 0.21714405715465546, "learning_rate": 3.9566563467492265e-06, "loss": 0.4889, "step": 10370 }, { "epoch": 8.034055727554179, "grad_norm": 0.016920117661356926, "learning_rate": 3.941176470588236e-06, "loss": 0.4952, "step": 10380 }, { "epoch": 8.041795665634675, "grad_norm": 0.09030075371265411, "learning_rate": 3.925696594427245e-06, "loss": 0.0082, "step": 10390 }, { "epoch": 8.04953560371517, "grad_norm": 0.03461069241166115, "learning_rate": 3.910216718266254e-06, "loss": 0.083, "step": 10400 }, { "epoch": 8.057275541795665, "grad_norm": 0.024260008707642555, "learning_rate": 3.894736842105263e-06, "loss": 0.0305, "step": 10410 }, { "epoch": 8.06501547987616, "grad_norm": 
0.05174719914793968, "learning_rate": 3.879256965944273e-06, "loss": 0.5514, "step": 10420 }, { "epoch": 8.072755417956657, "grad_norm": 0.015384849160909653, "learning_rate": 3.863777089783282e-06, "loss": 0.4358, "step": 10430 }, { "epoch": 8.08049535603715, "grad_norm": 0.4380377531051636, "learning_rate": 3.848297213622291e-06, "loss": 0.2483, "step": 10440 }, { "epoch": 8.088235294117647, "grad_norm": 0.035760823637247086, "learning_rate": 3.8328173374613006e-06, "loss": 0.0226, "step": 10450 }, { "epoch": 8.095975232198143, "grad_norm": 0.027462555095553398, "learning_rate": 3.81733746130031e-06, "loss": 0.6408, "step": 10460 }, { "epoch": 8.103715170278639, "grad_norm": 0.04360973834991455, "learning_rate": 3.8018575851393193e-06, "loss": 0.3414, "step": 10470 }, { "epoch": 8.111455108359133, "grad_norm": 0.03566218540072441, "learning_rate": 3.786377708978328e-06, "loss": 0.3618, "step": 10480 }, { "epoch": 8.119195046439629, "grad_norm": 0.8556517958641052, "learning_rate": 3.770897832817338e-06, "loss": 0.198, "step": 10490 }, { "epoch": 8.126934984520124, "grad_norm": 0.028089012950658798, "learning_rate": 3.755417956656347e-06, "loss": 0.1441, "step": 10500 }, { "epoch": 8.134674922600619, "grad_norm": 0.13658052682876587, "learning_rate": 3.739938080495356e-06, "loss": 0.008, "step": 10510 }, { "epoch": 8.142414860681114, "grad_norm": 0.06542599946260452, "learning_rate": 3.724458204334366e-06, "loss": 0.2318, "step": 10520 }, { "epoch": 8.15015479876161, "grad_norm": 0.026353923603892326, "learning_rate": 3.708978328173375e-06, "loss": 0.3526, "step": 10530 }, { "epoch": 8.157894736842104, "grad_norm": 0.042792681604623795, "learning_rate": 3.693498452012384e-06, "loss": 0.3716, "step": 10540 }, { "epoch": 8.1656346749226, "grad_norm": 2.907512664794922, "learning_rate": 3.678018575851394e-06, "loss": 0.0848, "step": 10550 }, { "epoch": 8.173374613003096, "grad_norm": 0.2566719353199005, "learning_rate": 3.662538699690403e-06, "loss": 1.4183, "step": 
10560 }, { "epoch": 8.181114551083592, "grad_norm": 53.634490966796875, "learning_rate": 3.6470588235294117e-06, "loss": 0.8437, "step": 10570 }, { "epoch": 8.188854489164086, "grad_norm": 0.02184741385281086, "learning_rate": 3.6315789473684217e-06, "loss": 0.9481, "step": 10580 }, { "epoch": 8.196594427244582, "grad_norm": 0.019968654960393906, "learning_rate": 3.6160990712074304e-06, "loss": 0.2767, "step": 10590 }, { "epoch": 8.204334365325078, "grad_norm": 0.010409082286059856, "learning_rate": 3.6006191950464396e-06, "loss": 0.5566, "step": 10600 }, { "epoch": 8.212074303405572, "grad_norm": 0.0694267600774765, "learning_rate": 3.5851393188854496e-06, "loss": 1.2456, "step": 10610 }, { "epoch": 8.219814241486068, "grad_norm": 0.01388638000935316, "learning_rate": 3.5696594427244583e-06, "loss": 0.2401, "step": 10620 }, { "epoch": 8.227554179566564, "grad_norm": 0.02353646047413349, "learning_rate": 3.5541795665634675e-06, "loss": 0.0094, "step": 10630 }, { "epoch": 8.235294117647058, "grad_norm": 0.015857504680752754, "learning_rate": 3.5386996904024775e-06, "loss": 0.9901, "step": 10640 }, { "epoch": 8.243034055727554, "grad_norm": 0.11505137383937836, "learning_rate": 3.5232198142414862e-06, "loss": 1.1086, "step": 10650 }, { "epoch": 8.25077399380805, "grad_norm": 0.030285712331533432, "learning_rate": 3.5077399380804954e-06, "loss": 0.175, "step": 10660 }, { "epoch": 8.258513931888546, "grad_norm": 35.11290740966797, "learning_rate": 3.4922600619195054e-06, "loss": 1.2155, "step": 10670 }, { "epoch": 8.26625386996904, "grad_norm": 0.6865656971931458, "learning_rate": 3.476780185758514e-06, "loss": 0.8421, "step": 10680 }, { "epoch": 8.273993808049536, "grad_norm": 0.1924859583377838, "learning_rate": 3.4613003095975233e-06, "loss": 0.7124, "step": 10690 }, { "epoch": 8.281733746130032, "grad_norm": 0.015945428982377052, "learning_rate": 3.4458204334365333e-06, "loss": 0.0118, "step": 10700 }, { "epoch": 8.289473684210526, "grad_norm": 24.189233779907227, 
"learning_rate": 3.430340557275542e-06, "loss": 0.6079, "step": 10710 }, { "epoch": 8.297213622291022, "grad_norm": 65.20381927490234, "learning_rate": 3.414860681114551e-06, "loss": 0.603, "step": 10720 }, { "epoch": 8.304953560371517, "grad_norm": 33.175048828125, "learning_rate": 3.3993808049535603e-06, "loss": 0.0355, "step": 10730 }, { "epoch": 8.312693498452012, "grad_norm": 0.02605711854994297, "learning_rate": 3.38390092879257e-06, "loss": 0.1237, "step": 10740 }, { "epoch": 8.320433436532507, "grad_norm": 0.5962255001068115, "learning_rate": 3.368421052631579e-06, "loss": 0.2312, "step": 10750 }, { "epoch": 8.328173374613003, "grad_norm": 0.12032397091388702, "learning_rate": 3.352941176470588e-06, "loss": 0.4254, "step": 10760 }, { "epoch": 8.3359133126935, "grad_norm": 0.03817048668861389, "learning_rate": 3.3374613003095978e-06, "loss": 0.5434, "step": 10770 }, { "epoch": 8.343653250773993, "grad_norm": 0.026361919939517975, "learning_rate": 3.321981424148607e-06, "loss": 1.0912, "step": 10780 }, { "epoch": 8.35139318885449, "grad_norm": 0.21669290959835052, "learning_rate": 3.306501547987616e-06, "loss": 0.0377, "step": 10790 }, { "epoch": 8.359133126934985, "grad_norm": 0.13411226868629456, "learning_rate": 3.2910216718266257e-06, "loss": 0.2981, "step": 10800 }, { "epoch": 8.36687306501548, "grad_norm": 0.016349023208022118, "learning_rate": 3.275541795665635e-06, "loss": 0.4389, "step": 10810 }, { "epoch": 8.374613003095975, "grad_norm": 30.61772918701172, "learning_rate": 3.260061919504644e-06, "loss": 0.0852, "step": 10820 }, { "epoch": 8.382352941176471, "grad_norm": 0.03886829689145088, "learning_rate": 3.2445820433436536e-06, "loss": 0.3231, "step": 10830 }, { "epoch": 8.390092879256965, "grad_norm": 0.39876118302345276, "learning_rate": 3.2291021671826627e-06, "loss": 0.0402, "step": 10840 }, { "epoch": 8.397832817337461, "grad_norm": 60.39808654785156, "learning_rate": 3.213622291021672e-06, "loss": 0.4348, "step": 10850 }, { "epoch": 
8.405572755417957, "grad_norm": 0.027166040614247322, "learning_rate": 3.1981424148606814e-06, "loss": 0.4946, "step": 10860 }, { "epoch": 8.413312693498453, "grad_norm": 52.32682418823242, "learning_rate": 3.1826625386996906e-06, "loss": 1.068, "step": 10870 }, { "epoch": 8.421052631578947, "grad_norm": 0.12136401236057281, "learning_rate": 3.1671826625386998e-06, "loss": 0.0138, "step": 10880 }, { "epoch": 8.428792569659443, "grad_norm": 0.03123597428202629, "learning_rate": 3.1517027863777093e-06, "loss": 0.5329, "step": 10890 }, { "epoch": 8.436532507739939, "grad_norm": 0.02554795891046524, "learning_rate": 3.1362229102167185e-06, "loss": 0.7094, "step": 10900 }, { "epoch": 8.444272445820433, "grad_norm": 0.030589330941438675, "learning_rate": 3.1207430340557276e-06, "loss": 1.747, "step": 10910 }, { "epoch": 8.452012383900929, "grad_norm": 20.828351974487305, "learning_rate": 3.1052631578947372e-06, "loss": 0.2559, "step": 10920 }, { "epoch": 8.459752321981425, "grad_norm": 0.04463785141706467, "learning_rate": 3.0897832817337464e-06, "loss": 0.0078, "step": 10930 }, { "epoch": 8.467492260061919, "grad_norm": 7.013974666595459, "learning_rate": 3.0743034055727555e-06, "loss": 1.043, "step": 10940 }, { "epoch": 8.475232198142415, "grad_norm": 4.810568809509277, "learning_rate": 3.058823529411765e-06, "loss": 0.5841, "step": 10950 }, { "epoch": 8.48297213622291, "grad_norm": 0.22540149092674255, "learning_rate": 3.0433436532507743e-06, "loss": 0.0096, "step": 10960 }, { "epoch": 8.490712074303406, "grad_norm": 0.042846448719501495, "learning_rate": 3.0278637770897834e-06, "loss": 0.092, "step": 10970 }, { "epoch": 8.4984520123839, "grad_norm": null, "learning_rate": 3.013931888544892e-06, "loss": 0.1966, "step": 10980 }, { "epoch": 8.506191950464396, "grad_norm": 0.05810638144612312, "learning_rate": 2.9984520123839013e-06, "loss": 0.4628, "step": 10990 }, { "epoch": 8.513931888544892, "grad_norm": 0.02283499576151371, "learning_rate": 2.98297213622291e-06, 
"loss": 0.5686, "step": 11000 }, { "epoch": 8.521671826625386, "grad_norm": 0.026276158168911934, "learning_rate": 2.96749226006192e-06, "loss": 0.1626, "step": 11010 }, { "epoch": 8.529411764705882, "grad_norm": 0.3862849771976471, "learning_rate": 2.9520123839009292e-06, "loss": 1.1492, "step": 11020 }, { "epoch": 8.537151702786378, "grad_norm": 1.293433427810669, "learning_rate": 2.936532507739938e-06, "loss": 0.5367, "step": 11030 }, { "epoch": 8.544891640866872, "grad_norm": 0.044649720191955566, "learning_rate": 2.921052631578948e-06, "loss": 0.5282, "step": 11040 }, { "epoch": 8.552631578947368, "grad_norm": 0.0585334450006485, "learning_rate": 2.905572755417957e-06, "loss": 0.2533, "step": 11050 }, { "epoch": 8.560371517027864, "grad_norm": 0.033100664615631104, "learning_rate": 2.890092879256966e-06, "loss": 0.5472, "step": 11060 }, { "epoch": 8.56811145510836, "grad_norm": 0.060769222676754, "learning_rate": 2.874613003095976e-06, "loss": 0.5286, "step": 11070 }, { "epoch": 8.575851393188854, "grad_norm": 0.05562102049589157, "learning_rate": 2.859133126934985e-06, "loss": 0.637, "step": 11080 }, { "epoch": 8.58359133126935, "grad_norm": 0.030359258875250816, "learning_rate": 2.8436532507739937e-06, "loss": 0.211, "step": 11090 }, { "epoch": 8.591331269349846, "grad_norm": 14.27761459350586, "learning_rate": 2.8281733746130037e-06, "loss": 0.0341, "step": 11100 }, { "epoch": 8.59907120743034, "grad_norm": 44.734375, "learning_rate": 2.8126934984520125e-06, "loss": 0.8659, "step": 11110 }, { "epoch": 8.606811145510836, "grad_norm": 0.044410716742277145, "learning_rate": 2.7972136222910216e-06, "loss": 0.7719, "step": 11120 }, { "epoch": 8.614551083591332, "grad_norm": 0.014188318513333797, "learning_rate": 2.7817337461300316e-06, "loss": 0.0836, "step": 11130 }, { "epoch": 8.622291021671826, "grad_norm": 0.07636909186840057, "learning_rate": 2.7662538699690404e-06, "loss": 0.3132, "step": 11140 }, { "epoch": 8.630030959752322, "grad_norm": 
0.06633859872817993, "learning_rate": 2.7507739938080495e-06, "loss": 0.8958, "step": 11150 }, { "epoch": 8.637770897832818, "grad_norm": 0.11431477218866348, "learning_rate": 2.7352941176470595e-06, "loss": 0.5936, "step": 11160 }, { "epoch": 8.645510835913313, "grad_norm": 0.06183658167719841, "learning_rate": 2.7213622291021674e-06, "loss": 0.7049, "step": 11170 }, { "epoch": 8.653250773993808, "grad_norm": 0.0501876026391983, "learning_rate": 2.7058823529411766e-06, "loss": 0.7778, "step": 11180 }, { "epoch": 8.660990712074303, "grad_norm": 5.82777738571167, "learning_rate": 2.690402476780186e-06, "loss": 0.9888, "step": 11190 }, { "epoch": 8.6687306501548, "grad_norm": 0.020140502601861954, "learning_rate": 2.6749226006191953e-06, "loss": 0.0113, "step": 11200 }, { "epoch": 8.676470588235293, "grad_norm": 0.0846886932849884, "learning_rate": 2.6594427244582045e-06, "loss": 0.0743, "step": 11210 }, { "epoch": 8.68421052631579, "grad_norm": 0.042100805789232254, "learning_rate": 2.643962848297214e-06, "loss": 0.5118, "step": 11220 }, { "epoch": 8.691950464396285, "grad_norm": 9.06766128540039, "learning_rate": 2.628482972136223e-06, "loss": 0.0199, "step": 11230 }, { "epoch": 8.69969040247678, "grad_norm": 0.25248879194259644, "learning_rate": 2.6130030959752324e-06, "loss": 0.9631, "step": 11240 }, { "epoch": 8.707430340557275, "grad_norm": 0.028604131191968918, "learning_rate": 2.597523219814242e-06, "loss": 0.0527, "step": 11250 }, { "epoch": 8.715170278637771, "grad_norm": 0.017351288348436356, "learning_rate": 2.582043343653251e-06, "loss": 0.4663, "step": 11260 }, { "epoch": 8.722910216718267, "grad_norm": 0.028541484847664833, "learning_rate": 2.5665634674922602e-06, "loss": 0.0992, "step": 11270 }, { "epoch": 8.730650154798761, "grad_norm": 0.02545725181698799, "learning_rate": 2.55108359133127e-06, "loss": 0.7256, "step": 11280 }, { "epoch": 8.738390092879257, "grad_norm": 0.03570405766367912, "learning_rate": 2.535603715170279e-06, "loss": 0.0482, 
"step": 11290 }, { "epoch": 8.746130030959753, "grad_norm": 0.11926468461751938, "learning_rate": 2.520123839009288e-06, "loss": 0.3195, "step": 11300 }, { "epoch": 8.753869969040247, "grad_norm": 0.12441518902778625, "learning_rate": 2.5046439628482977e-06, "loss": 0.036, "step": 11310 }, { "epoch": 8.761609907120743, "grad_norm": 0.4622245132923126, "learning_rate": 2.489164086687307e-06, "loss": 0.3432, "step": 11320 }, { "epoch": 8.769349845201239, "grad_norm": 1.630367398262024, "learning_rate": 2.473684210526316e-06, "loss": 0.5162, "step": 11330 }, { "epoch": 8.777089783281733, "grad_norm": 0.1955188363790512, "learning_rate": 2.458204334365325e-06, "loss": 1.1155, "step": 11340 }, { "epoch": 8.784829721362229, "grad_norm": 17.6148681640625, "learning_rate": 2.4427244582043348e-06, "loss": 1.1212, "step": 11350 }, { "epoch": 8.792569659442725, "grad_norm": 0.03537004441022873, "learning_rate": 2.427244582043344e-06, "loss": 0.0103, "step": 11360 }, { "epoch": 8.80030959752322, "grad_norm": 0.19991812109947205, "learning_rate": 2.411764705882353e-06, "loss": 0.0134, "step": 11370 }, { "epoch": 8.808049535603715, "grad_norm": 0.018819047138094902, "learning_rate": 2.3962848297213626e-06, "loss": 0.0315, "step": 11380 }, { "epoch": 8.81578947368421, "grad_norm": 8.964973449707031, "learning_rate": 2.380804953560372e-06, "loss": 0.4614, "step": 11390 }, { "epoch": 8.823529411764707, "grad_norm": 0.03933088481426239, "learning_rate": 2.365325077399381e-06, "loss": 0.3536, "step": 11400 }, { "epoch": 8.8312693498452, "grad_norm": 45.6381950378418, "learning_rate": 2.34984520123839e-06, "loss": 1.1461, "step": 11410 }, { "epoch": 8.839009287925697, "grad_norm": 0.01202535629272461, "learning_rate": 2.3343653250773997e-06, "loss": 0.0522, "step": 11420 }, { "epoch": 8.846749226006192, "grad_norm": 0.02425566129386425, "learning_rate": 2.318885448916409e-06, "loss": 0.5296, "step": 11430 }, { "epoch": 8.854489164086687, "grad_norm": 0.020927056670188904, 
"learning_rate": 2.303405572755418e-06, "loss": 0.8775, "step": 11440 }, { "epoch": 8.862229102167182, "grad_norm": 42.657596588134766, "learning_rate": 2.2879256965944276e-06, "loss": 0.0909, "step": 11450 }, { "epoch": 8.869969040247678, "grad_norm": 0.03544936329126358, "learning_rate": 2.2724458204334367e-06, "loss": 1.6549, "step": 11460 }, { "epoch": 8.877708978328174, "grad_norm": 19.22626495361328, "learning_rate": 2.256965944272446e-06, "loss": 0.436, "step": 11470 }, { "epoch": 8.885448916408668, "grad_norm": 44.65915298461914, "learning_rate": 2.2414860681114555e-06, "loss": 0.4315, "step": 11480 }, { "epoch": 8.893188854489164, "grad_norm": 0.2381812483072281, "learning_rate": 2.2260061919504646e-06, "loss": 0.7954, "step": 11490 }, { "epoch": 8.90092879256966, "grad_norm": 30.512800216674805, "learning_rate": 2.2105263157894738e-06, "loss": 0.4334, "step": 11500 }, { "epoch": 8.908668730650154, "grad_norm": 0.020642533898353577, "learning_rate": 2.1950464396284834e-06, "loss": 0.2711, "step": 11510 }, { "epoch": 8.91640866873065, "grad_norm": 0.077863909304142, "learning_rate": 2.1795665634674925e-06, "loss": 0.0622, "step": 11520 }, { "epoch": 8.924148606811146, "grad_norm": 0.06492021679878235, "learning_rate": 2.1640866873065017e-06, "loss": 1.1591, "step": 11530 }, { "epoch": 8.93188854489164, "grad_norm": 0.013351391069591045, "learning_rate": 2.1486068111455113e-06, "loss": 0.6511, "step": 11540 }, { "epoch": 8.939628482972136, "grad_norm": 23.39949607849121, "learning_rate": 2.13312693498452e-06, "loss": 0.8537, "step": 11550 }, { "epoch": 8.947368421052632, "grad_norm": 0.09169264137744904, "learning_rate": 2.1176470588235296e-06, "loss": 0.6039, "step": 11560 }, { "epoch": 8.955108359133128, "grad_norm": 0.20286595821380615, "learning_rate": 2.102167182662539e-06, "loss": 0.4882, "step": 11570 }, { "epoch": 8.962848297213622, "grad_norm": 13.02745246887207, "learning_rate": 2.086687306501548e-06, "loss": 0.5669, "step": 11580 }, { "epoch": 
8.970588235294118, "grad_norm": 0.0411616675555706, "learning_rate": 2.0712074303405575e-06, "loss": 0.0175, "step": 11590 }, { "epoch": 8.978328173374614, "grad_norm": 0.055178187787532806, "learning_rate": 2.0557275541795666e-06, "loss": 0.5357, "step": 11600 }, { "epoch": 8.986068111455108, "grad_norm": 0.03217758610844612, "learning_rate": 2.0402476780185758e-06, "loss": 0.0712, "step": 11610 }, { "epoch": 8.993808049535604, "grad_norm": 0.03935324773192406, "learning_rate": 2.0247678018575853e-06, "loss": 0.5555, "step": 11620 }, { "epoch": 9.0, "eval_accuracy": 0.84, "eval_f1": 0.8386100386100387, "eval_loss": 0.756130576133728, "eval_runtime": 1.2903, "eval_samples_per_second": 38.75, "eval_steps_per_second": 38.75, "step": 11628 }, { "epoch": 9.0015479876161, "grad_norm": 0.025945346802473068, "learning_rate": 2.0092879256965945e-06, "loss": 0.0426, "step": 11630 }, { "epoch": 9.009287925696594, "grad_norm": 0.43173420429229736, "learning_rate": 1.9938080495356037e-06, "loss": 0.033, "step": 11640 }, { "epoch": 9.01702786377709, "grad_norm": 0.06241856887936592, "learning_rate": 1.9783281733746132e-06, "loss": 0.0069, "step": 11650 }, { "epoch": 9.024767801857585, "grad_norm": 0.04509132355451584, "learning_rate": 1.9628482972136224e-06, "loss": 0.9242, "step": 11660 }, { "epoch": 9.032507739938081, "grad_norm": 0.04985249787569046, "learning_rate": 1.9473684210526315e-06, "loss": 0.0205, "step": 11670 }, { "epoch": 9.040247678018575, "grad_norm": 0.030469361692667007, "learning_rate": 1.931888544891641e-06, "loss": 0.5548, "step": 11680 }, { "epoch": 9.047987616099071, "grad_norm": 0.029195072129368782, "learning_rate": 1.9164086687306503e-06, "loss": 0.5619, "step": 11690 }, { "epoch": 9.055727554179567, "grad_norm": 0.035856060683727264, "learning_rate": 1.9009287925696596e-06, "loss": 0.2853, "step": 11700 }, { "epoch": 9.063467492260061, "grad_norm": 81.39704895019531, "learning_rate": 1.885448916408669e-06, "loss": 0.413, "step": 11710 }, { "epoch": 
9.071207430340557, "grad_norm": 0.047187354415655136, "learning_rate": 1.869969040247678e-06, "loss": 0.0059, "step": 11720 }, { "epoch": 9.078947368421053, "grad_norm": 0.042003095149993896, "learning_rate": 1.8544891640866875e-06, "loss": 0.2853, "step": 11730 }, { "epoch": 9.086687306501547, "grad_norm": 0.5084086656570435, "learning_rate": 1.839009287925697e-06, "loss": 1.1509, "step": 11740 }, { "epoch": 9.094427244582043, "grad_norm": 70.09038543701172, "learning_rate": 1.8235294117647058e-06, "loss": 0.4451, "step": 11750 }, { "epoch": 9.102167182662539, "grad_norm": 0.02903985232114792, "learning_rate": 1.8080495356037152e-06, "loss": 0.2631, "step": 11760 }, { "epoch": 9.109907120743035, "grad_norm": 0.29316192865371704, "learning_rate": 1.7925696594427248e-06, "loss": 0.3378, "step": 11770 }, { "epoch": 9.117647058823529, "grad_norm": 0.04585903510451317, "learning_rate": 1.7770897832817337e-06, "loss": 0.344, "step": 11780 }, { "epoch": 9.125386996904025, "grad_norm": 26.058895111083984, "learning_rate": 1.7616099071207431e-06, "loss": 0.3641, "step": 11790 }, { "epoch": 9.13312693498452, "grad_norm": 0.7439231872558594, "learning_rate": 1.7461300309597527e-06, "loss": 0.0644, "step": 11800 }, { "epoch": 9.140866873065015, "grad_norm": 0.013866459019482136, "learning_rate": 1.7306501547987616e-06, "loss": 0.2861, "step": 11810 }, { "epoch": 9.14860681114551, "grad_norm": 0.044012364000082016, "learning_rate": 1.715170278637771e-06, "loss": 0.0069, "step": 11820 }, { "epoch": 9.156346749226007, "grad_norm": 0.012545653618872166, "learning_rate": 1.6996904024767802e-06, "loss": 0.1516, "step": 11830 }, { "epoch": 9.1640866873065, "grad_norm": 0.0749073177576065, "learning_rate": 1.6842105263157895e-06, "loss": 0.0445, "step": 11840 }, { "epoch": 9.171826625386997, "grad_norm": 0.15658582746982574, "learning_rate": 1.6687306501547989e-06, "loss": 1.1271, "step": 11850 }, { "epoch": 9.179566563467493, "grad_norm": 0.24040530622005463, "learning_rate": 
1.653250773993808e-06, "loss": 0.0626, "step": 11860 }, { "epoch": 9.187306501547988, "grad_norm": 0.3965591490268707, "learning_rate": 1.6377708978328174e-06, "loss": 0.2499, "step": 11870 }, { "epoch": 9.195046439628483, "grad_norm": 0.038677919656038284, "learning_rate": 1.6222910216718268e-06, "loss": 0.3589, "step": 11880 }, { "epoch": 9.202786377708978, "grad_norm": 0.9232624173164368, "learning_rate": 1.606811145510836e-06, "loss": 1.1871, "step": 11890 }, { "epoch": 9.210526315789474, "grad_norm": 0.0341479629278183, "learning_rate": 1.5913312693498453e-06, "loss": 0.4666, "step": 11900 }, { "epoch": 9.218266253869968, "grad_norm": 0.07696428149938583, "learning_rate": 1.5758513931888547e-06, "loss": 0.0627, "step": 11910 }, { "epoch": 9.226006191950464, "grad_norm": 4.171335697174072, "learning_rate": 1.5603715170278638e-06, "loss": 0.5905, "step": 11920 }, { "epoch": 9.23374613003096, "grad_norm": 51.90923309326172, "learning_rate": 1.5448916408668732e-06, "loss": 0.8366, "step": 11930 }, { "epoch": 9.241486068111454, "grad_norm": 0.07199366390705109, "learning_rate": 1.5294117647058826e-06, "loss": 0.8072, "step": 11940 }, { "epoch": 9.24922600619195, "grad_norm": 0.023102892562747, "learning_rate": 1.5139318885448917e-06, "loss": 0.0091, "step": 11950 }, { "epoch": 9.256965944272446, "grad_norm": 0.05089113861322403, "learning_rate": 1.498452012383901e-06, "loss": 0.6449, "step": 11960 }, { "epoch": 9.264705882352942, "grad_norm": 0.045417021960020065, "learning_rate": 1.4829721362229104e-06, "loss": 0.8783, "step": 11970 }, { "epoch": 9.272445820433436, "grad_norm": 31.23624038696289, "learning_rate": 1.4674922600619196e-06, "loss": 0.6166, "step": 11980 }, { "epoch": 9.280185758513932, "grad_norm": 0.05905742198228836, "learning_rate": 1.452012383900929e-06, "loss": 0.7633, "step": 11990 }, { "epoch": 9.287925696594428, "grad_norm": 0.07599391788244247, "learning_rate": 1.4365325077399381e-06, "loss": 0.0101, "step": 12000 }, { "epoch": 
9.295665634674922, "grad_norm": 0.039988983422517776, "learning_rate": 1.4210526315789475e-06, "loss": 0.0097, "step": 12010 }, { "epoch": 9.303405572755418, "grad_norm": 0.02742581255733967, "learning_rate": 1.4055727554179569e-06, "loss": 1.0874, "step": 12020 }, { "epoch": 9.311145510835914, "grad_norm": 0.04828483983874321, "learning_rate": 1.390092879256966e-06, "loss": 0.5394, "step": 12030 }, { "epoch": 9.318885448916408, "grad_norm": 0.10451111942529678, "learning_rate": 1.3746130030959754e-06, "loss": 0.5437, "step": 12040 }, { "epoch": 9.326625386996904, "grad_norm": 0.11850859969854355, "learning_rate": 1.3591331269349848e-06, "loss": 0.0463, "step": 12050 }, { "epoch": 9.3343653250774, "grad_norm": 0.03070574626326561, "learning_rate": 1.343653250773994e-06, "loss": 1.1991, "step": 12060 }, { "epoch": 9.342105263157896, "grad_norm": 0.027258578687906265, "learning_rate": 1.3281733746130033e-06, "loss": 0.1991, "step": 12070 }, { "epoch": 9.34984520123839, "grad_norm": 0.02842220850288868, "learning_rate": 1.3126934984520126e-06, "loss": 0.012, "step": 12080 }, { "epoch": 9.357585139318886, "grad_norm": 0.021733157336711884, "learning_rate": 1.2972136222910218e-06, "loss": 0.0065, "step": 12090 }, { "epoch": 9.365325077399381, "grad_norm": 0.027990013360977173, "learning_rate": 1.2817337461300312e-06, "loss": 0.1906, "step": 12100 }, { "epoch": 9.373065015479876, "grad_norm": 0.044587425887584686, "learning_rate": 1.2662538699690405e-06, "loss": 0.6339, "step": 12110 }, { "epoch": 9.380804953560371, "grad_norm": 1.8298673629760742, "learning_rate": 1.2507739938080497e-06, "loss": 0.4403, "step": 12120 }, { "epoch": 9.388544891640867, "grad_norm": 0.023872006684541702, "learning_rate": 1.235294117647059e-06, "loss": 0.0164, "step": 12130 }, { "epoch": 9.396284829721361, "grad_norm": 51.92340850830078, "learning_rate": 1.2198142414860682e-06, "loss": 0.8936, "step": 12140 }, { "epoch": 9.404024767801857, "grad_norm": 0.14623349905014038, "learning_rate": 
1.2043343653250774e-06, "loss": 0.6297, "step": 12150 }, { "epoch": 9.411764705882353, "grad_norm": 7.620347499847412, "learning_rate": 1.1888544891640867e-06, "loss": 0.0642, "step": 12160 }, { "epoch": 9.41950464396285, "grad_norm": 47.53596115112305, "learning_rate": 1.173374613003096e-06, "loss": 0.8382, "step": 12170 }, { "epoch": 9.427244582043343, "grad_norm": 0.06319626420736313, "learning_rate": 1.1578947368421053e-06, "loss": 0.4265, "step": 12180 }, { "epoch": 9.43498452012384, "grad_norm": 0.054025761783123016, "learning_rate": 1.1424148606811146e-06, "loss": 0.5139, "step": 12190 }, { "epoch": 9.442724458204335, "grad_norm": 0.8037497401237488, "learning_rate": 1.126934984520124e-06, "loss": 1.1481, "step": 12200 }, { "epoch": 9.45046439628483, "grad_norm": 0.009485302492976189, "learning_rate": 1.1114551083591331e-06, "loss": 0.3358, "step": 12210 }, { "epoch": 9.458204334365325, "grad_norm": 0.05529876425862312, "learning_rate": 1.0959752321981425e-06, "loss": 0.2154, "step": 12220 }, { "epoch": 9.465944272445821, "grad_norm": 0.02836499735713005, "learning_rate": 1.0804953560371519e-06, "loss": 0.0115, "step": 12230 }, { "epoch": 9.473684210526315, "grad_norm": 2.9413037300109863, "learning_rate": 1.065015479876161e-06, "loss": 0.523, "step": 12240 }, { "epoch": 9.481424148606811, "grad_norm": 52.54439163208008, "learning_rate": 1.0495356037151704e-06, "loss": 0.7893, "step": 12250 }, { "epoch": 9.489164086687307, "grad_norm": 3.6481902599334717, "learning_rate": 1.0340557275541796e-06, "loss": 0.0128, "step": 12260 }, { "epoch": 9.496904024767803, "grad_norm": 0.7434276342391968, "learning_rate": 1.018575851393189e-06, "loss": 0.1457, "step": 12270 }, { "epoch": 9.504643962848297, "grad_norm": 0.035547249019145966, "learning_rate": 1.0030959752321983e-06, "loss": 0.0069, "step": 12280 }, { "epoch": 9.512383900928793, "grad_norm": 0.05527057498693466, "learning_rate": 9.876160990712074e-07, "loss": 0.0074, "step": 12290 }, { "epoch": 
9.520123839009289, "grad_norm": 0.03656148910522461, "learning_rate": 9.721362229102168e-07, "loss": 0.5552, "step": 12300 }, { "epoch": 9.527863777089783, "grad_norm": 0.018427018076181412, "learning_rate": 9.566563467492262e-07, "loss": 0.6165, "step": 12310 }, { "epoch": 9.535603715170279, "grad_norm": 26.996368408203125, "learning_rate": 9.411764705882353e-07, "loss": 0.1039, "step": 12320 }, { "epoch": 9.543343653250774, "grad_norm": 27.452367782592773, "learning_rate": 9.256965944272446e-07, "loss": 1.0633, "step": 12330 }, { "epoch": 9.551083591331269, "grad_norm": 0.04964013025164604, "learning_rate": 9.10216718266254e-07, "loss": 0.3725, "step": 12340 }, { "epoch": 9.558823529411764, "grad_norm": 0.025155318900942802, "learning_rate": 8.947368421052632e-07, "loss": 1.4094, "step": 12350 }, { "epoch": 9.56656346749226, "grad_norm": 1.704766869544983, "learning_rate": 8.792569659442725e-07, "loss": 1.2946, "step": 12360 }, { "epoch": 9.574303405572756, "grad_norm": 9.638593673706055, "learning_rate": 8.637770897832819e-07, "loss": 0.4001, "step": 12370 }, { "epoch": 9.58204334365325, "grad_norm": 1.364628791809082, "learning_rate": 8.482972136222911e-07, "loss": 0.0174, "step": 12380 }, { "epoch": 9.589783281733746, "grad_norm": 1.519652009010315, "learning_rate": 8.328173374613004e-07, "loss": 0.6613, "step": 12390 }, { "epoch": 9.597523219814242, "grad_norm": 0.07219968736171722, "learning_rate": 8.173374613003096e-07, "loss": 0.6489, "step": 12400 }, { "epoch": 9.605263157894736, "grad_norm": 0.01786046102643013, "learning_rate": 8.01857585139319e-07, "loss": 0.0173, "step": 12410 }, { "epoch": 9.613003095975232, "grad_norm": 0.04482196643948555, "learning_rate": 7.863777089783283e-07, "loss": 2.1072, "step": 12420 }, { "epoch": 9.620743034055728, "grad_norm": 0.03466297686100006, "learning_rate": 7.708978328173375e-07, "loss": 0.6166, "step": 12430 }, { "epoch": 9.628482972136222, "grad_norm": 0.15375687181949615, "learning_rate": 7.554179566563469e-07, 
"loss": 0.5323, "step": 12440 }, { "epoch": 9.636222910216718, "grad_norm": 35.36479568481445, "learning_rate": 7.399380804953562e-07, "loss": 0.0635, "step": 12450 }, { "epoch": 9.643962848297214, "grad_norm": 0.023269806057214737, "learning_rate": 7.244582043343653e-07, "loss": 0.0076, "step": 12460 }, { "epoch": 9.651702786377708, "grad_norm": 0.032083477824926376, "learning_rate": 7.089783281733746e-07, "loss": 0.8724, "step": 12470 }, { "epoch": 9.659442724458204, "grad_norm": 0.023244427517056465, "learning_rate": 6.93498452012384e-07, "loss": 0.0538, "step": 12480 }, { "epoch": 9.6671826625387, "grad_norm": 44.159793853759766, "learning_rate": 6.780185758513932e-07, "loss": 0.5481, "step": 12490 }, { "epoch": 9.674922600619196, "grad_norm": 29.829814910888672, "learning_rate": 6.625386996904025e-07, "loss": 0.8991, "step": 12500 }, { "epoch": 9.68266253869969, "grad_norm": 0.04366055876016617, "learning_rate": 6.470588235294118e-07, "loss": 0.017, "step": 12510 }, { "epoch": 9.690402476780186, "grad_norm": 1.2012635469436646, "learning_rate": 6.315789473684211e-07, "loss": 0.6572, "step": 12520 }, { "epoch": 9.698142414860682, "grad_norm": 0.051347266882658005, "learning_rate": 6.160990712074304e-07, "loss": 0.0076, "step": 12530 }, { "epoch": 9.705882352941176, "grad_norm": 0.013334005139768124, "learning_rate": 6.006191950464397e-07, "loss": 0.1591, "step": 12540 }, { "epoch": 9.713622291021672, "grad_norm": 20.57217788696289, "learning_rate": 5.85139318885449e-07, "loss": 0.9953, "step": 12550 }, { "epoch": 9.721362229102168, "grad_norm": 1.3979016542434692, "learning_rate": 5.696594427244582e-07, "loss": 1.5419, "step": 12560 }, { "epoch": 9.729102167182663, "grad_norm": 0.5834305286407471, "learning_rate": 5.541795665634676e-07, "loss": 0.5408, "step": 12570 }, { "epoch": 9.736842105263158, "grad_norm": 0.23698380589485168, "learning_rate": 5.386996904024768e-07, "loss": 0.2818, "step": 12580 }, { "epoch": 9.744582043343653, "grad_norm": 
0.07556024938821793, "learning_rate": 5.232198142414861e-07, "loss": 0.0877, "step": 12590 }, { "epoch": 9.75232198142415, "grad_norm": 0.019476426765322685, "learning_rate": 5.077399380804954e-07, "loss": 0.0081, "step": 12600 }, { "epoch": 9.760061919504643, "grad_norm": 0.15590164065361023, "learning_rate": 4.922600619195047e-07, "loss": 0.0066, "step": 12610 }, { "epoch": 9.76780185758514, "grad_norm": 0.15517547726631165, "learning_rate": 4.767801857585139e-07, "loss": 0.5927, "step": 12620 }, { "epoch": 9.775541795665635, "grad_norm": 65.19478607177734, "learning_rate": 4.6130030959752324e-07, "loss": 0.7544, "step": 12630 }, { "epoch": 9.78328173374613, "grad_norm": 0.25748342275619507, "learning_rate": 4.4582043343653255e-07, "loss": 0.6974, "step": 12640 }, { "epoch": 9.791021671826625, "grad_norm": 65.89250946044922, "learning_rate": 4.303405572755418e-07, "loss": 0.363, "step": 12650 }, { "epoch": 9.798761609907121, "grad_norm": 10.20340633392334, "learning_rate": 4.1486068111455113e-07, "loss": 0.7234, "step": 12660 }, { "epoch": 9.806501547987615, "grad_norm": 0.029881009832024574, "learning_rate": 3.993808049535604e-07, "loss": 0.5241, "step": 12670 }, { "epoch": 9.814241486068111, "grad_norm": 0.3110284209251404, "learning_rate": 3.839009287925697e-07, "loss": 1.2772, "step": 12680 }, { "epoch": 9.821981424148607, "grad_norm": 84.15982055664062, "learning_rate": 3.6842105263157896e-07, "loss": 0.9397, "step": 12690 }, { "epoch": 9.829721362229103, "grad_norm": 1.0606378316879272, "learning_rate": 3.529411764705883e-07, "loss": 0.4866, "step": 12700 }, { "epoch": 9.837461300309597, "grad_norm": 0.2711246609687805, "learning_rate": 3.374613003095976e-07, "loss": 0.7253, "step": 12710 }, { "epoch": 9.845201238390093, "grad_norm": 0.4452156722545624, "learning_rate": 3.219814241486068e-07, "loss": 0.0255, "step": 12720 }, { "epoch": 9.852941176470589, "grad_norm": 0.03974607586860657, "learning_rate": 3.065015479876161e-07, "loss": 0.0069, "step": 12730 
}, { "epoch": 9.860681114551083, "grad_norm": 0.03675782307982445, "learning_rate": 2.9102167182662543e-07, "loss": 0.2249, "step": 12740 }, { "epoch": 9.868421052631579, "grad_norm": 11.034073829650879, "learning_rate": 2.755417956656347e-07, "loss": 1.4651, "step": 12750 }, { "epoch": 9.876160990712075, "grad_norm": 0.10024832934141159, "learning_rate": 2.60061919504644e-07, "loss": 0.4338, "step": 12760 }, { "epoch": 9.88390092879257, "grad_norm": 0.8758727312088013, "learning_rate": 2.4458204334365327e-07, "loss": 0.843, "step": 12770 }, { "epoch": 9.891640866873065, "grad_norm": 35.65162658691406, "learning_rate": 2.2910216718266256e-07, "loss": 1.065, "step": 12780 }, { "epoch": 9.89938080495356, "grad_norm": 2.0535528659820557, "learning_rate": 2.1362229102167187e-07, "loss": 0.9725, "step": 12790 }, { "epoch": 9.907120743034056, "grad_norm": 0.08367957919836044, "learning_rate": 1.9814241486068113e-07, "loss": 0.2195, "step": 12800 }, { "epoch": 9.91486068111455, "grad_norm": 0.023978492245078087, "learning_rate": 1.8266253869969042e-07, "loss": 0.599, "step": 12810 }, { "epoch": 9.922600619195046, "grad_norm": 85.4802474975586, "learning_rate": 1.671826625386997e-07, "loss": 0.6971, "step": 12820 }, { "epoch": 9.930340557275542, "grad_norm": 0.027309222146868706, "learning_rate": 1.51702786377709e-07, "loss": 0.0862, "step": 12830 }, { "epoch": 9.938080495356036, "grad_norm": 0.04960598424077034, "learning_rate": 1.3622291021671828e-07, "loss": 0.0113, "step": 12840 }, { "epoch": 9.945820433436532, "grad_norm": 0.14275790750980377, "learning_rate": 1.2074303405572757e-07, "loss": 0.0565, "step": 12850 }, { "epoch": 9.953560371517028, "grad_norm": 60.284080505371094, "learning_rate": 1.0526315789473685e-07, "loss": 0.0905, "step": 12860 }, { "epoch": 9.961300309597522, "grad_norm": 0.06609198451042175, "learning_rate": 8.978328173374613e-08, "loss": 0.4196, "step": 12870 }, { "epoch": 9.969040247678018, "grad_norm": 105.55493927001953, "learning_rate": 
7.430340557275542e-08, "loss": 0.415, "step": 12880 }, { "epoch": 9.976780185758514, "grad_norm": 1.370802640914917, "learning_rate": 5.882352941176471e-08, "loss": 0.3144, "step": 12890 }, { "epoch": 9.98452012383901, "grad_norm": 3.633511543273926, "learning_rate": 4.3343653250774e-08, "loss": 0.4713, "step": 12900 }, { "epoch": 9.992260061919504, "grad_norm": 0.02021286077797413, "learning_rate": 2.7863777089783284e-08, "loss": 0.0948, "step": 12910 }, { "epoch": 10.0, "grad_norm": 0.04593948274850845, "learning_rate": 1.238390092879257e-08, "loss": 0.1079, "step": 12920 } ], "logging_steps": 10, "max_steps": 12920, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3434632306606080.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }