diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2844 +1,12761 @@ { - "best_metric": 0.8456375838926175, - "best_model_checkpoint": "ctsinov1/checkpoint-3148", - "epoch": 4.2, + "best_global_step": 9800, + "best_metric": 0.8585858585858586, + "best_model_checkpoint": "ctsinov1/checkpoint-9800", + "epoch": 49.02, "eval_steps": 500, - "global_step": 3935, + "global_step": 17500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0, - "grad_norm": 6.577972888946533, - "learning_rate": 2.538071065989848e-07, - "loss": 0.7076, + "epoch": 0.0005714285714285715, + "grad_norm": 6.513674259185791, + "learning_rate": 5.142857142857143e-08, + "loss": 0.696, "step": 10 }, { - "epoch": 0.01, - "grad_norm": 6.098381519317627, - "learning_rate": 5.076142131979696e-07, - "loss": 0.7072, + "epoch": 0.001142857142857143, + "grad_norm": 2.7451493740081787, + "learning_rate": 1.0857142857142857e-07, + "loss": 0.6997, "step": 20 }, { - "epoch": 0.01, - "grad_norm": 6.236210346221924, - "learning_rate": 7.614213197969544e-07, - "loss": 0.7278, + "epoch": 0.0017142857142857142, + "grad_norm": 2.719616174697876, + "learning_rate": 1.657142857142857e-07, + "loss": 0.6917, "step": 30 }, { - "epoch": 0.01, - "grad_norm": 7.610434532165527, - "learning_rate": 1.0152284263959392e-06, - "loss": 0.6914, + "epoch": 0.002285714285714286, + "grad_norm": 2.852513074874878, + "learning_rate": 2.228571428571429e-07, + "loss": 0.6825, "step": 40 }, { - "epoch": 0.01, - "grad_norm": 5.927430152893066, - "learning_rate": 1.2690355329949238e-06, - "loss": 0.7105, + "epoch": 0.002857142857142857, + "grad_norm": 6.225112438201904, + "learning_rate": 2.8e-07, + "loss": 0.685, "step": 50 }, { - "epoch": 0.02, - "grad_norm": 22.82501220703125, - "learning_rate": 1.5228426395939088e-06, - "loss": 0.694, + "epoch": 0.0034285714285714284, + "grad_norm": 2.038743495941162, + "learning_rate": 3.371428571428572e-07, + "loss": 0.691, "step": 60 }, { - "epoch": 0.02, - "grad_norm": 24.948528289794922, - "learning_rate": 1.7766497461928936e-06, - "loss": 0.7014, + "epoch": 0.004, + "grad_norm": 2.501828908920288, + "learning_rate": 3.9428571428571436e-07, + "loss": 0.701, "step": 70 }, { - "epoch": 0.02, - "grad_norm": 12.509531021118164, - "learning_rate": 2.0304568527918785e-06, - "loss": 0.6915, + "epoch": 0.004571428571428572, + "grad_norm": 5.308904647827148, + "learning_rate": 4.514285714285715e-07, + "loss": 0.6841, "step": 80 }, { - "epoch": 0.02, - "grad_norm": 10.74423885345459, - "learning_rate": 2.284263959390863e-06, - "loss": 0.6788, + "epoch": 0.005142857142857143, + "grad_norm": 2.171583890914917, + "learning_rate": 5.085714285714286e-07, + "loss": 0.6845, "step": 90 }, { - "epoch": 0.03, - "grad_norm": 16.888263702392578, - "learning_rate": 2.5380710659898476e-06, - "loss": 0.6601, + "epoch": 0.005714285714285714, + "grad_norm": 1.9283939599990845, + "learning_rate": 5.657142857142857e-07, + "loss": 0.6901, "step": 100 }, { - "epoch": 0.03, - "grad_norm": 10.467493057250977, - "learning_rate": 2.7918781725888327e-06, - "loss": 0.6162, + "epoch": 0.006285714285714286, + "grad_norm": 5.542513370513916, + "learning_rate": 6.228571428571429e-07, + "loss": 0.6768, "step": 110 }, { - "epoch": 0.03, - "grad_norm": 13.966683387756348, - "learning_rate": 3.0456852791878177e-06, - "loss": 0.5912, + "epoch": 0.006857142857142857, + "grad_norm": 6.506889820098877, + "learning_rate": 6.800000000000001e-07, + "loss": 0.666, "step": 120 }, { - "epoch": 0.03, - "grad_norm": 5.591104030609131, - "learning_rate": 3.2994923857868023e-06, - "loss": 0.4726, + "epoch": 0.0074285714285714285, + "grad_norm": 3.021516799926758, + "learning_rate": 7.371428571428572e-07, + "loss": 0.6799, "step": 130 }, { - "epoch": 0.04, - "grad_norm": 8.37885856628418, - "learning_rate": 3.5532994923857873e-06, - "loss": 1.0922, + "epoch": 0.008, + "grad_norm": 3.173678398132324, + "learning_rate": 7.942857142857144e-07, + "loss": 0.6749, "step": 140 }, { - "epoch": 0.04, - "grad_norm": 24.369611740112305, - "learning_rate": 3.8071065989847715e-06, - "loss": 1.0916, + "epoch": 0.008571428571428572, + "grad_norm": 6.398750305175781, + "learning_rate": 8.514285714285716e-07, + "loss": 0.7071, "step": 150 }, { - "epoch": 0.04, - "grad_norm": 12.621369361877441, - "learning_rate": 4.060913705583757e-06, - "loss": 0.6395, + "epoch": 0.009142857142857144, + "grad_norm": 4.193422317504883, + "learning_rate": 9.085714285714286e-07, + "loss": 0.7018, "step": 160 }, { - "epoch": 0.04, - "grad_norm": 7.74204683303833, - "learning_rate": 4.3147208121827415e-06, - "loss": 0.6146, + "epoch": 0.009714285714285713, + "grad_norm": 6.120445728302002, + "learning_rate": 9.657142857142857e-07, + "loss": 0.6685, "step": 170 }, { - "epoch": 0.05, - "grad_norm": 29.357898712158203, - "learning_rate": 4.568527918781726e-06, - "loss": 0.6861, + "epoch": 0.010285714285714285, + "grad_norm": 9.130088806152344, + "learning_rate": 1.0228571428571429e-06, + "loss": 0.6791, "step": 180 }, { - "epoch": 0.05, - "grad_norm": 14.954898834228516, - "learning_rate": 4.822335025380711e-06, - "loss": 0.7809, + "epoch": 0.010857142857142857, + "grad_norm": 4.139962673187256, + "learning_rate": 1.08e-06, + "loss": 0.6318, "step": 190 }, { - "epoch": 0.05, - "grad_norm": 12.208268165588379, - "learning_rate": 5.076142131979695e-06, - "loss": 0.5812, + "epoch": 0.011428571428571429, + "grad_norm": 10.299067497253418, + "learning_rate": 1.1371428571428572e-06, + "loss": 0.6648, "step": 200 }, { - "epoch": 0.05, - "grad_norm": 75.62841033935547, - "learning_rate": 5.329949238578681e-06, - "loss": 0.7408, + "epoch": 0.012, + "grad_norm": 8.751834869384766, + "learning_rate": 1.1942857142857144e-06, + "loss": 0.7039, "step": 210 }, { - "epoch": 0.06, - "grad_norm": 17.70330047607422, - "learning_rate": 5.583756345177665e-06, - "loss": 0.8077, + "epoch": 0.012571428571428572, + "grad_norm": 6.685519695281982, + "learning_rate": 1.2514285714285715e-06, + "loss": 0.6488, "step": 220 }, { - "epoch": 0.06, - "grad_norm": 26.325668334960938, - "learning_rate": 5.83756345177665e-06, - "loss": 0.6845, + "epoch": 0.013142857142857144, + "grad_norm": 12.093446731567383, + "learning_rate": 1.3085714285714287e-06, + "loss": 0.6134, "step": 230 }, { - "epoch": 0.06, - "grad_norm": 30.602115631103516, - "learning_rate": 6.091370558375635e-06, - "loss": 0.7371, + "epoch": 0.013714285714285714, + "grad_norm": 10.574247360229492, + "learning_rate": 1.3657142857142857e-06, + "loss": 0.6786, "step": 240 }, { - "epoch": 0.06, - "grad_norm": 17.599937438964844, - "learning_rate": 6.34517766497462e-06, - "loss": 0.6351, + "epoch": 0.014285714285714285, + "grad_norm": 12.45152473449707, + "learning_rate": 1.422857142857143e-06, + "loss": 0.6112, "step": 250 }, { - "epoch": 0.07, - "grad_norm": 27.942121505737305, - "learning_rate": 6.5989847715736045e-06, - "loss": 0.6677, + "epoch": 0.014857142857142857, + "grad_norm": 20.439712524414062, + "learning_rate": 1.48e-06, + "loss": 0.6244, "step": 260 }, { - "epoch": 0.07, - "grad_norm": 22.718555450439453, - "learning_rate": 6.852791878172589e-06, - "loss": 0.6294, + "epoch": 0.015428571428571429, + "grad_norm": 14.572081565856934, + "learning_rate": 1.5371428571428574e-06, + "loss": 0.6225, "step": 270 }, { - "epoch": 0.07, - "grad_norm": 15.279131889343262, - "learning_rate": 7.106598984771575e-06, - "loss": 0.7051, + "epoch": 0.016, + "grad_norm": 12.138855934143066, + "learning_rate": 1.5942857142857144e-06, + "loss": 0.6541, "step": 280 }, { - "epoch": 0.07, - "grad_norm": 2.718256950378418, - "learning_rate": 7.360406091370559e-06, - "loss": 0.2624, + "epoch": 0.01657142857142857, + "grad_norm": 10.430656433105469, + "learning_rate": 1.6514285714285715e-06, + "loss": 0.6444, "step": 290 }, { - "epoch": 0.08, - "grad_norm": 7.366211414337158, - "learning_rate": 7.614213197969543e-06, - "loss": 1.0217, + "epoch": 0.017142857142857144, + "grad_norm": 20.285198211669922, + "learning_rate": 1.7085714285714287e-06, + "loss": 0.6046, "step": 300 }, { - "epoch": 0.08, - "grad_norm": 88.110595703125, - "learning_rate": 7.86802030456853e-06, - "loss": 0.97, + "epoch": 0.017714285714285714, + "grad_norm": 14.162407875061035, + "learning_rate": 1.7657142857142859e-06, + "loss": 0.5698, "step": 310 }, { - "epoch": 0.08, - "grad_norm": 8.04466438293457, - "learning_rate": 8.121827411167514e-06, - "loss": 0.7516, + "epoch": 0.018285714285714287, + "grad_norm": 12.920883178710938, + "learning_rate": 1.8228571428571428e-06, + "loss": 0.5905, "step": 320 }, { - "epoch": 0.08, - "grad_norm": 9.804755210876465, - "learning_rate": 8.375634517766498e-06, - "loss": 0.6807, + "epoch": 0.018857142857142857, + "grad_norm": 13.900638580322266, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.5498, "step": 330 }, { - "epoch": 0.09, - "grad_norm": 8.710782051086426, - "learning_rate": 8.629441624365483e-06, - "loss": 0.431, + "epoch": 0.019428571428571427, + "grad_norm": 12.032271385192871, + "learning_rate": 1.9371428571428576e-06, + "loss": 0.5398, "step": 340 }, { - "epoch": 0.09, - "grad_norm": 6.989062786102295, - "learning_rate": 8.883248730964468e-06, - "loss": 0.4437, + "epoch": 0.02, + "grad_norm": 10.851804733276367, + "learning_rate": 1.9942857142857146e-06, + "loss": 0.5787, + "step": 350 + }, + { + "epoch": 0.02, + "eval_accuracy": 0.7104377104377104, + "eval_loss": 0.5787273049354553, + "eval_runtime": 141.3287, + "eval_samples_per_second": 2.101, + "eval_steps_per_second": 1.054, "step": 350 }, { - "epoch": 0.09, - "grad_norm": 0.23465396463871002, - "learning_rate": 9.137055837563452e-06, - "loss": 0.6021, + "epoch": 1.0005714285714287, + "grad_norm": 21.635120391845703, + "learning_rate": 2.0514285714285715e-06, + "loss": 0.5574, "step": 360 }, { - "epoch": 0.09, - "grad_norm": 4.12635612487793, - "learning_rate": 9.390862944162438e-06, - "loss": 1.1584, + "epoch": 1.0011428571428571, + "grad_norm": 8.378313064575195, + "learning_rate": 2.108571428571429e-06, + "loss": 0.6273, "step": 370 }, { - "epoch": 0.1, - "grad_norm": 67.80364990234375, - "learning_rate": 9.644670050761421e-06, - "loss": 1.3129, + "epoch": 1.0017142857142858, + "grad_norm": 25.1311092376709, + "learning_rate": 2.165714285714286e-06, + "loss": 0.4157, "step": 380 }, { - "epoch": 0.1, - "grad_norm": 1.1369630098342896, - "learning_rate": 9.898477157360406e-06, - "loss": 0.4667, + "epoch": 1.0022857142857142, + "grad_norm": 68.14948272705078, + "learning_rate": 2.222857142857143e-06, + "loss": 0.6408, "step": 390 }, { - "epoch": 0.1, - "grad_norm": 0.777511477470398, - "learning_rate": 9.983055634001694e-06, - "loss": 1.0065, + "epoch": 1.002857142857143, + "grad_norm": 7.816642761230469, + "learning_rate": 2.28e-06, + "loss": 0.5284, "step": 400 }, { - "epoch": 0.1, - "grad_norm": 5.83087158203125, - "learning_rate": 9.95481502400452e-06, - "loss": 1.6977, + "epoch": 1.0034285714285713, + "grad_norm": 23.304162979125977, + "learning_rate": 2.337142857142857e-06, + "loss": 0.6163, "step": 410 }, { - "epoch": 0.11, - "grad_norm": 0.4953118562698364, - "learning_rate": 9.926574414007344e-06, - "loss": 0.7726, + "epoch": 1.004, + "grad_norm": 48.6783332824707, + "learning_rate": 2.3942857142857145e-06, + "loss": 0.5474, "step": 420 }, { - "epoch": 0.11, - "grad_norm": 15.068619728088379, - "learning_rate": 9.898333804010167e-06, - "loss": 0.6931, + "epoch": 1.0045714285714287, + "grad_norm": 17.72844696044922, + "learning_rate": 2.4514285714285715e-06, + "loss": 0.5792, "step": 430 }, { - "epoch": 0.11, - "grad_norm": 9.647561073303223, - "learning_rate": 9.870093194012991e-06, - "loss": 0.6504, + "epoch": 1.0051428571428571, + "grad_norm": 8.830244064331055, + "learning_rate": 2.5085714285714285e-06, + "loss": 0.5057, "step": 440 }, { - "epoch": 0.11, - "grad_norm": 5.142507076263428, - "learning_rate": 9.841852584015815e-06, - "loss": 0.4246, + "epoch": 1.0057142857142858, + "grad_norm": 63.260135650634766, + "learning_rate": 2.565714285714286e-06, + "loss": 0.5394, "step": 450 }, { - "epoch": 0.12, - "grad_norm": 44.15083694458008, - "learning_rate": 9.81361197401864e-06, - "loss": 1.5019, + "epoch": 1.0062857142857142, + "grad_norm": 3.5656652450561523, + "learning_rate": 2.6228571428571432e-06, + "loss": 0.4402, "step": 460 }, { - "epoch": 0.12, - "grad_norm": 73.28787231445312, - "learning_rate": 9.785371364021462e-06, - "loss": 0.9851, + "epoch": 1.006857142857143, + "grad_norm": 31.8543758392334, + "learning_rate": 2.68e-06, + "loss": 0.5913, "step": 470 }, { - "epoch": 0.12, - "grad_norm": 5.464163780212402, - "learning_rate": 9.757130754024288e-06, - "loss": 0.8729, + "epoch": 1.0074285714285713, + "grad_norm": 3.7504777908325195, + "learning_rate": 2.737142857142857e-06, + "loss": 0.579, "step": 480 }, { - "epoch": 0.12, - "grad_norm": 0.4673428535461426, - "learning_rate": 9.728890144027113e-06, - "loss": 1.3793, + "epoch": 1.008, + "grad_norm": 13.457418441772461, + "learning_rate": 2.7942857142857145e-06, + "loss": 0.3719, "step": 490 }, { - "epoch": 0.13, - "grad_norm": 68.62584686279297, - "learning_rate": 9.700649534029935e-06, - "loss": 1.7333, + "epoch": 1.0085714285714287, + "grad_norm": 8.717528343200684, + "learning_rate": 2.8514285714285715e-06, + "loss": 0.5286, "step": 500 }, { - "epoch": 0.13, - "grad_norm": 43.09763717651367, - "learning_rate": 9.67240892403276e-06, - "loss": 0.9818, + "epoch": 1.0091428571428571, + "grad_norm": 51.75168991088867, + "learning_rate": 2.908571428571429e-06, + "loss": 0.5389, "step": 510 }, { - "epoch": 0.13, - "grad_norm": 1.3706504106521606, - "learning_rate": 9.644168314035584e-06, - "loss": 0.6076, + "epoch": 1.0097142857142858, + "grad_norm": 4.350738048553467, + "learning_rate": 2.9657142857142862e-06, + "loss": 0.1809, "step": 520 }, { - "epoch": 0.13, - "grad_norm": 0.3608866333961487, - "learning_rate": 9.615927704038408e-06, - "loss": 0.011, + "epoch": 1.0102857142857142, + "grad_norm": 51.596439361572266, + "learning_rate": 3.0228571428571428e-06, + "loss": 0.4441, "step": 530 }, { - "epoch": 0.14, - "grad_norm": 2.5606987476348877, - "learning_rate": 9.587687094041232e-06, - "loss": 1.1129, + "epoch": 1.010857142857143, + "grad_norm": 60.40998840332031, + "learning_rate": 3.08e-06, + "loss": 0.6152, "step": 540 }, { - "epoch": 0.14, - "grad_norm": 0.4630231559276581, - "learning_rate": 9.559446484044057e-06, - "loss": 0.5555, + "epoch": 1.0114285714285713, + "grad_norm": 13.335647583007812, + "learning_rate": 3.1371428571428575e-06, + "loss": 0.3943, "step": 550 }, { - "epoch": 0.14, - "grad_norm": 2.497171401977539, - "learning_rate": 9.531205874046881e-06, - "loss": 0.5851, + "epoch": 1.012, + "grad_norm": 0.6494603157043457, + "learning_rate": 3.194285714285715e-06, + "loss": 0.4774, "step": 560 }, { - "epoch": 0.14, - "grad_norm": 72.00778198242188, - "learning_rate": 9.502965264049704e-06, - "loss": 1.1757, + "epoch": 1.0125714285714287, + "grad_norm": 2.483837127685547, + "learning_rate": 3.2514285714285715e-06, + "loss": 0.5635, "step": 570 }, { - "epoch": 0.15, - "grad_norm": 51.60250473022461, - "learning_rate": 9.474724654052528e-06, - "loss": 0.5766, + "epoch": 1.0131428571428571, + "grad_norm": 79.77306365966797, + "learning_rate": 3.308571428571429e-06, + "loss": 1.0499, "step": 580 }, { - "epoch": 0.15, - "grad_norm": 0.111382856965065, - "learning_rate": 9.446484044055352e-06, - "loss": 0.0039, + "epoch": 1.0137142857142858, + "grad_norm": 1.0996540784835815, + "learning_rate": 3.3657142857142862e-06, + "loss": 0.5509, "step": 590 }, { - "epoch": 0.15, - "grad_norm": 0.24558886885643005, - "learning_rate": 9.418243434058176e-06, - "loss": 0.0022, + "epoch": 1.0142857142857142, + "grad_norm": 48.051490783691406, + "learning_rate": 3.422857142857143e-06, + "loss": 1.0613, "step": 600 }, { - "epoch": 0.16, - "grad_norm": 0.46412792801856995, - "learning_rate": 9.390002824061e-06, - "loss": 2.3804, + "epoch": 1.014857142857143, + "grad_norm": 1.7596064805984497, + "learning_rate": 3.48e-06, + "loss": 0.2783, "step": 610 }, { - "epoch": 0.16, - "grad_norm": 0.6763355731964111, - "learning_rate": 9.361762214063825e-06, - "loss": 1.9181, + "epoch": 1.0154285714285713, + "grad_norm": 0.31064021587371826, + "learning_rate": 3.5371428571428575e-06, + "loss": 0.649, "step": 620 }, { - "epoch": 0.16, - "grad_norm": 0.18327564001083374, - "learning_rate": 9.33352160406665e-06, - "loss": 1.68, + "epoch": 1.016, + "grad_norm": 0.5567959547042847, + "learning_rate": 3.5942857142857145e-06, + "loss": 0.2289, "step": 630 }, { - "epoch": 0.16, - "grad_norm": 0.0858134999871254, - "learning_rate": 9.305280994069472e-06, - "loss": 0.4881, + "epoch": 1.0165714285714285, + "grad_norm": 2.9928736686706543, + "learning_rate": 3.651428571428572e-06, + "loss": 0.2232, "step": 640 }, { - "epoch": 0.17, - "grad_norm": 43.90891647338867, - "learning_rate": 9.277040384072296e-06, - "loss": 0.9335, + "epoch": 1.0171428571428571, + "grad_norm": 56.99592208862305, + "learning_rate": 3.7085714285714284e-06, + "loss": 0.3615, "step": 650 }, { - "epoch": 0.17, - "grad_norm": 0.740903913974762, - "learning_rate": 9.24879977407512e-06, - "loss": 1.8727, + "epoch": 1.0177142857142858, + "grad_norm": 1.2344051599502563, + "learning_rate": 3.7657142857142858e-06, + "loss": 1.8052, "step": 660 }, { - "epoch": 0.17, - "grad_norm": 41.42824172973633, - "learning_rate": 9.220559164077945e-06, - "loss": 1.0477, + "epoch": 1.0182857142857142, + "grad_norm": 44.120845794677734, + "learning_rate": 3.822857142857143e-06, + "loss": 0.7242, "step": 670 }, { - "epoch": 0.17, - "grad_norm": 0.6841148138046265, - "learning_rate": 9.192318554080769e-06, - "loss": 0.0093, + "epoch": 1.018857142857143, + "grad_norm": 1.3891487121582031, + "learning_rate": 3.88e-06, + "loss": 0.737, "step": 680 }, { - "epoch": 0.18, - "grad_norm": 0.15574102103710175, - "learning_rate": 9.164077944083593e-06, - "loss": 0.0061, + "epoch": 1.0194285714285714, + "grad_norm": 31.26248550415039, + "learning_rate": 3.937142857142858e-06, + "loss": 0.5545, "step": 690 }, { - "epoch": 0.18, - "grad_norm": 30.2320613861084, - "learning_rate": 9.135837334086418e-06, - "loss": 1.2614, + "epoch": 1.02, + "grad_norm": 39.20292663574219, + "learning_rate": 3.994285714285714e-06, + "loss": 0.5175, + "step": 700 + }, + { + "epoch": 1.02, + "eval_accuracy": 0.8080808080808081, + "eval_loss": 0.7401928901672363, + "eval_runtime": 137.8773, + "eval_samples_per_second": 2.154, + "eval_steps_per_second": 1.081, "step": 700 }, { - "epoch": 0.18, - "grad_norm": 0.1653001755475998, - "learning_rate": 9.107596724089242e-06, - "loss": 1.4186, + "epoch": 2.0005714285714284, + "grad_norm": 0.3124562203884125, + "learning_rate": 4.051428571428572e-06, + "loss": 1.2603, "step": 710 }, { - "epoch": 0.18, - "grad_norm": 5.986731052398682, - "learning_rate": 9.079356114092065e-06, - "loss": 0.5271, + "epoch": 2.0011428571428573, + "grad_norm": 20.57735252380371, + "learning_rate": 4.108571428571429e-06, + "loss": 0.693, "step": 720 }, { - "epoch": 0.19, - "grad_norm": 0.5438697338104248, - "learning_rate": 9.051115504094889e-06, - "loss": 1.6235, + "epoch": 2.001714285714286, + "grad_norm": 49.15703582763672, + "learning_rate": 4.165714285714287e-06, + "loss": 1.4011, "step": 730 }, { - "epoch": 0.19, - "grad_norm": 132.91490173339844, - "learning_rate": 9.022874894097713e-06, - "loss": 0.8793, + "epoch": 2.0022857142857142, + "grad_norm": 21.79818344116211, + "learning_rate": 4.222857142857143e-06, + "loss": 1.7675, "step": 740 }, { - "epoch": 0.19, - "grad_norm": 222.04197692871094, - "learning_rate": 8.994634284100537e-06, - "loss": 0.0887, + "epoch": 2.0028571428571427, + "grad_norm": 8.412939071655273, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.7266, "step": 750 }, { - "epoch": 0.19, - "grad_norm": 55.007293701171875, - "learning_rate": 8.96639367410336e-06, - "loss": 3.5521, + "epoch": 2.0034285714285716, + "grad_norm": 16.968673706054688, + "learning_rate": 4.3371428571428575e-06, + "loss": 0.7732, "step": 760 }, { - "epoch": 0.2, - "grad_norm": 43.21747589111328, - "learning_rate": 8.938153064106186e-06, - "loss": 0.8727, + "epoch": 2.004, + "grad_norm": 70.78853607177734, + "learning_rate": 4.3942857142857144e-06, + "loss": 0.7847, "step": 770 }, { - "epoch": 0.2, - "grad_norm": 52.716983795166016, - "learning_rate": 8.90991245410901e-06, - "loss": 1.8305, + "epoch": 2.0045714285714284, + "grad_norm": 6.835943698883057, + "learning_rate": 4.451428571428571e-06, + "loss": 0.8395, "step": 780 }, { - "epoch": 0.2, - "eval_accuracy": 0.6023489932885906, - "eval_loss": 1.2154779434204102, - "eval_runtime": 323.3556, - "eval_samples_per_second": 1.843, - "eval_steps_per_second": 1.843, - "step": 787 - }, - { - "epoch": 1.0, - "grad_norm": 130.62637329101562, - "learning_rate": 8.881671844111833e-06, - "loss": 1.5259, + "epoch": 2.0051428571428573, + "grad_norm": 24.16145133972168, + "learning_rate": 4.508571428571429e-06, + "loss": 0.531, "step": 790 }, { - "epoch": 1.0, - "grad_norm": 28.14907455444336, - "learning_rate": 8.853431234114657e-06, - "loss": 0.9381, + "epoch": 2.005714285714286, + "grad_norm": 1.41634202003479, + "learning_rate": 4.565714285714286e-06, + "loss": 0.4976, "step": 800 }, { - "epoch": 1.01, - "grad_norm": 4.458512783050537, - "learning_rate": 8.825190624117481e-06, - "loss": 0.6209, + "epoch": 2.0062857142857142, + "grad_norm": 2.940735340118408, + "learning_rate": 4.622857142857143e-06, + "loss": 0.4175, "step": 810 }, { - "epoch": 1.01, - "grad_norm": 2.9990813732147217, - "learning_rate": 8.796950014120306e-06, - "loss": 0.3972, + "epoch": 2.0068571428571427, + "grad_norm": 2.9641330242156982, + "learning_rate": 4.680000000000001e-06, + "loss": 0.2843, "step": 820 }, { - "epoch": 1.01, - "grad_norm": 317.2392272949219, - "learning_rate": 8.76870940412313e-06, - "loss": 1.7995, + "epoch": 2.0074285714285716, + "grad_norm": 46.827030181884766, + "learning_rate": 4.737142857142857e-06, + "loss": 1.0676, "step": 830 }, { - "epoch": 1.01, - "grad_norm": 0.08616624772548676, - "learning_rate": 8.740468794125954e-06, - "loss": 0.94, + "epoch": 2.008, + "grad_norm": 92.20220184326172, + "learning_rate": 4.794285714285715e-06, + "loss": 0.789, "step": 840 }, { - "epoch": 1.02, - "grad_norm": 0.08287107199430466, - "learning_rate": 8.712228184128779e-06, - "loss": 0.0513, + "epoch": 2.0085714285714285, + "grad_norm": 181.356201171875, + "learning_rate": 4.851428571428572e-06, + "loss": 0.6685, "step": 850 }, { - "epoch": 1.02, - "grad_norm": 177.66172790527344, - "learning_rate": 8.683987574131601e-06, - "loss": 0.7323, + "epoch": 2.0091428571428573, + "grad_norm": 0.2601974904537201, + "learning_rate": 4.90857142857143e-06, + "loss": 0.6834, "step": 860 }, { - "epoch": 1.02, - "grad_norm": 79.82263946533203, - "learning_rate": 8.655746964134425e-06, - "loss": 1.6152, + "epoch": 2.009714285714286, + "grad_norm": 123.33201599121094, + "learning_rate": 4.965714285714286e-06, + "loss": 0.8586, "step": 870 }, { - "epoch": 1.02, - "grad_norm": 0.3191066086292267, - "learning_rate": 8.62750635413725e-06, - "loss": 1.4359, + "epoch": 2.0102857142857142, + "grad_norm": 143.15835571289062, + "learning_rate": 5.0228571428571435e-06, + "loss": 0.8327, "step": 880 }, { - "epoch": 1.03, - "grad_norm": 172.41537475585938, - "learning_rate": 8.599265744140074e-06, - "loss": 1.473, + "epoch": 2.0108571428571427, + "grad_norm": 2.790085792541504, + "learning_rate": 5.0800000000000005e-06, + "loss": 1.6946, "step": 890 }, { - "epoch": 1.03, - "grad_norm": 9.612916946411133, - "learning_rate": 8.571025134142898e-06, - "loss": 1.0049, + "epoch": 2.0114285714285716, + "grad_norm": 50.61666488647461, + "learning_rate": 5.1371428571428574e-06, + "loss": 0.3813, "step": 900 }, { - "epoch": 1.03, - "grad_norm": 27.31446647644043, - "learning_rate": 8.542784524145723e-06, - "loss": 0.7613, + "epoch": 2.012, + "grad_norm": 4.110306739807129, + "learning_rate": 5.194285714285715e-06, + "loss": 0.5825, "step": 910 }, { - "epoch": 1.03, - "grad_norm": 0.6966761350631714, - "learning_rate": 8.514543914148547e-06, - "loss": 0.3961, + "epoch": 2.0125714285714285, + "grad_norm": 2.2944891452789307, + "learning_rate": 5.251428571428571e-06, + "loss": 0.9097, "step": 920 }, { - "epoch": 1.04, - "grad_norm": 0.1365099400281906, - "learning_rate": 8.48630330415137e-06, - "loss": 0.1293, + "epoch": 2.0131428571428573, + "grad_norm": 0.1945544183254242, + "learning_rate": 5.308571428571428e-06, + "loss": 0.2092, "step": 930 }, { - "epoch": 1.04, - "grad_norm": 0.0884900763630867, - "learning_rate": 8.458062694154194e-06, - "loss": 0.5977, + "epoch": 2.013714285714286, + "grad_norm": 0.31246262788772583, + "learning_rate": 5.365714285714286e-06, + "loss": 0.6408, "step": 940 }, { - "epoch": 1.04, - "grad_norm": 2.8027291297912598, - "learning_rate": 8.429822084157018e-06, - "loss": 1.2093, + "epoch": 2.0142857142857142, + "grad_norm": 0.27790567278862, + "learning_rate": 5.422857142857143e-06, + "loss": 0.2627, "step": 950 }, { - "epoch": 1.04, - "grad_norm": 75.59962463378906, - "learning_rate": 8.401581474159842e-06, - "loss": 0.4025, + "epoch": 2.0148571428571427, + "grad_norm": 0.11815852671861649, + "learning_rate": 5.480000000000001e-06, + "loss": 0.7804, "step": 960 }, { - "epoch": 1.05, - "grad_norm": 2.279308795928955, - "learning_rate": 8.373340864162667e-06, - "loss": 2.1111, + "epoch": 2.0154285714285716, + "grad_norm": 81.77935028076172, + "learning_rate": 5.537142857142858e-06, + "loss": 0.9296, "step": 970 }, { - "epoch": 1.05, - "grad_norm": 0.3451221287250519, - "learning_rate": 8.345100254165491e-06, - "loss": 0.8048, + "epoch": 2.016, + "grad_norm": 0.16326524317264557, + "learning_rate": 5.594285714285714e-06, + "loss": 0.811, "step": 980 }, { - "epoch": 1.05, - "grad_norm": 0.8533955812454224, - "learning_rate": 8.316859644168315e-06, - "loss": 0.9632, + "epoch": 2.0165714285714285, + "grad_norm": 0.6108279228210449, + "learning_rate": 5.651428571428572e-06, + "loss": 0.0187, "step": 990 }, { - "epoch": 1.05, - "grad_norm": 55.09623718261719, - "learning_rate": 8.28861903417114e-06, - "loss": 0.5962, + "epoch": 2.0171428571428573, + "grad_norm": 0.0926850363612175, + "learning_rate": 5.708571428571429e-06, + "loss": 1.1172, "step": 1000 }, { - "epoch": 1.06, - "grad_norm": 0.5793609023094177, - "learning_rate": 8.260378424173962e-06, - "loss": 0.8425, + "epoch": 2.017714285714286, + "grad_norm": 27.080734252929688, + "learning_rate": 5.7657142857142865e-06, + "loss": 0.2747, "step": 1010 }, { - "epoch": 1.06, - "grad_norm": 0.020316464826464653, - "learning_rate": 8.232137814176786e-06, - "loss": 0.4848, + "epoch": 2.0182857142857142, + "grad_norm": 32.981544494628906, + "learning_rate": 5.8228571428571435e-06, + "loss": 0.754, "step": 1020 }, { - "epoch": 1.06, - "grad_norm": 0.025828547775745392, - "learning_rate": 8.20389720417961e-06, - "loss": 0.4477, + "epoch": 2.0188571428571427, + "grad_norm": 30.81633186340332, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.3928, "step": 1030 }, { - "epoch": 1.06, - "grad_norm": 0.02087996155023575, - "learning_rate": 8.175656594182435e-06, - "loss": 0.391, + "epoch": 2.0194285714285716, + "grad_norm": 0.45116323232650757, + "learning_rate": 5.937142857142858e-06, + "loss": 0.3754, "step": 1040 }, { - "epoch": 1.07, - "grad_norm": 172.80055236816406, - "learning_rate": 8.14741598418526e-06, - "loss": 1.9783, + "epoch": 2.02, + "grad_norm": 0.12361976504325867, + "learning_rate": 5.994285714285714e-06, + "loss": 0.4062, + "step": 1050 + }, + { + "epoch": 2.02, + "eval_accuracy": 0.8282828282828283, + "eval_loss": 0.8531781435012817, + "eval_runtime": 137.2114, + "eval_samples_per_second": 2.165, + "eval_steps_per_second": 1.086, "step": 1050 }, { - "epoch": 1.07, - "grad_norm": 0.15524566173553467, - "learning_rate": 8.119175374188084e-06, - "loss": 0.3193, + "epoch": 3.0005714285714284, + "grad_norm": 0.03133957087993622, + "learning_rate": 6.051428571428571e-06, + "loss": 0.9268, "step": 1060 }, { - "epoch": 1.07, - "grad_norm": 0.34669268131256104, - "learning_rate": 8.090934764190908e-06, - "loss": 0.5599, + "epoch": 3.0011428571428573, + "grad_norm": 2.317265748977661, + "learning_rate": 6.108571428571429e-06, + "loss": 0.602, "step": 1070 }, { - "epoch": 1.07, - "grad_norm": 0.14205870032310486, - "learning_rate": 8.06269415419373e-06, - "loss": 1.6577, + "epoch": 3.001714285714286, + "grad_norm": 0.7146720290184021, + "learning_rate": 6.165714285714286e-06, + "loss": 0.2092, "step": 1080 }, { - "epoch": 1.08, - "grad_norm": 0.0540863536298275, - "learning_rate": 8.034453544196555e-06, - "loss": 0.4903, + "epoch": 3.0022857142857142, + "grad_norm": 0.33893364667892456, + "learning_rate": 6.222857142857144e-06, + "loss": 1.658, "step": 1090 }, { - "epoch": 1.08, - "grad_norm": 3.833791494369507, - "learning_rate": 8.006212934199379e-06, - "loss": 0.3037, + "epoch": 3.0028571428571427, + "grad_norm": 45.28017807006836, + "learning_rate": 6.280000000000001e-06, + "loss": 0.2583, "step": 1100 }, { - "epoch": 1.08, - "grad_norm": 37.181514739990234, - "learning_rate": 7.977972324202203e-06, - "loss": 1.1881, + "epoch": 3.0034285714285716, + "grad_norm": 0.11307206004858017, + "learning_rate": 6.337142857142857e-06, + "loss": 0.381, "step": 1110 }, { - "epoch": 1.08, - "grad_norm": 66.44300079345703, - "learning_rate": 7.949731714205028e-06, - "loss": 0.5832, + "epoch": 3.004, + "grad_norm": 0.7531479597091675, + "learning_rate": 6.394285714285715e-06, + "loss": 0.5401, "step": 1120 }, { - "epoch": 1.09, - "grad_norm": 154.6601104736328, - "learning_rate": 7.921491104207852e-06, - "loss": 2.519, + "epoch": 3.0045714285714284, + "grad_norm": 0.3606512248516083, + "learning_rate": 6.451428571428572e-06, + "loss": 0.8567, "step": 1130 }, { - "epoch": 1.09, - "grad_norm": 30.22860336303711, - "learning_rate": 7.893250494210676e-06, - "loss": 0.9667, + "epoch": 3.0051428571428573, + "grad_norm": 1.0117719173431396, + "learning_rate": 6.5085714285714295e-06, + "loss": 0.8051, "step": 1140 }, { - "epoch": 1.09, - "grad_norm": 0.13177183270454407, - "learning_rate": 7.865009884213499e-06, - "loss": 1.0848, + "epoch": 3.005714285714286, + "grad_norm": 26.470552444458008, + "learning_rate": 6.5657142857142865e-06, + "loss": 0.8478, "step": 1150 }, { - "epoch": 1.09, - "grad_norm": 39.34396743774414, - "learning_rate": 7.836769274216323e-06, - "loss": 0.5417, + "epoch": 3.0062857142857142, + "grad_norm": 0.5472680926322937, + "learning_rate": 6.6228571428571435e-06, + "loss": 0.2333, "step": 1160 }, { - "epoch": 1.1, - "grad_norm": 0.5694972276687622, - "learning_rate": 7.808528664219147e-06, - "loss": 1.0765, + "epoch": 3.0068571428571427, + "grad_norm": 56.5042839050293, + "learning_rate": 6.680000000000001e-06, + "loss": 1.0095, "step": 1170 }, { - "epoch": 1.1, - "grad_norm": 0.16328032314777374, - "learning_rate": 7.780288054221972e-06, - "loss": 0.01, + "epoch": 3.0074285714285716, + "grad_norm": 0.2309829443693161, + "learning_rate": 6.737142857142857e-06, + "loss": 0.495, "step": 1180 }, { - "epoch": 1.1, - "grad_norm": 0.15245263278484344, - "learning_rate": 7.752047444224796e-06, - "loss": 0.6003, + "epoch": 3.008, + "grad_norm": 0.2706996202468872, + "learning_rate": 6.794285714285714e-06, + "loss": 0.7256, "step": 1190 }, { - "epoch": 1.1, - "grad_norm": 0.11677064746618271, - "learning_rate": 7.72380683422762e-06, - "loss": 0.0037, + "epoch": 3.0085714285714285, + "grad_norm": 31.313552856445312, + "learning_rate": 6.851428571428572e-06, + "loss": 1.3175, "step": 1200 }, { - "epoch": 1.11, - "grad_norm": 0.06681229919195175, - "learning_rate": 7.695566224230445e-06, - "loss": 0.0045, + "epoch": 3.0091428571428573, + "grad_norm": 37.99323272705078, + "learning_rate": 6.908571428571429e-06, + "loss": 0.7056, "step": 1210 }, { - "epoch": 1.11, - "grad_norm": 0.03700481355190277, - "learning_rate": 7.667325614233269e-06, - "loss": 0.6897, + "epoch": 3.009714285714286, + "grad_norm": 34.16461181640625, + "learning_rate": 6.965714285714287e-06, + "loss": 0.4093, "step": 1220 }, { - "epoch": 1.11, - "grad_norm": 0.4864102005958557, - "learning_rate": 7.639085004236091e-06, - "loss": 2.1175, + "epoch": 3.0102857142857142, + "grad_norm": 0.45271262526512146, + "learning_rate": 7.022857142857144e-06, + "loss": 0.3888, "step": 1230 }, { - "epoch": 1.12, - "grad_norm": 0.16693036258220673, - "learning_rate": 7.610844394238917e-06, - "loss": 1.1404, + "epoch": 3.0108571428571427, + "grad_norm": 15.342572212219238, + "learning_rate": 7.08e-06, + "loss": 0.6555, "step": 1240 }, { - "epoch": 1.12, - "grad_norm": 0.8580232858657837, - "learning_rate": 7.58260378424174e-06, - "loss": 0.0049, + "epoch": 3.0114285714285716, + "grad_norm": 209.43014526367188, + "learning_rate": 7.137142857142858e-06, + "loss": 1.0105, "step": 1250 }, { - "epoch": 1.12, - "grad_norm": 0.25582146644592285, - "learning_rate": 7.554363174244564e-06, - "loss": 0.0043, + "epoch": 3.012, + "grad_norm": 46.23904800415039, + "learning_rate": 7.194285714285715e-06, + "loss": 1.2347, "step": 1260 }, { - "epoch": 1.12, - "grad_norm": 0.11917036771774292, - "learning_rate": 7.526122564247388e-06, - "loss": 0.2731, + "epoch": 3.0125714285714285, + "grad_norm": 120.28668975830078, + "learning_rate": 7.251428571428572e-06, + "loss": 1.229, "step": 1270 }, { - "epoch": 1.13, - "grad_norm": 46.90564727783203, - "learning_rate": 7.497881954250212e-06, - "loss": 0.7342, + "epoch": 3.0131428571428573, + "grad_norm": 15.921452522277832, + "learning_rate": 7.3085714285714295e-06, + "loss": 0.8367, "step": 1280 }, { - "epoch": 1.13, - "grad_norm": 24.272871017456055, - "learning_rate": 7.469641344253037e-06, - "loss": 1.1628, + "epoch": 3.013714285714286, + "grad_norm": 0.2926520109176636, + "learning_rate": 7.365714285714286e-06, + "loss": 0.2382, "step": 1290 }, { - "epoch": 1.13, - "grad_norm": 0.12642551958560944, - "learning_rate": 7.441400734255861e-06, - "loss": 0.4991, + "epoch": 3.0142857142857142, + "grad_norm": 0.1254614293575287, + "learning_rate": 7.422857142857144e-06, + "loss": 0.857, "step": 1300 }, { - "epoch": 1.13, - "grad_norm": 26.394733428955078, - "learning_rate": 7.413160124258685e-06, - "loss": 1.6511, + "epoch": 3.0148571428571427, + "grad_norm": 0.10389392822980881, + "learning_rate": 7.48e-06, + "loss": 0.0128, "step": 1310 }, { - "epoch": 1.14, - "grad_norm": 0.33993953466415405, - "learning_rate": 7.384919514261508e-06, - "loss": 0.4879, + "epoch": 3.0154285714285716, + "grad_norm": 0.1118193119764328, + "learning_rate": 7.537142857142857e-06, + "loss": 0.01, "step": 1320 }, { - "epoch": 1.14, - "grad_norm": 0.2504374086856842, - "learning_rate": 7.356678904264333e-06, - "loss": 0.4509, + "epoch": 3.016, + "grad_norm": 0.023473914712667465, + "learning_rate": 7.594285714285715e-06, + "loss": 1.3965, "step": 1330 }, { - "epoch": 1.14, - "grad_norm": 0.05574103072285652, - "learning_rate": 7.328438294267157e-06, - "loss": 0.4486, + "epoch": 3.0165714285714285, + "grad_norm": 79.61312103271484, + "learning_rate": 7.651428571428571e-06, + "loss": 1.6107, "step": 1340 }, { - "epoch": 1.14, - "grad_norm": 0.23328031599521637, - "learning_rate": 7.30019768426998e-06, - "loss": 0.5054, + "epoch": 3.0171428571428573, + "grad_norm": 4.563874244689941, + "learning_rate": 7.708571428571429e-06, + "loss": 0.3567, "step": 1350 }, { - "epoch": 1.15, - "grad_norm": 0.010512278415262699, - "learning_rate": 7.2719570742728055e-06, - "loss": 1.1599, + "epoch": 3.017714285714286, + "grad_norm": 33.95674514770508, + "learning_rate": 7.765714285714287e-06, + "loss": 1.1828, "step": 1360 }, { - "epoch": 1.15, - "grad_norm": 0.042285043746232986, - "learning_rate": 7.243716464275629e-06, - "loss": 0.4782, + "epoch": 3.0182857142857142, + "grad_norm": 36.74417495727539, + "learning_rate": 7.822857142857143e-06, + "loss": 0.9229, "step": 1370 }, { - "epoch": 1.15, - "grad_norm": 0.012322720140218735, - "learning_rate": 7.215475854278453e-06, - "loss": 1.0997, + "epoch": 3.0188571428571427, + "grad_norm": 63.20321273803711, + "learning_rate": 7.88e-06, + "loss": 0.3935, "step": 1380 }, { - "epoch": 1.15, - "grad_norm": 0.014798709191381931, - "learning_rate": 7.1872352442812775e-06, - "loss": 1.2561, + "epoch": 3.0194285714285716, + "grad_norm": 29.39291000366211, + "learning_rate": 7.937142857142857e-06, + "loss": 0.7986, "step": 1390 }, { - "epoch": 1.16, - "grad_norm": 0.45606276392936707, - "learning_rate": 7.158994634284101e-06, - "loss": 1.2503, + "epoch": 3.02, + "grad_norm": 29.3126277923584, + "learning_rate": 7.994285714285715e-06, + "loss": 0.7962, + "step": 1400 + }, + { + "epoch": 3.02, + "eval_accuracy": 0.8114478114478114, + "eval_loss": 0.7183520197868347, + "eval_runtime": 137.6694, + "eval_samples_per_second": 2.157, + "eval_steps_per_second": 1.082, "step": 1400 }, { - "epoch": 1.16, - "grad_norm": 0.04647226259112358, - "learning_rate": 7.130754024286925e-06, - "loss": 1.2027, + "epoch": 4.000571428571429, + "grad_norm": 2.059375047683716, + "learning_rate": 8.051428571428573e-06, + "loss": 0.7828, "step": 1410 }, { - "epoch": 1.16, - "grad_norm": 0.34346136450767517, - "learning_rate": 7.102513414289749e-06, - "loss": 0.0049, + "epoch": 4.001142857142857, + "grad_norm": 47.22036361694336, + "learning_rate": 8.108571428571429e-06, + "loss": 0.9877, "step": 1420 }, { - "epoch": 1.16, - "grad_norm": 23.320772171020508, - "learning_rate": 7.074272804292574e-06, - "loss": 0.8748, + "epoch": 4.001714285714286, + "grad_norm": 0.17464332282543182, + "learning_rate": 8.165714285714286e-06, + "loss": 0.6497, "step": 1430 }, { - "epoch": 1.17, - "grad_norm": 0.4166504144668579, - "learning_rate": 7.0460321942953965e-06, - "loss": 0.8503, + "epoch": 4.002285714285715, + "grad_norm": 1.4610955715179443, + "learning_rate": 8.222857142857144e-06, + "loss": 0.5103, "step": 1440 }, { - "epoch": 1.17, - "grad_norm": 0.028790142387151718, - "learning_rate": 7.017791584298222e-06, - "loss": 0.755, + "epoch": 4.002857142857143, + "grad_norm": 0.10568273067474365, + "learning_rate": 8.28e-06, + "loss": 0.0078, "step": 1450 }, { - "epoch": 1.17, - "grad_norm": 0.10875601321458817, - "learning_rate": 6.989550974301046e-06, - "loss": 1.2674, + "epoch": 4.003428571428572, + "grad_norm": 0.04568742215633392, + "learning_rate": 8.337142857142858e-06, + "loss": 0.0033, "step": 1460 }, { - "epoch": 1.17, - "grad_norm": 0.24065102636814117, - "learning_rate": 6.961310364303869e-06, - "loss": 1.0944, + "epoch": 4.004, + "grad_norm": 0.1109299287199974, + "learning_rate": 8.394285714285714e-06, + "loss": 0.8707, "step": 1470 }, { - "epoch": 1.18, - "grad_norm": 0.17569658160209656, - "learning_rate": 6.933069754306694e-06, - "loss": 1.0381, + "epoch": 4.0045714285714284, + "grad_norm": 0.08762586861848831, + "learning_rate": 8.451428571428572e-06, + "loss": 0.7401, "step": 1480 }, { - "epoch": 1.18, - "grad_norm": 130.54307556152344, - "learning_rate": 6.904829144309517e-06, - "loss": 1.0751, + "epoch": 4.005142857142857, + "grad_norm": 0.16827107965946198, + "learning_rate": 8.50857142857143e-06, + "loss": 0.6692, "step": 1490 }, { - "epoch": 1.18, - "grad_norm": 0.1745329350233078, - "learning_rate": 6.876588534312341e-06, - "loss": 0.97, + "epoch": 4.005714285714285, + "grad_norm": 0.9331682324409485, + "learning_rate": 8.565714285714286e-06, + "loss": 0.005, "step": 1500 }, { - "epoch": 1.18, - "grad_norm": 24.403528213500977, - "learning_rate": 6.8483479243151665e-06, - "loss": 0.8853, + "epoch": 4.006285714285714, + "grad_norm": 0.14198555052280426, + "learning_rate": 8.622857142857144e-06, + "loss": 0.0065, "step": 1510 }, { - "epoch": 1.19, - "grad_norm": 3.1027746200561523, - "learning_rate": 6.82010731431799e-06, - "loss": 0.6651, + "epoch": 4.006857142857143, + "grad_norm": 0.1513282060623169, + "learning_rate": 8.68e-06, + "loss": 0.5806, "step": 1520 }, { - "epoch": 1.19, - "grad_norm": 5.846127033233643, - "learning_rate": 6.791866704320814e-06, - "loss": 0.4536, + "epoch": 4.007428571428571, + "grad_norm": 0.06693244725465775, + "learning_rate": 8.737142857142858e-06, + "loss": 0.7874, "step": 1530 }, { - "epoch": 1.19, - "grad_norm": 0.017441660165786743, - "learning_rate": 6.763626094323638e-06, - "loss": 1.0821, + "epoch": 4.008, + "grad_norm": 0.12991634011268616, + "learning_rate": 8.794285714285716e-06, + "loss": 0.2847, "step": 1540 }, { - "epoch": 1.19, - "grad_norm": 0.20883318781852722, - "learning_rate": 6.735385484326462e-06, - "loss": 0.0086, + "epoch": 4.008571428571429, + "grad_norm": 19.66016960144043, + "learning_rate": 8.851428571428572e-06, + "loss": 1.0498, "step": 1550 }, { - "epoch": 1.2, - "grad_norm": 0.15525928139686584, - "learning_rate": 6.707144874329285e-06, - "loss": 0.5036, + "epoch": 4.009142857142857, + "grad_norm": 2.3184783458709717, + "learning_rate": 8.90857142857143e-06, + "loss": 0.2343, "step": 1560 }, { - "epoch": 1.2, - "grad_norm": 0.15142841637134552, - "learning_rate": 6.67890426433211e-06, - "loss": 0.7465, + "epoch": 4.009714285714286, + "grad_norm": 0.2929948568344116, + "learning_rate": 8.965714285714287e-06, + "loss": 0.5086, "step": 1570 }, { - "epoch": 1.2, - "eval_accuracy": 0.8338926174496645, - "eval_loss": 0.7484979629516602, - "eval_runtime": 324.9358, - "eval_samples_per_second": 1.834, - "eval_steps_per_second": 1.834, - "step": 1574 - }, - { - "epoch": 2.0, - "grad_norm": 0.13457632064819336, - "learning_rate": 6.650663654334935e-06, - "loss": 1.7798, + "epoch": 4.010285714285715, + "grad_norm": 22.23305892944336, + "learning_rate": 9.022857142857143e-06, + "loss": 0.5249, "step": 1580 }, { - "epoch": 2.0, - "grad_norm": 0.16015511751174927, - "learning_rate": 6.622423044337758e-06, - "loss": 0.3209, + "epoch": 4.010857142857143, + "grad_norm": 15.748601913452148, + "learning_rate": 9.080000000000001e-06, + "loss": 1.3767, "step": 1590 }, { - "epoch": 2.01, - "grad_norm": 0.16651424765586853, - "learning_rate": 6.5941824343405826e-06, - "loss": 1.5177, + "epoch": 4.011428571428572, + "grad_norm": 0.976803183555603, + "learning_rate": 9.137142857142857e-06, + "loss": 0.4042, "step": 1600 }, { - "epoch": 2.01, - "grad_norm": 0.35417357087135315, - "learning_rate": 6.565941824343406e-06, - "loss": 0.515, + "epoch": 4.012, + "grad_norm": 0.44382333755493164, + "learning_rate": 9.194285714285715e-06, + "loss": 0.4972, "step": 1610 }, { - "epoch": 2.01, - "grad_norm": 0.05276577174663544, - "learning_rate": 6.53770121434623e-06, - "loss": 1.9682, + "epoch": 4.0125714285714285, + "grad_norm": 27.20537757873535, + "learning_rate": 9.251428571428573e-06, + "loss": 0.9029, "step": 1620 }, { - "epoch": 2.01, - "grad_norm": 0.23119059205055237, - "learning_rate": 6.5094606043490546e-06, - "loss": 0.4395, + "epoch": 4.013142857142857, + "grad_norm": 14.854856491088867, + "learning_rate": 9.308571428571429e-06, + "loss": 1.1477, "step": 1630 }, { - "epoch": 2.02, - "grad_norm": 0.4055440425872803, - "learning_rate": 6.481219994351878e-06, - "loss": 0.0106, + "epoch": 4.013714285714285, + "grad_norm": 14.011804580688477, + "learning_rate": 9.365714285714287e-06, + "loss": 0.4896, "step": 1640 }, { - "epoch": 2.02, - "grad_norm": 0.15100878477096558, - "learning_rate": 6.452979384354703e-06, - "loss": 0.0059, + "epoch": 4.014285714285714, + "grad_norm": 0.38275304436683655, + "learning_rate": 9.422857142857143e-06, + "loss": 0.8805, "step": 1650 }, { - "epoch": 2.02, - "grad_norm": 0.14264227449893951, - "learning_rate": 6.424738774357527e-06, - "loss": 1.2383, + "epoch": 4.014857142857143, + "grad_norm": 28.851638793945312, + "learning_rate": 9.48e-06, + "loss": 1.3966, "step": 1660 }, { - "epoch": 2.02, - "grad_norm": 0.16001996397972107, - "learning_rate": 6.396498164360351e-06, - "loss": 1.02, + "epoch": 4.015428571428571, + "grad_norm": 35.203712463378906, + "learning_rate": 9.537142857142859e-06, + "loss": 0.613, "step": 1670 }, { - "epoch": 2.03, - "grad_norm": 0.15914620459079742, - "learning_rate": 6.368257554363175e-06, - "loss": 0.005, + "epoch": 4.016, + "grad_norm": 0.1584346443414688, + "learning_rate": 9.594285714285715e-06, + "loss": 0.2728, "step": 1680 }, { - "epoch": 2.03, - "grad_norm": 0.11949492990970612, - "learning_rate": 6.340016944365999e-06, - "loss": 0.5224, + "epoch": 4.016571428571429, + "grad_norm": 0.10078814625740051, + "learning_rate": 9.651428571428572e-06, + "loss": 0.4286, "step": 1690 }, { - "epoch": 2.03, - "grad_norm": 68.48541259765625, - "learning_rate": 6.311776334368823e-06, - "loss": 0.7585, + "epoch": 4.017142857142857, + "grad_norm": 13.510937690734863, + "learning_rate": 9.70857142857143e-06, + "loss": 0.6424, "step": 1700 }, { - "epoch": 2.03, - "grad_norm": 78.09130859375, - "learning_rate": 6.283535724371646e-06, - "loss": 1.5627, + "epoch": 4.017714285714286, + "grad_norm": 17.916301727294922, + "learning_rate": 9.765714285714286e-06, + "loss": 0.993, "step": 1710 }, { - "epoch": 2.04, - "grad_norm": 0.06310901045799255, - "learning_rate": 6.2552951143744715e-06, - "loss": 1.0456, + "epoch": 4.018285714285715, + "grad_norm": 35.681949615478516, + "learning_rate": 9.822857142857144e-06, + "loss": 1.2221, "step": 1720 }, { - "epoch": 2.04, - "grad_norm": 0.7593610882759094, - "learning_rate": 6.227054504377295e-06, - "loss": 0.6762, + "epoch": 4.018857142857143, + "grad_norm": 15.966533660888672, + "learning_rate": 9.88e-06, + "loss": 0.7781, "step": 1730 }, { - "epoch": 2.04, - "grad_norm": 0.1845823973417282, - "learning_rate": 6.198813894380119e-06, - "loss": 0.9624, + "epoch": 4.019428571428572, + "grad_norm": 6.339288234710693, + "learning_rate": 9.937142857142858e-06, + "loss": 0.4475, "step": 1740 }, { - "epoch": 2.04, - "grad_norm": 0.49727919697761536, - "learning_rate": 6.1705732843829435e-06, - "loss": 1.1605, + "epoch": 4.02, + "grad_norm": 145.03355407714844, + "learning_rate": 9.994285714285716e-06, + "loss": 0.8225, + "step": 1750 + }, + { + "epoch": 4.02, + "eval_accuracy": 0.5656565656565656, + "eval_loss": 1.686766266822815, + "eval_runtime": 137.779, + "eval_samples_per_second": 2.156, + "eval_steps_per_second": 1.081, "step": 1750 }, { - "epoch": 2.05, - "grad_norm": 25.680139541625977, - "learning_rate": 6.142332674385767e-06, - "loss": 1.5904, + "epoch": 5.000571428571429, + "grad_norm": 9.777188301086426, + "learning_rate": 9.994285714285716e-06, + "loss": 1.4855, "step": 1760 }, { - "epoch": 2.05, - "grad_norm": 0.20278450846672058, - "learning_rate": 6.114092064388591e-06, - "loss": 0.3835, + "epoch": 5.001142857142857, + "grad_norm": 39.02730178833008, + "learning_rate": 9.987936507936509e-06, + "loss": 1.214, "step": 1770 }, { - "epoch": 2.05, - "grad_norm": 0.2322162538766861, - "learning_rate": 6.085851454391415e-06, - "loss": 0.9161, + "epoch": 5.001714285714286, + "grad_norm": 40.402408599853516, + "learning_rate": 9.981587301587303e-06, + "loss": 0.5619, "step": 1780 }, { - "epoch": 2.05, - "grad_norm": 0.2889300286769867, - "learning_rate": 6.05761084439424e-06, - "loss": 0.2446, + "epoch": 5.002285714285715, + "grad_norm": 13.327921867370605, + "learning_rate": 9.975238095238095e-06, + "loss": 0.6943, "step": 1790 }, { - "epoch": 2.06, - "grad_norm": 0.21073293685913086, - "learning_rate": 6.029370234397064e-06, - "loss": 0.6002, + "epoch": 5.002857142857143, + "grad_norm": 7.069613456726074, + "learning_rate": 9.96888888888889e-06, + "loss": 0.5466, "step": 1800 }, { - "epoch": 2.06, - "grad_norm": 0.2775285243988037, - "learning_rate": 6.0011296243998876e-06, - "loss": 0.0199, + "epoch": 5.003428571428572, + "grad_norm": 81.00372314453125, + "learning_rate": 9.962539682539684e-06, + "loss": 1.0447, "step": 1810 }, { - "epoch": 2.06, - "grad_norm": 0.13404442369937897, - "learning_rate": 5.972889014402712e-06, - "loss": 0.0098, + "epoch": 5.004, + "grad_norm": 0.9917150735855103, + "learning_rate": 9.956190476190477e-06, + "loss": 0.5806, "step": 1820 }, { - "epoch": 2.07, - "grad_norm": 0.19267532229423523, - "learning_rate": 5.944648404405535e-06, - "loss": 2.1965, + "epoch": 5.0045714285714284, + "grad_norm": 0.22874107956886292, + "learning_rate": 9.949841269841271e-06, + "loss": 0.4238, "step": 1830 }, { - "epoch": 2.07, - "grad_norm": 0.314384788274765, - "learning_rate": 5.9164077944083596e-06, - "loss": 1.875, + "epoch": 5.005142857142857, + "grad_norm": 13.64168643951416, + "learning_rate": 9.943492063492064e-06, + "loss": 0.7042, "step": 1840 }, { - "epoch": 2.07, - "grad_norm": 0.410067081451416, - "learning_rate": 5.888167184411185e-06, - "loss": 0.01, + "epoch": 5.005714285714285, + "grad_norm": 0.5768861174583435, + "learning_rate": 9.937142857142858e-06, + "loss": 0.4288, "step": 1850 }, { - "epoch": 2.07, - "grad_norm": 26.344202041625977, - "learning_rate": 5.859926574414007e-06, - "loss": 0.8549, + "epoch": 5.006285714285714, + "grad_norm": 31.998321533203125, + "learning_rate": 9.930793650793652e-06, + "loss": 1.2366, "step": 1860 }, { - "epoch": 2.08, - "grad_norm": 0.2887156307697296, - "learning_rate": 5.8316859644168324e-06, - "loss": 0.0071, + "epoch": 5.006857142857143, + "grad_norm": 13.20121955871582, + "learning_rate": 9.924444444444445e-06, + "loss": 0.6062, "step": 1870 }, { - "epoch": 2.08, - "grad_norm": 0.2524619698524475, - "learning_rate": 5.803445354419656e-06, - "loss": 0.4629, + "epoch": 5.007428571428571, + "grad_norm": 0.0679963082075119, + "learning_rate": 9.91809523809524e-06, + "loss": 0.5082, "step": 1880 }, { - "epoch": 2.08, - "grad_norm": 0.22347721457481384, - "learning_rate": 5.77520474442248e-06, - "loss": 0.454, + "epoch": 5.008, + "grad_norm": 0.2697165608406067, + "learning_rate": 9.911746031746032e-06, + "loss": 0.8584, "step": 1890 }, { - "epoch": 2.08, - "grad_norm": 0.19489675760269165, - "learning_rate": 5.746964134425304e-06, - "loss": 0.0037, + "epoch": 5.008571428571429, + "grad_norm": 12.50837230682373, + "learning_rate": 9.905396825396826e-06, + "loss": 0.5164, "step": 1900 }, { - "epoch": 2.09, - "grad_norm": 25.353939056396484, - "learning_rate": 5.718723524428128e-06, - "loss": 0.5369, + "epoch": 5.009142857142857, + "grad_norm": 28.091419219970703, + "learning_rate": 9.89904761904762e-06, + "loss": 0.9162, "step": 1910 }, { - "epoch": 2.09, - "grad_norm": 0.01046634092926979, - "learning_rate": 5.690482914430952e-06, - "loss": 0.0017, + "epoch": 5.009714285714286, + "grad_norm": 0.1433819681406021, + "learning_rate": 9.892698412698413e-06, + "loss": 0.39, "step": 1920 }, { - "epoch": 2.09, - "grad_norm": 0.016024693846702576, - "learning_rate": 5.662242304433776e-06, - "loss": 0.6874, + "epoch": 5.010285714285715, + "grad_norm": 0.6932072043418884, + "learning_rate": 9.886349206349208e-06, + "loss": 0.5809, "step": 1930 }, { - "epoch": 2.09, - "grad_norm": 0.13052043318748474, - "learning_rate": 5.634001694436601e-06, - "loss": 0.5459, + "epoch": 5.010857142857143, + "grad_norm": 0.5466342568397522, + "learning_rate": 9.88e-06, + "loss": 0.5664, "step": 1940 }, { - "epoch": 2.1, - "grad_norm": 22.755247116088867, - "learning_rate": 5.605761084439424e-06, - "loss": 1.063, + "epoch": 5.011428571428572, + "grad_norm": 10.084834098815918, + "learning_rate": 9.873650793650795e-06, + "loss": 0.4083, "step": 1950 }, { - "epoch": 2.1, - "grad_norm": 0.017313385382294655, - "learning_rate": 5.5775204744422485e-06, - "loss": 1.3419, + "epoch": 5.012, + "grad_norm": 0.201524555683136, + "learning_rate": 9.867301587301587e-06, + "loss": 0.9247, "step": 1960 }, { - "epoch": 2.1, - "grad_norm": 0.27627086639404297, - "learning_rate": 5.549279864445073e-06, - "loss": 0.7342, + "epoch": 5.0125714285714285, + "grad_norm": 49.69109344482422, + "learning_rate": 9.860952380952382e-06, + "loss": 0.4362, "step": 1970 }, { - "epoch": 2.1, - "grad_norm": 0.02726130001246929, - "learning_rate": 5.521039254447896e-06, - "loss": 1.1784, + "epoch": 5.013142857142857, + "grad_norm": 2.8617489337921143, + "learning_rate": 9.854603174603176e-06, + "loss": 0.009, "step": 1980 }, { - "epoch": 2.11, - "grad_norm": 0.3983820378780365, - "learning_rate": 5.4927986444507205e-06, - "loss": 0.018, + "epoch": 5.013714285714285, + "grad_norm": 0.3022560775279999, + "learning_rate": 9.848253968253969e-06, + "loss": 0.849, "step": 1990 }, { - "epoch": 2.11, - "grad_norm": 0.13953474164009094, - "learning_rate": 5.464558034453544e-06, - "loss": 0.0044, + "epoch": 5.014285714285714, + "grad_norm": 0.2493607997894287, + "learning_rate": 9.841904761904763e-06, + "loss": 0.438, "step": 2000 }, { - "epoch": 2.11, - "grad_norm": 0.04043055698275566, - "learning_rate": 5.436317424456369e-06, - "loss": 0.5193, + "epoch": 5.014857142857143, + "grad_norm": 0.21475115418434143, + "learning_rate": 9.835555555555556e-06, + "loss": 0.846, "step": 2010 }, { - "epoch": 2.11, - "grad_norm": 22.795866012573242, - "learning_rate": 5.408076814459193e-06, - "loss": 0.823, + "epoch": 5.015428571428571, + "grad_norm": 0.6425090432167053, + "learning_rate": 9.82920634920635e-06, + "loss": 0.5353, "step": 2020 }, { - "epoch": 2.12, - "grad_norm": 23.257352828979492, - "learning_rate": 5.379836204462017e-06, - "loss": 1.0949, + "epoch": 5.016, + "grad_norm": 0.29937419295310974, + "learning_rate": 9.822857142857144e-06, + "loss": 0.8212, "step": 2030 }, { - "epoch": 2.12, - "grad_norm": 22.406156539916992, - "learning_rate": 5.351595594464841e-06, - "loss": 2.1405, + "epoch": 5.016571428571429, + "grad_norm": 5.154245853424072, + "learning_rate": 9.816507936507937e-06, + "loss": 0.9711, "step": 2040 }, { - "epoch": 2.12, - "grad_norm": 0.4050818979740143, - "learning_rate": 5.323354984467665e-06, - "loss": 1.4621, + "epoch": 5.017142857142857, + "grad_norm": 0.5371858477592468, + "learning_rate": 9.810158730158731e-06, + "loss": 0.4229, "step": 2050 }, { - "epoch": 2.12, - "grad_norm": 0.4241192638874054, - "learning_rate": 5.295114374470489e-06, - "loss": 0.4477, + "epoch": 5.017714285714286, + "grad_norm": 11.923803329467773, + "learning_rate": 9.803809523809524e-06, + "loss": 1.5601, "step": 2060 }, { - "epoch": 2.13, - "grad_norm": 0.30280259251594543, - "learning_rate": 5.266873764473312e-06, - "loss": 0.0106, + "epoch": 5.018285714285715, + "grad_norm": 25.41405487060547, + "learning_rate": 9.797460317460318e-06, + "loss": 0.374, "step": 2070 }, { - "epoch": 2.13, - "grad_norm": 0.18719686567783356, - "learning_rate": 5.2386331544761374e-06, - "loss": 0.4244, + "epoch": 5.018857142857143, + "grad_norm": 12.134407043457031, + "learning_rate": 9.791111111111112e-06, + "loss": 1.0097, "step": 2080 }, { - "epoch": 2.13, - "grad_norm": 0.04479646682739258, - "learning_rate": 5.210392544478962e-06, - "loss": 0.6732, + "epoch": 5.019428571428572, + "grad_norm": 191.27777099609375, + "learning_rate": 9.784761904761905e-06, + "loss": 0.8402, "step": 2090 }, { - "epoch": 2.13, - "grad_norm": 22.563093185424805, - "learning_rate": 5.182151934481785e-06, - "loss": 0.5233, + "epoch": 5.02, + "grad_norm": 48.240779876708984, + "learning_rate": 9.7784126984127e-06, + "loss": 0.724, + "step": 2100 + }, + { + "epoch": 5.02, + "eval_accuracy": 0.7508417508417509, + "eval_loss": 1.0066299438476562, + "eval_runtime": 136.2857, + "eval_samples_per_second": 2.179, + "eval_steps_per_second": 1.093, "step": 2100 }, { - "epoch": 2.14, - "grad_norm": 0.0441102497279644, - "learning_rate": 5.1539113244846095e-06, - "loss": 1.7924, + "epoch": 6.000571428571429, + "grad_norm": 16.156768798828125, + "learning_rate": 9.772063492063492e-06, + "loss": 0.4888, "step": 2110 }, { - "epoch": 2.14, - "grad_norm": 0.046370625495910645, - "learning_rate": 5.125670714487433e-06, - "loss": 0.0049, + "epoch": 6.001142857142857, + "grad_norm": 3.711266040802002, + "learning_rate": 9.765714285714286e-06, + "loss": 0.92, "step": 2120 }, { - "epoch": 2.14, - "grad_norm": 1.073113203048706, - "learning_rate": 5.097430104490257e-06, - "loss": 0.5907, + "epoch": 6.001714285714286, + "grad_norm": 13.800188064575195, + "learning_rate": 9.75936507936508e-06, + "loss": 1.3366, "step": 2130 }, { - "epoch": 2.14, - "grad_norm": 0.08737409859895706, - "learning_rate": 5.069189494493082e-06, - "loss": 1.5069, + "epoch": 6.002285714285715, + "grad_norm": 42.781002044677734, + "learning_rate": 9.753015873015873e-06, + "loss": 0.4288, "step": 2140 }, { - "epoch": 2.15, - "grad_norm": 0.058251939713954926, - "learning_rate": 5.040948884495906e-06, - "loss": 0.0395, + "epoch": 6.002857142857143, + "grad_norm": 35.87137985229492, + "learning_rate": 9.746666666666668e-06, + "loss": 0.3171, "step": 2150 }, { - "epoch": 2.15, - "grad_norm": 39.82133102416992, - "learning_rate": 5.01270827449873e-06, - "loss": 0.01, + "epoch": 6.003428571428572, + "grad_norm": 0.34640470147132874, + "learning_rate": 9.74031746031746e-06, + "loss": 0.6315, "step": 2160 }, { - "epoch": 2.15, - "grad_norm": 0.18180860579013824, - "learning_rate": 4.9844676645015535e-06, - "loss": 0.0042, + "epoch": 6.004, + "grad_norm": 0.15693581104278564, + "learning_rate": 9.733968253968255e-06, + "loss": 0.9196, "step": 2170 }, { - "epoch": 2.15, - "grad_norm": 0.10672131180763245, - "learning_rate": 4.956227054504378e-06, - "loss": 0.01, + "epoch": 6.0045714285714284, + "grad_norm": 0.8390889763832092, + "learning_rate": 9.727619047619047e-06, + "loss": 0.5796, "step": 2180 }, { - "epoch": 2.16, - "grad_norm": 28.28722381591797, - "learning_rate": 4.927986444507202e-06, - "loss": 1.221, + "epoch": 6.005142857142857, + "grad_norm": 0.38496437668800354, + "learning_rate": 9.721269841269843e-06, + "loss": 0.7525, "step": 2190 }, { - "epoch": 2.16, - "grad_norm": 0.12183073163032532, - "learning_rate": 4.8997458345100255e-06, - "loss": 1.1702, + "epoch": 6.005714285714285, + "grad_norm": 0.7874132990837097, + "learning_rate": 9.714920634920636e-06, + "loss": 0.5311, "step": 2200 }, { - "epoch": 2.16, - "grad_norm": 0.17091098427772522, - "learning_rate": 4.87150522451285e-06, - "loss": 1.9466, + "epoch": 6.006285714285714, + "grad_norm": 0.2642306983470917, + "learning_rate": 9.70857142857143e-06, + "loss": 0.3915, "step": 2210 }, { - "epoch": 2.16, - "grad_norm": 24.53551483154297, - "learning_rate": 4.843264614515674e-06, - "loss": 1.0011, + "epoch": 6.006857142857143, + "grad_norm": 0.1062416136264801, + "learning_rate": 9.702222222222223e-06, + "loss": 0.4608, "step": 2220 }, { - "epoch": 2.17, - "grad_norm": 33.58413314819336, - "learning_rate": 4.815024004518498e-06, - "loss": 1.1844, + "epoch": 6.007428571428571, + "grad_norm": 8.830906867980957, + "learning_rate": 9.695873015873016e-06, + "loss": 0.4464, "step": 2230 }, { - "epoch": 2.17, - "grad_norm": 24.298912048339844, - "learning_rate": 4.786783394521322e-06, - "loss": 1.477, + "epoch": 6.008, + "grad_norm": 69.506103515625, + "learning_rate": 9.68952380952381e-06, + "loss": 0.7923, "step": 2240 }, { - "epoch": 2.17, - "grad_norm": 0.2964635193347931, - "learning_rate": 4.758542784524146e-06, - "loss": 0.5349, + "epoch": 6.008571428571429, + "grad_norm": 42.58530807495117, + "learning_rate": 9.683174603174604e-06, + "loss": 0.2295, "step": 2250 }, { - "epoch": 2.17, - "grad_norm": 0.5078187584877014, - "learning_rate": 4.7303021745269704e-06, - "loss": 0.9149, + "epoch": 6.009142857142857, + "grad_norm": 3.4575247764587402, + "learning_rate": 9.676825396825399e-06, + "loss": 0.0089, "step": 2260 }, { - "epoch": 2.18, - "grad_norm": 0.2942535877227783, - "learning_rate": 4.702061564529794e-06, - "loss": 0.9105, + "epoch": 6.009714285714286, + "grad_norm": 12.615706443786621, + "learning_rate": 9.670476190476191e-06, + "loss": 0.5269, "step": 2270 }, { - "epoch": 2.18, - "grad_norm": 0.2372201532125473, - "learning_rate": 4.673820954532618e-06, - "loss": 0.4805, + "epoch": 6.010285714285715, + "grad_norm": 0.2601805627346039, + "learning_rate": 9.664126984126985e-06, + "loss": 0.5875, "step": 2280 }, { - "epoch": 2.18, - "grad_norm": 1.4744486808776855, - "learning_rate": 4.6455803445354425e-06, - "loss": 0.0077, + "epoch": 6.010857142857143, + "grad_norm": 0.12038259208202362, + "learning_rate": 9.657777777777778e-06, + "loss": 0.2318, "step": 2290 }, { - "epoch": 2.18, - "grad_norm": 3.4638776779174805, - "learning_rate": 4.617339734538267e-06, - "loss": 0.8804, + "epoch": 6.011428571428572, + "grad_norm": 0.11833731830120087, + "learning_rate": 9.651428571428572e-06, + "loss": 0.4668, "step": 2300 }, { - "epoch": 2.19, - "grad_norm": 0.03575224056839943, - "learning_rate": 4.58909912454109e-06, - "loss": 0.4857, + "epoch": 6.012, + "grad_norm": 13.778709411621094, + "learning_rate": 9.645079365079367e-06, + "loss": 1.5556, "step": 2310 }, { - "epoch": 2.19, - "grad_norm": 595.8480834960938, - "learning_rate": 4.5608585145439145e-06, - "loss": 0.2092, + "epoch": 6.0125714285714285, + "grad_norm": 12.374515533447266, + "learning_rate": 9.63873015873016e-06, + "loss": 0.7705, "step": 2320 }, { - "epoch": 2.19, - "grad_norm": 0.17693261802196503, - "learning_rate": 4.532617904546738e-06, - "loss": 1.4817, + "epoch": 6.013142857142857, + "grad_norm": 12.057218551635742, + "learning_rate": 9.632380952380954e-06, + "loss": 1.2163, "step": 2330 }, { - "epoch": 2.19, - "grad_norm": 0.1713973432779312, - "learning_rate": 4.504377294549563e-06, - "loss": 0.4704, + "epoch": 6.013714285714285, + "grad_norm": 0.848735511302948, + "learning_rate": 9.626031746031746e-06, + "loss": 0.3669, "step": 2340 }, { - "epoch": 2.2, - "grad_norm": 0.17562945187091827, - "learning_rate": 4.4761366845523865e-06, - "loss": 0.4608, + "epoch": 6.014285714285714, + "grad_norm": 0.11727745085954666, + "learning_rate": 9.61968253968254e-06, + "loss": 0.6778, "step": 2350 }, { - "epoch": 2.2, - "grad_norm": 0.04520571604371071, - "learning_rate": 4.447896074555211e-06, - "loss": 0.5066, + "epoch": 6.014857142857143, + "grad_norm": 0.6909122467041016, + "learning_rate": 9.613333333333335e-06, + "loss": 0.2815, "step": 2360 }, { - "epoch": 2.2, - "eval_accuracy": 0.8104026845637584, - "eval_loss": 0.8862009644508362, - "eval_runtime": 319.8982, - "eval_samples_per_second": 1.863, - "eval_steps_per_second": 1.863, - "step": 2361 - }, - { - "epoch": 3.0, - "grad_norm": 0.06622195243835449, - "learning_rate": 4.419655464558035e-06, - "loss": 0.4614, + "epoch": 6.015428571428571, + "grad_norm": 0.31187936663627625, + "learning_rate": 9.606984126984128e-06, + "loss": 0.418, "step": 2370 }, { - "epoch": 3.0, - "grad_norm": 24.007558822631836, - "learning_rate": 4.3914148545608585e-06, - "loss": 0.463, + "epoch": 6.016, + "grad_norm": 12.6905517578125, + "learning_rate": 9.600634920634922e-06, + "loss": 0.6797, "step": 2380 }, { - "epoch": 3.01, - "grad_norm": 74.15673065185547, - "learning_rate": 4.363174244563683e-06, - "loss": 0.4436, + "epoch": 6.016571428571429, + "grad_norm": 14.10185432434082, + "learning_rate": 9.594285714285715e-06, + "loss": 0.2239, "step": 2390 }, { - "epoch": 3.01, - "grad_norm": 0.021963810548186302, - "learning_rate": 4.334933634566507e-06, - "loss": 0.0038, + "epoch": 6.017142857142857, + "grad_norm": 0.6114095449447632, + "learning_rate": 9.587936507936509e-06, + "loss": 0.4369, "step": 2400 }, { - "epoch": 3.01, - "grad_norm": 0.015897750854492188, - "learning_rate": 4.306693024569331e-06, - "loss": 0.0044, + "epoch": 6.017714285714286, + "grad_norm": 86.14923095703125, + "learning_rate": 9.581587301587303e-06, + "loss": 0.7966, "step": 2410 }, { - "epoch": 3.01, - "grad_norm": 0.19425620138645172, - "learning_rate": 4.278452414572155e-06, - "loss": 1.5548, + "epoch": 6.018285714285715, + "grad_norm": 136.2497100830078, + "learning_rate": 9.575238095238096e-06, + "loss": 0.8023, "step": 2420 }, { - "epoch": 3.02, - "grad_norm": 0.2513945400714874, - "learning_rate": 4.250211804574979e-06, - "loss": 0.5131, + "epoch": 6.018857142857143, + "grad_norm": 22.98550796508789, + "learning_rate": 9.56888888888889e-06, + "loss": 0.2369, "step": 2430 }, { - "epoch": 3.02, - "grad_norm": 0.010541065596044064, - "learning_rate": 4.221971194577803e-06, - "loss": 0.0067, + "epoch": 6.019428571428572, + "grad_norm": 0.261092871427536, + "learning_rate": 9.562539682539683e-06, + "loss": 0.5792, "step": 2440 }, { - "epoch": 3.02, - "grad_norm": 0.016293305903673172, - "learning_rate": 4.193730584580628e-06, - "loss": 0.9886, + "epoch": 6.02, + "grad_norm": 0.12631256878376007, + "learning_rate": 9.556190476190477e-06, + "loss": 0.1468, "step": 2450 }, { - "epoch": 3.03, - "grad_norm": 0.009771335870027542, - "learning_rate": 4.165489974583451e-06, - "loss": 0.0052, + "epoch": 6.02, + "eval_accuracy": 0.8316498316498316, + "eval_loss": 0.770273745059967, + "eval_runtime": 137.4635, + "eval_samples_per_second": 2.161, + "eval_steps_per_second": 1.084, + "step": 2450 + }, + { + "epoch": 7.000571428571429, + "grad_norm": 0.15533775091171265, + "learning_rate": 9.54984126984127e-06, + "loss": 0.6611, "step": 2460 }, { - "epoch": 3.03, - "grad_norm": 22.732620239257812, - "learning_rate": 4.1372493645862754e-06, - "loss": 0.9754, + "epoch": 7.001142857142857, + "grad_norm": 15.251904487609863, + "learning_rate": 9.543492063492064e-06, + "loss": 0.7168, "step": 2470 }, { - "epoch": 3.03, - "grad_norm": 0.01741587370634079, - "learning_rate": 4.1090087545891e-06, - "loss": 0.007, + "epoch": 7.001714285714286, + "grad_norm": 0.15305683016777039, + "learning_rate": 9.537142857142859e-06, + "loss": 0.2702, "step": 2480 }, { - "epoch": 3.03, - "grad_norm": 0.011250239796936512, - "learning_rate": 4.080768144591923e-06, - "loss": 1.2522, + "epoch": 7.002285714285715, + "grad_norm": 0.5726995468139648, + "learning_rate": 9.530793650793651e-06, + "loss": 0.2093, "step": 2490 }, { - "epoch": 3.04, - "grad_norm": 35.29149627685547, - "learning_rate": 4.0525275345947475e-06, - "loss": 1.0829, + "epoch": 7.002857142857143, + "grad_norm": 0.23029519617557526, + "learning_rate": 9.524444444444445e-06, + "loss": 0.6895, "step": 2500 }, { - "epoch": 3.04, - "grad_norm": 0.2253457009792328, - "learning_rate": 4.024286924597572e-06, - "loss": 2.0893, + "epoch": 7.003428571428572, + "grad_norm": 0.13955456018447876, + "learning_rate": 9.518095238095238e-06, + "loss": 0.8783, "step": 2510 }, { - "epoch": 3.04, - "grad_norm": 0.07997630536556244, - "learning_rate": 3.996046314600396e-06, - "loss": 0.5229, + "epoch": 7.004, + "grad_norm": 0.01913376897573471, + "learning_rate": 9.511746031746032e-06, + "loss": 0.1524, "step": 2520 }, { - "epoch": 3.04, - "grad_norm": 0.24757777154445648, - "learning_rate": 3.9678057046032195e-06, - "loss": 0.0078, + "epoch": 7.0045714285714284, + "grad_norm": 0.20704668760299683, + "learning_rate": 9.505396825396827e-06, + "loss": 1.607, "step": 2530 }, { - "epoch": 3.05, - "grad_norm": 0.20268729329109192, - "learning_rate": 3.939565094606044e-06, - "loss": 0.007, + "epoch": 7.005142857142857, + "grad_norm": 12.19189167022705, + "learning_rate": 9.49904761904762e-06, + "loss": 0.5308, "step": 2540 }, { - "epoch": 3.05, - "grad_norm": 0.18007797002792358, - "learning_rate": 3.911324484608868e-06, - "loss": 0.0057, + "epoch": 7.005714285714285, + "grad_norm": 0.30487680435180664, + "learning_rate": 9.492698412698414e-06, + "loss": 0.4346, "step": 2550 }, { - "epoch": 3.05, - "grad_norm": 0.04412233456969261, - "learning_rate": 3.8830838746116915e-06, - "loss": 0.6502, + "epoch": 7.006285714285714, + "grad_norm": 0.047250378876924515, + "learning_rate": 9.486349206349206e-06, + "loss": 0.4239, "step": 2560 }, { - "epoch": 3.05, - "grad_norm": 0.2677147686481476, - "learning_rate": 3.854843264614516e-06, - "loss": 1.0556, + "epoch": 7.006857142857143, + "grad_norm": 0.45693501830101013, + "learning_rate": 9.48e-06, + "loss": 0.477, "step": 2570 }, { - "epoch": 3.06, - "grad_norm": 0.13710559904575348, - "learning_rate": 3.82660265461734e-06, - "loss": 0.6048, + "epoch": 7.007428571428571, + "grad_norm": 0.449886292219162, + "learning_rate": 9.473650793650795e-06, + "loss": 0.8766, "step": 2580 }, { - "epoch": 3.06, - "grad_norm": 0.14919081330299377, - "learning_rate": 3.798362044620164e-06, - "loss": 1.0146, + "epoch": 7.008, + "grad_norm": 0.1275578886270523, + "learning_rate": 9.467301587301588e-06, + "loss": 0.4449, "step": 2590 }, { - "epoch": 3.06, - "grad_norm": 0.06460587680339813, - "learning_rate": 3.7701214346229882e-06, - "loss": 0.0037, + "epoch": 7.008571428571429, + "grad_norm": 0.08520308881998062, + "learning_rate": 9.460952380952382e-06, + "loss": 0.4128, "step": 2600 }, { - "epoch": 3.06, - "grad_norm": 22.712013244628906, - "learning_rate": 3.741880824625812e-06, - "loss": 1.1229, + "epoch": 7.009142857142857, + "grad_norm": 18.56478500366211, + "learning_rate": 9.454603174603175e-06, + "loss": 0.6254, "step": 2610 }, { - "epoch": 3.07, - "grad_norm": 0.03939145430922508, - "learning_rate": 3.7136402146286364e-06, - "loss": 0.5016, + "epoch": 7.009714285714286, + "grad_norm": 0.32579270005226135, + "learning_rate": 9.448253968253969e-06, + "loss": 0.1916, "step": 2620 }, { - "epoch": 3.07, - "grad_norm": 0.179279625415802, - "learning_rate": 3.6853996046314607e-06, - "loss": 0.991, + "epoch": 7.010285714285715, + "grad_norm": 0.2691892087459564, + "learning_rate": 9.441904761904762e-06, + "loss": 0.4455, "step": 2630 }, { - "epoch": 3.07, - "grad_norm": 0.16804170608520508, - "learning_rate": 3.6571589946342845e-06, - "loss": 0.0079, + "epoch": 7.010857142857143, + "grad_norm": 0.18024860322475433, + "learning_rate": 9.435555555555556e-06, + "loss": 0.2448, "step": 2640 }, { - "epoch": 3.07, - "grad_norm": 0.20420299470424652, - "learning_rate": 3.6289183846371084e-06, - "loss": 0.9568, + "epoch": 7.011428571428572, + "grad_norm": 12.539464950561523, + "learning_rate": 9.42920634920635e-06, + "loss": 0.6483, "step": 2650 }, { - "epoch": 3.08, - "grad_norm": 0.22263668477535248, - "learning_rate": 3.6006777746399323e-06, - "loss": 0.4772, + "epoch": 7.012, + "grad_norm": 24.065021514892578, + "learning_rate": 9.422857142857143e-06, + "loss": 1.1989, "step": 2660 }, { - "epoch": 3.08, - "grad_norm": 22.77140998840332, - "learning_rate": 3.5724371646427566e-06, - "loss": 1.9533, + "epoch": 7.0125714285714285, + "grad_norm": 0.15510013699531555, + "learning_rate": 9.416507936507937e-06, + "loss": 0.4549, "step": 2670 }, { - "epoch": 3.08, - "grad_norm": 0.27024608850479126, - "learning_rate": 3.544196554645581e-06, - "loss": 2.2664, + "epoch": 7.013142857142857, + "grad_norm": 0.21933165192604065, + "learning_rate": 9.41015873015873e-06, + "loss": 0.4212, "step": 2680 }, { - "epoch": 3.08, - "grad_norm": 0.4369601905345917, - "learning_rate": 3.5159559446484047e-06, - "loss": 1.1402, + "epoch": 7.013714285714285, + "grad_norm": 46.31543731689453, + "learning_rate": 9.403809523809526e-06, + "loss": 0.6977, "step": 2690 }, { - "epoch": 3.09, - "grad_norm": 0.055966880172491074, - "learning_rate": 3.4877153346512286e-06, - "loss": 0.0093, + "epoch": 7.014285714285714, + "grad_norm": 0.49816974997520447, + "learning_rate": 9.397460317460319e-06, + "loss": 0.7104, "step": 2700 }, { - "epoch": 3.09, - "grad_norm": 0.31162434816360474, - "learning_rate": 3.459474724654053e-06, - "loss": 0.0104, + "epoch": 7.014857142857143, + "grad_norm": 0.6536113023757935, + "learning_rate": 9.391111111111111e-06, + "loss": 0.7153, "step": 2710 }, { - "epoch": 3.09, - "grad_norm": 0.09564479440450668, - "learning_rate": 3.4312341146568767e-06, - "loss": 0.4793, + "epoch": 7.015428571428571, + "grad_norm": 40.60562515258789, + "learning_rate": 9.384761904761906e-06, + "loss": 0.3319, "step": 2720 }, { - "epoch": 3.09, - "grad_norm": 0.20361103117465973, - "learning_rate": 3.4029935046597006e-06, - "loss": 0.9354, + "epoch": 7.016, + "grad_norm": 13.018871307373047, + "learning_rate": 9.378412698412698e-06, + "loss": 0.2629, "step": 2730 }, { - "epoch": 3.1, - "grad_norm": 0.03742867708206177, - "learning_rate": 3.3747528946625253e-06, - "loss": 0.0062, + "epoch": 7.016571428571429, + "grad_norm": 0.34966105222702026, + "learning_rate": 9.372063492063492e-06, + "loss": 0.9186, "step": 2740 }, { - "epoch": 3.1, - "grad_norm": 0.17758674919605255, - "learning_rate": 3.346512284665349e-06, - "loss": 0.9682, + "epoch": 7.017142857142857, + "grad_norm": 57.22317123413086, + "learning_rate": 9.365714285714287e-06, + "loss": 1.1176, "step": 2750 }, { - "epoch": 3.1, - "grad_norm": 0.04006683826446533, - "learning_rate": 3.318271674668173e-06, - "loss": 0.6539, + "epoch": 7.017714285714286, + "grad_norm": 2.8083252906799316, + "learning_rate": 9.359365079365081e-06, + "loss": 0.5711, "step": 2760 }, { - "epoch": 3.1, - "grad_norm": 0.2262236326932907, - "learning_rate": 3.290031064670997e-06, - "loss": 0.472, + "epoch": 7.018285714285715, + "grad_norm": 0.2513306140899658, + "learning_rate": 9.353015873015874e-06, + "loss": 0.3423, "step": 2770 }, { - "epoch": 3.11, - "grad_norm": 0.021157417446374893, - "learning_rate": 3.2617904546738212e-06, - "loss": 0.4522, + "epoch": 7.018857142857143, + "grad_norm": 0.24741508066654205, + "learning_rate": 9.346666666666666e-06, + "loss": 0.412, "step": 2780 }, { - "epoch": 3.11, - "grad_norm": 0.20334932208061218, - "learning_rate": 3.233549844676645e-06, - "loss": 0.0057, + "epoch": 7.019428571428572, + "grad_norm": 0.17685921490192413, + "learning_rate": 9.34031746031746e-06, + "loss": 0.8817, "step": 2790 }, { - "epoch": 3.11, - "grad_norm": 0.18717893958091736, - "learning_rate": 3.2053092346794694e-06, - "loss": 0.0058, + "epoch": 7.02, + "grad_norm": 0.2248346507549286, + "learning_rate": 9.333968253968255e-06, + "loss": 0.8406, + "step": 2800 + }, + { + "epoch": 7.02, + "eval_accuracy": 0.8484848484848485, + "eval_loss": 0.5862956047058105, + "eval_runtime": 137.2695, + "eval_samples_per_second": 2.164, + "eval_steps_per_second": 1.085, "step": 2800 }, { - "epoch": 3.11, - "grad_norm": 0.015843726694583893, - "learning_rate": 3.1770686246822937e-06, - "loss": 0.9782, + "epoch": 8.000571428571428, + "grad_norm": 0.10712999105453491, + "learning_rate": 9.32761904761905e-06, + "loss": 0.5924, "step": 2810 }, { - "epoch": 3.12, - "grad_norm": 304.4051208496094, - "learning_rate": 3.1488280146851175e-06, - "loss": 0.3817, + "epoch": 8.001142857142858, + "grad_norm": 0.37382298707962036, + "learning_rate": 9.321269841269842e-06, + "loss": 0.2847, "step": 2820 }, { - "epoch": 3.12, - "grad_norm": 22.5703067779541, - "learning_rate": 3.1205874046879414e-06, - "loss": 0.9501, + "epoch": 8.001714285714286, + "grad_norm": 0.08458230644464493, + "learning_rate": 9.314920634920636e-06, + "loss": 1.0296, "step": 2830 }, { - "epoch": 3.12, - "grad_norm": 0.015223432332277298, - "learning_rate": 3.0923467946907653e-06, - "loss": 1.3262, + "epoch": 8.002285714285714, + "grad_norm": 2.4656920433044434, + "learning_rate": 9.308571428571429e-06, + "loss": 0.4784, "step": 2840 }, { - "epoch": 3.12, - "grad_norm": 22.17841339111328, - "learning_rate": 3.06410618469359e-06, - "loss": 0.4495, + "epoch": 8.002857142857144, + "grad_norm": 10.618043899536133, + "learning_rate": 9.302222222222223e-06, + "loss": 0.733, "step": 2850 }, { - "epoch": 3.13, - "grad_norm": 0.2771930694580078, - "learning_rate": 3.035865574696414e-06, - "loss": 0.005, + "epoch": 8.003428571428572, + "grad_norm": 0.1708851009607315, + "learning_rate": 9.295873015873018e-06, + "loss": 0.2806, "step": 2860 }, { - "epoch": 3.13, - "grad_norm": 0.24659463763237, - "learning_rate": 3.0076249646992377e-06, - "loss": 1.0779, + "epoch": 8.004, + "grad_norm": 0.06309555470943451, + "learning_rate": 9.28952380952381e-06, + "loss": 0.5611, "step": 2870 }, { - "epoch": 3.13, - "grad_norm": 0.017306260764598846, - "learning_rate": 2.9793843547020616e-06, - "loss": 0.0052, + "epoch": 8.00457142857143, + "grad_norm": 0.46451622247695923, + "learning_rate": 9.283174603174605e-06, + "loss": 0.1565, "step": 2880 }, { - "epoch": 3.13, - "grad_norm": 0.16755558550357819, - "learning_rate": 2.951143744704886e-06, - "loss": 0.4679, + "epoch": 8.005142857142857, + "grad_norm": 13.138236999511719, + "learning_rate": 9.276825396825397e-06, + "loss": 0.7403, "step": 2890 }, { - "epoch": 3.14, - "grad_norm": 22.35555076599121, - "learning_rate": 2.9229031347077097e-06, - "loss": 1.6778, + "epoch": 8.005714285714285, + "grad_norm": 1.0157630443572998, + "learning_rate": 9.270476190476192e-06, + "loss": 0.3141, "step": 2900 }, { - "epoch": 3.14, - "grad_norm": 848.5310668945312, - "learning_rate": 2.894662524710534e-06, - "loss": 0.368, + "epoch": 8.006285714285715, + "grad_norm": 0.22003336250782013, + "learning_rate": 9.264126984126986e-06, + "loss": 0.7244, "step": 2910 }, { - "epoch": 3.14, - "grad_norm": 0.12664149701595306, - "learning_rate": 2.8664219147133583e-06, - "loss": 1.8596, + "epoch": 8.006857142857143, + "grad_norm": 0.22460892796516418, + "learning_rate": 9.257777777777779e-06, + "loss": 0.397, "step": 2920 }, { - "epoch": 3.14, - "grad_norm": 0.15273146331310272, - "learning_rate": 2.838181304716182e-06, - "loss": 0.0056, + "epoch": 8.007428571428571, + "grad_norm": 1.3341542482376099, + "learning_rate": 9.251428571428573e-06, + "loss": 0.9219, "step": 2930 }, { - "epoch": 3.15, - "grad_norm": 22.398298263549805, - "learning_rate": 2.809940694719006e-06, - "loss": 1.4325, + "epoch": 8.008, + "grad_norm": 0.180324986577034, + "learning_rate": 9.245079365079366e-06, + "loss": 1.0728, "step": 2940 }, { - "epoch": 3.15, - "grad_norm": 413.4339294433594, - "learning_rate": 2.78170008472183e-06, - "loss": 0.3167, + "epoch": 8.008571428571429, + "grad_norm": 3.0605599880218506, + "learning_rate": 9.23873015873016e-06, + "loss": 0.6721, "step": 2950 }, { - "epoch": 3.15, - "grad_norm": 0.19188903272151947, - "learning_rate": 2.753459474724654e-06, - "loss": 0.904, + "epoch": 8.009142857142857, + "grad_norm": 0.17587581276893616, + "learning_rate": 9.232380952380952e-06, + "loss": 0.3343, "step": 2960 }, { - "epoch": 3.15, - "grad_norm": 3.69321608543396, - "learning_rate": 2.7252188647274785e-06, - "loss": 0.007, + "epoch": 8.009714285714285, + "grad_norm": 1.3076510429382324, + "learning_rate": 9.226031746031747e-06, + "loss": 0.5037, "step": 2970 }, { - "epoch": 3.16, - "grad_norm": 0.2725941240787506, - "learning_rate": 2.6969782547303024e-06, - "loss": 0.9046, + "epoch": 8.010285714285715, + "grad_norm": 0.15406930446624756, + "learning_rate": 9.219682539682541e-06, + "loss": 0.4566, "step": 2980 }, { - "epoch": 3.16, - "grad_norm": 0.2501017451286316, - "learning_rate": 2.6687376447331266e-06, - "loss": 0.0085, + "epoch": 8.010857142857143, + "grad_norm": 12.370487213134766, + "learning_rate": 9.213333333333334e-06, + "loss": 0.6773, "step": 2990 }, { - "epoch": 3.16, - "grad_norm": 0.08356527239084244, - "learning_rate": 2.6404970347359505e-06, - "loss": 0.6352, + "epoch": 8.01142857142857, + "grad_norm": 24.417396545410156, + "learning_rate": 9.206984126984128e-06, + "loss": 0.8717, "step": 3000 }, { - "epoch": 3.16, - "grad_norm": 0.05544191226363182, - "learning_rate": 2.6122564247387744e-06, - "loss": 1.566, + "epoch": 8.012, + "grad_norm": 0.2499029040336609, + "learning_rate": 9.20063492063492e-06, + "loss": 0.9279, "step": 3010 }, { - "epoch": 3.17, - "grad_norm": 0.2636459469795227, - "learning_rate": 2.584015814741599e-06, - "loss": 0.9192, + "epoch": 8.012571428571428, + "grad_norm": 8.220014572143555, + "learning_rate": 9.194285714285715e-06, + "loss": 0.0187, "step": 3020 }, { - "epoch": 3.17, - "grad_norm": 0.05203928053379059, - "learning_rate": 2.555775204744423e-06, - "loss": 0.4324, + "epoch": 8.013142857142856, + "grad_norm": 0.12472938001155853, + "learning_rate": 9.18793650793651e-06, + "loss": 0.1618, "step": 3030 }, { - "epoch": 3.17, - "grad_norm": 0.16593405604362488, - "learning_rate": 2.527534594747247e-06, - "loss": 0.8602, + "epoch": 8.013714285714286, + "grad_norm": 12.041219711303711, + "learning_rate": 9.181587301587302e-06, + "loss": 0.7642, "step": 3040 }, { - "epoch": 3.18, - "grad_norm": 0.02371574193239212, - "learning_rate": 2.4992939847500707e-06, - "loss": 0.5104, + "epoch": 8.014285714285714, + "grad_norm": 0.26260077953338623, + "learning_rate": 9.175238095238096e-06, + "loss": 0.586, "step": 3050 }, { - "epoch": 3.18, - "grad_norm": 4.215577125549316, - "learning_rate": 2.4710533747528946e-06, - "loss": 0.6802, + "epoch": 8.014857142857142, + "grad_norm": 0.13079126179218292, + "learning_rate": 9.168888888888889e-06, + "loss": 0.4182, "step": 3060 }, { - "epoch": 3.18, - "grad_norm": 0.1291200816631317, - "learning_rate": 2.442812764755719e-06, - "loss": 0.0066, + "epoch": 8.015428571428572, + "grad_norm": 0.1269873082637787, + "learning_rate": 9.162539682539683e-06, + "loss": 0.2187, "step": 3070 }, { - "epoch": 3.18, - "grad_norm": 0.31712397933006287, - "learning_rate": 2.414572154758543e-06, - "loss": 0.876, + "epoch": 8.016, + "grad_norm": 0.14042362570762634, + "learning_rate": 9.156190476190478e-06, + "loss": 0.3016, "step": 3080 }, { - "epoch": 3.19, - "grad_norm": 320.600341796875, - "learning_rate": 2.386331544761367e-06, - "loss": 0.9575, + "epoch": 8.016571428571428, + "grad_norm": 13.570809364318848, + "learning_rate": 9.14984126984127e-06, + "loss": 0.8131, "step": 3090 }, { - "epoch": 3.19, - "grad_norm": 0.16467230021953583, - "learning_rate": 2.3580909347641913e-06, - "loss": 0.884, + "epoch": 8.017142857142858, + "grad_norm": 0.28256142139434814, + "learning_rate": 9.143492063492065e-06, + "loss": 0.2524, "step": 3100 }, { - "epoch": 3.19, - "grad_norm": 0.028689516708254814, - "learning_rate": 2.329850324767015e-06, - "loss": 0.0067, + "epoch": 8.017714285714286, + "grad_norm": 0.08613581210374832, + "learning_rate": 9.137142857142857e-06, + "loss": 0.9875, "step": 3110 }, { - "epoch": 3.19, - "grad_norm": 0.3429773151874542, - "learning_rate": 2.3016097147698394e-06, - "loss": 1.0181, + "epoch": 8.018285714285714, + "grad_norm": 29.147151947021484, + "learning_rate": 9.130793650793652e-06, + "loss": 0.6377, "step": 3120 }, { - "epoch": 3.2, - "grad_norm": 0.039928894490003586, - "learning_rate": 2.2733691047726633e-06, - "loss": 1.1309, + "epoch": 8.018857142857144, + "grad_norm": 119.630126953125, + "learning_rate": 9.124444444444444e-06, + "loss": 0.1833, "step": 3130 }, { - "epoch": 3.2, - "grad_norm": 0.0716266855597496, - "learning_rate": 2.245128494775487e-06, - "loss": 0.0055, + "epoch": 8.019428571428572, + "grad_norm": 0.17084644734859467, + "learning_rate": 9.118095238095239e-06, + "loss": 0.9565, "step": 3140 }, { - "epoch": 3.2, - "eval_accuracy": 0.8456375838926175, - "eval_loss": 0.7402271628379822, - "eval_runtime": 327.8476, - "eval_samples_per_second": 1.818, - "eval_steps_per_second": 1.818, - "step": 3148 + "epoch": 8.02, + "grad_norm": 12.522076606750488, + "learning_rate": 9.111746031746033e-06, + "loss": 0.4485, + "step": 3150 }, { - "epoch": 4.0, - "grad_norm": 0.2870597243309021, - "learning_rate": 2.2168878847783115e-06, - "loss": 0.9632, + "epoch": 8.02, + "eval_accuracy": 0.8383838383838383, + "eval_loss": 0.6601792573928833, + "eval_runtime": 137.764, + "eval_samples_per_second": 2.156, + "eval_steps_per_second": 1.082, "step": 3150 }, { - "epoch": 4.0, - "grad_norm": 0.29839763045310974, - "learning_rate": 2.1886472747811353e-06, - "loss": 0.5113, + "epoch": 9.000571428571428, + "grad_norm": 0.1897997260093689, + "learning_rate": 9.105396825396826e-06, + "loss": 0.0101, "step": 3160 }, { - "epoch": 4.01, - "grad_norm": 0.14275802671909332, - "learning_rate": 2.1604066647839596e-06, - "loss": 1.0579, + "epoch": 9.001142857142858, + "grad_norm": 26.98958396911621, + "learning_rate": 9.09904761904762e-06, + "loss": 1.1643, "step": 3170 }, { - "epoch": 4.01, - "grad_norm": 25.018110275268555, - "learning_rate": 2.1321660547867835e-06, - "loss": 1.0512, + "epoch": 9.001714285714286, + "grad_norm": 0.6562146544456482, + "learning_rate": 9.092698412698412e-06, + "loss": 0.5971, "step": 3180 }, { - "epoch": 4.01, - "grad_norm": 0.07139433175325394, - "learning_rate": 2.1039254447896078e-06, - "loss": 0.0045, + "epoch": 9.002285714285714, + "grad_norm": 6.7756476402282715, + "learning_rate": 9.086349206349207e-06, + "loss": 0.448, "step": 3190 }, { - "epoch": 4.01, - "grad_norm": 0.07226335257291794, - "learning_rate": 2.0756848347924316e-06, - "loss": 0.1377, + "epoch": 9.002857142857144, + "grad_norm": 0.3257388472557068, + "learning_rate": 9.080000000000001e-06, + "loss": 0.3954, "step": 3200 }, { - "epoch": 4.02, - "grad_norm": 0.05169367790222168, - "learning_rate": 2.047444224795256e-06, - "loss": 0.0114, + "epoch": 9.003428571428572, + "grad_norm": 0.04351123794913292, + "learning_rate": 9.073650793650794e-06, + "loss": 0.2449, "step": 3210 }, { - "epoch": 4.02, - "grad_norm": 0.2018321454524994, - "learning_rate": 2.01920361479808e-06, - "loss": 0.5378, + "epoch": 9.004, + "grad_norm": 0.12862706184387207, + "learning_rate": 9.067301587301588e-06, + "loss": 0.238, "step": 3220 }, { - "epoch": 4.02, - "grad_norm": 0.023068850859999657, - "learning_rate": 1.9909630048009037e-06, - "loss": 0.5436, + "epoch": 9.00457142857143, + "grad_norm": 0.30041778087615967, + "learning_rate": 9.06095238095238e-06, + "loss": 0.0238, "step": 3230 }, { - "epoch": 4.02, - "grad_norm": 0.12592564523220062, - "learning_rate": 1.962722394803728e-06, - "loss": 1.2217, + "epoch": 9.005142857142857, + "grad_norm": 12.429978370666504, + "learning_rate": 9.054603174603175e-06, + "loss": 1.2523, "step": 3240 }, { - "epoch": 4.03, - "grad_norm": 81.48845672607422, - "learning_rate": 1.934481784806552e-06, - "loss": 0.9742, + "epoch": 9.005714285714285, + "grad_norm": 12.339615821838379, + "learning_rate": 9.04825396825397e-06, + "loss": 0.2253, "step": 3250 }, { - "epoch": 4.03, - "grad_norm": 0.19448934495449066, - "learning_rate": 1.906241174809376e-06, - "loss": 0.4926, + "epoch": 9.006285714285715, + "grad_norm": 0.3683285117149353, + "learning_rate": 9.041904761904762e-06, + "loss": 0.5474, "step": 3260 }, { - "epoch": 4.03, - "grad_norm": 0.3121252655982971, - "learning_rate": 1.8780005648122002e-06, - "loss": 0.4974, + "epoch": 9.006857142857143, + "grad_norm": 0.29264554381370544, + "learning_rate": 9.035555555555556e-06, + "loss": 0.0913, "step": 3270 }, { - "epoch": 4.03, - "grad_norm": 0.3977165222167969, - "learning_rate": 1.849759954815024e-06, - "loss": 0.5231, + "epoch": 9.007428571428571, + "grad_norm": 0.14638611674308777, + "learning_rate": 9.029206349206349e-06, + "loss": 0.0075, "step": 3280 }, { - "epoch": 4.04, - "grad_norm": 0.19376535713672638, - "learning_rate": 1.8215193448178481e-06, - "loss": 1.344, + "epoch": 9.008, + "grad_norm": 0.13496576249599457, + "learning_rate": 9.022857142857143e-06, + "loss": 0.3115, "step": 3290 }, { - "epoch": 4.04, - "grad_norm": 0.12233041226863861, - "learning_rate": 1.7932787348206724e-06, - "loss": 0.0042, + "epoch": 9.008571428571429, + "grad_norm": 0.09361585974693298, + "learning_rate": 9.016507936507938e-06, + "loss": 0.5494, "step": 3300 }, { - "epoch": 4.04, - "grad_norm": 1.1810427904129028, - "learning_rate": 1.7650381248234963e-06, - "loss": 1.0738, + "epoch": 9.009142857142857, + "grad_norm": 0.09114635735750198, + "learning_rate": 9.010158730158732e-06, + "loss": 0.4454, "step": 3310 }, { - "epoch": 4.04, - "grad_norm": 0.048179544508457184, - "learning_rate": 1.7367975148263202e-06, - "loss": 0.4614, + "epoch": 9.009714285714285, + "grad_norm": 107.50813293457031, + "learning_rate": 9.003809523809525e-06, + "loss": 0.8347, "step": 3320 }, { - "epoch": 4.05, - "grad_norm": 0.058793701231479645, - "learning_rate": 1.7085569048291444e-06, - "loss": 0.4894, + "epoch": 9.010285714285715, + "grad_norm": 0.29273521900177, + "learning_rate": 8.997460317460317e-06, + "loss": 0.5854, "step": 3330 }, { - "epoch": 4.05, - "grad_norm": 0.033707864582538605, - "learning_rate": 1.6803162948319685e-06, - "loss": 1.4924, + "epoch": 9.010857142857143, + "grad_norm": 18.072633743286133, + "learning_rate": 8.991111111111112e-06, + "loss": 0.7517, "step": 3340 }, { - "epoch": 4.05, - "grad_norm": 0.12816235423088074, - "learning_rate": 1.6520756848347926e-06, - "loss": 0.0034, + "epoch": 9.01142857142857, + "grad_norm": 0.3977367877960205, + "learning_rate": 8.984761904761904e-06, + "loss": 0.7607, "step": 3350 }, { - "epoch": 4.05, - "grad_norm": 0.07232780009508133, - "learning_rate": 1.6238350748376167e-06, - "loss": 0.4661, + "epoch": 9.012, + "grad_norm": 0.37464699149131775, + "learning_rate": 8.9784126984127e-06, + "loss": 0.659, "step": 3360 }, { - "epoch": 4.06, - "grad_norm": 0.027900898829102516, - "learning_rate": 1.5955944648404405e-06, - "loss": 0.671, + "epoch": 9.012571428571428, + "grad_norm": 0.13560600578784943, + "learning_rate": 8.972063492063493e-06, + "loss": 0.3875, "step": 3370 }, { - "epoch": 4.06, - "grad_norm": 0.1532682329416275, - "learning_rate": 1.5673538548432648e-06, - "loss": 0.4657, + "epoch": 9.013142857142856, + "grad_norm": 0.609217643737793, + "learning_rate": 8.965714285714287e-06, + "loss": 0.8813, "step": 3380 }, { - "epoch": 4.06, - "grad_norm": 0.3767637014389038, - "learning_rate": 1.539113244846089e-06, - "loss": 0.0093, + "epoch": 9.013714285714286, + "grad_norm": 0.6446564793586731, + "learning_rate": 8.95936507936508e-06, + "loss": 0.3148, "step": 3390 }, { - "epoch": 4.06, - "grad_norm": 23.31383514404297, - "learning_rate": 1.5108726348489128e-06, - "loss": 1.4704, + "epoch": 9.014285714285714, + "grad_norm": 48.195072174072266, + "learning_rate": 8.953015873015874e-06, + "loss": 1.2333, "step": 3400 }, { - "epoch": 4.07, - "grad_norm": 113.44857788085938, - "learning_rate": 1.482632024851737e-06, - "loss": 1.4151, + "epoch": 9.014857142857142, + "grad_norm": 0.19967958331108093, + "learning_rate": 8.946666666666669e-06, + "loss": 0.3988, "step": 3410 }, { - "epoch": 4.07, - "grad_norm": 0.1415228694677353, - "learning_rate": 1.454391414854561e-06, - "loss": 0.0069, + "epoch": 9.015428571428572, + "grad_norm": 0.22969119250774384, + "learning_rate": 8.940317460317461e-06, + "loss": 0.2877, "step": 3420 }, { - "epoch": 4.07, - "grad_norm": 0.12466636300086975, - "learning_rate": 1.426150804857385e-06, - "loss": 0.4532, + "epoch": 9.016, + "grad_norm": 14.168766975402832, + "learning_rate": 8.933968253968256e-06, + "loss": 1.2265, "step": 3430 }, { - "epoch": 4.07, - "grad_norm": 0.2385646253824234, - "learning_rate": 1.397910194860209e-06, - "loss": 0.0064, + "epoch": 9.016571428571428, + "grad_norm": 0.15207406878471375, + "learning_rate": 8.927619047619048e-06, + "loss": 0.4666, "step": 3440 }, { - "epoch": 4.08, - "grad_norm": 0.0742715522646904, - "learning_rate": 1.3696695848630332e-06, - "loss": 0.0047, + "epoch": 9.017142857142858, + "grad_norm": 0.09754037857055664, + "learning_rate": 8.921269841269842e-06, + "loss": 0.7216, "step": 3450 }, { - "epoch": 4.08, - "grad_norm": 1417.8148193359375, - "learning_rate": 1.341428974865857e-06, - "loss": 0.2286, + "epoch": 9.017714285714286, + "grad_norm": 13.291885375976562, + "learning_rate": 8.914920634920635e-06, + "loss": 0.4949, "step": 3460 }, { - "epoch": 4.08, - "grad_norm": 0.026701288297772408, - "learning_rate": 1.3131883648686813e-06, - "loss": 0.4781, + "epoch": 9.018285714285714, + "grad_norm": 0.16420067846775055, + "learning_rate": 8.90857142857143e-06, + "loss": 0.1193, "step": 3470 }, { - "epoch": 4.08, - "grad_norm": 24.302400588989258, - "learning_rate": 1.2849477548715054e-06, - "loss": 1.4506, + "epoch": 9.018857142857144, + "grad_norm": 1.3726239204406738, + "learning_rate": 8.902222222222224e-06, + "loss": 0.4908, "step": 3480 }, { - "epoch": 4.09, - "grad_norm": 0.20589368045330048, - "learning_rate": 1.2567071448743293e-06, - "loss": 0.5483, + "epoch": 9.019428571428572, + "grad_norm": 12.551621437072754, + "learning_rate": 8.895873015873016e-06, + "loss": 1.2967, "step": 3490 }, { - "epoch": 4.09, - "grad_norm": 23.55211067199707, - "learning_rate": 1.2284665348771536e-06, - "loss": 1.104, + "epoch": 9.02, + "grad_norm": 0.05716940015554428, + "learning_rate": 8.88952380952381e-06, + "loss": 0.0134, + "step": 3500 + }, + { + "epoch": 9.02, + "eval_accuracy": 0.8316498316498316, + "eval_loss": 0.6907000541687012, + "eval_runtime": 137.8377, + "eval_samples_per_second": 2.155, + "eval_steps_per_second": 1.081, "step": 3500 }, { - "epoch": 4.09, - "grad_norm": 0.11621620506048203, - "learning_rate": 1.2002259248799774e-06, - "loss": 0.6127, + "epoch": 10.000571428571428, + "grad_norm": 63.84096145629883, + "learning_rate": 8.883174603174603e-06, + "loss": 1.0078, "step": 3510 }, { - "epoch": 4.09, - "grad_norm": 0.12657718360424042, - "learning_rate": 1.1719853148828015e-06, - "loss": 0.4753, + "epoch": 10.001142857142858, + "grad_norm": 25.32430076599121, + "learning_rate": 8.876825396825398e-06, + "loss": 1.2722, "step": 3520 }, { - "epoch": 4.1, - "grad_norm": 62.24040222167969, - "learning_rate": 1.1437447048856256e-06, - "loss": 0.9395, + "epoch": 10.001714285714286, + "grad_norm": 126.55713653564453, + "learning_rate": 8.870476190476192e-06, + "loss": 0.5043, "step": 3530 }, { - "epoch": 4.1, - "grad_norm": 0.11141040176153183, - "learning_rate": 1.1155040948884497e-06, - "loss": 0.4753, + "epoch": 10.002285714285714, + "grad_norm": 0.31926584243774414, + "learning_rate": 8.864126984126985e-06, + "loss": 0.122, "step": 3540 }, { - "epoch": 4.1, - "grad_norm": 23.662633895874023, - "learning_rate": 1.0872634848912737e-06, - "loss": 0.472, + "epoch": 10.002857142857144, + "grad_norm": 75.52194213867188, + "learning_rate": 8.857777777777779e-06, + "loss": 0.579, "step": 3550 }, { - "epoch": 4.1, - "grad_norm": 0.029395530000329018, - "learning_rate": 1.0590228748940978e-06, - "loss": 0.0064, + "epoch": 10.003428571428572, + "grad_norm": 59.787635803222656, + "learning_rate": 8.851428571428572e-06, + "loss": 0.5681, "step": 3560 }, { - "epoch": 4.11, - "grad_norm": 0.03742952272295952, - "learning_rate": 1.0307822648969219e-06, - "loss": 0.469, + "epoch": 10.004, + "grad_norm": 0.1938324123620987, + "learning_rate": 8.845079365079366e-06, + "loss": 0.7633, "step": 3570 }, { - "epoch": 4.11, - "grad_norm": 0.08150259405374527, - "learning_rate": 1.002541654899746e-06, - "loss": 0.4674, + "epoch": 10.00457142857143, + "grad_norm": 12.319451332092285, + "learning_rate": 8.83873015873016e-06, + "loss": 1.1426, "step": 3580 }, { - "epoch": 4.11, - "grad_norm": 0.05578187480568886, - "learning_rate": 9.7430104490257e-07, - "loss": 0.5241, + "epoch": 10.005142857142857, + "grad_norm": 12.19789981842041, + "learning_rate": 8.832380952380953e-06, + "loss": 0.4935, "step": 3590 }, { - "epoch": 4.11, - "grad_norm": 56.26320266723633, - "learning_rate": 9.460604349053941e-07, - "loss": 1.0151, + "epoch": 10.005714285714285, + "grad_norm": 0.08483655005693436, + "learning_rate": 8.826031746031747e-06, + "loss": 0.4319, "step": 3600 }, { - "epoch": 4.12, - "grad_norm": 0.1536935567855835, - "learning_rate": 9.178198249082181e-07, - "loss": 1.9666, + "epoch": 10.006285714285715, + "grad_norm": 0.2407812774181366, + "learning_rate": 8.81968253968254e-06, + "loss": 0.4388, "step": 3610 }, { - "epoch": 4.12, - "grad_norm": 2.32802677154541, - "learning_rate": 8.895792149110422e-07, - "loss": 0.4655, + "epoch": 10.006857142857143, + "grad_norm": 0.20790864527225494, + "learning_rate": 8.813333333333334e-06, + "loss": 0.6393, "step": 3620 }, { - "epoch": 4.12, - "grad_norm": 619.9100952148438, - "learning_rate": 8.613386049138663e-07, - "loss": 0.788, + "epoch": 10.007428571428571, + "grad_norm": 6.273917198181152, + "learning_rate": 8.806984126984127e-06, + "loss": 0.2051, "step": 3630 }, { - "epoch": 4.13, - "grad_norm": 34.0890007019043, - "learning_rate": 8.330979949166902e-07, - "loss": 1.3822, + "epoch": 10.008, + "grad_norm": 0.6706716418266296, + "learning_rate": 8.800634920634921e-06, + "loss": 0.6428, "step": 3640 }, { - "epoch": 4.13, - "grad_norm": 0.2547508180141449, - "learning_rate": 8.048573849195143e-07, - "loss": 0.8994, + "epoch": 10.008571428571429, + "grad_norm": 0.3309767544269562, + "learning_rate": 8.794285714285716e-06, + "loss": 0.2073, "step": 3650 }, { - "epoch": 4.13, - "grad_norm": 0.27960023283958435, - "learning_rate": 7.766167749223384e-07, - "loss": 0.5559, + "epoch": 10.009142857142857, + "grad_norm": 0.3042340576648712, + "learning_rate": 8.787936507936508e-06, + "loss": 0.7154, "step": 3660 }, { - "epoch": 4.13, - "grad_norm": 0.05300937965512276, - "learning_rate": 7.483761649251624e-07, - "loss": 0.4556, + "epoch": 10.009714285714285, + "grad_norm": 0.23488643765449524, + "learning_rate": 8.781587301587302e-06, + "loss": 0.2957, "step": 3670 }, { - "epoch": 4.14, - "grad_norm": 0.024076635017991066, - "learning_rate": 7.201355549279864e-07, - "loss": 0.0023, + "epoch": 10.010285714285715, + "grad_norm": 0.09738802909851074, + "learning_rate": 8.775238095238095e-06, + "loss": 0.0073, "step": 3680 }, { - "epoch": 4.14, - "grad_norm": 0.27708545327186584, - "learning_rate": 6.918949449308106e-07, - "loss": 1.2558, + "epoch": 10.010857142857143, + "grad_norm": 0.10398265719413757, + "learning_rate": 8.76888888888889e-06, + "loss": 0.8261, "step": 3690 }, { - "epoch": 4.14, - "grad_norm": 25.573650360107422, - "learning_rate": 6.636543349336347e-07, - "loss": 2.7475, + "epoch": 10.01142857142857, + "grad_norm": 0.1489027887582779, + "learning_rate": 8.762539682539684e-06, + "loss": 0.8143, "step": 3700 }, { - "epoch": 4.14, - "grad_norm": 0.28637179732322693, - "learning_rate": 6.354137249364587e-07, - "loss": 0.8946, + "epoch": 10.012, + "grad_norm": 0.2872755527496338, + "learning_rate": 8.756190476190476e-06, + "loss": 0.7231, "step": 3710 }, { - "epoch": 4.15, - "grad_norm": 0.1498030722141266, - "learning_rate": 6.071731149392827e-07, - "loss": 0.0068, + "epoch": 10.012571428571428, + "grad_norm": 0.08701366931200027, + "learning_rate": 8.74984126984127e-06, + "loss": 0.4884, "step": 3720 }, { - "epoch": 4.15, - "grad_norm": 0.03544044494628906, - "learning_rate": 5.789325049421068e-07, - "loss": 0.1509, + "epoch": 10.013142857142856, + "grad_norm": 0.18288986384868622, + "learning_rate": 8.743492063492063e-06, + "loss": 0.2193, "step": 3730 }, { - "epoch": 4.15, - "grad_norm": 0.2865348756313324, - "learning_rate": 5.506918949449309e-07, - "loss": 0.0056, + "epoch": 10.013714285714286, + "grad_norm": 0.14304602146148682, + "learning_rate": 8.737142857142858e-06, + "loss": 0.4951, "step": 3740 }, { - "epoch": 4.15, - "grad_norm": 0.6191418170928955, - "learning_rate": 5.224512849477549e-07, - "loss": 0.987, + "epoch": 10.014285714285714, + "grad_norm": 0.23201577365398407, + "learning_rate": 8.730793650793652e-06, + "loss": 0.3121, "step": 3750 }, { - "epoch": 4.16, - "grad_norm": 0.03451582416892052, - "learning_rate": 4.942106749505791e-07, - "loss": 0.5444, + "epoch": 10.014857142857142, + "grad_norm": 196.32400512695312, + "learning_rate": 8.724444444444445e-06, + "loss": 0.5922, "step": 3760 }, { - "epoch": 4.16, - "grad_norm": 0.3244670331478119, - "learning_rate": 4.6597006495340303e-07, - "loss": 0.6433, + "epoch": 10.015428571428572, + "grad_norm": 0.09482351690530777, + "learning_rate": 8.718095238095239e-06, + "loss": 0.4892, "step": 3770 }, { - "epoch": 4.16, - "grad_norm": 23.457420349121094, - "learning_rate": 4.3772945495622706e-07, - "loss": 1.4575, + "epoch": 10.016, + "grad_norm": 0.1551242470741272, + "learning_rate": 8.711746031746032e-06, + "loss": 0.523, "step": 3780 }, { - "epoch": 4.16, - "grad_norm": 0.2671697735786438, - "learning_rate": 4.094888449590512e-07, - "loss": 0.0069, + "epoch": 10.016571428571428, + "grad_norm": 0.15693677961826324, + "learning_rate": 8.705396825396826e-06, + "loss": 0.9376, "step": 3790 }, { - "epoch": 4.17, - "grad_norm": 0.0375777892768383, - "learning_rate": 3.812482349618752e-07, - "loss": 0.5398, + "epoch": 10.017142857142858, + "grad_norm": 0.38993221521377563, + "learning_rate": 8.69904761904762e-06, + "loss": 0.8241, "step": 3800 }, { - "epoch": 4.17, - "grad_norm": 0.7031461596488953, - "learning_rate": 3.530076249646993e-07, - "loss": 0.0067, + "epoch": 10.017714285714286, + "grad_norm": 0.5978397727012634, + "learning_rate": 8.692698412698413e-06, + "loss": 0.806, "step": 3810 }, { - "epoch": 4.17, - "grad_norm": 0.027200641110539436, - "learning_rate": 3.247670149675233e-07, - "loss": 0.0063, + "epoch": 10.018285714285714, + "grad_norm": 0.41364625096321106, + "learning_rate": 8.686349206349207e-06, + "loss": 0.5164, "step": 3820 }, { - "epoch": 4.17, - "grad_norm": 0.1259397715330124, - "learning_rate": 2.965264049703474e-07, - "loss": 0.619, + "epoch": 10.018857142857144, + "grad_norm": 0.32526934146881104, + "learning_rate": 8.68e-06, + "loss": 0.5054, "step": 3830 }, { - "epoch": 4.18, - "grad_norm": 0.12452813237905502, - "learning_rate": 2.682857949731714e-07, - "loss": 0.0046, + "epoch": 10.019428571428572, + "grad_norm": 0.6359074711799622, + "learning_rate": 8.673650793650794e-06, + "loss": 0.31, "step": 3840 }, { - "epoch": 4.18, - "grad_norm": 0.25807440280914307, - "learning_rate": 2.400451849759955e-07, - "loss": 0.6994, + "epoch": 10.02, + "grad_norm": 85.47923278808594, + "learning_rate": 8.667301587301587e-06, + "loss": 0.11, + "step": 3850 + }, + { + "epoch": 10.02, + "eval_accuracy": 0.8316498316498316, + "eval_loss": 0.7097596526145935, + "eval_runtime": 137.5142, + "eval_samples_per_second": 2.16, + "eval_steps_per_second": 1.084, "step": 3850 }, { - "epoch": 4.18, - "grad_norm": 0.13341209292411804, - "learning_rate": 2.1180457497881955e-07, - "loss": 0.5316, + "epoch": 11.000571428571428, + "grad_norm": 0.1688612848520279, + "learning_rate": 8.660952380952383e-06, + "loss": 0.4141, "step": 3860 }, { - "epoch": 4.18, - "grad_norm": 22.768707275390625, - "learning_rate": 1.8356396498164363e-07, - "loss": 1.5915, + "epoch": 11.001142857142858, + "grad_norm": 12.029210090637207, + "learning_rate": 8.654603174603176e-06, + "loss": 0.8705, "step": 3870 }, { - "epoch": 4.19, - "grad_norm": 0.050129234790802, - "learning_rate": 1.5532335498446768e-07, - "loss": 1.065, + "epoch": 11.001714285714286, + "grad_norm": 0.19282205402851105, + "learning_rate": 8.648253968253968e-06, + "loss": 0.3451, "step": 3880 }, { - "epoch": 4.19, - "grad_norm": 0.10654575377702713, - "learning_rate": 1.2708274498729173e-07, - "loss": 1.1427, + "epoch": 11.002285714285714, + "grad_norm": 12.639680862426758, + "learning_rate": 8.641904761904762e-06, + "loss": 0.9504, "step": 3890 }, { - "epoch": 4.19, - "grad_norm": 0.04145923629403114, - "learning_rate": 9.884213499011579e-08, - "loss": 0.0048, + "epoch": 11.002857142857144, + "grad_norm": 0.34344059228897095, + "learning_rate": 8.635555555555555e-06, + "loss": 0.3582, "step": 3900 }, { - "epoch": 4.19, - "grad_norm": 35.09569549560547, - "learning_rate": 7.060152499293986e-08, - "loss": 1.6677, + "epoch": 11.003428571428572, + "grad_norm": 193.6134033203125, + "learning_rate": 8.62920634920635e-06, + "loss": 0.1138, "step": 3910 }, { - "epoch": 4.2, - "grad_norm": 0.2241126149892807, - "learning_rate": 4.236091499576391e-08, - "loss": 0.4727, + "epoch": 11.004, + "grad_norm": 0.36762064695358276, + "learning_rate": 8.622857142857144e-06, + "loss": 0.556, "step": 3920 }, { - "epoch": 4.2, - "grad_norm": 0.043600454926490784, - "learning_rate": 1.4120304998587971e-08, - "loss": 0.0042, + "epoch": 11.00457142857143, + "grad_norm": 0.19313932955265045, + "learning_rate": 8.616507936507938e-06, + "loss": 0.5323, "step": 3930 }, { - "epoch": 4.2, - "eval_accuracy": 0.8104026845637584, - "eval_loss": 0.8931716680526733, - "eval_runtime": 327.1353, - "eval_samples_per_second": 1.822, - "eval_steps_per_second": 1.822, - "step": 3935 - }, - { - "epoch": 4.2, - "step": 3935, - "total_flos": 1.7278688550692782e+19, - "train_loss": 0.7462392025486373, - "train_runtime": 7487.9132, - "train_samples_per_second": 0.526, - "train_steps_per_second": 0.526 - }, - { - "epoch": 4.2, - "eval_accuracy": 0.8456375838926175, - "eval_loss": 0.7402271628379822, - "eval_runtime": 321.7128, - "eval_samples_per_second": 1.853, - "eval_steps_per_second": 1.853, - "step": 3935 - }, - { - "epoch": 4.2, - "eval_accuracy": 0.8456375838926175, - "eval_loss": 0.7402271032333374, - "eval_runtime": 328.0839, - "eval_samples_per_second": 1.817, - "eval_steps_per_second": 1.817, - "step": 3935 - } - ], - "logging_steps": 10, - "max_steps": 3935, - "num_input_tokens_seen": 0, - "num_train_epochs": 9223372036854775807, - "save_steps": 500, - "total_flos": 1.7278688550692782e+19, - "train_batch_size": 1, + "epoch": 11.005142857142857, + "grad_norm": 0.26507726311683655, + "learning_rate": 8.61015873015873e-06, + "loss": 0.4584, + "step": 3940 + }, + { + "epoch": 11.005714285714285, + "grad_norm": 0.09229809045791626, + "learning_rate": 8.603809523809525e-06, + "loss": 0.4409, + "step": 3950 + }, + { + "epoch": 11.006285714285715, + "grad_norm": 0.14882345497608185, + "learning_rate": 8.597460317460318e-06, + "loss": 0.0062, + "step": 3960 + }, + { + "epoch": 11.006857142857143, + "grad_norm": 12.128185272216797, + "learning_rate": 8.591111111111112e-06, + "loss": 0.2344, + "step": 3970 + }, + { + "epoch": 11.007428571428571, + "grad_norm": 0.22297517955303192, + "learning_rate": 8.584761904761906e-06, + "loss": 0.1457, + "step": 3980 + }, + { + "epoch": 11.008, + "grad_norm": 119.80109405517578, + "learning_rate": 8.578412698412699e-06, + "loss": 0.1712, + "step": 3990 + }, + { + "epoch": 11.008571428571429, + "grad_norm": 0.09167554974555969, + "learning_rate": 8.572063492063493e-06, + "loss": 0.5816, + "step": 4000 + }, + { + "epoch": 11.009142857142857, + "grad_norm": 21.545888900756836, + "learning_rate": 8.565714285714286e-06, + "loss": 0.9225, + "step": 4010 + }, + { + "epoch": 11.009714285714285, + "grad_norm": 0.07726138085126877, + "learning_rate": 8.55936507936508e-06, + "loss": 0.3713, + "step": 4020 + }, + { + "epoch": 11.010285714285715, + "grad_norm": 0.14826586842536926, + "learning_rate": 8.553015873015875e-06, + "loss": 0.2916, + "step": 4030 + }, + { + "epoch": 11.010857142857143, + "grad_norm": 13.104598045349121, + "learning_rate": 8.546666666666667e-06, + "loss": 1.1717, + "step": 4040 + }, + { + "epoch": 11.01142857142857, + "grad_norm": 0.14416873455047607, + "learning_rate": 8.540317460317462e-06, + "loss": 0.5383, + "step": 4050 + }, + { + "epoch": 11.012, + "grad_norm": 0.22047214210033417, + "learning_rate": 8.533968253968254e-06, + "loss": 0.4027, + "step": 4060 + }, + { + "epoch": 11.012571428571428, + "grad_norm": 1.8622279167175293, + "learning_rate": 8.527619047619049e-06, + "loss": 0.4155, + "step": 4070 + }, + { + "epoch": 11.013142857142856, + "grad_norm": 18.07794952392578, + "learning_rate": 8.521269841269843e-06, + "loss": 0.2927, + "step": 4080 + }, + { + "epoch": 11.013714285714286, + "grad_norm": 30.948429107666016, + "learning_rate": 8.514920634920636e-06, + "loss": 0.7502, + "step": 4090 + }, + { + "epoch": 11.014285714285714, + "grad_norm": 12.959928512573242, + "learning_rate": 8.50857142857143e-06, + "loss": 0.2332, + "step": 4100 + }, + { + "epoch": 11.014857142857142, + "grad_norm": 147.560546875, + "learning_rate": 8.502222222222223e-06, + "loss": 0.3589, + "step": 4110 + }, + { + "epoch": 11.015428571428572, + "grad_norm": 0.5723152756690979, + "learning_rate": 8.495873015873017e-06, + "loss": 0.7663, + "step": 4120 + }, + { + "epoch": 11.016, + "grad_norm": 0.13386109471321106, + "learning_rate": 8.48952380952381e-06, + "loss": 1.0655, + "step": 4130 + }, + { + "epoch": 11.016571428571428, + "grad_norm": 0.2565602660179138, + "learning_rate": 8.483174603174604e-06, + "loss": 0.6372, + "step": 4140 + }, + { + "epoch": 11.017142857142858, + "grad_norm": 0.517414391040802, + "learning_rate": 8.476825396825398e-06, + "loss": 0.2303, + "step": 4150 + }, + { + "epoch": 11.017714285714286, + "grad_norm": 0.18709756433963776, + "learning_rate": 8.47047619047619e-06, + "loss": 0.8909, + "step": 4160 + }, + { + "epoch": 11.018285714285714, + "grad_norm": 241.76470947265625, + "learning_rate": 8.464126984126985e-06, + "loss": 1.3007, + "step": 4170 + }, + { + "epoch": 11.018857142857144, + "grad_norm": 0.4402504861354828, + "learning_rate": 8.457777777777778e-06, + "loss": 0.1889, + "step": 4180 + }, + { + "epoch": 11.019428571428572, + "grad_norm": 0.13897043466567993, + "learning_rate": 8.451428571428572e-06, + "loss": 0.5257, + "step": 4190 + }, + { + "epoch": 11.02, + "grad_norm": 12.089799880981445, + "learning_rate": 8.445079365079366e-06, + "loss": 0.6557, + "step": 4200 + }, + { + "epoch": 11.02, + "eval_accuracy": 0.8383838383838383, + "eval_loss": 0.650736927986145, + "eval_runtime": 126.7199, + "eval_samples_per_second": 2.344, + "eval_steps_per_second": 1.176, + "step": 4200 + }, + { + "epoch": 12.000571428571428, + "grad_norm": 0.39973193407058716, + "learning_rate": 8.438730158730159e-06, + "loss": 0.4371, + "step": 4210 + }, + { + "epoch": 12.001142857142858, + "grad_norm": 11.976874351501465, + "learning_rate": 8.432380952380953e-06, + "loss": 0.4166, + "step": 4220 + }, + { + "epoch": 12.001714285714286, + "grad_norm": 0.3415261209011078, + "learning_rate": 8.426031746031746e-06, + "loss": 0.3675, + "step": 4230 + }, + { + "epoch": 12.002285714285714, + "grad_norm": 0.4809642732143402, + "learning_rate": 8.41968253968254e-06, + "loss": 0.2013, + "step": 4240 + }, + { + "epoch": 12.002857142857144, + "grad_norm": 0.682577908039093, + "learning_rate": 8.413333333333335e-06, + "loss": 0.2326, + "step": 4250 + }, + { + "epoch": 12.003428571428572, + "grad_norm": 0.03049817495048046, + "learning_rate": 8.406984126984127e-06, + "loss": 0.7267, + "step": 4260 + }, + { + "epoch": 12.004, + "grad_norm": 0.10638611763715744, + "learning_rate": 8.400634920634922e-06, + "loss": 0.8069, + "step": 4270 + }, + { + "epoch": 12.00457142857143, + "grad_norm": 0.041142355650663376, + "learning_rate": 8.394285714285714e-06, + "loss": 0.1764, + "step": 4280 + }, + { + "epoch": 12.005142857142857, + "grad_norm": 0.5415995121002197, + "learning_rate": 8.387936507936509e-06, + "loss": 1.0413, + "step": 4290 + }, + { + "epoch": 12.005714285714285, + "grad_norm": 0.4838305711746216, + "learning_rate": 8.381587301587303e-06, + "loss": 0.4062, + "step": 4300 + }, + { + "epoch": 12.006285714285715, + "grad_norm": 23.351930618286133, + "learning_rate": 8.375238095238096e-06, + "loss": 0.4338, + "step": 4310 + }, + { + "epoch": 12.006857142857143, + "grad_norm": 15.27731704711914, + "learning_rate": 8.36888888888889e-06, + "loss": 0.682, + "step": 4320 + }, + { + "epoch": 12.007428571428571, + "grad_norm": 0.21492832899093628, + "learning_rate": 8.362539682539683e-06, + "loss": 0.2962, + "step": 4330 + }, + { + "epoch": 12.008, + "grad_norm": 0.39623013138771057, + "learning_rate": 8.356190476190477e-06, + "loss": 0.4511, + "step": 4340 + }, + { + "epoch": 12.008571428571429, + "grad_norm": 0.8621640205383301, + "learning_rate": 8.34984126984127e-06, + "loss": 0.7211, + "step": 4350 + }, + { + "epoch": 12.009142857142857, + "grad_norm": 0.17645128071308136, + "learning_rate": 8.343492063492064e-06, + "loss": 0.609, + "step": 4360 + }, + { + "epoch": 12.009714285714285, + "grad_norm": 1.3618069887161255, + "learning_rate": 8.337142857142858e-06, + "loss": 0.1656, + "step": 4370 + }, + { + "epoch": 12.010285714285715, + "grad_norm": 12.654424667358398, + "learning_rate": 8.33079365079365e-06, + "loss": 0.715, + "step": 4380 + }, + { + "epoch": 12.010857142857143, + "grad_norm": 14.740551948547363, + "learning_rate": 8.324444444444445e-06, + "loss": 1.23, + "step": 4390 + }, + { + "epoch": 12.01142857142857, + "grad_norm": 33.95640182495117, + "learning_rate": 8.318095238095238e-06, + "loss": 0.6692, + "step": 4400 + }, + { + "epoch": 12.012, + "grad_norm": 7.696796417236328, + "learning_rate": 8.311746031746032e-06, + "loss": 0.8213, + "step": 4410 + }, + { + "epoch": 12.012571428571428, + "grad_norm": 0.5786144733428955, + "learning_rate": 8.305396825396826e-06, + "loss": 0.2544, + "step": 4420 + }, + { + "epoch": 12.013142857142856, + "grad_norm": 0.296314537525177, + "learning_rate": 8.29904761904762e-06, + "loss": 0.0166, + "step": 4430 + }, + { + "epoch": 12.013714285714286, + "grad_norm": 0.23542876541614532, + "learning_rate": 8.292698412698413e-06, + "loss": 0.2817, + "step": 4440 + }, + { + "epoch": 12.014285714285714, + "grad_norm": 16.194673538208008, + "learning_rate": 8.286349206349206e-06, + "loss": 0.7401, + "step": 4450 + }, + { + "epoch": 12.014857142857142, + "grad_norm": 0.1849094033241272, + "learning_rate": 8.28e-06, + "loss": 0.5863, + "step": 4460 + }, + { + "epoch": 12.015428571428572, + "grad_norm": 0.14007309079170227, + "learning_rate": 8.273650793650795e-06, + "loss": 1.4727, + "step": 4470 + }, + { + "epoch": 12.016, + "grad_norm": 74.8141098022461, + "learning_rate": 8.267301587301589e-06, + "loss": 0.4532, + "step": 4480 + }, + { + "epoch": 12.016571428571428, + "grad_norm": 0.1595589816570282, + "learning_rate": 8.260952380952382e-06, + "loss": 0.2142, + "step": 4490 + }, + { + "epoch": 12.017142857142858, + "grad_norm": 23.603548049926758, + "learning_rate": 8.254603174603176e-06, + "loss": 0.2174, + "step": 4500 + }, + { + "epoch": 12.017714285714286, + "grad_norm": 0.32662734389305115, + "learning_rate": 8.248253968253969e-06, + "loss": 0.4129, + "step": 4510 + }, + { + "epoch": 12.018285714285714, + "grad_norm": 15.74620532989502, + "learning_rate": 8.241904761904761e-06, + "loss": 0.5215, + "step": 4520 + }, + { + "epoch": 12.018857142857144, + "grad_norm": 0.10295995324850082, + "learning_rate": 8.235555555555557e-06, + "loss": 0.2909, + "step": 4530 + }, + { + "epoch": 12.019428571428572, + "grad_norm": 0.2319801300764084, + "learning_rate": 8.22920634920635e-06, + "loss": 1.1379, + "step": 4540 + }, + { + "epoch": 12.02, + "grad_norm": 0.1083206757903099, + "learning_rate": 8.222857142857144e-06, + "loss": 0.2642, + "step": 4550 + }, + { + "epoch": 12.02, + "eval_accuracy": 0.8518518518518519, + "eval_loss": 0.6555494070053101, + "eval_runtime": 127.2639, + "eval_samples_per_second": 2.334, + "eval_steps_per_second": 1.171, + "step": 4550 + }, + { + "epoch": 13.000571428571428, + "grad_norm": 0.09389404952526093, + "learning_rate": 8.216507936507937e-06, + "loss": 0.2617, + "step": 4560 + }, + { + "epoch": 13.001142857142858, + "grad_norm": 0.11154793202877045, + "learning_rate": 8.210158730158731e-06, + "loss": 0.0051, + "step": 4570 + }, + { + "epoch": 13.001714285714286, + "grad_norm": 0.1510355919599533, + "learning_rate": 8.203809523809526e-06, + "loss": 0.0096, + "step": 4580 + }, + { + "epoch": 13.002285714285714, + "grad_norm": 14.562790870666504, + "learning_rate": 8.197460317460318e-06, + "loss": 1.4925, + "step": 4590 + }, + { + "epoch": 13.002857142857144, + "grad_norm": 0.1331201046705246, + "learning_rate": 8.191111111111112e-06, + "loss": 0.5703, + "step": 4600 + }, + { + "epoch": 13.003428571428572, + "grad_norm": 0.2739226520061493, + "learning_rate": 8.184761904761905e-06, + "loss": 0.8987, + "step": 4610 + }, + { + "epoch": 13.004, + "grad_norm": 0.46435093879699707, + "learning_rate": 8.1784126984127e-06, + "loss": 0.4531, + "step": 4620 + }, + { + "epoch": 13.00457142857143, + "grad_norm": 0.17606890201568604, + "learning_rate": 8.172063492063492e-06, + "loss": 0.7506, + "step": 4630 + }, + { + "epoch": 13.005142857142857, + "grad_norm": 0.20931817591190338, + "learning_rate": 8.165714285714286e-06, + "loss": 0.2208, + "step": 4640 + }, + { + "epoch": 13.005714285714285, + "grad_norm": 0.1586388796567917, + "learning_rate": 8.15936507936508e-06, + "loss": 0.1573, + "step": 4650 + }, + { + "epoch": 13.006285714285715, + "grad_norm": 0.2133455127477646, + "learning_rate": 8.153015873015873e-06, + "loss": 1.0149, + "step": 4660 + }, + { + "epoch": 13.006857142857143, + "grad_norm": 0.10580915212631226, + "learning_rate": 8.146666666666668e-06, + "loss": 0.6209, + "step": 4670 + }, + { + "epoch": 13.007428571428571, + "grad_norm": 0.05809102952480316, + "learning_rate": 8.14031746031746e-06, + "loss": 0.4834, + "step": 4680 + }, + { + "epoch": 13.008, + "grad_norm": 0.18141770362854004, + "learning_rate": 8.133968253968255e-06, + "loss": 0.9549, + "step": 4690 + }, + { + "epoch": 13.008571428571429, + "grad_norm": 0.3100875914096832, + "learning_rate": 8.127619047619049e-06, + "loss": 0.4513, + "step": 4700 + }, + { + "epoch": 13.009142857142857, + "grad_norm": 12.605453491210938, + "learning_rate": 8.121269841269842e-06, + "loss": 0.9802, + "step": 4710 + }, + { + "epoch": 13.009714285714285, + "grad_norm": 18.329814910888672, + "learning_rate": 8.114920634920636e-06, + "loss": 0.4474, + "step": 4720 + }, + { + "epoch": 13.010285714285715, + "grad_norm": 0.18632696568965912, + "learning_rate": 8.108571428571429e-06, + "loss": 0.5227, + "step": 4730 + }, + { + "epoch": 13.010857142857143, + "grad_norm": 0.15728500485420227, + "learning_rate": 8.102222222222223e-06, + "loss": 0.3175, + "step": 4740 + }, + { + "epoch": 13.01142857142857, + "grad_norm": 16.559619903564453, + "learning_rate": 8.095873015873017e-06, + "loss": 0.4173, + "step": 4750 + }, + { + "epoch": 13.012, + "grad_norm": 0.04901667311787605, + "learning_rate": 8.08952380952381e-06, + "loss": 0.3976, + "step": 4760 + }, + { + "epoch": 13.012571428571428, + "grad_norm": 0.22993479669094086, + "learning_rate": 8.083174603174604e-06, + "loss": 0.1813, + "step": 4770 + }, + { + "epoch": 13.013142857142856, + "grad_norm": 0.3767968416213989, + "learning_rate": 8.076825396825397e-06, + "loss": 0.7901, + "step": 4780 + }, + { + "epoch": 13.013714285714286, + "grad_norm": 0.03023025207221508, + "learning_rate": 8.070476190476191e-06, + "loss": 0.1889, + "step": 4790 + }, + { + "epoch": 13.014285714285714, + "grad_norm": 0.9322307109832764, + "learning_rate": 8.064126984126984e-06, + "loss": 1.3871, + "step": 4800 + }, + { + "epoch": 13.014857142857142, + "grad_norm": 25.164409637451172, + "learning_rate": 8.057777777777778e-06, + "loss": 0.9618, + "step": 4810 + }, + { + "epoch": 13.015428571428572, + "grad_norm": 94.88408660888672, + "learning_rate": 8.051428571428573e-06, + "loss": 0.632, + "step": 4820 + }, + { + "epoch": 13.016, + "grad_norm": 0.15524975955486298, + "learning_rate": 8.045079365079365e-06, + "loss": 0.7624, + "step": 4830 + }, + { + "epoch": 13.016571428571428, + "grad_norm": 0.06509774178266525, + "learning_rate": 8.03873015873016e-06, + "loss": 0.2143, + "step": 4840 + }, + { + "epoch": 13.017142857142858, + "grad_norm": 0.24536223709583282, + "learning_rate": 8.032380952380952e-06, + "loss": 0.2895, + "step": 4850 + }, + { + "epoch": 13.017714285714286, + "grad_norm": 58.50824737548828, + "learning_rate": 8.026031746031746e-06, + "loss": 0.0836, + "step": 4860 + }, + { + "epoch": 13.018285714285714, + "grad_norm": 0.14553256332874298, + "learning_rate": 8.01968253968254e-06, + "loss": 0.672, + "step": 4870 + }, + { + "epoch": 13.018857142857144, + "grad_norm": 0.06745839864015579, + "learning_rate": 8.013333333333333e-06, + "loss": 0.6554, + "step": 4880 + }, + { + "epoch": 13.019428571428572, + "grad_norm": 0.15893854200839996, + "learning_rate": 8.006984126984128e-06, + "loss": 0.6895, + "step": 4890 + }, + { + "epoch": 13.02, + "grad_norm": 0.23196399211883545, + "learning_rate": 8.00063492063492e-06, + "loss": 0.2413, + "step": 4900 + }, + { + "epoch": 13.02, + "eval_accuracy": 0.8518518518518519, + "eval_loss": 0.6480634808540344, + "eval_runtime": 126.6312, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 1.177, + "step": 4900 + }, + { + "epoch": 14.000571428571428, + "grad_norm": 0.1645181030035019, + "learning_rate": 7.994285714285715e-06, + "loss": 0.6897, + "step": 4910 + }, + { + "epoch": 14.001142857142858, + "grad_norm": 168.658203125, + "learning_rate": 7.987936507936509e-06, + "loss": 0.8059, + "step": 4920 + }, + { + "epoch": 14.001714285714286, + "grad_norm": 0.17724451422691345, + "learning_rate": 7.981587301587302e-06, + "loss": 0.4341, + "step": 4930 + }, + { + "epoch": 14.002285714285714, + "grad_norm": 0.25565865635871887, + "learning_rate": 7.975238095238096e-06, + "loss": 0.588, + "step": 4940 + }, + { + "epoch": 14.002857142857144, + "grad_norm": 0.2059396207332611, + "learning_rate": 7.968888888888889e-06, + "loss": 0.2607, + "step": 4950 + }, + { + "epoch": 14.003428571428572, + "grad_norm": 21.05925941467285, + "learning_rate": 7.962539682539683e-06, + "loss": 0.676, + "step": 4960 + }, + { + "epoch": 14.004, + "grad_norm": 0.20337478816509247, + "learning_rate": 7.956190476190477e-06, + "loss": 0.235, + "step": 4970 + }, + { + "epoch": 14.00457142857143, + "grad_norm": 12.385817527770996, + "learning_rate": 7.949841269841272e-06, + "loss": 1.1436, + "step": 4980 + }, + { + "epoch": 14.005142857142857, + "grad_norm": 169.90371704101562, + "learning_rate": 7.943492063492064e-06, + "loss": 0.4043, + "step": 4990 + }, + { + "epoch": 14.005714285714285, + "grad_norm": 0.9865061044692993, + "learning_rate": 7.937142857142857e-06, + "loss": 0.5453, + "step": 5000 + }, + { + "epoch": 14.006285714285715, + "grad_norm": 0.21574608981609344, + "learning_rate": 7.930793650793651e-06, + "loss": 0.4185, + "step": 5010 + }, + { + "epoch": 14.006857142857143, + "grad_norm": 0.22090749442577362, + "learning_rate": 7.924444444444444e-06, + "loss": 0.6939, + "step": 5020 + }, + { + "epoch": 14.007428571428571, + "grad_norm": 0.02600650116801262, + "learning_rate": 7.91809523809524e-06, + "loss": 0.8623, + "step": 5030 + }, + { + "epoch": 14.008, + "grad_norm": 0.13779287040233612, + "learning_rate": 7.911746031746033e-06, + "loss": 0.0248, + "step": 5040 + }, + { + "epoch": 14.008571428571429, + "grad_norm": 12.554780960083008, + "learning_rate": 7.905396825396827e-06, + "loss": 0.9988, + "step": 5050 + }, + { + "epoch": 14.009142857142857, + "grad_norm": 0.41419580578804016, + "learning_rate": 7.89904761904762e-06, + "loss": 0.2302, + "step": 5060 + }, + { + "epoch": 14.009714285714285, + "grad_norm": 0.16763442754745483, + "learning_rate": 7.892698412698412e-06, + "loss": 0.8218, + "step": 5070 + }, + { + "epoch": 14.010285714285715, + "grad_norm": 120.92338562011719, + "learning_rate": 7.886349206349208e-06, + "loss": 0.5738, + "step": 5080 + }, + { + "epoch": 14.010857142857143, + "grad_norm": 0.15212488174438477, + "learning_rate": 7.88e-06, + "loss": 0.8477, + "step": 5090 + }, + { + "epoch": 14.01142857142857, + "grad_norm": 0.14688020944595337, + "learning_rate": 7.873650793650795e-06, + "loss": 0.2236, + "step": 5100 + }, + { + "epoch": 14.012, + "grad_norm": 0.11865352839231491, + "learning_rate": 7.867301587301588e-06, + "loss": 0.2869, + "step": 5110 + }, + { + "epoch": 14.012571428571428, + "grad_norm": 13.466479301452637, + "learning_rate": 7.860952380952382e-06, + "loss": 0.4885, + "step": 5120 + }, + { + "epoch": 14.013142857142856, + "grad_norm": 0.07300411909818649, + "learning_rate": 7.854603174603175e-06, + "loss": 0.2464, + "step": 5130 + }, + { + "epoch": 14.013714285714286, + "grad_norm": 0.052624981850385666, + "learning_rate": 7.848253968253969e-06, + "loss": 0.946, + "step": 5140 + }, + { + "epoch": 14.014285714285714, + "grad_norm": 0.09969048947095871, + "learning_rate": 7.841904761904763e-06, + "loss": 0.3992, + "step": 5150 + }, + { + "epoch": 14.014857142857142, + "grad_norm": 49.853572845458984, + "learning_rate": 7.835555555555556e-06, + "loss": 0.0232, + "step": 5160 + }, + { + "epoch": 14.015428571428572, + "grad_norm": 0.3982076644897461, + "learning_rate": 7.82920634920635e-06, + "loss": 0.3344, + "step": 5170 + }, + { + "epoch": 14.016, + "grad_norm": 1.8964109420776367, + "learning_rate": 7.822857142857143e-06, + "loss": 0.4479, + "step": 5180 + }, + { + "epoch": 14.016571428571428, + "grad_norm": 13.034674644470215, + "learning_rate": 7.816507936507937e-06, + "loss": 0.4343, + "step": 5190 + }, + { + "epoch": 14.017142857142858, + "grad_norm": 0.1686258316040039, + "learning_rate": 7.810158730158732e-06, + "loss": 0.4579, + "step": 5200 + }, + { + "epoch": 14.017714285714286, + "grad_norm": 0.14098533987998962, + "learning_rate": 7.803809523809524e-06, + "loss": 0.49, + "step": 5210 + }, + { + "epoch": 14.018285714285714, + "grad_norm": 13.026440620422363, + "learning_rate": 7.797460317460319e-06, + "loss": 0.2029, + "step": 5220 + }, + { + "epoch": 14.018857142857144, + "grad_norm": 12.317622184753418, + "learning_rate": 7.791111111111111e-06, + "loss": 0.7561, + "step": 5230 + }, + { + "epoch": 14.019428571428572, + "grad_norm": 0.3012208938598633, + "learning_rate": 7.784761904761906e-06, + "loss": 0.2343, + "step": 5240 + }, + { + "epoch": 14.02, + "grad_norm": 151.4730987548828, + "learning_rate": 7.7784126984127e-06, + "loss": 0.6278, + "step": 5250 + }, + { + "epoch": 14.02, + "eval_accuracy": 0.8552188552188552, + "eval_loss": 0.6555138826370239, + "eval_runtime": 126.3021, + "eval_samples_per_second": 2.352, + "eval_steps_per_second": 1.18, + "step": 5250 + }, + { + "epoch": 15.000571428571428, + "grad_norm": 0.2991902828216553, + "learning_rate": 7.772063492063493e-06, + "loss": 0.8981, + "step": 5260 + }, + { + "epoch": 15.001142857142858, + "grad_norm": 0.3251695930957794, + "learning_rate": 7.765714285714287e-06, + "loss": 0.552, + "step": 5270 + }, + { + "epoch": 15.001714285714286, + "grad_norm": 0.38806581497192383, + "learning_rate": 7.75936507936508e-06, + "loss": 0.8149, + "step": 5280 + }, + { + "epoch": 15.002285714285714, + "grad_norm": 1.2852102518081665, + "learning_rate": 7.753015873015874e-06, + "loss": 0.7302, + "step": 5290 + }, + { + "epoch": 15.002857142857144, + "grad_norm": 0.33824622631073, + "learning_rate": 7.746666666666666e-06, + "loss": 0.6803, + "step": 5300 + }, + { + "epoch": 15.003428571428572, + "grad_norm": 0.7059460282325745, + "learning_rate": 7.74031746031746e-06, + "loss": 0.8826, + "step": 5310 + }, + { + "epoch": 15.004, + "grad_norm": 0.17795021831989288, + "learning_rate": 7.733968253968255e-06, + "loss": 0.3081, + "step": 5320 + }, + { + "epoch": 15.00457142857143, + "grad_norm": 0.21305586397647858, + "learning_rate": 7.727619047619048e-06, + "loss": 0.4185, + "step": 5330 + }, + { + "epoch": 15.005142857142857, + "grad_norm": 0.458345890045166, + "learning_rate": 7.721269841269842e-06, + "loss": 0.6545, + "step": 5340 + }, + { + "epoch": 15.005714285714285, + "grad_norm": 0.11840073764324188, + "learning_rate": 7.714920634920635e-06, + "loss": 0.2937, + "step": 5350 + }, + { + "epoch": 15.006285714285715, + "grad_norm": 0.17106355726718903, + "learning_rate": 7.708571428571429e-06, + "loss": 0.2317, + "step": 5360 + }, + { + "epoch": 15.006857142857143, + "grad_norm": 0.09697124361991882, + "learning_rate": 7.702222222222223e-06, + "loss": 0.4099, + "step": 5370 + }, + { + "epoch": 15.007428571428571, + "grad_norm": 0.021129153668880463, + "learning_rate": 7.695873015873016e-06, + "loss": 0.2169, + "step": 5380 + }, + { + "epoch": 15.008, + "grad_norm": 0.4217149019241333, + "learning_rate": 7.68952380952381e-06, + "loss": 0.784, + "step": 5390 + }, + { + "epoch": 15.008571428571429, + "grad_norm": 0.17383898794651031, + "learning_rate": 7.683174603174603e-06, + "loss": 0.2046, + "step": 5400 + }, + { + "epoch": 15.009142857142857, + "grad_norm": 0.08642620593309402, + "learning_rate": 7.676825396825397e-06, + "loss": 0.2429, + "step": 5410 + }, + { + "epoch": 15.009714285714285, + "grad_norm": 13.887992858886719, + "learning_rate": 7.670476190476192e-06, + "loss": 0.9628, + "step": 5420 + }, + { + "epoch": 15.010285714285715, + "grad_norm": 0.249220609664917, + "learning_rate": 7.664126984126984e-06, + "loss": 0.5269, + "step": 5430 + }, + { + "epoch": 15.010857142857143, + "grad_norm": 0.07579600065946579, + "learning_rate": 7.657777777777779e-06, + "loss": 0.4449, + "step": 5440 + }, + { + "epoch": 15.01142857142857, + "grad_norm": 0.12848280370235443, + "learning_rate": 7.651428571428571e-06, + "loss": 0.245, + "step": 5450 + }, + { + "epoch": 15.012, + "grad_norm": 13.671841621398926, + "learning_rate": 7.645079365079366e-06, + "loss": 0.7608, + "step": 5460 + }, + { + "epoch": 15.012571428571428, + "grad_norm": 0.19747650623321533, + "learning_rate": 7.63873015873016e-06, + "loss": 0.3063, + "step": 5470 + }, + { + "epoch": 15.013142857142856, + "grad_norm": 0.2382912039756775, + "learning_rate": 7.632380952380953e-06, + "loss": 0.6737, + "step": 5480 + }, + { + "epoch": 15.013714285714286, + "grad_norm": 0.34730061888694763, + "learning_rate": 7.626031746031747e-06, + "loss": 0.3126, + "step": 5490 + }, + { + "epoch": 15.014285714285714, + "grad_norm": 12.084749221801758, + "learning_rate": 7.61968253968254e-06, + "loss": 0.4176, + "step": 5500 + }, + { + "epoch": 15.014857142857142, + "grad_norm": 0.24228954315185547, + "learning_rate": 7.613333333333334e-06, + "loss": 0.4131, + "step": 5510 + }, + { + "epoch": 15.015428571428572, + "grad_norm": 40.641319274902344, + "learning_rate": 7.606984126984127e-06, + "loss": 0.5213, + "step": 5520 + }, + { + "epoch": 15.016, + "grad_norm": 13.184736251831055, + "learning_rate": 7.600634920634922e-06, + "loss": 0.7655, + "step": 5530 + }, + { + "epoch": 15.016571428571428, + "grad_norm": 0.10150767862796783, + "learning_rate": 7.594285714285715e-06, + "loss": 0.7638, + "step": 5540 + }, + { + "epoch": 15.017142857142858, + "grad_norm": 0.16732892394065857, + "learning_rate": 7.587936507936509e-06, + "loss": 0.2141, + "step": 5550 + }, + { + "epoch": 15.017714285714286, + "grad_norm": 0.31903275847435, + "learning_rate": 7.581587301587302e-06, + "loss": 0.44, + "step": 5560 + }, + { + "epoch": 15.018285714285714, + "grad_norm": 0.04827267304062843, + "learning_rate": 7.575238095238096e-06, + "loss": 0.0074, + "step": 5570 + }, + { + "epoch": 15.018857142857144, + "grad_norm": 0.05066690221428871, + "learning_rate": 7.56888888888889e-06, + "loss": 1.0622, + "step": 5580 + }, + { + "epoch": 15.019428571428572, + "grad_norm": 0.14147022366523743, + "learning_rate": 7.562539682539683e-06, + "loss": 0.5052, + "step": 5590 + }, + { + "epoch": 15.02, + "grad_norm": 0.4057610332965851, + "learning_rate": 7.556190476190477e-06, + "loss": 0.0107, + "step": 5600 + }, + { + "epoch": 15.02, + "eval_accuracy": 0.8518518518518519, + "eval_loss": 0.6550477147102356, + "eval_runtime": 126.4968, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 1.178, + "step": 5600 + }, + { + "epoch": 16.00057142857143, + "grad_norm": 0.12237854301929474, + "learning_rate": 7.54984126984127e-06, + "loss": 0.0925, + "step": 5610 + }, + { + "epoch": 16.001142857142856, + "grad_norm": 0.1167961061000824, + "learning_rate": 7.543492063492064e-06, + "loss": 0.6409, + "step": 5620 + }, + { + "epoch": 16.001714285714286, + "grad_norm": 0.33305051922798157, + "learning_rate": 7.537142857142857e-06, + "loss": 0.961, + "step": 5630 + }, + { + "epoch": 16.002285714285716, + "grad_norm": 0.10626853257417679, + "learning_rate": 7.530793650793652e-06, + "loss": 1.2608, + "step": 5640 + }, + { + "epoch": 16.002857142857142, + "grad_norm": 67.93856048583984, + "learning_rate": 7.524444444444445e-06, + "loss": 0.7227, + "step": 5650 + }, + { + "epoch": 16.00342857142857, + "grad_norm": 0.09368494153022766, + "learning_rate": 7.518095238095239e-06, + "loss": 0.0237, + "step": 5660 + }, + { + "epoch": 16.004, + "grad_norm": 0.1382751762866974, + "learning_rate": 7.511746031746032e-06, + "loss": 0.4677, + "step": 5670 + }, + { + "epoch": 16.004571428571428, + "grad_norm": 0.31585273146629333, + "learning_rate": 7.505396825396826e-06, + "loss": 0.8341, + "step": 5680 + }, + { + "epoch": 16.005142857142857, + "grad_norm": 0.2557847797870636, + "learning_rate": 7.499047619047619e-06, + "loss": 0.2347, + "step": 5690 + }, + { + "epoch": 16.005714285714287, + "grad_norm": 0.14112916588783264, + "learning_rate": 7.492698412698413e-06, + "loss": 0.7506, + "step": 5700 + }, + { + "epoch": 16.006285714285713, + "grad_norm": 109.31643676757812, + "learning_rate": 7.486349206349207e-06, + "loss": 0.6344, + "step": 5710 + }, + { + "epoch": 16.006857142857143, + "grad_norm": 0.407155305147171, + "learning_rate": 7.48e-06, + "loss": 0.2471, + "step": 5720 + }, + { + "epoch": 16.007428571428573, + "grad_norm": 0.037655897438526154, + "learning_rate": 7.473650793650794e-06, + "loss": 0.4707, + "step": 5730 + }, + { + "epoch": 16.008, + "grad_norm": 0.08762380480766296, + "learning_rate": 7.467301587301587e-06, + "loss": 0.0635, + "step": 5740 + }, + { + "epoch": 16.00857142857143, + "grad_norm": 0.15955060720443726, + "learning_rate": 7.460952380952382e-06, + "loss": 0.5162, + "step": 5750 + }, + { + "epoch": 16.00914285714286, + "grad_norm": 13.772893905639648, + "learning_rate": 7.454603174603175e-06, + "loss": 0.6237, + "step": 5760 + }, + { + "epoch": 16.009714285714285, + "grad_norm": 20.581626892089844, + "learning_rate": 7.448253968253969e-06, + "loss": 0.7149, + "step": 5770 + }, + { + "epoch": 16.010285714285715, + "grad_norm": 0.14489834010601044, + "learning_rate": 7.441904761904762e-06, + "loss": 0.0079, + "step": 5780 + }, + { + "epoch": 16.010857142857144, + "grad_norm": 0.1383170783519745, + "learning_rate": 7.435555555555556e-06, + "loss": 0.2527, + "step": 5790 + }, + { + "epoch": 16.01142857142857, + "grad_norm": 68.98406219482422, + "learning_rate": 7.429206349206349e-06, + "loss": 0.3456, + "step": 5800 + }, + { + "epoch": 16.012, + "grad_norm": 0.11179056763648987, + "learning_rate": 7.422857142857144e-06, + "loss": 0.0647, + "step": 5810 + }, + { + "epoch": 16.01257142857143, + "grad_norm": 0.18071648478507996, + "learning_rate": 7.416507936507937e-06, + "loss": 0.5227, + "step": 5820 + }, + { + "epoch": 16.013142857142856, + "grad_norm": 67.35832214355469, + "learning_rate": 7.41015873015873e-06, + "loss": 0.3971, + "step": 5830 + }, + { + "epoch": 16.013714285714286, + "grad_norm": 0.1837083250284195, + "learning_rate": 7.403809523809524e-06, + "loss": 0.8512, + "step": 5840 + }, + { + "epoch": 16.014285714285716, + "grad_norm": 0.038320187479257584, + "learning_rate": 7.397460317460317e-06, + "loss": 0.0037, + "step": 5850 + }, + { + "epoch": 16.014857142857142, + "grad_norm": 20.66285514831543, + "learning_rate": 7.3911111111111125e-06, + "loss": 1.123, + "step": 5860 + }, + { + "epoch": 16.015428571428572, + "grad_norm": 133.61224365234375, + "learning_rate": 7.384761904761906e-06, + "loss": 0.3835, + "step": 5870 + }, + { + "epoch": 16.016, + "grad_norm": 0.17678341269493103, + "learning_rate": 7.3784126984126995e-06, + "loss": 0.1001, + "step": 5880 + }, + { + "epoch": 16.016571428571428, + "grad_norm": 0.09736228734254837, + "learning_rate": 7.372063492063493e-06, + "loss": 0.2866, + "step": 5890 + }, + { + "epoch": 16.017142857142858, + "grad_norm": 0.09368955343961716, + "learning_rate": 7.365714285714286e-06, + "loss": 1.0441, + "step": 5900 + }, + { + "epoch": 16.017714285714284, + "grad_norm": 0.10775759816169739, + "learning_rate": 7.359365079365079e-06, + "loss": 0.3626, + "step": 5910 + }, + { + "epoch": 16.018285714285714, + "grad_norm": 0.2658800482749939, + "learning_rate": 7.353015873015874e-06, + "loss": 0.3755, + "step": 5920 + }, + { + "epoch": 16.018857142857144, + "grad_norm": 0.24060992896556854, + "learning_rate": 7.346666666666668e-06, + "loss": 1.028, + "step": 5930 + }, + { + "epoch": 16.01942857142857, + "grad_norm": 0.12600265443325043, + "learning_rate": 7.340317460317461e-06, + "loss": 0.5612, + "step": 5940 + }, + { + "epoch": 16.02, + "grad_norm": 0.12872040271759033, + "learning_rate": 7.333968253968255e-06, + "loss": 0.3013, + "step": 5950 + }, + { + "epoch": 16.02, + "eval_accuracy": 0.8484848484848485, + "eval_loss": 0.7404991984367371, + "eval_runtime": 126.748, + "eval_samples_per_second": 2.343, + "eval_steps_per_second": 1.176, + "step": 5950 + }, + { + "epoch": 17.00057142857143, + "grad_norm": 0.23484548926353455, + "learning_rate": 7.327619047619048e-06, + "loss": 0.4457, + "step": 5960 + }, + { + "epoch": 17.001142857142856, + "grad_norm": 0.14240051805973053, + "learning_rate": 7.3212698412698425e-06, + "loss": 0.0042, + "step": 5970 + }, + { + "epoch": 17.001714285714286, + "grad_norm": 0.15564581751823425, + "learning_rate": 7.314920634920636e-06, + "loss": 0.6894, + "step": 5980 + }, + { + "epoch": 17.002285714285716, + "grad_norm": 127.26374816894531, + "learning_rate": 7.3085714285714295e-06, + "loss": 1.2778, + "step": 5990 + }, + { + "epoch": 17.002857142857142, + "grad_norm": 0.1205485537648201, + "learning_rate": 7.302222222222223e-06, + "loss": 0.2591, + "step": 6000 + }, + { + "epoch": 17.00342857142857, + "grad_norm": 0.14839039742946625, + "learning_rate": 7.2958730158730165e-06, + "loss": 0.2446, + "step": 6010 + }, + { + "epoch": 17.004, + "grad_norm": 0.2482178807258606, + "learning_rate": 7.28952380952381e-06, + "loss": 1.1471, + "step": 6020 + }, + { + "epoch": 17.004571428571428, + "grad_norm": 0.6684224009513855, + "learning_rate": 7.283174603174604e-06, + "loss": 0.1935, + "step": 6030 + }, + { + "epoch": 17.005142857142857, + "grad_norm": 0.31114834547042847, + "learning_rate": 7.276825396825398e-06, + "loss": 0.374, + "step": 6040 + }, + { + "epoch": 17.005714285714287, + "grad_norm": 0.241834819316864, + "learning_rate": 7.270476190476191e-06, + "loss": 0.932, + "step": 6050 + }, + { + "epoch": 17.006285714285713, + "grad_norm": 17.917661666870117, + "learning_rate": 7.264126984126985e-06, + "loss": 0.5023, + "step": 6060 + }, + { + "epoch": 17.006857142857143, + "grad_norm": 0.11860672384500504, + "learning_rate": 7.257777777777778e-06, + "loss": 0.2441, + "step": 6070 + }, + { + "epoch": 17.007428571428573, + "grad_norm": 0.12156573683023453, + "learning_rate": 7.251428571428572e-06, + "loss": 0.4809, + "step": 6080 + }, + { + "epoch": 17.008, + "grad_norm": 32.46079635620117, + "learning_rate": 7.245079365079366e-06, + "loss": 1.0452, + "step": 6090 + }, + { + "epoch": 17.00857142857143, + "grad_norm": 1.8947296142578125, + "learning_rate": 7.2387301587301595e-06, + "loss": 0.2181, + "step": 6100 + }, + { + "epoch": 17.00914285714286, + "grad_norm": 1.0444539785385132, + "learning_rate": 7.232380952380953e-06, + "loss": 0.2129, + "step": 6110 + }, + { + "epoch": 17.009714285714285, + "grad_norm": 9.648567199707031, + "learning_rate": 7.2260317460317465e-06, + "loss": 0.2156, + "step": 6120 + }, + { + "epoch": 17.010285714285715, + "grad_norm": 0.24711275100708008, + "learning_rate": 7.21968253968254e-06, + "loss": 0.0108, + "step": 6130 + }, + { + "epoch": 17.010857142857144, + "grad_norm": 0.22460851073265076, + "learning_rate": 7.213333333333334e-06, + "loss": 0.645, + "step": 6140 + }, + { + "epoch": 17.01142857142857, + "grad_norm": 49.333404541015625, + "learning_rate": 7.206984126984128e-06, + "loss": 0.6497, + "step": 6150 + }, + { + "epoch": 17.012, + "grad_norm": 0.22554242610931396, + "learning_rate": 7.200634920634921e-06, + "loss": 0.7637, + "step": 6160 + }, + { + "epoch": 17.01257142857143, + "grad_norm": 0.2542533576488495, + "learning_rate": 7.194285714285715e-06, + "loss": 0.6032, + "step": 6170 + }, + { + "epoch": 17.013142857142856, + "grad_norm": 0.03277142718434334, + "learning_rate": 7.187936507936508e-06, + "loss": 0.4751, + "step": 6180 + }, + { + "epoch": 17.013714285714286, + "grad_norm": 0.09158936887979507, + "learning_rate": 7.181587301587302e-06, + "loss": 0.433, + "step": 6190 + }, + { + "epoch": 17.014285714285716, + "grad_norm": 0.2807057201862335, + "learning_rate": 7.175238095238096e-06, + "loss": 0.2971, + "step": 6200 + }, + { + "epoch": 17.014857142857142, + "grad_norm": 80.08293914794922, + "learning_rate": 7.1688888888888895e-06, + "loss": 0.4443, + "step": 6210 + }, + { + "epoch": 17.015428571428572, + "grad_norm": 0.19456472992897034, + "learning_rate": 7.162539682539683e-06, + "loss": 0.5821, + "step": 6220 + }, + { + "epoch": 17.016, + "grad_norm": 0.07997258752584457, + "learning_rate": 7.1561904761904765e-06, + "loss": 1.0668, + "step": 6230 + }, + { + "epoch": 17.016571428571428, + "grad_norm": 0.1534426212310791, + "learning_rate": 7.14984126984127e-06, + "loss": 0.4562, + "step": 6240 + }, + { + "epoch": 17.017142857142858, + "grad_norm": 0.36759310960769653, + "learning_rate": 7.143492063492064e-06, + "loss": 0.2592, + "step": 6250 + }, + { + "epoch": 17.017714285714284, + "grad_norm": 0.15657585859298706, + "learning_rate": 7.137142857142858e-06, + "loss": 0.5768, + "step": 6260 + }, + { + "epoch": 17.018285714285714, + "grad_norm": 0.13546155393123627, + "learning_rate": 7.130793650793651e-06, + "loss": 0.2564, + "step": 6270 + }, + { + "epoch": 17.018857142857144, + "grad_norm": 0.14879682660102844, + "learning_rate": 7.124444444444445e-06, + "loss": 0.987, + "step": 6280 + }, + { + "epoch": 17.01942857142857, + "grad_norm": 20.40616226196289, + "learning_rate": 7.118095238095238e-06, + "loss": 0.8061, + "step": 6290 + }, + { + "epoch": 17.02, + "grad_norm": 0.3881203234195709, + "learning_rate": 7.111746031746032e-06, + "loss": 0.5055, + "step": 6300 + }, + { + "epoch": 17.02, + "eval_accuracy": 0.8451178451178452, + "eval_loss": 0.6562865376472473, + "eval_runtime": 127.2408, + "eval_samples_per_second": 2.334, + "eval_steps_per_second": 1.171, + "step": 6300 + }, + { + "epoch": 18.00057142857143, + "grad_norm": 1.636939287185669, + "learning_rate": 7.105396825396826e-06, + "loss": 0.9626, + "step": 6310 + }, + { + "epoch": 18.001142857142856, + "grad_norm": 1.2696601152420044, + "learning_rate": 7.0990476190476195e-06, + "loss": 1.1254, + "step": 6320 + }, + { + "epoch": 18.001714285714286, + "grad_norm": 31.090442657470703, + "learning_rate": 7.092698412698413e-06, + "loss": 0.2333, + "step": 6330 + }, + { + "epoch": 18.002285714285716, + "grad_norm": 122.07534790039062, + "learning_rate": 7.0863492063492065e-06, + "loss": 0.3172, + "step": 6340 + }, + { + "epoch": 18.002857142857142, + "grad_norm": 0.2747611701488495, + "learning_rate": 7.08e-06, + "loss": 0.3612, + "step": 6350 + }, + { + "epoch": 18.00342857142857, + "grad_norm": 0.3476632535457611, + "learning_rate": 7.073650793650795e-06, + "loss": 0.8514, + "step": 6360 + }, + { + "epoch": 18.004, + "grad_norm": 0.03347006067633629, + "learning_rate": 7.067301587301589e-06, + "loss": 0.3275, + "step": 6370 + }, + { + "epoch": 18.004571428571428, + "grad_norm": 84.09701538085938, + "learning_rate": 7.060952380952381e-06, + "loss": 0.5278, + "step": 6380 + }, + { + "epoch": 18.005142857142857, + "grad_norm": 0.21676389873027802, + "learning_rate": 7.054603174603175e-06, + "loss": 0.612, + "step": 6390 + }, + { + "epoch": 18.005714285714287, + "grad_norm": 160.529541015625, + "learning_rate": 7.048253968253968e-06, + "loss": 0.4884, + "step": 6400 + }, + { + "epoch": 18.006285714285713, + "grad_norm": 0.364534467458725, + "learning_rate": 7.041904761904762e-06, + "loss": 0.0083, + "step": 6410 + }, + { + "epoch": 18.006857142857143, + "grad_norm": 0.11234438419342041, + "learning_rate": 7.035555555555557e-06, + "loss": 0.0106, + "step": 6420 + }, + { + "epoch": 18.007428571428573, + "grad_norm": 1.1077744960784912, + "learning_rate": 7.02920634920635e-06, + "loss": 0.0066, + "step": 6430 + }, + { + "epoch": 18.008, + "grad_norm": 0.13264243304729462, + "learning_rate": 7.022857142857144e-06, + "loss": 0.4173, + "step": 6440 + }, + { + "epoch": 18.00857142857143, + "grad_norm": 0.1175365149974823, + "learning_rate": 7.016507936507937e-06, + "loss": 0.6565, + "step": 6450 + }, + { + "epoch": 18.00914285714286, + "grad_norm": 0.2617458701133728, + "learning_rate": 7.01015873015873e-06, + "loss": 0.2389, + "step": 6460 + }, + { + "epoch": 18.009714285714285, + "grad_norm": 46.34348678588867, + "learning_rate": 7.0038095238095235e-06, + "loss": 0.4956, + "step": 6470 + }, + { + "epoch": 18.010285714285715, + "grad_norm": 153.98577880859375, + "learning_rate": 6.997460317460319e-06, + "loss": 1.0497, + "step": 6480 + }, + { + "epoch": 18.010857142857144, + "grad_norm": 0.19409751892089844, + "learning_rate": 6.991111111111112e-06, + "loss": 0.412, + "step": 6490 + }, + { + "epoch": 18.01142857142857, + "grad_norm": 0.12201213091611862, + "learning_rate": 6.984761904761906e-06, + "loss": 0.4613, + "step": 6500 + }, + { + "epoch": 18.012, + "grad_norm": 0.020613886415958405, + "learning_rate": 6.978412698412699e-06, + "loss": 0.3496, + "step": 6510 + }, + { + "epoch": 18.01257142857143, + "grad_norm": 0.1668790876865387, + "learning_rate": 6.9720634920634926e-06, + "loss": 0.4584, + "step": 6520 + }, + { + "epoch": 18.013142857142856, + "grad_norm": 0.15408121049404144, + "learning_rate": 6.965714285714287e-06, + "loss": 0.6383, + "step": 6530 + }, + { + "epoch": 18.013714285714286, + "grad_norm": 12.476696968078613, + "learning_rate": 6.95936507936508e-06, + "loss": 0.4078, + "step": 6540 + }, + { + "epoch": 18.014285714285716, + "grad_norm": 0.240266352891922, + "learning_rate": 6.953015873015874e-06, + "loss": 0.4204, + "step": 6550 + }, + { + "epoch": 18.014857142857142, + "grad_norm": 0.20230837166309357, + "learning_rate": 6.946666666666667e-06, + "loss": 0.4396, + "step": 6560 + }, + { + "epoch": 18.015428571428572, + "grad_norm": 33.412925720214844, + "learning_rate": 6.940317460317461e-06, + "loss": 0.7438, + "step": 6570 + }, + { + "epoch": 18.016, + "grad_norm": 21.571928024291992, + "learning_rate": 6.933968253968254e-06, + "loss": 0.4501, + "step": 6580 + }, + { + "epoch": 18.016571428571428, + "grad_norm": 0.3260933458805084, + "learning_rate": 6.927619047619049e-06, + "loss": 0.1881, + "step": 6590 + }, + { + "epoch": 18.017142857142858, + "grad_norm": 0.07090619206428528, + "learning_rate": 6.921269841269842e-06, + "loss": 0.7295, + "step": 6600 + }, + { + "epoch": 18.017714285714284, + "grad_norm": 0.2709910571575165, + "learning_rate": 6.914920634920636e-06, + "loss": 0.0077, + "step": 6610 + }, + { + "epoch": 18.018285714285714, + "grad_norm": 0.2425944209098816, + "learning_rate": 6.908571428571429e-06, + "loss": 0.7675, + "step": 6620 + }, + { + "epoch": 18.018857142857144, + "grad_norm": 12.433034896850586, + "learning_rate": 6.902222222222223e-06, + "loss": 0.5643, + "step": 6630 + }, + { + "epoch": 18.01942857142857, + "grad_norm": 0.23004376888275146, + "learning_rate": 6.895873015873017e-06, + "loss": 0.2302, + "step": 6640 + }, + { + "epoch": 18.02, + "grad_norm": 0.1142105832695961, + "learning_rate": 6.88952380952381e-06, + "loss": 0.0059, + "step": 6650 + }, + { + "epoch": 18.02, + "eval_accuracy": 0.8484848484848485, + "eval_loss": 0.6916897296905518, + "eval_runtime": 126.3823, + "eval_samples_per_second": 2.35, + "eval_steps_per_second": 1.179, + "step": 6650 + }, + { + "epoch": 19.00057142857143, + "grad_norm": 0.010271835140883923, + "learning_rate": 6.883174603174604e-06, + "loss": 0.158, + "step": 6660 + }, + { + "epoch": 19.001142857142856, + "grad_norm": 15.551984786987305, + "learning_rate": 6.876825396825397e-06, + "loss": 0.7788, + "step": 6670 + }, + { + "epoch": 19.001714285714286, + "grad_norm": 0.12954816222190857, + "learning_rate": 6.870476190476191e-06, + "loss": 0.3656, + "step": 6680 + }, + { + "epoch": 19.002285714285716, + "grad_norm": 4.656391620635986, + "learning_rate": 6.864126984126984e-06, + "loss": 0.689, + "step": 6690 + }, + { + "epoch": 19.002857142857142, + "grad_norm": 0.13569696247577667, + "learning_rate": 6.857777777777779e-06, + "loss": 0.1878, + "step": 6700 + }, + { + "epoch": 19.00342857142857, + "grad_norm": 0.16348059475421906, + "learning_rate": 6.851428571428572e-06, + "loss": 0.8311, + "step": 6710 + }, + { + "epoch": 19.004, + "grad_norm": 12.783551216125488, + "learning_rate": 6.845079365079366e-06, + "loss": 0.46, + "step": 6720 + }, + { + "epoch": 19.004571428571428, + "grad_norm": 0.24606812000274658, + "learning_rate": 6.838730158730159e-06, + "loss": 0.587, + "step": 6730 + }, + { + "epoch": 19.005142857142857, + "grad_norm": 0.12452604621648788, + "learning_rate": 6.832380952380953e-06, + "loss": 0.0415, + "step": 6740 + }, + { + "epoch": 19.005714285714287, + "grad_norm": 14.892534255981445, + "learning_rate": 6.826031746031747e-06, + "loss": 0.7738, + "step": 6750 + }, + { + "epoch": 19.006285714285713, + "grad_norm": 0.1208721324801445, + "learning_rate": 6.81968253968254e-06, + "loss": 0.2166, + "step": 6760 + }, + { + "epoch": 19.006857142857143, + "grad_norm": 0.14853811264038086, + "learning_rate": 6.813333333333334e-06, + "loss": 0.0052, + "step": 6770 + }, + { + "epoch": 19.007428571428573, + "grad_norm": 0.3613605201244354, + "learning_rate": 6.806984126984127e-06, + "loss": 0.2232, + "step": 6780 + }, + { + "epoch": 19.008, + "grad_norm": 11.996315956115723, + "learning_rate": 6.800634920634921e-06, + "loss": 0.4984, + "step": 6790 + }, + { + "epoch": 19.00857142857143, + "grad_norm": 11.95859146118164, + "learning_rate": 6.794285714285714e-06, + "loss": 0.525, + "step": 6800 + }, + { + "epoch": 19.00914285714286, + "grad_norm": 0.1372225433588028, + "learning_rate": 6.787936507936509e-06, + "loss": 0.0046, + "step": 6810 + }, + { + "epoch": 19.009714285714285, + "grad_norm": 13.401540756225586, + "learning_rate": 6.781587301587302e-06, + "loss": 0.2888, + "step": 6820 + }, + { + "epoch": 19.010285714285715, + "grad_norm": 0.1333763599395752, + "learning_rate": 6.775238095238096e-06, + "loss": 0.4673, + "step": 6830 + }, + { + "epoch": 19.010857142857144, + "grad_norm": 0.28985071182250977, + "learning_rate": 6.768888888888889e-06, + "loss": 0.4904, + "step": 6840 + }, + { + "epoch": 19.01142857142857, + "grad_norm": 0.5672123432159424, + "learning_rate": 6.762539682539683e-06, + "loss": 0.38, + "step": 6850 + }, + { + "epoch": 19.012, + "grad_norm": 0.18549109995365143, + "learning_rate": 6.756190476190476e-06, + "loss": 0.4654, + "step": 6860 + }, + { + "epoch": 19.01257142857143, + "grad_norm": 0.005712473299354315, + "learning_rate": 6.74984126984127e-06, + "loss": 0.4693, + "step": 6870 + }, + { + "epoch": 19.013142857142856, + "grad_norm": 0.20795473456382751, + "learning_rate": 6.743492063492064e-06, + "loss": 0.6987, + "step": 6880 + }, + { + "epoch": 19.013714285714286, + "grad_norm": 0.2660701274871826, + "learning_rate": 6.737142857142857e-06, + "loss": 0.3575, + "step": 6890 + }, + { + "epoch": 19.014285714285716, + "grad_norm": 12.896868705749512, + "learning_rate": 6.730793650793651e-06, + "loss": 1.0003, + "step": 6900 + }, + { + "epoch": 19.014857142857142, + "grad_norm": 90.52599334716797, + "learning_rate": 6.724444444444444e-06, + "loss": 0.5481, + "step": 6910 + }, + { + "epoch": 19.015428571428572, + "grad_norm": 0.11832074075937271, + "learning_rate": 6.7180952380952395e-06, + "loss": 0.461, + "step": 6920 + }, + { + "epoch": 19.016, + "grad_norm": 0.021740248426795006, + "learning_rate": 6.711746031746032e-06, + "loss": 0.94, + "step": 6930 + }, + { + "epoch": 19.016571428571428, + "grad_norm": 0.14676664769649506, + "learning_rate": 6.705396825396826e-06, + "loss": 0.2698, + "step": 6940 + }, + { + "epoch": 19.017142857142858, + "grad_norm": 0.01525693666189909, + "learning_rate": 6.699047619047619e-06, + "loss": 0.5851, + "step": 6950 + }, + { + "epoch": 19.017714285714284, + "grad_norm": 0.5535984039306641, + "learning_rate": 6.692698412698413e-06, + "loss": 0.3717, + "step": 6960 + }, + { + "epoch": 19.018285714285714, + "grad_norm": 0.43303757905960083, + "learning_rate": 6.686349206349206e-06, + "loss": 0.5912, + "step": 6970 + }, + { + "epoch": 19.018857142857144, + "grad_norm": 0.15641167759895325, + "learning_rate": 6.680000000000001e-06, + "loss": 0.2061, + "step": 6980 + }, + { + "epoch": 19.01942857142857, + "grad_norm": 0.11748301982879639, + "learning_rate": 6.673650793650795e-06, + "loss": 0.326, + "step": 6990 + }, + { + "epoch": 19.02, + "grad_norm": 0.05292058736085892, + "learning_rate": 6.667301587301588e-06, + "loss": 0.4332, + "step": 7000 + }, + { + "epoch": 19.02, + "eval_accuracy": 0.8383838383838383, + "eval_loss": 0.6888241767883301, + "eval_runtime": 134.2388, + "eval_samples_per_second": 2.212, + "eval_steps_per_second": 1.11, + "step": 7000 + }, + { + "epoch": 20.00057142857143, + "grad_norm": 0.4945196211338043, + "learning_rate": 6.660952380952381e-06, + "loss": 0.103, + "step": 7010 + }, + { + "epoch": 20.001142857142856, + "grad_norm": 16.183570861816406, + "learning_rate": 6.654603174603174e-06, + "loss": 0.2379, + "step": 7020 + }, + { + "epoch": 20.001714285714286, + "grad_norm": 0.07635916769504547, + "learning_rate": 6.6482539682539695e-06, + "loss": 0.6419, + "step": 7030 + }, + { + "epoch": 20.002285714285716, + "grad_norm": 159.50653076171875, + "learning_rate": 6.641904761904763e-06, + "loss": 0.5751, + "step": 7040 + }, + { + "epoch": 20.002857142857142, + "grad_norm": 0.09001730382442474, + "learning_rate": 6.6355555555555565e-06, + "loss": 0.254, + "step": 7050 + }, + { + "epoch": 20.00342857142857, + "grad_norm": 0.07919283211231232, + "learning_rate": 6.62920634920635e-06, + "loss": 0.7664, + "step": 7060 + }, + { + "epoch": 20.004, + "grad_norm": 280.9998779296875, + "learning_rate": 6.6228571428571435e-06, + "loss": 0.2373, + "step": 7070 + }, + { + "epoch": 20.004571428571428, + "grad_norm": 0.3689022362232208, + "learning_rate": 6.616507936507937e-06, + "loss": 0.4658, + "step": 7080 + }, + { + "epoch": 20.005142857142857, + "grad_norm": 0.10695375502109528, + "learning_rate": 6.610158730158731e-06, + "loss": 0.3698, + "step": 7090 + }, + { + "epoch": 20.005714285714287, + "grad_norm": 0.13924677670001984, + "learning_rate": 6.603809523809525e-06, + "loss": 0.5151, + "step": 7100 + }, + { + "epoch": 20.006285714285713, + "grad_norm": 0.05997047573328018, + "learning_rate": 6.597460317460318e-06, + "loss": 0.3443, + "step": 7110 + }, + { + "epoch": 20.006857142857143, + "grad_norm": 15.691723823547363, + "learning_rate": 6.591111111111112e-06, + "loss": 0.9827, + "step": 7120 + }, + { + "epoch": 20.007428571428573, + "grad_norm": 0.18234623968601227, + "learning_rate": 6.584761904761905e-06, + "loss": 0.5862, + "step": 7130 + }, + { + "epoch": 20.008, + "grad_norm": 0.27075400948524475, + "learning_rate": 6.5784126984126995e-06, + "loss": 0.1941, + "step": 7140 + }, + { + "epoch": 20.00857142857143, + "grad_norm": 1.7042484283447266, + "learning_rate": 6.572063492063493e-06, + "loss": 0.5991, + "step": 7150 + }, + { + "epoch": 20.00914285714286, + "grad_norm": 0.16708026826381683, + "learning_rate": 6.5657142857142865e-06, + "loss": 0.3161, + "step": 7160 + }, + { + "epoch": 20.009714285714285, + "grad_norm": 0.2559516131877899, + "learning_rate": 6.55936507936508e-06, + "loss": 0.3776, + "step": 7170 + }, + { + "epoch": 20.010285714285715, + "grad_norm": 0.19110074639320374, + "learning_rate": 6.5530158730158735e-06, + "loss": 0.9548, + "step": 7180 + }, + { + "epoch": 20.010857142857144, + "grad_norm": 56.03752517700195, + "learning_rate": 6.546666666666667e-06, + "loss": 0.2375, + "step": 7190 + }, + { + "epoch": 20.01142857142857, + "grad_norm": 16.357572555541992, + "learning_rate": 6.540317460317461e-06, + "loss": 0.5034, + "step": 7200 + }, + { + "epoch": 20.012, + "grad_norm": 0.40159788727760315, + "learning_rate": 6.533968253968255e-06, + "loss": 0.4687, + "step": 7210 + }, + { + "epoch": 20.01257142857143, + "grad_norm": 0.009314529597759247, + "learning_rate": 6.527619047619048e-06, + "loss": 0.4526, + "step": 7220 + }, + { + "epoch": 20.013142857142856, + "grad_norm": 0.13374114036560059, + "learning_rate": 6.521269841269842e-06, + "loss": 0.9903, + "step": 7230 + }, + { + "epoch": 20.013714285714286, + "grad_norm": 0.020518776029348373, + "learning_rate": 6.514920634920635e-06, + "loss": 0.0067, + "step": 7240 + }, + { + "epoch": 20.014285714285716, + "grad_norm": 0.15277713537216187, + "learning_rate": 6.5085714285714295e-06, + "loss": 0.0422, + "step": 7250 + }, + { + "epoch": 20.014857142857142, + "grad_norm": 0.10491526871919632, + "learning_rate": 6.502222222222223e-06, + "loss": 0.0047, + "step": 7260 + }, + { + "epoch": 20.015428571428572, + "grad_norm": 15.871146202087402, + "learning_rate": 6.4958730158730165e-06, + "loss": 0.2825, + "step": 7270 + }, + { + "epoch": 20.016, + "grad_norm": 0.11634642630815506, + "learning_rate": 6.48952380952381e-06, + "loss": 0.0032, + "step": 7280 + }, + { + "epoch": 20.016571428571428, + "grad_norm": 0.014390116557478905, + "learning_rate": 6.4831746031746035e-06, + "loss": 0.6739, + "step": 7290 + }, + { + "epoch": 20.017142857142858, + "grad_norm": 0.012950708158314228, + "learning_rate": 6.476825396825397e-06, + "loss": 1.2164, + "step": 7300 + }, + { + "epoch": 20.017714285714284, + "grad_norm": 0.13078835606575012, + "learning_rate": 6.470476190476191e-06, + "loss": 0.212, + "step": 7310 + }, + { + "epoch": 20.018285714285714, + "grad_norm": 0.28521332144737244, + "learning_rate": 6.464126984126985e-06, + "loss": 0.6134, + "step": 7320 + }, + { + "epoch": 20.018857142857144, + "grad_norm": 0.14261774718761444, + "learning_rate": 6.457777777777778e-06, + "loss": 0.0948, + "step": 7330 + }, + { + "epoch": 20.01942857142857, + "grad_norm": 0.2758195400238037, + "learning_rate": 6.451428571428572e-06, + "loss": 0.1811, + "step": 7340 + }, + { + "epoch": 20.02, + "grad_norm": 0.05165260285139084, + "learning_rate": 6.445079365079365e-06, + "loss": 0.2602, + "step": 7350 + }, + { + "epoch": 20.02, + "eval_accuracy": 0.8417508417508418, + "eval_loss": 0.7993361353874207, + "eval_runtime": 133.8895, + "eval_samples_per_second": 2.218, + "eval_steps_per_second": 1.113, + "step": 7350 + }, + { + "epoch": 21.00057142857143, + "grad_norm": 0.03576577454805374, + "learning_rate": 6.438730158730159e-06, + "loss": 0.2528, + "step": 7360 + }, + { + "epoch": 21.001142857142856, + "grad_norm": 0.05446856468915939, + "learning_rate": 6.432380952380953e-06, + "loss": 0.0034, + "step": 7370 + }, + { + "epoch": 21.001714285714286, + "grad_norm": 50.52811050415039, + "learning_rate": 6.4260317460317465e-06, + "loss": 1.0569, + "step": 7380 + }, + { + "epoch": 21.002285714285716, + "grad_norm": 26.091224670410156, + "learning_rate": 6.41968253968254e-06, + "loss": 0.2889, + "step": 7390 + }, + { + "epoch": 21.002857142857142, + "grad_norm": 0.09547077119350433, + "learning_rate": 6.4133333333333335e-06, + "loss": 0.0366, + "step": 7400 + }, + { + "epoch": 21.00342857142857, + "grad_norm": 0.08614878356456757, + "learning_rate": 6.406984126984127e-06, + "loss": 0.2737, + "step": 7410 + }, + { + "epoch": 21.004, + "grad_norm": 0.03283363953232765, + "learning_rate": 6.400634920634921e-06, + "loss": 0.0022, + "step": 7420 + }, + { + "epoch": 21.004571428571428, + "grad_norm": 4.929975509643555, + "learning_rate": 6.394285714285715e-06, + "loss": 0.5315, + "step": 7430 + }, + { + "epoch": 21.005142857142857, + "grad_norm": 106.62588500976562, + "learning_rate": 6.387936507936508e-06, + "loss": 0.4519, + "step": 7440 + }, + { + "epoch": 21.005714285714287, + "grad_norm": 0.04590607061982155, + "learning_rate": 6.381587301587302e-06, + "loss": 0.4353, + "step": 7450 + }, + { + "epoch": 21.006285714285713, + "grad_norm": 0.10316039621829987, + "learning_rate": 6.375238095238095e-06, + "loss": 0.2364, + "step": 7460 + }, + { + "epoch": 21.006857142857143, + "grad_norm": 28.117801666259766, + "learning_rate": 6.368888888888889e-06, + "loss": 0.6987, + "step": 7470 + }, + { + "epoch": 21.007428571428573, + "grad_norm": 0.1716347187757492, + "learning_rate": 6.362539682539683e-06, + "loss": 1.1197, + "step": 7480 + }, + { + "epoch": 21.008, + "grad_norm": 17.753265380859375, + "learning_rate": 6.3561904761904765e-06, + "loss": 0.1678, + "step": 7490 + }, + { + "epoch": 21.00857142857143, + "grad_norm": 0.021444451063871384, + "learning_rate": 6.34984126984127e-06, + "loss": 0.1589, + "step": 7500 + }, + { + "epoch": 21.00914285714286, + "grad_norm": 0.013452350161969662, + "learning_rate": 6.3434920634920635e-06, + "loss": 0.4614, + "step": 7510 + }, + { + "epoch": 21.009714285714285, + "grad_norm": 0.023600058630108833, + "learning_rate": 6.337142857142857e-06, + "loss": 0.25, + "step": 7520 + }, + { + "epoch": 21.010285714285715, + "grad_norm": 0.13624207675457, + "learning_rate": 6.330793650793652e-06, + "loss": 0.334, + "step": 7530 + }, + { + "epoch": 21.010857142857144, + "grad_norm": 0.15217389166355133, + "learning_rate": 6.324444444444446e-06, + "loss": 0.3943, + "step": 7540 + }, + { + "epoch": 21.01142857142857, + "grad_norm": 0.11624295264482498, + "learning_rate": 6.318095238095239e-06, + "loss": 0.0209, + "step": 7550 + }, + { + "epoch": 21.012, + "grad_norm": 0.5223240256309509, + "learning_rate": 6.311746031746033e-06, + "loss": 0.4057, + "step": 7560 + }, + { + "epoch": 21.01257142857143, + "grad_norm": 0.059054210782051086, + "learning_rate": 6.305396825396825e-06, + "loss": 0.8236, + "step": 7570 + }, + { + "epoch": 21.013142857142856, + "grad_norm": 0.0346602126955986, + "learning_rate": 6.299047619047619e-06, + "loss": 0.4062, + "step": 7580 + }, + { + "epoch": 21.013714285714286, + "grad_norm": 0.2160838097333908, + "learning_rate": 6.292698412698414e-06, + "loss": 0.7491, + "step": 7590 + }, + { + "epoch": 21.014285714285716, + "grad_norm": 0.17393529415130615, + "learning_rate": 6.286349206349207e-06, + "loss": 0.0512, + "step": 7600 + }, + { + "epoch": 21.014857142857142, + "grad_norm": 12.681178092956543, + "learning_rate": 6.280000000000001e-06, + "loss": 0.6529, + "step": 7610 + }, + { + "epoch": 21.015428571428572, + "grad_norm": 0.008576265536248684, + "learning_rate": 6.273650793650794e-06, + "loss": 0.8273, + "step": 7620 + }, + { + "epoch": 21.016, + "grad_norm": 0.006474177818745375, + "learning_rate": 6.267301587301588e-06, + "loss": 0.0061, + "step": 7630 + }, + { + "epoch": 21.016571428571428, + "grad_norm": 0.42665791511535645, + "learning_rate": 6.260952380952382e-06, + "loss": 0.924, + "step": 7640 + }, + { + "epoch": 21.017142857142858, + "grad_norm": 37.20500946044922, + "learning_rate": 6.254603174603176e-06, + "loss": 0.2282, + "step": 7650 + }, + { + "epoch": 21.017714285714284, + "grad_norm": 0.03610096871852875, + "learning_rate": 6.248253968253969e-06, + "loss": 0.2507, + "step": 7660 + }, + { + "epoch": 21.018285714285714, + "grad_norm": 0.17031346261501312, + "learning_rate": 6.241904761904763e-06, + "loss": 0.483, + "step": 7670 + }, + { + "epoch": 21.018857142857144, + "grad_norm": 13.083247184753418, + "learning_rate": 6.235555555555556e-06, + "loss": 0.3114, + "step": 7680 + }, + { + "epoch": 21.01942857142857, + "grad_norm": 0.013108008541166782, + "learning_rate": 6.2292063492063496e-06, + "loss": 0.9822, + "step": 7690 + }, + { + "epoch": 21.02, + "grad_norm": 0.19977082312107086, + "learning_rate": 6.222857142857144e-06, + "loss": 0.2142, + "step": 7700 + }, + { + "epoch": 21.02, + "eval_accuracy": 0.8451178451178452, + "eval_loss": 0.7130899429321289, + "eval_runtime": 134.5619, + "eval_samples_per_second": 2.207, + "eval_steps_per_second": 1.107, + "step": 7700 + }, + { + "epoch": 22.00057142857143, + "grad_norm": 0.12899892032146454, + "learning_rate": 6.216507936507937e-06, + "loss": 0.3791, + "step": 7710 + }, + { + "epoch": 22.001142857142856, + "grad_norm": 0.06919983774423599, + "learning_rate": 6.210158730158731e-06, + "loss": 0.4267, + "step": 7720 + }, + { + "epoch": 22.001714285714286, + "grad_norm": 0.082898810505867, + "learning_rate": 6.203809523809524e-06, + "loss": 0.2237, + "step": 7730 + }, + { + "epoch": 22.002285714285716, + "grad_norm": 0.08086587488651276, + "learning_rate": 6.197460317460318e-06, + "loss": 0.3335, + "step": 7740 + }, + { + "epoch": 22.002857142857142, + "grad_norm": 0.007148704957216978, + "learning_rate": 6.191111111111111e-06, + "loss": 0.261, + "step": 7750 + }, + { + "epoch": 22.00342857142857, + "grad_norm": 13.431198120117188, + "learning_rate": 6.184761904761906e-06, + "loss": 0.9781, + "step": 7760 + }, + { + "epoch": 22.004, + "grad_norm": 0.16154451668262482, + "learning_rate": 6.178412698412699e-06, + "loss": 0.0853, + "step": 7770 + }, + { + "epoch": 22.004571428571428, + "grad_norm": 0.14190372824668884, + "learning_rate": 6.172063492063493e-06, + "loss": 0.035, + "step": 7780 + }, + { + "epoch": 22.005142857142857, + "grad_norm": 13.651422500610352, + "learning_rate": 6.165714285714286e-06, + "loss": 0.7239, + "step": 7790 + }, + { + "epoch": 22.005714285714287, + "grad_norm": 0.16447778046131134, + "learning_rate": 6.15936507936508e-06, + "loss": 0.3368, + "step": 7800 + }, + { + "epoch": 22.006285714285713, + "grad_norm": 0.36849430203437805, + "learning_rate": 6.153015873015874e-06, + "loss": 0.2213, + "step": 7810 + }, + { + "epoch": 22.006857142857143, + "grad_norm": 0.5472358465194702, + "learning_rate": 6.146666666666667e-06, + "loss": 0.7931, + "step": 7820 + }, + { + "epoch": 22.007428571428573, + "grad_norm": 0.8773165941238403, + "learning_rate": 6.140317460317461e-06, + "loss": 0.1896, + "step": 7830 + }, + { + "epoch": 22.008, + "grad_norm": 0.20025408267974854, + "learning_rate": 6.133968253968254e-06, + "loss": 0.72, + "step": 7840 + }, + { + "epoch": 22.00857142857143, + "grad_norm": 0.0833997055888176, + "learning_rate": 6.127619047619048e-06, + "loss": 0.0071, + "step": 7850 + }, + { + "epoch": 22.00914285714286, + "grad_norm": 0.018751641735434532, + "learning_rate": 6.121269841269841e-06, + "loss": 0.2319, + "step": 7860 + }, + { + "epoch": 22.009714285714285, + "grad_norm": 12.105188369750977, + "learning_rate": 6.114920634920636e-06, + "loss": 0.7782, + "step": 7870 + }, + { + "epoch": 22.010285714285715, + "grad_norm": 0.0960543304681778, + "learning_rate": 6.108571428571429e-06, + "loss": 0.0327, + "step": 7880 + }, + { + "epoch": 22.010857142857144, + "grad_norm": 12.760798454284668, + "learning_rate": 6.102222222222223e-06, + "loss": 0.5455, + "step": 7890 + }, + { + "epoch": 22.01142857142857, + "grad_norm": 0.19047430157661438, + "learning_rate": 6.095873015873016e-06, + "loss": 0.1856, + "step": 7900 + }, + { + "epoch": 22.012, + "grad_norm": 12.85616683959961, + "learning_rate": 6.08952380952381e-06, + "loss": 0.2649, + "step": 7910 + }, + { + "epoch": 22.01257142857143, + "grad_norm": 0.1496925801038742, + "learning_rate": 6.083174603174604e-06, + "loss": 0.756, + "step": 7920 + }, + { + "epoch": 22.013142857142856, + "grad_norm": 0.1137370839715004, + "learning_rate": 6.076825396825397e-06, + "loss": 1.1056, + "step": 7930 + }, + { + "epoch": 22.013714285714286, + "grad_norm": 14.750025749206543, + "learning_rate": 6.070476190476191e-06, + "loss": 0.5389, + "step": 7940 + }, + { + "epoch": 22.014285714285716, + "grad_norm": 0.15537060797214508, + "learning_rate": 6.064126984126984e-06, + "loss": 0.2203, + "step": 7950 + }, + { + "epoch": 22.014857142857142, + "grad_norm": 0.17418305575847626, + "learning_rate": 6.057777777777778e-06, + "loss": 0.5758, + "step": 7960 + }, + { + "epoch": 22.015428571428572, + "grad_norm": 0.3206419050693512, + "learning_rate": 6.051428571428571e-06, + "loss": 0.0278, + "step": 7970 + }, + { + "epoch": 22.016, + "grad_norm": 0.10264712572097778, + "learning_rate": 6.045079365079366e-06, + "loss": 0.4427, + "step": 7980 + }, + { + "epoch": 22.016571428571428, + "grad_norm": 0.08854754269123077, + "learning_rate": 6.038730158730159e-06, + "loss": 0.1925, + "step": 7990 + }, + { + "epoch": 22.017142857142858, + "grad_norm": 0.1728421151638031, + "learning_rate": 6.032380952380953e-06, + "loss": 0.3517, + "step": 8000 + }, + { + "epoch": 22.017714285714284, + "grad_norm": 0.07346148788928986, + "learning_rate": 6.026031746031746e-06, + "loss": 0.1056, + "step": 8010 + }, + { + "epoch": 22.018285714285714, + "grad_norm": 20.421218872070312, + "learning_rate": 6.01968253968254e-06, + "loss": 0.7583, + "step": 8020 + }, + { + "epoch": 22.018857142857144, + "grad_norm": 0.0952727198600769, + "learning_rate": 6.013333333333335e-06, + "loss": 0.3924, + "step": 8030 + }, + { + "epoch": 22.01942857142857, + "grad_norm": 139.4682159423828, + "learning_rate": 6.006984126984127e-06, + "loss": 0.1965, + "step": 8040 + }, + { + "epoch": 22.02, + "grad_norm": 0.15866141021251678, + "learning_rate": 6.000634920634921e-06, + "loss": 0.5742, + "step": 8050 + }, + { + "epoch": 22.02, + "eval_accuracy": 0.797979797979798, + "eval_loss": 0.9735172986984253, + "eval_runtime": 134.4584, + "eval_samples_per_second": 2.209, + "eval_steps_per_second": 1.108, + "step": 8050 + }, + { + "epoch": 23.00057142857143, + "grad_norm": 0.1783732771873474, + "learning_rate": 5.994285714285714e-06, + "loss": 0.1797, + "step": 8060 + }, + { + "epoch": 23.001142857142856, + "grad_norm": 0.5633581876754761, + "learning_rate": 5.987936507936508e-06, + "loss": 0.0058, + "step": 8070 + }, + { + "epoch": 23.001714285714286, + "grad_norm": 0.052664969116449356, + "learning_rate": 5.981587301587301e-06, + "loss": 0.3705, + "step": 8080 + }, + { + "epoch": 23.002285714285716, + "grad_norm": 1.9416977167129517, + "learning_rate": 5.9752380952380965e-06, + "loss": 0.8398, + "step": 8090 + }, + { + "epoch": 23.002857142857142, + "grad_norm": 2.0996170043945312, + "learning_rate": 5.96888888888889e-06, + "loss": 0.5296, + "step": 8100 + }, + { + "epoch": 23.00342857142857, + "grad_norm": 0.32839661836624146, + "learning_rate": 5.9625396825396835e-06, + "loss": 0.2443, + "step": 8110 + }, + { + "epoch": 23.004, + "grad_norm": 0.007853704504668713, + "learning_rate": 5.956190476190476e-06, + "loss": 0.0044, + "step": 8120 + }, + { + "epoch": 23.004571428571428, + "grad_norm": 12.491561889648438, + "learning_rate": 5.94984126984127e-06, + "loss": 0.4636, + "step": 8130 + }, + { + "epoch": 23.005142857142857, + "grad_norm": 0.10937748849391937, + "learning_rate": 5.943492063492063e-06, + "loss": 0.664, + "step": 8140 + }, + { + "epoch": 23.005714285714287, + "grad_norm": 0.1766849309206009, + "learning_rate": 5.937142857142858e-06, + "loss": 0.4253, + "step": 8150 + }, + { + "epoch": 23.006285714285713, + "grad_norm": 55.246822357177734, + "learning_rate": 5.930793650793652e-06, + "loss": 0.5592, + "step": 8160 + }, + { + "epoch": 23.006857142857143, + "grad_norm": 1.2145743370056152, + "learning_rate": 5.924444444444445e-06, + "loss": 0.0061, + "step": 8170 + }, + { + "epoch": 23.007428571428573, + "grad_norm": 5.3756794929504395, + "learning_rate": 5.918095238095239e-06, + "loss": 0.2953, + "step": 8180 + }, + { + "epoch": 23.008, + "grad_norm": 3.9541385173797607, + "learning_rate": 5.911746031746032e-06, + "loss": 0.5192, + "step": 8190 + }, + { + "epoch": 23.00857142857143, + "grad_norm": 0.25989338755607605, + "learning_rate": 5.9053968253968265e-06, + "loss": 0.4489, + "step": 8200 + }, + { + "epoch": 23.00914285714286, + "grad_norm": 0.16681231558322906, + "learning_rate": 5.89904761904762e-06, + "loss": 0.5296, + "step": 8210 + }, + { + "epoch": 23.009714285714285, + "grad_norm": 0.2039841264486313, + "learning_rate": 5.8926984126984135e-06, + "loss": 0.5324, + "step": 8220 + }, + { + "epoch": 23.010285714285715, + "grad_norm": 0.3426172137260437, + "learning_rate": 5.886349206349207e-06, + "loss": 0.663, + "step": 8230 + }, + { + "epoch": 23.010857142857144, + "grad_norm": 0.007993980310857296, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.2322, + "step": 8240 + }, + { + "epoch": 23.01142857142857, + "grad_norm": 0.28963732719421387, + "learning_rate": 5.873650793650794e-06, + "loss": 0.2086, + "step": 8250 + }, + { + "epoch": 23.012, + "grad_norm": 0.18354713916778564, + "learning_rate": 5.867301587301588e-06, + "loss": 0.0054, + "step": 8260 + }, + { + "epoch": 23.01257142857143, + "grad_norm": 0.2161131054162979, + "learning_rate": 5.860952380952382e-06, + "loss": 0.0039, + "step": 8270 + }, + { + "epoch": 23.013142857142856, + "grad_norm": 0.11740400642156601, + "learning_rate": 5.854603174603175e-06, + "loss": 0.4064, + "step": 8280 + }, + { + "epoch": 23.013714285714286, + "grad_norm": 150.37094116210938, + "learning_rate": 5.848253968253969e-06, + "loss": 0.0248, + "step": 8290 + }, + { + "epoch": 23.014285714285716, + "grad_norm": 0.1555628478527069, + "learning_rate": 5.841904761904762e-06, + "loss": 0.0036, + "step": 8300 + }, + { + "epoch": 23.014857142857142, + "grad_norm": 0.13599923253059387, + "learning_rate": 5.8355555555555565e-06, + "loss": 0.2426, + "step": 8310 + }, + { + "epoch": 23.015428571428572, + "grad_norm": 0.1228395327925682, + "learning_rate": 5.82920634920635e-06, + "loss": 0.8494, + "step": 8320 + }, + { + "epoch": 23.016, + "grad_norm": 295.5143127441406, + "learning_rate": 5.8228571428571435e-06, + "loss": 0.459, + "step": 8330 + }, + { + "epoch": 23.016571428571428, + "grad_norm": 14.874744415283203, + "learning_rate": 5.816507936507937e-06, + "loss": 0.4832, + "step": 8340 + }, + { + "epoch": 23.017142857142858, + "grad_norm": 0.10557269304990768, + "learning_rate": 5.8101587301587305e-06, + "loss": 0.2448, + "step": 8350 + }, + { + "epoch": 23.017714285714284, + "grad_norm": 0.11553741991519928, + "learning_rate": 5.803809523809524e-06, + "loss": 0.1697, + "step": 8360 + }, + { + "epoch": 23.018285714285714, + "grad_norm": 0.08351312577724457, + "learning_rate": 5.797460317460318e-06, + "loss": 0.4602, + "step": 8370 + }, + { + "epoch": 23.018857142857144, + "grad_norm": 6.845589637756348, + "learning_rate": 5.791111111111112e-06, + "loss": 0.3568, + "step": 8380 + }, + { + "epoch": 23.01942857142857, + "grad_norm": 0.0753844752907753, + "learning_rate": 5.784761904761905e-06, + "loss": 0.0102, + "step": 8390 + }, + { + "epoch": 23.02, + "grad_norm": 0.12463897466659546, + "learning_rate": 5.778412698412699e-06, + "loss": 0.2504, + "step": 8400 + }, + { + "epoch": 23.02, + "eval_accuracy": 0.8383838383838383, + "eval_loss": 0.8313712477684021, + "eval_runtime": 134.2181, + "eval_samples_per_second": 2.213, + "eval_steps_per_second": 1.11, + "step": 8400 + }, + { + "epoch": 24.00057142857143, + "grad_norm": 0.052970390766859055, + "learning_rate": 5.772063492063492e-06, + "loss": 0.2138, + "step": 8410 + }, + { + "epoch": 24.001142857142856, + "grad_norm": 0.05925741419196129, + "learning_rate": 5.7657142857142865e-06, + "loss": 0.2863, + "step": 8420 + }, + { + "epoch": 24.001714285714286, + "grad_norm": 0.0537530817091465, + "learning_rate": 5.75936507936508e-06, + "loss": 0.3034, + "step": 8430 + }, + { + "epoch": 24.002285714285716, + "grad_norm": 0.23769424855709076, + "learning_rate": 5.7530158730158735e-06, + "loss": 0.4167, + "step": 8440 + }, + { + "epoch": 24.002857142857142, + "grad_norm": 19.706758499145508, + "learning_rate": 5.746666666666667e-06, + "loss": 0.2434, + "step": 8450 + }, + { + "epoch": 24.00342857142857, + "grad_norm": 0.2779850363731384, + "learning_rate": 5.7403174603174605e-06, + "loss": 0.8818, + "step": 8460 + }, + { + "epoch": 24.004, + "grad_norm": 0.29331010580062866, + "learning_rate": 5.733968253968254e-06, + "loss": 0.226, + "step": 8470 + }, + { + "epoch": 24.004571428571428, + "grad_norm": 15.611747741699219, + "learning_rate": 5.727619047619048e-06, + "loss": 0.2733, + "step": 8480 + }, + { + "epoch": 24.005142857142857, + "grad_norm": 17.301841735839844, + "learning_rate": 5.721269841269842e-06, + "loss": 0.2518, + "step": 8490 + }, + { + "epoch": 24.005714285714287, + "grad_norm": 0.828689455986023, + "learning_rate": 5.714920634920635e-06, + "loss": 0.0048, + "step": 8500 + }, + { + "epoch": 24.006285714285713, + "grad_norm": 0.07569330185651779, + "learning_rate": 5.708571428571429e-06, + "loss": 0.2393, + "step": 8510 + }, + { + "epoch": 24.006857142857143, + "grad_norm": 0.027848446741700172, + "learning_rate": 5.702222222222222e-06, + "loss": 1.2843, + "step": 8520 + }, + { + "epoch": 24.007428571428573, + "grad_norm": 0.09675378352403641, + "learning_rate": 5.6958730158730165e-06, + "loss": 0.006, + "step": 8530 + }, + { + "epoch": 24.008, + "grad_norm": 0.08907628059387207, + "learning_rate": 5.68952380952381e-06, + "loss": 0.7083, + "step": 8540 + }, + { + "epoch": 24.00857142857143, + "grad_norm": 0.0546133778989315, + "learning_rate": 5.6831746031746035e-06, + "loss": 0.5869, + "step": 8550 + }, + { + "epoch": 24.00914285714286, + "grad_norm": 0.08426523208618164, + "learning_rate": 5.676825396825397e-06, + "loss": 0.2361, + "step": 8560 + }, + { + "epoch": 24.009714285714285, + "grad_norm": 0.1244712769985199, + "learning_rate": 5.6704761904761905e-06, + "loss": 0.0052, + "step": 8570 + }, + { + "epoch": 24.010285714285715, + "grad_norm": 0.15895043313503265, + "learning_rate": 5.664126984126984e-06, + "loss": 0.4814, + "step": 8580 + }, + { + "epoch": 24.010857142857144, + "grad_norm": 0.24632836878299713, + "learning_rate": 5.657777777777778e-06, + "loss": 0.0043, + "step": 8590 + }, + { + "epoch": 24.01142857142857, + "grad_norm": 245.2207794189453, + "learning_rate": 5.651428571428572e-06, + "loss": 0.7302, + "step": 8600 + }, + { + "epoch": 24.012, + "grad_norm": 0.1819450557231903, + "learning_rate": 5.645079365079365e-06, + "loss": 0.0047, + "step": 8610 + }, + { + "epoch": 24.01257142857143, + "grad_norm": 0.07300709933042526, + "learning_rate": 5.638730158730159e-06, + "loss": 0.6173, + "step": 8620 + }, + { + "epoch": 24.013142857142856, + "grad_norm": 0.014348522759974003, + "learning_rate": 5.632380952380952e-06, + "loss": 0.3765, + "step": 8630 + }, + { + "epoch": 24.013714285714286, + "grad_norm": 76.59358978271484, + "learning_rate": 5.626031746031746e-06, + "loss": 0.7209, + "step": 8640 + }, + { + "epoch": 24.014285714285716, + "grad_norm": 0.1954093724489212, + "learning_rate": 5.619682539682541e-06, + "loss": 0.2304, + "step": 8650 + }, + { + "epoch": 24.014857142857142, + "grad_norm": 13.528507232666016, + "learning_rate": 5.613333333333334e-06, + "loss": 0.6376, + "step": 8660 + }, + { + "epoch": 24.015428571428572, + "grad_norm": 12.869405746459961, + "learning_rate": 5.606984126984127e-06, + "loss": 0.5984, + "step": 8670 + }, + { + "epoch": 24.016, + "grad_norm": 0.1572108119726181, + "learning_rate": 5.6006349206349205e-06, + "loss": 0.0176, + "step": 8680 + }, + { + "epoch": 24.016571428571428, + "grad_norm": 0.20609256625175476, + "learning_rate": 5.594285714285714e-06, + "loss": 0.4308, + "step": 8690 + }, + { + "epoch": 24.017142857142858, + "grad_norm": 0.07268011569976807, + "learning_rate": 5.587936507936509e-06, + "loss": 0.4066, + "step": 8700 + }, + { + "epoch": 24.017714285714284, + "grad_norm": 13.543063163757324, + "learning_rate": 5.581587301587303e-06, + "loss": 0.7049, + "step": 8710 + }, + { + "epoch": 24.018285714285714, + "grad_norm": 19.77041244506836, + "learning_rate": 5.575238095238096e-06, + "loss": 0.4488, + "step": 8720 + }, + { + "epoch": 24.018857142857144, + "grad_norm": 0.14645476639270782, + "learning_rate": 5.56888888888889e-06, + "loss": 0.7006, + "step": 8730 + }, + { + "epoch": 24.01942857142857, + "grad_norm": 0.013115695677697659, + "learning_rate": 5.562539682539683e-06, + "loss": 0.0082, + "step": 8740 + }, + { + "epoch": 24.02, + "grad_norm": 0.41332682967185974, + "learning_rate": 5.556190476190476e-06, + "loss": 0.8514, + "step": 8750 + }, + { + "epoch": 24.02, + "eval_accuracy": 0.8417508417508418, + "eval_loss": 0.7481423616409302, + "eval_runtime": 137.4085, + "eval_samples_per_second": 2.161, + "eval_steps_per_second": 1.084, + "step": 8750 + }, + { + "epoch": 25.00057142857143, + "grad_norm": 0.371246874332428, + "learning_rate": 5.549841269841271e-06, + "loss": 0.6345, + "step": 8760 + }, + { + "epoch": 25.001142857142856, + "grad_norm": 0.7499510645866394, + "learning_rate": 5.543492063492064e-06, + "loss": 0.1388, + "step": 8770 + }, + { + "epoch": 25.001714285714286, + "grad_norm": 0.013739760965108871, + "learning_rate": 5.537142857142858e-06, + "loss": 0.5583, + "step": 8780 + }, + { + "epoch": 25.002285714285716, + "grad_norm": 190.9663543701172, + "learning_rate": 5.530793650793651e-06, + "loss": 0.7159, + "step": 8790 + }, + { + "epoch": 25.002857142857142, + "grad_norm": 256.3753356933594, + "learning_rate": 5.524444444444445e-06, + "loss": 0.0527, + "step": 8800 + }, + { + "epoch": 25.00342857142857, + "grad_norm": 7.645998001098633, + "learning_rate": 5.518095238095239e-06, + "loss": 0.5753, + "step": 8810 + }, + { + "epoch": 25.004, + "grad_norm": 0.10563361644744873, + "learning_rate": 5.511746031746033e-06, + "loss": 0.632, + "step": 8820 + }, + { + "epoch": 25.004571428571428, + "grad_norm": 27.83478546142578, + "learning_rate": 5.505396825396826e-06, + "loss": 0.3274, + "step": 8830 + }, + { + "epoch": 25.005142857142857, + "grad_norm": 0.09404099732637405, + "learning_rate": 5.49904761904762e-06, + "loss": 0.0059, + "step": 8840 + }, + { + "epoch": 25.005714285714287, + "grad_norm": 0.031015774235129356, + "learning_rate": 5.492698412698413e-06, + "loss": 0.0043, + "step": 8850 + }, + { + "epoch": 25.006285714285713, + "grad_norm": 0.20480813086032867, + "learning_rate": 5.4863492063492066e-06, + "loss": 0.5294, + "step": 8860 + }, + { + "epoch": 25.006857142857143, + "grad_norm": 125.27932739257812, + "learning_rate": 5.480000000000001e-06, + "loss": 1.0979, + "step": 8870 + }, + { + "epoch": 25.007428571428573, + "grad_norm": 0.12436144798994064, + "learning_rate": 5.473650793650794e-06, + "loss": 0.4211, + "step": 8880 + }, + { + "epoch": 25.008, + "grad_norm": 0.09650959074497223, + "learning_rate": 5.467301587301588e-06, + "loss": 0.2128, + "step": 8890 + }, + { + "epoch": 25.00857142857143, + "grad_norm": 0.09640531986951828, + "learning_rate": 5.460952380952381e-06, + "loss": 0.0061, + "step": 8900 + }, + { + "epoch": 25.00914285714286, + "grad_norm": 0.5898492932319641, + "learning_rate": 5.454603174603175e-06, + "loss": 0.3704, + "step": 8910 + }, + { + "epoch": 25.009714285714285, + "grad_norm": 0.0638417899608612, + "learning_rate": 5.448253968253969e-06, + "loss": 0.2117, + "step": 8920 + }, + { + "epoch": 25.010285714285715, + "grad_norm": 0.08020645380020142, + "learning_rate": 5.441904761904763e-06, + "loss": 0.4894, + "step": 8930 + }, + { + "epoch": 25.010857142857144, + "grad_norm": 0.15978746116161346, + "learning_rate": 5.435555555555556e-06, + "loss": 0.3888, + "step": 8940 + }, + { + "epoch": 25.01142857142857, + "grad_norm": 0.06692436337471008, + "learning_rate": 5.42920634920635e-06, + "loss": 0.2216, + "step": 8950 + }, + { + "epoch": 25.012, + "grad_norm": 0.41444236040115356, + "learning_rate": 5.422857142857143e-06, + "loss": 0.7492, + "step": 8960 + }, + { + "epoch": 25.01257142857143, + "grad_norm": 0.08462107926607132, + "learning_rate": 5.4165079365079366e-06, + "loss": 0.9321, + "step": 8970 + }, + { + "epoch": 25.013142857142856, + "grad_norm": 0.08357837796211243, + "learning_rate": 5.410158730158731e-06, + "loss": 0.0149, + "step": 8980 + }, + { + "epoch": 25.013714285714286, + "grad_norm": 20.435392379760742, + "learning_rate": 5.403809523809524e-06, + "loss": 0.6908, + "step": 8990 + }, + { + "epoch": 25.014285714285716, + "grad_norm": 0.18590255081653595, + "learning_rate": 5.397460317460318e-06, + "loss": 0.2268, + "step": 9000 + }, + { + "epoch": 25.014857142857142, + "grad_norm": 0.013719220645725727, + "learning_rate": 5.391111111111111e-06, + "loss": 0.198, + "step": 9010 + }, + { + "epoch": 25.015428571428572, + "grad_norm": 0.004588362295180559, + "learning_rate": 5.384761904761905e-06, + "loss": 0.0098, + "step": 9020 + }, + { + "epoch": 25.016, + "grad_norm": 0.07363064587116241, + "learning_rate": 5.378412698412698e-06, + "loss": 0.3217, + "step": 9030 + }, + { + "epoch": 25.016571428571428, + "grad_norm": 0.056400805711746216, + "learning_rate": 5.372063492063493e-06, + "loss": 0.2697, + "step": 9040 + }, + { + "epoch": 25.017142857142858, + "grad_norm": 84.72920989990234, + "learning_rate": 5.365714285714286e-06, + "loss": 0.0159, + "step": 9050 + }, + { + "epoch": 25.017714285714284, + "grad_norm": 17.841381072998047, + "learning_rate": 5.35936507936508e-06, + "loss": 0.3147, + "step": 9060 + }, + { + "epoch": 25.018285714285714, + "grad_norm": 17.061647415161133, + "learning_rate": 5.353015873015873e-06, + "loss": 0.5654, + "step": 9070 + }, + { + "epoch": 25.018857142857144, + "grad_norm": 0.8570396304130554, + "learning_rate": 5.346666666666667e-06, + "loss": 0.5723, + "step": 9080 + }, + { + "epoch": 25.01942857142857, + "grad_norm": 58.06008529663086, + "learning_rate": 5.340317460317461e-06, + "loss": 1.2434, + "step": 9090 + }, + { + "epoch": 25.02, + "grad_norm": 207.67483520507812, + "learning_rate": 5.333968253968254e-06, + "loss": 0.8148, + "step": 9100 + }, + { + "epoch": 25.02, + "eval_accuracy": 0.8383838383838383, + "eval_loss": 0.7210359573364258, + "eval_runtime": 137.5062, + "eval_samples_per_second": 2.16, + "eval_steps_per_second": 1.084, + "step": 9100 + }, + { + "epoch": 26.00057142857143, + "grad_norm": 0.09444784373044968, + "learning_rate": 5.327619047619048e-06, + "loss": 0.3524, + "step": 9110 + }, + { + "epoch": 26.001142857142856, + "grad_norm": 0.26179543137550354, + "learning_rate": 5.321269841269841e-06, + "loss": 0.5438, + "step": 9120 + }, + { + "epoch": 26.001714285714286, + "grad_norm": 2.8941266536712646, + "learning_rate": 5.314920634920635e-06, + "loss": 0.4125, + "step": 9130 + }, + { + "epoch": 26.002285714285716, + "grad_norm": 0.1553201973438263, + "learning_rate": 5.308571428571428e-06, + "loss": 0.0053, + "step": 9140 + }, + { + "epoch": 26.002857142857142, + "grad_norm": 0.08845008164644241, + "learning_rate": 5.302222222222223e-06, + "loss": 0.5051, + "step": 9150 + }, + { + "epoch": 26.00342857142857, + "grad_norm": 0.11578679084777832, + "learning_rate": 5.295873015873016e-06, + "loss": 0.2748, + "step": 9160 + }, + { + "epoch": 26.004, + "grad_norm": 0.14951874315738678, + "learning_rate": 5.28952380952381e-06, + "loss": 0.1894, + "step": 9170 + }, + { + "epoch": 26.004571428571428, + "grad_norm": 0.0727708712220192, + "learning_rate": 5.283174603174603e-06, + "loss": 0.6826, + "step": 9180 + }, + { + "epoch": 26.005142857142857, + "grad_norm": 0.05796307697892189, + "learning_rate": 5.276825396825397e-06, + "loss": 0.5291, + "step": 9190 + }, + { + "epoch": 26.005714285714287, + "grad_norm": 0.10209472477436066, + "learning_rate": 5.270476190476192e-06, + "loss": 0.0044, + "step": 9200 + }, + { + "epoch": 26.006285714285713, + "grad_norm": 0.008814401924610138, + "learning_rate": 5.264126984126985e-06, + "loss": 0.5146, + "step": 9210 + }, + { + "epoch": 26.006857142857143, + "grad_norm": 0.3405003249645233, + "learning_rate": 5.257777777777779e-06, + "loss": 0.3465, + "step": 9220 + }, + { + "epoch": 26.007428571428573, + "grad_norm": 0.09109217673540115, + "learning_rate": 5.251428571428571e-06, + "loss": 0.1963, + "step": 9230 + }, + { + "epoch": 26.008, + "grad_norm": 0.05053064227104187, + "learning_rate": 5.245079365079365e-06, + "loss": 0.197, + "step": 9240 + }, + { + "epoch": 26.00857142857143, + "grad_norm": 0.01660529151558876, + "learning_rate": 5.238730158730158e-06, + "loss": 0.4736, + "step": 9250 + }, + { + "epoch": 26.00914285714286, + "grad_norm": 0.015270842239260674, + "learning_rate": 5.2323809523809535e-06, + "loss": 0.4788, + "step": 9260 + }, + { + "epoch": 26.009714285714285, + "grad_norm": 0.09976387768983841, + "learning_rate": 5.226031746031747e-06, + "loss": 0.002, + "step": 9270 + }, + { + "epoch": 26.010285714285715, + "grad_norm": 0.19991345703601837, + "learning_rate": 5.2196825396825405e-06, + "loss": 0.0036, + "step": 9280 + }, + { + "epoch": 26.010857142857144, + "grad_norm": 0.12031792104244232, + "learning_rate": 5.213333333333334e-06, + "loss": 0.6917, + "step": 9290 + }, + { + "epoch": 26.01142857142857, + "grad_norm": 0.051638491451740265, + "learning_rate": 5.2069841269841274e-06, + "loss": 0.7381, + "step": 9300 + }, + { + "epoch": 26.012, + "grad_norm": 91.03575897216797, + "learning_rate": 5.200634920634922e-06, + "loss": 0.2566, + "step": 9310 + }, + { + "epoch": 26.01257142857143, + "grad_norm": 0.1020565778017044, + "learning_rate": 5.194285714285715e-06, + "loss": 0.2807, + "step": 9320 + }, + { + "epoch": 26.013142857142856, + "grad_norm": 0.1554216891527176, + "learning_rate": 5.187936507936509e-06, + "loss": 0.2259, + "step": 9330 + }, + { + "epoch": 26.013714285714286, + "grad_norm": 0.09728459268808365, + "learning_rate": 5.181587301587302e-06, + "loss": 0.4738, + "step": 9340 + }, + { + "epoch": 26.014285714285716, + "grad_norm": 0.7819724082946777, + "learning_rate": 5.175238095238096e-06, + "loss": 0.2462, + "step": 9350 + }, + { + "epoch": 26.014857142857142, + "grad_norm": 0.19601747393608093, + "learning_rate": 5.168888888888889e-06, + "loss": 0.3972, + "step": 9360 + }, + { + "epoch": 26.015428571428572, + "grad_norm": 0.21019776165485382, + "learning_rate": 5.1625396825396835e-06, + "loss": 0.5798, + "step": 9370 + }, + { + "epoch": 26.016, + "grad_norm": 0.13152237236499786, + "learning_rate": 5.156190476190477e-06, + "loss": 0.0422, + "step": 9380 + }, + { + "epoch": 26.016571428571428, + "grad_norm": 42.05175018310547, + "learning_rate": 5.1498412698412705e-06, + "loss": 0.2365, + "step": 9390 + }, + { + "epoch": 26.017142857142858, + "grad_norm": 0.06394017487764359, + "learning_rate": 5.143492063492064e-06, + "loss": 0.2123, + "step": 9400 + }, + { + "epoch": 26.017714285714284, + "grad_norm": 0.1293639838695526, + "learning_rate": 5.1371428571428574e-06, + "loss": 0.2474, + "step": 9410 + }, + { + "epoch": 26.018285714285714, + "grad_norm": 0.46589070558547974, + "learning_rate": 5.130793650793651e-06, + "loss": 0.5852, + "step": 9420 + }, + { + "epoch": 26.018857142857144, + "grad_norm": 0.04075726494193077, + "learning_rate": 5.124444444444445e-06, + "loss": 0.4732, + "step": 9430 + }, + { + "epoch": 26.01942857142857, + "grad_norm": 0.08269976824522018, + "learning_rate": 5.118095238095239e-06, + "loss": 0.1798, + "step": 9440 + }, + { + "epoch": 26.02, + "grad_norm": 0.038522034883499146, + "learning_rate": 5.111746031746032e-06, + "loss": 0.2594, + "step": 9450 + }, + { + "epoch": 26.02, + "eval_accuracy": 0.8249158249158249, + "eval_loss": 0.9979982972145081, + "eval_runtime": 126.233, + "eval_samples_per_second": 2.353, + "eval_steps_per_second": 1.18, + "step": 9450 + }, + { + "epoch": 27.00057142857143, + "grad_norm": 0.03514016792178154, + "learning_rate": 5.105396825396826e-06, + "loss": 0.6331, + "step": 9460 + }, + { + "epoch": 27.001142857142856, + "grad_norm": 0.039175134152173996, + "learning_rate": 5.099047619047619e-06, + "loss": 0.0031, + "step": 9470 + }, + { + "epoch": 27.001714285714286, + "grad_norm": 0.017468813806772232, + "learning_rate": 5.0926984126984135e-06, + "loss": 0.0028, + "step": 9480 + }, + { + "epoch": 27.002285714285716, + "grad_norm": 3.9538137912750244, + "learning_rate": 5.086349206349207e-06, + "loss": 0.0074, + "step": 9490 + }, + { + "epoch": 27.002857142857142, + "grad_norm": 0.040926579385995865, + "learning_rate": 5.0800000000000005e-06, + "loss": 0.7674, + "step": 9500 + }, + { + "epoch": 27.00342857142857, + "grad_norm": 0.15430638194084167, + "learning_rate": 5.073650793650794e-06, + "loss": 0.0668, + "step": 9510 + }, + { + "epoch": 27.004, + "grad_norm": 36.84708786010742, + "learning_rate": 5.0673015873015875e-06, + "loss": 0.4732, + "step": 9520 + }, + { + "epoch": 27.004571428571428, + "grad_norm": 0.04013565182685852, + "learning_rate": 5.060952380952381e-06, + "loss": 0.0228, + "step": 9530 + }, + { + "epoch": 27.005142857142857, + "grad_norm": 0.029689166694879532, + "learning_rate": 5.054603174603175e-06, + "loss": 0.2372, + "step": 9540 + }, + { + "epoch": 27.005714285714287, + "grad_norm": 0.0034933576826006174, + "learning_rate": 5.048253968253969e-06, + "loss": 0.2009, + "step": 9550 + }, + { + "epoch": 27.006285714285713, + "grad_norm": 22.289688110351562, + "learning_rate": 5.041904761904762e-06, + "loss": 0.3468, + "step": 9560 + }, + { + "epoch": 27.006857142857143, + "grad_norm": 0.05222174897789955, + "learning_rate": 5.035555555555556e-06, + "loss": 0.266, + "step": 9570 + }, + { + "epoch": 27.007428571428573, + "grad_norm": 0.08454541116952896, + "learning_rate": 5.029206349206349e-06, + "loss": 0.543, + "step": 9580 + }, + { + "epoch": 27.008, + "grad_norm": 15.091740608215332, + "learning_rate": 5.0228571428571435e-06, + "loss": 0.2849, + "step": 9590 + }, + { + "epoch": 27.00857142857143, + "grad_norm": 108.28709411621094, + "learning_rate": 5.016507936507937e-06, + "loss": 0.5846, + "step": 9600 + }, + { + "epoch": 27.00914285714286, + "grad_norm": 37.8171501159668, + "learning_rate": 5.0101587301587305e-06, + "loss": 0.3388, + "step": 9610 + }, + { + "epoch": 27.009714285714285, + "grad_norm": 0.05814701318740845, + "learning_rate": 5.003809523809524e-06, + "loss": 0.1932, + "step": 9620 + }, + { + "epoch": 27.010285714285715, + "grad_norm": 0.018001697957515717, + "learning_rate": 4.997460317460318e-06, + "loss": 0.1371, + "step": 9630 + }, + { + "epoch": 27.010857142857144, + "grad_norm": 0.20318511128425598, + "learning_rate": 4.991111111111112e-06, + "loss": 0.3923, + "step": 9640 + }, + { + "epoch": 27.01142857142857, + "grad_norm": 0.07720087468624115, + "learning_rate": 4.984761904761905e-06, + "loss": 0.0501, + "step": 9650 + }, + { + "epoch": 27.012, + "grad_norm": 19.00177764892578, + "learning_rate": 4.978412698412699e-06, + "loss": 0.542, + "step": 9660 + }, + { + "epoch": 27.01257142857143, + "grad_norm": 0.011382623575627804, + "learning_rate": 4.972063492063492e-06, + "loss": 0.2808, + "step": 9670 + }, + { + "epoch": 27.013142857142856, + "grad_norm": 0.14707712829113007, + "learning_rate": 4.965714285714286e-06, + "loss": 0.3066, + "step": 9680 + }, + { + "epoch": 27.013714285714286, + "grad_norm": 0.0880795419216156, + "learning_rate": 4.95936507936508e-06, + "loss": 0.3369, + "step": 9690 + }, + { + "epoch": 27.014285714285716, + "grad_norm": 91.44270324707031, + "learning_rate": 4.9530158730158735e-06, + "loss": 0.1431, + "step": 9700 + }, + { + "epoch": 27.014857142857142, + "grad_norm": 0.03979627415537834, + "learning_rate": 4.946666666666667e-06, + "loss": 0.2022, + "step": 9710 + }, + { + "epoch": 27.015428571428572, + "grad_norm": 0.06051446869969368, + "learning_rate": 4.9403174603174605e-06, + "loss": 1.1248, + "step": 9720 + }, + { + "epoch": 27.016, + "grad_norm": 0.011192934587597847, + "learning_rate": 4.933968253968254e-06, + "loss": 0.5604, + "step": 9730 + }, + { + "epoch": 27.016571428571428, + "grad_norm": 0.013154531829059124, + "learning_rate": 4.9276190476190475e-06, + "loss": 0.1641, + "step": 9740 + }, + { + "epoch": 27.017142857142858, + "grad_norm": 0.16332918405532837, + "learning_rate": 4.921269841269842e-06, + "loss": 0.2698, + "step": 9750 + }, + { + "epoch": 27.017714285714284, + "grad_norm": 0.12617675960063934, + "learning_rate": 4.914920634920635e-06, + "loss": 0.5508, + "step": 9760 + }, + { + "epoch": 27.018285714285714, + "grad_norm": 0.007707640528678894, + "learning_rate": 4.90857142857143e-06, + "loss": 0.5326, + "step": 9770 + }, + { + "epoch": 27.018857142857144, + "grad_norm": 0.10187384486198425, + "learning_rate": 4.902222222222222e-06, + "loss": 0.0046, + "step": 9780 + }, + { + "epoch": 27.01942857142857, + "grad_norm": 0.09700454026460648, + "learning_rate": 4.895873015873016e-06, + "loss": 0.4082, + "step": 9790 + }, + { + "epoch": 27.02, + "grad_norm": 0.1347123682498932, + "learning_rate": 4.88952380952381e-06, + "loss": 0.6742, + "step": 9800 + }, + { + "epoch": 27.02, + "eval_accuracy": 0.8585858585858586, + "eval_loss": 0.7987341284751892, + "eval_runtime": 126.3026, + "eval_samples_per_second": 2.351, + "eval_steps_per_second": 1.18, + "step": 9800 + }, + { + "epoch": 28.00057142857143, + "grad_norm": 0.007627937477082014, + "learning_rate": 4.8831746031746035e-06, + "loss": 0.8211, + "step": 9810 + }, + { + "epoch": 28.001142857142856, + "grad_norm": 0.5802029967308044, + "learning_rate": 4.876825396825397e-06, + "loss": 0.0109, + "step": 9820 + }, + { + "epoch": 28.001714285714286, + "grad_norm": 0.045871149748563766, + "learning_rate": 4.870476190476191e-06, + "loss": 0.0037, + "step": 9830 + }, + { + "epoch": 28.002285714285716, + "grad_norm": 17.630939483642578, + "learning_rate": 4.864126984126985e-06, + "loss": 1.0221, + "step": 9840 + }, + { + "epoch": 28.002857142857142, + "grad_norm": 0.20201857388019562, + "learning_rate": 4.857777777777778e-06, + "loss": 0.3867, + "step": 9850 + }, + { + "epoch": 28.00342857142857, + "grad_norm": 19.86452293395996, + "learning_rate": 4.851428571428572e-06, + "loss": 0.0709, + "step": 9860 + }, + { + "epoch": 28.004, + "grad_norm": 24.085710525512695, + "learning_rate": 4.845079365079365e-06, + "loss": 0.1767, + "step": 9870 + }, + { + "epoch": 28.004571428571428, + "grad_norm": 0.37033602595329285, + "learning_rate": 4.83873015873016e-06, + "loss": 0.3709, + "step": 9880 + }, + { + "epoch": 28.005142857142857, + "grad_norm": 0.0676516443490982, + "learning_rate": 4.832380952380953e-06, + "loss": 0.6114, + "step": 9890 + }, + { + "epoch": 28.005714285714287, + "grad_norm": 0.13778136670589447, + "learning_rate": 4.826031746031747e-06, + "loss": 0.1078, + "step": 9900 + }, + { + "epoch": 28.006285714285713, + "grad_norm": 292.9512939453125, + "learning_rate": 4.81968253968254e-06, + "loss": 0.6608, + "step": 9910 + }, + { + "epoch": 28.006857142857143, + "grad_norm": 0.08069964498281479, + "learning_rate": 4.8133333333333336e-06, + "loss": 0.6372, + "step": 9920 + }, + { + "epoch": 28.007428571428573, + "grad_norm": 16.857074737548828, + "learning_rate": 4.806984126984127e-06, + "loss": 0.3145, + "step": 9930 + }, + { + "epoch": 28.008, + "grad_norm": 0.1705380231142044, + "learning_rate": 4.800634920634921e-06, + "loss": 0.5308, + "step": 9940 + }, + { + "epoch": 28.00857142857143, + "grad_norm": 0.19721141457557678, + "learning_rate": 4.794285714285715e-06, + "loss": 0.2503, + "step": 9950 + }, + { + "epoch": 28.00914285714286, + "grad_norm": 0.018095174804329872, + "learning_rate": 4.787936507936508e-06, + "loss": 0.0216, + "step": 9960 + }, + { + "epoch": 28.009714285714285, + "grad_norm": 0.058308668434619904, + "learning_rate": 4.781587301587302e-06, + "loss": 0.4907, + "step": 9970 + }, + { + "epoch": 28.010285714285715, + "grad_norm": 0.18544158339500427, + "learning_rate": 4.775238095238095e-06, + "loss": 0.2361, + "step": 9980 + }, + { + "epoch": 28.010857142857144, + "grad_norm": 0.18984903395175934, + "learning_rate": 4.768888888888889e-06, + "loss": 0.1339, + "step": 9990 + }, + { + "epoch": 28.01142857142857, + "grad_norm": 117.79344177246094, + "learning_rate": 4.762539682539683e-06, + "loss": 0.496, + "step": 10000 + }, + { + "epoch": 28.012, + "grad_norm": 0.006381293758749962, + "learning_rate": 4.756190476190477e-06, + "loss": 0.0039, + "step": 10010 + }, + { + "epoch": 28.01257142857143, + "grad_norm": 321.21435546875, + "learning_rate": 4.74984126984127e-06, + "loss": 0.5793, + "step": 10020 + }, + { + "epoch": 28.013142857142856, + "grad_norm": 0.06357376277446747, + "learning_rate": 4.7434920634920636e-06, + "loss": 0.0057, + "step": 10030 + }, + { + "epoch": 28.013714285714286, + "grad_norm": 0.2022400200366974, + "learning_rate": 4.737142857142857e-06, + "loss": 0.3293, + "step": 10040 + }, + { + "epoch": 28.014285714285716, + "grad_norm": 0.050252072513103485, + "learning_rate": 4.730793650793651e-06, + "loss": 0.0027, + "step": 10050 + }, + { + "epoch": 28.014857142857142, + "grad_norm": 0.07211655378341675, + "learning_rate": 4.724444444444445e-06, + "loss": 0.0018, + "step": 10060 + }, + { + "epoch": 28.015428571428572, + "grad_norm": 0.012892846018075943, + "learning_rate": 4.718095238095238e-06, + "loss": 0.3652, + "step": 10070 + }, + { + "epoch": 28.016, + "grad_norm": 0.07106824219226837, + "learning_rate": 4.711746031746033e-06, + "loss": 0.6978, + "step": 10080 + }, + { + "epoch": 28.016571428571428, + "grad_norm": 131.25204467773438, + "learning_rate": 4.705396825396826e-06, + "loss": 0.0722, + "step": 10090 + }, + { + "epoch": 28.017142857142858, + "grad_norm": 0.002385256579145789, + "learning_rate": 4.699047619047619e-06, + "loss": 0.0026, + "step": 10100 + }, + { + "epoch": 28.017714285714284, + "grad_norm": 0.15431569516658783, + "learning_rate": 4.692698412698413e-06, + "loss": 0.3684, + "step": 10110 + }, + { + "epoch": 28.018285714285714, + "grad_norm": 0.08459474891424179, + "learning_rate": 4.686349206349207e-06, + "loss": 0.4475, + "step": 10120 + }, + { + "epoch": 28.018857142857144, + "grad_norm": 0.012470588088035583, + "learning_rate": 4.680000000000001e-06, + "loss": 0.0172, + "step": 10130 + }, + { + "epoch": 28.01942857142857, + "grad_norm": 19.614744186401367, + "learning_rate": 4.673650793650794e-06, + "loss": 0.6554, + "step": 10140 + }, + { + "epoch": 28.02, + "grad_norm": 0.002492617815732956, + "learning_rate": 4.667301587301588e-06, + "loss": 0.0063, + "step": 10150 + }, + { + "epoch": 28.02, + "eval_accuracy": 0.8316498316498316, + "eval_loss": 0.9369211196899414, + "eval_runtime": 126.2783, + "eval_samples_per_second": 2.352, + "eval_steps_per_second": 1.18, + "step": 10150 + }, + { + "epoch": 29.00057142857143, + "grad_norm": 0.11610179394483566, + "learning_rate": 4.660952380952381e-06, + "loss": 0.0046, + "step": 10160 + }, + { + "epoch": 29.001142857142856, + "grad_norm": 13.605859756469727, + "learning_rate": 4.654603174603175e-06, + "loss": 0.5286, + "step": 10170 + }, + { + "epoch": 29.001714285714286, + "grad_norm": 0.03343842178583145, + "learning_rate": 4.648253968253968e-06, + "loss": 0.5034, + "step": 10180 + }, + { + "epoch": 29.002285714285716, + "grad_norm": 0.25074559450149536, + "learning_rate": 4.641904761904763e-06, + "loss": 0.1946, + "step": 10190 + }, + { + "epoch": 29.002857142857142, + "grad_norm": 0.0782301276922226, + "learning_rate": 4.635555555555556e-06, + "loss": 0.0034, + "step": 10200 + }, + { + "epoch": 29.00342857142857, + "grad_norm": 0.037485599517822266, + "learning_rate": 4.62920634920635e-06, + "loss": 0.183, + "step": 10210 + }, + { + "epoch": 29.004, + "grad_norm": 0.03207986056804657, + "learning_rate": 4.622857142857143e-06, + "loss": 0.7524, + "step": 10220 + }, + { + "epoch": 29.004571428571428, + "grad_norm": 0.07413322478532791, + "learning_rate": 4.616507936507937e-06, + "loss": 0.2849, + "step": 10230 + }, + { + "epoch": 29.005142857142857, + "grad_norm": 19.703533172607422, + "learning_rate": 4.61015873015873e-06, + "loss": 1.0924, + "step": 10240 + }, + { + "epoch": 29.005714285714287, + "grad_norm": 13.900262832641602, + "learning_rate": 4.603809523809524e-06, + "loss": 0.6464, + "step": 10250 + }, + { + "epoch": 29.006285714285713, + "grad_norm": 0.2331477850675583, + "learning_rate": 4.597460317460318e-06, + "loss": 0.0042, + "step": 10260 + }, + { + "epoch": 29.006857142857143, + "grad_norm": 0.0446133092045784, + "learning_rate": 4.591111111111111e-06, + "loss": 0.0039, + "step": 10270 + }, + { + "epoch": 29.007428571428573, + "grad_norm": 0.03788290172815323, + "learning_rate": 4.584761904761905e-06, + "loss": 0.4045, + "step": 10280 + }, + { + "epoch": 29.008, + "grad_norm": 0.11208215355873108, + "learning_rate": 4.578412698412698e-06, + "loss": 0.2883, + "step": 10290 + }, + { + "epoch": 29.00857142857143, + "grad_norm": 0.05422298610210419, + "learning_rate": 4.572063492063493e-06, + "loss": 0.0113, + "step": 10300 + }, + { + "epoch": 29.00914285714286, + "grad_norm": 0.003405461786314845, + "learning_rate": 4.565714285714286e-06, + "loss": 0.0028, + "step": 10310 + }, + { + "epoch": 29.009714285714285, + "grad_norm": 0.04415423423051834, + "learning_rate": 4.55936507936508e-06, + "loss": 0.2427, + "step": 10320 + }, + { + "epoch": 29.010285714285715, + "grad_norm": 15.168233871459961, + "learning_rate": 4.553015873015873e-06, + "loss": 0.7346, + "step": 10330 + }, + { + "epoch": 29.010857142857144, + "grad_norm": 0.021887382492423058, + "learning_rate": 4.546666666666667e-06, + "loss": 0.2257, + "step": 10340 + }, + { + "epoch": 29.01142857142857, + "grad_norm": 0.03137199580669403, + "learning_rate": 4.54031746031746e-06, + "loss": 0.249, + "step": 10350 + }, + { + "epoch": 29.012, + "grad_norm": 0.06243215128779411, + "learning_rate": 4.5339682539682544e-06, + "loss": 0.0026, + "step": 10360 + }, + { + "epoch": 29.01257142857143, + "grad_norm": 0.0038311234675347805, + "learning_rate": 4.527619047619048e-06, + "loss": 0.1538, + "step": 10370 + }, + { + "epoch": 29.013142857142856, + "grad_norm": 0.01766922138631344, + "learning_rate": 4.521269841269841e-06, + "loss": 0.273, + "step": 10380 + }, + { + "epoch": 29.013714285714286, + "grad_norm": 0.0019730119965970516, + "learning_rate": 4.514920634920636e-06, + "loss": 0.0014, + "step": 10390 + }, + { + "epoch": 29.014285714285716, + "grad_norm": 0.6263951659202576, + "learning_rate": 4.508571428571429e-06, + "loss": 0.2918, + "step": 10400 + }, + { + "epoch": 29.014857142857142, + "grad_norm": 49.63083267211914, + "learning_rate": 4.502222222222223e-06, + "loss": 0.3073, + "step": 10410 + }, + { + "epoch": 29.015428571428572, + "grad_norm": 37.1717643737793, + "learning_rate": 4.495873015873016e-06, + "loss": 0.3176, + "step": 10420 + }, + { + "epoch": 29.016, + "grad_norm": 0.03757132217288017, + "learning_rate": 4.48952380952381e-06, + "loss": 0.1175, + "step": 10430 + }, + { + "epoch": 29.016571428571428, + "grad_norm": 0.095696821808815, + "learning_rate": 4.483174603174604e-06, + "loss": 0.5835, + "step": 10440 + }, + { + "epoch": 29.017142857142858, + "grad_norm": 0.12873250246047974, + "learning_rate": 4.4768253968253975e-06, + "loss": 0.1662, + "step": 10450 + }, + { + "epoch": 29.017714285714284, + "grad_norm": 0.03826398402452469, + "learning_rate": 4.470476190476191e-06, + "loss": 0.0065, + "step": 10460 + }, + { + "epoch": 29.018285714285714, + "grad_norm": 0.027511196210980415, + "learning_rate": 4.4641269841269844e-06, + "loss": 0.669, + "step": 10470 + }, + { + "epoch": 29.018857142857144, + "grad_norm": 0.01353570818901062, + "learning_rate": 4.457777777777778e-06, + "loss": 0.0017, + "step": 10480 + }, + { + "epoch": 29.01942857142857, + "grad_norm": 0.007252034731209278, + "learning_rate": 4.451428571428571e-06, + "loss": 0.3284, + "step": 10490 + }, + { + "epoch": 29.02, + "grad_norm": 0.14840850234031677, + "learning_rate": 4.445079365079366e-06, + "loss": 0.5186, + "step": 10500 + }, + { + "epoch": 29.02, + "eval_accuracy": 0.8148148148148148, + "eval_loss": 1.0870999097824097, + "eval_runtime": 126.3664, + "eval_samples_per_second": 2.35, + "eval_steps_per_second": 1.179, + "step": 10500 + }, + { + "epoch": 30.00057142857143, + "grad_norm": 0.020524989813566208, + "learning_rate": 4.438730158730159e-06, + "loss": 0.0009, + "step": 10510 + }, + { + "epoch": 30.001142857142856, + "grad_norm": 0.27530866861343384, + "learning_rate": 4.432380952380953e-06, + "loss": 1.0032, + "step": 10520 + }, + { + "epoch": 30.001714285714286, + "grad_norm": 0.06338400393724442, + "learning_rate": 4.426031746031746e-06, + "loss": 0.0018, + "step": 10530 + }, + { + "epoch": 30.002285714285716, + "grad_norm": 0.049876321107149124, + "learning_rate": 4.41968253968254e-06, + "loss": 0.0032, + "step": 10540 + }, + { + "epoch": 30.002857142857142, + "grad_norm": 0.21788761019706726, + "learning_rate": 4.413333333333334e-06, + "loss": 0.0011, + "step": 10550 + }, + { + "epoch": 30.00342857142857, + "grad_norm": 0.006480003707110882, + "learning_rate": 4.4069841269841275e-06, + "loss": 0.4141, + "step": 10560 + }, + { + "epoch": 30.004, + "grad_norm": 0.011641621589660645, + "learning_rate": 4.400634920634921e-06, + "loss": 0.2498, + "step": 10570 + }, + { + "epoch": 30.004571428571428, + "grad_norm": 0.06504371762275696, + "learning_rate": 4.3942857142857144e-06, + "loss": 0.5437, + "step": 10580 + }, + { + "epoch": 30.005142857142857, + "grad_norm": 284.99957275390625, + "learning_rate": 4.387936507936508e-06, + "loss": 0.734, + "step": 10590 + }, + { + "epoch": 30.005714285714287, + "grad_norm": 81.92947387695312, + "learning_rate": 4.381587301587301e-06, + "loss": 0.6903, + "step": 10600 + }, + { + "epoch": 30.006285714285713, + "grad_norm": 0.0021508638747036457, + "learning_rate": 4.375238095238096e-06, + "loss": 0.0015, + "step": 10610 + }, + { + "epoch": 30.006857142857143, + "grad_norm": 0.008287636563181877, + "learning_rate": 4.368888888888889e-06, + "loss": 0.0204, + "step": 10620 + }, + { + "epoch": 30.007428571428573, + "grad_norm": 0.06526491791009903, + "learning_rate": 4.362539682539683e-06, + "loss": 0.2437, + "step": 10630 + }, + { + "epoch": 30.008, + "grad_norm": 0.09128577262163162, + "learning_rate": 4.356190476190477e-06, + "loss": 0.1999, + "step": 10640 + }, + { + "epoch": 30.00857142857143, + "grad_norm": 0.13880805671215057, + "learning_rate": 4.34984126984127e-06, + "loss": 0.2869, + "step": 10650 + }, + { + "epoch": 30.00914285714286, + "grad_norm": 61.90774154663086, + "learning_rate": 4.343492063492064e-06, + "loss": 0.2872, + "step": 10660 + }, + { + "epoch": 30.009714285714285, + "grad_norm": 0.1190374344587326, + "learning_rate": 4.3371428571428575e-06, + "loss": 1.0817, + "step": 10670 + }, + { + "epoch": 30.010285714285715, + "grad_norm": 0.0040014442056417465, + "learning_rate": 4.330793650793651e-06, + "loss": 0.8608, + "step": 10680 + }, + { + "epoch": 30.010857142857144, + "grad_norm": 0.08265765011310577, + "learning_rate": 4.324444444444445e-06, + "loss": 0.0143, + "step": 10690 + }, + { + "epoch": 30.01142857142857, + "grad_norm": 0.08359917998313904, + "learning_rate": 4.318095238095239e-06, + "loss": 0.0016, + "step": 10700 + }, + { + "epoch": 30.012, + "grad_norm": 0.03306671977043152, + "learning_rate": 4.311746031746032e-06, + "loss": 0.0031, + "step": 10710 + }, + { + "epoch": 30.01257142857143, + "grad_norm": 45.719505310058594, + "learning_rate": 4.305396825396826e-06, + "loss": 0.3334, + "step": 10720 + }, + { + "epoch": 30.013142857142856, + "grad_norm": 0.0030138203874230385, + "learning_rate": 4.299047619047619e-06, + "loss": 0.2682, + "step": 10730 + }, + { + "epoch": 30.013714285714286, + "grad_norm": 0.015341831371188164, + "learning_rate": 4.292698412698413e-06, + "loss": 0.6864, + "step": 10740 + }, + { + "epoch": 30.014285714285716, + "grad_norm": 1.9505524635314941, + "learning_rate": 4.286349206349207e-06, + "loss": 0.3386, + "step": 10750 + }, + { + "epoch": 30.014857142857142, + "grad_norm": 15.920595169067383, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.0109, + "step": 10760 + }, + { + "epoch": 30.015428571428572, + "grad_norm": 0.03716844692826271, + "learning_rate": 4.273650793650794e-06, + "loss": 0.0024, + "step": 10770 + }, + { + "epoch": 30.016, + "grad_norm": 0.012245182879269123, + "learning_rate": 4.2673015873015875e-06, + "loss": 0.0497, + "step": 10780 + }, + { + "epoch": 30.016571428571428, + "grad_norm": 0.020435450598597527, + "learning_rate": 4.260952380952381e-06, + "loss": 0.3038, + "step": 10790 + }, + { + "epoch": 30.017142857142858, + "grad_norm": 0.7739905714988708, + "learning_rate": 4.254603174603175e-06, + "loss": 0.0028, + "step": 10800 + }, + { + "epoch": 30.017714285714284, + "grad_norm": 116.42400360107422, + "learning_rate": 4.248253968253969e-06, + "loss": 0.5479, + "step": 10810 + }, + { + "epoch": 30.018285714285714, + "grad_norm": 0.060728929936885834, + "learning_rate": 4.241904761904762e-06, + "loss": 0.1333, + "step": 10820 + }, + { + "epoch": 30.018857142857144, + "grad_norm": 2.810554265975952, + "learning_rate": 4.235555555555556e-06, + "loss": 1.2669, + "step": 10830 + }, + { + "epoch": 30.01942857142857, + "grad_norm": 0.007885237224400043, + "learning_rate": 4.229206349206349e-06, + "loss": 0.8325, + "step": 10840 + }, + { + "epoch": 30.02, + "grad_norm": 0.06585021317005157, + "learning_rate": 4.222857142857143e-06, + "loss": 0.3076, + "step": 10850 + }, + { + "epoch": 30.02, + "eval_accuracy": 0.835016835016835, + "eval_loss": 0.8930524587631226, + "eval_runtime": 126.4231, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 1.179, + "step": 10850 + }, + { + "epoch": 31.00057142857143, + "grad_norm": 0.008948814123868942, + "learning_rate": 4.216507936507937e-06, + "loss": 0.263, + "step": 10860 + }, + { + "epoch": 31.001142857142856, + "grad_norm": 0.22963719069957733, + "learning_rate": 4.2101587301587305e-06, + "loss": 0.0015, + "step": 10870 + }, + { + "epoch": 31.001714285714286, + "grad_norm": 0.2196054607629776, + "learning_rate": 4.203809523809524e-06, + "loss": 0.0021, + "step": 10880 + }, + { + "epoch": 31.002285714285716, + "grad_norm": 0.03480219841003418, + "learning_rate": 4.1974603174603175e-06, + "loss": 0.6281, + "step": 10890 + }, + { + "epoch": 31.002857142857142, + "grad_norm": 0.9133780002593994, + "learning_rate": 4.191111111111111e-06, + "loss": 0.0018, + "step": 10900 + }, + { + "epoch": 31.00342857142857, + "grad_norm": 0.1275293380022049, + "learning_rate": 4.184761904761905e-06, + "loss": 0.4055, + "step": 10910 + }, + { + "epoch": 31.004, + "grad_norm": 0.3001004755496979, + "learning_rate": 4.178412698412699e-06, + "loss": 0.2961, + "step": 10920 + }, + { + "epoch": 31.004571428571428, + "grad_norm": 0.08802656084299088, + "learning_rate": 4.172063492063492e-06, + "loss": 0.537, + "step": 10930 + }, + { + "epoch": 31.005142857142857, + "grad_norm": 159.4746551513672, + "learning_rate": 4.165714285714287e-06, + "loss": 0.3808, + "step": 10940 + }, + { + "epoch": 31.005714285714287, + "grad_norm": 0.11664550751447678, + "learning_rate": 4.15936507936508e-06, + "loss": 0.7977, + "step": 10950 + }, + { + "epoch": 31.006285714285713, + "grad_norm": 0.20757505297660828, + "learning_rate": 4.1530158730158736e-06, + "loss": 0.7499, + "step": 10960 + }, + { + "epoch": 31.006857142857143, + "grad_norm": 0.0029254963155835867, + "learning_rate": 4.146666666666667e-06, + "loss": 0.0042, + "step": 10970 + }, + { + "epoch": 31.007428571428573, + "grad_norm": 0.3312826454639435, + "learning_rate": 4.1403174603174605e-06, + "loss": 0.1451, + "step": 10980 + }, + { + "epoch": 31.008, + "grad_norm": 235.10496520996094, + "learning_rate": 4.133968253968254e-06, + "loss": 0.2038, + "step": 10990 + }, + { + "epoch": 31.00857142857143, + "grad_norm": 0.029614897444844246, + "learning_rate": 4.127619047619048e-06, + "loss": 0.3989, + "step": 11000 + }, + { + "epoch": 31.00914285714286, + "grad_norm": 0.00274507119320333, + "learning_rate": 4.121269841269842e-06, + "loss": 0.0016, + "step": 11010 + }, + { + "epoch": 31.009714285714285, + "grad_norm": 0.0025850103702396154, + "learning_rate": 4.114920634920635e-06, + "loss": 0.2211, + "step": 11020 + }, + { + "epoch": 31.010285714285715, + "grad_norm": 0.003770750481635332, + "learning_rate": 4.108571428571429e-06, + "loss": 0.3602, + "step": 11030 + }, + { + "epoch": 31.010857142857144, + "grad_norm": 0.08702629059553146, + "learning_rate": 4.102222222222222e-06, + "loss": 0.2275, + "step": 11040 + }, + { + "epoch": 31.01142857142857, + "grad_norm": 0.03445100784301758, + "learning_rate": 4.095873015873017e-06, + "loss": 0.004, + "step": 11050 + }, + { + "epoch": 31.012, + "grad_norm": 0.07224719971418381, + "learning_rate": 4.08952380952381e-06, + "loss": 0.1784, + "step": 11060 + }, + { + "epoch": 31.01257142857143, + "grad_norm": 0.08405738323926926, + "learning_rate": 4.083174603174604e-06, + "loss": 0.275, + "step": 11070 + }, + { + "epoch": 31.013142857142856, + "grad_norm": 0.12125347554683685, + "learning_rate": 4.076825396825397e-06, + "loss": 0.0013, + "step": 11080 + }, + { + "epoch": 31.013714285714286, + "grad_norm": 1.5644397735595703, + "learning_rate": 4.0704761904761905e-06, + "loss": 0.2768, + "step": 11090 + }, + { + "epoch": 31.014285714285716, + "grad_norm": 0.030137941241264343, + "learning_rate": 4.064126984126984e-06, + "loss": 0.1936, + "step": 11100 + }, + { + "epoch": 31.014857142857142, + "grad_norm": 0.005889153108000755, + "learning_rate": 4.057777777777778e-06, + "loss": 0.3787, + "step": 11110 + }, + { + "epoch": 31.015428571428572, + "grad_norm": 840.96142578125, + "learning_rate": 4.051428571428572e-06, + "loss": 0.1409, + "step": 11120 + }, + { + "epoch": 31.016, + "grad_norm": 0.3630973994731903, + "learning_rate": 4.045079365079365e-06, + "loss": 0.5644, + "step": 11130 + }, + { + "epoch": 31.016571428571428, + "grad_norm": 0.1463097482919693, + "learning_rate": 4.038730158730159e-06, + "loss": 0.4875, + "step": 11140 + }, + { + "epoch": 31.017142857142858, + "grad_norm": 0.19403041899204254, + "learning_rate": 4.032380952380952e-06, + "loss": 0.0018, + "step": 11150 + }, + { + "epoch": 31.017714285714284, + "grad_norm": 0.002485697390511632, + "learning_rate": 4.026031746031747e-06, + "loss": 0.7305, + "step": 11160 + }, + { + "epoch": 31.018285714285714, + "grad_norm": 0.22555981576442719, + "learning_rate": 4.01968253968254e-06, + "loss": 0.0121, + "step": 11170 + }, + { + "epoch": 31.018857142857144, + "grad_norm": 0.02071559987962246, + "learning_rate": 4.013333333333334e-06, + "loss": 0.5759, + "step": 11180 + }, + { + "epoch": 31.01942857142857, + "grad_norm": 0.10109854489564896, + "learning_rate": 4.006984126984128e-06, + "loss": 0.3115, + "step": 11190 + }, + { + "epoch": 31.02, + "grad_norm": 0.07400314509868622, + "learning_rate": 4.000634920634921e-06, + "loss": 0.1113, + "step": 11200 + }, + { + "epoch": 31.02, + "eval_accuracy": 0.8383838383838383, + "eval_loss": 1.0014111995697021, + "eval_runtime": 126.4347, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 1.178, + "step": 11200 + }, + { + "epoch": 32.000571428571426, + "grad_norm": 0.018319344148039818, + "learning_rate": 3.994285714285714e-06, + "loss": 1.0033, + "step": 11210 + }, + { + "epoch": 32.00114285714286, + "grad_norm": 2.0360021591186523, + "learning_rate": 3.987936507936508e-06, + "loss": 0.1463, + "step": 11220 + }, + { + "epoch": 32.001714285714286, + "grad_norm": 11.246064186096191, + "learning_rate": 3.981587301587302e-06, + "loss": 0.2805, + "step": 11230 + }, + { + "epoch": 32.00228571428571, + "grad_norm": 0.21093471348285675, + "learning_rate": 3.975238095238095e-06, + "loss": 0.0017, + "step": 11240 + }, + { + "epoch": 32.002857142857145, + "grad_norm": 26.97494125366211, + "learning_rate": 3.96888888888889e-06, + "loss": 0.0044, + "step": 11250 + }, + { + "epoch": 32.00342857142857, + "grad_norm": 0.3112584352493286, + "learning_rate": 3.962539682539683e-06, + "loss": 0.2287, + "step": 11260 + }, + { + "epoch": 32.004, + "grad_norm": 9.63143539428711, + "learning_rate": 3.956190476190477e-06, + "loss": 0.9842, + "step": 11270 + }, + { + "epoch": 32.00457142857143, + "grad_norm": 0.25314244627952576, + "learning_rate": 3.94984126984127e-06, + "loss": 0.2338, + "step": 11280 + }, + { + "epoch": 32.00514285714286, + "grad_norm": 0.11641905456781387, + "learning_rate": 3.943492063492064e-06, + "loss": 0.0021, + "step": 11290 + }, + { + "epoch": 32.005714285714284, + "grad_norm": 83.00080871582031, + "learning_rate": 3.937142857142858e-06, + "loss": 1.038, + "step": 11300 + }, + { + "epoch": 32.00628571428572, + "grad_norm": 0.11469433456659317, + "learning_rate": 3.930793650793651e-06, + "loss": 0.0026, + "step": 11310 + }, + { + "epoch": 32.00685714285714, + "grad_norm": 0.014426827430725098, + "learning_rate": 3.924444444444445e-06, + "loss": 0.0023, + "step": 11320 + }, + { + "epoch": 32.00742857142857, + "grad_norm": 0.0037542914506047964, + "learning_rate": 3.918095238095238e-06, + "loss": 0.4315, + "step": 11330 + }, + { + "epoch": 32.008, + "grad_norm": 0.004907695110887289, + "learning_rate": 3.911746031746032e-06, + "loss": 0.2727, + "step": 11340 + }, + { + "epoch": 32.00857142857143, + "grad_norm": 0.13894321024417877, + "learning_rate": 3.905396825396825e-06, + "loss": 0.1903, + "step": 11350 + }, + { + "epoch": 32.009142857142855, + "grad_norm": 0.03604894503951073, + "learning_rate": 3.89904761904762e-06, + "loss": 1.1061, + "step": 11360 + }, + { + "epoch": 32.00971428571429, + "grad_norm": 0.0288707222789526, + "learning_rate": 3.892698412698413e-06, + "loss": 0.0018, + "step": 11370 + }, + { + "epoch": 32.010285714285715, + "grad_norm": 41.77061080932617, + "learning_rate": 3.886349206349207e-06, + "loss": 0.2094, + "step": 11380 + }, + { + "epoch": 32.01085714285714, + "grad_norm": 0.09654809534549713, + "learning_rate": 3.88e-06, + "loss": 0.0016, + "step": 11390 + }, + { + "epoch": 32.011428571428574, + "grad_norm": 0.002318542217835784, + "learning_rate": 3.873650793650794e-06, + "loss": 0.2902, + "step": 11400 + }, + { + "epoch": 32.012, + "grad_norm": 0.1287265121936798, + "learning_rate": 3.867301587301588e-06, + "loss": 0.0023, + "step": 11410 + }, + { + "epoch": 32.01257142857143, + "grad_norm": 0.025955529883503914, + "learning_rate": 3.860952380952381e-06, + "loss": 0.29, + "step": 11420 + }, + { + "epoch": 32.01314285714286, + "grad_norm": 0.022403893992304802, + "learning_rate": 3.854603174603175e-06, + "loss": 0.2088, + "step": 11430 + }, + { + "epoch": 32.013714285714286, + "grad_norm": 14.325145721435547, + "learning_rate": 3.848253968253968e-06, + "loss": 0.2963, + "step": 11440 + }, + { + "epoch": 32.01428571428571, + "grad_norm": 0.15538744628429413, + "learning_rate": 3.841904761904762e-06, + "loss": 0.4815, + "step": 11450 + }, + { + "epoch": 32.014857142857146, + "grad_norm": 0.07028108835220337, + "learning_rate": 3.835555555555555e-06, + "loss": 0.358, + "step": 11460 + }, + { + "epoch": 32.01542857142857, + "grad_norm": 26.09341812133789, + "learning_rate": 3.82920634920635e-06, + "loss": 0.2815, + "step": 11470 + }, + { + "epoch": 32.016, + "grad_norm": 0.17105203866958618, + "learning_rate": 3.822857142857143e-06, + "loss": 0.5399, + "step": 11480 + }, + { + "epoch": 32.01657142857143, + "grad_norm": 124.22401428222656, + "learning_rate": 3.816507936507937e-06, + "loss": 0.3341, + "step": 11490 + }, + { + "epoch": 32.01714285714286, + "grad_norm": 0.05575098842382431, + "learning_rate": 3.8101587301587306e-06, + "loss": 0.0021, + "step": 11500 + }, + { + "epoch": 32.017714285714284, + "grad_norm": 0.025988677516579628, + "learning_rate": 3.803809523809524e-06, + "loss": 0.2548, + "step": 11510 + }, + { + "epoch": 32.01828571428572, + "grad_norm": 31.389314651489258, + "learning_rate": 3.7974603174603175e-06, + "loss": 0.3914, + "step": 11520 + }, + { + "epoch": 32.018857142857144, + "grad_norm": 0.030341658741235733, + "learning_rate": 3.7911111111111114e-06, + "loss": 0.0424, + "step": 11530 + }, + { + "epoch": 32.01942857142857, + "grad_norm": 0.18274207413196564, + "learning_rate": 3.784761904761905e-06, + "loss": 0.4848, + "step": 11540 + }, + { + "epoch": 32.02, + "grad_norm": 0.0034605495166033506, + "learning_rate": 3.778412698412699e-06, + "loss": 0.2201, + "step": 11550 + }, + { + "epoch": 32.02, + "eval_accuracy": 0.8484848484848485, + "eval_loss": 0.8628104329109192, + "eval_runtime": 126.7935, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 1.175, + "step": 11550 + }, + { + "epoch": 33.000571428571426, + "grad_norm": 0.009115273132920265, + "learning_rate": 3.7720634920634923e-06, + "loss": 0.0015, + "step": 11560 + }, + { + "epoch": 33.00114285714286, + "grad_norm": 0.029986055567860603, + "learning_rate": 3.7657142857142858e-06, + "loss": 0.1856, + "step": 11570 + }, + { + "epoch": 33.001714285714286, + "grad_norm": 0.04707956314086914, + "learning_rate": 3.7593650793650797e-06, + "loss": 0.3591, + "step": 11580 + }, + { + "epoch": 33.00228571428571, + "grad_norm": 4.939572334289551, + "learning_rate": 3.753015873015873e-06, + "loss": 0.0061, + "step": 11590 + }, + { + "epoch": 33.002857142857145, + "grad_norm": 0.019965268671512604, + "learning_rate": 3.7466666666666667e-06, + "loss": 0.002, + "step": 11600 + }, + { + "epoch": 33.00342857142857, + "grad_norm": 0.4821774959564209, + "learning_rate": 3.740317460317461e-06, + "loss": 0.6994, + "step": 11610 + }, + { + "epoch": 33.004, + "grad_norm": 359.18255615234375, + "learning_rate": 3.733968253968254e-06, + "loss": 0.2732, + "step": 11620 + }, + { + "epoch": 33.00457142857143, + "grad_norm": 0.0022106466349214315, + "learning_rate": 3.7276190476190475e-06, + "loss": 0.0015, + "step": 11630 + }, + { + "epoch": 33.00514285714286, + "grad_norm": 0.4608334004878998, + "learning_rate": 3.721269841269842e-06, + "loss": 0.0016, + "step": 11640 + }, + { + "epoch": 33.005714285714284, + "grad_norm": 0.10803355276584625, + "learning_rate": 3.7149206349206353e-06, + "loss": 0.0874, + "step": 11650 + }, + { + "epoch": 33.00628571428572, + "grad_norm": 0.07706461101770401, + "learning_rate": 3.7085714285714284e-06, + "loss": 0.0007, + "step": 11660 + }, + { + "epoch": 33.00685714285714, + "grad_norm": 0.006313271354883909, + "learning_rate": 3.7022222222222227e-06, + "loss": 0.0019, + "step": 11670 + }, + { + "epoch": 33.00742857142857, + "grad_norm": 0.037479300051927567, + "learning_rate": 3.695873015873016e-06, + "loss": 0.8099, + "step": 11680 + }, + { + "epoch": 33.008, + "grad_norm": 0.08825170248746872, + "learning_rate": 3.68952380952381e-06, + "loss": 0.0036, + "step": 11690 + }, + { + "epoch": 33.00857142857143, + "grad_norm": 196.14361572265625, + "learning_rate": 3.6831746031746036e-06, + "loss": 0.75, + "step": 11700 + }, + { + "epoch": 33.009142857142855, + "grad_norm": 0.5264555215835571, + "learning_rate": 3.676825396825397e-06, + "loss": 0.4639, + "step": 11710 + }, + { + "epoch": 33.00971428571429, + "grad_norm": 10.480401039123535, + "learning_rate": 3.670476190476191e-06, + "loss": 0.947, + "step": 11720 + }, + { + "epoch": 33.010285714285715, + "grad_norm": 0.4620245695114136, + "learning_rate": 3.6641269841269845e-06, + "loss": 0.0035, + "step": 11730 + }, + { + "epoch": 33.01085714285714, + "grad_norm": 0.13946720957756042, + "learning_rate": 3.657777777777778e-06, + "loss": 0.2192, + "step": 11740 + }, + { + "epoch": 33.011428571428574, + "grad_norm": 0.10003682225942612, + "learning_rate": 3.651428571428572e-06, + "loss": 0.236, + "step": 11750 + }, + { + "epoch": 33.012, + "grad_norm": 0.2135056108236313, + "learning_rate": 3.6450793650793653e-06, + "loss": 0.0021, + "step": 11760 + }, + { + "epoch": 33.01257142857143, + "grad_norm": 0.05885402113199234, + "learning_rate": 3.638730158730159e-06, + "loss": 0.2531, + "step": 11770 + }, + { + "epoch": 33.01314285714286, + "grad_norm": 0.011742881499230862, + "learning_rate": 3.6323809523809527e-06, + "loss": 0.4037, + "step": 11780 + }, + { + "epoch": 33.013714285714286, + "grad_norm": 0.05668189749121666, + "learning_rate": 3.6260317460317462e-06, + "loss": 0.0036, + "step": 11790 + }, + { + "epoch": 33.01428571428571, + "grad_norm": 104.82915496826172, + "learning_rate": 3.61968253968254e-06, + "loss": 0.1297, + "step": 11800 + }, + { + "epoch": 33.014857142857146, + "grad_norm": 0.22662226855754852, + "learning_rate": 3.6133333333333336e-06, + "loss": 0.0031, + "step": 11810 + }, + { + "epoch": 33.01542857142857, + "grad_norm": 0.0017571650678291917, + "learning_rate": 3.606984126984127e-06, + "loss": 0.3367, + "step": 11820 + }, + { + "epoch": 33.016, + "grad_norm": 0.3255730867385864, + "learning_rate": 3.600634920634921e-06, + "loss": 0.0302, + "step": 11830 + }, + { + "epoch": 33.01657142857143, + "grad_norm": 0.04617001861333847, + "learning_rate": 3.5942857142857145e-06, + "loss": 0.6351, + "step": 11840 + }, + { + "epoch": 33.01714285714286, + "grad_norm": 0.001643249997869134, + "learning_rate": 3.587936507936508e-06, + "loss": 0.4685, + "step": 11850 + }, + { + "epoch": 33.017714285714284, + "grad_norm": 66.71566772460938, + "learning_rate": 3.581587301587302e-06, + "loss": 0.4042, + "step": 11860 + }, + { + "epoch": 33.01828571428572, + "grad_norm": 0.02842831425368786, + "learning_rate": 3.5752380952380954e-06, + "loss": 0.3481, + "step": 11870 + }, + { + "epoch": 33.018857142857144, + "grad_norm": 0.8168884515762329, + "learning_rate": 3.568888888888889e-06, + "loss": 0.4965, + "step": 11880 + }, + { + "epoch": 33.01942857142857, + "grad_norm": 109.5650863647461, + "learning_rate": 3.562539682539683e-06, + "loss": 0.5878, + "step": 11890 + }, + { + "epoch": 33.02, + "grad_norm": 0.12658925354480743, + "learning_rate": 3.5561904761904762e-06, + "loss": 0.0324, + "step": 11900 + }, + { + "epoch": 33.02, + "eval_accuracy": 0.835016835016835, + "eval_loss": 0.9971614480018616, + "eval_runtime": 126.5059, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 1.178, + "step": 11900 + }, + { + "epoch": 34.000571428571426, + "grad_norm": 0.027432570233941078, + "learning_rate": 3.5498412698412697e-06, + "loss": 0.3804, + "step": 11910 + }, + { + "epoch": 34.00114285714286, + "grad_norm": 0.004706758074462414, + "learning_rate": 3.543492063492064e-06, + "loss": 0.2754, + "step": 11920 + }, + { + "epoch": 34.001714285714286, + "grad_norm": 0.03468727692961693, + "learning_rate": 3.5371428571428575e-06, + "loss": 0.0022, + "step": 11930 + }, + { + "epoch": 34.00228571428571, + "grad_norm": 0.16161181032657623, + "learning_rate": 3.5307936507936514e-06, + "loss": 0.5878, + "step": 11940 + }, + { + "epoch": 34.002857142857145, + "grad_norm": 0.0015723078977316618, + "learning_rate": 3.524444444444445e-06, + "loss": 0.0007, + "step": 11950 + }, + { + "epoch": 34.00342857142857, + "grad_norm": 0.051679786294698715, + "learning_rate": 3.5180952380952384e-06, + "loss": 0.0014, + "step": 11960 + }, + { + "epoch": 34.004, + "grad_norm": 0.07004716992378235, + "learning_rate": 3.5117460317460323e-06, + "loss": 0.4407, + "step": 11970 + }, + { + "epoch": 34.00457142857143, + "grad_norm": 0.04986123740673065, + "learning_rate": 3.5053968253968258e-06, + "loss": 0.2698, + "step": 11980 + }, + { + "epoch": 34.00514285714286, + "grad_norm": 0.007722809910774231, + "learning_rate": 3.4990476190476193e-06, + "loss": 0.0011, + "step": 11990 + }, + { + "epoch": 34.005714285714284, + "grad_norm": 0.0745702013373375, + "learning_rate": 3.492698412698413e-06, + "loss": 0.2513, + "step": 12000 + }, + { + "epoch": 34.00628571428572, + "grad_norm": 0.043245185166597366, + "learning_rate": 3.4863492063492067e-06, + "loss": 0.2987, + "step": 12010 + }, + { + "epoch": 34.00685714285714, + "grad_norm": 0.09795431792736053, + "learning_rate": 3.48e-06, + "loss": 0.3349, + "step": 12020 + }, + { + "epoch": 34.00742857142857, + "grad_norm": 0.09430580586194992, + "learning_rate": 3.473650793650794e-06, + "loss": 0.0011, + "step": 12030 + }, + { + "epoch": 34.008, + "grad_norm": 0.022420106455683708, + "learning_rate": 3.4673015873015875e-06, + "loss": 0.267, + "step": 12040 + }, + { + "epoch": 34.00857142857143, + "grad_norm": 0.045474231243133545, + "learning_rate": 3.4609523809523814e-06, + "loss": 0.2568, + "step": 12050 + }, + { + "epoch": 34.009142857142855, + "grad_norm": 80.42121887207031, + "learning_rate": 3.454603174603175e-06, + "loss": 0.3101, + "step": 12060 + }, + { + "epoch": 34.00971428571429, + "grad_norm": 20.72455406188965, + "learning_rate": 3.4482539682539684e-06, + "loss": 0.6013, + "step": 12070 + }, + { + "epoch": 34.010285714285715, + "grad_norm": 33.085418701171875, + "learning_rate": 3.4419047619047623e-06, + "loss": 0.5322, + "step": 12080 + }, + { + "epoch": 34.01085714285714, + "grad_norm": 209.3749542236328, + "learning_rate": 3.435555555555556e-06, + "loss": 0.2576, + "step": 12090 + }, + { + "epoch": 34.011428571428574, + "grad_norm": 0.305754691362381, + "learning_rate": 3.4292063492063493e-06, + "loss": 0.1585, + "step": 12100 + }, + { + "epoch": 34.012, + "grad_norm": 19.921072006225586, + "learning_rate": 3.422857142857143e-06, + "loss": 0.2088, + "step": 12110 + }, + { + "epoch": 34.01257142857143, + "grad_norm": 0.02304167114198208, + "learning_rate": 3.4165079365079367e-06, + "loss": 0.167, + "step": 12120 + }, + { + "epoch": 34.01314285714286, + "grad_norm": 0.08073779195547104, + "learning_rate": 3.41015873015873e-06, + "loss": 0.7815, + "step": 12130 + }, + { + "epoch": 34.013714285714286, + "grad_norm": 94.09422302246094, + "learning_rate": 3.403809523809524e-06, + "loss": 0.0197, + "step": 12140 + }, + { + "epoch": 34.01428571428571, + "grad_norm": 0.05119791999459267, + "learning_rate": 3.3974603174603175e-06, + "loss": 0.0028, + "step": 12150 + }, + { + "epoch": 34.014857142857146, + "grad_norm": 15.968472480773926, + "learning_rate": 3.391111111111111e-06, + "loss": 0.3075, + "step": 12160 + }, + { + "epoch": 34.01542857142857, + "grad_norm": 95.5150375366211, + "learning_rate": 3.3847619047619053e-06, + "loss": 0.5475, + "step": 12170 + }, + { + "epoch": 34.016, + "grad_norm": 69.68689727783203, + "learning_rate": 3.3784126984126984e-06, + "loss": 0.293, + "step": 12180 + }, + { + "epoch": 34.01657142857143, + "grad_norm": 0.052636485546827316, + "learning_rate": 3.3720634920634927e-06, + "loss": 0.2891, + "step": 12190 + }, + { + "epoch": 34.01714285714286, + "grad_norm": 22.63859748840332, + "learning_rate": 3.3657142857142862e-06, + "loss": 0.2413, + "step": 12200 + }, + { + "epoch": 34.017714285714284, + "grad_norm": 0.07254651933908463, + "learning_rate": 3.3593650793650797e-06, + "loss": 0.2965, + "step": 12210 + }, + { + "epoch": 34.01828571428572, + "grad_norm": 0.016718924045562744, + "learning_rate": 3.3530158730158736e-06, + "loss": 0.0566, + "step": 12220 + }, + { + "epoch": 34.018857142857144, + "grad_norm": 0.0034701621625572443, + "learning_rate": 3.346666666666667e-06, + "loss": 0.1736, + "step": 12230 + }, + { + "epoch": 34.01942857142857, + "grad_norm": 0.4016928970813751, + "learning_rate": 3.3403174603174606e-06, + "loss": 0.3631, + "step": 12240 + }, + { + "epoch": 34.02, + "grad_norm": 173.18507385253906, + "learning_rate": 3.3339682539682545e-06, + "loss": 0.4411, + "step": 12250 + }, + { + "epoch": 34.02, + "eval_accuracy": 0.835016835016835, + "eval_loss": 1.059208869934082, + "eval_runtime": 126.2592, + "eval_samples_per_second": 2.352, + "eval_steps_per_second": 1.18, + "step": 12250 + }, + { + "epoch": 35.000571428571426, + "grad_norm": 0.02301918901503086, + "learning_rate": 3.327619047619048e-06, + "loss": 0.264, + "step": 12260 + }, + { + "epoch": 35.00114285714286, + "grad_norm": 0.14874428510665894, + "learning_rate": 3.3212698412698414e-06, + "loss": 0.0022, + "step": 12270 + }, + { + "epoch": 35.001714285714286, + "grad_norm": 0.5483258962631226, + "learning_rate": 3.3149206349206354e-06, + "loss": 0.1753, + "step": 12280 + }, + { + "epoch": 35.00228571428571, + "grad_norm": 0.0009629835840314627, + "learning_rate": 3.308571428571429e-06, + "loss": 0.1238, + "step": 12290 + }, + { + "epoch": 35.002857142857145, + "grad_norm": 0.543134331703186, + "learning_rate": 3.3022222222222223e-06, + "loss": 0.1245, + "step": 12300 + }, + { + "epoch": 35.00342857142857, + "grad_norm": 0.08552040904760361, + "learning_rate": 3.2958730158730162e-06, + "loss": 0.5037, + "step": 12310 + }, + { + "epoch": 35.004, + "grad_norm": 0.015447727404534817, + "learning_rate": 3.2895238095238097e-06, + "loss": 0.5062, + "step": 12320 + }, + { + "epoch": 35.00457142857143, + "grad_norm": 0.004400096833705902, + "learning_rate": 3.2831746031746036e-06, + "loss": 0.2552, + "step": 12330 + }, + { + "epoch": 35.00514285714286, + "grad_norm": 0.018583452329039574, + "learning_rate": 3.276825396825397e-06, + "loss": 0.3969, + "step": 12340 + }, + { + "epoch": 35.005714285714284, + "grad_norm": 0.04393948242068291, + "learning_rate": 3.2704761904761906e-06, + "loss": 0.2678, + "step": 12350 + }, + { + "epoch": 35.00628571428572, + "grad_norm": 0.05897550284862518, + "learning_rate": 3.2641269841269845e-06, + "loss": 0.0057, + "step": 12360 + }, + { + "epoch": 35.00685714285714, + "grad_norm": 0.004284753929823637, + "learning_rate": 3.257777777777778e-06, + "loss": 0.2586, + "step": 12370 + }, + { + "epoch": 35.00742857142857, + "grad_norm": 0.007391383405774832, + "learning_rate": 3.2514285714285715e-06, + "loss": 0.6294, + "step": 12380 + }, + { + "epoch": 35.008, + "grad_norm": 0.05153246968984604, + "learning_rate": 3.2450793650793654e-06, + "loss": 0.6797, + "step": 12390 + }, + { + "epoch": 35.00857142857143, + "grad_norm": 20.67218017578125, + "learning_rate": 3.238730158730159e-06, + "loss": 0.2888, + "step": 12400 + }, + { + "epoch": 35.009142857142855, + "grad_norm": 0.0017453532200306654, + "learning_rate": 3.2323809523809523e-06, + "loss": 0.0024, + "step": 12410 + }, + { + "epoch": 35.00971428571429, + "grad_norm": 0.6204124689102173, + "learning_rate": 3.2260317460317462e-06, + "loss": 0.3306, + "step": 12420 + }, + { + "epoch": 35.010285714285715, + "grad_norm": 0.11453156918287277, + "learning_rate": 3.2196825396825397e-06, + "loss": 0.6737, + "step": 12430 + }, + { + "epoch": 35.01085714285714, + "grad_norm": 116.97684478759766, + "learning_rate": 3.213333333333334e-06, + "loss": 0.453, + "step": 12440 + }, + { + "epoch": 35.011428571428574, + "grad_norm": 0.036407049745321274, + "learning_rate": 3.206984126984127e-06, + "loss": 0.3485, + "step": 12450 + }, + { + "epoch": 35.012, + "grad_norm": 0.003050927072763443, + "learning_rate": 3.2006349206349206e-06, + "loss": 0.4884, + "step": 12460 + }, + { + "epoch": 35.01257142857143, + "grad_norm": 0.2217385321855545, + "learning_rate": 3.194285714285715e-06, + "loss": 0.5697, + "step": 12470 + }, + { + "epoch": 35.01314285714286, + "grad_norm": 4.006773471832275, + "learning_rate": 3.1879365079365084e-06, + "loss": 0.0041, + "step": 12480 + }, + { + "epoch": 35.013714285714286, + "grad_norm": 0.09846015274524689, + "learning_rate": 3.1815873015873015e-06, + "loss": 0.0028, + "step": 12490 + }, + { + "epoch": 35.01428571428571, + "grad_norm": 0.00579382898285985, + "learning_rate": 3.175238095238096e-06, + "loss": 0.0024, + "step": 12500 + }, + { + "epoch": 35.014857142857146, + "grad_norm": 0.08295068144798279, + "learning_rate": 3.1688888888888893e-06, + "loss": 0.001, + "step": 12510 + }, + { + "epoch": 35.01542857142857, + "grad_norm": 0.0009396121604368091, + "learning_rate": 3.1625396825396828e-06, + "loss": 0.0129, + "step": 12520 + }, + { + "epoch": 35.016, + "grad_norm": 0.007435150910168886, + "learning_rate": 3.1561904761904767e-06, + "loss": 0.0014, + "step": 12530 + }, + { + "epoch": 35.01657142857143, + "grad_norm": 0.03919130563735962, + "learning_rate": 3.14984126984127e-06, + "loss": 0.6772, + "step": 12540 + }, + { + "epoch": 35.01714285714286, + "grad_norm": 0.03321586921811104, + "learning_rate": 3.1434920634920636e-06, + "loss": 0.4575, + "step": 12550 + }, + { + "epoch": 35.017714285714284, + "grad_norm": 0.19160214066505432, + "learning_rate": 3.1371428571428575e-06, + "loss": 0.195, + "step": 12560 + }, + { + "epoch": 35.01828571428572, + "grad_norm": 0.0026826318353414536, + "learning_rate": 3.130793650793651e-06, + "loss": 0.0045, + "step": 12570 + }, + { + "epoch": 35.018857142857144, + "grad_norm": 0.038780469447374344, + "learning_rate": 3.124444444444445e-06, + "loss": 0.0011, + "step": 12580 + }, + { + "epoch": 35.01942857142857, + "grad_norm": 0.07216961681842804, + "learning_rate": 3.1180952380952384e-06, + "loss": 0.0011, + "step": 12590 + }, + { + "epoch": 35.02, + "grad_norm": 0.026877250522375107, + "learning_rate": 3.111746031746032e-06, + "loss": 0.0011, + "step": 12600 + }, + { + "epoch": 35.02, + "eval_accuracy": 0.8282828282828283, + "eval_loss": 1.0746172666549683, + "eval_runtime": 126.6057, + "eval_samples_per_second": 2.346, + "eval_steps_per_second": 1.177, + "step": 12600 + }, + { + "epoch": 36.000571428571426, + "grad_norm": 0.0886671394109726, + "learning_rate": 3.105396825396826e-06, + "loss": 0.2623, + "step": 12610 + }, + { + "epoch": 36.00114285714286, + "grad_norm": 0.012463954277336597, + "learning_rate": 3.0990476190476193e-06, + "loss": 0.4357, + "step": 12620 + }, + { + "epoch": 36.001714285714286, + "grad_norm": 0.026730680838227272, + "learning_rate": 3.0926984126984128e-06, + "loss": 0.2669, + "step": 12630 + }, + { + "epoch": 36.00228571428571, + "grad_norm": 0.07718096673488617, + "learning_rate": 3.0863492063492067e-06, + "loss": 0.5815, + "step": 12640 + }, + { + "epoch": 36.002857142857145, + "grad_norm": 0.2121714949607849, + "learning_rate": 3.08e-06, + "loss": 0.1494, + "step": 12650 + }, + { + "epoch": 36.00342857142857, + "grad_norm": 0.1541670709848404, + "learning_rate": 3.0736507936507936e-06, + "loss": 0.0016, + "step": 12660 + }, + { + "epoch": 36.004, + "grad_norm": 0.030359311029314995, + "learning_rate": 3.0673015873015875e-06, + "loss": 0.0706, + "step": 12670 + }, + { + "epoch": 36.00457142857143, + "grad_norm": 0.07423070073127747, + "learning_rate": 3.060952380952381e-06, + "loss": 0.234, + "step": 12680 + }, + { + "epoch": 36.00514285714286, + "grad_norm": 0.005180784966796637, + "learning_rate": 3.054603174603175e-06, + "loss": 0.2837, + "step": 12690 + }, + { + "epoch": 36.005714285714284, + "grad_norm": 345.4854736328125, + "learning_rate": 3.0482539682539684e-06, + "loss": 0.3923, + "step": 12700 + }, + { + "epoch": 36.00628571428572, + "grad_norm": 0.0064833336509764194, + "learning_rate": 3.041904761904762e-06, + "loss": 0.0015, + "step": 12710 + }, + { + "epoch": 36.00685714285714, + "grad_norm": 0.04605857655405998, + "learning_rate": 3.0355555555555562e-06, + "loss": 0.0033, + "step": 12720 + }, + { + "epoch": 36.00742857142857, + "grad_norm": 0.10902780294418335, + "learning_rate": 3.0292063492063493e-06, + "loss": 0.0027, + "step": 12730 + }, + { + "epoch": 36.008, + "grad_norm": 0.006975686177611351, + "learning_rate": 3.0228571428571428e-06, + "loss": 0.4695, + "step": 12740 + }, + { + "epoch": 36.00857142857143, + "grad_norm": 0.09180530905723572, + "learning_rate": 3.016507936507937e-06, + "loss": 0.0007, + "step": 12750 + }, + { + "epoch": 36.009142857142855, + "grad_norm": 0.007026137318462133, + "learning_rate": 3.0101587301587306e-06, + "loss": 0.4985, + "step": 12760 + }, + { + "epoch": 36.00971428571429, + "grad_norm": 0.005862717051059008, + "learning_rate": 3.0038095238095236e-06, + "loss": 0.163, + "step": 12770 + }, + { + "epoch": 36.010285714285715, + "grad_norm": 0.05073744058609009, + "learning_rate": 2.997460317460318e-06, + "loss": 0.002, + "step": 12780 + }, + { + "epoch": 36.01085714285714, + "grad_norm": 0.017824208363890648, + "learning_rate": 2.9911111111111115e-06, + "loss": 0.4371, + "step": 12790 + }, + { + "epoch": 36.011428571428574, + "grad_norm": 13.799386024475098, + "learning_rate": 2.984761904761905e-06, + "loss": 0.6038, + "step": 12800 + }, + { + "epoch": 36.012, + "grad_norm": 199.8282012939453, + "learning_rate": 2.978412698412699e-06, + "loss": 0.4218, + "step": 12810 + }, + { + "epoch": 36.01257142857143, + "grad_norm": 0.15811072289943695, + "learning_rate": 2.9720634920634923e-06, + "loss": 0.0011, + "step": 12820 + }, + { + "epoch": 36.01314285714286, + "grad_norm": 0.17585258185863495, + "learning_rate": 2.9657142857142862e-06, + "loss": 0.4932, + "step": 12830 + }, + { + "epoch": 36.013714285714286, + "grad_norm": 16.838157653808594, + "learning_rate": 2.9593650793650797e-06, + "loss": 1.1193, + "step": 12840 + }, + { + "epoch": 36.01428571428571, + "grad_norm": 0.04459202662110329, + "learning_rate": 2.953015873015873e-06, + "loss": 0.184, + "step": 12850 + }, + { + "epoch": 36.014857142857146, + "grad_norm": 0.38909366726875305, + "learning_rate": 2.946666666666667e-06, + "loss": 0.2212, + "step": 12860 + }, + { + "epoch": 36.01542857142857, + "grad_norm": 0.1248052716255188, + "learning_rate": 2.9403174603174606e-06, + "loss": 0.0016, + "step": 12870 + }, + { + "epoch": 36.016, + "grad_norm": 0.5679760575294495, + "learning_rate": 2.933968253968254e-06, + "loss": 0.0029, + "step": 12880 + }, + { + "epoch": 36.01657142857143, + "grad_norm": 0.02419651672244072, + "learning_rate": 2.927619047619048e-06, + "loss": 0.2692, + "step": 12890 + }, + { + "epoch": 36.01714285714286, + "grad_norm": 0.2062833309173584, + "learning_rate": 2.9212698412698415e-06, + "loss": 0.1403, + "step": 12900 + }, + { + "epoch": 36.017714285714284, + "grad_norm": 0.21458952128887177, + "learning_rate": 2.914920634920635e-06, + "loss": 0.4953, + "step": 12910 + }, + { + "epoch": 36.01828571428572, + "grad_norm": 0.24665455520153046, + "learning_rate": 2.908571428571429e-06, + "loss": 0.7257, + "step": 12920 + }, + { + "epoch": 36.018857142857144, + "grad_norm": 0.5908558368682861, + "learning_rate": 2.9022222222222223e-06, + "loss": 0.005, + "step": 12930 + }, + { + "epoch": 36.01942857142857, + "grad_norm": 0.19099822640419006, + "learning_rate": 2.895873015873016e-06, + "loss": 0.0019, + "step": 12940 + }, + { + "epoch": 36.02, + "grad_norm": 0.05261155590415001, + "learning_rate": 2.8895238095238097e-06, + "loss": 0.3917, + "step": 12950 + }, + { + "epoch": 36.02, + "eval_accuracy": 0.8383838383838383, + "eval_loss": 0.9695500135421753, + "eval_runtime": 126.7723, + "eval_samples_per_second": 2.343, + "eval_steps_per_second": 1.175, + "step": 12950 + }, + { + "epoch": 37.000571428571426, + "grad_norm": 0.07244765758514404, + "learning_rate": 2.8831746031746032e-06, + "loss": 0.0006, + "step": 12960 + }, + { + "epoch": 37.00114285714286, + "grad_norm": 0.04542018845677376, + "learning_rate": 2.876825396825397e-06, + "loss": 0.4526, + "step": 12970 + }, + { + "epoch": 37.001714285714286, + "grad_norm": 0.007704177405685186, + "learning_rate": 2.8704761904761906e-06, + "loss": 0.0015, + "step": 12980 + }, + { + "epoch": 37.00228571428571, + "grad_norm": 0.01741478592157364, + "learning_rate": 2.864126984126984e-06, + "loss": 0.4346, + "step": 12990 + }, + { + "epoch": 37.002857142857145, + "grad_norm": 0.020365318283438683, + "learning_rate": 2.8577777777777784e-06, + "loss": 0.2763, + "step": 13000 + }, + { + "epoch": 37.00342857142857, + "grad_norm": 0.10666250437498093, + "learning_rate": 2.8514285714285715e-06, + "loss": 0.1876, + "step": 13010 + }, + { + "epoch": 37.004, + "grad_norm": 0.0016732965596020222, + "learning_rate": 2.845079365079365e-06, + "loss": 0.0018, + "step": 13020 + }, + { + "epoch": 37.00457142857143, + "grad_norm": 0.025395436212420464, + "learning_rate": 2.8387301587301593e-06, + "loss": 0.0012, + "step": 13030 + }, + { + "epoch": 37.00514285714286, + "grad_norm": 2.5686378479003906, + "learning_rate": 2.8323809523809528e-06, + "loss": 0.197, + "step": 13040 + }, + { + "epoch": 37.005714285714284, + "grad_norm": 0.044472586363554, + "learning_rate": 2.826031746031746e-06, + "loss": 0.0021, + "step": 13050 + }, + { + "epoch": 37.00628571428572, + "grad_norm": 0.05117283761501312, + "learning_rate": 2.81968253968254e-06, + "loss": 0.0008, + "step": 13060 + }, + { + "epoch": 37.00685714285714, + "grad_norm": 0.1931295245885849, + "learning_rate": 2.8133333333333336e-06, + "loss": 0.0012, + "step": 13070 + }, + { + "epoch": 37.00742857142857, + "grad_norm": 0.0703316405415535, + "learning_rate": 2.8069841269841276e-06, + "loss": 0.0013, + "step": 13080 + }, + { + "epoch": 37.008, + "grad_norm": 0.03455556929111481, + "learning_rate": 2.800634920634921e-06, + "loss": 0.0009, + "step": 13090 + }, + { + "epoch": 37.00857142857143, + "grad_norm": 0.007254268042743206, + "learning_rate": 2.7942857142857145e-06, + "loss": 0.906, + "step": 13100 + }, + { + "epoch": 37.009142857142855, + "grad_norm": 0.01259919349104166, + "learning_rate": 2.7879365079365084e-06, + "loss": 0.5051, + "step": 13110 + }, + { + "epoch": 37.00971428571429, + "grad_norm": 0.05640314146876335, + "learning_rate": 2.781587301587302e-06, + "loss": 0.0017, + "step": 13120 + }, + { + "epoch": 37.010285714285715, + "grad_norm": 0.0015288791619241238, + "learning_rate": 2.7752380952380954e-06, + "loss": 0.6821, + "step": 13130 + }, + { + "epoch": 37.01085714285714, + "grad_norm": 0.1644653081893921, + "learning_rate": 2.7688888888888893e-06, + "loss": 0.1161, + "step": 13140 + }, + { + "epoch": 37.011428571428574, + "grad_norm": 0.0032219102140516043, + "learning_rate": 2.7625396825396828e-06, + "loss": 0.0012, + "step": 13150 + }, + { + "epoch": 37.012, + "grad_norm": 0.1719381958246231, + "learning_rate": 2.7561904761904763e-06, + "loss": 0.0014, + "step": 13160 + }, + { + "epoch": 37.01257142857143, + "grad_norm": 0.04314820095896721, + "learning_rate": 2.74984126984127e-06, + "loss": 0.0016, + "step": 13170 + }, + { + "epoch": 37.01314285714286, + "grad_norm": 0.05535136163234711, + "learning_rate": 2.7434920634920637e-06, + "loss": 0.0016, + "step": 13180 + }, + { + "epoch": 37.013714285714286, + "grad_norm": 0.004680715035647154, + "learning_rate": 2.737142857142857e-06, + "loss": 0.0195, + "step": 13190 + }, + { + "epoch": 37.01428571428571, + "grad_norm": 0.020550280809402466, + "learning_rate": 2.730793650793651e-06, + "loss": 0.0009, + "step": 13200 + }, + { + "epoch": 37.014857142857146, + "grad_norm": 0.02597636915743351, + "learning_rate": 2.7244444444444445e-06, + "loss": 0.3391, + "step": 13210 + }, + { + "epoch": 37.01542857142857, + "grad_norm": 0.022728653624653816, + "learning_rate": 2.7180952380952384e-06, + "loss": 0.3329, + "step": 13220 + }, + { + "epoch": 37.016, + "grad_norm": 37.18050003051758, + "learning_rate": 2.711746031746032e-06, + "loss": 1.1542, + "step": 13230 + }, + { + "epoch": 37.01657142857143, + "grad_norm": 0.11314905434846878, + "learning_rate": 2.7053968253968254e-06, + "loss": 0.3948, + "step": 13240 + }, + { + "epoch": 37.01714285714286, + "grad_norm": 16.31825065612793, + "learning_rate": 2.6990476190476193e-06, + "loss": 0.4456, + "step": 13250 + }, + { + "epoch": 37.017714285714284, + "grad_norm": 0.0014047048753127456, + "learning_rate": 2.6926984126984128e-06, + "loss": 0.2791, + "step": 13260 + }, + { + "epoch": 37.01828571428572, + "grad_norm": 0.059588722884655, + "learning_rate": 2.6863492063492063e-06, + "loss": 0.2329, + "step": 13270 + }, + { + "epoch": 37.018857142857144, + "grad_norm": 0.005540232639759779, + "learning_rate": 2.68e-06, + "loss": 0.0019, + "step": 13280 + }, + { + "epoch": 37.01942857142857, + "grad_norm": 18.04555892944336, + "learning_rate": 2.6736507936507937e-06, + "loss": 0.7975, + "step": 13290 + }, + { + "epoch": 37.02, + "grad_norm": 0.0029781581833958626, + "learning_rate": 2.667301587301587e-06, + "loss": 0.7268, + "step": 13300 + }, + { + "epoch": 37.02, + "eval_accuracy": 0.8181818181818182, + "eval_loss": 1.1061733961105347, + "eval_runtime": 126.817, + "eval_samples_per_second": 2.342, + "eval_steps_per_second": 1.175, + "step": 13300 + }, + { + "epoch": 38.000571428571426, + "grad_norm": 0.47502627968788147, + "learning_rate": 2.6609523809523815e-06, + "loss": 0.3805, + "step": 13310 + }, + { + "epoch": 38.00114285714286, + "grad_norm": 0.08994001895189285, + "learning_rate": 2.6546031746031745e-06, + "loss": 0.003, + "step": 13320 + }, + { + "epoch": 38.001714285714286, + "grad_norm": 0.06851596385240555, + "learning_rate": 2.648253968253969e-06, + "loss": 0.0035, + "step": 13330 + }, + { + "epoch": 38.00228571428571, + "grad_norm": 0.1989380419254303, + "learning_rate": 2.6419047619047623e-06, + "loss": 0.3274, + "step": 13340 + }, + { + "epoch": 38.002857142857145, + "grad_norm": 0.04935070872306824, + "learning_rate": 2.635555555555556e-06, + "loss": 0.4874, + "step": 13350 + }, + { + "epoch": 38.00342857142857, + "grad_norm": 0.04223153740167618, + "learning_rate": 2.6292063492063497e-06, + "loss": 0.2244, + "step": 13360 + }, + { + "epoch": 38.004, + "grad_norm": 0.06442692130804062, + "learning_rate": 2.6228571428571432e-06, + "loss": 0.005, + "step": 13370 + }, + { + "epoch": 38.00457142857143, + "grad_norm": 0.04460546746850014, + "learning_rate": 2.6165079365079367e-06, + "loss": 0.238, + "step": 13380 + }, + { + "epoch": 38.00514285714286, + "grad_norm": 0.0043782079592347145, + "learning_rate": 2.6101587301587306e-06, + "loss": 0.2346, + "step": 13390 + }, + { + "epoch": 38.005714285714284, + "grad_norm": 0.05058957263827324, + "learning_rate": 2.603809523809524e-06, + "loss": 0.4129, + "step": 13400 + }, + { + "epoch": 38.00628571428572, + "grad_norm": 0.020779475569725037, + "learning_rate": 2.5974603174603176e-06, + "loss": 0.154, + "step": 13410 + }, + { + "epoch": 38.00685714285714, + "grad_norm": 0.3094046413898468, + "learning_rate": 2.5911111111111115e-06, + "loss": 0.3172, + "step": 13420 + }, + { + "epoch": 38.00742857142857, + "grad_norm": 0.38032662868499756, + "learning_rate": 2.584761904761905e-06, + "loss": 0.2117, + "step": 13430 + }, + { + "epoch": 38.008, + "grad_norm": 0.004349476657807827, + "learning_rate": 2.5784126984126984e-06, + "loss": 0.2803, + "step": 13440 + }, + { + "epoch": 38.00857142857143, + "grad_norm": 0.06518778204917908, + "learning_rate": 2.5720634920634924e-06, + "loss": 0.2218, + "step": 13450 + }, + { + "epoch": 38.009142857142855, + "grad_norm": 0.09332817047834396, + "learning_rate": 2.565714285714286e-06, + "loss": 0.2499, + "step": 13460 + }, + { + "epoch": 38.00971428571429, + "grad_norm": 0.11110708862543106, + "learning_rate": 2.5593650793650797e-06, + "loss": 0.6162, + "step": 13470 + }, + { + "epoch": 38.010285714285715, + "grad_norm": 0.07068169862031937, + "learning_rate": 2.5530158730158732e-06, + "loss": 0.4614, + "step": 13480 + }, + { + "epoch": 38.01085714285714, + "grad_norm": 0.0015391431516036391, + "learning_rate": 2.5466666666666667e-06, + "loss": 0.3118, + "step": 13490 + }, + { + "epoch": 38.011428571428574, + "grad_norm": 0.05324092507362366, + "learning_rate": 2.5403174603174606e-06, + "loss": 0.0024, + "step": 13500 + }, + { + "epoch": 38.012, + "grad_norm": 0.03473285958170891, + "learning_rate": 2.533968253968254e-06, + "loss": 0.1767, + "step": 13510 + }, + { + "epoch": 38.01257142857143, + "grad_norm": 0.05727458372712135, + "learning_rate": 2.5276190476190476e-06, + "loss": 0.2763, + "step": 13520 + }, + { + "epoch": 38.01314285714286, + "grad_norm": 0.04184458777308464, + "learning_rate": 2.5212698412698415e-06, + "loss": 0.0069, + "step": 13530 + }, + { + "epoch": 38.013714285714286, + "grad_norm": 0.003181879874318838, + "learning_rate": 2.514920634920635e-06, + "loss": 0.001, + "step": 13540 + }, + { + "epoch": 38.01428571428571, + "grad_norm": 0.019657937809824944, + "learning_rate": 2.5085714285714285e-06, + "loss": 0.0081, + "step": 13550 + }, + { + "epoch": 38.014857142857146, + "grad_norm": 0.032800447195768356, + "learning_rate": 2.5022222222222224e-06, + "loss": 0.0018, + "step": 13560 + }, + { + "epoch": 38.01542857142857, + "grad_norm": 0.04382657632231712, + "learning_rate": 2.495873015873016e-06, + "loss": 0.0013, + "step": 13570 + }, + { + "epoch": 38.016, + "grad_norm": 0.008959997445344925, + "learning_rate": 2.4895238095238097e-06, + "loss": 0.0004, + "step": 13580 + }, + { + "epoch": 38.01657142857143, + "grad_norm": 0.013616718351840973, + "learning_rate": 2.4831746031746037e-06, + "loss": 0.0008, + "step": 13590 + }, + { + "epoch": 38.01714285714286, + "grad_norm": 27.68400001525879, + "learning_rate": 2.4768253968253967e-06, + "loss": 0.4851, + "step": 13600 + }, + { + "epoch": 38.017714285714284, + "grad_norm": 20.209680557250977, + "learning_rate": 2.4704761904761906e-06, + "loss": 0.3812, + "step": 13610 + }, + { + "epoch": 38.01828571428572, + "grad_norm": 0.13937675952911377, + "learning_rate": 2.4641269841269845e-06, + "loss": 0.2243, + "step": 13620 + }, + { + "epoch": 38.018857142857144, + "grad_norm": 0.0037647627759724855, + "learning_rate": 2.457777777777778e-06, + "loss": 0.6971, + "step": 13630 + }, + { + "epoch": 38.01942857142857, + "grad_norm": 0.033469632267951965, + "learning_rate": 2.4514285714285715e-06, + "loss": 0.3434, + "step": 13640 + }, + { + "epoch": 38.02, + "grad_norm": 0.023815317079424858, + "learning_rate": 2.4450793650793654e-06, + "loss": 0.3747, + "step": 13650 + }, + { + "epoch": 38.02, + "eval_accuracy": 0.835016835016835, + "eval_loss": 1.036763072013855, + "eval_runtime": 126.6279, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 1.177, + "step": 13650 + }, + { + "epoch": 39.000571428571426, + "grad_norm": 49.7794075012207, + "learning_rate": 2.438730158730159e-06, + "loss": 0.003, + "step": 13660 + }, + { + "epoch": 39.00114285714286, + "grad_norm": 0.03126109018921852, + "learning_rate": 2.4323809523809524e-06, + "loss": 0.1622, + "step": 13670 + }, + { + "epoch": 39.001714285714286, + "grad_norm": 0.047299765050411224, + "learning_rate": 2.4260317460317463e-06, + "loss": 0.2872, + "step": 13680 + }, + { + "epoch": 39.00228571428571, + "grad_norm": 0.05949264392256737, + "learning_rate": 2.4196825396825398e-06, + "loss": 0.2092, + "step": 13690 + }, + { + "epoch": 39.002857142857145, + "grad_norm": 0.026437293738126755, + "learning_rate": 2.4133333333333337e-06, + "loss": 0.0031, + "step": 13700 + }, + { + "epoch": 39.00342857142857, + "grad_norm": 0.20057931542396545, + "learning_rate": 2.406984126984127e-06, + "loss": 0.4873, + "step": 13710 + }, + { + "epoch": 39.004, + "grad_norm": 0.004649260081350803, + "learning_rate": 2.4006349206349206e-06, + "loss": 0.2073, + "step": 13720 + }, + { + "epoch": 39.00457142857143, + "grad_norm": 0.5119889974594116, + "learning_rate": 2.3942857142857145e-06, + "loss": 0.4199, + "step": 13730 + }, + { + "epoch": 39.00514285714286, + "grad_norm": 0.16033364832401276, + "learning_rate": 2.387936507936508e-06, + "loss": 0.0135, + "step": 13740 + }, + { + "epoch": 39.005714285714284, + "grad_norm": 0.0032101422548294067, + "learning_rate": 2.381587301587302e-06, + "loss": 0.9572, + "step": 13750 + }, + { + "epoch": 39.00628571428572, + "grad_norm": 0.026194965466856956, + "learning_rate": 2.3752380952380954e-06, + "loss": 1.2531, + "step": 13760 + }, + { + "epoch": 39.00685714285714, + "grad_norm": 0.0062312232330441475, + "learning_rate": 2.3688888888888893e-06, + "loss": 0.2321, + "step": 13770 + }, + { + "epoch": 39.00742857142857, + "grad_norm": 0.07098673284053802, + "learning_rate": 2.362539682539683e-06, + "loss": 0.0025, + "step": 13780 + }, + { + "epoch": 39.008, + "grad_norm": 0.0009907495696097612, + "learning_rate": 2.3561904761904763e-06, + "loss": 0.563, + "step": 13790 + }, + { + "epoch": 39.00857142857143, + "grad_norm": 0.0647115707397461, + "learning_rate": 2.34984126984127e-06, + "loss": 0.8326, + "step": 13800 + }, + { + "epoch": 39.009142857142855, + "grad_norm": 0.0811251774430275, + "learning_rate": 2.3434920634920637e-06, + "loss": 0.6204, + "step": 13810 + }, + { + "epoch": 39.00971428571429, + "grad_norm": 0.011888241395354271, + "learning_rate": 2.337142857142857e-06, + "loss": 0.0029, + "step": 13820 + }, + { + "epoch": 39.010285714285715, + "grad_norm": 0.06965713948011398, + "learning_rate": 2.330793650793651e-06, + "loss": 0.0021, + "step": 13830 + }, + { + "epoch": 39.01085714285714, + "grad_norm": 0.0806642472743988, + "learning_rate": 2.3244444444444445e-06, + "loss": 0.2988, + "step": 13840 + }, + { + "epoch": 39.011428571428574, + "grad_norm": 0.007325103506445885, + "learning_rate": 2.318095238095238e-06, + "loss": 0.0013, + "step": 13850 + }, + { + "epoch": 39.012, + "grad_norm": 0.11560201644897461, + "learning_rate": 2.311746031746032e-06, + "loss": 0.1653, + "step": 13860 + }, + { + "epoch": 39.01257142857143, + "grad_norm": 0.033399466425180435, + "learning_rate": 2.305396825396826e-06, + "loss": 0.0027, + "step": 13870 + }, + { + "epoch": 39.01314285714286, + "grad_norm": 1141.728515625, + "learning_rate": 2.2990476190476193e-06, + "loss": 0.0865, + "step": 13880 + }, + { + "epoch": 39.013714285714286, + "grad_norm": 0.0021837761159986258, + "learning_rate": 2.292698412698413e-06, + "loss": 0.2472, + "step": 13890 + }, + { + "epoch": 39.01428571428571, + "grad_norm": 0.008280741050839424, + "learning_rate": 2.2863492063492067e-06, + "loss": 0.2712, + "step": 13900 + }, + { + "epoch": 39.014857142857146, + "grad_norm": 0.00286860391497612, + "learning_rate": 2.28e-06, + "loss": 0.2084, + "step": 13910 + }, + { + "epoch": 39.01542857142857, + "grad_norm": 0.012107732705771923, + "learning_rate": 2.2736507936507937e-06, + "loss": 0.0006, + "step": 13920 + }, + { + "epoch": 39.016, + "grad_norm": 0.0011101092677563429, + "learning_rate": 2.2673015873015876e-06, + "loss": 0.3675, + "step": 13930 + }, + { + "epoch": 39.01657142857143, + "grad_norm": 0.03219223394989967, + "learning_rate": 2.260952380952381e-06, + "loss": 0.0032, + "step": 13940 + }, + { + "epoch": 39.01714285714286, + "grad_norm": 0.0029516194481402636, + "learning_rate": 2.254603174603175e-06, + "loss": 0.0086, + "step": 13950 + }, + { + "epoch": 39.017714285714284, + "grad_norm": 0.030327564105391502, + "learning_rate": 2.2482539682539685e-06, + "loss": 0.2982, + "step": 13960 + }, + { + "epoch": 39.01828571428572, + "grad_norm": 0.010290997102856636, + "learning_rate": 2.241904761904762e-06, + "loss": 0.2149, + "step": 13970 + }, + { + "epoch": 39.018857142857144, + "grad_norm": 0.031805507838726044, + "learning_rate": 2.235555555555556e-06, + "loss": 0.1666, + "step": 13980 + }, + { + "epoch": 39.01942857142857, + "grad_norm": 0.05369047820568085, + "learning_rate": 2.2292063492063493e-06, + "loss": 0.0689, + "step": 13990 + }, + { + "epoch": 39.02, + "grad_norm": 0.07717669010162354, + "learning_rate": 2.222857142857143e-06, + "loss": 0.5584, + "step": 14000 + }, + { + "epoch": 39.02, + "eval_accuracy": 0.8417508417508418, + "eval_loss": 1.0148627758026123, + "eval_runtime": 126.757, + "eval_samples_per_second": 2.343, + "eval_steps_per_second": 1.175, + "step": 14000 + }, + { + "epoch": 40.000571428571426, + "grad_norm": 0.2859209179878235, + "learning_rate": 2.2165079365079367e-06, + "loss": 0.001, + "step": 14010 + }, + { + "epoch": 40.00114285714286, + "grad_norm": 0.06745479255914688, + "learning_rate": 2.2101587301587306e-06, + "loss": 0.0017, + "step": 14020 + }, + { + "epoch": 40.001714285714286, + "grad_norm": 0.08140210062265396, + "learning_rate": 2.203809523809524e-06, + "loss": 0.0011, + "step": 14030 + }, + { + "epoch": 40.00228571428571, + "grad_norm": 27.934417724609375, + "learning_rate": 2.1974603174603176e-06, + "loss": 0.1988, + "step": 14040 + }, + { + "epoch": 40.002857142857145, + "grad_norm": 0.01658036932349205, + "learning_rate": 2.1911111111111115e-06, + "loss": 0.5945, + "step": 14050 + }, + { + "epoch": 40.00342857142857, + "grad_norm": 0.0029238651040941477, + "learning_rate": 2.184761904761905e-06, + "loss": 0.001, + "step": 14060 + }, + { + "epoch": 40.004, + "grad_norm": 0.5616852045059204, + "learning_rate": 2.1784126984126985e-06, + "loss": 0.2197, + "step": 14070 + }, + { + "epoch": 40.00457142857143, + "grad_norm": 0.005956857465207577, + "learning_rate": 2.1720634920634924e-06, + "loss": 0.007, + "step": 14080 + }, + { + "epoch": 40.00514285714286, + "grad_norm": 0.0020911735482513905, + "learning_rate": 2.165714285714286e-06, + "loss": 0.2357, + "step": 14090 + }, + { + "epoch": 40.005714285714284, + "grad_norm": 0.010295256972312927, + "learning_rate": 2.1593650793650793e-06, + "loss": 0.2058, + "step": 14100 + }, + { + "epoch": 40.00628571428572, + "grad_norm": 390.59674072265625, + "learning_rate": 2.1530158730158732e-06, + "loss": 0.4373, + "step": 14110 + }, + { + "epoch": 40.00685714285714, + "grad_norm": 0.01252991147339344, + "learning_rate": 2.1466666666666667e-06, + "loss": 0.0004, + "step": 14120 + }, + { + "epoch": 40.00742857142857, + "grad_norm": 0.0566883347928524, + "learning_rate": 2.1403174603174606e-06, + "loss": 0.3251, + "step": 14130 + }, + { + "epoch": 40.008, + "grad_norm": 0.0027798376977443695, + "learning_rate": 2.133968253968254e-06, + "loss": 0.0004, + "step": 14140 + }, + { + "epoch": 40.00857142857143, + "grad_norm": 0.049019478261470795, + "learning_rate": 2.1276190476190476e-06, + "loss": 0.0005, + "step": 14150 + }, + { + "epoch": 40.009142857142855, + "grad_norm": 0.25185105204582214, + "learning_rate": 2.1212698412698415e-06, + "loss": 0.0016, + "step": 14160 + }, + { + "epoch": 40.00971428571429, + "grad_norm": 0.05718778818845749, + "learning_rate": 2.114920634920635e-06, + "loss": 0.2511, + "step": 14170 + }, + { + "epoch": 40.010285714285715, + "grad_norm": 0.017743902280926704, + "learning_rate": 2.108571428571429e-06, + "loss": 0.2719, + "step": 14180 + }, + { + "epoch": 40.01085714285714, + "grad_norm": 0.003888359060510993, + "learning_rate": 2.1022222222222224e-06, + "loss": 0.001, + "step": 14190 + }, + { + "epoch": 40.011428571428574, + "grad_norm": 0.1548689901828766, + "learning_rate": 2.0958730158730163e-06, + "loss": 0.2496, + "step": 14200 + }, + { + "epoch": 40.012, + "grad_norm": 0.14938634634017944, + "learning_rate": 2.0895238095238098e-06, + "loss": 0.1626, + "step": 14210 + }, + { + "epoch": 40.01257142857143, + "grad_norm": 0.0049010817892849445, + "learning_rate": 2.0831746031746032e-06, + "loss": 0.0009, + "step": 14220 + }, + { + "epoch": 40.01314285714286, + "grad_norm": 0.06158607453107834, + "learning_rate": 2.076825396825397e-06, + "loss": 0.6294, + "step": 14230 + }, + { + "epoch": 40.013714285714286, + "grad_norm": 0.396128386259079, + "learning_rate": 2.0704761904761906e-06, + "loss": 0.2451, + "step": 14240 + }, + { + "epoch": 40.01428571428571, + "grad_norm": 0.7225034236907959, + "learning_rate": 2.064126984126984e-06, + "loss": 0.1585, + "step": 14250 + }, + { + "epoch": 40.014857142857146, + "grad_norm": 28.033405303955078, + "learning_rate": 2.057777777777778e-06, + "loss": 0.2748, + "step": 14260 + }, + { + "epoch": 40.01542857142857, + "grad_norm": 95.52556610107422, + "learning_rate": 2.0514285714285715e-06, + "loss": 0.1725, + "step": 14270 + }, + { + "epoch": 40.016, + "grad_norm": 75.94400787353516, + "learning_rate": 2.045079365079365e-06, + "loss": 0.158, + "step": 14280 + }, + { + "epoch": 40.01657142857143, + "grad_norm": 0.08160189539194107, + "learning_rate": 2.038730158730159e-06, + "loss": 0.0047, + "step": 14290 + }, + { + "epoch": 40.01714285714286, + "grad_norm": 0.04288149252533913, + "learning_rate": 2.032380952380953e-06, + "loss": 0.002, + "step": 14300 + }, + { + "epoch": 40.017714285714284, + "grad_norm": 0.013937451876699924, + "learning_rate": 2.026031746031746e-06, + "loss": 0.0041, + "step": 14310 + }, + { + "epoch": 40.01828571428572, + "grad_norm": 91.30559539794922, + "learning_rate": 2.0196825396825398e-06, + "loss": 0.2636, + "step": 14320 + }, + { + "epoch": 40.018857142857144, + "grad_norm": 0.051494400948286057, + "learning_rate": 2.0133333333333337e-06, + "loss": 0.0489, + "step": 14330 + }, + { + "epoch": 40.01942857142857, + "grad_norm": 0.0007844906649552286, + "learning_rate": 2.006984126984127e-06, + "loss": 0.6524, + "step": 14340 + }, + { + "epoch": 40.02, + "grad_norm": 19.62409210205078, + "learning_rate": 2.0006349206349206e-06, + "loss": 0.4637, + "step": 14350 + }, + { + "epoch": 40.02, + "eval_accuracy": 0.8316498316498316, + "eval_loss": 1.0104492902755737, + "eval_runtime": 126.2637, + "eval_samples_per_second": 2.352, + "eval_steps_per_second": 1.18, + "step": 14350 + }, + { + "epoch": 41.000571428571426, + "grad_norm": 0.10992828756570816, + "learning_rate": 1.9942857142857146e-06, + "loss": 0.1912, + "step": 14360 + }, + { + "epoch": 41.00114285714286, + "grad_norm": 7.585694313049316, + "learning_rate": 1.987936507936508e-06, + "loss": 0.0027, + "step": 14370 + }, + { + "epoch": 41.001714285714286, + "grad_norm": 0.027735279873013496, + "learning_rate": 1.9815873015873015e-06, + "loss": 0.3344, + "step": 14380 + }, + { + "epoch": 41.00228571428571, + "grad_norm": 0.02809802256524563, + "learning_rate": 1.9752380952380954e-06, + "loss": 0.0989, + "step": 14390 + }, + { + "epoch": 41.002857142857145, + "grad_norm": 0.0027615518774837255, + "learning_rate": 1.968888888888889e-06, + "loss": 0.0007, + "step": 14400 + }, + { + "epoch": 41.00342857142857, + "grad_norm": 0.009083566255867481, + "learning_rate": 1.962539682539683e-06, + "loss": 0.0071, + "step": 14410 + }, + { + "epoch": 41.004, + "grad_norm": 43.1031379699707, + "learning_rate": 1.9561904761904763e-06, + "loss": 0.213, + "step": 14420 + }, + { + "epoch": 41.00457142857143, + "grad_norm": 0.06005506217479706, + "learning_rate": 1.9498412698412698e-06, + "loss": 0.0027, + "step": 14430 + }, + { + "epoch": 41.00514285714286, + "grad_norm": 0.10170082747936249, + "learning_rate": 1.9434920634920637e-06, + "loss": 0.3536, + "step": 14440 + }, + { + "epoch": 41.005714285714284, + "grad_norm": 0.0068178740330040455, + "learning_rate": 1.9371428571428576e-06, + "loss": 0.3189, + "step": 14450 + }, + { + "epoch": 41.00628571428572, + "grad_norm": 381.4216003417969, + "learning_rate": 1.930793650793651e-06, + "loss": 0.1193, + "step": 14460 + }, + { + "epoch": 41.00685714285714, + "grad_norm": 0.004170392639935017, + "learning_rate": 1.9244444444444446e-06, + "loss": 0.258, + "step": 14470 + }, + { + "epoch": 41.00742857142857, + "grad_norm": 0.059858277440071106, + "learning_rate": 1.9180952380952385e-06, + "loss": 0.2108, + "step": 14480 + }, + { + "epoch": 41.008, + "grad_norm": 0.0008723547798581421, + "learning_rate": 1.911746031746032e-06, + "loss": 0.2695, + "step": 14490 + }, + { + "epoch": 41.00857142857143, + "grad_norm": 44.76935958862305, + "learning_rate": 1.9053968253968254e-06, + "loss": 0.7032, + "step": 14500 + }, + { + "epoch": 41.009142857142855, + "grad_norm": 0.011419898830354214, + "learning_rate": 1.8990476190476193e-06, + "loss": 0.0005, + "step": 14510 + }, + { + "epoch": 41.00971428571429, + "grad_norm": 0.001278414623811841, + "learning_rate": 1.892698412698413e-06, + "loss": 0.0007, + "step": 14520 + }, + { + "epoch": 41.010285714285715, + "grad_norm": 0.011867762543261051, + "learning_rate": 1.8863492063492065e-06, + "loss": 0.0005, + "step": 14530 + }, + { + "epoch": 41.01085714285714, + "grad_norm": 0.014038086868822575, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.6461, + "step": 14540 + }, + { + "epoch": 41.011428571428574, + "grad_norm": 32.356666564941406, + "learning_rate": 1.873650793650794e-06, + "loss": 0.5288, + "step": 14550 + }, + { + "epoch": 41.012, + "grad_norm": 0.18130460381507874, + "learning_rate": 1.8673015873015874e-06, + "loss": 0.7912, + "step": 14560 + }, + { + "epoch": 41.01257142857143, + "grad_norm": 0.2205055207014084, + "learning_rate": 1.860952380952381e-06, + "loss": 0.0011, + "step": 14570 + }, + { + "epoch": 41.01314285714286, + "grad_norm": 0.002843776484951377, + "learning_rate": 1.8546031746031748e-06, + "loss": 0.0006, + "step": 14580 + }, + { + "epoch": 41.013714285714286, + "grad_norm": 0.0019020799081772566, + "learning_rate": 1.8482539682539685e-06, + "loss": 0.0116, + "step": 14590 + }, + { + "epoch": 41.01428571428571, + "grad_norm": 0.12416958808898926, + "learning_rate": 1.841904761904762e-06, + "loss": 0.2979, + "step": 14600 + }, + { + "epoch": 41.014857142857146, + "grad_norm": 0.0510811023414135, + "learning_rate": 1.8355555555555557e-06, + "loss": 0.2946, + "step": 14610 + }, + { + "epoch": 41.01542857142857, + "grad_norm": 0.04932905361056328, + "learning_rate": 1.8292063492063493e-06, + "loss": 0.557, + "step": 14620 + }, + { + "epoch": 41.016, + "grad_norm": 6.153399467468262, + "learning_rate": 1.8228571428571428e-06, + "loss": 0.2424, + "step": 14630 + }, + { + "epoch": 41.01657142857143, + "grad_norm": 0.1840139478445053, + "learning_rate": 1.8165079365079365e-06, + "loss": 0.3622, + "step": 14640 + }, + { + "epoch": 41.01714285714286, + "grad_norm": 0.05411524698138237, + "learning_rate": 1.8101587301587304e-06, + "loss": 0.2168, + "step": 14650 + }, + { + "epoch": 41.017714285714284, + "grad_norm": 0.02711281180381775, + "learning_rate": 1.8038095238095241e-06, + "loss": 0.3268, + "step": 14660 + }, + { + "epoch": 41.01828571428572, + "grad_norm": 0.005898493342101574, + "learning_rate": 1.7974603174603176e-06, + "loss": 0.003, + "step": 14670 + }, + { + "epoch": 41.018857142857144, + "grad_norm": 0.08160807937383652, + "learning_rate": 1.7911111111111113e-06, + "loss": 0.0009, + "step": 14680 + }, + { + "epoch": 41.01942857142857, + "grad_norm": 0.0039462801069021225, + "learning_rate": 1.784761904761905e-06, + "loss": 0.0006, + "step": 14690 + }, + { + "epoch": 41.02, + "grad_norm": 1.8989238739013672, + "learning_rate": 1.7784126984126985e-06, + "loss": 0.0014, + "step": 14700 + }, + { + "epoch": 41.02, + "eval_accuracy": 0.8417508417508418, + "eval_loss": 1.043727159500122, + "eval_runtime": 126.4211, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 1.179, + "step": 14700 + }, + { + "epoch": 42.000571428571426, + "grad_norm": 0.014098647981882095, + "learning_rate": 1.7720634920634922e-06, + "loss": 0.1607, + "step": 14710 + }, + { + "epoch": 42.00114285714286, + "grad_norm": 20.41655158996582, + "learning_rate": 1.7657142857142859e-06, + "loss": 0.7527, + "step": 14720 + }, + { + "epoch": 42.001714285714286, + "grad_norm": 0.023175185546278954, + "learning_rate": 1.7593650793650796e-06, + "loss": 0.2158, + "step": 14730 + }, + { + "epoch": 42.00228571428571, + "grad_norm": 0.013949972577393055, + "learning_rate": 1.753015873015873e-06, + "loss": 0.0014, + "step": 14740 + }, + { + "epoch": 42.002857142857145, + "grad_norm": 78.23534393310547, + "learning_rate": 1.7466666666666667e-06, + "loss": 0.2982, + "step": 14750 + }, + { + "epoch": 42.00342857142857, + "grad_norm": 0.06348254531621933, + "learning_rate": 1.7403174603174604e-06, + "loss": 0.3159, + "step": 14760 + }, + { + "epoch": 42.004, + "grad_norm": 0.0164299625903368, + "learning_rate": 1.7339682539682543e-06, + "loss": 0.0015, + "step": 14770 + }, + { + "epoch": 42.00457142857143, + "grad_norm": 0.0010301744332537055, + "learning_rate": 1.7276190476190476e-06, + "loss": 0.0005, + "step": 14780 + }, + { + "epoch": 42.00514285714286, + "grad_norm": 0.11905546486377716, + "learning_rate": 1.7212698412698415e-06, + "loss": 0.0194, + "step": 14790 + }, + { + "epoch": 42.005714285714284, + "grad_norm": 0.05502673611044884, + "learning_rate": 1.7149206349206352e-06, + "loss": 0.0014, + "step": 14800 + }, + { + "epoch": 42.00628571428572, + "grad_norm": 0.005338889546692371, + "learning_rate": 1.7085714285714287e-06, + "loss": 0.0021, + "step": 14810 + }, + { + "epoch": 42.00685714285714, + "grad_norm": 0.021363425999879837, + "learning_rate": 1.7022222222222224e-06, + "loss": 0.2946, + "step": 14820 + }, + { + "epoch": 42.00742857142857, + "grad_norm": 0.0004772421089001, + "learning_rate": 1.695873015873016e-06, + "loss": 0.3743, + "step": 14830 + }, + { + "epoch": 42.008, + "grad_norm": 0.08759511262178421, + "learning_rate": 1.6895238095238098e-06, + "loss": 0.3515, + "step": 14840 + }, + { + "epoch": 42.00857142857143, + "grad_norm": 0.03472195193171501, + "learning_rate": 1.6831746031746033e-06, + "loss": 0.0011, + "step": 14850 + }, + { + "epoch": 42.009142857142855, + "grad_norm": 0.011670676060020924, + "learning_rate": 1.676825396825397e-06, + "loss": 0.3558, + "step": 14860 + }, + { + "epoch": 42.00971428571429, + "grad_norm": 0.013489765115082264, + "learning_rate": 1.6704761904761907e-06, + "loss": 0.0845, + "step": 14870 + }, + { + "epoch": 42.010285714285715, + "grad_norm": 0.005060871131718159, + "learning_rate": 1.6641269841269841e-06, + "loss": 0.0024, + "step": 14880 + }, + { + "epoch": 42.01085714285714, + "grad_norm": 0.08271327614784241, + "learning_rate": 1.6577777777777778e-06, + "loss": 0.3332, + "step": 14890 + }, + { + "epoch": 42.011428571428574, + "grad_norm": 0.0020348222460597754, + "learning_rate": 1.6514285714285715e-06, + "loss": 0.9036, + "step": 14900 + }, + { + "epoch": 42.012, + "grad_norm": 0.020888779312372208, + "learning_rate": 1.6450793650793654e-06, + "loss": 0.0897, + "step": 14910 + }, + { + "epoch": 42.01257142857143, + "grad_norm": 0.030984889715909958, + "learning_rate": 1.6387301587301587e-06, + "loss": 0.5513, + "step": 14920 + }, + { + "epoch": 42.01314285714286, + "grad_norm": 90.73495483398438, + "learning_rate": 1.6323809523809526e-06, + "loss": 0.3941, + "step": 14930 + }, + { + "epoch": 42.013714285714286, + "grad_norm": 0.013540121726691723, + "learning_rate": 1.6260317460317463e-06, + "loss": 0.0006, + "step": 14940 + }, + { + "epoch": 42.01428571428571, + "grad_norm": 0.0985909178853035, + "learning_rate": 1.6196825396825398e-06, + "loss": 0.3027, + "step": 14950 + }, + { + "epoch": 42.014857142857146, + "grad_norm": 0.001703809481114149, + "learning_rate": 1.6133333333333335e-06, + "loss": 0.3457, + "step": 14960 + }, + { + "epoch": 42.01542857142857, + "grad_norm": 0.005111075472086668, + "learning_rate": 1.6069841269841272e-06, + "loss": 0.2664, + "step": 14970 + }, + { + "epoch": 42.016, + "grad_norm": 0.0010470590787008405, + "learning_rate": 1.6006349206349209e-06, + "loss": 0.0113, + "step": 14980 + }, + { + "epoch": 42.01657142857143, + "grad_norm": 0.08761300891637802, + "learning_rate": 1.5942857142857144e-06, + "loss": 0.327, + "step": 14990 + }, + { + "epoch": 42.01714285714286, + "grad_norm": 0.0029850888531655073, + "learning_rate": 1.587936507936508e-06, + "loss": 0.0013, + "step": 15000 + }, + { + "epoch": 42.017714285714284, + "grad_norm": 0.006845708005130291, + "learning_rate": 1.5815873015873017e-06, + "loss": 0.3322, + "step": 15010 + }, + { + "epoch": 42.01828571428572, + "grad_norm": 0.036936238408088684, + "learning_rate": 1.5752380952380952e-06, + "loss": 0.0012, + "step": 15020 + }, + { + "epoch": 42.018857142857144, + "grad_norm": 0.026531461626291275, + "learning_rate": 1.568888888888889e-06, + "loss": 0.6484, + "step": 15030 + }, + { + "epoch": 42.01942857142857, + "grad_norm": 0.04399362578988075, + "learning_rate": 1.5625396825396826e-06, + "loss": 0.4781, + "step": 15040 + }, + { + "epoch": 42.02, + "grad_norm": 0.03520611673593521, + "learning_rate": 1.5561904761904763e-06, + "loss": 0.6253, + "step": 15050 + }, + { + "epoch": 42.02, + "eval_accuracy": 0.8148148148148148, + "eval_loss": 1.1686880588531494, + "eval_runtime": 126.3263, + "eval_samples_per_second": 2.351, + "eval_steps_per_second": 1.179, + "step": 15050 + }, + { + "epoch": 43.000571428571426, + "grad_norm": 0.002064335159957409, + "learning_rate": 1.5498412698412698e-06, + "loss": 0.0016, + "step": 15060 + }, + { + "epoch": 43.00114285714286, + "grad_norm": 833.5392456054688, + "learning_rate": 1.5434920634920635e-06, + "loss": 0.1628, + "step": 15070 + }, + { + "epoch": 43.001714285714286, + "grad_norm": 0.02547420747578144, + "learning_rate": 1.5371428571428574e-06, + "loss": 0.4407, + "step": 15080 + }, + { + "epoch": 43.00228571428571, + "grad_norm": 0.0005154515383765101, + "learning_rate": 1.530793650793651e-06, + "loss": 0.0059, + "step": 15090 + }, + { + "epoch": 43.002857142857145, + "grad_norm": 0.019299926236271858, + "learning_rate": 1.5244444444444446e-06, + "loss": 0.1375, + "step": 15100 + }, + { + "epoch": 43.00342857142857, + "grad_norm": 0.04400021582841873, + "learning_rate": 1.5180952380952383e-06, + "loss": 0.1776, + "step": 15110 + }, + { + "epoch": 43.004, + "grad_norm": 0.37237548828125, + "learning_rate": 1.511746031746032e-06, + "loss": 0.0009, + "step": 15120 + }, + { + "epoch": 43.00457142857143, + "grad_norm": 27.535037994384766, + "learning_rate": 1.5053968253968255e-06, + "loss": 0.302, + "step": 15130 + }, + { + "epoch": 43.00514285714286, + "grad_norm": 0.016599314287304878, + "learning_rate": 1.4990476190476191e-06, + "loss": 0.1959, + "step": 15140 + }, + { + "epoch": 43.005714285714284, + "grad_norm": 0.002258901484310627, + "learning_rate": 1.4926984126984128e-06, + "loss": 0.011, + "step": 15150 + }, + { + "epoch": 43.00628571428572, + "grad_norm": 21.613733291625977, + "learning_rate": 1.4863492063492065e-06, + "loss": 0.2254, + "step": 15160 + }, + { + "epoch": 43.00685714285714, + "grad_norm": 0.004936868790537119, + "learning_rate": 1.48e-06, + "loss": 0.0045, + "step": 15170 + }, + { + "epoch": 43.00742857142857, + "grad_norm": 0.09461677819490433, + "learning_rate": 1.4736507936507937e-06, + "loss": 0.3332, + "step": 15180 + }, + { + "epoch": 43.008, + "grad_norm": 0.001690115430392325, + "learning_rate": 1.4673015873015874e-06, + "loss": 0.6643, + "step": 15190 + }, + { + "epoch": 43.00857142857143, + "grad_norm": 0.018435359001159668, + "learning_rate": 1.4609523809523809e-06, + "loss": 0.001, + "step": 15200 + }, + { + "epoch": 43.009142857142855, + "grad_norm": 30.231048583984375, + "learning_rate": 1.4546031746031746e-06, + "loss": 0.2802, + "step": 15210 + }, + { + "epoch": 43.00971428571429, + "grad_norm": 0.05070001631975174, + "learning_rate": 1.4482539682539685e-06, + "loss": 0.001, + "step": 15220 + }, + { + "epoch": 43.010285714285715, + "grad_norm": 0.026784028857946396, + "learning_rate": 1.4419047619047622e-06, + "loss": 0.4937, + "step": 15230 + }, + { + "epoch": 43.01085714285714, + "grad_norm": 0.002373154740780592, + "learning_rate": 1.4355555555555557e-06, + "loss": 0.0013, + "step": 15240 + }, + { + "epoch": 43.011428571428574, + "grad_norm": 0.09937410801649094, + "learning_rate": 1.4292063492063494e-06, + "loss": 0.2796, + "step": 15250 + }, + { + "epoch": 43.012, + "grad_norm": 0.026784038171172142, + "learning_rate": 1.422857142857143e-06, + "loss": 0.4131, + "step": 15260 + }, + { + "epoch": 43.01257142857143, + "grad_norm": 0.0008246685029007494, + "learning_rate": 1.4165079365079365e-06, + "loss": 0.0036, + "step": 15270 + }, + { + "epoch": 43.01314285714286, + "grad_norm": 0.013181746006011963, + "learning_rate": 1.4101587301587302e-06, + "loss": 0.0016, + "step": 15280 + }, + { + "epoch": 43.013714285714286, + "grad_norm": 0.23996856808662415, + "learning_rate": 1.403809523809524e-06, + "loss": 0.2515, + "step": 15290 + }, + { + "epoch": 43.01428571428571, + "grad_norm": 0.08559610694646835, + "learning_rate": 1.3974603174603176e-06, + "loss": 0.6153, + "step": 15300 + }, + { + "epoch": 43.014857142857146, + "grad_norm": 0.03630613163113594, + "learning_rate": 1.3911111111111111e-06, + "loss": 0.2843, + "step": 15310 + }, + { + "epoch": 43.01542857142857, + "grad_norm": 0.0006657622870989144, + "learning_rate": 1.3847619047619048e-06, + "loss": 0.0004, + "step": 15320 + }, + { + "epoch": 43.016, + "grad_norm": 0.3452169895172119, + "learning_rate": 1.3784126984126985e-06, + "loss": 0.0034, + "step": 15330 + }, + { + "epoch": 43.01657142857143, + "grad_norm": 2.725701332092285, + "learning_rate": 1.372063492063492e-06, + "loss": 0.1951, + "step": 15340 + }, + { + "epoch": 43.01714285714286, + "grad_norm": 0.00891941599547863, + "learning_rate": 1.3657142857142857e-06, + "loss": 0.2923, + "step": 15350 + }, + { + "epoch": 43.017714285714284, + "grad_norm": 0.0006919241859577596, + "learning_rate": 1.3593650793650796e-06, + "loss": 0.5058, + "step": 15360 + }, + { + "epoch": 43.01828571428572, + "grad_norm": 0.04941738769412041, + "learning_rate": 1.3530158730158733e-06, + "loss": 0.0011, + "step": 15370 + }, + { + "epoch": 43.018857142857144, + "grad_norm": 0.11309830099344254, + "learning_rate": 1.3466666666666668e-06, + "loss": 0.0771, + "step": 15380 + }, + { + "epoch": 43.01942857142857, + "grad_norm": 0.019192902371287346, + "learning_rate": 1.3403174603174605e-06, + "loss": 0.4352, + "step": 15390 + }, + { + "epoch": 43.02, + "grad_norm": 0.0010145236738026142, + "learning_rate": 1.3339682539682542e-06, + "loss": 0.0009, + "step": 15400 + }, + { + "epoch": 43.02, + "eval_accuracy": 0.8417508417508418, + "eval_loss": 1.0243438482284546, + "eval_runtime": 126.4767, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 1.178, + "step": 15400 + }, + { + "epoch": 44.000571428571426, + "grad_norm": 0.020503785461187363, + "learning_rate": 1.3276190476190478e-06, + "loss": 0.2084, + "step": 15410 + }, + { + "epoch": 44.00114285714286, + "grad_norm": 0.03143817558884621, + "learning_rate": 1.3212698412698413e-06, + "loss": 0.0007, + "step": 15420 + }, + { + "epoch": 44.001714285714286, + "grad_norm": 0.04019314423203468, + "learning_rate": 1.314920634920635e-06, + "loss": 0.0006, + "step": 15430 + }, + { + "epoch": 44.00228571428571, + "grad_norm": 0.050954557955265045, + "learning_rate": 1.3085714285714287e-06, + "loss": 0.1784, + "step": 15440 + }, + { + "epoch": 44.002857142857145, + "grad_norm": 0.03376142680644989, + "learning_rate": 1.3022222222222222e-06, + "loss": 0.001, + "step": 15450 + }, + { + "epoch": 44.00342857142857, + "grad_norm": 0.012583643198013306, + "learning_rate": 1.295873015873016e-06, + "loss": 0.0007, + "step": 15460 + }, + { + "epoch": 44.004, + "grad_norm": 0.03293371573090553, + "learning_rate": 1.2895238095238096e-06, + "loss": 0.0009, + "step": 15470 + }, + { + "epoch": 44.00457142857143, + "grad_norm": 0.007353159133344889, + "learning_rate": 1.2831746031746035e-06, + "loss": 0.2371, + "step": 15480 + }, + { + "epoch": 44.00514285714286, + "grad_norm": 0.1071564108133316, + "learning_rate": 1.2768253968253968e-06, + "loss": 0.001, + "step": 15490 + }, + { + "epoch": 44.005714285714284, + "grad_norm": 0.03245130181312561, + "learning_rate": 1.2704761904761907e-06, + "loss": 0.0003, + "step": 15500 + }, + { + "epoch": 44.00628571428572, + "grad_norm": 0.0014682890614494681, + "learning_rate": 1.2641269841269844e-06, + "loss": 0.0009, + "step": 15510 + }, + { + "epoch": 44.00685714285714, + "grad_norm": 51.87623596191406, + "learning_rate": 1.2577777777777779e-06, + "loss": 0.1958, + "step": 15520 + }, + { + "epoch": 44.00742857142857, + "grad_norm": 0.010897007770836353, + "learning_rate": 1.2514285714285715e-06, + "loss": 0.0008, + "step": 15530 + }, + { + "epoch": 44.008, + "grad_norm": 0.36192965507507324, + "learning_rate": 1.2450793650793652e-06, + "loss": 0.7262, + "step": 15540 + }, + { + "epoch": 44.00857142857143, + "grad_norm": 0.1297987550497055, + "learning_rate": 1.2387301587301587e-06, + "loss": 0.0005, + "step": 15550 + }, + { + "epoch": 44.009142857142855, + "grad_norm": 0.002931043738499284, + "learning_rate": 1.2323809523809526e-06, + "loss": 0.0008, + "step": 15560 + }, + { + "epoch": 44.00971428571429, + "grad_norm": 0.016998382285237312, + "learning_rate": 1.2260317460317461e-06, + "loss": 0.0034, + "step": 15570 + }, + { + "epoch": 44.010285714285715, + "grad_norm": 0.013441096059978008, + "learning_rate": 1.2196825396825398e-06, + "loss": 0.0003, + "step": 15580 + }, + { + "epoch": 44.01085714285714, + "grad_norm": 0.0027524903416633606, + "learning_rate": 1.2133333333333335e-06, + "loss": 0.001, + "step": 15590 + }, + { + "epoch": 44.011428571428574, + "grad_norm": 0.028492752462625504, + "learning_rate": 1.206984126984127e-06, + "loss": 0.0005, + "step": 15600 + }, + { + "epoch": 44.012, + "grad_norm": 0.011916798539459705, + "learning_rate": 1.2006349206349207e-06, + "loss": 0.342, + "step": 15610 + }, + { + "epoch": 44.01257142857143, + "grad_norm": 0.0014567896723747253, + "learning_rate": 1.1942857142857144e-06, + "loss": 0.0004, + "step": 15620 + }, + { + "epoch": 44.01314285714286, + "grad_norm": 0.029987970367074013, + "learning_rate": 1.187936507936508e-06, + "loss": 0.0193, + "step": 15630 + }, + { + "epoch": 44.013714285714286, + "grad_norm": 0.08593633025884628, + "learning_rate": 1.1815873015873018e-06, + "loss": 0.0006, + "step": 15640 + }, + { + "epoch": 44.01428571428571, + "grad_norm": 0.06858720630407333, + "learning_rate": 1.1752380952380955e-06, + "loss": 0.0006, + "step": 15650 + }, + { + "epoch": 44.014857142857146, + "grad_norm": 0.16518321633338928, + "learning_rate": 1.168888888888889e-06, + "loss": 0.0037, + "step": 15660 + }, + { + "epoch": 44.01542857142857, + "grad_norm": 0.006396264769136906, + "learning_rate": 1.1625396825396826e-06, + "loss": 0.288, + "step": 15670 + }, + { + "epoch": 44.016, + "grad_norm": 0.018989071249961853, + "learning_rate": 1.1561904761904763e-06, + "loss": 0.2859, + "step": 15680 + }, + { + "epoch": 44.01657142857143, + "grad_norm": 0.0026919255033135414, + "learning_rate": 1.1498412698412698e-06, + "loss": 0.0005, + "step": 15690 + }, + { + "epoch": 44.01714285714286, + "grad_norm": 0.07339149713516235, + "learning_rate": 1.1434920634920637e-06, + "loss": 0.6642, + "step": 15700 + }, + { + "epoch": 44.017714285714284, + "grad_norm": 0.07957630604505539, + "learning_rate": 1.1371428571428572e-06, + "loss": 0.0012, + "step": 15710 + }, + { + "epoch": 44.01828571428572, + "grad_norm": 0.0010191805195063353, + "learning_rate": 1.130793650793651e-06, + "loss": 0.001, + "step": 15720 + }, + { + "epoch": 44.018857142857144, + "grad_norm": 0.03606898710131645, + "learning_rate": 1.1244444444444446e-06, + "loss": 0.8374, + "step": 15730 + }, + { + "epoch": 44.01942857142857, + "grad_norm": 0.13875551521778107, + "learning_rate": 1.118095238095238e-06, + "loss": 0.3241, + "step": 15740 + }, + { + "epoch": 44.02, + "grad_norm": 0.023555980995297432, + "learning_rate": 1.1117460317460318e-06, + "loss": 0.0003, + "step": 15750 + }, + { + "epoch": 44.02, + "eval_accuracy": 0.8316498316498316, + "eval_loss": 1.0864453315734863, + "eval_runtime": 126.4265, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 1.179, + "step": 15750 + }, + { + "epoch": 45.000571428571426, + "grad_norm": 0.05806439369916916, + "learning_rate": 1.1053968253968255e-06, + "loss": 0.0007, + "step": 15760 + }, + { + "epoch": 45.00114285714286, + "grad_norm": 0.00057839514920488, + "learning_rate": 1.0990476190476192e-06, + "loss": 0.1975, + "step": 15770 + }, + { + "epoch": 45.001714285714286, + "grad_norm": 23.461442947387695, + "learning_rate": 1.0926984126984129e-06, + "loss": 0.2195, + "step": 15780 + }, + { + "epoch": 45.00228571428571, + "grad_norm": 0.04696512967348099, + "learning_rate": 1.0863492063492066e-06, + "loss": 0.0007, + "step": 15790 + }, + { + "epoch": 45.002857142857145, + "grad_norm": 0.008673385716974735, + "learning_rate": 1.08e-06, + "loss": 0.4171, + "step": 15800 + }, + { + "epoch": 45.00342857142857, + "grad_norm": 0.005190796218812466, + "learning_rate": 1.0736507936507937e-06, + "loss": 0.2958, + "step": 15810 + }, + { + "epoch": 45.004, + "grad_norm": 0.0010960869258269668, + "learning_rate": 1.0673015873015874e-06, + "loss": 0.0008, + "step": 15820 + }, + { + "epoch": 45.00457142857143, + "grad_norm": 0.04663942754268646, + "learning_rate": 1.060952380952381e-06, + "loss": 0.1735, + "step": 15830 + }, + { + "epoch": 45.00514285714286, + "grad_norm": 0.0008961122948676348, + "learning_rate": 1.0546031746031748e-06, + "loss": 0.2856, + "step": 15840 + }, + { + "epoch": 45.005714285714284, + "grad_norm": 0.05738474056124687, + "learning_rate": 1.0482539682539683e-06, + "loss": 0.0014, + "step": 15850 + }, + { + "epoch": 45.00628571428572, + "grad_norm": 0.04210129752755165, + "learning_rate": 1.041904761904762e-06, + "loss": 0.0007, + "step": 15860 + }, + { + "epoch": 45.00685714285714, + "grad_norm": 0.003992922138422728, + "learning_rate": 1.0355555555555557e-06, + "loss": 0.6006, + "step": 15870 + }, + { + "epoch": 45.00742857142857, + "grad_norm": 0.014494026079773903, + "learning_rate": 1.0292063492063494e-06, + "loss": 0.0008, + "step": 15880 + }, + { + "epoch": 45.008, + "grad_norm": 0.09102319180965424, + "learning_rate": 1.0228571428571429e-06, + "loss": 0.1237, + "step": 15890 + }, + { + "epoch": 45.00857142857143, + "grad_norm": 0.02508743479847908, + "learning_rate": 1.0165079365079366e-06, + "loss": 0.2364, + "step": 15900 + }, + { + "epoch": 45.009142857142855, + "grad_norm": 0.002251436933875084, + "learning_rate": 1.0101587301587303e-06, + "loss": 0.4723, + "step": 15910 + }, + { + "epoch": 45.00971428571429, + "grad_norm": 0.007445584516972303, + "learning_rate": 1.003809523809524e-06, + "loss": 0.2701, + "step": 15920 + }, + { + "epoch": 45.010285714285715, + "grad_norm": 0.03148816525936127, + "learning_rate": 9.974603174603176e-07, + "loss": 0.001, + "step": 15930 + }, + { + "epoch": 45.01085714285714, + "grad_norm": 0.0006346903392113745, + "learning_rate": 9.911111111111111e-07, + "loss": 0.2117, + "step": 15940 + }, + { + "epoch": 45.011428571428574, + "grad_norm": 0.0017558577237650752, + "learning_rate": 9.847619047619048e-07, + "loss": 0.0015, + "step": 15950 + }, + { + "epoch": 45.012, + "grad_norm": 0.029404617846012115, + "learning_rate": 9.784126984126985e-07, + "loss": 0.3177, + "step": 15960 + }, + { + "epoch": 45.01257142857143, + "grad_norm": 0.008277475833892822, + "learning_rate": 9.720634920634922e-07, + "loss": 0.0016, + "step": 15970 + }, + { + "epoch": 45.01314285714286, + "grad_norm": 0.002906983019784093, + "learning_rate": 9.657142857142857e-07, + "loss": 0.0123, + "step": 15980 + }, + { + "epoch": 45.013714285714286, + "grad_norm": 0.1758769452571869, + "learning_rate": 9.593650793650794e-07, + "loss": 0.181, + "step": 15990 + }, + { + "epoch": 45.01428571428571, + "grad_norm": 106.2284927368164, + "learning_rate": 9.530158730158731e-07, + "loss": 0.2344, + "step": 16000 + }, + { + "epoch": 45.014857142857146, + "grad_norm": 0.5874964594841003, + "learning_rate": 9.466666666666667e-07, + "loss": 0.0037, + "step": 16010 + }, + { + "epoch": 45.01542857142857, + "grad_norm": 26.586414337158203, + "learning_rate": 9.403174603174605e-07, + "loss": 0.2546, + "step": 16020 + }, + { + "epoch": 45.016, + "grad_norm": 0.08056703954935074, + "learning_rate": 9.339682539682541e-07, + "loss": 0.3617, + "step": 16030 + }, + { + "epoch": 45.01657142857143, + "grad_norm": 0.015912292525172234, + "learning_rate": 9.276190476190478e-07, + "loss": 0.0014, + "step": 16040 + }, + { + "epoch": 45.01714285714286, + "grad_norm": 0.1449512094259262, + "learning_rate": 9.212698412698413e-07, + "loss": 0.0525, + "step": 16050 + }, + { + "epoch": 45.017714285714284, + "grad_norm": 0.06955932825803757, + "learning_rate": 9.149206349206349e-07, + "loss": 0.0011, + "step": 16060 + }, + { + "epoch": 45.01828571428572, + "grad_norm": 0.002916391473263502, + "learning_rate": 9.085714285714286e-07, + "loss": 0.0006, + "step": 16070 + }, + { + "epoch": 45.018857142857144, + "grad_norm": 0.006070619914680719, + "learning_rate": 9.022222222222222e-07, + "loss": 0.0072, + "step": 16080 + }, + { + "epoch": 45.01942857142857, + "grad_norm": 461.6020202636719, + "learning_rate": 8.95873015873016e-07, + "loss": 0.0531, + "step": 16090 + }, + { + "epoch": 45.02, + "grad_norm": 0.0010222607525065541, + "learning_rate": 8.895238095238096e-07, + "loss": 0.291, + "step": 16100 + }, + { + "epoch": 45.02, + "eval_accuracy": 0.8383838383838383, + "eval_loss": 1.0647377967834473, + "eval_runtime": 126.4165, + "eval_samples_per_second": 2.349, + "eval_steps_per_second": 1.179, + "step": 16100 + }, + { + "epoch": 46.000571428571426, + "grad_norm": 0.047850172966718674, + "learning_rate": 8.831746031746033e-07, + "loss": 0.0005, + "step": 16110 + }, + { + "epoch": 46.00114285714286, + "grad_norm": 0.11881715804338455, + "learning_rate": 8.768253968253969e-07, + "loss": 0.0008, + "step": 16120 + }, + { + "epoch": 46.001714285714286, + "grad_norm": 1.4675657749176025, + "learning_rate": 8.704761904761906e-07, + "loss": 0.0016, + "step": 16130 + }, + { + "epoch": 46.00228571428571, + "grad_norm": 0.005048373248428106, + "learning_rate": 8.641269841269842e-07, + "loss": 0.0005, + "step": 16140 + }, + { + "epoch": 46.002857142857145, + "grad_norm": 0.008005023933947086, + "learning_rate": 8.577777777777778e-07, + "loss": 0.6526, + "step": 16150 + }, + { + "epoch": 46.00342857142857, + "grad_norm": 0.24868465960025787, + "learning_rate": 8.514285714285716e-07, + "loss": 0.0015, + "step": 16160 + }, + { + "epoch": 46.004, + "grad_norm": 0.000758437963668257, + "learning_rate": 8.450793650793652e-07, + "loss": 0.2208, + "step": 16170 + }, + { + "epoch": 46.00457142857143, + "grad_norm": 0.001206890563480556, + "learning_rate": 8.387301587301588e-07, + "loss": 0.2555, + "step": 16180 + }, + { + "epoch": 46.00514285714286, + "grad_norm": 0.08712995797395706, + "learning_rate": 8.323809523809524e-07, + "loss": 0.2027, + "step": 16190 + }, + { + "epoch": 46.005714285714284, + "grad_norm": 0.021634329110383987, + "learning_rate": 8.260317460317461e-07, + "loss": 0.0004, + "step": 16200 + }, + { + "epoch": 46.00628571428572, + "grad_norm": 0.017216574400663376, + "learning_rate": 8.196825396825397e-07, + "loss": 0.2594, + "step": 16210 + }, + { + "epoch": 46.00685714285714, + "grad_norm": 0.03562033176422119, + "learning_rate": 8.133333333333333e-07, + "loss": 0.0008, + "step": 16220 + }, + { + "epoch": 46.00742857142857, + "grad_norm": 0.06330721825361252, + "learning_rate": 8.069841269841271e-07, + "loss": 0.3, + "step": 16230 + }, + { + "epoch": 46.008, + "grad_norm": 0.006224165204912424, + "learning_rate": 8.006349206349207e-07, + "loss": 0.0629, + "step": 16240 + }, + { + "epoch": 46.00857142857143, + "grad_norm": 34.4676513671875, + "learning_rate": 7.942857142857144e-07, + "loss": 0.8066, + "step": 16250 + }, + { + "epoch": 46.009142857142855, + "grad_norm": 0.00887818168848753, + "learning_rate": 7.87936507936508e-07, + "loss": 0.0873, + "step": 16260 + }, + { + "epoch": 46.00971428571429, + "grad_norm": 0.02603054791688919, + "learning_rate": 7.815873015873017e-07, + "loss": 0.0004, + "step": 16270 + }, + { + "epoch": 46.010285714285715, + "grad_norm": 0.11655969172716141, + "learning_rate": 7.752380952380953e-07, + "loss": 0.0017, + "step": 16280 + }, + { + "epoch": 46.01085714285714, + "grad_norm": 0.09002801775932312, + "learning_rate": 7.688888888888891e-07, + "loss": 0.2952, + "step": 16290 + }, + { + "epoch": 46.011428571428574, + "grad_norm": 0.06367355585098267, + "learning_rate": 7.625396825396827e-07, + "loss": 0.2261, + "step": 16300 + }, + { + "epoch": 46.012, + "grad_norm": 0.002869822084903717, + "learning_rate": 7.561904761904762e-07, + "loss": 0.004, + "step": 16310 + }, + { + "epoch": 46.01257142857143, + "grad_norm": 0.01801781728863716, + "learning_rate": 7.498412698412699e-07, + "loss": 0.0004, + "step": 16320 + }, + { + "epoch": 46.01314285714286, + "grad_norm": 0.034441664814949036, + "learning_rate": 7.434920634920635e-07, + "loss": 0.3214, + "step": 16330 + }, + { + "epoch": 46.013714285714286, + "grad_norm": 0.030220109969377518, + "learning_rate": 7.371428571428572e-07, + "loss": 0.1798, + "step": 16340 + }, + { + "epoch": 46.01428571428571, + "grad_norm": 0.09933434426784515, + "learning_rate": 7.307936507936508e-07, + "loss": 0.0004, + "step": 16350 + }, + { + "epoch": 46.014857142857146, + "grad_norm": 0.06845889985561371, + "learning_rate": 7.244444444444446e-07, + "loss": 0.3171, + "step": 16360 + }, + { + "epoch": 46.01542857142857, + "grad_norm": 0.0007817966397851706, + "learning_rate": 7.180952380952382e-07, + "loss": 0.3191, + "step": 16370 + }, + { + "epoch": 46.016, + "grad_norm": 19.301998138427734, + "learning_rate": 7.117460317460318e-07, + "loss": 0.4349, + "step": 16380 + }, + { + "epoch": 46.01657142857143, + "grad_norm": 0.012864407151937485, + "learning_rate": 7.053968253968255e-07, + "loss": 0.2711, + "step": 16390 + }, + { + "epoch": 46.01714285714286, + "grad_norm": 0.003735810751095414, + "learning_rate": 6.990476190476191e-07, + "loss": 0.2612, + "step": 16400 + }, + { + "epoch": 46.017714285714284, + "grad_norm": 0.7684434652328491, + "learning_rate": 6.926984126984128e-07, + "loss": 0.2843, + "step": 16410 + }, + { + "epoch": 46.01828571428572, + "grad_norm": 0.034190673381090164, + "learning_rate": 6.863492063492064e-07, + "loss": 0.3447, + "step": 16420 + }, + { + "epoch": 46.018857142857144, + "grad_norm": 0.02056187205016613, + "learning_rate": 6.800000000000001e-07, + "loss": 0.3328, + "step": 16430 + }, + { + "epoch": 46.01942857142857, + "grad_norm": 0.0970858708024025, + "learning_rate": 6.736507936507936e-07, + "loss": 0.753, + "step": 16440 + }, + { + "epoch": 46.02, + "grad_norm": 0.03131880611181259, + "learning_rate": 6.673015873015874e-07, + "loss": 0.4962, + "step": 16450 + }, + { + "epoch": 46.02, + "eval_accuracy": 0.8316498316498316, + "eval_loss": 1.116576910018921, + "eval_runtime": 126.7393, + "eval_samples_per_second": 2.343, + "eval_steps_per_second": 1.176, + "step": 16450 + }, + { + "epoch": 47.000571428571426, + "grad_norm": 0.04706917330622673, + "learning_rate": 6.60952380952381e-07, + "loss": 0.4978, + "step": 16460 + }, + { + "epoch": 47.00114285714286, + "grad_norm": 0.0006357289967127144, + "learning_rate": 6.546031746031746e-07, + "loss": 0.3942, + "step": 16470 + }, + { + "epoch": 47.001714285714286, + "grad_norm": 0.02852693945169449, + "learning_rate": 6.482539682539683e-07, + "loss": 0.0281, + "step": 16480 + }, + { + "epoch": 47.00228571428571, + "grad_norm": 0.01799050346016884, + "learning_rate": 6.419047619047619e-07, + "loss": 0.3552, + "step": 16490 + }, + { + "epoch": 47.002857142857145, + "grad_norm": 0.0006481860764324665, + "learning_rate": 6.355555555555556e-07, + "loss": 0.001, + "step": 16500 + }, + { + "epoch": 47.00342857142857, + "grad_norm": 0.20090606808662415, + "learning_rate": 6.292063492063492e-07, + "loss": 0.2303, + "step": 16510 + }, + { + "epoch": 47.004, + "grad_norm": 0.0003543874772731215, + "learning_rate": 6.228571428571429e-07, + "loss": 0.1126, + "step": 16520 + }, + { + "epoch": 47.00457142857143, + "grad_norm": 0.0005535692907869816, + "learning_rate": 6.165079365079366e-07, + "loss": 0.0008, + "step": 16530 + }, + { + "epoch": 47.00514285714286, + "grad_norm": 0.0009550791000947356, + "learning_rate": 6.101587301587302e-07, + "loss": 0.0005, + "step": 16540 + }, + { + "epoch": 47.005714285714284, + "grad_norm": 0.021146543323993683, + "learning_rate": 6.038095238095239e-07, + "loss": 0.001, + "step": 16550 + }, + { + "epoch": 47.00628571428572, + "grad_norm": 0.007506783120334148, + "learning_rate": 5.974603174603176e-07, + "loss": 0.3273, + "step": 16560 + }, + { + "epoch": 47.00685714285714, + "grad_norm": 0.008311262354254723, + "learning_rate": 5.911111111111111e-07, + "loss": 0.0786, + "step": 16570 + }, + { + "epoch": 47.00742857142857, + "grad_norm": 0.7662389278411865, + "learning_rate": 5.847619047619047e-07, + "loss": 0.0013, + "step": 16580 + }, + { + "epoch": 47.008, + "grad_norm": 0.0042674667201936245, + "learning_rate": 5.784126984126984e-07, + "loss": 0.0208, + "step": 16590 + }, + { + "epoch": 47.00857142857143, + "grad_norm": 0.0019062272040173411, + "learning_rate": 5.720634920634921e-07, + "loss": 0.3439, + "step": 16600 + }, + { + "epoch": 47.009142857142855, + "grad_norm": 0.12979654967784882, + "learning_rate": 5.657142857142857e-07, + "loss": 0.0246, + "step": 16610 + }, + { + "epoch": 47.00971428571429, + "grad_norm": 196.23326110839844, + "learning_rate": 5.593650793650794e-07, + "loss": 1.4075, + "step": 16620 + }, + { + "epoch": 47.010285714285715, + "grad_norm": 0.10944508761167526, + "learning_rate": 5.530158730158731e-07, + "loss": 0.001, + "step": 16630 + }, + { + "epoch": 47.01085714285714, + "grad_norm": 0.17711256444454193, + "learning_rate": 5.466666666666667e-07, + "loss": 0.0017, + "step": 16640 + }, + { + "epoch": 47.011428571428574, + "grad_norm": 0.026470154523849487, + "learning_rate": 5.403174603174604e-07, + "loss": 0.5992, + "step": 16650 + }, + { + "epoch": 47.012, + "grad_norm": 0.04885806888341904, + "learning_rate": 5.33968253968254e-07, + "loss": 0.0008, + "step": 16660 + }, + { + "epoch": 47.01257142857143, + "grad_norm": 0.1748293787240982, + "learning_rate": 5.276190476190477e-07, + "loss": 0.0011, + "step": 16670 + }, + { + "epoch": 47.01314285714286, + "grad_norm": 0.028242330998182297, + "learning_rate": 5.212698412698413e-07, + "loss": 0.2365, + "step": 16680 + }, + { + "epoch": 47.013714285714286, + "grad_norm": 499.1086120605469, + "learning_rate": 5.14920634920635e-07, + "loss": 0.2298, + "step": 16690 + }, + { + "epoch": 47.01428571428571, + "grad_norm": 0.029000846669077873, + "learning_rate": 5.085714285714286e-07, + "loss": 0.0006, + "step": 16700 + }, + { + "epoch": 47.014857142857146, + "grad_norm": 0.018016472458839417, + "learning_rate": 5.022222222222222e-07, + "loss": 0.1712, + "step": 16710 + }, + { + "epoch": 47.01542857142857, + "grad_norm": 0.9705002903938293, + "learning_rate": 4.958730158730159e-07, + "loss": 0.1886, + "step": 16720 + }, + { + "epoch": 47.016, + "grad_norm": 0.011911354027688503, + "learning_rate": 4.895238095238096e-07, + "loss": 0.0004, + "step": 16730 + }, + { + "epoch": 47.01657142857143, + "grad_norm": 0.008706871420145035, + "learning_rate": 4.831746031746032e-07, + "loss": 0.0008, + "step": 16740 + }, + { + "epoch": 47.01714285714286, + "grad_norm": 0.038937151432037354, + "learning_rate": 4.7682539682539686e-07, + "loss": 0.0004, + "step": 16750 + }, + { + "epoch": 47.017714285714284, + "grad_norm": 0.006863133050501347, + "learning_rate": 4.704761904761905e-07, + "loss": 0.1916, + "step": 16760 + }, + { + "epoch": 47.01828571428572, + "grad_norm": 34.81043243408203, + "learning_rate": 4.6412698412698414e-07, + "loss": 0.1632, + "step": 16770 + }, + { + "epoch": 47.018857142857144, + "grad_norm": 0.004815808031708002, + "learning_rate": 4.5777777777777784e-07, + "loss": 0.3468, + "step": 16780 + }, + { + "epoch": 47.01942857142857, + "grad_norm": 0.05895291268825531, + "learning_rate": 4.514285714285715e-07, + "loss": 0.2842, + "step": 16790 + }, + { + "epoch": 47.02, + "grad_norm": 0.0008096261299215257, + "learning_rate": 4.450793650793651e-07, + "loss": 0.0919, + "step": 16800 + }, + { + "epoch": 47.02, + "eval_accuracy": 0.8282828282828283, + "eval_loss": 1.1209441423416138, + "eval_runtime": 126.6719, + "eval_samples_per_second": 2.345, + "eval_steps_per_second": 1.176, + "step": 16800 + }, + { + "epoch": 48.000571428571426, + "grad_norm": 174.62635803222656, + "learning_rate": 4.387301587301588e-07, + "loss": 0.008, + "step": 16810 + }, + { + "epoch": 48.00114285714286, + "grad_norm": 0.04038200154900551, + "learning_rate": 4.323809523809524e-07, + "loss": 0.2735, + "step": 16820 + }, + { + "epoch": 48.001714285714286, + "grad_norm": 0.012884361669421196, + "learning_rate": 4.2603174603174605e-07, + "loss": 0.2273, + "step": 16830 + }, + { + "epoch": 48.00228571428571, + "grad_norm": 0.11547985672950745, + "learning_rate": 4.196825396825397e-07, + "loss": 0.2345, + "step": 16840 + }, + { + "epoch": 48.002857142857145, + "grad_norm": 0.34670960903167725, + "learning_rate": 4.133333333333334e-07, + "loss": 0.0018, + "step": 16850 + }, + { + "epoch": 48.00342857142857, + "grad_norm": 0.006787777412682772, + "learning_rate": 4.06984126984127e-07, + "loss": 0.1207, + "step": 16860 + }, + { + "epoch": 48.004, + "grad_norm": 0.0013283508596941829, + "learning_rate": 4.0063492063492067e-07, + "loss": 0.0036, + "step": 16870 + }, + { + "epoch": 48.00457142857143, + "grad_norm": 0.0029995145741850138, + "learning_rate": 3.9428571428571436e-07, + "loss": 0.0003, + "step": 16880 + }, + { + "epoch": 48.00514285714286, + "grad_norm": 0.09618420153856277, + "learning_rate": 3.87936507936508e-07, + "loss": 0.0005, + "step": 16890 + }, + { + "epoch": 48.005714285714284, + "grad_norm": 0.033103521913290024, + "learning_rate": 3.815873015873016e-07, + "loss": 0.0123, + "step": 16900 + }, + { + "epoch": 48.00628571428572, + "grad_norm": 0.0023510020691901445, + "learning_rate": 3.7523809523809523e-07, + "loss": 0.0004, + "step": 16910 + }, + { + "epoch": 48.00685714285714, + "grad_norm": 0.02988676354289055, + "learning_rate": 3.6888888888888893e-07, + "loss": 0.4017, + "step": 16920 + }, + { + "epoch": 48.00742857142857, + "grad_norm": 0.014092681929469109, + "learning_rate": 3.6253968253968257e-07, + "loss": 0.295, + "step": 16930 + }, + { + "epoch": 48.008, + "grad_norm": 0.014224675484001637, + "learning_rate": 3.561904761904762e-07, + "loss": 0.3255, + "step": 16940 + }, + { + "epoch": 48.00857142857143, + "grad_norm": 0.09868843853473663, + "learning_rate": 3.498412698412699e-07, + "loss": 0.2179, + "step": 16950 + }, + { + "epoch": 48.009142857142855, + "grad_norm": 0.0006300527020357549, + "learning_rate": 3.4349206349206355e-07, + "loss": 0.094, + "step": 16960 + }, + { + "epoch": 48.00971428571429, + "grad_norm": 0.0006618179613724351, + "learning_rate": 3.371428571428572e-07, + "loss": 0.0007, + "step": 16970 + }, + { + "epoch": 48.010285714285715, + "grad_norm": 0.12803080677986145, + "learning_rate": 3.307936507936508e-07, + "loss": 0.3201, + "step": 16980 + }, + { + "epoch": 48.01085714285714, + "grad_norm": 0.033108729869127274, + "learning_rate": 3.2444444444444447e-07, + "loss": 0.2339, + "step": 16990 + }, + { + "epoch": 48.011428571428574, + "grad_norm": 0.10151008516550064, + "learning_rate": 3.180952380952381e-07, + "loss": 0.0004, + "step": 17000 + }, + { + "epoch": 48.012, + "grad_norm": 0.015769364312291145, + "learning_rate": 3.1174603174603176e-07, + "loss": 0.3187, + "step": 17010 + }, + { + "epoch": 48.01257142857143, + "grad_norm": 0.009230137802660465, + "learning_rate": 3.0539682539682545e-07, + "loss": 0.4486, + "step": 17020 + }, + { + "epoch": 48.01314285714286, + "grad_norm": 0.0004300758882891387, + "learning_rate": 2.990476190476191e-07, + "loss": 0.049, + "step": 17030 + }, + { + "epoch": 48.013714285714286, + "grad_norm": 0.006301034241914749, + "learning_rate": 2.9269841269841274e-07, + "loss": 0.0007, + "step": 17040 + }, + { + "epoch": 48.01428571428571, + "grad_norm": 0.009746396914124489, + "learning_rate": 2.863492063492064e-07, + "loss": 0.5487, + "step": 17050 + }, + { + "epoch": 48.014857142857146, + "grad_norm": 0.009282475337386131, + "learning_rate": 2.8e-07, + "loss": 0.0008, + "step": 17060 + }, + { + "epoch": 48.01542857142857, + "grad_norm": 0.002887505106627941, + "learning_rate": 2.736507936507937e-07, + "loss": 0.0004, + "step": 17070 + }, + { + "epoch": 48.016, + "grad_norm": 0.025261027738451958, + "learning_rate": 2.673015873015873e-07, + "loss": 0.0006, + "step": 17080 + }, + { + "epoch": 48.01657142857143, + "grad_norm": 0.016581503674387932, + "learning_rate": 2.6095238095238094e-07, + "loss": 0.5268, + "step": 17090 + }, + { + "epoch": 48.01714285714286, + "grad_norm": 0.03690231218934059, + "learning_rate": 2.5460317460317464e-07, + "loss": 0.0583, + "step": 17100 + }, + { + "epoch": 48.017714285714284, + "grad_norm": 0.16912665963172913, + "learning_rate": 2.482539682539683e-07, + "loss": 0.0008, + "step": 17110 + }, + { + "epoch": 48.01828571428572, + "grad_norm": 0.01130374800413847, + "learning_rate": 2.419047619047619e-07, + "loss": 0.0004, + "step": 17120 + }, + { + "epoch": 48.018857142857144, + "grad_norm": 0.3978990316390991, + "learning_rate": 2.3555555555555556e-07, + "loss": 0.001, + "step": 17130 + }, + { + "epoch": 48.01942857142857, + "grad_norm": 0.003729065880179405, + "learning_rate": 2.2920634920634923e-07, + "loss": 0.6757, + "step": 17140 + }, + { + "epoch": 48.02, + "grad_norm": 0.0032000578939914703, + "learning_rate": 2.228571428571429e-07, + "loss": 0.0007, + "step": 17150 + }, + { + "epoch": 48.02, + "eval_accuracy": 0.8316498316498316, + "eval_loss": 1.1259746551513672, + "eval_runtime": 126.1783, + "eval_samples_per_second": 2.354, + "eval_steps_per_second": 1.181, + "step": 17150 + }, + { + "epoch": 49.000571428571426, + "grad_norm": 0.021503791213035583, + "learning_rate": 2.1650793650793652e-07, + "loss": 0.188, + "step": 17160 + }, + { + "epoch": 49.00114285714286, + "grad_norm": 0.009531227871775627, + "learning_rate": 2.1015873015873019e-07, + "loss": 0.0006, + "step": 17170 + }, + { + "epoch": 49.001714285714286, + "grad_norm": 0.02791694551706314, + "learning_rate": 2.0380952380952383e-07, + "loss": 0.0036, + "step": 17180 + }, + { + "epoch": 49.00228571428571, + "grad_norm": 0.014732033014297485, + "learning_rate": 1.974603174603175e-07, + "loss": 0.4474, + "step": 17190 + }, + { + "epoch": 49.002857142857145, + "grad_norm": 0.0033705858513712883, + "learning_rate": 1.911111111111111e-07, + "loss": 0.001, + "step": 17200 + }, + { + "epoch": 49.00342857142857, + "grad_norm": 0.076132632791996, + "learning_rate": 1.8476190476190478e-07, + "loss": 0.5201, + "step": 17210 + }, + { + "epoch": 49.004, + "grad_norm": 0.008379080332815647, + "learning_rate": 1.7841269841269842e-07, + "loss": 0.0005, + "step": 17220 + }, + { + "epoch": 49.00457142857143, + "grad_norm": 0.014781179837882519, + "learning_rate": 1.720634920634921e-07, + "loss": 0.2197, + "step": 17230 + }, + { + "epoch": 49.00514285714286, + "grad_norm": 0.03410143777728081, + "learning_rate": 1.657142857142857e-07, + "loss": 0.001, + "step": 17240 + }, + { + "epoch": 49.005714285714284, + "grad_norm": 0.0006565088406205177, + "learning_rate": 1.5936507936507937e-07, + "loss": 0.0015, + "step": 17250 + }, + { + "epoch": 49.00628571428572, + "grad_norm": 0.014354717917740345, + "learning_rate": 1.5301587301587304e-07, + "loss": 0.5642, + "step": 17260 + }, + { + "epoch": 49.00685714285714, + "grad_norm": 0.09152337163686752, + "learning_rate": 1.4666666666666668e-07, + "loss": 0.276, + "step": 17270 + }, + { + "epoch": 49.00742857142857, + "grad_norm": 0.03440091758966446, + "learning_rate": 1.4031746031746032e-07, + "loss": 0.0006, + "step": 17280 + }, + { + "epoch": 49.008, + "grad_norm": 0.004568720702081919, + "learning_rate": 1.3396825396825397e-07, + "loss": 0.0073, + "step": 17290 + }, + { + "epoch": 49.00857142857143, + "grad_norm": 0.021223975345492363, + "learning_rate": 1.2761904761904763e-07, + "loss": 0.0004, + "step": 17300 + }, + { + "epoch": 49.009142857142855, + "grad_norm": 0.007963555864989758, + "learning_rate": 1.2126984126984128e-07, + "loss": 0.002, + "step": 17310 + }, + { + "epoch": 49.00971428571429, + "grad_norm": 0.0007649322506040335, + "learning_rate": 1.1492063492063493e-07, + "loss": 0.3511, + "step": 17320 + }, + { + "epoch": 49.010285714285715, + "grad_norm": 0.057293448597192764, + "learning_rate": 1.0857142857142857e-07, + "loss": 0.3396, + "step": 17330 + }, + { + "epoch": 49.01085714285714, + "grad_norm": 0.006769323721528053, + "learning_rate": 1.0222222222222224e-07, + "loss": 0.0022, + "step": 17340 + }, + { + "epoch": 49.011428571428574, + "grad_norm": 0.0010334831895306706, + "learning_rate": 9.587301587301588e-08, + "loss": 0.2595, + "step": 17350 + }, + { + "epoch": 49.012, + "grad_norm": 0.015894345939159393, + "learning_rate": 8.952380952380954e-08, + "loss": 0.2016, + "step": 17360 + }, + { + "epoch": 49.01257142857143, + "grad_norm": 0.013702361844480038, + "learning_rate": 8.317460317460318e-08, + "loss": 0.0008, + "step": 17370 + }, + { + "epoch": 49.01314285714286, + "grad_norm": 0.007162360940128565, + "learning_rate": 7.682539682539682e-08, + "loss": 0.2925, + "step": 17380 + }, + { + "epoch": 49.013714285714286, + "grad_norm": 0.009392665699124336, + "learning_rate": 7.047619047619048e-08, + "loss": 0.007, + "step": 17390 + }, + { + "epoch": 49.01428571428571, + "grad_norm": 5.373441219329834, + "learning_rate": 6.412698412698413e-08, + "loss": 0.2043, + "step": 17400 + }, + { + "epoch": 49.014857142857146, + "grad_norm": 0.020008977502584457, + "learning_rate": 5.777777777777778e-08, + "loss": 0.0742, + "step": 17410 + }, + { + "epoch": 49.01542857142857, + "grad_norm": 0.04810080677270889, + "learning_rate": 5.142857142857143e-08, + "loss": 0.0005, + "step": 17420 + }, + { + "epoch": 49.016, + "grad_norm": 0.010244383476674557, + "learning_rate": 4.507936507936508e-08, + "loss": 0.4077, + "step": 17430 + }, + { + "epoch": 49.01657142857143, + "grad_norm": 0.006721619050949812, + "learning_rate": 3.873015873015873e-08, + "loss": 0.8587, + "step": 17440 + }, + { + "epoch": 49.01714285714286, + "grad_norm": 0.0023023684043437243, + "learning_rate": 3.238095238095239e-08, + "loss": 0.0006, + "step": 17450 + }, + { + "epoch": 49.017714285714284, + "grad_norm": 0.19224895536899567, + "learning_rate": 2.6031746031746037e-08, + "loss": 0.0009, + "step": 17460 + }, + { + "epoch": 49.01828571428572, + "grad_norm": 0.04239881783723831, + "learning_rate": 1.9682539682539685e-08, + "loss": 0.0007, + "step": 17470 + }, + { + "epoch": 49.018857142857144, + "grad_norm": 0.0015393183566629887, + "learning_rate": 1.3333333333333334e-08, + "loss": 0.2342, + "step": 17480 + }, + { + "epoch": 49.01942857142857, + "grad_norm": 0.05108032375574112, + "learning_rate": 6.9841269841269845e-09, + "loss": 0.187, + "step": 17490 + }, + { + "epoch": 49.02, + "grad_norm": 0.0034502753987908363, + "learning_rate": 6.34920634920635e-10, + "loss": 0.0008, + "step": 17500 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.835016835016835, + "eval_loss": 1.1138967275619507, + "eval_runtime": 126.4744, + "eval_samples_per_second": 2.348, + "eval_steps_per_second": 1.178, + "step": 17500 + }, + { + "epoch": 49.02, + "step": 17500, + "total_flos": 1.5368592103538688e+20, + "train_loss": 0.39184775381689624, + "train_runtime": 51190.6894, + "train_samples_per_second": 0.684, + "train_steps_per_second": 0.342 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.8585858585858586, + "eval_loss": 0.7987341284751892, + "eval_runtime": 125.2938, + "eval_samples_per_second": 2.37, + "eval_steps_per_second": 1.189, + "step": 17500 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.8585858585858586, + "eval_loss": 0.798734188079834, + "eval_runtime": 125.2382, + "eval_samples_per_second": 2.371, + "eval_steps_per_second": 1.19, + "step": 17500 + } + ], + "logging_steps": 10, + "max_steps": 17500, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.5368592103538688e+20, + "train_batch_size": 2, "trial_name": null, "trial_params": null }