{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1715, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.029154518950437316, "grad_norm": 6.396158218383789, "learning_rate": 4.970845481049563e-05, "loss": 3.6944, "step": 10 }, { "epoch": 0.05830903790087463, "grad_norm": 5.8870744705200195, "learning_rate": 4.941690962099126e-05, "loss": 3.4646, "step": 20 }, { "epoch": 0.08746355685131195, "grad_norm": 8.024749755859375, "learning_rate": 4.912536443148688e-05, "loss": 3.2603, "step": 30 }, { "epoch": 0.11661807580174927, "grad_norm": 12.580345153808594, "learning_rate": 4.883381924198251e-05, "loss": 2.9375, "step": 40 }, { "epoch": 0.1457725947521866, "grad_norm": 13.44204330444336, "learning_rate": 4.8542274052478136e-05, "loss": 2.663, "step": 50 }, { "epoch": 0.1749271137026239, "grad_norm": 13.08288288116455, "learning_rate": 4.825072886297377e-05, "loss": 2.4086, "step": 60 }, { "epoch": 0.20408163265306123, "grad_norm": 14.201903343200684, "learning_rate": 4.795918367346939e-05, "loss": 2.0242, "step": 70 }, { "epoch": 0.23323615160349853, "grad_norm": 14.91315746307373, "learning_rate": 4.7667638483965015e-05, "loss": 1.8438, "step": 80 }, { "epoch": 0.26239067055393583, "grad_norm": 14.440930366516113, "learning_rate": 4.7376093294460646e-05, "loss": 1.7379, "step": 90 }, { "epoch": 0.2915451895043732, "grad_norm": 14.924482345581055, "learning_rate": 4.708454810495627e-05, "loss": 1.3515, "step": 100 }, { "epoch": 0.3206997084548105, "grad_norm": 15.159282684326172, "learning_rate": 4.6793002915451894e-05, "loss": 1.4493, "step": 110 }, { "epoch": 0.3498542274052478, "grad_norm": 15.176348686218262, "learning_rate": 4.6501457725947525e-05, "loss": 1.3174, "step": 120 }, { "epoch": 0.37900874635568516, "grad_norm": 15.329106330871582, "learning_rate": 4.620991253644315e-05, "loss": 1.3243, "step": 130 }, { "epoch": 0.40816326530612246, "grad_norm": 13.873916625976562, "learning_rate": 4.591836734693878e-05, "loss": 1.0203, "step": 140 }, { "epoch": 0.43731778425655976, "grad_norm": 11.680057525634766, "learning_rate": 4.5626822157434404e-05, "loss": 1.2277, "step": 150 }, { "epoch": 0.46647230320699706, "grad_norm": 17.877845764160156, "learning_rate": 4.533527696793003e-05, "loss": 0.9823, "step": 160 }, { "epoch": 0.4956268221574344, "grad_norm": 12.79694938659668, "learning_rate": 4.504373177842566e-05, "loss": 1.1087, "step": 170 }, { "epoch": 0.5247813411078717, "grad_norm": 15.178705215454102, "learning_rate": 4.475218658892128e-05, "loss": 0.9267, "step": 180 }, { "epoch": 0.5539358600583091, "grad_norm": 19.371179580688477, "learning_rate": 4.4460641399416914e-05, "loss": 1.1024, "step": 190 }, { "epoch": 0.5830903790087464, "grad_norm": 12.616296768188477, "learning_rate": 4.416909620991254e-05, "loss": 0.9355, "step": 200 }, { "epoch": 0.6122448979591837, "grad_norm": 15.549710273742676, "learning_rate": 4.387755102040816e-05, "loss": 0.8048, "step": 210 }, { "epoch": 0.641399416909621, "grad_norm": 15.638792037963867, "learning_rate": 4.358600583090379e-05, "loss": 0.8175, "step": 220 }, { "epoch": 0.6705539358600583, "grad_norm": 12.656390190124512, "learning_rate": 4.3294460641399424e-05, "loss": 1.0265, "step": 230 }, { "epoch": 0.6997084548104956, "grad_norm": 12.056506156921387, "learning_rate": 4.300291545189505e-05, "loss": 0.8383, "step": 240 }, { "epoch": 0.7288629737609329, "grad_norm": 13.179421424865723, "learning_rate": 4.271137026239067e-05, "loss": 0.882, "step": 250 }, { "epoch": 0.7580174927113703, "grad_norm": 11.642322540283203, "learning_rate": 4.2419825072886296e-05, "loss": 0.9375, "step": 260 }, { "epoch": 0.7871720116618076, "grad_norm": 14.77869987487793, "learning_rate": 4.212827988338193e-05, "loss": 0.721, "step": 270 }, { "epoch": 0.8163265306122449, "grad_norm": 12.768646240234375, "learning_rate": 4.183673469387756e-05, "loss": 0.7863, "step": 280 }, { "epoch": 0.8454810495626822, "grad_norm": 17.440149307250977, "learning_rate": 4.1545189504373175e-05, "loss": 0.7854, "step": 290 }, { "epoch": 0.8746355685131195, "grad_norm": 14.728607177734375, "learning_rate": 4.1253644314868806e-05, "loss": 0.6795, "step": 300 }, { "epoch": 0.9037900874635568, "grad_norm": 13.96823787689209, "learning_rate": 4.0962099125364436e-05, "loss": 0.8244, "step": 310 }, { "epoch": 0.9329446064139941, "grad_norm": 14.88004207611084, "learning_rate": 4.067055393586006e-05, "loss": 0.8082, "step": 320 }, { "epoch": 0.9620991253644315, "grad_norm": 12.001025199890137, "learning_rate": 4.0379008746355685e-05, "loss": 0.7094, "step": 330 }, { "epoch": 0.9912536443148688, "grad_norm": 20.970056533813477, "learning_rate": 4.0087463556851315e-05, "loss": 0.8157, "step": 340 }, { "epoch": 1.0, "eval_loss": 0.7410290837287903, "eval_runtime": 346.7173, "eval_samples_per_second": 7.888, "eval_steps_per_second": 0.248, "step": 343 }, { "epoch": 1.0204081632653061, "grad_norm": 8.203807830810547, "learning_rate": 3.979591836734694e-05, "loss": 0.4442, "step": 350 }, { "epoch": 1.0495626822157433, "grad_norm": 9.512131690979004, "learning_rate": 3.950437317784257e-05, "loss": 0.4501, "step": 360 }, { "epoch": 1.0787172011661808, "grad_norm": 11.348304748535156, "learning_rate": 3.9212827988338194e-05, "loss": 0.5578, "step": 370 }, { "epoch": 1.1078717201166182, "grad_norm": 16.04487419128418, "learning_rate": 3.892128279883382e-05, "loss": 0.5277, "step": 380 }, { "epoch": 1.1370262390670554, "grad_norm": 11.712241172790527, "learning_rate": 3.862973760932945e-05, "loss": 0.4313, "step": 390 }, { "epoch": 1.1661807580174928, "grad_norm": 10.177406311035156, "learning_rate": 3.833819241982507e-05, "loss": 0.4427, "step": 400 }, { "epoch": 1.19533527696793, "grad_norm": 10.864590644836426, "learning_rate": 3.8046647230320704e-05, "loss": 0.4545, "step": 410 }, { "epoch": 1.2244897959183674, "grad_norm": 27.668434143066406, "learning_rate": 3.775510204081633e-05, "loss": 0.3945, "step": 420 }, { "epoch": 1.2536443148688048, "grad_norm": 12.273193359375, "learning_rate": 3.746355685131195e-05, "loss": 0.4442, "step": 430 }, { "epoch": 1.282798833819242, "grad_norm": 10.925050735473633, "learning_rate": 3.717201166180758e-05, "loss": 0.3749, "step": 440 }, { "epoch": 1.3119533527696792, "grad_norm": 15.893428802490234, "learning_rate": 3.688046647230321e-05, "loss": 0.3903, "step": 450 }, { "epoch": 1.3411078717201166, "grad_norm": 17.293563842773438, "learning_rate": 3.658892128279884e-05, "loss": 0.5249, "step": 460 }, { "epoch": 1.370262390670554, "grad_norm": 10.63139820098877, "learning_rate": 3.629737609329446e-05, "loss": 0.4097, "step": 470 }, { "epoch": 1.3994169096209912, "grad_norm": 10.485889434814453, "learning_rate": 3.6005830903790086e-05, "loss": 0.4057, "step": 480 }, { "epoch": 1.4285714285714286, "grad_norm": 15.918136596679688, "learning_rate": 3.571428571428572e-05, "loss": 0.333, "step": 490 }, { "epoch": 1.4577259475218658, "grad_norm": 5.628406047821045, "learning_rate": 3.542274052478135e-05, "loss": 0.4202, "step": 500 }, { "epoch": 1.4868804664723032, "grad_norm": 8.147640228271484, "learning_rate": 3.5131195335276965e-05, "loss": 0.3755, "step": 510 }, { "epoch": 1.5160349854227406, "grad_norm": 11.122663497924805, "learning_rate": 3.4839650145772596e-05, "loss": 0.3533, "step": 520 }, { "epoch": 1.5451895043731778, "grad_norm": 8.978357315063477, "learning_rate": 3.454810495626823e-05, "loss": 0.3938, "step": 530 }, { "epoch": 1.574344023323615, "grad_norm": 13.574569702148438, "learning_rate": 3.425655976676385e-05, "loss": 0.4004, "step": 540 }, { "epoch": 1.6034985422740524, "grad_norm": 12.93692398071289, "learning_rate": 3.3965014577259475e-05, "loss": 0.3081, "step": 550 }, { "epoch": 1.6326530612244898, "grad_norm": 9.078178405761719, "learning_rate": 3.36734693877551e-05, "loss": 0.3557, "step": 560 }, { "epoch": 1.6618075801749272, "grad_norm": 7.011285305023193, "learning_rate": 3.338192419825073e-05, "loss": 0.3968, "step": 570 }, { "epoch": 1.6909620991253644, "grad_norm": 12.261427879333496, "learning_rate": 3.309037900874636e-05, "loss": 0.349, "step": 580 }, { "epoch": 1.7201166180758016, "grad_norm": 7.620883464813232, "learning_rate": 3.2798833819241985e-05, "loss": 0.3861, "step": 590 }, { "epoch": 1.749271137026239, "grad_norm": 7.384204864501953, "learning_rate": 3.250728862973761e-05, "loss": 0.3951, "step": 600 }, { "epoch": 1.7784256559766765, "grad_norm": 11.785451889038086, "learning_rate": 3.221574344023324e-05, "loss": 0.4344, "step": 610 }, { "epoch": 1.8075801749271136, "grad_norm": 9.102931022644043, "learning_rate": 3.1924198250728864e-05, "loss": 0.3569, "step": 620 }, { "epoch": 1.836734693877551, "grad_norm": 12.737972259521484, "learning_rate": 3.1632653061224494e-05, "loss": 0.4627, "step": 630 }, { "epoch": 1.8658892128279883, "grad_norm": 6.976524829864502, "learning_rate": 3.134110787172012e-05, "loss": 0.4338, "step": 640 }, { "epoch": 1.8950437317784257, "grad_norm": 10.488119125366211, "learning_rate": 3.104956268221574e-05, "loss": 0.4369, "step": 650 }, { "epoch": 1.924198250728863, "grad_norm": 13.129137992858887, "learning_rate": 3.0758017492711373e-05, "loss": 0.4023, "step": 660 }, { "epoch": 1.9533527696793003, "grad_norm": 11.848146438598633, "learning_rate": 3.0466472303207e-05, "loss": 0.3211, "step": 670 }, { "epoch": 1.9825072886297375, "grad_norm": 13.786744117736816, "learning_rate": 3.017492711370263e-05, "loss": 0.4545, "step": 680 }, { "epoch": 2.0, "eval_loss": 0.6418492794036865, "eval_runtime": 344.1102, "eval_samples_per_second": 7.948, "eval_steps_per_second": 0.25, "step": 686 }, { "epoch": 2.011661807580175, "grad_norm": 7.113315582275391, "learning_rate": 2.988338192419825e-05, "loss": 0.3137, "step": 690 }, { "epoch": 2.0408163265306123, "grad_norm": 9.555095672607422, "learning_rate": 2.959183673469388e-05, "loss": 0.1409, "step": 700 }, { "epoch": 2.0699708454810497, "grad_norm": 14.719175338745117, "learning_rate": 2.9300291545189507e-05, "loss": 0.1807, "step": 710 }, { "epoch": 2.0991253644314867, "grad_norm": 6.63716459274292, "learning_rate": 2.9008746355685135e-05, "loss": 0.1116, "step": 720 }, { "epoch": 2.128279883381924, "grad_norm": 4.129220962524414, "learning_rate": 2.871720116618076e-05, "loss": 0.1921, "step": 730 }, { "epoch": 2.1574344023323615, "grad_norm": 7.979690074920654, "learning_rate": 2.8425655976676386e-05, "loss": 0.1378, "step": 740 }, { "epoch": 2.186588921282799, "grad_norm": 11.688068389892578, "learning_rate": 2.8134110787172014e-05, "loss": 0.1304, "step": 750 }, { "epoch": 2.2157434402332363, "grad_norm": 9.168377876281738, "learning_rate": 2.784256559766764e-05, "loss": 0.1774, "step": 760 }, { "epoch": 2.2448979591836733, "grad_norm": 14.069756507873535, "learning_rate": 2.7551020408163265e-05, "loss": 0.1482, "step": 770 }, { "epoch": 2.2740524781341107, "grad_norm": 2.5552010536193848, "learning_rate": 2.7259475218658893e-05, "loss": 0.1194, "step": 780 }, { "epoch": 2.303206997084548, "grad_norm": 7.534204959869385, "learning_rate": 2.696793002915452e-05, "loss": 0.1321, "step": 790 }, { "epoch": 2.3323615160349855, "grad_norm": 6.635725021362305, "learning_rate": 2.6676384839650148e-05, "loss": 0.0902, "step": 800 }, { "epoch": 2.3615160349854225, "grad_norm": 4.213601589202881, "learning_rate": 2.6384839650145775e-05, "loss": 0.182, "step": 810 }, { "epoch": 2.39067055393586, "grad_norm": 11.590983390808105, "learning_rate": 2.60932944606414e-05, "loss": 0.1745, "step": 820 }, { "epoch": 2.4198250728862973, "grad_norm": 9.884819984436035, "learning_rate": 2.5801749271137027e-05, "loss": 0.1733, "step": 830 }, { "epoch": 2.4489795918367347, "grad_norm": 8.458849906921387, "learning_rate": 2.5510204081632654e-05, "loss": 0.1307, "step": 840 }, { "epoch": 2.478134110787172, "grad_norm": 7.574510097503662, "learning_rate": 2.521865889212828e-05, "loss": 0.1253, "step": 850 }, { "epoch": 2.5072886297376096, "grad_norm": 9.187530517578125, "learning_rate": 2.492711370262391e-05, "loss": 0.1538, "step": 860 }, { "epoch": 2.5364431486880465, "grad_norm": 5.005228519439697, "learning_rate": 2.4635568513119533e-05, "loss": 0.117, "step": 870 }, { "epoch": 2.565597667638484, "grad_norm": 3.9266200065612793, "learning_rate": 2.434402332361516e-05, "loss": 0.1449, "step": 880 }, { "epoch": 2.5947521865889214, "grad_norm": 8.087149620056152, "learning_rate": 2.405247813411079e-05, "loss": 0.1652, "step": 890 }, { "epoch": 2.6239067055393583, "grad_norm": 13.278199195861816, "learning_rate": 2.3760932944606415e-05, "loss": 0.1834, "step": 900 }, { "epoch": 2.6530612244897958, "grad_norm": 9.696410179138184, "learning_rate": 2.3469387755102043e-05, "loss": 0.1033, "step": 910 }, { "epoch": 2.682215743440233, "grad_norm": 14.884827613830566, "learning_rate": 2.3177842565597667e-05, "loss": 0.2509, "step": 920 }, { "epoch": 2.7113702623906706, "grad_norm": 7.274765968322754, "learning_rate": 2.2886297376093298e-05, "loss": 0.1497, "step": 930 }, { "epoch": 2.740524781341108, "grad_norm": 11.48387622833252, "learning_rate": 2.2594752186588922e-05, "loss": 0.1106, "step": 940 }, { "epoch": 2.7696793002915454, "grad_norm": 7.414773941040039, "learning_rate": 2.230320699708455e-05, "loss": 0.167, "step": 950 }, { "epoch": 2.7988338192419824, "grad_norm": 11.676288604736328, "learning_rate": 2.2011661807580177e-05, "loss": 0.1707, "step": 960 }, { "epoch": 2.82798833819242, "grad_norm": 7.887070178985596, "learning_rate": 2.1720116618075804e-05, "loss": 0.1284, "step": 970 }, { "epoch": 2.857142857142857, "grad_norm": 16.793603897094727, "learning_rate": 2.1428571428571428e-05, "loss": 0.1464, "step": 980 }, { "epoch": 2.8862973760932946, "grad_norm": 11.829538345336914, "learning_rate": 2.1137026239067056e-05, "loss": 0.109, "step": 990 }, { "epoch": 2.9154518950437316, "grad_norm": 6.482255458831787, "learning_rate": 2.0845481049562683e-05, "loss": 0.1162, "step": 1000 }, { "epoch": 2.944606413994169, "grad_norm": 10.546632766723633, "learning_rate": 2.055393586005831e-05, "loss": 0.1181, "step": 1010 }, { "epoch": 2.9737609329446064, "grad_norm": 3.5399889945983887, "learning_rate": 2.0262390670553938e-05, "loss": 0.1005, "step": 1020 }, { "epoch": 3.0, "eval_loss": 0.680114209651947, "eval_runtime": 335.8694, "eval_samples_per_second": 8.143, "eval_steps_per_second": 0.256, "step": 1029 }, { "epoch": 3.002915451895044, "grad_norm": 3.2238965034484863, "learning_rate": 1.9970845481049562e-05, "loss": 0.166, "step": 1030 }, { "epoch": 3.0320699708454812, "grad_norm": 2.8589000701904297, "learning_rate": 1.9679300291545193e-05, "loss": 0.0457, "step": 1040 }, { "epoch": 3.061224489795918, "grad_norm": 2.0926849842071533, "learning_rate": 1.9387755102040817e-05, "loss": 0.0497, "step": 1050 }, { "epoch": 3.0903790087463556, "grad_norm": 10.41883659362793, "learning_rate": 1.9096209912536444e-05, "loss": 0.0426, "step": 1060 }, { "epoch": 3.119533527696793, "grad_norm": 4.982727527618408, "learning_rate": 1.880466472303207e-05, "loss": 0.0521, "step": 1070 }, { "epoch": 3.1486880466472305, "grad_norm": 2.4037177562713623, "learning_rate": 1.85131195335277e-05, "loss": 0.0271, "step": 1080 }, { "epoch": 3.1778425655976674, "grad_norm": 16.846464157104492, "learning_rate": 1.8221574344023323e-05, "loss": 0.0753, "step": 1090 }, { "epoch": 3.206997084548105, "grad_norm": 0.7762933373451233, "learning_rate": 1.793002915451895e-05, "loss": 0.0203, "step": 1100 }, { "epoch": 3.2361516034985423, "grad_norm": 0.7675560712814331, "learning_rate": 1.7638483965014578e-05, "loss": 0.037, "step": 1110 }, { "epoch": 3.2653061224489797, "grad_norm": 6.022889137268066, "learning_rate": 1.7346938775510206e-05, "loss": 0.0512, "step": 1120 }, { "epoch": 3.294460641399417, "grad_norm": 2.28266978263855, "learning_rate": 1.7055393586005833e-05, "loss": 0.0594, "step": 1130 }, { "epoch": 3.323615160349854, "grad_norm": 2.531623363494873, "learning_rate": 1.6763848396501457e-05, "loss": 0.0747, "step": 1140 }, { "epoch": 3.3527696793002915, "grad_norm": 3.3899452686309814, "learning_rate": 1.6472303206997085e-05, "loss": 0.0716, "step": 1150 }, { "epoch": 3.381924198250729, "grad_norm": 1.235809326171875, "learning_rate": 1.6180758017492712e-05, "loss": 0.0407, "step": 1160 }, { "epoch": 3.4110787172011663, "grad_norm": 2.9576103687286377, "learning_rate": 1.588921282798834e-05, "loss": 0.0434, "step": 1170 }, { "epoch": 3.4402332361516033, "grad_norm": 4.579357624053955, "learning_rate": 1.5597667638483964e-05, "loss": 0.0424, "step": 1180 }, { "epoch": 3.4693877551020407, "grad_norm": 3.1186563968658447, "learning_rate": 1.5306122448979594e-05, "loss": 0.0269, "step": 1190 }, { "epoch": 3.498542274052478, "grad_norm": 6.906335830688477, "learning_rate": 1.5014577259475218e-05, "loss": 0.031, "step": 1200 }, { "epoch": 3.5276967930029155, "grad_norm": 1.3179532289505005, "learning_rate": 1.4723032069970846e-05, "loss": 0.0356, "step": 1210 }, { "epoch": 3.556851311953353, "grad_norm": 2.701486825942993, "learning_rate": 1.4431486880466475e-05, "loss": 0.0472, "step": 1220 }, { "epoch": 3.5860058309037903, "grad_norm": 0.8598970174789429, "learning_rate": 1.41399416909621e-05, "loss": 0.039, "step": 1230 }, { "epoch": 3.6151603498542273, "grad_norm": 3.872262954711914, "learning_rate": 1.3848396501457728e-05, "loss": 0.0296, "step": 1240 }, { "epoch": 3.6443148688046647, "grad_norm": 1.517674446105957, "learning_rate": 1.3556851311953352e-05, "loss": 0.0306, "step": 1250 }, { "epoch": 3.673469387755102, "grad_norm": 2.6350300312042236, "learning_rate": 1.3265306122448982e-05, "loss": 0.0475, "step": 1260 }, { "epoch": 3.702623906705539, "grad_norm": 9.471416473388672, "learning_rate": 1.2973760932944606e-05, "loss": 0.0366, "step": 1270 }, { "epoch": 3.7317784256559765, "grad_norm": 8.837738037109375, "learning_rate": 1.2682215743440235e-05, "loss": 0.0493, "step": 1280 }, { "epoch": 3.760932944606414, "grad_norm": 3.3727469444274902, "learning_rate": 1.239067055393586e-05, "loss": 0.0271, "step": 1290 }, { "epoch": 3.7900874635568513, "grad_norm": 2.860428810119629, "learning_rate": 1.2099125364431488e-05, "loss": 0.0574, "step": 1300 }, { "epoch": 3.8192419825072887, "grad_norm": 12.31369400024414, "learning_rate": 1.1807580174927114e-05, "loss": 0.0567, "step": 1310 }, { "epoch": 3.848396501457726, "grad_norm": 0.33458849787712097, "learning_rate": 1.1516034985422741e-05, "loss": 0.0418, "step": 1320 }, { "epoch": 3.877551020408163, "grad_norm": 3.6039276123046875, "learning_rate": 1.1224489795918369e-05, "loss": 0.0258, "step": 1330 }, { "epoch": 3.9067055393586005, "grad_norm": 2.461503505706787, "learning_rate": 1.0932944606413994e-05, "loss": 0.024, "step": 1340 }, { "epoch": 3.935860058309038, "grad_norm": 1.2435688972473145, "learning_rate": 1.0641399416909622e-05, "loss": 0.0503, "step": 1350 }, { "epoch": 3.9650145772594754, "grad_norm": 6.292943477630615, "learning_rate": 1.0349854227405248e-05, "loss": 0.037, "step": 1360 }, { "epoch": 3.9941690962099123, "grad_norm": 5.214807033538818, "learning_rate": 1.0058309037900875e-05, "loss": 0.03, "step": 1370 }, { "epoch": 4.0, "eval_loss": 0.7046686410903931, "eval_runtime": 338.8017, "eval_samples_per_second": 8.073, "eval_steps_per_second": 0.254, "step": 1372 }, { "epoch": 4.02332361516035, "grad_norm": 0.912588894367218, "learning_rate": 9.7667638483965e-06, "loss": 0.0185, "step": 1380 }, { "epoch": 4.052478134110787, "grad_norm": 0.4329352378845215, "learning_rate": 9.47521865889213e-06, "loss": 0.0049, "step": 1390 }, { "epoch": 4.081632653061225, "grad_norm": 0.5743452906608582, "learning_rate": 9.183673469387756e-06, "loss": 0.0129, "step": 1400 }, { "epoch": 4.110787172011662, "grad_norm": 0.7096942663192749, "learning_rate": 8.892128279883383e-06, "loss": 0.0108, "step": 1410 }, { "epoch": 4.139941690962099, "grad_norm": 2.971156120300293, "learning_rate": 8.600583090379009e-06, "loss": 0.0171, "step": 1420 }, { "epoch": 4.169096209912537, "grad_norm": 1.4994933605194092, "learning_rate": 8.309037900874636e-06, "loss": 0.0203, "step": 1430 }, { "epoch": 4.198250728862973, "grad_norm": 2.779597759246826, "learning_rate": 8.017492711370262e-06, "loss": 0.0115, "step": 1440 }, { "epoch": 4.227405247813411, "grad_norm": 1.1518133878707886, "learning_rate": 7.72594752186589e-06, "loss": 0.0099, "step": 1450 }, { "epoch": 4.256559766763848, "grad_norm": 0.18404270708560944, "learning_rate": 7.434402332361516e-06, "loss": 0.0123, "step": 1460 }, { "epoch": 4.285714285714286, "grad_norm": 0.3961102366447449, "learning_rate": 7.142857142857143e-06, "loss": 0.0082, "step": 1470 }, { "epoch": 4.314868804664723, "grad_norm": 0.31849414110183716, "learning_rate": 6.851311953352769e-06, "loss": 0.0062, "step": 1480 }, { "epoch": 4.34402332361516, "grad_norm": 0.35347217321395874, "learning_rate": 6.559766763848396e-06, "loss": 0.0091, "step": 1490 }, { "epoch": 4.373177842565598, "grad_norm": 1.1199325323104858, "learning_rate": 6.268221574344024e-06, "loss": 0.0088, "step": 1500 }, { "epoch": 4.402332361516035, "grad_norm": 0.714579701423645, "learning_rate": 5.97667638483965e-06, "loss": 0.0106, "step": 1510 }, { "epoch": 4.431486880466473, "grad_norm": 0.4836759865283966, "learning_rate": 5.685131195335277e-06, "loss": 0.0062, "step": 1520 }, { "epoch": 4.460641399416909, "grad_norm": 1.2272969484329224, "learning_rate": 5.393586005830904e-06, "loss": 0.0134, "step": 1530 }, { "epoch": 4.489795918367347, "grad_norm": 1.6942965984344482, "learning_rate": 5.102040816326531e-06, "loss": 0.0158, "step": 1540 }, { "epoch": 4.518950437317784, "grad_norm": 7.356329917907715, "learning_rate": 4.810495626822157e-06, "loss": 0.014, "step": 1550 }, { "epoch": 4.548104956268221, "grad_norm": 0.13148389756679535, "learning_rate": 4.518950437317785e-06, "loss": 0.0078, "step": 1560 }, { "epoch": 4.577259475218659, "grad_norm": 0.39737656712532043, "learning_rate": 4.227405247813411e-06, "loss": 0.0073, "step": 1570 }, { "epoch": 4.606413994169096, "grad_norm": 4.271286964416504, "learning_rate": 3.935860058309039e-06, "loss": 0.0076, "step": 1580 }, { "epoch": 4.635568513119534, "grad_norm": 0.39406758546829224, "learning_rate": 3.644314868804665e-06, "loss": 0.0108, "step": 1590 }, { "epoch": 4.664723032069971, "grad_norm": 0.44297096133232117, "learning_rate": 3.352769679300292e-06, "loss": 0.0064, "step": 1600 }, { "epoch": 4.6938775510204085, "grad_norm": 3.877448558807373, "learning_rate": 3.0612244897959185e-06, "loss": 0.0094, "step": 1610 }, { "epoch": 4.723032069970845, "grad_norm": 0.836543619632721, "learning_rate": 2.7696793002915456e-06, "loss": 0.0093, "step": 1620 }, { "epoch": 4.752186588921282, "grad_norm": 3.5220887660980225, "learning_rate": 2.478134110787172e-06, "loss": 0.0069, "step": 1630 }, { "epoch": 4.78134110787172, "grad_norm": 0.4554370641708374, "learning_rate": 2.1865889212827988e-06, "loss": 0.0108, "step": 1640 }, { "epoch": 4.810495626822157, "grad_norm": 1.146256923675537, "learning_rate": 1.8950437317784258e-06, "loss": 0.0063, "step": 1650 }, { "epoch": 4.839650145772595, "grad_norm": 0.614227294921875, "learning_rate": 1.6034985422740526e-06, "loss": 0.0117, "step": 1660 }, { "epoch": 4.868804664723032, "grad_norm": 1.8233929872512817, "learning_rate": 1.3119533527696794e-06, "loss": 0.0094, "step": 1670 }, { "epoch": 4.8979591836734695, "grad_norm": 1.0853904485702515, "learning_rate": 1.020408163265306e-06, "loss": 0.0105, "step": 1680 }, { "epoch": 4.927113702623907, "grad_norm": 0.5117106437683105, "learning_rate": 7.28862973760933e-07, "loss": 0.0064, "step": 1690 }, { "epoch": 4.956268221574344, "grad_norm": 1.4317443370819092, "learning_rate": 4.373177842565598e-07, "loss": 0.0051, "step": 1700 }, { "epoch": 4.985422740524781, "grad_norm": 1.4140545129776, "learning_rate": 1.457725947521866e-07, "loss": 0.009, "step": 1710 }, { "epoch": 5.0, "eval_loss": 0.6613177061080933, "eval_runtime": 338.7385, "eval_samples_per_second": 8.074, "eval_steps_per_second": 0.254, "step": 1715 } ], "logging_steps": 10, "max_steps": 1715, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.624623496158249e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }