diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,7180 +1,5080 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 12.0, + "epoch": 11.899159663865547, "eval_steps": 500, - "global_step": 1008, + "global_step": 708, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.011904761904761904, - "grad_norm": 17.139904900733598, - "learning_rate": 6.451612903225807e-07, - "loss": 9.2899, + "epoch": 0.01680672268907563, + "grad_norm": 4.385648622708784, + "learning_rate": 9.090909090909091e-07, + "loss": 4.108, "step": 1 }, { - "epoch": 0.023809523809523808, - "grad_norm": 16.71486840638675, - "learning_rate": 1.2903225806451614e-06, - "loss": 9.1432, + "epoch": 0.03361344537815126, + "grad_norm": 4.427354400402251, + "learning_rate": 1.8181818181818183e-06, + "loss": 4.1105, "step": 2 }, { - "epoch": 0.03571428571428571, - "grad_norm": 17.28789649557261, - "learning_rate": 1.935483870967742e-06, - "loss": 9.5388, + "epoch": 0.05042016806722689, + "grad_norm": 3.8339824048548925, + "learning_rate": 2.7272727272727272e-06, + "loss": 3.8298, "step": 3 }, { - "epoch": 0.047619047619047616, - "grad_norm": 13.220304931739214, - "learning_rate": 2.580645161290323e-06, - "loss": 7.9169, + "epoch": 0.06722689075630252, + "grad_norm": 4.9404739040660814, + "learning_rate": 3.6363636363636366e-06, + "loss": 4.4266, "step": 4 }, { - "epoch": 0.05952380952380952, - "grad_norm": 13.609291935209646, - "learning_rate": 3.225806451612903e-06, - "loss": 7.8849, + "epoch": 0.08403361344537816, + "grad_norm": 3.723380522292492, + "learning_rate": 4.5454545454545455e-06, + "loss": 3.6814, "step": 5 }, { - "epoch": 0.07142857142857142, - "grad_norm": 6.3010952694138105, - "learning_rate": 3.870967741935484e-06, - "loss": 5.0183, + "epoch": 0.10084033613445378, + "grad_norm": 10.136378288136632, + "learning_rate": 5.4545454545454545e-06, + "loss": 6.7591, "step": 6 }, { - "epoch": 0.08333333333333333, - "grad_norm": 9.731002424537186, - "learning_rate": 4.516129032258065e-06, - "loss": 6.5099, + "epoch": 0.11764705882352941, + "grad_norm": 3.750584042652294, + "learning_rate": 6.363636363636364e-06, + "loss": 3.6869, "step": 7 }, { - "epoch": 0.09523809523809523, - "grad_norm": 16.946585239278296, - "learning_rate": 5.161290322580646e-06, - "loss": 9.0453, + "epoch": 0.13445378151260504, + "grad_norm": 10.256473970042155, + "learning_rate": 7.272727272727273e-06, + "loss": 6.8465, "step": 8 }, { - "epoch": 0.10714285714285714, - "grad_norm": 13.29361044735187, - "learning_rate": 5.806451612903226e-06, - "loss": 7.8482, + "epoch": 0.15126050420168066, + "grad_norm": 5.299551412905417, + "learning_rate": 8.181818181818183e-06, + "loss": 4.627, "step": 9 }, { - "epoch": 0.11904761904761904, - "grad_norm": 9.616005044641154, - "learning_rate": 6.451612903225806e-06, - "loss": 6.374, + "epoch": 0.16806722689075632, + "grad_norm": 4.073848965470436, + "learning_rate": 9.090909090909091e-06, + "loss": 3.7922, "step": 10 }, { - "epoch": 0.13095238095238096, - "grad_norm": 13.507401041826714, - "learning_rate": 7.096774193548388e-06, - "loss": 7.8621, + "epoch": 0.18487394957983194, + "grad_norm": 6.270475941131987, + "learning_rate": 1e-05, + "loss": 5.0956, "step": 11 }, { - "epoch": 0.14285714285714285, - "grad_norm": 10.212045595466936, - "learning_rate": 7.741935483870968e-06, - "loss": 6.4647, + "epoch": 0.20168067226890757, + "grad_norm": 5.6458603155827225, + "learning_rate": 1.0909090909090909e-05, + "loss": 4.6846, "step": 12 }, { - "epoch": 0.15476190476190477, - "grad_norm": 13.721855682512686, - "learning_rate": 8.387096774193549e-06, - "loss": 7.5479, + "epoch": 0.2184873949579832, + "grad_norm": 4.136016188838338, + "learning_rate": 1.181818181818182e-05, + "loss": 3.9917, "step": 13 }, { - "epoch": 0.16666666666666666, - "grad_norm": 13.558802891735498, - "learning_rate": 9.03225806451613e-06, - "loss": 7.8138, + "epoch": 0.23529411764705882, + "grad_norm": 3.6135748045353333, + "learning_rate": 1.2727272727272728e-05, + "loss": 3.6259, "step": 14 }, { - "epoch": 0.17857142857142858, - "grad_norm": 13.55353645782886, - "learning_rate": 9.67741935483871e-06, - "loss": 7.6195, + "epoch": 0.25210084033613445, + "grad_norm": 4.964675562915649, + "learning_rate": 1.3636363636363637e-05, + "loss": 4.2151, "step": 15 }, { - "epoch": 0.19047619047619047, - "grad_norm": 13.221857295140415, - "learning_rate": 1.0322580645161291e-05, - "loss": 7.5711, + "epoch": 0.2689075630252101, + "grad_norm": 6.524726574282932, + "learning_rate": 1.4545454545454546e-05, + "loss": 5.0345, "step": 16 }, { - "epoch": 0.20238095238095238, - "grad_norm": 14.021266819970709, - "learning_rate": 1.096774193548387e-05, - "loss": 7.4885, + "epoch": 0.2857142857142857, + "grad_norm": 5.639830925355285, + "learning_rate": 1.5454545454545454e-05, + "loss": 4.4856, "step": 17 }, { - "epoch": 0.21428571428571427, - "grad_norm": 13.88113956103264, - "learning_rate": 1.1612903225806453e-05, - "loss": 7.3179, + "epoch": 0.3025210084033613, + "grad_norm": 3.8533215935243077, + "learning_rate": 1.6363636363636366e-05, + "loss": 3.7071, "step": 18 }, { - "epoch": 0.2261904761904762, - "grad_norm": 17.39044599261459, - "learning_rate": 1.2258064516129034e-05, - "loss": 8.622, + "epoch": 0.31932773109243695, + "grad_norm": 5.161749059636034, + "learning_rate": 1.7272727272727274e-05, + "loss": 4.2947, "step": 19 }, { - "epoch": 0.23809523809523808, - "grad_norm": 17.515984626148686, - "learning_rate": 1.2903225806451613e-05, - "loss": 8.5228, + "epoch": 0.33613445378151263, + "grad_norm": 3.6136568344159885, + "learning_rate": 1.8181818181818182e-05, + "loss": 3.4683, "step": 20 }, { - "epoch": 0.25, - "grad_norm": 17.82859899843793, - "learning_rate": 1.3548387096774194e-05, - "loss": 8.7837, + "epoch": 0.35294117647058826, + "grad_norm": 4.435498267084917, + "learning_rate": 1.9090909090909094e-05, + "loss": 3.859, "step": 21 }, { - "epoch": 0.2619047619047619, - "grad_norm": 17.429672226015118, - "learning_rate": 1.4193548387096776e-05, - "loss": 8.2317, + "epoch": 0.3697478991596639, + "grad_norm": 3.919091925431486, + "learning_rate": 2e-05, + "loss": 3.5794, "step": 22 }, { - "epoch": 0.27380952380952384, - "grad_norm": 17.276258653744623, - "learning_rate": 1.4838709677419357e-05, - "loss": 8.0633, + "epoch": 0.3865546218487395, + "grad_norm": 3.488028179094918, + "learning_rate": 1.9999895137366746e-05, + "loss": 3.2719, "step": 23 }, { - "epoch": 0.2857142857142857, - "grad_norm": 9.957462076322903, - "learning_rate": 1.5483870967741936e-05, - "loss": 5.7366, + "epoch": 0.40336134453781514, + "grad_norm": 10.328288130878647, + "learning_rate": 1.9999580551666215e-05, + "loss": 5.7591, "step": 24 }, { - "epoch": 0.2976190476190476, - "grad_norm": 13.620021596849393, - "learning_rate": 1.6129032258064517e-05, - "loss": 6.7714, + "epoch": 0.42016806722689076, + "grad_norm": 3.774723233998757, + "learning_rate": 1.9999056249496065e-05, + "loss": 3.3805, "step": 25 }, { - "epoch": 0.30952380952380953, - "grad_norm": 6.569900345372609, - "learning_rate": 1.6774193548387098e-05, - "loss": 4.4136, + "epoch": 0.4369747899159664, + "grad_norm": 6.265735848819648, + "learning_rate": 1.9998322241852238e-05, + "loss": 4.2421, "step": 26 }, { - "epoch": 0.32142857142857145, - "grad_norm": 14.289993829511333, - "learning_rate": 1.741935483870968e-05, - "loss": 6.1463, + "epoch": 0.453781512605042, + "grad_norm": 5.405371137822227, + "learning_rate": 1.999737854412873e-05, + "loss": 3.8983, "step": 27 }, { - "epoch": 0.3333333333333333, - "grad_norm": 10.418999900918601, - "learning_rate": 1.806451612903226e-05, - "loss": 4.9552, + "epoch": 0.47058823529411764, + "grad_norm": 3.7669212684787263, + "learning_rate": 1.9996225176117264e-05, + "loss": 3.1467, "step": 28 }, { - "epoch": 0.34523809523809523, - "grad_norm": 17.940074442849337, - "learning_rate": 1.870967741935484e-05, - "loss": 6.7963, + "epoch": 0.48739495798319327, + "grad_norm": 5.5431024884923845, + "learning_rate": 1.999486216200688e-05, + "loss": 3.5476, "step": 29 }, { - "epoch": 0.35714285714285715, - "grad_norm": 10.183500386894858, - "learning_rate": 1.935483870967742e-05, - "loss": 4.7842, + "epoch": 0.5042016806722689, + "grad_norm": 5.394079800803299, + "learning_rate": 1.9993289530383433e-05, + "loss": 3.4722, "step": 30 }, { - "epoch": 0.36904761904761907, - "grad_norm": 14.500865445423567, - "learning_rate": 2e-05, - "loss": 5.5398, + "epoch": 0.5210084033613446, + "grad_norm": 5.971573494363461, + "learning_rate": 1.999150731422898e-05, + "loss": 3.6263, "step": 31 }, { - "epoch": 0.38095238095238093, - "grad_norm": 11.000017652493565, - "learning_rate": 1.9999948301225546e-05, - "loss": 4.5158, + "epoch": 0.5378151260504201, + "grad_norm": 7.378751228659765, + "learning_rate": 1.9989515550921088e-05, + "loss": 3.5546, "step": 32 }, { - "epoch": 0.39285714285714285, - "grad_norm": 16.727462615797, - "learning_rate": 1.999979320543672e-05, - "loss": 4.6378, + "epoch": 0.5546218487394958, + "grad_norm": 5.09952164148768, + "learning_rate": 1.998731428223208e-05, + "loss": 3.0747, "step": 33 }, { - "epoch": 0.40476190476190477, - "grad_norm": 11.599298962952556, - "learning_rate": 1.999953471423719e-05, - "loss": 4.1898, + "epoch": 0.5714285714285714, + "grad_norm": 4.765579668290796, + "learning_rate": 1.9984903554328116e-05, + "loss": 2.7589, "step": 34 }, { - "epoch": 0.4166666666666667, - "grad_norm": 12.887490770056774, - "learning_rate": 1.999917283029968e-05, - "loss": 3.9778, + "epoch": 0.5882352941176471, + "grad_norm": 7.224492836222785, + "learning_rate": 1.9982283417768247e-05, + "loss": 2.871, "step": 35 }, { - "epoch": 0.42857142857142855, - "grad_norm": 13.379378492479097, - "learning_rate": 1.9998707557365983e-05, - "loss": 3.3893, + "epoch": 0.6050420168067226, + "grad_norm": 12.931777704179952, + "learning_rate": 1.9979453927503366e-05, + "loss": 3.215, "step": 36 }, { - "epoch": 0.44047619047619047, - "grad_norm": 19.77665757370661, - "learning_rate": 1.9998138900246916e-05, - "loss": 3.134, + "epoch": 0.6218487394957983, + "grad_norm": 3.6837055989248717, + "learning_rate": 1.9976415142875022e-05, + "loss": 2.4582, "step": 37 }, { - "epoch": 0.4523809523809524, - "grad_norm": 13.041378224509636, - "learning_rate": 1.999746686482224e-05, - "loss": 3.0121, + "epoch": 0.6386554621848739, + "grad_norm": 3.163696044731996, + "learning_rate": 1.9973167127614218e-05, + "loss": 2.4035, "step": 38 }, { - "epoch": 0.4642857142857143, - "grad_norm": 10.907950909034863, - "learning_rate": 1.999669145804065e-05, - "loss": 2.8979, + "epoch": 0.6554621848739496, + "grad_norm": 2.6130379292490127, + "learning_rate": 1.9969709949840034e-05, + "loss": 2.27, "step": 39 }, { - "epoch": 0.47619047619047616, - "grad_norm": 7.708693945099904, - "learning_rate": 1.9995812687919653e-05, - "loss": 2.705, + "epoch": 0.6722689075630253, + "grad_norm": 3.0774669175573903, + "learning_rate": 1.9966043682058223e-05, + "loss": 2.2352, "step": 40 }, { - "epoch": 0.4880952380952381, - "grad_norm": 4.95653030392137, - "learning_rate": 1.9994830563545524e-05, - "loss": 2.5711, + "epoch": 0.6890756302521008, + "grad_norm": 3.3703328756445514, + "learning_rate": 1.9962168401159685e-05, + "loss": 2.3572, "step": 41 }, { - "epoch": 0.5, - "grad_norm": 8.083358218918184, - "learning_rate": 1.999374509507318e-05, - "loss": 2.5499, + "epoch": 0.7058823529411765, + "grad_norm": 6.674678012817149, + "learning_rate": 1.995808418841885e-05, + "loss": 2.5806, "step": 42 }, { - "epoch": 0.5119047619047619, - "grad_norm": 8.299946354650668, - "learning_rate": 1.999255629372611e-05, - "loss": 2.4961, + "epoch": 0.7226890756302521, + "grad_norm": 2.7692115403847932, + "learning_rate": 1.9953791129491985e-05, + "loss": 2.2257, "step": 43 }, { - "epoch": 0.5238095238095238, - "grad_norm": 17.357515973246574, - "learning_rate": 1.9991264171796213e-05, - "loss": 1.8883, + "epoch": 0.7394957983193278, + "grad_norm": 3.4385080781188013, + "learning_rate": 1.9949289314415373e-05, + "loss": 2.3251, "step": 44 }, { - "epoch": 0.5357142857142857, - "grad_norm": 8.01068189035409, - "learning_rate": 1.9989868742643725e-05, - "loss": 2.2778, + "epoch": 0.7563025210084033, + "grad_norm": 3.513307097744062, + "learning_rate": 1.994457883760346e-05, + "loss": 2.2311, "step": 45 }, { - "epoch": 0.5476190476190477, - "grad_norm": 8.493792830509063, - "learning_rate": 1.998837002069704e-05, - "loss": 2.1784, + "epoch": 0.773109243697479, + "grad_norm": 3.485460514370931, + "learning_rate": 1.993965979784684e-05, + "loss": 2.0451, "step": 46 }, { - "epoch": 0.5595238095238095, - "grad_norm": 8.346064217450877, - "learning_rate": 1.9986768021452575e-05, - "loss": 2.1233, + "epoch": 0.7899159663865546, + "grad_norm": 2.6480026940047967, + "learning_rate": 1.9934532298310206e-05, + "loss": 2.1224, "step": 47 }, { - "epoch": 0.5714285714285714, - "grad_norm": 11.141768579260864, - "learning_rate": 1.9985062761474605e-05, - "loss": 1.5602, + "epoch": 0.8067226890756303, + "grad_norm": 3.191875488563921, + "learning_rate": 1.9929196446530184e-05, + "loss": 2.1002, "step": 48 }, { - "epoch": 0.5833333333333334, - "grad_norm": 11.272582504816725, - "learning_rate": 1.9983254258395105e-05, - "loss": 1.4972, + "epoch": 0.8235294117647058, + "grad_norm": 3.3443355134212833, + "learning_rate": 1.992365235441306e-05, + "loss": 2.0233, "step": 49 }, { - "epoch": 0.5952380952380952, - "grad_norm": 4.0084233661961415, - "learning_rate": 1.9981342530913556e-05, - "loss": 2.4526, + "epoch": 0.8403361344537815, + "grad_norm": 2.5999640350239948, + "learning_rate": 1.991790013823246e-05, + "loss": 1.966, "step": 50 }, { - "epoch": 0.6071428571428571, - "grad_norm": 9.158504351728157, - "learning_rate": 1.997932759879674e-05, - "loss": 1.234, + "epoch": 0.8571428571428571, + "grad_norm": 6.089294042169637, + "learning_rate": 1.991193991862689e-05, + "loss": 1.8622, "step": 51 }, { - "epoch": 0.6190476190476191, - "grad_norm": 9.338811416846326, - "learning_rate": 1.9977209482878576e-05, - "loss": 1.1168, + "epoch": 0.8739495798319328, + "grad_norm": 3.2204266121939362, + "learning_rate": 1.9905771820597214e-05, + "loss": 1.8818, "step": 52 }, { - "epoch": 0.6309523809523809, - "grad_norm": 11.308701021449329, - "learning_rate": 1.9974988205059848e-05, - "loss": 1.0833, + "epoch": 0.8907563025210085, + "grad_norm": 3.3930839641806942, + "learning_rate": 1.989939597350403e-05, + "loss": 1.7539, "step": 53 }, { - "epoch": 0.6428571428571429, - "grad_norm": 5.618069930191442, - "learning_rate": 1.997266378830804e-05, - "loss": 2.1432, + "epoch": 0.907563025210084, + "grad_norm": 3.449905117774067, + "learning_rate": 1.9892812511064962e-05, + "loss": 1.7651, "step": 54 }, { - "epoch": 0.6547619047619048, - "grad_norm": 6.282292794573764, - "learning_rate": 1.9970236256657044e-05, - "loss": 0.9732, + "epoch": 0.9243697478991597, + "grad_norm": 3.3847593536709097, + "learning_rate": 1.9886021571351854e-05, + "loss": 1.6882, "step": 55 }, { - "epoch": 0.6666666666666666, - "grad_norm": 4.707647398070364, - "learning_rate": 1.9967705635206937e-05, - "loss": 1.5313, + "epoch": 0.9411764705882353, + "grad_norm": 3.845735022445724, + "learning_rate": 1.9879023296787866e-05, + "loss": 1.719, "step": 56 }, { - "epoch": 0.6785714285714286, - "grad_norm": 6.472249482990302, - "learning_rate": 1.9965071950123732e-05, - "loss": 0.8117, + "epoch": 0.957983193277311, + "grad_norm": 2.8219721101939386, + "learning_rate": 1.9871817834144506e-05, + "loss": 1.7362, "step": 57 }, { - "epoch": 0.6904761904761905, - "grad_norm": 4.307567663459201, - "learning_rate": 1.9962335228639084e-05, - "loss": 0.8454, + "epoch": 0.9747899159663865, + "grad_norm": 2.499202155153014, + "learning_rate": 1.9864405334538518e-05, + "loss": 1.7014, "step": 58 }, { - "epoch": 0.7023809523809523, - "grad_norm": 3.076092506616235, - "learning_rate": 1.9959495499050027e-05, - "loss": 0.6874, + "epoch": 0.9915966386554622, + "grad_norm": 2.511155724416345, + "learning_rate": 1.9856785953428757e-05, + "loss": 1.7195, + "step": 59 + }, + { + "epoch": 0.9915966386554622, + "eval_loss": 1.2069061994552612, + "eval_runtime": 115.8201, + "eval_samples_per_second": 0.812, + "eval_steps_per_second": 0.812, "step": 59 }, { - "epoch": 0.7142857142857143, - "grad_norm": 4.764863570511213, - "learning_rate": 1.9956552790718663e-05, - "loss": 1.9864, + "epoch": 1.0084033613445378, + "grad_norm": 5.357847266371679, + "learning_rate": 1.9848959850612895e-05, + "loss": 1.1994, "step": 60 }, { - "epoch": 0.7261904761904762, - "grad_norm": 4.331608161030718, - "learning_rate": 1.995350713407188e-05, - "loss": 0.252, + "epoch": 1.0252100840336134, + "grad_norm": 2.5172607304803383, + "learning_rate": 1.9840927190224083e-05, + "loss": 1.5096, "step": 61 }, { - "epoch": 0.7380952380952381, - "grad_norm": 6.590145628301957, - "learning_rate": 1.9950358560601014e-05, - "loss": 1.3728, + "epoch": 1.0420168067226891, + "grad_norm": 2.383993749480795, + "learning_rate": 1.9832688140727502e-05, + "loss": 1.5336, "step": 62 }, { - "epoch": 0.75, - "grad_norm": 4.867440106476626, - "learning_rate": 1.994710710286155e-05, - "loss": 0.8254, + "epoch": 1.0588235294117647, + "grad_norm": 5.571125580296848, + "learning_rate": 1.982424287491684e-05, + "loss": 1.0043, "step": 63 }, { - "epoch": 0.7619047619047619, - "grad_norm": 2.4733606881650556, - "learning_rate": 1.994375279447276e-05, - "loss": 0.7131, + "epoch": 1.0756302521008403, + "grad_norm": 2.2388809453197083, + "learning_rate": 1.9815591569910654e-05, + "loss": 1.2905, "step": 64 }, { - "epoch": 0.7738095238095238, - "grad_norm": 4.226942008613792, - "learning_rate": 1.9940295670117373e-05, - "loss": 0.2012, + "epoch": 1.092436974789916, + "grad_norm": 2.4116257513181356, + "learning_rate": 1.9806734407148674e-05, + "loss": 1.5209, "step": 65 }, { - "epoch": 0.7857142857142857, - "grad_norm": 3.08282145132758, - "learning_rate": 1.9936735765541202e-05, - "loss": 1.3355, + "epoch": 1.1092436974789917, + "grad_norm": 2.268214660771143, + "learning_rate": 1.9797671572387985e-05, + "loss": 1.3414, "step": 66 }, { - "epoch": 0.7976190476190477, - "grad_norm": 2.625421213967175, - "learning_rate": 1.9933073117552798e-05, - "loss": 1.1274, + "epoch": 1.1260504201680672, + "grad_norm": 2.1242648315918036, + "learning_rate": 1.978840325569912e-05, + "loss": 1.3624, "step": 67 }, { - "epoch": 0.8095238095238095, - "grad_norm": 4.111053115808539, - "learning_rate": 1.9929307764023032e-05, - "loss": 1.2906, + "epoch": 1.1428571428571428, + "grad_norm": 6.45315294142602, + "learning_rate": 1.977892965146211e-05, + "loss": 0.7577, "step": 68 }, { - "epoch": 0.8214285714285714, - "grad_norm": 2.284397902515471, - "learning_rate": 1.992543974388475e-05, - "loss": 1.1372, + "epoch": 1.1596638655462184, + "grad_norm": 2.0834553976731596, + "learning_rate": 1.976925095836236e-05, + "loss": 1.242, "step": 69 }, { - "epoch": 0.8333333333333334, - "grad_norm": 4.81537458518723, - "learning_rate": 1.992146909713232e-05, - "loss": 0.7222, + "epoch": 1.1764705882352942, + "grad_norm": 2.1599404414408934, + "learning_rate": 1.975936737938653e-05, + "loss": 1.2737, "step": 70 }, { - "epoch": 0.8452380952380952, - "grad_norm": 4.635344777090823, - "learning_rate": 1.9917395864821262e-05, - "loss": 1.3808, + "epoch": 1.1932773109243697, + "grad_norm": 3.6995588982952263, + "learning_rate": 1.9749279121818235e-05, + "loss": 0.7993, "step": 71 }, { - "epoch": 0.8571428571428571, - "grad_norm": 5.22342912644428, - "learning_rate": 1.9913220089067794e-05, - "loss": 2.2206, + "epoch": 1.2100840336134453, + "grad_norm": 2.352377611171032, + "learning_rate": 1.9738986397233736e-05, + "loss": 0.7334, "step": 72 }, { - "epoch": 0.8690476190476191, - "grad_norm": 4.03293534110668, - "learning_rate": 1.990894181304842e-05, - "loss": 1.8158, + "epoch": 1.226890756302521, + "grad_norm": 2.152202331744084, + "learning_rate": 1.9728489421497465e-05, + "loss": 1.3078, "step": 73 }, { - "epoch": 0.8809523809523809, - "grad_norm": 2.675790769254283, - "learning_rate": 1.9904561080999466e-05, - "loss": 0.6809, + "epoch": 1.2436974789915967, + "grad_norm": 2.5203922797690015, + "learning_rate": 1.9717788414757523e-05, + "loss": 1.1814, "step": 74 }, { - "epoch": 0.8928571428571429, - "grad_norm": 6.044524344029198, - "learning_rate": 1.9900077938216623e-05, - "loss": 1.2401, + "epoch": 1.2605042016806722, + "grad_norm": 3.4705153272314475, + "learning_rate": 1.9706883601441066e-05, + "loss": 1.3192, "step": 75 }, { - "epoch": 0.9047619047619048, - "grad_norm": 3.975480451115108, - "learning_rate": 1.9895492431054492e-05, - "loss": 1.2328, + "epoch": 1.2773109243697478, + "grad_norm": 1.8630027919414862, + "learning_rate": 1.969577521024958e-05, + "loss": 0.7795, "step": 76 }, { - "epoch": 0.9166666666666666, - "grad_norm": 3.7971112191541994, - "learning_rate": 1.989080460692609e-05, - "loss": 2.1511, + "epoch": 1.2941176470588236, + "grad_norm": 2.9345669701977912, + "learning_rate": 1.9684463474154095e-05, + "loss": 1.2665, "step": 77 }, { - "epoch": 0.9285714285714286, - "grad_norm": 4.980700467181216, - "learning_rate": 1.988601451430237e-05, - "loss": 0.6743, + "epoch": 1.3109243697478992, + "grad_norm": 1.4751109531953661, + "learning_rate": 1.9672948630390296e-05, + "loss": 0.6425, "step": 78 }, { - "epoch": 0.9404761904761905, - "grad_norm": 2.4914351987187953, - "learning_rate": 1.9881122202711717e-05, - "loss": 1.1282, + "epoch": 1.3277310924369747, + "grad_norm": 3.300164935057302, + "learning_rate": 1.9661230920453553e-05, + "loss": 1.2188, "step": 79 }, { - "epoch": 0.9523809523809523, - "grad_norm": 2.836683437613531, - "learning_rate": 1.9876127722739436e-05, - "loss": 0.5786, + "epoch": 1.3445378151260505, + "grad_norm": 2.835411015713917, + "learning_rate": 1.964931059009385e-05, + "loss": 1.263, "step": 80 }, { - "epoch": 0.9642857142857143, - "grad_norm": 3.038983217485337, - "learning_rate": 1.987103112602722e-05, - "loss": 1.1614, + "epoch": 1.361344537815126, + "grad_norm": 2.235484344805954, + "learning_rate": 1.9637187889310632e-05, + "loss": 1.0342, "step": 81 }, { - "epoch": 0.9761904761904762, - "grad_norm": 3.137725304157932, - "learning_rate": 1.9865832465272636e-05, - "loss": 1.0887, + "epoch": 1.3781512605042017, + "grad_norm": 1.8454261871804942, + "learning_rate": 1.9624863072347565e-05, + "loss": 1.0816, "step": 82 }, { - "epoch": 0.9880952380952381, - "grad_norm": 4.0982549880174055, - "learning_rate": 1.986053179422856e-05, - "loss": 0.6275, + "epoch": 1.3949579831932772, + "grad_norm": 1.8700087156945158, + "learning_rate": 1.96123363976872e-05, + "loss": 1.0611, "step": 83 }, { - "epoch": 1.0, - "grad_norm": 4.228465015472441, - "learning_rate": 1.9855129167702625e-05, - "loss": 1.247, + "epoch": 1.4117647058823528, + "grad_norm": 1.776825090533238, + "learning_rate": 1.9599608128045554e-05, + "loss": 1.0875, "step": 84 }, { - "epoch": 1.0, - "eval_loss": 0.855706512928009, - "eval_runtime": 57.6965, - "eval_samples_per_second": 1.04, - "eval_steps_per_second": 1.04, - "step": 84 - }, - { - "epoch": 1.0119047619047619, - "grad_norm": 2.4225946522090993, - "learning_rate": 1.9849624641556676e-05, - "loss": 1.0853, + "epoch": 1.4285714285714286, + "grad_norm": 3.2507229249266074, + "learning_rate": 1.9586678530366607e-05, + "loss": 0.6209, "step": 85 }, { - "epoch": 1.0238095238095237, - "grad_norm": 3.7101089119391726, - "learning_rate": 1.9844018272706155e-05, - "loss": 1.1892, + "epoch": 1.4453781512605042, + "grad_norm": 2.0636370694690442, + "learning_rate": 1.9573547875816685e-05, + "loss": 1.0625, "step": 86 }, { - "epoch": 1.0357142857142858, - "grad_norm": 2.755180603183131, - "learning_rate": 1.9838310119119545e-05, - "loss": 1.1356, + "epoch": 1.46218487394958, + "grad_norm": 1.941254802441877, + "learning_rate": 1.9560216439778795e-05, + "loss": 1.086, "step": 87 }, { - "epoch": 1.0476190476190477, - "grad_norm": 2.2353314330160097, - "learning_rate": 1.983250023981776e-05, - "loss": 1.4771, + "epoch": 1.4789915966386555, + "grad_norm": 1.9461861491812582, + "learning_rate": 1.954668450184683e-05, + "loss": 0.9987, "step": 88 }, { - "epoch": 1.0595238095238095, - "grad_norm": 2.1469727041530002, - "learning_rate": 1.9826588694873515e-05, - "loss": 1.4555, + "epoch": 1.495798319327731, + "grad_norm": 2.064972172013034, + "learning_rate": 1.9532952345819723e-05, + "loss": 1.0663, "step": 89 }, { - "epoch": 1.0714285714285714, - "grad_norm": 2.669812575529475, - "learning_rate": 1.982057554541075e-05, - "loss": 0.2101, + "epoch": 1.5126050420168067, + "grad_norm": 1.778380925304329, + "learning_rate": 1.951902025969548e-05, + "loss": 0.9426, "step": 90 }, { - "epoch": 1.0833333333333333, - "grad_norm": 2.9321935610432384, - "learning_rate": 1.981446085360395e-05, - "loss": 1.0504, + "epoch": 1.5294117647058822, + "grad_norm": 2.361605116359202, + "learning_rate": 1.950488853566515e-05, + "loss": 0.9878, "step": 91 }, { - "epoch": 1.0952380952380953, - "grad_norm": 2.4895259465307613, - "learning_rate": 1.980824468267753e-05, - "loss": 0.6211, + "epoch": 1.5462184873949578, + "grad_norm": 1.7448015788038604, + "learning_rate": 1.949055747010669e-05, + "loss": 1.054, "step": 92 }, { - "epoch": 1.1071428571428572, - "grad_norm": 3.3343638502561483, - "learning_rate": 1.980192709690517e-05, - "loss": 1.0306, + "epoch": 1.5630252100840336, + "grad_norm": 2.411683085008472, + "learning_rate": 1.9476027363578754e-05, + "loss": 0.9234, "step": 93 }, { - "epoch": 1.119047619047619, - "grad_norm": 2.3278851742380944, - "learning_rate": 1.9795508161609166e-05, - "loss": 0.5428, + "epoch": 1.5798319327731094, + "grad_norm": 1.8173611017503086, + "learning_rate": 1.946129852081439e-05, + "loss": 0.9983, "step": 94 }, { - "epoch": 1.130952380952381, - "grad_norm": 1.482575910310281, - "learning_rate": 1.978898794315973e-05, - "loss": 1.0905, + "epoch": 1.596638655462185, + "grad_norm": 1.9295587283934226, + "learning_rate": 1.9446371250714645e-05, + "loss": 0.9747, "step": 95 }, { - "epoch": 1.1428571428571428, - "grad_norm": 2.0177449871722386, - "learning_rate": 1.9782366508974325e-05, - "loss": 0.6188, + "epoch": 1.6134453781512605, + "grad_norm": 1.772861667848063, + "learning_rate": 1.943124586634209e-05, + "loss": 0.8761, "step": 96 }, { - "epoch": 1.1547619047619047, - "grad_norm": 4.772581701887709, - "learning_rate": 1.9775643927516956e-05, - "loss": 1.4641, + "epoch": 1.6302521008403361, + "grad_norm": 4.814779501362561, + "learning_rate": 1.941592268491425e-05, + "loss": 0.6477, "step": 97 }, { - "epoch": 1.1666666666666667, - "grad_norm": 1.851753440929594, - "learning_rate": 1.9768820268297467e-05, - "loss": 0.5434, + "epoch": 1.6470588235294117, + "grad_norm": 1.836972617211195, + "learning_rate": 1.9400402027796955e-05, + "loss": 0.924, "step": 98 }, { - "epoch": 1.1785714285714286, - "grad_norm": 1.8955707469335776, - "learning_rate": 1.976189560187083e-05, - "loss": 0.6231, + "epoch": 1.6638655462184873, + "grad_norm": 2.3766647197122595, + "learning_rate": 1.9384684220497605e-05, + "loss": 0.7468, "step": 99 }, { - "epoch": 1.1904761904761905, - "grad_norm": 1.4609006468604573, - "learning_rate": 1.9754869999836385e-05, - "loss": 0.898, + "epoch": 1.680672268907563, + "grad_norm": 1.6832736777019341, + "learning_rate": 1.936876959265833e-05, + "loss": 0.8439, "step": 100 }, { - "epoch": 1.2023809523809523, - "grad_norm": 2.455697301128514, - "learning_rate": 1.9747743534837143e-05, - "loss": 0.5471, + "epoch": 1.6974789915966386, + "grad_norm": 1.819587589935648, + "learning_rate": 1.9352658478049085e-05, + "loss": 0.7831, "step": 101 }, { - "epoch": 1.2142857142857142, - "grad_norm": 1.7348261869056978, - "learning_rate": 1.9740516280559005e-05, - "loss": 0.5655, + "epoch": 1.7142857142857144, + "grad_norm": 6.083715872130075, + "learning_rate": 1.9336351214560648e-05, + "loss": 0.5828, "step": 102 }, { - "epoch": 1.2261904761904763, - "grad_norm": 5.178036354232653, - "learning_rate": 1.973318831173001e-05, - "loss": 0.6072, + "epoch": 1.73109243697479, + "grad_norm": 2.0455350806117827, + "learning_rate": 1.9319848144197543e-05, + "loss": 0.8423, "step": 103 }, { - "epoch": 1.2380952380952381, - "grad_norm": 1.6076518163637115, - "learning_rate": 1.972575970411955e-05, - "loss": 0.9794, + "epoch": 1.7478991596638656, + "grad_norm": 2.575532108553041, + "learning_rate": 1.9303149613070852e-05, + "loss": 0.8617, "step": 104 }, { - "epoch": 1.25, - "grad_norm": 2.723769398836049, - "learning_rate": 1.971823053453762e-05, - "loss": 1.3593, + "epoch": 1.7647058823529411, + "grad_norm": 1.698878909041017, + "learning_rate": 1.928625597139096e-05, + "loss": 0.7528, "step": 105 }, { - "epoch": 1.2619047619047619, - "grad_norm": 1.2127891225080338, - "learning_rate": 1.971060088083398e-05, - "loss": 0.4988, + "epoch": 1.7815126050420167, + "grad_norm": 4.480689810270073, + "learning_rate": 1.926916757346022e-05, + "loss": 0.4962, "step": 106 }, { - "epoch": 1.2738095238095237, - "grad_norm": 1.4952785968327083, - "learning_rate": 1.9702870821897385e-05, - "loss": 0.9151, + "epoch": 1.7983193277310925, + "grad_norm": 2.43491273709643, + "learning_rate": 1.9251884777665513e-05, + "loss": 0.6916, "step": 107 }, { - "epoch": 1.2857142857142856, - "grad_norm": 3.485596886623978, - "learning_rate": 1.9695040437654744e-05, - "loss": 0.5045, + "epoch": 1.815126050420168, + "grad_norm": 2.9374762528937945, + "learning_rate": 1.9234407946470735e-05, + "loss": 0.7218, "step": 108 }, { - "epoch": 1.2976190476190477, - "grad_norm": 6.271447145920052, - "learning_rate": 1.9687109809070314e-05, - "loss": 1.3797, + "epoch": 1.8319327731092439, + "grad_norm": 3.110996772971968, + "learning_rate": 1.9216737446409192e-05, + "loss": 0.3829, "step": 109 }, { - "epoch": 1.3095238095238095, - "grad_norm": 7.3279656814005385, - "learning_rate": 1.967907901814485e-05, - "loss": 0.1904, + "epoch": 1.8487394957983194, + "grad_norm": 1.963122392871237, + "learning_rate": 1.919887364807592e-05, + "loss": 0.7205, "step": 110 }, { - "epoch": 1.3214285714285714, - "grad_norm": 1.0931520962056926, - "learning_rate": 1.9670948147914758e-05, - "loss": 0.4405, + "epoch": 1.865546218487395, + "grad_norm": 3.066257574584323, + "learning_rate": 1.9180816926119903e-05, + "loss": 0.5874, "step": 111 }, { - "epoch": 1.3333333333333333, - "grad_norm": 1.4586197259444877, - "learning_rate": 1.9662717282451248e-05, - "loss": 0.5086, + "epoch": 1.8823529411764706, + "grad_norm": 2.2609617726833946, + "learning_rate": 1.9162567659236227e-05, + "loss": 0.5984, "step": 112 }, { - "epoch": 1.3452380952380953, - "grad_norm": 1.5506740361035394, - "learning_rate": 1.965438650685945e-05, - "loss": 0.9044, + "epoch": 1.8991596638655461, + "grad_norm": 2.6110497038118727, + "learning_rate": 1.9144126230158127e-05, + "loss": 0.6068, "step": 113 }, { - "epoch": 1.3571428571428572, - "grad_norm": 1.943168328137838, - "learning_rate": 1.9645955907277544e-05, - "loss": 0.072, + "epoch": 1.9159663865546217, + "grad_norm": 2.2601235737134195, + "learning_rate": 1.9125493025648963e-05, + "loss": 0.6273, "step": 114 }, { - "epoch": 1.369047619047619, - "grad_norm": 5.388416018455866, - "learning_rate": 1.9637425570875857e-05, - "loss": 0.9327, + "epoch": 1.9327731092436975, + "grad_norm": 2.271778447040359, + "learning_rate": 1.9106668436494113e-05, + "loss": 0.6891, "step": 115 }, { - "epoch": 1.380952380952381, - "grad_norm": 6.028065865896953, - "learning_rate": 1.9628795585855986e-05, - "loss": 0.6004, + "epoch": 1.949579831932773, + "grad_norm": 2.0592194165628275, + "learning_rate": 1.908765285749278e-05, + "loss": 0.6283, "step": 116 }, { - "epoch": 1.3928571428571428, - "grad_norm": 2.3104045537885853, - "learning_rate": 1.9620066041449854e-05, - "loss": 1.2527, + "epoch": 1.9663865546218489, + "grad_norm": 2.096696308042481, + "learning_rate": 1.9068446687449698e-05, + "loss": 0.5497, "step": 117 }, { - "epoch": 1.4047619047619047, - "grad_norm": 3.8366407382177226, - "learning_rate": 1.961123702791881e-05, - "loss": 0.5362, + "epoch": 1.9831932773109244, + "grad_norm": 2.4126456796916744, + "learning_rate": 1.9049050329166778e-05, + "loss": 0.5112, "step": 118 }, { - "epoch": 1.4166666666666667, - "grad_norm": 1.6321334992158414, - "learning_rate": 1.9602308636552692e-05, - "loss": 0.9381, + "epoch": 2.0, + "grad_norm": 7.4662803136656954, + "learning_rate": 1.9029464189434663e-05, + "loss": 0.5049, "step": 119 }, { - "epoch": 1.4285714285714286, - "grad_norm": 2.6046556479389738, - "learning_rate": 1.9593280959668878e-05, - "loss": 0.4521, + "epoch": 2.0, + "eval_loss": 0.4023877680301666, + "eval_runtime": 94.9843, + "eval_samples_per_second": 0.99, + "eval_steps_per_second": 0.99, + "step": 119 + }, + { + "epoch": 2.0168067226890756, + "grad_norm": 2.736890471961608, + "learning_rate": 1.900968867902419e-05, + "loss": 0.4793, "step": 120 }, { - "epoch": 1.4404761904761905, - "grad_norm": 2.426572164115562, - "learning_rate": 1.9584154090611335e-05, - "loss": 0.4931, + "epoch": 2.033613445378151, + "grad_norm": 2.030226042772956, + "learning_rate": 1.8989724212677784e-05, + "loss": 0.4857, "step": 121 }, { - "epoch": 1.4523809523809523, - "grad_norm": 5.329850166878175, - "learning_rate": 1.957492812374965e-05, - "loss": 0.6056, + "epoch": 2.0504201680672267, + "grad_norm": 2.308935605416641, + "learning_rate": 1.8969571209100738e-05, + "loss": 0.3551, "step": 122 }, { - "epoch": 1.4642857142857144, - "grad_norm": 4.396192643131844, - "learning_rate": 1.956560315447806e-05, - "loss": 0.8952, + "epoch": 2.0672268907563027, + "grad_norm": 1.9661040290247784, + "learning_rate": 1.8949230090952463e-05, + "loss": 0.4291, "step": 123 }, { - "epoch": 1.4761904761904763, - "grad_norm": 5.802283102298066, - "learning_rate": 1.9556179279214464e-05, - "loss": 0.1045, + "epoch": 2.0840336134453783, + "grad_norm": 2.0802991306768854, + "learning_rate": 1.8928701284837597e-05, + "loss": 0.4988, "step": 124 }, { - "epoch": 1.4880952380952381, - "grad_norm": 1.7429659702829874, - "learning_rate": 1.9546656595399418e-05, - "loss": 0.1404, + "epoch": 2.100840336134454, + "grad_norm": 2.0446185078523804, + "learning_rate": 1.890798522129708e-05, + "loss": 0.3199, "step": 125 }, { - "epoch": 1.5, - "grad_norm": 4.908284880057786, - "learning_rate": 1.9537035201495144e-05, - "loss": 0.5325, + "epoch": 2.1176470588235294, + "grad_norm": 2.403214003507413, + "learning_rate": 1.8887082334799098e-05, + "loss": 0.4125, "step": 126 }, { - "epoch": 1.5119047619047619, - "grad_norm": 1.3604274167302775, - "learning_rate": 1.9527315196984488e-05, - "loss": 0.8657, + "epoch": 2.134453781512605, + "grad_norm": 1.7254339304792963, + "learning_rate": 1.8865993063730003e-05, + "loss": 0.3017, "step": 127 }, { - "epoch": 1.5238095238095237, - "grad_norm": 4.411139336383562, - "learning_rate": 1.9517496682369925e-05, - "loss": 0.4741, + "epoch": 2.1512605042016806, + "grad_norm": 2.3578101043917643, + "learning_rate": 1.884471785038509e-05, + "loss": 0.228, "step": 128 }, { - "epoch": 1.5357142857142856, - "grad_norm": 3.2777311634635056, - "learning_rate": 1.950757975917248e-05, - "loss": 0.0753, + "epoch": 2.168067226890756, + "grad_norm": 1.6260308846139646, + "learning_rate": 1.882325714095934e-05, + "loss": 0.1768, "step": 129 }, { - "epoch": 1.5476190476190477, - "grad_norm": 2.1472145073542084, - "learning_rate": 1.9497564529930716e-05, - "loss": 0.1772, + "epoch": 2.184873949579832, + "grad_norm": 1.9572713847794219, + "learning_rate": 1.8801611385538047e-05, + "loss": 0.3387, "step": 130 }, { - "epoch": 1.5595238095238095, - "grad_norm": 1.5922012254987603, - "learning_rate": 1.9487451098199642e-05, - "loss": 0.921, + "epoch": 2.2016806722689077, + "grad_norm": 2.0610001563380127, + "learning_rate": 1.8779781038087406e-05, + "loss": 0.3104, "step": 131 }, { - "epoch": 1.5714285714285714, - "grad_norm": 2.6652448912819633, - "learning_rate": 1.9477239568549667e-05, - "loss": 0.8894, + "epoch": 2.2184873949579833, + "grad_norm": 2.770005426395026, + "learning_rate": 1.875776655644495e-05, + "loss": 0.3574, "step": 132 }, { - "epoch": 1.5833333333333335, - "grad_norm": 7.012220577504933, - "learning_rate": 1.9466930046565504e-05, - "loss": 0.5622, + "epoch": 2.235294117647059, + "grad_norm": 3.4273616297199796, + "learning_rate": 1.8735568402309987e-05, + "loss": 0.3576, "step": 133 }, { - "epoch": 1.5952380952380953, - "grad_norm": 14.136593122149362, - "learning_rate": 1.9456522638845082e-05, - "loss": 0.2466, + "epoch": 2.2521008403361344, + "grad_norm": 2.0797522448134242, + "learning_rate": 1.8713187041233896e-05, + "loss": 0.289, "step": 134 }, { - "epoch": 1.6071428571428572, - "grad_norm": 4.579182399679084, - "learning_rate": 1.9446017452998448e-05, - "loss": 0.0872, + "epoch": 2.26890756302521, + "grad_norm": 5.8224416033882065, + "learning_rate": 1.8690622942610367e-05, + "loss": 0.231, "step": 135 }, { - "epoch": 1.619047619047619, - "grad_norm": 5.159279776844667, - "learning_rate": 1.943541459764665e-05, - "loss": 0.0711, + "epoch": 2.2857142857142856, + "grad_norm": 1.805558912475057, + "learning_rate": 1.8667876579665556e-05, + "loss": 0.2549, "step": 136 }, { - "epoch": 1.630952380952381, - "grad_norm": 1.6198531580129043, - "learning_rate": 1.9424714182420606e-05, - "loss": 0.9309, + "epoch": 2.302521008403361, + "grad_norm": 2.1386013924513407, + "learning_rate": 1.8644948429448174e-05, + "loss": 0.3056, "step": 137 }, { - "epoch": 1.6428571428571428, - "grad_norm": 1.91584402163816, - "learning_rate": 1.9413916317959994e-05, - "loss": 0.1586, + "epoch": 2.3193277310924367, + "grad_norm": 2.891103001226989, + "learning_rate": 1.862183897281946e-05, + "loss": 0.2917, "step": 138 }, { - "epoch": 1.6547619047619047, - "grad_norm": 1.5986601917930672, - "learning_rate": 1.9403021115912083e-05, - "loss": 0.7846, + "epoch": 2.3361344537815127, + "grad_norm": 1.6366849032923414, + "learning_rate": 1.8598548694443102e-05, + "loss": 0.185, "step": 139 }, { - "epoch": 1.6666666666666665, - "grad_norm": 12.580007166459941, - "learning_rate": 1.93920286889306e-05, - "loss": 0.7669, + "epoch": 2.3529411764705883, + "grad_norm": 3.3075808645565825, + "learning_rate": 1.8575078082775096e-05, + "loss": 0.272, "step": 140 }, { - "epoch": 1.6785714285714286, - "grad_norm": 1.4187041663248747, - "learning_rate": 1.938093915067453e-05, - "loss": 0.8152, + "epoch": 2.369747899159664, + "grad_norm": 2.271589816301807, + "learning_rate": 1.8551427630053464e-05, + "loss": 0.245, "step": 141 }, { - "epoch": 1.6904761904761905, - "grad_norm": 5.695537042553588, - "learning_rate": 1.936975261580699e-05, - "loss": 1.0575, + "epoch": 2.3865546218487395, + "grad_norm": 1.84180801145786, + "learning_rate": 1.8527597832287954e-05, + "loss": 0.2091, "step": 142 }, { - "epoch": 1.7023809523809523, - "grad_norm": 6.521896789730948, - "learning_rate": 1.9358469199994006e-05, - "loss": 0.5823, + "epoch": 2.403361344537815, + "grad_norm": 3.766252312196367, + "learning_rate": 1.8503589189249637e-05, + "loss": 0.2895, "step": 143 }, { - "epoch": 1.7142857142857144, - "grad_norm": 3.577556847930223, - "learning_rate": 1.9347089019903333e-05, - "loss": 0.8156, + "epoch": 2.4201680672268906, + "grad_norm": 1.873377028811922, + "learning_rate": 1.847940220446042e-05, + "loss": 0.2368, "step": 144 }, { - "epoch": 1.7261904761904763, - "grad_norm": 8.100269368760127, - "learning_rate": 1.9335612193203243e-05, - "loss": 0.1723, + "epoch": 2.4369747899159666, + "grad_norm": 4.545298766002122, + "learning_rate": 1.845503738518249e-05, + "loss": 0.1958, "step": 145 }, { - "epoch": 1.7380952380952381, - "grad_norm": 2.6780547969473156, - "learning_rate": 1.9324038838561317e-05, - "loss": 1.2078, + "epoch": 2.453781512605042, + "grad_norm": 3.1348145302552903, + "learning_rate": 1.843049524240766e-05, + "loss": 0.2098, "step": 146 }, { - "epoch": 1.75, - "grad_norm": 1.8870873473190792, - "learning_rate": 1.9312369075643197e-05, - "loss": 0.541, + "epoch": 2.4705882352941178, + "grad_norm": 1.6186560450788932, + "learning_rate": 1.8405776290846672e-05, + "loss": 0.1754, "step": 147 }, { - "epoch": 1.7619047619047619, - "grad_norm": 3.457663458870956, - "learning_rate": 1.9300603025111374e-05, - "loss": 0.8025, + "epoch": 2.4873949579831933, + "grad_norm": 3.382962591579758, + "learning_rate": 1.8380881048918406e-05, + "loss": 0.2035, "step": 148 }, { - "epoch": 1.7738095238095237, - "grad_norm": 2.63314035325873, - "learning_rate": 1.9288740808623923e-05, - "loss": 0.8417, + "epoch": 2.504201680672269, + "grad_norm": 2.623712266453748, + "learning_rate": 1.8355810038738986e-05, + "loss": 0.2092, "step": 149 }, { - "epoch": 1.7857142857142856, - "grad_norm": 4.406400740021301, - "learning_rate": 1.9276782548833262e-05, - "loss": 0.5055, + "epoch": 2.5210084033613445, + "grad_norm": 2.2054319523334676, + "learning_rate": 1.8330563786110837e-05, + "loss": 0.1898, "step": 150 }, { - "epoch": 1.7976190476190477, - "grad_norm": 2.400162986097046, - "learning_rate": 1.9264728369384867e-05, - "loss": 0.4728, + "epoch": 2.53781512605042, + "grad_norm": 4.2861344256493785, + "learning_rate": 1.830514282051166e-05, + "loss": 0.1886, "step": 151 }, { - "epoch": 1.8095238095238095, - "grad_norm": 2.097382923710725, - "learning_rate": 1.925257839491599e-05, - "loss": 0.8526, + "epoch": 2.5546218487394956, + "grad_norm": 2.651873913211827, + "learning_rate": 1.8279547675083343e-05, + "loss": 0.1824, "step": 152 }, { - "epoch": 1.8214285714285714, - "grad_norm": 3.554097093286924, - "learning_rate": 1.9240332751054397e-05, - "loss": 1.2327, + "epoch": 2.571428571428571, + "grad_norm": 1.8547906501685159, + "learning_rate": 1.8253778886620754e-05, + "loss": 0.1705, "step": 153 }, { - "epoch": 1.8333333333333335, - "grad_norm": 1.596640390294483, - "learning_rate": 1.9227991564417038e-05, - "loss": 0.73, + "epoch": 2.588235294117647, + "grad_norm": 3.9635058525876277, + "learning_rate": 1.822783699556049e-05, + "loss": 0.1838, "step": 154 }, { - "epoch": 1.8452380952380953, - "grad_norm": 7.712486381371494, - "learning_rate": 1.9215554962608757e-05, - "loss": 0.6713, + "epoch": 2.6050420168067228, + "grad_norm": 2.3948916184720606, + "learning_rate": 1.820172254596956e-05, + "loss": 0.1778, "step": 155 }, { - "epoch": 1.8571428571428572, - "grad_norm": 0.9406036356596837, - "learning_rate": 1.9203023074220976e-05, - "loss": 0.4108, + "epoch": 2.6218487394957983, + "grad_norm": 2.3071047956425264, + "learning_rate": 1.817543608553395e-05, + "loss": 0.1708, "step": 156 }, { - "epoch": 1.869047619047619, - "grad_norm": 1.2783324468370174, - "learning_rate": 1.919039602883035e-05, - "loss": 0.7852, + "epoch": 2.638655462184874, + "grad_norm": 2.108074459212226, + "learning_rate": 1.814897816554715e-05, + "loss": 0.1749, "step": 157 }, { - "epoch": 1.880952380952381, - "grad_norm": 1.9809209865395272, - "learning_rate": 1.917767395699742e-05, - "loss": 0.8275, + "epoch": 2.6554621848739495, + "grad_norm": 1.0692114014911545, + "learning_rate": 1.8122349340898596e-05, + "loss": 0.1373, "step": 158 }, { - "epoch": 1.8928571428571428, - "grad_norm": 3.256407393144481, - "learning_rate": 1.9164856990265304e-05, - "loss": 0.4972, + "epoch": 2.6722689075630255, + "grad_norm": 2.0009322090894592, + "learning_rate": 1.809555017006202e-05, + "loss": 0.1709, "step": 159 }, { - "epoch": 1.9047619047619047, - "grad_norm": 1.7699190444451984, - "learning_rate": 1.9151945261158295e-05, - "loss": 0.7818, + "epoch": 2.689075630252101, + "grad_norm": 2.1909981915140375, + "learning_rate": 1.8068581215083752e-05, + "loss": 0.1484, "step": 160 }, { - "epoch": 1.9166666666666665, - "grad_norm": 2.215298903394841, - "learning_rate": 1.9138938903180496e-05, - "loss": 1.2303, + "epoch": 2.7058823529411766, + "grad_norm": 2.103149621290993, + "learning_rate": 1.804144304157091e-05, + "loss": 0.1482, "step": 161 }, { - "epoch": 1.9285714285714286, - "grad_norm": 1.8042937103793464, - "learning_rate": 1.9125838050814472e-05, - "loss": 0.3921, + "epoch": 2.722689075630252, + "grad_norm": 1.5470172068234862, + "learning_rate": 1.8014136218679566e-05, + "loss": 0.1489, "step": 162 }, { - "epoch": 1.9404761904761905, - "grad_norm": 2.0556006635467066, - "learning_rate": 1.911264283951982e-05, - "loss": 1.1463, + "epoch": 2.7394957983193278, + "grad_norm": 1.8588243552375616, + "learning_rate": 1.7986661319102795e-05, + "loss": 0.1526, "step": 163 }, { - "epoch": 1.9523809523809523, - "grad_norm": 2.05906081878954, - "learning_rate": 1.9099353405731794e-05, - "loss": 0.7994, + "epoch": 2.7563025210084033, + "grad_norm": 1.1673797887420725, + "learning_rate": 1.7959018919058654e-05, + "loss": 0.1361, "step": 164 }, { - "epoch": 1.9642857142857144, - "grad_norm": 2.8775318413447897, - "learning_rate": 1.908596988685988e-05, - "loss": 0.0429, + "epoch": 2.773109243697479, + "grad_norm": 0.9871009564387679, + "learning_rate": 1.7931209598278117e-05, + "loss": 0.1168, "step": 165 }, { - "epoch": 1.9761904761904763, - "grad_norm": 2.19883503502942, - "learning_rate": 1.9072492421286384e-05, - "loss": 0.7907, + "epoch": 2.7899159663865545, + "grad_norm": 1.375240831135664, + "learning_rate": 1.7903233939992904e-05, + "loss": 0.1271, "step": 166 }, { - "epoch": 1.9880952380952381, - "grad_norm": 1.5000744356797195, - "learning_rate": 1.9058921148364996e-05, - "loss": 0.3933, + "epoch": 2.80672268907563, + "grad_norm": 1.1795213655735046, + "learning_rate": 1.787509253092326e-05, + "loss": 0.1092, "step": 167 }, { - "epoch": 2.0, - "grad_norm": 2.211057744430661, - "learning_rate": 1.904525620841935e-05, - "loss": 1.1364, - "step": 168 - }, - { - "epoch": 2.0, - "eval_loss": 0.6083582043647766, - "eval_runtime": 38.5831, - "eval_samples_per_second": 1.555, - "eval_steps_per_second": 1.555, + "epoch": 2.8235294117647056, + "grad_norm": 3.071368428139141, + "learning_rate": 1.784678596126563e-05, + "loss": 0.286, "step": 168 }, { - "epoch": 2.011904761904762, - "grad_norm": 1.2506294363329695, - "learning_rate": 1.9031497742741573e-05, - "loss": 0.3685, + "epoch": 2.8403361344537816, + "grad_norm": 1.0579777160189892, + "learning_rate": 1.78183148246803e-05, + "loss": 0.1197, "step": 169 }, { - "epoch": 2.0238095238095237, - "grad_norm": 2.1294740967375776, - "learning_rate": 1.9017645893590834e-05, - "loss": 0.4106, + "epoch": 2.857142857142857, + "grad_norm": 1.6336798015628837, + "learning_rate": 1.7789679718278944e-05, + "loss": 0.1282, "step": 170 }, { - "epoch": 2.0357142857142856, - "grad_norm": 2.195919937729915, - "learning_rate": 1.900370080419186e-05, - "loss": 1.0409, + "epoch": 2.8739495798319328, + "grad_norm": 1.8315662561658703, + "learning_rate": 1.7760881242612096e-05, + "loss": 0.1429, "step": 171 }, { - "epoch": 2.0476190476190474, - "grad_norm": 2.137987938056288, - "learning_rate": 1.8989662618733446e-05, - "loss": 1.0374, + "epoch": 2.8907563025210083, + "grad_norm": 2.9860725552192404, + "learning_rate": 1.773192000165655e-05, + "loss": 0.2833, "step": 172 }, { - "epoch": 2.0595238095238093, - "grad_norm": 2.321052769415701, - "learning_rate": 1.8975531482366998e-05, - "loss": 1.0739, + "epoch": 2.907563025210084, + "grad_norm": 1.04161581184864, + "learning_rate": 1.7702796602802705e-05, + "loss": 0.1042, "step": 173 }, { - "epoch": 2.0714285714285716, - "grad_norm": 4.17435976154088, - "learning_rate": 1.8961307541205003e-05, - "loss": 0.7877, + "epoch": 2.92436974789916, + "grad_norm": 1.2933586996454134, + "learning_rate": 1.7673511656841822e-05, + "loss": 0.1382, "step": 174 }, { - "epoch": 2.0833333333333335, - "grad_norm": 1.389768088500958, - "learning_rate": 1.8946990942319518e-05, - "loss": 0.7138, + "epoch": 2.9411764705882355, + "grad_norm": 1.8373790721021168, + "learning_rate": 1.7644065777953206e-05, + "loss": 0.2055, "step": 175 }, { - "epoch": 2.0952380952380953, - "grad_norm": 2.506177283623637, - "learning_rate": 1.8932581833740676e-05, - "loss": 0.3942, + "epoch": 2.957983193277311, + "grad_norm": 1.695542614817302, + "learning_rate": 1.7614459583691346e-05, + "loss": 0.144, "step": 176 }, { - "epoch": 2.107142857142857, - "grad_norm": 3.325446937404553, - "learning_rate": 1.8918080364455122e-05, - "loss": 0.7675, + "epoch": 2.9747899159663866, + "grad_norm": 1.2079374495539366, + "learning_rate": 1.758469369497293e-05, + "loss": 0.1286, "step": 177 }, { - "epoch": 2.119047619047619, - "grad_norm": 2.873764603659821, - "learning_rate": 1.8903486684404496e-05, - "loss": 0.4016, + "epoch": 2.991596638655462, + "grad_norm": 7.433145063552697, + "learning_rate": 1.7554768736063858e-05, + "loss": 0.2421, "step": 178 }, { - "epoch": 2.130952380952381, - "grad_norm": 2.081810368542138, - "learning_rate": 1.888880094448387e-05, - "loss": 0.9637, + "epoch": 2.991596638655462, + "eval_loss": 0.18344487249851227, + "eval_runtime": 97.2976, + "eval_samples_per_second": 0.966, + "eval_steps_per_second": 0.966, + "step": 178 + }, + { + "epoch": 3.008403361344538, + "grad_norm": 1.936304205658518, + "learning_rate": 1.7524685334566126e-05, + "loss": 0.0924, "step": 179 }, { - "epoch": 2.142857142857143, - "grad_norm": 4.506719205876918, - "learning_rate": 1.88740232965402e-05, - "loss": 0.1205, + "epoch": 3.0252100840336134, + "grad_norm": 3.65130263061498, + "learning_rate": 1.7494444121404673e-05, + "loss": 0.168, "step": 180 }, { - "epoch": 2.1547619047619047, - "grad_norm": 3.840347092360756, - "learning_rate": 1.8859153893370737e-05, - "loss": 0.0537, + "epoch": 3.042016806722689, + "grad_norm": 6.416735934730296, + "learning_rate": 1.746404573081415e-05, + "loss": 0.1856, "step": 181 }, { - "epoch": 2.1666666666666665, - "grad_norm": 2.620898271441886, - "learning_rate": 1.8844192888721473e-05, - "loss": 0.686, + "epoch": 3.0588235294117645, + "grad_norm": 2.2813578076866623, + "learning_rate": 1.7433490800325614e-05, + "loss": 0.1393, "step": 182 }, { - "epoch": 2.1785714285714284, - "grad_norm": 3.1614575918085066, - "learning_rate": 1.8829140437285525e-05, - "loss": 0.6877, + "epoch": 3.0756302521008405, + "grad_norm": 2.595639669498612, + "learning_rate": 1.7402779970753156e-05, + "loss": 0.1428, "step": 183 }, { - "epoch": 2.1904761904761907, - "grad_norm": 3.357401090981342, - "learning_rate": 1.8813996694701548e-05, - "loss": 0.106, + "epoch": 3.092436974789916, + "grad_norm": 1.767980230153596, + "learning_rate": 1.7371913886180473e-05, + "loss": 0.1331, "step": 184 }, { - "epoch": 2.2023809523809526, - "grad_norm": 2.641859959945962, - "learning_rate": 1.8798761817552135e-05, - "loss": 0.1825, + "epoch": 3.1092436974789917, + "grad_norm": 2.5213252449194252, + "learning_rate": 1.7340893193947342e-05, + "loss": 0.1252, "step": 185 }, { - "epoch": 2.2142857142857144, - "grad_norm": 1.3075201309355102, - "learning_rate": 1.8783435963362178e-05, - "loss": 0.6945, + "epoch": 3.1260504201680672, + "grad_norm": 2.1170186194258167, + "learning_rate": 1.7309718544636057e-05, + "loss": 0.1108, "step": 186 }, { - "epoch": 2.2261904761904763, - "grad_norm": 1.7495575930553038, - "learning_rate": 1.8768019290597254e-05, - "loss": 0.4308, + "epoch": 3.142857142857143, + "grad_norm": 2.0268322211129592, + "learning_rate": 1.7278390592057785e-05, + "loss": 0.1561, "step": 187 }, { - "epoch": 2.238095238095238, - "grad_norm": 3.735783992873633, - "learning_rate": 1.8752511958661977e-05, - "loss": 0.0707, + "epoch": 3.1596638655462184, + "grad_norm": 1.7261621681645014, + "learning_rate": 1.7246909993238844e-05, + "loss": 0.1246, "step": 188 }, { - "epoch": 2.25, - "grad_norm": 4.3626751100504455, - "learning_rate": 1.873691412789836e-05, - "loss": 0.6226, + "epoch": 3.176470588235294, + "grad_norm": 1.3950437881445872, + "learning_rate": 1.7215277408406932e-05, + "loss": 0.0998, "step": 189 }, { - "epoch": 2.261904761904762, - "grad_norm": 3.030076532866424, - "learning_rate": 1.8721225959584156e-05, - "loss": 0.3429, + "epoch": 3.19327731092437, + "grad_norm": 1.793336566476121, + "learning_rate": 1.7183493500977277e-05, + "loss": 0.128, "step": 190 }, { - "epoch": 2.2738095238095237, - "grad_norm": 4.25577167516674, - "learning_rate": 1.8705447615931172e-05, - "loss": 0.6827, + "epoch": 3.2100840336134455, + "grad_norm": 1.35402748785192, + "learning_rate": 1.7151558937538725e-05, + "loss": 0.0979, "step": 191 }, { - "epoch": 2.2857142857142856, - "grad_norm": 2.623136916804772, - "learning_rate": 1.8689579260083622e-05, - "loss": 1.0001, + "epoch": 3.226890756302521, + "grad_norm": 1.9875540888658763, + "learning_rate": 1.7119474387839764e-05, + "loss": 0.1477, "step": 192 }, { - "epoch": 2.2976190476190474, - "grad_norm": 4.114146115626149, - "learning_rate": 1.8673621056116405e-05, - "loss": 0.0817, + "epoch": 3.2436974789915967, + "grad_norm": 3.4010639220257755, + "learning_rate": 1.708724052477446e-05, + "loss": 0.2346, "step": 193 }, { - "epoch": 2.3095238095238093, - "grad_norm": 3.6811417292623867, - "learning_rate": 1.865757316903345e-05, - "loss": 0.4183, + "epoch": 3.2605042016806722, + "grad_norm": 1.1526809505800015, + "learning_rate": 1.7054858024368365e-05, + "loss": 0.1189, "step": 194 }, { - "epoch": 2.3214285714285716, - "grad_norm": 2.8700111469727236, - "learning_rate": 1.8641435764765966e-05, - "loss": 0.6386, + "epoch": 3.277310924369748, + "grad_norm": 1.8401498885713787, + "learning_rate": 1.7022327565764336e-05, + "loss": 0.1321, "step": 195 }, { - "epoch": 2.3333333333333335, - "grad_norm": 2.147973753973312, - "learning_rate": 1.8625209010170766e-05, - "loss": 0.7956, + "epoch": 3.2941176470588234, + "grad_norm": 1.823800189990081, + "learning_rate": 1.6989649831208286e-05, + "loss": 0.1276, "step": 196 }, { - "epoch": 2.3452380952380953, - "grad_norm": 2.1147626975453884, - "learning_rate": 1.8608893073028506e-05, - "loss": 0.0699, + "epoch": 3.310924369747899, + "grad_norm": 2.177879884931455, + "learning_rate": 1.6956825506034866e-05, + "loss": 0.1367, "step": 197 }, { - "epoch": 2.357142857142857, - "grad_norm": 4.759143787399854, - "learning_rate": 1.8592488122041987e-05, - "loss": 0.3981, + "epoch": 3.327731092436975, + "grad_norm": 4.377960471738762, + "learning_rate": 1.6923855278653114e-05, + "loss": 0.177, "step": 198 }, { - "epoch": 2.369047619047619, - "grad_norm": 3.446773840305837, - "learning_rate": 1.8575994326834378e-05, - "loss": 0.0768, + "epoch": 3.3445378151260505, + "grad_norm": 1.530590314148621, + "learning_rate": 1.6890739840532004e-05, + "loss": 0.1344, "step": 199 }, { - "epoch": 2.380952380952381, - "grad_norm": 1.7576609471238682, - "learning_rate": 1.8559411857947472e-05, - "loss": 0.6315, + "epoch": 3.361344537815126, + "grad_norm": 1.3469335679962628, + "learning_rate": 1.6857479886185942e-05, + "loss": 0.1052, "step": 200 }, { - "epoch": 2.392857142857143, - "grad_norm": 3.70255268224624, - "learning_rate": 1.8542740886839942e-05, - "loss": 0.4377, + "epoch": 3.3781512605042017, + "grad_norm": 1.6139041164207482, + "learning_rate": 1.682407611316021e-05, + "loss": 0.1475, "step": 201 }, { - "epoch": 2.4047619047619047, - "grad_norm": 4.380222821407496, - "learning_rate": 1.8525981585885537e-05, - "loss": 0.703, + "epoch": 3.3949579831932772, + "grad_norm": 1.6495915167747617, + "learning_rate": 1.6790529222016328e-05, + "loss": 0.0972, "step": 202 }, { - "epoch": 2.4166666666666665, - "grad_norm": 4.357939060601177, - "learning_rate": 1.8509134128371325e-05, - "loss": 0.8375, + "epoch": 3.411764705882353, + "grad_norm": 1.8398639057645052, + "learning_rate": 1.6756839916317358e-05, + "loss": 0.1084, "step": 203 }, { - "epoch": 2.4285714285714284, - "grad_norm": 5.09156824997242, - "learning_rate": 1.8492198688495884e-05, - "loss": 0.483, + "epoch": 3.4285714285714284, + "grad_norm": 2.0107317404674236, + "learning_rate": 1.672300890261317e-05, + "loss": 0.1102, "step": 204 }, { - "epoch": 2.4404761904761907, - "grad_norm": 1.2517867091759434, - "learning_rate": 1.8475175441367514e-05, - "loss": 0.5436, + "epoch": 3.4453781512605044, + "grad_norm": 1.9565606900612817, + "learning_rate": 1.6689036890425596e-05, + "loss": 0.1139, "step": 205 }, { - "epoch": 2.4523809523809526, - "grad_norm": 2.2431411923712408, - "learning_rate": 1.8458064563002417e-05, - "loss": 0.6378, + "epoch": 3.46218487394958, + "grad_norm": 1.639534786311468, + "learning_rate": 1.665492459223357e-05, + "loss": 0.0918, "step": 206 }, { - "epoch": 2.4642857142857144, - "grad_norm": 1.1954820415052247, - "learning_rate": 1.8440866230322877e-05, - "loss": 0.2742, + "epoch": 3.4789915966386555, + "grad_norm": 5.45699346508005, + "learning_rate": 1.6620672723458167e-05, + "loss": 0.1591, "step": 207 }, { - "epoch": 2.4761904761904763, - "grad_norm": 2.9995391650371923, - "learning_rate": 1.8423580621155447e-05, - "loss": 0.0413, + "epoch": 3.495798319327731, + "grad_norm": 2.0885082670769965, + "learning_rate": 1.658628200244763e-05, + "loss": 0.1118, "step": 208 }, { - "epoch": 2.488095238095238, - "grad_norm": 1.2324176262010005, - "learning_rate": 1.8406207914229083e-05, - "loss": 0.5359, + "epoch": 3.5126050420168067, + "grad_norm": 1.972708418564533, + "learning_rate": 1.6551753150462258e-05, + "loss": 0.1533, "step": 209 }, { - "epoch": 2.5, - "grad_norm": 2.299451434662873, - "learning_rate": 1.8388748289173315e-05, - "loss": 0.7862, + "epoch": 3.5294117647058822, + "grad_norm": 1.3414901525438894, + "learning_rate": 1.6517086891659335e-05, + "loss": 0.0973, "step": 210 }, { - "epoch": 2.511904761904762, - "grad_norm": 2.3447443905706438, - "learning_rate": 1.837120192651639e-05, - "loss": 0.7311, + "epoch": 3.546218487394958, + "grad_norm": 2.260816117595264, + "learning_rate": 1.6482283953077887e-05, + "loss": 0.1141, "step": 211 }, { - "epoch": 2.5238095238095237, - "grad_norm": 1.4847792611991646, - "learning_rate": 1.8353569007683397e-05, - "loss": 0.5814, + "epoch": 3.5630252100840334, + "grad_norm": 1.3728281964965443, + "learning_rate": 1.644734506462347e-05, + "loss": 0.1069, "step": 212 }, { - "epoch": 2.5357142857142856, - "grad_norm": 1.5842760285101158, - "learning_rate": 1.8335849714994394e-05, - "loss": 0.6283, + "epoch": 3.5798319327731094, + "grad_norm": 1.647487452421848, + "learning_rate": 1.641227095905286e-05, + "loss": 0.1317, "step": 213 }, { - "epoch": 2.5476190476190474, - "grad_norm": 1.8493596748650025, - "learning_rate": 1.8318044231662525e-05, - "loss": 0.3096, + "epoch": 3.596638655462185, + "grad_norm": 1.555286447904776, + "learning_rate": 1.637706237195867e-05, + "loss": 0.0997, "step": 214 }, { - "epoch": 2.5595238095238093, - "grad_norm": 2.0830391218860442, - "learning_rate": 1.8300152741792122e-05, - "loss": 0.7391, + "epoch": 3.6134453781512605, + "grad_norm": 1.4295747968285732, + "learning_rate": 1.6341720041753924e-05, + "loss": 0.1116, "step": 215 }, { - "epoch": 2.571428571428571, - "grad_norm": 1.532373441358143, - "learning_rate": 1.8282175430376804e-05, - "loss": 0.5466, + "epoch": 3.630252100840336, + "grad_norm": 1.4982199659580737, + "learning_rate": 1.6306244709656597e-05, + "loss": 0.1014, "step": 216 }, { - "epoch": 2.5833333333333335, - "grad_norm": 5.731342011284239, - "learning_rate": 1.826411248329757e-05, - "loss": 0.6475, + "epoch": 3.6470588235294117, + "grad_norm": 3.3567838619912513, + "learning_rate": 1.6270637119674023e-05, + "loss": 0.1758, "step": 217 }, { - "epoch": 2.5952380952380953, - "grad_norm": 12.481835165886968, - "learning_rate": 1.8245964087320856e-05, - "loss": 0.2665, + "epoch": 3.6638655462184873, + "grad_norm": 1.549085403651707, + "learning_rate": 1.6234898018587336e-05, + "loss": 0.0874, "step": 218 }, { - "epoch": 2.607142857142857, - "grad_norm": 3.513207130809279, - "learning_rate": 1.822773043009664e-05, - "loss": 0.3703, + "epoch": 3.6806722689075633, + "grad_norm": 2.1336886853349295, + "learning_rate": 1.6199028155935793e-05, + "loss": 0.1467, "step": 219 }, { - "epoch": 2.619047619047619, - "grad_norm": 7.210938305978828, - "learning_rate": 1.8209411700156457e-05, - "loss": 0.3699, + "epoch": 3.697478991596639, + "grad_norm": 2.306782696878204, + "learning_rate": 1.6163028284001034e-05, + "loss": 0.1186, "step": 220 }, { - "epoch": 2.630952380952381, - "grad_norm": 4.21198369122006, - "learning_rate": 1.8191008086911494e-05, - "loss": 0.3044, + "epoch": 3.7142857142857144, + "grad_norm": 1.520623845541843, + "learning_rate": 1.612689915779134e-05, + "loss": 0.0923, "step": 221 }, { - "epoch": 2.642857142857143, - "grad_norm": 8.206928947078168, - "learning_rate": 1.8172519780650594e-05, - "loss": 0.406, + "epoch": 3.73109243697479, + "grad_norm": 1.147756639869472, + "learning_rate": 1.6090641535025773e-05, + "loss": 0.0854, "step": 222 }, { - "epoch": 2.6547619047619047, - "grad_norm": 1.4773204257836823, - "learning_rate": 1.8153946972538315e-05, - "loss": 0.4871, + "epoch": 3.7478991596638656, + "grad_norm": 1.2212881766323158, + "learning_rate": 1.605425617611829e-05, + "loss": 0.0868, "step": 223 }, { - "epoch": 2.6666666666666665, - "grad_norm": 2.461150337006145, - "learning_rate": 1.8135289854612942e-05, - "loss": 0.6975, + "epoch": 3.764705882352941, + "grad_norm": 1.374752115412969, + "learning_rate": 1.6017743844161802e-05, + "loss": 0.0857, "step": 224 }, { - "epoch": 2.678571428571429, - "grad_norm": 1.5762960515761975, - "learning_rate": 1.8116548619784497e-05, - "loss": 0.4559, + "epoch": 3.7815126050420167, + "grad_norm": 1.0655866618063339, + "learning_rate": 1.598110530491216e-05, + "loss": 0.0915, "step": 225 }, { - "epoch": 2.6904761904761907, - "grad_norm": 8.566362905559712, - "learning_rate": 1.8097723461832762e-05, - "loss": 0.4719, + "epoch": 3.7983193277310923, + "grad_norm": 2.573005569504097, + "learning_rate": 1.5944341326772112e-05, + "loss": 0.0879, "step": 226 }, { - "epoch": 2.7023809523809526, - "grad_norm": 1.5057487839402208, - "learning_rate": 1.8078814575405245e-05, - "loss": 0.4647, + "epoch": 3.815126050420168, + "grad_norm": 7.560760061208962, + "learning_rate": 1.5907452680775164e-05, + "loss": 0.2092, "step": 227 }, { - "epoch": 2.7142857142857144, - "grad_norm": 1.4988719026370554, - "learning_rate": 1.8059822156015208e-05, - "loss": 0.4375, + "epoch": 3.831932773109244, + "grad_norm": 1.9398630844655902, + "learning_rate": 1.587044014056943e-05, + "loss": 0.1255, "step": 228 }, { - "epoch": 2.7261904761904763, - "grad_norm": 8.453323259002874, - "learning_rate": 1.8040746400039604e-05, - "loss": 0.4722, + "epoch": 3.8487394957983194, + "grad_norm": 1.6555395105411446, + "learning_rate": 1.583330448240139e-05, + "loss": 0.104, "step": 229 }, { - "epoch": 2.738095238095238, - "grad_norm": 0.9029433541228935, - "learning_rate": 1.802158750471708e-05, - "loss": 0.2602, + "epoch": 3.865546218487395, + "grad_norm": 2.7598515858574775, + "learning_rate": 1.5796046485099633e-05, + "loss": 0.1696, "step": 230 }, { - "epoch": 2.75, - "grad_norm": 7.26322817577555, - "learning_rate": 1.8002345668145916e-05, - "loss": 0.4212, + "epoch": 3.8823529411764706, + "grad_norm": 2.3775725159616377, + "learning_rate": 1.57586669300585e-05, + "loss": 0.1144, "step": 231 }, { - "epoch": 2.761904761904762, - "grad_norm": 1.82321572656156, - "learning_rate": 1.7983021089281983e-05, - "loss": 0.0316, + "epoch": 3.899159663865546, + "grad_norm": 7.982198513719052, + "learning_rate": 1.5721166601221697e-05, + "loss": 0.1838, "step": 232 }, { - "epoch": 2.7738095238095237, - "grad_norm": 1.7568134018279924, - "learning_rate": 1.7963613967936693e-05, - "loss": 0.2427, + "epoch": 3.9159663865546217, + "grad_norm": 2.9062227838430847, + "learning_rate": 1.5683546285065878e-05, + "loss": 0.1136, "step": 233 }, { - "epoch": 2.7857142857142856, - "grad_norm": 4.452715421557287, - "learning_rate": 1.794412450477492e-05, - "loss": 0.1355, + "epoch": 3.9327731092436977, + "grad_norm": 2.4135228480944035, + "learning_rate": 1.5645806770584122e-05, + "loss": 0.0948, "step": 234 }, { - "epoch": 2.7976190476190474, - "grad_norm": 1.9661146108992258, - "learning_rate": 1.7924552901312943e-05, - "loss": 0.4154, + "epoch": 3.9495798319327733, + "grad_norm": 3.0055227991300333, + "learning_rate": 1.5607948849269404e-05, + "loss": 0.1268, "step": 235 }, { - "epoch": 2.8095238095238093, - "grad_norm": 3.0643788562396588, - "learning_rate": 1.7904899359916337e-05, - "loss": 0.2914, + "epoch": 3.966386554621849, + "grad_norm": 5.10158046985291, + "learning_rate": 1.5569973315097985e-05, + "loss": 0.152, "step": 236 }, { - "epoch": 2.821428571428571, - "grad_norm": 1.4486771783750234, - "learning_rate": 1.788516408379791e-05, - "loss": 0.3255, + "epoch": 3.9831932773109244, + "grad_norm": 2.436291138010729, + "learning_rate": 1.5531880964512773e-05, + "loss": 0.1158, "step": 237 }, { - "epoch": 2.8333333333333335, - "grad_norm": 3.6986784345856796, - "learning_rate": 1.7865347277015575e-05, - "loss": 0.0578, + "epoch": 4.0, + "grad_norm": 4.281661543875596, + "learning_rate": 1.54936725964066e-05, + "loss": 0.1045, + "step": 238 + }, + { + "epoch": 4.0, + "eval_loss": 0.12479228526353836, + "eval_runtime": 94.0271, + "eval_samples_per_second": 1.0, + "eval_steps_per_second": 1.0, "step": 238 }, { - "epoch": 2.8452380952380953, - "grad_norm": 2.445003348278355, - "learning_rate": 1.7845449144470256e-05, - "loss": 0.5247, + "epoch": 4.016806722689076, + "grad_norm": 1.912176029492126, + "learning_rate": 1.5455349012105488e-05, + "loss": 0.135, "step": 239 }, { - "epoch": 2.857142857142857, - "grad_norm": 1.9974207748375905, - "learning_rate": 1.7825469891903768e-05, - "loss": 0.6317, + "epoch": 4.033613445378151, + "grad_norm": 2.215120261832349, + "learning_rate": 1.5416911015351827e-05, + "loss": 0.127, "step": 240 }, { - "epoch": 2.869047619047619, - "grad_norm": 3.4932838771747425, - "learning_rate": 1.7805409725896687e-05, - "loss": 0.0541, + "epoch": 4.050420168067227, + "grad_norm": 1.52744144609313, + "learning_rate": 1.5378359412287537e-05, + "loss": 0.1167, "step": 241 }, { - "epoch": 2.880952380952381, - "grad_norm": 2.823086359189426, - "learning_rate": 1.7785268853866213e-05, - "loss": 0.34, + "epoch": 4.067226890756302, + "grad_norm": 1.9530294241000907, + "learning_rate": 1.5339695011437128e-05, + "loss": 0.1071, "step": 242 }, { - "epoch": 2.892857142857143, - "grad_norm": 1.834893296841028, - "learning_rate": 1.776504748406402e-05, - "loss": 0.074, + "epoch": 4.084033613445378, + "grad_norm": 2.215735360838886, + "learning_rate": 1.530091862369078e-05, + "loss": 0.1498, "step": 243 }, { - "epoch": 2.9047619047619047, - "grad_norm": 1.5422190408413234, - "learning_rate": 1.7744745825574123e-05, - "loss": 0.2529, + "epoch": 4.100840336134453, + "grad_norm": 2.2899623389238246, + "learning_rate": 1.526203106228733e-05, + "loss": 0.1452, "step": 244 }, { - "epoch": 2.9166666666666665, - "grad_norm": 2.126497256827208, - "learning_rate": 1.772436408831069e-05, - "loss": 0.414, + "epoch": 4.117647058823529, + "grad_norm": 2.366322124903928, + "learning_rate": 1.5223033142797183e-05, + "loss": 0.0997, "step": 245 }, { - "epoch": 2.928571428571429, - "grad_norm": 1.6106901139166283, - "learning_rate": 1.770390248301589e-05, - "loss": 0.2728, + "epoch": 4.1344537815126055, + "grad_norm": 2.180155804240841, + "learning_rate": 1.5183925683105254e-05, + "loss": 0.1103, "step": 246 }, { - "epoch": 2.9404761904761907, - "grad_norm": 2.544109776535844, - "learning_rate": 1.7683361221257705e-05, - "loss": 0.6377, + "epoch": 4.151260504201681, + "grad_norm": 1.9915581272469869, + "learning_rate": 1.5144709503393773e-05, + "loss": 0.1178, "step": 247 }, { - "epoch": 2.9523809523809526, - "grad_norm": 3.931188529393615, - "learning_rate": 1.7662740515427753e-05, - "loss": 0.5146, + "epoch": 4.168067226890757, + "grad_norm": 2.0239687647120905, + "learning_rate": 1.5105385426125123e-05, + "loss": 0.0996, "step": 248 }, { - "epoch": 2.9642857142857144, - "grad_norm": 20.724460916802276, - "learning_rate": 1.764204057873907e-05, - "loss": 0.8433, + "epoch": 4.184873949579832, + "grad_norm": 1.86641722627037, + "learning_rate": 1.5065954276024561e-05, + "loss": 0.1104, "step": 249 }, { - "epoch": 2.9761904761904763, - "grad_norm": 3.8203367792590024, - "learning_rate": 1.762126162522393e-05, - "loss": 0.0734, + "epoch": 4.201680672268908, + "grad_norm": 1.6792110703483962, + "learning_rate": 1.5026416880062932e-05, + "loss": 0.1237, "step": 250 }, { - "epoch": 2.988095238095238, - "grad_norm": 1.4135137345241042, - "learning_rate": 1.760040386973162e-05, - "loss": 0.3819, + "epoch": 4.218487394957983, + "grad_norm": 1.2608521044993408, + "learning_rate": 1.4986774067439327e-05, + "loss": 0.1011, "step": 251 }, { - "epoch": 3.0, - "grad_norm": 3.288964040896793, - "learning_rate": 1.7579467527926223e-05, - "loss": 0.513, - "step": 252 - }, - { - "epoch": 3.0, - "eval_loss": 0.37594377994537354, - "eval_runtime": 38.8272, - "eval_samples_per_second": 1.545, - "eval_steps_per_second": 1.545, + "epoch": 4.235294117647059, + "grad_norm": 2.5144065390038617, + "learning_rate": 1.4947026669563687e-05, + "loss": 0.1515, "step": 252 }, { - "epoch": 3.011904761904762, - "grad_norm": 2.2658309314703837, - "learning_rate": 1.7558452816284374e-05, - "loss": 0.2167, + "epoch": 4.2521008403361344, + "grad_norm": 1.5804219840273723, + "learning_rate": 1.4907175520039381e-05, + "loss": 0.1163, "step": 253 }, { - "epoch": 3.0238095238095237, - "grad_norm": 2.171321063645936, - "learning_rate": 1.7537359952093046e-05, - "loss": 0.2767, + "epoch": 4.26890756302521, + "grad_norm": 2.2244097182086273, + "learning_rate": 1.4867221454645696e-05, + "loss": 0.1188, "step": 254 }, { - "epoch": 3.0357142857142856, - "grad_norm": 3.224696150470818, - "learning_rate": 1.7516189153447283e-05, - "loss": 0.3851, + "epoch": 4.285714285714286, + "grad_norm": 1.64104407678126, + "learning_rate": 1.482716531132034e-05, + "loss": 0.0965, "step": 255 }, { - "epoch": 3.0476190476190474, - "grad_norm": 3.3132846666505413, - "learning_rate": 1.7494940639247953e-05, - "loss": 0.0783, + "epoch": 4.302521008403361, + "grad_norm": 0.8551932780454022, + "learning_rate": 1.4787007930141841e-05, + "loss": 0.0655, "step": 256 }, { - "epoch": 3.0595238095238093, - "grad_norm": 3.282432290241069, - "learning_rate": 1.747361462919949e-05, - "loss": 0.5529, + "epoch": 4.319327731092437, + "grad_norm": 1.752648892273715, + "learning_rate": 1.4746750153311951e-05, + "loss": 0.0946, "step": 257 }, { - "epoch": 3.0714285714285716, - "grad_norm": 4.303685952728001, - "learning_rate": 1.74522113438076e-05, - "loss": 0.2379, + "epoch": 4.336134453781512, + "grad_norm": 5.829984337336428, + "learning_rate": 1.4706392825137962e-05, + "loss": 0.1826, "step": 258 }, { - "epoch": 3.0833333333333335, - "grad_norm": 3.1849988371458466, - "learning_rate": 1.7430731004377016e-05, - "loss": 0.3984, + "epoch": 4.352941176470588, + "grad_norm": 2.302171396641537, + "learning_rate": 1.4665936792015021e-05, + "loss": 0.0909, "step": 259 }, { - "epoch": 3.0952380952380953, - "grad_norm": 6.847041494024382, - "learning_rate": 1.7409173833009184e-05, - "loss": 0.5108, + "epoch": 4.369747899159664, + "grad_norm": 1.2086828157662006, + "learning_rate": 1.4625382902408356e-05, + "loss": 0.0972, "step": 260 }, { - "epoch": 3.107142857142857, - "grad_norm": 2.758331616979815, - "learning_rate": 1.738754005259997e-05, - "loss": 0.0378, + "epoch": 4.38655462184874, + "grad_norm": 1.3490901275333183, + "learning_rate": 1.4584732006835495e-05, + "loss": 0.0823, "step": 261 }, { - "epoch": 3.119047619047619, - "grad_norm": 4.627462090057032, - "learning_rate": 1.7365829886837355e-05, - "loss": 0.0942, + "epoch": 4.4033613445378155, + "grad_norm": 3.3951376429875393, + "learning_rate": 1.4543984957848438e-05, + "loss": 0.1492, "step": 262 }, { - "epoch": 3.130952380952381, - "grad_norm": 4.2867703712422305, - "learning_rate": 1.7344043560199137e-05, - "loss": 0.4127, + "epoch": 4.420168067226891, + "grad_norm": 2.4333139248557165, + "learning_rate": 1.4503142610015751e-05, + "loss": 0.0974, "step": 263 }, { - "epoch": 3.142857142857143, - "grad_norm": 3.324654892695759, - "learning_rate": 1.7322181297950595e-05, - "loss": 0.4679, + "epoch": 4.436974789915967, + "grad_norm": 1.818247800901397, + "learning_rate": 1.4462205819904658e-05, + "loss": 0.0813, "step": 264 }, { - "epoch": 3.1547619047619047, - "grad_norm": 1.501271013583071, - "learning_rate": 1.7300243326142157e-05, - "loss": 0.3555, + "epoch": 4.453781512605042, + "grad_norm": 2.761939597967942, + "learning_rate": 1.4421175446063086e-05, + "loss": 0.1356, "step": 265 }, { - "epoch": 3.1666666666666665, - "grad_norm": 2.9950158939240867, - "learning_rate": 1.7278229871607083e-05, - "loss": 0.3781, + "epoch": 4.470588235294118, + "grad_norm": 2.3235891826621895, + "learning_rate": 1.4380052349001647e-05, + "loss": 0.1393, "step": 266 }, { - "epoch": 3.1785714285714284, - "grad_norm": 1.4907573955706444, - "learning_rate": 1.7256141161959087e-05, - "loss": 0.3505, + "epoch": 4.487394957983193, + "grad_norm": 1.4778394471726972, + "learning_rate": 1.4338837391175582e-05, + "loss": 0.1124, "step": 267 }, { - "epoch": 3.1904761904761907, - "grad_norm": 2.66786279641303, - "learning_rate": 1.7233977425590015e-05, - "loss": 0.3757, + "epoch": 4.504201680672269, + "grad_norm": 1.6212161850248412, + "learning_rate": 1.42975314369667e-05, + "loss": 0.0834, "step": 268 }, { - "epoch": 3.2023809523809526, - "grad_norm": 3.7324135967195082, - "learning_rate": 1.7211738891667474e-05, - "loss": 0.2186, + "epoch": 4.5210084033613445, + "grad_norm": 1.513176697844455, + "learning_rate": 1.4256135352665217e-05, + "loss": 0.1053, "step": 269 }, { - "epoch": 3.2142857142857144, - "grad_norm": 0.8930591045310717, - "learning_rate": 1.7189425790132452e-05, - "loss": 0.1605, + "epoch": 4.53781512605042, + "grad_norm": 1.5842489277719052, + "learning_rate": 1.4214650006451622e-05, + "loss": 0.0793, "step": 270 }, { - "epoch": 3.2261904761904763, - "grad_norm": 2.5813547947324644, - "learning_rate": 1.7167038351696947e-05, - "loss": 0.5126, + "epoch": 4.554621848739496, + "grad_norm": 2.882811290847085, + "learning_rate": 1.4173076268378443e-05, + "loss": 0.0944, "step": 271 }, { - "epoch": 3.238095238095238, - "grad_norm": 0.9752237664543323, - "learning_rate": 1.7144576807841583e-05, - "loss": 0.1811, + "epoch": 4.571428571428571, + "grad_norm": 2.540707077443368, + "learning_rate": 1.4131415010352007e-05, + "loss": 0.0929, "step": 272 }, { - "epoch": 3.25, - "grad_norm": 0.9647196518957709, - "learning_rate": 1.7122041390813228e-05, - "loss": 0.0154, + "epoch": 4.588235294117647, + "grad_norm": 2.224835930797884, + "learning_rate": 1.408966710611416e-05, + "loss": 0.0965, "step": 273 }, { - "epoch": 3.261904761904762, - "grad_norm": 2.550395344630908, - "learning_rate": 1.709943233362256e-05, - "loss": 0.5381, + "epoch": 4.605042016806722, + "grad_norm": 1.603233390085239, + "learning_rate": 1.4047833431223938e-05, + "loss": 0.1082, "step": 274 }, { - "epoch": 3.2738095238095237, - "grad_norm": 0.9538094415143878, - "learning_rate": 1.7076749870041692e-05, - "loss": 0.1486, + "epoch": 4.621848739495798, + "grad_norm": 1.7397244673026468, + "learning_rate": 1.4005914863039203e-05, + "loss": 0.0765, "step": 275 }, { - "epoch": 3.2857142857142856, - "grad_norm": 5.604529448927459, - "learning_rate": 1.7053994234601736e-05, - "loss": 0.4003, + "epoch": 4.6386554621848735, + "grad_norm": 1.9121905502244794, + "learning_rate": 1.3963912280698238e-05, + "loss": 0.0852, "step": 276 }, { - "epoch": 3.2976190476190474, - "grad_norm": 9.077045722590805, - "learning_rate": 1.703116566259039e-05, - "loss": 0.408, + "epoch": 4.65546218487395, + "grad_norm": 1.8233199956684913, + "learning_rate": 1.3921826565101325e-05, + "loss": 0.104, "step": 277 }, { - "epoch": 3.3095238095238093, - "grad_norm": 3.300132707260137, - "learning_rate": 1.7008264390049493e-05, - "loss": 0.2426, + "epoch": 4.6722689075630255, + "grad_norm": 0.7406392209229766, + "learning_rate": 1.3879658598892254e-05, + "loss": 0.0597, "step": 278 }, { - "epoch": 3.3214285714285716, - "grad_norm": 2.0242722270180527, - "learning_rate": 1.6985290653772583e-05, - "loss": 0.1482, + "epoch": 4.689075630252101, + "grad_norm": 7.419162565626874, + "learning_rate": 1.3837409266439818e-05, + "loss": 0.209, "step": 279 }, { - "epoch": 3.3333333333333335, - "grad_norm": 2.8369343983065294, - "learning_rate": 1.6962244691302466e-05, - "loss": 0.1434, + "epoch": 4.705882352941177, + "grad_norm": 1.1738246179937548, + "learning_rate": 1.3795079453819276e-05, + "loss": 0.0996, "step": 280 }, { - "epoch": 3.3452380952380953, - "grad_norm": 7.081419291772338, - "learning_rate": 1.6939126740928745e-05, - "loss": 0.1271, + "epoch": 4.722689075630252, + "grad_norm": 0.7316310386069824, + "learning_rate": 1.3752670048793744e-05, + "loss": 0.0573, "step": 281 }, { - "epoch": 3.357142857142857, - "grad_norm": 2.3722193904935716, - "learning_rate": 1.691593704168536e-05, - "loss": 0.2892, + "epoch": 4.739495798319328, + "grad_norm": 1.0245543216679647, + "learning_rate": 1.37101819407956e-05, + "loss": 0.0798, "step": 282 }, { - "epoch": 3.369047619047619, - "grad_norm": 6.960729490454029, - "learning_rate": 1.6892675833348124e-05, - "loss": 0.1294, + "epoch": 4.756302521008403, + "grad_norm": 1.2604913385475165, + "learning_rate": 1.366761602090782e-05, + "loss": 0.082, "step": 283 }, { - "epoch": 3.380952380952381, - "grad_norm": 1.1500777231564219, - "learning_rate": 1.686934335643222e-05, - "loss": 0.2509, + "epoch": 4.773109243697479, + "grad_norm": 1.6057488873958579, + "learning_rate": 1.3624973181845302e-05, + "loss": 0.0765, "step": 284 }, { - "epoch": 3.392857142857143, - "grad_norm": 1.5870567460181373, - "learning_rate": 1.684593985218974e-05, - "loss": 0.2563, + "epoch": 4.7899159663865545, + "grad_norm": 2.0379166749602895, + "learning_rate": 1.3582254317936117e-05, + "loss": 0.1001, "step": 285 }, { - "epoch": 3.4047619047619047, - "grad_norm": 5.098839245282105, - "learning_rate": 1.6822465562607186e-05, - "loss": 0.095, + "epoch": 4.80672268907563, + "grad_norm": 1.7603012476865616, + "learning_rate": 1.3539460325102779e-05, + "loss": 0.1126, "step": 286 }, { - "epoch": 3.4166666666666665, - "grad_norm": 4.33924245316836, - "learning_rate": 1.6798920730402962e-05, - "loss": 0.1761, + "epoch": 4.823529411764706, + "grad_norm": 1.4253082853875376, + "learning_rate": 1.349659210084344e-05, + "loss": 0.0978, "step": 287 }, { - "epoch": 3.4285714285714284, - "grad_norm": 1.4460187648094192, - "learning_rate": 1.6775305599024853e-05, - "loss": 0.2282, + "epoch": 4.840336134453781, + "grad_norm": 1.1284525438619117, + "learning_rate": 1.3453650544213078e-05, + "loss": 0.0695, "step": 288 }, { - "epoch": 3.4404761904761907, - "grad_norm": 2.984916897434454, - "learning_rate": 1.675162041264754e-05, - "loss": 0.0698, + "epoch": 4.857142857142857, + "grad_norm": 0.9802953497383712, + "learning_rate": 1.3410636555804634e-05, + "loss": 0.0751, "step": 289 }, { - "epoch": 3.4523809523809526, - "grad_norm": 5.808631040149753, - "learning_rate": 1.6727865416170032e-05, - "loss": 0.0843, + "epoch": 4.873949579831933, + "grad_norm": 1.3612858881668406, + "learning_rate": 1.3367551037730129e-05, + "loss": 0.0966, "step": 290 }, { - "epoch": 3.4642857142857144, - "grad_norm": 2.666294292390359, - "learning_rate": 1.6704040855213182e-05, - "loss": 0.0391, + "epoch": 4.890756302521009, + "grad_norm": 1.0157890782271555, + "learning_rate": 1.3324394893601734e-05, + "loss": 0.059, "step": 291 }, { - "epoch": 3.4761904761904763, - "grad_norm": 2.4526472332810165, - "learning_rate": 1.6680146976117105e-05, - "loss": 0.393, + "epoch": 4.907563025210084, + "grad_norm": 1.4054910887816698, + "learning_rate": 1.3281169028512838e-05, + "loss": 0.0709, "step": 292 }, { - "epoch": 3.488095238095238, - "grad_norm": 5.063252926047181, - "learning_rate": 1.6656184025938654e-05, - "loss": 0.3265, + "epoch": 4.92436974789916, + "grad_norm": 0.8662398448794504, + "learning_rate": 1.3237874349019041e-05, + "loss": 0.0756, "step": 293 }, { - "epoch": 3.5, - "grad_norm": 0.7680043862411393, - "learning_rate": 1.663215225244886e-05, - "loss": 0.0138, + "epoch": 4.9411764705882355, + "grad_norm": 1.3483246839258962, + "learning_rate": 1.319451176311917e-05, + "loss": 0.0769, "step": 294 }, { - "epoch": 3.511904761904762, - "grad_norm": 2.3680085112002307, - "learning_rate": 1.6608051904130375e-05, - "loss": 0.4016, + "epoch": 4.957983193277311, + "grad_norm": 2.4153558320453614, + "learning_rate": 1.315108218023621e-05, + "loss": 0.1131, "step": 295 }, { - "epoch": 3.5238095238095237, - "grad_norm": 1.9514276941132964, - "learning_rate": 1.6583883230174884e-05, - "loss": 0.1216, + "epoch": 4.974789915966387, + "grad_norm": 2.555834294764848, + "learning_rate": 1.3107586511198243e-05, + "loss": 0.141, "step": 296 }, { - "epoch": 3.5357142857142856, - "grad_norm": 1.9177647290193789, - "learning_rate": 1.6559646480480563e-05, - "loss": 0.2042, + "epoch": 4.991596638655462, + "grad_norm": 4.147522093497237, + "learning_rate": 1.306402566821935e-05, + "loss": 0.1456, + "step": 297 + }, + { + "epoch": 4.991596638655462, + "eval_loss": 0.1449918895959854, + "eval_runtime": 97.1487, + "eval_samples_per_second": 0.968, + "eval_steps_per_second": 0.968, "step": 297 }, { - "epoch": 3.5476190476190474, - "grad_norm": 11.831680317369488, - "learning_rate": 1.6535341905649453e-05, - "loss": 0.3236, + "epoch": 5.008403361344538, + "grad_norm": 1.6155708244562734, + "learning_rate": 1.302040056488047e-05, + "loss": 0.0717, "step": 298 }, { - "epoch": 3.5595238095238093, - "grad_norm": 0.854538769203965, - "learning_rate": 1.651096975698491e-05, - "loss": 0.1621, + "epoch": 5.025210084033613, + "grad_norm": 1.144238074710071, + "learning_rate": 1.297671211611025e-05, + "loss": 0.0719, "step": 299 }, { - "epoch": 3.571428571428571, - "grad_norm": 1.7325718951049507, - "learning_rate": 1.648653028648897e-05, - "loss": 0.3236, + "epoch": 5.042016806722689, + "grad_norm": 2.0576085887744795, + "learning_rate": 1.2932961238165837e-05, + "loss": 0.1152, "step": 300 }, { - "epoch": 3.5833333333333335, - "grad_norm": 1.6802180110167104, - "learning_rate": 1.6462023746859774e-05, - "loss": 0.1567, + "epoch": 5.0588235294117645, + "grad_norm": 1.0604218669017003, + "learning_rate": 1.2889148848613695e-05, + "loss": 0.0712, "step": 301 }, { - "epoch": 3.5952380952380953, - "grad_norm": 3.275665572161553, - "learning_rate": 1.6437450391488928e-05, - "loss": 0.3051, + "epoch": 5.07563025210084, + "grad_norm": 1.1699347626659127, + "learning_rate": 1.2845275866310325e-05, + "loss": 0.061, "step": 302 }, { - "epoch": 3.607142857142857, - "grad_norm": 0.8259080343432141, - "learning_rate": 1.6412810474458906e-05, - "loss": 0.1376, + "epoch": 5.092436974789916, + "grad_norm": 1.1371190586967697, + "learning_rate": 1.2801343211383021e-05, + "loss": 0.0747, "step": 303 }, { - "epoch": 3.619047619047619, - "grad_norm": 3.0586732720031526, - "learning_rate": 1.6388104250540414e-05, - "loss": 0.121, + "epoch": 5.109243697478991, + "grad_norm": 1.8857383359198918, + "learning_rate": 1.2757351805210557e-05, + "loss": 0.1447, "step": 304 }, { - "epoch": 3.630952380952381, - "grad_norm": 2.473548405954027, - "learning_rate": 1.6363331975189748e-05, - "loss": 0.2078, + "epoch": 5.126050420168067, + "grad_norm": 1.0626942422062413, + "learning_rate": 1.2713302570403872e-05, + "loss": 0.0657, "step": 305 }, { - "epoch": 3.642857142857143, - "grad_norm": 3.2034115805081163, - "learning_rate": 1.6338493904546163e-05, - "loss": 0.1119, + "epoch": 5.142857142857143, + "grad_norm": 1.3367342967159013, + "learning_rate": 1.2669196430786715e-05, + "loss": 0.086, "step": 306 }, { - "epoch": 3.6547619047619047, - "grad_norm": 1.735238565417923, - "learning_rate": 1.6313590295429224e-05, - "loss": 0.0482, + "epoch": 5.159663865546219, + "grad_norm": 4.036414956554653, + "learning_rate": 1.2625034311376276e-05, + "loss": 0.1317, "step": 307 }, { - "epoch": 3.6666666666666665, - "grad_norm": 1.7705110125878167, - "learning_rate": 1.6288621405336144e-05, - "loss": 0.167, + "epoch": 5.176470588235294, + "grad_norm": 2.092631890687451, + "learning_rate": 1.258081713836378e-05, + "loss": 0.0958, "step": 308 }, { - "epoch": 3.678571428571429, - "grad_norm": 1.350904255609373, - "learning_rate": 1.6263587492439127e-05, - "loss": 0.1935, + "epoch": 5.19327731092437, + "grad_norm": 2.470224875937704, + "learning_rate": 1.2536545839095074e-05, + "loss": 0.0764, "step": 309 }, { - "epoch": 3.6904761904761907, - "grad_norm": 5.116050315328356, - "learning_rate": 1.6238488815582693e-05, - "loss": 0.1613, + "epoch": 5.2100840336134455, + "grad_norm": 2.6817499179267483, + "learning_rate": 1.2492221342051153e-05, + "loss": 0.1124, "step": 310 }, { - "epoch": 3.7023809523809526, - "grad_norm": 4.879662048621329, - "learning_rate": 1.6213325634281017e-05, - "loss": 0.2637, + "epoch": 5.226890756302521, + "grad_norm": 2.000873702339644, + "learning_rate": 1.2447844576828719e-05, + "loss": 0.1067, "step": 311 }, { - "epoch": 3.7142857142857144, - "grad_norm": 8.743930661693144, - "learning_rate": 1.6188098208715216e-05, - "loss": 0.3958, + "epoch": 5.243697478991597, + "grad_norm": 2.039025163817311, + "learning_rate": 1.2403416474120657e-05, + "loss": 0.0703, "step": 312 }, { - "epoch": 3.7261904761904763, - "grad_norm": 14.84650411290692, - "learning_rate": 1.6162806799730694e-05, - "loss": 0.3667, + "epoch": 5.260504201680672, + "grad_norm": 1.2573161613968584, + "learning_rate": 1.2358937965696538e-05, + "loss": 0.0694, "step": 313 }, { - "epoch": 3.738095238095238, - "grad_norm": 3.343827891489822, - "learning_rate": 1.6137451668834415e-05, - "loss": 0.1576, + "epoch": 5.277310924369748, + "grad_norm": 1.6533379980735152, + "learning_rate": 1.2314409984383066e-05, + "loss": 0.0828, "step": 314 }, { - "epoch": 3.75, - "grad_norm": 2.1179290591609656, - "learning_rate": 1.6112033078192225e-05, - "loss": 0.1657, + "epoch": 5.294117647058823, + "grad_norm": 2.0132269342271467, + "learning_rate": 1.2269833464044514e-05, + "loss": 0.064, "step": 315 }, { - "epoch": 3.761904761904762, - "grad_norm": 9.300028025529468, - "learning_rate": 1.6086551290626117e-05, - "loss": 0.162, + "epoch": 5.310924369747899, + "grad_norm": 1.6870312479105358, + "learning_rate": 1.2225209339563144e-05, + "loss": 0.1093, "step": 316 }, { - "epoch": 3.7738095238095237, - "grad_norm": 2.974584952079088, - "learning_rate": 1.6061006569611524e-05, - "loss": 0.1714, + "epoch": 5.3277310924369745, + "grad_norm": 1.2264520173137188, + "learning_rate": 1.2180538546819595e-05, + "loss": 0.0844, "step": 317 }, { - "epoch": 3.7857142857142856, - "grad_norm": 1.3270086319514083, - "learning_rate": 1.6035399179274604e-05, - "loss": 0.2454, + "epoch": 5.34453781512605, + "grad_norm": 1.6926310927686044, + "learning_rate": 1.2135822022673263e-05, + "loss": 0.0663, "step": 318 }, { - "epoch": 3.7976190476190474, - "grad_norm": 2.239433984527888, - "learning_rate": 1.60097293843895e-05, - "loss": 0.1539, + "epoch": 5.361344537815126, + "grad_norm": 4.790942950250594, + "learning_rate": 1.2091060704942636e-05, + "loss": 0.1585, "step": 319 }, { - "epoch": 3.8095238095238093, - "grad_norm": 1.0309175730109765, - "learning_rate": 1.5983997450375594e-05, - "loss": 0.1329, + "epoch": 5.378151260504202, + "grad_norm": 1.0161253142944433, + "learning_rate": 1.204625553238565e-05, + "loss": 0.0723, "step": 320 }, { - "epoch": 3.821428571428571, - "grad_norm": 8.007832806905567, - "learning_rate": 1.595820364329478e-05, - "loss": 0.2519, + "epoch": 5.394957983193278, + "grad_norm": 1.1558751249048937, + "learning_rate": 1.200140744467997e-05, + "loss": 0.084, "step": 321 }, { - "epoch": 3.8333333333333335, - "grad_norm": 4.178320830537582, - "learning_rate": 1.5932348229848704e-05, - "loss": 0.2863, + "epoch": 5.411764705882353, + "grad_norm": 0.9419572615825091, + "learning_rate": 1.195651738240332e-05, + "loss": 0.0571, "step": 322 }, { - "epoch": 3.8452380952380953, - "grad_norm": 7.344798396674696, - "learning_rate": 1.5906431477375994e-05, - "loss": 0.2325, + "epoch": 5.428571428571429, + "grad_norm": 1.622044146409352, + "learning_rate": 1.1911586287013726e-05, + "loss": 0.0698, "step": 323 }, { - "epoch": 3.857142857142857, - "grad_norm": 2.728295310180829, - "learning_rate": 1.5880453653849523e-05, - "loss": 0.2494, + "epoch": 5.445378151260504, + "grad_norm": 1.7252519658503152, + "learning_rate": 1.1866615100829777e-05, + "loss": 0.0727, "step": 324 }, { - "epoch": 3.869047619047619, - "grad_norm": 1.5354481844217602, - "learning_rate": 1.5854415027873624e-05, - "loss": 0.1782, + "epoch": 5.46218487394958, + "grad_norm": 1.2769460474277, + "learning_rate": 1.1821604767010883e-05, + "loss": 0.0728, "step": 325 }, { - "epoch": 3.880952380952381, - "grad_norm": 2.826240940303501, - "learning_rate": 1.58283158686813e-05, - "loss": 0.2712, + "epoch": 5.4789915966386555, + "grad_norm": 1.6489519703945064, + "learning_rate": 1.1776556229537461e-05, + "loss": 0.0712, "step": 326 }, { - "epoch": 3.892857142857143, - "grad_norm": 5.714545377682789, - "learning_rate": 1.5802156446131463e-05, - "loss": 0.3086, + "epoch": 5.495798319327731, + "grad_norm": 1.9144641595351362, + "learning_rate": 1.1731470433191173e-05, + "loss": 0.0932, "step": 327 }, { - "epoch": 3.9047619047619047, - "grad_norm": 3.037951498821358, - "learning_rate": 1.577593703070613e-05, - "loss": 0.1702, + "epoch": 5.512605042016807, + "grad_norm": 1.0334398447359328, + "learning_rate": 1.1686348323535078e-05, + "loss": 0.0391, "step": 328 }, { - "epoch": 3.9166666666666665, - "grad_norm": 5.415513375394119, - "learning_rate": 1.5749657893507627e-05, - "loss": 0.1378, + "epoch": 5.529411764705882, + "grad_norm": 2.4736197087343195, + "learning_rate": 1.1641190846893824e-05, + "loss": 0.0955, "step": 329 }, { - "epoch": 3.928571428571429, - "grad_norm": 2.616666785733442, - "learning_rate": 1.5723319306255798e-05, - "loss": 0.2401, + "epoch": 5.546218487394958, + "grad_norm": 1.3189455008208653, + "learning_rate": 1.1595998950333794e-05, + "loss": 0.0565, "step": 330 }, { - "epoch": 3.9404761904761907, - "grad_norm": 2.716518576090543, - "learning_rate": 1.5696921541285176e-05, - "loss": 0.2716, + "epoch": 5.563025210084033, + "grad_norm": 1.7006720766337666, + "learning_rate": 1.1550773581643245e-05, + "loss": 0.0786, "step": 331 }, { - "epoch": 3.9523809523809526, - "grad_norm": 6.0778719414757365, - "learning_rate": 1.5670464871542178e-05, - "loss": 0.1819, + "epoch": 5.579831932773109, + "grad_norm": 4.52735397655905, + "learning_rate": 1.1505515689312424e-05, + "loss": 0.1432, "step": 332 }, { - "epoch": 3.9642857142857144, - "grad_norm": 2.300785383942732, - "learning_rate": 1.564394957058229e-05, - "loss": 0.1137, + "epoch": 5.5966386554621845, + "grad_norm": 3.0704885963032953, + "learning_rate": 1.1460226222513682e-05, + "loss": 0.1099, "step": 333 }, { - "epoch": 3.9761904761904763, - "grad_norm": 1.4978039537439425, - "learning_rate": 1.5617375912567218e-05, - "loss": 0.2404, + "epoch": 5.61344537815126, + "grad_norm": 4.068391506810968, + "learning_rate": 1.1414906131081575e-05, + "loss": 0.1559, "step": 334 }, { - "epoch": 3.988095238095238, - "grad_norm": 1.8520009539803934, - "learning_rate": 1.5590744172262076e-05, - "loss": 0.1392, + "epoch": 5.630252100840336, + "grad_norm": 1.6178193211411984, + "learning_rate": 1.1369556365492924e-05, + "loss": 0.0881, "step": 335 }, { - "epoch": 4.0, - "grad_norm": 1.0739415756051076, - "learning_rate": 1.5564054625032534e-05, - "loss": 0.1541, - "step": 336 - }, - { - "epoch": 4.0, - "eval_loss": 0.2209147959947586, - "eval_runtime": 38.5178, - "eval_samples_per_second": 1.558, - "eval_steps_per_second": 1.558, + "epoch": 5.647058823529412, + "grad_norm": 1.472096896311054, + "learning_rate": 1.1324177876846897e-05, + "loss": 0.0668, "step": 336 }, { - "epoch": 4.011904761904762, - "grad_norm": 2.827870135987854, - "learning_rate": 1.553730754684196e-05, - "loss": 0.039, + "epoch": 5.663865546218488, + "grad_norm": 0.9377593241374005, + "learning_rate": 1.1278771616845061e-05, + "loss": 0.0528, "step": 337 }, { - "epoch": 4.023809523809524, - "grad_norm": 2.071434578952237, - "learning_rate": 1.5510503214248597e-05, - "loss": 0.1753, + "epoch": 5.680672268907563, + "grad_norm": 1.60191420846021, + "learning_rate": 1.1233338537771408e-05, + "loss": 0.0839, "step": 338 }, { - "epoch": 4.035714285714286, - "grad_norm": 1.7843811186555119, - "learning_rate": 1.5483641904402666e-05, - "loss": 0.106, + "epoch": 5.697478991596639, + "grad_norm": 2.4242623809016757, + "learning_rate": 1.1187879592472402e-05, + "loss": 0.0765, "step": 339 }, { - "epoch": 4.0476190476190474, - "grad_norm": 8.82649724600633, - "learning_rate": 1.545672389504353e-05, - "loss": 0.1881, + "epoch": 5.714285714285714, + "grad_norm": 3.3339459249935075, + "learning_rate": 1.1142395734336986e-05, + "loss": 0.1304, "step": 340 }, { - "epoch": 4.059523809523809, - "grad_norm": 6.291910460115927, - "learning_rate": 1.5429749464496808e-05, - "loss": 0.1765, + "epoch": 5.73109243697479, + "grad_norm": 2.7416841232755824, + "learning_rate": 1.1096887917276585e-05, + "loss": 0.0776, "step": 341 }, { - "epoch": 4.071428571428571, - "grad_norm": 2.643403230336219, - "learning_rate": 1.54027188916715e-05, - "loss": 0.0459, + "epoch": 5.7478991596638656, + "grad_norm": 1.8254292684914686, + "learning_rate": 1.1051357095705102e-05, + "loss": 0.0679, "step": 342 }, { - "epoch": 4.083333333333333, - "grad_norm": 3.497562450238014, - "learning_rate": 1.5375632456057104e-05, - "loss": 0.1232, + "epoch": 5.764705882352941, + "grad_norm": 1.807300374973592, + "learning_rate": 1.1005804224518912e-05, + "loss": 0.0769, "step": 343 }, { - "epoch": 4.095238095238095, - "grad_norm": 2.2512921097593592, - "learning_rate": 1.5348490437720725e-05, - "loss": 0.323, + "epoch": 5.781512605042017, + "grad_norm": 1.8476121472163518, + "learning_rate": 1.0960230259076819e-05, + "loss": 0.0777, "step": 344 }, { - "epoch": 4.107142857142857, - "grad_norm": 5.464247034665501, - "learning_rate": 1.532129311730418e-05, - "loss": 0.0886, + "epoch": 5.798319327731092, + "grad_norm": 2.2451749634606317, + "learning_rate": 1.0914636155180025e-05, + "loss": 0.0843, "step": 345 }, { - "epoch": 4.119047619047619, - "grad_norm": 1.4220143614732972, - "learning_rate": 1.5294040776021092e-05, - "loss": 0.1782, + "epoch": 5.815126050420168, + "grad_norm": 2.016326952399232, + "learning_rate": 1.0869022869052091e-05, + "loss": 0.0852, "step": 346 }, { - "epoch": 4.130952380952381, - "grad_norm": 2.175470210456222, - "learning_rate": 1.5266733695653998e-05, - "loss": 0.181, + "epoch": 5.831932773109243, + "grad_norm": 1.502892225736369, + "learning_rate": 1.0823391357318876e-05, + "loss": 0.0637, "step": 347 }, { - "epoch": 4.142857142857143, - "grad_norm": 2.3599094937304463, - "learning_rate": 1.523937215855141e-05, - "loss": 0.2284, + "epoch": 5.848739495798319, + "grad_norm": 2.2571728633731376, + "learning_rate": 1.0777742576988474e-05, + "loss": 0.0667, "step": 348 }, { - "epoch": 4.154761904761905, - "grad_norm": 3.1602264124779587, - "learning_rate": 1.5211956447624916e-05, - "loss": 0.3619, + "epoch": 5.865546218487395, + "grad_norm": 2.3431094429758264, + "learning_rate": 1.0732077485431152e-05, + "loss": 0.0984, "step": 349 }, { - "epoch": 4.166666666666667, - "grad_norm": 2.530357790936566, - "learning_rate": 1.5184486846346248e-05, - "loss": 0.0911, + "epoch": 5.882352941176471, + "grad_norm": 1.5381903109004802, + "learning_rate": 1.0686397040359253e-05, + "loss": 0.0712, "step": 350 }, { - "epoch": 4.178571428571429, - "grad_norm": 5.786049264827654, - "learning_rate": 1.5156963638744348e-05, - "loss": 0.2455, + "epoch": 5.899159663865547, + "grad_norm": 1.36348896527395, + "learning_rate": 1.064070219980713e-05, + "loss": 0.0736, "step": 351 }, { - "epoch": 4.190476190476191, - "grad_norm": 1.632349364082718, - "learning_rate": 1.512938710940244e-05, - "loss": 0.0994, + "epoch": 5.915966386554622, + "grad_norm": 2.8850647864954384, + "learning_rate": 1.059499392211105e-05, + "loss": 0.1959, "step": 352 }, { - "epoch": 4.2023809523809526, - "grad_norm": 3.180319196728077, - "learning_rate": 1.5101757543455074e-05, - "loss": 0.1107, + "epoch": 5.932773109243698, + "grad_norm": 2.2743426048039814, + "learning_rate": 1.0549273165889079e-05, + "loss": 0.0661, "step": 353 }, { - "epoch": 4.214285714285714, - "grad_norm": 1.4203335804196207, - "learning_rate": 1.5074075226585195e-05, - "loss": 0.1874, + "epoch": 5.949579831932773, + "grad_norm": 2.1602273939968106, + "learning_rate": 1.0503540890020997e-05, + "loss": 0.0978, "step": 354 }, { - "epoch": 4.226190476190476, - "grad_norm": 2.1086034749298084, - "learning_rate": 1.5046340445021171e-05, - "loss": 0.1564, + "epoch": 5.966386554621849, + "grad_norm": 1.6282000235425853, + "learning_rate": 1.0457798053628181e-05, + "loss": 0.0712, "step": 355 }, { - "epoch": 4.238095238095238, - "grad_norm": 2.3264426202911226, - "learning_rate": 1.5018553485533844e-05, - "loss": 0.2398, + "epoch": 5.983193277310924, + "grad_norm": 2.2108120483379072, + "learning_rate": 1.0412045616053486e-05, + "loss": 0.0959, "step": 356 }, { - "epoch": 4.25, - "grad_norm": 3.0488496848457394, - "learning_rate": 1.4990714635433568e-05, - "loss": 0.1081, + "epoch": 6.0, + "grad_norm": 6.96305244714079, + "learning_rate": 1.0366284536841124e-05, + "loss": 0.1132, + "step": 357 + }, + { + "epoch": 6.0, + "eval_loss": 0.1090986505150795, + "eval_runtime": 95.9195, + "eval_samples_per_second": 0.98, + "eval_steps_per_second": 0.98, "step": 357 }, { - "epoch": 4.261904761904762, - "grad_norm": 1.7394283118255627, - "learning_rate": 1.4962824182567226e-05, - "loss": 0.1901, + "epoch": 6.016806722689076, + "grad_norm": 1.4364004042145684, + "learning_rate": 1.0320515775716556e-05, + "loss": 0.0628, "step": 358 }, { - "epoch": 4.273809523809524, - "grad_norm": 3.3515867531674903, - "learning_rate": 1.4934882415315266e-05, - "loss": 0.185, + "epoch": 6.033613445378151, + "grad_norm": 1.9184329896408099, + "learning_rate": 1.0274740292566335e-05, + "loss": 0.085, "step": 359 }, { - "epoch": 4.285714285714286, - "grad_norm": 18.82745619545086, - "learning_rate": 1.4906889622588714e-05, - "loss": 0.2833, + "epoch": 6.050420168067227, + "grad_norm": 2.722228602616158, + "learning_rate": 1.0228959047418005e-05, + "loss": 0.0764, "step": 360 }, { - "epoch": 4.2976190476190474, - "grad_norm": 4.511322420570053, - "learning_rate": 1.487884609382618e-05, - "loss": 0.1722, + "epoch": 6.067226890756302, + "grad_norm": 1.9487119713360619, + "learning_rate": 1.0183173000419954e-05, + "loss": 0.0735, "step": 361 }, { - "epoch": 4.309523809523809, - "grad_norm": 1.533278706530758, - "learning_rate": 1.4850752118990884e-05, - "loss": 0.1364, + "epoch": 6.084033613445378, + "grad_norm": 2.4618304983550883, + "learning_rate": 1.0137383111821267e-05, + "loss": 0.0742, "step": 362 }, { - "epoch": 4.321428571428571, - "grad_norm": 8.629338884395395, - "learning_rate": 1.4822607988567637e-05, - "loss": 0.123, + "epoch": 6.100840336134453, + "grad_norm": 3.82982664079687, + "learning_rate": 1.009159034195161e-05, + "loss": 0.1549, "step": 363 }, { - "epoch": 4.333333333333333, - "grad_norm": 3.0735320587228085, - "learning_rate": 1.4794413993559851e-05, - "loss": 0.0625, + "epoch": 6.117647058823529, + "grad_norm": 2.5393832652005686, + "learning_rate": 1.0045795651201062e-05, + "loss": 0.072, "step": 364 }, { - "epoch": 4.345238095238095, - "grad_norm": 2.0992688835189472, - "learning_rate": 1.476617042548652e-05, - "loss": 0.119, + "epoch": 6.1344537815126055, + "grad_norm": 2.1988376418173305, + "learning_rate": 1e-05, + "loss": 0.0786, "step": 365 }, { - "epoch": 4.357142857142857, - "grad_norm": 6.415048214826861, - "learning_rate": 1.473787757637922e-05, - "loss": 0.2251, + "epoch": 6.151260504201681, + "grad_norm": 3.668071708848647, + "learning_rate": 9.954204348798938e-06, + "loss": 0.0928, "step": 366 }, { - "epoch": 4.369047619047619, - "grad_norm": 1.6932021512404514, - "learning_rate": 1.4709535738779075e-05, - "loss": 0.1117, + "epoch": 6.168067226890757, + "grad_norm": 2.378790242128915, + "learning_rate": 9.908409658048395e-06, + "loss": 0.0717, "step": 367 }, { - "epoch": 4.380952380952381, - "grad_norm": 2.9272562546997047, - "learning_rate": 1.4681145205733736e-05, - "loss": 0.0407, + "epoch": 6.184873949579832, + "grad_norm": 1.8836633579149347, + "learning_rate": 9.862616888178733e-06, + "loss": 0.0519, "step": 368 }, { - "epoch": 4.392857142857143, - "grad_norm": 0.8387686118428367, - "learning_rate": 1.4652706270794354e-05, - "loss": 0.068, + "epoch": 6.201680672268908, + "grad_norm": 2.181673653848983, + "learning_rate": 9.816826999580049e-06, + "loss": 0.0748, "step": 369 }, { - "epoch": 4.404761904761905, - "grad_norm": 0.9780904318278195, - "learning_rate": 1.4624219228012555e-05, - "loss": 0.0767, + "epoch": 6.218487394957983, + "grad_norm": 2.544264789630926, + "learning_rate": 9.771040952581998e-06, + "loss": 0.1029, "step": 370 }, { - "epoch": 4.416666666666667, - "grad_norm": 2.4997987914726894, - "learning_rate": 1.4595684371937369e-05, - "loss": 0.1666, + "epoch": 6.235294117647059, + "grad_norm": 2.673802179214641, + "learning_rate": 9.72525970743367e-06, + "loss": 0.0857, "step": 371 }, { - "epoch": 4.428571428571429, - "grad_norm": 7.25267339982332, - "learning_rate": 1.4567101997612216e-05, - "loss": 0.1855, + "epoch": 6.2521008403361344, + "grad_norm": 2.601200264033827, + "learning_rate": 9.67948422428345e-06, + "loss": 0.0886, "step": 372 }, { - "epoch": 4.440476190476191, - "grad_norm": 3.059862921630944, - "learning_rate": 1.453847240057185e-05, - "loss": 0.1155, + "epoch": 6.26890756302521, + "grad_norm": 1.9627943528177962, + "learning_rate": 9.633715463158881e-06, + "loss": 0.1016, "step": 373 }, { - "epoch": 4.4523809523809526, - "grad_norm": 0.940939205370957, - "learning_rate": 1.4509795876839274e-05, - "loss": 0.0152, + "epoch": 6.285714285714286, + "grad_norm": 1.0216877914513534, + "learning_rate": 9.587954383946518e-06, + "loss": 0.0672, "step": 374 }, { - "epoch": 4.464285714285714, - "grad_norm": 0.7709486540869183, - "learning_rate": 1.4481072722922721e-05, - "loss": 0.0702, + "epoch": 6.302521008403361, + "grad_norm": 1.8373707532976224, + "learning_rate": 9.542201946371819e-06, + "loss": 0.0754, "step": 375 }, { - "epoch": 4.476190476190476, - "grad_norm": 1.5326012961960709, - "learning_rate": 1.4452303235812562e-05, - "loss": 0.1224, + "epoch": 6.319327731092437, + "grad_norm": 0.9199770052216976, + "learning_rate": 9.496459109979004e-06, + "loss": 0.0655, "step": 376 }, { - "epoch": 4.488095238095238, - "grad_norm": 2.0531110608488072, - "learning_rate": 1.442348771297824e-05, - "loss": 0.2465, + "epoch": 6.336134453781512, + "grad_norm": 1.9486911830327862, + "learning_rate": 9.450726834110923e-06, + "loss": 0.0682, "step": 377 }, { - "epoch": 4.5, - "grad_norm": 0.9970189018607668, - "learning_rate": 1.4394626452365202e-05, - "loss": 0.0771, + "epoch": 6.352941176470588, + "grad_norm": 1.4805691280913602, + "learning_rate": 9.405006077888954e-06, + "loss": 0.0487, "step": 378 }, { - "epoch": 4.511904761904762, - "grad_norm": 1.1650153863777009, - "learning_rate": 1.4365719752391805e-05, - "loss": 0.0153, + "epoch": 6.369747899159664, + "grad_norm": 1.4042709384224348, + "learning_rate": 9.359297800192873e-06, + "loss": 0.0689, "step": 379 }, { - "epoch": 4.523809523809524, - "grad_norm": 1.79896147766944, - "learning_rate": 1.4336767911946238e-05, - "loss": 0.0211, + "epoch": 6.38655462184874, + "grad_norm": 3.6158792324022775, + "learning_rate": 9.313602959640754e-06, + "loss": 0.1121, "step": 380 }, { - "epoch": 4.535714285714286, - "grad_norm": 8.039019319027208, - "learning_rate": 1.4307771230383446e-05, - "loss": 0.2942, + "epoch": 6.4033613445378155, + "grad_norm": 1.5349914188342018, + "learning_rate": 9.267922514568853e-06, + "loss": 0.0548, "step": 381 }, { - "epoch": 4.5476190476190474, - "grad_norm": 2.2623133990723274, - "learning_rate": 1.4278730007521996e-05, - "loss": 0.2017, + "epoch": 6.420168067226891, + "grad_norm": 5.196823890519191, + "learning_rate": 9.22225742301153e-06, + "loss": 0.1468, "step": 382 }, { - "epoch": 4.559523809523809, - "grad_norm": 0.7549719818461573, - "learning_rate": 1.4249644543641026e-05, - "loss": 0.0121, + "epoch": 6.436974789915967, + "grad_norm": 2.4164444219519816, + "learning_rate": 9.176608642681127e-06, + "loss": 0.0689, "step": 383 }, { - "epoch": 4.571428571428571, - "grad_norm": 1.6291253889342527, - "learning_rate": 1.4220515139477099e-05, - "loss": 0.1301, + "epoch": 6.453781512605042, + "grad_norm": 1.0767734429156512, + "learning_rate": 9.13097713094791e-06, + "loss": 0.0537, "step": 384 }, { - "epoch": 4.583333333333333, - "grad_norm": 1.3753384069531984, - "learning_rate": 1.419134209622111e-05, - "loss": 0.0843, + "epoch": 6.470588235294118, + "grad_norm": 1.1364625205117547, + "learning_rate": 9.085363844819979e-06, + "loss": 0.0505, "step": 385 }, { - "epoch": 4.595238095238095, - "grad_norm": 16.083102720375773, - "learning_rate": 1.416212571551518e-05, - "loss": 0.2996, + "epoch": 6.487394957983193, + "grad_norm": 1.2535232324818428, + "learning_rate": 9.039769740923183e-06, + "loss": 0.0518, "step": 386 }, { - "epoch": 4.607142857142857, - "grad_norm": 2.4053795337835817, - "learning_rate": 1.4132866299449523e-05, - "loss": 0.2103, + "epoch": 6.504201680672269, + "grad_norm": 3.057282990063465, + "learning_rate": 8.99419577548109e-06, + "loss": 0.1124, "step": 387 }, { - "epoch": 4.619047619047619, - "grad_norm": 2.1608334893802668, - "learning_rate": 1.410356415055933e-05, - "loss": 0.1842, + "epoch": 6.5210084033613445, + "grad_norm": 1.9870672030743288, + "learning_rate": 8.948642904294901e-06, + "loss": 0.0534, "step": 388 }, { - "epoch": 4.630952380952381, - "grad_norm": 1.6675074064641335, - "learning_rate": 1.407421957182164e-05, - "loss": 0.1343, + "epoch": 6.53781512605042, + "grad_norm": 4.506177944111496, + "learning_rate": 8.90311208272342e-06, + "loss": 0.1007, "step": 389 }, { - "epoch": 4.642857142857143, - "grad_norm": 7.385252787773213, - "learning_rate": 1.4044832866652198e-05, - "loss": 0.0953, + "epoch": 6.554621848739496, + "grad_norm": 4.687056129955279, + "learning_rate": 8.857604265663016e-06, + "loss": 0.1015, "step": 390 }, { - "epoch": 4.654761904761905, - "grad_norm": 13.223392660691903, - "learning_rate": 1.4015404338902338e-05, - "loss": 0.2051, + "epoch": 6.571428571428571, + "grad_norm": 2.793771357084546, + "learning_rate": 8.812120407527603e-06, + "loss": 0.0791, "step": 391 }, { - "epoch": 4.666666666666667, - "grad_norm": 1.7997237114139486, - "learning_rate": 1.3985934292855827e-05, - "loss": 0.0745, + "epoch": 6.588235294117647, + "grad_norm": 1.5631834583542967, + "learning_rate": 8.766661462228593e-06, + "loss": 0.0583, "step": 392 }, { - "epoch": 4.678571428571429, - "grad_norm": 1.7124646994476471, - "learning_rate": 1.3956423033225713e-05, - "loss": 0.14, + "epoch": 6.605042016806722, + "grad_norm": 2.503384804703961, + "learning_rate": 8.721228383154939e-06, + "loss": 0.0819, "step": 393 }, { - "epoch": 4.690476190476191, - "grad_norm": 1.5626636243483676, - "learning_rate": 1.3926870865151185e-05, - "loss": 0.037, + "epoch": 6.621848739495798, + "grad_norm": 1.298763753508359, + "learning_rate": 8.675822123153104e-06, + "loss": 0.0452, "step": 394 }, { - "epoch": 4.7023809523809526, - "grad_norm": 1.8377267092001748, - "learning_rate": 1.3897278094194422e-05, - "loss": 0.149, + "epoch": 6.6386554621848735, + "grad_norm": 3.468345102178359, + "learning_rate": 8.630443634507077e-06, + "loss": 0.0893, "step": 395 }, { - "epoch": 4.714285714285714, - "grad_norm": 1.1393534152121498, - "learning_rate": 1.386764502633742e-05, - "loss": 0.1064, + "epoch": 6.65546218487395, + "grad_norm": 2.526122457983386, + "learning_rate": 8.585093868918426e-06, + "loss": 0.1011, "step": 396 }, { - "epoch": 4.726190476190476, - "grad_norm": 5.6011749497367855, - "learning_rate": 1.3837971967978836e-05, - "loss": 0.1539, + "epoch": 6.6722689075630255, + "grad_norm": 2.075764833700991, + "learning_rate": 8.539773777486321e-06, + "loss": 0.0752, "step": 397 }, { - "epoch": 4.738095238095238, - "grad_norm": 1.6182850109100042, - "learning_rate": 1.3808259225930825e-05, - "loss": 0.1354, + "epoch": 6.689075630252101, + "grad_norm": 1.3259731049756247, + "learning_rate": 8.494484310687581e-06, + "loss": 0.0536, "step": 398 }, { - "epoch": 4.75, - "grad_norm": 2.664012777423232, - "learning_rate": 1.3778507107415849e-05, - "loss": 0.0815, + "epoch": 6.705882352941177, + "grad_norm": 1.5262707963177293, + "learning_rate": 8.44922641835676e-06, + "loss": 0.0649, "step": 399 }, { - "epoch": 4.761904761904762, - "grad_norm": 6.413333811764225, - "learning_rate": 1.374871592006353e-05, - "loss": 0.1265, + "epoch": 6.722689075630252, + "grad_norm": 1.3923407644979977, + "learning_rate": 8.404001049666211e-06, + "loss": 0.0781, "step": 400 }, { - "epoch": 4.773809523809524, - "grad_norm": 1.5093904430192655, - "learning_rate": 1.371888597190744e-05, - "loss": 0.1129, + "epoch": 6.739495798319328, + "grad_norm": 1.474705597824637, + "learning_rate": 8.35880915310618e-06, + "loss": 0.0614, "step": 401 }, { - "epoch": 4.785714285714286, - "grad_norm": 1.1398541908667996, - "learning_rate": 1.3689017571381928e-05, - "loss": 0.1792, + "epoch": 6.756302521008403, + "grad_norm": 1.5133797172950414, + "learning_rate": 8.313651676464924e-06, + "loss": 0.0588, "step": 402 }, { - "epoch": 4.7976190476190474, - "grad_norm": 4.7568958126507495, - "learning_rate": 1.3659111027318936e-05, - "loss": 0.2022, + "epoch": 6.773109243697479, + "grad_norm": 1.5937067083804244, + "learning_rate": 8.26852956680883e-06, + "loss": 0.0658, "step": 403 }, { - "epoch": 4.809523809523809, - "grad_norm": 1.1634943614210518, - "learning_rate": 1.3629166648944803e-05, - "loss": 0.1091, + "epoch": 6.7899159663865545, + "grad_norm": 3.2146725324389447, + "learning_rate": 8.223443770462539e-06, + "loss": 0.0967, "step": 404 }, { - "epoch": 4.821428571428571, - "grad_norm": 2.0853097817874096, - "learning_rate": 1.3599184745877059e-05, - "loss": 0.125, + "epoch": 6.80672268907563, + "grad_norm": 1.6054189964924985, + "learning_rate": 8.17839523298912e-06, + "loss": 0.0676, "step": 405 }, { - "epoch": 4.833333333333333, - "grad_norm": 7.354405370327422, - "learning_rate": 1.356916562812123e-05, - "loss": 0.131, + "epoch": 6.823529411764706, + "grad_norm": 1.3282376180631894, + "learning_rate": 8.133384899170224e-06, + "loss": 0.0373, "step": 406 }, { - "epoch": 4.845238095238095, - "grad_norm": 2.2454476121472693, - "learning_rate": 1.3539109606067642e-05, - "loss": 0.1625, + "epoch": 6.840336134453781, + "grad_norm": 1.1953714752813585, + "learning_rate": 8.08841371298628e-06, + "loss": 0.0591, "step": 407 }, { - "epoch": 4.857142857142857, - "grad_norm": 1.0859281248071477, - "learning_rate": 1.3509016990488191e-05, - "loss": 0.1148, + "epoch": 6.857142857142857, + "grad_norm": 2.9661126814818393, + "learning_rate": 8.043482617596681e-06, + "loss": 0.141, "step": 408 }, { - "epoch": 4.869047619047619, - "grad_norm": 1.8112995115739032, - "learning_rate": 1.3478888092533145e-05, - "loss": 0.1333, + "epoch": 6.873949579831933, + "grad_norm": 1.3266256289537959, + "learning_rate": 7.99859255532003e-06, + "loss": 0.053, "step": 409 }, { - "epoch": 4.880952380952381, - "grad_norm": 6.093790266627286, - "learning_rate": 1.3448723223727929e-05, - "loss": 0.1594, + "epoch": 6.890756302521009, + "grad_norm": 2.1914466357984574, + "learning_rate": 7.953744467614356e-06, + "loss": 0.0778, "step": 410 }, { - "epoch": 4.892857142857143, - "grad_norm": 4.418732377785542, - "learning_rate": 1.3418522695969892e-05, - "loss": 0.1858, + "epoch": 6.907563025210084, + "grad_norm": 4.698036043159151, + "learning_rate": 7.908939295057362e-06, + "loss": 0.1673, "step": 411 }, { - "epoch": 4.904761904761905, - "grad_norm": 4.690321231618584, - "learning_rate": 1.3388286821525086e-05, - "loss": 0.0972, + "epoch": 6.92436974789916, + "grad_norm": 2.0220996669989555, + "learning_rate": 7.864177977326739e-06, + "loss": 0.0706, "step": 412 }, { - "epoch": 4.916666666666667, - "grad_norm": 2.671198917779827, - "learning_rate": 1.3358015913025045e-05, - "loss": 0.1115, + "epoch": 6.9411764705882355, + "grad_norm": 0.863174779051157, + "learning_rate": 7.819461453180403e-06, + "loss": 0.0536, "step": 413 }, { - "epoch": 4.928571428571429, - "grad_norm": 2.3833023595529723, - "learning_rate": 1.3327710283463535e-05, - "loss": 0.1534, + "epoch": 6.957983193277311, + "grad_norm": 1.162974910961646, + "learning_rate": 7.774790660436857e-06, + "loss": 0.053, "step": 414 }, { - "epoch": 4.940476190476191, - "grad_norm": 1.626651095857683, - "learning_rate": 1.3297370246193349e-05, - "loss": 0.0785, + "epoch": 6.974789915966387, + "grad_norm": 1.5401943180276223, + "learning_rate": 7.730166535955489e-06, + "loss": 0.0562, "step": 415 }, { - "epoch": 4.9523809523809526, - "grad_norm": 3.184821297784408, - "learning_rate": 1.3266996114923027e-05, - "loss": 0.1509, + "epoch": 6.991596638655462, + "grad_norm": 1.0451220578083906, + "learning_rate": 7.685590015616939e-06, + "loss": 0.061, + "step": 416 + }, + { + "epoch": 6.991596638655462, + "eval_loss": 0.11021654307842255, + "eval_runtime": 96.6092, + "eval_samples_per_second": 0.973, + "eval_steps_per_second": 0.973, "step": 416 }, { - "epoch": 4.964285714285714, - "grad_norm": 2.7515690992856445, - "learning_rate": 1.3236588203713645e-05, - "loss": 0.0581, + "epoch": 7.008403361344538, + "grad_norm": 1.9748689801680077, + "learning_rate": 7.641062034303464e-06, + "loss": 0.0744, "step": 417 }, { - "epoch": 4.976190476190476, - "grad_norm": 1.581027664457046, - "learning_rate": 1.3206146826975554e-05, - "loss": 0.1458, + "epoch": 7.025210084033613, + "grad_norm": 1.407098109780849, + "learning_rate": 7.596583525879344e-06, + "loss": 0.0575, "step": 418 }, { - "epoch": 4.988095238095238, - "grad_norm": 3.227099328398388, - "learning_rate": 1.3175672299465124e-05, - "loss": 0.0788, + "epoch": 7.042016806722689, + "grad_norm": 3.002224281961417, + "learning_rate": 7.5521554231712845e-06, + "loss": 0.0761, "step": 419 }, { - "epoch": 5.0, - "grad_norm": 2.307032199226437, - "learning_rate": 1.3145164936281501e-05, - "loss": 0.1162, - "step": 420 - }, - { - "epoch": 5.0, - "eval_loss": 0.16767826676368713, - "eval_runtime": 38.2511, - "eval_samples_per_second": 1.569, - "eval_steps_per_second": 1.569, + "epoch": 7.0588235294117645, + "grad_norm": 1.3836323412125109, + "learning_rate": 7.507778657948847e-06, + "loss": 0.0508, "step": 420 }, { - "epoch": 5.011904761904762, - "grad_norm": 2.480615008589385, - "learning_rate": 1.3114625052863343e-05, - "loss": 0.1365, + "epoch": 7.07563025210084, + "grad_norm": 1.5627121625333806, + "learning_rate": 7.463454160904928e-06, + "loss": 0.0619, "step": 421 }, { - "epoch": 5.023809523809524, - "grad_norm": 1.7186104572642102, - "learning_rate": 1.3084052964985559e-05, - "loss": 0.1, + "epoch": 7.092436974789916, + "grad_norm": 1.7887521098557067, + "learning_rate": 7.419182861636218e-06, + "loss": 0.0566, "step": 422 }, { - "epoch": 5.035714285714286, - "grad_norm": 1.0422770723266574, - "learning_rate": 1.3053448988756046e-05, - "loss": 0.1036, + "epoch": 7.109243697478991, + "grad_norm": 1.6937210033771022, + "learning_rate": 7.374965688623726e-06, + "loss": 0.0601, "step": 423 }, { - "epoch": 5.0476190476190474, - "grad_norm": 1.3346688546661123, - "learning_rate": 1.3022813440612411e-05, - "loss": 0.1213, + "epoch": 7.126050420168067, + "grad_norm": 2.35276516070813, + "learning_rate": 7.3308035692132896e-06, + "loss": 0.0778, "step": 424 }, { - "epoch": 5.059523809523809, - "grad_norm": 1.8542917766521358, - "learning_rate": 1.2992146637318721e-05, - "loss": 0.0678, + "epoch": 7.142857142857143, + "grad_norm": 1.7638699741053976, + "learning_rate": 7.286697429596135e-06, + "loss": 0.0622, "step": 425 }, { - "epoch": 5.071428571428571, - "grad_norm": 5.6008497642741855, - "learning_rate": 1.2961448895962199e-05, - "loss": 0.1197, + "epoch": 7.159663865546219, + "grad_norm": 2.20424078040461, + "learning_rate": 7.242648194789447e-06, + "loss": 0.0597, "step": 426 }, { - "epoch": 5.083333333333333, - "grad_norm": 3.4136608303525633, - "learning_rate": 1.2930720533949967e-05, - "loss": 0.2551, + "epoch": 7.176470588235294, + "grad_norm": 2.50814340199316, + "learning_rate": 7.19865678861698e-06, + "loss": 0.0686, "step": 427 }, { - "epoch": 5.095238095238095, - "grad_norm": 2.3745380363755166, - "learning_rate": 1.2899961869005761e-05, - "loss": 0.1615, + "epoch": 7.19327731092437, + "grad_norm": 1.448580373256141, + "learning_rate": 7.154724133689677e-06, + "loss": 0.0312, "step": 428 }, { - "epoch": 5.107142857142857, - "grad_norm": 1.8910210447295193, - "learning_rate": 1.2869173219166632e-05, - "loss": 0.0842, + "epoch": 7.2100840336134455, + "grad_norm": 2.1552454847239075, + "learning_rate": 7.110851151386306e-06, + "loss": 0.0547, "step": 429 }, { - "epoch": 5.119047619047619, - "grad_norm": 2.9450829681087742, - "learning_rate": 1.283835490277968e-05, - "loss": 0.1633, + "epoch": 7.226890756302521, + "grad_norm": 2.9974503334538602, + "learning_rate": 7.067038761834164e-06, + "loss": 0.0881, "step": 430 }, { - "epoch": 5.130952380952381, - "grad_norm": 6.731369409179587, - "learning_rate": 1.2807507238498733e-05, - "loss": 0.1296, + "epoch": 7.243697478991597, + "grad_norm": 0.9735826198652626, + "learning_rate": 7.023287883889753e-06, + "loss": 0.039, "step": 431 }, { - "epoch": 5.142857142857143, - "grad_norm": 2.9173972705274034, - "learning_rate": 1.2776630545281088e-05, - "loss": 0.156, + "epoch": 7.260504201680672, + "grad_norm": 2.5579376905692293, + "learning_rate": 6.979599435119531e-06, + "loss": 0.0919, "step": 432 }, { - "epoch": 5.154761904761905, - "grad_norm": 1.4766409198526602, - "learning_rate": 1.2745725142384176e-05, - "loss": 0.1113, + "epoch": 7.277310924369748, + "grad_norm": 2.4454794736196, + "learning_rate": 6.935974331780653e-06, + "loss": 0.0544, "step": 433 }, { - "epoch": 5.166666666666667, - "grad_norm": 5.5619564739728915, - "learning_rate": 1.2714791349362293e-05, - "loss": 0.0989, + "epoch": 7.294117647058823, + "grad_norm": 1.4023653427232834, + "learning_rate": 6.892413488801762e-06, + "loss": 0.0626, "step": 434 }, { - "epoch": 5.178571428571429, - "grad_norm": 4.883941398663358, - "learning_rate": 1.2683829486063275e-05, - "loss": 0.0922, + "epoch": 7.310924369747899, + "grad_norm": 1.36640843494829, + "learning_rate": 6.848917819763794e-06, + "loss": 0.061, "step": 435 }, { - "epoch": 5.190476190476191, - "grad_norm": 2.153822102950277, - "learning_rate": 1.2652839872625198e-05, - "loss": 0.1536, + "epoch": 7.3277310924369745, + "grad_norm": 1.6526923758851404, + "learning_rate": 6.805488236880831e-06, + "loss": 0.0628, "step": 436 }, { - "epoch": 5.2023809523809526, - "grad_norm": 1.550572046017145, - "learning_rate": 1.2621822829473074e-05, - "loss": 0.0688, + "epoch": 7.34453781512605, + "grad_norm": 1.5372290912683437, + "learning_rate": 6.76212565098096e-06, + "loss": 0.06, "step": 437 }, { - "epoch": 5.214285714285714, - "grad_norm": 2.7372539541607512, - "learning_rate": 1.2590778677315525e-05, - "loss": 0.0374, + "epoch": 7.361344537815126, + "grad_norm": 1.1019297711397662, + "learning_rate": 6.718830971487165e-06, + "loss": 0.0534, "step": 438 }, { - "epoch": 5.226190476190476, - "grad_norm": 1.760106704960083, - "learning_rate": 1.2559707737141471e-05, - "loss": 0.1008, + "epoch": 7.378151260504202, + "grad_norm": 1.474792358247268, + "learning_rate": 6.675605106398269e-06, + "loss": 0.0498, "step": 439 }, { - "epoch": 5.238095238095238, - "grad_norm": 2.264297339685535, - "learning_rate": 1.2528610330216824e-05, - "loss": 0.1123, + "epoch": 7.394957983193278, + "grad_norm": 1.5934869270544132, + "learning_rate": 6.632448962269873e-06, + "loss": 0.0579, "step": 440 }, { - "epoch": 5.25, - "grad_norm": 1.9709076777699328, - "learning_rate": 1.2497486778081145e-05, - "loss": 0.0664, + "epoch": 7.411764705882353, + "grad_norm": 1.5423848913860474, + "learning_rate": 6.589363444195367e-06, + "loss": 0.0836, "step": 441 }, { - "epoch": 5.261904761904762, - "grad_norm": 1.9643428286751554, - "learning_rate": 1.2466337402544333e-05, - "loss": 0.0964, + "epoch": 7.428571428571429, + "grad_norm": 0.6052937489588177, + "learning_rate": 6.546349455786926e-06, + "loss": 0.0235, "step": 442 }, { - "epoch": 5.273809523809524, - "grad_norm": 1.300317642643842, - "learning_rate": 1.2435162525683303e-05, - "loss": 0.0798, + "epoch": 7.445378151260504, + "grad_norm": 0.8930183656715149, + "learning_rate": 6.503407899156565e-06, + "loss": 0.0536, "step": 443 }, { - "epoch": 5.285714285714286, - "grad_norm": 7.741590751845092, - "learning_rate": 1.2403962469838633e-05, - "loss": 0.1114, + "epoch": 7.46218487394958, + "grad_norm": 1.9378732272470842, + "learning_rate": 6.460539674897226e-06, + "loss": 0.0713, "step": 444 }, { - "epoch": 5.2976190476190474, - "grad_norm": 1.257936578683669, - "learning_rate": 1.2372737557611256e-05, - "loss": 0.0165, + "epoch": 7.4789915966386555, + "grad_norm": 0.9653273021580584, + "learning_rate": 6.417745682063884e-06, + "loss": 0.0572, "step": 445 }, { - "epoch": 5.309523809523809, - "grad_norm": 1.5441967809002295, - "learning_rate": 1.2341488111859111e-05, - "loss": 0.095, + "epoch": 7.495798319327731, + "grad_norm": 5.150704023745161, + "learning_rate": 6.3750268181547e-06, + "loss": 0.1011, "step": 446 }, { - "epoch": 5.321428571428571, - "grad_norm": 4.463551970591222, - "learning_rate": 1.2310214455693808e-05, - "loss": 0.0405, + "epoch": 7.512605042016807, + "grad_norm": 2.663053864822168, + "learning_rate": 6.3323839790921785e-06, + "loss": 0.1071, "step": 447 }, { - "epoch": 5.333333333333333, - "grad_norm": 3.676401909989546, - "learning_rate": 1.2278916912477285e-05, - "loss": 0.0297, + "epoch": 7.529411764705882, + "grad_norm": 1.8175181083869496, + "learning_rate": 6.289818059204404e-06, + "loss": 0.0409, "step": 448 }, { - "epoch": 5.345238095238095, - "grad_norm": 0.7008333575781895, - "learning_rate": 1.2247595805818471e-05, - "loss": 0.0599, + "epoch": 7.546218487394958, + "grad_norm": 1.2608121221023865, + "learning_rate": 6.24732995120626e-06, + "loss": 0.0447, "step": 449 }, { - "epoch": 5.357142857142857, - "grad_norm": 6.856328864129851, - "learning_rate": 1.2216251459569927e-05, - "loss": 0.1179, + "epoch": 7.563025210084033, + "grad_norm": 1.558905828362155, + "learning_rate": 6.204920546180728e-06, + "loss": 0.0629, "step": 450 }, { - "epoch": 5.369047619047619, - "grad_norm": 6.86386327644984, - "learning_rate": 1.2184884197824513e-05, - "loss": 0.0773, + "epoch": 7.579831932773109, + "grad_norm": 3.0836236832919774, + "learning_rate": 6.162590733560183e-06, + "loss": 0.0714, "step": 451 }, { - "epoch": 5.380952380952381, - "grad_norm": 2.646115956267491, - "learning_rate": 1.2153494344912031e-05, - "loss": 0.1069, + "epoch": 7.5966386554621845, + "grad_norm": 1.9457063009832318, + "learning_rate": 6.120341401107751e-06, + "loss": 0.0591, "step": 452 }, { - "epoch": 5.392857142857143, - "grad_norm": 1.8101984992707947, - "learning_rate": 1.212208222539586e-05, - "loss": 0.0745, + "epoch": 7.61344537815126, + "grad_norm": 2.8239440684779, + "learning_rate": 6.078173434898679e-06, + "loss": 0.0598, "step": 453 }, { - "epoch": 5.404761904761905, - "grad_norm": 6.744068306166054, - "learning_rate": 1.2090648164069613e-05, - "loss": 0.1462, + "epoch": 7.630252100840336, + "grad_norm": 1.6266933180384318, + "learning_rate": 6.036087719301763e-06, + "loss": 0.0622, "step": 454 }, { - "epoch": 5.416666666666667, - "grad_norm": 2.587664561358256, - "learning_rate": 1.205919248595379e-05, - "loss": 0.175, + "epoch": 7.647058823529412, + "grad_norm": 1.5079333156466859, + "learning_rate": 5.994085136960801e-06, + "loss": 0.0662, "step": 455 }, { - "epoch": 5.428571428571429, - "grad_norm": 1.0425686405747994, - "learning_rate": 1.2027715516292386e-05, - "loss": 0.0695, + "epoch": 7.663865546218488, + "grad_norm": 2.513503195610099, + "learning_rate": 5.952166568776062e-06, + "loss": 0.0741, "step": 456 }, { - "epoch": 5.440476190476191, - "grad_norm": 1.6830078270896764, - "learning_rate": 1.1996217580549557e-05, - "loss": 0.1039, + "epoch": 7.680672268907563, + "grad_norm": 2.942516455656699, + "learning_rate": 5.910332893885842e-06, + "loss": 0.0768, "step": 457 }, { - "epoch": 5.4523809523809526, - "grad_norm": 6.172195451892937, - "learning_rate": 1.1964699004406231e-05, - "loss": 0.0442, + "epoch": 7.697478991596639, + "grad_norm": 0.9029614933690709, + "learning_rate": 5.868584989647994e-06, + "loss": 0.0465, "step": 458 }, { - "epoch": 5.464285714285714, - "grad_norm": 0.9630413019254777, - "learning_rate": 1.1933160113756764e-05, - "loss": 0.062, + "epoch": 7.714285714285714, + "grad_norm": 1.188640664205059, + "learning_rate": 5.826923731621562e-06, + "loss": 0.0507, "step": 459 }, { - "epoch": 5.476190476190476, - "grad_norm": 4.906480139073013, - "learning_rate": 1.1901601234705556e-05, - "loss": 0.1649, + "epoch": 7.73109243697479, + "grad_norm": 1.2597887336311209, + "learning_rate": 5.785349993548382e-06, + "loss": 0.0442, "step": 460 }, { - "epoch": 5.488095238095238, - "grad_norm": 6.576239158062562, - "learning_rate": 1.187002269356368e-05, - "loss": 0.0965, + "epoch": 7.7478991596638656, + "grad_norm": 5.82455886129036, + "learning_rate": 5.743864647334789e-06, + "loss": 0.154, "step": 461 }, { - "epoch": 5.5, - "grad_norm": 1.6597413036008377, - "learning_rate": 1.1838424816845507e-05, - "loss": 0.0302, + "epoch": 7.764705882352941, + "grad_norm": 1.2355207857604258, + "learning_rate": 5.702468563033307e-06, + "loss": 0.0472, "step": 462 }, { - "epoch": 5.511904761904762, - "grad_norm": 3.7920628634602385, - "learning_rate": 1.1806807931265344e-05, - "loss": 0.1098, + "epoch": 7.781512605042017, + "grad_norm": 1.673091621193694, + "learning_rate": 5.66116260882442e-06, + "loss": 0.0714, "step": 463 }, { - "epoch": 5.523809523809524, - "grad_norm": 1.4114425562472657, - "learning_rate": 1.177517236373403e-05, - "loss": 0.0626, + "epoch": 7.798319327731092, + "grad_norm": 1.2037358486316267, + "learning_rate": 5.6199476509983546e-06, + "loss": 0.0576, "step": 464 }, { - "epoch": 5.535714285714286, - "grad_norm": 2.9416388986651296, - "learning_rate": 1.174351844135559e-05, - "loss": 0.1376, + "epoch": 7.815126050420168, + "grad_norm": 1.4923009661165723, + "learning_rate": 5.5788245539369144e-06, + "loss": 0.0546, "step": 465 }, { - "epoch": 5.5476190476190474, - "grad_norm": 6.283324412491074, - "learning_rate": 1.1711846491423818e-05, - "loss": 0.1851, + "epoch": 7.831932773109243, + "grad_norm": 7.004074578858643, + "learning_rate": 5.537794180095341e-06, + "loss": 0.0931, "step": 466 }, { - "epoch": 5.559523809523809, - "grad_norm": 15.186006477965242, - "learning_rate": 1.1680156841418911e-05, - "loss": 0.1297, + "epoch": 7.848739495798319, + "grad_norm": 1.9350012591991381, + "learning_rate": 5.496857389984251e-06, + "loss": 0.0501, "step": 467 }, { - "epoch": 5.571428571428571, - "grad_norm": 2.7222956541416945, - "learning_rate": 1.1648449819004084e-05, - "loss": 0.0728, + "epoch": 7.865546218487395, + "grad_norm": 2.0402004080487575, + "learning_rate": 5.456015042151563e-06, + "loss": 0.0548, "step": 468 }, { - "epoch": 5.583333333333333, - "grad_norm": 2.6958569293331958, - "learning_rate": 1.1616725752022178e-05, - "loss": 0.1425, + "epoch": 7.882352941176471, + "grad_norm": 1.5295516734084738, + "learning_rate": 5.415267993164504e-06, + "loss": 0.0529, "step": 469 }, { - "epoch": 5.595238095238095, - "grad_norm": 2.9947378953143864, - "learning_rate": 1.1584984968492267e-05, - "loss": 0.0379, + "epoch": 7.899159663865547, + "grad_norm": 1.8934498235843238, + "learning_rate": 5.37461709759165e-06, + "loss": 0.0541, "step": 470 }, { - "epoch": 5.607142857142857, - "grad_norm": 3.5361493539488844, - "learning_rate": 1.155322779660628e-05, - "loss": 0.2285, + "epoch": 7.915966386554622, + "grad_norm": 2.3477637506052935, + "learning_rate": 5.334063207984983e-06, + "loss": 0.091, "step": 471 }, { - "epoch": 5.619047619047619, - "grad_norm": 1.4994769810175483, - "learning_rate": 1.152145456472558e-05, - "loss": 0.1222, + "epoch": 7.932773109243698, + "grad_norm": 2.068442722717763, + "learning_rate": 5.2936071748620386e-06, + "loss": 0.0659, "step": 472 }, { - "epoch": 5.630952380952381, - "grad_norm": 1.751393424685345, - "learning_rate": 1.1489665601377603e-05, - "loss": 0.0892, + "epoch": 7.949579831932773, + "grad_norm": 1.5583233740963265, + "learning_rate": 5.253249846688053e-06, + "loss": 0.0582, "step": 473 }, { - "epoch": 5.642857142857143, - "grad_norm": 0.9029221853359657, - "learning_rate": 1.1457861235252436e-05, - "loss": 0.1103, + "epoch": 7.966386554621849, + "grad_norm": 1.3210668874752445, + "learning_rate": 5.21299206985816e-06, + "loss": 0.0457, "step": 474 }, { - "epoch": 5.654761904761905, - "grad_norm": 1.9581770337649325, - "learning_rate": 1.1426041795199434e-05, - "loss": 0.1159, + "epoch": 7.983193277310924, + "grad_norm": 1.8720045795583338, + "learning_rate": 5.172834688679665e-06, + "loss": 0.0485, "step": 475 }, { - "epoch": 5.666666666666667, - "grad_norm": 6.706135908789148, - "learning_rate": 1.1394207610223802e-05, - "loss": 0.1742, + "epoch": 8.0, + "grad_norm": 1.717240919628548, + "learning_rate": 5.132778545354305e-06, + "loss": 0.0528, "step": 476 }, { - "epoch": 5.678571428571429, - "grad_norm": 2.630766596113569, - "learning_rate": 1.1362359009483213e-05, - "loss": 0.0714, - "step": 477 + "epoch": 8.0, + "eval_loss": 0.09206734597682953, + "eval_runtime": 95.9403, + "eval_samples_per_second": 0.98, + "eval_steps_per_second": 0.98, + "step": 476 }, { - "epoch": 5.690476190476191, - "grad_norm": 2.091897523457701, - "learning_rate": 1.1330496322284392e-05, - "loss": 0.0672, + "epoch": 8.016806722689076, + "grad_norm": 1.5169536755225592, + "learning_rate": 5.092824479960625e-06, + "loss": 0.0474, + "step": 477 + }, + { + "epoch": 8.033613445378151, + "grad_norm": 1.7357799184112437, + "learning_rate": 5.0529733304363145e-06, + "loss": 0.0431, "step": 478 }, { - "epoch": 5.7023809523809526, - "grad_norm": 2.1214408759543155, - "learning_rate": 1.1298619878079713e-05, - "loss": 0.0767, + "epoch": 8.050420168067227, + "grad_norm": 1.3870139040501723, + "learning_rate": 5.013225932560679e-06, + "loss": 0.0423, "step": 479 }, { - "epoch": 5.714285714285714, - "grad_norm": 5.6326738420170495, - "learning_rate": 1.1266730006463797e-05, - "loss": 0.0673, + "epoch": 8.067226890756302, + "grad_norm": 3.302734792491234, + "learning_rate": 4.973583119937072e-06, + "loss": 0.0655, "step": 480 }, { - "epoch": 5.726190476190476, - "grad_norm": 2.5553857823512955, - "learning_rate": 1.1234827037170101e-05, - "loss": 0.0799, + "epoch": 8.084033613445378, + "grad_norm": 1.8227280505702954, + "learning_rate": 4.934045723975441e-06, + "loss": 0.0731, "step": 481 }, { - "epoch": 5.738095238095238, - "grad_norm": 1.3221943520824615, - "learning_rate": 1.1202911300067508e-05, - "loss": 0.0491, + "epoch": 8.100840336134453, + "grad_norm": 1.3256962048722176, + "learning_rate": 4.894614573874877e-06, + "loss": 0.0432, "step": 482 }, { - "epoch": 5.75, - "grad_norm": 1.3244889662711417, - "learning_rate": 1.1170983125156913e-05, - "loss": 0.1125, + "epoch": 8.117647058823529, + "grad_norm": 1.427387472053757, + "learning_rate": 4.85529049660623e-06, + "loss": 0.0549, "step": 483 }, { - "epoch": 5.761904761904762, - "grad_norm": 1.6046163371636937, - "learning_rate": 1.1139042842567824e-05, - "loss": 0.07, + "epoch": 8.134453781512605, + "grad_norm": 1.722178796219511, + "learning_rate": 4.81607431689475e-06, + "loss": 0.057, "step": 484 }, { - "epoch": 5.773809523809524, - "grad_norm": 1.70434051577756, - "learning_rate": 1.110709078255493e-05, - "loss": 0.0972, + "epoch": 8.15126050420168, + "grad_norm": 1.4946268224043258, + "learning_rate": 4.776966857202816e-06, + "loss": 0.0501, "step": 485 }, { - "epoch": 5.785714285714286, - "grad_norm": 1.731299719780292, - "learning_rate": 1.1075127275494704e-05, - "loss": 0.0555, + "epoch": 8.168067226890756, + "grad_norm": 1.4410674537931873, + "learning_rate": 4.737968937712674e-06, + "loss": 0.0472, "step": 486 }, { - "epoch": 5.7976190476190474, - "grad_norm": 6.126806212414693, - "learning_rate": 1.1043152651881972e-05, - "loss": 0.0715, + "epoch": 8.184873949579831, + "grad_norm": 4.079218048537049, + "learning_rate": 4.699081376309218e-06, + "loss": 0.0669, "step": 487 }, { - "epoch": 5.809523809523809, - "grad_norm": 5.169619165945196, - "learning_rate": 1.1011167242326504e-05, - "loss": 0.0784, + "epoch": 8.201680672268907, + "grad_norm": 2.445073900654683, + "learning_rate": 4.660304988562877e-06, + "loss": 0.0642, "step": 488 }, { - "epoch": 5.821428571428571, - "grad_norm": 2.7914135139256384, - "learning_rate": 1.0979171377549595e-05, - "loss": 0.1272, + "epoch": 8.218487394957982, + "grad_norm": 6.718089966341145, + "learning_rate": 4.621640587712468e-06, + "loss": 0.1271, "step": 489 }, { - "epoch": 5.833333333333333, - "grad_norm": 2.3959930088185635, - "learning_rate": 1.0947165388380646e-05, - "loss": 0.0279, + "epoch": 8.235294117647058, + "grad_norm": 2.8570358473448256, + "learning_rate": 4.583088984648172e-06, + "loss": 0.0629, "step": 490 }, { - "epoch": 5.845238095238095, - "grad_norm": 1.035178074469445, - "learning_rate": 1.091514960575374e-05, - "loss": 0.0466, + "epoch": 8.252100840336134, + "grad_norm": 1.9274312241325455, + "learning_rate": 4.544650987894514e-06, + "loss": 0.0367, "step": 491 }, { - "epoch": 5.857142857142857, - "grad_norm": 3.400565752147332, - "learning_rate": 1.088312436070422e-05, - "loss": 0.1819, + "epoch": 8.268907563025211, + "grad_norm": 1.7166820491309733, + "learning_rate": 4.5063274035934016e-06, + "loss": 0.0451, "step": 492 }, { - "epoch": 5.869047619047619, - "grad_norm": 0.9053402187508278, - "learning_rate": 1.0851089984365272e-05, - "loss": 0.0552, + "epoch": 8.285714285714286, + "grad_norm": 1.3196462720335975, + "learning_rate": 4.468119035487231e-06, + "loss": 0.0451, "step": 493 }, { - "epoch": 5.880952380952381, - "grad_norm": 3.403235950315029, - "learning_rate": 1.0819046807964495e-05, - "loss": 0.1056, + "epoch": 8.302521008403362, + "grad_norm": 1.5329980369426424, + "learning_rate": 4.430026684902017e-06, + "loss": 0.0382, "step": 494 }, { - "epoch": 5.892857142857143, - "grad_norm": 1.8344860921030868, - "learning_rate": 1.078699516282048e-05, - "loss": 0.0865, + "epoch": 8.319327731092438, + "grad_norm": 1.7844728493839146, + "learning_rate": 4.392051150730602e-06, + "loss": 0.0346, "step": 495 }, { - "epoch": 5.904761904761905, - "grad_norm": 2.4533834665154806, - "learning_rate": 1.075493538033938e-05, - "loss": 0.0199, + "epoch": 8.336134453781513, + "grad_norm": 1.233877897725148, + "learning_rate": 4.354193229415882e-06, + "loss": 0.0347, "step": 496 }, { - "epoch": 5.916666666666667, - "grad_norm": 0.8203250859689373, - "learning_rate": 1.0722867792011486e-05, - "loss": 0.0477, + "epoch": 8.352941176470589, + "grad_norm": 1.0911759571068154, + "learning_rate": 4.3164537149341246e-06, + "loss": 0.0534, "step": 497 }, { - "epoch": 5.928571428571429, - "grad_norm": 1.8912690142583974, - "learning_rate": 1.0690792729407807e-05, - "loss": 0.1064, + "epoch": 8.369747899159664, + "grad_norm": 2.1095342838945093, + "learning_rate": 4.278833398778306e-06, + "loss": 0.0531, "step": 498 }, { - "epoch": 5.940476190476191, - "grad_norm": 2.468608525491094, - "learning_rate": 1.0658710524176623e-05, - "loss": 0.1368, + "epoch": 8.38655462184874, + "grad_norm": 2.3616676446482647, + "learning_rate": 4.241333069941503e-06, + "loss": 0.0569, "step": 499 }, { - "epoch": 5.9523809523809526, - "grad_norm": 1.6228727525493614, - "learning_rate": 1.0626621508040074e-05, - "loss": 0.1318, + "epoch": 8.403361344537815, + "grad_norm": 1.3977212089055078, + "learning_rate": 4.203953514900366e-06, + "loss": 0.0353, "step": 500 }, { - "epoch": 5.964285714285714, - "grad_norm": 3.5639251749513496, - "learning_rate": 1.059452601279072e-05, - "loss": 0.0895, + "epoch": 8.420168067226891, + "grad_norm": 2.6496004578015575, + "learning_rate": 4.166695517598611e-06, + "loss": 0.1042, "step": 501 }, { - "epoch": 5.976190476190476, - "grad_norm": 1.5178172676340558, - "learning_rate": 1.056242437028812e-05, - "loss": 0.1341, + "epoch": 8.436974789915967, + "grad_norm": 2.06089201000074, + "learning_rate": 4.129559859430573e-06, + "loss": 0.0968, "step": 502 }, { - "epoch": 5.988095238095238, - "grad_norm": 1.1893273945201521, - "learning_rate": 1.0530316912455387e-05, - "loss": 0.0256, + "epoch": 8.453781512605042, + "grad_norm": 1.826445704568566, + "learning_rate": 4.092547319224837e-06, + "loss": 0.0516, "step": 503 }, { - "epoch": 6.0, - "grad_norm": 3.9672848129665574, - "learning_rate": 1.0498203971275762e-05, - "loss": 0.1522, - "step": 504 - }, - { - "epoch": 6.0, - "eval_loss": 0.1508193165063858, - "eval_runtime": 38.2578, - "eval_samples_per_second": 1.568, - "eval_steps_per_second": 1.568, + "epoch": 8.470588235294118, + "grad_norm": 3.8412502857797195, + "learning_rate": 4.055658673227891e-06, + "loss": 0.0884, "step": 504 }, { - "epoch": 6.011904761904762, - "grad_norm": 1.6644304828026015, - "learning_rate": 1.046608587878919e-05, - "loss": 0.1135, + "epoch": 8.487394957983193, + "grad_norm": 0.9584236905314262, + "learning_rate": 4.01889469508784e-06, + "loss": 0.0249, "step": 505 }, { - "epoch": 6.023809523809524, - "grad_norm": 1.6379161599185448, - "learning_rate": 1.0433962967088871e-05, - "loss": 0.0236, + "epoch": 8.504201680672269, + "grad_norm": 3.291735033394008, + "learning_rate": 3.982256155838199e-06, + "loss": 0.0854, "step": 506 }, { - "epoch": 6.035714285714286, - "grad_norm": 2.632685664860106, - "learning_rate": 1.0401835568317842e-05, - "loss": 0.1443, + "epoch": 8.521008403361344, + "grad_norm": 4.532957288732598, + "learning_rate": 3.945743823881713e-06, + "loss": 0.0629, "step": 507 }, { - "epoch": 6.0476190476190474, - "grad_norm": 0.8860899184377556, - "learning_rate": 1.036970401466553e-05, - "loss": 0.0727, + "epoch": 8.53781512605042, + "grad_norm": 4.273353415832438, + "learning_rate": 3.909358464974228e-06, + "loss": 0.0624, "step": 508 }, { - "epoch": 6.059523809523809, - "grad_norm": 2.23073958875819, - "learning_rate": 1.0337568638364322e-05, - "loss": 0.0933, + "epoch": 8.554621848739496, + "grad_norm": 1.437659985078597, + "learning_rate": 3.873100842208661e-06, + "loss": 0.044, "step": 509 }, { - "epoch": 6.071428571428571, - "grad_norm": 5.244025160546965, - "learning_rate": 1.0305429771686135e-05, - "loss": 0.1129, + "epoch": 8.571428571428571, + "grad_norm": 2.6929955709108584, + "learning_rate": 3.836971715998968e-06, + "loss": 0.0598, "step": 510 }, { - "epoch": 6.083333333333333, - "grad_norm": 2.2806324713435027, - "learning_rate": 1.0273287746938974e-05, - "loss": 0.1352, + "epoch": 8.588235294117647, + "grad_norm": 1.9024448417705355, + "learning_rate": 3.8009718440642128e-06, + "loss": 0.0603, "step": 511 }, { - "epoch": 6.095238095238095, - "grad_norm": 3.734983785509321, - "learning_rate": 1.0241142896463492e-05, - "loss": 0.094, + "epoch": 8.605042016806722, + "grad_norm": 0.7656090871294385, + "learning_rate": 3.7651019814126656e-06, + "loss": 0.0271, "step": 512 }, { - "epoch": 6.107142857142857, - "grad_norm": 1.488803778322093, - "learning_rate": 1.0208995552629568e-05, - "loss": 0.0475, + "epoch": 8.621848739495798, + "grad_norm": 1.498792756768189, + "learning_rate": 3.729362880325983e-06, + "loss": 0.0406, "step": 513 }, { - "epoch": 6.119047619047619, - "grad_norm": 2.402837670487696, - "learning_rate": 1.0176846047832855e-05, - "loss": 0.1298, + "epoch": 8.638655462184873, + "grad_norm": 1.914650360335185, + "learning_rate": 3.693755290343409e-06, + "loss": 0.0538, "step": 514 }, { - "epoch": 6.130952380952381, - "grad_norm": 2.0357587086107087, - "learning_rate": 1.0144694714491356e-05, - "loss": 0.0564, + "epoch": 8.655462184873949, + "grad_norm": 1.097214252517178, + "learning_rate": 3.658279958246075e-06, + "loss": 0.0381, "step": 515 }, { - "epoch": 6.142857142857143, - "grad_norm": 3.109809726164325, - "learning_rate": 1.0112541885041973e-05, - "loss": 0.068, + "epoch": 8.672268907563025, + "grad_norm": 2.861008119753485, + "learning_rate": 3.622937628041334e-06, + "loss": 0.0808, "step": 516 }, { - "epoch": 6.154761904761905, - "grad_norm": 2.0837285046565293, - "learning_rate": 1.0080387891937085e-05, - "loss": 0.1077, + "epoch": 8.6890756302521, + "grad_norm": 1.9341336610200206, + "learning_rate": 3.587729040947141e-06, + "loss": 0.0343, "step": 517 }, { - "epoch": 6.166666666666667, - "grad_norm": 1.6180323000868266, - "learning_rate": 1.0048233067641098e-05, - "loss": 0.0779, + "epoch": 8.705882352941176, + "grad_norm": 2.42481473153521, + "learning_rate": 3.5526549353765294e-06, + "loss": 0.1077, "step": 518 }, { - "epoch": 6.178571428571429, - "grad_norm": 1.9200465377373805, - "learning_rate": 1.0016077744627012e-05, - "loss": 0.0946, + "epoch": 8.722689075630251, + "grad_norm": 2.4065694655289973, + "learning_rate": 3.5177160469221184e-06, + "loss": 0.0641, "step": 519 }, { - "epoch": 6.190476190476191, - "grad_norm": 2.4780925220259737, - "learning_rate": 9.983922255372991e-06, - "loss": 0.0664, + "epoch": 8.739495798319329, + "grad_norm": 1.8534338905199441, + "learning_rate": 3.4829131083406684e-06, + "loss": 0.0529, "step": 520 }, { - "epoch": 6.2023809523809526, - "grad_norm": 1.269298365112061, - "learning_rate": 9.951766932358907e-06, - "loss": 0.1073, + "epoch": 8.756302521008404, + "grad_norm": 2.072202396225646, + "learning_rate": 3.448246849537741e-06, + "loss": 0.0565, "step": 521 }, { - "epoch": 6.214285714285714, - "grad_norm": 2.1626917897691436, - "learning_rate": 9.919612108062919e-06, - "loss": 0.0672, + "epoch": 8.77310924369748, + "grad_norm": 1.5479608735352894, + "learning_rate": 3.413717997552376e-06, + "loss": 0.0411, "step": 522 }, { - "epoch": 6.226190476190476, - "grad_norm": 3.8302257883114357, - "learning_rate": 9.887458114958032e-06, - "loss": 0.158, + "epoch": 8.789915966386555, + "grad_norm": 2.6939787148898016, + "learning_rate": 3.379327276541834e-06, + "loss": 0.0722, "step": 523 }, { - "epoch": 6.238095238095238, - "grad_norm": 1.469046690940951, - "learning_rate": 9.855305285508649e-06, - "loss": 0.0456, + "epoch": 8.806722689075631, + "grad_norm": 1.5450816630775992, + "learning_rate": 3.3450754077664337e-06, + "loss": 0.0544, "step": 524 }, { - "epoch": 6.25, - "grad_norm": 2.750101219167286, - "learning_rate": 9.823153952167148e-06, - "loss": 0.0828, + "epoch": 8.823529411764707, + "grad_norm": 1.5918689109996174, + "learning_rate": 3.310963109574408e-06, + "loss": 0.0407, "step": 525 }, { - "epoch": 6.261904761904762, - "grad_norm": 1.8402371967805808, - "learning_rate": 9.791004447370439e-06, - "loss": 0.0953, + "epoch": 8.840336134453782, + "grad_norm": 1.5593134309915293, + "learning_rate": 3.2769910973868314e-06, + "loss": 0.0418, "step": 526 }, { - "epoch": 6.273809523809524, - "grad_norm": 3.048460879374815, - "learning_rate": 9.758857103536513e-06, - "loss": 0.0729, + "epoch": 8.857142857142858, + "grad_norm": 1.9817294913123196, + "learning_rate": 3.243160083682645e-06, + "loss": 0.0529, "step": 527 }, { - "epoch": 6.285714285714286, - "grad_norm": 1.6638375884973138, - "learning_rate": 9.726712253061031e-06, - "loss": 0.0628, + "epoch": 8.873949579831933, + "grad_norm": 1.695431393346344, + "learning_rate": 3.209470777983675e-06, + "loss": 0.0488, "step": 528 }, { - "epoch": 6.2976190476190474, - "grad_norm": 1.6689888226784142, - "learning_rate": 9.694570228313868e-06, - "loss": 0.0589, + "epoch": 8.890756302521009, + "grad_norm": 1.8840574831179149, + "learning_rate": 3.1759238868397925e-06, + "loss": 0.0529, "step": 529 }, { - "epoch": 6.309523809523809, - "grad_norm": 1.6801934021210152, - "learning_rate": 9.662431361635681e-06, - "loss": 0.061, + "epoch": 8.907563025210084, + "grad_norm": 5.952373137687765, + "learning_rate": 3.1425201138140592e-06, + "loss": 0.0808, "step": 530 }, { - "epoch": 6.321428571428571, - "grad_norm": 1.2262908116444213, - "learning_rate": 9.630295985334474e-06, - "loss": 0.0916, + "epoch": 8.92436974789916, + "grad_norm": 1.290713251381066, + "learning_rate": 3.1092601594679993e-06, + "loss": 0.0439, "step": 531 }, { - "epoch": 6.333333333333333, - "grad_norm": 1.5047283215622451, - "learning_rate": 9.598164431682161e-06, - "loss": 0.0847, + "epoch": 8.941176470588236, + "grad_norm": 0.9948068099144867, + "learning_rate": 3.0761447213468888e-06, + "loss": 0.0405, "step": 532 }, { - "epoch": 6.345238095238095, - "grad_norm": 1.8569321486125632, - "learning_rate": 9.56603703291113e-06, - "loss": 0.0703, + "epoch": 8.957983193277311, + "grad_norm": 1.9536076330602832, + "learning_rate": 3.0431744939651365e-06, + "loss": 0.0538, "step": 533 }, { - "epoch": 6.357142857142857, - "grad_norm": 2.3223165705202584, - "learning_rate": 9.533914121210813e-06, - "loss": 0.0141, + "epoch": 8.974789915966387, + "grad_norm": 1.6523207102197872, + "learning_rate": 3.0103501687917192e-06, + "loss": 0.0544, "step": 534 }, { - "epoch": 6.369047619047619, - "grad_norm": 1.947523583239929, - "learning_rate": 9.501796028724241e-06, - "loss": 0.1206, + "epoch": 8.991596638655462, + "grad_norm": 0.9746183785405975, + "learning_rate": 2.9776724342356654e-06, + "loss": 0.0531, + "step": 535 + }, + { + "epoch": 8.991596638655462, + "eval_loss": 0.09362534433603287, + "eval_runtime": 84.973, + "eval_samples_per_second": 1.106, + "eval_steps_per_second": 1.106, "step": 535 }, { - "epoch": 6.380952380952381, - "grad_norm": 1.3333488170380448, - "learning_rate": 9.469683087544616e-06, - "loss": 0.071, + "epoch": 9.008403361344538, + "grad_norm": 3.1103066899739282, + "learning_rate": 2.945141975631637e-06, + "loss": 0.0553, "step": 536 }, { - "epoch": 6.392857142857143, - "grad_norm": 1.0684885808241553, - "learning_rate": 9.437575629711883e-06, - "loss": 0.0483, + "epoch": 9.025210084033613, + "grad_norm": 2.3282836310480435, + "learning_rate": 2.912759475225546e-06, + "loss": 0.0532, "step": 537 }, { - "epoch": 6.404761904761905, - "grad_norm": 1.5479791429558227, - "learning_rate": 9.405473987209283e-06, - "loss": 0.1061, + "epoch": 9.042016806722689, + "grad_norm": 1.1197327667128945, + "learning_rate": 2.88052561216024e-06, + "loss": 0.0308, "step": 538 }, { - "epoch": 6.416666666666667, - "grad_norm": 2.06718170775221, - "learning_rate": 9.37337849195993e-06, - "loss": 0.101, + "epoch": 9.058823529411764, + "grad_norm": 1.4098107198655643, + "learning_rate": 2.8484410624612744e-06, + "loss": 0.0364, "step": 539 }, { - "epoch": 6.428571428571429, - "grad_norm": 2.5758094309956165, - "learning_rate": 9.34128947582338e-06, - "loss": 0.0697, + "epoch": 9.07563025210084, + "grad_norm": 4.438658787202018, + "learning_rate": 2.8165064990227255e-06, + "loss": 0.0561, "step": 540 }, { - "epoch": 6.440476190476191, - "grad_norm": 1.585812534614295, - "learning_rate": 9.309207270592196e-06, - "loss": 0.0768, + "epoch": 9.092436974789916, + "grad_norm": 1.9493994527119616, + "learning_rate": 2.7847225915930697e-06, + "loss": 0.0601, "step": 541 }, { - "epoch": 6.4523809523809526, - "grad_norm": 1.9676744482847963, - "learning_rate": 9.277132207988516e-06, - "loss": 0.0691, + "epoch": 9.109243697478991, + "grad_norm": 1.8527667273461481, + "learning_rate": 2.7530900067611577e-06, + "loss": 0.0531, "step": 542 }, { - "epoch": 6.464285714285714, - "grad_norm": 1.5053512527538524, - "learning_rate": 9.245064619660624e-06, - "loss": 0.0825, + "epoch": 9.126050420168067, + "grad_norm": 1.701462207843824, + "learning_rate": 2.7216094079422185e-06, + "loss": 0.0472, "step": 543 }, { - "epoch": 6.476190476190476, - "grad_norm": 2.034981803501994, - "learning_rate": 9.213004837179524e-06, - "loss": 0.0244, + "epoch": 9.142857142857142, + "grad_norm": 1.8914078563116579, + "learning_rate": 2.6902814553639443e-06, + "loss": 0.0551, "step": 544 }, { - "epoch": 6.488095238095238, - "grad_norm": 0.9575306509567822, - "learning_rate": 9.180953192035507e-06, - "loss": 0.0468, + "epoch": 9.159663865546218, + "grad_norm": 2.591885452858992, + "learning_rate": 2.6591068060526626e-06, + "loss": 0.0395, "step": 545 }, { - "epoch": 6.5, - "grad_norm": 1.295046452458436, - "learning_rate": 9.14891001563473e-06, - "loss": 0.0384, + "epoch": 9.176470588235293, + "grad_norm": 3.074064642620058, + "learning_rate": 2.62808611381953e-06, + "loss": 0.0619, "step": 546 }, { - "epoch": 6.511904761904762, - "grad_norm": 2.0081622877090144, - "learning_rate": 9.116875639295783e-06, - "loss": 0.0507, + "epoch": 9.193277310924369, + "grad_norm": 1.6148671787143798, + "learning_rate": 2.597220029246846e-06, + "loss": 0.0369, "step": 547 }, { - "epoch": 6.523809523809524, - "grad_norm": 1.6714670350427359, - "learning_rate": 9.084850394246262e-06, - "loss": 0.0184, + "epoch": 9.210084033613445, + "grad_norm": 5.145830985825016, + "learning_rate": 2.5665091996743898e-06, + "loss": 0.0912, "step": 548 }, { - "epoch": 6.535714285714286, - "grad_norm": 1.3524423408754032, - "learning_rate": 9.052834611619354e-06, - "loss": 0.0917, + "epoch": 9.22689075630252, + "grad_norm": 1.1197523582174669, + "learning_rate": 2.5359542691858542e-06, + "loss": 0.0411, "step": 549 }, { - "epoch": 6.5476190476190474, - "grad_norm": 1.208211246351009, - "learning_rate": 9.020828622450406e-06, - "loss": 0.0188, + "epoch": 9.243697478991596, + "grad_norm": 1.5754449262763028, + "learning_rate": 2.5055558785953304e-06, + "loss": 0.0502, "step": 550 }, { - "epoch": 6.559523809523809, - "grad_norm": 0.810515144898437, - "learning_rate": 8.988832757673497e-06, - "loss": 0.0625, + "epoch": 9.260504201680673, + "grad_norm": 2.843721070657444, + "learning_rate": 2.4753146654338765e-06, + "loss": 0.0835, "step": 551 }, { - "epoch": 6.571428571428571, - "grad_norm": 1.4738861513691155, - "learning_rate": 8.95684734811803e-06, - "loss": 0.0905, + "epoch": 9.277310924369749, + "grad_norm": 2.2196607101882817, + "learning_rate": 2.4452312639361462e-06, + "loss": 0.0465, "step": 552 }, { - "epoch": 6.583333333333333, - "grad_norm": 1.8126992202021779, - "learning_rate": 8.924872724505296e-06, - "loss": 0.0939, + "epoch": 9.294117647058824, + "grad_norm": 2.166755095388008, + "learning_rate": 2.415306305027072e-06, + "loss": 0.0421, "step": 553 }, { - "epoch": 6.595238095238095, - "grad_norm": 1.5254885661369932, - "learning_rate": 8.892909217445069e-06, - "loss": 0.0644, + "epoch": 9.3109243697479, + "grad_norm": 3.8956621071776945, + "learning_rate": 2.3855404163086558e-06, + "loss": 0.0456, "step": 554 }, { - "epoch": 6.607142857142857, - "grad_norm": 0.9610399303180617, - "learning_rate": 8.860957157432177e-06, - "loss": 0.0541, + "epoch": 9.327731092436975, + "grad_norm": 1.198392921249873, + "learning_rate": 2.355934222046794e-06, + "loss": 0.0368, "step": 555 }, { - "epoch": 6.619047619047619, - "grad_norm": 1.0851393239445002, - "learning_rate": 8.829016874843089e-06, - "loss": 0.0593, + "epoch": 9.344537815126051, + "grad_norm": 2.081788407790618, + "learning_rate": 2.32648834315818e-06, + "loss": 0.0742, "step": 556 }, { - "epoch": 6.630952380952381, - "grad_norm": 1.7224098343060739, - "learning_rate": 8.797088699932494e-06, - "loss": 0.0164, + "epoch": 9.361344537815127, + "grad_norm": 1.8547943707777044, + "learning_rate": 2.2972033971972953e-06, + "loss": 0.0404, "step": 557 }, { - "epoch": 6.642857142857143, - "grad_norm": 0.9697818551837348, - "learning_rate": 8.7651729628299e-06, - "loss": 0.0422, + "epoch": 9.378151260504202, + "grad_norm": 2.06348184199221, + "learning_rate": 2.2680799983434532e-06, + "loss": 0.0362, "step": 558 }, { - "epoch": 6.654761904761905, - "grad_norm": 3.3017631401846903, - "learning_rate": 8.733269993536208e-06, - "loss": 0.1914, + "epoch": 9.394957983193278, + "grad_norm": 1.6015036795043573, + "learning_rate": 2.239118757387907e-06, + "loss": 0.0461, "step": 559 }, { - "epoch": 6.666666666666667, - "grad_norm": 2.032789558563978, - "learning_rate": 8.701380121920292e-06, - "loss": 0.043, + "epoch": 9.411764705882353, + "grad_norm": 2.500319456131113, + "learning_rate": 2.2103202817210555e-06, + "loss": 0.0436, "step": 560 }, { - "epoch": 6.678571428571429, - "grad_norm": 0.8515648745590835, - "learning_rate": 8.669503677715614e-06, - "loss": 0.0083, + "epoch": 9.428571428571429, + "grad_norm": 2.5704589340698036, + "learning_rate": 2.1816851753197023e-06, + "loss": 0.0797, "step": 561 }, { - "epoch": 6.690476190476191, - "grad_norm": 0.671994580593515, - "learning_rate": 8.637640990516793e-06, - "loss": 0.0473, + "epoch": 9.445378151260504, + "grad_norm": 1.1623903030627807, + "learning_rate": 2.1532140387343736e-06, + "loss": 0.0434, "step": 562 }, { - "epoch": 6.7023809523809526, - "grad_norm": 2.666195404252126, - "learning_rate": 8.605792389776203e-06, - "loss": 0.1427, + "epoch": 9.46218487394958, + "grad_norm": 1.7348909728003972, + "learning_rate": 2.1249074690767434e-06, + "loss": 0.0513, "step": 563 }, { - "epoch": 6.714285714285714, - "grad_norm": 1.763396796049088, - "learning_rate": 8.573958204800572e-06, - "loss": 0.0838, + "epoch": 9.478991596638656, + "grad_norm": 1.443247675887106, + "learning_rate": 2.096766060007096e-06, + "loss": 0.0436, "step": 564 }, { - "epoch": 6.726190476190476, - "grad_norm": 1.246603953754217, - "learning_rate": 8.542138764747565e-06, - "loss": 0.0616, + "epoch": 9.495798319327731, + "grad_norm": 1.719154980437653, + "learning_rate": 2.068790401721886e-06, + "loss": 0.0473, "step": 565 }, { - "epoch": 6.738095238095238, - "grad_norm": 1.203406526414491, - "learning_rate": 8.5103343986224e-06, - "loss": 0.0148, + "epoch": 9.512605042016807, + "grad_norm": 1.3479219484263167, + "learning_rate": 2.040981080941349e-06, + "loss": 0.0403, "step": 566 }, { - "epoch": 6.75, - "grad_norm": 1.950964925182016, - "learning_rate": 8.478545435274424e-06, - "loss": 0.0868, + "epoch": 9.529411764705882, + "grad_norm": 1.7448556207352623, + "learning_rate": 2.013338680897209e-06, + "loss": 0.0374, "step": 567 }, { - "epoch": 6.761904761904762, - "grad_norm": 0.8066526898246336, - "learning_rate": 8.446772203393726e-06, - "loss": 0.0531, + "epoch": 9.546218487394958, + "grad_norm": 1.778610985288892, + "learning_rate": 1.9858637813204352e-06, + "loss": 0.0481, "step": 568 }, { - "epoch": 6.773809523809524, - "grad_norm": 1.7913521121040872, - "learning_rate": 8.415015031507734e-06, - "loss": 0.0714, + "epoch": 9.563025210084033, + "grad_norm": 1.023079090275057, + "learning_rate": 1.958556958429092e-06, + "loss": 0.043, "step": 569 }, { - "epoch": 6.785714285714286, - "grad_norm": 0.9075804833918106, - "learning_rate": 8.383274247977827e-06, - "loss": 0.0437, + "epoch": 9.579831932773109, + "grad_norm": 2.2504655920316927, + "learning_rate": 1.9314187849162523e-06, + "loss": 0.104, "step": 570 }, { - "epoch": 6.7976190476190474, - "grad_norm": 4.654542604347542, - "learning_rate": 8.35155018099592e-06, - "loss": 0.1748, + "epoch": 9.596638655462185, + "grad_norm": 4.623708198206026, + "learning_rate": 1.904449829937981e-06, + "loss": 0.0878, "step": 571 }, { - "epoch": 6.809523809523809, - "grad_norm": 3.177641851084243, - "learning_rate": 8.319843158581092e-06, - "loss": 0.1297, + "epoch": 9.61344537815126, + "grad_norm": 1.1241851785983472, + "learning_rate": 1.8776506591014054e-06, + "loss": 0.0358, "step": 572 }, { - "epoch": 6.821428571428571, - "grad_norm": 1.9316344815684265, - "learning_rate": 8.288153508576186e-06, - "loss": 0.0958, + "epoch": 9.630252100840336, + "grad_norm": 1.9153117494048388, + "learning_rate": 1.851021834452853e-06, + "loss": 0.0523, "step": 573 }, { - "epoch": 6.833333333333333, - "grad_norm": 4.63072501048228, - "learning_rate": 8.256481558644412e-06, - "loss": 0.0768, + "epoch": 9.647058823529411, + "grad_norm": 4.390748395794137, + "learning_rate": 1.8245639144660532e-06, + "loss": 0.0744, "step": 574 }, { - "epoch": 6.845238095238095, - "grad_norm": 8.04435552149917, - "learning_rate": 8.224827636265971e-06, - "loss": 0.0355, + "epoch": 9.663865546218487, + "grad_norm": 2.7441495241393783, + "learning_rate": 1.7982774540304404e-06, + "loss": 0.0492, "step": 575 }, { - "epoch": 6.857142857142857, - "grad_norm": 2.840529961154322, - "learning_rate": 8.19319206873466e-06, - "loss": 0.0673, + "epoch": 9.680672268907562, + "grad_norm": 1.3900528175419058, + "learning_rate": 1.772163004439511e-06, + "loss": 0.0479, "step": 576 }, { - "epoch": 6.869047619047619, - "grad_norm": 1.4798999016311165, - "learning_rate": 8.161575183154495e-06, - "loss": 0.0858, + "epoch": 9.697478991596638, + "grad_norm": 2.77436372781934, + "learning_rate": 1.7462211133792484e-06, + "loss": 0.0621, "step": 577 }, { - "epoch": 6.880952380952381, - "grad_norm": 2.273599989764983, - "learning_rate": 8.129977306436326e-06, - "loss": 0.0953, + "epoch": 9.714285714285714, + "grad_norm": 1.560534629765185, + "learning_rate": 1.720452324916656e-06, + "loss": 0.0357, "step": 578 }, { - "epoch": 6.892857142857143, - "grad_norm": 4.560000854260947, - "learning_rate": 8.098398765294447e-06, - "loss": 0.1315, + "epoch": 9.731092436974789, + "grad_norm": 2.4373786388347294, + "learning_rate": 1.6948571794883406e-06, + "loss": 0.0497, "step": 579 }, { - "epoch": 6.904761904761905, - "grad_norm": 2.7031944822443634, - "learning_rate": 8.066839886243238e-06, - "loss": 0.0603, + "epoch": 9.747899159663866, + "grad_norm": 1.3388696128320787, + "learning_rate": 1.6694362138891674e-06, + "loss": 0.0409, "step": 580 }, { - "epoch": 6.916666666666667, - "grad_norm": 2.1777978934683584, - "learning_rate": 8.035300995593772e-06, - "loss": 0.1096, + "epoch": 9.764705882352942, + "grad_norm": 2.602632435071536, + "learning_rate": 1.6441899612610178e-06, + "loss": 0.0557, "step": 581 }, { - "epoch": 6.928571428571429, - "grad_norm": 1.7370224989930014, - "learning_rate": 8.003782419450448e-06, - "loss": 0.0487, + "epoch": 9.781512605042018, + "grad_norm": 2.241392316929053, + "learning_rate": 1.6191189510815942e-06, + "loss": 0.0852, "step": 582 }, { - "epoch": 6.940476190476191, - "grad_norm": 2.0369881375248435, - "learning_rate": 7.972284483707615e-06, - "loss": 0.1216, + "epoch": 9.798319327731093, + "grad_norm": 1.2305602460104232, + "learning_rate": 1.5942237091533297e-06, + "loss": 0.0424, "step": 583 }, { - "epoch": 6.9523809523809526, - "grad_norm": 1.5695162807965415, - "learning_rate": 7.940807514046211e-06, - "loss": 0.1012, + "epoch": 9.815126050420169, + "grad_norm": 1.2682290196019605, + "learning_rate": 1.5695047575923462e-06, + "loss": 0.0497, "step": 584 }, { - "epoch": 6.964285714285714, - "grad_norm": 1.9962936354689058, - "learning_rate": 7.909351835930389e-06, - "loss": 0.0667, + "epoch": 9.831932773109244, + "grad_norm": 3.1108653972627924, + "learning_rate": 1.5449626148175144e-06, + "loss": 0.0943, "step": 585 }, { - "epoch": 6.976190476190476, - "grad_norm": 2.495887351780799, - "learning_rate": 7.877917774604144e-06, - "loss": 0.1092, + "epoch": 9.84873949579832, + "grad_norm": 1.29875959685456, + "learning_rate": 1.5205977955395812e-06, + "loss": 0.0239, "step": 586 }, { - "epoch": 6.988095238095238, - "grad_norm": 1.3893141441816679, - "learning_rate": 7.846505655087972e-06, - "loss": 0.0481, + "epoch": 9.865546218487395, + "grad_norm": 1.255438999538065, + "learning_rate": 1.4964108107503638e-06, + "loss": 0.0432, "step": 587 }, { - "epoch": 7.0, - "grad_norm": 2.4723106201742917, - "learning_rate": 7.815115802175485e-06, - "loss": 0.0279, - "step": 588 - }, - { - "epoch": 7.0, - "eval_loss": 0.16396619379520416, - "eval_runtime": 38.7412, - "eval_samples_per_second": 1.549, - "eval_steps_per_second": 1.549, + "epoch": 9.882352941176471, + "grad_norm": 1.6849212493860068, + "learning_rate": 1.4724021677120491e-06, + "loss": 0.0498, "step": 588 }, { - "epoch": 7.011904761904762, - "grad_norm": 2.2356269896066165, - "learning_rate": 7.783748540430074e-06, - "loss": 0.0637, + "epoch": 9.899159663865547, + "grad_norm": 1.543073795782045, + "learning_rate": 1.4485723699465392e-06, + "loss": 0.0436, "step": 589 }, { - "epoch": 7.023809523809524, - "grad_norm": 12.593626155384655, - "learning_rate": 7.75240419418153e-06, - "loss": 0.1319, + "epoch": 9.915966386554622, + "grad_norm": 1.6326111794747336, + "learning_rate": 1.4249219172249051e-06, + "loss": 0.0448, "step": 590 }, { - "epoch": 7.035714285714286, - "grad_norm": 7.517415321784336, - "learning_rate": 7.721083087522717e-06, - "loss": 0.0776, + "epoch": 9.932773109243698, + "grad_norm": 1.6795029810703082, + "learning_rate": 1.4014513055568978e-06, + "loss": 0.0483, "step": 591 }, { - "epoch": 7.0476190476190474, - "grad_norm": 4.188123414846697, - "learning_rate": 7.689785544306194e-06, - "loss": 0.0452, + "epoch": 9.949579831932773, + "grad_norm": 5.509418938412195, + "learning_rate": 1.3781610271805436e-06, + "loss": 0.1002, "step": 592 }, { - "epoch": 7.059523809523809, - "grad_norm": 1.7616711514678394, - "learning_rate": 7.65851188814089e-06, - "loss": 0.0852, + "epoch": 9.966386554621849, + "grad_norm": 1.9231485974986042, + "learning_rate": 1.3550515705518263e-06, + "loss": 0.0406, "step": 593 }, { - "epoch": 7.071428571428571, - "grad_norm": 1.8987651197623507, - "learning_rate": 7.627262442388747e-06, - "loss": 0.0851, + "epoch": 9.983193277310924, + "grad_norm": 2.6221792896880918, + "learning_rate": 1.3321234203344435e-06, + "loss": 0.0369, "step": 594 }, { - "epoch": 7.083333333333333, - "grad_norm": 2.2298478849854337, - "learning_rate": 7.596037530161371e-06, - "loss": 0.065, + "epoch": 10.0, + "grad_norm": 3.378390284106787, + "learning_rate": 1.3093770573896369e-06, + "loss": 0.0475, + "step": 595 + }, + { + "epoch": 10.0, + "eval_loss": 0.08747600764036179, + "eval_runtime": 83.5907, + "eval_samples_per_second": 1.125, + "eval_steps_per_second": 1.125, "step": 595 }, { - "epoch": 7.095238095238095, - "grad_norm": 2.736235330587379, - "learning_rate": 7.5648374743167e-06, - "loss": 0.0301, + "epoch": 10.016806722689076, + "grad_norm": 1.340634522580302, + "learning_rate": 1.286812958766106e-06, + "loss": 0.0302, "step": 596 }, { - "epoch": 7.107142857142857, - "grad_norm": 0.9799891778907858, - "learning_rate": 7.533662597455667e-06, - "loss": 0.0379, + "epoch": 10.033613445378151, + "grad_norm": 1.836375096978306, + "learning_rate": 1.2644315976900145e-06, + "loss": 0.0546, "step": 597 }, { - "epoch": 7.119047619047619, - "grad_norm": 1.1354342827683341, - "learning_rate": 7.5025132219188615e-06, - "loss": 0.0629, + "epoch": 10.050420168067227, + "grad_norm": 4.930603999286428, + "learning_rate": 1.242233443555051e-06, + "loss": 0.0534, "step": 598 }, { - "epoch": 7.130952380952381, - "grad_norm": 4.503381375894526, - "learning_rate": 7.471389669783183e-06, - "loss": 0.0348, + "epoch": 10.067226890756302, + "grad_norm": 2.204329623971633, + "learning_rate": 1.220218961912597e-06, + "loss": 0.0588, "step": 599 }, { - "epoch": 7.142857142857143, - "grad_norm": 4.170613919008802, - "learning_rate": 7.440292262858533e-06, - "loss": 0.1561, + "epoch": 10.084033613445378, + "grad_norm": 3.9225927136224987, + "learning_rate": 1.1983886144619527e-06, + "loss": 0.0427, "step": 600 }, { - "epoch": 7.154761904761905, - "grad_norm": 1.7324596706185202, - "learning_rate": 7.409221322684481e-06, - "loss": 0.0623, + "epoch": 10.100840336134453, + "grad_norm": 1.5324167733429486, + "learning_rate": 1.1767428590406648e-06, + "loss": 0.0269, "step": 601 }, { - "epoch": 7.166666666666667, - "grad_norm": 8.391742154063005, - "learning_rate": 7.378177170526928e-06, - "loss": 0.0505, + "epoch": 10.117647058823529, + "grad_norm": 1.7250937116057505, + "learning_rate": 1.1552821496149136e-06, + "loss": 0.0535, "step": 602 }, { - "epoch": 7.178571428571429, - "grad_norm": 2.0982830668406427, - "learning_rate": 7.347160127374804e-06, - "loss": 0.1156, + "epoch": 10.134453781512605, + "grad_norm": 0.7987608814501375, + "learning_rate": 1.134006936269999e-06, + "loss": 0.0173, "step": 603 }, { - "epoch": 7.190476190476191, - "grad_norm": 2.9010341886525337, - "learning_rate": 7.31617051393673e-06, - "loss": 0.0472, + "epoch": 10.15126050420168, + "grad_norm": 2.1053285256762733, + "learning_rate": 1.1129176652009043e-06, + "loss": 0.0406, "step": 604 }, { - "epoch": 7.2023809523809526, - "grad_norm": 6.688428366168943, - "learning_rate": 7.285208650637712e-06, - "loss": 0.0577, + "epoch": 10.168067226890756, + "grad_norm": 2.1300541811924094, + "learning_rate": 1.0920147787029233e-06, + "loss": 0.0461, "step": 605 }, { - "epoch": 7.214285714285714, - "grad_norm": 3.8530997432495844, - "learning_rate": 7.2542748576158284e-06, - "loss": 0.1247, + "epoch": 10.184873949579831, + "grad_norm": 1.5136074758546754, + "learning_rate": 1.0712987151624056e-06, + "loss": 0.0433, "step": 606 }, { - "epoch": 7.226190476190476, - "grad_norm": 2.2161474037024718, - "learning_rate": 7.223369454718918e-06, - "loss": 0.0544, + "epoch": 10.201680672268907, + "grad_norm": 1.6819131803547542, + "learning_rate": 1.05076990904754e-06, + "loss": 0.0529, "step": 607 }, { - "epoch": 7.238095238095238, - "grad_norm": 2.3619986375880067, - "learning_rate": 7.19249276150127e-06, - "loss": 0.0982, + "epoch": 10.218487394957982, + "grad_norm": 3.0644142026231727, + "learning_rate": 1.0304287908992626e-06, + "loss": 0.0532, "step": 608 }, { - "epoch": 7.25, - "grad_norm": 1.9276942223040003, - "learning_rate": 7.161645097220324e-06, - "loss": 0.0893, + "epoch": 10.235294117647058, + "grad_norm": 1.59650378921116, + "learning_rate": 1.010275787322219e-06, + "loss": 0.0538, "step": 609 }, { - "epoch": 7.261904761904762, - "grad_norm": 2.1139529848798833, - "learning_rate": 7.130826780833368e-06, - "loss": 0.0864, + "epoch": 10.252100840336134, + "grad_norm": 1.5363947337082766, + "learning_rate": 9.903113209758098e-07, + "loss": 0.0428, "step": 610 }, { - "epoch": 7.273809523809524, - "grad_norm": 1.571395158740333, - "learning_rate": 7.100038130994242e-06, - "loss": 0.0806, + "epoch": 10.268907563025211, + "grad_norm": 1.5003741433198152, + "learning_rate": 9.705358105653373e-07, + "loss": 0.0322, "step": 611 }, { - "epoch": 7.285714285714286, - "grad_norm": 1.4697576676883244, - "learning_rate": 7.069279466050035e-06, - "loss": 0.0702, + "epoch": 10.285714285714286, + "grad_norm": 2.0150139046727453, + "learning_rate": 9.509496708332233e-07, + "loss": 0.0395, "step": 612 }, { - "epoch": 7.2976190476190474, - "grad_norm": 1.343916000854979, - "learning_rate": 7.0385511040378026e-06, - "loss": 0.0589, + "epoch": 10.302521008403362, + "grad_norm": 2.2695792176582885, + "learning_rate": 9.315533125503051e-07, + "loss": 0.0755, "step": 613 }, { - "epoch": 7.309523809523809, - "grad_norm": 2.8165274419800013, - "learning_rate": 7.007853362681282e-06, - "loss": 0.0978, + "epoch": 10.319327731092438, + "grad_norm": 2.7525380883317134, + "learning_rate": 9.123471425072205e-07, + "loss": 0.0816, "step": 614 }, { - "epoch": 7.321428571428571, - "grad_norm": 0.8192824022307189, - "learning_rate": 6.9771865593875895e-06, - "loss": 0.0091, + "epoch": 10.336134453781513, + "grad_norm": 1.9575555600743102, + "learning_rate": 8.933315635058881e-07, + "loss": 0.0444, "step": 615 }, { - "epoch": 7.333333333333333, - "grad_norm": 1.7629740150881354, - "learning_rate": 6.946551011243958e-06, - "loss": 0.0594, + "epoch": 10.352941176470589, + "grad_norm": 1.3340430154540783, + "learning_rate": 8.745069743510393e-07, + "loss": 0.0289, "step": 616 }, { - "epoch": 7.345238095238095, - "grad_norm": 1.7603693751148732, - "learning_rate": 6.915947035014443e-06, - "loss": 0.1052, + "epoch": 10.369747899159664, + "grad_norm": 2.350943233326136, + "learning_rate": 8.558737698418762e-07, + "loss": 0.0557, "step": 617 }, { - "epoch": 7.357142857142857, - "grad_norm": 1.4991915006798426, - "learning_rate": 6.88537494713666e-06, - "loss": 0.018, + "epoch": 10.38655462184874, + "grad_norm": 1.3949514445639306, + "learning_rate": 8.374323407637741e-07, + "loss": 0.0327, "step": 618 }, { - "epoch": 7.369047619047619, - "grad_norm": 1.5015776512071202, - "learning_rate": 6.854835063718502e-06, - "loss": 0.0716, + "epoch": 10.403361344537815, + "grad_norm": 1.3745535851759583, + "learning_rate": 8.191830738800977e-07, + "loss": 0.0412, "step": 619 }, { - "epoch": 7.380952380952381, - "grad_norm": 1.3848929171774853, - "learning_rate": 6.8243277005348794e-06, - "loss": 0.062, + "epoch": 10.420168067226891, + "grad_norm": 2.797313621901212, + "learning_rate": 8.01126351924082e-07, + "loss": 0.0415, "step": 620 }, { - "epoch": 7.392857142857143, - "grad_norm": 0.8705318075848788, - "learning_rate": 6.793853173024449e-06, - "loss": 0.0392, + "epoch": 10.436974789915967, + "grad_norm": 2.0008993409572753, + "learning_rate": 7.83262553590809e-07, + "loss": 0.0473, "step": 621 }, { - "epoch": 7.404761904761905, - "grad_norm": 1.0304663794903173, - "learning_rate": 6.763411796286357e-06, - "loss": 0.0567, + "epoch": 10.453781512605042, + "grad_norm": 2.566675704823962, + "learning_rate": 7.655920535292682e-07, + "loss": 0.0342, "step": 622 }, { - "epoch": 7.416666666666667, - "grad_norm": 1.0436188142656049, - "learning_rate": 6.733003885076974e-06, - "loss": 0.0446, + "epoch": 10.470588235294118, + "grad_norm": 1.3899500927307256, + "learning_rate": 7.48115222334489e-07, + "loss": 0.0508, "step": 623 }, { - "epoch": 7.428571428571429, - "grad_norm": 1.9837389714659937, - "learning_rate": 6.702629753806653e-06, - "loss": 0.0752, + "epoch": 10.487394957983193, + "grad_norm": 2.160264462063064, + "learning_rate": 7.308324265397837e-07, + "loss": 0.0535, "step": 624 }, { - "epoch": 7.440476190476191, - "grad_norm": 2.5086887412568903, - "learning_rate": 6.672289716536467e-06, - "loss": 0.0896, + "epoch": 10.504201680672269, + "grad_norm": 1.6549288751982645, + "learning_rate": 7.137440286090436e-07, + "loss": 0.0527, "step": 625 }, { - "epoch": 7.4523809523809526, - "grad_norm": 2.1349931644939852, - "learning_rate": 6.64198408697496e-06, - "loss": 0.0893, + "epoch": 10.521008403361344, + "grad_norm": 1.7616896227790626, + "learning_rate": 6.968503869291521e-07, + "loss": 0.0463, "step": 626 }, { - "epoch": 7.464285714285714, - "grad_norm": 3.430269705220585, - "learning_rate": 6.611713178474917e-06, - "loss": 0.1424, + "epoch": 10.53781512605042, + "grad_norm": 1.882435848120671, + "learning_rate": 6.8015185580246e-07, + "loss": 0.0791, "step": 627 }, { - "epoch": 7.476190476190476, - "grad_norm": 1.9197254471391216, - "learning_rate": 6.581477304030111e-06, - "loss": 0.0187, + "epoch": 10.554621848739496, + "grad_norm": 2.1866189366482485, + "learning_rate": 6.636487854393536e-07, + "loss": 0.0595, "step": 628 }, { - "epoch": 7.488095238095238, - "grad_norm": 2.9945545535503313, - "learning_rate": 6.551276776272072e-06, - "loss": 0.1063, + "epoch": 10.571428571428571, + "grad_norm": 0.8846616976894125, + "learning_rate": 6.473415219509182e-07, + "loss": 0.0297, "step": 629 }, { - "epoch": 7.5, - "grad_norm": 2.1068433684122714, - "learning_rate": 6.521111907466856e-06, - "loss": 0.1139, + "epoch": 10.588235294117647, + "grad_norm": 2.3038569022667614, + "learning_rate": 6.31230407341672e-07, + "loss": 0.0679, "step": 630 }, { - "epoch": 7.511904761904762, - "grad_norm": 1.040378375451962, - "learning_rate": 6.490983009511813e-06, - "loss": 0.0594, + "epoch": 10.605042016806722, + "grad_norm": 1.0106801357197905, + "learning_rate": 6.153157795023956e-07, + "loss": 0.0271, "step": 631 }, { - "epoch": 7.523809523809524, - "grad_norm": 3.2385969175520164, - "learning_rate": 6.460890393932362e-06, - "loss": 0.0299, + "epoch": 10.621848739495798, + "grad_norm": 1.9606151371435605, + "learning_rate": 5.995979722030443e-07, + "loss": 0.0566, "step": 632 }, { - "epoch": 7.535714285714286, - "grad_norm": 1.1327987232905252, - "learning_rate": 6.430834371878769e-06, - "loss": 0.041, + "epoch": 10.638655462184873, + "grad_norm": 8.812038870074772, + "learning_rate": 5.840773150857526e-07, + "loss": 0.0936, "step": 633 }, { - "epoch": 7.5476190476190474, - "grad_norm": 3.2654270253868316, - "learning_rate": 6.400815254122943e-06, - "loss": 0.1143, + "epoch": 10.655462184873949, + "grad_norm": 2.37646496100217, + "learning_rate": 5.687541336579127e-07, + "loss": 0.0689, "step": 634 }, { - "epoch": 7.559523809523809, - "grad_norm": 1.384074755179732, - "learning_rate": 6.3708333510551965e-06, - "loss": 0.0449, + "epoch": 10.672268907563025, + "grad_norm": 1.722317758769946, + "learning_rate": 5.536287492853575e-07, + "loss": 0.0409, "step": 635 }, { - "epoch": 7.571428571428571, - "grad_norm": 1.1295585227576246, - "learning_rate": 6.340888972681063e-06, - "loss": 0.0336, + "epoch": 10.6890756302521, + "grad_norm": 1.7758828341496589, + "learning_rate": 5.387014791856127e-07, + "loss": 0.0501, "step": 636 }, { - "epoch": 7.583333333333333, - "grad_norm": 2.0470567579399543, - "learning_rate": 6.310982428618078e-06, - "loss": 0.0606, + "epoch": 10.705882352941176, + "grad_norm": 0.9972447472445873, + "learning_rate": 5.239726364212494e-07, + "loss": 0.0354, "step": 637 }, { - "epoch": 7.595238095238095, - "grad_norm": 1.743014394487192, - "learning_rate": 6.281114028092567e-06, - "loss": 0.0666, + "epoch": 10.722689075630251, + "grad_norm": 1.8890386119269527, + "learning_rate": 5.094425298933136e-07, + "loss": 0.0375, "step": 638 }, { - "epoch": 7.607142857142857, - "grad_norm": 2.248181677644741, - "learning_rate": 6.251284079936473e-06, - "loss": 0.0661, + "epoch": 10.739495798319329, + "grad_norm": 3.0981629606543426, + "learning_rate": 4.951114643348531e-07, + "loss": 0.0618, "step": 639 }, { - "epoch": 7.619047619047619, - "grad_norm": 2.8242756319277396, - "learning_rate": 6.221492892584153e-06, - "loss": 0.1071, + "epoch": 10.756302521008404, + "grad_norm": 3.3069946004768576, + "learning_rate": 4.809797403045224e-07, + "loss": 0.0738, "step": 640 }, { - "epoch": 7.630952380952381, - "grad_norm": 1.6137966144044473, - "learning_rate": 6.19174077406918e-06, - "loss": 0.0765, + "epoch": 10.77310924369748, + "grad_norm": 1.0677359119397434, + "learning_rate": 4.670476541802782e-07, + "loss": 0.0349, "step": 641 }, { - "epoch": 7.642857142857143, - "grad_norm": 1.6637953498949227, - "learning_rate": 6.162028032021168e-06, - "loss": 0.0386, + "epoch": 10.789915966386555, + "grad_norm": 1.8374696207253192, + "learning_rate": 4.533154981531718e-07, + "loss": 0.0431, "step": 642 }, { - "epoch": 7.654761904761905, - "grad_norm": 1.4989918311563193, - "learning_rate": 6.1323549736625856e-06, - "loss": 0.0627, + "epoch": 10.806722689075631, + "grad_norm": 3.948719353562884, + "learning_rate": 4.397835602212064e-07, + "loss": 0.0533, "step": 643 }, { - "epoch": 7.666666666666667, - "grad_norm": 1.8355150312652448, - "learning_rate": 6.102721905805583e-06, - "loss": 0.0824, + "epoch": 10.823529411764707, + "grad_norm": 1.572765088896342, + "learning_rate": 4.264521241833153e-07, + "loss": 0.0425, "step": 644 }, { - "epoch": 7.678571428571429, - "grad_norm": 2.2397838981140437, - "learning_rate": 6.073129134848819e-06, - "loss": 0.068, + "epoch": 10.840336134453782, + "grad_norm": 1.2552552948039755, + "learning_rate": 4.133214696333943e-07, + "loss": 0.0326, "step": 645 }, { - "epoch": 7.690476190476191, - "grad_norm": 1.539999994990287, - "learning_rate": 6.043576966774292e-06, - "loss": 0.0709, + "epoch": 10.857142857142858, + "grad_norm": 1.4869473389842671, + "learning_rate": 4.003918719544464e-07, + "loss": 0.0334, "step": 646 }, { - "epoch": 7.7023809523809526, - "grad_norm": 3.9837741827034776, - "learning_rate": 6.014065707144177e-06, - "loss": 0.0807, + "epoch": 10.873949579831933, + "grad_norm": 1.4103343408301399, + "learning_rate": 3.876636023128022e-07, + "loss": 0.035, "step": 647 }, { - "epoch": 7.714285714285714, - "grad_norm": 2.4912819002955846, - "learning_rate": 5.984595661097663e-06, - "loss": 0.0735, + "epoch": 10.890756302521009, + "grad_norm": 1.491957032559717, + "learning_rate": 3.7513692765243637e-07, + "loss": 0.0446, "step": 648 }, { - "epoch": 7.726190476190476, - "grad_norm": 3.2181157159265785, - "learning_rate": 5.9551671333478056e-06, - "loss": 0.0644, + "epoch": 10.907563025210084, + "grad_norm": 5.277585616799005, + "learning_rate": 3.628121106893701e-07, + "loss": 0.063, "step": 649 }, { - "epoch": 7.738095238095238, - "grad_norm": 1.5618524452415126, - "learning_rate": 5.925780428178365e-06, - "loss": 0.0825, + "epoch": 10.92436974789916, + "grad_norm": 2.3243782807441837, + "learning_rate": 3.50689409906152e-07, + "loss": 0.0408, "step": 650 }, { - "epoch": 7.75, - "grad_norm": 4.130128619978615, - "learning_rate": 5.896435849440671e-06, - "loss": 0.0621, + "epoch": 10.941176470588236, + "grad_norm": 5.5511386963840605, + "learning_rate": 3.3876907954644933e-07, + "loss": 0.0991, "step": 651 }, { - "epoch": 7.761904761904762, - "grad_norm": 2.069627837047999, - "learning_rate": 5.867133700550479e-06, - "loss": 0.086, + "epoch": 10.957983193277311, + "grad_norm": 0.9258571015003478, + "learning_rate": 3.2705136960970554e-07, + "loss": 0.0324, "step": 652 }, { - "epoch": 7.773809523809524, - "grad_norm": 2.2145675279733537, - "learning_rate": 5.837874284484825e-06, - "loss": 0.0814, + "epoch": 10.974789915966387, + "grad_norm": 3.7891311679004582, + "learning_rate": 3.1553652584590864e-07, + "loss": 0.0649, "step": 653 }, { - "epoch": 7.785714285714286, - "grad_norm": 3.6395035540854574, - "learning_rate": 5.808657903778894e-06, - "loss": 0.0271, + "epoch": 10.991596638655462, + "grad_norm": 1.3430987417489015, + "learning_rate": 3.0422478975042245e-07, + "loss": 0.0329, "step": 654 }, { - "epoch": 7.7976190476190474, - "grad_norm": 1.8760063884280842, - "learning_rate": 5.779484860522904e-06, - "loss": 0.0838, + "epoch": 10.991596638655462, + "eval_loss": 0.08618413656949997, + "eval_runtime": 84.2423, + "eval_samples_per_second": 1.116, + "eval_steps_per_second": 1.116, + "step": 654 + }, + { + "epoch": 11.008403361344538, + "grad_norm": 10.112136406292526, + "learning_rate": 2.931163985589369e-07, + "loss": 0.0792, "step": 655 }, { - "epoch": 7.809523809523809, - "grad_norm": 1.8187225689477617, - "learning_rate": 5.750355456358976e-06, - "loss": 0.0666, + "epoch": 11.025210084033613, + "grad_norm": 1.2237182031841543, + "learning_rate": 2.8221158524248003e-07, + "loss": 0.0375, "step": 656 }, { - "epoch": 7.821428571428571, - "grad_norm": 1.8701879030711632, - "learning_rate": 5.721269992478002e-06, - "loss": 0.0665, + "epoch": 11.042016806722689, + "grad_norm": 1.8203551353192673, + "learning_rate": 2.7151057850253957e-07, + "loss": 0.0529, "step": 657 }, { - "epoch": 7.833333333333333, - "grad_norm": 2.008421784606381, - "learning_rate": 5.692228769616559e-06, - "loss": 0.0183, + "epoch": 11.058823529411764, + "grad_norm": 1.6032849699615903, + "learning_rate": 2.6101360276626795e-07, + "loss": 0.0446, "step": 658 }, { - "epoch": 7.845238095238095, - "grad_norm": 1.1103842322825477, - "learning_rate": 5.663232088053763e-06, - "loss": 0.0469, + "epoch": 11.07563025210084, + "grad_norm": 1.545240215165262, + "learning_rate": 2.507208781817638e-07, + "loss": 0.0356, "step": 659 }, { - "epoch": 7.857142857142857, - "grad_norm": 2.9071912477561095, - "learning_rate": 5.6342802476082014e-06, - "loss": 0.0963, + "epoch": 11.092436974789916, + "grad_norm": 1.5501298244234751, + "learning_rate": 2.406326206134724e-07, + "loss": 0.0413, "step": 660 }, { - "epoch": 7.869047619047619, - "grad_norm": 1.4498002971746087, - "learning_rate": 5.6053735476348025e-06, - "loss": 0.0482, + "epoch": 11.109243697478991, + "grad_norm": 3.6911299360545278, + "learning_rate": 2.3074904163764012e-07, + "loss": 0.1112, "step": 661 }, { - "epoch": 7.880952380952381, - "grad_norm": 2.159820723697225, - "learning_rate": 5.576512287021765e-06, - "loss": 0.0876, + "epoch": 11.126050420168067, + "grad_norm": 1.668296969639716, + "learning_rate": 2.210703485378929e-07, + "loss": 0.0343, "step": 662 }, { - "epoch": 7.892857142857143, - "grad_norm": 3.065401466063626, - "learning_rate": 5.5476967641874416e-06, - "loss": 0.0932, + "epoch": 11.142857142857142, + "grad_norm": 1.464185072769606, + "learning_rate": 2.115967443008804e-07, + "loss": 0.042, "step": 663 }, { - "epoch": 7.904761904761905, - "grad_norm": 5.139368448283141, - "learning_rate": 5.518927277077284e-06, - "loss": 0.1097, + "epoch": 11.159663865546218, + "grad_norm": 3.21826813941365, + "learning_rate": 2.0232842761201854e-07, + "loss": 0.0393, "step": 664 }, { - "epoch": 7.916666666666667, - "grad_norm": 2.5405581307288605, - "learning_rate": 5.49020412316073e-06, - "loss": 0.0904, + "epoch": 11.176470588235293, + "grad_norm": 1.5268309356925824, + "learning_rate": 1.9326559285132495e-07, + "loss": 0.0396, "step": 665 }, { - "epoch": 7.928571428571429, - "grad_norm": 0.45052066247569683, - "learning_rate": 5.4615275994281514e-06, - "loss": 0.0051, + "epoch": 11.193277310924369, + "grad_norm": 2.7559911444343625, + "learning_rate": 1.844084300893456e-07, + "loss": 0.0378, "step": 666 }, { - "epoch": 7.940476190476191, - "grad_norm": 2.3387165641238234, - "learning_rate": 5.432898002387783e-06, - "loss": 0.051, + "epoch": 11.210084033613445, + "grad_norm": 2.064362647787835, + "learning_rate": 1.7575712508316244e-07, + "loss": 0.0692, "step": 667 }, { - "epoch": 7.9523809523809526, - "grad_norm": 1.6393139092131652, - "learning_rate": 5.404315628062631e-06, - "loss": 0.0681, + "epoch": 11.22689075630252, + "grad_norm": 2.5064352487208676, + "learning_rate": 1.673118592724987e-07, + "loss": 0.0625, "step": 668 }, { - "epoch": 7.964285714285714, - "grad_norm": 1.856864744878801, - "learning_rate": 5.375780771987449e-06, - "loss": 0.016, + "epoch": 11.243697478991596, + "grad_norm": 2.7390411371250774, + "learning_rate": 1.5907280977591866e-07, + "loss": 0.0737, "step": 669 }, { - "epoch": 7.976190476190476, - "grad_norm": 2.1504287060037397, - "learning_rate": 5.347293729205644e-06, - "loss": 0.0984, + "epoch": 11.260504201680673, + "grad_norm": 0.9309858940918065, + "learning_rate": 1.5104014938710498e-07, + "loss": 0.0266, "step": 670 }, { - "epoch": 7.988095238095238, - "grad_norm": 1.0739005598429683, - "learning_rate": 5.318854794266268e-06, - "loss": 0.0733, + "epoch": 11.277310924369749, + "grad_norm": 1.851032261256866, + "learning_rate": 1.4321404657124393e-07, + "loss": 0.0493, "step": 671 }, { - "epoch": 8.0, - "grad_norm": 13.629034996878698, - "learning_rate": 5.290464261220927e-06, - "loss": 0.0811, - "step": 672 - }, - { - "epoch": 8.0, - "eval_loss": 0.18508009612560272, - "eval_runtime": 38.0386, - "eval_samples_per_second": 1.577, - "eval_steps_per_second": 1.577, + "epoch": 11.294117647058824, + "grad_norm": 1.0812191705502274, + "learning_rate": 1.3559466546148369e-07, + "loss": 0.0434, "step": 672 }, { - "epoch": 8.011904761904763, - "grad_norm": 1.9110702680782508, - "learning_rate": 5.262122423620782e-06, - "loss": 0.0195, + "epoch": 11.3109243697479, + "grad_norm": 1.8327569259876093, + "learning_rate": 1.2818216585549824e-07, + "loss": 0.0449, "step": 673 }, { - "epoch": 8.023809523809524, - "grad_norm": 1.2576546338351324, - "learning_rate": 5.2338295745134795e-06, - "loss": 0.0727, + "epoch": 11.327731092436975, + "grad_norm": 1.160640896943556, + "learning_rate": 1.209767032121345e-07, + "loss": 0.0328, "step": 674 }, { - "epoch": 8.035714285714286, - "grad_norm": 3.5597917543308806, - "learning_rate": 5.205586006440149e-06, - "loss": 0.0967, + "epoch": 11.344537815126051, + "grad_norm": 2.0693866133338017, + "learning_rate": 1.1397842864814712e-07, + "loss": 0.0599, "step": 675 }, { - "epoch": 8.047619047619047, - "grad_norm": 1.6721296431336412, - "learning_rate": 5.177392011432368e-06, - "loss": 0.0339, + "epoch": 11.361344537815127, + "grad_norm": 1.8624412914202513, + "learning_rate": 1.0718748893503883e-07, + "loss": 0.0458, "step": 676 }, { - "epoch": 8.05952380952381, - "grad_norm": 1.2478157185355454, - "learning_rate": 5.1492478810091184e-06, - "loss": 0.0276, + "epoch": 11.378151260504202, + "grad_norm": 2.0035143994192377, + "learning_rate": 1.0060402649597178e-07, + "loss": 0.0472, "step": 677 }, { - "epoch": 8.071428571428571, - "grad_norm": 2.9389850134580446, - "learning_rate": 5.121153906173826e-06, - "loss": 0.0583, + "epoch": 11.394957983193278, + "grad_norm": 2.6901112576784447, + "learning_rate": 9.422817940278773e-08, + "loss": 0.0458, "step": 678 }, { - "epoch": 8.083333333333334, - "grad_norm": 2.6905956141154848, - "learning_rate": 5.093110377411292e-06, - "loss": 0.0216, + "epoch": 11.411764705882353, + "grad_norm": 1.4284628255192529, + "learning_rate": 8.806008137311028e-08, + "loss": 0.0327, "step": 679 }, { - "epoch": 8.095238095238095, - "grad_norm": 2.1426781345772508, - "learning_rate": 5.06511758468474e-06, - "loss": 0.0768, + "epoch": 11.428571428571429, + "grad_norm": 2.222615104628747, + "learning_rate": 8.209986176753947e-08, + "loss": 0.041, "step": 680 }, { - "epoch": 8.107142857142858, - "grad_norm": 3.5859888616619515, - "learning_rate": 5.037175817432779e-06, - "loss": 0.1097, + "epoch": 11.445378151260504, + "grad_norm": 3.9740102271718634, + "learning_rate": 7.634764558693941e-08, + "loss": 0.0729, "step": 681 }, { - "epoch": 8.119047619047619, - "grad_norm": 1.1020497087264263, - "learning_rate": 5.009285364566435e-06, - "loss": 0.0304, + "epoch": 11.46218487394958, + "grad_norm": 1.4918545836760815, + "learning_rate": 7.080355346981815e-08, + "loss": 0.0566, "step": 682 }, { - "epoch": 8.130952380952381, - "grad_norm": 1.3896812421288112, - "learning_rate": 4.98144651446616e-06, - "loss": 0.0491, + "epoch": 11.478991596638656, + "grad_norm": 2.521127424464659, + "learning_rate": 6.546770168979421e-08, + "loss": 0.0437, "step": 683 }, { - "epoch": 8.142857142857142, - "grad_norm": 1.2000724845656414, - "learning_rate": 4.953659554978831e-06, - "loss": 0.0409, + "epoch": 11.495798319327731, + "grad_norm": 1.2741993338648574, + "learning_rate": 6.034020215316184e-08, + "loss": 0.0349, "step": 684 }, { - "epoch": 8.154761904761905, - "grad_norm": 1.142623132745125, - "learning_rate": 4.925924773414809e-06, - "loss": 0.0371, + "epoch": 11.512605042016807, + "grad_norm": 1.296314142560303, + "learning_rate": 5.5421162396542824e-08, + "loss": 0.0356, "step": 685 }, { - "epoch": 8.166666666666666, - "grad_norm": 2.721525193056032, - "learning_rate": 4.8982424565449274e-06, - "loss": 0.06, + "epoch": 11.529411764705882, + "grad_norm": 1.8595898721766801, + "learning_rate": 5.071068558462733e-08, + "loss": 0.0559, "step": 686 }, { - "epoch": 8.178571428571429, - "grad_norm": 1.0659298645748037, - "learning_rate": 4.870612890597564e-06, - "loss": 0.0305, + "epoch": 11.546218487394958, + "grad_norm": 8.704714098773897, + "learning_rate": 4.6208870508017703e-08, + "loss": 0.101, "step": 687 }, { - "epoch": 8.19047619047619, - "grad_norm": 4.6764392493870295, - "learning_rate": 4.843036361255654e-06, - "loss": 0.1477, + "epoch": 11.563025210084033, + "grad_norm": 2.032442231084466, + "learning_rate": 4.191581158115021e-08, + "loss": 0.0478, "step": 688 }, { - "epoch": 8.202380952380953, - "grad_norm": 2.8740981280962097, - "learning_rate": 4.815513153653758e-06, - "loss": 0.0549, + "epoch": 11.579831932773109, + "grad_norm": 9.418069463610777, + "learning_rate": 3.783159884031773e-08, + "loss": 0.1048, "step": 689 }, { - "epoch": 8.214285714285714, - "grad_norm": 3.606494816871051, - "learning_rate": 4.788043552375087e-06, - "loss": 0.0506, + "epoch": 11.596638655462185, + "grad_norm": 1.0069915518246801, + "learning_rate": 3.3956317941779004e-08, + "loss": 0.032, "step": 690 }, { - "epoch": 8.226190476190476, - "grad_norm": 4.537509345710984, - "learning_rate": 4.760627841448595e-06, - "loss": 0.0381, + "epoch": 11.61344537815126, + "grad_norm": 1.3869570014003472, + "learning_rate": 3.029005015996789e-08, + "loss": 0.0276, "step": 691 }, { - "epoch": 8.238095238095237, - "grad_norm": 1.9487056080497047, - "learning_rate": 4.733266304346005e-06, - "loss": 0.0575, + "epoch": 11.630252100840336, + "grad_norm": 2.0275074366047128, + "learning_rate": 2.6832872385783583e-08, + "loss": 0.041, "step": 692 }, { - "epoch": 8.25, - "grad_norm": 1.372774061299278, - "learning_rate": 4.705959223978908e-06, - "loss": 0.0362, + "epoch": 11.647058823529411, + "grad_norm": 3.7065994847690407, + "learning_rate": 2.3584857124977488e-08, + "loss": 0.058, "step": 693 }, { - "epoch": 8.261904761904763, - "grad_norm": 2.4087403103866882, - "learning_rate": 4.678706882695824e-06, - "loss": 0.0822, + "epoch": 11.663865546218487, + "grad_norm": 1.1492957912068942, + "learning_rate": 2.054607249663665e-08, + "loss": 0.04, "step": 694 }, { - "epoch": 8.273809523809524, - "grad_norm": 2.414899294008852, - "learning_rate": 4.651509562279276e-06, - "loss": 0.0634, + "epoch": 11.680672268907562, + "grad_norm": 7.151682674273455, + "learning_rate": 1.7716582231752656e-08, + "loss": 0.077, "step": 695 }, { - "epoch": 8.285714285714286, - "grad_norm": 3.378577203088908, - "learning_rate": 4.624367543942899e-06, - "loss": 0.0647, + "epoch": 11.697478991596638, + "grad_norm": 1.5415699598561934, + "learning_rate": 1.509644567188717e-08, + "loss": 0.041, "step": 696 }, { - "epoch": 8.297619047619047, - "grad_norm": 5.150913422584667, - "learning_rate": 4.597281108328502e-06, - "loss": 0.0849, + "epoch": 11.714285714285714, + "grad_norm": 1.7646669493531948, + "learning_rate": 1.2685717767921823e-08, + "loss": 0.0518, "step": 697 }, { - "epoch": 8.30952380952381, - "grad_norm": 1.1399120801392448, - "learning_rate": 4.570250535503196e-06, - "loss": 0.0354, + "epoch": 11.731092436974789, + "grad_norm": 4.9082532974240465, + "learning_rate": 1.048444907891244e-08, + "loss": 0.1108, "step": 698 }, { - "epoch": 8.321428571428571, - "grad_norm": 1.4227847802019646, - "learning_rate": 4.543276104956472e-06, - "loss": 0.0587, + "epoch": 11.747899159663866, + "grad_norm": 3.994721765198761, + "learning_rate": 8.492685771025466e-09, + "loss": 0.0613, "step": 699 }, { - "epoch": 8.333333333333334, - "grad_norm": 1.464743918175674, - "learning_rate": 4.5163580955973384e-06, - "loss": 0.0812, + "epoch": 11.764705882352942, + "grad_norm": 2.688716545259077, + "learning_rate": 6.710469616569848e-09, + "loss": 0.0534, "step": 700 }, { - "epoch": 8.345238095238095, - "grad_norm": 1.3386170144483294, - "learning_rate": 4.489496785751407e-06, - "loss": 0.0644, + "epoch": 11.781512605042018, + "grad_norm": 2.097132472666691, + "learning_rate": 5.137837993121064e-09, + "loss": 0.0534, "step": 701 }, { - "epoch": 8.357142857142858, - "grad_norm": 1.127458731365185, - "learning_rate": 4.4626924531580395e-06, - "loss": 0.0308, + "epoch": 11.798319327731093, + "grad_norm": 1.036594664039175, + "learning_rate": 3.774823882738421e-09, + "loss": 0.0338, "step": 702 }, { - "epoch": 8.369047619047619, - "grad_norm": 2.1999026564423425, - "learning_rate": 4.435945374967471e-06, - "loss": 0.0993, + "epoch": 11.815126050420169, + "grad_norm": 2.2273501309394907, + "learning_rate": 2.6214558712722714e-09, + "loss": 0.0337, "step": 703 }, { - "epoch": 8.380952380952381, - "grad_norm": 1.364108162519796, - "learning_rate": 4.4092558277379235e-06, - "loss": 0.071, + "epoch": 11.831932773109244, + "grad_norm": 1.4175794307129967, + "learning_rate": 1.677758147762276e-09, + "loss": 0.0304, "step": 704 }, { - "epoch": 8.392857142857142, - "grad_norm": 1.204961209910452, - "learning_rate": 4.382624087432784e-06, - "loss": 0.0438, + "epoch": 11.84873949579832, + "grad_norm": 2.118091027580824, + "learning_rate": 9.43750503935581e-10, + "loss": 0.0483, "step": 705 }, { - "epoch": 8.404761904761905, - "grad_norm": 2.1404897481502503, - "learning_rate": 4.356050429417711e-06, - "loss": 0.0177, + "epoch": 11.865546218487395, + "grad_norm": 7.711002671886926, + "learning_rate": 4.1944833378604334e-10, + "loss": 0.1102, "step": 706 }, { - "epoch": 8.416666666666666, - "grad_norm": 2.248346002608694, - "learning_rate": 4.329535128457822e-06, - "loss": 0.0747, + "epoch": 11.882352941176471, + "grad_norm": 1.7414362067953533, + "learning_rate": 1.0486263325559798e-10, + "loss": 0.0357, "step": 707 }, { - "epoch": 8.428571428571429, - "grad_norm": 1.8017077511033046, - "learning_rate": 4.303078458714824e-06, - "loss": 0.055, + "epoch": 11.899159663865547, + "grad_norm": 1.2682884045214264, + "learning_rate": 0.0, + "loss": 0.0266, "step": 708 }, { - "epoch": 8.44047619047619, - "grad_norm": 3.1411734635110826, - "learning_rate": 4.2766806937442025e-06, - "loss": 0.1054, - "step": 709 - }, - { - "epoch": 8.452380952380953, - "grad_norm": 1.9831648552999164, - "learning_rate": 4.250342106492371e-06, - "loss": 0.0455, - "step": 710 - }, - { - "epoch": 8.464285714285714, - "grad_norm": 1.3064388791471728, - "learning_rate": 4.224062969293873e-06, - "loss": 0.0118, - "step": 711 - }, - { - "epoch": 8.476190476190476, - "grad_norm": 4.303236040730501, - "learning_rate": 4.197843553868538e-06, - "loss": 0.1303, - "step": 712 - }, - { - "epoch": 8.488095238095237, - "grad_norm": 3.2998036981279277, - "learning_rate": 4.1716841313187e-06, - "loss": 0.0898, - "step": 713 - }, - { - "epoch": 8.5, - "grad_norm": 2.531047135551161, - "learning_rate": 4.145584972126377e-06, - "loss": 0.0926, - "step": 714 - }, - { - "epoch": 8.511904761904763, - "grad_norm": 3.84439794247204, - "learning_rate": 4.119546346150478e-06, - "loss": 0.1078, - "step": 715 - }, - { - "epoch": 8.523809523809524, - "grad_norm": 1.4003583986318646, - "learning_rate": 4.093568522624012e-06, - "loss": 0.0529, - "step": 716 - }, - { - "epoch": 8.535714285714286, - "grad_norm": 1.5373028609830812, - "learning_rate": 4.0676517701513015e-06, - "loss": 0.0128, - "step": 717 - }, - { - "epoch": 8.547619047619047, - "grad_norm": 2.0378816437403047, - "learning_rate": 4.04179635670522e-06, - "loss": 0.1006, - "step": 718 - }, - { - "epoch": 8.55952380952381, - "grad_norm": 0.9253802190426968, - "learning_rate": 4.016002549624408e-06, - "loss": 0.0256, - "step": 719 - }, - { - "epoch": 8.571428571428571, - "grad_norm": 1.8878359466590395, - "learning_rate": 3.990270615610502e-06, - "loss": 0.0427, - "step": 720 - }, - { - "epoch": 8.583333333333334, - "grad_norm": 1.8998654829048824, - "learning_rate": 3.964600820725399e-06, - "loss": 0.0149, - "step": 721 - }, - { - "epoch": 8.595238095238095, - "grad_norm": 2.1785811911898043, - "learning_rate": 3.938993430388481e-06, - "loss": 0.1016, - "step": 722 - }, - { - "epoch": 8.607142857142858, - "grad_norm": 1.6005146404043937, - "learning_rate": 3.9134487093738906e-06, - "loss": 0.098, - "step": 723 - }, - { - "epoch": 8.619047619047619, - "grad_norm": 1.944745995853955, - "learning_rate": 3.8879669218077785e-06, - "loss": 0.0867, - "step": 724 - }, - { - "epoch": 8.630952380952381, - "grad_norm": 3.442868509135672, - "learning_rate": 3.862548331165589e-06, - "loss": 0.0251, - "step": 725 - }, - { - "epoch": 8.642857142857142, - "grad_norm": 1.8096775564399332, - "learning_rate": 3.837193200269309e-06, - "loss": 0.0788, - "step": 726 - }, - { - "epoch": 8.654761904761905, - "grad_norm": 3.475557028703863, - "learning_rate": 3.811901791284788e-06, - "loss": 0.0846, - "step": 727 - }, - { - "epoch": 8.666666666666666, - "grad_norm": 2.2234463246243465, - "learning_rate": 3.7866743657189863e-06, - "loss": 0.0575, - "step": 728 - }, - { - "epoch": 8.678571428571429, - "grad_norm": 4.393598853012906, - "learning_rate": 3.761511184417306e-06, - "loss": 0.171, - "step": 729 - }, - { - "epoch": 8.69047619047619, - "grad_norm": 1.780187643448173, - "learning_rate": 3.736412507560876e-06, - "loss": 0.0431, - "step": 730 - }, - { - "epoch": 8.702380952380953, - "grad_norm": 1.2226552976534588, - "learning_rate": 3.711378594663857e-06, - "loss": 0.032, - "step": 731 - }, - { - "epoch": 8.714285714285714, - "grad_norm": 1.9370245760802747, - "learning_rate": 3.6864097045707783e-06, - "loss": 0.0444, - "step": 732 - }, - { - "epoch": 8.726190476190476, - "grad_norm": 3.5771359305604826, - "learning_rate": 3.6615060954538383e-06, - "loss": 0.0935, - "step": 733 - }, - { - "epoch": 8.738095238095237, - "grad_norm": 1.0674492370205682, - "learning_rate": 3.636668024810256e-06, - "loss": 0.0561, - "step": 734 - }, - { - "epoch": 8.75, - "grad_norm": 1.5812111523180694, - "learning_rate": 3.6118957494595885e-06, - "loss": 0.0476, - "step": 735 - }, - { - "epoch": 8.761904761904763, - "grad_norm": 1.5340016226171458, - "learning_rate": 3.587189525541097e-06, - "loss": 0.0649, - "step": 736 - }, - { - "epoch": 8.773809523809524, - "grad_norm": 4.646161423857292, - "learning_rate": 3.5625496085110757e-06, - "loss": 0.0356, - "step": 737 - }, - { - "epoch": 8.785714285714286, - "grad_norm": 2.3591372490715248, - "learning_rate": 3.537976253140232e-06, - "loss": 0.0535, - "step": 738 - }, - { - "epoch": 8.797619047619047, - "grad_norm": 3.09871629569407, - "learning_rate": 3.513469713511032e-06, - "loss": 0.0999, - "step": 739 - }, - { - "epoch": 8.80952380952381, - "grad_norm": 3.317284922510724, - "learning_rate": 3.4890302430150913e-06, - "loss": 0.0616, - "step": 740 - }, - { - "epoch": 8.821428571428571, - "grad_norm": 3.4094509283498127, - "learning_rate": 3.4646580943505483e-06, - "loss": 0.1103, - "step": 741 - }, - { - "epoch": 8.833333333333334, - "grad_norm": 3.729118936004436, - "learning_rate": 3.4403535195194393e-06, - "loss": 0.0514, - "step": 742 - }, - { - "epoch": 8.845238095238095, - "grad_norm": 7.947111022757592, - "learning_rate": 3.4161167698251176e-06, - "loss": 0.1472, - "step": 743 - }, - { - "epoch": 8.857142857142858, - "grad_norm": 1.5891949285087759, - "learning_rate": 3.391948095869628e-06, - "loss": 0.0649, - "step": 744 - }, - { - "epoch": 8.869047619047619, - "grad_norm": 1.688044557138942, - "learning_rate": 3.367847747551143e-06, - "loss": 0.0369, - "step": 745 - }, - { - "epoch": 8.880952380952381, - "grad_norm": 2.1729484304523172, - "learning_rate": 3.3438159740613474e-06, - "loss": 0.0483, - "step": 746 - }, - { - "epoch": 8.892857142857142, - "grad_norm": 1.704323955661607, - "learning_rate": 3.3198530238828973e-06, - "loss": 0.0679, - "step": 747 - }, - { - "epoch": 8.904761904761905, - "grad_norm": 1.0039040994703539, - "learning_rate": 3.2959591447868177e-06, - "loss": 0.0479, - "step": 748 - }, - { - "epoch": 8.916666666666666, - "grad_norm": 2.3677908012699014, - "learning_rate": 3.272134583829966e-06, - "loss": 0.0857, - "step": 749 - }, - { - "epoch": 8.928571428571429, - "grad_norm": 2.7444347195294685, - "learning_rate": 3.2483795873524625e-06, - "loss": 0.1112, - "step": 750 - }, - { - "epoch": 8.94047619047619, - "grad_norm": 1.2943783268976876, - "learning_rate": 3.224694400975145e-06, - "loss": 0.0266, - "step": 751 - }, - { - "epoch": 8.952380952380953, - "grad_norm": 3.044581999259029, - "learning_rate": 3.2010792695970407e-06, - "loss": 0.1023, - "step": 752 - }, - { - "epoch": 8.964285714285714, - "grad_norm": 2.6509128482408557, - "learning_rate": 3.177534437392813e-06, - "loss": 0.0748, - "step": 753 - }, - { - "epoch": 8.976190476190476, - "grad_norm": 1.221983721384426, - "learning_rate": 3.154060147810266e-06, - "loss": 0.0421, - "step": 754 - }, - { - "epoch": 8.988095238095237, - "grad_norm": 2.3808451565996958, - "learning_rate": 3.1306566435677877e-06, - "loss": 0.0184, - "step": 755 - }, - { - "epoch": 9.0, - "grad_norm": 1.4618055755215726, - "learning_rate": 3.1073241666518817e-06, - "loss": 0.0376, - "step": 756 - }, - { - "epoch": 9.0, - "eval_loss": 0.22630615532398224, - "eval_runtime": 38.2176, - "eval_samples_per_second": 1.57, - "eval_steps_per_second": 1.57, - "step": 756 - }, - { - "epoch": 9.011904761904763, - "grad_norm": 1.2638660863464435, - "learning_rate": 3.084062958314642e-06, - "loss": 0.0358, - "step": 757 - }, - { - "epoch": 9.023809523809524, - "grad_norm": 2.0988738098189965, - "learning_rate": 3.0608732590712574e-06, - "loss": 0.0933, - "step": 758 - }, - { - "epoch": 9.035714285714286, - "grad_norm": 1.446300469130805, - "learning_rate": 3.0377553086975397e-06, - "loss": 0.0549, - "step": 759 - }, - { - "epoch": 9.047619047619047, - "grad_norm": 2.2735202083993604, - "learning_rate": 3.014709346227421e-06, - "loss": 0.1031, - "step": 760 - }, - { - "epoch": 9.05952380952381, - "grad_norm": 4.144205517980198, - "learning_rate": 2.9917356099505137e-06, - "loss": 0.1125, - "step": 761 - }, - { - "epoch": 9.071428571428571, - "grad_norm": 1.7207853036637966, - "learning_rate": 2.968834337409612e-06, - "loss": 0.0453, - "step": 762 - }, - { - "epoch": 9.083333333333334, - "grad_norm": 1.3808210070725824, - "learning_rate": 2.9460057653982676e-06, - "loss": 0.0364, - "step": 763 - }, - { - "epoch": 9.095238095238095, - "grad_norm": 1.0699563203244846, - "learning_rate": 2.9232501299583126e-06, - "loss": 0.043, - "step": 764 - }, - { - "epoch": 9.107142857142858, - "grad_norm": 3.1313459792573566, - "learning_rate": 2.9005676663774464e-06, - "loss": 0.0667, - "step": 765 - }, - { - "epoch": 9.119047619047619, - "grad_norm": 2.8345479036451002, - "learning_rate": 2.8779586091867774e-06, - "loss": 0.072, - "step": 766 - }, - { - "epoch": 9.130952380952381, - "grad_norm": 4.237900498613799, - "learning_rate": 2.8554231921584164e-06, - "loss": 0.0266, - "step": 767 - }, - { - "epoch": 9.142857142857142, - "grad_norm": 4.626766434495465, - "learning_rate": 2.8329616483030574e-06, - "loss": 0.0622, - "step": 768 - }, - { - "epoch": 9.154761904761905, - "grad_norm": 5.703966216520634, - "learning_rate": 2.8105742098675504e-06, - "loss": 0.0481, - "step": 769 - }, - { - "epoch": 9.166666666666666, - "grad_norm": 2.19463955783124, - "learning_rate": 2.788261108332528e-06, - "loss": 0.0707, - "step": 770 - }, - { - "epoch": 9.178571428571429, - "grad_norm": 1.2240191096067952, - "learning_rate": 2.7660225744099858e-06, - "loss": 0.0353, - "step": 771 - }, - { - "epoch": 9.19047619047619, - "grad_norm": 2.332773587240624, - "learning_rate": 2.743858838040918e-06, - "loss": 0.0495, - "step": 772 - }, - { - "epoch": 9.202380952380953, - "grad_norm": 2.039653385293655, - "learning_rate": 2.7217701283929208e-06, - "loss": 0.0608, - "step": 773 - }, - { - "epoch": 9.214285714285714, - "grad_norm": 2.761650563323467, - "learning_rate": 2.699756673857845e-06, - "loss": 0.037, - "step": 774 - }, - { - "epoch": 9.226190476190476, - "grad_norm": 1.6011776954420698, - "learning_rate": 2.6778187020494086e-06, - "loss": 0.0123, - "step": 775 - }, - { - "epoch": 9.238095238095237, - "grad_norm": 5.53779887401531, - "learning_rate": 2.6559564398008643e-06, - "loss": 0.0378, - "step": 776 - }, - { - "epoch": 9.25, - "grad_norm": 2.1627086005140095, - "learning_rate": 2.6341701131626486e-06, - "loss": 0.0534, - "step": 777 - }, - { - "epoch": 9.261904761904763, - "grad_norm": 1.3638880451491888, - "learning_rate": 2.6124599474000347e-06, - "loss": 0.0417, - "step": 778 - }, - { - "epoch": 9.273809523809524, - "grad_norm": 5.103361689747087, - "learning_rate": 2.59082616699082e-06, - "loss": 0.1109, - "step": 779 - }, - { - "epoch": 9.285714285714286, - "grad_norm": 2.0038811026150793, - "learning_rate": 2.5692689956229842e-06, - "loss": 0.0863, - "step": 780 - }, - { - "epoch": 9.297619047619047, - "grad_norm": 1.4236628067476185, - "learning_rate": 2.5477886561924026e-06, - "loss": 0.0481, - "step": 781 - }, - { - "epoch": 9.30952380952381, - "grad_norm": 2.288250827736941, - "learning_rate": 2.526385370800515e-06, - "loss": 0.0696, - "step": 782 - }, - { - "epoch": 9.321428571428571, - "grad_norm": 2.241009646194953, - "learning_rate": 2.505059360752049e-06, - "loss": 0.0947, - "step": 783 - }, - { - "epoch": 9.333333333333334, - "grad_norm": 1.1079581601237796, - "learning_rate": 2.483810846552719e-06, - "loss": 0.009, - "step": 784 - }, - { - "epoch": 9.345238095238095, - "grad_norm": 1.9521426450571557, - "learning_rate": 2.462640047906958e-06, - "loss": 0.0685, - "step": 785 - }, - { - "epoch": 9.357142857142858, - "grad_norm": 1.7190040999607883, - "learning_rate": 2.441547183715628e-06, - "loss": 0.0669, - "step": 786 - }, - { - "epoch": 9.369047619047619, - "grad_norm": 2.822373050402177, - "learning_rate": 2.4205324720737787e-06, - "loss": 0.0947, - "step": 787 - }, - { - "epoch": 9.380952380952381, - "grad_norm": 5.160967804300185, - "learning_rate": 2.3995961302683803e-06, - "loss": 0.1173, - "step": 788 - }, - { - "epoch": 9.392857142857142, - "grad_norm": 1.4211193630802048, - "learning_rate": 2.3787383747760696e-06, - "loss": 0.0335, - "step": 789 - }, - { - "epoch": 9.404761904761905, - "grad_norm": 1.6880430372088504, - "learning_rate": 2.3579594212609325e-06, - "loss": 0.0638, - "step": 790 - }, - { - "epoch": 9.416666666666666, - "grad_norm": 2.5607635616852544, - "learning_rate": 2.3372594845722483e-06, - "loss": 0.0712, - "step": 791 - }, - { - "epoch": 9.428571428571429, - "grad_norm": 5.135385738571878, - "learning_rate": 2.316638778742295e-06, - "loss": 0.099, - "step": 792 - }, - { - "epoch": 9.44047619047619, - "grad_norm": 0.9753249765155438, - "learning_rate": 2.2960975169841106e-06, - "loss": 0.0386, - "step": 793 - }, - { - "epoch": 9.452380952380953, - "grad_norm": 3.059992847066805, - "learning_rate": 2.2756359116893122e-06, - "loss": 0.0766, - "step": 794 - }, - { - "epoch": 9.464285714285714, - "grad_norm": 1.852868554324358, - "learning_rate": 2.2552541744258816e-06, - "loss": 0.0771, - "step": 795 - }, - { - "epoch": 9.476190476190476, - "grad_norm": 3.678855356877469, - "learning_rate": 2.234952515935982e-06, - "loss": 0.0672, - "step": 796 - }, - { - "epoch": 9.488095238095237, - "grad_norm": 2.259811511231432, - "learning_rate": 2.214731146133793e-06, - "loss": 0.0717, - "step": 797 - }, - { - "epoch": 9.5, - "grad_norm": 2.922269519237516, - "learning_rate": 2.1945902741033154e-06, - "loss": 0.1276, - "step": 798 - }, - { - "epoch": 9.511904761904763, - "grad_norm": 2.0143579615385003, - "learning_rate": 2.1745301080962357e-06, - "loss": 0.0631, - "step": 799 - }, - { - "epoch": 9.523809523809524, - "grad_norm": 1.342784078461839, - "learning_rate": 2.1545508555297478e-06, - "loss": 0.0106, - "step": 800 - }, - { - "epoch": 9.535714285714286, - "grad_norm": 3.1514364794761267, - "learning_rate": 2.1346527229844305e-06, - "loss": 0.0592, - "step": 801 - }, - { - "epoch": 9.547619047619047, - "grad_norm": 4.658579628367995, - "learning_rate": 2.114835916202094e-06, - "loss": 0.0786, - "step": 802 - }, - { - "epoch": 9.55952380952381, - "grad_norm": 8.72775474775178, - "learning_rate": 2.095100640083664e-06, - "loss": 0.0577, - "step": 803 - }, - { - "epoch": 9.571428571428571, - "grad_norm": 3.078873859871242, - "learning_rate": 2.0754470986870602e-06, - "loss": 0.0186, - "step": 804 - }, - { - "epoch": 9.583333333333334, - "grad_norm": 3.0642918358992404, - "learning_rate": 2.05587549522508e-06, - "loss": 0.0207, - "step": 805 - }, - { - "epoch": 9.595238095238095, - "grad_norm": 1.7483461506332738, - "learning_rate": 2.036386032063311e-06, - "loss": 0.0442, - "step": 806 - }, - { - "epoch": 9.607142857142858, - "grad_norm": 3.801439720554629, - "learning_rate": 2.0169789107180195e-06, - "loss": 0.028, - "step": 807 - }, - { - "epoch": 9.619047619047619, - "grad_norm": 2.3775230600293473, - "learning_rate": 1.9976543318540887e-06, - "loss": 0.0629, - "step": 808 - }, - { - "epoch": 9.630952380952381, - "grad_norm": 1.3725986057321287, - "learning_rate": 1.978412495282922e-06, - "loss": 0.0341, - "step": 809 - }, - { - "epoch": 9.642857142857142, - "grad_norm": 3.093713374546874, - "learning_rate": 1.959253599960399e-06, - "loss": 0.0794, - "step": 810 - }, - { - "epoch": 9.654761904761905, - "grad_norm": 1.9678116047928416, - "learning_rate": 1.940177843984795e-06, - "loss": 0.0267, - "step": 811 - }, - { - "epoch": 9.666666666666666, - "grad_norm": 1.7929950739709355, - "learning_rate": 1.921185424594758e-06, - "loss": 0.0667, - "step": 812 - }, - { - "epoch": 9.678571428571429, - "grad_norm": 1.3460670621170243, - "learning_rate": 1.9022765381672426e-06, - "loss": 0.04, - "step": 813 - }, - { - "epoch": 9.69047619047619, - "grad_norm": 1.5856379303348087, - "learning_rate": 1.883451380215503e-06, - "loss": 0.0108, - "step": 814 - }, - { - "epoch": 9.702380952380953, - "grad_norm": 1.7602024319170253, - "learning_rate": 1.8647101453870608e-06, - "loss": 0.0434, - "step": 815 - }, - { - "epoch": 9.714285714285714, - "grad_norm": 2.3435995853073504, - "learning_rate": 1.846053027461686e-06, - "loss": 0.0863, - "step": 816 - }, - { - "epoch": 9.726190476190476, - "grad_norm": 0.920694195118384, - "learning_rate": 1.827480219349409e-06, - "loss": 0.0082, - "step": 817 - }, - { - "epoch": 9.738095238095237, - "grad_norm": 1.94811857435231, - "learning_rate": 1.8089919130885081e-06, - "loss": 0.0144, - "step": 818 - }, - { - "epoch": 9.75, - "grad_norm": 2.0473503573165504, - "learning_rate": 1.7905882998435443e-06, - "loss": 0.0423, - "step": 819 - }, - { - "epoch": 9.761904761904763, - "grad_norm": 2.6541768645283983, - "learning_rate": 1.7722695699033632e-06, - "loss": 0.0192, - "step": 820 - }, - { - "epoch": 9.773809523809524, - "grad_norm": 2.3050964074362534, - "learning_rate": 1.754035912679145e-06, - "loss": 0.0409, - "step": 821 - }, - { - "epoch": 9.785714285714286, - "grad_norm": 2.8122349110450062, - "learning_rate": 1.7358875167024336e-06, - "loss": 0.0655, - "step": 822 - }, - { - "epoch": 9.797619047619047, - "grad_norm": 3.6832643159394554, - "learning_rate": 1.7178245696231953e-06, - "loss": 0.1298, - "step": 823 - }, - { - "epoch": 9.80952380952381, - "grad_norm": 1.3306255786886887, - "learning_rate": 1.6998472582078806e-06, - "loss": 0.0506, - "step": 824 - }, - { - "epoch": 9.821428571428571, - "grad_norm": 3.315709760719103, - "learning_rate": 1.6819557683374766e-06, - "loss": 0.0232, - "step": 825 - }, - { - "epoch": 9.833333333333334, - "grad_norm": 2.6658995622840576, - "learning_rate": 1.6641502850056078e-06, - "loss": 0.0858, - "step": 826 - }, - { - "epoch": 9.845238095238095, - "grad_norm": 1.9573291379645015, - "learning_rate": 1.6464309923166033e-06, - "loss": 0.0649, - "step": 827 - }, - { - "epoch": 9.857142857142858, - "grad_norm": 2.9219885547313984, - "learning_rate": 1.6287980734836118e-06, - "loss": 0.0637, - "step": 828 - }, - { - "epoch": 9.869047619047619, - "grad_norm": 1.7737112336849967, - "learning_rate": 1.6112517108266867e-06, - "loss": 0.0597, - "step": 829 - }, - { - "epoch": 9.880952380952381, - "grad_norm": 4.779901240190352, - "learning_rate": 1.593792085770921e-06, - "loss": 0.137, - "step": 830 - }, - { - "epoch": 9.892857142857142, - "grad_norm": 3.801958131422506, - "learning_rate": 1.5764193788445548e-06, - "loss": 0.1197, - "step": 831 - }, - { - "epoch": 9.904761904761905, - "grad_norm": 2.2120118800441984, - "learning_rate": 1.5591337696771247e-06, - "loss": 0.0728, - "step": 832 - }, - { - "epoch": 9.916666666666666, - "grad_norm": 1.4958592670875228, - "learning_rate": 1.541935436997588e-06, - "loss": 0.0481, - "step": 833 - }, - { - "epoch": 9.928571428571429, - "grad_norm": 2.8834804691841964, - "learning_rate": 1.5248245586324883e-06, - "loss": 0.0175, - "step": 834 - }, - { - "epoch": 9.94047619047619, - "grad_norm": 2.0804272562966135, - "learning_rate": 1.507801311504119e-06, - "loss": 0.0772, - "step": 835 - }, - { - "epoch": 9.952380952380953, - "grad_norm": 2.2745884531321514, - "learning_rate": 1.4908658716286784e-06, - "loss": 0.0617, - "step": 836 - }, - { - "epoch": 9.964285714285714, - "grad_norm": 1.526962860362152, - "learning_rate": 1.4740184141144664e-06, - "loss": 0.0281, - "step": 837 - }, - { - "epoch": 9.976190476190476, - "grad_norm": 4.076270588481699, - "learning_rate": 1.457259113160061e-06, - "loss": 0.0757, - "step": 838 - }, - { - "epoch": 9.988095238095237, - "grad_norm": 2.054731057370207, - "learning_rate": 1.4405881420525315e-06, - "loss": 0.0746, - "step": 839 - }, - { - "epoch": 10.0, - "grad_norm": 2.4967134409140845, - "learning_rate": 1.4240056731656271e-06, - "loss": 0.0998, - "step": 840 - }, - { - "epoch": 10.0, - "eval_loss": 0.29546400904655457, - "eval_runtime": 38.3401, - "eval_samples_per_second": 1.565, - "eval_steps_per_second": 1.565, - "step": 840 - }, - { - "epoch": 10.011904761904763, - "grad_norm": 1.939367732693264, - "learning_rate": 1.4075118779580155e-06, - "loss": 0.0772, - "step": 841 - }, - { - "epoch": 10.023809523809524, - "grad_norm": 5.127527273619253, - "learning_rate": 1.391106926971496e-06, - "loss": 0.1178, - "step": 842 - }, - { - "epoch": 10.035714285714286, - "grad_norm": 4.574690636051704, - "learning_rate": 1.3747909898292389e-06, - "loss": 0.03, - "step": 843 - }, - { - "epoch": 10.047619047619047, - "grad_norm": 2.7132236248947894, - "learning_rate": 1.3585642352340377e-06, - "loss": 0.0788, - "step": 844 - }, - { - "epoch": 10.05952380952381, - "grad_norm": 1.9056680106061799, - "learning_rate": 1.3424268309665546e-06, - "loss": 0.073, - "step": 845 - }, - { - "epoch": 10.071428571428571, - "grad_norm": 1.2385633126556088, - "learning_rate": 1.3263789438835973e-06, - "loss": 0.0297, - "step": 846 - }, - { - "epoch": 10.083333333333334, - "grad_norm": 2.944507356001365, - "learning_rate": 1.3104207399163816e-06, - "loss": 0.0199, - "step": 847 - }, - { - "epoch": 10.095238095238095, - "grad_norm": 1.7161167095821772, - "learning_rate": 1.294552384068829e-06, - "loss": 0.0627, - "step": 848 - }, - { - "epoch": 10.107142857142858, - "grad_norm": 4.6380379114120265, - "learning_rate": 1.2787740404158455e-06, - "loss": 0.0709, - "step": 849 - }, - { - "epoch": 10.119047619047619, - "grad_norm": 1.634226568884785, - "learning_rate": 1.263085872101638e-06, - "loss": 0.0333, - "step": 850 - }, - { - "epoch": 10.130952380952381, - "grad_norm": 1.4985265598741437, - "learning_rate": 1.2474880413380253e-06, - "loss": 0.0443, - "step": 851 - }, - { - "epoch": 10.142857142857142, - "grad_norm": 1.3371053714757155, - "learning_rate": 1.2319807094027492e-06, - "loss": 0.0362, - "step": 852 - }, - { - "epoch": 10.154761904761905, - "grad_norm": 2.994449032456818, - "learning_rate": 1.216564036637825e-06, - "loss": 0.0406, - "step": 853 - }, - { - "epoch": 10.166666666666666, - "grad_norm": 2.2449860040319245, - "learning_rate": 1.201238182447867e-06, - "loss": 0.0379, - "step": 854 - }, - { - "epoch": 10.178571428571429, - "grad_norm": 1.7187436793514488, - "learning_rate": 1.1860033052984544e-06, - "loss": 0.0571, - "step": 855 - }, - { - "epoch": 10.19047619047619, - "grad_norm": 4.381492396088356, - "learning_rate": 1.1708595627144782e-06, - "loss": 0.0284, - "step": 856 - }, - { - "epoch": 10.202380952380953, - "grad_norm": 3.8586763241442013, - "learning_rate": 1.1558071112785297e-06, - "loss": 0.0771, - "step": 857 - }, - { - "epoch": 10.214285714285714, - "grad_norm": 2.337542758928375, - "learning_rate": 1.1408461066292643e-06, - "loss": 0.0666, - "step": 858 - }, - { - "epoch": 10.226190476190476, - "grad_norm": 1.929433338882153, - "learning_rate": 1.1259767034598046e-06, - "loss": 0.0441, - "step": 859 - }, - { - "epoch": 10.238095238095237, - "grad_norm": 4.750689941855376, - "learning_rate": 1.1111990555161322e-06, - "loss": 0.1109, - "step": 860 - }, - { - "epoch": 10.25, - "grad_norm": 2.8991103758900914, - "learning_rate": 1.0965133155955066e-06, - "loss": 0.0216, - "step": 861 - }, - { - "epoch": 10.261904761904763, - "grad_norm": 1.7271815063958462, - "learning_rate": 1.0819196355448801e-06, - "loss": 0.0667, - "step": 862 - }, - { - "epoch": 10.273809523809524, - "grad_norm": 3.3616262268804293, - "learning_rate": 1.0674181662593253e-06, - "loss": 0.0788, - "step": 863 - }, - { - "epoch": 10.285714285714286, - "grad_norm": 3.5482395867466283, - "learning_rate": 1.053009057680483e-06, - "loss": 0.0221, - "step": 864 - }, - { - "epoch": 10.297619047619047, - "grad_norm": 2.28948006890838, - "learning_rate": 1.0386924587949998e-06, - "loss": 0.0662, - "step": 865 - }, - { - "epoch": 10.30952380952381, - "grad_norm": 2.8494445710276914, - "learning_rate": 1.0244685176330027e-06, - "loss": 0.0205, - "step": 866 - }, - { - "epoch": 10.321428571428571, - "grad_norm": 1.577698024983074, - "learning_rate": 1.0103373812665552e-06, - "loss": 0.0688, - "step": 867 - }, - { - "epoch": 10.333333333333334, - "grad_norm": 2.1289638850579284, - "learning_rate": 9.962991958081446e-07, - "loss": 0.0158, - "step": 868 - }, - { - "epoch": 10.345238095238095, - "grad_norm": 3.460213528935419, - "learning_rate": 9.823541064091657e-07, - "loss": 0.1106, - "step": 869 - }, - { - "epoch": 10.357142857142858, - "grad_norm": 3.481972900368815, - "learning_rate": 9.68502257258428e-07, - "loss": 0.091, - "step": 870 - }, - { - "epoch": 10.369047619047619, - "grad_norm": 2.0142509797571337, - "learning_rate": 9.547437915806534e-07, - "loss": 0.0608, - "step": 871 - }, - { - "epoch": 10.380952380952381, - "grad_norm": 2.2583132229155405, - "learning_rate": 9.410788516350078e-07, - "loss": 0.0839, - "step": 872 - }, - { - "epoch": 10.392857142857142, - "grad_norm": 2.570112503291791, - "learning_rate": 9.27507578713619e-07, - "loss": 0.0535, - "step": 873 - }, - { - "epoch": 10.404761904761905, - "grad_norm": 2.170958789831051, - "learning_rate": 9.140301131401208e-07, - "loss": 0.0804, - "step": 874 - }, - { - "epoch": 10.416666666666666, - "grad_norm": 2.8444438323548136, - "learning_rate": 9.006465942682074e-07, - "loss": 0.0207, - "step": 875 - }, - { - "epoch": 10.428571428571429, - "grad_norm": 4.208193249007848, - "learning_rate": 8.873571604801812e-07, - "loss": 0.1133, - "step": 876 - }, - { - "epoch": 10.44047619047619, - "grad_norm": 3.80053895933604, - "learning_rate": 8.741619491855291e-07, - "loss": 0.1123, - "step": 877 - }, - { - "epoch": 10.452380952380953, - "grad_norm": 3.8249855056475948, - "learning_rate": 8.610610968195065e-07, - "loss": 0.0262, - "step": 878 - }, - { - "epoch": 10.464285714285714, - "grad_norm": 2.1591266760594583, - "learning_rate": 8.480547388417104e-07, - "loss": 0.0689, - "step": 879 - }, - { - "epoch": 10.476190476190476, - "grad_norm": 2.2449611487373615, - "learning_rate": 8.35143009734698e-07, - "loss": 0.0536, - "step": 880 - }, - { - "epoch": 10.488095238095237, - "grad_norm": 1.945141389129938, - "learning_rate": 8.223260430025804e-07, - "loss": 0.0489, - "step": 881 - }, - { - "epoch": 10.5, - "grad_norm": 2.859367384367929, - "learning_rate": 8.096039711696546e-07, - "loss": 0.0779, - "step": 882 - }, - { - "epoch": 10.511904761904763, - "grad_norm": 2.257419186913638, - "learning_rate": 7.969769257790238e-07, - "loss": 0.0159, - "step": 883 - }, - { - "epoch": 10.523809523809524, - "grad_norm": 1.9636635789211396, - "learning_rate": 7.844450373912437e-07, - "loss": 0.0616, - "step": 884 - }, - { - "epoch": 10.535714285714286, - "grad_norm": 2.4048703336882733, - "learning_rate": 7.720084355829661e-07, - "loss": 0.0399, - "step": 885 - }, - { - "epoch": 10.547619047619047, - "grad_norm": 2.516393022925152, - "learning_rate": 7.596672489456092e-07, - "loss": 0.0173, - "step": 886 - }, - { - "epoch": 10.55952380952381, - "grad_norm": 3.5055609696337373, - "learning_rate": 7.474216050840122e-07, - "loss": 0.0772, - "step": 887 - }, - { - "epoch": 10.571428571428571, - "grad_norm": 1.1195443659303188, - "learning_rate": 7.352716306151353e-07, - "loss": 0.0465, - "step": 888 - }, - { - "epoch": 10.583333333333334, - "grad_norm": 1.433395584509959, - "learning_rate": 7.232174511667378e-07, - "loss": 0.0316, - "step": 889 - }, - { - "epoch": 10.595238095238095, - "grad_norm": 1.4384479578684133, - "learning_rate": 7.112591913760769e-07, - "loss": 0.0377, - "step": 890 - }, - { - "epoch": 10.607142857142858, - "grad_norm": 3.633376265932014, - "learning_rate": 6.993969748886297e-07, - "loss": 0.0432, - "step": 891 - }, - { - "epoch": 10.619047619047619, - "grad_norm": 1.4047327300385846, - "learning_rate": 6.876309243568058e-07, - "loss": 0.0097, - "step": 892 - }, - { - "epoch": 10.630952380952381, - "grad_norm": 1.9199258882874348, - "learning_rate": 6.759611614386863e-07, - "loss": 0.0569, - "step": 893 - }, - { - "epoch": 10.642857142857142, - "grad_norm": 4.306204615416398, - "learning_rate": 6.643878067967568e-07, - "loss": 0.1107, - "step": 894 - }, - { - "epoch": 10.654761904761905, - "grad_norm": 2.297829748178259, - "learning_rate": 6.529109800966693e-07, - "loss": 0.0561, - "step": 895 - }, - { - "epoch": 10.666666666666666, - "grad_norm": 3.275946348662118, - "learning_rate": 6.415308000059961e-07, - "loss": 0.0245, - "step": 896 - }, - { - "epoch": 10.678571428571429, - "grad_norm": 2.2844223874918783, - "learning_rate": 6.302473841930135e-07, - "loss": 0.0521, - "step": 897 - }, - { - "epoch": 10.69047619047619, - "grad_norm": 4.300283035486153, - "learning_rate": 6.190608493254746e-07, - "loss": 0.0922, - "step": 898 - }, - { - "epoch": 10.702380952380953, - "grad_norm": 2.828737463417787, - "learning_rate": 6.079713110694052e-07, - "loss": 0.044, - "step": 899 - }, - { - "epoch": 10.714285714285714, - "grad_norm": 1.5456393145327858, - "learning_rate": 5.969788840879165e-07, - "loss": 0.0536, - "step": 900 - }, - { - "epoch": 10.726190476190476, - "grad_norm": 1.5850187544297383, - "learning_rate": 5.860836820400062e-07, - "loss": 0.0404, - "step": 901 - }, - { - "epoch": 10.738095238095237, - "grad_norm": 3.148235644738001, - "learning_rate": 5.752858175793951e-07, - "loss": 0.1086, - "step": 902 - }, - { - "epoch": 10.75, - "grad_norm": 2.393998297233833, - "learning_rate": 5.645854023533537e-07, - "loss": 0.0632, - "step": 903 - }, - { - "epoch": 10.761904761904763, - "grad_norm": 2.407523392793869, - "learning_rate": 5.539825470015536e-07, - "loss": 0.0792, - "step": 904 - }, - { - "epoch": 10.773809523809524, - "grad_norm": 2.0772622564808945, - "learning_rate": 5.434773611549182e-07, - "loss": 0.0641, - "step": 905 - }, - { - "epoch": 10.785714285714286, - "grad_norm": 2.8317777886655, - "learning_rate": 5.330699534344986e-07, - "loss": 0.0725, - "step": 906 - }, - { - "epoch": 10.797619047619047, - "grad_norm": 3.3935608020239942, - "learning_rate": 5.227604314503343e-07, - "loss": 0.0232, - "step": 907 - }, - { - "epoch": 10.80952380952381, - "grad_norm": 3.689532389180214, - "learning_rate": 5.125489018003583e-07, - "loss": 0.0225, - "step": 908 - }, - { - "epoch": 10.821428571428571, - "grad_norm": 5.726396718481587, - "learning_rate": 5.024354700692868e-07, - "loss": 0.1026, - "step": 909 - }, - { - "epoch": 10.833333333333334, - "grad_norm": 2.7925292834564104, - "learning_rate": 4.924202408275203e-07, - "loss": 0.0819, - "step": 910 - }, - { - "epoch": 10.845238095238095, - "grad_norm": 1.6524737914806937, - "learning_rate": 4.825033176300786e-07, - "loss": 0.0103, - "step": 911 - }, - { - "epoch": 10.857142857142858, - "grad_norm": 2.135124230264784, - "learning_rate": 4.726848030155129e-07, - "loss": 0.0459, - "step": 912 - }, - { - "epoch": 10.869047619047619, - "grad_norm": 2.179953610716799, - "learning_rate": 4.6296479850486133e-07, - "loss": 0.0492, - "step": 913 - }, - { - "epoch": 10.880952380952381, - "grad_norm": 2.0409112787447827, - "learning_rate": 4.5334340460058354e-07, - "loss": 0.0595, - "step": 914 - }, - { - "epoch": 10.892857142857142, - "grad_norm": 1.592173470715575, - "learning_rate": 4.438207207855383e-07, - "loss": 0.0293, - "step": 915 - }, - { - "epoch": 10.904761904761905, - "grad_norm": 2.4703577660931177, - "learning_rate": 4.343968455219416e-07, - "loss": 0.0176, - "step": 916 - }, - { - "epoch": 10.916666666666666, - "grad_norm": 3.3207335636677318, - "learning_rate": 4.250718762503514e-07, - "loss": 0.04, - "step": 917 - }, - { - "epoch": 10.928571428571429, - "grad_norm": 1.649892185275184, - "learning_rate": 4.15845909388668e-07, - "loss": 0.0519, - "step": 918 - }, - { - "epoch": 10.94047619047619, - "grad_norm": 2.34723989626408, - "learning_rate": 4.0671904033112386e-07, - "loss": 0.0616, - "step": 919 - }, - { - "epoch": 10.952380952380953, - "grad_norm": 1.5884046811323744, - "learning_rate": 3.976913634473112e-07, - "loss": 0.01, - "step": 920 - }, - { - "epoch": 10.964285714285714, - "grad_norm": 3.0053504070381427, - "learning_rate": 3.8876297208119253e-07, - "loss": 0.0613, - "step": 921 - }, - { - "epoch": 10.976190476190476, - "grad_norm": 2.083830931442132, - "learning_rate": 3.799339585501494e-07, - "loss": 0.0547, - "step": 922 - }, - { - "epoch": 10.988095238095237, - "grad_norm": 4.938614455336452, - "learning_rate": 3.712044141440152e-07, - "loss": 0.1153, - "step": 923 - }, - { - "epoch": 11.0, - "grad_norm": 1.6270011686782586, - "learning_rate": 3.6257442912414175e-07, - "loss": 0.0587, - "step": 924 - }, - { - "epoch": 11.0, - "eval_loss": 0.29636117815971375, - "eval_runtime": 38.491, - "eval_samples_per_second": 1.559, - "eval_steps_per_second": 1.559, - "step": 924 - }, - { - "epoch": 11.011904761904763, - "grad_norm": 2.986931803591163, - "learning_rate": 3.5404409272245757e-07, - "loss": 0.092, - "step": 925 - }, - { - "epoch": 11.023809523809524, - "grad_norm": 1.3347524168100455, - "learning_rate": 3.456134931405497e-07, - "loss": 0.0418, - "step": 926 - }, - { - "epoch": 11.035714285714286, - "grad_norm": 5.5212254683737925, - "learning_rate": 3.3728271754875365e-07, - "loss": 0.0339, - "step": 927 - }, - { - "epoch": 11.047619047619047, - "grad_norm": 2.7505612562930226, - "learning_rate": 3.290518520852437e-07, - "loss": 0.0583, - "step": 928 - }, - { - "epoch": 11.05952380952381, - "grad_norm": 2.456582011903621, - "learning_rate": 3.20920981855154e-07, - "loss": 0.0799, - "step": 929 - }, - { - "epoch": 11.071428571428571, - "grad_norm": 1.420816754610895, - "learning_rate": 3.1289019092968774e-07, - "loss": 0.0393, - "step": 930 - }, - { - "epoch": 11.083333333333334, - "grad_norm": 1.579732141095948, - "learning_rate": 3.0495956234525836e-07, - "loss": 0.0543, - "step": 931 - }, - { - "epoch": 11.095238095238095, - "grad_norm": 1.3126593889196536, - "learning_rate": 2.9712917810261645e-07, - "loss": 0.0241, - "step": 932 - }, - { - "epoch": 11.107142857142858, - "grad_norm": 2.949990528078044, - "learning_rate": 2.893991191660195e-07, - "loss": 0.0984, - "step": 933 - }, - { - "epoch": 11.119047619047619, - "grad_norm": 2.120636416281821, - "learning_rate": 2.817694654623804e-07, - "loss": 0.0576, - "step": 934 - }, - { - "epoch": 11.130952380952381, - "grad_norm": 1.9186338861805565, - "learning_rate": 2.742402958804491e-07, - "loss": 0.0621, - "step": 935 - }, - { - "epoch": 11.142857142857142, - "grad_norm": 1.0631990758619283, - "learning_rate": 2.668116882699956e-07, - "loss": 0.0287, - "step": 936 - }, - { - "epoch": 11.154761904761905, - "grad_norm": 1.4820135737077909, - "learning_rate": 2.5948371944099715e-07, - "loss": 0.0307, - "step": 937 - }, - { - "epoch": 11.166666666666666, - "grad_norm": 4.437779501180707, - "learning_rate": 2.5225646516285897e-07, - "loss": 0.0996, - "step": 938 - }, - { - "epoch": 11.178571428571429, - "grad_norm": 1.8449980165305102, - "learning_rate": 2.45130000163617e-07, - "loss": 0.048, - "step": 939 - }, - { - "epoch": 11.19047619047619, - "grad_norm": 5.919702050199371, - "learning_rate": 2.381043981291753e-07, - "loss": 0.0688, - "step": 940 - }, - { - "epoch": 11.202380952380953, - "grad_norm": 2.825856353607067, - "learning_rate": 2.3117973170253216e-07, - "loss": 0.0402, - "step": 941 - }, - { - "epoch": 11.214285714285714, - "grad_norm": 2.7231510331845943, - "learning_rate": 2.2435607248304624e-07, - "loss": 0.0183, - "step": 942 - }, - { - "epoch": 11.226190476190476, - "grad_norm": 2.4767739656112497, - "learning_rate": 2.176334910256772e-07, - "loss": 0.0762, - "step": 943 - }, - { - "epoch": 11.238095238095237, - "grad_norm": 3.8407293601387416, - "learning_rate": 2.1101205684027294e-07, - "loss": 0.0448, - "step": 944 - }, - { - "epoch": 11.25, - "grad_norm": 1.3667000249854184, - "learning_rate": 2.0449183839083675e-07, - "loss": 0.0101, - "step": 945 - }, - { - "epoch": 11.261904761904763, - "grad_norm": 2.8261958005353924, - "learning_rate": 1.980729030948314e-07, - "loss": 0.0173, - "step": 946 - }, - { - "epoch": 11.273809523809524, - "grad_norm": 2.407670745363879, - "learning_rate": 1.91755317322474e-07, - "loss": 0.0418, - "step": 947 - }, - { - "epoch": 11.285714285714286, - "grad_norm": 2.288457051038679, - "learning_rate": 1.8553914639605207e-07, - "loss": 0.0691, - "step": 948 - }, - { - "epoch": 11.297619047619047, - "grad_norm": 1.3870092710553388, - "learning_rate": 1.7942445458925206e-07, - "loss": 0.0093, - "step": 949 - }, - { - "epoch": 11.30952380952381, - "grad_norm": 1.6211467655272436, - "learning_rate": 1.734113051264852e-07, - "loss": 0.0493, - "step": 950 - }, - { - "epoch": 11.321428571428571, - "grad_norm": 2.7633334414007518, - "learning_rate": 1.674997601822448e-07, - "loss": 0.0237, - "step": 951 - }, - { - "epoch": 11.333333333333334, - "grad_norm": 2.132739304201599, - "learning_rate": 1.616898808804568e-07, - "loss": 0.0702, - "step": 952 - }, - { - "epoch": 11.345238095238095, - "grad_norm": 5.177102761658638, - "learning_rate": 1.5598172729384686e-07, - "loss": 0.1246, - "step": 953 - }, - { - "epoch": 11.357142857142858, - "grad_norm": 2.469101298855786, - "learning_rate": 1.5037535844332762e-07, - "loss": 0.0643, - "step": 954 - }, - { - "epoch": 11.369047619047619, - "grad_norm": 1.0161239922881402, - "learning_rate": 1.4487083229737574e-07, - "loss": 0.0515, - "step": 955 - }, - { - "epoch": 11.380952380952381, - "grad_norm": 2.6222860821706564, - "learning_rate": 1.3946820577144359e-07, - "loss": 0.0812, - "step": 956 - }, - { - "epoch": 11.392857142857142, - "grad_norm": 1.0962889011973982, - "learning_rate": 1.341675347273652e-07, - "loss": 0.008, - "step": 957 - }, - { - "epoch": 11.404761904761905, - "grad_norm": 1.886834476713489, - "learning_rate": 1.2896887397278125e-07, - "loss": 0.047, - "step": 958 - }, - { - "epoch": 11.416666666666666, - "grad_norm": 2.739136030244547, - "learning_rate": 1.2387227726056606e-07, - "loss": 0.0757, - "step": 959 - }, - { - "epoch": 11.428571428571429, - "grad_norm": 5.288321746083624, - "learning_rate": 1.1887779728828486e-07, - "loss": 0.1198, - "step": 960 - }, - { - "epoch": 11.44047619047619, - "grad_norm": 2.104498900191149, - "learning_rate": 1.1398548569763191e-07, - "loss": 0.0718, - "step": 961 - }, - { - "epoch": 11.452380952380953, - "grad_norm": 2.306227037555575, - "learning_rate": 1.0919539307391314e-07, - "loss": 0.0477, - "step": 962 - }, - { - "epoch": 11.464285714285714, - "grad_norm": 3.1821099458410713, - "learning_rate": 1.0450756894551217e-07, - "loss": 0.0431, - "step": 963 - }, - { - "epoch": 11.476190476190476, - "grad_norm": 1.2721737087858365, - "learning_rate": 9.99220617833796e-08, - "loss": 0.0292, - "step": 964 - }, - { - "epoch": 11.488095238095237, - "grad_norm": 3.2490088015741097, - "learning_rate": 9.54389190005367e-08, - "loss": 0.0226, - "step": 965 - }, - { - "epoch": 11.5, - "grad_norm": 7.477486139245656, - "learning_rate": 9.105818695158031e-08, - "loss": 0.0483, - "step": 966 - }, - { - "epoch": 11.511904761904763, - "grad_norm": 4.16925857937326, - "learning_rate": 8.677991093220761e-08, - "loss": 0.1144, - "step": 967 - }, - { - "epoch": 11.523809523809524, - "grad_norm": 5.42535254900298, - "learning_rate": 8.26041351787421e-08, - "loss": 0.1407, - "step": 968 - }, - { - "epoch": 11.535714285714286, - "grad_norm": 2.0057715771874207, - "learning_rate": 7.853090286768284e-08, - "loss": 0.0648, - "step": 969 - }, - { - "epoch": 11.547619047619047, - "grad_norm": 1.7657521223378563, - "learning_rate": 7.456025611525253e-08, - "loss": 0.0442, - "step": 970 - }, - { - "epoch": 11.55952380952381, - "grad_norm": 1.568683495757951, - "learning_rate": 7.069223597696572e-08, - "loss": 0.0473, - "step": 971 - }, - { - "epoch": 11.571428571428571, - "grad_norm": 4.441369110805783, - "learning_rate": 6.692688244720357e-08, - "loss": 0.0279, - "step": 972 - }, - { - "epoch": 11.583333333333334, - "grad_norm": 2.9301820879708758, - "learning_rate": 6.326423445879748e-08, - "loss": 0.04, - "step": 973 - }, - { - "epoch": 11.595238095238095, - "grad_norm": 2.9514527443649117, - "learning_rate": 5.970432988263053e-08, - "loss": 0.0393, - "step": 974 - }, - { - "epoch": 11.607142857142858, - "grad_norm": 2.6193009344389733, - "learning_rate": 5.624720552724228e-08, - "loss": 0.0756, - "step": 975 - }, - { - "epoch": 11.619047619047619, - "grad_norm": 1.858288104051874, - "learning_rate": 5.289289713845347e-08, - "loss": 0.0866, - "step": 976 - }, - { - "epoch": 11.630952380952381, - "grad_norm": 1.3996258099249799, - "learning_rate": 4.964143939898747e-08, - "loss": 0.0504, - "step": 977 - }, - { - "epoch": 11.642857142857142, - "grad_norm": 1.4934729697666005, - "learning_rate": 4.6492865928123856e-08, - "loss": 0.0494, - "step": 978 - }, - { - "epoch": 11.654761904761905, - "grad_norm": 2.019745524010841, - "learning_rate": 4.344720928133761e-08, - "loss": 0.0724, - "step": 979 - }, - { - "epoch": 11.666666666666666, - "grad_norm": 0.8817349997020998, - "learning_rate": 4.0504500949974936e-08, - "loss": 0.0249, - "step": 980 - }, - { - "epoch": 11.678571428571429, - "grad_norm": 3.135857451457003, - "learning_rate": 3.766477136091573e-08, - "loss": 0.0205, - "step": 981 - }, - { - "epoch": 11.69047619047619, - "grad_norm": 4.821187540685422, - "learning_rate": 3.49280498762683e-08, - "loss": 0.0948, - "step": 982 - }, - { - "epoch": 11.702380952380953, - "grad_norm": 2.155760067566603, - "learning_rate": 3.2294364793064026e-08, - "loss": 0.0773, - "step": 983 - }, - { - "epoch": 11.714285714285714, - "grad_norm": 1.2790261765367164, - "learning_rate": 2.976374334295984e-08, - "loss": 0.0394, - "step": 984 - }, - { - "epoch": 11.726190476190476, - "grad_norm": 1.9945206184899933, - "learning_rate": 2.7336211691961767e-08, - "loss": 0.0607, - "step": 985 - }, - { - "epoch": 11.738095238095237, - "grad_norm": 2.2954595943776375, - "learning_rate": 2.5011794940150713e-08, - "loss": 0.0685, - "step": 986 - }, - { - "epoch": 11.75, - "grad_norm": 3.889746627099117, - "learning_rate": 2.2790517121428212e-08, - "loss": 0.092, - "step": 987 - }, - { - "epoch": 11.761904761904763, - "grad_norm": 1.9305228670767907, - "learning_rate": 2.0672401203261084e-08, - "loss": 0.0608, - "step": 988 - }, - { - "epoch": 11.773809523809524, - "grad_norm": 1.49732917316992, - "learning_rate": 1.8657469086448275e-08, - "loss": 0.0587, - "step": 989 - }, - { - "epoch": 11.785714285714286, - "grad_norm": 3.209573738861191, - "learning_rate": 1.6745741604895503e-08, - "loss": 0.0634, - "step": 990 - }, - { - "epoch": 11.797619047619047, - "grad_norm": 0.9302644870602609, - "learning_rate": 1.4937238525395416e-08, - "loss": 0.021, - "step": 991 - }, - { - "epoch": 11.80952380952381, - "grad_norm": 1.7458424653110582, - "learning_rate": 1.3231978547427749e-08, - "loss": 0.05, - "step": 992 - }, - { - "epoch": 11.821428571428571, - "grad_norm": 2.4382393205857316, - "learning_rate": 1.1629979302960614e-08, - "loss": 0.0457, - "step": 993 - }, - { - "epoch": 11.833333333333334, - "grad_norm": 0.17241553083238312, - "learning_rate": 1.013125735627507e-08, - "loss": 0.0031, - "step": 994 - }, - { - "epoch": 11.845238095238095, - "grad_norm": 2.1464199664004395, - "learning_rate": 8.735828203787489e-09, - "loss": 0.0836, - "step": 995 - }, - { - "epoch": 11.857142857142858, - "grad_norm": 2.8983917842024045, - "learning_rate": 7.443706273895235e-09, - "loss": 0.0511, - "step": 996 - }, - { - "epoch": 11.869047619047619, - "grad_norm": 5.046944747259621, - "learning_rate": 6.254904926820127e-09, - "loss": 0.1221, - "step": 997 - }, - { - "epoch": 11.880952380952381, - "grad_norm": 1.087638003537602, - "learning_rate": 5.169436454478538e-09, - "loss": 0.0359, - "step": 998 - }, - { - "epoch": 11.892857142857142, - "grad_norm": 2.6782948586641786, - "learning_rate": 4.187312080347061e-09, - "loss": 0.0578, - "step": 999 - }, - { - "epoch": 11.904761904761905, - "grad_norm": 1.4941663210879166, - "learning_rate": 3.3085419593514857e-09, - "loss": 0.0301, - "step": 1000 - }, - { - "epoch": 11.916666666666666, - "grad_norm": 2.03536965767824, - "learning_rate": 2.533135177760215e-09, - "loss": 0.0353, - "step": 1001 - }, - { - "epoch": 11.928571428571429, - "grad_norm": 2.870140395549658, - "learning_rate": 1.8610997530876806e-09, - "loss": 0.0526, - "step": 1002 - }, - { - "epoch": 11.94047619047619, - "grad_norm": 1.9821994732842032, - "learning_rate": 1.2924426340177321e-09, - "loss": 0.0368, - "step": 1003 - }, - { - "epoch": 11.952380952380953, - "grad_norm": 2.570985111435327, - "learning_rate": 8.271697003237045e-10, - "loss": 0.0624, - "step": 1004 - }, - { - "epoch": 11.964285714285714, - "grad_norm": 1.3718413803749656, - "learning_rate": 4.6528576281401616e-10, - "loss": 0.0661, - "step": 1005 - }, - { - "epoch": 11.976190476190476, - "grad_norm": 2.371522463710688, - "learning_rate": 2.0679456327998838e-10, - "loss": 0.062, - "step": 1006 - }, - { - "epoch": 11.988095238095237, - "grad_norm": 4.216784477215788, - "learning_rate": 5.169877445809768e-11, - "loss": 0.0267, - "step": 1007 - }, - { - "epoch": 12.0, - "grad_norm": 3.6030455433551953, - "learning_rate": 0.0, - "loss": 0.0259, - "step": 1008 - }, - { - "epoch": 12.0, - "eval_loss": 0.29545679688453674, - "eval_runtime": 38.8506, - "eval_samples_per_second": 1.544, - "eval_steps_per_second": 1.544, - "step": 1008 + "epoch": 11.899159663865547, + "eval_loss": 0.08401793986558914, + "eval_runtime": 85.0205, + "eval_samples_per_second": 1.106, + "eval_steps_per_second": 1.106, + "step": 708 }, { - "epoch": 12.0, - "step": 1008, - "total_flos": 280627377340416.0, - "train_loss": 0.48460018415431794, - "train_runtime": 6901.6294, - "train_samples_per_second": 0.584, - "train_steps_per_second": 0.146 + "epoch": 11.899159663865547, + "step": 708, + "total_flos": 356037728403456.0, + "train_loss": 0.4260152625183096, + "train_runtime": 5257.2185, + "train_samples_per_second": 1.08, + "train_steps_per_second": 0.135 } ], "logging_steps": 1.0, - "max_steps": 1008, + "max_steps": 708, "num_input_tokens_seen": 0, "num_train_epochs": 12, - "save_steps": 500, + "save_steps": 200.0, "stateful_callbacks": { "TrainerControl": { "args": { @@ -7187,8 +5087,8 @@ "attributes": {} } }, - "total_flos": 280627377340416.0, - "train_batch_size": 1, + "total_flos": 356037728403456.0, + "train_batch_size": 4, "trial_name": null, "trial_params": null }