diff --git "a/checkpoint-1092/trainer_state.json" "b/checkpoint-1092/trainer_state.json" deleted file mode 100644--- "a/checkpoint-1092/trainer_state.json" +++ /dev/null @@ -1,7781 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 13.0, - "eval_steps": 500, - "global_step": 1092, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.011904761904761904, - "grad_norm": 17.036167336296355, - "learning_rate": 3.921568627450981e-07, - "loss": 9.2899, - "step": 1 - }, - { - "epoch": 0.023809523809523808, - "grad_norm": 16.51040791982049, - "learning_rate": 7.843137254901962e-07, - "loss": 9.1432, - "step": 2 - }, - { - "epoch": 0.03571428571428571, - "grad_norm": 17.409029658709134, - "learning_rate": 1.1764705882352942e-06, - "loss": 9.5443, - "step": 3 - }, - { - "epoch": 0.047619047619047616, - "grad_norm": 13.165316535403809, - "learning_rate": 1.5686274509803923e-06, - "loss": 7.9008, - "step": 4 - }, - { - "epoch": 0.05952380952380952, - "grad_norm": 13.598211666391473, - "learning_rate": 1.96078431372549e-06, - "loss": 7.9033, - "step": 5 - }, - { - "epoch": 0.07142857142857142, - "grad_norm": 6.219902614589629, - "learning_rate": 2.3529411764705885e-06, - "loss": 4.9995, - "step": 6 - }, - { - "epoch": 0.08333333333333333, - "grad_norm": 9.734044836676956, - "learning_rate": 2.7450980392156867e-06, - "loss": 6.5143, - "step": 7 - }, - { - "epoch": 0.09523809523809523, - "grad_norm": 16.731742931555345, - "learning_rate": 3.1372549019607846e-06, - "loss": 9.0697, - "step": 8 - }, - { - "epoch": 0.10714285714285714, - "grad_norm": 13.231944799550764, - "learning_rate": 3.529411764705883e-06, - "loss": 7.8908, - "step": 9 - }, - { - "epoch": 0.11904761904761904, - "grad_norm": 9.617661005903635, - "learning_rate": 3.92156862745098e-06, - "loss": 6.4021, - "step": 10 - }, - { - "epoch": 0.13095238095238096, - "grad_norm": 13.333162661233034, - "learning_rate": 4.313725490196079e-06, - "loss": 7.8622, - "step": 11 - }, - { - "epoch": 0.14285714285714285, - "grad_norm": 10.173955662246277, - "learning_rate": 4.705882352941177e-06, - "loss": 6.4799, - "step": 12 - }, - { - "epoch": 0.15476190476190477, - "grad_norm": 13.53421872626686, - "learning_rate": 5.098039215686274e-06, - "loss": 7.615, - "step": 13 - }, - { - "epoch": 0.16666666666666666, - "grad_norm": 13.528091158193236, - "learning_rate": 5.4901960784313735e-06, - "loss": 7.8859, - "step": 14 - }, - { - "epoch": 0.17857142857142858, - "grad_norm": 13.46530715470059, - "learning_rate": 5.882352941176471e-06, - "loss": 7.7087, - "step": 15 - }, - { - "epoch": 0.19047619047619047, - "grad_norm": 13.186854066383047, - "learning_rate": 6.274509803921569e-06, - "loss": 7.7375, - "step": 16 - }, - { - "epoch": 0.20238095238095238, - "grad_norm": 13.709261078338196, - "learning_rate": 6.666666666666667e-06, - "loss": 7.7001, - "step": 17 - }, - { - "epoch": 0.21428571428571427, - "grad_norm": 13.694779123750742, - "learning_rate": 7.058823529411766e-06, - "loss": 7.5827, - "step": 18 - }, - { - "epoch": 0.2261904761904762, - "grad_norm": 16.830863696016568, - "learning_rate": 7.450980392156863e-06, - "loss": 8.9601, - "step": 19 - }, - { - "epoch": 0.23809523809523808, - "grad_norm": 17.22287672142912, - "learning_rate": 7.84313725490196e-06, - "loss": 8.9006, - "step": 20 - }, - { - "epoch": 0.25, - "grad_norm": 17.92814388484799, - "learning_rate": 8.23529411764706e-06, - "loss": 9.1349, - "step": 21 - }, - { - "epoch": 0.2619047619047619, - "grad_norm": 17.193339613649197, - "learning_rate": 8.627450980392157e-06, - "loss": 8.7754, - "step": 22 - }, - { - "epoch": 0.27380952380952384, - "grad_norm": 17.286206959133764, - "learning_rate": 9.019607843137256e-06, - "loss": 8.7204, - "step": 23 - }, - { - "epoch": 0.2857142857142857, - "grad_norm": 10.070105901874978, - "learning_rate": 9.411764705882354e-06, - "loss": 6.1837, - "step": 24 - }, - { - "epoch": 0.2976190476190476, - "grad_norm": 13.810184661186407, - "learning_rate": 9.803921568627451e-06, - "loss": 7.4576, - "step": 25 - }, - { - "epoch": 0.30952380952380953, - "grad_norm": 6.7625117928823775, - "learning_rate": 1.0196078431372549e-05, - "loss": 4.806, - "step": 26 - }, - { - "epoch": 0.32142857142857145, - "grad_norm": 13.956198625918494, - "learning_rate": 1.0588235294117648e-05, - "loss": 7.0487, - "step": 27 - }, - { - "epoch": 0.3333333333333333, - "grad_norm": 10.260558995935384, - "learning_rate": 1.0980392156862747e-05, - "loss": 5.6256, - "step": 28 - }, - { - "epoch": 0.34523809523809523, - "grad_norm": 17.35991907597269, - "learning_rate": 1.1372549019607844e-05, - "loss": 8.0574, - "step": 29 - }, - { - "epoch": 0.35714285714285715, - "grad_norm": 9.721422858798402, - "learning_rate": 1.1764705882352942e-05, - "loss": 5.5548, - "step": 30 - }, - { - "epoch": 0.36904761904761907, - "grad_norm": 13.840485577854023, - "learning_rate": 1.215686274509804e-05, - "loss": 6.7663, - "step": 31 - }, - { - "epoch": 0.38095238095238093, - "grad_norm": 10.118512030653148, - "learning_rate": 1.2549019607843138e-05, - "loss": 5.543, - "step": 32 - }, - { - "epoch": 0.39285714285714285, - "grad_norm": 14.32625990673996, - "learning_rate": 1.2941176470588238e-05, - "loss": 6.2332, - "step": 33 - }, - { - "epoch": 0.40476190476190477, - "grad_norm": 10.011793459979055, - "learning_rate": 1.3333333333333333e-05, - "loss": 5.3651, - "step": 34 - }, - { - "epoch": 0.4166666666666667, - "grad_norm": 10.123714137969845, - "learning_rate": 1.3725490196078432e-05, - "loss": 5.3173, - "step": 35 - }, - { - "epoch": 0.42857142857142855, - "grad_norm": 10.251943500911867, - "learning_rate": 1.4117647058823532e-05, - "loss": 4.9469, - "step": 36 - }, - { - "epoch": 0.44047619047619047, - "grad_norm": 15.220443465155252, - "learning_rate": 1.4509803921568629e-05, - "loss": 5.5487, - "step": 37 - }, - { - "epoch": 0.4523809523809524, - "grad_norm": 10.713358439711659, - "learning_rate": 1.4901960784313726e-05, - "loss": 4.8903, - "step": 38 - }, - { - "epoch": 0.4642857142857143, - "grad_norm": 10.755404929616764, - "learning_rate": 1.5294117647058822e-05, - "loss": 4.8086, - "step": 39 - }, - { - "epoch": 0.47619047619047616, - "grad_norm": 11.89621232727623, - "learning_rate": 1.568627450980392e-05, - "loss": 4.5261, - "step": 40 - }, - { - "epoch": 0.4880952380952381, - "grad_norm": 7.524808327717391, - "learning_rate": 1.607843137254902e-05, - "loss": 3.7146, - "step": 41 - }, - { - "epoch": 0.5, - "grad_norm": 12.838155901645372, - "learning_rate": 1.647058823529412e-05, - "loss": 4.2837, - "step": 42 - }, - { - "epoch": 0.5119047619047619, - "grad_norm": 12.287630066184375, - "learning_rate": 1.686274509803922e-05, - "loss": 4.2075, - "step": 43 - }, - { - "epoch": 0.5238095238095238, - "grad_norm": 25.407055834196797, - "learning_rate": 1.7254901960784314e-05, - "loss": 4.4444, - "step": 44 - }, - { - "epoch": 0.5357142857142857, - "grad_norm": 14.241934754354682, - "learning_rate": 1.7647058823529414e-05, - "loss": 3.9014, - "step": 45 - }, - { - "epoch": 0.5476190476190477, - "grad_norm": 14.566784082629397, - "learning_rate": 1.8039215686274513e-05, - "loss": 3.5172, - "step": 46 - }, - { - "epoch": 0.5595238095238095, - "grad_norm": 13.542772667104277, - "learning_rate": 1.843137254901961e-05, - "loss": 3.296, - "step": 47 - }, - { - "epoch": 0.5714285714285714, - "grad_norm": 16.124803348222727, - "learning_rate": 1.8823529411764708e-05, - "loss": 3.0504, - "step": 48 - }, - { - "epoch": 0.5833333333333334, - "grad_norm": 12.95227008504844, - "learning_rate": 1.9215686274509807e-05, - "loss": 2.88, - "step": 49 - }, - { - "epoch": 0.5952380952380952, - "grad_norm": 3.595023968242296, - "learning_rate": 1.9607843137254903e-05, - "loss": 2.6277, - "step": 50 - }, - { - "epoch": 0.6071428571428571, - "grad_norm": 14.324134170867602, - "learning_rate": 2e-05, - "loss": 2.5481, - "step": 51 - }, - { - "epoch": 0.6190476190476191, - "grad_norm": 14.936425671150648, - "learning_rate": 1.9999981403661347e-05, - "loss": 2.4101, - "step": 52 - }, - { - "epoch": 0.6309523809523809, - "grad_norm": 11.892934392488657, - "learning_rate": 1.9999925614714537e-05, - "loss": 2.2475, - "step": 53 - }, - { - "epoch": 0.6428571428571429, - "grad_norm": 6.548368217885392, - "learning_rate": 1.9999832633367076e-05, - "loss": 2.5809, - "step": 54 - }, - { - "epoch": 0.6547619047619048, - "grad_norm": 11.621015010841388, - "learning_rate": 1.999970245996478e-05, - "loss": 2.0091, - "step": 55 - }, - { - "epoch": 0.6666666666666666, - "grad_norm": 8.696284471720817, - "learning_rate": 1.99995350949918e-05, - "loss": 2.1676, - "step": 56 - }, - { - "epoch": 0.6785714285714286, - "grad_norm": 12.002026381893657, - "learning_rate": 1.9999330539070615e-05, - "loss": 1.665, - "step": 57 - }, - { - "epoch": 0.6904761904761905, - "grad_norm": 11.065824262620708, - "learning_rate": 1.9999088792962016e-05, - "loss": 1.5071, - "step": 58 - }, - { - "epoch": 0.7023809523809523, - "grad_norm": 10.994414191496874, - "learning_rate": 1.999880985756513e-05, - "loss": 1.2489, - "step": 59 - }, - { - "epoch": 0.7142857142857143, - "grad_norm": 5.605348233753531, - "learning_rate": 1.9998493733917385e-05, - "loss": 2.1398, - "step": 60 - }, - { - "epoch": 0.7261904761904762, - "grad_norm": 11.91282152884282, - "learning_rate": 1.9998140423194534e-05, - "loss": 0.7581, - "step": 61 - }, - { - "epoch": 0.7380952380952381, - "grad_norm": 5.840433347765395, - "learning_rate": 1.9997749926710634e-05, - "loss": 1.4768, - "step": 62 - }, - { - "epoch": 0.75, - "grad_norm": 6.105791712493414, - "learning_rate": 1.999732224591804e-05, - "loss": 1.0261, - "step": 63 - }, - { - "epoch": 0.7619047619047619, - "grad_norm": 6.785960731870506, - "learning_rate": 1.999685738240742e-05, - "loss": 0.906, - "step": 64 - }, - { - "epoch": 0.7738095238095238, - "grad_norm": 6.2306364026893375, - "learning_rate": 1.999635533790772e-05, - "loss": 0.3629, - "step": 65 - }, - { - "epoch": 0.7857142857142857, - "grad_norm": 4.129747678855472, - "learning_rate": 1.999581611428618e-05, - "loss": 1.4474, - "step": 66 - }, - { - "epoch": 0.7976190476190477, - "grad_norm": 3.619952492724209, - "learning_rate": 1.9995239713548318e-05, - "loss": 1.207, - "step": 67 - }, - { - "epoch": 0.8095238095238095, - "grad_norm": 4.0590676768999305, - "learning_rate": 1.9994626137837917e-05, - "loss": 1.3818, - "step": 68 - }, - { - "epoch": 0.8214285714285714, - "grad_norm": 2.7517047479750576, - "learning_rate": 1.999397538943704e-05, - "loss": 1.2052, - "step": 69 - }, - { - "epoch": 0.8333333333333334, - "grad_norm": 6.041576469016595, - "learning_rate": 1.9993287470765986e-05, - "loss": 0.8238, - "step": 70 - }, - { - "epoch": 0.8452380952380952, - "grad_norm": 7.266622078598528, - "learning_rate": 1.999256238438331e-05, - "loss": 1.5338, - "step": 71 - }, - { - "epoch": 0.8571428571428571, - "grad_norm": 5.912372435902364, - "learning_rate": 1.9991800132985803e-05, - "loss": 2.3784, - "step": 72 - }, - { - "epoch": 0.8690476190476191, - "grad_norm": 4.956631346595567, - "learning_rate": 1.9991000719408483e-05, - "loss": 1.9622, - "step": 73 - }, - { - "epoch": 0.8809523809523809, - "grad_norm": 6.416670196307194, - "learning_rate": 1.9990164146624583e-05, - "loss": 0.8154, - "step": 74 - }, - { - "epoch": 0.8928571428571429, - "grad_norm": 12.54481609589616, - "learning_rate": 1.998929041774554e-05, - "loss": 1.4481, - "step": 75 - }, - { - "epoch": 0.9047619047619048, - "grad_norm": 3.1042285039390634, - "learning_rate": 1.9988379536020988e-05, - "loss": 1.2713, - "step": 76 - }, - { - "epoch": 0.9166666666666666, - "grad_norm": 4.825919181696002, - "learning_rate": 1.9987431504838737e-05, - "loss": 2.2873, - "step": 77 - }, - { - "epoch": 0.9285714285714286, - "grad_norm": 5.304665169610741, - "learning_rate": 1.998644632772477e-05, - "loss": 0.6765, - "step": 78 - }, - { - "epoch": 0.9404761904761905, - "grad_norm": 3.0999380965325196, - "learning_rate": 1.9985424008343226e-05, - "loss": 1.1929, - "step": 79 - }, - { - "epoch": 0.9523809523809523, - "grad_norm": 2.692769030054589, - "learning_rate": 1.998436455049638e-05, - "loss": 0.6175, - "step": 80 - }, - { - "epoch": 0.9642857142857143, - "grad_norm": 3.104212672349469, - "learning_rate": 1.9983267958124647e-05, - "loss": 1.2298, - "step": 81 - }, - { - "epoch": 0.9761904761904762, - "grad_norm": 3.3640203554586683, - "learning_rate": 1.998213423530654e-05, - "loss": 1.1316, - "step": 82 - }, - { - "epoch": 0.9880952380952381, - "grad_norm": 4.1473861422585525, - "learning_rate": 1.9980963386258682e-05, - "loss": 0.6541, - "step": 83 - }, - { - "epoch": 1.0, - "grad_norm": 4.372535318480156, - "learning_rate": 1.997975541533577e-05, - "loss": 1.3033, - "step": 84 - }, - { - "epoch": 1.0, - "eval_loss": 0.9026409983634949, - "eval_runtime": 57.3906, - "eval_samples_per_second": 1.045, - "eval_steps_per_second": 1.045, - "step": 84 - }, - { - "epoch": 1.0119047619047619, - "grad_norm": 2.777482345866408, - "learning_rate": 1.9978510327030577e-05, - "loss": 1.1553, - "step": 85 - }, - { - "epoch": 1.0238095238095237, - "grad_norm": 4.0464154340106475, - "learning_rate": 1.9977228125973917e-05, - "loss": 1.2478, - "step": 86 - }, - { - "epoch": 1.0357142857142858, - "grad_norm": 3.0692784835075306, - "learning_rate": 1.997590881693464e-05, - "loss": 1.1854, - "step": 87 - }, - { - "epoch": 1.0476190476190477, - "grad_norm": 2.3415895627089442, - "learning_rate": 1.9974552404819606e-05, - "loss": 1.5326, - "step": 88 - }, - { - "epoch": 1.0595238095238095, - "grad_norm": 2.336319119026421, - "learning_rate": 1.9973158894673677e-05, - "loss": 1.5136, - "step": 89 - }, - { - "epoch": 1.0714285714285714, - "grad_norm": 3.167429137483161, - "learning_rate": 1.9971728291679692e-05, - "loss": 0.2382, - "step": 90 - }, - { - "epoch": 1.0833333333333333, - "grad_norm": 2.8915816180219527, - "learning_rate": 1.9970260601158444e-05, - "loss": 1.0799, - "step": 91 - }, - { - "epoch": 1.0952380952380953, - "grad_norm": 3.2474222004118514, - "learning_rate": 1.996875582856867e-05, - "loss": 0.6601, - "step": 92 - }, - { - "epoch": 1.1071428571428572, - "grad_norm": 3.0071143059194436, - "learning_rate": 1.996721397950702e-05, - "loss": 1.0593, - "step": 93 - }, - { - "epoch": 1.119047619047619, - "grad_norm": 2.3487483579678625, - "learning_rate": 1.996563505970804e-05, - "loss": 0.5617, - "step": 94 - }, - { - "epoch": 1.130952380952381, - "grad_norm": 1.608206071362919, - "learning_rate": 1.9964019075044164e-05, - "loss": 1.1265, - "step": 95 - }, - { - "epoch": 1.1428571428571428, - "grad_norm": 2.1739566515425603, - "learning_rate": 1.9962366031525663e-05, - "loss": 0.638, - "step": 96 - }, - { - "epoch": 1.1547619047619047, - "grad_norm": 4.254923691373443, - "learning_rate": 1.9960675935300653e-05, - "loss": 1.5095, - "step": 97 - }, - { - "epoch": 1.1666666666666667, - "grad_norm": 2.1819566840909195, - "learning_rate": 1.9958948792655056e-05, - "loss": 0.5706, - "step": 98 - }, - { - "epoch": 1.1785714285714286, - "grad_norm": 2.0311354931594963, - "learning_rate": 1.995718461001257e-05, - "loss": 0.634, - "step": 99 - }, - { - "epoch": 1.1904761904761905, - "grad_norm": 1.5715938441379123, - "learning_rate": 1.9955383393934677e-05, - "loss": 0.9344, - "step": 100 - }, - { - "epoch": 1.2023809523809523, - "grad_norm": 2.230216937882692, - "learning_rate": 1.9953545151120565e-05, - "loss": 0.5713, - "step": 101 - }, - { - "epoch": 1.2142857142857142, - "grad_norm": 1.9246970673958967, - "learning_rate": 1.9951669888407162e-05, - "loss": 0.5854, - "step": 102 - }, - { - "epoch": 1.2261904761904763, - "grad_norm": 5.010856503469265, - "learning_rate": 1.9949757612769068e-05, - "loss": 0.6193, - "step": 103 - }, - { - "epoch": 1.2380952380952381, - "grad_norm": 1.6031600884223667, - "learning_rate": 1.994780833131855e-05, - "loss": 1.0085, - "step": 104 - }, - { - "epoch": 1.25, - "grad_norm": 2.749900960307275, - "learning_rate": 1.994582205130551e-05, - "loss": 1.4096, - "step": 105 - }, - { - "epoch": 1.2619047619047619, - "grad_norm": 1.0626905444801218, - "learning_rate": 1.9943798780117448e-05, - "loss": 0.5127, - "step": 106 - }, - { - "epoch": 1.2738095238095237, - "grad_norm": 1.574752679982518, - "learning_rate": 1.9941738525279456e-05, - "loss": 0.9511, - "step": 107 - }, - { - "epoch": 1.2857142857142856, - "grad_norm": 3.0210041542837596, - "learning_rate": 1.9939641294454172e-05, - "loss": 0.5167, - "step": 108 - }, - { - "epoch": 1.2976190476190477, - "grad_norm": 6.451462231840578, - "learning_rate": 1.993750709544176e-05, - "loss": 1.4284, - "step": 109 - }, - { - "epoch": 1.3095238095238095, - "grad_norm": 9.197366751047214, - "learning_rate": 1.9935335936179876e-05, - "loss": 0.2286, - "step": 110 - }, - { - "epoch": 1.3214285714285714, - "grad_norm": 0.9774070171128464, - "learning_rate": 1.9933127824743646e-05, - "loss": 0.4533, - "step": 111 - }, - { - "epoch": 1.3333333333333333, - "grad_norm": 1.5276991777107454, - "learning_rate": 1.9930882769345623e-05, - "loss": 0.529, - "step": 112 - }, - { - "epoch": 1.3452380952380953, - "grad_norm": 1.5612981824321013, - "learning_rate": 1.9928600778335774e-05, - "loss": 0.9313, - "step": 113 - }, - { - "epoch": 1.3571428571428572, - "grad_norm": 2.3815979703196852, - "learning_rate": 1.9926281860201427e-05, - "loss": 0.0845, - "step": 114 - }, - { - "epoch": 1.369047619047619, - "grad_norm": 4.8026812662489355, - "learning_rate": 1.992392602356727e-05, - "loss": 0.9438, - "step": 115 - }, - { - "epoch": 1.380952380952381, - "grad_norm": 5.356973288599696, - "learning_rate": 1.9921533277195282e-05, - "loss": 0.5935, - "step": 116 - }, - { - "epoch": 1.3928571428571428, - "grad_norm": 2.316890722187289, - "learning_rate": 1.9919103629984727e-05, - "loss": 1.2909, - "step": 117 - }, - { - "epoch": 1.4047619047619047, - "grad_norm": 3.1719727319199853, - "learning_rate": 1.991663709097212e-05, - "loss": 0.5475, - "step": 118 - }, - { - "epoch": 1.4166666666666667, - "grad_norm": 1.6801319700044128, - "learning_rate": 1.9914133669331174e-05, - "loss": 0.9584, - "step": 119 - }, - { - "epoch": 1.4285714285714286, - "grad_norm": 2.038248435291157, - "learning_rate": 1.991159337437279e-05, - "loss": 0.4672, - "step": 120 - }, - { - "epoch": 1.4404761904761905, - "grad_norm": 2.281787866316906, - "learning_rate": 1.9909016215544998e-05, - "loss": 0.5058, - "step": 121 - }, - { - "epoch": 1.4523809523809523, - "grad_norm": 6.05492934604481, - "learning_rate": 1.9906402202432945e-05, - "loss": 0.6174, - "step": 122 - }, - { - "epoch": 1.4642857142857144, - "grad_norm": 4.542635490766034, - "learning_rate": 1.990375134475885e-05, - "loss": 0.9407, - "step": 123 - }, - { - "epoch": 1.4761904761904763, - "grad_norm": 4.770874980454038, - "learning_rate": 1.9901063652381954e-05, - "loss": 0.1024, - "step": 124 - }, - { - "epoch": 1.4880952380952381, - "grad_norm": 1.8092172752263767, - "learning_rate": 1.989833913529851e-05, - "loss": 0.137, - "step": 125 - }, - { - "epoch": 1.5, - "grad_norm": 4.513986632423435, - "learning_rate": 1.9895577803641726e-05, - "loss": 0.5451, - "step": 126 - }, - { - "epoch": 1.5119047619047619, - "grad_norm": 1.4614473604306615, - "learning_rate": 1.9892779667681733e-05, - "loss": 0.8946, - "step": 127 - }, - { - "epoch": 1.5238095238095237, - "grad_norm": 4.049432365383836, - "learning_rate": 1.9889944737825546e-05, - "loss": 0.4902, - "step": 128 - }, - { - "epoch": 1.5357142857142856, - "grad_norm": 3.6084671463423876, - "learning_rate": 1.988707302461703e-05, - "loss": 0.0791, - "step": 129 - }, - { - "epoch": 1.5476190476190477, - "grad_norm": 1.8953178018348253, - "learning_rate": 1.9884164538736858e-05, - "loss": 0.1778, - "step": 130 - }, - { - "epoch": 1.5595238095238095, - "grad_norm": 1.692060013017211, - "learning_rate": 1.9881219291002462e-05, - "loss": 0.9511, - "step": 131 - }, - { - "epoch": 1.5714285714285714, - "grad_norm": 2.745211390170471, - "learning_rate": 1.9878237292368014e-05, - "loss": 0.9219, - "step": 132 - }, - { - "epoch": 1.5833333333333335, - "grad_norm": 6.211027429665557, - "learning_rate": 1.9875218553924357e-05, - "loss": 0.5817, - "step": 133 - }, - { - "epoch": 1.5952380952380953, - "grad_norm": 12.205614792958544, - "learning_rate": 1.9872163086898992e-05, - "loss": 0.2321, - "step": 134 - }, - { - "epoch": 1.6071428571428572, - "grad_norm": 4.263263812234368, - "learning_rate": 1.986907090265602e-05, - "loss": 0.0934, - "step": 135 - }, - { - "epoch": 1.619047619047619, - "grad_norm": 4.372915900877127, - "learning_rate": 1.98659420126961e-05, - "loss": 0.0622, - "step": 136 - }, - { - "epoch": 1.630952380952381, - "grad_norm": 1.6281660636980724, - "learning_rate": 1.9862776428656412e-05, - "loss": 0.9627, - "step": 137 - }, - { - "epoch": 1.6428571428571428, - "grad_norm": 1.466484459747661, - "learning_rate": 1.985957416231061e-05, - "loss": 0.1446, - "step": 138 - }, - { - "epoch": 1.6547619047619047, - "grad_norm": 1.3764747389198715, - "learning_rate": 1.9856335225568778e-05, - "loss": 0.8076, - "step": 139 - }, - { - "epoch": 1.6666666666666665, - "grad_norm": 11.454638511224184, - "learning_rate": 1.9853059630477396e-05, - "loss": 0.7499, - "step": 140 - }, - { - "epoch": 1.6785714285714286, - "grad_norm": 1.3551997471007746, - "learning_rate": 1.9849747389219272e-05, - "loss": 0.8396, - "step": 141 - }, - { - "epoch": 1.6904761904761905, - "grad_norm": 5.589546478044774, - "learning_rate": 1.984639851411352e-05, - "loss": 1.0755, - "step": 142 - }, - { - "epoch": 1.7023809523809523, - "grad_norm": 6.308349485763978, - "learning_rate": 1.9843013017615505e-05, - "loss": 0.599, - "step": 143 - }, - { - "epoch": 1.7142857142857144, - "grad_norm": 3.113963021942783, - "learning_rate": 1.9839590912316794e-05, - "loss": 0.839, - "step": 144 - }, - { - "epoch": 1.7261904761904763, - "grad_norm": 8.55365310087414, - "learning_rate": 1.9836132210945108e-05, - "loss": 0.193, - "step": 145 - }, - { - "epoch": 1.7380952380952381, - "grad_norm": 2.797343688675298, - "learning_rate": 1.9832636926364294e-05, - "loss": 1.2497, - "step": 146 - }, - { - "epoch": 1.75, - "grad_norm": 2.7347494170989566, - "learning_rate": 1.9829105071574243e-05, - "loss": 0.5603, - "step": 147 - }, - { - "epoch": 1.7619047619047619, - "grad_norm": 3.8951141138135257, - "learning_rate": 1.9825536659710866e-05, - "loss": 0.8488, - "step": 148 - }, - { - "epoch": 1.7738095238095237, - "grad_norm": 3.5133582001834225, - "learning_rate": 1.982193170404605e-05, - "loss": 0.8789, - "step": 149 - }, - { - "epoch": 1.7857142857142856, - "grad_norm": 4.91423782997247, - "learning_rate": 1.9818290217987587e-05, - "loss": 0.5037, - "step": 150 - }, - { - "epoch": 1.7976190476190477, - "grad_norm": 2.700610142348175, - "learning_rate": 1.981461221507914e-05, - "loss": 0.4866, - "step": 151 - }, - { - "epoch": 1.8095238095238095, - "grad_norm": 2.03214158550786, - "learning_rate": 1.9810897709000183e-05, - "loss": 0.872, - "step": 152 - }, - { - "epoch": 1.8214285714285714, - "grad_norm": 3.272072664954593, - "learning_rate": 1.9807146713565957e-05, - "loss": 1.2727, - "step": 153 - }, - { - "epoch": 1.8333333333333335, - "grad_norm": 1.581429785806821, - "learning_rate": 1.9803359242727427e-05, - "loss": 0.7566, - "step": 154 - }, - { - "epoch": 1.8452380952380953, - "grad_norm": 9.22961810487271, - "learning_rate": 1.9799535310571205e-05, - "loss": 0.6788, - "step": 155 - }, - { - "epoch": 1.8571428571428572, - "grad_norm": 0.7637949995107111, - "learning_rate": 1.9795674931319515e-05, - "loss": 0.418, - "step": 156 - }, - { - "epoch": 1.869047619047619, - "grad_norm": 1.2819972814953018, - "learning_rate": 1.979177811933015e-05, - "loss": 0.8087, - "step": 157 - }, - { - "epoch": 1.880952380952381, - "grad_norm": 1.6876409305315814, - "learning_rate": 1.978784488909639e-05, - "loss": 0.848, - "step": 158 - }, - { - "epoch": 1.8928571428571428, - "grad_norm": 3.4822248028190512, - "learning_rate": 1.9783875255246972e-05, - "loss": 0.5048, - "step": 159 - }, - { - "epoch": 1.9047619047619047, - "grad_norm": 1.9167539626251773, - "learning_rate": 1.9779869232546033e-05, - "loss": 0.8171, - "step": 160 - }, - { - "epoch": 1.9166666666666665, - "grad_norm": 2.2347320112562006, - "learning_rate": 1.977582683589304e-05, - "loss": 1.2693, - "step": 161 - }, - { - "epoch": 1.9285714285714286, - "grad_norm": 1.2561177879376604, - "learning_rate": 1.9771748080322746e-05, - "loss": 0.3953, - "step": 162 - }, - { - "epoch": 1.9404761904761905, - "grad_norm": 2.0408657215196495, - "learning_rate": 1.9767632981005138e-05, - "loss": 1.1772, - "step": 163 - }, - { - "epoch": 1.9523809523809523, - "grad_norm": 2.233965174646808, - "learning_rate": 1.976348155324537e-05, - "loss": 0.8335, - "step": 164 - }, - { - "epoch": 1.9642857142857144, - "grad_norm": 2.4594307933578605, - "learning_rate": 1.9759293812483717e-05, - "loss": 0.0393, - "step": 165 - }, - { - "epoch": 1.9761904761904763, - "grad_norm": 2.4993255857117513, - "learning_rate": 1.9755069774295502e-05, - "loss": 0.8245, - "step": 166 - }, - { - "epoch": 1.9880952380952381, - "grad_norm": 1.5646336637997305, - "learning_rate": 1.975080945439106e-05, - "loss": 0.4099, - "step": 167 - }, - { - "epoch": 2.0, - "grad_norm": 2.1272339637343025, - "learning_rate": 1.9746512868615656e-05, - "loss": 1.1694, - "step": 168 - }, - { - "epoch": 2.0, - "eval_loss": 0.6088699102401733, - "eval_runtime": 39.1086, - "eval_samples_per_second": 1.534, - "eval_steps_per_second": 1.534, - "step": 168 - }, - { - "epoch": 2.011904761904762, - "grad_norm": 1.0416876526645193, - "learning_rate": 1.974218003294945e-05, - "loss": 0.3786, - "step": 169 - }, - { - "epoch": 2.0238095238095237, - "grad_norm": 2.1337629148828365, - "learning_rate": 1.973781096350741e-05, - "loss": 0.4309, - "step": 170 - }, - { - "epoch": 2.0357142857142856, - "grad_norm": 2.2107303096576625, - "learning_rate": 1.9733405676539283e-05, - "loss": 1.0759, - "step": 171 - }, - { - "epoch": 2.0476190476190474, - "grad_norm": 2.0554217724529966, - "learning_rate": 1.97289641884295e-05, - "loss": 1.0771, - "step": 172 - }, - { - "epoch": 2.0595238095238093, - "grad_norm": 2.1586243563623366, - "learning_rate": 1.9724486515697157e-05, - "loss": 1.1039, - "step": 173 - }, - { - "epoch": 2.0714285714285716, - "grad_norm": 4.584438173048313, - "learning_rate": 1.9719972674995905e-05, - "loss": 0.8035, - "step": 174 - }, - { - "epoch": 2.0833333333333335, - "grad_norm": 1.2828049201231282, - "learning_rate": 1.971542268311394e-05, - "loss": 0.739, - "step": 175 - }, - { - "epoch": 2.0952380952380953, - "grad_norm": 2.5469482800959047, - "learning_rate": 1.9710836556973887e-05, - "loss": 0.4056, - "step": 176 - }, - { - "epoch": 2.107142857142857, - "grad_norm": 3.1810387560100524, - "learning_rate": 1.9706214313632786e-05, - "loss": 0.8045, - "step": 177 - }, - { - "epoch": 2.119047619047619, - "grad_norm": 3.2420213991910956, - "learning_rate": 1.970155597028199e-05, - "loss": 0.3999, - "step": 178 - }, - { - "epoch": 2.130952380952381, - "grad_norm": 1.9699411778652773, - "learning_rate": 1.969686154424713e-05, - "loss": 0.9999, - "step": 179 - }, - { - "epoch": 2.142857142857143, - "grad_norm": 2.627806590925809, - "learning_rate": 1.9692131052988035e-05, - "loss": 0.1001, - "step": 180 - }, - { - "epoch": 2.1547619047619047, - "grad_norm": 3.7819808176512772, - "learning_rate": 1.9687364514098664e-05, - "loss": 0.0601, - "step": 181 - }, - { - "epoch": 2.1666666666666665, - "grad_norm": 2.9442257019853844, - "learning_rate": 1.9682561945307052e-05, - "loss": 0.7165, - "step": 182 - }, - { - "epoch": 2.1785714285714284, - "grad_norm": 2.7812375288605446, - "learning_rate": 1.9677723364475237e-05, - "loss": 0.7077, - "step": 183 - }, - { - "epoch": 2.1904761904761907, - "grad_norm": 2.662139054171453, - "learning_rate": 1.9672848789599204e-05, - "loss": 0.0975, - "step": 184 - }, - { - "epoch": 2.2023809523809526, - "grad_norm": 3.13638542402792, - "learning_rate": 1.9667938238808796e-05, - "loss": 0.1782, - "step": 185 - }, - { - "epoch": 2.2142857142857144, - "grad_norm": 1.533138893606418, - "learning_rate": 1.9662991730367664e-05, - "loss": 0.7253, - "step": 186 - }, - { - "epoch": 2.2261904761904763, - "grad_norm": 1.6326899334327516, - "learning_rate": 1.9658009282673202e-05, - "loss": 0.4389, - "step": 187 - }, - { - "epoch": 2.238095238095238, - "grad_norm": 4.129746342978926, - "learning_rate": 1.9652990914256466e-05, - "loss": 0.0696, - "step": 188 - }, - { - "epoch": 2.25, - "grad_norm": 4.866194278395534, - "learning_rate": 1.964793664378211e-05, - "loss": 0.6483, - "step": 189 - }, - { - "epoch": 2.261904761904762, - "grad_norm": 4.040961944862707, - "learning_rate": 1.964284649004832e-05, - "loss": 0.3702, - "step": 190 - }, - { - "epoch": 2.2738095238095237, - "grad_norm": 5.955269361913289, - "learning_rate": 1.9637720471986736e-05, - "loss": 0.729, - "step": 191 - }, - { - "epoch": 2.2857142857142856, - "grad_norm": 2.416827364213375, - "learning_rate": 1.9632558608662403e-05, - "loss": 1.0398, - "step": 192 - }, - { - "epoch": 2.2976190476190474, - "grad_norm": 4.386696083224722, - "learning_rate": 1.962736091927366e-05, - "loss": 0.0767, - "step": 193 - }, - { - "epoch": 2.3095238095238093, - "grad_norm": 3.5619821484086467, - "learning_rate": 1.9622127423152114e-05, - "loss": 0.4072, - "step": 194 - }, - { - "epoch": 2.3214285714285716, - "grad_norm": 3.265850499691294, - "learning_rate": 1.9616858139762534e-05, - "loss": 0.6642, - "step": 195 - }, - { - "epoch": 2.3333333333333335, - "grad_norm": 2.5415289262555687, - "learning_rate": 1.96115530887028e-05, - "loss": 0.8033, - "step": 196 - }, - { - "epoch": 2.3452380952380953, - "grad_norm": 2.5712529524190577, - "learning_rate": 1.9606212289703813e-05, - "loss": 0.0824, - "step": 197 - }, - { - "epoch": 2.357142857142857, - "grad_norm": 5.474104888697016, - "learning_rate": 1.9600835762629434e-05, - "loss": 0.4142, - "step": 198 - }, - { - "epoch": 2.369047619047619, - "grad_norm": 2.879836797156971, - "learning_rate": 1.9595423527476407e-05, - "loss": 0.086, - "step": 199 - }, - { - "epoch": 2.380952380952381, - "grad_norm": 1.62480689368467, - "learning_rate": 1.9589975604374287e-05, - "loss": 0.6521, - "step": 200 - }, - { - "epoch": 2.392857142857143, - "grad_norm": 4.477628590267687, - "learning_rate": 1.9584492013585358e-05, - "loss": 0.4574, - "step": 201 - }, - { - "epoch": 2.4047619047619047, - "grad_norm": 3.7371630710469907, - "learning_rate": 1.9578972775504556e-05, - "loss": 0.7116, - "step": 202 - }, - { - "epoch": 2.4166666666666665, - "grad_norm": 4.388820750003364, - "learning_rate": 1.9573417910659414e-05, - "loss": 0.8658, - "step": 203 - }, - { - "epoch": 2.4285714285714284, - "grad_norm": 5.264780771949968, - "learning_rate": 1.9567827439709954e-05, - "loss": 0.5218, - "step": 204 - }, - { - "epoch": 2.4404761904761907, - "grad_norm": 2.8263063565249014, - "learning_rate": 1.9562201383448637e-05, - "loss": 0.589, - "step": 205 - }, - { - "epoch": 2.4523809523809526, - "grad_norm": 1.9326495868158482, - "learning_rate": 1.9556539762800276e-05, - "loss": 0.6548, - "step": 206 - }, - { - "epoch": 2.4642857142857144, - "grad_norm": 1.4295352535114343, - "learning_rate": 1.9550842598821954e-05, - "loss": 0.2921, - "step": 207 - }, - { - "epoch": 2.4761904761904763, - "grad_norm": 4.131679625334346, - "learning_rate": 1.954510991270294e-05, - "loss": 0.0552, - "step": 208 - }, - { - "epoch": 2.488095238095238, - "grad_norm": 1.2944968084779456, - "learning_rate": 1.953934172576464e-05, - "loss": 0.5584, - "step": 209 - }, - { - "epoch": 2.5, - "grad_norm": 2.3368325920978656, - "learning_rate": 1.9533538059460475e-05, - "loss": 0.8149, - "step": 210 - }, - { - "epoch": 2.511904761904762, - "grad_norm": 2.3435541020549957, - "learning_rate": 1.952769893537584e-05, - "loss": 0.7674, - "step": 211 - }, - { - "epoch": 2.5238095238095237, - "grad_norm": 1.4945515120342354, - "learning_rate": 1.9521824375228005e-05, - "loss": 0.6053, - "step": 212 - }, - { - "epoch": 2.5357142857142856, - "grad_norm": 1.6994434814237505, - "learning_rate": 1.9515914400866022e-05, - "loss": 0.6523, - "step": 213 - }, - { - "epoch": 2.5476190476190474, - "grad_norm": 1.7583638651152655, - "learning_rate": 1.9509969034270675e-05, - "loss": 0.324, - "step": 214 - }, - { - "epoch": 2.5595238095238093, - "grad_norm": 2.303351195831185, - "learning_rate": 1.950398829755437e-05, - "loss": 0.7802, - "step": 215 - }, - { - "epoch": 2.571428571428571, - "grad_norm": 1.2085607857296046, - "learning_rate": 1.9497972212961073e-05, - "loss": 0.5606, - "step": 216 - }, - { - "epoch": 2.5833333333333335, - "grad_norm": 4.953999376334107, - "learning_rate": 1.9491920802866207e-05, - "loss": 0.6656, - "step": 217 - }, - { - "epoch": 2.5952380952380953, - "grad_norm": 10.462260026499376, - "learning_rate": 1.9485834089776587e-05, - "loss": 0.1941, - "step": 218 - }, - { - "epoch": 2.607142857142857, - "grad_norm": 2.098150876215288, - "learning_rate": 1.9479712096330335e-05, - "loss": 0.371, - "step": 219 - }, - { - "epoch": 2.619047619047619, - "grad_norm": 6.9790355144844805, - "learning_rate": 1.947355484529678e-05, - "loss": 0.3854, - "step": 220 - }, - { - "epoch": 2.630952380952381, - "grad_norm": 4.988557486539209, - "learning_rate": 1.9467362359576388e-05, - "loss": 0.333, - "step": 221 - }, - { - "epoch": 2.642857142857143, - "grad_norm": 7.063238285522866, - "learning_rate": 1.9461134662200667e-05, - "loss": 0.3903, - "step": 222 - }, - { - "epoch": 2.6547619047619047, - "grad_norm": 1.5164015264404487, - "learning_rate": 1.9454871776332096e-05, - "loss": 0.5085, - "step": 223 - }, - { - "epoch": 2.6666666666666665, - "grad_norm": 2.522359534562382, - "learning_rate": 1.944857372526402e-05, - "loss": 0.738, - "step": 224 - }, - { - "epoch": 2.678571428571429, - "grad_norm": 1.555259463726646, - "learning_rate": 1.9442240532420585e-05, - "loss": 0.4806, - "step": 225 - }, - { - "epoch": 2.6904761904761907, - "grad_norm": 7.951689819472121, - "learning_rate": 1.9435872221356623e-05, - "loss": 0.4552, - "step": 226 - }, - { - "epoch": 2.7023809523809526, - "grad_norm": 1.5044684923460068, - "learning_rate": 1.9429468815757587e-05, - "loss": 0.4904, - "step": 227 - }, - { - "epoch": 2.7142857142857144, - "grad_norm": 1.4782862605961744, - "learning_rate": 1.9423030339439464e-05, - "loss": 0.459, - "step": 228 - }, - { - "epoch": 2.7261904761904763, - "grad_norm": 7.30930822807625, - "learning_rate": 1.9416556816348663e-05, - "loss": 0.4524, - "step": 229 - }, - { - "epoch": 2.738095238095238, - "grad_norm": 0.8563665535357093, - "learning_rate": 1.9410048270561958e-05, - "loss": 0.2686, - "step": 230 - }, - { - "epoch": 2.75, - "grad_norm": 6.36705833497127, - "learning_rate": 1.9403504726286367e-05, - "loss": 0.4109, - "step": 231 - }, - { - "epoch": 2.761904761904762, - "grad_norm": 1.6877844007234946, - "learning_rate": 1.9396926207859085e-05, - "loss": 0.0287, - "step": 232 - }, - { - "epoch": 2.7738095238095237, - "grad_norm": 1.5933904005469899, - "learning_rate": 1.9390312739747384e-05, - "loss": 0.25, - "step": 233 - }, - { - "epoch": 2.7857142857142856, - "grad_norm": 3.8409331200672043, - "learning_rate": 1.938366434654852e-05, - "loss": 0.1199, - "step": 234 - }, - { - "epoch": 2.7976190476190474, - "grad_norm": 1.9144885212012308, - "learning_rate": 1.9376981052989655e-05, - "loss": 0.4406, - "step": 235 - }, - { - "epoch": 2.8095238095238093, - "grad_norm": 3.138591834492908, - "learning_rate": 1.9370262883927737e-05, - "loss": 0.3056, - "step": 236 - }, - { - "epoch": 2.821428571428571, - "grad_norm": 1.5020413167589903, - "learning_rate": 1.9363509864349438e-05, - "loss": 0.3429, - "step": 237 - }, - { - "epoch": 2.8333333333333335, - "grad_norm": 3.3495647793942003, - "learning_rate": 1.935672201937105e-05, - "loss": 0.0534, - "step": 238 - }, - { - "epoch": 2.8452380952380953, - "grad_norm": 2.362065083308391, - "learning_rate": 1.9349899374238384e-05, - "loss": 0.5528, - "step": 239 - }, - { - "epoch": 2.857142857142857, - "grad_norm": 2.0160995396477723, - "learning_rate": 1.934304195432668e-05, - "loss": 0.6616, - "step": 240 - }, - { - "epoch": 2.869047619047619, - "grad_norm": 3.3847007434339496, - "learning_rate": 1.9336149785140524e-05, - "loss": 0.0527, - "step": 241 - }, - { - "epoch": 2.880952380952381, - "grad_norm": 2.934474785594088, - "learning_rate": 1.9329222892313738e-05, - "loss": 0.3513, - "step": 242 - }, - { - "epoch": 2.892857142857143, - "grad_norm": 2.1712614824222967, - "learning_rate": 1.9322261301609286e-05, - "loss": 0.0815, - "step": 243 - }, - { - "epoch": 2.9047619047619047, - "grad_norm": 1.421255557551219, - "learning_rate": 1.9315265038919194e-05, - "loss": 0.2604, - "step": 244 - }, - { - "epoch": 2.9166666666666665, - "grad_norm": 2.208193199321934, - "learning_rate": 1.930823413026443e-05, - "loss": 0.4335, - "step": 245 - }, - { - "epoch": 2.928571428571429, - "grad_norm": 1.5103556530084337, - "learning_rate": 1.9301168601794832e-05, - "loss": 0.2859, - "step": 246 - }, - { - "epoch": 2.9404761904761907, - "grad_norm": 2.499861790171546, - "learning_rate": 1.9294068479788987e-05, - "loss": 0.6764, - "step": 247 - }, - { - "epoch": 2.9523809523809526, - "grad_norm": 3.708903007657656, - "learning_rate": 1.928693379065415e-05, - "loss": 0.5288, - "step": 248 - }, - { - "epoch": 2.9642857142857144, - "grad_norm": 22.915584132176974, - "learning_rate": 1.9279764560926142e-05, - "loss": 0.9066, - "step": 249 - }, - { - "epoch": 2.9761904761904763, - "grad_norm": 4.272981133884936, - "learning_rate": 1.927256081726925e-05, - "loss": 0.0868, - "step": 250 - }, - { - "epoch": 2.988095238095238, - "grad_norm": 1.4918399220179503, - "learning_rate": 1.926532258647612e-05, - "loss": 0.4034, - "step": 251 - }, - { - "epoch": 3.0, - "grad_norm": 4.341407962457437, - "learning_rate": 1.9258049895467672e-05, - "loss": 0.5374, - "step": 252 - }, - { - "epoch": 3.0, - "eval_loss": 0.39895328879356384, - "eval_runtime": 38.7955, - "eval_samples_per_second": 1.547, - "eval_steps_per_second": 1.547, - "step": 252 - }, - { - "epoch": 3.011904761904762, - "grad_norm": 2.999227280578599, - "learning_rate": 1.9250742771292993e-05, - "loss": 0.2335, - "step": 253 - }, - { - "epoch": 3.0238095238095237, - "grad_norm": 2.977688909060128, - "learning_rate": 1.9243401241129232e-05, - "loss": 0.3035, - "step": 254 - }, - { - "epoch": 3.0357142857142856, - "grad_norm": 4.856998071383737, - "learning_rate": 1.9236025332281506e-05, - "loss": 0.4267, - "step": 255 - }, - { - "epoch": 3.0476190476190474, - "grad_norm": 3.6416277575589784, - "learning_rate": 1.9228615072182798e-05, - "loss": 0.0806, - "step": 256 - }, - { - "epoch": 3.0595238095238093, - "grad_norm": 3.8536704247985667, - "learning_rate": 1.9221170488393844e-05, - "loss": 0.5701, - "step": 257 - }, - { - "epoch": 3.0714285714285716, - "grad_norm": 5.006059228225849, - "learning_rate": 1.9213691608603046e-05, - "loss": 0.2624, - "step": 258 - }, - { - "epoch": 3.0833333333333335, - "grad_norm": 3.477751917820131, - "learning_rate": 1.920617846062636e-05, - "loss": 0.4209, - "step": 259 - }, - { - "epoch": 3.0952380952380953, - "grad_norm": 7.008269361085479, - "learning_rate": 1.91986310724072e-05, - "loss": 0.5247, - "step": 260 - }, - { - "epoch": 3.107142857142857, - "grad_norm": 2.4584658954882044, - "learning_rate": 1.9191049472016313e-05, - "loss": 0.0346, - "step": 261 - }, - { - "epoch": 3.119047619047619, - "grad_norm": 5.012502345173128, - "learning_rate": 1.918343368765171e-05, - "loss": 0.0991, - "step": 262 - }, - { - "epoch": 3.130952380952381, - "grad_norm": 4.016227848571736, - "learning_rate": 1.917578374763853e-05, - "loss": 0.4093, - "step": 263 - }, - { - "epoch": 3.142857142857143, - "grad_norm": 3.284851594075178, - "learning_rate": 1.9168099680428943e-05, - "loss": 0.4856, - "step": 264 - }, - { - "epoch": 3.1547619047619047, - "grad_norm": 1.439490683728371, - "learning_rate": 1.916038151460206e-05, - "loss": 0.3762, - "step": 265 - }, - { - "epoch": 3.1666666666666665, - "grad_norm": 2.5758200075686895, - "learning_rate": 1.91526292788638e-05, - "loss": 0.3824, - "step": 266 - }, - { - "epoch": 3.1785714285714284, - "grad_norm": 1.5227377147219847, - "learning_rate": 1.9144843002046805e-05, - "loss": 0.3724, - "step": 267 - }, - { - "epoch": 3.1904761904761907, - "grad_norm": 2.607295269246451, - "learning_rate": 1.9137022713110323e-05, - "loss": 0.4212, - "step": 268 - }, - { - "epoch": 3.2023809523809526, - "grad_norm": 4.777066718838521, - "learning_rate": 1.9129168441140104e-05, - "loss": 0.2489, - "step": 269 - }, - { - "epoch": 3.2142857142857144, - "grad_norm": 0.8359102150893967, - "learning_rate": 1.912128021534829e-05, - "loss": 0.1657, - "step": 270 - }, - { - "epoch": 3.2261904761904763, - "grad_norm": 2.594995245986025, - "learning_rate": 1.9113358065073297e-05, - "loss": 0.5461, - "step": 271 - }, - { - "epoch": 3.238095238095238, - "grad_norm": 0.9681291220123174, - "learning_rate": 1.9105402019779728e-05, - "loss": 0.1927, - "step": 272 - }, - { - "epoch": 3.25, - "grad_norm": 1.0698917247005957, - "learning_rate": 1.9097412109058247e-05, - "loss": 0.0175, - "step": 273 - }, - { - "epoch": 3.261904761904762, - "grad_norm": 2.5131905938872694, - "learning_rate": 1.9089388362625468e-05, - "loss": 0.5642, - "step": 274 - }, - { - "epoch": 3.2738095238095237, - "grad_norm": 1.0075136405580187, - "learning_rate": 1.9081330810323852e-05, - "loss": 0.1631, - "step": 275 - }, - { - "epoch": 3.2857142857142856, - "grad_norm": 6.448857884660445, - "learning_rate": 1.9073239482121598e-05, - "loss": 0.4545, - "step": 276 - }, - { - "epoch": 3.2976190476190474, - "grad_norm": 10.404043140853894, - "learning_rate": 1.906511440811252e-05, - "loss": 0.4511, - "step": 277 - }, - { - "epoch": 3.3095238095238093, - "grad_norm": 3.246806863815276, - "learning_rate": 1.9056955618515934e-05, - "loss": 0.2476, - "step": 278 - }, - { - "epoch": 3.3214285714285716, - "grad_norm": 2.289193427727155, - "learning_rate": 1.904876314367658e-05, - "loss": 0.1657, - "step": 279 - }, - { - "epoch": 3.3333333333333335, - "grad_norm": 2.980090661433879, - "learning_rate": 1.904053701406445e-05, - "loss": 0.1611, - "step": 280 - }, - { - "epoch": 3.3452380952380953, - "grad_norm": 7.457260763434641, - "learning_rate": 1.9032277260274733e-05, - "loss": 0.138, - "step": 281 - }, - { - "epoch": 3.357142857142857, - "grad_norm": 2.607895595891462, - "learning_rate": 1.9023983913027655e-05, - "loss": 0.3192, - "step": 282 - }, - { - "epoch": 3.369047619047619, - "grad_norm": 7.092330079559744, - "learning_rate": 1.9015657003168405e-05, - "loss": 0.1357, - "step": 283 - }, - { - "epoch": 3.380952380952381, - "grad_norm": 1.24814877201098, - "learning_rate": 1.9007296561666987e-05, - "loss": 0.2687, - "step": 284 - }, - { - "epoch": 3.392857142857143, - "grad_norm": 1.8637801294405039, - "learning_rate": 1.8998902619618117e-05, - "loss": 0.2858, - "step": 285 - }, - { - "epoch": 3.4047619047619047, - "grad_norm": 4.888040446870592, - "learning_rate": 1.8990475208241114e-05, - "loss": 0.0924, - "step": 286 - }, - { - "epoch": 3.4166666666666665, - "grad_norm": 4.001525244593556, - "learning_rate": 1.898201435887978e-05, - "loss": 0.186, - "step": 287 - }, - { - "epoch": 3.4285714285714284, - "grad_norm": 1.5303656577521532, - "learning_rate": 1.8973520103002278e-05, - "loss": 0.2531, - "step": 288 - }, - { - "epoch": 3.4404761904761907, - "grad_norm": 2.6407383882318136, - "learning_rate": 1.896499247220102e-05, - "loss": 0.0644, - "step": 289 - }, - { - "epoch": 3.4523809523809526, - "grad_norm": 5.013301155952652, - "learning_rate": 1.8956431498192547e-05, - "loss": 0.0803, - "step": 290 - }, - { - "epoch": 3.4642857142857144, - "grad_norm": 2.0289616983127865, - "learning_rate": 1.8947837212817414e-05, - "loss": 0.0326, - "step": 291 - }, - { - "epoch": 3.4761904761904763, - "grad_norm": 2.6152472211362783, - "learning_rate": 1.893920964804007e-05, - "loss": 0.4119, - "step": 292 - }, - { - "epoch": 3.488095238095238, - "grad_norm": 4.4777909223010495, - "learning_rate": 1.8930548835948738e-05, - "loss": 0.3507, - "step": 293 - }, - { - "epoch": 3.5, - "grad_norm": 0.6366794435472163, - "learning_rate": 1.8921854808755295e-05, - "loss": 0.0115, - "step": 294 - }, - { - "epoch": 3.511904761904762, - "grad_norm": 2.4420373988670883, - "learning_rate": 1.8913127598795157e-05, - "loss": 0.4161, - "step": 295 - }, - { - "epoch": 3.5238095238095237, - "grad_norm": 2.4446578383404365, - "learning_rate": 1.8904367238527156e-05, - "loss": 0.1434, - "step": 296 - }, - { - "epoch": 3.5357142857142856, - "grad_norm": 1.9529776383968345, - "learning_rate": 1.8895573760533413e-05, - "loss": 0.2163, - "step": 297 - }, - { - "epoch": 3.5476190476190474, - "grad_norm": 10.168211887481995, - "learning_rate": 1.8886747197519232e-05, - "loss": 0.3446, - "step": 298 - }, - { - "epoch": 3.5595238095238093, - "grad_norm": 0.8708790799659621, - "learning_rate": 1.887788758231296e-05, - "loss": 0.172, - "step": 299 - }, - { - "epoch": 3.571428571428571, - "grad_norm": 1.6300564668273962, - "learning_rate": 1.8868994947865882e-05, - "loss": 0.3293, - "step": 300 - }, - { - "epoch": 3.5833333333333335, - "grad_norm": 1.5955068953197555, - "learning_rate": 1.8860069327252085e-05, - "loss": 0.1722, - "step": 301 - }, - { - "epoch": 3.5952380952380953, - "grad_norm": 3.5420643404756045, - "learning_rate": 1.885111075366834e-05, - "loss": 0.3466, - "step": 302 - }, - { - "epoch": 3.607142857142857, - "grad_norm": 0.8255964441201371, - "learning_rate": 1.8842119260433984e-05, - "loss": 0.1426, - "step": 303 - }, - { - "epoch": 3.619047619047619, - "grad_norm": 2.2416092982597835, - "learning_rate": 1.8833094880990777e-05, - "loss": 0.1176, - "step": 304 - }, - { - "epoch": 3.630952380952381, - "grad_norm": 3.17281147686109, - "learning_rate": 1.882403764890282e-05, - "loss": 0.2363, - "step": 305 - }, - { - "epoch": 3.642857142857143, - "grad_norm": 3.50801962281108, - "learning_rate": 1.8814947597856366e-05, - "loss": 0.1298, - "step": 306 - }, - { - "epoch": 3.6547619047619047, - "grad_norm": 2.015451984498423, - "learning_rate": 1.8805824761659766e-05, - "loss": 0.0503, - "step": 307 - }, - { - "epoch": 3.6666666666666665, - "grad_norm": 1.8138556454595522, - "learning_rate": 1.8796669174243274e-05, - "loss": 0.1708, - "step": 308 - }, - { - "epoch": 3.678571428571429, - "grad_norm": 1.350180232476615, - "learning_rate": 1.878748086965898e-05, - "loss": 0.2102, - "step": 309 - }, - { - "epoch": 3.6904761904761907, - "grad_norm": 5.44990543805186, - "learning_rate": 1.877825988208065e-05, - "loss": 0.1776, - "step": 310 - }, - { - "epoch": 3.7023809523809526, - "grad_norm": 5.429618864013119, - "learning_rate": 1.8769006245803597e-05, - "loss": 0.2871, - "step": 311 - }, - { - "epoch": 3.7142857142857144, - "grad_norm": 9.615841204468909, - "learning_rate": 1.875971999524458e-05, - "loss": 0.4327, - "step": 312 - }, - { - "epoch": 3.7261904761904763, - "grad_norm": 13.832201073491575, - "learning_rate": 1.875040116494165e-05, - "loss": 0.3311, - "step": 313 - }, - { - "epoch": 3.738095238095238, - "grad_norm": 3.1133674287556383, - "learning_rate": 1.874104978955403e-05, - "loss": 0.1555, - "step": 314 - }, - { - "epoch": 3.75, - "grad_norm": 2.347399033576179, - "learning_rate": 1.8731665903861987e-05, - "loss": 0.1674, - "step": 315 - }, - { - "epoch": 3.761904761904762, - "grad_norm": 9.849958200867494, - "learning_rate": 1.8722249542766704e-05, - "loss": 0.1878, - "step": 316 - }, - { - "epoch": 3.7738095238095237, - "grad_norm": 2.435572970484653, - "learning_rate": 1.8712800741290155e-05, - "loss": 0.1737, - "step": 317 - }, - { - "epoch": 3.7857142857142856, - "grad_norm": 1.3696724790752792, - "learning_rate": 1.8703319534574954e-05, - "loss": 0.2522, - "step": 318 - }, - { - "epoch": 3.7976190476190474, - "grad_norm": 2.3593401003654937, - "learning_rate": 1.869380595788426e-05, - "loss": 0.1571, - "step": 319 - }, - { - "epoch": 3.8095238095238093, - "grad_norm": 1.0006750257594192, - "learning_rate": 1.8684260046601595e-05, - "loss": 0.131, - "step": 320 - }, - { - "epoch": 3.821428571428571, - "grad_norm": 8.568117470038834, - "learning_rate": 1.867468183623077e-05, - "loss": 0.2712, - "step": 321 - }, - { - "epoch": 3.8333333333333335, - "grad_norm": 4.128637677242795, - "learning_rate": 1.8665071362395714e-05, - "loss": 0.2944, - "step": 322 - }, - { - "epoch": 3.8452380952380953, - "grad_norm": 8.30896311650967, - "learning_rate": 1.8655428660840344e-05, - "loss": 0.2637, - "step": 323 - }, - { - "epoch": 3.857142857142857, - "grad_norm": 2.837983814094986, - "learning_rate": 1.8645753767428457e-05, - "loss": 0.26, - "step": 324 - }, - { - "epoch": 3.869047619047619, - "grad_norm": 1.4937224948435774, - "learning_rate": 1.863604671814357e-05, - "loss": 0.1746, - "step": 325 - }, - { - "epoch": 3.880952380952381, - "grad_norm": 2.841367725391816, - "learning_rate": 1.8626307549088794e-05, - "loss": 0.2785, - "step": 326 - }, - { - "epoch": 3.892857142857143, - "grad_norm": 5.964953926450381, - "learning_rate": 1.861653629648671e-05, - "loss": 0.2995, - "step": 327 - }, - { - "epoch": 3.9047619047619047, - "grad_norm": 2.3586129170487817, - "learning_rate": 1.8606732996679224e-05, - "loss": 0.1655, - "step": 328 - }, - { - "epoch": 3.9166666666666665, - "grad_norm": 4.983039463607254, - "learning_rate": 1.8596897686127428e-05, - "loss": 0.1406, - "step": 329 - }, - { - "epoch": 3.928571428571429, - "grad_norm": 2.6513317896940753, - "learning_rate": 1.858703040141148e-05, - "loss": 0.2406, - "step": 330 - }, - { - "epoch": 3.9404761904761907, - "grad_norm": 2.4589121027162415, - "learning_rate": 1.8577131179230447e-05, - "loss": 0.263, - "step": 331 - }, - { - "epoch": 3.9523809523809526, - "grad_norm": 5.9418158438751805, - "learning_rate": 1.8567200056402197e-05, - "loss": 0.1849, - "step": 332 - }, - { - "epoch": 3.9642857142857144, - "grad_norm": 2.717129397334188, - "learning_rate": 1.8557237069863224e-05, - "loss": 0.1202, - "step": 333 - }, - { - "epoch": 3.9761904761904763, - "grad_norm": 1.5063133605816246, - "learning_rate": 1.8547242256668546e-05, - "loss": 0.2361, - "step": 334 - }, - { - "epoch": 3.988095238095238, - "grad_norm": 1.8546470741755403, - "learning_rate": 1.8537215653991554e-05, - "loss": 0.1449, - "step": 335 - }, - { - "epoch": 4.0, - "grad_norm": 1.0304658609012838, - "learning_rate": 1.852715729912386e-05, - "loss": 0.1486, - "step": 336 - }, - { - "epoch": 4.0, - "eval_loss": 0.21311835944652557, - "eval_runtime": 38.6432, - "eval_samples_per_second": 1.553, - "eval_steps_per_second": 1.553, - "step": 336 - }, - { - "epoch": 4.011904761904762, - "grad_norm": 2.733522684873606, - "learning_rate": 1.8517067229475184e-05, - "loss": 0.0409, - "step": 337 - }, - { - "epoch": 4.023809523809524, - "grad_norm": 1.8388646319823272, - "learning_rate": 1.8506945482573197e-05, - "loss": 0.1691, - "step": 338 - }, - { - "epoch": 4.035714285714286, - "grad_norm": 1.8958492787854766, - "learning_rate": 1.849679209606338e-05, - "loss": 0.108, - "step": 339 - }, - { - "epoch": 4.0476190476190474, - "grad_norm": 7.540918072075598, - "learning_rate": 1.8486607107708902e-05, - "loss": 0.1848, - "step": 340 - }, - { - "epoch": 4.059523809523809, - "grad_norm": 5.8327310752377155, - "learning_rate": 1.8476390555390458e-05, - "loss": 0.1757, - "step": 341 - }, - { - "epoch": 4.071428571428571, - "grad_norm": 2.37604795133662, - "learning_rate": 1.846614247710614e-05, - "loss": 0.0432, - "step": 342 - }, - { - "epoch": 4.083333333333333, - "grad_norm": 3.956049610570956, - "learning_rate": 1.84558629109713e-05, - "loss": 0.1275, - "step": 343 - }, - { - "epoch": 4.095238095238095, - "grad_norm": 2.2566528797985366, - "learning_rate": 1.8445551895218393e-05, - "loss": 0.3215, - "step": 344 - }, - { - "epoch": 4.107142857142857, - "grad_norm": 5.222667806863405, - "learning_rate": 1.843520946819685e-05, - "loss": 0.0958, - "step": 345 - }, - { - "epoch": 4.119047619047619, - "grad_norm": 1.4292479650404268, - "learning_rate": 1.842483566837292e-05, - "loss": 0.1721, - "step": 346 - }, - { - "epoch": 4.130952380952381, - "grad_norm": 2.0768979787883897, - "learning_rate": 1.8414430534329552e-05, - "loss": 0.1847, - "step": 347 - }, - { - "epoch": 4.142857142857143, - "grad_norm": 2.323248373240265, - "learning_rate": 1.8403994104766214e-05, - "loss": 0.2216, - "step": 348 - }, - { - "epoch": 4.154761904761905, - "grad_norm": 3.092855168358233, - "learning_rate": 1.8393526418498786e-05, - "loss": 0.3547, - "step": 349 - }, - { - "epoch": 4.166666666666667, - "grad_norm": 2.820694205381939, - "learning_rate": 1.8383027514459403e-05, - "loss": 0.0892, - "step": 350 - }, - { - "epoch": 4.178571428571429, - "grad_norm": 5.19609590141203, - "learning_rate": 1.8372497431696287e-05, - "loss": 0.2335, - "step": 351 - }, - { - "epoch": 4.190476190476191, - "grad_norm": 1.8357548516888786, - "learning_rate": 1.8361936209373646e-05, - "loss": 0.1042, - "step": 352 - }, - { - "epoch": 4.2023809523809526, - "grad_norm": 3.143774052079354, - "learning_rate": 1.835134388677149e-05, - "loss": 0.1141, - "step": 353 - }, - { - "epoch": 4.214285714285714, - "grad_norm": 1.3502370439171336, - "learning_rate": 1.83407205032855e-05, - "loss": 0.1851, - "step": 354 - }, - { - "epoch": 4.226190476190476, - "grad_norm": 2.207079287438361, - "learning_rate": 1.833006609842688e-05, - "loss": 0.1518, - "step": 355 - }, - { - "epoch": 4.238095238095238, - "grad_norm": 2.366363737001346, - "learning_rate": 1.8319380711822225e-05, - "loss": 0.2351, - "step": 356 - }, - { - "epoch": 4.25, - "grad_norm": 2.3720004969109003, - "learning_rate": 1.8308664383213343e-05, - "loss": 0.0997, - "step": 357 - }, - { - "epoch": 4.261904761904762, - "grad_norm": 1.6380161132514732, - "learning_rate": 1.8297917152457128e-05, - "loss": 0.1759, - "step": 358 - }, - { - "epoch": 4.273809523809524, - "grad_norm": 3.0271158281302695, - "learning_rate": 1.8287139059525413e-05, - "loss": 0.1826, - "step": 359 - }, - { - "epoch": 4.285714285714286, - "grad_norm": 24.711766628918838, - "learning_rate": 1.8276330144504802e-05, - "loss": 0.3111, - "step": 360 - }, - { - "epoch": 4.2976190476190474, - "grad_norm": 3.8497648128182815, - "learning_rate": 1.8265490447596552e-05, - "loss": 0.1551, - "step": 361 - }, - { - "epoch": 4.309523809523809, - "grad_norm": 1.2126662151093983, - "learning_rate": 1.8254620009116396e-05, - "loss": 0.1191, - "step": 362 - }, - { - "epoch": 4.321428571428571, - "grad_norm": 10.322330853048463, - "learning_rate": 1.824371886949441e-05, - "loss": 0.1496, - "step": 363 - }, - { - "epoch": 4.333333333333333, - "grad_norm": 3.86269481191417, - "learning_rate": 1.823278706927484e-05, - "loss": 0.0617, - "step": 364 - }, - { - "epoch": 4.345238095238095, - "grad_norm": 2.409129687324055, - "learning_rate": 1.8221824649115986e-05, - "loss": 0.1082, - "step": 365 - }, - { - "epoch": 4.357142857142857, - "grad_norm": 6.72802746402091, - "learning_rate": 1.8210831649790016e-05, - "loss": 0.2257, - "step": 366 - }, - { - "epoch": 4.369047619047619, - "grad_norm": 3.286057633238495, - "learning_rate": 1.819980811218285e-05, - "loss": 0.13, - "step": 367 - }, - { - "epoch": 4.380952380952381, - "grad_norm": 2.0442328951186446, - "learning_rate": 1.8188754077293964e-05, - "loss": 0.0307, - "step": 368 - }, - { - "epoch": 4.392857142857143, - "grad_norm": 0.7743849368361936, - "learning_rate": 1.8177669586236276e-05, - "loss": 0.068, - "step": 369 - }, - { - "epoch": 4.404761904761905, - "grad_norm": 1.8828834323895858, - "learning_rate": 1.8166554680235982e-05, - "loss": 0.079, - "step": 370 - }, - { - "epoch": 4.416666666666667, - "grad_norm": 2.035350787727451, - "learning_rate": 1.8155409400632388e-05, - "loss": 0.1643, - "step": 371 - }, - { - "epoch": 4.428571428571429, - "grad_norm": 11.240616104668772, - "learning_rate": 1.814423378887777e-05, - "loss": 0.254, - "step": 372 - }, - { - "epoch": 4.440476190476191, - "grad_norm": 3.3550314941524144, - "learning_rate": 1.8133027886537227e-05, - "loss": 0.1288, - "step": 373 - }, - { - "epoch": 4.4523809523809526, - "grad_norm": 0.8521875859194227, - "learning_rate": 1.8121791735288504e-05, - "loss": 0.0143, - "step": 374 - }, - { - "epoch": 4.464285714285714, - "grad_norm": 0.7346037032427043, - "learning_rate": 1.8110525376921863e-05, - "loss": 0.0699, - "step": 375 - }, - { - "epoch": 4.476190476190476, - "grad_norm": 1.6709740168329212, - "learning_rate": 1.80992288533399e-05, - "loss": 0.125, - "step": 376 - }, - { - "epoch": 4.488095238095238, - "grad_norm": 1.9138466367058902, - "learning_rate": 1.8087902206557413e-05, - "loss": 0.2358, - "step": 377 - }, - { - "epoch": 4.5, - "grad_norm": 0.8814173547123243, - "learning_rate": 1.8076545478701235e-05, - "loss": 0.0804, - "step": 378 - }, - { - "epoch": 4.511904761904762, - "grad_norm": 1.6206474959698678, - "learning_rate": 1.8065158712010074e-05, - "loss": 0.0219, - "step": 379 - }, - { - "epoch": 4.523809523809524, - "grad_norm": 2.3891226185750383, - "learning_rate": 1.8053741948834373e-05, - "loss": 0.0307, - "step": 380 - }, - { - "epoch": 4.535714285714286, - "grad_norm": 7.206521173807237, - "learning_rate": 1.8042295231636116e-05, - "loss": 0.2891, - "step": 381 - }, - { - "epoch": 4.5476190476190474, - "grad_norm": 2.064018339358741, - "learning_rate": 1.803081860298872e-05, - "loss": 0.193, - "step": 382 - }, - { - "epoch": 4.559523809523809, - "grad_norm": 1.0485599754303636, - "learning_rate": 1.801931210557684e-05, - "loss": 0.0177, - "step": 383 - }, - { - "epoch": 4.571428571428571, - "grad_norm": 1.4754707293160358, - "learning_rate": 1.8007775782196216e-05, - "loss": 0.1202, - "step": 384 - }, - { - "epoch": 4.583333333333333, - "grad_norm": 1.9792453212923111, - "learning_rate": 1.7996209675753524e-05, - "loss": 0.091, - "step": 385 - }, - { - "epoch": 4.595238095238095, - "grad_norm": 10.95594851386592, - "learning_rate": 1.798461382926621e-05, - "loss": 0.2459, - "step": 386 - }, - { - "epoch": 4.607142857142857, - "grad_norm": 2.0796924560180905, - "learning_rate": 1.7972988285862337e-05, - "loss": 0.193, - "step": 387 - }, - { - "epoch": 4.619047619047619, - "grad_norm": 1.843392372727349, - "learning_rate": 1.7961333088780404e-05, - "loss": 0.1797, - "step": 388 - }, - { - "epoch": 4.630952380952381, - "grad_norm": 1.5749872984474034, - "learning_rate": 1.794964828136922e-05, - "loss": 0.135, - "step": 389 - }, - { - "epoch": 4.642857142857143, - "grad_norm": 6.267524676104374, - "learning_rate": 1.7937933907087702e-05, - "loss": 0.1105, - "step": 390 - }, - { - "epoch": 4.654761904761905, - "grad_norm": 10.307838193086383, - "learning_rate": 1.792619000950475e-05, - "loss": 0.2006, - "step": 391 - }, - { - "epoch": 4.666666666666667, - "grad_norm": 1.6810655378072146, - "learning_rate": 1.7914416632299066e-05, - "loss": 0.0731, - "step": 392 - }, - { - "epoch": 4.678571428571429, - "grad_norm": 1.60907336690843, - "learning_rate": 1.7902613819258983e-05, - "loss": 0.1373, - "step": 393 - }, - { - "epoch": 4.690476190476191, - "grad_norm": 1.6877915961502108, - "learning_rate": 1.7890781614282333e-05, - "loss": 0.0464, - "step": 394 - }, - { - "epoch": 4.7023809523809526, - "grad_norm": 1.540430941284871, - "learning_rate": 1.787892006137625e-05, - "loss": 0.1439, - "step": 395 - }, - { - "epoch": 4.714285714285714, - "grad_norm": 1.0699166796243846, - "learning_rate": 1.786702920465702e-05, - "loss": 0.1025, - "step": 396 - }, - { - "epoch": 4.726190476190476, - "grad_norm": 3.376295432593385, - "learning_rate": 1.7855109088349927e-05, - "loss": 0.1607, - "step": 397 - }, - { - "epoch": 4.738095238095238, - "grad_norm": 1.4488284754381477, - "learning_rate": 1.7843159756789078e-05, - "loss": 0.137, - "step": 398 - }, - { - "epoch": 4.75, - "grad_norm": 2.3771736790466584, - "learning_rate": 1.783118125441723e-05, - "loss": 0.0942, - "step": 399 - }, - { - "epoch": 4.761904761904762, - "grad_norm": 6.776410312402479, - "learning_rate": 1.7819173625785644e-05, - "loss": 0.1516, - "step": 400 - }, - { - "epoch": 4.773809523809524, - "grad_norm": 1.4340400006613512, - "learning_rate": 1.7807136915553904e-05, - "loss": 0.108, - "step": 401 - }, - { - "epoch": 4.785714285714286, - "grad_norm": 1.160536686716552, - "learning_rate": 1.779507116848976e-05, - "loss": 0.1759, - "step": 402 - }, - { - "epoch": 4.7976190476190474, - "grad_norm": 4.003407777046596, - "learning_rate": 1.7782976429468957e-05, - "loss": 0.1845, - "step": 403 - }, - { - "epoch": 4.809523809523809, - "grad_norm": 1.4582337088270905, - "learning_rate": 1.7770852743475067e-05, - "loss": 0.1125, - "step": 404 - }, - { - "epoch": 4.821428571428571, - "grad_norm": 2.5480560830370718, - "learning_rate": 1.775870015559932e-05, - "loss": 0.1306, - "step": 405 - }, - { - "epoch": 4.833333333333333, - "grad_norm": 7.841245613706642, - "learning_rate": 1.7746518711040445e-05, - "loss": 0.1568, - "step": 406 - }, - { - "epoch": 4.845238095238095, - "grad_norm": 2.036257140830808, - "learning_rate": 1.7734308455104496e-05, - "loss": 0.1595, - "step": 407 - }, - { - "epoch": 4.857142857142857, - "grad_norm": 1.4002028813999992, - "learning_rate": 1.7722069433204687e-05, - "loss": 0.119, - "step": 408 - }, - { - "epoch": 4.869047619047619, - "grad_norm": 2.032627252571408, - "learning_rate": 1.7709801690861214e-05, - "loss": 0.1404, - "step": 409 - }, - { - "epoch": 4.880952380952381, - "grad_norm": 5.533353485198094, - "learning_rate": 1.7697505273701097e-05, - "loss": 0.1607, - "step": 410 - }, - { - "epoch": 4.892857142857143, - "grad_norm": 3.6514331337266337, - "learning_rate": 1.7685180227458003e-05, - "loss": 0.1803, - "step": 411 - }, - { - "epoch": 4.904761904761905, - "grad_norm": 4.680921109039639, - "learning_rate": 1.767282659797208e-05, - "loss": 0.1033, - "step": 412 - }, - { - "epoch": 4.916666666666667, - "grad_norm": 2.621923262817786, - "learning_rate": 1.766044443118978e-05, - "loss": 0.1073, - "step": 413 - }, - { - "epoch": 4.928571428571429, - "grad_norm": 2.0892833469600305, - "learning_rate": 1.7648033773163703e-05, - "loss": 0.1565, - "step": 414 - }, - { - "epoch": 4.940476190476191, - "grad_norm": 1.4629703816596813, - "learning_rate": 1.76355946700524e-05, - "loss": 0.0821, - "step": 415 - }, - { - "epoch": 4.9523809523809526, - "grad_norm": 2.7567759069015794, - "learning_rate": 1.7623127168120232e-05, - "loss": 0.1481, - "step": 416 - }, - { - "epoch": 4.964285714285714, - "grad_norm": 2.8093220612131207, - "learning_rate": 1.7610631313737174e-05, - "loss": 0.0601, - "step": 417 - }, - { - "epoch": 4.976190476190476, - "grad_norm": 1.510038616834676, - "learning_rate": 1.7598107153378655e-05, - "loss": 0.1432, - "step": 418 - }, - { - "epoch": 4.988095238095238, - "grad_norm": 3.6744872821306007, - "learning_rate": 1.7585554733625384e-05, - "loss": 0.0908, - "step": 419 - }, - { - "epoch": 5.0, - "grad_norm": 1.6035869270834606, - "learning_rate": 1.7572974101163166e-05, - "loss": 0.1099, - "step": 420 - }, - { - "epoch": 5.0, - "eval_loss": 0.16849547624588013, - "eval_runtime": 39.0956, - "eval_samples_per_second": 1.535, - "eval_steps_per_second": 1.535, - "step": 420 - }, - { - "epoch": 5.011904761904762, - "grad_norm": 2.0984543226784163, - "learning_rate": 1.7560365302782738e-05, - "loss": 0.1384, - "step": 421 - }, - { - "epoch": 5.023809523809524, - "grad_norm": 1.811823585704519, - "learning_rate": 1.7547728385379606e-05, - "loss": 0.0963, - "step": 422 - }, - { - "epoch": 5.035714285714286, - "grad_norm": 1.0199746089219917, - "learning_rate": 1.753506339595384e-05, - "loss": 0.0982, - "step": 423 - }, - { - "epoch": 5.0476190476190474, - "grad_norm": 1.450450087066948, - "learning_rate": 1.7522370381609937e-05, - "loss": 0.1163, - "step": 424 - }, - { - "epoch": 5.059523809523809, - "grad_norm": 2.188317483580504, - "learning_rate": 1.7509649389556605e-05, - "loss": 0.0727, - "step": 425 - }, - { - "epoch": 5.071428571428571, - "grad_norm": 3.81543761344269, - "learning_rate": 1.7496900467106627e-05, - "loss": 0.1037, - "step": 426 - }, - { - "epoch": 5.083333333333333, - "grad_norm": 3.1538637361704414, - "learning_rate": 1.7484123661676657e-05, - "loss": 0.2181, - "step": 427 - }, - { - "epoch": 5.095238095238095, - "grad_norm": 2.2005038350795028, - "learning_rate": 1.747131902078705e-05, - "loss": 0.1523, - "step": 428 - }, - { - "epoch": 5.107142857142857, - "grad_norm": 2.829273286758714, - "learning_rate": 1.7458486592061702e-05, - "loss": 0.0905, - "step": 429 - }, - { - "epoch": 5.119047619047619, - "grad_norm": 2.768932754858575, - "learning_rate": 1.7445626423227844e-05, - "loss": 0.1626, - "step": 430 - }, - { - "epoch": 5.130952380952381, - "grad_norm": 6.3593130187982245, - "learning_rate": 1.743273856211589e-05, - "loss": 0.1318, - "step": 431 - }, - { - "epoch": 5.142857142857143, - "grad_norm": 2.9334873540908104, - "learning_rate": 1.7419823056659245e-05, - "loss": 0.1621, - "step": 432 - }, - { - "epoch": 5.154761904761905, - "grad_norm": 1.5188109000144108, - "learning_rate": 1.7406879954894133e-05, - "loss": 0.1091, - "step": 433 - }, - { - "epoch": 5.166666666666667, - "grad_norm": 6.8222392966891565, - "learning_rate": 1.7393909304959414e-05, - "loss": 0.1203, - "step": 434 - }, - { - "epoch": 5.178571428571429, - "grad_norm": 2.4224667278776963, - "learning_rate": 1.738091115509641e-05, - "loss": 0.0776, - "step": 435 - }, - { - "epoch": 5.190476190476191, - "grad_norm": 2.5737345276650587, - "learning_rate": 1.736788555364872e-05, - "loss": 0.1427, - "step": 436 - }, - { - "epoch": 5.2023809523809526, - "grad_norm": 1.3719252993722968, - "learning_rate": 1.7354832549062036e-05, - "loss": 0.0647, - "step": 437 - }, - { - "epoch": 5.214285714285714, - "grad_norm": 2.3933922727041805, - "learning_rate": 1.7341752189883983e-05, - "loss": 0.0354, - "step": 438 - }, - { - "epoch": 5.226190476190476, - "grad_norm": 1.5221788352824002, - "learning_rate": 1.732864452476392e-05, - "loss": 0.0865, - "step": 439 - }, - { - "epoch": 5.238095238095238, - "grad_norm": 2.063804997908119, - "learning_rate": 1.731550960245276e-05, - "loss": 0.1055, - "step": 440 - }, - { - "epoch": 5.25, - "grad_norm": 1.5372127294983347, - "learning_rate": 1.7302347471802797e-05, - "loss": 0.0604, - "step": 441 - }, - { - "epoch": 5.261904761904762, - "grad_norm": 2.02376227571018, - "learning_rate": 1.728915818176752e-05, - "loss": 0.0868, - "step": 442 - }, - { - "epoch": 5.273809523809524, - "grad_norm": 1.2042690309607718, - "learning_rate": 1.7275941781401427e-05, - "loss": 0.0806, - "step": 443 - }, - { - "epoch": 5.285714285714286, - "grad_norm": 5.282799497759904, - "learning_rate": 1.726269831985985e-05, - "loss": 0.1004, - "step": 444 - }, - { - "epoch": 5.2976190476190474, - "grad_norm": 1.353248586923984, - "learning_rate": 1.724942784639877e-05, - "loss": 0.0166, - "step": 445 - }, - { - "epoch": 5.309523809523809, - "grad_norm": 1.6233751950823214, - "learning_rate": 1.7236130410374627e-05, - "loss": 0.1031, - "step": 446 - }, - { - "epoch": 5.321428571428571, - "grad_norm": 5.300462784341094, - "learning_rate": 1.7222806061244148e-05, - "loss": 0.0468, - "step": 447 - }, - { - "epoch": 5.333333333333333, - "grad_norm": 3.0190761834590174, - "learning_rate": 1.7209454848564155e-05, - "loss": 0.024, - "step": 448 - }, - { - "epoch": 5.345238095238095, - "grad_norm": 0.7706594623111424, - "learning_rate": 1.7196076821991384e-05, - "loss": 0.06, - "step": 449 - }, - { - "epoch": 5.357142857142857, - "grad_norm": 5.781992041615708, - "learning_rate": 1.7182672031282296e-05, - "loss": 0.1124, - "step": 450 - }, - { - "epoch": 5.369047619047619, - "grad_norm": 4.9262566902719325, - "learning_rate": 1.7169240526292898e-05, - "loss": 0.0687, - "step": 451 - }, - { - "epoch": 5.380952380952381, - "grad_norm": 2.44908284406808, - "learning_rate": 1.715578235697855e-05, - "loss": 0.1093, - "step": 452 - }, - { - "epoch": 5.392857142857143, - "grad_norm": 2.1858738956817505, - "learning_rate": 1.714229757339379e-05, - "loss": 0.0744, - "step": 453 - }, - { - "epoch": 5.404761904761905, - "grad_norm": 8.729299168938855, - "learning_rate": 1.7128786225692135e-05, - "loss": 0.1642, - "step": 454 - }, - { - "epoch": 5.416666666666667, - "grad_norm": 2.537620951119198, - "learning_rate": 1.7115248364125908e-05, - "loss": 0.1596, - "step": 455 - }, - { - "epoch": 5.428571428571429, - "grad_norm": 0.9225751240168636, - "learning_rate": 1.710168403904604e-05, - "loss": 0.0669, - "step": 456 - }, - { - "epoch": 5.440476190476191, - "grad_norm": 1.5986892707523244, - "learning_rate": 1.7088093300901882e-05, - "loss": 0.1076, - "step": 457 - }, - { - "epoch": 5.4523809523809526, - "grad_norm": 7.794554323075924, - "learning_rate": 1.7074476200241037e-05, - "loss": 0.0529, - "step": 458 - }, - { - "epoch": 5.464285714285714, - "grad_norm": 1.0068313155262074, - "learning_rate": 1.706083278770914e-05, - "loss": 0.0633, - "step": 459 - }, - { - "epoch": 5.476190476190476, - "grad_norm": 5.203004572929099, - "learning_rate": 1.7047163114049704e-05, - "loss": 0.1619, - "step": 460 - }, - { - "epoch": 5.488095238095238, - "grad_norm": 4.89789141073733, - "learning_rate": 1.7033467230103895e-05, - "loss": 0.0813, - "step": 461 - }, - { - "epoch": 5.5, - "grad_norm": 1.7611120130870677, - "learning_rate": 1.7019745186810378e-05, - "loss": 0.0284, - "step": 462 - }, - { - "epoch": 5.511904761904762, - "grad_norm": 2.8716423718777406, - "learning_rate": 1.700599703520511e-05, - "loss": 0.107, - "step": 463 - }, - { - "epoch": 5.523809523809524, - "grad_norm": 1.1827701240898811, - "learning_rate": 1.6992222826421136e-05, - "loss": 0.0614, - "step": 464 - }, - { - "epoch": 5.535714285714286, - "grad_norm": 2.711674669785039, - "learning_rate": 1.697842261168843e-05, - "loss": 0.1187, - "step": 465 - }, - { - "epoch": 5.5476190476190474, - "grad_norm": 8.047773822350543, - "learning_rate": 1.6964596442333696e-05, - "loss": 0.1916, - "step": 466 - }, - { - "epoch": 5.559523809523809, - "grad_norm": 16.2071619554794, - "learning_rate": 1.695074436978015e-05, - "loss": 0.1374, - "step": 467 - }, - { - "epoch": 5.571428571428571, - "grad_norm": 3.833330687856342, - "learning_rate": 1.6936866445547354e-05, - "loss": 0.0884, - "step": 468 - }, - { - "epoch": 5.583333333333333, - "grad_norm": 2.0568261929174056, - "learning_rate": 1.6922962721251038e-05, - "loss": 0.1201, - "step": 469 - }, - { - "epoch": 5.595238095238095, - "grad_norm": 3.0077364023768043, - "learning_rate": 1.6909033248602863e-05, - "loss": 0.0437, - "step": 470 - }, - { - "epoch": 5.607142857142857, - "grad_norm": 3.218198456214115, - "learning_rate": 1.689507807941027e-05, - "loss": 0.2151, - "step": 471 - }, - { - "epoch": 5.619047619047619, - "grad_norm": 1.3027730243067113, - "learning_rate": 1.6881097265576273e-05, - "loss": 0.1128, - "step": 472 - }, - { - "epoch": 5.630952380952381, - "grad_norm": 1.7258540427814248, - "learning_rate": 1.6867090859099256e-05, - "loss": 0.0805, - "step": 473 - }, - { - "epoch": 5.642857142857143, - "grad_norm": 0.8805724825057599, - "learning_rate": 1.68530589120728e-05, - "loss": 0.1066, - "step": 474 - }, - { - "epoch": 5.654761904761905, - "grad_norm": 1.9113079355582814, - "learning_rate": 1.6839001476685472e-05, - "loss": 0.1192, - "step": 475 - }, - { - "epoch": 5.666666666666667, - "grad_norm": 11.825279511941737, - "learning_rate": 1.682491860522063e-05, - "loss": 0.3232, - "step": 476 - }, - { - "epoch": 5.678571428571429, - "grad_norm": 1.9412063405946438, - "learning_rate": 1.681081035005626e-05, - "loss": 0.0642, - "step": 477 - }, - { - "epoch": 5.690476190476191, - "grad_norm": 1.9416750977216886, - "learning_rate": 1.6796676763664724e-05, - "loss": 0.0661, - "step": 478 - }, - { - "epoch": 5.7023809523809526, - "grad_norm": 6.736681970474298, - "learning_rate": 1.678251789861262e-05, - "loss": 0.1142, - "step": 479 - }, - { - "epoch": 5.714285714285714, - "grad_norm": 8.988950191247852, - "learning_rate": 1.676833380756056e-05, - "loss": 0.1257, - "step": 480 - }, - { - "epoch": 5.726190476190476, - "grad_norm": 2.6627643546016664, - "learning_rate": 1.6754124543262973e-05, - "loss": 0.0828, - "step": 481 - }, - { - "epoch": 5.738095238095238, - "grad_norm": 1.3707408293314916, - "learning_rate": 1.6739890158567917e-05, - "loss": 0.0432, - "step": 482 - }, - { - "epoch": 5.75, - "grad_norm": 1.203361483911428, - "learning_rate": 1.6725630706416882e-05, - "loss": 0.1027, - "step": 483 - }, - { - "epoch": 5.761904761904762, - "grad_norm": 1.1581274009447258, - "learning_rate": 1.671134623984459e-05, - "loss": 0.0586, - "step": 484 - }, - { - "epoch": 5.773809523809524, - "grad_norm": 2.7846087083215942, - "learning_rate": 1.6697036811978785e-05, - "loss": 0.0946, - "step": 485 - }, - { - "epoch": 5.785714285714286, - "grad_norm": 1.0055933651335667, - "learning_rate": 1.668270247604008e-05, - "loss": 0.0447, - "step": 486 - }, - { - "epoch": 5.7976190476190474, - "grad_norm": 8.469500559608552, - "learning_rate": 1.666834328534169e-05, - "loss": 0.1275, - "step": 487 - }, - { - "epoch": 5.809523809523809, - "grad_norm": 9.65420791197495, - "learning_rate": 1.6653959293289296e-05, - "loss": 0.1675, - "step": 488 - }, - { - "epoch": 5.821428571428571, - "grad_norm": 7.4143605449900605, - "learning_rate": 1.663955055338082e-05, - "loss": 0.175, - "step": 489 - }, - { - "epoch": 5.833333333333333, - "grad_norm": 3.2154530706581355, - "learning_rate": 1.6625117119206217e-05, - "loss": 0.0419, - "step": 490 - }, - { - "epoch": 5.845238095238095, - "grad_norm": 1.1263482849597752, - "learning_rate": 1.6610659044447296e-05, - "loss": 0.0443, - "step": 491 - }, - { - "epoch": 5.857142857142857, - "grad_norm": 3.0300211714461702, - "learning_rate": 1.6596176382877506e-05, - "loss": 0.1838, - "step": 492 - }, - { - "epoch": 5.869047619047619, - "grad_norm": 0.66866959793733, - "learning_rate": 1.658166918836175e-05, - "loss": 0.0541, - "step": 493 - }, - { - "epoch": 5.880952380952381, - "grad_norm": 4.3346266946381045, - "learning_rate": 1.6567137514856157e-05, - "loss": 0.1322, - "step": 494 - }, - { - "epoch": 5.892857142857143, - "grad_norm": 2.134678683799703, - "learning_rate": 1.655258141640792e-05, - "loss": 0.0875, - "step": 495 - }, - { - "epoch": 5.904761904761905, - "grad_norm": 6.126638160009494, - "learning_rate": 1.653800094715506e-05, - "loss": 0.0584, - "step": 496 - }, - { - "epoch": 5.916666666666667, - "grad_norm": 0.7344309410677271, - "learning_rate": 1.652339616132625e-05, - "loss": 0.0467, - "step": 497 - }, - { - "epoch": 5.928571428571429, - "grad_norm": 1.793606831473593, - "learning_rate": 1.65087671132406e-05, - "loss": 0.1195, - "step": 498 - }, - { - "epoch": 5.940476190476191, - "grad_norm": 4.880270027679669, - "learning_rate": 1.6494113857307454e-05, - "loss": 0.1688, - "step": 499 - }, - { - "epoch": 5.9523809523809526, - "grad_norm": 1.700000856351666, - "learning_rate": 1.6479436448026197e-05, - "loss": 0.1244, - "step": 500 - }, - { - "epoch": 5.964285714285714, - "grad_norm": 4.08041629571301, - "learning_rate": 1.646473493998604e-05, - "loss": 0.1081, - "step": 501 - }, - { - "epoch": 5.976190476190476, - "grad_norm": 1.4491524448140567, - "learning_rate": 1.645000938786582e-05, - "loss": 0.1233, - "step": 502 - }, - { - "epoch": 5.988095238095238, - "grad_norm": 0.9253565946532154, - "learning_rate": 1.6435259846433823e-05, - "loss": 0.0225, - "step": 503 - }, - { - "epoch": 6.0, - "grad_norm": 2.772680062939085, - "learning_rate": 1.6420486370547537e-05, - "loss": 0.1832, - "step": 504 - }, - { - "epoch": 6.0, - "eval_loss": 0.14325179159641266, - "eval_runtime": 38.6443, - "eval_samples_per_second": 1.553, - "eval_steps_per_second": 1.553, - "step": 504 - }, - { - "epoch": 6.011904761904762, - "grad_norm": 1.5019024766855187, - "learning_rate": 1.640568901515347e-05, - "loss": 0.106, - "step": 505 - }, - { - "epoch": 6.023809523809524, - "grad_norm": 1.195423816720282, - "learning_rate": 1.6390867835286954e-05, - "loss": 0.0226, - "step": 506 - }, - { - "epoch": 6.035714285714286, - "grad_norm": 2.0934313411664895, - "learning_rate": 1.6376022886071924e-05, - "loss": 0.1394, - "step": 507 - }, - { - "epoch": 6.0476190476190474, - "grad_norm": 0.9134568702424617, - "learning_rate": 1.6361154222720713e-05, - "loss": 0.0738, - "step": 508 - }, - { - "epoch": 6.059523809523809, - "grad_norm": 1.827946060377346, - "learning_rate": 1.634626190053387e-05, - "loss": 0.1004, - "step": 509 - }, - { - "epoch": 6.071428571428571, - "grad_norm": 4.291381510622088, - "learning_rate": 1.6331345974899923e-05, - "loss": 0.1089, - "step": 510 - }, - { - "epoch": 6.083333333333333, - "grad_norm": 1.793625854996647, - "learning_rate": 1.6316406501295198e-05, - "loss": 0.1218, - "step": 511 - }, - { - "epoch": 6.095238095238095, - "grad_norm": 2.706232219127682, - "learning_rate": 1.6301443535283593e-05, - "loss": 0.0865, - "step": 512 - }, - { - "epoch": 6.107142857142857, - "grad_norm": 3.43227842671449, - "learning_rate": 1.6286457132516383e-05, - "loss": 0.069, - "step": 513 - }, - { - "epoch": 6.119047619047619, - "grad_norm": 2.0565239329994323, - "learning_rate": 1.6271447348732023e-05, - "loss": 0.1169, - "step": 514 - }, - { - "epoch": 6.130952380952381, - "grad_norm": 4.730627764396558, - "learning_rate": 1.6256414239755902e-05, - "loss": 0.0821, - "step": 515 - }, - { - "epoch": 6.142857142857143, - "grad_norm": 3.504993607204566, - "learning_rate": 1.6241357861500184e-05, - "loss": 0.0732, - "step": 516 - }, - { - "epoch": 6.154761904761905, - "grad_norm": 1.1924649747006983, - "learning_rate": 1.6226278269963577e-05, - "loss": 0.097, - "step": 517 - }, - { - "epoch": 6.166666666666667, - "grad_norm": 1.1713451235970245, - "learning_rate": 1.621117552123111e-05, - "loss": 0.0721, - "step": 518 - }, - { - "epoch": 6.178571428571429, - "grad_norm": 1.278243637158708, - "learning_rate": 1.6196049671473954e-05, - "loss": 0.0847, - "step": 519 - }, - { - "epoch": 6.190476190476191, - "grad_norm": 1.7037872290115583, - "learning_rate": 1.618090077694919e-05, - "loss": 0.0646, - "step": 520 - }, - { - "epoch": 6.2023809523809526, - "grad_norm": 1.2759918767604022, - "learning_rate": 1.6165728893999616e-05, - "loss": 0.1003, - "step": 521 - }, - { - "epoch": 6.214285714285714, - "grad_norm": 1.5153242877488946, - "learning_rate": 1.6150534079053528e-05, - "loss": 0.0729, - "step": 522 - }, - { - "epoch": 6.226190476190476, - "grad_norm": 2.9422960437536996, - "learning_rate": 1.6135316388624508e-05, - "loss": 0.1581, - "step": 523 - }, - { - "epoch": 6.238095238095238, - "grad_norm": 0.9976260526302768, - "learning_rate": 1.612007587931122e-05, - "loss": 0.0425, - "step": 524 - }, - { - "epoch": 6.25, - "grad_norm": 1.917033625841123, - "learning_rate": 1.6104812607797204e-05, - "loss": 0.0811, - "step": 525 - }, - { - "epoch": 6.261904761904762, - "grad_norm": 1.6978119095102044, - "learning_rate": 1.6089526630850644e-05, - "loss": 0.1054, - "step": 526 - }, - { - "epoch": 6.273809523809524, - "grad_norm": 2.459854386034363, - "learning_rate": 1.607421800532419e-05, - "loss": 0.079, - "step": 527 - }, - { - "epoch": 6.285714285714286, - "grad_norm": 1.7906992011251934, - "learning_rate": 1.6058886788154713e-05, - "loss": 0.0679, - "step": 528 - }, - { - "epoch": 6.2976190476190474, - "grad_norm": 1.051587482130984, - "learning_rate": 1.6043533036363114e-05, - "loss": 0.0465, - "step": 529 - }, - { - "epoch": 6.309523809523809, - "grad_norm": 1.8489956604650926, - "learning_rate": 1.6028156807054113e-05, - "loss": 0.0602, - "step": 530 - }, - { - "epoch": 6.321428571428571, - "grad_norm": 1.365616441791178, - "learning_rate": 1.601275815741602e-05, - "loss": 0.0995, - "step": 531 - }, - { - "epoch": 6.333333333333333, - "grad_norm": 1.4026420836250593, - "learning_rate": 1.5997337144720534e-05, - "loss": 0.0887, - "step": 532 - }, - { - "epoch": 6.345238095238095, - "grad_norm": 1.7760000515082845, - "learning_rate": 1.598189382632253e-05, - "loss": 0.0714, - "step": 533 - }, - { - "epoch": 6.357142857142857, - "grad_norm": 3.151403903780976, - "learning_rate": 1.5966428259659844e-05, - "loss": 0.0232, - "step": 534 - }, - { - "epoch": 6.369047619047619, - "grad_norm": 2.0134779392823083, - "learning_rate": 1.5950940502253064e-05, - "loss": 0.12, - "step": 535 - }, - { - "epoch": 6.380952380952381, - "grad_norm": 1.3996798101959818, - "learning_rate": 1.59354306117053e-05, - "loss": 0.0676, - "step": 536 - }, - { - "epoch": 6.392857142857143, - "grad_norm": 0.983271468293818, - "learning_rate": 1.591989864570199e-05, - "loss": 0.0469, - "step": 537 - }, - { - "epoch": 6.404761904761905, - "grad_norm": 1.5159705113486506, - "learning_rate": 1.5904344662010672e-05, - "loss": 0.0971, - "step": 538 - }, - { - "epoch": 6.416666666666667, - "grad_norm": 1.8392738688203814, - "learning_rate": 1.5888768718480778e-05, - "loss": 0.0924, - "step": 539 - }, - { - "epoch": 6.428571428571429, - "grad_norm": 2.4551157718317986, - "learning_rate": 1.5873170873043412e-05, - "loss": 0.0658, - "step": 540 - }, - { - "epoch": 6.440476190476191, - "grad_norm": 1.5886407912063516, - "learning_rate": 1.5857551183711138e-05, - "loss": 0.0761, - "step": 541 - }, - { - "epoch": 6.4523809523809526, - "grad_norm": 1.4026787489563222, - "learning_rate": 1.584190970857776e-05, - "loss": 0.0649, - "step": 542 - }, - { - "epoch": 6.464285714285714, - "grad_norm": 1.1906948905495682, - "learning_rate": 1.582624650581811e-05, - "loss": 0.0768, - "step": 543 - }, - { - "epoch": 6.476190476190476, - "grad_norm": 1.7857819999749693, - "learning_rate": 1.5810561633687842e-05, - "loss": 0.0269, - "step": 544 - }, - { - "epoch": 6.488095238095238, - "grad_norm": 0.8665778712262205, - "learning_rate": 1.5794855150523183e-05, - "loss": 0.0427, - "step": 545 - }, - { - "epoch": 6.5, - "grad_norm": 0.9975530177048364, - "learning_rate": 1.5779127114740757e-05, - "loss": 0.033, - "step": 546 - }, - { - "epoch": 6.511904761904762, - "grad_norm": 1.3394274426999186, - "learning_rate": 1.5763377584837337e-05, - "loss": 0.0427, - "step": 547 - }, - { - "epoch": 6.523809523809524, - "grad_norm": 1.4827725196076444, - "learning_rate": 1.5747606619389642e-05, - "loss": 0.019, - "step": 548 - }, - { - "epoch": 6.535714285714286, - "grad_norm": 1.2995548812490796, - "learning_rate": 1.5731814277054112e-05, - "loss": 0.09, - "step": 549 - }, - { - "epoch": 6.5476190476190474, - "grad_norm": 0.8657770498669943, - "learning_rate": 1.57160006165667e-05, - "loss": 0.0163, - "step": 550 - }, - { - "epoch": 6.559523809523809, - "grad_norm": 0.7283838543975072, - "learning_rate": 1.570016569674264e-05, - "loss": 0.0568, - "step": 551 - }, - { - "epoch": 6.571428571428571, - "grad_norm": 1.470782870334043, - "learning_rate": 1.5684309576476248e-05, - "loss": 0.0925, - "step": 552 - }, - { - "epoch": 6.583333333333333, - "grad_norm": 1.9078912192128594, - "learning_rate": 1.5668432314740663e-05, - "loss": 0.0939, - "step": 553 - }, - { - "epoch": 6.595238095238095, - "grad_norm": 1.4741198916158764, - "learning_rate": 1.5652533970587688e-05, - "loss": 0.0621, - "step": 554 - }, - { - "epoch": 6.607142857142857, - "grad_norm": 0.8725669529266904, - "learning_rate": 1.5636614603147513e-05, - "loss": 0.0544, - "step": 555 - }, - { - "epoch": 6.619047619047619, - "grad_norm": 0.7432776800153558, - "learning_rate": 1.562067427162853e-05, - "loss": 0.0564, - "step": 556 - }, - { - "epoch": 6.630952380952381, - "grad_norm": 1.4317069015929436, - "learning_rate": 1.56047130353171e-05, - "loss": 0.0167, - "step": 557 - }, - { - "epoch": 6.642857142857143, - "grad_norm": 0.8325527210494423, - "learning_rate": 1.5588730953577336e-05, - "loss": 0.0402, - "step": 558 - }, - { - "epoch": 6.654761904761905, - "grad_norm": 3.2233010428483793, - "learning_rate": 1.5572728085850873e-05, - "loss": 0.1871, - "step": 559 - }, - { - "epoch": 6.666666666666667, - "grad_norm": 1.1007119123597107, - "learning_rate": 1.5556704491656666e-05, - "loss": 0.038, - "step": 560 - }, - { - "epoch": 6.678571428571429, - "grad_norm": 0.3049436586601034, - "learning_rate": 1.554066023059075e-05, - "loss": 0.0043, - "step": 561 - }, - { - "epoch": 6.690476190476191, - "grad_norm": 0.6116647914325464, - "learning_rate": 1.5524595362326028e-05, - "loss": 0.0448, - "step": 562 - }, - { - "epoch": 6.7023809523809526, - "grad_norm": 2.566952968870286, - "learning_rate": 1.5508509946612045e-05, - "loss": 0.1348, - "step": 563 - }, - { - "epoch": 6.714285714285714, - "grad_norm": 1.626464079459653, - "learning_rate": 1.549240404327477e-05, - "loss": 0.0739, - "step": 564 - }, - { - "epoch": 6.726190476190476, - "grad_norm": 1.301521334500284, - "learning_rate": 1.5476277712216364e-05, - "loss": 0.0583, - "step": 565 - }, - { - "epoch": 6.738095238095238, - "grad_norm": 0.8296339701668457, - "learning_rate": 1.546013101341498e-05, - "loss": 0.0125, - "step": 566 - }, - { - "epoch": 6.75, - "grad_norm": 1.9162341614965561, - "learning_rate": 1.544396400692451e-05, - "loss": 0.0764, - "step": 567 - }, - { - "epoch": 6.761904761904762, - "grad_norm": 0.8294531297211747, - "learning_rate": 1.5427776752874372e-05, - "loss": 0.0527, - "step": 568 - }, - { - "epoch": 6.773809523809524, - "grad_norm": 1.637333814661965, - "learning_rate": 1.5411569311469307e-05, - "loss": 0.0701, - "step": 569 - }, - { - "epoch": 6.785714285714286, - "grad_norm": 0.7654549021641782, - "learning_rate": 1.5395341742989126e-05, - "loss": 0.0392, - "step": 570 - }, - { - "epoch": 6.7976190476190474, - "grad_norm": 4.113607635248581, - "learning_rate": 1.53790941077885e-05, - "loss": 0.1606, - "step": 571 - }, - { - "epoch": 6.809523809523809, - "grad_norm": 2.894466788003413, - "learning_rate": 1.5362826466296736e-05, - "loss": 0.1243, - "step": 572 - }, - { - "epoch": 6.821428571428571, - "grad_norm": 1.681756853880995, - "learning_rate": 1.534653887901754e-05, - "loss": 0.0906, - "step": 573 - }, - { - "epoch": 6.833333333333333, - "grad_norm": 7.226542543292303, - "learning_rate": 1.533023140652882e-05, - "loss": 0.0968, - "step": 574 - }, - { - "epoch": 6.845238095238095, - "grad_norm": 2.7169022930643427, - "learning_rate": 1.531390410948243e-05, - "loss": 0.0198, - "step": 575 - }, - { - "epoch": 6.857142857142857, - "grad_norm": 5.199337298379754, - "learning_rate": 1.5297557048603963e-05, - "loss": 0.0724, - "step": 576 - }, - { - "epoch": 6.869047619047619, - "grad_norm": 1.4573244826016143, - "learning_rate": 1.52811902846925e-05, - "loss": 0.0829, - "step": 577 - }, - { - "epoch": 6.880952380952381, - "grad_norm": 1.8919827827452864, - "learning_rate": 1.526480387862043e-05, - "loss": 0.086, - "step": 578 - }, - { - "epoch": 6.892857142857143, - "grad_norm": 2.8140927699375267, - "learning_rate": 1.5248397891333183e-05, - "loss": 0.1588, - "step": 579 - }, - { - "epoch": 6.904761904761905, - "grad_norm": 2.0967385770460782, - "learning_rate": 1.5231972383849019e-05, - "loss": 0.061, - "step": 580 - }, - { - "epoch": 6.916666666666667, - "grad_norm": 2.8660947565892383, - "learning_rate": 1.5215527417258793e-05, - "loss": 0.1155, - "step": 581 - }, - { - "epoch": 6.928571428571429, - "grad_norm": 1.2394198268814334, - "learning_rate": 1.5199063052725746e-05, - "loss": 0.047, - "step": 582 - }, - { - "epoch": 6.940476190476191, - "grad_norm": 1.7366594536677353, - "learning_rate": 1.518257935148525e-05, - "loss": 0.1103, - "step": 583 - }, - { - "epoch": 6.9523809523809526, - "grad_norm": 1.4905106038915912, - "learning_rate": 1.5166076374844605e-05, - "loss": 0.1084, - "step": 584 - }, - { - "epoch": 6.964285714285714, - "grad_norm": 1.7466305557853612, - "learning_rate": 1.5149554184182802e-05, - "loss": 0.0601, - "step": 585 - }, - { - "epoch": 6.976190476190476, - "grad_norm": 2.2407234722112332, - "learning_rate": 1.5133012840950292e-05, - "loss": 0.1101, - "step": 586 - }, - { - "epoch": 6.988095238095238, - "grad_norm": 0.9285094911344285, - "learning_rate": 1.5116452406668758e-05, - "loss": 0.0419, - "step": 587 - }, - { - "epoch": 7.0, - "grad_norm": 1.836337641554133, - "learning_rate": 1.5099872942930886e-05, - "loss": 0.0258, - "step": 588 - }, - { - "epoch": 7.0, - "eval_loss": 0.14865067601203918, - "eval_runtime": 43.1081, - "eval_samples_per_second": 1.392, - "eval_steps_per_second": 1.392, - "step": 588 - }, - { - "epoch": 7.011904761904762, - "grad_norm": 1.2568935563245456, - "learning_rate": 1.5083274511400144e-05, - "loss": 0.0558, - "step": 589 - }, - { - "epoch": 7.023809523809524, - "grad_norm": 6.801089494891712, - "learning_rate": 1.5066657173810542e-05, - "loss": 0.1197, - "step": 590 - }, - { - "epoch": 7.035714285714286, - "grad_norm": 5.569312791725525, - "learning_rate": 1.5050020991966405e-05, - "loss": 0.0738, - "step": 591 - }, - { - "epoch": 7.0476190476190474, - "grad_norm": 2.759979048006171, - "learning_rate": 1.5033366027742156e-05, - "loss": 0.0367, - "step": 592 - }, - { - "epoch": 7.059523809523809, - "grad_norm": 1.6545276645155973, - "learning_rate": 1.5016692343082053e-05, - "loss": 0.0814, - "step": 593 - }, - { - "epoch": 7.071428571428571, - "grad_norm": 2.1169578211888727, - "learning_rate": 1.5000000000000002e-05, - "loss": 0.0903, - "step": 594 - }, - { - "epoch": 7.083333333333333, - "grad_norm": 1.829536723099167, - "learning_rate": 1.4983289060579294e-05, - "loss": 0.0679, - "step": 595 - }, - { - "epoch": 7.095238095238095, - "grad_norm": 2.063531967019674, - "learning_rate": 1.4966559586972387e-05, - "loss": 0.0285, - "step": 596 - }, - { - "epoch": 7.107142857142857, - "grad_norm": 0.9923944509007061, - "learning_rate": 1.4949811641400668e-05, - "loss": 0.0364, - "step": 597 - }, - { - "epoch": 7.119047619047619, - "grad_norm": 0.9674661161779017, - "learning_rate": 1.4933045286154242e-05, - "loss": 0.0599, - "step": 598 - }, - { - "epoch": 7.130952380952381, - "grad_norm": 2.761259352030199, - "learning_rate": 1.4916260583591659e-05, - "loss": 0.0284, - "step": 599 - }, - { - "epoch": 7.142857142857143, - "grad_norm": 3.4877717256597296, - "learning_rate": 1.4899457596139728e-05, - "loss": 0.1466, - "step": 600 - }, - { - "epoch": 7.154761904761905, - "grad_norm": 1.8885548051230485, - "learning_rate": 1.488263638629326e-05, - "loss": 0.0611, - "step": 601 - }, - { - "epoch": 7.166666666666667, - "grad_norm": 3.5085551469483733, - "learning_rate": 1.4865797016614839e-05, - "loss": 0.0335, - "step": 602 - }, - { - "epoch": 7.178571428571429, - "grad_norm": 2.1317182396352514, - "learning_rate": 1.4848939549734582e-05, - "loss": 0.1127, - "step": 603 - }, - { - "epoch": 7.190476190476191, - "grad_norm": 2.4596429083047906, - "learning_rate": 1.4832064048349928e-05, - "loss": 0.0533, - "step": 604 - }, - { - "epoch": 7.2023809523809526, - "grad_norm": 4.555208354333248, - "learning_rate": 1.4815170575225383e-05, - "loss": 0.0496, - "step": 605 - }, - { - "epoch": 7.214285714285714, - "grad_norm": 2.557372985143864, - "learning_rate": 1.4798259193192298e-05, - "loss": 0.1063, - "step": 606 - }, - { - "epoch": 7.226190476190476, - "grad_norm": 1.949653943995653, - "learning_rate": 1.4781329965148625e-05, - "loss": 0.0531, - "step": 607 - }, - { - "epoch": 7.238095238095238, - "grad_norm": 1.935893865742182, - "learning_rate": 1.47643829540587e-05, - "loss": 0.0963, - "step": 608 - }, - { - "epoch": 7.25, - "grad_norm": 1.7142332761042414, - "learning_rate": 1.4747418222952997e-05, - "loss": 0.0815, - "step": 609 - }, - { - "epoch": 7.261904761904762, - "grad_norm": 1.8054607129649187, - "learning_rate": 1.4730435834927885e-05, - "loss": 0.0824, - "step": 610 - }, - { - "epoch": 7.273809523809524, - "grad_norm": 1.3720448261221745, - "learning_rate": 1.471343585314542e-05, - "loss": 0.0772, - "step": 611 - }, - { - "epoch": 7.285714285714286, - "grad_norm": 1.565400646280693, - "learning_rate": 1.4696418340833081e-05, - "loss": 0.0704, - "step": 612 - }, - { - "epoch": 7.2976190476190474, - "grad_norm": 1.2730729423352545, - "learning_rate": 1.4679383361283554e-05, - "loss": 0.0519, - "step": 613 - }, - { - "epoch": 7.309523809523809, - "grad_norm": 2.1497377325557685, - "learning_rate": 1.4662330977854488e-05, - "loss": 0.0929, - "step": 614 - }, - { - "epoch": 7.321428571428571, - "grad_norm": 1.2135536719810307, - "learning_rate": 1.4645261253968262e-05, - "loss": 0.0136, - "step": 615 - }, - { - "epoch": 7.333333333333333, - "grad_norm": 1.9076749429523272, - "learning_rate": 1.4628174253111751e-05, - "loss": 0.0589, - "step": 616 - }, - { - "epoch": 7.345238095238095, - "grad_norm": 1.865336445497231, - "learning_rate": 1.4611070038836084e-05, - "loss": 0.1012, - "step": 617 - }, - { - "epoch": 7.357142857142857, - "grad_norm": 1.4531772649657178, - "learning_rate": 1.4593948674756418e-05, - "loss": 0.0185, - "step": 618 - }, - { - "epoch": 7.369047619047619, - "grad_norm": 1.3967163423107203, - "learning_rate": 1.4576810224551683e-05, - "loss": 0.0679, - "step": 619 - }, - { - "epoch": 7.380952380952381, - "grad_norm": 1.2127139438665828, - "learning_rate": 1.4559654751964364e-05, - "loss": 0.0585, - "step": 620 - }, - { - "epoch": 7.392857142857143, - "grad_norm": 0.8916239980755017, - "learning_rate": 1.4542482320800264e-05, - "loss": 0.0366, - "step": 621 - }, - { - "epoch": 7.404761904761905, - "grad_norm": 0.9741419411575285, - "learning_rate": 1.4525292994928247e-05, - "loss": 0.0559, - "step": 622 - }, - { - "epoch": 7.416666666666667, - "grad_norm": 1.0000739028449486, - "learning_rate": 1.4508086838280018e-05, - "loss": 0.0398, - "step": 623 - }, - { - "epoch": 7.428571428571429, - "grad_norm": 1.8740728890384972, - "learning_rate": 1.449086391484988e-05, - "loss": 0.0778, - "step": 624 - }, - { - "epoch": 7.440476190476191, - "grad_norm": 1.7375565170529184, - "learning_rate": 1.4473624288694499e-05, - "loss": 0.0827, - "step": 625 - }, - { - "epoch": 7.4523809523809526, - "grad_norm": 2.0591771711864473, - "learning_rate": 1.4456368023932657e-05, - "loss": 0.0797, - "step": 626 - }, - { - "epoch": 7.464285714285714, - "grad_norm": 3.089641917565682, - "learning_rate": 1.4439095184745025e-05, - "loss": 0.14, - "step": 627 - }, - { - "epoch": 7.476190476190476, - "grad_norm": 2.1378985179166894, - "learning_rate": 1.4421805835373917e-05, - "loss": 0.0224, - "step": 628 - }, - { - "epoch": 7.488095238095238, - "grad_norm": 2.8192981571711293, - "learning_rate": 1.4404500040123049e-05, - "loss": 0.1095, - "step": 629 - }, - { - "epoch": 7.5, - "grad_norm": 1.9613857654437483, - "learning_rate": 1.4387177863357307e-05, - "loss": 0.0938, - "step": 630 - }, - { - "epoch": 7.511904761904762, - "grad_norm": 1.3072569940934167, - "learning_rate": 1.4369839369502507e-05, - "loss": 0.0561, - "step": 631 - }, - { - "epoch": 7.523809523809524, - "grad_norm": 4.300152949971368, - "learning_rate": 1.4352484623045149e-05, - "loss": 0.0399, - "step": 632 - }, - { - "epoch": 7.535714285714286, - "grad_norm": 1.4294772622134482, - "learning_rate": 1.4335113688532183e-05, - "loss": 0.0449, - "step": 633 - }, - { - "epoch": 7.5476190476190474, - "grad_norm": 2.483155533373781, - "learning_rate": 1.431772663057076e-05, - "loss": 0.1037, - "step": 634 - }, - { - "epoch": 7.559523809523809, - "grad_norm": 1.2006274544896178, - "learning_rate": 1.4300323513828008e-05, - "loss": 0.04, - "step": 635 - }, - { - "epoch": 7.571428571428571, - "grad_norm": 1.3950943162774636, - "learning_rate": 1.4282904403030773e-05, - "loss": 0.0362, - "step": 636 - }, - { - "epoch": 7.583333333333333, - "grad_norm": 1.8558101279081418, - "learning_rate": 1.4265469362965398e-05, - "loss": 0.0518, - "step": 637 - }, - { - "epoch": 7.595238095238095, - "grad_norm": 1.6872639221568726, - "learning_rate": 1.4248018458477464e-05, - "loss": 0.064, - "step": 638 - }, - { - "epoch": 7.607142857142857, - "grad_norm": 1.6544975886322297, - "learning_rate": 1.4230551754471553e-05, - "loss": 0.0561, - "step": 639 - }, - { - "epoch": 7.619047619047619, - "grad_norm": 2.6900047123006035, - "learning_rate": 1.4213069315911014e-05, - "loss": 0.1032, - "step": 640 - }, - { - "epoch": 7.630952380952381, - "grad_norm": 1.4255009879275804, - "learning_rate": 1.419557120781772e-05, - "loss": 0.0729, - "step": 641 - }, - { - "epoch": 7.642857142857143, - "grad_norm": 1.2946732289623608, - "learning_rate": 1.4178057495271816e-05, - "loss": 0.0332, - "step": 642 - }, - { - "epoch": 7.654761904761905, - "grad_norm": 1.397832155407573, - "learning_rate": 1.4160528243411493e-05, - "loss": 0.057, - "step": 643 - }, - { - "epoch": 7.666666666666667, - "grad_norm": 2.258767217854666, - "learning_rate": 1.4142983517432724e-05, - "loss": 0.0877, - "step": 644 - }, - { - "epoch": 7.678571428571429, - "grad_norm": 1.734061257783898, - "learning_rate": 1.412542338258905e-05, - "loss": 0.0629, - "step": 645 - }, - { - "epoch": 7.690476190476191, - "grad_norm": 1.6703985042648415, - "learning_rate": 1.4107847904191309e-05, - "loss": 0.0752, - "step": 646 - }, - { - "epoch": 7.7023809523809526, - "grad_norm": 2.6765274588483208, - "learning_rate": 1.4090257147607414e-05, - "loss": 0.0718, - "step": 647 - }, - { - "epoch": 7.714285714285714, - "grad_norm": 2.2666127426059997, - "learning_rate": 1.4072651178262096e-05, - "loss": 0.0689, - "step": 648 - }, - { - "epoch": 7.726190476190476, - "grad_norm": 3.028603990936858, - "learning_rate": 1.405503006163667e-05, - "loss": 0.0728, - "step": 649 - }, - { - "epoch": 7.738095238095238, - "grad_norm": 1.6601742539557618, - "learning_rate": 1.4037393863268785e-05, - "loss": 0.0755, - "step": 650 - }, - { - "epoch": 7.75, - "grad_norm": 3.6530760121468, - "learning_rate": 1.4019742648752184e-05, - "loss": 0.06, - "step": 651 - }, - { - "epoch": 7.761904761904762, - "grad_norm": 1.909724163901433, - "learning_rate": 1.400207648373646e-05, - "loss": 0.0795, - "step": 652 - }, - { - "epoch": 7.773809523809524, - "grad_norm": 1.5217184466934186, - "learning_rate": 1.3984395433926816e-05, - "loss": 0.0647, - "step": 653 - }, - { - "epoch": 7.785714285714286, - "grad_norm": 2.349579191021632, - "learning_rate": 1.3966699565083803e-05, - "loss": 0.02, - "step": 654 - }, - { - "epoch": 7.7976190476190474, - "grad_norm": 1.5403983446470608, - "learning_rate": 1.3948988943023097e-05, - "loss": 0.0735, - "step": 655 - }, - { - "epoch": 7.809523809523809, - "grad_norm": 1.7826487054141489, - "learning_rate": 1.3931263633615241e-05, - "loss": 0.0704, - "step": 656 - }, - { - "epoch": 7.821428571428571, - "grad_norm": 1.6670060973825707, - "learning_rate": 1.3913523702785413e-05, - "loss": 0.0646, - "step": 657 - }, - { - "epoch": 7.833333333333333, - "grad_norm": 1.1293411386334704, - "learning_rate": 1.3895769216513158e-05, - "loss": 0.0122, - "step": 658 - }, - { - "epoch": 7.845238095238095, - "grad_norm": 0.8329512938247684, - "learning_rate": 1.3878000240832168e-05, - "loss": 0.0433, - "step": 659 - }, - { - "epoch": 7.857142857142857, - "grad_norm": 2.686692661037454, - "learning_rate": 1.3860216841830019e-05, - "loss": 0.0984, - "step": 660 - }, - { - "epoch": 7.869047619047619, - "grad_norm": 1.2301440232404512, - "learning_rate": 1.3842419085647933e-05, - "loss": 0.0417, - "step": 661 - }, - { - "epoch": 7.880952380952381, - "grad_norm": 2.0183227698474333, - "learning_rate": 1.3824607038480533e-05, - "loss": 0.0723, - "step": 662 - }, - { - "epoch": 7.892857142857143, - "grad_norm": 2.6389098823393358, - "learning_rate": 1.380678076657559e-05, - "loss": 0.09, - "step": 663 - }, - { - "epoch": 7.904761904761905, - "grad_norm": 3.4549373370939978, - "learning_rate": 1.3788940336233781e-05, - "loss": 0.1292, - "step": 664 - }, - { - "epoch": 7.916666666666667, - "grad_norm": 2.2667693069829227, - "learning_rate": 1.3771085813808442e-05, - "loss": 0.0828, - "step": 665 - }, - { - "epoch": 7.928571428571429, - "grad_norm": 0.47972584449219374, - "learning_rate": 1.3753217265705324e-05, - "loss": 0.0053, - "step": 666 - }, - { - "epoch": 7.940476190476191, - "grad_norm": 1.75002178495459, - "learning_rate": 1.3735334758382338e-05, - "loss": 0.04, - "step": 667 - }, - { - "epoch": 7.9523809523809526, - "grad_norm": 1.814029715731369, - "learning_rate": 1.371743835834932e-05, - "loss": 0.0647, - "step": 668 - }, - { - "epoch": 7.964285714285714, - "grad_norm": 1.7511940468473046, - "learning_rate": 1.3699528132167777e-05, - "loss": 0.0169, - "step": 669 - }, - { - "epoch": 7.976190476190476, - "grad_norm": 2.1541073840936753, - "learning_rate": 1.3681604146450625e-05, - "loss": 0.0931, - "step": 670 - }, - { - "epoch": 7.988095238095238, - "grad_norm": 2.3197341635579716, - "learning_rate": 1.3663666467861972e-05, - "loss": 0.0743, - "step": 671 - }, - { - "epoch": 8.0, - "grad_norm": 2.5918283864996052, - "learning_rate": 1.3645715163116846e-05, - "loss": 0.0388, - "step": 672 - }, - { - "epoch": 8.0, - "eval_loss": 0.11669167876243591, - "eval_runtime": 40.8048, - "eval_samples_per_second": 1.47, - "eval_steps_per_second": 1.47, - "step": 672 - }, - { - "epoch": 8.011904761904763, - "grad_norm": 1.8156831505652815, - "learning_rate": 1.362775029898096e-05, - "loss": 0.0193, - "step": 673 - }, - { - "epoch": 8.023809523809524, - "grad_norm": 1.014332919297201, - "learning_rate": 1.3609771942270444e-05, - "loss": 0.0618, - "step": 674 - }, - { - "epoch": 8.035714285714286, - "grad_norm": 2.740643902996938, - "learning_rate": 1.3591780159851629e-05, - "loss": 0.084, - "step": 675 - }, - { - "epoch": 8.047619047619047, - "grad_norm": 1.2215784460103545, - "learning_rate": 1.3573775018640765e-05, - "loss": 0.0265, - "step": 676 - }, - { - "epoch": 8.05952380952381, - "grad_norm": 1.2814221356115454, - "learning_rate": 1.3555756585603795e-05, - "loss": 0.0265, - "step": 677 - }, - { - "epoch": 8.071428571428571, - "grad_norm": 2.498831848618261, - "learning_rate": 1.3537724927756095e-05, - "loss": 0.0489, - "step": 678 - }, - { - "epoch": 8.083333333333334, - "grad_norm": 2.0916980358289528, - "learning_rate": 1.351968011216223e-05, - "loss": 0.0194, - "step": 679 - }, - { - "epoch": 8.095238095238095, - "grad_norm": 1.8436837623551254, - "learning_rate": 1.3501622205935698e-05, - "loss": 0.0624, - "step": 680 - }, - { - "epoch": 8.107142857142858, - "grad_norm": 2.998109666966864, - "learning_rate": 1.348355127623869e-05, - "loss": 0.1059, - "step": 681 - }, - { - "epoch": 8.119047619047619, - "grad_norm": 0.9572390376576435, - "learning_rate": 1.3465467390281826e-05, - "loss": 0.0254, - "step": 682 - }, - { - "epoch": 8.130952380952381, - "grad_norm": 1.1446640882773418, - "learning_rate": 1.3447370615323924e-05, - "loss": 0.0321, - "step": 683 - }, - { - "epoch": 8.142857142857142, - "grad_norm": 0.9314532514923329, - "learning_rate": 1.3429261018671735e-05, - "loss": 0.0335, - "step": 684 - }, - { - "epoch": 8.154761904761905, - "grad_norm": 0.9099154090085874, - "learning_rate": 1.3411138667679697e-05, - "loss": 0.0275, - "step": 685 - }, - { - "epoch": 8.166666666666666, - "grad_norm": 2.814442692268865, - "learning_rate": 1.3393003629749684e-05, - "loss": 0.0574, - "step": 686 - }, - { - "epoch": 8.178571428571429, - "grad_norm": 1.2925967965525935, - "learning_rate": 1.3374855972330758e-05, - "loss": 0.0318, - "step": 687 - }, - { - "epoch": 8.19047619047619, - "grad_norm": 4.6036673705862, - "learning_rate": 1.3356695762918915e-05, - "loss": 0.1407, - "step": 688 - }, - { - "epoch": 8.202380952380953, - "grad_norm": 2.3563878485370484, - "learning_rate": 1.3338523069056838e-05, - "loss": 0.0429, - "step": 689 - }, - { - "epoch": 8.214285714285714, - "grad_norm": 3.740777621839342, - "learning_rate": 1.332033795833364e-05, - "loss": 0.0477, - "step": 690 - }, - { - "epoch": 8.226190476190476, - "grad_norm": 4.188291251339514, - "learning_rate": 1.3302140498384617e-05, - "loss": 0.0449, - "step": 691 - }, - { - "epoch": 8.238095238095237, - "grad_norm": 1.884707086232278, - "learning_rate": 1.328393075689099e-05, - "loss": 0.0482, - "step": 692 - }, - { - "epoch": 8.25, - "grad_norm": 1.3968846684263698, - "learning_rate": 1.326570880157967e-05, - "loss": 0.0301, - "step": 693 - }, - { - "epoch": 8.261904761904763, - "grad_norm": 1.905739875764223, - "learning_rate": 1.3247474700222981e-05, - "loss": 0.0698, - "step": 694 - }, - { - "epoch": 8.273809523809524, - "grad_norm": 0.8840377855273591, - "learning_rate": 1.3229228520638435e-05, - "loss": 0.0482, - "step": 695 - }, - { - "epoch": 8.285714285714286, - "grad_norm": 1.681936237192879, - "learning_rate": 1.3210970330688453e-05, - "loss": 0.0501, - "step": 696 - }, - { - "epoch": 8.297619047619047, - "grad_norm": 3.012072033315285, - "learning_rate": 1.319270019828013e-05, - "loss": 0.0741, - "step": 697 - }, - { - "epoch": 8.30952380952381, - "grad_norm": 1.374140431461223, - "learning_rate": 1.3174418191364988e-05, - "loss": 0.038, - "step": 698 - }, - { - "epoch": 8.321428571428571, - "grad_norm": 1.810238997618556, - "learning_rate": 1.31561243779387e-05, - "loss": 0.0669, - "step": 699 - }, - { - "epoch": 8.333333333333334, - "grad_norm": 1.8326667848989575, - "learning_rate": 1.3137818826040856e-05, - "loss": 0.0763, - "step": 700 - }, - { - "epoch": 8.345238095238095, - "grad_norm": 1.5303005114433177, - "learning_rate": 1.3119501603754705e-05, - "loss": 0.0595, - "step": 701 - }, - { - "epoch": 8.357142857142858, - "grad_norm": 1.0204967306864992, - "learning_rate": 1.3101172779206902e-05, - "loss": 0.0251, - "step": 702 - }, - { - "epoch": 8.369047619047619, - "grad_norm": 2.6953720121220024, - "learning_rate": 1.308283242056725e-05, - "loss": 0.0852, - "step": 703 - }, - { - "epoch": 8.380952380952381, - "grad_norm": 1.091732342741422, - "learning_rate": 1.3064480596048454e-05, - "loss": 0.0637, - "step": 704 - }, - { - "epoch": 8.392857142857142, - "grad_norm": 1.2212925528454934, - "learning_rate": 1.3046117373905866e-05, - "loss": 0.0295, - "step": 705 - }, - { - "epoch": 8.404761904761905, - "grad_norm": 1.311921484987392, - "learning_rate": 1.3027742822437222e-05, - "loss": 0.0126, - "step": 706 - }, - { - "epoch": 8.416666666666666, - "grad_norm": 1.845203590104293, - "learning_rate": 1.3009357009982397e-05, - "loss": 0.0568, - "step": 707 - }, - { - "epoch": 8.428571428571429, - "grad_norm": 2.1638646922168934, - "learning_rate": 1.2990960004923154e-05, - "loss": 0.0477, - "step": 708 - }, - { - "epoch": 8.44047619047619, - "grad_norm": 3.170909990089901, - "learning_rate": 1.2972551875682882e-05, - "loss": 0.0851, - "step": 709 - }, - { - "epoch": 8.452380952380953, - "grad_norm": 2.0061950163071662, - "learning_rate": 1.2954132690726335e-05, - "loss": 0.0409, - "step": 710 - }, - { - "epoch": 8.464285714285714, - "grad_norm": 1.0898938904654067, - "learning_rate": 1.2935702518559399e-05, - "loss": 0.0104, - "step": 711 - }, - { - "epoch": 8.476190476190476, - "grad_norm": 3.8531317949706008, - "learning_rate": 1.2917261427728815e-05, - "loss": 0.079, - "step": 712 - }, - { - "epoch": 8.488095238095237, - "grad_norm": 3.2258612383912584, - "learning_rate": 1.289880948682194e-05, - "loss": 0.0624, - "step": 713 - }, - { - "epoch": 8.5, - "grad_norm": 2.408210971584371, - "learning_rate": 1.288034676446648e-05, - "loss": 0.0746, - "step": 714 - }, - { - "epoch": 8.511904761904763, - "grad_norm": 1.8900504075144005, - "learning_rate": 1.2861873329330248e-05, - "loss": 0.0598, - "step": 715 - }, - { - "epoch": 8.523809523809524, - "grad_norm": 5.2541248321128196, - "learning_rate": 1.2843389250120885e-05, - "loss": 0.0451, - "step": 716 - }, - { - "epoch": 8.535714285714286, - "grad_norm": 11.687616844777143, - "learning_rate": 1.2824894595585637e-05, - "loss": 0.0734, - "step": 717 - }, - { - "epoch": 8.547619047619047, - "grad_norm": 2.7273662847529643, - "learning_rate": 1.2806389434511078e-05, - "loss": 0.0887, - "step": 718 - }, - { - "epoch": 8.55952380952381, - "grad_norm": 0.8084461433790741, - "learning_rate": 1.2787873835722851e-05, - "loss": 0.0175, - "step": 719 - }, - { - "epoch": 8.571428571428571, - "grad_norm": 2.562516566277651, - "learning_rate": 1.276934786808543e-05, - "loss": 0.037, - "step": 720 - }, - { - "epoch": 8.583333333333334, - "grad_norm": 1.964626228993161, - "learning_rate": 1.2750811600501843e-05, - "loss": 0.0143, - "step": 721 - }, - { - "epoch": 8.595238095238095, - "grad_norm": 2.347260151226618, - "learning_rate": 1.2732265101913435e-05, - "loss": 0.0992, - "step": 722 - }, - { - "epoch": 8.607142857142858, - "grad_norm": 2.332067934310295, - "learning_rate": 1.2713708441299602e-05, - "loss": 0.0903, - "step": 723 - }, - { - "epoch": 8.619047619047619, - "grad_norm": 2.433504640915981, - "learning_rate": 1.2695141687677527e-05, - "loss": 0.0906, - "step": 724 - }, - { - "epoch": 8.630952380952381, - "grad_norm": 0.7365574346673278, - "learning_rate": 1.2676564910101948e-05, - "loss": 0.0066, - "step": 725 - }, - { - "epoch": 8.642857142857142, - "grad_norm": 2.1926068397741028, - "learning_rate": 1.2657978177664862e-05, - "loss": 0.0692, - "step": 726 - }, - { - "epoch": 8.654761904761905, - "grad_norm": 3.677952898983117, - "learning_rate": 1.263938155949531e-05, - "loss": 0.111, - "step": 727 - }, - { - "epoch": 8.666666666666666, - "grad_norm": 1.9850598171802365, - "learning_rate": 1.2620775124759093e-05, - "loss": 0.0676, - "step": 728 - }, - { - "epoch": 8.678571428571429, - "grad_norm": 4.3432896002801495, - "learning_rate": 1.2602158942658523e-05, - "loss": 0.1596, - "step": 729 - }, - { - "epoch": 8.69047619047619, - "grad_norm": 1.6243741730128927, - "learning_rate": 1.2583533082432168e-05, - "loss": 0.0642, - "step": 730 - }, - { - "epoch": 8.702380952380953, - "grad_norm": 1.27022170434261, - "learning_rate": 1.2564897613354586e-05, - "loss": 0.0247, - "step": 731 - }, - { - "epoch": 8.714285714285714, - "grad_norm": 1.323666707186927, - "learning_rate": 1.2546252604736073e-05, - "loss": 0.0427, - "step": 732 - }, - { - "epoch": 8.726190476190476, - "grad_norm": 3.3123883449785327, - "learning_rate": 1.2527598125922413e-05, - "loss": 0.0919, - "step": 733 - }, - { - "epoch": 8.738095238095237, - "grad_norm": 0.867843230559778, - "learning_rate": 1.2508934246294604e-05, - "loss": 0.0511, - "step": 734 - }, - { - "epoch": 8.75, - "grad_norm": 1.2662403709808365, - "learning_rate": 1.2490261035268614e-05, - "loss": 0.0355, - "step": 735 - }, - { - "epoch": 8.761904761904763, - "grad_norm": 1.3128545329767234, - "learning_rate": 1.2471578562295115e-05, - "loss": 0.0424, - "step": 736 - }, - { - "epoch": 8.773809523809524, - "grad_norm": 5.195758504027162, - "learning_rate": 1.245288689685922e-05, - "loss": 0.0486, - "step": 737 - }, - { - "epoch": 8.785714285714286, - "grad_norm": 2.397782390906139, - "learning_rate": 1.243418610848024e-05, - "loss": 0.0501, - "step": 738 - }, - { - "epoch": 8.797619047619047, - "grad_norm": 3.803902756894812, - "learning_rate": 1.2415476266711415e-05, - "loss": 0.0724, - "step": 739 - }, - { - "epoch": 8.80952380952381, - "grad_norm": 3.9391624067695212, - "learning_rate": 1.2396757441139655e-05, - "loss": 0.0669, - "step": 740 - }, - { - "epoch": 8.821428571428571, - "grad_norm": 2.82436087893358, - "learning_rate": 1.2378029701385288e-05, - "loss": 0.0941, - "step": 741 - }, - { - "epoch": 8.833333333333334, - "grad_norm": 1.3255227276592592, - "learning_rate": 1.2359293117101783e-05, - "loss": 0.0312, - "step": 742 - }, - { - "epoch": 8.845238095238095, - "grad_norm": 15.30188169590839, - "learning_rate": 1.234054775797552e-05, - "loss": 0.2154, - "step": 743 - }, - { - "epoch": 8.857142857142858, - "grad_norm": 1.5966516669338604, - "learning_rate": 1.2321793693725509e-05, - "loss": 0.0496, - "step": 744 - }, - { - "epoch": 8.869047619047619, - "grad_norm": 1.4094052298445738, - "learning_rate": 1.2303030994103133e-05, - "loss": 0.0403, - "step": 745 - }, - { - "epoch": 8.880952380952381, - "grad_norm": 1.3961941919580525, - "learning_rate": 1.2284259728891897e-05, - "loss": 0.0432, - "step": 746 - }, - { - "epoch": 8.892857142857142, - "grad_norm": 1.7789035378853075, - "learning_rate": 1.2265479967907158e-05, - "loss": 0.0558, - "step": 747 - }, - { - "epoch": 8.904761904761905, - "grad_norm": 1.3087355978594275, - "learning_rate": 1.2246691780995881e-05, - "loss": 0.0513, - "step": 748 - }, - { - "epoch": 8.916666666666666, - "grad_norm": 2.4584365408967748, - "learning_rate": 1.2227895238036358e-05, - "loss": 0.0899, - "step": 749 - }, - { - "epoch": 8.928571428571429, - "grad_norm": 2.4158077298850396, - "learning_rate": 1.2209090408937972e-05, - "loss": 0.0968, - "step": 750 - }, - { - "epoch": 8.94047619047619, - "grad_norm": 1.4637737987592996, - "learning_rate": 1.2190277363640908e-05, - "loss": 0.0245, - "step": 751 - }, - { - "epoch": 8.952380952380953, - "grad_norm": 3.702641437817495, - "learning_rate": 1.2171456172115922e-05, - "loss": 0.1082, - "step": 752 - }, - { - "epoch": 8.964285714285714, - "grad_norm": 2.349077228223263, - "learning_rate": 1.2152626904364066e-05, - "loss": 0.0929, - "step": 753 - }, - { - "epoch": 8.976190476190476, - "grad_norm": 1.1008670892256474, - "learning_rate": 1.2133789630416426e-05, - "loss": 0.0355, - "step": 754 - }, - { - "epoch": 8.988095238095237, - "grad_norm": 2.1587571145198017, - "learning_rate": 1.211494442033387e-05, - "loss": 0.0204, - "step": 755 - }, - { - "epoch": 9.0, - "grad_norm": 0.8256549231338911, - "learning_rate": 1.2096091344206778e-05, - "loss": 0.0252, - "step": 756 - }, - { - "epoch": 9.0, - "eval_loss": 0.09582231193780899, - "eval_runtime": 40.4622, - "eval_samples_per_second": 1.483, - "eval_steps_per_second": 1.483, - "step": 756 - }, - { - "epoch": 9.011904761904763, - "grad_norm": 1.282205989400026, - "learning_rate": 1.2077230472154787e-05, - "loss": 0.0295, - "step": 757 - }, - { - "epoch": 9.023809523809524, - "grad_norm": 2.1270661891136293, - "learning_rate": 1.2058361874326527e-05, - "loss": 0.0757, - "step": 758 - }, - { - "epoch": 9.035714285714286, - "grad_norm": 1.4062511264145792, - "learning_rate": 1.203948562089937e-05, - "loss": 0.044, - "step": 759 - }, - { - "epoch": 9.047619047619047, - "grad_norm": 2.1904184923247643, - "learning_rate": 1.2020601782079155e-05, - "loss": 0.0824, - "step": 760 - }, - { - "epoch": 9.05952380952381, - "grad_norm": 3.286148753711866, - "learning_rate": 1.2001710428099935e-05, - "loss": 0.0836, - "step": 761 - }, - { - "epoch": 9.071428571428571, - "grad_norm": 1.5939900895983135, - "learning_rate": 1.198281162922371e-05, - "loss": 0.0509, - "step": 762 - }, - { - "epoch": 9.083333333333334, - "grad_norm": 0.8241534897298811, - "learning_rate": 1.1963905455740176e-05, - "loss": 0.0228, - "step": 763 - }, - { - "epoch": 9.095238095238095, - "grad_norm": 1.3287066485870127, - "learning_rate": 1.1944991977966452e-05, - "loss": 0.033, - "step": 764 - }, - { - "epoch": 9.107142857142858, - "grad_norm": 1.4021161713675208, - "learning_rate": 1.1926071266246826e-05, - "loss": 0.0351, - "step": 765 - }, - { - "epoch": 9.119047619047619, - "grad_norm": 6.391793428333144, - "learning_rate": 1.1907143390952494e-05, - "loss": 0.0991, - "step": 766 - }, - { - "epoch": 9.130952380952381, - "grad_norm": 2.386900925645386, - "learning_rate": 1.1888208422481287e-05, - "loss": 0.0205, - "step": 767 - }, - { - "epoch": 9.142857142857142, - "grad_norm": 7.279317351490466, - "learning_rate": 1.1869266431257422e-05, - "loss": 0.0735, - "step": 768 - }, - { - "epoch": 9.154761904761905, - "grad_norm": 24.006127332322464, - "learning_rate": 1.185031748773124e-05, - "loss": 0.219, - "step": 769 - }, - { - "epoch": 9.166666666666666, - "grad_norm": 2.0743647255989, - "learning_rate": 1.1831361662378933e-05, - "loss": 0.0585, - "step": 770 - }, - { - "epoch": 9.178571428571429, - "grad_norm": 1.4495226552360447, - "learning_rate": 1.1812399025702291e-05, - "loss": 0.0279, - "step": 771 - }, - { - "epoch": 9.19047619047619, - "grad_norm": 2.482091671301266, - "learning_rate": 1.1793429648228437e-05, - "loss": 0.0756, - "step": 772 - }, - { - "epoch": 9.202380952380953, - "grad_norm": 2.2578166171442784, - "learning_rate": 1.177445360050956e-05, - "loss": 0.0687, - "step": 773 - }, - { - "epoch": 9.214285714285714, - "grad_norm": 1.3642263825505174, - "learning_rate": 1.1755470953122668e-05, - "loss": 0.033, - "step": 774 - }, - { - "epoch": 9.226190476190476, - "grad_norm": 0.43557975402669774, - "learning_rate": 1.1736481776669307e-05, - "loss": 0.0051, - "step": 775 - }, - { - "epoch": 9.238095238095237, - "grad_norm": 1.7819915407174811, - "learning_rate": 1.1717486141775305e-05, - "loss": 0.0145, - "step": 776 - }, - { - "epoch": 9.25, - "grad_norm": 2.524511395140908, - "learning_rate": 1.1698484119090518e-05, - "loss": 0.0752, - "step": 777 - }, - { - "epoch": 9.261904761904763, - "grad_norm": 1.3378846697745967, - "learning_rate": 1.1679475779288555e-05, - "loss": 0.054, - "step": 778 - }, - { - "epoch": 9.273809523809524, - "grad_norm": 2.864204351537188, - "learning_rate": 1.1660461193066521e-05, - "loss": 0.1019, - "step": 779 - }, - { - "epoch": 9.285714285714286, - "grad_norm": 3.044999321706026, - "learning_rate": 1.164144043114475e-05, - "loss": 0.1038, - "step": 780 - }, - { - "epoch": 9.297619047619047, - "grad_norm": 1.4627132525729252, - "learning_rate": 1.1622413564266554e-05, - "loss": 0.0573, - "step": 781 - }, - { - "epoch": 9.30952380952381, - "grad_norm": 2.677810759821976, - "learning_rate": 1.1603380663197941e-05, - "loss": 0.0715, - "step": 782 - }, - { - "epoch": 9.321428571428571, - "grad_norm": 2.063886050190795, - "learning_rate": 1.1584341798727365e-05, - "loss": 0.0799, - "step": 783 - }, - { - "epoch": 9.333333333333334, - "grad_norm": 0.669569446308257, - "learning_rate": 1.156529704166546e-05, - "loss": 0.0062, - "step": 784 - }, - { - "epoch": 9.345238095238095, - "grad_norm": 1.9073286324932273, - "learning_rate": 1.154624646284478e-05, - "loss": 0.0671, - "step": 785 - }, - { - "epoch": 9.357142857142858, - "grad_norm": 3.517001704800636, - "learning_rate": 1.1527190133119527e-05, - "loss": 0.0793, - "step": 786 - }, - { - "epoch": 9.369047619047619, - "grad_norm": 4.373994962095242, - "learning_rate": 1.150812812336529e-05, - "loss": 0.1071, - "step": 787 - }, - { - "epoch": 9.380952380952381, - "grad_norm": 4.416943179885893, - "learning_rate": 1.1489060504478788e-05, - "loss": 0.1162, - "step": 788 - }, - { - "epoch": 9.392857142857142, - "grad_norm": 1.7509893632815494, - "learning_rate": 1.1469987347377602e-05, - "loss": 0.0305, - "step": 789 - }, - { - "epoch": 9.404761904761905, - "grad_norm": 1.97331353114604, - "learning_rate": 1.145090872299991e-05, - "loss": 0.0588, - "step": 790 - }, - { - "epoch": 9.416666666666666, - "grad_norm": 2.3558012359385088, - "learning_rate": 1.1431824702304222e-05, - "loss": 0.0583, - "step": 791 - }, - { - "epoch": 9.428571428571429, - "grad_norm": 2.688493267114919, - "learning_rate": 1.1412735356269124e-05, - "loss": 0.07, - "step": 792 - }, - { - "epoch": 9.44047619047619, - "grad_norm": 1.9416496607434544, - "learning_rate": 1.1393640755893003e-05, - "loss": 0.0439, - "step": 793 - }, - { - "epoch": 9.452380952380953, - "grad_norm": 1.7273468969890204, - "learning_rate": 1.1374540972193787e-05, - "loss": 0.0523, - "step": 794 - }, - { - "epoch": 9.464285714285714, - "grad_norm": 2.9273956897516826, - "learning_rate": 1.1355436076208687e-05, - "loss": 0.0664, - "step": 795 - }, - { - "epoch": 9.476190476190476, - "grad_norm": 6.234566222242338, - "learning_rate": 1.1336326138993927e-05, - "loss": 0.0867, - "step": 796 - }, - { - "epoch": 9.488095238095237, - "grad_norm": 2.322428354346157, - "learning_rate": 1.1317211231624482e-05, - "loss": 0.0694, - "step": 797 - }, - { - "epoch": 9.5, - "grad_norm": 3.2692359052782787, - "learning_rate": 1.1298091425193807e-05, - "loss": 0.1264, - "step": 798 - }, - { - "epoch": 9.511904761904763, - "grad_norm": 1.6606364957031776, - "learning_rate": 1.1278966790813582e-05, - "loss": 0.044, - "step": 799 - }, - { - "epoch": 9.523809523809524, - "grad_norm": 1.1379465046983437, - "learning_rate": 1.125983739961344e-05, - "loss": 0.01, - "step": 800 - }, - { - "epoch": 9.535714285714286, - "grad_norm": 1.9162169523281654, - "learning_rate": 1.124070332274071e-05, - "loss": 0.0437, - "step": 801 - }, - { - "epoch": 9.547619047619047, - "grad_norm": 3.0304053559495294, - "learning_rate": 1.1221564631360154e-05, - "loss": 0.0766, - "step": 802 - }, - { - "epoch": 9.55952380952381, - "grad_norm": 5.340769713504394, - "learning_rate": 1.1202421396653678e-05, - "loss": 0.0384, - "step": 803 - }, - { - "epoch": 9.571428571428571, - "grad_norm": 1.453031390336659, - "learning_rate": 1.11832736898201e-05, - "loss": 0.0126, - "step": 804 - }, - { - "epoch": 9.583333333333334, - "grad_norm": 1.112891160662993, - "learning_rate": 1.1164121582074874e-05, - "loss": 0.0108, - "step": 805 - }, - { - "epoch": 9.595238095238095, - "grad_norm": 1.2126762063748622, - "learning_rate": 1.1144965144649809e-05, - "loss": 0.0354, - "step": 806 - }, - { - "epoch": 9.607142857142858, - "grad_norm": 1.2210629125933703, - "learning_rate": 1.1125804448792832e-05, - "loss": 0.0119, - "step": 807 - }, - { - "epoch": 9.619047619047619, - "grad_norm": 2.7345990636822264, - "learning_rate": 1.1106639565767691e-05, - "loss": 0.0862, - "step": 808 - }, - { - "epoch": 9.630952380952381, - "grad_norm": 1.4528274005758528, - "learning_rate": 1.1087470566853727e-05, - "loss": 0.0473, - "step": 809 - }, - { - "epoch": 9.642857142857142, - "grad_norm": 3.23106792107614, - "learning_rate": 1.1068297523345573e-05, - "loss": 0.1117, - "step": 810 - }, - { - "epoch": 9.654761904761905, - "grad_norm": 1.3500416514630418, - "learning_rate": 1.1049120506552913e-05, - "loss": 0.0196, - "step": 811 - }, - { - "epoch": 9.666666666666666, - "grad_norm": 2.5216815103287296, - "learning_rate": 1.1029939587800207e-05, - "loss": 0.0714, - "step": 812 - }, - { - "epoch": 9.678571428571429, - "grad_norm": 1.331833717446806, - "learning_rate": 1.1010754838426427e-05, - "loss": 0.0367, - "step": 813 - }, - { - "epoch": 9.69047619047619, - "grad_norm": 0.8528992509172784, - "learning_rate": 1.099156632978479e-05, - "loss": 0.0078, - "step": 814 - }, - { - "epoch": 9.702380952380953, - "grad_norm": 1.9692634466833878, - "learning_rate": 1.0972374133242503e-05, - "loss": 0.0357, - "step": 815 - }, - { - "epoch": 9.714285714285714, - "grad_norm": 2.40950210370164, - "learning_rate": 1.0953178320180475e-05, - "loss": 0.0901, - "step": 816 - }, - { - "epoch": 9.726190476190476, - "grad_norm": 0.35898819817840166, - "learning_rate": 1.0933978961993084e-05, - "loss": 0.0044, - "step": 817 - }, - { - "epoch": 9.738095238095237, - "grad_norm": 0.706113936829129, - "learning_rate": 1.0914776130087874e-05, - "loss": 0.0066, - "step": 818 - }, - { - "epoch": 9.75, - "grad_norm": 2.075397250889756, - "learning_rate": 1.0895569895885321e-05, - "loss": 0.0343, - "step": 819 - }, - { - "epoch": 9.761904761904763, - "grad_norm": 1.1878627675894107, - "learning_rate": 1.0876360330818554e-05, - "loss": 0.0112, - "step": 820 - }, - { - "epoch": 9.773809523809524, - "grad_norm": 2.0754258512016817, - "learning_rate": 1.0857147506333087e-05, - "loss": 0.0287, - "step": 821 - }, - { - "epoch": 9.785714285714286, - "grad_norm": 2.270181907997249, - "learning_rate": 1.0837931493886561e-05, - "loss": 0.053, - "step": 822 - }, - { - "epoch": 9.797619047619047, - "grad_norm": 2.996581841029044, - "learning_rate": 1.081871236494847e-05, - "loss": 0.1128, - "step": 823 - }, - { - "epoch": 9.80952380952381, - "grad_norm": 1.4375053769264925, - "learning_rate": 1.0799490190999893e-05, - "loss": 0.0428, - "step": 824 - }, - { - "epoch": 9.821428571428571, - "grad_norm": 1.9654964465679015, - "learning_rate": 1.0780265043533252e-05, - "loss": 0.0179, - "step": 825 - }, - { - "epoch": 9.833333333333334, - "grad_norm": 2.2834355055646607, - "learning_rate": 1.0761036994052008e-05, - "loss": 0.0764, - "step": 826 - }, - { - "epoch": 9.845238095238095, - "grad_norm": 2.677653120102437, - "learning_rate": 1.0741806114070434e-05, - "loss": 0.0541, - "step": 827 - }, - { - "epoch": 9.857142857142858, - "grad_norm": 2.3216873278592494, - "learning_rate": 1.0722572475113316e-05, - "loss": 0.0445, - "step": 828 - }, - { - "epoch": 9.869047619047619, - "grad_norm": 2.134658538525783, - "learning_rate": 1.0703336148715705e-05, - "loss": 0.0504, - "step": 829 - }, - { - "epoch": 9.880952380952381, - "grad_norm": 3.4003928781027457, - "learning_rate": 1.0684097206422654e-05, - "loss": 0.0889, - "step": 830 - }, - { - "epoch": 9.892857142857142, - "grad_norm": 2.8288540439979277, - "learning_rate": 1.0664855719788936e-05, - "loss": 0.0778, - "step": 831 - }, - { - "epoch": 9.904761904761905, - "grad_norm": 1.6914967044629623, - "learning_rate": 1.0645611760378795e-05, - "loss": 0.0546, - "step": 832 - }, - { - "epoch": 9.916666666666666, - "grad_norm": 3.2241458761178876, - "learning_rate": 1.0626365399765668e-05, - "loss": 0.0458, - "step": 833 - }, - { - "epoch": 9.928571428571429, - "grad_norm": 6.421780549816035, - "learning_rate": 1.060711670953192e-05, - "loss": 0.0422, - "step": 834 - }, - { - "epoch": 9.94047619047619, - "grad_norm": 2.2721390346454284, - "learning_rate": 1.0587865761268583e-05, - "loss": 0.0546, - "step": 835 - }, - { - "epoch": 9.952380952380953, - "grad_norm": 1.9385170559910914, - "learning_rate": 1.0568612626575093e-05, - "loss": 0.0524, - "step": 836 - }, - { - "epoch": 9.964285714285714, - "grad_norm": 2.182053081295405, - "learning_rate": 1.0549357377059007e-05, - "loss": 0.021, - "step": 837 - }, - { - "epoch": 9.976190476190476, - "grad_norm": 1.550297109310986, - "learning_rate": 1.053010008433576e-05, - "loss": 0.0316, - "step": 838 - }, - { - "epoch": 9.988095238095237, - "grad_norm": 1.3496799629334293, - "learning_rate": 1.051084082002837e-05, - "loss": 0.0369, - "step": 839 - }, - { - "epoch": 10.0, - "grad_norm": 2.806285360073082, - "learning_rate": 1.0491579655767203e-05, - "loss": 0.0744, - "step": 840 - }, - { - "epoch": 10.0, - "eval_loss": 0.17259296774864197, - "eval_runtime": 39.9462, - "eval_samples_per_second": 1.502, - "eval_steps_per_second": 1.502, - "step": 840 - }, - { - "epoch": 10.011904761904763, - "grad_norm": 2.1627843341088786, - "learning_rate": 1.0472316663189683e-05, - "loss": 0.0561, - "step": 841 - }, - { - "epoch": 10.023809523809524, - "grad_norm": 2.8309484064277246, - "learning_rate": 1.0453051913940042e-05, - "loss": 0.0822, - "step": 842 - }, - { - "epoch": 10.035714285714286, - "grad_norm": 4.436488618508568, - "learning_rate": 1.0433785479669038e-05, - "loss": 0.0324, - "step": 843 - }, - { - "epoch": 10.047619047619047, - "grad_norm": 2.812367342584338, - "learning_rate": 1.0414517432033695e-05, - "loss": 0.0688, - "step": 844 - }, - { - "epoch": 10.05952380952381, - "grad_norm": 1.9245163132317105, - "learning_rate": 1.039524784269704e-05, - "loss": 0.0476, - "step": 845 - }, - { - "epoch": 10.071428571428571, - "grad_norm": 1.7189647202551532, - "learning_rate": 1.0375976783327841e-05, - "loss": 0.0301, - "step": 846 - }, - { - "epoch": 10.083333333333334, - "grad_norm": 0.9817231239896711, - "learning_rate": 1.0356704325600323e-05, - "loss": 0.0077, - "step": 847 - }, - { - "epoch": 10.095238095238095, - "grad_norm": 1.3545517237783122, - "learning_rate": 1.0337430541193918e-05, - "loss": 0.0528, - "step": 848 - }, - { - "epoch": 10.107142857142858, - "grad_norm": 1.9013617849189701, - "learning_rate": 1.0318155501792988e-05, - "loss": 0.0469, - "step": 849 - }, - { - "epoch": 10.119047619047619, - "grad_norm": 1.5367684534473693, - "learning_rate": 1.0298879279086568e-05, - "loss": 0.0186, - "step": 850 - }, - { - "epoch": 10.130952380952381, - "grad_norm": 1.4082305565292088, - "learning_rate": 1.027960194476809e-05, - "loss": 0.0414, - "step": 851 - }, - { - "epoch": 10.142857142857142, - "grad_norm": 1.9714729317443556, - "learning_rate": 1.026032357053512e-05, - "loss": 0.0327, - "step": 852 - }, - { - "epoch": 10.154761904761905, - "grad_norm": 1.2486033765999593, - "learning_rate": 1.0241044228089096e-05, - "loss": 0.0153, - "step": 853 - }, - { - "epoch": 10.166666666666666, - "grad_norm": 1.8558173945665013, - "learning_rate": 1.0221763989135052e-05, - "loss": 0.0317, - "step": 854 - }, - { - "epoch": 10.178571428571429, - "grad_norm": 1.9976430929242892, - "learning_rate": 1.0202482925381359e-05, - "loss": 0.0674, - "step": 855 - }, - { - "epoch": 10.19047619047619, - "grad_norm": 2.2642858611465, - "learning_rate": 1.0183201108539453e-05, - "loss": 0.0148, - "step": 856 - }, - { - "epoch": 10.202380952380953, - "grad_norm": 3.740932380776293, - "learning_rate": 1.016391861032358e-05, - "loss": 0.0669, - "step": 857 - }, - { - "epoch": 10.214285714285714, - "grad_norm": 1.8188813093185374, - "learning_rate": 1.0144635502450509e-05, - "loss": 0.0379, - "step": 858 - }, - { - "epoch": 10.226190476190476, - "grad_norm": 2.7437019175338424, - "learning_rate": 1.0125351856639279e-05, - "loss": 0.0326, - "step": 859 - }, - { - "epoch": 10.238095238095237, - "grad_norm": 3.933447240473136, - "learning_rate": 1.0106067744610932e-05, - "loss": 0.0809, - "step": 860 - }, - { - "epoch": 10.25, - "grad_norm": 2.6212363917089987, - "learning_rate": 1.0086783238088244e-05, - "loss": 0.0181, - "step": 861 - }, - { - "epoch": 10.261904761904763, - "grad_norm": 1.4956127769577703, - "learning_rate": 1.0067498408795462e-05, - "loss": 0.0393, - "step": 862 - }, - { - "epoch": 10.273809523809524, - "grad_norm": 4.756355647768309, - "learning_rate": 1.0048213328458027e-05, - "loss": 0.0484, - "step": 863 - }, - { - "epoch": 10.285714285714286, - "grad_norm": 5.264351585446348, - "learning_rate": 1.0028928068802314e-05, - "loss": 0.0264, - "step": 864 - }, - { - "epoch": 10.297619047619047, - "grad_norm": 1.4844810359483214, - "learning_rate": 1.0009642701555369e-05, - "loss": 0.0303, - "step": 865 - }, - { - "epoch": 10.30952380952381, - "grad_norm": 2.603905301584864, - "learning_rate": 9.990357298444631e-06, - "loss": 0.0162, - "step": 866 - }, - { - "epoch": 10.321428571428571, - "grad_norm": 2.8184520996344444, - "learning_rate": 9.971071931197686e-06, - "loss": 0.0552, - "step": 867 - }, - { - "epoch": 10.333333333333334, - "grad_norm": 1.2787347559583708, - "learning_rate": 9.951786671541975e-06, - "loss": 0.0091, - "step": 868 - }, - { - "epoch": 10.345238095238095, - "grad_norm": 3.460050048787224, - "learning_rate": 9.932501591204538e-06, - "loss": 0.0852, - "step": 869 - }, - { - "epoch": 10.357142857142858, - "grad_norm": 3.4097967028653273, - "learning_rate": 9.913216761911754e-06, - "loss": 0.067, - "step": 870 - }, - { - "epoch": 10.369047619047619, - "grad_norm": 2.4386231982584436, - "learning_rate": 9.89393225538907e-06, - "loss": 0.0556, - "step": 871 - }, - { - "epoch": 10.380952380952381, - "grad_norm": 2.933272866461037, - "learning_rate": 9.874648143360723e-06, - "loss": 0.0496, - "step": 872 - }, - { - "epoch": 10.392857142857142, - "grad_norm": 1.8929442632741174, - "learning_rate": 9.855364497549495e-06, - "loss": 0.0413, - "step": 873 - }, - { - "epoch": 10.404761904761905, - "grad_norm": 2.3616431199563435, - "learning_rate": 9.836081389676422e-06, - "loss": 0.0538, - "step": 874 - }, - { - "epoch": 10.416666666666666, - "grad_norm": 2.4127824710359524, - "learning_rate": 9.816798891460545e-06, - "loss": 0.0191, - "step": 875 - }, - { - "epoch": 10.428571428571429, - "grad_norm": 3.907132416538552, - "learning_rate": 9.797517074618643e-06, - "loss": 0.0892, - "step": 876 - }, - { - "epoch": 10.44047619047619, - "grad_norm": 4.181403959687322, - "learning_rate": 9.778236010864948e-06, - "loss": 0.0902, - "step": 877 - }, - { - "epoch": 10.452380952380953, - "grad_norm": 2.3611084962335953, - "learning_rate": 9.758955771910906e-06, - "loss": 0.0162, - "step": 878 - }, - { - "epoch": 10.464285714285714, - "grad_norm": 1.9021196469452089, - "learning_rate": 9.739676429464881e-06, - "loss": 0.0535, - "step": 879 - }, - { - "epoch": 10.476190476190476, - "grad_norm": 3.988458784098384, - "learning_rate": 9.72039805523191e-06, - "loss": 0.0521, - "step": 880 - }, - { - "epoch": 10.488095238095237, - "grad_norm": 1.6135380627489257, - "learning_rate": 9.701120720913434e-06, - "loss": 0.0289, - "step": 881 - }, - { - "epoch": 10.5, - "grad_norm": 2.7761673436825713, - "learning_rate": 9.681844498207012e-06, - "loss": 0.0658, - "step": 882 - }, - { - "epoch": 10.511904761904763, - "grad_norm": 2.3672118880045385, - "learning_rate": 9.662569458806085e-06, - "loss": 0.0162, - "step": 883 - }, - { - "epoch": 10.523809523809524, - "grad_norm": 2.29142928632914, - "learning_rate": 9.64329567439968e-06, - "loss": 0.0429, - "step": 884 - }, - { - "epoch": 10.535714285714286, - "grad_norm": 2.6254810846588823, - "learning_rate": 9.624023216672162e-06, - "loss": 0.038, - "step": 885 - }, - { - "epoch": 10.547619047619047, - "grad_norm": 3.7113842881972587, - "learning_rate": 9.604752157302962e-06, - "loss": 0.0229, - "step": 886 - }, - { - "epoch": 10.55952380952381, - "grad_norm": 3.3798411722544457, - "learning_rate": 9.585482567966309e-06, - "loss": 0.0569, - "step": 887 - }, - { - "epoch": 10.571428571428571, - "grad_norm": 1.3225876420518712, - "learning_rate": 9.566214520330966e-06, - "loss": 0.0251, - "step": 888 - }, - { - "epoch": 10.583333333333334, - "grad_norm": 1.4216589719224715, - "learning_rate": 9.54694808605996e-06, - "loss": 0.0173, - "step": 889 - }, - { - "epoch": 10.595238095238095, - "grad_norm": 1.3910055558361236, - "learning_rate": 9.527683336810319e-06, - "loss": 0.0186, - "step": 890 - }, - { - "epoch": 10.607142857142858, - "grad_norm": 1.7766662091152174, - "learning_rate": 9.5084203442328e-06, - "loss": 0.0203, - "step": 891 - }, - { - "epoch": 10.619047619047619, - "grad_norm": 0.5740612737142285, - "learning_rate": 9.489159179971633e-06, - "loss": 0.0051, - "step": 892 - }, - { - "epoch": 10.630952380952381, - "grad_norm": 2.3148547584071126, - "learning_rate": 9.469899915664245e-06, - "loss": 0.0315, - "step": 893 - }, - { - "epoch": 10.642857142857142, - "grad_norm": 4.294237887718487, - "learning_rate": 9.450642622940995e-06, - "loss": 0.0989, - "step": 894 - }, - { - "epoch": 10.654761904761905, - "grad_norm": 1.9110906979245355, - "learning_rate": 9.43138737342491e-06, - "loss": 0.0375, - "step": 895 - }, - { - "epoch": 10.666666666666666, - "grad_norm": 3.080110364647247, - "learning_rate": 9.412134238731419e-06, - "loss": 0.0217, - "step": 896 - }, - { - "epoch": 10.678571428571429, - "grad_norm": 1.363341861610208, - "learning_rate": 9.392883290468084e-06, - "loss": 0.0435, - "step": 897 - }, - { - "epoch": 10.69047619047619, - "grad_norm": 4.827423440092501, - "learning_rate": 9.373634600234336e-06, - "loss": 0.068, - "step": 898 - }, - { - "epoch": 10.702380952380953, - "grad_norm": 4.225961183200878, - "learning_rate": 9.354388239621208e-06, - "loss": 0.0479, - "step": 899 - }, - { - "epoch": 10.714285714285714, - "grad_norm": 1.7229712679249933, - "learning_rate": 9.335144280211066e-06, - "loss": 0.0305, - "step": 900 - }, - { - "epoch": 10.726190476190476, - "grad_norm": 1.0787497487434379, - "learning_rate": 9.315902793577349e-06, - "loss": 0.0145, - "step": 901 - }, - { - "epoch": 10.738095238095237, - "grad_norm": 2.7023940269650053, - "learning_rate": 9.296663851284298e-06, - "loss": 0.0739, - "step": 902 - }, - { - "epoch": 10.75, - "grad_norm": 1.9107792283247966, - "learning_rate": 9.277427524886689e-06, - "loss": 0.0466, - "step": 903 - }, - { - "epoch": 10.761904761904763, - "grad_norm": 2.345839473140023, - "learning_rate": 9.258193885929569e-06, - "loss": 0.0598, - "step": 904 - }, - { - "epoch": 10.773809523809524, - "grad_norm": 2.29363713803529, - "learning_rate": 9.238963005947994e-06, - "loss": 0.0303, - "step": 905 - }, - { - "epoch": 10.785714285714286, - "grad_norm": 10.905447082859407, - "learning_rate": 9.219734956466753e-06, - "loss": 0.1111, - "step": 906 - }, - { - "epoch": 10.797619047619047, - "grad_norm": 3.4292885991616764, - "learning_rate": 9.200509809000108e-06, - "loss": 0.0189, - "step": 907 - }, - { - "epoch": 10.80952380952381, - "grad_norm": 3.960545973039541, - "learning_rate": 9.181287635051535e-06, - "loss": 0.0196, - "step": 908 - }, - { - "epoch": 10.821428571428571, - "grad_norm": 4.77796269861986, - "learning_rate": 9.16206850611344e-06, - "loss": 0.0641, - "step": 909 - }, - { - "epoch": 10.833333333333334, - "grad_norm": 2.248205151577021, - "learning_rate": 9.142852493666914e-06, - "loss": 0.0463, - "step": 910 - }, - { - "epoch": 10.845238095238095, - "grad_norm": 0.5220374804491678, - "learning_rate": 9.123639669181448e-06, - "loss": 0.0037, - "step": 911 - }, - { - "epoch": 10.857142857142858, - "grad_norm": 1.4827333676054641, - "learning_rate": 9.10443010411468e-06, - "loss": 0.0363, - "step": 912 - }, - { - "epoch": 10.869047619047619, - "grad_norm": 2.3046166185723367, - "learning_rate": 9.085223869912129e-06, - "loss": 0.0264, - "step": 913 - }, - { - "epoch": 10.880952380952381, - "grad_norm": 1.374770737645624, - "learning_rate": 9.066021038006919e-06, - "loss": 0.0242, - "step": 914 - }, - { - "epoch": 10.892857142857142, - "grad_norm": 0.5505032227795658, - "learning_rate": 9.046821679819528e-06, - "loss": 0.0107, - "step": 915 - }, - { - "epoch": 10.904761904761905, - "grad_norm": 0.6878170520689223, - "learning_rate": 9.027625866757502e-06, - "loss": 0.0057, - "step": 916 - }, - { - "epoch": 10.916666666666666, - "grad_norm": 0.7765123697963863, - "learning_rate": 9.008433670215211e-06, - "loss": 0.0101, - "step": 917 - }, - { - "epoch": 10.928571428571429, - "grad_norm": 2.1303477070805057, - "learning_rate": 8.989245161573576e-06, - "loss": 0.0443, - "step": 918 - }, - { - "epoch": 10.94047619047619, - "grad_norm": 1.3982329826936775, - "learning_rate": 8.970060412199796e-06, - "loss": 0.0207, - "step": 919 - }, - { - "epoch": 10.952380952380953, - "grad_norm": 0.1649606219604496, - "learning_rate": 8.95087949344709e-06, - "loss": 0.002, - "step": 920 - }, - { - "epoch": 10.964285714285714, - "grad_norm": 3.2199107038343215, - "learning_rate": 8.931702476654432e-06, - "loss": 0.0505, - "step": 921 - }, - { - "epoch": 10.976190476190476, - "grad_norm": 2.16680588658055, - "learning_rate": 8.912529433146277e-06, - "loss": 0.0499, - "step": 922 - }, - { - "epoch": 10.988095238095237, - "grad_norm": 3.2534119784727134, - "learning_rate": 8.893360434232312e-06, - "loss": 0.0592, - "step": 923 - }, - { - "epoch": 11.0, - "grad_norm": 1.1963933272226175, - "learning_rate": 8.874195551207173e-06, - "loss": 0.0413, - "step": 924 - }, - { - "epoch": 11.0, - "eval_loss": 0.11936289817094803, - "eval_runtime": 41.3432, - "eval_samples_per_second": 1.451, - "eval_steps_per_second": 1.451, - "step": 924 - }, - { - "epoch": 11.011904761904763, - "grad_norm": 4.071414945159717, - "learning_rate": 8.855034855350195e-06, - "loss": 0.0826, - "step": 925 - }, - { - "epoch": 11.023809523809524, - "grad_norm": 1.8132042735392948, - "learning_rate": 8.835878417925132e-06, - "loss": 0.0227, - "step": 926 - }, - { - "epoch": 11.035714285714286, - "grad_norm": 2.175362642713856, - "learning_rate": 8.816726310179904e-06, - "loss": 0.0147, - "step": 927 - }, - { - "epoch": 11.047619047619047, - "grad_norm": 3.3405606733438193, - "learning_rate": 8.797578603346329e-06, - "loss": 0.0446, - "step": 928 - }, - { - "epoch": 11.05952380952381, - "grad_norm": 1.467071763402714, - "learning_rate": 8.778435368639851e-06, - "loss": 0.0434, - "step": 929 - }, - { - "epoch": 11.071428571428571, - "grad_norm": 2.378574565354438, - "learning_rate": 8.759296677259291e-06, - "loss": 0.0179, - "step": 930 - }, - { - "epoch": 11.083333333333334, - "grad_norm": 2.286995252249508, - "learning_rate": 8.740162600386564e-06, - "loss": 0.0364, - "step": 931 - }, - { - "epoch": 11.095238095238095, - "grad_norm": 0.3706268897484962, - "learning_rate": 8.721033209186425e-06, - "loss": 0.0076, - "step": 932 - }, - { - "epoch": 11.107142857142858, - "grad_norm": 1.8385403898102013, - "learning_rate": 8.701908574806198e-06, - "loss": 0.0372, - "step": 933 - }, - { - "epoch": 11.119047619047619, - "grad_norm": 9.977918756470613, - "learning_rate": 8.682788768375521e-06, - "loss": 0.1343, - "step": 934 - }, - { - "epoch": 11.130952380952381, - "grad_norm": 11.53576686757209, - "learning_rate": 8.663673861006075e-06, - "loss": 0.0868, - "step": 935 - }, - { - "epoch": 11.142857142857142, - "grad_norm": 0.6141826906637148, - "learning_rate": 8.644563923791318e-06, - "loss": 0.0108, - "step": 936 - }, - { - "epoch": 11.154761904761905, - "grad_norm": 2.432596609300301, - "learning_rate": 8.625459027806215e-06, - "loss": 0.0339, - "step": 937 - }, - { - "epoch": 11.166666666666666, - "grad_norm": 3.8542871920099198, - "learning_rate": 8.606359244106998e-06, - "loss": 0.0668, - "step": 938 - }, - { - "epoch": 11.178571428571429, - "grad_norm": 2.671768123413697, - "learning_rate": 8.587264643730877e-06, - "loss": 0.0619, - "step": 939 - }, - { - "epoch": 11.19047619047619, - "grad_norm": 3.196564343224687, - "learning_rate": 8.568175297695777e-06, - "loss": 0.0576, - "step": 940 - }, - { - "epoch": 11.202380952380953, - "grad_norm": 1.768532734461991, - "learning_rate": 8.549091277000092e-06, - "loss": 0.0233, - "step": 941 - }, - { - "epoch": 11.214285714285714, - "grad_norm": 0.7924847176972385, - "learning_rate": 8.530012652622398e-06, - "loss": 0.0047, - "step": 942 - }, - { - "epoch": 11.226190476190476, - "grad_norm": 1.7637958145761288, - "learning_rate": 8.510939495521213e-06, - "loss": 0.0507, - "step": 943 - }, - { - "epoch": 11.238095238095237, - "grad_norm": 1.582931148820781, - "learning_rate": 8.491871876634712e-06, - "loss": 0.0212, - "step": 944 - }, - { - "epoch": 11.25, - "grad_norm": 0.5914931459652956, - "learning_rate": 8.472809866880475e-06, - "loss": 0.0038, - "step": 945 - }, - { - "epoch": 11.261904761904763, - "grad_norm": 1.1896930113797612, - "learning_rate": 8.453753537155222e-06, - "loss": 0.0066, - "step": 946 - }, - { - "epoch": 11.273809523809524, - "grad_norm": 1.9364330725484524, - "learning_rate": 8.43470295833454e-06, - "loss": 0.0404, - "step": 947 - }, - { - "epoch": 11.285714285714286, - "grad_norm": 2.4320944736811754, - "learning_rate": 8.415658201272636e-06, - "loss": 0.0477, - "step": 948 - }, - { - "epoch": 11.297619047619047, - "grad_norm": 0.7784757843303742, - "learning_rate": 8.39661933680206e-06, - "loss": 0.0048, - "step": 949 - }, - { - "epoch": 11.30952380952381, - "grad_norm": 2.42654239840075, - "learning_rate": 8.377586435733448e-06, - "loss": 0.0434, - "step": 950 - }, - { - "epoch": 11.321428571428571, - "grad_norm": 1.4246506756586026, - "learning_rate": 8.35855956885525e-06, - "loss": 0.008, - "step": 951 - }, - { - "epoch": 11.333333333333334, - "grad_norm": 2.1965500168826084, - "learning_rate": 8.33953880693348e-06, - "loss": 0.0703, - "step": 952 - }, - { - "epoch": 11.345238095238095, - "grad_norm": 2.932674016353764, - "learning_rate": 8.320524220711446e-06, - "loss": 0.0822, - "step": 953 - }, - { - "epoch": 11.357142857142858, - "grad_norm": 1.7763421701243831, - "learning_rate": 8.301515880909482e-06, - "loss": 0.0646, - "step": 954 - }, - { - "epoch": 11.369047619047619, - "grad_norm": 2.8036007920254216, - "learning_rate": 8.282513858224698e-06, - "loss": 0.0571, - "step": 955 - }, - { - "epoch": 11.380952380952381, - "grad_norm": 2.2812771220745045, - "learning_rate": 8.263518223330698e-06, - "loss": 0.0403, - "step": 956 - }, - { - "epoch": 11.392857142857142, - "grad_norm": 2.7481991959683905, - "learning_rate": 8.244529046877336e-06, - "loss": 0.0118, - "step": 957 - }, - { - "epoch": 11.404761904761905, - "grad_norm": 2.5728367074156058, - "learning_rate": 8.225546399490442e-06, - "loss": 0.0292, - "step": 958 - }, - { - "epoch": 11.416666666666666, - "grad_norm": 2.1394262109496527, - "learning_rate": 8.206570351771568e-06, - "loss": 0.058, - "step": 959 - }, - { - "epoch": 11.428571428571429, - "grad_norm": 3.5093300747987968, - "learning_rate": 8.187600974297714e-06, - "loss": 0.0594, - "step": 960 - }, - { - "epoch": 11.44047619047619, - "grad_norm": 2.1510794700494036, - "learning_rate": 8.16863833762107e-06, - "loss": 0.0526, - "step": 961 - }, - { - "epoch": 11.452380952380953, - "grad_norm": 0.6585429250012116, - "learning_rate": 8.149682512268762e-06, - "loss": 0.0116, - "step": 962 - }, - { - "epoch": 11.464285714285714, - "grad_norm": 6.3589705470591005, - "learning_rate": 8.13073356874258e-06, - "loss": 0.0423, - "step": 963 - }, - { - "epoch": 11.476190476190476, - "grad_norm": 2.254323022814341, - "learning_rate": 8.111791577518716e-06, - "loss": 0.0161, - "step": 964 - }, - { - "epoch": 11.488095238095237, - "grad_norm": 3.5461286235553846, - "learning_rate": 8.092856609047507e-06, - "loss": 0.0247, - "step": 965 - }, - { - "epoch": 11.5, - "grad_norm": 15.739331158348278, - "learning_rate": 8.073928733753176e-06, - "loss": 0.066, - "step": 966 - }, - { - "epoch": 11.511904761904763, - "grad_norm": 2.5580355333647584, - "learning_rate": 8.055008022033551e-06, - "loss": 0.0684, - "step": 967 - }, - { - "epoch": 11.523809523809524, - "grad_norm": 4.913451078682193, - "learning_rate": 8.036094544259827e-06, - "loss": 0.1084, - "step": 968 - }, - { - "epoch": 11.535714285714286, - "grad_norm": 3.2726257546182387, - "learning_rate": 8.017188370776291e-06, - "loss": 0.0523, - "step": 969 - }, - { - "epoch": 11.547619047619047, - "grad_norm": 2.6146492746923258, - "learning_rate": 7.998289571900067e-06, - "loss": 0.034, - "step": 970 - }, - { - "epoch": 11.55952380952381, - "grad_norm": 1.625512471741649, - "learning_rate": 7.979398217920848e-06, - "loss": 0.0222, - "step": 971 - }, - { - "epoch": 11.571428571428571, - "grad_norm": 0.8994325256152944, - "learning_rate": 7.960514379100632e-06, - "loss": 0.0073, - "step": 972 - }, - { - "epoch": 11.583333333333334, - "grad_norm": 0.7340148368002074, - "learning_rate": 7.941638125673475e-06, - "loss": 0.0084, - "step": 973 - }, - { - "epoch": 11.595238095238095, - "grad_norm": 1.9951166193182492, - "learning_rate": 7.922769527845218e-06, - "loss": 0.0249, - "step": 974 - }, - { - "epoch": 11.607142857142858, - "grad_norm": 3.943314618173986, - "learning_rate": 7.903908655793224e-06, - "loss": 0.0648, - "step": 975 - }, - { - "epoch": 11.619047619047619, - "grad_norm": 2.4252278861240155, - "learning_rate": 7.885055579666134e-06, - "loss": 0.0869, - "step": 976 - }, - { - "epoch": 11.630952380952381, - "grad_norm": 2.3616939840761737, - "learning_rate": 7.866210369583575e-06, - "loss": 0.0357, - "step": 977 - }, - { - "epoch": 11.642857142857142, - "grad_norm": 1.498225825498272, - "learning_rate": 7.847373095635937e-06, - "loss": 0.0311, - "step": 978 - }, - { - "epoch": 11.654761904761905, - "grad_norm": 5.646623285086258, - "learning_rate": 7.82854382788408e-06, - "loss": 0.0653, - "step": 979 - }, - { - "epoch": 11.666666666666666, - "grad_norm": 0.8452190367515048, - "learning_rate": 7.809722636359097e-06, - "loss": 0.0119, - "step": 980 - }, - { - "epoch": 11.678571428571429, - "grad_norm": 2.1528464430456014, - "learning_rate": 7.790909591062033e-06, - "loss": 0.0146, - "step": 981 - }, - { - "epoch": 11.69047619047619, - "grad_norm": 4.224232166285187, - "learning_rate": 7.772104761963645e-06, - "loss": 0.0638, - "step": 982 - }, - { - "epoch": 11.702380952380953, - "grad_norm": 2.8975023694327957, - "learning_rate": 7.753308219004124e-06, - "loss": 0.0534, - "step": 983 - }, - { - "epoch": 11.714285714285714, - "grad_norm": 1.4871431291781625, - "learning_rate": 7.734520032092845e-06, - "loss": 0.0147, - "step": 984 - }, - { - "epoch": 11.726190476190476, - "grad_norm": 2.817660054546793, - "learning_rate": 7.715740271108108e-06, - "loss": 0.0334, - "step": 985 - }, - { - "epoch": 11.738095238095237, - "grad_norm": 2.265439459023376, - "learning_rate": 7.69696900589687e-06, - "loss": 0.0351, - "step": 986 - }, - { - "epoch": 11.75, - "grad_norm": 2.820518716548115, - "learning_rate": 7.678206306274495e-06, - "loss": 0.0373, - "step": 987 - }, - { - "epoch": 11.761904761904763, - "grad_norm": 1.4555114365738053, - "learning_rate": 7.659452242024483e-06, - "loss": 0.0359, - "step": 988 - }, - { - "epoch": 11.773809523809524, - "grad_norm": 2.148007272965159, - "learning_rate": 7.64070688289822e-06, - "loss": 0.0427, - "step": 989 - }, - { - "epoch": 11.785714285714286, - "grad_norm": 2.6798202271627423, - "learning_rate": 7.621970298614717e-06, - "loss": 0.046, - "step": 990 - }, - { - "epoch": 11.797619047619047, - "grad_norm": 0.5172075857377311, - "learning_rate": 7.6032425588603465e-06, - "loss": 0.0085, - "step": 991 - }, - { - "epoch": 11.80952380952381, - "grad_norm": 2.8043769978606914, - "learning_rate": 7.584523733288589e-06, - "loss": 0.0491, - "step": 992 - }, - { - "epoch": 11.821428571428571, - "grad_norm": 1.37174029863172, - "learning_rate": 7.565813891519766e-06, - "loss": 0.0202, - "step": 993 - }, - { - "epoch": 11.833333333333334, - "grad_norm": 0.2854308071521844, - "learning_rate": 7.547113103140786e-06, - "loss": 0.0021, - "step": 994 - }, - { - "epoch": 11.845238095238095, - "grad_norm": 3.484971218422417, - "learning_rate": 7.528421437704891e-06, - "loss": 0.0494, - "step": 995 - }, - { - "epoch": 11.857142857142858, - "grad_norm": 4.439639213595532, - "learning_rate": 7.509738964731389e-06, - "loss": 0.0382, - "step": 996 - }, - { - "epoch": 11.869047619047619, - "grad_norm": 2.9936609268911947, - "learning_rate": 7.4910657537054e-06, - "loss": 0.076, - "step": 997 - }, - { - "epoch": 11.880952380952381, - "grad_norm": 1.3741364759132064, - "learning_rate": 7.472401874077593e-06, - "loss": 0.0159, - "step": 998 - }, - { - "epoch": 11.892857142857142, - "grad_norm": 1.7029666602982898, - "learning_rate": 7.453747395263932e-06, - "loss": 0.0197, - "step": 999 - }, - { - "epoch": 11.904761904761905, - "grad_norm": 3.210865127366338, - "learning_rate": 7.435102386645421e-06, - "loss": 0.0172, - "step": 1000 - }, - { - "epoch": 11.916666666666666, - "grad_norm": 2.230829971481931, - "learning_rate": 7.4164669175678376e-06, - "loss": 0.0169, - "step": 1001 - }, - { - "epoch": 11.928571428571429, - "grad_norm": 2.19343014063508, - "learning_rate": 7.3978410573414795e-06, - "loss": 0.0337, - "step": 1002 - }, - { - "epoch": 11.94047619047619, - "grad_norm": 1.8146120196303304, - "learning_rate": 7.3792248752409116e-06, - "loss": 0.0147, - "step": 1003 - }, - { - "epoch": 11.952380952380953, - "grad_norm": 2.044635364554011, - "learning_rate": 7.360618440504695e-06, - "loss": 0.0273, - "step": 1004 - }, - { - "epoch": 11.964285714285714, - "grad_norm": 1.5594003161322363, - "learning_rate": 7.342021822335144e-06, - "loss": 0.0408, - "step": 1005 - }, - { - "epoch": 11.976190476190476, - "grad_norm": 1.9622001194940828, - "learning_rate": 7.323435089898059e-06, - "loss": 0.0194, - "step": 1006 - }, - { - "epoch": 11.988095238095237, - "grad_norm": 1.1152284378601478, - "learning_rate": 7.3048583123224745e-06, - "loss": 0.0067, - "step": 1007 - }, - { - "epoch": 12.0, - "grad_norm": 1.4220342011365041, - "learning_rate": 7.2862915587004e-06, - "loss": 0.0099, - "step": 1008 - }, - { - "epoch": 12.0, - "eval_loss": 0.09689878672361374, - "eval_runtime": 40.9326, - "eval_samples_per_second": 1.466, - "eval_steps_per_second": 1.466, - "step": 1008 - }, - { - "epoch": 12.011904761904763, - "grad_norm": 1.0021159288146593, - "learning_rate": 7.267734898086565e-06, - "loss": 0.0068, - "step": 1009 - }, - { - "epoch": 12.023809523809524, - "grad_norm": 1.2619101524108916, - "learning_rate": 7.249188399498159e-06, - "loss": 0.0117, - "step": 1010 - }, - { - "epoch": 12.035714285714286, - "grad_norm": 0.734941316803662, - "learning_rate": 7.230652131914574e-06, - "loss": 0.0195, - "step": 1011 - }, - { - "epoch": 12.047619047619047, - "grad_norm": 2.079991861318786, - "learning_rate": 7.212126164277151e-06, - "loss": 0.0241, - "step": 1012 - }, - { - "epoch": 12.05952380952381, - "grad_norm": 3.0509428341350437, - "learning_rate": 7.1936105654889245e-06, - "loss": 0.0689, - "step": 1013 - }, - { - "epoch": 12.071428571428571, - "grad_norm": 2.619913977958957, - "learning_rate": 7.1751054044143616e-06, - "loss": 0.0477, - "step": 1014 - }, - { - "epoch": 12.083333333333334, - "grad_norm": 1.151849460495169, - "learning_rate": 7.156610749879116e-06, - "loss": 0.0156, - "step": 1015 - }, - { - "epoch": 12.095238095238095, - "grad_norm": 2.5505348766082467, - "learning_rate": 7.1381266706697564e-06, - "loss": 0.0425, - "step": 1016 - }, - { - "epoch": 12.107142857142858, - "grad_norm": 1.7373338551385542, - "learning_rate": 7.11965323553352e-06, - "loss": 0.0172, - "step": 1017 - }, - { - "epoch": 12.119047619047619, - "grad_norm": 2.940052483151144, - "learning_rate": 7.10119051317806e-06, - "loss": 0.0497, - "step": 1018 - }, - { - "epoch": 12.130952380952381, - "grad_norm": 3.3383244130827414, - "learning_rate": 7.082738572271185e-06, - "loss": 0.0464, - "step": 1019 - }, - { - "epoch": 12.142857142857142, - "grad_norm": 2.0013889730979533, - "learning_rate": 7.064297481440601e-06, - "loss": 0.0327, - "step": 1020 - }, - { - "epoch": 12.154761904761905, - "grad_norm": 1.8008055977885011, - "learning_rate": 7.045867309273664e-06, - "loss": 0.0357, - "step": 1021 - }, - { - "epoch": 12.166666666666666, - "grad_norm": 1.7122862048563616, - "learning_rate": 7.02744812431712e-06, - "loss": 0.0165, - "step": 1022 - }, - { - "epoch": 12.178571428571429, - "grad_norm": 0.5421740170140353, - "learning_rate": 7.009039995076845e-06, - "loss": 0.0102, - "step": 1023 - }, - { - "epoch": 12.19047619047619, - "grad_norm": 3.3659274571341875, - "learning_rate": 6.990642990017603e-06, - "loss": 0.0257, - "step": 1024 - }, - { - "epoch": 12.202380952380953, - "grad_norm": 2.6776534186312615, - "learning_rate": 6.97225717756278e-06, - "loss": 0.0464, - "step": 1025 - }, - { - "epoch": 12.214285714285714, - "grad_norm": 1.89868056337621, - "learning_rate": 6.953882626094136e-06, - "loss": 0.0363, - "step": 1026 - }, - { - "epoch": 12.226190476190476, - "grad_norm": 1.6557758156145048, - "learning_rate": 6.935519403951549e-06, - "loss": 0.0241, - "step": 1027 - }, - { - "epoch": 12.238095238095237, - "grad_norm": 2.6942588170276003, - "learning_rate": 6.917167579432753e-06, - "loss": 0.0116, - "step": 1028 - }, - { - "epoch": 12.25, - "grad_norm": 4.021190726312291, - "learning_rate": 6.898827220793103e-06, - "loss": 0.0134, - "step": 1029 - }, - { - "epoch": 12.261904761904763, - "grad_norm": 6.859494774187862, - "learning_rate": 6.880498396245298e-06, - "loss": 0.0254, - "step": 1030 - }, - { - "epoch": 12.273809523809524, - "grad_norm": 1.8169990791501511, - "learning_rate": 6.862181173959146e-06, - "loss": 0.0259, - "step": 1031 - }, - { - "epoch": 12.285714285714286, - "grad_norm": 2.420725120451796, - "learning_rate": 6.8438756220613045e-06, - "loss": 0.059, - "step": 1032 - }, - { - "epoch": 12.297619047619047, - "grad_norm": 3.0165246425408396, - "learning_rate": 6.825581808635016e-06, - "loss": 0.0133, - "step": 1033 - }, - { - "epoch": 12.30952380952381, - "grad_norm": 2.6905776992698254, - "learning_rate": 6.807299801719871e-06, - "loss": 0.0389, - "step": 1034 - }, - { - "epoch": 12.321428571428571, - "grad_norm": 1.7274248581300429, - "learning_rate": 6.789029669311551e-06, - "loss": 0.0077, - "step": 1035 - }, - { - "epoch": 12.333333333333334, - "grad_norm": 1.8001126965802912, - "learning_rate": 6.770771479361568e-06, - "loss": 0.0344, - "step": 1036 - }, - { - "epoch": 12.345238095238095, - "grad_norm": 2.7849018104793024, - "learning_rate": 6.752525299777021e-06, - "loss": 0.0181, - "step": 1037 - }, - { - "epoch": 12.357142857142858, - "grad_norm": 0.5733123316569594, - "learning_rate": 6.734291198420333e-06, - "loss": 0.0033, - "step": 1038 - }, - { - "epoch": 12.369047619047619, - "grad_norm": 3.4183200124225133, - "learning_rate": 6.716069243109011e-06, - "loss": 0.0348, - "step": 1039 - }, - { - "epoch": 12.380952380952381, - "grad_norm": 2.19111813065019, - "learning_rate": 6.697859501615387e-06, - "loss": 0.0262, - "step": 1040 - }, - { - "epoch": 12.392857142857142, - "grad_norm": 1.413820906012132, - "learning_rate": 6.679662041666361e-06, - "loss": 0.0157, - "step": 1041 - }, - { - "epoch": 12.404761904761905, - "grad_norm": 2.1775312629739765, - "learning_rate": 6.661476930943162e-06, - "loss": 0.0172, - "step": 1042 - }, - { - "epoch": 12.416666666666666, - "grad_norm": 3.0659515472526655, - "learning_rate": 6.643304237081088e-06, - "loss": 0.0335, - "step": 1043 - }, - { - "epoch": 12.428571428571429, - "grad_norm": 1.8628049550491268, - "learning_rate": 6.625144027669246e-06, - "loss": 0.0222, - "step": 1044 - }, - { - "epoch": 12.44047619047619, - "grad_norm": 0.7175217914778912, - "learning_rate": 6.606996370250319e-06, - "loss": 0.0092, - "step": 1045 - }, - { - "epoch": 12.452380952380953, - "grad_norm": 1.478121964073369, - "learning_rate": 6.588861332320307e-06, - "loss": 0.037, - "step": 1046 - }, - { - "epoch": 12.464285714285714, - "grad_norm": 2.3923451246112717, - "learning_rate": 6.570738981328266e-06, - "loss": 0.0746, - "step": 1047 - }, - { - "epoch": 12.476190476190476, - "grad_norm": 4.7820280255811225, - "learning_rate": 6.552629384676079e-06, - "loss": 0.0587, - "step": 1048 - }, - { - "epoch": 12.488095238095237, - "grad_norm": 5.042407963332626, - "learning_rate": 6.534532609718177e-06, - "loss": 0.0741, - "step": 1049 - }, - { - "epoch": 12.5, - "grad_norm": 1.992091322531725, - "learning_rate": 6.516448723761315e-06, - "loss": 0.0127, - "step": 1050 - }, - { - "epoch": 12.511904761904763, - "grad_norm": 2.0251462511451317, - "learning_rate": 6.498377794064303e-06, - "loss": 0.0354, - "step": 1051 - }, - { - "epoch": 12.523809523809524, - "grad_norm": 3.374309680561493, - "learning_rate": 6.480319887837772e-06, - "loss": 0.0326, - "step": 1052 - }, - { - "epoch": 12.535714285714286, - "grad_norm": 3.050395187529473, - "learning_rate": 6.4622750722439075e-06, - "loss": 0.022, - "step": 1053 - }, - { - "epoch": 12.547619047619047, - "grad_norm": 11.589276409101357, - "learning_rate": 6.4442434143962075e-06, - "loss": 0.054, - "step": 1054 - }, - { - "epoch": 12.55952380952381, - "grad_norm": 2.3449829130817137, - "learning_rate": 6.426224981359238e-06, - "loss": 0.0493, - "step": 1055 - }, - { - "epoch": 12.571428571428571, - "grad_norm": 2.44302873820673, - "learning_rate": 6.4082198401483755e-06, - "loss": 0.0238, - "step": 1056 - }, - { - "epoch": 12.583333333333334, - "grad_norm": 1.1669955142868396, - "learning_rate": 6.390228057729557e-06, - "loss": 0.0186, - "step": 1057 - }, - { - "epoch": 12.595238095238095, - "grad_norm": 0.9590754842243425, - "learning_rate": 6.372249701019044e-06, - "loss": 0.0099, - "step": 1058 - }, - { - "epoch": 12.607142857142858, - "grad_norm": 1.0699806800229257, - "learning_rate": 6.354284836883156e-06, - "loss": 0.0174, - "step": 1059 - }, - { - "epoch": 12.619047619047619, - "grad_norm": 2.8391167747003987, - "learning_rate": 6.336333532138032e-06, - "loss": 0.045, - "step": 1060 - }, - { - "epoch": 12.630952380952381, - "grad_norm": 0.6894773072049841, - "learning_rate": 6.318395853549379e-06, - "loss": 0.0041, - "step": 1061 - }, - { - "epoch": 12.642857142857142, - "grad_norm": 4.84209428246548, - "learning_rate": 6.30047186783223e-06, - "loss": 0.045, - "step": 1062 - }, - { - "epoch": 12.654761904761905, - "grad_norm": 0.7448463992353114, - "learning_rate": 6.282561641650682e-06, - "loss": 0.0094, - "step": 1063 - }, - { - "epoch": 12.666666666666666, - "grad_norm": 1.8024616854906685, - "learning_rate": 6.2646652416176665e-06, - "loss": 0.0093, - "step": 1064 - }, - { - "epoch": 12.678571428571429, - "grad_norm": 1.3429782506913301, - "learning_rate": 6.246782734294683e-06, - "loss": 0.0172, - "step": 1065 - }, - { - "epoch": 12.69047619047619, - "grad_norm": 2.963838717143966, - "learning_rate": 6.2289141861915635e-06, - "loss": 0.0208, - "step": 1066 - }, - { - "epoch": 12.702380952380953, - "grad_norm": 2.5846144874264043, - "learning_rate": 6.211059663766224e-06, - "loss": 0.0402, - "step": 1067 - }, - { - "epoch": 12.714285714285714, - "grad_norm": 2.560181092214124, - "learning_rate": 6.193219233424414e-06, - "loss": 0.0288, - "step": 1068 - }, - { - "epoch": 12.726190476190476, - "grad_norm": 2.0384912620129194, - "learning_rate": 6.1753929615194705e-06, - "loss": 0.0195, - "step": 1069 - }, - { - "epoch": 12.738095238095237, - "grad_norm": 1.4052060125033978, - "learning_rate": 6.15758091435207e-06, - "loss": 0.0074, - "step": 1070 - }, - { - "epoch": 12.75, - "grad_norm": 4.685196377107016, - "learning_rate": 6.139783158169985e-06, - "loss": 0.0337, - "step": 1071 - }, - { - "epoch": 12.761904761904763, - "grad_norm": 0.8218476974088521, - "learning_rate": 6.121999759167837e-06, - "loss": 0.0044, - "step": 1072 - }, - { - "epoch": 12.773809523809524, - "grad_norm": 4.0846199910982115, - "learning_rate": 6.104230783486846e-06, - "loss": 0.0523, - "step": 1073 - }, - { - "epoch": 12.785714285714286, - "grad_norm": 1.089610236862446, - "learning_rate": 6.086476297214594e-06, - "loss": 0.0127, - "step": 1074 - }, - { - "epoch": 12.797619047619047, - "grad_norm": 0.5229904428811587, - "learning_rate": 6.068736366384764e-06, - "loss": 0.0108, - "step": 1075 - }, - { - "epoch": 12.80952380952381, - "grad_norm": 0.5295945123203731, - "learning_rate": 6.05101105697691e-06, - "loss": 0.0069, - "step": 1076 - }, - { - "epoch": 12.821428571428571, - "grad_norm": 0.40635985353282955, - "learning_rate": 6.0333004349162025e-06, - "loss": 0.0065, - "step": 1077 - }, - { - "epoch": 12.833333333333334, - "grad_norm": 2.4730189484544107, - "learning_rate": 6.015604566073187e-06, - "loss": 0.013, - "step": 1078 - }, - { - "epoch": 12.845238095238095, - "grad_norm": 0.5170708412524836, - "learning_rate": 5.99792351626354e-06, - "loss": 0.0078, - "step": 1079 - }, - { - "epoch": 12.857142857142858, - "grad_norm": 0.7722760595256714, - "learning_rate": 5.980257351247818e-06, - "loss": 0.0122, - "step": 1080 - }, - { - "epoch": 12.869047619047619, - "grad_norm": 0.9562287992871482, - "learning_rate": 5.962606136731217e-06, - "loss": 0.015, - "step": 1081 - }, - { - "epoch": 12.880952380952381, - "grad_norm": 2.57487715422707, - "learning_rate": 5.944969938363332e-06, - "loss": 0.0558, - "step": 1082 - }, - { - "epoch": 12.892857142857142, - "grad_norm": 5.175353888734591, - "learning_rate": 5.927348821737906e-06, - "loss": 0.0775, - "step": 1083 - }, - { - "epoch": 12.904761904761905, - "grad_norm": 3.3863214768962635, - "learning_rate": 5.9097428523925874e-06, - "loss": 0.0587, - "step": 1084 - }, - { - "epoch": 12.916666666666666, - "grad_norm": 2.3710552627489503, - "learning_rate": 5.892152095808691e-06, - "loss": 0.0114, - "step": 1085 - }, - { - "epoch": 12.928571428571429, - "grad_norm": 1.7382649132331074, - "learning_rate": 5.87457661741095e-06, - "loss": 0.016, - "step": 1086 - }, - { - "epoch": 12.94047619047619, - "grad_norm": 0.7673832218317043, - "learning_rate": 5.857016482567275e-06, - "loss": 0.0144, - "step": 1087 - }, - { - "epoch": 12.952380952380953, - "grad_norm": 1.1565970722585457, - "learning_rate": 5.83947175658851e-06, - "loss": 0.016, - "step": 1088 - }, - { - "epoch": 12.964285714285714, - "grad_norm": 1.3519643426410404, - "learning_rate": 5.821942504728183e-06, - "loss": 0.0254, - "step": 1089 - }, - { - "epoch": 12.976190476190476, - "grad_norm": 2.91693108261119, - "learning_rate": 5.80442879218228e-06, - "loss": 0.0419, - "step": 1090 - }, - { - "epoch": 12.988095238095237, - "grad_norm": 1.747082227614345, - "learning_rate": 5.786930684088988e-06, - "loss": 0.0192, - "step": 1091 - }, - { - "epoch": 13.0, - "grad_norm": 2.0947996297444966, - "learning_rate": 5.769448245528451e-06, - "loss": 0.0347, - "step": 1092 - }, - { - "epoch": 13.0, - "eval_loss": 0.16690757870674133, - "eval_runtime": 41.0419, - "eval_samples_per_second": 1.462, - "eval_steps_per_second": 1.462, - "step": 1092 - } - ], - "logging_steps": 1.0, - "max_steps": 1680, - "num_input_tokens_seen": 0, - "num_train_epochs": 20, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 304017211392000.0, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}