{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.22302621797095704, "eval_steps": 200000, "global_step": 13500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016520460590441263, "grad_norm": 51.92022705078125, "learning_rate": 3.2044928972580115e-07, "loss": 13.3171, "step": 100 }, { "epoch": 0.0033040921180882525, "grad_norm": 68.25243377685547, "learning_rate": 6.508093822266271e-07, "loss": 12.9799, "step": 200 }, { "epoch": 0.004956138177132379, "grad_norm": 69.47785186767578, "learning_rate": 9.811694747274531e-07, "loss": 12.5133, "step": 300 }, { "epoch": 0.006608184236176505, "grad_norm": 73.07315063476562, "learning_rate": 1.311529567228279e-06, "loss": 11.9388, "step": 400 }, { "epoch": 0.008260230295220631, "grad_norm": 82.68733215332031, "learning_rate": 1.6418896597291048e-06, "loss": 11.0616, "step": 500 }, { "epoch": 0.009912276354264758, "grad_norm": 57.61735534667969, "learning_rate": 1.972249752229931e-06, "loss": 10.2712, "step": 600 }, { "epoch": 0.011564322413308884, "grad_norm": 44.42943572998047, "learning_rate": 2.302609844730757e-06, "loss": 9.5253, "step": 700 }, { "epoch": 0.01321636847235301, "grad_norm": 27.03646469116211, "learning_rate": 2.6329699372315828e-06, "loss": 8.7706, "step": 800 }, { "epoch": 0.014868414531397135, "grad_norm": 15.231706619262695, "learning_rate": 2.9633300297324087e-06, "loss": 8.4333, "step": 900 }, { "epoch": 0.016520460590441263, "grad_norm": 14.189949035644531, "learning_rate": 3.2936901222332346e-06, "loss": 8.0902, "step": 1000 }, { "epoch": 0.018172506649485387, "grad_norm": 12.241333961486816, "learning_rate": 3.6240502147340605e-06, "loss": 7.8862, "step": 1100 }, { "epoch": 0.019824552708529515, "grad_norm": 11.400131225585938, "learning_rate": 3.9544103072348865e-06, "loss": 7.7362, "step": 1200 }, { "epoch": 0.02147659876757364, "grad_norm": 12.072014808654785, "learning_rate": 4.284770399735712e-06, "loss": 7.6007, "step": 1300 }, { "epoch": 0.023128644826617768, "grad_norm": 11.08774185180664, "learning_rate": 4.615130492236538e-06, "loss": 7.5304, "step": 1400 }, { "epoch": 0.024780690885661892, "grad_norm": 13.02505874633789, "learning_rate": 4.945490584737364e-06, "loss": 7.4249, "step": 1500 }, { "epoch": 0.02643273694470602, "grad_norm": 13.522186279296875, "learning_rate": 5.27585067723819e-06, "loss": 7.3035, "step": 1600 }, { "epoch": 0.028084783003750145, "grad_norm": 45.22550964355469, "learning_rate": 5.606210769739015e-06, "loss": 7.2026, "step": 1700 }, { "epoch": 0.02973682906279427, "grad_norm": 15.62098503112793, "learning_rate": 5.936570862239842e-06, "loss": 7.1572, "step": 1800 }, { "epoch": 0.0313888751218384, "grad_norm": 16.570518493652344, "learning_rate": 6.266930954740668e-06, "loss": 7.0523, "step": 1900 }, { "epoch": 0.033040921180882525, "grad_norm": 16.82353401184082, "learning_rate": 6.597291047241494e-06, "loss": 7.1158, "step": 2000 }, { "epoch": 0.034692967239926646, "grad_norm": 17.38075828552246, "learning_rate": 6.924347538817311e-06, "loss": 6.9856, "step": 2100 }, { "epoch": 0.036345013298970774, "grad_norm": 93.04572296142578, "learning_rate": 7.2547076313181375e-06, "loss": 7.0865, "step": 2200 }, { "epoch": 0.0379970593580149, "grad_norm": 17.861074447631836, "learning_rate": 7.585067723818963e-06, "loss": 6.9496, "step": 2300 }, { "epoch": 0.03964910541705903, "grad_norm": 19.067747116088867, "learning_rate": 7.91542781631979e-06, "loss": 6.9294, "step": 2400 }, { "epoch": 0.04130115147610315, "grad_norm": 16.43912696838379, "learning_rate": 8.245787908820615e-06, "loss": 6.8825, "step": 2500 }, { "epoch": 0.04295319753514728, "grad_norm": 140.5387725830078, "learning_rate": 8.576148001321441e-06, "loss": 6.8218, "step": 2600 }, { "epoch": 0.04460524359419141, "grad_norm": 22.34341049194336, "learning_rate": 8.903204492897258e-06, "loss": 6.8416, "step": 2700 }, { "epoch": 0.046257289653235535, "grad_norm": 16.260499954223633, "learning_rate": 9.233564585398084e-06, "loss": 6.7184, "step": 2800 }, { "epoch": 0.047909335712279656, "grad_norm": 20.075071334838867, "learning_rate": 9.56392467789891e-06, "loss": 6.9183, "step": 2900 }, { "epoch": 0.049561381771323784, "grad_norm": 45.1911735534668, "learning_rate": 9.894284770399738e-06, "loss": 6.7166, "step": 3000 }, { "epoch": 0.05121342783036791, "grad_norm": 67.39335632324219, "learning_rate": 1.0224644862900564e-05, "loss": 6.6821, "step": 3100 }, { "epoch": 0.05286547388941204, "grad_norm": 80.69914245605469, "learning_rate": 1.055500495540139e-05, "loss": 6.6074, "step": 3200 }, { "epoch": 0.05451751994845616, "grad_norm": 37.51483917236328, "learning_rate": 1.0885365047902214e-05, "loss": 6.6141, "step": 3300 }, { "epoch": 0.05616956600750029, "grad_norm": 127.3297348022461, "learning_rate": 1.121572514040304e-05, "loss": 6.5374, "step": 3400 }, { "epoch": 0.05782161206654442, "grad_norm": 20.704940795898438, "learning_rate": 1.1546085232903866e-05, "loss": 6.4776, "step": 3500 }, { "epoch": 0.05947365812558854, "grad_norm": 23.68699836730957, "learning_rate": 1.1876445325404693e-05, "loss": 6.5701, "step": 3600 }, { "epoch": 0.061125704184632666, "grad_norm": 104.68245697021484, "learning_rate": 1.2206805417905519e-05, "loss": 6.5026, "step": 3700 }, { "epoch": 0.0627777502436768, "grad_norm": 97.47430419921875, "learning_rate": 1.2537165510406343e-05, "loss": 6.6502, "step": 3800 }, { "epoch": 0.06442979630272092, "grad_norm": 21.512229919433594, "learning_rate": 1.286752560290717e-05, "loss": 6.5023, "step": 3900 }, { "epoch": 0.06608184236176505, "grad_norm": 31.69252586364746, "learning_rate": 1.3197885695407995e-05, "loss": 6.5526, "step": 4000 }, { "epoch": 0.06773388842080917, "grad_norm": 22.141067504882812, "learning_rate": 1.3528245787908823e-05, "loss": 6.6594, "step": 4100 }, { "epoch": 0.06938593447985329, "grad_norm": 23.37205696105957, "learning_rate": 1.3858605880409649e-05, "loss": 6.3643, "step": 4200 }, { "epoch": 0.07103798053889743, "grad_norm": 23.31827163696289, "learning_rate": 1.4188965972910473e-05, "loss": 6.3783, "step": 4300 }, { "epoch": 0.07269002659794155, "grad_norm": 27.043312072753906, "learning_rate": 1.4519326065411299e-05, "loss": 6.3222, "step": 4400 }, { "epoch": 0.07434207265698568, "grad_norm": 25.699583053588867, "learning_rate": 1.4846382556987117e-05, "loss": 6.3401, "step": 4500 }, { "epoch": 0.0759941187160298, "grad_norm": 24.91438865661621, "learning_rate": 1.5176742649487943e-05, "loss": 6.4005, "step": 4600 }, { "epoch": 0.07764616477507393, "grad_norm": 38.77157974243164, "learning_rate": 1.5507102741988768e-05, "loss": 6.3605, "step": 4700 }, { "epoch": 0.07929821083411806, "grad_norm": 156.87989807128906, "learning_rate": 1.5837462834489594e-05, "loss": 6.348, "step": 4800 }, { "epoch": 0.08095025689316218, "grad_norm": 110.80547332763672, "learning_rate": 1.6167822926990423e-05, "loss": 6.3406, "step": 4900 }, { "epoch": 0.0826023029522063, "grad_norm": 48.55455780029297, "learning_rate": 1.649818301949125e-05, "loss": 6.4156, "step": 5000 }, { "epoch": 0.08425434901125044, "grad_norm": 25.825349807739258, "learning_rate": 1.682854311199207e-05, "loss": 6.3786, "step": 5100 }, { "epoch": 0.08590639507029456, "grad_norm": 55.6208381652832, "learning_rate": 1.7158903204492897e-05, "loss": 6.376, "step": 5200 }, { "epoch": 0.08755844112933868, "grad_norm": 37.82964324951172, "learning_rate": 1.7489263296993723e-05, "loss": 6.2363, "step": 5300 }, { "epoch": 0.08921048718838281, "grad_norm": 32.86615753173828, "learning_rate": 1.7819623389494553e-05, "loss": 6.2185, "step": 5400 }, { "epoch": 0.09086253324742694, "grad_norm": 180.8863525390625, "learning_rate": 1.814998348199538e-05, "loss": 6.2554, "step": 5500 }, { "epoch": 0.09251457930647107, "grad_norm": 25.11360740661621, "learning_rate": 1.84803435744962e-05, "loss": 6.2177, "step": 5600 }, { "epoch": 0.09416662536551519, "grad_norm": 23.702716827392578, "learning_rate": 1.8810703666997027e-05, "loss": 6.3924, "step": 5700 }, { "epoch": 0.09581867142455931, "grad_norm": 32.1275634765625, "learning_rate": 1.9141063759497853e-05, "loss": 6.2897, "step": 5800 }, { "epoch": 0.09747071748360345, "grad_norm": 46.22661590576172, "learning_rate": 1.9471423851998682e-05, "loss": 6.272, "step": 5900 }, { "epoch": 0.09912276354264757, "grad_norm": 74.11865234375, "learning_rate": 1.9801783944499505e-05, "loss": 6.0247, "step": 6000 }, { "epoch": 0.10077480960169169, "grad_norm": 34.50657653808594, "learning_rate": 1.9985314903537273e-05, "loss": 6.194, "step": 6100 }, { "epoch": 0.10242685566073582, "grad_norm": 25.600902557373047, "learning_rate": 1.9948602162380454e-05, "loss": 6.2757, "step": 6200 }, { "epoch": 0.10407890171977995, "grad_norm": 24.53876495361328, "learning_rate": 1.9911889421223638e-05, "loss": 6.2408, "step": 6300 }, { "epoch": 0.10573094777882408, "grad_norm": 22.572052001953125, "learning_rate": 1.987517668006682e-05, "loss": 6.253, "step": 6400 }, { "epoch": 0.1073829938378682, "grad_norm": 33.04438018798828, "learning_rate": 1.983846393891e-05, "loss": 6.0605, "step": 6500 }, { "epoch": 0.10903503989691232, "grad_norm": 81.35254669189453, "learning_rate": 1.9801751197753184e-05, "loss": 6.0672, "step": 6600 }, { "epoch": 0.11068708595595646, "grad_norm": 31.132247924804688, "learning_rate": 1.9765038456596365e-05, "loss": 6.0414, "step": 6700 }, { "epoch": 0.11233913201500058, "grad_norm": 42.16621017456055, "learning_rate": 1.9728325715439546e-05, "loss": 6.0823, "step": 6800 }, { "epoch": 0.1139911780740447, "grad_norm": 23.558713912963867, "learning_rate": 1.9691612974282726e-05, "loss": 6.1962, "step": 6900 }, { "epoch": 0.11564322413308883, "grad_norm": 69.28414154052734, "learning_rate": 1.9654900233125907e-05, "loss": 6.0868, "step": 7000 }, { "epoch": 0.11729527019213296, "grad_norm": 29.037137985229492, "learning_rate": 1.9618187491969088e-05, "loss": 6.0795, "step": 7100 }, { "epoch": 0.11894731625117708, "grad_norm": 29.588781356811523, "learning_rate": 1.9581474750812272e-05, "loss": 5.9656, "step": 7200 }, { "epoch": 0.12059936231022121, "grad_norm": 29.574968338012695, "learning_rate": 1.9544762009655453e-05, "loss": 5.9785, "step": 7300 }, { "epoch": 0.12225140836926533, "grad_norm": 46.092193603515625, "learning_rate": 1.9508049268498634e-05, "loss": 6.0722, "step": 7400 }, { "epoch": 0.12390345442830947, "grad_norm": 23.927968978881836, "learning_rate": 1.9471336527341815e-05, "loss": 5.9443, "step": 7500 }, { "epoch": 0.1255555004873536, "grad_norm": 21.281776428222656, "learning_rate": 1.9434623786184995e-05, "loss": 5.8786, "step": 7600 }, { "epoch": 0.1272075465463977, "grad_norm": 27.455034255981445, "learning_rate": 1.939791104502818e-05, "loss": 5.8007, "step": 7700 }, { "epoch": 0.12885959260544183, "grad_norm": 33.76934814453125, "learning_rate": 1.936119830387136e-05, "loss": 5.9206, "step": 7800 }, { "epoch": 0.13051163866448598, "grad_norm": 21.891183853149414, "learning_rate": 1.932448556271454e-05, "loss": 5.918, "step": 7900 }, { "epoch": 0.1321636847235301, "grad_norm": 61.087398529052734, "learning_rate": 1.9287772821557725e-05, "loss": 5.9443, "step": 8000 }, { "epoch": 0.13381573078257422, "grad_norm": 23.860267639160156, "learning_rate": 1.9251060080400906e-05, "loss": 5.8764, "step": 8100 }, { "epoch": 0.13546777684161834, "grad_norm": 26.501821517944336, "learning_rate": 1.9214714466655654e-05, "loss": 5.867, "step": 8200 }, { "epoch": 0.13711982290066246, "grad_norm": 43.38287353515625, "learning_rate": 1.9178001725498835e-05, "loss": 5.8087, "step": 8300 }, { "epoch": 0.13877186895970658, "grad_norm": 73.06561279296875, "learning_rate": 1.9141288984342016e-05, "loss": 5.9884, "step": 8400 }, { "epoch": 0.14042391501875073, "grad_norm": 36.368717193603516, "learning_rate": 1.91045762431852e-05, "loss": 5.8741, "step": 8500 }, { "epoch": 0.14207596107779485, "grad_norm": 136.38865661621094, "learning_rate": 1.906786350202838e-05, "loss": 5.9699, "step": 8600 }, { "epoch": 0.14372800713683898, "grad_norm": 38.05315017700195, "learning_rate": 1.903115076087156e-05, "loss": 5.8671, "step": 8700 }, { "epoch": 0.1453800531958831, "grad_norm": 39.74106216430664, "learning_rate": 1.8994438019714742e-05, "loss": 5.8278, "step": 8800 }, { "epoch": 0.14703209925492722, "grad_norm": 31.016155242919922, "learning_rate": 1.8957725278557926e-05, "loss": 5.8892, "step": 8900 }, { "epoch": 0.14868414531397137, "grad_norm": 36.37879943847656, "learning_rate": 1.8921012537401107e-05, "loss": 5.7437, "step": 9000 }, { "epoch": 0.1503361913730155, "grad_norm": 31.93881607055664, "learning_rate": 1.8884299796244288e-05, "loss": 5.8069, "step": 9100 }, { "epoch": 0.1519882374320596, "grad_norm": 24.248807907104492, "learning_rate": 1.8847587055087472e-05, "loss": 6.0235, "step": 9200 }, { "epoch": 0.15364028349110373, "grad_norm": 29.67982292175293, "learning_rate": 1.8810874313930653e-05, "loss": 5.7214, "step": 9300 }, { "epoch": 0.15529232955014785, "grad_norm": 34.80620193481445, "learning_rate": 1.8774161572773834e-05, "loss": 5.7893, "step": 9400 }, { "epoch": 0.15694437560919197, "grad_norm": 31.375019073486328, "learning_rate": 1.8737448831617015e-05, "loss": 5.7406, "step": 9500 }, { "epoch": 0.15859642166823612, "grad_norm": 24.126588821411133, "learning_rate": 1.8700736090460195e-05, "loss": 5.8035, "step": 9600 }, { "epoch": 0.16024846772728024, "grad_norm": 94.3121337890625, "learning_rate": 1.8664023349303376e-05, "loss": 5.7965, "step": 9700 }, { "epoch": 0.16190051378632436, "grad_norm": 29.543697357177734, "learning_rate": 1.8627310608146557e-05, "loss": 5.638, "step": 9800 }, { "epoch": 0.16355255984536848, "grad_norm": 27.004188537597656, "learning_rate": 1.859059786698974e-05, "loss": 5.8263, "step": 9900 }, { "epoch": 0.1652046059044126, "grad_norm": 31.72929573059082, "learning_rate": 1.8553885125832922e-05, "loss": 5.7995, "step": 10000 }, { "epoch": 0.16685665196345675, "grad_norm": 43.893436431884766, "learning_rate": 1.8517172384676103e-05, "loss": 5.5805, "step": 10100 }, { "epoch": 0.16850869802250087, "grad_norm": 40.329349517822266, "learning_rate": 1.8480459643519283e-05, "loss": 5.632, "step": 10200 }, { "epoch": 0.170160744081545, "grad_norm": 36.50722885131836, "learning_rate": 1.8443746902362468e-05, "loss": 5.6944, "step": 10300 }, { "epoch": 0.17181279014058912, "grad_norm": 68.61418151855469, "learning_rate": 1.840703416120565e-05, "loss": 5.5818, "step": 10400 }, { "epoch": 0.17346483619963324, "grad_norm": 38.758846282958984, "learning_rate": 1.837032142004883e-05, "loss": 5.8598, "step": 10500 }, { "epoch": 0.17511688225867736, "grad_norm": 51.770931243896484, "learning_rate": 1.8333975806303577e-05, "loss": 5.7255, "step": 10600 }, { "epoch": 0.1767689283177215, "grad_norm": 91.27816009521484, "learning_rate": 1.8297263065146758e-05, "loss": 5.7536, "step": 10700 }, { "epoch": 0.17842097437676563, "grad_norm": 35.52999496459961, "learning_rate": 1.8260550323989942e-05, "loss": 5.6536, "step": 10800 }, { "epoch": 0.18007302043580975, "grad_norm": 36.9012336730957, "learning_rate": 1.8223837582833123e-05, "loss": 5.6417, "step": 10900 }, { "epoch": 0.18172506649485387, "grad_norm": 37.2264404296875, "learning_rate": 1.8187124841676304e-05, "loss": 5.6719, "step": 11000 }, { "epoch": 0.183377112553898, "grad_norm": 31.076929092407227, "learning_rate": 1.8150412100519488e-05, "loss": 5.566, "step": 11100 }, { "epoch": 0.18502915861294214, "grad_norm": 34.78733444213867, "learning_rate": 1.811369935936267e-05, "loss": 5.4893, "step": 11200 }, { "epoch": 0.18668120467198626, "grad_norm": 68.41493225097656, "learning_rate": 1.807698661820585e-05, "loss": 5.7412, "step": 11300 }, { "epoch": 0.18833325073103038, "grad_norm": 43.99595260620117, "learning_rate": 1.804027387704903e-05, "loss": 5.6838, "step": 11400 }, { "epoch": 0.1899852967900745, "grad_norm": 30.06267547607422, "learning_rate": 1.8003561135892215e-05, "loss": 5.6272, "step": 11500 }, { "epoch": 0.19163734284911862, "grad_norm": 38.978031158447266, "learning_rate": 1.7966848394735395e-05, "loss": 5.6538, "step": 11600 }, { "epoch": 0.19328938890816275, "grad_norm": 34.604209899902344, "learning_rate": 1.7930135653578576e-05, "loss": 5.7176, "step": 11700 }, { "epoch": 0.1949414349672069, "grad_norm": 39.66080856323242, "learning_rate": 1.7893422912421757e-05, "loss": 5.4923, "step": 11800 }, { "epoch": 0.19659348102625102, "grad_norm": 39.9164924621582, "learning_rate": 1.7856710171264938e-05, "loss": 5.7643, "step": 11900 }, { "epoch": 0.19824552708529514, "grad_norm": 62.23050308227539, "learning_rate": 1.7819997430108122e-05, "loss": 5.5674, "step": 12000 }, { "epoch": 0.19989757314433926, "grad_norm": 57.77485656738281, "learning_rate": 1.7783284688951303e-05, "loss": 5.6896, "step": 12100 }, { "epoch": 0.20154961920338338, "grad_norm": 62.32257843017578, "learning_rate": 1.7746571947794483e-05, "loss": 5.4385, "step": 12200 }, { "epoch": 0.20320166526242753, "grad_norm": 72.59315490722656, "learning_rate": 1.7709859206637664e-05, "loss": 5.5851, "step": 12300 }, { "epoch": 0.20485371132147165, "grad_norm": 38.60813522338867, "learning_rate": 1.7673146465480845e-05, "loss": 5.5132, "step": 12400 }, { "epoch": 0.20650575738051577, "grad_norm": 46.002899169921875, "learning_rate": 1.763643372432403e-05, "loss": 5.3329, "step": 12500 }, { "epoch": 0.2081578034395599, "grad_norm": 54.40972900390625, "learning_rate": 1.759972098316721e-05, "loss": 5.4218, "step": 12600 }, { "epoch": 0.209809849498604, "grad_norm": 42.294403076171875, "learning_rate": 1.756337536942196e-05, "loss": 5.5171, "step": 12700 }, { "epoch": 0.21146189555764816, "grad_norm": 99.45050048828125, "learning_rate": 1.7526662628265142e-05, "loss": 5.3414, "step": 12800 }, { "epoch": 0.21311394161669228, "grad_norm": 29.550790786743164, "learning_rate": 1.7489949887108323e-05, "loss": 5.4921, "step": 12900 }, { "epoch": 0.2147659876757364, "grad_norm": 35.48351287841797, "learning_rate": 1.7453237145951504e-05, "loss": 5.7687, "step": 13000 }, { "epoch": 0.21641803373478052, "grad_norm": 35.474609375, "learning_rate": 1.7416524404794685e-05, "loss": 5.7119, "step": 13100 }, { "epoch": 0.21807007979382464, "grad_norm": 52.770469665527344, "learning_rate": 1.7379811663637865e-05, "loss": 5.4975, "step": 13200 }, { "epoch": 0.21972212585286877, "grad_norm": 41.083763122558594, "learning_rate": 1.7343098922481046e-05, "loss": 5.4514, "step": 13300 }, { "epoch": 0.22137417191191291, "grad_norm": 27.714067459106445, "learning_rate": 1.730638618132423e-05, "loss": 5.497, "step": 13400 }, { "epoch": 0.22302621797095704, "grad_norm": 646.4743041992188, "learning_rate": 1.726967344016741e-05, "loss": 5.558, "step": 13500 } ], "logging_steps": 100, "max_steps": 60531, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }