{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.38823082387536967,
  "eval_steps": 200000,
  "global_step": 23500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0016520460590441263,
      "grad_norm": 51.92022705078125,
      "learning_rate": 3.2044928972580115e-07,
      "loss": 13.3171,
      "step": 100
    },
    {
      "epoch": 0.0033040921180882525,
      "grad_norm": 68.25243377685547,
      "learning_rate": 6.508093822266271e-07,
      "loss": 12.9799,
      "step": 200
    },
    {
      "epoch": 0.004956138177132379,
      "grad_norm": 69.47785186767578,
      "learning_rate": 9.811694747274531e-07,
      "loss": 12.5133,
      "step": 300
    },
    {
      "epoch": 0.006608184236176505,
      "grad_norm": 73.07315063476562,
      "learning_rate": 1.311529567228279e-06,
      "loss": 11.9388,
      "step": 400
    },
    {
      "epoch": 0.008260230295220631,
      "grad_norm": 82.68733215332031,
      "learning_rate": 1.6418896597291048e-06,
      "loss": 11.0616,
      "step": 500
    },
    {
      "epoch": 0.009912276354264758,
      "grad_norm": 57.61735534667969,
      "learning_rate": 1.972249752229931e-06,
      "loss": 10.2712,
      "step": 600
    },
    {
      "epoch": 0.011564322413308884,
      "grad_norm": 44.42943572998047,
      "learning_rate": 2.302609844730757e-06,
      "loss": 9.5253,
      "step": 700
    },
    {
      "epoch": 0.01321636847235301,
      "grad_norm": 27.03646469116211,
      "learning_rate": 2.6329699372315828e-06,
      "loss": 8.7706,
      "step": 800
    },
    {
      "epoch": 0.014868414531397135,
      "grad_norm": 15.231706619262695,
      "learning_rate": 2.9633300297324087e-06,
      "loss": 8.4333,
      "step": 900
    },
    {
      "epoch": 0.016520460590441263,
      "grad_norm": 14.189949035644531,
      "learning_rate": 3.2936901222332346e-06,
      "loss": 8.0902,
      "step": 1000
    },
    {
      "epoch": 0.018172506649485387,
      "grad_norm": 12.241333961486816,
      "learning_rate": 3.6240502147340605e-06,
      "loss": 7.8862,
      "step": 1100
    },
    {
      "epoch": 0.019824552708529515,
      "grad_norm": 11.400131225585938,
      "learning_rate": 3.9544103072348865e-06,
      "loss": 7.7362,
      "step": 1200
    },
    {
      "epoch": 0.02147659876757364,
      "grad_norm": 12.072014808654785,
      "learning_rate": 4.284770399735712e-06,
      "loss": 7.6007,
      "step": 1300
    },
    {
      "epoch": 0.023128644826617768,
      "grad_norm": 11.08774185180664,
      "learning_rate": 4.615130492236538e-06,
      "loss": 7.5304,
      "step": 1400
    },
    {
      "epoch": 0.024780690885661892,
      "grad_norm": 13.02505874633789,
      "learning_rate": 4.945490584737364e-06,
      "loss": 7.4249,
      "step": 1500
    },
    {
      "epoch": 0.02643273694470602,
      "grad_norm": 13.522186279296875,
      "learning_rate": 5.27585067723819e-06,
      "loss": 7.3035,
      "step": 1600
    },
    {
      "epoch": 0.028084783003750145,
      "grad_norm": 45.22550964355469,
      "learning_rate": 5.606210769739015e-06,
      "loss": 7.2026,
      "step": 1700
    },
    {
      "epoch": 0.02973682906279427,
      "grad_norm": 15.62098503112793,
      "learning_rate": 5.936570862239842e-06,
      "loss": 7.1572,
      "step": 1800
    },
    {
      "epoch": 0.0313888751218384,
      "grad_norm": 16.570518493652344,
      "learning_rate": 6.266930954740668e-06,
      "loss": 7.0523,
      "step": 1900
    },
    {
      "epoch": 0.033040921180882525,
      "grad_norm": 16.82353401184082,
      "learning_rate": 6.597291047241494e-06,
      "loss": 7.1158,
      "step": 2000
    },
    {
      "epoch": 0.034692967239926646,
      "grad_norm": 17.38075828552246,
      "learning_rate": 6.924347538817311e-06,
      "loss": 6.9856,
      "step": 2100
    },
    {
      "epoch": 0.036345013298970774,
      "grad_norm": 93.04572296142578,
      "learning_rate": 7.2547076313181375e-06,
      "loss": 7.0865,
      "step": 2200
    },
    {
      "epoch": 0.0379970593580149,
      "grad_norm": 17.861074447631836,
      "learning_rate": 7.585067723818963e-06,
      "loss": 6.9496,
      "step": 2300
    },
    {
      "epoch": 0.03964910541705903,
      "grad_norm": 19.067747116088867,
      "learning_rate": 7.91542781631979e-06,
      "loss": 6.9294,
      "step": 2400
    },
    {
      "epoch": 0.04130115147610315,
      "grad_norm": 16.43912696838379,
      "learning_rate": 8.245787908820615e-06,
      "loss": 6.8825,
      "step": 2500
    },
    {
      "epoch": 0.04295319753514728,
      "grad_norm": 140.5387725830078,
      "learning_rate": 8.576148001321441e-06,
      "loss": 6.8218,
      "step": 2600
    },
    {
      "epoch": 0.04460524359419141,
      "grad_norm": 22.34341049194336,
      "learning_rate": 8.903204492897258e-06,
      "loss": 6.8416,
      "step": 2700
    },
    {
      "epoch": 0.046257289653235535,
      "grad_norm": 16.260499954223633,
      "learning_rate": 9.233564585398084e-06,
      "loss": 6.7184,
      "step": 2800
    },
    {
      "epoch": 0.047909335712279656,
      "grad_norm": 20.075071334838867,
      "learning_rate": 9.56392467789891e-06,
      "loss": 6.9183,
      "step": 2900
    },
    {
      "epoch": 0.049561381771323784,
      "grad_norm": 45.1911735534668,
      "learning_rate": 9.894284770399738e-06,
      "loss": 6.7166,
      "step": 3000
    },
    {
      "epoch": 0.05121342783036791,
      "grad_norm": 67.39335632324219,
      "learning_rate": 1.0224644862900564e-05,
      "loss": 6.6821,
      "step": 3100
    },
    {
      "epoch": 0.05286547388941204,
      "grad_norm": 80.69914245605469,
      "learning_rate": 1.055500495540139e-05,
      "loss": 6.6074,
      "step": 3200
    },
    {
      "epoch": 0.05451751994845616,
      "grad_norm": 37.51483917236328,
      "learning_rate": 1.0885365047902214e-05,
      "loss": 6.6141,
      "step": 3300
    },
    {
      "epoch": 0.05616956600750029,
      "grad_norm": 127.3297348022461,
      "learning_rate": 1.121572514040304e-05,
      "loss": 6.5374,
      "step": 3400
    },
    {
      "epoch": 0.05782161206654442,
      "grad_norm": 20.704940795898438,
      "learning_rate": 1.1546085232903866e-05,
      "loss": 6.4776,
      "step": 3500
    },
    {
      "epoch": 0.05947365812558854,
      "grad_norm": 23.68699836730957,
      "learning_rate": 1.1876445325404693e-05,
      "loss": 6.5701,
      "step": 3600
    },
    {
      "epoch": 0.061125704184632666,
      "grad_norm": 104.68245697021484,
      "learning_rate": 1.2206805417905519e-05,
      "loss": 6.5026,
      "step": 3700
    },
    {
      "epoch": 0.0627777502436768,
      "grad_norm": 97.47430419921875,
      "learning_rate": 1.2537165510406343e-05,
      "loss": 6.6502,
      "step": 3800
    },
    {
      "epoch": 0.06442979630272092,
      "grad_norm": 21.512229919433594,
      "learning_rate": 1.286752560290717e-05,
      "loss": 6.5023,
      "step": 3900
    },
    {
      "epoch": 0.06608184236176505,
      "grad_norm": 31.69252586364746,
      "learning_rate": 1.3197885695407995e-05,
      "loss": 6.5526,
      "step": 4000
    },
    {
      "epoch": 0.06773388842080917,
      "grad_norm": 22.141067504882812,
      "learning_rate": 1.3528245787908823e-05,
      "loss": 6.6594,
      "step": 4100
    },
    {
      "epoch": 0.06938593447985329,
      "grad_norm": 23.37205696105957,
      "learning_rate": 1.3858605880409649e-05,
      "loss": 6.3643,
      "step": 4200
    },
    {
      "epoch": 0.07103798053889743,
      "grad_norm": 23.31827163696289,
      "learning_rate": 1.4188965972910473e-05,
      "loss": 6.3783,
      "step": 4300
    },
    {
      "epoch": 0.07269002659794155,
      "grad_norm": 27.043312072753906,
      "learning_rate": 1.4519326065411299e-05,
      "loss": 6.3222,
      "step": 4400
    },
    {
      "epoch": 0.07434207265698568,
      "grad_norm": 25.699583053588867,
      "learning_rate": 1.4846382556987117e-05,
      "loss": 6.3401,
      "step": 4500
    },
    {
      "epoch": 0.0759941187160298,
      "grad_norm": 24.91438865661621,
      "learning_rate": 1.5176742649487943e-05,
      "loss": 6.4005,
      "step": 4600
    },
    {
      "epoch": 0.07764616477507393,
      "grad_norm": 38.77157974243164,
      "learning_rate": 1.5507102741988768e-05,
      "loss": 6.3605,
      "step": 4700
    },
    {
      "epoch": 0.07929821083411806,
      "grad_norm": 156.87989807128906,
      "learning_rate": 1.5837462834489594e-05,
      "loss": 6.348,
      "step": 4800
    },
    {
      "epoch": 0.08095025689316218,
      "grad_norm": 110.80547332763672,
      "learning_rate": 1.6167822926990423e-05,
      "loss": 6.3406,
      "step": 4900
    },
    {
      "epoch": 0.0826023029522063,
      "grad_norm": 48.55455780029297,
      "learning_rate": 1.649818301949125e-05,
      "loss": 6.4156,
      "step": 5000
    },
    {
      "epoch": 0.08425434901125044,
      "grad_norm": 25.825349807739258,
      "learning_rate": 1.682854311199207e-05,
      "loss": 6.3786,
      "step": 5100
    },
    {
      "epoch": 0.08590639507029456,
      "grad_norm": 55.6208381652832,
      "learning_rate": 1.7158903204492897e-05,
      "loss": 6.376,
      "step": 5200
    },
    {
      "epoch": 0.08755844112933868,
      "grad_norm": 37.82964324951172,
      "learning_rate": 1.7489263296993723e-05,
      "loss": 6.2363,
      "step": 5300
    },
    {
      "epoch": 0.08921048718838281,
      "grad_norm": 32.86615753173828,
      "learning_rate": 1.7819623389494553e-05,
      "loss": 6.2185,
      "step": 5400
    },
    {
      "epoch": 0.09086253324742694,
      "grad_norm": 180.8863525390625,
      "learning_rate": 1.814998348199538e-05,
      "loss": 6.2554,
      "step": 5500
    },
    {
      "epoch": 0.09251457930647107,
      "grad_norm": 25.11360740661621,
      "learning_rate": 1.84803435744962e-05,
      "loss": 6.2177,
      "step": 5600
    },
    {
      "epoch": 0.09416662536551519,
      "grad_norm": 23.702716827392578,
      "learning_rate": 1.8810703666997027e-05,
      "loss": 6.3924,
      "step": 5700
    },
    {
      "epoch": 0.09581867142455931,
      "grad_norm": 32.1275634765625,
      "learning_rate": 1.9141063759497853e-05,
      "loss": 6.2897,
      "step": 5800
    },
    {
      "epoch": 0.09747071748360345,
      "grad_norm": 46.22661590576172,
      "learning_rate": 1.9471423851998682e-05,
      "loss": 6.272,
      "step": 5900
    },
    {
      "epoch": 0.09912276354264757,
      "grad_norm": 74.11865234375,
      "learning_rate": 1.9801783944499505e-05,
      "loss": 6.0247,
      "step": 6000
    },
    {
      "epoch": 0.10077480960169169,
      "grad_norm": 34.50657653808594,
      "learning_rate": 1.9985314903537273e-05,
      "loss": 6.194,
      "step": 6100
    },
    {
      "epoch": 0.10242685566073582,
      "grad_norm": 25.600902557373047,
      "learning_rate": 1.9948602162380454e-05,
      "loss": 6.2757,
      "step": 6200
    },
    {
      "epoch": 0.10407890171977995,
      "grad_norm": 24.53876495361328,
      "learning_rate": 1.9911889421223638e-05,
      "loss": 6.2408,
      "step": 6300
    },
    {
      "epoch": 0.10573094777882408,
      "grad_norm": 22.572052001953125,
      "learning_rate": 1.987517668006682e-05,
      "loss": 6.253,
      "step": 6400
    },
    {
      "epoch": 0.1073829938378682,
      "grad_norm": 33.04438018798828,
      "learning_rate": 1.983846393891e-05,
      "loss": 6.0605,
      "step": 6500
    },
    {
      "epoch": 0.10903503989691232,
      "grad_norm": 81.35254669189453,
      "learning_rate": 1.9801751197753184e-05,
      "loss": 6.0672,
      "step": 6600
    },
    {
      "epoch": 0.11068708595595646,
      "grad_norm": 31.132247924804688,
      "learning_rate": 1.9765038456596365e-05,
      "loss": 6.0414,
      "step": 6700
    },
    {
      "epoch": 0.11233913201500058,
      "grad_norm": 42.16621017456055,
      "learning_rate": 1.9728325715439546e-05,
      "loss": 6.0823,
      "step": 6800
    },
    {
      "epoch": 0.1139911780740447,
      "grad_norm": 23.558713912963867,
      "learning_rate": 1.9691612974282726e-05,
      "loss": 6.1962,
      "step": 6900
    },
    {
      "epoch": 0.11564322413308883,
      "grad_norm": 69.28414154052734,
      "learning_rate": 1.9654900233125907e-05,
      "loss": 6.0868,
      "step": 7000
    },
    {
      "epoch": 0.11729527019213296,
      "grad_norm": 29.037137985229492,
      "learning_rate": 1.9618187491969088e-05,
      "loss": 6.0795,
      "step": 7100
    },
    {
      "epoch": 0.11894731625117708,
      "grad_norm": 29.588781356811523,
      "learning_rate": 1.9581474750812272e-05,
      "loss": 5.9656,
      "step": 7200
    },
    {
      "epoch": 0.12059936231022121,
      "grad_norm": 29.574968338012695,
      "learning_rate": 1.9544762009655453e-05,
      "loss": 5.9785,
      "step": 7300
    },
    {
      "epoch": 0.12225140836926533,
      "grad_norm": 46.092193603515625,
      "learning_rate": 1.9508049268498634e-05,
      "loss": 6.0722,
      "step": 7400
    },
    {
      "epoch": 0.12390345442830947,
      "grad_norm": 23.927968978881836,
      "learning_rate": 1.9471336527341815e-05,
      "loss": 5.9443,
      "step": 7500
    },
    {
      "epoch": 0.1255555004873536,
      "grad_norm": 21.281776428222656,
      "learning_rate": 1.9434623786184995e-05,
      "loss": 5.8786,
      "step": 7600
    },
    {
      "epoch": 0.1272075465463977,
      "grad_norm": 27.455034255981445,
      "learning_rate": 1.939791104502818e-05,
      "loss": 5.8007,
      "step": 7700
    },
    {
      "epoch": 0.12885959260544183,
      "grad_norm": 33.76934814453125,
      "learning_rate": 1.936119830387136e-05,
      "loss": 5.9206,
      "step": 7800
    },
    {
      "epoch": 0.13051163866448598,
      "grad_norm": 21.891183853149414,
      "learning_rate": 1.932448556271454e-05,
      "loss": 5.918,
      "step": 7900
    },
    {
      "epoch": 0.1321636847235301,
      "grad_norm": 61.087398529052734,
      "learning_rate": 1.9287772821557725e-05,
      "loss": 5.9443,
      "step": 8000
    },
    {
      "epoch": 0.13381573078257422,
      "grad_norm": 23.860267639160156,
      "learning_rate": 1.9251060080400906e-05,
      "loss": 5.8764,
      "step": 8100
    },
    {
      "epoch": 0.13546777684161834,
      "grad_norm": 26.501821517944336,
      "learning_rate": 1.9214714466655654e-05,
      "loss": 5.867,
      "step": 8200
    },
    {
      "epoch": 0.13711982290066246,
      "grad_norm": 43.38287353515625,
      "learning_rate": 1.9178001725498835e-05,
      "loss": 5.8087,
      "step": 8300
    },
    {
      "epoch": 0.13877186895970658,
      "grad_norm": 73.06561279296875,
      "learning_rate": 1.9141288984342016e-05,
      "loss": 5.9884,
      "step": 8400
    },
    {
      "epoch": 0.14042391501875073,
      "grad_norm": 36.368717193603516,
      "learning_rate": 1.91045762431852e-05,
      "loss": 5.8741,
      "step": 8500
    },
    {
      "epoch": 0.14207596107779485,
      "grad_norm": 136.38865661621094,
      "learning_rate": 1.906786350202838e-05,
      "loss": 5.9699,
      "step": 8600
    },
    {
      "epoch": 0.14372800713683898,
      "grad_norm": 38.05315017700195,
      "learning_rate": 1.903115076087156e-05,
      "loss": 5.8671,
      "step": 8700
    },
    {
      "epoch": 0.1453800531958831,
      "grad_norm": 39.74106216430664,
      "learning_rate": 1.8994438019714742e-05,
      "loss": 5.8278,
      "step": 8800
    },
    {
      "epoch": 0.14703209925492722,
      "grad_norm": 31.016155242919922,
      "learning_rate": 1.8957725278557926e-05,
      "loss": 5.8892,
      "step": 8900
    },
    {
      "epoch": 0.14868414531397137,
      "grad_norm": 36.37879943847656,
      "learning_rate": 1.8921012537401107e-05,
      "loss": 5.7437,
      "step": 9000
    },
    {
      "epoch": 0.1503361913730155,
      "grad_norm": 31.93881607055664,
      "learning_rate": 1.8884299796244288e-05,
      "loss": 5.8069,
      "step": 9100
    },
    {
      "epoch": 0.1519882374320596,
      "grad_norm": 24.248807907104492,
      "learning_rate": 1.8847587055087472e-05,
      "loss": 6.0235,
      "step": 9200
    },
    {
      "epoch": 0.15364028349110373,
      "grad_norm": 29.67982292175293,
      "learning_rate": 1.8810874313930653e-05,
      "loss": 5.7214,
      "step": 9300
    },
    {
      "epoch": 0.15529232955014785,
      "grad_norm": 34.80620193481445,
      "learning_rate": 1.8774161572773834e-05,
      "loss": 5.7893,
      "step": 9400
    },
    {
      "epoch": 0.15694437560919197,
      "grad_norm": 31.375019073486328,
      "learning_rate": 1.8737448831617015e-05,
      "loss": 5.7406,
      "step": 9500
    },
    {
      "epoch": 0.15859642166823612,
      "grad_norm": 24.126588821411133,
      "learning_rate": 1.8700736090460195e-05,
      "loss": 5.8035,
      "step": 9600
    },
    {
      "epoch": 0.16024846772728024,
      "grad_norm": 94.3121337890625,
      "learning_rate": 1.8664023349303376e-05,
      "loss": 5.7965,
      "step": 9700
    },
    {
      "epoch": 0.16190051378632436,
      "grad_norm": 29.543697357177734,
      "learning_rate": 1.8627310608146557e-05,
      "loss": 5.638,
      "step": 9800
    },
    {
      "epoch": 0.16355255984536848,
      "grad_norm": 27.004188537597656,
      "learning_rate": 1.859059786698974e-05,
      "loss": 5.8263,
      "step": 9900
    },
    {
      "epoch": 0.1652046059044126,
      "grad_norm": 31.72929573059082,
      "learning_rate": 1.8553885125832922e-05,
      "loss": 5.7995,
      "step": 10000
    },
    {
      "epoch": 0.16685665196345675,
      "grad_norm": 43.893436431884766,
      "learning_rate": 1.8517172384676103e-05,
      "loss": 5.5805,
      "step": 10100
    },
    {
      "epoch": 0.16850869802250087,
      "grad_norm": 40.329349517822266,
      "learning_rate": 1.8480459643519283e-05,
      "loss": 5.632,
      "step": 10200
    },
    {
      "epoch": 0.170160744081545,
      "grad_norm": 36.50722885131836,
      "learning_rate": 1.8443746902362468e-05,
      "loss": 5.6944,
      "step": 10300
    },
    {
      "epoch": 0.17181279014058912,
      "grad_norm": 68.61418151855469,
      "learning_rate": 1.840703416120565e-05,
      "loss": 5.5818,
      "step": 10400
    },
    {
      "epoch": 0.17346483619963324,
      "grad_norm": 38.758846282958984,
      "learning_rate": 1.837032142004883e-05,
      "loss": 5.8598,
      "step": 10500
    },
    {
      "epoch": 0.17511688225867736,
      "grad_norm": 51.770931243896484,
      "learning_rate": 1.8333975806303577e-05,
      "loss": 5.7255,
      "step": 10600
    },
    {
      "epoch": 0.1767689283177215,
      "grad_norm": 91.27816009521484,
      "learning_rate": 1.8297263065146758e-05,
      "loss": 5.7536,
      "step": 10700
    },
    {
      "epoch": 0.17842097437676563,
      "grad_norm": 35.52999496459961,
      "learning_rate": 1.8260550323989942e-05,
      "loss": 5.6536,
      "step": 10800
    },
    {
      "epoch": 0.18007302043580975,
      "grad_norm": 36.9012336730957,
      "learning_rate": 1.8223837582833123e-05,
      "loss": 5.6417,
      "step": 10900
    },
    {
      "epoch": 0.18172506649485387,
      "grad_norm": 37.2264404296875,
      "learning_rate": 1.8187124841676304e-05,
      "loss": 5.6719,
      "step": 11000
    },
    {
      "epoch": 0.183377112553898,
      "grad_norm": 31.076929092407227,
      "learning_rate": 1.8150412100519488e-05,
      "loss": 5.566,
      "step": 11100
    },
    {
      "epoch": 0.18502915861294214,
      "grad_norm": 34.78733444213867,
      "learning_rate": 1.811369935936267e-05,
      "loss": 5.4893,
      "step": 11200
    },
    {
      "epoch": 0.18668120467198626,
      "grad_norm": 68.41493225097656,
      "learning_rate": 1.807698661820585e-05,
      "loss": 5.7412,
      "step": 11300
    },
    {
      "epoch": 0.18833325073103038,
      "grad_norm": 43.99595260620117,
      "learning_rate": 1.804027387704903e-05,
      "loss": 5.6838,
      "step": 11400
    },
    {
      "epoch": 0.1899852967900745,
      "grad_norm": 30.06267547607422,
      "learning_rate": 1.8003561135892215e-05,
      "loss": 5.6272,
      "step": 11500
    },
    {
      "epoch": 0.19163734284911862,
      "grad_norm": 38.978031158447266,
      "learning_rate": 1.7966848394735395e-05,
      "loss": 5.6538,
      "step": 11600
    },
    {
      "epoch": 0.19328938890816275,
      "grad_norm": 34.604209899902344,
      "learning_rate": 1.7930135653578576e-05,
      "loss": 5.7176,
      "step": 11700
    },
    {
      "epoch": 0.1949414349672069,
      "grad_norm": 39.66080856323242,
      "learning_rate": 1.7893422912421757e-05,
      "loss": 5.4923,
      "step": 11800
    },
    {
      "epoch": 0.19659348102625102,
      "grad_norm": 39.9164924621582,
      "learning_rate": 1.7856710171264938e-05,
      "loss": 5.7643,
      "step": 11900
    },
    {
      "epoch": 0.19824552708529514,
      "grad_norm": 62.23050308227539,
      "learning_rate": 1.7819997430108122e-05,
      "loss": 5.5674,
      "step": 12000
    },
    {
      "epoch": 0.19989757314433926,
      "grad_norm": 57.77485656738281,
      "learning_rate": 1.7783284688951303e-05,
      "loss": 5.6896,
      "step": 12100
    },
    {
      "epoch": 0.20154961920338338,
      "grad_norm": 62.32257843017578,
      "learning_rate": 1.7746571947794483e-05,
      "loss": 5.4385,
      "step": 12200
    },
    {
      "epoch": 0.20320166526242753,
      "grad_norm": 72.59315490722656,
      "learning_rate": 1.7709859206637664e-05,
      "loss": 5.5851,
      "step": 12300
    },
    {
      "epoch": 0.20485371132147165,
      "grad_norm": 38.60813522338867,
      "learning_rate": 1.7673146465480845e-05,
      "loss": 5.5132,
      "step": 12400
    },
    {
      "epoch": 0.20650575738051577,
      "grad_norm": 46.002899169921875,
      "learning_rate": 1.763643372432403e-05,
      "loss": 5.3329,
      "step": 12500
    },
    {
      "epoch": 0.2081578034395599,
      "grad_norm": 54.40972900390625,
      "learning_rate": 1.759972098316721e-05,
      "loss": 5.4218,
      "step": 12600
    },
    {
      "epoch": 0.209809849498604,
      "grad_norm": 42.294403076171875,
      "learning_rate": 1.756337536942196e-05,
      "loss": 5.5171,
      "step": 12700
    },
    {
      "epoch": 0.21146189555764816,
      "grad_norm": 99.45050048828125,
      "learning_rate": 1.7526662628265142e-05,
      "loss": 5.3414,
      "step": 12800
    },
    {
      "epoch": 0.21311394161669228,
      "grad_norm": 29.550790786743164,
      "learning_rate": 1.7489949887108323e-05,
      "loss": 5.4921,
      "step": 12900
    },
    {
      "epoch": 0.2147659876757364,
      "grad_norm": 35.48351287841797,
      "learning_rate": 1.7453237145951504e-05,
      "loss": 5.7687,
      "step": 13000
    },
    {
      "epoch": 0.21641803373478052,
      "grad_norm": 35.474609375,
      "learning_rate": 1.7416524404794685e-05,
      "loss": 5.7119,
      "step": 13100
    },
    {
      "epoch": 0.21807007979382464,
      "grad_norm": 52.770469665527344,
      "learning_rate": 1.7379811663637865e-05,
      "loss": 5.4975,
      "step": 13200
    },
    {
      "epoch": 0.21972212585286877,
      "grad_norm": 41.083763122558594,
      "learning_rate": 1.7343098922481046e-05,
      "loss": 5.4514,
      "step": 13300
    },
    {
      "epoch": 0.22137417191191291,
      "grad_norm": 27.714067459106445,
      "learning_rate": 1.730638618132423e-05,
      "loss": 5.497,
      "step": 13400
    },
    {
      "epoch": 0.22302621797095704,
      "grad_norm": 646.4743041992188,
      "learning_rate": 1.726967344016741e-05,
      "loss": 5.558,
      "step": 13500
    },
    {
      "epoch": 0.22467826403000116,
      "grad_norm": 35.99949264526367,
      "learning_rate": 1.7232960699010592e-05,
      "loss": 5.4207,
      "step": 13600
    },
    {
      "epoch": 0.22633031008904528,
      "grad_norm": 39.374507904052734,
      "learning_rate": 1.7196247957853776e-05,
      "loss": 5.5901,
      "step": 13700
    },
    {
      "epoch": 0.2279823561480894,
      "grad_norm": 33.016117095947266,
      "learning_rate": 1.7159535216696957e-05,
      "loss": 5.2041,
      "step": 13800
    },
    {
      "epoch": 0.22963440220713355,
      "grad_norm": 51.360252380371094,
      "learning_rate": 1.7122822475540138e-05,
      "loss": 5.2999,
      "step": 13900
    },
    {
      "epoch": 0.23128644826617767,
      "grad_norm": 40.98723602294922,
      "learning_rate": 1.708610973438332e-05,
      "loss": 5.3373,
      "step": 14000
    },
    {
      "epoch": 0.2329384943252218,
      "grad_norm": 62.94683074951172,
      "learning_rate": 1.7049396993226503e-05,
      "loss": 5.789,
      "step": 14100
    },
    {
      "epoch": 0.2345905403842659,
      "grad_norm": 70.42803192138672,
      "learning_rate": 1.7012684252069684e-05,
      "loss": 5.3292,
      "step": 14200
    },
    {
      "epoch": 0.23624258644331003,
      "grad_norm": 95.90315246582031,
      "learning_rate": 1.6975971510912864e-05,
      "loss": 5.4059,
      "step": 14300
    },
    {
      "epoch": 0.23789463250235415,
      "grad_norm": 39.37266159057617,
      "learning_rate": 1.6939258769756045e-05,
      "loss": 5.1849,
      "step": 14400
    },
    {
      "epoch": 0.2395466785613983,
      "grad_norm": 35.2801513671875,
      "learning_rate": 1.6902913156010793e-05,
      "loss": 5.1262,
      "step": 14500
    },
    {
      "epoch": 0.24119872462044242,
      "grad_norm": 49.648563385009766,
      "learning_rate": 1.6866200414853977e-05,
      "loss": 5.4339,
      "step": 14600
    },
    {
      "epoch": 0.24285077067948654,
      "grad_norm": 42.30907440185547,
      "learning_rate": 1.6829487673697158e-05,
      "loss": 5.5185,
      "step": 14700
    },
    {
      "epoch": 0.24450281673853066,
      "grad_norm": 37.14194869995117,
      "learning_rate": 1.679277493254034e-05,
      "loss": 5.3286,
      "step": 14800
    },
    {
      "epoch": 0.24615486279757479,
      "grad_norm": 31.77059555053711,
      "learning_rate": 1.6756062191383523e-05,
      "loss": 5.4141,
      "step": 14900
    },
    {
      "epoch": 0.24780690885661893,
      "grad_norm": 30.543859481811523,
      "learning_rate": 1.6719349450226704e-05,
      "loss": 5.3554,
      "step": 15000
    },
    {
      "epoch": 0.24945895491566306,
      "grad_norm": 51.9097785949707,
      "learning_rate": 1.6682636709069885e-05,
      "loss": 5.3489,
      "step": 15100
    },
    {
      "epoch": 0.2511110009747072,
      "grad_norm": 222.63604736328125,
      "learning_rate": 1.6645923967913065e-05,
      "loss": 5.4849,
      "step": 15200
    },
    {
      "epoch": 0.2527630470337513,
      "grad_norm": 95.24678802490234,
      "learning_rate": 1.6609211226756246e-05,
      "loss": 5.3656,
      "step": 15300
    },
    {
      "epoch": 0.2544150930927954,
      "grad_norm": 36.08857345581055,
      "learning_rate": 1.6572498485599427e-05,
      "loss": 5.32,
      "step": 15400
    },
    {
      "epoch": 0.25606713915183954,
      "grad_norm": 48.17111587524414,
      "learning_rate": 1.6535785744442608e-05,
      "loss": 5.3523,
      "step": 15500
    },
    {
      "epoch": 0.25771918521088366,
      "grad_norm": 51.60739517211914,
      "learning_rate": 1.6499073003285792e-05,
      "loss": 5.1146,
      "step": 15600
    },
    {
      "epoch": 0.2593712312699278,
      "grad_norm": 38.238433837890625,
      "learning_rate": 1.6462360262128973e-05,
      "loss": 5.2816,
      "step": 15700
    },
    {
      "epoch": 0.26102327732897196,
      "grad_norm": 136.9043426513672,
      "learning_rate": 1.6425647520972154e-05,
      "loss": 5.2296,
      "step": 15800
    },
    {
      "epoch": 0.2626753233880161,
      "grad_norm": 68.96510314941406,
      "learning_rate": 1.6388934779815334e-05,
      "loss": 5.3386,
      "step": 15900
    },
    {
      "epoch": 0.2643273694470602,
      "grad_norm": 152.887939453125,
      "learning_rate": 1.635222203865852e-05,
      "loss": 5.4917,
      "step": 16000
    },
    {
      "epoch": 0.2659794155061043,
      "grad_norm": 60.76850891113281,
      "learning_rate": 1.63155092975017e-05,
      "loss": 5.0524,
      "step": 16100
    },
    {
      "epoch": 0.26763146156514844,
      "grad_norm": 52.04624557495117,
      "learning_rate": 1.627879655634488e-05,
      "loss": 5.1657,
      "step": 16200
    },
    {
      "epoch": 0.26928350762419256,
      "grad_norm": 60.97122573852539,
      "learning_rate": 1.6242083815188064e-05,
      "loss": 5.1431,
      "step": 16300
    },
    {
      "epoch": 0.2709355536832367,
      "grad_norm": 80.91710662841797,
      "learning_rate": 1.6205371074031245e-05,
      "loss": 5.166,
      "step": 16400
    },
    {
      "epoch": 0.2725875997422808,
      "grad_norm": 37.053619384765625,
      "learning_rate": 1.6168658332874426e-05,
      "loss": 5.5738,
      "step": 16500
    },
    {
      "epoch": 0.2742396458013249,
      "grad_norm": 69.874267578125,
      "learning_rate": 1.6131945591717607e-05,
      "loss": 5.2088,
      "step": 16600
    },
    {
      "epoch": 0.27589169186036905,
      "grad_norm": 43.52924346923828,
      "learning_rate": 1.6095232850560787e-05,
      "loss": 5.2198,
      "step": 16700
    },
    {
      "epoch": 0.27754373791941317,
      "grad_norm": 48.917152404785156,
      "learning_rate": 1.605852010940397e-05,
      "loss": 5.2709,
      "step": 16800
    },
    {
      "epoch": 0.27919578397845735,
      "grad_norm": 36.78157043457031,
      "learning_rate": 1.6021807368247152e-05,
      "loss": 5.4027,
      "step": 16900
    },
    {
      "epoch": 0.28084783003750147,
      "grad_norm": 78.23045349121094,
      "learning_rate": 1.5985094627090333e-05,
      "loss": 5.25,
      "step": 17000
    },
    {
      "epoch": 0.2824998760965456,
      "grad_norm": 50.245540618896484,
      "learning_rate": 1.5948381885933514e-05,
      "loss": 5.1519,
      "step": 17100
    },
    {
      "epoch": 0.2841519221555897,
      "grad_norm": 31.97572135925293,
      "learning_rate": 1.5911669144776695e-05,
      "loss": 5.1347,
      "step": 17200
    },
    {
      "epoch": 0.28580396821463383,
      "grad_norm": 47.70193862915039,
      "learning_rate": 1.5874956403619876e-05,
      "loss": 5.2346,
      "step": 17300
    },
    {
      "epoch": 0.28745601427367795,
      "grad_norm": 34.82514953613281,
      "learning_rate": 1.583824366246306e-05,
      "loss": 5.4128,
      "step": 17400
    },
    {
      "epoch": 0.28910806033272207,
      "grad_norm": 66.9453353881836,
      "learning_rate": 1.580153092130624e-05,
      "loss": 5.1954,
      "step": 17500
    },
    {
      "epoch": 0.2907601063917662,
      "grad_norm": 50.74463653564453,
      "learning_rate": 1.576481818014942e-05,
      "loss": 5.3787,
      "step": 17600
    },
    {
      "epoch": 0.2924121524508103,
      "grad_norm": 42.01203918457031,
      "learning_rate": 1.5728105438992606e-05,
      "loss": 5.1731,
      "step": 17700
    },
    {
      "epoch": 0.29406419850985444,
      "grad_norm": 40.68756103515625,
      "learning_rate": 1.5691392697835786e-05,
      "loss": 5.3714,
      "step": 17800
    },
    {
      "epoch": 0.29571624456889856,
      "grad_norm": 37.97477722167969,
      "learning_rate": 1.5654679956678967e-05,
      "loss": 5.2113,
      "step": 17900
    },
    {
      "epoch": 0.29736829062794273,
      "grad_norm": 64.8110580444336,
      "learning_rate": 1.5617967215522148e-05,
      "loss": 5.0819,
      "step": 18000
    },
    {
      "epoch": 0.29902033668698685,
      "grad_norm": 37.63853454589844,
      "learning_rate": 1.5581254474365332e-05,
      "loss": 5.0443,
      "step": 18100
    },
    {
      "epoch": 0.300672382746031,
      "grad_norm": 41.9002799987793,
      "learning_rate": 1.5544541733208513e-05,
      "loss": 5.2041,
      "step": 18200
    },
    {
      "epoch": 0.3023244288050751,
      "grad_norm": 50.00920486450195,
      "learning_rate": 1.5507828992051694e-05,
      "loss": 5.1385,
      "step": 18300
    },
    {
      "epoch": 0.3039764748641192,
      "grad_norm": 51.85498809814453,
      "learning_rate": 1.5471116250894874e-05,
      "loss": 5.2195,
      "step": 18400
    },
    {
      "epoch": 0.30562852092316334,
      "grad_norm": 45.79952621459961,
      "learning_rate": 1.5434403509738055e-05,
      "loss": 5.2233,
      "step": 18500
    },
    {
      "epoch": 0.30728056698220746,
      "grad_norm": 40.52060317993164,
      "learning_rate": 1.5397690768581236e-05,
      "loss": 5.1198,
      "step": 18600
    },
    {
      "epoch": 0.3089326130412516,
      "grad_norm": 56.97610092163086,
      "learning_rate": 1.5360978027424417e-05,
      "loss": 5.106,
      "step": 18700
    },
    {
      "epoch": 0.3105846591002957,
      "grad_norm": 91.66014099121094,
      "learning_rate": 1.53242652862676e-05,
      "loss": 5.335,
      "step": 18800
    },
    {
      "epoch": 0.3122367051593398,
      "grad_norm": 85.68270874023438,
      "learning_rate": 1.5287552545110782e-05,
      "loss": 5.1231,
      "step": 18900
    },
    {
      "epoch": 0.31388875121838394,
      "grad_norm": 38.191650390625,
      "learning_rate": 1.5250839803953963e-05,
      "loss": 5.1777,
      "step": 19000
    },
    {
      "epoch": 0.3155407972774281,
      "grad_norm": 182.99609375,
      "learning_rate": 1.5214127062797147e-05,
      "loss": 5.5752,
      "step": 19100
    },
    {
      "epoch": 0.31719284333647224,
      "grad_norm": 49.25122833251953,
      "learning_rate": 1.5177414321640328e-05,
      "loss": 5.1902,
      "step": 19200
    },
    {
      "epoch": 0.31884488939551636,
      "grad_norm": 46.381248474121094,
      "learning_rate": 1.5140701580483508e-05,
      "loss": 5.0777,
      "step": 19300
    },
    {
      "epoch": 0.3204969354545605,
      "grad_norm": 35.04011154174805,
      "learning_rate": 1.510398883932669e-05,
      "loss": 5.211,
      "step": 19400
    },
    {
      "epoch": 0.3221489815136046,
      "grad_norm": 124.16557312011719,
      "learning_rate": 1.5067643225581439e-05,
      "loss": 5.1402,
      "step": 19500
    },
    {
      "epoch": 0.3238010275726487,
      "grad_norm": 55.99512481689453,
      "learning_rate": 1.5030930484424621e-05,
      "loss": 5.1458,
      "step": 19600
    },
    {
      "epoch": 0.32545307363169285,
      "grad_norm": 77.44950866699219,
      "learning_rate": 1.4994217743267802e-05,
      "loss": 5.1091,
      "step": 19700
    },
    {
      "epoch": 0.32710511969073697,
      "grad_norm": 122.10176849365234,
      "learning_rate": 1.4957505002110983e-05,
      "loss": 5.1471,
      "step": 19800
    },
    {
      "epoch": 0.3287571657497811,
      "grad_norm": 43.460208892822266,
      "learning_rate": 1.4920792260954164e-05,
      "loss": 5.1804,
      "step": 19900
    },
    {
      "epoch": 0.3304092118088252,
      "grad_norm": 89.17972564697266,
      "learning_rate": 1.4884079519797348e-05,
      "loss": 4.9678,
      "step": 20000
    },
    {
      "epoch": 0.33206125786786933,
      "grad_norm": 81.0530014038086,
      "learning_rate": 1.4847366778640529e-05,
      "loss": 5.1655,
      "step": 20100
    },
    {
      "epoch": 0.3337133039269135,
      "grad_norm": 88.94013214111328,
      "learning_rate": 1.481065403748371e-05,
      "loss": 4.9735,
      "step": 20200
    },
    {
      "epoch": 0.33536534998595763,
      "grad_norm": 78.72936248779297,
      "learning_rate": 1.4773941296326892e-05,
      "loss": 5.0536,
      "step": 20300
    },
    {
      "epoch": 0.33701739604500175,
      "grad_norm": 36.7070198059082,
      "learning_rate": 1.4737228555170073e-05,
      "loss": 5.347,
      "step": 20400
    },
    {
      "epoch": 0.33866944210404587,
      "grad_norm": 63.179012298583984,
      "learning_rate": 1.4700515814013254e-05,
      "loss": 4.9856,
      "step": 20500
    },
    {
      "epoch": 0.34032148816309,
      "grad_norm": 47.14772415161133,
      "learning_rate": 1.4663803072856434e-05,
      "loss": 5.1035,
      "step": 20600
    },
    {
      "epoch": 0.3419735342221341,
      "grad_norm": 51.848472595214844,
      "learning_rate": 1.4627090331699619e-05,
      "loss": 5.0428,
      "step": 20700
    },
    {
      "epoch": 0.34362558028117823,
      "grad_norm": 50.670616149902344,
      "learning_rate": 1.45903775905428e-05,
      "loss": 5.0856,
      "step": 20800
    },
    {
      "epoch": 0.34527762634022235,
      "grad_norm": 48.28507995605469,
      "learning_rate": 1.455366484938598e-05,
      "loss": 5.0776,
      "step": 20900
    },
    {
      "epoch": 0.3469296723992665,
      "grad_norm": 49.49705505371094,
      "learning_rate": 1.4516952108229163e-05,
      "loss": 5.2031,
      "step": 21000
    },
    {
      "epoch": 0.3485817184583106,
      "grad_norm": 62.78488540649414,
      "learning_rate": 1.4480239367072343e-05,
      "loss": 5.1491,
      "step": 21100
    },
    {
      "epoch": 0.3502337645173547,
      "grad_norm": 42.41142654418945,
      "learning_rate": 1.4443526625915524e-05,
      "loss": 5.3685,
      "step": 21200
    },
    {
      "epoch": 0.3518858105763989,
      "grad_norm": 42.742740631103516,
      "learning_rate": 1.4406813884758705e-05,
      "loss": 4.6901,
      "step": 21300
    },
    {
      "epoch": 0.353537856635443,
      "grad_norm": 78.31076049804688,
      "learning_rate": 1.437010114360189e-05,
      "loss": 4.9809,
      "step": 21400
    },
    {
      "epoch": 0.35518990269448714,
      "grad_norm": 107.21749877929688,
      "learning_rate": 1.433338840244507e-05,
      "loss": 4.9273,
      "step": 21500
    },
    {
      "epoch": 0.35684194875353126,
      "grad_norm": 42.59064865112305,
      "learning_rate": 1.429667566128825e-05,
      "loss": 4.7568,
      "step": 21600
    },
    {
      "epoch": 0.3584939948125754,
      "grad_norm": 64.76200103759766,
      "learning_rate": 1.4259962920131433e-05,
      "loss": 4.9064,
      "step": 21700
    },
    {
      "epoch": 0.3601460408716195,
      "grad_norm": 35.872161865234375,
      "learning_rate": 1.4223250178974614e-05,
      "loss": 5.0399,
      "step": 21800
    },
    {
      "epoch": 0.3617980869306636,
      "grad_norm": 52.66395950317383,
      "learning_rate": 1.4186537437817795e-05,
      "loss": 4.9202,
      "step": 21900
    },
    {
      "epoch": 0.36345013298970774,
      "grad_norm": 47.78133773803711,
      "learning_rate": 1.4149824696660976e-05,
      "loss": 5.3848,
      "step": 22000
    },
    {
      "epoch": 0.36510217904875186,
      "grad_norm": 61.914493560791016,
      "learning_rate": 1.411311195550416e-05,
      "loss": 4.9239,
      "step": 22100
    },
    {
      "epoch": 0.366754225107796,
      "grad_norm": 50.294803619384766,
      "learning_rate": 1.407639921434734e-05,
      "loss": 4.8744,
      "step": 22200
    },
    {
      "epoch": 0.3684062711668401,
      "grad_norm": 50.07392883300781,
      "learning_rate": 1.4039686473190521e-05,
      "loss": 4.8597,
      "step": 22300
    },
    {
      "epoch": 0.3700583172258843,
      "grad_norm": 103.84827423095703,
      "learning_rate": 1.4002973732033704e-05,
      "loss": 4.9226,
      "step": 22400
    },
    {
      "epoch": 0.3717103632849284,
      "grad_norm": 52.87375259399414,
      "learning_rate": 1.3966260990876885e-05,
      "loss": 5.0358,
      "step": 22500
    },
    {
      "epoch": 0.3733624093439725,
      "grad_norm": 59.05409240722656,
      "learning_rate": 1.3929548249720065e-05,
      "loss": 4.9895,
      "step": 22600
    },
    {
      "epoch": 0.37501445540301664,
      "grad_norm": 57.85047912597656,
      "learning_rate": 1.3892835508563248e-05,
      "loss": 5.004,
      "step": 22700
    },
    {
      "epoch": 0.37666650146206077,
      "grad_norm": 39.452919006347656,
      "learning_rate": 1.385612276740643e-05,
      "loss": 5.0441,
      "step": 22800
    },
    {
      "epoch": 0.3783185475211049,
      "grad_norm": 52.329498291015625,
      "learning_rate": 1.3819410026249611e-05,
      "loss": 4.8129,
      "step": 22900
    },
    {
      "epoch": 0.379970593580149,
      "grad_norm": 51.75339889526367,
      "learning_rate": 1.3782697285092792e-05,
      "loss": 4.7954,
      "step": 23000
    },
    {
      "epoch": 0.38162263963919313,
      "grad_norm": 61.14597702026367,
      "learning_rate": 1.3745984543935975e-05,
      "loss": 4.8156,
      "step": 23100
    },
    {
      "epoch": 0.38327468569823725,
      "grad_norm": 131.6481170654297,
      "learning_rate": 1.3709271802779155e-05,
      "loss": 5.0714,
      "step": 23200
    },
    {
      "epoch": 0.38492673175728137,
      "grad_norm": 41.798179626464844,
      "learning_rate": 1.3672559061622336e-05,
      "loss": 4.8543,
      "step": 23300
    },
    {
      "epoch": 0.3865787778163255,
      "grad_norm": 50.937530517578125,
      "learning_rate": 1.3635846320465519e-05,
      "loss": 5.1728,
      "step": 23400
    },
    {
      "epoch": 0.38823082387536967,
      "grad_norm": 49.662574768066406,
      "learning_rate": 1.3599133579308701e-05,
      "loss": 5.1891,
      "step": 23500
    }
  ],
  "logging_steps": 100,
  "max_steps": 60531,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}