{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5203945085988997,
"eval_steps": 200000,
"global_step": 31500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016520460590441263,
"grad_norm": 51.92022705078125,
"learning_rate": 3.2044928972580115e-07,
"loss": 13.3171,
"step": 100
},
{
"epoch": 0.0033040921180882525,
"grad_norm": 68.25243377685547,
"learning_rate": 6.508093822266271e-07,
"loss": 12.9799,
"step": 200
},
{
"epoch": 0.004956138177132379,
"grad_norm": 69.47785186767578,
"learning_rate": 9.811694747274531e-07,
"loss": 12.5133,
"step": 300
},
{
"epoch": 0.006608184236176505,
"grad_norm": 73.07315063476562,
"learning_rate": 1.311529567228279e-06,
"loss": 11.9388,
"step": 400
},
{
"epoch": 0.008260230295220631,
"grad_norm": 82.68733215332031,
"learning_rate": 1.6418896597291048e-06,
"loss": 11.0616,
"step": 500
},
{
"epoch": 0.009912276354264758,
"grad_norm": 57.61735534667969,
"learning_rate": 1.972249752229931e-06,
"loss": 10.2712,
"step": 600
},
{
"epoch": 0.011564322413308884,
"grad_norm": 44.42943572998047,
"learning_rate": 2.302609844730757e-06,
"loss": 9.5253,
"step": 700
},
{
"epoch": 0.01321636847235301,
"grad_norm": 27.03646469116211,
"learning_rate": 2.6329699372315828e-06,
"loss": 8.7706,
"step": 800
},
{
"epoch": 0.014868414531397135,
"grad_norm": 15.231706619262695,
"learning_rate": 2.9633300297324087e-06,
"loss": 8.4333,
"step": 900
},
{
"epoch": 0.016520460590441263,
"grad_norm": 14.189949035644531,
"learning_rate": 3.2936901222332346e-06,
"loss": 8.0902,
"step": 1000
},
{
"epoch": 0.018172506649485387,
"grad_norm": 12.241333961486816,
"learning_rate": 3.6240502147340605e-06,
"loss": 7.8862,
"step": 1100
},
{
"epoch": 0.019824552708529515,
"grad_norm": 11.400131225585938,
"learning_rate": 3.9544103072348865e-06,
"loss": 7.7362,
"step": 1200
},
{
"epoch": 0.02147659876757364,
"grad_norm": 12.072014808654785,
"learning_rate": 4.284770399735712e-06,
"loss": 7.6007,
"step": 1300
},
{
"epoch": 0.023128644826617768,
"grad_norm": 11.08774185180664,
"learning_rate": 4.615130492236538e-06,
"loss": 7.5304,
"step": 1400
},
{
"epoch": 0.024780690885661892,
"grad_norm": 13.02505874633789,
"learning_rate": 4.945490584737364e-06,
"loss": 7.4249,
"step": 1500
},
{
"epoch": 0.02643273694470602,
"grad_norm": 13.522186279296875,
"learning_rate": 5.27585067723819e-06,
"loss": 7.3035,
"step": 1600
},
{
"epoch": 0.028084783003750145,
"grad_norm": 45.22550964355469,
"learning_rate": 5.606210769739015e-06,
"loss": 7.2026,
"step": 1700
},
{
"epoch": 0.02973682906279427,
"grad_norm": 15.62098503112793,
"learning_rate": 5.936570862239842e-06,
"loss": 7.1572,
"step": 1800
},
{
"epoch": 0.0313888751218384,
"grad_norm": 16.570518493652344,
"learning_rate": 6.266930954740668e-06,
"loss": 7.0523,
"step": 1900
},
{
"epoch": 0.033040921180882525,
"grad_norm": 16.82353401184082,
"learning_rate": 6.597291047241494e-06,
"loss": 7.1158,
"step": 2000
},
{
"epoch": 0.034692967239926646,
"grad_norm": 17.38075828552246,
"learning_rate": 6.924347538817311e-06,
"loss": 6.9856,
"step": 2100
},
{
"epoch": 0.036345013298970774,
"grad_norm": 93.04572296142578,
"learning_rate": 7.2547076313181375e-06,
"loss": 7.0865,
"step": 2200
},
{
"epoch": 0.0379970593580149,
"grad_norm": 17.861074447631836,
"learning_rate": 7.585067723818963e-06,
"loss": 6.9496,
"step": 2300
},
{
"epoch": 0.03964910541705903,
"grad_norm": 19.067747116088867,
"learning_rate": 7.91542781631979e-06,
"loss": 6.9294,
"step": 2400
},
{
"epoch": 0.04130115147610315,
"grad_norm": 16.43912696838379,
"learning_rate": 8.245787908820615e-06,
"loss": 6.8825,
"step": 2500
},
{
"epoch": 0.04295319753514728,
"grad_norm": 140.5387725830078,
"learning_rate": 8.576148001321441e-06,
"loss": 6.8218,
"step": 2600
},
{
"epoch": 0.04460524359419141,
"grad_norm": 22.34341049194336,
"learning_rate": 8.903204492897258e-06,
"loss": 6.8416,
"step": 2700
},
{
"epoch": 0.046257289653235535,
"grad_norm": 16.260499954223633,
"learning_rate": 9.233564585398084e-06,
"loss": 6.7184,
"step": 2800
},
{
"epoch": 0.047909335712279656,
"grad_norm": 20.075071334838867,
"learning_rate": 9.56392467789891e-06,
"loss": 6.9183,
"step": 2900
},
{
"epoch": 0.049561381771323784,
"grad_norm": 45.1911735534668,
"learning_rate": 9.894284770399738e-06,
"loss": 6.7166,
"step": 3000
},
{
"epoch": 0.05121342783036791,
"grad_norm": 67.39335632324219,
"learning_rate": 1.0224644862900564e-05,
"loss": 6.6821,
"step": 3100
},
{
"epoch": 0.05286547388941204,
"grad_norm": 80.69914245605469,
"learning_rate": 1.055500495540139e-05,
"loss": 6.6074,
"step": 3200
},
{
"epoch": 0.05451751994845616,
"grad_norm": 37.51483917236328,
"learning_rate": 1.0885365047902214e-05,
"loss": 6.6141,
"step": 3300
},
{
"epoch": 0.05616956600750029,
"grad_norm": 127.3297348022461,
"learning_rate": 1.121572514040304e-05,
"loss": 6.5374,
"step": 3400
},
{
"epoch": 0.05782161206654442,
"grad_norm": 20.704940795898438,
"learning_rate": 1.1546085232903866e-05,
"loss": 6.4776,
"step": 3500
},
{
"epoch": 0.05947365812558854,
"grad_norm": 23.68699836730957,
"learning_rate": 1.1876445325404693e-05,
"loss": 6.5701,
"step": 3600
},
{
"epoch": 0.061125704184632666,
"grad_norm": 104.68245697021484,
"learning_rate": 1.2206805417905519e-05,
"loss": 6.5026,
"step": 3700
},
{
"epoch": 0.0627777502436768,
"grad_norm": 97.47430419921875,
"learning_rate": 1.2537165510406343e-05,
"loss": 6.6502,
"step": 3800
},
{
"epoch": 0.06442979630272092,
"grad_norm": 21.512229919433594,
"learning_rate": 1.286752560290717e-05,
"loss": 6.5023,
"step": 3900
},
{
"epoch": 0.06608184236176505,
"grad_norm": 31.69252586364746,
"learning_rate": 1.3197885695407995e-05,
"loss": 6.5526,
"step": 4000
},
{
"epoch": 0.06773388842080917,
"grad_norm": 22.141067504882812,
"learning_rate": 1.3528245787908823e-05,
"loss": 6.6594,
"step": 4100
},
{
"epoch": 0.06938593447985329,
"grad_norm": 23.37205696105957,
"learning_rate": 1.3858605880409649e-05,
"loss": 6.3643,
"step": 4200
},
{
"epoch": 0.07103798053889743,
"grad_norm": 23.31827163696289,
"learning_rate": 1.4188965972910473e-05,
"loss": 6.3783,
"step": 4300
},
{
"epoch": 0.07269002659794155,
"grad_norm": 27.043312072753906,
"learning_rate": 1.4519326065411299e-05,
"loss": 6.3222,
"step": 4400
},
{
"epoch": 0.07434207265698568,
"grad_norm": 25.699583053588867,
"learning_rate": 1.4846382556987117e-05,
"loss": 6.3401,
"step": 4500
},
{
"epoch": 0.0759941187160298,
"grad_norm": 24.91438865661621,
"learning_rate": 1.5176742649487943e-05,
"loss": 6.4005,
"step": 4600
},
{
"epoch": 0.07764616477507393,
"grad_norm": 38.77157974243164,
"learning_rate": 1.5507102741988768e-05,
"loss": 6.3605,
"step": 4700
},
{
"epoch": 0.07929821083411806,
"grad_norm": 156.87989807128906,
"learning_rate": 1.5837462834489594e-05,
"loss": 6.348,
"step": 4800
},
{
"epoch": 0.08095025689316218,
"grad_norm": 110.80547332763672,
"learning_rate": 1.6167822926990423e-05,
"loss": 6.3406,
"step": 4900
},
{
"epoch": 0.0826023029522063,
"grad_norm": 48.55455780029297,
"learning_rate": 1.649818301949125e-05,
"loss": 6.4156,
"step": 5000
},
{
"epoch": 0.08425434901125044,
"grad_norm": 25.825349807739258,
"learning_rate": 1.682854311199207e-05,
"loss": 6.3786,
"step": 5100
},
{
"epoch": 0.08590639507029456,
"grad_norm": 55.6208381652832,
"learning_rate": 1.7158903204492897e-05,
"loss": 6.376,
"step": 5200
},
{
"epoch": 0.08755844112933868,
"grad_norm": 37.82964324951172,
"learning_rate": 1.7489263296993723e-05,
"loss": 6.2363,
"step": 5300
},
{
"epoch": 0.08921048718838281,
"grad_norm": 32.86615753173828,
"learning_rate": 1.7819623389494553e-05,
"loss": 6.2185,
"step": 5400
},
{
"epoch": 0.09086253324742694,
"grad_norm": 180.8863525390625,
"learning_rate": 1.814998348199538e-05,
"loss": 6.2554,
"step": 5500
},
{
"epoch": 0.09251457930647107,
"grad_norm": 25.11360740661621,
"learning_rate": 1.84803435744962e-05,
"loss": 6.2177,
"step": 5600
},
{
"epoch": 0.09416662536551519,
"grad_norm": 23.702716827392578,
"learning_rate": 1.8810703666997027e-05,
"loss": 6.3924,
"step": 5700
},
{
"epoch": 0.09581867142455931,
"grad_norm": 32.1275634765625,
"learning_rate": 1.9141063759497853e-05,
"loss": 6.2897,
"step": 5800
},
{
"epoch": 0.09747071748360345,
"grad_norm": 46.22661590576172,
"learning_rate": 1.9471423851998682e-05,
"loss": 6.272,
"step": 5900
},
{
"epoch": 0.09912276354264757,
"grad_norm": 74.11865234375,
"learning_rate": 1.9801783944499505e-05,
"loss": 6.0247,
"step": 6000
},
{
"epoch": 0.10077480960169169,
"grad_norm": 34.50657653808594,
"learning_rate": 1.9985314903537273e-05,
"loss": 6.194,
"step": 6100
},
{
"epoch": 0.10242685566073582,
"grad_norm": 25.600902557373047,
"learning_rate": 1.9948602162380454e-05,
"loss": 6.2757,
"step": 6200
},
{
"epoch": 0.10407890171977995,
"grad_norm": 24.53876495361328,
"learning_rate": 1.9911889421223638e-05,
"loss": 6.2408,
"step": 6300
},
{
"epoch": 0.10573094777882408,
"grad_norm": 22.572052001953125,
"learning_rate": 1.987517668006682e-05,
"loss": 6.253,
"step": 6400
},
{
"epoch": 0.1073829938378682,
"grad_norm": 33.04438018798828,
"learning_rate": 1.983846393891e-05,
"loss": 6.0605,
"step": 6500
},
{
"epoch": 0.10903503989691232,
"grad_norm": 81.35254669189453,
"learning_rate": 1.9801751197753184e-05,
"loss": 6.0672,
"step": 6600
},
{
"epoch": 0.11068708595595646,
"grad_norm": 31.132247924804688,
"learning_rate": 1.9765038456596365e-05,
"loss": 6.0414,
"step": 6700
},
{
"epoch": 0.11233913201500058,
"grad_norm": 42.16621017456055,
"learning_rate": 1.9728325715439546e-05,
"loss": 6.0823,
"step": 6800
},
{
"epoch": 0.1139911780740447,
"grad_norm": 23.558713912963867,
"learning_rate": 1.9691612974282726e-05,
"loss": 6.1962,
"step": 6900
},
{
"epoch": 0.11564322413308883,
"grad_norm": 69.28414154052734,
"learning_rate": 1.9654900233125907e-05,
"loss": 6.0868,
"step": 7000
},
{
"epoch": 0.11729527019213296,
"grad_norm": 29.037137985229492,
"learning_rate": 1.9618187491969088e-05,
"loss": 6.0795,
"step": 7100
},
{
"epoch": 0.11894731625117708,
"grad_norm": 29.588781356811523,
"learning_rate": 1.9581474750812272e-05,
"loss": 5.9656,
"step": 7200
},
{
"epoch": 0.12059936231022121,
"grad_norm": 29.574968338012695,
"learning_rate": 1.9544762009655453e-05,
"loss": 5.9785,
"step": 7300
},
{
"epoch": 0.12225140836926533,
"grad_norm": 46.092193603515625,
"learning_rate": 1.9508049268498634e-05,
"loss": 6.0722,
"step": 7400
},
{
"epoch": 0.12390345442830947,
"grad_norm": 23.927968978881836,
"learning_rate": 1.9471336527341815e-05,
"loss": 5.9443,
"step": 7500
},
{
"epoch": 0.1255555004873536,
"grad_norm": 21.281776428222656,
"learning_rate": 1.9434623786184995e-05,
"loss": 5.8786,
"step": 7600
},
{
"epoch": 0.1272075465463977,
"grad_norm": 27.455034255981445,
"learning_rate": 1.939791104502818e-05,
"loss": 5.8007,
"step": 7700
},
{
"epoch": 0.12885959260544183,
"grad_norm": 33.76934814453125,
"learning_rate": 1.936119830387136e-05,
"loss": 5.9206,
"step": 7800
},
{
"epoch": 0.13051163866448598,
"grad_norm": 21.891183853149414,
"learning_rate": 1.932448556271454e-05,
"loss": 5.918,
"step": 7900
},
{
"epoch": 0.1321636847235301,
"grad_norm": 61.087398529052734,
"learning_rate": 1.9287772821557725e-05,
"loss": 5.9443,
"step": 8000
},
{
"epoch": 0.13381573078257422,
"grad_norm": 23.860267639160156,
"learning_rate": 1.9251060080400906e-05,
"loss": 5.8764,
"step": 8100
},
{
"epoch": 0.13546777684161834,
"grad_norm": 26.501821517944336,
"learning_rate": 1.9214714466655654e-05,
"loss": 5.867,
"step": 8200
},
{
"epoch": 0.13711982290066246,
"grad_norm": 43.38287353515625,
"learning_rate": 1.9178001725498835e-05,
"loss": 5.8087,
"step": 8300
},
{
"epoch": 0.13877186895970658,
"grad_norm": 73.06561279296875,
"learning_rate": 1.9141288984342016e-05,
"loss": 5.9884,
"step": 8400
},
{
"epoch": 0.14042391501875073,
"grad_norm": 36.368717193603516,
"learning_rate": 1.91045762431852e-05,
"loss": 5.8741,
"step": 8500
},
{
"epoch": 0.14207596107779485,
"grad_norm": 136.38865661621094,
"learning_rate": 1.906786350202838e-05,
"loss": 5.9699,
"step": 8600
},
{
"epoch": 0.14372800713683898,
"grad_norm": 38.05315017700195,
"learning_rate": 1.903115076087156e-05,
"loss": 5.8671,
"step": 8700
},
{
"epoch": 0.1453800531958831,
"grad_norm": 39.74106216430664,
"learning_rate": 1.8994438019714742e-05,
"loss": 5.8278,
"step": 8800
},
{
"epoch": 0.14703209925492722,
"grad_norm": 31.016155242919922,
"learning_rate": 1.8957725278557926e-05,
"loss": 5.8892,
"step": 8900
},
{
"epoch": 0.14868414531397137,
"grad_norm": 36.37879943847656,
"learning_rate": 1.8921012537401107e-05,
"loss": 5.7437,
"step": 9000
},
{
"epoch": 0.1503361913730155,
"grad_norm": 31.93881607055664,
"learning_rate": 1.8884299796244288e-05,
"loss": 5.8069,
"step": 9100
},
{
"epoch": 0.1519882374320596,
"grad_norm": 24.248807907104492,
"learning_rate": 1.8847587055087472e-05,
"loss": 6.0235,
"step": 9200
},
{
"epoch": 0.15364028349110373,
"grad_norm": 29.67982292175293,
"learning_rate": 1.8810874313930653e-05,
"loss": 5.7214,
"step": 9300
},
{
"epoch": 0.15529232955014785,
"grad_norm": 34.80620193481445,
"learning_rate": 1.8774161572773834e-05,
"loss": 5.7893,
"step": 9400
},
{
"epoch": 0.15694437560919197,
"grad_norm": 31.375019073486328,
"learning_rate": 1.8737448831617015e-05,
"loss": 5.7406,
"step": 9500
},
{
"epoch": 0.15859642166823612,
"grad_norm": 24.126588821411133,
"learning_rate": 1.8700736090460195e-05,
"loss": 5.8035,
"step": 9600
},
{
"epoch": 0.16024846772728024,
"grad_norm": 94.3121337890625,
"learning_rate": 1.8664023349303376e-05,
"loss": 5.7965,
"step": 9700
},
{
"epoch": 0.16190051378632436,
"grad_norm": 29.543697357177734,
"learning_rate": 1.8627310608146557e-05,
"loss": 5.638,
"step": 9800
},
{
"epoch": 0.16355255984536848,
"grad_norm": 27.004188537597656,
"learning_rate": 1.859059786698974e-05,
"loss": 5.8263,
"step": 9900
},
{
"epoch": 0.1652046059044126,
"grad_norm": 31.72929573059082,
"learning_rate": 1.8553885125832922e-05,
"loss": 5.7995,
"step": 10000
},
{
"epoch": 0.16685665196345675,
"grad_norm": 43.893436431884766,
"learning_rate": 1.8517172384676103e-05,
"loss": 5.5805,
"step": 10100
},
{
"epoch": 0.16850869802250087,
"grad_norm": 40.329349517822266,
"learning_rate": 1.8480459643519283e-05,
"loss": 5.632,
"step": 10200
},
{
"epoch": 0.170160744081545,
"grad_norm": 36.50722885131836,
"learning_rate": 1.8443746902362468e-05,
"loss": 5.6944,
"step": 10300
},
{
"epoch": 0.17181279014058912,
"grad_norm": 68.61418151855469,
"learning_rate": 1.840703416120565e-05,
"loss": 5.5818,
"step": 10400
},
{
"epoch": 0.17346483619963324,
"grad_norm": 38.758846282958984,
"learning_rate": 1.837032142004883e-05,
"loss": 5.8598,
"step": 10500
},
{
"epoch": 0.17511688225867736,
"grad_norm": 51.770931243896484,
"learning_rate": 1.8333975806303577e-05,
"loss": 5.7255,
"step": 10600
},
{
"epoch": 0.1767689283177215,
"grad_norm": 91.27816009521484,
"learning_rate": 1.8297263065146758e-05,
"loss": 5.7536,
"step": 10700
},
{
"epoch": 0.17842097437676563,
"grad_norm": 35.52999496459961,
"learning_rate": 1.8260550323989942e-05,
"loss": 5.6536,
"step": 10800
},
{
"epoch": 0.18007302043580975,
"grad_norm": 36.9012336730957,
"learning_rate": 1.8223837582833123e-05,
"loss": 5.6417,
"step": 10900
},
{
"epoch": 0.18172506649485387,
"grad_norm": 37.2264404296875,
"learning_rate": 1.8187124841676304e-05,
"loss": 5.6719,
"step": 11000
},
{
"epoch": 0.183377112553898,
"grad_norm": 31.076929092407227,
"learning_rate": 1.8150412100519488e-05,
"loss": 5.566,
"step": 11100
},
{
"epoch": 0.18502915861294214,
"grad_norm": 34.78733444213867,
"learning_rate": 1.811369935936267e-05,
"loss": 5.4893,
"step": 11200
},
{
"epoch": 0.18668120467198626,
"grad_norm": 68.41493225097656,
"learning_rate": 1.807698661820585e-05,
"loss": 5.7412,
"step": 11300
},
{
"epoch": 0.18833325073103038,
"grad_norm": 43.99595260620117,
"learning_rate": 1.804027387704903e-05,
"loss": 5.6838,
"step": 11400
},
{
"epoch": 0.1899852967900745,
"grad_norm": 30.06267547607422,
"learning_rate": 1.8003561135892215e-05,
"loss": 5.6272,
"step": 11500
},
{
"epoch": 0.19163734284911862,
"grad_norm": 38.978031158447266,
"learning_rate": 1.7966848394735395e-05,
"loss": 5.6538,
"step": 11600
},
{
"epoch": 0.19328938890816275,
"grad_norm": 34.604209899902344,
"learning_rate": 1.7930135653578576e-05,
"loss": 5.7176,
"step": 11700
},
{
"epoch": 0.1949414349672069,
"grad_norm": 39.66080856323242,
"learning_rate": 1.7893422912421757e-05,
"loss": 5.4923,
"step": 11800
},
{
"epoch": 0.19659348102625102,
"grad_norm": 39.9164924621582,
"learning_rate": 1.7856710171264938e-05,
"loss": 5.7643,
"step": 11900
},
{
"epoch": 0.19824552708529514,
"grad_norm": 62.23050308227539,
"learning_rate": 1.7819997430108122e-05,
"loss": 5.5674,
"step": 12000
},
{
"epoch": 0.19989757314433926,
"grad_norm": 57.77485656738281,
"learning_rate": 1.7783284688951303e-05,
"loss": 5.6896,
"step": 12100
},
{
"epoch": 0.20154961920338338,
"grad_norm": 62.32257843017578,
"learning_rate": 1.7746571947794483e-05,
"loss": 5.4385,
"step": 12200
},
{
"epoch": 0.20320166526242753,
"grad_norm": 72.59315490722656,
"learning_rate": 1.7709859206637664e-05,
"loss": 5.5851,
"step": 12300
},
{
"epoch": 0.20485371132147165,
"grad_norm": 38.60813522338867,
"learning_rate": 1.7673146465480845e-05,
"loss": 5.5132,
"step": 12400
},
{
"epoch": 0.20650575738051577,
"grad_norm": 46.002899169921875,
"learning_rate": 1.763643372432403e-05,
"loss": 5.3329,
"step": 12500
},
{
"epoch": 0.2081578034395599,
"grad_norm": 54.40972900390625,
"learning_rate": 1.759972098316721e-05,
"loss": 5.4218,
"step": 12600
},
{
"epoch": 0.209809849498604,
"grad_norm": 42.294403076171875,
"learning_rate": 1.756337536942196e-05,
"loss": 5.5171,
"step": 12700
},
{
"epoch": 0.21146189555764816,
"grad_norm": 99.45050048828125,
"learning_rate": 1.7526662628265142e-05,
"loss": 5.3414,
"step": 12800
},
{
"epoch": 0.21311394161669228,
"grad_norm": 29.550790786743164,
"learning_rate": 1.7489949887108323e-05,
"loss": 5.4921,
"step": 12900
},
{
"epoch": 0.2147659876757364,
"grad_norm": 35.48351287841797,
"learning_rate": 1.7453237145951504e-05,
"loss": 5.7687,
"step": 13000
},
{
"epoch": 0.21641803373478052,
"grad_norm": 35.474609375,
"learning_rate": 1.7416524404794685e-05,
"loss": 5.7119,
"step": 13100
},
{
"epoch": 0.21807007979382464,
"grad_norm": 52.770469665527344,
"learning_rate": 1.7379811663637865e-05,
"loss": 5.4975,
"step": 13200
},
{
"epoch": 0.21972212585286877,
"grad_norm": 41.083763122558594,
"learning_rate": 1.7343098922481046e-05,
"loss": 5.4514,
"step": 13300
},
{
"epoch": 0.22137417191191291,
"grad_norm": 27.714067459106445,
"learning_rate": 1.730638618132423e-05,
"loss": 5.497,
"step": 13400
},
{
"epoch": 0.22302621797095704,
"grad_norm": 646.4743041992188,
"learning_rate": 1.726967344016741e-05,
"loss": 5.558,
"step": 13500
},
{
"epoch": 0.22467826403000116,
"grad_norm": 35.99949264526367,
"learning_rate": 1.7232960699010592e-05,
"loss": 5.4207,
"step": 13600
},
{
"epoch": 0.22633031008904528,
"grad_norm": 39.374507904052734,
"learning_rate": 1.7196247957853776e-05,
"loss": 5.5901,
"step": 13700
},
{
"epoch": 0.2279823561480894,
"grad_norm": 33.016117095947266,
"learning_rate": 1.7159535216696957e-05,
"loss": 5.2041,
"step": 13800
},
{
"epoch": 0.22963440220713355,
"grad_norm": 51.360252380371094,
"learning_rate": 1.7122822475540138e-05,
"loss": 5.2999,
"step": 13900
},
{
"epoch": 0.23128644826617767,
"grad_norm": 40.98723602294922,
"learning_rate": 1.708610973438332e-05,
"loss": 5.3373,
"step": 14000
},
{
"epoch": 0.2329384943252218,
"grad_norm": 62.94683074951172,
"learning_rate": 1.7049396993226503e-05,
"loss": 5.789,
"step": 14100
},
{
"epoch": 0.2345905403842659,
"grad_norm": 70.42803192138672,
"learning_rate": 1.7012684252069684e-05,
"loss": 5.3292,
"step": 14200
},
{
"epoch": 0.23624258644331003,
"grad_norm": 95.90315246582031,
"learning_rate": 1.6975971510912864e-05,
"loss": 5.4059,
"step": 14300
},
{
"epoch": 0.23789463250235415,
"grad_norm": 39.37266159057617,
"learning_rate": 1.6939258769756045e-05,
"loss": 5.1849,
"step": 14400
},
{
"epoch": 0.2395466785613983,
"grad_norm": 35.2801513671875,
"learning_rate": 1.6902913156010793e-05,
"loss": 5.1262,
"step": 14500
},
{
"epoch": 0.24119872462044242,
"grad_norm": 49.648563385009766,
"learning_rate": 1.6866200414853977e-05,
"loss": 5.4339,
"step": 14600
},
{
"epoch": 0.24285077067948654,
"grad_norm": 42.30907440185547,
"learning_rate": 1.6829487673697158e-05,
"loss": 5.5185,
"step": 14700
},
{
"epoch": 0.24450281673853066,
"grad_norm": 37.14194869995117,
"learning_rate": 1.679277493254034e-05,
"loss": 5.3286,
"step": 14800
},
{
"epoch": 0.24615486279757479,
"grad_norm": 31.77059555053711,
"learning_rate": 1.6756062191383523e-05,
"loss": 5.4141,
"step": 14900
},
{
"epoch": 0.24780690885661893,
"grad_norm": 30.543859481811523,
"learning_rate": 1.6719349450226704e-05,
"loss": 5.3554,
"step": 15000
},
{
"epoch": 0.24945895491566306,
"grad_norm": 51.9097785949707,
"learning_rate": 1.6682636709069885e-05,
"loss": 5.3489,
"step": 15100
},
{
"epoch": 0.2511110009747072,
"grad_norm": 222.63604736328125,
"learning_rate": 1.6645923967913065e-05,
"loss": 5.4849,
"step": 15200
},
{
"epoch": 0.2527630470337513,
"grad_norm": 95.24678802490234,
"learning_rate": 1.6609211226756246e-05,
"loss": 5.3656,
"step": 15300
},
{
"epoch": 0.2544150930927954,
"grad_norm": 36.08857345581055,
"learning_rate": 1.6572498485599427e-05,
"loss": 5.32,
"step": 15400
},
{
"epoch": 0.25606713915183954,
"grad_norm": 48.17111587524414,
"learning_rate": 1.6535785744442608e-05,
"loss": 5.3523,
"step": 15500
},
{
"epoch": 0.25771918521088366,
"grad_norm": 51.60739517211914,
"learning_rate": 1.6499073003285792e-05,
"loss": 5.1146,
"step": 15600
},
{
"epoch": 0.2593712312699278,
"grad_norm": 38.238433837890625,
"learning_rate": 1.6462360262128973e-05,
"loss": 5.2816,
"step": 15700
},
{
"epoch": 0.26102327732897196,
"grad_norm": 136.9043426513672,
"learning_rate": 1.6425647520972154e-05,
"loss": 5.2296,
"step": 15800
},
{
"epoch": 0.2626753233880161,
"grad_norm": 68.96510314941406,
"learning_rate": 1.6388934779815334e-05,
"loss": 5.3386,
"step": 15900
},
{
"epoch": 0.2643273694470602,
"grad_norm": 152.887939453125,
"learning_rate": 1.635222203865852e-05,
"loss": 5.4917,
"step": 16000
},
{
"epoch": 0.2659794155061043,
"grad_norm": 60.76850891113281,
"learning_rate": 1.63155092975017e-05,
"loss": 5.0524,
"step": 16100
},
{
"epoch": 0.26763146156514844,
"grad_norm": 52.04624557495117,
"learning_rate": 1.627879655634488e-05,
"loss": 5.1657,
"step": 16200
},
{
"epoch": 0.26928350762419256,
"grad_norm": 60.97122573852539,
"learning_rate": 1.6242083815188064e-05,
"loss": 5.1431,
"step": 16300
},
{
"epoch": 0.2709355536832367,
"grad_norm": 80.91710662841797,
"learning_rate": 1.6205371074031245e-05,
"loss": 5.166,
"step": 16400
},
{
"epoch": 0.2725875997422808,
"grad_norm": 37.053619384765625,
"learning_rate": 1.6168658332874426e-05,
"loss": 5.5738,
"step": 16500
},
{
"epoch": 0.2742396458013249,
"grad_norm": 69.874267578125,
"learning_rate": 1.6131945591717607e-05,
"loss": 5.2088,
"step": 16600
},
{
"epoch": 0.27589169186036905,
"grad_norm": 43.52924346923828,
"learning_rate": 1.6095232850560787e-05,
"loss": 5.2198,
"step": 16700
},
{
"epoch": 0.27754373791941317,
"grad_norm": 48.917152404785156,
"learning_rate": 1.605852010940397e-05,
"loss": 5.2709,
"step": 16800
},
{
"epoch": 0.27919578397845735,
"grad_norm": 36.78157043457031,
"learning_rate": 1.6021807368247152e-05,
"loss": 5.4027,
"step": 16900
},
{
"epoch": 0.28084783003750147,
"grad_norm": 78.23045349121094,
"learning_rate": 1.5985094627090333e-05,
"loss": 5.25,
"step": 17000
},
{
"epoch": 0.2824998760965456,
"grad_norm": 50.245540618896484,
"learning_rate": 1.5948381885933514e-05,
"loss": 5.1519,
"step": 17100
},
{
"epoch": 0.2841519221555897,
"grad_norm": 31.97572135925293,
"learning_rate": 1.5911669144776695e-05,
"loss": 5.1347,
"step": 17200
},
{
"epoch": 0.28580396821463383,
"grad_norm": 47.70193862915039,
"learning_rate": 1.5874956403619876e-05,
"loss": 5.2346,
"step": 17300
},
{
"epoch": 0.28745601427367795,
"grad_norm": 34.82514953613281,
"learning_rate": 1.583824366246306e-05,
"loss": 5.4128,
"step": 17400
},
{
"epoch": 0.28910806033272207,
"grad_norm": 66.9453353881836,
"learning_rate": 1.580153092130624e-05,
"loss": 5.1954,
"step": 17500
},
{
"epoch": 0.2907601063917662,
"grad_norm": 50.74463653564453,
"learning_rate": 1.576481818014942e-05,
"loss": 5.3787,
"step": 17600
},
{
"epoch": 0.2924121524508103,
"grad_norm": 42.01203918457031,
"learning_rate": 1.5728105438992606e-05,
"loss": 5.1731,
"step": 17700
},
{
"epoch": 0.29406419850985444,
"grad_norm": 40.68756103515625,
"learning_rate": 1.5691392697835786e-05,
"loss": 5.3714,
"step": 17800
},
{
"epoch": 0.29571624456889856,
"grad_norm": 37.97477722167969,
"learning_rate": 1.5654679956678967e-05,
"loss": 5.2113,
"step": 17900
},
{
"epoch": 0.29736829062794273,
"grad_norm": 64.8110580444336,
"learning_rate": 1.5617967215522148e-05,
"loss": 5.0819,
"step": 18000
},
{
"epoch": 0.29902033668698685,
"grad_norm": 37.63853454589844,
"learning_rate": 1.5581254474365332e-05,
"loss": 5.0443,
"step": 18100
},
{
"epoch": 0.300672382746031,
"grad_norm": 41.9002799987793,
"learning_rate": 1.5544541733208513e-05,
"loss": 5.2041,
"step": 18200
},
{
"epoch": 0.3023244288050751,
"grad_norm": 50.00920486450195,
"learning_rate": 1.5507828992051694e-05,
"loss": 5.1385,
"step": 18300
},
{
"epoch": 0.3039764748641192,
"grad_norm": 51.85498809814453,
"learning_rate": 1.5471116250894874e-05,
"loss": 5.2195,
"step": 18400
},
{
"epoch": 0.30562852092316334,
"grad_norm": 45.79952621459961,
"learning_rate": 1.5434403509738055e-05,
"loss": 5.2233,
"step": 18500
},
{
"epoch": 0.30728056698220746,
"grad_norm": 40.52060317993164,
"learning_rate": 1.5397690768581236e-05,
"loss": 5.1198,
"step": 18600
},
{
"epoch": 0.3089326130412516,
"grad_norm": 56.97610092163086,
"learning_rate": 1.5360978027424417e-05,
"loss": 5.106,
"step": 18700
},
{
"epoch": 0.3105846591002957,
"grad_norm": 91.66014099121094,
"learning_rate": 1.53242652862676e-05,
"loss": 5.335,
"step": 18800
},
{
"epoch": 0.3122367051593398,
"grad_norm": 85.68270874023438,
"learning_rate": 1.5287552545110782e-05,
"loss": 5.1231,
"step": 18900
},
{
"epoch": 0.31388875121838394,
"grad_norm": 38.191650390625,
"learning_rate": 1.5250839803953963e-05,
"loss": 5.1777,
"step": 19000
},
{
"epoch": 0.3155407972774281,
"grad_norm": 182.99609375,
"learning_rate": 1.5214127062797147e-05,
"loss": 5.5752,
"step": 19100
},
{
"epoch": 0.31719284333647224,
"grad_norm": 49.25122833251953,
"learning_rate": 1.5177414321640328e-05,
"loss": 5.1902,
"step": 19200
},
{
"epoch": 0.31884488939551636,
"grad_norm": 46.381248474121094,
"learning_rate": 1.5140701580483508e-05,
"loss": 5.0777,
"step": 19300
},
{
"epoch": 0.3204969354545605,
"grad_norm": 35.04011154174805,
"learning_rate": 1.510398883932669e-05,
"loss": 5.211,
"step": 19400
},
{
"epoch": 0.3221489815136046,
"grad_norm": 124.16557312011719,
"learning_rate": 1.5067643225581439e-05,
"loss": 5.1402,
"step": 19500
},
{
"epoch": 0.3238010275726487,
"grad_norm": 55.99512481689453,
"learning_rate": 1.5030930484424621e-05,
"loss": 5.1458,
"step": 19600
},
{
"epoch": 0.32545307363169285,
"grad_norm": 77.44950866699219,
"learning_rate": 1.4994217743267802e-05,
"loss": 5.1091,
"step": 19700
},
{
"epoch": 0.32710511969073697,
"grad_norm": 122.10176849365234,
"learning_rate": 1.4957505002110983e-05,
"loss": 5.1471,
"step": 19800
},
{
"epoch": 0.3287571657497811,
"grad_norm": 43.460208892822266,
"learning_rate": 1.4920792260954164e-05,
"loss": 5.1804,
"step": 19900
},
{
"epoch": 0.3304092118088252,
"grad_norm": 89.17972564697266,
"learning_rate": 1.4884079519797348e-05,
"loss": 4.9678,
"step": 20000
},
{
"epoch": 0.33206125786786933,
"grad_norm": 81.0530014038086,
"learning_rate": 1.4847366778640529e-05,
"loss": 5.1655,
"step": 20100
},
{
"epoch": 0.3337133039269135,
"grad_norm": 88.94013214111328,
"learning_rate": 1.481065403748371e-05,
"loss": 4.9735,
"step": 20200
},
{
"epoch": 0.33536534998595763,
"grad_norm": 78.72936248779297,
"learning_rate": 1.4773941296326892e-05,
"loss": 5.0536,
"step": 20300
},
{
"epoch": 0.33701739604500175,
"grad_norm": 36.7070198059082,
"learning_rate": 1.4737228555170073e-05,
"loss": 5.347,
"step": 20400
},
{
"epoch": 0.33866944210404587,
"grad_norm": 63.179012298583984,
"learning_rate": 1.4700515814013254e-05,
"loss": 4.9856,
"step": 20500
},
{
"epoch": 0.34032148816309,
"grad_norm": 47.14772415161133,
"learning_rate": 1.4663803072856434e-05,
"loss": 5.1035,
"step": 20600
},
{
"epoch": 0.3419735342221341,
"grad_norm": 51.848472595214844,
"learning_rate": 1.4627090331699619e-05,
"loss": 5.0428,
"step": 20700
},
{
"epoch": 0.34362558028117823,
"grad_norm": 50.670616149902344,
"learning_rate": 1.45903775905428e-05,
"loss": 5.0856,
"step": 20800
},
{
"epoch": 0.34527762634022235,
"grad_norm": 48.28507995605469,
"learning_rate": 1.455366484938598e-05,
"loss": 5.0776,
"step": 20900
},
{
"epoch": 0.3469296723992665,
"grad_norm": 49.49705505371094,
"learning_rate": 1.4516952108229163e-05,
"loss": 5.2031,
"step": 21000
},
{
"epoch": 0.3485817184583106,
"grad_norm": 62.78488540649414,
"learning_rate": 1.4480239367072343e-05,
"loss": 5.1491,
"step": 21100
},
{
"epoch": 0.3502337645173547,
"grad_norm": 42.41142654418945,
"learning_rate": 1.4443526625915524e-05,
"loss": 5.3685,
"step": 21200
},
{
"epoch": 0.3518858105763989,
"grad_norm": 42.742740631103516,
"learning_rate": 1.4406813884758705e-05,
"loss": 4.6901,
"step": 21300
},
{
"epoch": 0.353537856635443,
"grad_norm": 78.31076049804688,
"learning_rate": 1.437010114360189e-05,
"loss": 4.9809,
"step": 21400
},
{
"epoch": 0.35518990269448714,
"grad_norm": 107.21749877929688,
"learning_rate": 1.433338840244507e-05,
"loss": 4.9273,
"step": 21500
},
{
"epoch": 0.35684194875353126,
"grad_norm": 42.59064865112305,
"learning_rate": 1.429667566128825e-05,
"loss": 4.7568,
"step": 21600
},
{
"epoch": 0.3584939948125754,
"grad_norm": 64.76200103759766,
"learning_rate": 1.4259962920131433e-05,
"loss": 4.9064,
"step": 21700
},
{
"epoch": 0.3601460408716195,
"grad_norm": 35.872161865234375,
"learning_rate": 1.4223250178974614e-05,
"loss": 5.0399,
"step": 21800
},
{
"epoch": 0.3617980869306636,
"grad_norm": 52.66395950317383,
"learning_rate": 1.4186537437817795e-05,
"loss": 4.9202,
"step": 21900
},
{
"epoch": 0.36345013298970774,
"grad_norm": 47.78133773803711,
"learning_rate": 1.4149824696660976e-05,
"loss": 5.3848,
"step": 22000
},
{
"epoch": 0.36510217904875186,
"grad_norm": 61.914493560791016,
"learning_rate": 1.411311195550416e-05,
"loss": 4.9239,
"step": 22100
},
{
"epoch": 0.366754225107796,
"grad_norm": 50.294803619384766,
"learning_rate": 1.407639921434734e-05,
"loss": 4.8744,
"step": 22200
},
{
"epoch": 0.3684062711668401,
"grad_norm": 50.07392883300781,
"learning_rate": 1.4039686473190521e-05,
"loss": 4.8597,
"step": 22300
},
{
"epoch": 0.3700583172258843,
"grad_norm": 103.84827423095703,
"learning_rate": 1.4002973732033704e-05,
"loss": 4.9226,
"step": 22400
},
{
"epoch": 0.3717103632849284,
"grad_norm": 52.87375259399414,
"learning_rate": 1.3966260990876885e-05,
"loss": 5.0358,
"step": 22500
},
{
"epoch": 0.3733624093439725,
"grad_norm": 59.05409240722656,
"learning_rate": 1.3929548249720065e-05,
"loss": 4.9895,
"step": 22600
},
{
"epoch": 0.37501445540301664,
"grad_norm": 57.85047912597656,
"learning_rate": 1.3892835508563248e-05,
"loss": 5.004,
"step": 22700
},
{
"epoch": 0.37666650146206077,
"grad_norm": 39.452919006347656,
"learning_rate": 1.385612276740643e-05,
"loss": 5.0441,
"step": 22800
},
{
"epoch": 0.3783185475211049,
"grad_norm": 52.329498291015625,
"learning_rate": 1.3819410026249611e-05,
"loss": 4.8129,
"step": 22900
},
{
"epoch": 0.379970593580149,
"grad_norm": 51.75339889526367,
"learning_rate": 1.3782697285092792e-05,
"loss": 4.7954,
"step": 23000
},
{
"epoch": 0.38162263963919313,
"grad_norm": 61.14597702026367,
"learning_rate": 1.3745984543935975e-05,
"loss": 4.8156,
"step": 23100
},
{
"epoch": 0.38327468569823725,
"grad_norm": 131.6481170654297,
"learning_rate": 1.3709271802779155e-05,
"loss": 5.0714,
"step": 23200
},
{
"epoch": 0.38492673175728137,
"grad_norm": 41.798179626464844,
"learning_rate": 1.3672559061622336e-05,
"loss": 4.8543,
"step": 23300
},
{
"epoch": 0.3865787778163255,
"grad_norm": 50.937530517578125,
"learning_rate": 1.3635846320465519e-05,
"loss": 5.1728,
"step": 23400
},
{
"epoch": 0.38823082387536967,
"grad_norm": 49.662574768066406,
"learning_rate": 1.3599133579308701e-05,
"loss": 5.1891,
"step": 23500
},
{
"epoch": 0.3898828699344138,
"grad_norm": 74.3118667602539,
"learning_rate": 1.3562420838151882e-05,
"loss": 5.087,
"step": 23600
},
{
"epoch": 0.3915349159934579,
"grad_norm": 46.42539978027344,
"learning_rate": 1.3526442351818199e-05,
"loss": 4.9069,
"step": 23700
},
{
"epoch": 0.39318696205250203,
"grad_norm": 73.25936126708984,
"learning_rate": 1.3489729610661381e-05,
"loss": 4.9357,
"step": 23800
},
{
"epoch": 0.39483900811154615,
"grad_norm": 37.25462341308594,
"learning_rate": 1.3453016869504562e-05,
"loss": 4.8324,
"step": 23900
},
{
"epoch": 0.3964910541705903,
"grad_norm": 67.21192932128906,
"learning_rate": 1.3416304128347743e-05,
"loss": 4.8091,
"step": 24000
},
{
"epoch": 0.3981431002296344,
"grad_norm": 49.0614013671875,
"learning_rate": 1.3379591387190927e-05,
"loss": 4.7944,
"step": 24100
},
{
"epoch": 0.3997951462886785,
"grad_norm": 81.99295806884766,
"learning_rate": 1.3342878646034108e-05,
"loss": 5.0023,
"step": 24200
},
{
"epoch": 0.40144719234772264,
"grad_norm": 57.294010162353516,
"learning_rate": 1.3306165904877289e-05,
"loss": 4.8745,
"step": 24300
},
{
"epoch": 0.40309923840676676,
"grad_norm": 58.99897384643555,
"learning_rate": 1.326945316372047e-05,
"loss": 5.0884,
"step": 24400
},
{
"epoch": 0.40475128446581093,
"grad_norm": 66.54635620117188,
"learning_rate": 1.3232740422563652e-05,
"loss": 5.0468,
"step": 24500
},
{
"epoch": 0.40640333052485506,
"grad_norm": 42.970706939697266,
"learning_rate": 1.3196027681406833e-05,
"loss": 4.8575,
"step": 24600
},
{
"epoch": 0.4080553765838992,
"grad_norm": 45.24504470825195,
"learning_rate": 1.3159314940250014e-05,
"loss": 4.7555,
"step": 24700
},
{
"epoch": 0.4097074226429433,
"grad_norm": 274.1655578613281,
"learning_rate": 1.3122602199093198e-05,
"loss": 4.6052,
"step": 24800
},
{
"epoch": 0.4113594687019874,
"grad_norm": 94.8667221069336,
"learning_rate": 1.3085889457936379e-05,
"loss": 4.8935,
"step": 24900
},
{
"epoch": 0.41301151476103154,
"grad_norm": 44.97008514404297,
"learning_rate": 1.304917671677956e-05,
"loss": 4.8049,
"step": 25000
},
{
"epoch": 0.41466356082007566,
"grad_norm": 101.15333557128906,
"learning_rate": 1.301246397562274e-05,
"loss": 4.9014,
"step": 25100
},
{
"epoch": 0.4163156068791198,
"grad_norm": 40.38864517211914,
"learning_rate": 1.2975751234465923e-05,
"loss": 4.7199,
"step": 25200
},
{
"epoch": 0.4179676529381639,
"grad_norm": 130.1628875732422,
"learning_rate": 1.2939038493309103e-05,
"loss": 4.6999,
"step": 25300
},
{
"epoch": 0.419619698997208,
"grad_norm": 111.20594787597656,
"learning_rate": 1.2902325752152284e-05,
"loss": 4.6417,
"step": 25400
},
{
"epoch": 0.42127174505625214,
"grad_norm": 48.373497009277344,
"learning_rate": 1.2865613010995468e-05,
"loss": 5.2115,
"step": 25500
},
{
"epoch": 0.4229237911152963,
"grad_norm": 131.54966735839844,
"learning_rate": 1.2828900269838649e-05,
"loss": 4.9171,
"step": 25600
},
{
"epoch": 0.42457583717434044,
"grad_norm": 50.42368698120117,
"learning_rate": 1.279218752868183e-05,
"loss": 4.9448,
"step": 25700
},
{
"epoch": 0.42622788323338456,
"grad_norm": 112.4402084350586,
"learning_rate": 1.275547478752501e-05,
"loss": 4.6811,
"step": 25800
},
{
"epoch": 0.4278799292924287,
"grad_norm": 40.17702102661133,
"learning_rate": 1.2718762046368193e-05,
"loss": 5.1181,
"step": 25900
},
{
"epoch": 0.4295319753514728,
"grad_norm": 105.03784942626953,
"learning_rate": 1.2682049305211374e-05,
"loss": 4.8061,
"step": 26000
},
{
"epoch": 0.4311840214105169,
"grad_norm": 85.66558074951172,
"learning_rate": 1.2645336564054555e-05,
"loss": 4.815,
"step": 26100
},
{
"epoch": 0.43283606746956105,
"grad_norm": 68.8923110961914,
"learning_rate": 1.2608623822897739e-05,
"loss": 4.7731,
"step": 26200
},
{
"epoch": 0.43448811352860517,
"grad_norm": 106.52751159667969,
"learning_rate": 1.257191108174092e-05,
"loss": 4.7304,
"step": 26300
},
{
"epoch": 0.4361401595876493,
"grad_norm": 62.098915100097656,
"learning_rate": 1.25351983405841e-05,
"loss": 4.9838,
"step": 26400
},
{
"epoch": 0.4377922056466934,
"grad_norm": 51.40629959106445,
"learning_rate": 1.2498485599427281e-05,
"loss": 4.7998,
"step": 26500
},
{
"epoch": 0.43944425170573753,
"grad_norm": 69.73297882080078,
"learning_rate": 1.2461772858270464e-05,
"loss": 4.6946,
"step": 26600
},
{
"epoch": 0.4410962977647817,
"grad_norm": 48.082679748535156,
"learning_rate": 1.2425060117113645e-05,
"loss": 4.7755,
"step": 26700
},
{
"epoch": 0.44274834382382583,
"grad_norm": 39.593284606933594,
"learning_rate": 1.2388347375956825e-05,
"loss": 4.7347,
"step": 26800
},
{
"epoch": 0.44440038988286995,
"grad_norm": 99.7860336303711,
"learning_rate": 1.235163463480001e-05,
"loss": 4.8356,
"step": 26900
},
{
"epoch": 0.44605243594191407,
"grad_norm": 92.00659942626953,
"learning_rate": 1.231492189364319e-05,
"loss": 4.8642,
"step": 27000
},
{
"epoch": 0.4477044820009582,
"grad_norm": 91.36007690429688,
"learning_rate": 1.2278209152486371e-05,
"loss": 4.9273,
"step": 27100
},
{
"epoch": 0.4493565280600023,
"grad_norm": 90.54057312011719,
"learning_rate": 1.2241496411329552e-05,
"loss": 4.7114,
"step": 27200
},
{
"epoch": 0.45100857411904643,
"grad_norm": 96.53414916992188,
"learning_rate": 1.2204783670172734e-05,
"loss": 4.6088,
"step": 27300
},
{
"epoch": 0.45266062017809056,
"grad_norm": 54.67830276489258,
"learning_rate": 1.2168070929015915e-05,
"loss": 4.5046,
"step": 27400
},
{
"epoch": 0.4543126662371347,
"grad_norm": 60.664024353027344,
"learning_rate": 1.2131358187859098e-05,
"loss": 4.4516,
"step": 27500
},
{
"epoch": 0.4559647122961788,
"grad_norm": 47.828147888183594,
"learning_rate": 1.209464544670228e-05,
"loss": 4.7491,
"step": 27600
},
{
"epoch": 0.4576167583552229,
"grad_norm": 44.080345153808594,
"learning_rate": 1.2057932705545461e-05,
"loss": 4.943,
"step": 27700
},
{
"epoch": 0.4592688044142671,
"grad_norm": 46.904083251953125,
"learning_rate": 1.202158709180021e-05,
"loss": 4.877,
"step": 27800
},
{
"epoch": 0.4609208504733112,
"grad_norm": 67.17841339111328,
"learning_rate": 1.1984874350643392e-05,
"loss": 4.6912,
"step": 27900
},
{
"epoch": 0.46257289653235534,
"grad_norm": 61.45062255859375,
"learning_rate": 1.1948528736898141e-05,
"loss": 4.8373,
"step": 28000
},
{
"epoch": 0.46422494259139946,
"grad_norm": 164.95327758789062,
"learning_rate": 1.1911815995741322e-05,
"loss": 5.0152,
"step": 28100
},
{
"epoch": 0.4658769886504436,
"grad_norm": 58.360897064208984,
"learning_rate": 1.1875103254584503e-05,
"loss": 4.7008,
"step": 28200
},
{
"epoch": 0.4675290347094877,
"grad_norm": 45.121952056884766,
"learning_rate": 1.1838390513427687e-05,
"loss": 4.7549,
"step": 28300
},
{
"epoch": 0.4691810807685318,
"grad_norm": 60.929256439208984,
"learning_rate": 1.1801677772270868e-05,
"loss": 4.5287,
"step": 28400
},
{
"epoch": 0.47083312682757594,
"grad_norm": 75.46994018554688,
"learning_rate": 1.1764965031114049e-05,
"loss": 4.8211,
"step": 28500
},
{
"epoch": 0.47248517288662006,
"grad_norm": 43.476341247558594,
"learning_rate": 1.1728252289957231e-05,
"loss": 4.775,
"step": 28600
},
{
"epoch": 0.4741372189456642,
"grad_norm": 53.734535217285156,
"learning_rate": 1.1691539548800412e-05,
"loss": 4.6977,
"step": 28700
},
{
"epoch": 0.4757892650047083,
"grad_norm": 70.2580795288086,
"learning_rate": 1.1654826807643593e-05,
"loss": 4.9122,
"step": 28800
},
{
"epoch": 0.4774413110637525,
"grad_norm": 56.99103546142578,
"learning_rate": 1.1618114066486774e-05,
"loss": 4.9067,
"step": 28900
},
{
"epoch": 0.4790933571227966,
"grad_norm": 59.37815856933594,
"learning_rate": 1.1581401325329958e-05,
"loss": 4.8326,
"step": 29000
},
{
"epoch": 0.4807454031818407,
"grad_norm": 47.427085876464844,
"learning_rate": 1.1544688584173138e-05,
"loss": 4.4536,
"step": 29100
},
{
"epoch": 0.48239744924088485,
"grad_norm": 105.74244689941406,
"learning_rate": 1.150797584301632e-05,
"loss": 5.0073,
"step": 29200
},
{
"epoch": 0.48404949529992897,
"grad_norm": 63.801387786865234,
"learning_rate": 1.1471263101859502e-05,
"loss": 4.5887,
"step": 29300
},
{
"epoch": 0.4857015413589731,
"grad_norm": 111.39470672607422,
"learning_rate": 1.1434550360702683e-05,
"loss": 4.7829,
"step": 29400
},
{
"epoch": 0.4873535874180172,
"grad_norm": 47.669105529785156,
"learning_rate": 1.1397837619545863e-05,
"loss": 4.6503,
"step": 29500
},
{
"epoch": 0.48900563347706133,
"grad_norm": 74.83755493164062,
"learning_rate": 1.1361124878389044e-05,
"loss": 4.5202,
"step": 29600
},
{
"epoch": 0.49065767953610545,
"grad_norm": 50.78372573852539,
"learning_rate": 1.1324412137232228e-05,
"loss": 4.9086,
"step": 29700
},
{
"epoch": 0.49230972559514957,
"grad_norm": 77.82291412353516,
"learning_rate": 1.1287699396075409e-05,
"loss": 4.743,
"step": 29800
},
{
"epoch": 0.4939617716541937,
"grad_norm": 48.810394287109375,
"learning_rate": 1.125098665491859e-05,
"loss": 4.7819,
"step": 29900
},
{
"epoch": 0.49561381771323787,
"grad_norm": 73.72440338134766,
"learning_rate": 1.1214273913761772e-05,
"loss": 4.6159,
"step": 30000
},
{
"epoch": 0.497265863772282,
"grad_norm": 76.73316192626953,
"learning_rate": 1.1177561172604953e-05,
"loss": 5.015,
"step": 30100
},
{
"epoch": 0.4989179098313261,
"grad_norm": 44.149959564208984,
"learning_rate": 1.1140848431448134e-05,
"loss": 4.5351,
"step": 30200
},
{
"epoch": 0.5005699558903702,
"grad_norm": 58.467506408691406,
"learning_rate": 1.1104869945114454e-05,
"loss": 5.0421,
"step": 30300
},
{
"epoch": 0.5022220019494144,
"grad_norm": 84.99969482421875,
"learning_rate": 1.1068157203957635e-05,
"loss": 4.5394,
"step": 30400
},
{
"epoch": 0.5038740480084585,
"grad_norm": 45.24458312988281,
"learning_rate": 1.1031444462800816e-05,
"loss": 4.7516,
"step": 30500
},
{
"epoch": 0.5055260940675026,
"grad_norm": 61.58864974975586,
"learning_rate": 1.0994731721643997e-05,
"loss": 4.9236,
"step": 30600
},
{
"epoch": 0.5071781401265467,
"grad_norm": 72.27294158935547,
"learning_rate": 1.095801898048718e-05,
"loss": 4.833,
"step": 30700
},
{
"epoch": 0.5088301861855908,
"grad_norm": 42.26503372192383,
"learning_rate": 1.092130623933036e-05,
"loss": 4.5406,
"step": 30800
},
{
"epoch": 0.510482232244635,
"grad_norm": 73.13021087646484,
"learning_rate": 1.088459349817354e-05,
"loss": 4.7325,
"step": 30900
},
{
"epoch": 0.5121342783036791,
"grad_norm": 52.8704948425293,
"learning_rate": 1.0847880757016725e-05,
"loss": 4.6807,
"step": 31000
},
{
"epoch": 0.5137863243627232,
"grad_norm": 39.70061111450195,
"learning_rate": 1.0811168015859906e-05,
"loss": 4.6052,
"step": 31100
},
{
"epoch": 0.5154383704217673,
"grad_norm": 58.93057632446289,
"learning_rate": 1.0774455274703087e-05,
"loss": 4.7922,
"step": 31200
},
{
"epoch": 0.5170904164808114,
"grad_norm": 49.83049774169922,
"learning_rate": 1.0737742533546267e-05,
"loss": 4.5013,
"step": 31300
},
{
"epoch": 0.5187424625398556,
"grad_norm": 66.95556640625,
"learning_rate": 1.070102979238945e-05,
"loss": 4.6579,
"step": 31400
},
{
"epoch": 0.5203945085988997,
"grad_norm": 75.2869873046875,
"learning_rate": 1.066431705123263e-05,
"loss": 4.5152,
"step": 31500
}
],
"logging_steps": 100,
"max_steps": 60531,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}