{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.3683030710603004,
"eval_steps": 500,
"global_step": 15260,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0027796458193230216,
"grad_norm": 9.012969970703125,
"learning_rate": 1.0157273918741808e-06,
"loss": 8.8089,
"step": 31
},
{
"epoch": 0.005559291638646043,
"grad_norm": 5.6620192527771,
"learning_rate": 2.0314547837483616e-06,
"loss": 8.4351,
"step": 62
},
{
"epoch": 0.008338937457969065,
"grad_norm": 6.994854927062988,
"learning_rate": 3.0471821756225426e-06,
"loss": 7.9593,
"step": 93
},
{
"epoch": 0.011118583277292086,
"grad_norm": 13.229604721069336,
"learning_rate": 4.062909567496723e-06,
"loss": 7.4462,
"step": 124
},
{
"epoch": 0.013898229096615108,
"grad_norm": 15.080535888671875,
"learning_rate": 5.078636959370905e-06,
"loss": 7.1616,
"step": 155
},
{
"epoch": 0.01667787491593813,
"grad_norm": 12.822982788085938,
"learning_rate": 6.094364351245085e-06,
"loss": 6.9606,
"step": 186
},
{
"epoch": 0.019457520735261153,
"grad_norm": 16.252538681030273,
"learning_rate": 7.110091743119267e-06,
"loss": 6.8272,
"step": 217
},
{
"epoch": 0.022237166554584173,
"grad_norm": 15.957731246948242,
"learning_rate": 8.125819134993446e-06,
"loss": 6.7124,
"step": 248
},
{
"epoch": 0.025016812373907196,
"grad_norm": 12.819754600524902,
"learning_rate": 9.141546526867629e-06,
"loss": 6.6177,
"step": 279
},
{
"epoch": 0.027796458193230216,
"grad_norm": 15.852622032165527,
"learning_rate": 1.015727391874181e-05,
"loss": 6.4938,
"step": 310
},
{
"epoch": 0.03057610401255324,
"grad_norm": 14.386216163635254,
"learning_rate": 1.117300131061599e-05,
"loss": 6.4049,
"step": 341
},
{
"epoch": 0.03335574983187626,
"grad_norm": 17.984745025634766,
"learning_rate": 1.218872870249017e-05,
"loss": 6.2177,
"step": 372
},
{
"epoch": 0.03613539565119928,
"grad_norm": 11.26113510131836,
"learning_rate": 1.3204456094364351e-05,
"loss": 6.1328,
"step": 403
},
{
"epoch": 0.038915041470522306,
"grad_norm": 12.1831693649292,
"learning_rate": 1.4220183486238533e-05,
"loss": 5.9936,
"step": 434
},
{
"epoch": 0.04169468728984533,
"grad_norm": 11.754878044128418,
"learning_rate": 1.5235910878112714e-05,
"loss": 5.8999,
"step": 465
},
{
"epoch": 0.044474333109168346,
"grad_norm": 13.444028854370117,
"learning_rate": 1.6251638269986893e-05,
"loss": 5.7806,
"step": 496
},
{
"epoch": 0.04725397892849137,
"grad_norm": 11.331082344055176,
"learning_rate": 1.7267365661861077e-05,
"loss": 5.6861,
"step": 527
},
{
"epoch": 0.05003362474781439,
"grad_norm": 11.335689544677734,
"learning_rate": 1.8283093053735257e-05,
"loss": 5.5929,
"step": 558
},
{
"epoch": 0.052813270567137416,
"grad_norm": 13.345412254333496,
"learning_rate": 1.9298820445609438e-05,
"loss": 5.5168,
"step": 589
},
{
"epoch": 0.05559291638646043,
"grad_norm": 14.618036270141602,
"learning_rate": 2.031454783748362e-05,
"loss": 5.4084,
"step": 620
},
{
"epoch": 0.058372562205783456,
"grad_norm": 8.540396690368652,
"learning_rate": 2.13302752293578e-05,
"loss": 5.328,
"step": 651
},
{
"epoch": 0.06115220802510648,
"grad_norm": 10.201196670532227,
"learning_rate": 2.234600262123198e-05,
"loss": 5.2521,
"step": 682
},
{
"epoch": 0.0639318538444295,
"grad_norm": 8.642182350158691,
"learning_rate": 2.336173001310616e-05,
"loss": 5.1689,
"step": 713
},
{
"epoch": 0.06671149966375252,
"grad_norm": 12.399177551269531,
"learning_rate": 2.437745740498034e-05,
"loss": 5.1196,
"step": 744
},
{
"epoch": 0.06949114548307554,
"grad_norm": 9.496380805969238,
"learning_rate": 2.5393184796854525e-05,
"loss": 5.0453,
"step": 775
},
{
"epoch": 0.07227079130239857,
"grad_norm": 8.562884330749512,
"learning_rate": 2.6408912188728702e-05,
"loss": 4.9657,
"step": 806
},
{
"epoch": 0.07505043712172159,
"grad_norm": 8.08729362487793,
"learning_rate": 2.7424639580602886e-05,
"loss": 4.9296,
"step": 837
},
{
"epoch": 0.07783008294104461,
"grad_norm": 7.915094375610352,
"learning_rate": 2.8440366972477066e-05,
"loss": 4.8488,
"step": 868
},
{
"epoch": 0.08060972876036764,
"grad_norm": 7.101661682128906,
"learning_rate": 2.9456094364351244e-05,
"loss": 4.8021,
"step": 899
},
{
"epoch": 0.08338937457969066,
"grad_norm": 6.275183200836182,
"learning_rate": 3.0471821756225428e-05,
"loss": 4.7345,
"step": 930
},
{
"epoch": 0.08616902039901367,
"grad_norm": 6.350998401641846,
"learning_rate": 3.148754914809961e-05,
"loss": 4.6746,
"step": 961
},
{
"epoch": 0.08894866621833669,
"grad_norm": 7.120944976806641,
"learning_rate": 3.2503276539973785e-05,
"loss": 4.6101,
"step": 992
},
{
"epoch": 0.09172831203765971,
"grad_norm": 6.11423921585083,
"learning_rate": 3.351900393184797e-05,
"loss": 4.5679,
"step": 1023
},
{
"epoch": 0.09450795785698274,
"grad_norm": 6.339328289031982,
"learning_rate": 3.453473132372215e-05,
"loss": 4.5126,
"step": 1054
},
{
"epoch": 0.09728760367630576,
"grad_norm": 10.269830703735352,
"learning_rate": 3.555045871559633e-05,
"loss": 4.4597,
"step": 1085
},
{
"epoch": 0.10006724949562879,
"grad_norm": 5.776120662689209,
"learning_rate": 3.6566186107470514e-05,
"loss": 4.4408,
"step": 1116
},
{
"epoch": 0.10284689531495181,
"grad_norm": 6.201286315917969,
"learning_rate": 3.7581913499344695e-05,
"loss": 4.3918,
"step": 1147
},
{
"epoch": 0.10562654113427483,
"grad_norm": 5.641777038574219,
"learning_rate": 3.8597640891218876e-05,
"loss": 4.3731,
"step": 1178
},
{
"epoch": 0.10840618695359784,
"grad_norm": 5.453923225402832,
"learning_rate": 3.9613368283093056e-05,
"loss": 4.2671,
"step": 1209
},
{
"epoch": 0.11118583277292086,
"grad_norm": 5.152379512786865,
"learning_rate": 4.062909567496724e-05,
"loss": 4.2864,
"step": 1240
},
{
"epoch": 0.11396547859224389,
"grad_norm": 5.2209153175354,
"learning_rate": 4.164482306684142e-05,
"loss": 4.282,
"step": 1271
},
{
"epoch": 0.11674512441156691,
"grad_norm": 4.788177967071533,
"learning_rate": 4.26605504587156e-05,
"loss": 4.2331,
"step": 1302
},
{
"epoch": 0.11952477023088993,
"grad_norm": 4.940870761871338,
"learning_rate": 4.367627785058978e-05,
"loss": 4.1933,
"step": 1333
},
{
"epoch": 0.12230441605021296,
"grad_norm": 5.946582794189453,
"learning_rate": 4.469200524246396e-05,
"loss": 4.1688,
"step": 1364
},
{
"epoch": 0.12508406186953597,
"grad_norm": 4.6552958488464355,
"learning_rate": 4.570773263433814e-05,
"loss": 4.1135,
"step": 1395
},
{
"epoch": 0.127863707688859,
"grad_norm": 4.827516078948975,
"learning_rate": 4.672346002621232e-05,
"loss": 4.1121,
"step": 1426
},
{
"epoch": 0.13064335350818201,
"grad_norm": 4.1209917068481445,
"learning_rate": 4.77391874180865e-05,
"loss": 4.0969,
"step": 1457
},
{
"epoch": 0.13342299932750504,
"grad_norm": 4.2564802169799805,
"learning_rate": 4.875491480996068e-05,
"loss": 4.0791,
"step": 1488
},
{
"epoch": 0.13620264514682806,
"grad_norm": 3.873272657394409,
"learning_rate": 4.977064220183487e-05,
"loss": 4.1008,
"step": 1519
},
{
"epoch": 0.13898229096615108,
"grad_norm": 3.919401168823242,
"learning_rate": 4.9999915451558777e-05,
"loss": 4.047,
"step": 1550
},
{
"epoch": 0.1417619367854741,
"grad_norm": 4.050148010253906,
"learning_rate": 4.999955597496219e-05,
"loss": 4.0279,
"step": 1581
},
{
"epoch": 0.14454158260479713,
"grad_norm": 4.243306636810303,
"learning_rate": 4.9998914381774255e-05,
"loss": 4.0072,
"step": 1612
},
{
"epoch": 0.14732122842412015,
"grad_norm": 19.370807647705078,
"learning_rate": 4.999799067923527e-05,
"loss": 4.0009,
"step": 1643
},
{
"epoch": 0.15010087424344318,
"grad_norm": 3.8522801399230957,
"learning_rate": 4.999678487776908e-05,
"loss": 3.9695,
"step": 1674
},
{
"epoch": 0.1528805200627662,
"grad_norm": 3.519608736038208,
"learning_rate": 4.9995296990983006e-05,
"loss": 3.9632,
"step": 1705
},
{
"epoch": 0.15566016588208922,
"grad_norm": 3.812213897705078,
"learning_rate": 4.999352703566763e-05,
"loss": 3.9421,
"step": 1736
},
{
"epoch": 0.15843981170141225,
"grad_norm": 3.684826135635376,
"learning_rate": 4.999147503179668e-05,
"loss": 3.9056,
"step": 1767
},
{
"epoch": 0.16121945752073527,
"grad_norm": 3.6039719581604004,
"learning_rate": 4.998914100252672e-05,
"loss": 3.8937,
"step": 1798
},
{
"epoch": 0.1639991033400583,
"grad_norm": 3.4558818340301514,
"learning_rate": 4.998652497419696e-05,
"loss": 3.8713,
"step": 1829
},
{
"epoch": 0.16677874915938132,
"grad_norm": 3.3009331226348877,
"learning_rate": 4.9983626976328927e-05,
"loss": 3.8929,
"step": 1860
},
{
"epoch": 0.1695583949787043,
"grad_norm": 3.433112382888794,
"learning_rate": 4.998044704162613e-05,
"loss": 3.8611,
"step": 1891
},
{
"epoch": 0.17233804079802734,
"grad_norm": 3.2907469272613525,
"learning_rate": 4.9976985205973705e-05,
"loss": 3.8459,
"step": 1922
},
{
"epoch": 0.17511768661735036,
"grad_norm": 3.1534502506256104,
"learning_rate": 4.997324150843799e-05,
"loss": 3.8152,
"step": 1953
},
{
"epoch": 0.17789733243667338,
"grad_norm": 3.21655011177063,
"learning_rate": 4.99692159912661e-05,
"loss": 3.8318,
"step": 1984
},
{
"epoch": 0.1806769782559964,
"grad_norm": 3.26338529586792,
"learning_rate": 4.996490869988546e-05,
"loss": 3.8047,
"step": 2015
},
{
"epoch": 0.18345662407531943,
"grad_norm": 3.2187154293060303,
"learning_rate": 4.996031968290326e-05,
"loss": 3.8082,
"step": 2046
},
{
"epoch": 0.18623626989464245,
"grad_norm": 3.153524875640869,
"learning_rate": 4.995544899210594e-05,
"loss": 3.8092,
"step": 2077
},
{
"epoch": 0.18901591571396548,
"grad_norm": 3.033869504928589,
"learning_rate": 4.9950296682458583e-05,
"loss": 3.7729,
"step": 2108
},
{
"epoch": 0.1917955615332885,
"grad_norm": 3.2953057289123535,
"learning_rate": 4.994486281210429e-05,
"loss": 3.7558,
"step": 2139
},
{
"epoch": 0.19457520735261152,
"grad_norm": 3.220686912536621,
"learning_rate": 4.9939147442363566e-05,
"loss": 3.7482,
"step": 2170
},
{
"epoch": 0.19735485317193455,
"grad_norm": 3.0368878841400146,
"learning_rate": 4.9933150637733574e-05,
"loss": 3.7542,
"step": 2201
},
{
"epoch": 0.20013449899125757,
"grad_norm": 3.2065186500549316,
"learning_rate": 4.992687246588743e-05,
"loss": 3.7435,
"step": 2232
},
{
"epoch": 0.2029141448105806,
"grad_norm": 3.1103830337524414,
"learning_rate": 4.992031299767347e-05,
"loss": 3.7183,
"step": 2263
},
{
"epoch": 0.20569379062990362,
"grad_norm": 3.092560291290283,
"learning_rate": 4.9913472307114386e-05,
"loss": 3.7109,
"step": 2294
},
{
"epoch": 0.20847343644922664,
"grad_norm": 3.1417951583862305,
"learning_rate": 4.9906350471406446e-05,
"loss": 3.7034,
"step": 2325
},
{
"epoch": 0.21125308226854966,
"grad_norm": 3.0740373134613037,
"learning_rate": 4.989894757091861e-05,
"loss": 3.7072,
"step": 2356
},
{
"epoch": 0.2140327280878727,
"grad_norm": 3.1177079677581787,
"learning_rate": 4.989126368919158e-05,
"loss": 3.6724,
"step": 2387
},
{
"epoch": 0.21681237390719568,
"grad_norm": 2.946838855743408,
"learning_rate": 4.988329891293693e-05,
"loss": 3.6936,
"step": 2418
},
{
"epoch": 0.2195920197265187,
"grad_norm": 2.851635456085205,
"learning_rate": 4.987505333203608e-05,
"loss": 3.6925,
"step": 2449
},
{
"epoch": 0.22237166554584173,
"grad_norm": 2.841735601425171,
"learning_rate": 4.9866527039539276e-05,
"loss": 3.6946,
"step": 2480
},
{
"epoch": 0.22515131136516475,
"grad_norm": 2.942103385925293,
"learning_rate": 4.9857720131664594e-05,
"loss": 3.6578,
"step": 2511
},
{
"epoch": 0.22793095718448778,
"grad_norm": 2.8731508255004883,
"learning_rate": 4.9848632707796773e-05,
"loss": 3.6532,
"step": 2542
},
{
"epoch": 0.2307106030038108,
"grad_norm": 2.66916561126709,
"learning_rate": 4.9839264870486155e-05,
"loss": 3.6784,
"step": 2573
},
{
"epoch": 0.23349024882313382,
"grad_norm": 2.7230560779571533,
"learning_rate": 4.9829616725447526e-05,
"loss": 3.6274,
"step": 2604
},
{
"epoch": 0.23626989464245685,
"grad_norm": 2.6979317665100098,
"learning_rate": 4.981968838155888e-05,
"loss": 3.6508,
"step": 2635
},
{
"epoch": 0.23904954046177987,
"grad_norm": 2.828092098236084,
"learning_rate": 4.980947995086024e-05,
"loss": 3.6432,
"step": 2666
},
{
"epoch": 0.2418291862811029,
"grad_norm": 2.77974271774292,
"learning_rate": 4.979899154855234e-05,
"loss": 3.6416,
"step": 2697
},
{
"epoch": 0.24460883210042592,
"grad_norm": 2.941502809524536,
"learning_rate": 4.9788223292995386e-05,
"loss": 3.6298,
"step": 2728
},
{
"epoch": 0.24738847791974894,
"grad_norm": 2.8036599159240723,
"learning_rate": 4.977717530570768e-05,
"loss": 3.6196,
"step": 2759
},
{
"epoch": 0.25016812373907193,
"grad_norm": 3.6867358684539795,
"learning_rate": 4.976584771136425e-05,
"loss": 3.6403,
"step": 2790
},
{
"epoch": 0.252947769558395,
"grad_norm": 2.673445463180542,
"learning_rate": 4.975424063779547e-05,
"loss": 3.5951,
"step": 2821
},
{
"epoch": 0.255727415377718,
"grad_norm": 2.853241443634033,
"learning_rate": 4.974235421598557e-05,
"loss": 3.59,
"step": 2852
},
{
"epoch": 0.25850706119704103,
"grad_norm": 2.672149896621704,
"learning_rate": 4.973018858007122e-05,
"loss": 3.5891,
"step": 2883
},
{
"epoch": 0.26128670701636403,
"grad_norm": 2.8042409420013428,
"learning_rate": 4.9717743867339963e-05,
"loss": 3.6142,
"step": 2914
},
{
"epoch": 0.2640663528356871,
"grad_norm": 2.5302071571350098,
"learning_rate": 4.9705020218228695e-05,
"loss": 3.5835,
"step": 2945
},
{
"epoch": 0.2668459986550101,
"grad_norm": 2.7771754264831543,
"learning_rate": 4.969201777632205e-05,
"loss": 3.5982,
"step": 2976
},
{
"epoch": 0.2696256444743331,
"grad_norm": 2.5498504638671875,
"learning_rate": 4.9678736688350846e-05,
"loss": 3.5839,
"step": 3007
},
{
"epoch": 0.2724052902936561,
"grad_norm": 3.0297176837921143,
"learning_rate": 4.966517710419033e-05,
"loss": 3.5915,
"step": 3038
},
{
"epoch": 0.2751849361129792,
"grad_norm": 2.695497989654541,
"learning_rate": 4.965133917685858e-05,
"loss": 3.5526,
"step": 3069
},
{
"epoch": 0.27796458193230217,
"grad_norm": 2.634571075439453,
"learning_rate": 4.9637223062514714e-05,
"loss": 3.5695,
"step": 3100
},
{
"epoch": 0.2807442277516252,
"grad_norm": 2.6347239017486572,
"learning_rate": 4.962282892045718e-05,
"loss": 3.5694,
"step": 3131
},
{
"epoch": 0.2835238735709482,
"grad_norm": 2.5763373374938965,
"learning_rate": 4.9608156913121904e-05,
"loss": 3.5427,
"step": 3162
},
{
"epoch": 0.28630351939027127,
"grad_norm": 2.5541677474975586,
"learning_rate": 4.959320720608049e-05,
"loss": 3.5716,
"step": 3193
},
{
"epoch": 0.28908316520959426,
"grad_norm": 2.656294822692871,
"learning_rate": 4.9577979968038354e-05,
"loss": 3.5053,
"step": 3224
},
{
"epoch": 0.29186281102891726,
"grad_norm": 2.5195116996765137,
"learning_rate": 4.956247537083282e-05,
"loss": 3.5467,
"step": 3255
},
{
"epoch": 0.2946424568482403,
"grad_norm": 2.6532673835754395,
"learning_rate": 4.9546693589431145e-05,
"loss": 3.5251,
"step": 3286
},
{
"epoch": 0.2974221026675633,
"grad_norm": 2.37542462348938,
"learning_rate": 4.9530634801928595e-05,
"loss": 3.5256,
"step": 3317
},
{
"epoch": 0.30020174848688636,
"grad_norm": 2.5843489170074463,
"learning_rate": 4.9514299189546395e-05,
"loss": 3.5313,
"step": 3348
},
{
"epoch": 0.30298139430620935,
"grad_norm": 2.532468557357788,
"learning_rate": 4.949768693662973e-05,
"loss": 3.5038,
"step": 3379
},
{
"epoch": 0.3057610401255324,
"grad_norm": 2.527435302734375,
"learning_rate": 4.948079823064559e-05,
"loss": 3.5093,
"step": 3410
},
{
"epoch": 0.3085406859448554,
"grad_norm": 2.451413869857788,
"learning_rate": 4.946363326218074e-05,
"loss": 3.5185,
"step": 3441
},
{
"epoch": 0.31132033176417845,
"grad_norm": 2.611423969268799,
"learning_rate": 4.9446192224939525e-05,
"loss": 3.5165,
"step": 3472
},
{
"epoch": 0.31409997758350144,
"grad_norm": 2.503922939300537,
"learning_rate": 4.942847531574167e-05,
"loss": 3.4938,
"step": 3503
},
{
"epoch": 0.3168796234028245,
"grad_norm": 2.561654806137085,
"learning_rate": 4.941048273452008e-05,
"loss": 3.4674,
"step": 3534
},
{
"epoch": 0.3196592692221475,
"grad_norm": 3.26798152923584,
"learning_rate": 4.9392214684318605e-05,
"loss": 3.4999,
"step": 3565
},
{
"epoch": 0.32243891504147054,
"grad_norm": 2.450745105743408,
"learning_rate": 4.93736713712897e-05,
"loss": 3.4882,
"step": 3596
},
{
"epoch": 0.32521856086079354,
"grad_norm": 2.631754159927368,
"learning_rate": 4.9354853004692124e-05,
"loss": 3.4813,
"step": 3627
},
{
"epoch": 0.3279982066801166,
"grad_norm": 2.5118045806884766,
"learning_rate": 4.93357597968886e-05,
"loss": 3.4987,
"step": 3658
},
{
"epoch": 0.3307778524994396,
"grad_norm": 2.5276923179626465,
"learning_rate": 4.931639196334338e-05,
"loss": 3.4835,
"step": 3689
},
{
"epoch": 0.33355749831876264,
"grad_norm": 2.574352502822876,
"learning_rate": 4.9296749722619826e-05,
"loss": 3.5156,
"step": 3720
},
{
"epoch": 0.33633714413808563,
"grad_norm": 2.3906681537628174,
"learning_rate": 4.9276833296377966e-05,
"loss": 3.4983,
"step": 3751
},
{
"epoch": 0.3391167899574086,
"grad_norm": 2.3432400226593018,
"learning_rate": 4.925664290937196e-05,
"loss": 3.4442,
"step": 3782
},
{
"epoch": 0.3418964357767317,
"grad_norm": 2.4267373085021973,
"learning_rate": 4.9236178789447576e-05,
"loss": 3.4829,
"step": 3813
},
{
"epoch": 0.3446760815960547,
"grad_norm": 2.3619213104248047,
"learning_rate": 4.921544116753962e-05,
"loss": 3.4604,
"step": 3844
},
{
"epoch": 0.3474557274153777,
"grad_norm": 2.476457357406616,
"learning_rate": 4.919443027766935e-05,
"loss": 3.4714,
"step": 3875
},
{
"epoch": 0.3502353732347007,
"grad_norm": 2.2075233459472656,
"learning_rate": 4.91731463569418e-05,
"loss": 3.4627,
"step": 3906
},
{
"epoch": 0.35301501905402377,
"grad_norm": 2.3251192569732666,
"learning_rate": 4.915158964554312e-05,
"loss": 3.4641,
"step": 3937
},
{
"epoch": 0.35579466487334677,
"grad_norm": 2.4196231365203857,
"learning_rate": 4.912976038673786e-05,
"loss": 3.4076,
"step": 3968
},
{
"epoch": 0.3585743106926698,
"grad_norm": 2.4416725635528564,
"learning_rate": 4.9107658826866254e-05,
"loss": 3.463,
"step": 3999
},
{
"epoch": 0.3613539565119928,
"grad_norm": 2.3500940799713135,
"learning_rate": 4.908528521534139e-05,
"loss": 3.478,
"step": 4030
},
{
"epoch": 0.36413360233131586,
"grad_norm": 2.3429739475250244,
"learning_rate": 4.906263980464644e-05,
"loss": 3.4437,
"step": 4061
},
{
"epoch": 0.36691324815063886,
"grad_norm": 2.5604615211486816,
"learning_rate": 4.903972285033178e-05,
"loss": 3.4703,
"step": 4092
},
{
"epoch": 0.3696928939699619,
"grad_norm": 2.4725446701049805,
"learning_rate": 4.901653461101213e-05,
"loss": 3.4554,
"step": 4123
},
{
"epoch": 0.3724725397892849,
"grad_norm": 2.291586399078369,
"learning_rate": 4.8993075348363626e-05,
"loss": 3.4455,
"step": 4154
},
{
"epoch": 0.37525218560860796,
"grad_norm": 2.3323886394500732,
"learning_rate": 4.896934532712084e-05,
"loss": 3.4203,
"step": 4185
},
{
"epoch": 0.37803183142793095,
"grad_norm": 2.3561739921569824,
"learning_rate": 4.8945344815073846e-05,
"loss": 3.4557,
"step": 4216
},
{
"epoch": 0.380811477247254,
"grad_norm": 2.3202600479125977,
"learning_rate": 4.892107408306516e-05,
"loss": 3.4446,
"step": 4247
},
{
"epoch": 0.383591123066577,
"grad_norm": 2.3766143321990967,
"learning_rate": 4.889653340498669e-05,
"loss": 3.4709,
"step": 4278
},
{
"epoch": 0.3863707688859,
"grad_norm": 2.1723828315734863,
"learning_rate": 4.8871723057776664e-05,
"loss": 3.4439,
"step": 4309
},
{
"epoch": 0.38915041470522305,
"grad_norm": 2.4008755683898926,
"learning_rate": 4.8846643321416476e-05,
"loss": 3.4173,
"step": 4340
},
{
"epoch": 0.39193006052454604,
"grad_norm": 2.351132869720459,
"learning_rate": 4.882129447892753e-05,
"loss": 3.4055,
"step": 4371
},
{
"epoch": 0.3947097063438691,
"grad_norm": 2.4292497634887695,
"learning_rate": 4.8795676816368076e-05,
"loss": 3.4096,
"step": 4402
},
{
"epoch": 0.3974893521631921,
"grad_norm": 2.3505067825317383,
"learning_rate": 4.876979062282995e-05,
"loss": 3.4164,
"step": 4433
},
{
"epoch": 0.40026899798251514,
"grad_norm": 2.372157335281372,
"learning_rate": 4.8743636190435325e-05,
"loss": 3.4229,
"step": 4464
},
{
"epoch": 0.40304864380183814,
"grad_norm": 2.3175127506256104,
"learning_rate": 4.871721381433344e-05,
"loss": 3.4155,
"step": 4495
},
{
"epoch": 0.4058282896211612,
"grad_norm": 2.3510518074035645,
"learning_rate": 4.869052379269719e-05,
"loss": 3.4177,
"step": 4526
},
{
"epoch": 0.4086079354404842,
"grad_norm": 2.3506274223327637,
"learning_rate": 4.866356642671985e-05,
"loss": 3.4048,
"step": 4557
},
{
"epoch": 0.41138758125980723,
"grad_norm": 2.2985119819641113,
"learning_rate": 4.8636342020611634e-05,
"loss": 3.4022,
"step": 4588
},
{
"epoch": 0.41416722707913023,
"grad_norm": 2.342514753341675,
"learning_rate": 4.860885088159626e-05,
"loss": 3.4082,
"step": 4619
},
{
"epoch": 0.4169468728984533,
"grad_norm": 2.2110369205474854,
"learning_rate": 4.858109331990751e-05,
"loss": 3.3916,
"step": 4650
},
{
"epoch": 0.4197265187177763,
"grad_norm": 2.2878293991088867,
"learning_rate": 4.855306964878567e-05,
"loss": 3.3851,
"step": 4681
},
{
"epoch": 0.4225061645370993,
"grad_norm": 2.3494646549224854,
"learning_rate": 4.8524780184474084e-05,
"loss": 3.4091,
"step": 4712
},
{
"epoch": 0.4252858103564223,
"grad_norm": 2.3722164630889893,
"learning_rate": 4.8496225246215496e-05,
"loss": 3.3858,
"step": 4743
},
{
"epoch": 0.4280654561757454,
"grad_norm": 2.4473116397857666,
"learning_rate": 4.8467405156248505e-05,
"loss": 3.4045,
"step": 4774
},
{
"epoch": 0.43084510199506837,
"grad_norm": 2.6271281242370605,
"learning_rate": 4.843832023980392e-05,
"loss": 3.3902,
"step": 4805
},
{
"epoch": 0.43362474781439136,
"grad_norm": 2.2729909420013428,
"learning_rate": 4.840897082510106e-05,
"loss": 3.394,
"step": 4836
},
{
"epoch": 0.4364043936337144,
"grad_norm": 2.2466001510620117,
"learning_rate": 4.8379357243344084e-05,
"loss": 3.4198,
"step": 4867
},
{
"epoch": 0.4391840394530374,
"grad_norm": 2.2691309452056885,
"learning_rate": 4.8349479828718236e-05,
"loss": 3.4044,
"step": 4898
},
{
"epoch": 0.44196368527236046,
"grad_norm": 2.3264501094818115,
"learning_rate": 4.8319338918386075e-05,
"loss": 3.407,
"step": 4929
},
{
"epoch": 0.44474333109168346,
"grad_norm": 2.199679374694824,
"learning_rate": 4.828893485248369e-05,
"loss": 3.3488,
"step": 4960
},
{
"epoch": 0.4475229769110065,
"grad_norm": 2.2347097396850586,
"learning_rate": 4.825826797411682e-05,
"loss": 3.3713,
"step": 4991
},
{
"epoch": 0.4503026227303295,
"grad_norm": 2.107295036315918,
"learning_rate": 4.822733862935702e-05,
"loss": 3.3618,
"step": 5022
},
{
"epoch": 0.45308226854965256,
"grad_norm": 2.2157092094421387,
"learning_rate": 4.819614716723775e-05,
"loss": 3.3641,
"step": 5053
},
{
"epoch": 0.45586191436897555,
"grad_norm": 2.1321468353271484,
"learning_rate": 4.8164693939750425e-05,
"loss": 3.3874,
"step": 5084
},
{
"epoch": 0.4586415601882986,
"grad_norm": 2.1471335887908936,
"learning_rate": 4.813297930184042e-05,
"loss": 3.3489,
"step": 5115
},
{
"epoch": 0.4614212060076216,
"grad_norm": 2.474858045578003,
"learning_rate": 4.810100361140314e-05,
"loss": 3.3785,
"step": 5146
},
{
"epoch": 0.46420085182694465,
"grad_norm": 2.2046587467193604,
"learning_rate": 4.8068767229279885e-05,
"loss": 3.3427,
"step": 5177
},
{
"epoch": 0.46698049764626764,
"grad_norm": 2.2125439643859863,
"learning_rate": 4.8036270519253854e-05,
"loss": 3.3519,
"step": 5208
},
{
"epoch": 0.4697601434655907,
"grad_norm": 2.2569172382354736,
"learning_rate": 4.8003513848046e-05,
"loss": 3.3803,
"step": 5239
},
{
"epoch": 0.4725397892849137,
"grad_norm": 2.0965545177459717,
"learning_rate": 4.79704975853109e-05,
"loss": 3.3394,
"step": 5270
},
{
"epoch": 0.47531943510423674,
"grad_norm": 2.2493419647216797,
"learning_rate": 4.793722210363262e-05,
"loss": 3.3476,
"step": 5301
},
{
"epoch": 0.47809908092355974,
"grad_norm": 2.2152726650238037,
"learning_rate": 4.7903687778520414e-05,
"loss": 3.3549,
"step": 5332
},
{
"epoch": 0.4808787267428828,
"grad_norm": 2.2980692386627197,
"learning_rate": 4.7869894988404593e-05,
"loss": 3.3661,
"step": 5363
},
{
"epoch": 0.4836583725622058,
"grad_norm": 2.221606731414795,
"learning_rate": 4.783584411463221e-05,
"loss": 3.3421,
"step": 5394
},
{
"epoch": 0.4864380183815288,
"grad_norm": 2.5827386379241943,
"learning_rate": 4.780153554146274e-05,
"loss": 3.3719,
"step": 5425
},
{
"epoch": 0.48921766420085183,
"grad_norm": 2.4041783809661865,
"learning_rate": 4.7766969656063766e-05,
"loss": 3.3744,
"step": 5456
},
{
"epoch": 0.4919973100201748,
"grad_norm": 2.113762140274048,
"learning_rate": 4.773214684850662e-05,
"loss": 3.3434,
"step": 5487
},
{
"epoch": 0.4947769558394979,
"grad_norm": 2.2005934715270996,
"learning_rate": 4.769706751176193e-05,
"loss": 3.3319,
"step": 5518
},
{
"epoch": 0.4975566016588209,
"grad_norm": 2.1655445098876953,
"learning_rate": 4.7661732041695264e-05,
"loss": 3.337,
"step": 5549
},
{
"epoch": 0.5003362474781439,
"grad_norm": 2.1906204223632812,
"learning_rate": 4.762614083706258e-05,
"loss": 3.3352,
"step": 5580
},
{
"epoch": 0.503115893297467,
"grad_norm": 2.1618640422821045,
"learning_rate": 4.759029429950581e-05,
"loss": 3.3002,
"step": 5611
},
{
"epoch": 0.50589553911679,
"grad_norm": 2.196354389190674,
"learning_rate": 4.7554192833548235e-05,
"loss": 3.3706,
"step": 5642
},
{
"epoch": 0.508675184936113,
"grad_norm": 2.258183717727661,
"learning_rate": 4.751783684659e-05,
"loss": 3.3336,
"step": 5673
},
{
"epoch": 0.511454830755436,
"grad_norm": 2.233896493911743,
"learning_rate": 4.748122674890348e-05,
"loss": 3.3192,
"step": 5704
},
{
"epoch": 0.5142344765747591,
"grad_norm": 2.067518949508667,
"learning_rate": 4.7444362953628654e-05,
"loss": 3.3154,
"step": 5735
},
{
"epoch": 0.5170141223940821,
"grad_norm": 2.074993133544922,
"learning_rate": 4.7407245876768424e-05,
"loss": 3.3178,
"step": 5766
},
{
"epoch": 0.5197937682134051,
"grad_norm": 2.1448609828948975,
"learning_rate": 4.736987593718397e-05,
"loss": 3.3403,
"step": 5797
},
{
"epoch": 0.5225734140327281,
"grad_norm": 2.1654341220855713,
"learning_rate": 4.733225355658999e-05,
"loss": 3.3061,
"step": 5828
},
{
"epoch": 0.5253530598520512,
"grad_norm": 2.3800811767578125,
"learning_rate": 4.7294379159549926e-05,
"loss": 3.3461,
"step": 5859
},
{
"epoch": 0.5281327056713742,
"grad_norm": 2.2466440200805664,
"learning_rate": 4.725625317347119e-05,
"loss": 3.3161,
"step": 5890
},
{
"epoch": 0.5309123514906972,
"grad_norm": 2.2604174613952637,
"learning_rate": 4.7217876028600374e-05,
"loss": 3.3191,
"step": 5921
},
{
"epoch": 0.5336919973100201,
"grad_norm": 2.1068217754364014,
"learning_rate": 4.717924815801832e-05,
"loss": 3.3305,
"step": 5952
},
{
"epoch": 0.5364716431293431,
"grad_norm": 2.078378200531006,
"learning_rate": 4.714036999763532e-05,
"loss": 3.3105,
"step": 5983
},
{
"epoch": 0.5392512889486663,
"grad_norm": 2.120563507080078,
"learning_rate": 4.7101241986186116e-05,
"loss": 3.3029,
"step": 6014
},
{
"epoch": 0.5420309347679892,
"grad_norm": 2.4018023014068604,
"learning_rate": 4.7061864565225e-05,
"loss": 3.3271,
"step": 6045
},
{
"epoch": 0.5448105805873122,
"grad_norm": 2.1769769191741943,
"learning_rate": 4.702223817912081e-05,
"loss": 3.3202,
"step": 6076
},
{
"epoch": 0.5475902264066352,
"grad_norm": 2.2606041431427,
"learning_rate": 4.698236327505195e-05,
"loss": 3.328,
"step": 6107
},
{
"epoch": 0.5503698722259583,
"grad_norm": 2.0148673057556152,
"learning_rate": 4.694224030300127e-05,
"loss": 3.321,
"step": 6138
},
{
"epoch": 0.5531495180452813,
"grad_norm": 2.204648494720459,
"learning_rate": 4.690186971575107e-05,
"loss": 3.3163,
"step": 6169
},
{
"epoch": 0.5559291638646043,
"grad_norm": 2.116101026535034,
"learning_rate": 4.6861251968877916e-05,
"loss": 3.3168,
"step": 6200
},
{
"epoch": 0.5587088096839273,
"grad_norm": 2.1526737213134766,
"learning_rate": 4.68203875207476e-05,
"loss": 3.3286,
"step": 6231
},
{
"epoch": 0.5614884555032504,
"grad_norm": 2.1621978282928467,
"learning_rate": 4.677927683250983e-05,
"loss": 3.2935,
"step": 6262
},
{
"epoch": 0.5642681013225734,
"grad_norm": 2.039001941680908,
"learning_rate": 4.6737920368093156e-05,
"loss": 3.2964,
"step": 6293
},
{
"epoch": 0.5670477471418964,
"grad_norm": 2.0557541847229004,
"learning_rate": 4.669631859419965e-05,
"loss": 3.2933,
"step": 6324
},
{
"epoch": 0.5698273929612194,
"grad_norm": 2.0704591274261475,
"learning_rate": 4.6654471980299676e-05,
"loss": 3.2733,
"step": 6355
},
{
"epoch": 0.5726070387805425,
"grad_norm": 2.0741264820098877,
"learning_rate": 4.661238099862658e-05,
"loss": 3.2808,
"step": 6386
},
{
"epoch": 0.5753866845998655,
"grad_norm": 2.081774950027466,
"learning_rate": 4.657004612417138e-05,
"loss": 3.3013,
"step": 6417
},
{
"epoch": 0.5781663304191885,
"grad_norm": 2.3074076175689697,
"learning_rate": 4.6527467834677374e-05,
"loss": 3.3015,
"step": 6448
},
{
"epoch": 0.5809459762385115,
"grad_norm": 2.070481538772583,
"learning_rate": 4.648464661063478e-05,
"loss": 3.3201,
"step": 6479
},
{
"epoch": 0.5837256220578345,
"grad_norm": 2.8896186351776123,
"learning_rate": 4.6441582935275264e-05,
"loss": 3.2986,
"step": 6510
},
{
"epoch": 0.5865052678771576,
"grad_norm": 2.132897138595581,
"learning_rate": 4.6398277294566586e-05,
"loss": 3.3078,
"step": 6541
},
{
"epoch": 0.5892849136964806,
"grad_norm": 2.049975633621216,
"learning_rate": 4.6354730177207e-05,
"loss": 3.3065,
"step": 6572
},
{
"epoch": 0.5920645595158036,
"grad_norm": 2.059072494506836,
"learning_rate": 4.6310942074619787e-05,
"loss": 3.2919,
"step": 6603
},
{
"epoch": 0.5948442053351266,
"grad_norm": 2.1366543769836426,
"learning_rate": 4.626691348094777e-05,
"loss": 3.318,
"step": 6634
},
{
"epoch": 0.5976238511544497,
"grad_norm": 2.196328639984131,
"learning_rate": 4.622264489304762e-05,
"loss": 3.2828,
"step": 6665
},
{
"epoch": 0.6004034969737727,
"grad_norm": 2.102091073989868,
"learning_rate": 4.617813681048434e-05,
"loss": 3.2776,
"step": 6696
},
{
"epoch": 0.6031831427930957,
"grad_norm": 2.030512809753418,
"learning_rate": 4.61333897355256e-05,
"loss": 3.3112,
"step": 6727
},
{
"epoch": 0.6059627886124187,
"grad_norm": 2.165297031402588,
"learning_rate": 4.608840417313604e-05,
"loss": 3.2957,
"step": 6758
},
{
"epoch": 0.6087424344317418,
"grad_norm": 2.1030540466308594,
"learning_rate": 4.6043180630971646e-05,
"loss": 3.2941,
"step": 6789
},
{
"epoch": 0.6115220802510648,
"grad_norm": 2.4561312198638916,
"learning_rate": 4.599771961937391e-05,
"loss": 3.3059,
"step": 6820
},
{
"epoch": 0.6143017260703878,
"grad_norm": 2.0763375759124756,
"learning_rate": 4.5952021651364204e-05,
"loss": 3.298,
"step": 6851
},
{
"epoch": 0.6170813718897108,
"grad_norm": 2.0811164379119873,
"learning_rate": 4.590608724263786e-05,
"loss": 3.2479,
"step": 6882
},
{
"epoch": 0.6198610177090339,
"grad_norm": 2.046067476272583,
"learning_rate": 4.585991691155845e-05,
"loss": 3.2624,
"step": 6913
},
{
"epoch": 0.6226406635283569,
"grad_norm": 2.2052958011627197,
"learning_rate": 4.581351117915188e-05,
"loss": 3.281,
"step": 6944
},
{
"epoch": 0.6254203093476799,
"grad_norm": 1.959435224533081,
"learning_rate": 4.5766870569100534e-05,
"loss": 3.2629,
"step": 6975
},
{
"epoch": 0.6281999551670029,
"grad_norm": 2.1152093410491943,
"learning_rate": 4.571999560773736e-05,
"loss": 3.2438,
"step": 7006
},
{
"epoch": 0.6309796009863259,
"grad_norm": 1.990297555923462,
"learning_rate": 4.5672886824039915e-05,
"loss": 3.2702,
"step": 7037
},
{
"epoch": 0.633759246805649,
"grad_norm": 1.9708343744277954,
"learning_rate": 4.5625544749624435e-05,
"loss": 3.2711,
"step": 7068
},
{
"epoch": 0.636538892624972,
"grad_norm": 2.0719597339630127,
"learning_rate": 4.5577969918739794e-05,
"loss": 3.2617,
"step": 7099
},
{
"epoch": 0.639318538444295,
"grad_norm": 1.9742841720581055,
"learning_rate": 4.5530162868261486e-05,
"loss": 3.2435,
"step": 7130
},
{
"epoch": 0.642098184263618,
"grad_norm": 1.9801511764526367,
"learning_rate": 4.548212413768558e-05,
"loss": 3.2844,
"step": 7161
},
{
"epoch": 0.6448778300829411,
"grad_norm": 1.896723985671997,
"learning_rate": 4.543385426912261e-05,
"loss": 3.2623,
"step": 7192
},
{
"epoch": 0.6476574759022641,
"grad_norm": 1.9870046377182007,
"learning_rate": 4.53853538072915e-05,
"loss": 3.2629,
"step": 7223
},
{
"epoch": 0.6504371217215871,
"grad_norm": 1.9941246509552002,
"learning_rate": 4.533662329951336e-05,
"loss": 3.2689,
"step": 7254
},
{
"epoch": 0.6532167675409101,
"grad_norm": 2.104779005050659,
"learning_rate": 4.528766329570536e-05,
"loss": 3.2661,
"step": 7285
},
{
"epoch": 0.6559964133602332,
"grad_norm": 2.0418860912323,
"learning_rate": 4.523847434837447e-05,
"loss": 3.2368,
"step": 7316
},
{
"epoch": 0.6587760591795562,
"grad_norm": 2.008305549621582,
"learning_rate": 4.518905701261128e-05,
"loss": 3.2292,
"step": 7347
},
{
"epoch": 0.6615557049988792,
"grad_norm": 1.9720836877822876,
"learning_rate": 4.5139411846083715e-05,
"loss": 3.2326,
"step": 7378
},
{
"epoch": 0.6643353508182022,
"grad_norm": 2.144597291946411,
"learning_rate": 4.508953940903073e-05,
"loss": 3.2545,
"step": 7409
},
{
"epoch": 0.6671149966375253,
"grad_norm": 2.044677972793579,
"learning_rate": 4.5039440264255994e-05,
"loss": 3.2522,
"step": 7440
},
{
"epoch": 0.6698946424568483,
"grad_norm": 2.0666587352752686,
"learning_rate": 4.498911497712155e-05,
"loss": 3.2563,
"step": 7471
},
{
"epoch": 0.6726742882761713,
"grad_norm": 2.0450077056884766,
"learning_rate": 4.493856411554142e-05,
"loss": 3.2622,
"step": 7502
},
{
"epoch": 0.6754539340954943,
"grad_norm": 2.0837104320526123,
"learning_rate": 4.4887788249975206e-05,
"loss": 3.2166,
"step": 7533
},
{
"epoch": 0.6782335799148173,
"grad_norm": 1.960066556930542,
"learning_rate": 4.4836787953421656e-05,
"loss": 3.2551,
"step": 7564
},
{
"epoch": 0.6810132257341404,
"grad_norm": 2.063417673110962,
"learning_rate": 4.478556380141218e-05,
"loss": 3.26,
"step": 7595
},
{
"epoch": 0.6837928715534634,
"grad_norm": 2.146414279937744,
"learning_rate": 4.4734116372004375e-05,
"loss": 3.242,
"step": 7626
},
{
"epoch": 0.6865725173727864,
"grad_norm": 2.0153274536132812,
"learning_rate": 4.4682446245775477e-05,
"loss": 3.2638,
"step": 7657
},
{
"epoch": 0.6893521631921093,
"grad_norm": 2.0109503269195557,
"learning_rate": 4.463055400581586e-05,
"loss": 3.2362,
"step": 7688
},
{
"epoch": 0.6921318090114325,
"grad_norm": 2.022961378097534,
"learning_rate": 4.4578440237722374e-05,
"loss": 3.2551,
"step": 7719
},
{
"epoch": 0.6949114548307554,
"grad_norm": 2.004865884780884,
"learning_rate": 4.452610552959183e-05,
"loss": 3.2195,
"step": 7750
},
{
"epoch": 0.6976911006500784,
"grad_norm": 2.102989435195923,
"learning_rate": 4.447355047201428e-05,
"loss": 3.2446,
"step": 7781
},
{
"epoch": 0.7004707464694014,
"grad_norm": 2.077101230621338,
"learning_rate": 4.4420775658066414e-05,
"loss": 3.2212,
"step": 7812
},
{
"epoch": 0.7032503922887245,
"grad_norm": 2.093027114868164,
"learning_rate": 4.436778168330484e-05,
"loss": 3.2369,
"step": 7843
},
{
"epoch": 0.7060300381080475,
"grad_norm": 1.9160785675048828,
"learning_rate": 4.4314569145759353e-05,
"loss": 3.2311,
"step": 7874
},
{
"epoch": 0.7088096839273705,
"grad_norm": 2.0607402324676514,
"learning_rate": 4.42611386459262e-05,
"loss": 3.2522,
"step": 7905
},
{
"epoch": 0.7115893297466935,
"grad_norm": 2.021075487136841,
"learning_rate": 4.420749078676133e-05,
"loss": 3.2234,
"step": 7936
},
{
"epoch": 0.7143689755660166,
"grad_norm": 2.0619757175445557,
"learning_rate": 4.4153626173673516e-05,
"loss": 3.225,
"step": 7967
},
{
"epoch": 0.7171486213853396,
"grad_norm": 1.9858311414718628,
"learning_rate": 4.409954541451762e-05,
"loss": 3.2551,
"step": 7998
},
{
"epoch": 0.7199282672046626,
"grad_norm": 2.0410025119781494,
"learning_rate": 4.404524911958764e-05,
"loss": 3.2265,
"step": 8029
},
{
"epoch": 0.7227079130239856,
"grad_norm": 1.9730476140975952,
"learning_rate": 4.399073790160989e-05,
"loss": 3.2298,
"step": 8060
},
{
"epoch": 0.7254875588433086,
"grad_norm": 2.037961006164551,
"learning_rate": 4.393601237573607e-05,
"loss": 3.2282,
"step": 8091
},
{
"epoch": 0.7282672046626317,
"grad_norm": 1.9044350385665894,
"learning_rate": 4.388107315953628e-05,
"loss": 3.2495,
"step": 8122
},
{
"epoch": 0.7310468504819547,
"grad_norm": 2.097494602203369,
"learning_rate": 4.382592087299212e-05,
"loss": 3.2229,
"step": 8153
},
{
"epoch": 0.7338264963012777,
"grad_norm": 2.014116048812866,
"learning_rate": 4.377055613848964e-05,
"loss": 3.2354,
"step": 8184
},
{
"epoch": 0.7366061421206007,
"grad_norm": 1.8900959491729736,
"learning_rate": 4.3714979580812355e-05,
"loss": 3.2029,
"step": 8215
},
{
"epoch": 0.7393857879399238,
"grad_norm": 2.092454195022583,
"learning_rate": 4.365919182713416e-05,
"loss": 3.1969,
"step": 8246
},
{
"epoch": 0.7421654337592468,
"grad_norm": 1.9832301139831543,
"learning_rate": 4.360319350701226e-05,
"loss": 3.2563,
"step": 8277
},
{
"epoch": 0.7449450795785698,
"grad_norm": 2.038874387741089,
"learning_rate": 4.3546985252380115e-05,
"loss": 3.2226,
"step": 8308
},
{
"epoch": 0.7477247253978928,
"grad_norm": 1.9169414043426514,
"learning_rate": 4.349056769754021e-05,
"loss": 3.2469,
"step": 8339
},
{
"epoch": 0.7505043712172159,
"grad_norm": 1.8921377658843994,
"learning_rate": 4.3433941479156994e-05,
"loss": 3.1965,
"step": 8370
},
{
"epoch": 0.7532840170365389,
"grad_norm": 1.893687129020691,
"learning_rate": 4.3377107236249647e-05,
"loss": 3.2391,
"step": 8401
},
{
"epoch": 0.7560636628558619,
"grad_norm": 1.9864296913146973,
"learning_rate": 4.332006561018488e-05,
"loss": 3.2318,
"step": 8432
},
{
"epoch": 0.7588433086751849,
"grad_norm": 2.1311209201812744,
"learning_rate": 4.3262817244669683e-05,
"loss": 3.2566,
"step": 8463
},
{
"epoch": 0.761622954494508,
"grad_norm": 1.9401289224624634,
"learning_rate": 4.3205362785744083e-05,
"loss": 3.202,
"step": 8494
},
{
"epoch": 0.764402600313831,
"grad_norm": 1.9595686197280884,
"learning_rate": 4.314770288177384e-05,
"loss": 3.2308,
"step": 8525
},
{
"epoch": 0.767182246133154,
"grad_norm": 2.201599597930908,
"learning_rate": 4.308983818344313e-05,
"loss": 3.2234,
"step": 8556
},
{
"epoch": 0.769961891952477,
"grad_norm": 2.4435086250305176,
"learning_rate": 4.3031769343747206e-05,
"loss": 3.2091,
"step": 8587
},
{
"epoch": 0.7727415377718,
"grad_norm": 1.9813133478164673,
"learning_rate": 4.297349701798505e-05,
"loss": 3.2196,
"step": 8618
},
{
"epoch": 0.7755211835911231,
"grad_norm": 2.008185863494873,
"learning_rate": 4.2915021863751916e-05,
"loss": 3.2411,
"step": 8649
},
{
"epoch": 0.7783008294104461,
"grad_norm": 1.8839123249053955,
"learning_rate": 4.285634454093198e-05,
"loss": 3.2193,
"step": 8680
},
{
"epoch": 0.7810804752297691,
"grad_norm": 1.9559476375579834,
"learning_rate": 4.279746571169086e-05,
"loss": 3.1888,
"step": 8711
},
{
"epoch": 0.7838601210490921,
"grad_norm": 2.0515148639678955,
"learning_rate": 4.2738386040468136e-05,
"loss": 3.1983,
"step": 8742
},
{
"epoch": 0.7866397668684152,
"grad_norm": 2.064584255218506,
"learning_rate": 4.2679106193969866e-05,
"loss": 3.2198,
"step": 8773
},
{
"epoch": 0.7894194126877382,
"grad_norm": 1.9931387901306152,
"learning_rate": 4.261962684116106e-05,
"loss": 3.1869,
"step": 8804
},
{
"epoch": 0.7921990585070612,
"grad_norm": 1.978033423423767,
"learning_rate": 4.2559948653258145e-05,
"loss": 3.206,
"step": 8835
},
{
"epoch": 0.7949787043263842,
"grad_norm": 1.9826278686523438,
"learning_rate": 4.250007230372134e-05,
"loss": 3.2089,
"step": 8866
},
{
"epoch": 0.7977583501457073,
"grad_norm": 1.8395289182662964,
"learning_rate": 4.2439998468247126e-05,
"loss": 3.1741,
"step": 8897
},
{
"epoch": 0.8005379959650303,
"grad_norm": 1.8760936260223389,
"learning_rate": 4.2379727824760566e-05,
"loss": 3.1888,
"step": 8928
},
{
"epoch": 0.8033176417843533,
"grad_norm": 2.04682993888855,
"learning_rate": 4.231926105340768e-05,
"loss": 3.2082,
"step": 8959
},
{
"epoch": 0.8060972876036763,
"grad_norm": 2.0639662742614746,
"learning_rate": 4.225859883654776e-05,
"loss": 3.2422,
"step": 8990
},
{
"epoch": 0.8088769334229994,
"grad_norm": 1.8841536045074463,
"learning_rate": 4.219774185874569e-05,
"loss": 3.2068,
"step": 9021
},
{
"epoch": 0.8116565792423224,
"grad_norm": 1.9644831418991089,
"learning_rate": 4.213669080676418e-05,
"loss": 3.2078,
"step": 9052
},
{
"epoch": 0.8144362250616454,
"grad_norm": 1.933451771736145,
"learning_rate": 4.2075446369556056e-05,
"loss": 3.2052,
"step": 9083
},
{
"epoch": 0.8172158708809684,
"grad_norm": 1.968398094177246,
"learning_rate": 4.201400923825648e-05,
"loss": 3.185,
"step": 9114
},
{
"epoch": 0.8199955167002914,
"grad_norm": 1.9496296644210815,
"learning_rate": 4.195238010617511e-05,
"loss": 3.1678,
"step": 9145
},
{
"epoch": 0.8227751625196145,
"grad_norm": 1.933290958404541,
"learning_rate": 4.1890559668788344e-05,
"loss": 3.1987,
"step": 9176
},
{
"epoch": 0.8255548083389375,
"grad_norm": 1.92232346534729,
"learning_rate": 4.1828548623731405e-05,
"loss": 3.2024,
"step": 9207
},
{
"epoch": 0.8283344541582605,
"grad_norm": 2.029100179672241,
"learning_rate": 4.1766347670790506e-05,
"loss": 3.1616,
"step": 9238
},
{
"epoch": 0.8311140999775835,
"grad_norm": 2.071643114089966,
"learning_rate": 4.170395751189495e-05,
"loss": 3.1797,
"step": 9269
},
{
"epoch": 0.8338937457969066,
"grad_norm": 1.8804633617401123,
"learning_rate": 4.164137885110921e-05,
"loss": 3.1841,
"step": 9300
},
{
"epoch": 0.8366733916162296,
"grad_norm": 1.929567813873291,
"learning_rate": 4.157861239462495e-05,
"loss": 3.177,
"step": 9331
},
{
"epoch": 0.8394530374355526,
"grad_norm": 1.9015624523162842,
"learning_rate": 4.1515658850753114e-05,
"loss": 3.2134,
"step": 9362
},
{
"epoch": 0.8422326832548755,
"grad_norm": 1.91888427734375,
"learning_rate": 4.145251892991588e-05,
"loss": 3.1466,
"step": 9393
},
{
"epoch": 0.8450123290741987,
"grad_norm": 2.033877372741699,
"learning_rate": 4.138919334463868e-05,
"loss": 3.1802,
"step": 9424
},
{
"epoch": 0.8477919748935216,
"grad_norm": 1.9958734512329102,
"learning_rate": 4.1325682809542124e-05,
"loss": 3.17,
"step": 9455
},
{
"epoch": 0.8505716207128446,
"grad_norm": 1.941024661064148,
"learning_rate": 4.126198804133398e-05,
"loss": 3.1876,
"step": 9486
},
{
"epoch": 0.8533512665321676,
"grad_norm": 1.9628397226333618,
"learning_rate": 4.1198109758801055e-05,
"loss": 3.1944,
"step": 9517
},
{
"epoch": 0.8561309123514907,
"grad_norm": 1.9526383876800537,
"learning_rate": 4.113404868280107e-05,
"loss": 3.2019,
"step": 9548
},
{
"epoch": 0.8589105581708137,
"grad_norm": 1.9171335697174072,
"learning_rate": 4.106980553625457e-05,
"loss": 3.1993,
"step": 9579
},
{
"epoch": 0.8616902039901367,
"grad_norm": 2.0531489849090576,
"learning_rate": 4.100538104413674e-05,
"loss": 3.1918,
"step": 9610
},
{
"epoch": 0.8644698498094597,
"grad_norm": 1.840210199356079,
"learning_rate": 4.09407759334692e-05,
"loss": 3.1762,
"step": 9641
},
{
"epoch": 0.8672494956287827,
"grad_norm": 1.990539789199829,
"learning_rate": 4.087599093331186e-05,
"loss": 3.1746,
"step": 9672
},
{
"epoch": 0.8700291414481058,
"grad_norm": 2.0064046382904053,
"learning_rate": 4.081102677475462e-05,
"loss": 3.1779,
"step": 9703
},
{
"epoch": 0.8728087872674288,
"grad_norm": 1.959908127784729,
"learning_rate": 4.0745884190909194e-05,
"loss": 3.1502,
"step": 9734
},
{
"epoch": 0.8755884330867518,
"grad_norm": 1.9566411972045898,
"learning_rate": 4.0680563916900796e-05,
"loss": 3.1801,
"step": 9765
},
{
"epoch": 0.8783680789060748,
"grad_norm": 2.0030715465545654,
"learning_rate": 4.0615066689859815e-05,
"loss": 3.1597,
"step": 9796
},
{
"epoch": 0.8811477247253979,
"grad_norm": 1.9277180433273315,
"learning_rate": 4.0549393248913584e-05,
"loss": 3.1501,
"step": 9827
},
{
"epoch": 0.8839273705447209,
"grad_norm": 1.887507677078247,
"learning_rate": 4.048354433517794e-05,
"loss": 3.1713,
"step": 9858
},
{
"epoch": 0.8867070163640439,
"grad_norm": 1.9514687061309814,
"learning_rate": 4.0417520691748916e-05,
"loss": 3.1726,
"step": 9889
},
{
"epoch": 0.8894866621833669,
"grad_norm": 2.0142714977264404,
"learning_rate": 4.035132306369438e-05,
"loss": 3.1824,
"step": 9920
},
{
"epoch": 0.89226630800269,
"grad_norm": 1.9494922161102295,
"learning_rate": 4.028495219804555e-05,
"loss": 3.2172,
"step": 9951
},
{
"epoch": 0.895045953822013,
"grad_norm": 1.8487540483474731,
"learning_rate": 4.021840884378864e-05,
"loss": 3.2064,
"step": 9982
},
{
"epoch": 0.897825599641336,
"grad_norm": 1.8519740104675293,
"learning_rate": 4.015169375185633e-05,
"loss": 3.191,
"step": 10013
},
{
"epoch": 0.900605245460659,
"grad_norm": 1.909957766532898,
"learning_rate": 4.0084807675119396e-05,
"loss": 3.1534,
"step": 10044
},
{
"epoch": 0.9033848912799821,
"grad_norm": 2.1778204441070557,
"learning_rate": 4.0017751368378106e-05,
"loss": 3.1926,
"step": 10075
},
{
"epoch": 0.9061645370993051,
"grad_norm": 1.9085873365402222,
"learning_rate": 3.995052558835377e-05,
"loss": 3.1589,
"step": 10106
},
{
"epoch": 0.9089441829186281,
"grad_norm": 1.863411784172058,
"learning_rate": 3.988313109368017e-05,
"loss": 3.172,
"step": 10137
},
{
"epoch": 0.9117238287379511,
"grad_norm": 1.8036720752716064,
"learning_rate": 3.981556864489504e-05,
"loss": 3.1451,
"step": 10168
},
{
"epoch": 0.9145034745572741,
"grad_norm": 1.8620129823684692,
"learning_rate": 3.974783900443142e-05,
"loss": 3.1732,
"step": 10199
},
{
"epoch": 0.9172831203765972,
"grad_norm": 2.0112087726593018,
"learning_rate": 3.9679942936609095e-05,
"loss": 3.163,
"step": 10230
},
{
"epoch": 0.9200627661959202,
"grad_norm": 1.8342541456222534,
"learning_rate": 3.961188120762596e-05,
"loss": 3.1561,
"step": 10261
},
{
"epoch": 0.9228424120152432,
"grad_norm": 2.3467628955841064,
"learning_rate": 3.954365458554938e-05,
"loss": 3.1595,
"step": 10292
},
{
"epoch": 0.9256220578345662,
"grad_norm": 1.8928585052490234,
"learning_rate": 3.947526384030751e-05,
"loss": 3.1701,
"step": 10323
},
{
"epoch": 0.9284017036538893,
"grad_norm": 1.9733792543411255,
"learning_rate": 3.9406709743680624e-05,
"loss": 3.1957,
"step": 10354
},
{
"epoch": 0.9311813494732123,
"grad_norm": 1.8375611305236816,
"learning_rate": 3.9337993069292366e-05,
"loss": 3.1562,
"step": 10385
},
{
"epoch": 0.9339609952925353,
"grad_norm": 1.8635389804840088,
"learning_rate": 3.926911459260109e-05,
"loss": 3.1841,
"step": 10416
},
{
"epoch": 0.9367406411118583,
"grad_norm": 1.8488346338272095,
"learning_rate": 3.920007509089102e-05,
"loss": 3.1715,
"step": 10447
},
{
"epoch": 0.9395202869311814,
"grad_norm": 1.828251838684082,
"learning_rate": 3.913087534326357e-05,
"loss": 3.1825,
"step": 10478
},
{
"epoch": 0.9422999327505044,
"grad_norm": 1.895739197731018,
"learning_rate": 3.9061516130628475e-05,
"loss": 3.1692,
"step": 10509
},
{
"epoch": 0.9450795785698274,
"grad_norm": 1.8475978374481201,
"learning_rate": 3.8991998235695025e-05,
"loss": 3.1661,
"step": 10540
},
{
"epoch": 0.9478592243891504,
"grad_norm": 1.9761695861816406,
"learning_rate": 3.8922322442963224e-05,
"loss": 3.1217,
"step": 10571
},
{
"epoch": 0.9506388702084735,
"grad_norm": 1.8657857179641724,
"learning_rate": 3.885248953871491e-05,
"loss": 3.1508,
"step": 10602
},
{
"epoch": 0.9534185160277965,
"grad_norm": 2.033208131790161,
"learning_rate": 3.8782500311004915e-05,
"loss": 3.1541,
"step": 10633
},
{
"epoch": 0.9561981618471195,
"grad_norm": 1.9058306217193604,
"learning_rate": 3.871235554965218e-05,
"loss": 3.1583,
"step": 10664
},
{
"epoch": 0.9589778076664425,
"grad_norm": 12.429788589477539,
"learning_rate": 3.864205604623078e-05,
"loss": 3.1472,
"step": 10695
},
{
"epoch": 0.9617574534857656,
"grad_norm": 1.928039312362671,
"learning_rate": 3.857160259406107e-05,
"loss": 3.1632,
"step": 10726
},
{
"epoch": 0.9645370993050886,
"grad_norm": 1.8084813356399536,
"learning_rate": 3.8500995988200674e-05,
"loss": 3.1645,
"step": 10757
},
{
"epoch": 0.9673167451244116,
"grad_norm": 1.8052054643630981,
"learning_rate": 3.843023702543556e-05,
"loss": 3.1284,
"step": 10788
},
{
"epoch": 0.9700963909437346,
"grad_norm": 1.9415054321289062,
"learning_rate": 3.8359326504270984e-05,
"loss": 3.1147,
"step": 10819
},
{
"epoch": 0.9728760367630576,
"grad_norm": 1.9183542728424072,
"learning_rate": 3.828826522492255e-05,
"loss": 3.1178,
"step": 10850
},
{
"epoch": 0.9756556825823807,
"grad_norm": 1.8961682319641113,
"learning_rate": 3.821705398930713e-05,
"loss": 3.1385,
"step": 10881
},
{
"epoch": 0.9784353284017037,
"grad_norm": 1.937819480895996,
"learning_rate": 3.814569360103385e-05,
"loss": 3.1569,
"step": 10912
},
{
"epoch": 0.9812149742210267,
"grad_norm": 2.237182855606079,
"learning_rate": 3.807418486539499e-05,
"loss": 3.1669,
"step": 10943
},
{
"epoch": 0.9839946200403497,
"grad_norm": 1.9500761032104492,
"learning_rate": 3.80025285893569e-05,
"loss": 3.1556,
"step": 10974
},
{
"epoch": 0.9867742658596728,
"grad_norm": 1.9348506927490234,
"learning_rate": 3.793072558155093e-05,
"loss": 3.1553,
"step": 11005
},
{
"epoch": 0.9895539116789958,
"grad_norm": 1.8740767240524292,
"learning_rate": 3.785877665226426e-05,
"loss": 3.1818,
"step": 11036
},
{
"epoch": 0.9923335574983188,
"grad_norm": 1.837488055229187,
"learning_rate": 3.778668261343079e-05,
"loss": 3.1655,
"step": 11067
},
{
"epoch": 0.9951132033176417,
"grad_norm": 1.8872405290603638,
"learning_rate": 3.771444427862192e-05,
"loss": 3.1629,
"step": 11098
},
{
"epoch": 0.9978928491369649,
"grad_norm": 1.7856714725494385,
"learning_rate": 3.7642062463037465e-05,
"loss": 3.1384,
"step": 11129
},
{
"epoch": 1.0006724949562877,
"grad_norm": 1.9878672361373901,
"learning_rate": 3.7569537983496373e-05,
"loss": 3.107,
"step": 11160
},
{
"epoch": 1.003452140775611,
"grad_norm": 1.9204086065292358,
"learning_rate": 3.749687165842753e-05,
"loss": 2.9639,
"step": 11191
},
{
"epoch": 1.006231786594934,
"grad_norm": 1.9229450225830078,
"learning_rate": 3.7424064307860536e-05,
"loss": 2.9639,
"step": 11222
},
{
"epoch": 1.009011432414257,
"grad_norm": 1.944447636604309,
"learning_rate": 3.735111675341645e-05,
"loss": 2.9491,
"step": 11253
},
{
"epoch": 1.01179107823358,
"grad_norm": 2.0207080841064453,
"learning_rate": 3.7278029818298524e-05,
"loss": 2.9621,
"step": 11284
},
{
"epoch": 1.014570724052903,
"grad_norm": 2.230640172958374,
"learning_rate": 3.720480432728287e-05,
"loss": 2.9564,
"step": 11315
},
{
"epoch": 1.017350369872226,
"grad_norm": 1.8823405504226685,
"learning_rate": 3.71314411067092e-05,
"loss": 2.9552,
"step": 11346
},
{
"epoch": 1.020130015691549,
"grad_norm": 1.8951125144958496,
"learning_rate": 3.70579409844715e-05,
"loss": 2.9475,
"step": 11377
},
{
"epoch": 1.022909661510872,
"grad_norm": 1.9785139560699463,
"learning_rate": 3.698430479000865e-05,
"loss": 2.9564,
"step": 11408
},
{
"epoch": 1.025689307330195,
"grad_norm": 1.9839940071105957,
"learning_rate": 3.691053335429509e-05,
"loss": 2.9541,
"step": 11439
},
{
"epoch": 1.0284689531495181,
"grad_norm": 1.9339096546173096,
"learning_rate": 3.683662750983147e-05,
"loss": 2.9177,
"step": 11470
},
{
"epoch": 1.0312485989688411,
"grad_norm": 1.984049916267395,
"learning_rate": 3.676258809063518e-05,
"loss": 2.9599,
"step": 11501
},
{
"epoch": 1.0340282447881641,
"grad_norm": 1.9555659294128418,
"learning_rate": 3.6688415932231004e-05,
"loss": 2.9415,
"step": 11532
},
{
"epoch": 1.0368078906074871,
"grad_norm": 2.0060806274414062,
"learning_rate": 3.661411187164166e-05,
"loss": 2.9804,
"step": 11563
},
{
"epoch": 1.0395875364268101,
"grad_norm": 1.9450613260269165,
"learning_rate": 3.65396767473784e-05,
"loss": 2.9409,
"step": 11594
},
{
"epoch": 1.0423671822461331,
"grad_norm": 2.0419921875,
"learning_rate": 3.6465111399431465e-05,
"loss": 2.9735,
"step": 11625
},
{
"epoch": 1.0451468280654561,
"grad_norm": 2.0117428302764893,
"learning_rate": 3.6390416669260674e-05,
"loss": 2.9655,
"step": 11656
},
{
"epoch": 1.047926473884779,
"grad_norm": 2.0134499073028564,
"learning_rate": 3.63155933997859e-05,
"loss": 2.9225,
"step": 11687
},
{
"epoch": 1.0507061197041023,
"grad_norm": 1.96694016456604,
"learning_rate": 3.624064243537758e-05,
"loss": 2.9609,
"step": 11718
},
{
"epoch": 1.0534857655234253,
"grad_norm": 1.929091453552246,
"learning_rate": 3.616556462184716e-05,
"loss": 2.9189,
"step": 11749
},
{
"epoch": 1.0562654113427483,
"grad_norm": 1.9514384269714355,
"learning_rate": 3.609036080643755e-05,
"loss": 2.9447,
"step": 11780
},
{
"epoch": 1.0590450571620713,
"grad_norm": 1.9550822973251343,
"learning_rate": 3.60150318378136e-05,
"loss": 2.9315,
"step": 11811
},
{
"epoch": 1.0618247029813943,
"grad_norm": 1.9250727891921997,
"learning_rate": 3.5939578566052465e-05,
"loss": 2.9483,
"step": 11842
},
{
"epoch": 1.0646043488007173,
"grad_norm": 2.0025384426116943,
"learning_rate": 3.586400184263408e-05,
"loss": 2.9102,
"step": 11873
},
{
"epoch": 1.0673839946200403,
"grad_norm": 1.95062255859375,
"learning_rate": 3.578830252043148e-05,
"loss": 2.9166,
"step": 11904
},
{
"epoch": 1.0701636404393633,
"grad_norm": 1.9655905961990356,
"learning_rate": 3.571248145370125e-05,
"loss": 2.9251,
"step": 11935
},
{
"epoch": 1.0729432862586863,
"grad_norm": 1.8943630456924438,
"learning_rate": 3.5636539498073794e-05,
"loss": 2.9817,
"step": 11966
},
{
"epoch": 1.0757229320780095,
"grad_norm": 2.025667905807495,
"learning_rate": 3.556047751054378e-05,
"loss": 2.9599,
"step": 11997
},
{
"epoch": 1.0785025778973325,
"grad_norm": 1.9214924573898315,
"learning_rate": 3.548429634946039e-05,
"loss": 2.9434,
"step": 12028
},
{
"epoch": 1.0812822237166555,
"grad_norm": 1.9672141075134277,
"learning_rate": 3.540799687451768e-05,
"loss": 2.9244,
"step": 12059
},
{
"epoch": 1.0840618695359785,
"grad_norm": 2.0361907482147217,
"learning_rate": 3.533157994674485e-05,
"loss": 2.9728,
"step": 12090
},
{
"epoch": 1.0868415153553015,
"grad_norm": 2.1192681789398193,
"learning_rate": 3.5255046428496546e-05,
"loss": 2.9137,
"step": 12121
},
{
"epoch": 1.0896211611746245,
"grad_norm": 1.8768310546875,
"learning_rate": 3.517839718344311e-05,
"loss": 2.9491,
"step": 12152
},
{
"epoch": 1.0924008069939475,
"grad_norm": 1.8967385292053223,
"learning_rate": 3.510163307656086e-05,
"loss": 2.9351,
"step": 12183
},
{
"epoch": 1.0951804528132705,
"grad_norm": 1.9647105932235718,
"learning_rate": 3.5024754974122324e-05,
"loss": 2.9505,
"step": 12214
},
{
"epoch": 1.0979600986325937,
"grad_norm": 1.902198076248169,
"learning_rate": 3.494776374368643e-05,
"loss": 2.8994,
"step": 12245
},
{
"epoch": 1.1007397444519167,
"grad_norm": 1.930614709854126,
"learning_rate": 3.4870660254088724e-05,
"loss": 2.9536,
"step": 12276
},
{
"epoch": 1.1035193902712397,
"grad_norm": 1.955419898033142,
"learning_rate": 3.479344537543164e-05,
"loss": 2.9369,
"step": 12307
},
{
"epoch": 1.1062990360905627,
"grad_norm": 1.9790080785751343,
"learning_rate": 3.4716119979074565e-05,
"loss": 2.9629,
"step": 12338
},
{
"epoch": 1.1090786819098857,
"grad_norm": 2.02301025390625,
"learning_rate": 3.463868493762412e-05,
"loss": 2.9438,
"step": 12369
},
{
"epoch": 1.1118583277292087,
"grad_norm": 1.8974666595458984,
"learning_rate": 3.456114112492418e-05,
"loss": 2.9653,
"step": 12400
},
{
"epoch": 1.1146379735485317,
"grad_norm": 2.0910682678222656,
"learning_rate": 3.4483489416046164e-05,
"loss": 2.9659,
"step": 12431
},
{
"epoch": 1.1174176193678547,
"grad_norm": 2.0032594203948975,
"learning_rate": 3.440573068727905e-05,
"loss": 2.928,
"step": 12462
},
{
"epoch": 1.1201972651871777,
"grad_norm": 2.1574766635894775,
"learning_rate": 3.4327865816119495e-05,
"loss": 2.9336,
"step": 12493
},
{
"epoch": 1.1229769110065009,
"grad_norm": 1.9993735551834106,
"learning_rate": 3.4249895681262025e-05,
"loss": 2.917,
"step": 12524
},
{
"epoch": 1.1257565568258239,
"grad_norm": 1.946427822113037,
"learning_rate": 3.417182116258899e-05,
"loss": 2.9023,
"step": 12555
},
{
"epoch": 1.1285362026451469,
"grad_norm": 1.9485125541687012,
"learning_rate": 3.409364314116074e-05,
"loss": 2.9553,
"step": 12586
},
{
"epoch": 1.1313158484644699,
"grad_norm": 2.0353407859802246,
"learning_rate": 3.401536249920559e-05,
"loss": 2.9402,
"step": 12617
},
{
"epoch": 1.1340954942837929,
"grad_norm": 2.0576653480529785,
"learning_rate": 3.393698012010998e-05,
"loss": 2.9421,
"step": 12648
},
{
"epoch": 1.1368751401031159,
"grad_norm": 1.9606209993362427,
"learning_rate": 3.385849688840839e-05,
"loss": 2.9613,
"step": 12679
},
{
"epoch": 1.1396547859224389,
"grad_norm": 1.9826381206512451,
"learning_rate": 3.3779913689773414e-05,
"loss": 2.9301,
"step": 12710
},
{
"epoch": 1.1424344317417618,
"grad_norm": 1.9487073421478271,
"learning_rate": 3.370123141100578e-05,
"loss": 2.9691,
"step": 12741
},
{
"epoch": 1.1452140775610848,
"grad_norm": 2.0785021781921387,
"learning_rate": 3.3622450940024305e-05,
"loss": 2.9394,
"step": 12772
},
{
"epoch": 1.147993723380408,
"grad_norm": 1.9334497451782227,
"learning_rate": 3.35435731658559e-05,
"loss": 2.922,
"step": 12803
},
{
"epoch": 1.150773369199731,
"grad_norm": 2.0320358276367188,
"learning_rate": 3.346459897862552e-05,
"loss": 2.929,
"step": 12834
},
{
"epoch": 1.153553015019054,
"grad_norm": 2.089158535003662,
"learning_rate": 3.338552926954613e-05,
"loss": 2.9489,
"step": 12865
},
{
"epoch": 1.156332660838377,
"grad_norm": 1.8891345262527466,
"learning_rate": 3.330636493090868e-05,
"loss": 2.9368,
"step": 12896
},
{
"epoch": 1.1591123066577,
"grad_norm": 1.978514313697815,
"learning_rate": 3.322710685607193e-05,
"loss": 2.9454,
"step": 12927
},
{
"epoch": 1.161891952477023,
"grad_norm": 1.9743033647537231,
"learning_rate": 3.314775593945251e-05,
"loss": 2.918,
"step": 12958
},
{
"epoch": 1.164671598296346,
"grad_norm": 1.9501724243164062,
"learning_rate": 3.3068313076514714e-05,
"loss": 2.9332,
"step": 12989
},
{
"epoch": 1.1674512441156693,
"grad_norm": 1.9802473783493042,
"learning_rate": 3.298877916376047e-05,
"loss": 2.9643,
"step": 13020
},
{
"epoch": 1.1702308899349922,
"grad_norm": 2.1636433601379395,
"learning_rate": 3.290915509871915e-05,
"loss": 2.9412,
"step": 13051
},
{
"epoch": 1.1730105357543152,
"grad_norm": 1.9422581195831299,
"learning_rate": 3.282944177993753e-05,
"loss": 2.9529,
"step": 13082
},
{
"epoch": 1.1757901815736382,
"grad_norm": 1.9173604249954224,
"learning_rate": 3.274964010696957e-05,
"loss": 2.9203,
"step": 13113
},
{
"epoch": 1.1785698273929612,
"grad_norm": 1.9598551988601685,
"learning_rate": 3.266975098036629e-05,
"loss": 2.9352,
"step": 13144
},
{
"epoch": 1.1813494732122842,
"grad_norm": 2.061182975769043,
"learning_rate": 3.258977530166562e-05,
"loss": 2.9321,
"step": 13175
},
{
"epoch": 1.1841291190316072,
"grad_norm": 1.9192544221878052,
"learning_rate": 3.250971397338227e-05,
"loss": 2.8922,
"step": 13206
},
{
"epoch": 1.1869087648509302,
"grad_norm": 1.93052339553833,
"learning_rate": 3.2429567898997404e-05,
"loss": 2.9131,
"step": 13237
},
{
"epoch": 1.1896884106702532,
"grad_norm": 2.030632734298706,
"learning_rate": 3.234933798294859e-05,
"loss": 2.9486,
"step": 13268
},
{
"epoch": 1.1924680564895764,
"grad_norm": 1.9135621786117554,
"learning_rate": 3.2269025130619535e-05,
"loss": 2.9352,
"step": 13299
},
{
"epoch": 1.1952477023088994,
"grad_norm": 1.8833742141723633,
"learning_rate": 3.218863024832985e-05,
"loss": 2.9311,
"step": 13330
},
{
"epoch": 1.1980273481282224,
"grad_norm": 1.9402114152908325,
"learning_rate": 3.2108154243324864e-05,
"loss": 2.9479,
"step": 13361
},
{
"epoch": 1.2008069939475454,
"grad_norm": 2.003957509994507,
"learning_rate": 3.2027598023765345e-05,
"loss": 2.9194,
"step": 13392
},
{
"epoch": 1.2035866397668684,
"grad_norm": 2.00201416015625,
"learning_rate": 3.194696249871729e-05,
"loss": 2.9315,
"step": 13423
},
{
"epoch": 1.2063662855861914,
"grad_norm": 1.996684193611145,
"learning_rate": 3.186624857814164e-05,
"loss": 2.9418,
"step": 13454
},
{
"epoch": 1.2091459314055144,
"grad_norm": 2.089789628982544,
"learning_rate": 3.178545717288401e-05,
"loss": 2.9315,
"step": 13485
},
{
"epoch": 1.2119255772248374,
"grad_norm": 1.8793665170669556,
"learning_rate": 3.170458919466444e-05,
"loss": 2.926,
"step": 13516
},
{
"epoch": 1.2147052230441604,
"grad_norm": 2.1354544162750244,
"learning_rate": 3.1623645556067063e-05,
"loss": 2.9212,
"step": 13547
},
{
"epoch": 1.2174848688634836,
"grad_norm": 2.031564950942993,
"learning_rate": 3.154262717052985e-05,
"loss": 2.9156,
"step": 13578
},
{
"epoch": 1.2202645146828066,
"grad_norm": 1.9664191007614136,
"learning_rate": 3.146153495233426e-05,
"loss": 2.9165,
"step": 13609
},
{
"epoch": 1.2230441605021296,
"grad_norm": 1.9077311754226685,
"learning_rate": 3.1380369816594944e-05,
"loss": 2.9243,
"step": 13640
},
{
"epoch": 1.2258238063214526,
"grad_norm": 2.0613224506378174,
"learning_rate": 3.129913267924946e-05,
"loss": 2.9397,
"step": 13671
},
{
"epoch": 1.2286034521407756,
"grad_norm": 1.9298268556594849,
"learning_rate": 3.121782445704782e-05,
"loss": 2.8912,
"step": 13702
},
{
"epoch": 1.2313830979600986,
"grad_norm": 1.9356482028961182,
"learning_rate": 3.11364460675423e-05,
"loss": 2.9502,
"step": 13733
},
{
"epoch": 1.2341627437794216,
"grad_norm": 2.0454516410827637,
"learning_rate": 3.1054998429076934e-05,
"loss": 2.9337,
"step": 13764
},
{
"epoch": 1.2369423895987446,
"grad_norm": 1.947540283203125,
"learning_rate": 3.097348246077728e-05,
"loss": 2.9286,
"step": 13795
},
{
"epoch": 1.2397220354180676,
"grad_norm": 1.9849016666412354,
"learning_rate": 3.0891899082539924e-05,
"loss": 2.9462,
"step": 13826
},
{
"epoch": 1.2425016812373908,
"grad_norm": 1.9809448719024658,
"learning_rate": 3.0810249215022233e-05,
"loss": 2.9319,
"step": 13857
},
{
"epoch": 1.2452813270567138,
"grad_norm": 1.9004404544830322,
"learning_rate": 3.0728533779631865e-05,
"loss": 2.9222,
"step": 13888
},
{
"epoch": 1.2480609728760368,
"grad_norm": 2.0778489112854004,
"learning_rate": 3.064675369851637e-05,
"loss": 2.9352,
"step": 13919
},
{
"epoch": 1.2508406186953598,
"grad_norm": 2.0006306171417236,
"learning_rate": 3.056490989455289e-05,
"loss": 2.9176,
"step": 13950
},
{
"epoch": 1.2536202645146828,
"grad_norm": 2.164907217025757,
"learning_rate": 3.0483003291337596e-05,
"loss": 2.904,
"step": 13981
},
{
"epoch": 1.2563999103340058,
"grad_norm": 1.9432871341705322,
"learning_rate": 3.040103481317539e-05,
"loss": 2.9387,
"step": 14012
},
{
"epoch": 1.2591795561533288,
"grad_norm": 2.0023536682128906,
"learning_rate": 3.03190053850694e-05,
"loss": 2.9053,
"step": 14043
},
{
"epoch": 1.261959201972652,
"grad_norm": 1.9094126224517822,
"learning_rate": 3.0236915932710573e-05,
"loss": 2.9342,
"step": 14074
},
{
"epoch": 1.2647388477919748,
"grad_norm": 2.038862466812134,
"learning_rate": 3.0154767382467232e-05,
"loss": 2.9419,
"step": 14105
},
{
"epoch": 1.267518493611298,
"grad_norm": 1.9701634645462036,
"learning_rate": 3.0072560661374582e-05,
"loss": 2.939,
"step": 14136
},
{
"epoch": 1.270298139430621,
"grad_norm": 2.041240692138672,
"learning_rate": 2.999029669712431e-05,
"loss": 2.9024,
"step": 14167
},
{
"epoch": 1.273077785249944,
"grad_norm": 1.8705273866653442,
"learning_rate": 2.990797641805408e-05,
"loss": 2.9491,
"step": 14198
},
{
"epoch": 1.275857431069267,
"grad_norm": 1.8689632415771484,
"learning_rate": 2.982560075313704e-05,
"loss": 2.9109,
"step": 14229
},
{
"epoch": 1.27863707688859,
"grad_norm": 1.901034951210022,
"learning_rate": 2.9743170631971368e-05,
"loss": 2.9159,
"step": 14260
},
{
"epoch": 1.281416722707913,
"grad_norm": 1.9830548763275146,
"learning_rate": 2.9660686984769792e-05,
"loss": 2.9175,
"step": 14291
},
{
"epoch": 1.284196368527236,
"grad_norm": 1.9754098653793335,
"learning_rate": 2.9578150742349047e-05,
"loss": 2.9495,
"step": 14322
},
{
"epoch": 1.2869760143465592,
"grad_norm": 1.8658026456832886,
"learning_rate": 2.949556283611942e-05,
"loss": 2.9543,
"step": 14353
},
{
"epoch": 1.2897556601658822,
"grad_norm": 1.9385013580322266,
"learning_rate": 2.9412924198074206e-05,
"loss": 2.9235,
"step": 14384
},
{
"epoch": 1.2925353059852052,
"grad_norm": 1.94545316696167,
"learning_rate": 2.9330235760779208e-05,
"loss": 2.9281,
"step": 14415
},
{
"epoch": 1.2953149518045282,
"grad_norm": 1.9916470050811768,
"learning_rate": 2.9247498457362188e-05,
"loss": 2.9382,
"step": 14446
},
{
"epoch": 1.2980945976238512,
"grad_norm": 1.9545270204544067,
"learning_rate": 2.9164713221502373e-05,
"loss": 2.9397,
"step": 14477
},
{
"epoch": 1.3008742434431741,
"grad_norm": 1.945106029510498,
"learning_rate": 2.9081880987419912e-05,
"loss": 2.9453,
"step": 14508
},
{
"epoch": 1.3036538892624971,
"grad_norm": 1.961753010749817,
"learning_rate": 2.8999002689865296e-05,
"loss": 2.9255,
"step": 14539
},
{
"epoch": 1.3064335350818201,
"grad_norm": 1.9291094541549683,
"learning_rate": 2.8916079264108852e-05,
"loss": 2.9054,
"step": 14570
},
{
"epoch": 1.3092131809011431,
"grad_norm": 1.9437432289123535,
"learning_rate": 2.883311164593017e-05,
"loss": 2.9095,
"step": 14601
},
{
"epoch": 1.3119928267204664,
"grad_norm": 1.9449868202209473,
"learning_rate": 2.875010077160754e-05,
"loss": 2.929,
"step": 14632
},
{
"epoch": 1.3147724725397893,
"grad_norm": 1.8561278581619263,
"learning_rate": 2.866704757790741e-05,
"loss": 2.9131,
"step": 14663
},
{
"epoch": 1.3175521183591123,
"grad_norm": 2.0299785137176514,
"learning_rate": 2.858395300207376e-05,
"loss": 2.9359,
"step": 14694
},
{
"epoch": 1.3203317641784353,
"grad_norm": 1.9579997062683105,
"learning_rate": 2.8500817981817607e-05,
"loss": 2.9344,
"step": 14725
},
{
"epoch": 1.3231114099977583,
"grad_norm": 1.9907282590866089,
"learning_rate": 2.8417643455306336e-05,
"loss": 2.9267,
"step": 14756
},
{
"epoch": 1.3258910558170813,
"grad_norm": 2.115506172180176,
"learning_rate": 2.8334430361153185e-05,
"loss": 2.93,
"step": 14787
},
{
"epoch": 1.3286707016364043,
"grad_norm": 1.935970425605774,
"learning_rate": 2.8251179638406612e-05,
"loss": 2.9119,
"step": 14818
},
{
"epoch": 1.3314503474557275,
"grad_norm": 1.9706645011901855,
"learning_rate": 2.8167892226539704e-05,
"loss": 2.9245,
"step": 14849
},
{
"epoch": 1.3342299932750503,
"grad_norm": 2.091217279434204,
"learning_rate": 2.8084569065439588e-05,
"loss": 2.8958,
"step": 14880
},
{
"epoch": 1.3370096390943735,
"grad_norm": 1.9113187789916992,
"learning_rate": 2.8001211095396807e-05,
"loss": 2.8936,
"step": 14911
},
{
"epoch": 1.3397892849136965,
"grad_norm": 1.9416507482528687,
"learning_rate": 2.791781925709473e-05,
"loss": 2.9198,
"step": 14942
},
{
"epoch": 1.3425689307330195,
"grad_norm": 1.911167860031128,
"learning_rate": 2.7834394491598908e-05,
"loss": 2.9352,
"step": 14973
},
{
"epoch": 1.3453485765523425,
"grad_norm": 1.8569062948226929,
"learning_rate": 2.7750937740346485e-05,
"loss": 2.8868,
"step": 15004
},
{
"epoch": 1.3481282223716655,
"grad_norm": 1.9254045486450195,
"learning_rate": 2.7667449945135564e-05,
"loss": 2.906,
"step": 15035
},
{
"epoch": 1.3509078681909885,
"grad_norm": 1.9615710973739624,
"learning_rate": 2.7583932048114557e-05,
"loss": 2.9148,
"step": 15066
},
{
"epoch": 1.3536875140103115,
"grad_norm": 2.1133556365966797,
"learning_rate": 2.7500384991771587e-05,
"loss": 2.8988,
"step": 15097
},
{
"epoch": 1.3564671598296347,
"grad_norm": 1.9478425979614258,
"learning_rate": 2.7416809718923825e-05,
"loss": 2.9332,
"step": 15128
},
{
"epoch": 1.3592468056489575,
"grad_norm": 2.005472183227539,
"learning_rate": 2.7333207172706864e-05,
"loss": 2.9305,
"step": 15159
},
{
"epoch": 1.3620264514682807,
"grad_norm": 1.9226336479187012,
"learning_rate": 2.7249578296564088e-05,
"loss": 2.9176,
"step": 15190
},
{
"epoch": 1.3648060972876037,
"grad_norm": 1.9458688497543335,
"learning_rate": 2.7165924034235973e-05,
"loss": 2.8914,
"step": 15221
},
{
"epoch": 1.3675857431069267,
"grad_norm": 1.930293083190918,
"learning_rate": 2.708224532974953e-05,
"loss": 2.9399,
"step": 15252
}
],
"logging_steps": 31,
"max_steps": 30517,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 3052,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1319563003407368e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}