skillfactory_hptest_repeatrun_1 / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 2875,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017391304347826087,
"grad_norm": 3.047056528200788,
"learning_rate": 6.25e-07,
"loss": 0.6248,
"step": 10
},
{
"epoch": 0.034782608695652174,
"grad_norm": 2.174143954299078,
"learning_rate": 1.3194444444444446e-06,
"loss": 0.6183,
"step": 20
},
{
"epoch": 0.05217391304347826,
"grad_norm": 1.864151160655997,
"learning_rate": 2.0138888888888893e-06,
"loss": 0.6015,
"step": 30
},
{
"epoch": 0.06956521739130435,
"grad_norm": 1.667310343818362,
"learning_rate": 2.7083333333333334e-06,
"loss": 0.5665,
"step": 40
},
{
"epoch": 0.08695652173913043,
"grad_norm": 2.455941646453557,
"learning_rate": 3.4027777777777783e-06,
"loss": 0.54,
"step": 50
},
{
"epoch": 0.10434782608695652,
"grad_norm": 1.9685053486872826,
"learning_rate": 4.097222222222222e-06,
"loss": 0.5399,
"step": 60
},
{
"epoch": 0.12173913043478261,
"grad_norm": 2.106997653377029,
"learning_rate": 4.791666666666668e-06,
"loss": 0.5317,
"step": 70
},
{
"epoch": 0.1391304347826087,
"grad_norm": 1.780173897349018,
"learning_rate": 5.486111111111112e-06,
"loss": 0.5183,
"step": 80
},
{
"epoch": 0.1565217391304348,
"grad_norm": 1.467101178589284,
"learning_rate": 6.180555555555557e-06,
"loss": 0.5355,
"step": 90
},
{
"epoch": 0.17391304347826086,
"grad_norm": 1.521288337808483,
"learning_rate": 6.875e-06,
"loss": 0.5215,
"step": 100
},
{
"epoch": 0.19130434782608696,
"grad_norm": 1.5769259173207397,
"learning_rate": 7.569444444444445e-06,
"loss": 0.5239,
"step": 110
},
{
"epoch": 0.20869565217391303,
"grad_norm": 1.2892203345431497,
"learning_rate": 8.263888888888888e-06,
"loss": 0.5288,
"step": 120
},
{
"epoch": 0.22608695652173913,
"grad_norm": 1.6642372146048716,
"learning_rate": 8.958333333333334e-06,
"loss": 0.5563,
"step": 130
},
{
"epoch": 0.24347826086956523,
"grad_norm": 1.5134595782170073,
"learning_rate": 9.652777777777779e-06,
"loss": 0.5047,
"step": 140
},
{
"epoch": 0.2608695652173913,
"grad_norm": 1.2945158944777713,
"learning_rate": 9.999917294352674e-06,
"loss": 0.5146,
"step": 150
},
{
"epoch": 0.2782608695652174,
"grad_norm": 1.7472246575863004,
"learning_rate": 9.99925566559051e-06,
"loss": 0.5342,
"step": 160
},
{
"epoch": 0.2956521739130435,
"grad_norm": 1.5193618638564104,
"learning_rate": 9.997932495618156e-06,
"loss": 0.5083,
"step": 170
},
{
"epoch": 0.3130434782608696,
"grad_norm": 1.554664459221582,
"learning_rate": 9.995947959527968e-06,
"loss": 0.5063,
"step": 180
},
{
"epoch": 0.33043478260869563,
"grad_norm": 2.1482072415475,
"learning_rate": 9.993302319929523e-06,
"loss": 0.5434,
"step": 190
},
{
"epoch": 0.34782608695652173,
"grad_norm": 1.4192382000638246,
"learning_rate": 9.98999592691486e-06,
"loss": 0.5189,
"step": 200
},
{
"epoch": 0.3652173913043478,
"grad_norm": 1.208599645402354,
"learning_rate": 9.986029218012164e-06,
"loss": 0.5382,
"step": 210
},
{
"epoch": 0.3826086956521739,
"grad_norm": 1.1836104891903814,
"learning_rate": 9.981402718127853e-06,
"loss": 0.5276,
"step": 220
},
{
"epoch": 0.4,
"grad_norm": 1.5978655869742624,
"learning_rate": 9.976117039477133e-06,
"loss": 0.5017,
"step": 230
},
{
"epoch": 0.41739130434782606,
"grad_norm": 1.3359478565965037,
"learning_rate": 9.970172881502982e-06,
"loss": 0.4997,
"step": 240
},
{
"epoch": 0.43478260869565216,
"grad_norm": 1.3948206340232912,
"learning_rate": 9.963571030783582e-06,
"loss": 0.5582,
"step": 250
},
{
"epoch": 0.45217391304347826,
"grad_norm": 1.3011147589302579,
"learning_rate": 9.956312360928253e-06,
"loss": 0.5457,
"step": 260
},
{
"epoch": 0.46956521739130436,
"grad_norm": 1.080591596133825,
"learning_rate": 9.948397832461829e-06,
"loss": 0.5552,
"step": 270
},
{
"epoch": 0.48695652173913045,
"grad_norm": 1.2339442843093948,
"learning_rate": 9.93982849269757e-06,
"loss": 0.5203,
"step": 280
},
{
"epoch": 0.5043478260869565,
"grad_norm": 1.5073250471212296,
"learning_rate": 9.930605475598566e-06,
"loss": 0.5409,
"step": 290
},
{
"epoch": 0.5217391304347826,
"grad_norm": 1.2705670265704732,
"learning_rate": 9.92073000162768e-06,
"loss": 0.5404,
"step": 300
},
{
"epoch": 0.5391304347826087,
"grad_norm": 1.4491195774316425,
"learning_rate": 9.910203377586053e-06,
"loss": 0.5188,
"step": 310
},
{
"epoch": 0.5565217391304348,
"grad_norm": 1.4642066613650184,
"learning_rate": 9.899026996440173e-06,
"loss": 0.4946,
"step": 320
},
{
"epoch": 0.5739130434782609,
"grad_norm": 1.18880785751321,
"learning_rate": 9.887202337137549e-06,
"loss": 0.5298,
"step": 330
},
{
"epoch": 0.591304347826087,
"grad_norm": 1.2818806362850075,
"learning_rate": 9.874730964411001e-06,
"loss": 0.5299,
"step": 340
},
{
"epoch": 0.6086956521739131,
"grad_norm": 1.07998471259005,
"learning_rate": 9.861614528571607e-06,
"loss": 0.5315,
"step": 350
},
{
"epoch": 0.6260869565217392,
"grad_norm": 1.944925155516315,
"learning_rate": 9.847854765290321e-06,
"loss": 0.5447,
"step": 360
},
{
"epoch": 0.6434782608695652,
"grad_norm": 1.511844572828546,
"learning_rate": 9.83345349536829e-06,
"loss": 0.5291,
"step": 370
},
{
"epoch": 0.6608695652173913,
"grad_norm": 1.338025282900669,
"learning_rate": 9.818412624495911e-06,
"loss": 0.5107,
"step": 380
},
{
"epoch": 0.6782608695652174,
"grad_norm": 1.0620753499324938,
"learning_rate": 9.802734143000668e-06,
"loss": 0.5441,
"step": 390
},
{
"epoch": 0.6956521739130435,
"grad_norm": 1.3469601654140646,
"learning_rate": 9.786420125583734e-06,
"loss": 0.5403,
"step": 400
},
{
"epoch": 0.7130434782608696,
"grad_norm": 1.1557449160960933,
"learning_rate": 9.769472731045451e-06,
"loss": 0.5249,
"step": 410
},
{
"epoch": 0.7304347826086957,
"grad_norm": 1.275996117914776,
"learning_rate": 9.751894201999647e-06,
"loss": 0.5399,
"step": 420
},
{
"epoch": 0.7478260869565218,
"grad_norm": 1.2172722683153971,
"learning_rate": 9.733686864576883e-06,
"loss": 0.5367,
"step": 430
},
{
"epoch": 0.7652173913043478,
"grad_norm": 1.3421828009403691,
"learning_rate": 9.714853128116634e-06,
"loss": 0.5251,
"step": 440
},
{
"epoch": 0.782608695652174,
"grad_norm": 1.0911936537573848,
"learning_rate": 9.695395484848476e-06,
"loss": 0.555,
"step": 450
},
{
"epoch": 0.8,
"grad_norm": 1.3133546561526954,
"learning_rate": 9.675316509562282e-06,
"loss": 0.5379,
"step": 460
},
{
"epoch": 0.8173913043478261,
"grad_norm": 1.0450523994543341,
"learning_rate": 9.654618859267516e-06,
"loss": 0.5115,
"step": 470
},
{
"epoch": 0.8347826086956521,
"grad_norm": 1.0976625003665517,
"learning_rate": 9.633305272841632e-06,
"loss": 0.5323,
"step": 480
},
{
"epoch": 0.8521739130434782,
"grad_norm": 1.1811352794597962,
"learning_rate": 9.61137857066764e-06,
"loss": 0.5379,
"step": 490
},
{
"epoch": 0.8695652173913043,
"grad_norm": 1.3489402359600888,
"learning_rate": 9.5888416542609e-06,
"loss": 0.5353,
"step": 500
},
{
"epoch": 0.8869565217391304,
"grad_norm": 1.2068307201497395,
"learning_rate": 9.565697505885165e-06,
"loss": 0.506,
"step": 510
},
{
"epoch": 0.9043478260869565,
"grad_norm": 1.2234807296718222,
"learning_rate": 9.541949188157937e-06,
"loss": 0.5291,
"step": 520
},
{
"epoch": 0.9217391304347826,
"grad_norm": 1.2256177402823996,
"learning_rate": 9.517599843645216e-06,
"loss": 0.51,
"step": 530
},
{
"epoch": 0.9391304347826087,
"grad_norm": 1.2281368130382897,
"learning_rate": 9.492652694445629e-06,
"loss": 0.5136,
"step": 540
},
{
"epoch": 0.9565217391304348,
"grad_norm": 1.1730852438772608,
"learning_rate": 9.467111041764072e-06,
"loss": 0.5402,
"step": 550
},
{
"epoch": 0.9739130434782609,
"grad_norm": 1.0983121942778715,
"learning_rate": 9.44097826547486e-06,
"loss": 0.5508,
"step": 560
},
{
"epoch": 0.991304347826087,
"grad_norm": 1.1697621737952535,
"learning_rate": 9.414257823674482e-06,
"loss": 0.5516,
"step": 570
},
{
"epoch": 1.008695652173913,
"grad_norm": 1.2095450172211872,
"learning_rate": 9.386953252223989e-06,
"loss": 0.5011,
"step": 580
},
{
"epoch": 1.0260869565217392,
"grad_norm": 1.1615174794799035,
"learning_rate": 9.35906816428111e-06,
"loss": 0.4851,
"step": 590
},
{
"epoch": 1.0434782608695652,
"grad_norm": 1.0792347526096802,
"learning_rate": 9.330606249822125e-06,
"loss": 0.4795,
"step": 600
},
{
"epoch": 1.0608695652173914,
"grad_norm": 1.097575063279078,
"learning_rate": 9.30157127515358e-06,
"loss": 0.4955,
"step": 610
},
{
"epoch": 1.0782608695652174,
"grad_norm": 1.1031575053788267,
"learning_rate": 9.271967082413899e-06,
"loss": 0.4884,
"step": 620
},
{
"epoch": 1.0956521739130434,
"grad_norm": 1.2767392201135948,
"learning_rate": 9.241797589064959e-06,
"loss": 0.4655,
"step": 630
},
{
"epoch": 1.1130434782608696,
"grad_norm": 1.1033082483572434,
"learning_rate": 9.211066787373702e-06,
"loss": 0.4663,
"step": 640
},
{
"epoch": 1.1304347826086956,
"grad_norm": 1.576080502703696,
"learning_rate": 9.179778743883855e-06,
"loss": 0.4725,
"step": 650
},
{
"epoch": 1.1478260869565218,
"grad_norm": 1.175175499900073,
"learning_rate": 9.147937598877797e-06,
"loss": 0.4921,
"step": 660
},
{
"epoch": 1.1652173913043478,
"grad_norm": 1.1370691886093878,
"learning_rate": 9.115547565828695e-06,
"loss": 0.4649,
"step": 670
},
{
"epoch": 1.182608695652174,
"grad_norm": 1.2601647572096586,
"learning_rate": 9.082612930842942e-06,
"loss": 0.5285,
"step": 680
},
{
"epoch": 1.2,
"grad_norm": 1.2187851162701455,
"learning_rate": 9.049138052092982e-06,
"loss": 0.4947,
"step": 690
},
{
"epoch": 1.2173913043478262,
"grad_norm": 1.1224072986238753,
"learning_rate": 9.015127359240603e-06,
"loss": 0.4663,
"step": 700
},
{
"epoch": 1.2347826086956522,
"grad_norm": 1.2366635360634342,
"learning_rate": 8.980585352850775e-06,
"loss": 0.4828,
"step": 710
},
{
"epoch": 1.2521739130434781,
"grad_norm": 1.4981539175915906,
"learning_rate": 8.94551660379609e-06,
"loss": 0.4743,
"step": 720
},
{
"epoch": 1.2695652173913043,
"grad_norm": 1.0729421363170109,
"learning_rate": 8.909925752651914e-06,
"loss": 0.4996,
"step": 730
},
{
"epoch": 1.2869565217391306,
"grad_norm": 1.1955218613972736,
"learning_rate": 8.873817509082305e-06,
"loss": 0.4863,
"step": 740
},
{
"epoch": 1.3043478260869565,
"grad_norm": 1.126282527653828,
"learning_rate": 8.837196651216802e-06,
"loss": 0.465,
"step": 750
},
{
"epoch": 1.3217391304347825,
"grad_norm": 1.253199532462182,
"learning_rate": 8.800068025018133e-06,
"loss": 0.5031,
"step": 760
},
{
"epoch": 1.3391304347826087,
"grad_norm": 1.1627263868301658,
"learning_rate": 8.762436543640965e-06,
"loss": 0.4783,
"step": 770
},
{
"epoch": 1.3565217391304347,
"grad_norm": 1.2722684925328063,
"learning_rate": 8.724307186781756e-06,
"loss": 0.4883,
"step": 780
},
{
"epoch": 1.373913043478261,
"grad_norm": 1.1113001887408591,
"learning_rate": 8.685685000019803e-06,
"loss": 0.4947,
"step": 790
},
{
"epoch": 1.391304347826087,
"grad_norm": 1.0089575387474536,
"learning_rate": 8.646575094149568e-06,
"loss": 0.4684,
"step": 800
},
{
"epoch": 1.4086956521739131,
"grad_norm": 1.139961867951673,
"learning_rate": 8.606982644504378e-06,
"loss": 0.4938,
"step": 810
},
{
"epoch": 1.4260869565217391,
"grad_norm": 1.099302351381361,
"learning_rate": 8.566912890271584e-06,
"loss": 0.4723,
"step": 820
},
{
"epoch": 1.4434782608695653,
"grad_norm": 1.097933896784172,
"learning_rate": 8.526371133799277e-06,
"loss": 0.5066,
"step": 830
},
{
"epoch": 1.4608695652173913,
"grad_norm": 1.2653660292097024,
"learning_rate": 8.485362739894617e-06,
"loss": 0.4809,
"step": 840
},
{
"epoch": 1.4782608695652173,
"grad_norm": 1.031416554665846,
"learning_rate": 8.443893135113956e-06,
"loss": 0.4849,
"step": 850
},
{
"epoch": 1.4956521739130435,
"grad_norm": 1.2133975894170272,
"learning_rate": 8.401967807044713e-06,
"loss": 0.4731,
"step": 860
},
{
"epoch": 1.5130434782608697,
"grad_norm": 1.3527575598259336,
"learning_rate": 8.359592303579241e-06,
"loss": 0.4993,
"step": 870
},
{
"epoch": 1.5304347826086957,
"grad_norm": 1.5538636433489414,
"learning_rate": 8.316772232180677e-06,
"loss": 0.4831,
"step": 880
},
{
"epoch": 1.5478260869565217,
"grad_norm": 1.0668506396404902,
"learning_rate": 8.273513259140911e-06,
"loss": 0.4549,
"step": 890
},
{
"epoch": 1.5652173913043477,
"grad_norm": 1.3545786110485585,
"learning_rate": 8.22982110883079e-06,
"loss": 0.4889,
"step": 900
},
{
"epoch": 1.5826086956521739,
"grad_norm": 1.343041115784667,
"learning_rate": 8.185701562942614e-06,
"loss": 0.507,
"step": 910
},
{
"epoch": 1.6,
"grad_norm": 1.2218861872968096,
"learning_rate": 8.141160459725063e-06,
"loss": 0.4812,
"step": 920
},
{
"epoch": 1.617391304347826,
"grad_norm": 1.2015960018363139,
"learning_rate": 8.096203693210626e-06,
"loss": 0.4965,
"step": 930
},
{
"epoch": 1.634782608695652,
"grad_norm": 1.096282312846291,
"learning_rate": 8.050837212435662e-06,
"loss": 0.4557,
"step": 940
},
{
"epoch": 1.6521739130434783,
"grad_norm": 1.0909506176288037,
"learning_rate": 8.00506702065318e-06,
"loss": 0.4702,
"step": 950
},
{
"epoch": 1.6695652173913045,
"grad_norm": 1.5020729724704236,
"learning_rate": 7.958899174538423e-06,
"loss": 0.4695,
"step": 960
},
{
"epoch": 1.6869565217391305,
"grad_norm": 1.1184823557537737,
"learning_rate": 7.912339783387429e-06,
"loss": 0.4869,
"step": 970
},
{
"epoch": 1.7043478260869565,
"grad_norm": 1.5926675735162503,
"learning_rate": 7.865395008308572e-06,
"loss": 0.4556,
"step": 980
},
{
"epoch": 1.7217391304347827,
"grad_norm": 1.357255668450024,
"learning_rate": 7.818071061407295e-06,
"loss": 0.5226,
"step": 990
},
{
"epoch": 1.7391304347826086,
"grad_norm": 1.1738545925108272,
"learning_rate": 7.770374204964062e-06,
"loss": 0.4651,
"step": 1000
},
{
"epoch": 1.7565217391304349,
"grad_norm": 1.1820022034633777,
"learning_rate": 7.722310750605693e-06,
"loss": 0.498,
"step": 1010
},
{
"epoch": 1.7739130434782608,
"grad_norm": 1.3439499575644294,
"learning_rate": 7.673887058470155e-06,
"loss": 0.4832,
"step": 1020
},
{
"epoch": 1.7913043478260868,
"grad_norm": 1.5304830271309717,
"learning_rate": 7.625109536364938e-06,
"loss": 0.4716,
"step": 1030
},
{
"epoch": 1.808695652173913,
"grad_norm": 1.2364034202681973,
"learning_rate": 7.5759846389191234e-06,
"loss": 0.4696,
"step": 1040
},
{
"epoch": 1.8260869565217392,
"grad_norm": 1.0229154001597585,
"learning_rate": 7.526518866729256e-06,
"loss": 0.5235,
"step": 1050
},
{
"epoch": 1.8434782608695652,
"grad_norm": 1.7878383661619177,
"learning_rate": 7.476718765499131e-06,
"loss": 0.4936,
"step": 1060
},
{
"epoch": 1.8608695652173912,
"grad_norm": 1.1220136346350835,
"learning_rate": 7.426590925173624e-06,
"loss": 0.4954,
"step": 1070
},
{
"epoch": 1.8782608695652174,
"grad_norm": 1.0276108610586765,
"learning_rate": 7.376141979066644e-06,
"loss": 0.5226,
"step": 1080
},
{
"epoch": 1.8956521739130436,
"grad_norm": 1.0538696180373945,
"learning_rate": 7.325378602983371e-06,
"loss": 0.4997,
"step": 1090
},
{
"epoch": 1.9130434782608696,
"grad_norm": 1.0941944221450914,
"learning_rate": 7.274307514336854e-06,
"loss": 0.5069,
"step": 1100
},
{
"epoch": 1.9304347826086956,
"grad_norm": 1.0475459053099843,
"learning_rate": 7.222935471259113e-06,
"loss": 0.4516,
"step": 1110
},
{
"epoch": 1.9478260869565216,
"grad_norm": 0.9438781712939313,
"learning_rate": 7.171269271706847e-06,
"loss": 0.4771,
"step": 1120
},
{
"epoch": 1.9652173913043478,
"grad_norm": 1.26652793242089,
"learning_rate": 7.119315752561879e-06,
"loss": 0.4628,
"step": 1130
},
{
"epoch": 1.982608695652174,
"grad_norm": 1.0230830545480807,
"learning_rate": 7.0670817887264375e-06,
"loss": 0.4961,
"step": 1140
},
{
"epoch": 2.0,
"grad_norm": 1.3043673360211918,
"learning_rate": 7.014574292213423e-06,
"loss": 0.4854,
"step": 1150
},
{
"epoch": 2.017391304347826,
"grad_norm": 1.2839938386528644,
"learning_rate": 6.9618002112317525e-06,
"loss": 0.4159,
"step": 1160
},
{
"epoch": 2.034782608695652,
"grad_norm": 1.485302697617233,
"learning_rate": 6.908766529266915e-06,
"loss": 0.4513,
"step": 1170
},
{
"epoch": 2.0521739130434784,
"grad_norm": 1.3513626513686101,
"learning_rate": 6.855480264156864e-06,
"loss": 0.4147,
"step": 1180
},
{
"epoch": 2.0695652173913044,
"grad_norm": 1.1971012611734106,
"learning_rate": 6.8019484671633586e-06,
"loss": 0.4279,
"step": 1190
},
{
"epoch": 2.0869565217391304,
"grad_norm": 1.1493410270586013,
"learning_rate": 6.748178222038891e-06,
"loss": 0.4121,
"step": 1200
},
{
"epoch": 2.1043478260869564,
"grad_norm": 1.2061559921790461,
"learning_rate": 6.694176644089297e-06,
"loss": 0.4302,
"step": 1210
},
{
"epoch": 2.121739130434783,
"grad_norm": 1.2134461208204077,
"learning_rate": 6.639950879232218e-06,
"loss": 0.4143,
"step": 1220
},
{
"epoch": 2.139130434782609,
"grad_norm": 1.1936990104884397,
"learning_rate": 6.585508103051478e-06,
"loss": 0.411,
"step": 1230
},
{
"epoch": 2.1565217391304348,
"grad_norm": 1.473818496639146,
"learning_rate": 6.530855519847568e-06,
"loss": 0.4247,
"step": 1240
},
{
"epoch": 2.1739130434782608,
"grad_norm": 1.2273898491461919,
"learning_rate": 6.4760003616843106e-06,
"loss": 0.4,
"step": 1250
},
{
"epoch": 2.1913043478260867,
"grad_norm": 1.193058454958422,
"learning_rate": 6.420949887431855e-06,
"loss": 0.4016,
"step": 1260
},
{
"epoch": 2.208695652173913,
"grad_norm": 1.3961066733797438,
"learning_rate": 6.3657113818061225e-06,
"loss": 0.4408,
"step": 1270
},
{
"epoch": 2.226086956521739,
"grad_norm": 1.292360770899796,
"learning_rate": 6.310292154404844e-06,
"loss": 0.3999,
"step": 1280
},
{
"epoch": 2.243478260869565,
"grad_norm": 3.4982840024869417,
"learning_rate": 6.254699538740292e-06,
"loss": 0.4329,
"step": 1290
},
{
"epoch": 2.260869565217391,
"grad_norm": 2.265019151887774,
"learning_rate": 6.198940891268844e-06,
"loss": 0.4282,
"step": 1300
},
{
"epoch": 2.2782608695652176,
"grad_norm": 1.1398620538733453,
"learning_rate": 6.14302359041753e-06,
"loss": 0.4222,
"step": 1310
},
{
"epoch": 2.2956521739130435,
"grad_norm": 1.523968350586471,
"learning_rate": 6.086955035607655e-06,
"loss": 0.4317,
"step": 1320
},
{
"epoch": 2.3130434782608695,
"grad_norm": 2.2066066593878197,
"learning_rate": 6.030742646275647e-06,
"loss": 0.4212,
"step": 1330
},
{
"epoch": 2.3304347826086955,
"grad_norm": 1.2410016732543188,
"learning_rate": 5.9743938608912626e-06,
"loss": 0.4334,
"step": 1340
},
{
"epoch": 2.3478260869565215,
"grad_norm": 1.210595936193342,
"learning_rate": 5.917916135973263e-06,
"loss": 0.4153,
"step": 1350
},
{
"epoch": 2.365217391304348,
"grad_norm": 1.4742810962432356,
"learning_rate": 5.861316945102717e-06,
"loss": 0.4103,
"step": 1360
},
{
"epoch": 2.382608695652174,
"grad_norm": 1.1807404650454025,
"learning_rate": 5.804603777934032e-06,
"loss": 0.4104,
"step": 1370
},
{
"epoch": 2.4,
"grad_norm": 1.32289460697371,
"learning_rate": 5.74778413920386e-06,
"loss": 0.4245,
"step": 1380
},
{
"epoch": 2.417391304347826,
"grad_norm": 1.4542008533192958,
"learning_rate": 5.690865547738021e-06,
"loss": 0.4412,
"step": 1390
},
{
"epoch": 2.4347826086956523,
"grad_norm": 1.2739934708683387,
"learning_rate": 5.6338555354565445e-06,
"loss": 0.4281,
"step": 1400
},
{
"epoch": 2.4521739130434783,
"grad_norm": 1.563662237308135,
"learning_rate": 5.576761646376987e-06,
"loss": 0.4359,
"step": 1410
},
{
"epoch": 2.4695652173913043,
"grad_norm": 1.225022509026475,
"learning_rate": 5.519591435616153e-06,
"loss": 0.4377,
"step": 1420
},
{
"epoch": 2.4869565217391303,
"grad_norm": 1.1686568195887326,
"learning_rate": 5.462352468390333e-06,
"loss": 0.4229,
"step": 1430
},
{
"epoch": 2.5043478260869563,
"grad_norm": 1.2575917217265098,
"learning_rate": 5.405052319014223e-06,
"loss": 0.4225,
"step": 1440
},
{
"epoch": 2.5217391304347827,
"grad_norm": 1.2880775613395745,
"learning_rate": 5.347698569898624e-06,
"loss": 0.4245,
"step": 1450
},
{
"epoch": 2.5391304347826087,
"grad_norm": 1.3156168488856514,
"learning_rate": 5.290298810547083e-06,
"loss": 0.3993,
"step": 1460
},
{
"epoch": 2.5565217391304347,
"grad_norm": 1.1736379125037195,
"learning_rate": 5.232860636551583e-06,
"loss": 0.4324,
"step": 1470
},
{
"epoch": 2.573913043478261,
"grad_norm": 1.2353830078230719,
"learning_rate": 5.175391648587443e-06,
"loss": 0.4185,
"step": 1480
},
{
"epoch": 2.591304347826087,
"grad_norm": 1.1458194395748447,
"learning_rate": 5.117899451407526e-06,
"loss": 0.4309,
"step": 1490
},
{
"epoch": 2.608695652173913,
"grad_norm": 1.2538290106691092,
"learning_rate": 5.060391652835925e-06,
"loss": 0.4292,
"step": 1500
},
{
"epoch": 2.626086956521739,
"grad_norm": 1.571430145629218,
"learning_rate": 5.002875862761234e-06,
"loss": 0.4268,
"step": 1510
},
{
"epoch": 2.643478260869565,
"grad_norm": 1.1840870452769912,
"learning_rate": 4.9453596921295435e-06,
"loss": 0.4319,
"step": 1520
},
{
"epoch": 2.660869565217391,
"grad_norm": 1.0371855886325287,
"learning_rate": 4.8878507519373055e-06,
"loss": 0.4378,
"step": 1530
},
{
"epoch": 2.6782608695652175,
"grad_norm": 1.0983294473272651,
"learning_rate": 4.830356652224181e-06,
"loss": 0.4058,
"step": 1540
},
{
"epoch": 2.6956521739130435,
"grad_norm": 1.127081289246426,
"learning_rate": 4.77288500106602e-06,
"loss": 0.4165,
"step": 1550
},
{
"epoch": 2.7130434782608694,
"grad_norm": 1.236794619724129,
"learning_rate": 4.715443403568103e-06,
"loss": 0.4148,
"step": 1560
},
{
"epoch": 2.730434782608696,
"grad_norm": 1.5345488197083625,
"learning_rate": 4.65803946085877e-06,
"loss": 0.4088,
"step": 1570
},
{
"epoch": 2.747826086956522,
"grad_norm": 1.3004765992472387,
"learning_rate": 4.600680769083585e-06,
"loss": 0.4359,
"step": 1580
},
{
"epoch": 2.765217391304348,
"grad_norm": 1.2501644106321015,
"learning_rate": 4.543374918400142e-06,
"loss": 0.4386,
"step": 1590
},
{
"epoch": 2.782608695652174,
"grad_norm": 1.3823645221856542,
"learning_rate": 4.486129491973687e-06,
"loss": 0.4517,
"step": 1600
},
{
"epoch": 2.8,
"grad_norm": 1.2761252508293937,
"learning_rate": 4.4289520649736475e-06,
"loss": 0.4283,
"step": 1610
},
{
"epoch": 2.8173913043478263,
"grad_norm": 1.1632722443857555,
"learning_rate": 4.371850203571225e-06,
"loss": 0.421,
"step": 1620
},
{
"epoch": 2.8347826086956522,
"grad_norm": 1.2193532972111443,
"learning_rate": 4.314831463938184e-06,
"loss": 0.4188,
"step": 1630
},
{
"epoch": 2.8521739130434782,
"grad_norm": 1.4424575510136903,
"learning_rate": 4.257903391246954e-06,
"loss": 0.4094,
"step": 1640
},
{
"epoch": 2.869565217391304,
"grad_norm": 1.095166597880803,
"learning_rate": 4.201073518672195e-06,
"loss": 0.4411,
"step": 1650
},
{
"epoch": 2.8869565217391306,
"grad_norm": 1.2662835313537542,
"learning_rate": 4.144349366393949e-06,
"loss": 0.4423,
"step": 1660
},
{
"epoch": 2.9043478260869566,
"grad_norm": 1.1922956806962224,
"learning_rate": 4.08773844060251e-06,
"loss": 0.4254,
"step": 1670
},
{
"epoch": 2.9217391304347826,
"grad_norm": 1.257711038107532,
"learning_rate": 4.031248232505139e-06,
"loss": 0.4358,
"step": 1680
},
{
"epoch": 2.9391304347826086,
"grad_norm": 1.6971639546827986,
"learning_rate": 3.97488621733478e-06,
"loss": 0.4084,
"step": 1690
},
{
"epoch": 2.9565217391304346,
"grad_norm": 1.349596141789289,
"learning_rate": 3.918659853360864e-06,
"loss": 0.4274,
"step": 1700
},
{
"epoch": 2.973913043478261,
"grad_norm": 1.3678548460193103,
"learning_rate": 3.862576580902383e-06,
"loss": 0.4187,
"step": 1710
},
{
"epoch": 2.991304347826087,
"grad_norm": 1.2860411793928146,
"learning_rate": 3.8066438213433234e-06,
"loss": 0.4303,
"step": 1720
},
{
"epoch": 3.008695652173913,
"grad_norm": 1.5134066645346198,
"learning_rate": 3.7508689761506055e-06,
"loss": 0.3752,
"step": 1730
},
{
"epoch": 3.026086956521739,
"grad_norm": 1.424990341239896,
"learning_rate": 3.6952594258946693e-06,
"loss": 0.3434,
"step": 1740
},
{
"epoch": 3.0434782608695654,
"grad_norm": 1.6649951275575265,
"learning_rate": 3.6398225292728185e-06,
"loss": 0.3607,
"step": 1750
},
{
"epoch": 3.0608695652173914,
"grad_norm": 1.8127043018682598,
"learning_rate": 3.584565622135453e-06,
"loss": 0.3623,
"step": 1760
},
{
"epoch": 3.0782608695652174,
"grad_norm": 1.3226724796309823,
"learning_rate": 3.5294960165153363e-06,
"loss": 0.3579,
"step": 1770
},
{
"epoch": 3.0956521739130434,
"grad_norm": 1.4747389958766943,
"learning_rate": 3.474620999660007e-06,
"loss": 0.3973,
"step": 1780
},
{
"epoch": 3.1130434782608694,
"grad_norm": 1.9045203788068308,
"learning_rate": 3.4199478330674745e-06,
"loss": 0.3579,
"step": 1790
},
{
"epoch": 3.130434782608696,
"grad_norm": 1.2899295921725031,
"learning_rate": 3.365483751525317e-06,
"loss": 0.3634,
"step": 1800
},
{
"epoch": 3.1478260869565218,
"grad_norm": 1.4141207800877191,
"learning_rate": 3.3112359621533193e-06,
"loss": 0.3427,
"step": 1810
},
{
"epoch": 3.1652173913043478,
"grad_norm": 1.4287974659319675,
"learning_rate": 3.257211643449768e-06,
"loss": 0.3425,
"step": 1820
},
{
"epoch": 3.1826086956521737,
"grad_norm": 1.3872599954983553,
"learning_rate": 3.203417944341536e-06,
"loss": 0.3458,
"step": 1830
},
{
"epoch": 3.2,
"grad_norm": 2.2293598598474245,
"learning_rate": 3.149861983238082e-06,
"loss": 0.3689,
"step": 1840
},
{
"epoch": 3.217391304347826,
"grad_norm": 2.0325771541464217,
"learning_rate": 3.0965508470894812e-06,
"loss": 0.3521,
"step": 1850
},
{
"epoch": 3.234782608695652,
"grad_norm": 1.4460102517208502,
"learning_rate": 3.0434915904486284e-06,
"loss": 0.358,
"step": 1860
},
{
"epoch": 3.252173913043478,
"grad_norm": 1.3048211800924494,
"learning_rate": 2.990691234537721e-06,
"loss": 0.3405,
"step": 1870
},
{
"epoch": 3.269565217391304,
"grad_norm": 1.3241574204938475,
"learning_rate": 2.938156766319156e-06,
"loss": 0.384,
"step": 1880
},
{
"epoch": 3.2869565217391306,
"grad_norm": 1.6150241974779345,
"learning_rate": 2.885895137570958e-06,
"loss": 0.3621,
"step": 1890
},
{
"epoch": 3.3043478260869565,
"grad_norm": 1.2579063705669289,
"learning_rate": 2.83391326396687e-06,
"loss": 0.3884,
"step": 1900
},
{
"epoch": 3.3217391304347825,
"grad_norm": 1.3801202825398762,
"learning_rate": 2.7822180241612077e-06,
"loss": 0.3798,
"step": 1910
},
{
"epoch": 3.3391304347826085,
"grad_norm": 1.396049263670027,
"learning_rate": 2.7308162588786303e-06,
"loss": 0.3605,
"step": 1920
},
{
"epoch": 3.356521739130435,
"grad_norm": 1.3748023066384665,
"learning_rate": 2.6797147700089167e-06,
"loss": 0.3579,
"step": 1930
},
{
"epoch": 3.373913043478261,
"grad_norm": 1.5175283161299356,
"learning_rate": 2.6289203197068834e-06,
"loss": 0.3784,
"step": 1940
},
{
"epoch": 3.391304347826087,
"grad_norm": 1.5851194282172685,
"learning_rate": 2.5784396294975677e-06,
"loss": 0.3606,
"step": 1950
},
{
"epoch": 3.408695652173913,
"grad_norm": 1.349779292865959,
"learning_rate": 2.528279379386783e-06,
"loss": 0.3694,
"step": 1960
},
{
"epoch": 3.426086956521739,
"grad_norm": 1.5339986272397241,
"learning_rate": 2.478446206977159e-06,
"loss": 0.3771,
"step": 1970
},
{
"epoch": 3.4434782608695653,
"grad_norm": 1.430106842741753,
"learning_rate": 2.4289467065898085e-06,
"loss": 0.369,
"step": 1980
},
{
"epoch": 3.4608695652173913,
"grad_norm": 1.457323252888418,
"learning_rate": 2.3797874283917127e-06,
"loss": 0.3403,
"step": 1990
},
{
"epoch": 3.4782608695652173,
"grad_norm": 1.4282143674816088,
"learning_rate": 2.3309748775289497e-06,
"loss": 0.3845,
"step": 2000
},
{
"epoch": 3.4956521739130437,
"grad_norm": 1.3173625440079775,
"learning_rate": 2.282515513265885e-06,
"loss": 0.3695,
"step": 2010
},
{
"epoch": 3.5130434782608697,
"grad_norm": 1.5391795918668048,
"learning_rate": 2.2344157481304267e-06,
"loss": 0.384,
"step": 2020
},
{
"epoch": 3.5304347826086957,
"grad_norm": 1.5462876476217569,
"learning_rate": 2.1866819470654727e-06,
"loss": 0.362,
"step": 2030
},
{
"epoch": 3.5478260869565217,
"grad_norm": 1.5819871351937547,
"learning_rate": 2.1393204265866467e-06,
"loss": 0.3788,
"step": 2040
},
{
"epoch": 3.5652173913043477,
"grad_norm": 1.6521343485618902,
"learning_rate": 2.09233745394645e-06,
"loss": 0.3785,
"step": 2050
},
{
"epoch": 3.5826086956521737,
"grad_norm": 1.5040618975765006,
"learning_rate": 2.0457392463049285e-06,
"loss": 0.359,
"step": 2060
},
{
"epoch": 3.6,
"grad_norm": 1.6074930743024214,
"learning_rate": 1.9995319699069664e-06,
"loss": 0.3652,
"step": 2070
},
{
"epoch": 3.617391304347826,
"grad_norm": 1.5631144357601194,
"learning_rate": 1.9537217392663218e-06,
"loss": 0.3626,
"step": 2080
},
{
"epoch": 3.634782608695652,
"grad_norm": 1.5044302087614625,
"learning_rate": 1.908314616356505e-06,
"loss": 0.388,
"step": 2090
},
{
"epoch": 3.6521739130434785,
"grad_norm": 1.3802858769348085,
"learning_rate": 1.8633166098086103e-06,
"loss": 0.3629,
"step": 2100
},
{
"epoch": 3.6695652173913045,
"grad_norm": 1.4222636474825245,
"learning_rate": 1.818733674116207e-06,
"loss": 0.3651,
"step": 2110
},
{
"epoch": 3.6869565217391305,
"grad_norm": 1.5026251469181622,
"learning_rate": 1.7745717088473895e-06,
"loss": 0.3772,
"step": 2120
},
{
"epoch": 3.7043478260869565,
"grad_norm": 1.4848721046911413,
"learning_rate": 1.7308365578641089e-06,
"loss": 0.3522,
"step": 2130
},
{
"epoch": 3.7217391304347824,
"grad_norm": 1.6975508268605253,
"learning_rate": 1.687534008548854e-06,
"loss": 0.3775,
"step": 2140
},
{
"epoch": 3.7391304347826084,
"grad_norm": 1.7810729043290516,
"learning_rate": 1.6446697910388294e-06,
"loss": 0.3647,
"step": 2150
},
{
"epoch": 3.756521739130435,
"grad_norm": 1.4737138340480456,
"learning_rate": 1.6022495774676916e-06,
"loss": 0.3723,
"step": 2160
},
{
"epoch": 3.773913043478261,
"grad_norm": 1.5559891939482013,
"learning_rate": 1.5602789812149727e-06,
"loss": 0.3503,
"step": 2170
},
{
"epoch": 3.791304347826087,
"grad_norm": 1.663018576508128,
"learning_rate": 1.5187635561632685e-06,
"loss": 0.3561,
"step": 2180
},
{
"epoch": 3.8086956521739133,
"grad_norm": 2.634784063202797,
"learning_rate": 1.477708795963308e-06,
"loss": 0.3406,
"step": 2190
},
{
"epoch": 3.8260869565217392,
"grad_norm": 1.5139630905983323,
"learning_rate": 1.4371201333069868e-06,
"loss": 0.3681,
"step": 2200
},
{
"epoch": 3.8434782608695652,
"grad_norm": 2.3176470378092424,
"learning_rate": 1.3970029392084771e-06,
"loss": 0.3541,
"step": 2210
},
{
"epoch": 3.860869565217391,
"grad_norm": 1.5959870558495743,
"learning_rate": 1.3573625222934829e-06,
"loss": 0.3675,
"step": 2220
},
{
"epoch": 3.878260869565217,
"grad_norm": 1.6337373914467612,
"learning_rate": 1.3182041280967656e-06,
"loss": 0.3514,
"step": 2230
},
{
"epoch": 3.8956521739130436,
"grad_norm": 1.4395314046176468,
"learning_rate": 1.2795329383680138e-06,
"loss": 0.3566,
"step": 2240
},
{
"epoch": 3.9130434782608696,
"grad_norm": 1.3199487970653823,
"learning_rate": 1.241354070386151e-06,
"loss": 0.3441,
"step": 2250
},
{
"epoch": 3.9304347826086956,
"grad_norm": 1.7513753951221092,
"learning_rate": 1.2036725762821783e-06,
"loss": 0.3628,
"step": 2260
},
{
"epoch": 3.9478260869565216,
"grad_norm": 1.5442153725537786,
"learning_rate": 1.1664934423706348e-06,
"loss": 0.3497,
"step": 2270
},
{
"epoch": 3.965217391304348,
"grad_norm": 1.663632062954429,
"learning_rate": 1.12982158848977e-06,
"loss": 0.3576,
"step": 2280
},
{
"epoch": 3.982608695652174,
"grad_norm": 1.8278674162694661,
"learning_rate": 1.0936618673505112e-06,
"loss": 0.3744,
"step": 2290
},
{
"epoch": 4.0,
"grad_norm": 1.4687537010366873,
"learning_rate": 1.0580190638943138e-06,
"loss": 0.3549,
"step": 2300
},
{
"epoch": 4.017391304347826,
"grad_norm": 1.6716521197547283,
"learning_rate": 1.022897894659981e-06,
"loss": 0.3487,
"step": 2310
},
{
"epoch": 4.034782608695652,
"grad_norm": 1.4645341272970525,
"learning_rate": 9.883030071595335e-07,
"loss": 0.3158,
"step": 2320
},
{
"epoch": 4.052173913043478,
"grad_norm": 1.6557398280558575,
"learning_rate": 9.542389792632112e-07,
"loss": 0.331,
"step": 2330
},
{
"epoch": 4.069565217391304,
"grad_norm": 1.5960738974574165,
"learning_rate": 9.20710318593701e-07,
"loss": 0.3265,
"step": 2340
},
{
"epoch": 4.086956521739131,
"grad_norm": 1.4988502007449698,
"learning_rate": 8.877214619296421e-07,
"loss": 0.336,
"step": 2350
},
{
"epoch": 4.104347826086957,
"grad_norm": 1.4424789164433198,
"learning_rate": 8.552767746185215e-07,
"loss": 0.3347,
"step": 2360
},
{
"epoch": 4.121739130434783,
"grad_norm": 2.2719026550199475,
"learning_rate": 8.233805499990166e-07,
"loss": 0.3305,
"step": 2370
},
{
"epoch": 4.139130434782609,
"grad_norm": 1.4765476496178676,
"learning_rate": 7.920370088328672e-07,
"loss": 0.3484,
"step": 2380
},
{
"epoch": 4.156521739130435,
"grad_norm": 1.7030261009934748,
"learning_rate": 7.612502987463477e-07,
"loss": 0.3163,
"step": 2390
},
{
"epoch": 4.173913043478261,
"grad_norm": 1.4010667448317558,
"learning_rate": 7.310244936814232e-07,
"loss": 0.3222,
"step": 2400
},
{
"epoch": 4.191304347826087,
"grad_norm": 1.381057060755463,
"learning_rate": 7.013635933566515e-07,
"loss": 0.3175,
"step": 2410
},
{
"epoch": 4.208695652173913,
"grad_norm": 1.6489840526877189,
"learning_rate": 6.72271522737909e-07,
"loss": 0.3027,
"step": 2420
},
{
"epoch": 4.226086956521739,
"grad_norm": 1.4490054494783193,
"learning_rate": 6.437521315190087e-07,
"loss": 0.3529,
"step": 2430
},
{
"epoch": 4.243478260869566,
"grad_norm": 1.2850499028342035,
"learning_rate": 6.158091936122773e-07,
"loss": 0.3285,
"step": 2440
},
{
"epoch": 4.260869565217392,
"grad_norm": 1.2958305173488132,
"learning_rate": 5.884464066491613e-07,
"loss": 0.3128,
"step": 2450
},
{
"epoch": 4.278260869565218,
"grad_norm": 1.716194835927619,
"learning_rate": 5.616673914909282e-07,
"loss": 0.3377,
"step": 2460
},
{
"epoch": 4.2956521739130435,
"grad_norm": 1.5456459782523917,
"learning_rate": 5.354756917495224e-07,
"loss": 0.359,
"step": 2470
},
{
"epoch": 4.3130434782608695,
"grad_norm": 1.7049849318119408,
"learning_rate": 5.098747733186498e-07,
"loss": 0.3194,
"step": 2480
},
{
"epoch": 4.3304347826086955,
"grad_norm": 1.4376855175332917,
"learning_rate": 4.84868023915141e-07,
"loss": 0.3296,
"step": 2490
},
{
"epoch": 4.3478260869565215,
"grad_norm": 1.5575044356910057,
"learning_rate": 4.6045875263066474e-07,
"loss": 0.3347,
"step": 2500
},
{
"epoch": 4.3652173913043475,
"grad_norm": 1.6696231773916848,
"learning_rate": 4.366501894938363e-07,
"loss": 0.3509,
"step": 2510
},
{
"epoch": 4.3826086956521735,
"grad_norm": 1.4793874890497716,
"learning_rate": 4.1344548504280213e-07,
"loss": 0.316,
"step": 2520
},
{
"epoch": 4.4,
"grad_norm": 1.7504184745527158,
"learning_rate": 3.908477099083291e-07,
"loss": 0.3368,
"step": 2530
},
{
"epoch": 4.417391304347826,
"grad_norm": 1.5433746257360492,
"learning_rate": 3.6885985440747895e-07,
"loss": 0.3147,
"step": 2540
},
{
"epoch": 4.434782608695652,
"grad_norm": 1.6205451434238438,
"learning_rate": 3.474848281479032e-07,
"loss": 0.3198,
"step": 2550
},
{
"epoch": 4.452173913043478,
"grad_norm": 1.7748707264655563,
"learning_rate": 3.2672545964282263e-07,
"loss": 0.3337,
"step": 2560
},
{
"epoch": 4.469565217391304,
"grad_norm": 2.034670741566414,
"learning_rate": 3.0658449593673256e-07,
"loss": 0.2871,
"step": 2570
},
{
"epoch": 4.48695652173913,
"grad_norm": 1.9535751002955573,
"learning_rate": 2.8706460224189656e-07,
"loss": 0.3106,
"step": 2580
},
{
"epoch": 4.504347826086956,
"grad_norm": 1.4860584696429842,
"learning_rate": 2.6816836158566117e-07,
"loss": 0.3304,
"step": 2590
},
{
"epoch": 4.521739130434782,
"grad_norm": 1.5646385009356472,
"learning_rate": 2.498982744686501e-07,
"loss": 0.328,
"step": 2600
},
{
"epoch": 4.539130434782608,
"grad_norm": 1.6223923163321592,
"learning_rate": 2.3225675853387974e-07,
"loss": 0.3099,
"step": 2610
},
{
"epoch": 4.556521739130435,
"grad_norm": 1.4069397141672941,
"learning_rate": 2.1524614824683377e-07,
"loss": 0.3318,
"step": 2620
},
{
"epoch": 4.573913043478261,
"grad_norm": 1.5612850701477952,
"learning_rate": 1.98868694586552e-07,
"loss": 0.3376,
"step": 2630
},
{
"epoch": 4.591304347826087,
"grad_norm": 1.6586621274603865,
"learning_rate": 1.8312656474776093e-07,
"loss": 0.3034,
"step": 2640
},
{
"epoch": 4.608695652173913,
"grad_norm": 1.6277894314324821,
"learning_rate": 1.6802184185409355e-07,
"loss": 0.334,
"step": 2650
},
{
"epoch": 4.626086956521739,
"grad_norm": 1.5367253210879437,
"learning_rate": 1.5355652468243332e-07,
"loss": 0.3368,
"step": 2660
},
{
"epoch": 4.643478260869565,
"grad_norm": 1.42580964043298,
"learning_rate": 1.3973252739842236e-07,
"loss": 0.3107,
"step": 2670
},
{
"epoch": 4.660869565217391,
"grad_norm": 1.5836881272840495,
"learning_rate": 1.2655167930316236e-07,
"loss": 0.3337,
"step": 2680
},
{
"epoch": 4.678260869565217,
"grad_norm": 1.3933170878068741,
"learning_rate": 1.1401572459114441e-07,
"loss": 0.2988,
"step": 2690
},
{
"epoch": 4.695652173913043,
"grad_norm": 1.6629891174849243,
"learning_rate": 1.0212632211944906e-07,
"loss": 0.3038,
"step": 2700
},
{
"epoch": 4.71304347826087,
"grad_norm": 1.5180536343786482,
"learning_rate": 9.088504518822817e-08,
"loss": 0.2919,
"step": 2710
},
{
"epoch": 4.730434782608696,
"grad_norm": 1.4774316548557718,
"learning_rate": 8.029338133251518e-08,
"loss": 0.3253,
"step": 2720
},
{
"epoch": 4.747826086956522,
"grad_norm": 1.4916465022440153,
"learning_rate": 7.035273212538274e-08,
"loss": 0.3346,
"step": 2730
},
{
"epoch": 4.765217391304348,
"grad_norm": 1.6408617577143456,
"learning_rate": 6.10644129924759e-08,
"loss": 0.2921,
"step": 2740
},
{
"epoch": 4.782608695652174,
"grad_norm": 1.5935791526940084,
"learning_rate": 5.242965303794312e-08,
"loss": 0.3298,
"step": 2750
},
{
"epoch": 4.8,
"grad_norm": 1.3615616479228225,
"learning_rate": 4.444959488179301e-08,
"loss": 0.3084,
"step": 2760
},
{
"epoch": 4.817391304347826,
"grad_norm": 1.3883499339712515,
"learning_rate": 3.7125294508693066e-08,
"loss": 0.3117,
"step": 2770
},
{
"epoch": 4.834782608695652,
"grad_norm": 1.483718903196712,
"learning_rate": 3.045772112823253e-08,
"loss": 0.3259,
"step": 2780
},
{
"epoch": 4.852173913043478,
"grad_norm": 1.5939760050066456,
"learning_rate": 2.4447757046670017e-08,
"loss": 0.3079,
"step": 2790
},
{
"epoch": 4.869565217391305,
"grad_norm": 1.5266765004604943,
"learning_rate": 1.9096197550179664e-08,
"loss": 0.3214,
"step": 2800
},
{
"epoch": 4.886956521739131,
"grad_norm": 1.7078593350970195,
"learning_rate": 1.4403750799613092e-08,
"loss": 0.3419,
"step": 2810
},
{
"epoch": 4.904347826086957,
"grad_norm": 1.595487144492393,
"learning_rate": 1.0371037736787714e-08,
"loss": 0.3224,
"step": 2820
},
{
"epoch": 4.921739130434783,
"grad_norm": 1.470874925227166,
"learning_rate": 6.998592002321336e-09,
"loss": 0.3303,
"step": 2830
},
{
"epoch": 4.939130434782609,
"grad_norm": 1.6419169053737916,
"learning_rate": 4.286859865014204e-09,
"loss": 0.3237,
"step": 2840
},
{
"epoch": 4.956521739130435,
"grad_norm": 1.7910736800765954,
"learning_rate": 2.236200162798463e-09,
"loss": 0.2943,
"step": 2850
},
{
"epoch": 4.973913043478261,
"grad_norm": 1.4483382289129645,
"learning_rate": 8.468842552505907e-10,
"loss": 0.3202,
"step": 2860
},
{
"epoch": 4.9913043478260875,
"grad_norm": 1.4884875727469478,
"learning_rate": 1.1909598768400366e-10,
"loss": 0.3389,
"step": 2870
},
{
"epoch": 5.0,
"step": 2875,
"total_flos": 33447808868352.0,
"train_loss": 0.42618958398570184,
"train_runtime": 1921.5134,
"train_samples_per_second": 11.957,
"train_steps_per_second": 1.496
}
],
"logging_steps": 10,
"max_steps": 2875,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 33447808868352.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
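
The log above spans 5 epochs (2,875 optimizer steps, logged every 10 steps): the learning rate warms up to roughly 1e-5 by step 150 and then decays smoothly toward zero, while the training loss falls from about 0.62 to about 0.33 (overall train_loss 0.4262). As a minimal sketch, assuming only the standard transformers trainer_state.json layout shown above and a local copy of the file, it can be loaded and summarized like this:

import json

# Load the trainer state written by the transformers Trainer.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep the per-step entries; the final entry carries the run summary, not a loss.
logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

print(f"logged points : {len(logs)}")
print(f"loss          : {losses[0]:.4f} -> {losses[-1]:.4f}")
print(f"peak LR       : {max(lrs):.2e} (step {steps[lrs.index(max(lrs))]})")
print(f"run summary   : {state['log_history'][-1]}")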