sidewalk-validator-ai-curbramp / trainer_state.json
{
"best_metric": 0.8691893001382458,
"best_model_checkpoint": "CurbRamp/dinov2/checkpoint-2875",
"epoch": 24.786177105831534,
"eval_steps": 500,
"global_step": 2875,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08639308855291576,
"grad_norm": 381.287109375,
"learning_rate": 3.472222222222223e-07,
"loss": 3.0906,
"step": 10
},
{
"epoch": 0.17278617710583152,
"grad_norm": 161.04339599609375,
"learning_rate": 6.944444444444446e-07,
"loss": 2.6994,
"step": 20
},
{
"epoch": 0.2591792656587473,
"grad_norm": 111.33332824707031,
"learning_rate": 1.0416666666666667e-06,
"loss": 2.6304,
"step": 30
},
{
"epoch": 0.34557235421166305,
"grad_norm": 74.54171752929688,
"learning_rate": 1.3888888888888892e-06,
"loss": 2.4279,
"step": 40
},
{
"epoch": 0.4319654427645788,
"grad_norm": 62.23763656616211,
"learning_rate": 1.7361111111111112e-06,
"loss": 2.0601,
"step": 50
},
{
"epoch": 0.5183585313174947,
"grad_norm": 109.49163055419922,
"learning_rate": 2.0833333333333334e-06,
"loss": 2.2144,
"step": 60
},
{
"epoch": 0.6047516198704104,
"grad_norm": 140.09844970703125,
"learning_rate": 2.4305555555555557e-06,
"loss": 1.9144,
"step": 70
},
{
"epoch": 0.6911447084233261,
"grad_norm": 86.56855010986328,
"learning_rate": 2.7777777777777783e-06,
"loss": 1.9794,
"step": 80
},
{
"epoch": 0.7775377969762419,
"grad_norm": 87.58937072753906,
"learning_rate": 3.125e-06,
"loss": 1.99,
"step": 90
},
{
"epoch": 0.8639308855291576,
"grad_norm": 95.44474029541016,
"learning_rate": 3.4722222222222224e-06,
"loss": 1.8081,
"step": 100
},
{
"epoch": 0.9503239740820735,
"grad_norm": 135.51611328125,
"learning_rate": 3.819444444444444e-06,
"loss": 1.723,
"step": 110
},
{
"epoch": 1.0,
"eval_loss": 0.5266662836074829,
"eval_macro_f1": 0.7558472709624431,
"eval_runtime": 52.7672,
"eval_samples_per_second": 17.549,
"eval_steps_per_second": 2.198,
"step": 116
},
{
"epoch": 1.0345572354211663,
"grad_norm": 67.76123809814453,
"learning_rate": 4.166666666666667e-06,
"loss": 1.7629,
"step": 120
},
{
"epoch": 1.1209503239740821,
"grad_norm": 164.42552185058594,
"learning_rate": 4.5138888888888895e-06,
"loss": 1.7207,
"step": 130
},
{
"epoch": 1.2073434125269977,
"grad_norm": 63.25754165649414,
"learning_rate": 4.861111111111111e-06,
"loss": 1.8562,
"step": 140
},
{
"epoch": 1.2937365010799136,
"grad_norm": 101.76183319091797,
"learning_rate": 5.208333333333334e-06,
"loss": 1.5125,
"step": 150
},
{
"epoch": 1.3801295896328294,
"grad_norm": 104.06229400634766,
"learning_rate": 5.555555555555557e-06,
"loss": 1.7283,
"step": 160
},
{
"epoch": 1.4665226781857452,
"grad_norm": 100.93425750732422,
"learning_rate": 5.9027777777777785e-06,
"loss": 1.8195,
"step": 170
},
{
"epoch": 1.552915766738661,
"grad_norm": 73.55767822265625,
"learning_rate": 6.25e-06,
"loss": 1.6554,
"step": 180
},
{
"epoch": 1.6393088552915767,
"grad_norm": 80.74930572509766,
"learning_rate": 6.597222222222223e-06,
"loss": 1.4719,
"step": 190
},
{
"epoch": 1.7257019438444925,
"grad_norm": 70.96980285644531,
"learning_rate": 6.944444444444445e-06,
"loss": 1.8123,
"step": 200
},
{
"epoch": 1.812095032397408,
"grad_norm": 59.61140823364258,
"learning_rate": 7.291666666666667e-06,
"loss": 1.4448,
"step": 210
},
{
"epoch": 1.898488120950324,
"grad_norm": 278.9685974121094,
"learning_rate": 7.638888888888888e-06,
"loss": 1.7311,
"step": 220
},
{
"epoch": 1.9848812095032398,
"grad_norm": 149.49319458007812,
"learning_rate": 7.986111111111112e-06,
"loss": 1.79,
"step": 230
},
{
"epoch": 2.0,
"eval_loss": 0.5045331716537476,
"eval_macro_f1": 0.7580948382431834,
"eval_runtime": 16.9579,
"eval_samples_per_second": 54.606,
"eval_steps_per_second": 6.84,
"step": 232
},
{
"epoch": 2.0691144708423326,
"grad_norm": 39.244441986083984,
"learning_rate": 8.333333333333334e-06,
"loss": 1.4816,
"step": 240
},
{
"epoch": 2.1555075593952484,
"grad_norm": 62.760921478271484,
"learning_rate": 8.680555555555557e-06,
"loss": 1.5412,
"step": 250
},
{
"epoch": 2.2419006479481642,
"grad_norm": 58.07555389404297,
"learning_rate": 9.027777777777779e-06,
"loss": 1.5415,
"step": 260
},
{
"epoch": 2.32829373650108,
"grad_norm": 83.29940032958984,
"learning_rate": 9.375000000000001e-06,
"loss": 1.3842,
"step": 270
},
{
"epoch": 2.4146868250539955,
"grad_norm": 91.45130157470703,
"learning_rate": 9.722222222222223e-06,
"loss": 1.3325,
"step": 280
},
{
"epoch": 2.5010799136069113,
"grad_norm": 63.806114196777344,
"learning_rate": 9.99226903749517e-06,
"loss": 1.4369,
"step": 290
},
{
"epoch": 2.587473002159827,
"grad_norm": 70.26383972167969,
"learning_rate": 9.95361422497101e-06,
"loss": 1.8433,
"step": 300
},
{
"epoch": 2.673866090712743,
"grad_norm": 136.43692016601562,
"learning_rate": 9.914959412446851e-06,
"loss": 1.9714,
"step": 310
},
{
"epoch": 2.760259179265659,
"grad_norm": 66.07856750488281,
"learning_rate": 9.87630459992269e-06,
"loss": 1.5597,
"step": 320
},
{
"epoch": 2.8466522678185746,
"grad_norm": 43.0218620300293,
"learning_rate": 9.837649787398532e-06,
"loss": 1.7732,
"step": 330
},
{
"epoch": 2.9330453563714904,
"grad_norm": 85.20895385742188,
"learning_rate": 9.798994974874372e-06,
"loss": 1.8472,
"step": 340
},
{
"epoch": 3.0,
"eval_loss": 0.5161793231964111,
"eval_macro_f1": 0.7796335673223247,
"eval_runtime": 17.216,
"eval_samples_per_second": 53.787,
"eval_steps_per_second": 6.738,
"step": 348
},
{
"epoch": 3.0172786177105833,
"grad_norm": 42.26615905761719,
"learning_rate": 9.760340162350214e-06,
"loss": 1.261,
"step": 350
},
{
"epoch": 3.103671706263499,
"grad_norm": 152.00830078125,
"learning_rate": 9.721685349826055e-06,
"loss": 1.1856,
"step": 360
},
{
"epoch": 3.190064794816415,
"grad_norm": 70.4239273071289,
"learning_rate": 9.683030537301895e-06,
"loss": 1.2873,
"step": 370
},
{
"epoch": 3.2764578833693303,
"grad_norm": 48.222984313964844,
"learning_rate": 9.644375724777736e-06,
"loss": 1.0901,
"step": 380
},
{
"epoch": 3.362850971922246,
"grad_norm": 56.9564323425293,
"learning_rate": 9.605720912253576e-06,
"loss": 1.1631,
"step": 390
},
{
"epoch": 3.449244060475162,
"grad_norm": 48.229427337646484,
"learning_rate": 9.567066099729418e-06,
"loss": 1.3436,
"step": 400
},
{
"epoch": 3.535637149028078,
"grad_norm": 69.0005111694336,
"learning_rate": 9.528411287205257e-06,
"loss": 1.0393,
"step": 410
},
{
"epoch": 3.6220302375809936,
"grad_norm": 69.22911071777344,
"learning_rate": 9.489756474681099e-06,
"loss": 1.3696,
"step": 420
},
{
"epoch": 3.708423326133909,
"grad_norm": 89.3969497680664,
"learning_rate": 9.45110166215694e-06,
"loss": 1.2418,
"step": 430
},
{
"epoch": 3.7948164146868253,
"grad_norm": 76.02294921875,
"learning_rate": 9.41244684963278e-06,
"loss": 1.3444,
"step": 440
},
{
"epoch": 3.8812095032397407,
"grad_norm": 79.65322875976562,
"learning_rate": 9.373792037108622e-06,
"loss": 1.5226,
"step": 450
},
{
"epoch": 3.9676025917926565,
"grad_norm": 118.36911010742188,
"learning_rate": 9.335137224584461e-06,
"loss": 1.4474,
"step": 460
},
{
"epoch": 4.0,
"eval_loss": 0.39091792702674866,
"eval_macro_f1": 0.8356080726022923,
"eval_runtime": 17.0101,
"eval_samples_per_second": 54.438,
"eval_steps_per_second": 6.819,
"step": 464
},
{
"epoch": 4.05183585313175,
"grad_norm": 34.40155029296875,
"learning_rate": 9.296482412060303e-06,
"loss": 0.9722,
"step": 470
},
{
"epoch": 4.138228941684665,
"grad_norm": 52.04656219482422,
"learning_rate": 9.257827599536143e-06,
"loss": 0.9749,
"step": 480
},
{
"epoch": 4.224622030237581,
"grad_norm": 65.06253814697266,
"learning_rate": 9.219172787011984e-06,
"loss": 1.1242,
"step": 490
},
{
"epoch": 4.311015118790497,
"grad_norm": 66.4505844116211,
"learning_rate": 9.180517974487826e-06,
"loss": 1.1458,
"step": 500
},
{
"epoch": 4.397408207343412,
"grad_norm": 158.91885375976562,
"learning_rate": 9.141863161963665e-06,
"loss": 1.3353,
"step": 510
},
{
"epoch": 4.4838012958963285,
"grad_norm": 60.72650909423828,
"learning_rate": 9.103208349439507e-06,
"loss": 1.1444,
"step": 520
},
{
"epoch": 4.570194384449244,
"grad_norm": 46.68295669555664,
"learning_rate": 9.064553536915347e-06,
"loss": 1.1094,
"step": 530
},
{
"epoch": 4.65658747300216,
"grad_norm": 121.51728820800781,
"learning_rate": 9.025898724391188e-06,
"loss": 1.398,
"step": 540
},
{
"epoch": 4.7429805615550755,
"grad_norm": 46.293556213378906,
"learning_rate": 8.987243911867028e-06,
"loss": 1.1777,
"step": 550
},
{
"epoch": 4.829373650107991,
"grad_norm": 67.82626342773438,
"learning_rate": 8.94858909934287e-06,
"loss": 1.0463,
"step": 560
},
{
"epoch": 4.915766738660907,
"grad_norm": 54.6425895690918,
"learning_rate": 8.90993428681871e-06,
"loss": 1.0591,
"step": 570
},
{
"epoch": 5.0,
"grad_norm": 46.120140075683594,
"learning_rate": 8.87127947429455e-06,
"loss": 1.0474,
"step": 580
},
{
"epoch": 5.0,
"eval_loss": 0.5121903419494629,
"eval_macro_f1": 0.7977332272718247,
"eval_runtime": 17.2455,
"eval_samples_per_second": 53.695,
"eval_steps_per_second": 6.726,
"step": 580
},
{
"epoch": 5.086393088552915,
"grad_norm": 59.340538024902344,
"learning_rate": 8.832624661770392e-06,
"loss": 0.9155,
"step": 590
},
{
"epoch": 5.172786177105832,
"grad_norm": 47.96815872192383,
"learning_rate": 8.793969849246232e-06,
"loss": 0.8381,
"step": 600
},
{
"epoch": 5.259179265658747,
"grad_norm": 50.32111358642578,
"learning_rate": 8.755315036722073e-06,
"loss": 0.9608,
"step": 610
},
{
"epoch": 5.345572354211663,
"grad_norm": 76.34898376464844,
"learning_rate": 8.716660224197913e-06,
"loss": 0.9766,
"step": 620
},
{
"epoch": 5.431965442764579,
"grad_norm": 40.53537368774414,
"learning_rate": 8.678005411673755e-06,
"loss": 1.1101,
"step": 630
},
{
"epoch": 5.518358531317495,
"grad_norm": 62.64771270751953,
"learning_rate": 8.639350599149594e-06,
"loss": 1.0492,
"step": 640
},
{
"epoch": 5.60475161987041,
"grad_norm": 34.409095764160156,
"learning_rate": 8.600695786625436e-06,
"loss": 1.0763,
"step": 650
},
{
"epoch": 5.691144708423326,
"grad_norm": 77.56722259521484,
"learning_rate": 8.562040974101277e-06,
"loss": 0.9917,
"step": 660
},
{
"epoch": 5.777537796976242,
"grad_norm": 37.74449920654297,
"learning_rate": 8.523386161577117e-06,
"loss": 1.1072,
"step": 670
},
{
"epoch": 5.863930885529157,
"grad_norm": 55.60712814331055,
"learning_rate": 8.484731349052959e-06,
"loss": 0.9095,
"step": 680
},
{
"epoch": 5.950323974082074,
"grad_norm": 45.91484451293945,
"learning_rate": 8.446076536528798e-06,
"loss": 0.931,
"step": 690
},
{
"epoch": 6.0,
"eval_loss": 0.41715672612190247,
"eval_macro_f1": 0.8296228150873965,
"eval_runtime": 16.9093,
"eval_samples_per_second": 54.763,
"eval_steps_per_second": 6.86,
"step": 696
},
{
"epoch": 6.0345572354211665,
"grad_norm": 38.476654052734375,
"learning_rate": 8.40742172400464e-06,
"loss": 0.7875,
"step": 700
},
{
"epoch": 6.120950323974082,
"grad_norm": 23.64281463623047,
"learning_rate": 8.36876691148048e-06,
"loss": 0.7492,
"step": 710
},
{
"epoch": 6.207343412526998,
"grad_norm": 102.30036926269531,
"learning_rate": 8.330112098956321e-06,
"loss": 0.6453,
"step": 720
},
{
"epoch": 6.293736501079914,
"grad_norm": 27.228309631347656,
"learning_rate": 8.291457286432163e-06,
"loss": 0.9595,
"step": 730
},
{
"epoch": 6.38012958963283,
"grad_norm": 54.58305358886719,
"learning_rate": 8.252802473908002e-06,
"loss": 0.5922,
"step": 740
},
{
"epoch": 6.466522678185745,
"grad_norm": 99.91376495361328,
"learning_rate": 8.214147661383844e-06,
"loss": 0.7878,
"step": 750
},
{
"epoch": 6.552915766738661,
"grad_norm": 91.14471435546875,
"learning_rate": 8.175492848859684e-06,
"loss": 0.7622,
"step": 760
},
{
"epoch": 6.639308855291577,
"grad_norm": 79.27790069580078,
"learning_rate": 8.136838036335525e-06,
"loss": 0.9503,
"step": 770
},
{
"epoch": 6.725701943844492,
"grad_norm": 70.4741439819336,
"learning_rate": 8.098183223811365e-06,
"loss": 0.8432,
"step": 780
},
{
"epoch": 6.8120950323974085,
"grad_norm": 62.995697021484375,
"learning_rate": 8.059528411287206e-06,
"loss": 0.8593,
"step": 790
},
{
"epoch": 6.898488120950324,
"grad_norm": 34.43967056274414,
"learning_rate": 8.020873598763048e-06,
"loss": 0.577,
"step": 800
},
{
"epoch": 6.984881209503239,
"grad_norm": 101.01258087158203,
"learning_rate": 7.982218786238888e-06,
"loss": 0.9042,
"step": 810
},
{
"epoch": 7.0,
"eval_loss": 0.4327438175678253,
"eval_macro_f1": 0.8414019539605782,
"eval_runtime": 16.9563,
"eval_samples_per_second": 54.611,
"eval_steps_per_second": 6.841,
"step": 812
},
{
"epoch": 7.069114470842333,
"grad_norm": 60.159706115722656,
"learning_rate": 7.943563973714729e-06,
"loss": 0.7565,
"step": 820
},
{
"epoch": 7.155507559395248,
"grad_norm": 94.81135559082031,
"learning_rate": 7.904909161190569e-06,
"loss": 0.5015,
"step": 830
},
{
"epoch": 7.241900647948164,
"grad_norm": 148.54061889648438,
"learning_rate": 7.86625434866641e-06,
"loss": 0.9681,
"step": 840
},
{
"epoch": 7.32829373650108,
"grad_norm": 89.39724731445312,
"learning_rate": 7.82759953614225e-06,
"loss": 1.3573,
"step": 850
},
{
"epoch": 7.4146868250539955,
"grad_norm": 32.64114761352539,
"learning_rate": 7.788944723618092e-06,
"loss": 0.5587,
"step": 860
},
{
"epoch": 7.501079913606912,
"grad_norm": 49.56209182739258,
"learning_rate": 7.750289911093933e-06,
"loss": 0.7781,
"step": 870
},
{
"epoch": 7.587473002159827,
"grad_norm": 65.53961181640625,
"learning_rate": 7.711635098569773e-06,
"loss": 0.6911,
"step": 880
},
{
"epoch": 7.6738660907127425,
"grad_norm": 52.27827835083008,
"learning_rate": 7.672980286045614e-06,
"loss": 0.8156,
"step": 890
},
{
"epoch": 7.760259179265659,
"grad_norm": 58.8625602722168,
"learning_rate": 7.634325473521454e-06,
"loss": 0.6504,
"step": 900
},
{
"epoch": 7.846652267818574,
"grad_norm": 58.183353424072266,
"learning_rate": 7.595670660997296e-06,
"loss": 0.8262,
"step": 910
},
{
"epoch": 7.93304535637149,
"grad_norm": 66.90734100341797,
"learning_rate": 7.557015848473136e-06,
"loss": 0.7893,
"step": 920
},
{
"epoch": 8.0,
"eval_loss": 0.4484424591064453,
"eval_macro_f1": 0.832676482797353,
"eval_runtime": 17.3598,
"eval_samples_per_second": 53.342,
"eval_steps_per_second": 6.682,
"step": 928
},
{
"epoch": 8.017278617710582,
"grad_norm": 56.62957000732422,
"learning_rate": 7.518361035948977e-06,
"loss": 0.6286,
"step": 930
},
{
"epoch": 8.1036717062635,
"grad_norm": 83.84600067138672,
"learning_rate": 7.4797062234248175e-06,
"loss": 0.809,
"step": 940
},
{
"epoch": 8.190064794816415,
"grad_norm": 63.44249725341797,
"learning_rate": 7.441051410900658e-06,
"loss": 0.7384,
"step": 950
},
{
"epoch": 8.27645788336933,
"grad_norm": 30.475244522094727,
"learning_rate": 7.402396598376499e-06,
"loss": 0.561,
"step": 960
},
{
"epoch": 8.362850971922246,
"grad_norm": 67.37047576904297,
"learning_rate": 7.363741785852339e-06,
"loss": 0.7086,
"step": 970
},
{
"epoch": 8.449244060475163,
"grad_norm": 61.570945739746094,
"learning_rate": 7.325086973328181e-06,
"loss": 0.5365,
"step": 980
},
{
"epoch": 8.535637149028078,
"grad_norm": 52.1601676940918,
"learning_rate": 7.2864321608040215e-06,
"loss": 0.4852,
"step": 990
},
{
"epoch": 8.622030237580994,
"grad_norm": 34.1832275390625,
"learning_rate": 7.247777348279862e-06,
"loss": 0.6972,
"step": 1000
},
{
"epoch": 8.708423326133909,
"grad_norm": 63.52924346923828,
"learning_rate": 7.209122535755703e-06,
"loss": 0.7469,
"step": 1010
},
{
"epoch": 8.794816414686824,
"grad_norm": 62.66010284423828,
"learning_rate": 7.170467723231543e-06,
"loss": 0.9771,
"step": 1020
},
{
"epoch": 8.881209503239742,
"grad_norm": 76.71471405029297,
"learning_rate": 7.131812910707384e-06,
"loss": 0.7411,
"step": 1030
},
{
"epoch": 8.967602591792657,
"grad_norm": 27.406539916992188,
"learning_rate": 7.093158098183225e-06,
"loss": 0.7943,
"step": 1040
},
{
"epoch": 9.0,
"eval_loss": 0.46906793117523193,
"eval_macro_f1": 0.8359894804603093,
"eval_runtime": 17.671,
"eval_samples_per_second": 52.402,
"eval_steps_per_second": 6.564,
"step": 1044
},
{
"epoch": 9.051835853131749,
"grad_norm": 16.15360450744629,
"learning_rate": 7.054503285659065e-06,
"loss": 0.4819,
"step": 1050
},
{
"epoch": 9.138228941684666,
"grad_norm": 86.55136108398438,
"learning_rate": 7.015848473134907e-06,
"loss": 0.8165,
"step": 1060
},
{
"epoch": 9.224622030237581,
"grad_norm": 58.48967361450195,
"learning_rate": 6.977193660610747e-06,
"loss": 0.6899,
"step": 1070
},
{
"epoch": 9.311015118790497,
"grad_norm": 74.65888214111328,
"learning_rate": 6.938538848086588e-06,
"loss": 0.4518,
"step": 1080
},
{
"epoch": 9.397408207343412,
"grad_norm": 32.85594940185547,
"learning_rate": 6.899884035562429e-06,
"loss": 0.5768,
"step": 1090
},
{
"epoch": 9.483801295896328,
"grad_norm": 97.70096588134766,
"learning_rate": 6.861229223038269e-06,
"loss": 0.5102,
"step": 1100
},
{
"epoch": 9.570194384449245,
"grad_norm": 46.2236213684082,
"learning_rate": 6.82257441051411e-06,
"loss": 0.5204,
"step": 1110
},
{
"epoch": 9.65658747300216,
"grad_norm": 67.02069091796875,
"learning_rate": 6.7839195979899505e-06,
"loss": 0.7371,
"step": 1120
},
{
"epoch": 9.742980561555076,
"grad_norm": 47.46559143066406,
"learning_rate": 6.745264785465792e-06,
"loss": 0.5447,
"step": 1130
},
{
"epoch": 9.829373650107991,
"grad_norm": 31.422773361206055,
"learning_rate": 6.706609972941633e-06,
"loss": 0.6734,
"step": 1140
},
{
"epoch": 9.915766738660906,
"grad_norm": 35.7156867980957,
"learning_rate": 6.667955160417473e-06,
"loss": 0.6255,
"step": 1150
},
{
"epoch": 10.0,
"grad_norm": 71.64591217041016,
"learning_rate": 6.629300347893314e-06,
"loss": 0.482,
"step": 1160
},
{
"epoch": 10.0,
"eval_loss": 0.44722601771354675,
"eval_macro_f1": 0.846851581651592,
"eval_runtime": 16.8938,
"eval_samples_per_second": 54.813,
"eval_steps_per_second": 6.866,
"step": 1160
},
{
"epoch": 10.086393088552915,
"grad_norm": 53.67696762084961,
"learning_rate": 6.5906455353691545e-06,
"loss": 0.5785,
"step": 1170
},
{
"epoch": 10.17278617710583,
"grad_norm": 26.457019805908203,
"learning_rate": 6.551990722844995e-06,
"loss": 0.528,
"step": 1180
},
{
"epoch": 10.259179265658748,
"grad_norm": 87.2643051147461,
"learning_rate": 6.513335910320836e-06,
"loss": 0.5065,
"step": 1190
},
{
"epoch": 10.345572354211663,
"grad_norm": 63.67802810668945,
"learning_rate": 6.474681097796676e-06,
"loss": 0.4843,
"step": 1200
},
{
"epoch": 10.431965442764579,
"grad_norm": 92.45316314697266,
"learning_rate": 6.436026285272518e-06,
"loss": 0.3929,
"step": 1210
},
{
"epoch": 10.518358531317494,
"grad_norm": 110.85811614990234,
"learning_rate": 6.3973714727483585e-06,
"loss": 0.5893,
"step": 1220
},
{
"epoch": 10.60475161987041,
"grad_norm": 84.82708740234375,
"learning_rate": 6.358716660224199e-06,
"loss": 0.5031,
"step": 1230
},
{
"epoch": 10.691144708423327,
"grad_norm": 75.6590576171875,
"learning_rate": 6.32006184770004e-06,
"loss": 0.5282,
"step": 1240
},
{
"epoch": 10.777537796976242,
"grad_norm": 55.62372589111328,
"learning_rate": 6.28140703517588e-06,
"loss": 0.4551,
"step": 1250
},
{
"epoch": 10.863930885529157,
"grad_norm": 61.89540100097656,
"learning_rate": 6.24275222265172e-06,
"loss": 0.5981,
"step": 1260
},
{
"epoch": 10.950323974082073,
"grad_norm": 51.5389404296875,
"learning_rate": 6.204097410127561e-06,
"loss": 0.5145,
"step": 1270
},
{
"epoch": 11.0,
"eval_loss": 0.45396292209625244,
"eval_macro_f1": 0.8514981354090372,
"eval_runtime": 16.9232,
"eval_samples_per_second": 54.718,
"eval_steps_per_second": 6.855,
"step": 1276
},
{
"epoch": 11.034557235421167,
"grad_norm": 73.23987579345703,
"learning_rate": 6.165442597603401e-06,
"loss": 0.5773,
"step": 1280
},
{
"epoch": 11.120950323974082,
"grad_norm": 26.071277618408203,
"learning_rate": 6.126787785079242e-06,
"loss": 0.5628,
"step": 1290
},
{
"epoch": 11.207343412526997,
"grad_norm": 49.137691497802734,
"learning_rate": 6.088132972555083e-06,
"loss": 0.4894,
"step": 1300
},
{
"epoch": 11.293736501079914,
"grad_norm": 63.3178825378418,
"learning_rate": 6.049478160030924e-06,
"loss": 0.6504,
"step": 1310
},
{
"epoch": 11.38012958963283,
"grad_norm": 20.981409072875977,
"learning_rate": 6.010823347506765e-06,
"loss": 0.4835,
"step": 1320
},
{
"epoch": 11.466522678185745,
"grad_norm": 35.6384162902832,
"learning_rate": 5.972168534982605e-06,
"loss": 0.568,
"step": 1330
},
{
"epoch": 11.55291576673866,
"grad_norm": 20.20071029663086,
"learning_rate": 5.933513722458446e-06,
"loss": 0.2126,
"step": 1340
},
{
"epoch": 11.639308855291576,
"grad_norm": 37.92521667480469,
"learning_rate": 5.894858909934287e-06,
"loss": 0.4932,
"step": 1350
},
{
"epoch": 11.725701943844493,
"grad_norm": 20.3985538482666,
"learning_rate": 5.856204097410127e-06,
"loss": 0.5133,
"step": 1360
},
{
"epoch": 11.812095032397409,
"grad_norm": 70.6824951171875,
"learning_rate": 5.817549284885968e-06,
"loss": 0.7987,
"step": 1370
},
{
"epoch": 11.898488120950324,
"grad_norm": 133.84133911132812,
"learning_rate": 5.778894472361809e-06,
"loss": 0.5215,
"step": 1380
},
{
"epoch": 11.98488120950324,
"grad_norm": 30.232711791992188,
"learning_rate": 5.74023965983765e-06,
"loss": 0.4581,
"step": 1390
},
{
"epoch": 12.0,
"eval_loss": 0.4816704988479614,
"eval_macro_f1": 0.8516025641025642,
"eval_runtime": 17.3032,
"eval_samples_per_second": 53.516,
"eval_steps_per_second": 6.704,
"step": 1392
},
{
"epoch": 12.069114470842333,
"grad_norm": 65.2093276977539,
"learning_rate": 5.701584847313491e-06,
"loss": 0.285,
"step": 1400
},
{
"epoch": 12.155507559395248,
"grad_norm": 73.72334289550781,
"learning_rate": 5.662930034789331e-06,
"loss": 0.5098,
"step": 1410
},
{
"epoch": 12.241900647948164,
"grad_norm": 49.37625503540039,
"learning_rate": 5.624275222265172e-06,
"loss": 0.623,
"step": 1420
},
{
"epoch": 12.32829373650108,
"grad_norm": 49.2904052734375,
"learning_rate": 5.5856204097410125e-06,
"loss": 0.4641,
"step": 1430
},
{
"epoch": 12.414686825053996,
"grad_norm": 29.282928466796875,
"learning_rate": 5.546965597216853e-06,
"loss": 0.4904,
"step": 1440
},
{
"epoch": 12.501079913606912,
"grad_norm": 51.97225570678711,
"learning_rate": 5.508310784692694e-06,
"loss": 0.4044,
"step": 1450
},
{
"epoch": 12.587473002159827,
"grad_norm": 44.74934768676758,
"learning_rate": 5.469655972168535e-06,
"loss": 0.6768,
"step": 1460
},
{
"epoch": 12.673866090712743,
"grad_norm": 40.65571975708008,
"learning_rate": 5.431001159644376e-06,
"loss": 0.3807,
"step": 1470
},
{
"epoch": 12.76025917926566,
"grad_norm": 204.83670043945312,
"learning_rate": 5.3923463471202165e-06,
"loss": 0.3421,
"step": 1480
},
{
"epoch": 12.846652267818575,
"grad_norm": 45.30833053588867,
"learning_rate": 5.353691534596057e-06,
"loss": 0.4806,
"step": 1490
},
{
"epoch": 12.93304535637149,
"grad_norm": 30.75054168701172,
"learning_rate": 5.315036722071898e-06,
"loss": 0.3743,
"step": 1500
},
{
"epoch": 13.0,
"eval_loss": 0.4918636083602905,
"eval_macro_f1": 0.8659523584493705,
"eval_runtime": 17.5008,
"eval_samples_per_second": 52.912,
"eval_steps_per_second": 6.628,
"step": 1508
},
{
"epoch": 13.017278617710582,
"grad_norm": 76.16972351074219,
"learning_rate": 5.2763819095477384e-06,
"loss": 0.2202,
"step": 1510
},
{
"epoch": 13.1036717062635,
"grad_norm": 2.941861152648926,
"learning_rate": 5.237727097023579e-06,
"loss": 0.5201,
"step": 1520
},
{
"epoch": 13.190064794816415,
"grad_norm": 102.31110382080078,
"learning_rate": 5.1990722844994205e-06,
"loss": 0.3936,
"step": 1530
},
{
"epoch": 13.27645788336933,
"grad_norm": 118.30928802490234,
"learning_rate": 5.160417471975261e-06,
"loss": 0.4541,
"step": 1540
},
{
"epoch": 13.362850971922246,
"grad_norm": 27.03119659423828,
"learning_rate": 5.121762659451102e-06,
"loss": 0.5705,
"step": 1550
},
{
"epoch": 13.449244060475163,
"grad_norm": 21.778711318969727,
"learning_rate": 5.083107846926942e-06,
"loss": 0.3093,
"step": 1560
},
{
"epoch": 13.535637149028078,
"grad_norm": 7.353912830352783,
"learning_rate": 5.044453034402783e-06,
"loss": 0.2553,
"step": 1570
},
{
"epoch": 13.622030237580994,
"grad_norm": 41.609153747558594,
"learning_rate": 5.005798221878624e-06,
"loss": 0.4612,
"step": 1580
},
{
"epoch": 13.708423326133909,
"grad_norm": 167.5371856689453,
"learning_rate": 4.967143409354465e-06,
"loss": 0.8447,
"step": 1590
},
{
"epoch": 13.794816414686824,
"grad_norm": 63.91857147216797,
"learning_rate": 4.928488596830306e-06,
"loss": 0.2376,
"step": 1600
},
{
"epoch": 13.881209503239742,
"grad_norm": 5.819667816162109,
"learning_rate": 4.889833784306146e-06,
"loss": 0.2624,
"step": 1610
},
{
"epoch": 13.967602591792657,
"grad_norm": 61.421180725097656,
"learning_rate": 4.851178971781987e-06,
"loss": 0.5579,
"step": 1620
},
{
"epoch": 14.0,
"eval_loss": 0.5265308022499084,
"eval_macro_f1": 0.8497394016895295,
"eval_runtime": 16.9332,
"eval_samples_per_second": 54.685,
"eval_steps_per_second": 6.85,
"step": 1624
},
{
"epoch": 14.051835853131749,
"grad_norm": 55.67881774902344,
"learning_rate": 4.8125241592578285e-06,
"loss": 0.3435,
"step": 1630
},
{
"epoch": 14.138228941684666,
"grad_norm": 68.73692321777344,
"learning_rate": 4.773869346733669e-06,
"loss": 0.4087,
"step": 1640
},
{
"epoch": 14.224622030237581,
"grad_norm": 17.55417823791504,
"learning_rate": 4.73521453420951e-06,
"loss": 0.4633,
"step": 1650
},
{
"epoch": 14.311015118790497,
"grad_norm": 71.73014831542969,
"learning_rate": 4.69655972168535e-06,
"loss": 0.3692,
"step": 1660
},
{
"epoch": 14.397408207343412,
"grad_norm": 56.00436782836914,
"learning_rate": 4.657904909161191e-06,
"loss": 0.3288,
"step": 1670
},
{
"epoch": 14.483801295896328,
"grad_norm": 25.29050636291504,
"learning_rate": 4.619250096637032e-06,
"loss": 0.3702,
"step": 1680
},
{
"epoch": 14.570194384449245,
"grad_norm": 56.083961486816406,
"learning_rate": 4.580595284112872e-06,
"loss": 0.3286,
"step": 1690
},
{
"epoch": 14.65658747300216,
"grad_norm": 75.87052154541016,
"learning_rate": 4.541940471588713e-06,
"loss": 0.4873,
"step": 1700
},
{
"epoch": 14.742980561555076,
"grad_norm": 44.087547302246094,
"learning_rate": 4.503285659064554e-06,
"loss": 0.2683,
"step": 1710
},
{
"epoch": 14.829373650107991,
"grad_norm": 22.894262313842773,
"learning_rate": 4.464630846540395e-06,
"loss": 0.3559,
"step": 1720
},
{
"epoch": 14.915766738660906,
"grad_norm": 52.25741195678711,
"learning_rate": 4.425976034016236e-06,
"loss": 0.3136,
"step": 1730
},
{
"epoch": 15.0,
"grad_norm": 29.12067222595215,
"learning_rate": 4.387321221492076e-06,
"loss": 0.5044,
"step": 1740
},
{
"epoch": 15.0,
"eval_loss": 0.638742983341217,
"eval_macro_f1": 0.8443575406474457,
"eval_runtime": 17.2989,
"eval_samples_per_second": 53.53,
"eval_steps_per_second": 6.706,
"step": 1740
},
{
"epoch": 15.086393088552915,
"grad_norm": 47.597984313964844,
"learning_rate": 4.348666408967917e-06,
"loss": 0.4503,
"step": 1750
},
{
"epoch": 15.17278617710583,
"grad_norm": 13.964532852172852,
"learning_rate": 4.3100115964437575e-06,
"loss": 0.2143,
"step": 1760
},
{
"epoch": 15.259179265658748,
"grad_norm": 82.6571273803711,
"learning_rate": 4.271356783919598e-06,
"loss": 0.5836,
"step": 1770
},
{
"epoch": 15.345572354211663,
"grad_norm": 38.710899353027344,
"learning_rate": 4.23270197139544e-06,
"loss": 0.2878,
"step": 1780
},
{
"epoch": 15.431965442764579,
"grad_norm": 60.6817626953125,
"learning_rate": 4.19404715887128e-06,
"loss": 0.4409,
"step": 1790
},
{
"epoch": 15.518358531317494,
"grad_norm": 43.522804260253906,
"learning_rate": 4.155392346347121e-06,
"loss": 0.7443,
"step": 1800
},
{
"epoch": 15.60475161987041,
"grad_norm": 39.97816848754883,
"learning_rate": 4.1167375338229615e-06,
"loss": 0.3902,
"step": 1810
},
{
"epoch": 15.691144708423327,
"grad_norm": 4.281501293182373,
"learning_rate": 4.078082721298802e-06,
"loss": 0.2109,
"step": 1820
},
{
"epoch": 15.777537796976242,
"grad_norm": 53.80859375,
"learning_rate": 4.039427908774643e-06,
"loss": 0.427,
"step": 1830
},
{
"epoch": 15.863930885529157,
"grad_norm": 43.7281494140625,
"learning_rate": 4.0007730962504834e-06,
"loss": 0.4247,
"step": 1840
},
{
"epoch": 15.950323974082073,
"grad_norm": 62.55876541137695,
"learning_rate": 3.962118283726324e-06,
"loss": 0.5326,
"step": 1850
},
{
"epoch": 16.0,
"eval_loss": 0.5283326506614685,
"eval_macro_f1": 0.8416865323027283,
"eval_runtime": 17.0638,
"eval_samples_per_second": 54.267,
"eval_steps_per_second": 6.798,
"step": 1856
},
{
"epoch": 16.034557235421165,
"grad_norm": 24.753618240356445,
"learning_rate": 3.9234634712021655e-06,
"loss": 0.2712,
"step": 1860
},
{
"epoch": 16.120950323974082,
"grad_norm": 81.78247833251953,
"learning_rate": 3.884808658678006e-06,
"loss": 0.372,
"step": 1870
},
{
"epoch": 16.207343412527,
"grad_norm": 43.68366241455078,
"learning_rate": 3.846153846153847e-06,
"loss": 0.3139,
"step": 1880
},
{
"epoch": 16.293736501079913,
"grad_norm": 83.01486206054688,
"learning_rate": 3.8074990336296874e-06,
"loss": 0.3837,
"step": 1890
},
{
"epoch": 16.38012958963283,
"grad_norm": 14.622414588928223,
"learning_rate": 3.768844221105528e-06,
"loss": 0.2072,
"step": 1900
},
{
"epoch": 16.466522678185743,
"grad_norm": 53.248390197753906,
"learning_rate": 3.730189408581369e-06,
"loss": 0.3522,
"step": 1910
},
{
"epoch": 16.55291576673866,
"grad_norm": 53.421539306640625,
"learning_rate": 3.6915345960572097e-06,
"loss": 0.2473,
"step": 1920
},
{
"epoch": 16.639308855291578,
"grad_norm": 53.37113571166992,
"learning_rate": 3.6528797835330504e-06,
"loss": 0.6206,
"step": 1930
},
{
"epoch": 16.72570194384449,
"grad_norm": 152.26011657714844,
"learning_rate": 3.614224971008891e-06,
"loss": 0.4216,
"step": 1940
},
{
"epoch": 16.81209503239741,
"grad_norm": 25.104888916015625,
"learning_rate": 3.575570158484732e-06,
"loss": 0.333,
"step": 1950
},
{
"epoch": 16.898488120950326,
"grad_norm": 50.0928955078125,
"learning_rate": 3.5369153459605727e-06,
"loss": 0.3633,
"step": 1960
},
{
"epoch": 16.98488120950324,
"grad_norm": 4.997694969177246,
"learning_rate": 3.4982605334364133e-06,
"loss": 0.3026,
"step": 1970
},
{
"epoch": 17.0,
"eval_loss": 0.4961493909358978,
"eval_macro_f1": 0.8517054282094797,
"eval_runtime": 16.6125,
"eval_samples_per_second": 55.741,
"eval_steps_per_second": 6.983,
"step": 1972
},
{
"epoch": 17.069114470842333,
"grad_norm": 19.699106216430664,
"learning_rate": 3.459605720912254e-06,
"loss": 0.1953,
"step": 1980
},
{
"epoch": 17.155507559395247,
"grad_norm": 39.93479919433594,
"learning_rate": 3.420950908388095e-06,
"loss": 0.3818,
"step": 1990
},
{
"epoch": 17.241900647948164,
"grad_norm": 31.303096771240234,
"learning_rate": 3.3822960958639356e-06,
"loss": 0.281,
"step": 2000
},
{
"epoch": 17.32829373650108,
"grad_norm": 15.34146785736084,
"learning_rate": 3.3436412833397762e-06,
"loss": 0.4427,
"step": 2010
},
{
"epoch": 17.414686825053995,
"grad_norm": 69.86293029785156,
"learning_rate": 3.304986470815617e-06,
"loss": 0.2839,
"step": 2020
},
{
"epoch": 17.50107991360691,
"grad_norm": 6.422046661376953,
"learning_rate": 3.266331658291458e-06,
"loss": 0.3117,
"step": 2030
},
{
"epoch": 17.58747300215983,
"grad_norm": 88.27076721191406,
"learning_rate": 3.2276768457672986e-06,
"loss": 0.2722,
"step": 2040
},
{
"epoch": 17.673866090712743,
"grad_norm": 26.126628875732422,
"learning_rate": 3.189022033243139e-06,
"loss": 0.375,
"step": 2050
},
{
"epoch": 17.76025917926566,
"grad_norm": 22.49889373779297,
"learning_rate": 3.1503672207189802e-06,
"loss": 0.4536,
"step": 2060
},
{
"epoch": 17.846652267818573,
"grad_norm": 137.3453826904297,
"learning_rate": 3.11171240819482e-06,
"loss": 0.591,
"step": 2070
},
{
"epoch": 17.93304535637149,
"grad_norm": 18.906539916992188,
"learning_rate": 3.073057595670661e-06,
"loss": 0.4155,
"step": 2080
},
{
"epoch": 18.0,
"eval_loss": 0.546431303024292,
"eval_macro_f1": 0.8590125396622391,
"eval_runtime": 17.1502,
"eval_samples_per_second": 53.993,
"eval_steps_per_second": 6.764,
"step": 2088
},
{
"epoch": 18.017278617710584,
"grad_norm": 20.98369789123535,
"learning_rate": 3.0344027831465017e-06,
"loss": 0.3026,
"step": 2090
},
{
"epoch": 18.103671706263498,
"grad_norm": 27.89241600036621,
"learning_rate": 2.9957479706223423e-06,
"loss": 0.2171,
"step": 2100
},
{
"epoch": 18.190064794816415,
"grad_norm": 61.913612365722656,
"learning_rate": 2.9570931580981834e-06,
"loss": 0.4255,
"step": 2110
},
{
"epoch": 18.276457883369332,
"grad_norm": 917.5242309570312,
"learning_rate": 2.918438345574024e-06,
"loss": 0.1092,
"step": 2120
},
{
"epoch": 18.362850971922246,
"grad_norm": 18.6544189453125,
"learning_rate": 2.8797835330498646e-06,
"loss": 0.3306,
"step": 2130
},
{
"epoch": 18.449244060475163,
"grad_norm": 13.983207702636719,
"learning_rate": 2.8411287205257053e-06,
"loss": 0.3184,
"step": 2140
},
{
"epoch": 18.535637149028076,
"grad_norm": 19.48896026611328,
"learning_rate": 2.8024739080015463e-06,
"loss": 0.1137,
"step": 2150
},
{
"epoch": 18.622030237580994,
"grad_norm": 56.50817108154297,
"learning_rate": 2.763819095477387e-06,
"loss": 0.3258,
"step": 2160
},
{
"epoch": 18.70842332613391,
"grad_norm": 54.56320571899414,
"learning_rate": 2.7251642829532276e-06,
"loss": 0.2,
"step": 2170
},
{
"epoch": 18.794816414686824,
"grad_norm": 129.62660217285156,
"learning_rate": 2.6865094704290682e-06,
"loss": 0.3135,
"step": 2180
},
{
"epoch": 18.88120950323974,
"grad_norm": 107.59940338134766,
"learning_rate": 2.6478546579049093e-06,
"loss": 0.3521,
"step": 2190
},
{
"epoch": 18.967602591792655,
"grad_norm": 148.64234924316406,
"learning_rate": 2.60919984538075e-06,
"loss": 0.2763,
"step": 2200
},
{
"epoch": 19.0,
"eval_loss": 0.5828408598899841,
"eval_macro_f1": 0.8685469279334771,
"eval_runtime": 17.8662,
"eval_samples_per_second": 51.83,
"eval_steps_per_second": 6.493,
"step": 2204
},
{
"epoch": 19.05183585313175,
"grad_norm": 104.04680633544922,
"learning_rate": 2.5705450328565905e-06,
"loss": 0.4048,
"step": 2210
},
{
"epoch": 19.138228941684666,
"grad_norm": 123.32837677001953,
"learning_rate": 2.5318902203324316e-06,
"loss": 0.5209,
"step": 2220
},
{
"epoch": 19.22462203023758,
"grad_norm": 46.73143005371094,
"learning_rate": 2.493235407808272e-06,
"loss": 0.5217,
"step": 2230
},
{
"epoch": 19.311015118790497,
"grad_norm": 8.207721710205078,
"learning_rate": 2.4545805952841133e-06,
"loss": 0.3255,
"step": 2240
},
{
"epoch": 19.397408207343414,
"grad_norm": 63.93031311035156,
"learning_rate": 2.415925782759954e-06,
"loss": 0.2312,
"step": 2250
},
{
"epoch": 19.483801295896328,
"grad_norm": 43.651065826416016,
"learning_rate": 2.3772709702357945e-06,
"loss": 0.2947,
"step": 2260
},
{
"epoch": 19.570194384449245,
"grad_norm": 5.546581745147705,
"learning_rate": 2.338616157711635e-06,
"loss": 0.2463,
"step": 2270
},
{
"epoch": 19.65658747300216,
"grad_norm": 63.26512145996094,
"learning_rate": 2.299961345187476e-06,
"loss": 0.4906,
"step": 2280
},
{
"epoch": 19.742980561555076,
"grad_norm": 27.615131378173828,
"learning_rate": 2.261306532663317e-06,
"loss": 0.3544,
"step": 2290
},
{
"epoch": 19.829373650107993,
"grad_norm": 42.514869689941406,
"learning_rate": 2.2226517201391575e-06,
"loss": 0.2251,
"step": 2300
},
{
"epoch": 19.915766738660906,
"grad_norm": 47.34213638305664,
"learning_rate": 2.1839969076149985e-06,
"loss": 0.222,
"step": 2310
},
{
"epoch": 20.0,
"grad_norm": 103.04148864746094,
"learning_rate": 2.145342095090839e-06,
"loss": 0.1733,
"step": 2320
},
{
"epoch": 20.0,
"eval_loss": 0.6083136200904846,
"eval_macro_f1": 0.8617720910645434,
"eval_runtime": 16.82,
"eval_samples_per_second": 55.054,
"eval_steps_per_second": 6.897,
"step": 2320
},
{
"epoch": 20.086393088552917,
"grad_norm": 11.250152587890625,
"learning_rate": 2.1066872825666798e-06,
"loss": 0.2448,
"step": 2330
},
{
"epoch": 20.17278617710583,
"grad_norm": 128.7437744140625,
"learning_rate": 2.0680324700425204e-06,
"loss": 0.2663,
"step": 2340
},
{
"epoch": 20.259179265658748,
"grad_norm": 47.49317932128906,
"learning_rate": 2.0293776575183615e-06,
"loss": 0.2796,
"step": 2350
},
{
"epoch": 20.34557235421166,
"grad_norm": 49.71406555175781,
"learning_rate": 1.990722844994202e-06,
"loss": 0.3162,
"step": 2360
},
{
"epoch": 20.43196544276458,
"grad_norm": 72.845458984375,
"learning_rate": 1.9520680324700427e-06,
"loss": 0.3388,
"step": 2370
},
{
"epoch": 20.518358531317496,
"grad_norm": 45.771018981933594,
"learning_rate": 1.9134132199458833e-06,
"loss": 0.3733,
"step": 2380
},
{
"epoch": 20.60475161987041,
"grad_norm": 15.408432960510254,
"learning_rate": 1.8747584074217242e-06,
"loss": 0.3589,
"step": 2390
},
{
"epoch": 20.691144708423327,
"grad_norm": 53.17451095581055,
"learning_rate": 1.836103594897565e-06,
"loss": 0.2963,
"step": 2400
},
{
"epoch": 20.77753779697624,
"grad_norm": 118.89311218261719,
"learning_rate": 1.7974487823734057e-06,
"loss": 0.2782,
"step": 2410
},
{
"epoch": 20.863930885529157,
"grad_norm": 42.53224182128906,
"learning_rate": 1.7587939698492465e-06,
"loss": 0.2777,
"step": 2420
},
{
"epoch": 20.950323974082075,
"grad_norm": 453.1598205566406,
"learning_rate": 1.7201391573250873e-06,
"loss": 0.4015,
"step": 2430
},
{
"epoch": 21.0,
"eval_loss": 0.6018757820129395,
"eval_macro_f1": 0.8631354430151588,
"eval_runtime": 17.8334,
"eval_samples_per_second": 51.925,
"eval_steps_per_second": 6.505,
"step": 2436
},
{
"epoch": 21.034557235421165,
"grad_norm": 1.9662721157073975,
"learning_rate": 1.681484344800928e-06,
"loss": 0.1209,
"step": 2440
},
{
"epoch": 21.120950323974082,
"grad_norm": 42.46015167236328,
"learning_rate": 1.6428295322767688e-06,
"loss": 0.278,
"step": 2450
},
{
"epoch": 21.207343412527,
"grad_norm": 9.549817085266113,
"learning_rate": 1.6041747197526094e-06,
"loss": 0.1866,
"step": 2460
},
{
"epoch": 21.293736501079913,
"grad_norm": 3.735248565673828,
"learning_rate": 1.5655199072284503e-06,
"loss": 0.1963,
"step": 2470
},
{
"epoch": 21.38012958963283,
"grad_norm": 16.802431106567383,
"learning_rate": 1.5268650947042907e-06,
"loss": 0.2157,
"step": 2480
},
{
"epoch": 21.466522678185743,
"grad_norm": 19.582170486450195,
"learning_rate": 1.4882102821801313e-06,
"loss": 0.3382,
"step": 2490
},
{
"epoch": 21.55291576673866,
"grad_norm": 0.9627342820167542,
"learning_rate": 1.4495554696559722e-06,
"loss": 0.307,
"step": 2500
},
{
"epoch": 21.639308855291578,
"grad_norm": 41.96005630493164,
"learning_rate": 1.410900657131813e-06,
"loss": 0.2993,
"step": 2510
},
{
"epoch": 21.72570194384449,
"grad_norm": 81.29263305664062,
"learning_rate": 1.3722458446076536e-06,
"loss": 0.3624,
"step": 2520
},
{
"epoch": 21.81209503239741,
"grad_norm": 1.3688504695892334,
"learning_rate": 1.3335910320834945e-06,
"loss": 0.1005,
"step": 2530
},
{
"epoch": 21.898488120950326,
"grad_norm": 51.99349594116211,
"learning_rate": 1.2949362195593351e-06,
"loss": 0.3422,
"step": 2540
},
{
"epoch": 21.98488120950324,
"grad_norm": 10.257471084594727,
"learning_rate": 1.256281407035176e-06,
"loss": 0.2649,
"step": 2550
},
{
"epoch": 22.0,
"eval_loss": 0.6120893955230713,
"eval_macro_f1": 0.8630768559502398,
"eval_runtime": 17.5342,
"eval_samples_per_second": 52.811,
"eval_steps_per_second": 6.616,
"step": 2552
},
{
"epoch": 22.069114470842333,
"grad_norm": 31.507051467895508,
"learning_rate": 1.2176265945110168e-06,
"loss": 0.268,
"step": 2560
},
{
"epoch": 22.155507559395247,
"grad_norm": 91.0459976196289,
"learning_rate": 1.1789717819868574e-06,
"loss": 0.3532,
"step": 2570
},
{
"epoch": 22.241900647948164,
"grad_norm": 47.774391174316406,
"learning_rate": 1.1403169694626983e-06,
"loss": 0.2976,
"step": 2580
},
{
"epoch": 22.32829373650108,
"grad_norm": 49.88670349121094,
"learning_rate": 1.1016621569385389e-06,
"loss": 0.065,
"step": 2590
},
{
"epoch": 22.414686825053995,
"grad_norm": 187.57847595214844,
"learning_rate": 1.0630073444143797e-06,
"loss": 0.3762,
"step": 2600
},
{
"epoch": 22.50107991360691,
"grad_norm": 281.0491638183594,
"learning_rate": 1.0243525318902204e-06,
"loss": 0.2553,
"step": 2610
},
{
"epoch": 22.58747300215983,
"grad_norm": 8.582008361816406,
"learning_rate": 9.856977193660612e-07,
"loss": 0.2701,
"step": 2620
},
{
"epoch": 22.673866090712743,
"grad_norm": 40.54225540161133,
"learning_rate": 9.470429068419019e-07,
"loss": 0.2672,
"step": 2630
},
{
"epoch": 22.76025917926566,
"grad_norm": 45.52231216430664,
"learning_rate": 9.083880943177427e-07,
"loss": 0.3036,
"step": 2640
},
{
"epoch": 22.846652267818573,
"grad_norm": 108.5732421875,
"learning_rate": 8.697332817935834e-07,
"loss": 0.3182,
"step": 2650
},
{
"epoch": 22.93304535637149,
"grad_norm": 29.52652931213379,
"learning_rate": 8.310784692694241e-07,
"loss": 0.1196,
"step": 2660
},
{
"epoch": 23.0,
"eval_loss": 0.6888664960861206,
"eval_macro_f1": 0.8538403632743256,
"eval_runtime": 17.2141,
"eval_samples_per_second": 53.793,
"eval_steps_per_second": 6.739,
"step": 2668
},
{
"epoch": 23.017278617710584,
"grad_norm": 68.86863708496094,
"learning_rate": 7.924236567452649e-07,
"loss": 0.2634,
"step": 2670
},
{
"epoch": 23.103671706263498,
"grad_norm": 58.899391174316406,
"learning_rate": 7.537688442211055e-07,
"loss": 0.3309,
"step": 2680
},
{
"epoch": 23.190064794816415,
"grad_norm": 71.05393981933594,
"learning_rate": 7.151140316969462e-07,
"loss": 0.2153,
"step": 2690
},
{
"epoch": 23.276457883369332,
"grad_norm": 2.041644334793091,
"learning_rate": 6.76459219172787e-07,
"loss": 0.3616,
"step": 2700
},
{
"epoch": 23.362850971922246,
"grad_norm": 82.25381469726562,
"learning_rate": 6.378044066486277e-07,
"loss": 0.2538,
"step": 2710
},
{
"epoch": 23.449244060475163,
"grad_norm": 8.824419021606445,
"learning_rate": 5.991495941244686e-07,
"loss": 0.0927,
"step": 2720
},
{
"epoch": 23.535637149028076,
"grad_norm": 17.117977142333984,
"learning_rate": 5.604947816003093e-07,
"loss": 0.1538,
"step": 2730
},
{
"epoch": 23.622030237580994,
"grad_norm": 1.6735382080078125,
"learning_rate": 5.2183996907615e-07,
"loss": 0.1548,
"step": 2740
},
{
"epoch": 23.70842332613391,
"grad_norm": 6.845789432525635,
"learning_rate": 4.831851565519908e-07,
"loss": 0.1832,
"step": 2750
},
{
"epoch": 23.794816414686824,
"grad_norm": 19.85407066345215,
"learning_rate": 4.4453034402783155e-07,
"loss": 0.2808,
"step": 2760
},
{
"epoch": 23.88120950323974,
"grad_norm": 5.298573017120361,
"learning_rate": 4.058755315036723e-07,
"loss": 0.2098,
"step": 2770
},
{
"epoch": 23.967602591792655,
"grad_norm": 0.8520543575286865,
"learning_rate": 3.6722071897951296e-07,
"loss": 0.0521,
"step": 2780
},
{
"epoch": 24.0,
"eval_loss": 0.6725981831550598,
"eval_macro_f1": 0.8610719903206292,
"eval_runtime": 16.9666,
"eval_samples_per_second": 54.578,
"eval_steps_per_second": 6.837,
"step": 2784
},
{
"epoch": 24.05183585313175,
"grad_norm": 1.0969666242599487,
"learning_rate": 3.285659064553537e-07,
"loss": 0.1449,
"step": 2790
},
{
"epoch": 24.138228941684666,
"grad_norm": 4.448155403137207,
"learning_rate": 2.899110939311945e-07,
"loss": 0.1756,
"step": 2800
},
{
"epoch": 24.22462203023758,
"grad_norm": 2.900632381439209,
"learning_rate": 2.512562814070352e-07,
"loss": 0.1385,
"step": 2810
},
{
"epoch": 24.311015118790497,
"grad_norm": 24.39512062072754,
"learning_rate": 2.1260146888287596e-07,
"loss": 0.2804,
"step": 2820
},
{
"epoch": 24.397408207343414,
"grad_norm": 0.170551598072052,
"learning_rate": 1.7394665635871667e-07,
"loss": 0.3555,
"step": 2830
},
{
"epoch": 24.483801295896328,
"grad_norm": 0.42594221234321594,
"learning_rate": 1.3529184383455743e-07,
"loss": 0.1433,
"step": 2840
},
{
"epoch": 24.570194384449245,
"grad_norm": 26.60655975341797,
"learning_rate": 9.663703131039815e-08,
"loss": 0.3081,
"step": 2850
},
{
"epoch": 24.65658747300216,
"grad_norm": 113.71473693847656,
"learning_rate": 5.79822187862389e-08,
"loss": 0.4043,
"step": 2860
},
{
"epoch": 24.742980561555076,
"grad_norm": 34.953372955322266,
"learning_rate": 1.9327406262079632e-08,
"loss": 0.1929,
"step": 2870
},
{
"epoch": 24.786177105831534,
"eval_loss": 0.6091281175613403,
"eval_macro_f1": 0.8691893001382458,
"eval_runtime": 16.5742,
"eval_samples_per_second": 55.87,
"eval_steps_per_second": 6.999,
"step": 2875
}
],
"logging_steps": 10,
"max_steps": 2875,
"num_input_tokens_seen": 0,
"num_train_epochs": 25,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.2955292249139184e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
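
A minimal sketch of how this trainer state can be inspected, assuming the JSON above is saved locally as "trainer_state.json" (the path is illustrative). It filters the per-epoch evaluation records out of "log_history" and prints the best checkpoint that the Trainer recorded; it is an example reader, not part of the training pipeline itself.

# Sketch: summarize the eval history in trainer_state.json.
# Assumes the file is saved locally as "trainer_state.json" (illustrative path).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the evaluation records (they carry an "eval_macro_f1" key);
# the interleaved training records only have loss/grad_norm/learning_rate.
evals = [e for e in state["log_history"] if "eval_macro_f1" in e]

for e in evals:
    print(f"epoch {e['epoch']:>6.2f}  step {e['step']:>5}  "
          f"eval_loss {e['eval_loss']:.4f}  macro_f1 {e['eval_macro_f1']:.4f}")

# Best checkpoint recorded by the Trainer (macro F1 ~= 0.8692 at step 2875).
print("best_metric:", state["best_metric"])
print("best_model_checkpoint:", state["best_model_checkpoint"])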