sidewalk-validator-ai-nocurbramp / trainer_state.json
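The JSON below is the trainer_state.json written by the Hugging Face Transformers Trainer during fine-tuning: step-level loss / grad_norm / learning_rate logs in log_history, plus one eval_* entry per epoch and the best-checkpoint bookkeeping at the top. A minimal sketch for inspecting it offline, assuming only a local copy of this file and the standard library (file name and variable names are illustrative):

import json

# Minimal sketch (assumed local copy of this file): pull the per-epoch
# validation metrics out of log_history and report the best checkpoint.
with open("trainer_state.json") as f:
    state = json.load(f)

# Epoch-end entries carry eval_* keys; step-level entries carry loss/grad_norm.
for entry in state["log_history"]:
    if "eval_macro_f1" in entry:
        print(f"epoch {entry['epoch']:>5.1f}  "
              f"eval_loss {entry['eval_loss']:.4f}  "
              f"macro_f1 {entry['eval_macro_f1']:.4f}")

print("best_metric:", state["best_metric"])                      # 0.8567...
print("best_model_checkpoint:", state["best_model_checkpoint"])  # ...checkpoint-4598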
{
"best_metric": 0.8567568925713644,
"best_model_checkpoint": "NoCurbRamp/dinov2/checkpoint-4598",
"epoch": 22.0,
"eval_steps": 500,
"global_step": 4598,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04790419161676647,
"grad_norm": 102.78497314453125,
"learning_rate": 1.9230769230769234e-07,
"loss": 3.3285,
"step": 10
},
{
"epoch": 0.09580838323353294,
"grad_norm": 108.06490325927734,
"learning_rate": 3.846153846153847e-07,
"loss": 3.3413,
"step": 20
},
{
"epoch": 0.1437125748502994,
"grad_norm": 168.06533813476562,
"learning_rate": 5.76923076923077e-07,
"loss": 3.1682,
"step": 30
},
{
"epoch": 0.19161676646706588,
"grad_norm": 107.77788543701172,
"learning_rate": 7.692307692307694e-07,
"loss": 2.9011,
"step": 40
},
{
"epoch": 0.23952095808383234,
"grad_norm": 73.95760345458984,
"learning_rate": 9.615384615384617e-07,
"loss": 2.5857,
"step": 50
},
{
"epoch": 0.2874251497005988,
"grad_norm": 105.05018615722656,
"learning_rate": 1.153846153846154e-06,
"loss": 2.5002,
"step": 60
},
{
"epoch": 0.33532934131736525,
"grad_norm": 674.4759521484375,
"learning_rate": 1.3461538461538462e-06,
"loss": 2.4689,
"step": 70
},
{
"epoch": 0.38323353293413176,
"grad_norm": 141.63890075683594,
"learning_rate": 1.5384615384615387e-06,
"loss": 2.3887,
"step": 80
},
{
"epoch": 0.4311377245508982,
"grad_norm": 160.6332550048828,
"learning_rate": 1.7307692307692308e-06,
"loss": 2.2373,
"step": 90
},
{
"epoch": 0.47904191616766467,
"grad_norm": 702.2935180664062,
"learning_rate": 1.9230769230769234e-06,
"loss": 2.1801,
"step": 100
},
{
"epoch": 0.5269461077844312,
"grad_norm": 86.12798309326172,
"learning_rate": 2.1153846153846155e-06,
"loss": 2.0948,
"step": 110
},
{
"epoch": 0.5748502994011976,
"grad_norm": 112.54278564453125,
"learning_rate": 2.307692307692308e-06,
"loss": 2.0035,
"step": 120
},
{
"epoch": 0.6227544910179641,
"grad_norm": 187.13577270507812,
"learning_rate": 2.5e-06,
"loss": 2.13,
"step": 130
},
{
"epoch": 0.6706586826347305,
"grad_norm": 74.73124694824219,
"learning_rate": 2.6923076923076923e-06,
"loss": 2.0507,
"step": 140
},
{
"epoch": 0.718562874251497,
"grad_norm": 111.90137481689453,
"learning_rate": 2.8846153846153845e-06,
"loss": 1.9312,
"step": 150
},
{
"epoch": 0.7664670658682635,
"grad_norm": 162.74754333496094,
"learning_rate": 3.0769230769230774e-06,
"loss": 2.3011,
"step": 160
},
{
"epoch": 0.8143712574850299,
"grad_norm": 104.16117095947266,
"learning_rate": 3.2692307692307696e-06,
"loss": 1.8005,
"step": 170
},
{
"epoch": 0.8622754491017964,
"grad_norm": 97.45256805419922,
"learning_rate": 3.4615384615384617e-06,
"loss": 1.8074,
"step": 180
},
{
"epoch": 0.9101796407185628,
"grad_norm": 253.20359802246094,
"learning_rate": 3.653846153846154e-06,
"loss": 1.985,
"step": 190
},
{
"epoch": 0.9580838323353293,
"grad_norm": 157.43246459960938,
"learning_rate": 3.846153846153847e-06,
"loss": 1.7699,
"step": 200
},
{
"epoch": 1.0,
"eval_loss": 0.4299589693546295,
"eval_macro_f1": 0.7858166164735507,
"eval_runtime": 81.1893,
"eval_samples_per_second": 20.582,
"eval_steps_per_second": 2.574,
"step": 209
},
{
"epoch": 1.0047904191616766,
"grad_norm": 79.56900787353516,
"learning_rate": 4.0384615384615385e-06,
"loss": 2.0621,
"step": 210
},
{
"epoch": 1.0526946107784432,
"grad_norm": 80.78365325927734,
"learning_rate": 4.230769230769231e-06,
"loss": 1.9002,
"step": 220
},
{
"epoch": 1.1005988023952096,
"grad_norm": 131.77723693847656,
"learning_rate": 4.423076923076924e-06,
"loss": 1.8801,
"step": 230
},
{
"epoch": 1.148502994011976,
"grad_norm": 59.51331329345703,
"learning_rate": 4.615384615384616e-06,
"loss": 1.6218,
"step": 240
},
{
"epoch": 1.1964071856287426,
"grad_norm": 59.175621032714844,
"learning_rate": 4.807692307692308e-06,
"loss": 1.6202,
"step": 250
},
{
"epoch": 1.244311377245509,
"grad_norm": 175.007568359375,
"learning_rate": 5e-06,
"loss": 1.5911,
"step": 260
},
{
"epoch": 1.2922155688622754,
"grad_norm": 112.08436584472656,
"learning_rate": 5.192307692307693e-06,
"loss": 1.8472,
"step": 270
},
{
"epoch": 1.340119760479042,
"grad_norm": 49.89094924926758,
"learning_rate": 5.384615384615385e-06,
"loss": 1.7157,
"step": 280
},
{
"epoch": 1.3880239520958084,
"grad_norm": 105.93860626220703,
"learning_rate": 5.576923076923077e-06,
"loss": 1.7682,
"step": 290
},
{
"epoch": 1.4359281437125748,
"grad_norm": 73.6728515625,
"learning_rate": 5.769230769230769e-06,
"loss": 1.6694,
"step": 300
},
{
"epoch": 1.4838323353293412,
"grad_norm": 88.21549224853516,
"learning_rate": 5.961538461538462e-06,
"loss": 1.437,
"step": 310
},
{
"epoch": 1.5317365269461076,
"grad_norm": 95.41169738769531,
"learning_rate": 6.153846153846155e-06,
"loss": 1.6035,
"step": 320
},
{
"epoch": 1.5796407185628742,
"grad_norm": 55.19960021972656,
"learning_rate": 6.3461538461538466e-06,
"loss": 1.6455,
"step": 330
},
{
"epoch": 1.6275449101796409,
"grad_norm": 57.696773529052734,
"learning_rate": 6.538461538461539e-06,
"loss": 1.7315,
"step": 340
},
{
"epoch": 1.6754491017964073,
"grad_norm": 241.8914337158203,
"learning_rate": 6.730769230769232e-06,
"loss": 1.8734,
"step": 350
},
{
"epoch": 1.7233532934131737,
"grad_norm": 129.99835205078125,
"learning_rate": 6.923076923076923e-06,
"loss": 1.7628,
"step": 360
},
{
"epoch": 1.77125748502994,
"grad_norm": 125.50336456298828,
"learning_rate": 7.115384615384616e-06,
"loss": 1.5134,
"step": 370
},
{
"epoch": 1.8191616766467065,
"grad_norm": 66.20431518554688,
"learning_rate": 7.307692307692308e-06,
"loss": 1.7175,
"step": 380
},
{
"epoch": 1.867065868263473,
"grad_norm": 45.693199157714844,
"learning_rate": 7.500000000000001e-06,
"loss": 1.5907,
"step": 390
},
{
"epoch": 1.9149700598802395,
"grad_norm": 57.71452713012695,
"learning_rate": 7.692307692307694e-06,
"loss": 1.6598,
"step": 400
},
{
"epoch": 1.9628742514970061,
"grad_norm": 87.35894012451172,
"learning_rate": 7.884615384615384e-06,
"loss": 1.6557,
"step": 410
},
{
"epoch": 2.0,
"eval_loss": 0.41893911361694336,
"eval_macro_f1": 0.8138112665329834,
"eval_runtime": 41.195,
"eval_samples_per_second": 40.563,
"eval_steps_per_second": 5.073,
"step": 418
},
{
"epoch": 2.009580838323353,
"grad_norm": 71.44452667236328,
"learning_rate": 8.076923076923077e-06,
"loss": 1.7133,
"step": 420
},
{
"epoch": 2.05748502994012,
"grad_norm": 53.76054000854492,
"learning_rate": 8.26923076923077e-06,
"loss": 1.648,
"step": 430
},
{
"epoch": 2.1053892215568863,
"grad_norm": 1027.304931640625,
"learning_rate": 8.461538461538462e-06,
"loss": 1.7138,
"step": 440
},
{
"epoch": 2.1532934131736527,
"grad_norm": 226.78587341308594,
"learning_rate": 8.653846153846155e-06,
"loss": 1.589,
"step": 450
},
{
"epoch": 2.201197604790419,
"grad_norm": 83.36427307128906,
"learning_rate": 8.846153846153847e-06,
"loss": 1.4567,
"step": 460
},
{
"epoch": 2.2491017964071855,
"grad_norm": 56.77524185180664,
"learning_rate": 9.03846153846154e-06,
"loss": 1.6047,
"step": 470
},
{
"epoch": 2.297005988023952,
"grad_norm": 108.68399047851562,
"learning_rate": 9.230769230769232e-06,
"loss": 1.9924,
"step": 480
},
{
"epoch": 2.344910179640719,
"grad_norm": 82.6905288696289,
"learning_rate": 9.423076923076923e-06,
"loss": 2.0933,
"step": 490
},
{
"epoch": 2.392814371257485,
"grad_norm": 29.161964416503906,
"learning_rate": 9.615384615384616e-06,
"loss": 1.605,
"step": 500
},
{
"epoch": 2.4407185628742516,
"grad_norm": 37.83173370361328,
"learning_rate": 9.807692307692308e-06,
"loss": 1.7725,
"step": 510
},
{
"epoch": 2.488622754491018,
"grad_norm": 49.54322052001953,
"learning_rate": 1e-05,
"loss": 1.4123,
"step": 520
},
{
"epoch": 2.5365269461077844,
"grad_norm": 67.74500274658203,
"learning_rate": 9.97863247863248e-06,
"loss": 1.5403,
"step": 530
},
{
"epoch": 2.584431137724551,
"grad_norm": 55.931190490722656,
"learning_rate": 9.957264957264958e-06,
"loss": 1.4283,
"step": 540
},
{
"epoch": 2.632335329341317,
"grad_norm": 47.531734466552734,
"learning_rate": 9.935897435897437e-06,
"loss": 1.4491,
"step": 550
},
{
"epoch": 2.680239520958084,
"grad_norm": 30.019596099853516,
"learning_rate": 9.914529914529915e-06,
"loss": 1.4842,
"step": 560
},
{
"epoch": 2.7281437125748504,
"grad_norm": 67.44615173339844,
"learning_rate": 9.893162393162394e-06,
"loss": 1.6565,
"step": 570
},
{
"epoch": 2.776047904191617,
"grad_norm": 54.778350830078125,
"learning_rate": 9.871794871794872e-06,
"loss": 1.4563,
"step": 580
},
{
"epoch": 2.8239520958083832,
"grad_norm": 54.04096221923828,
"learning_rate": 9.850427350427351e-06,
"loss": 1.5794,
"step": 590
},
{
"epoch": 2.8718562874251496,
"grad_norm": 35.025691986083984,
"learning_rate": 9.82905982905983e-06,
"loss": 1.475,
"step": 600
},
{
"epoch": 2.919760479041916,
"grad_norm": 94.27930450439453,
"learning_rate": 9.807692307692308e-06,
"loss": 1.7062,
"step": 610
},
{
"epoch": 2.9676646706586824,
"grad_norm": 83.62496185302734,
"learning_rate": 9.786324786324787e-06,
"loss": 1.3654,
"step": 620
},
{
"epoch": 3.0,
"eval_loss": 0.4371185898780823,
"eval_macro_f1": 0.7854326010045019,
"eval_runtime": 41.3851,
"eval_samples_per_second": 40.377,
"eval_steps_per_second": 5.05,
"step": 627
},
{
"epoch": 3.01437125748503,
"grad_norm": 270.2382507324219,
"learning_rate": 9.764957264957265e-06,
"loss": 1.4214,
"step": 630
},
{
"epoch": 3.0622754491017963,
"grad_norm": 39.00271224975586,
"learning_rate": 9.743589743589744e-06,
"loss": 1.4744,
"step": 640
},
{
"epoch": 3.1101796407185627,
"grad_norm": 32.14289474487305,
"learning_rate": 9.722222222222223e-06,
"loss": 1.6232,
"step": 650
},
{
"epoch": 3.1580838323353295,
"grad_norm": 39.31702423095703,
"learning_rate": 9.700854700854701e-06,
"loss": 1.4988,
"step": 660
},
{
"epoch": 3.205988023952096,
"grad_norm": 65.2395248413086,
"learning_rate": 9.67948717948718e-06,
"loss": 1.26,
"step": 670
},
{
"epoch": 3.2538922155688623,
"grad_norm": 57.717525482177734,
"learning_rate": 9.658119658119659e-06,
"loss": 1.331,
"step": 680
},
{
"epoch": 3.3017964071856287,
"grad_norm": 69.02164459228516,
"learning_rate": 9.636752136752137e-06,
"loss": 1.3799,
"step": 690
},
{
"epoch": 3.349700598802395,
"grad_norm": 60.07762145996094,
"learning_rate": 9.615384615384616e-06,
"loss": 1.6609,
"step": 700
},
{
"epoch": 3.3976047904191615,
"grad_norm": 44.97191619873047,
"learning_rate": 9.594017094017094e-06,
"loss": 1.4233,
"step": 710
},
{
"epoch": 3.4455089820359284,
"grad_norm": 65.7463150024414,
"learning_rate": 9.572649572649575e-06,
"loss": 1.3724,
"step": 720
},
{
"epoch": 3.4934131736526948,
"grad_norm": 57.15474319458008,
"learning_rate": 9.551282051282053e-06,
"loss": 1.6036,
"step": 730
},
{
"epoch": 3.541317365269461,
"grad_norm": 64.17628479003906,
"learning_rate": 9.52991452991453e-06,
"loss": 1.3152,
"step": 740
},
{
"epoch": 3.5892215568862276,
"grad_norm": 69.88690948486328,
"learning_rate": 9.508547008547009e-06,
"loss": 1.4094,
"step": 750
},
{
"epoch": 3.637125748502994,
"grad_norm": 98.0072021484375,
"learning_rate": 9.487179487179487e-06,
"loss": 1.441,
"step": 760
},
{
"epoch": 3.6850299401197604,
"grad_norm": 47.96156692504883,
"learning_rate": 9.465811965811966e-06,
"loss": 1.7361,
"step": 770
},
{
"epoch": 3.7329341317365268,
"grad_norm": 52.17365646362305,
"learning_rate": 9.444444444444445e-06,
"loss": 1.1775,
"step": 780
},
{
"epoch": 3.7808383233532936,
"grad_norm": 552.5225219726562,
"learning_rate": 9.423076923076923e-06,
"loss": 1.2409,
"step": 790
},
{
"epoch": 3.82874251497006,
"grad_norm": 34.344207763671875,
"learning_rate": 9.401709401709402e-06,
"loss": 1.4177,
"step": 800
},
{
"epoch": 3.8766467065868264,
"grad_norm": 112.69720458984375,
"learning_rate": 9.38034188034188e-06,
"loss": 1.4948,
"step": 810
},
{
"epoch": 3.924550898203593,
"grad_norm": 94.27716064453125,
"learning_rate": 9.358974358974359e-06,
"loss": 1.5927,
"step": 820
},
{
"epoch": 3.972455089820359,
"grad_norm": 48.06044387817383,
"learning_rate": 9.33760683760684e-06,
"loss": 1.4711,
"step": 830
},
{
"epoch": 4.0,
"eval_loss": 0.39647647738456726,
"eval_macro_f1": 0.8267623951048628,
"eval_runtime": 44.3505,
"eval_samples_per_second": 37.677,
"eval_steps_per_second": 4.712,
"step": 836
},
{
"epoch": 4.019161676646706,
"grad_norm": 130.36834716796875,
"learning_rate": 9.316239316239318e-06,
"loss": 1.8227,
"step": 840
},
{
"epoch": 4.067065868263473,
"grad_norm": 52.67673110961914,
"learning_rate": 9.294871794871796e-06,
"loss": 1.2007,
"step": 850
},
{
"epoch": 4.11497005988024,
"grad_norm": 38.394187927246094,
"learning_rate": 9.273504273504275e-06,
"loss": 1.2584,
"step": 860
},
{
"epoch": 4.162874251497006,
"grad_norm": 293.8218994140625,
"learning_rate": 9.252136752136754e-06,
"loss": 1.1846,
"step": 870
},
{
"epoch": 4.210778443113773,
"grad_norm": 85.88053894042969,
"learning_rate": 9.230769230769232e-06,
"loss": 1.3184,
"step": 880
},
{
"epoch": 4.258682634730539,
"grad_norm": 45.364723205566406,
"learning_rate": 9.20940170940171e-06,
"loss": 1.2298,
"step": 890
},
{
"epoch": 4.3065868263473055,
"grad_norm": 43.60586929321289,
"learning_rate": 9.188034188034188e-06,
"loss": 1.1389,
"step": 900
},
{
"epoch": 4.3544910179640715,
"grad_norm": 85.17373657226562,
"learning_rate": 9.166666666666666e-06,
"loss": 1.1531,
"step": 910
},
{
"epoch": 4.402395209580838,
"grad_norm": 38.943946838378906,
"learning_rate": 9.145299145299145e-06,
"loss": 1.2874,
"step": 920
},
{
"epoch": 4.450299401197605,
"grad_norm": 29.44234275817871,
"learning_rate": 9.123931623931624e-06,
"loss": 1.3225,
"step": 930
},
{
"epoch": 4.498203592814371,
"grad_norm": 119.2593002319336,
"learning_rate": 9.102564102564104e-06,
"loss": 1.5807,
"step": 940
},
{
"epoch": 4.546107784431138,
"grad_norm": 30.237037658691406,
"learning_rate": 9.081196581196583e-06,
"loss": 0.9587,
"step": 950
},
{
"epoch": 4.594011976047904,
"grad_norm": 100.8051986694336,
"learning_rate": 9.059829059829061e-06,
"loss": 1.1353,
"step": 960
},
{
"epoch": 4.641916167664671,
"grad_norm": 64.73004150390625,
"learning_rate": 9.03846153846154e-06,
"loss": 1.271,
"step": 970
},
{
"epoch": 4.689820359281438,
"grad_norm": 55.64018249511719,
"learning_rate": 9.017094017094018e-06,
"loss": 1.2463,
"step": 980
},
{
"epoch": 4.7377245508982035,
"grad_norm": 32.8985710144043,
"learning_rate": 8.995726495726497e-06,
"loss": 1.2084,
"step": 990
},
{
"epoch": 4.78562874251497,
"grad_norm": 71.72270202636719,
"learning_rate": 8.974358974358976e-06,
"loss": 0.9659,
"step": 1000
},
{
"epoch": 4.833532934131736,
"grad_norm": 60.666831970214844,
"learning_rate": 8.952991452991454e-06,
"loss": 1.037,
"step": 1010
},
{
"epoch": 4.881437125748503,
"grad_norm": 65.69544219970703,
"learning_rate": 8.931623931623933e-06,
"loss": 1.4125,
"step": 1020
},
{
"epoch": 4.929341317365269,
"grad_norm": 78.90387725830078,
"learning_rate": 8.910256410256411e-06,
"loss": 1.1816,
"step": 1030
},
{
"epoch": 4.977245508982036,
"grad_norm": 51.65810775756836,
"learning_rate": 8.888888888888888e-06,
"loss": 1.1849,
"step": 1040
},
{
"epoch": 5.0,
"eval_loss": 0.4067809581756592,
"eval_macro_f1": 0.838654650788542,
"eval_runtime": 53.3705,
"eval_samples_per_second": 31.309,
"eval_steps_per_second": 3.916,
"step": 1045
},
{
"epoch": 5.023952095808383,
"grad_norm": 61.138328552246094,
"learning_rate": 8.867521367521369e-06,
"loss": 1.1483,
"step": 1050
},
{
"epoch": 5.07185628742515,
"grad_norm": 61.026058197021484,
"learning_rate": 8.846153846153847e-06,
"loss": 0.7873,
"step": 1060
},
{
"epoch": 5.119760479041916,
"grad_norm": 100.50831604003906,
"learning_rate": 8.824786324786326e-06,
"loss": 0.9951,
"step": 1070
},
{
"epoch": 5.167664670658683,
"grad_norm": 70.29264068603516,
"learning_rate": 8.803418803418804e-06,
"loss": 1.1178,
"step": 1080
},
{
"epoch": 5.2155688622754495,
"grad_norm": 62.474613189697266,
"learning_rate": 8.782051282051283e-06,
"loss": 1.0041,
"step": 1090
},
{
"epoch": 5.263473053892215,
"grad_norm": 32.0721321105957,
"learning_rate": 8.760683760683762e-06,
"loss": 1.0825,
"step": 1100
},
{
"epoch": 5.311377245508982,
"grad_norm": 78.72416687011719,
"learning_rate": 8.73931623931624e-06,
"loss": 1.2565,
"step": 1110
},
{
"epoch": 5.359281437125748,
"grad_norm": 119.30426788330078,
"learning_rate": 8.717948717948719e-06,
"loss": 1.3718,
"step": 1120
},
{
"epoch": 5.407185628742515,
"grad_norm": 45.799991607666016,
"learning_rate": 8.696581196581197e-06,
"loss": 1.2321,
"step": 1130
},
{
"epoch": 5.455089820359281,
"grad_norm": 25.331722259521484,
"learning_rate": 8.675213675213676e-06,
"loss": 0.9521,
"step": 1140
},
{
"epoch": 5.502994011976048,
"grad_norm": 96.21393585205078,
"learning_rate": 8.653846153846155e-06,
"loss": 0.9443,
"step": 1150
},
{
"epoch": 5.550898203592815,
"grad_norm": 45.29869079589844,
"learning_rate": 8.632478632478633e-06,
"loss": 0.8862,
"step": 1160
},
{
"epoch": 5.598802395209581,
"grad_norm": 56.898780822753906,
"learning_rate": 8.611111111111112e-06,
"loss": 1.0529,
"step": 1170
},
{
"epoch": 5.6467065868263475,
"grad_norm": 63.27139663696289,
"learning_rate": 8.58974358974359e-06,
"loss": 1.0086,
"step": 1180
},
{
"epoch": 5.6946107784431135,
"grad_norm": 64.9346923828125,
"learning_rate": 8.568376068376069e-06,
"loss": 1.1777,
"step": 1190
},
{
"epoch": 5.74251497005988,
"grad_norm": 34.74249267578125,
"learning_rate": 8.547008547008548e-06,
"loss": 1.1921,
"step": 1200
},
{
"epoch": 5.790419161676647,
"grad_norm": 85.9405517578125,
"learning_rate": 8.525641025641026e-06,
"loss": 1.0274,
"step": 1210
},
{
"epoch": 5.838323353293413,
"grad_norm": 51.7358512878418,
"learning_rate": 8.504273504273505e-06,
"loss": 1.088,
"step": 1220
},
{
"epoch": 5.88622754491018,
"grad_norm": 43.08283615112305,
"learning_rate": 8.482905982905983e-06,
"loss": 0.8124,
"step": 1230
},
{
"epoch": 5.934131736526946,
"grad_norm": 90.1866455078125,
"learning_rate": 8.461538461538462e-06,
"loss": 1.0978,
"step": 1240
},
{
"epoch": 5.982035928143713,
"grad_norm": 31.511568069458008,
"learning_rate": 8.44017094017094e-06,
"loss": 1.2058,
"step": 1250
},
{
"epoch": 6.0,
"eval_loss": 0.40154314041137695,
"eval_macro_f1": 0.828210643143795,
"eval_runtime": 65.0826,
"eval_samples_per_second": 25.675,
"eval_steps_per_second": 3.211,
"step": 1254
},
{
"epoch": 6.02874251497006,
"grad_norm": 24.663558959960938,
"learning_rate": 8.41880341880342e-06,
"loss": 0.7289,
"step": 1260
},
{
"epoch": 6.076646706586827,
"grad_norm": 32.12767028808594,
"learning_rate": 8.397435897435898e-06,
"loss": 0.9549,
"step": 1270
},
{
"epoch": 6.1245508982035926,
"grad_norm": 45.44089126586914,
"learning_rate": 8.376068376068377e-06,
"loss": 0.8573,
"step": 1280
},
{
"epoch": 6.172455089820359,
"grad_norm": 95.96544647216797,
"learning_rate": 8.354700854700855e-06,
"loss": 0.7697,
"step": 1290
},
{
"epoch": 6.220359281437125,
"grad_norm": 88.79412841796875,
"learning_rate": 8.333333333333334e-06,
"loss": 0.9044,
"step": 1300
},
{
"epoch": 6.268263473053892,
"grad_norm": 57.126304626464844,
"learning_rate": 8.311965811965812e-06,
"loss": 0.7112,
"step": 1310
},
{
"epoch": 6.316167664670659,
"grad_norm": 76.69857788085938,
"learning_rate": 8.290598290598293e-06,
"loss": 0.7156,
"step": 1320
},
{
"epoch": 6.364071856287425,
"grad_norm": 54.02803039550781,
"learning_rate": 8.26923076923077e-06,
"loss": 0.8275,
"step": 1330
},
{
"epoch": 6.411976047904192,
"grad_norm": 60.05768966674805,
"learning_rate": 8.247863247863248e-06,
"loss": 0.8879,
"step": 1340
},
{
"epoch": 6.459880239520958,
"grad_norm": 80.0739517211914,
"learning_rate": 8.226495726495727e-06,
"loss": 0.9576,
"step": 1350
},
{
"epoch": 6.507784431137725,
"grad_norm": 44.03201675415039,
"learning_rate": 8.205128205128205e-06,
"loss": 0.996,
"step": 1360
},
{
"epoch": 6.5556886227544915,
"grad_norm": 54.739044189453125,
"learning_rate": 8.183760683760684e-06,
"loss": 0.8699,
"step": 1370
},
{
"epoch": 6.6035928143712574,
"grad_norm": 44.991119384765625,
"learning_rate": 8.162393162393163e-06,
"loss": 0.894,
"step": 1380
},
{
"epoch": 6.651497005988024,
"grad_norm": 58.03591537475586,
"learning_rate": 8.141025641025641e-06,
"loss": 0.9078,
"step": 1390
},
{
"epoch": 6.69940119760479,
"grad_norm": 51.95082473754883,
"learning_rate": 8.11965811965812e-06,
"loss": 0.8032,
"step": 1400
},
{
"epoch": 6.747305389221557,
"grad_norm": 47.06685256958008,
"learning_rate": 8.098290598290598e-06,
"loss": 0.8379,
"step": 1410
},
{
"epoch": 6.795209580838323,
"grad_norm": 32.85672378540039,
"learning_rate": 8.076923076923077e-06,
"loss": 0.6956,
"step": 1420
},
{
"epoch": 6.84311377245509,
"grad_norm": 72.61941528320312,
"learning_rate": 8.055555555555557e-06,
"loss": 0.8305,
"step": 1430
},
{
"epoch": 6.891017964071857,
"grad_norm": 50.65372085571289,
"learning_rate": 8.034188034188036e-06,
"loss": 1.294,
"step": 1440
},
{
"epoch": 6.938922155688623,
"grad_norm": 30.090587615966797,
"learning_rate": 8.012820512820515e-06,
"loss": 0.7265,
"step": 1450
},
{
"epoch": 6.9868263473053895,
"grad_norm": 61.50442886352539,
"learning_rate": 7.991452991452993e-06,
"loss": 0.9642,
"step": 1460
},
{
"epoch": 7.0,
"eval_loss": 0.4302707612514496,
"eval_macro_f1": 0.8384768009768009,
"eval_runtime": 44.1392,
"eval_samples_per_second": 37.857,
"eval_steps_per_second": 4.735,
"step": 1463
},
{
"epoch": 7.0335329341317365,
"grad_norm": 51.26984786987305,
"learning_rate": 7.970085470085472e-06,
"loss": 0.5057,
"step": 1470
},
{
"epoch": 7.081437125748503,
"grad_norm": 40.46345138549805,
"learning_rate": 7.948717948717949e-06,
"loss": 0.6524,
"step": 1480
},
{
"epoch": 7.129341317365269,
"grad_norm": 86.76705169677734,
"learning_rate": 7.927350427350427e-06,
"loss": 0.7067,
"step": 1490
},
{
"epoch": 7.177245508982036,
"grad_norm": 47.121002197265625,
"learning_rate": 7.905982905982906e-06,
"loss": 0.6437,
"step": 1500
},
{
"epoch": 7.225149700598802,
"grad_norm": 63.37303924560547,
"learning_rate": 7.884615384615384e-06,
"loss": 0.7471,
"step": 1510
},
{
"epoch": 7.273053892215569,
"grad_norm": 47.41765594482422,
"learning_rate": 7.863247863247863e-06,
"loss": 0.8865,
"step": 1520
},
{
"epoch": 7.320958083832335,
"grad_norm": 49.318092346191406,
"learning_rate": 7.841880341880342e-06,
"loss": 0.804,
"step": 1530
},
{
"epoch": 7.368862275449102,
"grad_norm": 45.841331481933594,
"learning_rate": 7.820512820512822e-06,
"loss": 0.8108,
"step": 1540
},
{
"epoch": 7.416766467065869,
"grad_norm": 50.48727035522461,
"learning_rate": 7.7991452991453e-06,
"loss": 0.8056,
"step": 1550
},
{
"epoch": 7.464670658682635,
"grad_norm": 32.53761291503906,
"learning_rate": 7.77777777777778e-06,
"loss": 0.652,
"step": 1560
},
{
"epoch": 7.512574850299401,
"grad_norm": 62.68672561645508,
"learning_rate": 7.756410256410258e-06,
"loss": 1.0458,
"step": 1570
},
{
"epoch": 7.560479041916167,
"grad_norm": 45.77931594848633,
"learning_rate": 7.735042735042736e-06,
"loss": 0.712,
"step": 1580
},
{
"epoch": 7.608383233532934,
"grad_norm": 46.453208923339844,
"learning_rate": 7.713675213675215e-06,
"loss": 0.8081,
"step": 1590
},
{
"epoch": 7.656287425149701,
"grad_norm": 97.16567993164062,
"learning_rate": 7.692307692307694e-06,
"loss": 0.7669,
"step": 1600
},
{
"epoch": 7.704191616766467,
"grad_norm": 40.78181076049805,
"learning_rate": 7.670940170940172e-06,
"loss": 0.7844,
"step": 1610
},
{
"epoch": 7.752095808383234,
"grad_norm": 72.332763671875,
"learning_rate": 7.649572649572649e-06,
"loss": 0.7251,
"step": 1620
},
{
"epoch": 7.8,
"grad_norm": 61.692543029785156,
"learning_rate": 7.6282051282051286e-06,
"loss": 0.8466,
"step": 1630
},
{
"epoch": 7.847904191616767,
"grad_norm": 76.77676391601562,
"learning_rate": 7.606837606837607e-06,
"loss": 0.8042,
"step": 1640
},
{
"epoch": 7.895808383233533,
"grad_norm": 29.141651153564453,
"learning_rate": 7.585470085470086e-06,
"loss": 0.7552,
"step": 1650
},
{
"epoch": 7.9437125748502995,
"grad_norm": 58.72688293457031,
"learning_rate": 7.564102564102564e-06,
"loss": 0.5293,
"step": 1660
},
{
"epoch": 7.991616766467066,
"grad_norm": 52.79533386230469,
"learning_rate": 7.542735042735043e-06,
"loss": 0.7987,
"step": 1670
},
{
"epoch": 8.0,
"eval_loss": 0.4819260835647583,
"eval_macro_f1": 0.8445069931839906,
"eval_runtime": 49.8575,
"eval_samples_per_second": 33.516,
"eval_steps_per_second": 4.192,
"step": 1672
},
{
"epoch": 8.038323353293412,
"grad_norm": 15.822467803955078,
"learning_rate": 7.521367521367522e-06,
"loss": 0.4624,
"step": 1680
},
{
"epoch": 8.08622754491018,
"grad_norm": 17.05672264099121,
"learning_rate": 7.500000000000001e-06,
"loss": 0.518,
"step": 1690
},
{
"epoch": 8.134131736526946,
"grad_norm": 404.2013244628906,
"learning_rate": 7.47863247863248e-06,
"loss": 0.6137,
"step": 1700
},
{
"epoch": 8.182035928143712,
"grad_norm": 72.656982421875,
"learning_rate": 7.457264957264958e-06,
"loss": 0.7263,
"step": 1710
},
{
"epoch": 8.22994011976048,
"grad_norm": 42.37464141845703,
"learning_rate": 7.435897435897437e-06,
"loss": 0.5209,
"step": 1720
},
{
"epoch": 8.277844311377246,
"grad_norm": 41.12651062011719,
"learning_rate": 7.4145299145299155e-06,
"loss": 0.6419,
"step": 1730
},
{
"epoch": 8.325748502994012,
"grad_norm": 33.06435012817383,
"learning_rate": 7.393162393162394e-06,
"loss": 0.6121,
"step": 1740
},
{
"epoch": 8.373652694610778,
"grad_norm": 54.51243209838867,
"learning_rate": 7.371794871794873e-06,
"loss": 0.554,
"step": 1750
},
{
"epoch": 8.421556886227545,
"grad_norm": 23.176687240600586,
"learning_rate": 7.350427350427351e-06,
"loss": 0.8463,
"step": 1760
},
{
"epoch": 8.469461077844311,
"grad_norm": 27.295547485351562,
"learning_rate": 7.329059829059829e-06,
"loss": 0.5512,
"step": 1770
},
{
"epoch": 8.517365269461077,
"grad_norm": 19.80516242980957,
"learning_rate": 7.307692307692308e-06,
"loss": 0.7343,
"step": 1780
},
{
"epoch": 8.565269461077845,
"grad_norm": 100.30529022216797,
"learning_rate": 7.286324786324786e-06,
"loss": 0.5751,
"step": 1790
},
{
"epoch": 8.613173652694611,
"grad_norm": 40.754974365234375,
"learning_rate": 7.264957264957266e-06,
"loss": 0.8093,
"step": 1800
},
{
"epoch": 8.661077844311377,
"grad_norm": 76.589111328125,
"learning_rate": 7.243589743589744e-06,
"loss": 0.7388,
"step": 1810
},
{
"epoch": 8.708982035928143,
"grad_norm": 54.932838439941406,
"learning_rate": 7.222222222222223e-06,
"loss": 0.6432,
"step": 1820
},
{
"epoch": 8.75688622754491,
"grad_norm": 54.24689483642578,
"learning_rate": 7.2008547008547015e-06,
"loss": 0.8261,
"step": 1830
},
{
"epoch": 8.804790419161677,
"grad_norm": 26.55350112915039,
"learning_rate": 7.17948717948718e-06,
"loss": 0.701,
"step": 1840
},
{
"epoch": 8.852694610778443,
"grad_norm": 28.47566032409668,
"learning_rate": 7.158119658119659e-06,
"loss": 0.8115,
"step": 1850
},
{
"epoch": 8.90059880239521,
"grad_norm": 60.1458740234375,
"learning_rate": 7.136752136752137e-06,
"loss": 0.7178,
"step": 1860
},
{
"epoch": 8.948502994011976,
"grad_norm": 47.09638214111328,
"learning_rate": 7.115384615384616e-06,
"loss": 0.8113,
"step": 1870
},
{
"epoch": 8.996407185628742,
"grad_norm": 33.51266098022461,
"learning_rate": 7.0940170940170945e-06,
"loss": 0.7928,
"step": 1880
},
{
"epoch": 9.0,
"eval_loss": 0.41668668389320374,
"eval_macro_f1": 0.8441824722148172,
"eval_runtime": 44.5976,
"eval_samples_per_second": 37.468,
"eval_steps_per_second": 4.686,
"step": 1881
},
{
"epoch": 9.04311377245509,
"grad_norm": 81.31795501708984,
"learning_rate": 7.072649572649574e-06,
"loss": 0.6389,
"step": 1890
},
{
"epoch": 9.091017964071856,
"grad_norm": 13.542675971984863,
"learning_rate": 7.051282051282053e-06,
"loss": 0.5636,
"step": 1900
},
{
"epoch": 9.138922155688622,
"grad_norm": 82.46122741699219,
"learning_rate": 7.02991452991453e-06,
"loss": 0.6297,
"step": 1910
},
{
"epoch": 9.18682634730539,
"grad_norm": 47.003292083740234,
"learning_rate": 7.008547008547009e-06,
"loss": 0.5565,
"step": 1920
},
{
"epoch": 9.234730538922156,
"grad_norm": 26.577322006225586,
"learning_rate": 6.9871794871794876e-06,
"loss": 0.9967,
"step": 1930
},
{
"epoch": 9.282634730538922,
"grad_norm": 47.41004180908203,
"learning_rate": 6.965811965811966e-06,
"loss": 0.5435,
"step": 1940
},
{
"epoch": 9.33053892215569,
"grad_norm": 64.1046371459961,
"learning_rate": 6.944444444444445e-06,
"loss": 0.6405,
"step": 1950
},
{
"epoch": 9.378443113772455,
"grad_norm": 48.981136322021484,
"learning_rate": 6.923076923076923e-06,
"loss": 0.5692,
"step": 1960
},
{
"epoch": 9.426347305389221,
"grad_norm": 113.02758026123047,
"learning_rate": 6.901709401709402e-06,
"loss": 0.5958,
"step": 1970
},
{
"epoch": 9.474251497005987,
"grad_norm": 87.0408935546875,
"learning_rate": 6.880341880341881e-06,
"loss": 0.7034,
"step": 1980
},
{
"epoch": 9.522155688622755,
"grad_norm": 59.14445495605469,
"learning_rate": 6.858974358974359e-06,
"loss": 0.7209,
"step": 1990
},
{
"epoch": 9.570059880239521,
"grad_norm": 53.770408630371094,
"learning_rate": 6.837606837606839e-06,
"loss": 0.8953,
"step": 2000
},
{
"epoch": 9.617964071856287,
"grad_norm": 46.554481506347656,
"learning_rate": 6.816239316239317e-06,
"loss": 0.7084,
"step": 2010
},
{
"epoch": 9.665868263473055,
"grad_norm": 41.41537857055664,
"learning_rate": 6.794871794871796e-06,
"loss": 0.5916,
"step": 2020
},
{
"epoch": 9.71377245508982,
"grad_norm": 46.17745590209961,
"learning_rate": 6.7735042735042745e-06,
"loss": 0.5613,
"step": 2030
},
{
"epoch": 9.761676646706587,
"grad_norm": 61.96057891845703,
"learning_rate": 6.752136752136753e-06,
"loss": 0.7686,
"step": 2040
},
{
"epoch": 9.809580838323352,
"grad_norm": 17.55687713623047,
"learning_rate": 6.730769230769232e-06,
"loss": 0.7887,
"step": 2050
},
{
"epoch": 9.85748502994012,
"grad_norm": 45.83679962158203,
"learning_rate": 6.7094017094017094e-06,
"loss": 0.4342,
"step": 2060
},
{
"epoch": 9.905389221556886,
"grad_norm": 65.41094970703125,
"learning_rate": 6.688034188034188e-06,
"loss": 0.7364,
"step": 2070
},
{
"epoch": 9.953293413173652,
"grad_norm": 63.39105987548828,
"learning_rate": 6.666666666666667e-06,
"loss": 0.5624,
"step": 2080
},
{
"epoch": 10.0,
"grad_norm": 35.72822189331055,
"learning_rate": 6.645299145299145e-06,
"loss": 0.5399,
"step": 2090
},
{
"epoch": 10.0,
"eval_loss": 0.5035926103591919,
"eval_macro_f1": 0.8252431536013625,
"eval_runtime": 43.3409,
"eval_samples_per_second": 38.555,
"eval_steps_per_second": 4.822,
"step": 2090
},
{
"epoch": 10.047904191616766,
"grad_norm": 43.547481536865234,
"learning_rate": 6.623931623931624e-06,
"loss": 0.6161,
"step": 2100
},
{
"epoch": 10.095808383233534,
"grad_norm": 9.250253677368164,
"learning_rate": 6.602564102564103e-06,
"loss": 0.6646,
"step": 2110
},
{
"epoch": 10.1437125748503,
"grad_norm": 37.02254867553711,
"learning_rate": 6.581196581196582e-06,
"loss": 0.4968,
"step": 2120
},
{
"epoch": 10.191616766467066,
"grad_norm": 54.5052490234375,
"learning_rate": 6.5598290598290605e-06,
"loss": 0.3927,
"step": 2130
},
{
"epoch": 10.239520958083832,
"grad_norm": 64.98939514160156,
"learning_rate": 6.538461538461539e-06,
"loss": 0.5233,
"step": 2140
},
{
"epoch": 10.2874251497006,
"grad_norm": 41.95616912841797,
"learning_rate": 6.517094017094018e-06,
"loss": 0.7305,
"step": 2150
},
{
"epoch": 10.335329341317365,
"grad_norm": 26.417930603027344,
"learning_rate": 6.495726495726496e-06,
"loss": 0.5581,
"step": 2160
},
{
"epoch": 10.383233532934131,
"grad_norm": 60.45222473144531,
"learning_rate": 6.474358974358975e-06,
"loss": 0.5982,
"step": 2170
},
{
"epoch": 10.431137724550899,
"grad_norm": 47.69218826293945,
"learning_rate": 6.4529914529914535e-06,
"loss": 0.3558,
"step": 2180
},
{
"epoch": 10.479041916167665,
"grad_norm": 33.18900680541992,
"learning_rate": 6.431623931623933e-06,
"loss": 0.6521,
"step": 2190
},
{
"epoch": 10.52694610778443,
"grad_norm": 39.501426696777344,
"learning_rate": 6.410256410256412e-06,
"loss": 0.6646,
"step": 2200
},
{
"epoch": 10.574850299401197,
"grad_norm": 61.5817985534668,
"learning_rate": 6.3888888888888885e-06,
"loss": 0.6448,
"step": 2210
},
{
"epoch": 10.622754491017965,
"grad_norm": 37.70832061767578,
"learning_rate": 6.367521367521368e-06,
"loss": 0.4272,
"step": 2220
},
{
"epoch": 10.67065868263473,
"grad_norm": 56.18067169189453,
"learning_rate": 6.3461538461538466e-06,
"loss": 0.5429,
"step": 2230
},
{
"epoch": 10.718562874251496,
"grad_norm": 38.271278381347656,
"learning_rate": 6.324786324786325e-06,
"loss": 0.503,
"step": 2240
},
{
"epoch": 10.766467065868264,
"grad_norm": 53.65212631225586,
"learning_rate": 6.303418803418804e-06,
"loss": 0.5207,
"step": 2250
},
{
"epoch": 10.81437125748503,
"grad_norm": 15.48988151550293,
"learning_rate": 6.282051282051282e-06,
"loss": 0.6707,
"step": 2260
},
{
"epoch": 10.862275449101796,
"grad_norm": 44.31208038330078,
"learning_rate": 6.260683760683761e-06,
"loss": 0.3833,
"step": 2270
},
{
"epoch": 10.910179640718562,
"grad_norm": 48.97806167602539,
"learning_rate": 6.23931623931624e-06,
"loss": 0.5623,
"step": 2280
},
{
"epoch": 10.95808383233533,
"grad_norm": 73.36396789550781,
"learning_rate": 6.217948717948718e-06,
"loss": 0.6053,
"step": 2290
},
{
"epoch": 11.0,
"eval_loss": 0.4976274073123932,
"eval_macro_f1": 0.8447303083464661,
"eval_runtime": 44.1148,
"eval_samples_per_second": 37.878,
"eval_steps_per_second": 4.738,
"step": 2299
},
{
"epoch": 11.004790419161676,
"grad_norm": 52.19700622558594,
"learning_rate": 6.196581196581198e-06,
"loss": 0.4362,
"step": 2300
},
{
"epoch": 11.052694610778444,
"grad_norm": 12.376163482666016,
"learning_rate": 6.175213675213676e-06,
"loss": 0.2856,
"step": 2310
},
{
"epoch": 11.10059880239521,
"grad_norm": 55.741554260253906,
"learning_rate": 6.153846153846155e-06,
"loss": 0.4785,
"step": 2320
},
{
"epoch": 11.148502994011976,
"grad_norm": 23.254854202270508,
"learning_rate": 6.1324786324786335e-06,
"loss": 0.5646,
"step": 2330
},
{
"epoch": 11.196407185628743,
"grad_norm": 54.249332427978516,
"learning_rate": 6.111111111111112e-06,
"loss": 0.4413,
"step": 2340
},
{
"epoch": 11.24431137724551,
"grad_norm": 91.06517791748047,
"learning_rate": 6.08974358974359e-06,
"loss": 0.4921,
"step": 2350
},
{
"epoch": 11.292215568862275,
"grad_norm": 12.940735816955566,
"learning_rate": 6.0683760683760684e-06,
"loss": 0.5419,
"step": 2360
},
{
"epoch": 11.340119760479041,
"grad_norm": 88.24459075927734,
"learning_rate": 6.047008547008547e-06,
"loss": 0.582,
"step": 2370
},
{
"epoch": 11.388023952095809,
"grad_norm": 28.848173141479492,
"learning_rate": 6.025641025641026e-06,
"loss": 0.5232,
"step": 2380
},
{
"epoch": 11.435928143712575,
"grad_norm": 18.56818389892578,
"learning_rate": 6.004273504273504e-06,
"loss": 0.4061,
"step": 2390
},
{
"epoch": 11.48383233532934,
"grad_norm": 88.79745483398438,
"learning_rate": 5.982905982905983e-06,
"loss": 0.4603,
"step": 2400
},
{
"epoch": 11.531736526946109,
"grad_norm": 59.039039611816406,
"learning_rate": 5.961538461538462e-06,
"loss": 0.4677,
"step": 2410
},
{
"epoch": 11.579640718562874,
"grad_norm": 60.00382614135742,
"learning_rate": 5.940170940170941e-06,
"loss": 0.3458,
"step": 2420
},
{
"epoch": 11.62754491017964,
"grad_norm": 37.48514938354492,
"learning_rate": 5.9188034188034195e-06,
"loss": 0.6475,
"step": 2430
},
{
"epoch": 11.675449101796406,
"grad_norm": 53.149925231933594,
"learning_rate": 5.897435897435898e-06,
"loss": 0.4033,
"step": 2440
},
{
"epoch": 11.723353293413174,
"grad_norm": 57.217735290527344,
"learning_rate": 5.876068376068377e-06,
"loss": 0.5827,
"step": 2450
},
{
"epoch": 11.77125748502994,
"grad_norm": 32.07596206665039,
"learning_rate": 5.854700854700855e-06,
"loss": 0.4473,
"step": 2460
},
{
"epoch": 11.819161676646706,
"grad_norm": 39.394474029541016,
"learning_rate": 5.833333333333334e-06,
"loss": 0.6156,
"step": 2470
},
{
"epoch": 11.867065868263474,
"grad_norm": 42.22713088989258,
"learning_rate": 5.8119658119658126e-06,
"loss": 0.5249,
"step": 2480
},
{
"epoch": 11.91497005988024,
"grad_norm": 44.544944763183594,
"learning_rate": 5.790598290598292e-06,
"loss": 0.5426,
"step": 2490
},
{
"epoch": 11.962874251497006,
"grad_norm": 37.793052673339844,
"learning_rate": 5.769230769230769e-06,
"loss": 0.7017,
"step": 2500
},
{
"epoch": 12.0,
"eval_loss": 0.5270896553993225,
"eval_macro_f1": 0.8419574495547526,
"eval_runtime": 52.4362,
"eval_samples_per_second": 31.867,
"eval_steps_per_second": 3.986,
"step": 2508
},
{
"epoch": 12.009580838323354,
"grad_norm": 33.670814514160156,
"learning_rate": 5.7478632478632475e-06,
"loss": 0.348,
"step": 2510
},
{
"epoch": 12.05748502994012,
"grad_norm": 16.13005256652832,
"learning_rate": 5.726495726495727e-06,
"loss": 0.3518,
"step": 2520
},
{
"epoch": 12.105389221556885,
"grad_norm": 117.29934692382812,
"learning_rate": 5.705128205128206e-06,
"loss": 0.4044,
"step": 2530
},
{
"epoch": 12.153293413173653,
"grad_norm": 66.2587661743164,
"learning_rate": 5.683760683760684e-06,
"loss": 0.3835,
"step": 2540
},
{
"epoch": 12.20119760479042,
"grad_norm": 39.18204879760742,
"learning_rate": 5.662393162393163e-06,
"loss": 0.5428,
"step": 2550
},
{
"epoch": 12.249101796407185,
"grad_norm": 29.139354705810547,
"learning_rate": 5.641025641025641e-06,
"loss": 0.2677,
"step": 2560
},
{
"epoch": 12.297005988023953,
"grad_norm": 49.968666076660156,
"learning_rate": 5.61965811965812e-06,
"loss": 0.4607,
"step": 2570
},
{
"epoch": 12.344910179640719,
"grad_norm": 30.069171905517578,
"learning_rate": 5.598290598290599e-06,
"loss": 0.4371,
"step": 2580
},
{
"epoch": 12.392814371257485,
"grad_norm": 21.969675064086914,
"learning_rate": 5.576923076923077e-06,
"loss": 0.473,
"step": 2590
},
{
"epoch": 12.44071856287425,
"grad_norm": 76.71914672851562,
"learning_rate": 5.555555555555557e-06,
"loss": 0.3471,
"step": 2600
},
{
"epoch": 12.488622754491018,
"grad_norm": 50.17422866821289,
"learning_rate": 5.534188034188035e-06,
"loss": 0.7815,
"step": 2610
},
{
"epoch": 12.536526946107784,
"grad_norm": 44.04591751098633,
"learning_rate": 5.512820512820514e-06,
"loss": 0.5847,
"step": 2620
},
{
"epoch": 12.58443113772455,
"grad_norm": 17.07801055908203,
"learning_rate": 5.4914529914529925e-06,
"loss": 0.5836,
"step": 2630
},
{
"epoch": 12.632335329341318,
"grad_norm": 37.61981201171875,
"learning_rate": 5.470085470085471e-06,
"loss": 0.6164,
"step": 2640
},
{
"epoch": 12.680239520958084,
"grad_norm": 45.67870330810547,
"learning_rate": 5.448717948717949e-06,
"loss": 0.4233,
"step": 2650
},
{
"epoch": 12.72814371257485,
"grad_norm": 6.8784918785095215,
"learning_rate": 5.4273504273504275e-06,
"loss": 0.4341,
"step": 2660
},
{
"epoch": 12.776047904191616,
"grad_norm": 106.60427856445312,
"learning_rate": 5.405982905982906e-06,
"loss": 0.4442,
"step": 2670
},
{
"epoch": 12.823952095808384,
"grad_norm": 10.980769157409668,
"learning_rate": 5.384615384615385e-06,
"loss": 0.3069,
"step": 2680
},
{
"epoch": 12.87185628742515,
"grad_norm": 108.9623794555664,
"learning_rate": 5.363247863247863e-06,
"loss": 0.461,
"step": 2690
},
{
"epoch": 12.919760479041916,
"grad_norm": 49.222408294677734,
"learning_rate": 5.341880341880342e-06,
"loss": 0.5779,
"step": 2700
},
{
"epoch": 12.967664670658683,
"grad_norm": 77.78028869628906,
"learning_rate": 5.320512820512821e-06,
"loss": 0.5748,
"step": 2710
},
{
"epoch": 13.0,
"eval_loss": 0.6038709878921509,
"eval_macro_f1": 0.82953734826579,
"eval_runtime": 45.778,
"eval_samples_per_second": 36.502,
"eval_steps_per_second": 4.566,
"step": 2717
},
{
"epoch": 13.01437125748503,
"grad_norm": 64.35317993164062,
"learning_rate": 5.2991452991453e-06,
"loss": 0.4684,
"step": 2720
},
{
"epoch": 13.062275449101797,
"grad_norm": 27.278182983398438,
"learning_rate": 5.2777777777777785e-06,
"loss": 0.2145,
"step": 2730
},
{
"epoch": 13.110179640718563,
"grad_norm": 12.438913345336914,
"learning_rate": 5.256410256410257e-06,
"loss": 0.3516,
"step": 2740
},
{
"epoch": 13.158083832335329,
"grad_norm": 111.6015396118164,
"learning_rate": 5.235042735042736e-06,
"loss": 0.4645,
"step": 2750
},
{
"epoch": 13.205988023952095,
"grad_norm": 29.416948318481445,
"learning_rate": 5.213675213675214e-06,
"loss": 0.4283,
"step": 2760
},
{
"epoch": 13.253892215568863,
"grad_norm": 42.40279006958008,
"learning_rate": 5.192307692307693e-06,
"loss": 0.3558,
"step": 2770
},
{
"epoch": 13.301796407185629,
"grad_norm": 90.29447937011719,
"learning_rate": 5.1709401709401716e-06,
"loss": 0.4245,
"step": 2780
},
{
"epoch": 13.349700598802395,
"grad_norm": 51.8712272644043,
"learning_rate": 5.149572649572649e-06,
"loss": 0.3301,
"step": 2790
},
{
"epoch": 13.397604790419162,
"grad_norm": 32.75868606567383,
"learning_rate": 5.128205128205128e-06,
"loss": 0.401,
"step": 2800
},
{
"epoch": 13.445508982035928,
"grad_norm": 14.076770782470703,
"learning_rate": 5.1068376068376065e-06,
"loss": 0.2116,
"step": 2810
},
{
"epoch": 13.493413173652694,
"grad_norm": 158.29722595214844,
"learning_rate": 5.085470085470086e-06,
"loss": 0.4145,
"step": 2820
},
{
"epoch": 13.54131736526946,
"grad_norm": 43.535980224609375,
"learning_rate": 5.064102564102565e-06,
"loss": 0.5061,
"step": 2830
},
{
"epoch": 13.589221556886228,
"grad_norm": 129.22494506835938,
"learning_rate": 5.042735042735043e-06,
"loss": 0.2653,
"step": 2840
},
{
"epoch": 13.637125748502994,
"grad_norm": 19.907264709472656,
"learning_rate": 5.021367521367522e-06,
"loss": 0.5019,
"step": 2850
},
{
"epoch": 13.68502994011976,
"grad_norm": 101.03581237792969,
"learning_rate": 5e-06,
"loss": 0.6201,
"step": 2860
},
{
"epoch": 13.732934131736528,
"grad_norm": 54.207130432128906,
"learning_rate": 4.978632478632479e-06,
"loss": 0.4591,
"step": 2870
},
{
"epoch": 13.780838323353294,
"grad_norm": 57.07173538208008,
"learning_rate": 4.957264957264958e-06,
"loss": 0.5418,
"step": 2880
},
{
"epoch": 13.82874251497006,
"grad_norm": 54.304054260253906,
"learning_rate": 4.935897435897436e-06,
"loss": 0.6254,
"step": 2890
},
{
"epoch": 13.876646706586826,
"grad_norm": 74.52729797363281,
"learning_rate": 4.914529914529915e-06,
"loss": 0.302,
"step": 2900
},
{
"epoch": 13.924550898203593,
"grad_norm": 46.62977600097656,
"learning_rate": 4.8931623931623934e-06,
"loss": 0.3676,
"step": 2910
},
{
"epoch": 13.97245508982036,
"grad_norm": 73.47917938232422,
"learning_rate": 4.871794871794872e-06,
"loss": 0.478,
"step": 2920
},
{
"epoch": 14.0,
"eval_loss": 0.5613604784011841,
"eval_macro_f1": 0.833819031115799,
"eval_runtime": 44.8642,
"eval_samples_per_second": 37.246,
"eval_steps_per_second": 4.659,
"step": 2926
},
{
"epoch": 14.019161676646707,
"grad_norm": 106.49620056152344,
"learning_rate": 4.850427350427351e-06,
"loss": 0.4661,
"step": 2930
},
{
"epoch": 14.067065868263473,
"grad_norm": 23.546146392822266,
"learning_rate": 4.829059829059829e-06,
"loss": 0.3035,
"step": 2940
},
{
"epoch": 14.114970059880239,
"grad_norm": 23.96906280517578,
"learning_rate": 4.807692307692308e-06,
"loss": 0.3686,
"step": 2950
},
{
"epoch": 14.162874251497007,
"grad_norm": 82.27886199951172,
"learning_rate": 4.786324786324787e-06,
"loss": 0.4799,
"step": 2960
},
{
"epoch": 14.210778443113773,
"grad_norm": 10.91873550415039,
"learning_rate": 4.764957264957265e-06,
"loss": 0.2748,
"step": 2970
},
{
"epoch": 14.258682634730539,
"grad_norm": 38.029109954833984,
"learning_rate": 4.743589743589744e-06,
"loss": 0.3756,
"step": 2980
},
{
"epoch": 14.306586826347305,
"grad_norm": 73.956787109375,
"learning_rate": 4.722222222222222e-06,
"loss": 0.5632,
"step": 2990
},
{
"epoch": 14.354491017964072,
"grad_norm": 19.53325843811035,
"learning_rate": 4.700854700854701e-06,
"loss": 0.3685,
"step": 3000
},
{
"epoch": 14.402395209580838,
"grad_norm": 21.934566497802734,
"learning_rate": 4.6794871794871795e-06,
"loss": 0.3186,
"step": 3010
},
{
"epoch": 14.450299401197604,
"grad_norm": 48.01268768310547,
"learning_rate": 4.658119658119659e-06,
"loss": 0.4358,
"step": 3020
},
{
"epoch": 14.498203592814372,
"grad_norm": 67.98200225830078,
"learning_rate": 4.6367521367521375e-06,
"loss": 0.2328,
"step": 3030
},
{
"epoch": 14.546107784431138,
"grad_norm": 20.185400009155273,
"learning_rate": 4.615384615384616e-06,
"loss": 0.4095,
"step": 3040
},
{
"epoch": 14.594011976047904,
"grad_norm": 35.661773681640625,
"learning_rate": 4.594017094017094e-06,
"loss": 0.6548,
"step": 3050
},
{
"epoch": 14.64191616766467,
"grad_norm": 36.68177032470703,
"learning_rate": 4.5726495726495725e-06,
"loss": 0.3479,
"step": 3060
},
{
"epoch": 14.689820359281438,
"grad_norm": 19.040000915527344,
"learning_rate": 4.551282051282052e-06,
"loss": 0.3784,
"step": 3070
},
{
"epoch": 14.737724550898204,
"grad_norm": 77.58363342285156,
"learning_rate": 4.5299145299145306e-06,
"loss": 0.4044,
"step": 3080
},
{
"epoch": 14.78562874251497,
"grad_norm": 98.52961730957031,
"learning_rate": 4.508547008547009e-06,
"loss": 0.42,
"step": 3090
},
{
"epoch": 14.833532934131737,
"grad_norm": 167.6805419921875,
"learning_rate": 4.487179487179488e-06,
"loss": 0.3661,
"step": 3100
},
{
"epoch": 14.881437125748503,
"grad_norm": 81.3060302734375,
"learning_rate": 4.465811965811966e-06,
"loss": 0.6895,
"step": 3110
},
{
"epoch": 14.92934131736527,
"grad_norm": 31.589746475219727,
"learning_rate": 4.444444444444444e-06,
"loss": 0.3983,
"step": 3120
},
{
"epoch": 14.977245508982035,
"grad_norm": 33.01588821411133,
"learning_rate": 4.423076923076924e-06,
"loss": 0.3302,
"step": 3130
},
{
"epoch": 15.0,
"eval_loss": 0.6139780282974243,
"eval_macro_f1": 0.850078934734469,
"eval_runtime": 61.1653,
"eval_samples_per_second": 27.319,
"eval_steps_per_second": 3.417,
"step": 3135
},
{
"epoch": 15.023952095808383,
"grad_norm": 63.095176696777344,
"learning_rate": 4.401709401709402e-06,
"loss": 0.492,
"step": 3140
},
{
"epoch": 15.071856287425149,
"grad_norm": 14.580053329467773,
"learning_rate": 4.380341880341881e-06,
"loss": 0.5926,
"step": 3150
},
{
"epoch": 15.119760479041917,
"grad_norm": 9.320775032043457,
"learning_rate": 4.358974358974359e-06,
"loss": 0.455,
"step": 3160
},
{
"epoch": 15.167664670658683,
"grad_norm": 57.686344146728516,
"learning_rate": 4.337606837606838e-06,
"loss": 0.3759,
"step": 3170
},
{
"epoch": 15.215568862275449,
"grad_norm": 4.791827201843262,
"learning_rate": 4.316239316239317e-06,
"loss": 0.2594,
"step": 3180
},
{
"epoch": 15.263473053892216,
"grad_norm": 181.0989532470703,
"learning_rate": 4.294871794871795e-06,
"loss": 0.4456,
"step": 3190
},
{
"epoch": 15.311377245508982,
"grad_norm": 15.999568939208984,
"learning_rate": 4.273504273504274e-06,
"loss": 0.3199,
"step": 3200
},
{
"epoch": 15.359281437125748,
"grad_norm": 34.29719543457031,
"learning_rate": 4.2521367521367524e-06,
"loss": 0.3138,
"step": 3210
},
{
"epoch": 15.407185628742514,
"grad_norm": 108.82241821289062,
"learning_rate": 4.230769230769231e-06,
"loss": 0.381,
"step": 3220
},
{
"epoch": 15.455089820359282,
"grad_norm": 7.426156044006348,
"learning_rate": 4.20940170940171e-06,
"loss": 0.3027,
"step": 3230
},
{
"epoch": 15.502994011976048,
"grad_norm": 33.00428009033203,
"learning_rate": 4.188034188034188e-06,
"loss": 0.2268,
"step": 3240
},
{
"epoch": 15.550898203592814,
"grad_norm": 64.04177856445312,
"learning_rate": 4.166666666666667e-06,
"loss": 0.4501,
"step": 3250
},
{
"epoch": 15.598802395209582,
"grad_norm": 28.86948585510254,
"learning_rate": 4.145299145299146e-06,
"loss": 0.4704,
"step": 3260
},
{
"epoch": 15.646706586826348,
"grad_norm": 33.580509185791016,
"learning_rate": 4.123931623931624e-06,
"loss": 0.3373,
"step": 3270
},
{
"epoch": 15.694610778443113,
"grad_norm": 27.37198257446289,
"learning_rate": 4.102564102564103e-06,
"loss": 0.3751,
"step": 3280
},
{
"epoch": 15.74251497005988,
"grad_norm": 29.447359085083008,
"learning_rate": 4.081196581196581e-06,
"loss": 0.2583,
"step": 3290
},
{
"epoch": 15.790419161676647,
"grad_norm": 53.31045913696289,
"learning_rate": 4.05982905982906e-06,
"loss": 0.2685,
"step": 3300
},
{
"epoch": 15.838323353293413,
"grad_norm": 135.5189971923828,
"learning_rate": 4.0384615384615385e-06,
"loss": 0.6179,
"step": 3310
},
{
"epoch": 15.886227544910179,
"grad_norm": 24.811853408813477,
"learning_rate": 4.017094017094018e-06,
"loss": 0.2871,
"step": 3320
},
{
"epoch": 15.934131736526947,
"grad_norm": 44.590511322021484,
"learning_rate": 3.9957264957264966e-06,
"loss": 0.3918,
"step": 3330
},
{
"epoch": 15.982035928143713,
"grad_norm": 34.29424285888672,
"learning_rate": 3.974358974358974e-06,
"loss": 0.4284,
"step": 3340
},
{
"epoch": 16.0,
"eval_loss": 0.6892519593238831,
"eval_macro_f1": 0.8416606209561599,
"eval_runtime": 68.7411,
"eval_samples_per_second": 24.309,
"eval_steps_per_second": 3.04,
"step": 3344
},
{
"epoch": 16.02874251497006,
"grad_norm": 76.20410919189453,
"learning_rate": 3.952991452991453e-06,
"loss": 0.2715,
"step": 3350
},
{
"epoch": 16.076646706586825,
"grad_norm": 97.97017669677734,
"learning_rate": 3.9316239316239315e-06,
"loss": 0.4799,
"step": 3360
},
{
"epoch": 16.124550898203594,
"grad_norm": 42.71623229980469,
"learning_rate": 3.910256410256411e-06,
"loss": 0.4008,
"step": 3370
},
{
"epoch": 16.17245508982036,
"grad_norm": 48.501312255859375,
"learning_rate": 3.88888888888889e-06,
"loss": 0.4609,
"step": 3380
},
{
"epoch": 16.220359281437126,
"grad_norm": 15.1089448928833,
"learning_rate": 3.867521367521368e-06,
"loss": 0.2344,
"step": 3390
},
{
"epoch": 16.268263473053892,
"grad_norm": 87.63355255126953,
"learning_rate": 3.846153846153847e-06,
"loss": 0.4965,
"step": 3400
},
{
"epoch": 16.316167664670658,
"grad_norm": 54.828556060791016,
"learning_rate": 3.8247863247863246e-06,
"loss": 0.3707,
"step": 3410
},
{
"epoch": 16.364071856287424,
"grad_norm": 74.80049133300781,
"learning_rate": 3.8034188034188036e-06,
"loss": 0.3693,
"step": 3420
},
{
"epoch": 16.41197604790419,
"grad_norm": 18.754371643066406,
"learning_rate": 3.782051282051282e-06,
"loss": 0.3593,
"step": 3430
},
{
"epoch": 16.45988023952096,
"grad_norm": 24.015533447265625,
"learning_rate": 3.760683760683761e-06,
"loss": 0.215,
"step": 3440
},
{
"epoch": 16.507784431137726,
"grad_norm": 41.305519104003906,
"learning_rate": 3.73931623931624e-06,
"loss": 0.3713,
"step": 3450
},
{
"epoch": 16.55568862275449,
"grad_norm": 54.54863739013672,
"learning_rate": 3.7179487179487184e-06,
"loss": 0.4386,
"step": 3460
},
{
"epoch": 16.603592814371257,
"grad_norm": 71.85780334472656,
"learning_rate": 3.696581196581197e-06,
"loss": 0.3638,
"step": 3470
},
{
"epoch": 16.651497005988023,
"grad_norm": 92.95706939697266,
"learning_rate": 3.6752136752136756e-06,
"loss": 0.3444,
"step": 3480
},
{
"epoch": 16.69940119760479,
"grad_norm": 67.28424072265625,
"learning_rate": 3.653846153846154e-06,
"loss": 0.5401,
"step": 3490
},
{
"epoch": 16.747305389221555,
"grad_norm": 29.402442932128906,
"learning_rate": 3.632478632478633e-06,
"loss": 0.5194,
"step": 3500
},
{
"epoch": 16.795209580838325,
"grad_norm": 38.80514144897461,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.4502,
"step": 3510
},
{
"epoch": 16.84311377245509,
"grad_norm": 10.038030624389648,
"learning_rate": 3.58974358974359e-06,
"loss": 0.1391,
"step": 3520
},
{
"epoch": 16.891017964071857,
"grad_norm": 53.43631362915039,
"learning_rate": 3.5683760683760687e-06,
"loss": 0.2745,
"step": 3530
},
{
"epoch": 16.938922155688623,
"grad_norm": 82.64073181152344,
"learning_rate": 3.5470085470085473e-06,
"loss": 0.2056,
"step": 3540
},
{
"epoch": 16.98682634730539,
"grad_norm": 57.446128845214844,
"learning_rate": 3.5256410256410263e-06,
"loss": 0.5293,
"step": 3550
},
{
"epoch": 17.0,
"eval_loss": 0.7794991731643677,
"eval_macro_f1": 0.8272296780330446,
"eval_runtime": 42.3227,
"eval_samples_per_second": 39.482,
"eval_steps_per_second": 4.938,
"step": 3553
},
{
"epoch": 17.033532934131735,
"grad_norm": 47.826210021972656,
"learning_rate": 3.5042735042735045e-06,
"loss": 0.5691,
"step": 3560
},
{
"epoch": 17.081437125748504,
"grad_norm": 51.37601852416992,
"learning_rate": 3.482905982905983e-06,
"loss": 0.3646,
"step": 3570
},
{
"epoch": 17.12934131736527,
"grad_norm": 5.729410171508789,
"learning_rate": 3.4615384615384617e-06,
"loss": 0.2953,
"step": 3580
},
{
"epoch": 17.177245508982036,
"grad_norm": 74.28618621826172,
"learning_rate": 3.4401709401709403e-06,
"loss": 0.4689,
"step": 3590
},
{
"epoch": 17.225149700598802,
"grad_norm": 1.512823224067688,
"learning_rate": 3.4188034188034193e-06,
"loss": 0.2678,
"step": 3600
},
{
"epoch": 17.273053892215568,
"grad_norm": 80.0973129272461,
"learning_rate": 3.397435897435898e-06,
"loss": 0.3399,
"step": 3610
},
{
"epoch": 17.320958083832334,
"grad_norm": 46.759395599365234,
"learning_rate": 3.3760683760683765e-06,
"loss": 0.3441,
"step": 3620
},
{
"epoch": 17.368862275449104,
"grad_norm": 41.87525939941406,
"learning_rate": 3.3547008547008547e-06,
"loss": 0.2421,
"step": 3630
},
{
"epoch": 17.41676646706587,
"grad_norm": 31.34505844116211,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.2768,
"step": 3640
},
{
"epoch": 17.464670658682635,
"grad_norm": 71.10737609863281,
"learning_rate": 3.311965811965812e-06,
"loss": 0.4662,
"step": 3650
},
{
"epoch": 17.5125748502994,
"grad_norm": 70.69206237792969,
"learning_rate": 3.290598290598291e-06,
"loss": 0.3731,
"step": 3660
},
{
"epoch": 17.560479041916167,
"grad_norm": 58.422847747802734,
"learning_rate": 3.2692307692307696e-06,
"loss": 0.5217,
"step": 3670
},
{
"epoch": 17.608383233532933,
"grad_norm": 11.82396125793457,
"learning_rate": 3.247863247863248e-06,
"loss": 0.2558,
"step": 3680
},
{
"epoch": 17.6562874251497,
"grad_norm": 144.99168395996094,
"learning_rate": 3.2264957264957268e-06,
"loss": 0.3243,
"step": 3690
},
{
"epoch": 17.704191616766465,
"grad_norm": 59.86516571044922,
"learning_rate": 3.205128205128206e-06,
"loss": 0.4804,
"step": 3700
},
{
"epoch": 17.752095808383235,
"grad_norm": 62.745296478271484,
"learning_rate": 3.183760683760684e-06,
"loss": 0.4333,
"step": 3710
},
{
"epoch": 17.8,
"grad_norm": 30.099609375,
"learning_rate": 3.1623931623931626e-06,
"loss": 0.2316,
"step": 3720
},
{
"epoch": 17.847904191616767,
"grad_norm": 53.383583068847656,
"learning_rate": 3.141025641025641e-06,
"loss": 0.5045,
"step": 3730
},
{
"epoch": 17.895808383233533,
"grad_norm": 9.510607719421387,
"learning_rate": 3.11965811965812e-06,
"loss": 0.2016,
"step": 3740
},
{
"epoch": 17.9437125748503,
"grad_norm": 41.073726654052734,
"learning_rate": 3.098290598290599e-06,
"loss": 0.4775,
"step": 3750
},
{
"epoch": 17.991616766467065,
"grad_norm": 18.685773849487305,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.3327,
"step": 3760
},
{
"epoch": 18.0,
"eval_loss": 0.6723179817199707,
"eval_macro_f1": 0.8484331568013577,
"eval_runtime": 38.5624,
"eval_samples_per_second": 43.332,
"eval_steps_per_second": 5.42,
"step": 3762
},
{
"epoch": 18.038323353293414,
"grad_norm": 28.627132415771484,
"learning_rate": 3.055555555555556e-06,
"loss": 0.3131,
"step": 3770
},
{
"epoch": 18.08622754491018,
"grad_norm": 22.602691650390625,
"learning_rate": 3.0341880341880342e-06,
"loss": 0.1872,
"step": 3780
},
{
"epoch": 18.134131736526946,
"grad_norm": 70.41796112060547,
"learning_rate": 3.012820512820513e-06,
"loss": 0.2575,
"step": 3790
},
{
"epoch": 18.182035928143712,
"grad_norm": 52.98373031616211,
"learning_rate": 2.9914529914529914e-06,
"loss": 0.366,
"step": 3800
},
{
"epoch": 18.229940119760478,
"grad_norm": 76.51351928710938,
"learning_rate": 2.9700854700854705e-06,
"loss": 0.3965,
"step": 3810
},
{
"epoch": 18.277844311377244,
"grad_norm": 57.413387298583984,
"learning_rate": 2.948717948717949e-06,
"loss": 0.1353,
"step": 3820
},
{
"epoch": 18.325748502994013,
"grad_norm": 38.45537185668945,
"learning_rate": 2.9273504273504277e-06,
"loss": 0.2277,
"step": 3830
},
{
"epoch": 18.37365269461078,
"grad_norm": 76.71987915039062,
"learning_rate": 2.9059829059829063e-06,
"loss": 0.4852,
"step": 3840
},
{
"epoch": 18.421556886227545,
"grad_norm": 13.536568641662598,
"learning_rate": 2.8846153846153845e-06,
"loss": 0.2461,
"step": 3850
},
{
"epoch": 18.46946107784431,
"grad_norm": 63.02874755859375,
"learning_rate": 2.8632478632478635e-06,
"loss": 0.3511,
"step": 3860
},
{
"epoch": 18.517365269461077,
"grad_norm": 98.7140884399414,
"learning_rate": 2.841880341880342e-06,
"loss": 0.2256,
"step": 3870
},
{
"epoch": 18.565269461077843,
"grad_norm": 35.94133758544922,
"learning_rate": 2.8205128205128207e-06,
"loss": 0.1746,
"step": 3880
},
{
"epoch": 18.61317365269461,
"grad_norm": 87.98931121826172,
"learning_rate": 2.7991452991452993e-06,
"loss": 0.5013,
"step": 3890
},
{
"epoch": 18.66107784431138,
"grad_norm": 1.2747302055358887,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.3938,
"step": 3900
},
{
"epoch": 18.708982035928145,
"grad_norm": 0.7769640684127808,
"learning_rate": 2.756410256410257e-06,
"loss": 0.5851,
"step": 3910
},
{
"epoch": 18.75688622754491,
"grad_norm": 65.76081085205078,
"learning_rate": 2.7350427350427355e-06,
"loss": 0.4233,
"step": 3920
},
{
"epoch": 18.804790419161677,
"grad_norm": 4.068265438079834,
"learning_rate": 2.7136752136752137e-06,
"loss": 0.3455,
"step": 3930
},
{
"epoch": 18.852694610778443,
"grad_norm": 1912.1357421875,
"learning_rate": 2.6923076923076923e-06,
"loss": 0.6044,
"step": 3940
},
{
"epoch": 18.90059880239521,
"grad_norm": 20.534839630126953,
"learning_rate": 2.670940170940171e-06,
"loss": 0.1716,
"step": 3950
},
{
"epoch": 18.948502994011974,
"grad_norm": 77.07267761230469,
"learning_rate": 2.64957264957265e-06,
"loss": 0.2453,
"step": 3960
},
{
"epoch": 18.996407185628744,
"grad_norm": 89.15924072265625,
"learning_rate": 2.6282051282051286e-06,
"loss": 0.5155,
"step": 3970
},
{
"epoch": 19.0,
"eval_loss": 0.7802789807319641,
"eval_macro_f1": 0.8321765860916699,
"eval_runtime": 41.5052,
"eval_samples_per_second": 40.26,
"eval_steps_per_second": 5.036,
"step": 3971
},
{
"epoch": 19.04311377245509,
"grad_norm": 119.42920684814453,
"learning_rate": 2.606837606837607e-06,
"loss": 0.4741,
"step": 3980
},
{
"epoch": 19.091017964071856,
"grad_norm": 48.56228256225586,
"learning_rate": 2.5854700854700858e-06,
"loss": 0.3726,
"step": 3990
},
{
"epoch": 19.138922155688622,
"grad_norm": 22.457124710083008,
"learning_rate": 2.564102564102564e-06,
"loss": 0.3174,
"step": 4000
},
{
"epoch": 19.186826347305388,
"grad_norm": 56.32841873168945,
"learning_rate": 2.542735042735043e-06,
"loss": 0.2022,
"step": 4010
},
{
"epoch": 19.234730538922157,
"grad_norm": 28.47975730895996,
"learning_rate": 2.5213675213675216e-06,
"loss": 0.2525,
"step": 4020
},
{
"epoch": 19.282634730538923,
"grad_norm": 17.601634979248047,
"learning_rate": 2.5e-06,
"loss": 0.2316,
"step": 4030
},
{
"epoch": 19.33053892215569,
"grad_norm": 47.431663513183594,
"learning_rate": 2.478632478632479e-06,
"loss": 0.3101,
"step": 4040
},
{
"epoch": 19.378443113772455,
"grad_norm": 77.91453552246094,
"learning_rate": 2.4572649572649574e-06,
"loss": 0.4441,
"step": 4050
},
{
"epoch": 19.42634730538922,
"grad_norm": 78.02772521972656,
"learning_rate": 2.435897435897436e-06,
"loss": 0.3817,
"step": 4060
},
{
"epoch": 19.474251497005987,
"grad_norm": 20.506948471069336,
"learning_rate": 2.4145299145299146e-06,
"loss": 0.2292,
"step": 4070
},
{
"epoch": 19.522155688622753,
"grad_norm": 16.479249954223633,
"learning_rate": 2.3931623931623937e-06,
"loss": 0.5324,
"step": 4080
},
{
"epoch": 19.57005988023952,
"grad_norm": 30.338481903076172,
"learning_rate": 2.371794871794872e-06,
"loss": 0.2052,
"step": 4090
},
{
"epoch": 19.61796407185629,
"grad_norm": 57.33430480957031,
"learning_rate": 2.3504273504273504e-06,
"loss": 0.2965,
"step": 4100
},
{
"epoch": 19.665868263473055,
"grad_norm": 52.898719787597656,
"learning_rate": 2.3290598290598295e-06,
"loss": 0.2153,
"step": 4110
},
{
"epoch": 19.71377245508982,
"grad_norm": 13.877717971801758,
"learning_rate": 2.307692307692308e-06,
"loss": 0.1302,
"step": 4120
},
{
"epoch": 19.761676646706587,
"grad_norm": 82.69320678710938,
"learning_rate": 2.2863247863247863e-06,
"loss": 0.1914,
"step": 4130
},
{
"epoch": 19.809580838323352,
"grad_norm": 69.18468475341797,
"learning_rate": 2.2649572649572653e-06,
"loss": 0.2875,
"step": 4140
},
{
"epoch": 19.85748502994012,
"grad_norm": 60.78540802001953,
"learning_rate": 2.243589743589744e-06,
"loss": 0.1801,
"step": 4150
},
{
"epoch": 19.905389221556888,
"grad_norm": 32.22920227050781,
"learning_rate": 2.222222222222222e-06,
"loss": 0.3202,
"step": 4160
},
{
"epoch": 19.953293413173654,
"grad_norm": 65.0261459350586,
"learning_rate": 2.200854700854701e-06,
"loss": 0.1849,
"step": 4170
},
{
"epoch": 20.0,
"grad_norm": 0.3435074985027313,
"learning_rate": 2.1794871794871797e-06,
"loss": 0.3044,
"step": 4180
},
{
"epoch": 20.0,
"eval_loss": 0.7713411450386047,
"eval_macro_f1": 0.844977734683617,
"eval_runtime": 50.3873,
"eval_samples_per_second": 33.163,
"eval_steps_per_second": 4.148,
"step": 4180
},
{
"epoch": 20.047904191616766,
"grad_norm": 40.881629943847656,
"learning_rate": 2.1581196581196583e-06,
"loss": 0.3328,
"step": 4190
},
{
"epoch": 20.095808383233532,
"grad_norm": 19.17068862915039,
"learning_rate": 2.136752136752137e-06,
"loss": 0.3443,
"step": 4200
},
{
"epoch": 20.143712574850298,
"grad_norm": 33.69413375854492,
"learning_rate": 2.1153846153846155e-06,
"loss": 0.3359,
"step": 4210
},
{
"epoch": 20.191616766467067,
"grad_norm": 17.18857765197754,
"learning_rate": 2.094017094017094e-06,
"loss": 0.3028,
"step": 4220
},
{
"epoch": 20.239520958083833,
"grad_norm": 57.64948272705078,
"learning_rate": 2.072649572649573e-06,
"loss": 0.3474,
"step": 4230
},
{
"epoch": 20.2874251497006,
"grad_norm": 0.19221803545951843,
"learning_rate": 2.0512820512820513e-06,
"loss": 0.3006,
"step": 4240
},
{
"epoch": 20.335329341317365,
"grad_norm": 7.332968235015869,
"learning_rate": 2.02991452991453e-06,
"loss": 0.3067,
"step": 4250
},
{
"epoch": 20.38323353293413,
"grad_norm": 6.971332550048828,
"learning_rate": 2.008547008547009e-06,
"loss": 0.2523,
"step": 4260
},
{
"epoch": 20.431137724550897,
"grad_norm": 16.12879180908203,
"learning_rate": 1.987179487179487e-06,
"loss": 0.1617,
"step": 4270
},
{
"epoch": 20.479041916167663,
"grad_norm": 4.716946125030518,
"learning_rate": 1.9658119658119658e-06,
"loss": 0.2161,
"step": 4280
},
{
"epoch": 20.526946107784433,
"grad_norm": 29.626829147338867,
"learning_rate": 1.944444444444445e-06,
"loss": 0.2236,
"step": 4290
},
{
"epoch": 20.5748502994012,
"grad_norm": 5.962003231048584,
"learning_rate": 1.9230769230769234e-06,
"loss": 0.1348,
"step": 4300
},
{
"epoch": 20.622754491017965,
"grad_norm": 22.97109603881836,
"learning_rate": 1.9017094017094018e-06,
"loss": 0.3083,
"step": 4310
},
{
"epoch": 20.67065868263473,
"grad_norm": 45.447818756103516,
"learning_rate": 1.8803418803418804e-06,
"loss": 0.3335,
"step": 4320
},
{
"epoch": 20.718562874251496,
"grad_norm": 205.6171417236328,
"learning_rate": 1.8589743589743592e-06,
"loss": 0.3472,
"step": 4330
},
{
"epoch": 20.766467065868262,
"grad_norm": 41.59774398803711,
"learning_rate": 1.8376068376068378e-06,
"loss": 0.2514,
"step": 4340
},
{
"epoch": 20.81437125748503,
"grad_norm": 37.663997650146484,
"learning_rate": 1.8162393162393164e-06,
"loss": 0.3328,
"step": 4350
},
{
"epoch": 20.862275449101798,
"grad_norm": 5.474306106567383,
"learning_rate": 1.794871794871795e-06,
"loss": 0.3874,
"step": 4360
},
{
"epoch": 20.910179640718564,
"grad_norm": 17.301982879638672,
"learning_rate": 1.7735042735042736e-06,
"loss": 0.2811,
"step": 4370
},
{
"epoch": 20.95808383233533,
"grad_norm": 51.472537994384766,
"learning_rate": 1.7521367521367522e-06,
"loss": 0.4697,
"step": 4380
},
{
"epoch": 21.0,
"eval_loss": 0.7450286746025085,
"eval_macro_f1": 0.827007153949272,
"eval_runtime": 57.1204,
"eval_samples_per_second": 29.254,
"eval_steps_per_second": 3.659,
"step": 4389
},
{
"epoch": 21.004790419161676,
"grad_norm": 86.52445983886719,
"learning_rate": 1.7307692307692308e-06,
"loss": 0.4239,
"step": 4390
},
{
"epoch": 21.052694610778442,
"grad_norm": 68.6434326171875,
"learning_rate": 1.7094017094017097e-06,
"loss": 0.3987,
"step": 4400
},
{
"epoch": 21.10059880239521,
"grad_norm": 45.44715881347656,
"learning_rate": 1.6880341880341883e-06,
"loss": 0.2822,
"step": 4410
},
{
"epoch": 21.148502994011977,
"grad_norm": 104.67868041992188,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.1772,
"step": 4420
},
{
"epoch": 21.196407185628743,
"grad_norm": 35.29579162597656,
"learning_rate": 1.6452991452991455e-06,
"loss": 0.2434,
"step": 4430
},
{
"epoch": 21.24431137724551,
"grad_norm": 77.53510284423828,
"learning_rate": 1.623931623931624e-06,
"loss": 0.2378,
"step": 4440
},
{
"epoch": 21.292215568862275,
"grad_norm": 67.75172424316406,
"learning_rate": 1.602564102564103e-06,
"loss": 0.3755,
"step": 4450
},
{
"epoch": 21.34011976047904,
"grad_norm": 8.150789260864258,
"learning_rate": 1.5811965811965813e-06,
"loss": 0.2303,
"step": 4460
},
{
"epoch": 21.388023952095807,
"grad_norm": 57.804100036621094,
"learning_rate": 1.55982905982906e-06,
"loss": 0.3705,
"step": 4470
},
{
"epoch": 21.435928143712573,
"grad_norm": 12.233683586120605,
"learning_rate": 1.5384615384615387e-06,
"loss": 0.1344,
"step": 4480
},
{
"epoch": 21.483832335329343,
"grad_norm": 48.35832595825195,
"learning_rate": 1.5170940170940171e-06,
"loss": 0.1988,
"step": 4490
},
{
"epoch": 21.53173652694611,
"grad_norm": 112.48056030273438,
"learning_rate": 1.4957264957264957e-06,
"loss": 0.2977,
"step": 4500
},
{
"epoch": 21.579640718562874,
"grad_norm": 97.60857391357422,
"learning_rate": 1.4743589743589745e-06,
"loss": 0.2382,
"step": 4510
},
{
"epoch": 21.62754491017964,
"grad_norm": 4.879365921020508,
"learning_rate": 1.4529914529914531e-06,
"loss": 0.2374,
"step": 4520
},
{
"epoch": 21.675449101796406,
"grad_norm": 60.12843704223633,
"learning_rate": 1.4316239316239317e-06,
"loss": 0.1913,
"step": 4530
},
{
"epoch": 21.723353293413172,
"grad_norm": 39.34722900390625,
"learning_rate": 1.4102564102564104e-06,
"loss": 0.1366,
"step": 4540
},
{
"epoch": 21.771257485029942,
"grad_norm": 0.2388223111629486,
"learning_rate": 1.3888888888888892e-06,
"loss": 0.1015,
"step": 4550
},
{
"epoch": 21.819161676646708,
"grad_norm": 63.66411209106445,
"learning_rate": 1.3675213675213678e-06,
"loss": 0.3127,
"step": 4560
},
{
"epoch": 21.867065868263474,
"grad_norm": 4.219015598297119,
"learning_rate": 1.3461538461538462e-06,
"loss": 0.3047,
"step": 4570
},
{
"epoch": 21.91497005988024,
"grad_norm": 55.83110427856445,
"learning_rate": 1.324786324786325e-06,
"loss": 0.2811,
"step": 4580
},
{
"epoch": 21.962874251497006,
"grad_norm": 34.60869216918945,
"learning_rate": 1.3034188034188036e-06,
"loss": 0.1733,
"step": 4590
},
{
"epoch": 22.0,
"eval_loss": 0.745612621307373,
"eval_macro_f1": 0.8567568925713644,
"eval_runtime": 53.3616,
"eval_samples_per_second": 31.315,
"eval_steps_per_second": 3.917,
"step": 4598
}
],
"logging_steps": 10,
"max_steps": 5200,
"num_input_tokens_seen": 0,
"num_train_epochs": 25,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.273478692599549e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}