t5-query-expansion-model / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 2260,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04424778761061947,
"grad_norm": NaN,
"learning_rate": 0.0009973451327433627,
"loss": 14.6523,
"step": 10
},
{
"epoch": 0.08849557522123894,
"grad_norm": 0.9945968985557556,
"learning_rate": 0.000992920353982301,
"loss": 4.8948,
"step": 20
},
{
"epoch": 0.13274336283185842,
"grad_norm": 0.3495340645313263,
"learning_rate": 0.000988495575221239,
"loss": 0.6469,
"step": 30
},
{
"epoch": 0.17699115044247787,
"grad_norm": 0.22641977667808533,
"learning_rate": 0.000984070796460177,
"loss": 0.5221,
"step": 40
},
{
"epoch": 0.22123893805309736,
"grad_norm": 0.25233855843544006,
"learning_rate": 0.000979646017699115,
"loss": 0.4094,
"step": 50
},
{
"epoch": 0.26548672566371684,
"grad_norm": 0.37399861216545105,
"learning_rate": 0.0009752212389380531,
"loss": 0.3958,
"step": 60
},
{
"epoch": 0.30973451327433627,
"grad_norm": 0.18545609712600708,
"learning_rate": 0.0009707964601769911,
"loss": 0.3405,
"step": 70
},
{
"epoch": 0.35398230088495575,
"grad_norm": 0.2712928354740143,
"learning_rate": 0.0009663716814159293,
"loss": 0.3242,
"step": 80
},
{
"epoch": 0.39823008849557523,
"grad_norm": 0.2340475469827652,
"learning_rate": 0.0009619469026548673,
"loss": 0.3007,
"step": 90
},
{
"epoch": 0.4424778761061947,
"grad_norm": 0.18099136650562286,
"learning_rate": 0.0009575221238938053,
"loss": 0.2567,
"step": 100
},
{
"epoch": 0.48672566371681414,
"grad_norm": 0.23833367228507996,
"learning_rate": 0.0009530973451327434,
"loss": 0.2734,
"step": 110
},
{
"epoch": 0.5309734513274337,
"grad_norm": 0.20163732767105103,
"learning_rate": 0.0009486725663716814,
"loss": 0.2326,
"step": 120
},
{
"epoch": 0.5752212389380531,
"grad_norm": 0.1758851557970047,
"learning_rate": 0.0009442477876106195,
"loss": 0.2914,
"step": 130
},
{
"epoch": 0.6194690265486725,
"grad_norm": 0.211241215467453,
"learning_rate": 0.0009398230088495575,
"loss": 0.2667,
"step": 140
},
{
"epoch": 0.6637168141592921,
"grad_norm": 0.22571340203285217,
"learning_rate": 0.0009353982300884956,
"loss": 0.2268,
"step": 150
},
{
"epoch": 0.7079646017699115,
"grad_norm": 0.20469224452972412,
"learning_rate": 0.0009309734513274336,
"loss": 0.2386,
"step": 160
},
{
"epoch": 0.7522123893805309,
"grad_norm": 0.21183688938617706,
"learning_rate": 0.0009265486725663716,
"loss": 0.282,
"step": 170
},
{
"epoch": 0.7964601769911505,
"grad_norm": 0.17585916817188263,
"learning_rate": 0.0009221238938053097,
"loss": 0.3046,
"step": 180
},
{
"epoch": 0.8407079646017699,
"grad_norm": 0.17937427759170532,
"learning_rate": 0.0009176991150442479,
"loss": 0.2693,
"step": 190
},
{
"epoch": 0.8849557522123894,
"grad_norm": 0.19432350993156433,
"learning_rate": 0.0009132743362831859,
"loss": 0.252,
"step": 200
},
{
"epoch": 0.9292035398230089,
"grad_norm": 0.18185169994831085,
"learning_rate": 0.0009088495575221239,
"loss": 0.2793,
"step": 210
},
{
"epoch": 0.9734513274336283,
"grad_norm": 0.18515343964099884,
"learning_rate": 0.000904424778761062,
"loss": 0.2644,
"step": 220
},
{
"epoch": 1.0,
"eval_loss": 0.22543948888778687,
"eval_runtime": 3.0243,
"eval_samples_per_second": 33.066,
"eval_steps_per_second": 8.266,
"step": 226
},
{
"epoch": 1.0176991150442478,
"grad_norm": 0.2031005322933197,
"learning_rate": 0.0009000000000000001,
"loss": 0.2704,
"step": 230
},
{
"epoch": 1.0619469026548674,
"grad_norm": 0.26087555289268494,
"learning_rate": 0.0008955752212389381,
"loss": 0.2526,
"step": 240
},
{
"epoch": 1.1061946902654867,
"grad_norm": 0.1796620637178421,
"learning_rate": 0.0008911504424778761,
"loss": 0.2605,
"step": 250
},
{
"epoch": 1.1504424778761062,
"grad_norm": 0.22667303681373596,
"learning_rate": 0.0008867256637168141,
"loss": 0.261,
"step": 260
},
{
"epoch": 1.1946902654867257,
"grad_norm": 0.22089733183383942,
"learning_rate": 0.0008823008849557523,
"loss": 0.2762,
"step": 270
},
{
"epoch": 1.238938053097345,
"grad_norm": 0.19162122905254364,
"learning_rate": 0.0008778761061946903,
"loss": 0.2325,
"step": 280
},
{
"epoch": 1.2831858407079646,
"grad_norm": 0.1732087880373001,
"learning_rate": 0.0008734513274336283,
"loss": 0.2455,
"step": 290
},
{
"epoch": 1.3274336283185841,
"grad_norm": 0.15953731536865234,
"learning_rate": 0.0008690265486725663,
"loss": 0.2155,
"step": 300
},
{
"epoch": 1.3716814159292037,
"grad_norm": 0.229411318898201,
"learning_rate": 0.0008646017699115044,
"loss": 0.2289,
"step": 310
},
{
"epoch": 1.415929203539823,
"grad_norm": 0.20390523970127106,
"learning_rate": 0.0008601769911504425,
"loss": 0.2429,
"step": 320
},
{
"epoch": 1.4601769911504425,
"grad_norm": 0.23142680525779724,
"learning_rate": 0.0008557522123893805,
"loss": 0.2291,
"step": 330
},
{
"epoch": 1.504424778761062,
"grad_norm": 0.22689059376716614,
"learning_rate": 0.0008513274336283185,
"loss": 0.2369,
"step": 340
},
{
"epoch": 1.5486725663716814,
"grad_norm": 0.18759772181510925,
"learning_rate": 0.0008469026548672567,
"loss": 0.1887,
"step": 350
},
{
"epoch": 1.592920353982301,
"grad_norm": 0.17289893329143524,
"learning_rate": 0.0008424778761061948,
"loss": 0.2547,
"step": 360
},
{
"epoch": 1.6371681415929205,
"grad_norm": 0.20804202556610107,
"learning_rate": 0.0008380530973451328,
"loss": 0.2446,
"step": 370
},
{
"epoch": 1.6814159292035398,
"grad_norm": 0.2161918580532074,
"learning_rate": 0.0008336283185840708,
"loss": 0.2262,
"step": 380
},
{
"epoch": 1.7256637168141593,
"grad_norm": 0.27487823367118835,
"learning_rate": 0.0008292035398230089,
"loss": 0.2673,
"step": 390
},
{
"epoch": 1.7699115044247788,
"grad_norm": 0.20181554555892944,
"learning_rate": 0.0008247787610619469,
"loss": 0.252,
"step": 400
},
{
"epoch": 1.8141592920353982,
"grad_norm": 0.21222522854804993,
"learning_rate": 0.000820353982300885,
"loss": 0.23,
"step": 410
},
{
"epoch": 1.8584070796460177,
"grad_norm": 0.21409285068511963,
"learning_rate": 0.000815929203539823,
"loss": 0.235,
"step": 420
},
{
"epoch": 1.9026548672566372,
"grad_norm": 0.2830056846141815,
"learning_rate": 0.0008115044247787611,
"loss": 0.2335,
"step": 430
},
{
"epoch": 1.9469026548672566,
"grad_norm": 0.22915257513523102,
"learning_rate": 0.0008070796460176991,
"loss": 0.2303,
"step": 440
},
{
"epoch": 1.991150442477876,
"grad_norm": 0.19883762300014496,
"learning_rate": 0.0008026548672566371,
"loss": 0.2222,
"step": 450
},
{
"epoch": 2.0,
"eval_loss": 0.21643634140491486,
"eval_runtime": 2.7454,
"eval_samples_per_second": 36.424,
"eval_steps_per_second": 9.106,
"step": 452
},
{
"epoch": 2.0353982300884956,
"grad_norm": 0.2121458351612091,
"learning_rate": 0.0007982300884955752,
"loss": 0.2403,
"step": 460
},
{
"epoch": 2.079646017699115,
"grad_norm": 0.17018261551856995,
"learning_rate": 0.0007938053097345133,
"loss": 0.213,
"step": 470
},
{
"epoch": 2.1238938053097347,
"grad_norm": 0.22500459849834442,
"learning_rate": 0.0007893805309734513,
"loss": 0.2239,
"step": 480
},
{
"epoch": 2.168141592920354,
"grad_norm": 0.19334179162979126,
"learning_rate": 0.0007849557522123893,
"loss": 0.2106,
"step": 490
},
{
"epoch": 2.2123893805309733,
"grad_norm": 0.1906515508890152,
"learning_rate": 0.0007805309734513274,
"loss": 0.2037,
"step": 500
},
{
"epoch": 2.256637168141593,
"grad_norm": 0.2478450983762741,
"learning_rate": 0.0007761061946902656,
"loss": 0.2164,
"step": 510
},
{
"epoch": 2.3008849557522124,
"grad_norm": 0.2270224243402481,
"learning_rate": 0.0007716814159292036,
"loss": 0.2253,
"step": 520
},
{
"epoch": 2.3451327433628317,
"grad_norm": 0.2539624273777008,
"learning_rate": 0.0007672566371681416,
"loss": 0.2016,
"step": 530
},
{
"epoch": 2.3893805309734515,
"grad_norm": 0.33118170499801636,
"learning_rate": 0.0007628318584070797,
"loss": 0.2239,
"step": 540
},
{
"epoch": 2.433628318584071,
"grad_norm": 0.24022382497787476,
"learning_rate": 0.0007584070796460178,
"loss": 0.2339,
"step": 550
},
{
"epoch": 2.47787610619469,
"grad_norm": 0.22129379212856293,
"learning_rate": 0.0007539823008849558,
"loss": 0.2079,
"step": 560
},
{
"epoch": 2.52212389380531,
"grad_norm": 0.20302246510982513,
"learning_rate": 0.0007495575221238938,
"loss": 0.2012,
"step": 570
},
{
"epoch": 2.566371681415929,
"grad_norm": 0.28677117824554443,
"learning_rate": 0.0007451327433628319,
"loss": 0.2281,
"step": 580
},
{
"epoch": 2.6106194690265485,
"grad_norm": 0.2567579746246338,
"learning_rate": 0.0007407079646017699,
"loss": 0.2374,
"step": 590
},
{
"epoch": 2.6548672566371683,
"grad_norm": 0.2306365817785263,
"learning_rate": 0.000736283185840708,
"loss": 0.2144,
"step": 600
},
{
"epoch": 2.6991150442477876,
"grad_norm": 0.23293821513652802,
"learning_rate": 0.000731858407079646,
"loss": 0.2381,
"step": 610
},
{
"epoch": 2.7433628318584073,
"grad_norm": 0.2173946499824524,
"learning_rate": 0.0007274336283185841,
"loss": 0.2155,
"step": 620
},
{
"epoch": 2.7876106194690267,
"grad_norm": 0.30976563692092896,
"learning_rate": 0.0007230088495575221,
"loss": 0.2262,
"step": 630
},
{
"epoch": 2.831858407079646,
"grad_norm": 0.19489358365535736,
"learning_rate": 0.0007185840707964601,
"loss": 0.2194,
"step": 640
},
{
"epoch": 2.8761061946902657,
"grad_norm": 0.21821223199367523,
"learning_rate": 0.0007141592920353982,
"loss": 0.1967,
"step": 650
},
{
"epoch": 2.920353982300885,
"grad_norm": 0.23535631597042084,
"learning_rate": 0.0007097345132743363,
"loss": 0.2353,
"step": 660
},
{
"epoch": 2.9646017699115044,
"grad_norm": 0.20547734200954437,
"learning_rate": 0.0007053097345132744,
"loss": 0.2119,
"step": 670
},
{
"epoch": 3.0,
"eval_loss": 0.21383462846279144,
"eval_runtime": 2.6363,
"eval_samples_per_second": 37.932,
"eval_steps_per_second": 9.483,
"step": 678
},
{
"epoch": 3.0088495575221237,
"grad_norm": 0.21669970452785492,
"learning_rate": 0.0007008849557522124,
"loss": 0.2198,
"step": 680
},
{
"epoch": 3.0530973451327434,
"grad_norm": 0.20589256286621094,
"learning_rate": 0.0006964601769911505,
"loss": 0.2002,
"step": 690
},
{
"epoch": 3.0973451327433628,
"grad_norm": 0.23902471363544464,
"learning_rate": 0.0006920353982300886,
"loss": 0.1804,
"step": 700
},
{
"epoch": 3.1415929203539825,
"grad_norm": 0.2881176173686981,
"learning_rate": 0.0006876106194690266,
"loss": 0.2162,
"step": 710
},
{
"epoch": 3.185840707964602,
"grad_norm": 0.22364391386508942,
"learning_rate": 0.0006831858407079646,
"loss": 0.2185,
"step": 720
},
{
"epoch": 3.230088495575221,
"grad_norm": 0.23607216775417328,
"learning_rate": 0.0006787610619469026,
"loss": 0.2124,
"step": 730
},
{
"epoch": 3.274336283185841,
"grad_norm": 0.18838390707969666,
"learning_rate": 0.0006743362831858408,
"loss": 0.179,
"step": 740
},
{
"epoch": 3.3185840707964602,
"grad_norm": 0.3451661765575409,
"learning_rate": 0.0006699115044247788,
"loss": 0.2135,
"step": 750
},
{
"epoch": 3.3628318584070795,
"grad_norm": 0.2281007319688797,
"learning_rate": 0.0006654867256637168,
"loss": 0.2071,
"step": 760
},
{
"epoch": 3.4070796460176993,
"grad_norm": 0.20740865170955658,
"learning_rate": 0.0006610619469026548,
"loss": 0.2081,
"step": 770
},
{
"epoch": 3.4513274336283186,
"grad_norm": 0.27458012104034424,
"learning_rate": 0.0006566371681415929,
"loss": 0.2026,
"step": 780
},
{
"epoch": 3.495575221238938,
"grad_norm": 0.19083356857299805,
"learning_rate": 0.000652212389380531,
"loss": 0.1946,
"step": 790
},
{
"epoch": 3.5398230088495577,
"grad_norm": 0.2667248845100403,
"learning_rate": 0.000647787610619469,
"loss": 0.2141,
"step": 800
},
{
"epoch": 3.584070796460177,
"grad_norm": 0.22773493826389313,
"learning_rate": 0.000643362831858407,
"loss": 0.2294,
"step": 810
},
{
"epoch": 3.6283185840707963,
"grad_norm": 0.24344410002231598,
"learning_rate": 0.0006389380530973451,
"loss": 0.1799,
"step": 820
},
{
"epoch": 3.672566371681416,
"grad_norm": 0.3232133984565735,
"learning_rate": 0.0006345132743362833,
"loss": 0.1807,
"step": 830
},
{
"epoch": 3.7168141592920354,
"grad_norm": 0.22465798258781433,
"learning_rate": 0.0006300884955752213,
"loss": 0.2005,
"step": 840
},
{
"epoch": 3.7610619469026547,
"grad_norm": 0.24152274429798126,
"learning_rate": 0.0006256637168141594,
"loss": 0.2001,
"step": 850
},
{
"epoch": 3.8053097345132745,
"grad_norm": 0.2764975130558014,
"learning_rate": 0.0006212389380530974,
"loss": 0.1691,
"step": 860
},
{
"epoch": 3.849557522123894,
"grad_norm": 0.23789626359939575,
"learning_rate": 0.0006168141592920354,
"loss": 0.2318,
"step": 870
},
{
"epoch": 3.893805309734513,
"grad_norm": 0.21235798299312592,
"learning_rate": 0.0006123893805309735,
"loss": 0.1867,
"step": 880
},
{
"epoch": 3.938053097345133,
"grad_norm": 0.23083995282649994,
"learning_rate": 0.0006079646017699116,
"loss": 0.2135,
"step": 890
},
{
"epoch": 3.982300884955752,
"grad_norm": 0.22863389551639557,
"learning_rate": 0.0006035398230088496,
"loss": 0.2188,
"step": 900
},
{
"epoch": 4.0,
"eval_loss": 0.20991046726703644,
"eval_runtime": 2.9553,
"eval_samples_per_second": 33.837,
"eval_steps_per_second": 8.459,
"step": 904
},
{
"epoch": 4.0265486725663715,
"grad_norm": 0.22170217335224152,
"learning_rate": 0.0005991150442477876,
"loss": 0.2186,
"step": 910
},
{
"epoch": 4.070796460176991,
"grad_norm": 0.2190970778465271,
"learning_rate": 0.0005946902654867256,
"loss": 0.1978,
"step": 920
},
{
"epoch": 4.115044247787611,
"grad_norm": 0.1924510896205902,
"learning_rate": 0.0005902654867256638,
"loss": 0.1787,
"step": 930
},
{
"epoch": 4.15929203539823,
"grad_norm": 0.2868868112564087,
"learning_rate": 0.0005858407079646018,
"loss": 0.172,
"step": 940
},
{
"epoch": 4.20353982300885,
"grad_norm": 0.18888860940933228,
"learning_rate": 0.0005814159292035398,
"loss": 0.1761,
"step": 950
},
{
"epoch": 4.247787610619469,
"grad_norm": 0.21858586370944977,
"learning_rate": 0.0005769911504424778,
"loss": 0.1871,
"step": 960
},
{
"epoch": 4.292035398230088,
"grad_norm": 0.305698961019516,
"learning_rate": 0.0005725663716814159,
"loss": 0.1886,
"step": 970
},
{
"epoch": 4.336283185840708,
"grad_norm": 0.23597249388694763,
"learning_rate": 0.000568141592920354,
"loss": 0.1865,
"step": 980
},
{
"epoch": 4.380530973451328,
"grad_norm": 0.271823912858963,
"learning_rate": 0.0005637168141592921,
"loss": 0.1709,
"step": 990
},
{
"epoch": 4.424778761061947,
"grad_norm": 0.19630669057369232,
"learning_rate": 0.0005592920353982301,
"loss": 0.2429,
"step": 1000
},
{
"epoch": 4.469026548672566,
"grad_norm": 0.29825878143310547,
"learning_rate": 0.0005548672566371682,
"loss": 0.1879,
"step": 1010
},
{
"epoch": 4.513274336283186,
"grad_norm": 0.21552462875843048,
"learning_rate": 0.0005504424778761063,
"loss": 0.1905,
"step": 1020
},
{
"epoch": 4.557522123893805,
"grad_norm": 0.28668805956840515,
"learning_rate": 0.0005460176991150443,
"loss": 0.1951,
"step": 1030
},
{
"epoch": 4.601769911504425,
"grad_norm": 0.27180853486061096,
"learning_rate": 0.0005415929203539823,
"loss": 0.1758,
"step": 1040
},
{
"epoch": 4.646017699115045,
"grad_norm": 0.3072490394115448,
"learning_rate": 0.0005371681415929204,
"loss": 0.1852,
"step": 1050
},
{
"epoch": 4.6902654867256635,
"grad_norm": 0.2913398742675781,
"learning_rate": 0.0005327433628318584,
"loss": 0.201,
"step": 1060
},
{
"epoch": 4.734513274336283,
"grad_norm": 0.29055866599082947,
"learning_rate": 0.0005283185840707965,
"loss": 0.1932,
"step": 1070
},
{
"epoch": 4.778761061946903,
"grad_norm": 0.2742849290370941,
"learning_rate": 0.0005238938053097345,
"loss": 0.183,
"step": 1080
},
{
"epoch": 4.823008849557522,
"grad_norm": 0.2370535433292389,
"learning_rate": 0.0005194690265486726,
"loss": 0.1849,
"step": 1090
},
{
"epoch": 4.867256637168142,
"grad_norm": 0.31343671679496765,
"learning_rate": 0.0005150442477876106,
"loss": 0.2195,
"step": 1100
},
{
"epoch": 4.911504424778761,
"grad_norm": 0.3136596381664276,
"learning_rate": 0.0005106194690265486,
"loss": 0.1907,
"step": 1110
},
{
"epoch": 4.95575221238938,
"grad_norm": 0.2071835845708847,
"learning_rate": 0.0005061946902654867,
"loss": 0.1969,
"step": 1120
},
{
"epoch": 5.0,
"grad_norm": 0.25057336688041687,
"learning_rate": 0.0005017699115044248,
"loss": 0.1916,
"step": 1130
},
{
"epoch": 5.0,
"eval_loss": 0.21029528975486755,
"eval_runtime": 2.628,
"eval_samples_per_second": 38.052,
"eval_steps_per_second": 9.513,
"step": 1130
},
{
"epoch": 5.04424778761062,
"grad_norm": 0.21927224099636078,
"learning_rate": 0.0004973451327433628,
"loss": 0.155,
"step": 1140
},
{
"epoch": 5.088495575221239,
"grad_norm": 0.3175056576728821,
"learning_rate": 0.0004929203539823009,
"loss": 0.189,
"step": 1150
},
{
"epoch": 5.132743362831858,
"grad_norm": 0.2786344587802887,
"learning_rate": 0.0004884955752212389,
"loss": 0.1679,
"step": 1160
},
{
"epoch": 5.176991150442478,
"grad_norm": 0.2475520521402359,
"learning_rate": 0.00048407079646017696,
"loss": 0.1855,
"step": 1170
},
{
"epoch": 5.221238938053097,
"grad_norm": 0.24603202939033508,
"learning_rate": 0.00047964601769911504,
"loss": 0.1755,
"step": 1180
},
{
"epoch": 5.265486725663717,
"grad_norm": 0.26339662075042725,
"learning_rate": 0.00047522123893805305,
"loss": 0.1644,
"step": 1190
},
{
"epoch": 5.3097345132743365,
"grad_norm": 0.20065292716026306,
"learning_rate": 0.0004707964601769912,
"loss": 0.1555,
"step": 1200
},
{
"epoch": 5.353982300884955,
"grad_norm": 0.34847521781921387,
"learning_rate": 0.00046637168141592925,
"loss": 0.1644,
"step": 1210
},
{
"epoch": 5.398230088495575,
"grad_norm": 0.41893231868743896,
"learning_rate": 0.00046194690265486727,
"loss": 0.1661,
"step": 1220
},
{
"epoch": 5.442477876106195,
"grad_norm": 0.2889445424079895,
"learning_rate": 0.00045752212389380535,
"loss": 0.1924,
"step": 1230
},
{
"epoch": 5.486725663716814,
"grad_norm": 0.24809350073337555,
"learning_rate": 0.00045309734513274336,
"loss": 0.1941,
"step": 1240
},
{
"epoch": 5.530973451327434,
"grad_norm": 0.27125945687294006,
"learning_rate": 0.00044867256637168144,
"loss": 0.1731,
"step": 1250
},
{
"epoch": 5.575221238938053,
"grad_norm": 0.3384355902671814,
"learning_rate": 0.00044424778761061946,
"loss": 0.164,
"step": 1260
},
{
"epoch": 5.619469026548672,
"grad_norm": 0.3089454174041748,
"learning_rate": 0.00043982300884955753,
"loss": 0.1823,
"step": 1270
},
{
"epoch": 5.663716814159292,
"grad_norm": 0.26540765166282654,
"learning_rate": 0.0004353982300884956,
"loss": 0.1762,
"step": 1280
},
{
"epoch": 5.707964601769912,
"grad_norm": 0.22383682429790497,
"learning_rate": 0.0004309734513274337,
"loss": 0.2063,
"step": 1290
},
{
"epoch": 5.752212389380531,
"grad_norm": 0.24541282653808594,
"learning_rate": 0.0004265486725663717,
"loss": 0.1799,
"step": 1300
},
{
"epoch": 5.79646017699115,
"grad_norm": 0.33302921056747437,
"learning_rate": 0.00042212389380530976,
"loss": 0.1749,
"step": 1310
},
{
"epoch": 5.84070796460177,
"grad_norm": 0.274087131023407,
"learning_rate": 0.0004176991150442478,
"loss": 0.1982,
"step": 1320
},
{
"epoch": 5.88495575221239,
"grad_norm": 0.3344975411891937,
"learning_rate": 0.00041327433628318586,
"loss": 0.1962,
"step": 1330
},
{
"epoch": 5.929203539823009,
"grad_norm": 0.28589603304862976,
"learning_rate": 0.0004088495575221239,
"loss": 0.2078,
"step": 1340
},
{
"epoch": 5.9734513274336285,
"grad_norm": 0.18417391180992126,
"learning_rate": 0.00040442477876106195,
"loss": 0.1806,
"step": 1350
},
{
"epoch": 6.0,
"eval_loss": 0.20804466307163239,
"eval_runtime": 2.6659,
"eval_samples_per_second": 37.511,
"eval_steps_per_second": 9.378,
"step": 1356
},
{
"epoch": 6.017699115044247,
"grad_norm": 0.24382148683071136,
"learning_rate": 0.0004,
"loss": 0.1675,
"step": 1360
},
{
"epoch": 6.061946902654867,
"grad_norm": 0.2718934714794159,
"learning_rate": 0.0003955752212389381,
"loss": 0.1546,
"step": 1370
},
{
"epoch": 6.106194690265487,
"grad_norm": 0.321180135011673,
"learning_rate": 0.0003911504424778761,
"loss": 0.1828,
"step": 1380
},
{
"epoch": 6.150442477876107,
"grad_norm": 0.31438615918159485,
"learning_rate": 0.0003867256637168142,
"loss": 0.1793,
"step": 1390
},
{
"epoch": 6.1946902654867255,
"grad_norm": 0.24199295043945312,
"learning_rate": 0.0003823008849557522,
"loss": 0.1627,
"step": 1400
},
{
"epoch": 6.238938053097345,
"grad_norm": 0.3219399154186249,
"learning_rate": 0.0003778761061946903,
"loss": 0.1557,
"step": 1410
},
{
"epoch": 6.283185840707965,
"grad_norm": 0.20730754733085632,
"learning_rate": 0.0003734513274336283,
"loss": 0.1728,
"step": 1420
},
{
"epoch": 6.327433628318584,
"grad_norm": 0.30667644739151,
"learning_rate": 0.00036902654867256637,
"loss": 0.1601,
"step": 1430
},
{
"epoch": 6.371681415929204,
"grad_norm": 0.364202082157135,
"learning_rate": 0.00036460176991150444,
"loss": 0.166,
"step": 1440
},
{
"epoch": 6.415929203539823,
"grad_norm": 0.2910124659538269,
"learning_rate": 0.0003601769911504425,
"loss": 0.18,
"step": 1450
},
{
"epoch": 6.460176991150442,
"grad_norm": 0.3251543939113617,
"learning_rate": 0.00035575221238938053,
"loss": 0.1666,
"step": 1460
},
{
"epoch": 6.504424778761062,
"grad_norm": 0.31853803992271423,
"learning_rate": 0.0003513274336283186,
"loss": 0.1683,
"step": 1470
},
{
"epoch": 6.548672566371682,
"grad_norm": 0.3730286657810211,
"learning_rate": 0.0003469026548672566,
"loss": 0.163,
"step": 1480
},
{
"epoch": 6.592920353982301,
"grad_norm": 0.3070693910121918,
"learning_rate": 0.0003424778761061947,
"loss": 0.1492,
"step": 1490
},
{
"epoch": 6.6371681415929205,
"grad_norm": 0.25525256991386414,
"learning_rate": 0.0003380530973451327,
"loss": 0.1587,
"step": 1500
},
{
"epoch": 6.68141592920354,
"grad_norm": 0.34361934661865234,
"learning_rate": 0.0003336283185840708,
"loss": 0.161,
"step": 1510
},
{
"epoch": 6.725663716814159,
"grad_norm": 0.2400776594877243,
"learning_rate": 0.00032920353982300886,
"loss": 0.1534,
"step": 1520
},
{
"epoch": 6.769911504424779,
"grad_norm": 0.3599693477153778,
"learning_rate": 0.00032477876106194693,
"loss": 0.1699,
"step": 1530
},
{
"epoch": 6.814159292035399,
"grad_norm": 0.26774442195892334,
"learning_rate": 0.00032035398230088495,
"loss": 0.1567,
"step": 1540
},
{
"epoch": 6.8584070796460175,
"grad_norm": 0.32396429777145386,
"learning_rate": 0.000315929203539823,
"loss": 0.1929,
"step": 1550
},
{
"epoch": 6.902654867256637,
"grad_norm": 0.3491114377975464,
"learning_rate": 0.00031150442477876104,
"loss": 0.1784,
"step": 1560
},
{
"epoch": 6.946902654867257,
"grad_norm": 0.372086763381958,
"learning_rate": 0.0003070796460176991,
"loss": 0.193,
"step": 1570
},
{
"epoch": 6.991150442477876,
"grad_norm": 0.2936050593852997,
"learning_rate": 0.00030265486725663713,
"loss": 0.1899,
"step": 1580
},
{
"epoch": 7.0,
"eval_loss": 0.20992980897426605,
"eval_runtime": 3.174,
"eval_samples_per_second": 31.506,
"eval_steps_per_second": 7.877,
"step": 1582
},
{
"epoch": 7.035398230088496,
"grad_norm": 0.3688855767250061,
"learning_rate": 0.0002982300884955752,
"loss": 0.1813,
"step": 1590
},
{
"epoch": 7.079646017699115,
"grad_norm": 0.32831940054893494,
"learning_rate": 0.00029380530973451333,
"loss": 0.1472,
"step": 1600
},
{
"epoch": 7.123893805309734,
"grad_norm": 0.32714003324508667,
"learning_rate": 0.00028938053097345135,
"loss": 0.1704,
"step": 1610
},
{
"epoch": 7.168141592920354,
"grad_norm": 0.49076274037361145,
"learning_rate": 0.0002849557522123894,
"loss": 0.1559,
"step": 1620
},
{
"epoch": 7.212389380530974,
"grad_norm": 0.2076297253370285,
"learning_rate": 0.00028053097345132744,
"loss": 0.1571,
"step": 1630
},
{
"epoch": 7.256637168141593,
"grad_norm": 0.30924052000045776,
"learning_rate": 0.0002761061946902655,
"loss": 0.1497,
"step": 1640
},
{
"epoch": 7.300884955752212,
"grad_norm": 0.29587677121162415,
"learning_rate": 0.00027168141592920353,
"loss": 0.1506,
"step": 1650
},
{
"epoch": 7.345132743362832,
"grad_norm": 0.339077889919281,
"learning_rate": 0.0002672566371681416,
"loss": 0.152,
"step": 1660
},
{
"epoch": 7.389380530973451,
"grad_norm": 0.2390238344669342,
"learning_rate": 0.0002628318584070796,
"loss": 0.1634,
"step": 1670
},
{
"epoch": 7.433628318584071,
"grad_norm": 0.3401966392993927,
"learning_rate": 0.00025840707964601775,
"loss": 0.1437,
"step": 1680
},
{
"epoch": 7.477876106194691,
"grad_norm": 0.3273468017578125,
"learning_rate": 0.00025398230088495577,
"loss": 0.1421,
"step": 1690
},
{
"epoch": 7.522123893805309,
"grad_norm": 0.2576355040073395,
"learning_rate": 0.00024955752212389384,
"loss": 0.1606,
"step": 1700
},
{
"epoch": 7.566371681415929,
"grad_norm": 0.3079942464828491,
"learning_rate": 0.00024513274336283186,
"loss": 0.1662,
"step": 1710
},
{
"epoch": 7.610619469026549,
"grad_norm": 0.35095077753067017,
"learning_rate": 0.0002407079646017699,
"loss": 0.1449,
"step": 1720
},
{
"epoch": 7.654867256637168,
"grad_norm": 0.2713673412799835,
"learning_rate": 0.00023628318584070795,
"loss": 0.1666,
"step": 1730
},
{
"epoch": 7.699115044247788,
"grad_norm": 0.3343076705932617,
"learning_rate": 0.00023185840707964602,
"loss": 0.1657,
"step": 1740
},
{
"epoch": 7.743362831858407,
"grad_norm": 0.27280741930007935,
"learning_rate": 0.00022743362831858407,
"loss": 0.1584,
"step": 1750
},
{
"epoch": 7.787610619469026,
"grad_norm": 0.3658842146396637,
"learning_rate": 0.0002230088495575221,
"loss": 0.178,
"step": 1760
},
{
"epoch": 7.831858407079646,
"grad_norm": 0.2327466607093811,
"learning_rate": 0.00021858407079646016,
"loss": 0.1394,
"step": 1770
},
{
"epoch": 7.876106194690266,
"grad_norm": 0.2981870174407959,
"learning_rate": 0.00021415929203539826,
"loss": 0.1555,
"step": 1780
},
{
"epoch": 7.920353982300885,
"grad_norm": 0.32251453399658203,
"learning_rate": 0.0002097345132743363,
"loss": 0.1817,
"step": 1790
},
{
"epoch": 7.964601769911504,
"grad_norm": 0.34020307660102844,
"learning_rate": 0.00020530973451327435,
"loss": 0.1667,
"step": 1800
},
{
"epoch": 8.0,
"eval_loss": 0.2127797156572342,
"eval_runtime": 2.6346,
"eval_samples_per_second": 37.957,
"eval_steps_per_second": 9.489,
"step": 1808
},
{
"epoch": 8.008849557522124,
"grad_norm": 0.2688687741756439,
"learning_rate": 0.0002008849557522124,
"loss": 0.1726,
"step": 1810
},
{
"epoch": 8.053097345132743,
"grad_norm": 0.26508933305740356,
"learning_rate": 0.00019646017699115047,
"loss": 0.1573,
"step": 1820
},
{
"epoch": 8.097345132743364,
"grad_norm": 0.38828426599502563,
"learning_rate": 0.0001920353982300885,
"loss": 0.1593,
"step": 1830
},
{
"epoch": 8.141592920353983,
"grad_norm": 0.28579315543174744,
"learning_rate": 0.00018761061946902656,
"loss": 0.139,
"step": 1840
},
{
"epoch": 8.185840707964601,
"grad_norm": 0.29282671213150024,
"learning_rate": 0.0001831858407079646,
"loss": 0.1576,
"step": 1850
},
{
"epoch": 8.230088495575222,
"grad_norm": 0.39632460474967957,
"learning_rate": 0.00017876106194690268,
"loss": 0.1599,
"step": 1860
},
{
"epoch": 8.274336283185841,
"grad_norm": 0.8853453993797302,
"learning_rate": 0.00017433628318584072,
"loss": 0.1415,
"step": 1870
},
{
"epoch": 8.31858407079646,
"grad_norm": 0.28350165486335754,
"learning_rate": 0.00016991150442477877,
"loss": 0.1601,
"step": 1880
},
{
"epoch": 8.36283185840708,
"grad_norm": 0.32908403873443604,
"learning_rate": 0.00016548672566371681,
"loss": 0.1502,
"step": 1890
},
{
"epoch": 8.4070796460177,
"grad_norm": 0.26707422733306885,
"learning_rate": 0.0001610619469026549,
"loss": 0.144,
"step": 1900
},
{
"epoch": 8.451327433628318,
"grad_norm": 0.2607186436653137,
"learning_rate": 0.00015663716814159293,
"loss": 0.1497,
"step": 1910
},
{
"epoch": 8.495575221238939,
"grad_norm": 0.3008362650871277,
"learning_rate": 0.00015221238938053098,
"loss": 0.1519,
"step": 1920
},
{
"epoch": 8.539823008849558,
"grad_norm": 0.3770766854286194,
"learning_rate": 0.00014778761061946902,
"loss": 0.1486,
"step": 1930
},
{
"epoch": 8.584070796460177,
"grad_norm": 0.24154478311538696,
"learning_rate": 0.0001433628318584071,
"loss": 0.1504,
"step": 1940
},
{
"epoch": 8.628318584070797,
"grad_norm": 0.28921449184417725,
"learning_rate": 0.00013893805309734514,
"loss": 0.1636,
"step": 1950
},
{
"epoch": 8.672566371681416,
"grad_norm": 0.32194775342941284,
"learning_rate": 0.0001345132743362832,
"loss": 0.1746,
"step": 1960
},
{
"epoch": 8.716814159292035,
"grad_norm": 0.2882642149925232,
"learning_rate": 0.00013008849557522123,
"loss": 0.1305,
"step": 1970
},
{
"epoch": 8.761061946902656,
"grad_norm": 0.30995509028434753,
"learning_rate": 0.0001256637168141593,
"loss": 0.1484,
"step": 1980
},
{
"epoch": 8.805309734513274,
"grad_norm": 0.32381975650787354,
"learning_rate": 0.00012123893805309735,
"loss": 0.1657,
"step": 1990
},
{
"epoch": 8.849557522123893,
"grad_norm": 0.22391530871391296,
"learning_rate": 0.0001168141592920354,
"loss": 0.1247,
"step": 2000
},
{
"epoch": 8.893805309734514,
"grad_norm": 0.23185725510120392,
"learning_rate": 0.00011238938053097346,
"loss": 0.153,
"step": 2010
},
{
"epoch": 8.938053097345133,
"grad_norm": 0.27952226996421814,
"learning_rate": 0.0001079646017699115,
"loss": 0.1621,
"step": 2020
},
{
"epoch": 8.982300884955752,
"grad_norm": 0.2538679540157318,
"learning_rate": 0.00010353982300884956,
"loss": 0.1392,
"step": 2030
},
{
"epoch": 9.0,
"eval_loss": 0.21310940384864807,
"eval_runtime": 2.6327,
"eval_samples_per_second": 37.984,
"eval_steps_per_second": 9.496,
"step": 2034
},
{
"epoch": 9.026548672566372,
"grad_norm": 0.2921323776245117,
"learning_rate": 9.91150442477876e-05,
"loss": 0.1549,
"step": 2040
},
{
"epoch": 9.070796460176991,
"grad_norm": 0.2572889029979706,
"learning_rate": 9.469026548672566e-05,
"loss": 0.1734,
"step": 2050
},
{
"epoch": 9.11504424778761,
"grad_norm": 0.2991015613079071,
"learning_rate": 9.026548672566372e-05,
"loss": 0.1582,
"step": 2060
},
{
"epoch": 9.15929203539823,
"grad_norm": 0.33754679560661316,
"learning_rate": 8.584070796460178e-05,
"loss": 0.1343,
"step": 2070
},
{
"epoch": 9.20353982300885,
"grad_norm": 0.2426099181175232,
"learning_rate": 8.141592920353983e-05,
"loss": 0.1462,
"step": 2080
},
{
"epoch": 9.247787610619469,
"grad_norm": 0.3596532344818115,
"learning_rate": 7.699115044247789e-05,
"loss": 0.1522,
"step": 2090
},
{
"epoch": 9.29203539823009,
"grad_norm": 0.22559010982513428,
"learning_rate": 7.256637168141593e-05,
"loss": 0.1292,
"step": 2100
},
{
"epoch": 9.336283185840708,
"grad_norm": 0.3877250850200653,
"learning_rate": 6.814159292035399e-05,
"loss": 0.1257,
"step": 2110
},
{
"epoch": 9.380530973451327,
"grad_norm": 0.3135465383529663,
"learning_rate": 6.371681415929204e-05,
"loss": 0.1508,
"step": 2120
},
{
"epoch": 9.424778761061948,
"grad_norm": 0.3448950946331024,
"learning_rate": 5.929203539823009e-05,
"loss": 0.1386,
"step": 2130
},
{
"epoch": 9.469026548672566,
"grad_norm": 0.2957702577114105,
"learning_rate": 5.486725663716814e-05,
"loss": 0.1456,
"step": 2140
},
{
"epoch": 9.513274336283185,
"grad_norm": 0.2347142994403839,
"learning_rate": 5.0442477876106195e-05,
"loss": 0.1476,
"step": 2150
},
{
"epoch": 9.557522123893806,
"grad_norm": 0.3887890577316284,
"learning_rate": 4.601769911504425e-05,
"loss": 0.158,
"step": 2160
},
{
"epoch": 9.601769911504425,
"grad_norm": 0.2899017632007599,
"learning_rate": 4.15929203539823e-05,
"loss": 0.1323,
"step": 2170
},
{
"epoch": 9.646017699115044,
"grad_norm": 0.37858498096466064,
"learning_rate": 3.716814159292035e-05,
"loss": 0.1488,
"step": 2180
},
{
"epoch": 9.690265486725664,
"grad_norm": 0.30040085315704346,
"learning_rate": 3.2743362831858405e-05,
"loss": 0.1453,
"step": 2190
},
{
"epoch": 9.734513274336283,
"grad_norm": 0.34911859035491943,
"learning_rate": 2.831858407079646e-05,
"loss": 0.1578,
"step": 2200
},
{
"epoch": 9.778761061946902,
"grad_norm": 0.3793705999851227,
"learning_rate": 2.3893805309734513e-05,
"loss": 0.1551,
"step": 2210
},
{
"epoch": 9.823008849557523,
"grad_norm": 0.3259049654006958,
"learning_rate": 1.9469026548672565e-05,
"loss": 0.1782,
"step": 2220
},
{
"epoch": 9.867256637168142,
"grad_norm": 0.2592504620552063,
"learning_rate": 1.5044247787610619e-05,
"loss": 0.1488,
"step": 2230
},
{
"epoch": 9.91150442477876,
"grad_norm": 0.26316604018211365,
"learning_rate": 1.0619469026548673e-05,
"loss": 0.1328,
"step": 2240
},
{
"epoch": 9.955752212389381,
"grad_norm": 0.34197258949279785,
"learning_rate": 6.194690265486725e-06,
"loss": 0.1658,
"step": 2250
},
{
"epoch": 10.0,
"grad_norm": 0.281561017036438,
"learning_rate": 1.7699115044247788e-06,
"loss": 0.1256,
"step": 2260
}
],
"logging_steps": 10,
"max_steps": 2260,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5529549227950080.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
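
A minimal sketch for inspecting the state above (not part of the original repo; it assumes the file is read as trainer_state.json from the checkpoint folder). It loads the JSON with the standard library and separates the per-step training-loss entries from the once-per-epoch eval_loss records in log_history:

import json

# Load the trainer state written by the Hugging Face Trainer.
with open("trainer_state.json") as f:   # path is an assumption; adjust as needed
    state = json.load(f)

# Training entries carry "loss" (logged every logging_steps=10);
# evaluation entries carry "eval_loss" (logged once per epoch).
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"total steps: {state['global_step']}, epochs: {state['epoch']}")
for e in eval_log:
    print(f"epoch {e['epoch']}: eval_loss={e['eval_loss']:.4f} "
          f"({e['eval_samples_per_second']:.1f} samples/s)")

Running this against the file reproduces the eval_loss trajectory recorded here, which bottoms out around epoch 6 (0.2080) before drifting slightly upward through epoch 9.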