{
  "best_metric": 0.772007503464447,
  "best_model_checkpoint": "./results/models/nusaparagraph_emot/nusabert-bigru-concate-8-mean\\checkpoint-12222",
  "epoch": 7.0,
  "eval_steps": 500,
  "global_step": 12222,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0286368843069874,
      "grad_norm": 4.918474197387695,
      "learning_rate": 9.985681557846507e-06,
      "loss": 1.918,
      "step": 50
    },
    {
      "epoch": 0.0572737686139748,
      "grad_norm": 5.332164287567139,
      "learning_rate": 9.971363115693013e-06,
      "loss": 1.8503,
      "step": 100
    },
    {
      "epoch": 0.0859106529209622,
      "grad_norm": 13.817869186401367,
      "learning_rate": 9.95704467353952e-06,
      "loss": 1.7392,
      "step": 150
    },
    {
      "epoch": 0.1145475372279496,
      "grad_norm": 12.340777397155762,
      "learning_rate": 9.942726231386026e-06,
      "loss": 1.4836,
      "step": 200
    },
    {
      "epoch": 0.143184421534937,
      "grad_norm": 17.78537940979004,
      "learning_rate": 9.928407789232532e-06,
      "loss": 1.4447,
      "step": 250
    },
    {
      "epoch": 0.1718213058419244,
      "grad_norm": 15.181681632995605,
      "learning_rate": 9.914089347079038e-06,
      "loss": 1.1689,
      "step": 300
    },
    {
      "epoch": 0.2004581901489118,
      "grad_norm": 12.248451232910156,
      "learning_rate": 9.899770904925546e-06,
      "loss": 1.229,
      "step": 350
    },
    {
      "epoch": 0.2290950744558992,
      "grad_norm": 15.683427810668945,
      "learning_rate": 9.885452462772052e-06,
      "loss": 1.0561,
      "step": 400
    },
    {
      "epoch": 0.25773195876288657,
      "grad_norm": 17.413002014160156,
      "learning_rate": 9.871134020618558e-06,
      "loss": 0.9704,
      "step": 450
    },
    {
      "epoch": 0.286368843069874,
      "grad_norm": 21.993549346923828,
      "learning_rate": 9.856815578465064e-06,
      "loss": 1.0062,
      "step": 500
    },
    {
      "epoch": 0.3150057273768614,
      "grad_norm": 15.140169143676758,
      "learning_rate": 9.84249713631157e-06,
      "loss": 0.9102,
      "step": 550
    },
    {
      "epoch": 0.3436426116838488,
      "grad_norm": 25.413042068481445,
      "learning_rate": 9.828178694158076e-06,
      "loss": 0.8857,
      "step": 600
    },
    {
      "epoch": 0.3722794959908362,
      "grad_norm": 13.888834953308105,
      "learning_rate": 9.813860252004582e-06,
      "loss": 0.9794,
      "step": 650
    },
    {
      "epoch": 0.4009163802978236,
      "grad_norm": 18.061304092407227,
      "learning_rate": 9.799541809851088e-06,
      "loss": 0.9635,
      "step": 700
    },
    {
      "epoch": 0.42955326460481097,
      "grad_norm": 16.933076858520508,
      "learning_rate": 9.785223367697596e-06,
      "loss": 0.8568,
      "step": 750
    },
    {
      "epoch": 0.4581901489117984,
      "grad_norm": 15.249773979187012,
      "learning_rate": 9.770904925544102e-06,
      "loss": 0.8865,
      "step": 800
    },
    {
      "epoch": 0.4868270332187858,
      "grad_norm": 16.965801239013672,
      "learning_rate": 9.756586483390608e-06,
      "loss": 0.776,
      "step": 850
    },
    {
      "epoch": 0.5154639175257731,
      "grad_norm": 6.5678791999816895,
      "learning_rate": 9.742268041237114e-06,
      "loss": 0.8698,
      "step": 900
    },
    {
      "epoch": 0.5441008018327605,
      "grad_norm": 11.134130477905273,
      "learning_rate": 9.72794959908362e-06,
      "loss": 0.7746,
      "step": 950
    },
    {
      "epoch": 0.572737686139748,
      "grad_norm": 13.739683151245117,
      "learning_rate": 9.713631156930127e-06,
      "loss": 0.8256,
      "step": 1000
    },
    {
      "epoch": 0.6013745704467354,
      "grad_norm": 13.819252967834473,
      "learning_rate": 9.699312714776633e-06,
      "loss": 0.8959,
      "step": 1050
    },
    {
      "epoch": 0.6300114547537228,
      "grad_norm": 21.550695419311523,
      "learning_rate": 9.68499427262314e-06,
      "loss": 0.8417,
      "step": 1100
    },
    {
      "epoch": 0.6586483390607102,
      "grad_norm": 15.80966854095459,
      "learning_rate": 9.670675830469645e-06,
      "loss": 0.9138,
      "step": 1150
    },
    {
      "epoch": 0.6872852233676976,
      "grad_norm": 9.978912353515625,
      "learning_rate": 9.656357388316153e-06,
      "loss": 0.7363,
      "step": 1200
    },
    {
      "epoch": 0.715922107674685,
      "grad_norm": 15.32123851776123,
      "learning_rate": 9.642038946162659e-06,
      "loss": 0.849,
      "step": 1250
    },
    {
      "epoch": 0.7445589919816724,
      "grad_norm": 21.705232620239258,
      "learning_rate": 9.627720504009165e-06,
      "loss": 0.8068,
      "step": 1300
    },
    {
      "epoch": 0.7731958762886598,
      "grad_norm": 23.230682373046875,
      "learning_rate": 9.613402061855671e-06,
      "loss": 0.7546,
      "step": 1350
    },
    {
      "epoch": 0.8018327605956472,
      "grad_norm": 18.663450241088867,
      "learning_rate": 9.599083619702177e-06,
      "loss": 0.7417,
      "step": 1400
    },
    {
      "epoch": 0.8304696449026346,
      "grad_norm": 17.77196502685547,
      "learning_rate": 9.584765177548685e-06,
      "loss": 0.7517,
      "step": 1450
    },
    {
      "epoch": 0.8591065292096219,
      "grad_norm": 7.162426471710205,
      "learning_rate": 9.57044673539519e-06,
      "loss": 0.6956,
      "step": 1500
    },
    {
      "epoch": 0.8877434135166093,
      "grad_norm": 11.551986694335938,
      "learning_rate": 9.556128293241697e-06,
      "loss": 0.7805,
      "step": 1550
    },
    {
      "epoch": 0.9163802978235968,
      "grad_norm": 28.649534225463867,
      "learning_rate": 9.541809851088203e-06,
      "loss": 0.7907,
      "step": 1600
    },
    {
      "epoch": 0.9450171821305842,
      "grad_norm": 24.329723358154297,
      "learning_rate": 9.527491408934708e-06,
      "loss": 0.662,
      "step": 1650
    },
    {
      "epoch": 0.9736540664375716,
      "grad_norm": 8.512353897094727,
      "learning_rate": 9.513172966781214e-06,
      "loss": 0.7086,
      "step": 1700
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.7155204810221721,
      "eval_f1": 0.7112801498610356,
      "eval_loss": 0.8033041954040527,
      "eval_runtime": 36.3098,
      "eval_samples_per_second": 73.286,
      "eval_steps_per_second": 9.171,
      "step": 1746
    },
    {
      "epoch": 1.002290950744559,
      "grad_norm": 19.602813720703125,
      "learning_rate": 9.49885452462772e-06,
      "loss": 0.7832,
      "step": 1750
    },
    {
      "epoch": 1.0309278350515463,
      "grad_norm": 8.915902137756348,
      "learning_rate": 9.484536082474226e-06,
      "loss": 0.5571,
      "step": 1800
    },
    {
      "epoch": 1.0595647193585338,
      "grad_norm": 44.88418960571289,
      "learning_rate": 9.470217640320734e-06,
      "loss": 0.5963,
      "step": 1850
    },
    {
      "epoch": 1.088201603665521,
      "grad_norm": 7.115338325500488,
      "learning_rate": 9.45589919816724e-06,
      "loss": 0.4956,
      "step": 1900
    },
    {
      "epoch": 1.1168384879725086,
      "grad_norm": 16.892269134521484,
      "learning_rate": 9.441580756013746e-06,
      "loss": 0.4767,
      "step": 1950
    },
    {
      "epoch": 1.145475372279496,
      "grad_norm": 30.78487205505371,
      "learning_rate": 9.427262313860252e-06,
      "loss": 0.5456,
      "step": 2000
    },
    {
      "epoch": 1.1741122565864834,
      "grad_norm": 12.62244987487793,
      "learning_rate": 9.41294387170676e-06,
      "loss": 0.531,
      "step": 2050
    },
    {
      "epoch": 1.2027491408934707,
      "grad_norm": 20.651899337768555,
      "learning_rate": 9.398625429553266e-06,
      "loss": 0.529,
      "step": 2100
    },
    {
      "epoch": 1.2313860252004583,
      "grad_norm": 21.23448371887207,
      "learning_rate": 9.384306987399772e-06,
      "loss": 0.5307,
      "step": 2150
    },
    {
      "epoch": 1.2600229095074456,
      "grad_norm": 6.756899356842041,
      "learning_rate": 9.369988545246278e-06,
      "loss": 0.5302,
      "step": 2200
    },
    {
      "epoch": 1.2886597938144329,
      "grad_norm": 11.766059875488281,
      "learning_rate": 9.355670103092784e-06,
      "loss": 0.5169,
      "step": 2250
    },
    {
      "epoch": 1.3172966781214204,
      "grad_norm": 12.042119026184082,
      "learning_rate": 9.341351660939291e-06,
      "loss": 0.4705,
      "step": 2300
    },
    {
      "epoch": 1.345933562428408,
      "grad_norm": 13.262438774108887,
      "learning_rate": 9.327033218785797e-06,
      "loss": 0.5263,
      "step": 2350
    },
    {
      "epoch": 1.3745704467353952,
      "grad_norm": 24.665464401245117,
      "learning_rate": 9.312714776632303e-06,
      "loss": 0.5039,
      "step": 2400
    },
    {
      "epoch": 1.4032073310423825,
      "grad_norm": 24.94556427001953,
      "learning_rate": 9.29839633447881e-06,
      "loss": 0.5736,
      "step": 2450
    },
    {
      "epoch": 1.43184421534937,
      "grad_norm": 18.88966941833496,
      "learning_rate": 9.284077892325315e-06,
      "loss": 0.5569,
      "step": 2500
    },
    {
      "epoch": 1.4604810996563573,
      "grad_norm": 8.994138717651367,
      "learning_rate": 9.269759450171823e-06,
      "loss": 0.4279,
      "step": 2550
    },
    {
      "epoch": 1.4891179839633448,
      "grad_norm": 8.518653869628906,
      "learning_rate": 9.255441008018329e-06,
      "loss": 0.4764,
      "step": 2600
    },
    {
      "epoch": 1.5177548682703321,
      "grad_norm": 35.21455764770508,
      "learning_rate": 9.241122565864835e-06,
      "loss": 0.5318,
      "step": 2650
    },
    {
      "epoch": 1.5463917525773194,
      "grad_norm": 13.781972885131836,
      "learning_rate": 9.226804123711341e-06,
      "loss": 0.4229,
      "step": 2700
    },
    {
      "epoch": 1.575028636884307,
      "grad_norm": 7.166682720184326,
      "learning_rate": 9.212485681557847e-06,
      "loss": 0.511,
      "step": 2750
    },
    {
      "epoch": 1.6036655211912945,
      "grad_norm": 20.155414581298828,
      "learning_rate": 9.198167239404353e-06,
      "loss": 0.5216,
      "step": 2800
    },
    {
      "epoch": 1.6323024054982818,
      "grad_norm": 13.758686065673828,
      "learning_rate": 9.183848797250859e-06,
      "loss": 0.5359,
      "step": 2850
    },
    {
      "epoch": 1.660939289805269,
      "grad_norm": 17.821603775024414,
      "learning_rate": 9.169530355097367e-06,
      "loss": 0.5762,
      "step": 2900
    },
    {
      "epoch": 1.6895761741122566,
      "grad_norm": 22.8956356048584,
      "learning_rate": 9.155211912943873e-06,
      "loss": 0.5629,
      "step": 2950
    },
    {
      "epoch": 1.718213058419244,
      "grad_norm": 3.841970443725586,
      "learning_rate": 9.140893470790379e-06,
      "loss": 0.4847,
      "step": 3000
    },
    {
      "epoch": 1.7468499427262314,
      "grad_norm": 22.353851318359375,
      "learning_rate": 9.126575028636885e-06,
      "loss": 0.499,
      "step": 3050
    },
    {
      "epoch": 1.7754868270332187,
      "grad_norm": 6.219491481781006,
      "learning_rate": 9.11225658648339e-06,
      "loss": 0.6343,
      "step": 3100
    },
    {
      "epoch": 1.8041237113402062,
      "grad_norm": 26.25059700012207,
      "learning_rate": 9.097938144329898e-06,
      "loss": 0.5259,
      "step": 3150
    },
    {
      "epoch": 1.8327605956471937,
      "grad_norm": 14.963705062866211,
      "learning_rate": 9.083619702176404e-06,
      "loss": 0.5091,
      "step": 3200
    },
    {
      "epoch": 1.861397479954181,
      "grad_norm": 4.209954261779785,
      "learning_rate": 9.06930126002291e-06,
      "loss": 0.5591,
      "step": 3250
    },
    {
      "epoch": 1.8900343642611683,
      "grad_norm": 12.075881958007812,
      "learning_rate": 9.054982817869416e-06,
      "loss": 0.4983,
      "step": 3300
    },
    {
      "epoch": 1.9186712485681556,
      "grad_norm": 14.535738945007324,
      "learning_rate": 9.040664375715922e-06,
      "loss": 0.4013,
      "step": 3350
    },
    {
      "epoch": 1.9473081328751431,
      "grad_norm": 12.017505645751953,
      "learning_rate": 9.02634593356243e-06,
      "loss": 0.5307,
      "step": 3400
    },
    {
      "epoch": 1.9759450171821307,
      "grad_norm": 24.822385787963867,
      "learning_rate": 9.012027491408936e-06,
      "loss": 0.4848,
      "step": 3450
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.7576099210822999,
      "eval_f1": 0.7572228179173918,
      "eval_loss": 0.7472622394561768,
      "eval_runtime": 36.2957,
      "eval_samples_per_second": 73.314,
      "eval_steps_per_second": 9.175,
      "step": 3492
    },
    {
      "epoch": 2.004581901489118,
      "grad_norm": 0.584539532661438,
      "learning_rate": 8.997709049255442e-06,
      "loss": 0.4442,
      "step": 3500
    },
    {
      "epoch": 2.0332187857961053,
      "grad_norm": 48.69373321533203,
      "learning_rate": 8.983390607101948e-06,
      "loss": 0.3488,
      "step": 3550
    },
    {
      "epoch": 2.0618556701030926,
      "grad_norm": 6.41154670715332,
      "learning_rate": 8.969072164948455e-06,
      "loss": 0.3281,
      "step": 3600
    },
    {
      "epoch": 2.0904925544100803,
      "grad_norm": 29.08100700378418,
      "learning_rate": 8.954753722794961e-06,
      "loss": 0.2916,
      "step": 3650
    },
    {
      "epoch": 2.1191294387170676,
      "grad_norm": 9.118326187133789,
      "learning_rate": 8.940435280641467e-06,
      "loss": 0.2665,
      "step": 3700
    },
    {
      "epoch": 2.147766323024055,
      "grad_norm": 3.8800840377807617,
      "learning_rate": 8.926116838487973e-06,
      "loss": 0.2697,
      "step": 3750
    },
    {
      "epoch": 2.176403207331042,
      "grad_norm": 14.715743064880371,
      "learning_rate": 8.91179839633448e-06,
      "loss": 0.3374,
      "step": 3800
    },
    {
      "epoch": 2.20504009163803,
      "grad_norm": 46.47842025756836,
      "learning_rate": 8.897479954180985e-06,
      "loss": 0.2725,
      "step": 3850
    },
    {
      "epoch": 2.2336769759450172,
      "grad_norm": 25.640724182128906,
      "learning_rate": 8.883161512027491e-06,
      "loss": 0.2656,
      "step": 3900
    },
    {
      "epoch": 2.2623138602520045,
      "grad_norm": 33.861568450927734,
      "learning_rate": 8.868843069873997e-06,
      "loss": 0.3202,
      "step": 3950
    },
    {
      "epoch": 2.290950744558992,
      "grad_norm": 1.642096996307373,
      "learning_rate": 8.854524627720505e-06,
      "loss": 0.2686,
      "step": 4000
    },
    {
      "epoch": 2.319587628865979,
      "grad_norm": 10.57680606842041,
      "learning_rate": 8.840206185567011e-06,
      "loss": 0.2817,
      "step": 4050
    },
    {
      "epoch": 2.348224513172967,
      "grad_norm": 22.718547821044922,
      "learning_rate": 8.825887743413517e-06,
      "loss": 0.3209,
      "step": 4100
    },
    {
      "epoch": 2.376861397479954,
      "grad_norm": 39.80268859863281,
      "learning_rate": 8.811569301260023e-06,
      "loss": 0.3598,
      "step": 4150
    },
    {
      "epoch": 2.4054982817869415,
      "grad_norm": 13.368090629577637,
      "learning_rate": 8.797250859106529e-06,
      "loss": 0.3448,
      "step": 4200
    },
    {
      "epoch": 2.434135166093929,
      "grad_norm": 55.12387466430664,
      "learning_rate": 8.782932416953037e-06,
      "loss": 0.2845,
      "step": 4250
    },
    {
      "epoch": 2.4627720504009165,
      "grad_norm": 8.037142753601074,
      "learning_rate": 8.768613974799543e-06,
      "loss": 0.2482,
      "step": 4300
    },
    {
      "epoch": 2.491408934707904,
      "grad_norm": 14.919239044189453,
      "learning_rate": 8.754295532646049e-06,
      "loss": 0.2222,
      "step": 4350
    },
    {
      "epoch": 2.520045819014891,
      "grad_norm": 30.16111946105957,
      "learning_rate": 8.739977090492555e-06,
      "loss": 0.2577,
      "step": 4400
    },
    {
      "epoch": 2.5486827033218784,
      "grad_norm": 18.18327522277832,
      "learning_rate": 8.725658648339062e-06,
      "loss": 0.2802,
      "step": 4450
    },
    {
      "epoch": 2.5773195876288657,
      "grad_norm": 14.579641342163086,
      "learning_rate": 8.711340206185568e-06,
      "loss": 0.2528,
      "step": 4500
    },
    {
      "epoch": 2.6059564719358534,
      "grad_norm": 17.266183853149414,
      "learning_rate": 8.697021764032074e-06,
      "loss": 0.3592,
      "step": 4550
    },
    {
      "epoch": 2.6345933562428407,
      "grad_norm": 16.718862533569336,
      "learning_rate": 8.68270332187858e-06,
      "loss": 0.3184,
      "step": 4600
    },
    {
      "epoch": 2.663230240549828,
      "grad_norm": 25.249649047851562,
      "learning_rate": 8.668384879725086e-06,
      "loss": 0.2697,
      "step": 4650
    },
    {
      "epoch": 2.691867124856816,
      "grad_norm": 11.022406578063965,
      "learning_rate": 8.654066437571594e-06,
      "loss": 0.2776,
      "step": 4700
    },
    {
      "epoch": 2.720504009163803,
      "grad_norm": 8.15123176574707,
      "learning_rate": 8.6397479954181e-06,
      "loss": 0.3462,
      "step": 4750
    },
    {
      "epoch": 2.7491408934707904,
      "grad_norm": 11.458098411560059,
      "learning_rate": 8.625429553264606e-06,
      "loss": 0.3657,
      "step": 4800
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 3.9528236389160156,
      "learning_rate": 8.611111111111112e-06,
      "loss": 0.3086,
      "step": 4850
    },
    {
      "epoch": 2.806414662084765,
      "grad_norm": 39.1205940246582,
      "learning_rate": 8.596792668957618e-06,
      "loss": 0.3528,
      "step": 4900
    },
    {
      "epoch": 2.8350515463917527,
      "grad_norm": 53.56584167480469,
      "learning_rate": 8.582474226804124e-06,
      "loss": 0.2909,
      "step": 4950
    },
    {
      "epoch": 2.86368843069874,
      "grad_norm": 51.294132232666016,
      "learning_rate": 8.56815578465063e-06,
      "loss": 0.2571,
      "step": 5000
    },
    {
      "epoch": 2.8923253150057273,
      "grad_norm": 48.5758056640625,
      "learning_rate": 8.553837342497136e-06,
      "loss": 0.2812,
      "step": 5050
    },
    {
      "epoch": 2.9209621993127146,
      "grad_norm": 9.625508308410645,
      "learning_rate": 8.539518900343643e-06,
      "loss": 0.2606,
      "step": 5100
    },
    {
      "epoch": 2.9495990836197024,
      "grad_norm": 22.97384262084961,
      "learning_rate": 8.52520045819015e-06,
      "loss": 0.3566,
      "step": 5150
    },
    {
      "epoch": 2.9782359679266897,
      "grad_norm": 34.34775161743164,
      "learning_rate": 8.510882016036655e-06,
      "loss": 0.3251,
      "step": 5200
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.7493423524990604,
      "eval_f1": 0.7488536960670391,
      "eval_loss": 0.9042189121246338,
      "eval_runtime": 36.3022,
      "eval_samples_per_second": 73.301,
      "eval_steps_per_second": 9.173,
      "step": 5238
    },
    {
      "epoch": 3.006872852233677,
      "grad_norm": 0.47577229142189026,
      "learning_rate": 8.496563573883161e-06,
      "loss": 0.2671,
      "step": 5250
    },
    {
      "epoch": 3.0355097365406642,
      "grad_norm": 36.310089111328125,
      "learning_rate": 8.482245131729669e-06,
      "loss": 0.1493,
      "step": 5300
    },
    {
      "epoch": 3.0641466208476515,
      "grad_norm": 29.30960464477539,
      "learning_rate": 8.467926689576175e-06,
      "loss": 0.1603,
      "step": 5350
    },
    {
      "epoch": 3.0927835051546393,
      "grad_norm": 2.112786054611206,
      "learning_rate": 8.453608247422681e-06,
      "loss": 0.1363,
      "step": 5400
    },
    {
      "epoch": 3.1214203894616266,
      "grad_norm": 3.0346627235412598,
      "learning_rate": 8.439289805269187e-06,
      "loss": 0.1359,
      "step": 5450
    },
    {
      "epoch": 3.150057273768614,
      "grad_norm": 22.331645965576172,
      "learning_rate": 8.424971363115693e-06,
      "loss": 0.1479,
      "step": 5500
    },
    {
      "epoch": 3.178694158075601,
      "grad_norm": 13.065728187561035,
      "learning_rate": 8.4106529209622e-06,
      "loss": 0.1311,
      "step": 5550
    },
    {
      "epoch": 3.207331042382589,
      "grad_norm": 3.163928270339966,
      "learning_rate": 8.396334478808707e-06,
      "loss": 0.1364,
      "step": 5600
    },
    {
      "epoch": 3.2359679266895762,
      "grad_norm": 33.371849060058594,
      "learning_rate": 8.382016036655213e-06,
      "loss": 0.1485,
      "step": 5650
    },
    {
      "epoch": 3.2646048109965635,
      "grad_norm": 24.721717834472656,
      "learning_rate": 8.367697594501719e-06,
      "loss": 0.155,
      "step": 5700
    },
    {
      "epoch": 3.293241695303551,
      "grad_norm": 15.719419479370117,
      "learning_rate": 8.353379152348225e-06,
      "loss": 0.1539,
      "step": 5750
    },
    {
      "epoch": 3.3218785796105386,
      "grad_norm": 41.62794876098633,
      "learning_rate": 8.339060710194732e-06,
      "loss": 0.1695,
      "step": 5800
    },
    {
      "epoch": 3.350515463917526,
      "grad_norm": 1.7000266313552856,
      "learning_rate": 8.324742268041238e-06,
      "loss": 0.187,
      "step": 5850
    },
    {
      "epoch": 3.379152348224513,
      "grad_norm": 9.574384689331055,
      "learning_rate": 8.310423825887744e-06,
      "loss": 0.1909,
      "step": 5900
    },
    {
      "epoch": 3.4077892325315005,
      "grad_norm": 19.80113410949707,
      "learning_rate": 8.29610538373425e-06,
      "loss": 0.1656,
      "step": 5950
    },
    {
      "epoch": 3.436426116838488,
      "grad_norm": 71.53827667236328,
      "learning_rate": 8.281786941580758e-06,
      "loss": 0.1116,
      "step": 6000
    },
    {
      "epoch": 3.4650630011454755,
      "grad_norm": 65.89808654785156,
      "learning_rate": 8.267468499427262e-06,
      "loss": 0.188,
      "step": 6050
    },
    {
      "epoch": 3.493699885452463,
      "grad_norm": 60.68288803100586,
      "learning_rate": 8.253150057273768e-06,
      "loss": 0.1315,
      "step": 6100
    },
    {
      "epoch": 3.52233676975945,
      "grad_norm": 25.550884246826172,
      "learning_rate": 8.238831615120276e-06,
      "loss": 0.1194,
      "step": 6150
    },
    {
      "epoch": 3.5509736540664374,
      "grad_norm": 6.416481971740723,
      "learning_rate": 8.224513172966782e-06,
      "loss": 0.134,
      "step": 6200
    },
    {
      "epoch": 3.579610538373425,
      "grad_norm": 1.5923579931259155,
      "learning_rate": 8.210194730813288e-06,
      "loss": 0.1672,
      "step": 6250
    },
    {
      "epoch": 3.6082474226804124,
      "grad_norm": 19.590898513793945,
      "learning_rate": 8.195876288659794e-06,
      "loss": 0.14,
      "step": 6300
    },
    {
      "epoch": 3.6368843069873997,
      "grad_norm": 0.4376499652862549,
      "learning_rate": 8.1815578465063e-06,
      "loss": 0.1425,
      "step": 6350
    },
    {
      "epoch": 3.665521191294387,
      "grad_norm": 11.566116333007812,
      "learning_rate": 8.167239404352808e-06,
      "loss": 0.205,
      "step": 6400
    },
    {
      "epoch": 3.6941580756013748,
      "grad_norm": 62.71388626098633,
      "learning_rate": 8.152920962199314e-06,
      "loss": 0.184,
      "step": 6450
    },
    {
      "epoch": 3.722794959908362,
      "grad_norm": 0.32115602493286133,
      "learning_rate": 8.13860252004582e-06,
      "loss": 0.1267,
      "step": 6500
    },
    {
      "epoch": 3.7514318442153494,
      "grad_norm": 53.07960891723633,
      "learning_rate": 8.124284077892326e-06,
      "loss": 0.1796,
      "step": 6550
    },
    {
      "epoch": 3.7800687285223367,
      "grad_norm": 2.265838861465454,
      "learning_rate": 8.109965635738832e-06,
      "loss": 0.2187,
      "step": 6600
    },
    {
      "epoch": 3.808705612829324,
      "grad_norm": 0.6725891828536987,
      "learning_rate": 8.09564719358534e-06,
      "loss": 0.1518,
      "step": 6650
    },
    {
      "epoch": 3.8373424971363117,
      "grad_norm": 53.60797882080078,
      "learning_rate": 8.081328751431845e-06,
      "loss": 0.1619,
      "step": 6700
    },
    {
      "epoch": 3.865979381443299,
      "grad_norm": 22.341100692749023,
      "learning_rate": 8.067010309278351e-06,
      "loss": 0.1378,
      "step": 6750
    },
    {
      "epoch": 3.8946162657502863,
      "grad_norm": 64.15727996826172,
      "learning_rate": 8.052691867124857e-06,
      "loss": 0.1445,
      "step": 6800
    },
    {
      "epoch": 3.923253150057274,
      "grad_norm": 36.292484283447266,
      "learning_rate": 8.038373424971365e-06,
      "loss": 0.1612,
      "step": 6850
    },
    {
      "epoch": 3.9518900343642613,
      "grad_norm": 4.331699848175049,
      "learning_rate": 8.02405498281787e-06,
      "loss": 0.2337,
      "step": 6900
    },
    {
      "epoch": 3.9805269186712486,
      "grad_norm": 0.28292712569236755,
      "learning_rate": 8.009736540664377e-06,
      "loss": 0.1779,
      "step": 6950
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.7606163096580233,
      "eval_f1": 0.7597313264211647,
      "eval_loss": 0.9818114042282104,
      "eval_runtime": 36.2859,
      "eval_samples_per_second": 73.334,
      "eval_steps_per_second": 9.177,
      "step": 6984
    },
    {
      "epoch": 4.009163802978236,
      "grad_norm": 14.622065544128418,
      "learning_rate": 7.995418098510883e-06,
      "loss": 0.1005,
      "step": 7000
    },
    {
      "epoch": 4.037800687285223,
      "grad_norm": 0.4035070836544037,
      "learning_rate": 7.981099656357389e-06,
      "loss": 0.0777,
      "step": 7050
    },
    {
      "epoch": 4.0664375715922105,
      "grad_norm": 31.626924514770508,
      "learning_rate": 7.966781214203895e-06,
      "loss": 0.0581,
      "step": 7100
    },
    {
      "epoch": 4.095074455899198,
      "grad_norm": 0.15928135812282562,
      "learning_rate": 7.9524627720504e-06,
      "loss": 0.0477,
      "step": 7150
    },
    {
      "epoch": 4.123711340206185,
      "grad_norm": 49.666709899902344,
      "learning_rate": 7.938144329896907e-06,
      "loss": 0.0752,
      "step": 7200
    },
    {
      "epoch": 4.152348224513173,
      "grad_norm": 0.6751652956008911,
      "learning_rate": 7.923825887743414e-06,
      "loss": 0.0807,
      "step": 7250
    },
    {
      "epoch": 4.180985108820161,
      "grad_norm": 0.623131513595581,
      "learning_rate": 7.90950744558992e-06,
      "loss": 0.1205,
      "step": 7300
    },
    {
      "epoch": 4.209621993127148,
      "grad_norm": 0.47417309880256653,
      "learning_rate": 7.895189003436426e-06,
      "loss": 0.0286,
      "step": 7350
    },
    {
      "epoch": 4.238258877434135,
      "grad_norm": 0.37885400652885437,
      "learning_rate": 7.880870561282932e-06,
      "loss": 0.0747,
      "step": 7400
    },
    {
      "epoch": 4.2668957617411225,
      "grad_norm": 1.7955352067947388,
      "learning_rate": 7.866552119129438e-06,
      "loss": 0.0544,
      "step": 7450
    },
    {
      "epoch": 4.29553264604811,
      "grad_norm": 46.27180862426758,
      "learning_rate": 7.852233676975946e-06,
      "loss": 0.0447,
      "step": 7500
    },
    {
      "epoch": 4.324169530355097,
      "grad_norm": 2.345630645751953,
      "learning_rate": 7.837915234822452e-06,
      "loss": 0.0599,
      "step": 7550
    },
    {
      "epoch": 4.352806414662084,
      "grad_norm": 0.334553986787796,
      "learning_rate": 7.823596792668958e-06,
      "loss": 0.0494,
      "step": 7600
    },
    {
      "epoch": 4.381443298969073,
      "grad_norm": 8.703739166259766,
      "learning_rate": 7.809278350515464e-06,
      "loss": 0.0523,
      "step": 7650
    },
    {
      "epoch": 4.41008018327606,
      "grad_norm": 36.21126937866211,
      "learning_rate": 7.794959908361972e-06,
      "loss": 0.115,
      "step": 7700
    },
    {
      "epoch": 4.438717067583047,
      "grad_norm": 0.7834786772727966,
      "learning_rate": 7.780641466208478e-06,
      "loss": 0.0602,
      "step": 7750
    },
    {
      "epoch": 4.4673539518900345,
      "grad_norm": 1.5702382326126099,
      "learning_rate": 7.766323024054984e-06,
      "loss": 0.1341,
      "step": 7800
    },
    {
      "epoch": 4.495990836197022,
      "grad_norm": 0.2708655595779419,
      "learning_rate": 7.75200458190149e-06,
      "loss": 0.1514,
      "step": 7850
    },
    {
      "epoch": 4.524627720504009,
      "grad_norm": 0.48853299021720886,
      "learning_rate": 7.737686139747996e-06,
      "loss": 0.1418,
      "step": 7900
    },
    {
      "epoch": 4.553264604810996,
      "grad_norm": 36.37514114379883,
      "learning_rate": 7.723367697594503e-06,
      "loss": 0.0663,
      "step": 7950
    },
    {
      "epoch": 4.581901489117984,
      "grad_norm": 30.906522750854492,
      "learning_rate": 7.70904925544101e-06,
      "loss": 0.0655,
      "step": 8000
    },
    {
      "epoch": 4.610538373424971,
      "grad_norm": 3.7403993606567383,
      "learning_rate": 7.694730813287515e-06,
      "loss": 0.0496,
      "step": 8050
    },
    {
      "epoch": 4.639175257731958,
      "grad_norm": 0.10339858382940292,
      "learning_rate": 7.680412371134021e-06,
      "loss": 0.1213,
      "step": 8100
    },
    {
      "epoch": 4.6678121420389465,
      "grad_norm": 0.03920993208885193,
      "learning_rate": 7.666093928980529e-06,
      "loss": 0.1407,
      "step": 8150
    },
    {
      "epoch": 4.696449026345934,
      "grad_norm": 70.78038024902344,
      "learning_rate": 7.651775486827033e-06,
      "loss": 0.0922,
      "step": 8200
    },
    {
      "epoch": 4.725085910652921,
      "grad_norm": 20.5271053314209,
      "learning_rate": 7.63745704467354e-06,
      "loss": 0.1784,
      "step": 8250
    },
    {
      "epoch": 4.753722794959908,
      "grad_norm": 0.17482055723667145,
      "learning_rate": 7.623138602520046e-06,
      "loss": 0.1508,
      "step": 8300
    },
    {
      "epoch": 4.782359679266896,
      "grad_norm": 33.39112091064453,
      "learning_rate": 7.608820160366552e-06,
      "loss": 0.1115,
      "step": 8350
    },
    {
      "epoch": 4.810996563573883,
      "grad_norm": 0.4592705965042114,
      "learning_rate": 7.594501718213059e-06,
      "loss": 0.1188,
      "step": 8400
    },
    {
      "epoch": 4.83963344788087,
      "grad_norm": 38.45854187011719,
      "learning_rate": 7.580183276059565e-06,
      "loss": 0.0788,
      "step": 8450
    },
    {
      "epoch": 4.868270332187858,
      "grad_norm": 60.54808044433594,
      "learning_rate": 7.565864833906072e-06,
      "loss": 0.1159,
      "step": 8500
    },
    {
      "epoch": 4.896907216494846,
      "grad_norm": 0.5872980952262878,
      "learning_rate": 7.551546391752578e-06,
      "loss": 0.0984,
      "step": 8550
    },
    {
      "epoch": 4.925544100801833,
      "grad_norm": 0.09889842569828033,
      "learning_rate": 7.5372279495990845e-06,
      "loss": 0.1722,
      "step": 8600
    },
    {
      "epoch": 4.95418098510882,
      "grad_norm": 0.07930275052785873,
      "learning_rate": 7.5229095074455904e-06,
      "loss": 0.0861,
      "step": 8650
    },
    {
      "epoch": 4.982817869415808,
      "grad_norm": 9.635697364807129,
      "learning_rate": 7.5085910652920964e-06,
      "loss": 0.1452,
      "step": 8700
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.7602405110860578,
      "eval_f1": 0.7599417916848927,
      "eval_loss": 1.1236233711242676,
      "eval_runtime": 36.3176,
      "eval_samples_per_second": 73.27,
      "eval_steps_per_second": 9.169,
      "step": 8730
    },
    {
      "epoch": 5.011454753722795,
      "grad_norm": 0.5619956254959106,
      "learning_rate": 7.494272623138603e-06,
      "loss": 0.0687,
      "step": 8750
    },
    {
      "epoch": 5.040091638029782,
      "grad_norm": 85.73548126220703,
      "learning_rate": 7.479954180985109e-06,
      "loss": 0.0478,
      "step": 8800
    },
    {
      "epoch": 5.0687285223367695,
      "grad_norm": 13.715363502502441,
      "learning_rate": 7.465635738831616e-06,
      "loss": 0.0396,
      "step": 8850
    },
    {
      "epoch": 5.097365406643757,
      "grad_norm": 70.08007049560547,
      "learning_rate": 7.451317296678122e-06,
      "loss": 0.0562,
      "step": 8900
    },
    {
      "epoch": 5.126002290950744,
      "grad_norm": 0.8938388824462891,
      "learning_rate": 7.436998854524629e-06,
      "loss": 0.0974,
      "step": 8950
    },
    {
      "epoch": 5.154639175257732,
      "grad_norm": 0.3734937012195587,
      "learning_rate": 7.422680412371135e-06,
      "loss": 0.0742,
      "step": 9000
    },
    {
      "epoch": 5.18327605956472,
      "grad_norm": 10.571038246154785,
      "learning_rate": 7.408361970217641e-06,
      "loss": 0.021,
      "step": 9050
    },
    {
      "epoch": 5.211912943871707,
      "grad_norm": 0.09976017475128174,
      "learning_rate": 7.394043528064148e-06,
      "loss": 0.0546,
      "step": 9100
    },
    {
      "epoch": 5.240549828178694,
      "grad_norm": 0.07766138762235641,
      "learning_rate": 7.379725085910654e-06,
      "loss": 0.0827,
      "step": 9150
    },
    {
      "epoch": 5.2691867124856815,
      "grad_norm": 0.021182745695114136,
      "learning_rate": 7.3654066437571605e-06,
      "loss": 0.0508,
      "step": 9200
    },
    {
      "epoch": 5.297823596792669,
      "grad_norm": 0.08311637490987778,
      "learning_rate": 7.3510882016036665e-06,
      "loss": 0.0359,
      "step": 9250
    },
    {
      "epoch": 5.326460481099656,
      "grad_norm": 10.934857368469238,
      "learning_rate": 7.336769759450172e-06,
      "loss": 0.0431,
      "step": 9300
    },
    {
      "epoch": 5.355097365406643,
      "grad_norm": 0.061881110072135925,
      "learning_rate": 7.3224513172966785e-06,
      "loss": 0.0245,
      "step": 9350
    },
    {
      "epoch": 5.383734249713632,
      "grad_norm": 0.09978004544973373,
      "learning_rate": 7.3081328751431845e-06,
      "loss": 0.0201,
      "step": 9400
    },
    {
      "epoch": 5.412371134020619,
      "grad_norm": 0.04522474855184555,
      "learning_rate": 7.293814432989691e-06,
      "loss": 0.0554,
      "step": 9450
    },
    {
      "epoch": 5.441008018327606,
      "grad_norm": 27.210500717163086,
      "learning_rate": 7.279495990836197e-06,
      "loss": 0.0289,
      "step": 9500
    },
    {
      "epoch": 5.4696449026345935,
      "grad_norm": 88.04645538330078,
      "learning_rate": 7.265177548682703e-06,
      "loss": 0.0786,
      "step": 9550
    },
    {
      "epoch": 5.498281786941581,
      "grad_norm": 9.045210838317871,
      "learning_rate": 7.25085910652921e-06,
      "loss": 0.0776,
      "step": 9600
    },
    {
      "epoch": 5.526918671248568,
      "grad_norm": 0.031076449900865555,
      "learning_rate": 7.236540664375716e-06,
      "loss": 0.0917,
      "step": 9650
    },
    {
      "epoch": 5.555555555555555,
      "grad_norm": 0.07342424243688583,
      "learning_rate": 7.222222222222223e-06,
      "loss": 0.067,
      "step": 9700
    },
    {
      "epoch": 5.584192439862543,
      "grad_norm": 0.07349126785993576,
      "learning_rate": 7.207903780068729e-06,
      "loss": 0.0275,
      "step": 9750
    },
    {
      "epoch": 5.61282932416953,
      "grad_norm": 0.4687565267086029,
      "learning_rate": 7.193585337915236e-06,
      "loss": 0.0308,
      "step": 9800
    },
    {
      "epoch": 5.641466208476518,
      "grad_norm": 0.03109198622405529,
      "learning_rate": 7.179266895761742e-06,
      "loss": 0.0512,
      "step": 9850
    },
    {
      "epoch": 5.670103092783505,
      "grad_norm": 2.063464403152466,
      "learning_rate": 7.164948453608248e-06,
      "loss": 0.0436,
      "step": 9900
    },
    {
      "epoch": 5.698739977090493,
      "grad_norm": 0.7259678244590759,
      "learning_rate": 7.1506300114547545e-06,
      "loss": 0.0452,
      "step": 9950
    },
    {
      "epoch": 5.72737686139748,
      "grad_norm": 22.87729263305664,
      "learning_rate": 7.1363115693012605e-06,
      "loss": 0.0871,
      "step": 10000
    },
    {
      "epoch": 5.756013745704467,
      "grad_norm": 0.6141005754470825,
      "learning_rate": 7.121993127147767e-06,
      "loss": 0.0432,
      "step": 10050
    },
    {
      "epoch": 5.784650630011455,
      "grad_norm": 0.29292017221450806,
      "learning_rate": 7.107674684994273e-06,
      "loss": 0.0424,
      "step": 10100
    },
    {
      "epoch": 5.813287514318442,
      "grad_norm": 0.059430770576000214,
      "learning_rate": 7.09335624284078e-06,
      "loss": 0.0751,
      "step": 10150
    },
    {
      "epoch": 5.841924398625429,
      "grad_norm": 10.410508155822754,
      "learning_rate": 7.079037800687286e-06,
      "loss": 0.1061,
      "step": 10200
    },
    {
      "epoch": 5.870561282932417,
      "grad_norm": 1.1330084800720215,
      "learning_rate": 7.064719358533792e-06,
      "loss": 0.0421,
      "step": 10250
    },
    {
      "epoch": 5.899198167239405,
      "grad_norm": 0.1360820233821869,
      "learning_rate": 7.050400916380299e-06,
      "loss": 0.0664,
      "step": 10300
    },
    {
      "epoch": 5.927835051546392,
      "grad_norm": 1.3874069452285767,
      "learning_rate": 7.036082474226805e-06,
      "loss": 0.1134,
      "step": 10350
    },
    {
      "epoch": 5.956471935853379,
      "grad_norm": 0.0816207155585289,
      "learning_rate": 7.02176403207331e-06,
      "loss": 0.052,
      "step": 10400
    },
    {
      "epoch": 5.985108820160367,
      "grad_norm": 0.0460013747215271,
      "learning_rate": 7.007445589919817e-06,
      "loss": 0.0362,
      "step": 10450
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.7591131153701616,
      "eval_f1": 0.7565525682738199,
      "eval_loss": 1.2660281658172607,
      "eval_runtime": 36.2808,
      "eval_samples_per_second": 73.345,
      "eval_steps_per_second": 9.178,
      "step": 10476
    },
    {
      "epoch": 6.013745704467354,
      "grad_norm": 0.09728560596704483,
      "learning_rate": 6.993127147766323e-06,
      "loss": 0.0709,
      "step": 10500
    },
    {
      "epoch": 6.042382588774341,
      "grad_norm": 0.10100292414426804,
      "learning_rate": 6.97880870561283e-06,
      "loss": 0.0754,
      "step": 10550
    },
    {
      "epoch": 6.0710194730813285,
      "grad_norm": 0.19107265770435333,
      "learning_rate": 6.964490263459336e-06,
      "loss": 0.0408,
      "step": 10600
    },
    {
      "epoch": 6.099656357388316,
      "grad_norm": 0.0762198194861412,
      "learning_rate": 6.9501718213058426e-06,
      "loss": 0.0228,
      "step": 10650
    },
    {
      "epoch": 6.128293241695303,
      "grad_norm": 0.023416342213749886,
      "learning_rate": 6.9358533791523485e-06,
      "loss": 0.0326,
      "step": 10700
    },
    {
      "epoch": 6.156930126002291,
      "grad_norm": 0.13443826138973236,
      "learning_rate": 6.9215349369988545e-06,
      "loss": 0.021,
      "step": 10750
    },
    {
      "epoch": 6.185567010309279,
      "grad_norm": 47.05551528930664,
      "learning_rate": 6.907216494845361e-06,
      "loss": 0.0505,
      "step": 10800
    },
    {
      "epoch": 6.214203894616266,
      "grad_norm": 0.19994887709617615,
      "learning_rate": 6.892898052691867e-06,
      "loss": 0.044,
      "step": 10850
    },
    {
      "epoch": 6.242840778923253,
      "grad_norm": 6.800403118133545,
      "learning_rate": 6.878579610538374e-06,
      "loss": 0.0212,
      "step": 10900
    },
    {
      "epoch": 6.2714776632302405,
      "grad_norm": 0.18680407106876373,
      "learning_rate": 6.86426116838488e-06,
      "loss": 0.04,
      "step": 10950
    },
    {
      "epoch": 6.300114547537228,
      "grad_norm": 2.269585132598877,
      "learning_rate": 6.849942726231387e-06,
      "loss": 0.0385,
      "step": 11000
    },
    {
      "epoch": 6.328751431844215,
      "grad_norm": 0.05255923420190811,
      "learning_rate": 6.835624284077893e-06,
      "loss": 0.0364,
      "step": 11050
    },
    {
      "epoch": 6.357388316151202,
      "grad_norm": 0.026086222380399704,
      "learning_rate": 6.821305841924399e-06,
      "loss": 0.0462,
      "step": 11100
    },
    {
      "epoch": 6.3860252004581906,
      "grad_norm": 0.026915445923805237,
      "learning_rate": 6.806987399770906e-06,
      "loss": 0.0473,
      "step": 11150
    },
    {
      "epoch": 6.414662084765178,
      "grad_norm": 30.492359161376953,
      "learning_rate": 6.792668957617412e-06,
      "loss": 0.0508,
      "step": 11200
    },
    {
      "epoch": 6.443298969072165,
      "grad_norm": 0.030431082472205162,
      "learning_rate": 6.778350515463919e-06,
      "loss": 0.0555,
      "step": 11250
    },
    {
      "epoch": 6.4719358533791524,
      "grad_norm": 0.19676260650157928,
      "learning_rate": 6.764032073310425e-06,
      "loss": 0.0435,
      "step": 11300
    },
    {
      "epoch": 6.50057273768614,
      "grad_norm": 0.3724329471588135,
      "learning_rate": 6.7497136311569314e-06,
      "loss": 0.0209,
      "step": 11350
    },
    {
      "epoch": 6.529209621993127,
      "grad_norm": 0.07832983881235123,
      "learning_rate": 6.735395189003437e-06,
      "loss": 0.0368,
      "step": 11400
    },
    {
      "epoch": 6.557846506300114,
      "grad_norm": 1.6286810636520386,
      "learning_rate": 6.721076746849944e-06,
      "loss": 0.0546,
      "step": 11450
    },
    {
      "epoch": 6.586483390607102,
      "grad_norm": 10.142606735229492,
      "learning_rate": 6.706758304696449e-06,
      "loss": 0.0537,
      "step": 11500
    },
    {
      "epoch": 6.615120274914089,
      "grad_norm": 0.03260861337184906,
      "learning_rate": 6.692439862542955e-06,
      "loss": 0.0801,
      "step": 11550
    },
    {
      "epoch": 6.643757159221077,
      "grad_norm": 0.06774479895830154,
      "learning_rate": 6.678121420389461e-06,
      "loss": 0.0276,
      "step": 11600
    },
    {
      "epoch": 6.672394043528064,
      "grad_norm": 0.10083146393299103,
      "learning_rate": 6.663802978235968e-06,
      "loss": 0.0355,
      "step": 11650
    },
    {
      "epoch": 6.701030927835052,
      "grad_norm": 0.5465721487998962,
      "learning_rate": 6.649484536082474e-06,
      "loss": 0.0531,
      "step": 11700
    },
    {
      "epoch": 6.729667812142039,
      "grad_norm": 0.36361464858055115,
      "learning_rate": 6.635166093928981e-06,
      "loss": 0.0534,
      "step": 11750
    },
    {
      "epoch": 6.758304696449026,
      "grad_norm": 0.3213222622871399,
      "learning_rate": 6.620847651775487e-06,
      "loss": 0.021,
      "step": 11800
    },
    {
      "epoch": 6.786941580756014,
      "grad_norm": 1.3060314655303955,
      "learning_rate": 6.606529209621994e-06,
      "loss": 0.0371,
      "step": 11850
    },
    {
      "epoch": 6.815578465063001,
      "grad_norm": 15.44947338104248,
      "learning_rate": 6.5922107674685e-06,
      "loss": 0.0475,
      "step": 11900
    },
    {
      "epoch": 6.844215349369988,
      "grad_norm": 0.1092597097158432,
      "learning_rate": 6.577892325315006e-06,
      "loss": 0.0549,
      "step": 11950
    },
    {
      "epoch": 6.872852233676976,
      "grad_norm": 0.24638314545154572,
      "learning_rate": 6.563573883161513e-06,
      "loss": 0.044,
      "step": 12000
    },
    {
      "epoch": 6.901489117983964,
      "grad_norm": 0.17163485288619995,
      "learning_rate": 6.549255441008019e-06,
      "loss": 0.0478,
      "step": 12050
    },
    {
      "epoch": 6.930126002290951,
      "grad_norm": 0.5006637573242188,
      "learning_rate": 6.5349369988545254e-06,
      "loss": 0.0595,
      "step": 12100
    },
    {
      "epoch": 6.958762886597938,
      "grad_norm": 0.09898879379034042,
      "learning_rate": 6.520618556701031e-06,
      "loss": 0.0415,
      "step": 12150
    },
    {
      "epoch": 6.987399770904926,
      "grad_norm": 0.8853304386138916,
      "learning_rate": 6.506300114547538e-06,
      "loss": 0.023,
      "step": 12200
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.7718902668169861,
      "eval_f1": 0.772007503464447,
      "eval_loss": 1.2936097383499146,
      "eval_runtime": 36.2937,
      "eval_samples_per_second": 73.319,
      "eval_steps_per_second": 9.175,
      "step": 12222
    }
  ],
  "logging_steps": 50,
  "max_steps": 34920,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.283848509009536e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}