{ "best_global_step": 2072, "best_metric": 89.937106918239, "best_model_checkpoint": "/data/hungnm/unisentiment/roberta-base-sentiment/checkpoint-2072", "epoch": 50.0, "eval_steps": 500, "global_step": 2800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08928571428571429, "grad_norm": 59.24269104003906, "learning_rate": 8.92857142857143e-06, "loss": 2.85, "step": 5 }, { "epoch": 0.17857142857142858, "grad_norm": 29.214595794677734, "learning_rate": 1.785714285714286e-05, "loss": 2.3363, "step": 10 }, { "epoch": 0.26785714285714285, "grad_norm": 22.542577743530273, "learning_rate": 2.6785714285714288e-05, "loss": 2.4922, "step": 15 }, { "epoch": 0.35714285714285715, "grad_norm": 142.14141845703125, "learning_rate": 3.571428571428572e-05, "loss": 2.0449, "step": 20 }, { "epoch": 0.44642857142857145, "grad_norm": 7.237235069274902, "learning_rate": 4.464285714285715e-05, "loss": 1.827, "step": 25 }, { "epoch": 0.5357142857142857, "grad_norm": 7.419255256652832, "learning_rate": 4.999993577810563e-05, "loss": 1.6313, "step": 30 }, { "epoch": 0.625, "grad_norm": 6.396734714508057, "learning_rate": 4.999921328558333e-05, "loss": 1.6582, "step": 35 }, { "epoch": 0.7142857142857143, "grad_norm": 10.179349899291992, "learning_rate": 4.999768804644796e-05, "loss": 1.766, "step": 40 }, { "epoch": 0.8035714285714286, "grad_norm": 4.080478191375732, "learning_rate": 4.9995360109676296e-05, "loss": 1.6039, "step": 45 }, { "epoch": 0.8928571428571429, "grad_norm": 46.95652389526367, "learning_rate": 4.999222955002041e-05, "loss": 1.7658, "step": 50 }, { "epoch": 0.9821428571428571, "grad_norm": 13.342621803283691, "learning_rate": 4.998829646800533e-05, "loss": 1.541, "step": 55 }, { "epoch": 1.0, "eval_loss": 0.34518229961395264, "eval_macro_f1": 78.41773492091933, "eval_macro_precision": 86.07313432835821, "eval_macro_recall": 75.09860202167894, "eval_micro_f1": 85.84905660377359, "eval_micro_precision": 85.84905660377359, "eval_micro_recall": 85.84905660377359, "eval_runtime": 10.6756, "eval_samples_per_second": 148.938, "eval_steps_per_second": 2.342, "step": 56 }, { "epoch": 1.0714285714285714, "grad_norm": 20.603862762451172, "learning_rate": 4.9983560989925736e-05, "loss": 1.3594, "step": 60 }, { "epoch": 1.1607142857142858, "grad_norm": 8.545742988586426, "learning_rate": 4.9978023267841994e-05, "loss": 1.3447, "step": 65 }, { "epoch": 1.25, "grad_norm": 7.969589710235596, "learning_rate": 4.99716834795752e-05, "loss": 1.3035, "step": 70 }, { "epoch": 1.3392857142857144, "grad_norm": 37.12427520751953, "learning_rate": 4.9964541828701506e-05, "loss": 1.2727, "step": 75 }, { "epoch": 1.4285714285714286, "grad_norm": 61.47677993774414, "learning_rate": 4.9956598544545566e-05, "loss": 1.4631, "step": 80 }, { "epoch": 1.5178571428571428, "grad_norm": 20.555511474609375, "learning_rate": 4.994785388217318e-05, "loss": 1.7768, "step": 85 }, { "epoch": 1.6071428571428572, "grad_norm": 19.720369338989258, "learning_rate": 4.993830812238311e-05, "loss": 1.4105, "step": 90 }, { "epoch": 1.6964285714285714, "grad_norm": 11.87168025970459, "learning_rate": 4.9927961571698064e-05, "loss": 1.2576, "step": 95 }, { "epoch": 1.7857142857142856, "grad_norm": 7.716609001159668, "learning_rate": 4.991681456235483e-05, "loss": 1.3186, "step": 100 }, { "epoch": 1.875, "grad_norm": 4.707287788391113, "learning_rate": 4.990486745229364e-05, "loss": 1.2502, "step": 105 }, { "epoch": 1.9642857142857144, "grad_norm": 7.120730400085449, "learning_rate": 4.989212062514664e-05, "loss": 1.0652, "step": 110 }, { "epoch": 2.0, "eval_loss": 0.3161654770374298, "eval_macro_f1": 82.51917393751759, "eval_macro_precision": 84.85169367165287, "eval_macro_recall": 80.82915005991929, "eval_micro_f1": 87.42138364779875, "eval_micro_precision": 87.42138364779875, "eval_micro_recall": 87.42138364779875, "eval_runtime": 1.9934, "eval_samples_per_second": 797.637, "eval_steps_per_second": 12.541, "step": 112 }, { "epoch": 2.0535714285714284, "grad_norm": 9.230934143066406, "learning_rate": 4.987857449022561e-05, "loss": 1.0412, "step": 115 }, { "epoch": 2.142857142857143, "grad_norm": 4.535208225250244, "learning_rate": 4.9864229482508804e-05, "loss": 1.0646, "step": 120 }, { "epoch": 2.232142857142857, "grad_norm": 39.12550354003906, "learning_rate": 4.984908606262696e-05, "loss": 1.0901, "step": 125 }, { "epoch": 2.3214285714285716, "grad_norm": 18.9006404876709, "learning_rate": 4.983314471684853e-05, "loss": 1.165, "step": 130 }, { "epoch": 2.4107142857142856, "grad_norm": 5.734167098999023, "learning_rate": 4.9816405957064106e-05, "loss": 1.0594, "step": 135 }, { "epoch": 2.5, "grad_norm": 16.50884437561035, "learning_rate": 4.9798870320769886e-05, "loss": 1.0566, "step": 140 }, { "epoch": 2.5892857142857144, "grad_norm": 48.42763900756836, "learning_rate": 4.97805383710505e-05, "loss": 1.383, "step": 145 }, { "epoch": 2.678571428571429, "grad_norm": 19.594017028808594, "learning_rate": 4.976141069656091e-05, "loss": 1.2805, "step": 150 }, { "epoch": 2.767857142857143, "grad_norm": 4.824181079864502, "learning_rate": 4.974148791150746e-05, "loss": 1.0623, "step": 155 }, { "epoch": 2.857142857142857, "grad_norm": 11.474513053894043, "learning_rate": 4.972077065562821e-05, "loss": 1.0732, "step": 160 }, { "epoch": 2.946428571428571, "grad_norm": 17.615800857543945, "learning_rate": 4.96992595941724e-05, "loss": 1.0885, "step": 165 }, { "epoch": 3.0, "eval_loss": 0.2910524904727936, "eval_macro_f1": 83.94523203683508, "eval_macro_precision": 84.81357128694967, "eval_macro_recall": 83.18583703199087, "eval_micro_f1": 88.0503144654088, "eval_micro_precision": 88.0503144654088, "eval_micro_recall": 88.0503144654088, "eval_runtime": 1.8143, "eval_samples_per_second": 876.376, "eval_steps_per_second": 13.78, "step": 168 }, { "epoch": 3.0357142857142856, "grad_norm": 9.219614028930664, "learning_rate": 4.967695541787901e-05, "loss": 1.0449, "step": 170 }, { "epoch": 3.125, "grad_norm": 11.528852462768555, "learning_rate": 4.965385884295467e-05, "loss": 0.8327, "step": 175 }, { "epoch": 3.2142857142857144, "grad_norm": 14.702798843383789, "learning_rate": 4.96299706110506e-05, "loss": 0.8543, "step": 180 }, { "epoch": 3.3035714285714284, "grad_norm": 9.77267837524414, "learning_rate": 4.960529148923884e-05, "loss": 1.0777, "step": 185 }, { "epoch": 3.392857142857143, "grad_norm": 11.903849601745605, "learning_rate": 4.9579822269987574e-05, "loss": 1.111, "step": 190 }, { "epoch": 3.482142857142857, "grad_norm": 15.278186798095703, "learning_rate": 4.955356377113574e-05, "loss": 0.8274, "step": 195 }, { "epoch": 3.571428571428571, "grad_norm": 11.262117385864258, "learning_rate": 4.952651683586668e-05, "loss": 0.8345, "step": 200 }, { "epoch": 3.6607142857142856, "grad_norm": 13.382967948913574, "learning_rate": 4.9498682332681174e-05, "loss": 0.6874, "step": 205 }, { "epoch": 3.75, "grad_norm": 6.932016849517822, "learning_rate": 4.947006115536947e-05, "loss": 0.7483, "step": 210 }, { "epoch": 3.8392857142857144, "grad_norm": 14.735459327697754, "learning_rate": 4.944065422298262e-05, "loss": 0.8449, "step": 215 }, { "epoch": 3.928571428571429, "grad_norm": 7.518039703369141, "learning_rate": 4.9410462479802945e-05, "loss": 0.8368, "step": 220 }, { "epoch": 4.0, "eval_loss": 0.28605297207832336, "eval_macro_f1": 83.79635460918196, "eval_macro_precision": 88.60881482037983, "eval_macro_recall": 80.95314249160404, "eval_micro_f1": 88.80503144654088, "eval_micro_precision": 88.80503144654088, "eval_micro_recall": 88.80503144654088, "eval_runtime": 1.9349, "eval_samples_per_second": 821.746, "eval_steps_per_second": 12.921, "step": 224 }, { "epoch": 4.017857142857143, "grad_norm": 16.081928253173828, "learning_rate": 4.937948689531373e-05, "loss": 0.7979, "step": 225 }, { "epoch": 4.107142857142857, "grad_norm": 7.138861179351807, "learning_rate": 4.934772846416812e-05, "loss": 0.5874, "step": 230 }, { "epoch": 4.196428571428571, "grad_norm": 18.04113006591797, "learning_rate": 4.931518820615711e-05, "loss": 0.5545, "step": 235 }, { "epoch": 4.285714285714286, "grad_norm": 13.751228332519531, "learning_rate": 4.928186716617686e-05, "loss": 0.5696, "step": 240 }, { "epoch": 4.375, "grad_norm": 17.97528839111328, "learning_rate": 4.924776641419513e-05, "loss": 0.625, "step": 245 }, { "epoch": 4.464285714285714, "grad_norm": 6.758862495422363, "learning_rate": 4.921288704521689e-05, "loss": 0.6494, "step": 250 }, { "epoch": 4.553571428571429, "grad_norm": 39.63971710205078, "learning_rate": 4.917723017924921e-05, "loss": 0.7084, "step": 255 }, { "epoch": 4.642857142857143, "grad_norm": 22.54784393310547, "learning_rate": 4.914079696126526e-05, "loss": 0.6685, "step": 260 }, { "epoch": 4.732142857142857, "grad_norm": 17.557443618774414, "learning_rate": 4.910358856116752e-05, "loss": 0.6967, "step": 265 }, { "epoch": 4.821428571428571, "grad_norm": 12.355552673339844, "learning_rate": 4.90656061737503e-05, "loss": 0.7881, "step": 270 }, { "epoch": 4.910714285714286, "grad_norm": 14.7780179977417, "learning_rate": 4.90268510186613e-05, "loss": 0.6595, "step": 275 }, { "epoch": 5.0, "grad_norm": 16.71040153503418, "learning_rate": 4.898732434036244e-05, "loss": 0.7777, "step": 280 }, { "epoch": 5.0, "eval_loss": 0.2805473804473877, "eval_macro_f1": 85.35613362920841, "eval_macro_precision": 87.13597361085554, "eval_macro_recall": 83.9505608736378, "eval_micro_f1": 89.30817610062893, "eval_micro_precision": 89.30817610062893, "eval_micro_recall": 89.30817610062893, "eval_runtime": 1.8728, "eval_samples_per_second": 849.0, "eval_steps_per_second": 13.349, "step": 280 }, { "epoch": 5.089285714285714, "grad_norm": 12.743489265441895, "learning_rate": 4.894702740808995e-05, "loss": 0.4128, "step": 285 }, { "epoch": 5.178571428571429, "grad_norm": 19.04743766784668, "learning_rate": 4.8905961515813604e-05, "loss": 0.477, "step": 290 }, { "epoch": 5.267857142857143, "grad_norm": 24.844810485839844, "learning_rate": 4.886412798219512e-05, "loss": 0.4719, "step": 295 }, { "epoch": 5.357142857142857, "grad_norm": 9.876107215881348, "learning_rate": 4.882152815054587e-05, "loss": 0.4332, "step": 300 }, { "epoch": 5.446428571428571, "grad_norm": 25.508865356445312, "learning_rate": 4.8778163388783724e-05, "loss": 0.4225, "step": 305 }, { "epoch": 5.535714285714286, "grad_norm": 12.033214569091797, "learning_rate": 4.8734035089389115e-05, "loss": 0.5101, "step": 310 }, { "epoch": 5.625, "grad_norm": 11.438920974731445, "learning_rate": 4.8689144669360375e-05, "loss": 0.4257, "step": 315 }, { "epoch": 5.714285714285714, "grad_norm": 11.853082656860352, "learning_rate": 4.864349357016815e-05, "loss": 0.4271, "step": 320 }, { "epoch": 5.803571428571429, "grad_norm": 12.522577285766602, "learning_rate": 4.8597083257709194e-05, "loss": 0.538, "step": 325 }, { "epoch": 5.892857142857143, "grad_norm": 6.630044937133789, "learning_rate": 4.854991522225923e-05, "loss": 0.4855, "step": 330 }, { "epoch": 5.982142857142857, "grad_norm": 8.849501609802246, "learning_rate": 4.850199097842517e-05, "loss": 0.4158, "step": 335 }, { "epoch": 6.0, "eval_loss": 0.35284000635147095, "eval_macro_f1": 84.31737482203201, "eval_macro_precision": 85.56294653855629, "eval_macro_recall": 83.27826020133713, "eval_micro_f1": 88.42767295597484, "eval_micro_precision": 88.42767295597484, "eval_micro_recall": 88.42767295597484, "eval_runtime": 1.8262, "eval_samples_per_second": 870.648, "eval_steps_per_second": 13.689, "step": 336 }, { "epoch": 6.071428571428571, "grad_norm": 10.551375389099121, "learning_rate": 4.84533120650964e-05, "loss": 0.2718, "step": 340 }, { "epoch": 6.160714285714286, "grad_norm": 11.759309768676758, "learning_rate": 4.8403880045395434e-05, "loss": 0.2064, "step": 345 }, { "epoch": 6.25, "grad_norm": 11.094610214233398, "learning_rate": 4.835369650662767e-05, "loss": 0.2482, "step": 350 }, { "epoch": 6.339285714285714, "grad_norm": 18.329065322875977, "learning_rate": 4.8302763060230446e-05, "loss": 0.2556, "step": 355 }, { "epoch": 6.428571428571429, "grad_norm": 10.95065975189209, "learning_rate": 4.825108134172131e-05, "loss": 0.318, "step": 360 }, { "epoch": 6.517857142857143, "grad_norm": 17.075756072998047, "learning_rate": 4.819865301064545e-05, "loss": 0.2354, "step": 365 }, { "epoch": 6.607142857142857, "grad_norm": 10.705339431762695, "learning_rate": 4.814547975052245e-05, "loss": 0.2294, "step": 370 }, { "epoch": 6.696428571428571, "grad_norm": 31.16196632385254, "learning_rate": 4.8091563268792236e-05, "loss": 0.2385, "step": 375 }, { "epoch": 6.785714285714286, "grad_norm": 15.710704803466797, "learning_rate": 4.803690529676019e-05, "loss": 0.3026, "step": 380 }, { "epoch": 6.875, "grad_norm": 22.431447982788086, "learning_rate": 4.798150758954164e-05, "loss": 0.3048, "step": 385 }, { "epoch": 6.964285714285714, "grad_norm": 10.632715225219727, "learning_rate": 4.7925371926005435e-05, "loss": 0.3086, "step": 390 }, { "epoch": 7.0, "eval_loss": 0.41216832399368286, "eval_macro_f1": 85.17808273905835, "eval_macro_precision": 88.20624434584586, "eval_macro_recall": 83.06807537576768, "eval_micro_f1": 89.43396226415095, "eval_micro_precision": 89.43396226415095, "eval_micro_recall": 89.43396226415095, "eval_runtime": 1.8458, "eval_samples_per_second": 861.394, "eval_steps_per_second": 13.544, "step": 392 }, { "epoch": 7.053571428571429, "grad_norm": 11.026453971862793, "learning_rate": 4.786850010871684e-05, "loss": 0.221, "step": 395 }, { "epoch": 7.142857142857143, "grad_norm": 19.100629806518555, "learning_rate": 4.781089396387968e-05, "loss": 0.1621, "step": 400 }, { "epoch": 7.232142857142857, "grad_norm": 17.89957618713379, "learning_rate": 4.775255534127766e-05, "loss": 0.2228, "step": 405 }, { "epoch": 7.321428571428571, "grad_norm": 11.095701217651367, "learning_rate": 4.7693486114215015e-05, "loss": 0.1461, "step": 410 }, { "epoch": 7.410714285714286, "grad_norm": 56.87965393066406, "learning_rate": 4.76336881794563e-05, "loss": 0.3093, "step": 415 }, { "epoch": 7.5, "grad_norm": 18.552824020385742, "learning_rate": 4.7573163457165534e-05, "loss": 0.3726, "step": 420 }, { "epoch": 7.589285714285714, "grad_norm": 28.140094757080078, "learning_rate": 4.75119138908445e-05, "loss": 0.2765, "step": 425 }, { "epoch": 7.678571428571429, "grad_norm": 10.527276039123535, "learning_rate": 4.744994144727036e-05, "loss": 0.1934, "step": 430 }, { "epoch": 7.767857142857143, "grad_norm": 5.746723651885986, "learning_rate": 4.738724811643252e-05, "loss": 0.1292, "step": 435 }, { "epoch": 7.857142857142857, "grad_norm": 12.251644134521484, "learning_rate": 4.732383591146869e-05, "loss": 0.1795, "step": 440 }, { "epoch": 7.946428571428571, "grad_norm": 8.05550765991211, "learning_rate": 4.725970686860025e-05, "loss": 0.191, "step": 445 }, { "epoch": 8.0, "eval_loss": 0.49135711789131165, "eval_macro_f1": 84.5839261475176, "eval_macro_precision": 86.58899167373744, "eval_macro_recall": 83.04834458680612, "eval_micro_f1": 88.80503144654088, "eval_micro_precision": 88.80503144654088, "eval_micro_recall": 88.80503144654088, "eval_runtime": 1.8149, "eval_samples_per_second": 876.068, "eval_steps_per_second": 13.775, "step": 448 }, { "epoch": 8.035714285714286, "grad_norm": 10.807100296020508, "learning_rate": 4.719486304706687e-05, "loss": 0.1643, "step": 450 }, { "epoch": 8.125, "grad_norm": 8.784672737121582, "learning_rate": 4.712930652906041e-05, "loss": 0.1144, "step": 455 }, { "epoch": 8.214285714285714, "grad_norm": 18.46906280517578, "learning_rate": 4.7063039419658035e-05, "loss": 0.0868, "step": 460 }, { "epoch": 8.303571428571429, "grad_norm": 6.650496959686279, "learning_rate": 4.699606384675459e-05, "loss": 0.1557, "step": 465 }, { "epoch": 8.392857142857142, "grad_norm": 27.389806747436523, "learning_rate": 4.6928381960994336e-05, "loss": 0.1858, "step": 470 }, { "epoch": 8.482142857142858, "grad_norm": 11.773507118225098, "learning_rate": 4.6859995935701855e-05, "loss": 0.1233, "step": 475 }, { "epoch": 8.571428571428571, "grad_norm": 16.25447654724121, "learning_rate": 4.679090796681225e-05, "loss": 0.1306, "step": 480 }, { "epoch": 8.660714285714286, "grad_norm": 14.601356506347656, "learning_rate": 4.6721120272800646e-05, "loss": 0.0961, "step": 485 }, { "epoch": 8.75, "grad_norm": 9.302750587463379, "learning_rate": 4.665063509461097e-05, "loss": 0.1043, "step": 490 }, { "epoch": 8.839285714285714, "grad_norm": 52.55154800415039, "learning_rate": 4.657945469558397e-05, "loss": 0.1102, "step": 495 }, { "epoch": 8.928571428571429, "grad_norm": 24.64861488342285, "learning_rate": 4.6507581361384537e-05, "loss": 0.1652, "step": 500 }, { "epoch": 9.0, "eval_loss": 0.5782527327537537, "eval_macro_f1": 83.94912174439733, "eval_macro_precision": 85.74556651650795, "eval_macro_recall": 82.54905177982101, "eval_micro_f1": 88.30188679245283, "eval_micro_precision": 88.30188679245283, "eval_micro_recall": 88.30188679245283, "eval_runtime": 1.916, "eval_samples_per_second": 829.87, "eval_steps_per_second": 13.048, "step": 504 }, { "epoch": 9.017857142857142, "grad_norm": 2.140636920928955, "learning_rate": 4.643501739992833e-05, "loss": 0.1599, "step": 505 }, { "epoch": 9.107142857142858, "grad_norm": 14.48595905303955, "learning_rate": 4.6361765141307645e-05, "loss": 0.1669, "step": 510 }, { "epoch": 9.196428571428571, "grad_norm": 18.363910675048828, "learning_rate": 4.628782693771659e-05, "loss": 0.1088, "step": 515 }, { "epoch": 9.285714285714286, "grad_norm": 3.3701069355010986, "learning_rate": 4.6213205163375586e-05, "loss": 0.0675, "step": 520 }, { "epoch": 9.375, "grad_norm": 14.012438774108887, "learning_rate": 4.613790221445511e-05, "loss": 0.0949, "step": 525 }, { "epoch": 9.464285714285714, "grad_norm": 7.062801361083984, "learning_rate": 4.6061920508998735e-05, "loss": 0.182, "step": 530 }, { "epoch": 9.553571428571429, "grad_norm": 18.400386810302734, "learning_rate": 4.59852624868455e-05, "loss": 0.2805, "step": 535 }, { "epoch": 9.642857142857142, "grad_norm": 11.67214298248291, "learning_rate": 4.5907930609551584e-05, "loss": 0.089, "step": 540 }, { "epoch": 9.732142857142858, "grad_norm": 18.16691017150879, "learning_rate": 4.582992736031123e-05, "loss": 0.1596, "step": 545 }, { "epoch": 9.821428571428571, "grad_norm": 6.478634834289551, "learning_rate": 4.5751255243877015e-05, "loss": 0.1941, "step": 550 }, { "epoch": 9.910714285714286, "grad_norm": 5.8572096824646, "learning_rate": 4.567191678647945e-05, "loss": 0.152, "step": 555 }, { "epoch": 10.0, "grad_norm": 28.061464309692383, "learning_rate": 4.559191453574582e-05, "loss": 0.1177, "step": 560 }, { "epoch": 10.0, "eval_loss": 0.5562991499900818, "eval_macro_f1": 83.77790670583238, "eval_macro_precision": 83.0857567614838, "eval_macro_recall": 84.57436534359611, "eval_micro_f1": 87.35849056603774, "eval_micro_precision": 87.35849056603774, "eval_micro_recall": 87.35849056603774, "eval_runtime": 2.3477, "eval_samples_per_second": 677.264, "eval_steps_per_second": 10.649, "step": 560 }, { "epoch": 10.089285714285714, "grad_norm": 7.564888954162598, "learning_rate": 4.55112510606184e-05, "loss": 0.0341, "step": 565 }, { "epoch": 10.178571428571429, "grad_norm": 8.534261703491211, "learning_rate": 4.542992895127195e-05, "loss": 0.0521, "step": 570 }, { "epoch": 10.267857142857142, "grad_norm": 13.397907257080078, "learning_rate": 4.534795081903056e-05, "loss": 0.0723, "step": 575 }, { "epoch": 10.357142857142858, "grad_norm": 22.610706329345703, "learning_rate": 4.526531929628379e-05, "loss": 0.1207, "step": 580 }, { "epoch": 10.446428571428571, "grad_norm": 7.134080410003662, "learning_rate": 4.518203703640214e-05, "loss": 0.056, "step": 585 }, { "epoch": 10.535714285714286, "grad_norm": 12.124205589294434, "learning_rate": 4.5098106713651846e-05, "loss": 0.1325, "step": 590 }, { "epoch": 10.625, "grad_norm": 4.9503583908081055, "learning_rate": 4.5013531023109014e-05, "loss": 0.1044, "step": 595 }, { "epoch": 10.714285714285714, "grad_norm": 19.115802764892578, "learning_rate": 4.4928312680573064e-05, "loss": 0.0675, "step": 600 }, { "epoch": 10.803571428571429, "grad_norm": 18.239246368408203, "learning_rate": 4.484245442247955e-05, "loss": 0.1275, "step": 605 }, { "epoch": 10.892857142857142, "grad_norm": 12.322056770324707, "learning_rate": 4.4755959005812256e-05, "loss": 0.1087, "step": 610 }, { "epoch": 10.982142857142858, "grad_norm": 10.249615669250488, "learning_rate": 4.4668829208014705e-05, "loss": 0.1236, "step": 615 }, { "epoch": 11.0, "eval_loss": 0.7119177579879761, "eval_macro_f1": 82.11289781379863, "eval_macro_precision": 80.6222110582464, "eval_macro_recall": 84.43458828074213, "eval_micro_f1": 85.47169811320755, "eval_micro_precision": 85.47169811320755, "eval_micro_recall": 85.47169811320755, "eval_runtime": 2.1826, "eval_samples_per_second": 728.484, "eval_steps_per_second": 11.454, "step": 616 }, { "epoch": 11.071428571428571, "grad_norm": 7.2919440269470215, "learning_rate": 4.458106782690094e-05, "loss": 0.3132, "step": 620 }, { "epoch": 11.160714285714286, "grad_norm": 4.609331130981445, "learning_rate": 4.4492677680565696e-05, "loss": 0.0392, "step": 625 }, { "epoch": 11.25, "grad_norm": 11.323241233825684, "learning_rate": 4.440366160729392e-05, "loss": 0.0863, "step": 630 }, { "epoch": 11.339285714285714, "grad_norm": 7.759965896606445, "learning_rate": 4.431402246546962e-05, "loss": 0.0227, "step": 635 }, { "epoch": 11.428571428571429, "grad_norm": 10.826987266540527, "learning_rate": 4.422376313348405e-05, "loss": 0.0385, "step": 640 }, { "epoch": 11.517857142857142, "grad_norm": 6.147857189178467, "learning_rate": 4.413288650964337e-05, "loss": 0.0684, "step": 645 }, { "epoch": 11.607142857142858, "grad_norm": 6.45582914352417, "learning_rate": 4.4041395512075464e-05, "loss": 0.0503, "step": 650 }, { "epoch": 11.696428571428571, "grad_norm": 23.845369338989258, "learning_rate": 4.394929307863633e-05, "loss": 0.0553, "step": 655 }, { "epoch": 11.785714285714286, "grad_norm": 11.343393325805664, "learning_rate": 4.385658216681569e-05, "loss": 0.0788, "step": 660 }, { "epoch": 11.875, "grad_norm": 9.691651344299316, "learning_rate": 4.3763265753642055e-05, "loss": 0.1661, "step": 665 }, { "epoch": 11.964285714285714, "grad_norm": 33.286651611328125, "learning_rate": 4.36693468355871e-05, "loss": 0.058, "step": 670 }, { "epoch": 12.0, "eval_loss": 0.6721820831298828, "eval_macro_f1": 84.28322715184908, "eval_macro_precision": 85.15999991284815, "eval_macro_recall": 83.51606813145274, "eval_micro_f1": 88.30188679245283, "eval_micro_precision": 88.30188679245283, "eval_micro_recall": 88.30188679245283, "eval_runtime": 2.0425, "eval_samples_per_second": 778.444, "eval_steps_per_second": 12.24, "step": 672 }, { "epoch": 12.053571428571429, "grad_norm": 1.1854312419891357, "learning_rate": 4.357482842846946e-05, "loss": 0.0744, "step": 675 }, { "epoch": 12.142857142857142, "grad_norm": 13.661476135253906, "learning_rate": 4.3479713567357886e-05, "loss": 0.0436, "step": 680 }, { "epoch": 12.232142857142858, "grad_norm": 9.265774726867676, "learning_rate": 4.338400530647382e-05, "loss": 0.077, "step": 685 }, { "epoch": 12.321428571428571, "grad_norm": 1.9117738008499146, "learning_rate": 4.328770671909323e-05, "loss": 0.0637, "step": 690 }, { "epoch": 12.410714285714286, "grad_norm": 10.00926399230957, "learning_rate": 4.319082089744804e-05, "loss": 0.0254, "step": 695 }, { "epoch": 12.5, "grad_norm": 9.133126258850098, "learning_rate": 4.309335095262676e-05, "loss": 0.0579, "step": 700 }, { "epoch": 12.589285714285714, "grad_norm": 12.192875862121582, "learning_rate": 4.299530001447459e-05, "loss": 0.0787, "step": 705 }, { "epoch": 12.678571428571429, "grad_norm": 9.46296501159668, "learning_rate": 4.2896671231492966e-05, "loss": 0.0822, "step": 710 }, { "epoch": 12.767857142857142, "grad_norm": 20.78971290588379, "learning_rate": 4.27974677707384e-05, "loss": 0.0967, "step": 715 }, { "epoch": 12.857142857142858, "grad_norm": 4.571549415588379, "learning_rate": 4.269769281772082e-05, "loss": 0.1071, "step": 720 }, { "epoch": 12.946428571428571, "grad_norm": 14.227160453796387, "learning_rate": 4.259734957630127e-05, "loss": 0.0767, "step": 725 }, { "epoch": 13.0, "eval_loss": 0.663281261920929, "eval_macro_f1": 84.36653757838053, "eval_macro_precision": 86.22744226866327, "eval_macro_recall": 82.9215483061637, "eval_micro_f1": 88.61635220125787, "eval_micro_precision": 88.61635220125787, "eval_micro_recall": 88.61635220125787, "eval_runtime": 1.9979, "eval_samples_per_second": 795.816, "eval_steps_per_second": 12.513, "step": 728 }, { "epoch": 13.035714285714286, "grad_norm": 9.426419258117676, "learning_rate": 4.2496441268589046e-05, "loss": 0.0781, "step": 730 }, { "epoch": 13.125, "grad_norm": 19.891582489013672, "learning_rate": 4.239497113483819e-05, "loss": 0.0603, "step": 735 }, { "epoch": 13.214285714285714, "grad_norm": 6.893115043640137, "learning_rate": 4.22929424333435e-05, "loss": 0.0334, "step": 740 }, { "epoch": 13.303571428571429, "grad_norm": 3.4693875312805176, "learning_rate": 4.219035844033583e-05, "loss": 0.0515, "step": 745 }, { "epoch": 13.392857142857142, "grad_norm": 9.117530822753906, "learning_rate": 4.208722244987698e-05, "loss": 0.0438, "step": 750 }, { "epoch": 13.482142857142858, "grad_norm": 7.665452480316162, "learning_rate": 4.198353777375384e-05, "loss": 0.0323, "step": 755 }, { "epoch": 13.571428571428571, "grad_norm": 9.480864524841309, "learning_rate": 4.187930774137209e-05, "loss": 0.04, "step": 760 }, { "epoch": 13.660714285714286, "grad_norm": 8.460432052612305, "learning_rate": 4.1774535699649255e-05, "loss": 0.035, "step": 765 }, { "epoch": 13.75, "grad_norm": 0.8143876791000366, "learning_rate": 4.166922501290729e-05, "loss": 0.0417, "step": 770 }, { "epoch": 13.839285714285714, "grad_norm": 18.344676971435547, "learning_rate": 4.156337906276449e-05, "loss": 0.1389, "step": 775 }, { "epoch": 13.928571428571429, "grad_norm": 15.893628120422363, "learning_rate": 4.145700124802693e-05, "loss": 0.0607, "step": 780 }, { "epoch": 14.0, "eval_loss": 0.6969339847564697, "eval_macro_f1": 85.3983643196325, "eval_macro_precision": 85.17815944629582, "eval_macro_recall": 85.62705485782409, "eval_micro_f1": 88.80503144654088, "eval_micro_precision": 88.80503144654088, "eval_micro_recall": 88.80503144654088, "eval_runtime": 2.0363, "eval_samples_per_second": 780.832, "eval_steps_per_second": 12.277, "step": 784 }, { "epoch": 14.017857142857142, "grad_norm": 3.4685308933258057, "learning_rate": 4.135009498457931e-05, "loss": 0.0951, "step": 785 }, { "epoch": 14.107142857142858, "grad_norm": 5.312774658203125, "learning_rate": 4.124266370527531e-05, "loss": 0.017, "step": 790 }, { "epoch": 14.196428571428571, "grad_norm": 16.61371421813965, "learning_rate": 4.11347108598273e-05, "loss": 0.0694, "step": 795 }, { "epoch": 14.285714285714286, "grad_norm": 0.9555211663246155, "learning_rate": 4.1026239914695617e-05, "loss": 0.016, "step": 800 }, { "epoch": 14.375, "grad_norm": 11.234779357910156, "learning_rate": 4.0917254352977206e-05, "loss": 0.0538, "step": 805 }, { "epoch": 14.464285714285714, "grad_norm": 21.127065658569336, "learning_rate": 4.0807757674293834e-05, "loss": 0.1221, "step": 810 }, { "epoch": 14.553571428571429, "grad_norm": 19.199129104614258, "learning_rate": 4.069775339467966e-05, "loss": 0.1065, "step": 815 }, { "epoch": 14.642857142857142, "grad_norm": 20.038087844848633, "learning_rate": 4.058724504646834e-05, "loss": 0.0733, "step": 820 }, { "epoch": 14.732142857142858, "grad_norm": 9.910551071166992, "learning_rate": 4.047623617817965e-05, "loss": 0.0645, "step": 825 }, { "epoch": 14.821428571428571, "grad_norm": 13.347238540649414, "learning_rate": 4.0364730354405475e-05, "loss": 0.1127, "step": 830 }, { "epoch": 14.910714285714286, "grad_norm": 39.92618942260742, "learning_rate": 4.0252731155695396e-05, "loss": 0.0883, "step": 835 }, { "epoch": 15.0, "grad_norm": 8.375712394714355, "learning_rate": 4.014024217844167e-05, "loss": 0.066, "step": 840 }, { "epoch": 15.0, "eval_loss": 0.9945361614227295, "eval_macro_f1": 83.19661865450335, "eval_macro_precision": 89.30070883315157, "eval_macro_recall": 79.93991455529917, "eval_micro_f1": 88.61635220125787, "eval_micro_precision": 88.61635220125787, "eval_micro_recall": 88.61635220125787, "eval_runtime": 1.747, "eval_samples_per_second": 910.121, "eval_steps_per_second": 14.31, "step": 840 }, { "epoch": 15.089285714285714, "grad_norm": 1.6275001764297485, "learning_rate": 4.0027267034763796e-05, "loss": 0.0499, "step": 845 }, { "epoch": 15.178571428571429, "grad_norm": 11.117130279541016, "learning_rate": 3.9913809352392474e-05, "loss": 0.0465, "step": 850 }, { "epoch": 15.267857142857142, "grad_norm": 1.5368372201919556, "learning_rate": 3.979987277455317e-05, "loss": 0.031, "step": 855 }, { "epoch": 15.357142857142858, "grad_norm": 2.8059964179992676, "learning_rate": 3.9685460959849105e-05, "loss": 0.0134, "step": 860 }, { "epoch": 15.446428571428571, "grad_norm": 0.37871724367141724, "learning_rate": 3.9570577582143756e-05, "loss": 0.026, "step": 865 }, { "epoch": 15.535714285714286, "grad_norm": 4.849483489990234, "learning_rate": 3.945522633044289e-05, "loss": 0.0582, "step": 870 }, { "epoch": 15.625, "grad_norm": 4.785881996154785, "learning_rate": 3.933941090877615e-05, "loss": 0.0239, "step": 875 }, { "epoch": 15.714285714285714, "grad_norm": 5.867705821990967, "learning_rate": 3.9223135036078064e-05, "loss": 0.0506, "step": 880 }, { "epoch": 15.803571428571429, "grad_norm": 5.988280296325684, "learning_rate": 3.910640244606863e-05, "loss": 0.0406, "step": 885 }, { "epoch": 15.892857142857142, "grad_norm": 10.76251220703125, "learning_rate": 3.898921688713346e-05, "loss": 0.033, "step": 890 }, { "epoch": 15.982142857142858, "grad_norm": 10.54697322845459, "learning_rate": 3.88715821222034e-05, "loss": 0.0474, "step": 895 }, { "epoch": 16.0, "eval_loss": 0.8277662992477417, "eval_macro_f1": 84.62665166292602, "eval_macro_precision": 84.3093535297127, "eval_macro_recall": 84.96264650110804, "eval_micro_f1": 88.17610062893083, "eval_micro_precision": 88.17610062893083, "eval_micro_recall": 88.17610062893083, "eval_runtime": 1.7038, "eval_samples_per_second": 933.188, "eval_steps_per_second": 14.673, "step": 896 }, { "epoch": 16.071428571428573, "grad_norm": 0.2526906728744507, "learning_rate": 3.875350192863368e-05, "loss": 0.028, "step": 900 }, { "epoch": 16.160714285714285, "grad_norm": 4.583995819091797, "learning_rate": 3.863498009808263e-05, "loss": 0.0262, "step": 905 }, { "epoch": 16.25, "grad_norm": 2.2302212715148926, "learning_rate": 3.851602043638994e-05, "loss": 0.0297, "step": 910 }, { "epoch": 16.339285714285715, "grad_norm": 4.950682163238525, "learning_rate": 3.839662676345445e-05, "loss": 0.0802, "step": 915 }, { "epoch": 16.428571428571427, "grad_norm": 1.306373953819275, "learning_rate": 3.827680291311143e-05, "loss": 0.0683, "step": 920 }, { "epoch": 16.517857142857142, "grad_norm": 3.978598117828369, "learning_rate": 3.81565527330096e-05, "loss": 0.0467, "step": 925 }, { "epoch": 16.607142857142858, "grad_norm": 31.76022720336914, "learning_rate": 3.803588008448745e-05, "loss": 0.0599, "step": 930 }, { "epoch": 16.696428571428573, "grad_norm": 10.791604042053223, "learning_rate": 3.791478884244931e-05, "loss": 0.0811, "step": 935 }, { "epoch": 16.785714285714285, "grad_norm": 7.506629467010498, "learning_rate": 3.7793282895240926e-05, "loss": 0.2063, "step": 940 }, { "epoch": 16.875, "grad_norm": 2.9035871028900146, "learning_rate": 3.767136614452458e-05, "loss": 0.1391, "step": 945 }, { "epoch": 16.964285714285715, "grad_norm": 7.189354419708252, "learning_rate": 3.75490425051538e-05, "loss": 0.0634, "step": 950 }, { "epoch": 17.0, "eval_loss": 0.7015231847763062, "eval_macro_f1": 83.68481902838367, "eval_macro_precision": 83.01537542916853, "eval_macro_recall": 84.45151522074599, "eval_micro_f1": 87.29559748427673, "eval_micro_precision": 87.29559748427673, "eval_micro_recall": 87.29559748427673, "eval_runtime": 1.6913, "eval_samples_per_second": 940.118, "eval_steps_per_second": 14.782, "step": 952 }, { "epoch": 17.053571428571427, "grad_norm": 3.729951858520508, "learning_rate": 3.7426315905047696e-05, "loss": 0.0609, "step": 955 }, { "epoch": 17.142857142857142, "grad_norm": 2.013429880142212, "learning_rate": 3.7303190285064776e-05, "loss": 0.0077, "step": 960 }, { "epoch": 17.232142857142858, "grad_norm": 1.032761573791504, "learning_rate": 3.717966959887643e-05, "loss": 0.0287, "step": 965 }, { "epoch": 17.321428571428573, "grad_norm": 10.677305221557617, "learning_rate": 3.705575781283999e-05, "loss": 0.0242, "step": 970 }, { "epoch": 17.410714285714285, "grad_norm": 3.170926809310913, "learning_rate": 3.6931458905871314e-05, "loss": 0.0576, "step": 975 }, { "epoch": 17.5, "grad_norm": 1.3387705087661743, "learning_rate": 3.680677686931707e-05, "loss": 0.0022, "step": 980 }, { "epoch": 17.589285714285715, "grad_norm": 8.100290298461914, "learning_rate": 3.668171570682655e-05, "loss": 0.0199, "step": 985 }, { "epoch": 17.678571428571427, "grad_norm": 4.04311990737915, "learning_rate": 3.6556279434223116e-05, "loss": 0.0149, "step": 990 }, { "epoch": 17.767857142857142, "grad_norm": 0.5880358815193176, "learning_rate": 3.6430472079375234e-05, "loss": 0.0169, "step": 995 }, { "epoch": 17.857142857142858, "grad_norm": 1.5214190483093262, "learning_rate": 3.6304297682067144e-05, "loss": 0.0209, "step": 1000 }, { "epoch": 17.946428571428573, "grad_norm": 8.436260223388672, "learning_rate": 3.617776029386916e-05, "loss": 0.0188, "step": 1005 }, { "epoch": 18.0, "eval_loss": 0.9059441089630127, "eval_macro_f1": 85.13226593607345, "eval_macro_precision": 85.78507737593169, "eval_macro_recall": 84.54113454113454, "eval_micro_f1": 88.86792452830188, "eval_micro_precision": 88.86792452830188, "eval_micro_recall": 88.86792452830188, "eval_runtime": 1.7254, "eval_samples_per_second": 921.533, "eval_steps_per_second": 14.49, "step": 1008 }, { "epoch": 18.035714285714285, "grad_norm": 0.06204601749777794, "learning_rate": 3.605086397800753e-05, "loss": 0.0242, "step": 1010 }, { "epoch": 18.125, "grad_norm": 0.5178263783454895, "learning_rate": 3.592361280923399e-05, "loss": 0.0073, "step": 1015 }, { "epoch": 18.214285714285715, "grad_norm": 2.0144951343536377, "learning_rate": 3.579601087369492e-05, "loss": 0.0149, "step": 1020 }, { "epoch": 18.303571428571427, "grad_norm": 1.788545846939087, "learning_rate": 3.566806226880012e-05, "loss": 0.0193, "step": 1025 }, { "epoch": 18.392857142857142, "grad_norm": 5.27187442779541, "learning_rate": 3.553977110309125e-05, "loss": 0.0089, "step": 1030 }, { "epoch": 18.482142857142858, "grad_norm": 0.5820537209510803, "learning_rate": 3.5411141496109904e-05, "loss": 0.0248, "step": 1035 }, { "epoch": 18.571428571428573, "grad_norm": 5.2609710693359375, "learning_rate": 3.5282177578265296e-05, "loss": 0.0329, "step": 1040 }, { "epoch": 18.660714285714285, "grad_norm": 9.395613670349121, "learning_rate": 3.5152883490701684e-05, "loss": 0.0277, "step": 1045 }, { "epoch": 18.75, "grad_norm": 16.66202735900879, "learning_rate": 3.502326338516534e-05, "loss": 0.035, "step": 1050 }, { "epoch": 18.839285714285715, "grad_norm": 4.464576721191406, "learning_rate": 3.48933214238713e-05, "loss": 0.0427, "step": 1055 }, { "epoch": 18.928571428571427, "grad_norm": 2.8455142974853516, "learning_rate": 3.476306177936961e-05, "loss": 0.028, "step": 1060 }, { "epoch": 19.0, "eval_loss": 0.9811861515045166, "eval_macro_f1": 85.30339277946933, "eval_macro_precision": 87.2576209004239, "eval_macro_recall": 83.78939148169917, "eval_micro_f1": 89.30817610062893, "eval_micro_precision": 89.30817610062893, "eval_micro_recall": 89.30817610062893, "eval_runtime": 1.8833, "eval_samples_per_second": 844.274, "eval_steps_per_second": 13.275, "step": 1064 }, { "epoch": 19.017857142857142, "grad_norm": 0.4936154782772064, "learning_rate": 3.463248863441145e-05, "loss": 0.0573, "step": 1065 }, { "epoch": 19.107142857142858, "grad_norm": 7.516551971435547, "learning_rate": 3.450160618181476e-05, "loss": 0.0142, "step": 1070 }, { "epoch": 19.196428571428573, "grad_norm": 0.28197282552719116, "learning_rate": 3.43704186243296e-05, "loss": 0.0059, "step": 1075 }, { "epoch": 19.285714285714285, "grad_norm": 0.0721740797162056, "learning_rate": 3.4238930174503245e-05, "loss": 0.0043, "step": 1080 }, { "epoch": 19.375, "grad_norm": 11.249062538146973, "learning_rate": 3.4107145054544857e-05, "loss": 0.0968, "step": 1085 }, { "epoch": 19.464285714285715, "grad_norm": 1.9606690406799316, "learning_rate": 3.3975067496189965e-05, "loss": 0.0169, "step": 1090 }, { "epoch": 19.553571428571427, "grad_norm": 16.80199432373047, "learning_rate": 3.3842701740564534e-05, "loss": 0.0422, "step": 1095 }, { "epoch": 19.642857142857142, "grad_norm": 14.884848594665527, "learning_rate": 3.37100520380488e-05, "loss": 0.0665, "step": 1100 }, { "epoch": 19.732142857142858, "grad_norm": 8.680991172790527, "learning_rate": 3.357712264814077e-05, "loss": 0.0675, "step": 1105 }, { "epoch": 19.821428571428573, "grad_norm": 4.685244560241699, "learning_rate": 3.344391783931947e-05, "loss": 0.0494, "step": 1110 }, { "epoch": 19.910714285714285, "grad_norm": 10.966636657714844, "learning_rate": 3.331044188890788e-05, "loss": 0.0193, "step": 1115 }, { "epoch": 20.0, "grad_norm": 18.55583381652832, "learning_rate": 3.3176699082935545e-05, "loss": 0.0704, "step": 1120 }, { "epoch": 20.0, "eval_loss": 0.9311222434043884, "eval_macro_f1": 84.58923756150028, "eval_macro_precision": 84.91129891883661, "eval_macro_recall": 84.28359582205735, "eval_micro_f1": 88.36477987421384, "eval_micro_precision": 88.36477987421384, "eval_micro_recall": 88.36477987421384, "eval_runtime": 1.7297, "eval_samples_per_second": 919.235, "eval_steps_per_second": 14.453, "step": 1120 }, { "epoch": 20.089285714285715, "grad_norm": 0.6181861758232117, "learning_rate": 3.304269371600099e-05, "loss": 0.0264, "step": 1125 }, { "epoch": 20.178571428571427, "grad_norm": 0.6055905818939209, "learning_rate": 3.290843009113382e-05, "loss": 0.0312, "step": 1130 }, { "epoch": 20.267857142857142, "grad_norm": 4.4057111740112305, "learning_rate": 3.277391251965649e-05, "loss": 0.0124, "step": 1135 }, { "epoch": 20.357142857142858, "grad_norm": 3.0049655437469482, "learning_rate": 3.263914532104593e-05, "loss": 0.0175, "step": 1140 }, { "epoch": 20.446428571428573, "grad_norm": 10.01473331451416, "learning_rate": 3.250413282279482e-05, "loss": 0.0172, "step": 1145 }, { "epoch": 20.535714285714285, "grad_norm": 3.3975746631622314, "learning_rate": 3.2368879360272606e-05, "loss": 0.0223, "step": 1150 }, { "epoch": 20.625, "grad_norm": 3.1504733562469482, "learning_rate": 3.223338927658632e-05, "loss": 0.0046, "step": 1155 }, { "epoch": 20.714285714285715, "grad_norm": 7.759596347808838, "learning_rate": 3.20976669224411e-05, "loss": 0.0194, "step": 1160 }, { "epoch": 20.803571428571427, "grad_norm": 2.1500484943389893, "learning_rate": 3.196171665600051e-05, "loss": 0.0087, "step": 1165 }, { "epoch": 20.892857142857142, "grad_norm": 3.8775603771209717, "learning_rate": 3.182554284274654e-05, "loss": 0.0191, "step": 1170 }, { "epoch": 20.982142857142858, "grad_norm": 5.29668664932251, "learning_rate": 3.1689149855339496e-05, "loss": 0.0363, "step": 1175 }, { "epoch": 21.0, "eval_loss": 0.9204599261283875, "eval_macro_f1": 85.419711590922, "eval_macro_precision": 84.96998284734134, "eval_macro_recall": 85.90712821482052, "eval_micro_f1": 88.74213836477988, "eval_micro_precision": 88.74213836477988, "eval_micro_recall": 88.74213836477988, "eval_runtime": 1.7455, "eval_samples_per_second": 910.893, "eval_steps_per_second": 14.322, "step": 1176 }, { "epoch": 21.071428571428573, "grad_norm": 1.5591216087341309, "learning_rate": 3.1552542073477555e-05, "loss": 0.0155, "step": 1180 }, { "epoch": 21.160714285714285, "grad_norm": 11.346221923828125, "learning_rate": 3.141572388375612e-05, "loss": 0.0071, "step": 1185 }, { "epoch": 21.25, "grad_norm": 0.09788035601377487, "learning_rate": 3.127869967952698e-05, "loss": 0.0172, "step": 1190 }, { "epoch": 21.339285714285715, "grad_norm": 0.4548446238040924, "learning_rate": 3.114147386075724e-05, "loss": 0.0103, "step": 1195 }, { "epoch": 21.428571428571427, "grad_norm": 16.57025718688965, "learning_rate": 3.1004050833887985e-05, "loss": 0.0392, "step": 1200 }, { "epoch": 21.517857142857142, "grad_norm": 1.1993194818496704, "learning_rate": 3.0866435011692885e-05, "loss": 0.025, "step": 1205 }, { "epoch": 21.607142857142858, "grad_norm": 1.881464958190918, "learning_rate": 3.072863081313639e-05, "loss": 0.0096, "step": 1210 }, { "epoch": 21.696428571428573, "grad_norm": 13.144051551818848, "learning_rate": 3.05906426632319e-05, "loss": 0.0171, "step": 1215 }, { "epoch": 21.785714285714285, "grad_norm": 0.2325822114944458, "learning_rate": 3.0452474992899643e-05, "loss": 0.0099, "step": 1220 }, { "epoch": 21.875, "grad_norm": 1.384522557258606, "learning_rate": 3.0314132238824415e-05, "loss": 0.0126, "step": 1225 }, { "epoch": 21.964285714285715, "grad_norm": 0.3896070718765259, "learning_rate": 3.017561884331311e-05, "loss": 0.0025, "step": 1230 }, { "epoch": 22.0, "eval_loss": 0.9775845408439636, "eval_macro_f1": 85.79642633816226, "eval_macro_precision": 87.86862854659465, "eval_macro_recall": 84.20415343492267, "eval_micro_f1": 89.68553459119497, "eval_micro_precision": 89.68553459119497, "eval_micro_recall": 89.68553459119497, "eval_runtime": 1.7115, "eval_samples_per_second": 929.005, "eval_steps_per_second": 14.607, "step": 1232 }, { "epoch": 22.053571428571427, "grad_norm": 15.109649658203125, "learning_rate": 3.003693925415204e-05, "loss": 0.0147, "step": 1235 }, { "epoch": 22.142857142857142, "grad_norm": 0.29477667808532715, "learning_rate": 2.989809792446417e-05, "loss": 0.0515, "step": 1240 }, { "epoch": 22.232142857142858, "grad_norm": 0.05692288279533386, "learning_rate": 2.9759099312566076e-05, "loss": 0.0004, "step": 1245 }, { "epoch": 22.321428571428573, "grad_norm": 2.0338664054870605, "learning_rate": 2.9619947881824818e-05, "loss": 0.0109, "step": 1250 }, { "epoch": 22.410714285714285, "grad_norm": 0.07057174295186996, "learning_rate": 2.9480648100514586e-05, "loss": 0.0127, "step": 1255 }, { "epoch": 22.5, "grad_norm": 0.08349260687828064, "learning_rate": 2.9341204441673266e-05, "loss": 0.0258, "step": 1260 }, { "epoch": 22.589285714285715, "grad_norm": 0.5570873022079468, "learning_rate": 2.9201621382958733e-05, "loss": 0.002, "step": 1265 }, { "epoch": 22.678571428571427, "grad_norm": 0.06609360128641129, "learning_rate": 2.9061903406505154e-05, "loss": 0.0055, "step": 1270 }, { "epoch": 22.767857142857142, "grad_norm": 0.501964807510376, "learning_rate": 2.8922054998778998e-05, "loss": 0.0068, "step": 1275 }, { "epoch": 22.857142857142858, "grad_norm": 0.03342385217547417, "learning_rate": 2.8782080650435006e-05, "loss": 0.0181, "step": 1280 }, { "epoch": 22.946428571428573, "grad_norm": 6.850861072540283, "learning_rate": 2.864198485617199e-05, "loss": 0.0188, "step": 1285 }, { "epoch": 23.0, "eval_loss": 1.1122395992279053, "eval_macro_f1": 84.66160439893609, "eval_macro_precision": 87.28755884076602, "eval_macro_recall": 82.772217387602, "eval_micro_f1": 88.9937106918239, "eval_micro_precision": 88.9937106918239, "eval_micro_recall": 88.9937106918239, "eval_runtime": 1.7968, "eval_samples_per_second": 884.902, "eval_steps_per_second": 13.914, "step": 1288 }, { "epoch": 23.035714285714285, "grad_norm": 12.33535099029541, "learning_rate": 2.8501772114588476e-05, "loss": 0.0167, "step": 1290 }, { "epoch": 23.125, "grad_norm": 0.05313009023666382, "learning_rate": 2.8361446928038298e-05, "loss": 0.004, "step": 1295 }, { "epoch": 23.214285714285715, "grad_norm": 0.664737343788147, "learning_rate": 2.8221013802485975e-05, "loss": 0.0042, "step": 1300 }, { "epoch": 23.303571428571427, "grad_norm": 10.9341459274292, "learning_rate": 2.808047724736204e-05, "loss": 0.0077, "step": 1305 }, { "epoch": 23.392857142857142, "grad_norm": 8.750741004943848, "learning_rate": 2.793984177541827e-05, "loss": 0.0064, "step": 1310 }, { "epoch": 23.482142857142858, "grad_norm": 0.8044894933700562, "learning_rate": 2.7799111902582696e-05, "loss": 0.0068, "step": 1315 }, { "epoch": 23.571428571428573, "grad_norm": 8.937823295593262, "learning_rate": 2.76582921478147e-05, "loss": 0.0121, "step": 1320 }, { "epoch": 23.660714285714285, "grad_norm": 0.01974612846970558, "learning_rate": 2.7517387032959813e-05, "loss": 0.0043, "step": 1325 }, { "epoch": 23.75, "grad_norm": 1.4588861465454102, "learning_rate": 2.7376401082604564e-05, "loss": 0.0066, "step": 1330 }, { "epoch": 23.839285714285715, "grad_norm": 0.37790974974632263, "learning_rate": 2.72353388239312e-05, "loss": 0.0006, "step": 1335 }, { "epoch": 23.928571428571427, "grad_norm": 1.2444077730178833, "learning_rate": 2.7094204786572254e-05, "loss": 0.0282, "step": 1340 }, { "epoch": 24.0, "eval_loss": 1.0914798974990845, "eval_macro_f1": 85.26953769339522, "eval_macro_precision": 87.64161596177536, "eval_macro_recall": 83.50931812470273, "eval_micro_f1": 89.37106918238995, "eval_micro_precision": 89.37106918238995, "eval_micro_recall": 89.37106918238995, "eval_runtime": 1.7496, "eval_samples_per_second": 908.769, "eval_steps_per_second": 14.289, "step": 1344 }, { "epoch": 24.017857142857142, "grad_norm": 0.005385238211601973, "learning_rate": 2.6953003502465168e-05, "loss": 0.0042, "step": 1345 }, { "epoch": 24.107142857142858, "grad_norm": 0.1486300677061081, "learning_rate": 2.681173950570674e-05, "loss": 0.0042, "step": 1350 }, { "epoch": 24.196428571428573, "grad_norm": 0.11711076647043228, "learning_rate": 2.6670417332407487e-05, "loss": 0.0022, "step": 1355 }, { "epoch": 24.285714285714285, "grad_norm": 0.18978235125541687, "learning_rate": 2.652904152054607e-05, "loss": 0.003, "step": 1360 }, { "epoch": 24.375, "grad_norm": 8.865602493286133, "learning_rate": 2.6387616609823507e-05, "loss": 0.005, "step": 1365 }, { "epoch": 24.464285714285715, "grad_norm": 0.7902134656906128, "learning_rate": 2.624614714151743e-05, "loss": 0.0006, "step": 1370 }, { "epoch": 24.553571428571427, "grad_norm": 0.005069936625659466, "learning_rate": 2.610463765833625e-05, "loss": 0.0032, "step": 1375 }, { "epoch": 24.642857142857142, "grad_norm": 0.02278885804116726, "learning_rate": 2.59630927042733e-05, "loss": 0.0009, "step": 1380 }, { "epoch": 24.732142857142858, "grad_norm": 0.06174265593290329, "learning_rate": 2.5821516824460905e-05, "loss": 0.0033, "step": 1385 }, { "epoch": 24.821428571428573, "grad_norm": 0.04255477339029312, "learning_rate": 2.5679914565024443e-05, "loss": 0.0065, "step": 1390 }, { "epoch": 24.910714285714285, "grad_norm": 0.4989578127861023, "learning_rate": 2.5538290472936372e-05, "loss": 0.0077, "step": 1395 }, { "epoch": 25.0, "grad_norm": 0.17359009385108948, "learning_rate": 2.5396649095870202e-05, "loss": 0.0136, "step": 1400 }, { "epoch": 25.0, "eval_loss": 1.1381731033325195, "eval_macro_f1": 84.72942532348473, "eval_macro_precision": 86.83333756629393, "eval_macro_recall": 83.13287544056776, "eval_micro_f1": 88.93081761006289, "eval_micro_precision": 88.93081761006289, "eval_micro_recall": 88.93081761006289, "eval_runtime": 1.7399, "eval_samples_per_second": 913.828, "eval_steps_per_second": 14.368, "step": 1400 }, { "epoch": 25.089285714285715, "grad_norm": 0.8178830742835999, "learning_rate": 2.5254994982054493e-05, "loss": 0.0003, "step": 1405 }, { "epoch": 25.178571428571427, "grad_norm": 2.3602683544158936, "learning_rate": 2.5113332680126795e-05, "loss": 0.001, "step": 1410 }, { "epoch": 25.267857142857142, "grad_norm": 0.004060968291014433, "learning_rate": 2.4971666738987563e-05, "loss": 0.0002, "step": 1415 }, { "epoch": 25.357142857142858, "grad_norm": 0.6710391044616699, "learning_rate": 2.4830001707654134e-05, "loss": 0.0003, "step": 1420 }, { "epoch": 25.446428571428573, "grad_norm": 0.008804717101156712, "learning_rate": 2.4688342135114627e-05, "loss": 0.0054, "step": 1425 }, { "epoch": 25.535714285714285, "grad_norm": 0.4956241250038147, "learning_rate": 2.4546692570181863e-05, "loss": 0.0035, "step": 1430 }, { "epoch": 25.625, "grad_norm": 0.04511274769902229, "learning_rate": 2.4405057561347315e-05, "loss": 0.0004, "step": 1435 }, { "epoch": 25.714285714285715, "grad_norm": 0.032900311052799225, "learning_rate": 2.4263441656635053e-05, "loss": 0.0038, "step": 1440 }, { "epoch": 25.803571428571427, "grad_norm": 0.15933604538440704, "learning_rate": 2.4121849403455688e-05, "loss": 0.001, "step": 1445 }, { "epoch": 25.892857142857142, "grad_norm": 0.1360047459602356, "learning_rate": 2.3980285348460363e-05, "loss": 0.002, "step": 1450 }, { "epoch": 25.982142857142858, "grad_norm": 0.02792578749358654, "learning_rate": 2.3838754037394757e-05, "loss": 0.0, "step": 1455 }, { "epoch": 26.0, "eval_loss": 1.164141058921814, "eval_macro_f1": 85.84761272086648, "eval_macro_precision": 87.74020642071049, "eval_macro_recall": 84.36532282686129, "eval_micro_f1": 89.68553459119497, "eval_micro_precision": 89.68553459119497, "eval_micro_recall": 89.68553459119497, "eval_runtime": 1.9014, "eval_samples_per_second": 836.217, "eval_steps_per_second": 13.148, "step": 1456 }, { "epoch": 26.071428571428573, "grad_norm": 0.0013366724597290158, "learning_rate": 2.3697260014953108e-05, "loss": 0.0001, "step": 1460 }, { "epoch": 26.160714285714285, "grad_norm": 0.5680537223815918, "learning_rate": 2.3555807824632285e-05, "loss": 0.0053, "step": 1465 }, { "epoch": 26.25, "grad_norm": 0.0030330184381455183, "learning_rate": 2.3414402008585888e-05, "loss": 0.0008, "step": 1470 }, { "epoch": 26.339285714285715, "grad_norm": 0.0012838690308853984, "learning_rate": 2.327304710747841e-05, "loss": 0.0, "step": 1475 }, { "epoch": 26.428571428571427, "grad_norm": 0.006956954021006823, "learning_rate": 2.3131747660339394e-05, "loss": 0.0014, "step": 1480 }, { "epoch": 26.517857142857142, "grad_norm": 0.06738751381635666, "learning_rate": 2.2990508204417742e-05, "loss": 0.0004, "step": 1485 }, { "epoch": 26.607142857142858, "grad_norm": 0.01422626618295908, "learning_rate": 2.2849333275035964e-05, "loss": 0.0, "step": 1490 }, { "epoch": 26.696428571428573, "grad_norm": 0.004991587717086077, "learning_rate": 2.270822740544457e-05, "loss": 0.0, "step": 1495 }, { "epoch": 26.785714285714285, "grad_norm": 0.001760053331963718, "learning_rate": 2.2567195126676507e-05, "loss": 0.0, "step": 1500 }, { "epoch": 26.875, "grad_norm": 0.0031189576257020235, "learning_rate": 2.242624096740164e-05, "loss": 0.0, "step": 1505 }, { "epoch": 26.964285714285715, "grad_norm": 0.001600801246240735, "learning_rate": 2.2285369453781364e-05, "loss": 0.0, "step": 1510 }, { "epoch": 27.0, "eval_loss": 1.1644015312194824, "eval_macro_f1": 85.87021885704534, "eval_macro_precision": 87.99308755760369, "eval_macro_recall": 84.24641886180348, "eval_micro_f1": 89.74842767295598, "eval_micro_precision": 89.74842767295598, "eval_micro_recall": 89.74842767295598, "eval_runtime": 1.7672, "eval_samples_per_second": 899.738, "eval_steps_per_second": 14.147, "step": 1512 }, { "epoch": 27.053571428571427, "grad_norm": 0.0008946519810706377, "learning_rate": 2.214458510932325e-05, "loss": 0.0011, "step": 1515 }, { "epoch": 27.142857142857142, "grad_norm": 0.002819470362737775, "learning_rate": 2.2003892454735786e-05, "loss": 0.0001, "step": 1520 }, { "epoch": 27.232142857142858, "grad_norm": 0.002619238570332527, "learning_rate": 2.1863296007783206e-05, "loss": 0.0008, "step": 1525 }, { "epoch": 27.321428571428573, "grad_norm": 0.0019296056125313044, "learning_rate": 2.172280028314045e-05, "loss": 0.0059, "step": 1530 }, { "epoch": 27.410714285714285, "grad_norm": 0.0006752462941221893, "learning_rate": 2.158240979224817e-05, "loss": 0.0, "step": 1535 }, { "epoch": 27.5, "grad_norm": 0.002963811159133911, "learning_rate": 2.1442129043167874e-05, "loss": 0.0, "step": 1540 }, { "epoch": 27.589285714285715, "grad_norm": 0.0020487557630985975, "learning_rate": 2.1301962540437164e-05, "loss": 0.0, "step": 1545 }, { "epoch": 27.678571428571427, "grad_norm": 0.004336291924118996, "learning_rate": 2.1161914784925083e-05, "loss": 0.0, "step": 1550 }, { "epoch": 27.767857142857142, "grad_norm": 0.002049487316980958, "learning_rate": 2.102199027368761e-05, "loss": 0.0, "step": 1555 }, { "epoch": 27.857142857142858, "grad_norm": 0.008441206067800522, "learning_rate": 2.088219349982323e-05, "loss": 0.0, "step": 1560 }, { "epoch": 27.946428571428573, "grad_norm": 0.0020169492345303297, "learning_rate": 2.0742528952328673e-05, "loss": 0.0, "step": 1565 }, { "epoch": 28.0, "eval_loss": 1.1838983297348022, "eval_macro_f1": 85.74461897087475, "eval_macro_precision": 88.00197532696066, "eval_macro_recall": 84.04298404298405, "eval_micro_f1": 89.68553459119497, "eval_micro_precision": 89.68553459119497, "eval_micro_recall": 89.68553459119497, "eval_runtime": 2.4777, "eval_samples_per_second": 641.733, "eval_steps_per_second": 10.09, "step": 1568 }, { "epoch": 28.035714285714285, "grad_norm": 0.21369871497154236, "learning_rate": 2.0603001115954774e-05, "loss": 0.0026, "step": 1570 }, { "epoch": 28.125, "grad_norm": 0.001929258112795651, "learning_rate": 2.0463614471062435e-05, "loss": 0.0, "step": 1575 }, { "epoch": 28.214285714285715, "grad_norm": 0.0026586749590933323, "learning_rate": 2.0324373493478804e-05, "loss": 0.005, "step": 1580 }, { "epoch": 28.303571428571427, "grad_norm": 0.021981006488204002, "learning_rate": 2.0185282654353493e-05, "loss": 0.0, "step": 1585 }, { "epoch": 28.392857142857142, "grad_norm": 0.005900249350816011, "learning_rate": 2.0046346420015067e-05, "loss": 0.0, "step": 1590 }, { "epoch": 28.482142857142858, "grad_norm": 0.0033512930385768414, "learning_rate": 1.990756925182756e-05, "loss": 0.0, "step": 1595 }, { "epoch": 28.571428571428573, "grad_norm": 0.0007393535925075412, "learning_rate": 1.976895560604729e-05, "loss": 0.0, "step": 1600 }, { "epoch": 28.660714285714285, "grad_norm": 0.2156071811914444, "learning_rate": 1.9630509933679704e-05, "loss": 0.0028, "step": 1605 }, { "epoch": 28.75, "grad_norm": 0.0010669779730960727, "learning_rate": 1.9492236680336485e-05, "loss": 0.0, "step": 1610 }, { "epoch": 28.839285714285715, "grad_norm": 0.0025355510879307985, "learning_rate": 1.9354140286092785e-05, "loss": 0.0, "step": 1615 }, { "epoch": 28.928571428571427, "grad_norm": 0.004663623869419098, "learning_rate": 1.9216225185344662e-05, "loss": 0.0, "step": 1620 }, { "epoch": 29.0, "eval_loss": 1.169285535812378, "eval_macro_f1": 85.77409578612829, "eval_macro_precision": 87.61836905650758, "eval_macro_recall": 84.32305739998047, "eval_micro_f1": 89.62264150943396, "eval_micro_precision": 89.62264150943396, "eval_micro_recall": 89.62264150943396, "eval_runtime": 1.9141, "eval_samples_per_second": 830.659, "eval_steps_per_second": 13.061, "step": 1624 }, { "epoch": 29.017857142857142, "grad_norm": 0.0022395530249923468, "learning_rate": 1.907849580666668e-05, "loss": 0.0, "step": 1625 }, { "epoch": 29.107142857142858, "grad_norm": 0.0007931589498184621, "learning_rate": 1.8940956572669692e-05, "loss": 0.0006, "step": 1630 }, { "epoch": 29.196428571428573, "grad_norm": 0.0019468627870082855, "learning_rate": 1.880361189985886e-05, "loss": 0.0, "step": 1635 }, { "epoch": 29.285714285714285, "grad_norm": 0.0028856031130999327, "learning_rate": 1.8666466198491795e-05, "loss": 0.0, "step": 1640 }, { "epoch": 29.375, "grad_norm": 0.0021576446015387774, "learning_rate": 1.852952387243698e-05, "loss": 0.0, "step": 1645 }, { "epoch": 29.464285714285715, "grad_norm": 0.0026545205619186163, "learning_rate": 1.8392789319032328e-05, "loss": 0.0009, "step": 1650 }, { "epoch": 29.553571428571427, "grad_norm": 0.0022205617278814316, "learning_rate": 1.8256266928943988e-05, "loss": 0.0066, "step": 1655 }, { "epoch": 29.642857142857142, "grad_norm": 0.001808985136449337, "learning_rate": 1.8119961086025374e-05, "loss": 0.0, "step": 1660 }, { "epoch": 29.732142857142858, "grad_norm": 0.0015430036000907421, "learning_rate": 1.7983876167176362e-05, "loss": 0.0, "step": 1665 }, { "epoch": 29.821428571428573, "grad_norm": 0.002092926762998104, "learning_rate": 1.7848016542202767e-05, "loss": 0.0, "step": 1670 }, { "epoch": 29.910714285714285, "grad_norm": 0.001246055937372148, "learning_rate": 1.7712386573676044e-05, "loss": 0.0, "step": 1675 }, { "epoch": 30.0, "grad_norm": 0.001110477140173316, "learning_rate": 1.7576990616793137e-05, "loss": 0.0, "step": 1680 }, { "epoch": 30.0, "eval_loss": 1.193253755569458, "eval_macro_f1": 85.4909143681396, "eval_macro_precision": 88.02490672890218, "eval_macro_recall": 83.63611440534517, "eval_micro_f1": 89.55974842767296, "eval_micro_precision": 89.55974842767296, "eval_micro_recall": 89.55974842767296, "eval_runtime": 2.0785, "eval_samples_per_second": 764.987, "eval_steps_per_second": 12.028, "step": 1680 }, { "epoch": 30.089285714285715, "grad_norm": 0.015624803490936756, "learning_rate": 1.7441833019236704e-05, "loss": 0.0011, "step": 1685 }, { "epoch": 30.178571428571427, "grad_norm": 0.0003042487951461226, "learning_rate": 1.730691812103546e-05, "loss": 0.0, "step": 1690 }, { "epoch": 30.267857142857142, "grad_norm": 0.0016463997308164835, "learning_rate": 1.717225025442485e-05, "loss": 0.0, "step": 1695 }, { "epoch": 30.357142857142858, "grad_norm": 0.0009225396788679063, "learning_rate": 1.7037833743707892e-05, "loss": 0.0, "step": 1700 }, { "epoch": 30.446428571428573, "grad_norm": 0.14133678376674652, "learning_rate": 1.690367290511637e-05, "loss": 0.0008, "step": 1705 }, { "epoch": 30.535714285714285, "grad_norm": 0.0003841827274300158, "learning_rate": 1.676977204667221e-05, "loss": 0.0, "step": 1710 }, { "epoch": 30.625, "grad_norm": 0.0009803869761526585, "learning_rate": 1.6636135468049123e-05, "loss": 0.0, "step": 1715 }, { "epoch": 30.714285714285715, "grad_norm": 0.002163276541978121, "learning_rate": 1.6502767460434588e-05, "loss": 0.0, "step": 1720 }, { "epoch": 30.803571428571427, "grad_norm": 0.002792706247419119, "learning_rate": 1.6369672306392027e-05, "loss": 0.0, "step": 1725 }, { "epoch": 30.892857142857142, "grad_norm": 0.0011888825101777911, "learning_rate": 1.62368542797233e-05, "loss": 0.0, "step": 1730 }, { "epoch": 30.982142857142858, "grad_norm": 0.0003651406441349536, "learning_rate": 1.6104317645331456e-05, "loss": 0.0063, "step": 1735 }, { "epoch": 31.0, "eval_loss": 1.1838295459747314, "eval_macro_f1": 85.54969445546462, "eval_macro_precision": 87.56241738875019, "eval_macro_recall": 83.99677245831091, "eval_micro_f1": 89.49685534591195, "eval_micro_precision": 89.49685534591195, "eval_micro_recall": 89.49685534591195, "eval_runtime": 2.059, "eval_samples_per_second": 772.237, "eval_steps_per_second": 12.142, "step": 1736 }, { "epoch": 31.071428571428573, "grad_norm": 0.0007483928930014372, "learning_rate": 1.5972066659083796e-05, "loss": 0.0, "step": 1740 }, { "epoch": 31.160714285714285, "grad_norm": 0.004502744879573584, "learning_rate": 1.5840105567675218e-05, "loss": 0.0, "step": 1745 }, { "epoch": 31.25, "grad_norm": 0.009936104528605938, "learning_rate": 1.5708438608491814e-05, "loss": 0.0, "step": 1750 }, { "epoch": 31.339285714285715, "grad_norm": 0.0025622285902500153, "learning_rate": 1.557707000947487e-05, "loss": 0.0, "step": 1755 }, { "epoch": 31.428571428571427, "grad_norm": 0.0010868199169635773, "learning_rate": 1.5446003988985043e-05, "loss": 0.0, "step": 1760 }, { "epoch": 31.517857142857142, "grad_norm": 0.0007128150318749249, "learning_rate": 1.531524475566693e-05, "loss": 0.0012, "step": 1765 }, { "epoch": 31.607142857142858, "grad_norm": 0.0021832745987921953, "learning_rate": 1.5184796508313934e-05, "loss": 0.0038, "step": 1770 }, { "epoch": 31.696428571428573, "grad_norm": 0.001526080071926117, "learning_rate": 1.5054663435733418e-05, "loss": 0.0014, "step": 1775 }, { "epoch": 31.785714285714285, "grad_norm": 0.00137015909422189, "learning_rate": 1.492484971661221e-05, "loss": 0.0, "step": 1780 }, { "epoch": 31.875, "grad_norm": 0.0007851460832171142, "learning_rate": 1.479535951938243e-05, "loss": 0.0, "step": 1785 }, { "epoch": 31.964285714285715, "grad_norm": 0.0010572908213362098, "learning_rate": 1.4666197002087594e-05, "loss": 0.0013, "step": 1790 }, { "epoch": 32.0, "eval_loss": 1.1904088258743286, "eval_macro_f1": 85.89264432682533, "eval_macro_precision": 88.25563122053431, "eval_macro_recall": 84.12751489674567, "eval_micro_f1": 89.81132075471699, "eval_micro_precision": 89.81132075471699, "eval_micro_recall": 89.81132075471699, "eval_runtime": 1.7315, "eval_samples_per_second": 918.253, "eval_steps_per_second": 14.438, "step": 1792 }, { "epoch": 32.05357142857143, "grad_norm": 0.0002830619050655514, "learning_rate": 1.4537366312249165e-05, "loss": 0.0, "step": 1795 }, { "epoch": 32.142857142857146, "grad_norm": 0.0003966302901972085, "learning_rate": 1.4408871586733318e-05, "loss": 0.0, "step": 1800 }, { "epoch": 32.232142857142854, "grad_norm": 0.0010989709990099072, "learning_rate": 1.428071695161812e-05, "loss": 0.0, "step": 1805 }, { "epoch": 32.32142857142857, "grad_norm": 0.0009420845308341086, "learning_rate": 1.4152906522061048e-05, "loss": 0.0042, "step": 1810 }, { "epoch": 32.410714285714285, "grad_norm": 0.0009583772043697536, "learning_rate": 1.402544440216682e-05, "loss": 0.0, "step": 1815 }, { "epoch": 32.5, "grad_norm": 0.000329616479575634, "learning_rate": 1.3898334684855647e-05, "loss": 0.0, "step": 1820 }, { "epoch": 32.589285714285715, "grad_norm": 0.0010914219310507178, "learning_rate": 1.3771581451731768e-05, "loss": 0.0, "step": 1825 }, { "epoch": 32.67857142857143, "grad_norm": 0.001109420321881771, "learning_rate": 1.3645188772952411e-05, "loss": 0.0017, "step": 1830 }, { "epoch": 32.767857142857146, "grad_norm": 0.003983737900853157, "learning_rate": 1.3519160707097073e-05, "loss": 0.0016, "step": 1835 }, { "epoch": 32.857142857142854, "grad_norm": 0.0013640534598380327, "learning_rate": 1.3393501301037245e-05, "loss": 0.0013, "step": 1840 }, { "epoch": 32.94642857142857, "grad_norm": 0.00043303275015205145, "learning_rate": 1.3268214589806388e-05, "loss": 0.0, "step": 1845 }, { "epoch": 33.0, "eval_loss": 1.1757960319519043, "eval_macro_f1": 85.82209656372336, "eval_macro_precision": 87.80381119449642, "eval_macro_recall": 84.28473813089197, "eval_micro_f1": 89.68553459119497, "eval_micro_precision": 89.68553459119497, "eval_micro_recall": 89.68553459119497, "eval_runtime": 2.0676, "eval_samples_per_second": 769.018, "eval_steps_per_second": 12.091, "step": 1848 }, { "epoch": 33.035714285714285, "grad_norm": 0.0008297289023175836, "learning_rate": 1.3143304596470443e-05, "loss": 0.0, "step": 1850 }, { "epoch": 33.125, "grad_norm": 0.0008214128902181983, "learning_rate": 1.301877533199859e-05, "loss": 0.0, "step": 1855 }, { "epoch": 33.214285714285715, "grad_norm": 0.0019036834128201008, "learning_rate": 1.2894630795134455e-05, "loss": 0.0, "step": 1860 }, { "epoch": 33.30357142857143, "grad_norm": 0.0015944598708301783, "learning_rate": 1.2770874972267777e-05, "loss": 0.0, "step": 1865 }, { "epoch": 33.392857142857146, "grad_norm": 0.0004286083276383579, "learning_rate": 1.2647511837306284e-05, "loss": 0.0, "step": 1870 }, { "epoch": 33.482142857142854, "grad_norm": 0.0017838689964264631, "learning_rate": 1.2524545351548206e-05, "loss": 0.0052, "step": 1875 }, { "epoch": 33.57142857142857, "grad_norm": 0.0007197365048341453, "learning_rate": 1.2401979463554982e-05, "loss": 0.0008, "step": 1880 }, { "epoch": 33.660714285714285, "grad_norm": 0.0011250259121879935, "learning_rate": 1.2279818109024538e-05, "loss": 0.0, "step": 1885 }, { "epoch": 33.75, "grad_norm": 0.0006792208878323436, "learning_rate": 1.2158065210664848e-05, "loss": 0.001, "step": 1890 }, { "epoch": 33.839285714285715, "grad_norm": 0.0010428227251395583, "learning_rate": 1.2036724678068006e-05, "loss": 0.0, "step": 1895 }, { "epoch": 33.92857142857143, "grad_norm": 0.0009357041562907398, "learning_rate": 1.1915800407584704e-05, "loss": 0.0009, "step": 1900 }, { "epoch": 34.0, "eval_loss": 1.1771963834762573, "eval_macro_f1": 85.57575566624061, "eval_macro_precision": 87.49931435467062, "eval_macro_recall": 84.07735715428024, "eval_micro_f1": 89.49685534591195, "eval_micro_precision": 89.49685534591195, "eval_micro_recall": 89.49685534591195, "eval_runtime": 2.4523, "eval_samples_per_second": 648.379, "eval_steps_per_second": 10.195, "step": 1904 }, { "epoch": 34.017857142857146, "grad_norm": 0.0015970384702086449, "learning_rate": 1.1795296282199061e-05, "loss": 0.0, "step": 1905 }, { "epoch": 34.107142857142854, "grad_norm": 0.0010594812920317054, "learning_rate": 1.1675216171404002e-05, "loss": 0.0048, "step": 1910 }, { "epoch": 34.19642857142857, "grad_norm": 0.0008670884999446571, "learning_rate": 1.1555563931076934e-05, "loss": 0.0, "step": 1915 }, { "epoch": 34.285714285714285, "grad_norm": 0.000477910740301013, "learning_rate": 1.1436343403356017e-05, "loss": 0.0, "step": 1920 }, { "epoch": 34.375, "grad_norm": 0.00853039976209402, "learning_rate": 1.1317558416516697e-05, "loss": 0.0012, "step": 1925 }, { "epoch": 34.464285714285715, "grad_norm": 0.001123997732065618, "learning_rate": 1.1199212784848834e-05, "loss": 0.0, "step": 1930 }, { "epoch": 34.55357142857143, "grad_norm": 0.0008179740980267525, "learning_rate": 1.1081310308534229e-05, "loss": 0.0011, "step": 1935 }, { "epoch": 34.642857142857146, "grad_norm": 0.0008750974084250629, "learning_rate": 1.096385477352455e-05, "loss": 0.0, "step": 1940 }, { "epoch": 34.732142857142854, "grad_norm": 0.0006880298024043441, "learning_rate": 1.0846849951419814e-05, "loss": 0.0009, "step": 1945 }, { "epoch": 34.82142857142857, "grad_norm": 0.0012920346343889832, "learning_rate": 1.0730299599347219e-05, "loss": 0.0, "step": 1950 }, { "epoch": 34.910714285714285, "grad_norm": 0.00165931461378932, "learning_rate": 1.0614207459840572e-05, "loss": 0.0, "step": 1955 }, { "epoch": 35.0, "grad_norm": 0.003719399683177471, "learning_rate": 1.049857726072005e-05, "loss": 0.0, "step": 1960 }, { "epoch": 35.0, "eval_loss": 1.1785622835159302, "eval_macro_f1": 86.0688671097593, "eval_macro_precision": 88.10971691878396, "eval_macro_recall": 84.49211910750371, "eval_micro_f1": 89.87421383647799, "eval_micro_precision": 89.87421383647799, "eval_micro_recall": 89.87421383647799, "eval_runtime": 1.6934, "eval_samples_per_second": 938.939, "eval_steps_per_second": 14.763, "step": 1960 }, { "epoch": 35.089285714285715, "grad_norm": 0.001190517912618816, "learning_rate": 1.0383412714972507e-05, "loss": 0.0007, "step": 1965 }, { "epoch": 35.17857142857143, "grad_norm": 0.0001941876980708912, "learning_rate": 1.0268717520632298e-05, "loss": 0.0, "step": 1970 }, { "epoch": 35.267857142857146, "grad_norm": 0.0013438657624647021, "learning_rate": 1.0154495360662464e-05, "loss": 0.0, "step": 1975 }, { "epoch": 35.357142857142854, "grad_norm": 0.0008899585227482021, "learning_rate": 1.0040749902836507e-05, "loss": 0.0, "step": 1980 }, { "epoch": 35.44642857142857, "grad_norm": 0.0008040536195039749, "learning_rate": 9.927484799620595e-06, "loss": 0.0, "step": 1985 }, { "epoch": 35.535714285714285, "grad_norm": 0.0008036054205149412, "learning_rate": 9.814703688056321e-06, "loss": 0.0, "step": 1990 }, { "epoch": 35.625, "grad_norm": 0.000511976657435298, "learning_rate": 9.702410189643837e-06, "loss": 0.0, "step": 1995 }, { "epoch": 35.714285714285715, "grad_norm": 0.000789080688264221, "learning_rate": 9.59060791022566e-06, "loss": 0.0, "step": 2000 }, { "epoch": 35.80357142857143, "grad_norm": 0.0002290449192514643, "learning_rate": 9.479300439870787e-06, "loss": 0.0, "step": 2005 }, { "epoch": 35.892857142857146, "grad_norm": 0.0005157162086106837, "learning_rate": 9.368491352759506e-06, "loss": 0.0, "step": 2010 }, { "epoch": 35.982142857142854, "grad_norm": 0.5052797794342041, "learning_rate": 9.258184207068551e-06, "loss": 0.0069, "step": 2015 }, { "epoch": 36.0, "eval_loss": 1.1818641424179077, "eval_macro_f1": 85.82209656372336, "eval_macro_precision": 87.80381119449642, "eval_macro_recall": 84.28473813089197, "eval_micro_f1": 89.68553459119497, "eval_micro_precision": 89.68553459119497, "eval_micro_recall": 89.68553459119497, "eval_runtime": 1.9269, "eval_samples_per_second": 825.175, "eval_steps_per_second": 12.974, "step": 2016 }, { "epoch": 36.07142857142857, "grad_norm": 0.001218083780258894, "learning_rate": 9.148382544856884e-06, "loss": 0.0, "step": 2020 }, { "epoch": 36.160714285714285, "grad_norm": 0.0006271243910305202, "learning_rate": 9.039089891951975e-06, "loss": 0.0051, "step": 2025 }, { "epoch": 36.25, "grad_norm": 0.001310994615778327, "learning_rate": 8.930309757836517e-06, "loss": 0.0, "step": 2030 }, { "epoch": 36.339285714285715, "grad_norm": 0.0016614202177152038, "learning_rate": 8.822045635535823e-06, "loss": 0.0, "step": 2035 }, { "epoch": 36.42857142857143, "grad_norm": 0.00039496883982792497, "learning_rate": 8.714301001505567e-06, "loss": 0.0012, "step": 2040 }, { "epoch": 36.517857142857146, "grad_norm": 0.0006432042573578656, "learning_rate": 8.607079315520252e-06, "loss": 0.0, "step": 2045 }, { "epoch": 36.607142857142854, "grad_norm": 0.00702462624758482, "learning_rate": 8.500384020562018e-06, "loss": 0.0, "step": 2050 }, { "epoch": 36.69642857142857, "grad_norm": 0.17590132355690002, "learning_rate": 8.394218542710141e-06, "loss": 0.0012, "step": 2055 }, { "epoch": 36.785714285714285, "grad_norm": 0.00369036803022027, "learning_rate": 8.288586291031026e-06, "loss": 0.0, "step": 2060 }, { "epoch": 36.875, "grad_norm": 0.0006468078936450183, "learning_rate": 8.183490657468688e-06, "loss": 0.0, "step": 2065 }, { "epoch": 36.964285714285715, "grad_norm": 0.15709273517131805, "learning_rate": 8.078935016735891e-06, "loss": 0.001, "step": 2070 }, { "epoch": 37.0, "eval_loss": 1.1875933408737183, "eval_macro_f1": 86.06680921167936, "eval_macro_precision": 88.44071939933647, "eval_macro_recall": 84.29263044647661, "eval_micro_f1": 89.937106918239, "eval_micro_precision": 89.937106918239, "eval_micro_recall": 89.937106918239, "eval_runtime": 1.7266, "eval_samples_per_second": 920.898, "eval_steps_per_second": 14.48, "step": 2072 }, { "epoch": 37.05357142857143, "grad_norm": 0.009659999050199986, "learning_rate": 7.974922726205736e-06, "loss": 0.0, "step": 2075 }, { "epoch": 37.142857142857146, "grad_norm": 0.0007702059228904545, "learning_rate": 7.871457125803896e-06, "loss": 0.0, "step": 2080 }, { "epoch": 37.232142857142854, "grad_norm": 0.0009207057883031666, "learning_rate": 7.768541537901325e-06, "loss": 0.0009, "step": 2085 }, { "epoch": 37.32142857142857, "grad_norm": 0.00031363347079604864, "learning_rate": 7.666179267207596e-06, "loss": 0.0, "step": 2090 }, { "epoch": 37.410714285714285, "grad_norm": 0.0014384811511263251, "learning_rate": 7.564373600664804e-06, "loss": 0.0056, "step": 2095 }, { "epoch": 37.5, "grad_norm": 0.0012792075285688043, "learning_rate": 7.463127807341966e-06, "loss": 0.0, "step": 2100 }, { "epoch": 37.589285714285715, "grad_norm": 0.000563352950848639, "learning_rate": 7.3624451383301125e-06, "loss": 0.0, "step": 2105 }, { "epoch": 37.67857142857143, "grad_norm": 0.0017736536683514714, "learning_rate": 7.262328826637826e-06, "loss": 0.0009, "step": 2110 }, { "epoch": 37.767857142857146, "grad_norm": 0.000779169553425163, "learning_rate": 7.162782087087494e-06, "loss": 0.0, "step": 2115 }, { "epoch": 37.857142857142854, "grad_norm": 0.001163293025456369, "learning_rate": 7.06380811621202e-06, "loss": 0.0, "step": 2120 }, { "epoch": 37.94642857142857, "grad_norm": 0.00028616635245271027, "learning_rate": 6.965410092152211e-06, "loss": 0.0, "step": 2125 }, { "epoch": 38.0, "eval_loss": 1.1880896091461182, "eval_macro_f1": 85.5970716119231, "eval_macro_precision": 87.7511203877084, "eval_macro_recall": 83.95845318922241, "eval_micro_f1": 89.55974842767296, "eval_micro_precision": 89.55974842767296, "eval_micro_recall": 89.55974842767296, "eval_runtime": 2.01, "eval_samples_per_second": 791.054, "eval_steps_per_second": 12.438, "step": 2128 }, { "epoch": 38.035714285714285, "grad_norm": 0.0016029111575335264, "learning_rate": 6.867591174554713e-06, "loss": 0.0, "step": 2130 }, { "epoch": 38.125, "grad_norm": 0.0014079079264774919, "learning_rate": 6.770354504470575e-06, "loss": 0.0, "step": 2135 }, { "epoch": 38.214285714285715, "grad_norm": 0.0005063859280198812, "learning_rate": 6.673703204254347e-06, "loss": 0.0, "step": 2140 }, { "epoch": 38.30357142857143, "grad_norm": 0.0009960135212168097, "learning_rate": 6.577640377463848e-06, "loss": 0.0009, "step": 2145 }, { "epoch": 38.392857142857146, "grad_norm": 0.0003499105223454535, "learning_rate": 6.482169108760511e-06, "loss": 0.0, "step": 2150 }, { "epoch": 38.482142857142854, "grad_norm": 0.0017842509550973773, "learning_rate": 6.387292463810299e-06, "loss": 0.0008, "step": 2155 }, { "epoch": 38.57142857142857, "grad_norm": 0.0008073888020589948, "learning_rate": 6.2930134891853146e-06, "loss": 0.0, "step": 2160 }, { "epoch": 38.660714285714285, "grad_norm": 0.0010807816870510578, "learning_rate": 6.199335212265911e-06, "loss": 0.0, "step": 2165 }, { "epoch": 38.75, "grad_norm": 0.0004026450333185494, "learning_rate": 6.106260641143546e-06, "loss": 0.0051, "step": 2170 }, { "epoch": 38.839285714285715, "grad_norm": 0.13531385362148285, "learning_rate": 6.013792764524129e-06, "loss": 0.0008, "step": 2175 }, { "epoch": 38.92857142857143, "grad_norm": 0.00043903145706281066, "learning_rate": 5.921934551632085e-06, "loss": 0.0, "step": 2180 }, { "epoch": 39.0, "eval_loss": 1.1880995035171509, "eval_macro_f1": 85.5970716119231, "eval_macro_precision": 87.7511203877084, "eval_macro_recall": 83.95845318922241, "eval_micro_f1": 89.55974842767296, "eval_micro_precision": 89.55974842767296, "eval_micro_recall": 89.55974842767296, "eval_runtime": 2.1641, "eval_samples_per_second": 734.708, "eval_steps_per_second": 11.552, "step": 2184 }, { "epoch": 39.017857142857146, "grad_norm": 0.0007070303545333445, "learning_rate": 5.830688952115018e-06, "loss": 0.0, "step": 2185 }, { "epoch": 39.107142857142854, "grad_norm": 0.0003032834501937032, "learning_rate": 5.740058895948955e-06, "loss": 0.0, "step": 2190 }, { "epoch": 39.19642857142857, "grad_norm": 0.1616564691066742, "learning_rate": 5.650047293344315e-06, "loss": 0.0071, "step": 2195 }, { "epoch": 39.285714285714285, "grad_norm": 0.00030247235554270446, "learning_rate": 5.560657034652406e-06, "loss": 0.0, "step": 2200 }, { "epoch": 39.375, "grad_norm": 0.0009000123827718198, "learning_rate": 5.471890990272666e-06, "loss": 0.0, "step": 2205 }, { "epoch": 39.464285714285715, "grad_norm": 0.0009570589754730463, "learning_rate": 5.383752010560441e-06, "loss": 0.0, "step": 2210 }, { "epoch": 39.55357142857143, "grad_norm": 0.0008730028057470918, "learning_rate": 5.296242925735487e-06, "loss": 0.0, "step": 2215 }, { "epoch": 39.642857142857146, "grad_norm": 0.0006854677340015769, "learning_rate": 5.2093665457911e-06, "loss": 0.0, "step": 2220 }, { "epoch": 39.732142857142854, "grad_norm": 0.0013594292104244232, "learning_rate": 5.123125660403849e-06, "loss": 0.0, "step": 2225 }, { "epoch": 39.82142857142857, "grad_norm": 0.0011872885515913367, "learning_rate": 5.037523038844033e-06, "loss": 0.0, "step": 2230 }, { "epoch": 39.910714285714285, "grad_norm": 0.0007189795724116266, "learning_rate": 4.952561429886721e-06, "loss": 0.0, "step": 2235 }, { "epoch": 40.0, "grad_norm": 8.409917063545436e-05, "learning_rate": 4.868243561723535e-06, "loss": 0.0, "step": 2240 }, { "epoch": 40.0, "eval_loss": 1.1868513822555542, "eval_macro_f1": 85.77060086961077, "eval_macro_precision": 87.93467695199129, "eval_macro_recall": 84.12356873895335, "eval_micro_f1": 89.68553459119497, "eval_micro_precision": 89.68553459119497, "eval_micro_recall": 89.68553459119497, "eval_runtime": 2.0444, "eval_samples_per_second": 777.736, "eval_steps_per_second": 12.229, "step": 2240 }, { "epoch": 40.089285714285715, "grad_norm": 0.0010488256812095642, "learning_rate": 4.7845721418749905e-06, "loss": 0.0, "step": 2245 }, { "epoch": 40.17857142857143, "grad_norm": 0.00023322908964473754, "learning_rate": 4.701549857103588e-06, "loss": 0.0, "step": 2250 }, { "epoch": 40.267857142857146, "grad_norm": 0.0007012597052380443, "learning_rate": 4.619179373327545e-06, "loss": 0.0, "step": 2255 }, { "epoch": 40.357142857142854, "grad_norm": 0.0006945223431102931, "learning_rate": 4.537463335535161e-06, "loss": 0.0, "step": 2260 }, { "epoch": 40.44642857142857, "grad_norm": 0.003948695491999388, "learning_rate": 4.456404367699923e-06, "loss": 0.0, "step": 2265 }, { "epoch": 40.535714285714285, "grad_norm": 0.0009591460693627596, "learning_rate": 4.376005072696204e-06, "loss": 0.0, "step": 2270 }, { "epoch": 40.625, "grad_norm": 0.0007004874059930444, "learning_rate": 4.296268032215733e-06, "loss": 0.0, "step": 2275 }, { "epoch": 40.714285714285715, "grad_norm": 0.00040511120459996164, "learning_rate": 4.217195806684629e-06, "loss": 0.0053, "step": 2280 }, { "epoch": 40.80357142857143, "grad_norm": 0.0005234309355728328, "learning_rate": 4.138790935181258e-06, "loss": 0.0, "step": 2285 }, { "epoch": 40.892857142857146, "grad_norm": 0.0015812547644600272, "learning_rate": 4.061055935354643e-06, "loss": 0.0, "step": 2290 }, { "epoch": 40.982142857142854, "grad_norm": 0.0006628704722970724, "learning_rate": 3.983993303343639e-06, "loss": 0.001, "step": 2295 }, { "epoch": 41.0, "eval_loss": 1.1929736137390137, "eval_macro_f1": 85.96683768424042, "eval_macro_precision": 88.38352495427227, "eval_macro_recall": 84.16978032362647, "eval_micro_f1": 89.87421383647799, "eval_micro_precision": 89.87421383647799, "eval_micro_recall": 89.87421383647799, "eval_runtime": 2.071, "eval_samples_per_second": 767.76, "eval_steps_per_second": 12.072, "step": 2296 }, { "epoch": 41.07142857142857, "grad_norm": 0.0007947610574774444, "learning_rate": 3.907605513696808e-06, "loss": 0.001, "step": 2300 }, { "epoch": 41.160714285714285, "grad_norm": 0.006750487256795168, "learning_rate": 3.831895019292897e-06, "loss": 0.0, "step": 2305 }, { "epoch": 41.25, "grad_norm": 0.0010204770369455218, "learning_rate": 3.756864251262143e-06, "loss": 0.0, "step": 2310 }, { "epoch": 41.339285714285715, "grad_norm": 0.1501074582338333, "learning_rate": 3.68251561890815e-06, "loss": 0.0062, "step": 2315 }, { "epoch": 41.42857142857143, "grad_norm": 0.0004478511691559106, "learning_rate": 3.6088515096305674e-06, "loss": 0.0, "step": 2320 }, { "epoch": 41.517857142857146, "grad_norm": 0.0007376694120466709, "learning_rate": 3.535874288848398e-06, "loss": 0.0, "step": 2325 }, { "epoch": 41.607142857142854, "grad_norm": 0.0005087918252684176, "learning_rate": 3.4635862999240457e-06, "loss": 0.0, "step": 2330 }, { "epoch": 41.69642857142857, "grad_norm": 0.0007002074271440506, "learning_rate": 3.391989864088102e-06, "loss": 0.0009, "step": 2335 }, { "epoch": 41.785714285714285, "grad_norm": 0.0004804203344974667, "learning_rate": 3.321087280364757e-06, "loss": 0.0, "step": 2340 }, { "epoch": 41.875, "grad_norm": 0.00030447664903476834, "learning_rate": 3.250880825498026e-06, "loss": 0.0, "step": 2345 }, { "epoch": 41.964285714285715, "grad_norm": 0.0008117399993352592, "learning_rate": 3.181372753878595e-06, "loss": 0.0, "step": 2350 }, { "epoch": 42.0, "eval_loss": 1.1892344951629639, "eval_macro_f1": 85.91846508098604, "eval_macro_precision": 88.18629280744503, "eval_macro_recall": 84.20809959271497, "eval_micro_f1": 89.81132075471699, "eval_micro_precision": 89.81132075471699, "eval_micro_recall": 89.81132075471699, "eval_runtime": 2.1894, "eval_samples_per_second": 726.234, "eval_steps_per_second": 11.419, "step": 2352 }, { "epoch": 42.05357142857143, "grad_norm": 0.0006969855749048293, "learning_rate": 3.1125652974714758e-06, "loss": 0.0, "step": 2355 }, { "epoch": 42.142857142857146, "grad_norm": 0.0007993881008587778, "learning_rate": 3.044460665744284e-06, "loss": 0.0, "step": 2360 }, { "epoch": 42.232142857142854, "grad_norm": 0.14198355376720428, "learning_rate": 2.9770610455963547e-06, "loss": 0.0009, "step": 2365 }, { "epoch": 42.32142857142857, "grad_norm": 0.001101199653930962, "learning_rate": 2.9103686012884546e-06, "loss": 0.0009, "step": 2370 }, { "epoch": 42.410714285714285, "grad_norm": 0.0004274248203728348, "learning_rate": 2.8443854743733233e-06, "loss": 0.0, "step": 2375 }, { "epoch": 42.5, "grad_norm": 0.00032507788273505867, "learning_rate": 2.779113783626916e-06, "loss": 0.0, "step": 2380 }, { "epoch": 42.589285714285715, "grad_norm": 0.4506078064441681, "learning_rate": 2.7145556249803193e-06, "loss": 0.0055, "step": 2385 }, { "epoch": 42.67857142857143, "grad_norm": 0.00020234609837643802, "learning_rate": 2.6507130714525095e-06, "loss": 0.0, "step": 2390 }, { "epoch": 42.767857142857146, "grad_norm": 0.0005826003616675735, "learning_rate": 2.5875881730837324e-06, "loss": 0.0, "step": 2395 }, { "epoch": 42.857142857142854, "grad_norm": 0.000374118477338925, "learning_rate": 2.5251829568697207e-06, "loss": 0.0, "step": 2400 }, { "epoch": 42.94642857142857, "grad_norm": 0.0006962314946576953, "learning_rate": 2.463499426696564e-06, "loss": 0.0, "step": 2405 }, { "epoch": 43.0, "eval_loss": 1.1839672327041626, "eval_macro_f1": 85.74461897087475, "eval_macro_precision": 88.00197532696066, "eval_macro_recall": 84.04298404298405, "eval_micro_f1": 89.68553459119497, "eval_micro_precision": 89.68553459119497, "eval_micro_recall": 89.68553459119497, "eval_runtime": 2.6258, "eval_samples_per_second": 605.527, "eval_steps_per_second": 9.521, "step": 2408 }, { "epoch": 43.035714285714285, "grad_norm": 0.0006290263263508677, "learning_rate": 2.4025395632763846e-06, "loss": 0.0, "step": 2410 }, { "epoch": 43.125, "grad_norm": 0.00028139716596342623, "learning_rate": 2.3423053240837515e-06, "loss": 0.0, "step": 2415 }, { "epoch": 43.214285714285715, "grad_norm": 0.0004424660000950098, "learning_rate": 2.282798643292777e-06, "loss": 0.0009, "step": 2420 }, { "epoch": 43.30357142857143, "grad_norm": 0.0006072869873605669, "learning_rate": 2.224021431715065e-06, "loss": 0.0009, "step": 2425 }, { "epoch": 43.392857142857146, "grad_norm": 0.0006662249797955155, "learning_rate": 2.165975576738294e-06, "loss": 0.0, "step": 2430 }, { "epoch": 43.482142857142854, "grad_norm": 0.4406328797340393, "learning_rate": 2.108662942265666e-06, "loss": 0.0045, "step": 2435 }, { "epoch": 43.57142857142857, "grad_norm": 0.0005156341940164566, "learning_rate": 2.0520853686560178e-06, "loss": 0.0008, "step": 2440 }, { "epoch": 43.660714285714285, "grad_norm": 0.0010501693468540907, "learning_rate": 1.996244672664749e-06, "loss": 0.0, "step": 2445 }, { "epoch": 43.75, "grad_norm": 0.0007498673512600362, "learning_rate": 1.9411426473854688e-06, "loss": 0.0, "step": 2450 }, { "epoch": 43.839285714285715, "grad_norm": 0.000809444987680763, "learning_rate": 1.8867810621924165e-06, "loss": 0.0, "step": 2455 }, { "epoch": 43.92857142857143, "grad_norm": 0.0011670913081616163, "learning_rate": 1.8331616626836718e-06, "loss": 0.0, "step": 2460 }, { "epoch": 44.0, "eval_loss": 1.1925363540649414, "eval_macro_f1": 85.62321202521304, "eval_macro_precision": 87.68582327904362, "eval_macro_recall": 84.03903788519173, "eval_micro_f1": 89.55974842767296, "eval_micro_precision": 89.55974842767296, "eval_micro_recall": 89.55974842767296, "eval_runtime": 2.1265, "eval_samples_per_second": 747.691, "eval_steps_per_second": 11.756, "step": 2464 }, { "epoch": 44.017857142857146, "grad_norm": 0.0004898426122963428, "learning_rate": 1.7802861706250563e-06, "loss": 0.0, "step": 2465 }, { "epoch": 44.107142857142854, "grad_norm": 0.0005355161265470088, "learning_rate": 1.7281562838948966e-06, "loss": 0.0044, "step": 2470 }, { "epoch": 44.19642857142857, "grad_norm": 0.00022611931490246207, "learning_rate": 1.6767736764294605e-06, "loss": 0.0, "step": 2475 }, { "epoch": 44.285714285714285, "grad_norm": 0.0004798888403456658, "learning_rate": 1.626139998169246e-06, "loss": 0.0, "step": 2480 }, { "epoch": 44.375, "grad_norm": 0.002094451105222106, "learning_rate": 1.5762568750059604e-06, "loss": 0.0, "step": 2485 }, { "epoch": 44.464285714285715, "grad_norm": 0.0004955387557856739, "learning_rate": 1.5271259087303314e-06, "loss": 0.0, "step": 2490 }, { "epoch": 44.55357142857143, "grad_norm": 0.0005791817093268037, "learning_rate": 1.4787486769806847e-06, "loss": 0.002, "step": 2495 }, { "epoch": 44.642857142857146, "grad_norm": 0.0011661151656880975, "learning_rate": 1.4311267331922534e-06, "loss": 0.0, "step": 2500 }, { "epoch": 44.732142857142854, "grad_norm": 0.14946410059928894, "learning_rate": 1.3842616065473297e-06, "loss": 0.0009, "step": 2505 }, { "epoch": 44.82142857142857, "grad_norm": 0.0009950968669727445, "learning_rate": 1.3381548019261335e-06, "loss": 0.0, "step": 2510 }, { "epoch": 44.910714285714285, "grad_norm": 0.0006654797471128404, "learning_rate": 1.2928077998585087e-06, "loss": 0.0, "step": 2515 }, { "epoch": 45.0, "grad_norm": 0.000741883646696806, "learning_rate": 1.248222056476367e-06, "loss": 0.0, "step": 2520 }, { "epoch": 45.0, "eval_loss": 1.1892344951629639, "eval_macro_f1": 85.69684730927904, "eval_macro_precision": 87.80990783410138, "eval_macro_recall": 84.08130331207254, "eval_micro_f1": 89.62264150943396, "eval_micro_precision": 89.62264150943396, "eval_micro_recall": 89.62264150943396, "eval_runtime": 2.0693, "eval_samples_per_second": 768.362, "eval_steps_per_second": 12.081, "step": 2520 }, { "epoch": 45.089285714285715, "grad_norm": 0.0008588531636632979, "learning_rate": 1.204399003466941e-06, "loss": 0.0043, "step": 2525 }, { "epoch": 45.17857142857143, "grad_norm": 0.0007599690579809248, "learning_rate": 1.1613400480268099e-06, "loss": 0.0, "step": 2530 }, { "epoch": 45.267857142857146, "grad_norm": 0.0005483416607603431, "learning_rate": 1.1190465728167066e-06, "loss": 0.0, "step": 2535 }, { "epoch": 45.357142857142854, "grad_norm": 0.0006434289389289916, "learning_rate": 1.0775199359171345e-06, "loss": 0.001, "step": 2540 }, { "epoch": 45.44642857142857, "grad_norm": 0.0026349611580371857, "learning_rate": 1.0367614707847334e-06, "loss": 0.0, "step": 2545 }, { "epoch": 45.535714285714285, "grad_norm": 0.00044675698154605925, "learning_rate": 9.96772486209485e-07, "loss": 0.0, "step": 2550 }, { "epoch": 45.625, "grad_norm": 0.0010068505071103573, "learning_rate": 9.575542662726754e-07, "loss": 0.001, "step": 2555 }, { "epoch": 45.714285714285715, "grad_norm": 0.00023187148326542228, "learning_rate": 9.191080703056604e-07, "loss": 0.0, "step": 2560 }, { "epoch": 45.80357142857143, "grad_norm": 0.006482269149273634, "learning_rate": 8.814351328494369e-07, "loss": 0.0, "step": 2565 }, { "epoch": 45.892857142857146, "grad_norm": 0.0037991167046129704, "learning_rate": 8.445366636149865e-07, "loss": 0.0, "step": 2570 }, { "epoch": 45.982142857142854, "grad_norm": 0.0010641113622114062, "learning_rate": 8.084138474444503e-07, "loss": 0.0009, "step": 2575 }, { "epoch": 46.0, "eval_loss": 1.1895390748977661, "eval_macro_f1": 85.81857161383309, "eval_macro_precision": 88.1284500864264, "eval_macro_recall": 84.08524946986486, "eval_micro_f1": 89.74842767295598, "eval_micro_precision": 89.74842767295598, "eval_micro_recall": 89.74842767295598, "eval_runtime": 2.0545, "eval_samples_per_second": 773.904, "eval_steps_per_second": 12.168, "step": 2576 }, { "epoch": 46.07142857142857, "grad_norm": 0.0006879018619656563, "learning_rate": 7.730678442730538e-07, "loss": 0.0, "step": 2580 }, { "epoch": 46.160714285714285, "grad_norm": 0.0003646935510914773, "learning_rate": 7.384997890918899e-07, "loss": 0.0, "step": 2585 }, { "epoch": 46.25, "grad_norm": 0.0005362197407521307, "learning_rate": 7.047107919114588e-07, "loss": 0.0009, "step": 2590 }, { "epoch": 46.339285714285715, "grad_norm": 0.0005088266334496439, "learning_rate": 6.71701937726027e-07, "loss": 0.0009, "step": 2595 }, { "epoch": 46.42857142857143, "grad_norm": 0.000821845605969429, "learning_rate": 6.394742864787806e-07, "loss": 0.0, "step": 2600 }, { "epoch": 46.517857142857146, "grad_norm": 0.0003609760315157473, "learning_rate": 6.080288730278077e-07, "loss": 0.0, "step": 2605 }, { "epoch": 46.607142857142854, "grad_norm": 0.0006496753776445985, "learning_rate": 5.773667071128447e-07, "loss": 0.0, "step": 2610 }, { "epoch": 46.69642857142857, "grad_norm": 0.14351055026054382, "learning_rate": 5.474887733228656e-07, "loss": 0.0009, "step": 2615 }, { "epoch": 46.785714285714285, "grad_norm": 0.0012996145524084568, "learning_rate": 5.183960310644748e-07, "loss": 0.0, "step": 2620 }, { "epoch": 46.875, "grad_norm": 0.43366459012031555, "learning_rate": 4.900894145310753e-07, "loss": 0.0044, "step": 2625 }, { "epoch": 46.964285714285715, "grad_norm": 0.0005488655297085643, "learning_rate": 4.6256983267289887e-07, "loss": 0.0, "step": 2630 }, { "epoch": 47.0, "eval_loss": 1.1887431144714355, "eval_macro_f1": 86.01816071550488, "eval_macro_precision": 88.24407240824033, "eval_macro_recall": 84.3309497155651, "eval_micro_f1": 89.87421383647799, "eval_micro_precision": 89.87421383647799, "eval_micro_recall": 89.87421383647799, "eval_runtime": 2.1006, "eval_samples_per_second": 756.93, "eval_steps_per_second": 11.901, "step": 2632 }, { "epoch": 47.05357142857143, "grad_norm": 0.4307861626148224, "learning_rate": 4.358381691677932e-07, "loss": 0.0044, "step": 2635 }, { "epoch": 47.142857142857146, "grad_norm": 0.0007851451518945396, "learning_rate": 4.098952823928692e-07, "loss": 0.0, "step": 2640 }, { "epoch": 47.232142857142854, "grad_norm": 0.0006281957612372935, "learning_rate": 3.8474200539692087e-07, "loss": 0.0, "step": 2645 }, { "epoch": 47.32142857142857, "grad_norm": 0.000335185817675665, "learning_rate": 3.603791458736766e-07, "loss": 0.0, "step": 2650 }, { "epoch": 47.410714285714285, "grad_norm": 0.0007661879062652588, "learning_rate": 3.3680748613587885e-07, "loss": 0.0, "step": 2655 }, { "epoch": 47.5, "grad_norm": 0.00047480862122029066, "learning_rate": 3.140277830901428e-07, "loss": 0.0, "step": 2660 }, { "epoch": 47.589285714285715, "grad_norm": 0.0009058488649316132, "learning_rate": 2.9204076821266747e-07, "loss": 0.0, "step": 2665 }, { "epoch": 47.67857142857143, "grad_norm": 0.0015131317777559161, "learning_rate": 2.708471475257407e-07, "loss": 0.0, "step": 2670 }, { "epoch": 47.767857142857146, "grad_norm": 0.001217082142829895, "learning_rate": 2.5044760157506565e-07, "loss": 0.0018, "step": 2675 }, { "epoch": 47.857142857142854, "grad_norm": 0.0005450554890558124, "learning_rate": 2.3084278540791427e-07, "loss": 0.0, "step": 2680 }, { "epoch": 47.94642857142857, "grad_norm": 0.0010108886053785682, "learning_rate": 2.1203332855208313e-07, "loss": 0.0, "step": 2685 }, { "epoch": 48.0, "eval_loss": 1.1932783126831055, "eval_macro_f1": 85.84447327097699, "eval_macro_precision": 88.06013659836749, "eval_macro_recall": 84.16583416583417, "eval_micro_f1": 89.74842767295598, "eval_micro_precision": 89.74842767295598, "eval_micro_recall": 89.74842767295598, "eval_runtime": 2.0801, "eval_samples_per_second": 764.369, "eval_steps_per_second": 12.018, "step": 2688 }, { "epoch": 48.035714285714285, "grad_norm": 0.0012142349733039737, "learning_rate": 1.9401983499569842e-07, "loss": 0.0009, "step": 2690 }, { "epoch": 48.125, "grad_norm": 0.0010546569246798754, "learning_rate": 1.768028831677926e-07, "loss": 0.0, "step": 2695 }, { "epoch": 48.214285714285715, "grad_norm": 0.0007233622018247843, "learning_rate": 1.6038302591975806e-07, "loss": 0.0043, "step": 2700 }, { "epoch": 48.30357142857143, "grad_norm": 0.0009546867804601789, "learning_rate": 1.4476079050757818e-07, "loss": 0.0, "step": 2705 }, { "epoch": 48.392857142857146, "grad_norm": 0.0010814859997481108, "learning_rate": 1.29936678574899e-07, "loss": 0.0009, "step": 2710 }, { "epoch": 48.482142857142854, "grad_norm": 0.00036580185405910015, "learning_rate": 1.1591116613692832e-07, "loss": 0.0, "step": 2715 }, { "epoch": 48.57142857142857, "grad_norm": 0.0009721943642944098, "learning_rate": 1.0268470356514237e-07, "loss": 0.0009, "step": 2720 }, { "epoch": 48.660714285714285, "grad_norm": 0.000539219006896019, "learning_rate": 9.025771557282792e-08, "loss": 0.0, "step": 2725 }, { "epoch": 48.75, "grad_norm": 0.0017280342290177941, "learning_rate": 7.863060120144317e-08, "loss": 0.0009, "step": 2730 }, { "epoch": 48.839285714285715, "grad_norm": 0.0009316341020166874, "learning_rate": 6.780373380780025e-08, "loss": 0.0, "step": 2735 }, { "epoch": 48.92857142857143, "grad_norm": 0.0016215546056628227, "learning_rate": 5.7777461052091474e-08, "loss": 0.0, "step": 2740 }, { "epoch": 49.0, "eval_loss": 1.190055012702942, "eval_macro_f1": 85.81857161383309, "eval_macro_precision": 88.1284500864264, "eval_macro_recall": 84.08524946986486, "eval_micro_f1": 89.74842767295598, "eval_micro_precision": 89.74842767295598, "eval_micro_recall": 89.74842767295598, "eval_runtime": 2.1039, "eval_samples_per_second": 755.753, "eval_steps_per_second": 11.883, "step": 2744 }, { "epoch": 49.017857142857146, "grad_norm": 0.0018325834535062313, "learning_rate": 4.855210488670381e-08, "loss": 0.0, "step": 2745 }, { "epoch": 49.107142857142854, "grad_norm": 0.00028368146740831435, "learning_rate": 4.01279615458966e-08, "loss": 0.0, "step": 2750 }, { "epoch": 49.19642857142857, "grad_norm": 0.0009773739147931337, "learning_rate": 3.250530153628417e-08, "loss": 0.0009, "step": 2755 }, { "epoch": 49.285714285714285, "grad_norm": 0.0006530345417559147, "learning_rate": 2.5684369628148353e-08, "loss": 0.0009, "step": 2760 }, { "epoch": 49.375, "grad_norm": 0.001267548301257193, "learning_rate": 1.9665384847583622e-08, "loss": 0.0, "step": 2765 }, { "epoch": 49.464285714285715, "grad_norm": 0.00035301086609251797, "learning_rate": 1.4448540469458316e-08, "loss": 0.0, "step": 2770 }, { "epoch": 49.55357142857143, "grad_norm": 0.0017323438078165054, "learning_rate": 1.0034004011202913e-08, "loss": 0.0, "step": 2775 }, { "epoch": 49.642857142857146, "grad_norm": 0.0008427600259892642, "learning_rate": 6.421917227455998e-09, "loss": 0.0, "step": 2780 }, { "epoch": 49.732142857142854, "grad_norm": 0.14544406533241272, "learning_rate": 3.6123961054762567e-09, "loss": 0.0009, "step": 2785 }, { "epoch": 49.82142857142857, "grad_norm": 0.0004677158431150019, "learning_rate": 1.605530861450988e-09, "loss": 0.0, "step": 2790 }, { "epoch": 49.910714285714285, "grad_norm": 0.0009074215777218342, "learning_rate": 4.0138593757621523e-10, "loss": 0.0, "step": 2795 }, { "epoch": 50.0, "grad_norm": 0.0004363281768746674, "learning_rate": 0.0, "loss": 0.0043, "step": 2800 }, { "epoch": 50.0, "eval_loss": 1.1904431581497192, "eval_macro_f1": 85.6707858264491, "eval_macro_precision": 87.87620078849466, "eval_macro_recall": 84.00071861610323, "eval_micro_f1": 89.62264150943396, "eval_micro_precision": 89.62264150943396, "eval_micro_recall": 89.62264150943396, "eval_runtime": 2.6821, "eval_samples_per_second": 592.81, "eval_steps_per_second": 9.321, "step": 2800 }, { "epoch": 50.0, "step": 2800, "total_flos": 1.502828979688571e+17, "train_loss": 0.1562508400436075, "train_runtime": 2658.12, "train_samples_per_second": 269.1, "train_steps_per_second": 1.053 } ], "logging_steps": 5, "max_steps": 2800, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.502828979688571e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }