{ "best_metric": 0.772007503464447, "best_model_checkpoint": "./results/models/nusaparagraph_emot/nusabert-bigru-concate-8-mean\\checkpoint-12222", "epoch": 7.0, "eval_steps": 500, "global_step": 12222, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0286368843069874, "grad_norm": 4.918474197387695, "learning_rate": 9.985681557846507e-06, "loss": 1.918, "step": 50 }, { "epoch": 0.0572737686139748, "grad_norm": 5.332164287567139, "learning_rate": 9.971363115693013e-06, "loss": 1.8503, "step": 100 }, { "epoch": 0.0859106529209622, "grad_norm": 13.817869186401367, "learning_rate": 9.95704467353952e-06, "loss": 1.7392, "step": 150 }, { "epoch": 0.1145475372279496, "grad_norm": 12.340777397155762, "learning_rate": 9.942726231386026e-06, "loss": 1.4836, "step": 200 }, { "epoch": 0.143184421534937, "grad_norm": 17.78537940979004, "learning_rate": 9.928407789232532e-06, "loss": 1.4447, "step": 250 }, { "epoch": 0.1718213058419244, "grad_norm": 15.181681632995605, "learning_rate": 9.914089347079038e-06, "loss": 1.1689, "step": 300 }, { "epoch": 0.2004581901489118, "grad_norm": 12.248451232910156, "learning_rate": 9.899770904925546e-06, "loss": 1.229, "step": 350 }, { "epoch": 0.2290950744558992, "grad_norm": 15.683427810668945, "learning_rate": 9.885452462772052e-06, "loss": 1.0561, "step": 400 }, { "epoch": 0.25773195876288657, "grad_norm": 17.413002014160156, "learning_rate": 9.871134020618558e-06, "loss": 0.9704, "step": 450 }, { "epoch": 0.286368843069874, "grad_norm": 21.993549346923828, "learning_rate": 9.856815578465064e-06, "loss": 1.0062, "step": 500 }, { "epoch": 0.3150057273768614, "grad_norm": 15.140169143676758, "learning_rate": 9.84249713631157e-06, "loss": 0.9102, "step": 550 }, { "epoch": 0.3436426116838488, "grad_norm": 25.413042068481445, "learning_rate": 9.828178694158076e-06, "loss": 0.8857, "step": 600 }, { "epoch": 0.3722794959908362, "grad_norm": 13.888834953308105, "learning_rate": 9.813860252004582e-06, "loss": 0.9794, "step": 650 }, { "epoch": 0.4009163802978236, "grad_norm": 18.061304092407227, "learning_rate": 9.799541809851088e-06, "loss": 0.9635, "step": 700 }, { "epoch": 0.42955326460481097, "grad_norm": 16.933076858520508, "learning_rate": 9.785223367697596e-06, "loss": 0.8568, "step": 750 }, { "epoch": 0.4581901489117984, "grad_norm": 15.249773979187012, "learning_rate": 9.770904925544102e-06, "loss": 0.8865, "step": 800 }, { "epoch": 0.4868270332187858, "grad_norm": 16.965801239013672, "learning_rate": 9.756586483390608e-06, "loss": 0.776, "step": 850 }, { "epoch": 0.5154639175257731, "grad_norm": 6.5678791999816895, "learning_rate": 9.742268041237114e-06, "loss": 0.8698, "step": 900 }, { "epoch": 0.5441008018327605, "grad_norm": 11.134130477905273, "learning_rate": 9.72794959908362e-06, "loss": 0.7746, "step": 950 }, { "epoch": 0.572737686139748, "grad_norm": 13.739683151245117, "learning_rate": 9.713631156930127e-06, "loss": 0.8256, "step": 1000 }, { "epoch": 0.6013745704467354, "grad_norm": 13.819252967834473, "learning_rate": 9.699312714776633e-06, "loss": 0.8959, "step": 1050 }, { "epoch": 0.6300114547537228, "grad_norm": 21.550695419311523, "learning_rate": 9.68499427262314e-06, "loss": 0.8417, "step": 1100 }, { "epoch": 0.6586483390607102, "grad_norm": 15.80966854095459, "learning_rate": 9.670675830469645e-06, "loss": 0.9138, "step": 1150 }, { "epoch": 0.6872852233676976, "grad_norm": 9.978912353515625, "learning_rate": 9.656357388316153e-06, "loss": 0.7363, "step": 1200 }, { "epoch": 0.715922107674685, "grad_norm": 15.32123851776123, "learning_rate": 9.642038946162659e-06, "loss": 0.849, "step": 1250 }, { "epoch": 0.7445589919816724, "grad_norm": 21.705232620239258, "learning_rate": 9.627720504009165e-06, "loss": 0.8068, "step": 1300 }, { "epoch": 0.7731958762886598, "grad_norm": 23.230682373046875, "learning_rate": 9.613402061855671e-06, "loss": 0.7546, "step": 1350 }, { "epoch": 0.8018327605956472, "grad_norm": 18.663450241088867, "learning_rate": 9.599083619702177e-06, "loss": 0.7417, "step": 1400 }, { "epoch": 0.8304696449026346, "grad_norm": 17.77196502685547, "learning_rate": 9.584765177548685e-06, "loss": 0.7517, "step": 1450 }, { "epoch": 0.8591065292096219, "grad_norm": 7.162426471710205, "learning_rate": 9.57044673539519e-06, "loss": 0.6956, "step": 1500 }, { "epoch": 0.8877434135166093, "grad_norm": 11.551986694335938, "learning_rate": 9.556128293241697e-06, "loss": 0.7805, "step": 1550 }, { "epoch": 0.9163802978235968, "grad_norm": 28.649534225463867, "learning_rate": 9.541809851088203e-06, "loss": 0.7907, "step": 1600 }, { "epoch": 0.9450171821305842, "grad_norm": 24.329723358154297, "learning_rate": 9.527491408934708e-06, "loss": 0.662, "step": 1650 }, { "epoch": 0.9736540664375716, "grad_norm": 8.512353897094727, "learning_rate": 9.513172966781214e-06, "loss": 0.7086, "step": 1700 }, { "epoch": 1.0, "eval_accuracy": 0.7155204810221721, "eval_f1": 0.7112801498610356, "eval_loss": 0.8033041954040527, "eval_runtime": 36.3098, "eval_samples_per_second": 73.286, "eval_steps_per_second": 9.171, "step": 1746 }, { "epoch": 1.002290950744559, "grad_norm": 19.602813720703125, "learning_rate": 9.49885452462772e-06, "loss": 0.7832, "step": 1750 }, { "epoch": 1.0309278350515463, "grad_norm": 8.915902137756348, "learning_rate": 9.484536082474226e-06, "loss": 0.5571, "step": 1800 }, { "epoch": 1.0595647193585338, "grad_norm": 44.88418960571289, "learning_rate": 9.470217640320734e-06, "loss": 0.5963, "step": 1850 }, { "epoch": 1.088201603665521, "grad_norm": 7.115338325500488, "learning_rate": 9.45589919816724e-06, "loss": 0.4956, "step": 1900 }, { "epoch": 1.1168384879725086, "grad_norm": 16.892269134521484, "learning_rate": 9.441580756013746e-06, "loss": 0.4767, "step": 1950 }, { "epoch": 1.145475372279496, "grad_norm": 30.78487205505371, "learning_rate": 9.427262313860252e-06, "loss": 0.5456, "step": 2000 }, { "epoch": 1.1741122565864834, "grad_norm": 12.62244987487793, "learning_rate": 9.41294387170676e-06, "loss": 0.531, "step": 2050 }, { "epoch": 1.2027491408934707, "grad_norm": 20.651899337768555, "learning_rate": 9.398625429553266e-06, "loss": 0.529, "step": 2100 }, { "epoch": 1.2313860252004583, "grad_norm": 21.23448371887207, "learning_rate": 9.384306987399772e-06, "loss": 0.5307, "step": 2150 }, { "epoch": 1.2600229095074456, "grad_norm": 6.756899356842041, "learning_rate": 9.369988545246278e-06, "loss": 0.5302, "step": 2200 }, { "epoch": 1.2886597938144329, "grad_norm": 11.766059875488281, "learning_rate": 9.355670103092784e-06, "loss": 0.5169, "step": 2250 }, { "epoch": 1.3172966781214204, "grad_norm": 12.042119026184082, "learning_rate": 9.341351660939291e-06, "loss": 0.4705, "step": 2300 }, { "epoch": 1.345933562428408, "grad_norm": 13.262438774108887, "learning_rate": 9.327033218785797e-06, "loss": 0.5263, "step": 2350 }, { "epoch": 1.3745704467353952, "grad_norm": 24.665464401245117, "learning_rate": 9.312714776632303e-06, "loss": 0.5039, "step": 2400 }, { "epoch": 1.4032073310423825, "grad_norm": 24.94556427001953, "learning_rate": 9.29839633447881e-06, "loss": 0.5736, "step": 2450 }, { "epoch": 1.43184421534937, "grad_norm": 18.88966941833496, "learning_rate": 9.284077892325315e-06, "loss": 0.5569, "step": 2500 }, { "epoch": 1.4604810996563573, "grad_norm": 8.994138717651367, "learning_rate": 9.269759450171823e-06, "loss": 0.4279, "step": 2550 }, { "epoch": 1.4891179839633448, "grad_norm": 8.518653869628906, "learning_rate": 9.255441008018329e-06, "loss": 0.4764, "step": 2600 }, { "epoch": 1.5177548682703321, "grad_norm": 35.21455764770508, "learning_rate": 9.241122565864835e-06, "loss": 0.5318, "step": 2650 }, { "epoch": 1.5463917525773194, "grad_norm": 13.781972885131836, "learning_rate": 9.226804123711341e-06, "loss": 0.4229, "step": 2700 }, { "epoch": 1.575028636884307, "grad_norm": 7.166682720184326, "learning_rate": 9.212485681557847e-06, "loss": 0.511, "step": 2750 }, { "epoch": 1.6036655211912945, "grad_norm": 20.155414581298828, "learning_rate": 9.198167239404353e-06, "loss": 0.5216, "step": 2800 }, { "epoch": 1.6323024054982818, "grad_norm": 13.758686065673828, "learning_rate": 9.183848797250859e-06, "loss": 0.5359, "step": 2850 }, { "epoch": 1.660939289805269, "grad_norm": 17.821603775024414, "learning_rate": 9.169530355097367e-06, "loss": 0.5762, "step": 2900 }, { "epoch": 1.6895761741122566, "grad_norm": 22.8956356048584, "learning_rate": 9.155211912943873e-06, "loss": 0.5629, "step": 2950 }, { "epoch": 1.718213058419244, "grad_norm": 3.841970443725586, "learning_rate": 9.140893470790379e-06, "loss": 0.4847, "step": 3000 }, { "epoch": 1.7468499427262314, "grad_norm": 22.353851318359375, "learning_rate": 9.126575028636885e-06, "loss": 0.499, "step": 3050 }, { "epoch": 1.7754868270332187, "grad_norm": 6.219491481781006, "learning_rate": 9.11225658648339e-06, "loss": 0.6343, "step": 3100 }, { "epoch": 1.8041237113402062, "grad_norm": 26.25059700012207, "learning_rate": 9.097938144329898e-06, "loss": 0.5259, "step": 3150 }, { "epoch": 1.8327605956471937, "grad_norm": 14.963705062866211, "learning_rate": 9.083619702176404e-06, "loss": 0.5091, "step": 3200 }, { "epoch": 1.861397479954181, "grad_norm": 4.209954261779785, "learning_rate": 9.06930126002291e-06, "loss": 0.5591, "step": 3250 }, { "epoch": 1.8900343642611683, "grad_norm": 12.075881958007812, "learning_rate": 9.054982817869416e-06, "loss": 0.4983, "step": 3300 }, { "epoch": 1.9186712485681556, "grad_norm": 14.535738945007324, "learning_rate": 9.040664375715922e-06, "loss": 0.4013, "step": 3350 }, { "epoch": 1.9473081328751431, "grad_norm": 12.017505645751953, "learning_rate": 9.02634593356243e-06, "loss": 0.5307, "step": 3400 }, { "epoch": 1.9759450171821307, "grad_norm": 24.822385787963867, "learning_rate": 9.012027491408936e-06, "loss": 0.4848, "step": 3450 }, { "epoch": 2.0, "eval_accuracy": 0.7576099210822999, "eval_f1": 0.7572228179173918, "eval_loss": 0.7472622394561768, "eval_runtime": 36.2957, "eval_samples_per_second": 73.314, "eval_steps_per_second": 9.175, "step": 3492 }, { "epoch": 2.004581901489118, "grad_norm": 0.584539532661438, "learning_rate": 8.997709049255442e-06, "loss": 0.4442, "step": 3500 }, { "epoch": 2.0332187857961053, "grad_norm": 48.69373321533203, "learning_rate": 8.983390607101948e-06, "loss": 0.3488, "step": 3550 }, { "epoch": 2.0618556701030926, "grad_norm": 6.41154670715332, "learning_rate": 8.969072164948455e-06, "loss": 0.3281, "step": 3600 }, { "epoch": 2.0904925544100803, "grad_norm": 29.08100700378418, "learning_rate": 8.954753722794961e-06, "loss": 0.2916, "step": 3650 }, { "epoch": 2.1191294387170676, "grad_norm": 9.118326187133789, "learning_rate": 8.940435280641467e-06, "loss": 0.2665, "step": 3700 }, { "epoch": 2.147766323024055, "grad_norm": 3.8800840377807617, "learning_rate": 8.926116838487973e-06, "loss": 0.2697, "step": 3750 }, { "epoch": 2.176403207331042, "grad_norm": 14.715743064880371, "learning_rate": 8.91179839633448e-06, "loss": 0.3374, "step": 3800 }, { "epoch": 2.20504009163803, "grad_norm": 46.47842025756836, "learning_rate": 8.897479954180985e-06, "loss": 0.2725, "step": 3850 }, { "epoch": 2.2336769759450172, "grad_norm": 25.640724182128906, "learning_rate": 8.883161512027491e-06, "loss": 0.2656, "step": 3900 }, { "epoch": 2.2623138602520045, "grad_norm": 33.861568450927734, "learning_rate": 8.868843069873997e-06, "loss": 0.3202, "step": 3950 }, { "epoch": 2.290950744558992, "grad_norm": 1.642096996307373, "learning_rate": 8.854524627720505e-06, "loss": 0.2686, "step": 4000 }, { "epoch": 2.319587628865979, "grad_norm": 10.57680606842041, "learning_rate": 8.840206185567011e-06, "loss": 0.2817, "step": 4050 }, { "epoch": 2.348224513172967, "grad_norm": 22.718547821044922, "learning_rate": 8.825887743413517e-06, "loss": 0.3209, "step": 4100 }, { "epoch": 2.376861397479954, "grad_norm": 39.80268859863281, "learning_rate": 8.811569301260023e-06, "loss": 0.3598, "step": 4150 }, { "epoch": 2.4054982817869415, "grad_norm": 13.368090629577637, "learning_rate": 8.797250859106529e-06, "loss": 0.3448, "step": 4200 }, { "epoch": 2.434135166093929, "grad_norm": 55.12387466430664, "learning_rate": 8.782932416953037e-06, "loss": 0.2845, "step": 4250 }, { "epoch": 2.4627720504009165, "grad_norm": 8.037142753601074, "learning_rate": 8.768613974799543e-06, "loss": 0.2482, "step": 4300 }, { "epoch": 2.491408934707904, "grad_norm": 14.919239044189453, "learning_rate": 8.754295532646049e-06, "loss": 0.2222, "step": 4350 }, { "epoch": 2.520045819014891, "grad_norm": 30.16111946105957, "learning_rate": 8.739977090492555e-06, "loss": 0.2577, "step": 4400 }, { "epoch": 2.5486827033218784, "grad_norm": 18.18327522277832, "learning_rate": 8.725658648339062e-06, "loss": 0.2802, "step": 4450 }, { "epoch": 2.5773195876288657, "grad_norm": 14.579641342163086, "learning_rate": 8.711340206185568e-06, "loss": 0.2528, "step": 4500 }, { "epoch": 2.6059564719358534, "grad_norm": 17.266183853149414, "learning_rate": 8.697021764032074e-06, "loss": 0.3592, "step": 4550 }, { "epoch": 2.6345933562428407, "grad_norm": 16.718862533569336, "learning_rate": 8.68270332187858e-06, "loss": 0.3184, "step": 4600 }, { "epoch": 2.663230240549828, "grad_norm": 25.249649047851562, "learning_rate": 8.668384879725086e-06, "loss": 0.2697, "step": 4650 }, { "epoch": 2.691867124856816, "grad_norm": 11.022406578063965, "learning_rate": 8.654066437571594e-06, "loss": 0.2776, "step": 4700 }, { "epoch": 2.720504009163803, "grad_norm": 8.15123176574707, "learning_rate": 8.6397479954181e-06, "loss": 0.3462, "step": 4750 }, { "epoch": 2.7491408934707904, "grad_norm": 11.458098411560059, "learning_rate": 8.625429553264606e-06, "loss": 0.3657, "step": 4800 }, { "epoch": 2.7777777777777777, "grad_norm": 3.9528236389160156, "learning_rate": 8.611111111111112e-06, "loss": 0.3086, "step": 4850 }, { "epoch": 2.806414662084765, "grad_norm": 39.1205940246582, "learning_rate": 8.596792668957618e-06, "loss": 0.3528, "step": 4900 }, { "epoch": 2.8350515463917527, "grad_norm": 53.56584167480469, "learning_rate": 8.582474226804124e-06, "loss": 0.2909, "step": 4950 }, { "epoch": 2.86368843069874, "grad_norm": 51.294132232666016, "learning_rate": 8.56815578465063e-06, "loss": 0.2571, "step": 5000 }, { "epoch": 2.8923253150057273, "grad_norm": 48.5758056640625, "learning_rate": 8.553837342497136e-06, "loss": 0.2812, "step": 5050 }, { "epoch": 2.9209621993127146, "grad_norm": 9.625508308410645, "learning_rate": 8.539518900343643e-06, "loss": 0.2606, "step": 5100 }, { "epoch": 2.9495990836197024, "grad_norm": 22.97384262084961, "learning_rate": 8.52520045819015e-06, "loss": 0.3566, "step": 5150 }, { "epoch": 2.9782359679266897, "grad_norm": 34.34775161743164, "learning_rate": 8.510882016036655e-06, "loss": 0.3251, "step": 5200 }, { "epoch": 3.0, "eval_accuracy": 0.7493423524990604, "eval_f1": 0.7488536960670391, "eval_loss": 0.9042189121246338, "eval_runtime": 36.3022, "eval_samples_per_second": 73.301, "eval_steps_per_second": 9.173, "step": 5238 }, { "epoch": 3.006872852233677, "grad_norm": 0.47577229142189026, "learning_rate": 8.496563573883161e-06, "loss": 0.2671, "step": 5250 }, { "epoch": 3.0355097365406642, "grad_norm": 36.310089111328125, "learning_rate": 8.482245131729669e-06, "loss": 0.1493, "step": 5300 }, { "epoch": 3.0641466208476515, "grad_norm": 29.30960464477539, "learning_rate": 8.467926689576175e-06, "loss": 0.1603, "step": 5350 }, { "epoch": 3.0927835051546393, "grad_norm": 2.112786054611206, "learning_rate": 8.453608247422681e-06, "loss": 0.1363, "step": 5400 }, { "epoch": 3.1214203894616266, "grad_norm": 3.0346627235412598, "learning_rate": 8.439289805269187e-06, "loss": 0.1359, "step": 5450 }, { "epoch": 3.150057273768614, "grad_norm": 22.331645965576172, "learning_rate": 8.424971363115693e-06, "loss": 0.1479, "step": 5500 }, { "epoch": 3.178694158075601, "grad_norm": 13.065728187561035, "learning_rate": 8.4106529209622e-06, "loss": 0.1311, "step": 5550 }, { "epoch": 3.207331042382589, "grad_norm": 3.163928270339966, "learning_rate": 8.396334478808707e-06, "loss": 0.1364, "step": 5600 }, { "epoch": 3.2359679266895762, "grad_norm": 33.371849060058594, "learning_rate": 8.382016036655213e-06, "loss": 0.1485, "step": 5650 }, { "epoch": 3.2646048109965635, "grad_norm": 24.721717834472656, "learning_rate": 8.367697594501719e-06, "loss": 0.155, "step": 5700 }, { "epoch": 3.293241695303551, "grad_norm": 15.719419479370117, "learning_rate": 8.353379152348225e-06, "loss": 0.1539, "step": 5750 }, { "epoch": 3.3218785796105386, "grad_norm": 41.62794876098633, "learning_rate": 8.339060710194732e-06, "loss": 0.1695, "step": 5800 }, { "epoch": 3.350515463917526, "grad_norm": 1.7000266313552856, "learning_rate": 8.324742268041238e-06, "loss": 0.187, "step": 5850 }, { "epoch": 3.379152348224513, "grad_norm": 9.574384689331055, "learning_rate": 8.310423825887744e-06, "loss": 0.1909, "step": 5900 }, { "epoch": 3.4077892325315005, "grad_norm": 19.80113410949707, "learning_rate": 8.29610538373425e-06, "loss": 0.1656, "step": 5950 }, { "epoch": 3.436426116838488, "grad_norm": 71.53827667236328, "learning_rate": 8.281786941580758e-06, "loss": 0.1116, "step": 6000 }, { "epoch": 3.4650630011454755, "grad_norm": 65.89808654785156, "learning_rate": 8.267468499427262e-06, "loss": 0.188, "step": 6050 }, { "epoch": 3.493699885452463, "grad_norm": 60.68288803100586, "learning_rate": 8.253150057273768e-06, "loss": 0.1315, "step": 6100 }, { "epoch": 3.52233676975945, "grad_norm": 25.550884246826172, "learning_rate": 8.238831615120276e-06, "loss": 0.1194, "step": 6150 }, { "epoch": 3.5509736540664374, "grad_norm": 6.416481971740723, "learning_rate": 8.224513172966782e-06, "loss": 0.134, "step": 6200 }, { "epoch": 3.579610538373425, "grad_norm": 1.5923579931259155, "learning_rate": 8.210194730813288e-06, "loss": 0.1672, "step": 6250 }, { "epoch": 3.6082474226804124, "grad_norm": 19.590898513793945, "learning_rate": 8.195876288659794e-06, "loss": 0.14, "step": 6300 }, { "epoch": 3.6368843069873997, "grad_norm": 0.4376499652862549, "learning_rate": 8.1815578465063e-06, "loss": 0.1425, "step": 6350 }, { "epoch": 3.665521191294387, "grad_norm": 11.566116333007812, "learning_rate": 8.167239404352808e-06, "loss": 0.205, "step": 6400 }, { "epoch": 3.6941580756013748, "grad_norm": 62.71388626098633, "learning_rate": 8.152920962199314e-06, "loss": 0.184, "step": 6450 }, { "epoch": 3.722794959908362, "grad_norm": 0.32115602493286133, "learning_rate": 8.13860252004582e-06, "loss": 0.1267, "step": 6500 }, { "epoch": 3.7514318442153494, "grad_norm": 53.07960891723633, "learning_rate": 8.124284077892326e-06, "loss": 0.1796, "step": 6550 }, { "epoch": 3.7800687285223367, "grad_norm": 2.265838861465454, "learning_rate": 8.109965635738832e-06, "loss": 0.2187, "step": 6600 }, { "epoch": 3.808705612829324, "grad_norm": 0.6725891828536987, "learning_rate": 8.09564719358534e-06, "loss": 0.1518, "step": 6650 }, { "epoch": 3.8373424971363117, "grad_norm": 53.60797882080078, "learning_rate": 8.081328751431845e-06, "loss": 0.1619, "step": 6700 }, { "epoch": 3.865979381443299, "grad_norm": 22.341100692749023, "learning_rate": 8.067010309278351e-06, "loss": 0.1378, "step": 6750 }, { "epoch": 3.8946162657502863, "grad_norm": 64.15727996826172, "learning_rate": 8.052691867124857e-06, "loss": 0.1445, "step": 6800 }, { "epoch": 3.923253150057274, "grad_norm": 36.292484283447266, "learning_rate": 8.038373424971365e-06, "loss": 0.1612, "step": 6850 }, { "epoch": 3.9518900343642613, "grad_norm": 4.331699848175049, "learning_rate": 8.02405498281787e-06, "loss": 0.2337, "step": 6900 }, { "epoch": 3.9805269186712486, "grad_norm": 0.28292712569236755, "learning_rate": 8.009736540664377e-06, "loss": 0.1779, "step": 6950 }, { "epoch": 4.0, "eval_accuracy": 0.7606163096580233, "eval_f1": 0.7597313264211647, "eval_loss": 0.9818114042282104, "eval_runtime": 36.2859, "eval_samples_per_second": 73.334, "eval_steps_per_second": 9.177, "step": 6984 }, { "epoch": 4.009163802978236, "grad_norm": 14.622065544128418, "learning_rate": 7.995418098510883e-06, "loss": 0.1005, "step": 7000 }, { "epoch": 4.037800687285223, "grad_norm": 0.4035070836544037, "learning_rate": 7.981099656357389e-06, "loss": 0.0777, "step": 7050 }, { "epoch": 4.0664375715922105, "grad_norm": 31.626924514770508, "learning_rate": 7.966781214203895e-06, "loss": 0.0581, "step": 7100 }, { "epoch": 4.095074455899198, "grad_norm": 0.15928135812282562, "learning_rate": 7.9524627720504e-06, "loss": 0.0477, "step": 7150 }, { "epoch": 4.123711340206185, "grad_norm": 49.666709899902344, "learning_rate": 7.938144329896907e-06, "loss": 0.0752, "step": 7200 }, { "epoch": 4.152348224513173, "grad_norm": 0.6751652956008911, "learning_rate": 7.923825887743414e-06, "loss": 0.0807, "step": 7250 }, { "epoch": 4.180985108820161, "grad_norm": 0.623131513595581, "learning_rate": 7.90950744558992e-06, "loss": 0.1205, "step": 7300 }, { "epoch": 4.209621993127148, "grad_norm": 0.47417309880256653, "learning_rate": 7.895189003436426e-06, "loss": 0.0286, "step": 7350 }, { "epoch": 4.238258877434135, "grad_norm": 0.37885400652885437, "learning_rate": 7.880870561282932e-06, "loss": 0.0747, "step": 7400 }, { "epoch": 4.2668957617411225, "grad_norm": 1.7955352067947388, "learning_rate": 7.866552119129438e-06, "loss": 0.0544, "step": 7450 }, { "epoch": 4.29553264604811, "grad_norm": 46.27180862426758, "learning_rate": 7.852233676975946e-06, "loss": 0.0447, "step": 7500 }, { "epoch": 4.324169530355097, "grad_norm": 2.345630645751953, "learning_rate": 7.837915234822452e-06, "loss": 0.0599, "step": 7550 }, { "epoch": 4.352806414662084, "grad_norm": 0.334553986787796, "learning_rate": 7.823596792668958e-06, "loss": 0.0494, "step": 7600 }, { "epoch": 4.381443298969073, "grad_norm": 8.703739166259766, "learning_rate": 7.809278350515464e-06, "loss": 0.0523, "step": 7650 }, { "epoch": 4.41008018327606, "grad_norm": 36.21126937866211, "learning_rate": 7.794959908361972e-06, "loss": 0.115, "step": 7700 }, { "epoch": 4.438717067583047, "grad_norm": 0.7834786772727966, "learning_rate": 7.780641466208478e-06, "loss": 0.0602, "step": 7750 }, { "epoch": 4.4673539518900345, "grad_norm": 1.5702382326126099, "learning_rate": 7.766323024054984e-06, "loss": 0.1341, "step": 7800 }, { "epoch": 4.495990836197022, "grad_norm": 0.2708655595779419, "learning_rate": 7.75200458190149e-06, "loss": 0.1514, "step": 7850 }, { "epoch": 4.524627720504009, "grad_norm": 0.48853299021720886, "learning_rate": 7.737686139747996e-06, "loss": 0.1418, "step": 7900 }, { "epoch": 4.553264604810996, "grad_norm": 36.37514114379883, "learning_rate": 7.723367697594503e-06, "loss": 0.0663, "step": 7950 }, { "epoch": 4.581901489117984, "grad_norm": 30.906522750854492, "learning_rate": 7.70904925544101e-06, "loss": 0.0655, "step": 8000 }, { "epoch": 4.610538373424971, "grad_norm": 3.7403993606567383, "learning_rate": 7.694730813287515e-06, "loss": 0.0496, "step": 8050 }, { "epoch": 4.639175257731958, "grad_norm": 0.10339858382940292, "learning_rate": 7.680412371134021e-06, "loss": 0.1213, "step": 8100 }, { "epoch": 4.6678121420389465, "grad_norm": 0.03920993208885193, "learning_rate": 7.666093928980529e-06, "loss": 0.1407, "step": 8150 }, { "epoch": 4.696449026345934, "grad_norm": 70.78038024902344, "learning_rate": 7.651775486827033e-06, "loss": 0.0922, "step": 8200 }, { "epoch": 4.725085910652921, "grad_norm": 20.5271053314209, "learning_rate": 7.63745704467354e-06, "loss": 0.1784, "step": 8250 }, { "epoch": 4.753722794959908, "grad_norm": 0.17482055723667145, "learning_rate": 7.623138602520046e-06, "loss": 0.1508, "step": 8300 }, { "epoch": 4.782359679266896, "grad_norm": 33.39112091064453, "learning_rate": 7.608820160366552e-06, "loss": 0.1115, "step": 8350 }, { "epoch": 4.810996563573883, "grad_norm": 0.4592705965042114, "learning_rate": 7.594501718213059e-06, "loss": 0.1188, "step": 8400 }, { "epoch": 4.83963344788087, "grad_norm": 38.45854187011719, "learning_rate": 7.580183276059565e-06, "loss": 0.0788, "step": 8450 }, { "epoch": 4.868270332187858, "grad_norm": 60.54808044433594, "learning_rate": 7.565864833906072e-06, "loss": 0.1159, "step": 8500 }, { "epoch": 4.896907216494846, "grad_norm": 0.5872980952262878, "learning_rate": 7.551546391752578e-06, "loss": 0.0984, "step": 8550 }, { "epoch": 4.925544100801833, "grad_norm": 0.09889842569828033, "learning_rate": 7.5372279495990845e-06, "loss": 0.1722, "step": 8600 }, { "epoch": 4.95418098510882, "grad_norm": 0.07930275052785873, "learning_rate": 7.5229095074455904e-06, "loss": 0.0861, "step": 8650 }, { "epoch": 4.982817869415808, "grad_norm": 9.635697364807129, "learning_rate": 7.5085910652920964e-06, "loss": 0.1452, "step": 8700 }, { "epoch": 5.0, "eval_accuracy": 0.7602405110860578, "eval_f1": 0.7599417916848927, "eval_loss": 1.1236233711242676, "eval_runtime": 36.3176, "eval_samples_per_second": 73.27, "eval_steps_per_second": 9.169, "step": 8730 }, { "epoch": 5.011454753722795, "grad_norm": 0.5619956254959106, "learning_rate": 7.494272623138603e-06, "loss": 0.0687, "step": 8750 }, { "epoch": 5.040091638029782, "grad_norm": 85.73548126220703, "learning_rate": 7.479954180985109e-06, "loss": 0.0478, "step": 8800 }, { "epoch": 5.0687285223367695, "grad_norm": 13.715363502502441, "learning_rate": 7.465635738831616e-06, "loss": 0.0396, "step": 8850 }, { "epoch": 5.097365406643757, "grad_norm": 70.08007049560547, "learning_rate": 7.451317296678122e-06, "loss": 0.0562, "step": 8900 }, { "epoch": 5.126002290950744, "grad_norm": 0.8938388824462891, "learning_rate": 7.436998854524629e-06, "loss": 0.0974, "step": 8950 }, { "epoch": 5.154639175257732, "grad_norm": 0.3734937012195587, "learning_rate": 7.422680412371135e-06, "loss": 0.0742, "step": 9000 }, { "epoch": 5.18327605956472, "grad_norm": 10.571038246154785, "learning_rate": 7.408361970217641e-06, "loss": 0.021, "step": 9050 }, { "epoch": 5.211912943871707, "grad_norm": 0.09976017475128174, "learning_rate": 7.394043528064148e-06, "loss": 0.0546, "step": 9100 }, { "epoch": 5.240549828178694, "grad_norm": 0.07766138762235641, "learning_rate": 7.379725085910654e-06, "loss": 0.0827, "step": 9150 }, { "epoch": 5.2691867124856815, "grad_norm": 0.021182745695114136, "learning_rate": 7.3654066437571605e-06, "loss": 0.0508, "step": 9200 }, { "epoch": 5.297823596792669, "grad_norm": 0.08311637490987778, "learning_rate": 7.3510882016036665e-06, "loss": 0.0359, "step": 9250 }, { "epoch": 5.326460481099656, "grad_norm": 10.934857368469238, "learning_rate": 7.336769759450172e-06, "loss": 0.0431, "step": 9300 }, { "epoch": 5.355097365406643, "grad_norm": 0.061881110072135925, "learning_rate": 7.3224513172966785e-06, "loss": 0.0245, "step": 9350 }, { "epoch": 5.383734249713632, "grad_norm": 0.09978004544973373, "learning_rate": 7.3081328751431845e-06, "loss": 0.0201, "step": 9400 }, { "epoch": 5.412371134020619, "grad_norm": 0.04522474855184555, "learning_rate": 7.293814432989691e-06, "loss": 0.0554, "step": 9450 }, { "epoch": 5.441008018327606, "grad_norm": 27.210500717163086, "learning_rate": 7.279495990836197e-06, "loss": 0.0289, "step": 9500 }, { "epoch": 5.4696449026345935, "grad_norm": 88.04645538330078, "learning_rate": 7.265177548682703e-06, "loss": 0.0786, "step": 9550 }, { "epoch": 5.498281786941581, "grad_norm": 9.045210838317871, "learning_rate": 7.25085910652921e-06, "loss": 0.0776, "step": 9600 }, { "epoch": 5.526918671248568, "grad_norm": 0.031076449900865555, "learning_rate": 7.236540664375716e-06, "loss": 0.0917, "step": 9650 }, { "epoch": 5.555555555555555, "grad_norm": 0.07342424243688583, "learning_rate": 7.222222222222223e-06, "loss": 0.067, "step": 9700 }, { "epoch": 5.584192439862543, "grad_norm": 0.07349126785993576, "learning_rate": 7.207903780068729e-06, "loss": 0.0275, "step": 9750 }, { "epoch": 5.61282932416953, "grad_norm": 0.4687565267086029, "learning_rate": 7.193585337915236e-06, "loss": 0.0308, "step": 9800 }, { "epoch": 5.641466208476518, "grad_norm": 0.03109198622405529, "learning_rate": 7.179266895761742e-06, "loss": 0.0512, "step": 9850 }, { "epoch": 5.670103092783505, "grad_norm": 2.063464403152466, "learning_rate": 7.164948453608248e-06, "loss": 0.0436, "step": 9900 }, { "epoch": 5.698739977090493, "grad_norm": 0.7259678244590759, "learning_rate": 7.1506300114547545e-06, "loss": 0.0452, "step": 9950 }, { "epoch": 5.72737686139748, "grad_norm": 22.87729263305664, "learning_rate": 7.1363115693012605e-06, "loss": 0.0871, "step": 10000 }, { "epoch": 5.756013745704467, "grad_norm": 0.6141005754470825, "learning_rate": 7.121993127147767e-06, "loss": 0.0432, "step": 10050 }, { "epoch": 5.784650630011455, "grad_norm": 0.29292017221450806, "learning_rate": 7.107674684994273e-06, "loss": 0.0424, "step": 10100 }, { "epoch": 5.813287514318442, "grad_norm": 0.059430770576000214, "learning_rate": 7.09335624284078e-06, "loss": 0.0751, "step": 10150 }, { "epoch": 5.841924398625429, "grad_norm": 10.410508155822754, "learning_rate": 7.079037800687286e-06, "loss": 0.1061, "step": 10200 }, { "epoch": 5.870561282932417, "grad_norm": 1.1330084800720215, "learning_rate": 7.064719358533792e-06, "loss": 0.0421, "step": 10250 }, { "epoch": 5.899198167239405, "grad_norm": 0.1360820233821869, "learning_rate": 7.050400916380299e-06, "loss": 0.0664, "step": 10300 }, { "epoch": 5.927835051546392, "grad_norm": 1.3874069452285767, "learning_rate": 7.036082474226805e-06, "loss": 0.1134, "step": 10350 }, { "epoch": 5.956471935853379, "grad_norm": 0.0816207155585289, "learning_rate": 7.02176403207331e-06, "loss": 0.052, "step": 10400 }, { "epoch": 5.985108820160367, "grad_norm": 0.0460013747215271, "learning_rate": 7.007445589919817e-06, "loss": 0.0362, "step": 10450 }, { "epoch": 6.0, "eval_accuracy": 0.7591131153701616, "eval_f1": 0.7565525682738199, "eval_loss": 1.2660281658172607, "eval_runtime": 36.2808, "eval_samples_per_second": 73.345, "eval_steps_per_second": 9.178, "step": 10476 }, { "epoch": 6.013745704467354, "grad_norm": 0.09728560596704483, "learning_rate": 6.993127147766323e-06, "loss": 0.0709, "step": 10500 }, { "epoch": 6.042382588774341, "grad_norm": 0.10100292414426804, "learning_rate": 6.97880870561283e-06, "loss": 0.0754, "step": 10550 }, { "epoch": 6.0710194730813285, "grad_norm": 0.19107265770435333, "learning_rate": 6.964490263459336e-06, "loss": 0.0408, "step": 10600 }, { "epoch": 6.099656357388316, "grad_norm": 0.0762198194861412, "learning_rate": 6.9501718213058426e-06, "loss": 0.0228, "step": 10650 }, { "epoch": 6.128293241695303, "grad_norm": 0.023416342213749886, "learning_rate": 6.9358533791523485e-06, "loss": 0.0326, "step": 10700 }, { "epoch": 6.156930126002291, "grad_norm": 0.13443826138973236, "learning_rate": 6.9215349369988545e-06, "loss": 0.021, "step": 10750 }, { "epoch": 6.185567010309279, "grad_norm": 47.05551528930664, "learning_rate": 6.907216494845361e-06, "loss": 0.0505, "step": 10800 }, { "epoch": 6.214203894616266, "grad_norm": 0.19994887709617615, "learning_rate": 6.892898052691867e-06, "loss": 0.044, "step": 10850 }, { "epoch": 6.242840778923253, "grad_norm": 6.800403118133545, "learning_rate": 6.878579610538374e-06, "loss": 0.0212, "step": 10900 }, { "epoch": 6.2714776632302405, "grad_norm": 0.18680407106876373, "learning_rate": 6.86426116838488e-06, "loss": 0.04, "step": 10950 }, { "epoch": 6.300114547537228, "grad_norm": 2.269585132598877, "learning_rate": 6.849942726231387e-06, "loss": 0.0385, "step": 11000 }, { "epoch": 6.328751431844215, "grad_norm": 0.05255923420190811, "learning_rate": 6.835624284077893e-06, "loss": 0.0364, "step": 11050 }, { "epoch": 6.357388316151202, "grad_norm": 0.026086222380399704, "learning_rate": 6.821305841924399e-06, "loss": 0.0462, "step": 11100 }, { "epoch": 6.3860252004581906, "grad_norm": 0.026915445923805237, "learning_rate": 6.806987399770906e-06, "loss": 0.0473, "step": 11150 }, { "epoch": 6.414662084765178, "grad_norm": 30.492359161376953, "learning_rate": 6.792668957617412e-06, "loss": 0.0508, "step": 11200 }, { "epoch": 6.443298969072165, "grad_norm": 0.030431082472205162, "learning_rate": 6.778350515463919e-06, "loss": 0.0555, "step": 11250 }, { "epoch": 6.4719358533791524, "grad_norm": 0.19676260650157928, "learning_rate": 6.764032073310425e-06, "loss": 0.0435, "step": 11300 }, { "epoch": 6.50057273768614, "grad_norm": 0.3724329471588135, "learning_rate": 6.7497136311569314e-06, "loss": 0.0209, "step": 11350 }, { "epoch": 6.529209621993127, "grad_norm": 0.07832983881235123, "learning_rate": 6.735395189003437e-06, "loss": 0.0368, "step": 11400 }, { "epoch": 6.557846506300114, "grad_norm": 1.6286810636520386, "learning_rate": 6.721076746849944e-06, "loss": 0.0546, "step": 11450 }, { "epoch": 6.586483390607102, "grad_norm": 10.142606735229492, "learning_rate": 6.706758304696449e-06, "loss": 0.0537, "step": 11500 }, { "epoch": 6.615120274914089, "grad_norm": 0.03260861337184906, "learning_rate": 6.692439862542955e-06, "loss": 0.0801, "step": 11550 }, { "epoch": 6.643757159221077, "grad_norm": 0.06774479895830154, "learning_rate": 6.678121420389461e-06, "loss": 0.0276, "step": 11600 }, { "epoch": 6.672394043528064, "grad_norm": 0.10083146393299103, "learning_rate": 6.663802978235968e-06, "loss": 0.0355, "step": 11650 }, { "epoch": 6.701030927835052, "grad_norm": 0.5465721487998962, "learning_rate": 6.649484536082474e-06, "loss": 0.0531, "step": 11700 }, { "epoch": 6.729667812142039, "grad_norm": 0.36361464858055115, "learning_rate": 6.635166093928981e-06, "loss": 0.0534, "step": 11750 }, { "epoch": 6.758304696449026, "grad_norm": 0.3213222622871399, "learning_rate": 6.620847651775487e-06, "loss": 0.021, "step": 11800 }, { "epoch": 6.786941580756014, "grad_norm": 1.3060314655303955, "learning_rate": 6.606529209621994e-06, "loss": 0.0371, "step": 11850 }, { "epoch": 6.815578465063001, "grad_norm": 15.44947338104248, "learning_rate": 6.5922107674685e-06, "loss": 0.0475, "step": 11900 }, { "epoch": 6.844215349369988, "grad_norm": 0.1092597097158432, "learning_rate": 6.577892325315006e-06, "loss": 0.0549, "step": 11950 }, { "epoch": 6.872852233676976, "grad_norm": 0.24638314545154572, "learning_rate": 6.563573883161513e-06, "loss": 0.044, "step": 12000 }, { "epoch": 6.901489117983964, "grad_norm": 0.17163485288619995, "learning_rate": 6.549255441008019e-06, "loss": 0.0478, "step": 12050 }, { "epoch": 6.930126002290951, "grad_norm": 0.5006637573242188, "learning_rate": 6.5349369988545254e-06, "loss": 0.0595, "step": 12100 }, { "epoch": 6.958762886597938, "grad_norm": 0.09898879379034042, "learning_rate": 6.520618556701031e-06, "loss": 0.0415, "step": 12150 }, { "epoch": 6.987399770904926, "grad_norm": 0.8853304386138916, "learning_rate": 6.506300114547538e-06, "loss": 0.023, "step": 12200 }, { "epoch": 7.0, "eval_accuracy": 0.7718902668169861, "eval_f1": 0.772007503464447, "eval_loss": 1.2936097383499146, "eval_runtime": 36.2937, "eval_samples_per_second": 73.319, "eval_steps_per_second": 9.175, "step": 12222 } ], "logging_steps": 50, "max_steps": 34920, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.283848509009536e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }