diff --git "a/checkpoint-61270/trainer_state.json" "b/checkpoint-61270/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-61270/trainer_state.json" @@ -0,0 +1,17390 @@ +{ + "best_metric": 6.590800762176514, + "best_model_checkpoint": "bert-base-german-europeana-uncased-dnb/checkpoint-61270", + "epoch": 11.0, + "eval_steps": 500, + "global_step": 61270, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004488330341113106, + "grad_norm": 16.327289581298828, + "learning_rate": 4.955116696588869e-08, + "loss": 10.0842, + "step": 25 + }, + { + "epoch": 0.008976660682226212, + "grad_norm": 22.682701110839844, + "learning_rate": 1.0125673249551168e-07, + "loss": 10.0723, + "step": 50 + }, + { + "epoch": 0.013464991023339317, + "grad_norm": 14.136711120605469, + "learning_rate": 1.5511669658886896e-07, + "loss": 10.0543, + "step": 75 + }, + { + "epoch": 0.017953321364452424, + "grad_norm": 14.158872604370117, + "learning_rate": 2.089766606822262e-07, + "loss": 10.0814, + "step": 100 + }, + { + "epoch": 0.02244165170556553, + "grad_norm": 17.165952682495117, + "learning_rate": 2.6283662477558347e-07, + "loss": 10.0521, + "step": 125 + }, + { + "epoch": 0.026929982046678635, + "grad_norm": 23.887605667114258, + "learning_rate": 3.1669658886894075e-07, + "loss": 10.0814, + "step": 150 + }, + { + "epoch": 0.03141831238779174, + "grad_norm": 54.12178421020508, + "learning_rate": 3.7055655296229803e-07, + "loss": 10.0714, + "step": 175 + }, + { + "epoch": 0.03590664272890485, + "grad_norm": 14.443297386169434, + "learning_rate": 4.244165170556553e-07, + "loss": 10.0503, + "step": 200 + }, + { + "epoch": 0.04039497307001795, + "grad_norm": 62.16147232055664, + "learning_rate": 4.782764811490125e-07, + "loss": 10.0342, + "step": 225 + }, + { + "epoch": 0.04488330341113106, + "grad_norm": 13.414910316467285, + "learning_rate": 5.321364452423699e-07, + "loss": 10.0774, + 
"step": 250 + }, + { + "epoch": 0.04937163375224417, + "grad_norm": 14.340097427368164, + "learning_rate": 5.859964093357271e-07, + "loss": 10.0615, + "step": 275 + }, + { + "epoch": 0.05385996409335727, + "grad_norm": 11.243691444396973, + "learning_rate": 6.398563734290845e-07, + "loss": 10.0652, + "step": 300 + }, + { + "epoch": 0.05834829443447038, + "grad_norm": 29.548250198364258, + "learning_rate": 6.937163375224417e-07, + "loss": 10.0483, + "step": 325 + }, + { + "epoch": 0.06283662477558348, + "grad_norm": 32.95609664916992, + "learning_rate": 7.47576301615799e-07, + "loss": 10.0497, + "step": 350 + }, + { + "epoch": 0.06732495511669659, + "grad_norm": 15.451595306396484, + "learning_rate": 7.992818671454219e-07, + "loss": 10.0511, + "step": 375 + }, + { + "epoch": 0.0718132854578097, + "grad_norm": 15.431848526000977, + "learning_rate": 8.509874326750449e-07, + "loss": 10.0571, + "step": 400 + }, + { + "epoch": 0.0763016157989228, + "grad_norm": 11.111251831054688, + "learning_rate": 9.048473967684021e-07, + "loss": 10.0501, + "step": 425 + }, + { + "epoch": 0.0807899461400359, + "grad_norm": 12.67958927154541, + "learning_rate": 9.587073608617593e-07, + "loss": 10.0374, + "step": 450 + }, + { + "epoch": 0.08527827648114901, + "grad_norm": 10.736528396606445, + "learning_rate": 1.0125673249551168e-06, + "loss": 10.0567, + "step": 475 + }, + { + "epoch": 0.08976660682226212, + "grad_norm": 9.547792434692383, + "learning_rate": 1.066427289048474e-06, + "loss": 10.0457, + "step": 500 + }, + { + "epoch": 0.09425493716337523, + "grad_norm": 9.209278106689453, + "learning_rate": 1.1202872531418311e-06, + "loss": 10.0346, + "step": 525 + }, + { + "epoch": 0.09874326750448834, + "grad_norm": 7.9073967933654785, + "learning_rate": 1.1741472172351884e-06, + "loss": 10.0161, + "step": 550 + }, + { + "epoch": 0.10323159784560143, + "grad_norm": 9.795483589172363, + "learning_rate": 1.228007181328546e-06, + "loss": 10.0149, + "step": 575 + }, + { + "epoch": 
0.10771992818671454, + "grad_norm": 7.628419876098633, + "learning_rate": 1.2818671454219032e-06, + "loss": 10.0293, + "step": 600 + }, + { + "epoch": 0.11220825852782765, + "grad_norm": 11.231724739074707, + "learning_rate": 1.3357271095152603e-06, + "loss": 9.9978, + "step": 625 + }, + { + "epoch": 0.11669658886894076, + "grad_norm": 13.161614418029785, + "learning_rate": 1.3895870736086175e-06, + "loss": 9.9925, + "step": 650 + }, + { + "epoch": 0.12118491921005387, + "grad_norm": 5.85615873336792, + "learning_rate": 1.443447037701975e-06, + "loss": 9.9968, + "step": 675 + }, + { + "epoch": 0.12567324955116696, + "grad_norm": 15.403050422668457, + "learning_rate": 1.497307001795332e-06, + "loss": 9.9242, + "step": 700 + }, + { + "epoch": 0.13016157989228008, + "grad_norm": 8.20634651184082, + "learning_rate": 1.5511669658886894e-06, + "loss": 9.9635, + "step": 725 + }, + { + "epoch": 0.13464991023339318, + "grad_norm": 6.117677688598633, + "learning_rate": 1.6050269299820467e-06, + "loss": 9.9442, + "step": 750 + }, + { + "epoch": 0.13913824057450627, + "grad_norm": 14.311126708984375, + "learning_rate": 1.6588868940754042e-06, + "loss": 9.9455, + "step": 775 + }, + { + "epoch": 0.1436265709156194, + "grad_norm": 7.545867443084717, + "learning_rate": 1.7127468581687612e-06, + "loss": 9.934, + "step": 800 + }, + { + "epoch": 0.1481149012567325, + "grad_norm": 5.972433567047119, + "learning_rate": 1.7666068222621185e-06, + "loss": 9.8992, + "step": 825 + }, + { + "epoch": 0.1526032315978456, + "grad_norm": 5.864813327789307, + "learning_rate": 1.8204667863554758e-06, + "loss": 9.9212, + "step": 850 + }, + { + "epoch": 0.1570915619389587, + "grad_norm": 6.61310338973999, + "learning_rate": 1.8743267504488333e-06, + "loss": 9.8778, + "step": 875 + }, + { + "epoch": 0.1615798922800718, + "grad_norm": 9.430643081665039, + "learning_rate": 1.9281867145421903e-06, + "loss": 9.8697, + "step": 900 + }, + { + "epoch": 0.16606822262118492, + "grad_norm": 5.307265281677246, 
+ "learning_rate": 1.9820466786355474e-06, + "loss": 9.8957, + "step": 925 + }, + { + "epoch": 0.17055655296229802, + "grad_norm": 6.174213886260986, + "learning_rate": 2.035906642728905e-06, + "loss": 9.8656, + "step": 950 + }, + { + "epoch": 0.17504488330341114, + "grad_norm": 5.781116485595703, + "learning_rate": 2.089766606822262e-06, + "loss": 9.8482, + "step": 975 + }, + { + "epoch": 0.17953321364452424, + "grad_norm": 7.3210649490356445, + "learning_rate": 2.1436265709156195e-06, + "loss": 9.8085, + "step": 1000 + }, + { + "epoch": 0.18402154398563733, + "grad_norm": 6.081295490264893, + "learning_rate": 2.197486535008977e-06, + "loss": 9.8168, + "step": 1025 + }, + { + "epoch": 0.18850987432675045, + "grad_norm": 5.694328784942627, + "learning_rate": 2.251346499102334e-06, + "loss": 9.8, + "step": 1050 + }, + { + "epoch": 0.19299820466786355, + "grad_norm": 9.181863784790039, + "learning_rate": 2.3052064631956915e-06, + "loss": 9.8066, + "step": 1075 + }, + { + "epoch": 0.19748653500897667, + "grad_norm": 5.4879841804504395, + "learning_rate": 2.3590664272890486e-06, + "loss": 9.8026, + "step": 1100 + }, + { + "epoch": 0.20197486535008977, + "grad_norm": 5.001553058624268, + "learning_rate": 2.4129263913824057e-06, + "loss": 9.756, + "step": 1125 + }, + { + "epoch": 0.20646319569120286, + "grad_norm": 5.004782676696777, + "learning_rate": 2.466786355475763e-06, + "loss": 9.7393, + "step": 1150 + }, + { + "epoch": 0.21095152603231598, + "grad_norm": 5.531662940979004, + "learning_rate": 2.5206463195691202e-06, + "loss": 9.7446, + "step": 1175 + }, + { + "epoch": 0.21543985637342908, + "grad_norm": 6.672760963439941, + "learning_rate": 2.5745062836624773e-06, + "loss": 9.7205, + "step": 1200 + }, + { + "epoch": 0.2199281867145422, + "grad_norm": 5.203510761260986, + "learning_rate": 2.6283662477558348e-06, + "loss": 9.6994, + "step": 1225 + }, + { + "epoch": 0.2244165170556553, + "grad_norm": 6.208364486694336, + "learning_rate": 2.6822262118491923e-06, + 
"loss": 9.7031, + "step": 1250 + }, + { + "epoch": 0.2289048473967684, + "grad_norm": 5.31331729888916, + "learning_rate": 2.7360861759425498e-06, + "loss": 9.6779, + "step": 1275 + }, + { + "epoch": 0.2333931777378815, + "grad_norm": 5.016303062438965, + "learning_rate": 2.789946140035907e-06, + "loss": 9.7238, + "step": 1300 + }, + { + "epoch": 0.2378815080789946, + "grad_norm": 5.4517130851745605, + "learning_rate": 2.843806104129264e-06, + "loss": 9.6706, + "step": 1325 + }, + { + "epoch": 0.24236983842010773, + "grad_norm": 5.653480529785156, + "learning_rate": 2.8976660682226214e-06, + "loss": 9.6773, + "step": 1350 + }, + { + "epoch": 0.24685816876122083, + "grad_norm": 4.749100685119629, + "learning_rate": 2.9515260323159785e-06, + "loss": 9.6834, + "step": 1375 + }, + { + "epoch": 0.2513464991023339, + "grad_norm": 7.908215045928955, + "learning_rate": 3.0053859964093355e-06, + "loss": 9.655, + "step": 1400 + }, + { + "epoch": 0.25583482944344704, + "grad_norm": 5.412723541259766, + "learning_rate": 3.059245960502693e-06, + "loss": 9.6097, + "step": 1425 + }, + { + "epoch": 0.26032315978456017, + "grad_norm": 5.68965482711792, + "learning_rate": 3.1131059245960505e-06, + "loss": 9.5947, + "step": 1450 + }, + { + "epoch": 0.26481149012567323, + "grad_norm": 5.187753200531006, + "learning_rate": 3.1669658886894076e-06, + "loss": 9.6561, + "step": 1475 + }, + { + "epoch": 0.26929982046678635, + "grad_norm": 4.972493648529053, + "learning_rate": 3.220825852782765e-06, + "loss": 9.6303, + "step": 1500 + }, + { + "epoch": 0.2737881508078995, + "grad_norm": 5.680991172790527, + "learning_rate": 3.274685816876122e-06, + "loss": 9.5365, + "step": 1525 + }, + { + "epoch": 0.27827648114901254, + "grad_norm": 8.533623695373535, + "learning_rate": 3.3285457809694796e-06, + "loss": 9.5576, + "step": 1550 + }, + { + "epoch": 0.28276481149012567, + "grad_norm": 6.31321907043457, + "learning_rate": 3.3824057450628367e-06, + "loss": 9.5423, + "step": 1575 + }, + { + 
"epoch": 0.2872531418312388, + "grad_norm": 6.636347770690918, + "learning_rate": 3.4362657091561938e-06, + "loss": 9.5152, + "step": 1600 + }, + { + "epoch": 0.2917414721723519, + "grad_norm": 4.865635871887207, + "learning_rate": 3.4901256732495513e-06, + "loss": 9.496, + "step": 1625 + }, + { + "epoch": 0.296229802513465, + "grad_norm": 6.498619556427002, + "learning_rate": 3.5439856373429088e-06, + "loss": 9.5709, + "step": 1650 + }, + { + "epoch": 0.3007181328545781, + "grad_norm": 5.24558687210083, + "learning_rate": 3.597845601436266e-06, + "loss": 9.5312, + "step": 1675 + }, + { + "epoch": 0.3052064631956912, + "grad_norm": 5.5028157234191895, + "learning_rate": 3.6517055655296233e-06, + "loss": 9.4909, + "step": 1700 + }, + { + "epoch": 0.3096947935368043, + "grad_norm": 6.085582733154297, + "learning_rate": 3.7055655296229804e-06, + "loss": 9.5357, + "step": 1725 + }, + { + "epoch": 0.3141831238779174, + "grad_norm": 5.161301136016846, + "learning_rate": 3.759425493716337e-06, + "loss": 9.5185, + "step": 1750 + }, + { + "epoch": 0.31867145421903054, + "grad_norm": 5.283674240112305, + "learning_rate": 3.813285457809695e-06, + "loss": 9.4526, + "step": 1775 + }, + { + "epoch": 0.3231597845601436, + "grad_norm": 5.306647300720215, + "learning_rate": 3.8671454219030524e-06, + "loss": 9.4259, + "step": 1800 + }, + { + "epoch": 0.3276481149012567, + "grad_norm": 5.557463645935059, + "learning_rate": 3.9210053859964095e-06, + "loss": 9.4844, + "step": 1825 + }, + { + "epoch": 0.33213644524236985, + "grad_norm": 6.96926212310791, + "learning_rate": 3.974865350089767e-06, + "loss": 9.4973, + "step": 1850 + }, + { + "epoch": 0.33662477558348297, + "grad_norm": 4.85004997253418, + "learning_rate": 4.028725314183124e-06, + "loss": 9.3868, + "step": 1875 + }, + { + "epoch": 0.34111310592459604, + "grad_norm": 5.676880359649658, + "learning_rate": 4.0825852782764816e-06, + "loss": 9.4226, + "step": 1900 + }, + { + "epoch": 0.34560143626570916, + "grad_norm": 
5.280898094177246, + "learning_rate": 4.136445242369838e-06, + "loss": 9.4263, + "step": 1925 + }, + { + "epoch": 0.3500897666068223, + "grad_norm": 5.523981094360352, + "learning_rate": 4.190305206463196e-06, + "loss": 9.4641, + "step": 1950 + }, + { + "epoch": 0.35457809694793535, + "grad_norm": 4.899006366729736, + "learning_rate": 4.244165170556554e-06, + "loss": 9.4224, + "step": 1975 + }, + { + "epoch": 0.3590664272890485, + "grad_norm": 5.647926330566406, + "learning_rate": 4.298025134649911e-06, + "loss": 9.3633, + "step": 2000 + }, + { + "epoch": 0.3635547576301616, + "grad_norm": 4.822826862335205, + "learning_rate": 4.351885098743268e-06, + "loss": 9.4193, + "step": 2025 + }, + { + "epoch": 0.36804308797127466, + "grad_norm": 5.190840244293213, + "learning_rate": 4.405745062836625e-06, + "loss": 9.3663, + "step": 2050 + }, + { + "epoch": 0.3725314183123878, + "grad_norm": 5.863297462463379, + "learning_rate": 4.459605026929983e-06, + "loss": 9.3967, + "step": 2075 + }, + { + "epoch": 0.3770197486535009, + "grad_norm": 6.929235458374023, + "learning_rate": 4.513464991023339e-06, + "loss": 9.3437, + "step": 2100 + }, + { + "epoch": 0.38150807899461403, + "grad_norm": 4.901843547821045, + "learning_rate": 4.567324955116697e-06, + "loss": 9.3151, + "step": 2125 + }, + { + "epoch": 0.3859964093357271, + "grad_norm": 5.576445579528809, + "learning_rate": 4.621184919210054e-06, + "loss": 9.3401, + "step": 2150 + }, + { + "epoch": 0.3904847396768402, + "grad_norm": 5.403802871704102, + "learning_rate": 4.675044883303411e-06, + "loss": 9.4139, + "step": 2175 + }, + { + "epoch": 0.39497307001795334, + "grad_norm": 5.211789131164551, + "learning_rate": 4.728904847396769e-06, + "loss": 9.3918, + "step": 2200 + }, + { + "epoch": 0.3994614003590664, + "grad_norm": 6.7926201820373535, + "learning_rate": 4.782764811490126e-06, + "loss": 9.3569, + "step": 2225 + }, + { + "epoch": 0.40394973070017953, + "grad_norm": 5.316205024719238, + "learning_rate": 
4.836624775583483e-06, + "loss": 9.3826, + "step": 2250 + }, + { + "epoch": 0.40843806104129265, + "grad_norm": 5.608170986175537, + "learning_rate": 4.89048473967684e-06, + "loss": 9.3437, + "step": 2275 + }, + { + "epoch": 0.4129263913824057, + "grad_norm": 4.827035903930664, + "learning_rate": 4.944344703770198e-06, + "loss": 9.3805, + "step": 2300 + }, + { + "epoch": 0.41741472172351884, + "grad_norm": 5.169203758239746, + "learning_rate": 4.998204667863554e-06, + "loss": 9.3778, + "step": 2325 + }, + { + "epoch": 0.42190305206463197, + "grad_norm": 5.231698036193848, + "learning_rate": 5.052064631956912e-06, + "loss": 9.2853, + "step": 2350 + }, + { + "epoch": 0.4263913824057451, + "grad_norm": 4.834906578063965, + "learning_rate": 5.10592459605027e-06, + "loss": 9.228, + "step": 2375 + }, + { + "epoch": 0.43087971274685816, + "grad_norm": 5.029504299163818, + "learning_rate": 5.159784560143626e-06, + "loss": 9.3068, + "step": 2400 + }, + { + "epoch": 0.4353680430879713, + "grad_norm": 5.6493048667907715, + "learning_rate": 5.213644524236984e-06, + "loss": 9.3386, + "step": 2425 + }, + { + "epoch": 0.4398563734290844, + "grad_norm": 5.567768096923828, + "learning_rate": 5.267504488330341e-06, + "loss": 9.339, + "step": 2450 + }, + { + "epoch": 0.44434470377019747, + "grad_norm": 5.512776851654053, + "learning_rate": 5.321364452423698e-06, + "loss": 9.2707, + "step": 2475 + }, + { + "epoch": 0.4488330341113106, + "grad_norm": 7.289466857910156, + "learning_rate": 5.3752244165170554e-06, + "loss": 9.2405, + "step": 2500 + }, + { + "epoch": 0.4533213644524237, + "grad_norm": 5.111495018005371, + "learning_rate": 5.429084380610413e-06, + "loss": 9.2824, + "step": 2525 + }, + { + "epoch": 0.4578096947935368, + "grad_norm": 6.250053882598877, + "learning_rate": 5.48294434470377e-06, + "loss": 9.3201, + "step": 2550 + }, + { + "epoch": 0.4622980251346499, + "grad_norm": 11.291319847106934, + "learning_rate": 5.5368043087971275e-06, + "loss": 9.2545, + "step": 2575 + 
}, + { + "epoch": 0.466786355475763, + "grad_norm": 6.036151885986328, + "learning_rate": 5.590664272890485e-06, + "loss": 9.2776, + "step": 2600 + }, + { + "epoch": 0.47127468581687615, + "grad_norm": 4.9552507400512695, + "learning_rate": 5.6445242369838425e-06, + "loss": 9.1856, + "step": 2625 + }, + { + "epoch": 0.4757630161579892, + "grad_norm": 4.9221367835998535, + "learning_rate": 5.6983842010771996e-06, + "loss": 9.1627, + "step": 2650 + }, + { + "epoch": 0.48025134649910234, + "grad_norm": 5.913104057312012, + "learning_rate": 5.752244165170557e-06, + "loss": 9.2922, + "step": 2675 + }, + { + "epoch": 0.48473967684021546, + "grad_norm": 4.915811061859131, + "learning_rate": 5.8061041292639145e-06, + "loss": 9.1782, + "step": 2700 + }, + { + "epoch": 0.48922800718132853, + "grad_norm": 4.957953929901123, + "learning_rate": 5.859964093357271e-06, + "loss": 9.2251, + "step": 2725 + }, + { + "epoch": 0.49371633752244165, + "grad_norm": 5.082302093505859, + "learning_rate": 5.913824057450629e-06, + "loss": 9.2666, + "step": 2750 + }, + { + "epoch": 0.4982046678635548, + "grad_norm": 8.70784854888916, + "learning_rate": 5.967684021543986e-06, + "loss": 9.2372, + "step": 2775 + }, + { + "epoch": 0.5026929982046678, + "grad_norm": 5.46500301361084, + "learning_rate": 6.021543985637343e-06, + "loss": 9.2752, + "step": 2800 + }, + { + "epoch": 0.507181328545781, + "grad_norm": 5.634939193725586, + "learning_rate": 6.075403949730701e-06, + "loss": 9.2029, + "step": 2825 + }, + { + "epoch": 0.5116696588868941, + "grad_norm": 4.728097915649414, + "learning_rate": 6.129263913824058e-06, + "loss": 9.2322, + "step": 2850 + }, + { + "epoch": 0.5161579892280072, + "grad_norm": 4.899399280548096, + "learning_rate": 6.183123877917415e-06, + "loss": 9.2782, + "step": 2875 + }, + { + "epoch": 0.5206463195691203, + "grad_norm": 5.116130352020264, + "learning_rate": 6.236983842010772e-06, + "loss": 9.1668, + "step": 2900 + }, + { + "epoch": 0.5251346499102334, + "grad_norm": 
5.575163841247559, + "learning_rate": 6.29084380610413e-06, + "loss": 9.1988, + "step": 2925 + }, + { + "epoch": 0.5296229802513465, + "grad_norm": 6.79478645324707, + "learning_rate": 6.344703770197486e-06, + "loss": 9.2474, + "step": 2950 + }, + { + "epoch": 0.5341113105924596, + "grad_norm": 7.063352584838867, + "learning_rate": 6.398563734290844e-06, + "loss": 9.1876, + "step": 2975 + }, + { + "epoch": 0.5385996409335727, + "grad_norm": 17.845504760742188, + "learning_rate": 6.452423698384202e-06, + "loss": 9.173, + "step": 3000 + }, + { + "epoch": 0.5430879712746858, + "grad_norm": 9.871821403503418, + "learning_rate": 6.506283662477558e-06, + "loss": 9.206, + "step": 3025 + }, + { + "epoch": 0.547576301615799, + "grad_norm": 6.34397554397583, + "learning_rate": 6.560143626570916e-06, + "loss": 9.0855, + "step": 3050 + }, + { + "epoch": 0.552064631956912, + "grad_norm": 5.515399932861328, + "learning_rate": 6.614003590664273e-06, + "loss": 9.1977, + "step": 3075 + }, + { + "epoch": 0.5565529622980251, + "grad_norm": 6.850831031799316, + "learning_rate": 6.66786355475763e-06, + "loss": 9.2921, + "step": 3100 + }, + { + "epoch": 0.5610412926391383, + "grad_norm": 5.767841815948486, + "learning_rate": 6.721723518850987e-06, + "loss": 9.2317, + "step": 3125 + }, + { + "epoch": 0.5655296229802513, + "grad_norm": 4.840388298034668, + "learning_rate": 6.775583482944345e-06, + "loss": 9.1582, + "step": 3150 + }, + { + "epoch": 0.5700179533213644, + "grad_norm": 5.063223838806152, + "learning_rate": 6.829443447037701e-06, + "loss": 9.2884, + "step": 3175 + }, + { + "epoch": 0.5745062836624776, + "grad_norm": 5.298039436340332, + "learning_rate": 6.883303411131059e-06, + "loss": 9.1654, + "step": 3200 + }, + { + "epoch": 0.5789946140035906, + "grad_norm": 5.333845615386963, + "learning_rate": 6.937163375224417e-06, + "loss": 9.1947, + "step": 3225 + }, + { + "epoch": 0.5834829443447038, + "grad_norm": 5.016245365142822, + "learning_rate": 6.991023339317774e-06, + 
"loss": 9.1131, + "step": 3250 + }, + { + "epoch": 0.5879712746858169, + "grad_norm": 5.7573652267456055, + "learning_rate": 7.044883303411131e-06, + "loss": 9.1683, + "step": 3275 + }, + { + "epoch": 0.59245960502693, + "grad_norm": 5.244170665740967, + "learning_rate": 7.098743267504488e-06, + "loss": 9.2027, + "step": 3300 + }, + { + "epoch": 0.5969479353680431, + "grad_norm": 7.219206809997559, + "learning_rate": 7.152603231597846e-06, + "loss": 9.1333, + "step": 3325 + }, + { + "epoch": 0.6014362657091562, + "grad_norm": 4.885063171386719, + "learning_rate": 7.2064631956912026e-06, + "loss": 9.199, + "step": 3350 + }, + { + "epoch": 0.6059245960502693, + "grad_norm": 5.34074068069458, + "learning_rate": 7.2603231597845605e-06, + "loss": 9.1456, + "step": 3375 + }, + { + "epoch": 0.6104129263913824, + "grad_norm": 6.495256423950195, + "learning_rate": 7.3141831238779175e-06, + "loss": 9.2024, + "step": 3400 + }, + { + "epoch": 0.6149012567324955, + "grad_norm": 7.452166557312012, + "learning_rate": 7.368043087971275e-06, + "loss": 9.1815, + "step": 3425 + }, + { + "epoch": 0.6193895870736086, + "grad_norm": 7.50568151473999, + "learning_rate": 7.4219030520646325e-06, + "loss": 9.1165, + "step": 3450 + }, + { + "epoch": 0.6238779174147218, + "grad_norm": 6.768421173095703, + "learning_rate": 7.47576301615799e-06, + "loss": 9.2076, + "step": 3475 + }, + { + "epoch": 0.6283662477558348, + "grad_norm": 6.257649898529053, + "learning_rate": 7.529622980251346e-06, + "loss": 9.1711, + "step": 3500 + }, + { + "epoch": 0.6328545780969479, + "grad_norm": 7.814693450927734, + "learning_rate": 7.583482944344705e-06, + "loss": 9.1988, + "step": 3525 + }, + { + "epoch": 0.6373429084380611, + "grad_norm": 9.341326713562012, + "learning_rate": 7.63734290843806e-06, + "loss": 9.0812, + "step": 3550 + }, + { + "epoch": 0.6418312387791741, + "grad_norm": 6.590101718902588, + "learning_rate": 7.691202872531418e-06, + "loss": 9.1043, + "step": 3575 + }, + { + "epoch": 
0.6463195691202872, + "grad_norm": 6.020159721374512, + "learning_rate": 7.745062836624775e-06, + "loss": 9.1742, + "step": 3600 + }, + { + "epoch": 0.6508078994614004, + "grad_norm": 6.136831283569336, + "learning_rate": 7.798922800718134e-06, + "loss": 9.1653, + "step": 3625 + }, + { + "epoch": 0.6552962298025135, + "grad_norm": 5.3558807373046875, + "learning_rate": 7.85278276481149e-06, + "loss": 9.1127, + "step": 3650 + }, + { + "epoch": 0.6597845601436265, + "grad_norm": 6.553163528442383, + "learning_rate": 7.906642728904848e-06, + "loss": 9.0378, + "step": 3675 + }, + { + "epoch": 0.6642728904847397, + "grad_norm": 7.745103359222412, + "learning_rate": 7.960502692998205e-06, + "loss": 9.1001, + "step": 3700 + }, + { + "epoch": 0.6687612208258528, + "grad_norm": 7.2429022789001465, + "learning_rate": 8.014362657091562e-06, + "loss": 9.1692, + "step": 3725 + }, + { + "epoch": 0.6732495511669659, + "grad_norm": 5.889107704162598, + "learning_rate": 8.068222621184919e-06, + "loss": 9.0403, + "step": 3750 + }, + { + "epoch": 0.677737881508079, + "grad_norm": 8.238465309143066, + "learning_rate": 8.122082585278276e-06, + "loss": 9.0454, + "step": 3775 + }, + { + "epoch": 0.6822262118491921, + "grad_norm": 12.357108116149902, + "learning_rate": 8.175942549371635e-06, + "loss": 9.1084, + "step": 3800 + }, + { + "epoch": 0.6867145421903053, + "grad_norm": 6.177108287811279, + "learning_rate": 8.229802513464992e-06, + "loss": 9.0282, + "step": 3825 + }, + { + "epoch": 0.6912028725314183, + "grad_norm": 5.638473987579346, + "learning_rate": 8.283662477558347e-06, + "loss": 9.2204, + "step": 3850 + }, + { + "epoch": 0.6956912028725314, + "grad_norm": 7.522382736206055, + "learning_rate": 8.337522441651706e-06, + "loss": 9.0377, + "step": 3875 + }, + { + "epoch": 0.7001795332136446, + "grad_norm": 5.90704870223999, + "learning_rate": 8.391382405745063e-06, + "loss": 9.0698, + "step": 3900 + }, + { + "epoch": 0.7046678635547576, + "grad_norm": 4.854036331176758, + 
"learning_rate": 8.44524236983842e-06, + "loss": 9.0769, + "step": 3925 + }, + { + "epoch": 0.7091561938958707, + "grad_norm": 7.36228084564209, + "learning_rate": 8.499102333931777e-06, + "loss": 9.2061, + "step": 3950 + }, + { + "epoch": 0.7136445242369839, + "grad_norm": 6.164623260498047, + "learning_rate": 8.552962298025136e-06, + "loss": 9.0419, + "step": 3975 + }, + { + "epoch": 0.718132854578097, + "grad_norm": 5.661802291870117, + "learning_rate": 8.606822262118491e-06, + "loss": 9.1817, + "step": 4000 + }, + { + "epoch": 0.72262118491921, + "grad_norm": 5.785642623901367, + "learning_rate": 8.660682226211848e-06, + "loss": 9.1036, + "step": 4025 + }, + { + "epoch": 0.7271095152603232, + "grad_norm": 5.6611552238464355, + "learning_rate": 8.714542190305207e-06, + "loss": 8.9981, + "step": 4050 + }, + { + "epoch": 0.7315978456014363, + "grad_norm": 6.679709434509277, + "learning_rate": 8.768402154398564e-06, + "loss": 9.0542, + "step": 4075 + }, + { + "epoch": 0.7360861759425493, + "grad_norm": 5.2933735847473145, + "learning_rate": 8.822262118491921e-06, + "loss": 9.0839, + "step": 4100 + }, + { + "epoch": 0.7405745062836625, + "grad_norm": 5.121194839477539, + "learning_rate": 8.876122082585278e-06, + "loss": 9.1479, + "step": 4125 + }, + { + "epoch": 0.7450628366247756, + "grad_norm": 6.635162830352783, + "learning_rate": 8.929982046678637e-06, + "loss": 8.9457, + "step": 4150 + }, + { + "epoch": 0.7495511669658886, + "grad_norm": 7.119969844818115, + "learning_rate": 8.983842010771993e-06, + "loss": 9.143, + "step": 4175 + }, + { + "epoch": 0.7540394973070018, + "grad_norm": 5.47069787979126, + "learning_rate": 9.03770197486535e-06, + "loss": 9.0674, + "step": 4200 + }, + { + "epoch": 0.7585278276481149, + "grad_norm": 5.721898078918457, + "learning_rate": 9.091561938958707e-06, + "loss": 9.0176, + "step": 4225 + }, + { + "epoch": 0.7630161579892281, + "grad_norm": 6.213694095611572, + "learning_rate": 9.145421903052065e-06, + "loss": 9.0494, + "step": 
4250 + }, + { + "epoch": 0.7675044883303411, + "grad_norm": 5.092076778411865, + "learning_rate": 9.199281867145423e-06, + "loss": 9.1326, + "step": 4275 + }, + { + "epoch": 0.7719928186714542, + "grad_norm": 6.115865707397461, + "learning_rate": 9.25314183123878e-06, + "loss": 9.0087, + "step": 4300 + }, + { + "epoch": 0.7764811490125674, + "grad_norm": 5.818943977355957, + "learning_rate": 9.307001795332137e-06, + "loss": 9.1292, + "step": 4325 + }, + { + "epoch": 0.7809694793536804, + "grad_norm": 8.29226016998291, + "learning_rate": 9.360861759425494e-06, + "loss": 9.136, + "step": 4350 + }, + { + "epoch": 0.7854578096947935, + "grad_norm": 5.897540092468262, + "learning_rate": 9.41472172351885e-06, + "loss": 9.0613, + "step": 4375 + }, + { + "epoch": 0.7899461400359067, + "grad_norm": 6.145982265472412, + "learning_rate": 9.468581687612208e-06, + "loss": 9.1038, + "step": 4400 + }, + { + "epoch": 0.7944344703770198, + "grad_norm": 5.322671413421631, + "learning_rate": 9.522441651705567e-06, + "loss": 8.9829, + "step": 4425 + }, + { + "epoch": 0.7989228007181328, + "grad_norm": 6.139911651611328, + "learning_rate": 9.576301615798924e-06, + "loss": 9.0408, + "step": 4450 + }, + { + "epoch": 0.803411131059246, + "grad_norm": 9.857640266418457, + "learning_rate": 9.630161579892279e-06, + "loss": 9.0408, + "step": 4475 + }, + { + "epoch": 0.8078994614003591, + "grad_norm": 5.802546501159668, + "learning_rate": 9.684021543985638e-06, + "loss": 9.05, + "step": 4500 + }, + { + "epoch": 0.8123877917414721, + "grad_norm": 7.251793384552002, + "learning_rate": 9.737881508078995e-06, + "loss": 8.9755, + "step": 4525 + }, + { + "epoch": 0.8168761220825853, + "grad_norm": 5.924556255340576, + "learning_rate": 9.791741472172352e-06, + "loss": 8.9304, + "step": 4550 + }, + { + "epoch": 0.8213644524236984, + "grad_norm": 6.135114669799805, + "learning_rate": 9.845601436265709e-06, + "loss": 8.9986, + "step": 4575 + }, + { + "epoch": 0.8258527827648114, + "grad_norm": 
7.951456546783447, + "learning_rate": 9.899461400359068e-06, + "loss": 9.0149, + "step": 4600 + }, + { + "epoch": 0.8303411131059246, + "grad_norm": 5.310734748840332, + "learning_rate": 9.953321364452423e-06, + "loss": 9.0799, + "step": 4625 + }, + { + "epoch": 0.8348294434470377, + "grad_norm": 6.2131805419921875, + "learning_rate": 1.000718132854578e-05, + "loss": 9.0994, + "step": 4650 + }, + { + "epoch": 0.8393177737881508, + "grad_norm": 7.032895088195801, + "learning_rate": 1.0061041292639139e-05, + "loss": 9.0215, + "step": 4675 + }, + { + "epoch": 0.8438061041292639, + "grad_norm": 6.09020471572876, + "learning_rate": 1.0114901256732496e-05, + "loss": 9.0873, + "step": 4700 + }, + { + "epoch": 0.848294434470377, + "grad_norm": 6.73218297958374, + "learning_rate": 1.0168761220825853e-05, + "loss": 8.997, + "step": 4725 + }, + { + "epoch": 0.8527827648114902, + "grad_norm": 7.0032548904418945, + "learning_rate": 1.022262118491921e-05, + "loss": 8.9825, + "step": 4750 + }, + { + "epoch": 0.8572710951526032, + "grad_norm": 7.785491943359375, + "learning_rate": 1.0276481149012569e-05, + "loss": 8.9276, + "step": 4775 + }, + { + "epoch": 0.8617594254937163, + "grad_norm": 6.526766777038574, + "learning_rate": 1.0330341113105924e-05, + "loss": 8.9381, + "step": 4800 + }, + { + "epoch": 0.8662477558348295, + "grad_norm": 5.6302642822265625, + "learning_rate": 1.0384201077199281e-05, + "loss": 8.9587, + "step": 4825 + }, + { + "epoch": 0.8707360861759426, + "grad_norm": 5.0296831130981445, + "learning_rate": 1.0438061041292639e-05, + "loss": 8.9915, + "step": 4850 + }, + { + "epoch": 0.8752244165170556, + "grad_norm": 5.453030109405518, + "learning_rate": 1.0491921005385997e-05, + "loss": 9.0505, + "step": 4875 + }, + { + "epoch": 0.8797127468581688, + "grad_norm": 5.590126991271973, + "learning_rate": 1.0545780969479354e-05, + "loss": 8.9623, + "step": 4900 + }, + { + "epoch": 0.8842010771992819, + "grad_norm": 6.027144432067871, + "learning_rate": 
1.0599640933572711e-05, + "loss": 9.0603, + "step": 4925 + }, + { + "epoch": 0.8886894075403949, + "grad_norm": 5.7679972648620605, + "learning_rate": 1.0653500897666069e-05, + "loss": 9.115, + "step": 4950 + }, + { + "epoch": 0.8931777378815081, + "grad_norm": 7.443205833435059, + "learning_rate": 1.0707360861759426e-05, + "loss": 8.9432, + "step": 4975 + }, + { + "epoch": 0.8976660682226212, + "grad_norm": 5.989015579223633, + "learning_rate": 1.0761220825852783e-05, + "loss": 8.9212, + "step": 5000 + }, + { + "epoch": 0.9021543985637342, + "grad_norm": 5.816547870635986, + "learning_rate": 1.081508078994614e-05, + "loss": 8.9928, + "step": 5025 + }, + { + "epoch": 0.9066427289048474, + "grad_norm": 5.60319185256958, + "learning_rate": 1.0868940754039498e-05, + "loss": 8.9941, + "step": 5050 + }, + { + "epoch": 0.9111310592459605, + "grad_norm": 6.1328229904174805, + "learning_rate": 1.0922800718132856e-05, + "loss": 9.0048, + "step": 5075 + }, + { + "epoch": 0.9156193895870736, + "grad_norm": 6.133834362030029, + "learning_rate": 1.0976660682226211e-05, + "loss": 8.9394, + "step": 5100 + }, + { + "epoch": 0.9201077199281867, + "grad_norm": 5.573734283447266, + "learning_rate": 1.103052064631957e-05, + "loss": 9.0544, + "step": 5125 + }, + { + "epoch": 0.9245960502692998, + "grad_norm": 5.48718786239624, + "learning_rate": 1.1084380610412927e-05, + "loss": 8.9581, + "step": 5150 + }, + { + "epoch": 0.9290843806104129, + "grad_norm": 6.680850028991699, + "learning_rate": 1.1138240574506284e-05, + "loss": 9.0004, + "step": 5175 + }, + { + "epoch": 0.933572710951526, + "grad_norm": 8.731657028198242, + "learning_rate": 1.1192100538599641e-05, + "loss": 9.0171, + "step": 5200 + }, + { + "epoch": 0.9380610412926391, + "grad_norm": 5.327418804168701, + "learning_rate": 1.1245960502693e-05, + "loss": 8.9562, + "step": 5225 + }, + { + "epoch": 0.9425493716337523, + "grad_norm": 5.400293350219727, + "learning_rate": 1.1299820466786355e-05, + "loss": 9.0202, + "step": 5250 
+ }, + { + "epoch": 0.9470377019748654, + "grad_norm": 5.484558582305908, + "learning_rate": 1.1353680430879712e-05, + "loss": 8.9291, + "step": 5275 + }, + { + "epoch": 0.9515260323159784, + "grad_norm": 5.1920294761657715, + "learning_rate": 1.1407540394973071e-05, + "loss": 8.9784, + "step": 5300 + }, + { + "epoch": 0.9560143626570916, + "grad_norm": 5.59995174407959, + "learning_rate": 1.1461400359066428e-05, + "loss": 9.0121, + "step": 5325 + }, + { + "epoch": 0.9605026929982047, + "grad_norm": 5.347548007965088, + "learning_rate": 1.1515260323159785e-05, + "loss": 8.9646, + "step": 5350 + }, + { + "epoch": 0.9649910233393177, + "grad_norm": 5.430029392242432, + "learning_rate": 1.1569120287253142e-05, + "loss": 9.0215, + "step": 5375 + }, + { + "epoch": 0.9694793536804309, + "grad_norm": 6.453085422515869, + "learning_rate": 1.16229802513465e-05, + "loss": 9.0384, + "step": 5400 + }, + { + "epoch": 0.973967684021544, + "grad_norm": 5.106292724609375, + "learning_rate": 1.1676840215439856e-05, + "loss": 8.9682, + "step": 5425 + }, + { + "epoch": 0.9784560143626571, + "grad_norm": 5.865028381347656, + "learning_rate": 1.1730700179533213e-05, + "loss": 8.8913, + "step": 5450 + }, + { + "epoch": 0.9829443447037702, + "grad_norm": 7.266361236572266, + "learning_rate": 1.178456014362657e-05, + "loss": 8.9914, + "step": 5475 + }, + { + "epoch": 0.9874326750448833, + "grad_norm": 6.62119722366333, + "learning_rate": 1.1838420107719929e-05, + "loss": 8.9235, + "step": 5500 + }, + { + "epoch": 0.9919210053859964, + "grad_norm": 9.024166107177734, + "learning_rate": 1.1892280071813286e-05, + "loss": 9.0954, + "step": 5525 + }, + { + "epoch": 0.9964093357271095, + "grad_norm": 5.261739730834961, + "learning_rate": 1.1946140035906643e-05, + "loss": 8.961, + "step": 5550 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.006740114180971111, + "eval_f1_macro": 5.494608944268424e-06, + "eval_f1_micro": 0.006740114180971111, + "eval_f1_weighted": 0.00036222523086144984, + 
"eval_loss": 9.127163887023926, + "eval_precision_macro": 3.2636523448505627e-06, + "eval_precision_micro": 0.006740114180971111, + "eval_precision_weighted": 0.00021126671424404842, + "eval_recall_macro": 0.0001209152071357735, + "eval_recall_micro": 0.006740114180971111, + "eval_recall_weighted": 0.006740114180971111, + "eval_runtime": 149.89, + "eval_samples_per_second": 349.41, + "eval_steps_per_second": 10.921, + "step": 5570 + }, + { + "epoch": 1.0008976660682227, + "grad_norm": 5.560895919799805, + "learning_rate": 1.2e-05, + "loss": 8.9204, + "step": 5575 + }, + { + "epoch": 1.0053859964093357, + "grad_norm": 5.912028789520264, + "learning_rate": 1.2053859964093357e-05, + "loss": 8.8608, + "step": 5600 + }, + { + "epoch": 1.0098743267504489, + "grad_norm": 7.407133102416992, + "learning_rate": 1.2107719928186714e-05, + "loss": 8.7225, + "step": 5625 + }, + { + "epoch": 1.014362657091562, + "grad_norm": 5.227248191833496, + "learning_rate": 1.2161579892280072e-05, + "loss": 8.7434, + "step": 5650 + }, + { + "epoch": 1.018850987432675, + "grad_norm": 7.881109237670898, + "learning_rate": 1.221543985637343e-05, + "loss": 8.7668, + "step": 5675 + }, + { + "epoch": 1.0233393177737882, + "grad_norm": 7.92639684677124, + "learning_rate": 1.2269299820466787e-05, + "loss": 8.871, + "step": 5700 + }, + { + "epoch": 1.0278276481149013, + "grad_norm": 5.976809024810791, + "learning_rate": 1.2323159784560143e-05, + "loss": 8.772, + "step": 5725 + }, + { + "epoch": 1.0323159784560143, + "grad_norm": 6.980086803436279, + "learning_rate": 1.2377019748653501e-05, + "loss": 8.7755, + "step": 5750 + }, + { + "epoch": 1.0368043087971275, + "grad_norm": 5.1423869132995605, + "learning_rate": 1.2430879712746859e-05, + "loss": 8.7837, + "step": 5775 + }, + { + "epoch": 1.0412926391382407, + "grad_norm": 5.610518455505371, + "learning_rate": 1.2484739676840216e-05, + "loss": 8.8491, + "step": 5800 + }, + { + "epoch": 1.0457809694793536, + "grad_norm": 4.997601509094238, + 
"learning_rate": 1.2538599640933573e-05, + "loss": 8.8398, + "step": 5825 + }, + { + "epoch": 1.0502692998204668, + "grad_norm": 5.619950294494629, + "learning_rate": 1.2592459605026931e-05, + "loss": 8.8773, + "step": 5850 + }, + { + "epoch": 1.05475763016158, + "grad_norm": 5.71800422668457, + "learning_rate": 1.2646319569120287e-05, + "loss": 8.7983, + "step": 5875 + }, + { + "epoch": 1.059245960502693, + "grad_norm": 5.759223461151123, + "learning_rate": 1.2700179533213644e-05, + "loss": 8.8182, + "step": 5900 + }, + { + "epoch": 1.063734290843806, + "grad_norm": 5.644667625427246, + "learning_rate": 1.2754039497307003e-05, + "loss": 8.8033, + "step": 5925 + }, + { + "epoch": 1.0682226211849193, + "grad_norm": 6.620733261108398, + "learning_rate": 1.280789946140036e-05, + "loss": 8.8091, + "step": 5950 + }, + { + "epoch": 1.0727109515260322, + "grad_norm": 6.606126308441162, + "learning_rate": 1.2861759425493717e-05, + "loss": 8.8842, + "step": 5975 + }, + { + "epoch": 1.0771992818671454, + "grad_norm": 4.911701202392578, + "learning_rate": 1.2915619389587074e-05, + "loss": 8.7419, + "step": 6000 + }, + { + "epoch": 1.0816876122082586, + "grad_norm": 9.928366661071777, + "learning_rate": 1.2969479353680433e-05, + "loss": 8.8697, + "step": 6025 + }, + { + "epoch": 1.0861759425493716, + "grad_norm": 5.749391555786133, + "learning_rate": 1.3023339317773788e-05, + "loss": 8.8789, + "step": 6050 + }, + { + "epoch": 1.0906642728904847, + "grad_norm": 5.46829080581665, + "learning_rate": 1.3077199281867145e-05, + "loss": 8.7415, + "step": 6075 + }, + { + "epoch": 1.095152603231598, + "grad_norm": 6.031121730804443, + "learning_rate": 1.3131059245960502e-05, + "loss": 8.8834, + "step": 6100 + }, + { + "epoch": 1.0996409335727109, + "grad_norm": 5.246577262878418, + "learning_rate": 1.3184919210053861e-05, + "loss": 8.7461, + "step": 6125 + }, + { + "epoch": 1.104129263913824, + "grad_norm": 8.526304244995117, + "learning_rate": 1.3238779174147218e-05, + "loss": 8.7624, 
+ "step": 6150 + }, + { + "epoch": 1.1086175942549372, + "grad_norm": 5.866250514984131, + "learning_rate": 1.3292639138240575e-05, + "loss": 8.8278, + "step": 6175 + }, + { + "epoch": 1.1131059245960502, + "grad_norm": 5.703503131866455, + "learning_rate": 1.3346499102333932e-05, + "loss": 8.7538, + "step": 6200 + }, + { + "epoch": 1.1175942549371634, + "grad_norm": 6.009499549865723, + "learning_rate": 1.340035906642729e-05, + "loss": 8.7184, + "step": 6225 + }, + { + "epoch": 1.1220825852782765, + "grad_norm": 5.72658109664917, + "learning_rate": 1.3454219030520646e-05, + "loss": 8.7021, + "step": 6250 + }, + { + "epoch": 1.1265709156193895, + "grad_norm": 5.449494361877441, + "learning_rate": 1.3508078994614003e-05, + "loss": 8.8232, + "step": 6275 + }, + { + "epoch": 1.1310592459605027, + "grad_norm": 5.728530406951904, + "learning_rate": 1.3561938958707362e-05, + "loss": 8.8601, + "step": 6300 + }, + { + "epoch": 1.1355475763016158, + "grad_norm": 8.715628623962402, + "learning_rate": 1.3615798922800719e-05, + "loss": 8.7664, + "step": 6325 + }, + { + "epoch": 1.140035906642729, + "grad_norm": 6.760737419128418, + "learning_rate": 1.3669658886894075e-05, + "loss": 8.6755, + "step": 6350 + }, + { + "epoch": 1.144524236983842, + "grad_norm": 5.671557426452637, + "learning_rate": 1.3723518850987433e-05, + "loss": 8.7981, + "step": 6375 + }, + { + "epoch": 1.1490125673249552, + "grad_norm": 5.734576225280762, + "learning_rate": 1.377737881508079e-05, + "loss": 8.8138, + "step": 6400 + }, + { + "epoch": 1.1535008976660683, + "grad_norm": 6.54246187210083, + "learning_rate": 1.3831238779174147e-05, + "loss": 8.713, + "step": 6425 + }, + { + "epoch": 1.1579892280071813, + "grad_norm": 5.260203838348389, + "learning_rate": 1.3885098743267504e-05, + "loss": 8.7611, + "step": 6450 + }, + { + "epoch": 1.1624775583482945, + "grad_norm": 5.610374927520752, + "learning_rate": 1.3938958707360863e-05, + "loss": 8.7779, + "step": 6475 + }, + { + "epoch": 1.1669658886894076, + 
"grad_norm": 5.227150917053223, + "learning_rate": 1.3992818671454219e-05, + "loss": 8.7327, + "step": 6500 + }, + { + "epoch": 1.1714542190305206, + "grad_norm": 6.22235107421875, + "learning_rate": 1.4046678635547576e-05, + "loss": 8.8199, + "step": 6525 + }, + { + "epoch": 1.1759425493716338, + "grad_norm": 7.3684611320495605, + "learning_rate": 1.4100538599640934e-05, + "loss": 8.7818, + "step": 6550 + }, + { + "epoch": 1.180430879712747, + "grad_norm": 5.522742748260498, + "learning_rate": 1.4154398563734292e-05, + "loss": 8.747, + "step": 6575 + }, + { + "epoch": 1.18491921005386, + "grad_norm": 5.632452011108398, + "learning_rate": 1.4208258527827649e-05, + "loss": 8.6808, + "step": 6600 + }, + { + "epoch": 1.189407540394973, + "grad_norm": 5.6344313621521, + "learning_rate": 1.4262118491921006e-05, + "loss": 8.7723, + "step": 6625 + }, + { + "epoch": 1.1938958707360863, + "grad_norm": 6.425195217132568, + "learning_rate": 1.4315978456014364e-05, + "loss": 8.813, + "step": 6650 + }, + { + "epoch": 1.1983842010771992, + "grad_norm": 6.16281270980835, + "learning_rate": 1.436983842010772e-05, + "loss": 8.6673, + "step": 6675 + }, + { + "epoch": 1.2028725314183124, + "grad_norm": 5.669048309326172, + "learning_rate": 1.4423698384201077e-05, + "loss": 8.7628, + "step": 6700 + }, + { + "epoch": 1.2073608617594256, + "grad_norm": 5.24860954284668, + "learning_rate": 1.4477558348294434e-05, + "loss": 8.6872, + "step": 6725 + }, + { + "epoch": 1.2118491921005385, + "grad_norm": 5.723819255828857, + "learning_rate": 1.4531418312387793e-05, + "loss": 8.6767, + "step": 6750 + }, + { + "epoch": 1.2163375224416517, + "grad_norm": 8.255654335021973, + "learning_rate": 1.458527827648115e-05, + "loss": 8.7455, + "step": 6775 + }, + { + "epoch": 1.220825852782765, + "grad_norm": 5.719984531402588, + "learning_rate": 1.4639138240574507e-05, + "loss": 8.5408, + "step": 6800 + }, + { + "epoch": 1.2253141831238779, + "grad_norm": 7.023374557495117, + "learning_rate": 
1.4692998204667864e-05, + "loss": 8.8253, + "step": 6825 + }, + { + "epoch": 1.229802513464991, + "grad_norm": 6.4480414390563965, + "learning_rate": 1.4746858168761221e-05, + "loss": 8.7439, + "step": 6850 + }, + { + "epoch": 1.2342908438061042, + "grad_norm": 7.664842128753662, + "learning_rate": 1.4800718132854578e-05, + "loss": 8.7889, + "step": 6875 + }, + { + "epoch": 1.2387791741472172, + "grad_norm": 5.892557621002197, + "learning_rate": 1.4854578096947935e-05, + "loss": 8.675, + "step": 6900 + }, + { + "epoch": 1.2432675044883303, + "grad_norm": 6.638364791870117, + "learning_rate": 1.4908438061041294e-05, + "loss": 8.7219, + "step": 6925 + }, + { + "epoch": 1.2477558348294435, + "grad_norm": 6.350098609924316, + "learning_rate": 1.4962298025134651e-05, + "loss": 8.6265, + "step": 6950 + }, + { + "epoch": 1.2522441651705565, + "grad_norm": 5.707592487335205, + "learning_rate": 1.5016157989228008e-05, + "loss": 8.6592, + "step": 6975 + }, + { + "epoch": 1.2567324955116697, + "grad_norm": 5.210084915161133, + "learning_rate": 1.5070017953321365e-05, + "loss": 8.7465, + "step": 7000 + }, + { + "epoch": 1.2612208258527828, + "grad_norm": 6.316098690032959, + "learning_rate": 1.5123877917414722e-05, + "loss": 8.7668, + "step": 7025 + }, + { + "epoch": 1.2657091561938958, + "grad_norm": 5.252174377441406, + "learning_rate": 1.517773788150808e-05, + "loss": 8.7025, + "step": 7050 + }, + { + "epoch": 1.270197486535009, + "grad_norm": 5.4305219650268555, + "learning_rate": 1.5231597845601436e-05, + "loss": 8.6962, + "step": 7075 + }, + { + "epoch": 1.2746858168761221, + "grad_norm": 5.3795857429504395, + "learning_rate": 1.5285457809694793e-05, + "loss": 8.8117, + "step": 7100 + }, + { + "epoch": 1.279174147217235, + "grad_norm": 8.07836627960205, + "learning_rate": 1.533931777378815e-05, + "loss": 8.5957, + "step": 7125 + }, + { + "epoch": 1.2836624775583483, + "grad_norm": 5.238666534423828, + "learning_rate": 1.539317773788151e-05, + "loss": 8.6251, + "step": 
7150 + }, + { + "epoch": 1.2881508078994615, + "grad_norm": 8.400931358337402, + "learning_rate": 1.5447037701974866e-05, + "loss": 8.7447, + "step": 7175 + }, + { + "epoch": 1.2926391382405744, + "grad_norm": 6.599817752838135, + "learning_rate": 1.5500897666068225e-05, + "loss": 8.6377, + "step": 7200 + }, + { + "epoch": 1.2971274685816876, + "grad_norm": 5.75445032119751, + "learning_rate": 1.555475763016158e-05, + "loss": 8.6035, + "step": 7225 + }, + { + "epoch": 1.3016157989228008, + "grad_norm": 5.208351135253906, + "learning_rate": 1.5608617594254936e-05, + "loss": 8.7603, + "step": 7250 + }, + { + "epoch": 1.3061041292639137, + "grad_norm": 6.172267913818359, + "learning_rate": 1.5662477558348295e-05, + "loss": 8.7149, + "step": 7275 + }, + { + "epoch": 1.310592459605027, + "grad_norm": 7.0656232833862305, + "learning_rate": 1.571633752244165e-05, + "loss": 8.7087, + "step": 7300 + }, + { + "epoch": 1.31508078994614, + "grad_norm": 6.945652484893799, + "learning_rate": 1.5770197486535012e-05, + "loss": 8.6763, + "step": 7325 + }, + { + "epoch": 1.319569120287253, + "grad_norm": 6.56538724899292, + "learning_rate": 1.5824057450628367e-05, + "loss": 8.8195, + "step": 7350 + }, + { + "epoch": 1.3240574506283662, + "grad_norm": 5.4737958908081055, + "learning_rate": 1.5877917414721723e-05, + "loss": 8.6832, + "step": 7375 + }, + { + "epoch": 1.3285457809694794, + "grad_norm": 5.514367580413818, + "learning_rate": 1.593177737881508e-05, + "loss": 8.6677, + "step": 7400 + }, + { + "epoch": 1.3330341113105924, + "grad_norm": 5.122795104980469, + "learning_rate": 1.5985637342908437e-05, + "loss": 8.7052, + "step": 7425 + }, + { + "epoch": 1.3375224416517055, + "grad_norm": 5.796945571899414, + "learning_rate": 1.6039497307001796e-05, + "loss": 8.6246, + "step": 7450 + }, + { + "epoch": 1.3420107719928187, + "grad_norm": 6.082705497741699, + "learning_rate": 1.609335727109515e-05, + "loss": 8.6243, + "step": 7475 + }, + { + "epoch": 1.3464991023339317, + 
"grad_norm": 6.689637184143066, + "learning_rate": 1.6147217235188513e-05, + "loss": 8.74, + "step": 7500 + }, + { + "epoch": 1.3509874326750448, + "grad_norm": 5.71528434753418, + "learning_rate": 1.620107719928187e-05, + "loss": 8.6462, + "step": 7525 + }, + { + "epoch": 1.355475763016158, + "grad_norm": 5.620816230773926, + "learning_rate": 1.6254937163375224e-05, + "loss": 8.6085, + "step": 7550 + }, + { + "epoch": 1.359964093357271, + "grad_norm": 5.5336833000183105, + "learning_rate": 1.6308797127468583e-05, + "loss": 8.5663, + "step": 7575 + }, + { + "epoch": 1.3644524236983842, + "grad_norm": 5.530635833740234, + "learning_rate": 1.6362657091561938e-05, + "loss": 8.5153, + "step": 7600 + }, + { + "epoch": 1.3689407540394973, + "grad_norm": 5.958146572113037, + "learning_rate": 1.6416517055655297e-05, + "loss": 8.6868, + "step": 7625 + }, + { + "epoch": 1.3734290843806103, + "grad_norm": 5.505101680755615, + "learning_rate": 1.6470377019748652e-05, + "loss": 8.6629, + "step": 7650 + }, + { + "epoch": 1.3779174147217235, + "grad_norm": 6.257275581359863, + "learning_rate": 1.652423698384201e-05, + "loss": 8.7297, + "step": 7675 + }, + { + "epoch": 1.3824057450628366, + "grad_norm": 6.553880214691162, + "learning_rate": 1.657809694793537e-05, + "loss": 8.6223, + "step": 7700 + }, + { + "epoch": 1.3868940754039496, + "grad_norm": 5.5833587646484375, + "learning_rate": 1.6631956912028725e-05, + "loss": 8.6425, + "step": 7725 + }, + { + "epoch": 1.3913824057450628, + "grad_norm": 5.248819351196289, + "learning_rate": 1.6685816876122084e-05, + "loss": 8.6158, + "step": 7750 + }, + { + "epoch": 1.395870736086176, + "grad_norm": 6.609114170074463, + "learning_rate": 1.673967684021544e-05, + "loss": 8.6175, + "step": 7775 + }, + { + "epoch": 1.400359066427289, + "grad_norm": 5.470149517059326, + "learning_rate": 1.6793536804308798e-05, + "loss": 8.6876, + "step": 7800 + }, + { + "epoch": 1.404847396768402, + "grad_norm": 5.857100963592529, + "learning_rate": 
1.6847396768402153e-05, + "loss": 8.571, + "step": 7825 + }, + { + "epoch": 1.4093357271095153, + "grad_norm": 5.508138179779053, + "learning_rate": 1.6901256732495512e-05, + "loss": 8.6087, + "step": 7850 + }, + { + "epoch": 1.4138240574506284, + "grad_norm": 6.374080657958984, + "learning_rate": 1.695511669658887e-05, + "loss": 8.6484, + "step": 7875 + }, + { + "epoch": 1.4183123877917414, + "grad_norm": 7.112211227416992, + "learning_rate": 1.7008976660682226e-05, + "loss": 8.6316, + "step": 7900 + }, + { + "epoch": 1.4228007181328546, + "grad_norm": 5.720285892486572, + "learning_rate": 1.7062836624775585e-05, + "loss": 8.5786, + "step": 7925 + }, + { + "epoch": 1.4272890484739678, + "grad_norm": 6.270101547241211, + "learning_rate": 1.711669658886894e-05, + "loss": 8.6805, + "step": 7950 + }, + { + "epoch": 1.4317773788150807, + "grad_norm": 5.475482940673828, + "learning_rate": 1.71705565529623e-05, + "loss": 8.5708, + "step": 7975 + }, + { + "epoch": 1.436265709156194, + "grad_norm": 5.396524429321289, + "learning_rate": 1.7224416517055655e-05, + "loss": 8.3988, + "step": 8000 + }, + { + "epoch": 1.440754039497307, + "grad_norm": 5.261691570281982, + "learning_rate": 1.7278276481149013e-05, + "loss": 8.6025, + "step": 8025 + }, + { + "epoch": 1.44524236983842, + "grad_norm": 5.309108734130859, + "learning_rate": 1.7332136445242372e-05, + "loss": 8.6448, + "step": 8050 + }, + { + "epoch": 1.4497307001795332, + "grad_norm": 8.26509952545166, + "learning_rate": 1.7385996409335728e-05, + "loss": 8.667, + "step": 8075 + }, + { + "epoch": 1.4542190305206464, + "grad_norm": 5.413661956787109, + "learning_rate": 1.7439856373429086e-05, + "loss": 8.6273, + "step": 8100 + }, + { + "epoch": 1.4587073608617596, + "grad_norm": 5.341324806213379, + "learning_rate": 1.749371633752244e-05, + "loss": 8.5983, + "step": 8125 + }, + { + "epoch": 1.4631956912028725, + "grad_norm": 5.812887191772461, + "learning_rate": 1.7547576301615797e-05, + "loss": 8.6249, + "step": 8150 + }, 
+ { + "epoch": 1.4676840215439857, + "grad_norm": 5.325205326080322, + "learning_rate": 1.7601436265709156e-05, + "loss": 8.6098, + "step": 8175 + }, + { + "epoch": 1.4721723518850989, + "grad_norm": 6.374330520629883, + "learning_rate": 1.7655296229802515e-05, + "loss": 8.5403, + "step": 8200 + }, + { + "epoch": 1.4766606822262118, + "grad_norm": 6.189131259918213, + "learning_rate": 1.7709156193895873e-05, + "loss": 8.6532, + "step": 8225 + }, + { + "epoch": 1.481149012567325, + "grad_norm": 9.60671615600586, + "learning_rate": 1.776301615798923e-05, + "loss": 8.543, + "step": 8250 + }, + { + "epoch": 1.4856373429084382, + "grad_norm": 5.8232879638671875, + "learning_rate": 1.7816876122082587e-05, + "loss": 8.5723, + "step": 8275 + }, + { + "epoch": 1.4901256732495511, + "grad_norm": 8.472805976867676, + "learning_rate": 1.7870736086175943e-05, + "loss": 8.499, + "step": 8300 + }, + { + "epoch": 1.4946140035906643, + "grad_norm": 5.473113059997559, + "learning_rate": 1.7924596050269298e-05, + "loss": 8.5224, + "step": 8325 + }, + { + "epoch": 1.4991023339317775, + "grad_norm": 5.414918899536133, + "learning_rate": 1.7978456014362657e-05, + "loss": 8.4924, + "step": 8350 + }, + { + "epoch": 1.5035906642728905, + "grad_norm": 6.2121195793151855, + "learning_rate": 1.8032315978456012e-05, + "loss": 8.5326, + "step": 8375 + }, + { + "epoch": 1.5080789946140036, + "grad_norm": 5.906697750091553, + "learning_rate": 1.8086175942549374e-05, + "loss": 8.3582, + "step": 8400 + }, + { + "epoch": 1.5125673249551168, + "grad_norm": 6.550022602081299, + "learning_rate": 1.814003590664273e-05, + "loss": 8.5556, + "step": 8425 + }, + { + "epoch": 1.5170556552962298, + "grad_norm": 5.851115703582764, + "learning_rate": 1.819389587073609e-05, + "loss": 8.5064, + "step": 8450 + }, + { + "epoch": 1.521543985637343, + "grad_norm": 5.953863143920898, + "learning_rate": 1.8247755834829444e-05, + "loss": 8.59, + "step": 8475 + }, + { + "epoch": 1.5260323159784561, + "grad_norm": 
7.9262847900390625, + "learning_rate": 1.83016157989228e-05, + "loss": 8.5201, + "step": 8500 + }, + { + "epoch": 1.530520646319569, + "grad_norm": 5.078342437744141, + "learning_rate": 1.8355475763016158e-05, + "loss": 8.5763, + "step": 8525 + }, + { + "epoch": 1.5350089766606823, + "grad_norm": 6.203643798828125, + "learning_rate": 1.8409335727109514e-05, + "loss": 8.4395, + "step": 8550 + }, + { + "epoch": 1.5394973070017954, + "grad_norm": 6.355841636657715, + "learning_rate": 1.8463195691202876e-05, + "loss": 8.4392, + "step": 8575 + }, + { + "epoch": 1.5439856373429084, + "grad_norm": 6.290584564208984, + "learning_rate": 1.851705565529623e-05, + "loss": 8.537, + "step": 8600 + }, + { + "epoch": 1.5484739676840216, + "grad_norm": 5.499734401702881, + "learning_rate": 1.8570915619389586e-05, + "loss": 8.6495, + "step": 8625 + }, + { + "epoch": 1.5529622980251347, + "grad_norm": 6.269656181335449, + "learning_rate": 1.8624775583482945e-05, + "loss": 8.5767, + "step": 8650 + }, + { + "epoch": 1.5574506283662477, + "grad_norm": 6.339563846588135, + "learning_rate": 1.86786355475763e-05, + "loss": 8.5174, + "step": 8675 + }, + { + "epoch": 1.5619389587073609, + "grad_norm": 6.188441276550293, + "learning_rate": 1.873249551166966e-05, + "loss": 8.5618, + "step": 8700 + }, + { + "epoch": 1.566427289048474, + "grad_norm": 6.0984601974487305, + "learning_rate": 1.8786355475763015e-05, + "loss": 8.4995, + "step": 8725 + }, + { + "epoch": 1.570915619389587, + "grad_norm": 10.764265060424805, + "learning_rate": 1.8840215439856377e-05, + "loss": 8.5041, + "step": 8750 + }, + { + "epoch": 1.5754039497307002, + "grad_norm": 5.141970157623291, + "learning_rate": 1.8894075403949732e-05, + "loss": 8.5132, + "step": 8775 + }, + { + "epoch": 1.5798922800718134, + "grad_norm": 5.522831439971924, + "learning_rate": 1.8947935368043088e-05, + "loss": 8.5552, + "step": 8800 + }, + { + "epoch": 1.5843806104129263, + "grad_norm": 5.247507572174072, + "learning_rate": 
1.9001795332136446e-05, + "loss": 8.4791, + "step": 8825 + }, + { + "epoch": 1.5888689407540395, + "grad_norm": 5.485008239746094, + "learning_rate": 1.9055655296229802e-05, + "loss": 8.5481, + "step": 8850 + }, + { + "epoch": 1.5933572710951527, + "grad_norm": 5.944901943206787, + "learning_rate": 1.910951526032316e-05, + "loss": 8.5067, + "step": 8875 + }, + { + "epoch": 1.5978456014362656, + "grad_norm": 6.719369888305664, + "learning_rate": 1.9163375224416516e-05, + "loss": 8.4367, + "step": 8900 + }, + { + "epoch": 1.6023339317773788, + "grad_norm": 7.536819934844971, + "learning_rate": 1.9217235188509875e-05, + "loss": 8.5488, + "step": 8925 + }, + { + "epoch": 1.606822262118492, + "grad_norm": 5.813321590423584, + "learning_rate": 1.9271095152603233e-05, + "loss": 8.5272, + "step": 8950 + }, + { + "epoch": 1.611310592459605, + "grad_norm": 6.640773773193359, + "learning_rate": 1.932495511669659e-05, + "loss": 8.5534, + "step": 8975 + }, + { + "epoch": 1.6157989228007181, + "grad_norm": 6.261020183563232, + "learning_rate": 1.9378815080789948e-05, + "loss": 8.428, + "step": 9000 + }, + { + "epoch": 1.6202872531418313, + "grad_norm": 6.959474563598633, + "learning_rate": 1.9432675044883303e-05, + "loss": 8.3944, + "step": 9025 + }, + { + "epoch": 1.6247755834829443, + "grad_norm": 5.624547481536865, + "learning_rate": 1.948653500897666e-05, + "loss": 8.4173, + "step": 9050 + }, + { + "epoch": 1.6292639138240574, + "grad_norm": 7.659823894500732, + "learning_rate": 1.9540394973070017e-05, + "loss": 8.5255, + "step": 9075 + }, + { + "epoch": 1.6337522441651706, + "grad_norm": 7.0507683753967285, + "learning_rate": 1.9594254937163376e-05, + "loss": 8.5104, + "step": 9100 + }, + { + "epoch": 1.6382405745062836, + "grad_norm": 16.380712509155273, + "learning_rate": 1.9648114901256735e-05, + "loss": 8.4946, + "step": 9125 + }, + { + "epoch": 1.6427289048473968, + "grad_norm": 6.516059398651123, + "learning_rate": 1.970197486535009e-05, + "loss": 8.5331, + "step": 
9150 + }, + { + "epoch": 1.64721723518851, + "grad_norm": 6.012033462524414, + "learning_rate": 1.975583482944345e-05, + "loss": 8.486, + "step": 9175 + }, + { + "epoch": 1.6517055655296229, + "grad_norm": 7.36171817779541, + "learning_rate": 1.9809694793536804e-05, + "loss": 8.401, + "step": 9200 + }, + { + "epoch": 1.656193895870736, + "grad_norm": 6.235710144042969, + "learning_rate": 1.9863554757630163e-05, + "loss": 8.3684, + "step": 9225 + }, + { + "epoch": 1.6606822262118492, + "grad_norm": 10.659588813781738, + "learning_rate": 1.9917414721723518e-05, + "loss": 8.5803, + "step": 9250 + }, + { + "epoch": 1.6651705565529622, + "grad_norm": 6.105778694152832, + "learning_rate": 1.9971274685816877e-05, + "loss": 8.503, + "step": 9275 + }, + { + "epoch": 1.6696588868940754, + "grad_norm": 7.78377628326416, + "learning_rate": 2.0025134649910236e-05, + "loss": 8.4993, + "step": 9300 + }, + { + "epoch": 1.6741472172351886, + "grad_norm": 6.854383945465088, + "learning_rate": 2.007899461400359e-05, + "loss": 8.4914, + "step": 9325 + }, + { + "epoch": 1.6786355475763015, + "grad_norm": 6.901246547698975, + "learning_rate": 2.013285457809695e-05, + "loss": 8.5643, + "step": 9350 + }, + { + "epoch": 1.6831238779174147, + "grad_norm": 6.627065181732178, + "learning_rate": 2.0186714542190305e-05, + "loss": 8.3181, + "step": 9375 + }, + { + "epoch": 1.6876122082585279, + "grad_norm": 6.647068500518799, + "learning_rate": 2.024057450628366e-05, + "loss": 8.4923, + "step": 9400 + }, + { + "epoch": 1.6921005385996408, + "grad_norm": 7.653048515319824, + "learning_rate": 2.029443447037702e-05, + "loss": 8.4198, + "step": 9425 + }, + { + "epoch": 1.696588868940754, + "grad_norm": 8.176301956176758, + "learning_rate": 2.0348294434470378e-05, + "loss": 8.3543, + "step": 9450 + }, + { + "epoch": 1.7010771992818672, + "grad_norm": 7.623626232147217, + "learning_rate": 2.0402154398563737e-05, + "loss": 8.3787, + "step": 9475 + }, + { + "epoch": 1.7055655296229801, + "grad_norm": 
7.686671257019043, + "learning_rate": 2.0456014362657092e-05, + "loss": 8.4113, + "step": 9500 + }, + { + "epoch": 1.7100538599640933, + "grad_norm": 7.778633117675781, + "learning_rate": 2.050987432675045e-05, + "loss": 8.4326, + "step": 9525 + }, + { + "epoch": 1.7145421903052065, + "grad_norm": 6.534867763519287, + "learning_rate": 2.0563734290843806e-05, + "loss": 8.339, + "step": 9550 + }, + { + "epoch": 1.7190305206463194, + "grad_norm": 7.026156902313232, + "learning_rate": 2.0617594254937162e-05, + "loss": 8.3217, + "step": 9575 + }, + { + "epoch": 1.7235188509874326, + "grad_norm": 7.087747573852539, + "learning_rate": 2.067145421903052e-05, + "loss": 8.3089, + "step": 9600 + }, + { + "epoch": 1.7280071813285458, + "grad_norm": 5.767719268798828, + "learning_rate": 2.0725314183123876e-05, + "loss": 8.4053, + "step": 9625 + }, + { + "epoch": 1.7324955116696588, + "grad_norm": 6.162588119506836, + "learning_rate": 2.0779174147217238e-05, + "loss": 8.3446, + "step": 9650 + }, + { + "epoch": 1.736983842010772, + "grad_norm": 6.57517147064209, + "learning_rate": 2.0833034111310593e-05, + "loss": 8.4743, + "step": 9675 + }, + { + "epoch": 1.7414721723518851, + "grad_norm": 6.348609924316406, + "learning_rate": 2.0886894075403952e-05, + "loss": 8.3012, + "step": 9700 + }, + { + "epoch": 1.745960502692998, + "grad_norm": 6.051657199859619, + "learning_rate": 2.0940754039497308e-05, + "loss": 8.4042, + "step": 9725 + }, + { + "epoch": 1.7504488330341115, + "grad_norm": 7.098099231719971, + "learning_rate": 2.0994614003590663e-05, + "loss": 8.3413, + "step": 9750 + }, + { + "epoch": 1.7549371633752244, + "grad_norm": 6.854818820953369, + "learning_rate": 2.1048473967684022e-05, + "loss": 8.372, + "step": 9775 + }, + { + "epoch": 1.7594254937163374, + "grad_norm": 7.056049823760986, + "learning_rate": 2.1102333931777377e-05, + "loss": 8.3973, + "step": 9800 + }, + { + "epoch": 1.7639138240574508, + "grad_norm": 5.698050498962402, + "learning_rate": 
2.115619389587074e-05, + "loss": 8.4336, + "step": 9825 + }, + { + "epoch": 1.7684021543985637, + "grad_norm": 5.613630771636963, + "learning_rate": 2.1210053859964095e-05, + "loss": 8.3424, + "step": 9850 + }, + { + "epoch": 1.7728904847396767, + "grad_norm": 6.284994602203369, + "learning_rate": 2.126391382405745e-05, + "loss": 8.3095, + "step": 9875 + }, + { + "epoch": 1.77737881508079, + "grad_norm": 6.404321670532227, + "learning_rate": 2.131777378815081e-05, + "loss": 8.3915, + "step": 9900 + }, + { + "epoch": 1.781867145421903, + "grad_norm": 6.201684474945068, + "learning_rate": 2.1371633752244164e-05, + "loss": 8.4212, + "step": 9925 + }, + { + "epoch": 1.786355475763016, + "grad_norm": 5.832046985626221, + "learning_rate": 2.1425493716337523e-05, + "loss": 8.3013, + "step": 9950 + }, + { + "epoch": 1.7908438061041294, + "grad_norm": 8.582700729370117, + "learning_rate": 2.1479353680430878e-05, + "loss": 8.3654, + "step": 9975 + }, + { + "epoch": 1.7953321364452424, + "grad_norm": 5.957783222198486, + "learning_rate": 2.153321364452424e-05, + "loss": 8.4447, + "step": 10000 + }, + { + "epoch": 1.7998204667863553, + "grad_norm": 7.6008076667785645, + "learning_rate": 2.1587073608617596e-05, + "loss": 8.3035, + "step": 10025 + }, + { + "epoch": 1.8043087971274687, + "grad_norm": 7.054285049438477, + "learning_rate": 2.164093357271095e-05, + "loss": 8.4796, + "step": 10050 + }, + { + "epoch": 1.8087971274685817, + "grad_norm": 6.03176212310791, + "learning_rate": 2.169479353680431e-05, + "loss": 8.419, + "step": 10075 + }, + { + "epoch": 1.8132854578096946, + "grad_norm": 6.665684700012207, + "learning_rate": 2.1748653500897665e-05, + "loss": 8.3143, + "step": 10100 + }, + { + "epoch": 1.817773788150808, + "grad_norm": 11.72507095336914, + "learning_rate": 2.1802513464991024e-05, + "loss": 8.343, + "step": 10125 + }, + { + "epoch": 1.822262118491921, + "grad_norm": 6.700618267059326, + "learning_rate": 2.185637342908438e-05, + "loss": 8.2738, + "step": 10150 
+ }, + { + "epoch": 1.826750448833034, + "grad_norm": 5.941748142242432, + "learning_rate": 2.1910233393177738e-05, + "loss": 8.3595, + "step": 10175 + }, + { + "epoch": 1.8312387791741473, + "grad_norm": 6.252816200256348, + "learning_rate": 2.1964093357271097e-05, + "loss": 8.4251, + "step": 10200 + }, + { + "epoch": 1.8357271095152603, + "grad_norm": 6.3851752281188965, + "learning_rate": 2.2017953321364452e-05, + "loss": 8.3952, + "step": 10225 + }, + { + "epoch": 1.8402154398563735, + "grad_norm": 6.919291019439697, + "learning_rate": 2.207181328545781e-05, + "loss": 8.3402, + "step": 10250 + }, + { + "epoch": 1.8447037701974867, + "grad_norm": 6.122730255126953, + "learning_rate": 2.2125673249551166e-05, + "loss": 8.3211, + "step": 10275 + }, + { + "epoch": 1.8491921005385996, + "grad_norm": 6.055422782897949, + "learning_rate": 2.2179533213644525e-05, + "loss": 8.2345, + "step": 10300 + }, + { + "epoch": 1.8536804308797128, + "grad_norm": 6.893284797668457, + "learning_rate": 2.223339317773788e-05, + "loss": 8.2778, + "step": 10325 + }, + { + "epoch": 1.858168761220826, + "grad_norm": 6.630164623260498, + "learning_rate": 2.228725314183124e-05, + "loss": 8.2104, + "step": 10350 + }, + { + "epoch": 1.862657091561939, + "grad_norm": 6.4832329750061035, + "learning_rate": 2.2341113105924598e-05, + "loss": 8.1987, + "step": 10375 + }, + { + "epoch": 1.867145421903052, + "grad_norm": 5.648968696594238, + "learning_rate": 2.2394973070017954e-05, + "loss": 8.2629, + "step": 10400 + }, + { + "epoch": 1.8716337522441653, + "grad_norm": 6.0856547355651855, + "learning_rate": 2.2448833034111312e-05, + "loss": 8.3241, + "step": 10425 + }, + { + "epoch": 1.8761220825852782, + "grad_norm": 6.0178375244140625, + "learning_rate": 2.2502692998204668e-05, + "loss": 8.3691, + "step": 10450 + }, + { + "epoch": 1.8806104129263914, + "grad_norm": 5.778713226318359, + "learning_rate": 2.2556552962298026e-05, + "loss": 8.3126, + "step": 10475 + }, + { + "epoch": 1.8850987432675046, 
+ "grad_norm": 6.281996726989746, + "learning_rate": 2.2610412926391382e-05, + "loss": 8.12, + "step": 10500 + }, + { + "epoch": 1.8895870736086176, + "grad_norm": 5.612243175506592, + "learning_rate": 2.266427289048474e-05, + "loss": 8.2734, + "step": 10525 + }, + { + "epoch": 1.8940754039497307, + "grad_norm": 6.363570690155029, + "learning_rate": 2.27181328545781e-05, + "loss": 8.1299, + "step": 10550 + }, + { + "epoch": 1.898563734290844, + "grad_norm": 5.983333110809326, + "learning_rate": 2.2771992818671455e-05, + "loss": 8.2296, + "step": 10575 + }, + { + "epoch": 1.9030520646319569, + "grad_norm": 9.443771362304688, + "learning_rate": 2.2825852782764813e-05, + "loss": 8.3316, + "step": 10600 + }, + { + "epoch": 1.90754039497307, + "grad_norm": 6.387014389038086, + "learning_rate": 2.287971274685817e-05, + "loss": 8.2418, + "step": 10625 + }, + { + "epoch": 1.9120287253141832, + "grad_norm": 6.436453342437744, + "learning_rate": 2.2933572710951524e-05, + "loss": 8.3353, + "step": 10650 + }, + { + "epoch": 1.9165170556552962, + "grad_norm": 7.218207836151123, + "learning_rate": 2.2987432675044883e-05, + "loss": 8.2901, + "step": 10675 + }, + { + "epoch": 1.9210053859964094, + "grad_norm": 6.231134414672852, + "learning_rate": 2.3041292639138242e-05, + "loss": 8.2771, + "step": 10700 + }, + { + "epoch": 1.9254937163375225, + "grad_norm": 6.847906589508057, + "learning_rate": 2.30951526032316e-05, + "loss": 8.2115, + "step": 10725 + }, + { + "epoch": 1.9299820466786355, + "grad_norm": 5.870643138885498, + "learning_rate": 2.3149012567324956e-05, + "loss": 8.3416, + "step": 10750 + }, + { + "epoch": 1.9344703770197487, + "grad_norm": 7.517190456390381, + "learning_rate": 2.3202872531418315e-05, + "loss": 8.2205, + "step": 10775 + }, + { + "epoch": 1.9389587073608618, + "grad_norm": 6.756166458129883, + "learning_rate": 2.325673249551167e-05, + "loss": 8.372, + "step": 10800 + }, + { + "epoch": 1.9434470377019748, + "grad_norm": 6.318005084991455, + 
"learning_rate": 2.3310592459605025e-05, + "loss": 8.227, + "step": 10825 + }, + { + "epoch": 1.947935368043088, + "grad_norm": 5.4359025955200195, + "learning_rate": 2.3364452423698384e-05, + "loss": 8.3149, + "step": 10850 + }, + { + "epoch": 1.9524236983842012, + "grad_norm": 6.002923488616943, + "learning_rate": 2.341831238779174e-05, + "loss": 8.2184, + "step": 10875 + }, + { + "epoch": 1.9569120287253141, + "grad_norm": 5.8620219230651855, + "learning_rate": 2.34721723518851e-05, + "loss": 8.085, + "step": 10900 + }, + { + "epoch": 1.9614003590664273, + "grad_norm": 6.729288101196289, + "learning_rate": 2.3526032315978457e-05, + "loss": 8.2091, + "step": 10925 + }, + { + "epoch": 1.9658886894075405, + "grad_norm": 5.725154876708984, + "learning_rate": 2.3579892280071816e-05, + "loss": 8.2059, + "step": 10950 + }, + { + "epoch": 1.9703770197486534, + "grad_norm": 7.5964035987854, + "learning_rate": 2.363375224416517e-05, + "loss": 8.1061, + "step": 10975 + }, + { + "epoch": 1.9748653500897666, + "grad_norm": 8.035944938659668, + "learning_rate": 2.3687612208258527e-05, + "loss": 8.2262, + "step": 11000 + }, + { + "epoch": 1.9793536804308798, + "grad_norm": 6.454775333404541, + "learning_rate": 2.3741472172351885e-05, + "loss": 8.0139, + "step": 11025 + }, + { + "epoch": 1.9838420107719927, + "grad_norm": 7.088364124298096, + "learning_rate": 2.379533213644524e-05, + "loss": 8.0666, + "step": 11050 + }, + { + "epoch": 1.988330341113106, + "grad_norm": 7.654647350311279, + "learning_rate": 2.3849192100538603e-05, + "loss": 8.2133, + "step": 11075 + }, + { + "epoch": 1.992818671454219, + "grad_norm": 6.231107711791992, + "learning_rate": 2.3903052064631958e-05, + "loss": 8.1671, + "step": 11100 + }, + { + "epoch": 1.997307001795332, + "grad_norm": 5.725689888000488, + "learning_rate": 2.3956912028725314e-05, + "loss": 8.1193, + "step": 11125 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.03081740591526168, + "eval_f1_macro": 0.000179471318828956, + 
"eval_f1_micro": 0.03081740591526168, + "eval_f1_weighted": 0.005907768569001227, + "eval_loss": 8.45116138458252, + "eval_precision_macro": 0.00013620782441027172, + "eval_precision_micro": 0.03081740591526168, + "eval_precision_weighted": 0.003920950504748784, + "eval_recall_macro": 0.0008814524322187799, + "eval_recall_micro": 0.03081740591526168, + "eval_recall_weighted": 0.03081740591526168, + "eval_runtime": 63.6294, + "eval_samples_per_second": 823.094, + "eval_steps_per_second": 25.727, + "step": 11140 + }, + { + "epoch": 2.0017953321364454, + "grad_norm": 6.024521827697754, + "learning_rate": 2.4010771992818672e-05, + "loss": 8.0677, + "step": 11150 + }, + { + "epoch": 2.0062836624775584, + "grad_norm": 7.922410488128662, + "learning_rate": 2.4064631956912028e-05, + "loss": 8.033, + "step": 11175 + }, + { + "epoch": 2.0107719928186714, + "grad_norm": 5.915724277496338, + "learning_rate": 2.4118491921005386e-05, + "loss": 8.0297, + "step": 11200 + }, + { + "epoch": 2.0152603231597848, + "grad_norm": 5.845516204833984, + "learning_rate": 2.4172351885098742e-05, + "loss": 7.9441, + "step": 11225 + }, + { + "epoch": 2.0197486535008977, + "grad_norm": 6.512404441833496, + "learning_rate": 2.4226211849192104e-05, + "loss": 8.0425, + "step": 11250 + }, + { + "epoch": 2.0242369838420107, + "grad_norm": 6.325797080993652, + "learning_rate": 2.428007181328546e-05, + "loss": 8.0772, + "step": 11275 + }, + { + "epoch": 2.028725314183124, + "grad_norm": 6.583460807800293, + "learning_rate": 2.4333931777378815e-05, + "loss": 8.0348, + "step": 11300 + }, + { + "epoch": 2.033213644524237, + "grad_norm": 6.187891960144043, + "learning_rate": 2.4387791741472174e-05, + "loss": 8.1054, + "step": 11325 + }, + { + "epoch": 2.03770197486535, + "grad_norm": 6.792165756225586, + "learning_rate": 2.444165170556553e-05, + "loss": 8.1517, + "step": 11350 + }, + { + "epoch": 2.0421903052064634, + "grad_norm": 6.890494346618652, + "learning_rate": 2.4495511669658888e-05, + "loss": 
7.9539, + "step": 11375 + }, + { + "epoch": 2.0466786355475763, + "grad_norm": 6.585249900817871, + "learning_rate": 2.4549371633752243e-05, + "loss": 8.0096, + "step": 11400 + }, + { + "epoch": 2.0511669658886893, + "grad_norm": 8.685056686401367, + "learning_rate": 2.4603231597845602e-05, + "loss": 8.0059, + "step": 11425 + }, + { + "epoch": 2.0556552962298027, + "grad_norm": 5.660272121429443, + "learning_rate": 2.465709156193896e-05, + "loss": 7.9044, + "step": 11450 + }, + { + "epoch": 2.0601436265709157, + "grad_norm": 6.457510948181152, + "learning_rate": 2.4710951526032316e-05, + "loss": 8.0513, + "step": 11475 + }, + { + "epoch": 2.0646319569120286, + "grad_norm": 5.8820953369140625, + "learning_rate": 2.4764811490125675e-05, + "loss": 8.0596, + "step": 11500 + }, + { + "epoch": 2.069120287253142, + "grad_norm": 6.007601261138916, + "learning_rate": 2.481867145421903e-05, + "loss": 7.8678, + "step": 11525 + }, + { + "epoch": 2.073608617594255, + "grad_norm": 9.694230079650879, + "learning_rate": 2.487253141831239e-05, + "loss": 7.9587, + "step": 11550 + }, + { + "epoch": 2.078096947935368, + "grad_norm": 5.8619184494018555, + "learning_rate": 2.4926391382405744e-05, + "loss": 7.9995, + "step": 11575 + }, + { + "epoch": 2.0825852782764813, + "grad_norm": 6.379667282104492, + "learning_rate": 2.4980251346499103e-05, + "loss": 7.9613, + "step": 11600 + }, + { + "epoch": 2.0870736086175943, + "grad_norm": 6.961312770843506, + "learning_rate": 2.5034111310592462e-05, + "loss": 7.9774, + "step": 11625 + }, + { + "epoch": 2.0915619389587072, + "grad_norm": 7.911416530609131, + "learning_rate": 2.5087971274685817e-05, + "loss": 7.8455, + "step": 11650 + }, + { + "epoch": 2.0960502692998206, + "grad_norm": 7.811432361602783, + "learning_rate": 2.5141831238779176e-05, + "loss": 8.0231, + "step": 11675 + }, + { + "epoch": 2.1005385996409336, + "grad_norm": null, + "learning_rate": 2.51935368043088e-05, + "loss": 7.9671, + "step": 11700 + }, + { + "epoch": 
2.1050269299820465, + "grad_norm": 6.347258567810059, + "learning_rate": 2.5247396768402154e-05, + "loss": 7.8802, + "step": 11725 + }, + { + "epoch": 2.10951526032316, + "grad_norm": 5.548816204071045, + "learning_rate": 2.530125673249551e-05, + "loss": 8.0498, + "step": 11750 + }, + { + "epoch": 2.114003590664273, + "grad_norm": 6.668107032775879, + "learning_rate": 2.535511669658887e-05, + "loss": 7.8332, + "step": 11775 + }, + { + "epoch": 2.118491921005386, + "grad_norm": 6.530945301055908, + "learning_rate": 2.5408976660682227e-05, + "loss": 7.961, + "step": 11800 + }, + { + "epoch": 2.1229802513464993, + "grad_norm": 6.61883544921875, + "learning_rate": 2.5462836624775586e-05, + "loss": 7.8341, + "step": 11825 + }, + { + "epoch": 2.127468581687612, + "grad_norm": 6.662093639373779, + "learning_rate": 2.551669658886894e-05, + "loss": 7.8939, + "step": 11850 + }, + { + "epoch": 2.131956912028725, + "grad_norm": 7.6850972175598145, + "learning_rate": 2.55705565529623e-05, + "loss": 7.9604, + "step": 11875 + }, + { + "epoch": 2.1364452423698386, + "grad_norm": 6.225870609283447, + "learning_rate": 2.5624416517055655e-05, + "loss": 7.8837, + "step": 11900 + }, + { + "epoch": 2.1409335727109515, + "grad_norm": 6.159789562225342, + "learning_rate": 2.567827648114901e-05, + "loss": 7.9413, + "step": 11925 + }, + { + "epoch": 2.1454219030520645, + "grad_norm": 6.3409295082092285, + "learning_rate": 2.573213644524237e-05, + "loss": 7.8167, + "step": 11950 + }, + { + "epoch": 2.149910233393178, + "grad_norm": 5.945008277893066, + "learning_rate": 2.578599640933573e-05, + "loss": 7.9323, + "step": 11975 + }, + { + "epoch": 2.154398563734291, + "grad_norm": 6.656594276428223, + "learning_rate": 2.5839856373429087e-05, + "loss": 7.9716, + "step": 12000 + }, + { + "epoch": 2.158886894075404, + "grad_norm": 9.291890144348145, + "learning_rate": 2.5893716337522443e-05, + "loss": 7.8212, + "step": 12025 + }, + { + "epoch": 2.163375224416517, + "grad_norm": 8.476262092590332, 
+ "learning_rate": 2.59475763016158e-05, + "loss": 7.8704, + "step": 12050 + }, + { + "epoch": 2.16786355475763, + "grad_norm": 7.172900199890137, + "learning_rate": 2.6001436265709157e-05, + "loss": 7.9021, + "step": 12075 + }, + { + "epoch": 2.172351885098743, + "grad_norm": 7.949319362640381, + "learning_rate": 2.6055296229802512e-05, + "loss": 7.8601, + "step": 12100 + }, + { + "epoch": 2.1768402154398565, + "grad_norm": 5.598630905151367, + "learning_rate": 2.610915619389587e-05, + "loss": 7.9757, + "step": 12125 + }, + { + "epoch": 2.1813285457809695, + "grad_norm": 6.467071533203125, + "learning_rate": 2.616301615798923e-05, + "loss": 7.9235, + "step": 12150 + }, + { + "epoch": 2.1858168761220824, + "grad_norm": 6.878960132598877, + "learning_rate": 2.621687612208259e-05, + "loss": 7.8294, + "step": 12175 + }, + { + "epoch": 2.190305206463196, + "grad_norm": 7.32294225692749, + "learning_rate": 2.6270736086175944e-05, + "loss": 7.9699, + "step": 12200 + }, + { + "epoch": 2.1947935368043088, + "grad_norm": 6.158316612243652, + "learning_rate": 2.63245960502693e-05, + "loss": 7.8798, + "step": 12225 + }, + { + "epoch": 2.1992818671454217, + "grad_norm": 5.779697418212891, + "learning_rate": 2.6378456014362658e-05, + "loss": 7.934, + "step": 12250 + }, + { + "epoch": 2.203770197486535, + "grad_norm": 7.495954990386963, + "learning_rate": 2.6432315978456013e-05, + "loss": 7.7708, + "step": 12275 + }, + { + "epoch": 2.208258527827648, + "grad_norm": 6.747635841369629, + "learning_rate": 2.6486175942549372e-05, + "loss": 7.7811, + "step": 12300 + }, + { + "epoch": 2.212746858168761, + "grad_norm": 6.926841735839844, + "learning_rate": 2.654003590664273e-05, + "loss": 7.8048, + "step": 12325 + }, + { + "epoch": 2.2172351885098744, + "grad_norm": 7.086328029632568, + "learning_rate": 2.659389587073609e-05, + "loss": 7.8412, + "step": 12350 + }, + { + "epoch": 2.2217235188509874, + "grad_norm": 7.095609664916992, + "learning_rate": 2.6647755834829445e-05, + "loss": 
8.0281, + "step": 12375 + }, + { + "epoch": 2.2262118491921004, + "grad_norm": 10.214571952819824, + "learning_rate": 2.67016157989228e-05, + "loss": 7.7467, + "step": 12400 + }, + { + "epoch": 2.2307001795332138, + "grad_norm": 7.159746170043945, + "learning_rate": 2.675547576301616e-05, + "loss": 7.9568, + "step": 12425 + }, + { + "epoch": 2.2351885098743267, + "grad_norm": 6.517829895019531, + "learning_rate": 2.6809335727109514e-05, + "loss": 7.8603, + "step": 12450 + }, + { + "epoch": 2.2396768402154397, + "grad_norm": 7.21049165725708, + "learning_rate": 2.6863195691202873e-05, + "loss": 7.8489, + "step": 12475 + }, + { + "epoch": 2.244165170556553, + "grad_norm": 7.560873031616211, + "learning_rate": 2.6917055655296232e-05, + "loss": 7.8647, + "step": 12500 + }, + { + "epoch": 2.248653500897666, + "grad_norm": 7.671408653259277, + "learning_rate": 2.6970915619389587e-05, + "loss": 7.7361, + "step": 12525 + }, + { + "epoch": 2.253141831238779, + "grad_norm": 6.50800895690918, + "learning_rate": 2.7024775583482946e-05, + "loss": 7.9221, + "step": 12550 + }, + { + "epoch": 2.2576301615798924, + "grad_norm": 6.967040538787842, + "learning_rate": 2.70786355475763e-05, + "loss": 7.8362, + "step": 12575 + }, + { + "epoch": 2.2621184919210053, + "grad_norm": 7.446094512939453, + "learning_rate": 2.713249551166966e-05, + "loss": 7.9131, + "step": 12600 + }, + { + "epoch": 2.2666068222621183, + "grad_norm": 8.638845443725586, + "learning_rate": 2.7186355475763016e-05, + "loss": 7.7899, + "step": 12625 + }, + { + "epoch": 2.2710951526032317, + "grad_norm": 6.870044231414795, + "learning_rate": 2.7240215439856374e-05, + "loss": 7.7786, + "step": 12650 + }, + { + "epoch": 2.2755834829443446, + "grad_norm": 6.788309574127197, + "learning_rate": 2.729407540394973e-05, + "loss": 7.8916, + "step": 12675 + }, + { + "epoch": 2.280071813285458, + "grad_norm": 7.325921058654785, + "learning_rate": 2.734793536804309e-05, + "loss": 7.736, + "step": 12700 + }, + { + "epoch": 
2.284560143626571, + "grad_norm": 8.256213188171387, + "learning_rate": 2.7401795332136447e-05, + "loss": 7.9021, + "step": 12725 + }, + { + "epoch": 2.289048473967684, + "grad_norm": 9.529930114746094, + "learning_rate": 2.7455655296229803e-05, + "loss": 7.7811, + "step": 12750 + }, + { + "epoch": 2.293536804308797, + "grad_norm": 6.23882532119751, + "learning_rate": 2.750951526032316e-05, + "loss": 7.9418, + "step": 12775 + }, + { + "epoch": 2.2980251346499103, + "grad_norm": 6.770512580871582, + "learning_rate": 2.7563375224416517e-05, + "loss": 7.7001, + "step": 12800 + }, + { + "epoch": 2.3025134649910233, + "grad_norm": 8.434727668762207, + "learning_rate": 2.7617235188509876e-05, + "loss": 7.8356, + "step": 12825 + }, + { + "epoch": 2.3070017953321367, + "grad_norm": 7.253260135650635, + "learning_rate": 2.767109515260323e-05, + "loss": 7.9144, + "step": 12850 + }, + { + "epoch": 2.3114901256732496, + "grad_norm": 7.4588847160339355, + "learning_rate": 2.772495511669659e-05, + "loss": 7.7901, + "step": 12875 + }, + { + "epoch": 2.3159784560143626, + "grad_norm": 6.742276668548584, + "learning_rate": 2.777881508078995e-05, + "loss": 7.8818, + "step": 12900 + }, + { + "epoch": 2.3204667863554755, + "grad_norm": 7.081262588500977, + "learning_rate": 2.7832675044883304e-05, + "loss": 7.7537, + "step": 12925 + }, + { + "epoch": 2.324955116696589, + "grad_norm": 7.048617362976074, + "learning_rate": 2.7886535008976663e-05, + "loss": 7.9, + "step": 12950 + }, + { + "epoch": 2.329443447037702, + "grad_norm": 7.736903667449951, + "learning_rate": 2.7940394973070018e-05, + "loss": 7.801, + "step": 12975 + }, + { + "epoch": 2.3339317773788153, + "grad_norm": 7.7169294357299805, + "learning_rate": 2.7994254937163373e-05, + "loss": 7.936, + "step": 13000 + }, + { + "epoch": 2.3384201077199283, + "grad_norm": 8.288802146911621, + "learning_rate": 2.8048114901256732e-05, + "loss": 7.6394, + "step": 13025 + }, + { + "epoch": 2.342908438061041, + "grad_norm": 
6.3444132804870605, + "learning_rate": 2.810197486535009e-05, + "loss": 7.7867, + "step": 13050 + }, + { + "epoch": 2.347396768402154, + "grad_norm": 7.248111248016357, + "learning_rate": 2.815583482944345e-05, + "loss": 7.801, + "step": 13075 + }, + { + "epoch": 2.3518850987432676, + "grad_norm": 6.109870910644531, + "learning_rate": 2.8209694793536805e-05, + "loss": 7.725, + "step": 13100 + }, + { + "epoch": 2.3563734290843805, + "grad_norm": 7.957067966461182, + "learning_rate": 2.8263554757630164e-05, + "loss": 7.7624, + "step": 13125 + }, + { + "epoch": 2.360861759425494, + "grad_norm": 7.3112030029296875, + "learning_rate": 2.831741472172352e-05, + "loss": 7.7253, + "step": 13150 + }, + { + "epoch": 2.365350089766607, + "grad_norm": 7.657966136932373, + "learning_rate": 2.8371274685816874e-05, + "loss": 7.9197, + "step": 13175 + }, + { + "epoch": 2.36983842010772, + "grad_norm": 6.684386253356934, + "learning_rate": 2.8425134649910233e-05, + "loss": 7.7712, + "step": 13200 + }, + { + "epoch": 2.374326750448833, + "grad_norm": 7.500448226928711, + "learning_rate": 2.8478994614003592e-05, + "loss": 7.6342, + "step": 13225 + }, + { + "epoch": 2.378815080789946, + "grad_norm": 6.45258903503418, + "learning_rate": 2.853285457809695e-05, + "loss": 7.8115, + "step": 13250 + }, + { + "epoch": 2.383303411131059, + "grad_norm": 5.90097713470459, + "learning_rate": 2.8586714542190306e-05, + "loss": 7.7934, + "step": 13275 + }, + { + "epoch": 2.3877917414721725, + "grad_norm": 5.789174556732178, + "learning_rate": 2.8640574506283665e-05, + "loss": 7.789, + "step": 13300 + }, + { + "epoch": 2.3922800718132855, + "grad_norm": 6.939302444458008, + "learning_rate": 2.869443447037702e-05, + "loss": 7.7822, + "step": 13325 + }, + { + "epoch": 2.3967684021543985, + "grad_norm": 6.6033616065979, + "learning_rate": 2.8748294434470376e-05, + "loss": 7.7013, + "step": 13350 + }, + { + "epoch": 2.401256732495512, + "grad_norm": 6.320329189300537, + "learning_rate": 
2.8802154398563734e-05, + "loss": 7.7126, + "step": 13375 + }, + { + "epoch": 2.405745062836625, + "grad_norm": 7.2059760093688965, + "learning_rate": 2.8856014362657093e-05, + "loss": 7.8945, + "step": 13400 + }, + { + "epoch": 2.4102333931777378, + "grad_norm": 9.132862091064453, + "learning_rate": 2.8909874326750452e-05, + "loss": 7.7685, + "step": 13425 + }, + { + "epoch": 2.414721723518851, + "grad_norm": 9.809309005737305, + "learning_rate": 2.8963734290843807e-05, + "loss": 7.7807, + "step": 13450 + }, + { + "epoch": 2.419210053859964, + "grad_norm": 7.103224277496338, + "learning_rate": 2.9017594254937163e-05, + "loss": 7.7753, + "step": 13475 + }, + { + "epoch": 2.423698384201077, + "grad_norm": 6.377775192260742, + "learning_rate": 2.907145421903052e-05, + "loss": 7.7223, + "step": 13500 + }, + { + "epoch": 2.4281867145421905, + "grad_norm": 6.885396480560303, + "learning_rate": 2.9125314183123877e-05, + "loss": 7.8329, + "step": 13525 + }, + { + "epoch": 2.4326750448833034, + "grad_norm": 6.969510078430176, + "learning_rate": 2.9179174147217236e-05, + "loss": 7.8197, + "step": 13550 + }, + { + "epoch": 2.4371633752244164, + "grad_norm": 7.398974418640137, + "learning_rate": 2.9233034111310594e-05, + "loss": 7.6043, + "step": 13575 + }, + { + "epoch": 2.44165170556553, + "grad_norm": 8.273489952087402, + "learning_rate": 2.9286894075403953e-05, + "loss": 7.8946, + "step": 13600 + }, + { + "epoch": 2.4461400359066428, + "grad_norm": 7.488043308258057, + "learning_rate": 2.934075403949731e-05, + "loss": 7.6681, + "step": 13625 + }, + { + "epoch": 2.4506283662477557, + "grad_norm": 6.600340366363525, + "learning_rate": 2.9394614003590664e-05, + "loss": 7.7917, + "step": 13650 + }, + { + "epoch": 2.455116696588869, + "grad_norm": 9.905529975891113, + "learning_rate": 2.9448473967684023e-05, + "loss": 7.5522, + "step": 13675 + }, + { + "epoch": 2.459605026929982, + "grad_norm": 7.194679260253906, + "learning_rate": 2.9502333931777378e-05, + "loss": 7.7372, + 
"step": 13700 + }, + { + "epoch": 2.464093357271095, + "grad_norm": 6.611643314361572, + "learning_rate": 2.9556193895870737e-05, + "loss": 7.6881, + "step": 13725 + }, + { + "epoch": 2.4685816876122084, + "grad_norm": 6.803295612335205, + "learning_rate": 2.9610053859964096e-05, + "loss": 7.7027, + "step": 13750 + }, + { + "epoch": 2.4730700179533214, + "grad_norm": 7.261129856109619, + "learning_rate": 2.966391382405745e-05, + "loss": 7.7327, + "step": 13775 + }, + { + "epoch": 2.4775583482944343, + "grad_norm": 5.962939739227295, + "learning_rate": 2.971777378815081e-05, + "loss": 7.7761, + "step": 13800 + }, + { + "epoch": 2.4820466786355477, + "grad_norm": 6.458791732788086, + "learning_rate": 2.9771633752244165e-05, + "loss": 7.7374, + "step": 13825 + }, + { + "epoch": 2.4865350089766607, + "grad_norm": 11.393868446350098, + "learning_rate": 2.9825493716337524e-05, + "loss": 7.7328, + "step": 13850 + }, + { + "epoch": 2.4910233393177736, + "grad_norm": 8.051772117614746, + "learning_rate": 2.987935368043088e-05, + "loss": 7.7011, + "step": 13875 + }, + { + "epoch": 2.495511669658887, + "grad_norm": 6.906687259674072, + "learning_rate": 2.9933213644524238e-05, + "loss": 7.8543, + "step": 13900 + }, + { + "epoch": 2.5, + "grad_norm": 7.823049545288086, + "learning_rate": 2.9987073608617593e-05, + "loss": 7.8183, + "step": 13925 + }, + { + "epoch": 2.504488330341113, + "grad_norm": 6.613182544708252, + "learning_rate": 2.9995451825254338e-05, + "loss": 7.6341, + "step": 13950 + }, + { + "epoch": 2.5089766606822264, + "grad_norm": 5.790432453155518, + "learning_rate": 2.998946738479952e-05, + "loss": 7.7282, + "step": 13975 + }, + { + "epoch": 2.5134649910233393, + "grad_norm": 10.349187850952148, + "learning_rate": 2.9983482944344706e-05, + "loss": 7.7452, + "step": 14000 + }, + { + "epoch": 2.5179533213644523, + "grad_norm": 7.822238922119141, + "learning_rate": 2.997749850388989e-05, + "loss": 7.6573, + "step": 14025 + }, + { + "epoch": 2.5224416517055657, + 
"grad_norm": 7.314792633056641, + "learning_rate": 2.997151406343507e-05, + "loss": 7.6757, + "step": 14050 + }, + { + "epoch": 2.5269299820466786, + "grad_norm": 6.26696252822876, + "learning_rate": 2.9965529622980253e-05, + "loss": 7.6993, + "step": 14075 + }, + { + "epoch": 2.5314183123877916, + "grad_norm": 6.675545692443848, + "learning_rate": 2.9959545182525436e-05, + "loss": 7.7256, + "step": 14100 + }, + { + "epoch": 2.535906642728905, + "grad_norm": 8.998224258422852, + "learning_rate": 2.9953560742070615e-05, + "loss": 7.7828, + "step": 14125 + }, + { + "epoch": 2.540394973070018, + "grad_norm": 8.595346450805664, + "learning_rate": 2.99475763016158e-05, + "loss": 7.6516, + "step": 14150 + }, + { + "epoch": 2.5448833034111313, + "grad_norm": 9.375449180603027, + "learning_rate": 2.9941591861160983e-05, + "loss": 7.5735, + "step": 14175 + }, + { + "epoch": 2.5493716337522443, + "grad_norm": 6.4031147956848145, + "learning_rate": 2.9935607420706165e-05, + "loss": 7.6668, + "step": 14200 + }, + { + "epoch": 2.5538599640933572, + "grad_norm": 6.588226318359375, + "learning_rate": 2.9929622980251347e-05, + "loss": 7.7363, + "step": 14225 + }, + { + "epoch": 2.55834829443447, + "grad_norm": 9.299864768981934, + "learning_rate": 2.992363853979653e-05, + "loss": 7.7245, + "step": 14250 + }, + { + "epoch": 2.5628366247755836, + "grad_norm": 6.850192070007324, + "learning_rate": 2.9917654099341712e-05, + "loss": 7.6925, + "step": 14275 + }, + { + "epoch": 2.5673249551166966, + "grad_norm": 7.361215591430664, + "learning_rate": 2.9911669658886894e-05, + "loss": 7.6707, + "step": 14300 + }, + { + "epoch": 2.57181328545781, + "grad_norm": 7.464433193206787, + "learning_rate": 2.9905685218432077e-05, + "loss": 7.6283, + "step": 14325 + }, + { + "epoch": 2.576301615798923, + "grad_norm": 9.183905601501465, + "learning_rate": 2.989970077797726e-05, + "loss": 7.664, + "step": 14350 + }, + { + "epoch": 2.580789946140036, + "grad_norm": 6.961647033691406, + "learning_rate": 
2.989371633752244e-05, + "loss": 7.7543, + "step": 14375 + }, + { + "epoch": 2.585278276481149, + "grad_norm": 7.992125988006592, + "learning_rate": 2.9887731897067624e-05, + "loss": 7.6341, + "step": 14400 + }, + { + "epoch": 2.5897666068222622, + "grad_norm": 6.667788982391357, + "learning_rate": 2.988174745661281e-05, + "loss": 7.5993, + "step": 14425 + }, + { + "epoch": 2.594254937163375, + "grad_norm": 7.534845352172852, + "learning_rate": 2.9875763016157992e-05, + "loss": 7.5657, + "step": 14450 + }, + { + "epoch": 2.5987432675044886, + "grad_norm": 6.715420722961426, + "learning_rate": 2.9869778575703174e-05, + "loss": 7.4625, + "step": 14475 + }, + { + "epoch": 2.6032315978456015, + "grad_norm": 7.590976715087891, + "learning_rate": 2.9863794135248353e-05, + "loss": 7.4568, + "step": 14500 + }, + { + "epoch": 2.6077199281867145, + "grad_norm": 6.858713150024414, + "learning_rate": 2.9857809694793536e-05, + "loss": 7.7362, + "step": 14525 + }, + { + "epoch": 2.6122082585278275, + "grad_norm": 7.0056939125061035, + "learning_rate": 2.9851825254338718e-05, + "loss": 7.7414, + "step": 14550 + }, + { + "epoch": 2.616696588868941, + "grad_norm": 7.591677665710449, + "learning_rate": 2.9845840813883904e-05, + "loss": 7.6553, + "step": 14575 + }, + { + "epoch": 2.621184919210054, + "grad_norm": 6.870407581329346, + "learning_rate": 2.9839856373429086e-05, + "loss": 7.6198, + "step": 14600 + }, + { + "epoch": 2.625673249551167, + "grad_norm": 8.7166109085083, + "learning_rate": 2.983387193297427e-05, + "loss": 7.7387, + "step": 14625 + }, + { + "epoch": 2.63016157989228, + "grad_norm": 7.024152755737305, + "learning_rate": 2.982788749251945e-05, + "loss": 7.6441, + "step": 14650 + }, + { + "epoch": 2.634649910233393, + "grad_norm": 6.682816982269287, + "learning_rate": 2.9821903052064633e-05, + "loss": 7.5957, + "step": 14675 + }, + { + "epoch": 2.639138240574506, + "grad_norm": 7.677892684936523, + "learning_rate": 2.9815918611609812e-05, + "loss": 7.6139, + 
"step": 14700 + }, + { + "epoch": 2.6436265709156195, + "grad_norm": 7.016209125518799, + "learning_rate": 2.9809934171154998e-05, + "loss": 7.5603, + "step": 14725 + }, + { + "epoch": 2.6481149012567324, + "grad_norm": 6.340160846710205, + "learning_rate": 2.980394973070018e-05, + "loss": 7.6257, + "step": 14750 + }, + { + "epoch": 2.652603231597846, + "grad_norm": 7.138514995574951, + "learning_rate": 2.9797965290245363e-05, + "loss": 7.6018, + "step": 14775 + }, + { + "epoch": 2.657091561938959, + "grad_norm": 6.810099124908447, + "learning_rate": 2.9791980849790545e-05, + "loss": 7.666, + "step": 14800 + }, + { + "epoch": 2.6615798922800717, + "grad_norm": 6.687268257141113, + "learning_rate": 2.9785996409335727e-05, + "loss": 7.5617, + "step": 14825 + }, + { + "epoch": 2.6660682226211847, + "grad_norm": 7.780331611633301, + "learning_rate": 2.978001196888091e-05, + "loss": 7.5746, + "step": 14850 + }, + { + "epoch": 2.670556552962298, + "grad_norm": 7.038577079772949, + "learning_rate": 2.9774027528426095e-05, + "loss": 7.5841, + "step": 14875 + }, + { + "epoch": 2.675044883303411, + "grad_norm": 8.114256858825684, + "learning_rate": 2.9768043087971274e-05, + "loss": 7.4702, + "step": 14900 + }, + { + "epoch": 2.6795332136445245, + "grad_norm": 8.929420471191406, + "learning_rate": 2.9762058647516457e-05, + "loss": 7.7293, + "step": 14925 + }, + { + "epoch": 2.6840215439856374, + "grad_norm": 6.216207981109619, + "learning_rate": 2.975607420706164e-05, + "loss": 7.6067, + "step": 14950 + }, + { + "epoch": 2.6885098743267504, + "grad_norm": 6.578426837921143, + "learning_rate": 2.975008976660682e-05, + "loss": 7.54, + "step": 14975 + }, + { + "epoch": 2.6929982046678633, + "grad_norm": 9.777558326721191, + "learning_rate": 2.9744105326152007e-05, + "loss": 7.784, + "step": 15000 + }, + { + "epoch": 2.6974865350089767, + "grad_norm": 8.000293731689453, + "learning_rate": 2.973812088569719e-05, + "loss": 7.7382, + "step": 15025 + }, + { + "epoch": 
2.7019748653500897, + "grad_norm": 8.920495986938477, + "learning_rate": 2.9732136445242372e-05, + "loss": 7.705, + "step": 15050 + }, + { + "epoch": 2.706463195691203, + "grad_norm": 6.71304988861084, + "learning_rate": 2.9726152004787554e-05, + "loss": 7.5602, + "step": 15075 + }, + { + "epoch": 2.710951526032316, + "grad_norm": 7.6093902587890625, + "learning_rate": 2.9720167564332733e-05, + "loss": 7.8149, + "step": 15100 + }, + { + "epoch": 2.715439856373429, + "grad_norm": 8.298099517822266, + "learning_rate": 2.9714183123877916e-05, + "loss": 7.7458, + "step": 15125 + }, + { + "epoch": 2.719928186714542, + "grad_norm": 7.716296672821045, + "learning_rate": 2.97081986834231e-05, + "loss": 7.5016, + "step": 15150 + }, + { + "epoch": 2.7244165170556554, + "grad_norm": 7.7560906410217285, + "learning_rate": 2.9702214242968284e-05, + "loss": 7.6477, + "step": 15175 + }, + { + "epoch": 2.7289048473967683, + "grad_norm": 7.012542247772217, + "learning_rate": 2.9696229802513466e-05, + "loss": 7.4823, + "step": 15200 + }, + { + "epoch": 2.7333931777378817, + "grad_norm": 6.703278064727783, + "learning_rate": 2.969024536205865e-05, + "loss": 7.6941, + "step": 15225 + }, + { + "epoch": 2.7378815080789947, + "grad_norm": 6.44221305847168, + "learning_rate": 2.968426092160383e-05, + "loss": 7.5078, + "step": 15250 + }, + { + "epoch": 2.7423698384201076, + "grad_norm": 6.660685062408447, + "learning_rate": 2.9678276481149013e-05, + "loss": 7.667, + "step": 15275 + }, + { + "epoch": 2.7468581687612206, + "grad_norm": 6.1848530769348145, + "learning_rate": 2.9672292040694196e-05, + "loss": 7.5388, + "step": 15300 + }, + { + "epoch": 2.751346499102334, + "grad_norm": 6.673147678375244, + "learning_rate": 2.9666307600239378e-05, + "loss": 7.6454, + "step": 15325 + }, + { + "epoch": 2.755834829443447, + "grad_norm": 6.918528079986572, + "learning_rate": 2.966032315978456e-05, + "loss": 7.5487, + "step": 15350 + }, + { + "epoch": 2.7603231597845603, + "grad_norm": 
6.739360332489014, + "learning_rate": 2.9654338719329743e-05, + "loss": 7.5631, + "step": 15375 + }, + { + "epoch": 2.7648114901256733, + "grad_norm": 7.985766887664795, + "learning_rate": 2.9648354278874925e-05, + "loss": 7.627, + "step": 15400 + }, + { + "epoch": 2.7692998204667862, + "grad_norm": 8.058419227600098, + "learning_rate": 2.964236983842011e-05, + "loss": 7.4752, + "step": 15425 + }, + { + "epoch": 2.773788150807899, + "grad_norm": 6.3068318367004395, + "learning_rate": 2.9636385397965293e-05, + "loss": 7.5726, + "step": 15450 + }, + { + "epoch": 2.7782764811490126, + "grad_norm": 7.028278827667236, + "learning_rate": 2.9630400957510475e-05, + "loss": 7.4184, + "step": 15475 + }, + { + "epoch": 2.7827648114901256, + "grad_norm": 7.532712459564209, + "learning_rate": 2.9624416517055654e-05, + "loss": 7.4927, + "step": 15500 + }, + { + "epoch": 2.787253141831239, + "grad_norm": 8.19107437133789, + "learning_rate": 2.9618432076600837e-05, + "loss": 7.5127, + "step": 15525 + }, + { + "epoch": 2.791741472172352, + "grad_norm": 6.627571105957031, + "learning_rate": 2.961244763614602e-05, + "loss": 7.4476, + "step": 15550 + }, + { + "epoch": 2.796229802513465, + "grad_norm": 8.43055534362793, + "learning_rate": 2.9606463195691205e-05, + "loss": 7.5944, + "step": 15575 + }, + { + "epoch": 2.800718132854578, + "grad_norm": 6.267340660095215, + "learning_rate": 2.9600478755236387e-05, + "loss": 7.5197, + "step": 15600 + }, + { + "epoch": 2.8052064631956912, + "grad_norm": 6.287229537963867, + "learning_rate": 2.959449431478157e-05, + "loss": 7.6304, + "step": 15625 + }, + { + "epoch": 2.809694793536804, + "grad_norm": 7.106199741363525, + "learning_rate": 2.9588509874326752e-05, + "loss": 7.4954, + "step": 15650 + }, + { + "epoch": 2.8141831238779176, + "grad_norm": 6.698776721954346, + "learning_rate": 2.9582525433871934e-05, + "loss": 7.4139, + "step": 15675 + }, + { + "epoch": 2.8186714542190305, + "grad_norm": 6.346949577331543, + "learning_rate": 
2.9576540993417113e-05, + "loss": 7.5533, + "step": 15700 + }, + { + "epoch": 2.8231597845601435, + "grad_norm": 7.5319743156433105, + "learning_rate": 2.95705565529623e-05, + "loss": 7.5528, + "step": 15725 + }, + { + "epoch": 2.827648114901257, + "grad_norm": 8.526594161987305, + "learning_rate": 2.9564811490125676e-05, + "loss": 7.5624, + "step": 15750 + }, + { + "epoch": 2.83213644524237, + "grad_norm": 7.892963409423828, + "learning_rate": 2.955882704967086e-05, + "loss": 7.576, + "step": 15775 + }, + { + "epoch": 2.836624775583483, + "grad_norm": 7.362398624420166, + "learning_rate": 2.9552842609216038e-05, + "loss": 7.6494, + "step": 15800 + }, + { + "epoch": 2.841113105924596, + "grad_norm": 6.727048397064209, + "learning_rate": 2.954685816876122e-05, + "loss": 7.627, + "step": 15825 + }, + { + "epoch": 2.845601436265709, + "grad_norm": 7.223232269287109, + "learning_rate": 2.9540873728306402e-05, + "loss": 7.4952, + "step": 15850 + }, + { + "epoch": 2.850089766606822, + "grad_norm": 7.125131130218506, + "learning_rate": 2.9534889287851588e-05, + "loss": 7.8052, + "step": 15875 + }, + { + "epoch": 2.8545780969479355, + "grad_norm": 7.396420001983643, + "learning_rate": 2.952890484739677e-05, + "loss": 7.5438, + "step": 15900 + }, + { + "epoch": 2.8590664272890485, + "grad_norm": 6.939256191253662, + "learning_rate": 2.9522920406941953e-05, + "loss": 7.5114, + "step": 15925 + }, + { + "epoch": 2.8635547576301614, + "grad_norm": 8.820035934448242, + "learning_rate": 2.9516935966487135e-05, + "loss": 7.3979, + "step": 15950 + }, + { + "epoch": 2.868043087971275, + "grad_norm": 8.734342575073242, + "learning_rate": 2.9510951526032317e-05, + "loss": 7.5238, + "step": 15975 + }, + { + "epoch": 2.872531418312388, + "grad_norm": 9.484976768493652, + "learning_rate": 2.9504967085577496e-05, + "loss": 7.4569, + "step": 16000 + }, + { + "epoch": 2.8770197486535007, + "grad_norm": 6.883359432220459, + "learning_rate": 2.9498982645122682e-05, + "loss": 7.5418, + "step": 
16025 + }, + { + "epoch": 2.881508078994614, + "grad_norm": 7.105976104736328, + "learning_rate": 2.9492998204667865e-05, + "loss": 7.468, + "step": 16050 + }, + { + "epoch": 2.885996409335727, + "grad_norm": 7.26230001449585, + "learning_rate": 2.9487013764213047e-05, + "loss": 7.4957, + "step": 16075 + }, + { + "epoch": 2.89048473967684, + "grad_norm": 9.87120246887207, + "learning_rate": 2.948102932375823e-05, + "loss": 7.4323, + "step": 16100 + }, + { + "epoch": 2.8949730700179535, + "grad_norm": 7.379208564758301, + "learning_rate": 2.947504488330341e-05, + "loss": 7.4971, + "step": 16125 + }, + { + "epoch": 2.8994614003590664, + "grad_norm": 5.963828086853027, + "learning_rate": 2.9469060442848594e-05, + "loss": 7.4699, + "step": 16150 + }, + { + "epoch": 2.9039497307001794, + "grad_norm": 7.145722389221191, + "learning_rate": 2.946307600239378e-05, + "loss": 7.5006, + "step": 16175 + }, + { + "epoch": 2.9084380610412928, + "grad_norm": 7.613919734954834, + "learning_rate": 2.945709156193896e-05, + "loss": 7.5088, + "step": 16200 + }, + { + "epoch": 2.9129263913824057, + "grad_norm": 7.519087314605713, + "learning_rate": 2.945110712148414e-05, + "loss": 7.5229, + "step": 16225 + }, + { + "epoch": 2.917414721723519, + "grad_norm": 6.753184795379639, + "learning_rate": 2.9445122681029323e-05, + "loss": 7.582, + "step": 16250 + }, + { + "epoch": 2.921903052064632, + "grad_norm": 8.541437149047852, + "learning_rate": 2.9439138240574506e-05, + "loss": 7.4904, + "step": 16275 + }, + { + "epoch": 2.926391382405745, + "grad_norm": 8.57909107208252, + "learning_rate": 2.943315380011969e-05, + "loss": 7.454, + "step": 16300 + }, + { + "epoch": 2.930879712746858, + "grad_norm": 9.547202110290527, + "learning_rate": 2.9427169359664874e-05, + "loss": 7.3797, + "step": 16325 + }, + { + "epoch": 2.9353680430879714, + "grad_norm": 6.347957134246826, + "learning_rate": 2.9421184919210056e-05, + "loss": 7.4636, + "step": 16350 + }, + { + "epoch": 2.9398563734290843, + 
"grad_norm": 6.824816703796387, + "learning_rate": 2.941520047875524e-05, + "loss": 7.5229, + "step": 16375 + }, + { + "epoch": 2.9443447037701977, + "grad_norm": 6.776854991912842, + "learning_rate": 2.9409216038300418e-05, + "loss": 7.5134, + "step": 16400 + }, + { + "epoch": 2.9488330341113107, + "grad_norm": 7.597510814666748, + "learning_rate": 2.94032315978456e-05, + "loss": 7.4637, + "step": 16425 + }, + { + "epoch": 2.9533213644524237, + "grad_norm": 6.244918346405029, + "learning_rate": 2.9397247157390786e-05, + "loss": 7.444, + "step": 16450 + }, + { + "epoch": 2.9578096947935366, + "grad_norm": 9.678526878356934, + "learning_rate": 2.9391262716935968e-05, + "loss": 7.5253, + "step": 16475 + }, + { + "epoch": 2.96229802513465, + "grad_norm": 7.32902717590332, + "learning_rate": 2.938527827648115e-05, + "loss": 7.3744, + "step": 16500 + }, + { + "epoch": 2.966786355475763, + "grad_norm": 6.690052032470703, + "learning_rate": 2.9379293836026333e-05, + "loss": 7.4117, + "step": 16525 + }, + { + "epoch": 2.9712746858168764, + "grad_norm": 6.51105260848999, + "learning_rate": 2.9373309395571515e-05, + "loss": 7.5549, + "step": 16550 + }, + { + "epoch": 2.9757630161579893, + "grad_norm": 6.685486793518066, + "learning_rate": 2.9367324955116697e-05, + "loss": 7.5918, + "step": 16575 + }, + { + "epoch": 2.9802513464991023, + "grad_norm": 8.500784873962402, + "learning_rate": 2.936134051466188e-05, + "loss": 7.465, + "step": 16600 + }, + { + "epoch": 2.9847396768402152, + "grad_norm": 6.900765895843506, + "learning_rate": 2.9355356074207062e-05, + "loss": 7.3257, + "step": 16625 + }, + { + "epoch": 2.9892280071813286, + "grad_norm": 6.525889873504639, + "learning_rate": 2.9349371633752245e-05, + "loss": 7.5289, + "step": 16650 + }, + { + "epoch": 2.9937163375224416, + "grad_norm": 6.766386985778809, + "learning_rate": 2.9343387193297427e-05, + "loss": 7.3485, + "step": 16675 + }, + { + "epoch": 2.998204667863555, + "grad_norm": 8.288542747497559, + 
"learning_rate": 2.933740275284261e-05, + "loss": 7.5895, + "step": 16700 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.0483837091631184, + "eval_f1_macro": 0.0007445174662428825, + "eval_f1_micro": 0.0483837091631184, + "eval_f1_weighted": 0.014085373613459682, + "eval_loss": 7.798386096954346, + "eval_precision_macro": 0.000664283723731455, + "eval_precision_micro": 0.0483837091631184, + "eval_precision_weighted": 0.01047651030199422, + "eval_recall_macro": 0.0024577863735170047, + "eval_recall_micro": 0.0483837091631184, + "eval_recall_weighted": 0.0483837091631184, + "eval_runtime": 63.8157, + "eval_samples_per_second": 820.691, + "eval_steps_per_second": 25.652, + "step": 16710 + }, + { + "epoch": 3.002692998204668, + "grad_norm": 6.181105136871338, + "learning_rate": 2.933141831238779e-05, + "loss": 7.2644, + "step": 16725 + }, + { + "epoch": 3.007181328545781, + "grad_norm": 6.055491924285889, + "learning_rate": 2.9325433871932977e-05, + "loss": 7.1285, + "step": 16750 + }, + { + "epoch": 3.011669658886894, + "grad_norm": 7.069460868835449, + "learning_rate": 2.931944943147816e-05, + "loss": 7.0183, + "step": 16775 + }, + { + "epoch": 3.0161579892280073, + "grad_norm": 8.465182304382324, + "learning_rate": 2.931346499102334e-05, + "loss": 7.2051, + "step": 16800 + }, + { + "epoch": 3.02064631956912, + "grad_norm": 7.332658767700195, + "learning_rate": 2.930748055056852e-05, + "loss": 7.0684, + "step": 16825 + }, + { + "epoch": 3.025134649910233, + "grad_norm": 6.466104030609131, + "learning_rate": 2.9301496110113703e-05, + "loss": 7.2123, + "step": 16850 + }, + { + "epoch": 3.0296229802513466, + "grad_norm": 6.7026824951171875, + "learning_rate": 2.929551166965889e-05, + "loss": 7.1266, + "step": 16875 + }, + { + "epoch": 3.0341113105924595, + "grad_norm": 10.303590774536133, + "learning_rate": 2.9289766606822263e-05, + "loss": 7.2009, + "step": 16900 + }, + { + "epoch": 3.0385996409335725, + "grad_norm": 7.550820827484131, + "learning_rate": 
2.9283782166367445e-05, + "loss": 7.1726, + "step": 16925 + }, + { + "epoch": 3.043087971274686, + "grad_norm": 6.654430866241455, + "learning_rate": 2.9277797725912628e-05, + "loss": 7.1373, + "step": 16950 + }, + { + "epoch": 3.047576301615799, + "grad_norm": 7.592336177825928, + "learning_rate": 2.927181328545781e-05, + "loss": 7.3328, + "step": 16975 + }, + { + "epoch": 3.0520646319569122, + "grad_norm": 7.304866313934326, + "learning_rate": 2.9265828845002992e-05, + "loss": 7.1406, + "step": 17000 + }, + { + "epoch": 3.056552962298025, + "grad_norm": 6.7539191246032715, + "learning_rate": 2.9259844404548175e-05, + "loss": 7.191, + "step": 17025 + }, + { + "epoch": 3.061041292639138, + "grad_norm": 7.2380900382995605, + "learning_rate": 2.925385996409336e-05, + "loss": 7.1155, + "step": 17050 + }, + { + "epoch": 3.0655296229802516, + "grad_norm": 8.266874313354492, + "learning_rate": 2.9247875523638543e-05, + "loss": 7.3668, + "step": 17075 + }, + { + "epoch": 3.0700179533213645, + "grad_norm": 6.841050148010254, + "learning_rate": 2.9241891083183722e-05, + "loss": 7.2012, + "step": 17100 + }, + { + "epoch": 3.0745062836624775, + "grad_norm": 6.9477081298828125, + "learning_rate": 2.9235906642728904e-05, + "loss": 7.2151, + "step": 17125 + }, + { + "epoch": 3.078994614003591, + "grad_norm": 6.552822589874268, + "learning_rate": 2.9229922202274086e-05, + "loss": 7.0859, + "step": 17150 + }, + { + "epoch": 3.083482944344704, + "grad_norm": 6.934198379516602, + "learning_rate": 2.922393776181927e-05, + "loss": 7.1495, + "step": 17175 + }, + { + "epoch": 3.087971274685817, + "grad_norm": 8.959328651428223, + "learning_rate": 2.9217953321364455e-05, + "loss": 7.1979, + "step": 17200 + }, + { + "epoch": 3.09245960502693, + "grad_norm": 6.886142253875732, + "learning_rate": 2.9211968880909637e-05, + "loss": 7.2048, + "step": 17225 + }, + { + "epoch": 3.096947935368043, + "grad_norm": 6.61863374710083, + "learning_rate": 2.920598444045482e-05, + "loss": 7.1999, + 
"step": 17250 + }, + { + "epoch": 3.101436265709156, + "grad_norm": 6.796585559844971, + "learning_rate": 2.92e-05, + "loss": 7.2774, + "step": 17275 + }, + { + "epoch": 3.1059245960502695, + "grad_norm": 7.451228141784668, + "learning_rate": 2.919401555954518e-05, + "loss": 7.0551, + "step": 17300 + }, + { + "epoch": 3.1104129263913824, + "grad_norm": 6.851739406585693, + "learning_rate": 2.9188031119090366e-05, + "loss": 7.2146, + "step": 17325 + }, + { + "epoch": 3.1149012567324954, + "grad_norm": 8.741716384887695, + "learning_rate": 2.918204667863555e-05, + "loss": 7.2656, + "step": 17350 + }, + { + "epoch": 3.119389587073609, + "grad_norm": 6.786454677581787, + "learning_rate": 2.917606223818073e-05, + "loss": 7.1899, + "step": 17375 + }, + { + "epoch": 3.1238779174147218, + "grad_norm": 8.289685249328613, + "learning_rate": 2.9170077797725913e-05, + "loss": 7.0956, + "step": 17400 + }, + { + "epoch": 3.1283662477558347, + "grad_norm": 7.628365993499756, + "learning_rate": 2.9164093357271096e-05, + "loss": 7.197, + "step": 17425 + }, + { + "epoch": 3.132854578096948, + "grad_norm": 7.729140281677246, + "learning_rate": 2.9158108916816278e-05, + "loss": 7.1585, + "step": 17450 + }, + { + "epoch": 3.137342908438061, + "grad_norm": 9.198493957519531, + "learning_rate": 2.9152124476361464e-05, + "loss": 7.1833, + "step": 17475 + }, + { + "epoch": 3.141831238779174, + "grad_norm": 7.035275936126709, + "learning_rate": 2.9146140035906643e-05, + "loss": 7.2625, + "step": 17500 + }, + { + "epoch": 3.1463195691202874, + "grad_norm": 6.9845781326293945, + "learning_rate": 2.9140155595451825e-05, + "loss": 7.0921, + "step": 17525 + }, + { + "epoch": 3.1508078994614004, + "grad_norm": 7.468381404876709, + "learning_rate": 2.9134171154997008e-05, + "loss": 7.337, + "step": 17550 + }, + { + "epoch": 3.1552962298025133, + "grad_norm": 7.1641411781311035, + "learning_rate": 2.912818671454219e-05, + "loss": 7.1587, + "step": 17575 + }, + { + "epoch": 3.1597845601436267, + 
"grad_norm": 8.242494583129883, + "learning_rate": 2.9122202274087372e-05, + "loss": 7.0861, + "step": 17600 + }, + { + "epoch": 3.1642728904847397, + "grad_norm": 6.048182487487793, + "learning_rate": 2.9116217833632558e-05, + "loss": 7.2554, + "step": 17625 + }, + { + "epoch": 3.1687612208258527, + "grad_norm": 6.880562782287598, + "learning_rate": 2.911023339317774e-05, + "loss": 7.1397, + "step": 17650 + }, + { + "epoch": 3.173249551166966, + "grad_norm": 7.657176494598389, + "learning_rate": 2.9104248952722923e-05, + "loss": 7.0984, + "step": 17675 + }, + { + "epoch": 3.177737881508079, + "grad_norm": 8.03272819519043, + "learning_rate": 2.9098264512268102e-05, + "loss": 7.1662, + "step": 17700 + }, + { + "epoch": 3.182226211849192, + "grad_norm": 6.846374988555908, + "learning_rate": 2.9092280071813284e-05, + "loss": 7.0366, + "step": 17725 + }, + { + "epoch": 3.1867145421903054, + "grad_norm": 7.520384788513184, + "learning_rate": 2.908629563135847e-05, + "loss": 7.1507, + "step": 17750 + }, + { + "epoch": 3.1912028725314183, + "grad_norm": 6.817747592926025, + "learning_rate": 2.9080311190903652e-05, + "loss": 7.0069, + "step": 17775 + }, + { + "epoch": 3.1956912028725313, + "grad_norm": 8.757506370544434, + "learning_rate": 2.9074326750448835e-05, + "loss": 7.0734, + "step": 17800 + }, + { + "epoch": 3.2001795332136447, + "grad_norm": 6.478495121002197, + "learning_rate": 2.9068342309994017e-05, + "loss": 7.1275, + "step": 17825 + }, + { + "epoch": 3.2046678635547576, + "grad_norm": 8.090387344360352, + "learning_rate": 2.90623578695392e-05, + "loss": 7.0875, + "step": 17850 + }, + { + "epoch": 3.2091561938958706, + "grad_norm": 7.626211643218994, + "learning_rate": 2.905637342908438e-05, + "loss": 7.1643, + "step": 17875 + }, + { + "epoch": 3.213644524236984, + "grad_norm": 6.834586143493652, + "learning_rate": 2.9050388988629564e-05, + "loss": 6.9993, + "step": 17900 + }, + { + "epoch": 3.218132854578097, + "grad_norm": 7.593177795410156, + 
"learning_rate": 2.9044404548174746e-05, + "loss": 6.9978, + "step": 17925 + }, + { + "epoch": 3.22262118491921, + "grad_norm": 7.460072040557861, + "learning_rate": 2.903842010771993e-05, + "loss": 7.0102, + "step": 17950 + }, + { + "epoch": 3.2271095152603233, + "grad_norm": 7.657380104064941, + "learning_rate": 2.903243566726511e-05, + "loss": 7.1999, + "step": 17975 + }, + { + "epoch": 3.2315978456014363, + "grad_norm": 7.33091926574707, + "learning_rate": 2.9026451226810293e-05, + "loss": 7.1684, + "step": 18000 + }, + { + "epoch": 3.236086175942549, + "grad_norm": 7.479068756103516, + "learning_rate": 2.9020466786355476e-05, + "loss": 7.1609, + "step": 18025 + }, + { + "epoch": 3.2405745062836626, + "grad_norm": 7.54212760925293, + "learning_rate": 2.901448234590066e-05, + "loss": 7.0062, + "step": 18050 + }, + { + "epoch": 3.2450628366247756, + "grad_norm": 8.16163158416748, + "learning_rate": 2.900849790544584e-05, + "loss": 7.1665, + "step": 18075 + }, + { + "epoch": 3.2495511669658885, + "grad_norm": 7.243312835693359, + "learning_rate": 2.9002513464991023e-05, + "loss": 7.1014, + "step": 18100 + }, + { + "epoch": 3.254039497307002, + "grad_norm": 6.587636470794678, + "learning_rate": 2.8996529024536205e-05, + "loss": 6.9404, + "step": 18125 + }, + { + "epoch": 3.258527827648115, + "grad_norm": 6.984015464782715, + "learning_rate": 2.8990544584081388e-05, + "loss": 7.145, + "step": 18150 + }, + { + "epoch": 3.263016157989228, + "grad_norm": 7.4421491622924805, + "learning_rate": 2.8984560143626573e-05, + "loss": 7.1385, + "step": 18175 + }, + { + "epoch": 3.2675044883303412, + "grad_norm": 7.318452835083008, + "learning_rate": 2.8978575703171756e-05, + "loss": 7.0938, + "step": 18200 + }, + { + "epoch": 3.271992818671454, + "grad_norm": 6.893324851989746, + "learning_rate": 2.8972591262716938e-05, + "loss": 7.143, + "step": 18225 + }, + { + "epoch": 3.276481149012567, + "grad_norm": 6.5367350578308105, + "learning_rate": 2.896660682226212e-05, + "loss": 
7.042, + "step": 18250 + }, + { + "epoch": 3.2809694793536806, + "grad_norm": 6.635975360870361, + "learning_rate": 2.89606223818073e-05, + "loss": 7.2446, + "step": 18275 + }, + { + "epoch": 3.2854578096947935, + "grad_norm": 6.438441753387451, + "learning_rate": 2.8954637941352482e-05, + "loss": 7.049, + "step": 18300 + }, + { + "epoch": 3.2899461400359065, + "grad_norm": 7.076288223266602, + "learning_rate": 2.8948653500897668e-05, + "loss": 6.9501, + "step": 18325 + }, + { + "epoch": 3.29443447037702, + "grad_norm": 7.758124351501465, + "learning_rate": 2.894266906044285e-05, + "loss": 7.0114, + "step": 18350 + }, + { + "epoch": 3.298922800718133, + "grad_norm": 8.359841346740723, + "learning_rate": 2.8936684619988032e-05, + "loss": 7.036, + "step": 18375 + }, + { + "epoch": 3.3034111310592458, + "grad_norm": 7.979954242706299, + "learning_rate": 2.8930700179533215e-05, + "loss": 7.1878, + "step": 18400 + }, + { + "epoch": 3.307899461400359, + "grad_norm": 7.291618824005127, + "learning_rate": 2.8924715739078397e-05, + "loss": 7.1235, + "step": 18425 + }, + { + "epoch": 3.312387791741472, + "grad_norm": 7.5362749099731445, + "learning_rate": 2.891873129862358e-05, + "loss": 7.0704, + "step": 18450 + }, + { + "epoch": 3.316876122082585, + "grad_norm": 6.864318370819092, + "learning_rate": 2.891274685816876e-05, + "loss": 7.2538, + "step": 18475 + }, + { + "epoch": 3.3213644524236985, + "grad_norm": 6.837441444396973, + "learning_rate": 2.8906762417713944e-05, + "loss": 7.0094, + "step": 18500 + }, + { + "epoch": 3.3258527827648114, + "grad_norm": 6.450615882873535, + "learning_rate": 2.8900777977259126e-05, + "loss": 7.0877, + "step": 18525 + }, + { + "epoch": 3.3303411131059244, + "grad_norm": 7.295813083648682, + "learning_rate": 2.889479353680431e-05, + "loss": 6.9931, + "step": 18550 + }, + { + "epoch": 3.334829443447038, + "grad_norm": 6.6612701416015625, + "learning_rate": 2.888880909634949e-05, + "loss": 7.1311, + "step": 18575 + }, + { + "epoch": 
3.3393177737881508, + "grad_norm": 8.442220687866211, + "learning_rate": 2.8882824655894673e-05, + "loss": 7.131, + "step": 18600 + }, + { + "epoch": 3.343806104129264, + "grad_norm": 6.843059539794922, + "learning_rate": 2.887684021543986e-05, + "loss": 7.1399, + "step": 18625 + }, + { + "epoch": 3.348294434470377, + "grad_norm": 7.230602264404297, + "learning_rate": 2.887085577498504e-05, + "loss": 7.0442, + "step": 18650 + }, + { + "epoch": 3.35278276481149, + "grad_norm": 6.832264423370361, + "learning_rate": 2.886487133453022e-05, + "loss": 7.1536, + "step": 18675 + }, + { + "epoch": 3.357271095152603, + "grad_norm": 7.193400859832764, + "learning_rate": 2.8858886894075403e-05, + "loss": 6.9913, + "step": 18700 + }, + { + "epoch": 3.3617594254937164, + "grad_norm": 7.238905429840088, + "learning_rate": 2.8852902453620585e-05, + "loss": 7.24, + "step": 18725 + }, + { + "epoch": 3.3662477558348294, + "grad_norm": 7.454446315765381, + "learning_rate": 2.884691801316577e-05, + "loss": 7.239, + "step": 18750 + }, + { + "epoch": 3.370736086175943, + "grad_norm": 7.0394673347473145, + "learning_rate": 2.8840933572710953e-05, + "loss": 7.1517, + "step": 18775 + }, + { + "epoch": 3.3752244165170557, + "grad_norm": 6.319003105163574, + "learning_rate": 2.8834949132256136e-05, + "loss": 7.1325, + "step": 18800 + }, + { + "epoch": 3.3797127468581687, + "grad_norm": 7.538498401641846, + "learning_rate": 2.8828964691801318e-05, + "loss": 6.9927, + "step": 18825 + }, + { + "epoch": 3.3842010771992816, + "grad_norm": 7.460974216461182, + "learning_rate": 2.88229802513465e-05, + "loss": 7.2057, + "step": 18850 + }, + { + "epoch": 3.388689407540395, + "grad_norm": 7.711810111999512, + "learning_rate": 2.881699581089168e-05, + "loss": 7.0188, + "step": 18875 + }, + { + "epoch": 3.393177737881508, + "grad_norm": 7.958193302154541, + "learning_rate": 2.8811011370436865e-05, + "loss": 6.9109, + "step": 18900 + }, + { + "epoch": 3.3976660682226214, + "grad_norm": 7.468191623687744, 
+ "learning_rate": 2.8805026929982048e-05, + "loss": 7.1899, + "step": 18925 + }, + { + "epoch": 3.4021543985637344, + "grad_norm": 7.024964809417725, + "learning_rate": 2.879904248952723e-05, + "loss": 7.1093, + "step": 18950 + }, + { + "epoch": 3.4066427289048473, + "grad_norm": 7.367966175079346, + "learning_rate": 2.8793058049072412e-05, + "loss": 7.1538, + "step": 18975 + }, + { + "epoch": 3.4111310592459603, + "grad_norm": 6.572576999664307, + "learning_rate": 2.8787073608617595e-05, + "loss": 7.1118, + "step": 19000 + }, + { + "epoch": 3.4156193895870737, + "grad_norm": 8.588370323181152, + "learning_rate": 2.8781089168162777e-05, + "loss": 7.0842, + "step": 19025 + }, + { + "epoch": 3.4201077199281866, + "grad_norm": 6.414211273193359, + "learning_rate": 2.8775104727707963e-05, + "loss": 7.0132, + "step": 19050 + }, + { + "epoch": 3.4245960502693, + "grad_norm": 6.971017837524414, + "learning_rate": 2.8769120287253142e-05, + "loss": 7.1107, + "step": 19075 + }, + { + "epoch": 3.429084380610413, + "grad_norm": 7.74641227722168, + "learning_rate": 2.8763135846798324e-05, + "loss": 7.0274, + "step": 19100 + }, + { + "epoch": 3.433572710951526, + "grad_norm": 8.112064361572266, + "learning_rate": 2.8757151406343506e-05, + "loss": 7.0564, + "step": 19125 + }, + { + "epoch": 3.438061041292639, + "grad_norm": 8.435543060302734, + "learning_rate": 2.875116696588869e-05, + "loss": 6.8954, + "step": 19150 + }, + { + "epoch": 3.4425493716337523, + "grad_norm": 7.997531890869141, + "learning_rate": 2.8745182525433875e-05, + "loss": 6.8573, + "step": 19175 + }, + { + "epoch": 3.4470377019748653, + "grad_norm": 8.383268356323242, + "learning_rate": 2.8739198084979057e-05, + "loss": 6.9988, + "step": 19200 + }, + { + "epoch": 3.4515260323159787, + "grad_norm": 7.796001434326172, + "learning_rate": 2.873321364452424e-05, + "loss": 7.1539, + "step": 19225 + }, + { + "epoch": 3.4560143626570916, + "grad_norm": 7.11810302734375, + "learning_rate": 2.872722920406942e-05, + 
"loss": 7.0873, + "step": 19250 + }, + { + "epoch": 3.4605026929982046, + "grad_norm": 7.225034236907959, + "learning_rate": 2.87212447636146e-05, + "loss": 7.0984, + "step": 19275 + }, + { + "epoch": 3.464991023339318, + "grad_norm": 7.26684045791626, + "learning_rate": 2.8715260323159783e-05, + "loss": 7.1588, + "step": 19300 + }, + { + "epoch": 3.469479353680431, + "grad_norm": 9.374153137207031, + "learning_rate": 2.870927588270497e-05, + "loss": 7.1909, + "step": 19325 + }, + { + "epoch": 3.473967684021544, + "grad_norm": 7.074808597564697, + "learning_rate": 2.870329144225015e-05, + "loss": 7.0787, + "step": 19350 + }, + { + "epoch": 3.4784560143626573, + "grad_norm": 9.705293655395508, + "learning_rate": 2.8697307001795333e-05, + "loss": 7.1122, + "step": 19375 + }, + { + "epoch": 3.4829443447037702, + "grad_norm": 7.2829909324646, + "learning_rate": 2.8691322561340516e-05, + "loss": 7.2195, + "step": 19400 + }, + { + "epoch": 3.487432675044883, + "grad_norm": 7.057278633117676, + "learning_rate": 2.8685338120885698e-05, + "loss": 6.9548, + "step": 19425 + }, + { + "epoch": 3.4919210053859966, + "grad_norm": 6.930963516235352, + "learning_rate": 2.867935368043088e-05, + "loss": 7.0087, + "step": 19450 + }, + { + "epoch": 3.4964093357271095, + "grad_norm": 6.901101589202881, + "learning_rate": 2.8673369239976063e-05, + "loss": 6.9563, + "step": 19475 + }, + { + "epoch": 3.5008976660682225, + "grad_norm": 6.890328884124756, + "learning_rate": 2.8667384799521245e-05, + "loss": 7.1293, + "step": 19500 + }, + { + "epoch": 3.505385996409336, + "grad_norm": 8.11733627319336, + "learning_rate": 2.8661400359066428e-05, + "loss": 7.0148, + "step": 19525 + }, + { + "epoch": 3.509874326750449, + "grad_norm": 7.831202030181885, + "learning_rate": 2.865541591861161e-05, + "loss": 7.0639, + "step": 19550 + }, + { + "epoch": 3.514362657091562, + "grad_norm": 7.351139545440674, + "learning_rate": 2.8649431478156792e-05, + "loss": 7.2239, + "step": 19575 + }, + { + "epoch": 
3.5188509874326748, + "grad_norm": 7.047708034515381, + "learning_rate": 2.8643447037701978e-05, + "loss": 7.0251, + "step": 19600 + }, + { + "epoch": 3.523339317773788, + "grad_norm": 6.914089202880859, + "learning_rate": 2.863746259724716e-05, + "loss": 7.1081, + "step": 19625 + }, + { + "epoch": 3.527827648114901, + "grad_norm": 7.985324382781982, + "learning_rate": 2.8631478156792343e-05, + "loss": 7.1152, + "step": 19650 + }, + { + "epoch": 3.5323159784560145, + "grad_norm": 8.047277450561523, + "learning_rate": 2.8625493716337522e-05, + "loss": 7.0303, + "step": 19675 + }, + { + "epoch": 3.5368043087971275, + "grad_norm": 7.602428436279297, + "learning_rate": 2.8619509275882704e-05, + "loss": 7.0749, + "step": 19700 + }, + { + "epoch": 3.5412926391382404, + "grad_norm": 10.015166282653809, + "learning_rate": 2.8613524835427886e-05, + "loss": 7.0005, + "step": 19725 + }, + { + "epoch": 3.545780969479354, + "grad_norm": 6.254261493682861, + "learning_rate": 2.8607540394973072e-05, + "loss": 7.1569, + "step": 19750 + }, + { + "epoch": 3.550269299820467, + "grad_norm": 8.156587600708008, + "learning_rate": 2.8601555954518255e-05, + "loss": 7.0548, + "step": 19775 + }, + { + "epoch": 3.5547576301615798, + "grad_norm": 8.717704772949219, + "learning_rate": 2.8595571514063437e-05, + "loss": 6.9859, + "step": 19800 + }, + { + "epoch": 3.559245960502693, + "grad_norm": 6.200718402862549, + "learning_rate": 2.858958707360862e-05, + "loss": 6.9057, + "step": 19825 + }, + { + "epoch": 3.563734290843806, + "grad_norm": 7.260859489440918, + "learning_rate": 2.8583602633153798e-05, + "loss": 6.9555, + "step": 19850 + }, + { + "epoch": 3.568222621184919, + "grad_norm": 6.9987030029296875, + "learning_rate": 2.857761819269898e-05, + "loss": 6.9213, + "step": 19875 + }, + { + "epoch": 3.5727109515260325, + "grad_norm": 8.615960121154785, + "learning_rate": 2.8571633752244166e-05, + "loss": 6.8765, + "step": 19900 + }, + { + "epoch": 3.5771992818671454, + "grad_norm": 
7.70880651473999, + "learning_rate": 2.856564931178935e-05, + "loss": 6.9945, + "step": 19925 + }, + { + "epoch": 3.5816876122082584, + "grad_norm": 8.917046546936035, + "learning_rate": 2.855966487133453e-05, + "loss": 7.0113, + "step": 19950 + }, + { + "epoch": 3.5861759425493718, + "grad_norm": 11.726383209228516, + "learning_rate": 2.8553680430879713e-05, + "loss": 6.9675, + "step": 19975 + }, + { + "epoch": 3.5906642728904847, + "grad_norm": 6.5339789390563965, + "learning_rate": 2.8547695990424896e-05, + "loss": 6.89, + "step": 20000 + }, + { + "epoch": 3.5951526032315977, + "grad_norm": 8.026484489440918, + "learning_rate": 2.8541711549970078e-05, + "loss": 7.0557, + "step": 20025 + }, + { + "epoch": 3.599640933572711, + "grad_norm": 7.631568908691406, + "learning_rate": 2.853572710951526e-05, + "loss": 6.9343, + "step": 20050 + }, + { + "epoch": 3.604129263913824, + "grad_norm": 7.820140361785889, + "learning_rate": 2.8529742669060443e-05, + "loss": 6.9143, + "step": 20075 + }, + { + "epoch": 3.608617594254937, + "grad_norm": 7.794486999511719, + "learning_rate": 2.8523758228605625e-05, + "loss": 6.9666, + "step": 20100 + }, + { + "epoch": 3.6131059245960504, + "grad_norm": 6.941494941711426, + "learning_rate": 2.8517773788150808e-05, + "loss": 6.9602, + "step": 20125 + }, + { + "epoch": 3.6175942549371634, + "grad_norm": 6.6765546798706055, + "learning_rate": 2.851178934769599e-05, + "loss": 6.9596, + "step": 20150 + }, + { + "epoch": 3.6220825852782763, + "grad_norm": 6.866279125213623, + "learning_rate": 2.8505804907241176e-05, + "loss": 6.9249, + "step": 20175 + }, + { + "epoch": 3.6265709156193897, + "grad_norm": 10.283491134643555, + "learning_rate": 2.8499820466786358e-05, + "loss": 7.0722, + "step": 20200 + }, + { + "epoch": 3.6310592459605027, + "grad_norm": 7.556947708129883, + "learning_rate": 2.8494075403949732e-05, + "loss": 7.1241, + "step": 20225 + }, + { + "epoch": 3.635547576301616, + "grad_norm": 7.785460472106934, + "learning_rate": 
2.8488090963494914e-05, + "loss": 7.033, + "step": 20250 + }, + { + "epoch": 3.640035906642729, + "grad_norm": 7.201344966888428, + "learning_rate": 2.8482106523040096e-05, + "loss": 6.9218, + "step": 20275 + }, + { + "epoch": 3.644524236983842, + "grad_norm": 7.7949419021606445, + "learning_rate": 2.847612208258528e-05, + "loss": 6.9288, + "step": 20300 + }, + { + "epoch": 3.649012567324955, + "grad_norm": 8.762776374816895, + "learning_rate": 2.847013764213046e-05, + "loss": 6.9655, + "step": 20325 + }, + { + "epoch": 3.6535008976660683, + "grad_norm": 7.752297401428223, + "learning_rate": 2.8464153201675647e-05, + "loss": 6.9158, + "step": 20350 + }, + { + "epoch": 3.6579892280071813, + "grad_norm": 8.468378067016602, + "learning_rate": 2.8458168761220826e-05, + "loss": 7.024, + "step": 20375 + }, + { + "epoch": 3.6624775583482947, + "grad_norm": 7.701879024505615, + "learning_rate": 2.8452184320766008e-05, + "loss": 6.9427, + "step": 20400 + }, + { + "epoch": 3.6669658886894076, + "grad_norm": 7.305758953094482, + "learning_rate": 2.844619988031119e-05, + "loss": 6.7874, + "step": 20425 + }, + { + "epoch": 3.6714542190305206, + "grad_norm": 7.31234073638916, + "learning_rate": 2.8440215439856373e-05, + "loss": 6.8153, + "step": 20450 + }, + { + "epoch": 3.6759425493716336, + "grad_norm": 7.735443592071533, + "learning_rate": 2.8434230999401555e-05, + "loss": 7.0205, + "step": 20475 + }, + { + "epoch": 3.680430879712747, + "grad_norm": 6.914196491241455, + "learning_rate": 2.842824655894674e-05, + "loss": 6.9143, + "step": 20500 + }, + { + "epoch": 3.68491921005386, + "grad_norm": 7.88019323348999, + "learning_rate": 2.8422262118491923e-05, + "loss": 6.9262, + "step": 20525 + }, + { + "epoch": 3.6894075403949733, + "grad_norm": 7.073838710784912, + "learning_rate": 2.8416277678037106e-05, + "loss": 6.9972, + "step": 20550 + }, + { + "epoch": 3.6938958707360863, + "grad_norm": 7.55523157119751, + "learning_rate": 2.8410293237582285e-05, + "loss": 6.9944, + 
"step": 20575 + }, + { + "epoch": 3.6983842010771992, + "grad_norm": 8.063077926635742, + "learning_rate": 2.8404308797127467e-05, + "loss": 7.0291, + "step": 20600 + }, + { + "epoch": 3.702872531418312, + "grad_norm": 8.415916442871094, + "learning_rate": 2.8398324356672653e-05, + "loss": 6.823, + "step": 20625 + }, + { + "epoch": 3.7073608617594256, + "grad_norm": 7.8589043617248535, + "learning_rate": 2.8392339916217835e-05, + "loss": 6.987, + "step": 20650 + }, + { + "epoch": 3.7118491921005385, + "grad_norm": 7.834930896759033, + "learning_rate": 2.8386355475763018e-05, + "loss": 7.0577, + "step": 20675 + }, + { + "epoch": 3.716337522441652, + "grad_norm": 14.277620315551758, + "learning_rate": 2.83803710353082e-05, + "loss": 6.9293, + "step": 20700 + }, + { + "epoch": 3.720825852782765, + "grad_norm": 7.484594821929932, + "learning_rate": 2.8374386594853382e-05, + "loss": 6.8929, + "step": 20725 + }, + { + "epoch": 3.725314183123878, + "grad_norm": 7.73836612701416, + "learning_rate": 2.8368402154398565e-05, + "loss": 6.8017, + "step": 20750 + }, + { + "epoch": 3.729802513464991, + "grad_norm": 7.8389973640441895, + "learning_rate": 2.8362417713943747e-05, + "loss": 6.9446, + "step": 20775 + }, + { + "epoch": 3.734290843806104, + "grad_norm": 7.71804666519165, + "learning_rate": 2.835643327348893e-05, + "loss": 6.8566, + "step": 20800 + }, + { + "epoch": 3.738779174147217, + "grad_norm": 7.073291301727295, + "learning_rate": 2.8350448833034112e-05, + "loss": 6.9063, + "step": 20825 + }, + { + "epoch": 3.7432675044883306, + "grad_norm": 6.997871398925781, + "learning_rate": 2.8344464392579294e-05, + "loss": 6.9914, + "step": 20850 + }, + { + "epoch": 3.7477558348294435, + "grad_norm": 7.012242317199707, + "learning_rate": 2.8338479952124477e-05, + "loss": 6.9063, + "step": 20875 + }, + { + "epoch": 3.7522441651705565, + "grad_norm": 8.348776817321777, + "learning_rate": 2.833249551166966e-05, + "loss": 7.0178, + "step": 20900 + }, + { + "epoch": 
3.7567324955116694, + "grad_norm": 7.556951522827148, + "learning_rate": 2.8326511071214845e-05, + "loss": 6.9504, + "step": 20925 + }, + { + "epoch": 3.761220825852783, + "grad_norm": 7.951843738555908, + "learning_rate": 2.8320526630760024e-05, + "loss": 6.99, + "step": 20950 + }, + { + "epoch": 3.765709156193896, + "grad_norm": 7.906277656555176, + "learning_rate": 2.8314542190305206e-05, + "loss": 6.9834, + "step": 20975 + }, + { + "epoch": 3.770197486535009, + "grad_norm": 9.433594703674316, + "learning_rate": 2.8308557749850388e-05, + "loss": 7.0077, + "step": 21000 + }, + { + "epoch": 3.774685816876122, + "grad_norm": 6.860331058502197, + "learning_rate": 2.830257330939557e-05, + "loss": 6.8534, + "step": 21025 + }, + { + "epoch": 3.779174147217235, + "grad_norm": 7.513708114624023, + "learning_rate": 2.8296588868940756e-05, + "loss": 6.9104, + "step": 21050 + }, + { + "epoch": 3.783662477558348, + "grad_norm": 7.581325531005859, + "learning_rate": 2.829060442848594e-05, + "loss": 6.8106, + "step": 21075 + }, + { + "epoch": 3.7881508078994615, + "grad_norm": 7.575826168060303, + "learning_rate": 2.828461998803112e-05, + "loss": 6.962, + "step": 21100 + }, + { + "epoch": 3.7926391382405744, + "grad_norm": 7.32150936126709, + "learning_rate": 2.8278635547576303e-05, + "loss": 6.9887, + "step": 21125 + }, + { + "epoch": 3.797127468581688, + "grad_norm": 7.946230888366699, + "learning_rate": 2.8272651107121482e-05, + "loss": 6.9768, + "step": 21150 + }, + { + "epoch": 3.8016157989228008, + "grad_norm": 7.794346809387207, + "learning_rate": 2.8266666666666665e-05, + "loss": 6.8552, + "step": 21175 + }, + { + "epoch": 3.8061041292639137, + "grad_norm": 7.753136157989502, + "learning_rate": 2.826068222621185e-05, + "loss": 6.9518, + "step": 21200 + }, + { + "epoch": 3.8105924596050267, + "grad_norm": 7.017697811126709, + "learning_rate": 2.8254697785757033e-05, + "loss": 6.8999, + "step": 21225 + }, + { + "epoch": 3.81508078994614, + "grad_norm": 7.187480926513672, 
+ "learning_rate": 2.8248713345302215e-05, + "loss": 6.9337, + "step": 21250 + }, + { + "epoch": 3.819569120287253, + "grad_norm": 7.079424858093262, + "learning_rate": 2.8242728904847398e-05, + "loss": 6.9846, + "step": 21275 + }, + { + "epoch": 3.8240574506283664, + "grad_norm": 8.313340187072754, + "learning_rate": 2.823674446439258e-05, + "loss": 6.7763, + "step": 21300 + }, + { + "epoch": 3.8285457809694794, + "grad_norm": 7.366817474365234, + "learning_rate": 2.8230760023937762e-05, + "loss": 7.0385, + "step": 21325 + }, + { + "epoch": 3.8330341113105924, + "grad_norm": 6.369441509246826, + "learning_rate": 2.8224775583482945e-05, + "loss": 6.8924, + "step": 21350 + }, + { + "epoch": 3.8375224416517053, + "grad_norm": 8.079645156860352, + "learning_rate": 2.8218791143028127e-05, + "loss": 6.9612, + "step": 21375 + }, + { + "epoch": 3.8420107719928187, + "grad_norm": 7.542420387268066, + "learning_rate": 2.821280670257331e-05, + "loss": 6.9283, + "step": 21400 + }, + { + "epoch": 3.8464991023339317, + "grad_norm": 7.496708393096924, + "learning_rate": 2.8206822262118492e-05, + "loss": 6.9003, + "step": 21425 + }, + { + "epoch": 3.850987432675045, + "grad_norm": 7.907470226287842, + "learning_rate": 2.8200837821663674e-05, + "loss": 6.898, + "step": 21450 + }, + { + "epoch": 3.855475763016158, + "grad_norm": 7.208642959594727, + "learning_rate": 2.819485338120886e-05, + "loss": 6.8568, + "step": 21475 + }, + { + "epoch": 3.859964093357271, + "grad_norm": 8.166616439819336, + "learning_rate": 2.8188868940754042e-05, + "loss": 6.9519, + "step": 21500 + }, + { + "epoch": 3.864452423698384, + "grad_norm": 7.573169231414795, + "learning_rate": 2.8182884500299225e-05, + "loss": 6.8117, + "step": 21525 + }, + { + "epoch": 3.8689407540394973, + "grad_norm": 8.239895820617676, + "learning_rate": 2.8176900059844404e-05, + "loss": 7.0659, + "step": 21550 + }, + { + "epoch": 3.8734290843806103, + "grad_norm": 8.46237564086914, + "learning_rate": 2.8170915619389586e-05, + 
"loss": 6.8703, + "step": 21575 + }, + { + "epoch": 3.8779174147217237, + "grad_norm": 7.154689788818359, + "learning_rate": 2.816493117893477e-05, + "loss": 6.9502, + "step": 21600 + }, + { + "epoch": 3.8824057450628366, + "grad_norm": 7.491994857788086, + "learning_rate": 2.8158946738479954e-05, + "loss": 7.0775, + "step": 21625 + }, + { + "epoch": 3.8868940754039496, + "grad_norm": 7.96471643447876, + "learning_rate": 2.8152962298025136e-05, + "loss": 6.9239, + "step": 21650 + }, + { + "epoch": 3.891382405745063, + "grad_norm": 7.348348140716553, + "learning_rate": 2.814697785757032e-05, + "loss": 6.9625, + "step": 21675 + }, + { + "epoch": 3.895870736086176, + "grad_norm": 7.290869235992432, + "learning_rate": 2.81409934171155e-05, + "loss": 6.7703, + "step": 21700 + }, + { + "epoch": 3.900359066427289, + "grad_norm": 6.796521186828613, + "learning_rate": 2.8135008976660684e-05, + "loss": 6.925, + "step": 21725 + }, + { + "epoch": 3.9048473967684023, + "grad_norm": 8.732086181640625, + "learning_rate": 2.8129024536205862e-05, + "loss": 7.0452, + "step": 21750 + }, + { + "epoch": 3.9093357271095153, + "grad_norm": 6.7342705726623535, + "learning_rate": 2.8123040095751048e-05, + "loss": 6.8945, + "step": 21775 + }, + { + "epoch": 3.9138240574506282, + "grad_norm": 7.23472261428833, + "learning_rate": 2.811705565529623e-05, + "loss": 6.8449, + "step": 21800 + }, + { + "epoch": 3.9183123877917416, + "grad_norm": 7.400080680847168, + "learning_rate": 2.8111071214841413e-05, + "loss": 6.8877, + "step": 21825 + }, + { + "epoch": 3.9228007181328546, + "grad_norm": 7.791996479034424, + "learning_rate": 2.8105086774386595e-05, + "loss": 6.8942, + "step": 21850 + }, + { + "epoch": 3.9272890484739675, + "grad_norm": 7.753776550292969, + "learning_rate": 2.8099102333931778e-05, + "loss": 6.8905, + "step": 21875 + }, + { + "epoch": 3.931777378815081, + "grad_norm": 8.601170539855957, + "learning_rate": 2.809311789347696e-05, + "loss": 6.824, + "step": 21900 + }, + { + 
"epoch": 3.936265709156194, + "grad_norm": 8.11901569366455, + "learning_rate": 2.8087133453022146e-05, + "loss": 6.9683, + "step": 21925 + }, + { + "epoch": 3.940754039497307, + "grad_norm": 8.078390121459961, + "learning_rate": 2.8081149012567325e-05, + "loss": 6.9971, + "step": 21950 + }, + { + "epoch": 3.9452423698384202, + "grad_norm": 7.662269115447998, + "learning_rate": 2.8075164572112507e-05, + "loss": 7.0095, + "step": 21975 + }, + { + "epoch": 3.949730700179533, + "grad_norm": 7.26335334777832, + "learning_rate": 2.806918013165769e-05, + "loss": 6.9519, + "step": 22000 + }, + { + "epoch": 3.954219030520646, + "grad_norm": 6.5853095054626465, + "learning_rate": 2.8063195691202872e-05, + "loss": 6.9645, + "step": 22025 + }, + { + "epoch": 3.9587073608617596, + "grad_norm": 7.800670146942139, + "learning_rate": 2.8057211250748058e-05, + "loss": 6.9242, + "step": 22050 + }, + { + "epoch": 3.9631956912028725, + "grad_norm": 7.28950309753418, + "learning_rate": 2.805122681029324e-05, + "loss": 6.8951, + "step": 22075 + }, + { + "epoch": 3.9676840215439855, + "grad_norm": 7.314763069152832, + "learning_rate": 2.8045242369838422e-05, + "loss": 6.9282, + "step": 22100 + }, + { + "epoch": 3.972172351885099, + "grad_norm": 7.631821155548096, + "learning_rate": 2.8039257929383605e-05, + "loss": 6.9839, + "step": 22125 + }, + { + "epoch": 3.976660682226212, + "grad_norm": 8.359884262084961, + "learning_rate": 2.8033273488928784e-05, + "loss": 7.0441, + "step": 22150 + }, + { + "epoch": 3.9811490125673252, + "grad_norm": 7.667264461517334, + "learning_rate": 2.8027289048473966e-05, + "loss": 6.8421, + "step": 22175 + }, + { + "epoch": 3.985637342908438, + "grad_norm": 8.020297050476074, + "learning_rate": 2.8021304608019152e-05, + "loss": 6.86, + "step": 22200 + }, + { + "epoch": 3.990125673249551, + "grad_norm": 8.721392631530762, + "learning_rate": 2.8015320167564334e-05, + "loss": 6.9686, + "step": 22225 + }, + { + "epoch": 3.994614003590664, + "grad_norm": 
9.793757438659668, + "learning_rate": 2.8009335727109516e-05, + "loss": 6.9584, + "step": 22250 + }, + { + "epoch": 3.9991023339317775, + "grad_norm": 8.73378849029541, + "learning_rate": 2.80033512866547e-05, + "loss": 6.813, + "step": 22275 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.0621503446432322, + "eval_f1_macro": 0.001573003625923118, + "eval_f1_micro": 0.0621503446432322, + "eval_f1_weighted": 0.022911889955766516, + "eval_loss": 7.3818678855896, + "eval_precision_macro": 0.0013788010800609369, + "eval_precision_micro": 0.0621503446432322, + "eval_precision_weighted": 0.017467422494501263, + "eval_recall_macro": 0.004115215681904829, + "eval_recall_micro": 0.0621503446432322, + "eval_recall_weighted": 0.0621503446432322, + "eval_runtime": 63.663, + "eval_samples_per_second": 822.66, + "eval_steps_per_second": 25.714, + "step": 22280 + }, + { + "epoch": 4.003590664272891, + "grad_norm": 7.510802268981934, + "learning_rate": 2.799736684619988e-05, + "loss": 6.6862, + "step": 22300 + }, + { + "epoch": 4.008078994614004, + "grad_norm": 7.000391483306885, + "learning_rate": 2.7991382405745064e-05, + "loss": 6.5317, + "step": 22325 + }, + { + "epoch": 4.012567324955117, + "grad_norm": 7.394598007202148, + "learning_rate": 2.7985397965290246e-05, + "loss": 6.5119, + "step": 22350 + }, + { + "epoch": 4.01705565529623, + "grad_norm": 7.768242835998535, + "learning_rate": 2.7979413524835428e-05, + "loss": 6.5102, + "step": 22375 + }, + { + "epoch": 4.021543985637343, + "grad_norm": 7.2971391677856445, + "learning_rate": 2.797342908438061e-05, + "loss": 6.7217, + "step": 22400 + }, + { + "epoch": 4.026032315978456, + "grad_norm": 6.618619441986084, + "learning_rate": 2.7967444643925793e-05, + "loss": 6.5892, + "step": 22425 + }, + { + "epoch": 4.0305206463195695, + "grad_norm": 6.977947235107422, + "learning_rate": 2.7961460203470975e-05, + "loss": 6.5264, + "step": 22450 + }, + { + "epoch": 4.0350089766606825, + "grad_norm": 6.7299652099609375, + "learning_rate": 
2.795547576301616e-05, + "loss": 6.4659, + "step": 22475 + }, + { + "epoch": 4.039497307001795, + "grad_norm": 7.742786407470703, + "learning_rate": 2.7949491322561343e-05, + "loss": 6.6379, + "step": 22500 + }, + { + "epoch": 4.043985637342908, + "grad_norm": 7.123066425323486, + "learning_rate": 2.7943506882106526e-05, + "loss": 6.7125, + "step": 22525 + }, + { + "epoch": 4.048473967684021, + "grad_norm": 7.6337785720825195, + "learning_rate": 2.7937522441651705e-05, + "loss": 6.6396, + "step": 22550 + }, + { + "epoch": 4.052962298025134, + "grad_norm": 7.019710540771484, + "learning_rate": 2.7931538001196887e-05, + "loss": 6.6239, + "step": 22575 + }, + { + "epoch": 4.057450628366248, + "grad_norm": 7.621551513671875, + "learning_rate": 2.792555356074207e-05, + "loss": 6.5592, + "step": 22600 + }, + { + "epoch": 4.061938958707361, + "grad_norm": 7.8987321853637695, + "learning_rate": 2.7919569120287255e-05, + "loss": 6.5333, + "step": 22625 + }, + { + "epoch": 4.066427289048474, + "grad_norm": 8.289762496948242, + "learning_rate": 2.7913584679832438e-05, + "loss": 6.4829, + "step": 22650 + }, + { + "epoch": 4.070915619389587, + "grad_norm": 7.7762579917907715, + "learning_rate": 2.790760023937762e-05, + "loss": 6.5333, + "step": 22675 + }, + { + "epoch": 4.0754039497307, + "grad_norm": 7.4056806564331055, + "learning_rate": 2.7901615798922802e-05, + "loss": 6.5194, + "step": 22700 + }, + { + "epoch": 4.079892280071813, + "grad_norm": 8.306356430053711, + "learning_rate": 2.789563135846798e-05, + "loss": 6.5953, + "step": 22725 + }, + { + "epoch": 4.084380610412927, + "grad_norm": 8.538472175598145, + "learning_rate": 2.7889646918013164e-05, + "loss": 6.4436, + "step": 22750 + }, + { + "epoch": 4.08886894075404, + "grad_norm": 9.086010932922363, + "learning_rate": 2.788366247755835e-05, + "loss": 6.5127, + "step": 22775 + }, + { + "epoch": 4.093357271095153, + "grad_norm": 8.96743392944336, + "learning_rate": 2.7877678037103532e-05, + "loss": 6.4606, + "step": 
22800 + }, + { + "epoch": 4.097845601436266, + "grad_norm": 7.844598770141602, + "learning_rate": 2.7871693596648714e-05, + "loss": 6.6958, + "step": 22825 + }, + { + "epoch": 4.102333931777379, + "grad_norm": 7.303498268127441, + "learning_rate": 2.7865709156193896e-05, + "loss": 6.5622, + "step": 22850 + }, + { + "epoch": 4.1068222621184916, + "grad_norm": 7.510324954986572, + "learning_rate": 2.785972471573908e-05, + "loss": 6.5972, + "step": 22875 + }, + { + "epoch": 4.111310592459605, + "grad_norm": 8.018465042114258, + "learning_rate": 2.7853740275284265e-05, + "loss": 6.655, + "step": 22900 + }, + { + "epoch": 4.115798922800718, + "grad_norm": 8.052847862243652, + "learning_rate": 2.7847755834829444e-05, + "loss": 6.5805, + "step": 22925 + }, + { + "epoch": 4.120287253141831, + "grad_norm": 7.220611572265625, + "learning_rate": 2.7841771394374626e-05, + "loss": 6.5056, + "step": 22950 + }, + { + "epoch": 4.124775583482944, + "grad_norm": 8.267970085144043, + "learning_rate": 2.7835786953919808e-05, + "loss": 6.4201, + "step": 22975 + }, + { + "epoch": 4.129263913824057, + "grad_norm": 7.860930919647217, + "learning_rate": 2.782980251346499e-05, + "loss": 6.4208, + "step": 23000 + }, + { + "epoch": 4.13375224416517, + "grad_norm": 7.190804481506348, + "learning_rate": 2.7823818073010173e-05, + "loss": 6.5174, + "step": 23025 + }, + { + "epoch": 4.138240574506284, + "grad_norm": 7.37723445892334, + "learning_rate": 2.781783363255536e-05, + "loss": 6.6644, + "step": 23050 + }, + { + "epoch": 4.142728904847397, + "grad_norm": 6.685908317565918, + "learning_rate": 2.781184919210054e-05, + "loss": 6.4639, + "step": 23075 + }, + { + "epoch": 4.14721723518851, + "grad_norm": 7.9320549964904785, + "learning_rate": 2.7805864751645723e-05, + "loss": 6.531, + "step": 23100 + }, + { + "epoch": 4.151705565529623, + "grad_norm": 8.018370628356934, + "learning_rate": 2.7799880311190902e-05, + "loss": 6.5623, + "step": 23125 + }, + { + "epoch": 4.156193895870736, + 
"grad_norm": 7.750665187835693, + "learning_rate": 2.7793895870736085e-05, + "loss": 6.4511, + "step": 23150 + }, + { + "epoch": 4.160682226211849, + "grad_norm": 8.406930923461914, + "learning_rate": 2.7787911430281267e-05, + "loss": 6.5504, + "step": 23175 + }, + { + "epoch": 4.165170556552963, + "grad_norm": 7.320469379425049, + "learning_rate": 2.7781926989826453e-05, + "loss": 6.4612, + "step": 23200 + }, + { + "epoch": 4.169658886894076, + "grad_norm": 7.600121021270752, + "learning_rate": 2.7775942549371635e-05, + "loss": 6.7668, + "step": 23225 + }, + { + "epoch": 4.174147217235189, + "grad_norm": 8.833227157592773, + "learning_rate": 2.7769958108916818e-05, + "loss": 6.6035, + "step": 23250 + }, + { + "epoch": 4.1786355475763015, + "grad_norm": 7.220536231994629, + "learning_rate": 2.7763973668462e-05, + "loss": 6.5856, + "step": 23275 + }, + { + "epoch": 4.1831238779174145, + "grad_norm": 7.720278263092041, + "learning_rate": 2.7757989228007182e-05, + "loss": 6.439, + "step": 23300 + }, + { + "epoch": 4.187612208258527, + "grad_norm": 7.747310161590576, + "learning_rate": 2.7752004787552365e-05, + "loss": 6.7632, + "step": 23325 + }, + { + "epoch": 4.192100538599641, + "grad_norm": 7.679044723510742, + "learning_rate": 2.7746020347097547e-05, + "loss": 6.6256, + "step": 23350 + }, + { + "epoch": 4.196588868940754, + "grad_norm": 7.672725200653076, + "learning_rate": 2.774003590664273e-05, + "loss": 6.6622, + "step": 23375 + }, + { + "epoch": 4.201077199281867, + "grad_norm": 6.961149215698242, + "learning_rate": 2.7734051466187912e-05, + "loss": 6.5223, + "step": 23400 + }, + { + "epoch": 4.20556552962298, + "grad_norm": 7.932663440704346, + "learning_rate": 2.7728067025733094e-05, + "loss": 6.6698, + "step": 23425 + }, + { + "epoch": 4.210053859964093, + "grad_norm": 8.497182846069336, + "learning_rate": 2.7722082585278276e-05, + "loss": 6.6923, + "step": 23450 + }, + { + "epoch": 4.214542190305206, + "grad_norm": 8.528273582458496, + "learning_rate": 
2.7716098144823462e-05, + "loss": 6.6781, + "step": 23475 + }, + { + "epoch": 4.21903052064632, + "grad_norm": 9.503753662109375, + "learning_rate": 2.7710113704368645e-05, + "loss": 6.6047, + "step": 23500 + }, + { + "epoch": 4.223518850987433, + "grad_norm": 7.941089153289795, + "learning_rate": 2.7704129263913824e-05, + "loss": 6.5951, + "step": 23525 + }, + { + "epoch": 4.228007181328546, + "grad_norm": 8.168737411499023, + "learning_rate": 2.7698144823459006e-05, + "loss": 6.482, + "step": 23550 + }, + { + "epoch": 4.232495511669659, + "grad_norm": 7.412092685699463, + "learning_rate": 2.7692160383004188e-05, + "loss": 6.5189, + "step": 23575 + }, + { + "epoch": 4.236983842010772, + "grad_norm": 8.187712669372559, + "learning_rate": 2.768617594254937e-05, + "loss": 6.542, + "step": 23600 + }, + { + "epoch": 4.241472172351885, + "grad_norm": 7.718690872192383, + "learning_rate": 2.7680191502094556e-05, + "loss": 6.6195, + "step": 23625 + }, + { + "epoch": 4.2459605026929985, + "grad_norm": 9.22433853149414, + "learning_rate": 2.767420706163974e-05, + "loss": 6.4425, + "step": 23650 + }, + { + "epoch": 4.2504488330341115, + "grad_norm": 7.1004862785339355, + "learning_rate": 2.766822262118492e-05, + "loss": 6.6489, + "step": 23675 + }, + { + "epoch": 4.254937163375224, + "grad_norm": 8.695807456970215, + "learning_rate": 2.7662238180730103e-05, + "loss": 6.6457, + "step": 23700 + }, + { + "epoch": 4.259425493716337, + "grad_norm": 7.320695877075195, + "learning_rate": 2.7656253740275282e-05, + "loss": 6.5767, + "step": 23725 + }, + { + "epoch": 4.26391382405745, + "grad_norm": 9.636645317077637, + "learning_rate": 2.7650269299820465e-05, + "loss": 6.664, + "step": 23750 + }, + { + "epoch": 4.268402154398563, + "grad_norm": 7.640822410583496, + "learning_rate": 2.764428485936565e-05, + "loss": 6.6201, + "step": 23775 + }, + { + "epoch": 4.272890484739677, + "grad_norm": 7.680857181549072, + "learning_rate": 2.7638300418910833e-05, + "loss": 6.4891, + "step": 
23800 + }, + { + "epoch": 4.27737881508079, + "grad_norm": 8.932374954223633, + "learning_rate": 2.7632315978456015e-05, + "loss": 6.5188, + "step": 23825 + }, + { + "epoch": 4.281867145421903, + "grad_norm": 7.53662633895874, + "learning_rate": 2.7626331538001198e-05, + "loss": 6.5946, + "step": 23850 + }, + { + "epoch": 4.286355475763016, + "grad_norm": 7.052899360656738, + "learning_rate": 2.762034709754638e-05, + "loss": 6.6008, + "step": 23875 + }, + { + "epoch": 4.290843806104129, + "grad_norm": 7.629906177520752, + "learning_rate": 2.7614362657091566e-05, + "loss": 6.5132, + "step": 23900 + }, + { + "epoch": 4.295332136445243, + "grad_norm": 8.757085800170898, + "learning_rate": 2.7608378216636745e-05, + "loss": 6.547, + "step": 23925 + }, + { + "epoch": 4.299820466786356, + "grad_norm": 8.275999069213867, + "learning_rate": 2.7602393776181927e-05, + "loss": 6.7287, + "step": 23950 + }, + { + "epoch": 4.304308797127469, + "grad_norm": 7.3516316413879395, + "learning_rate": 2.759640933572711e-05, + "loss": 6.5756, + "step": 23975 + }, + { + "epoch": 4.308797127468582, + "grad_norm": 7.5583343505859375, + "learning_rate": 2.7590424895272292e-05, + "loss": 6.4663, + "step": 24000 + }, + { + "epoch": 4.313285457809695, + "grad_norm": 7.901121616363525, + "learning_rate": 2.7584440454817474e-05, + "loss": 6.4763, + "step": 24025 + }, + { + "epoch": 4.317773788150808, + "grad_norm": 8.932948112487793, + "learning_rate": 2.757845601436266e-05, + "loss": 6.4851, + "step": 24050 + }, + { + "epoch": 4.3222621184919205, + "grad_norm": 10.074075698852539, + "learning_rate": 2.7572471573907842e-05, + "loss": 6.5532, + "step": 24075 + }, + { + "epoch": 4.326750448833034, + "grad_norm": 8.976847648620605, + "learning_rate": 2.7566487133453025e-05, + "loss": 6.6934, + "step": 24100 + }, + { + "epoch": 4.331238779174147, + "grad_norm": 7.449671745300293, + "learning_rate": 2.7560502692998204e-05, + "loss": 6.6487, + "step": 24125 + }, + { + "epoch": 4.33572710951526, + 
"grad_norm": 8.365455627441406, + "learning_rate": 2.7554518252543386e-05, + "loss": 6.6315, + "step": 24150 + }, + { + "epoch": 4.340215439856373, + "grad_norm": 9.13233757019043, + "learning_rate": 2.754853381208857e-05, + "loss": 6.6098, + "step": 24175 + }, + { + "epoch": 4.344703770197486, + "grad_norm": 8.248366355895996, + "learning_rate": 2.7542549371633754e-05, + "loss": 6.539, + "step": 24200 + }, + { + "epoch": 4.3491921005386, + "grad_norm": 8.048108100891113, + "learning_rate": 2.7536564931178936e-05, + "loss": 6.6064, + "step": 24225 + }, + { + "epoch": 4.353680430879713, + "grad_norm": 9.433106422424316, + "learning_rate": 2.753058049072412e-05, + "loss": 6.5307, + "step": 24250 + }, + { + "epoch": 4.358168761220826, + "grad_norm": 8.275618553161621, + "learning_rate": 2.75245960502693e-05, + "loss": 6.4466, + "step": 24275 + }, + { + "epoch": 4.362657091561939, + "grad_norm": 7.767579555511475, + "learning_rate": 2.7518611609814484e-05, + "loss": 6.4482, + "step": 24300 + }, + { + "epoch": 4.367145421903052, + "grad_norm": 7.516922473907471, + "learning_rate": 2.7512627169359666e-05, + "loss": 6.6087, + "step": 24325 + }, + { + "epoch": 4.371633752244165, + "grad_norm": 9.033773422241211, + "learning_rate": 2.7506642728904848e-05, + "loss": 6.7457, + "step": 24350 + }, + { + "epoch": 4.376122082585279, + "grad_norm": 9.321165084838867, + "learning_rate": 2.750065828845003e-05, + "loss": 6.6615, + "step": 24375 + }, + { + "epoch": 4.380610412926392, + "grad_norm": 8.027649879455566, + "learning_rate": 2.7494673847995213e-05, + "loss": 6.4342, + "step": 24400 + }, + { + "epoch": 4.385098743267505, + "grad_norm": 7.68413782119751, + "learning_rate": 2.7488689407540395e-05, + "loss": 6.6275, + "step": 24425 + }, + { + "epoch": 4.3895870736086176, + "grad_norm": 11.079235076904297, + "learning_rate": 2.7482704967085578e-05, + "loss": 6.5182, + "step": 24450 + }, + { + "epoch": 4.3940754039497305, + "grad_norm": 8.08297348022461, + "learning_rate": 
2.7476720526630763e-05, + "loss": 6.5286, + "step": 24475 + }, + { + "epoch": 4.3985637342908435, + "grad_norm": 8.008461952209473, + "learning_rate": 2.7470736086175946e-05, + "loss": 6.5404, + "step": 24500 + }, + { + "epoch": 4.403052064631957, + "grad_norm": 7.93850564956665, + "learning_rate": 2.7464751645721125e-05, + "loss": 6.639, + "step": 24525 + }, + { + "epoch": 4.40754039497307, + "grad_norm": 7.976254940032959, + "learning_rate": 2.7458767205266307e-05, + "loss": 6.6984, + "step": 24550 + }, + { + "epoch": 4.412028725314183, + "grad_norm": 8.047077178955078, + "learning_rate": 2.745278276481149e-05, + "loss": 6.5765, + "step": 24575 + }, + { + "epoch": 4.416517055655296, + "grad_norm": 8.18030834197998, + "learning_rate": 2.7446798324356672e-05, + "loss": 6.6002, + "step": 24600 + }, + { + "epoch": 4.421005385996409, + "grad_norm": 9.103218078613281, + "learning_rate": 2.7440813883901858e-05, + "loss": 6.6813, + "step": 24625 + }, + { + "epoch": 4.425493716337522, + "grad_norm": 8.110322952270508, + "learning_rate": 2.743482944344704e-05, + "loss": 6.6386, + "step": 24650 + }, + { + "epoch": 4.429982046678636, + "grad_norm": 7.849308013916016, + "learning_rate": 2.7428845002992222e-05, + "loss": 6.3542, + "step": 24675 + }, + { + "epoch": 4.434470377019749, + "grad_norm": 7.827004432678223, + "learning_rate": 2.74228605625374e-05, + "loss": 6.3268, + "step": 24700 + }, + { + "epoch": 4.438958707360862, + "grad_norm": 8.136856079101562, + "learning_rate": 2.7416876122082584e-05, + "loss": 6.6323, + "step": 24725 + }, + { + "epoch": 4.443447037701975, + "grad_norm": 8.367888450622559, + "learning_rate": 2.741089168162777e-05, + "loss": 6.5375, + "step": 24750 + }, + { + "epoch": 4.447935368043088, + "grad_norm": 7.635782718658447, + "learning_rate": 2.7404907241172952e-05, + "loss": 6.5103, + "step": 24775 + }, + { + "epoch": 4.452423698384201, + "grad_norm": 8.58423137664795, + "learning_rate": 2.7398922800718134e-05, + "loss": 6.654, + "step": 24800 + 
}, + { + "epoch": 4.456912028725315, + "grad_norm": 7.8055033683776855, + "learning_rate": 2.7392938360263316e-05, + "loss": 6.4559, + "step": 24825 + }, + { + "epoch": 4.4614003590664275, + "grad_norm": 7.686580657958984, + "learning_rate": 2.738719329742669e-05, + "loss": 6.4581, + "step": 24850 + }, + { + "epoch": 4.4658886894075405, + "grad_norm": 8.195148468017578, + "learning_rate": 2.7381208856971873e-05, + "loss": 6.3742, + "step": 24875 + }, + { + "epoch": 4.470377019748653, + "grad_norm": 7.9467034339904785, + "learning_rate": 2.7375224416517055e-05, + "loss": 6.5358, + "step": 24900 + }, + { + "epoch": 4.474865350089766, + "grad_norm": 10.190847396850586, + "learning_rate": 2.736923997606224e-05, + "loss": 6.563, + "step": 24925 + }, + { + "epoch": 4.479353680430879, + "grad_norm": 7.093561172485352, + "learning_rate": 2.7363255535607423e-05, + "loss": 6.4073, + "step": 24950 + }, + { + "epoch": 4.483842010771993, + "grad_norm": 8.441715240478516, + "learning_rate": 2.7357271095152605e-05, + "loss": 6.5703, + "step": 24975 + }, + { + "epoch": 4.488330341113106, + "grad_norm": 8.87532901763916, + "learning_rate": 2.7351286654697788e-05, + "loss": 6.4517, + "step": 25000 + }, + { + "epoch": 4.492818671454219, + "grad_norm": 9.400248527526855, + "learning_rate": 2.7345302214242967e-05, + "loss": 6.4885, + "step": 25025 + }, + { + "epoch": 4.497307001795332, + "grad_norm": 7.469052314758301, + "learning_rate": 2.733931777378815e-05, + "loss": 6.4554, + "step": 25050 + }, + { + "epoch": 4.501795332136445, + "grad_norm": 8.726761817932129, + "learning_rate": 2.7333333333333335e-05, + "loss": 6.536, + "step": 25075 + }, + { + "epoch": 4.506283662477558, + "grad_norm": 8.784785270690918, + "learning_rate": 2.7327348892878517e-05, + "loss": 6.4881, + "step": 25100 + }, + { + "epoch": 4.510771992818672, + "grad_norm": 7.885135173797607, + "learning_rate": 2.73213644524237e-05, + "loss": 6.5612, + "step": 25125 + }, + { + "epoch": 4.515260323159785, + "grad_norm": 
8.576748847961426, + "learning_rate": 2.7315380011968882e-05, + "loss": 6.4283, + "step": 25150 + }, + { + "epoch": 4.519748653500898, + "grad_norm": 7.404499053955078, + "learning_rate": 2.7309395571514064e-05, + "loss": 6.5563, + "step": 25175 + }, + { + "epoch": 4.524236983842011, + "grad_norm": 7.720039367675781, + "learning_rate": 2.7303411131059247e-05, + "loss": 6.5436, + "step": 25200 + }, + { + "epoch": 4.528725314183124, + "grad_norm": 8.125965118408203, + "learning_rate": 2.729742669060443e-05, + "loss": 6.4156, + "step": 25225 + }, + { + "epoch": 4.533213644524237, + "grad_norm": 7.679772853851318, + "learning_rate": 2.729144225014961e-05, + "loss": 6.5524, + "step": 25250 + }, + { + "epoch": 4.53770197486535, + "grad_norm": 7.547130107879639, + "learning_rate": 2.7285457809694794e-05, + "loss": 6.5254, + "step": 25275 + }, + { + "epoch": 4.542190305206463, + "grad_norm": 8.177289962768555, + "learning_rate": 2.7279473369239976e-05, + "loss": 6.6134, + "step": 25300 + }, + { + "epoch": 4.546678635547576, + "grad_norm": 8.920402526855469, + "learning_rate": 2.727348892878516e-05, + "loss": 6.4444, + "step": 25325 + }, + { + "epoch": 4.551166965888689, + "grad_norm": 7.327859878540039, + "learning_rate": 2.7267504488330344e-05, + "loss": 6.5745, + "step": 25350 + }, + { + "epoch": 4.555655296229802, + "grad_norm": 7.975797653198242, + "learning_rate": 2.7261520047875526e-05, + "loss": 6.5912, + "step": 25375 + }, + { + "epoch": 4.560143626570916, + "grad_norm": 8.149534225463867, + "learning_rate": 2.725553560742071e-05, + "loss": 6.6147, + "step": 25400 + }, + { + "epoch": 4.564631956912029, + "grad_norm": 8.093877792358398, + "learning_rate": 2.7249551166965888e-05, + "loss": 6.4796, + "step": 25425 + }, + { + "epoch": 4.569120287253142, + "grad_norm": 7.46660041809082, + "learning_rate": 2.724356672651107e-05, + "loss": 6.5891, + "step": 25450 + }, + { + "epoch": 4.573608617594255, + "grad_norm": 8.180322647094727, + "learning_rate": 
2.7237582286056253e-05, + "loss": 6.522, + "step": 25475 + }, + { + "epoch": 4.578096947935368, + "grad_norm": 7.953065395355225, + "learning_rate": 2.7231597845601438e-05, + "loss": 6.5661, + "step": 25500 + }, + { + "epoch": 4.582585278276481, + "grad_norm": 8.947294235229492, + "learning_rate": 2.722561340514662e-05, + "loss": 6.444, + "step": 25525 + }, + { + "epoch": 4.587073608617594, + "grad_norm": 7.530533790588379, + "learning_rate": 2.7219628964691803e-05, + "loss": 6.7032, + "step": 25550 + }, + { + "epoch": 4.591561938958708, + "grad_norm": 9.906108856201172, + "learning_rate": 2.7213644524236985e-05, + "loss": 6.466, + "step": 25575 + }, + { + "epoch": 4.596050269299821, + "grad_norm": 9.270998001098633, + "learning_rate": 2.7207660083782164e-05, + "loss": 6.3938, + "step": 25600 + }, + { + "epoch": 4.600538599640934, + "grad_norm": 9.28902530670166, + "learning_rate": 2.7201675643327347e-05, + "loss": 6.3361, + "step": 25625 + }, + { + "epoch": 4.6050269299820465, + "grad_norm": 7.905343055725098, + "learning_rate": 2.7195691202872532e-05, + "loss": 6.6194, + "step": 25650 + }, + { + "epoch": 4.6095152603231595, + "grad_norm": 7.299292087554932, + "learning_rate": 2.7189706762417715e-05, + "loss": 6.4284, + "step": 25675 + }, + { + "epoch": 4.614003590664273, + "grad_norm": 10.118880271911621, + "learning_rate": 2.7183722321962897e-05, + "loss": 6.4357, + "step": 25700 + }, + { + "epoch": 4.618491921005386, + "grad_norm": 8.001317024230957, + "learning_rate": 2.717773788150808e-05, + "loss": 6.5839, + "step": 25725 + }, + { + "epoch": 4.622980251346499, + "grad_norm": 10.622818946838379, + "learning_rate": 2.7171753441053262e-05, + "loss": 6.5225, + "step": 25750 + }, + { + "epoch": 4.627468581687612, + "grad_norm": 7.763021469116211, + "learning_rate": 2.7165769000598448e-05, + "loss": 6.5691, + "step": 25775 + }, + { + "epoch": 4.631956912028725, + "grad_norm": 7.337483882904053, + "learning_rate": 2.7159784560143627e-05, + "loss": 6.4448, + "step": 
25800 + }, + { + "epoch": 4.636445242369838, + "grad_norm": 8.579893112182617, + "learning_rate": 2.715380011968881e-05, + "loss": 6.4157, + "step": 25825 + }, + { + "epoch": 4.640933572710951, + "grad_norm": 9.041510581970215, + "learning_rate": 2.714781567923399e-05, + "loss": 6.6482, + "step": 25850 + }, + { + "epoch": 4.645421903052065, + "grad_norm": 8.356040000915527, + "learning_rate": 2.7141831238779174e-05, + "loss": 6.5297, + "step": 25875 + }, + { + "epoch": 4.649910233393178, + "grad_norm": 8.294889450073242, + "learning_rate": 2.7135846798324356e-05, + "loss": 6.4769, + "step": 25900 + }, + { + "epoch": 4.654398563734291, + "grad_norm": 8.856857299804688, + "learning_rate": 2.7129862357869542e-05, + "loss": 6.4612, + "step": 25925 + }, + { + "epoch": 4.658886894075404, + "grad_norm": 8.09585952758789, + "learning_rate": 2.7123877917414724e-05, + "loss": 6.3859, + "step": 25950 + }, + { + "epoch": 4.663375224416517, + "grad_norm": 7.600722312927246, + "learning_rate": 2.7117893476959907e-05, + "loss": 6.391, + "step": 25975 + }, + { + "epoch": 4.667863554757631, + "grad_norm": 8.756305694580078, + "learning_rate": 2.7111909036505085e-05, + "loss": 6.5443, + "step": 26000 + }, + { + "epoch": 4.6723518850987436, + "grad_norm": 8.717281341552734, + "learning_rate": 2.7105924596050268e-05, + "loss": 6.5535, + "step": 26025 + }, + { + "epoch": 4.6768402154398565, + "grad_norm": 8.13868236541748, + "learning_rate": 2.709994015559545e-05, + "loss": 6.3074, + "step": 26050 + }, + { + "epoch": 4.6813285457809695, + "grad_norm": 7.9580512046813965, + "learning_rate": 2.7093955715140636e-05, + "loss": 6.6239, + "step": 26075 + }, + { + "epoch": 4.685816876122082, + "grad_norm": 8.212288856506348, + "learning_rate": 2.7087971274685818e-05, + "loss": 6.5592, + "step": 26100 + }, + { + "epoch": 4.690305206463195, + "grad_norm": 8.862658500671387, + "learning_rate": 2.7081986834231e-05, + "loss": 6.3675, + "step": 26125 + }, + { + "epoch": 4.694793536804308, + 
"grad_norm": 7.975429534912109, + "learning_rate": 2.7076002393776183e-05, + "loss": 6.5064, + "step": 26150 + }, + { + "epoch": 4.699281867145422, + "grad_norm": 7.730137348175049, + "learning_rate": 2.7070017953321365e-05, + "loss": 6.4135, + "step": 26175 + }, + { + "epoch": 4.703770197486535, + "grad_norm": 8.601594924926758, + "learning_rate": 2.7064033512866548e-05, + "loss": 6.487, + "step": 26200 + }, + { + "epoch": 4.708258527827648, + "grad_norm": 7.339933395385742, + "learning_rate": 2.705804907241173e-05, + "loss": 6.3724, + "step": 26225 + }, + { + "epoch": 4.712746858168761, + "grad_norm": 8.6165189743042, + "learning_rate": 2.7052064631956912e-05, + "loss": 6.3778, + "step": 26250 + }, + { + "epoch": 4.717235188509874, + "grad_norm": 7.740137577056885, + "learning_rate": 2.7046080191502095e-05, + "loss": 6.6587, + "step": 26275 + }, + { + "epoch": 4.721723518850988, + "grad_norm": 10.16494369506836, + "learning_rate": 2.7040095751047277e-05, + "loss": 6.6424, + "step": 26300 + }, + { + "epoch": 4.726211849192101, + "grad_norm": 7.70670223236084, + "learning_rate": 2.703411131059246e-05, + "loss": 6.5096, + "step": 26325 + }, + { + "epoch": 4.730700179533214, + "grad_norm": 8.07544231414795, + "learning_rate": 2.7028126870137645e-05, + "loss": 6.5716, + "step": 26350 + }, + { + "epoch": 4.735188509874327, + "grad_norm": 8.434459686279297, + "learning_rate": 2.7022142429682828e-05, + "loss": 6.4762, + "step": 26375 + }, + { + "epoch": 4.73967684021544, + "grad_norm": 7.706515312194824, + "learning_rate": 2.7016157989228007e-05, + "loss": 6.3902, + "step": 26400 + }, + { + "epoch": 4.744165170556553, + "grad_norm": 7.354729652404785, + "learning_rate": 2.701017354877319e-05, + "loss": 6.3276, + "step": 26425 + }, + { + "epoch": 4.748653500897666, + "grad_norm": 7.921829700469971, + "learning_rate": 2.700418910831837e-05, + "loss": 6.4038, + "step": 26450 + }, + { + "epoch": 4.753141831238779, + "grad_norm": 7.989436626434326, + "learning_rate": 
2.6998204667863554e-05, + "loss": 6.5768, + "step": 26475 + }, + { + "epoch": 4.757630161579892, + "grad_norm": 8.84457015991211, + "learning_rate": 2.699222022740874e-05, + "loss": 6.5967, + "step": 26500 + }, + { + "epoch": 4.762118491921005, + "grad_norm": 7.942958831787109, + "learning_rate": 2.6986235786953922e-05, + "loss": 6.5433, + "step": 26525 + }, + { + "epoch": 4.766606822262118, + "grad_norm": 8.001582145690918, + "learning_rate": 2.6980251346499104e-05, + "loss": 6.5248, + "step": 26550 + }, + { + "epoch": 4.771095152603231, + "grad_norm": 7.896449565887451, + "learning_rate": 2.6974266906044287e-05, + "loss": 6.4173, + "step": 26575 + }, + { + "epoch": 4.775583482944345, + "grad_norm": 7.977508544921875, + "learning_rate": 2.6968282465589465e-05, + "loss": 6.5938, + "step": 26600 + }, + { + "epoch": 4.780071813285458, + "grad_norm": 7.047165870666504, + "learning_rate": 2.696229802513465e-05, + "loss": 6.4782, + "step": 26625 + }, + { + "epoch": 4.784560143626571, + "grad_norm": 7.906014919281006, + "learning_rate": 2.6956313584679834e-05, + "loss": 6.6575, + "step": 26650 + }, + { + "epoch": 4.789048473967684, + "grad_norm": 6.895730972290039, + "learning_rate": 2.6950329144225016e-05, + "loss": 6.5932, + "step": 26675 + }, + { + "epoch": 4.793536804308797, + "grad_norm": 9.552895545959473, + "learning_rate": 2.69443447037702e-05, + "loss": 6.5352, + "step": 26700 + }, + { + "epoch": 4.79802513464991, + "grad_norm": 7.689596652984619, + "learning_rate": 2.693836026331538e-05, + "loss": 6.5397, + "step": 26725 + }, + { + "epoch": 4.802513464991024, + "grad_norm": 7.852512836456299, + "learning_rate": 2.6932375822860563e-05, + "loss": 6.5015, + "step": 26750 + }, + { + "epoch": 4.807001795332137, + "grad_norm": 8.25978946685791, + "learning_rate": 2.692639138240575e-05, + "loss": 6.5545, + "step": 26775 + }, + { + "epoch": 4.81149012567325, + "grad_norm": 8.746063232421875, + "learning_rate": 2.6920406941950928e-05, + "loss": 6.5082, + "step": 26800 + 
}, + { + "epoch": 4.815978456014363, + "grad_norm": 7.77665376663208, + "learning_rate": 2.691442250149611e-05, + "loss": 6.4072, + "step": 26825 + }, + { + "epoch": 4.8204667863554755, + "grad_norm": 8.649683952331543, + "learning_rate": 2.6908438061041292e-05, + "loss": 6.4336, + "step": 26850 + }, + { + "epoch": 4.8249551166965885, + "grad_norm": 7.969342231750488, + "learning_rate": 2.6902453620586475e-05, + "loss": 6.3778, + "step": 26875 + }, + { + "epoch": 4.829443447037702, + "grad_norm": 10.716011047363281, + "learning_rate": 2.6896469180131657e-05, + "loss": 6.4059, + "step": 26900 + }, + { + "epoch": 4.833931777378815, + "grad_norm": 7.8846917152404785, + "learning_rate": 2.6890484739676843e-05, + "loss": 6.5201, + "step": 26925 + }, + { + "epoch": 4.838420107719928, + "grad_norm": 7.519570350646973, + "learning_rate": 2.6884739676840217e-05, + "loss": 6.4525, + "step": 26950 + }, + { + "epoch": 4.842908438061041, + "grad_norm": 9.026190757751465, + "learning_rate": 2.68787552363854e-05, + "loss": 6.5546, + "step": 26975 + }, + { + "epoch": 4.847396768402154, + "grad_norm": 7.460124492645264, + "learning_rate": 2.687277079593058e-05, + "loss": 6.4317, + "step": 27000 + }, + { + "epoch": 4.851885098743267, + "grad_norm": 8.653061866760254, + "learning_rate": 2.6866786355475764e-05, + "loss": 6.5815, + "step": 27025 + }, + { + "epoch": 4.856373429084381, + "grad_norm": 9.3004150390625, + "learning_rate": 2.6860801915020946e-05, + "loss": 6.5068, + "step": 27050 + }, + { + "epoch": 4.860861759425494, + "grad_norm": 7.93986177444458, + "learning_rate": 2.685481747456613e-05, + "loss": 6.5099, + "step": 27075 + }, + { + "epoch": 4.865350089766607, + "grad_norm": 8.251890182495117, + "learning_rate": 2.684883303411131e-05, + "loss": 6.3461, + "step": 27100 + }, + { + "epoch": 4.86983842010772, + "grad_norm": 7.461415767669678, + "learning_rate": 2.6842848593656493e-05, + "loss": 6.5564, + "step": 27125 + }, + { + "epoch": 4.874326750448833, + "grad_norm": 
7.886981964111328, + "learning_rate": 2.6836864153201676e-05, + "loss": 6.4843, + "step": 27150 + }, + { + "epoch": 4.878815080789947, + "grad_norm": 8.724211692810059, + "learning_rate": 2.6830879712746858e-05, + "loss": 6.6048, + "step": 27175 + }, + { + "epoch": 4.88330341113106, + "grad_norm": 7.811946392059326, + "learning_rate": 2.682489527229204e-05, + "loss": 6.556, + "step": 27200 + }, + { + "epoch": 4.8877917414721725, + "grad_norm": 7.347614288330078, + "learning_rate": 2.6818910831837226e-05, + "loss": 6.6359, + "step": 27225 + }, + { + "epoch": 4.8922800718132855, + "grad_norm": 7.314632892608643, + "learning_rate": 2.681292639138241e-05, + "loss": 6.3461, + "step": 27250 + }, + { + "epoch": 4.8967684021543985, + "grad_norm": 8.240979194641113, + "learning_rate": 2.680694195092759e-05, + "loss": 6.4255, + "step": 27275 + }, + { + "epoch": 4.901256732495511, + "grad_norm": 7.735796928405762, + "learning_rate": 2.680095751047277e-05, + "loss": 6.4287, + "step": 27300 + }, + { + "epoch": 4.905745062836624, + "grad_norm": 8.402661323547363, + "learning_rate": 2.6794973070017952e-05, + "loss": 6.4201, + "step": 27325 + }, + { + "epoch": 4.910233393177738, + "grad_norm": 10.166335105895996, + "learning_rate": 2.6788988629563134e-05, + "loss": 6.5162, + "step": 27350 + }, + { + "epoch": 4.914721723518851, + "grad_norm": 7.526193141937256, + "learning_rate": 2.678300418910832e-05, + "loss": 6.5235, + "step": 27375 + }, + { + "epoch": 4.919210053859964, + "grad_norm": 7.828055381774902, + "learning_rate": 2.6777019748653503e-05, + "loss": 6.4282, + "step": 27400 + }, + { + "epoch": 4.923698384201077, + "grad_norm": 7.931001663208008, + "learning_rate": 2.6771035308198685e-05, + "loss": 6.3469, + "step": 27425 + }, + { + "epoch": 4.92818671454219, + "grad_norm": 8.67994499206543, + "learning_rate": 2.6765050867743867e-05, + "loss": 6.4745, + "step": 27450 + }, + { + "epoch": 4.932675044883304, + "grad_norm": 8.045910835266113, + "learning_rate": 
2.675906642728905e-05, + "loss": 6.4999, + "step": 27475 + }, + { + "epoch": 4.937163375224417, + "grad_norm": 8.177043914794922, + "learning_rate": 2.675308198683423e-05, + "loss": 6.3705, + "step": 27500 + }, + { + "epoch": 4.94165170556553, + "grad_norm": 7.938586711883545, + "learning_rate": 2.6747097546379414e-05, + "loss": 6.5768, + "step": 27525 + }, + { + "epoch": 4.946140035906643, + "grad_norm": 8.035226821899414, + "learning_rate": 2.6741113105924597e-05, + "loss": 6.469, + "step": 27550 + }, + { + "epoch": 4.950628366247756, + "grad_norm": 8.594929695129395, + "learning_rate": 2.673512866546978e-05, + "loss": 6.4281, + "step": 27575 + }, + { + "epoch": 4.955116696588869, + "grad_norm": 9.115160942077637, + "learning_rate": 2.672914422501496e-05, + "loss": 6.554, + "step": 27600 + }, + { + "epoch": 4.959605026929982, + "grad_norm": 8.53757095336914, + "learning_rate": 2.6723159784560144e-05, + "loss": 6.385, + "step": 27625 + }, + { + "epoch": 4.9640933572710955, + "grad_norm": 8.1246337890625, + "learning_rate": 2.671717534410533e-05, + "loss": 6.3911, + "step": 27650 + }, + { + "epoch": 4.968581687612208, + "grad_norm": 7.9644293785095215, + "learning_rate": 2.6711190903650512e-05, + "loss": 6.4338, + "step": 27675 + }, + { + "epoch": 4.973070017953321, + "grad_norm": 9.015304565429688, + "learning_rate": 2.670520646319569e-05, + "loss": 6.2728, + "step": 27700 + }, + { + "epoch": 4.977558348294434, + "grad_norm": 9.651604652404785, + "learning_rate": 2.6699222022740873e-05, + "loss": 6.4521, + "step": 27725 + }, + { + "epoch": 4.982046678635547, + "grad_norm": 8.258686065673828, + "learning_rate": 2.6693237582286056e-05, + "loss": 6.5388, + "step": 27750 + }, + { + "epoch": 4.986535008976661, + "grad_norm": 8.579561233520508, + "learning_rate": 2.6687253141831238e-05, + "loss": 6.3791, + "step": 27775 + }, + { + "epoch": 4.991023339317774, + "grad_norm": 8.402015686035156, + "learning_rate": 2.6681268701376424e-05, + "loss": 6.2844, + "step": 27800 + 
}, + { + "epoch": 4.995511669658887, + "grad_norm": 8.643714904785156, + "learning_rate": 2.6675284260921606e-05, + "loss": 6.4252, + "step": 27825 + }, + { + "epoch": 5.0, + "grad_norm": 12.003674507141113, + "learning_rate": 2.666929982046679e-05, + "loss": 6.485, + "step": 27850 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.06980696160235236, + "eval_f1_macro": 0.002619302956978081, + "eval_f1_micro": 0.06980696160235236, + "eval_f1_weighted": 0.030060176294948972, + "eval_loss": 7.094029903411865, + "eval_precision_macro": 0.00233191731586986, + "eval_precision_micro": 0.06980696160235236, + "eval_precision_weighted": 0.023668586786786752, + "eval_recall_macro": 0.0059717291225075565, + "eval_recall_micro": 0.06980696160235236, + "eval_recall_weighted": 0.06980696160235236, + "eval_runtime": 63.5321, + "eval_samples_per_second": 824.355, + "eval_steps_per_second": 25.767, + "step": 27850 + }, + { + "epoch": 5.004488330341113, + "grad_norm": 8.41741943359375, + "learning_rate": 2.666331538001197e-05, + "loss": 6.212, + "step": 27875 + }, + { + "epoch": 5.008976660682226, + "grad_norm": 7.772786617279053, + "learning_rate": 2.665733093955715e-05, + "loss": 6.1006, + "step": 27900 + }, + { + "epoch": 5.01346499102334, + "grad_norm": 9.291427612304688, + "learning_rate": 2.6651346499102332e-05, + "loss": 6.1054, + "step": 27925 + }, + { + "epoch": 5.017953321364453, + "grad_norm": 7.789606094360352, + "learning_rate": 2.6645362058647518e-05, + "loss": 6.2425, + "step": 27950 + }, + { + "epoch": 5.022441651705566, + "grad_norm": 8.842456817626953, + "learning_rate": 2.66393776181927e-05, + "loss": 6.0156, + "step": 27975 + }, + { + "epoch": 5.026929982046679, + "grad_norm": 7.6205220222473145, + "learning_rate": 2.6633393177737883e-05, + "loss": 5.9628, + "step": 28000 + }, + { + "epoch": 5.031418312387792, + "grad_norm": 8.662976264953613, + "learning_rate": 2.6627408737283065e-05, + "loss": 6.2226, + "step": 28025 + }, + { + "epoch": 5.0359066427289045, + 
"grad_norm": 8.31258773803711, + "learning_rate": 2.6621424296828247e-05, + "loss": 6.0289, + "step": 28050 + }, + { + "epoch": 5.040394973070018, + "grad_norm": 8.762619018554688, + "learning_rate": 2.6615439856373433e-05, + "loss": 6.0312, + "step": 28075 + }, + { + "epoch": 5.044883303411131, + "grad_norm": 8.930154800415039, + "learning_rate": 2.6609455415918612e-05, + "loss": 6.0955, + "step": 28100 + }, + { + "epoch": 5.049371633752244, + "grad_norm": 9.061888694763184, + "learning_rate": 2.6603470975463794e-05, + "loss": 6.0722, + "step": 28125 + }, + { + "epoch": 5.053859964093357, + "grad_norm": 8.763251304626465, + "learning_rate": 2.6597486535008977e-05, + "loss": 6.1583, + "step": 28150 + }, + { + "epoch": 5.05834829443447, + "grad_norm": 8.464717864990234, + "learning_rate": 2.659150209455416e-05, + "loss": 6.0357, + "step": 28175 + }, + { + "epoch": 5.062836624775583, + "grad_norm": 8.222427368164062, + "learning_rate": 2.658551765409934e-05, + "loss": 6.0106, + "step": 28200 + }, + { + "epoch": 5.067324955116697, + "grad_norm": 9.973421096801758, + "learning_rate": 2.6579533213644527e-05, + "loss": 6.0618, + "step": 28225 + }, + { + "epoch": 5.07181328545781, + "grad_norm": 7.457205295562744, + "learning_rate": 2.657354877318971e-05, + "loss": 6.2437, + "step": 28250 + }, + { + "epoch": 5.076301615798923, + "grad_norm": 7.965580463409424, + "learning_rate": 2.6567564332734892e-05, + "loss": 6.3724, + "step": 28275 + }, + { + "epoch": 5.080789946140036, + "grad_norm": 7.872587203979492, + "learning_rate": 2.656157989228007e-05, + "loss": 6.002, + "step": 28300 + }, + { + "epoch": 5.085278276481149, + "grad_norm": 7.9399213790893555, + "learning_rate": 2.6555595451825253e-05, + "loss": 5.962, + "step": 28325 + }, + { + "epoch": 5.089766606822262, + "grad_norm": 9.767419815063477, + "learning_rate": 2.6549611011370436e-05, + "loss": 6.1106, + "step": 28350 + }, + { + "epoch": 5.094254937163376, + "grad_norm": 7.886393070220947, + "learning_rate": 
2.654362657091562e-05, + "loss": 6.229, + "step": 28375 + }, + { + "epoch": 5.098743267504489, + "grad_norm": 7.824357032775879, + "learning_rate": 2.6537642130460804e-05, + "loss": 6.0916, + "step": 28400 + }, + { + "epoch": 5.1032315978456015, + "grad_norm": 8.915790557861328, + "learning_rate": 2.6531657690005986e-05, + "loss": 5.9973, + "step": 28425 + }, + { + "epoch": 5.1077199281867145, + "grad_norm": 7.506389617919922, + "learning_rate": 2.652567324955117e-05, + "loss": 6.0473, + "step": 28450 + }, + { + "epoch": 5.1122082585278275, + "grad_norm": 7.733982563018799, + "learning_rate": 2.6519688809096347e-05, + "loss": 6.1122, + "step": 28475 + }, + { + "epoch": 5.11669658886894, + "grad_norm": 9.238937377929688, + "learning_rate": 2.6513704368641533e-05, + "loss": 6.0442, + "step": 28500 + }, + { + "epoch": 5.121184919210054, + "grad_norm": 7.384300708770752, + "learning_rate": 2.6507719928186715e-05, + "loss": 6.1618, + "step": 28525 + }, + { + "epoch": 5.125673249551167, + "grad_norm": 7.903632640838623, + "learning_rate": 2.6501735487731898e-05, + "loss": 6.1251, + "step": 28550 + }, + { + "epoch": 5.13016157989228, + "grad_norm": 8.666293144226074, + "learning_rate": 2.649575104727708e-05, + "loss": 5.9826, + "step": 28575 + }, + { + "epoch": 5.134649910233393, + "grad_norm": 8.212156295776367, + "learning_rate": 2.6489766606822263e-05, + "loss": 6.0437, + "step": 28600 + }, + { + "epoch": 5.139138240574506, + "grad_norm": 9.31468677520752, + "learning_rate": 2.6483782166367445e-05, + "loss": 6.2707, + "step": 28625 + }, + { + "epoch": 5.143626570915619, + "grad_norm": 8.999573707580566, + "learning_rate": 2.647779772591263e-05, + "loss": 6.0151, + "step": 28650 + }, + { + "epoch": 5.148114901256733, + "grad_norm": 8.691988945007324, + "learning_rate": 2.647181328545781e-05, + "loss": 6.2281, + "step": 28675 + }, + { + "epoch": 5.152603231597846, + "grad_norm": 8.610566139221191, + "learning_rate": 2.6465828845002992e-05, + "loss": 5.9763, + "step": 
28700 + }, + { + "epoch": 5.157091561938959, + "grad_norm": 9.413865089416504, + "learning_rate": 2.6459844404548174e-05, + "loss": 6.2349, + "step": 28725 + }, + { + "epoch": 5.161579892280072, + "grad_norm": 7.817660331726074, + "learning_rate": 2.6453859964093357e-05, + "loss": 6.2324, + "step": 28750 + }, + { + "epoch": 5.166068222621185, + "grad_norm": 8.00736141204834, + "learning_rate": 2.644787552363854e-05, + "loss": 6.2172, + "step": 28775 + }, + { + "epoch": 5.170556552962298, + "grad_norm": 7.6042799949646, + "learning_rate": 2.6441891083183725e-05, + "loss": 6.1522, + "step": 28800 + }, + { + "epoch": 5.1750448833034115, + "grad_norm": 9.137860298156738, + "learning_rate": 2.6435906642728907e-05, + "loss": 6.087, + "step": 28825 + }, + { + "epoch": 5.1795332136445245, + "grad_norm": 8.051556587219238, + "learning_rate": 2.642992220227409e-05, + "loss": 6.206, + "step": 28850 + }, + { + "epoch": 5.184021543985637, + "grad_norm": 8.368781089782715, + "learning_rate": 2.642393776181927e-05, + "loss": 5.9321, + "step": 28875 + }, + { + "epoch": 5.18850987432675, + "grad_norm": 7.7421770095825195, + "learning_rate": 2.641795332136445e-05, + "loss": 6.1008, + "step": 28900 + }, + { + "epoch": 5.192998204667863, + "grad_norm": 7.215620040893555, + "learning_rate": 2.6411968880909633e-05, + "loss": 6.1034, + "step": 28925 + }, + { + "epoch": 5.197486535008976, + "grad_norm": 10.328886985778809, + "learning_rate": 2.640598444045482e-05, + "loss": 6.1773, + "step": 28950 + }, + { + "epoch": 5.20197486535009, + "grad_norm": 8.629987716674805, + "learning_rate": 2.64e-05, + "loss": 6.0927, + "step": 28975 + }, + { + "epoch": 5.206463195691203, + "grad_norm": 8.244781494140625, + "learning_rate": 2.6394015559545184e-05, + "loss": 6.0531, + "step": 29000 + }, + { + "epoch": 5.210951526032316, + "grad_norm": 9.23215103149414, + "learning_rate": 2.6388031119090366e-05, + "loss": 6.1141, + "step": 29025 + }, + { + "epoch": 5.215439856373429, + "grad_norm": 
8.020782470703125, + "learning_rate": 2.638204667863555e-05, + "loss": 5.9812, + "step": 29050 + }, + { + "epoch": 5.219928186714542, + "grad_norm": 7.994537353515625, + "learning_rate": 2.637606223818073e-05, + "loss": 6.1841, + "step": 29075 + }, + { + "epoch": 5.224416517055655, + "grad_norm": 8.885031700134277, + "learning_rate": 2.6370077797725913e-05, + "loss": 6.2852, + "step": 29100 + }, + { + "epoch": 5.228904847396769, + "grad_norm": 9.589296340942383, + "learning_rate": 2.6364093357271096e-05, + "loss": 6.0782, + "step": 29125 + }, + { + "epoch": 5.233393177737882, + "grad_norm": 9.452252388000488, + "learning_rate": 2.6358108916816278e-05, + "loss": 6.0766, + "step": 29150 + }, + { + "epoch": 5.237881508078995, + "grad_norm": 8.255623817443848, + "learning_rate": 2.635212447636146e-05, + "loss": 6.1807, + "step": 29175 + }, + { + "epoch": 5.242369838420108, + "grad_norm": 9.294405937194824, + "learning_rate": 2.6346140035906643e-05, + "loss": 6.0627, + "step": 29200 + }, + { + "epoch": 5.246858168761221, + "grad_norm": 8.502198219299316, + "learning_rate": 2.634015559545183e-05, + "loss": 6.0339, + "step": 29225 + }, + { + "epoch": 5.2513464991023335, + "grad_norm": 7.8720879554748535, + "learning_rate": 2.633417115499701e-05, + "loss": 6.2818, + "step": 29250 + }, + { + "epoch": 5.255834829443447, + "grad_norm": 7.652676582336426, + "learning_rate": 2.632818671454219e-05, + "loss": 6.1234, + "step": 29275 + }, + { + "epoch": 5.26032315978456, + "grad_norm": 8.727766036987305, + "learning_rate": 2.6322202274087372e-05, + "loss": 6.1172, + "step": 29300 + }, + { + "epoch": 5.264811490125673, + "grad_norm": 10.130998611450195, + "learning_rate": 2.6316217833632554e-05, + "loss": 6.2806, + "step": 29325 + }, + { + "epoch": 5.269299820466786, + "grad_norm": 8.640359878540039, + "learning_rate": 2.6310233393177737e-05, + "loss": 6.1238, + "step": 29350 + }, + { + "epoch": 5.273788150807899, + "grad_norm": 8.605904579162598, + "learning_rate": 
2.6304248952722922e-05, + "loss": 6.2496, + "step": 29375 + }, + { + "epoch": 5.278276481149012, + "grad_norm": 8.5033597946167, + "learning_rate": 2.6298264512268105e-05, + "loss": 6.1158, + "step": 29400 + }, + { + "epoch": 5.282764811490126, + "grad_norm": 9.547445297241211, + "learning_rate": 2.6292280071813287e-05, + "loss": 6.1544, + "step": 29425 + }, + { + "epoch": 5.287253141831239, + "grad_norm": 7.624093532562256, + "learning_rate": 2.628629563135847e-05, + "loss": 6.1406, + "step": 29450 + }, + { + "epoch": 5.291741472172352, + "grad_norm": 9.036091804504395, + "learning_rate": 2.628031119090365e-05, + "loss": 6.2102, + "step": 29475 + }, + { + "epoch": 5.296229802513465, + "grad_norm": 8.793323516845703, + "learning_rate": 2.6274326750448834e-05, + "loss": 6.0274, + "step": 29500 + }, + { + "epoch": 5.300718132854578, + "grad_norm": 8.08771800994873, + "learning_rate": 2.6268342309994017e-05, + "loss": 6.0347, + "step": 29525 + }, + { + "epoch": 5.305206463195692, + "grad_norm": 7.875895977020264, + "learning_rate": 2.62623578695392e-05, + "loss": 6.0021, + "step": 29550 + }, + { + "epoch": 5.309694793536805, + "grad_norm": 7.883672714233398, + "learning_rate": 2.625637342908438e-05, + "loss": 6.0017, + "step": 29575 + }, + { + "epoch": 5.314183123877918, + "grad_norm": 8.396204948425293, + "learning_rate": 2.6250388988629564e-05, + "loss": 6.2007, + "step": 29600 + }, + { + "epoch": 5.3186714542190305, + "grad_norm": 7.653617858886719, + "learning_rate": 2.6244404548174746e-05, + "loss": 6.1533, + "step": 29625 + }, + { + "epoch": 5.3231597845601435, + "grad_norm": 7.781579971313477, + "learning_rate": 2.6238420107719932e-05, + "loss": 6.0964, + "step": 29650 + }, + { + "epoch": 5.3276481149012564, + "grad_norm": 7.970458030700684, + "learning_rate": 2.623243566726511e-05, + "loss": 6.1686, + "step": 29675 + }, + { + "epoch": 5.332136445242369, + "grad_norm": 8.462727546691895, + "learning_rate": 2.6226690604428488e-05, + "loss": 6.0252, + "step": 
29700 + }, + { + "epoch": 5.336624775583483, + "grad_norm": 8.302094459533691, + "learning_rate": 2.622070616397367e-05, + "loss": 6.3338, + "step": 29725 + }, + { + "epoch": 5.341113105924596, + "grad_norm": 7.962423324584961, + "learning_rate": 2.6214721723518853e-05, + "loss": 6.2599, + "step": 29750 + }, + { + "epoch": 5.345601436265709, + "grad_norm": 8.013967514038086, + "learning_rate": 2.620873728306403e-05, + "loss": 6.0609, + "step": 29775 + }, + { + "epoch": 5.350089766606822, + "grad_norm": 8.221759796142578, + "learning_rate": 2.6202752842609214e-05, + "loss": 6.1386, + "step": 29800 + }, + { + "epoch": 5.354578096947935, + "grad_norm": 9.553569793701172, + "learning_rate": 2.61967684021544e-05, + "loss": 6.0948, + "step": 29825 + }, + { + "epoch": 5.359066427289049, + "grad_norm": 8.415431022644043, + "learning_rate": 2.6190783961699582e-05, + "loss": 6.0495, + "step": 29850 + }, + { + "epoch": 5.363554757630162, + "grad_norm": 9.364179611206055, + "learning_rate": 2.6184799521244764e-05, + "loss": 6.1985, + "step": 29875 + }, + { + "epoch": 5.368043087971275, + "grad_norm": 7.40634822845459, + "learning_rate": 2.6178815080789947e-05, + "loss": 6.0678, + "step": 29900 + }, + { + "epoch": 5.372531418312388, + "grad_norm": 8.632511138916016, + "learning_rate": 2.617283064033513e-05, + "loss": 6.0328, + "step": 29925 + }, + { + "epoch": 5.377019748653501, + "grad_norm": 9.294376373291016, + "learning_rate": 2.6166846199880315e-05, + "loss": 6.1978, + "step": 29950 + }, + { + "epoch": 5.381508078994614, + "grad_norm": 8.063128471374512, + "learning_rate": 2.6160861759425494e-05, + "loss": 6.1291, + "step": 29975 + }, + { + "epoch": 5.385996409335727, + "grad_norm": 8.915172576904297, + "learning_rate": 2.6154877318970676e-05, + "loss": 6.1023, + "step": 30000 + }, + { + "epoch": 5.3904847396768405, + "grad_norm": 9.371039390563965, + "learning_rate": 2.614889287851586e-05, + "loss": 6.3254, + "step": 30025 + }, + { + "epoch": 5.3949730700179535, + 
"grad_norm": 7.776651859283447, + "learning_rate": 2.614290843806104e-05, + "loss": 6.0987, + "step": 30050 + }, + { + "epoch": 5.399461400359066, + "grad_norm": 10.10045337677002, + "learning_rate": 2.6136923997606223e-05, + "loss": 6.1058, + "step": 30075 + }, + { + "epoch": 5.403949730700179, + "grad_norm": 7.841642379760742, + "learning_rate": 2.613093955715141e-05, + "loss": 6.2043, + "step": 30100 + }, + { + "epoch": 5.408438061041292, + "grad_norm": 7.260315895080566, + "learning_rate": 2.612495511669659e-05, + "loss": 6.0078, + "step": 30125 + }, + { + "epoch": 5.412926391382406, + "grad_norm": 9.801506996154785, + "learning_rate": 2.6118970676241774e-05, + "loss": 5.9993, + "step": 30150 + }, + { + "epoch": 5.417414721723519, + "grad_norm": 9.178946495056152, + "learning_rate": 2.6112986235786953e-05, + "loss": 6.2952, + "step": 30175 + }, + { + "epoch": 5.421903052064632, + "grad_norm": 8.03554916381836, + "learning_rate": 2.6107001795332135e-05, + "loss": 6.2929, + "step": 30200 + }, + { + "epoch": 5.426391382405745, + "grad_norm": 9.509089469909668, + "learning_rate": 2.6101017354877317e-05, + "loss": 6.1702, + "step": 30225 + }, + { + "epoch": 5.430879712746858, + "grad_norm": 9.685888290405273, + "learning_rate": 2.6095032914422503e-05, + "loss": 6.2782, + "step": 30250 + }, + { + "epoch": 5.435368043087971, + "grad_norm": 8.189570426940918, + "learning_rate": 2.6089048473967686e-05, + "loss": 6.0927, + "step": 30275 + }, + { + "epoch": 5.439856373429085, + "grad_norm": 9.135782241821289, + "learning_rate": 2.6083064033512868e-05, + "loss": 6.0943, + "step": 30300 + }, + { + "epoch": 5.444344703770198, + "grad_norm": 8.674962043762207, + "learning_rate": 2.607707959305805e-05, + "loss": 6.0576, + "step": 30325 + }, + { + "epoch": 5.448833034111311, + "grad_norm": 8.403959274291992, + "learning_rate": 2.6071095152603233e-05, + "loss": 6.1538, + "step": 30350 + }, + { + "epoch": 5.453321364452424, + "grad_norm": 8.425468444824219, + "learning_rate": 
2.6065110712148415e-05, + "loss": 6.0779, + "step": 30375 + }, + { + "epoch": 5.457809694793537, + "grad_norm": 8.055676460266113, + "learning_rate": 2.6059126271693597e-05, + "loss": 6.2449, + "step": 30400 + }, + { + "epoch": 5.46229802513465, + "grad_norm": 8.304744720458984, + "learning_rate": 2.605314183123878e-05, + "loss": 6.0593, + "step": 30425 + }, + { + "epoch": 5.466786355475763, + "grad_norm": 8.296738624572754, + "learning_rate": 2.6047157390783962e-05, + "loss": 6.1411, + "step": 30450 + }, + { + "epoch": 5.471274685816876, + "grad_norm": 9.63970947265625, + "learning_rate": 2.6041172950329144e-05, + "loss": 6.2278, + "step": 30475 + }, + { + "epoch": 5.475763016157989, + "grad_norm": 9.796430587768555, + "learning_rate": 2.6035188509874327e-05, + "loss": 6.0473, + "step": 30500 + }, + { + "epoch": 5.480251346499102, + "grad_norm": 8.981965065002441, + "learning_rate": 2.6029204069419513e-05, + "loss": 6.0657, + "step": 30525 + }, + { + "epoch": 5.484739676840215, + "grad_norm": 8.052210807800293, + "learning_rate": 2.6023219628964695e-05, + "loss": 6.0451, + "step": 30550 + }, + { + "epoch": 5.489228007181328, + "grad_norm": 9.935816764831543, + "learning_rate": 2.6017235188509874e-05, + "loss": 6.1155, + "step": 30575 + }, + { + "epoch": 5.493716337522442, + "grad_norm": 8.932994842529297, + "learning_rate": 2.6011250748055056e-05, + "loss": 6.0118, + "step": 30600 + }, + { + "epoch": 5.498204667863555, + "grad_norm": 9.925904273986816, + "learning_rate": 2.600526630760024e-05, + "loss": 6.2084, + "step": 30625 + }, + { + "epoch": 5.502692998204668, + "grad_norm": 10.489768028259277, + "learning_rate": 2.599928186714542e-05, + "loss": 6.2365, + "step": 30650 + }, + { + "epoch": 5.507181328545781, + "grad_norm": 8.427865028381348, + "learning_rate": 2.5993297426690607e-05, + "loss": 6.2097, + "step": 30675 + }, + { + "epoch": 5.511669658886894, + "grad_norm": 7.591498851776123, + "learning_rate": 2.598731298623579e-05, + "loss": 6.0927, + "step": 
30700 + }, + { + "epoch": 5.516157989228007, + "grad_norm": 8.772396087646484, + "learning_rate": 2.598132854578097e-05, + "loss": 6.0774, + "step": 30725 + }, + { + "epoch": 5.520646319569121, + "grad_norm": 9.780426025390625, + "learning_rate": 2.5975344105326154e-05, + "loss": 6.1742, + "step": 30750 + }, + { + "epoch": 5.525134649910234, + "grad_norm": 9.225899696350098, + "learning_rate": 2.5969359664871333e-05, + "loss": 6.0577, + "step": 30775 + }, + { + "epoch": 5.529622980251347, + "grad_norm": 8.806875228881836, + "learning_rate": 2.5963375224416515e-05, + "loss": 6.0295, + "step": 30800 + }, + { + "epoch": 5.5341113105924595, + "grad_norm": 9.866320610046387, + "learning_rate": 2.59573907839617e-05, + "loss": 6.1878, + "step": 30825 + }, + { + "epoch": 5.5385996409335725, + "grad_norm": 9.915002822875977, + "learning_rate": 2.5951406343506883e-05, + "loss": 6.0669, + "step": 30850 + }, + { + "epoch": 5.543087971274685, + "grad_norm": 8.467981338500977, + "learning_rate": 2.5945421903052066e-05, + "loss": 6.1663, + "step": 30875 + }, + { + "epoch": 5.547576301615799, + "grad_norm": 8.77927017211914, + "learning_rate": 2.5939437462597248e-05, + "loss": 6.3129, + "step": 30900 + }, + { + "epoch": 5.552064631956912, + "grad_norm": 9.728734016418457, + "learning_rate": 2.593345302214243e-05, + "loss": 6.1453, + "step": 30925 + }, + { + "epoch": 5.556552962298025, + "grad_norm": 7.502135276794434, + "learning_rate": 2.5927468581687616e-05, + "loss": 6.2766, + "step": 30950 + }, + { + "epoch": 5.561041292639138, + "grad_norm": 8.21430492401123, + "learning_rate": 2.5921484141232795e-05, + "loss": 5.8888, + "step": 30975 + }, + { + "epoch": 5.565529622980251, + "grad_norm": 9.443516731262207, + "learning_rate": 2.5915499700777977e-05, + "loss": 6.3229, + "step": 31000 + }, + { + "epoch": 5.570017953321364, + "grad_norm": 9.6520357131958, + "learning_rate": 2.590951526032316e-05, + "loss": 6.0873, + "step": 31025 + }, + { + "epoch": 5.574506283662478, + 
"grad_norm": 8.843985557556152, + "learning_rate": 2.5903530819868342e-05, + "loss": 6.0151, + "step": 31050 + }, + { + "epoch": 5.578994614003591, + "grad_norm": 8.71196174621582, + "learning_rate": 2.5897546379413524e-05, + "loss": 6.1541, + "step": 31075 + }, + { + "epoch": 5.583482944344704, + "grad_norm": 9.149370193481445, + "learning_rate": 2.589156193895871e-05, + "loss": 6.1403, + "step": 31100 + }, + { + "epoch": 5.587971274685817, + "grad_norm": 8.915081977844238, + "learning_rate": 2.5885577498503893e-05, + "loss": 6.2307, + "step": 31125 + }, + { + "epoch": 5.59245960502693, + "grad_norm": 8.476552963256836, + "learning_rate": 2.5879593058049075e-05, + "loss": 6.0833, + "step": 31150 + }, + { + "epoch": 5.596947935368043, + "grad_norm": 8.020525932312012, + "learning_rate": 2.5873608617594254e-05, + "loss": 6.1135, + "step": 31175 + }, + { + "epoch": 5.6014362657091565, + "grad_norm": 8.394387245178223, + "learning_rate": 2.5867624177139436e-05, + "loss": 6.0406, + "step": 31200 + }, + { + "epoch": 5.6059245960502695, + "grad_norm": 10.582916259765625, + "learning_rate": 2.586163973668462e-05, + "loss": 6.1303, + "step": 31225 + }, + { + "epoch": 5.6104129263913824, + "grad_norm": 8.989535331726074, + "learning_rate": 2.5855655296229804e-05, + "loss": 6.1068, + "step": 31250 + }, + { + "epoch": 5.614901256732495, + "grad_norm": 9.744670867919922, + "learning_rate": 2.5849670855774987e-05, + "loss": 6.103, + "step": 31275 + }, + { + "epoch": 5.619389587073608, + "grad_norm": 8.914310455322266, + "learning_rate": 2.584368641532017e-05, + "loss": 6.2063, + "step": 31300 + }, + { + "epoch": 5.623877917414722, + "grad_norm": 8.51415729522705, + "learning_rate": 2.583770197486535e-05, + "loss": 5.9779, + "step": 31325 + }, + { + "epoch": 5.628366247755835, + "grad_norm": 8.813766479492188, + "learning_rate": 2.583171753441053e-05, + "loss": 6.3473, + "step": 31350 + }, + { + "epoch": 5.632854578096948, + "grad_norm": 8.480877876281738, + "learning_rate": 
2.5825733093955716e-05, + "loss": 6.1321, + "step": 31375 + }, + { + "epoch": 5.637342908438061, + "grad_norm": 9.39909839630127, + "learning_rate": 2.58197486535009e-05, + "loss": 6.0202, + "step": 31400 + }, + { + "epoch": 5.641831238779174, + "grad_norm": 9.182489395141602, + "learning_rate": 2.581376421304608e-05, + "loss": 6.1555, + "step": 31425 + }, + { + "epoch": 5.646319569120287, + "grad_norm": 7.999908447265625, + "learning_rate": 2.5807779772591263e-05, + "loss": 6.028, + "step": 31450 + }, + { + "epoch": 5.6508078994614, + "grad_norm": 9.277400970458984, + "learning_rate": 2.5801795332136446e-05, + "loss": 6.1747, + "step": 31475 + }, + { + "epoch": 5.655296229802514, + "grad_norm": 8.623818397521973, + "learning_rate": 2.5795810891681628e-05, + "loss": 5.9572, + "step": 31500 + }, + { + "epoch": 5.659784560143627, + "grad_norm": 10.166696548461914, + "learning_rate": 2.5789826451226814e-05, + "loss": 6.1225, + "step": 31525 + }, + { + "epoch": 5.66427289048474, + "grad_norm": 8.197763442993164, + "learning_rate": 2.5783842010771993e-05, + "loss": 6.135, + "step": 31550 + }, + { + "epoch": 5.668761220825853, + "grad_norm": 9.28738021850586, + "learning_rate": 2.5777857570317175e-05, + "loss": 6.0542, + "step": 31575 + }, + { + "epoch": 5.673249551166966, + "grad_norm": 8.472845077514648, + "learning_rate": 2.5771873129862357e-05, + "loss": 6.1414, + "step": 31600 + }, + { + "epoch": 5.6777378815080795, + "grad_norm": 9.32778263092041, + "learning_rate": 2.576588868940754e-05, + "loss": 6.207, + "step": 31625 + }, + { + "epoch": 5.682226211849192, + "grad_norm": 9.901814460754395, + "learning_rate": 2.5759904248952722e-05, + "loss": 6.2647, + "step": 31650 + }, + { + "epoch": 5.686714542190305, + "grad_norm": 9.489412307739258, + "learning_rate": 2.5753919808497908e-05, + "loss": 6.1074, + "step": 31675 + }, + { + "epoch": 5.691202872531418, + "grad_norm": 8.911075592041016, + "learning_rate": 2.574793536804309e-05, + "loss": 6.2419, + "step": 31700 + 
}, + { + "epoch": 5.695691202872531, + "grad_norm": 7.754744529724121, + "learning_rate": 2.5741950927588273e-05, + "loss": 5.999, + "step": 31725 + }, + { + "epoch": 5.700179533213644, + "grad_norm": 8.506721496582031, + "learning_rate": 2.573596648713345e-05, + "loss": 6.194, + "step": 31750 + }, + { + "epoch": 5.704667863554757, + "grad_norm": 9.833474159240723, + "learning_rate": 2.5729982046678634e-05, + "loss": 6.1371, + "step": 31775 + }, + { + "epoch": 5.709156193895871, + "grad_norm": 9.112914085388184, + "learning_rate": 2.572399760622382e-05, + "loss": 5.9992, + "step": 31800 + }, + { + "epoch": 5.713644524236984, + "grad_norm": 10.660280227661133, + "learning_rate": 2.5718013165769002e-05, + "loss": 6.1335, + "step": 31825 + }, + { + "epoch": 5.718132854578097, + "grad_norm": 9.861505508422852, + "learning_rate": 2.5712028725314184e-05, + "loss": 6.0151, + "step": 31850 + }, + { + "epoch": 5.72262118491921, + "grad_norm": 9.20781421661377, + "learning_rate": 2.5706044284859367e-05, + "loss": 6.2727, + "step": 31875 + }, + { + "epoch": 5.727109515260323, + "grad_norm": 8.990876197814941, + "learning_rate": 2.570005984440455e-05, + "loss": 6.2274, + "step": 31900 + }, + { + "epoch": 5.731597845601437, + "grad_norm": 8.423699378967285, + "learning_rate": 2.569407540394973e-05, + "loss": 6.0746, + "step": 31925 + }, + { + "epoch": 5.73608617594255, + "grad_norm": 7.585338115692139, + "learning_rate": 2.5688090963494914e-05, + "loss": 6.2263, + "step": 31950 + }, + { + "epoch": 5.740574506283663, + "grad_norm": 10.411974906921387, + "learning_rate": 2.5682106523040096e-05, + "loss": 6.137, + "step": 31975 + }, + { + "epoch": 5.745062836624776, + "grad_norm": 9.560766220092773, + "learning_rate": 2.567612208258528e-05, + "loss": 5.9217, + "step": 32000 + }, + { + "epoch": 5.7495511669658885, + "grad_norm": 8.849239349365234, + "learning_rate": 2.567013764213046e-05, + "loss": 6.0139, + "step": 32025 + }, + { + "epoch": 5.7540394973070015, + "grad_norm": 
7.742386341094971, + "learning_rate": 2.5664153201675643e-05, + "loss": 6.1323, + "step": 32050 + }, + { + "epoch": 5.758527827648114, + "grad_norm": 8.805398941040039, + "learning_rate": 2.5658168761220826e-05, + "loss": 6.0455, + "step": 32075 + }, + { + "epoch": 5.763016157989228, + "grad_norm": 8.419034957885742, + "learning_rate": 2.565218432076601e-05, + "loss": 5.9544, + "step": 32100 + }, + { + "epoch": 5.767504488330341, + "grad_norm": 7.9658708572387695, + "learning_rate": 2.5646199880311194e-05, + "loss": 6.1519, + "step": 32125 + }, + { + "epoch": 5.771992818671454, + "grad_norm": 9.581972122192383, + "learning_rate": 2.5640215439856373e-05, + "loss": 6.1542, + "step": 32150 + }, + { + "epoch": 5.776481149012567, + "grad_norm": 9.806302070617676, + "learning_rate": 2.5634230999401555e-05, + "loss": 6.1731, + "step": 32175 + }, + { + "epoch": 5.78096947935368, + "grad_norm": 8.986441612243652, + "learning_rate": 2.5628246558946737e-05, + "loss": 6.1351, + "step": 32200 + }, + { + "epoch": 5.785457809694794, + "grad_norm": 8.912154197692871, + "learning_rate": 2.562226211849192e-05, + "loss": 6.3057, + "step": 32225 + }, + { + "epoch": 5.789946140035907, + "grad_norm": 9.716904640197754, + "learning_rate": 2.5616277678037106e-05, + "loss": 6.0585, + "step": 32250 + }, + { + "epoch": 5.79443447037702, + "grad_norm": 9.447417259216309, + "learning_rate": 2.5610293237582288e-05, + "loss": 6.1429, + "step": 32275 + }, + { + "epoch": 5.798922800718133, + "grad_norm": 8.467061996459961, + "learning_rate": 2.560430879712747e-05, + "loss": 6.1556, + "step": 32300 + }, + { + "epoch": 5.803411131059246, + "grad_norm": 8.498756408691406, + "learning_rate": 2.5598324356672653e-05, + "loss": 6.1585, + "step": 32325 + }, + { + "epoch": 5.807899461400359, + "grad_norm": 8.223047256469727, + "learning_rate": 2.559233991621783e-05, + "loss": 6.1577, + "step": 32350 + }, + { + "epoch": 5.812387791741472, + "grad_norm": 8.135844230651855, + "learning_rate": 
2.5586355475763017e-05, + "loss": 6.1309, + "step": 32375 + }, + { + "epoch": 5.8168761220825855, + "grad_norm": 8.580695152282715, + "learning_rate": 2.5580610412926394e-05, + "loss": 5.9329, + "step": 32400 + }, + { + "epoch": 5.8213644524236985, + "grad_norm": 9.210616111755371, + "learning_rate": 2.5574625972471577e-05, + "loss": 6.0831, + "step": 32425 + }, + { + "epoch": 5.825852782764811, + "grad_norm": 8.864201545715332, + "learning_rate": 2.5568641532016756e-05, + "loss": 6.0747, + "step": 32450 + }, + { + "epoch": 5.830341113105924, + "grad_norm": 9.814947128295898, + "learning_rate": 2.5562657091561938e-05, + "loss": 6.1446, + "step": 32475 + }, + { + "epoch": 5.834829443447037, + "grad_norm": 8.536795616149902, + "learning_rate": 2.555667265110712e-05, + "loss": 6.0148, + "step": 32500 + }, + { + "epoch": 5.839317773788151, + "grad_norm": 8.667900085449219, + "learning_rate": 2.5550688210652303e-05, + "loss": 6.1862, + "step": 32525 + }, + { + "epoch": 5.843806104129264, + "grad_norm": 9.259778022766113, + "learning_rate": 2.554470377019749e-05, + "loss": 6.0538, + "step": 32550 + }, + { + "epoch": 5.848294434470377, + "grad_norm": 8.16540241241455, + "learning_rate": 2.553871932974267e-05, + "loss": 6.1674, + "step": 32575 + }, + { + "epoch": 5.85278276481149, + "grad_norm": 9.87695026397705, + "learning_rate": 2.5532734889287853e-05, + "loss": 6.1241, + "step": 32600 + }, + { + "epoch": 5.857271095152603, + "grad_norm": 7.93190336227417, + "learning_rate": 2.5526750448833036e-05, + "loss": 6.1678, + "step": 32625 + }, + { + "epoch": 5.861759425493716, + "grad_norm": 8.621042251586914, + "learning_rate": 2.5520766008378215e-05, + "loss": 6.177, + "step": 32650 + }, + { + "epoch": 5.86624775583483, + "grad_norm": 8.15766716003418, + "learning_rate": 2.5514781567923397e-05, + "loss": 5.9493, + "step": 32675 + }, + { + "epoch": 5.870736086175943, + "grad_norm": 8.991899490356445, + "learning_rate": 2.5508797127468583e-05, + "loss": 6.076, + "step": 32700 
+ }, + { + "epoch": 5.875224416517056, + "grad_norm": 8.945096969604492, + "learning_rate": 2.5502812687013765e-05, + "loss": 6.1515, + "step": 32725 + }, + { + "epoch": 5.879712746858169, + "grad_norm": 8.60977840423584, + "learning_rate": 2.5496828246558947e-05, + "loss": 6.1188, + "step": 32750 + }, + { + "epoch": 5.884201077199282, + "grad_norm": 9.31452751159668, + "learning_rate": 2.549084380610413e-05, + "loss": 6.0941, + "step": 32775 + }, + { + "epoch": 5.888689407540395, + "grad_norm": 8.356807708740234, + "learning_rate": 2.5484859365649312e-05, + "loss": 6.1485, + "step": 32800 + }, + { + "epoch": 5.8931777378815084, + "grad_norm": 7.935510635375977, + "learning_rate": 2.5478874925194498e-05, + "loss": 6.0675, + "step": 32825 + }, + { + "epoch": 5.897666068222621, + "grad_norm": 9.616724014282227, + "learning_rate": 2.5472890484739677e-05, + "loss": 6.0589, + "step": 32850 + }, + { + "epoch": 5.902154398563734, + "grad_norm": 8.401680946350098, + "learning_rate": 2.546690604428486e-05, + "loss": 6.0173, + "step": 32875 + }, + { + "epoch": 5.906642728904847, + "grad_norm": 11.485481262207031, + "learning_rate": 2.546092160383004e-05, + "loss": 6.2127, + "step": 32900 + }, + { + "epoch": 5.91113105924596, + "grad_norm": 8.409283638000488, + "learning_rate": 2.5454937163375224e-05, + "loss": 6.0298, + "step": 32925 + }, + { + "epoch": 5.915619389587073, + "grad_norm": 8.633358001708984, + "learning_rate": 2.5448952722920406e-05, + "loss": 6.2317, + "step": 32950 + }, + { + "epoch": 5.920107719928187, + "grad_norm": 8.400113105773926, + "learning_rate": 2.5442968282465592e-05, + "loss": 6.1932, + "step": 32975 + }, + { + "epoch": 5.9245960502693, + "grad_norm": 10.29403018951416, + "learning_rate": 2.5436983842010774e-05, + "loss": 6.0189, + "step": 33000 + }, + { + "epoch": 5.929084380610413, + "grad_norm": 9.123688697814941, + "learning_rate": 2.5430999401555957e-05, + "loss": 6.3111, + "step": 33025 + }, + { + "epoch": 5.933572710951526, + "grad_norm": 
9.55163860321045, + "learning_rate": 2.5425014961101136e-05, + "loss": 6.169, + "step": 33050 + }, + { + "epoch": 5.938061041292639, + "grad_norm": 8.139374732971191, + "learning_rate": 2.5419030520646318e-05, + "loss": 5.9897, + "step": 33075 + }, + { + "epoch": 5.942549371633753, + "grad_norm": 8.95219612121582, + "learning_rate": 2.54130460801915e-05, + "loss": 6.0864, + "step": 33100 + }, + { + "epoch": 5.947037701974866, + "grad_norm": 10.874579429626465, + "learning_rate": 2.5407061639736686e-05, + "loss": 6.1264, + "step": 33125 + }, + { + "epoch": 5.951526032315979, + "grad_norm": 8.47079086303711, + "learning_rate": 2.540107719928187e-05, + "loss": 6.1167, + "step": 33150 + }, + { + "epoch": 5.956014362657092, + "grad_norm": 7.829136848449707, + "learning_rate": 2.539509275882705e-05, + "loss": 6.038, + "step": 33175 + }, + { + "epoch": 5.960502692998205, + "grad_norm": 9.986525535583496, + "learning_rate": 2.5389108318372233e-05, + "loss": 6.1598, + "step": 33200 + }, + { + "epoch": 5.9649910233393175, + "grad_norm": 9.348833084106445, + "learning_rate": 2.5383123877917416e-05, + "loss": 6.2099, + "step": 33225 + }, + { + "epoch": 5.9694793536804305, + "grad_norm": 10.55340576171875, + "learning_rate": 2.5377139437462598e-05, + "loss": 6.0598, + "step": 33250 + }, + { + "epoch": 5.973967684021544, + "grad_norm": 8.950223922729492, + "learning_rate": 2.537115499700778e-05, + "loss": 6.0533, + "step": 33275 + }, + { + "epoch": 5.978456014362657, + "grad_norm": 10.24989128112793, + "learning_rate": 2.5365170556552963e-05, + "loss": 6.0104, + "step": 33300 + }, + { + "epoch": 5.98294434470377, + "grad_norm": 8.867486000061035, + "learning_rate": 2.5359186116098145e-05, + "loss": 6.0798, + "step": 33325 + }, + { + "epoch": 5.987432675044883, + "grad_norm": 9.54025650024414, + "learning_rate": 2.5353201675643327e-05, + "loss": 6.213, + "step": 33350 + }, + { + "epoch": 5.991921005385996, + "grad_norm": 9.275188446044922, + "learning_rate": 
2.534721723518851e-05, + "loss": 6.1139, + "step": 33375 + }, + { + "epoch": 5.99640933572711, + "grad_norm": 8.314274787902832, + "learning_rate": 2.5341232794733696e-05, + "loss": 5.8898, + "step": 33400 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.07627976247302999, + "eval_f1_macro": 0.003975815843346546, + "eval_f1_micro": 0.07627976247302999, + "eval_f1_weighted": 0.03663957321491088, + "eval_loss": 6.930175304412842, + "eval_precision_macro": 0.0036745100499673464, + "eval_precision_micro": 0.07627976247302999, + "eval_precision_weighted": 0.029456577213192756, + "eval_recall_macro": 0.007695994325906138, + "eval_recall_micro": 0.07627976247302999, + "eval_recall_weighted": 0.07627976247302999, + "eval_runtime": 63.5577, + "eval_samples_per_second": 824.023, + "eval_steps_per_second": 25.756, + "step": 33420 + }, + { + "epoch": 6.000897666068223, + "grad_norm": 7.869987487792969, + "learning_rate": 2.5335248354278878e-05, + "loss": 5.9745, + "step": 33425 + }, + { + "epoch": 6.005385996409336, + "grad_norm": 7.98681116104126, + "learning_rate": 2.5329263913824057e-05, + "loss": 5.7044, + "step": 33450 + }, + { + "epoch": 6.009874326750449, + "grad_norm": 9.027192115783691, + "learning_rate": 2.532327947336924e-05, + "loss": 5.5881, + "step": 33475 + }, + { + "epoch": 6.014362657091562, + "grad_norm": 9.088212013244629, + "learning_rate": 2.531729503291442e-05, + "loss": 5.7827, + "step": 33500 + }, + { + "epoch": 6.018850987432675, + "grad_norm": 8.62173080444336, + "learning_rate": 2.5311310592459604e-05, + "loss": 5.6908, + "step": 33525 + }, + { + "epoch": 6.023339317773788, + "grad_norm": 9.273299217224121, + "learning_rate": 2.530532615200479e-05, + "loss": 5.694, + "step": 33550 + }, + { + "epoch": 6.027827648114902, + "grad_norm": 9.77273941040039, + "learning_rate": 2.5299341711549972e-05, + "loss": 5.7308, + "step": 33575 + }, + { + "epoch": 6.0323159784560145, + "grad_norm": 8.67655086517334, + "learning_rate": 2.5293357271095154e-05, + "loss": 
5.7343, + "step": 33600 + }, + { + "epoch": 6.0368043087971275, + "grad_norm": 9.977194786071777, + "learning_rate": 2.5287372830640337e-05, + "loss": 5.7499, + "step": 33625 + }, + { + "epoch": 6.04129263913824, + "grad_norm": 9.110048294067383, + "learning_rate": 2.5281388390185516e-05, + "loss": 5.6984, + "step": 33650 + }, + { + "epoch": 6.045780969479353, + "grad_norm": 9.762101173400879, + "learning_rate": 2.52754039497307e-05, + "loss": 5.7261, + "step": 33675 + }, + { + "epoch": 6.050269299820466, + "grad_norm": 9.377640724182129, + "learning_rate": 2.5269419509275884e-05, + "loss": 5.6832, + "step": 33700 + }, + { + "epoch": 6.05475763016158, + "grad_norm": 9.664482116699219, + "learning_rate": 2.5263435068821066e-05, + "loss": 5.7031, + "step": 33725 + }, + { + "epoch": 6.059245960502693, + "grad_norm": 8.087955474853516, + "learning_rate": 2.525745062836625e-05, + "loss": 5.8648, + "step": 33750 + }, + { + "epoch": 6.063734290843806, + "grad_norm": 9.443461418151855, + "learning_rate": 2.525146618791143e-05, + "loss": 5.8017, + "step": 33775 + }, + { + "epoch": 6.068222621184919, + "grad_norm": 8.729752540588379, + "learning_rate": 2.5245481747456613e-05, + "loss": 5.419, + "step": 33800 + }, + { + "epoch": 6.072710951526032, + "grad_norm": 10.972711563110352, + "learning_rate": 2.52394973070018e-05, + "loss": 5.7769, + "step": 33825 + }, + { + "epoch": 6.077199281867145, + "grad_norm": 9.570054054260254, + "learning_rate": 2.5233512866546978e-05, + "loss": 5.692, + "step": 33850 + }, + { + "epoch": 6.081687612208259, + "grad_norm": 9.418269157409668, + "learning_rate": 2.522752842609216e-05, + "loss": 5.8849, + "step": 33875 + }, + { + "epoch": 6.086175942549372, + "grad_norm": 10.932817459106445, + "learning_rate": 2.5221543985637343e-05, + "loss": 5.7669, + "step": 33900 + }, + { + "epoch": 6.090664272890485, + "grad_norm": 9.56264591217041, + "learning_rate": 2.5215559545182525e-05, + "loss": 5.6504, + "step": 33925 + }, + { + "epoch": 
6.095152603231598, + "grad_norm": 9.188260078430176, + "learning_rate": 2.5209575104727708e-05, + "loss": 5.7282, + "step": 33950 + }, + { + "epoch": 6.099640933572711, + "grad_norm": 8.477288246154785, + "learning_rate": 2.5203590664272893e-05, + "loss": 5.7759, + "step": 33975 + }, + { + "epoch": 6.1041292639138245, + "grad_norm": 7.710765361785889, + "learning_rate": 2.5197606223818076e-05, + "loss": 5.8474, + "step": 34000 + }, + { + "epoch": 6.108617594254937, + "grad_norm": 9.303559303283691, + "learning_rate": 2.5191621783363258e-05, + "loss": 5.8123, + "step": 34025 + }, + { + "epoch": 6.11310592459605, + "grad_norm": 9.044492721557617, + "learning_rate": 2.5185637342908437e-05, + "loss": 5.7294, + "step": 34050 + }, + { + "epoch": 6.117594254937163, + "grad_norm": 8.936657905578613, + "learning_rate": 2.517965290245362e-05, + "loss": 5.7305, + "step": 34075 + }, + { + "epoch": 6.122082585278276, + "grad_norm": 8.897019386291504, + "learning_rate": 2.51736684619988e-05, + "loss": 5.7383, + "step": 34100 + }, + { + "epoch": 6.126570915619389, + "grad_norm": 8.29782485961914, + "learning_rate": 2.5167684021543987e-05, + "loss": 5.7962, + "step": 34125 + }, + { + "epoch": 6.131059245960503, + "grad_norm": 12.11410903930664, + "learning_rate": 2.516169958108917e-05, + "loss": 5.7562, + "step": 34150 + }, + { + "epoch": 6.135547576301616, + "grad_norm": 8.760488510131836, + "learning_rate": 2.5155715140634352e-05, + "loss": 5.6613, + "step": 34175 + }, + { + "epoch": 6.140035906642729, + "grad_norm": 8.568686485290527, + "learning_rate": 2.5149730700179534e-05, + "loss": 5.7525, + "step": 34200 + }, + { + "epoch": 6.144524236983842, + "grad_norm": 8.775039672851562, + "learning_rate": 2.5143746259724713e-05, + "loss": 5.7751, + "step": 34225 + }, + { + "epoch": 6.149012567324955, + "grad_norm": 8.404828071594238, + "learning_rate": 2.51377618192699e-05, + "loss": 5.9142, + "step": 34250 + }, + { + "epoch": 6.153500897666068, + "grad_norm": 8.061406135559082, + 
"learning_rate": 2.513177737881508e-05, + "loss": 5.815, + "step": 34275 + }, + { + "epoch": 6.157989228007182, + "grad_norm": 8.41419792175293, + "learning_rate": 2.5125792938360264e-05, + "loss": 5.798, + "step": 34300 + }, + { + "epoch": 6.162477558348295, + "grad_norm": 8.986761093139648, + "learning_rate": 2.5119808497905446e-05, + "loss": 5.8062, + "step": 34325 + }, + { + "epoch": 6.166965888689408, + "grad_norm": 8.833358764648438, + "learning_rate": 2.511382405745063e-05, + "loss": 5.7584, + "step": 34350 + }, + { + "epoch": 6.171454219030521, + "grad_norm": 8.922972679138184, + "learning_rate": 2.510783961699581e-05, + "loss": 5.742, + "step": 34375 + }, + { + "epoch": 6.175942549371634, + "grad_norm": 8.676275253295898, + "learning_rate": 2.5101855176540997e-05, + "loss": 5.6462, + "step": 34400 + }, + { + "epoch": 6.1804308797127465, + "grad_norm": 8.55473804473877, + "learning_rate": 2.5095870736086176e-05, + "loss": 5.5817, + "step": 34425 + }, + { + "epoch": 6.18491921005386, + "grad_norm": 8.479460716247559, + "learning_rate": 2.5090125673249553e-05, + "loss": 5.9053, + "step": 34450 + }, + { + "epoch": 6.189407540394973, + "grad_norm": 9.15288257598877, + "learning_rate": 2.5084141232794735e-05, + "loss": 5.7955, + "step": 34475 + }, + { + "epoch": 6.193895870736086, + "grad_norm": 9.91638469696045, + "learning_rate": 2.5078156792339918e-05, + "loss": 5.5919, + "step": 34500 + }, + { + "epoch": 6.198384201077199, + "grad_norm": 8.879231452941895, + "learning_rate": 2.50721723518851e-05, + "loss": 5.7176, + "step": 34525 + }, + { + "epoch": 6.202872531418312, + "grad_norm": 8.506264686584473, + "learning_rate": 2.506618791143028e-05, + "loss": 5.6105, + "step": 34550 + }, + { + "epoch": 6.207360861759425, + "grad_norm": 9.407604217529297, + "learning_rate": 2.5060203470975465e-05, + "loss": 5.7968, + "step": 34575 + }, + { + "epoch": 6.211849192100539, + "grad_norm": 9.515223503112793, + "learning_rate": 2.5054219030520647e-05, + "loss": 5.7415, + 
"step": 34600 + }, + { + "epoch": 6.216337522441652, + "grad_norm": 10.325664520263672, + "learning_rate": 2.504823459006583e-05, + "loss": 5.6859, + "step": 34625 + }, + { + "epoch": 6.220825852782765, + "grad_norm": 8.236091613769531, + "learning_rate": 2.5042250149611012e-05, + "loss": 5.8159, + "step": 34650 + }, + { + "epoch": 6.225314183123878, + "grad_norm": 9.277985572814941, + "learning_rate": 2.5036265709156194e-05, + "loss": 5.8614, + "step": 34675 + }, + { + "epoch": 6.229802513464991, + "grad_norm": 9.3956937789917, + "learning_rate": 2.503028126870138e-05, + "loss": 5.6164, + "step": 34700 + }, + { + "epoch": 6.234290843806104, + "grad_norm": 10.08874797821045, + "learning_rate": 2.5024296828246562e-05, + "loss": 6.0337, + "step": 34725 + }, + { + "epoch": 6.238779174147218, + "grad_norm": 9.153406143188477, + "learning_rate": 2.501831238779174e-05, + "loss": 5.7574, + "step": 34750 + }, + { + "epoch": 6.243267504488331, + "grad_norm": 9.721755981445312, + "learning_rate": 2.5012327947336924e-05, + "loss": 5.7113, + "step": 34775 + }, + { + "epoch": 6.2477558348294435, + "grad_norm": 9.609960556030273, + "learning_rate": 2.5006343506882106e-05, + "loss": 5.5291, + "step": 34800 + }, + { + "epoch": 6.2522441651705565, + "grad_norm": 9.225358963012695, + "learning_rate": 2.5000359066427288e-05, + "loss": 5.897, + "step": 34825 + }, + { + "epoch": 6.256732495511669, + "grad_norm": 10.770415306091309, + "learning_rate": 2.4994374625972474e-05, + "loss": 5.8241, + "step": 34850 + }, + { + "epoch": 6.261220825852782, + "grad_norm": 10.333624839782715, + "learning_rate": 2.4988390185517656e-05, + "loss": 5.6986, + "step": 34875 + }, + { + "epoch": 6.265709156193896, + "grad_norm": 9.072614669799805, + "learning_rate": 2.498240574506284e-05, + "loss": 5.6917, + "step": 34900 + }, + { + "epoch": 6.270197486535009, + "grad_norm": 9.984440803527832, + "learning_rate": 2.497642130460802e-05, + "loss": 5.9022, + "step": 34925 + }, + { + "epoch": 6.274685816876122, 
+ "grad_norm": 10.307644844055176, + "learning_rate": 2.49704368641532e-05, + "loss": 5.7163, + "step": 34950 + }, + { + "epoch": 6.279174147217235, + "grad_norm": 9.912421226501465, + "learning_rate": 2.4964452423698382e-05, + "loss": 5.6103, + "step": 34975 + }, + { + "epoch": 6.283662477558348, + "grad_norm": 8.968305587768555, + "learning_rate": 2.4958467983243568e-05, + "loss": 5.7962, + "step": 35000 + }, + { + "epoch": 6.288150807899461, + "grad_norm": 8.762642860412598, + "learning_rate": 2.495248354278875e-05, + "loss": 5.7375, + "step": 35025 + }, + { + "epoch": 6.292639138240575, + "grad_norm": 8.65705680847168, + "learning_rate": 2.4946499102333933e-05, + "loss": 5.7269, + "step": 35050 + }, + { + "epoch": 6.297127468581688, + "grad_norm": 9.499958038330078, + "learning_rate": 2.4940514661879115e-05, + "loss": 5.6918, + "step": 35075 + }, + { + "epoch": 6.301615798922801, + "grad_norm": 10.944747924804688, + "learning_rate": 2.4934530221424298e-05, + "loss": 5.7787, + "step": 35100 + }, + { + "epoch": 6.306104129263914, + "grad_norm": 9.58871078491211, + "learning_rate": 2.492854578096948e-05, + "loss": 5.7288, + "step": 35125 + }, + { + "epoch": 6.310592459605027, + "grad_norm": 9.314966201782227, + "learning_rate": 2.4922561340514662e-05, + "loss": 5.8391, + "step": 35150 + }, + { + "epoch": 6.31508078994614, + "grad_norm": 8.915397644042969, + "learning_rate": 2.4916576900059845e-05, + "loss": 5.8121, + "step": 35175 + }, + { + "epoch": 6.3195691202872535, + "grad_norm": 8.843438148498535, + "learning_rate": 2.4910592459605027e-05, + "loss": 5.6681, + "step": 35200 + }, + { + "epoch": 6.324057450628366, + "grad_norm": 9.160588264465332, + "learning_rate": 2.490460801915021e-05, + "loss": 5.6479, + "step": 35225 + }, + { + "epoch": 6.328545780969479, + "grad_norm": 9.368515014648438, + "learning_rate": 2.4898623578695392e-05, + "loss": 5.7572, + "step": 35250 + }, + { + "epoch": 6.333034111310592, + "grad_norm": 9.279614448547363, + "learning_rate": 
2.4892639138240577e-05, + "loss": 5.8225, + "step": 35275 + }, + { + "epoch": 6.337522441651705, + "grad_norm": 8.391313552856445, + "learning_rate": 2.488665469778576e-05, + "loss": 5.7803, + "step": 35300 + }, + { + "epoch": 6.342010771992818, + "grad_norm": 9.088462829589844, + "learning_rate": 2.488067025733094e-05, + "loss": 5.7724, + "step": 35325 + }, + { + "epoch": 6.346499102333932, + "grad_norm": 9.589491844177246, + "learning_rate": 2.487468581687612e-05, + "loss": 5.7401, + "step": 35350 + }, + { + "epoch": 6.350987432675045, + "grad_norm": 8.865998268127441, + "learning_rate": 2.4868701376421304e-05, + "loss": 5.7122, + "step": 35375 + }, + { + "epoch": 6.355475763016158, + "grad_norm": 9.240813255310059, + "learning_rate": 2.4862716935966486e-05, + "loss": 5.7909, + "step": 35400 + }, + { + "epoch": 6.359964093357271, + "grad_norm": 7.574283599853516, + "learning_rate": 2.485673249551167e-05, + "loss": 5.662, + "step": 35425 + }, + { + "epoch": 6.364452423698384, + "grad_norm": 9.30193042755127, + "learning_rate": 2.4850748055056854e-05, + "loss": 5.7635, + "step": 35450 + }, + { + "epoch": 6.368940754039498, + "grad_norm": 9.604290008544922, + "learning_rate": 2.4844763614602036e-05, + "loss": 5.7246, + "step": 35475 + }, + { + "epoch": 6.373429084380611, + "grad_norm": 9.676158905029297, + "learning_rate": 2.483877917414722e-05, + "loss": 5.7412, + "step": 35500 + }, + { + "epoch": 6.377917414721724, + "grad_norm": 9.000865936279297, + "learning_rate": 2.4832794733692398e-05, + "loss": 5.7319, + "step": 35525 + }, + { + "epoch": 6.382405745062837, + "grad_norm": 10.378793716430664, + "learning_rate": 2.4826810293237583e-05, + "loss": 5.8238, + "step": 35550 + }, + { + "epoch": 6.38689407540395, + "grad_norm": 10.017817497253418, + "learning_rate": 2.4820825852782766e-05, + "loss": 5.9192, + "step": 35575 + }, + { + "epoch": 6.391382405745063, + "grad_norm": 9.022943496704102, + "learning_rate": 2.4814841412327948e-05, + "loss": 5.6597, + "step": 
35600 + }, + { + "epoch": 6.3958707360861755, + "grad_norm": 9.401041030883789, + "learning_rate": 2.480885697187313e-05, + "loss": 5.7151, + "step": 35625 + }, + { + "epoch": 6.400359066427289, + "grad_norm": 10.35769271850586, + "learning_rate": 2.4802872531418313e-05, + "loss": 5.8095, + "step": 35650 + }, + { + "epoch": 6.404847396768402, + "grad_norm": 9.504568099975586, + "learning_rate": 2.4796888090963495e-05, + "loss": 5.945, + "step": 35675 + }, + { + "epoch": 6.409335727109515, + "grad_norm": 9.51905632019043, + "learning_rate": 2.479090365050868e-05, + "loss": 5.8013, + "step": 35700 + }, + { + "epoch": 6.413824057450628, + "grad_norm": 9.524596214294434, + "learning_rate": 2.478491921005386e-05, + "loss": 5.6688, + "step": 35725 + }, + { + "epoch": 6.418312387791741, + "grad_norm": 10.017833709716797, + "learning_rate": 2.4778934769599042e-05, + "loss": 5.8059, + "step": 35750 + }, + { + "epoch": 6.422800718132855, + "grad_norm": 10.921141624450684, + "learning_rate": 2.4772950329144225e-05, + "loss": 5.6042, + "step": 35775 + }, + { + "epoch": 6.427289048473968, + "grad_norm": 8.842256546020508, + "learning_rate": 2.4766965888689407e-05, + "loss": 5.7927, + "step": 35800 + }, + { + "epoch": 6.431777378815081, + "grad_norm": 10.712766647338867, + "learning_rate": 2.476098144823459e-05, + "loss": 5.5418, + "step": 35825 + }, + { + "epoch": 6.436265709156194, + "grad_norm": 9.774335861206055, + "learning_rate": 2.4754997007779775e-05, + "loss": 5.6641, + "step": 35850 + }, + { + "epoch": 6.440754039497307, + "grad_norm": 9.266693115234375, + "learning_rate": 2.4749012567324957e-05, + "loss": 5.7811, + "step": 35875 + }, + { + "epoch": 6.44524236983842, + "grad_norm": 9.181718826293945, + "learning_rate": 2.474302812687014e-05, + "loss": 5.7154, + "step": 35900 + }, + { + "epoch": 6.449730700179533, + "grad_norm": 9.074522018432617, + "learning_rate": 2.473704368641532e-05, + "loss": 5.6474, + "step": 35925 + }, + { + "epoch": 6.454219030520647, + 
"grad_norm": 9.428606986999512, + "learning_rate": 2.47310592459605e-05, + "loss": 5.8058, + "step": 35950 + }, + { + "epoch": 6.45870736086176, + "grad_norm": 9.22999095916748, + "learning_rate": 2.4725074805505684e-05, + "loss": 5.7455, + "step": 35975 + }, + { + "epoch": 6.4631956912028725, + "grad_norm": 8.720118522644043, + "learning_rate": 2.471909036505087e-05, + "loss": 5.8308, + "step": 36000 + }, + { + "epoch": 6.4676840215439855, + "grad_norm": 10.037737846374512, + "learning_rate": 2.471310592459605e-05, + "loss": 5.6963, + "step": 36025 + }, + { + "epoch": 6.472172351885098, + "grad_norm": 8.465198516845703, + "learning_rate": 2.4707121484141234e-05, + "loss": 5.7443, + "step": 36050 + }, + { + "epoch": 6.476660682226212, + "grad_norm": 10.256993293762207, + "learning_rate": 2.4701137043686416e-05, + "loss": 5.7703, + "step": 36075 + }, + { + "epoch": 6.481149012567325, + "grad_norm": 9.542232513427734, + "learning_rate": 2.46951526032316e-05, + "loss": 5.837, + "step": 36100 + }, + { + "epoch": 6.485637342908438, + "grad_norm": 10.536099433898926, + "learning_rate": 2.468916816277678e-05, + "loss": 5.7736, + "step": 36125 + }, + { + "epoch": 6.490125673249551, + "grad_norm": 9.579240798950195, + "learning_rate": 2.4683183722321963e-05, + "loss": 5.7589, + "step": 36150 + }, + { + "epoch": 6.494614003590664, + "grad_norm": 8.462407112121582, + "learning_rate": 2.4677199281867146e-05, + "loss": 5.6682, + "step": 36175 + }, + { + "epoch": 6.499102333931777, + "grad_norm": 8.817401885986328, + "learning_rate": 2.4671214841412328e-05, + "loss": 5.9234, + "step": 36200 + }, + { + "epoch": 6.50359066427289, + "grad_norm": 9.177047729492188, + "learning_rate": 2.466523040095751e-05, + "loss": 5.7723, + "step": 36225 + }, + { + "epoch": 6.508078994614004, + "grad_norm": 10.70118236541748, + "learning_rate": 2.4659245960502693e-05, + "loss": 5.6541, + "step": 36250 + }, + { + "epoch": 6.512567324955117, + "grad_norm": 9.889698028564453, + "learning_rate": 
2.465326152004788e-05, + "loss": 5.7462, + "step": 36275 + }, + { + "epoch": 6.51705565529623, + "grad_norm": 8.898477554321289, + "learning_rate": 2.464727707959306e-05, + "loss": 5.8923, + "step": 36300 + }, + { + "epoch": 6.521543985637343, + "grad_norm": 10.44539737701416, + "learning_rate": 2.464129263913824e-05, + "loss": 5.8842, + "step": 36325 + }, + { + "epoch": 6.526032315978456, + "grad_norm": 9.89045524597168, + "learning_rate": 2.4635308198683422e-05, + "loss": 5.7843, + "step": 36350 + }, + { + "epoch": 6.5305206463195695, + "grad_norm": 8.804848670959473, + "learning_rate": 2.4629323758228605e-05, + "loss": 5.8822, + "step": 36375 + }, + { + "epoch": 6.5350089766606825, + "grad_norm": 8.936808586120605, + "learning_rate": 2.4623339317773787e-05, + "loss": 5.6481, + "step": 36400 + }, + { + "epoch": 6.539497307001795, + "grad_norm": 9.736496925354004, + "learning_rate": 2.4617354877318973e-05, + "loss": 5.9256, + "step": 36425 + }, + { + "epoch": 6.543985637342908, + "grad_norm": 10.274374961853027, + "learning_rate": 2.4611370436864155e-05, + "loss": 5.8067, + "step": 36450 + }, + { + "epoch": 6.548473967684021, + "grad_norm": 9.859149932861328, + "learning_rate": 2.4605385996409338e-05, + "loss": 5.8227, + "step": 36475 + }, + { + "epoch": 6.552962298025134, + "grad_norm": 9.181900024414062, + "learning_rate": 2.459940155595452e-05, + "loss": 5.9223, + "step": 36500 + }, + { + "epoch": 6.557450628366247, + "grad_norm": 9.238310813903809, + "learning_rate": 2.45934171154997e-05, + "loss": 5.8239, + "step": 36525 + }, + { + "epoch": 6.561938958707361, + "grad_norm": 8.887263298034668, + "learning_rate": 2.4587672052663076e-05, + "loss": 5.943, + "step": 36550 + }, + { + "epoch": 6.566427289048474, + "grad_norm": 8.125824928283691, + "learning_rate": 2.458168761220826e-05, + "loss": 5.8003, + "step": 36575 + }, + { + "epoch": 6.570915619389587, + "grad_norm": 8.68818187713623, + "learning_rate": 2.4575703171753444e-05, + "loss": 5.9282, + "step": 36600 
+ }, + { + "epoch": 6.5754039497307, + "grad_norm": 9.140263557434082, + "learning_rate": 2.4569718731298623e-05, + "loss": 5.8239, + "step": 36625 + }, + { + "epoch": 6.579892280071813, + "grad_norm": 9.38505744934082, + "learning_rate": 2.4563734290843805e-05, + "loss": 5.9583, + "step": 36650 + }, + { + "epoch": 6.584380610412927, + "grad_norm": 9.838369369506836, + "learning_rate": 2.4557749850388988e-05, + "loss": 5.7194, + "step": 36675 + }, + { + "epoch": 6.58886894075404, + "grad_norm": 10.097395896911621, + "learning_rate": 2.455176540993417e-05, + "loss": 6.105, + "step": 36700 + }, + { + "epoch": 6.593357271095153, + "grad_norm": 9.800671577453613, + "learning_rate": 2.4545780969479356e-05, + "loss": 5.7369, + "step": 36725 + }, + { + "epoch": 6.597845601436266, + "grad_norm": 9.298616409301758, + "learning_rate": 2.4539796529024538e-05, + "loss": 5.87, + "step": 36750 + }, + { + "epoch": 6.602333931777379, + "grad_norm": 9.130125045776367, + "learning_rate": 2.453381208856972e-05, + "loss": 5.9072, + "step": 36775 + }, + { + "epoch": 6.6068222621184916, + "grad_norm": 8.9825439453125, + "learning_rate": 2.4527827648114903e-05, + "loss": 5.9422, + "step": 36800 + }, + { + "epoch": 6.611310592459605, + "grad_norm": 8.905620574951172, + "learning_rate": 2.4521843207660082e-05, + "loss": 5.8875, + "step": 36825 + }, + { + "epoch": 6.615798922800718, + "grad_norm": 9.726286888122559, + "learning_rate": 2.4515858767205264e-05, + "loss": 5.8607, + "step": 36850 + }, + { + "epoch": 6.620287253141831, + "grad_norm": 7.904778480529785, + "learning_rate": 2.450987432675045e-05, + "loss": 5.9336, + "step": 36875 + }, + { + "epoch": 6.624775583482944, + "grad_norm": 8.767878532409668, + "learning_rate": 2.4503889886295632e-05, + "loss": 5.8486, + "step": 36900 + }, + { + "epoch": 6.629263913824057, + "grad_norm": 11.202163696289062, + "learning_rate": 2.4497905445840815e-05, + "loss": 5.9577, + "step": 36925 + }, + { + "epoch": 6.63375224416517, + "grad_norm": 
8.968244552612305, + "learning_rate": 2.4491921005385997e-05, + "loss": 5.7076, + "step": 36950 + }, + { + "epoch": 6.638240574506284, + "grad_norm": 9.815292358398438, + "learning_rate": 2.448593656493118e-05, + "loss": 5.8689, + "step": 36975 + }, + { + "epoch": 6.642728904847397, + "grad_norm": 8.430344581604004, + "learning_rate": 2.4479952124476365e-05, + "loss": 5.96, + "step": 37000 + }, + { + "epoch": 6.64721723518851, + "grad_norm": 9.692878723144531, + "learning_rate": 2.4473967684021544e-05, + "loss": 5.9259, + "step": 37025 + }, + { + "epoch": 6.651705565529623, + "grad_norm": 9.201183319091797, + "learning_rate": 2.4467983243566727e-05, + "loss": 5.9345, + "step": 37050 + }, + { + "epoch": 6.656193895870736, + "grad_norm": 13.81981086730957, + "learning_rate": 2.446199880311191e-05, + "loss": 5.7239, + "step": 37075 + }, + { + "epoch": 6.660682226211849, + "grad_norm": 8.707911491394043, + "learning_rate": 2.445601436265709e-05, + "loss": 5.7388, + "step": 37100 + }, + { + "epoch": 6.665170556552963, + "grad_norm": 10.306379318237305, + "learning_rate": 2.4450029922202274e-05, + "loss": 5.7202, + "step": 37125 + }, + { + "epoch": 6.669658886894076, + "grad_norm": 8.206708908081055, + "learning_rate": 2.444404548174746e-05, + "loss": 5.8642, + "step": 37150 + }, + { + "epoch": 6.674147217235189, + "grad_norm": 9.392040252685547, + "learning_rate": 2.4438061041292642e-05, + "loss": 5.813, + "step": 37175 + }, + { + "epoch": 6.6786355475763015, + "grad_norm": 11.101545333862305, + "learning_rate": 2.4432076600837824e-05, + "loss": 5.7568, + "step": 37200 + }, + { + "epoch": 6.6831238779174145, + "grad_norm": 9.007015228271484, + "learning_rate": 2.4426092160383003e-05, + "loss": 5.8152, + "step": 37225 + }, + { + "epoch": 6.687612208258528, + "grad_norm": 10.664595603942871, + "learning_rate": 2.4420107719928185e-05, + "loss": 5.9214, + "step": 37250 + }, + { + "epoch": 6.692100538599641, + "grad_norm": 9.10335922241211, + "learning_rate": 
2.4414123279473368e-05, + "loss": 5.8131, + "step": 37275 + }, + { + "epoch": 6.696588868940754, + "grad_norm": 8.914325714111328, + "learning_rate": 2.4408138839018554e-05, + "loss": 5.7261, + "step": 37300 + }, + { + "epoch": 6.701077199281867, + "grad_norm": 10.727677345275879, + "learning_rate": 2.4402154398563736e-05, + "loss": 5.8609, + "step": 37325 + }, + { + "epoch": 6.70556552962298, + "grad_norm": 7.813395977020264, + "learning_rate": 2.4396169958108918e-05, + "loss": 5.812, + "step": 37350 + }, + { + "epoch": 6.710053859964093, + "grad_norm": 8.776046752929688, + "learning_rate": 2.43901855176541e-05, + "loss": 5.7963, + "step": 37375 + }, + { + "epoch": 6.714542190305206, + "grad_norm": 9.372112274169922, + "learning_rate": 2.4384201077199283e-05, + "loss": 5.8208, + "step": 37400 + }, + { + "epoch": 6.71903052064632, + "grad_norm": 8.833480834960938, + "learning_rate": 2.4378216636744465e-05, + "loss": 5.7626, + "step": 37425 + }, + { + "epoch": 6.723518850987433, + "grad_norm": 9.232447624206543, + "learning_rate": 2.4372232196289648e-05, + "loss": 5.7707, + "step": 37450 + }, + { + "epoch": 6.728007181328546, + "grad_norm": 10.184748649597168, + "learning_rate": 2.436624775583483e-05, + "loss": 5.6922, + "step": 37475 + }, + { + "epoch": 6.732495511669659, + "grad_norm": 10.609040260314941, + "learning_rate": 2.4360263315380012e-05, + "loss": 5.8215, + "step": 37500 + }, + { + "epoch": 6.736983842010772, + "grad_norm": 9.96190071105957, + "learning_rate": 2.4354278874925195e-05, + "loss": 5.7939, + "step": 37525 + }, + { + "epoch": 6.741472172351886, + "grad_norm": 9.059562683105469, + "learning_rate": 2.4348294434470377e-05, + "loss": 5.9082, + "step": 37550 + }, + { + "epoch": 6.7459605026929985, + "grad_norm": 10.30077075958252, + "learning_rate": 2.4342309994015563e-05, + "loss": 5.9057, + "step": 37575 + }, + { + "epoch": 6.7504488330341115, + "grad_norm": 9.614755630493164, + "learning_rate": 2.4336325553560745e-05, + "loss": 5.8732, + "step": 
37600 + }, + { + "epoch": 6.754937163375224, + "grad_norm": 9.436674118041992, + "learning_rate": 2.4330341113105924e-05, + "loss": 5.8891, + "step": 37625 + }, + { + "epoch": 6.759425493716337, + "grad_norm": 11.316673278808594, + "learning_rate": 2.4324356672651107e-05, + "loss": 5.7613, + "step": 37650 + }, + { + "epoch": 6.76391382405745, + "grad_norm": 10.276429176330566, + "learning_rate": 2.431837223219629e-05, + "loss": 5.7835, + "step": 37675 + }, + { + "epoch": 6.768402154398563, + "grad_norm": 10.052189826965332, + "learning_rate": 2.431238779174147e-05, + "loss": 5.7819, + "step": 37700 + }, + { + "epoch": 6.772890484739677, + "grad_norm": 10.904725074768066, + "learning_rate": 2.4306403351286657e-05, + "loss": 5.7312, + "step": 37725 + }, + { + "epoch": 6.77737881508079, + "grad_norm": 9.266088485717773, + "learning_rate": 2.430041891083184e-05, + "loss": 5.6609, + "step": 37750 + }, + { + "epoch": 6.781867145421903, + "grad_norm": 8.685859680175781, + "learning_rate": 2.4294434470377022e-05, + "loss": 5.8181, + "step": 37775 + }, + { + "epoch": 6.786355475763016, + "grad_norm": 9.39303970336914, + "learning_rate": 2.4288450029922204e-05, + "loss": 5.7122, + "step": 37800 + }, + { + "epoch": 6.790843806104129, + "grad_norm": 9.141033172607422, + "learning_rate": 2.4282465589467383e-05, + "loss": 5.7904, + "step": 37825 + }, + { + "epoch": 6.795332136445243, + "grad_norm": 10.645248413085938, + "learning_rate": 2.4276481149012565e-05, + "loss": 5.8067, + "step": 37850 + }, + { + "epoch": 6.799820466786356, + "grad_norm": 9.40449333190918, + "learning_rate": 2.427049670855775e-05, + "loss": 5.7513, + "step": 37875 + }, + { + "epoch": 6.804308797127469, + "grad_norm": 9.839279174804688, + "learning_rate": 2.4264512268102934e-05, + "loss": 5.7287, + "step": 37900 + }, + { + "epoch": 6.808797127468582, + "grad_norm": 9.20603084564209, + "learning_rate": 2.4258527827648116e-05, + "loss": 5.9223, + "step": 37925 + }, + { + "epoch": 6.813285457809695, + 
"grad_norm": 10.668702125549316, + "learning_rate": 2.4252543387193298e-05, + "loss": 5.7938, + "step": 37950 + }, + { + "epoch": 6.817773788150808, + "grad_norm": 9.569756507873535, + "learning_rate": 2.424655894673848e-05, + "loss": 5.6479, + "step": 37975 + }, + { + "epoch": 6.8222621184919205, + "grad_norm": 9.890453338623047, + "learning_rate": 2.4240574506283663e-05, + "loss": 5.8845, + "step": 38000 + }, + { + "epoch": 6.826750448833034, + "grad_norm": 9.072249412536621, + "learning_rate": 2.4234590065828845e-05, + "loss": 5.8863, + "step": 38025 + }, + { + "epoch": 6.831238779174147, + "grad_norm": 9.59393310546875, + "learning_rate": 2.4228605625374028e-05, + "loss": 5.8775, + "step": 38050 + }, + { + "epoch": 6.83572710951526, + "grad_norm": 9.82231616973877, + "learning_rate": 2.422262118491921e-05, + "loss": 5.8821, + "step": 38075 + }, + { + "epoch": 6.840215439856373, + "grad_norm": 10.136720657348633, + "learning_rate": 2.4216636744464392e-05, + "loss": 5.8189, + "step": 38100 + }, + { + "epoch": 6.844703770197486, + "grad_norm": 10.297647476196289, + "learning_rate": 2.4210652304009575e-05, + "loss": 5.8341, + "step": 38125 + }, + { + "epoch": 6.8491921005386, + "grad_norm": 10.837604522705078, + "learning_rate": 2.420466786355476e-05, + "loss": 5.582, + "step": 38150 + }, + { + "epoch": 6.853680430879713, + "grad_norm": 9.731552124023438, + "learning_rate": 2.4198683423099943e-05, + "loss": 5.8016, + "step": 38175 + }, + { + "epoch": 6.858168761220826, + "grad_norm": 9.482463836669922, + "learning_rate": 2.4192698982645122e-05, + "loss": 5.7077, + "step": 38200 + }, + { + "epoch": 6.862657091561939, + "grad_norm": 9.485462188720703, + "learning_rate": 2.4186714542190304e-05, + "loss": 5.6867, + "step": 38225 + }, + { + "epoch": 6.867145421903052, + "grad_norm": 9.789255142211914, + "learning_rate": 2.4180730101735487e-05, + "loss": 5.7873, + "step": 38250 + }, + { + "epoch": 6.871633752244165, + "grad_norm": 9.897143363952637, + "learning_rate": 
2.417474566128067e-05, + "loss": 5.7034, + "step": 38275 + }, + { + "epoch": 6.876122082585278, + "grad_norm": 10.066938400268555, + "learning_rate": 2.4168761220825855e-05, + "loss": 5.7281, + "step": 38300 + }, + { + "epoch": 6.880610412926392, + "grad_norm": 9.671385765075684, + "learning_rate": 2.4162776780371037e-05, + "loss": 5.7561, + "step": 38325 + }, + { + "epoch": 6.885098743267505, + "grad_norm": 9.676133155822754, + "learning_rate": 2.415679233991622e-05, + "loss": 5.7139, + "step": 38350 + }, + { + "epoch": 6.8895870736086176, + "grad_norm": 9.995516777038574, + "learning_rate": 2.4150807899461402e-05, + "loss": 5.5944, + "step": 38375 + }, + { + "epoch": 6.8940754039497305, + "grad_norm": 10.439031600952148, + "learning_rate": 2.414482345900658e-05, + "loss": 5.7707, + "step": 38400 + }, + { + "epoch": 6.8985637342908435, + "grad_norm": 8.98166561126709, + "learning_rate": 2.4138839018551766e-05, + "loss": 5.8929, + "step": 38425 + }, + { + "epoch": 6.903052064631957, + "grad_norm": 11.367774963378906, + "learning_rate": 2.413285457809695e-05, + "loss": 5.9908, + "step": 38450 + }, + { + "epoch": 6.90754039497307, + "grad_norm": 8.81797981262207, + "learning_rate": 2.412687013764213e-05, + "loss": 5.8561, + "step": 38475 + }, + { + "epoch": 6.912028725314183, + "grad_norm": 9.727533340454102, + "learning_rate": 2.4120885697187314e-05, + "loss": 5.8841, + "step": 38500 + }, + { + "epoch": 6.916517055655296, + "grad_norm": 9.541086196899414, + "learning_rate": 2.4114901256732496e-05, + "loss": 5.7478, + "step": 38525 + }, + { + "epoch": 6.921005385996409, + "grad_norm": 10.143462181091309, + "learning_rate": 2.4108916816277678e-05, + "loss": 5.7486, + "step": 38550 + }, + { + "epoch": 6.925493716337522, + "grad_norm": 10.27147102355957, + "learning_rate": 2.4102932375822864e-05, + "loss": 5.8008, + "step": 38575 + }, + { + "epoch": 6.929982046678636, + "grad_norm": 9.940827369689941, + "learning_rate": 2.4096947935368043e-05, + "loss": 5.771, + "step": 
38600 + }, + { + "epoch": 6.934470377019749, + "grad_norm": 9.48715591430664, + "learning_rate": 2.4090963494913225e-05, + "loss": 5.7745, + "step": 38625 + }, + { + "epoch": 6.938958707360862, + "grad_norm": 9.222334861755371, + "learning_rate": 2.4084979054458408e-05, + "loss": 5.7087, + "step": 38650 + }, + { + "epoch": 6.943447037701975, + "grad_norm": 10.418964385986328, + "learning_rate": 2.407899461400359e-05, + "loss": 5.9331, + "step": 38675 + }, + { + "epoch": 6.947935368043088, + "grad_norm": 8.198657989501953, + "learning_rate": 2.4073010173548772e-05, + "loss": 5.8461, + "step": 38700 + }, + { + "epoch": 6.952423698384201, + "grad_norm": 9.519707679748535, + "learning_rate": 2.4067025733093958e-05, + "loss": 6.0206, + "step": 38725 + }, + { + "epoch": 6.956912028725315, + "grad_norm": 10.862227439880371, + "learning_rate": 2.406104129263914e-05, + "loss": 5.73, + "step": 38750 + }, + { + "epoch": 6.9614003590664275, + "grad_norm": 9.681299209594727, + "learning_rate": 2.4055056852184323e-05, + "loss": 5.6774, + "step": 38775 + }, + { + "epoch": 6.9658886894075405, + "grad_norm": 9.737951278686523, + "learning_rate": 2.4049072411729502e-05, + "loss": 5.8189, + "step": 38800 + }, + { + "epoch": 6.970377019748653, + "grad_norm": 12.258347511291504, + "learning_rate": 2.4043087971274684e-05, + "loss": 5.7071, + "step": 38825 + }, + { + "epoch": 6.974865350089766, + "grad_norm": 9.436955451965332, + "learning_rate": 2.403710353081987e-05, + "loss": 5.734, + "step": 38850 + }, + { + "epoch": 6.979353680430879, + "grad_norm": 9.793614387512207, + "learning_rate": 2.4031119090365052e-05, + "loss": 5.8598, + "step": 38875 + }, + { + "epoch": 6.983842010771993, + "grad_norm": 9.184218406677246, + "learning_rate": 2.4025134649910235e-05, + "loss": 5.9761, + "step": 38900 + }, + { + "epoch": 6.988330341113106, + "grad_norm": 8.572341918945312, + "learning_rate": 2.4019150209455417e-05, + "loss": 5.6871, + "step": 38925 + }, + { + "epoch": 6.992818671454219, + 
"grad_norm": 8.76677131652832, + "learning_rate": 2.401340514661879e-05, + "loss": 5.8308, + "step": 38950 + }, + { + "epoch": 6.997307001795332, + "grad_norm": 9.224698066711426, + "learning_rate": 2.4007420706163973e-05, + "loss": 5.7715, + "step": 38975 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.07606973058637084, + "eval_f1_macro": 0.004702677240694463, + "eval_f1_micro": 0.07606973058637084, + "eval_f1_weighted": 0.0391321666925596, + "eval_loss": 6.799301624298096, + "eval_precision_macro": 0.004329643339422883, + "eval_precision_micro": 0.07606973058637084, + "eval_precision_weighted": 0.03165539343439144, + "eval_recall_macro": 0.008889571747379817, + "eval_recall_micro": 0.07606973058637084, + "eval_recall_weighted": 0.07606973058637084, + "eval_runtime": 62.9146, + "eval_samples_per_second": 832.445, + "eval_steps_per_second": 26.019, + "step": 38990 + }, + { + "epoch": 7.001795332136445, + "grad_norm": 9.710049629211426, + "learning_rate": 2.4001436265709155e-05, + "loss": 5.6836, + "step": 39000 + }, + { + "epoch": 7.006283662477558, + "grad_norm": 8.726054191589355, + "learning_rate": 2.399545182525434e-05, + "loss": 5.3899, + "step": 39025 + }, + { + "epoch": 7.010771992818672, + "grad_norm": 9.112202644348145, + "learning_rate": 2.3989467384799524e-05, + "loss": 5.3287, + "step": 39050 + }, + { + "epoch": 7.015260323159785, + "grad_norm": 9.081875801086426, + "learning_rate": 2.3983482944344706e-05, + "loss": 5.4535, + "step": 39075 + }, + { + "epoch": 7.019748653500898, + "grad_norm": 9.632508277893066, + "learning_rate": 2.3977498503889885e-05, + "loss": 5.3806, + "step": 39100 + }, + { + "epoch": 7.024236983842011, + "grad_norm": 9.726938247680664, + "learning_rate": 2.3971514063435067e-05, + "loss": 5.4802, + "step": 39125 + }, + { + "epoch": 7.028725314183124, + "grad_norm": 9.837471008300781, + "learning_rate": 2.396552962298025e-05, + "loss": 5.385, + "step": 39150 + }, + { + "epoch": 7.033213644524237, + "grad_norm": 9.670698165893555, + 
"learning_rate": 2.3959545182525435e-05, + "loss": 5.4503, + "step": 39175 + }, + { + "epoch": 7.03770197486535, + "grad_norm": 10.559202194213867, + "learning_rate": 2.3953560742070618e-05, + "loss": 5.3738, + "step": 39200 + }, + { + "epoch": 7.042190305206463, + "grad_norm": 9.533768653869629, + "learning_rate": 2.39475763016158e-05, + "loss": 5.1698, + "step": 39225 + }, + { + "epoch": 7.046678635547576, + "grad_norm": 10.138882637023926, + "learning_rate": 2.3941591861160982e-05, + "loss": 5.385, + "step": 39250 + }, + { + "epoch": 7.051166965888689, + "grad_norm": 10.787941932678223, + "learning_rate": 2.3935607420706165e-05, + "loss": 5.4648, + "step": 39275 + }, + { + "epoch": 7.055655296229802, + "grad_norm": 9.151936531066895, + "learning_rate": 2.3929622980251347e-05, + "loss": 5.5325, + "step": 39300 + }, + { + "epoch": 7.060143626570915, + "grad_norm": 9.089826583862305, + "learning_rate": 2.392363853979653e-05, + "loss": 5.55, + "step": 39325 + }, + { + "epoch": 7.064631956912029, + "grad_norm": 9.018044471740723, + "learning_rate": 2.3917654099341712e-05, + "loss": 5.3544, + "step": 39350 + }, + { + "epoch": 7.069120287253142, + "grad_norm": 8.728989601135254, + "learning_rate": 2.3911669658886894e-05, + "loss": 5.604, + "step": 39375 + }, + { + "epoch": 7.073608617594255, + "grad_norm": 9.207653999328613, + "learning_rate": 2.3905685218432077e-05, + "loss": 5.4294, + "step": 39400 + }, + { + "epoch": 7.078096947935368, + "grad_norm": 9.63049602508545, + "learning_rate": 2.389970077797726e-05, + "loss": 5.4452, + "step": 39425 + }, + { + "epoch": 7.082585278276481, + "grad_norm": 8.939839363098145, + "learning_rate": 2.3893716337522445e-05, + "loss": 5.2951, + "step": 39450 + }, + { + "epoch": 7.087073608617594, + "grad_norm": 8.66301155090332, + "learning_rate": 2.3887731897067627e-05, + "loss": 5.5419, + "step": 39475 + }, + { + "epoch": 7.091561938958708, + "grad_norm": 9.213544845581055, + "learning_rate": 2.3881747456612806e-05, + "loss": 
5.4479, + "step": 39500 + }, + { + "epoch": 7.096050269299821, + "grad_norm": 9.076684951782227, + "learning_rate": 2.387576301615799e-05, + "loss": 5.4291, + "step": 39525 + }, + { + "epoch": 7.100538599640934, + "grad_norm": 9.745841979980469, + "learning_rate": 2.386977857570317e-05, + "loss": 5.4003, + "step": 39550 + }, + { + "epoch": 7.1050269299820465, + "grad_norm": 12.12155818939209, + "learning_rate": 2.3863794135248353e-05, + "loss": 5.4379, + "step": 39575 + }, + { + "epoch": 7.1095152603231595, + "grad_norm": 9.469954490661621, + "learning_rate": 2.385780969479354e-05, + "loss": 5.5717, + "step": 39600 + }, + { + "epoch": 7.1140035906642725, + "grad_norm": 9.757919311523438, + "learning_rate": 2.385182525433872e-05, + "loss": 5.3523, + "step": 39625 + }, + { + "epoch": 7.118491921005386, + "grad_norm": 11.33937931060791, + "learning_rate": 2.3845840813883904e-05, + "loss": 5.3695, + "step": 39650 + }, + { + "epoch": 7.122980251346499, + "grad_norm": 9.602176666259766, + "learning_rate": 2.3839856373429086e-05, + "loss": 5.4546, + "step": 39675 + }, + { + "epoch": 7.127468581687612, + "grad_norm": 10.390071868896484, + "learning_rate": 2.3833871932974265e-05, + "loss": 5.583, + "step": 39700 + }, + { + "epoch": 7.131956912028725, + "grad_norm": 9.772772789001465, + "learning_rate": 2.3827887492519447e-05, + "loss": 5.3846, + "step": 39725 + }, + { + "epoch": 7.136445242369838, + "grad_norm": 9.564618110656738, + "learning_rate": 2.3821903052064633e-05, + "loss": 5.4997, + "step": 39750 + }, + { + "epoch": 7.140933572710951, + "grad_norm": 8.696317672729492, + "learning_rate": 2.3815918611609815e-05, + "loss": 5.3372, + "step": 39775 + }, + { + "epoch": 7.145421903052065, + "grad_norm": 10.570809364318848, + "learning_rate": 2.3809934171154998e-05, + "loss": 5.4987, + "step": 39800 + }, + { + "epoch": 7.149910233393178, + "grad_norm": 9.250133514404297, + "learning_rate": 2.380394973070018e-05, + "loss": 5.3541, + "step": 39825 + }, + { + "epoch": 
7.154398563734291, + "grad_norm": 9.481393814086914, + "learning_rate": 2.3797965290245362e-05, + "loss": 5.4012, + "step": 39850 + }, + { + "epoch": 7.158886894075404, + "grad_norm": 11.07912826538086, + "learning_rate": 2.3791980849790548e-05, + "loss": 5.4963, + "step": 39875 + }, + { + "epoch": 7.163375224416517, + "grad_norm": 11.027650833129883, + "learning_rate": 2.3785996409335727e-05, + "loss": 5.4144, + "step": 39900 + }, + { + "epoch": 7.167863554757631, + "grad_norm": 9.522918701171875, + "learning_rate": 2.378001196888091e-05, + "loss": 5.4286, + "step": 39925 + }, + { + "epoch": 7.1723518850987436, + "grad_norm": 8.930000305175781, + "learning_rate": 2.3774027528426092e-05, + "loss": 5.4323, + "step": 39950 + }, + { + "epoch": 7.1768402154398565, + "grad_norm": 9.531732559204102, + "learning_rate": 2.3768043087971274e-05, + "loss": 5.3782, + "step": 39975 + }, + { + "epoch": 7.1813285457809695, + "grad_norm": 10.367890357971191, + "learning_rate": 2.3762058647516457e-05, + "loss": 5.4561, + "step": 40000 + }, + { + "epoch": 7.185816876122082, + "grad_norm": 9.511069297790527, + "learning_rate": 2.3756074207061642e-05, + "loss": 5.3702, + "step": 40025 + }, + { + "epoch": 7.190305206463195, + "grad_norm": 10.525802612304688, + "learning_rate": 2.3750089766606825e-05, + "loss": 5.5162, + "step": 40050 + }, + { + "epoch": 7.194793536804309, + "grad_norm": 10.675493240356445, + "learning_rate": 2.3744105326152007e-05, + "loss": 5.4359, + "step": 40075 + }, + { + "epoch": 7.199281867145422, + "grad_norm": 9.481062889099121, + "learning_rate": 2.3738120885697186e-05, + "loss": 5.3511, + "step": 40100 + }, + { + "epoch": 7.203770197486535, + "grad_norm": 9.01403522491455, + "learning_rate": 2.373213644524237e-05, + "loss": 5.4466, + "step": 40125 + }, + { + "epoch": 7.208258527827648, + "grad_norm": 10.255102157592773, + "learning_rate": 2.372615200478755e-05, + "loss": 5.4902, + "step": 40150 + }, + { + "epoch": 7.212746858168761, + "grad_norm": 
10.252391815185547, + "learning_rate": 2.3720167564332737e-05, + "loss": 5.4243, + "step": 40175 + }, + { + "epoch": 7.217235188509874, + "grad_norm": 9.689371109008789, + "learning_rate": 2.371418312387792e-05, + "loss": 5.5055, + "step": 40200 + }, + { + "epoch": 7.221723518850988, + "grad_norm": 9.8684720993042, + "learning_rate": 2.37081986834231e-05, + "loss": 5.5854, + "step": 40225 + }, + { + "epoch": 7.226211849192101, + "grad_norm": 7.824565887451172, + "learning_rate": 2.3702214242968284e-05, + "loss": 5.4043, + "step": 40250 + }, + { + "epoch": 7.230700179533214, + "grad_norm": 9.986798286437988, + "learning_rate": 2.3696229802513466e-05, + "loss": 5.4911, + "step": 40275 + }, + { + "epoch": 7.235188509874327, + "grad_norm": 10.030969619750977, + "learning_rate": 2.369024536205865e-05, + "loss": 5.5107, + "step": 40300 + }, + { + "epoch": 7.23967684021544, + "grad_norm": 9.414936065673828, + "learning_rate": 2.368426092160383e-05, + "loss": 5.5535, + "step": 40325 + }, + { + "epoch": 7.244165170556553, + "grad_norm": 9.985041618347168, + "learning_rate": 2.3678276481149013e-05, + "loss": 5.4215, + "step": 40350 + }, + { + "epoch": 7.2486535008976665, + "grad_norm": 10.442767143249512, + "learning_rate": 2.3672292040694195e-05, + "loss": 5.3521, + "step": 40375 + }, + { + "epoch": 7.253141831238779, + "grad_norm": 10.41326904296875, + "learning_rate": 2.3666307600239378e-05, + "loss": 5.4708, + "step": 40400 + }, + { + "epoch": 7.257630161579892, + "grad_norm": 8.870606422424316, + "learning_rate": 2.366032315978456e-05, + "loss": 5.4331, + "step": 40425 + }, + { + "epoch": 7.262118491921005, + "grad_norm": 9.479995727539062, + "learning_rate": 2.3654338719329746e-05, + "loss": 5.3167, + "step": 40450 + }, + { + "epoch": 7.266606822262118, + "grad_norm": 10.369382858276367, + "learning_rate": 2.3648354278874928e-05, + "loss": 5.3445, + "step": 40475 + }, + { + "epoch": 7.271095152603231, + "grad_norm": 8.947782516479492, + "learning_rate": 
2.3642369838420107e-05, + "loss": 5.4009, + "step": 40500 + }, + { + "epoch": 7.275583482944345, + "grad_norm": 10.202621459960938, + "learning_rate": 2.363638539796529e-05, + "loss": 5.3901, + "step": 40525 + }, + { + "epoch": 7.280071813285458, + "grad_norm": 9.631303787231445, + "learning_rate": 2.3630400957510472e-05, + "loss": 5.3715, + "step": 40550 + }, + { + "epoch": 7.284560143626571, + "grad_norm": 9.60442066192627, + "learning_rate": 2.3624416517055654e-05, + "loss": 5.4848, + "step": 40575 + }, + { + "epoch": 7.289048473967684, + "grad_norm": 9.679129600524902, + "learning_rate": 2.361843207660084e-05, + "loss": 5.4818, + "step": 40600 + }, + { + "epoch": 7.293536804308797, + "grad_norm": 9.837870597839355, + "learning_rate": 2.3612447636146022e-05, + "loss": 5.3848, + "step": 40625 + }, + { + "epoch": 7.29802513464991, + "grad_norm": 10.070474624633789, + "learning_rate": 2.3606463195691205e-05, + "loss": 5.5088, + "step": 40650 + }, + { + "epoch": 7.302513464991024, + "grad_norm": 10.252856254577637, + "learning_rate": 2.3600478755236387e-05, + "loss": 5.4704, + "step": 40675 + }, + { + "epoch": 7.307001795332137, + "grad_norm": 10.140144348144531, + "learning_rate": 2.3594494314781566e-05, + "loss": 5.4225, + "step": 40700 + }, + { + "epoch": 7.31149012567325, + "grad_norm": 9.417150497436523, + "learning_rate": 2.3588509874326752e-05, + "loss": 5.4847, + "step": 40725 + }, + { + "epoch": 7.315978456014363, + "grad_norm": 9.764644622802734, + "learning_rate": 2.3582525433871934e-05, + "loss": 5.5674, + "step": 40750 + }, + { + "epoch": 7.3204667863554755, + "grad_norm": 9.927252769470215, + "learning_rate": 2.3576540993417117e-05, + "loss": 5.5377, + "step": 40775 + }, + { + "epoch": 7.3249551166965885, + "grad_norm": 9.603468894958496, + "learning_rate": 2.35705565529623e-05, + "loss": 5.3675, + "step": 40800 + }, + { + "epoch": 7.329443447037702, + "grad_norm": 10.293231010437012, + "learning_rate": 2.356457211250748e-05, + "loss": 5.481, + "step": 
40825 + }, + { + "epoch": 7.333931777378815, + "grad_norm": 10.163765907287598, + "learning_rate": 2.3558587672052664e-05, + "loss": 5.3748, + "step": 40850 + }, + { + "epoch": 7.338420107719928, + "grad_norm": 11.308597564697266, + "learning_rate": 2.3552603231597846e-05, + "loss": 5.5669, + "step": 40875 + }, + { + "epoch": 7.342908438061041, + "grad_norm": 9.194921493530273, + "learning_rate": 2.354661879114303e-05, + "loss": 5.5315, + "step": 40900 + }, + { + "epoch": 7.347396768402154, + "grad_norm": 11.210052490234375, + "learning_rate": 2.354063435068821e-05, + "loss": 5.5171, + "step": 40925 + }, + { + "epoch": 7.351885098743267, + "grad_norm": 9.31684398651123, + "learning_rate": 2.3534649910233393e-05, + "loss": 5.5289, + "step": 40950 + }, + { + "epoch": 7.356373429084381, + "grad_norm": 8.82763957977295, + "learning_rate": 2.3528665469778575e-05, + "loss": 5.4868, + "step": 40975 + }, + { + "epoch": 7.360861759425494, + "grad_norm": 11.357858657836914, + "learning_rate": 2.3522681029323758e-05, + "loss": 5.5153, + "step": 41000 + }, + { + "epoch": 7.365350089766607, + "grad_norm": 9.087763786315918, + "learning_rate": 2.351693596648713e-05, + "loss": 5.6399, + "step": 41025 + }, + { + "epoch": 7.36983842010772, + "grad_norm": 10.407207489013672, + "learning_rate": 2.3510951526032317e-05, + "loss": 5.4701, + "step": 41050 + }, + { + "epoch": 7.374326750448833, + "grad_norm": 11.355449676513672, + "learning_rate": 2.35049670855775e-05, + "loss": 5.6303, + "step": 41075 + }, + { + "epoch": 7.378815080789946, + "grad_norm": 11.019585609436035, + "learning_rate": 2.3498982645122682e-05, + "loss": 5.5394, + "step": 41100 + }, + { + "epoch": 7.38330341113106, + "grad_norm": 10.996333122253418, + "learning_rate": 2.3492998204667864e-05, + "loss": 5.7061, + "step": 41125 + }, + { + "epoch": 7.3877917414721725, + "grad_norm": 10.037538528442383, + "learning_rate": 2.3487013764213047e-05, + "loss": 5.5265, + "step": 41150 + }, + { + "epoch": 7.3922800718132855, + 
"grad_norm": 10.484006881713867, + "learning_rate": 2.348102932375823e-05, + "loss": 5.5831, + "step": 41175 + }, + { + "epoch": 7.3967684021543985, + "grad_norm": 10.616866111755371, + "learning_rate": 2.347504488330341e-05, + "loss": 5.4975, + "step": 41200 + }, + { + "epoch": 7.401256732495511, + "grad_norm": 9.360462188720703, + "learning_rate": 2.3469060442848594e-05, + "loss": 5.5046, + "step": 41225 + }, + { + "epoch": 7.405745062836624, + "grad_norm": 12.849056243896484, + "learning_rate": 2.3463076002393776e-05, + "loss": 5.5221, + "step": 41250 + }, + { + "epoch": 7.410233393177738, + "grad_norm": 10.317054748535156, + "learning_rate": 2.345709156193896e-05, + "loss": 5.5499, + "step": 41275 + }, + { + "epoch": 7.414721723518851, + "grad_norm": 9.532585144042969, + "learning_rate": 2.345110712148414e-05, + "loss": 5.6526, + "step": 41300 + }, + { + "epoch": 7.419210053859964, + "grad_norm": 9.146982192993164, + "learning_rate": 2.3445122681029327e-05, + "loss": 5.5551, + "step": 41325 + }, + { + "epoch": 7.423698384201077, + "grad_norm": 9.319038391113281, + "learning_rate": 2.343913824057451e-05, + "loss": 5.4781, + "step": 41350 + }, + { + "epoch": 7.42818671454219, + "grad_norm": 9.415090560913086, + "learning_rate": 2.343315380011969e-05, + "loss": 5.5023, + "step": 41375 + }, + { + "epoch": 7.432675044883303, + "grad_norm": 8.899957656860352, + "learning_rate": 2.342716935966487e-05, + "loss": 5.3882, + "step": 41400 + }, + { + "epoch": 7.437163375224417, + "grad_norm": 10.74228572845459, + "learning_rate": 2.3421184919210053e-05, + "loss": 5.576, + "step": 41425 + }, + { + "epoch": 7.44165170556553, + "grad_norm": 9.807246208190918, + "learning_rate": 2.3415200478755235e-05, + "loss": 5.5247, + "step": 41450 + }, + { + "epoch": 7.446140035906643, + "grad_norm": 10.027503967285156, + "learning_rate": 2.340921603830042e-05, + "loss": 5.4102, + "step": 41475 + }, + { + "epoch": 7.450628366247756, + "grad_norm": 10.049226760864258, + "learning_rate": 
2.3403231597845603e-05, + "loss": 5.4811, + "step": 41500 + }, + { + "epoch": 7.455116696588869, + "grad_norm": 8.775979995727539, + "learning_rate": 2.3397247157390785e-05, + "loss": 5.3254, + "step": 41525 + }, + { + "epoch": 7.459605026929982, + "grad_norm": 9.799753189086914, + "learning_rate": 2.3391262716935968e-05, + "loss": 5.5782, + "step": 41550 + }, + { + "epoch": 7.4640933572710955, + "grad_norm": 9.94575309753418, + "learning_rate": 2.338527827648115e-05, + "loss": 5.4667, + "step": 41575 + }, + { + "epoch": 7.468581687612208, + "grad_norm": 10.647387504577637, + "learning_rate": 2.337929383602633e-05, + "loss": 5.4154, + "step": 41600 + }, + { + "epoch": 7.473070017953321, + "grad_norm": 10.889013290405273, + "learning_rate": 2.3373309395571515e-05, + "loss": 5.3723, + "step": 41625 + }, + { + "epoch": 7.477558348294434, + "grad_norm": 10.117226600646973, + "learning_rate": 2.3367324955116697e-05, + "loss": 5.4949, + "step": 41650 + }, + { + "epoch": 7.482046678635547, + "grad_norm": 10.55263900756836, + "learning_rate": 2.336134051466188e-05, + "loss": 5.5981, + "step": 41675 + }, + { + "epoch": 7.486535008976661, + "grad_norm": 9.749809265136719, + "learning_rate": 2.3355356074207062e-05, + "loss": 5.423, + "step": 41700 + }, + { + "epoch": 7.491023339317774, + "grad_norm": 10.976485252380371, + "learning_rate": 2.3349371633752244e-05, + "loss": 5.4739, + "step": 41725 + }, + { + "epoch": 7.495511669658887, + "grad_norm": 8.481678009033203, + "learning_rate": 2.334338719329743e-05, + "loss": 5.5193, + "step": 41750 + }, + { + "epoch": 7.5, + "grad_norm": 10.148784637451172, + "learning_rate": 2.333740275284261e-05, + "loss": 5.3737, + "step": 41775 + }, + { + "epoch": 7.504488330341113, + "grad_norm": 8.601399421691895, + "learning_rate": 2.333141831238779e-05, + "loss": 5.4572, + "step": 41800 + }, + { + "epoch": 7.508976660682226, + "grad_norm": 9.748685836791992, + "learning_rate": 2.3325433871932974e-05, + "loss": 5.5084, + "step": 41825 + }, + 
{ + "epoch": 7.513464991023339, + "grad_norm": 10.428478240966797, + "learning_rate": 2.3319449431478156e-05, + "loss": 5.4694, + "step": 41850 + }, + { + "epoch": 7.517953321364453, + "grad_norm": 9.830413818359375, + "learning_rate": 2.331346499102334e-05, + "loss": 5.451, + "step": 41875 + }, + { + "epoch": 7.522441651705566, + "grad_norm": 10.002723693847656, + "learning_rate": 2.3307480550568524e-05, + "loss": 5.5269, + "step": 41900 + }, + { + "epoch": 7.526929982046679, + "grad_norm": 9.541080474853516, + "learning_rate": 2.3301496110113707e-05, + "loss": 5.4241, + "step": 41925 + }, + { + "epoch": 7.531418312387792, + "grad_norm": 12.23689079284668, + "learning_rate": 2.329551166965889e-05, + "loss": 5.6896, + "step": 41950 + }, + { + "epoch": 7.5359066427289045, + "grad_norm": 10.239830017089844, + "learning_rate": 2.3289766606822263e-05, + "loss": 5.4897, + "step": 41975 + }, + { + "epoch": 7.540394973070018, + "grad_norm": 9.656229019165039, + "learning_rate": 2.3283782166367445e-05, + "loss": 5.5598, + "step": 42000 + }, + { + "epoch": 7.544883303411131, + "grad_norm": 10.131250381469727, + "learning_rate": 2.3277797725912627e-05, + "loss": 5.6299, + "step": 42025 + }, + { + "epoch": 7.549371633752244, + "grad_norm": 13.042201042175293, + "learning_rate": 2.327181328545781e-05, + "loss": 5.4775, + "step": 42050 + }, + { + "epoch": 7.553859964093357, + "grad_norm": 11.811177253723145, + "learning_rate": 2.3265828845002996e-05, + "loss": 5.4063, + "step": 42075 + }, + { + "epoch": 7.55834829443447, + "grad_norm": 8.8211669921875, + "learning_rate": 2.3259844404548175e-05, + "loss": 5.5564, + "step": 42100 + }, + { + "epoch": 7.562836624775583, + "grad_norm": 9.441910743713379, + "learning_rate": 2.3253859964093357e-05, + "loss": 5.4795, + "step": 42125 + }, + { + "epoch": 7.567324955116696, + "grad_norm": 9.574545860290527, + "learning_rate": 2.324787552363854e-05, + "loss": 5.5985, + "step": 42150 + }, + { + "epoch": 7.57181328545781, + "grad_norm": 
10.16347885131836, + "learning_rate": 2.324189108318372e-05, + "loss": 5.4079, + "step": 42175 + }, + { + "epoch": 7.576301615798923, + "grad_norm": 10.165145874023438, + "learning_rate": 2.3235906642728907e-05, + "loss": 5.42, + "step": 42200 + }, + { + "epoch": 7.580789946140036, + "grad_norm": 10.1334867477417, + "learning_rate": 2.322992220227409e-05, + "loss": 5.6146, + "step": 42225 + }, + { + "epoch": 7.585278276481149, + "grad_norm": 9.232158660888672, + "learning_rate": 2.3223937761819272e-05, + "loss": 5.4621, + "step": 42250 + }, + { + "epoch": 7.589766606822262, + "grad_norm": 9.515371322631836, + "learning_rate": 2.3217953321364454e-05, + "loss": 5.5251, + "step": 42275 + }, + { + "epoch": 7.594254937163376, + "grad_norm": 12.158238410949707, + "learning_rate": 2.3211968880909633e-05, + "loss": 5.4846, + "step": 42300 + }, + { + "epoch": 7.598743267504489, + "grad_norm": 10.711348533630371, + "learning_rate": 2.3205984440454816e-05, + "loss": 5.4075, + "step": 42325 + }, + { + "epoch": 7.6032315978456015, + "grad_norm": 9.388899803161621, + "learning_rate": 2.32e-05, + "loss": 5.5984, + "step": 42350 + }, + { + "epoch": 7.6077199281867145, + "grad_norm": 9.60883903503418, + "learning_rate": 2.3194015559545184e-05, + "loss": 5.7081, + "step": 42375 + }, + { + "epoch": 7.6122082585278275, + "grad_norm": 9.950506210327148, + "learning_rate": 2.3188031119090366e-05, + "loss": 5.6468, + "step": 42400 + }, + { + "epoch": 7.61669658886894, + "grad_norm": 9.684456825256348, + "learning_rate": 2.318204667863555e-05, + "loss": 5.5685, + "step": 42425 + }, + { + "epoch": 7.621184919210053, + "grad_norm": 9.9486665725708, + "learning_rate": 2.317606223818073e-05, + "loss": 5.4475, + "step": 42450 + }, + { + "epoch": 7.625673249551167, + "grad_norm": 13.059162139892578, + "learning_rate": 2.3170077797725913e-05, + "loss": 5.4856, + "step": 42475 + }, + { + "epoch": 7.63016157989228, + "grad_norm": 9.386963844299316, + "learning_rate": 2.3164093357271096e-05, + 
"loss": 5.4977, + "step": 42500 + }, + { + "epoch": 7.634649910233393, + "grad_norm": 9.730010032653809, + "learning_rate": 2.3158108916816278e-05, + "loss": 5.4617, + "step": 42525 + }, + { + "epoch": 7.639138240574506, + "grad_norm": 9.391097068786621, + "learning_rate": 2.315212447636146e-05, + "loss": 5.5937, + "step": 42550 + }, + { + "epoch": 7.643626570915619, + "grad_norm": 10.680257797241211, + "learning_rate": 2.3146140035906643e-05, + "loss": 5.4566, + "step": 42575 + }, + { + "epoch": 7.648114901256733, + "grad_norm": 9.273040771484375, + "learning_rate": 2.3140155595451825e-05, + "loss": 5.4107, + "step": 42600 + }, + { + "epoch": 7.652603231597846, + "grad_norm": 9.915694236755371, + "learning_rate": 2.313417115499701e-05, + "loss": 5.472, + "step": 42625 + }, + { + "epoch": 7.657091561938959, + "grad_norm": 10.300697326660156, + "learning_rate": 2.3128186714542193e-05, + "loss": 5.4489, + "step": 42650 + }, + { + "epoch": 7.661579892280072, + "grad_norm": 10.001696586608887, + "learning_rate": 2.3122202274087372e-05, + "loss": 5.4714, + "step": 42675 + }, + { + "epoch": 7.666068222621185, + "grad_norm": 9.523163795471191, + "learning_rate": 2.3116217833632555e-05, + "loss": 5.5202, + "step": 42700 + }, + { + "epoch": 7.670556552962298, + "grad_norm": 9.265316009521484, + "learning_rate": 2.3110233393177737e-05, + "loss": 5.3454, + "step": 42725 + }, + { + "epoch": 7.6750448833034115, + "grad_norm": 9.773829460144043, + "learning_rate": 2.310424895272292e-05, + "loss": 5.4412, + "step": 42750 + }, + { + "epoch": 7.6795332136445245, + "grad_norm": 9.490160942077637, + "learning_rate": 2.3098264512268105e-05, + "loss": 5.4108, + "step": 42775 + }, + { + "epoch": 7.684021543985637, + "grad_norm": 10.431731224060059, + "learning_rate": 2.3092280071813287e-05, + "loss": 5.5199, + "step": 42800 + }, + { + "epoch": 7.68850987432675, + "grad_norm": 11.280056953430176, + "learning_rate": 2.308629563135847e-05, + "loss": 5.4409, + "step": 42825 + }, + { + 
"epoch": 7.692998204667863, + "grad_norm": 10.292471885681152, + "learning_rate": 2.3080311190903652e-05, + "loss": 5.4817, + "step": 42850 + }, + { + "epoch": 7.697486535008976, + "grad_norm": 10.075037002563477, + "learning_rate": 2.307432675044883e-05, + "loss": 5.4345, + "step": 42875 + }, + { + "epoch": 7.70197486535009, + "grad_norm": 10.156685829162598, + "learning_rate": 2.3068342309994013e-05, + "loss": 5.3333, + "step": 42900 + }, + { + "epoch": 7.706463195691203, + "grad_norm": 9.359807968139648, + "learning_rate": 2.30623578695392e-05, + "loss": 5.4431, + "step": 42925 + }, + { + "epoch": 7.710951526032316, + "grad_norm": 11.59181022644043, + "learning_rate": 2.305637342908438e-05, + "loss": 5.4896, + "step": 42950 + }, + { + "epoch": 7.715439856373429, + "grad_norm": 9.922967910766602, + "learning_rate": 2.3050388988629564e-05, + "loss": 5.5632, + "step": 42975 + }, + { + "epoch": 7.719928186714542, + "grad_norm": 9.73230266571045, + "learning_rate": 2.3044404548174746e-05, + "loss": 5.5436, + "step": 43000 + }, + { + "epoch": 7.724416517055655, + "grad_norm": 9.716582298278809, + "learning_rate": 2.303842010771993e-05, + "loss": 5.3871, + "step": 43025 + }, + { + "epoch": 7.728904847396769, + "grad_norm": 10.155867576599121, + "learning_rate": 2.303243566726511e-05, + "loss": 5.3985, + "step": 43050 + }, + { + "epoch": 7.733393177737882, + "grad_norm": 10.413517951965332, + "learning_rate": 2.3026451226810293e-05, + "loss": 5.5232, + "step": 43075 + }, + { + "epoch": 7.737881508078995, + "grad_norm": 10.171520233154297, + "learning_rate": 2.3020466786355476e-05, + "loss": 5.692, + "step": 43100 + }, + { + "epoch": 7.742369838420108, + "grad_norm": 8.521014213562012, + "learning_rate": 2.3014482345900658e-05, + "loss": 5.6399, + "step": 43125 + }, + { + "epoch": 7.746858168761221, + "grad_norm": 9.683831214904785, + "learning_rate": 2.300849790544584e-05, + "loss": 5.6157, + "step": 43150 + }, + { + "epoch": 7.751346499102334, + "grad_norm": 
9.090052604675293, + "learning_rate": 2.3002513464991023e-05, + "loss": 5.5462, + "step": 43175 + }, + { + "epoch": 7.755834829443447, + "grad_norm": 9.080110549926758, + "learning_rate": 2.299652902453621e-05, + "loss": 5.5188, + "step": 43200 + }, + { + "epoch": 7.76032315978456, + "grad_norm": 11.36033821105957, + "learning_rate": 2.299054458408139e-05, + "loss": 5.4903, + "step": 43225 + }, + { + "epoch": 7.764811490125673, + "grad_norm": 10.358635902404785, + "learning_rate": 2.2984560143626573e-05, + "loss": 5.5015, + "step": 43250 + }, + { + "epoch": 7.769299820466786, + "grad_norm": 9.275090217590332, + "learning_rate": 2.2978575703171752e-05, + "loss": 5.608, + "step": 43275 + }, + { + "epoch": 7.773788150807899, + "grad_norm": 10.468692779541016, + "learning_rate": 2.2972591262716935e-05, + "loss": 5.5771, + "step": 43300 + }, + { + "epoch": 7.778276481149012, + "grad_norm": 9.009549140930176, + "learning_rate": 2.2966606822262117e-05, + "loss": 5.4987, + "step": 43325 + }, + { + "epoch": 7.782764811490126, + "grad_norm": 10.07161808013916, + "learning_rate": 2.2960622381807303e-05, + "loss": 5.4727, + "step": 43350 + }, + { + "epoch": 7.787253141831239, + "grad_norm": 11.281889915466309, + "learning_rate": 2.2954637941352485e-05, + "loss": 5.595, + "step": 43375 + }, + { + "epoch": 7.791741472172352, + "grad_norm": 9.703512191772461, + "learning_rate": 2.2948653500897667e-05, + "loss": 5.5227, + "step": 43400 + }, + { + "epoch": 7.796229802513465, + "grad_norm": 12.048609733581543, + "learning_rate": 2.294266906044285e-05, + "loss": 5.6672, + "step": 43425 + }, + { + "epoch": 7.800718132854578, + "grad_norm": 11.400287628173828, + "learning_rate": 2.2936684619988032e-05, + "loss": 5.5448, + "step": 43450 + }, + { + "epoch": 7.805206463195692, + "grad_norm": 10.344185829162598, + "learning_rate": 2.293070017953321e-05, + "loss": 5.4362, + "step": 43475 + }, + { + "epoch": 7.809694793536805, + "grad_norm": 10.405166625976562, + "learning_rate": 
2.2924715739078397e-05, + "loss": 5.3811, + "step": 43500 + }, + { + "epoch": 7.814183123877918, + "grad_norm": 10.69149398803711, + "learning_rate": 2.291873129862358e-05, + "loss": 5.5019, + "step": 43525 + }, + { + "epoch": 7.8186714542190305, + "grad_norm": 9.917166709899902, + "learning_rate": 2.291274685816876e-05, + "loss": 5.4945, + "step": 43550 + }, + { + "epoch": 7.8231597845601435, + "grad_norm": 9.89714527130127, + "learning_rate": 2.2906762417713944e-05, + "loss": 5.5033, + "step": 43575 + }, + { + "epoch": 7.8276481149012564, + "grad_norm": 10.69091510772705, + "learning_rate": 2.2900777977259126e-05, + "loss": 5.504, + "step": 43600 + }, + { + "epoch": 7.832136445242369, + "grad_norm": 8.77791976928711, + "learning_rate": 2.2894793536804312e-05, + "loss": 5.6117, + "step": 43625 + }, + { + "epoch": 7.836624775583483, + "grad_norm": 10.19876480102539, + "learning_rate": 2.2888809096349494e-05, + "loss": 5.4798, + "step": 43650 + }, + { + "epoch": 7.841113105924596, + "grad_norm": 9.982950210571289, + "learning_rate": 2.2882824655894673e-05, + "loss": 5.5183, + "step": 43675 + }, + { + "epoch": 7.845601436265709, + "grad_norm": 9.491494178771973, + "learning_rate": 2.2876840215439856e-05, + "loss": 5.5001, + "step": 43700 + }, + { + "epoch": 7.850089766606822, + "grad_norm": 10.650588989257812, + "learning_rate": 2.2870855774985038e-05, + "loss": 5.6048, + "step": 43725 + }, + { + "epoch": 7.854578096947935, + "grad_norm": 9.61359977722168, + "learning_rate": 2.286487133453022e-05, + "loss": 5.5416, + "step": 43750 + }, + { + "epoch": 7.859066427289049, + "grad_norm": 10.270140647888184, + "learning_rate": 2.2858886894075406e-05, + "loss": 5.468, + "step": 43775 + }, + { + "epoch": 7.863554757630162, + "grad_norm": 10.289473533630371, + "learning_rate": 2.285290245362059e-05, + "loss": 5.5392, + "step": 43800 + }, + { + "epoch": 7.868043087971275, + "grad_norm": 11.816956520080566, + "learning_rate": 2.284691801316577e-05, + "loss": 5.3793, + "step": 
43825 + }, + { + "epoch": 7.872531418312388, + "grad_norm": 10.62900161743164, + "learning_rate": 2.2840933572710953e-05, + "loss": 5.5389, + "step": 43850 + }, + { + "epoch": 7.877019748653501, + "grad_norm": 9.225757598876953, + "learning_rate": 2.2834949132256132e-05, + "loss": 5.5618, + "step": 43875 + }, + { + "epoch": 7.881508078994614, + "grad_norm": 10.770251274108887, + "learning_rate": 2.2828964691801315e-05, + "loss": 5.3858, + "step": 43900 + }, + { + "epoch": 7.885996409335727, + "grad_norm": 9.491755485534668, + "learning_rate": 2.28229802513465e-05, + "loss": 5.7343, + "step": 43925 + }, + { + "epoch": 7.8904847396768405, + "grad_norm": 10.550899505615234, + "learning_rate": 2.2816995810891683e-05, + "loss": 5.6561, + "step": 43950 + }, + { + "epoch": 7.8949730700179535, + "grad_norm": 11.263286590576172, + "learning_rate": 2.2811011370436865e-05, + "loss": 5.5623, + "step": 43975 + }, + { + "epoch": 7.899461400359066, + "grad_norm": 8.86055850982666, + "learning_rate": 2.2805026929982047e-05, + "loss": 5.5766, + "step": 44000 + }, + { + "epoch": 7.903949730700179, + "grad_norm": 9.63237476348877, + "learning_rate": 2.279904248952723e-05, + "loss": 5.5263, + "step": 44025 + }, + { + "epoch": 7.908438061041292, + "grad_norm": 9.763847351074219, + "learning_rate": 2.2793058049072416e-05, + "loss": 5.6226, + "step": 44050 + }, + { + "epoch": 7.912926391382406, + "grad_norm": 10.302363395690918, + "learning_rate": 2.2787073608617594e-05, + "loss": 5.4667, + "step": 44075 + }, + { + "epoch": 7.917414721723519, + "grad_norm": 10.282853126525879, + "learning_rate": 2.2781089168162777e-05, + "loss": 5.5529, + "step": 44100 + }, + { + "epoch": 7.921903052064632, + "grad_norm": 10.700684547424316, + "learning_rate": 2.277510472770796e-05, + "loss": 5.698, + "step": 44125 + }, + { + "epoch": 7.926391382405745, + "grad_norm": 9.618677139282227, + "learning_rate": 2.276912028725314e-05, + "loss": 5.4656, + "step": 44150 + }, + { + "epoch": 7.930879712746858, + 
"grad_norm": 8.922216415405273, + "learning_rate": 2.2763135846798324e-05, + "loss": 5.3357, + "step": 44175 + }, + { + "epoch": 7.935368043087971, + "grad_norm": 8.928955078125, + "learning_rate": 2.275715140634351e-05, + "loss": 5.6747, + "step": 44200 + }, + { + "epoch": 7.939856373429084, + "grad_norm": 9.155682563781738, + "learning_rate": 2.2751166965888692e-05, + "loss": 5.662, + "step": 44225 + }, + { + "epoch": 7.944344703770198, + "grad_norm": 9.425272941589355, + "learning_rate": 2.2745182525433874e-05, + "loss": 5.5621, + "step": 44250 + }, + { + "epoch": 7.948833034111311, + "grad_norm": 8.535520553588867, + "learning_rate": 2.2739198084979053e-05, + "loss": 5.5598, + "step": 44275 + }, + { + "epoch": 7.953321364452424, + "grad_norm": 10.379046440124512, + "learning_rate": 2.2733213644524236e-05, + "loss": 5.6674, + "step": 44300 + }, + { + "epoch": 7.957809694793537, + "grad_norm": 12.223926544189453, + "learning_rate": 2.2727229204069418e-05, + "loss": 5.44, + "step": 44325 + }, + { + "epoch": 7.96229802513465, + "grad_norm": 9.51513385772705, + "learning_rate": 2.2721244763614604e-05, + "loss": 5.4995, + "step": 44350 + }, + { + "epoch": 7.966786355475763, + "grad_norm": 9.875350952148438, + "learning_rate": 2.2715260323159786e-05, + "loss": 5.3707, + "step": 44375 + }, + { + "epoch": 7.971274685816876, + "grad_norm": 10.503203392028809, + "learning_rate": 2.270927588270497e-05, + "loss": 5.5537, + "step": 44400 + }, + { + "epoch": 7.975763016157989, + "grad_norm": 9.83034610748291, + "learning_rate": 2.270329144225015e-05, + "loss": 5.4915, + "step": 44425 + }, + { + "epoch": 7.980251346499102, + "grad_norm": 11.702964782714844, + "learning_rate": 2.2697307001795333e-05, + "loss": 5.5908, + "step": 44450 + }, + { + "epoch": 7.984739676840215, + "grad_norm": 9.678173065185547, + "learning_rate": 2.2691322561340516e-05, + "loss": 5.5707, + "step": 44475 + }, + { + "epoch": 7.989228007181328, + "grad_norm": 12.23019027709961, + "learning_rate": 
2.2685338120885698e-05, + "loss": 5.4645, + "step": 44500 + }, + { + "epoch": 7.993716337522442, + "grad_norm": 9.888636589050293, + "learning_rate": 2.267935368043088e-05, + "loss": 5.5539, + "step": 44525 + }, + { + "epoch": 7.998204667863555, + "grad_norm": 10.622608184814453, + "learning_rate": 2.2673369239976063e-05, + "loss": 5.4909, + "step": 44550 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.07721535905905715, + "eval_f1_macro": 0.005649481991057697, + "eval_f1_micro": 0.07721535905905715, + "eval_f1_weighted": 0.0409069780914941, + "eval_loss": 6.720392227172852, + "eval_precision_macro": 0.005420497428758677, + "eval_precision_micro": 0.07721535905905715, + "eval_precision_weighted": 0.03355156705945122, + "eval_recall_macro": 0.009900721346674305, + "eval_recall_micro": 0.07721535905905715, + "eval_recall_weighted": 0.07721535905905715, + "eval_runtime": 62.3473, + "eval_samples_per_second": 840.021, + "eval_steps_per_second": 26.256, + "step": 44560 + }, + { + "epoch": 8.002692998204667, + "grad_norm": 10.457049369812012, + "learning_rate": 2.2667384799521245e-05, + "loss": 5.1925, + "step": 44575 + }, + { + "epoch": 8.007181328545782, + "grad_norm": 10.420486450195312, + "learning_rate": 2.2661400359066427e-05, + "loss": 5.0802, + "step": 44600 + }, + { + "epoch": 8.011669658886895, + "grad_norm": 9.73538875579834, + "learning_rate": 2.2655415918611613e-05, + "loss": 5.1637, + "step": 44625 + }, + { + "epoch": 8.016157989228008, + "grad_norm": 9.423237800598145, + "learning_rate": 2.2649431478156792e-05, + "loss": 5.1766, + "step": 44650 + }, + { + "epoch": 8.02064631956912, + "grad_norm": 8.025190353393555, + "learning_rate": 2.2643447037701974e-05, + "loss": 5.2682, + "step": 44675 + }, + { + "epoch": 8.025134649910234, + "grad_norm": 10.876891136169434, + "learning_rate": 2.2637462597247157e-05, + "loss": 5.2835, + "step": 44700 + }, + { + "epoch": 8.029622980251347, + "grad_norm": 10.235151290893555, + "learning_rate": 2.263147815679234e-05, + 
"loss": 5.1474, + "step": 44725 + }, + { + "epoch": 8.03411131059246, + "grad_norm": 9.818636894226074, + "learning_rate": 2.262549371633752e-05, + "loss": 5.2388, + "step": 44750 + }, + { + "epoch": 8.038599640933572, + "grad_norm": 9.385553359985352, + "learning_rate": 2.2619509275882707e-05, + "loss": 5.1121, + "step": 44775 + }, + { + "epoch": 8.043087971274685, + "grad_norm": 9.778388023376465, + "learning_rate": 2.261352483542789e-05, + "loss": 5.168, + "step": 44800 + }, + { + "epoch": 8.047576301615798, + "grad_norm": 10.002878189086914, + "learning_rate": 2.2607540394973072e-05, + "loss": 5.1932, + "step": 44825 + }, + { + "epoch": 8.052064631956911, + "grad_norm": 10.187950134277344, + "learning_rate": 2.260155595451825e-05, + "loss": 5.1734, + "step": 44850 + }, + { + "epoch": 8.056552962298024, + "grad_norm": 10.091532707214355, + "learning_rate": 2.2595571514063433e-05, + "loss": 5.1623, + "step": 44875 + }, + { + "epoch": 8.061041292639139, + "grad_norm": 11.126556396484375, + "learning_rate": 2.2589587073608616e-05, + "loss": 5.1389, + "step": 44900 + }, + { + "epoch": 8.065529622980252, + "grad_norm": 10.458166122436523, + "learning_rate": 2.25836026331538e-05, + "loss": 5.3821, + "step": 44925 + }, + { + "epoch": 8.070017953321365, + "grad_norm": 10.574949264526367, + "learning_rate": 2.2577618192698984e-05, + "loss": 5.1329, + "step": 44950 + }, + { + "epoch": 8.074506283662478, + "grad_norm": 10.24091911315918, + "learning_rate": 2.2571633752244166e-05, + "loss": 5.2738, + "step": 44975 + }, + { + "epoch": 8.07899461400359, + "grad_norm": 9.131656646728516, + "learning_rate": 2.256564931178935e-05, + "loss": 5.0511, + "step": 45000 + }, + { + "epoch": 8.083482944344704, + "grad_norm": 10.45025634765625, + "learning_rate": 2.255966487133453e-05, + "loss": 5.2613, + "step": 45025 + }, + { + "epoch": 8.087971274685817, + "grad_norm": 11.027632713317871, + "learning_rate": 2.2553680430879713e-05, + "loss": 5.2574, + "step": 45050 + }, + { + "epoch": 
8.09245960502693, + "grad_norm": 10.392406463623047, + "learning_rate": 2.2547695990424896e-05, + "loss": 4.946, + "step": 45075 + }, + { + "epoch": 8.096947935368043, + "grad_norm": 10.62334156036377, + "learning_rate": 2.2541711549970078e-05, + "loss": 5.1845, + "step": 45100 + }, + { + "epoch": 8.101436265709156, + "grad_norm": 11.353838920593262, + "learning_rate": 2.253572710951526e-05, + "loss": 5.1877, + "step": 45125 + }, + { + "epoch": 8.105924596050269, + "grad_norm": 9.639848709106445, + "learning_rate": 2.2529742669060443e-05, + "loss": 5.2387, + "step": 45150 + }, + { + "epoch": 8.110412926391382, + "grad_norm": 9.809059143066406, + "learning_rate": 2.2523758228605625e-05, + "loss": 5.0101, + "step": 45175 + }, + { + "epoch": 8.114901256732496, + "grad_norm": 10.919107437133789, + "learning_rate": 2.251777378815081e-05, + "loss": 5.148, + "step": 45200 + }, + { + "epoch": 8.11938958707361, + "grad_norm": 10.602744102478027, + "learning_rate": 2.2511789347695993e-05, + "loss": 5.2047, + "step": 45225 + }, + { + "epoch": 8.123877917414722, + "grad_norm": 8.93800163269043, + "learning_rate": 2.2505804907241172e-05, + "loss": 5.1891, + "step": 45250 + }, + { + "epoch": 8.128366247755835, + "grad_norm": 10.926349639892578, + "learning_rate": 2.2499820466786355e-05, + "loss": 5.2339, + "step": 45275 + }, + { + "epoch": 8.132854578096948, + "grad_norm": 9.90239143371582, + "learning_rate": 2.2493836026331537e-05, + "loss": 5.2452, + "step": 45300 + }, + { + "epoch": 8.137342908438061, + "grad_norm": 9.074828147888184, + "learning_rate": 2.248785158587672e-05, + "loss": 5.1585, + "step": 45325 + }, + { + "epoch": 8.141831238779174, + "grad_norm": 10.672664642333984, + "learning_rate": 2.2481867145421905e-05, + "loss": 5.0365, + "step": 45350 + }, + { + "epoch": 8.146319569120287, + "grad_norm": 10.507611274719238, + "learning_rate": 2.2475882704967087e-05, + "loss": 5.2051, + "step": 45375 + }, + { + "epoch": 8.1508078994614, + "grad_norm": 10.496660232543945, 
+ "learning_rate": 2.246989826451227e-05, + "loss": 5.2079, + "step": 45400 + }, + { + "epoch": 8.155296229802513, + "grad_norm": 9.816904067993164, + "learning_rate": 2.2463913824057452e-05, + "loss": 5.1439, + "step": 45425 + }, + { + "epoch": 8.159784560143626, + "grad_norm": 10.555310249328613, + "learning_rate": 2.245792938360263e-05, + "loss": 5.0564, + "step": 45450 + }, + { + "epoch": 8.164272890484739, + "grad_norm": 9.81336498260498, + "learning_rate": 2.2451944943147817e-05, + "loss": 5.1877, + "step": 45475 + }, + { + "epoch": 8.168761220825854, + "grad_norm": 9.610879898071289, + "learning_rate": 2.2445960502693e-05, + "loss": 5.4053, + "step": 45500 + }, + { + "epoch": 8.173249551166966, + "grad_norm": 11.435028076171875, + "learning_rate": 2.243997606223818e-05, + "loss": 5.1614, + "step": 45525 + }, + { + "epoch": 8.17773788150808, + "grad_norm": 10.209677696228027, + "learning_rate": 2.2433991621783364e-05, + "loss": 5.1801, + "step": 45550 + }, + { + "epoch": 8.182226211849192, + "grad_norm": 10.315597534179688, + "learning_rate": 2.2428007181328546e-05, + "loss": 5.1563, + "step": 45575 + }, + { + "epoch": 8.186714542190305, + "grad_norm": 8.973522186279297, + "learning_rate": 2.242202274087373e-05, + "loss": 5.0744, + "step": 45600 + }, + { + "epoch": 8.191202872531418, + "grad_norm": 10.038873672485352, + "learning_rate": 2.2416038300418914e-05, + "loss": 5.2001, + "step": 45625 + }, + { + "epoch": 8.195691202872531, + "grad_norm": 11.411130905151367, + "learning_rate": 2.2410053859964093e-05, + "loss": 5.0716, + "step": 45650 + }, + { + "epoch": 8.200179533213644, + "grad_norm": 9.465821266174316, + "learning_rate": 2.2404069419509276e-05, + "loss": 5.19, + "step": 45675 + }, + { + "epoch": 8.204667863554757, + "grad_norm": 9.344893455505371, + "learning_rate": 2.2398084979054458e-05, + "loss": 5.2495, + "step": 45700 + }, + { + "epoch": 8.20915619389587, + "grad_norm": 10.435928344726562, + "learning_rate": 2.239210053859964e-05, + "loss": 
5.1054, + "step": 45725 + }, + { + "epoch": 8.213644524236983, + "grad_norm": 9.989950180053711, + "learning_rate": 2.2386116098144823e-05, + "loss": 5.2392, + "step": 45750 + }, + { + "epoch": 8.218132854578098, + "grad_norm": 9.326205253601074, + "learning_rate": 2.238013165769001e-05, + "loss": 5.1506, + "step": 45775 + }, + { + "epoch": 8.22262118491921, + "grad_norm": 10.645722389221191, + "learning_rate": 2.237414721723519e-05, + "loss": 4.9718, + "step": 45800 + }, + { + "epoch": 8.227109515260324, + "grad_norm": 11.091547012329102, + "learning_rate": 2.2368162776780373e-05, + "loss": 5.236, + "step": 45825 + }, + { + "epoch": 8.231597845601437, + "grad_norm": 9.921210289001465, + "learning_rate": 2.2362178336325552e-05, + "loss": 5.2084, + "step": 45850 + }, + { + "epoch": 8.23608617594255, + "grad_norm": 11.743041038513184, + "learning_rate": 2.2356193895870735e-05, + "loss": 5.1812, + "step": 45875 + }, + { + "epoch": 8.240574506283663, + "grad_norm": 9.926054000854492, + "learning_rate": 2.235020945541592e-05, + "loss": 5.5035, + "step": 45900 + }, + { + "epoch": 8.245062836624776, + "grad_norm": 10.212824821472168, + "learning_rate": 2.2344225014961103e-05, + "loss": 5.2178, + "step": 45925 + }, + { + "epoch": 8.249551166965889, + "grad_norm": 10.136125564575195, + "learning_rate": 2.2338240574506285e-05, + "loss": 5.1196, + "step": 45950 + }, + { + "epoch": 8.254039497307001, + "grad_norm": 10.523502349853516, + "learning_rate": 2.2332256134051467e-05, + "loss": 5.1725, + "step": 45975 + }, + { + "epoch": 8.258527827648114, + "grad_norm": 11.568868637084961, + "learning_rate": 2.232627169359665e-05, + "loss": 5.0352, + "step": 46000 + }, + { + "epoch": 8.263016157989227, + "grad_norm": 9.762911796569824, + "learning_rate": 2.2320287253141832e-05, + "loss": 5.3616, + "step": 46025 + }, + { + "epoch": 8.26750448833034, + "grad_norm": 9.98439884185791, + "learning_rate": 2.2314302812687014e-05, + "loss": 5.191, + "step": 46050 + }, + { + "epoch": 
8.271992818671453, + "grad_norm": 11.830445289611816, + "learning_rate": 2.2308318372232197e-05, + "loss": 5.1198, + "step": 46075 + }, + { + "epoch": 8.276481149012568, + "grad_norm": 10.572392463684082, + "learning_rate": 2.230233393177738e-05, + "loss": 5.1409, + "step": 46100 + }, + { + "epoch": 8.280969479353681, + "grad_norm": 9.594216346740723, + "learning_rate": 2.229634949132256e-05, + "loss": 5.0869, + "step": 46125 + }, + { + "epoch": 8.285457809694794, + "grad_norm": 11.911314010620117, + "learning_rate": 2.2290365050867744e-05, + "loss": 5.2213, + "step": 46150 + }, + { + "epoch": 8.289946140035907, + "grad_norm": 9.426054954528809, + "learning_rate": 2.2284619988031118e-05, + "loss": 5.2294, + "step": 46175 + }, + { + "epoch": 8.29443447037702, + "grad_norm": 10.455585479736328, + "learning_rate": 2.22786355475763e-05, + "loss": 5.2655, + "step": 46200 + }, + { + "epoch": 8.298922800718133, + "grad_norm": 8.907383918762207, + "learning_rate": 2.2272651107121486e-05, + "loss": 5.1279, + "step": 46225 + }, + { + "epoch": 8.303411131059246, + "grad_norm": 10.810797691345215, + "learning_rate": 2.2266666666666668e-05, + "loss": 5.2161, + "step": 46250 + }, + { + "epoch": 8.307899461400359, + "grad_norm": 10.28493881225586, + "learning_rate": 2.226068222621185e-05, + "loss": 5.2432, + "step": 46275 + }, + { + "epoch": 8.312387791741472, + "grad_norm": 10.614547729492188, + "learning_rate": 2.2254697785757033e-05, + "loss": 5.1532, + "step": 46300 + }, + { + "epoch": 8.316876122082585, + "grad_norm": 10.116838455200195, + "learning_rate": 2.2248713345302215e-05, + "loss": 5.2957, + "step": 46325 + }, + { + "epoch": 8.321364452423698, + "grad_norm": 10.379612922668457, + "learning_rate": 2.2242728904847397e-05, + "loss": 5.2907, + "step": 46350 + }, + { + "epoch": 8.325852782764812, + "grad_norm": 10.235963821411133, + "learning_rate": 2.223674446439258e-05, + "loss": 5.2355, + "step": 46375 + }, + { + "epoch": 8.330341113105925, + "grad_norm": 
11.197012901306152, + "learning_rate": 2.2230760023937762e-05, + "loss": 5.2814, + "step": 46400 + }, + { + "epoch": 8.334829443447038, + "grad_norm": 10.609167098999023, + "learning_rate": 2.2224775583482945e-05, + "loss": 5.192, + "step": 46425 + }, + { + "epoch": 8.339317773788151, + "grad_norm": 9.967497825622559, + "learning_rate": 2.2218791143028127e-05, + "loss": 5.1811, + "step": 46450 + }, + { + "epoch": 8.343806104129264, + "grad_norm": 10.767441749572754, + "learning_rate": 2.221280670257331e-05, + "loss": 5.2102, + "step": 46475 + }, + { + "epoch": 8.348294434470377, + "grad_norm": 10.59012222290039, + "learning_rate": 2.2206822262118495e-05, + "loss": 5.432, + "step": 46500 + }, + { + "epoch": 8.35278276481149, + "grad_norm": 10.031270980834961, + "learning_rate": 2.2200837821663677e-05, + "loss": 5.2829, + "step": 46525 + }, + { + "epoch": 8.357271095152603, + "grad_norm": 10.709795951843262, + "learning_rate": 2.2194853381208856e-05, + "loss": 5.3673, + "step": 46550 + }, + { + "epoch": 8.361759425493716, + "grad_norm": 10.801443099975586, + "learning_rate": 2.218886894075404e-05, + "loss": 5.2618, + "step": 46575 + }, + { + "epoch": 8.366247755834829, + "grad_norm": 9.68051528930664, + "learning_rate": 2.218288450029922e-05, + "loss": 5.1871, + "step": 46600 + }, + { + "epoch": 8.370736086175942, + "grad_norm": 10.253618240356445, + "learning_rate": 2.2176900059844403e-05, + "loss": 5.0197, + "step": 46625 + }, + { + "epoch": 8.375224416517055, + "grad_norm": 10.126220703125, + "learning_rate": 2.217091561938959e-05, + "loss": 5.1663, + "step": 46650 + }, + { + "epoch": 8.37971274685817, + "grad_norm": 10.906074523925781, + "learning_rate": 2.216493117893477e-05, + "loss": 5.1256, + "step": 46675 + }, + { + "epoch": 8.384201077199283, + "grad_norm": 12.453466415405273, + "learning_rate": 2.2158946738479954e-05, + "loss": 5.0909, + "step": 46700 + }, + { + "epoch": 8.388689407540395, + "grad_norm": 9.886167526245117, + "learning_rate": 
2.2152962298025136e-05, + "loss": 5.2275, + "step": 46725 + }, + { + "epoch": 8.393177737881508, + "grad_norm": 10.695226669311523, + "learning_rate": 2.2146977857570315e-05, + "loss": 5.1364, + "step": 46750 + }, + { + "epoch": 8.397666068222621, + "grad_norm": 10.646020889282227, + "learning_rate": 2.2140993417115498e-05, + "loss": 5.1978, + "step": 46775 + }, + { + "epoch": 8.402154398563734, + "grad_norm": 9.400237083435059, + "learning_rate": 2.2135008976660683e-05, + "loss": 5.2428, + "step": 46800 + }, + { + "epoch": 8.406642728904847, + "grad_norm": 10.550444602966309, + "learning_rate": 2.2129024536205866e-05, + "loss": 5.245, + "step": 46825 + }, + { + "epoch": 8.41113105924596, + "grad_norm": 11.12218952178955, + "learning_rate": 2.2123040095751048e-05, + "loss": 5.2104, + "step": 46850 + }, + { + "epoch": 8.415619389587073, + "grad_norm": 10.866241455078125, + "learning_rate": 2.211705565529623e-05, + "loss": 5.4585, + "step": 46875 + }, + { + "epoch": 8.420107719928186, + "grad_norm": 9.74103832244873, + "learning_rate": 2.2111071214841413e-05, + "loss": 5.2779, + "step": 46900 + }, + { + "epoch": 8.4245960502693, + "grad_norm": 10.916769027709961, + "learning_rate": 2.21050867743866e-05, + "loss": 5.0424, + "step": 46925 + }, + { + "epoch": 8.429084380610412, + "grad_norm": 10.27078628540039, + "learning_rate": 2.2099102333931778e-05, + "loss": 5.2651, + "step": 46950 + }, + { + "epoch": 8.433572710951527, + "grad_norm": 9.979581832885742, + "learning_rate": 2.209311789347696e-05, + "loss": 5.2426, + "step": 46975 + }, + { + "epoch": 8.43806104129264, + "grad_norm": 9.971446990966797, + "learning_rate": 2.2087133453022142e-05, + "loss": 5.1933, + "step": 47000 + }, + { + "epoch": 8.442549371633753, + "grad_norm": 11.096793174743652, + "learning_rate": 2.2081149012567325e-05, + "loss": 5.3826, + "step": 47025 + }, + { + "epoch": 8.447037701974866, + "grad_norm": 10.998697280883789, + "learning_rate": 2.2075164572112507e-05, + "loss": 5.1198, + "step": 
47050 + }, + { + "epoch": 8.451526032315979, + "grad_norm": 9.547799110412598, + "learning_rate": 2.2069180131657693e-05, + "loss": 5.3068, + "step": 47075 + }, + { + "epoch": 8.456014362657092, + "grad_norm": 11.565042495727539, + "learning_rate": 2.2063195691202875e-05, + "loss": 5.2026, + "step": 47100 + }, + { + "epoch": 8.460502692998205, + "grad_norm": 9.7587308883667, + "learning_rate": 2.2057211250748057e-05, + "loss": 5.2627, + "step": 47125 + }, + { + "epoch": 8.464991023339318, + "grad_norm": 10.85186767578125, + "learning_rate": 2.2051226810293236e-05, + "loss": 5.3424, + "step": 47150 + }, + { + "epoch": 8.46947935368043, + "grad_norm": 10.462532043457031, + "learning_rate": 2.204524236983842e-05, + "loss": 5.1987, + "step": 47175 + }, + { + "epoch": 8.473967684021543, + "grad_norm": 10.84025764465332, + "learning_rate": 2.20392579293836e-05, + "loss": 5.1154, + "step": 47200 + }, + { + "epoch": 8.478456014362656, + "grad_norm": 11.469942092895508, + "learning_rate": 2.2033273488928787e-05, + "loss": 5.3975, + "step": 47225 + }, + { + "epoch": 8.48294434470377, + "grad_norm": 9.800874710083008, + "learning_rate": 2.202728904847397e-05, + "loss": 5.2939, + "step": 47250 + }, + { + "epoch": 8.487432675044884, + "grad_norm": 10.864420890808105, + "learning_rate": 2.202130460801915e-05, + "loss": 5.218, + "step": 47275 + }, + { + "epoch": 8.491921005385997, + "grad_norm": 11.073099136352539, + "learning_rate": 2.2015320167564334e-05, + "loss": 5.2704, + "step": 47300 + }, + { + "epoch": 8.49640933572711, + "grad_norm": 11.890022277832031, + "learning_rate": 2.2009335727109516e-05, + "loss": 5.2045, + "step": 47325 + }, + { + "epoch": 8.500897666068223, + "grad_norm": 10.802329063415527, + "learning_rate": 2.20033512866547e-05, + "loss": 5.3375, + "step": 47350 + }, + { + "epoch": 8.505385996409336, + "grad_norm": 10.372382164001465, + "learning_rate": 2.199736684619988e-05, + "loss": 5.1189, + "step": 47375 + }, + { + "epoch": 8.509874326750449, + 
"grad_norm": 11.273509979248047, + "learning_rate": 2.1991382405745063e-05, + "loss": 5.4012, + "step": 47400 + }, + { + "epoch": 8.514362657091562, + "grad_norm": 10.93100643157959, + "learning_rate": 2.1985397965290246e-05, + "loss": 5.1399, + "step": 47425 + }, + { + "epoch": 8.518850987432675, + "grad_norm": 10.412191390991211, + "learning_rate": 2.1979413524835428e-05, + "loss": 5.4521, + "step": 47450 + }, + { + "epoch": 8.523339317773788, + "grad_norm": 10.987630844116211, + "learning_rate": 2.197342908438061e-05, + "loss": 4.991, + "step": 47475 + }, + { + "epoch": 8.5278276481149, + "grad_norm": 10.485819816589355, + "learning_rate": 2.1967444643925796e-05, + "loss": 5.1917, + "step": 47500 + }, + { + "epoch": 8.532315978456014, + "grad_norm": 10.957935333251953, + "learning_rate": 2.1961460203470975e-05, + "loss": 5.2347, + "step": 47525 + }, + { + "epoch": 8.536804308797127, + "grad_norm": 11.57468318939209, + "learning_rate": 2.1955475763016158e-05, + "loss": 5.1552, + "step": 47550 + }, + { + "epoch": 8.541292639138241, + "grad_norm": 10.283357620239258, + "learning_rate": 2.194949132256134e-05, + "loss": 5.2681, + "step": 47575 + }, + { + "epoch": 8.545780969479354, + "grad_norm": 11.03958797454834, + "learning_rate": 2.1943506882106522e-05, + "loss": 5.1314, + "step": 47600 + }, + { + "epoch": 8.550269299820467, + "grad_norm": 11.066431999206543, + "learning_rate": 2.1937522441651705e-05, + "loss": 5.2273, + "step": 47625 + }, + { + "epoch": 8.55475763016158, + "grad_norm": 8.904008865356445, + "learning_rate": 2.193153800119689e-05, + "loss": 5.3407, + "step": 47650 + }, + { + "epoch": 8.559245960502693, + "grad_norm": 10.82641315460205, + "learning_rate": 2.1925553560742073e-05, + "loss": 5.2199, + "step": 47675 + }, + { + "epoch": 8.563734290843806, + "grad_norm": 10.243392944335938, + "learning_rate": 2.1919569120287255e-05, + "loss": 5.0552, + "step": 47700 + }, + { + "epoch": 8.568222621184919, + "grad_norm": 10.156676292419434, + 
"learning_rate": 2.1913584679832434e-05, + "loss": 5.3285, + "step": 47725 + }, + { + "epoch": 8.572710951526032, + "grad_norm": 10.938075065612793, + "learning_rate": 2.1907600239377616e-05, + "loss": 5.2824, + "step": 47750 + }, + { + "epoch": 8.577199281867145, + "grad_norm": 11.579339027404785, + "learning_rate": 2.1901615798922802e-05, + "loss": 5.2204, + "step": 47775 + }, + { + "epoch": 8.581687612208258, + "grad_norm": 10.570602416992188, + "learning_rate": 2.1895631358467985e-05, + "loss": 5.1901, + "step": 47800 + }, + { + "epoch": 8.58617594254937, + "grad_norm": 13.22849178314209, + "learning_rate": 2.1889646918013167e-05, + "loss": 5.3084, + "step": 47825 + }, + { + "epoch": 8.590664272890486, + "grad_norm": 12.514755249023438, + "learning_rate": 2.188366247755835e-05, + "loss": 5.1501, + "step": 47850 + }, + { + "epoch": 8.595152603231599, + "grad_norm": 10.336485862731934, + "learning_rate": 2.187767803710353e-05, + "loss": 5.0276, + "step": 47875 + }, + { + "epoch": 8.599640933572712, + "grad_norm": 15.011330604553223, + "learning_rate": 2.1871693596648714e-05, + "loss": 5.2893, + "step": 47900 + }, + { + "epoch": 8.604129263913824, + "grad_norm": 9.730998992919922, + "learning_rate": 2.1865709156193896e-05, + "loss": 5.2511, + "step": 47925 + }, + { + "epoch": 8.608617594254937, + "grad_norm": 11.499335289001465, + "learning_rate": 2.185972471573908e-05, + "loss": 5.2076, + "step": 47950 + }, + { + "epoch": 8.61310592459605, + "grad_norm": 10.13596248626709, + "learning_rate": 2.185374027528426e-05, + "loss": 5.3255, + "step": 47975 + }, + { + "epoch": 8.617594254937163, + "grad_norm": 10.147706031799316, + "learning_rate": 2.1847755834829443e-05, + "loss": 5.4129, + "step": 48000 + }, + { + "epoch": 8.622082585278276, + "grad_norm": 10.698373794555664, + "learning_rate": 2.1841771394374626e-05, + "loss": 5.211, + "step": 48025 + }, + { + "epoch": 8.62657091561939, + "grad_norm": 12.218047142028809, + "learning_rate": 2.1835786953919808e-05, + 
"loss": 5.348, + "step": 48050 + }, + { + "epoch": 8.631059245960502, + "grad_norm": 9.935876846313477, + "learning_rate": 2.1829802513464994e-05, + "loss": 5.262, + "step": 48075 + }, + { + "epoch": 8.635547576301615, + "grad_norm": 11.545351028442383, + "learning_rate": 2.1823818073010176e-05, + "loss": 5.283, + "step": 48100 + }, + { + "epoch": 8.640035906642728, + "grad_norm": 10.455262184143066, + "learning_rate": 2.1817833632555355e-05, + "loss": 5.1805, + "step": 48125 + }, + { + "epoch": 8.644524236983841, + "grad_norm": 10.697651863098145, + "learning_rate": 2.1811849192100538e-05, + "loss": 5.3054, + "step": 48150 + }, + { + "epoch": 8.649012567324956, + "grad_norm": 10.689338684082031, + "learning_rate": 2.180586475164572e-05, + "loss": 5.3932, + "step": 48175 + }, + { + "epoch": 8.653500897666069, + "grad_norm": 10.77692699432373, + "learning_rate": 2.1799880311190902e-05, + "loss": 5.3862, + "step": 48200 + }, + { + "epoch": 8.657989228007182, + "grad_norm": 9.825878143310547, + "learning_rate": 2.1793895870736088e-05, + "loss": 5.177, + "step": 48225 + }, + { + "epoch": 8.662477558348295, + "grad_norm": 10.56619644165039, + "learning_rate": 2.178791143028127e-05, + "loss": 5.2774, + "step": 48250 + }, + { + "epoch": 8.666965888689408, + "grad_norm": 10.832696914672852, + "learning_rate": 2.1781926989826453e-05, + "loss": 5.3445, + "step": 48275 + }, + { + "epoch": 8.67145421903052, + "grad_norm": 9.868931770324707, + "learning_rate": 2.1775942549371635e-05, + "loss": 5.3468, + "step": 48300 + }, + { + "epoch": 8.675942549371634, + "grad_norm": 12.153959274291992, + "learning_rate": 2.1769958108916814e-05, + "loss": 5.2522, + "step": 48325 + }, + { + "epoch": 8.680430879712747, + "grad_norm": 10.559062004089355, + "learning_rate": 2.1763973668462e-05, + "loss": 5.3405, + "step": 48350 + }, + { + "epoch": 8.68491921005386, + "grad_norm": 9.911727905273438, + "learning_rate": 2.1757989228007182e-05, + "loss": 5.3647, + "step": 48375 + }, + { + "epoch": 
8.689407540394972, + "grad_norm": 10.340649604797363, + "learning_rate": 2.1752004787552365e-05, + "loss": 5.1546, + "step": 48400 + }, + { + "epoch": 8.693895870736085, + "grad_norm": 11.530205726623535, + "learning_rate": 2.1746020347097547e-05, + "loss": 5.2613, + "step": 48425 + }, + { + "epoch": 8.6983842010772, + "grad_norm": 10.38056468963623, + "learning_rate": 2.174003590664273e-05, + "loss": 5.2321, + "step": 48450 + }, + { + "epoch": 8.702872531418313, + "grad_norm": 11.195280075073242, + "learning_rate": 2.173405146618791e-05, + "loss": 5.2537, + "step": 48475 + }, + { + "epoch": 8.707360861759426, + "grad_norm": 11.222289085388184, + "learning_rate": 2.1728067025733097e-05, + "loss": 5.2224, + "step": 48500 + }, + { + "epoch": 8.711849192100539, + "grad_norm": 8.796674728393555, + "learning_rate": 2.1722082585278276e-05, + "loss": 5.3193, + "step": 48525 + }, + { + "epoch": 8.716337522441652, + "grad_norm": 12.625653266906738, + "learning_rate": 2.171609814482346e-05, + "loss": 5.2526, + "step": 48550 + }, + { + "epoch": 8.720825852782765, + "grad_norm": 10.351212501525879, + "learning_rate": 2.171011370436864e-05, + "loss": 5.405, + "step": 48575 + }, + { + "epoch": 8.725314183123878, + "grad_norm": 9.797446250915527, + "learning_rate": 2.1704129263913823e-05, + "loss": 5.3782, + "step": 48600 + }, + { + "epoch": 8.72980251346499, + "grad_norm": 10.537493705749512, + "learning_rate": 2.1698144823459006e-05, + "loss": 5.3131, + "step": 48625 + }, + { + "epoch": 8.734290843806104, + "grad_norm": 11.738768577575684, + "learning_rate": 2.169216038300419e-05, + "loss": 5.3193, + "step": 48650 + }, + { + "epoch": 8.738779174147217, + "grad_norm": 9.498252868652344, + "learning_rate": 2.1686175942549374e-05, + "loss": 5.385, + "step": 48675 + }, + { + "epoch": 8.74326750448833, + "grad_norm": 11.94810962677002, + "learning_rate": 2.1680191502094556e-05, + "loss": 5.3195, + "step": 48700 + }, + { + "epoch": 8.747755834829443, + "grad_norm": 9.7661771774292, + 
"learning_rate": 2.1674207061639735e-05, + "loss": 5.3503, + "step": 48725 + }, + { + "epoch": 8.752244165170557, + "grad_norm": 10.59184455871582, + "learning_rate": 2.1668222621184918e-05, + "loss": 5.1579, + "step": 48750 + }, + { + "epoch": 8.75673249551167, + "grad_norm": 9.918368339538574, + "learning_rate": 2.1662238180730103e-05, + "loss": 5.2327, + "step": 48775 + }, + { + "epoch": 8.761220825852783, + "grad_norm": 11.547975540161133, + "learning_rate": 2.1656253740275286e-05, + "loss": 5.2955, + "step": 48800 + }, + { + "epoch": 8.765709156193896, + "grad_norm": 10.992860794067383, + "learning_rate": 2.1650269299820468e-05, + "loss": 5.4481, + "step": 48825 + }, + { + "epoch": 8.77019748653501, + "grad_norm": 10.515495300292969, + "learning_rate": 2.164428485936565e-05, + "loss": 5.2542, + "step": 48850 + }, + { + "epoch": 8.774685816876122, + "grad_norm": 10.20100212097168, + "learning_rate": 2.1638300418910833e-05, + "loss": 5.2871, + "step": 48875 + }, + { + "epoch": 8.779174147217235, + "grad_norm": 10.697768211364746, + "learning_rate": 2.1632315978456015e-05, + "loss": 5.3581, + "step": 48900 + }, + { + "epoch": 8.783662477558348, + "grad_norm": 10.626644134521484, + "learning_rate": 2.1626331538001197e-05, + "loss": 5.2574, + "step": 48925 + }, + { + "epoch": 8.788150807899461, + "grad_norm": 11.969337463378906, + "learning_rate": 2.1620586475164575e-05, + "loss": 5.1835, + "step": 48950 + }, + { + "epoch": 8.792639138240574, + "grad_norm": 10.318828582763672, + "learning_rate": 2.1614602034709757e-05, + "loss": 5.3878, + "step": 48975 + }, + { + "epoch": 8.797127468581687, + "grad_norm": 10.539376258850098, + "learning_rate": 2.160861759425494e-05, + "loss": 5.1598, + "step": 49000 + }, + { + "epoch": 8.8016157989228, + "grad_norm": 11.167855262756348, + "learning_rate": 2.1602633153800118e-05, + "loss": 5.5054, + "step": 49025 + }, + { + "epoch": 8.806104129263915, + "grad_norm": 10.033782005310059, + "learning_rate": 2.15966487133453e-05, + 
"loss": 5.1656, + "step": 49050 + }, + { + "epoch": 8.810592459605028, + "grad_norm": 10.156709671020508, + "learning_rate": 2.1590664272890483e-05, + "loss": 5.322, + "step": 49075 + }, + { + "epoch": 8.81508078994614, + "grad_norm": 11.136800765991211, + "learning_rate": 2.158467983243567e-05, + "loss": 5.2937, + "step": 49100 + }, + { + "epoch": 8.819569120287253, + "grad_norm": 11.107318878173828, + "learning_rate": 2.157869539198085e-05, + "loss": 5.2603, + "step": 49125 + }, + { + "epoch": 8.824057450628366, + "grad_norm": 10.817268371582031, + "learning_rate": 2.1572710951526033e-05, + "loss": 5.1189, + "step": 49150 + }, + { + "epoch": 8.82854578096948, + "grad_norm": 11.161693572998047, + "learning_rate": 2.1566726511071216e-05, + "loss": 5.5742, + "step": 49175 + }, + { + "epoch": 8.833034111310592, + "grad_norm": 9.620489120483398, + "learning_rate": 2.1560742070616398e-05, + "loss": 5.2131, + "step": 49200 + }, + { + "epoch": 8.837522441651705, + "grad_norm": 10.738325119018555, + "learning_rate": 2.155475763016158e-05, + "loss": 5.1408, + "step": 49225 + }, + { + "epoch": 8.842010771992818, + "grad_norm": 11.047571182250977, + "learning_rate": 2.1548773189706763e-05, + "loss": 5.3792, + "step": 49250 + }, + { + "epoch": 8.846499102333931, + "grad_norm": 9.337359428405762, + "learning_rate": 2.1542788749251945e-05, + "loss": 5.2791, + "step": 49275 + }, + { + "epoch": 8.850987432675044, + "grad_norm": 11.4617280960083, + "learning_rate": 2.1536804308797128e-05, + "loss": 5.3459, + "step": 49300 + }, + { + "epoch": 8.855475763016159, + "grad_norm": 10.279753684997559, + "learning_rate": 2.153081986834231e-05, + "loss": 5.5209, + "step": 49325 + }, + { + "epoch": 8.859964093357272, + "grad_norm": 9.714325904846191, + "learning_rate": 2.1524835427887492e-05, + "loss": 5.3998, + "step": 49350 + }, + { + "epoch": 8.864452423698385, + "grad_norm": 9.866265296936035, + "learning_rate": 2.1518850987432678e-05, + "loss": 5.2042, + "step": 49375 + }, + { + 
"epoch": 8.868940754039498, + "grad_norm": 11.083367347717285, + "learning_rate": 2.151286654697786e-05, + "loss": 5.1786, + "step": 49400 + }, + { + "epoch": 8.87342908438061, + "grad_norm": 11.374133110046387, + "learning_rate": 2.150688210652304e-05, + "loss": 5.1779, + "step": 49425 + }, + { + "epoch": 8.877917414721724, + "grad_norm": 11.157488822937012, + "learning_rate": 2.1500897666068222e-05, + "loss": 5.2288, + "step": 49450 + }, + { + "epoch": 8.882405745062837, + "grad_norm": 10.947492599487305, + "learning_rate": 2.1494913225613404e-05, + "loss": 5.3447, + "step": 49475 + }, + { + "epoch": 8.88689407540395, + "grad_norm": 12.07391357421875, + "learning_rate": 2.1488928785158586e-05, + "loss": 5.2643, + "step": 49500 + }, + { + "epoch": 8.891382405745063, + "grad_norm": 12.040887832641602, + "learning_rate": 2.1482944344703772e-05, + "loss": 5.2437, + "step": 49525 + }, + { + "epoch": 8.895870736086176, + "grad_norm": 10.619282722473145, + "learning_rate": 2.1476959904248955e-05, + "loss": 5.2118, + "step": 49550 + }, + { + "epoch": 8.900359066427288, + "grad_norm": 9.800612449645996, + "learning_rate": 2.1470975463794137e-05, + "loss": 5.2497, + "step": 49575 + }, + { + "epoch": 8.904847396768401, + "grad_norm": 9.855249404907227, + "learning_rate": 2.146499102333932e-05, + "loss": 5.3554, + "step": 49600 + }, + { + "epoch": 8.909335727109514, + "grad_norm": 10.331745147705078, + "learning_rate": 2.1459006582884498e-05, + "loss": 5.3359, + "step": 49625 + }, + { + "epoch": 8.91382405745063, + "grad_norm": 11.430087089538574, + "learning_rate": 2.1453022142429684e-05, + "loss": 5.4768, + "step": 49650 + }, + { + "epoch": 8.918312387791742, + "grad_norm": 9.949564933776855, + "learning_rate": 2.1447037701974866e-05, + "loss": 5.1868, + "step": 49675 + }, + { + "epoch": 8.922800718132855, + "grad_norm": 13.41470718383789, + "learning_rate": 2.144105326152005e-05, + "loss": 5.3096, + "step": 49700 + }, + { + "epoch": 8.927289048473968, + "grad_norm": 
9.869828224182129, + "learning_rate": 2.143506882106523e-05, + "loss": 5.2663, + "step": 49725 + }, + { + "epoch": 8.931777378815081, + "grad_norm": 9.495867729187012, + "learning_rate": 2.1429084380610413e-05, + "loss": 5.2583, + "step": 49750 + }, + { + "epoch": 8.936265709156194, + "grad_norm": 10.486732482910156, + "learning_rate": 2.1423099940155596e-05, + "loss": 5.3076, + "step": 49775 + }, + { + "epoch": 8.940754039497307, + "grad_norm": 10.989167213439941, + "learning_rate": 2.141711549970078e-05, + "loss": 5.1784, + "step": 49800 + }, + { + "epoch": 8.94524236983842, + "grad_norm": 11.814510345458984, + "learning_rate": 2.141113105924596e-05, + "loss": 5.1897, + "step": 49825 + }, + { + "epoch": 8.949730700179533, + "grad_norm": 12.126526832580566, + "learning_rate": 2.1405146618791143e-05, + "loss": 5.123, + "step": 49850 + }, + { + "epoch": 8.954219030520646, + "grad_norm": 11.386066436767578, + "learning_rate": 2.1399162178336325e-05, + "loss": 5.4526, + "step": 49875 + }, + { + "epoch": 8.958707360861759, + "grad_norm": 10.271559715270996, + "learning_rate": 2.1393177737881508e-05, + "loss": 5.1666, + "step": 49900 + }, + { + "epoch": 8.963195691202873, + "grad_norm": 11.861756324768066, + "learning_rate": 2.138719329742669e-05, + "loss": 5.3307, + "step": 49925 + }, + { + "epoch": 8.967684021543986, + "grad_norm": 10.845406532287598, + "learning_rate": 2.1381208856971876e-05, + "loss": 5.3395, + "step": 49950 + }, + { + "epoch": 8.9721723518851, + "grad_norm": 10.746824264526367, + "learning_rate": 2.1375224416517058e-05, + "loss": 5.1308, + "step": 49975 + }, + { + "epoch": 8.976660682226212, + "grad_norm": 10.742770195007324, + "learning_rate": 2.136923997606224e-05, + "loss": 5.2117, + "step": 50000 + }, + { + "epoch": 8.981149012567325, + "grad_norm": 10.22166919708252, + "learning_rate": 2.136325553560742e-05, + "loss": 5.141, + "step": 50025 + }, + { + "epoch": 8.985637342908438, + "grad_norm": 10.348222732543945, + "learning_rate": 
2.1357271095152602e-05, + "loss": 5.2875, + "step": 50050 + }, + { + "epoch": 8.990125673249551, + "grad_norm": 10.708039283752441, + "learning_rate": 2.1351286654697784e-05, + "loss": 5.2098, + "step": 50075 + }, + { + "epoch": 8.994614003590664, + "grad_norm": 9.801313400268555, + "learning_rate": 2.134530221424297e-05, + "loss": 5.3795, + "step": 50100 + }, + { + "epoch": 8.999102333931777, + "grad_norm": 9.760636329650879, + "learning_rate": 2.1339317773788152e-05, + "loss": 5.2915, + "step": 50125 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.07656616959120158, + "eval_f1_macro": 0.00656364927642598, + "eval_f1_micro": 0.07656616959120158, + "eval_f1_weighted": 0.043420919183015314, + "eval_loss": 6.6508636474609375, + "eval_precision_macro": 0.00600908866877079, + "eval_precision_micro": 0.07656616959120158, + "eval_precision_weighted": 0.035625132024726454, + "eval_recall_macro": 0.011146057567632007, + "eval_recall_micro": 0.07656616959120158, + "eval_recall_weighted": 0.07656616959120158, + "eval_runtime": 62.2181, + "eval_samples_per_second": 841.765, + "eval_steps_per_second": 26.311, + "step": 50130 + }, + { + "epoch": 9.00359066427289, + "grad_norm": 10.240592002868652, + "learning_rate": 2.1333333333333335e-05, + "loss": 4.9593, + "step": 50150 + }, + { + "epoch": 9.008078994614003, + "grad_norm": 12.009661674499512, + "learning_rate": 2.1327348892878517e-05, + "loss": 4.9742, + "step": 50175 + }, + { + "epoch": 9.012567324955116, + "grad_norm": 10.410688400268555, + "learning_rate": 2.13213644524237e-05, + "loss": 4.8845, + "step": 50200 + }, + { + "epoch": 9.01705565529623, + "grad_norm": 10.37349796295166, + "learning_rate": 2.1315380011968882e-05, + "loss": 5.0186, + "step": 50225 + }, + { + "epoch": 9.021543985637344, + "grad_norm": 9.995245933532715, + "learning_rate": 2.1309395571514064e-05, + "loss": 4.8797, + "step": 50250 + }, + { + "epoch": 9.026032315978457, + "grad_norm": 10.604227066040039, + "learning_rate": 2.1303411131059246e-05, + 
"loss": 4.9107, + "step": 50275 + }, + { + "epoch": 9.03052064631957, + "grad_norm": 10.031315803527832, + "learning_rate": 2.129742669060443e-05, + "loss": 4.8749, + "step": 50300 + }, + { + "epoch": 9.035008976660682, + "grad_norm": 10.97464656829834, + "learning_rate": 2.129144225014961e-05, + "loss": 4.9961, + "step": 50325 + }, + { + "epoch": 9.039497307001795, + "grad_norm": 10.07321834564209, + "learning_rate": 2.1285457809694794e-05, + "loss": 4.9907, + "step": 50350 + }, + { + "epoch": 9.043985637342908, + "grad_norm": 9.779594421386719, + "learning_rate": 2.127947336923998e-05, + "loss": 4.8672, + "step": 50375 + }, + { + "epoch": 9.048473967684021, + "grad_norm": 9.68058967590332, + "learning_rate": 2.1273488928785158e-05, + "loss": 4.9835, + "step": 50400 + }, + { + "epoch": 9.052962298025134, + "grad_norm": 11.369108200073242, + "learning_rate": 2.126750448833034e-05, + "loss": 4.9773, + "step": 50425 + }, + { + "epoch": 9.057450628366247, + "grad_norm": 10.584446907043457, + "learning_rate": 2.1261520047875523e-05, + "loss": 4.8888, + "step": 50450 + }, + { + "epoch": 9.06193895870736, + "grad_norm": 11.194197654724121, + "learning_rate": 2.1255535607420705e-05, + "loss": 4.8787, + "step": 50475 + }, + { + "epoch": 9.066427289048473, + "grad_norm": 11.824823379516602, + "learning_rate": 2.1249551166965888e-05, + "loss": 4.9117, + "step": 50500 + }, + { + "epoch": 9.070915619389588, + "grad_norm": 10.205167770385742, + "learning_rate": 2.1243566726511073e-05, + "loss": 4.8602, + "step": 50525 + }, + { + "epoch": 9.0754039497307, + "grad_norm": 10.782133102416992, + "learning_rate": 2.1237582286056256e-05, + "loss": 5.0717, + "step": 50550 + }, + { + "epoch": 9.079892280071814, + "grad_norm": 8.940252304077148, + "learning_rate": 2.1231597845601438e-05, + "loss": 4.9031, + "step": 50575 + }, + { + "epoch": 9.084380610412927, + "grad_norm": 11.509713172912598, + "learning_rate": 2.1225613405146617e-05, + "loss": 4.8932, + "step": 50600 + }, + { + 
"epoch": 9.08886894075404, + "grad_norm": 10.67092514038086, + "learning_rate": 2.12196289646918e-05, + "loss": 5.0673, + "step": 50625 + }, + { + "epoch": 9.093357271095153, + "grad_norm": 11.562685012817383, + "learning_rate": 2.1213644524236985e-05, + "loss": 5.0485, + "step": 50650 + }, + { + "epoch": 9.097845601436266, + "grad_norm": 9.753336906433105, + "learning_rate": 2.1207660083782168e-05, + "loss": 4.9121, + "step": 50675 + }, + { + "epoch": 9.102333931777379, + "grad_norm": 10.191324234008789, + "learning_rate": 2.120167564332735e-05, + "loss": 5.1551, + "step": 50700 + }, + { + "epoch": 9.106822262118492, + "grad_norm": 9.705289840698242, + "learning_rate": 2.1195691202872532e-05, + "loss": 5.1079, + "step": 50725 + }, + { + "epoch": 9.111310592459605, + "grad_norm": 10.571413040161133, + "learning_rate": 2.1189706762417715e-05, + "loss": 4.9026, + "step": 50750 + }, + { + "epoch": 9.115798922800717, + "grad_norm": 11.201866149902344, + "learning_rate": 2.1183722321962897e-05, + "loss": 4.8491, + "step": 50775 + }, + { + "epoch": 9.12028725314183, + "grad_norm": 12.654863357543945, + "learning_rate": 2.117773788150808e-05, + "loss": 5.1168, + "step": 50800 + }, + { + "epoch": 9.124775583482945, + "grad_norm": 11.242270469665527, + "learning_rate": 2.1171753441053262e-05, + "loss": 4.8855, + "step": 50825 + }, + { + "epoch": 9.129263913824058, + "grad_norm": 11.5079984664917, + "learning_rate": 2.1165769000598444e-05, + "loss": 5.1378, + "step": 50850 + }, + { + "epoch": 9.133752244165171, + "grad_norm": 9.444543838500977, + "learning_rate": 2.1159784560143626e-05, + "loss": 4.9139, + "step": 50875 + }, + { + "epoch": 9.138240574506284, + "grad_norm": 10.413363456726074, + "learning_rate": 2.115380011968881e-05, + "loss": 4.9483, + "step": 50900 + }, + { + "epoch": 9.142728904847397, + "grad_norm": 11.94205093383789, + "learning_rate": 2.114781567923399e-05, + "loss": 4.8358, + "step": 50925 + }, + { + "epoch": 9.14721723518851, + "grad_norm": 
12.93112564086914, + "learning_rate": 2.1141831238779177e-05, + "loss": 4.8942, + "step": 50950 + }, + { + "epoch": 9.151705565529623, + "grad_norm": 9.696127891540527, + "learning_rate": 2.113584679832436e-05, + "loss": 4.8718, + "step": 50975 + }, + { + "epoch": 9.156193895870736, + "grad_norm": 9.890364646911621, + "learning_rate": 2.1130101735487733e-05, + "loss": 5.1286, + "step": 51000 + }, + { + "epoch": 9.160682226211849, + "grad_norm": 11.422094345092773, + "learning_rate": 2.1124117295032915e-05, + "loss": 4.8448, + "step": 51025 + }, + { + "epoch": 9.165170556552962, + "grad_norm": 10.851180076599121, + "learning_rate": 2.1118132854578098e-05, + "loss": 5.0296, + "step": 51050 + }, + { + "epoch": 9.169658886894075, + "grad_norm": 10.964667320251465, + "learning_rate": 2.111214841412328e-05, + "loss": 5.0404, + "step": 51075 + }, + { + "epoch": 9.174147217235188, + "grad_norm": 8.993091583251953, + "learning_rate": 2.1106163973668466e-05, + "loss": 4.9282, + "step": 51100 + }, + { + "epoch": 9.178635547576302, + "grad_norm": 10.206158638000488, + "learning_rate": 2.1100179533213645e-05, + "loss": 4.7878, + "step": 51125 + }, + { + "epoch": 9.183123877917415, + "grad_norm": 10.357916831970215, + "learning_rate": 2.1094195092758827e-05, + "loss": 4.7689, + "step": 51150 + }, + { + "epoch": 9.187612208258528, + "grad_norm": 11.651240348815918, + "learning_rate": 2.108821065230401e-05, + "loss": 4.884, + "step": 51175 + }, + { + "epoch": 9.192100538599641, + "grad_norm": 9.834525108337402, + "learning_rate": 2.1082226211849192e-05, + "loss": 4.9186, + "step": 51200 + }, + { + "epoch": 9.196588868940754, + "grad_norm": 9.838241577148438, + "learning_rate": 2.1076241771394374e-05, + "loss": 4.9413, + "step": 51225 + }, + { + "epoch": 9.201077199281867, + "grad_norm": 10.606132507324219, + "learning_rate": 2.107025733093956e-05, + "loss": 4.9694, + "step": 51250 + }, + { + "epoch": 9.20556552962298, + "grad_norm": 10.845372200012207, + "learning_rate": 
2.1064272890484742e-05, + "loss": 4.8669, + "step": 51275 + }, + { + "epoch": 9.210053859964093, + "grad_norm": 10.727575302124023, + "learning_rate": 2.105828845002992e-05, + "loss": 5.0455, + "step": 51300 + }, + { + "epoch": 9.214542190305206, + "grad_norm": 11.035595893859863, + "learning_rate": 2.1052304009575104e-05, + "loss": 5.0073, + "step": 51325 + }, + { + "epoch": 9.219030520646319, + "grad_norm": 10.236844062805176, + "learning_rate": 2.1046319569120286e-05, + "loss": 4.9515, + "step": 51350 + }, + { + "epoch": 9.223518850987432, + "grad_norm": 9.506902694702148, + "learning_rate": 2.104033512866547e-05, + "loss": 5.1074, + "step": 51375 + }, + { + "epoch": 9.228007181328545, + "grad_norm": 9.972946166992188, + "learning_rate": 2.1034350688210654e-05, + "loss": 4.7896, + "step": 51400 + }, + { + "epoch": 9.23249551166966, + "grad_norm": 12.363295555114746, + "learning_rate": 2.1028366247755836e-05, + "loss": 5.0402, + "step": 51425 + }, + { + "epoch": 9.236983842010773, + "grad_norm": 10.643270492553711, + "learning_rate": 2.102238180730102e-05, + "loss": 4.8928, + "step": 51450 + }, + { + "epoch": 9.241472172351886, + "grad_norm": 10.754084587097168, + "learning_rate": 2.10163973668462e-05, + "loss": 4.9026, + "step": 51475 + }, + { + "epoch": 9.245960502692999, + "grad_norm": 11.471101760864258, + "learning_rate": 2.101041292639138e-05, + "loss": 4.8699, + "step": 51500 + }, + { + "epoch": 9.250448833034111, + "grad_norm": 10.431634902954102, + "learning_rate": 2.1004428485936566e-05, + "loss": 4.9584, + "step": 51525 + }, + { + "epoch": 9.254937163375224, + "grad_norm": 10.42699146270752, + "learning_rate": 2.0998444045481748e-05, + "loss": 5.0436, + "step": 51550 + }, + { + "epoch": 9.259425493716337, + "grad_norm": 11.376937866210938, + "learning_rate": 2.099245960502693e-05, + "loss": 5.0192, + "step": 51575 + }, + { + "epoch": 9.26391382405745, + "grad_norm": 10.184343338012695, + "learning_rate": 2.0986475164572113e-05, + "loss": 4.982, + 
"step": 51600 + }, + { + "epoch": 9.268402154398563, + "grad_norm": 11.220765113830566, + "learning_rate": 2.0980490724117295e-05, + "loss": 5.0093, + "step": 51625 + }, + { + "epoch": 9.272890484739676, + "grad_norm": 10.218841552734375, + "learning_rate": 2.0974506283662478e-05, + "loss": 4.9967, + "step": 51650 + }, + { + "epoch": 9.27737881508079, + "grad_norm": 10.499467849731445, + "learning_rate": 2.0968521843207663e-05, + "loss": 4.8269, + "step": 51675 + }, + { + "epoch": 9.281867145421902, + "grad_norm": 10.555922508239746, + "learning_rate": 2.0962537402752842e-05, + "loss": 4.9484, + "step": 51700 + }, + { + "epoch": 9.286355475763017, + "grad_norm": 10.570606231689453, + "learning_rate": 2.0956552962298025e-05, + "loss": 5.1115, + "step": 51725 + }, + { + "epoch": 9.29084380610413, + "grad_norm": 11.656220436096191, + "learning_rate": 2.0950568521843207e-05, + "loss": 4.9073, + "step": 51750 + }, + { + "epoch": 9.295332136445243, + "grad_norm": 11.132031440734863, + "learning_rate": 2.094458408138839e-05, + "loss": 4.9241, + "step": 51775 + }, + { + "epoch": 9.299820466786356, + "grad_norm": 10.277223587036133, + "learning_rate": 2.0938599640933572e-05, + "loss": 5.0038, + "step": 51800 + }, + { + "epoch": 9.304308797127469, + "grad_norm": 11.584080696105957, + "learning_rate": 2.0932615200478758e-05, + "loss": 5.0719, + "step": 51825 + }, + { + "epoch": 9.308797127468582, + "grad_norm": 10.864374160766602, + "learning_rate": 2.092663076002394e-05, + "loss": 4.9161, + "step": 51850 + }, + { + "epoch": 9.313285457809695, + "grad_norm": 11.245027542114258, + "learning_rate": 2.0920646319569122e-05, + "loss": 5.0108, + "step": 51875 + }, + { + "epoch": 9.317773788150808, + "grad_norm": 12.04010009765625, + "learning_rate": 2.09146618791143e-05, + "loss": 4.9424, + "step": 51900 + }, + { + "epoch": 9.32226211849192, + "grad_norm": 9.722957611083984, + "learning_rate": 2.0908677438659484e-05, + "loss": 4.8921, + "step": 51925 + }, + { + "epoch": 
9.326750448833034, + "grad_norm": 11.00196361541748, + "learning_rate": 2.0902692998204666e-05, + "loss": 4.919, + "step": 51950 + }, + { + "epoch": 9.331238779174146, + "grad_norm": 11.12526798248291, + "learning_rate": 2.0896708557749852e-05, + "loss": 5.0289, + "step": 51975 + }, + { + "epoch": 9.335727109515261, + "grad_norm": 10.11386775970459, + "learning_rate": 2.0890724117295034e-05, + "loss": 4.9624, + "step": 52000 + }, + { + "epoch": 9.340215439856374, + "grad_norm": 11.444221496582031, + "learning_rate": 2.0884739676840217e-05, + "loss": 4.9909, + "step": 52025 + }, + { + "epoch": 9.344703770197487, + "grad_norm": 11.001198768615723, + "learning_rate": 2.08787552363854e-05, + "loss": 4.8893, + "step": 52050 + }, + { + "epoch": 9.3491921005386, + "grad_norm": 10.915155410766602, + "learning_rate": 2.087277079593058e-05, + "loss": 5.0998, + "step": 52075 + }, + { + "epoch": 9.353680430879713, + "grad_norm": 10.861316680908203, + "learning_rate": 2.0866786355475764e-05, + "loss": 4.9346, + "step": 52100 + }, + { + "epoch": 9.358168761220826, + "grad_norm": 10.465718269348145, + "learning_rate": 2.0860801915020946e-05, + "loss": 5.1084, + "step": 52125 + }, + { + "epoch": 9.362657091561939, + "grad_norm": 10.818107604980469, + "learning_rate": 2.0854817474566128e-05, + "loss": 5.0601, + "step": 52150 + }, + { + "epoch": 9.367145421903052, + "grad_norm": 11.378423690795898, + "learning_rate": 2.084883303411131e-05, + "loss": 4.9707, + "step": 52175 + }, + { + "epoch": 9.371633752244165, + "grad_norm": 11.38611888885498, + "learning_rate": 2.0842848593656493e-05, + "loss": 4.9977, + "step": 52200 + }, + { + "epoch": 9.376122082585278, + "grad_norm": 11.04100513458252, + "learning_rate": 2.0836864153201675e-05, + "loss": 5.0176, + "step": 52225 + }, + { + "epoch": 9.38061041292639, + "grad_norm": 10.271895408630371, + "learning_rate": 2.083087971274686e-05, + "loss": 5.1004, + "step": 52250 + }, + { + "epoch": 9.385098743267504, + "grad_norm": 
10.981404304504395, + "learning_rate": 2.0824895272292043e-05, + "loss": 5.0186, + "step": 52275 + }, + { + "epoch": 9.389587073608618, + "grad_norm": 12.74997615814209, + "learning_rate": 2.0818910831837222e-05, + "loss": 4.9702, + "step": 52300 + }, + { + "epoch": 9.394075403949731, + "grad_norm": 10.817777633666992, + "learning_rate": 2.0812926391382405e-05, + "loss": 4.9645, + "step": 52325 + }, + { + "epoch": 9.398563734290844, + "grad_norm": 12.343838691711426, + "learning_rate": 2.0806941950927587e-05, + "loss": 5.0953, + "step": 52350 + }, + { + "epoch": 9.403052064631957, + "grad_norm": 11.298012733459473, + "learning_rate": 2.080095751047277e-05, + "loss": 4.8471, + "step": 52375 + }, + { + "epoch": 9.40754039497307, + "grad_norm": 11.165434837341309, + "learning_rate": 2.0794973070017955e-05, + "loss": 5.0386, + "step": 52400 + }, + { + "epoch": 9.412028725314183, + "grad_norm": 17.793407440185547, + "learning_rate": 2.0788988629563138e-05, + "loss": 5.0129, + "step": 52425 + }, + { + "epoch": 9.416517055655296, + "grad_norm": 11.569448471069336, + "learning_rate": 2.078300418910832e-05, + "loss": 5.0108, + "step": 52450 + }, + { + "epoch": 9.42100538599641, + "grad_norm": 10.961335182189941, + "learning_rate": 2.0777019748653502e-05, + "loss": 4.9748, + "step": 52475 + }, + { + "epoch": 9.425493716337522, + "grad_norm": 10.786718368530273, + "learning_rate": 2.077103530819868e-05, + "loss": 4.9183, + "step": 52500 + }, + { + "epoch": 9.429982046678635, + "grad_norm": 10.262779235839844, + "learning_rate": 2.0765050867743867e-05, + "loss": 4.9257, + "step": 52525 + }, + { + "epoch": 9.434470377019748, + "grad_norm": 11.258552551269531, + "learning_rate": 2.075906642728905e-05, + "loss": 5.0752, + "step": 52550 + }, + { + "epoch": 9.438958707360861, + "grad_norm": 11.011068344116211, + "learning_rate": 2.0753081986834232e-05, + "loss": 5.027, + "step": 52575 + }, + { + "epoch": 9.443447037701976, + "grad_norm": 10.950187683105469, + "learning_rate": 
2.0747097546379414e-05, + "loss": 4.8615, + "step": 52600 + }, + { + "epoch": 9.447935368043089, + "grad_norm": 11.014263153076172, + "learning_rate": 2.0741113105924597e-05, + "loss": 5.1047, + "step": 52625 + }, + { + "epoch": 9.452423698384202, + "grad_norm": 11.649615287780762, + "learning_rate": 2.073512866546978e-05, + "loss": 5.0413, + "step": 52650 + }, + { + "epoch": 9.456912028725315, + "grad_norm": 10.9225435256958, + "learning_rate": 2.0729144225014965e-05, + "loss": 5.0412, + "step": 52675 + }, + { + "epoch": 9.461400359066428, + "grad_norm": 11.466302871704102, + "learning_rate": 2.0723159784560144e-05, + "loss": 4.9853, + "step": 52700 + }, + { + "epoch": 9.46588868940754, + "grad_norm": 10.138604164123535, + "learning_rate": 2.0717175344105326e-05, + "loss": 4.9372, + "step": 52725 + }, + { + "epoch": 9.470377019748653, + "grad_norm": 12.59761905670166, + "learning_rate": 2.071119090365051e-05, + "loss": 4.902, + "step": 52750 + }, + { + "epoch": 9.474865350089766, + "grad_norm": 11.623022079467773, + "learning_rate": 2.070520646319569e-05, + "loss": 5.0929, + "step": 52775 + }, + { + "epoch": 9.47935368043088, + "grad_norm": 10.75173282623291, + "learning_rate": 2.0699222022740873e-05, + "loss": 5.0229, + "step": 52800 + }, + { + "epoch": 9.483842010771992, + "grad_norm": 11.188772201538086, + "learning_rate": 2.069323758228606e-05, + "loss": 5.1153, + "step": 52825 + }, + { + "epoch": 9.488330341113105, + "grad_norm": 10.956487655639648, + "learning_rate": 2.068725314183124e-05, + "loss": 5.0204, + "step": 52850 + }, + { + "epoch": 9.492818671454218, + "grad_norm": 9.821392059326172, + "learning_rate": 2.0681268701376424e-05, + "loss": 4.9152, + "step": 52875 + }, + { + "epoch": 9.497307001795333, + "grad_norm": 10.346214294433594, + "learning_rate": 2.0675284260921602e-05, + "loss": 5.0809, + "step": 52900 + }, + { + "epoch": 9.501795332136446, + "grad_norm": 11.269129753112793, + "learning_rate": 2.0669299820466785e-05, + "loss": 4.8668, + 
"step": 52925 + }, + { + "epoch": 9.506283662477559, + "grad_norm": 11.769173622131348, + "learning_rate": 2.0663554757630162e-05, + "loss": 4.9911, + "step": 52950 + }, + { + "epoch": 9.510771992818672, + "grad_norm": 10.868391990661621, + "learning_rate": 2.0657570317175348e-05, + "loss": 5.1396, + "step": 52975 + }, + { + "epoch": 9.515260323159785, + "grad_norm": 10.733838081359863, + "learning_rate": 2.0651585876720527e-05, + "loss": 5.0803, + "step": 53000 + }, + { + "epoch": 9.519748653500898, + "grad_norm": 10.164852142333984, + "learning_rate": 2.064560143626571e-05, + "loss": 5.0247, + "step": 53025 + }, + { + "epoch": 9.52423698384201, + "grad_norm": 11.311273574829102, + "learning_rate": 2.063961699581089e-05, + "loss": 5.125, + "step": 53050 + }, + { + "epoch": 9.528725314183124, + "grad_norm": 10.226419448852539, + "learning_rate": 2.0633632555356074e-05, + "loss": 4.8381, + "step": 53075 + }, + { + "epoch": 9.533213644524237, + "grad_norm": 10.350929260253906, + "learning_rate": 2.0627648114901256e-05, + "loss": 5.0024, + "step": 53100 + }, + { + "epoch": 9.53770197486535, + "grad_norm": 11.438186645507812, + "learning_rate": 2.0621663674446442e-05, + "loss": 5.1138, + "step": 53125 + }, + { + "epoch": 9.542190305206462, + "grad_norm": 11.202611923217773, + "learning_rate": 2.0615679233991624e-05, + "loss": 5.0653, + "step": 53150 + }, + { + "epoch": 9.546678635547575, + "grad_norm": 9.86106014251709, + "learning_rate": 2.0609694793536807e-05, + "loss": 5.1721, + "step": 53175 + }, + { + "epoch": 9.55116696588869, + "grad_norm": 8.892157554626465, + "learning_rate": 2.0603710353081986e-05, + "loss": 5.1024, + "step": 53200 + }, + { + "epoch": 9.555655296229803, + "grad_norm": 9.770195007324219, + "learning_rate": 2.0597725912627168e-05, + "loss": 4.9855, + "step": 53225 + }, + { + "epoch": 9.560143626570916, + "grad_norm": 11.025806427001953, + "learning_rate": 2.059174147217235e-05, + "loss": 4.9755, + "step": 53250 + }, + { + "epoch": 
9.564631956912029, + "grad_norm": 10.755819320678711, + "learning_rate": 2.0585757031717536e-05, + "loss": 5.0693, + "step": 53275 + }, + { + "epoch": 9.569120287253142, + "grad_norm": 11.105140686035156, + "learning_rate": 2.057977259126272e-05, + "loss": 4.9973, + "step": 53300 + }, + { + "epoch": 9.573608617594255, + "grad_norm": 11.505203247070312, + "learning_rate": 2.05737881508079e-05, + "loss": 5.1206, + "step": 53325 + }, + { + "epoch": 9.578096947935368, + "grad_norm": 11.384746551513672, + "learning_rate": 2.0567803710353083e-05, + "loss": 5.0311, + "step": 53350 + }, + { + "epoch": 9.58258527827648, + "grad_norm": 10.2644624710083, + "learning_rate": 2.0561819269898265e-05, + "loss": 5.1075, + "step": 53375 + }, + { + "epoch": 9.587073608617594, + "grad_norm": 11.336398124694824, + "learning_rate": 2.0555834829443448e-05, + "loss": 4.8281, + "step": 53400 + }, + { + "epoch": 9.591561938958707, + "grad_norm": 10.799323081970215, + "learning_rate": 2.054985038898863e-05, + "loss": 5.0953, + "step": 53425 + }, + { + "epoch": 9.59605026929982, + "grad_norm": 10.867523193359375, + "learning_rate": 2.0543865948533813e-05, + "loss": 5.0886, + "step": 53450 + }, + { + "epoch": 9.600538599640934, + "grad_norm": 10.584467887878418, + "learning_rate": 2.0537881508078995e-05, + "loss": 5.2156, + "step": 53475 + }, + { + "epoch": 9.605026929982047, + "grad_norm": 11.792564392089844, + "learning_rate": 2.0531897067624177e-05, + "loss": 5.1173, + "step": 53500 + }, + { + "epoch": 9.60951526032316, + "grad_norm": 14.859521865844727, + "learning_rate": 2.052591262716936e-05, + "loss": 4.9936, + "step": 53525 + }, + { + "epoch": 9.614003590664273, + "grad_norm": 11.058758735656738, + "learning_rate": 2.0519928186714545e-05, + "loss": 4.8753, + "step": 53550 + }, + { + "epoch": 9.618491921005386, + "grad_norm": 9.566319465637207, + "learning_rate": 2.0513943746259728e-05, + "loss": 5.1604, + "step": 53575 + }, + { + "epoch": 9.6229802513465, + "grad_norm": 
11.459898948669434, + "learning_rate": 2.0507959305804907e-05, + "loss": 5.1592, + "step": 53600 + }, + { + "epoch": 9.627468581687612, + "grad_norm": 10.868318557739258, + "learning_rate": 2.050197486535009e-05, + "loss": 5.1062, + "step": 53625 + }, + { + "epoch": 9.631956912028725, + "grad_norm": 11.455902099609375, + "learning_rate": 2.049599042489527e-05, + "loss": 5.0274, + "step": 53650 + }, + { + "epoch": 9.636445242369838, + "grad_norm": 11.367392539978027, + "learning_rate": 2.0490005984440454e-05, + "loss": 5.0257, + "step": 53675 + }, + { + "epoch": 9.640933572710951, + "grad_norm": 10.999629974365234, + "learning_rate": 2.048402154398564e-05, + "loss": 4.7737, + "step": 53700 + }, + { + "epoch": 9.645421903052064, + "grad_norm": 13.16221809387207, + "learning_rate": 2.0478037103530822e-05, + "loss": 5.0876, + "step": 53725 + }, + { + "epoch": 9.649910233393177, + "grad_norm": 14.726330757141113, + "learning_rate": 2.0472052663076004e-05, + "loss": 5.0246, + "step": 53750 + }, + { + "epoch": 9.65439856373429, + "grad_norm": 11.606203079223633, + "learning_rate": 2.0466068222621187e-05, + "loss": 5.1697, + "step": 53775 + }, + { + "epoch": 9.658886894075405, + "grad_norm": 10.102290153503418, + "learning_rate": 2.0460083782166366e-05, + "loss": 5.0705, + "step": 53800 + }, + { + "epoch": 9.663375224416518, + "grad_norm": 10.073141098022461, + "learning_rate": 2.0454099341711548e-05, + "loss": 5.0287, + "step": 53825 + }, + { + "epoch": 9.66786355475763, + "grad_norm": 11.889898300170898, + "learning_rate": 2.0448114901256734e-05, + "loss": 5.214, + "step": 53850 + }, + { + "epoch": 9.672351885098744, + "grad_norm": 10.848076820373535, + "learning_rate": 2.0442130460801916e-05, + "loss": 4.9049, + "step": 53875 + }, + { + "epoch": 9.676840215439857, + "grad_norm": 11.741418838500977, + "learning_rate": 2.04361460203471e-05, + "loss": 5.0941, + "step": 53900 + }, + { + "epoch": 9.68132854578097, + "grad_norm": 11.806507110595703, + "learning_rate": 
2.043016157989228e-05, + "loss": 5.0086, + "step": 53925 + }, + { + "epoch": 9.685816876122082, + "grad_norm": 11.639864921569824, + "learning_rate": 2.0424177139437463e-05, + "loss": 5.1834, + "step": 53950 + }, + { + "epoch": 9.690305206463195, + "grad_norm": 11.833596229553223, + "learning_rate": 2.041819269898265e-05, + "loss": 4.9743, + "step": 53975 + }, + { + "epoch": 9.694793536804308, + "grad_norm": 12.458555221557617, + "learning_rate": 2.0412208258527828e-05, + "loss": 5.1399, + "step": 54000 + }, + { + "epoch": 9.699281867145421, + "grad_norm": 10.818696022033691, + "learning_rate": 2.040622381807301e-05, + "loss": 5.0372, + "step": 54025 + }, + { + "epoch": 9.703770197486534, + "grad_norm": 10.969178199768066, + "learning_rate": 2.0400239377618193e-05, + "loss": 5.0287, + "step": 54050 + }, + { + "epoch": 9.708258527827649, + "grad_norm": 10.022538185119629, + "learning_rate": 2.0394254937163375e-05, + "loss": 5.0615, + "step": 54075 + }, + { + "epoch": 9.712746858168762, + "grad_norm": 11.487543106079102, + "learning_rate": 2.0388270496708557e-05, + "loss": 5.0125, + "step": 54100 + }, + { + "epoch": 9.717235188509875, + "grad_norm": 10.702008247375488, + "learning_rate": 2.0382286056253743e-05, + "loss": 5.1658, + "step": 54125 + }, + { + "epoch": 9.721723518850988, + "grad_norm": 10.758954048156738, + "learning_rate": 2.0376301615798925e-05, + "loss": 5.1276, + "step": 54150 + }, + { + "epoch": 9.7262118491921, + "grad_norm": 11.361984252929688, + "learning_rate": 2.0370317175344104e-05, + "loss": 5.0877, + "step": 54175 + }, + { + "epoch": 9.730700179533214, + "grad_norm": 14.23162841796875, + "learning_rate": 2.0364332734889287e-05, + "loss": 4.9794, + "step": 54200 + }, + { + "epoch": 9.735188509874327, + "grad_norm": 11.980596542358398, + "learning_rate": 2.035834829443447e-05, + "loss": 4.918, + "step": 54225 + }, + { + "epoch": 9.73967684021544, + "grad_norm": 10.37841510772705, + "learning_rate": 2.035236385397965e-05, + "loss": 5.0708, + 
"step": 54250 + }, + { + "epoch": 9.744165170556553, + "grad_norm": 11.547417640686035, + "learning_rate": 2.0346379413524837e-05, + "loss": 4.8437, + "step": 54275 + }, + { + "epoch": 9.748653500897666, + "grad_norm": 12.00156021118164, + "learning_rate": 2.034039497307002e-05, + "loss": 5.1843, + "step": 54300 + }, + { + "epoch": 9.753141831238779, + "grad_norm": 12.048932075500488, + "learning_rate": 2.0334410532615202e-05, + "loss": 4.8919, + "step": 54325 + }, + { + "epoch": 9.757630161579891, + "grad_norm": 9.870779037475586, + "learning_rate": 2.0328426092160384e-05, + "loss": 4.9815, + "step": 54350 + }, + { + "epoch": 9.762118491921004, + "grad_norm": 11.248870849609375, + "learning_rate": 2.0322441651705563e-05, + "loss": 5.0696, + "step": 54375 + }, + { + "epoch": 9.76660682226212, + "grad_norm": 10.796330451965332, + "learning_rate": 2.031645721125075e-05, + "loss": 5.054, + "step": 54400 + }, + { + "epoch": 9.771095152603232, + "grad_norm": 11.07474422454834, + "learning_rate": 2.031047277079593e-05, + "loss": 5.0027, + "step": 54425 + }, + { + "epoch": 9.775583482944345, + "grad_norm": 11.573097229003906, + "learning_rate": 2.0304488330341114e-05, + "loss": 5.2943, + "step": 54450 + }, + { + "epoch": 9.780071813285458, + "grad_norm": 9.396492958068848, + "learning_rate": 2.0298503889886296e-05, + "loss": 4.9549, + "step": 54475 + }, + { + "epoch": 9.784560143626571, + "grad_norm": 11.206363677978516, + "learning_rate": 2.029251944943148e-05, + "loss": 5.1077, + "step": 54500 + }, + { + "epoch": 9.789048473967684, + "grad_norm": 11.680152893066406, + "learning_rate": 2.028653500897666e-05, + "loss": 4.9831, + "step": 54525 + }, + { + "epoch": 9.793536804308797, + "grad_norm": 10.635766983032227, + "learning_rate": 2.0280550568521847e-05, + "loss": 4.9814, + "step": 54550 + }, + { + "epoch": 9.79802513464991, + "grad_norm": 11.711913108825684, + "learning_rate": 2.0274566128067025e-05, + "loss": 5.0018, + "step": 54575 + }, + { + "epoch": 
9.802513464991023, + "grad_norm": 10.370429039001465, + "learning_rate": 2.0268581687612208e-05, + "loss": 5.1001, + "step": 54600 + }, + { + "epoch": 9.807001795332136, + "grad_norm": 11.310208320617676, + "learning_rate": 2.026259724715739e-05, + "loss": 4.9449, + "step": 54625 + }, + { + "epoch": 9.811490125673249, + "grad_norm": 10.928662300109863, + "learning_rate": 2.0256612806702573e-05, + "loss": 5.2167, + "step": 54650 + }, + { + "epoch": 9.815978456014363, + "grad_norm": 11.044739723205566, + "learning_rate": 2.0250628366247755e-05, + "loss": 5.0969, + "step": 54675 + }, + { + "epoch": 9.820466786355476, + "grad_norm": 9.9227933883667, + "learning_rate": 2.024464392579294e-05, + "loss": 5.1225, + "step": 54700 + }, + { + "epoch": 9.82495511669659, + "grad_norm": 9.993706703186035, + "learning_rate": 2.0238659485338123e-05, + "loss": 5.1035, + "step": 54725 + }, + { + "epoch": 9.829443447037702, + "grad_norm": 12.50881290435791, + "learning_rate": 2.0232675044883305e-05, + "loss": 4.9737, + "step": 54750 + }, + { + "epoch": 9.833931777378815, + "grad_norm": 11.190850257873535, + "learning_rate": 2.0226690604428484e-05, + "loss": 5.1732, + "step": 54775 + }, + { + "epoch": 9.838420107719928, + "grad_norm": 11.346424102783203, + "learning_rate": 2.0220706163973667e-05, + "loss": 5.0652, + "step": 54800 + }, + { + "epoch": 9.842908438061041, + "grad_norm": 11.275069236755371, + "learning_rate": 2.0214721723518852e-05, + "loss": 4.9657, + "step": 54825 + }, + { + "epoch": 9.847396768402154, + "grad_norm": 12.366570472717285, + "learning_rate": 2.0208737283064035e-05, + "loss": 4.9202, + "step": 54850 + }, + { + "epoch": 9.851885098743267, + "grad_norm": 10.721635818481445, + "learning_rate": 2.0202752842609217e-05, + "loss": 5.0218, + "step": 54875 + }, + { + "epoch": 9.85637342908438, + "grad_norm": 11.496729850769043, + "learning_rate": 2.01967684021544e-05, + "loss": 5.0351, + "step": 54900 + }, + { + "epoch": 9.860861759425493, + "grad_norm": 
10.314311027526855, + "learning_rate": 2.0190783961699582e-05, + "loss": 5.1456, + "step": 54925 + }, + { + "epoch": 9.865350089766606, + "grad_norm": 10.83665943145752, + "learning_rate": 2.0184799521244764e-05, + "loss": 5.0464, + "step": 54950 + }, + { + "epoch": 9.86983842010772, + "grad_norm": 11.553869247436523, + "learning_rate": 2.0178815080789947e-05, + "loss": 5.107, + "step": 54975 + }, + { + "epoch": 9.874326750448834, + "grad_norm": 9.993013381958008, + "learning_rate": 2.017283064033513e-05, + "loss": 5.0341, + "step": 55000 + }, + { + "epoch": 9.878815080789947, + "grad_norm": 12.607884407043457, + "learning_rate": 2.016684619988031e-05, + "loss": 4.9393, + "step": 55025 + }, + { + "epoch": 9.88330341113106, + "grad_norm": 10.054112434387207, + "learning_rate": 2.0160861759425494e-05, + "loss": 5.0086, + "step": 55050 + }, + { + "epoch": 9.887791741472173, + "grad_norm": 12.09311580657959, + "learning_rate": 2.0154877318970676e-05, + "loss": 4.9077, + "step": 55075 + }, + { + "epoch": 9.892280071813286, + "grad_norm": 10.145326614379883, + "learning_rate": 2.014889287851586e-05, + "loss": 5.2164, + "step": 55100 + }, + { + "epoch": 9.896768402154398, + "grad_norm": 11.859919548034668, + "learning_rate": 2.0142908438061044e-05, + "loss": 5.0661, + "step": 55125 + }, + { + "epoch": 9.901256732495511, + "grad_norm": 10.978533744812012, + "learning_rate": 2.0136923997606227e-05, + "loss": 5.0733, + "step": 55150 + }, + { + "epoch": 9.905745062836624, + "grad_norm": 11.26434326171875, + "learning_rate": 2.0130939557151406e-05, + "loss": 5.1155, + "step": 55175 + }, + { + "epoch": 9.910233393177737, + "grad_norm": 10.869341850280762, + "learning_rate": 2.0124955116696588e-05, + "loss": 5.1436, + "step": 55200 + }, + { + "epoch": 9.91472172351885, + "grad_norm": 11.756258964538574, + "learning_rate": 2.011897067624177e-05, + "loss": 5.1222, + "step": 55225 + }, + { + "epoch": 9.919210053859963, + "grad_norm": 11.765117645263672, + "learning_rate": 
2.0112986235786953e-05, + "loss": 5.0964, + "step": 55250 + }, + { + "epoch": 9.923698384201078, + "grad_norm": 11.08630084991455, + "learning_rate": 2.010700179533214e-05, + "loss": 5.0085, + "step": 55275 + }, + { + "epoch": 9.928186714542191, + "grad_norm": 11.64377212524414, + "learning_rate": 2.010101735487732e-05, + "loss": 5.0232, + "step": 55300 + }, + { + "epoch": 9.932675044883304, + "grad_norm": 11.462095260620117, + "learning_rate": 2.0095032914422503e-05, + "loss": 5.1156, + "step": 55325 + }, + { + "epoch": 9.937163375224417, + "grad_norm": 12.527989387512207, + "learning_rate": 2.0089048473967685e-05, + "loss": 5.2873, + "step": 55350 + }, + { + "epoch": 9.94165170556553, + "grad_norm": 10.606977462768555, + "learning_rate": 2.0083064033512864e-05, + "loss": 4.9479, + "step": 55375 + }, + { + "epoch": 9.946140035906643, + "grad_norm": 10.953739166259766, + "learning_rate": 2.007707959305805e-05, + "loss": 4.9403, + "step": 55400 + }, + { + "epoch": 9.950628366247756, + "grad_norm": 11.6215181350708, + "learning_rate": 2.0071095152603232e-05, + "loss": 5.0714, + "step": 55425 + }, + { + "epoch": 9.955116696588869, + "grad_norm": 11.747157096862793, + "learning_rate": 2.0065110712148415e-05, + "loss": 5.0921, + "step": 55450 + }, + { + "epoch": 9.959605026929982, + "grad_norm": 10.226792335510254, + "learning_rate": 2.0059126271693597e-05, + "loss": 5.1466, + "step": 55475 + }, + { + "epoch": 9.964093357271095, + "grad_norm": 11.761174201965332, + "learning_rate": 2.005314183123878e-05, + "loss": 5.0937, + "step": 55500 + }, + { + "epoch": 9.968581687612208, + "grad_norm": 11.29422664642334, + "learning_rate": 2.0047157390783962e-05, + "loss": 5.1178, + "step": 55525 + }, + { + "epoch": 9.973070017953322, + "grad_norm": 11.08249568939209, + "learning_rate": 2.0041172950329148e-05, + "loss": 5.1391, + "step": 55550 + }, + { + "epoch": 9.977558348294435, + "grad_norm": 11.044683456420898, + "learning_rate": 2.0035188509874327e-05, + "loss": 4.9998, + 
"step": 55575 + }, + { + "epoch": 9.982046678635548, + "grad_norm": 11.26824951171875, + "learning_rate": 2.002920406941951e-05, + "loss": 5.0582, + "step": 55600 + }, + { + "epoch": 9.986535008976661, + "grad_norm": 11.541621208190918, + "learning_rate": 2.002321962896469e-05, + "loss": 5.1068, + "step": 55625 + }, + { + "epoch": 9.991023339317774, + "grad_norm": 10.630711555480957, + "learning_rate": 2.0017235188509874e-05, + "loss": 4.9636, + "step": 55650 + }, + { + "epoch": 9.995511669658887, + "grad_norm": 10.95443344116211, + "learning_rate": 2.0011250748055056e-05, + "loss": 4.9646, + "step": 55675 + }, + { + "epoch": 10.0, + "grad_norm": 15.18558406829834, + "learning_rate": 2.0005266307600242e-05, + "loss": 4.925, + "step": 55700 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.0729383460943616, + "eval_f1_macro": 0.006962441626347374, + "eval_f1_micro": 0.0729383460943616, + "eval_f1_weighted": 0.04255522315294437, + "eval_loss": 6.628863334655762, + "eval_precision_macro": 0.006304274104526295, + "eval_precision_micro": 0.0729383460943616, + "eval_precision_weighted": 0.035033221263146855, + "eval_recall_macro": 0.011568897507167976, + "eval_recall_micro": 0.0729383460943616, + "eval_recall_weighted": 0.0729383460943616, + "eval_runtime": 62.2789, + "eval_samples_per_second": 840.944, + "eval_steps_per_second": 26.285, + "step": 55700 + }, + { + "epoch": 10.004488330341113, + "grad_norm": 11.17663288116455, + "learning_rate": 1.9999281867145424e-05, + "loss": 4.7584, + "step": 55725 + }, + { + "epoch": 10.008976660682226, + "grad_norm": 11.610177993774414, + "learning_rate": 1.9993297426690607e-05, + "loss": 4.6887, + "step": 55750 + }, + { + "epoch": 10.013464991023339, + "grad_norm": 10.914331436157227, + "learning_rate": 1.9987312986235786e-05, + "loss": 4.7837, + "step": 55775 + }, + { + "epoch": 10.017953321364452, + "grad_norm": 10.520831108093262, + "learning_rate": 1.9981328545780968e-05, + "loss": 4.5803, + "step": 55800 + }, + { + "epoch": 
10.022441651705565, + "grad_norm": 13.857707977294922, + "learning_rate": 1.9975344105326154e-05, + "loss": 4.7385, + "step": 55825 + }, + { + "epoch": 10.02692998204668, + "grad_norm": 12.968884468078613, + "learning_rate": 1.9969359664871336e-05, + "loss": 4.7765, + "step": 55850 + }, + { + "epoch": 10.031418312387792, + "grad_norm": 11.247675895690918, + "learning_rate": 1.996337522441652e-05, + "loss": 4.5772, + "step": 55875 + }, + { + "epoch": 10.035906642728905, + "grad_norm": 11.069640159606934, + "learning_rate": 1.99573907839617e-05, + "loss": 4.7376, + "step": 55900 + }, + { + "epoch": 10.040394973070018, + "grad_norm": 12.087432861328125, + "learning_rate": 1.9951406343506883e-05, + "loss": 4.8373, + "step": 55925 + }, + { + "epoch": 10.044883303411131, + "grad_norm": 11.050515174865723, + "learning_rate": 1.9945421903052062e-05, + "loss": 4.6143, + "step": 55950 + }, + { + "epoch": 10.049371633752244, + "grad_norm": 10.705192565917969, + "learning_rate": 1.9939437462597248e-05, + "loss": 4.69, + "step": 55975 + }, + { + "epoch": 10.053859964093357, + "grad_norm": 10.029187202453613, + "learning_rate": 1.993345302214243e-05, + "loss": 4.7484, + "step": 56000 + }, + { + "epoch": 10.05834829443447, + "grad_norm": 10.254772186279297, + "learning_rate": 1.9927468581687613e-05, + "loss": 4.6108, + "step": 56025 + }, + { + "epoch": 10.062836624775583, + "grad_norm": 10.059431076049805, + "learning_rate": 1.9921484141232795e-05, + "loss": 4.7486, + "step": 56050 + }, + { + "epoch": 10.067324955116696, + "grad_norm": 11.002187728881836, + "learning_rate": 1.9915499700777977e-05, + "loss": 4.6557, + "step": 56075 + }, + { + "epoch": 10.071813285457809, + "grad_norm": 12.394529342651367, + "learning_rate": 1.990951526032316e-05, + "loss": 4.6734, + "step": 56100 + }, + { + "epoch": 10.076301615798922, + "grad_norm": 10.898031234741211, + "learning_rate": 1.9903530819868345e-05, + "loss": 4.7551, + "step": 56125 + }, + { + "epoch": 10.080789946140037, + 
"grad_norm": 11.404539108276367, + "learning_rate": 1.9897546379413524e-05, + "loss": 4.6516, + "step": 56150 + }, + { + "epoch": 10.08527827648115, + "grad_norm": 10.680449485778809, + "learning_rate": 1.9891561938958707e-05, + "loss": 4.7929, + "step": 56175 + }, + { + "epoch": 10.089766606822263, + "grad_norm": 10.055392265319824, + "learning_rate": 1.988557749850389e-05, + "loss": 4.6757, + "step": 56200 + }, + { + "epoch": 10.094254937163376, + "grad_norm": 11.857422828674316, + "learning_rate": 1.987959305804907e-05, + "loss": 4.7421, + "step": 56225 + }, + { + "epoch": 10.098743267504489, + "grad_norm": 11.150386810302734, + "learning_rate": 1.9873608617594257e-05, + "loss": 4.6877, + "step": 56250 + }, + { + "epoch": 10.103231597845602, + "grad_norm": 10.303437232971191, + "learning_rate": 1.986762417713944e-05, + "loss": 4.7752, + "step": 56275 + }, + { + "epoch": 10.107719928186714, + "grad_norm": 10.255880355834961, + "learning_rate": 1.9861639736684622e-05, + "loss": 4.6596, + "step": 56300 + }, + { + "epoch": 10.112208258527827, + "grad_norm": 10.66046142578125, + "learning_rate": 1.9855655296229804e-05, + "loss": 4.7149, + "step": 56325 + }, + { + "epoch": 10.11669658886894, + "grad_norm": 12.095270156860352, + "learning_rate": 1.9849670855774983e-05, + "loss": 4.6437, + "step": 56350 + }, + { + "epoch": 10.121184919210053, + "grad_norm": 11.313436508178711, + "learning_rate": 1.9843686415320166e-05, + "loss": 4.7651, + "step": 56375 + }, + { + "epoch": 10.125673249551166, + "grad_norm": 11.089323997497559, + "learning_rate": 1.983770197486535e-05, + "loss": 4.8213, + "step": 56400 + }, + { + "epoch": 10.13016157989228, + "grad_norm": 12.026876449584961, + "learning_rate": 1.9831717534410534e-05, + "loss": 4.8305, + "step": 56425 + }, + { + "epoch": 10.134649910233394, + "grad_norm": 11.095088958740234, + "learning_rate": 1.9825733093955716e-05, + "loss": 4.6233, + "step": 56450 + }, + { + "epoch": 10.139138240574507, + "grad_norm": 
10.278196334838867, + "learning_rate": 1.98197486535009e-05, + "loss": 4.8098, + "step": 56475 + }, + { + "epoch": 10.14362657091562, + "grad_norm": 12.70077896118164, + "learning_rate": 1.981376421304608e-05, + "loss": 4.8493, + "step": 56500 + }, + { + "epoch": 10.148114901256733, + "grad_norm": 12.523026466369629, + "learning_rate": 1.9807779772591263e-05, + "loss": 4.687, + "step": 56525 + }, + { + "epoch": 10.152603231597846, + "grad_norm": 11.094892501831055, + "learning_rate": 1.9801795332136445e-05, + "loss": 4.8892, + "step": 56550 + }, + { + "epoch": 10.157091561938959, + "grad_norm": 11.773797988891602, + "learning_rate": 1.9795810891681628e-05, + "loss": 4.844, + "step": 56575 + }, + { + "epoch": 10.161579892280072, + "grad_norm": 10.920309066772461, + "learning_rate": 1.978982645122681e-05, + "loss": 4.7534, + "step": 56600 + }, + { + "epoch": 10.166068222621185, + "grad_norm": 11.547953605651855, + "learning_rate": 1.9783842010771993e-05, + "loss": 4.8399, + "step": 56625 + }, + { + "epoch": 10.170556552962298, + "grad_norm": 10.489971160888672, + "learning_rate": 1.9777857570317175e-05, + "loss": 4.6403, + "step": 56650 + }, + { + "epoch": 10.17504488330341, + "grad_norm": 11.161720275878906, + "learning_rate": 1.977187312986236e-05, + "loss": 4.7364, + "step": 56675 + }, + { + "epoch": 10.179533213644524, + "grad_norm": 11.029932022094727, + "learning_rate": 1.9765888689407543e-05, + "loss": 4.8671, + "step": 56700 + }, + { + "epoch": 10.184021543985637, + "grad_norm": 13.1510648727417, + "learning_rate": 1.9759904248952725e-05, + "loss": 4.626, + "step": 56725 + }, + { + "epoch": 10.188509874326751, + "grad_norm": 12.483827590942383, + "learning_rate": 1.9753919808497904e-05, + "loss": 4.7764, + "step": 56750 + }, + { + "epoch": 10.192998204667864, + "grad_norm": 11.827686309814453, + "learning_rate": 1.9747935368043087e-05, + "loss": 4.6858, + "step": 56775 + }, + { + "epoch": 10.197486535008977, + "grad_norm": 10.148049354553223, + 
"learning_rate": 1.974195092758827e-05, + "loss": 4.7131, + "step": 56800 + }, + { + "epoch": 10.20197486535009, + "grad_norm": 9.640460968017578, + "learning_rate": 1.9735966487133455e-05, + "loss": 4.7116, + "step": 56825 + }, + { + "epoch": 10.206463195691203, + "grad_norm": 12.066417694091797, + "learning_rate": 1.9729982046678637e-05, + "loss": 4.6416, + "step": 56850 + }, + { + "epoch": 10.210951526032316, + "grad_norm": 10.574261665344238, + "learning_rate": 1.972399760622382e-05, + "loss": 4.8153, + "step": 56875 + }, + { + "epoch": 10.215439856373429, + "grad_norm": 10.534711837768555, + "learning_rate": 1.9718013165769002e-05, + "loss": 4.6813, + "step": 56900 + }, + { + "epoch": 10.219928186714542, + "grad_norm": 10.183050155639648, + "learning_rate": 1.9712028725314184e-05, + "loss": 4.8049, + "step": 56925 + }, + { + "epoch": 10.224416517055655, + "grad_norm": 11.080986022949219, + "learning_rate": 1.9706044284859363e-05, + "loss": 4.7043, + "step": 56950 + }, + { + "epoch": 10.228904847396768, + "grad_norm": 11.414073944091797, + "learning_rate": 1.970005984440455e-05, + "loss": 4.8672, + "step": 56975 + }, + { + "epoch": 10.23339317773788, + "grad_norm": 10.839288711547852, + "learning_rate": 1.969407540394973e-05, + "loss": 4.8729, + "step": 57000 + }, + { + "epoch": 10.237881508078994, + "grad_norm": 12.119806289672852, + "learning_rate": 1.9688090963494914e-05, + "loss": 5.0102, + "step": 57025 + }, + { + "epoch": 10.242369838420109, + "grad_norm": 11.432808876037598, + "learning_rate": 1.9682106523040096e-05, + "loss": 4.7525, + "step": 57050 + }, + { + "epoch": 10.246858168761221, + "grad_norm": 12.104778289794922, + "learning_rate": 1.967612208258528e-05, + "loss": 4.4872, + "step": 57075 + }, + { + "epoch": 10.251346499102334, + "grad_norm": 12.000020027160645, + "learning_rate": 1.967013764213046e-05, + "loss": 4.901, + "step": 57100 + }, + { + "epoch": 10.255834829443447, + "grad_norm": 11.306970596313477, + "learning_rate": 
1.9664153201675647e-05, + "loss": 4.7619, + "step": 57125 + }, + { + "epoch": 10.26032315978456, + "grad_norm": 11.96088695526123, + "learning_rate": 1.9658168761220825e-05, + "loss": 4.9128, + "step": 57150 + }, + { + "epoch": 10.264811490125673, + "grad_norm": 12.458362579345703, + "learning_rate": 1.9652184320766008e-05, + "loss": 4.7177, + "step": 57175 + }, + { + "epoch": 10.269299820466786, + "grad_norm": 12.006531715393066, + "learning_rate": 1.964619988031119e-05, + "loss": 4.7564, + "step": 57200 + }, + { + "epoch": 10.2737881508079, + "grad_norm": 12.097737312316895, + "learning_rate": 1.9640215439856373e-05, + "loss": 4.7552, + "step": 57225 + }, + { + "epoch": 10.278276481149012, + "grad_norm": 12.310616493225098, + "learning_rate": 1.9634230999401558e-05, + "loss": 4.7443, + "step": 57250 + }, + { + "epoch": 10.282764811490125, + "grad_norm": 11.559399604797363, + "learning_rate": 1.9628485936564932e-05, + "loss": 4.6332, + "step": 57275 + }, + { + "epoch": 10.287253141831238, + "grad_norm": 11.438298225402832, + "learning_rate": 1.9622501496110114e-05, + "loss": 4.8247, + "step": 57300 + }, + { + "epoch": 10.291741472172351, + "grad_norm": 12.318501472473145, + "learning_rate": 1.9616517055655297e-05, + "loss": 4.7688, + "step": 57325 + }, + { + "epoch": 10.296229802513466, + "grad_norm": 10.958440780639648, + "learning_rate": 1.961053261520048e-05, + "loss": 4.8413, + "step": 57350 + }, + { + "epoch": 10.300718132854579, + "grad_norm": 9.877079010009766, + "learning_rate": 1.960454817474566e-05, + "loss": 4.6598, + "step": 57375 + }, + { + "epoch": 10.305206463195692, + "grad_norm": 12.1113862991333, + "learning_rate": 1.9598563734290844e-05, + "loss": 4.7166, + "step": 57400 + }, + { + "epoch": 10.309694793536805, + "grad_norm": 11.494108200073242, + "learning_rate": 1.959257929383603e-05, + "loss": 4.9005, + "step": 57425 + }, + { + "epoch": 10.314183123877918, + "grad_norm": 14.744606018066406, + "learning_rate": 1.958659485338121e-05, + "loss": 
4.8699, + "step": 57450 + }, + { + "epoch": 10.31867145421903, + "grad_norm": 10.393290519714355, + "learning_rate": 1.958061041292639e-05, + "loss": 4.8939, + "step": 57475 + }, + { + "epoch": 10.323159784560143, + "grad_norm": 11.604391098022461, + "learning_rate": 1.9574625972471573e-05, + "loss": 4.8068, + "step": 57500 + }, + { + "epoch": 10.327648114901256, + "grad_norm": 10.848563194274902, + "learning_rate": 1.9568641532016756e-05, + "loss": 4.7788, + "step": 57525 + }, + { + "epoch": 10.33213644524237, + "grad_norm": 12.61726188659668, + "learning_rate": 1.9562657091561938e-05, + "loss": 4.7198, + "step": 57550 + }, + { + "epoch": 10.336624775583482, + "grad_norm": 10.198689460754395, + "learning_rate": 1.9556672651107124e-05, + "loss": 4.7602, + "step": 57575 + }, + { + "epoch": 10.341113105924595, + "grad_norm": 12.049602508544922, + "learning_rate": 1.9550688210652306e-05, + "loss": 4.7738, + "step": 57600 + }, + { + "epoch": 10.34560143626571, + "grad_norm": 12.895121574401855, + "learning_rate": 1.954470377019749e-05, + "loss": 4.8661, + "step": 57625 + }, + { + "epoch": 10.350089766606823, + "grad_norm": 11.119793891906738, + "learning_rate": 1.9538958707360862e-05, + "loss": 4.6578, + "step": 57650 + }, + { + "epoch": 10.354578096947936, + "grad_norm": 11.247715950012207, + "learning_rate": 1.9532974266906045e-05, + "loss": 4.6527, + "step": 57675 + }, + { + "epoch": 10.359066427289049, + "grad_norm": 11.40494441986084, + "learning_rate": 1.9526989826451227e-05, + "loss": 4.8597, + "step": 57700 + }, + { + "epoch": 10.363554757630162, + "grad_norm": 11.421904563903809, + "learning_rate": 1.9521005385996413e-05, + "loss": 4.6843, + "step": 57725 + }, + { + "epoch": 10.368043087971275, + "grad_norm": 11.050399780273438, + "learning_rate": 1.9515020945541595e-05, + "loss": 4.6504, + "step": 57750 + }, + { + "epoch": 10.372531418312388, + "grad_norm": 10.785725593566895, + "learning_rate": 1.9509036505086774e-05, + "loss": 4.7609, + "step": 57775 + }, + 
{ + "epoch": 10.3770197486535, + "grad_norm": 12.326422691345215, + "learning_rate": 1.9503052064631956e-05, + "loss": 4.8164, + "step": 57800 + }, + { + "epoch": 10.381508078994614, + "grad_norm": 13.116379737854004, + "learning_rate": 1.949706762417714e-05, + "loss": 4.7465, + "step": 57825 + }, + { + "epoch": 10.385996409335727, + "grad_norm": 12.040498733520508, + "learning_rate": 1.949108318372232e-05, + "loss": 4.7799, + "step": 57850 + }, + { + "epoch": 10.39048473967684, + "grad_norm": 10.336701393127441, + "learning_rate": 1.9485098743267507e-05, + "loss": 4.8164, + "step": 57875 + }, + { + "epoch": 10.394973070017953, + "grad_norm": 12.034284591674805, + "learning_rate": 1.947911430281269e-05, + "loss": 4.8642, + "step": 57900 + }, + { + "epoch": 10.399461400359066, + "grad_norm": 12.105256080627441, + "learning_rate": 1.947312986235787e-05, + "loss": 4.7394, + "step": 57925 + }, + { + "epoch": 10.40394973070018, + "grad_norm": 11.12935733795166, + "learning_rate": 1.946714542190305e-05, + "loss": 4.7923, + "step": 57950 + }, + { + "epoch": 10.408438061041293, + "grad_norm": 11.620172500610352, + "learning_rate": 1.9461160981448233e-05, + "loss": 4.6675, + "step": 57975 + }, + { + "epoch": 10.412926391382406, + "grad_norm": 10.386125564575195, + "learning_rate": 1.9455176540993415e-05, + "loss": 4.7818, + "step": 58000 + }, + { + "epoch": 10.41741472172352, + "grad_norm": 11.19929313659668, + "learning_rate": 1.94491921005386e-05, + "loss": 4.7614, + "step": 58025 + }, + { + "epoch": 10.421903052064632, + "grad_norm": 11.317383766174316, + "learning_rate": 1.9443207660083783e-05, + "loss": 4.8783, + "step": 58050 + }, + { + "epoch": 10.426391382405745, + "grad_norm": 10.185806274414062, + "learning_rate": 1.9437223219628966e-05, + "loss": 4.9197, + "step": 58075 + }, + { + "epoch": 10.430879712746858, + "grad_norm": 11.623252868652344, + "learning_rate": 1.9431238779174148e-05, + "loss": 4.7066, + "step": 58100 + }, + { + "epoch": 10.435368043087971, + 
"grad_norm": 10.518688201904297, + "learning_rate": 1.942525433871933e-05, + "loss": 4.7166, + "step": 58125 + }, + { + "epoch": 10.439856373429084, + "grad_norm": 11.81803035736084, + "learning_rate": 1.9419269898264513e-05, + "loss": 4.8029, + "step": 58150 + }, + { + "epoch": 10.444344703770197, + "grad_norm": 11.433547973632812, + "learning_rate": 1.9413285457809695e-05, + "loss": 4.8663, + "step": 58175 + }, + { + "epoch": 10.44883303411131, + "grad_norm": 13.257956504821777, + "learning_rate": 1.9407301017354877e-05, + "loss": 4.8021, + "step": 58200 + }, + { + "epoch": 10.453321364452425, + "grad_norm": 10.689516067504883, + "learning_rate": 1.940131657690006e-05, + "loss": 4.8506, + "step": 58225 + }, + { + "epoch": 10.457809694793538, + "grad_norm": 12.599861145019531, + "learning_rate": 1.9395332136445242e-05, + "loss": 4.72, + "step": 58250 + }, + { + "epoch": 10.46229802513465, + "grad_norm": 10.641573905944824, + "learning_rate": 1.9389347695990425e-05, + "loss": 4.857, + "step": 58275 + }, + { + "epoch": 10.466786355475763, + "grad_norm": 11.064395904541016, + "learning_rate": 1.938336325553561e-05, + "loss": 4.8279, + "step": 58300 + }, + { + "epoch": 10.471274685816876, + "grad_norm": 11.065543174743652, + "learning_rate": 1.9377378815080793e-05, + "loss": 4.8513, + "step": 58325 + }, + { + "epoch": 10.47576301615799, + "grad_norm": 11.962324142456055, + "learning_rate": 1.937139437462597e-05, + "loss": 4.7604, + "step": 58350 + }, + { + "epoch": 10.480251346499102, + "grad_norm": 13.814591407775879, + "learning_rate": 1.9365409934171154e-05, + "loss": 4.8356, + "step": 58375 + }, + { + "epoch": 10.484739676840215, + "grad_norm": 10.960185050964355, + "learning_rate": 1.9359425493716336e-05, + "loss": 4.6279, + "step": 58400 + }, + { + "epoch": 10.489228007181328, + "grad_norm": 10.76187515258789, + "learning_rate": 1.935344105326152e-05, + "loss": 4.6306, + "step": 58425 + }, + { + "epoch": 10.493716337522441, + "grad_norm": 10.80911636352539, + 
"learning_rate": 1.9347456612806704e-05, + "loss": 4.8111, + "step": 58450 + }, + { + "epoch": 10.498204667863554, + "grad_norm": 12.297883987426758, + "learning_rate": 1.9341472172351887e-05, + "loss": 4.8118, + "step": 58475 + }, + { + "epoch": 10.502692998204667, + "grad_norm": 10.175323486328125, + "learning_rate": 1.933548773189707e-05, + "loss": 4.8821, + "step": 58500 + }, + { + "epoch": 10.507181328545782, + "grad_norm": 10.899386405944824, + "learning_rate": 1.932950329144225e-05, + "loss": 4.8628, + "step": 58525 + }, + { + "epoch": 10.511669658886895, + "grad_norm": 11.763595581054688, + "learning_rate": 1.932351885098743e-05, + "loss": 4.7065, + "step": 58550 + }, + { + "epoch": 10.516157989228008, + "grad_norm": 11.203507423400879, + "learning_rate": 1.9317534410532616e-05, + "loss": 4.6952, + "step": 58575 + }, + { + "epoch": 10.52064631956912, + "grad_norm": 11.732791900634766, + "learning_rate": 1.93115499700778e-05, + "loss": 4.7751, + "step": 58600 + }, + { + "epoch": 10.525134649910234, + "grad_norm": 11.88412857055664, + "learning_rate": 1.930556552962298e-05, + "loss": 4.7053, + "step": 58625 + }, + { + "epoch": 10.529622980251347, + "grad_norm": 11.817076683044434, + "learning_rate": 1.9299581089168163e-05, + "loss": 4.8505, + "step": 58650 + }, + { + "epoch": 10.53411131059246, + "grad_norm": 12.350132942199707, + "learning_rate": 1.9293596648713346e-05, + "loss": 4.7618, + "step": 58675 + }, + { + "epoch": 10.538599640933572, + "grad_norm": 11.210477828979492, + "learning_rate": 1.9287612208258528e-05, + "loss": 4.8434, + "step": 58700 + }, + { + "epoch": 10.543087971274685, + "grad_norm": 11.661173820495605, + "learning_rate": 1.9281627767803714e-05, + "loss": 4.9794, + "step": 58725 + }, + { + "epoch": 10.547576301615798, + "grad_norm": 11.125619888305664, + "learning_rate": 1.9275643327348893e-05, + "loss": 4.7908, + "step": 58750 + }, + { + "epoch": 10.552064631956911, + "grad_norm": 13.728983879089355, + "learning_rate": 
1.9269658886894075e-05, + "loss": 4.9123, + "step": 58775 + }, + { + "epoch": 10.556552962298024, + "grad_norm": 10.592361450195312, + "learning_rate": 1.9263674446439257e-05, + "loss": 4.949, + "step": 58800 + }, + { + "epoch": 10.561041292639139, + "grad_norm": 11.152853012084961, + "learning_rate": 1.925769000598444e-05, + "loss": 4.9746, + "step": 58825 + }, + { + "epoch": 10.565529622980252, + "grad_norm": 10.621163368225098, + "learning_rate": 1.9251705565529622e-05, + "loss": 4.7231, + "step": 58850 + }, + { + "epoch": 10.570017953321365, + "grad_norm": 11.19698715209961, + "learning_rate": 1.9245721125074808e-05, + "loss": 4.8295, + "step": 58875 + }, + { + "epoch": 10.574506283662478, + "grad_norm": 11.571696281433105, + "learning_rate": 1.923973668461999e-05, + "loss": 4.834, + "step": 58900 + }, + { + "epoch": 10.57899461400359, + "grad_norm": 12.609336853027344, + "learning_rate": 1.9233752244165173e-05, + "loss": 4.9074, + "step": 58925 + }, + { + "epoch": 10.583482944344704, + "grad_norm": 12.199725151062012, + "learning_rate": 1.922776780371035e-05, + "loss": 4.8093, + "step": 58950 + }, + { + "epoch": 10.587971274685817, + "grad_norm": 12.315526962280273, + "learning_rate": 1.9221783363255534e-05, + "loss": 4.7319, + "step": 58975 + }, + { + "epoch": 10.59245960502693, + "grad_norm": 10.865073204040527, + "learning_rate": 1.9215798922800716e-05, + "loss": 4.896, + "step": 59000 + }, + { + "epoch": 10.596947935368043, + "grad_norm": 10.770760536193848, + "learning_rate": 1.9209814482345902e-05, + "loss": 4.7914, + "step": 59025 + }, + { + "epoch": 10.601436265709156, + "grad_norm": 10.53603744506836, + "learning_rate": 1.9203830041891084e-05, + "loss": 4.9497, + "step": 59050 + }, + { + "epoch": 10.605924596050269, + "grad_norm": 11.871949195861816, + "learning_rate": 1.9197845601436267e-05, + "loss": 4.8077, + "step": 59075 + }, + { + "epoch": 10.610412926391383, + "grad_norm": 10.86931037902832, + "learning_rate": 1.919186116098145e-05, + "loss": 
4.7676, + "step": 59100 + }, + { + "epoch": 10.614901256732496, + "grad_norm": 11.104310989379883, + "learning_rate": 1.918587672052663e-05, + "loss": 4.8092, + "step": 59125 + }, + { + "epoch": 10.61938958707361, + "grad_norm": 8.919764518737793, + "learning_rate": 1.9179892280071814e-05, + "loss": 4.6255, + "step": 59150 + }, + { + "epoch": 10.623877917414722, + "grad_norm": 11.625335693359375, + "learning_rate": 1.9173907839616996e-05, + "loss": 4.867, + "step": 59175 + }, + { + "epoch": 10.628366247755835, + "grad_norm": 12.20195198059082, + "learning_rate": 1.916792339916218e-05, + "loss": 4.9586, + "step": 59200 + }, + { + "epoch": 10.632854578096948, + "grad_norm": 11.559815406799316, + "learning_rate": 1.916193895870736e-05, + "loss": 4.9211, + "step": 59225 + }, + { + "epoch": 10.637342908438061, + "grad_norm": 9.752105712890625, + "learning_rate": 1.9155954518252543e-05, + "loss": 4.8923, + "step": 59250 + }, + { + "epoch": 10.641831238779174, + "grad_norm": 11.095664024353027, + "learning_rate": 1.9149970077797726e-05, + "loss": 4.8716, + "step": 59275 + }, + { + "epoch": 10.646319569120287, + "grad_norm": 10.727545738220215, + "learning_rate": 1.914398563734291e-05, + "loss": 4.8999, + "step": 59300 + }, + { + "epoch": 10.6508078994614, + "grad_norm": 11.791589736938477, + "learning_rate": 1.9138001196888094e-05, + "loss": 4.7678, + "step": 59325 + }, + { + "epoch": 10.655296229802513, + "grad_norm": 11.131447792053223, + "learning_rate": 1.9132016756433273e-05, + "loss": 4.7699, + "step": 59350 + }, + { + "epoch": 10.659784560143626, + "grad_norm": 10.567694664001465, + "learning_rate": 1.9126032315978455e-05, + "loss": 4.9076, + "step": 59375 + }, + { + "epoch": 10.664272890484739, + "grad_norm": 10.465057373046875, + "learning_rate": 1.9120047875523637e-05, + "loss": 4.9352, + "step": 59400 + }, + { + "epoch": 10.668761220825854, + "grad_norm": 9.988136291503906, + "learning_rate": 1.911406343506882e-05, + "loss": 5.0254, + "step": 59425 + }, + { + 
"epoch": 10.673249551166966, + "grad_norm": 11.46808910369873, + "learning_rate": 1.9108078994614006e-05, + "loss": 4.7414, + "step": 59450 + }, + { + "epoch": 10.67773788150808, + "grad_norm": 11.522942543029785, + "learning_rate": 1.9102094554159188e-05, + "loss": 4.9313, + "step": 59475 + }, + { + "epoch": 10.682226211849192, + "grad_norm": 11.313362121582031, + "learning_rate": 1.909611011370437e-05, + "loss": 4.8734, + "step": 59500 + }, + { + "epoch": 10.686714542190305, + "grad_norm": 11.269170761108398, + "learning_rate": 1.9090125673249553e-05, + "loss": 4.7228, + "step": 59525 + }, + { + "epoch": 10.691202872531418, + "grad_norm": 11.450127601623535, + "learning_rate": 1.908414123279473e-05, + "loss": 4.9373, + "step": 59550 + }, + { + "epoch": 10.695691202872531, + "grad_norm": 12.77647590637207, + "learning_rate": 1.9078156792339917e-05, + "loss": 4.9209, + "step": 59575 + }, + { + "epoch": 10.700179533213644, + "grad_norm": 10.363587379455566, + "learning_rate": 1.90721723518851e-05, + "loss": 4.8631, + "step": 59600 + }, + { + "epoch": 10.704667863554757, + "grad_norm": 15.357625007629395, + "learning_rate": 1.9066187911430282e-05, + "loss": 4.7872, + "step": 59625 + }, + { + "epoch": 10.70915619389587, + "grad_norm": 10.04201889038086, + "learning_rate": 1.9060203470975464e-05, + "loss": 4.8795, + "step": 59650 + }, + { + "epoch": 10.713644524236983, + "grad_norm": 9.376017570495605, + "learning_rate": 1.9054219030520647e-05, + "loss": 4.9108, + "step": 59675 + }, + { + "epoch": 10.718132854578098, + "grad_norm": 11.00575065612793, + "learning_rate": 1.904823459006583e-05, + "loss": 4.8598, + "step": 59700 + }, + { + "epoch": 10.72262118491921, + "grad_norm": 12.33254623413086, + "learning_rate": 1.9042250149611015e-05, + "loss": 4.9868, + "step": 59725 + }, + { + "epoch": 10.727109515260324, + "grad_norm": 11.190876960754395, + "learning_rate": 1.9036265709156194e-05, + "loss": 4.8016, + "step": 59750 + }, + { + "epoch": 10.731597845601437, + 
"grad_norm": 12.417434692382812, + "learning_rate": 1.9030281268701376e-05, + "loss": 4.8999, + "step": 59775 + }, + { + "epoch": 10.73608617594255, + "grad_norm": 12.458529472351074, + "learning_rate": 1.902429682824656e-05, + "loss": 4.7906, + "step": 59800 + }, + { + "epoch": 10.740574506283663, + "grad_norm": 11.552146911621094, + "learning_rate": 1.901831238779174e-05, + "loss": 4.9554, + "step": 59825 + }, + { + "epoch": 10.745062836624776, + "grad_norm": 10.777586936950684, + "learning_rate": 1.9012327947336923e-05, + "loss": 5.0097, + "step": 59850 + }, + { + "epoch": 10.749551166965889, + "grad_norm": 11.169707298278809, + "learning_rate": 1.900634350688211e-05, + "loss": 4.8983, + "step": 59875 + }, + { + "epoch": 10.754039497307001, + "grad_norm": 10.757088661193848, + "learning_rate": 1.900035906642729e-05, + "loss": 4.859, + "step": 59900 + }, + { + "epoch": 10.758527827648114, + "grad_norm": 10.258295059204102, + "learning_rate": 1.899437462597247e-05, + "loss": 4.7417, + "step": 59925 + }, + { + "epoch": 10.763016157989227, + "grad_norm": 11.64696979522705, + "learning_rate": 1.8988390185517653e-05, + "loss": 4.841, + "step": 59950 + }, + { + "epoch": 10.76750448833034, + "grad_norm": 11.921553611755371, + "learning_rate": 1.8982405745062835e-05, + "loss": 4.7706, + "step": 59975 + }, + { + "epoch": 10.771992818671453, + "grad_norm": 11.40293025970459, + "learning_rate": 1.897642130460802e-05, + "loss": 4.7252, + "step": 60000 + }, + { + "epoch": 10.776481149012568, + "grad_norm": 11.305538177490234, + "learning_rate": 1.8970436864153203e-05, + "loss": 4.7431, + "step": 60025 + }, + { + "epoch": 10.780969479353681, + "grad_norm": 12.19309139251709, + "learning_rate": 1.8964452423698386e-05, + "loss": 4.8794, + "step": 60050 + }, + { + "epoch": 10.785457809694794, + "grad_norm": 12.732135772705078, + "learning_rate": 1.8958467983243568e-05, + "loss": 4.7992, + "step": 60075 + }, + { + "epoch": 10.789946140035907, + "grad_norm": 13.400795936584473, + 
"learning_rate": 1.895248354278875e-05, + "loss": 4.8276, + "step": 60100 + }, + { + "epoch": 10.79443447037702, + "grad_norm": 11.499441146850586, + "learning_rate": 1.894649910233393e-05, + "loss": 4.7962, + "step": 60125 + }, + { + "epoch": 10.798922800718133, + "grad_norm": 11.189123153686523, + "learning_rate": 1.8940514661879115e-05, + "loss": 4.8862, + "step": 60150 + }, + { + "epoch": 10.803411131059246, + "grad_norm": 11.010645866394043, + "learning_rate": 1.8934530221424297e-05, + "loss": 4.9268, + "step": 60175 + }, + { + "epoch": 10.807899461400359, + "grad_norm": 13.369983673095703, + "learning_rate": 1.892854578096948e-05, + "loss": 4.8368, + "step": 60200 + }, + { + "epoch": 10.812387791741472, + "grad_norm": 10.572732925415039, + "learning_rate": 1.8922561340514662e-05, + "loss": 4.9977, + "step": 60225 + }, + { + "epoch": 10.816876122082585, + "grad_norm": 10.7946138381958, + "learning_rate": 1.8916576900059844e-05, + "loss": 4.8535, + "step": 60250 + }, + { + "epoch": 10.821364452423698, + "grad_norm": 12.230555534362793, + "learning_rate": 1.8910592459605027e-05, + "loss": 4.8172, + "step": 60275 + }, + { + "epoch": 10.825852782764812, + "grad_norm": 12.060118675231934, + "learning_rate": 1.8904608019150213e-05, + "loss": 4.8687, + "step": 60300 + }, + { + "epoch": 10.830341113105925, + "grad_norm": 11.416581153869629, + "learning_rate": 1.889862357869539e-05, + "loss": 4.8543, + "step": 60325 + }, + { + "epoch": 10.834829443447038, + "grad_norm": 10.663115501403809, + "learning_rate": 1.8892639138240574e-05, + "loss": 4.8471, + "step": 60350 + }, + { + "epoch": 10.839317773788151, + "grad_norm": 12.208093643188477, + "learning_rate": 1.8886654697785756e-05, + "loss": 4.9124, + "step": 60375 + }, + { + "epoch": 10.843806104129264, + "grad_norm": 11.161846160888672, + "learning_rate": 1.888067025733094e-05, + "loss": 4.8352, + "step": 60400 + }, + { + "epoch": 10.848294434470377, + "grad_norm": 9.001988410949707, + "learning_rate": 
1.8874685816876124e-05, + "loss": 4.9915, + "step": 60425 + }, + { + "epoch": 10.85278276481149, + "grad_norm": 11.557969093322754, + "learning_rate": 1.8868701376421307e-05, + "loss": 4.971, + "step": 60450 + }, + { + "epoch": 10.857271095152603, + "grad_norm": 11.932718276977539, + "learning_rate": 1.886271693596649e-05, + "loss": 4.9827, + "step": 60475 + }, + { + "epoch": 10.861759425493716, + "grad_norm": 12.815642356872559, + "learning_rate": 1.885673249551167e-05, + "loss": 4.8641, + "step": 60500 + }, + { + "epoch": 10.866247755834829, + "grad_norm": 12.031004905700684, + "learning_rate": 1.885074805505685e-05, + "loss": 4.8302, + "step": 60525 + }, + { + "epoch": 10.870736086175942, + "grad_norm": 13.226838111877441, + "learning_rate": 1.8844763614602033e-05, + "loss": 4.9785, + "step": 60550 + }, + { + "epoch": 10.875224416517055, + "grad_norm": 10.779943466186523, + "learning_rate": 1.883877917414722e-05, + "loss": 4.9343, + "step": 60575 + }, + { + "epoch": 10.87971274685817, + "grad_norm": 12.23896598815918, + "learning_rate": 1.88327947336924e-05, + "loss": 4.7677, + "step": 60600 + }, + { + "epoch": 10.884201077199283, + "grad_norm": 10.808141708374023, + "learning_rate": 1.8826810293237583e-05, + "loss": 5.006, + "step": 60625 + }, + { + "epoch": 10.888689407540395, + "grad_norm": 15.626670837402344, + "learning_rate": 1.8820825852782766e-05, + "loss": 4.969, + "step": 60650 + }, + { + "epoch": 10.893177737881508, + "grad_norm": 11.011842727661133, + "learning_rate": 1.8814841412327948e-05, + "loss": 4.8835, + "step": 60675 + }, + { + "epoch": 10.897666068222621, + "grad_norm": 12.228730201721191, + "learning_rate": 1.880885697187313e-05, + "loss": 4.9613, + "step": 60700 + }, + { + "epoch": 10.902154398563734, + "grad_norm": 10.432559967041016, + "learning_rate": 1.8802872531418313e-05, + "loss": 4.8554, + "step": 60725 + }, + { + "epoch": 10.906642728904847, + "grad_norm": 10.922548294067383, + "learning_rate": 1.8796888090963495e-05, + "loss": 
4.6798, + "step": 60750 + }, + { + "epoch": 10.91113105924596, + "grad_norm": 13.351634979248047, + "learning_rate": 1.8790903650508677e-05, + "loss": 4.8788, + "step": 60775 + }, + { + "epoch": 10.915619389587073, + "grad_norm": 13.32534408569336, + "learning_rate": 1.878491921005386e-05, + "loss": 4.9395, + "step": 60800 + }, + { + "epoch": 10.920107719928186, + "grad_norm": 11.03647232055664, + "learning_rate": 1.8778934769599042e-05, + "loss": 4.9671, + "step": 60825 + }, + { + "epoch": 10.9245960502693, + "grad_norm": 12.33436107635498, + "learning_rate": 1.8772950329144225e-05, + "loss": 4.8747, + "step": 60850 + }, + { + "epoch": 10.929084380610412, + "grad_norm": 11.979046821594238, + "learning_rate": 1.876696588868941e-05, + "loss": 4.9448, + "step": 60875 + }, + { + "epoch": 10.933572710951527, + "grad_norm": 12.914766311645508, + "learning_rate": 1.8760981448234593e-05, + "loss": 4.8636, + "step": 60900 + }, + { + "epoch": 10.93806104129264, + "grad_norm": 11.353147506713867, + "learning_rate": 1.875499700777977e-05, + "loss": 4.7178, + "step": 60925 + }, + { + "epoch": 10.942549371633753, + "grad_norm": 9.903340339660645, + "learning_rate": 1.8749012567324954e-05, + "loss": 4.8014, + "step": 60950 + }, + { + "epoch": 10.947037701974866, + "grad_norm": 11.419675827026367, + "learning_rate": 1.8743028126870136e-05, + "loss": 5.0003, + "step": 60975 + }, + { + "epoch": 10.951526032315979, + "grad_norm": 12.360651016235352, + "learning_rate": 1.8737043686415322e-05, + "loss": 5.0727, + "step": 61000 + }, + { + "epoch": 10.956014362657092, + "grad_norm": 11.442558288574219, + "learning_rate": 1.8731059245960504e-05, + "loss": 4.7796, + "step": 61025 + }, + { + "epoch": 10.960502692998205, + "grad_norm": 11.797447204589844, + "learning_rate": 1.8725074805505687e-05, + "loss": 5.0384, + "step": 61050 + }, + { + "epoch": 10.964991023339318, + "grad_norm": 10.602542877197266, + "learning_rate": 1.871909036505087e-05, + "loss": 4.8555, + "step": 61075 + }, + { + 
"epoch": 10.96947935368043, + "grad_norm": 12.237215995788574, + "learning_rate": 1.871310592459605e-05, + "loss": 4.854, + "step": 61100 + }, + { + "epoch": 10.973967684021543, + "grad_norm": 12.156669616699219, + "learning_rate": 1.870712148414123e-05, + "loss": 4.9519, + "step": 61125 + }, + { + "epoch": 10.978456014362656, + "grad_norm": 13.243087768554688, + "learning_rate": 1.8701137043686416e-05, + "loss": 4.8465, + "step": 61150 + }, + { + "epoch": 10.982944344703771, + "grad_norm": 10.412257194519043, + "learning_rate": 1.86951526032316e-05, + "loss": 4.7865, + "step": 61175 + }, + { + "epoch": 10.987432675044884, + "grad_norm": 10.228768348693848, + "learning_rate": 1.868916816277678e-05, + "loss": 4.8627, + "step": 61200 + }, + { + "epoch": 10.991921005385997, + "grad_norm": 10.74191665649414, + "learning_rate": 1.8683183722321963e-05, + "loss": 4.8238, + "step": 61225 + }, + { + "epoch": 10.99640933572711, + "grad_norm": 11.840961456298828, + "learning_rate": 1.8677199281867146e-05, + "loss": 4.9238, + "step": 61250 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.07204093712409065, + "eval_f1_macro": 0.008138962923889539, + "eval_f1_micro": 0.07204093712409065, + "eval_f1_weighted": 0.043525754606176574, + "eval_loss": 6.590800762176514, + "eval_precision_macro": 0.0073795359485093765, + "eval_precision_micro": 0.07204093712409065, + "eval_precision_weighted": 0.035906260666633036, + "eval_recall_macro": 0.013094065506618197, + "eval_recall_micro": 0.07204093712409065, + "eval_recall_weighted": 0.07204093712409065, + "eval_runtime": 62.2645, + "eval_samples_per_second": 841.138, + "eval_steps_per_second": 26.291, + "step": 61270 + } + ], + "logging_steps": 25, + "max_steps": 139250, + "num_input_tokens_seen": 0, + "num_train_epochs": 25, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.01 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } 
+ }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.203475770950376e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}